Merged · Changes from 4 commits
9 changes: 4 additions & 5 deletions app/backend/prepdocslib/cloudingestionstrategy.py
@@ -161,10 +161,9 @@ def _build_skillset(self) -> SearchIndexerSkillset:
resource_id=self.search_user_assigned_identity_resource_id
),
inputs=[
-    # Provide the binary payload expected by the document extractor custom skill.
-    InputFieldMappingEntry(name="file_data", source="/document/file_data"),
-    InputFieldMappingEntry(name="file_name", source="/document/metadata_storage_name"),
-    InputFieldMappingEntry(name="content_type", source="/document/metadata_storage_content_type"),
+    # Always provide the blob URL so the function can download large files (> 16MB)
+    InputFieldMappingEntry(name="metadata_storage_path", source="/document/metadata_storage_path"),
+    # We are not using a SAS token since the functions have RBAC access via managed identity
],
outputs=[
OutputFieldMappingEntry(name="pages", target_name="pages"),
@@ -310,7 +309,7 @@ async def setup(self) -> None:
configuration=IndexingParametersConfiguration(
query_timeout=None, # type: ignore
data_to_extract="storageMetadata",
-    allow_skillset_to_read_file_data=True,
+    allow_skillset_to_read_file_data=False,
)
),
)
74 changes: 44 additions & 30 deletions app/functions/document_extractor/function_app.py

@@ -3,23 +3,25 @@
Custom skill for Azure AI Search that extracts and processes document content.
"""

-import base64
import io
import json
import logging
import os
from dataclasses import dataclass
from typing import Any
+from urllib.parse import unquote, urlparse

import azure.functions as func
from azure.core.exceptions import HttpResponseError
from azure.identity.aio import ManagedIdentityCredential

+from prepdocslib.blobmanager import BlobManager
from prepdocslib.fileprocessor import FileProcessor
from prepdocslib.page import Page
from prepdocslib.servicesetup import (
build_file_processors,
select_processor_for_filename,
+    setup_blob_manager,
)

app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS)
@@ -31,6 +33,7 @@
class GlobalSettings:
file_processors: dict[str, FileProcessor]
azure_credential: ManagedIdentityCredential
blob_manager: BlobManager
Comment on lines 31 to +36

Copilot AI, Dec 5, 2025

The GlobalSettings dataclass now includes a blob_manager field, but existing tests create mock settings without it. Because blob_manager is a required field, those tests will raise TypeError at construction time.

Tests like test_document_extractor_requires_single_record, test_document_extractor_handles_processing_exception, and test_document_extractor_process_document_http_error in tests/test_function_apps.py need to be updated to include blob_manager in their mock GlobalSettings objects. A minimal stub is sketched below.
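A sketch of such a stub, in the style of the existing tests and assuming the (bytes, properties) return shape of BlobManager.download_blob; the StubBlobManager name is illustrative, not from this PR, and the test updates later in this diff take essentially the same approach:

```python
class StubBlobManager:
    """Test stand-in for prepdocslib.blobmanager.BlobManager."""

    async def download_blob(self, blob_path: str):
        # Mimic BlobManager.download_blob: return (content_bytes, properties).
        return (b"pdf-bytes", {})


mock_settings = document_extractor.GlobalSettings(
    file_processors={".pdf": FileProcessor(None, None)},
    azure_credential=object(),
    blob_manager=StubBlobManager(),
)
```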


settings: GlobalSettings | None = None
@@ -63,9 +66,18 @@ def configure_global_settings():
process_figures=use_multimodal,
)

+    blob_manager = setup_blob_manager(
+        azure_credential=azure_credential,
+        storage_account=os.environ["AZURE_STORAGE_ACCOUNT"],
+        storage_container=os.environ["AZURE_STORAGE_CONTAINER"],
+        storage_resource_group=os.environ["AZURE_STORAGE_RESOURCE_GROUP"],
+        subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"],
+    )

settings = GlobalSettings(
file_processors=file_processors,
azure_credential=azure_credential,
+        blob_manager=blob_manager,
)


@@ -75,20 +87,15 @@ async def extract_document(req: func.HttpRequest) -> func.HttpResponse:
"""
Azure Search Custom Skill: Extract document content

-    Input format (single record; file data only):
+    # https://learn.microsoft.com/azure/search/cognitive-search-skill-document-intelligence-layout#skill-inputs
+    Input format (single record):
{
"values": [
{
"recordId": "1",
"data": {
-                    // Base64 encoded file (skillset must enable file data)
-                    "file_data": {
-                        "$type": "file",
-                        "data": "base64..."
-                    },
-                    // Optional
-                    "file_name": "doc.pdf"
+                    "metadata_storage_path": "https://<account>.blob.core.windows.net/<container>/<blob_path>",
+                    "metadata_storage_name": "document.pdf",
+                    "metadata_storage_content_type": "application/pdf"
}
}
]
@@ -176,45 +183,52 @@ async def process_document(data: dict[str, Any]) -> dict[str, Any]:
Process a single document: download, parse, extract figures, upload images

Args:
-        data: Input data with blobUrl, fileName, contentType
+        data: Input data with metadata_storage_path

Returns:
Dictionary with 'text' (markdown) and 'images' (list of {url, description})
"""
-    document_stream, file_name, content_type = get_document_stream_filedata(data)
-    logger.info("Processing document: %s", file_name)
if settings is None:
raise RuntimeError("Global settings not initialized")

+    # Get blob path from metadata_storage_path URL
+    # URL format: https://<account>.blob.core.windows.net/<container>/<blob_path>
+    storage_path = data["metadata_storage_path"]
+    parsed_url = urlparse(storage_path)
+    # Path is /<container>/<blob_path>, so split and take everything after the container
+    path_parts = unquote(parsed_url.path).lstrip("/").split("/", 1)
+    if len(path_parts) < 2:
+        raise ValueError(f"Invalid storage path format: {storage_path}")
+    blob_path_within_container = path_parts[1]  # Everything after the container name
+
+    logger.info("Downloading blob: %s", blob_path_within_container)
+    result = await settings.blob_manager.download_blob(blob_path_within_container)
+    if result is None:
+        raise ValueError(f"Blob not found: {blob_path_within_container}")
+
+    document_bytes, properties = result
+    document_stream = io.BytesIO(document_bytes)
+    document_stream.name = blob_path_within_container
+
+    logger.info("Processing document: %s", blob_path_within_container)

# Get parser from file_processors dict based on file extension
-    file_processor = select_processor_for_filename(file_name, settings.file_processors)
+    file_processor = select_processor_for_filename(blob_path_within_container, settings.file_processors)
parser = file_processor.parser

pages: list[Page] = []
try:
document_stream.seek(0)
pages = [page async for page in parser.parse(content=document_stream)]
except HttpResponseError as exc:
raise ValueError(f"Parser failed for {file_name}: {exc.message}") from exc
raise ValueError(f"Parser failed for {blob_path_within_container}: {exc.message}") from exc
finally:
document_stream.close()

-    components = build_document_components(file_name, pages)
+    components = build_document_components(blob_path_within_container, pages)
return components


-def get_document_stream_filedata(data: dict[str, Any]) -> tuple[io.BytesIO, str, str]:
-    """Return a BytesIO stream for file_data input only (skillset must send file bytes)."""
-    file_payload = data.get("file_data", {})
-    encoded = file_payload.get("data")
-    if not encoded:
-        raise ValueError("file_data payload missing base64 data")
-    document_bytes = base64.b64decode(encoded)
-    file_name = data.get("file_name") or data.get("fileName") or file_payload.get("name") or "document"
-    content_type = data.get("contentType") or file_payload.get("contentType") or "application/octet-stream"
-    stream = io.BytesIO(document_bytes)
-    stream.name = file_name
-    return stream, file_name, content_type


def build_document_components(file_name: str, pages: list[Page]) -> dict[str, Any]:
page_entries: list[dict[str, Any]] = []
figure_entries: list[dict[str, Any]] = []
2 changes: 1 addition & 1 deletion app/functions/document_extractor/requirements.txt

@@ -24,7 +24,7 @@ async-timeout==5.0.1
# via aiohttp
attrs==25.3.0
# via aiohttp
-azure-ai-documentintelligence==1.0.0b4
+azure-ai-documentintelligence==1.0.2
# via -r requirements.in
azure-cognitiveservices-speech==1.40.0
# via -r requirements.in
2 changes: 1 addition & 1 deletion app/functions/figure_processor/requirements.txt

@@ -24,7 +24,7 @@ async-timeout==5.0.1
# via aiohttp
attrs==25.3.0
# via aiohttp
-azure-ai-documentintelligence==1.0.0b4
+azure-ai-documentintelligence==1.0.2
# via -r requirements.in
azure-cognitiveservices-speech==1.40.0
# via -r requirements.in
2 changes: 1 addition & 1 deletion app/functions/text_processor/requirements.txt

@@ -24,7 +24,7 @@ async-timeout==5.0.1
# via aiohttp
attrs==25.3.0
# via aiohttp
-azure-ai-documentintelligence==1.0.0b4
+azure-ai-documentintelligence==1.0.2
# via -r requirements.in
azure-cognitiveservices-speech==1.40.0
# via -r requirements.in
2 changes: 2 additions & 0 deletions infra/main.bicep

@@ -469,6 +469,8 @@ module appServicePlan 'core/host/appserviceplan.bicep' = if (deploymentTarget ==
var appEnvVariables = {
AZURE_STORAGE_ACCOUNT: storage.outputs.name
AZURE_STORAGE_CONTAINER: storageContainerName
+  AZURE_STORAGE_RESOURCE_GROUP: storageResourceGroup.name
+  AZURE_SUBSCRIPTION_ID: subscription().subscriptionId
AZURE_SEARCH_INDEX: searchIndexName
AZURE_SEARCH_KNOWLEDGEBASE_NAME: knowledgeBaseName
AZURE_SEARCH_SERVICE: searchService.outputs.name
46 changes: 30 additions & 16 deletions tests/test_function_apps.py

@@ -83,9 +83,14 @@ async def parse(self, content: Any):
".pdf": FileProcessor(StubParser([page]), None),
}

+    class MockBlobManager:
+        async def download_blob(self, blob_path: str):
+            return (b"pdf-bytes", {})

mock_settings = document_extractor.GlobalSettings(
file_processors=mock_file_processors,
azure_credential=object(),
+        blob_manager=MockBlobManager(),
)
monkeypatch.setattr(document_extractor, "settings", mock_settings)

@@ -94,9 +99,9 @@
{
"recordId": "record-1",
"data": {
"file_data": {"$type": "file", "data": base64.b64encode(b"pdf-bytes").decode("utf-8")},
"file_name": "sample.pdf",
"contentType": "application/pdf",
"metadata_storage_path": "https://account.blob.core.windows.net/container/sample.pdf",
Copilot AI, Dec 6, 2025

Tests only verify simple blob paths at the container root (e.g., sample.pdf). Consider adding test cases for:

  1. Nested paths with subdirectories (e.g., folder/subfolder/file.pdf)
  2. URL-encoded characters in the path (e.g., spaces encoded as %20 or other special characters)
  3. Edge cases like trailing slashes or empty blob names

This would ensure the URL parsing logic handles the realistic scenarios produced by Azure Search indexers; a parametrized sketch covering these cases follows.
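A sketch of such tests, under stated assumptions: blob_path_from_storage_url is a hypothetical helper that copies the parsing logic inlined in process_document, and its final empty-name guard is an addition the PR's inline check does not currently have:

```python
import pytest
from urllib.parse import unquote, urlparse


def blob_path_from_storage_url(storage_path: str) -> str:
    # Copies the parsing in process_document, plus an empty-name guard.
    parsed_url = urlparse(storage_path)
    path_parts = unquote(parsed_url.path).lstrip("/").split("/", 1)
    if len(path_parts) < 2 or not path_parts[1]:
        raise ValueError(f"Invalid storage path format: {storage_path}")
    return path_parts[1]


@pytest.mark.parametrize(
    ("url", "expected"),
    [
        ("https://account.blob.core.windows.net/container/sample.pdf", "sample.pdf"),
        ("https://account.blob.core.windows.net/container/folder/subfolder/file.pdf", "folder/subfolder/file.pdf"),
        ("https://account.blob.core.windows.net/container/my%20doc.pdf", "my doc.pdf"),
    ],
)
def test_blob_path_from_storage_url(url: str, expected: str) -> None:
    assert blob_path_from_storage_url(url) == expected


def test_blob_path_rejects_trailing_slash() -> None:
    # A trailing slash yields an empty blob name, which should be rejected.
    with pytest.raises(ValueError):
        blob_path_from_storage_url("https://account.blob.core.windows.net/container/")
```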
"metadata_storage_name": "sample.pdf",
"metadata_storage_content_type": "application/pdf",
},
}
]
@@ -128,6 +133,7 @@ async def test_document_extractor_requires_single_record(monkeypatch: pytest.Mon
mock_settings = document_extractor.GlobalSettings(
file_processors={".pdf": FileProcessor(None, None)},
azure_credential=object(),
+        blob_manager=object(),
)
monkeypatch.setattr(document_extractor, "settings", mock_settings)
response = await document_extractor.extract_document(build_request({"values": []}))
@@ -144,6 +150,7 @@
mock_settings = document_extractor.GlobalSettings(
file_processors={".pdf": FileProcessor(None, None)},
azure_credential=object(),
+        blob_manager=object(),
)
monkeypatch.setattr(document_extractor, "settings", mock_settings)
monkeypatch.setattr(document_extractor, "process_document", failing_process)
@@ -153,9 +160,9 @@
{
"recordId": "rec-error",
"data": {
"file_data": {"$type": "file", "data": base64.b64encode(b"pdf-bytes").decode("utf-8")},
"file_name": "sample.pdf",
"contentType": "application/pdf",
"metadata_storage_path": "https://account.blob.core.windows.net/container/sample.pdf",
"metadata_storage_name": "sample.pdf",
"metadata_storage_content_type": "application/pdf",
},
}
]
@@ -186,16 +193,19 @@ async def parse(self, content):
".pdf": FileProcessor(FailingParser(), None),
}

+    class MockBlobManager:
+        async def download_blob(self, blob_path: str):
+            return (b"content", {})

mock_settings = document_extractor.GlobalSettings(
file_processors=mock_file_processors,
azure_credential=object(),
+        blob_manager=MockBlobManager(),
)
monkeypatch.setattr(document_extractor, "settings", mock_settings)

data = {
"file_data": {"data": base64.b64encode(b"content").decode("utf-8")},
"file_name": "doc.pdf",
"contentType": "application/pdf",
"metadata_storage_path": "https://account.blob.core.windows.net/container/doc.pdf",
}

with pytest.raises(ValueError) as exc_info:
@@ -204,12 +214,16 @@ async def parse(self, content):
assert "Parser failed" in str(exc_info.value)


-def test_document_extractor_missing_file_data() -> None:
-    with pytest.raises(ValueError):
-        document_extractor.get_document_stream_filedata({"file_data": {}})
+def test_document_extractor_managed_identity_reload(monkeypatch: pytest.MonkeyPatch) -> None:
+    # Set required environment variables
+    monkeypatch.setenv("AZURE_STORAGE_ACCOUNT", "teststorage")
+    monkeypatch.setenv("AZURE_STORAGE_CONTAINER", "testcontainer")
+    monkeypatch.setenv("AZURE_STORAGE_RESOURCE_GROUP", "testrg")
+    monkeypatch.setenv("AZURE_SUBSCRIPTION_ID", "test-sub-id")
+
+    # Mock setup_blob_manager to avoid actual Azure calls
+    monkeypatch.setattr(document_extractor, "setup_blob_manager", lambda **kwargs: object())

-def test_document_extractor_managed_identity_reload(monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.setenv("AZURE_CLIENT_ID", "client-123")
document_extractor.configure_global_settings()
assert isinstance(document_extractor.settings.azure_credential, document_extractor.ManagedIdentityCredential)
@@ -471,9 +485,9 @@ async def test_document_extractor_without_settings(monkeypatch: pytest.MonkeyPat
{
"recordId": "record-1",
"data": {
"file_data": {"$type": "file", "data": base64.b64encode(b"pdf-bytes").decode("utf-8")},
"file_name": "sample.pdf",
"contentType": "application/pdf",
"metadata_storage_path": "https://account.blob.core.windows.net/container/sample.pdf",
"metadata_storage_name": "sample.pdf",
"metadata_storage_content_type": "application/pdf",
},
}
]