Skip to content

Commit 042e397

Browse files
committed
Move cloud functions to use blob download based off path
1 parent e7fb877 commit 042e397

File tree

7 files changed

+83
-69
lines changed

7 files changed

+83
-69
lines changed

app/backend/prepdocslib/cloudingestionstrategy.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -161,10 +161,9 @@ def _build_skillset(self) -> SearchIndexerSkillset:
161161
resource_id=self.search_user_assigned_identity_resource_id
162162
),
163163
inputs=[
164-
# Provide the binary payload expected by the document extractor custom skill.
165-
InputFieldMappingEntry(name="file_data", source="/document/file_data"),
166-
InputFieldMappingEntry(name="file_name", source="/document/metadata_storage_name"),
167-
InputFieldMappingEntry(name="content_type", source="/document/metadata_storage_content_type"),
164+
# Always provide the blob URL so the function can download large files (> 16MB)
165+
InputFieldMappingEntry(name="metadata_storage_path", source="/document/metadata_storage_path"),
166+
# We are not using the SaS token since the functions have RBAC access via managed identity
168167
],
169168
outputs=[
170169
OutputFieldMappingEntry(name="pages", target_name="pages"),
@@ -310,7 +309,7 @@ async def setup(self) -> None:
310309
configuration=IndexingParametersConfiguration(
311310
query_timeout=None, # type: ignore
312311
data_to_extract="storageMetadata",
313-
allow_skillset_to_read_file_data=True,
312+
allow_skillset_to_read_file_data=False,
314313
)
315314
),
316315
)

app/functions/document_extractor/function_app.py

Lines changed: 44 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -3,23 +3,25 @@
33
Custom skill for Azure AI Search that extracts and processes document content.
44
"""
55

6-
import base64
76
import io
87
import json
98
import logging
109
import os
1110
from dataclasses import dataclass
1211
from typing import Any
12+
from urllib.parse import unquote, urlparse
1313

1414
import azure.functions as func
1515
from azure.core.exceptions import HttpResponseError
1616
from azure.identity.aio import ManagedIdentityCredential
1717

18+
from prepdocslib.blobmanager import BlobManager
1819
from prepdocslib.fileprocessor import FileProcessor
1920
from prepdocslib.page import Page
2021
from prepdocslib.servicesetup import (
2122
build_file_processors,
2223
select_processor_for_filename,
24+
setup_blob_manager,
2325
)
2426

2527
app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS)
@@ -31,6 +33,7 @@
3133
class GlobalSettings:
3234
file_processors: dict[str, FileProcessor]
3335
azure_credential: ManagedIdentityCredential
36+
blob_manager: BlobManager
3437

3538

3639
settings: GlobalSettings | None = None
@@ -63,9 +66,18 @@ def configure_global_settings():
6366
process_figures=use_multimodal,
6467
)
6568

69+
blob_manager = setup_blob_manager(
70+
azure_credential=azure_credential,
71+
storage_account=os.environ["AZURE_STORAGE_ACCOUNT"],
72+
storage_container=os.environ["AZURE_STORAGE_CONTAINER"],
73+
storage_resource_group=os.environ["AZURE_STORAGE_RESOURCE_GROUP"],
74+
subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"],
75+
)
76+
6677
settings = GlobalSettings(
6778
file_processors=file_processors,
6879
azure_credential=azure_credential,
80+
blob_manager=blob_manager,
6981
)
7082

7183

@@ -75,20 +87,15 @@ async def extract_document(req: func.HttpRequest) -> func.HttpResponse:
7587
"""
7688
Azure Search Custom Skill: Extract document content
7789
78-
Input format (single record; file data only):
79-
# https://learn.microsoft.com/azure/search/cognitive-search-skill-document-intelligence-layout#skill-inputs
90+
Input format (single record):
8091
{
8192
"values": [
8293
{
8394
"recordId": "1",
8495
"data": {
85-
// Base64 encoded file (skillset must enable file data)
86-
"file_data": {
87-
"$type": "file",
88-
"data": "base64..."
89-
},
90-
// Optional
91-
"file_name": "doc.pdf"
96+
"metadata_storage_path": "https://<account>.blob.core.windows.net/<container>/<blob_path>",
97+
"metadata_storage_name": "document.pdf",
98+
"metadata_storage_content_type": "application/pdf"
9299
}
93100
}
94101
]
@@ -176,45 +183,52 @@ async def process_document(data: dict[str, Any]) -> dict[str, Any]:
176183
Process a single document: download, parse, extract figures, upload images
177184
178185
Args:
179-
data: Input data with blobUrl, fileName, contentType
186+
data: Input data with metadata_storage_path
180187
181188
Returns:
182189
Dictionary with 'text' (markdown) and 'images' (list of {url, description})
183190
"""
184-
document_stream, file_name, content_type = get_document_stream_filedata(data)
185-
logger.info("Processing document: %s", file_name)
191+
if settings is None:
192+
raise RuntimeError("Global settings not initialized")
193+
194+
# Get blob path from metadata_storage_path URL
195+
# URL format: https://<account>.blob.core.windows.net/<container>/<blob_path>
196+
storage_path = data["metadata_storage_path"]
197+
parsed_url = urlparse(storage_path)
198+
# Path is /<container>/<blob_path>, so split and take everything after container
199+
path_parts = unquote(parsed_url.path).lstrip("/").split("/", 1)
200+
if len(path_parts) < 2:
201+
raise ValueError(f"Invalid storage path format: {storage_path}")
202+
blob_path_within_container = path_parts[1] # Everything after the container name
203+
204+
logger.info("Downloading blob: %s", blob_path_within_container)
205+
result = await settings.blob_manager.download_blob(blob_path_within_container)
206+
if result is None:
207+
raise ValueError(f"Blob not found: {blob_path_within_container}")
208+
209+
document_bytes, properties = result
210+
document_stream = io.BytesIO(document_bytes)
211+
document_stream.name = blob_path_within_container
212+
213+
logger.info("Processing document: %s", blob_path_within_container)
186214

187215
# Get parser from file_processors dict based on file extension
188-
file_processor = select_processor_for_filename(file_name, settings.file_processors)
216+
file_processor = select_processor_for_filename(blob_path_within_container, settings.file_processors)
189217
parser = file_processor.parser
190218

191219
pages: list[Page] = []
192220
try:
193221
document_stream.seek(0)
194222
pages = [page async for page in parser.parse(content=document_stream)]
195223
except HttpResponseError as exc:
196-
raise ValueError(f"Parser failed for {file_name}: {exc.message}") from exc
224+
raise ValueError(f"Parser failed for {blob_path_within_container}: {exc.message}") from exc
197225
finally:
198226
document_stream.close()
199227

200-
components = build_document_components(file_name, pages)
228+
components = build_document_components(blob_path_within_container, pages)
201229
return components
202230

203231

204-
def get_document_stream_filedata(data: dict[str, Any]) -> tuple[io.BytesIO, str, str]:
205-
"""Return a BytesIO stream for file_data input only (skillset must send file bytes)."""
206-
file_payload = data.get("file_data", {})
207-
encoded = file_payload.get("data")
208-
if not encoded:
209-
raise ValueError("file_data payload missing base64 data")
210-
document_bytes = base64.b64decode(encoded)
211-
file_name = data.get("file_name") or data.get("fileName") or file_payload.get("name") or "document"
212-
content_type = data.get("contentType") or file_payload.get("contentType") or "application/octet-stream"
213-
stream = io.BytesIO(document_bytes)
214-
stream.name = file_name
215-
return stream, file_name, content_type
216-
217-
218232
def build_document_components(file_name: str, pages: list[Page]) -> dict[str, Any]:
219233
page_entries: list[dict[str, Any]] = []
220234
figure_entries: list[dict[str, Any]] = []

app/functions/document_extractor/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ async-timeout==5.0.1
2424
# via aiohttp
2525
attrs==25.3.0
2626
# via aiohttp
27-
azure-ai-documentintelligence==1.0.0b4
27+
azure-ai-documentintelligence==1.0.2
2828
# via -r requirements.in
2929
azure-cognitiveservices-speech==1.40.0
3030
# via -r requirements.in

app/functions/figure_processor/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ async-timeout==5.0.1
2424
# via aiohttp
2525
attrs==25.3.0
2626
# via aiohttp
27-
azure-ai-documentintelligence==1.0.0b4
27+
azure-ai-documentintelligence==1.0.2
2828
# via -r requirements.in
2929
azure-cognitiveservices-speech==1.40.0
3030
# via -r requirements.in

app/functions/text_processor/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ async-timeout==5.0.1
2424
# via aiohttp
2525
attrs==25.3.0
2626
# via aiohttp
27-
azure-ai-documentintelligence==1.0.0b4
27+
azure-ai-documentintelligence==1.0.2
2828
# via -r requirements.in
2929
azure-cognitiveservices-speech==1.40.0
3030
# via -r requirements.in

azure.yaml

Lines changed: 30 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -40,37 +40,36 @@ services:
4040
run: cd ../frontend;npm install;npm run build
4141
interactive: false
4242
continueOnError: false
43-
# Un-comment this section if using USE_CLOUD_INGESTION option
44-
# document-extractor:
45-
# project: ./app/functions/document_extractor
46-
# language: py
47-
# host: function
48-
# hooks:
49-
# prepackage:
50-
# shell: pwsh
51-
# run: python ../../../scripts/copy_prepdocslib.py
52-
# interactive: false
53-
# continueOnError: false
54-
# figure-processor:
55-
# project: ./app/functions/figure_processor
56-
# language: py
57-
# host: function
58-
# hooks:
59-
# prepackage:
60-
# shell: pwsh
61-
# run: python ../../../scripts/copy_prepdocslib.py
62-
# interactive: false
63-
# continueOnError: false
64-
# text-processor:
65-
# project: ./app/functions/text_processor
66-
# language: py
67-
# host: function
68-
# hooks:
69-
# prepackage:
70-
# shell: pwsh
71-
# run: python ../../../scripts/copy_prepdocslib.py
72-
# interactive: false
73-
# continueOnError: false
43+
document-extractor:
44+
project: ./app/functions/document_extractor
45+
language: py
46+
host: function
47+
hooks:
48+
prepackage:
49+
shell: pwsh
50+
run: python ../../../scripts/copy_prepdocslib.py
51+
interactive: false
52+
continueOnError: false
53+
figure-processor:
54+
project: ./app/functions/figure_processor
55+
language: py
56+
host: function
57+
hooks:
58+
prepackage:
59+
shell: pwsh
60+
run: python ../../../scripts/copy_prepdocslib.py
61+
interactive: false
62+
continueOnError: false
63+
text-processor:
64+
project: ./app/functions/text_processor
65+
language: py
66+
host: function
67+
hooks:
68+
prepackage:
69+
shell: pwsh
70+
run: python ../../../scripts/copy_prepdocslib.py
71+
interactive: false
72+
continueOnError: false
7473
hooks:
7574
preprovision:
7675
windows:

infra/main.bicep

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,8 @@ module appServicePlan 'core/host/appserviceplan.bicep' = if (deploymentTarget ==
469469
var appEnvVariables = {
470470
AZURE_STORAGE_ACCOUNT: storage.outputs.name
471471
AZURE_STORAGE_CONTAINER: storageContainerName
472+
AZURE_STORAGE_RESOURCE_GROUP: storageResourceGroup.name
473+
AZURE_SUBSCRIPTION_ID: subscription().subscriptionId
472474
AZURE_SEARCH_INDEX: searchIndexName
473475
AZURE_SEARCH_KNOWLEDGEBASE_NAME: knowledgeBaseName
474476
AZURE_SEARCH_SERVICE: searchService.outputs.name

0 commit comments

Comments
 (0)