@@ -83,9 +83,14 @@ async def parse(self, content: Any):
8383 ".pdf" : FileProcessor (StubParser ([page ]), None ),
8484 }
8585
86+ class MockBlobManager :
87+ async def download_blob (self , blob_path : str ):
88+ return (b"pdf-bytes" , {})
89+
8690 mock_settings = document_extractor .GlobalSettings (
8791 file_processors = mock_file_processors ,
8892 azure_credential = object (),
93+ blob_manager = MockBlobManager (),
8994 )
9095 monkeypatch .setattr (document_extractor , "settings" , mock_settings )
9196
@@ -94,9 +99,9 @@ async def parse(self, content: Any):
9499 {
95100 "recordId" : "record-1" ,
96101 "data" : {
97- "file_data " : { "$type" : "file" , "data" : base64 . b64encode ( b" pdf-bytes" ). decode ( "utf-8" )} ,
98- "file_name " : "sample.pdf" ,
99- "contentType " : "application/pdf" ,
102+ "metadata_storage_path " : "https://account.blob.core.windows.net/container/sample. pdf" ,
103+ "metadata_storage_name " : "sample.pdf" ,
104+ "metadata_storage_content_type " : "application/pdf" ,
100105 },
101106 }
102107 ]
@@ -128,6 +133,7 @@ async def test_document_extractor_requires_single_record(monkeypatch: pytest.Mon
128133 mock_settings = document_extractor .GlobalSettings (
129134 file_processors = {".pdf" : FileProcessor (None , None )},
130135 azure_credential = object (),
136+ blob_manager = object (),
131137 )
132138 monkeypatch .setattr (document_extractor , "settings" , mock_settings )
133139 response = await document_extractor .extract_document (build_request ({"values" : []}))
@@ -144,6 +150,7 @@ async def failing_process(data: dict[str, Any]) -> dict[str, Any]:
144150 mock_settings = document_extractor .GlobalSettings (
145151 file_processors = {".pdf" : FileProcessor (None , None )},
146152 azure_credential = object (),
153+ blob_manager = object (),
147154 )
148155 monkeypatch .setattr (document_extractor , "settings" , mock_settings )
149156 monkeypatch .setattr (document_extractor , "process_document" , failing_process )
@@ -153,9 +160,9 @@ async def failing_process(data: dict[str, Any]) -> dict[str, Any]:
153160 {
154161 "recordId" : "rec-error" ,
155162 "data" : {
156- "file_data " : { "$type" : "file" , "data" : base64 . b64encode ( b" pdf-bytes" ). decode ( "utf-8" )} ,
157- "file_name " : "sample.pdf" ,
158- "contentType " : "application/pdf" ,
163+ "metadata_storage_path " : "https://account.blob.core.windows.net/container/sample. pdf" ,
164+ "metadata_storage_name " : "sample.pdf" ,
165+ "metadata_storage_content_type " : "application/pdf" ,
159166 },
160167 }
161168 ]
@@ -186,16 +193,19 @@ async def parse(self, content):
186193 ".pdf" : FileProcessor (FailingParser (), None ),
187194 }
188195
196+ class MockBlobManager :
197+ async def download_blob (self , blob_path : str ):
198+ return (b"content" , {})
199+
189200 mock_settings = document_extractor .GlobalSettings (
190201 file_processors = mock_file_processors ,
191202 azure_credential = object (),
203+ blob_manager = MockBlobManager (),
192204 )
193205 monkeypatch .setattr (document_extractor , "settings" , mock_settings )
194206
195207 data = {
196- "file_data" : {"data" : base64 .b64encode (b"content" ).decode ("utf-8" )},
197- "file_name" : "doc.pdf" ,
198- "contentType" : "application/pdf" ,
208+ "metadata_storage_path" : "https://account.blob.core.windows.net/container/doc.pdf" ,
199209 }
200210
201211 with pytest .raises (ValueError ) as exc_info :
@@ -204,12 +214,16 @@ async def parse(self, content):
204214 assert "Parser failed" in str (exc_info .value )
205215
206216
207- def test_document_extractor_missing_file_data () -> None :
208- with pytest .raises (ValueError ):
209- document_extractor .get_document_stream_filedata ({"file_data" : {}})
217+ def test_document_extractor_managed_identity_reload (monkeypatch : pytest .MonkeyPatch ) -> None :
218+ # Set required environment variables
219+ monkeypatch .setenv ("AZURE_STORAGE_ACCOUNT" , "teststorage" )
220+ monkeypatch .setenv ("AZURE_STORAGE_CONTAINER" , "testcontainer" )
221+ monkeypatch .setenv ("AZURE_STORAGE_RESOURCE_GROUP" , "testrg" )
222+ monkeypatch .setenv ("AZURE_SUBSCRIPTION_ID" , "test-sub-id" )
210223
224+ # Mock setup_blob_manager to avoid actual Azure calls
225+ monkeypatch .setattr (document_extractor , "setup_blob_manager" , lambda ** kwargs : object ())
211226
212- def test_document_extractor_managed_identity_reload (monkeypatch : pytest .MonkeyPatch ) -> None :
213227 monkeypatch .setenv ("AZURE_CLIENT_ID" , "client-123" )
214228 document_extractor .configure_global_settings ()
215229 assert isinstance (document_extractor .settings .azure_credential , document_extractor .ManagedIdentityCredential )
@@ -471,9 +485,9 @@ async def test_document_extractor_without_settings(monkeypatch: pytest.MonkeyPat
471485 {
472486 "recordId" : "record-1" ,
473487 "data" : {
474- "file_data " : { "$type" : "file" , "data" : base64 . b64encode ( b" pdf-bytes" ). decode ( "utf-8" )} ,
475- "file_name " : "sample.pdf" ,
476- "contentType " : "application/pdf" ,
488+ "metadata_storage_path " : "https://account.blob.core.windows.net/container/sample. pdf" ,
489+ "metadata_storage_name " : "sample.pdf" ,
490+ "metadata_storage_content_type " : "application/pdf" ,
477491 },
478492 }
479493 ]
0 commit comments