diff --git a/.gitignore b/.gitignore index a57fe2b..5db8bef 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ # Local .terraform directories .terraform/ - +**__pycache__ # .tfstate files *.tfstate *.tfstate.* @@ -37,3 +37,101 @@ override.tf.json # Ignore CLI configuration files .terraformrc terraform.rc + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# Azure Functions +.azure/ +.vscode/ +local.settings.json +.azurefunctions/ + +# Testing artifacts +testing/results/ +testing/sample_pdfs/ +*.pdf +*.png +*.jpg +*.jpeg + +# Temporary files +temp/ +tmp/ +*.tmp +*.temp +*.bak +*.swp +*.swo +*~ + +# OS generated files +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db + +# IDE files +.idea/ +*.iml +*.ipr +*.iws + +# Log files +*.log +logs/ + +# Cache files +.cache/ +*.cache + +# Generated samples and cleanup scripts +remove_emojis.py +clean_makefile.py + +# Alternative/duplicate test files +extract_content_test.py +test_pdf_analysis.py +simple_pdf_analysis.py + +# Generated metrics and documentation +metrics.json +README-SETUP.md + +# Performance test results +*.perf +profile_output/ + +# Package files +*.tar.gz +*.zip +*.rar + +# VS Code specific files +.vscode/ +*.code-workspace diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000..3f63eb9 --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,6 @@ +{ + "recommendations": [ + "ms-azuretools.vscode-azurefunctions", + "ms-python.python" + ] +} \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..9a24428 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Attach to Python Functions", + "type": "debugpy", + 
"request": "attach", + "connect": { + "host": "localhost", + "port": 9091 + }, + "preLaunchTask": "func: host start" + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..1b400fa --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,9 @@ +{ + "azureFunctions.deploySubpath": "src", + "azureFunctions.scmDoBuildDuringDeployment": true, + "azureFunctions.pythonVenv": ".venv", + "azureFunctions.projectLanguage": "Python", + "azureFunctions.projectRuntime": "~4", + "debug.internalConsoleOptions": "neverOpen", + "azureFunctions.projectLanguageModel": 2 +} \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..478f3c1 --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,33 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "type": "func", + "label": "func: host start", + "command": "host start", + "problemMatcher": "$func-python-watch", + "isBackground": true, + "dependsOn": "pip install (functions)", + "options": { + "cwd": "${workspaceFolder}/src" + } + }, + { + "label": "pip install (functions)", + "type": "shell", + "osx": { + "command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt" + }, + "windows": { + "command": "${config:azureFunctions.pythonVenv}\\Scripts\\python -m pip install -r requirements.txt" + }, + "linux": { + "command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt" + }, + "problemMatcher": [], + "options": { + "cwd": "${workspaceFolder}/src" + } + } + ] +} \ No newline at end of file diff --git a/README.md b/README.md index 0331e46..a9ad849 100644 --- a/README.md +++ b/README.md @@ -1,453 +1,453 @@ -# Demo: PDF Layout Extraction with Doc Intelligence
Supporting Multiple Document Versions with Visual Selection Cues (full-code approach) - -`Azure Storage + Document Intelligence + Function App + Cosmos DB` - -Costa Rica - -[![GitHub](https://badgen.net/badge/icon/github?icon=github&label)](https://github.com) -[![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) -[brown9804](https://github.com/brown9804) - -Last updated: 2025-07-16 - ------------------------------ - -> This solution is designed to be flexible and robust, supporting multiple versions of PDF documents with varying layouts, including those that use visual selection cues such as gray fills, hand-drawn Xs, checkmarks, or circles. By building on the [PDFs-Layouts-Processing-Fapp-DocIntelligence](https://github.com/MicrosoftCloudEssentials-LearningHub/PDFs-Layouts-Processing-Fapp-DocIntelligence) repository, we ensure that: - -- Table structure and text are extracted using Azure Document Intelligence (Layout model). -- Visual selection cues are detected using Azure AI Vision or image preprocessing. -- Visual indicators are mapped to structured data, returning only the selected values in a clean JSON format. -- The logic is abstracted to support multiple layout variations, so the system adapts easily to new document formats and selection styles. - -> [!IMPORTANT] -> This example is based on a `public network site and is intended for demonstration purposes only`. It showcases how several Azure resources can work together to achieve the desired result. Consider the section below about [Important Considerations for Production Environment](#important-considerations-for-production-environment). Please note that `these demos are intended as a guide and are based on my personal experiences. 
For official guidance, support, or more detailed information, please refer to Microsoft's official documentation or contact Microsoft directly`: [Microsoft Sales and Support](https://support.microsoft.com/contactus?ContactUsExperienceEntryPointAssetId=S.HP.SMC-HOME) - -
-List of References (Click to expand) - -- [Use Azure AI services with SynapseML in Microsoft Fabric](https://learn.microsoft.com/en-us/fabric/data-science/how-to-use-ai-services-with-synapseml) -- [Plan and manage costs for Azure AI Foundry](https://learn.microsoft.com/en-us/azure/ai-foundry/how-to/costs-plan-manage) -- [Azure AI Document Intelligence documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/?view=doc-intel-4.0.0) -- [Get started with the Document Intelligence Sample Labeling tool](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/v21/try-sample-label-tool?view=doc-intel-2.1.0#prerequisites-for-training-a-custom-form-model) -- [Document Intelligence Sample Labeling tool](https://fott-2-1.azurewebsites.net/) -- [Assign an Azure role for access to blob data](https://learn.microsoft.com/en-us/azure/storage/blobs/assign-azure-role-data-access?tabs=portal) -- [Build and train a custom extraction model](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/build-a-custom-model?view=doc-intel-2.1.0) -- [Compose custom models - Document Intelligence](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/compose-custom-models?view=doc-intel-2.1.0&tabs=studio) -- [Deploy the Sample Labeling tool](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/v21/deploy-label-tool?view=doc-intel-2.1.0) -- [Train a custom model using the Sample Labeling tool](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/v21/label-tool?view=doc-intel-2.1.0) -- [Train models with the sample-labeling tool](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/v21/supervised-table-tags?view=doc-intel-2.1.0) -- [Azure Cosmos DB - Database for the AI Era](https://learn.microsoft.com/en-us/azure/cosmos-db/introduction) -- [Consistency levels in Azure Cosmos 
DB](https://learn.microsoft.com/en-us/azure/cosmos-db/consistency-levels) -- [Azure Cosmos DB SQL API client library for Python](https://learn.microsoft.com/en-us/python/api/overview/azure/cosmos-readme?view=azure-python) -- [CosmosClient class documentation](https://learn.microsoft.com/en-us/python/api/azure-cosmos/azure.cosmos.cosmos_client.cosmosclient?view=azure-python) -- [Cosmos AAD Authentication](https://learn.microsoft.com/en-us/python/api/overview/azure/cosmos-readme?view=azure-python#aad-authentication) -- [Cosmos python examples](https://learn.microsoft.com/en-us/python/api/overview/azure/cosmos-readme?view=azure-python#examples) -- [Use control plane role-based access control with Azure Cosmos DB for NoSQL](https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/security/how-to-grant-control-plane-role-based-access?tabs=built-in-definition%2Ccsharp&pivots=azure-interface-portal) -- [Use data plane role-based access control with Azure Cosmos DB for NoSQL](https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/security/how-to-grant-data-plane-role-based-access?tabs=built-in-definition%2Ccsharp&pivots=azure-interface-cli) -- [Create or update Azure custom roles using Azure CLI](https://learn.microsoft.com/en-us/azure/role-based-access-control/custom-roles-cli) -- [Document Intelligence query field extraction](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/concept/query-fields?view=doc-intel-4.0.0) -- [What's new in Azure AI Document Intelligence](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/whats-new?view=doc-intel-4.0.0) -- [Managed identities for Document Intelligence](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/authentication/managed-identities?view=doc-intel-4.0.0) - -
- -
-Table of Content (Click to expand) - -- [Important Considerations for Production Environment](#important-considerations-for-production-environment) -- [Prerequisites](#prerequisites) -- [Where to start?](#where-to-start) -- [Overview](#overview) -- [Function App Hosting Options](#function-app-hosting-options) -- [Function App: Configure/Validate the Environment variables](#function-app-configurevalidate-the-environment-variables) -- [Function App: Develop the logic](#function-app-develop-the-logic) -- [Test the solution](#test-the-solution) - -
- -> How to extract layout elements from PDFs stored in an Azure Storage Account, process them using Azure Document Intelligence, and store the results in Cosmos DB for further analysis. -> -> 1. Upload your PDFs to an Azure Blob Storage container.
-> 2. An Azure Function is triggered by the upload, which calls the Azure Document Intelligence Layout API to analyze the document structure.
-> 3. The extracted layout data (such as tables, checkboxes, and text) is parsed and subsequently stored in a Cosmos DB database, ensuring a seamless and automated workflow from document upload to data storage. - -> [!NOTE] -> Advantages of Document Intelligence for organizations handling with large volumes of documents:
-> -> - Utilizes natural language processing, computer vision, deep learning, and machine learning.
-> - Handles structured, semi-structured, and unstructured documents.
-> - Automates the extraction and transformation of layout data into usable formats like JSON or CSV. - -
- Centered Image -
- -> [!NOTE] -> Azure Event Grid System Topics are free to create and manage, a System Topic is automatically created and managed by Azure for certain Azure services that emit events. It represents a source of events from an Azure resource (like a Storage Account, Key Vault, or Azure Maps). `You don't need to create or manage the topic yourself, Azure does it for you when you enable event publishing on a supported resource.`
-> -> - Emits predefined event types (e.g., Microsoft.Storage.BlobCreated, Microsoft.Resources.ResourceWriteSuccess).
-> - You can attach event handlers (like Azure Functions, Logic Apps, Webhooks) to respond to these events.
-> - Works seamlessly with serverless architectures for real-time automation.
-> For example: -> Suppose you have a Storage Account and want to trigger a function every time a new blob is uploaded:
-> - Azure automatically creates a System Topic for the Storage Account. -> - You subscribe to the BlobCreated event. -> - When a blob is uploaded, Event Grid routes the event to your Azure Function. - -
- Centered Image -
- -## Important Considerations for Production Environment - -
- Private Network Configuration - - > For enhanced security, consider configuring your Azure resources to operate within a private network. This can be achieved using Azure Virtual Network (VNet) to isolate your resources and control inbound and outbound traffic. Implementing private endpoints for services like Azure Blob Storage and Azure Functions can further secure your data by restricting access to your VNet. - -
- -
- Security - - > Ensure that you implement appropriate security measures when deploying this solution in a production environment. This includes:
- > - > - Securing Access: Use Azure Entra ID (formerly known as Azure Active Directory or Azure AD) for authentication and role-based access control (RBAC) to manage permissions.
- > - Managing Secrets: Store sensitive information such as connection strings and API keys in Azure Key Vault.
- > - Data Encryption: Enable encryption for data at rest and in transit to protect sensitive information. - -
- -
- Scalability - - > While this example provides a basic setup, you may need to scale the resources based on your specific requirements. Azure services offer various scaling options to handle increased workloads. Consider using:
- > - > - Auto-scaling: Configure auto-scaling for Azure Functions and other services to automatically adjust based on demand.
- > - Load Balancing: Use Azure Load Balancer or Application Gateway to distribute traffic and ensure high availability. - -
- -
- Cost Management - - > Monitor and manage the costs associated with your Azure resources. Use Azure Cost Management and Billing to track usage and optimize resource allocation. - -
- -
- Compliance - - > Ensure that your deployment complies with relevant regulations and standards. Use Azure Policy to enforce compliance and governance policies across your resources. -
- -
- Disaster Recovery - -> Implement a disaster recovery plan to ensure business continuity in case of failures. Use Azure Site Recovery and backup solutions to protect your data and applications. - -
- -## Prerequisites - -- An `Azure subscription is required`. All other resources, including instructions for creating a Resource Group, are provided in this workshop. -- `Contributor role assigned or any custom role that allows`: access to manage all resources, and the ability to deploy resources within subscription. -- If you choose to use the Terraform approach, please ensure that: - - [Terraform is installed on your local machine](https://developer.hashicorp.com/terraform/tutorials/azure-get-started/install-cli#install-terraform). - - [Install the Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli) to work with both Terraform and Azure commands. - -## Where to start? - -1. Please follow the [Terraform guide](./terraform-infrastructure/) to deploy the necessary Azure resources for the workshop. -2. Next, as this method `skips the creation of each resource` manually. Proceed with the configuration from [Configure/Validate the Environment variables](#function-app-configurevalidate-the-environment-variables). - -> [!IMPORTANT] -> Regarding `Networking`, this example will cover `Public access configuration`, and `system-managed identity`. However, please ensure you `review your privacy requirements and adjust network and access settings as necessary for your specific case`. - -## Overview - -> Using Cosmos DB provides you with a flexible, scalable, and globally distributed database solution that can handle both structured and semi-structured data efficiently.
-> -> - `Azure Blob Storage`: Store the PDF invoices.
-> - `Azure Functions`: Trigger on new PDF uploads, extract data, and process it.
-> - `Azure SQL Database or Cosmos DB`: Store the extracted data for querying and analytics.
- -| Resource | Recommendation | -|---------------------------|----------------------------------------------------------------------------------------------------------------------| -| **Azure Blob Storage** | Use for storing the PDF files. This keeps your file storage separate from your data storage, which is a common best practice. | -| **Azure SQL Database** | Use if your data is highly structured and you need complex queries and transactions. | -| **Azure Cosmos DB** | Use if you need a globally distributed database with low latency and the ability to handle semi-structured data. | - -## Function App Hosting Options - -> In the context of Azure Function Apps, a `hosting option refers to the plan you choose to run your function app`. This choice affects how your function app is scaled, the resources available to each function app instance, and the support for advanced functionalities like virtual network connectivity and container support. - -> [!TIP] -> -> - `Scale to Zero`: Indicates whether the service can automatically scale down to zero instances when idle. -> - **IDLE** stands for: -> - **I** โ€“ Inactive -> - **D** โ€“ During -> - **L** โ€“ Low -> - **E** โ€“ Engagement -> - In other words, when the application is not actively handling requests or events (it's in a low-activity or paused state). -> - `Scale Behavior`: Describes how the service scales (e.g., `event-driven`, `dedicated`, or `containerized`). -> - `Virtual Networking`: Whether the service supports integration with virtual networks for secure communication. -> - `Dedicated Compute & Reserved Cold Start`: Availability of always-on compute to avoid cold starts and ensure low latency. -> - `Max Scale Out (Instances)`: Maximum number of instances the service can scale out to. -> - `Example AI Use Cases`: Real-world scenarios where each plan excels. - -
-Flex Consumption - -| Feature | Description | -|--------|-------------| -| **Scale to Zero** | `Yes` | -| **Scale Behavior** | `Fast event-driven` | -| **Virtual Networking** | `Optional` | -| **Dedicated Compute & Reserved Cold Start** | `Optional (Always Ready)` | -| **Max Scale Out (Instances)** | `1000` | -| **Example AI Use Cases** | `Real-time data processing` for AI models, `high-traffic AI-powered APIs`, `event-driven AI microservices`. Ideal for fraud detection, real-time recommendations, NLP, and computer vision services. | - -
- -
-Consumption - -| Feature | Description | -|--------|-------------| -| **Scale to Zero** | `Yes` | -| **Scale Behavior** | `Event-driven` | -| **Virtual Networking** | `Optional` | -| **Dedicated Compute & Reserved Cold Start** | `No` | -| **Max Scale Out (Instances)** | `200` | -| **Example AI Use Cases** | `Lightweight AI APIs`, `scheduled AI tasks`, `low-traffic AI event processing`. Great for sentiment analysis, simple image recognition, and batch ML tasks. | - -
- -
-Functions Premium - -| Feature | Description | -|--------|-------------| -| **Scale to Zero** | `No` | -| **Scale Behavior** | `Event-driven with premium options` | -| **Virtual Networking** | `Yes` | -| **Dedicated Compute & Reserved Cold Start** | `Yes` | -| **Max Scale Out (Instances)** | `100` | -| **Example AI Use Cases** | `Enterprise AI applications`, `low-latency AI APIs`, `VNet integration`. Ideal for secure, high-performance AI services like customer support and analytics. | - -
- -
-App Service - -| Feature | Description | -|--------|-------------| -| **Scale to Zero** | `No` | -| **Scale Behavior** | `Dedicated VMs` | -| **Virtual Networking** | `Yes` | -| **Dedicated Compute & Reserved Cold Start** | `Yes` | -| **Max Scale Out (Instances)** | `Varies` | -| **Example AI Use Cases** | `AI-powered web applications`, `dedicated resources`. Great for chatbots, personalized content, and intensive AI inference. | - -
- -
-Container Apps Env. - -| Feature | Description | -|--------|-------------| -| **Scale to Zero** | `No` | -| **Scale Behavior** | `Containerized microservices environment` | -| **Virtual Networking** | `Yes` | -| **Dedicated Compute & Reserved Cold Start** | `Yes` | -| **Max Scale Out (Instances)** | `Varies` | -| **Example AI Use Cases** | `AI microservices architecture`, `containerized AI workloads`, `complex AI workflows`. Ideal for orchestrating AI services like image processing, text analysis, and real-time analytics. | - -
- -## Function App: Configure/Validate the Environment variables - -> [!NOTE] -> This example is using system-assigned managed identity to assign RBACs (Role-based Access Control). - -- Under `Settings`, go to `Environment variables`. And `+ Add` the following variables: - - - `COSMOS_DB_ENDPOINT`: Your Cosmos DB account endpoint ๐Ÿกข `Review the existence of this, if not create it` - - `COSMOS_DB_KEY`: Your Cosmos DB account key ๐Ÿกข `Review the existence of this, if not create it` - - `COSMOS_DB_CONNECTION_STRING`: Your Cosmos DB connection string ๐Ÿกข `Review the existence of this, if not create it` - - `invoicecontosostorage_STORAGE`: Your Storage Account connection string ๐Ÿกข `Review the existence of this, if not create it` - - `FORM_RECOGNIZER_ENDPOINT`: For example: `https://.cognitiveservices.azure.com/` ๐Ÿกข `Review the existence of this, if not create it` - - `FORM_RECOGNIZER_KEY`: Your Documment Intelligence Key (Form Recognizer). ๐Ÿกข - - `FUNCTIONS_EXTENSION_VERSION`: `~4` ๐Ÿกข `Review the existence of this, if not create it` - - `WEBSITE_RUN_FROM_PACKAGE`: `1` ๐Ÿกข `Review the existence of this, if not create it` - - `FUNCTIONS_WORKER_RUNTIME`: `python` ๐Ÿกข `Review the existence of this, if not create it` - - `FUNCTIONS_NODE_BLOCK_ON_ENTRY_POINT_ERROR`: `true` (This setting ensures that all entry point errors are visible in your application insights logs). ๐Ÿกข `Review the existence of this, if not create it` - - image - - image - - image - - image - - - Click on `Apply` to save your configuration. - - image - -## Function App: Develop the logic - -- You need to install [VSCode](https://code.visualstudio.com/download) -- Install python from Microsoft store: - - image - -- Open VSCode, and install some extensions: `python`, and `Azure Tools`. - - image - - image - -- Click on the `Azure` icon, and `sign in` into your account. Allow the extension `Azure Resources` to sign in using Microsoft, it will open a browser window. 
After doing so, you will be able to see your subscription and resources. - - image - -- Under Workspace, click on `Create Function Project`, and choose a path in your local computer to develop your function. - - image - -- Choose the language, in this case is `python`: - - image - -- Select the model version, for this example let's use `v2`: - - image - -- For the python interpreter, let's use the one installed via `Microsoft Store`: - - image - -- Choose a template (e.g., **Blob trigger**) and configure it to trigger on new PDF uploads in your Blob container. - - image - -- Provide a function name, like `BlobTriggerContosoPDFInvoicesDocIntelligence`: - - image - -- Next, it will prompt you for the path of the blob container where you expect the function to be triggered after a file is uploaded. In this case is `pdfinvoices` as was previously created. - - image - -- Click on `Create new local app settings`, and then choose your subscription. - - image - -- Choose `Azure Storage Account for remote storage`, and select one. I'll be using the `invoicecontosostorage`. - - image - -- Then click on `Open in the current window`. You will see something like this: - - image - -- Now we need to update the function code to extract data from PDFs and store it in Cosmos DB, use this an example: - - > 1. **PDF Upload**: A PDF file is uploaded to the Azure Blob Storage container (`pdfinvoices`). - > 2. **Trigger Azure Function**: The upload triggers the Azure Function `BlobTriggerContosoPDFLayoutsDocIntelligence`. - > 3. **Initialize Clients**: Sets up connections to Azure Document Intelligence and Cosmos DB. - > - Initializes the `DocumentAnalysisClient` using the `FORM_RECOGNIZER_ENDPOINT` and `FORM_RECOGNIZER_KEY` environment variables. - > - Initializes the `CosmosClient` using Azure Active Directory (AAD) via `DefaultAzureCredential`. - > 4. **Read PDF from Blob Storage**: Reads the PDF content from the blob into a byte stream. - > 5. 
**Analyze PDF**: Uses Azure Document Intelligence to analyze the layout of the PDF. - > - Calls `begin_analyze_document` with the `prebuilt-layout` model. - > - Waits for the analysis to complete and retrieves the layout result. - > 6. **Extract Layout Data**: Parses and structures the layout data from the analysis result. - > - Extracts lines, tables, and selection marks from each page. - > - Logs styles (e.g., handwritten content) and organizes data into a structured dictionary. - > 7. **Save Data to Cosmos DB**: Saves the structured layout data to Cosmos DB. - > - Ensures the database (`ContosoDBDocIntellig`) and container (`Layouts`) exist or creates them. - > - Inserts or updates the layout data using `upsert_item`. - > 8. **Logging (Process and Errors)**: Logs each step of the process, including success messages and detailed error handling for debugging and monitoring. - - - Update the function_app.py, for example [see the code used in this demo](./src/function_app.py): - - | Template Blob Trigger | Function Code updated | - | --- | --- | - | image | image| - - - Now, let's update the `requirements.txt`, [see the code used in this demo](./src/requirements.txt): - - | Template `requirements.txt` | Updated `requirements.txt` | - | --- | --- | - | image | image| - - - Since this function has already been tested, you can deploy your code to the function app in your subscription. If you want to test, you can use run your function locally for testing. - - Click on the `Azure` icon. - - Under `workspace`, click on the `Function App` icon. - - Click on `Deploy to Azure`. - - image - - - Select your `subscription`, your `function app`, and accept the prompt to overwrite: - - image - - - After completing, you see the status in your terminal: - - image - - image - -> [!IMPORTANT] -> If you need further assistance with the code, please click [here to view all the function code](./src/). 
- -> [!NOTE] -> Please ensure that all specified roles are assigned to the Function App. The provided example used `System assigned` for the Function App to facilitate the role assignment. - -## Test the solution - -> [!IMPORTANT] -> Please ensure that the user/system admin responsible for uploading the PDFs to the blob container has the necessary permissions. The error below illustrates what might occur if these roles are missing.
-> image
-> In that case, go to `Access Control (IAM)`, click on `+ Add`, and `Add role assignment`:
-> image
-> Search for `Storage Blob Data Contributor`, click `Next`.
-> image
-> Then, click on `select members` and search for your user/systen admin. Finally click on `Review + assign`. - -> Upload sample PDF invoices to the Blob container and verify that data is correctly ingested and stored in Cosmos DB. - -- Click on `Upload`, then select `Browse for files` and choose your PDF invoices to be stored in the blob container, which will trigger the function app to parse them. - - image - -- Check the logs, and traces from your function with `Application Insights`: - - image - -- Under `Investigate`, click on `Performance`. Filter by time range, and `drill into the samples`. Sort the results by date (if you have many, like in my case) and click on the last one. - - image - -- Click on `View all`: - - image - -- Check all the logs, and traces generated. Also review the information parsed: - - image - -- Validate that the information was uploaded to the Cosmos DB. Under `Data Explorer`, check your `Database`. - - image - +# Demo: PDF Layout Extraction with Doc Intelligence
Supporting Multiple Document Versions with Visual Selection Cues (full-code approach) + +`Azure Storage + Document Intelligence + Function App + Cosmos DB` + +Costa Rica + +[![GitHub](https://badgen.net/badge/icon/github?icon=github&label)](https://github.com) +[![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) +[brown9804](https://github.com/brown9804) + +Last updated: 2025-07-25 + +----------------------------- + +> This solution is designed to be flexible and robust, supporting multiple versions of PDF documents with varying layouts, including those that use visual selection cues such as gray fills, hand-drawn Xs, checkmarks, or circles. By building on the [PDFs-Layouts-Processing-Fapp-DocIntelligence](https://github.com/MicrosoftCloudEssentials-LearningHub/PDFs-Layouts-Processing-Fapp-DocIntelligence) repository, we ensure that: + +- Table structure and text are extracted using Azure Document Intelligence (Layout model). +- Visual selection cues are detected using Azure AI Vision or image preprocessing. +- Visual indicators are mapped to structured data, returning only the selected values in a clean JSON format. +- The logic is abstracted to support multiple layout variations, so the system adapts easily to new document formats and selection styles. + +> [!IMPORTANT] +> This example is based on a `public network site and is intended for demonstration purposes only`. It showcases how several Azure resources can work together to achieve the desired result. Consider the section below about [Important Considerations for Production Environment](#important-considerations-for-production-environment). Please note that `these demos are intended as a guide and are based on my personal experiences. 
For official guidance, support, or more detailed information, please refer to Microsoft's official documentation or contact Microsoft directly`: [Microsoft Sales and Support](https://support.microsoft.com/contactus?ContactUsExperienceEntryPointAssetId=S.HP.SMC-HOME) + +
+List of References (Click to expand) + +- [Use Azure AI services with SynapseML in Microsoft Fabric](https://learn.microsoft.com/en-us/fabric/data-science/how-to-use-ai-services-with-synapseml) +- [Plan and manage costs for Azure AI Foundry](https://learn.microsoft.com/en-us/azure/ai-foundry/how-to/costs-plan-manage) +- [Azure AI Document Intelligence documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/?view=doc-intel-4.0.0) +- [Get started with the Document Intelligence Sample Labeling tool](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/v21/try-sample-label-tool?view=doc-intel-2.1.0#prerequisites-for-training-a-custom-form-model) +- [Document Intelligence Sample Labeling tool](https://fott-2-1.azurewebsites.net/) +- [Assign an Azure role for access to blob data](https://learn.microsoft.com/en-us/azure/storage/blobs/assign-azure-role-data-access?tabs=portal) +- [Build and train a custom extraction model](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/build-a-custom-model?view=doc-intel-2.1.0) +- [Compose custom models - Document Intelligence](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/compose-custom-models?view=doc-intel-2.1.0&tabs=studio) +- [Deploy the Sample Labeling tool](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/v21/deploy-label-tool?view=doc-intel-2.1.0) +- [Train a custom model using the Sample Labeling tool](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/v21/label-tool?view=doc-intel-2.1.0) +- [Train models with the sample-labeling tool](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/v21/supervised-table-tags?view=doc-intel-2.1.0) +- [Azure Cosmos DB - Database for the AI Era](https://learn.microsoft.com/en-us/azure/cosmos-db/introduction) +- [Consistency levels in Azure Cosmos 
DB](https://learn.microsoft.com/en-us/azure/cosmos-db/consistency-levels) +- [Azure Cosmos DB SQL API client library for Python](https://learn.microsoft.com/en-us/python/api/overview/azure/cosmos-readme?view=azure-python) +- [CosmosClient class documentation](https://learn.microsoft.com/en-us/python/api/azure-cosmos/azure.cosmos.cosmos_client.cosmosclient?view=azure-python) +- [Cosmos AAD Authentication](https://learn.microsoft.com/en-us/python/api/overview/azure/cosmos-readme?view=azure-python#aad-authentication) +- [Cosmos python examples](https://learn.microsoft.com/en-us/python/api/overview/azure/cosmos-readme?view=azure-python#examples) +- [Use control plane role-based access control with Azure Cosmos DB for NoSQL](https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/security/how-to-grant-control-plane-role-based-access?tabs=built-in-definition%2Ccsharp&pivots=azure-interface-portal) +- [Use data plane role-based access control with Azure Cosmos DB for NoSQL](https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/security/how-to-grant-data-plane-role-based-access?tabs=built-in-definition%2Ccsharp&pivots=azure-interface-cli) +- [Create or update Azure custom roles using Azure CLI](https://learn.microsoft.com/en-us/azure/role-based-access-control/custom-roles-cli) +- [Document Intelligence query field extraction](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/concept/query-fields?view=doc-intel-4.0.0) +- [What's new in Azure AI Document Intelligence](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/whats-new?view=doc-intel-4.0.0) +- [Managed identities for Document Intelligence](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/authentication/managed-identities?view=doc-intel-4.0.0) + +
+ +
+Table of Content (Click to expand) + +- [Important Considerations for Production Environment](#important-considerations-for-production-environment) +- [Prerequisites](#prerequisites) +- [Where to start?](#where-to-start) +- [Overview](#overview) +- [Function App Hosting Options](#function-app-hosting-options) +- [Function App: Configure/Validate the Environment variables](#function-app-configurevalidate-the-environment-variables) +- [Function App: Develop the logic](#function-app-develop-the-logic) +- [Test the solution](#test-the-solution) + +
+ +> How to extract layout elements from PDFs stored in an Azure Storage Account, process them using Azure Document Intelligence, and store the results in Cosmos DB for further analysis. +> +> 1. Upload your PDFs to an Azure Blob Storage container.
+> 2. An Azure Function is triggered by the upload, which calls the Azure Document Intelligence Layout API to analyze the document structure.
+> 3. The extracted layout data (such as tables, checkboxes, and text) is parsed and subsequently stored in a Cosmos DB database, ensuring a seamless and automated workflow from document upload to data storage. + +> [!NOTE] +> Advantages of Document Intelligence for organizations handling large volumes of documents:
+> +> - Utilizes natural language processing, computer vision, deep learning, and machine learning.
+> - Handles structured, semi-structured, and unstructured documents.
+> - Automates the extraction and transformation of layout data into usable formats like JSON or CSV. + +
+ Centered Image +
+ +> [!NOTE] +> Azure Event Grid System Topics are free to create and manage; a System Topic is automatically created and managed by Azure for certain Azure services that emit events. It represents a source of events from an Azure resource (like a Storage Account, Key Vault, or Azure Maps). `You don't need to create or manage the topic yourself, Azure does it for you when you enable event publishing on a supported resource.`
+> +> - Emits predefined event types (e.g., Microsoft.Storage.BlobCreated, Microsoft.Resources.ResourceWriteSuccess).
+> - You can attach event handlers (like Azure Functions, Logic Apps, Webhooks) to respond to these events.
+> - Works seamlessly with serverless architectures for real-time automation.
+> For example: +> Suppose you have a Storage Account and want to trigger a function every time a new blob is uploaded:
+> - Azure automatically creates a System Topic for the Storage Account. +> - You subscribe to the BlobCreated event. +> - When a blob is uploaded, Event Grid routes the event to your Azure Function. + +
+ Centered Image +
+ +## Important Considerations for Production Environment + +
+ Private Network Configuration + + > For enhanced security, consider configuring your Azure resources to operate within a private network. This can be achieved using Azure Virtual Network (VNet) to isolate your resources and control inbound and outbound traffic. Implementing private endpoints for services like Azure Blob Storage and Azure Functions can further secure your data by restricting access to your VNet. + +
+ +
+ Security + + > Ensure that you implement appropriate security measures when deploying this solution in a production environment. This includes:
+ > + > - Securing Access: Use Azure Entra ID (formerly known as Azure Active Directory or Azure AD) for authentication and role-based access control (RBAC) to manage permissions.
+ > - Managing Secrets: Store sensitive information such as connection strings and API keys in Azure Key Vault.
+ > - Data Encryption: Enable encryption for data at rest and in transit to protect sensitive information. + +
+ +
+ Scalability + + > While this example provides a basic setup, you may need to scale the resources based on your specific requirements. Azure services offer various scaling options to handle increased workloads. Consider using:
+ > + > - Auto-scaling: Configure auto-scaling for Azure Functions and other services to automatically adjust based on demand.
+ > - Load Balancing: Use Azure Load Balancer or Application Gateway to distribute traffic and ensure high availability. + +
+ +
+ Cost Management + + > Monitor and manage the costs associated with your Azure resources. Use Azure Cost Management and Billing to track usage and optimize resource allocation. + +
+ +
+ Compliance + + > Ensure that your deployment complies with relevant regulations and standards. Use Azure Policy to enforce compliance and governance policies across your resources. +
+ +
+ Disaster Recovery + +> Implement a disaster recovery plan to ensure business continuity in case of failures. Use Azure Site Recovery and backup solutions to protect your data and applications. + +
+ +## Prerequisites + +- An `Azure subscription is required`. All other resources, including instructions for creating a Resource Group, are provided in this workshop. +- `Contributor role assigned or any custom role that allows`: access to manage all resources, and the ability to deploy resources within subscription. +- If you choose to use the Terraform approach, please ensure that: + - [Terraform is installed on your local machine](https://developer.hashicorp.com/terraform/tutorials/azure-get-started/install-cli#install-terraform). + - [Install the Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli) to work with both Terraform and Azure commands. + +## Where to start? + +1. Please follow the [Terraform guide](./terraform-infrastructure/) to deploy the necessary Azure resources for the workshop. +2. Next, as this method `skips the creation of each resource` manually. Proceed with the configuration from [Configure/Validate the Environment variables](#function-app-configurevalidate-the-environment-variables). + +> [!IMPORTANT] +> Regarding `Networking`, this example will cover `Public access configuration`, and `system-managed identity`. However, please ensure you `review your privacy requirements and adjust network and access settings as necessary for your specific case`. + +## Overview + +> Using Cosmos DB provides you with a flexible, scalable, and globally distributed database solution that can handle both structured and semi-structured data efficiently.
+> +> - `Azure Blob Storage`: Store the PDF invoices.
+> - `Azure Functions`: Trigger on new PDF uploads, extract data, and process it.
+> - `Azure SQL Database or Cosmos DB`: Store the extracted data for querying and analytics.
+ +| Resource | Recommendation | +|---------------------------|----------------------------------------------------------------------------------------------------------------------| +| **Azure Blob Storage** | Use for storing the PDF files. This keeps your file storage separate from your data storage, which is a common best practice. | +| **Azure SQL Database** | Use if your data is highly structured and you need complex queries and transactions. | +| **Azure Cosmos DB** | Use if you need a globally distributed database with low latency and the ability to handle semi-structured data. | + +## Function App Hosting Options + +> In the context of Azure Function Apps, a `hosting option refers to the plan you choose to run your function app`. This choice affects how your function app is scaled, the resources available to each function app instance, and the support for advanced functionalities like virtual network connectivity and container support. + +> [!TIP] +> +> - `Scale to Zero`: Indicates whether the service can automatically scale down to zero instances when idle. +> - **IDLE** stands for: +> - **I** โ€“ Inactive +> - **D** โ€“ During +> - **L** โ€“ Low +> - **E** โ€“ Engagement +> - In other words, when the application is not actively handling requests or events (it's in a low-activity or paused state). +> - `Scale Behavior`: Describes how the service scales (e.g., `event-driven`, `dedicated`, or `containerized`). +> - `Virtual Networking`: Whether the service supports integration with virtual networks for secure communication. +> - `Dedicated Compute & Reserved Cold Start`: Availability of always-on compute to avoid cold starts and ensure low latency. +> - `Max Scale Out (Instances)`: Maximum number of instances the service can scale out to. +> - `Example AI Use Cases`: Real-world scenarios where each plan excels. + +
+Flex Consumption + +| Feature | Description | +|--------|-------------| +| **Scale to Zero** | `Yes` | +| **Scale Behavior** | `Fast event-driven` | +| **Virtual Networking** | `Optional` | +| **Dedicated Compute & Reserved Cold Start** | `Optional (Always Ready)` | +| **Max Scale Out (Instances)** | `1000` | +| **Example AI Use Cases** | `Real-time data processing` for AI models, `high-traffic AI-powered APIs`, `event-driven AI microservices`. Ideal for fraud detection, real-time recommendations, NLP, and computer vision services. | + +
+ +
+Consumption + +| Feature | Description | +|--------|-------------| +| **Scale to Zero** | `Yes` | +| **Scale Behavior** | `Event-driven` | +| **Virtual Networking** | `Optional` | +| **Dedicated Compute & Reserved Cold Start** | `No` | +| **Max Scale Out (Instances)** | `200` | +| **Example AI Use Cases** | `Lightweight AI APIs`, `scheduled AI tasks`, `low-traffic AI event processing`. Great for sentiment analysis, simple image recognition, and batch ML tasks. | + +
+ +
+Functions Premium + +| Feature | Description | +|--------|-------------| +| **Scale to Zero** | `No` | +| **Scale Behavior** | `Event-driven with premium options` | +| **Virtual Networking** | `Yes` | +| **Dedicated Compute & Reserved Cold Start** | `Yes` | +| **Max Scale Out (Instances)** | `100` | +| **Example AI Use Cases** | `Enterprise AI applications`, `low-latency AI APIs`, `VNet integration`. Ideal for secure, high-performance AI services like customer support and analytics. | + +
+ +
+App Service + +| Feature | Description | +|--------|-------------| +| **Scale to Zero** | `No` | +| **Scale Behavior** | `Dedicated VMs` | +| **Virtual Networking** | `Yes` | +| **Dedicated Compute & Reserved Cold Start** | `Yes` | +| **Max Scale Out (Instances)** | `Varies` | +| **Example AI Use Cases** | `AI-powered web applications`, `dedicated resources`. Great for chatbots, personalized content, and intensive AI inference. | + +
+ +
+Container Apps Env. + +| Feature | Description | +|--------|-------------| +| **Scale to Zero** | `No` | +| **Scale Behavior** | `Containerized microservices environment` | +| **Virtual Networking** | `Yes` | +| **Dedicated Compute & Reserved Cold Start** | `Yes` | +| **Max Scale Out (Instances)** | `Varies` | +| **Example AI Use Cases** | `AI microservices architecture`, `containerized AI workloads`, `complex AI workflows`. Ideal for orchestrating AI services like image processing, text analysis, and real-time analytics. | + +
+ +## Function App: Configure/Validate the Environment variables + +> [!NOTE] +> This example is using system-assigned managed identity to assign RBACs (Role-based Access Control). + +- Under `Settings`, go to `Environment variables`. And `+ Add` the following variables: + + - `COSMOS_DB_ENDPOINT`: Your Cosmos DB account endpoint ๐Ÿกข `Review the existence of this, if not create it` + - `COSMOS_DB_KEY`: Your Cosmos DB account key ๐Ÿกข `Review the existence of this, if not create it` + - `COSMOS_DB_CONNECTION_STRING`: Your Cosmos DB connection string ๐Ÿกข `Review the existence of this, if not create it` + - `invoicecontosostorage_STORAGE`: Your Storage Account connection string ๐Ÿกข `Review the existence of this, if not create it` + - `FORM_RECOGNIZER_ENDPOINT`: For example: `https://.cognitiveservices.azure.com/` ๐Ÿกข `Review the existence of this, if not create it` + - `FORM_RECOGNIZER_KEY`: Your Documment Intelligence Key (Form Recognizer). ๐Ÿกข + - `FUNCTIONS_EXTENSION_VERSION`: `~4` ๐Ÿกข `Review the existence of this, if not create it` + - `WEBSITE_RUN_FROM_PACKAGE`: `1` ๐Ÿกข `Review the existence of this, if not create it` + - `FUNCTIONS_WORKER_RUNTIME`: `python` ๐Ÿกข `Review the existence of this, if not create it` + - `FUNCTIONS_NODE_BLOCK_ON_ENTRY_POINT_ERROR`: `true` (This setting ensures that all entry point errors are visible in your application insights logs). ๐Ÿกข `Review the existence of this, if not create it` + + image + + image + + image + + image + + - Click on `Apply` to save your configuration. + + image + +## Function App: Develop the logic + +- You need to install [VSCode](https://code.visualstudio.com/download) +- Install python from Microsoft store: + + image + +- Open VSCode, and install some extensions: `python`, and `Azure Tools`. + + image + + image + +- Click on the `Azure` icon, and `sign in` into your account. Allow the extension `Azure Resources` to sign in using Microsoft, it will open a browser window. 
After doing so, you will be able to see your subscription and resources. + + image + +- Under Workspace, click on `Create Function Project`, and choose a path in your local computer to develop your function. + + image + +- Choose the language, in this case is `python`: + + image + +- Select the model version, for this example let's use `v2`: + + image + +- For the python interpreter, let's use the one installed via `Microsoft Store`: + + image + +- Choose a template (e.g., **Blob trigger**) and configure it to trigger on new PDF uploads in your Blob container. + + image + +- Provide a function name, like `BlobTriggerContosoPDFInvoicesDocIntelligence`: + + image + +- Next, it will prompt you for the path of the blob container where you expect the function to be triggered after a file is uploaded. In this case is `pdfinvoices` as was previously created. + + image + +- Click on `Create new local app settings`, and then choose your subscription. + + image + +- Choose `Azure Storage Account for remote storage`, and select one. I'll be using the `invoicecontosostorage`. + + image + +- Then click on `Open in the current window`. You will see something like this: + + image + +- Now we need to update the function code to extract data from PDFs and store it in Cosmos DB, use this an example: + + > 1. **PDF Upload**: A PDF file is uploaded to the Azure Blob Storage container (`pdfinvoices`). + > 2. **Trigger Azure Function**: The upload triggers the Azure Function `BlobTriggerContosoPDFLayoutsDocIntelligence`. + > 3. **Initialize Clients**: Sets up connections to Azure Document Intelligence and Cosmos DB. + > - Initializes the `DocumentAnalysisClient` using the `FORM_RECOGNIZER_ENDPOINT` and `FORM_RECOGNIZER_KEY` environment variables. + > - Initializes the `CosmosClient` using Azure Active Directory (AAD) via `DefaultAzureCredential`. + > 4. **Read PDF from Blob Storage**: Reads the PDF content from the blob into a byte stream. + > 5. 
**Analyze PDF**: Uses Azure Document Intelligence to analyze the layout of the PDF. + > - Calls `begin_analyze_document` with the `prebuilt-layout` model. + > - Waits for the analysis to complete and retrieves the layout result. + > 6. **Extract Layout Data**: Parses and structures the layout data from the analysis result. + > - Extracts lines, tables, and selection marks from each page. + > - Logs styles (e.g., handwritten content) and organizes data into a structured dictionary. + > 7. **Save Data to Cosmos DB**: Saves the structured layout data to Cosmos DB. + > - Ensures the database (`ContosoDBDocIntellig`) and container (`Layouts`) exist or creates them. + > - Inserts or updates the layout data using `upsert_item`. + > 8. **Logging (Process and Errors)**: Logs each step of the process, including success messages and detailed error handling for debugging and monitoring. + + - Update the function_app.py, for example [see the code used in this demo](./src/function_app.py): + + | Template Blob Trigger | Function Code updated | + | --- | --- | + | image | image| + + - Now, let's update the `requirements.txt`, [see the code used in this demo](./src/requirements.txt): + + | Template `requirements.txt` | Updated `requirements.txt` | + | --- | --- | + | image | image| + + - Since this function has already been tested, you can deploy your code to the function app in your subscription. If you want to test, you can use run your function locally for testing. + - Click on the `Azure` icon. + - Under `workspace`, click on the `Function App` icon. + - Click on `Deploy to Azure`. + + image + + - Select your `subscription`, your `function app`, and accept the prompt to overwrite: + + image + + - After completing, you see the status in your terminal: + + image + + image + +> [!IMPORTANT] +> If you need further assistance with the code, please click [here to view all the function code](./src/). 
+ +> [!NOTE] +> Please ensure that all specified roles are assigned to the Function App. The provided example used `System assigned` for the Function App to facilitate the role assignment. + +## Test the solution + +> [!IMPORTANT] +> Please ensure that the user/system admin responsible for uploading the PDFs to the blob container has the necessary permissions. The error below illustrates what might occur if these roles are missing.
+> image
+> In that case, go to `Access Control (IAM)`, click on `+ Add`, and `Add role assignment`:
+> image
+> Search for `Storage Blob Data Contributor`, click `Next`.
+> image
+> Then, click on `select members` and search for your user/systen admin. Finally click on `Review + assign`. + +> Upload sample PDF invoices to the Blob container and verify that data is correctly ingested and stored in Cosmos DB. + +- Click on `Upload`, then select `Browse for files` and choose your PDF invoices to be stored in the blob container, which will trigger the function app to parse them. + + image + +- Check the logs, and traces from your function with `Application Insights`: + + image + +- Under `Investigate`, click on `Performance`. Filter by time range, and `drill into the samples`. Sort the results by date (if you have many, like in my case) and click on the last one. + + image + +- Click on `View all`: + + image + +- Check all the logs, and traces generated. Also review the information parsed: + + image + +- Validate that the information was uploaded to the Cosmos DB. Under `Data Explorer`, check your `Database`. + + image +
- Total views + Total views

Refresh Date: 2025-07-25

- + diff --git a/docs/automated-PDFLayoutprocessing-FunctionApp-DocIntellig-AI-Vision.drawio b/docs/automated-PDFLayoutprocessing-FunctionApp-DocIntellig-AI-Vision.drawio index b05d1f0..2f8fa3c 100644 --- a/docs/automated-PDFLayoutprocessing-FunctionApp-DocIntellig-AI-Vision.drawio +++ b/docs/automated-PDFLayoutprocessing-FunctionApp-DocIntellig-AI-Vision.drawio @@ -1,6 +1,6 @@ - + - + @@ -27,10 +27,10 @@ - + - + @@ -75,12 +75,13 @@ - + - + + @@ -89,16 +90,23 @@ - - + + - + - + + + + + + + + diff --git a/src/function_app.py b/src/function_app.py index 3e86267..69e73ad 100644 --- a/src/function_app.py +++ b/src/function_app.py @@ -1,15 +1,16 @@ import logging import azure.functions as func -from azure.ai.formrecognizer import DocumentAnalysisClient +from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzeResult from azure.core.credentials import AzureKeyCredential from azure.cosmos import CosmosClient, PartitionKey, exceptions from azure.identity import DefaultAzureCredential import os import uuid import json - -# For image conversion and vision API -from typing import List +from datetime import datetime +import time +from typing import List, Dict, Any, Optional +from PIL import Image from io import BytesIO import requests # For REST API to Vision from pdf2image import convert_from_bytes # For PDF to image conversion @@ -17,7 +18,7 @@ app = func.FunctionApp(http_auth_level=func.AuthLevel.FUNCTION) ## DEFINITIONS -def initialize_form_recognizer_client(): +def initialize_form_recognizer_client() -> DocumentAnalysisClient: endpoint = os.getenv("FORM_RECOGNIZER_ENDPOINT") key = os.getenv("FORM_RECOGNIZER_KEY") if not isinstance(key, str): @@ -25,11 +26,11 @@ def initialize_form_recognizer_client(): logging.info(f"Form Recognizer endpoint: {endpoint}") return DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key)) -def read_pdf_content(myblob): +def read_pdf_content(myblob: func.InputStream) -> bytes: logging.info(f"Reading PDF content 
from blob: {myblob.name}") return myblob.read() -def analyze_pdf(form_recognizer_client, pdf_bytes): +def analyze_pdf(form_recognizer_client: DocumentAnalysisClient, pdf_bytes: bytes) -> AnalyzeResult: logging.info("Starting PDF layout analysis.") poller = form_recognizer_client.begin_analyze_document( model_id="prebuilt-layout", @@ -38,22 +39,33 @@ def analyze_pdf(form_recognizer_client, pdf_bytes): logging.info("PDF layout analysis in progress.") result = poller.result() logging.info("PDF layout analysis completed.") - logging.info(f"Document has {len(result.pages)} page(s), {len(result.tables)} table(s), and {len(result.styles)} style(s).") + num_pages = len(result.pages) if hasattr(result, "pages") and isinstance(result.pages, list) else 0 + num_tables = len(result.tables) if hasattr(result, "tables") and isinstance(result.tables, list) else 0 + num_styles = len(result.styles) if hasattr(result, "styles") and result.styles is not None else 0 + logging.info(f"Document has {num_pages} page(s), {num_tables} table(s), and {num_styles} style(s).") return result -def extract_layout_data(result, visual_cues: List[dict] = None): +def extract_layout_data(result: AnalyzeResult, visual_cues: Optional[List[Dict[str, Any]]] = None, source_file: str = "unknown") -> Dict[str, Any]: logging.info("Extracting layout data from analysis result.") layout_data = { "id": str(uuid.uuid4()), + "metadata": { + "processed_at": datetime.utcnow().isoformat(), + "source_file": source_file, + "pages_count": len(result.pages) if hasattr(result, "pages") else 0, + "tables_count": len(result.tables) if hasattr(result, "tables") else 0, + "visual_cues_count": len(visual_cues) if visual_cues else 0 + }, "pages": [] } visual_cues = visual_cues or [] # List of dicts with visual cue info per cell # Log styles - for idx, style in enumerate(result.styles): - content_type = "handwritten" if style.is_handwritten else "no handwritten" - logging.info(f"Document contains {content_type} content") + if 
hasattr(result, "styles") and result.styles: + for idx, style in enumerate(result.styles): + content_type = "handwritten" if style.is_handwritten else "no handwritten" + logging.info(f"Document contains {content_type} content") # Process each page for page in result.pages: @@ -65,7 +77,7 @@ def extract_layout_data(result, visual_cues: List[dict] = None): "selection_marks": [ {"state": mark.state, "confidence": mark.confidence} for mark in page.selection_marks - ] + ] if hasattr(page, 'selection_marks') and page.selection_marks else [] } # Log extracted lines @@ -73,16 +85,17 @@ def extract_layout_data(result, visual_cues: List[dict] = None): logging.info(f"Line {line_idx}: '{line.content}'") # Log selection marks - for selection_mark in page.selection_marks: - logging.info( - f"Selection mark is '{selection_mark.state}' with confidence {selection_mark.confidence}" - ) + if hasattr(page, 'selection_marks') and page.selection_marks: + for selection_mark in page.selection_marks: + logging.info( + f"Selection mark is '{selection_mark.state}' with confidence {selection_mark.confidence}" + ) # Extract tables page_tables = [ table for table in result.tables if any(region.page_number == page.page_number for region in table.bounding_regions) - ] + ] if hasattr(result, 'tables') and result.tables else [] for table_index, table in enumerate(page_tables): logging.info(f"Table {table_index}: {table.row_count} rows, {table.column_count} columns") @@ -118,7 +131,7 @@ def extract_layout_data(result, visual_cues: List[dict] = None): return layout_data -def save_layout_data_to_cosmos(layout_data): +def save_layout_data_to_cosmos(layout_data: Dict[str, Any]) -> None: try: endpoint = os.getenv("COSMOS_DB_ENDPOINT") key = os.getenv("COSMOS_DB_KEY") @@ -164,35 +177,226 @@ def save_layout_data_to_cosmos(layout_data): except Exception as e: logging.error(f"Error inserting item into Cosmos DB: {e}") -## MAIN -@app.blob_trigger(arg_name="myblob", path="pdfinvoices/{name}", - 
connection="invoicecontosostorage_STORAGE") -def call_vision_api(image_bytes, subscription_key, endpoint): +def call_vision_api(image_bytes: bytes, subscription_key: str, endpoint: str, max_retries: int = 3) -> Dict[str, Any]: vision_url = endpoint + "/vision/v3.2/analyze" headers = { 'Ocp-Apim-Subscription-Key': subscription_key, 'Content-Type': 'application/octet-stream' } params = { - 'visualFeatures': 'Objects,Color', # Add more features if needed + 'visualFeatures': 'Objects,Color,Text', # Added Text feature for better text detection + 'language': 'en', + 'model-version': 'latest' } - response = requests.post(vision_url, headers=headers, params=params, data=image_bytes) - response.raise_for_status() - return response.json() - -def extract_visual_cues_from_vision(vision_result, page_number): - # Example: Detect gray fills, checkmarks, hand-drawn marks - cues = [] - # This is a placeholder. You need to parse vision_result for your cues. - # For example, if vision_result['objects'] contains a 'checkmark' or color info for gray fill - # cues.append({"page_number": page_number, "row_index": ..., "column_index": ..., "cue_type": "gray_fill"}) + + for attempt in range(max_retries): + try: + response = requests.post(vision_url, headers=headers, params=params, data=image_bytes) + response.raise_for_status() + return response.json() + except requests.exceptions.HTTPError as http_err: + if hasattr(http_err, 'response') and http_err.response.status_code == 429: # Too Many Requests + if attempt < max_retries - 1: + retry_after = int(http_err.response.headers.get('Retry-After', 1)) + logging.warning(f"Rate limit hit, waiting {retry_after} seconds...") + time.sleep(retry_after) + continue + logging.error(f"HTTP error occurred: {http_err}") + raise + except Exception as err: + logging.error(f"Error calling Vision API: {err}") + if attempt < max_retries - 1: + time.sleep(2 ** attempt) # Exponential backoff + continue + raise + + raise Exception("Max retries exceeded for Vision 
API call") + +def extract_visual_cues_from_vision(vision_result: Dict[str, Any], page_number: int) -> List[Dict[str, Any]]: + """ + Extract visual cues from Azure Vision API results with enhanced detection capabilities. + Detects: checkboxes, filled areas, handwritten text, signatures, tables, and form elements + + Args: + vision_result: The response from Azure Vision API + page_number: Current page being processed + + Returns: + List of detected visual cues with their properties and confidence scores + """ + cues: List[Dict[str, Any]] = [] + + if not vision_result: + logging.warning(f"Empty vision result for page {page_number}") + return cues + + # Enhanced object detection with better classification + if 'objects' in vision_result: + for obj in vision_result['objects']: + if 'rectangle' in obj: + rect = obj['rectangle'] + x, y = rect.get('x', 0), rect.get('y', 0) + w, h = rect.get('w', 0), rect.get('h', 0) + confidence = obj.get('confidence', 0.0) + + # Improved checkbox detection with confidence threshold + if 0.8 <= w/h <= 1.2 and 10 <= w <= 50 and 10 <= h <= 50 and confidence > 0.6: + cues.append({ + "page_number": page_number, + "x": x, + "y": y, + "width": w, + "height": h, + "cue_type": "checkbox", + "confidence": confidence, + "metadata": { + "aspect_ratio": w/h, + "area": w * h + } + }) + + # Detect possible table structures + elif w > 100 and h > 100 and 'table' in obj.get('tags', []): + cues.append({ + "page_number": page_number, + "x": x, + "y": y, + "width": w, + "height": h, + "cue_type": "table", + "confidence": confidence + }) + + # Enhanced color analysis for form elements + if 'color' in vision_result: + color_info = vision_result['color'] + dominant_colors = color_info.get('dominantColors', []) + for color in dominant_colors: + color_lower = color.lower() + if color_lower in ['gray', 'grey']: + cues.append({ + "page_number": page_number, + "cue_type": "filled_area", + "color": color_lower, + "confidence": 
color_info.get('dominantColorConfidence', 0.0), + "metadata": { + "color_scheme": color_info.get('accentColor'), + "is_black_and_white": color_info.get('isBWImg', False) + } + }) + + # Enhanced text analysis with better handwriting and signature detection + if 'text' in vision_result: + for text_result in vision_result.get('text', {}).get('lines', []): + content = text_result.get('content', '').strip() + confidence = text_result.get('confidence', 0.0) + + if text_result.get('isHandwritten', False): + cue_type = "signature" if _is_likely_signature(content) else "handwritten" + cues.append({ + "page_number": page_number, + "text": content, + "cue_type": cue_type, + "confidence": confidence, + "metadata": { + "length": len(content), + "position": text_result.get('boundingBox', {}), + "detected_language": text_result.get('language', 'unknown') + } + }) + + # Log what we found + if cues: + logging.info(f"Found {len(cues)} visual cues on page {page_number}: {[c['cue_type'] for c in cues]}") + else: + logging.info(f"No visual cues detected on page {page_number}") + return cues -def convert_pdf_to_images(pdf_bytes): +def _is_likely_signature(text: str) -> bool: + """ + Detect if the given text is likely to be a signature based on heuristics. 
+ + Args: + text: The text content to analyze + + Returns: + bool: True if the text matches signature patterns + """ + # Common signature indicators + signature_indicators = [ + lambda t: len(t.split()) <= 3, # Most signatures are 1-3 words + lambda t: any(c.isalpha() for c in t), # Contains letters + lambda t: len(t) < 50, # Not too long + lambda t: not t.isupper(), # Not all uppercase (unlikely for signatures) + lambda t: not any(c.isdigit() for c in t) # Usually no numbers in signatures + ] + + return all(indicator(text) for indicator in signature_indicators) + +def convert_pdf_to_images(pdf_bytes: bytes) -> List[Image.Image]: images = convert_from_bytes(pdf_bytes) return images -def BlobTriggerContosoPDFLayoutsDocIntelligence(myblob: func.InputStream): +def extract_skill_selections_from_table(table_data): + """ + Given a table_data dict (as in your layout_data['pages'][x]['tables'][y]), + returns a list of dicts: [{"skill": ..., "selected": ...}, ...] + Assumes first column is skill name, columns 2-7 are options 0-5. + """ + skills = [] + for row in range(table_data["row_count"]): + skill_name = None + selected = None + for cell in table_data["cells"]: + if cell["row_index"] == row: + col = cell["column_index"] + content = cell["content"].replace("\n", " ").strip() + # First column is skill name + if col == 0: + skill_name = content + # Columns 2-7 are options 0-5 + elif 2 <= col <= 7: + if ":selected:" in content: + selected = col - 2 # 0-based + if skill_name and selected is not None: + skills.append({"skill": skill_name, "selected": selected}) + return skills + +def infer_table_title(table_data, page_lines): + """ + Try to infer the table title by looking for text above the table or in the first row/merged cells. 
+ page_lines: list of all lines on the page (in order) + """ + # Find the minimum row_index in the table (should be 0) + min_row = min(cell["row_index"] for cell in table_data["cells"]) + # Get all cells in the first row + first_row_cells = [cell for cell in table_data["cells"] if cell["row_index"] == min_row] + # If any cell in the first row spans all columns, treat as title + for cell in first_row_cells: + if cell.get("column_span", 1) == table_data["column_count"] and cell["content"].strip(): + return cell["content"].strip() + # Otherwise, look for a line above the first row that is not in the table + # Find the topmost cell's content + top_cell_content = None + if first_row_cells: + top_cell_content = first_row_cells[0]["content"].strip() + # Try to find a line above the table that is not the top cell content + if page_lines and top_cell_content: + for idx, line in enumerate(page_lines): + if line.strip() == top_cell_content and idx > 0: + # Return the previous line as the title + prev_line = page_lines[idx-1].strip() + if prev_line: + return prev_line + # Fallback: use the top cell content if not empty + if top_cell_content: + return top_cell_content + return "Unknown Table" + +@app.blob_trigger(arg_name="myblob", path="pdfinvoices/{name}", + connection="invoicecontosostorage_STORAGE") +def BlobTriggerContosoPDFLayoutsDocIntelligence(myblob: func.InputStream) -> None: logging.info(f"Python blob trigger function processed blob\n" f"Name: {myblob.name}\n" f"Blob Size: {myblob.length} bytes") @@ -215,22 +419,31 @@ def BlobTriggerContosoPDFLayoutsDocIntelligence(myblob: func.InputStream): # --- Step: Convert PDF to image and call Azure AI Vision --- visual_cues = [] try: - images = convert_pdf_to_images(pdf_bytes) + # Validate Vision API credentials vision_key = os.getenv("VISION_API_KEY") vision_endpoint = os.getenv("VISION_API_ENDPOINT") - for page_num, image in enumerate(images, start=1): - img_bytes_io = BytesIO() - image.save(img_bytes_io, format='JPEG') - 
img_bytes = img_bytes_io.getvalue() - vision_result = call_vision_api(img_bytes, vision_key, vision_endpoint) - cues = extract_visual_cues_from_vision(vision_result, page_num) - visual_cues.extend(cues) - logging.info(f"Visual cues extracted: {visual_cues}") + + if not vision_key or not vision_endpoint: + logging.warning("Vision API credentials not configured - skipping visual cue detection") + else: + images = convert_pdf_to_images(pdf_bytes) + if not images: + logging.warning("No images extracted from PDF") + else: + for page_num, image in enumerate(images, start=1): + img_bytes_io = BytesIO() + image.save(img_bytes_io, format='JPEG') + img_bytes = img_bytes_io.getvalue() + vision_result = call_vision_api(img_bytes, vision_key, vision_endpoint) + cues = extract_visual_cues_from_vision(vision_result, page_num) + visual_cues.extend(cues) + logging.info(f"Visual cues extracted: {visual_cues}") except Exception as e: logging.error(f"Error processing visual cues with AI Vision: {e}") + # Continue processing without visual cues try: - layout_data = extract_layout_data(result, visual_cues) + layout_data = extract_layout_data(result, visual_cues, myblob.name) logging.info("Successfully extracted and merged layout data.") except Exception as e: logging.error(f"Error extracting layout data: {e}") @@ -241,3 +454,53 @@ def BlobTriggerContosoPDFLayoutsDocIntelligence(myblob: func.InputStream): logging.info("Successfully saved layout data to Cosmos DB.") except Exception as e: logging.error(f"Error saving layout data to Cosmos DB: {e}") + + # For each table, infer the title, create both DataFrame-like and summary JSON, log both, and save only the summary JSON + for page in layout_data["pages"]: + page_lines = page.get("lines", []) + for table in page["tables"]: + # --- Table Title Inference --- + table_title = infer_table_title(table, page_lines) + + # --- DataFrame-like JSON --- + # Build a 2D array of cell contents + df_like = [[None for _ in range(table["column_count"])] 
for _ in range(table["row_count"]) ] + for cell in table["cells"]: + r, c = cell["row_index"], cell["column_index"] + df_like[r][c] = cell["content"].strip() + df_json = { + "table_title": table_title, + "data": df_like + } + + # --- Pretty-print table as grid --- + def pretty_print_table(table_title, df_like): + # Find max width for each column + if not df_like or not df_like[0]: + return "(Empty table)" + col_widths = [max(len(str(row[c])) if row[c] is not None else 0 for row in df_like) for c in range(len(df_like[0]))] + lines = [] + lines.append(f"Table: {table_title}") + border = "+" + "+".join("-" * (w+2) for w in col_widths) + "+" + lines.append(border) + for i, row in enumerate(df_like): + row_str = "|" + "|".join(f" {str(cell) if cell is not None else '' :<{col_widths[j]}} " for j, cell in enumerate(row)) + "|" + lines.append(row_str) + lines.append(border) + return "\n".join(lines) + + pretty_table_str = pretty_print_table(table_title, df_like) + logging.info(f"\n{pretty_table_str}") + + # --- Summary JSON --- + skill_selections = extract_skill_selections_from_table(table) + summary = { + "table_title": table_title, + "skills": skill_selections + } + + # Log both outputs for user inspection + logging.info(f"Table DataFrame-like JSON: {json.dumps(df_json, indent=2)}") + logging.info(f"Table summary JSON: {json.dumps(summary, indent=2)}") + # Only save the summary JSON if needed (e.g., to Cosmos DB or elsewhere) + # (Current implementation saves only the main layout_data to Cosmos DB) diff --git a/src/host.json b/src/host.json index d5f63c0..f156d82 100644 --- a/src/host.json +++ b/src/host.json @@ -1,5 +1,6 @@ { "version": "2.0", + "functionTimeout": "00:10:00", "logging": { "applicationInsights": { "samplingSettings": { @@ -11,5 +12,11 @@ "extensionBundle": { "id": "Microsoft.Azure.Functions.ExtensionBundle", "version": "[4.*, 5.0.0)" + }, + "retry": { + "strategy": "exponentialBackoff", + "maxRetryCount": 3, + "minimumInterval": "00:00:02", + 
"maximumInterval": "00:00:30" } -} \ No newline at end of file +} diff --git a/src/requirements.txt b/src/requirements.txt index 839eb3a..0c757b4 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -7,3 +7,6 @@ azure-ai-formrecognizer azure-core azure-cosmos==4.3.0 azure-identity==1.7.0 +Pillow==10.0.1 +pdf2image==1.16.3 +requests==2.31.0 diff --git a/terraform-infrastructure/README.md b/terraform-infrastructure/README.md index df46399..754b370 100644 --- a/terraform-infrastructure/README.md +++ b/terraform-infrastructure/README.md @@ -1,115 +1,115 @@ -# Azure Infrastructure Terraform Template - -Costa Rica - -[![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) -[brown9804](https://github.com/brown9804) - -Last updated: 2025-07-16 - ----------- - -
- Centered Image -
- -
- Centered Image -
- -## Overview - -Templates structure: - -``` -. -โ”œโ”€โ”€ README.md -โ”œโ”€โ”€โ”€โ”€โ”€โ”€ main.tf -โ”œโ”€โ”€โ”€โ”€โ”€โ”€ variables.tf -โ”œโ”€โ”€โ”€โ”€โ”€โ”€ provider.tf -โ”œโ”€โ”€โ”€โ”€โ”€โ”€ terraform.tfvars -โ”œโ”€โ”€โ”€โ”€โ”€โ”€ outputs.tf -``` - -- main.tf `(Main Terraform configuration file)`: This file contains the core infrastructure code. It defines the resources you want to create, such as virtual machines, networks, and storage. It's the primary file where you describe your infrastructure in a declarative manner. -- variables.tf `(Variable definitions)`: This file is used to define variables that can be used throughout your Terraform configuration. By using variables, you can make your configuration more flexible and reusable. For example, you can define variables for resource names, sizes, and other parameters that might change between environments. -- provider.tf `(Provider configurations)`: Providers are plugins that Terraform uses to interact with cloud providers, SaaS providers, and other APIs. This file specifies which providers (e.g., AWS, Azure, Google Cloud) you are using and any necessary configuration for them, such as authentication details. -- terraform.tfvars `(Variable values)`: This file contains the actual values for the variables defined in `variables.tf`. By separating variable definitions and values, you can easily switch between different sets of values for different environments (e.g., development, staging, production) without changing the main configuration files. -- outputs.tf `(Output values)`: This file defines the output values that Terraform should return after applying the configuration. Outputs are useful for displaying information about the resources created, such as IP addresses, resource IDs, and other important details. They can also be used as inputs for other Terraform configurations or scripts. 
- -## How to execute it - -```mermaid -graph TD; - A[az login] --> B(terraform init) - B --> C{Terraform provisioning stage} - C -->|Review| D[terraform plan] - C -->|Order Now| E[terraform apply] - C -->|Delete Resource if needed| F[terraform destroy] -``` - -> [!IMPORTANT] -> Please modify `terraform.tfvars` with your information, then run the following flow. If you need more visual guidance, please check the video that illustrates the provisioning steps. - -1. **Login to Azure**: This command logs you into your Azure account. It opens a browser window where you can enter your Azure credentials. Once logged in, you can manage your Azure resources from the command line. - - > Go to the path where Terraform files are located: - - ```sh - cd terraform-infrastructure - ``` - - ```sh - az login - ``` - - img - - img - -2. **Initialize Terraform**: Initializes the working directory containing the Terraform configuration files. It downloads the necessary provider plugins and sets up the backend for storing the state. - - ``` sh - terraform init - ``` - - img - -3. **Terraform Provisioning Stage**: - - - **Review**: Creates an execution plan, showing what actions Terraform will take to achieve the desired state defined in your configuration files. It uses the variable values specified in `terraform.tfvars`. - - ```sh - terraform plan -var-file terraform.tfvars - ``` - - > At the end, you will see a message in green if everything was executed successfully: - - Screenshot 2025-03-18 145143 - - - **Order Now**: Applies the changes required to reach the desired state of the configuration. It prompts for confirmation before making any changes. It also uses the variable values specified in `terraform.tfvars`. - - ```sh - terraform apply -var-file terraform.tfvars - ``` - - > At the end, you will see a message in green if everything was executed successfully: - - image - - - **Remove**: Destroys the infrastructure managed by Terraform. 
It prompts for confirmation before deleting any resources. It also uses the variable values specified in `terraform.tfvars`. - - ```sh - terraform destroy -var-file terraform.tfvars - ``` - - > At the end, you will see a message in green if everything was executed successfully: - - image - +# Azure Infrastructure Terraform Template + +Costa Rica + +[![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) +[brown9804](https://github.com/brown9804) + +Last updated: 2025-07-25 + +---------- + +
+ Centered Image +
+ +
+ Centered Image +
+ +## Overview + +Templates structure: + +``` +. +โ”œโ”€โ”€ README.md +โ”œโ”€โ”€โ”€โ”€โ”€โ”€ main.tf +โ”œโ”€โ”€โ”€โ”€โ”€โ”€ variables.tf +โ”œโ”€โ”€โ”€โ”€โ”€โ”€ provider.tf +โ”œโ”€โ”€โ”€โ”€โ”€โ”€ terraform.tfvars +โ”œโ”€โ”€โ”€โ”€โ”€โ”€ outputs.tf +``` + +- main.tf `(Main Terraform configuration file)`: This file contains the core infrastructure code. It defines the resources you want to create, such as virtual machines, networks, and storage. It's the primary file where you describe your infrastructure in a declarative manner. +- variables.tf `(Variable definitions)`: This file is used to define variables that can be used throughout your Terraform configuration. By using variables, you can make your configuration more flexible and reusable. For example, you can define variables for resource names, sizes, and other parameters that might change between environments. +- provider.tf `(Provider configurations)`: Providers are plugins that Terraform uses to interact with cloud providers, SaaS providers, and other APIs. This file specifies which providers (e.g., AWS, Azure, Google Cloud) you are using and any necessary configuration for them, such as authentication details. +- terraform.tfvars `(Variable values)`: This file contains the actual values for the variables defined in `variables.tf`. By separating variable definitions and values, you can easily switch between different sets of values for different environments (e.g., development, staging, production) without changing the main configuration files. +- outputs.tf `(Output values)`: This file defines the output values that Terraform should return after applying the configuration. Outputs are useful for displaying information about the resources created, such as IP addresses, resource IDs, and other important details. They can also be used as inputs for other Terraform configurations or scripts. 
+ +## How to execute it + +```mermaid +graph TD; + A[az login] --> B(terraform init) + B --> C{Terraform provisioning stage} + C -->|Review| D[terraform plan] + C -->|Order Now| E[terraform apply] + C -->|Delete Resource if needed| F[terraform destroy] +``` + +> [!IMPORTANT] +> Please modify `terraform.tfvars` with your information, then run the following flow. If you need more visual guidance, please check the video that illustrates the provisioning steps. + +1. **Login to Azure**: This command logs you into your Azure account. It opens a browser window where you can enter your Azure credentials. Once logged in, you can manage your Azure resources from the command line. + + > Go to the path where Terraform files are located: + + ```sh + cd terraform-infrastructure + ``` + + ```sh + az login + ``` + + img + + img + +2. **Initialize Terraform**: Initializes the working directory containing the Terraform configuration files. It downloads the necessary provider plugins and sets up the backend for storing the state. + + ``` sh + terraform init + ``` + + img + +3. **Terraform Provisioning Stage**: + + - **Review**: Creates an execution plan, showing what actions Terraform will take to achieve the desired state defined in your configuration files. It uses the variable values specified in `terraform.tfvars`. + + ```sh + terraform plan -var-file terraform.tfvars + ``` + + > At the end, you will see a message in green if everything was executed successfully: + + Screenshot 2025-03-18 145143 + + - **Order Now**: Applies the changes required to reach the desired state of the configuration. It prompts for confirmation before making any changes. It also uses the variable values specified in `terraform.tfvars`. + + ```sh + terraform apply -var-file terraform.tfvars + ``` + + > At the end, you will see a message in green if everything was executed successfully: + + image + + - **Remove**: Destroys the infrastructure managed by Terraform. 
It prompts for confirmation before deleting any resources. It also uses the variable values specified in `terraform.tfvars`. + + ```sh + terraform destroy -var-file terraform.tfvars + ``` + + > At the end, you will see a message in green if everything was executed successfully: + + image +
- Total views + Total views

Refresh Date: 2025-07-25

- + diff --git a/terraform-infrastructure/main.tf b/terraform-infrastructure/main.tf index 1586717..9382126 100644 --- a/terraform-infrastructure/main.tf +++ b/terraform-infrastructure/main.tf @@ -379,30 +379,339 @@ resource "azurerm_linux_function_app" "function_app" { "FUNCTIONS_NODE_BLOCK_ON_ENTRY_POINT_ERROR" = "true" "WEBSITE_RUN_FROM_PACKAGE" = "1" + # === Core Azure Services Configuration === "COSMOS_DB_ENDPOINT" = azurerm_cosmosdb_account.cosmosdb.endpoint "COSMOS_DB_KEY" = azurerm_cosmosdb_account.cosmosdb.primary_key "invoicecontosostorage_STORAGE" = azurerm_storage_account.storage.primary_connection_string + # Document Intelligence (Form Recognizer) for PDF layout analysis "FORM_RECOGNIZER_ENDPOINT" = azurerm_cognitive_account.form_recognizer.endpoint "FORM_RECOGNIZER_KEY" = azurerm_cognitive_account.form_recognizer.primary_access_key + # Application Insights for monitoring and telemetry "APPINSIGHTS_INSTRUMENTATIONKEY" = azurerm_application_insights.appinsights.instrumentation_key "APPLICATIONINSIGHTS_CONNECTION_STRING" = azurerm_application_insights.appinsights.connection_string - # Azure AI Vision settings + # === AI Vision Services Configuration === + # Azure AI Vision for visual cue detection and image analysis "VISION_API_ENDPOINT" = azurerm_cognitive_account.ai_vision.endpoint "VISION_API_KEY" = azurerm_cognitive_account.ai_vision.primary_access_key + + # === Azure OpenAI Configuration for LLM-Powered PDF Analysis === + # Main OpenAI service endpoint and authentication + "AZURE_OPENAI_ENDPOINT" = azurerm_cognitive_account.openai.endpoint + "AZURE_OPENAI_KEY" = azurerm_cognitive_account.openai.primary_access_key + "AZURE_OPENAI_API_VERSION" = "2024-02-15-preview" # Latest API version + + # Model deployment names for different LLM capabilities + "AZURE_OPENAI_GPT4_DEPLOYMENT" = azurerm_cognitive_deployment.gpt4.name # For complex reasoning and analysis + "AZURE_OPENAI_GPT4O_DEPLOYMENT" = azurerm_cognitive_deployment.gpt4o.name # For advanced 
multimodal processing + "AZURE_OPENAI_EMBEDDING_DEPLOYMENT" = azurerm_cognitive_deployment.text_embedding.name # For semantic search and similarity + + # === AI Studio Configuration === + # AI Studio Hub and Project for model management and MLOps + "AI_HUB_NAME" = azurerm_machine_learning_workspace.ai_hub.name + "AI_PROJECT_NAME" = azurerm_machine_learning_workspace.ai_project.name + "AI_HUB_WORKSPACE_URL" = "https://ml.azure.com/workspaces/${azurerm_machine_learning_workspace.ai_hub.workspace_id}/computes?region=${azurerm_machine_learning_workspace.ai_hub.location}" + "AI_PROJECT_WORKSPACE_URL" = "https://ml.azure.com/workspaces/${azurerm_machine_learning_workspace.ai_project.workspace_id}/computes?region=${azurerm_machine_learning_workspace.ai_project.location}" + + # AI Storage account for model artifacts and experiment data + "AI_STORAGE_ACCOUNT_NAME" = azurerm_storage_account.runtime.name + "AI_STORAGE_CONNECTION" = azurerm_storage_account.runtime.primary_connection_string + + # === LLM Processing Configuration === + # Configuration for LLM-powered PDF processing features + "ENABLE_LLM_PROCESSING" = "true" + "LLM_MAX_TOKENS" = "4000" # Maximum tokens per request + "LLM_TEMPERATURE" = "0.1" # Low temperature for consistent extraction + "LLM_TIMEOUT_SECONDS" = "120" # Timeout for LLM requests } depends_on = [ azurerm_service_plan.asp, azurerm_application_insights.appinsights, - azurerm_cosmosdb_account.cosmosdb - + azurerm_cosmosdb_account.cosmosdb, + # AI and ML dependencies for LLM-powered processing + azurerm_cognitive_account.openai, + azurerm_cognitive_deployment.gpt4, + azurerm_cognitive_deployment.gpt4o, + azurerm_cognitive_deployment.text_embedding, + azurerm_machine_learning_workspace.ai_hub, + azurerm_machine_learning_workspace.ai_project, + azurerm_storage_account.runtime ] provisioner "local-exec" { command = "echo Function App: ${self.name}" } } + + +# Azure AI Foundry (AI Studio) Infrastructure + + +# Azure OpenAI Service for LLM capabilities 
+resource "azurerm_cognitive_account" "openai" { + name = var.openai_account_name + location = var.openai_location # Must be a region that supports OpenAI + resource_group_name = azurerm_resource_group.rg.name + kind = "OpenAI" + sku_name = "S0" + + # Enable custom subdomain for OpenAI + custom_subdomain_name = var.openai_account_name + + # Network access configuration + network_acls { + default_action = "Allow" # Can be restricted to "Deny" with specific IP rules + } + + # Enable identity for secure access + identity { + type = "SystemAssigned" + } + + tags = { + Environment = var.environment + Purpose = "LLM-powered PDF extraction" + } + + depends_on = [azurerm_resource_group.rg] + + provisioner "local-exec" { + command = "echo Azure OpenAI Account: ${self.name}" + } +} + +# GPT-4 Model Deployment for PDF Analysis and Extraction +resource "azurerm_cognitive_deployment" "gpt4" { + name = "gpt-4" + cognitive_account_id = azurerm_cognitive_account.openai.id + + model { + format = "OpenAI" + name = "gpt-4" + version = "turbo-2024-04-09" # Current stable version for GPT-4 Turbo + } + + sku { + name = "Standard" + capacity = 20 # Tokens per minute (TPM) in thousands + } + + depends_on = [azurerm_cognitive_account.openai] + + provisioner "local-exec" { + command = "echo GPT-4 Deployment: ${self.name}" + } +} + +# GPT-4o Model Deployment for Advanced PDF Processing (Recommended for PDF extraction) +resource "azurerm_cognitive_deployment" "gpt4o" { + name = "gpt-4o" + cognitive_account_id = azurerm_cognitive_account.openai.id + + model { + format = "OpenAI" + name = "gpt-4o" + version = "2024-08-06" # Latest GPT-4o version with improved multimodal capabilities + } + + sku { + name = "Standard" + capacity = 30 # Higher capacity for complex PDF processing + } + + depends_on = [azurerm_cognitive_account.openai] + + provisioner "local-exec" { + command = "echo GPT-4o Deployment: ${self.name}" + } +} + +# Text Embedding Model for Semantic Search and Document Analysis +resource 
"azurerm_cognitive_deployment" "text_embedding" { + name = "text-embedding-ada-002" + cognitive_account_id = azurerm_cognitive_account.openai.id + + model { + format = "OpenAI" + name = "text-embedding-ada-002" + version = "2" + } + + sku { + name = "Standard" + capacity = 120 # High capacity for batch document processing + } + + depends_on = [azurerm_cognitive_account.openai] + + provisioner "local-exec" { + command = "echo Text Embedding Deployment: ${self.name}" + } +} + +# AI Studio Hub - Central resource for AI projects +resource "azurerm_machine_learning_workspace" "ai_hub" { + name = var.ai_hub_name + location = azurerm_resource_group.rg.location + resource_group_name = azurerm_resource_group.rg.name + application_insights_id = azurerm_application_insights.appinsights.id + key_vault_id = azurerm_key_vault.keyvault.id + storage_account_id = azurerm_storage_account.runtime.id + + # Enable identity for secure resource access + identity { + type = "SystemAssigned" + } + + # Hub-specific settings + description = "AI Studio Hub for PDF Intelligence and LLM Processing" + friendly_name = "PDF Intelligence Hub" + + # Enable public network access (can be restricted) + public_network_access_enabled = true + + tags = { + Environment = var.environment + Purpose = "AI Hub for PDF processing and LLM capabilities" + Component = "AIFoundry" + } + + depends_on = [ + azurerm_resource_group.rg, + azurerm_application_insights.appinsights, + azurerm_key_vault.keyvault, + azurerm_storage_account.runtime + ] + + provisioner "local-exec" { + command = "echo AI Studio Hub: ${self.name}" + } +} + +# AI Project - Specific project for PDF extraction workloads +resource "azurerm_machine_learning_workspace" "ai_project" { + name = var.ai_project_name + location = azurerm_resource_group.rg.location + resource_group_name = azurerm_resource_group.rg.name + application_insights_id = azurerm_application_insights.appinsights.id + key_vault_id = azurerm_key_vault.keyvault.id + storage_account_id 
= azurerm_storage_account.runtime.id + + # Enable identity for secure resource access + identity { + type = "SystemAssigned" + } + + # Project-specific settings + description = "AI Project for PDF Document Intelligence and Skills Extraction" + friendly_name = "PDF Skills Extraction Project" + + # Enable public network access (can be restricted) + public_network_access_enabled = true + + tags = { + Environment = var.environment + Purpose = "AI Project for PDF skills extraction and LLM analysis" + Component = "AIFoundry" + ParentHub = azurerm_machine_learning_workspace.ai_hub.name + } + + depends_on = [ + azurerm_machine_learning_workspace.ai_hub, + azurerm_application_insights.appinsights, + azurerm_key_vault.keyvault, + azurerm_storage_account.runtime + ] + + provisioner "local-exec" { + command = "echo AI Project: ${self.name}" + } +} + +# AI Models Container for storing custom models and artifacts +resource "azurerm_storage_container" "ai_models" { + name = "aimodels" + storage_account_id = azurerm_storage_account.runtime.id + container_access_type = "private" + + depends_on = [azurerm_storage_account.runtime] + + provisioner "local-exec" { + command = "echo AI Models Container: ${self.name}" + } +} + +# AI Experiments Container for experiment outputs and logs +resource "azurerm_storage_container" "ai_experiments" { + name = "experiments" + storage_account_id = azurerm_storage_account.runtime.id + container_access_type = "private" + + depends_on = [azurerm_storage_account.runtime] + + provisioner "local-exec" { + command = "echo AI Experiments Container: ${self.name}" + } +} + + +# Role Assignments for Function App to access AI Resources +# These assignments enable the Function App's managed identity to securely +# access AI services without storing credentials in application settings + + +# Grant Function App access to Azure OpenAI Service +resource "azurerm_role_assignment" "function_openai_user" { + scope = azurerm_cognitive_account.openai.id + 
role_definition_name = "Cognitive Services OpenAI User" + principal_id = azurerm_linux_function_app.function_app.identity[0].principal_id + + depends_on = [azurerm_linux_function_app.function_app] + + provisioner "local-exec" { + command = "echo Role Assignment: Function App -> OpenAI User" + } +} + +# Grant Function App access to AI Hub +resource "azurerm_role_assignment" "function_ai_hub_contributor" { + scope = azurerm_machine_learning_workspace.ai_hub.id + role_definition_name = "AzureML Data Scientist" # Allows model deployment and experimentation + principal_id = azurerm_linux_function_app.function_app.identity[0].principal_id + + depends_on = [azurerm_linux_function_app.function_app] + + provisioner "local-exec" { + command = "echo Role Assignment: Function App -> AI Hub Data Scientist" + } +} + +# Grant Function App access to AI Project +resource "azurerm_role_assignment" "function_ai_project_contributor" { + scope = azurerm_machine_learning_workspace.ai_project.id + role_definition_name = "AzureML Data Scientist" + principal_id = azurerm_linux_function_app.function_app.identity[0].principal_id + + depends_on = [azurerm_linux_function_app.function_app] + + provisioner "local-exec" { + command = "echo Role Assignment: Function App -> AI Project Data Scientist" + } +} + +# Grant Function App access to AI Storage Account +resource "azurerm_role_assignment" "function_ai_storage_contributor" { + scope = azurerm_storage_account.runtime.id + role_definition_name = "Storage Blob Data Contributor" + principal_id = azurerm_linux_function_app.function_app.identity[0].principal_id + + depends_on = [azurerm_linux_function_app.function_app] + + provisioner "local-exec" { + command = "echo Role Assignment: Function App -> AI Storage Contributor" + } +} diff --git a/terraform-infrastructure/output.tf b/terraform-infrastructure/output.tf index 17d3946..3dea2dd 100644 --- a/terraform-infrastructure/output.tf +++ b/terraform-infrastructure/output.tf @@ -58,3 +58,93 @@ output 
"form_recognizer_name" { output "form_recognizer_endpoint" { value = azurerm_cognitive_account.form_recognizer.endpoint } + +# Azure OpenAI Outputs +output "openai_account_name" { + description = "The name of the Azure OpenAI account" + value = azurerm_cognitive_account.openai.name +} + +output "openai_endpoint" { + description = "The endpoint URL for the Azure OpenAI service" + value = azurerm_cognitive_account.openai.endpoint +} + +output "openai_resource_id" { + description = "The resource ID of the Azure OpenAI account" + value = azurerm_cognitive_account.openai.id +} + +# Model Deployment Outputs +output "gpt4_deployment_name" { + description = "The name of the GPT-4 model deployment" + value = azurerm_cognitive_deployment.gpt4.name +} + +output "gpt4o_deployment_name" { + description = "The name of the GPT-4o model deployment" + value = azurerm_cognitive_deployment.gpt4o.name +} + +output "text_embedding_deployment_name" { + description = "The name of the text embedding model deployment" + value = azurerm_cognitive_deployment.text_embedding.name +} + +# AI Studio Hub Outputs +output "ai_hub_name" { + description = "The name of the AI Studio Hub" + value = azurerm_machine_learning_workspace.ai_hub.name +} + +output "ai_hub_id" { + description = "The resource ID of the AI Studio Hub" + value = azurerm_machine_learning_workspace.ai_hub.id +} + +output "ai_hub_workspace_url" { + description = "The workspace URL for the AI Studio Hub" + value = "https://ml.azure.com/workspaces/${azurerm_machine_learning_workspace.ai_hub.workspace_id}/computes?region=${azurerm_machine_learning_workspace.ai_hub.location}" +} + +# AI Project Outputs +output "ai_project_name" { + description = "The name of the AI Studio Project" + value = azurerm_machine_learning_workspace.ai_project.name +} + +output "ai_project_id" { + description = "The resource ID of the AI Studio Project" + value = azurerm_machine_learning_workspace.ai_project.id +} + +output "ai_project_workspace_url" { + 
description = "The workspace URL for the AI Studio Project" + value = "https://ml.azure.com/workspaces/${azurerm_machine_learning_workspace.ai_project.workspace_id}/computes?region=${azurerm_machine_learning_workspace.ai_project.location}" +} + +# AI Storage Outputs +output "ai_storage_account_name" { + description = "The name of the AI storage account" + value = azurerm_storage_account.runtime.name +} + +output "ai_storage_account_id" { + description = "The resource ID of the AI storage account" + value = azurerm_storage_account.runtime.id +} + +# Deployment Summary +output "ai_foundry_summary" { + description = "Summary of deployed AI Foundry resources" + value = { + openai_account = azurerm_cognitive_account.openai.name + openai_endpoint = azurerm_cognitive_account.openai.endpoint + ai_hub = azurerm_machine_learning_workspace.ai_hub.name + ai_project = azurerm_machine_learning_workspace.ai_project.name + gpt4_deployment = azurerm_cognitive_deployment.gpt4.name + gpt4o_deployment = azurerm_cognitive_deployment.gpt4o.name + embedding_deployment = azurerm_cognitive_deployment.text_embedding.name + ai_storage = azurerm_storage_account.runtime.name + } +} diff --git a/terraform-infrastructure/provider.tf b/terraform-infrastructure/provider.tf index f4f5a1b..725c9dc 100644 --- a/terraform-infrastructure/provider.tf +++ b/terraform-infrastructure/provider.tf @@ -7,19 +7,19 @@ terraform { # Specify the required provider and its version required_providers { azurerm = { - source = "hashicorp/azurerm" # Source of the AzureRM provider - version = "~> 4.16.0" # Version of the AzureRM provider + source = "hashicorp/azurerm" # Source of the AzureRM provider + version = "~> 4.16.0" # Version of the AzureRM provider } } } provider "azurerm" { - features { # Enable features for the AzureRM provider - key_vault { - recover_soft_deleted_key_vaults = false - purge_soft_delete_on_destroy = true - } + features { # Enable features for the AzureRM provider + key_vault { + 
recover_soft_deleted_key_vaults = false + purge_soft_delete_on_destroy = true + } } - - subscription_id = var.subscription_id # Use the subscription ID variable -} \ No newline at end of file + + subscription_id = var.subscription_id # Use the subscription ID variable +} diff --git a/terraform-infrastructure/terraform.tfvars b/terraform-infrastructure/terraform.tfvars index b716b4b..eda1419 100644 --- a/terraform-infrastructure/terraform.tfvars +++ b/terraform-infrastructure/terraform.tfvars @@ -1,30 +1,45 @@ # Sample values -subscription_id = "" # "your-subscription_id" -resource_group_name = "RG-PDFLayout-Processing-DocIntelligence" # "your-resource-group-name" -location = "West US" # "your-location" +subscription_id = "" # "your-subscription_id" +resource_group_name = "RG-PDFLayoutVisualCue-ProcessDocIntelligencex11" # "your-resource-group-name" +location = "West US" # "your-location" # Storage Account -storage_account_name = "storageaccbrownpdfix2" # "your-storage-account-name" -storage_account_name_runtime = "runtimestorebrownix2" # "your-runtime-storage-account-name" +storage_account_name = "storageaccbrownpdfix11" # "your-storage-account-name" +storage_account_name_runtime = "runtimestorebrownix11" # "your-runtime-storage-account-name" # Function App -function_app_name = "fapdfbrownix2" # "your-function-app-name" +function_app_name = "fapdfbrownix11" # "your-function-app-name" # App Service Plan -app_service_plan_name = "asppdfbrownix2" # "your-app-service-plan-name" +app_service_plan_name = "asppdfbrownix11" # "your-app-service-plan-name" # Application Insights -app_insights_name = "apppdfbrownix2" # "your-app-insights-name" +app_insights_name = "apppdfbrownix11" # "your-app-insights-name" # Log Analytics Workspace -log_analytics_workspace_name = "logwspdfbrownix2" # "your-log-analytics-workspace-name" +log_analytics_workspace_name = "logwspdfbrownix11" # "your-log-analytics-workspace-name" # Key Vault -key_vault_name = "kvpdfrbrownrix2" # 
"your-key-vault-name" +key_vault_name = "kvpdfrbrownrix11" # "your-key-vault-name" # CosmosDB -cosmosdb_account_name = "cosmospdfbrownix2" # "your-cosmosdb-account-name" +cosmosdb_account_name = "cosmospdfbrownix11" # "your-cosmosdb-account-name" # Form Recognizer -> Document Intelligence -form_recognizer_name = "docintelligt01ix2" # "your-document-intelligence-name" +form_recognizer_name = "docintelligt01ix11" # "your-document-intelligence-name" # AI Vision Service -ai_vision_name = "aivisionpdfrbrownix2" # "your-ai-vision-name" +ai_vision_name = "aivisionpdfrbrownix11" # "your-ai-vision-name" ai_vision_sku = "S0" ai_vision_tags = { Environment = "Development" Project = "PDF Processing" Service = "AI Vision" } + +# Azure AI Foundry (AI Studio) Configuration +# Environment configuration +environment = "dev" + +# Azure OpenAI Service +openai_account_name = "openai-pdf-brownix11" # "your-openai-account-name" +openai_location = "eastus" # Must be a region that supports OpenAI + +# AI Studio Hub and Project +ai_hub_name = "ai-hub-pdf-brownix11" # "your-ai-hub-name" +ai_project_name = "ai-proj-pdf-extraction-ix11" # "your-ai-project-name" + +# AI Storage Account (for models and artifacts) +ai_storage_account_name = "aistoragebrownix11" # "your-ai-storage-account-name" diff --git a/terraform-infrastructure/variables.tf b/terraform-infrastructure/variables.tf index d37c765..431063e 100644 --- a/terraform-infrastructure/variables.tf +++ b/terraform-infrastructure/variables.tf @@ -63,7 +63,7 @@ variable "ai_vision_sku" { variable "ai_vision_tags" { description = "Tags to be applied to the AI Vision resource" type = map(string) - default = { + default = { Environment = "Development" Service = "AI Vision" } @@ -92,3 +92,52 @@ variable "throughput" { description = "The throughput (RU/s) to be allocated to the Cosmos DB SQL database or container." 
default = 400 } + +# These variables configure the AI Studio Hub, Project, and OpenAI services +# for LLM-powered PDF extraction and analysis capabilities + +variable "environment" { + description = "Environment tag for resources (dev, staging, prod). Used for resource tagging and naming conventions." + type = string + default = "dev" + + validation { + condition = contains(["dev", "staging", "prod"], var.environment) + error_message = "Environment must be one of: dev, staging, prod." + } +} + +# Azure OpenAI Service Configuration +variable "openai_account_name" { + description = "The name of the Azure OpenAI account. Must be globally unique and support GPT-4 models for PDF analysis." + type = string +} + +variable "openai_location" { + description = "The Azure region for OpenAI resources. Must be a region that supports Azure OpenAI service (eastus, westeurope, etc.)" + type = string + default = "eastus" # Default to East US which supports OpenAI + + validation { + condition = contains(["eastus", "westeurope", "southcentralus", "westus"], var.openai_location) + error_message = "OpenAI location must be in a region that supports Azure OpenAI service." + } +} + +# AI Studio Hub Configuration +variable "ai_hub_name" { + description = "The name of the AI Studio Hub. Central resource for managing AI projects and shared resources." + type = string +} + +# AI Project Configuration +variable "ai_project_name" { + description = "The name of the AI Studio Project. Specific workspace for PDF extraction and skills analysis workflows." + type = string +} + +# AI Storage Configuration +variable "ai_storage_account_name" { + description = "The name of the storage account for AI Hub and Project. Stores model artifacts, experiments, and training data." + type = string +}