diff --git a/.github/.markdownlint.json b/.github/.markdownlint.json new file mode 100644 index 0000000..97a16cc --- /dev/null +++ b/.github/.markdownlint.json @@ -0,0 +1,14 @@ +{ + "default": true, + "MD005": false, + "MD009": false, + "MD013": false, + "MD028": false, + "MD029": false, + "MD033": false, + "MD048": false, + "MD040": false, + "MD041": false, + "MD045": false, + "MD046": false +} diff --git a/.github/workflows/update-md-date.yml b/.github/workflows/update-md-date.yml new file mode 100644 index 0000000..95f5cfd --- /dev/null +++ b/.github/workflows/update-md-date.yml @@ -0,0 +1,58 @@ +name: Update Last Modified Date + +on: + pull_request: + branches: + - main + +permissions: + contents: write + pull-requests: write + +jobs: + update-date: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ github.head_ref || github.ref_name }} + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.x' + + - name: Install dependencies + run: pip install python-dateutil + + - name: Configure Git + run: | + git config --global user.email "github-actions[bot]@users.noreply.github.com" + git config --global user.name "github-actions[bot]" + + - name: Update last modified date in Markdown files + run: python .github/workflows/update_date.py + + - name: Commit and merge changes + env: + PR_BRANCH: ${{ github.head_ref || github.ref_name }} + GIT_AUTHOR_NAME: github-actions[bot] + GIT_AUTHOR_EMAIL: github-actions[bot]@users.noreply.github.com + GIT_COMMITTER_NAME: github-actions[bot] + GIT_COMMITTER_EMAIL: github-actions[bot]@users.noreply.github.com + run: | + # Ensure we're on the correct branch + git switch -c "$PR_BRANCH" || git switch "$PR_BRANCH" + + # Stage and commit all changes (new, modified, deleted) + git add -A + git diff --staged --quiet || git commit -m "Update last modified date in Markdown files" + + # Pull and merge existing changes from remote + git pull origin "$PR_BRANCH" --no-rebase + + # Push all changes to the PR branch + git push origin "$PR_BRANCH" diff --git a/.github/workflows/update_date.py b/.github/workflows/update_date.py new file mode 100644 index 0000000..ab86df5 --- /dev/null +++ b/.github/workflows/update_date.py @@ -0,0 +1,49 @@ +import os +import subprocess +from datetime import datetime, timezone + +# Get the list of modified files +result = subprocess.run(['git', 'diff', '--name-only', 'HEAD~1'], stdout=subprocess.PIPE) +modified_files = result.stdout.decode('utf-8').split() + +# Debugging: Print the list of modified files +print("Modified files:", modified_files) + +# Filter for Markdown files +modified_md_files = [f for f in modified_files if f.endswith('.md')] + +# Debugging: Print the list of modified Markdown files +print("Modified Markdown files:", modified_md_files) + +# Current date +current_date = datetime.now(timezone.utc).strftime('%Y-%m-%d') + +# Function to update the last modified date in a file +def update_date_in_file(file_path): + with open(file_path, 'r') as file: + lines = file.readlines() + + updated = False + with open(file_path, 'w') as file: + for line in lines: + if line.startswith('Last updated:'): + file.write(f'Last updated: {current_date}\n') + updated = True + else: + file.write(line) + if not updated: + file.write(f'\nLast updated: {current_date}\n') + +# Check if there are any modified Markdown files +if not modified_md_files: + print("No modified Markdown files found.") + exit(0) + +# Update the date in each modified 
Markdown file +for file_path in modified_md_files: + print(f"Updating file: {file_path}") # Debugging: Print the file being updated + update_date_in_file(file_path) + +# Add and commit changes +subprocess.run(['git', 'add', '-A']) +subprocess.run(['git', 'commit', '-m', 'Update last modified date in Markdown files']) diff --git a/.github/workflows/use-visitor-counter.yml b/.github/workflows/use-visitor-counter.yml new file mode 100644 index 0000000..973fb24 --- /dev/null +++ b/.github/workflows/use-visitor-counter.yml @@ -0,0 +1,80 @@ +name: Use Visitor Counter Logic + +on: + pull_request: + branches: + - main + schedule: + - cron: '0 0 * * *' # Runs daily at midnight + workflow_dispatch: # Allows manual triggering + +permissions: + contents: write + pull-requests: write + +jobs: + update-visitor-count: + runs-on: ubuntu-latest + + steps: + - name: Checkout current repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ github.head_ref || github.ref_name }} + + - name: Shallow clone visitor counter logic + run: git clone --depth=1 https://github.com/brown9804/github-visitor-counter.git + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + + - name: Install dependencies for github-visitor-counter + run: | + cd github-visitor-counter + npm ci + + - name: Run visitor counter logic (updates markdown badges and metrics.json) + run: node github-visitor-counter/update_repo_views_counter.js + env: + TRAFFIC_TOKEN: ${{ secrets.TRAFFIC_TOKEN }} + REPO: ${{ github.repository }} + + - name: Move generated metrics.json to root + run: mv github-visitor-counter/metrics.json . + + - name: List files for debugging + run: | + ls -l + ls -l github-visitor-counter + + - name: Clean up visitor counter logic + run: rm -rf github-visitor-counter + + - name: Configure Git author + run: | + git config --global user.name "github-actions[bot]" + git config --global user.email "github-actions[bot]@users.noreply.github.com" + + - name: Commit and merge changes + env: + PR_BRANCH: ${{ github.head_ref || github.ref_name }} + GIT_AUTHOR_NAME: github-actions[bot] + GIT_AUTHOR_EMAIL: github-actions[bot]@users.noreply.github.com + GIT_COMMITTER_NAME: github-actions[bot] + GIT_COMMITTER_EMAIL: github-actions[bot]@users.noreply.github.com + run: | + # Ensure we're on the correct branch + git switch -c "$PR_BRANCH" || git switch "$PR_BRANCH" + + # Stage and commit changes if any + git add -A + git diff --staged --quiet || git commit -m "Update visitor count" + + # Pull and merge existing changes + git pull origin "$PR_BRANCH" --no-rebase + + # Push all changes + git push origin "$PR_BRANCH" diff --git a/.github/workflows/validate_and_fix_markdown.yml b/.github/workflows/validate_and_fix_markdown.yml new file mode 100644 index 0000000..8bb9f1f --- /dev/null +++ b/.github/workflows/validate_and_fix_markdown.yml @@ -0,0 +1,58 @@ +name: Validate and Fix Markdown + +on: + pull_request: + branches: + - main + +permissions: + contents: write + pull-requests: write + +jobs: + validate-and-fix-markdown: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ github.head_ref || github.ref_name }} + + - name: Set up Node.js + uses: actions/setup-node@v3 + with: + node-version: '16' + + - name: Install Markdown Linter + run: npm install -g markdownlint-cli + + - name: Lint and Fix Markdown files + run: markdownlint '**/*.md' --fix --config .github/.markdownlint.json + + - name: Configure Git + run: | + git config 
--global user.email "github-actions[bot]@users.noreply.github.com" + git config --global user.name "github-actions[bot]" + + - name: Commit and merge changes + env: + PR_BRANCH: ${{ github.head_ref || github.ref_name }} + GIT_AUTHOR_NAME: github-actions[bot] + GIT_AUTHOR_EMAIL: github-actions[bot]@users.noreply.github.com + GIT_COMMITTER_NAME: github-actions[bot] + GIT_COMMITTER_EMAIL: github-actions[bot]@users.noreply.github.com + run: | + # Ensure we're on the correct branch + git switch -c "$PR_BRANCH" || git switch "$PR_BRANCH" + + # Stage and commit changes if any + git add -A + git diff --staged --quiet || git commit -m "Fix Markdown syntax issues" + + # Pull and merge existing changes + git pull origin "$PR_BRANCH" --no-rebase + + # Push all changes + git push origin "$PR_BRANCH" diff --git a/.gitignore b/.gitignore index 6349e36..a57fe2b 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,8 @@ # .tfstate files *.tfstate *.tfstate.* +.terraform.lock.hcl +terraform.tfstate.backup # Crash log files crash.log diff --git a/README.md b/README.md index 62befa4..d441406 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,453 @@ -# PDFs-MultiLayout-VisualCue-Fapp-DocIntelligence -Extended solution for extracting tables, checkboxes, and visually selected values (e.g., shaded cells, Xs, checkmarks) from PDFs using Azure Document Intelligence and Vision. +# Demo: PDF Layout Extraction with Doc Intelligence
Supporting Multiple Document Versions with Visual Selection Cues (full-code approach) + +`Azure Storage + Document Intelligence + Function App + Cosmos DB` + +Costa Rica + +[![GitHub](https://badgen.net/badge/icon/github?icon=github&label)](https://github.com) +[![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) +[brown9804](https://github.com/brown9804) + +Last updated: 2025-07-16 + +----------------------------- + +> This solution is designed to be flexible and robust, supporting multiple versions of PDF documents with varying layouts—including those that use visual selection cues such as gray fills, hand-drawn Xs, checkmarks, or circles. By building on the [PDFs-Layouts-Processing-Fapp-DocIntelligence](https://github.com/MicrosoftCloudEssentials-LearningHub/PDFs-Layouts-Processing-Fapp-DocIntelligence) repository, we ensure that: + +- Table structure and text are extracted using Azure Document Intelligence (Layout model). +- Visual selection cues are detected using Azure AI Vision or image preprocessing. +- Visual indicators are mapped to structured data, returning only the selected values in a clean JSON format. +- The logic is abstracted to support multiple layout variations, so the system adapts easily to new document formats and selection styles. + +> [!IMPORTANT] +> This example is based on a `public network site and is intended for demonstration purposes only`. It showcases how several Azure resources can work together to achieve the desired result. Consider the section below about [Important Considerations for Production Environment](#important-considerations-for-production-environment). Please note that `these demos are intended as a guide and are based on my personal experiences. For official guidance, support, or more detailed information, please refer to Microsoft's official documentation or contact Microsoft directly`: [Microsoft Sales and Support](https://support.microsoft.com/contactus?ContactUsExperienceEntryPointAssetId=S.HP.SMC-HOME) + +
+List of References (Click to expand) + +- [Use Azure AI services with SynapseML in Microsoft Fabric](https://learn.microsoft.com/en-us/fabric/data-science/how-to-use-ai-services-with-synapseml) +- [Plan and manage costs for Azure AI Foundry](https://learn.microsoft.com/en-us/azure/ai-foundry/how-to/costs-plan-manage) +- [Azure AI Document Intelligence documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/?view=doc-intel-4.0.0) +- [Get started with the Document Intelligence Sample Labeling tool](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/v21/try-sample-label-tool?view=doc-intel-2.1.0#prerequisites-for-training-a-custom-form-model) +- [Document Intelligence Sample Labeling tool](https://fott-2-1.azurewebsites.net/) +- [Assign an Azure role for access to blob data](https://learn.microsoft.com/en-us/azure/storage/blobs/assign-azure-role-data-access?tabs=portal) +- [Build and train a custom extraction model](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/build-a-custom-model?view=doc-intel-2.1.0) +- [Compose custom models - Document Intelligence](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/compose-custom-models?view=doc-intel-2.1.0&tabs=studio) +- [Deploy the Sample Labeling tool](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/v21/deploy-label-tool?view=doc-intel-2.1.0) +- [Train a custom model using the Sample Labeling tool](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/v21/label-tool?view=doc-intel-2.1.0) +- [Train models with the sample-labeling tool](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/v21/supervised-table-tags?view=doc-intel-2.1.0) +- [Azure Cosmos DB - Database for the AI Era](https://learn.microsoft.com/en-us/azure/cosmos-db/introduction) +- [Consistency levels in Azure Cosmos DB](https://learn.microsoft.com/en-us/azure/cosmos-db/consistency-levels) +- [Azure Cosmos DB SQL API client library for Python](https://learn.microsoft.com/en-us/python/api/overview/azure/cosmos-readme?view=azure-python) +- [CosmosClient class documentation](https://learn.microsoft.com/en-us/python/api/azure-cosmos/azure.cosmos.cosmos_client.cosmosclient?view=azure-python) +- [Cosmos AAD Authentication](https://learn.microsoft.com/en-us/python/api/overview/azure/cosmos-readme?view=azure-python#aad-authentication) +- [Cosmos python examples](https://learn.microsoft.com/en-us/python/api/overview/azure/cosmos-readme?view=azure-python#examples) +- [Use control plane role-based access control with Azure Cosmos DB for NoSQL](https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/security/how-to-grant-control-plane-role-based-access?tabs=built-in-definition%2Ccsharp&pivots=azure-interface-portal) +- [Use data plane role-based access control with Azure Cosmos DB for NoSQL](https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/security/how-to-grant-data-plane-role-based-access?tabs=built-in-definition%2Ccsharp&pivots=azure-interface-cli) +- [Create or update Azure custom roles using Azure CLI](https://learn.microsoft.com/en-us/azure/role-based-access-control/custom-roles-cli) +- [Document Intelligence query field extraction](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/concept/query-fields?view=doc-intel-4.0.0) +- [What's new in Azure AI Document 
Intelligence](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/whats-new?view=doc-intel-4.0.0) +- [Managed identities for Document Intelligence](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/authentication/managed-identities?view=doc-intel-4.0.0) + +
+ +
+Table of Content (Click to expand) + +- [Important Considerations for Production Environment](#important-considerations-for-production-environment) +- [Prerequisites](#prerequisites) +- [Where to start?](#where-to-start) +- [Overview](#overview) +- [Function App Hosting Options](#function-app-hosting-options) +- [Function App: Configure/Validate the Environment variables](#function-app-configurevalidate-the-environment-variables) +- [Function App: Develop the logic](#function-app-develop-the-logic) +- [Test the solution](#test-the-solution) + +
+ +> How to extract layout elements from PDFs stored in an Azure Storage Account, process them using Azure Document Intelligence, and store the results in Cosmos DB for further analysis. +> +> 1. Upload your PDFs to an Azure Blob Storage container.
+> 2. An Azure Function is triggered by the upload, which calls the Azure Document Intelligence Layout API to analyze the document structure.
+> 3. The extracted layout data (such as tables, checkboxes, and text) is parsed and stored in a Cosmos DB database, providing a seamless, automated workflow from document upload to data storage (a short code sketch of this flow appears after the image below). + +> [!NOTE] +> Advantages of Document Intelligence for organizations handling large volumes of documents:
+> +> - Utilizes natural language processing, computer vision, deep learning, and machine learning.
+> - Handles structured, semi-structured, and unstructured documents.
+> - Automates the extraction and transformation of layout data into usable formats like JSON or CSV. + +
+ Centered Image +
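+To make the flow above concrete, the sketch below shows the core Layout API call the function in this repo performs: analyze a PDF with the `prebuilt-layout` model and collect lines, tables, and selection marks into JSON. It assumes the `azure-ai-formrecognizer` package and the same `FORM_RECOGNIZER_ENDPOINT`/`FORM_RECOGNIZER_KEY` settings described later; the local file name is only an example, and [`src/function_app.py`](./src/function_app.py) remains the reference implementation.
+
+```python
+import json
+import os
+
+from azure.ai.formrecognizer import DocumentAnalysisClient
+from azure.core.credentials import AzureKeyCredential
+
+# Reuses the environment variable names configured for the Function App in this demo.
+client = DocumentAnalysisClient(
+    endpoint=os.environ["FORM_RECOGNIZER_ENDPOINT"],
+    credential=AzureKeyCredential(os.environ["FORM_RECOGNIZER_KEY"]),
+)
+
+with open("sample-layout.pdf", "rb") as pdf:  # hypothetical local sample file
+    poller = client.begin_analyze_document(model_id="prebuilt-layout", document=pdf)
+result = poller.result()
+
+# Collapse the layout result into the kind of structured JSON this solution stores in Cosmos DB.
+layout = {
+    "pages": [
+        {
+            "page_number": page.page_number,
+            "lines": [line.content for line in page.lines],
+            "selection_marks": [
+                {"state": mark.state, "confidence": mark.confidence}
+                for mark in page.selection_marks
+            ],
+        }
+        for page in result.pages
+    ],
+    "tables": [
+        {"rows": table.row_count, "columns": table.column_count}
+        for table in result.tables
+    ],
+}
+print(json.dumps(layout, indent=2))
+```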
+ +> [!NOTE] +> Azure Event Grid System Topics are free to create and manage. A System Topic is automatically created and managed by Azure for certain Azure services that emit events; it represents a source of events from an Azure resource (like a Storage Account, Key Vault, or Azure Maps). `You don't need to create or manage the topic yourself, Azure does it for you when you enable event publishing on a supported resource.`
+> +> - Emits predefined event types (e.g., Microsoft.Storage.BlobCreated, Microsoft.Resources.ResourceWriteSuccess).
+> - You can attach event handlers (like Azure Functions, Logic Apps, Webhooks) to respond to these events.
+> - Works seamlessly with serverless architectures for real-time automation.
+> For example: +> Suppose you have a Storage Account and want to trigger a function every time a new blob is uploaded:
+> - Azure automatically creates a System Topic for the Storage Account. +> - You subscribe to the BlobCreated event. +> - When a blob is uploaded, Event Grid routes the event to your Azure Function. + +
+ Centered Image +
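+Following the example above, a function that consumes the `Microsoft.Storage.BlobCreated` event directly through Event Grid could look roughly like this. This is a minimal sketch using the Python v2 programming model; the handler name is illustrative, and the demo in this repo actually wires the function through a Blob trigger instead (see [`src/function_app.py`](./src/function_app.py)).
+
+```python
+import logging
+
+import azure.functions as func
+
+app = func.FunctionApp()
+
+# Hypothetical Event Grid-triggered handler: a System Topic subscription on the
+# Storage Account would route BlobCreated events here.
+@app.event_grid_trigger(arg_name="event")
+def on_blob_created(event: func.EventGridEvent) -> None:
+    payload = event.get_json()  # event data: blob URL, content type, size, etc.
+    logging.info("Event type: %s, subject: %s", event.event_type, event.subject)
+    logging.info("Blob URL: %s", payload.get("url"))
+```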
+ +## Important Considerations for Production Environment + +
+ Private Network Configuration + + > For enhanced security, consider configuring your Azure resources to operate within a private network. This can be achieved using Azure Virtual Network (VNet) to isolate your resources and control inbound and outbound traffic. Implementing private endpoints for services like Azure Blob Storage and Azure Functions can further secure your data by restricting access to your VNet. + +
+ +
+ Security + + > Ensure that you implement appropriate security measures when deploying this solution in a production environment. This includes:
+ > + > - Securing Access: Use Microsoft Entra ID (formerly known as Azure Active Directory or Azure AD) for authentication and role-based access control (RBAC) to manage permissions.
+ > - Managing Secrets: Store sensitive information such as connection strings and API keys in Azure Key Vault.
+ > - Data Encryption: Enable encryption for data at rest and in transit to protect sensitive information. + +
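+As a sketch of the first two points (access and secrets), the snippet below authenticates with `DefaultAzureCredential`, which picks up the Function App's managed identity once deployed, and reads a connection string from Key Vault instead of app settings. This demo keeps its keys in Function App settings, so the vault URL and secret name here are placeholders, and `azure-keyvault-secrets` would need to be added to `requirements.txt`.
+
+```python
+import os
+
+from azure.identity import DefaultAzureCredential
+from azure.keyvault.secrets import SecretClient
+
+# Placeholder vault URL; point this at your own Key Vault in a real deployment.
+vault_url = os.environ.get("KEY_VAULT_URL", "https://<your-key-vault>.vault.azure.net")
+
+credential = DefaultAzureCredential()  # uses the managed identity when running in Azure
+secrets = SecretClient(vault_url=vault_url, credential=credential)
+
+# Hypothetical secret name holding the Cosmos DB connection string.
+cosmos_connection_string = secrets.get_secret("cosmos-db-connection-string").value
+```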
+ +
+ Scalability + + > While this example provides a basic setup, you may need to scale the resources based on your specific requirements. Azure services offer various scaling options to handle increased workloads. Consider using:
+ > + > - Auto-scaling: Configure auto-scaling for Azure Functions and other services to automatically adjust based on demand.
+ > - Load Balancing: Use Azure Load Balancer or Application Gateway to distribute traffic and ensure high availability. + +
+ +
+ Cost Management + + > Monitor and manage the costs associated with your Azure resources. Use Azure Cost Management and Billing to track usage and optimize resource allocation. + +
+ +
+ Compliance + + > Ensure that your deployment complies with relevant regulations and standards. Use Azure Policy to enforce compliance and governance policies across your resources. +
+ +
+ Disaster Recovery + +> Implement a disaster recovery plan to ensure business continuity in case of failures. Use Azure Site Recovery and backup solutions to protect your data and applications. + +
+ +## Prerequisites + +- An `Azure subscription is required`. All other resources, including instructions for creating a Resource Group, are provided in this workshop. +- `Contributor role assigned or any custom role that allows`: access to manage all resources, and the ability to deploy resources within the subscription. +- If you choose to use the Terraform approach, please ensure that: + - [Terraform is installed on your local machine](https://developer.hashicorp.com/terraform/tutorials/azure-get-started/install-cli#install-terraform). + - [Install the Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli) to work with both Terraform and Azure commands. + +## Where to start? + +1. Please follow the [Terraform guide](./terraform-infrastructure/) to deploy the necessary Azure resources for the workshop. +2. Since this method `skips the creation of each resource` manually, proceed with the configuration from [Configure/Validate the Environment variables](#function-app-configurevalidate-the-environment-variables). + +> [!IMPORTANT] +> Regarding `Networking`, this example covers `Public access configuration` and `system-managed identity`. However, please ensure you `review your privacy requirements and adjust network and access settings as necessary for your specific case`. + +## Overview + +> Using Cosmos DB provides a flexible, scalable, and globally distributed database solution that can handle both structured and semi-structured data efficiently.
+> +> - `Azure Blob Storage`: Store the PDF invoices.
+> - `Azure Functions`: Trigger on new PDF uploads, extract data, and process it.
+> - `Azure SQL Database or Cosmos DB`: Store the extracted data for querying and analytics.
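+> As a side note, dropping a PDF into the `pdfinvoices` container is all it takes to start the pipeline. The sketch below shows a programmatic upload with `azure-storage-blob` (not part of this demo's requirements); the connection-string variable mirrors the Function App setting used later, and uploading through the portal, as shown in the test section, works just as well.
+
+```python
+import os
+
+from azure.storage.blob import BlobServiceClient
+
+# Assumes the same connection string exposed to the Function App as an app setting.
+service = BlobServiceClient.from_connection_string(os.environ["invoicecontosostorage_STORAGE"])
+blob = service.get_blob_client(container="pdfinvoices", blob="sample-layout.pdf")
+
+with open("sample-layout.pdf", "rb") as data:  # hypothetical local sample file
+    blob.upload_blob(data, overwrite=True)  # the Blob trigger picks it up from here
+```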
+ +| Resource | Recommendation | +|---------------------------|----------------------------------------------------------------------------------------------------------------------| +| **Azure Blob Storage** | Use for storing the PDF files. This keeps your file storage separate from your data storage, which is a common best practice. | +| **Azure SQL Database** | Use if your data is highly structured and you need complex queries and transactions. | +| **Azure Cosmos DB** | Use if you need a globally distributed database with low latency and the ability to handle semi-structured data. | + +## Function App Hosting Options + +> In the context of Azure Function Apps, a `hosting option refers to the plan you choose to run your function app`. This choice affects how your function app is scaled, the resources available to each function app instance, and the support for advanced functionalities like virtual network connectivity and container support. + +> [!TIP] +> +> - `Scale to Zero`: Indicates whether the service can automatically scale down to zero instances when idle. +> - **IDLE** stands for: +> - **I** – Inactive +> - **D** – During +> - **L** – Low +> - **E** – Engagement +> - In other words, when the application is not actively handling requests or events (it's in a low-activity or paused state). +> - `Scale Behavior`: Describes how the service scales (e.g., `event-driven`, `dedicated`, or `containerized`). +> - `Virtual Networking`: Whether the service supports integration with virtual networks for secure communication. +> - `Dedicated Compute & Reserved Cold Start`: Availability of always-on compute to avoid cold starts and ensure low latency. +> - `Max Scale Out (Instances)`: Maximum number of instances the service can scale out to. +> - `Example AI Use Cases`: Real-world scenarios where each plan excels. + +
+Flex Consumption + +| Feature | Description | +|--------|-------------| +| **Scale to Zero** | `Yes` | +| **Scale Behavior** | `Fast event-driven` | +| **Virtual Networking** | `Optional` | +| **Dedicated Compute & Reserved Cold Start** | `Optional (Always Ready)` | +| **Max Scale Out (Instances)** | `1000` | +| **Example AI Use Cases** | `Real-time data processing` for AI models, `high-traffic AI-powered APIs`, `event-driven AI microservices`. Ideal for fraud detection, real-time recommendations, NLP, and computer vision services. | + +
+ +
+Consumption + +| Feature | Description | +|--------|-------------| +| **Scale to Zero** | `Yes` | +| **Scale Behavior** | `Event-driven` | +| **Virtual Networking** | `Optional` | +| **Dedicated Compute & Reserved Cold Start** | `No` | +| **Max Scale Out (Instances)** | `200` | +| **Example AI Use Cases** | `Lightweight AI APIs`, `scheduled AI tasks`, `low-traffic AI event processing`. Great for sentiment analysis, simple image recognition, and batch ML tasks. | + +
+ +
+Functions Premium + +| Feature | Description | +|--------|-------------| +| **Scale to Zero** | `No` | +| **Scale Behavior** | `Event-driven with premium options` | +| **Virtual Networking** | `Yes` | +| **Dedicated Compute & Reserved Cold Start** | `Yes` | +| **Max Scale Out (Instances)** | `100` | +| **Example AI Use Cases** | `Enterprise AI applications`, `low-latency AI APIs`, `VNet integration`. Ideal for secure, high-performance AI services like customer support and analytics. | + +
+ +
+App Service + +| Feature | Description | +|--------|-------------| +| **Scale to Zero** | `No` | +| **Scale Behavior** | `Dedicated VMs` | +| **Virtual Networking** | `Yes` | +| **Dedicated Compute & Reserved Cold Start** | `Yes` | +| **Max Scale Out (Instances)** | `Varies` | +| **Example AI Use Cases** | `AI-powered web applications`, `dedicated resources`. Great for chatbots, personalized content, and intensive AI inference. | + +
+ +
+Container Apps Env. + +| Feature | Description | +|--------|-------------| +| **Scale to Zero** | `No` | +| **Scale Behavior** | `Containerized microservices environment` | +| **Virtual Networking** | `Yes` | +| **Dedicated Compute & Reserved Cold Start** | `Yes` | +| **Max Scale Out (Instances)** | `Varies` | +| **Example AI Use Cases** | `AI microservices architecture`, `containerized AI workloads`, `complex AI workflows`. Ideal for orchestrating AI services like image processing, text analysis, and real-time analytics. | + +
+ +## Function App: Configure/Validate the Environment variables + +> [!NOTE] +> This example is using system-assigned managed identity to assign RBACs (Role-based Access Control). + +- Under `Settings`, go to `Environment variables`. And `+ Add` the following variables: + + - `COSMOS_DB_ENDPOINT`: Your Cosmos DB account endpoint 🡢 `Review the existence of this, if not create it` + - `COSMOS_DB_KEY`: Your Cosmos DB account key 🡢 `Review the existence of this, if not create it` + - `COSMOS_DB_CONNECTION_STRING`: Your Cosmos DB connection string 🡢 `Review the existence of this, if not create it` + - `invoicecontosostorage_STORAGE`: Your Storage Account connection string 🡢 `Review the existence of this, if not create it` + - `FORM_RECOGNIZER_ENDPOINT`: For example: `https://.cognitiveservices.azure.com/` 🡢 `Review the existence of this, if not create it` + - `FORM_RECOGNIZER_KEY`: Your Documment Intelligence Key (Form Recognizer). 🡢 + - `FUNCTIONS_EXTENSION_VERSION`: `~4` 🡢 `Review the existence of this, if not create it` + - `WEBSITE_RUN_FROM_PACKAGE`: `1` 🡢 `Review the existence of this, if not create it` + - `FUNCTIONS_WORKER_RUNTIME`: `python` 🡢 `Review the existence of this, if not create it` + - `FUNCTIONS_NODE_BLOCK_ON_ENTRY_POINT_ERROR`: `true` (This setting ensures that all entry point errors are visible in your application insights logs). 🡢 `Review the existence of this, if not create it` + + image + + image + + image + + image + + - Click on `Apply` to save your configuration. + + image + +## Function App: Develop the logic + +- You need to install [VSCode](https://code.visualstudio.com/download) +- Install python from Microsoft store: + + image + +- Open VSCode, and install some extensions: `python`, and `Azure Tools`. + + image + + image + +- Click on the `Azure` icon, and `sign in` into your account. Allow the extension `Azure Resources` to sign in using Microsoft, it will open a browser window. After doing so, you will be able to see your subscription and resources. + + image + +- Under Workspace, click on `Create Function Project`, and choose a path in your local computer to develop your function. + + image + +- Choose the language, in this case is `python`: + + image + +- Select the model version, for this example let's use `v2`: + + image + +- For the python interpreter, let's use the one installed via `Microsoft Store`: + + image + +- Choose a template (e.g., **Blob trigger**) and configure it to trigger on new PDF uploads in your Blob container. + + image + +- Provide a function name, like `BlobTriggerContosoPDFInvoicesDocIntelligence`: + + image + +- Next, it will prompt you for the path of the blob container where you expect the function to be triggered after a file is uploaded. In this case is `pdfinvoices` as was previously created. + + image + +- Click on `Create new local app settings`, and then choose your subscription. + + image + +- Choose `Azure Storage Account for remote storage`, and select one. I'll be using the `invoicecontosostorage`. + + image + +- Then click on `Open in the current window`. You will see something like this: + + image + +- Now we need to update the function code to extract data from PDFs and store it in Cosmos DB, use this an example: + + > 1. **PDF Upload**: A PDF file is uploaded to the Azure Blob Storage container (`pdfinvoices`). + > 2. **Trigger Azure Function**: The upload triggers the Azure Function `BlobTriggerContosoPDFLayoutsDocIntelligence`. + > 3. 
**Initialize Clients**: Sets up connections to Azure Document Intelligence and Cosmos DB. + > - Initializes the `DocumentAnalysisClient` using the `FORM_RECOGNIZER_ENDPOINT` and `FORM_RECOGNIZER_KEY` environment variables. + > - Initializes the `CosmosClient` using Azure Active Directory (AAD) via `DefaultAzureCredential`. + > 4. **Read PDF from Blob Storage**: Reads the PDF content from the blob into a byte stream. + > 5. **Analyze PDF**: Uses Azure Document Intelligence to analyze the layout of the PDF. + > - Calls `begin_analyze_document` with the `prebuilt-layout` model. + > - Waits for the analysis to complete and retrieves the layout result. + > 6. **Extract Layout Data**: Parses and structures the layout data from the analysis result. + > - Extracts lines, tables, and selection marks from each page. + > - Logs styles (e.g., handwritten content) and organizes data into a structured dictionary. + > 7. **Save Data to Cosmos DB**: Saves the structured layout data to Cosmos DB. + > - Ensures the database (`ContosoDBDocIntellig`) and container (`Layouts`) exist or creates them. + > - Inserts or updates the layout data using `upsert_item`. + > 8. **Logging (Process and Errors)**: Logs each step of the process, including success messages and detailed error handling for debugging and monitoring. + + - Update the function_app.py, for example [see the code used in this demo](./src/function_app.py): + + | Template Blob Trigger | Function Code updated | + | --- | --- | + | image | image| + + - Now, let's update the `requirements.txt`, [see the code used in this demo](./src/requirements.txt): + + | Template `requirements.txt` | Updated `requirements.txt` | + | --- | --- | + | image | image| + + - Since this function has already been tested, you can deploy your code to the function app in your subscription. If you want to test, you can use run your function locally for testing. + - Click on the `Azure` icon. + - Under `workspace`, click on the `Function App` icon. + - Click on `Deploy to Azure`. + + image + + - Select your `subscription`, your `function app`, and accept the prompt to overwrite: + + image + + - After completing, you see the status in your terminal: + + image + + image + +> [!IMPORTANT] +> If you need further assistance with the code, please click [here to view all the function code](./src/). + +> [!NOTE] +> Please ensure that all specified roles are assigned to the Function App. The provided example used `System assigned` for the Function App to facilitate the role assignment. + +## Test the solution + +> [!IMPORTANT] +> Please ensure that the user/system admin responsible for uploading the PDFs to the blob container has the necessary permissions. The error below illustrates what might occur if these roles are missing.
+> image
+> In that case, go to `Access Control (IAM)`, click on `+ Add`, and `Add role assignment`:
+> image
+> Search for `Storage Blob Data Contributor`, click `Next`.
+> image
+> Then, click on `select members` and search for your user/systen admin. Finally click on `Review + assign`. + +> Upload sample PDF invoices to the Blob container and verify that data is correctly ingested and stored in Cosmos DB. + +- Click on `Upload`, then select `Browse for files` and choose your PDF invoices to be stored in the blob container, which will trigger the function app to parse them. + + image + +- Check the logs, and traces from your function with `Application Insights`: + + image + +- Under `Investigate`, click on `Performance`. Filter by time range, and `drill into the samples`. Sort the results by date (if you have many, like in my case) and click on the last one. + + image + +- Click on `View all`: + + image + +- Check all the logs, and traces generated. Also review the information parsed: + + image + +- Validate that the information was uploaded to the Cosmos DB. Under `Data Explorer`, check your `Database`. + + image + + +
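+- Optionally, you can confirm the ingestion from code as well. The sketch below queries the `Layouts` container that the function writes to, reusing the `COSMOS_DB_ENDPOINT` and `COSMOS_DB_KEY` values from the environment variables section; a key-based client is shown purely for a quick local check, not as the recommended production pattern.
+
+```python
+import os
+
+from azure.cosmos import CosmosClient
+
+client = CosmosClient(url=os.environ["COSMOS_DB_ENDPOINT"], credential=os.environ["COSMOS_DB_KEY"])
+container = client.get_database_client("ContosoDBDocIntellig").get_container_client("Layouts")
+
+# List the upserted layout documents and how many pages each one contains.
+for item in container.query_items(query="SELECT c.id, c.pages FROM c", enable_cross_partition_query=True):
+    print(item["id"], "pages:", len(item.get("pages", [])))
+```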
+ Total views +

Refresh Date: 2025-07-21

+
+ diff --git a/docs/automated-PDFLayoutprocessing-FunctionApp-DocIntellig-AI-Vision.drawio b/docs/automated-PDFLayoutprocessing-FunctionApp-DocIntellig-AI-Vision.drawio new file mode 100644 index 0000000..b05d1f0 --- /dev/null +++ b/docs/automated-PDFLayoutprocessing-FunctionApp-DocIntellig-AI-Vision.drawio @@ -0,0 +1,105 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/metrics.json b/metrics.json new file mode 100644 index 0000000..8665541 --- /dev/null +++ b/metrics.json @@ -0,0 +1,37 @@ +[ + { + "date": "2025-07-07", + "count": 330, + "uniques": 20 + }, + { + "date": "2025-07-08", + "count": 159, + "uniques": 6 + }, + { + "date": "2025-07-10", + "count": 482, + "uniques": 1 + }, + { + "date": "2025-07-11", + "count": 170, + "uniques": 4 + }, + { + "date": "2025-07-12", + "count": 7, + "uniques": 1 + }, + { + "date": "2025-07-14", + "count": 130, + "uniques": 2 + }, + { + "date": "2025-07-15", + "count": 2, + "uniques": 1 + } +] \ No newline at end of file diff --git a/src/.funcignore b/src/.funcignore new file mode 100644 index 0000000..f1110d3 --- /dev/null +++ b/src/.funcignore @@ -0,0 +1,8 @@ +.git* +.vscode +__azurite_db*__.json +__blobstorage__ +__queuestorage__ +local.settings.json +test +.venv diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..74fc765 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,135 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don’t work, or not +# install all needed dependencies. 
+#Pipfile.lock + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Azure Functions artifacts +bin +obj +appsettings.json +local.settings.json + +# Azurite artifacts +__blobstorage__ +__queuestorage__ +__azurite_db*__.json +.python_packages diff --git a/src/function_app.py b/src/function_app.py new file mode 100644 index 0000000..3e86267 --- /dev/null +++ b/src/function_app.py @@ -0,0 +1,243 @@ +import logging +import azure.functions as func +from azure.ai.formrecognizer import DocumentAnalysisClient +from azure.core.credentials import AzureKeyCredential +from azure.cosmos import CosmosClient, PartitionKey, exceptions +from azure.identity import DefaultAzureCredential +import os +import uuid +import json + +# For image conversion and vision API +from typing import List +from io import BytesIO +import requests # For REST API to Vision +from pdf2image import convert_from_bytes # For PDF to image conversion + +app = func.FunctionApp(http_auth_level=func.AuthLevel.FUNCTION) + +## DEFINITIONS +def initialize_form_recognizer_client(): + endpoint = os.getenv("FORM_RECOGNIZER_ENDPOINT") + key = os.getenv("FORM_RECOGNIZER_KEY") + if not isinstance(key, str): + raise ValueError("FORM_RECOGNIZER_KEY must be a string") + logging.info(f"Form Recognizer endpoint: {endpoint}") + return DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key)) + +def read_pdf_content(myblob): + logging.info(f"Reading PDF content from blob: {myblob.name}") + return myblob.read() + +def analyze_pdf(form_recognizer_client, pdf_bytes): + logging.info("Starting PDF layout analysis.") + poller = form_recognizer_client.begin_analyze_document( + model_id="prebuilt-layout", + document=pdf_bytes + ) + logging.info("PDF layout analysis in progress.") + result = poller.result() + logging.info("PDF layout analysis completed.") + logging.info(f"Document has {len(result.pages)} page(s), {len(result.tables)} table(s), and {len(result.styles)} style(s).") + return result + +def extract_layout_data(result, visual_cues: List[dict] = None): + logging.info("Extracting layout data from analysis result.") + + layout_data = { + "id": str(uuid.uuid4()), + "pages": [] + } + visual_cues = visual_cues or [] # List of dicts with visual cue info per cell + + # Log styles + for idx, style in enumerate(result.styles): + content_type = "handwritten" if style.is_handwritten else "no handwritten" + logging.info(f"Document contains {content_type} content") + + # Process each page + for page in result.pages: + logging.info(f"--- Page {page.page_number} ---") + page_data = { + "page_number": page.page_number, + "lines": [line.content for line in page.lines], + "tables": [], + "selection_marks": [ + {"state": mark.state, "confidence": mark.confidence} + for mark in page.selection_marks + ] + } + + # Log extracted lines + for line_idx, line in enumerate(page.lines): + logging.info(f"Line {line_idx}: '{line.content}'") + + # Log selection marks + for selection_mark in page.selection_marks: + logging.info( + f"Selection mark is '{selection_mark.state}' with confidence {selection_mark.confidence}" + ) + + # Extract tables + page_tables = [ + table for table in result.tables + if any(region.page_number == page.page_number for 
region in table.bounding_regions) + ] + + for table_index, table in enumerate(page_tables): + logging.info(f"Table {table_index}: {table.row_count} rows, {table.column_count} columns") + + table_data = { + "row_count": table.row_count, + "column_count": table.column_count, + "cells": [] + } + + for cell in table.cells: + content = cell.content.strip() + # Find matching visual cue for this cell (if any) + cue = next((vc for vc in visual_cues if vc.get("page_number") == page.page_number and vc.get("row_index") == cell.row_index and vc.get("column_index") == cell.column_index), None) + cell_info = { + "row_index": cell.row_index, + "column_index": cell.column_index, + "content": content, + "visual_cue": cue["cue_type"] if cue else None + } + table_data["cells"].append(cell_info) + logging.info(f"Cell[{cell.row_index}][{cell.column_index}]: '{content}', visual_cue: {cell_info['visual_cue']}") + + page_data["tables"].append(table_data) + + layout_data["pages"].append(page_data) + + try: + preview = json.dumps(layout_data, indent=2) + logging.info("Structured layout data preview:\n" + preview) + except Exception as e: + logging.warning(f"Could not serialize layout data for preview: {e}") + + return layout_data + +def save_layout_data_to_cosmos(layout_data): + try: + endpoint = os.getenv("COSMOS_DB_ENDPOINT") + key = os.getenv("COSMOS_DB_KEY") + aad_credentials = DefaultAzureCredential() + client = CosmosClient(endpoint, credential=aad_credentials, consistency_level='Session') + logging.info("Successfully connected to Cosmos DB using AAD default credential") + except Exception as e: + logging.error(f"Error connecting to Cosmos DB: {e}") + return + + database_name = "ContosoDBDocIntellig" + container_name = "Layouts" + + try: + database = client.create_database_if_not_exists(database_name) + logging.info(f"Database '{database_name}' does not exist. Creating it.") + except exceptions.CosmosResourceExistsError: + database = client.get_database_client(database_name) + logging.info(f"Database '{database_name}' already exists.") + + database.read() + logging.info(f"Reading into '{database_name}' DB") + + try: + container = database.create_container( + id=container_name, + partition_key=PartitionKey(path="/id"), + offer_throughput=400 + ) + logging.info(f"Container '{container_name}' does not exist. Creating it.") + except exceptions.CosmosResourceExistsError: + container = database.get_container_client(container_name) + logging.info(f"Container '{container_name}' already exists.") + except exceptions.CosmosHttpResponseError: + raise + + container.read() + logging.info(f"Reading into '{container}' container") + + try: + response = container.upsert_item(layout_data) + logging.info(f"Saved processed layout data to Cosmos DB. 
Response: {response}") + except Exception as e: + logging.error(f"Error inserting item into Cosmos DB: {e}") + +## MAIN +@app.blob_trigger(arg_name="myblob", path="pdfinvoices/{name}", + connection="invoicecontosostorage_STORAGE") +def call_vision_api(image_bytes, subscription_key, endpoint): + vision_url = endpoint + "/vision/v3.2/analyze" + headers = { + 'Ocp-Apim-Subscription-Key': subscription_key, + 'Content-Type': 'application/octet-stream' + } + params = { + 'visualFeatures': 'Objects,Color', # Add more features if needed + } + response = requests.post(vision_url, headers=headers, params=params, data=image_bytes) + response.raise_for_status() + return response.json() + +def extract_visual_cues_from_vision(vision_result, page_number): + # Example: Detect gray fills, checkmarks, hand-drawn marks + cues = [] + # This is a placeholder. You need to parse vision_result for your cues. + # For example, if vision_result['objects'] contains a 'checkmark' or color info for gray fill + # cues.append({"page_number": page_number, "row_index": ..., "column_index": ..., "cue_type": "gray_fill"}) + return cues + +def convert_pdf_to_images(pdf_bytes): + images = convert_from_bytes(pdf_bytes) + return images + +def BlobTriggerContosoPDFLayoutsDocIntelligence(myblob: func.InputStream): + logging.info(f"Python blob trigger function processed blob\n" + f"Name: {myblob.name}\n" + f"Blob Size: {myblob.length} bytes") + + try: + form_recognizer_client = initialize_form_recognizer_client() + pdf_bytes = read_pdf_content(myblob) + logging.info("Successfully read PDF content from blob.") + except Exception as e: + logging.error(f"Error reading PDF: {e}") + return + + try: + result = analyze_pdf(form_recognizer_client, pdf_bytes) + logging.info("Successfully analyzed PDF using Document Intelligence.") + except Exception as e: + logging.error(f"Error analyzing PDF: {e}") + return + + # --- Step: Convert PDF to image and call Azure AI Vision --- + visual_cues = [] + try: + images = convert_pdf_to_images(pdf_bytes) + vision_key = os.getenv("VISION_API_KEY") + vision_endpoint = os.getenv("VISION_API_ENDPOINT") + for page_num, image in enumerate(images, start=1): + img_bytes_io = BytesIO() + image.save(img_bytes_io, format='JPEG') + img_bytes = img_bytes_io.getvalue() + vision_result = call_vision_api(img_bytes, vision_key, vision_endpoint) + cues = extract_visual_cues_from_vision(vision_result, page_num) + visual_cues.extend(cues) + logging.info(f"Visual cues extracted: {visual_cues}") + except Exception as e: + logging.error(f"Error processing visual cues with AI Vision: {e}") + + try: + layout_data = extract_layout_data(result, visual_cues) + logging.info("Successfully extracted and merged layout data.") + except Exception as e: + logging.error(f"Error extracting layout data: {e}") + return + + try: + save_layout_data_to_cosmos(layout_data) + logging.info("Successfully saved layout data to Cosmos DB.") + except Exception as e: + logging.error(f"Error saving layout data to Cosmos DB: {e}") diff --git a/src/host.json b/src/host.json new file mode 100644 index 0000000..d5f63c0 --- /dev/null +++ b/src/host.json @@ -0,0 +1,15 @@ +{ + "version": "2.0", + "logging": { + "applicationInsights": { + "samplingSettings": { + "isEnabled": true, + "excludedTypes": "Request" + } + } + }, + "extensionBundle": { + "id": "Microsoft.Azure.Functions.ExtensionBundle", + "version": "[4.*, 5.0.0)" + } +} \ No newline at end of file diff --git a/src/requirements.txt b/src/requirements.txt new file mode 100644 index 0000000..839eb3a --- 
/dev/null +++ b/src/requirements.txt @@ -0,0 +1,9 @@ +# DO NOT include azure-functions-worker in this file +# The Python Worker is managed by Azure Functions platform +# Manually managing azure-functions-worker may cause unexpected issues + +azure-functions +azure-ai-formrecognizer +azure-core +azure-cosmos==4.3.0 +azure-identity==1.7.0 diff --git a/terraform-infrastructure/README.md b/terraform-infrastructure/README.md new file mode 100644 index 0000000..751f985 --- /dev/null +++ b/terraform-infrastructure/README.md @@ -0,0 +1,115 @@ +# Azure Infrastructure Terraform Template + +Costa Rica + +[![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) +[brown9804](https://github.com/brown9804) + +Last updated: 2025-07-16 + +---------- + +
+ Centered Image +
+ +
+ Centered Image +
+ +## Overview + +Templates structure: + +``` +. +├── README.md +├────── main.tf +├────── variables.tf +├────── provider.tf +├────── terraform.tfvars +├────── outputs.tf +``` + +- main.tf `(Main Terraform configuration file)`: This file contains the core infrastructure code. It defines the resources you want to create, such as virtual machines, networks, and storage. It's the primary file where you describe your infrastructure in a declarative manner. +- variables.tf `(Variable definitions)`: This file is used to define variables that can be used throughout your Terraform configuration. By using variables, you can make your configuration more flexible and reusable. For example, you can define variables for resource names, sizes, and other parameters that might change between environments. +- provider.tf `(Provider configurations)`: Providers are plugins that Terraform uses to interact with cloud providers, SaaS providers, and other APIs. This file specifies which providers (e.g., AWS, Azure, Google Cloud) you are using and any necessary configuration for them, such as authentication details. +- terraform.tfvars `(Variable values)`: This file contains the actual values for the variables defined in `variables.tf`. By separating variable definitions and values, you can easily switch between different sets of values for different environments (e.g., development, staging, production) without changing the main configuration files. +- outputs.tf `(Output values)`: This file defines the output values that Terraform should return after applying the configuration. Outputs are useful for displaying information about the resources created, such as IP addresses, resource IDs, and other important details. They can also be used as inputs for other Terraform configurations or scripts. + +## How to execute it + +```mermaid +graph TD; + A[az login] --> B(terraform init) + B --> C{Terraform provisioning stage} + C -->|Review| D[terraform plan] + C -->|Order Now| E[terraform apply] + C -->|Delete Resource if needed| F[terraform destroy] +``` + +> [!IMPORTANT] +> Please modify `terraform.tfvars` with your information, then run the following flow. If you need more visual guidance, please check the video that illustrates the provisioning steps. + +1. **Login to Azure**: This command logs you into your Azure account. It opens a browser window where you can enter your Azure credentials. Once logged in, you can manage your Azure resources from the command line. + + > Go to the path where Terraform files are located: + + ```sh + cd terraform-infrastructure + ``` + + ```sh + az login + ``` + + img + + img + +2. **Initialize Terraform**: Initializes the working directory containing the Terraform configuration files. It downloads the necessary provider plugins and sets up the backend for storing the state. + + ``` sh + terraform init + ``` + + img + +3. **Terraform Provisioning Stage**: + + - **Review**: Creates an execution plan, showing what actions Terraform will take to achieve the desired state defined in your configuration files. It uses the variable values specified in `terraform.tfvars`. + + ```sh + terraform plan -var-file terraform.tfvars + ``` + + > At the end, you will see a message in green if everything was executed successfully: + + Screenshot 2025-03-18 145143 + + - **Order Now**: Applies the changes required to reach the desired state of the configuration. It prompts for confirmation before making any changes. It also uses the variable values specified in `terraform.tfvars`. 
+ + ```sh + terraform apply -var-file terraform.tfvars + ``` + + > At the end, you will see a message in green if everything was executed successfully: + + image + + - **Remove**: Destroys the infrastructure managed by Terraform. It prompts for confirmation before deleting any resources. It also uses the variable values specified in `terraform.tfvars`. + + ```sh + terraform destroy -var-file terraform.tfvars + ``` + + > At the end, you will see a message in green if everything was executed successfully: + + image + + +
+ Total views +

Refresh Date: 2025-07-21

+
+ diff --git a/terraform-infrastructure/main.tf b/terraform-infrastructure/main.tf new file mode 100644 index 0000000..4e31312 --- /dev/null +++ b/terraform-infrastructure/main.tf @@ -0,0 +1,408 @@ +# Resource Group +resource "azurerm_resource_group" "rg" { + name = var.resource_group_name + location = var.location + + # Output the resource group name + provisioner "local-exec" { + command = "echo Resource Group: ${self.name}" + } +} + +# Storage Account +resource "azurerm_storage_account" "storage" { + name = var.storage_account_name + resource_group_name = azurerm_resource_group.rg.name + location = azurerm_resource_group.rg.location + account_tier = "Standard" + account_replication_type = "LRS" + + depends_on = [azurerm_resource_group.rg] + + # Output the storage account name + provisioner "local-exec" { + command = "echo Storage Account: ${self.name}" + } +} + +# Blob Container for Input Files +resource "azurerm_storage_container" "input_container" { + name = "pdfinvoices" + storage_account_id = azurerm_storage_account.storage.id + container_access_type = "private" + + depends_on = [azurerm_storage_account.storage] + + # Output the container name + provisioner "local-exec" { + command = "echo Input Container: ${self.name}" + } +} + +# Blob Container for Output Files +resource "azurerm_storage_container" "output_container" { + name = "output" + storage_account_id = azurerm_storage_account.storage.id + container_access_type = "private" + + depends_on = [azurerm_storage_account.storage] + + # Output the container name + provisioner "local-exec" { + command = "echo Output Container: ${self.name}" + } +} + +# Storage Account +resource "azurerm_storage_account" "runtime" { + name = var.storage_account_name_runtime + resource_group_name = azurerm_resource_group.rg.name + location = azurerm_resource_group.rg.location + account_tier = "Standard" + account_replication_type = "LRS" + + depends_on = [azurerm_resource_group.rg] + + # Output the storage account name + provisioner "local-exec" { + command = "echo Storage Account: ${self.name}" + } +} + +# Assign Storage Blob Data Contributor role +resource "azurerm_role_assignment" "blob_data_contributor" { + scope = azurerm_storage_account.runtime.id + role_definition_name = "Storage Blob Data Contributor" + principal_id = azurerm_linux_function_app.function_app.identity[0].principal_id + + + depends_on = [ + azurerm_linux_function_app.function_app, + azurerm_storage_account.runtime + ] + +} + +# Assign Storage File Data SMB Share Contributor role +resource "azurerm_role_assignment" "file_data_smb_share_contributor" { + scope = azurerm_storage_account.runtime.id + role_definition_name = "Storage File Data SMB Share Contributor" + principal_id = azurerm_linux_function_app.function_app.identity[0].principal_id + + depends_on = [ + azurerm_linux_function_app.function_app, + azurerm_storage_account.runtime + ] +} + +# Assign Storage Blob Data Reader role +resource "azurerm_role_assignment" "blob_data_reader" { + scope = azurerm_storage_account.storage.id + role_definition_name = "Storage Blob Data Reader" + principal_id = azurerm_linux_function_app.function_app.identity[0].principal_id + + depends_on = [ + azurerm_linux_function_app.function_app, + azurerm_storage_account.storage # Replace with the actual resource name + ] +} + + +# Service Plan +resource "azurerm_service_plan" "asp" { + name = var.app_service_plan_name + location = azurerm_resource_group.rg.location + resource_group_name = azurerm_resource_group.rg.name + os_type = "Linux" + sku_name = 
"Y1" # Consumption plan + + depends_on = [azurerm_resource_group.rg] + + # Output the service plan name + provisioner "local-exec" { + command = "echo Service Plan: ${self.name}" + } +} + +# Application Insights +resource "azurerm_application_insights" "appinsights" { + name = var.app_insights_name + location = azurerm_resource_group.rg.location + resource_group_name = azurerm_resource_group.rg.name + application_type = "web" + workspace_id = azurerm_log_analytics_workspace.loganalytics.id + + depends_on = [azurerm_resource_group.rg] + + provisioner "local-exec" { + command = "echo Application Insights: ${self.name}" + } +} + +# Log Analytics Workspace +resource "azurerm_log_analytics_workspace" "loganalytics" { + name = var.log_analytics_workspace_name + location = azurerm_resource_group.rg.location + resource_group_name = azurerm_resource_group.rg.name + sku = "PerGB2018" + + depends_on = [azurerm_resource_group.rg] + + # Output the log analytics workspace name + provisioner "local-exec" { + command = "echo Log Analytics Workspace: ${self.name}" + } +} + +# Key Vault +resource "azurerm_key_vault" "keyvault" { + name = var.key_vault_name + location = azurerm_resource_group.rg.location + resource_group_name = azurerm_resource_group.rg.name + tenant_id = data.azurerm_client_config.current.tenant_id + sku_name = "standard" + + depends_on = [azurerm_resource_group.rg] + + # Output the key vault name + provisioner "local-exec" { + command = "echo Key Vault: ${self.name}" + } +} + +# Data source to get tenant ID +data "azurerm_client_config" "current" {} + +# CosmosDB +resource "azurerm_cosmosdb_account" "cosmosdb" { + name = var.cosmosdb_account_name + location = azurerm_resource_group.rg.location + resource_group_name = azurerm_resource_group.rg.name + offer_type = "Standard" + kind = "GlobalDocumentDB" + consistency_policy { + consistency_level = "Session" + } + + geo_location { + location = azurerm_resource_group.rg.location + failover_priority = 0 + } + + depends_on = [azurerm_resource_group.rg] +} + +# Cosmos DB SQL Database +resource "azurerm_cosmosdb_sql_database" "main" { + name = var.cosmosdb_sqldb_name + resource_group_name = azurerm_resource_group.rg.name + account_name = azurerm_cosmosdb_account.cosmosdb.name +} + +resource "azurerm_cosmosdb_sql_container" "outputcvscontainer" { + name = var.sql_container_name + resource_group_name = azurerm_resource_group.rg.name + account_name = azurerm_cosmosdb_account.cosmosdb.name + database_name = azurerm_cosmosdb_sql_database.main.name + throughput = var.throughput + partition_key_paths = ["/transactionId"] + partition_key_version = 1 + + indexing_policy { + indexing_mode = "consistent" + + included_path { + path = "/*" + } + + included_path { + path = "/included/?" + } + + excluded_path { + path = "/excluded/?" 
+ } + } + + unique_key { + paths = ["/definition/idlong", "/definition/idshort"] + } +} + +# Cosmos DB Operator +resource "azurerm_role_assignment" "cosmosdb_operator" { + scope = azurerm_cosmosdb_account.cosmosdb.id + role_definition_name = "Cosmos DB Operator" + principal_id = azurerm_linux_function_app.function_app.identity[0].principal_id + + depends_on = [ + azurerm_linux_function_app.function_app, + azurerm_cosmosdb_account.cosmosdb + ] +} + +# DocumentDB Account Contributor +resource "azurerm_role_assignment" "documentdb_contributor" { + scope = azurerm_cosmosdb_account.cosmosdb.id + role_definition_name = "DocumentDB Account Contributor" + principal_id = azurerm_linux_function_app.function_app.identity[0].principal_id + + depends_on = [ + azurerm_linux_function_app.function_app, + azurerm_cosmosdb_account.cosmosdb + ] +} + +# Azure AI Administrator +resource "azurerm_role_assignment" "azure_ai_admin" { + scope = azurerm_cosmosdb_account.cosmosdb.id + role_definition_name = "Azure AI Administrator" + principal_id = azurerm_linux_function_app.function_app.identity[0].principal_id + + depends_on = [ + azurerm_linux_function_app.function_app, + azurerm_cosmosdb_account.cosmosdb + ] +} + +# Cosmos DB Account Reader Role +resource "azurerm_role_assignment" "cosmosdb_reader" { + scope = azurerm_cosmosdb_account.cosmosdb.id + role_definition_name = "Cosmos DB Account Reader Role" + principal_id = azurerm_linux_function_app.function_app.identity[0].principal_id + + depends_on = [ + azurerm_linux_function_app.function_app, + azurerm_cosmosdb_account.cosmosdb + ] +} + +# Contributor +resource "azurerm_role_assignment" "contributor" { + scope = azurerm_cosmosdb_account.cosmosdb.id + role_definition_name = "Contributor" + principal_id = azurerm_linux_function_app.function_app.identity[0].principal_id + + depends_on = [ + azurerm_linux_function_app.function_app, + azurerm_cosmosdb_account.cosmosdb + ] +} + + +# Azure Form Recognizer (Document Intelligence) +resource "azurerm_cognitive_account" "form_recognizer" { + name = var.form_recognizer_name + location = azurerm_resource_group.rg.location + resource_group_name = azurerm_resource_group.rg.name + kind = "FormRecognizer" + sku_name = "S0" + + depends_on = [azurerm_resource_group.rg] + + provisioner "local-exec" { + command = "echo Form Recognizer: ${self.name}" + } +} + +# Azure AI Vision (Cognitive Services) +resource "azurerm_cognitive_account" "ai_vision" { + name = var.ai_vision_name + location = azurerm_resource_group.rg.location + resource_group_name = azurerm_resource_group.rg.name + kind = "CognitiveServices" + sku_name = var.ai_vision_sku + tags = var.ai_vision_tags + + depends_on = [azurerm_resource_group.rg] + + provisioner "local-exec" { + command = "echo AI Vision: ${self.name}" + } +} + +# We need to assign custom or built-in Cosmos DB SQL roles +# (like Cosmos DB Built-in Data Reader, etc.) at the data plane level, +# which is not currently supported directly in Terraform as of now. 
+# Workaround: use a null_resource with local-exec to run the Azure CLI command from +# Terraform, as follows: +locals { + cosmosdb_role_assignment_id_function = uuid() + cosmosdb_role_assignment_id_user = uuid() +} + +resource "null_resource" "cosmosdb_sql_role_assignment" { + provisioner "local-exec" { + command = "az cosmosdb sql role assignment create --resource-group ${azurerm_resource_group.rg.name} --account-name ${azurerm_cosmosdb_account.cosmosdb.name} --role-definition-id /subscriptions/${data.azurerm_client_config.current.subscription_id}/resourceGroups/${azurerm_resource_group.rg.name}/providers/Microsoft.DocumentDB/databaseAccounts/${azurerm_cosmosdb_account.cosmosdb.name}/sqlRoleDefinitions/00000000-0000-0000-0000-000000000002 --principal-id ${azurerm_linux_function_app.function_app.identity[0].principal_id} --scope ${azurerm_cosmosdb_account.cosmosdb.id} --role-assignment-id ${local.cosmosdb_role_assignment_id_function}" + } + + depends_on = [ + azurerm_linux_function_app.function_app, + azurerm_cosmosdb_account.cosmosdb + ] +} + +# Assign the Cosmos DB role to the user running the deployment +resource "null_resource" "cosmosdb_sql_role_assignment_user" { + provisioner "local-exec" { + command = "az cosmosdb sql role assignment create --resource-group ${azurerm_resource_group.rg.name} --account-name ${azurerm_cosmosdb_account.cosmosdb.name} --role-definition-id /subscriptions/${data.azurerm_client_config.current.subscription_id}/resourceGroups/${azurerm_resource_group.rg.name}/providers/Microsoft.DocumentDB/databaseAccounts/${azurerm_cosmosdb_account.cosmosdb.name}/sqlRoleDefinitions/00000000-0000-0000-0000-000000000002 --principal-id ${data.azurerm_client_config.current.object_id} --scope ${azurerm_cosmosdb_account.cosmosdb.id} --role-assignment-id ${local.cosmosdb_role_assignment_id_user}" + } + + depends_on = [ + azurerm_cosmosdb_account.cosmosdb + ] +} + +# Linux Function App +resource "azurerm_linux_function_app" "function_app" { + name = var.function_app_name + location = azurerm_resource_group.rg.location + resource_group_name = azurerm_resource_group.rg.name + service_plan_id = azurerm_service_plan.asp.id + storage_account_name = azurerm_storage_account.runtime.name + storage_account_access_key = azurerm_storage_account.runtime.primary_access_key + + identity { + type = "SystemAssigned" + } + + site_config { + # Other configurations can go here + application_stack { + python_version = "3.10" + } + } + + app_settings = { + "FUNCTIONS_WORKER_RUNTIME" = "python" + "FUNCTIONS_EXTENSION_VERSION" = "~4" + "FUNCTIONS_NODE_BLOCK_ON_ENTRY_POINT_ERROR" = "true" + "WEBSITE_RUN_FROM_PACKAGE" = "1" + + "COSMOS_DB_ENDPOINT" = azurerm_cosmosdb_account.cosmosdb.endpoint + "COSMOS_DB_KEY" = azurerm_cosmosdb_account.cosmosdb.primary_key + + "invoicecontosostorage_STORAGE" = azurerm_storage_account.storage.primary_connection_string + + "FORM_RECOGNIZER_ENDPOINT" = azurerm_cognitive_account.form_recognizer.endpoint + "FORM_RECOGNIZER_KEY" = azurerm_cognitive_account.form_recognizer.primary_access_key + + "APPINSIGHTS_INSTRUMENTATIONKEY" = azurerm_application_insights.appinsights.instrumentation_key + "APPLICATIONINSIGHTS_CONNECTION_STRING" = azurerm_application_insights.appinsights.connection_string + + # Azure AI Vision settings + "VISION_API_ENDPOINT" = azurerm_cognitive_account.ai_vision.endpoint + "VISION_API_KEY" = azurerm_cognitive_account.ai_vision.primary_access_key + } + + depends_on = [ + azurerm_service_plan.asp, + azurerm_application_insights.appinsights, 
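+    # Terraform already infers these dependencies from the service_plan_id and
+    # app_settings references above; the explicit depends_on is kept for readability.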
azurerm_cosmosdb_account.cosmosdb + + ] + + provisioner "local-exec" { + command = "echo Function App: ${self.name}" + } +} diff --git a/terraform-infrastructure/output.tf b/terraform-infrastructure/output.tf new file mode 100644 index 0000000..17d3946 --- /dev/null +++ b/terraform-infrastructure/output.tf @@ -0,0 +1,60 @@ +output "resource_group_name" { + description = "The name of the resource group." + value = azurerm_resource_group.rg.name +} + +output "storage_account_name" { + description = "The name of the storage account" + value = azurerm_storage_account.storage.name +} + +output "input_container_name" { + description = "The name of the input container" + value = azurerm_storage_container.input_container.name +} + +output "output_container_name" { + description = "The name of the output container" + value = azurerm_storage_container.output_container.name +} + +output "function_app_name" { + description = "The name of the Linux Function App." + value = azurerm_linux_function_app.function_app.name +} + +output "app_service_plan_name" { + description = "The name of the Service Plan" + value = azurerm_service_plan.asp.name +} + +output "app_insights_name" { + description = "The name of the Application Insights instance" + value = azurerm_application_insights.appinsights.name +} + +output "log_analytics_workspace_name" { + description = "The name of the Log Analytics workspace" + value = azurerm_log_analytics_workspace.loganalytics.name +} + +output "key_vault_name" { + description = "The name of the Key Vault" + value = azurerm_key_vault.keyvault.name +} + + +output "cosmosdb_account_name" { + description = "The name of the CosmosDB account." + value = azurerm_cosmosdb_account.cosmosdb.name +} + +# Output the Form Recognizer name +output "form_recognizer_name" { + value = azurerm_cognitive_account.form_recognizer.name +} + +# Output the Form Recognizer endpoint +output "form_recognizer_endpoint" { + value = azurerm_cognitive_account.form_recognizer.endpoint +} diff --git a/terraform-infrastructure/provider.tf b/terraform-infrastructure/provider.tf new file mode 100644 index 0000000..f4f5a1b --- /dev/null +++ b/terraform-infrastructure/provider.tf @@ -0,0 +1,25 @@ +# provider.tf +# This file configures the Azure provider to interact with Azure resources. +# It specifies the required provider and its version, along with provider-specific configurations. 
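+# Authentication is assumed to come from the Azure CLI (az login) during local runs,
+# or from the standard ARM_CLIENT_ID / ARM_CLIENT_SECRET / ARM_TENANT_ID /
+# ARM_SUBSCRIPTION_ID environment variables in CI.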
+ +terraform { + required_version = ">= 1.8, < 2.0" + # Specify the required provider and its version + required_providers { + azurerm = { + source = "hashicorp/azurerm" # Source of the AzureRM provider + version = "~> 4.16.0" # Version of the AzureRM provider + } + } +} + +provider "azurerm" { + features { # Enable features for the AzureRM provider + key_vault { + recover_soft_deleted_key_vaults = false + purge_soft_delete_on_destroy = true + } + } + + subscription_id = var.subscription_id # Use the subscription ID variable +} \ No newline at end of file diff --git a/terraform-infrastructure/terraform.tfvars b/terraform-infrastructure/terraform.tfvars new file mode 100644 index 0000000..b716b4b --- /dev/null +++ b/terraform-infrastructure/terraform.tfvars @@ -0,0 +1,30 @@ +# Sample values +subscription_id = "" # "your-subscription_id" +resource_group_name = "RG-PDFLayout-Processing-DocIntelligence" # "your-resource-group-name" +location = "West US" # "your-location" +# Storage Account +storage_account_name = "storageaccbrownpdfix2" # "your-storage-account-name" +storage_account_name_runtime = "runtimestorebrownix2" # "your-runtime-storage-account-name" +# Function App +function_app_name = "fapdfbrownix2" # "your-function-app-name" +# App Service Plan +app_service_plan_name = "asppdfbrownix2" # "your-app-service-plan-name" +# Application Insights +app_insights_name = "apppdfbrownix2" # "your-app-insights-name" +# Log Analytics Workspace +log_analytics_workspace_name = "logwspdfbrownix2" # "your-log-analytics-workspace-name" +# Key Vault +key_vault_name = "kvpdfrbrownrix2" # "your-key-vault-name" +# CosmosDB +cosmosdb_account_name = "cosmospdfbrownix2" # "your-cosmosdb-account-name" +# Form Recognizer -> Document Intelligence +form_recognizer_name = "docintelligt01ix2" # "your-document-intelligence-name" + +# AI Vision Service +ai_vision_name = "aivisionpdfrbrownix2" # "your-ai-vision-name" +ai_vision_sku = "S0" +ai_vision_tags = { + Environment = "Development" + Project = "PDF Processing" + Service = "AI Vision" +} diff --git a/terraform-infrastructure/variables.tf b/terraform-infrastructure/variables.tf new file mode 100644 index 0000000..d37c765 --- /dev/null +++ b/terraform-infrastructure/variables.tf @@ -0,0 +1,94 @@ +variable "subscription_id" { + description = "The subscription ID for the Azure account." + type = string +} + +variable "resource_group_name" { + description = "The name of the resource group." + type = string +} + +variable "location" { + description = "The Azure region where resources will be created." + type = string +} + + +variable "storage_account_name" { + description = "The name of the storage account" + type = string +} + +variable "storage_account_name_runtime" { + description = "The name of the storage account runtime (Function App Storage)" + type = string +} + +variable "function_app_name" { + description = "The name of the Linux Function App." 
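+  # Function App names become part of the default <name>.azurewebsites.net hostname,
+  # so they have to be globally unique.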
+ type = string +} + +variable "app_service_plan_name" { + description = "The name of the App Service plan" + type = string +} + +variable "app_insights_name" { + description = "The name of the Application Insights instance" + type = string +} + +variable "log_analytics_workspace_name" { + description = "The name of the Log Analytics workspace" + type = string +} + +variable "key_vault_name" { + description = "The name of the Key Vault" + type = string +} + +variable "ai_vision_name" { + description = "The name of the AI Vision Cognitive Services account" + type = string +} + +variable "ai_vision_sku" { + description = "The SKU of the AI Vision Cognitive Services account" + type = string + default = "S0" +} + +variable "ai_vision_tags" { + description = "Tags to be applied to the AI Vision resource" + type = map(string) + default = { + Environment = "Development" + Service = "AI Vision" + } +} +variable "cosmosdb_account_name" { + description = "The name of the CosmosDB account." + type = string +} + +variable "form_recognizer_name" { + description = "The name of the Form Recognizer resource." + type = string +} + +variable "cosmosdb_sqldb_name" { + description = "The name of the Cosmos DB SQL database to be created." + default = "ContosoDBDocIntellig" +} + +variable "sql_container_name" { + description = "The name of the Cosmos DB SQL container to be created within the database." + default = "Invoices" +} + +variable "throughput" { + description = "The throughput (RU/s) to be allocated to the Cosmos DB SQL database or container." + default = 400 +}
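+
+# Possible hardening (a sketch, not part of the original configuration): Cosmos DB
+# manual throughput starts at 400 RU/s and is set in 100 RU/s increments, so the
+# variable above could carry a validation block to catch bad values at plan time:
+#
+#   variable "throughput" {
+#     description = "The throughput (RU/s) to be allocated to the Cosmos DB SQL database or container."
+#     type        = number
+#     default     = 400
+#
+#     validation {
+#       condition     = var.throughput >= 400 && var.throughput % 100 == 0
+#       error_message = "Cosmos DB throughput must be at least 400 RU/s and a multiple of 100."
+#     }
+#   }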