|
| 1 | +import azure.functions as func |
| 2 | +import logging |
| 3 | +import json |
| 4 | +import os |
| 5 | +import uuid |
| 6 | +import io |
| 7 | +from pdfminer.high_level import extract_text |
| 8 | +from azure.cosmos import CosmosClient, PartitionKey |
| 9 | + |
# Function app object that the blob trigger below is registered on,
# created with http_auth_level set to AuthLevel.FUNCTION.
app = func.FunctionApp(
    http_auth_level=func.AuthLevel.FUNCTION,
)
| 11 | + |
def read_pdf_content(myblob):
    """Return the text extracted from the PDF held in *myblob*.

    The blob is read in full and wrapped in an in-memory binary stream,
    which is then passed to ``extract_text``.
    """
    # extract_text is given a BytesIO stream built from the blob's bytes.
    return extract_text(io.BytesIO(myblob.read()))
| 20 | + |
def _following_fields(lines, start, count):
    """Return the *count* stripped lines after index *start*, padded with ""."""
    picked = [entry.strip() for entry in lines[start + 1:start + 1 + count]]
    # Slicing never raises, so a header near the end of the text simply
    # yields fewer lines; pad so callers can always unpack *count* values.
    return picked + [""] * (count - len(picked))


def _parse_rental_row(row):
    """Split one rental row into its fields, or return None if malformed.

    A row is whitespace-separated: the first token is the rental date, the
    last three tokens are description, quantity and total price, and
    everything in between is the title.
    """
    tokens = row.split()
    # Fewer than four tokens would make the date and the trailing fields
    # overlap (or not exist at all), so such a row cannot be trusted.
    if len(tokens) < 4:
        return None
    rental_date, *title_words, description, quantity, total_price = tokens
    return {
        "rental_date": rental_date,
        "title": " ".join(title_words),
        "description": description,
        "quantity": quantity,
        "total_price": total_price,
    }


def extract_invoice_data(text):
    """Parse the text of an invoice into a dictionary.

    The text is scanned line by line for three markers:

    * ``"BILL TO:"`` - the next three lines are taken as customer name,
      email and address.
    * ``"Company Information:"`` - the next three lines are taken as company
      name, phone and address.
    * ``"Rental Date"`` - each following line up to the first blank line is
      parsed as one rental row.

    Fields whose lines are missing (marker too close to the end of the text)
    are left as empty strings instead of raising ``IndexError``. Rental rows
    with fewer than four tokens are skipped with a warning.

    Args:
        text: The full text of the invoice.

    Returns:
        A dict with a generated ``"id"``, the customer and company fields,
        and a ``"rentals"`` list of per-row dicts (all values are strings).
    """
    lines = text.split('\n')
    invoice_data = {
        "id": generate_id(),
        "customer_name": "",
        "customer_email": "",
        "customer_address": "",
        "company_name": "",
        "company_phone": "",
        "company_address": "",
        "rentals": [],
    }

    for i, line in enumerate(lines):
        if "BILL TO:" in line:
            (
                invoice_data["customer_name"],
                invoice_data["customer_email"],
                invoice_data["customer_address"],
            ) = _following_fields(lines, i, 3)
        elif "Company Information:" in line:
            (
                invoice_data["company_name"],
                invoice_data["company_phone"],
                invoice_data["company_address"],
            ) = _following_fields(lines, i, 3)
        elif "Rental Date" in line:
            for row in lines[i + 1:]:
                # A blank line marks the end of the rental table.
                if not row.strip():
                    break
                rental = _parse_rental_row(row)
                if rental is None:
                    logging.warning("Skipping malformed rental row: %r", row)
                    continue
                invoice_data["rentals"].append(rental)

    logging.info("Successfully extracted invoice data.")
    return invoice_data
| 63 | + |
def save_invoice_data_to_cosmos(invoice_data, blob_name):
    """Upsert *invoice_data* into the ``Invoices`` container of ``ContosoDBAIDemo``.

    The Cosmos DB endpoint and key are read from the ``COSMOS_DB_ENDPOINT``
    and ``COSMOS_DB_KEY`` environment variables. The database and container
    are created if they do not exist yet.

    Every failure is logged and then re-raised, so the caller can tell a
    failed save from a successful one instead of reporting success
    unconditionally.

    Args:
        invoice_data: The invoice dictionary to store.
        blob_name: Name of the source blob; used only in log messages.

    Raises:
        ValueError: If either environment variable is unset or empty.
        Exception: Any error raised while connecting, ensuring the database
            and container exist, or upserting the item.
    """
    endpoint = os.getenv("COSMOS_DB_ENDPOINT")
    key = os.getenv("COSMOS_DB_KEY")
    if not endpoint or not key:
        message = "COSMOS_DB_ENDPOINT and COSMOS_DB_KEY must both be set"
        logging.error(f"Error connecting to Cosmos DB: {message}")
        raise ValueError(message)

    try:
        client = CosmosClient(endpoint, key)
        logging.info("Successfully connected to Cosmos DB.")
    except Exception:
        logging.exception("Error connecting to Cosmos DB (blob %s)", blob_name)
        raise

    database_name = 'ContosoDBAIDemo'
    container_name = 'Invoices'

    try:
        database = client.create_database_if_not_exists(id=database_name)
        # NOTE(review): the container is partitioned on /invoice_number, but
        # the invoice dict built in this file has no "invoice_number" key -
        # confirm this partition key is intended.
        container = database.create_container_if_not_exists(
            id=container_name,
            partition_key=PartitionKey(path="/invoice_number"),
            offer_throughput=400,
        )
        logging.info("Successfully ensured database and container exist.")
    except Exception:
        logging.exception(
            "Error creating database or container (blob %s)", blob_name
        )
        raise

    try:
        response = container.upsert_item(invoice_data)
        logging.info(f"Saved processed invoice data to Cosmos DB: {response}")
    except Exception:
        logging.exception(
            "Error inserting item into Cosmos DB (blob %s)", blob_name
        )
        raise
| 94 | + |
def generate_id():
    """Return a newly generated random (version 4) UUID as a string."""
    fresh_uuid = uuid.uuid4()
    return str(fresh_uuid)
| 97 | + |
@app.blob_trigger(
    arg_name="myblob",
    path="pdfinvoices/{name}",
    connection="contosostorageaidemo_STORAGE",
)
def BlobTriggerContosoPDFInvoicesRaw(myblob: func.InputStream):
    """Process a blob from ``pdfinvoices/``: read it, parse it, store it.

    Runs three stages in order - extract text from the PDF, parse the text
    into invoice data, save the data to Cosmos DB. An exception in any stage
    is logged as an error; a failure while reading or parsing ends the run
    early. Nothing is returned.
    """
    logging.info(
        f"Python blob trigger function processed blob\n"
        f"Name: {myblob.name}\n"
        f"Blob Size: {myblob.length} bytes"
    )

    # Stage 1: pull the raw text out of the PDF.
    try:
        pdf_text = read_pdf_content(myblob)
        logging.info("Successfully read and extracted text from PDF.")
    except Exception as read_error:
        logging.error(f"Error reading PDF: {read_error}")
        return

    logging.info(f"Extracted text from PDF: {pdf_text}")

    # Stage 2: turn the text into a structured invoice record.
    try:
        parsed_invoice = extract_invoice_data(pdf_text)
        logging.info(f"Extracted invoice data: {parsed_invoice}")
    except Exception as parse_error:
        logging.error(f"Error extracting invoice data: {parse_error}")
        return

    # Stage 3: persist the record.
    try:
        save_invoice_data_to_cosmos(parsed_invoice, myblob.name)
        logging.info("Successfully saved invoice data to Cosmos DB.")
    except Exception as save_error:
        logging.error(f"Error saving invoice data to Cosmos DB: {save_error}")
0 commit comments