Skip to content

Commit 86e3004

Browse files
authored
sample extract code
1 parent eacf4b5 commit 86e3004

File tree

1 file changed

+125
-0
lines changed

1 file changed

+125
-0
lines changed

src/function_app.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
import azure.functions as func
2+
import logging
3+
import json
4+
import os
5+
import uuid
6+
import io
7+
from pdfminer.high_level import extract_text
8+
from azure.cosmos import CosmosClient, PartitionKey
9+
10+
# Function app object that the trigger decorator in this file registers
# against; the HTTP auth level is set to FUNCTION.
app = func.FunctionApp(
    http_auth_level=func.AuthLevel.FUNCTION,
)
11+
12+
def read_pdf_content(myblob):
    """Return the text extracted from a PDF held in a readable blob.

    Args:
        myblob: Object whose ``read()`` returns the PDF's bytes.

    Returns:
        Whatever ``extract_text`` produces for those bytes.
    """
    # The whole blob is read and wrapped in an in-memory binary stream
    # before being handed to extract_text.
    return extract_text(io.BytesIO(myblob.read()))
20+
21+
def extract_invoice_data(text):
    """Parse customer, company and rental details out of invoice text.

    The text is scanned line by line for three markers:

    * ``BILL TO:`` - the next three lines are taken as the customer's name,
      email and address.
    * ``Company Information:`` - the next three lines are taken as the
      company's name, phone and address.
    * ``Rental Date`` - every following line up to the first blank line is
      treated as a rental row: the first token is the date, the last three
      tokens are description, quantity and total price, and the tokens in
      between are joined with spaces to form the title.

    Args:
        text: Plain text of the invoice, with lines separated by ``\\n``.

    Returns:
        A dict with a freshly generated ``id``, six customer/company string
        fields (left as ``""`` when the marker or its following lines are
        missing) and a ``rentals`` list of dicts. Values are kept as the
        strings found in the text; nothing is converted to numbers or dates.
    """
    lines = text.split('\n')

    def line_after(index, offset):
        # A marker close to the end of the text may have fewer than three
        # lines after it. Treat the missing ones as empty rather than
        # raising IndexError and losing the whole invoice.
        position = index + offset
        return lines[position].strip() if position < len(lines) else ""

    invoice_data = {
        "id": generate_id(),
        "customer_name": "",
        "customer_email": "",
        "customer_address": "",
        "company_name": "",
        "company_phone": "",
        "company_address": "",
        "rentals": [],
    }

    for i, line in enumerate(lines):
        if "BILL TO:" in line:
            invoice_data["customer_name"] = line_after(i, 1)
            invoice_data["customer_email"] = line_after(i, 2)
            invoice_data["customer_address"] = line_after(i, 3)
        elif "Company Information:" in line:
            invoice_data["company_name"] = line_after(i, 1)
            invoice_data["company_phone"] = line_after(i, 2)
            invoice_data["company_address"] = line_after(i, 3)
        elif "Rental Date" in line:
            for j in range(i + 1, len(lines)):
                row = lines[j]
                if row.strip() == "":
                    # The first blank line ends the rental table.
                    break
                rental_details = row.split()
                if len(rental_details) < 4:
                    # A row needs a date plus description, quantity and
                    # total price. Shorter rows would either raise
                    # IndexError or reuse the date as the description, so
                    # they are reported and skipped.
                    logging.warning("Skipping malformed rental row: %r", row)
                    continue
                invoice_data["rentals"].append({
                    "rental_date": rental_details[0],
                    "title": " ".join(rental_details[1:-3]),
                    "description": rental_details[-3],
                    "quantity": rental_details[-2],
                    "total_price": rental_details[-1],
                })

    logging.info("Successfully extracted invoice data.")
    return invoice_data
63+
64+
def save_invoice_data_to_cosmos(invoice_data, blob_name):
    """Upsert one parsed invoice document into the Cosmos DB container.

    Connects using the ``COSMOS_DB_ENDPOINT`` and ``COSMOS_DB_KEY``
    environment variables, makes sure the ``ContosoDBAIDemo`` database and
    its ``Invoices`` container exist, then upserts ``invoice_data``.

    Args:
        invoice_data: Dict to store as the document.
        blob_name: Name of the source blob. Currently unused by this
            function; kept so existing callers keep working.

    Raises:
        ValueError: If either environment variable is missing or empty.
        Exception: Any error raised while connecting, creating the
            database/container or upserting is logged and then re-raised,
            so the caller can tell that nothing was saved.
    """
    try:
        endpoint = os.getenv("COSMOS_DB_ENDPOINT")
        key = os.getenv("COSMOS_DB_KEY")
        if not endpoint or not key:
            # Fail with a clear message instead of passing None to the SDK.
            raise ValueError(
                "COSMOS_DB_ENDPOINT and COSMOS_DB_KEY must both be set"
            )
        client = CosmosClient(endpoint, key)
        logging.info("Successfully connected to Cosmos DB.")
    except Exception as e:
        logging.error(f"Error connecting to Cosmos DB: {e}")
        # Re-raise: swallowing the error made the caller log a successful
        # save even though nothing was written.
        raise

    database_name = 'ContosoDBAIDemo'
    container_name = 'Invoices'

    try:
        database = client.create_database_if_not_exists(id=database_name)
        # NOTE(review): the partition key path is /invoice_number, but the
        # dict built by extract_invoice_data has no "invoice_number" key.
        # Confirm which field is meant to be the partition key.
        container = database.create_container_if_not_exists(
            id=container_name,
            partition_key=PartitionKey(path="/invoice_number"),
            offer_throughput=400
        )
        logging.info("Successfully ensured database and container exist.")
    except Exception as e:
        logging.error(f"Error creating database or container: {e}")
        raise

    try:
        response = container.upsert_item(invoice_data)
        logging.info(f"Saved processed invoice data to Cosmos DB: {response}")
    except Exception as e:
        logging.error(f"Error inserting item into Cosmos DB: {e}")
        raise
94+
95+
def generate_id():
    """Return a new random (version 4) UUID rendered as a string."""
    new_uuid = uuid.uuid4()
    return str(new_uuid)
97+
98+
@app.blob_trigger(
    arg_name="myblob",
    path="pdfinvoices/{name}",
    connection="contosostorageaidemo_STORAGE",
)
def BlobTriggerContosoPDFInvoicesRaw(myblob: func.InputStream):
    """Process one invoice PDF blob: read it, parse it, store it.

    Registered as a blob trigger on ``pdfinvoices/{name}``. The work runs in
    three stages (text extraction, field parsing, Cosmos DB save). An
    exception raised by any stage is logged and ends processing; it is not
    propagated out of this function.

    Args:
        myblob: Input stream for the blob that fired the trigger.
    """
    blob_summary = (
        f"Python blob trigger function processed blob\n"
        f"Name: {myblob.name}\n"
        f"Blob Size: {myblob.length} bytes"
    )
    logging.info(blob_summary)

    # Stage 1: pull the raw text out of the PDF.
    try:
        text = read_pdf_content(myblob)
        logging.info("Successfully read and extracted text from PDF.")
    except Exception as read_error:
        logging.error(f"Error reading PDF: {read_error}")
        return

    logging.info(f"Extracted text from PDF: {text}")

    # Stage 2: turn the text into a structured invoice dict.
    try:
        invoice_data = extract_invoice_data(text)
        logging.info(f"Extracted invoice data: {invoice_data}")
    except Exception as parse_error:
        logging.error(f"Error extracting invoice data: {parse_error}")
        return

    # Stage 3: persist the parsed invoice.
    try:
        save_invoice_data_to_cosmos(invoice_data, myblob.name)
        logging.info("Successfully saved invoice data to Cosmos DB.")
    except Exception as save_error:
        logging.error(f"Error saving invoice data to Cosmos DB: {save_error}")

0 commit comments

Comments
 (0)