Skip to content

Commit 81ab2f6

Browse files
committed
[feat] llama3.2_vision update
1 parent 4929da8 commit 81ab2f6

File tree

2 files changed

+48
-7
lines changed

2 files changed

+48
-7
lines changed

app/ocr_strategies/llama_vision.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import base64
2+
from ocr_strategies.ocr_strategy import OCRStrategy
3+
import ollama
4+
import io
5+
import os
6+
from pdf2image import convert_from_bytes
7+
8+
class LlamaVisionOCRStrategy(OCRStrategy):
    """OCR strategy that transcribes PDF pages with Llama 3.2 Vision via Ollama."""

    def extract_text_from_pdf(self, pdf_bytes):
        """Return the text the vision model generates for each page of a PDF.

        Every page is rendered to an image, encoded as a base64 JPEG and sent
        to the ``llama3.2-vision`` model together with a prompt. The prompt is
        read from the ``LLAMA_VISION_PROMPT`` environment variable, falling
        back to a built-in default. Streamed reply chunks of all pages are
        concatenated in page order, without any separator between pages.

        Args:
            pdf_bytes: Raw bytes of the PDF document.

        Returns:
            The concatenated model output for all pages (an empty string when
            the PDF yields no pages).

        Raises:
            Exception: If Ollama reports a ``ResponseError`` for any page; the
                original error is attached as ``__cause__``.
        """
        # The prompt does not depend on the page, so look it up only once.
        prompt = os.getenv(
            'LLAMA_VISION_PROMPT',
            "You are OCR. Convert image to markdown.",
        )
        text_parts = []

        # Convert PDF bytes to one image per page.
        for image in convert_from_bytes(pdf_bytes):
            # Encode the page as a base64 JPEG string for the Ollama payload.
            buffered = io.BytesIO()
            image.save(buffered, format="JPEG")
            img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')

            # The stream is consumed inside the ``try`` because a streaming
            # response is produced lazily: errors can surface while iterating
            # over it, not only when the call is made.
            try:
                response = ollama.chat(
                    model="llama3.2-vision",
                    messages=[{
                        # Chat messages must carry a role; the request is
                        # rejected without one.
                        'role': 'user',
                        'content': prompt,
                        'images': [img_str],
                    }],
                    stream=True,
                )
                for chunk in response:
                    text_parts.append(chunk['message']['content'])
            except ollama.ResponseError as e:
                print('Error:', e.error)
                raise Exception(
                    "Failed to generate text with Llama 3.2 Vision model"
                ) from e

        return "".join(text_parts)

app/tasks.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,16 @@
22
from celery_config import celery
33
from ocr_strategies.marker import MarkerOCRStrategy
44
from ocr_strategies.tesseract import TesseractOCRStrategy
5+
from ocr_strategies.llama_vision import LlamaVisionOCRStrategy
56
import redis
67
import os
78
import ollama
89
from storage_manager import StorageManager
910

1011
OCR_STRATEGIES = {
1112
'marker': MarkerOCRStrategy(),
12-
'tesseract': TesseractOCRStrategy()
13+
'tesseract': TesseractOCRStrategy(),
14+
'llama_vision': LlamaVisionOCRStrategy()
1315
}
1416

1517
# Connect to Redis
@@ -23,17 +25,17 @@ def ocr_task(self, pdf_bytes, strategy_name, pdf_filename, pdf_hash, ocr_cache,
2325
"""
2426
start_time = time.time()
2527
if strategy_name not in OCR_STRATEGIES:
26-
raise ValueError(f"Unknown strategy '{strategy_name}'. Available: marker, tesseract")
28+
raise ValueError(f"Unknown strategy '{strategy_name}'. Available: marker, tesseract, llama_vision")
2729

2830
ocr_strategy = OCR_STRATEGIES[strategy_name]
2931
self.update_state(state='PROGRESS', status="File uploaded successfully", meta={'progress': 10}) # Example progress update
3032

3133
extracted_text = None
32-
if ocr_cache:
33-
cached_result = redis_client.get(pdf_hash)
34-
if cached_result:
35-
# Return cached result if available
36-
extracted_text = cached_result.decode('utf-8')
34+
# if ocr_cache:
35+
# cached_result = redis_client.get(pdf_hash)
36+
# if cached_result:
37+
# # Return cached result if available
38+
# extracted_text = cached_result.decode('utf-8')
3739

3840
if extracted_text is None:
3941
print("Extracting text from PDF...")

0 commit comments

Comments
 (0)