Commit fbd31e1

Merge pull request #48 from CatchTheTornado/fix_46
fix(#46) - fixed the way new ollama handles images
2 parents 49c50ed + 832748c commit fbd31e1

1 file changed: +10 -2 lines changed


app/ocr_strategies/llama_vision.py

Lines changed: 10 additions & 2 deletions
@@ -1,4 +1,5 @@
 import base64
+import tempfile
 from ocr_strategies.ocr_strategy import OCRStrategy
 import ollama
 import io
@@ -20,14 +21,21 @@ def extract_text_from_pdf(self, pdf_bytes):
             # Convert image to base64
             buffered = io.BytesIO()
             image.save(buffered, format="JPEG")
-            img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
+            #img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
+            # Save image to a temporary file and get its path
+            temp_filename = None
+            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
+                image.save(temp_file, format="JPEG")
+                temp_filename = temp_file.name
 
             # Generate text using the Llama 3.2 Vision model
             try:
                 response = ollama.chat("llama3.2-vision", [{
+                    'role': 'user',
                     'content': os.getenv('LLAMA_VISION_PROMPT', "You are OCR. Convert image to markdown."),
-                    'images': [img_str]
+                    'images': [temp_filename]
                 }], stream=True)
+                os.remove(temp_filename)
                 num_chunk = 1
                 for chunk in response:
                     self.update_state_callback(state='PROGRESS', meta={'progress': str(30 + ocr_percent_done), 'status': 'OCR Processing (page ' + str(i+1) + ' of ' + str(num_pages) +') chunk no: ' + str(num_chunk), 'start_time': start_time, 'elapsed_time': time.time() - start_time}) # Example progress update
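For anyone who wants to try the new image-handling path outside this repo, here is a minimal sketch of the same pattern: write the page image to a temporary JPEG and pass its file path in the message's 'images' field instead of a base64 string. It assumes Pillow and the ollama Python client are installed and that a llama3.2-vision model has been pulled locally; the ocr_image helper and its default prompt are illustrative, not part of this codebase.

import os
import tempfile

import ollama
from PIL import Image


def ocr_image(image: Image.Image, prompt: str = "You are OCR. Convert image to markdown.") -> str:
    # Write the PIL image to a temporary JPEG so the ollama client can read it from disk
    # (newer ollama clients accept file paths in 'images'; older ones expected base64 strings).
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
        image.save(temp_file, format="JPEG")
        temp_filename = temp_file.name

    text = ""
    try:
        # Stream the model's markdown output chunk by chunk.
        for chunk in ollama.chat(
            "llama3.2-vision",
            [{"role": "user", "content": prompt, "images": [temp_filename]}],
            stream=True,
        ):
            text += chunk["message"]["content"]
    finally:
        os.remove(temp_filename)
    return text

Unlike the patch above, which deletes the temporary file right after ollama.chat() returns the stream, this sketch removes it only once the stream has been consumed, a slightly more conservative ordering in case the client reads the file lazily.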
