Commit 70abc69
committed: [fix] env variables fixed
1 parent 81ab2f6 commit 70abc69
File tree: 10 files changed, +55 −14 lines

.env.example

Lines changed: 2 additions & 1 deletion
@@ -2,6 +2,7 @@
 REDIS_CACHE_URL=redis://redis:6379/1
 OLLAMA_HOST=http://ollama:11434
 STORAGE_PROFILE_PATH=/storage_profiles
+LLAMA_VISION_PROMPT="You are OCR. Convert image to markdown."
 
 # CLI settings
 OCR_URL=http://localhost:8000/ocr/upload
@@ -13,4 +14,4 @@ LIST_FILES_URL=http://localhost:8000/storage/list
 LOAD_FILE_URL=http://localhost:8000/storage/load
 DELETE_FILE_URL=http://localhost:8000/storage/delete
 OCR_REQUEST_URL=http://localhost:8000/ocr/request
-OCR_UPLOAD_URL=http://localhost:8000/ocr/upload
+OCR_UPLOAD_URL=http://localhost:8000/ocr/upload
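For reference, the new `LLAMA_VISION_PROMPT` variable is consumed with a plain `os.getenv` fallback, mirroring the default shown in `.env.example` and in `llama_vision.py` (a minimal sketch; the helper name is illustrative, not part of the project):

```python
import os

# Same default prompt as in .env.example and llama_vision.py.
DEFAULT_PROMPT = "You are OCR. Convert image to markdown."

def get_vision_prompt():
    # Environment value wins; otherwise fall back to the built-in default.
    return os.getenv("LLAMA_VISION_PROMPT", DEFAULT_PROMPT)
```

Because `os.getenv` only falls back when the variable is absent, setting the variable to an empty string overrides the default with an empty prompt.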

.env.localhost.example

Lines changed: 1 addition & 0 deletions
@@ -1,5 +1,6 @@
 #APP_ENV=production # sets the app into prod mode, otherwise dev mode with auto-reload on code changes
 REDIS_CACHE_URL=redis://localhost:6379/1
+LLAMA_VISION_PROMPT="You are OCR. Convert image to markdown."
 
 # CLI settings
 OCR_URL=http://localhost:8000/ocr/upload

README.md

Lines changed: 10 additions & 5 deletions
@@ -8,7 +8,7 @@ The API is built with FastAPI and uses Celery for asynchronous task processing.
 
 ## Features:
 - **No Cloud/external dependencies** all you need: PyTorch based OCR (Marker) + Ollama are shipped and configured via `docker-compose`; no data is sent outside your dev/server environment,
-- **PDF to Markdown** conversion with very high accuracy using different OCR strategies including [marker](https://github.com/VikParuchuri/marker), [surya-ocr](https://github.com/VikParuchuri/surya) or [tesseract](https://github.com/h/pytesseract)
+- **PDF to Markdown** conversion with very high accuracy using different OCR strategies including [marker](https://github.com/VikParuchuri/marker) and [llama3.2-vision](https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/), [surya-ocr](https://github.com/VikParuchuri/surya) or [tesseract](https://github.com/h/pytesseract)
 - **PDF to JSON** conversion using Ollama supported models (eg. LLama 3.1)
 - **LLM Improving OCR results** LLama is pretty good at fixing spelling and text issues in the OCR text
 - **Removing PII** This tool can be used for removing Personally Identifiable Information out of PDF - see `examples`
@@ -149,6 +149,7 @@ Then modify the variables inside the file:
 #APP_ENV=production # sets the app into prod mode, otherwise dev mode with auto-reload on code changes
 REDIS_CACHE_URL=redis://localhost:6379/1
 STORAGE_PROFILE_PATH=/storage_profiles
+LLAMA_VISION_PROMPT="You are OCR. Convert image to markdown."
 
 # CLI settings
 OCR_URL=http://localhost:8000/ocr/upload
@@ -215,14 +216,17 @@ pip install -r requirements.txt
 ```
 
 
-### Pull the LLama3.1 model
+### Pull the LLama3.1 and LLama3.2-vision models
 
 You might want to test out [different models supported by LLama](https://ollama.com/library)
 
 ```bash
 python client/cli.py llm_pull --model llama3.1
+python client/cli.py llm_pull --model llama3.2-vision
 ```
 
+These models are required for most features supported by `pdf-extract-api`.
+
 
 ### Upload a File for OCR (converting to Markdown)
 
@@ -247,6 +251,7 @@ For example you must run:
 
 ```bash
 python client/cli.py llm_pull --model llama3.1
+python client/cli.py llm_pull --model llama3.2-vision
 ```
 
 and only after that run this specific prompt query:
@@ -334,7 +339,7 @@ const apiClient = new ApiClient('https://api.doctractor.com/', 'doctractor', 'Ae
 const formData = new FormData();
 formData.append('file', fileInput.files[0]);
 formData.append('prompt', 'Convert file to JSON and return only JSON'); // if not provided, no LLM transformation will happen - just the OCR
-formData.append('strategy', 'marker');
+formData.append('strategy', 'llama_vision');
 formData.append('model', 'llama3.1')
 formData.append('ocr_cache', 'true');
 
@@ -350,7 +355,7 @@ apiClient.uploadFile(formData).then(response => {
 - **Method**: POST
 - **Parameters**:
   - **file**: PDF file to be processed.
-  - **strategy**: OCR strategy to use (`marker` or `tesseract`).
+  - **strategy**: OCR strategy to use (`marker`, `llama_vision` or `tesseract`).
   - **ocr_cache**: Whether to cache the OCR result (true or false).
   - **prompt**: When provided, will be used for Ollama processing the OCR result
   - **model**: When provided along with the prompt - this model will be used for LLM processing
@@ -368,7 +373,7 @@ curl -X POST -H "Content-Type: multipart/form-data" -F "file=@examples/example-m
 - **Method**: POST
 - **Parameters** (JSON body):
   - **file**: Base64 encoded PDF file content.
-  - **strategy**: OCR strategy to use (`marker` or `tesseract`).
+  - **strategy**: OCR strategy to use (`marker`, `llama_vision` or `tesseract`).
   - **ocr_cache**: Whether to cache the OCR result (true or false).
   - **prompt**: When provided, will be used for Ollama processing the OCR result.
   - **model**: When provided along with the prompt - this model will be used for LLM processing.
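The documented `/ocr/request` JSON body can be assembled in a few lines. This is a hedged sketch of the parameter shape only; `build_ocr_request` is a hypothetical helper, not part of the project:

```python
import base64

def build_ocr_request(pdf_bytes, strategy="llama_vision", ocr_cache=True,
                      prompt=None, model=None):
    """Build the JSON body for POST /ocr/request as documented in the README."""
    body = {
        # The endpoint expects the PDF content base64-encoded, as a string.
        "file": base64.b64encode(pdf_bytes).decode("utf-8"),
        "strategy": strategy,
        "ocr_cache": ocr_cache,
    }
    if prompt:
        body["prompt"] = prompt
        # Per the docs, `model` is only meaningful alongside a prompt.
        if model:
            body["model"] = model
    return body
```

The body would then be POSTed to `OCR_REQUEST_URL` (default `http://localhost:8000/ocr/request`) with `Content-Type: application/json`.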

app/ocr_strategies/llama_vision.py

Lines changed: 8 additions & 1 deletion
@@ -3,6 +3,7 @@
 import ollama
 import io
 import os
+import time
 from pdf2image import convert_from_bytes
 
 class LlamaVisionOCRStrategy(OCRStrategy):
@@ -12,7 +13,9 @@ def extract_text_from_pdf(self, pdf_bytes):
         # Convert PDF bytes to images
         images = convert_from_bytes(pdf_bytes)
         extracted_text = ""
-
+        start_time = time.time()
+        ocr_percent_done = 0
+        num_pages = len(images)
         for i, image in enumerate(images):
             # Convert image to base64
             buffered = io.BytesIO()
@@ -25,9 +28,13 @@ def extract_text_from_pdf(self, pdf_bytes):
                 'content': os.getenv('LLAMA_VISION_PROMPT', "You are OCR. Convert image to markdown."),
                 'images': [img_str]
             }], stream=True)
+            num_chunk = 1
             for chunk in response:
+                self.update_state_callback(state='PROGRESS', meta={'progress': str(30 + ocr_percent_done), 'status': 'OCR Processing (page ' + str(i + 1) + ' of ' + str(num_pages) + ') chunk no: ' + str(num_chunk), 'start_time': start_time, 'elapsed_time': time.time() - start_time})  # progress update per streamed chunk
+                num_chunk += 1
                 extracted_text += chunk['message']['content']
 
+            ocr_percent_done += int(20 / num_pages)  # 20% of the overall task progress is budgeted for OCR (assumption carried over from tasks.py)
         except ollama.ResponseError as e:
             print('Error:', e.error)
             raise Exception("Failed to generate text with Llama 3.2 Vision model")
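The progress arithmetic introduced above can be checked in isolation (a sketch with illustrative names): progress starts at 30 and each finished page adds `int(20 / num_pages)`, so integer rounding can leave the final reported figure short of 50.

```python
def page_progress(pages_done, num_pages, base=30, ocr_budget=20):
    # Mirrors the diff: each finished page adds int(ocr_budget / num_pages)
    # on top of the base progress reported before OCR starts.
    return base + pages_done * int(ocr_budget / num_pages)

print(page_progress(3, 3))  # int(20 / 3) == 6, so 30 + 3 * 6 = 48, not 50
```

A floating-point accumulator (or `round(base + ocr_budget * pages_done / num_pages)`) would avoid the rounding shortfall, at the cost of slightly less obvious code.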

app/ocr_strategies/ocr_strategy.py

Lines changed: 12 additions & 0 deletions
@@ -1,4 +1,16 @@
 class OCRStrategy:
+
+    def __init__(self):
+        self.update_state_callback = None
+
+    def set_update_state_callback(self, callback):
+        self.update_state_callback = callback
+
+    def update_state(self, state, meta):
+        if self.update_state_callback:
+            self.update_state_callback(state, meta)
+
     """Base OCR Strategy Interface"""
     def extract_text_from_pdf(self, pdf_bytes):
         raise NotImplementedError("Subclasses must implement this method")

app/tasks.py

Lines changed: 2 additions & 0 deletions
@@ -28,6 +28,8 @@ def ocr_task(self, pdf_bytes, strategy_name, pdf_filename, pdf_hash, ocr_cache,
         raise ValueError(f"Unknown strategy '{strategy_name}'. Available: marker, tesseract, llama_vision")
 
     ocr_strategy = OCR_STRATEGIES[strategy_name]
+    ocr_strategy.set_update_state_callback(self.update_state)
+
     self.update_state(state='PROGRESS', status="File uploaded successfully", meta={'progress': 10})  # Example progress update
 
     extracted_text = None

client/cli.py

Lines changed: 13 additions & 7 deletions
@@ -4,9 +4,12 @@
 import time
 import os
 
-def ocr_upload(file_path, ocr_cache, prompt, prompt_file=None, model='llama3.1', strategy='marker', storage_profile='default', storage_filename=None):
+def ocr_upload(file_path, ocr_cache, prompt, prompt_file=None, model='llama3.1', strategy='llama_vision', storage_profile='default', storage_filename=None):
     ocr_url = os.getenv('OCR_UPLOAD_URL', 'http://localhost:8000/ocr/upload')
     files = {'file': open(file_path, 'rb')}
+    if not ocr_cache:
+        print("OCR cache disabled.")
+
     data = {'ocr_cache': ocr_cache, 'model': model, 'strategy': strategy, 'storage_profile': storage_profile}
 
     if storage_filename:
@@ -37,7 +40,7 @@ def ocr_upload(file_path, ocr_cache, prompt, prompt_file=None, model='llama3.1',
         print(f"Failed to upload file: {response.text}")
         return None
 
-def ocr_request(file_path, ocr_cache, prompt, prompt_file=None, model='llama3.1', strategy='marker', storage_profile='default', storage_filename=None):
+def ocr_request(file_path, ocr_cache, prompt, prompt_file=None, model='llama3.1', strategy='llama_vision', storage_profile='default', storage_filename=None):
     ocr_url = os.getenv('OCR_REQUEST_URL', 'http://localhost:8000/ocr/request')
     with open(file_path, 'rb') as f:
         file_content = base64.b64encode(f.read()).decode('utf-8')
@@ -162,10 +165,11 @@ def main():
     ocr_parser = subparsers.add_parser('ocr_upload', help='Upload a file to the OCR endpoint and get the result.')
     ocr_parser.add_argument('--file', type=str, default='examples/rmi-example.pdf', help='Path to the file to upload')
     ocr_parser.add_argument('--ocr_cache', default=True, action='store_true', help='Enable OCR result caching')
+    ocr_parser.add_argument('--disable_ocr_cache', default=False, action='store_true', help='Disable OCR result caching')
     ocr_parser.add_argument('--prompt', type=str, default=None, help='Prompt used for the Ollama model to fix or transform the file')
     ocr_parser.add_argument('--prompt_file', default=None, type=str, help='Prompt file name used for the Ollama model to fix or transform the file')
     ocr_parser.add_argument('--model', type=str, default='llama3.1', help='Model to use for the Ollama endpoint')
-    ocr_parser.add_argument('--strategy', type=str, default='marker', help='OCR strategy to use for the file')
+    ocr_parser.add_argument('--strategy', type=str, default='llama_vision', help='OCR strategy to use for the file')
     ocr_parser.add_argument('--print_progress', default=True, action='store_true', help='Print the progress of the OCR task')
     ocr_parser.add_argument('--storage_profile', type=str, default='default', help='Storage profile to use for the file')
     ocr_parser.add_argument('--storage_filename', type=str, default=None, help='Storage filename to use for the file. You may use some formatting - see the docs')
@@ -175,10 +179,11 @@ def main():
     ocr_parser = subparsers.add_parser('ocr', help='Upload a file to the OCR endpoint and get the result.')
     ocr_parser.add_argument('--file', type=str, default='examples/rmi-example.pdf', help='Path to the file to upload')
     ocr_parser.add_argument('--ocr_cache', default=True, action='store_true', help='Enable OCR result caching')
+    ocr_parser.add_argument('--disable_ocr_cache', default=False, action='store_true', help='Disable OCR result caching')
     ocr_parser.add_argument('--prompt', type=str, default=None, help='Prompt used for the Ollama model to fix or transform the file')
     ocr_parser.add_argument('--prompt_file', default=None, type=str, help='Prompt file name used for the Ollama model to fix or transform the file')
     ocr_parser.add_argument('--model', type=str, default='llama3.1', help='Model to use for the Ollama endpoint')
-    ocr_parser.add_argument('--strategy', type=str, default='marker', help='OCR strategy to use for the file')
+    ocr_parser.add_argument('--strategy', type=str, default='llama_vision', help='OCR strategy to use for the file')
     ocr_parser.add_argument('--print_progress', default=True, action='store_true', help='Print the progress of the OCR task')
     ocr_parser.add_argument('--storage_profile', type=str, default='default', help='Storage profile to use for the file')
     ocr_parser.add_argument('--storage_filename', type=str, default=None, help='Storage filename to use for the file. You may use some formatting - see the docs')
@@ -189,10 +194,11 @@ def main():
     ocr_request_parser = subparsers.add_parser('ocr_request', help='Upload a file to the OCR endpoint via JSON and get the result.')
     ocr_request_parser.add_argument('--file', type=str, default='examples/rmi-example.pdf', help='Path to the file to upload')
     ocr_request_parser.add_argument('--ocr_cache', default=True, action='store_true', help='Enable OCR result caching')
+    ocr_request_parser.add_argument('--disable_ocr_cache', default=False, action='store_true', help='Disable OCR result caching')
     ocr_request_parser.add_argument('--prompt', type=str, default=None, help='Prompt used for the Ollama model to fix or transform the file')
     ocr_request_parser.add_argument('--prompt_file', default=None, type=str, help='Prompt file name used for the Ollama model to fix or transform the file')
     ocr_request_parser.add_argument('--model', type=str, default='llama3.1', help='Model to use for the Ollama endpoint')
-    ocr_request_parser.add_argument('--strategy', type=str, default='marker', help='OCR strategy to use')
+    ocr_request_parser.add_argument('--strategy', type=str, default='llama_vision', help='OCR strategy to use')
     ocr_request_parser.add_argument('--print_progress', default=True, action='store_true', help='Print the progress of the OCR task')
     ocr_request_parser.add_argument('--storage_profile', type=str, default='default', help='Storage profile to use. You may use some formatting - see the docs')
     ocr_request_parser.add_argument('--storage_filename', type=str, default=None, help='Storage filename to use')
@@ -231,7 +237,7 @@ def main():
 
     if args.command == 'ocr' or args.command == 'ocr_upload':
         print(args)
-        result = ocr_upload(args.file, args.ocr_cache, args.prompt, args.prompt_file, args.model, args.strategy, args.storage_profile, args.storage_filename)
+        result = ocr_upload(args.file, False if args.disable_ocr_cache else args.ocr_cache, args.prompt, args.prompt_file, args.model, args.strategy, args.storage_profile, args.storage_filename)
         if result is None:
             print("Error uploading file.")
             return
@@ -243,7 +249,7 @@ def main():
         if text_result:
             print(text_result)
     elif args.command == 'ocr_request':
-        result = ocr_request(args.file, args.ocr_cache, args.prompt, args.prompt_file, args.model, args.strategy, args.storage_profile, args.storage_filename)
+        result = ocr_request(args.file, False if args.disable_ocr_cache else args.ocr_cache, args.prompt, args.prompt_file, args.model, args.strategy, args.storage_profile, args.storage_filename)
         if result is None:
             print("Error uploading file.")
             return
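The cache flags above rely on a subtle argparse detail: a `store_true` switch must default to `False`, otherwise it is on whether or not the user passes it. A minimal sketch of the pattern used by the CLI:

```python
import argparse

# Sketch of the two cache flags: --ocr_cache enables caching (default on),
# --disable_ocr_cache overrides it. The disable switch must default to
# False; with default=True, the expression below would always report the
# cache as disabled regardless of the command line.
parser = argparse.ArgumentParser()
parser.add_argument('--ocr_cache', default=True, action='store_true')
parser.add_argument('--disable_ocr_cache', default=False, action='store_true')

args = parser.parse_args([])
effective_cache = False if args.disable_ocr_cache else args.ocr_cache
```

An alternative would be a single `--no-ocr-cache` flag with `action='store_false'` and `dest='ocr_cache'`, which avoids reconciling two booleans by hand.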

docker-compose.gpu.yml

Lines changed: 2 additions & 0 deletions
@@ -25,6 +25,7 @@ services:
       - LIST_FILES_URL=${LIST_FILES_URL-http://localhost:8000/storage/list}
       - LOAD_FILE_URL=${LOAD_FILE_URL-http://localhost:8000/storage/load}
       - DELETE_FILE_URL=${DELETE_FILE_URL-http://localhost:8000/storage/delete}
+      - LLAMA_VISION_PROMPT=${LLAMA_VISION_PROMPT-"You are OCR. Convert image to markdown."}
     depends_on:
       - redis
       - ollama
@@ -52,6 +53,7 @@ services:
       - LIST_FILES_URL=${LIST_FILES_URL-http://localhost:8000/storage/list}
       - LOAD_FILE_URL=${LOAD_FILE_URL-http://localhost:8000/storage/load}
       - DELETE_FILE_URL=${DELETE_FILE_URL-http://localhost:8000/storage/delete}
+      - LLAMA_VISION_PROMPT=${LLAMA_VISION_PROMPT-"You are OCR. Convert image to markdown."}
     depends_on:
       - redis
     volumes:

docker-compose.yml

Lines changed: 2 additions & 0 deletions
@@ -22,6 +22,7 @@ services:
       - LIST_FILES_URL=${LIST_FILES_URL-http://localhost:8000/storage/list}
       - LOAD_FILE_URL=${LOAD_FILE_URL-http://localhost:8000/storage/load}
       - DELETE_FILE_URL=${DELETE_FILE_URL-http://localhost:8000/storage/delete}
+      - LLAMA_VISION_PROMPT=${LLAMA_VISION_PROMPT-"You are OCR. Convert image to markdown."}
     depends_on:
       - redis
       - ollama
@@ -42,6 +43,7 @@ services:
       - LIST_FILES_URL=${LIST_FILES_URL-http://localhost:8000/storage/list}
       - LOAD_FILE_URL=${LOAD_FILE_URL-http://localhost:8000/storage/load}
       - DELETE_FILE_URL=${DELETE_FILE_URL-http://localhost:8000/storage/delete}
+      - LLAMA_VISION_PROMPT=${LLAMA_VISION_PROMPT-"You are OCR. Convert image to markdown."}
     depends_on:
       - redis
     volumes:
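The compose files use the `${VAR-default}` substitution form, which mirrors POSIX shell semantics: the default applies only when the variable is unset, while an empty-but-set value is kept as-is. A small sketch of that behavior in plain shell (illustrative variable names):

```shell
# Unset variable: the fallback after `-` is substituted.
unset LLAMA_VISION_PROMPT
echo "${LLAMA_VISION_PROMPT-You are OCR. Convert image to markdown.}"

# Set-but-empty variable: kept as-is (prints an empty line).
LLAMA_VISION_PROMPT=""
echo "${LLAMA_VISION_PROMPT-You are OCR. Convert image to markdown.}"
```

If the intent is to also replace empty values, `${VAR:-default}` is the form to use instead.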

run.sh

Lines changed: 3 additions & 0 deletions
@@ -12,6 +12,9 @@ ollama serve &
 echo "Pulling LLama3.1 model"
 ollama pull llama3.1
 
+echo "Pulling LLama3.2-vision model"
+ollama pull llama3.2-vision
+
 echo "Starting Redis"
 docker run -p 6379:6379 --restart always --detach redis &
 
