adithya-s-k
diff --git a/‎omniparse/__init__.py‎
Lines changed: 22 additions & 2 deletions b/‎omniparse/__init__.py‎
Lines changed: 22 additions & 2 deletions
diff --git a/‎omniparse/demo.py‎
Lines changed: 7 additions & 0 deletions b/‎omniparse/demo.py‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎omniparse/documents/__init__.py‎
Lines changed: 24 additions & 4 deletions b/‎omniparse/documents/__init__.py‎
Lines changed: 24 additions & 4 deletions
diff --git a/‎omniparse/documents/cleaners/bullets.py‎
Lines changed: 0 additions & 8 deletions b/‎omniparse/documents/cleaners/bullets.py‎
Lines changed: 0 additions & 8 deletions
diff --git a/‎omniparse/documents/cleaners/code.py‎
Lines changed: 0 additions & 131 deletions b/‎omniparse/documents/cleaners/code.py‎
Lines changed: 0 additions & 131 deletions
diff --git a/‎omniparse/documents/cleaners/fontstyle.py‎
Lines changed: 0 additions & 30 deletions b/‎omniparse/documents/cleaners/fontstyle.py‎
Lines changed: 0 additions & 30 deletions
diff --git a/‎omniparse/documents/cleaners/headers.py‎
Lines changed: 0 additions & 82 deletions b/‎omniparse/documents/cleaners/headers.py‎
Lines changed: 0 additions & 82 deletions
@@ -1,11 +1,31 @@
+"""
+Title: OmniParse
+Author: Adithya S Kolavi
+Date: 2024-07-02
+
+This code includes portions of code from the marker repository by VikParuchuri.
+Original repository: https://github.com/VikParuchuri/marker
+
+Original Author: VikParuchuri
+Original Date: 2024-01-15
+
+License: GNU General Public License (GPL) Version 3
+URL: https://github.com/VikParuchuri/marker/blob/master/LICENSE
+
+Description:
+This section of the code was adapted from the marker repository to load all the OCR, layout and reading order detection models. 
+All credits for the original implementation go to VikParuchuri.
+"""
+
 import torch
-from typing import Optional , Any
+from typing import  Any
 from pydantic import BaseModel
 from transformers import AutoProcessor, AutoModelForCausalLM
 import whisper
 from omniparse.utils import print_omniparse_text_art
 from omniparse.web.web_crawler import WebCrawler
-from omniparse.documents.models import load_all_models
+from marker.models import load_all_models
+# from omniparse.documents.models import load_all_models
 
 
 class SharedState(BaseModel):
 
@@ -1,3 +1,10 @@
+"""
+Title: OmniParse
+Author: Adithya S Kolavi
+Date: 2024-07-02
+"""
+
+
 import os
 import base64
 import mimetypes
 
@@ -1,7 +1,27 @@
+"""
+Title: OmniParse
+Author: Adithya S Kolavi
+Date: 2024-07-02
+
+This code includes portions of code from the marker repository by VikParuchuri.
+Original repository: https://github.com/VikParuchuri/marker
+
+Original Author: VikParuchuri
+Original Date: 2024-01-15
+
+License: GNU General Public License (GPL) Version 3
+URL: https://github.com/VikParuchuri/marker/blob/master/LICENSE
+
+Description:
+This section of the code was adapted from the marker repository to enhance text parsing of PDF, Word, and PowerPoint documents.
+All credits for the original implementation go to VikParuchuri.
+"""
+
 import os
 import tempfile
 import subprocess
-from omniparse.documents.parse import parse_single_pdf
+# from omniparse.documents.parse import parse_single_pdf
+from marker.convert import convert_single_pdf
 from omniparse.utils import encode_images
 from omniparse.models import responseDocument
 # Function to handle PDF parsing
@@ -22,7 +42,7 @@ def parse_pdf(input_data , model_state) -> responseDocument:
         else:
             raise ValueError("Invalid input data format. Expected bytes or PDF file path.")
 
-        full_text, images, out_meta = parse_single_pdf(input_path, model_state.model_list)
+        full_text, images, out_meta = convert_single_pdf(input_path, model_state.model_list)
 
         parse_pdf_result = responseDocument(
             text=full_text,
@@ -61,7 +81,7 @@ def parse_ppt(input_data ,model_state) -> responseDocument:
             output_pdf_path = os.path.join(output_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf")
             input_path = output_pdf_path
 
-        full_text, images, out_meta = parse_single_pdf(input_path, model_state.model_list)
+        full_text, images, out_meta = convert_single_pdf(input_path, model_state.model_list)
         images = encode_images(images)
 
         parse_ppt_result = responseDocument(
@@ -99,7 +119,7 @@ def parse_doc(input_data ,model_state) -> responseDocument:
             output_pdf_path = os.path.join(output_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf")
             input_path = output_pdf_path
 
-        full_text, images, out_meta = parse_single_pdf(input_path, model_state.model_list)
+        full_text, images, out_meta = convert_single_pdf(input_path, model_state.model_list)
         images = encode_images(images)
 
         parse_doc_result = responseDocument(