Skip to content

Commit 81340bb

Browse files
committed
updated OmniParse with appropriate LICENSE - Adithya S K
1 parent afbfc03 commit 81340bb

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

60 files changed

+323
-3128
lines changed

omniparse/__init__.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,31 @@
1+
"""
2+
Title: OmniParse
3+
Author: Adithya S Kolavi
4+
Date: 2024-07-02
5+
6+
This code includes portions of code from the marker repository by VikParuchuri.
7+
Original repository: https://github.com/VikParuchuri/marker
8+
9+
Original Author: VikParuchuri
10+
Original Date: 2024-01-15
11+
12+
License: GNU General Public License (GPL) Version 3
13+
URL: https://github.com/VikParuchuri/marker/blob/master/LICENSE
14+
15+
Description:
16+
This section of the code was adapted from the marker repository to load all the OCR, layout, and reading-order detection models.
17+
All credits for the original implementation go to VikParuchuri.
18+
"""
19+
120
import torch
2-
from typing import Optional , Any
21+
from typing import Any
322
from pydantic import BaseModel
423
from transformers import AutoProcessor, AutoModelForCausalLM
524
import whisper
625
from omniparse.utils import print_omniparse_text_art
726
from omniparse.web.web_crawler import WebCrawler
8-
from omniparse.documents.models import load_all_models
27+
from marker.models import load_all_models
28+
# from omniparse.documents.models import load_all_models
929

1030

1131
class SharedState(BaseModel):

omniparse/demo.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
"""
2+
Title: OmniParse
3+
Author: Adithya S Kolavi
4+
Date: 2024-07-02
5+
"""
6+
7+
18
import os
29
import base64
310
import mimetypes

omniparse/documents/__init__.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,27 @@
1+
"""
2+
Title: OmniParse
3+
Author: Adithya S Kolavi
4+
Date: 2024-07-02
5+
6+
This code includes portions of code from the marker repository by VikParuchuri.
7+
Original repository: https://github.com/VikParuchuri/marker
8+
9+
Original Author: VikParuchuri
10+
Original Date: 2024-01-15
11+
12+
License: GNU General Public License (GPL) Version 3
13+
URL: https://github.com/VikParuchuri/marker/blob/master/LICENSE
14+
15+
Description:
16+
This section of the code was adapted from the marker repository to enhance text parsing of PDF, Word, and PowerPoint documents.
17+
All credits for the original implementation go to VikParuchuri.
18+
"""
19+
120
import os
221
import tempfile
322
import subprocess
4-
from omniparse.documents.parse import parse_single_pdf
23+
# from omniparse.documents.parse import parse_single_pdf
24+
from marker.convert import convert_single_pdf
525
from omniparse.utils import encode_images
626
from omniparse.models import responseDocument
727
# Function to handle PDF parsing
@@ -22,7 +42,7 @@ def parse_pdf(input_data , model_state) -> responseDocument:
2242
else:
2343
raise ValueError("Invalid input data format. Expected bytes or PDF file path.")
2444

25-
full_text, images, out_meta = parse_single_pdf(input_path, model_state.model_list)
45+
full_text, images, out_meta = convert_single_pdf(input_path, model_state.model_list)
2646

2747
parse_pdf_result = responseDocument(
2848
text=full_text,
@@ -61,7 +81,7 @@ def parse_ppt(input_data ,model_state) -> responseDocument:
6181
output_pdf_path = os.path.join(output_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf")
6282
input_path = output_pdf_path
6383

64-
full_text, images, out_meta = parse_single_pdf(input_path, model_state.model_list)
84+
full_text, images, out_meta = convert_single_pdf(input_path, model_state.model_list)
6585
images = encode_images(images)
6686

6787
parse_ppt_result = responseDocument(
@@ -99,7 +119,7 @@ def parse_doc(input_data ,model_state) -> responseDocument:
99119
output_pdf_path = os.path.join(output_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf")
100120
input_path = output_pdf_path
101121

102-
full_text, images, out_meta = parse_single_pdf(input_path, model_state.model_list)
122+
full_text, images, out_meta = convert_single_pdf(input_path, model_state.model_list)
103123
images = encode_images(images)
104124

105125
parse_doc_result = responseDocument(

omniparse/documents/cleaners/bullets.py

Lines changed: 0 additions & 8 deletions
This file was deleted.

omniparse/documents/cleaners/code.py

Lines changed: 0 additions & 131 deletions
This file was deleted.

omniparse/documents/cleaners/fontstyle.py

Lines changed: 0 additions & 30 deletions
This file was deleted.

omniparse/documents/cleaners/headers.py

Lines changed: 0 additions & 82 deletions
This file was deleted.

0 commit comments

Comments
 (0)