Skip to content

Commit b1db569

Browse files
authored
Merge pull request #2 from AswanthManoj/main
Added an async client, table extraction from markdown to list of list with headers
2 parents 34c4660 + 6d045e4 commit b1db569

File tree

4 files changed

+1028
-3
lines changed

4 files changed

+1028
-3
lines changed

python-sdk/omniparse_client/omniparse.py

Lines changed: 349 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
import os
2-
import requests
2+
import httpx
33
import base64
4-
from .utils import save_images_and_markdown
4+
import requests
5+
import aiofiles
6+
from typing import Optional
7+
from .utils import save_images_and_markdown, ParsedDocument
58

69
class OmniParse:
710
def __init__(self, api_key=None, base_url="http://localhost:8000"):
@@ -33,3 +36,347 @@ def convert_pdf_to_markdown_and_save(self, pdf_file_paths):
3336
else:
3437
print(f"Error: {response.text}")
3538

39+
40+
class AsyncOmniParse:
    """
    An asynchronous client for interacting with the OmniParse server.

    OmniParse is a platform that ingests and parses unstructured data into structured,
    actionable data optimized for GenAI (LLM) applications. This client provides methods
    to interact with the OmniParse server, allowing users to parse various types of
    unstructured data including documents, images, videos, audio files, and web pages.

    The client supports parsing of multiple file types and provides structured output
    in markdown format, making it ideal for AI applications such as RAG
    (Retrieval-Augmented Generation) and fine-tuning.

    Attributes:
        api_key (str): API key for authentication with the OmniParse server.
        base_url (str): Base URL for the OmniParse API endpoints.
        timeout (int): Timeout for API requests in seconds.

    Usage Examples:
        ```python
        # Initialize the client
        parser = AsyncOmniParse(api_key="your_api_key", base_url="http://localhost:8000")

        # Parse a PDF document
        async def parse_pdf_example():
            result = await parser.parse_pdf("/path/to/document.pdf", output_folder="/path/to/output")
            print(result.markdown)  # Access the parsed content

        # Process an image
        async def process_image_example():
            result = await parser.process_image("/path/to/image.jpg", task="Caption", prompt="Describe this image")
            print(result)  # Print the image processing result

        # Parse a website
        async def parse_website_example():
            result = await parser.parse_website("https://example.com")
            print(result)  # Print the parsed website content

        # Parse a video file
        async def parse_video_example():
            result = await parser.parse_video("/path/to/video.mp4")
            print(result)  # Print the parsed video content

        # Use in an async context
        async def main():
            await parse_pdf_example()
            await process_image_example()
            await parse_website_example()
            await parse_video_example()

        # Run the async main function
        import asyncio
        asyncio.run(main())
        ```
    """

    def __init__(self, api_key=None, base_url="http://localhost:8000", timeout=120):
        """
        Store connection settings and the static validation tables.

        Args:
            api_key (str, optional): Sent as a ``Bearer`` token when set.
            base_url (str): Root URL of the OmniParse server.
            timeout (int): Per-request timeout in seconds.
        """
        self.api_key = api_key
        self.base_url = base_url
        self.timeout = timeout

        self.parse_media_endpoint = "/parse_media"
        self.parse_website_endpoint = "/parse_website"
        self.parse_document_endpoint = "/parse_document"

        self.image_process_tasks = {
            "OCR", "OCR with Region", "Caption",
            "Detailed Caption", "More Detailed Caption",
            "Object Detection", "Dense Region Caption", "Region Proposal",
        }

        # NOTE: the attribute names keep the historical "extentions" spelling
        # because they are public and callers may already read them.
        self.allowed_audio_extentions = {".mp3", ".wav", ".aac"}
        self.allowed_video_extentions = {".mp4", ".mkv", ".avi", ".mov"}
        # ".docx" is the real Word extension; ".docs" is kept for backward compatibility.
        self.allowed_document_extentions = {".pdf", ".ppt", ".pptx", ".doc", ".docx", ".docs"}
        self.allowed_image_extentions = {".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".heic"}

    async def __request__(
        self,
        endpoint: str,
        files: Optional[dict] = None,
        json: Optional[dict] = None,
        data: Optional[dict] = None,
    ) -> dict:
        """
        Internal method to make API requests.

        Args:
            endpoint (str): API endpoint, appended to ``base_url``.
            files (dict, optional): Files to be sent as multipart parts.
            json (dict, optional): JSON body. Do not combine with ``files``:
                httpx encodes a multipart body in that case and the JSON
                payload is dropped.
            data (dict, optional): Form fields; sent alongside ``files`` in
                the same multipart body.

        Returns:
            dict: JSON response from the API.

        Raises:
            httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
        """
        # Tolerate a trailing slash on base_url so we never request "//endpoint".
        url = f"{self.base_url.rstrip('/')}{endpoint}"
        headers = {"Authorization": f"Bearer {self.api_key}"} if self.api_key else {}
        async with httpx.AsyncClient() as client:
            response = await client.post(
                url,
                files=files,
                json=json,
                data=data,
                headers=headers,
                timeout=self.timeout,
            )
            response.raise_for_status()
            return response.json()

    @staticmethod
    def _extension(file_path: str) -> str:
        """Return the lower-cased extension of *file_path*, including the dot."""
        return os.path.splitext(file_path)[1].lower()

    def _require_extension(self, file_path: str, allowed) -> None:
        """Raise ValueError unless the extension of *file_path* is in *allowed*."""
        if self._extension(file_path) not in allowed:
            # sorted() keeps the message stable; iterating a set is unordered.
            raise ValueError(
                f"Unsupported file type. Only files of format {', '.join(sorted(allowed))} are allowed."
            )

    @staticmethod
    async def _read_upload(file_path: str) -> tuple:
        """Read *file_path* and return a ``(filename, bytes)`` pair for a multipart upload."""
        async with aiofiles.open(file_path, 'rb') as file:
            file_data = await file.read()
        # Sending the basename gives the multipart part a real filename
        # instead of httpx's generic placeholder.
        return os.path.basename(file_path), file_data

    async def _upload(self, endpoint: str, file_path: str, field: str = 'file', data: Optional[dict] = None) -> dict:
        """Upload *file_path* under multipart field *field* and return the JSON response."""
        upload = await self._read_upload(file_path)
        return await self.__request__(endpoint, files={field: upload}, data=data)

    async def _parse_document_file(self, endpoint: str, file_path: str, output_folder: Optional[str]) -> "ParsedDocument":
        """Upload a document, wrap the response in ParsedDocument and optionally save it."""
        response = await self._upload(endpoint, file_path)
        data = ParsedDocument(**response, source_path=file_path, output_folder=output_folder)
        if output_folder:
            data.save_data(echo=True)
        return data

    async def parse_document(self, file_path: str, output_folder: Optional[str] = None) -> "ParsedDocument":
        """
        Parse a document file (PDF, PPT, or DOCX) and convert it to structured markdown.

        Args:
            file_path (str): Path to the document file.
            output_folder (Optional[str]): If provided, the parsed data will be saved in this folder.

        Returns:
            ParsedDocument: Parsed document data built from the server response.

        Raises:
            ValueError: If the file type is not supported.

        Note:
            If output_folder is provided, the method will save the parsed data and print a
            confirmation message.
        """
        self._require_extension(file_path, self.allowed_document_extentions)
        return await self._parse_document_file(self.parse_document_endpoint, file_path, output_folder)

    async def parse_pdf(self, file_path: str, output_folder: Optional[str] = None) -> "ParsedDocument":
        """
        Parse a PDF file and convert it to structured markdown.

        Args:
            file_path (str): Path to the PDF file.
            output_folder (Optional[str]): If provided, the parsed data will be saved in this folder.

        Returns:
            ParsedDocument: Parsed PDF data built from the server response.

        Raises:
            ValueError: If the file is not a PDF.

        Note:
            If output_folder is provided, the method will save the parsed data and print a
            confirmation message.
        """
        file_ext = self._extension(file_path)
        if file_ext != ".pdf":
            raise ValueError(f"The file must be a PDF (.pdf), but received a file of type {file_ext}")
        return await self._parse_document_file(f"{self.parse_document_endpoint}/pdf", file_path, output_folder)

    async def parse_ppt(self, file_path: str, output_folder: Optional[str] = None) -> "ParsedDocument":
        """
        Parse a PowerPoint file and convert it to structured markdown.

        Args:
            file_path (str): Path to the PPT or PPTX file.
            output_folder (Optional[str]): If provided, the parsed data will be saved in this folder.

        Returns:
            ParsedDocument: Parsed PowerPoint data built from the server response.

        Raises:
            ValueError: If the file is not a PPT or PPTX.

        Note:
            If output_folder is provided, the method will save the parsed data and print a
            confirmation message.
        """
        file_ext = self._extension(file_path)
        if file_ext not in (".ppt", ".pptx"):
            raise ValueError(f"The file must be a PPT file (.ppt or .pptx), but received a file of type {file_ext}")
        return await self._parse_document_file(f"{self.parse_document_endpoint}/ppt", file_path, output_folder)

    async def parse_docs(self, file_path: str, output_folder: Optional[str] = None) -> "ParsedDocument":
        """
        Parse a Word document file and convert it to structured markdown.

        Args:
            file_path (str): Path to the DOC or DOCX file (the legacy ".docs"
                spelling is still accepted).
            output_folder (Optional[str]): If provided, the parsed data will be saved in this folder.

        Returns:
            ParsedDocument: Parsed Word document data built from the server response.

        Raises:
            ValueError: If the file is not a DOC, DOCX or DOCS.

        Note:
            If output_folder is provided, the method will save the parsed data and print a
            confirmation message.
        """
        file_ext = self._extension(file_path)
        if file_ext not in (".doc", ".docx", ".docs"):
            raise ValueError(
                f"The file must be a Word document (.doc, .docx or .docs), but received a file of type {file_ext}"
            )
        return await self._parse_document_file(f"{self.parse_document_endpoint}/docs", file_path, output_folder)

    async def parse_image(self, file_path: str) -> dict:
        """
        Parse an image file via the ``/parse_media/image`` endpoint.

        Args:
            file_path (str): Path to the image file.

        Returns:
            dict: JSON response from the server.

        Raises:
            ValueError: If the file type is not supported.
        """
        self._require_extension(file_path, self.allowed_image_extentions)
        return await self._upload(f"{self.parse_media_endpoint}/image", file_path)

    async def parse_video(self, file_path: str) -> dict:
        """
        Parse a video file via the ``/parse_media/video`` endpoint.

        Args:
            file_path (str): Path to the video file.

        Returns:
            dict: JSON response from the server.

        Raises:
            ValueError: If the file type is not supported.
        """
        self._require_extension(file_path, self.allowed_video_extentions)
        return await self._upload(f"{self.parse_media_endpoint}/video", file_path)

    async def parse_audio(self, file_path: str) -> dict:
        """
        Parse an audio file via the ``/parse_media/audio`` endpoint.

        Args:
            file_path (str): Path to the audio file.

        Returns:
            dict: JSON response from the server.

        Raises:
            ValueError: If the file type is not supported.
        """
        self._require_extension(file_path, self.allowed_audio_extentions)
        return await self._upload(f"{self.parse_media_endpoint}/audio", file_path)

    async def process_image(self, file_path: str, task: str, prompt: Optional[str] = None) -> dict:
        """
        Process an image with a specific task such as OCR, captioning, or object detection.

        Args:
            file_path (str): Path to the image file.
            task (str): Image processing task to perform; must be one of
                ``self.image_process_tasks``.
            prompt (Optional[str]): Optional prompt, sent only when truthy.

        Returns:
            dict: JSON response from the server.

        Raises:
            ValueError: If the task is invalid or the file type is not supported.
        """
        if task not in self.image_process_tasks:
            raise ValueError(f"Invalid task. Choose from: {', '.join(sorted(self.image_process_tasks))}")
        self._require_extension(file_path, self.allowed_image_extentions)

        form_fields = {'task': task}
        if prompt:
            form_fields['prompt'] = prompt
        # The fields must travel as multipart form data: when `files` is set,
        # httpx ignores `json`, so task/prompt would never reach the server.
        return await self._upload(
            f"{self.parse_media_endpoint}/process_image",
            file_path,
            field='image',
            data=form_fields,
        )

    async def parse_website(self, url: str) -> dict:
        """
        Parse a website by posting its URL to the ``/parse_website`` endpoint.

        Args:
            url (str): URL of the website to parse.

        Returns:
            dict: JSON response from the server.
        """
        return await self.__request__(self.parse_website_endpoint, json={'url': url})
382+

0 commit comments

Comments
 (0)