11import os
2- import requests
2+ import httpx
33import base64
4- from .utils import save_images_and_markdown
4+ import requests
5+ import aiofiles
6+ from typing import Optional
7+ from .utils import save_images_and_markdown , ParsedDocument
58
69class OmniParse :
710 def __init__ (self , api_key = None , base_url = "http://localhost:8000" ):
@@ -33,3 +36,347 @@ def convert_pdf_to_markdown_and_save(self, pdf_file_paths):
3336 else :
3437 print (f"Error: { response .text } " )
3538
39+
class AsyncOmniParse:
    """
    Asynchronous client for an OmniParse server.

    Every public method validates its input locally (file extension, task
    name), reads the file with ``aiofiles`` when one is involved, and POSTs
    it to the server through ``httpx``. Document parsers wrap the JSON
    response in a ``ParsedDocument``; media and website parsers return the
    decoded JSON response as a ``dict``.

    Attributes:
        api_key (str | None): Sent as an ``Authorization: Bearer`` header when set.
        base_url (str): Prefix prepended to every endpoint path.
        timeout (int): Timeout in seconds passed to each HTTP request.
        parse_media_endpoint (str): Path prefix for image/video/audio routes.
        parse_website_endpoint (str): Path of the website parsing route.
        parse_document_endpoint (str): Path prefix for document routes.
        image_process_tasks (set[str]): Task names accepted by ``process_image``.
        allowed_audio_extentions (set[str]): Extensions accepted by ``parse_audio``.
        allowed_video_extentions (set[str]): Extensions accepted by ``parse_video``.
        allowed_document_extentions (set[str]): Extensions accepted by ``parse_document``.
        allowed_image_extentions (set[str]): Extensions accepted by the image methods.

    Usage Examples:
        ```python
        # Initialize the client
        parser = AsyncOmniParse(api_key="your_api_key", base_url="http://localhost:8000")

        # Parse a PDF document
        async def parse_pdf_example():
            result = await parser.parse_pdf("/path/to/document.pdf", output_folder="/path/to/output")
            print(result.markdown)  # Access the parsed content

        # Process an image
        async def process_image_example():
            result = await parser.process_image("/path/to/image.jpg", task="Caption", prompt="Describe this image")
            print(result)  # Print the image processing result

        # Parse a website
        async def parse_website_example():
            result = await parser.parse_website("https://example.com")
            print(result)  # Print the parsed website content

        # Parse a video file
        async def parse_video_example():
            result = await parser.parse_video("/path/to/video.mp4")
            print(result)  # Print the parsed video content

        # Use in an async context
        async def main():
            await parse_pdf_example()
            await process_image_example()
            await parse_website_example()
            await parse_video_example()

        # Run the async main function
        import asyncio
        asyncio.run(main())
        ```
    """

    def __init__(self, api_key=None, base_url="http://localhost:8000", timeout=120):
        self.api_key = api_key
        self.base_url = base_url
        self.timeout = timeout

        self.parse_media_endpoint = "/parse_media"
        self.parse_website_endpoint = "/parse_website"
        self.parse_document_endpoint = "/parse_document"

        self.image_process_tasks = {
            "OCR", "OCR with Region", "Caption",
            "Detailed Caption", "More Detailed Caption",
            "Object Detection", "Dense Region Caption", "Region Proposal",
        }

        self.allowed_audio_extentions = {".mp3", ".wav", ".aac"}
        self.allowed_video_extentions = {".mp4", ".mkv", ".avi", ".mov"}
        # ".docs" is kept only for backward compatibility with earlier
        # releases of this client; ".docx" is the real Word extension.
        self.allowed_document_extentions = {".pdf", ".ppt", ".pptx", ".doc", ".docx", ".docs"}
        self.allowed_image_extentions = {".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".heic"}

    async def __request__(
        self,
        endpoint: str,
        files: dict = None,
        json: dict = None,
        data: dict = None,
    ) -> dict:
        """
        Internal method to make API requests.

        Args:
            endpoint (str): API endpoint path, appended to ``base_url``.
            files (dict, optional): Multipart file parts to upload.
            json (dict, optional): JSON body. Only use this for requests
                without ``files``: httpx builds a multipart body when files
                are present and does not send ``json`` in that case.
            data (dict, optional): Form fields sent alongside ``files`` in
                the same multipart body.

        Returns:
            dict: JSON response from the API.

        Raises:
            httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
        """
        url = f"{self.base_url}{endpoint}"
        headers = {"Authorization": f"Bearer {self.api_key}"} if self.api_key else {}
        async with httpx.AsyncClient() as client:
            response = await client.post(
                url,
                files=files,
                data=data,
                json=json,
                headers=headers,
                timeout=self.timeout,
            )
            response.raise_for_status()
            return response.json()

    @staticmethod
    def _extension(file_path: str) -> str:
        """Return the lower-cased extension of ``file_path``, including the dot."""
        return os.path.splitext(file_path)[1].lower()

    @staticmethod
    def _file_part(file_path: str, content: bytes) -> tuple:
        """Build a ``(filename, content)`` multipart part for ``file_path``."""
        # Passing bare bytes makes httpx fall back to a placeholder filename
        # with no extension; sending the real basename keeps the extension
        # visible to the server.
        return (os.path.basename(file_path), content)

    def _require_extension(self, file_path: str, allowed) -> None:
        """Raise ``ValueError`` unless the extension of ``file_path`` is in ``allowed``."""
        if self._extension(file_path) not in allowed:
            # sorted() keeps the message stable; set iteration order is not.
            raise ValueError(
                f"Unsupported file type. Only files of format {', '.join(sorted(allowed))} are allowed."
            )

    async def _upload(self, endpoint: str, file_path: str, field: str = "file", data: dict = None) -> dict:
        """Read ``file_path`` and POST it to ``endpoint`` as multipart part ``field``."""
        async with aiofiles.open(file_path, "rb") as file:
            content = await file.read()
        return await self.__request__(
            endpoint,
            files={field: self._file_part(file_path, content)},
            data=data,
        )

    async def _parse_to_document(
        self, endpoint: str, file_path: str, output_folder: Optional[str]
    ) -> "ParsedDocument":
        """Upload ``file_path`` to ``endpoint`` and wrap the response in a ``ParsedDocument``."""
        response = await self._upload(endpoint, file_path)
        document = ParsedDocument(**response, source_path=file_path, output_folder=output_folder)
        if output_folder:
            document.save_data(echo=True)
        return document

    async def parse_document(self, file_path: str, output_folder: Optional[str] = None) -> "ParsedDocument":
        """
        Parse a document file (PDF, PPT/PPTX, DOC/DOCX) via the generic document endpoint.

        Args:
            file_path (str): Path to the document file.
            output_folder (Optional[str]): Forwarded to ``ParsedDocument``. When
                truthy, ``save_data(echo=True)`` is called on the result before
                it is returned.

        Returns:
            ParsedDocument: The server response wrapped in a ``ParsedDocument``.

        Raises:
            ValueError: If the file extension is not in ``allowed_document_extentions``.
        """
        self._require_extension(file_path, self.allowed_document_extentions)
        return await self._parse_to_document(self.parse_document_endpoint, file_path, output_folder)

    async def parse_pdf(self, file_path: str, output_folder: Optional[str] = None) -> "ParsedDocument":
        """
        Parse a PDF file via the ``/pdf`` document endpoint.

        Args:
            file_path (str): Path to the PDF file.
            output_folder (Optional[str]): Forwarded to ``ParsedDocument``. When
                truthy, ``save_data(echo=True)`` is called on the result before
                it is returned.

        Returns:
            ParsedDocument: The server response wrapped in a ``ParsedDocument``.

        Raises:
            ValueError: If the file is not a ``.pdf``.
        """
        file_ext = self._extension(file_path)
        if file_ext != ".pdf":
            raise ValueError(f"The file must be a PDF (.pdf), but received a file of type {file_ext}")
        return await self._parse_to_document(f"{self.parse_document_endpoint}/pdf", file_path, output_folder)

    async def parse_ppt(self, file_path: str, output_folder: Optional[str] = None) -> "ParsedDocument":
        """
        Parse a PowerPoint file via the ``/ppt`` document endpoint.

        Args:
            file_path (str): Path to the PPT or PPTX file.
            output_folder (Optional[str]): Forwarded to ``ParsedDocument``. When
                truthy, ``save_data(echo=True)`` is called on the result before
                it is returned.

        Returns:
            ParsedDocument: The server response wrapped in a ``ParsedDocument``.

        Raises:
            ValueError: If the file is not a ``.ppt`` or ``.pptx``.
        """
        file_ext = self._extension(file_path)
        if file_ext not in (".ppt", ".pptx"):
            raise ValueError(
                f"The file must be a PPT file (.ppt or .pptx), but received a file of type {file_ext}"
            )
        return await self._parse_to_document(f"{self.parse_document_endpoint}/ppt", file_path, output_folder)

    async def parse_docs(self, file_path: str, output_folder: Optional[str] = None) -> "ParsedDocument":
        """
        Parse a Word document via the ``/docs`` document endpoint.

        Args:
            file_path (str): Path to the DOC or DOCX file.
            output_folder (Optional[str]): Forwarded to ``ParsedDocument``. When
                truthy, ``save_data(echo=True)`` is called on the result before
                it is returned.

        Returns:
            ParsedDocument: The server response wrapped in a ``ParsedDocument``.

        Raises:
            ValueError: If the file is not a ``.doc`` or ``.docx``.
        """
        file_ext = self._extension(file_path)
        # ".docs" is still accepted so callers of earlier releases keep working.
        if file_ext not in (".doc", ".docx", ".docs"):
            raise ValueError(
                f"The file must be a DOC file (.doc or .docx), but received a file of type {file_ext}"
            )
        return await self._parse_to_document(f"{self.parse_document_endpoint}/docs", file_path, output_folder)

    async def parse_image(self, file_path: str) -> dict:
        """
        Upload an image to the ``/image`` media endpoint and return the response.

        Args:
            file_path (str): Path to the image file.

        Returns:
            dict: JSON response from the server.

        Raises:
            ValueError: If the file extension is not in ``allowed_image_extentions``.
        """
        self._require_extension(file_path, self.allowed_image_extentions)
        return await self._upload(f"{self.parse_media_endpoint}/image", file_path)

    async def parse_video(self, file_path: str) -> dict:
        """
        Upload a video to the ``/video`` media endpoint and return the response.

        Args:
            file_path (str): Path to the video file.

        Returns:
            dict: JSON response from the server.

        Raises:
            ValueError: If the file extension is not in ``allowed_video_extentions``.
        """
        self._require_extension(file_path, self.allowed_video_extentions)
        return await self._upload(f"{self.parse_media_endpoint}/video", file_path)

    async def parse_audio(self, file_path: str) -> dict:
        """
        Upload an audio file to the ``/audio`` media endpoint and return the response.

        Args:
            file_path (str): Path to the audio file.

        Returns:
            dict: JSON response from the server.

        Raises:
            ValueError: If the file extension is not in ``allowed_audio_extentions``.
        """
        self._require_extension(file_path, self.allowed_audio_extentions)
        return await self._upload(f"{self.parse_media_endpoint}/audio", file_path)

    async def process_image(self, file_path: str, task: str, prompt: Optional[str] = None) -> dict:
        """
        Run a specific task (e.g. "OCR", "Caption", "Object Detection") on an image.

        Args:
            file_path (str): Path to the image file.
            task (str): One of ``image_process_tasks``.
            prompt (Optional[str]): Optional prompt; only sent when truthy.

        Returns:
            dict: JSON response from the server.

        Raises:
            ValueError: If the task is invalid or the file type is not supported.
        """
        if task not in self.image_process_tasks:
            raise ValueError(f"Invalid task. Choose from: {', '.join(sorted(self.image_process_tasks))}")
        self._require_extension(file_path, self.allowed_image_extentions)

        # The task and prompt travel as multipart form fields next to the
        # image: a JSON body cannot be combined with a file upload, so sending
        # them via ``json=`` would silently drop them.
        form = {"task": task}
        if prompt:
            form["prompt"] = prompt
        return await self._upload(
            f"{self.parse_media_endpoint}/process_image",
            file_path,
            field="image",
            data=form,
        )

    async def parse_website(self, url: str) -> dict:
        """
        Ask the server to parse the page at ``url``.

        Args:
            url (str): URL of the website to parse.

        Returns:
            dict: JSON response from the server.
        """
        return await self.__request__(self.parse_website_endpoint, json={"url": url})
382+
0 commit comments