diff --git a/README.md b/README.md
index 223a84e..2f9e15c 100644
--- a/README.md
+++ b/README.md
@@ -58,14 +58,26 @@ pip install -r requirements.txt
 
 ### **3. Convert a Single File**
 
+Use the ``docuparse`` CLI to convert a PDF into Markdown:
+
+```bash
+python -m docuparse.cli convert /path/to/file.pdf /path/to/output.md --max-pages 10
+```
+
+### **4. Run the API Server**
+
+Start a FastAPI server that exposes a ``/convert`` endpoint:
+
 ```bash
-python convert_single.py /path/to/file.pdf /path/to/output.md --parallel_factor 2 --max_pages 10
+uvicorn docuparse.fastapi_app:app --reload
 ```
 
-### **4. Convert Multiple Files**
+### **5. Launch the Gradio UI**
 
+The UI forwards uploads to the API at ``http://localhost:8000/convert``, so start the server from step 4 first:
+
 ```bash
-python convert.py /path/to/input/folder /path/to/output/folder --workers 10 --max 10
+python -m docuparse.gradio_app
 ```
 
 ---
diff --git a/convert_single.py b/convert_single.py
deleted file mode 100644
index 4e689d1..0000000
--- a/convert_single.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import argparse
-from pdfminer.high_level import extract_text
-
-
-def convert_pdf_to_markdown(input_path: str, output_path: str, max_pages: int | None = None) -> None:
-    """Extract text from a PDF and write it to a Markdown file.
-
-    Parameters
-    ----------
-    input_path : str
-        Path to the input PDF file.
-    output_path : str
-        Path to the output Markdown file.
-    max_pages : int | None, optional
-        Maximum number of pages to process from the PDF. ``None`` processes all pages.
-    """
-    text = extract_text(input_path, maxpages=max_pages)
-    with open(output_path, "w", encoding="utf-8") as f:
-        f.write(text)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Convert a PDF to Markdown text.")
-    parser.add_argument("input_pdf", help="Path to the PDF file")
-    parser.add_argument("output_md", help="Path for the output Markdown file")
-    parser.add_argument(
-        "--parallel_factor",
-        type=int,
-        default=1,
-        help="Reserved for future parallel processing; unused in this script",
-    )
-    parser.add_argument(
-        "--max_pages", type=int, default=None, help="Maximum number of pages to process"
-    )
-    args = parser.parse_args()
-
-    convert_pdf_to_markdown(args.input_pdf, args.output_md, args.max_pages)
diff --git a/docuparse/__init__.py b/docuparse/__init__.py
new file mode 100644
index 0000000..1060135
--- /dev/null
+++ b/docuparse/__init__.py
@@ -0,0 +1,5 @@
+"""DocuParse package."""
+
+__all__ = ["convert_pdf_to_markdown", "save_markdown"]
+
+from .converter import convert_pdf_to_markdown, save_markdown
diff --git a/docuparse/__main__.py b/docuparse/__main__.py
new file mode 100644
index 0000000..98dcca0
--- /dev/null
+++ b/docuparse/__main__.py
@@ -0,0 +1,4 @@
+from .cli import cli
+
+if __name__ == "__main__":
+    cli()
diff --git a/docuparse/cli.py b/docuparse/cli.py
new file mode 100644
index 0000000..7d36082
--- /dev/null
+++ b/docuparse/cli.py
@@ -0,0 +1,23 @@
+import click
+from .converter import convert_pdf_to_markdown, save_markdown
+
+
+@click.group()
+def cli() -> None:
+    """DocuParse command line interface."""
+    pass
+
+
+@cli.command()
+@click.argument("input_pdf")
+@click.argument("output_md")
+@click.option("--max-pages", type=int, default=None, help="Maximum pages to process")
+def convert(input_pdf: str, output_md: str, max_pages: int | None) -> None:
+    """Convert a PDF file to Markdown."""
+    text = convert_pdf_to_markdown(input_pdf, max_pages=max_pages)
+    save_markdown(text, output_md)
+    click.echo(f"Markdown saved to {output_md}")
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/docuparse/converter.py b/docuparse/converter.py
new file mode 100644
index 0000000..14184e0
--- /dev/null
+++ b/docuparse/converter.py
@@ -0,0 +1,13 @@
+from pdfminer.high_level import extract_text
+
+
+def convert_pdf_to_markdown(input_path: str, max_pages: int | None = None) -> str:
+    """Extract text from a PDF and return it as Markdown string."""
+    text = extract_text(input_path, maxpages=max_pages)
+    return text
+
+
+def save_markdown(text: str, output_path: str) -> None:
+    """Write the Markdown text to the specified file."""
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write(text)
diff --git a/docuparse/fastapi_app.py b/docuparse/fastapi_app.py
new file mode 100644
index 0000000..4c95f37
--- /dev/null
+++ b/docuparse/fastapi_app.py
@@ -0,0 +1,26 @@
+import os
+import tempfile
+
+from fastapi import FastAPI, File, UploadFile, Form
+from fastapi.responses import PlainTextResponse
+
+from .converter import convert_pdf_to_markdown
+
+app = FastAPI(title="DocuParse API")
+
+
+@app.post("/convert", response_class=PlainTextResponse)
+async def convert_endpoint(
+    file: UploadFile = File(...), max_pages: int | None = Form(None)
+) -> str:
+    """Convert an uploaded PDF and return Markdown."""
+    # The converter needs a filesystem path, so spool the upload to disk.
+    # delete=False keeps the file after the handle closes; we remove it below.
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
+    try:
+        with tmp:
+            tmp.write(await file.read())
+        # Handle is closed here so the path can be reopened on any platform.
+        return convert_pdf_to_markdown(tmp.name, max_pages=max_pages)
+    finally:
+        os.unlink(tmp.name)
diff --git a/docuparse/gradio_app.py b/docuparse/gradio_app.py
new file mode 100644
index 0000000..74d69de
--- /dev/null
+++ b/docuparse/gradio_app.py
@@ -0,0 +1,34 @@
+import os
+from typing import Any
+
+import requests
+import gradio as gr
+
+API_URL = "http://localhost:8000/convert"
+REQUEST_TIMEOUT = 300  # seconds; large PDFs can take a while to convert
+
+
+def convert_pdf(file: Any, max_pages: int | None) -> str:
+    """Send PDF to the API and return Markdown text."""
+    if file is None:
+        raise gr.Error("Please upload a PDF file first.")
+    # gr.File passes either a path string or an object exposing ``.name``,
+    # depending on the Gradio version and the component's ``type`` setting.
+    path = file if isinstance(file, str) else file.name
+    data = {"max_pages": max_pages} if max_pages is not None else {}
+    with open(path, "rb") as f:
+        files = {"file": (os.path.basename(path), f, "application/pdf")}
+        resp = requests.post(API_URL, files=files, data=data, timeout=REQUEST_TIMEOUT)
+    resp.raise_for_status()
+    return resp.text
+
+
+iface = gr.Interface(
+    fn=convert_pdf,
+    inputs=[gr.File(label="PDF"), gr.Number(label="Max pages", precision=0)],
+    outputs="text",
+    title="DocuParse UI",
+)
+
+if __name__ == "__main__":
+    iface.launch()
diff --git a/requirements.txt b/requirements.txt
index 6d75b17..395b29e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,9 @@ pdf2image
 pillow
 numpy
 tqdm
+click
+fastapi
+uvicorn
+gradio
+requests
+python-multipart