Skip to content

Commit d68586e

Browse files
committed
Add PDF Image Extractor script with README documentation
1 parent ce24464 commit d68586e

File tree

2 files changed

+139
-0
lines changed

2 files changed

+139
-0
lines changed

PDF Image Extractor/README.md

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# PDF Image Extractor
2+
3+
Recursively extracts all images from every PDF file in a directory tree. By default the images are saved in a subfolder named `PDF` inside the input root directory. Each PDF gets its own folder containing all images extracted from that document.
4+
5+
## Requirements
6+
7+
- Python 3.8+
8+
- [PyMuPDF](https://pymupdf.readthedocs.io) (`pip install pymupdf`)
9+
10+
## Usage
11+
12+
13+
1. Install dependencies:
14+
15+
```
16+
pip install pymupdf
17+
```
18+
19+
20+
2. Run the script:
21+
22+
```
23+
python pdf_image_extractor.py [--dir <input_dir>] [--out <output_dir>] [--dedup]
24+
```
25+
26+
- `--dir <input_dir>`: Root directory to search for PDFs (default: script directory)
27+
- `--out <output_dir>`: Output directory for images (default: `<dir>/PDF`)
28+
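- `--dedup`: Skip images that were already extracted from the same PDF, matched by their internal object reference (xref), so an image reused on several pages is saved only once (default: off)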
29+
30+
### Examples
31+
32+
Extract all images (including duplicates) from PDFs in `./my_pdfs`:
33+
34+
```
35+
python pdf_image_extractor.py --dir ./my_pdfs
36+
```
37+
38+
Extract only unique images per PDF, saving to a custom output folder:
39+
40+
```
41+
python pdf_image_extractor.py --dir ./my_pdfs --out ./images --dedup
42+
```
43+
44+
## Output Structure
45+
46+
- For each PDF that contains at least one image, a folder named after the PDF (without extension) is created in the output directory, mirroring the original directory structure.
47+
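- Images are saved as `page<page_number>_img<image_index>.<ext>`, where both numbers start at 1 and `<image_index>` is the image's position in that page's image list.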
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import os
2+
import argparse
3+
import fitz # PyMuPDF
4+
5+
6+
def extract_images(pdf_path, output_root, dedup):
    """Extract every embedded image of one PDF into its own output folder.

    Images are written as ``page<N>_img<M>.<ext>`` (both numbers 1-based)
    into a folder named after the PDF (without extension) below
    ``output_root``.  The folder is created lazily, right before the first
    image is written, so a PDF without images leaves no empty directory.

    Args:
        pdf_path: Path of the PDF file to read.
        output_root: Directory under which the per-PDF folder is created.
        dedup: When true, an image xref that was already handled for this
            PDF is skipped when it shows up again (e.g. on a later page).

    Returns:
        The number of image files written; 0 if the PDF could not be opened.
    """
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        # Deliberately broad: one unreadable PDF must not abort the whole
        # batch, so the failure is reported and the file is skipped.
        print(f"Failed to open {pdf_path}: {e}")
        return 0

    img_count = 0
    seen = set()
    try:
        output_folder = _image_output_folder(pdf_path, output_root)
        for page_num, page in enumerate(doc, start=1):
            images = page.get_images(full=True)
            for img_index, img in enumerate(images, start=1):
                xref = img[0]
                if dedup:
                    if xref in seen:
                        continue
                    seen.add(xref)

                try:
                    base_image = doc.extract_image(xref)
                except Exception as e:
                    # A single broken image object should not cost us the
                    # remaining images of this PDF (or the rest of the run).
                    print(
                        f"Skipping image xref {xref} on page {page_num} "
                        f"of {pdf_path}: {e}"
                    )
                    continue
                if not base_image:
                    # Nothing extractable was returned for this xref.
                    continue

                image_bytes = base_image["image"]
                image_ext = base_image.get("ext", "png")
                if img_count == 0:
                    os.makedirs(output_folder, exist_ok=True)
                img_count += 1
                img_filename = f"page{page_num}_img{img_index}.{image_ext}"
                img_path = os.path.join(output_folder, img_filename)
                with open(img_path, "wb") as img_file:
                    img_file.write(image_bytes)
    finally:
        # Release the document even if extraction or a file write fails.
        doc.close()

    print(f"Extracted {img_count} images from {pdf_path} to {output_folder}")
    return img_count


def _image_output_folder(pdf_path, output_root):
    """Return the per-PDF folder below *output_root*, mirroring the PDF's location."""
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    try:
        common = os.path.commonpath([pdf_path, output_root])
        rel_dir = os.path.dirname(os.path.relpath(pdf_path, start=common))
    except ValueError:
        # commonpath() rejects a mix of absolute and relative paths and paths
        # on different Windows drives; fall back to a flat layout.
        rel_dir = ""
    return os.path.join(output_root, rel_dir, pdf_name)
45+
46+
47+
def main():
    """Parse the command line and extract images from every PDF under ``--dir``.

    Walks the input directory recursively, calls ``extract_images`` for each
    file whose name ends in ``.pdf`` (case-insensitive) and prints the total
    number of images written.  Exits with an argparse usage error when the
    input directory does not exist, instead of silently reporting 0 images.
    """
    parser = argparse.ArgumentParser(
        description="Recursively extract images from all PDFs in a directory tree."
    )
    parser.add_argument(
        "--dir",
        type=str,
        default=os.path.dirname(os.path.abspath(__file__)),
        help="Root directory to search for PDF files (default: script directory)",
    )
    parser.add_argument(
        "--out",
        type=str,
        default=None,
        help="Output directory for extracted images (default: <dir>/PDF)",
    )
    parser.add_argument(
        "--dedup",
        action="store_true",
        help="Enable deduplication of images per PDF (default: off, extract all images including duplicates).",
    )
    args = parser.parse_args()

    pdf_dir = os.path.abspath(args.dir)
    if not os.path.isdir(pdf_dir):
        # os.walk() yields nothing for a missing directory, which would make
        # a mistyped path look like a successful run that found no images.
        parser.error(f"input directory does not exist: {pdf_dir}")

    # Default output is <dir>/PDF
    output_root = (
        os.path.abspath(args.out) if args.out else os.path.join(pdf_dir, "PDF")
    )

    pdf_files = []
    for root, _, files in os.walk(pdf_dir):
        for name in files:
            if name.lower().endswith(".pdf"):
                pdf_files.append(os.path.join(root, name))
    # os.walk() order depends on the file system; sort for reproducible runs.
    pdf_files.sort()

    total_images = sum(
        extract_images(pdf_path, output_root, args.dedup) for pdf_path in pdf_files
    )

    print(
        f"---\nDone extracting images from all PDFs.\nTotal images extracted: {total_images}"
    )
89+
90+
91+
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)