|
| 1 | +import io |
| 2 | +import numpy as np |
| 3 | +from PIL import Image |
| 4 | +import easyocr |
| 5 | + |
| 6 | +from text_extract_api.extract.strategies.strategy import Strategy |
| 7 | +from text_extract_api.files.file_formats.file_format import FileFormat |
| 8 | +from text_extract_api.files.file_formats.image_file_format import ImageFileFormat |
| 9 | + |
| 10 | + |
class EasyOCR(Strategy):
    """Text-extraction strategy that delegates recognition to EasyOCR."""

    @classmethod
    def name(cls) -> str:
        """Return the name of this strategy."""
        return "easyOCR"

    def extract_text(self, file_format: FileFormat) -> str:
        """Extract text from ``file_format`` using EasyOCR.

        The input is converted to ``ImageFileFormat`` objects; each one is
        decoded with Pillow, normalised to RGB and handed to EasyOCR.

        Args:
            file_format: The file to read text from.

        Returns:
            The recognised text. Fragments found in one image are joined
            with a newline; the text of consecutive images is separated by
            a blank line.

        Raises:
            TypeError: If ``file_format`` is not an ``ImageFileFormat`` and
                cannot be converted to one.
        """
        # Fail early when the input can neither be used as an image nor
        # converted to one.
        if not (
            isinstance(file_format, ImageFileFormat)
            or file_format.can_convert_to(ImageFileFormat)
        ):
            raise TypeError(
                f"EasyOCR - format {file_format.mime_type} is not supported (yet?)"
            )

        # Convert the input file to ImageFileFormat objects (one per
        # image/page).
        images = FileFormat.convert_to(file_format, ImageFileFormat)

        # Add or change languages to your needs, e.g., ['en', 'fr'].
        reader = easyocr.Reader(['en'])

        page_texts = []
        for image_format in images:
            # The context manager releases the decoder's resources as soon as
            # the pixel data has been copied into the array.
            with Image.open(io.BytesIO(image_format.binary)) as pil_image:
                # Normalise the pixel layout before building the array:
                # palette ("P") images would otherwise yield palette indices,
                # bilevel ("1") images a boolean array, and CMYK/LA images a
                # channel count that does not mean RGB(A).
                np_image = np.array(pil_image.convert("RGB"))

            # With `detail=0` EasyOCR returns only the recognised strings,
            # without bounding boxes or confidences.
            fragments = reader.readtext(np_image, detail=0)
            page_texts.append("\n".join(fragments))

        # Separate the text of consecutive images/pages with a blank line.
        return "\n\n".join(page_texts)
0 commit comments