|
1 | 1 | # frozen_string_literal: true |
2 | 2 |
|
3 | | -require 'stringio' |
4 | | -require 'marcel' |
5 | | - |
6 | | -require_relative '../pdf' |
7 | | -require_relative '../image' |
8 | | - |
9 | | -module Mindee |
10 | | - module Input |
11 | | - # Document source handling. |
12 | | - module Source |
# Mime types the server accepts for uploaded documents.
# Order matters: it is echoed verbatim in InvalidMimeTypeError messages.
ALLOWED_MIME_TYPES = %w[
  application/pdf
  image/heic
  image/png
  image/jpeg
  image/tiff
  image/webp
].freeze
22 | | - |
# Common ancestor of every mime-type related error raised while loading a source.
class MimeTypeError < StandardError; end
26 | | - |
# Raised when a file's detected mime type is not part of ALLOWED_MIME_TYPES.
class InvalidMimeTypeError < MimeTypeError
  # The rejected mime type, exactly as handed to the constructor.
  # @return [String]
  attr_reader :invalid_mimetype

  # @param mime_type [String] Mime type that was rejected.
  def initialize(mime_type)
    @invalid_mimetype = mime_type
    accepted = ALLOWED_MIME_TYPES.join(', ')
    super("'#{mime_type}' mime type not allowed, must be one of #{accepted}")
  end
end
38 | | - |
# Raised when an attempt to repair a corrupted PDF did not succeed.
class UnfixablePDFError < MimeTypeError
  # Takes no argument: the message is fixed.
  def initialize
    message = "Corrupted PDF couldn't be repaired."
    super(message)
  end
end
45 | | - |
# Base class for loading documents.
#
# Holds an IO-like stream together with its filename, detects the mime type through Marcel
# and refuses anything that is not listed in `ALLOWED_MIME_TYPES`.
class LocalInputSource
  # @return [String]
  attr_reader :filename
  # @return [String]
  attr_reader :file_mimetype
  # @return [StringIO]
  attr_reader :io_stream

  # @param io_stream [StringIO] Stream holding the document.
  # @param filename [String] Name of the document; used as a detection hint unless `fix_pdf` is set.
  # @param fix_pdf [Boolean] When true, the mime type is detected without the filename hint, and a
  #   rejected file whose name ends in `.pdf` (any case) goes through `rescue_broken_pdf` before
  #   being rejected for good.
  # @raise [InvalidMimeTypeError] if the detected mime type is not allowed.
  # @raise [UnfixablePDFError] if `fix_pdf` is set and the PDF header cannot be recovered.
  def initialize(io_stream, filename, fix_pdf: false)
    @io_stream = io_stream
    @filename = filename
    @file_mimetype = if fix_pdf
                       Marcel::MimeType.for @io_stream
                     else
                       Marcel::MimeType.for @io_stream, name: @filename
                     end
    return if ALLOWED_MIME_TYPES.include? @file_mimetype

    if fix_pdf && pdf_extension?
      rescue_broken_pdf(@io_stream)
      @file_mimetype = Marcel::MimeType.for @io_stream

      return if ALLOWED_MIME_TYPES.include? @file_mimetype
    end

    raise InvalidMimeTypeError, @file_mimetype.to_s
  end

  # Attempts to fix pdf files if mimetype is rejected.
  # "Broken PDFs" are often a result of a third party injecting invalid headers.
  # This drops everything located before the `%PDF-` marker and swaps `@io_stream`
  # for an in-memory stream holding the remaining bytes.
  # @param stream [StringIO] Stream to scan; it is read from its very beginning.
  # @raise [UnfixablePDFError] if the marker is missing, is the last thing in the stream,
  #   or ends more than 500 bytes into the stream.
  # @return [StringIO] The replacement stream, positioned at its start.
  def rescue_broken_pdf(stream)
    # Earlier reads may have moved the cursor; the marker search must start from byte 0.
    stream.rewind
    stream.gets('%PDF-')
    raise UnfixablePDFError if stream.eof? || stream.pos > 500

    # Step back over the 5-byte marker so that it is kept in the repaired data.
    stream.pos = stream.pos - 5
    data = stream.read
    @io_stream.close

    # Building the StringIO from the data (rather than appending to an empty one)
    # leaves the cursor at 0, ready for the next reader.
    @io_stream = StringIO.new(data)
  end

  # Shorthand for pdf mimetype validation.
  # @return [Boolean]
  def pdf?
    @file_mimetype.to_s == 'application/pdf'
  end

  # Parses a PDF file according to provided options.
  # The stream is rewound, then replaced by whatever the PDF processor returns.
  # @param options [Hash, nil] Page cutting/merge options:
  #
  #  * `:page_indexes` Zero-based list of page indexes.
  #  * `:operation` Operation to apply on the document, given the `page_indexes` specified:
  #      * `:KEEP_ONLY` - keep only the specified pages, and remove all others.
  #      * `:REMOVE` - remove the specified pages, and keep all others.
  #  * `:on_min_pages` Apply the operation only if document has at least this many pages.
  def process_pdf(options)
    @io_stream.seek(0)
    # Fully qualified, like in `count_pdf_pages`: the bare constant is not reachable
    # from the lexical scope of Mindee::Input::Source.
    @io_stream = Mindee::PDF::PdfProcessor.parse(@io_stream, options)
  end

  # Reads a document.
  # @param close [Boolean] Whether to close the stream once its content has been read.
  # @return [Array(String, String, Hash)] The `'document'` field name, the raw bytes, and a hash
  #   holding the unicode-escaped `:filename`.
  def read_document(close: true)
    @io_stream.seek(0)
    # The bytes are forwarded exactly as read, without any re-encoding.
    data = @io_stream.read
    @io_stream.close if close
    ['document', data, { filename: Mindee::Input::Source.convert_to_unicode_escape(@filename) }]
  end

  # Counts the pages of the document.
  # @return [Integer] Page count as reported by the PDF processor; always 1 when the source is not a PDF.
  def count_pdf_pages
    return 1 unless pdf?

    @io_stream.seek(0)
    pdf_processor = Mindee::PDF::PdfProcessor.open_pdf(@io_stream)
    pdf_processor.pages.size
  end

  # Compresses the file, according to the provided info.
  # The compressed buffer replaces the current stream, which is then rewound.
  # @param [Integer] quality Quality of the output file.
  # @param [Integer, nil] max_width Maximum width (Ignored for PDFs).
  # @param [Integer, nil] max_height Maximum height (Ignored for PDFs).
  # @param [Boolean] force_source_text Whether to force the operation on PDFs with source text.
  #  This will attempt to re-render PDF text over the rasterized original. If disabled, the operation
  #  is ignored for those PDFs.
  #  WARNING: this operation is strongly discouraged.
  # @param [Boolean] disable_source_text If the PDF has source text, whether to re-apply it to the original or
  #  not. Needs force_source_text to work.
  def compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true)
    buffer = if pdf?
               Mindee::PDF::PDFCompressor.compress_pdf(
                 @io_stream,
                 quality: quality,
                 force_source_text_compression: force_source_text,
                 disable_source_text: disable_source_text
               )
             else
               Mindee::Image::ImageCompressor.compress_image(
                 @io_stream,
                 quality: quality,
                 max_width: max_width,
                 max_height: max_height
               )
             end
    @io_stream = buffer
    @io_stream.rewind
  end

  # Checks whether the file has source text.
  # NOTE(review): the check is delegated to PDFTools without testing `pdf?` first; the
  # "false for non-PDF files" part of the contract presumably lives there - confirm.
  # @return [Boolean] True if the file is a PDF and has source text.
  def source_text?
    Mindee::PDF::PDFTools.source_text?(@io_stream)
  end

  private

  # Whether the filename carries a `.pdf` extension, regardless of letter case.
  # @return [Boolean]
  def pdf_extension?
    @filename.to_s.downcase.end_with?('.pdf')
  end
end
166 | | - |
# Load a document from a path.
class PathInputSource < LocalInputSource
  # Opens the file in binary mode and hands it to the base class under its basename.
  # @param filepath [String] Path of the file to open.
  # @param fix_pdf [Boolean] Forwarded to the base class.
  def initialize(filepath, fix_pdf: false)
    io_stream = File.open(filepath, 'rb')
    begin
      super(io_stream, File.basename(filepath), fix_pdf: fix_pdf)
    rescue StandardError
      # The handle was opened here, so it must not outlive a rejected file.
      io_stream.close unless io_stream.closed?
      raise
    end
  end
end
176 | | - |
# Load a document from a base64 string.
class Base64InputSource < LocalInputSource
  # Decodes the string and wraps the resulting bytes in a binary in-memory stream.
  # @param base64_string [String] Base64 representation of the document.
  # @param filename [String]
  # @param fix_pdf [Boolean]
  def initialize(base64_string, filename, fix_pdf: false)
    decoded = base64_string.unpack1('m*')
    stream = StringIO.new(decoded)
    stream.set_encoding(Encoding::BINARY)
    super(stream, filename, fix_pdf: fix_pdf)
  end

  # Same contract as the parent method, except that the payload is returned base64-encoded.
  # @param close [Boolean] Whether to close the stream after reading it.
  # @return [Array(String, String, Hash)] The `'document'` field name, the base64 payload, and a hash
  #   holding the unicode-escaped `:filename`.
  def read_document(close: true)
    @io_stream.seek(0)
    encoded = [@io_stream.read].pack('m')
    @io_stream.close if close
    escaped_name = Source.convert_to_unicode_escape(@filename)
    ['document', encoded, { filename: escaped_name }]
  end
end
198 | | - |
# Load a document from raw bytes.
class BytesInputSource < LocalInputSource
  # Wraps the bytes in a binary in-memory stream before delegating to the base class.
  # @param raw_bytes [String] Content of the document.
  # @param filename [String]
  # @param fix_pdf [Boolean]
  def initialize(raw_bytes, filename, fix_pdf: false)
    stream = StringIO.new(raw_bytes)
    stream.set_encoding(Encoding::BINARY)
    super(stream, filename, fix_pdf: fix_pdf)
  end
end
210 | | - |
# Load a document from a file handle.
class FileInputSource < LocalInputSource
  # The handle is used directly as the source stream.
  # @param input_file [File] Already opened file.
  # @param filename [String]
  # @param fix_pdf [Boolean]
  def initialize(input_file, filename, fix_pdf: false)
    super(input_file, filename, fix_pdf: fix_pdf)
  end
end
221 | | - |
# Load a remote document from a file url.
class UrlInputSource
  # @return [String]
  attr_reader :url

  # @param url [String] Location of the document; must begin with `https://`.
  # @raise [RuntimeError] if the URL does not begin with `https://`.
  def initialize(url)
    secure = url.start_with?('https://')
    raise 'URL must be HTTPS' unless secure

    @url = url
  end
end
233 | | - |
# Replaces every multi-byte character by a `\u` escape built from its lowercase hexadecimal
# code point, zero-padded to at least four digits. Single-byte characters are copied untouched.
# @param string [String] Text to escape.
# @return [String] A new string holding the escaped text.
def self.convert_to_unicode_escape(string)
  string.each_char.with_object(+'') do |char, escaped|
    escaped << (char.bytesize > 1 ? format('\\u%04x', char.unpack1('U')) : char)
  end
end
248 | | - end |
249 | | - end |
250 | | -end |
| 3 | +require_relative 'sources/local_input_source' |
| 4 | +require_relative 'sources/bytes_input_source' |
| 5 | +require_relative 'sources/base64_input_source' |
| 6 | +require_relative 'sources/file_input_source' |
| 7 | +require_relative 'sources/path_input_source' |
| 8 | +require_relative 'sources/url_input_source' |
0 commit comments