Skip to content

Commit eea3e0e

Browse files
♻️ refactor sources module (#131)
1 parent 6117b0a commit eea3e0e

File tree

7 files changed

+300
-248
lines changed

7 files changed

+300
-248
lines changed

lib/mindee/input/sources.rb

Lines changed: 6 additions & 248 deletions
Original file line numberDiff line numberDiff line change
@@ -1,250 +1,8 @@
11
# frozen_string_literal: true
22

3-
require 'stringio'
4-
require 'marcel'
5-
6-
require_relative '../pdf'
7-
require_relative '../image'
8-
9-
module Mindee
10-
module Input
11-
# Document source handling.
12-
module Source
13-
# Mime types accepted by the server.
14-
ALLOWED_MIME_TYPES = [
15-
'application/pdf',
16-
'image/heic',
17-
'image/png',
18-
'image/jpeg',
19-
'image/tiff',
20-
'image/webp',
21-
].freeze
22-
23-
# Standard error for invalid mime types
24-
class MimeTypeError < StandardError
25-
end
26-
27-
# Error sent if the file's mimetype isn't allowed
28-
class InvalidMimeTypeError < MimeTypeError
29-
# @return [String]
30-
attr_reader :invalid_mimetype
31-
32-
# @param mime_type [String]
33-
def initialize(mime_type)
34-
@invalid_mimetype = mime_type
35-
super("'#{@invalid_mimetype}' mime type not allowed, must be one of #{ALLOWED_MIME_TYPES.join(', ')}")
36-
end
37-
end
38-
39-
# Error sent if a pdf file couldn't be fixed
40-
class UnfixablePDFError < MimeTypeError
41-
def initialize
42-
super("Corrupted PDF couldn't be repaired.")
43-
end
44-
end
45-
46-
# Base class for loading documents.
47-
class LocalInputSource
48-
# @return [String]
49-
attr_reader :filename
50-
# @return [String]
51-
attr_reader :file_mimetype
52-
# @return [StringIO]
53-
attr_reader :io_stream
54-
55-
# @param io_stream [StringIO]
56-
# @param filename [String]
57-
# @param fix_pdf [Boolean]
58-
def initialize(io_stream, filename, fix_pdf: false)
59-
@io_stream = io_stream
60-
@filename = filename
61-
@file_mimetype = if fix_pdf
62-
Marcel::MimeType.for @io_stream
63-
else
64-
Marcel::MimeType.for @io_stream, name: @filename
65-
end
66-
return if ALLOWED_MIME_TYPES.include? @file_mimetype
67-
68-
if filename.end_with?('.pdf') && fix_pdf
69-
rescue_broken_pdf(@io_stream)
70-
@file_mimetype = Marcel::MimeType.for @io_stream
71-
72-
return if ALLOWED_MIME_TYPES.include? @file_mimetype
73-
end
74-
75-
raise InvalidMimeTypeError, @file_mimetype.to_s
76-
end
77-
78-
# Attempts to fix pdf files if mimetype is rejected.
79-
# "Broken PDFs" are often a result of third-party injecting invalid headers.
80-
# This attempts to strip everything preceding the `%PDF-` marker so that the file can still be sent.
81-
# @param stream [StringIO]
82-
def rescue_broken_pdf(stream)
83-
stream.gets('%PDF-')
84-
raise UnfixablePDFError if stream.eof? || stream.pos > 500
85-
86-
stream.pos = stream.pos - 5
87-
data = stream.read
88-
@io_stream.close
89-
90-
@io_stream = StringIO.new
91-
@io_stream << data
92-
end
93-
94-
# Shorthand for pdf mimetype validation.
95-
def pdf?
96-
@file_mimetype.to_s == 'application/pdf'
97-
end
98-
99-
# Parses a PDF file according to provided options.
100-
# @param options [Hash, nil] Page cutting/merge options:
101-
#
102-
# * `:page_indexes` Zero-based list of page indexes.
103-
# * `:operation` Operation to apply on the document, given the `page_indexes` specified:
104-
# * `:KEEP_ONLY` - keep only the specified pages, and remove all others.
105-
# * `:REMOVE` - remove the specified pages, and keep all others.
106-
# * `:on_min_pages` Apply the operation only if document has at least this many pages.
107-
def process_pdf(options)
108-
@io_stream.seek(0)
109-
@io_stream = PdfProcessor.parse(@io_stream, options)
110-
end
111-
112-
# Reads a document.
113-
# @param close [Boolean]
114-
# @return [Array<String, [String, aBinaryString ], [Hash, nil] >]
115-
def read_document(close: true)
116-
@io_stream.seek(0)
117-
# Avoids needlessly re-packing some files
118-
data = @io_stream.read
119-
@io_stream.close if close
120-
['document', data, { filename: Mindee::Input::Source.convert_to_unicode_escape(@filename) }]
121-
end
122-
123-
def count_pdf_pages
124-
return 1 unless pdf?
125-
126-
@io_stream.seek(0)
127-
pdf_processor = Mindee::PDF::PdfProcessor.open_pdf(@io_stream)
128-
pdf_processor.pages.size
129-
end
130-
131-
# Compresses the file, according to the provided info.
132-
# @param [Integer] quality Quality of the output file.
133-
# @param [Integer, nil] max_width Maximum width (Ignored for PDFs).
134-
# @param [Integer, nil] max_height Maximum height (Ignored for PDFs).
135-
# @param [Boolean] force_source_text Whether to force the operation on PDFs with source text.
136-
# This will attempt to re-render PDF text over the rasterized original. If disabled, the operation is ignored.
137-
# WARNING: this operation is strongly discouraged.
138-
# @param [Boolean] disable_source_text If the PDF has source text, whether to re-apply it to the original or
139-
# not. Needs force_source_text to work.
140-
def compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true)
141-
buffer = if pdf?
142-
Mindee::PDF::PDFCompressor.compress_pdf(
143-
@io_stream,
144-
quality: quality,
145-
force_source_text_compression: force_source_text,
146-
disable_source_text: disable_source_text
147-
)
148-
else
149-
Mindee::Image::ImageCompressor.compress_image(
150-
@io_stream,
151-
quality: quality,
152-
max_width: max_width,
153-
max_height: max_height
154-
)
155-
end
156-
@io_stream = buffer
157-
@io_stream.rewind
158-
end
159-
160-
# Checks whether the file is a PDF that has source text. Returns false otherwise.
161-
# @return [Boolean] True if the file is a PDF and has source text.
162-
def source_text?
163-
Mindee::PDF::PDFTools.source_text?(@io_stream)
164-
end
165-
end
166-
167-
# Load a document from a path.
168-
class PathInputSource < LocalInputSource
169-
# @param filepath [String]
170-
# @param fix_pdf [Boolean]
171-
def initialize(filepath, fix_pdf: false)
172-
io_stream = File.open(filepath, 'rb')
173-
super(io_stream, File.basename(filepath), fix_pdf: fix_pdf)
174-
end
175-
end
176-
177-
# Load a document from a base64 string.
178-
class Base64InputSource < LocalInputSource
179-
# @param base64_string [String]
180-
# @param filename [String]
181-
# @param fix_pdf [Boolean]
182-
def initialize(base64_string, filename, fix_pdf: false)
183-
io_stream = StringIO.new(base64_string.unpack1('m*'))
184-
io_stream.set_encoding Encoding::BINARY
185-
super(io_stream, filename, fix_pdf: fix_pdf)
186-
end
187-
188-
# Overrides the parent method so that the data is returned as a base64-encoded string.
189-
# @param close [Boolean]
190-
# @return [Array<String, [String, aBinaryString ], [Hash, nil] >]
191-
def read_document(close: true)
192-
@io_stream.seek(0)
193-
data = @io_stream.read
194-
@io_stream.close if close
195-
['document', [data].pack('m'), { filename: Source.convert_to_unicode_escape(@filename) }]
196-
end
197-
end
198-
199-
# Load a document from raw bytes.
200-
class BytesInputSource < LocalInputSource
201-
# @param raw_bytes [String]
202-
# @param filename [String]
203-
# @param fix_pdf [Boolean]
204-
def initialize(raw_bytes, filename, fix_pdf: false)
205-
io_stream = StringIO.new(raw_bytes)
206-
io_stream.set_encoding Encoding::BINARY
207-
super(io_stream, filename, fix_pdf: fix_pdf)
208-
end
209-
end
210-
211-
# Load a document from a file handle.
212-
class FileInputSource < LocalInputSource
213-
# @param input_file [File]
214-
# @param filename [String]
215-
# @param fix_pdf [Boolean]
216-
def initialize(input_file, filename, fix_pdf: false)
217-
io_stream = input_file
218-
super(io_stream, filename, fix_pdf: fix_pdf)
219-
end
220-
end
221-
222-
# Load a remote document from a file url.
223-
class UrlInputSource
224-
# @return [String]
225-
attr_reader :url
226-
227-
def initialize(url)
228-
raise 'URL must be HTTPS' unless url.start_with? 'https://'
229-
230-
@url = url
231-
end
232-
end
233-
234-
# Replaces non-ASCII characters with their Unicode escape sequence.
235-
# Keeps other characters as is.
236-
# @return [String] The escaped string.
237-
def self.convert_to_unicode_escape(string)
238-
unicode_escape_string = ''.dup
239-
string.each_char do |char|
240-
unicode_escape_string << if char.bytesize > 1
241-
"\\u#{char.unpack1('U').to_s(16).rjust(4, '0')}"
242-
else
243-
char
244-
end
245-
end
246-
unicode_escape_string
247-
end
248-
end
249-
end
250-
end
3+
require_relative 'sources/local_input_source'
4+
require_relative 'sources/bytes_input_source'
5+
require_relative 'sources/base64_input_source'
6+
require_relative 'sources/file_input_source'
7+
require_relative 'sources/path_input_source'
8+
require_relative 'sources/url_input_source'
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# frozen_string_literal: true
2+
3+
require 'stringio'
4+
5+
module Mindee
6+
module Input
7+
module Source
8+
# Load a document from a base64 string.
9+
class Base64InputSource < LocalInputSource
10+
# @param base64_string [String]
11+
# @param filename [String]
12+
# @param fix_pdf [Boolean]
13+
def initialize(base64_string, filename, fix_pdf: false)
14+
io_stream = StringIO.new(base64_string.unpack1('m*'))
15+
io_stream.set_encoding Encoding::BINARY
16+
super(io_stream, filename, fix_pdf: fix_pdf)
17+
end
18+
19+
# Overrides the parent method so that the data is returned as a base64-encoded string.
20+
# @param close [Boolean]
21+
# @return [Array<String, [String, aBinaryString ], [Hash, nil] >]
22+
def read_document(close: true)
23+
@io_stream.seek(0)
24+
data = @io_stream.read
25+
@io_stream.close if close
26+
['document', [data].pack('m'), { filename: Source.convert_to_unicode_escape(@filename) }]
27+
end
28+
end
29+
end
30+
end
31+
end
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# frozen_string_literal: true
2+
3+
require 'stringio'
4+
5+
module Mindee
6+
module Input
7+
module Source
8+
# Load a document from raw bytes.
9+
class BytesInputSource < LocalInputSource
10+
# @param raw_bytes [String]
11+
# @param filename [String]
12+
# @param fix_pdf [Boolean]
13+
def initialize(raw_bytes, filename, fix_pdf: false)
14+
io_stream = StringIO.new(raw_bytes)
15+
io_stream.set_encoding Encoding::BINARY
16+
super(io_stream, filename, fix_pdf: fix_pdf)
17+
end
18+
end
19+
end
20+
end
21+
end
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# frozen_string_literal: true
2+
3+
require 'stringio'
4+
5+
module Mindee
6+
module Input
7+
module Source
8+
# Load a document from a file handle.
9+
class FileInputSource < LocalInputSource
10+
# @param input_file [File]
11+
# @param filename [String]
12+
# @param fix_pdf [Boolean]
13+
def initialize(input_file, filename, fix_pdf: false)
14+
io_stream = input_file
15+
super(io_stream, filename, fix_pdf: fix_pdf)
16+
end
17+
end
18+
end
19+
end
20+
end

0 commit comments

Comments
 (0)