Skip to content

Commit 6117b0a

Browse files
🐛 fix tax extractor script (#130)
1 parent 5e4a9de commit 6117b0a

File tree

1 file changed

+34
-19
lines changed

1 file changed

+34
-19
lines changed

lib/mindee/extraction/tax_extractor/tax_extractor.rb

Lines changed: 34 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,8 @@
22

33
require_relative 'ocr_extractor'
44

5+
# rubocop:disable Metrics/ClassLength
6+
57
module Mindee
68
module Extraction
79
# Tax extractor class
@@ -72,9 +74,12 @@ def self.curate_values(found_hash, min_rate_percentage, max_rate_percentage)
7274
reconstructed_hash['code'] =
7375
found_hash['code'].nil? ? found_hash['code'] : found_hash['code'].sub(%r{\s*\.*\s*$}, '')
7476

75-
if found_hash['rate'] && found_hash['rate'] < 1 && (found_hash['rate']).positive?
76-
found_hash['rate'] =
77-
found_hash['rate'] * 100
77+
if found_hash['rate']
78+
if found_hash['rate'].abs < 1
79+
found_hash['rate'] *= 10
80+
elsif found_hash['rate'].abs > 100
81+
found_hash['rate'] /= 10
82+
end
7883
end
7984
found_hash = swap_rates_if_needed(found_hash, min_rate_percentage, max_rate_percentage)
8085
found_hash = decimate_rates_if_needed(found_hash)
@@ -125,18 +130,28 @@ def self.decimate_rates_if_needed(found_hash)
125130
# @param found_hash [Hash] Hash of currently retrieved values
126131
# @return [Hash]
127132
def self.set_base_and_value(reconstructed_hash, found_hash)
128-
if found_hash['base'].nil?
129-
reconstructed_hash['base'] = found_hash['base']
130-
reconstructed_hash['value'] = found_hash['value']
131-
elsif found_hash['value'].nil? && found_hash['base'] < found_hash['value']
132-
reconstructed_hash['base'] = found_hash['value']
133-
reconstructed_hash['value'] = found_hash['base']
134-
else
135-
reconstructed_hash['value'] = found_hash['value']
133+
base = found_hash['base']
134+
value = found_hash['value']
135+
136+
if base && value
137+
reconstructed_hash['base'], reconstructed_hash['value'] = [base, value].minmax
138+
elsif base
139+
reconstructed_hash['base'] = base
140+
elsif value
141+
reconstructed_hash['value'] = value
142+
calculate_base(reconstructed_hash)
136143
end
144+
137145
reconstructed_hash
138146
end
139147

148+
# Computes hash['base'] from hash['value'] and hash['rate'], storing the result in place.
# Leaves the hash untouched when the rate is nil or not strictly positive.
# @param hash [Hash] Hash read for 'rate' and 'value' and written under 'base'.
# @return [Numeric, nil] the value assigned to 'base', or nil when nothing was computed.
def self.calculate_base(hash)
149+
rate = hash['rate']
150+
# Safe navigation covers a nil rate; a zero rate is also rejected, which avoids dividing by zero below.
return unless rate&.positive?
151+
152+
# The rate is divided by 100.0 (presumably because it is a percentage), which also forces float division.
hash['base'] = hash['value'] / (rate / 100.0)
153+
end
154+
140155
# Extracts a single custom type of tax.
141156
# For the sake of simplicity, this only extracts the first example, unless specifically instructed otherwise.
142157
# @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] result of the OCR.
@@ -149,7 +164,6 @@ def self.extract_custom_tax(ocr_result, tax_names, min_rate_percentage = 0, max_
149164

150165
tax_names.sort!
151166
found_hash = pick_best(extract_horizontal_tax(ocr_result, tax_names), tax_names)
152-
# a tax is considered found horizontally if it has a value, otherwise it is vertical
153167
if found_hash.nil? || found_hash['value'].nil?
154168
found_hash = extract_vertical_tax(ocr_result, tax_names,
155169
found_hash)
@@ -240,14 +254,14 @@ def self.extract_horizontal_tax(ocr_result, tax_names)
240254
linear_pattern_percent_first = %r{
241255
((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
242256
([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]?
243-
((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
244-
((?:\s*-\s*)?(\d*[.,])*\d{2,})?
257+
((?:\s*-\s*)?(?:\d*[.,])+\d+)?[ .]*
258+
((?:\s*-\s*)?(\d*[.,])*\d+)?
245259
}x
246260
linear_pattern_percent_second = %r{
247261
([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]*
248262
((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
249-
((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
250-
((?:\s*-\s*)?(\d*[.,])*\d{2,})?
263+
((?:\s*-\s*)?(?:\d*[.,])+\d+)?[ .]*
264+
((?:\s*-\s*)?(\d*[.,])*\d+)?
251265
}x
252266
ocr_result.mvision_v1.pages.each.with_index do |page, page_id|
253267
page.all_lines.each do |line|
@@ -304,7 +318,7 @@ def self.extract_vertical_tax(ocr_result, tax_names, found_hash)
304318
page.all_words.each do |word|
305319
next if match_index(word.text, tax_names).nil?
306320

307-
reconstructed_line = ocr_result.reconstruct_vertically(word.polygon, page_id)
321+
reconstructed_line = ocr_result.reconstruct_vertically(word.polygon, page_id, 0.25)
308322
found_hash['page_id'] = page_id if found_hash['page_id'].nil?
309323
found_hash['code'] = word.text.strip if found_hash['code'].nil?
310324
found_hash = extract_vertical_tax_values(reconstructed_line, found_hash)
@@ -316,8 +330,9 @@ def self.extract_vertical_tax(ocr_result, tax_names, found_hash)
316330
private_class_method :extract_percentage_from_tax, :extract_basis_and_value, :extract_tax_from_horizontal_line,
317331
:extract_horizontal_tax, :extract_vertical_tax_values, :extract_vertical_tax,
318332
:create_tax_field, :fix_rate, :pick_best, :calculate_score, :curate_values,
319-
:decimate_rates_if_needed, :extract_basis_and_value, :set_base_and_value, :valid_candidate?,
320-
:swap_rates_if_needed
333+
:decimate_rates_if_needed, :set_base_and_value, :valid_candidate?,
334+
:swap_rates_if_needed, :calculate_base
321335
end
322336
end
323337
end
338+
# rubocop:enable Metrics/ClassLength

0 commit comments

Comments
 (0)