22
33require_relative 'ocr_extractor'
44
5+ # rubocop:disable Metrics/ClassLength
6+
57module Mindee
68 module Extraction
79 # Tax extractor class
@@ -72,9 +74,12 @@ def self.curate_values(found_hash, min_rate_percentage, max_rate_percentage)
7274 reconstructed_hash [ 'code' ] =
7375 found_hash [ 'code' ] . nil? ? found_hash [ 'code' ] : found_hash [ 'code' ] . sub ( %r{\s *\. *\s *$} , '' )
7476
75- if found_hash [ 'rate' ] && found_hash [ 'rate' ] < 1 && ( found_hash [ 'rate' ] ) . positive?
76- found_hash [ 'rate' ] =
77- found_hash [ 'rate' ] * 100
77+ if found_hash [ 'rate' ]
78+ if found_hash [ 'rate' ] . abs < 1
79+ found_hash [ 'rate' ] *= 10
80+ elsif found_hash [ 'rate' ] . abs > 100
81+ found_hash [ 'rate' ] /= 10
82+ end
7883 end
7984 found_hash = swap_rates_if_needed ( found_hash , min_rate_percentage , max_rate_percentage )
8085 found_hash = decimate_rates_if_needed ( found_hash )
@@ -125,18 +130,28 @@ def self.decimate_rates_if_needed(found_hash)
125130 # @param found_hash [Hash] Hash of currently retrieved values
126131 # @return [Hash]
def self.set_base_and_value(reconstructed_hash, found_hash)
  # Copies the 'base' and 'value' amounts from found_hash into reconstructed_hash
  # and returns reconstructed_hash. Keys that cannot be determined are left untouched.
  base = found_hash['base']
  value = found_hash['value']

  if base && value
    # The base is kept as the larger amount and the value as the smaller one, which is
    # consistent with calculate_base (base = value / (rate / 100.0)). Magnitudes are
    # compared so that negative amounts are ordered the same way as positive ones.
    base, value = value, base if base.abs < value.abs
    reconstructed_hash['base'] = base
    reconstructed_hash['value'] = value
  elsif base
    reconstructed_hash['base'] = base
  elsif value
    reconstructed_hash['value'] = value
    # calculate_base reads reconstructed_hash['rate'].
    # NOTE(review): confirm the caller stores the rate before this method runs.
    calculate_base(reconstructed_hash)
  end

  reconstructed_hash
end
139147
# Derives the 'base' entry of a tax hash from its 'value' and 'rate' entries.
# Nothing is written when the rate is missing or not strictly positive.
# @param hash [Hash] Tax hash, read through its 'rate' and 'value' keys; 'base' is written in place.
# @return [Numeric, nil] The computed base, or nil when no base could be computed.
def self.calculate_base(hash)
  percentage = hash['rate']
  if percentage.nil? || !percentage.positive?
    nil
  else
    # The rate is expressed as a percentage, hence the division by 100.0.
    fraction = percentage / 100.0
    hash['base'] = hash['value'] / fraction
  end
end
154+
140155 # Extracts a single custom type of tax.
141156 # For the sake of simplicity, this only extracts the first example, unless specifically instructed otherwise.
142157 # @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] result of the OCR.
@@ -149,7 +164,6 @@ def self.extract_custom_tax(ocr_result, tax_names, min_rate_percentage = 0, max_
149164
150165 tax_names . sort!
151166 found_hash = pick_best ( extract_horizontal_tax ( ocr_result , tax_names ) , tax_names )
152- # a tax is considered found horizontally if it has a value, otherwise it is vertical
153167 if found_hash . nil? || found_hash [ 'value' ] . nil?
154168 found_hash = extract_vertical_tax ( ocr_result , tax_names ,
155169 found_hash )
@@ -240,14 +254,14 @@ def self.extract_horizontal_tax(ocr_result, tax_names)
240254 linear_pattern_percent_first = %r{
241255 ((?:\s *-\s *)?(?:\d *[.,])*\d +[ ]?%?|%?[ ]?(?:\s *-\s *)?(?:\d *[.,])*\d +)?[ .]?
242256 ([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]?
243- ((?:\s *-\s *)?(?:\d *[.,])+\d {2,} )?[ .]*
244- ((?:\s *-\s *)?(\d *[.,])*\d {2,} )?
257+ ((?:\s *-\s *)?(?:\d *[.,])+\d + )?[ .]*
258+ ((?:\s *-\s *)?(\d *[.,])*\d + )?
245259 }x
246260 linear_pattern_percent_second = %r{
247261 ([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]*
248262 ((?:\s *-\s *)?(?:\d *[.,])*\d +[ ]?%?|%?[ ]?(?:\s *-\s *)?(?:\d *[.,])*\d +)?[ .]?
249- ((?:\s *-\s *)?(?:\d *[.,])+\d {2,} )?[ .]*
250- ((?:\s *-\s *)?(\d *[.,])*\d {2,} )?
263+ ((?:\s *-\s *)?(?:\d *[.,])+\d + )?[ .]*
264+ ((?:\s *-\s *)?(\d *[.,])*\d + )?
251265 }x
252266 ocr_result . mvision_v1 . pages . each . with_index do |page , page_id |
253267 page . all_lines . each do |line |
@@ -304,7 +318,7 @@ def self.extract_vertical_tax(ocr_result, tax_names, found_hash)
304318 page . all_words . each do |word |
305319 next if match_index ( word . text , tax_names ) . nil?
306320
307- reconstructed_line = ocr_result . reconstruct_vertically ( word . polygon , page_id )
321+ reconstructed_line = ocr_result . reconstruct_vertically ( word . polygon , page_id , 0.25 )
308322 found_hash [ 'page_id' ] = page_id if found_hash [ 'page_id' ] . nil?
309323 found_hash [ 'code' ] = word . text . strip if found_hash [ 'code' ] . nil?
310324 found_hash = extract_vertical_tax_values ( reconstructed_line , found_hash )
@@ -316,8 +330,9 @@ def self.extract_vertical_tax(ocr_result, tax_names, found_hash)
# Marks the class-level helper methods listed below as private, so they cannot be called
# with an explicit receiver from outside the class.
316330 private_class_method :extract_percentage_from_tax , :extract_basis_and_value , :extract_tax_from_horizontal_line ,
317331 :extract_horizontal_tax , :extract_vertical_tax_values , :extract_vertical_tax ,
318332 :create_tax_field , :fix_rate , :pick_best , :calculate_score , :curate_values ,
319- :decimate_rates_if_needed , :extract_basis_and_value , : set_base_and_value, :valid_candidate? ,
320- :swap_rates_if_needed
333+ :decimate_rates_if_needed , :set_base_and_value , :valid_candidate? ,
334+ :swap_rates_if_needed , :calculate_base
321335 end
322336 end
323337end
338+ # rubocop:enable Metrics/ClassLength
0 commit comments