Merge pull request #7 from Oefenweb/between-quotes-pattern-doesnt-match-any-entries

tersmitten · tersmitten · commit 970e0a4acc8a · 2015-02-24T16:13:40.000+01:00
BETWEEN_QUOTES_PATTERN doesn't match any entries
diff --git a/Makefile b/Makefile
@@ -31,5 +31,6 @@ clean:
 	python setup.py clean
 	rm -rf build deb_dist debian dist MANIFEST *.egg-info deb_dist
 	find . -name '*.pyc' -print0 | xargs --no-run-if-empty -0 rm
-	find . -name '*.*coded' -print0 | xargs --no-run-if-empty -0 rm
+	find . -name '*.decoded' -print0 | xargs --no-run-if-empty -0 rm
+	find . -name '*.encode' -print0 | xargs --no-run-if-empty -0 rm
 	find . -name '*.actual' -print0 | xargs --no-run-if-empty -0 rm
diff --git a/bin/locale-decode-category b/bin/locale-decode-category
@@ -3,40 +3,41 @@
 # -*- coding: utf-8 -*-
 
 import sys
-import re
 from glibc_locale_tools.glibc_locale_tools import *
 
-lines = ''
+lines = []
 for line in sys.stdin:
-  lines += line
+  lines.append(line)
+lines_joined = ''.join(lines).decode('utf-8')
 
-lines = lines.decode('utf-8')
+unsafe_spans = get_unsafe_spans(lines, lines_joined)
 
 between_quotes_map = []
-for between_quotes in reversed(list(re.finditer(BETWEEN_QUOTES_PATTERN, lines))):
+for between_quotes in reversed(list(re.finditer(BETWEEN_QUOTES_PATTERN, lines_joined))):
   between_quotes_match = between_quotes.group(1)
   between_quotes_match_start = between_quotes.start(1)
   between_quotes_match_end = between_quotes.end(1)
 
+  if in_unsafe_spans(between_quotes_match_start, between_quotes_match_end, unsafe_spans):
+    continue
+
   replacement = between_quotes_match
   for unicode_matches in reversed(list(re.finditer(UNICODE_PATTERN, between_quotes_match))):
-    unicode_match = unicode_matches.group(2)
-    unicode_match_start = unicode_matches.start(1)
-    unicode_match_end = unicode_matches.end(1)
-
     replacement = replace_positional(replacement,
-                                     unicode_match_start, unicode_decode(unicode_match), unicode_match_end)
+                                     unicode_matches.start(1),
+                                     unicode_decode(unicode_matches.group(2)),
+                                     unicode_matches.end(1))
 
   between_quotes_map.append({'start': between_quotes_match_start,
                              'end': between_quotes_match_end,
                              'replacement': replacement})
 
 for between_quotes_map_items in between_quotes_map:
-  lines = replace_positional(lines,
-                             between_quotes_map_items['start'],
-                             between_quotes_map_items['replacement'],
-                             between_quotes_map_items['end'])
+  lines_joined = replace_positional(lines_joined,
+                                    between_quotes_map_items['start'],
+                                    between_quotes_map_items['replacement'],
+                                    between_quotes_map_items['end'])
 
-sys.stdout.write(lines.encode('utf-8'))
+sys.stdout.write(lines_joined.encode('utf-8'))
 
 sys.exit(0)
diff --git a/bin/locale-encode-category b/bin/locale-encode-category
@@ -3,40 +3,41 @@
 # -*- coding: utf-8 -*-
 
 import sys
-import re
 from glibc_locale_tools.glibc_locale_tools import *
 
-lines = ''
+lines = []
 for line in sys.stdin:
-  lines += line
+  lines.append(line)
+lines_joined = ''.join(lines).decode('utf-8')
 
-lines = lines.decode('utf-8')
+unsafe_spans = get_unsafe_spans(lines, lines_joined)
 
 between_quotes_map = []
-for between_quotes in reverse_iter(re.finditer(BETWEEN_QUOTES_PATTERN, lines)):
+for between_quotes in reverse_iter(re.finditer(BETWEEN_QUOTES_PATTERN, lines_joined)):
   between_quotes_match = between_quotes.group(1)
   between_quotes_match_start = between_quotes.start(1)
   between_quotes_match_end = between_quotes.end(1)
 
+  if in_unsafe_spans(between_quotes_match_start, between_quotes_match_end, unsafe_spans):
+    continue
+
   replacement = between_quotes_match
   for to_decode_matches in reversed(list(re.finditer(TO_DECODE_PATTERN, between_quotes_match))):
-    to_decode_match = to_decode_matches.group(0)
-    to_decode_match_start = to_decode_matches.start(0)
-    to_decode_match_end = to_decode_matches.end(0)
-
     replacement = replace_positional(replacement,
-                                     to_decode_match_start, unicode_encode(to_decode_match), to_decode_match_end)
+                                     to_decode_matches.start(0),
+                                     unicode_encode(to_decode_matches.group(0)),
+                                     to_decode_matches.end(0))
 
   between_quotes_map.append({'start': between_quotes_match_start,
                              'end': between_quotes_match_end,
                              'replacement': replacement})
 
 for between_quotes_map_items in between_quotes_map:
-  lines = replace_positional(lines,
-                             between_quotes_map_items['start'],
-                             between_quotes_map_items['replacement'],
-                             between_quotes_map_items['end'])
+  lines_joined = replace_positional(lines_joined,
+                                    between_quotes_map_items['start'],
+                                    between_quotes_map_items['replacement'],
+                                    between_quotes_map_items['end'])
 
-sys.stdout.write(lines.encode('utf-8'))
+sys.stdout.write(lines_joined.encode('utf-8'))
 
 sys.exit(0)
diff --git a/glibc_locale_tools/glibc_locale_tools.py b/glibc_locale_tools/glibc_locale_tools.py
@@ -1,6 +1,9 @@
 # -*- coding: utf-8 -*-
 
-BETWEEN_QUOTES_PATTERN = r'^[^%].*"([^"]*)"'
+import re
+
+
+BETWEEN_QUOTES_PATTERN = r'"([^"]*)"'
 """
 A re pattern to match a between quotes section, that is not a comment.
 """
@@ -10,7 +13,7 @@
 A re pattern to match a unicode char (e.g. <U002D>).
 """
 
-TO_DECODE_PATTERN = r'[^\/\n]{1}'
+TO_DECODE_PATTERN = r'(/(?!\n)|[^/\n]){1}'
 """
 A re pattern to match a string section that needs decoding.
 """
@@ -36,6 +39,11 @@
 A re pattern to match a escape_char line.
 """
 
+COMMENT_LINE_WITH_QUOTES_PATTERN = r'^%.*"[^"]*"'
+"""
+A re pattern to match a comment line with a between quotes section.
+"""
+
 
 def unicode_decode(unicode_char):
   """
@@ -82,3 +90,57 @@ def reverse_iter(iterator):
   """
 
   return reversed(list(iterator))
+
+
+def between_range(range1, range2):
+  """
+  Checks whether or not range 2 lies within range 1 (bounds inclusive).
+
+  :param range1: A range
+  :param range2: A range
+  :return: Whether or not range 2 is between range 1
+  """
+
+  return range1['start'] <= range2['start'] <= range1['end'] and range1['start'] <= range2['end'] <= range1['end']
+
+
+def in_unsafe_spans(match_start, match_end, unsafe_spans):
+  """
+  Checks whether or not a range (match start and end) is in unsafe spans.
+
+  :param match_start: A match start position
+  :param match_end: A match end position
+  :param unsafe_spans: A list of unsafe spans
+  :return: Whether or not a range is in unsafe spans
+  """
+
+  for unsafe_span in unsafe_spans:
+    if between_range(unsafe_span, {'start': match_start, 'end': match_end}):
+      return True
+
+  return False
+
+
+def get_unsafe_spans(lines, lines_joined):
+  """
+  Generates a list of unsafe spans.
+
+  Unsafe spans are comment lines that contain double quotes (that should not be (en|de)coded).
+
+  :param lines: A list of lines
+  :param lines_joined: A string of lines
+  :return: A list of unsafe spans
+  """
+
+  unsafe_lines = []
+  for line in lines:
+    if re.search(COMMENT_LINE_WITH_QUOTES_PATTERN, line):
+      unsafe_lines.append(line)
+
+  unsafe_lines_pattern = '({0})'.format('|'.join(map(re.escape, unsafe_lines)))
+
+  unsafe_spans = []
+  for unsafe_line in re.finditer(unsafe_lines_pattern, lines_joined):
+    unsafe_spans.append({'start': unsafe_line.start(0), 'end': unsafe_line.end(0)})
+
+  return unsafe_spans
diff --git a/glibc_locale_tools/test/data/da_DK.LC_TIME.expected-decoded b/glibc_locale_tools/test/data/da_DK.LC_TIME.expected-decoded
@@ -0,0 +1,42 @@
+comment_char %
+escape_char /
+abday       "søn";"man";/
+            "tir";"ons";/
+            "tor";"fre";/
+            "lør"
+day         "søndag";/
+            "mandag";/
+            "tirsdag";/
+            "onsdag";/
+            "torsdag";/
+            "fredag";/
+            "lørdag"
+abmon       "jan";"feb";/
+            "mar";"apr";/
+            "maj";"jun";/
+            "jul";"aug";/
+            "sep";"okt";/
+            "nov";"dec"
+mon         "januar";/
+            "februar";/
+            "marts";/
+            "april";/
+            "maj";/
+            "juni";/
+            "juli";/
+            "august";/
+            "september";/
+            "oktober";/
+            "november";/
+            "december"
+d_t_fmt     "%a %d %b %Y %T %Z"
+d_fmt       "%d-%m-%Y"
+t_fmt       "%T"
+am_pm       "";""
+t_fmt_ampm  ""
+date_fmt	"%a %b %e/
+ %H:%M:%S /
+%Z %Y"
+week    7;19971130;4
+first_weekday 2
+first_workday 2
diff --git a/glibc_locale_tools/test/data/en_US.LC_TIME.expected-decoded b/glibc_locale_tools/test/data/en_US.LC_TIME.expected-decoded
@@ -0,0 +1,60 @@
+comment_char %
+escape_char /
+abday	"Sun";"Mon";/
+	"Tue";"Wed";/
+	"Thu";"Fri";/
+	"Sat"
+day	"Sunday";/
+	"Monday";/
+	"Tuesday";/
+	"Wednesday";/
+	"Thursday";/
+	"Friday";/
+	"Saturday"
+
+week    7;19971130;7
+first_weekday	1
+first_workday	2
+abmon	"Jan";"Feb";/
+	"Mar";"Apr";/
+	"May";"Jun";/
+	"Jul";"Aug";/
+	"Sep";"Oct";/
+	"Nov";"Dec"
+mon	"January";/
+	"February";/
+	"March";/
+	"April";/
+	"May";/
+	"June";/
+	"July";/
+	"August";/
+	"September";/
+	"October";/
+	"November";/
+	"December"
+% Appropriate date and time representation (%c)
+%	"%a %d %b %Y %r %Z"
+d_t_fmt "%a %d %b %Y %r %Z"
+%
+% Appropriate date representation (%x)
+%	"%m/%d/%Y"
+d_fmt   "%m/%d/%Y"
+%
+% Appropriate time representation (%X)
+%	"%r"
+t_fmt   "%r"
+%
+% Appropriate AM/PM time representation (%r)
+%	"%I:%M:%S %p"
+t_fmt_ampm "%I:%M:%S /
+%p"
+%
+% Strings for AM/PM
+%
+am_pm	"AM";"PM"
+%
+% Appropriate date representation (date(1))   "%a %b %e %H:%M:%S %Z %Y"
+date_fmt	"%a %b %e/
+ %H:%M:%S /
+%Z %Y"
diff --git a/glibc_locale_tools/test/data/nl_NL.LC_TIME.expected-decoded b/glibc_locale_tools/test/data/nl_NL.LC_TIME.expected-decoded
@@ -0,0 +1,42 @@
+comment_char %
+escape_char  /
+abday   "zo";"ma";"di";/
+	"wo";"do";"vr";/
+	"za"
+day     "zondag";/
+	"maandag";/
+	"dinsdag";/
+	"woensdag";/
+	"donderdag";/
+	"vrijdag";/
+	"zaterdag"
+abmon   "jan";"feb";/
+	"mrt";"apr";/
+	"mei";"jun";/
+	"jul";"aug";/
+	"sep";"okt";/
+	"nov";"dec"
+mon     "januari";/
+	"februari";/
+	"maart";/
+	"april";/
+	"mei";/
+	"juni";/
+	"juli";/
+	"augustus";/
+	"september";/
+	"oktober";/
+	"november";/
+	"december"
+d_t_fmt "%a %d %b %Y %T %Z"
+d_fmt   "%d-%m-%y"
+t_fmt   "%T"
+am_pm   "";""
+t_fmt_ampm ""
+date_fmt       "%a %b %e/
+ %H:%M:%S /
+%Z %Y"
+
+week    7;19971130;4
+first_weekday 2
+first_workday 2
diff --git a/glibc_locale_tools/test/data/tr_TR.LC_TIME.expected-decoded b/glibc_locale_tools/test/data/tr_TR.LC_TIME.expected-decoded
diff --git a/glibc_locale_tools/test/test-category-decode-encode b/glibc_locale_tools/test/test-category-decode-encode
diff --git a/glibc_locale_tools/test/test_helper_functions.py b/glibc_locale_tools/test/test_helper_functions.py