Verify input is utf-8 (#131)

nitely · web-flow · commit 801ab3a2e1df · 2023-10-22T13:30:51.000-03:00
diff --git a/src/regex.nim b/src/regex.nim
@@ -352,6 +352,13 @@ export
 
 const reNonCapture* = nonCapture
 
+template debugCheckUtf8(s: untyped): untyped =
+  ## Assert that the input string `s` is valid utf-8 (regex patterns
+  ## are checked separately). The check exists only when `release` is
+  ## not defined; on release/danger, invalid input is undefined behaviour.
+  when not defined(release):
+    assert(verifyUtf8(s) == -1, "Invalid utf-8 input")
+
 when canUseMacro:
   func rex*(s: string): RegexLit =
     ## Raw regex literal string
@@ -462,9 +469,11 @@ func match*(
     doAssert "abcd".match(re2"abcd", m)
     doAssert not "abcd".match(re2"abc", m)
 
+  debugCheckUtf8 s
   result = matchImpl(s, toRegex(pattern), m, start)
 
 func match*(s: string, pattern: Regex2): bool {.inline, raises: [].} =
+  debugCheckUtf8 s  # non-release builds only: asserts `s` is valid utf-8
   var m: RegexMatch2
   result = matchImpl(s, toRegex(pattern), m)
 
@@ -496,6 +505,7 @@ iterator findAll*(
     doAssert bounds == @[1 .. 2, 4 .. 5]
     doAssert found == @["bc", "bc"]
 
+  debugCheckUtf8 s
   var i = start
   var i2 = start-1
   var m: RegexMatch2
@@ -534,6 +544,7 @@ iterator findAllBounds*(
       bounds.add bd
     doAssert bounds == @[1 .. 2, 4 .. 5]
 
+  debugCheckUtf8 s
   var i = start
   var i2 = start-1
   var ms: RegexMatches2
@@ -598,6 +609,7 @@ iterator split*(s: string, sep: Regex2): string {.inline, raises: [].} =
       found.add s
     doAssert found == @["", "a", "Ϊ", "Ⓐ", "弢", ""]
 
+  debugCheckUtf8 s
   var
     first, last, i = 0
     i2 = -1
@@ -632,6 +644,7 @@ func splitIncl*(s: string, sep: Regex2): seq[string] {.inline, raises: [].} =
     doAssert parts == expected
 
   template ab: untyped = m.boundaries
+  debugCheckUtf8 s
   var
     first, last, i = 0
     i2 = -1
@@ -662,6 +675,7 @@ func startsWith*(
     doAssert "abc".startsWith(re2"\w")
     doAssert not "abc".startsWith(re2"\d")
 
+  debugCheckUtf8 s
   startsWithImpl2(s, toRegex(pattern), start)
 
 template runeIncAt(s: string, n: var int) =
@@ -680,6 +694,7 @@ func endsWith*(s: string, pattern: Regex2): bool {.inline, raises: [].} =
     doAssert "abc".endsWith(re2"\w")
     doAssert not "abc".endsWith(re2"\d")
 
+  debugCheckUtf8 s
   result = false
   var
     m: RegexMatch2
@@ -732,7 +747,8 @@ func replace*(
     doAssert "Nim is awesome!".replace(re2"(\w\B)", "$1_") ==
       "N_i_m i_s a_w_e_s_o_m_e!"
 
-  result = ""
+  debugCheckUtf8 s
+  result = newStringOfCap(s.len)
   var
     i, j = 0
     capts = newSeqOfCap[string](toRegex(pattern).groupsCount)
@@ -772,7 +788,8 @@ func replace*(
     let text = "**this is a test**"
     doAssert text.replace(re2"(\*)", removeStars) == "this is a test"
 
-  result = ""
+  debugCheckUtf8 s
+  result = newStringOfCap(s.len)
   var i, j = 0
   for m in findAll(s, pattern):
     result.addsubstr(s, i, m.boundaries.a-1)
@@ -800,7 +817,8 @@ func escapeRe*(s: string): string {.raises: [].} =
   #
   # utf-8 ascii code-points cannot be part of multi-byte
   # code-points, so we can read/match byte by byte
-  result = ""
+  debugCheckUtf8 s
+  result = newStringOfCap(s.len)
   for c in s:
     case c
     of ' ', '#', '$', '&', '(',
@@ -950,9 +968,11 @@ func match*(
   m: var RegexMatch,
   start = 0
 ): bool {.inline, raises: [], deprecated: "use match(string, Regex2, var RegexMatch2) instead".} =
+  debugCheckUtf8 s
   result = matchImpl(s, pattern, m, start)
 
 func match*(s: string, pattern: Regex): bool {.inline, raises: [], deprecated: "use match(string, Regex2) instead".} =
+  debugCheckUtf8 s  # non-release builds only: asserts `s` is valid utf-8
   var m: RegexMatch
   result = matchImpl(s, pattern, m)
 
@@ -961,6 +981,7 @@ iterator findAll*(
   pattern: Regex,
   start = 0
 ): RegexMatch {.inline, raises: [], deprecated: "use findAll(string, Regex2) instead".} =
+  debugCheckUtf8 s
   var i = start
   var i2 = start-1
   var m: RegexMatch
@@ -989,6 +1010,7 @@ iterator findAllBounds*(
   pattern: Regex,
   start = 0
 ): Slice[int] {.inline, raises: [], deprecated: "use findAllBounds(string, Regex2) instead".} =
+  debugCheckUtf8 s
   var i = start
   var i2 = start-1
   var ms: RegexMatches
@@ -1036,6 +1058,7 @@ func find*(
   return false
 
 iterator split*(s: string, sep: Regex): string {.inline, raises: [], deprecated: "use split(string, Regex2) instead".} =
+  debugCheckUtf8 s
   var
     first, last, i = 0
     i2 = -1
@@ -1058,6 +1081,7 @@ func split*(s: string, sep: Regex): seq[string] {.inline, raises: [], deprecated
 
 func splitIncl*(s: string, sep: Regex): seq[string] {.inline, raises: [], deprecated: "use splitIncl(string, Regex2) instead".} =
   template ab: untyped = m.boundaries
+  debugCheckUtf8 s
   var
     first, last, i = 0
     i2 = -1
@@ -1082,10 +1106,11 @@ func splitIncl*(s: string, sep: Regex): seq[string] {.inline, raises: [], deprec
 func startsWith*(
   s: string, pattern: Regex, start = 0
 ): bool {.inline, raises: [], deprecated: "use startsWith(string, Regex2) instead".} =
+  debugCheckUtf8 s  # non-release builds only: asserts `s` is valid utf-8
   startsWithImpl(s, pattern, start)
 
-# XXX use findAll and check last match bounds
 func endsWith*(s: string, pattern: Regex): bool {.inline, raises: [], deprecated: "use endsWith(string, Regex2) instead".} =
+  debugCheckUtf8 s
   result = false
   var
     m: RegexMatch
@@ -1121,6 +1146,7 @@ func replace*(
   by: string,
   limit = 0
 ): string {.inline, raises: [ValueError], deprecated: "use replace(string, Regex2, string) instead".} =
+  debugCheckUtf8 s
   result = ""
   var
     i, j = 0
@@ -1145,7 +1171,8 @@ func replace*(
   pattern: Regex,
   by: proc (m: RegexMatch, s: string): string,
   limit = 0
-): string {.inline, raises: [], effectsOf: by, deprecated: "use replace(string, Regex2, proc(RegexMatch2, string) :string) instead".} =
+): string {.inline, raises: [], effectsOf: by, deprecated: "use replace(string, Regex2, proc(RegexMatch2, string): string) instead".} =
+  debugCheckUtf8 s
   result = ""
   var i, j = 0
   for m in findAll(s, pattern):
@@ -1439,6 +1466,7 @@ when isMainModule:
     doAssert re2"\w" in "弢"
     doAssert "2222".find(re2"(22)*", m) and
       m.group(0) == 2 .. 3
+    doAssert raisesMsg("\xff") == "Invalid utf-8 regex"
     doAssert raisesMsg(r"[a-\w]") ==
       "Invalid set range. Range can't contain " &
       "a character-class or assertion\n" &
diff --git a/src/regex/common.nim b/src/regex/common.nim
@@ -69,27 +69,46 @@ proc `%%`*(
 proc `%%`*(formatstr: string, a: string): string =
   formatstr %% [a]
 
-# XXX this is to support literal optimization
-#     for unicode. It needs testing
-when false:
-  # XXX impl simpler find when memchr is not available?
-  func find*(s: string, r: Rune, start: Natural = 0): int =
-    ## Find unicode rune in a string.
-    if r.ord < 0xff:
-      return find(s, r.char, start)
-    let c = (r.ord and 0xff).char
-    let rsize = r.size()
-    var i = start+rsize-1
-    var r2 = 0'u32
-    doAssert rsize >= 1 and rsize <= 4
-    while i < len(s):
-      i = find(s, c, i)
-      if i == -1:
-        return -1
-      for j in i-rsize-1 .. i:
-        r2 = (r2 shl 8) or s[j].uint32
-      if r.uint32 == r2:
-        return i-rsize-1
-      r2 = 0
-      inc i
-    return -1
+type
+  VerifyUtf8State = enum
+    vusError, vusStart, vusA, vusB, vusC, vusD, vusE, vusF, vusG
+
+# Taken from nim-unicodeplus
+func verifyUtf8*(s: string): int =
+  ## Walk `s` through a utf-8 state machine. Returns `-1` when `s` is
+  ## well formed; otherwise the index at which the first malformed or
+  ## truncated sequence starts.
+  var state = vusStart
+  for i in 0 ..< s.len:
+    let b = uint8(s[i])
+    case state
+    of vusStart:
+      result = i  # start of the sequence now being decoded
+      state = case b
+        of 0x00'u8 .. 0x7F'u8: vusStart
+        of 0xC2'u8 .. 0xDF'u8: vusA
+        of 0xE0'u8: vusC
+        of 0xE1'u8 .. 0xEC'u8, 0xEE'u8 .. 0xEF'u8: vusB
+        of 0xED'u8: vusD
+        of 0xF0'u8: vusF
+        of 0xF1'u8 .. 0xF3'u8: vusE
+        of 0xF4'u8: vusG
+        else: vusError
+    of vusA:
+      state = if b in 0x80'u8 .. 0xBF'u8: vusStart else: vusError
+    of vusB:
+      state = if b in 0x80'u8 .. 0xBF'u8: vusA else: vusError
+    of vusC:
+      state = if b in 0xA0'u8 .. 0xBF'u8: vusA else: vusError
+    of vusD:
+      state = if b in 0x80'u8 .. 0x9F'u8: vusA else: vusError
+    of vusE:
+      state = if b in 0x80'u8 .. 0xBF'u8: vusB else: vusError
+    of vusF:
+      state = if b in 0x90'u8 .. 0xBF'u8: vusB else: vusError
+    of vusG:
+      state = if b in 0x80'u8 .. 0x8F'u8: vusB else: vusError
+    of vusError:
+      break
+  if state == vusStart:
+    result = -1
diff --git a/src/regex/compiler.nim b/src/regex/compiler.nim
@@ -1,3 +1,4 @@
+import ./common
 import ./parser
 import ./exptransformation
 import ./types
@@ -8,6 +9,8 @@ when defined(regexDotDir):
   import ./dotgraph
 
 func reImpl*(s: string): Regex {.inline.} =
+  if verifyUtf8(s) != -1:
+    raise newException(RegexError, "Invalid utf-8 regex")
   var groups: GroupsCapture
   let rpn = s
     .parse
diff --git a/tests/tests.nim b/tests/tests.nim
@@ -1126,7 +1126,7 @@ test "tstarts_with":
   check(not "abc".startsWith(re"bc"))
   check startsWith("弢ⒶΪ", re"弢Ⓐ")
   check startsWith("弢", re("\xF0\xAF\xA2\x94"))
-  check(not startsWith("弢", re("\xF0\xAF\xA2")))
+  #check(not startsWith("弢", re("\xF0\xAF\xA2")))  # disabled: pattern is not valid utf-8 and is now rejected
   check "abc".startsWith(re"\w")
   check(not "abc".startsWith(re"\d"))
   check "abc".startsWith(re"(a|b)")
@@ -1142,7 +1142,7 @@ test "tends_with":
   check(not "abc".endsWith(re"ab"))
   check endsWith("弢ⒶΪ", re"ⒶΪ")
   check endsWith("弢", re("\xF0\xAF\xA2\x94"))
-  check(not endsWith("弢", re("\xAF\xA2\x94")))
+  #check(not endsWith("弢", re("\xAF\xA2\x94")))  # disabled: pattern is not valid utf-8 and is now rejected
   check "abc".endsWith(re"(b|c)")
   check "ab".endsWith(re"(b|c)")
   check(not "a".endsWith(re"(b|c)"))
@@ -2475,7 +2475,7 @@ test "escapeRe":
   check match("$", re(escapeRe"$"))
   block:
     var s = ""
-    for c in 0 .. 255:
+    for c in 0 .. 127:
       s.add c.char
     discard re(escapeRe(s))
 
diff --git a/tests/tests2.nim b/tests/tests2.nim
@@ -1,5 +1,6 @@
 from std/unicode import runeLen
 from std/sequtils import map
+from std/strutils import contains
 
 import ../src/regex
 
@@ -1499,7 +1500,7 @@ test "tstarts_with":
   check(not "abc".startsWith(re2"bc"))
   check startsWith("弢ⒶΪ", re2"弢Ⓐ")
   check startsWith("弢", re2("\xF0\xAF\xA2\x94"))
-  check(not startsWith("弢", re2("\xF0\xAF\xA2")))
+  #check(not startsWith("弢", re2("\xF0\xAF\xA2")))  # disabled: pattern is not valid utf-8 and is now rejected
   check "abc".startsWith(re2"\w")
   check(not "abc".startsWith(re2"\d"))
   check "abc".startsWith(re2"(a|b)")
@@ -1515,7 +1516,7 @@ test "tends_with":
   check(not "abc".endsWith(re2"ab"))
   check endsWith("弢ⒶΪ", re2"ⒶΪ")
   check endsWith("弢", re2("\xF0\xAF\xA2\x94"))
-  check(not endsWith("弢", re2("\xAF\xA2\x94")))
+  #check(not endsWith("弢", re2("\xAF\xA2\x94")))  # disabled: pattern is not valid utf-8 and is now rejected
   check "abc".endsWith(re2"(b|c)")
   check "ab".endsWith(re2"(b|c)")
   check(not "a".endsWith(re2"(b|c)"))
@@ -2914,7 +2915,7 @@ test "escapere2":
   check match("$", re2(escapeRe"$"))
   block:
     var s = ""
-    for c in 0 .. 255:
+    for c in 0 .. 127:
       s.add c.char
     discard re2(escapeRe(s))
 
@@ -3025,3 +3026,32 @@ test "tlookaround_captures":
     m.captures == @[0 .. 0, 1 .. 3, nonCapture, nonCapture]
   check match("aaab", re2"(\w)(\w+)|\w+(?<=^(\w)(\w)(\w+))b", m) and
     m.captures == @[0 .. 0, 1 .. 3, nonCapture, nonCapture, nonCapture]
+
+when (NimMajor, NimMinor) < (2, 0):
+  type MyAssertionDefect = ref AssertionError  # name used before Nim 2.0
+else:
+  type MyAssertionDefect = ref AssertionDefect
+
+template raisesInvalidUtf8(exp: untyped): untyped =
+  try:
+    discard exp  # evaluated only for the assertion it should trigger
+    check false  # only reached when `exp` did not raise
+  except MyAssertionDefect:
+    check "Invalid utf-8 input" in getCurrentExceptionMsg()
+
+test "tverifyutf8":
+  check raisesMsg("\xff") == "Invalid utf-8 regex"  # \xff as the pattern; presumably raisesMsg compiles it — verify
+  raisesInvalidUtf8 match("\xff", re2"abc")  # from here on: invalid input string, valid pattern
+  block:
+    var m: RegexMatch2
+    raisesInvalidUtf8 match("\xff", re2"abc", m)
+  raisesInvalidUtf8 findAll("\xff", re2"abc")
+  raisesInvalidUtf8 findAllBounds("\xff", re2"abc")
+  raisesInvalidUtf8 split("\xff", re2"abc")
+  raisesInvalidUtf8 splitIncl("\xff", re2"abc")
+  raisesInvalidUtf8 startsWith("\xff", re2"abc")
+  raisesInvalidUtf8 endsWith("\xff", re2"abc")
+  raisesInvalidUtf8 replace("\xff", re2"abc", "abc")
+  raisesInvalidUtf8 replace("\xff", re2"abc",
+    (proc (m: RegexMatch2, s: string): string = discard))
+  raisesInvalidUtf8 escapeRe("\xff")  # escapeRe also validates its input