@@ -352,6 +352,13 @@ export
352352
353353const reNonCapture* = nonCapture
354354
template debugCheckUtf8(s: untyped): untyped =
  ## Debug-only sanity check for input strings (regex patterns
  ## are already checked when they are built).
  ##
  ## Unless `release` is defined, asserts that `verifyUtf8(s)`
  ## returns `-1` and fails with "Invalid utf-8 input" otherwise.
  ## On release/danger builds no check is made and the behaviour
  ## on invalid utf-8 input is undefined.
  when defined(release):
    discard
  else:
    assert(verifyUtf8(s) == -1, "Invalid utf-8 input")
361+
355362when canUseMacro:
356363 func rex * (s: string ): RegexLit =
357364 # # Raw regex literal string
@@ -462,9 +469,11 @@ func match*(
462469 doAssert " abcd" .match (re2 " abcd" , m)
463470 doAssert not " abcd" .match (re2 " abc" , m)
464471
472+ debugCheckUtf8 s
465473 result = matchImpl (s, toRegex (pattern), m, start)
466474
func match*(s: string, pattern: Regex2): bool {.inline, raises: [].} =
  ## Boolean-only overload of `match`: runs `matchImpl` on `s`
  ## with `pattern` and returns its result. Capture data is
  ## written to a local scratch value and thrown away.
  ##
  ## In non-release builds the input is first checked with
  ## `debugCheckUtf8`.
  debugCheckUtf8 s
  var scratch: RegexMatch2
  matchImpl(s, toRegex(pattern), scratch)
470479
@@ -496,6 +505,7 @@ iterator findAll*(
496505 doAssert bounds == @ [1 .. 2 , 4 .. 5 ]
497506 doAssert found == @ [" bc" , " bc" ]
498507
508+ debugCheckUtf8 s
499509 var i = start
500510 var i2 = start- 1
501511 var m: RegexMatch2
@@ -534,6 +544,7 @@ iterator findAllBounds*(
534544 bounds.add bd
535545 doAssert bounds == @ [1 .. 2 , 4 .. 5 ]
536546
547+ debugCheckUtf8 s
537548 var i = start
538549 var i2 = start- 1
539550 var ms: RegexMatches2
@@ -598,6 +609,7 @@ iterator split*(s: string, sep: Regex2): string {.inline, raises: [].} =
598609 found.add s
599610 doAssert found == @ [" " , " a" , " Ϊ" , " Ⓐ" , " 弢" , " " ]
600611
612+ debugCheckUtf8 s
601613 var
602614 first, last, i = 0
603615 i2 = - 1
@@ -632,6 +644,7 @@ func splitIncl*(s: string, sep: Regex2): seq[string] {.inline, raises: [].} =
632644 doAssert parts == expected
633645
634646 template ab : untyped = m.boundaries
647+ debugCheckUtf8 s
635648 var
636649 first, last, i = 0
637650 i2 = - 1
@@ -662,6 +675,7 @@ func startsWith*(
662675 doAssert " abc" .startsWith (re2 " \ w" )
663676 doAssert not " abc" .startsWith (re2 " \ d" )
664677
678+ debugCheckUtf8 s
665679 startsWithImpl2 (s, toRegex (pattern), start)
666680
667681template runeIncAt (s: string , n: var int ) =
@@ -680,6 +694,7 @@ func endsWith*(s: string, pattern: Regex2): bool {.inline, raises: [].} =
680694 doAssert " abc" .endsWith (re2 " \ w" )
681695 doAssert not " abc" .endsWith (re2 " \ d" )
682696
697+ debugCheckUtf8 s
683698 result = false
684699 var
685700 m: RegexMatch2
@@ -732,7 +747,8 @@ func replace*(
732747 doAssert " Nim is awesome!" .replace (re2 " (\ w\B )" , " $1_" ) ==
733748 " N_i_m i_s a_w_e_s_o_m_e!"
734749
735- result = " "
750+ debugCheckUtf8 s
751+ result = newStringOfCap (s.len)
736752 var
737753 i, j = 0
738754 capts = newSeqOfCap [string ](toRegex (pattern).groupsCount)
@@ -772,7 +788,8 @@ func replace*(
772788 let text = " **this is a test**"
773789 doAssert text.replace (re2 " (\ *)" , removeStars) == " this is a test"
774790
775- result = " "
791+ debugCheckUtf8 s
792+ result = newStringOfCap (s.len)
776793 var i, j = 0
777794 for m in findAll (s, pattern):
778795 result .addsubstr (s, i, m.boundaries.a- 1 )
@@ -800,7 +817,8 @@ func escapeRe*(s: string): string {.raises: [].} =
800817 #
801818 # utf-8 ascii code-points cannot be part of multi-byte
802819 # code-points, so we can read/match byte by byte
803- result = " "
820+ debugCheckUtf8 s
821+ result = newStringOfCap (s.len)
804822 for c in s:
805823 case c
806824 of ' ' , '#' , '$' , '&' , '(' ,
@@ -950,9 +968,11 @@ func match*(
950968 m: var RegexMatch ,
951969 start = 0
952970): bool {.inline , raises : [], deprecated : " use match(string, Regex2, var RegexMatch2) instead" .} =
971+ debugCheckUtf8 s
953972 result = matchImpl (s, pattern, m, start)
954973
func match*(s: string, pattern: Regex): bool {.inline, raises: [], deprecated: "use match(string, Regex2) instead".} =
  ## Deprecated boolean-only overload of `match` for the old
  ## `Regex` type: returns the result of `matchImpl` on `s`,
  ## discarding the match data it fills in.
  ##
  ## In non-release builds the input is first checked with
  ## `debugCheckUtf8`.
  debugCheckUtf8 s
  var scratch: RegexMatch
  matchImpl(s, pattern, scratch)
958978
@@ -961,6 +981,7 @@ iterator findAll*(
961981 pattern: Regex ,
962982 start = 0
963983): RegexMatch {.inline , raises : [], deprecated : " use findAll(string, Regex2) instead" .} =
984+ debugCheckUtf8 s
964985 var i = start
965986 var i2 = start- 1
966987 var m: RegexMatch
@@ -989,6 +1010,7 @@ iterator findAllBounds*(
9891010 pattern: Regex ,
9901011 start = 0
9911012): Slice [int ] {.inline , raises : [], deprecated : " use findAllBounds(string, Regex2) instead" .} =
1013+ debugCheckUtf8 s
9921014 var i = start
9931015 var i2 = start- 1
9941016 var ms: RegexMatches
@@ -1036,6 +1058,7 @@ func find*(
10361058 return false
10371059
10381060iterator split * (s: string , sep: Regex ): string {.inline , raises : [], deprecated : " use split(string, Regex2) instead" .} =
1061+ debugCheckUtf8 s
10391062 var
10401063 first, last, i = 0
10411064 i2 = - 1
@@ -1058,6 +1081,7 @@ func split*(s: string, sep: Regex): seq[string] {.inline, raises: [], deprecated
10581081
10591082func splitIncl * (s: string , sep: Regex ): seq [string ] {.inline , raises : [], deprecated : " use splitIncl(string, Regex2) instead" .} =
10601083 template ab : untyped = m.boundaries
1084+ debugCheckUtf8 s
10611085 var
10621086 first, last, i = 0
10631087 i2 = - 1
@@ -1082,10 +1106,11 @@ func splitIncl*(s: string, sep: Regex): seq[string] {.inline, raises: [], deprec
func startsWith*(s: string, pattern: Regex, start = 0): bool {.inline, raises: [], deprecated: "use startsWith(string, Regex2) instead".} =
  ## Deprecated `startsWith` for the old `Regex` type: forwards
  ## `s`, `pattern` and `start` to `startsWithImpl` and returns
  ## its result.
  ##
  ## In non-release builds the input is first checked with
  ## `debugCheckUtf8`.
  debugCheckUtf8 s
  result = startsWithImpl(s, pattern, start)
10861111
1087- # XXX use findAll and check last match bounds
10881112func endsWith * (s: string , pattern: Regex ): bool {.inline , raises : [], deprecated : " use endsWith(string, Regex2) instead" .} =
1113+ debugCheckUtf8 s
10891114 result = false
10901115 var
10911116 m: RegexMatch
@@ -1121,6 +1146,7 @@ func replace*(
11211146 by: string ,
11221147 limit = 0
11231148): string {.inline , raises : [ValueError ], deprecated : " use replace(string, Regex2, string) instead" .} =
1149+ debugCheckUtf8 s
11241150 result = " "
11251151 var
11261152 i, j = 0
@@ -1145,7 +1171,8 @@ func replace*(
11451171 pattern: Regex ,
11461172 by: proc (m: RegexMatch , s: string ): string ,
11471173 limit = 0
1148- ): string {.inline , raises : [], effectsOf : by, deprecated : " use replace(string, Regex2, proc(RegexMatch2, string) :string) instead" .} =
1174+ ): string {.inline , raises : [], effectsOf : by, deprecated : " use replace(string, Regex2, proc(RegexMatch2, string): string) instead" .} =
1175+ debugCheckUtf8 s
11491176 result = " "
11501177 var i, j = 0
11511178 for m in findAll (s, pattern):
@@ -1439,6 +1466,7 @@ when isMainModule:
14391466 doAssert re2 " \ w" in " 弢"
14401467 doAssert " 2222" .find (re2 " (22)*" , m) and
14411468 m.group (0 ) == 2 .. 3
1469+ doAssert raisesMsg (" \xff " ) == " Invalid utf-8 regex"
14421470 doAssert raisesMsg (r " [a-\w] " ) ==
14431471 " Invalid set range. Range can't contain " &
14441472 " a character-class or assertion\n " &
0 commit comments