Skip to content

Commit 801ab3a

Browse files
authored
Verify input is utf-8 (#131)
1 parent 0fd81c5 commit 801ab3a

File tree

5 files changed

+115
-35
lines changed

5 files changed

+115
-35
lines changed

src/regex.nim

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,13 @@ export
352352

353353
const reNonCapture* = nonCapture
354354

355+
template debugCheckUtf8(s: untyped): untyped =
  ## Debug-only validation of an *input* string `s`.
  ##
  ## This is for input strings. Regexes are already checked
  ## when they are compiled.
  ## When `release` is defined the check is compiled out, and the
  ## behaviour on invalid utf-8 input is undefined.
  when not defined(release):
    # `verifyUtf8` returns -1 for well-formed input
    assert(verifyUtf8(s) == -1, "Invalid utf-8 input")
361+
355362
when canUseMacro:
356363
func rex*(s: string): RegexLit =
357364
## Raw regex literal string
@@ -462,9 +469,11 @@ func match*(
462469
doAssert "abcd".match(re2"abcd", m)
463470
doAssert not "abcd".match(re2"abc", m)
464471

472+
debugCheckUtf8 s
465473
result = matchImpl(s, toRegex(pattern), m, start)
466474

467475
func match*(s: string, pattern: Regex2): bool {.inline, raises: [].} =
476+
debugCheckUtf8 s
468477
var m: RegexMatch2
469478
result = matchImpl(s, toRegex(pattern), m)
470479

@@ -496,6 +505,7 @@ iterator findAll*(
496505
doAssert bounds == @[1 .. 2, 4 .. 5]
497506
doAssert found == @["bc", "bc"]
498507

508+
debugCheckUtf8 s
499509
var i = start
500510
var i2 = start-1
501511
var m: RegexMatch2
@@ -534,6 +544,7 @@ iterator findAllBounds*(
534544
bounds.add bd
535545
doAssert bounds == @[1 .. 2, 4 .. 5]
536546

547+
debugCheckUtf8 s
537548
var i = start
538549
var i2 = start-1
539550
var ms: RegexMatches2
@@ -598,6 +609,7 @@ iterator split*(s: string, sep: Regex2): string {.inline, raises: [].} =
598609
found.add s
599610
doAssert found == @["", "a", "Ϊ", "", "弢", ""]
600611

612+
debugCheckUtf8 s
601613
var
602614
first, last, i = 0
603615
i2 = -1
@@ -632,6 +644,7 @@ func splitIncl*(s: string, sep: Regex2): seq[string] {.inline, raises: [].} =
632644
doAssert parts == expected
633645

634646
template ab: untyped = m.boundaries
647+
debugCheckUtf8 s
635648
var
636649
first, last, i = 0
637650
i2 = -1
@@ -662,6 +675,7 @@ func startsWith*(
662675
doAssert "abc".startsWith(re2"\w")
663676
doAssert not "abc".startsWith(re2"\d")
664677

678+
debugCheckUtf8 s
665679
startsWithImpl2(s, toRegex(pattern), start)
666680

667681
template runeIncAt(s: string, n: var int) =
@@ -680,6 +694,7 @@ func endsWith*(s: string, pattern: Regex2): bool {.inline, raises: [].} =
680694
doAssert "abc".endsWith(re2"\w")
681695
doAssert not "abc".endsWith(re2"\d")
682696

697+
debugCheckUtf8 s
683698
result = false
684699
var
685700
m: RegexMatch2
@@ -732,7 +747,8 @@ func replace*(
732747
doAssert "Nim is awesome!".replace(re2"(\w\B)", "$1_") ==
733748
"N_i_m i_s a_w_e_s_o_m_e!"
734749

735-
result = ""
750+
debugCheckUtf8 s
751+
result = newStringOfCap(s.len)
736752
var
737753
i, j = 0
738754
capts = newSeqOfCap[string](toRegex(pattern).groupsCount)
@@ -772,7 +788,8 @@ func replace*(
772788
let text = "**this is a test**"
773789
doAssert text.replace(re2"(\*)", removeStars) == "this is a test"
774790

775-
result = ""
791+
debugCheckUtf8 s
792+
result = newStringOfCap(s.len)
776793
var i, j = 0
777794
for m in findAll(s, pattern):
778795
result.addsubstr(s, i, m.boundaries.a-1)
@@ -800,7 +817,8 @@ func escapeRe*(s: string): string {.raises: [].} =
800817
#
801818
# utf-8 ascii code-points cannot be part of multi-byte
802819
# code-points, so we can read/match byte by byte
803-
result = ""
820+
debugCheckUtf8 s
821+
result = newStringOfCap(s.len)
804822
for c in s:
805823
case c
806824
of ' ', '#', '$', '&', '(',
@@ -950,9 +968,11 @@ func match*(
950968
m: var RegexMatch,
951969
start = 0
952970
): bool {.inline, raises: [], deprecated: "use match(string, Regex2, var RegexMatch2) instead".} =
971+
debugCheckUtf8 s
953972
result = matchImpl(s, pattern, m, start)
954973

955974
func match*(s: string, pattern: Regex): bool {.inline, raises: [], deprecated: "use match(string, Regex2) instead".} =
975+
debugCheckUtf8 s
956976
var m: RegexMatch
957977
result = matchImpl(s, pattern, m)
958978

@@ -961,6 +981,7 @@ iterator findAll*(
961981
pattern: Regex,
962982
start = 0
963983
): RegexMatch {.inline, raises: [], deprecated: "use findAll(string, Regex2) instead".} =
984+
debugCheckUtf8 s
964985
var i = start
965986
var i2 = start-1
966987
var m: RegexMatch
@@ -989,6 +1010,7 @@ iterator findAllBounds*(
9891010
pattern: Regex,
9901011
start = 0
9911012
): Slice[int] {.inline, raises: [], deprecated: "use findAllBounds(string, Regex2) instead".} =
1013+
debugCheckUtf8 s
9921014
var i = start
9931015
var i2 = start-1
9941016
var ms: RegexMatches
@@ -1036,6 +1058,7 @@ func find*(
10361058
return false
10371059

10381060
iterator split*(s: string, sep: Regex): string {.inline, raises: [], deprecated: "use split(string, Regex2) instead".} =
1061+
debugCheckUtf8 s
10391062
var
10401063
first, last, i = 0
10411064
i2 = -1
@@ -1058,6 +1081,7 @@ func split*(s: string, sep: Regex): seq[string] {.inline, raises: [], deprecated
10581081

10591082
func splitIncl*(s: string, sep: Regex): seq[string] {.inline, raises: [], deprecated: "use splitIncl(string, Regex2) instead".} =
10601083
template ab: untyped = m.boundaries
1084+
debugCheckUtf8 s
10611085
var
10621086
first, last, i = 0
10631087
i2 = -1
@@ -1082,10 +1106,11 @@ func splitIncl*(s: string, sep: Regex): seq[string] {.inline, raises: [], deprec
10821106
func startsWith*(
10831107
s: string, pattern: Regex, start = 0
10841108
): bool {.inline, raises: [], deprecated: "use startsWith(string, Regex2) instead".} =
1109+
debugCheckUtf8 s
10851110
startsWithImpl(s, pattern, start)
10861111

1087-
# XXX use findAll and check last match bounds
10881112
func endsWith*(s: string, pattern: Regex): bool {.inline, raises: [], deprecated: "use endsWith(string, Regex2) instead".} =
1113+
debugCheckUtf8 s
10891114
result = false
10901115
var
10911116
m: RegexMatch
@@ -1121,6 +1146,7 @@ func replace*(
11211146
by: string,
11221147
limit = 0
11231148
): string {.inline, raises: [ValueError], deprecated: "use replace(string, Regex2, string) instead".} =
1149+
debugCheckUtf8 s
11241150
result = ""
11251151
var
11261152
i, j = 0
@@ -1145,7 +1171,8 @@ func replace*(
11451171
pattern: Regex,
11461172
by: proc (m: RegexMatch, s: string): string,
11471173
limit = 0
1148-
): string {.inline, raises: [], effectsOf: by, deprecated: "use replace(string, Regex2, proc(RegexMatch2, string) :string) instead".} =
1174+
): string {.inline, raises: [], effectsOf: by, deprecated: "use replace(string, Regex2, proc(RegexMatch2, string): string) instead".} =
1175+
debugCheckUtf8 s
11491176
result = ""
11501177
var i, j = 0
11511178
for m in findAll(s, pattern):
@@ -1439,6 +1466,7 @@ when isMainModule:
14391466
doAssert re2"\w" in "弢"
14401467
doAssert "2222".find(re2"(22)*", m) and
14411468
m.group(0) == 2 .. 3
1469+
doAssert raisesMsg("\xff") == "Invalid utf-8 regex"
14421470
doAssert raisesMsg(r"[a-\w]") ==
14431471
"Invalid set range. Range can't contain " &
14441472
"a character-class or assertion\n" &

src/regex/common.nim

Lines changed: 43 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -69,27 +69,46 @@ proc `%%`*(
6969
proc `%%`*(formatstr: string, a: string): string =
7070
formatstr %% [a]
7171

72-
# XXX this is to support literal optimization
73-
# for unicode. It needs testing
74-
when false:
75-
# XXX impl simpler find when memchr is not available?
76-
func find*(s: string, r: Rune, start: Natural = 0): int =
77-
## Find unicode rune in a string.
78-
if r.ord < 0xff:
79-
return find(s, r.char, start)
80-
let c = (r.ord and 0xff).char
81-
let rsize = r.size()
82-
var i = start+rsize-1
83-
var r2 = 0'u32
84-
doAssert rsize >= 1 and rsize <= 4
85-
while i < len(s):
86-
i = find(s, c, i)
87-
if i == -1:
88-
return -1
89-
for j in i-rsize-1 .. i:
90-
r2 = (r2 shl 8) or s[j].uint32
91-
if r.uint32 == r2:
92-
return i-rsize-1
93-
r2 = 0
94-
inc i
95-
return -1
72+
type
  verifyUtf8State = enum
    ## States of the utf-8 validation automaton used by `verifyUtf8`.
    ## `vusStart` expects the first byte of a sequence, `vusError`
    ## is the rejecting state, and `vusA`..`vusG` wait for
    ## continuation bytes.
    vusError, vusStart, vusA, vusB, vusC, vusD, vusE, vusF, vusG

# Taken from nim-unicodeplus
func verifyUtf8*(s: string): int =
  ## Return `-1` if `s` is a valid utf-8 string
  ## (the empty string is valid).
  ## Otherwise, return the index of the first bad char, that is,
  ## the index of the lead byte of the first sequence that is
  ## malformed or cut short by the end of the string.
  var state = vusStart
  var i = 0
  let L = s.len
  while i < L:
    let b = uint8(s[i])
    case state
    of vusStart:
      # Remember where this sequence begins; this is the value
      # returned if the sequence turns out to be invalid.
      result = i
      state =
        if b in 0x00'u8 .. 0x7F'u8: vusStart
        elif b in 0xC2'u8 .. 0xDF'u8: vusA
        elif b in 0xE1'u8 .. 0xEC'u8 or b in 0xEE'u8 .. 0xEF'u8: vusB
        elif b == 0xE0'u8: vusC
        elif b == 0xED'u8: vusD
        elif b in 0xF1'u8 .. 0xF3'u8: vusE
        elif b == 0xF0'u8: vusF
        elif b == 0xF4'u8: vusG
        else: vusError
    of vusA:
      # last continuation byte of a sequence
      state = if b in 0x80'u8 .. 0xBF'u8: vusStart else: vusError
    of vusB:
      # one more continuation byte follows this one
      state = if b in 0x80'u8 .. 0xBF'u8: vusA else: vusError
    of vusC:
      # second byte after lead 0xE0 is limited to 0xA0..0xBF
      state = if b in 0xA0'u8 .. 0xBF'u8: vusA else: vusError
    of vusD:
      # second byte after lead 0xED is limited to 0x80..0x9F
      state = if b in 0x80'u8 .. 0x9F'u8: vusA else: vusError
    of vusE:
      # second byte after lead 0xF1..0xF3; two more follow
      state = if b in 0x80'u8 .. 0xBF'u8: vusB else: vusError
    of vusF:
      # second byte after lead 0xF0 is limited to 0x90..0xBF
      state = if b in 0x90'u8 .. 0xBF'u8: vusB else: vusError
    of vusG:
      # second byte after lead 0xF4 is limited to 0x80..0x8F
      state = if b in 0x80'u8 .. 0x8F'u8: vusB else: vusError
    of vusError:
      # `result` already holds the start of the bad sequence
      break
    inc i
  # Ending anywhere but the start state means the input was
  # rejected or the last sequence is incomplete.
  if state == vusStart:
    result = -1

src/regex/compiler.nim

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import ./common
12
import ./parser
23
import ./exptransformation
34
import ./types
@@ -8,6 +9,8 @@ when defined(regexDotDir):
89
import ./dotgraph
910

1011
func reImpl*(s: string): Regex {.inline.} =
12+
if verifyUtf8(s) != -1:
13+
raise newException(RegexError, "Invalid utf-8 regex")
1114
var groups: GroupsCapture
1215
let rpn = s
1316
.parse

tests/tests.nim

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1126,7 +1126,7 @@ test "tstarts_with":
11261126
check(not "abc".startsWith(re"bc"))
11271127
check startsWith("弢ⒶΪ", re"弢Ⓐ")
11281128
check startsWith("弢", re("\xF0\xAF\xA2\x94"))
1129-
check(not startsWith("弢", re("\xF0\xAF\xA2")))
1129+
#check(not startsWith("弢", re("\xF0\xAF\xA2")))
11301130
check "abc".startsWith(re"\w")
11311131
check(not "abc".startsWith(re"\d"))
11321132
check "abc".startsWith(re"(a|b)")
@@ -1142,7 +1142,7 @@ test "tends_with":
11421142
check(not "abc".endsWith(re"ab"))
11431143
check endsWith("弢ⒶΪ", re"ⒶΪ")
11441144
check endsWith("弢", re("\xF0\xAF\xA2\x94"))
1145-
check(not endsWith("弢", re("\xAF\xA2\x94")))
1145+
#check(not endsWith("弢", re("\xAF\xA2\x94")))
11461146
check "abc".endsWith(re"(b|c)")
11471147
check "ab".endsWith(re"(b|c)")
11481148
check(not "a".endsWith(re"(b|c)"))
@@ -2475,7 +2475,7 @@ test "escapeRe":
24752475
check match("$", re(escapeRe"$"))
24762476
block:
24772477
var s = ""
2478-
for c in 0 .. 255:
2478+
for c in 0 .. 127:
24792479
s.add c.char
24802480
discard re(escapeRe(s))
24812481

tests/tests2.nim

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from std/unicode import runeLen
22
from std/sequtils import map
3+
from std/strutils import contains
34

45
import ../src/regex
56

@@ -1499,7 +1500,7 @@ test "tstarts_with":
14991500
check(not "abc".startsWith(re2"bc"))
15001501
check startsWith("弢ⒶΪ", re2"弢Ⓐ")
15011502
check startsWith("弢", re2("\xF0\xAF\xA2\x94"))
1502-
check(not startsWith("弢", re2("\xF0\xAF\xA2")))
1503+
#check(not startsWith("弢", re2("\xF0\xAF\xA2")))
15031504
check "abc".startsWith(re2"\w")
15041505
check(not "abc".startsWith(re2"\d"))
15051506
check "abc".startsWith(re2"(a|b)")
@@ -1515,7 +1516,7 @@ test "tends_with":
15151516
check(not "abc".endsWith(re2"ab"))
15161517
check endsWith("弢ⒶΪ", re2"ⒶΪ")
15171518
check endsWith("弢", re2("\xF0\xAF\xA2\x94"))
1518-
check(not endsWith("弢", re2("\xAF\xA2\x94")))
1519+
#check(not endsWith("弢", re2("\xAF\xA2\x94")))
15191520
check "abc".endsWith(re2"(b|c)")
15201521
check "ab".endsWith(re2"(b|c)")
15211522
check(not "a".endsWith(re2"(b|c)"))
@@ -2914,7 +2915,7 @@ test "escapere2":
29142915
check match("$", re2(escapeRe"$"))
29152916
block:
29162917
var s = ""
2917-
for c in 0 .. 255:
2918+
for c in 0 .. 127:
29182919
s.add c.char
29192920
discard re2(escapeRe(s))
29202921

@@ -3025,3 +3026,32 @@ test "tlookaround_captures":
30253026
m.captures == @[0 .. 0, 1 .. 3, nonCapture, nonCapture]
30263027
check match("aaab", re2"(\w)(\w+)|\w+(?<=^(\w)(\w)(\w+))b", m) and
30273028
m.captures == @[0 .. 0, 1 .. 3, nonCapture, nonCapture, nonCapture]
3029+
3030+
# Alias for the assertion-failure exception type, whose name
# differs between Nim versions.
when (NimMajor, NimMinor) >= (2, 0):
  type MyAssertionDefect = ref AssertionDefect
else:
  type MyAssertionDefect = ref AssertionError

template raisesInvalidUtf8(exp: untyped): untyped =
  ## Evaluate `exp` and check that it fails with an assertion
  ## whose message contains "Invalid utf-8 input".
  ## The check fails if `exp` completes without raising.
  try:
    discard exp
    # reaching this line means nothing was raised
    check false
  except MyAssertionDefect:
    check "Invalid utf-8 input" in getCurrentExceptionMsg()
3041+
3042+
test "tverifyutf8":
  # An invalid utf-8 *pattern* is reported with a regex error message
  check raisesMsg("\xff") == "Invalid utf-8 regex"
  # An invalid utf-8 *input* must trip the debug assertion in
  # each of the calls below
  raisesInvalidUtf8 match("\xff", re2"abc")
  block:
    var m: RegexMatch2
    raisesInvalidUtf8 match("\xff", re2"abc", m)
  raisesInvalidUtf8 findAll("\xff", re2"abc")
  raisesInvalidUtf8 findAllBounds("\xff", re2"abc")
  raisesInvalidUtf8 split("\xff", re2"abc")
  raisesInvalidUtf8 splitIncl("\xff", re2"abc")
  raisesInvalidUtf8 startsWith("\xff", re2"abc")
  raisesInvalidUtf8 endsWith("\xff", re2"abc")
  raisesInvalidUtf8 replace("\xff", re2"abc", "abc")
  raisesInvalidUtf8 replace("\xff", re2"abc",
    (proc (m: RegexMatch2, s: string): string = discard))
  raisesInvalidUtf8 escapeRe("\xff")

0 commit comments

Comments
 (0)