Skip to content

Commit e6bae73

Browse files
authored
match set node rework (#147)
1 parent 98a6e5a commit e6bae73

File tree

4 files changed

+70
-54
lines changed

4 files changed

+70
-54
lines changed

src/regex/nodematch.nim

Lines changed: 58 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,45 @@ func swapCase*(r: Rune): Rune =
106106
return
107107
result = r.toUpper()
108108

109+
func matchAsciiSet(n: Node, r: Rune): bool =
  ## Match rune ``r`` against a set node that carries no shorthands.
  ##
  ## ``r`` counts as a member when it is found in ``n.cps`` or in
  ## ``n.ranges``. A ``reInSet`` node matches members, a ``reNotSet``
  ## node matches non-members, and any other kind never matches.
  assert n.shorthands.len == 0
  let isMember = r in n.cps or r in n.ranges
  case n.kind
  of reInSet: isMember
  of reNotSet: not isMember
  else: false
115+
116+
func matchShorthand(n: Node, r: Rune): bool =
  ## Match rune ``r`` against a node stored in a set's ``shorthands``.
  ##
  ## Each positive/negated kind pair shares one branch: the predicate
  ## is evaluated once and compared with whether ``n.kind`` is the
  ## positive kind, so the negated kind yields the inverted answer.
  ## Set kinds are delegated to ``matchAsciiSet``. Any other kind
  ## trips ``doAssert``.
  case n.kind
  of reWord, reNotAlphaNum:
    r.isWord() == (n.kind == reWord)
  of reDigit, reNotDigit:
    r.isDecimal() == (n.kind == reDigit)
  of reWhiteSpace, reNotWhiteSpace:
    r.isWhiteSpace() == (n.kind == reWhiteSpace)
  of reUCC, reNotUCC:
    (r.unicodeCategory() in n.cc) == (n.kind == reUCC)
  of reWordAscii, reNotAlphaNumAscii:
    r.isWordAscii() == (n.kind == reWordAscii)
  of reDigitAscii, reNotDigitAscii:
    r.isDigitAscii() == (n.kind == reDigitAscii)
  of reWhiteSpaceAscii, reNotWhiteSpaceAscii:
    r.isWhiteSpaceAscii() == (n.kind == reWhiteSpaceAscii)
  of reInSet, reNotSet:
    matchAsciiSet(n, r)
  else:
    # Not a kind this func handles; fail loudly in every build mode.
    doAssert false
    false
136+
137+
func matchSet(n: Node, r: Rune): bool =
  ## Match rune ``r`` against a set node.
  ##
  ## ``r`` is a member when it is in ``n.cps``, in ``n.ranges``, or
  ## is accepted by any node in ``n.shorthands``. A ``reInSet`` node
  ## matches members, a ``reNotSet`` node matches non-members, and
  ## any other kind never matches.
  var isMember = r in n.cps or r in n.ranges
  if not isMember:
    # Shorthands are consulted only when the direct lookups miss;
    # stop at the first one that accepts the rune.
    for shorthand in n.shorthands:
      if matchShorthand(shorthand, r):
        isMember = true
        break
  case n.kind
  of reInSet: isMember
  of reNotSet: not isMember
  else: false
147+
109148
func match*(n: Node, r: Rune): bool {.inline.} =
110149
## match for ``Node`` of matchable kind.
111150
## Return whether the node matches
@@ -115,52 +154,25 @@ func match*(n: Node, r: Rune): bool {.inline.} =
115154
if n.kind == reChar:
116155
return n.cp == r
117156
case n.kind
118-
of reEOE:
119-
r == invalidRune
120-
of reWord:
121-
r.isWord()
122-
of reNotAlphaNum:
123-
not r.isWord()
124-
of reDigit:
125-
r.isDecimal()
126-
of reNotDigit:
127-
not r.isDecimal()
128-
of reWhiteSpace:
129-
r.isWhiteSpace()
130-
of reNotWhiteSpace:
131-
not r.isWhiteSpace()
132-
of reInSet, reNotSet:
133-
var matches = (
134-
r in n.cps or
135-
r in n.ranges)
136-
if not matches:
137-
for nn in n.shorthands:
138-
matches = nn.match(r)
139-
if matches: break
140-
((matches and n.kind == reInSet) or
141-
(not matches and n.kind == reNotSet))
142-
of reAny:
143-
r != lineBreakRune
144-
of reAnyNL:
145-
true
146-
of reCharCI:
147-
r == n.cp or r == n.cp.swapCase()
148-
of reWordAscii:
149-
r.isWordAscii()
150-
of reDigitAscii:
151-
r.isDigitAscii()
152-
of reWhiteSpaceAscii:
153-
r.isWhiteSpaceAscii()
154-
of reUCC:
155-
r.unicodeCategory() in n.cc
156-
of reNotAlphaNumAscii:
157-
not r.isWordAscii()
158-
of reNotDigitAscii:
159-
not r.isDigitAscii()
160-
of reNotWhiteSpaceAscii:
161-
not r.isWhiteSpaceAscii()
162-
of reNotUCC:
163-
r.unicodeCategory() notin n.cc
157+
of reEOE: r == invalidRune
158+
of reWord: r.isWord()
159+
of reNotAlphaNum: not r.isWord()
160+
of reDigit: r.isDecimal()
161+
of reNotDigit: not r.isDecimal()
162+
of reWhiteSpace: r.isWhiteSpace()
163+
of reNotWhiteSpace: not r.isWhiteSpace()
164+
of reAny: r != lineBreakRune
165+
of reAnyNL: true
166+
of reCharCI: r == n.cp or r == n.cp.swapCase()
167+
of reUCC: r.unicodeCategory() in n.cc
168+
of reNotUCC: r.unicodeCategory() notin n.cc
169+
of reWordAscii: r.isWordAscii()
170+
of reNotAlphaNumAscii: not r.isWordAscii()
171+
of reDigitAscii: r.isDigitAscii()
172+
of reNotDigitAscii: not r.isDigitAscii()
173+
of reWhiteSpaceAscii: r.isWhiteSpaceAscii()
174+
of reNotWhiteSpaceAscii: not r.isWhiteSpaceAscii()
175+
of reInSet, reNotSet: matchSet(n, r)
164176
else:
165177
assert n.kind == reChar
166178
n.cp == r

src/regex/parser.nim

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -275,8 +275,7 @@ func parseSetEscapedSeq(sc: Scanner[Rune]): Node =
275275

276276
func parseAsciiSet(sc: Scanner[Rune]): Node =
277277
## Parse an ascii set (i.e: ``[:ascii:]``).
278-
## The ascii set will get expanded
279-
## and merged with the outer set
278+
## An expanded ascii set is returned.
280279
let startPos = sc.pos
281280
assert sc.peek == ":".toRune
282281
discard sc.next()

tests/tests.nim

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ proc raises(pattern: string): bool =
4242
result = true
4343

4444
proc raisesMsg(pattern: string): string =
45+
result = ""
4546
try:
4647
discard pattern.re()
4748
except RegexError:
@@ -71,6 +72,7 @@ func findAllCapt(s: string, reg: Regex): seq[seq[seq[Slice[int]]]] =
7172
result = map(
7273
findAll(s, reg),
7374
func (m: RegexMatch): seq[seq[Slice[int]]] =
75+
result = newSeq[seq[Slice[int]]]()
7476
for i in 0 .. m.groupsCount-1:
7577
result.add m.group(i))
7678

tests/tests2.nim

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ proc raises(pattern: string): bool =
4444
result = true
4545

4646
proc raisesMsg(pattern: string): string =
47+
result = ""
4748
try:
4849
discard pattern.re2()
4950
except RegexError:
@@ -52,7 +53,7 @@ proc raisesMsg(pattern: string): string =
5253
proc matchWithCapt(s: string, pattern: static Regex2): seq[string] =
5354
var m = RegexMatch2()
5455
check match(s, pattern, m)
55-
result.setLen m.captures.len
56+
result = newSeq[string](m.captures.len)
5657
for i, bounds in m.captures.pairs:
5758
result[i] = s[bounds]
5859

@@ -62,7 +63,7 @@ proc matchWithBounds(s: string, pattern: static Regex2): seq[Slice[int]] =
6263
return m.captures
6364

6465
proc toStrCaptures(m: RegexMatch2, s: string): seq[string] =
65-
result.setLen m.captures.len
66+
result = newSeq[string](m.captures.len)
6667
for i, bounds in m.captures.pairs:
6768
result[i] = s[bounds]
6869

@@ -75,6 +76,7 @@ func findAllCapt(s: string, reg: Regex2): seq[seq[Slice[int]]] =
7576
result = map(
7677
findAll(s, reg),
7778
func (m: RegexMatch2): seq[Slice[int]] =
79+
result = newSeq[Slice[int]]()
7880
for i in 0 .. m.groupsCount-1:
7981
result.add m.group(i))
8082

@@ -104,17 +106,18 @@ template matchMacro(s, r: untyped): untyped =
104106

105107
template matchMacroCapt(s, r: untyped): untyped =
106108
(func (): seq[string] =
109+
result = newSeq[string]()
107110
var m = false
108111
let exp = s
109112
match exp, r:
110113
m = true
111-
result = matches
114+
result.add matches
112115
check m)()
113116

114117
test "tmatch_macro":
115118
block hasOwnScope:
116119
var m = false
117-
var matches: seq[string]
120+
var matches = newSeq[string]()
118121
match "abc", rex"(\w+)":
119122
check matches == @["abc"]
120123
m = true
@@ -2242,7 +2245,7 @@ test "treuse_regex_match":
22422245

22432246
test "tisInitialized":
22442247
block:
2245-
var re: Regex2
2248+
var re = default(Regex2)
22462249
check(not re.isInitialized)
22472250
re = re2"foo"
22482251
check re.isInitialized
@@ -3092,7 +3095,7 @@ test "tverifyutf8":
30923095
raisesInvalidUtf8 endsWith("\xff", re2"abc")
30933096
raisesInvalidUtf8 replace("\xff", re2"abc", "abc")
30943097
raisesInvalidUtf8 replace("\xff", re2"abc",
3095-
(proc (m: RegexMatch2, s: string): string = discard))
3098+
(proc (m: RegexMatch2, s: string): string = return ""))
30963099
raisesInvalidUtf8 escapeRe("\xff")
30973100

30983101
# bug: raises invalid utf8 regex in Nim 1.0 + js target

0 commit comments

Comments
 (0)