Skip to content

Commit 9ccc25f

Browse files
authored
Fix casefold (#150)
1 parent d4e6b73 commit 9ccc25f

File tree

6 files changed

+37
-17
lines changed

6 files changed

+37
-17
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@ tests/test_bug
1414
docs/ugh
1515
bin/*
1616
bench/bench
17+
config.nims

regex.nimble

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ srcDir = "src"
88
skipDirs = @["tests", "bench", "docs"]
99

1010
requires "nim >= 1.6.0"
11-
requires "unicodedb >= 0.7.2"
11+
requires "unicodedb >= 0.13.1"
1212

1313
template execTest(lang, target: static string) =
1414
doAssert lang in ["c", "js"]

src/regex/exptransformation.nim

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,13 @@ import std/sets
33
import std/tables
44
import std/algorithm
55

6+
import pkg/unicodedb/casing
7+
68
import ./exptype
79
import ./types
810
import ./common
911
import ./scanner
1012

11-
# todo: can not use unicodeplus due to
12-
# https://github.com/nim-lang/Nim/issues/7059
1313
func swapCase(r: Rune): Rune =
1414
# Note a character can be
1515
# non-lower and non-upper
@@ -178,10 +178,12 @@ func applyFlag(n: var Node, f: Flag) =
178178
else:
179179
discard
180180
of flagCaseInsensitive:
181-
if n.kind == reChar and n.cp != n.cp.swapCase():
181+
if n.kind == reChar and n.cp.hasCaseFolds:
182182
n.kind = reCharCI
183+
n.cp = n.cp.simpleCaseFold
183184
# todo: apply recursively to
184185
# shorthands of reInSet/reNotSet (i.e: [:ascii:])
186+
# XXX add all casefolds that map to the cp instead of swapCase
185187
if n.kind in {reInSet, reNotSet}:
186188
var cps = newSeq[Rune]()
187189
for cp in items n.cps:
@@ -190,9 +192,8 @@ func applyFlag(n: var Node, f: Flag) =
190192
cps.add cp2
191193
n.cps.add cps
192194
for sl in n.ranges[0 .. ^1]:
193-
let
194-
cpa = sl.a.swapCase()
195-
cpb = sl.b.swapCase()
195+
let cpa = sl.a.swapCase()
196+
let cpb = sl.b.swapCase()
196197
if sl.a != cpa and sl.b != cpb:
197198
n.ranges.add(cpa .. cpb)
198199
of flagUnGreedy:

src/regex/nfamacro.nim

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import std/tables
66
import std/sets
77
import std/algorithm
88

9+
import pkg/unicodedb/casing
910
import pkg/unicodedb/properties
1011
import pkg/unicodedb/types as utypes
1112

@@ -124,7 +125,8 @@ func genMatch(c: NimNode, n: Node): NimNode =
124125
quote do: true
125126
of reCharCI:
126127
let cp2Lit = newLit n.cp.swapCase().int32
127-
quote do: `c` == `cpLit` or `c` == `cp2Lit`
128+
let cp3Lit = newLit n.cp.simpleCaseFold().int32
129+
quote do: `c` == `cpLit` or `c` == `cp2Lit` or simpleCaseFold(`c`) == Rune(`cp3Lit`)
128130
of reWordAscii:
129131
genWordAsciiMatch(c)
130132
of reNotAlphaNumAscii:

src/regex/nodematch.nim

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import std/unicode except `==`
22

3+
import pkg/unicodedb/casing
34
import pkg/unicodedb/properties
45
import pkg/unicodedb/types as utypes
56

@@ -97,14 +98,6 @@ func isDigitAscii(r: Rune): bool {.inline.} =
9798
else:
9899
false
99100

100-
# todo: can not use unicodeplus due to
101-
# https://github.com/nim-lang/Nim/issues/7059
102-
func swapCase*(r: Rune): Rune =
103-
result = r.toLower()
104-
if result != r:
105-
return
106-
result = r.toUpper()
107-
108101
func matchAsciiSet(n: Node, r: Rune): bool =
109102
assert n.shorthands.len == 0
110103
result = r in n.cps or
@@ -162,7 +155,7 @@ func match*(n: Node, r: Rune): bool {.inline.} =
162155
of reNotWhiteSpace: not r.isWhiteSpace()
163156
of reAny: r != lineBreakRune
164157
of reAnyNL: true
165-
of reCharCI: r == n.cp or r == n.cp.swapCase()
158+
of reCharCI: r == n.cp or n.cp == r.simpleCaseFold
166159
of reUCC: r.unicodeCategory() in n.cc
167160
of reNotUCC: r.unicodeCategory() notin n.cc
168161
of reWordAscii: r.isWordAscii()

tests/tests_misc.nim

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ func findAllCapt(s: string, reg: Regex2): seq[seq[Slice[int]]] =
7070
result = map(
7171
findAll(s, reg),
7272
func (m: RegexMatch2): seq[Slice[int]] =
73+
result = newSeq[Slice[int]]()
7374
for i in 0 .. m.groupsCount-1:
7475
result.add m.group(i))
7576

@@ -696,3 +697,25 @@ test "rust_regression":
696697
check findAllBounds(r"hiya \N{snowman} bye", re2"(\\N\{[^}]+})|([{}])") == @[5 .. 15]
697698
check findAllCapt(r"hiya \N{snowman} bye", re2"(\\N\{[^}]+})|([{}])") ==
698699
@[@[5 .. 15, nonCapture]]
700+
701+
# https://github.com/BurntSushi/rebar/pull/20
702+
test "rebar":
703+
block:
704+
check match("ſ", re2(r"s", {regexCaseless}))
705+
check match("s", re2(r"ſ", {regexCaseless}))
706+
check match("ſ", re2(r"S", {regexCaseless}))
707+
check match("S", re2(r"ſ", {regexCaseless}))
708+
check "ſ".len == 2
709+
check findAllBounds("ſ", re2(r"s", {regexCaseless})) == @[0 .. 1]
710+
check findAllBounds("s", re2(r"ſ", {regexCaseless})) == @[0 .. 0]
711+
check findAllBounds("ſ", re2(r"S", {regexCaseless})) == @[0 .. 1]
712+
check findAllBounds("S", re2(r"ſ", {regexCaseless})) == @[0 .. 0]
713+
# XXX fix: caseless matching inside sets still uses swapCase, not case folds
714+
#check match("s", re2(r"[ſ]", {regexCaseless}))
715+
#check match("ſ", re2(r"[s]", {regexCaseless}))
716+
check match("a", re2(r"A", {regexCaseless}))
717+
check match("A", re2(r"a", {regexCaseless}))
718+
check match("@", re2(r"@", {regexCaseless}))
719+
check findAllBounds("a", re2(r"A", {regexCaseless})) == @[0 .. 0]
720+
check findAllBounds("A", re2(r"a", {regexCaseless})) == @[0 .. 0]
721+
check findAllBounds("@", re2(r"@", {regexCaseless})) == @[0 .. 0]

0 commit comments

Comments
 (0)