Skip to content

Commit ca8df08

Browse files
Fix DFA issue (wip)
1 parent 2646afb commit ca8df08

File tree

4 files changed

+50
-5
lines changed

4 files changed

+50
-5
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ dependencies = [
2323
"rstr",
2424
"exrex",
2525
"joblib",
26-
"automata-lib",
26+
#"automata-lib",
2727
]
2828

2929
[project.optional-dependencies]

src/zkregex_fuzzer/dfa.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,22 @@
1111

1212
from automata.fa.dfa import DFA
1313
from automata.fa.gnfa import GNFA
14-
from automata.fa.nfa import NFA
14+
from automata.fa.nfa import NFA, RESERVED_CHARACTERS
15+
16+
17+
from zkregex_fuzzer.utils import ASCII_CHARS
1518

1619

1720
def regex_to_dfa(regex: str) -> DFA:
1821
"""
1922
Convert a regex to a DFA.
2023
"""
24+
# Symbols should include at least all ASCII characters
25+
symbols = ASCII_CHARS
26+
symbols = symbols - RESERVED_CHARACTERS
27+
2128
try:
22-
nfa = NFA.from_regex(regex)
29+
nfa = NFA.from_regex(regex, input_symbols=symbols)
2330
except Exception as e:
2431
raise ValueError(f"Failed to parse '{regex}' into an automaton: {e}")
2532
try:

src/zkregex_fuzzer/utils.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,19 @@
1111
from zkregex_fuzzer.dfa import wrapped_has_one_accepting_state_regex
1212

1313

14+
def create_range(start_char: str, end_char: str) -> set[str]:
    """
    Build the set of every character whose code point lies between
    start_char and end_char, both ends included.

    The result is empty when start_char sorts after end_char.
    """
    first = ord(start_char)
    last = ord(end_char)
    # range() excludes its stop value, hence the +1 to keep end_char.
    return set(map(chr, range(first, last + 1)))
19+
20+
21+
# Character pools used as candidate input symbols.
# ASCII_CHARS is exactly string.printable (digits, letters, punctuation and
# whitespace), so the remaining ASCII control characters are not part of it.
# NOTE(review): dfa.py imports ASCII_CHARS from this module while this module
# imports from dfa.py -- confirm the import cycle resolves at load time.
ASCII_CHARS = set(string.printable)

# The other pools are inclusive code-point ranges between the two characters.
CYRILLIC_CHARS = create_range("Ѐ", "ӿ")
GREEK_CHARS = create_range("Ͱ", "Ͽ")
LATIN_EXT_CHARS = create_range("¡", "ƿ")
25+
26+
1427
def is_valid_regex(regex: str) -> bool:
1528
"""
1629
Check if a regex is valid.

tests/test_dfa.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
transform_dfa_to_regex,
99
transform_dfa_to_single_accepting_state,
1010
)
11-
11+
import re
1212
regex_with_multiple_accepting_states = [
1313
r"(ab|aba)",
1414
r"(ab|aba)*",
@@ -27,7 +27,7 @@
2727
r"(hello)",
2828
r"(ab)*",
2929
r"(a|b|c)*",
30-
r"((a|b|c)*abc)", # This is somewhat comples, do we want to support this?
30+
r"((a|b|c)*abc)", # This is somewhat complex, do we want to support this?
3131
r"[a-zA-Z]+",
3232
r"[0-9]+",
3333
r"(abc|abcd|abcde)f",
@@ -39,6 +39,17 @@
3939
r"abc",
4040
r"(hello)",
4141
]
42+
# Regexes exercised by test_dfa_string_matching_zkemail. Every entry must be a
# pattern Python's `re` module can compile, because that test re-checks the
# generated string with `re.match`.
zkemail_regexes = [
    ">[^<>]+<.*",
    r"to:[^\r\n]+\r\n",
    # Previously r")subject:..." -- the stray leading ")" is an unbalanced
    # parenthesis that `re` rejects; now parallel to the to:/from: entries.
    r"subject:[^\r\n]+\r\n",
    r"[A-Za-z0-9!#$%&'*+=?\-\^_`{|}~./]+@[A-Za-z0-9.\-@]+",
    r"dkim-signature:([a-z]+=[^;]+; )+bh=[a-zA-Z0-9+/=]+;",
    r"[A-Za-z0-9!#$%&'*+=?\-\^_`{|}~./@]+@[A-Za-z0-9.\-]+",
    r"from:[^\r\n]+\r\n",
    r"dkim-signature:([a-z]+=[^;]+; )+t=[0-9]+;",
    r"message-id:<[A-Za-z0-9=@\.\+_-]+>\r\n",
]
4253

4354

4455
def test_has_multiple_accepting_states_regex_without_multiple():
@@ -115,3 +126,17 @@ def test_dfa_string_matching():
115126
break
116127
if regex not in single_solution_regexes:
117128
assert string != string2
129+
130+
131+
def test_dfa_string_matching_zkemail():
    """
    For each entry of zkemail_regexes, generate a string with
    dfa_string_matching and check that a string was produced and that
    Python's `re` module matches it against the same regex.
    """
    rule = "--------------------------------"
    for pattern in zkemail_regexes:
        generated = dfa_string_matching(pattern)
        # Echo the case being tested so it shows up in the test output.
        report = (
            "",
            rule,
            f"Testing regex: {pattern}",
            f"String: {generated}",
            rule,
            "",
        )
        for line in report:
            print(line)
        assert generated is not None
        # The generated string must also be accepted by the re module.
        assert re.match(pattern, generated) is not None

0 commit comments

Comments
 (0)