Skip to content

Commit 62c5ac3

Browse files
Update check_zkregex_rules_basic
1 parent 7e4db6a commit 62c5ac3

File tree

4 files changed

+237
-55
lines changed

4 files changed

+237
-55
lines changed

src/zkregex_fuzzer/dfa.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import random
88
import string
9+
import re
910
from typing import Dict, Optional, Set
1011

1112
from automata.fa.dfa import DFA
@@ -42,6 +43,37 @@ def has_multiple_accepting_states_regex(regex: str) -> bool:
4243
return num_final_states > 1
4344

4445

46+
def has_one_accepting_state_regex(regex: str) -> bool:
    """
    Tell whether the DFA built from `regex` has exactly one accepting state.

    The regex is converted with `regex_to_dfa` and the number of entries in
    the resulting automaton's `final_states` is compared against one.

    Returns True for exactly one accepting (final) state, False otherwise.
    """
    accepting_states = regex_to_dfa(regex).final_states
    return len(accepting_states) == 1
53+
54+
55+
def wrapped_has_one_accepting_state_regex(regex: str) -> bool:
    """
    Returns True if converting the given regex to a DFA yields
    exactly one accepting (final) state. Returns False otherwise.

    NOTE:
    - As the automata-lib does not support starting with '^' and ending with '$',
      we just remove them from the regex and check if the rest of the regex has
      one accepting state.

    What is removed before the DFA check:
    - a leading '^';
    - a leading start-of-input group, i.e. '(|^)' whose first alternative is
      empty or consists only of CR/LF escapes and whitespace;
    - a trailing '$', unless it is escaped with a backslash.

    Anything else is passed to `has_one_accepting_state_regex` untouched.
    """
    if regex.startswith("^"):
        regex = regex[1:]

    # Cases like '(|^)...', '(\r\n|^)...', '(\r|^)...', '(\n|^)...'.
    # The whole group must match: a regex that merely starts with '(' (for
    # example '(abc)+' or '([^a]b)') must not be truncated.
    start_group = re.match(r"\((?:\\r|\\n|\s)*\|\^\)", regex)
    if start_group:
        regex = regex[start_group.end():]

    if regex.endswith("$"):
        body = regex[:-1]
        # An odd number of backslashes right before '$' means it is the
        # literal '\$', not an end anchor, so it has to stay.
        trailing_backslashes = len(body) - len(body.rstrip("\\"))
        if trailing_backslashes % 2 == 0:
            regex = body

    return has_one_accepting_state_regex(regex)
75+
76+
4577
def has_multiple_accepting_states_dfa(dfa: DFA) -> bool:
4678
"""
4779
Returns True if the given DFA has multiple accepting (final) states.

src/zkregex_fuzzer/regexgen.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,9 @@ def generate(self) -> str:
5454
regex = self.generate_unsafe()
5555
if not is_valid_regex(regex):
5656
continue
57-
if not check_zkregex_rules_basic(regex):
57+
correct, accepting_state_check = check_zkregex_rules_basic(regex)
58+
if not correct:
59+
# TODO: We should try to fix the regex if it has multiple accepting states
5860
continue
5961
logger.debug(f"Generated regex: {regex}")
6062
return regex

src/zkregex_fuzzer/utils.py

Lines changed: 73 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import string
88

99
from fuzzingbook.Grammars import Grammar, simple_grammar_fuzzer
10-
10+
from zkregex_fuzzer.dfa import wrapped_has_one_accepting_state_regex
1111

1212
def is_valid_regex(regex: str) -> bool:
1313
"""
@@ -20,65 +20,85 @@ def is_valid_regex(regex: str) -> bool:
2020
return False
2121

2222

23-
def check_zkregex_rules_basic(regex: str) -> bool:
23+
def has_lazy_quantifier(pattern: str) -> bool:
    """
    Returns True if `pattern` contains any lazy quantifier, i.e. a quantifier
    ('*', '+', '?', '{m}', '{m,}', '{,n}' or '{m,n}') immediately followed by '?'.
    Returns False otherwise.

    The pattern is scanned left to right so that:
    - a backslash-escaped character is treated as a literal, never a quantifier;
    - everything between '[' and the next unescaped ']' is treated as character
      class content, where quantifier characters are literals.

    This is still a textual check, not a full regex parser: the first unescaped
    ']' always closes a character class, and no attempt is made to validate
    the pattern.
    """
    # Counted repetition, including the open-ended forms {m,} and {,n}.
    counted_repeat = re.compile(r"\{(?:\d+(?:,\d*)?|,\d+)\}")

    inside_class = False
    index = 0
    length = len(pattern)
    while index < length:
        char = pattern[index]

        if char == "\\":
            # Skip the escaped character together with its backslash.
            index += 2
            continue

        if inside_class:
            inside_class = char != "]"
            index += 1
            continue

        if char == "[":
            inside_class = True
            index += 1
            continue

        quantifier_end = None
        if char in "*+?":
            quantifier_end = index + 1
        elif char == "{":
            repeat = counted_repeat.match(pattern, index)
            if repeat:
                quantifier_end = repeat.end()

        if quantifier_end is None:
            index += 1
        elif pattern.startswith("?", quantifier_end):
            # A quantifier directly followed by '?' is lazy.
            return True
        else:
            index = quantifier_end

    return False
3538

36-
# 1) Must end with '$' (if it present)
37-
if "$" in regex and not regex.endswith("$"):
38-
return False
3939

40-
# 2) '^' must be at start or in '(|^)'
41-
# We'll allow no '^' at all. If it appears, check positions.
42-
# We'll define a function to find all occurrences of '^'.
43-
allowed_positions = set()
44-
# If the string starts with '^', that’s allowed
45-
if len(regex) > 0 and regex[0] == "^":
46-
allowed_positions.add(0)
47-
48-
# If the string contains '|^', that means '^' is at position (idx+1)
49-
idx = 0
50-
while True:
51-
idx = regex.find("|^", idx)
52-
if idx == -1:
53-
break
54-
# '^' occurs at (idx + 1)
55-
allowed_positions.add(idx + 1)
56-
idx += 2 # skip past
57-
58-
# If the string contains '[^]', that means '^' is at position (idx+1)
59-
idx = 0
60-
while True:
61-
idx = regex.find("[^", idx)
62-
if idx == -1:
63-
break
64-
# '^' occurs at (idx + 1)
65-
allowed_positions.add(idx + 1)
66-
idx += 2 # skip past
67-
68-
# Now see if there's any '^' outside those allowed positions
69-
for match in re.finditer(r"\^", regex):
70-
pos = match.start()
71-
if pos not in allowed_positions:
40+
def _is_escaped(text: str, index: int) -> bool:
    """Return True if `text[index]` is preceded by an odd number of backslashes."""
    backslashes = 0
    cursor = index - 1
    while cursor >= 0 and text[cursor] == "\\":
        backslashes += 1
        cursor -= 1
    return backslashes % 2 == 1


def correct_carret_position(regex: str) -> bool:
    """
    Check that every unescaped '^' in `regex` sits in an allowed position.

    Allowed positions are:
    - At the start of the regex
    - As the last alternative of a group that opens the regex, '(...|^)', where
      the part before '|' is empty or made only of CR/LF escapes and whitespace
    - Directly after an unescaped '[' (negated character class)

    Returns True if all carets are in allowed positions (or there are none),
    False as soon as one caret is not.

    This is a textual check and not a full regex parser: a '^' that is not the
    first character of a character class is reported as misplaced.
    """
    # What may precede '|' inside the leading '(...|^)' group: textual \r / \n
    # escapes or real whitespace characters, nothing else.
    leading_filler = re.compile(r"(?:\\r|\\n|\s)*")
    last = len(regex) - 1

    for pos, char in enumerate(regex):
        if char != "^" or _is_escaped(regex, pos):
            continue
        if pos == 0:
            continue
        if pos == last:
            # A caret closing the regex can be neither '(|^)' nor '[^...]'.
            return False

        before, after = regex[pos - 1], regex[pos + 1]
        in_start_group = (
            before == "|"
            and after == ")"
            and regex[0] == "("
            and leading_filler.fullmatch(regex, 1, pos - 1) is not None
        )
        if in_start_group:
            continue
        if before == "[" and not _is_escaped(regex, pos - 1):
            continue
        return False

    return True
78+
7379

74-
# 3) Check no lazy quantifiers like *?, +?, ??, or {m,n}?
75-
# We do a simple regex search for them:
76-
# Patterns we search for: (*?), (+?), (??), ({\d+(,\d+)?}\?)
77-
lazy_pattern = re.compile(r"(\*\?|\+\?|\?\?|\{\d+(,\d+)?\}\?)")
78-
if lazy_pattern.search(regex):
79-
return False
80+
def check_zkregex_rules_basic(regex: str) -> tuple[bool, bool]:
    """
    Validate `regex` against the basic zk-regex restrictions.

    The checks run in this order and stop at the first failure:
      1) Every '^' is in an allowed position (see `correct_carret_position`).
      2) There are no lazy quantifiers such as '*?', '+?', '??' or '{m,n}?'.
      3) The regex has exactly one accepting state once converted to a DFA.

    Returns a pair `(correct, accepting_state_check)`:
      - (True, True): all three checks pass.
      - (False, True): check 1 or 2 failed; the DFA check was not performed,
        so its status is reported as True.
      - (False, False): checks 1 and 2 pass but the DFA check failed.
    """
    # Textual rules first; `and` keeps the original order and short-circuit.
    passes_textual_rules = correct_carret_position(regex) and not has_lazy_quantifier(regex)
    if not passes_textual_rules:
        return False, True  # the DFA check has not been performed

    if wrapped_has_one_accepting_state_regex(regex):
        return True, True
    return False, False
82102

83103

84104
def check_if_string_is_valid(regex: str, string: str) -> bool:

tests/test_utils.py

Lines changed: 129 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from zkregex_fuzzer.utils import is_valid_regex
1+
from zkregex_fuzzer.utils import is_valid_regex, has_lazy_quantifier, correct_carret_position, check_zkregex_rules_basic
22

33

44
def test_valid_regex():
@@ -26,3 +26,131 @@ def test_invalid_regex():
2626
]
2727
for pattern in invalid_patterns:
2828
assert not is_valid_regex(pattern), f"Expected {pattern} to be invalid"
29+
30+
31+
def test_has_lazy_quantifier():
    """Check has_lazy_quantifier against greedy and lazy sample patterns."""
    # Maps each sample pattern to whether it contains a lazy quantifier.
    expectations = {
        r"ab*c": False,
        r"a+?": True,
        r"(abc){2,5}?": True,
        r"xyz": False,
        r"[a-z]*": False,
        r".+?": True,
    }
    for pattern, expected in expectations.items():
        actual = has_lazy_quantifier(pattern)
        assert actual == expected, f"Expected {pattern} to have lazy quantifier: {expected}"
43+
44+
45+
def test_correct_carret_position():
    """
    Exercise correct_carret_position on anchors, leading groups, character
    classes, escaped carets and a few degenerate inputs.
    """
    # Each entry is (regex, expected result of correct_carret_position).
    cases = [
        # Plain anchors
        (r"^abc", True),  # caret at the very start
        (r"abc", True),  # no caret at all
        (r"abc^", False),  # caret at the end

        # Groups
        (r"(^abc)", False),  # caret right after '(' is not an accepted form
        (r"(|^)", True),  # empty alternative, then caret
        (r"(abc|^def)", False),  # caret followed by more text in the group
        (r"(|^)", True),  # same as above (kept from the original list)
        (r"(\n|^)", True),  # LF alternative
        (r"abc(\n|^)", False),  # group is not at the start of the regex
        (r"(\r|^)", True),  # CR alternative
        (r"(\r\n|^)", True),  # CR+LF alternative
        (r"(\n\r|^)", True),  # LF+CR alternative
        (r"( |^)", True),  # whitespace before the alternative

        # Character classes
        (r"[^abc]", True),  # negated class
        (r"abc[^xyz]def", True),  # negated class in the middle
        (r"[abc^]", False),  # caret not directly after '['
        (r"[[^]]", True),  # caret directly after an inner '['
        (r"[^]", True),  # negated class with no members

        # Several carets
        (r"^abc[^xyz]", True),  # start anchor plus negated class
        (r"^abc^", False),  # second caret at the end
        (r"[^abc][^xyz]", True),  # two negated classes

        # Edge cases
        (r"", True),  # empty regex
        (r"^", True),  # only a caret
        (r"[]^]", False),  # caret after ']' inside brackets
        (r"(^)|^", False),  # carets in a group and after '|'
        (r"(^abc|^def)", False),  # two carets inside one group

        # Combinations
        (r"(|^)abc[^xyz]123", True),  # leading group plus negated class
        (r"^abc[^xyz](|^)def", False),  # '(|^)' not at the start
        (r"[^abc]^[^xyz]", False),  # bare caret between two classes
        (r"( \r\n |^)abc", True),  # whitespace and CR/LF before the alternative

        # Escaped carets
        (r"abc\^", True),
        (r"abc\^def", True),
    ]
    for regex, expected in cases:
        actual = correct_carret_position(regex)
        assert actual == expected, f"Expected {regex} to have correct caret position: {expected}"
99+
100+
101+
def test_check_zkregex_rules_basic():
    """
    Exercise check_zkregex_rules_basic; every case pairs a regex with the
    expected (correct, accepting_state_check) tuple.
    """
    cases = [
        # 1. Dollar sign: its position is not restricted by the basic rules
        (r"abc$", (True, True)),  # dollar at the end
        (r"abc$def", (True, True)),  # dollar in the middle
        (r"abc", (True, True)),  # no dollar
        (r"$abc", (True, True)),  # dollar at the start

        # 2. Caret position
        (r"^abc", (True, True)),  # caret at the start
        (r"(|^)abc", (True, True)),  # caret in a leading alternative
        (r"(\r\n|^)abc", (True, True)),  # caret after a CR+LF alternative
        (r"[^abc]", (True, True)),  # caret negating a character class
        (r"abc^", (False, True)),  # caret at the end
        (r"abc^def", (False, True)),  # caret in the middle

        # 3. Lazy quantifiers
        (r"abc*", (True, True)),  # greedy star
        (r"abc*?", (False, True)),  # lazy star
        (r"abc+?", (False, True)),  # lazy plus
        (r"abc??", (False, True)),  # lazy question mark
        (r"abc{1,2}?", (False, True)),  # lazy range

        # 4. Combined accepted cases
        (r"^abc$", (True, True)),  # start and end anchors
        (r"(|^)abc$", (True, True)),  # leading alternative and end anchor
        (r"[^abc].*$", (True, True)),  # negated class and end anchor

        # 5. Combined cases mixing anchors and quantifiers
        (r"^abc$def", (True, True)),  # dollar in the middle with a start caret
        (r"abc^def$", (False, True)),  # misplaced caret with a dollar
        (r"[^abc]*?$", (False, True)),  # lazy quantifier with valid anchors

        # 6. Complex cases
        (r"(|^)abc[^xyz]*$", (True, True)),  # leading group, class, greedy star
        (r"^abc[^xyz]+def$", (True, True)),  # anchors with a greedy plus
        (r"(|^)abc*?[^xyz]$", (False, True)),  # lazy star inside a longer regex
        (r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", (True, True)),

        # 7. The common regexes from zkemail
        (r">[^<>]+<.*", (True, True)),
        (r"(\r\n|^)to:[^\r\n]+\r\n", (True, True)),
        (r"(\r\n|^)subject:[^\r\n]+\r\n", (True, True)),
        # The next two cases are currently disabled.
        #(r"[A-Za-z0-9!#$%&'*+=?\-\^_`{|}~.\/]+@[A-Za-z0-9.\-@]+", (True, True)),
        #(r"[A-Za-z0-9!#$%&'*+=?\-\^_`{|}~.\/@]+@[A-Za-z0-9.\-]+", (True, True)),
        (r"(\r\n|^)from:[^\r\n]+\r\n", (True, True)),
        (r"(\r\n|^)dkim-signature:([a-z]+=[^;]+; )+bh=[a-zA-Z0-9+/=]+;", (True, True)),
        (r"(\r\n|^)dkim-signature:([a-z]+=[^;]+; )+t=[0-9]+;", (True, True)),
        (r"(\r\n|^)message-id:<[A-Za-z0-9=@\.\+_-]+>\r\n", (True, True)),
    ]
    for regex, expected in cases:
        actual = check_zkregex_rules_basic(regex)
        assert actual == expected, f"Expected {regex} to have correct zk-regex rules: {expected}"

0 commit comments

Comments
 (0)