Skip to content

Commit 2646afb

Browse files
Fix an error in matching entry pattern and linting
1 parent 62c5ac3 commit 2646afb

File tree

4 files changed

+74
-64
lines changed

4 files changed

+74
-64
lines changed

src/zkregex_fuzzer/dfa.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
"""
66

77
import random
8-
import string
98
import re
9+
import string
1010
from typing import Dict, Optional, Set
1111

1212
from automata.fa.dfa import DFA
@@ -64,11 +64,11 @@ def wrapped_has_one_accepting_state_regex(regex: str) -> bool:
6464
if regex.startswith("^"):
6565
regex = regex[1:]
6666
# There are also some more cases with "starting" "^"
67-
if regex.startswith("(|^)"):
67+
elif regex.startswith("(|^)"):
6868
regex = regex[4:]
6969
# Cases like '(\r\n|^)...', '(\r|^)...', '(\n|^)...'
70-
if bool(re.match(r'^\([\\r\\n]*|\s*\|\^\).*', regex)):
71-
regex = regex[regex.find("^")+2:]
70+
elif bool(re.match(r"^\([\\r\\n]*\|\^\).*", regex)):
71+
regex = regex[regex.find("^") + 2 :]
7272
if regex.endswith("$"):
7373
regex = regex[:-1]
7474
return has_one_accepting_state_regex(regex)

src/zkregex_fuzzer/regexgen.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
from zkregex_fuzzer.dfa import (
2626
generate_random_dfa,
27+
regex_to_dfa,
2728
transform_dfa_to_regex,
2829
)
2930
from zkregex_fuzzer.logger import logger

src/zkregex_fuzzer/utils.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@
77
import string
88

99
from fuzzingbook.Grammars import Grammar, simple_grammar_fuzzer
10+
1011
from zkregex_fuzzer.dfa import wrapped_has_one_accepting_state_regex
1112

13+
1214
def is_valid_regex(regex: str) -> bool:
1315
"""
1416
Check if a regex is valid.
@@ -31,8 +33,8 @@ def has_lazy_quantifier(pattern: str) -> bool:
3133
# Regex to search for the typical lazy quantifier patterns:
3234
# *? +? ?? {m,n}?
3335
# We'll assume m,n are simple digit sets, e.g. {2,5}
34-
lazy_check = re.compile(r'(\*\?)|(\+\?)|(\?\?)|\{\d+(,\d+)?\}\?')
35-
36+
lazy_check = re.compile(r"(\*\?)|(\+\?)|(\?\?)|\{\d+(,\d+)?\}\?")
37+
3638
match = lazy_check.search(pattern)
3739
return bool(match)
3840

@@ -49,7 +51,7 @@ def correct_carret_position(regex: str) -> bool:
4951
more advanced regex syntax. For most simple usage, however, it suffices.
5052
"""
5153
# Find all occurrences of '^' that are not escaped
52-
caret_positions = [match.start() for match in re.finditer(r'(?<!\\)\^', regex)]
54+
caret_positions = [match.start() for match in re.finditer(r"(?<!\\)\^", regex)]
5355
if len(caret_positions) == 0:
5456
return True
5557
# Check each position
@@ -60,22 +62,27 @@ def correct_carret_position(regex: str) -> bool:
6062
status = True
6163
continue
6264
# We have '^' at the end of the regex
63-
if pos+1 == len(regex) and len(regex) > 1:
65+
if pos + 1 == len(regex) and len(regex) > 1:
6466
continue
6567
# Let's check if the '^' is in a group that is at the start of the regex
6668
# and before '^' there is a '|' and before '|' there is either nothing or \r or \n until
6769
# the beginning of the group
68-
if regex[pos-1] == '|' and regex[pos+1] == ')' and regex[0] == '(' and bool(re.match(r'^\s*', regex[1:pos-1])):
70+
if (
71+
regex[pos - 1] == "|"
72+
and regex[pos + 1] == ")"
73+
and regex[0] == "("
74+
and bool(re.match(r"^\s*", regex[1 : pos - 1]))
75+
):
6976
status = True
7077
continue
7178
# Let's check if the '^' is in a negated character class
72-
if regex[pos-1] == '[':
79+
if regex[pos - 1] == "[":
7380
status = True
7481
continue
7582
if status is False:
7683
return False
7784
return status
78-
85+
7986

8087
def check_zkregex_rules_basic(regex: str) -> tuple[bool, bool]:
8188
"""
@@ -96,7 +103,7 @@ def check_zkregex_rules_basic(regex: str) -> tuple[bool, bool]:
96103

97104
# 3) Check that the regex has exactly one accepting state
98105
if not wrapped_has_one_accepting_state_regex(regex):
99-
return False, False
106+
return False, False
100107

101108
return True, True
102109

tests/test_utils.py

Lines changed: 54 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
1-
from zkregex_fuzzer.utils import is_valid_regex, has_lazy_quantifier, correct_carret_position, check_zkregex_rules_basic
1+
from zkregex_fuzzer.utils import (
2+
check_zkregex_rules_basic,
3+
correct_carret_position,
4+
has_lazy_quantifier,
5+
is_valid_regex,
6+
)
27

38

49
def test_valid_regex():
@@ -31,15 +36,17 @@ def test_invalid_regex():
3136
def test_has_lazy_quantifier():
3237
"""Test that has_lazy_quantifier returns True for patterns with lazy quantifiers."""
3338
patterns = [
34-
(r"ab*c", False),
35-
(r"a+?", True),
36-
(r"(abc){2,5}?", True),
37-
(r"xyz", False),
38-
(r"[a-z]*", False),
39-
(r".+?", True),
39+
(r"ab*c", False),
40+
(r"a+?", True),
41+
(r"(abc){2,5}?", True),
42+
(r"xyz", False),
43+
(r"[a-z]*", False),
44+
(r".+?", True),
4045
]
4146
for pattern, expected in patterns:
42-
assert has_lazy_quantifier(pattern) == expected, f"Expected {pattern} to have lazy quantifier: {expected}"
47+
assert has_lazy_quantifier(pattern) == expected, (
48+
f"Expected {pattern} to have lazy quantifier: {expected}"
49+
)
4350

4451

4552
def test_correct_carret_position():
@@ -49,53 +56,49 @@ def test_correct_carret_position():
4956
# Test cases with expected results
5057
test_cases = [
5158
# Basic cases
52-
(r"^abc", True), # Start of regex
53-
(r"abc", True), # No caret
54-
(r"abc^", False), # Invalid position at end
55-
59+
(r"^abc", True), # Start of regex
60+
(r"abc", True), # No caret
61+
(r"abc^", False), # Invalid position at end
5662
# Capturing group cases
57-
(r"(^abc)", False), # Start of capturing group
58-
(r"(|^)", True), # Alternative with caret
59-
(r"(abc|^def)", False), # Caret in middle of alternative
60-
(r"(|^)", True), # Simple alternative with caret
61-
(r"(\n|^)", True), # Newline alternative
62-
(r"abc(\n|^)", False), # Not at start of regex
63-
(r"(\r|^)", True), # Carriage return alternative
64-
(r"(\r\n|^)", True), # CRLF alternative
65-
(r"(\n\r|^)", True), # CRLF alternative
66-
(r"( |^)", True), # Spaces before alternative
67-
63+
(r"(^abc)", False), # Start of capturing group
64+
(r"(|^)", True), # Alternative with caret
65+
(r"(abc|^def)", False), # Caret in middle of alternative
66+
(r"(|^)", True), # Simple alternative with caret
67+
(r"(\n|^)", True), # Newline alternative
68+
(r"abc(\n|^)", False), # Not at start of regex
69+
(r"(\r|^)", True), # Carriage return alternative
70+
(r"(\r\n|^)", True), # CRLF alternative
71+
(r"(\n\r|^)", True), # CRLF alternative
72+
(r"( |^)", True), # Spaces before alternative
6873
# Character class cases
69-
(r"[^abc]", True), # Simple negated character class
74+
(r"[^abc]", True), # Simple negated character class
7075
(r"abc[^xyz]def", True), # Negated character class in middle
71-
(r"[abc^]", False), # Caret not at start of character class
72-
(r"[[^]]", True), # Nested character class
73-
(r"[^]", True), # Empty negated character class
74-
76+
(r"[abc^]", False), # Caret not at start of character class
77+
(r"[[^]]", True), # Nested character class
78+
(r"[^]", True), # Empty negated character class
7579
# Multiple caret cases
76-
(r"^abc[^xyz]", True), # Valid multiple carets
77-
(r"^abc^", False), # Invalid multiple carets
80+
(r"^abc[^xyz]", True), # Valid multiple carets
81+
(r"^abc^", False), # Invalid multiple carets
7882
(r"[^abc][^xyz]", True), # Multiple negated character classes
79-
8083
# Edge cases
81-
(r"", True), # Empty string
82-
(r"^", True), # Just caret
83-
(r"[]^]", False), # Invalid character class
84-
(r"(^)|^", False), # Multiple start anchors
84+
(r"", True), # Empty string
85+
(r"^", True), # Just caret
86+
(r"[]^]", False), # Invalid character class
87+
(r"(^)|^", False), # Multiple start anchors
8588
(r"(^abc|^def)", False), # Multiple start anchors in group
86-
8789
# Complex cases
88-
(r"(|^)abc[^xyz]123", True), # Combination of valid cases
89-
(r"^abc[^xyz](|^)def", False), # Invalid multiple start anchors
90-
(r"[^abc]^[^xyz]", False), # Invalid caret between character classes
91-
(r"( \r\n |^)abc", True), # Complex whitespace before alternative
92-
90+
(r"(|^)abc[^xyz]123", True), # Combination of valid cases
91+
(r"^abc[^xyz](|^)def", False), # Invalid multiple start anchors
92+
(r"[^abc]^[^xyz]", False), # Invalid caret between character classes
93+
(r"( \r\n |^)abc", True), # Complex whitespace before alternative
9394
# Escaped caret cases
9495
(r"abc\^", True),
9596
(r"abc\^def", True),
9697
]
9798
for regex, expected in test_cases:
98-
assert correct_carret_position(regex) == expected, f"Expected {regex} to have correct caret position: {expected}"
99+
assert correct_carret_position(regex) == expected, (
100+
f"Expected {regex} to have correct caret position: {expected}"
101+
)
99102

100103

101104
def test_check_zkregex_rules_basic():
@@ -109,48 +112,47 @@ def test_check_zkregex_rules_basic():
109112
(r"abc$def", (True, True)), # Valid dollar sign in middle
110113
(r"abc", (True, True)), # No dollar sign
111114
(r"$abc", (True, True)), # Dollar sign at start
112-
113115
# 2. Caret position tests
114116
(r"^abc", (True, True)), # Valid caret at start
115117
(r"(|^)abc", (True, True)), # Valid caret in alternative
116118
(r"(\r\n|^)abc", (True, True)), # Valid caret with CRLF alternative
117119
(r"[^abc]", (True, True)), # Valid caret in character class
118120
(r"abc^", (False, True)), # Invalid caret at end
119121
(r"abc^def", (False, True)), # Invalid caret in middle
120-
121122
# 3. Lazy quantifier tests
122123
(r"abc*", (True, True)), # Valid greedy quantifier
123124
(r"abc*?", (False, True)), # Invalid lazy star quantifier
124125
(r"abc+?", (False, True)), # Invalid lazy plus quantifier
125126
(r"abc??", (False, True)), # Invalid lazy question mark quantifier
126127
(r"abc{1,2}?", (False, True)), # Invalid lazy range quantifier
127-
128128
# 4. Combined valid cases
129129
(r"^abc$", (True, True)), # Valid start and end anchors
130130
(r"(|^)abc$", (True, True)), # Valid alternative and end anchor
131131
(r"[^abc].*$", (True, True)), # Valid character class and end anchor
132-
133132
# 5. Combined invalid cases
134133
(r"^abc$def", (True, True)), # Valid dollar position with caret
135134
(r"abc^def$", (False, True)), # Invalid caret with dollar
136135
(r"[^abc]*?$", (False, True)), # Invalid lazy quantifier with valid anchors
137-
138136
# 6. Complex cases
139137
(r"(|^)abc[^xyz]*$", (True, True)), # Complex valid regex
140138
(r"^abc[^xyz]+def$", (True, True)), # Complex valid regex with quantifiers
141-
(r"(|^)abc*?[^xyz]$", (False, True)), # Complex invalid regex with lazy quantifier
139+
(
140+
r"(|^)abc*?[^xyz]$",
141+
(False, True),
142+
), # Complex invalid regex with lazy quantifier
142143
(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", (True, True)),
143-
144144
# 7. The common regexes from zkemail
145145
(r">[^<>]+<.*", (True, True)),
146146
(r"(\r\n|^)to:[^\r\n]+\r\n", (True, True)),
147147
(r"(\r\n|^)subject:[^\r\n]+\r\n", (True, True)),
148-
#(r"[A-Za-z0-9!#$%&'*+=?\-\^_`{|}~.\/]+@[A-Za-z0-9.\-@]+", (True, True)),
149-
#(r"[A-Za-z0-9!#$%&'*+=?\-\^_`{|}~.\/@]+@[A-Za-z0-9.\-]+", (True, True)),
148+
# (r"[A-Za-z0-9!#$%&'*+=?\-\^_`{|}~.\/]+@[A-Za-z0-9.\-@]+", (True, True)),
149+
# (r"[A-Za-z0-9!#$%&'*+=?\-\^_`{|}~.\/@]+@[A-Za-z0-9.\-]+", (True, True)),
150150
(r"(\r\n|^)from:[^\r\n]+\r\n", (True, True)),
151151
(r"(\r\n|^)dkim-signature:([a-z]+=[^;]+; )+bh=[a-zA-Z0-9+/=]+;", (True, True)),
152152
(r"(\r\n|^)dkim-signature:([a-z]+=[^;]+; )+t=[0-9]+;", (True, True)),
153153
(r"(\r\n|^)message-id:<[A-Za-z0-9=@\.\+_-]+>\r\n", (True, True)),
154154
]
155155
for regex, expected in test_cases:
156-
assert check_zkregex_rules_basic(regex) == expected, f"Expected {regex} to have correct zk-regex rules: {expected}"
156+
assert check_zkregex_rules_basic(regex) == expected, (
157+
f"Expected {regex} to have correct zk-regex rules: {expected}"
158+
)

0 commit comments

Comments
 (0)