Fix an error in matching the entry pattern and apply linting

StefanosChaliasos · StefanosChaliasos · commit 2646afb62255 · 2025-03-07T14:58:58.000+02:00
diff --git a/src/zkregex_fuzzer/dfa.py b/src/zkregex_fuzzer/dfa.py
@@ -5,8 +5,8 @@
 """
 
 import random
-import string
 import re
+import string
 from typing import Dict, Optional, Set
 
 from automata.fa.dfa import DFA
@@ -64,11 +64,11 @@ def wrapped_has_one_accepting_state_regex(regex: str) -> bool:
     if regex.startswith("^"):
         regex = regex[1:]
     # There are also some more cases with "starting" "^"
-    if regex.startswith("(|^)"):
+    elif regex.startswith("(|^)"):
         regex = regex[4:]
     # Cases like '(\r\n|^)...', '(\r|^)...', '(\n|^)...'
-    if bool(re.match(r'^\([\\r\\n]*|\s*\|\^\).*', regex)):
-        regex = regex[regex.find("^")+2:]
+    elif bool(re.match(r"^\([\\r\\n]*\|\^\).*", regex)):
+        regex = regex[regex.find("^") + 2 :]
     if regex.endswith("$"):
         regex = regex[:-1]
     return has_one_accepting_state_regex(regex)
diff --git a/src/zkregex_fuzzer/regexgen.py b/src/zkregex_fuzzer/regexgen.py
@@ -24,6 +24,7 @@
 
 from zkregex_fuzzer.dfa import (
     generate_random_dfa,
+    regex_to_dfa,
     transform_dfa_to_regex,
 )
 from zkregex_fuzzer.logger import logger
diff --git a/src/zkregex_fuzzer/utils.py b/src/zkregex_fuzzer/utils.py
@@ -7,8 +7,10 @@
 import string
 
 from fuzzingbook.Grammars import Grammar, simple_grammar_fuzzer
+
 from zkregex_fuzzer.dfa import wrapped_has_one_accepting_state_regex
 
+
 def is_valid_regex(regex: str) -> bool:
     """
     Check if a regex is valid.
@@ -31,8 +33,8 @@ def has_lazy_quantifier(pattern: str) -> bool:
     # Regex to search for the typical lazy quantifier patterns:
     #   *?   +?   ??   {m,n}?
     # We'll assume m,n are simple digit sets, e.g. {2,5}
-    lazy_check = re.compile(r'(\*\?)|(\+\?)|(\?\?)|\{\d+(,\d+)?\}\?')
-    
+    lazy_check = re.compile(r"(\*\?)|(\+\?)|(\?\?)|\{\d+(,\d+)?\}\?")
+
     match = lazy_check.search(pattern)
     return bool(match)
 
@@ -49,7 +51,7 @@ def correct_carret_position(regex: str) -> bool:
     more advanced regex syntax. For most simple usage, however, it suffices.
     """
     # Find all occurrences of '^' that are not escaped
-    caret_positions = [match.start() for match in re.finditer(r'(?<!\\)\^', regex)]
+    caret_positions = [match.start() for match in re.finditer(r"(?<!\\)\^", regex)]
     if len(caret_positions) == 0:
         return True
     # Check each position
@@ -60,22 +62,27 @@ def correct_carret_position(regex: str) -> bool:
             status = True
             continue
         # We have '^' at the end of the regex
-        if pos+1 == len(regex) and len(regex) > 1:
+        if pos + 1 == len(regex) and len(regex) > 1:
             continue
         # Let's check if the '^' is in a group that is at the start of the regex
         # and before '^' there is a '|' and before '|' there is either nothing or \r or \n until
         # the beginning of the group
-        if regex[pos-1] == '|' and regex[pos+1] == ')' and regex[0] == '(' and bool(re.match(r'^\s*', regex[1:pos-1])):
+        if (
+            regex[pos - 1] == "|"
+            and regex[pos + 1] == ")"
+            and regex[0] == "("
+            and bool(re.match(r"^\s*", regex[1 : pos - 1]))
+        ):
             status = True
             continue
         # Let's check if the '^' is in a negated character class
-        if regex[pos-1] == '[':
+        if regex[pos - 1] == "[":
             status = True
             continue
         if status is False:
             return False
     return status
-    
+
 
 def check_zkregex_rules_basic(regex: str) -> tuple[bool, bool]:
     """
@@ -96,7 +103,7 @@ def check_zkregex_rules_basic(regex: str) -> tuple[bool, bool]:
 
     # 3) Check that the regex has exactly one accepting state
     if not wrapped_has_one_accepting_state_regex(regex):
-        return False, False  
+        return False, False
 
     return True, True
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -1,4 +1,9 @@
-from zkregex_fuzzer.utils import is_valid_regex, has_lazy_quantifier, correct_carret_position, check_zkregex_rules_basic
+from zkregex_fuzzer.utils import (
+    check_zkregex_rules_basic,
+    correct_carret_position,
+    has_lazy_quantifier,
+    is_valid_regex,
+)
 
 
 def test_valid_regex():
@@ -31,15 +36,17 @@ def test_invalid_regex():
 def test_has_lazy_quantifier():
     """Test that has_lazy_quantifier returns True for patterns with lazy quantifiers."""
     patterns = [
-        (r"ab*c", False),         
-        (r"a+?", True),         
-        (r"(abc){2,5}?", True),  
-        (r"xyz", False),          
-        (r"[a-z]*", False),       
-        (r".+?", True),          
+        (r"ab*c", False),
+        (r"a+?", True),
+        (r"(abc){2,5}?", True),
+        (r"xyz", False),
+        (r"[a-z]*", False),
+        (r".+?", True),
     ]
     for pattern, expected in patterns:
-        assert has_lazy_quantifier(pattern) == expected, f"Expected {pattern} to have lazy quantifier: {expected}"
+        assert has_lazy_quantifier(pattern) == expected, (
+            f"Expected {pattern} to have lazy quantifier: {expected}"
+        )
 
 
 def test_correct_carret_position():
@@ -49,53 +56,49 @@ def test_correct_carret_position():
     # Test cases with expected results
     test_cases = [
         # Basic cases
-        (r"^abc", True),          # Start of regex
-        (r"abc", True),           # No caret
-        (r"abc^", False),         # Invalid position at end
-        
+        (r"^abc", True),  # Start of regex
+        (r"abc", True),  # No caret
+        (r"abc^", False),  # Invalid position at end
         # Capturing group cases
-        (r"(^abc)", False),        # Start of capturing group
-        (r"(|^)", True),          # Alternative with caret
-        (r"(abc|^def)", False),   # Caret in middle of alternative
-        (r"(|^)", True),          # Simple alternative with caret
-        (r"(\n|^)", True),        # Newline alternative
-        (r"abc(\n|^)", False),     # Not at start of regex
-        (r"(\r|^)", True),        # Carriage return alternative
-        (r"(\r\n|^)", True),      # CRLF alternative
-        (r"(\n\r|^)", True),      # CRLF alternative
-        (r"(  |^)", True),        # Spaces before alternative
-        
+        (r"(^abc)", False),  # Start of capturing group
+        (r"(|^)", True),  # Alternative with caret
+        (r"(abc|^def)", False),  # Caret in middle of alternative
+        (r"(|^)", True),  # Simple alternative with caret
+        (r"(\n|^)", True),  # Newline alternative
+        (r"abc(\n|^)", False),  # Not at start of regex
+        (r"(\r|^)", True),  # Carriage return alternative
+        (r"(\r\n|^)", True),  # CRLF alternative
+        (r"(\n\r|^)", True),  # CRLF alternative
+        (r"(  |^)", True),  # Spaces before alternative
         # Character class cases
-        (r"[^abc]", True),        # Simple negated character class
+        (r"[^abc]", True),  # Simple negated character class
         (r"abc[^xyz]def", True),  # Negated character class in middle
-        (r"[abc^]", False),       # Caret not at start of character class
-        (r"[[^]]", True),         # Nested character class
-        (r"[^]", True),           # Empty negated character class
-        
+        (r"[abc^]", False),  # Caret not at start of character class
+        (r"[[^]]", True),  # Nested character class
+        (r"[^]", True),  # Empty negated character class
         # Multiple caret cases
-        (r"^abc[^xyz]", True),    # Valid multiple carets
-        (r"^abc^", False),        # Invalid multiple carets
+        (r"^abc[^xyz]", True),  # Valid multiple carets
+        (r"^abc^", False),  # Invalid multiple carets
         (r"[^abc][^xyz]", True),  # Multiple negated character classes
-        
         # Edge cases
-        (r"", True),              # Empty string
-        (r"^", True),             # Just caret
-        (r"[]^]", False),         # Invalid character class
-        (r"(^)|^", False),        # Multiple start anchors
+        (r"", True),  # Empty string
+        (r"^", True),  # Just caret
+        (r"[]^]", False),  # Invalid character class
+        (r"(^)|^", False),  # Multiple start anchors
         (r"(^abc|^def)", False),  # Multiple start anchors in group
-        
         # Complex cases
-        (r"(|^)abc[^xyz]123", True),     # Combination of valid cases
-        (r"^abc[^xyz](|^)def", False),   # Invalid multiple start anchors
-        (r"[^abc]^[^xyz]", False),       # Invalid caret between character classes
-        (r"(  \r\n  |^)abc", True),      # Complex whitespace before alternative
-
+        (r"(|^)abc[^xyz]123", True),  # Combination of valid cases
+        (r"^abc[^xyz](|^)def", False),  # Invalid multiple start anchors
+        (r"[^abc]^[^xyz]", False),  # Invalid caret between character classes
+        (r"(  \r\n  |^)abc", True),  # Complex whitespace before alternative
         # Escaped caret cases
         (r"abc\^", True),
         (r"abc\^def", True),
     ]
     for regex, expected in test_cases:
-        assert correct_carret_position(regex) == expected, f"Expected {regex} to have correct caret position: {expected}"
+        assert correct_carret_position(regex) == expected, (
+            f"Expected {regex} to have correct caret position: {expected}"
+        )
 
 
 def test_check_zkregex_rules_basic():
@@ -109,48 +112,47 @@ def test_check_zkregex_rules_basic():
         (r"abc$def", (True, True)),  # Valid dollar sign in middle
         (r"abc", (True, True)),  # No dollar sign
         (r"$abc", (True, True)),  # Dollar sign at start
-        
         # 2. Caret position tests
         (r"^abc", (True, True)),  # Valid caret at start
         (r"(|^)abc", (True, True)),  # Valid caret in alternative
         (r"(\r\n|^)abc", (True, True)),  # Valid caret with CRLF alternative
         (r"[^abc]", (True, True)),  # Valid caret in character class
         (r"abc^", (False, True)),  # Invalid caret at end
         (r"abc^def", (False, True)),  # Invalid caret in middle
-        
         # 3. Lazy quantifier tests
         (r"abc*", (True, True)),  # Valid greedy quantifier
         (r"abc*?", (False, True)),  # Invalid lazy star quantifier
         (r"abc+?", (False, True)),  # Invalid lazy plus quantifier
         (r"abc??", (False, True)),  # Invalid lazy question mark quantifier
         (r"abc{1,2}?", (False, True)),  # Invalid lazy range quantifier
-        
         # 4. Combined valid cases
         (r"^abc$", (True, True)),  # Valid start and end anchors
         (r"(|^)abc$", (True, True)),  # Valid alternative and end anchor
         (r"[^abc].*$", (True, True)),  # Valid character class and end anchor
-        
         # 5. Combined invalid cases
         (r"^abc$def", (True, True)),  # Valid dollar position with caret
         (r"abc^def$", (False, True)),  # Invalid caret with dollar
         (r"[^abc]*?$", (False, True)),  # Invalid lazy quantifier with valid anchors
-        
         # 6. Complex cases
         (r"(|^)abc[^xyz]*$", (True, True)),  # Complex valid regex
         (r"^abc[^xyz]+def$", (True, True)),  # Complex valid regex with quantifiers
-        (r"(|^)abc*?[^xyz]$", (False, True)),  # Complex invalid regex with lazy quantifier
+        (
+            r"(|^)abc*?[^xyz]$",
+            (False, True),
+        ),  # Complex invalid regex with lazy quantifier
         (r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", (True, True)),
-
         # 7. The common regexes from zkemail
         (r">[^<>]+<.*", (True, True)),
         (r"(\r\n|^)to:[^\r\n]+\r\n", (True, True)),
         (r"(\r\n|^)subject:[^\r\n]+\r\n", (True, True)),
-        #(r"[A-Za-z0-9!#$%&'*+=?\-\^_`{|}~.\/]+@[A-Za-z0-9.\-@]+", (True, True)),
-        #(r"[A-Za-z0-9!#$%&'*+=?\-\^_`{|}~.\/@]+@[A-Za-z0-9.\-]+", (True, True)),
+        # (r"[A-Za-z0-9!#$%&'*+=?\-\^_`{|}~.\/]+@[A-Za-z0-9.\-@]+", (True, True)),
+        # (r"[A-Za-z0-9!#$%&'*+=?\-\^_`{|}~.\/@]+@[A-Za-z0-9.\-]+", (True, True)),
         (r"(\r\n|^)from:[^\r\n]+\r\n", (True, True)),
         (r"(\r\n|^)dkim-signature:([a-z]+=[^;]+; )+bh=[a-zA-Z0-9+/=]+;", (True, True)),
         (r"(\r\n|^)dkim-signature:([a-z]+=[^;]+; )+t=[0-9]+;", (True, True)),
         (r"(\r\n|^)message-id:<[A-Za-z0-9=@\.\+_-]+>\r\n", (True, True)),
     ]
     for regex, expected in test_cases:
-        assert check_zkregex_rules_basic(regex) == expected, f"Expected {regex} to have correct zk-regex rules: {expected}"
+        assert check_zkregex_rules_basic(regex) == expected, (
+            f"Expected {regex} to have correct zk-regex rules: {expected}"
+        )

Original file line number	Diff line number	Diff line change
`@@ -24,6 +24,7 @@`
`24`	`24`
`25`	`25`	`from zkregex_fuzzer.dfa import (`
`26`	`26`	`generate_random_dfa,`
	`27`	`+ regex_to_dfa,`
`27`	`28`	`transform_dfa_to_regex,`
`28`	`29`	`)`
`29`	`30`	`from zkregex_fuzzer.logger import logger`