Update check_zkregex_rules_basic

StefanosChaliasos · StefanosChaliasos · commit 62c5ac39ae06 · 2025-03-07T14:58:58.000+02:00
diff --git a/src/zkregex_fuzzer/dfa.py b/src/zkregex_fuzzer/dfa.py
@@ -6,6 +6,7 @@
 
 import random
 import string
+import re
 from typing import Dict, Optional, Set
 
 from automata.fa.dfa import DFA
@@ -42,6 +43,37 @@ def has_multiple_accepting_states_regex(regex: str) -> bool:
     return num_final_states > 1
 
 
+def has_one_accepting_state_regex(regex: str) -> bool:
+    """
+    Returns True if converting the given regex to a DFA yields
+    exactly one accepting (final) state. Returns False otherwise.
+    """
+    dfa = regex_to_dfa(regex)
+    return len(dfa.final_states) == 1
+
+
+def wrapped_has_one_accepting_state_regex(regex: str) -> bool:
+    """
+    Returns True if the regex, with its start/end anchors stripped, converts
+    to a DFA with exactly one accepting (final) state. Returns False otherwise.
+
+    NOTE:
+      - automata-lib does not support a leading '^' or a trailing '$', so these
+      anchors are removed and the one-accepting-state check runs on the remainder.
+    """
+    if regex.startswith("^"):
+        regex = regex[1:]
+    # A leading '(|^)' group is also a start anchor; drop the whole group
+    if regex.startswith("(|^)"):
+        regex = regex[4:]
+    # Same for '(\r\n|^)...', '(\r|^)...', '(\n|^)...'; the alternation is grouped so only such a prefix matches
+    if re.match(r'\((?:\\r|\\n|\s)*\|\^\)', regex):
+        regex = regex[regex.find("^")+2:]
+    if regex.endswith("$"):
+        regex = regex[:-1]
+    return has_one_accepting_state_regex(regex)
+
+
 def has_multiple_accepting_states_dfa(dfa: DFA) -> bool:
     """
     Returns True if the given DFA has multiple accepting (final) states.
diff --git a/src/zkregex_fuzzer/regexgen.py b/src/zkregex_fuzzer/regexgen.py
@@ -54,7 +54,9 @@ def generate(self) -> str:
             regex = self.generate_unsafe()
             if not is_valid_regex(regex):
                 continue
-            if not check_zkregex_rules_basic(regex):
+            correct, accepting_state_check = check_zkregex_rules_basic(regex)
+            if not correct:
+                # TODO: We should try to fix the regex if it has multiple accepting states
                 continue
             logger.debug(f"Generated regex: {regex}")
             return regex
diff --git a/src/zkregex_fuzzer/utils.py b/src/zkregex_fuzzer/utils.py
@@ -7,7 +7,7 @@
 import string
 
 from fuzzingbook.Grammars import Grammar, simple_grammar_fuzzer
-
+from zkregex_fuzzer.dfa import wrapped_has_one_accepting_state_regex
 
 def is_valid_regex(regex: str) -> bool:
     """
@@ -20,65 +20,85 @@ def is_valid_regex(regex: str) -> bool:
         return False
 
 
-def check_zkregex_rules_basic(regex: str) -> bool:
+def has_lazy_quantifier(pattern: str) -> bool:
     """
-    Check partial zk-regex constraints with a text-based approach:
-      1) Must end with '$'
-      2) If '^' is present, it is either at index 0 or in substring '(|^)'
-      3) No lazy quantifiers like '*?' or '+?' or '??' or '{m,n}?'
-    Returns True if all checks pass, False otherwise.
-
-    TODO: DFA Checks -- code that actually compiles the regex to an automaton and verifies:
-        - No loop from initial state back to itself (i.e. no .*-like or equivalent)
-        - Only one accepting state
+    Returns True if `pattern` contains any lazy quantifiers (i.e., *?, +?, ??, or {m,n}?),
+    False otherwise.
+
+    This is a naive textual check and doesn't handle escaping inside character classes or
+    more advanced regex syntax. For most simple usage, however, it suffices.
     """
+    # Regex to search for the typical lazy quantifier patterns:
+    #   *?   +?   ??   {m,n}?
+    # We'll assume m,n are simple digit sets, e.g. {2,5}
+    lazy_check = re.compile(r'(\*\?)|(\+\?)|(\?\?)|\{\d+(,\d+)?\}\?')
+    
+    match = lazy_check.search(pattern)
+    return bool(match)
 
-    # 1) Must end with '$' (if it present)
-    if "$" in regex and not regex.endswith("$"):
-        return False
 
-    # 2) '^' must be at start or in '(|^)'
-    # We'll allow no '^' at all. If it appears, check positions.
-    # We'll define a function to find all occurrences of '^'.
-    allowed_positions = set()
-    # If the string starts with '^', that’s allowed
-    if len(regex) > 0 and regex[0] == "^":
-        allowed_positions.add(0)
-
-    # If the string contains '|^', that means '^' is at position (idx+1)
-    idx = 0
-    while True:
-        idx = regex.find("|^", idx)
-        if idx == -1:
-            break
-        # '^' occurs at (idx + 1)
-        allowed_positions.add(idx + 1)
-        idx += 2  # skip past
-
-    # If the string contains '[^]', that means '^' is at position (idx+1)
-    idx = 0
-    while True:
-        idx = regex.find("[^", idx)
-        if idx == -1:
-            break
-        # '^' occurs at (idx + 1)
-        allowed_positions.add(idx + 1)
-        idx += 2  # skip past
-
-    # Now see if there's any '^' outside those allowed positions
-    for match in re.finditer(r"\^", regex):
-        pos = match.start()
-        if pos not in allowed_positions:
+def correct_carret_position(regex: str) -> bool:
+    """
+    Check that every unescaped '^' in `regex` sits in an allowed position:
+        - At the start of the regex
+        - As the last alternative of a group opened at index 0: '(|^)' or a CR/LF variant of it
+        - Right after '[' (negated character class)
+    Returns True if all carets are in allowed positions (or there are none), False otherwise.
+
+    This is a naive textual check and doesn't handle escaping inside character classes or
+    more advanced regex syntax. For most simple usage, however, it suffices.
+    """
+    # Find all occurrences of '^' that are not preceded by a backslash
+    caret_positions = [match.start() for match in re.finditer(r'(?<!\\)\^', regex)]
+    if not caret_positions:
+        return True
+    # Every caret must pass one of the checks below; the first failure rejects the regex
+    status = False
+    for pos in caret_positions:
+        status = False
+        if pos == 0:
+            status = True
+            continue
+        # A '^' at the very end of the regex is never valid; it is the last caret, so False is returned
+        if pos+1 == len(regex) and len(regex) > 1:
+            continue
+        # The '^' may close a group opened at index 0: it must be preceded by '|' and followed by ')',
+        # and everything between '(' and '|' must be only \r / \n escapes or whitespace.
+        # fullmatch, not match: a zero-length prefix match would accept any content, e.g. '(abc|^)'.
+        if regex[pos-1] == '|' and regex[pos+1] == ')' and regex[0] == '(' and re.fullmatch(r'(?:\\r|\\n|\s)*', regex[1:pos-1]):
+            status = True
+            continue
+        # A '^' right after '[' negates a character class
+        if regex[pos-1] == '[':
+            status = True
+            continue
+        if status is False:
             return False
+    return status
+    
 
-    # 3) Check no lazy quantifiers like *?, +?, ??, or {m,n}?
-    # We do a simple regex search for them:
-    # Patterns we search for: (*?), (+?), (??), ({\d+(,\d+)?}\?)
-    lazy_pattern = re.compile(r"(\*\?|\+\?|\?\?|\{\d+(,\d+)?\}\?)")
-    if lazy_pattern.search(regex):
-        return False
+def check_zkregex_rules_basic(regex: str) -> tuple[bool, bool]:
+    """
+    Check partial zk-regex constraints with a text-based approach:
+      1) If '^' is present, it is either at index 0 or in substring '(|^)' or in (\r\n|^) or in substring '[^...]'
+      2) No lazy quantifiers like '*?' or '+?' or '??' or '{m,n}?'
+      3) Check that the regex has exactly one accepting state
+    Returns a tuple (all_checks_passed, accepting_state_check_ok); the second item is True whenever the DFA check was not reached.
+    Returns (True, True) if all checks pass, (False, True) if a textual check (1 or 2) fails, (False, False) if the regex does not have exactly one accepting state.
+    """
+    # 1) If '^' is present, it is either at index 0 or in substring '(|^)' or in (\r\n|^) or in substring '[^...]'
+    if not correct_carret_position(regex):
+        return False, True  # we return True as we haven't performed the DFA check
+
+    # 2) Check no lazy quantifiers like *?, +?, ??, or {m,n}?
+    if has_lazy_quantifier(regex):
+        return False, True  # we return True as we haven't performed the DFA check
+
+    # 3) Check that the regex has exactly one accepting state
+    if not wrapped_has_one_accepting_state_regex(regex):
+        return False, False  
 
-    return True
+    return True, True
 
 
 def check_if_string_is_valid(regex: str, string: str) -> bool:
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -1,4 +1,4 @@
-from zkregex_fuzzer.utils import is_valid_regex
+from zkregex_fuzzer.utils import is_valid_regex, has_lazy_quantifier, correct_carret_position, check_zkregex_rules_basic
 
 
 def test_valid_regex():
@@ -26,3 +26,131 @@ def test_invalid_regex():
     ]
     for pattern in invalid_patterns:
         assert not is_valid_regex(pattern), f"Expected {pattern} to be invalid"
+
+
+def test_has_lazy_quantifier():
+    """Test that has_lazy_quantifier returns True for patterns with lazy quantifiers."""
+    patterns = [
+        (r"ab*c", False),         
+        (r"a+?", True),         
+        (r"(abc){2,5}?", True),  
+        (r"xyz", False),          
+        (r"[a-z]*", False),       
+        (r".+?", True),          
+    ]
+    for pattern, expected in patterns:
+        assert has_lazy_quantifier(pattern) == expected, f"Expected {pattern} to have lazy quantifier: {expected}"
+
+
+def test_correct_carret_position():
+    """
+    Test the correct_carret_position function with various corner cases.
+    """
+    # Test cases with expected results
+    test_cases = [
+        # Basic cases
+        (r"^abc", True),          # Start of regex
+        (r"abc", True),           # No caret
+        (r"abc^", False),         # Invalid position at end
+        
+        # Capturing group cases
+        (r"(^abc)", False),        # Start of capturing group
+        (r"(|^)", True),          # Alternative with caret
+        (r"(abc|^def)", False),   # Caret in middle of alternative
+        (r"(|^)", True),          # Simple alternative with caret
+        (r"(\n|^)", True),        # Newline alternative
+        (r"abc(\n|^)", False),     # Not at start of regex
+        (r"(\r|^)", True),        # Carriage return alternative
+        (r"(\r\n|^)", True),      # CRLF alternative
+        (r"(\n\r|^)", True),      # CRLF alternative
+        (r"(  |^)", True),        # Spaces before alternative
+        
+        # Character class cases
+        (r"[^abc]", True),        # Simple negated character class
+        (r"abc[^xyz]def", True),  # Negated character class in middle
+        (r"[abc^]", False),       # Caret not at start of character class
+        (r"[[^]]", True),         # Nested character class
+        (r"[^]", True),           # Empty negated character class
+        
+        # Multiple caret cases
+        (r"^abc[^xyz]", True),    # Valid multiple carets
+        (r"^abc^", False),        # Invalid multiple carets
+        (r"[^abc][^xyz]", True),  # Multiple negated character classes
+        
+        # Edge cases
+        (r"", True),              # Empty string
+        (r"^", True),             # Just caret
+        (r"[]^]", False),         # Invalid character class
+        (r"(^)|^", False),        # Multiple start anchors
+        (r"(^abc|^def)", False),  # Multiple start anchors in group
+        
+        # Complex cases
+        (r"(|^)abc[^xyz]123", True),     # Combination of valid cases
+        (r"^abc[^xyz](|^)def", False),   # Invalid multiple start anchors
+        (r"[^abc]^[^xyz]", False),       # Invalid caret between character classes
+        (r"(  \r\n  |^)abc", True),      # Complex whitespace before alternative
+
+        # Escaped caret cases
+        (r"abc\^", True),
+        (r"abc\^def", True),
+    ]
+    for regex, expected in test_cases:
+        assert correct_carret_position(regex) == expected, f"Expected {regex} to have correct caret position: {expected}"
+
+
+def test_check_zkregex_rules_basic():
+    """
+    Test the check_zkregex_rules_basic function with various test cases.
+    """
+    # Test cases with expected results
+    test_cases = [
+        # 1. Dollar sign tests
+        (r"abc$", (True, True)),  # Valid dollar sign at end,
+        (r"abc$def", (True, True)),  # Valid dollar sign in middle
+        (r"abc", (True, True)),  # No dollar sign
+        (r"$abc", (True, True)),  # Dollar sign at start
+        
+        # 2. Caret position tests
+        (r"^abc", (True, True)),  # Valid caret at start
+        (r"(|^)abc", (True, True)),  # Valid caret in alternative
+        (r"(\r\n|^)abc", (True, True)),  # Valid caret with CRLF alternative
+        (r"[^abc]", (True, True)),  # Valid caret in character class
+        (r"abc^", (False, True)),  # Invalid caret at end
+        (r"abc^def", (False, True)),  # Invalid caret in middle
+        
+        # 3. Lazy quantifier tests
+        (r"abc*", (True, True)),  # Valid greedy quantifier
+        (r"abc*?", (False, True)),  # Invalid lazy star quantifier
+        (r"abc+?", (False, True)),  # Invalid lazy plus quantifier
+        (r"abc??", (False, True)),  # Invalid lazy question mark quantifier
+        (r"abc{1,2}?", (False, True)),  # Invalid lazy range quantifier
+        
+        # 4. Combined valid cases
+        (r"^abc$", (True, True)),  # Valid start and end anchors
+        (r"(|^)abc$", (True, True)),  # Valid alternative and end anchor
+        (r"[^abc].*$", (True, True)),  # Valid character class and end anchor
+        
+        # 5. Combined cases mixing '$' with other constructs (valid and invalid)
+        (r"^abc$def", (True, True)),  # Valid dollar position with caret
+        (r"abc^def$", (False, True)),  # Invalid caret with dollar
+        (r"[^abc]*?$", (False, True)),  # Invalid lazy quantifier with valid anchors
+        
+        # 6. Complex cases
+        (r"(|^)abc[^xyz]*$", (True, True)),  # Complex valid regex
+        (r"^abc[^xyz]+def$", (True, True)),  # Complex valid regex with quantifiers
+        (r"(|^)abc*?[^xyz]$", (False, True)),  # Complex invalid regex with lazy quantifier
+        (r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", (True, True)),
+
+        # 7. The common regexes from zkemail
+        (r">[^<>]+<.*", (True, True)),
+        (r"(\r\n|^)to:[^\r\n]+\r\n", (True, True)),
+        (r"(\r\n|^)subject:[^\r\n]+\r\n", (True, True)),
+        #(r"[A-Za-z0-9!#$%&'*+=?\-\^_`{|}~.\/]+@[A-Za-z0-9.\-@]+", (True, True)),
+        #(r"[A-Za-z0-9!#$%&'*+=?\-\^_`{|}~.\/@]+@[A-Za-z0-9.\-]+", (True, True)),
+        (r"(\r\n|^)from:[^\r\n]+\r\n", (True, True)),
+        (r"(\r\n|^)dkim-signature:([a-z]+=[^;]+; )+bh=[a-zA-Z0-9+/=]+;", (True, True)),
+        (r"(\r\n|^)dkim-signature:([a-z]+=[^;]+; )+t=[0-9]+;", (True, True)),
+        (r"(\r\n|^)message-id:<[A-Za-z0-9=@\.\+_-]+>\r\n", (True, True)),
+    ]
+    for regex, expected in test_cases:
+        assert check_zkregex_rules_basic(regex) == expected, f"Expected {regex} to have correct zk-regex rules: {expected}"