Add dfa input generator

StefanosChaliasos · StefanosChaliasos · commit ee46d1f61d12 · 2025-03-05T10:03:46.000+02:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -33,7 +33,7 @@ dev = [
 
 [tool.ruff]
 line-length = 88
-target-version = "py38"
+target-version = "py312"
 lint.select = ["E", "F", "W", "I"]
 lint.ignore = ["F401", "E501"]
 
diff --git a/src/zkregex_fuzzer/dfa.py b/src/zkregex_fuzzer/dfa.py
@@ -385,3 +385,102 @@ def generate_random_dfa(
         dfa = DFA.from_nfa(nfa, minify=True)
 
     return dfa
+
+
+def dfa_string_matching(
+    regex: str,
+    max_length: int = 10,
+) -> str:
+    """
+    Build a DFA for `regex` and return a randomly generated string it accepts.
+
+    `max_length` is the wanted (soft) length handed to the random walk. Raises
+    ValueError when no accepted string could be produced.
+    """
+    dfa = regex_to_dfa(regex)
+    # Per-state flag: can a final state still be reached from this state?
+    can_reach_accept = _compute_accept_reachability(dfa)
+    # A single walk can hit its internal step limit in a non-final state (for
+    # example with "(a|b|c)*abc"), which would make this function fail at
+    # random; retry a few times before giving up.
+    for _ in range(10):
+        s = _random_walk_dfa(dfa, can_reach_accept, max_length)
+        if s is not None:
+            return s
+    raise ValueError("Failed to generate a string that the DFA accepts.")
+
+
+def _compute_accept_reachability(dfa: DFA) -> dict:
+    """
+    Tell, for every state of `dfa`, whether some final state can be reached
+    from it (final states themselves count as reaching one).
+    Returns a dict: state -> bool
+    """
+    # Invert the transition relation: predecessors[t] lists every state that
+    # has a transition (on any symbol) into t.
+    predecessors = {state: [] for state in dfa.states}
+    for source in dfa.states:
+        for target in dfa.transitions[source].values():
+            predecessors[target].append(source)
+
+    reachable = {state: False for state in dfa.states}
+    # Seed the search with the final states, then flood backwards.
+    pending = list(dfa.final_states)
+    for final in pending:
+        reachable[final] = True
+
+    # Depth-first flood fill; the visiting order does not affect the result.
+    while pending:
+        state = pending.pop()
+        for previous in predecessors[state]:
+            if reachable[previous]:
+                continue
+            reachable[previous] = True
+            pending.append(previous)
+
+    return reachable
+
+
+def _random_walk_dfa(
+    dfa: DFA, can_reach_accept: dict, max_length: int
+) -> Optional[str]:
+    """
+    Walk `dfa` from its initial state, at each step picking uniformly at random
+    among the transitions whose destination can still reach a final state.
+
+    `can_reach_accept` maps each state to whether a final state is reachable
+    from it. `max_length` is a wanted length, not a hard limit: in a final
+    state the walk stops with probability 0.5 while fewer than `max_length`
+    symbols have been emitted, and always stops once that many have been.
+    At most 100 symbols are emitted in any case.
+
+    Return the accepted string, or None if the walk gets stuck or runs out of
+    steps in a non-final state.
+    """
+    # Bounds the walk so that cycles in the DFA cannot make it run forever.
+    step_cap = 100
+    state = dfa.initial_state
+    symbols = []
+    # One symbol is emitted per iteration, so len(symbols) is the step count.
+    while len(symbols) < step_cap:
+        accepting = state in dfa.final_states
+        # `or` short-circuits: no coin is flipped once max_length is reached.
+        if accepting and (len(symbols) >= max_length or random.random() < 0.5):
+            return "".join(symbols)
+
+        # Keep only moves that do not lead into a dead end.
+        candidates = [
+            (symbol, target)
+            for symbol, target in dfa.transitions[state].items()
+            if can_reach_accept[target]
+        ]
+        if not candidates:
+            # Stuck: the string built so far is valid only in a final state.
+            return "".join(symbols) if accepting else None
+
+        # Take one of the surviving moves at random and advance.
+        symbol, state = random.choice(candidates)
+        symbols.append(symbol)
+
+    # Step cap exhausted; the string counts only if we stopped in a final state.
+    return "".join(symbols) if state in dfa.final_states else None
diff --git a/tests/test_dfa.py b/tests/test_dfa.py
@@ -1,5 +1,6 @@
 from automata.regex.regex import isequal
 from zkregex_fuzzer.dfa import (
+    dfa_string_matching,
     generate_random_dfa,
     has_multiple_accepting_states_regex,
     regex_to_dfa,
@@ -17,26 +18,29 @@
     r"(a|ab|abc)",
     r"(1|12)",
 ]
+regex_without_multiple_accepting_states = [
+    r"(a|b)*",
+    r"abc",
+    r"(abc|def|ghi)",
+    r"(abc)*",
+    r"(hello)",
+    r"(ab)*",
+    r"(a|b|c)*",
+    r"((a|b|c)*abc)",  # This is somewhat complex; do we want to support this?
+    r"[a-zA-Z]+",
+    r"[0-9]+",
+    r"(abc|abcd|abcde)f",
+    r"(hello|helloo|hellooo)(foo|foob|fooba)?bar",
+    r"(foo|foob|fooba)?bar",
+    r"(abc|def)(gh|jk)(lm|nop)",
+]
+single_solution_regexes = [
+    r"abc",
+    r"(hello)",
+]
 
 
 def test_has_multiple_accepting_states_regex_without_multiple():
-    regex_without_multiple_accepting_states = [
-        r"(a|b)*",
-        r"abc",
-        r"(abc|def|ghi)",
-        r"(abc)*",
-        r"(hello)",
-        r"(ab)*",
-        r"(a|b|c)*",
-        r"((a|b|c)*abc)",
-        r"[a-zA-Z]+",
-        r"[0-9]+",
-        r"(abc|abcd|abcde)f",
-        r"(hello|helloo|hellooo)(foo|foob|fooba)?bar",
-        r"(foo|foob|fooba)?bar",
-        r"(abc|def)(gh|jk)(lm|nop)",
-    ]
-
     for regex in regex_without_multiple_accepting_states:
         assert not has_multiple_accepting_states_regex(regex)
 
@@ -79,10 +83,10 @@ def test_generate_dfa():
                 max_depth=10, use_unicode=False, single_final_state=True
             )
             regex_with_final = transform_dfa_to_regex(dfa_with_final)
+            dfa_from_regex_with_final = regex_to_dfa(regex_with_final)
             break
         except Exception:
             continue
-    dfa_from_regex_with_final = regex_to_dfa(regex_with_final)
     assert len(dfa_with_final.final_states) == 1
     assert len(dfa_from_regex_with_final.final_states) == 1
 
@@ -92,9 +96,21 @@ def test_generate_dfa():
                 max_depth=10, use_unicode=False, single_final_state=False
             )
             regex_without_final = transform_dfa_to_regex(dfa_without_final)
+            dfa_from_regex_without_final = regex_to_dfa(regex_without_final)
             break
         except Exception:
             continue
-    dfa_from_regex_without_final = regex_to_dfa(regex_without_final)
     assert len(dfa_without_final.final_states) >= 1
     assert len(dfa_from_regex_without_final.final_states) >= 1
+
+
+def test_dfa_string_matching():
+    """Generated strings must be accepted and vary when the language allows."""
+    for regex in regex_without_multiple_accepting_states:
+        dfa = regex_to_dfa(regex)
+        # 20 samples: a 5-retry loop spuriously failed ~1.6% of the time on
+        # starred regexes, where "" alone is produced with probability 0.5.
+        samples = [dfa_string_matching(regex) for _ in range(20)]
+        assert all(dfa.accepts_input(s) for s in samples)
+        if regex not in single_solution_regexes:
+            assert len(set(samples)) > 1