Add DFA generator

StefanosChaliasos · StefanosChaliasos · commit 86271744c52b · 2025-03-05T10:03:43.000+02:00
diff --git a/src/zkregex_fuzzer/cli.py b/src/zkregex_fuzzer/cli.py
@@ -9,7 +9,7 @@
 from pathlib import Path
 
 from zkregex_fuzzer.configs import GENERATORS, TARGETS, VALID_INPUT_GENERATORS
-from zkregex_fuzzer.fuzzer import fuzz_with_database, fuzz_with_grammar
+from zkregex_fuzzer.fuzzer import fuzz_with_database, fuzz_with_dfa, fuzz_with_grammar
 from zkregex_fuzzer.grammar import REGEX_GRAMMAR
 from zkregex_fuzzer.harness import HarnessStatus
 from zkregex_fuzzer.logger import logger
@@ -234,6 +234,14 @@ def do_fuzz(args):
             inputs_num=args.inputs_num,
             kwargs=kwargs,
         )
+    elif args.fuzzer == "dfa":
+        fuzz_with_dfa(
+            target_implementation=args.target,
+            oracle_params=(args.oracle == "valid", args.valid_input_generator),
+            regex_num=args.regex_num,
+            inputs_num=args.inputs_num,
+            kwargs=kwargs,
+        )
 
 
 def do_reproduce(args):
diff --git a/src/zkregex_fuzzer/configs.py b/src/zkregex_fuzzer/configs.py
@@ -1,5 +1,9 @@
 from zkregex_fuzzer.grammar import REGEX_GRAMMAR
-from zkregex_fuzzer.regexgen import DatabaseRegexGenerator, GrammarRegexGenerator
+from zkregex_fuzzer.regexgen import (
+    DatabaseRegexGenerator,
+    DFARegexGenerator,
+    GrammarRegexGenerator,
+)
 from zkregex_fuzzer.runner import CircomRunner, NoirRunner, PythonReRunner
 from zkregex_fuzzer.vinpgen import ExrexGenerator, GrammarBasedGenerator, RstrGenerator
 
@@ -22,4 +26,5 @@
 GENERATORS = {
     "grammar": GrammarRegexGenerator,
     "database": DatabaseRegexGenerator,
+    "dfa": DFARegexGenerator,  # regexes converted from randomly generated DFAs
 }
diff --git a/src/zkregex_fuzzer/dfa.py b/src/zkregex_fuzzer/dfa.py
@@ -5,6 +5,8 @@
 """
 
 import random
+import string
+from typing import Dict, Optional, Set
 
 from automata.fa.dfa import DFA
 from automata.fa.gnfa import GNFA
@@ -226,3 +228,160 @@ def transform_dfa_to_single_accepting_state(dfa: DFA, strategy: str = "random")
         )
     else:
         return _merge_strategy(states, alphabet, transitions, initial, original_finals)
+
+
+def _get_alphabet(
+    use_unicode: bool, num_states: int, min_size: int = 2, max_size: int = 10
+) -> Set[str]:
+    """
+    Return a random alphabet of min_size to max_size distinct symbols.
+
+    With use_unicode the symbols are arbitrary non-surrogate code points;
+    otherwise they are ASCII letters, digits, punctuation and whitespace,
+    capped at the pool size. num_states is accepted but not used.
+    """
+    alphabet_size = random.randint(min_size, max_size)
+    if use_unicode:
+        alphabet: Set[str] = set()
+        while len(alphabet) < alphabet_size:
+            codepoint = random.randint(0, 0x10FFFF)
+            # chr() accepts lone surrogates, which cannot be UTF-8 encoded: skip.
+            if 0xD800 <= codepoint <= 0xDFFF:
+                continue
+            alphabet.add(chr(codepoint))
+        return alphabet
+    # Restricted character set: letters, digits, punctuation, whitespace
+    allowed_pool = (
+        string.ascii_letters + string.digits + string.punctuation + string.whitespace
+    )
+    # random.sample raises ValueError when asked for more than the pool holds.
+    return set(random.sample(allowed_pool, min(alphabet_size, len(allowed_pool))))
+
+
+def generate_random_dfa(
+    max_depth: int = 5,
+    use_unicode: bool = False,
+    single_final_state: bool = False,
+    seed: Optional[int] = None,
+) -> DFA:
+    """
+    Generate a random, minimized DFA.
+
+    The automaton starts with between 1 and max_depth states named q0..qN
+    (q0 is the initial state), a random alphabet and a total transition
+    function. The transitions are then adjusted to add a self-loop, a
+    branching state and an alternative path to an accepting state where
+    these are missing and possible. Finally the DFA is minimized.
+
+    Args:
+        max_depth: upper bound on the number of states before minimization.
+        use_unicode: forwarded to _get_alphabet to pick the symbol pool.
+        single_final_state: pick exactly one accepting state; otherwise a
+            random non-empty subset is picked and q0 is made accepting too.
+        seed: if given, seeds the global random module so that the same
+            seed yields the same states, alphabet and transitions.
+
+    Returns:
+        The minimized DFA.
+    """
+    # Seed only on request. Drawing a fresh seed from the RNG and re-seeding
+    # with it (when no seed is given) would add nothing, so it is not done.
+    if seed is not None:
+        random.seed(seed)
+
+    num_states = random.randint(1, max_depth)
+
+    # State names q0..qN as an ordered list. Choosing from list(set_of_str)
+    # would depend on string hash randomization and defeat the seed.
+    state_names = [f"q{i}" for i in range(num_states)]
+    states = set(state_names)
+    initial_state = state_names[0]
+
+    # Determine final state(s)
+    if single_final_state:
+        final_states = {random.choice(state_names)}
+    else:
+        # One or more final states (randomly chosen subset of states)
+        num_finals = random.randint(1, num_states)  # at least one final
+        final_states = set(random.sample(state_names, num_finals))
+
+    alphabet = _get_alphabet(use_unicode, num_states)
+    symbols = sorted(alphabet)  # fixed order, for the same reason as above
+
+    # Total transition function: a random next state per (state, symbol)
+    transitions: Dict[str, Dict[str, str]] = {
+        state: {sym: random.choice(state_names) for sym in symbols}
+        for state in state_names
+    }
+
+    # Ensure at least one self-loop (cycle)
+    loop_exists = any(state in transitions[state].values() for state in state_names)
+    if not loop_exists:
+        # Add a self-loop on a random state with a random symbol
+        some_state = random.choice(state_names)
+        transitions[some_state][random.choice(symbols)] = some_state
+
+    # Ensure at least one branching point (one state with two different
+    # outgoing targets). This needs two symbols and a second state.
+    branching_exists = any(len(set(transitions[s].values())) >= 2 for s in state_names)
+    if not branching_exists and len(symbols) >= 2:
+        sym1, sym2 = symbols[0], symbols[1]
+        # No state branches, so sym1 and sym2 share a target from q0;
+        # send sym2 to any other state (none exists if there is one state).
+        shared_target = transitions[initial_state][sym1]
+        possible_targets = [s for s in state_names if s != shared_target]
+        if possible_targets:
+            transitions[initial_state][sym2] = random.choice(possible_targets)
+
+    # Introduce an "optional" path (allow skipping or taking a symbol) by
+    # creating an alternate route to a final state.
+    if single_final_state and num_states > 1:
+        # For a single final state, ensure multiple paths (direct & indirect)
+        final_state = next(iter(final_states))  # the only final state
+        start_row = transitions[initial_state]
+        # Direct path: give q0 a transition straight to the final state
+        if final_state not in start_row.values():
+            start_row[random.choice(symbols)] = final_state
+        # Indirect path: take the first symbol that leads from q0 to a
+        # non-final state and link that intermediate state to the final state
+        for sym in symbols:
+            intermediate_state = start_row[sym]
+            if intermediate_state != final_state:
+                transitions[intermediate_state][random.choice(symbols)] = final_state
+                break
+    elif not single_final_state:
+        # Multiple finals are allowed, so the start state is made accepting
+        # as well (accept the empty string / early termination)
+        final_states.add(initial_state)
+
+    # Construct the DFA with the generated components
+    dfa = DFA(
+        states=states,
+        input_symbols=alphabet,
+        transitions=transitions,
+        initial_state=initial_state,
+        final_states=final_states,
+    )
+
+    # Minimize the DFA for a simpler equivalent automaton
+    try:
+        # If automata-lib provides a direct minification method
+        return dfa.minify()
+    except AttributeError:
+        # Fallback: convert to NFA and use DFA.from_nfa for minimization.
+        # Imported locally so the fallback does not rely on a module-level
+        # NFA import.
+        from automata.fa.nfa import NFA
+
+        nfa = NFA(
+            states=states,
+            input_symbols=alphabet,
+            # Each DFA transition becomes a singleton set in the NFA
+            transitions={
+                state: {sym: {dest} for sym, dest in row.items()}
+                for state, row in transitions.items()
+            },
+            initial_state=initial_state,
+            final_states=final_states,
+        )
+        return DFA.from_nfa(nfa, minify=True)
diff --git a/src/zkregex_fuzzer/fuzzer.py b/src/zkregex_fuzzer/fuzzer.py
@@ -7,7 +7,11 @@
 from zkregex_fuzzer.configs import GRAMMARS, TARGETS, VALID_INPUT_GENERATORS
 from zkregex_fuzzer.harness import HarnessStatus, harness
 from zkregex_fuzzer.logger import logger
-from zkregex_fuzzer.regexgen import DatabaseRegexGenerator, GrammarRegexGenerator
+from zkregex_fuzzer.regexgen import (
+    DatabaseRegexGenerator,
+    DFARegexGenerator,
+    GrammarRegexGenerator,
+)
 from zkregex_fuzzer.runner import PythonReRunner
 from zkregex_fuzzer.runner.base_runner import Runner
 from zkregex_fuzzer.transformers import regex_to_grammar
@@ -55,6 +59,25 @@ def fuzz_with_database(
     fuzz_with_regexes(regexes, inputs_num, target_runner, oracle_params, kwargs)
 
 
+def fuzz_with_dfa(
+    target_implementation: str,
+    oracle_params: tuple[bool, str],
+    regex_num: int,
+    inputs_num: int,
+    kwargs: dict,
+):
+    """
+    Fuzz the selected target with regexes derived from random DFAs.
+    """
+    # Resolve the runner before generating anything.
+    runner = TARGETS[target_implementation]
+
+    # Build the whole batch of regexes, then hand it to the shared fuzz loop.
+    regexes = DFARegexGenerator().generate_many(regex_num)
+    logger.info(f"Generated {len(regexes)} regexes.")
+    fuzz_with_regexes(regexes, inputs_num, runner, oracle_params, kwargs)
+
+
 def fuzz_with_regexes(
     regexes: list[str],
     inputs_num: int,
diff --git a/src/zkregex_fuzzer/regexgen.py b/src/zkregex_fuzzer/regexgen.py
@@ -22,6 +22,10 @@
 
 from fuzzingbook.Grammars import Grammar
 
+from zkregex_fuzzer.dfa import (
+    generate_random_dfa,
+    transform_dfa_to_regex,
+)
 from zkregex_fuzzer.logger import logger
 from zkregex_fuzzer.utils import (
     check_zkregex_rules_basic,
@@ -144,3 +148,33 @@ def generate_many(self, num):
                         break
 
             return result
+
+
+class DFARegexGenerator(RegexGenerator):
+    """Generate regexes by converting randomly generated DFAs."""
+
+    # Retry budget of generate_unsafe; persistent failures (e.g. max_depth < 1)
+    # would otherwise retry forever.
+    MAX_ATTEMPTS = 1000
+
+    def __init__(
+        self,
+        max_depth: int = 5,
+        use_unicode: bool = False,
+        single_final_state: bool = True,
+    ):
+        self.max_depth = max_depth
+        self.use_unicode = use_unicode
+        self.single_final_state = single_final_state
+
+    def generate_unsafe(self) -> str:
+        """Return the regex of a fresh random DFA; RuntimeError if all tries fail."""
+        args = (self.max_depth, self.use_unicode, self.single_final_state)
+        last_error = None
+        for _ in range(self.MAX_ATTEMPTS):
+            try:
+                return transform_dfa_to_regex(generate_random_dfa(*args))
+            except Exception as e:
+                logger.debug(f"Error generating DFA: {e}")
+                last_error = e
+        raise RuntimeError("could not generate a DFA-based regex") from last_error
diff --git a/tests/test_dfa.py b/tests/test_dfa.py
@@ -1,5 +1,6 @@
 from automata.regex.regex import isequal
 from zkregex_fuzzer.dfa import (
+    generate_random_dfa,
     has_multiple_accepting_states_regex,
     regex_to_dfa,
     transform_dfa_to_regex,
@@ -69,3 +70,31 @@ def test_transform_dfa_to_regex_with_multiple_accepting_states():
             transformed_regex = transform_dfa_to_regex(transformed_dfa)
             new_dfa = regex_to_dfa(transformed_regex)
             assert len(new_dfa.final_states) == 1
+
+
+def test_generate_dfa():
+    """Round-trip random DFAs through regexes and check the accepting states."""
+    for single_final_state in (True, False):
+        # Generation or conversion may raise for some random DFAs, so retry;
+        # the retries are bounded so that a persistent failure fails the test
+        # instead of hanging it.
+        for _ in range(100):
+            try:
+                dfa = generate_random_dfa(
+                    max_depth=10,
+                    use_unicode=False,
+                    single_final_state=single_final_state,
+                )
+                regex = transform_dfa_to_regex(dfa)
+                break
+            except Exception:
+                continue
+        else:
+            raise AssertionError("no random DFA could be converted to a regex")
+        dfa_from_regex = regex_to_dfa(regex)
+        if single_final_state:
+            assert len(dfa.final_states) == 1
+            assert len(dfa_from_regex.final_states) == 1
+        else:
+            assert len(dfa.final_states) >= 1
+            assert len(dfa_from_regex.final_states) >= 1