Use automata-lib to implement DFA-related functions

StefanosChaliasos · StefanosChaliasos · commit df946b0d1f48 · 2025-03-05T10:02:47.000+02:00
Implement functions to:
* transform a regex to a minimized DFA
* check if a DFA has multiple accepting states
diff --git a/pyproject.toml b/pyproject.toml
@@ -22,6 +22,7 @@ dependencies = [
   "fuzzingbook",
   "rstr",
   "exrex",
+  "automata-lib",
 ]
 
 [project.optional-dependencies]
diff --git a/src/zkregex_fuzzer/dfa.py b/src/zkregex_fuzzer/dfa.py
@@ -0,0 +1,42 @@
+"""
+dfa
+
+A number of functions for working with DFAs.
+"""
+
+from automata.fa.nfa import NFA
+from automata.fa.dfa import DFA
+
+def regex_to_dfa(regex: str) -> DFA:
+    """
+    Convert a regex to a DFA.
+    """
+    try:
+        nfa = NFA.from_regex(regex)
+    except Exception as e:
+        raise ValueError(f"Failed to parse '{regex}' into an automaton: {e}")
+    try:
+        return DFA.from_nfa(nfa, minify=True)
+    except Exception as e:
+        raise ValueError(f"Failed to convert NFA to DFA: {e}")
+
+def has_multiple_accepting_states_regex(regex: str) -> bool:
+    """
+    Returns True if converting the given regex to a DFA yields
+    multiple accepting (final) states. Returns False otherwise.
+
+    NOTE:
+      - Only handles a subset of regex syntax recognized by automata-lib.
+      - For advanced Python regex features, a custom NFA builder is needed.
+    """
+    dfa = regex_to_dfa(regex)
+    num_final_states = len(dfa.final_states)
+
+    return num_final_states > 1
+
+def has_multiple_accepting_states_dfa(dfa: DFA) -> bool:
+    """
+    Returns True if the given DFA has multiple accepting (final) states.
+    Returns False otherwise.
+    """
+    return len(dfa.final_states) > 1
diff --git a/tests/test_dfa.py b/tests/test_dfa.py
@@ -0,0 +1,38 @@
+from zkregex_fuzzer.dfa import has_multiple_accepting_states_regex
+
+
+def test_has_multiple_accepting_states_regex_without_multiple():
+    regex_without_multiple_accepting_states = [
+        r"(a|b)*",
+        r"abc",
+        r"(abc|def|ghi)",
+        r"(abc)*",
+        r"(hello)",
+        r"(ab)*",
+        r"(a|b|c)*",
+        r"((a|b|c)*abc)",
+        r"[a-zA-Z]+",
+        r"[0-9]+",
+        r"(abc|abcd|abcde)f",
+        r"(hello|helloo|hellooo)(foo|foob|fooba)?bar",
+        r"(foo|foob|fooba)?bar",
+        r"(abc|def)(gh|jk)(lm|nop)",
+    ]
+
+    for regex in regex_without_multiple_accepting_states:
+        assert not has_multiple_accepting_states_regex(regex)
+
+def test_has_multiple_accepting_states_regex_with_multiple():
+    regex_with_multiple_accepting_states = [
+        r"(ab|aba)",
+        r"(ab|aba)*",
+        r"(hello|hell)",
+        r"b(aa|aaa)",
+        r"(cat|cats)",
+        r"(xy|xyx)",
+        r"(a|ab|abc)",
+        r"(1|12)",
+    ]
+
+    for regex in regex_with_multiple_accepting_states:
+        assert has_multiple_accepting_states_regex(regex)

Original file line number	Diff line number	Diff line change
`@@ -22,6 +22,7 @@ dependencies = [`
`22`	`22`	`"fuzzingbook",`
`23`	`23`	`"rstr",`
`24`	`24`	`"exrex",`
	`25`	`+ "automata-lib",`
`25`	`26`	`]`
`26`	`27`
`27`	`28`	`[project.optional-dependencies]`