Skip to content

Commit ee46d1f

Browse files
Add dfa input generator
1 parent 8627174 commit ee46d1f

File tree

3 files changed

+135
-20
lines changed

3 files changed

+135
-20
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ dev = [
3333

3434
[tool.ruff]
3535
line-length = 88
36-
target-version = "py38"
36+
target-version = "py312"
3737
lint.select = ["E", "F", "W", "I"]
3838
lint.ignore = ["F401", "E501"]
3939

src/zkregex_fuzzer/dfa.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,3 +385,102 @@ def generate_random_dfa(
385385
dfa = DFA.from_nfa(nfa, minify=True)
386386

387387
return dfa
388+
389+
390+
def dfa_string_matching(
    regex: str,
    max_length: int = 10,
) -> str:
    """
    Generate a random string that is accepted by `regex`.

    The regex is compiled to a DFA with `regex_to_dfa`. A random walk over that
    DFA, restricted to states from which a final state is still reachable,
    then spells out the result. `max_length` is forwarded to the walk as the
    wanted length of the string.

    Raises ValueError when the walk does not produce a string.
    """
    automaton = regex_to_dfa(regex)

    # Per-state flag: can a final state still be reached from here?
    accept_reachable = _compute_accept_reachability(automaton)

    generated = _random_walk_dfa(automaton, accept_reachable, max_length)
    if generated is not None:
        return generated
    raise ValueError("Failed to generate a string that the DFA accepts.")
411+
412+
413+
def _compute_accept_reachability(dfa: DFA) -> dict:
    """
    For each state of `dfa`, decide whether a final state can be reached from it.

    The search runs backwards: starting from the final states, it repeatedly
    follows transitions in reverse and marks every state it meets.

    Returns a dict: state -> bool, with one entry per state in `dfa.states`.
    Final states map to True (they reach themselves with the empty string).
    """
    # Reverse adjacency: predecessors[t] is the set of states that have at
    # least one transition into t. Symbols are irrelevant for reachability,
    # and a set collapses the parallel edges that character classes create.
    predecessors = {state: set() for state in dfa.states}
    for state in dfa.states:
        for dest in dfa.transitions[state].values():
            predecessors[dest].add(state)

    # Backward graph search seeded with the final states.
    reachable = set(dfa.final_states)
    pending = list(reachable)
    while pending:
        current = pending.pop()
        for prev_state in predecessors[current]:
            if prev_state not in reachable:
                reachable.add(prev_state)
                pending.append(prev_state)

    return {state: state in reachable for state in dfa.states}
442+
443+
444+
def _random_walk_dfa(
    dfa: DFA, can_reach_accept: dict, max_length: int
) -> Optional[str]:
    """
    Randomly walk `dfa` from its initial state and return the string spelled out.

    The walk has two phases:

    * While fewer than `max_length` symbols have been emitted, a transition is
      picked uniformly at random among those whose destination can still
      reach a final state (according to `can_reach_accept`). Whenever the
      current state is final there is a 50% chance to stop right there.
    * Once `max_length` symbols have been emitted, the walk stops at the first
      final state and otherwise only takes transitions that lie on a shortest
      path to a final state.

    `max_length` is therefore a wanted length, not a hard limit: the result
    may exceed it by the number of steps needed to reach a final state.

    Returns the accepted string, or None if no final state can be reached
    (e.g. the initial state is marked unreachable in `can_reach_accept`).
    """
    distance = _distance_to_accept(dfa)
    unreachable = float("inf")

    current_state = dfa.initial_state
    out = []
    # Phase one takes at most max_length steps and phase two strictly reduces
    # the distance to a final state, so this budget is never exhausted when
    # `can_reach_accept` is consistent with the DFA. It only guards against an
    # inconsistent map causing an endless walk.
    step_budget = max(max_length, 0) + len(dfa.states)
    for step in range(step_budget + 1):
        is_final = current_state in dfa.final_states
        wanted_length_reached = step >= max_length
        # random.random() is only consulted while we are still below the
        # wanted length, giving short strings a fair chance.
        if is_final and (wanted_length_reached or random.random() < 0.5):
            return "".join(out)

        # Transitions that keep a final state reachable.
        next_options = [
            (symbol, dest)
            for symbol, dest in dfa.transitions[current_state].items()
            if can_reach_accept[dest]
        ]
        if not next_options:
            # Dead end: fine if we are already accepting, otherwise give up.
            return "".join(out) if is_final else None

        if wanted_length_reached:
            # Head for the nearest final state instead of wandering on.
            best = min(distance.get(dest, unreachable) for _, dest in next_options)
            next_options = [
                option
                for option in next_options
                if distance.get(option[1], unreachable) == best
            ]

        symbol, dest = random.choice(next_options)
        out.append(symbol)
        current_state = dest

    # Only reachable when `can_reach_accept` disagrees with the DFA.
    if current_state in dfa.final_states:
        return "".join(out)
    return None


def _distance_to_accept(dfa: DFA) -> dict:
    """
    Map every state that can reach a final state to its distance from one.

    The distance is the minimum number of transitions needed to arrive at a
    final state (0 for final states). States that cannot reach a final state
    are absent from the returned dict.
    """
    predecessors = {state: set() for state in dfa.states}
    for state in dfa.states:
        for dest in dfa.transitions[state].values():
            predecessors[dest].add(state)

    # Breadth-first search backwards from the final states, one layer at a time.
    distance = {state: 0 for state in dfa.final_states}
    frontier = list(dfa.final_states)
    while frontier:
        next_frontier = []
        for state in frontier:
            for prev_state in predecessors[state]:
                if prev_state not in distance:
                    distance[prev_state] = distance[state] + 1
                    next_frontier.append(prev_state)
        frontier = next_frontier
    return distance

tests/test_dfa.py

Lines changed: 35 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from automata.regex.regex import isequal
22
from zkregex_fuzzer.dfa import (
3+
dfa_string_matching,
34
generate_random_dfa,
45
has_multiple_accepting_states_regex,
56
regex_to_dfa,
@@ -17,26 +18,29 @@
1718
r"(a|ab|abc)",
1819
r"(1|12)",
1920
]
21+
regex_without_multiple_accepting_states = [
22+
r"(a|b)*",
23+
r"abc",
24+
r"(abc|def|ghi)",
25+
r"(abc)*",
26+
r"(hello)",
27+
r"(ab)*",
28+
r"(a|b|c)*",
29+
r"((a|b|c)*abc)", # This is somewhat complex; do we want to support this?
30+
r"[a-zA-Z]+",
31+
r"[0-9]+",
32+
r"(abc|abcd|abcde)f",
33+
r"(hello|helloo|hellooo)(foo|foob|fooba)?bar",
34+
r"(foo|foob|fooba)?bar",
35+
r"(abc|def)(gh|jk)(lm|nop)",
36+
]
37+
single_solution_regexes = [
38+
r"abc",
39+
r"(hello)",
40+
]
2041

2142

2243
def test_has_multiple_accepting_states_regex_without_multiple():
23-
regex_without_multiple_accepting_states = [
24-
r"(a|b)*",
25-
r"abc",
26-
r"(abc|def|ghi)",
27-
r"(abc)*",
28-
r"(hello)",
29-
r"(ab)*",
30-
r"(a|b|c)*",
31-
r"((a|b|c)*abc)",
32-
r"[a-zA-Z]+",
33-
r"[0-9]+",
34-
r"(abc|abcd|abcde)f",
35-
r"(hello|helloo|hellooo)(foo|foob|fooba)?bar",
36-
r"(foo|foob|fooba)?bar",
37-
r"(abc|def)(gh|jk)(lm|nop)",
38-
]
39-
4044
for regex in regex_without_multiple_accepting_states:
4145
assert not has_multiple_accepting_states_regex(regex)
4246

@@ -79,10 +83,10 @@ def test_generate_dfa():
7983
max_depth=10, use_unicode=False, single_final_state=True
8084
)
8185
regex_with_final = transform_dfa_to_regex(dfa_with_final)
86+
dfa_from_regex_with_final = regex_to_dfa(regex_with_final)
8287
break
8388
except Exception:
8489
continue
85-
dfa_from_regex_with_final = regex_to_dfa(regex_with_final)
8690
assert len(dfa_with_final.final_states) == 1
8791
assert len(dfa_from_regex_with_final.final_states) == 1
8892

@@ -92,9 +96,21 @@ def test_generate_dfa():
9296
max_depth=10, use_unicode=False, single_final_state=False
9397
)
9498
regex_without_final = transform_dfa_to_regex(dfa_without_final)
99+
dfa_from_regex_without_final = regex_to_dfa(regex_without_final)
95100
break
96101
except Exception:
97102
continue
98-
dfa_from_regex_without_final = regex_to_dfa(regex_without_final)
99103
assert len(dfa_without_final.final_states) >= 1
100104
assert len(dfa_from_regex_without_final.final_states) >= 1
105+
106+
107+
def test_dfa_string_matching():
    """
    Check that `dfa_string_matching` returns accepted and varied strings.

    For every regex the generated string must be accepted by the DFA built
    from that regex. For regexes with more than one solution, repeated calls
    must eventually yield a different string.
    """
    # Generation is random, so retry enough times that identical samples
    # across all attempts are practically impossible for multi-solution
    # regexes (a handful of retries makes the test flaky).
    attempts = 50
    for regex in regex_without_multiple_accepting_states:
        dfa = regex_to_dfa(regex)
        string = dfa_string_matching(regex)
        assert string is not None
        assert dfa.accepts_input(string), f"{string!r} is not accepted by {regex!r}"

        if regex in single_solution_regexes:
            # Only one accepted string exists, so there is nothing to vary.
            continue

        found_different = False
        for _ in range(attempts):
            string2 = dfa_string_matching(regex)
            assert dfa.accepts_input(string2), (
                f"{string2!r} is not accepted by {regex!r}"
            )
            if string != string2:
                found_different = True
                break
        assert found_different, f"always generated {string!r} for {regex!r}"

0 commit comments

Comments
 (0)