Skip to content

Commit 8144889

Browse files
jayakarthik-jkJayakarthik K
andauthored
Fix(#219): split_high_level function to ignore delimiters inside single quotes (#220)
* Fix: split_high_level function to ignore delimiters inside single quotes. Previously, the split function would incorrectly split strings containing delimiters inside single quotes. This caused issues when splitting column definitions whose default value contains a ','. Now, delimiters within single quotes are ignored, ensuring proper splitting behavior. * Add unit tests for split_high_level function - Test ignoring delimiters inside single quotes - Test ignoring delimiters inside parentheses - Test realistic SQL scenarios including ENUM, SET, DEFAULT values - Cover edge cases like empty strings and nested structures - All 14 tests passing --------- Co-authored-by: Jayakarthik K <iamjayakarthik@gmail.com>
1 parent a26a732 commit 8144889

File tree

2 files changed

+174
-16
lines changed

2 files changed

+174
-16
lines changed

mysql_ch_replicator/converter.py

Lines changed: 60 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -278,23 +278,67 @@ def strip_sql_name(name):
278278
return name
279279

280280

281-
def split_high_level(data, token):
282-
results = []
283-
level = 0
284-
curr_data = ''
285-
for c in data:
286-
if c == token and level == 0:
287-
results.append(curr_data.strip())
288-
curr_data = ''
281+
def split_high_level(data, delimiter):
    """
    Split a string by a delimiter, ignoring delimiters inside parentheses or quotes.

    This function performs a context-aware split, respecting nested structures:
    - Delimiters inside parentheses () are ignored, at any nesting depth
    - Delimiters inside single-quoted or double-quoted literals are ignored
    - Delimiters inside backtick-quoted identifiers are ignored
    - Parentheses inside quoted text do not affect the nesting depth

    Inside single or double quotes a backslash escapes the character that
    follows it, so an escaped quote does not close the literal and an escaped
    backslash does not hide the closing quote. A doubled quote character
    (e.g. two single quotes in a row) also keeps the literal intact, because
    it closes and immediately reopens it. Backslashes are ordinary characters
    inside backticks and outside of any quoted text.

    Args:
        data (str): The string to split
        delimiter (str): The character to split on (typically ',' or ';')

    Returns:
        list[str]: List of split segments with surrounding whitespace
        stripped. Empty segments are dropped; falsy input yields [].

    Examples:
        >>> split_high_level("a,b(c,d),e", ",")
        ['a', 'b(c,d)', 'e']

        >>> split_high_level("name varchar(100) DEFAULT 'a,b',id int", ",")
        ["name varchar(100) DEFAULT 'a,b'", 'id int']
    """
    if not data:
        return []

    quote_chars = ("'", '"', '`')
    segments = []
    current_segment = []
    paren_depth = 0
    open_quote = None  # the quote character that opened the current literal
    escaped = False    # True when the previous in-literal char was an escaping backslash

    for char in data:
        if open_quote is not None:
            # Everything inside a quoted region is copied verbatim; only the
            # bookkeeping for "where does the region end" is updated.
            current_segment.append(char)
            if escaped:
                # This character was escaped by the preceding backslash.
                escaped = False
            elif char == '\\' and open_quote != '`':
                # Backslash escapes apply to string literals, not to
                # backtick-quoted identifiers.
                escaped = True
            elif char == open_quote:
                open_quote = None
            continue

        if char in quote_chars:
            open_quote = char
        elif char == '(':
            paren_depth += 1
        elif char == ')':
            paren_depth -= 1
        elif char == delimiter and paren_depth == 0:
            # Top-level delimiter: close the current segment.
            segment_text = ''.join(current_segment).strip()
            if segment_text:  # Only add non-empty segments
                segments.append(segment_text)
            current_segment = []
            continue

        current_segment.append(char)

    # Add the final segment if it exists
    final_segment = ''.join(current_segment).strip()
    if final_segment:
        segments.append(final_segment)

    return segments
298342

299343

300344
def strip_sql_comments(sql_statement):

tests/test_split_high_level.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
import pytest
2+
from mysql_ch_replicator.converter import split_high_level
3+
4+
5+
# Each entry is (data, delimiter, expected): the raw input, the character to
# split on, and the list of segments split_high_level should return.
_SPLIT_HIGH_LEVEL_CASES = [
    # Plain column list: only sized types, no quoted values
    (
        "id int NOT NULL, name varchar(255), age int",
        ",",
        ['id int NOT NULL', 'name varchar(255)', 'age int'],
    ),
    # A single-quoted DEFAULT value that contains a comma
    (
        "status varchar(50) DEFAULT 'active,pending', id int",
        ",",
        ["status varchar(50) DEFAULT 'active,pending'", 'id int'],
    ),
    # Several columns, more than one of them with commas in the DEFAULT
    (
        "col1 varchar(50) DEFAULT 'value,with,commas', col2 int, col3 varchar(100) DEFAULT 'another,comma'",
        ",",
        ["col1 varchar(50) DEFAULT 'value,with,commas'", 'col2 int', "col3 varchar(100) DEFAULT 'another,comma'"],
    ),
    # ENUM value list: commas sit inside parentheses
    (
        "status enum('active','inactive','pending'), id int",
        ",",
        ["status enum('active','inactive','pending')", 'id int'],
    ),
    # SET value list: commas sit inside parentheses
    (
        "permissions set('read','write','execute'), user_id int",
        ",",
        ["permissions set('read','write','execute')", 'user_id int'],
    ),
    # Quoted DEFAULT with a comma followed by a space
    (
        "description text DEFAULT 'User, Admin', created_at datetime",
        ",",
        ["description text DEFAULT 'User, Admin'", 'created_at datetime'],
    ),
    # DECIMAL precision/scale: comma inside parentheses
    (
        "price decimal(10,2), quantity int",
        ",",
        ['price decimal(10,2)', 'quantity int'],
    ),
    # ENUM plus quoted DEFAULT values, commas in both places
    (
        "type enum('type1','type2') DEFAULT 'type1', description varchar(255) DEFAULT 'desc,with,comma'",
        ",",
        ["type enum('type1','type2') DEFAULT 'type1'", "description varchar(255) DEFAULT 'desc,with,comma'"],
    ),
    # Sized VARCHAR together with a comma-bearing DEFAULT
    (
        "name varchar(100) DEFAULT 'Last, First', id int NOT NULL",
        ",",
        ["name varchar(100) DEFAULT 'Last, First'", 'id int NOT NULL'],
    ),
    # Empty input produces an empty list
    (
        "",
        ",",
        [],
    ),
    # Input without any delimiter comes back as a single segment
    (
        "id int PRIMARY KEY",
        ",",
        ['id int PRIMARY KEY'],
    ),
    # Function-call style parentheses next to a sized type
    (
        "data1 varchar(100), func(arg1, arg2), data2 int",
        ",",
        ['data1 varchar(100)', 'func(arg1, arg2)', 'data2 int'],
    ),
    # ALTER TABLE clause list with a comma inside a DEFAULT value
    (
        "ADD COLUMN status varchar(50) DEFAULT 'new,value', DROP COLUMN old_col",
        ",",
        ["ADD COLUMN status varchar(50) DEFAULT 'new,value'", 'DROP COLUMN old_col'],
    ),
    # Column list shaped like a real MySQL CREATE TABLE body
    (
        "`id` int NOT NULL AUTO_INCREMENT, `email` varchar(255) DEFAULT 'user@example.com', `status` enum('active','inactive') DEFAULT 'active'",
        ",",
        ["`id` int NOT NULL AUTO_INCREMENT", "`email` varchar(255) DEFAULT 'user@example.com'", "`status` enum('active','inactive') DEFAULT 'active'"],
    ),
]


@pytest.mark.parametrize("data,delimiter,expected", _SPLIT_HIGH_LEVEL_CASES)
def test_split_high_level(data, delimiter, expected):
    """
    Check split_high_level against SQL column-definition style inputs.

    Every case asserts that the input is split on the delimiter only at the
    top level, i.e. delimiters are left alone when they appear inside:
    - Parentheses (enum/set value lists, function arguments, type precision)
    - Single quotes (DEFAULT values and other string literals)
    """
    actual = split_high_level(data, delimiter)
    assert actual == expected, f"Failed for input: {data} with delimiter: {delimiter}"

0 commit comments

Comments
 (0)