Skip to content

Commit ef81a9b

Browse files
committed
[IMP] spreadsheet: support array literals in formula tokenizer
The tokenizer now recognizes array literals (e.g. {1,2;3,4}) by handling braces and array row separators. This allows formulas using array literals to be properly tokenized. Task: 4735250
1 parent 09d18f5 commit ef81a9b

File tree

2 files changed

+71
-3
lines changed

2 files changed

+71
-3
lines changed

src/spreadsheet/tests/test_spreadsheet_tokenizer.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,3 +184,48 @@ def test_wrong_references(self):
184184
tokenize("=''!A1"),
185185
[("OPERATOR", "="), ("SYMBOL", "''!A1")],
186186
)
187+
188+
def test_literal_array(self):
    # A standalone literal with two rows, tokenized with the default locale:
    # braces, numbers, "," between values and ";" between rows.
    standalone_tokens = [
        ("OPERATOR", "="),
        ("LEFT_BRACE", "{"),
        ("NUMBER", "1"),
        ("ARG_SEPARATOR", ","),
        ("NUMBER", "2"),
        ("ARRAY_ROW_SEPARATOR", ";"),
        ("NUMBER", "3"),
        ("ARG_SEPARATOR", ","),
        ("NUMBER", "4"),
        ("RIGHT_BRACE", "}"),
    ]
    self.assertEqual(tokenize("={1,2;3,4}"), standalone_tokens)

    # The same kind of literal used as a function argument: the braces are
    # emitted inside the surrounding parenthesis tokens.
    nested_tokens = [
        ("OPERATOR", "="),
        ("SYMBOL", "SUM"),
        ("LEFT_PAREN", "("),
        ("LEFT_BRACE", "{"),
        ("NUMBER", "1"),
        ("ARG_SEPARATOR", ","),
        ("NUMBER", "2"),
        ("RIGHT_BRACE", "}"),
        ("RIGHT_PAREN", ")"),
    ]
    self.assertEqual(tokenize("=SUM({1,2})"), nested_tokens)
218+
219+
def test_wrong_literal_array(self):
    # "|" is not a valid row separator (";" is expected here), so it must be
    # reported as an UNKNOWN token between the two numbers.
    expected_tokens = [
        ("OPERATOR", "="),
        ("LEFT_BRACE", "{"),
        ("NUMBER", "1"),
        ("UNKNOWN", "|"),
        ("NUMBER", "2"),
        ("RIGHT_BRACE", "}"),
    ]
    self.assertEqual(tokenize("={1|2}"), expected_tokens)

src/util/spreadsheet/tokenizer.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,9 @@ def tokenize(string, locale=DEFAULT_LOCALE):
8787
while not chars.is_over():
8888
token = (
8989
tokenize_space(chars)
90+
or tokenize_array_row_separator(chars, locale)
9091
or tokenize_args_separator(chars, locale)
92+
or tokenize_braces(chars)
9193
or tokenize_parenthesis(chars)
9294
or tokenize_operator(chars)
9395
or tokenize_string(chars)
@@ -112,14 +114,25 @@ def tokenize_debugger(chars):
112114
return None
113115

114116

115-
# Maps each parenthesis character to the (token type, value) pair it produces.
PARENTHESIS = {
    "(": ("LEFT_PAREN", "("),
    ")": ("RIGHT_PAREN", ")"),
}


def tokenize_parenthesis(chars):
    """Consume one parenthesis from ``chars`` and return its token.

    :param chars: character stream exposing ``current`` and ``shift()``
    :return: ``("LEFT_PAREN", "(")`` or ``("RIGHT_PAREN", ")")`` when the
        current character is a parenthesis (the stream is then advanced by
        one ``shift()`` call); ``None`` otherwise, leaving the stream untouched
    """
    token = PARENTHESIS.get(chars.current)
    if token is None:
        return None
    chars.shift()
    return token
126+
127+
128+
# Maps each curly brace character to the (token type, value) pair it produces.
BRACES = {
    "{": ("LEFT_BRACE", "{"),
    "}": ("RIGHT_BRACE", "}"),
}


def tokenize_braces(chars):
    """Consume one curly brace from ``chars`` and return its token.

    :param chars: character stream exposing ``current`` and ``shift()``
    :return: ``("LEFT_BRACE", "{")`` or ``("RIGHT_BRACE", "}")`` when the
        current character is a brace (the stream is then advanced by one
        ``shift()`` call); ``None`` otherwise, leaving the stream untouched
    """
    token = BRACES.get(chars.current)
    if token is None:
        return None
    chars.shift()
    return token
124137

125138

@@ -141,6 +154,16 @@ def tokenize_operator(chars):
141154
FIRST_POSSIBLE_NUMBER_CHARS = set("0123456789")
142155

143156

157+
def tokenize_array_row_separator(chars, locale):
    """Consume an array row separator from ``chars`` and return its token.

    The separator depends on the locale: when the locale already uses ``;``
    to separate function arguments, rows are separated by a backslash;
    otherwise rows are separated by ``;``.

    :param chars: character stream exposing ``current`` and ``shift()``
    :param locale: mapping providing the ``"formulaArgSeparator"`` key
    :return: ``("ARRAY_ROW_SEPARATOR", separator)`` when the current character
        is the locale's row separator (the stream is then advanced by one
        ``shift()`` call); ``None`` otherwise, leaving the stream untouched
    """
    # The separator is always one of two non-empty strings, so no emptiness
    # check is needed before comparing.
    row_separator = "\\" if locale["formulaArgSeparator"] == ";" else ";"
    if chars.current != row_separator:
        return None
    chars.shift()
    return "ARRAY_ROW_SEPARATOR", row_separator
165+
166+
144167
def tokenize_number(chars, locale):
145168
if chars.current not in FIRST_POSSIBLE_NUMBER_CHARS and chars.current != locale["decimalSeparator"]:
146169
return None

0 commit comments

Comments
 (0)