From f5db89ec030f7c744d9215cbe433c0d246266071 Mon Sep 17 00:00:00 2001
From: Alexis Lacroix
Date: Tue, 25 Nov 2025 13:12:01 +0100
Subject: [PATCH] [IMP] spreadsheet: support array literals in formula tokenizer

The tokenizer now recognizes array literals (e.g. {1,2;3,4}) by handling
braces and array row separators. This allows formulas using array
literals to be properly tokenized.

Task: 4735250
---
Review note: the unreachable `if not row_separator` guard was dropped and
docstrings were added to the new tokenizer helpers, so the post-image blob
id on the tokenizer.py index line no longer matches the resulting file.

 .../tests/test_spreadsheet_tokenizer.py       | 45 +++++++++++++++++++
 src/util/spreadsheet/tokenizer.py             | 33 ++++++++++++--
 2 files changed, 75 insertions(+), 3 deletions(-)

diff --git a/src/spreadsheet/tests/test_spreadsheet_tokenizer.py b/src/spreadsheet/tests/test_spreadsheet_tokenizer.py
index 823045706..32a06e4ad 100644
--- a/src/spreadsheet/tests/test_spreadsheet_tokenizer.py
+++ b/src/spreadsheet/tests/test_spreadsheet_tokenizer.py
@@ -184,3 +184,48 @@ def test_wrong_references(self):
             tokenize("=''!A1"),
             [("OPERATOR", "="), ("SYMBOL", "''!A1")],
         )
+
+    def test_literal_array(self):
+        self.assertEqual(
+            tokenize("={1,2;3,4}"),
+            [
+                ("OPERATOR", "="),
+                ("LEFT_BRACE", "{"),
+                ("NUMBER", "1"),
+                ("ARG_SEPARATOR", ","),
+                ("NUMBER", "2"),
+                ("ARRAY_ROW_SEPARATOR", ";"),
+                ("NUMBER", "3"),
+                ("ARG_SEPARATOR", ","),
+                ("NUMBER", "4"),
+                ("RIGHT_BRACE", "}"),
+            ],
+        )
+        self.assertEqual(
+            tokenize("=SUM({1,2})"),
+            [
+                ("OPERATOR", "="),
+                ("SYMBOL", "SUM"),
+                ("LEFT_PAREN", "("),
+                ("LEFT_BRACE", "{"),
+                ("NUMBER", "1"),
+                ("ARG_SEPARATOR", ","),
+                ("NUMBER", "2"),
+                ("RIGHT_BRACE", "}"),
+                ("RIGHT_PAREN", ")"),
+            ],
+        )
+
+    def test_wrong_literal_array(self):
+        # Array with a wrong/fake row separator (should be semicolon, here it's a pipe)
+        self.assertEqual(
+            tokenize("={1|2}"),
+            [
+                ("OPERATOR", "="),
+                ("LEFT_BRACE", "{"),
+                ("NUMBER", "1"),
+                ("UNKNOWN", "|"),
+                ("NUMBER", "2"),
+                ("RIGHT_BRACE", "}"),
+            ],
+        )
diff --git a/src/util/spreadsheet/tokenizer.py b/src/util/spreadsheet/tokenizer.py
index d3a02ed1f..52ed73cec 100644
--- a/src/util/spreadsheet/tokenizer.py
+++ b/src/util/spreadsheet/tokenizer.py
@@ -87,7 +87,9 @@ def tokenize(string, locale=DEFAULT_LOCALE):
     while not chars.is_over():
         token = (
             tokenize_space(chars)
+            or tokenize_array_row_separator(chars, locale)
             or tokenize_args_separator(chars, locale)
+            or tokenize_braces(chars)
             or tokenize_parenthesis(chars)
             or tokenize_operator(chars)
             or tokenize_string(chars)
@@ -112,14 +114,26 @@ def tokenize_debugger(chars):
     return None
 
 
-parenthesis = {"(": ("LEFT_PAREN", "("), ")": ("RIGHT_PAREN", ")")}
+PARENTHESIS = {"(": ("LEFT_PAREN", "("), ")": ("RIGHT_PAREN", ")")}
 
 
 def tokenize_parenthesis(chars):
     value = chars.current
-    if value in parenthesis:
+    if value in PARENTHESIS:
         chars.shift()
-        return parenthesis[value]
+        return PARENTHESIS[value]
+    return None
+
+
+BRACES = {"{": ("LEFT_BRACE", "{"), "}": ("RIGHT_BRACE", "}")}
+
+
+def tokenize_braces(chars):
+    """Tokenize an opening or closing brace delimiting an array literal."""
+    value = chars.current
+    if value in BRACES:
+        chars.shift()
+        return BRACES[value]
     return None
 
 
@@ -141,6 +155,19 @@ def tokenize_operator(chars):
 FIRST_POSSIBLE_NUMBER_CHARS = set("0123456789")
 
 
+def tokenize_array_row_separator(chars, locale):
+    """Tokenize the array row separator, if the current char is one.
+
+    Rows are separated by ";", unless the locale already uses ";" between
+    arguments, in which case a backslash is used instead.
+    """
+    row_separator = "\\" if locale["formulaArgSeparator"] == ";" else ";"
+    if chars.current == row_separator:
+        chars.shift()
+        return "ARRAY_ROW_SEPARATOR", row_separator
+    return None
+
+
 def tokenize_number(chars, locale):
     if chars.current not in FIRST_POSSIBLE_NUMBER_CHARS and chars.current != locale["decimalSeparator"]:
         return None