diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 4ba255d28bdcf6..6411049796bf12 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -982,9 +982,10 @@ dummy_func( DEOPT_IF(!_PyLong_IsNonNegativeCompact((PyLongObject *)sub)); Py_ssize_t index = ((PyLongObject*)sub)->long_value.ob_digit[0]; DEOPT_IF(PyUnicode_GET_LENGTH(str) <= index); - // Specialize for reading an ASCII character from any string: - Py_UCS4 c = PyUnicode_READ_CHAR(str, index); - DEOPT_IF(Py_ARRAY_LENGTH(_Py_SINGLETON(strings).ascii) <= c); + // Specialize for reading an ASCII character from an ASCII string: + DEOPT_IF(!PyUnicode_IS_COMPACT_ASCII(str)); + uint8_t c = PyUnicode_1BYTE_DATA(str)[index]; + assert(c < 128); STAT_INC(BINARY_OP, hit); PyObject *res_o = (PyObject*)&_Py_SINGLETON(strings).ascii[c]; PyStackRef_CLOSE_SPECIALIZED(sub_st, _PyLong_ExactDealloc); diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 7273a87681b4dd..079d31da6c1b7a 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -1502,11 +1502,12 @@ UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } - Py_UCS4 c = PyUnicode_READ_CHAR(str, index); - if (Py_ARRAY_LENGTH(_Py_SINGLETON(strings).ascii) <= c) { + if (!PyUnicode_IS_COMPACT_ASCII(str)) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } + uint8_t c = PyUnicode_1BYTE_DATA(str)[index]; + assert(c < 128); STAT_INC(BINARY_OP, hit); PyObject *res_o = (PyObject*)&_Py_SINGLETON(strings).ascii[c]; PyStackRef_CLOSE_SPECIALIZED(sub_st, _PyLong_ExactDealloc); diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 68d73cccec4d6b..3d5bf75ac0acae 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -892,12 +892,13 @@ assert(_PyOpcode_Deopt[opcode] == (BINARY_OP)); JUMP_TO_PREDICTED(BINARY_OP); } - Py_UCS4 c = PyUnicode_READ_CHAR(str, index); - if (Py_ARRAY_LENGTH(_Py_SINGLETON(strings).ascii) <= c) { + if (!PyUnicode_IS_COMPACT_ASCII(str)) { UPDATE_MISS_STATS(BINARY_OP); assert(_PyOpcode_Deopt[opcode] == (BINARY_OP)); JUMP_TO_PREDICTED(BINARY_OP); } + uint8_t c = PyUnicode_1BYTE_DATA(str)[index]; + assert(c < 128); STAT_INC(BINARY_OP, hit); PyObject *res_o = (PyObject*)&_Py_SINGLETON(strings).ascii[c]; PyStackRef_CLOSE_SPECIALIZED(sub_st, _PyLong_ExactDealloc); diff --git a/Python/jit.c b/Python/jit.c index 47d3d7a5d27180..ffb4b5b2e51cea 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -181,6 +181,7 @@ set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start, #define IS_AARCH64_ADRP(I) (((I) & 0x9F000000) == 0x90000000) #define IS_AARCH64_BRANCH(I) (((I) & 0x7C000000) == 0x14000000) #define IS_AARCH64_BRANCH_COND(I) (((I) & 0x7C000000) == 0x54000000) +#define IS_AARCH64_BRANCH_ZERO(I) (((I) & 0x7E000000) == 0x34000000) #define IS_AARCH64_TEST_AND_BRANCH(I) (((I) & 0x7E000000) == 0x36000000) #define IS_AARCH64_LDR_OR_STR(I) (((I) & 0x3B000000) == 0x39000000) #define IS_AARCH64_MOV(I) (((I) & 0x9F800000) == 0x92800000) @@ -348,7 +349,7 @@ void patch_aarch64_19r(unsigned char *location, uint64_t value) { uint32_t *loc32 = (uint32_t *)location; - assert(IS_AARCH64_BRANCH_COND(*loc32)); + assert(IS_AARCH64_BRANCH_COND(*loc32) || IS_AARCH64_BRANCH_ZERO(*loc32)); value -= (uintptr_t)location; // Check that we're not out of range of 21 signed bits: assert((int64_t)value >= -(1 << 20)); diff --git a/Tools/cases_generator/analyzer.py b/Tools/cases_generator/analyzer.py index 93aa4899fe6ec8..9293a649e8a0ec 100644 --- a/Tools/cases_generator/analyzer.py +++ b/Tools/cases_generator/analyzer.py @@ -614,6 +614,8 @@ def has_error_without_pop(op: parser.CodeDef) -> bool: "PyUnicode_Concat", "PyUnicode_GET_LENGTH", "PyUnicode_READ_CHAR", + "PyUnicode_IS_COMPACT_ASCII", + "PyUnicode_1BYTE_DATA", "Py_ARRAY_LENGTH", "Py_FatalError", "Py_INCREF", diff --git a/Tools/jit/_optimizers.py b/Tools/jit/_optimizers.py index 0adc550ba5e84c..297b9517f6a27a 100644 --- a/Tools/jit/_optimizers.py +++ b/Tools/jit/_optimizers.py @@ -1,6 +1,7 @@ """Low-level optimization of textual assembly.""" import dataclasses +import enum import pathlib import re import typing @@ -65,23 +66,72 @@ # MyPy doesn't understand that a invariant variable can be initialized by a covariant value CUSTOM_AARCH64_BRANCH19: str | None = "CUSTOM_AARCH64_BRANCH19" -# Branches are either b.{cond} or bc.{cond} -_AARCH64_BRANCHES: dict[str, tuple[str | None, str | None]] = { - "b." + cond: (("b." + inverse if inverse else None), CUSTOM_AARCH64_BRANCH19) - for (cond, inverse) in _AARCH64_COND_CODES.items() -} | { - "bc." + cond: (("bc." + inverse if inverse else None), CUSTOM_AARCH64_BRANCH19) - for (cond, inverse) in _AARCH64_COND_CODES.items() +_AARCH64_SHORT_BRANCHES = { + "tbz": "tbnz", + "tbnz": "tbz", } +# Branches are either b.{cond}, bc.{cond}, cbz, cbnz, tbz or tbnz +_AARCH64_BRANCHES: dict[str, tuple[str | None, str | None]] = ( + { + "b." + cond: (("b." + inverse if inverse else None), CUSTOM_AARCH64_BRANCH19) + for (cond, inverse) in _AARCH64_COND_CODES.items() + } + | { + "bc." + cond: (("bc." + inverse if inverse else None), CUSTOM_AARCH64_BRANCH19) + for (cond, inverse) in _AARCH64_COND_CODES.items() + } + | { + "cbz": ("cbnz", CUSTOM_AARCH64_BRANCH19), + "cbnz": ("cbz", CUSTOM_AARCH64_BRANCH19), + } + | {cond: (inverse, None) for (cond, inverse) in _AARCH64_SHORT_BRANCHES.items()} +) + + +@enum.unique +class InstructionKind(enum.Enum): + + JUMP = enum.auto() + LONG_BRANCH = enum.auto() + SHORT_BRANCH = enum.auto() + RETURN = enum.auto() + OTHER = enum.auto() + @dataclasses.dataclass +class Instruction: + kind: InstructionKind + name: str + text: str + target: str | None + + def is_branch(self) -> bool: + return self.kind in (InstructionKind.LONG_BRANCH, InstructionKind.SHORT_BRANCH) + + def update_target(self, target: str) -> "Instruction": + assert self.target is not None + return Instruction( + self.kind, self.name, self.text.replace(self.target, target), target + ) + + def update_name_and_target(self, name: str, target: str) -> "Instruction": + assert self.target is not None + return Instruction( + self.kind, + name, + self.text.replace(self.name, name).replace(self.target, target), + target, + ) + + +@dataclasses.dataclass(eq=False) class _Block: label: str | None = None # Non-instruction lines like labels, directives, and comments: noninstructions: list[str] = dataclasses.field(default_factory=list) # Instruction lines: - instructions: list[str] = dataclasses.field(default_factory=list) + instructions: list[Instruction] = dataclasses.field(default_factory=list) # If this block ends in a jump, where to? target: typing.Self | None = None # The next block in the linked list: @@ -108,6 +158,7 @@ class Optimizer: # Prefixes used to mangle local labels and symbols: label_prefix: str symbol_prefix: str + re_global: re.Pattern[str] # The first block in the linked list: _root: _Block = dataclasses.field(init=False, default_factory=_Block) _labels: dict[str, _Block] = dataclasses.field(init=False, default_factory=dict) @@ -122,27 +173,36 @@ class Optimizer: # Override everything that follows in subclasses: _supports_external_relocations = True _branches: typing.ClassVar[dict[str, tuple[str | None, str | None]]] = {} + # Short branches are instructions that can branch within a micro-op, + # but might not have the reach to branch anywhere within a trace. + _short_branches: typing.ClassVar[dict[str, str]] = {} # Two groups (instruction and target): _re_branch: typing.ClassVar[re.Pattern[str]] = _RE_NEVER_MATCH # One group (target): _re_jump: typing.ClassVar[re.Pattern[str]] = _RE_NEVER_MATCH # No groups: _re_return: typing.ClassVar[re.Pattern[str]] = _RE_NEVER_MATCH + text: str = "" + globals: set[str] = dataclasses.field(default_factory=set) def __post_init__(self) -> None: # Split the code into a linked list of basic blocks. A basic block is an # optional label, followed by zero or more non-instruction lines, # followed by zero or more instruction lines (only the last of which may # be a branch, jump, or return): - text = self._preprocess(self.path.read_text()) + self.text = self._preprocess(self.path.read_text()) block = self._root - for line in text.splitlines(): + for line in self.text.splitlines(): # See if we need to start a new block: if match := self._re_label.match(line): # Label. New block: block.link = block = self._lookup_label(match["label"]) block.noninstructions.append(line) continue + if match := self.re_global.match(line): + self.globals.add(match["label"]) + block.noninstructions.append(line) + continue if self._re_noninstructions.match(line): if block.instructions: # Non-instruction lines. New block: @@ -152,16 +212,19 @@ def __post_init__(self) -> None: if block.target or not block.fallthrough: # Current block ends with a branch, jump, or return. New block: block.link = block = _Block() - block.instructions.append(line) - if match := self._re_branch.match(line): + inst = self._parse_instruction(line) + block.instructions.append(inst) + if inst.is_branch(): # A block ending in a branch has a target and fallthrough: - block.target = self._lookup_label(match["target"]) + assert inst.target is not None + block.target = self._lookup_label(inst.target) assert block.fallthrough - elif match := self._re_jump.match(line): + elif inst.kind == InstructionKind.JUMP: # A block ending in a jump has a target and no fallthrough: - block.target = self._lookup_label(match["target"]) + assert inst.target is not None + block.target = self._lookup_label(inst.target) block.fallthrough = False - elif self._re_return.match(line): + elif inst.kind == InstructionKind.RETURN: # A block ending in a return has no target and fallthrough: assert not block.target block.fallthrough = False @@ -174,39 +237,47 @@ def _preprocess(self, text: str) -> str: continue_label = f"{self.label_prefix}_JIT_CONTINUE" return re.sub(continue_symbol, continue_label, text) - @classmethod - def _invert_branch(cls, line: str, target: str) -> str | None: - match = cls._re_branch.match(line) - assert match - inverted_reloc = cls._branches.get(match["instruction"]) + def _parse_instruction(self, line: str) -> Instruction: + target = None + if match := self._re_branch.match(line): + target = match["target"] + name = match["instruction"] + if name in self._short_branches: + kind = InstructionKind.SHORT_BRANCH + else: + kind = InstructionKind.LONG_BRANCH + elif match := self._re_jump.match(line): + target = match["target"] + name = line[: -len(target)].strip() + kind = InstructionKind.JUMP + elif match := self._re_return.match(line): + name = line + kind = InstructionKind.RETURN + else: + name, *_ = line.split(" ") + kind = InstructionKind.OTHER + return Instruction(kind, name, line, target) + + def _invert_branch(self, inst: Instruction, target: str) -> Instruction | None: + assert inst.is_branch() + if inst.kind == InstructionKind.SHORT_BRANCH and self._is_far_target(target): + return None + inverted_reloc = self._branches.get(inst.name) if inverted_reloc is None: return None inverted = inverted_reloc[0] if not inverted: return None - (a, b), (c, d) = match.span("instruction"), match.span("target") - # Before: - # je FOO - # After: - # jne BAR - return "".join([line[:a], inverted, line[b:c], target, line[d:]]) - - @classmethod - def _update_jump(cls, line: str, target: str) -> str: - match = cls._re_jump.match(line) - assert match - a, b = match.span("target") - # Before: - # jmp FOO - # After: - # jmp BAR - return "".join([line[:a], target, line[b:]]) + return inst.update_name_and_target(inverted, target) def _lookup_label(self, label: str) -> _Block: if label not in self._labels: self._labels[label] = _Block(label) return self._labels[label] + def _is_far_target(self, label: str) -> bool: + return not label.startswith(self.label_prefix) + def _blocks(self) -> typing.Generator[_Block, None, None]: block: _Block | None = self._root while block: @@ -214,7 +285,7 @@ def _blocks(self) -> typing.Generator[_Block, None, None]: block = block.link def _body(self) -> str: - lines = [] + lines = ["#" + line for line in self.text.splitlines()] hot = True for block in self._blocks(): if hot != block.hot: @@ -222,7 +293,8 @@ def _body(self) -> str: # Make it easy to tell at a glance where cold code is: lines.append(f"# JIT: {'HOT' if hot else 'COLD'} ".ljust(80, "#")) lines.extend(block.noninstructions) - lines.extend(block.instructions) + for inst in block.instructions: + lines.append(inst.text) return "\n".join(lines) def _predecessors(self, block: _Block) -> typing.Generator[_Block, None, None]: @@ -289,8 +361,8 @@ def _invert_hot_branches(self) -> None: if inverted is None: continue branch.instructions[-1] = inverted - jump.instructions[-1] = self._update_jump( - jump.instructions[-1], branch.target.label + jump.instructions[-1] = jump.instructions[-1].update_target( + branch.target.label ) branch.target, jump.target = jump.target, branch.target jump.hot = True @@ -299,49 +371,106 @@ def _remove_redundant_jumps(self) -> None: # Zero-length jumps can be introduced by _insert_continue_label and # _invert_hot_branches: for block in self._blocks(): + target = block.target + if target is None: + continue + target = target.resolve() # Before: # jmp FOO # FOO: # After: # FOO: - if ( - block.target - and block.link - and block.target.resolve() is block.link.resolve() - ): + if block.link and target is block.link.resolve(): block.target = None block.fallthrough = True block.instructions.pop() + # Before: + # br ? FOO: + # ... + # FOO: + # jump BAR + # After: + # br cond BAR + # ... + elif ( + len(target.instructions) == 1 + and target.instructions[0].kind == InstructionKind.JUMP + ): + assert target.target is not None + assert target.target.label is not None + if block.instructions[ + -1 + ].kind == InstructionKind.SHORT_BRANCH and self._is_far_target( + target.target.label + ): + continue + block.target = target.target + block.instructions[-1] = block.instructions[-1].update_target( + target.target.label + ) + + def _find_live_blocks(self) -> set[_Block]: + live: set[_Block] = set() + # Externally reachable blocks are live + todo: set[_Block] = {b for b in self._blocks() if b.label in self.globals} + while todo: + block = todo.pop() + live.add(block) + if block.fallthrough: + next = block.link + if next is not None and next not in live: + todo.add(next) + next = block.target + if next is not None and next not in live: + todo.add(next) + return live + + def _remove_unreachable(self) -> None: + live = self._find_live_blocks() + continuation = self._lookup_label(f"{self.label_prefix}_JIT_CONTINUE") + # Keep blocks after continuation as they may contain data and + # metadata that the assembler needs + prev: _Block | None = None + block = self._root + while block is not continuation: + next = block.link + assert next is not None + if not block in live and prev: + prev.link = next + else: + prev = block + block = next + assert prev.link is block def _fixup_external_labels(self) -> None: if self._supports_external_relocations: # Nothing to fix up return - for block in self._blocks(): + for index, block in enumerate(self._blocks()): if block.target and block.fallthrough: branch = block.instructions[-1] - match = self._re_branch.match(branch) - assert match is not None - target = match["target"] - reloc = self._branches[match["instruction"]][1] - if reloc is not None and not target.startswith(self.label_prefix): + assert branch.is_branch() + target = branch.target + assert target is not None + reloc = self._branches[branch.name][1] + if reloc is not None and self._is_far_target(target): name = target[len(self.symbol_prefix) :] - block.instructions[-1] = ( - f"// target='{target}' prefix='{self.label_prefix}'" - ) - block.instructions.append( - f"{self.symbol_prefix}{reloc}_JIT_RELOCATION_{name}:" + label = f"{self.symbol_prefix}{reloc}_JIT_RELOCATION_{name}_JIT_RELOCATION_{index}:" + block.instructions[-1] = Instruction( + InstructionKind.OTHER, "", label, None ) - a, b = match.span("target") - branch = "".join([branch[:a], "0", branch[b:]]) - block.instructions.append(branch) + block.instructions.append(branch.update_target("0")) def run(self) -> None: """Run this optimizer.""" self._insert_continue_label() self._mark_hot_blocks() - self._invert_hot_branches() - self._remove_redundant_jumps() + # Removing branches can expose opportunities for more branch removal. + # Repeat a few times. 2 would probably do, but it's fast enough with 4. + for _ in range(4): + self._invert_hot_branches() + self._remove_redundant_jumps() + self._remove_unreachable() self._fixup_external_labels() self.path.write_text(self._body()) @@ -350,10 +479,12 @@ class OptimizerAArch64(Optimizer): # pylint: disable = too-few-public-methods """aarch64-pc-windows-msvc/aarch64-apple-darwin/aarch64-unknown-linux-gnu""" _branches = _AARCH64_BRANCHES + _short_branches = _AARCH64_SHORT_BRANCHES # Mach-O does not support the 19 bit branch locations needed for branch reordering _supports_external_relocations = False + _branch_patterns = [name.replace(".", r"\.") for name in _AARCH64_BRANCHES] _re_branch = re.compile( - rf"\s*(?P{'|'.join(_AARCH64_BRANCHES)})\s+(.+,\s+)*(?P[\w.]+)" + rf"\s*(?P{'|'.join(_branch_patterns)})\s+(.+,\s+)*(?P[\w.]+)" ) # https://developer.arm.com/documentation/ddi0602/2025-03/Base-Instructions/B--Branch- @@ -366,6 +497,7 @@ class OptimizerX86(Optimizer): # pylint: disable = too-few-public-methods """i686-pc-windows-msvc/x86_64-apple-darwin/x86_64-unknown-linux-gnu""" _branches = _X86_BRANCHES + _short_branches = {} _re_branch = re.compile( rf"\s*(?P{'|'.join(_X86_BRANCHES)})\s+(?P[\w.]+)" ) diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py index e717365b6b9785..5c45ab930a4ac4 100644 --- a/Tools/jit/_stencils.py +++ b/Tools/jit/_stencils.py @@ -226,7 +226,7 @@ def convert_labels_to_relocations(self) -> None: for name, hole_plus in self.symbols.items(): if isinstance(name, str) and "_JIT_RELOCATION_" in name: _, offset = hole_plus - reloc, target = name.split("_JIT_RELOCATION_") + reloc, target, _ = name.split("_JIT_RELOCATION_") value, symbol = symbol_to_value(target) hole = Hole( int(offset), typing.cast(_schema.HoleKind, reloc), value, symbol, 0 diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index 4c188d74a68602..adb8a8d8ecb8a1 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -46,6 +46,7 @@ class _Target(typing.Generic[_S, _R]): optimizer: type[_optimizers.Optimizer] = _optimizers.Optimizer label_prefix: typing.ClassVar[str] symbol_prefix: typing.ClassVar[str] + re_global: typing.ClassVar[re.Pattern[str]] stable: bool = False debug: bool = False verbose: bool = False @@ -180,7 +181,10 @@ async def _compile( "clang", args_s, echo=self.verbose, llvm_version=self.llvm_version ) self.optimizer( - s, label_prefix=self.label_prefix, symbol_prefix=self.symbol_prefix + s, + label_prefix=self.label_prefix, + symbol_prefix=self.symbol_prefix, + re_global=self.re_global, ).run() args_o = [f"--target={self.triple}", "-c", "-o", f"{o}", f"{s}"] await _llvm.run( @@ -355,12 +359,14 @@ class _COFF32(_COFF): # These mangle like Mach-O and other "older" formats: label_prefix = "L" symbol_prefix = "_" + re_global = re.compile(r'\s*\.def\s+(?P