diff --git a/Doc/c-api/exceptions.rst b/Doc/c-api/exceptions.rst index 7449935e69f721..d7fe9e2c9ec9b4 100644 --- a/Doc/c-api/exceptions.rst +++ b/Doc/c-api/exceptions.rst @@ -984,6 +984,9 @@ because the :ref:`call protocol <call>` takes care of recursion handling. be concatenated to the :exc:`RecursionError` message caused by the recursion depth limit. + .. seealso:: + The :c:func:`PyUnstable_ThreadState_SetStackProtection` function. + .. versionchanged:: 3.9 This function is now also available in the :ref:`limited API <limited-c-api>`. diff --git a/Doc/c-api/init.rst b/Doc/c-api/init.rst index b53fce8c727ade..8ea59539e5a095 100644 --- a/Doc/c-api/init.rst +++ b/Doc/c-api/init.rst @@ -1531,6 +1531,43 @@ All of the following functions must be called after :c:func:`Py_Initialize`. .. versionadded:: 3.11 +.. c:function:: int PyUnstable_ThreadState_SetStackProtection(PyThreadState *tstate, void *stack_start_addr, size_t stack_size) + + Set the stack protection start address and stack protection size + of a Python thread state. + + On success, return ``0``. + On failure, set an exception and return ``-1``. + + CPython implements :ref:`recursion control <recursion>` for C code by raising + :py:exc:`RecursionError` when it notices that the machine execution stack is close + to overflow. See for example the :c:func:`Py_EnterRecursiveCall` function. + For this, it needs to know the location of the current thread's stack, which it + normally gets from the operating system. + When the stack is changed, for example using context switching techniques like the + Boost library's ``boost::context``, you must call + :c:func:`~PyUnstable_ThreadState_SetStackProtection` to inform CPython of the change. + + Call :c:func:`~PyUnstable_ThreadState_SetStackProtection` either before + or after changing the stack. + Do not call any other Python C API between the call and the stack + change. + + See :c:func:`PyUnstable_ThreadState_ResetStackProtection` for undoing this operation. + + .. versionadded:: next + + +..
c:function:: void PyUnstable_ThreadState_ResetStackProtection(PyThreadState *tstate) + + Reset the stack protection start address and stack protection size + of a Python thread state to the operating system defaults. + + See :c:func:`PyUnstable_ThreadState_SetStackProtection` for an explanation. + + .. versionadded:: next + + .. c:function:: PyInterpreterState* PyInterpreterState_Get(void) Get the current interpreter. diff --git a/Include/cpython/pystate.h b/Include/cpython/pystate.h index e7d80f2694baf2..f428396411c5e5 100644 --- a/Include/cpython/pystate.h +++ b/Include/cpython/pystate.h @@ -243,6 +243,18 @@ PyAPI_FUNC(int) PyGILState_Check(void); */ PyAPI_FUNC(PyObject*) _PyThread_CurrentFrames(void); +// Set the stack protection start address and stack protection size +// of a Python thread state +PyAPI_FUNC(int) PyUnstable_ThreadState_SetStackProtection( + PyThreadState *tstate, + void *stack_start_addr, // Stack start address + size_t stack_size); // Stack size (in bytes) + +// Reset the stack protection start address and stack protection size +// of a Python thread state +PyAPI_FUNC(void) PyUnstable_ThreadState_ResetStackProtection( + PyThreadState *tstate); + /* Routines for advanced debuggers, requested by David Beazley. Don't use unless you know what you are doing! */ PyAPI_FUNC(PyInterpreterState *) PyInterpreterState_Main(void); diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h index 905cc0cf0509b8..cf5ad528217eab 100644 --- a/Include/internal/pycore_ceval.h +++ b/Include/internal/pycore_ceval.h @@ -201,10 +201,13 @@ extern void _PyEval_DeactivateOpCache(void); static inline int _Py_MakeRecCheck(PyThreadState *tstate) { uintptr_t here_addr = _Py_get_machine_stack_pointer(); _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate; + // Overflow if stack pointer is between soft limit and the base of the hardware stack. + // If it is below the hardware stack base, assume that we have the wrong stack limits, and do nothing. 
+ // We could have the wrong stack limits because of limited platform support, or user-space threads. #if _Py_STACK_GROWS_DOWN - return here_addr < _tstate->c_stack_soft_limit; + return here_addr < _tstate->c_stack_soft_limit && here_addr >= _tstate->c_stack_soft_limit - 2 * _PyOS_STACK_MARGIN_BYTES; #else - return here_addr > _tstate->c_stack_soft_limit; + return here_addr > _tstate->c_stack_soft_limit && here_addr <= _tstate->c_stack_soft_limit + 2 * _PyOS_STACK_MARGIN_BYTES; #endif } diff --git a/Include/internal/pycore_pythonrun.h b/Include/internal/pycore_pythonrun.h index c2832098ddb3e7..b232429c4d09c3 100644 --- a/Include/internal/pycore_pythonrun.h +++ b/Include/internal/pycore_pythonrun.h @@ -54,6 +54,12 @@ extern const char* _Py_SourceAsString( # define _PyOS_STACK_MARGIN_SHIFT (_PyOS_LOG2_STACK_MARGIN + 2) #endif +#ifdef _Py_THREAD_SANITIZER +# define _PyOS_MIN_STACK_SIZE (_PyOS_STACK_MARGIN_BYTES * 6) +#else +# define _PyOS_MIN_STACK_SIZE (_PyOS_STACK_MARGIN_BYTES * 3) +#endif + #ifdef __cplusplus } diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index bad968428c73a1..e0b77cc3e85af0 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -37,6 +37,10 @@ typedef struct _PyThreadStateImpl { uintptr_t c_stack_soft_limit; uintptr_t c_stack_hard_limit; + // PyUnstable_ThreadState_ResetStackProtection() values + uintptr_t c_stack_init_base; + uintptr_t c_stack_init_top; + PyObject *asyncio_running_loop; // Strong reference PyObject *asyncio_running_task; // Strong reference diff --git a/Misc/NEWS.d/next/C_API/2025-10-06-22-17-47.gh-issue-139653.6-1MOd.rst b/Misc/NEWS.d/next/C_API/2025-10-06-22-17-47.gh-issue-139653.6-1MOd.rst new file mode 100644 index 00000000000000..cd3d5262fa0f3a --- /dev/null +++ b/Misc/NEWS.d/next/C_API/2025-10-06-22-17-47.gh-issue-139653.6-1MOd.rst @@ -0,0 +1,4 @@ +Add :c:func:`PyUnstable_ThreadState_SetStackProtection` and 
+:c:func:`PyUnstable_ThreadState_ResetStackProtection` functions to set the +stack protection base address and stack protection size of a Python thread +state. Patch by Victor Stinner. diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-11-17-14-40-45.gh-issue-139653.LzOy1M.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-11-17-14-40-45.gh-issue-139653.LzOy1M.rst new file mode 100644 index 00000000000000..c3ae0e8adab319 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-11-17-14-40-45.gh-issue-139653.LzOy1M.rst @@ -0,0 +1,4 @@ +Only raise a ``RecursionError`` or trigger a fatal error if the stack +pointer is both below the limit pointer *and* above the stack base. If +outside of these bounds assume that it is OK. This prevents false positives +when user-space threads swap stacks. diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index 5d877b5655b89c..79d57c43723841 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -2408,6 +2408,57 @@ set_vectorcall_nop(PyObject *self, PyObject *func) Py_RETURN_NONE; } +static void +check_threadstate_set_stack_protection(PyThreadState *tstate, + void *start, size_t size) +{ + assert(PyUnstable_ThreadState_SetStackProtection(tstate, start, size) == 0); + assert(!PyErr_Occurred()); + + _PyThreadStateImpl *ts = (_PyThreadStateImpl *)tstate; + assert(ts->c_stack_top == (uintptr_t)start + size); + assert(ts->c_stack_hard_limit <= ts->c_stack_soft_limit); + assert(ts->c_stack_soft_limit < ts->c_stack_top); +} + + +static PyObject * +test_threadstate_set_stack_protection(PyObject *self, PyObject *Py_UNUSED(args)) +{ + PyThreadState *tstate = PyThreadState_GET(); + _PyThreadStateImpl *ts = (_PyThreadStateImpl *)tstate; + assert(!PyErr_Occurred()); + + uintptr_t init_base = ts->c_stack_init_base; + size_t init_top = ts->c_stack_init_top; + + // Test the minimum stack size + size_t size = _PyOS_MIN_STACK_SIZE; + void *start = (void*)(_Py_get_machine_stack_pointer() - size); + 
check_threadstate_set_stack_protection(tstate, start, size); + + // Test a larger size + size = 7654321; + assert(size > _PyOS_MIN_STACK_SIZE); + start = (void*)(_Py_get_machine_stack_pointer() - size); + check_threadstate_set_stack_protection(tstate, start, size); + + // Test invalid size (too small) + size = 5; + start = (void*)(_Py_get_machine_stack_pointer() - size); + assert(PyUnstable_ThreadState_SetStackProtection(tstate, start, size) == -1); + assert(PyErr_ExceptionMatches(PyExc_ValueError)); + PyErr_Clear(); + + // Test PyUnstable_ThreadState_ResetStackProtection() + PyUnstable_ThreadState_ResetStackProtection(tstate); + assert(ts->c_stack_init_base == init_base); + assert(ts->c_stack_init_top == init_top); + + Py_RETURN_NONE; +} + + static PyMethodDef module_functions[] = { {"get_configs", get_configs, METH_NOARGS}, {"get_recursion_depth", get_recursion_depth, METH_NOARGS}, @@ -2516,6 +2567,8 @@ static PyMethodDef module_functions[] = { {"emscripten_set_up_async_input_device", emscripten_set_up_async_input_device, METH_NOARGS}, #endif {"set_vectorcall_nop", set_vectorcall_nop, METH_O}, + {"test_threadstate_set_stack_protection", + test_threadstate_set_stack_protection, METH_NOARGS}, {NULL, NULL} /* sentinel */ }; diff --git a/Python/ceval.c b/Python/ceval.c index d37a98df418322..adf4eea06bb939 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -344,9 +344,11 @@ _Py_ReachedRecursionLimitWithMargin(PyThreadState *tstate, int margin_count) _Py_InitializeRecursionLimits(tstate); } #if _Py_STACK_GROWS_DOWN - return here_addr <= _tstate->c_stack_soft_limit + margin_count * _PyOS_STACK_MARGIN_BYTES; + return here_addr <= _tstate->c_stack_soft_limit + margin_count * _PyOS_STACK_MARGIN_BYTES && + here_addr >= _tstate->c_stack_soft_limit - 2 * _PyOS_STACK_MARGIN_BYTES; #else - return here_addr > _tstate->c_stack_soft_limit - margin_count * _PyOS_STACK_MARGIN_BYTES; + return here_addr > _tstate->c_stack_soft_limit - margin_count * _PyOS_STACK_MARGIN_BYTES && + 
here_addr <= _tstate->c_stack_soft_limit + 2 * _PyOS_STACK_MARGIN_BYTES; #endif } @@ -436,24 +438,26 @@ int pthread_attr_destroy(pthread_attr_t *a) #endif - -void -_Py_InitializeRecursionLimits(PyThreadState *tstate) +static void +hardware_stack_limits(uintptr_t *base, uintptr_t *top, uintptr_t sp) { - _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate; #ifdef WIN32 ULONG_PTR low, high; GetCurrentThreadStackLimits(&low, &high); - _tstate->c_stack_top = (uintptr_t)high; + *top = (uintptr_t)high; ULONG guarantee = 0; SetThreadStackGuarantee(&guarantee); - _tstate->c_stack_hard_limit = ((uintptr_t)low) + guarantee + _PyOS_STACK_MARGIN_BYTES; - _tstate->c_stack_soft_limit = _tstate->c_stack_hard_limit + _PyOS_STACK_MARGIN_BYTES; + *base = (uintptr_t)low + guarantee; +#elif defined(__APPLE__) + pthread_t this_thread = pthread_self(); + void *stack_addr = pthread_get_stackaddr_np(this_thread); // top of the stack + size_t stack_size = pthread_get_stacksize_np(this_thread); + *top = (uintptr_t)stack_addr; + *base = ((uintptr_t)stack_addr) - stack_size; #else - uintptr_t here_addr = _Py_get_machine_stack_pointer(); -/// XXX musl supports HAVE_PTHRED_GETATTR_NP, but the resulting stack size -/// (on alpine at least) is much smaller than expected and imposes undue limits -/// compared to the old stack size estimation. (We assume musl is not glibc.) + /// XXX musl supports HAVE_PTHREAD_GETATTR_NP, but the resulting stack size + /// (on alpine at least) is much smaller than expected and imposes undue limits + /// compared to the old stack size estimation. (We assume musl is not glibc.)
# if defined(HAVE_PTHREAD_GETATTR_NP) && !defined(_AIX) && \ !defined(__NetBSD__) && (defined(__GLIBC__) || !defined(__linux__)) size_t stack_size, guard_size; @@ -466,40 +470,101 @@ _Py_InitializeRecursionLimits(PyThreadState *tstate) err |= pthread_attr_destroy(&attr); } if (err == 0) { - uintptr_t base = ((uintptr_t)stack_addr) + guard_size; - uintptr_t top = base + stack_size; -# ifdef _Py_THREAD_SANITIZER - // Thread sanitizer crashes if we use a bit more than half the stack. -# if _Py_STACK_GROWS_DOWN - base += stack_size / 2; -# else - top -= stack_size / 2; -# endif -# endif -# if _Py_STACK_GROWS_DOWN - _tstate->c_stack_top = top; - _tstate->c_stack_hard_limit = base + _PyOS_STACK_MARGIN_BYTES; - _tstate->c_stack_soft_limit = base + _PyOS_STACK_MARGIN_BYTES * 2; - assert(_tstate->c_stack_soft_limit < here_addr); - assert(here_addr < _tstate->c_stack_top); -# else - _tstate->c_stack_top = base; - _tstate->c_stack_hard_limit = top - _PyOS_STACK_MARGIN_BYTES; - _tstate->c_stack_soft_limit = top - _PyOS_STACK_MARGIN_BYTES * 2; - assert(here_addr > base); - assert(here_addr < _tstate->c_stack_soft_limit); -# endif + *base = ((uintptr_t)stack_addr) + guard_size; + *top = (uintptr_t)stack_addr + stack_size; return; } # endif - _tstate->c_stack_top = _Py_SIZE_ROUND_UP(here_addr, 4096); - _tstate->c_stack_soft_limit = _tstate->c_stack_top - Py_C_STACK_SIZE; - _tstate->c_stack_hard_limit = _tstate->c_stack_top - (Py_C_STACK_SIZE + _PyOS_STACK_MARGIN_BYTES); + // Add some space for caller function then round to minimum page size + // This is a guess at the top of the stack, but should be a reasonably + // good guess if called from _PyThreadState_Attach when creating a thread. + // If the thread is attached deep in a call stack, then the guess will be poor. 
+#if _Py_STACK_GROWS_DOWN + uintptr_t top_addr = _Py_SIZE_ROUND_UP(sp + 8*sizeof(void*), SYSTEM_PAGE_SIZE); + *top = top_addr; + *base = top_addr - Py_C_STACK_SIZE; +# else + uintptr_t base_addr = _Py_SIZE_ROUND_DOWN(sp - 8*sizeof(void*), SYSTEM_PAGE_SIZE); + *base = base_addr; + *top = base_addr + Py_C_STACK_SIZE; +#endif +#endif +} + +static void +tstate_set_stack(PyThreadState *tstate, + uintptr_t base, uintptr_t top) +{ + assert(base < top); + assert((top - base) >= _PyOS_MIN_STACK_SIZE); + +#ifdef _Py_THREAD_SANITIZER + // Thread sanitizer crashes if we use more than half the stack. + uintptr_t stacksize = top - base; + base += stacksize / 2; +#endif + _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate; + _tstate->c_stack_top = top; + _tstate->c_stack_hard_limit = base + _PyOS_STACK_MARGIN_BYTES; + _tstate->c_stack_soft_limit = base + _PyOS_STACK_MARGIN_BYTES * 2; + +#ifndef NDEBUG + // Sanity checks + _PyThreadStateImpl *ts = (_PyThreadStateImpl *)tstate; + assert(ts->c_stack_hard_limit <= ts->c_stack_soft_limit); + assert(ts->c_stack_soft_limit < ts->c_stack_top); #endif } +void +_Py_InitializeRecursionLimits(PyThreadState *tstate) +{ + uintptr_t base, top; + uintptr_t here_addr = _Py_get_machine_stack_pointer(); + hardware_stack_limits(&base, &top, here_addr); + assert(top != 0); + + tstate_set_stack(tstate, base, top); + _PyThreadStateImpl *ts = (_PyThreadStateImpl *)tstate; + ts->c_stack_init_base = base; + ts->c_stack_init_top = top; +} + +int +PyUnstable_ThreadState_SetStackProtection(PyThreadState *tstate, + void *stack_start_addr, size_t stack_size) +{ + if (stack_size < _PyOS_MIN_STACK_SIZE) { + PyErr_Format(PyExc_ValueError, + "stack_size must be at least %zu bytes", + _PyOS_MIN_STACK_SIZE); + return -1; + } + + uintptr_t base = (uintptr_t)stack_start_addr; + uintptr_t top = base + stack_size; + tstate_set_stack(tstate, base, top); + return 0; +} + + +void +PyUnstable_ThreadState_ResetStackProtection(PyThreadState *tstate) +{ + 
_PyThreadStateImpl *ts = (_PyThreadStateImpl *)tstate; + if (ts->c_stack_init_top != 0) { + tstate_set_stack(tstate, + ts->c_stack_init_base, + ts->c_stack_init_top); + return; + } + + _Py_InitializeRecursionLimits(tstate); +} + + /* The function _Py_EnterRecursiveCallTstate() only calls _Py_CheckRecursiveCall() - if the recursion_depth reaches recursion_limit. */ + if the stack pointer is between the stack base and c_stack_soft_limit. */ int _Py_CheckRecursiveCall(PyThreadState *tstate, const char *where) { @@ -508,10 +573,12 @@ _Py_CheckRecursiveCall(PyThreadState *tstate, const char *where) assert(_tstate->c_stack_soft_limit != 0); assert(_tstate->c_stack_hard_limit != 0); #if _Py_STACK_GROWS_DOWN + assert(here_addr >= _tstate->c_stack_hard_limit - _PyOS_STACK_MARGIN_BYTES); if (here_addr < _tstate->c_stack_hard_limit) { /* Overflowing while handling an overflow. Give up. */ int kbytes_used = (int)(_tstate->c_stack_top - here_addr)/1024; #else + assert(here_addr <= _tstate->c_stack_hard_limit + _PyOS_STACK_MARGIN_BYTES); if (here_addr > _tstate->c_stack_hard_limit) { /* Overflowing while handling an overflow. Give up. */ int kbytes_used = (int)(here_addr - _tstate->c_stack_top)/1024; diff --git a/Python/pystate.c b/Python/pystate.c index 06f997fb5ad5be..6781920b9eec66 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -1583,6 +1583,9 @@ init_threadstate(_PyThreadStateImpl *_tstate, _tstate->c_stack_top = 0; _tstate->c_stack_hard_limit = 0; + _tstate->c_stack_init_base = 0; + _tstate->c_stack_init_top = 0; + _tstate->asyncio_running_loop = NULL; _tstate->asyncio_running_task = NULL;