From ae1cae22941a589facf8aed52392e048d6f65fd3 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 22 Jul 2025 12:17:49 +0300 Subject: [PATCH 1/3] gh-88886: Remove excessive encoding name normalization The codecs lookup function now performs only minimal normalization of the encoding name before passing it to the search functions: all ASCII letters are converted to lower case, spaces are replaced with hyphens. Excessive normalization broke third-party codecs providers, like python-iconv. Revert "bpo-37751: Fix codecs.lookup() normalization (GH-15092)" This reverts commit 20f59fe1f7748ae899aceee4cb560e5e1f528a1f. --- Doc/library/codecs.rst | 21 ++++++--- Lib/test/test_capi/test_codecs.py | 1 - Lib/test/test_codecs.py | 26 +++++------ ...5-07-28-17-01-05.gh-issue-88886.g4XFPb.rst | 3 ++ Python/codecs.c | 43 ++++++++++--------- 5 files changed, 51 insertions(+), 43 deletions(-) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-07-28-17-01-05.gh-issue-88886.g4XFPb.rst diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst index f96f2f8281f450..74657cd8a6e821 100644 --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -68,11 +68,21 @@ The full details for each codec can also be looked up directly: Looks up the codec info in the Python codec registry and returns a :class:`CodecInfo` object as defined below. - Encodings are first looked up in the registry's cache. If not found, the list of + This function first normalizes the *encoding*: all ASCII letters are + converted to lower case, spaces are replaced with hyphens. + Then the encoding is looked up in the registry's cache. If not found, the list of registered search functions is scanned. If no :class:`CodecInfo` object is found, a :exc:`LookupError` is raised. Otherwise, the :class:`CodecInfo` object is stored in the cache and returned to the caller. + .. versionchanged:: 3.9 + Any characters except ASCII letters and digits and a dot are converted to underscore. + + .. 
versionchanged:: next + No characters are converted to underscore anymore. + Spaces are converted to hyphens. + + .. class:: CodecInfo(encode, decode, streamreader=None, streamwriter=None, incrementalencoder=None, incrementaldecoder=None, name=None) Codec details when looking up the codec registry. The constructor @@ -167,14 +177,11 @@ function: .. function:: register(search_function, /) Register a codec search function. Search functions are expected to take one - argument, being the encoding name in all lower case letters with hyphens - and spaces converted to underscores, and return a :class:`CodecInfo` object. + argument, being the encoding name in all lower case letters with spaces + converted to hyphens, and return a :class:`CodecInfo` object. In case a search function cannot find a given encoding, it should return ``None``. - .. versionchanged:: 3.9 - Hyphens and spaces are converted to underscore. - .. function:: unregister(search_function, /) @@ -1065,7 +1072,7 @@ or with dictionaries as mapping tables. The following table lists the codecs by name, together with a few common aliases, and the languages for which the encoding is likely used. Neither the list of aliases nor the list of languages is meant to be exhaustive. Notice that spelling alternatives that only differ in -case or use a hyphen instead of an underscore are also valid aliases +case or use a space or a hyphen instead of an underscore are also valid aliases because they are equivalent when normalized by :func:`~encodings.normalize_encoding`. For example, ``'utf-8'`` is a valid alias for the ``'utf_8'`` codec. 
diff --git a/Lib/test/test_capi/test_codecs.py b/Lib/test/test_capi/test_codecs.py index a0355c7a388c57..1a3f476ed0f30d 100644 --- a/Lib/test/test_capi/test_codecs.py +++ b/Lib/test/test_capi/test_codecs.py @@ -630,7 +630,6 @@ def test_codec_known_encoding(self): for name in [ encoding_name, encoding_name.upper(), - encoding_name.replace('_', '-'), ]: with self.subTest(name): self.assertTrue(_testcapi.codec_known_encoding(name)) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index d8666f7290e72e..1d93c7c6f2ec39 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3873,26 +3873,22 @@ def test_rot13_func(self): class CodecNameNormalizationTest(unittest.TestCase): """Test codec name normalization""" def test_codecs_lookup(self): - FOUND = (1, 2, 3, 4) - NOT_FOUND = (None, None, None, None) def search_function(encoding): - if encoding == "aaa_8": - return FOUND + if encoding.startswith("test."): + return (encoding, 2, 3, 4) else: - return NOT_FOUND + return None codecs.register(search_function) self.addCleanup(codecs.unregister, search_function) - self.assertEqual(FOUND, codecs.lookup('aaa_8')) - self.assertEqual(FOUND, codecs.lookup('AAA-8')) - self.assertEqual(FOUND, codecs.lookup('AAA---8')) - self.assertEqual(FOUND, codecs.lookup('AAA 8')) - self.assertEqual(FOUND, codecs.lookup('aaa\xe9\u20ac-8')) - self.assertEqual(NOT_FOUND, codecs.lookup('AAA.8')) - self.assertEqual(NOT_FOUND, codecs.lookup('AAA...8')) - self.assertEqual(NOT_FOUND, codecs.lookup('BBB-8')) - self.assertEqual(NOT_FOUND, codecs.lookup('BBB.8')) - self.assertEqual(NOT_FOUND, codecs.lookup('a\xe9\u20ac-8')) + self.assertEqual(codecs.lookup('test.aaa_8'), ('test.aaa_8', 2, 3, 4)) + self.assertEqual(codecs.lookup('TEST.AAA-8'), ('test.aaa-8', 2, 3, 4)) + self.assertEqual(codecs.lookup('TEST.AAA 8'), ('test.aaa-8', 2, 3, 4)) + self.assertEqual(codecs.lookup('TEST.AAA---8'), ('test.aaa---8', 2, 3, 4)) + self.assertEqual(codecs.lookup('TEST.AAA   8'), ('test.aaa---8', 2, 
3, 4)) + self.assertEqual(codecs.lookup('TEST.AAA\xe9\u20ac-8'), ('test.aaa\xe9\u20ac-8', 2, 3, 4)) + self.assertEqual(codecs.lookup('TEST.AAA.8'), ('test.aaa.8', 2, 3, 4)) + self.assertEqual(codecs.lookup('TEST.AAA...8'), ('test.aaa...8', 2, 3, 4)) def test_encodings_normalize_encoding(self): # encodings.normalize_encoding() ignores non-ASCII characters. diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-07-28-17-01-05.gh-issue-88886.g4XFPb.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-07-28-17-01-05.gh-issue-88886.g4XFPb.rst new file mode 100644 index 00000000000000..898ba139aa48c2 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-07-28-17-01-05.gh-issue-88886.g4XFPb.rst @@ -0,0 +1,3 @@ +The codecs lookup function now performs only minimal normalization of the +encoding name before passing it to the search functions: all ASCII letters +are converted to lower case, spaces are replaced with hyphens. diff --git a/Python/codecs.c b/Python/codecs.c index caf8d9d5f3c188..1cd6c92aa275b2 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -92,14 +92,15 @@ PyCodec_Unregister(PyObject *search_function) extern int _Py_normalize_encoding(const char *, char *, size_t); -/* Convert a string to a normalized Python string(decoded from UTF-8): all characters are - converted to lower case, spaces and hyphens are replaced with underscores. */ +/* Convert a string to a normalized Python string: all ASCII letters are + converted to lower case, spaces are replaced with hyphens. 
*/ static PyObject *normalizestring(const char *string) { + size_t i; size_t len = strlen(string); - char *encoding; + char *p; PyObject *v; if (len > PY_SSIZE_T_MAX) { @@ -107,28 +108,30 @@ PyObject *normalizestring(const char *string) return NULL; } - encoding = PyMem_Malloc(len + 1); - if (encoding == NULL) + p = PyMem_Malloc(len + 1); + if (p == NULL) return PyErr_NoMemory(); - - if (!_Py_normalize_encoding(string, encoding, len + 1)) - { - PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed"); - PyMem_Free(encoding); - return NULL; - } - - v = PyUnicode_FromString(encoding); - PyMem_Free(encoding); + for (i = 0; i < len; i++) { + char ch = string[i]; + if (ch == ' ') + ch = '-'; + else + ch = Py_TOLOWER(Py_CHARMASK(ch)); + p[i] = ch; + } + p[i] = '\0'; + v = PyUnicode_FromString(p); + PyMem_Free(p); return v; } /* Lookup the given encoding and return a tuple providing the codec facilities. - The encoding string is looked up converted to all lower-case - characters. This makes encodings looked up through this mechanism - effectively case-insensitive. + The encoding string is looked up with ASCII letters converted to all + lower case. This makes encodings looked up through this mechanism + effectively case-insensitive. Spaces are replaced with hyphens for + names like "US ASCII" and "ISO 8859-1". If no codec is found, a LookupError is set and NULL returned. @@ -149,8 +152,8 @@ PyObject *_PyCodec_Lookup(const char *encoding) assert(interp->codecs.initialized); /* Convert the encoding to a normalized Python string: all - characters are converted to lower case, spaces and hyphens are - replaced with underscores. */ + ASCII letters are converted to lower case, spaces are + replaced with hyphens. */ PyObject *v = normalizestring(encoding); if (v == NULL) { return NULL; From 9c0f595a1b259654f079ececac68ecc8df291cb3 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 29 Jul 2025 14:02:44 +0300 Subject: [PATCH 2/3] Update a NEWS entry. 
--- .../2025-07-28-17-01-05.gh-issue-88886.g4XFPb.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-07-28-17-01-05.gh-issue-88886.g4XFPb.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-07-28-17-01-05.gh-issue-88886.g4XFPb.rst index 898ba139aa48c2..0d119efc93d20b 100644 --- a/Misc/NEWS.d/next/Core_and_Builtins/2025-07-28-17-01-05.gh-issue-88886.g4XFPb.rst +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-07-28-17-01-05.gh-issue-88886.g4XFPb.rst @@ -1,3 +1,4 @@ -The codecs lookup function now performs only minimal normalization of the -encoding name before passing it to the search functions: all ASCII letters -are converted to lower case, spaces are replaced with hyphens. +The codecs lookup function now again performs only minimal normalization of +the encoding name before passing it to the search functions: all ASCII +letters are converted to lower case, spaces are replaced with hyphens. +This restores the pre-Python 3.9 behavior. From 45461712b531de8fa43613ae1ab9f063e6320acf Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 9 Sep 2025 14:56:02 +0300 Subject: [PATCH 3/3] Update Python/codecs.c Co-authored-by: Victor Stinner --- Python/codecs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index 1cd6c92aa275b2..29a72ea29ee43e 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -95,8 +95,8 @@ extern int _Py_normalize_encoding(const char *, char *, size_t); /* Convert a string to a normalized Python string: all ASCII letters are converted to lower case, spaces are replaced with hyphens. */ -static -PyObject *normalizestring(const char *string) +static PyObject* +normalizestring(const char *string) { size_t i; size_t len = strlen(string);