Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 62 additions & 1 deletion Lib/test/test_source_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,17 @@ def test_second_non_utf8_coding_line(self):
b'print(ascii("\xc3\xa4"))\n')
self.check_script_output(src, br"'\xc3\u20ac'")

def test_first_utf8_coding_line_error(self):
    """A stray UTF-8 byte after an 'ascii' coding declaration on line 1 must fail."""
    source = b'#coding:ascii \xc3\xa4\nraise RuntimeError\n'
    self.check_script_error(source, br"'ascii' codec can't decode byte")

def test_second_utf8_coding_line_error(self):
    """A stray UTF-8 byte after an 'ascii' coding declaration on line 2 must fail."""
    parts = [
        b'#!/usr/bin/python\n',
        b'#coding:ascii \xc3\xa4\n',
        b'raise RuntimeError\n',
    ]
    self.check_script_error(b''.join(parts),
                            br"'ascii' codec can't decode byte")

def test_utf8_bom(self):
    """A UTF-8 BOM by itself selects UTF-8 as the source encoding."""
    source = b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n'
    self.check_script_output(source, br"'\xe4'")
Expand All @@ -282,7 +293,57 @@ def test_utf8_bom_and_utf8_coding_line(self):
b'print(ascii("\xc3\xa4"))\n')
self.check_script_output(src, br"'\xe4'")

def test_utf8_non_utf8_comment_line_error(self):
def test_non_utf8_shebang(self):
    """A byte invalid in UTF-8 is allowed in the shebang when the declared
    encoding (iso-8859-15) can decode it."""
    parts = [
        b'#!/home/\xa4/bin/python\n',
        b'#coding:iso-8859-15\n',
        b'print(ascii("\xc3\xa4"))\n',
    ]
    self.check_script_output(b''.join(parts), br"'\xc3\u20ac'")

def test_utf8_shebang_error(self):
    """UTF-8 bytes in the shebang must fail when the declared encoding is ascii."""
    source = (b'#!/home/\xc3\xa4/bin/python\n'
              b'#coding:ascii\n'
              b'raise RuntimeError\n')
    self.check_script_error(source, br"'ascii' codec can't decode byte")

def test_non_utf8_shebang_error(self):
    """With no coding declaration, a non-UTF-8 byte in the shebang is an
    error reported on line 1."""
    source = b'#!/home/\xa4/bin/python\nraise RuntimeError\n'
    self.check_script_error(source,
                            br"Non-UTF-8 code starting with .* on line 1")

def test_non_utf8_second_line_error(self):
    """A non-UTF-8 byte in a comment on line 2 is reported on line 2."""
    parts = [
        b'#\n',
        b'#\xa4\n',
        b'raise RuntimeError\n',
    ]
    self.check_script_error(b''.join(parts),
                            br"Non-UTF-8 code starting with .* on line 2")

def test_non_utf8_third_line_error(self):
    """A non-UTF-8 byte in a comment on line 3 is reported on line 3."""
    parts = [
        b'#\n',
        b'#\n',
        b'#\xa4\n',
        b'raise RuntimeError\n',
    ]
    self.check_script_error(b''.join(parts),
                            br"Non-UTF-8 code starting with .* on line 3")

def test_utf8_bom_non_utf8_third_line_error(self):
    """After a UTF-8 BOM, a non-UTF-8 comment byte on line 3 must fail.
    Either error message form is accepted."""
    source = (b'\xef\xbb\xbf#\n'
              b'#\n'
              b'#\xa4\n'
              b'raise RuntimeError\n')
    expected = (br"Non-UTF-8 code starting with .* on line 3|"
                br"'utf-8' codec can't decode byte")
    self.check_script_error(source, expected)

def test_utf_8_non_utf8_third_line_error(self):
    """With an explicit 'utf-8' coding line, a non-UTF-8 comment byte on
    line 3 must fail. Either error message form is accepted."""
    source = (b'#coding: utf-8\n'
              b'#\n'
              b'#\xa4\n'
              b'raise RuntimeError\n')
    expected = (br"Non-UTF-8 code starting with .* on line 3|"
                br"'utf-8' codec can't decode byte")
    self.check_script_error(source, expected)

def test_utf8_non_utf8_third_line_error(self):
src = (b'#coding: utf8\n'
b'#\n'
b'#\xa4\n'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Support a non-UTF-8 shebang line and non-UTF-8 comments in Python source files
when a non-UTF-8 encoding is declared. Detect decoding errors in comments under
the default (UTF-8) encoding.
49 changes: 35 additions & 14 deletions Parser/tokenizer/file_tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -282,10 +282,8 @@ tok_underflow_interactive(struct tok_state *tok) {
}

static int
tok_underflow_file(struct tok_state *tok) {
if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
tok->cur = tok->inp = tok->buf;
}
tok_underflow_file(struct tok_state *tok)
{
if (tok->decoding_state == STATE_INIT) {
/* We have not yet determined the encoding.
If an encoding is found, use the file-pointer
Expand All @@ -296,8 +294,16 @@ tok_underflow_file(struct tok_state *tok) {
}
assert(tok->decoding_state != STATE_INIT);
}
int raw = tok->decoding_readline == NULL;
if (raw && tok->decoding_state != STATE_NORMAL) {
/* Keep the first line in the buffer to validate it later if
* the encoding has not yet been determined. */
}
else if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
tok->cur = tok->inp = tok->buf;
}
/* Read until '\n' or EOF */
if (tok->decoding_readline != NULL) {
if (!raw) {
/* We already have a codec associated with this input. */
if (!tok_readline_recode(tok)) {
return 0;
Expand Down Expand Up @@ -328,20 +334,35 @@ tok_underflow_file(struct tok_state *tok) {

ADVANCE_LINENO();
if (tok->decoding_state != STATE_NORMAL) {
if (tok->lineno > 2) {
tok->decoding_state = STATE_NORMAL;
}
else if (!_PyTokenizer_check_coding_spec(tok->cur, strlen(tok->cur),
if (!_PyTokenizer_check_coding_spec(tok->cur, strlen(tok->cur),
tok, fp_setreadl))
{
return 0;
}
if (tok->lineno >= 2) {
tok->decoding_state = STATE_NORMAL;
}
}
/* The default encoding is UTF-8, so make sure we don't have any
non-UTF-8 sequences in it. */
if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok)) {
_PyTokenizer_error_ret(tok);
return 0;
if (raw && tok->decoding_state == STATE_NORMAL) {
const char *line = tok->lineno <= 2 ? tok->buf : tok->cur;
int lineno = tok->lineno <= 2 ? 1 : tok->lineno;
if (!tok->encoding) {
/* The default encoding is UTF-8, so make sure we don't have any
non-UTF-8 sequences in it. */
if (!_PyTokenizer_ensure_utf8(line, tok, lineno)) {
_PyTokenizer_error_ret(tok);
return 0;
}
}
else {
PyObject *tmp = PyUnicode_Decode(line, strlen(line),
Comment on lines +347 to +358
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
const char *line = tok->lineno <= 2 ? tok->buf : tok->cur;
int lineno = tok->lineno <= 2 ? 1 : tok->lineno;
if (!tok->encoding) {
/* The default encoding is UTF-8, so make sure we don't have any
non-UTF-8 sequences in it. */
if (!_PyTokenizer_ensure_utf8(line, tok, lineno)) {
_PyTokenizer_error_ret(tok);
return 0;
}
}
else {
PyObject *tmp = PyUnicode_Decode(line, strlen(line),
const int is_pseudo_line = (tok->lineno <= 2);
const char *line = is_pseudo_line ? tok->buf : tok->cur;
int lineno = is_pseudo_line ? 1 : tok->lineno;
size_t slen = strlen(line);
if (slen > (size_t)PY_SSIZE_T_MAX) {
_PyTokenizer_error_ret(tok);
return 0;
}
Py_ssize_t linelen = (Py_ssize_t)slen;
if (!tok->encoding) {
/* The default encoding is UTF-8, so make sure we don't have any
non-UTF-8 sequences in it. */
if (!_PyTokenizer_ensure_utf8(line, tok, lineno)) {
_PyTokenizer_error_ret(tok);
return 0;
}
}
else {
PyObject *tmp = PyUnicode_Decode(line, linelen,

tok->encoding, NULL);
if (tmp == NULL) {
_PyTokenizer_error_ret(tok);
return 0;
}
Py_DECREF(tmp);
}
}
assert(tok->done == E_OK);
return tok->done == E_OK;
Expand Down
13 changes: 8 additions & 5 deletions Parser/tokenizer/helpers.c
Original file line number Diff line number Diff line change
Expand Up @@ -496,24 +496,27 @@ valid_utf8(const unsigned char* s)
}

int
_PyTokenizer_ensure_utf8(char *line, struct tok_state *tok)
_PyTokenizer_ensure_utf8(const char *line, struct tok_state *tok, int lineno)
{
int badchar = 0;
unsigned char *c;
const unsigned char *c;
int length;
for (c = (unsigned char *)line; *c; c += length) {
for (c = (const unsigned char *)line; *c; c += length) {
if (!(length = valid_utf8(c))) {
badchar = *c;
break;
}
if (*c == '\n') {
lineno++;
}
}
if (badchar) {
PyErr_Format(PyExc_SyntaxError,
"Non-UTF-8 code starting with '\\x%.2x' "
"in file %U on line %i, "
"in file %V on line %i, "
"but no encoding declared; "
"see https://peps.python.org/pep-0263/ for details",
badchar, tok->filename, tok->lineno);
badchar, tok->filename, "<string>", lineno);
return 0;
}
return 1;
Expand Down
2 changes: 1 addition & 1 deletion Parser/tokenizer/helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ int _PyTokenizer_check_bom(int get_char(struct tok_state *),
struct tok_state *tok);
int _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
int set_readline(struct tok_state *, const char *));
int _PyTokenizer_ensure_utf8(char *line, struct tok_state *tok);
int _PyTokenizer_ensure_utf8(const char *line, struct tok_state *tok, int lineno);

#ifdef Py_DEBUG
void _PyTokenizer_print_escape(FILE *f, const char *s, Py_ssize_t size);
Expand Down
2 changes: 1 addition & 1 deletion Parser/tokenizer/readline_tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ tok_underflow_readline(struct tok_state* tok) {
ADVANCE_LINENO();
/* The default encoding is UTF-8, so make sure we don't have any
non-UTF-8 sequences in it. */
if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok)) {
if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok, tok->lineno)) {
_PyTokenizer_error_ret(tok);
return 0;
}
Expand Down
3 changes: 3 additions & 0 deletions Parser/tokenizer/string_tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ decode_str(const char *input, int single, struct tok_state *tok, int preserve_cr
return _PyTokenizer_error_ret(tok);
str = PyBytes_AS_STRING(utf8);
}
else if (!_PyTokenizer_ensure_utf8(str, tok, 1)) {
return _PyTokenizer_error_ret(tok);
}
assert(tok->decoding_buffer == NULL);
tok->decoding_buffer = utf8; /* CAUTION */
return str;
Expand Down
Loading