Skip to content

Commit 0c1d6b4

Browse files
[3.13] pythongh-63161: Fix PEP 263 support (pythonGH-139481)
* Support non-UTF-8 shebang and comments if a non-UTF-8 encoding is specified.
* Detect decoding errors in comments for UTF-8 encoding.
* Include the decoding error position for the default encoding in SyntaxError.

(cherry picked from commit 5c942f1)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
1 parent b367d10 commit 0c1d6b4

File tree

9 files changed

+210
-46
lines changed

9 files changed

+210
-46
lines changed

Lib/test/test_exceptions.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,8 @@ def check(self, src, lineno, offset, end_lineno=None, end_offset=None, encoding=
224224
if not isinstance(src, str):
225225
src = src.decode(encoding, 'replace')
226226
line = src.split('\n')[lineno-1]
227+
if lineno == 1:
228+
line = line.removeprefix('\ufeff')
227229
self.assertIn(line, cm.exception.text)
228230

229231
def test_error_offset_continuation_characters(self):
@@ -239,7 +241,9 @@ def testSyntaxErrorOffset(self):
239241
check('Python = "\u1e54\xfd\u0163\u0125\xf2\xf1" +', 1, 20)
240242
check(b'# -*- coding: cp1251 -*-\nPython = "\xcf\xb3\xf2\xee\xed" +',
241243
2, 19, encoding='cp1251')
242-
check(b'Python = "\xcf\xb3\xf2\xee\xed" +', 1, 10)
244+
check(b'Python = "\xcf\xb3\xf2\xee\xed" +', 1, 12)
245+
check(b'\n\n\nPython = "\xcf\xb3\xf2\xee\xed" +', 4, 12)
246+
check(b'\xef\xbb\xbfPython = "\xcf\xb3\xf2\xee\xed" +', 1, 12)
243247
check('x = "a', 1, 5)
244248
check('lambda x: x = 2', 1, 1)
245249
check('f{a + b + c}', 1, 2)
@@ -287,7 +291,7 @@ def baz():
287291
check("pass\npass\npass\n(1+)\npass\npass\npass", 4, 4)
288292
check("(1+)", 1, 4)
289293
check("[interesting\nfoo()\n", 1, 1)
290-
check(b"\xef\xbb\xbf#coding: utf8\nprint('\xe6\x88\x91')\n", 0, -1)
294+
check(b"\xef\xbb\xbf#coding: utf8\nprint('\xe6\x88\x91')\n", 1, 0)
291295
check("""f'''
292296
{
293297
(123_a)

Lib/test/test_source_encoding.py

Lines changed: 112 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
# -*- coding: utf-8 -*-
22

33
import unittest
4-
from test.support import script_helper, captured_stdout, requires_subprocess, requires_resource
4+
from test import support
5+
from test.support import script_helper
56
from test.support.os_helper import TESTFN, unlink, rmtree
67
from test.support.import_helper import unload
78
import importlib
@@ -64,7 +65,7 @@ def test_issue7820(self):
6465
# two bytes in common with the UTF-8 BOM
6566
self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20')
6667

67-
@requires_subprocess()
68+
@support.requires_subprocess()
6869
def test_20731(self):
6970
sub = subprocess.Popen([sys.executable,
7071
os.path.join(os.path.dirname(__file__),
@@ -268,6 +269,17 @@ def test_second_non_utf8_coding_line(self):
268269
b'print(ascii("\xc3\xa4"))\n')
269270
self.check_script_output(src, br"'\xc3\u20ac'")
270271

272+
def test_first_utf8_coding_line_error(self):
273+
src = (b'#coding:ascii \xc3\xa4\n'
274+
b'raise RuntimeError\n')
275+
self.check_script_error(src, br"(\(unicode error\) )?'ascii' codec can't decode byte")
276+
277+
def test_second_utf8_coding_line_error(self):
278+
src = (b'#!/usr/bin/python\n'
279+
b'#coding:ascii \xc3\xa4\n'
280+
b'raise RuntimeError\n')
281+
self.check_script_error(src, br"(\(unicode error\) )?'ascii' codec can't decode byte")
282+
271283
def test_utf8_bom(self):
272284
src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n')
273285
self.check_script_output(src, br"'\xe4'")
@@ -283,10 +295,80 @@ def test_utf8_bom_and_utf8_coding_line(self):
283295
b'print(ascii("\xc3\xa4"))\n')
284296
self.check_script_output(src, br"'\xe4'")
285297

286-
def test_utf8_non_utf8_comment_line_error(self):
298+
def test_utf8_bom_and_non_utf8_first_coding_line(self):
299+
src = (b'\xef\xbb\xbf#coding:iso-8859-15\n'
300+
b'raise RuntimeError\n')
301+
self.check_script_error(src,
302+
br"encoding problem: iso-8859-15 with BOM",
303+
lineno=1)
304+
305+
def test_utf8_bom_and_non_utf8_second_coding_line(self):
306+
src = (b'\xef\xbb\xbf#first\n'
307+
b'#coding:iso-8859-15\n'
308+
b'raise RuntimeError\n')
309+
self.check_script_error(src,
310+
br"encoding problem: iso-8859-15 with BOM",
311+
lineno=2)
312+
313+
def test_non_utf8_shebang(self):
314+
src = (b'#!/home/\xa4/bin/python\n'
315+
b'#coding:iso-8859-15\n'
316+
b'print(ascii("\xc3\xa4"))\n')
317+
self.check_script_output(src, br"'\xc3\u20ac'")
318+
319+
def test_utf8_shebang_error(self):
320+
src = (b'#!/home/\xc3\xa4/bin/python\n'
321+
b'#coding:ascii\n'
322+
b'raise RuntimeError\n')
323+
self.check_script_error(src, br"(\(unicode error\) )?'ascii' codec can't decode byte")
324+
325+
def test_non_utf8_shebang_error(self):
326+
src = (b'#!/home/\xa4/bin/python\n'
327+
b'raise RuntimeError\n')
328+
self.check_script_error(src, br"Non-UTF-8 code starting with .* on line 1",
329+
lineno=1)
330+
331+
def test_non_utf8_second_line_error(self):
332+
src = (b'#first\n'
333+
b'#second\xa4\n'
334+
b'raise RuntimeError\n')
335+
self.check_script_error(src,
336+
br"Non-UTF-8 code starting with .* on line 2",
337+
lineno=2)
338+
339+
def test_non_utf8_third_line_error(self):
340+
src = (b'#first\n'
341+
b'#second\n'
342+
b'#third\xa4\n'
343+
b'raise RuntimeError\n')
344+
self.check_script_error(src,
345+
br"Non-UTF-8 code starting with .* on line 3",
346+
lineno=3)
347+
348+
def test_utf8_bom_non_utf8_third_line_error(self):
349+
src = (b'\xef\xbb\xbf#first\n'
350+
b'#second\n'
351+
b'#third\xa4\n'
352+
b'raise RuntimeError\n')
353+
self.check_script_error(src,
354+
br"Non-UTF-8 code starting with .* on line 3|"
355+
br"'utf-8' codec can't decode byte",
356+
lineno=3)
357+
358+
def test_utf_8_non_utf8_third_line_error(self):
359+
src = (b'#coding: utf-8\n'
360+
b'#second\n'
361+
b'#third\xa4\n'
362+
b'raise RuntimeError\n')
363+
self.check_script_error(src,
364+
br"Non-UTF-8 code starting with .* on line 3|"
365+
br"'utf-8' codec can't decode byte",
366+
lineno=3)
367+
368+
def test_utf8_non_utf8_third_line_error(self):
287369
src = (b'#coding: utf8\n'
288-
b'#\n'
289-
b'#\xa4\n'
370+
b'#second\n'
371+
b'#third\xa4\n'
290372
b'raise RuntimeError\n')
291373
self.check_script_error(src,
292374
br"'utf-8' codec can't decode byte|"
@@ -327,7 +409,7 @@ def test_nul_in_second_coding_line(self):
327409
class UTF8ValidatorTest(unittest.TestCase):
328410
@unittest.skipIf(not sys.platform.startswith("linux"),
329411
"Too slow to run on non-Linux platforms")
330-
@requires_resource('cpu')
412+
@support.requires_resource('cpu')
331413
def test_invalid_utf8(self):
332414
# This is a port of test_utf8_decode_invalid_sequences in
333415
# test_unicode.py to exercise the separate utf8 validator in
@@ -393,19 +475,29 @@ def check(content):
393475
check(b'\xF4'+cb+b'\xBF\xBF')
394476

395477

478+
@support.force_not_colorized_test_class
396479
class BytesSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
397480

398481
def check_script_output(self, src, expected):
399-
with captured_stdout() as stdout:
482+
with support.captured_stdout() as stdout:
400483
exec(src)
401484
out = stdout.getvalue().encode('latin1')
402485
self.assertEqual(out.rstrip(), expected)
403486

404-
def check_script_error(self, src, expected):
405-
with self.assertRaisesRegex(SyntaxError, expected.decode()) as cm:
487+
def check_script_error(self, src, expected, lineno=...):
488+
with self.assertRaises(SyntaxError) as cm:
406489
exec(src)
490+
exc = cm.exception
491+
self.assertRegex(str(exc), expected.decode())
492+
if lineno is not ...:
493+
self.assertEqual(exc.lineno, lineno)
494+
line = src.splitlines()[lineno-1].decode(errors='replace')
495+
if lineno == 1:
496+
line = line.removeprefix('\ufeff')
497+
self.assertEqual(line, exc.text)
407498

408499

500+
@support.force_not_colorized_test_class
409501
class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
410502

411503
def check_script_output(self, src, expected):
@@ -416,13 +508,22 @@ def check_script_output(self, src, expected):
416508
res = script_helper.assert_python_ok(fn)
417509
self.assertEqual(res.out.rstrip(), expected)
418510

419-
def check_script_error(self, src, expected):
511+
def check_script_error(self, src, expected, lineno=...):
420512
with tempfile.TemporaryDirectory() as tmpd:
421513
fn = os.path.join(tmpd, 'test.py')
422514
with open(fn, 'wb') as fp:
423515
fp.write(src)
424516
res = script_helper.assert_python_failure(fn)
425-
self.assertRegex(res.err.rstrip().splitlines()[-1], b'SyntaxError.*?' + expected)
517+
err = res.err.rstrip()
518+
self.assertRegex(err.splitlines()[-1], b'SyntaxError: ' + expected)
519+
if lineno is not ...:
520+
self.assertIn(f', line {lineno}\n'.encode(),
521+
err.replace(os.linesep.encode(), b'\n'))
522+
line = src.splitlines()[lineno-1].decode(errors='replace')
523+
if lineno == 1:
524+
line = line.removeprefix('\ufeff')
525+
self.assertIn(line.encode(), err)
526+
426527

427528

428529
if __name__ == "__main__":
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Support non-UTF-8 shebang and comments in Python source files if non-UTF-8
2+
encoding is specified. Detect decoding error in comments for default (UTF-8)
3+
encoding. Show the line and position of decoding error for default encoding
4+
in a traceback. Show the line containing the coding cookie when it conflicts
5+
with the BOM in a traceback.

Parser/pegen_errors.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include <errcode.h>
33

44
#include "pycore_pyerrors.h" // _PyErr_ProgramDecodedTextObject()
5+
#include "pycore_runtime.h" // _Py_ID()
56
#include "lexer/state.h"
67
#include "lexer/lexer.h"
78
#include "pegen.h"
@@ -23,6 +24,13 @@ _PyPegen_raise_tokenizer_init_error(PyObject *filename)
2324
PyObject *value;
2425
PyObject *tback;
2526
PyErr_Fetch(&type, &value, &tback);
27+
if (PyErr_GivenExceptionMatches(value, PyExc_SyntaxError)) {
28+
if (PyObject_SetAttr(value, &_Py_ID(filename), filename)) {
29+
goto error;
30+
}
31+
PyErr_Restore(type, value, tback);
32+
return;
33+
}
2634
errstr = PyObject_Str(value);
2735
if (!errstr) {
2836
goto error;

Parser/tokenizer/file_tokenizer.c

Lines changed: 35 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -280,10 +280,8 @@ tok_underflow_interactive(struct tok_state *tok) {
280280
}
281281

282282
static int
283-
tok_underflow_file(struct tok_state *tok) {
284-
if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
285-
tok->cur = tok->inp = tok->buf;
286-
}
283+
tok_underflow_file(struct tok_state *tok)
284+
{
287285
if (tok->decoding_state == STATE_INIT) {
288286
/* We have not yet determined the encoding.
289287
If an encoding is found, use the file-pointer
@@ -294,8 +292,16 @@ tok_underflow_file(struct tok_state *tok) {
294292
}
295293
assert(tok->decoding_state != STATE_INIT);
296294
}
295+
int raw = tok->decoding_readline == NULL;
296+
if (raw && tok->decoding_state != STATE_NORMAL) {
297+
/* Keep the first line in the buffer to validate it later if
298+
* the encoding has not yet been determined. */
299+
}
300+
else if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
301+
tok->cur = tok->inp = tok->buf;
302+
}
297303
/* Read until '\n' or EOF */
298-
if (tok->decoding_readline != NULL) {
304+
if (!raw) {
299305
/* We already have a codec associated with this input. */
300306
if (!tok_readline_recode(tok)) {
301307
return 0;
@@ -326,20 +332,35 @@ tok_underflow_file(struct tok_state *tok) {
326332

327333
ADVANCE_LINENO();
328334
if (tok->decoding_state != STATE_NORMAL) {
329-
if (tok->lineno > 2) {
330-
tok->decoding_state = STATE_NORMAL;
331-
}
332-
else if (!_PyTokenizer_check_coding_spec(tok->cur, strlen(tok->cur),
335+
if (!_PyTokenizer_check_coding_spec(tok->cur, strlen(tok->cur),
333336
tok, fp_setreadl))
334337
{
335338
return 0;
336339
}
340+
if (tok->lineno >= 2) {
341+
tok->decoding_state = STATE_NORMAL;
342+
}
337343
}
338-
/* The default encoding is UTF-8, so make sure we don't have any
339-
non-UTF-8 sequences in it. */
340-
if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok)) {
341-
_PyTokenizer_error_ret(tok);
342-
return 0;
344+
if (raw && tok->decoding_state == STATE_NORMAL) {
345+
const char *line = tok->lineno <= 2 ? tok->buf : tok->cur;
346+
int lineno = tok->lineno <= 2 ? 1 : tok->lineno;
347+
if (!tok->encoding) {
348+
/* The default encoding is UTF-8, so make sure we don't have any
349+
non-UTF-8 sequences in it. */
350+
if (!_PyTokenizer_ensure_utf8(line, tok, lineno)) {
351+
_PyTokenizer_error_ret(tok);
352+
return 0;
353+
}
354+
}
355+
else {
356+
PyObject *tmp = PyUnicode_Decode(line, strlen(line),
357+
tok->encoding, NULL);
358+
if (tmp == NULL) {
359+
_PyTokenizer_error_ret(tok);
360+
return 0;
361+
}
362+
Py_DECREF(tmp);
363+
}
343364
}
344365
assert(tok->done == E_OK);
345366
return tok->done == E_OK;

0 commit comments

Comments
 (0)