Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 62 additions & 1 deletion Lib/test/test_source_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,17 @@ def test_second_non_utf8_coding_line(self):
b'print(ascii("\xc3\xa4"))\n')
self.check_script_output(src, br"'\xc3\u20ac'")

def test_first_utf8_coding_line_error(self):
    """A stray UTF-8 byte after an 'ascii' coding declaration on line 1 must fail."""
    source = b'#coding:ascii \xc3\xa4\nraise RuntimeError\n'
    self.check_script_error(source, br"'ascii' codec can't decode byte")

def test_second_utf8_coding_line_error(self):
    """A stray UTF-8 byte after an 'ascii' coding declaration on line 2 must fail."""
    parts = [
        b'#!/usr/bin/python\n',
        b'#coding:ascii \xc3\xa4\n',
        b'raise RuntimeError\n',
    ]
    self.check_script_error(b''.join(parts),
                            br"'ascii' codec can't decode byte")

def test_utf8_bom(self):
    """A UTF-8 BOM by itself selects UTF-8 as the source encoding."""
    source = b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n'
    self.check_script_output(source, br"'\xe4'")
Expand All @@ -282,7 +293,57 @@ def test_utf8_bom_and_utf8_coding_line(self):
b'print(ascii("\xc3\xa4"))\n')
self.check_script_output(src, br"'\xe4'")

def test_utf8_non_utf8_comment_line_error(self):
def test_non_utf8_shebang(self):
    """A byte invalid in UTF-8 is allowed in the shebang when the declared
    encoding (iso-8859-15) can decode it."""
    parts = [
        b'#!/home/\xa4/bin/python\n',
        b'#coding:iso-8859-15\n',
        b'print(ascii("\xc3\xa4"))\n',
    ]
    self.check_script_output(b''.join(parts), br"'\xc3\u20ac'")

def test_utf8_shebang_error(self):
    """UTF-8 bytes in the shebang must fail when the declared encoding is ascii."""
    source = (b'#!/home/\xc3\xa4/bin/python\n'
              b'#coding:ascii\n'
              b'raise RuntimeError\n')
    self.check_script_error(source, br"'ascii' codec can't decode byte")

def test_non_utf8_shebang_error(self):
    """With no coding declaration, a non-UTF-8 byte in the shebang is an
    error reported on line 1."""
    source = b'#!/home/\xa4/bin/python\nraise RuntimeError\n'
    self.check_script_error(source,
                            br"Non-UTF-8 code starting with .* on line 1")

def test_non_utf8_second_line_error(self):
    """A non-UTF-8 byte in a comment on line 2 is reported on line 2."""
    parts = [
        b'#\n',
        b'#\xa4\n',
        b'raise RuntimeError\n',
    ]
    self.check_script_error(b''.join(parts),
                            br"Non-UTF-8 code starting with .* on line 2")

def test_non_utf8_third_line_error(self):
    """A non-UTF-8 byte in a comment on line 3 is reported on line 3."""
    parts = [
        b'#\n',
        b'#\n',
        b'#\xa4\n',
        b'raise RuntimeError\n',
    ]
    self.check_script_error(b''.join(parts),
                            br"Non-UTF-8 code starting with .* on line 3")

def test_utf8_bom_non_utf8_third_line_error(self):
    """After a UTF-8 BOM, a non-UTF-8 comment byte on line 3 must fail.
    Either error message form is accepted."""
    source = (b'\xef\xbb\xbf#\n'
              b'#\n'
              b'#\xa4\n'
              b'raise RuntimeError\n')
    expected = (br"Non-UTF-8 code starting with .* on line 3|"
                br"'utf-8' codec can't decode byte")
    self.check_script_error(source, expected)

def test_utf_8_non_utf8_third_line_error(self):
    """With an explicit 'utf-8' coding line, a non-UTF-8 comment byte on
    line 3 must fail. Either error message form is accepted."""
    source = (b'#coding: utf-8\n'
              b'#\n'
              b'#\xa4\n'
              b'raise RuntimeError\n')
    expected = (br"Non-UTF-8 code starting with .* on line 3|"
                br"'utf-8' codec can't decode byte")
    self.check_script_error(source, expected)

def test_utf8_non_utf8_third_line_error(self):
src = (b'#coding: utf8\n'
b'#\n'
b'#\xa4\n'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Support a non-UTF-8 shebang line and non-UTF-8 comments in Python source files
when a non-UTF-8 encoding is declared. Detect decoding errors in comments under
the default (UTF-8) encoding.
49 changes: 35 additions & 14 deletions Parser/tokenizer/file_tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -282,10 +282,8 @@ tok_underflow_interactive(struct tok_state *tok) {
}

static int
tok_underflow_file(struct tok_state *tok) {
if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
tok->cur = tok->inp = tok->buf;
}
tok_underflow_file(struct tok_state *tok)
{
if (tok->decoding_state == STATE_INIT) {
/* We have not yet determined the encoding.
If an encoding is found, use the file-pointer
Expand All @@ -296,8 +294,16 @@ tok_underflow_file(struct tok_state *tok) {
}
assert(tok->decoding_state != STATE_INIT);
}
int raw = tok->decoding_readline == NULL;
if (raw && tok->decoding_state != STATE_NORMAL) {
/* Keep the first line in the buffer to validate it later if
* the encoding has not yet been determined. */
}
else if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
tok->cur = tok->inp = tok->buf;
}
/* Read until '\n' or EOF */
if (tok->decoding_readline != NULL) {
if (!raw) {
/* We already have a codec associated with this input. */
if (!tok_readline_recode(tok)) {
return 0;
Expand Down Expand Up @@ -328,20 +334,35 @@ tok_underflow_file(struct tok_state *tok) {

ADVANCE_LINENO();
if (tok->decoding_state != STATE_NORMAL) {
if (tok->lineno > 2) {
tok->decoding_state = STATE_NORMAL;
}
else if (!_PyTokenizer_check_coding_spec(tok->cur, strlen(tok->cur),
if (!_PyTokenizer_check_coding_spec(tok->cur, strlen(tok->cur),
tok, fp_setreadl))
{
return 0;
}
if (tok->lineno >= 2) {
tok->decoding_state = STATE_NORMAL;
}
}
/* The default encoding is UTF-8, so make sure we don't have any
non-UTF-8 sequences in it. */
if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok)) {
_PyTokenizer_error_ret(tok);
return 0;
if (raw && tok->decoding_state == STATE_NORMAL) {
const char *line = tok->lineno <= 2 ? tok->buf : tok->cur;
int lineno = tok->lineno <= 2 ? 1 : tok->lineno;
if (!tok->encoding) {
/* The default encoding is UTF-8, so make sure we don't have any
non-UTF-8 sequences in it. */
if (!_PyTokenizer_ensure_utf8(line, tok, lineno)) {
_PyTokenizer_error_ret(tok);
return 0;
}
}
else {
PyObject *tmp = PyUnicode_Decode(line, strlen(line),
Comment on lines +347 to +358
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
const char *line = tok->lineno <= 2 ? tok->buf : tok->cur;
int lineno = tok->lineno <= 2 ? 1 : tok->lineno;
if (!tok->encoding) {
/* The default encoding is UTF-8, so make sure we don't have any
non-UTF-8 sequences in it. */
if (!_PyTokenizer_ensure_utf8(line, tok, lineno)) {
_PyTokenizer_error_ret(tok);
return 0;
}
}
else {
PyObject *tmp = PyUnicode_Decode(line, strlen(line),
const int is_pseudo_line = (tok->lineno <= 2);
const char *line = is_pseudo_line ? tok->buf : tok->cur;
int lineno = is_pseudo_line ? 1 : tok->lineno;
size_t slen = strlen(line);
if (slen > (size_t)PY_SSIZE_T_MAX) {
_PyTokenizer_error_ret(tok);
return 0;
}
Py_ssize_t linelen = (Py_ssize_t)slen;
if (!tok->encoding) {
/* The default encoding is UTF-8, so make sure we don't have any
non-UTF-8 sequences in it. */
if (!_PyTokenizer_ensure_utf8(line, tok, lineno)) {
_PyTokenizer_error_ret(tok);
return 0;
}
}
else {
PyObject *tmp = PyUnicode_Decode(line, linelen,

tok->encoding, NULL);
if (tmp == NULL) {
_PyTokenizer_error_ret(tok);
return 0;
}
Py_DECREF(tmp);
}
}
assert(tok->done == E_OK);
return tok->done == E_OK;
Expand Down
13 changes: 8 additions & 5 deletions Parser/tokenizer/helpers.c
Original file line number Diff line number Diff line change
Expand Up @@ -496,24 +496,27 @@ valid_utf8(const unsigned char* s)
}

int
_PyTokenizer_ensure_utf8(char *line, struct tok_state *tok)
_PyTokenizer_ensure_utf8(const char *line, struct tok_state *tok, int lineno)
{
int badchar = 0;
unsigned char *c;
const unsigned char *c;
int length;
for (c = (unsigned char *)line; *c; c += length) {
for (c = (const unsigned char *)line; *c; c += length) {
if (!(length = valid_utf8(c))) {
badchar = *c;
break;
}
if (*c == '\n') {
lineno++;
}
}
if (badchar) {
PyErr_Format(PyExc_SyntaxError,
"Non-UTF-8 code starting with '\\x%.2x' "
"in file %U on line %i, "
"in file %V on line %i, "
"but no encoding declared; "
"see https://peps.python.org/pep-0263/ for details",
badchar, tok->filename, tok->lineno);
badchar, tok->filename, "<string>", lineno);
return 0;
}
return 1;
Expand Down
2 changes: 1 addition & 1 deletion Parser/tokenizer/helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ int _PyTokenizer_check_bom(int get_char(struct tok_state *),
struct tok_state *tok);
int _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
int set_readline(struct tok_state *, const char *));
int _PyTokenizer_ensure_utf8(char *line, struct tok_state *tok);
int _PyTokenizer_ensure_utf8(const char *line, struct tok_state *tok, int lineno);

#ifdef Py_DEBUG
void _PyTokenizer_print_escape(FILE *f, const char *s, Py_ssize_t size);
Expand Down
2 changes: 1 addition & 1 deletion Parser/tokenizer/readline_tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ tok_underflow_readline(struct tok_state* tok) {
ADVANCE_LINENO();
/* The default encoding is UTF-8, so make sure we don't have any
non-UTF-8 sequences in it. */
if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok)) {
if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok, tok->lineno)) {
_PyTokenizer_error_ret(tok);
return 0;
}
Expand Down
3 changes: 3 additions & 0 deletions Parser/tokenizer/string_tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ decode_str(const char *input, int single, struct tok_state *tok, int preserve_cr
return _PyTokenizer_error_ret(tok);
str = PyBytes_AS_STRING(utf8);
}
else if (!_PyTokenizer_ensure_utf8(str, tok, 1)) {
return _PyTokenizer_error_ret(tok);
}
assert(tok->decoding_buffer == NULL);
tok->decoding_buffer = utf8; /* CAUTION */
return str;
Expand Down
Loading