@@ -48,11 +48,10 @@ typedef struct _PyEncoderObject {
4848 PyObject * indent ;
4949 PyObject * key_separator ;
5050 PyObject * item_separator ;
51+ int (* fast_encode )(PyUnicodeWriter * , PyObject * );
5152 bool sort_keys ;
5253 bool skipkeys ;
5354 bool allow_nan ;
54- bool fast_encode ;
55- bool ensure_ascii ; /* used only when fast_encode == true */
5655} PyEncoderObject ;
5756
5857#define PyEncoderObject_CAST (op ) ((PyEncoderObject *)(op))
@@ -304,18 +303,20 @@ escape_unicode(PyObject *pystr)
304303 return rval ;
305304}
306305
307- // Take a PyUnicode pystr and write an escaped string to writer.
306+ #define ESCAPE_BUF_SIZE 200
307+
308+ // Take a PyUnicode pystr and write an escaped string to writer. (ensure_ascii)
308309static int
309- write_escaped_unicode (PyUnicodeWriter * writer , PyObject * pystr , bool ascii_only )
310+ write_escaped_ascii (PyUnicodeWriter * writer , PyObject * pystr )
310311{
311312 Py_ssize_t i ;
312313 Py_ssize_t input_chars ;
313- Py_ssize_t chars ;
314- Py_ssize_t copy_len = 0 ;
314+ Py_ssize_t buf_len ;
315315 const void * input ;
316+ Py_UCS4 c = 0 ;
316317 int kind ;
317318 int ret ;
318- unsigned char buf [12 ];
319+ char buf [ESCAPE_BUF_SIZE ]; // avoid overhead of PyUnicodeWriter APIs
319320
320321 input_chars = PyUnicode_GET_LENGTH (pystr );
321322 input = PyUnicode_DATA (pystr );
@@ -324,27 +325,102 @@ write_escaped_unicode(PyUnicodeWriter *writer, PyObject *pystr, bool ascii_only)
324325 ret = PyUnicodeWriter_WriteChar (writer , '"' );
325326 if (ret ) return ret ;
326327
328+ // Fast path for string doesn't need escape at all: e.g. "id", "name"
327329 for (i = 0 ; i < input_chars ; i ++ ) {
330+ c = PyUnicode_READ (kind , input , i );
331+ if (!S_CHAR (c )) {
332+ break ;
333+ }
334+ }
335+ if (i > 0 ) {
336+ ret = PyUnicodeWriter_WriteSubstring (writer , pystr , 0 , i );
337+ if (ret ) return ret ;
338+ }
339+ if (i == input_chars ) {
340+ return PyUnicodeWriter_WriteChar (writer , '"' );
341+ }
342+
343+ buf_len = ascii_escape_unichar (c , (unsigned char * )buf , 0 );
344+
345+ for (i ++ ; i < input_chars ; i ++ ) {
328346 Py_UCS4 c = PyUnicode_READ (kind , input , i );
329- if (c <= 0x1f || c == '\\' || c == '"' || (ascii_only && c >= 0x7f )) {
330- ret = PyUnicodeWriter_WriteSubstring (writer , pystr , i - copy_len , i );
331- if (ret ) return ret ;
332- copy_len = 0 ;
347+ if (S_CHAR (c )) {
348+ buf [buf_len ++ ] = c ;
349+ }
350+ else {
351+ buf_len = ascii_escape_unichar (c , (unsigned char * )buf , buf_len );
352+ }
333353
334- chars = ascii_escape_unichar ( c , buf , 0 );
335- ret = PyUnicodeWriter_WriteUTF8 (writer , ( const char * ) buf , chars );
354+ if ( buf_len + 12 > ESCAPE_BUF_SIZE ) {
355+ ret = PyUnicodeWriter_WriteUTF8 (writer , buf , buf_len );
336356 if (ret ) return ret ;
357+ buf_len = 0 ;
358+ }
359+ }
360+
361+ assert (buf_len < ESCAPE_BUF_SIZE );
362+ buf [buf_len ++ ] = '"' ;
363+ return PyUnicodeWriter_WriteUTF8 (writer , buf , buf_len );
364+ }
365+
366+ static int
367+ write_escaped_unicode (PyUnicodeWriter * writer , PyObject * pystr )
368+ {
369+ Py_ssize_t i ;
370+ Py_ssize_t input_size ;
371+ Py_ssize_t buf_len ;
372+ const unsigned char * input ;
373+ int ret ;
374+ unsigned char c ;
375+ char buf [ESCAPE_BUF_SIZE ];
376+
377+ // We don't need to escape non-ASCII chars.
378+ // So we just copy UTF-8 from pystr to buf.
379+ input = (const unsigned char * ) PyUnicode_AsUTF8AndSize (pystr , & input_size );
380+
381+ ret = PyUnicodeWriter_WriteChar (writer , '"' );
382+ if (ret ) return ret ;
383+
384+ // Fast path for string doesn't need escape at all: e.g. "id", "name"
385+ for (i = 0 ; i < input_size ; i ++ ) {
386+ c = input [i ];
387+ if (c <= 0x1f || c == '\\' || c == '"' ) {
388+ break ;
389+ }
390+ }
391+ if (i > 0 ) {
392+ ret = PyUnicodeWriter_WriteUTF8 (writer , (const char * )input , i );
393+ if (ret ) return ret ;
394+ }
395+ if (i == input_size ) {
396+ return PyUnicodeWriter_WriteChar (writer , '"' );
397+ }
398+
399+ buf_len = ascii_escape_unichar (c , (unsigned char * )buf , 0 );
400+
401+ for (i ++ ; i < input_size ; i ++ ) {
402+ c = input [i ];
403+ if (c <= 0x1f || c == '\\' || c == '"' ) {
404+ buf_len = ascii_escape_unichar (c , (unsigned char * )buf , buf_len );
337405 }
338406 else {
339- copy_len ++ ;
407+ buf [buf_len ++ ] = c ;
408+ }
409+
410+ if (buf_len + 6 > ESCAPE_BUF_SIZE ) {
411+ ret = PyUnicodeWriter_WriteUTF8 (writer , buf , buf_len );
412+ if (ret ) return ret ;
413+ buf_len = 0 ;
340414 }
341415 }
342416
343- ret = PyUnicodeWriter_WriteSubstring ( writer , pystr , i - copy_len , i );
344- if ( ret ) return ret ;
345- return PyUnicodeWriter_WriteChar (writer , '"' );
417+ assert ( buf_len < ESCAPE_BUF_SIZE );
418+ buf [ buf_len ++ ] = '"' ;
419+ return PyUnicodeWriter_WriteUTF8 (writer , buf , buf_len );
346420}
347421
422+ #undef ESCAPE_BUF_SIZE
423+
348424static void
349425raise_errmsg (const char * msg , PyObject * s , Py_ssize_t end )
350426{
@@ -1293,17 +1369,15 @@ encoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12931369 s -> sort_keys = sort_keys ;
12941370 s -> skipkeys = skipkeys ;
12951371 s -> allow_nan = allow_nan ;
1296- s -> fast_encode = false;
1297- s -> ensure_ascii = false;
1372+ s -> fast_encode = NULL ;
12981373
12991374 if (PyCFunction_Check (s -> encoder )) {
13001375 PyCFunction f = PyCFunction_GetFunction (s -> encoder );
13011376 if (f == py_encode_basestring_ascii ){
1302- s -> fast_encode = true;
1303- s -> ensure_ascii = true;
1377+ s -> fast_encode = write_escaped_ascii ;
13041378 }
13051379 else if (f == py_encode_basestring ) {
1306- s -> fast_encode = true ;
1380+ s -> fast_encode = write_escaped_unicode ;
13071381 }
13081382 }
13091383
@@ -1497,7 +1571,7 @@ static int
14971571encoder_write_string (PyEncoderObject * s , PyUnicodeWriter * writer , PyObject * obj )
14981572{
14991573 if (s -> fast_encode ) {
1500- return write_escaped_unicode (writer , obj , s -> ensure_ascii );
1574+ return s -> fast_encode (writer , obj );
15011575 }
15021576
15031577 /* Return the JSON representation of a string */
0 commit comments