@@ -225,54 +225,81 @@ function utf8ToArray (str: string): number[] {
225225 */
export const toUTF8 = (arr: number[]): string => {
  // Decodes `arr` (a sequence of UTF-8 byte values) into a JavaScript string.
  //
  // Ill-formed input never throws. Each offending lead byte is replaced by
  // U+FFFD and decoding resumes at the very next byte, so stray continuation
  // bytes are each replaced individually. The following are treated as
  // ill-formed:
  //   - bytes that cannot start a sequence (0x80-0xBF, 0xF8-0xFF)
  //   - sequences truncated by the end of the input
  //   - continuation bytes that are not of the form 10xxxxxx
  //   - overlong encodings (a code point encoded with more bytes than needed)
  //   - UTF-16 surrogate code points (U+D800-U+DFFF)
  //   - code points above U+10FFFF
  const replacementChar = '\uFFFD'
  let result = ''

  for (let i = 0; i < arr.length; i++) {
    const byte1 = arr[i]

    // 1-byte sequence (0xxxxxxx)
    if (byte1 <= 0x7f) {
      result += String.fromCharCode(byte1)
      continue
    }

    // Classify the lead byte: how many continuation bytes follow, which
    // payload bits the lead byte contributes, and the smallest code point
    // that legitimately needs a sequence of this length.
    let continuationCount: number
    let codePoint: number
    let minCodePoint: number
    if (byte1 >= 0xc0 && byte1 <= 0xdf) {
      // 2-byte sequence (110xxxxx 10xxxxxx)
      continuationCount = 1
      codePoint = byte1 & 0x1f
      minCodePoint = 0x80
    } else if (byte1 >= 0xe0 && byte1 <= 0xef) {
      // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
      continuationCount = 2
      codePoint = byte1 & 0x0f
      minCodePoint = 0x800
    } else if (byte1 >= 0xf0 && byte1 <= 0xf7) {
      // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
      continuationCount = 3
      codePoint = byte1 & 0x07
      minCodePoint = 0x10000
    } else {
      // Lone continuation byte or a byte that is never valid in UTF-8.
      result += replacementChar
      continue
    }

    // Sequence runs past the end of the input.
    if (i + continuationCount >= arr.length) {
      result += replacementChar
      continue
    }

    // Fold in the 6 payload bits of every continuation byte, bailing out on
    // the first byte that is not of the form 10xxxxxx.
    let wellFormed = true
    for (let j = 1; j <= continuationCount; j++) {
      const next = arr[i + j]
      if ((next & 0xc0) !== 0x80) {
        wellFormed = false
        break
      }
      codePoint = (codePoint << 6) | (next & 0x3f)
    }

    const isOverlong = codePoint < minCodePoint
    const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff
    const isOutOfRange = codePoint > 0x10ffff
    if (!wellFormed || isOverlong || isSurrogate || isOutOfRange) {
      // Only the lead byte is consumed; the following bytes are re-examined
      // on the next iterations.
      result += replacementChar
      continue
    }

    // fromCodePoint emits a surrogate pair for code points above U+FFFF.
    result += String.fromCodePoint(codePoint)
    i += continuationCount
  }

  return result
}
278305
0 commit comments