@@ -480,7 +480,7 @@ function byteLength (string, encoding) {
480480 return len
481481 case 'utf8' :
482482 case 'utf-8' :
483- return utf8ToBytes ( string ) . length
483+ return utf8ByteLength ( string )
484484 case 'ucs2' :
485485 case 'ucs-2' :
486486 case 'utf16le' :
@@ -492,7 +492,7 @@ function byteLength (string, encoding) {
492492 return base64ToBytes ( string ) . length
493493 default :
494494 if ( loweredCase ) {
495- return mustMatch ? - 1 : utf8ToBytes ( string ) . length // assume utf8
495+ return mustMatch ? - 1 : utf8ByteLength ( string ) // assume utf8
496496 }
497497 encoding = ( '' + encoding ) . toLowerCase ( )
498498 loweredCase = true
@@ -870,7 +870,141 @@ function hexWrite (buf, string, offset, length) {
870870}
871871
// Encodes `string` as UTF-8 directly into `buf`, starting at index `offset`
// and using at most `length` bytes.
//
// - Only whole characters are written: encoding stops at the first character
//   that no longer fits in the remaining byte budget.
// - Unpaired surrogates are replaced by U+FFFD (3 bytes) via writeInvalid().
// - The budget is additionally capped at the space that exists after `offset`,
//   and a missing `length` means "up to the end of `buf`".
//
// Returns the number of bytes written.
// Throws Error('Invalid code point') if a decoded code point is >= 0x110000.
function utf8Write (buf, string, offset, length) {
  // Cap the budget at the real capacity. Without this, a `length` larger than
  // the space after `offset` lets `pos` run past the end of `buf`, and the
  // returned count overstates what was stored (typed arrays drop such stores).
  const capacity = Math.max(buf.length - offset, 0)
  let remaining = length === undefined ? capacity : Math.min(length, capacity)
  let leadSurrogate = 0 // pending lead surrogate, 0 when there is none
  let pos = offset

  for (let i = 0; i < string.length; i++) {
    let codePoint = string.charCodeAt(i)

    // is surrogate component
    if (codePoint > 0xd7ff && codePoint < 0xe000) {
      if (!leadSurrogate) {
        // no lead pending yet
        if (codePoint > 0xdbff) {
          // unexpected trail
          if (remaining >= 3) pos = writeInvalid(buf, pos)
          // charged even when nothing was written, so that no later (shorter)
          // character can be emitted after a skipped one
          remaining -= 3
          continue
        } else if (i + 1 === string.length) {
          // unpaired lead at the very end of the string
          if (remaining >= 3) pos = writeInvalid(buf, pos)
          remaining -= 3
          continue
        }

        // valid lead, wait for its trail
        leadSurrogate = codePoint

        continue
      }

      // 2 leads in a row: the previous one is unpaired, the new one is pending
      if (codePoint < 0xdc00) {
        if (remaining >= 3) pos = writeInvalid(buf, pos)
        remaining -= 3
        leadSurrogate = codePoint
        continue
      }

      // valid surrogate pair: combine lead and trail into one code point
      codePoint -= 0xdc00
      codePoint |= (leadSurrogate - 0xd800) << 10
      codePoint += 0x10000
    } else if (leadSurrogate) {
      // valid bmp char, but last char was a lead
      if (remaining >= 3) pos = writeInvalid(buf, pos)
      remaining -= 3
    }

    leadSurrogate = 0

    // encode utf8
    if (codePoint < 0x80) {
      if (remaining < 1) break
      buf[pos++] = codePoint
      remaining -= 1
    } else if (codePoint < 0x800) {
      if (remaining < 2) break
      buf[pos++] = (codePoint >> 6) | 0xc0
      buf[pos++] = (codePoint & 0x3f) | 0x80
      remaining -= 2
    } else if (codePoint < 0x10000) {
      if (remaining < 3) break
      buf[pos++] = (codePoint >> 12) | 0xe0
      buf[pos++] = ((codePoint >> 6) & 0x3f) | 0x80
      buf[pos++] = (codePoint & 0x3f) | 0x80
      remaining -= 3
    } else if (codePoint < 0x110000) {
      if (remaining < 4) break
      buf[pos++] = (codePoint >> 18) | 0xf0
      buf[pos++] = ((codePoint >> 12) & 0x3f) | 0x80
      buf[pos++] = ((codePoint >> 6) & 0x3f) | 0x80
      buf[pos++] = (codePoint & 0x3f) | 0x80
      remaining -= 4
    } else {
      throw new Error('Invalid code point')
    }
  }

  // A lead can still be pending here when the string ends with two (or more)
  // leads in a row; that last lead is unpaired as well and must not vanish.
  // (Every `break` above happens after leadSurrogate was reset to 0.)
  if (leadSurrogate && remaining >= 3) pos = writeInvalid(buf, pos)

  return pos - offset
}
953+
// Returns the number of bytes `string` occupies when encoded as UTF-8,
// without allocating the encoded bytes.
//
// Counting rules (per UTF-16 code unit of `string`):
// - a valid lead + trail surrogate pair counts as one 4-byte sequence
// - every unpaired surrogate counts as U+FFFD, i.e. 3 bytes
// - any other code unit counts 1, 2 or 3 bytes depending on its value
function utf8ByteLength (string) {
  let pendingLead = false // true while a lead surrogate awaits its trail
  let size = 0

  for (let i = 0; i < string.length; i++) {
    const unit = string.charCodeAt(i)

    if (unit >= 0xd800 && unit <= 0xdbff) {
      // lead surrogate: a lead that was already pending is unpaired
      if (pendingLead) size += 3
      pendingLead = true
      continue
    }

    if (unit >= 0xdc00 && unit <= 0xdfff) {
      // trail surrogate: completes a pair (4 bytes) or is unpaired (3 bytes)
      size += pendingLead ? 4 : 3
      pendingLead = false
      continue
    }

    if (pendingLead) {
      // valid bmp char, but last char was a lead
      size += 3
      pendingLead = false
    }

    size += unit < 0x80 ? 1 : unit < 0x800 ? 2 : 3
  }

  // A lead still pending at the end of the string is unpaired. This covers a
  // single trailing lead as well as the last of several leads in a row, which
  // would otherwise contribute no bytes at all.
  if (pendingLead) size += 3

  return size
}
8751009
8761010function asciiWrite ( buf , string , offset , length ) {
@@ -1990,90 +2124,18 @@ function base64clean (str) {
19902124 return str
19912125}
19922126
// Encodes `string` as UTF-8 and returns the bytes as a plain array of numbers.
//
// `units` is an optional byte budget: encoding stops at the first character
// that would exceed it. An explicit 0 means "no bytes"; any other falsy value
// (e.g. undefined) means unlimited.
// Unpaired surrogates are encoded as U+FFFD (0xEF 0xBF 0xBD).
//
// Throws Error('Invalid code point') if a decoded code point is >= 0x110000.
function utf8ToBytes (string, units) {
  // `units || Infinity` alone would turn a budget of 0 into "unlimited"
  units = units === 0 ? 0 : (units || Infinity)
  const length = string.length
  const bytes = []
  let leadSurrogate = null // pending lead surrogate, null when there is none
  let codePoint

  for (let i = 0; i < length; ++i) {
    codePoint = string.charCodeAt(i)

    // is surrogate component
    if (codePoint > 0xD7FF && codePoint < 0xE000) {
      if (!leadSurrogate) {
        // no lead pending yet
        if (codePoint > 0xDBFF) {
          // unexpected trail
          if ((units -= 3) > -1) bytes.push(0xEF, 0xBF, 0xBD)
          continue
        } else if (i + 1 === length) {
          // unpaired lead at the very end of the string
          if ((units -= 3) > -1) bytes.push(0xEF, 0xBF, 0xBD)
          continue
        }

        // valid lead, wait for its trail
        leadSurrogate = codePoint

        continue
      }

      // 2 leads in a row: the previous one is unpaired, the new one is pending
      if (codePoint < 0xDC00) {
        if ((units -= 3) > -1) bytes.push(0xEF, 0xBF, 0xBD)
        leadSurrogate = codePoint
        continue
      }

      // valid surrogate pair: combine lead and trail into one code point
      codePoint = (leadSurrogate - 0xD800 << 10 | codePoint - 0xDC00) + 0x10000
    } else if (leadSurrogate) {
      // valid bmp char, but last char was a lead
      if ((units -= 3) > -1) bytes.push(0xEF, 0xBF, 0xBD)
    }

    leadSurrogate = null

    // encode utf8
    if (codePoint < 0x80) {
      if ((units -= 1) < 0) break
      bytes.push(codePoint)
    } else if (codePoint < 0x800) {
      if ((units -= 2) < 0) break
      bytes.push(
        codePoint >> 0x6 | 0xC0,
        codePoint & 0x3F | 0x80
      )
    } else if (codePoint < 0x10000) {
      if ((units -= 3) < 0) break
      bytes.push(
        codePoint >> 0xC | 0xE0,
        codePoint >> 0x6 & 0x3F | 0x80,
        codePoint & 0x3F | 0x80
      )
    } else if (codePoint < 0x110000) {
      if ((units -= 4) < 0) break
      bytes.push(
        codePoint >> 0x12 | 0xF0,
        codePoint >> 0xC & 0x3F | 0x80,
        codePoint >> 0x6 & 0x3F | 0x80,
        codePoint & 0x3F | 0x80
      )
    } else {
      throw new Error('Invalid code point')
    }
  }

  // A lead can still be pending when the string ends with several leads in a
  // row; the last one is unpaired too and must be replaced, not dropped.
  // (Every `break` above happens after leadSurrogate was reset to null.)
  if (leadSurrogate && (units -= 3) > -1) bytes.push(0xEF, 0xBF, 0xBD)

  return bytes
}
2072-
// Decodes a base64 string into bytes: the input is first passed through
// base64clean(), and the result is handed to base64.toByteArray().
function base64ToBytes (str) {
  const cleaned = base64clean(str)
  return base64.toByteArray(cleaned)
}
20762130
// Stores the three UTF-8 bytes of U+FFFD (Replacement Character) into `buf`
// beginning at index `pos`, and returns the index just past the last byte.
function writeInvalid (buf, pos) {
  const replacement = [0xef, 0xbf, 0xbd]
  for (const byte of replacement) {
    buf[pos++] = byte
  }
  return pos
}
2138+
20772139function blitBuffer ( src , dst , offset , length ) {
20782140 let i
20792141 for ( i = 0 ; i < length ; ++ i ) {
0 commit comments