@@ -44,48 +44,102 @@ class DefaultWordCutter implements Cutter {
4444 Sanity .nullCheck (message , "Message" );
4545 Sanity .truthiness (size > 0 , "Size must be positive" );
4646 List <String > list = new ArrayList <>();
47- if (message . length ( ) <= size ) {
47+ if (this . encodedLength ( message ) <= size ) {
4848 list .add (message );
4949 return list ;
5050 }
5151 StringBuilder builder = new StringBuilder (size );
5252 for (String word : message .split (" " )) {
53- if ((builder .length () + word .length () + ((builder .length () == 0 ) ? 0 : 1 )) > size ) {
54- if ((word .length () > size ) && ((builder .length () + 1 ) < size )) {
55- if (builder .length () > 0 ) {
53+ int builderLen = this .encodedLength (builder );
54+ if ((builderLen + this .encodedLength (word ) + ((builderLen == 0 ) ? 0 : 1 )) > size ) {
55+ if ((word .length () > size ) && ((builderLen + 1 ) < size )) {
56+ if (builderLen > 0 ) {
5657 builder .append (' ' );
58+ builderLen ++;
5759 }
58- int cut = size - builder . length () ;
60+ int cut = size - builderLen ;
5961 builder .append (word , 0 , cut );
6062 word = word .substring (cut );
6163 }
6264 list .add (builder .toString ().trim ());
6365 builder .setLength (0 );
64- while (word . length ( ) > size ) {
66+ while (this . encodedLength ( word ) > size ) {
6567 list .add (word .substring (0 , size ));
6668 word = word .substring (size );
6769 }
6870 }
69- if (builder . length ( ) > 0 ) {
71+ if (this . encodedLength ( builder ) > 0 ) {
7072 builder .append (' ' );
7173 }
7274 builder .append (word );
7375 }
74- if (builder . length ( ) > 0 ) {
76+ if (this . encodedLength ( builder ) > 0 ) {
7577 list .add (builder .toString ().trim ());
7678 }
7779 return list ;
7880 }
81+
82+ /*
83+ * The below two methods are from Guava's Utf8 class, licensed Apache 2.0 (see NOTICE file for more)
84+ * As this will always be small with minimal consequence, exceptions for large or malformed text are stripped.
85+ */
86+
87+ /**
88+ * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, this
89+ * method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in both
90+ * time and space.
91+ */
92+ private int encodedLength (CharSequence sequence ) {
93+ // Warning to maintainers: this implementation is highly optimized.
94+ int utf16Length = sequence .length ();
95+ int utf8Length = utf16Length ;
96+ int i = 0 ;
97+
98+ // This loop optimizes for pure ASCII.
99+ while (i < utf16Length && sequence .charAt (i ) < 0x80 ) {
100+ i ++;
101+ }
102+
103+ // This loop optimizes for chars less than 0x800.
104+ for (; i < utf16Length ; i ++) {
105+ char c = sequence .charAt (i );
106+ if (c < 0x800 ) {
107+ utf8Length += ((0x7f - c ) >>> 31 ); // branch free!
108+ } else {
109+ utf8Length += this .encodedLengthGeneral (sequence , i );
110+ break ;
111+ }
112+ }
113+
114+ return utf8Length ;
115+ }
116+
117+ private int encodedLengthGeneral (CharSequence sequence , int start ) {
118+ int utf16Length = sequence .length ();
119+ int utf8Length = 0 ;
120+ for (int i = start ; i < utf16Length ; i ++) {
121+ char c = sequence .charAt (i );
122+ if (c < 0x800 ) {
123+ utf8Length += (0x7f - c ) >>> 31 ; // branch free!
124+ } else {
125+ utf8Length += 2 ;
126+ if (Character .isSurrogate (c )) {
127+ i ++;
128+ }
129+ }
130+ }
131+ return utf8Length ;
132+ }
79133 }
80134
81135 /**
82136 * Splits a message into items no longer than the size limit.
83137 *
84138 * @param message message to split
85- * @param size size limit per returned string
139+ * @param size size limit per returned string
86140 * @return split up string
87141 * @throws IllegalArgumentException if size is less than 1 or if
88- * message is null
142+ * message is null
89143 */
90144 @ NonNull List <String > split (@ NonNull String message , @ NonNegative int size );
91145}
0 commit comments