Skip to content

Commit c2f8860

Browse files
committed
Handling Cutter lengths as utf-8
1 parent 9dcc465 commit c2f8860

File tree

2 files changed

+82
-10
lines changed

2 files changed

+82
-10
lines changed

NOTICE

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,24 @@ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
2222
-----------------------
2323

2424

25+
The license for Guava, from whose Utf8 class some code was adapted for the KICL Cutter class, is as follows:
26+
-----------------------
27+
Copyright (C) 2013 The Guava Authors
28+
29+
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
30+
in compliance with the License. You may obtain a copy of the License at
31+
32+
http://www.apache.org/licenses/LICENSE-2.0
33+
34+
Unless required by applicable law or agreed to in writing, software distributed under the License
35+
is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
36+
or implied. See the License for the specific language governing permissions and limitations under
37+
the License.
38+
-----------------------
39+
The Utf8 class is noted to have the following two authors:
40+
Martin Buchholz
41+
Clément Roux
42+
2543

2644
The license for the Netty framework, located (in the binary) in the folder org/kitteh/irc/lib/io/netty/ is as follows:
2745
-----------------------

src/main/java/org/kitteh/irc/client/library/util/Cutter.java

Lines changed: 64 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -44,48 +44,102 @@ class DefaultWordCutter implements Cutter {
4444
Sanity.nullCheck(message, "Message");
4545
Sanity.truthiness(size > 0, "Size must be positive");
4646
List<String> list = new ArrayList<>();
47-
if (message.length() <= size) {
47+
if (this.encodedLength(message) <= size) {
4848
list.add(message);
4949
return list;
5050
}
5151
StringBuilder builder = new StringBuilder(size);
5252
for (String word : message.split(" ")) {
53-
if ((builder.length() + word.length() + ((builder.length() == 0) ? 0 : 1)) > size) {
54-
if ((word.length() > size) && ((builder.length() + 1) < size)) {
55-
if (builder.length() > 0) {
53+
int builderLen = this.encodedLength(builder);
54+
if ((builderLen + this.encodedLength(word) + ((builderLen == 0) ? 0 : 1)) > size) {
55+
if ((word.length() > size) && ((builderLen + 1) < size)) {
56+
if (builderLen > 0) {
5657
builder.append(' ');
58+
builderLen++;
5759
}
58-
int cut = size - builder.length();
60+
int cut = size - builderLen;
5961
builder.append(word, 0, cut);
6062
word = word.substring(cut);
6163
}
6264
list.add(builder.toString().trim());
6365
builder.setLength(0);
64-
while (word.length() > size) {
66+
while (this.encodedLength(word) > size) {
6567
list.add(word.substring(0, size));
6668
word = word.substring(size);
6769
}
6870
}
69-
if (builder.length() > 0) {
71+
if (this.encodedLength(builder) > 0) {
7072
builder.append(' ');
7173
}
7274
builder.append(word);
7375
}
74-
if (builder.length() > 0) {
76+
if (this.encodedLength(builder) > 0) {
7577
list.add(builder.toString().trim());
7678
}
7779
return list;
7880
}
81+
82+
/*
83+
* The below two methods are from Guava's Utf8 class, licensed Apache 2.0 (see NOTICE file for more)
84+
* As this will always be small with minimal consequence, exceptions for large or malformed text are stripped.
85+
*/
86+
87+
/**
88+
* Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, this
89+
* method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in both
90+
* time and space.
91+
*/
92+
private int encodedLength(CharSequence sequence) {
93+
// Warning to maintainers: this implementation is highly optimized.
94+
int utf16Length = sequence.length();
95+
int utf8Length = utf16Length;
96+
int i = 0;
97+
98+
// This loop optimizes for pure ASCII.
99+
while (i < utf16Length && sequence.charAt(i) < 0x80) {
100+
i++;
101+
}
102+
103+
// This loop optimizes for chars less than 0x800.
104+
for (; i < utf16Length; i++) {
105+
char c = sequence.charAt(i);
106+
if (c < 0x800) {
107+
utf8Length += ((0x7f - c) >>> 31); // branch free!
108+
} else {
109+
utf8Length += this.encodedLengthGeneral(sequence, i);
110+
break;
111+
}
112+
}
113+
114+
return utf8Length;
115+
}
116+
117+
private int encodedLengthGeneral(CharSequence sequence, int start) {
118+
int utf16Length = sequence.length();
119+
int utf8Length = 0;
120+
for (int i = start; i < utf16Length; i++) {
121+
char c = sequence.charAt(i);
122+
if (c < 0x800) {
123+
utf8Length += (0x7f - c) >>> 31; // branch free!
124+
} else {
125+
utf8Length += 2;
126+
if (Character.isSurrogate(c)) {
127+
i++;
128+
}
129+
}
130+
}
131+
return utf8Length;
132+
}
79133
}
80134

81135
/**
82136
* Splits a message into items no longer than the size limit.
83137
*
84138
* @param message message to split
85-
* @param size size limit per returned string
139+
* @param size size limit per returned string
86140
* @return split up string
87141
* @throws IllegalArgumentException if size is less than 1 or if
88-
* message is null
142+
* message is null
89143
*/
90144
@NonNull List<String> split(@NonNull String message, @NonNegative int size);
91145
}

0 commit comments

Comments
 (0)