Skip to content

Commit 76cd4bd

Browse files
authored
Merge pull request #4640 from JackStouffer/utf
Make std.utf.toUTF8 DRY by using byUTF internally
2 parents 43ca58d + e096f29 commit 76cd4bd

File tree

1 file changed

+50
-64
lines changed

1 file changed

+50
-64
lines changed

std/utf.d

Lines changed: 50 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -2418,11 +2418,7 @@ void validate(S)(in S str) @safe pure
24182418
}
24192419

24202420
/* =================== Conversion to UTF8 ======================= */
2421-
2422-
pure
2423-
{
2424-
2425-
char[] toUTF8(return out char[4] buf, dchar c) nothrow @nogc @safe
2421+
char[] toUTF8(return out char[4] buf, dchar c) nothrow @nogc @safe pure
24262422
{
24272423
if (c <= 0x7F)
24282424
{
@@ -2462,73 +2458,66 @@ char[] toUTF8(return out char[4] buf, dchar c) nothrow @nogc @safe
24622458
}
24632459
}
24642460

2465-
/*******************
2466-
* Encodes string $(D_PARAM s) into UTF-8 and returns the encoded string.
2461+
/**
2462+
* Encodes the elements of `s` to UTF-8 and returns a newly allocated
2463+
* string of the elements.
2464+
*
2465+
* Params:
2466+
* s = the string to encode
2467+
* Returns:
2468+
* A UTF-8 string
2469+
* See_Also:
2470+
* For a lazy, non-allocating version of these functions, see $(LREF byUTF).
24672471
*/
2468-
string toUTF8(scope const char[] s) @safe
2472+
string toUTF8(S)(S s) if (isInputRange!S && isSomeChar!(ElementEncodingType!S))
24692473
{
2470-
validate(s);
2471-
return s.idup;
2472-
}
2474+
static if (is(S : string))
2475+
{
2476+
return s.idup;
2477+
}
2478+
else
2479+
{
2480+
import std.array : appender;
2481+
auto app = appender!string();
24732482

2474-
/// ditto
2475-
string toUTF8(scope const wchar[] s) @safe
2476-
{
2477-
char[] r;
2478-
size_t i;
2479-
immutable slen = s.length;
2483+
static if (hasLength!S || isSomeString!S)
2484+
app.reserve(s.length);
24802485

2481-
r.length = slen;
2482-
for (i = 0; i < slen; i++)
2483-
{
2484-
immutable c = s[i];
2486+
foreach (c; s.byUTF!char)
2487+
app.put(c);
24852488

2486-
if (c <= 0x7F)
2487-
r[i] = cast(char)c; // fast path for ascii
2488-
else
2489-
{
2490-
r.length = i;
2491-
while (i < slen)
2492-
encode(r, decode(s, i));
2493-
break;
2494-
}
2489+
return app.data;
24952490
}
2496-
2497-
return r;
24982491
}
24992492

2500-
/// ditto
2501-
string toUTF8(scope const dchar[] s) @safe
2493+
///
2494+
@safe pure unittest
25022495
{
2503-
char[] r;
2504-
size_t i;
2505-
immutable slen = s.length;
2496+
import std.algorithm.comparison : equal;
25062497

2507-
r.length = slen;
2508-
for (i = 0; i < slen; i++)
2509-
{
2510-
immutable c = s[i];
2498+
// The ø is represented by two UTF-8 code units
2499+
assert("Hellø"w.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));
25112500

2512-
if (c <= 0x7F)
2513-
r[i] = cast(char)c; // fast path for ascii
2514-
else
2515-
{
2516-
r.length = i;
2517-
foreach (dchar d; s[i .. slen])
2518-
{
2519-
encode(r, d);
2520-
}
2521-
break;
2522-
}
2523-
}
2501+
// 𐐷 is four code units in UTF-8
2502+
assert("𐐷"d.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
2503+
}
25242504

2525-
return r;
2505+
@system pure unittest
2506+
{
2507+
import std.internal.test.dummyrange : ReferenceInputRange;
2508+
import std.algorithm.comparison : equal;
2509+
2510+
auto r1 = new ReferenceInputRange!dchar("Hellø");
2511+
auto r2 = new ReferenceInputRange!dchar("𐐷");
2512+
2513+
assert(r1.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8]));
2514+
assert(r2.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7]));
25262515
}
25272516

25282517

25292518
/* =================== Conversion to UTF16 ======================= */
25302519

2531-
wchar[] toUTF16(return ref wchar[2] buf, dchar c) nothrow @nogc @safe
2520+
wchar[] toUTF16(return ref wchar[2] buf, dchar c) nothrow @nogc @safe pure
25322521
in
25332522
{
25342523
assert(isValidDchar(c));
@@ -2551,7 +2540,7 @@ body
25512540
/****************
25522541
* Encodes string $(D s) into UTF-16 and returns the encoded string.
25532542
*/
2554-
wstring toUTF16(scope const char[] s) @safe
2543+
wstring toUTF16(scope const char[] s) @safe pure
25552544
{
25562545
wchar[] r;
25572546
immutable slen = s.length;
@@ -2577,14 +2566,14 @@ wstring toUTF16(scope const char[] s) @safe
25772566
}
25782567

25792568
/// ditto
2580-
wstring toUTF16(scope const wchar[] s) @safe
2569+
wstring toUTF16(scope const wchar[] s) @safe pure
25812570
{
25822571
validate(s);
25832572
return s.idup;
25842573
}
25852574

25862575
/// ditto
2587-
wstring toUTF16(scope const dchar[] s) @safe
2576+
wstring toUTF16(scope const dchar[] s) @safe pure
25882577
{
25892578
wchar[] r;
25902579
immutable slen = s.length;
@@ -2605,7 +2594,7 @@ wstring toUTF16(scope const dchar[] s) @safe
26052594
/*****
26062595
* Encodes string $(D_PARAM s) into UTF-32 and returns the encoded string.
26072596
*/
2608-
dstring toUTF32(scope const char[] s) @safe
2597+
dstring toUTF32(scope const char[] s) @safe pure
26092598
{
26102599
dchar[] r;
26112600
immutable slen = s.length;
@@ -2626,7 +2615,7 @@ dstring toUTF32(scope const char[] s) @safe
26262615
}
26272616

26282617
/// ditto
2629-
dstring toUTF32(scope const wchar[] s) @safe
2618+
dstring toUTF32(scope const wchar[] s) @safe pure
26302619
{
26312620
dchar[] r;
26322621
immutable slen = s.length;
@@ -2647,15 +2636,12 @@ dstring toUTF32(scope const wchar[] s) @safe
26472636
}
26482637

26492638
/// ditto
2650-
dstring toUTF32(scope const dchar[] s) @safe
2639+
dstring toUTF32(scope const dchar[] s) @safe pure
26512640
{
26522641
validate(s);
26532642
return s.idup;
26542643
}
26552644

2656-
} // Convert functions are @safe
2657-
2658-
26592645
/* =================== toUTFz ======================= */
26602646

26612647
/++

0 commit comments

Comments
 (0)