Skip to content

Commit 7143913

Browse files
Optimize DXT endpoints computation
This change improves the compression speed for DXT encoding.
Explanation: When performing per-component endpoint optimization, the trial solutions are generated using all possible combinations of the component values. The error bound computation is then performed for each block color of the trial solution to check whether an early out is possible. The important observation here is that some component values are present in several trial solutions and are therefore processed multiple times. The overall performance can therefore be improved by computing and caching the errors for all possible component values in advance.
DXT Testing: The modified algorithm has been tested on the Kodak test set using a 64-bit build with default settings (running on Windows 10, i7-4790, 3.6GHz). All the decompressed test images are identical to the images compressed and decompressed using the original version of Crunch (revision ea9b8d8).
[Compressing Kodak set without mipmaps using DXT1 encoding] Original: 1582222 bytes / 28.843 sec Modified: 1468204 bytes / 6.067 sec Improvement: 7.21% (compression ratio) / 78.97% (compression time)
[Compressing Kodak set with mipmaps using DXT1 encoding] Original: 2065243 bytes / 36.983 sec Modified: 1914805 bytes / 8.080 sec Improvement: 7.28% (compression ratio) / 78.15% (compression time)
ETC Testing: The modified algorithm has been tested on the Kodak test set using a 64-bit build with default settings (running on Windows 10, i7-4790, 3.6GHz). The ETC1 quantization parameters have been selected so that ETC1 compression gives approximately the same average Luma PSNR as the corresponding DXT1 compression (which is equal to 34.044 dB for the Kodak test set compressed without mipmaps using DXT1 encoding and default quality settings).
[Compressing Kodak set without mipmaps using ETC1 encoding] Total size: 1607858 bytes Total time: 13.421 sec Average bitrate: 1.363 bpp Average Luma PSNR: 34.050 dB
1 parent dbbef6a commit 7143913

File tree

3 files changed

+76
-113
lines changed

3 files changed

+76
-113
lines changed

bin/crunch_x64.exe

-512 Bytes
Binary file not shown.

crnlib/crn_dxt1.cpp

Lines changed: 75 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -295,125 +295,87 @@ void dxt1_endpoint_optimizer::return_solution() {
295295
}
296296

297297
// Per-component 1D endpoint optimization.
298-
void dxt1_endpoint_optimizer::optimize_endpoint_comps() {
299-
compute_selectors();
300-
if ((m_best_solution.m_alpha_block) || (!m_best_solution.m_error))
301-
return;
302298

303-
color_quad_u8 orig_l_scaled(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, true));
304-
color_quad_u8 orig_h_scaled(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, true));
305-
306-
color_quad_u8 min_color(0xFF, 0xFF, 0xFF, 0xFF);
307-
color_quad_u8 max_color(0, 0, 0, 0);
299+
void dxt1_endpoint_optimizer::compute_endpoint_component_errors(uint comp_index, uint64 (&error)[4][256], uint64 (&best_remaining_error)[4]) {
300+
uint64 W[4] = {}, WP2[4] = {}, WPP[4] = {};
308301
for (uint i = 0; i < m_unique_colors.size(); i++) {
309-
min_color = color_quad_u8::component_min(min_color, m_unique_colors[i].m_color);
310-
max_color = color_quad_u8::component_max(max_color, m_unique_colors[i].m_color);
302+
uint p = m_unique_colors[i].m_color[comp_index];
303+
uint w = m_unique_colors[i].m_weight;
304+
uint8 s = m_best_solution.m_selectors[i];
305+
W[s] += (int64)w;
306+
WP2[s] += (int64)w * p * 2;
307+
WPP[s] += (int64)w * p * p;
311308
}
312-
313-
// Try to separately optimize each component. This is a 1D problem so it's easy to compute accurate per-component error bounds.
314-
uint64 W[4] = {}, WD2[4] = {}, WDD[4] = {};
315-
for (uint comp_index = 0; comp_index < 3; comp_index++) {
316-
uint min_color_weight = 0;
317-
uint max_color_weight = 0;
318-
for (uint s = 0; s < 4; s++)
319-
W[s] = WD2[s] = WDD[s] = 0;
320-
for (uint i = 0; i < m_unique_colors.size(); i++) {
321-
uint c = m_unique_colors[i].m_color[comp_index];
322-
uint w = m_unique_colors[i].m_weight;
323-
uint8 s = m_best_solution.m_selectors[i];
324-
W[s] += (int64)w;
325-
WD2[s] += (int64)w * c * 2;
326-
WDD[s] += (int64)w * c * c;
327-
if (c == min_color[comp_index])
328-
min_color_weight += w;
329-
if (c == max_color[comp_index])
330-
max_color_weight += w;
309+
const uint comp_limit = comp_index == 1 ? 64 : 32;
310+
for (uint8 s = 0; s < 2; s++) {
311+
uint64 best_error = error[s][0] = WPP[s];
312+
for (uint8 c = 1; c < comp_limit; c++) {
313+
uint8 p = comp_index == 1 ? c << 2 | c >> 4 : c << 3 | c >> 2;
314+
error[s][c] = W[s] * p * p - WP2[s] * p + WPP[s];
315+
if (error[s][c] < best_error)
316+
best_error = error[s][c];
331317
}
318+
best_remaining_error[s] = best_error;
319+
}
320+
for (uint8 s = 2; s < 4; s++) {
321+
uint64 best_error = error[s][0] = WPP[s], d = W[s] - WP2[s], dd = W[s] << 1, e = WPP[s] + d;
322+
for (uint p = 1; p < 256; p++, d += dd, e += d) {
323+
error[s][p] = e;
324+
if (e < best_error)
325+
best_error = e;
326+
}
327+
best_remaining_error[s] = best_error;
328+
}
329+
for (uint8 s = 3; s; s--)
330+
best_remaining_error[s - 1] += best_remaining_error[s];
331+
}
332332

333-
uint ll[4];
334-
ll[0] = orig_l_scaled[comp_index];
335-
ll[1] = orig_h_scaled[comp_index];
336-
ll[2] = (ll[0] * 2 + ll[1]) / 3;
337-
ll[3] = (ll[0] + ll[1] * 2) / 3;
338-
339-
uint64 error_to_beat = 0;
340-
for (int s = 0; s < 4; s++)
341-
error_to_beat += W[s] * ll[s] * ll[s] - WD2[s] * ll[s] + WDD[s];
342-
343-
if (!error_to_beat)
333+
void dxt1_endpoint_optimizer::optimize_endpoint_comps() {
334+
compute_selectors();
335+
if (m_best_solution.m_alpha_block || !m_best_solution.m_error)
336+
return;
337+
color_quad_u8 source_low(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, true));
338+
color_quad_u8 source_high(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, true));
339+
uint64 error[4][256], best_remaining_error[4];
340+
for (uint comp_index = 0; comp_index < 3; comp_index++) {
341+
uint8 p0 = source_low[comp_index];
342+
uint8 p1 = source_high[comp_index];
343+
color_quad_u8 low(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, false));
344+
color_quad_u8 high(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, false));
345+
compute_endpoint_component_errors(comp_index, error, best_remaining_error);
346+
uint64 best_error = error[0][low[comp_index]] + error[1][high[comp_index]] + error[2][(p0 * 2 + p1) / 3] + error[3][(p0 + p1 * 2) / 3];
347+
if (best_remaining_error[0] >= best_error)
344348
continue;
345-
346-
CRNLIB_ASSERT((min_color_weight > 0) && (max_color_weight > 0));
347-
const uint error_to_beat_div_min_color_weight = min_color_weight ? ((error_to_beat + min_color_weight - 1) / min_color_weight) : 0;
348-
const uint error_to_beat_div_max_color_weight = max_color_weight ? ((error_to_beat + max_color_weight - 1) / max_color_weight) : 0;
349-
350-
const uint m = (comp_index == 1) ? 63 : 31;
351-
const uint m_shift = (comp_index == 1) ? 3 : 2;
352-
353-
for (uint o = 0; o <= m; o++) {
354-
uint tl[4];
355-
356-
tl[0] = (comp_index == 1) ? ((o << 2) | (o >> 4)) : ((o << 3) | (o >> 2));
357-
358-
for (uint h = 0; h < 8; h++) {
359-
const uint pl = h << m_shift;
360-
const uint ph = ((h + 1) << m_shift) - 1;
361-
362-
uint tl_l = (comp_index == 1) ? ((pl << 2) | (pl >> 4)) : ((pl << 3) | (pl >> 2));
363-
uint tl_h = (comp_index == 1) ? ((ph << 2) | (ph >> 4)) : ((ph << 3) | (ph >> 2));
364-
365-
tl_l = math::minimum(tl_l, tl[0]);
366-
tl_h = math::maximum(tl_h, tl[0]);
367-
368-
uint c_l = min_color[comp_index];
369-
uint c_h = max_color[comp_index];
370-
371-
if (c_h < tl_l) {
372-
uint min_possible_error = math::square<int>(tl_l - c_l);
373-
if (min_possible_error > error_to_beat_div_min_color_weight)
374-
continue;
375-
} else if (c_l > tl_h) {
376-
uint min_possible_error = math::square<int>(c_h - tl_h);
377-
if (min_possible_error > error_to_beat_div_max_color_weight)
378-
continue;
379-
}
380-
381-
for (uint p = pl; p <= ph; p++) {
382-
tl[1] = (comp_index == 1) ? ((p << 2) | (p >> 4)) : ((p << 3) | (p >> 2));
383-
384-
tl[2] = (tl[0] * 2 + tl[1]) / 3;
385-
tl[3] = (tl[0] + tl[1] * 2) / 3;
386-
387-
uint64 trial_error = 0;
388-
for (int s = 0; s < 4; s++)
389-
trial_error += W[s] * tl[s] * tl[s] - WD2[s] * tl[s] + WDD[s];
390-
391-
if (trial_error < error_to_beat) {
392-
color_quad_u8 l(dxt1_block::unpack_color(m_best_solution.m_coords.m_low_color, false));
393-
color_quad_u8 h(dxt1_block::unpack_color(m_best_solution.m_coords.m_high_color, false));
394-
l[comp_index] = static_cast<uint8>(o);
395-
h[comp_index] = static_cast<uint8>(p);
396-
397-
if (evaluate_solution(dxt1_solution_coordinates(dxt1_block::pack_color(l, false), dxt1_block::pack_color(h, false)))) {
398-
if (!m_best_solution.m_error)
399-
return;
400-
compute_selectors();
401-
for (uint s = 0; s < 4; s++)
402-
W[s] = WD2[s] = WDD[s] = 0;
403-
for (uint i = 0; i < m_unique_colors.size(); i++) {
404-
uint c = m_unique_colors[i].m_color[comp_index];
405-
uint w = m_unique_colors[i].m_weight;
406-
uint8 s = m_best_solution.m_selectors[i];
407-
W[s] += (int64)w;
408-
WD2[s] += (int64)w * c * 2;
409-
WDD[s] += (int64)w * c * c;
410-
}
411-
error_to_beat = 0;
412-
for (int s = 0; s < 4; s++)
413-
error_to_beat += W[s] * tl[s] * tl[s] - WD2[s] * tl[s] + WDD[s];
414-
}
415-
}
416-
}
349+
const uint comp_limit = comp_index == 1 ? 64 : 32;
350+
for (uint8 c0 = 0; c0 < comp_limit; c0++) {
351+
uint64 e0 = error[0][c0];
352+
if (e0 + best_remaining_error[1] >= best_error)
353+
continue;
354+
low[comp_index] = c0;
355+
uint16 packed_low = dxt1_block::pack_color(low, false);
356+
p0 = comp_index == 1 ? c0 << 2 | c0 >> 4 : c0 << 3 | c0 >> 2;
357+
for (uint8 c1 = 0; c1 < comp_limit; c1++) {
358+
uint64 e = e0 + error[1][c1];
359+
if (e + best_remaining_error[2] >= best_error)
360+
continue;
361+
p1 = comp_index == 1 ? c1 << 2 | c1 >> 4 : c1 << 3 | c1 >> 2;
362+
e += error[2][(p0 * 2 + p1) / 3];
363+
if (e + best_remaining_error[3] >= best_error)
364+
continue;
365+
e += error[3][(p0 + p1 * 2) / 3];
366+
if (e >= best_error)
367+
continue;
368+
high[comp_index] = c1;
369+
if (!evaluate_solution(dxt1_solution_coordinates(packed_low, dxt1_block::pack_color(high, false))))
370+
continue;
371+
if (!m_best_solution.m_error)
372+
return;
373+
compute_selectors();
374+
compute_endpoint_component_errors(comp_index, error, best_remaining_error);
375+
best_error = error[0][c0] + error[1][c1] + error[2][(p0 * 2 + p1) / 3] + error[3][(p0 + p1 * 2) / 3];
376+
e0 = error[0][c0];
377+
if (e0 + best_remaining_error[1] >= best_error)
378+
break;
417379
}
418380
}
419381
}

crnlib/crn_dxt1.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,7 @@ class dxt1_endpoint_optimizer {
259259
void compute_vectors(const vec3F& perceptual_weights);
260260
void return_solution();
261261
void try_combinatorial_encoding();
262+
void compute_endpoint_component_errors(uint comp_index, uint64 (&error)[4][256], uint64 (&best_remaining_error)[4]);
262263
void optimize_endpoint_comps();
263264
void optimize_endpoints(vec3F& low_color, vec3F& high_color);
264265
bool try_alpha_as_black_optimization();

0 commit comments

Comments
 (0)