Skip to content

Commit 5dfcc5a

Browse files
committed
mtmd: add detailed comments for resize_bicubic_pillow
1 parent 2d918b3 commit 5dfcc5a

File tree

1 file changed

+101
-55
lines changed

1 file changed

+101
-55
lines changed

tools/mtmd/clip.cpp

Lines changed: 101 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -4386,10 +4386,19 @@ struct img_tool {
43864386

43874387
// Bicubic resize function using Pillow's ImagingResample algorithm
43884388
// Adapted from https://github.com/python-pillow/Pillow/blob/main/src/libImaging/Resample.c
4389+
//
4390+
// Key differences from resize_bicubic:
4391+
// 1. Uses separable filtering: horizontal pass followed by vertical pass
4392+
// 2. Pre-computes normalized filter coefficients for each output pixel
4393+
// 3. Applies convolution using fixed-point integer arithmetic for performance
43894394
static bool resize_bicubic_pillow(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
4395+
// Fixed-point precision: 22 bits = 32 (int32_t) - 8 (uint8_t pixels) - 2 (headroom for accumulation)
4396+
// This allows encoding fractional weights as integers: weight * 2^22
43904397
const int PRECISION_BITS = 32 - 8 - 2;
43914398

4392-
// Bicubic filter function
4399+
// Bicubic filter function with a = -0.5 (Note that GGML/PyTorch takes a = -0.75)
4400+
// Returns filter weight for distance x from pixel center
4401+
// Support: [-2, 2], meaning the filter influences pixels within 2 units of distance
43934402
auto bicubic_filter = [](double x) -> double {
43944403
constexpr double a = -0.5;
43954404
if (x < 0.0) {
@@ -4401,9 +4410,10 @@ struct img_tool {
44014410
if (x < 2.0) {
44024411
return (((x - 5) * x + 8) * x - 4) * a;
44034412
}
4404-
return 0.0;
4413+
return 0.0; // Zero outside [-2, 2]
44054414
};
44064415

4416+
// Filter support radius: bicubic extends 2 pixels in each direction
44074417
constexpr double filter_support = 2.0;
44084418

44094419
// Clipping function for 8-bit values
@@ -4413,29 +4423,47 @@ struct img_tool {
44134423
return static_cast<uint8_t>(val);
44144424
};
44154425

4416-
// Precompute coefficients
4417-
auto precompute_coeffs = [&](int inSize, double in0, double in1, int outSize,
4418-
std::vector<int> & bounds, std::vector<int32_t> & kk) -> int {
4426+
// Precompute filter coefficients for ONE dimension (horizontal or vertical)
4427+
//
4428+
// Parameters:
4429+
// inSize - Number of pixels in input dimension (e.g., src_width or src_height)
4430+
// outSize - Number of pixels in output dimension (e.g., target_width or target_height)
4431+
// bounds - [OUTPUT] Array of size outSize*2 storing input pixel ranges:
4432+
// bounds[xx*2+0] = first input pixel index for output pixel xx (xmin)
4433+
// bounds[xx*2+1] = number of input pixels for output pixel xx (xcnt)
4434+
// weights - [OUTPUT] Array of size outSize*ksize storing fixed-point filter weights:
4435+
// weights[xx*ksize + x] = weight for input pixel (xmin + x) contributing to output pixel xx
4436+
//
4437+
// Returns: kernel size (ksize) - number of weight slots reserved per output pixel (row stride of weights); only the first xcnt slots are non-padding, the rest are zero
4438+
auto precompute_weights = [&](int inSize, int outSize,
4439+
std::vector<int> & bounds, std::vector<int32_t> & weights) -> int {
44194440
double support, scale, filterscale;
44204441
double center, ww, ss;
4421-
int xx, x, ksize, xmin, xmax;
4442+
int xx, x, ksize, xmin, xmax, xcnt;
44224443

4423-
filterscale = scale = (in1 - in0) / outSize;
4444+
// Calculate scaling factor: ratio of input range to output size
4445+
filterscale = scale = (double)inSize / outSize;
4446+
// For upsampling (scale < 1), keep filterscale = 1 to maintain filter sharpness
4447+
// For downsampling (scale > 1), widen filter to prevent aliasing
44244448
if (filterscale < 1.0) {
44254449
filterscale = 1.0;
44264450
}
44274451

4428-
support = filter_support * filterscale;
4429-
ksize = static_cast<int>(std::ceil(support)) * 2 + 1;
4452+
// Determine filter support radius and kernel size
4453+
support = filter_support * filterscale; // Widen filter when downsampling
4454+
ksize = static_cast<int>(std::ceil(support)) * 2 + 1; // Total pixels in kernel
44304455

4431-
std::vector<double> prekk(outSize * ksize);
4456+
std::vector<double> pre_weights(outSize * ksize); // Temporary weights
44324457
bounds.resize(outSize * 2);
44334458

4459+
// For each output pixel, compute its filter coefficients
44344460
for (xx = 0; xx < outSize; xx++) {
4435-
center = in0 + (xx + 0.5) * scale;
4436-
ww = 0.0;
4437-
ss = 1.0 / filterscale;
4461+
// Calculate the center position in input space (pixel-center convention: +0.5)
4462+
center = (xx + 0.5) * scale;
4463+
ww = 0.0; // Sum of weights for normalization
4464+
ss = 1.0 / filterscale; // Scale factor for filter function
44384465

4466+
// Determine the range of input pixels that contribute to this output pixel
44394467
xmin = static_cast<int>(center - support + 0.5);
44404468
if (xmin < 0) {
44414469
xmin = 0;
@@ -4445,65 +4473,77 @@ struct img_tool {
44454473
if (xmax > inSize) {
44464474
xmax = inSize;
44474475
}
4448-
xmax -= xmin;
4476+
4477+
xcnt = xmax - xmin;
44494478

4450-
double * k = &prekk[xx * ksize];
4451-
for (x = 0; x < xmax; x++) {
4479+
// Compute filter weights for each contributing input pixel
4480+
for (x = 0; x < xcnt; x++) {
4481+
// Signed offset from the output pixel center to this input pixel's center (in input space), scaled by ss into filter units
44524482
double w = bicubic_filter((x + xmin - center + 0.5) * ss);
4453-
k[x] = w;
4454-
ww += w;
4483+
pre_weights[xx * ksize + x] = w;
4484+
ww += w; // Accumulate for normalization
44554485
}
44564486

4457-
for (x = 0; x < xmax; x++) {
4487+
// Normalize weights to sum to 1.0 (preserves brightness)
4488+
for (x = 0; x < xcnt; x++) {
44584489
if (ww != 0.0) {
4459-
k[x] /= ww;
4490+
pre_weights[xx * ksize + x] /= ww;
44604491
}
44614492
}
44624493

4494+
// Zero-pad remaining kernel positions
44634495
for (; x < ksize; x++) {
4464-
k[x] = 0;
4496+
pre_weights[xx * ksize + x] = 0;
44654497
}
44664498

4499+
// Store input pixel range for this output pixel
44674500
bounds[xx * 2 + 0] = xmin;
4468-
bounds[xx * 2 + 1] = xmax;
4501+
bounds[xx * 2 + 1] = xcnt;
44694502
}
44704503

4471-
// Normalize coefficients to fixed-point
4472-
kk.resize(outSize * ksize);
4504+
// Convert floating-point coefficients to fixed-point integers
4505+
// Formula: int32 = round(float * 2^PRECISION_BITS)
4506+
weights.resize(outSize * ksize);
44734507
for (int i = 0; i < outSize * ksize; i++) {
4474-
if (prekk[i] < 0) {
4475-
kk[i] = static_cast<int32_t>(-0.5 + prekk[i] * (1 << PRECISION_BITS));
4508+
if (pre_weights[i] < 0) {
4509+
weights[i] = static_cast<int32_t>(-0.5 + pre_weights[i] * (1 << PRECISION_BITS));
44764510
} else {
4477-
kk[i] = static_cast<int32_t>(0.5 + prekk[i] * (1 << PRECISION_BITS));
4511+
weights[i] = static_cast<int32_t>(0.5 + pre_weights[i] * (1 << PRECISION_BITS));
44784512
}
44794513
}
44804514

44814515
return ksize;
44824516
};
44834517

4484-
// Horizontal resampling
4518+
// Horizontal resampling pass
4519+
// Resizes width from imIn.nx to imOut.nx, preserving height
44854520
auto resample_horizontal = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut,
4486-
int ksize, const std::vector<int> & bounds, const std::vector<int32_t> & kk) {
4521+
int ksize, const std::vector<int> & bounds, const std::vector<int32_t> & weights) {
44874522
imOut.ny = imIn.ny;
44884523
imOut.buf.resize(3 * imOut.nx * imOut.ny);
44894524

4525+
// Process each row independently
44904526
for (int yy = 0; yy < imOut.ny; yy++) {
4527+
// For each output pixel in this row
44914528
for (int xx = 0; xx < imOut.nx; xx++) {
4492-
int xmin = bounds[xx * 2 + 0];
4493-
int xmax = bounds[xx * 2 + 1];
4494-
const int32_t * k = &kk[xx * ksize];
4529+
// Get the range of input pixels and filter coefficients
4530+
int xmin = bounds[xx * 2 + 0]; // First input pixel index
4531+
int xcnt = bounds[xx * 2 + 1]; // Number of input pixels
44954532

4533+
// Initialize accumulators for RGB channels with rounding bias (0.5 in fixed-point)
44964534
int32_t ss0 = 1 << (PRECISION_BITS - 1);
44974535
int32_t ss1 = 1 << (PRECISION_BITS - 1);
44984536
int32_t ss2 = 1 << (PRECISION_BITS - 1);
44994537

4500-
for (int x = 0; x < xmax; x++) {
4538+
// Convolve: sum weighted input pixels
4539+
for (int x = 0; x < xcnt; x++) {
45014540
int src_idx = ((yy * imIn.nx) + (x + xmin)) * 3;
4502-
ss0 += static_cast<uint8_t>(imIn.buf[src_idx + 0]) * k[x];
4503-
ss1 += static_cast<uint8_t>(imIn.buf[src_idx + 1]) * k[x];
4504-
ss2 += static_cast<uint8_t>(imIn.buf[src_idx + 2]) * k[x];
4541+
ss0 += static_cast<uint8_t>(imIn.buf[src_idx + 0]) * weights[xx * ksize + x]; // R channel
4542+
ss1 += static_cast<uint8_t>(imIn.buf[src_idx + 1]) * weights[xx * ksize + x]; // G channel
4543+
ss2 += static_cast<uint8_t>(imIn.buf[src_idx + 2]) * weights[xx * ksize + x]; // B channel
45054544
}
45064545

4546+
// Convert back from fixed-point (divide by 2^PRECISION_BITS) and clamp to [0,255]
45074547
int dst_idx = (yy * imOut.nx + xx) * 3;
45084548
imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS);
45094549
imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS);
@@ -4512,29 +4552,35 @@ struct img_tool {
45124552
}
45134553
};
45144554

4515-
// Vertical resampling
4555+
// Vertical resampling pass
4556+
// Resizes height from imIn.ny to imOut.ny, preserving width
45164557
auto resample_vertical = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut,
4517-
int ksize, const std::vector<int> & bounds, const std::vector<int32_t> & kk) {
4558+
int ksize, const std::vector<int> & bounds, const std::vector<int32_t> & weight) {
45184559
imOut.nx = imIn.nx;
45194560
imOut.buf.resize(3 * imOut.nx * imOut.ny);
45204561

4562+
// For each output row
45214563
for (int yy = 0; yy < imOut.ny; yy++) {
4522-
int ymin = bounds[yy * 2 + 0];
4523-
int ymax = bounds[yy * 2 + 1];
4524-
const int32_t * k = &kk[yy * ksize];
4564+
// Get the range of input rows and filter coefficients
4565+
int ymin = bounds[yy * 2 + 0]; // First input row index
4566+
int ycnt = bounds[yy * 2 + 1]; // Number of input rows
45254567

4568+
// Process each column in this output row
45264569
for (int xx = 0; xx < imOut.nx; xx++) {
4570+
// Initialize accumulators for RGB channels with rounding bias
45274571
int32_t ss0 = 1 << (PRECISION_BITS - 1);
45284572
int32_t ss1 = 1 << (PRECISION_BITS - 1);
45294573
int32_t ss2 = 1 << (PRECISION_BITS - 1);
45304574

4531-
for (int y = 0; y < ymax; y++) {
4575+
// Convolve: sum weighted input pixels vertically
4576+
for (int y = 0; y < ycnt; y++) {
45324577
int src_idx = ((y + ymin) * imIn.nx + xx) * 3;
4533-
ss0 += static_cast<uint8_t>(imIn.buf[src_idx + 0]) * k[y];
4534-
ss1 += static_cast<uint8_t>(imIn.buf[src_idx + 1]) * k[y];
4535-
ss2 += static_cast<uint8_t>(imIn.buf[src_idx + 2]) * k[y];
4578+
ss0 += static_cast<uint8_t>(imIn.buf[src_idx + 0]) * weight[yy * ksize + y]; // R channel
4579+
ss1 += static_cast<uint8_t>(imIn.buf[src_idx + 1]) * weight[yy * ksize + y]; // G channel
4580+
ss2 += static_cast<uint8_t>(imIn.buf[src_idx + 2]) * weight[yy * ksize + y]; // B channel
45364581
}
45374582

4583+
// Convert back from fixed-point and clamp to [0,255]
45384584
int dst_idx = (yy * imOut.nx + xx) * 3;
45394585
imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS);
45404586
imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS);
@@ -4543,7 +4589,7 @@ struct img_tool {
45434589
}
45444590
};
45454591

4546-
// Main resampling logic
4592+
// Main resampling logic using separable two-pass approach
45474593
const int src_width = img.nx;
45484594
const int src_height = img.ny;
45494595

@@ -4553,34 +4599,34 @@ struct img_tool {
45534599
bool need_horizontal = (target_width != src_width);
45544600
bool need_vertical = (target_height != src_height);
45554601

4556-
// Precompute coefficients for both passes
4602+
// Precompute filter coefficients for both dimensions
45574603
std::vector<int> bounds_horiz, bounds_vert;
4558-
std::vector<int32_t> kk_horiz, kk_vert;
4604+
std::vector<int32_t> weights_horiz, weights_vert;
45594605
int ksize_horiz = 0, ksize_vert = 0;
45604606

45614607
if (need_horizontal) {
4562-
ksize_horiz = precompute_coeffs(src_width, 0.0, src_width, target_width, bounds_horiz, kk_horiz);
4608+
ksize_horiz = precompute_weights(src_width, target_width, bounds_horiz, weights_horiz);
45634609
}
45644610

45654611
if (need_vertical) {
4566-
ksize_vert = precompute_coeffs(src_height, 0.0, src_height, target_height, bounds_vert, kk_vert);
4612+
ksize_vert = precompute_weights(src_height, target_height, bounds_vert, weights_vert);
45674613
}
45684614

45694615
// Perform two-pass resampling
45704616
if (need_horizontal && need_vertical) {
45714617
// Both horizontal and vertical
45724618
clip_image_u8 temp;
45734619
temp.nx = target_width;
4574-
resample_horizontal(img, temp, ksize_horiz, bounds_horiz, kk_horiz);
4575-
resample_vertical(temp, dst, ksize_vert, bounds_vert, kk_vert);
4620+
resample_horizontal(img, temp, ksize_horiz, bounds_horiz, weights_horiz);
4621+
resample_vertical(temp, dst, ksize_vert, bounds_vert, weights_vert);
45764622
} else if (need_horizontal) {
45774623
// Only horizontal
4578-
resample_horizontal(img, dst, ksize_horiz, bounds_horiz, kk_horiz);
4624+
resample_horizontal(img, dst, ksize_horiz, bounds_horiz, weights_horiz);
45794625
} else if (need_vertical) {
45804626
// Only vertical
4581-
resample_vertical(img, dst, ksize_vert, bounds_vert, kk_vert);
4627+
resample_vertical(img, dst, ksize_vert, bounds_vert, weights_vert);
45824628
} else {
4583-
// No resampling needed
4629+
// No resizing needed - direct copy
45844630
dst.buf = img.buf;
45854631
}
45864632

0 commit comments

Comments
 (0)