@@ -4386,10 +4386,19 @@ struct img_tool {
43864386
43874387 // Bicubic resize function using Pillow's ImagingResample algorithm
43884388 // Adapted from https://github.com/python-pillow/Pillow/blob/main/src/libImaging/Resample.c
4389+ //
4390+ // Key differences from resize_bicubic:
4391+ // 1. Uses separable filtering: horizontal pass followed by vertical pass
4392+ // 2. Pre-computes normalized filter coefficients for each output pixel
4393+ // 3. Applies convolution using fixed-point integer arithmetic for performance
43894394 static bool resize_bicubic_pillow (const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
4395+ // Fixed-point precision: 22 bits = 32 (int32_t) - 8 (uint8_t pixels) - 2 (headroom for accumulation)
4396+ // This allows encoding fractional weights as integers: weight * 2^22
43904397 const int PRECISION_BITS = 32 - 8 - 2 ;
43914398
4392- // Bicubic filter function
4399+ // Bicubic filter function with a = -0.5 (Note that GGML/PyTorch takes a = -0.75)
4400+ // Returns filter weight for distance x from pixel center
4401+ // Support: [-2, 2], meaning the filter influences pixels within 2 units of distance
43934402 auto bicubic_filter = [](double x) -> double {
43944403 constexpr double a = -0.5 ;
43954404 if (x < 0.0 ) {
@@ -4401,9 +4410,10 @@ struct img_tool {
44014410 if (x < 2.0 ) {
44024411 return (((x - 5 ) * x + 8 ) * x - 4 ) * a;
44034412 }
4404- return 0.0 ;
4413+ return 0.0 ; // Zero outside [-2, 2]
44054414 };
44064415
4416+ // Filter support radius: bicubic extends 2 pixels in each direction
44074417 constexpr double filter_support = 2.0 ;
44084418
44094419 // Clipping function for 8-bit values
@@ -4413,29 +4423,47 @@ struct img_tool {
44134423 return static_cast <uint8_t >(val);
44144424 };
44154425
4416- // Precompute coefficients
4417- auto precompute_coeffs = [&](int inSize, double in0, double in1, int outSize,
4418- std::vector<int > & bounds, std::vector<int32_t > & kk) -> int {
4426+ // Precompute filter coefficients for ONE dimension (horizontal or vertical)
4427+ //
4428+ // Parameters:
4429+ // inSize - Number of pixels in input dimension (e.g., src_width or src_height)
4430+ // outSize - Number of pixels in output dimension (e.g., target_width or target_height)
4431+ // bounds - [OUTPUT] Array of size outSize*2 storing input pixel ranges:
4432+ // bounds[xx*2+0] = first input pixel index for output pixel xx (xmin)
4433+ // bounds[xx*2+1] = number of input pixels for output pixel xx (xcnt)
4434+ // weights - [OUTPUT] Array of size outSize*ksize storing fixed-point filter weights:
4435+ // weights[xx*ksize + x] = weight for input pixel x contributing to output pixel xx
4436+ //
4437+ // Returns: kernel size (ksize) - number of input pixels that contribute to each output pixel
4438+ auto precompute_weights = [&](int inSize, int outSize,
4439+ std::vector<int > & bounds, std::vector<int32_t > & weights) -> int {
44194440 double support, scale, filterscale;
44204441 double center, ww, ss;
4421- int xx, x, ksize, xmin, xmax;
4442+ int xx, x, ksize, xmin, xmax, xcnt ;
44224443
4423- filterscale = scale = (in1 - in0) / outSize;
4444+ // Calculate scaling factor: ratio of input size to output size
4445+ filterscale = scale = (double )inSize / outSize;
4446+ // For upsampling (scale < 1), keep filterscale = 1 to maintain filter sharpness
4447+ // For downsampling (scale > 1), widen filter to prevent aliasing
44244448 if (filterscale < 1.0 ) {
44254449 filterscale = 1.0 ;
44264450 }
44274451
4428- support = filter_support * filterscale;
4429- ksize = static_cast <int >(std::ceil (support)) * 2 + 1 ;
4452+ // Determine filter support radius and kernel size
4453+ support = filter_support * filterscale; // Widen filter when downsampling
4454+ ksize = static_cast <int >(std::ceil (support)) * 2 + 1 ; // Total pixels in kernel
44304455
4431- std::vector<double > prekk (outSize * ksize);
4456+ std::vector<double > pre_weights (outSize * ksize); // Temporary weights
44324457 bounds.resize (outSize * 2 );
44334458
4459+ // For each output pixel, compute its filter coefficients
44344460 for (xx = 0 ; xx < outSize; xx++) {
4435- center = in0 + (xx + 0.5 ) * scale;
4436- ww = 0.0 ;
4437- ss = 1.0 / filterscale;
4461+ // Calculate the center position in input space (pixel-center convention: +0.5)
4462+ center = (xx + 0.5 ) * scale;
4463+ ww = 0.0 ; // Sum of weights for normalization
4464+ ss = 1.0 / filterscale; // Scale factor for filter function
44384465
4466+ // Determine the range of input pixels that contribute to this output pixel
44394467 xmin = static_cast <int >(center - support + 0.5 );
44404468 if (xmin < 0 ) {
44414469 xmin = 0 ;
@@ -4445,65 +4473,77 @@ struct img_tool {
44454473 if (xmax > inSize) {
44464474 xmax = inSize;
44474475 }
4448- xmax -= xmin;
4476+
4477+ xcnt = xmax - xmin;
44494478
4450- double * k = &prekk[xx * ksize];
4451- for (x = 0 ; x < xmax; x++) {
4479+ // Compute filter weights for each contributing input pixel
4480+ for (x = 0 ; x < xcnt; x++) {
4481+ // Distance from input pixel center to output pixel center in input space
44524482 double w = bicubic_filter ((x + xmin - center + 0.5 ) * ss);
4453- k[ x] = w;
4454- ww += w;
4483+ pre_weights[xx * ksize + x] = w;
4484+ ww += w; // Accumulate for normalization
44554485 }
44564486
4457- for (x = 0 ; x < xmax; x++) {
4487+ // Normalize weights to sum to 1.0 (preserves brightness)
4488+ for (x = 0 ; x < xcnt; x++) {
44584489 if (ww != 0.0 ) {
4459- k[ x] /= ww;
4490+ pre_weights[xx * ksize + x] /= ww;
44604491 }
44614492 }
44624493
4494+ // Zero-pad remaining kernel positions
44634495 for (; x < ksize; x++) {
4464- k[ x] = 0 ;
4496+ pre_weights[xx * ksize + x] = 0 ;
44654497 }
44664498
4499+ // Store input pixel range for this output pixel
44674500 bounds[xx * 2 + 0 ] = xmin;
4468- bounds[xx * 2 + 1 ] = xmax ;
4501+ bounds[xx * 2 + 1 ] = xcnt ;
44694502 }
44704503
4471- // Normalize coefficients to fixed-point
4472- kk.resize (outSize * ksize);
4504+ // Convert floating-point coefficients to fixed-point integers
4505+ // Formula: int32 = round(float * 2^PRECISION_BITS)
4506+ weights.resize (outSize * ksize);
44734507 for (int i = 0 ; i < outSize * ksize; i++) {
4474- if (prekk [i] < 0 ) {
4475- kk [i] = static_cast <int32_t >(-0.5 + prekk [i] * (1 << PRECISION_BITS));
4508+ if (pre_weights [i] < 0 ) {
4509+ weights [i] = static_cast <int32_t >(-0.5 + pre_weights [i] * (1 << PRECISION_BITS));
44764510 } else {
4477- kk [i] = static_cast <int32_t >(0.5 + prekk [i] * (1 << PRECISION_BITS));
4511+ weights [i] = static_cast <int32_t >(0.5 + pre_weights [i] * (1 << PRECISION_BITS));
44784512 }
44794513 }
44804514
44814515 return ksize;
44824516 };
44834517
4484- // Horizontal resampling
4518+ // Horizontal resampling pass
4519+ // Resizes width from imIn.nx to imOut.nx, preserving height
44854520 auto resample_horizontal = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut,
4486- int ksize, const std::vector<int > & bounds, const std::vector<int32_t > & kk ) {
4521+ int ksize, const std::vector<int > & bounds, const std::vector<int32_t > & weights ) {
44874522 imOut.ny = imIn.ny ;
44884523 imOut.buf .resize (3 * imOut.nx * imOut.ny );
44894524
4525+ // Process each row independently
44904526 for (int yy = 0 ; yy < imOut.ny ; yy++) {
4527+ // For each output pixel in this row
44914528 for (int xx = 0 ; xx < imOut.nx ; xx++) {
4492- int xmin = bounds[xx * 2 + 0 ];
4493- int xmax = bounds[xx * 2 + 1 ];
4494- const int32_t * k = &kk [xx * ksize];
4529+ // Get the range of input pixels and filter coefficients
4530+ int xmin = bounds[xx * 2 + 0 ]; // First input pixel index
4531+ int xcnt = bounds [xx * 2 + 1 ]; // Number of input pixels
44954532
4533+ // Initialize accumulators for RGB channels with rounding bias (0.5 in fixed-point)
44964534 int32_t ss0 = 1 << (PRECISION_BITS - 1 );
44974535 int32_t ss1 = 1 << (PRECISION_BITS - 1 );
44984536 int32_t ss2 = 1 << (PRECISION_BITS - 1 );
44994537
4500- for (int x = 0 ; x < xmax; x++) {
4538+ // Convolve: sum weighted input pixels
4539+ for (int x = 0 ; x < xcnt; x++) {
45014540 int src_idx = ((yy * imIn.nx ) + (x + xmin)) * 3 ;
4502- ss0 += static_cast <uint8_t >(imIn.buf [src_idx + 0 ]) * k[ x];
4503- ss1 += static_cast <uint8_t >(imIn.buf [src_idx + 1 ]) * k[ x];
4504- ss2 += static_cast <uint8_t >(imIn.buf [src_idx + 2 ]) * k[ x];
4541+ ss0 += static_cast <uint8_t >(imIn.buf [src_idx + 0 ]) * weights[xx * ksize + x]; // R channel
4542+ ss1 += static_cast <uint8_t >(imIn.buf [src_idx + 1 ]) * weights[xx * ksize + x]; // G channel
4543+ ss2 += static_cast <uint8_t >(imIn.buf [src_idx + 2 ]) * weights[xx * ksize + x]; // B channel
45054544 }
45064545
4546+ // Convert back from fixed-point (divide by 2^PRECISION_BITS) and clamp to [0,255]
45074547 int dst_idx = (yy * imOut.nx + xx) * 3 ;
45084548 imOut.buf [dst_idx + 0 ] = clip8 (ss0 >> PRECISION_BITS);
45094549 imOut.buf [dst_idx + 1 ] = clip8 (ss1 >> PRECISION_BITS);
@@ -4512,29 +4552,35 @@ struct img_tool {
45124552 }
45134553 };
45144554
4515- // Vertical resampling
4555+ // Vertical resampling pass
4556+ // Resizes height from imIn.ny to imOut.ny, preserving width
45164557 auto resample_vertical = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut,
4517- int ksize, const std::vector<int > & bounds, const std::vector<int32_t > & kk ) {
4558+ int ksize, const std::vector<int > & bounds, const std::vector<int32_t > & weight ) {
45184559 imOut.nx = imIn.nx ;
45194560 imOut.buf .resize (3 * imOut.nx * imOut.ny );
45204561
4562+ // For each output row
45214563 for (int yy = 0 ; yy < imOut.ny ; yy++) {
4522- int ymin = bounds[yy * 2 + 0 ];
4523- int ymax = bounds[yy * 2 + 1 ];
4524- const int32_t * k = &kk [yy * ksize];
4564+ // Get the range of input rows and filter coefficients
4565+ int ymin = bounds[yy * 2 + 0 ]; // First input row index
4566+ int ycnt = bounds [yy * 2 + 1 ]; // Number of input rows
45254567
4568+ // Process each column in this output row
45264569 for (int xx = 0 ; xx < imOut.nx ; xx++) {
4570+ // Initialize accumulators for RGB channels with rounding bias
45274571 int32_t ss0 = 1 << (PRECISION_BITS - 1 );
45284572 int32_t ss1 = 1 << (PRECISION_BITS - 1 );
45294573 int32_t ss2 = 1 << (PRECISION_BITS - 1 );
45304574
4531- for (int y = 0 ; y < ymax; y++) {
4575+ // Convolve: sum weighted input pixels vertically
4576+ for (int y = 0 ; y < ycnt; y++) {
45324577 int src_idx = ((y + ymin) * imIn.nx + xx) * 3 ;
4533- ss0 += static_cast <uint8_t >(imIn.buf [src_idx + 0 ]) * k[ y];
4534- ss1 += static_cast <uint8_t >(imIn.buf [src_idx + 1 ]) * k[ y];
4535- ss2 += static_cast <uint8_t >(imIn.buf [src_idx + 2 ]) * k[ y];
4578+ ss0 += static_cast <uint8_t >(imIn.buf [src_idx + 0 ]) * weight[yy * ksize + y]; // R channel
4579+ ss1 += static_cast <uint8_t >(imIn.buf [src_idx + 1 ]) * weight[yy * ksize + y]; // G channel
4580+ ss2 += static_cast <uint8_t >(imIn.buf [src_idx + 2 ]) * weight[yy * ksize + y]; // B channel
45364581 }
45374582
4583+ // Convert back from fixed-point and clamp to [0,255]
45384584 int dst_idx = (yy * imOut.nx + xx) * 3 ;
45394585 imOut.buf [dst_idx + 0 ] = clip8 (ss0 >> PRECISION_BITS);
45404586 imOut.buf [dst_idx + 1 ] = clip8 (ss1 >> PRECISION_BITS);
@@ -4543,7 +4589,7 @@ struct img_tool {
45434589 }
45444590 };
45454591
4546- // Main resampling logic
4592+ // Main resampling logic using separable two-pass approach
45474593 const int src_width = img.nx ;
45484594 const int src_height = img.ny ;
45494595
@@ -4553,34 +4599,34 @@ struct img_tool {
45534599 bool need_horizontal = (target_width != src_width);
45544600 bool need_vertical = (target_height != src_height);
45554601
4556- // Precompute coefficients for both passes
4602+ // Precompute filter coefficients for both dimensions
45574603 std::vector<int > bounds_horiz, bounds_vert;
4558- std::vector<int32_t > kk_horiz, kk_vert ;
4604+ std::vector<int32_t > weights_horiz, weights_vert ;
45594605 int ksize_horiz = 0 , ksize_vert = 0 ;
45604606
45614607 if (need_horizontal) {
4562- ksize_horiz = precompute_coeffs (src_width, 0.0 , src_width, target_width, bounds_horiz, kk_horiz );
4608+ ksize_horiz = precompute_weights (src_width, target_width, bounds_horiz, weights_horiz );
45634609 }
45644610
45654611 if (need_vertical) {
4566- ksize_vert = precompute_coeffs (src_height, 0.0 , src_height, target_height, bounds_vert, kk_vert );
4612+ ksize_vert = precompute_weights (src_height, target_height, bounds_vert, weights_vert );
45674613 }
45684614
45694615 // Perform two-pass resampling
45704616 if (need_horizontal && need_vertical) {
45714617 // Both horizontal and vertical
45724618 clip_image_u8 temp;
45734619 temp.nx = target_width;
4574- resample_horizontal (img, temp, ksize_horiz, bounds_horiz, kk_horiz );
4575- resample_vertical (temp, dst, ksize_vert, bounds_vert, kk_vert );
4620+ resample_horizontal (img, temp, ksize_horiz, bounds_horiz, weights_horiz );
4621+ resample_vertical (temp, dst, ksize_vert, bounds_vert, weights_vert );
45764622 } else if (need_horizontal) {
45774623 // Only horizontal
4578- resample_horizontal (img, dst, ksize_horiz, bounds_horiz, kk_horiz );
4624+ resample_horizontal (img, dst, ksize_horiz, bounds_horiz, weights_horiz );
45794625 } else if (need_vertical) {
45804626 // Only vertical
4581- resample_vertical (img, dst, ksize_vert, bounds_vert, kk_vert );
4627+ resample_vertical (img, dst, ksize_vert, bounds_vert, weights_vert );
45824628 } else {
4583- // No resampling needed
4629+ // No resizing needed - direct copy
45844630 dst.buf = img.buf ;
45854631 }
45864632
0 commit comments