@@ -118,18 +118,18 @@ class CBlitImageFilter :
118118 public:
// Constructor taking the convolution kernels by rvalue reference; they are moved into `kernels`.
// The body resets the four packed offset/extent members to a default-constructed vector.
// This hunk only swaps the vector type used for that reset: core::vectorSIMDu32 -> hlsl::uint32_t4.
// NOTE(review): presumably `hlsl::uint32_t4()` yields all zeros like the old type did - confirm.
119119 CState (blit_utils_t ::convolution_kernels_t && _kernels) : kernels(std::move(_kernels))
120120 {
121- inOffsetBaseLayer = core::vectorSIMDu32 ();
122- inExtentLayerCount = core::vectorSIMDu32 ();
123- outOffsetBaseLayer = core::vectorSIMDu32 ();
124- outExtentLayerCount = core::vectorSIMDu32 ();
121+ inOffsetBaseLayer = hlsl::uint32_t4 ();
122+ inExtentLayerCount = hlsl::uint32_t4 ();
123+ outOffsetBaseLayer = hlsl::uint32_t4 ();
124+ outExtentLayerCount = hlsl::uint32_t4 ();
125125 }
126126
// Constructor taking the convolution kernels by const reference; they are copied into `kernels`.
// Performs the same reset of the four packed offset/extent members as the rvalue overload,
// and receives the same core::vectorSIMDu32 -> hlsl::uint32_t4 type swap in this hunk.
127127 CState (const typename blit_utils_t ::convolution_kernels_t & _kernels) : kernels(_kernels)
128128 {
129- inOffsetBaseLayer = core::vectorSIMDu32 ();
130- inExtentLayerCount = core::vectorSIMDu32 ();
131- outOffsetBaseLayer = core::vectorSIMDu32 ();
132- outExtentLayerCount = core::vectorSIMDu32 ();
129+ inOffsetBaseLayer = hlsl::uint32_t4 ();
130+ inExtentLayerCount = hlsl::uint32_t4 ();
131+ outOffsetBaseLayer = hlsl::uint32_t4 ();
132+ outExtentLayerCount = hlsl::uint32_t4 ();
133133 }
134134
135135 CState (const CState& other) : IImageFilter::IState(), base_t ::CStateBase{other},
@@ -149,23 +149,23 @@ class CBlitImageFilter :
149149 return false ;
150150 const size_t offset = getScratchOffset (this ,ESU_SCALED_KERNEL_PHASED_LUT);
151151 const auto inType = inImage->getCreationParameters ().type ;
152- const size_t size = blit_utils_t::getScaledKernelPhasedLUTSize (inExtentLayerCount,outExtentLayerCount,inType,kernels);
152+ const size_t size = blit_utils_t::getScaledKernelPhasedLUTSize (inExtentLayerCount. xyz ,outExtentLayerCount. xyz ,inType,kernels);
153153 auto * lut = base_t ::CStateBase::scratchMemory+offset;
154- return blit_utils_t::computeScaledKernelPhasedLUT (lut,inExtentLayerCount,outExtentLayerCount,inType, kernels);
154+ return blit_utils_t::computeScaledKernelPhasedLUT (lut,inExtentLayerCount. xyz ,outExtentLayerCount. xyz ,inType, kernels);
155155 }
156156
// Anonymous union giving two views of the same storage: a single 4-component unsigned vector
// (`inOffsetBaseLayer`) or the pair { VkOffset3D inOffset; uint32_t inBaseLayer; }.
// NOTE(review): the aliasing relies on hlsl::uint32_t4 being laid out as four contiguous 32-bit
// values matching VkOffset3D followed by a uint32_t - confirm for the new type.
157157 union
158158 {
159- core::vectorSIMDu32 inOffsetBaseLayer;
159+ hlsl::uint32_t4 inOffsetBaseLayer;
160160 struct
161161 {
// NOTE(review): the -/+ pairs below look like whitespace-only changes.
162- VkOffset3D inOffset;
163- uint32_t inBaseLayer;
162+ VkOffset3D inOffset;
163+ uint32_t inBaseLayer;
164164 };
165165 };
166166 union
167167 {
168- core::vectorSIMDu32 inExtentLayerCount;
168+ hlsl::uint32_t4 inExtentLayerCount;
169169 struct
170170 {
171171 VkExtent3D inExtent;
@@ -174,7 +174,7 @@ class CBlitImageFilter :
174174 };
175175 union
176176 {
177- core::vectorSIMDu32 outOffsetBaseLayer;
177+ hlsl::uint32_t4 outOffsetBaseLayer;
178178 struct
179179 {
180180 VkOffset3D outOffset;
@@ -183,7 +183,7 @@ class CBlitImageFilter :
183183 };
184184 union
185185 {
186- core::vectorSIMDu32 outExtentLayerCount;
186+ hlsl::uint32_t4 outExtentLayerCount;
187187 struct
188188 {
189189 VkExtent3D outExtent;
@@ -208,8 +208,7 @@ class CBlitImageFilter :
208208 const auto windowSize = blit_utils_t::getWindowSize (inType, state->kernels );
209209 const size_t scaledKernelPhasedLUTSize = blit_utils_t::getScaledKernelPhasedLUTSize (state->inExtentLayerCount , state->outExtentLayerCount , inType, windowSize);
210210
211- core::vectorSIMDi32 intermediateExtent[3 ];
212- getIntermediateExtents (intermediateExtent, state, windowSize);
211+ const auto intermediateExtent = getIntermediateExtents (state,windowSize);
213212 assert (intermediateExtent[0 ].x == intermediateExtent[2 ].x );
214213
215214 uint32_t pingBufferElementCount = (state->inExtent .width + windowSize[0 ]) * m_maxParallelism; // decode
@@ -349,13 +348,7 @@ class CBlitImageFilter :
349348 // filtering and alpha handling happens separately for every layer, so save on scratch memory size
350349 const auto inImageType = inParams.type ;
351350 const auto real_window_size = blit_utils_t::getWindowSize (inImageType,state->kernels );
352- core::vectorSIMDi32 intermediateExtent[3 ];
353- getIntermediateExtents (intermediateExtent, state, real_window_size);
354- const core::vectorSIMDi32 intermediateLastCoord[3 ] = {
355- intermediateExtent[0 ]-core::vectorSIMDi32 (1 ,1 ,1 ,0 ),
356- intermediateExtent[1 ]-core::vectorSIMDi32 (1 ,1 ,1 ,0 ),
357- intermediateExtent[2 ]-core::vectorSIMDi32 (1 ,1 ,1 ,0 )
358- };
351+ const hlsl::int32_t3x3 intermediateExtent = getIntermediateExtents (state,real_window_size);
359352 value_t * const intermediateStorage[3 ] = {
360353 reinterpret_cast <value_t *>(state->scratchMemory + getScratchOffset (state, ESU_BLIT_X_AXIS_WRITE)),
361354 reinterpret_cast <value_t *>(state->scratchMemory + getScratchOffset (state, ESU_BLIT_Y_AXIS_WRITE)),
@@ -381,7 +374,7 @@ class CBlitImageFilter :
381374 };
382375 const std::span<const IImage::SBufferCopy> outRegions = outImg->getRegions (outMipLevel);
383376 auto storeToImage = [policy,coverageSemantic,needsNormalization,outExtent,intermediateStorage,&sampler,outFormat,alphaRefValue,outData,intermediateStrides,alphaChannel,storeToTexel,outMipLevel,outOffset,outRegions,outImg,state](
384- const core::rational<int64_t >& coverage, const int axis, const core::vectorSIMDu32 & outOffsetLayer
377+ const core::rational<int64_t >& coverage, const int axis, const hlsl::uint32_t4 & outOffsetLayer
385378 ) -> void
386379 {
387380 assert (needsNormalization);
@@ -432,15 +425,16 @@ class CBlitImageFilter :
// Per-texel callback: reads the ChannelCount-wide sample for this texel out of
// intermediateStorage[axis], multiplies its alpha channel by coverageScale and hands the
// result to storeToTexel together with the destination pointer.
//   writeBlockArrayOffset - offset added to outData to obtain the destination pointer
//   writeBlockPos         - texel position, taken by value
432425 auto scaleCoverage = [outData,outOffsetLayer,intermediateStrides,axis,intermediateStorage,alphaChannel,coverageScale,storeToTexel](uint32_t writeBlockArrayOffset, core::vectorSIMDu32 writeBlockPos) -> void
433426 {
434427 void * const dstPix = outData+writeBlockArrayOffset;
435- const core::vectorSIMDu32 localOutPos = writeBlockPos - outOffsetLayer;
// New version: instead of a separate `localOutPos`, the by-value parameter is rebased in place,
// component by component, so that it becomes relative to outOffsetLayer.
428+ for (auto i=0 ; i<4 ; i++)
429+ writeBlockPos[i] -= outOffsetLayer[i];
436430

437431 value_t sample[ChannelCount];
438- const size_t offset = IImage::SBufferCopy::getLocalByteOffset (localOutPos , intermediateStrides[axis]);
432+ const size_t offset = IImage::SBufferCopy::getLocalByteOffset (writeBlockPos , intermediateStrides[axis]);
439433 const auto * first = intermediateStorage[axis]+offset;
440434 std::copy (first,first+ChannelCount,sample);
441435

442436 sample[alphaChannel] *= coverageScale;
// From here on `writeBlockPos` holds the rebased (local) position, not the value passed in.
443- storeToTexel (sample,dstPix,localOutPos );
437+ storeToTexel (sample,dstPix,writeBlockPos );
444438 };
445439 const ICPUImage::SSubresourceLayers subresource = {static_cast <IImage::E_ASPECT_FLAGS>(0u ),outMipLevel,outOffsetLayer.w ,1 };
446440 const IImageFilter::IState::TexelRange range = {outOffset,outExtent};
@@ -452,32 +446,34 @@ class CBlitImageFilter :
452446
453447 // process
454448 state->normalization .template initialize <double >();
455- const core::vectorSIMDf fInExtent (inExtentLayerCount);
456- const core::vectorSIMDf fOutExtent (outExtentLayerCount);
457- const auto fScale = fInExtent .preciseDivision (fOutExtent );
458- const auto halfTexelOffset = fScale *0 .5f -core::vectorSIMDf (0 .f ,0 .f ,0 .f ,0 .5f );
459- const auto startCoord = [&halfTexelOffset,state]() -> core::vectorSIMDi32
449+ const hlsl::float64_t3 fInExtent (inExtentLayerCount.x ,inExtentLayerCount.y ,inExtentLayerCount.z );
450+ const hlsl::float64_t3 fOutExtent (outExtentLayerCount.x ,outExtentLayerCount.y ,outExtentLayerCount.z );
451+ const auto fScale = hlsl::float32_t3 (fInExtent /fOutExtent );
452+ const auto startCoord = [fScale ,state]() -> hlsl::int32_t4
460453 {
461- return core::vectorSIMDi32 (
454+ const auto halfTexelOffset = fScale *0 .5f ;
455+ return hlsl::int32_t4 (
462456 std::get<0 >(state->kernels ).getWindowMinCoord (halfTexelOffset.x ),
463457 std::get<1 >(state->kernels ).getWindowMinCoord (halfTexelOffset.y ),
464- std::get<2 >(state->kernels ).getWindowMinCoord (halfTexelOffset.z ),0 );
458+ std::get<2 >(state->kernels ).getWindowMinCoord (halfTexelOffset.z ),
459+ 0
460+ );
465461 }();
466- const auto windowMinCoordBase = inOffsetBaseLayer+startCoord;
462+ // important we are aware of signedness here
463+ const hlsl::int32_t4 windowMinCoordBase = hlsl::int32_t4 (inOffsetBaseLayer)+startCoord;
467464
468- core::vectorSIMDu32 phaseCount = IBlitUtilities::getPhaseCount (inExtentLayerCount, outExtentLayerCount, inImageType);
469- phaseCount = core ::max (phaseCount, core::vectorSIMDu32 (1 , 1 , 1 ));
470- const core::vectorSIMDu32 axisOffsets = blit_utils_t ::template getScaledKernelPhasedLUTAxisOffsets (phaseCount, real_window_size);
465+ auto phaseCount = IBlitUtilities::getPhaseCount (inExtentLayerCount. xyz , outExtentLayerCount. xyz , inImageType);
466+ phaseCount = hlsl ::max (phaseCount,hlsl::uint32_t3 (1 ,1 , 1 ));
467+ const auto axisOffsets = blit_utils_t ::template getScaledKernelPhasedLUTAxisOffsets (phaseCount,real_window_size);
471468 constexpr auto MaxAxisCount = 3 ;
472469 lut_value_t * scaledKernelPhasedLUTPixel[MaxAxisCount];
473470 for (auto i = 0 ; i < MaxAxisCount; ++i)
474471 scaledKernelPhasedLUTPixel[i] = reinterpret_cast <lut_value_t *>(state->scratchMemory + getScratchOffset (state, ESU_SCALED_KERNEL_PHASED_LUT) + axisOffsets[i]);
475472
476473 for (uint32_t layer=0 ; layer!=layerCount; layer++) // TODO: could be parallelized
477474 {
478- const core::vectorSIMDi32 vLayer (0 ,0 ,0 ,layer);
479- const auto windowMinCoord = windowMinCoordBase+vLayer;
480- const auto outOffsetLayer = outOffsetBaseLayer+vLayer;
475+ const hlsl::int32_t4 windowMinCoord (windowMinCoordBase.xyz ,windowMinCoordBase.w +layer);
476+ const hlsl::uint32_t4 outOffsetLayer (outOffsetBaseLayer.xyz ,outOffsetBaseLayer.w +layer);
481477 // reset coverage counter
482478 constexpr bool is_seq_policy_v = std::is_same_v<std::remove_reference_t <ExecutionPolicy>,core::execution::sequenced_policy>;
483479 using cond_atomic_int32_t = std::conditional_t <is_seq_policy_v,int32_t ,std::atomic_int32_t >;
@@ -517,19 +513,19 @@ class CBlitImageFilter :
517513 uint32_t decode_offset;
518514 // whole line plus window borders
519515 value_t * lineBuffer;
520- core::vectorSIMDi32 localTexCoord (0 );
516+ hlsl::int32_t3 localTexCoord (0 , 0 , 0 );
521517 localTexCoord[loopCoordID[0 ]] = batchCoord[0 ];
522518 localTexCoord[loopCoordID[1 ]] = batchCoord[1 ];
523519 if (axis!=IImage::ET_1D)
524- lineBuffer = intermediateStorage[axis-1 ]+core ::dot (static_cast <const core::vectorSIMDi32 &>(intermediateStrides[axis-1 ]),localTexCoord)[ 0 ] ;
520+ lineBuffer = intermediateStorage[axis-1 ]+hlsl ::dot (reinterpret_cast <const hlsl::int32_t3 &>(intermediateStrides[axis-1 ]),localTexCoord);
525521 else
526522 {
527523 const auto inputEnd = inExtent.width +real_window_size.x ;
528524 decode_offset = scratchHelper.template alloc <is_seq_policy_v>();
529525 lineBuffer = intermediateStorage[1 ]+decode_offset*ChannelCount*inputEnd;
530526 for (auto & i=localTexCoord.x ; i<inputEnd; i++)
531527 {
532- core::vectorSIMDi32 globalTexelCoord (localTexCoord+windowMinCoord);
528+ core::vectorSIMDi32 globalTexelCoord (localTexCoord. x +windowMinCoord. x ,localTexCoord. y +windowMinCoord. y ,localTexCoord. z +windowMinCoord. z );
533529
534530 core::vectorSIMDu32 blockLocalTexelCoord (0u );
535531 const void * srcPix[] = { // multiple loads for texture boundaries aren't that bad
@@ -562,11 +558,7 @@ class CBlitImageFilter :
562558
// Returns one term of the filter sum for a channel: the kernel weight looked up in the phased LUT
// of the current axis at (phaseIndex, windowPixel, channel), multiplied by the line-buffer value
// at windowCoord (rebased by windowMinCoord[axis]) for the same channel.
563559 auto getWeightedSample = [scaledKernelPhasedLUTPixel, windowSize, lineBuffer, &windowMinCoord, axis](const auto & windowCoord, const auto phaseIndex, const auto windowPixel, const auto channel) -> value_t
564560 {
565- value_t kernelWeight;
566- if constexpr (std::is_same_v<lut_value_t , uint16_t >)
567- kernelWeight = value_t (core::Float16Compressor::decompress (scaledKernelPhasedLUTPixel[axis][(phaseIndex * windowSize + windowPixel) * ChannelCount + channel]));
568- else
569- kernelWeight = scaledKernelPhasedLUTPixel[axis][(phaseIndex * windowSize + windowPixel) * ChannelCount + channel];
// NOTE(review): the old code decompressed the LUT entry when lut_value_t was uint16_t; the new
// code relies on static_cast alone. That is only equivalent if lut_value_t is no longer a raw
// uint16_t bit pattern (e.g. a half-float type with a converting operator) - confirm.
561+ const value_t kernelWeight = static_cast <value_t >(scaledKernelPhasedLUTPixel[axis][(phaseIndex * windowSize + windowPixel) * ChannelCount + channel]);
570562

571563 return kernelWeight * lineBuffer[(windowCoord - windowMinCoord[axis]) * ChannelCount + channel];
572564 };
@@ -576,11 +568,11 @@ class CBlitImageFilter :
576568 for (auto & i=(localTexCoord[axis]=0 ); i<outExtentLayerCount[axis]; i++)
577569 {
578570 // get output pixel
579- auto * const value = intermediateStorage[axis]+core ::dot (static_cast <const core::vectorSIMDi32 &>(intermediateStrides[axis]),localTexCoord)[ 0 ] ;
571+ auto * const value = intermediateStorage[axis]+hlsl ::dot (reinterpret_cast <const hlsl::int32_t3 &>(intermediateStrides[axis]),localTexCoord);
580572
581573 // do the filtering
582574 float tmp = float (i)+0 .5f ;
583- int32_t windowCoord = kernel.getWindowMinCoord (tmp*fScale [axis], tmp);
575+ int32_t windowCoord = kernel.getWindowMinCoord (tmp*fScale [axis],tmp);
584576
585577 for (auto ch = 0 ; ch < ChannelCount; ++ch)
586578 value[ch] = getWeightedSample (windowCoord, phaseIndex, 0 , ch);
@@ -594,7 +586,12 @@ class CBlitImageFilter :
594586 }
595587 if (lastPass)
596588 {
597- const core::vectorSIMDu32 localOutPos = localTexCoord+outOffsetBaseLayer+vLayer;
// Output position of the texel just filtered: the per-layer output offset plus the local texel
// coordinate. `outOffsetLayer.w` already equals outOffsetBaseLayer.w+layer, so `layer` must not be
// added a second time (the replaced expression `localTexCoord+outOffsetBaseLayer+vLayer` added it once).
589+ const core::vectorSIMDu32 localOutPos (
590+ outOffsetLayer.x +localTexCoord.x ,
591+ outOffsetLayer.y +localTexCoord.y ,
592+ outOffsetLayer.z +localTexCoord.z ,
593+ outOffsetLayer.w
594+ );
598595 if (needsNormalization)
599596 state->normalization .prepass (value,localOutPos,0u ,0u ,ChannelCount);
600597 else // store to image, we're done
@@ -678,13 +675,14 @@ class CBlitImageFilter :
678675 std::mutex mutex;
679676 };
680677
// Computes the three extents indexed [0], [1], [2] that the blit passes work with.
// Old form: filled a caller-provided array of three core::vectorSIMDi32 (asserted non-null).
// New form: returns the same three rows by value as an hlsl::int32_t3x3.
//   state            - supplies inExtent and outExtent
//   real_window_size - only components [1] and [2] are read, as padding on input height/depth
681- static inline void getIntermediateExtents (core::vectorSIMDi32* intermediateExtent, const state_type* state, const core::vectorSIMDi32& real_window_size)
678+ // the WxHxD extent for each blit axis output
679+ static inline hlsl::int32_t3x3 getIntermediateExtents (const state_type* state, const hlsl::int32_t3& real_window_size)
682680 {
683- assert ( intermediateExtent) ;
684-
685- intermediateExtent[0 ] = core::vectorSIMDi32 (state->outExtent .width , state->inExtent .height + real_window_size[ 1 ] , state->inExtent .depth + real_window_size[2 ]);
686- intermediateExtent[1 ] = core::vectorSIMDi32 (state->outExtent .width , state->outExtent .height , state->inExtent .depth + real_window_size[ 2 ] );
687- intermediateExtent[ 2 ] = core::vectorSIMDi32 (state-> outExtent . width , state-> outExtent . height , state-> outExtent . depth ) ;
681+ hlsl::int32_t3x3 intermediateExtent;
// [0]: output width, input height and depth each padded by the window size.
682+ intermediateExtent[ 0 ] = hlsl::int32_t3 (state-> outExtent . width , state-> inExtent . height + real_window_size[ 1 ], state-> inExtent . depth + real_window_size[ 2 ]);
// [1]: output width and height, input depth padded by the window size.
683+ intermediateExtent[1 ] = hlsl::int32_t3 (state->outExtent .width , state->outExtent .height , state->inExtent .depth + real_window_size[2 ]);
// [2]: the full output extent.
684+ intermediateExtent[2 ] = hlsl::int32_t3 (state->outExtent .width , state->outExtent .height , state->outExtent .depth );
685+ return intermediateExtent ;
686686 }
689687};
690688
0 commit comments