@@ -282,6 +282,7 @@ namespace HWY_NAMESPACE
282282 uint8_t * out, uint32_t w, uint32_t h, uint32_t src_stride)
283283 {
284284 const HWY_FULL (int32_t ) di;
285+ const hn::Rebind<uint8_t , decltype (di)> du8;
285286 const uint32_t L = (uint32_t )Lanes (di);
286287
287288 for (uint32_t j = 0 ; j < h; ++j)
@@ -294,16 +295,10 @@ namespace HWY_NAMESPACE
294295 uint32_t i = 0 ;
295296 for (; i + L <= w; i += L)
296297 {
297- auto vr = LoadU (di, rRow + i);
298- auto vg = LoadU (di, gRow + i);
299- auto vb = LoadU (di, bRow + i);
300- /* Scalar store for the narrowing conversion — simple and correct */
301- for (uint32_t k = 0 ; k < L; ++k)
302- {
303- dst[(i + k) * 3 + 0 ] = (uint8_t )ExtractLane (vr, k);
304- dst[(i + k) * 3 + 1 ] = (uint8_t )ExtractLane (vg, k);
305- dst[(i + k) * 3 + 2 ] = (uint8_t )ExtractLane (vb, k);
306- }
298+ auto vr = DemoteTo (du8, LoadU (di, rRow + i));
299+ auto vg = DemoteTo (du8, LoadU (di, gRow + i));
300+ auto vb = DemoteTo (du8, LoadU (di, bRow + i));
301+ StoreInterleaved3 (vr, vg, vb, du8, dst + i * 3 );
307302 }
308303 for (; i < w; ++i)
309304 {
@@ -319,6 +314,7 @@ namespace HWY_NAMESPACE
319314 uint32_t w, uint32_t h, uint32_t dst_stride)
320315 {
321316 const HWY_FULL (int32_t ) di;
317+ const hn::Rebind<uint8_t , decltype (di)> du8;
322318 const uint32_t L = (uint32_t )Lanes (di);
323319
324320 for (uint32_t j = 0 ; j < h; ++j)
@@ -331,19 +327,11 @@ namespace HWY_NAMESPACE
331327 uint32_t i = 0 ;
332328 for (; i + L <= w; i += L)
333329 {
334- /* Scalar load for the widening conversion */
335- HWY_ALIGN int32_t tmpR[HWY_MAX_LANES_D (HWY_FULL (int32_t ))];
336- HWY_ALIGN int32_t tmpG[HWY_MAX_LANES_D (HWY_FULL (int32_t ))];
337- HWY_ALIGN int32_t tmpB[HWY_MAX_LANES_D (HWY_FULL (int32_t ))];
338- for (uint32_t k = 0 ; k < L; ++k)
339- {
340- tmpR[k] = (int32_t )src[(i + k) * 3 + 0 ];
341- tmpG[k] = (int32_t )src[(i + k) * 3 + 1 ];
342- tmpB[k] = (int32_t )src[(i + k) * 3 + 2 ];
343- }
344- StoreU (Load (di, tmpR), di, rRow + i);
345- StoreU (Load (di, tmpG), di, gRow + i);
346- StoreU (Load (di, tmpB), di, bRow + i);
330+ hn::VFromD<decltype (du8)> vr, vg, vb;
331+ LoadInterleaved3 (du8, src + i * 3 , vr, vg, vb);
332+ StoreU (PromoteTo (di, vr), di, rRow + i);
333+ StoreU (PromoteTo (di, vg), di, gRow + i);
334+ StoreU (PromoteTo (di, vb), di, bRow + i);
347335 }
348336 for (; i < w; ++i)
349337 {
@@ -359,6 +347,7 @@ namespace HWY_NAMESPACE
359347 uint16_t * out, uint32_t w, uint32_t h, uint32_t src_stride)
360348 {
361349 const HWY_FULL (int32_t ) di;
350+ const hn::Rebind<uint16_t , decltype (di)> du16;
362351 const uint32_t L = (uint32_t )Lanes (di);
363352
364353 for (uint32_t j = 0 ; j < h; ++j)
@@ -371,15 +360,10 @@ namespace HWY_NAMESPACE
371360 uint32_t i = 0 ;
372361 for (; i + L <= w; i += L)
373362 {
374- auto vr = LoadU (di, rRow + i);
375- auto vg = LoadU (di, gRow + i);
376- auto vb = LoadU (di, bRow + i);
377- for (uint32_t k = 0 ; k < L; ++k)
378- {
379- dst[(i + k) * 3 + 0 ] = (uint16_t )ExtractLane (vr, k);
380- dst[(i + k) * 3 + 1 ] = (uint16_t )ExtractLane (vg, k);
381- dst[(i + k) * 3 + 2 ] = (uint16_t )ExtractLane (vb, k);
382- }
363+ auto vr = DemoteTo (du16, LoadU (di, rRow + i));
364+ auto vg = DemoteTo (du16, LoadU (di, gRow + i));
365+ auto vb = DemoteTo (du16, LoadU (di, bRow + i));
366+ StoreInterleaved3 (vr, vg, vb, du16, dst + i * 3 );
383367 }
384368 for (; i < w; ++i)
385369 {
@@ -395,6 +379,7 @@ namespace HWY_NAMESPACE
395379 uint32_t w, uint32_t h, uint32_t dst_stride)
396380 {
397381 const HWY_FULL (int32_t ) di;
382+ const hn::Rebind<uint16_t , decltype (di)> du16;
398383 const uint32_t L = (uint32_t )Lanes (di);
399384
400385 for (uint32_t j = 0 ; j < h; ++j)
@@ -407,18 +392,11 @@ namespace HWY_NAMESPACE
407392 uint32_t i = 0 ;
408393 for (; i + L <= w; i += L)
409394 {
410- HWY_ALIGN int32_t tmpR[HWY_MAX_LANES_D (HWY_FULL (int32_t ))];
411- HWY_ALIGN int32_t tmpG[HWY_MAX_LANES_D (HWY_FULL (int32_t ))];
412- HWY_ALIGN int32_t tmpB[HWY_MAX_LANES_D (HWY_FULL (int32_t ))];
413- for (uint32_t k = 0 ; k < L; ++k)
414- {
415- tmpR[k] = (int32_t )src[(i + k) * 3 + 0 ];
416- tmpG[k] = (int32_t )src[(i + k) * 3 + 1 ];
417- tmpB[k] = (int32_t )src[(i + k) * 3 + 2 ];
418- }
419- StoreU (Load (di, tmpR), di, rRow + i);
420- StoreU (Load (di, tmpG), di, gRow + i);
421- StoreU (Load (di, tmpB), di, bRow + i);
395+ hn::VFromD<decltype (du16)> vr, vg, vb;
396+ LoadInterleaved3 (du16, src + i * 3 , vr, vg, vb);
397+ StoreU (PromoteTo (di, vr), di, rRow + i);
398+ StoreU (PromoteTo (di, vg), di, gRow + i);
399+ StoreU (PromoteTo (di, vb), di, bRow + i);
422400 }
423401 for (; i < w; ++i)
424402 {
@@ -434,16 +412,17 @@ namespace HWY_NAMESPACE
434412 uint32_t n)
435413 {
436414 const HWY_FULL (int32_t ) di;
415+ const hn::Rebind<uint8_t , decltype (di)> du8;
437416 const uint32_t L = (uint32_t )Lanes (di);
438417 const auto vZero = Zero (di);
439418 const auto vMax = Set (di, 255 );
440419
441420 uint32_t i = 0 ;
442421 for (; i + L <= n; i += L)
443422 {
423+ /* Clamp to [0,255] then saturating narrow to uint8 */
444424 auto v = Clamp (LoadU (di, src + i), vZero, vMax);
445- for (uint32_t k = 0 ; k < L; ++k)
446- dst[i + k] = (uint8_t )ExtractLane (v, k);
425+ StoreU (DemoteTo (du8, v), du8, dst + i);
447426 }
448427 for (; i < n; ++i)
449428 {
@@ -457,16 +436,14 @@ namespace HWY_NAMESPACE
457436 uint32_t n)
458437 {
459438 const HWY_FULL (int32_t ) di;
439+ const hn::Rebind<int16_t , decltype (di)> di16;
460440 const uint32_t L = (uint32_t )Lanes (di);
461- const auto vMin = Set (di, -32768 );
462- const auto vMax = Set (di, 32767 );
463441
464442 uint32_t i = 0 ;
465443 for (; i + L <= n; i += L)
466444 {
467- auto v = Clamp (LoadU (di, src + i), vMin, vMax);
468- for (uint32_t k = 0 ; k < L; ++k)
469- dst[i + k] = (int16_t )ExtractLane (v, k);
445+ /* DemoteTo int16 saturates to [-32768, 32767] */
446+ StoreU (DemoteTo (di16, LoadU (di, src + i)), di16, dst + i);
470447 }
471448 for (; i < n; ++i)
472449 {
@@ -480,16 +457,17 @@ namespace HWY_NAMESPACE
480457 uint32_t n)
481458 {
482459 const HWY_FULL (int32_t ) di;
460+ const hn::Rebind<uint16_t , decltype (di)> du16;
483461 const uint32_t L = (uint32_t )Lanes (di);
484462 const auto vZero = Zero (di);
485463 const auto vMax = Set (di, 65535 );
486464
487465 uint32_t i = 0 ;
488466 for (; i + L <= n; i += L)
489467 {
468+ /* Clamp to [0,65535] then saturating narrow to uint16 */
490469 auto v = Clamp (LoadU (di, src + i), vZero, vMax);
491- for (uint32_t k = 0 ; k < L; ++k)
492- dst[i + k] = (uint16_t )ExtractLane (v, k);
470+ StoreU (DemoteTo (du16, v), du16, dst + i);
493471 }
494472 for (; i < n; ++i)
495473 {
@@ -503,15 +481,16 @@ namespace HWY_NAMESPACE
503481 uint32_t n)
504482 {
505483 const HWY_FULL (int32_t ) di;
484+ const hn::Rebind<uint32_t , decltype (di)> du32;
506485 const uint32_t L = (uint32_t )Lanes (di);
507486 const auto vZero = Zero (di);
508487
509488 uint32_t i = 0 ;
510489 for (; i + L <= n; i += L)
511490 {
491+ /* Clamp negatives to 0 and reinterpret as uint32 */
512492 auto v = Max (LoadU (di, src + i), vZero);
513- for (uint32_t k = 0 ; k < L; ++k)
514- dst[i + k] = (uint32_t )ExtractLane (v, k);
493+ StoreU (BitCast (du32, v), du32, dst + i);
515494 }
516495 for (; i < n; ++i)
517496 dst[i] = src[i] < 0 ? 0u : (uint32_t )src[i];
@@ -631,19 +610,11 @@ namespace HWY_NAMESPACE
631610 uint32_t i = 0 ;
632611 for (; i + L <= w; i += L)
633612 {
634- /* Scalar gather from interleaved source, vectorized store */
635- HWY_ALIGN int32_t t0[HWY_MAX_LANES_D (HWY_FULL (int32_t ))];
636- HWY_ALIGN int32_t t1[HWY_MAX_LANES_D (HWY_FULL (int32_t ))];
637- HWY_ALIGN int32_t t2[HWY_MAX_LANES_D (HWY_FULL (int32_t ))];
638- for (uint32_t k = 0 ; k < L; ++k)
639- {
640- t0[k] = src[(i + k) * 3 + 0 ];
641- t1[k] = src[(i + k) * 3 + 1 ];
642- t2[k] = src[(i + k) * 3 + 2 ];
643- }
644- StoreU (Load (di, t0), di, dest[0 ] + i);
645- StoreU (Load (di, t1), di, dest[1 ] + i);
646- StoreU (Load (di, t2), di, dest[2 ] + i);
613+ hn::VFromD<decltype (di)> v0, v1, v2;
614+ LoadInterleaved3 (di, src + i * 3 , v0, v1, v2);
615+ StoreU (v0, di, dest[0 ] + i);
616+ StoreU (v1, di, dest[1 ] + i);
617+ StoreU (v2, di, dest[2 ] + i);
647618 }
648619 for (; i < w; ++i)
649620 {
@@ -657,21 +628,12 @@ namespace HWY_NAMESPACE
657628 uint32_t i = 0 ;
658629 for (; i + L <= w; i += L)
659630 {
660- HWY_ALIGN int32_t t0[HWY_MAX_LANES_D (HWY_FULL (int32_t ))];
661- HWY_ALIGN int32_t t1[HWY_MAX_LANES_D (HWY_FULL (int32_t ))];
662- HWY_ALIGN int32_t t2[HWY_MAX_LANES_D (HWY_FULL (int32_t ))];
663- HWY_ALIGN int32_t t3[HWY_MAX_LANES_D (HWY_FULL (int32_t ))];
664- for (uint32_t k = 0 ; k < L; ++k)
665- {
666- t0[k] = src[(i + k) * 4 + 0 ];
667- t1[k] = src[(i + k) * 4 + 1 ];
668- t2[k] = src[(i + k) * 4 + 2 ];
669- t3[k] = src[(i + k) * 4 + 3 ];
670- }
671- StoreU (Load (di, t0), di, dest[0 ] + i);
672- StoreU (Load (di, t1), di, dest[1 ] + i);
673- StoreU (Load (di, t2), di, dest[2 ] + i);
674- StoreU (Load (di, t3), di, dest[3 ] + i);
631+ hn::VFromD<decltype (di)> v0, v1, v2, v3;
632+ LoadInterleaved4 (di, src + i * 4 , v0, v1, v2, v3);
633+ StoreU (v0, di, dest[0 ] + i);
634+ StoreU (v1, di, dest[1 ] + i);
635+ StoreU (v2, di, dest[2 ] + i);
636+ StoreU (v3, di, dest[3 ] + i);
675637 }
676638 for (; i < w; ++i)
677639 {
@@ -696,6 +658,7 @@ namespace HWY_NAMESPACE
696658 uint32_t w, int32_t adjust)
697659 {
698660 const HWY_FULL (int32_t ) di;
661+ const hn::Rebind<uint8_t , decltype (di)> du8;
699662 const uint32_t L = (uint32_t )Lanes (di);
700663 const auto vAdj = Set (di, adjust);
701664
@@ -704,15 +667,10 @@ namespace HWY_NAMESPACE
704667 uint32_t i = 0 ;
705668 for (; i + L <= w; i += L)
706669 {
707- auto v0 = Add (LoadU (di, src[0 ] + i), vAdj);
708- auto v1 = Add (LoadU (di, src[1 ] + i), vAdj);
709- auto v2 = Add (LoadU (di, src[2 ] + i), vAdj);
710- for (uint32_t k = 0 ; k < L; ++k)
711- {
712- dest[(i + k) * 3 + 0 ] = (uint8_t )ExtractLane (v0, k);
713- dest[(i + k) * 3 + 1 ] = (uint8_t )ExtractLane (v1, k);
714- dest[(i + k) * 3 + 2 ] = (uint8_t )ExtractLane (v2, k);
715- }
670+ auto v0 = DemoteTo (du8, Add (LoadU (di, src[0 ] + i), vAdj));
671+ auto v1 = DemoteTo (du8, Add (LoadU (di, src[1 ] + i), vAdj));
672+ auto v2 = DemoteTo (du8, Add (LoadU (di, src[2 ] + i), vAdj));
673+ StoreInterleaved3 (v0, v1, v2, du8, dest + i * 3 );
716674 }
717675 for (; i < w; ++i)
718676 {
@@ -726,17 +684,11 @@ namespace HWY_NAMESPACE
726684 uint32_t i = 0 ;
727685 for (; i + L <= w; i += L)
728686 {
729- auto v0 = Add (LoadU (di, src[0 ] + i), vAdj);
730- auto v1 = Add (LoadU (di, src[1 ] + i), vAdj);
731- auto v2 = Add (LoadU (di, src[2 ] + i), vAdj);
732- auto v3 = Add (LoadU (di, src[3 ] + i), vAdj);
733- for (uint32_t k = 0 ; k < L; ++k)
734- {
735- dest[(i + k) * 4 + 0 ] = (uint8_t )ExtractLane (v0, k);
736- dest[(i + k) * 4 + 1 ] = (uint8_t )ExtractLane (v1, k);
737- dest[(i + k) * 4 + 2 ] = (uint8_t )ExtractLane (v2, k);
738- dest[(i + k) * 4 + 3 ] = (uint8_t )ExtractLane (v3, k);
739- }
687+ auto v0 = DemoteTo (du8, Add (LoadU (di, src[0 ] + i), vAdj));
688+ auto v1 = DemoteTo (du8, Add (LoadU (di, src[1 ] + i), vAdj));
689+ auto v2 = DemoteTo (du8, Add (LoadU (di, src[2 ] + i), vAdj));
690+ auto v3 = DemoteTo (du8, Add (LoadU (di, src[3 ] + i), vAdj));
691+ StoreInterleaved4 (v0, v1, v2, v3, du8, dest + i * 4 );
740692 }
741693 for (; i < w; ++i)
742694 {
@@ -761,6 +713,7 @@ namespace HWY_NAMESPACE
761713 uint32_t w, int32_t adjust)
762714 {
763715 const HWY_FULL (int32_t ) di;
716+ const hn::Rebind<uint16_t , decltype (di)> du16;
764717 const uint32_t L = (uint32_t )Lanes (di);
765718 const auto vAdj = Set (di, adjust);
766719
@@ -769,15 +722,10 @@ namespace HWY_NAMESPACE
769722 uint32_t i = 0 ;
770723 for (; i + L <= w; i += L)
771724 {
772- auto v0 = Add (LoadU (di, src[0 ] + i), vAdj);
773- auto v1 = Add (LoadU (di, src[1 ] + i), vAdj);
774- auto v2 = Add (LoadU (di, src[2 ] + i), vAdj);
775- for (uint32_t k = 0 ; k < L; ++k)
776- {
777- dest[(i + k) * 3 + 0 ] = (uint16_t )ExtractLane (v0, k);
778- dest[(i + k) * 3 + 1 ] = (uint16_t )ExtractLane (v1, k);
779- dest[(i + k) * 3 + 2 ] = (uint16_t )ExtractLane (v2, k);
780- }
725+ auto v0 = DemoteTo (du16, Add (LoadU (di, src[0 ] + i), vAdj));
726+ auto v1 = DemoteTo (du16, Add (LoadU (di, src[1 ] + i), vAdj));
727+ auto v2 = DemoteTo (du16, Add (LoadU (di, src[2 ] + i), vAdj));
728+ StoreInterleaved3 (v0, v1, v2, du16, dest + i * 3 );
781729 }
782730 for (; i < w; ++i)
783731 {
@@ -791,17 +739,11 @@ namespace HWY_NAMESPACE
791739 uint32_t i = 0 ;
792740 for (; i + L <= w; i += L)
793741 {
794- auto v0 = Add (LoadU (di, src[0 ] + i), vAdj);
795- auto v1 = Add (LoadU (di, src[1 ] + i), vAdj);
796- auto v2 = Add (LoadU (di, src[2 ] + i), vAdj);
797- auto v3 = Add (LoadU (di, src[3 ] + i), vAdj);
798- for (uint32_t k = 0 ; k < L; ++k)
799- {
800- dest[(i + k) * 4 + 0 ] = (uint16_t )ExtractLane (v0, k);
801- dest[(i + k) * 4 + 1 ] = (uint16_t )ExtractLane (v1, k);
802- dest[(i + k) * 4 + 2 ] = (uint16_t )ExtractLane (v2, k);
803- dest[(i + k) * 4 + 3 ] = (uint16_t )ExtractLane (v3, k);
804- }
742+ auto v0 = DemoteTo (du16, Add (LoadU (di, src[0 ] + i), vAdj));
743+ auto v1 = DemoteTo (du16, Add (LoadU (di, src[1 ] + i), vAdj));
744+ auto v2 = DemoteTo (du16, Add (LoadU (di, src[2 ] + i), vAdj));
745+ auto v3 = DemoteTo (du16, Add (LoadU (di, src[3 ] + i), vAdj));
746+ StoreInterleaved4 (v0, v1, v2, v3, du16, dest + i * 4 );
805747 }
806748 for (; i < w; ++i)
807749 {
0 commit comments