Skip to content

Commit 2b9ca4f

Browse files
author
Grok Compression
committed
SIMD: further optimize
1 parent ae40674 commit 2b9ca4f

1 file changed

Lines changed: 65 additions & 123 deletions

File tree

src/lib/core/util/GrkImageSIMD.cpp

Lines changed: 65 additions & 123 deletions
Original file line number | Diff line number | Diff line change
@@ -282,6 +282,7 @@ namespace HWY_NAMESPACE
282282
uint8_t* out, uint32_t w, uint32_t h, uint32_t src_stride)
283283
{
284284
const HWY_FULL(int32_t) di;
285+
const hn::Rebind<uint8_t, decltype(di)> du8;
285286
const uint32_t L = (uint32_t)Lanes(di);
286287

287288
for(uint32_t j = 0; j < h; ++j)
@@ -294,16 +295,10 @@ namespace HWY_NAMESPACE
294295
uint32_t i = 0;
295296
for(; i + L <= w; i += L)
296297
{
297-
auto vr = LoadU(di, rRow + i);
298-
auto vg = LoadU(di, gRow + i);
299-
auto vb = LoadU(di, bRow + i);
300-
/* Scalar store for the narrowing conversion — simple and correct */
301-
for(uint32_t k = 0; k < L; ++k)
302-
{
303-
dst[(i + k) * 3 + 0] = (uint8_t)ExtractLane(vr, k);
304-
dst[(i + k) * 3 + 1] = (uint8_t)ExtractLane(vg, k);
305-
dst[(i + k) * 3 + 2] = (uint8_t)ExtractLane(vb, k);
306-
}
298+
auto vr = DemoteTo(du8, LoadU(di, rRow + i));
299+
auto vg = DemoteTo(du8, LoadU(di, gRow + i));
300+
auto vb = DemoteTo(du8, LoadU(di, bRow + i));
301+
StoreInterleaved3(vr, vg, vb, du8, dst + i * 3);
307302
}
308303
for(; i < w; ++i)
309304
{
@@ -319,6 +314,7 @@ namespace HWY_NAMESPACE
319314
uint32_t w, uint32_t h, uint32_t dst_stride)
320315
{
321316
const HWY_FULL(int32_t) di;
317+
const hn::Rebind<uint8_t, decltype(di)> du8;
322318
const uint32_t L = (uint32_t)Lanes(di);
323319

324320
for(uint32_t j = 0; j < h; ++j)
@@ -331,19 +327,11 @@ namespace HWY_NAMESPACE
331327
uint32_t i = 0;
332328
for(; i + L <= w; i += L)
333329
{
334-
/* Scalar load for the widening conversion */
335-
HWY_ALIGN int32_t tmpR[HWY_MAX_LANES_D(HWY_FULL(int32_t))];
336-
HWY_ALIGN int32_t tmpG[HWY_MAX_LANES_D(HWY_FULL(int32_t))];
337-
HWY_ALIGN int32_t tmpB[HWY_MAX_LANES_D(HWY_FULL(int32_t))];
338-
for(uint32_t k = 0; k < L; ++k)
339-
{
340-
tmpR[k] = (int32_t)src[(i + k) * 3 + 0];
341-
tmpG[k] = (int32_t)src[(i + k) * 3 + 1];
342-
tmpB[k] = (int32_t)src[(i + k) * 3 + 2];
343-
}
344-
StoreU(Load(di, tmpR), di, rRow + i);
345-
StoreU(Load(di, tmpG), di, gRow + i);
346-
StoreU(Load(di, tmpB), di, bRow + i);
330+
hn::VFromD<decltype(du8)> vr, vg, vb;
331+
LoadInterleaved3(du8, src + i * 3, vr, vg, vb);
332+
StoreU(PromoteTo(di, vr), di, rRow + i);
333+
StoreU(PromoteTo(di, vg), di, gRow + i);
334+
StoreU(PromoteTo(di, vb), di, bRow + i);
347335
}
348336
for(; i < w; ++i)
349337
{
@@ -359,6 +347,7 @@ namespace HWY_NAMESPACE
359347
uint16_t* out, uint32_t w, uint32_t h, uint32_t src_stride)
360348
{
361349
const HWY_FULL(int32_t) di;
350+
const hn::Rebind<uint16_t, decltype(di)> du16;
362351
const uint32_t L = (uint32_t)Lanes(di);
363352

364353
for(uint32_t j = 0; j < h; ++j)
@@ -371,15 +360,10 @@ namespace HWY_NAMESPACE
371360
uint32_t i = 0;
372361
for(; i + L <= w; i += L)
373362
{
374-
auto vr = LoadU(di, rRow + i);
375-
auto vg = LoadU(di, gRow + i);
376-
auto vb = LoadU(di, bRow + i);
377-
for(uint32_t k = 0; k < L; ++k)
378-
{
379-
dst[(i + k) * 3 + 0] = (uint16_t)ExtractLane(vr, k);
380-
dst[(i + k) * 3 + 1] = (uint16_t)ExtractLane(vg, k);
381-
dst[(i + k) * 3 + 2] = (uint16_t)ExtractLane(vb, k);
382-
}
363+
auto vr = DemoteTo(du16, LoadU(di, rRow + i));
364+
auto vg = DemoteTo(du16, LoadU(di, gRow + i));
365+
auto vb = DemoteTo(du16, LoadU(di, bRow + i));
366+
StoreInterleaved3(vr, vg, vb, du16, dst + i * 3);
383367
}
384368
for(; i < w; ++i)
385369
{
@@ -395,6 +379,7 @@ namespace HWY_NAMESPACE
395379
uint32_t w, uint32_t h, uint32_t dst_stride)
396380
{
397381
const HWY_FULL(int32_t) di;
382+
const hn::Rebind<uint16_t, decltype(di)> du16;
398383
const uint32_t L = (uint32_t)Lanes(di);
399384

400385
for(uint32_t j = 0; j < h; ++j)
@@ -407,18 +392,11 @@ namespace HWY_NAMESPACE
407392
uint32_t i = 0;
408393
for(; i + L <= w; i += L)
409394
{
410-
HWY_ALIGN int32_t tmpR[HWY_MAX_LANES_D(HWY_FULL(int32_t))];
411-
HWY_ALIGN int32_t tmpG[HWY_MAX_LANES_D(HWY_FULL(int32_t))];
412-
HWY_ALIGN int32_t tmpB[HWY_MAX_LANES_D(HWY_FULL(int32_t))];
413-
for(uint32_t k = 0; k < L; ++k)
414-
{
415-
tmpR[k] = (int32_t)src[(i + k) * 3 + 0];
416-
tmpG[k] = (int32_t)src[(i + k) * 3 + 1];
417-
tmpB[k] = (int32_t)src[(i + k) * 3 + 2];
418-
}
419-
StoreU(Load(di, tmpR), di, rRow + i);
420-
StoreU(Load(di, tmpG), di, gRow + i);
421-
StoreU(Load(di, tmpB), di, bRow + i);
395+
hn::VFromD<decltype(du16)> vr, vg, vb;
396+
LoadInterleaved3(du16, src + i * 3, vr, vg, vb);
397+
StoreU(PromoteTo(di, vr), di, rRow + i);
398+
StoreU(PromoteTo(di, vg), di, gRow + i);
399+
StoreU(PromoteTo(di, vb), di, bRow + i);
422400
}
423401
for(; i < w; ++i)
424402
{
@@ -434,16 +412,17 @@ namespace HWY_NAMESPACE
434412
uint32_t n)
435413
{
436414
const HWY_FULL(int32_t) di;
415+
const hn::Rebind<uint8_t, decltype(di)> du8;
437416
const uint32_t L = (uint32_t)Lanes(di);
438417
const auto vZero = Zero(di);
439418
const auto vMax = Set(di, 255);
440419

441420
uint32_t i = 0;
442421
for(; i + L <= n; i += L)
443422
{
423+
/* Clamp to [0,255] then saturating narrow to uint8 */
444424
auto v = Clamp(LoadU(di, src + i), vZero, vMax);
445-
for(uint32_t k = 0; k < L; ++k)
446-
dst[i + k] = (uint8_t)ExtractLane(v, k);
425+
StoreU(DemoteTo(du8, v), du8, dst + i);
447426
}
448427
for(; i < n; ++i)
449428
{
@@ -457,16 +436,14 @@ namespace HWY_NAMESPACE
457436
uint32_t n)
458437
{
459438
const HWY_FULL(int32_t) di;
439+
const hn::Rebind<int16_t, decltype(di)> di16;
460440
const uint32_t L = (uint32_t)Lanes(di);
461-
const auto vMin = Set(di, -32768);
462-
const auto vMax = Set(di, 32767);
463441

464442
uint32_t i = 0;
465443
for(; i + L <= n; i += L)
466444
{
467-
auto v = Clamp(LoadU(di, src + i), vMin, vMax);
468-
for(uint32_t k = 0; k < L; ++k)
469-
dst[i + k] = (int16_t)ExtractLane(v, k);
445+
/* DemoteTo int16 saturates to [-32768, 32767] */
446+
StoreU(DemoteTo(di16, LoadU(di, src + i)), di16, dst + i);
470447
}
471448
for(; i < n; ++i)
472449
{
@@ -480,16 +457,17 @@ namespace HWY_NAMESPACE
480457
uint32_t n)
481458
{
482459
const HWY_FULL(int32_t) di;
460+
const hn::Rebind<uint16_t, decltype(di)> du16;
483461
const uint32_t L = (uint32_t)Lanes(di);
484462
const auto vZero = Zero(di);
485463
const auto vMax = Set(di, 65535);
486464

487465
uint32_t i = 0;
488466
for(; i + L <= n; i += L)
489467
{
468+
/* Clamp to [0,65535] then saturating narrow to uint16 */
490469
auto v = Clamp(LoadU(di, src + i), vZero, vMax);
491-
for(uint32_t k = 0; k < L; ++k)
492-
dst[i + k] = (uint16_t)ExtractLane(v, k);
470+
StoreU(DemoteTo(du16, v), du16, dst + i);
493471
}
494472
for(; i < n; ++i)
495473
{
@@ -503,15 +481,16 @@ namespace HWY_NAMESPACE
503481
uint32_t n)
504482
{
505483
const HWY_FULL(int32_t) di;
484+
const hn::Rebind<uint32_t, decltype(di)> du32;
506485
const uint32_t L = (uint32_t)Lanes(di);
507486
const auto vZero = Zero(di);
508487

509488
uint32_t i = 0;
510489
for(; i + L <= n; i += L)
511490
{
491+
/* Clamp negatives to 0 and reinterpret as uint32 */
512492
auto v = Max(LoadU(di, src + i), vZero);
513-
for(uint32_t k = 0; k < L; ++k)
514-
dst[i + k] = (uint32_t)ExtractLane(v, k);
493+
StoreU(BitCast(du32, v), du32, dst + i);
515494
}
516495
for(; i < n; ++i)
517496
dst[i] = src[i] < 0 ? 0u : (uint32_t)src[i];
@@ -631,19 +610,11 @@ namespace HWY_NAMESPACE
631610
uint32_t i = 0;
632611
for(; i + L <= w; i += L)
633612
{
634-
/* Scalar gather from interleaved source, vectorized store */
635-
HWY_ALIGN int32_t t0[HWY_MAX_LANES_D(HWY_FULL(int32_t))];
636-
HWY_ALIGN int32_t t1[HWY_MAX_LANES_D(HWY_FULL(int32_t))];
637-
HWY_ALIGN int32_t t2[HWY_MAX_LANES_D(HWY_FULL(int32_t))];
638-
for(uint32_t k = 0; k < L; ++k)
639-
{
640-
t0[k] = src[(i + k) * 3 + 0];
641-
t1[k] = src[(i + k) * 3 + 1];
642-
t2[k] = src[(i + k) * 3 + 2];
643-
}
644-
StoreU(Load(di, t0), di, dest[0] + i);
645-
StoreU(Load(di, t1), di, dest[1] + i);
646-
StoreU(Load(di, t2), di, dest[2] + i);
613+
hn::VFromD<decltype(di)> v0, v1, v2;
614+
LoadInterleaved3(di, src + i * 3, v0, v1, v2);
615+
StoreU(v0, di, dest[0] + i);
616+
StoreU(v1, di, dest[1] + i);
617+
StoreU(v2, di, dest[2] + i);
647618
}
648619
for(; i < w; ++i)
649620
{
@@ -657,21 +628,12 @@ namespace HWY_NAMESPACE
657628
uint32_t i = 0;
658629
for(; i + L <= w; i += L)
659630
{
660-
HWY_ALIGN int32_t t0[HWY_MAX_LANES_D(HWY_FULL(int32_t))];
661-
HWY_ALIGN int32_t t1[HWY_MAX_LANES_D(HWY_FULL(int32_t))];
662-
HWY_ALIGN int32_t t2[HWY_MAX_LANES_D(HWY_FULL(int32_t))];
663-
HWY_ALIGN int32_t t3[HWY_MAX_LANES_D(HWY_FULL(int32_t))];
664-
for(uint32_t k = 0; k < L; ++k)
665-
{
666-
t0[k] = src[(i + k) * 4 + 0];
667-
t1[k] = src[(i + k) * 4 + 1];
668-
t2[k] = src[(i + k) * 4 + 2];
669-
t3[k] = src[(i + k) * 4 + 3];
670-
}
671-
StoreU(Load(di, t0), di, dest[0] + i);
672-
StoreU(Load(di, t1), di, dest[1] + i);
673-
StoreU(Load(di, t2), di, dest[2] + i);
674-
StoreU(Load(di, t3), di, dest[3] + i);
631+
hn::VFromD<decltype(di)> v0, v1, v2, v3;
632+
LoadInterleaved4(di, src + i * 4, v0, v1, v2, v3);
633+
StoreU(v0, di, dest[0] + i);
634+
StoreU(v1, di, dest[1] + i);
635+
StoreU(v2, di, dest[2] + i);
636+
StoreU(v3, di, dest[3] + i);
675637
}
676638
for(; i < w; ++i)
677639
{
@@ -696,6 +658,7 @@ namespace HWY_NAMESPACE
696658
uint32_t w, int32_t adjust)
697659
{
698660
const HWY_FULL(int32_t) di;
661+
const hn::Rebind<uint8_t, decltype(di)> du8;
699662
const uint32_t L = (uint32_t)Lanes(di);
700663
const auto vAdj = Set(di, adjust);
701664

@@ -704,15 +667,10 @@ namespace HWY_NAMESPACE
704667
uint32_t i = 0;
705668
for(; i + L <= w; i += L)
706669
{
707-
auto v0 = Add(LoadU(di, src[0] + i), vAdj);
708-
auto v1 = Add(LoadU(di, src[1] + i), vAdj);
709-
auto v2 = Add(LoadU(di, src[2] + i), vAdj);
710-
for(uint32_t k = 0; k < L; ++k)
711-
{
712-
dest[(i + k) * 3 + 0] = (uint8_t)ExtractLane(v0, k);
713-
dest[(i + k) * 3 + 1] = (uint8_t)ExtractLane(v1, k);
714-
dest[(i + k) * 3 + 2] = (uint8_t)ExtractLane(v2, k);
715-
}
670+
auto v0 = DemoteTo(du8, Add(LoadU(di, src[0] + i), vAdj));
671+
auto v1 = DemoteTo(du8, Add(LoadU(di, src[1] + i), vAdj));
672+
auto v2 = DemoteTo(du8, Add(LoadU(di, src[2] + i), vAdj));
673+
StoreInterleaved3(v0, v1, v2, du8, dest + i * 3);
716674
}
717675
for(; i < w; ++i)
718676
{
@@ -726,17 +684,11 @@ namespace HWY_NAMESPACE
726684
uint32_t i = 0;
727685
for(; i + L <= w; i += L)
728686
{
729-
auto v0 = Add(LoadU(di, src[0] + i), vAdj);
730-
auto v1 = Add(LoadU(di, src[1] + i), vAdj);
731-
auto v2 = Add(LoadU(di, src[2] + i), vAdj);
732-
auto v3 = Add(LoadU(di, src[3] + i), vAdj);
733-
for(uint32_t k = 0; k < L; ++k)
734-
{
735-
dest[(i + k) * 4 + 0] = (uint8_t)ExtractLane(v0, k);
736-
dest[(i + k) * 4 + 1] = (uint8_t)ExtractLane(v1, k);
737-
dest[(i + k) * 4 + 2] = (uint8_t)ExtractLane(v2, k);
738-
dest[(i + k) * 4 + 3] = (uint8_t)ExtractLane(v3, k);
739-
}
687+
auto v0 = DemoteTo(du8, Add(LoadU(di, src[0] + i), vAdj));
688+
auto v1 = DemoteTo(du8, Add(LoadU(di, src[1] + i), vAdj));
689+
auto v2 = DemoteTo(du8, Add(LoadU(di, src[2] + i), vAdj));
690+
auto v3 = DemoteTo(du8, Add(LoadU(di, src[3] + i), vAdj));
691+
StoreInterleaved4(v0, v1, v2, v3, du8, dest + i * 4);
740692
}
741693
for(; i < w; ++i)
742694
{
@@ -761,6 +713,7 @@ namespace HWY_NAMESPACE
761713
uint32_t w, int32_t adjust)
762714
{
763715
const HWY_FULL(int32_t) di;
716+
const hn::Rebind<uint16_t, decltype(di)> du16;
764717
const uint32_t L = (uint32_t)Lanes(di);
765718
const auto vAdj = Set(di, adjust);
766719

@@ -769,15 +722,10 @@ namespace HWY_NAMESPACE
769722
uint32_t i = 0;
770723
for(; i + L <= w; i += L)
771724
{
772-
auto v0 = Add(LoadU(di, src[0] + i), vAdj);
773-
auto v1 = Add(LoadU(di, src[1] + i), vAdj);
774-
auto v2 = Add(LoadU(di, src[2] + i), vAdj);
775-
for(uint32_t k = 0; k < L; ++k)
776-
{
777-
dest[(i + k) * 3 + 0] = (uint16_t)ExtractLane(v0, k);
778-
dest[(i + k) * 3 + 1] = (uint16_t)ExtractLane(v1, k);
779-
dest[(i + k) * 3 + 2] = (uint16_t)ExtractLane(v2, k);
780-
}
725+
auto v0 = DemoteTo(du16, Add(LoadU(di, src[0] + i), vAdj));
726+
auto v1 = DemoteTo(du16, Add(LoadU(di, src[1] + i), vAdj));
727+
auto v2 = DemoteTo(du16, Add(LoadU(di, src[2] + i), vAdj));
728+
StoreInterleaved3(v0, v1, v2, du16, dest + i * 3);
781729
}
782730
for(; i < w; ++i)
783731
{
@@ -791,17 +739,11 @@ namespace HWY_NAMESPACE
791739
uint32_t i = 0;
792740
for(; i + L <= w; i += L)
793741
{
794-
auto v0 = Add(LoadU(di, src[0] + i), vAdj);
795-
auto v1 = Add(LoadU(di, src[1] + i), vAdj);
796-
auto v2 = Add(LoadU(di, src[2] + i), vAdj);
797-
auto v3 = Add(LoadU(di, src[3] + i), vAdj);
798-
for(uint32_t k = 0; k < L; ++k)
799-
{
800-
dest[(i + k) * 4 + 0] = (uint16_t)ExtractLane(v0, k);
801-
dest[(i + k) * 4 + 1] = (uint16_t)ExtractLane(v1, k);
802-
dest[(i + k) * 4 + 2] = (uint16_t)ExtractLane(v2, k);
803-
dest[(i + k) * 4 + 3] = (uint16_t)ExtractLane(v3, k);
804-
}
742+
auto v0 = DemoteTo(du16, Add(LoadU(di, src[0] + i), vAdj));
743+
auto v1 = DemoteTo(du16, Add(LoadU(di, src[1] + i), vAdj));
744+
auto v2 = DemoteTo(du16, Add(LoadU(di, src[2] + i), vAdj));
745+
auto v3 = DemoteTo(du16, Add(LoadU(di, src[3] + i), vAdj));
746+
StoreInterleaved4(v0, v1, v2, v3, du16, dest + i * 4);
805747
}
806748
for(; i < w; ++i)
807749
{

0 commit comments

Comments
 (0)