Skip to content

Commit b8ca990

Browse files
committed
rng: Add AVX2 + NEON asm.
Merge after minio#228 to include arm64 checks.

Before/after...

```
pkg: github.com/minio/pkg/v3/rng
cpu: AMD Ryzen 9 9950X 16-Core Processor
BenchmarkReader
BenchmarkReader/1000-32 46546988 25.88 ns/op 38635.30 MB/s 0 B/op 0 allocs/op
BenchmarkReader/1024-32 70920727 17.14 ns/op 59755.33 MB/s 0 B/op 0 allocs/op
BenchmarkReader/16384-32 5805674 204.9 ns/op 79950.02 MB/s 0 B/op 0 allocs/op
BenchmarkReader/1048576-32 92539 14080 ns/op 74470.24 MB/s 0 B/op 0 allocs/op

BenchmarkReader/1000-32 52974752 22.57 ns/op 44300.70 MB/s 0 B/op 0 allocs/op
BenchmarkReader/1024-32 100000000 11.37 ns/op 90096.95 MB/s 0 B/op 0 allocs/op
BenchmarkReader/16384-32 14598060 81.69 ns/op 200552.58 MB/s 0 B/op 0 allocs/op
BenchmarkReader/1048576-32 174301 6384 ns/op 164256.53 MB/s 0 B/op 0 allocs/op
```
1 parent 54ff789 commit b8ca990

6 files changed

Lines changed: 256 additions & 11 deletions

File tree

rng/reader_test.go

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package rng
1919

2020
import (
2121
"bytes"
22+
"encoding/binary"
2223
"io"
2324
"math/rand"
2425
"strconv"
@@ -188,3 +189,114 @@ func TestXor(t *testing.T) {
188189
}
189190
}
190191
}
192+
193+
func TestXorZeroKey(t *testing.T) {
194+
var keys [4]uint64
195+
for _, size := range []int{0, 32, 64, 96, 128, 1024} {
196+
in := make([]byte, size)
197+
for i := range in {
198+
in[i] = byte(i)
199+
}
200+
out := make([]byte, size)
201+
xorSlice(in, out, &keys)
202+
if !bytes.Equal(in, out) {
203+
t.Fatalf("size %d: zero-key xor should copy input\nexpected %x\ngot %x", size, in, out)
204+
}
205+
out2 := make([]byte, size)
206+
xor32Go(in, out2, &keys)
207+
if !bytes.Equal(in, out2) {
208+
t.Fatalf("size %d: zero-key xor32Go should copy input", size)
209+
}
210+
}
211+
}
212+
213+
func TestXorDoubleApply(t *testing.T) {
214+
rng := rand.New(rand.NewSource(42))
215+
var keys [4]uint64
216+
for i := range keys {
217+
keys[i] = rng.Uint64()
218+
}
219+
for _, size := range []int{32, 64, 96, 128, 256, 1024, 4096} {
220+
in := make([]byte, size)
221+
_, _ = io.ReadFull(rng, in)
222+
orig := make([]byte, size)
223+
copy(orig, in)
224+
225+
tmp := make([]byte, size)
226+
out := make([]byte, size)
227+
xorSlice(in, tmp, &keys)
228+
xorSlice(tmp, out, &keys)
229+
if !bytes.Equal(orig, out) {
230+
t.Fatalf("size %d: double xor should return original\nexpected %x\ngot %x", size, orig[:32], out[:32])
231+
}
232+
}
233+
}
234+
235+
func TestXorAllSizes(t *testing.T) {
236+
rng := rand.New(rand.NewSource(99))
237+
var keys [4]uint64
238+
for i := range keys {
239+
keys[i] = rng.Uint64()
240+
}
241+
in := make([]byte, 8192)
242+
_, _ = io.ReadFull(rng, in)
243+
244+
// Every multiple of 32 up to 8192 exercises different loop paths:
245+
// 0 (empty), 32 (single block), 64 (two blocks / 64-byte loop only),
246+
// 96 (64-byte loop + 32-byte tail), etc.
247+
for size := 0; size <= len(in); size += 32 {
248+
outAsm := make([]byte, size)
249+
outGo := make([]byte, size)
250+
xorSlice(in[:size], outAsm, &keys)
251+
xor32Go(in[:size], outGo, &keys)
252+
if !bytes.Equal(outAsm, outGo) {
253+
t.Fatalf("size %d: asm and Go disagree\nasm %x\ngo %x", size, outAsm[:min(64, size)], outGo[:min(64, size)])
254+
}
255+
}
256+
}
257+
258+
func TestXorDistinctKeys(t *testing.T) {
259+
in := make([]byte, 256)
260+
for i := range in {
261+
in[i] = byte(i)
262+
}
263+
keys1 := [4]uint64{1, 2, 3, 4}
264+
keys2 := [4]uint64{5, 6, 7, 8}
265+
out1 := make([]byte, 256)
266+
out2 := make([]byte, 256)
267+
xorSlice(in, out1, &keys1)
268+
xorSlice(in, out2, &keys2)
269+
if bytes.Equal(out1, out2) {
270+
t.Fatal("different keys should produce different output")
271+
}
272+
}
273+
274+
func TestXorKnownValues(t *testing.T) {
275+
in := make([]byte, 32)
276+
for i := range in {
277+
in[i] = byte(i)
278+
}
279+
keys := [4]uint64{0x0807060504030201, 0x100f0e0d0c0b0a09, 0x1817161514131211, 0x201f1e1d1c1b1a19}
280+
out := make([]byte, 32)
281+
xor32Go(in, out, &keys)
282+
283+
// Each 8-byte block: out[i..i+7] = in[i..i+7] XOR keys[i/8]
284+
// Block 0: in[0..7] = 00 01 02 03 04 05 06 07, key = 01 02 03 04 05 06 07 08 (LE)
285+
// XOR => 01 03 01 07 01 03 01 0f
286+
expected := make([]byte, 32)
287+
for i := 0; i < 32; i++ {
288+
keyBytes := make([]byte, 8)
289+
binary.LittleEndian.PutUint64(keyBytes, keys[i/8])
290+
expected[i] = in[i] ^ keyBytes[i%8]
291+
}
292+
if !bytes.Equal(out, expected) {
293+
t.Fatalf("known values mismatch\nexpected %x\ngot %x", expected, out)
294+
}
295+
296+
// Now verify xorSlice matches
297+
outAsm := make([]byte, 32)
298+
xorSlice(in, outAsm, &keys)
299+
if !bytes.Equal(outAsm, expected) {
300+
t.Fatalf("xorSlice known values mismatch\nexpected %x\ngot %x", expected, outAsm)
301+
}
302+
}

rng/xor_amd64.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,18 @@
1919

2020
package rng
2121

22+
import "github.com/klauspost/cpuid/v2"
23+
24+
func xorSlice(in, out []byte, v *[4]uint64) {
25+
if cpuid.CPU.Has(cpuid.AVX2) {
26+
xorSliceAvx2(in, out, v)
27+
} else {
28+
xorSliceSSE2(in, out, v)
29+
}
30+
}
31+
32+
//go:noescape
33+
func xorSliceSSE2(in, out []byte, v *[4]uint64)
34+
2235
//go:noescape
23-
func xorSlice(in, out []byte, v *[4]uint64)
36+
func xorSliceAvx2(in, out []byte, v *[4]uint64)

rng/xor_amd64.s

Lines changed: 46 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,15 @@
1919
//+build !appengine
2020
//+build !gccgo
2121

22-
// func xorSlice(in, out []byte, v *[4]uint64)
23-
TEXT ·xorSlice(SB), 7, $0
24-
MOVQ v+48(FP), AX // AX: v
25-
MOVQ in+0(FP), SI // SI: &in
26-
MOVQ out+24(FP), DX // DX: &out
27-
MOVQ out_len+32(FP), R9 // R9: len(out)
28-
MOVOU (AX), X0 // v[x]
29-
MOVOU 16(AX), X1 // v[x+2]
30-
SHRQ $5, R9 // len(in) / 32
22+
// func xorSliceSSE2(in, out []byte, v *[4]uint64)
23+
TEXT ·xorSliceSSE2(SB), 7, $0
24+
MOVQ v+48(FP), AX // AX: v
25+
MOVQ in+0(FP), SI // SI: &in
26+
MOVQ out+24(FP), DX // DX: &out
27+
MOVQ out_len+32(FP), R9 // R9: len(out)
28+
MOVOU (AX), X0 // v[x]
29+
MOVOU 16(AX), X1 // v[x+2]
30+
SHRQ $5, R9 // R9: len(out) / 32 (number of 32-byte blocks)
3131
JZ done_xor_sse2_32
3232

3333
loopback_xor_sse2_32:
@@ -44,3 +44,40 @@ loopback_xor_sse2_32:
4444

4545
done_xor_sse2_32:
4646
RET
47+
48+
// func xorSliceAvx2(in, out []byte, v *[4]uint64)
49+
TEXT ·xorSliceAvx2(SB), 7, $0
50+
MOVQ v+48(FP), AX // AX: v
51+
MOVQ in+0(FP), SI // SI: &in
52+
MOVQ out+24(FP), DX // DX: &out
53+
MOVQ out_len+32(FP), R9 // R9: len(out)
54+
VMOVDQU (AX), Y0 // v[x]
55+
SHRQ $5, R9 // R9: len(out) / 32 (number of 32-byte blocks)
56+
CMPQ R9, $1
57+
JEQ loopback_xor_avx2_32
58+
JB end_xor_avx2_32
59+
60+
loopback_xor_avx2_64:
61+
SUBQ $2, R9
62+
VMOVDQU (SI), Y1
63+
VMOVDQU 32(SI), Y2
64+
VPXOR Y1, Y0, Y1
65+
VPXOR Y2, Y0, Y2
66+
VMOVDQU Y1, (DX)
67+
VMOVDQU Y2, 32(DX)
68+
ADDQ $64, SI // in+=64
69+
ADDQ $64, DX // out+=64
70+
CMPQ R9, $1
71+
JA loopback_xor_avx2_64
72+
JEQ loopback_xor_avx2_32
73+
JMP end_xor_avx2_32
74+
75+
loopback_xor_avx2_32:
76+
VPXOR (SI), Y0, Y1
77+
VMOVDQU Y1, (DX)
78+
ADDQ $32, SI // in+=32
79+
ADDQ $32, DX // out+=32
80+
81+
end_xor_avx2_32:
82+
VZEROUPPER
83+
RET

rng/xor_arm64.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
// Copyright (c) 2015-2026 MinIO, Inc.
2+
//
3+
// This file is part of MinIO Object Storage stack
4+
//
5+
// This program is free software: you can redistribute it and/or modify
6+
// it under the terms of the GNU Affero General Public License as published by
7+
// the Free Software Foundation, either version 3 of the License, or
8+
// (at your option) any later version.
9+
//
10+
// This program is distributed in the hope that it will be useful
11+
// but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
// GNU Affero General Public License for more details.
14+
//
15+
// You should have received a copy of the GNU Affero General Public License
16+
// along with this program. If not, see <http://www.gnu.org/licenses/>.
17+
18+
//go:build !noasm && !appengine && !gccgo
19+
20+
package rng
21+
22+
func xorSlice(in, out []byte, v *[4]uint64) {
23+
xorSliceNEON(in, out, v)
24+
}
25+
26+
//go:noescape
27+
func xorSliceNEON(in, out []byte, v *[4]uint64)

rng/xor_arm64.s

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
// Copyright (c) 2015-2026 MinIO, Inc.
2+
//
3+
// This file is part of MinIO Object Storage stack
4+
//
5+
// This program is free software: you can redistribute it and/or modify
6+
// it under the terms of the GNU Affero General Public License as published by
7+
// the Free Software Foundation, either version 3 of the License, or
8+
// (at your option) any later version.
9+
//
10+
// This program is distributed in the hope that it will be useful
11+
// but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
// GNU Affero General Public License for more details.
14+
//
15+
// You should have received a copy of the GNU Affero General Public License
16+
// along with this program. If not, see <http://www.gnu.org/licenses/>.
17+
18+
//+build !noasm
19+
//+build !appengine
20+
//+build !gccgo
21+
22+
// func xorSliceNEON(in, out []byte, v *[4]uint64)
23+
TEXT ·xorSliceNEON(SB), 7, $0
24+
MOVD v+48(FP), R2
25+
MOVD in+0(FP), R0
26+
MOVD out+24(FP), R1
27+
MOVD out_len+32(FP), R3
28+
VLD1 (R2), [V0.B16, V1.B16]
29+
LSR $5, R3, R3
30+
CBZ R3, done_xor_neon
31+
CMP $1, R3
32+
BEQ loopback_xor_neon_32
33+
34+
loopback_xor_neon_64:
35+
SUB $2, R3, R3
36+
VLD1 (R0), [V2.B16, V3.B16, V4.B16, V5.B16]
37+
VEOR V0.B16, V2.B16, V2.B16
38+
VEOR V1.B16, V3.B16, V3.B16
39+
VEOR V0.B16, V4.B16, V4.B16
40+
VEOR V1.B16, V5.B16, V5.B16
41+
VST1 [V2.B16, V3.B16, V4.B16, V5.B16], (R1)
42+
ADD $64, R0
43+
ADD $64, R1
44+
CMP $1, R3
45+
BGT loopback_xor_neon_64
46+
BEQ loopback_xor_neon_32
47+
B done_xor_neon
48+
49+
loopback_xor_neon_32:
50+
VLD1 (R0), [V2.B16, V3.B16]
51+
VEOR V0.B16, V2.B16, V2.B16
52+
VEOR V1.B16, V3.B16, V3.B16
53+
VST1 [V2.B16, V3.B16], (R1)
54+
55+
done_xor_neon:
56+
RET

rng/xor_noasm.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
//go:build !amd64 || noasm || appengine || gccgo
1+
//go:build (!amd64 && !arm64) || noasm || appengine || gccgo
22

33
// Copyright (c) 2015-2021 MinIO, Inc.
44
//

0 commit comments

Comments
 (0)