Skip to content

Commit 4972fb5

Browse files
rng: Add AVX2 + NEON asm. (#229)
* rng: Add AVX2 + NEON asm. Merge after #228 to include arm64 checks. Before/after... ``` pkg: github.com/minio/pkg/v3/rng cpu: AMD Ryzen 9 9950X 16-Core Processor BenchmarkReader BenchmarkReader/1000-32 46546988 25.88 ns/op 38635.30 MB/s 0 B/op 0 allocs/op BenchmarkReader/1024-32 70920727 17.14 ns/op 59755.33 MB/s 0 B/op 0 allocs/op BenchmarkReader/16384-32 5805674 204.9 ns/op 79950.02 MB/s 0 B/op 0 allocs/op BenchmarkReader/1048576-32 92539 14080 ns/op 74470.24 MB/s 0 B/op 0 allocs/op BenchmarkReader/1000-32 52974752 22.57 ns/op 44300.70 MB/s 0 B/op 0 allocs/op BenchmarkReader/1024-32 100000000 11.37 ns/op 90096.95 MB/s 0 B/op 0 allocs/op BenchmarkReader/16384-32 14598060 81.69 ns/op 200552.58 MB/s 0 B/op 0 allocs/op BenchmarkReader/1048576-32 174301 6384 ns/op 164256.53 MB/s 0 B/op 0 allocs/op ``` * Test AVX2 and SSE2 in tests. * Add missing tag. Test tags. --------- Co-authored-by: Harshavardhana <harsha@minio.io>
1 parent 8c246ec commit 4972fb5

8 files changed

Lines changed: 314 additions & 36 deletions

File tree

Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ lint-fix: getdeps
1919
test: lint
2020
@echo "Running unit tests"
2121
@go test -race -tags kqueue ./...
22+
@go test -tags kqueue,noasm ./...
23+
@go test -tags kqueue,purego ./...
24+
@go test -tags kqueue,nounsafe,noasm ./...
2225

2326
test-ldap: lint
2427
@echo "Running unit tests for LDAP with LDAP server at '"${LDAP_TEST_SERVER}"'"

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ require (
1515
github.com/go-ldap/ldap/v3 v3.4.12
1616
github.com/go-openapi/swag/conv v0.24.0
1717
github.com/golang-jwt/jwt/v4 v4.5.2
18+
github.com/klauspost/cpuid/v2 v2.3.0
1819
github.com/lestrrat-go/jwx/v3 v3.0.12
1920
github.com/mattn/go-colorable v0.1.14
2021
github.com/mattn/go-isatty v0.0.20
@@ -251,7 +252,6 @@ require (
251252
github.com/golang/protobuf v1.5.4 // indirect
252253
github.com/google/uuid v1.6.0 // indirect
253254
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 // indirect
254-
github.com/klauspost/cpuid/v2 v2.3.0 // indirect
255255
github.com/lestrrat-go/blackmagic v1.0.4 // indirect
256256
github.com/lestrrat-go/httpcc v1.0.1 // indirect
257257
github.com/lestrrat-go/option v1.0.1 // indirect

rng/reader_test.go

Lines changed: 163 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,13 @@ package rng
1919

2020
import (
2121
"bytes"
22+
"encoding/binary"
2223
"io"
2324
"math/rand"
2425
"strconv"
2526
"testing"
27+
28+
"github.com/klauspost/cpuid/v2"
2629
)
2730

2831
func TestSubkeysInitialized(t *testing.T) {
@@ -204,36 +207,173 @@ func TestReaderSeeker(t *testing.T) {
204207
}
205208
}
206209

210+
func forEachXorImpl(t *testing.T, fn func(t *testing.T)) {
211+
t.Helper()
212+
if !cpuid.CPU.Has(cpuid.SSE2) {
213+
fn(t)
214+
return
215+
}
216+
avx2 := cpuid.CPU.Has(cpuid.AVX2)
217+
if avx2 {
218+
t.Run("AVX2", func(t *testing.T) {
219+
fn(t)
220+
})
221+
t.Run("SSE2", func(t *testing.T) {
222+
cpuid.CPU.Disable(cpuid.AVX2)
223+
defer cpuid.CPU.Enable(cpuid.AVX2)
224+
fn(t)
225+
})
226+
return
227+
228+
}
229+
fn(t)
230+
}
231+
207232
func TestXor(t *testing.T) {
208-
// Validate asm, if any, otherwise validate ourselves.
209-
rng := rand.New(rand.NewSource(0))
210-
for _, size := range []int{1000, 1024, 16384, 1 << 20} {
211-
bufIn := make([]byte, size)
212-
_, err := io.ReadFull(rng, bufIn)
213-
if err != nil {
214-
t.Fatal(err)
233+
forEachXorImpl(t, func(t *testing.T) {
234+
rng := rand.New(rand.NewSource(0))
235+
for _, size := range []int{1000, 1024, 16384, 1 << 20} {
236+
bufIn := make([]byte, size)
237+
_, err := io.ReadFull(rng, bufIn)
238+
if err != nil {
239+
t.Fatal(err)
240+
}
241+
bufOut := make([]byte, size)
242+
bufOut2 := make([]byte, size)
243+
var keys [4]uint64
244+
for i := range keys {
245+
keys[i] = rng.Uint64()
246+
}
247+
for i := 0; i < 1000; i++ {
248+
bSize := (rand.Intn(size) / 32) * 32
249+
bufOut := bufOut[:bSize]
250+
for i := 0; i < len(bufOut); i++ {
251+
bufOut[i] = 0
252+
}
253+
bufOut2 := bufOut2[:bSize]
254+
for i := 0; i < len(bufOut2); i++ {
255+
bufOut2[i] = 0
256+
}
257+
xorSlice(bufIn, bufOut, &keys)
258+
xor32Go(bufIn, bufOut2, &keys)
259+
if !bytes.Equal(bufOut, bufOut2) {
260+
t.Fatalf("\nexpected %x\ngot %x", bufOut, bufOut2)
261+
}
262+
}
215263
}
216-
bufOut := make([]byte, size)
217-
bufOut2 := make([]byte, size)
264+
})
265+
}
266+
267+
func TestXorZeroKey(t *testing.T) {
268+
forEachXorImpl(t, func(t *testing.T) {
269+
var keys [4]uint64
270+
for _, size := range []int{0, 32, 64, 96, 128, 1024} {
271+
in := make([]byte, size)
272+
for i := range in {
273+
in[i] = byte(i)
274+
}
275+
out := make([]byte, size)
276+
xorSlice(in, out, &keys)
277+
if !bytes.Equal(in, out) {
278+
t.Fatalf("size %d: zero-key xor should copy input\nexpected %x\ngot %x", size, in, out)
279+
}
280+
out2 := make([]byte, size)
281+
xor32Go(in, out2, &keys)
282+
if !bytes.Equal(in, out2) {
283+
t.Fatalf("size %d: zero-key xor32Go should copy input", size)
284+
}
285+
}
286+
})
287+
}
288+
289+
func TestXorDoubleApply(t *testing.T) {
290+
forEachXorImpl(t, func(t *testing.T) {
291+
rng := rand.New(rand.NewSource(42))
218292
var keys [4]uint64
219293
for i := range keys {
220294
keys[i] = rng.Uint64()
221295
}
222-
for i := 0; i < 1000; i++ {
223-
bSize := (rand.Intn(size) / 32) * 32
224-
bufOut := bufOut[:bSize]
225-
for i := 0; i < len(bufOut); i++ {
226-
bufOut[i] = 0
227-
}
228-
bufOut2 := bufOut2[:bSize]
229-
for i := 0; i < len(bufOut2); i++ {
230-
bufOut2[i] = 0
296+
for _, size := range []int{32, 64, 96, 128, 256, 1024, 4096} {
297+
in := make([]byte, size)
298+
_, _ = io.ReadFull(rng, in)
299+
orig := make([]byte, size)
300+
copy(orig, in)
301+
302+
tmp := make([]byte, size)
303+
out := make([]byte, size)
304+
xorSlice(in, tmp, &keys)
305+
xorSlice(tmp, out, &keys)
306+
if !bytes.Equal(orig, out) {
307+
t.Fatalf("size %d: double xor should return original\nexpected %x\ngot %x", size, orig[:32], out[:32])
231308
}
232-
xorSlice(bufIn, bufOut, &keys)
233-
xor32Go(bufIn, bufOut2, &keys)
234-
if !bytes.Equal(bufOut, bufOut2) {
235-
t.Fatalf("\nexpected %x\ngot %x", bufOut, bufOut2)
309+
}
310+
})
311+
}
312+
313+
func TestXorAllSizes(t *testing.T) {
314+
forEachXorImpl(t, func(t *testing.T) {
315+
rng := rand.New(rand.NewSource(99))
316+
var keys [4]uint64
317+
for i := range keys {
318+
keys[i] = rng.Uint64()
319+
}
320+
in := make([]byte, 8192)
321+
_, _ = io.ReadFull(rng, in)
322+
323+
for size := 0; size <= len(in); size += 32 {
324+
outAsm := make([]byte, size)
325+
outGo := make([]byte, size)
326+
xorSlice(in[:size], outAsm, &keys)
327+
xor32Go(in[:size], outGo, &keys)
328+
if !bytes.Equal(outAsm, outGo) {
329+
t.Fatalf("size %d: asm and Go disagree\nasm %x\ngo %x", size, outAsm[:min(64, size)], outGo[:min(64, size)])
236330
}
237331
}
238-
}
332+
})
333+
}
334+
335+
func TestXorDistinctKeys(t *testing.T) {
336+
forEachXorImpl(t, func(t *testing.T) {
337+
in := make([]byte, 256)
338+
for i := range in {
339+
in[i] = byte(i)
340+
}
341+
keys1 := [4]uint64{1, 2, 3, 4}
342+
keys2 := [4]uint64{5, 6, 7, 8}
343+
out1 := make([]byte, 256)
344+
out2 := make([]byte, 256)
345+
xorSlice(in, out1, &keys1)
346+
xorSlice(in, out2, &keys2)
347+
if bytes.Equal(out1, out2) {
348+
t.Fatal("different keys should produce different output")
349+
}
350+
})
351+
}
352+
353+
func TestXorKnownValues(t *testing.T) {
354+
forEachXorImpl(t, func(t *testing.T) {
355+
in := make([]byte, 32)
356+
for i := range in {
357+
in[i] = byte(i)
358+
}
359+
keys := [4]uint64{0x0807060504030201, 0x100f0e0d0c0b0a09, 0x1817161514131211, 0x201f1e1d1c1b1a19}
360+
out := make([]byte, 32)
361+
xor32Go(in, out, &keys)
362+
363+
expected := make([]byte, 32)
364+
for i := 0; i < 32; i++ {
365+
keyBytes := make([]byte, 8)
366+
binary.LittleEndian.PutUint64(keyBytes, keys[i/8])
367+
expected[i] = in[i] ^ keyBytes[i%8]
368+
}
369+
if !bytes.Equal(out, expected) {
370+
t.Fatalf("known values mismatch\nexpected %x\ngot %x", expected, out)
371+
}
372+
373+
outAsm := make([]byte, 32)
374+
xorSlice(in, outAsm, &keys)
375+
if !bytes.Equal(outAsm, expected) {
376+
t.Fatalf("xorSlice known values mismatch\nexpected %x\ngot %x", expected, outAsm)
377+
}
378+
})
239379
}

rng/xor_amd64.go

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,22 @@
1515
// You should have received a copy of the GNU Affero General Public License
1616
// along with this program. If not, see <http://www.gnu.org/licenses/>.
1717

18-
//go:build !noasm && !appengine && !gccgo
18+
//go:build !noasm && !appengine && !gccgo && !purego
1919

2020
package rng
2121

22+
import "github.com/klauspost/cpuid/v2"
23+
24+
func xorSlice(in, out []byte, v *[4]uint64) {
25+
if cpuid.CPU.Has(cpuid.AVX2) {
26+
xorSliceAvx2(in, out, v)
27+
} else {
28+
xorSliceSSE2(in, out, v)
29+
}
30+
}
31+
32+
// xorSliceSSE2 is the SSE2 implementation behind xorSlice; its body is the
// assembly routine of the same name in xor_amd64.s. That routine loads the
// 32 bytes of v into two XMM registers and derives its block count from
// len(out)/32.
// NOTE(review): the loop body is not visible in this diff — presumably it
// XORs in with v into out in 32-byte blocks; confirm against xor_amd64.s.
//
//go:noescape
33+
func xorSliceSSE2(in, out []byte, v *[4]uint64)
34+
2235
// xorSliceAvx2 is the AVX2 implementation behind xorSlice; its body is the
// assembly routine of the same name in xor_amd64.s. It XORs in with the 32
// bytes of v and stores the result to out, processing len(out)/32 whole
// 32-byte blocks. Any remaining len(out)%32 bytes are left unwritten, and
// len(in) is not consulted, so callers must supply at least as many input
// bytes as are processed.
//
//go:noescape
23-
func xorSlice(in, out []byte, v *[4]uint64)
36+
func xorSliceAvx2(in, out []byte, v *[4]uint64)

rng/xor_amd64.s

Lines changed: 47 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,17 @@
1818
//+build !noasm
1919
//+build !appengine
2020
//+build !gccgo
21+
//+build !purego
2122

22-
// func xorSlice(in, out []byte, v *[4]uint64)
23-
TEXT ·xorSlice(SB), 7, $0
24-
MOVQ v+48(FP), AX // AX: v
25-
MOVQ in+0(FP), SI // SI: &in
26-
MOVQ out+24(FP), DX // DX: &out
27-
MOVQ out_len+32(FP), R9 // R9: len(out)
28-
MOVOU (AX), X0 // v[x]
29-
MOVOU 16(AX), X1 // v[x+2]
30-
SHRQ $5, R9 // len(in) / 32
23+
// func xorSliceSSE2(in, out []byte, v *[4]uint64)
24+
TEXT ·xorSliceSSE2(SB), 7, $0
25+
MOVQ v+48(FP), AX // AX: v
26+
MOVQ in+0(FP), SI // SI: &in
27+
MOVQ out+24(FP), DX // DX: &out
28+
MOVQ out_len+32(FP), R9 // R9: len(out)
29+
MOVOU (AX), X0 // v[x]
30+
MOVOU 16(AX), X1 // v[x+2]
31+
SHRQ $5, R9 // len(in) / 32
3132
JZ done_xor_sse2_32
3233

3334
loopback_xor_sse2_32:
@@ -44,3 +45,40 @@ loopback_xor_sse2_32:
4445

4546
done_xor_sse2_32:
4647
RET
48+
49+
// func xorSliceAvx2(in, out []byte, v *[4]uint64)
//
// XORs in with the 32-byte key at v and stores the result to out, one
// 32-byte block at a time. The block count is len(out)/32; a trailing
// len(out)%32 bytes are not written. len(in) is never loaded, so the caller
// must guarantee in holds at least as many bytes as are processed.
// The main loop handles two blocks (64 bytes) per iteration; an odd final
// block is handled by the single-block tail.
// NOTE(review): flag value 7 is presumably NOPROF|DUPOK|NOSPLIT from
// textflag.h — confirm.
50+
TEXT ·xorSliceAvx2(SB), 7, $0
51+
MOVQ v+48(FP), AX // AX: v
52+
MOVQ in+0(FP), SI // SI: &in
53+
MOVQ out+24(FP), DX // DX: &out
54+
MOVQ out_len+32(FP), R9 // R9: len(out)
55+
VMOVDQU (AX), Y0 // Y0: all four key words v[0..3] (32 bytes)
56+
SHRQ $5, R9 // R9 = len(out) / 32 = number of whole 32-byte blocks
57+
CMPQ R9, $1
58+
JEQ loopback_xor_avx2_32 // exactly one block: go straight to the tail
59+
JB end_xor_avx2_32 // zero blocks: nothing to do
60+
61+
// Two or more blocks remain on entry to each iteration.
loopback_xor_avx2_64:
62+
SUBQ $2, R9 // two blocks consumed per iteration
63+
VMOVDQU (SI), Y1
64+
VMOVDQU 32(SI), Y2
65+
VPXOR Y1, Y0, Y1 // Y1 = in[0:32] ^ key
66+
VPXOR Y2, Y0, Y2 // Y2 = in[32:64] ^ key
67+
VMOVDQU Y1, (DX)
68+
VMOVDQU Y2, 32(DX)
69+
ADDQ $64, SI // in+=64
70+
ADDQ $64, DX // out+=64
71+
CMPQ R9, $1
72+
JA loopback_xor_avx2_64 // more than one block left: another 64-byte pass
73+
JEQ loopback_xor_avx2_32 // exactly one block left: single-block tail
74+
JMP end_xor_avx2_32 // none left
75+
76+
// Single 32-byte block; reached at most once and falls through to the end.
loopback_xor_avx2_32:
77+
VPXOR (SI), Y0, Y1 // Y1 = in[0:32] ^ key, input read directly from memory
78+
VMOVDQU Y1, (DX)
79+
ADDQ $32, SI // in+=32
80+
ADDQ $32, DX // out+=32
81+
82+
end_xor_avx2_32:
83+
VZEROUPPER // zero the upper YMM halves before returning to non-AVX code
84+
RET

rng/xor_arm64.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
// Copyright (c) 2015-2026 MinIO, Inc.
2+
//
3+
// This file is part of MinIO Object Storage stack
4+
//
5+
// This program is free software: you can redistribute it and/or modify
6+
// it under the terms of the GNU Affero General Public License as published by
7+
// the Free Software Foundation, either version 3 of the License, or
8+
// (at your option) any later version.
9+
//
10+
// This program is distributed in the hope that it will be useful
11+
// but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
// GNU Affero General Public License for more details.
14+
//
15+
// You should have received a copy of the GNU Affero General Public License
16+
// along with this program. If not, see <http://www.gnu.org/licenses/>.
17+
18+
//go:build !noasm && !appengine && !gccgo && !purego
19+
20+
package rng
21+
22+
// xorSlice is the arm64 entry point used when the assembly build tags are
// active; it forwards its arguments unchanged to xorSliceNEON.
// NOTE(review): xorSliceNEON is declared without a Go body, so it is
// presumably implemented in an assembly file not shown here — confirm.
func xorSlice(in, out []byte, v *[4]uint64) {
23+
xorSliceNEON(in, out, v)
24+
}
25+
26+
//go:noescape
27+
func xorSliceNEON(in, out []byte, v *[4]uint64)

0 commit comments

Comments
 (0)