Skip to content

Commit d3ca975

Browse files
committed
2 parents 91b646c + 5276ea1 commit d3ca975

10 files changed

Lines changed: 188 additions & 65 deletions

File tree

crypto/crc32c.c

Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
// SPDX-License-Identifier: GPL-2.0-or-later
22
/*
3-
* Cryptographic API.
4-
*
5-
* CRC32C chksum
3+
* crypto_shash support for CRC-32C
64
*
75
*@Article{castagnoli-crc,
86
* author = { Guy Castagnoli and Stefan Braeuer and Martin Herrman},
@@ -15,16 +13,6 @@
1513
* pages = {},
1614
* month = {June},
1715
*}
18-
* Used by the iSCSI driver, possibly others, and derived from
19-
* the iscsi-crc.c module of the linux-iscsi driver at
20-
* http://linux-iscsi.sourceforge.net.
21-
*
22-
* Following the example of lib/crc32, this function is intended to be
23-
* flexible and useful for all users. Modules that currently have their
24-
* own crc32c, but hopefully may be able to use this one are:
25-
* net/sctp (please add all your doco to here if you change to
26-
* use this one!)
27-
* <endoflist>
2816
*
2917
* Copyright (c) 2004 Cisco Systems, Inc.
3018
* Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
@@ -49,11 +37,6 @@ struct chksum_desc_ctx {
4937
u32 crc;
5038
};
5139

52-
/*
53-
* Steps through buffer one byte at a time, calculates reflected
54-
* crc using table.
55-
*/
56-
5740
static int chksum_init(struct shash_desc *desc)
5841
{
5942
struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);

lib/crc/.kunitconfig

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
CONFIG_KUNIT=y
2+
CONFIG_CRC_ENABLE_ALL_FOR_KUNIT=y
3+
CONFIG_CRC_KUNIT_TEST=y

lib/crc/Kconfig

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ config CRC_T10DIF_ARCH
4848
bool
4949
depends on CRC_T10DIF && CRC_OPTIMIZATIONS
5050
default y if ARM && KERNEL_MODE_NEON
51-
default y if ARM64 && KERNEL_MODE_NEON
51+
default y if ARM64
5252
default y if PPC64 && ALTIVEC
5353
default y if RISCV && RISCV_ISA_ZBC
5454
default y if X86
@@ -82,6 +82,7 @@ config CRC64
8282
config CRC64_ARCH
8383
bool
8484
depends on CRC64 && CRC_OPTIMIZATIONS
85+
default y if ARM64
8586
default y if RISCV && RISCV_ISA_ZBC && 64BIT
8687
default y if X86_64
8788

@@ -99,18 +100,27 @@ config CRC_OPTIMIZATIONS
99100

100101
config CRC_KUNIT_TEST
101102
tristate "KUnit tests for CRC functions" if !KUNIT_ALL_TESTS
102-
depends on KUNIT
103+
depends on KUNIT && (CRC7 || CRC16 || CRC_T10DIF || CRC32 || CRC64)
103104
default KUNIT_ALL_TESTS
105+
help
106+
Unit tests for the CRC library functions.
107+
108+
This is intended to help people writing architecture-specific
109+
optimized versions. If unsure, say N.
110+
111+
config CRC_ENABLE_ALL_FOR_KUNIT
112+
tristate "Enable all CRC functions for KUnit test"
113+
depends on KUNIT
104114
select CRC7
105115
select CRC16
106116
select CRC_T10DIF
107117
select CRC32
108118
select CRC64
109119
help
110-
Unit tests for the CRC library functions.
120+
Enable all CRC functions that have test code in CRC_KUNIT_TEST.
111121

112-
This is intended to help people writing architecture-specific
113-
optimized versions. If unsure, say N.
122+
Enable this only if you'd like the CRC KUnit test suite to test all
123+
the CRC variants, even ones that wouldn't otherwise need to be built.
114124

115125
config CRC_BENCHMARK
116126
bool "Benchmark for the CRC functions"

lib/crc/Makefile

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,15 @@ obj-$(CONFIG_CRC64) += crc64.o
3838
crc64-y := crc64-main.o
3939
ifeq ($(CONFIG_CRC64_ARCH),y)
4040
CFLAGS_crc64-main.o += -I$(src)/$(SRCARCH)
41+
42+
CFLAGS_REMOVE_arm64/crc64-neon-inner.o += -mgeneral-regs-only
43+
CFLAGS_arm64/crc64-neon-inner.o += -ffreestanding -march=armv8-a+crypto
44+
CFLAGS_arm64/crc64-neon-inner.o += -isystem $(shell $(CC) -print-file-name=include)
45+
crc64-$(CONFIG_ARM64) += arm64/crc64-neon-inner.o
46+
4147
crc64-$(CONFIG_RISCV) += riscv/crc64_lsb.o riscv/crc64_msb.o
4248
crc64-$(CONFIG_X86) += x86/crc64-pclmul.o
43-
endif
49+
endif # CONFIG_CRC64_ARCH
4450

4551
obj-y += tests/
4652

lib/crc/arm64/crc-t10dif-core.S

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -181,13 +181,13 @@ SYM_FUNC_END(__pmull_p8_16x64)
181181

182182
pmull16x64_\p fold_consts, \reg1, v8
183183

184-
CPU_LE( rev64 v11.16b, v11.16b )
185-
CPU_LE( rev64 v12.16b, v12.16b )
184+
rev64 v11.16b, v11.16b
185+
rev64 v12.16b, v12.16b
186186

187187
pmull16x64_\p fold_consts, \reg2, v9
188188

189-
CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
190-
CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
189+
ext v11.16b, v11.16b, v11.16b, #8
190+
ext v12.16b, v12.16b, v12.16b, #8
191191

192192
eor \reg1\().16b, \reg1\().16b, v8.16b
193193
eor \reg2\().16b, \reg2\().16b, v9.16b
@@ -220,22 +220,22 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
220220
ldp q4, q5, [buf, #0x40]
221221
ldp q6, q7, [buf, #0x60]
222222
add buf, buf, #0x80
223-
CPU_LE( rev64 v0.16b, v0.16b )
224-
CPU_LE( rev64 v1.16b, v1.16b )
225-
CPU_LE( rev64 v2.16b, v2.16b )
226-
CPU_LE( rev64 v3.16b, v3.16b )
227-
CPU_LE( rev64 v4.16b, v4.16b )
228-
CPU_LE( rev64 v5.16b, v5.16b )
229-
CPU_LE( rev64 v6.16b, v6.16b )
230-
CPU_LE( rev64 v7.16b, v7.16b )
231-
CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
232-
CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
233-
CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
234-
CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 )
235-
CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 )
236-
CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
237-
CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
238-
CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
223+
rev64 v0.16b, v0.16b
224+
rev64 v1.16b, v1.16b
225+
rev64 v2.16b, v2.16b
226+
rev64 v3.16b, v3.16b
227+
rev64 v4.16b, v4.16b
228+
rev64 v5.16b, v5.16b
229+
rev64 v6.16b, v6.16b
230+
rev64 v7.16b, v7.16b
231+
ext v0.16b, v0.16b, v0.16b, #8
232+
ext v1.16b, v1.16b, v1.16b, #8
233+
ext v2.16b, v2.16b, v2.16b, #8
234+
ext v3.16b, v3.16b, v3.16b, #8
235+
ext v4.16b, v4.16b, v4.16b, #8
236+
ext v5.16b, v5.16b, v5.16b, #8
237+
ext v6.16b, v6.16b, v6.16b, #8
238+
ext v7.16b, v7.16b, v7.16b, #8
239239

240240
// XOR the first 16 data *bits* with the initial CRC value.
241241
movi v8.16b, #0
@@ -288,8 +288,8 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
288288
pmull16x64_\p fold_consts, v7, v8
289289
eor v7.16b, v7.16b, v8.16b
290290
ldr q0, [buf], #16
291-
CPU_LE( rev64 v0.16b, v0.16b )
292-
CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
291+
rev64 v0.16b, v0.16b
292+
ext v0.16b, v0.16b, v0.16b, #8
293293
eor v7.16b, v7.16b, v0.16b
294294
subs len, len, #16
295295
b.ge .Lfold_16_bytes_loop_\@
@@ -310,8 +310,8 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
310310
// v0 = last 16 original data bytes
311311
add buf, buf, len
312312
ldr q0, [buf, #-16]
313-
CPU_LE( rev64 v0.16b, v0.16b )
314-
CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
313+
rev64 v0.16b, v0.16b
314+
ext v0.16b, v0.16b, v0.16b, #8
315315

316316
// v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
317317
adr_l x4, .Lbyteshift_table + 16
@@ -344,8 +344,8 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
344344

345345
// Load the first 16 data bytes.
346346
ldr q7, [buf], #0x10
347-
CPU_LE( rev64 v7.16b, v7.16b )
348-
CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
347+
rev64 v7.16b, v7.16b
348+
ext v7.16b, v7.16b, v7.16b, #8
349349

350350
// XOR the first 16 data *bits* with the initial CRC value.
351351
movi v0.16b, #0
@@ -382,8 +382,8 @@ SYM_FUNC_START(crc_t10dif_pmull_p8)
382382

383383
crc_t10dif_pmull p8
384384

385-
CPU_LE( rev64 v7.16b, v7.16b )
386-
CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
385+
rev64 v7.16b, v7.16b
386+
ext v7.16b, v7.16b, v7.16b, #8
387387
str q7, [x3]
388388

389389
frame_pop

lib/crc/arm64/crc32-core.S

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,24 +29,19 @@
2929
.endm
3030

3131
.macro hwordle, reg
32-
CPU_BE( rev16 \reg, \reg )
3332
.endm
3433

3534
.macro hwordbe, reg
36-
CPU_LE( rev \reg, \reg )
35+
rev \reg, \reg
3736
rbit \reg, \reg
38-
CPU_BE( lsr \reg, \reg, #16 )
3937
.endm
4038

4139
.macro le, regs:vararg
42-
.irp r, \regs
43-
CPU_BE( rev \r, \r )
44-
.endr
4540
.endm
4641

4742
.macro be, regs:vararg
4843
.irp r, \regs
49-
CPU_LE( rev \r, \r )
44+
rev \r, \r
5045
.endr
5146
.irp r, \regs
5247
rbit \r, \r

lib/crc/arm64/crc64-neon-inner.c

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Accelerated CRC64 (NVMe) using ARM NEON C intrinsics
 *
 * Implements 128-bit carry-less-multiply folding followed by a Barrett
 * reduction, entirely with ACLE NEON/PMULL intrinsics (vmull_p64 /
 * vmull_high_p64), so it must be compiled with crypto extensions enabled
 * (see the -march=armv8-a+crypto flags added in lib/crc/Makefile).
 */

#include <linux/types.h>
#include <asm/neon-intrinsics.h>

/* Forward declaration; also declared for callers in lib/crc/arm64/crc64.h. */
u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len);

/* Extract lane 0 / lane 1 of a poly64x2_t as a scalar poly64_t for vmull_p64. */
#define GET_P64_0(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 0))
#define GET_P64_1(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 1))

/* x^191 mod G, x^127 mod G */
static const u64 fold_consts_val[2] = { 0xeadc41fd2ba3d420ULL,
					0x21e9761e252621acULL };
/* floor(x^127 / G), (G - x^64) / x */
static const u64 bconsts_val[2] = { 0x27ecfa329aef9f77ULL,
				    0x34d926535897936aULL };

/*
 * crc64_nvme_arm64_c() - fold @len bytes of @p into a CRC-64 (NVMe) value.
 * @crc: the CRC value accumulated so far; it is XORed into the low 64 bits
 *       of the first 16-byte message block before folding begins.
 * @p:   input buffer.
 * @len: number of bytes to process.
 *
 * The first 16-byte load plus the unconditional load in the do/while body
 * mean @len must be a multiple of 16 and at least 32; the wrapper in
 * lib/crc/arm64/crc64.h only calls this with 16-byte-multiple chunks of at
 * least 128 bytes, which satisfies that.
 *
 * NOTE(review): must run with NEON usable (the caller wraps the call in
 * scoped_ksimd()) — confirm no call path reaches this without it.
 *
 * Returns the updated 64-bit CRC.
 */
u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len)
{
	/* Seed the 128-bit accumulator with the incoming CRC in lane 0. */
	uint64x2_t v0_u64 = { crc, 0 };
	poly64x2_t v0 = vreinterpretq_p64_u64(v0_u64);
	poly64x2_t fold_consts =
		vreinterpretq_p64_u64(vld1q_u64(fold_consts_val));
	poly64x2_t v1 = vreinterpretq_p64_u8(vld1q_u8(p));

	/* XOR the CRC into the first 16 data bytes. */
	v0 = vreinterpretq_p64_u8(veorq_u8(vreinterpretq_u8_p64(v0),
					   vreinterpretq_u8_p64(v1)));
	p += 16;
	len -= 16;

	/*
	 * Main folding loop: each iteration folds the 128-bit accumulator
	 * across the next 16 input bytes using the two fold constants
	 * (one carry-less multiply per 64-bit half).
	 */
	do {
		v1 = vreinterpretq_p64_u8(vld1q_u8(p));

		poly128_t v2 = vmull_high_p64(fold_consts, v0);
		poly128_t v0_128 =
			vmull_p64(GET_P64_0(fold_consts), GET_P64_0(v0));

		uint8x16_t x0 = veorq_u8(vreinterpretq_u8_p128(v0_128),
					 vreinterpretq_u8_p128(v2));

		x0 = veorq_u8(x0, vreinterpretq_u8_p64(v1));
		v0 = vreinterpretq_p64_u8(x0);

		p += 16;
		len -= 16;
	} while (len >= 16);

	/* Multiply the 128-bit value by x^64 and reduce it back to 128 bits. */
	poly64x2_t v7 = vreinterpretq_p64_u64((uint64x2_t){ 0, 0 });
	poly128_t v1_128 = vmull_p64(GET_P64_1(fold_consts), GET_P64_0(v0));

	/* Shift the accumulator right by 8 bytes (vext with a zero vector). */
	uint8x16_t ext_v0 =
		vextq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p64(v7), 8);
	uint8x16_t x0 = veorq_u8(ext_v0, vreinterpretq_u8_p128(v1_128));

	v0 = vreinterpretq_p64_u8(x0);

	/* Final Barrett reduction */
	poly64x2_t bconsts = vreinterpretq_p64_u64(vld1q_u64(bconsts_val));

	/* q = floor(x^127 / G) * low(v0): quotient estimate. */
	v1_128 = vmull_p64(GET_P64_0(bconsts), GET_P64_0(v0));

	/* Multiply the quotient estimate by (G - x^64) / x. */
	poly64x2_t v1_64 = vreinterpretq_p64_u8(vreinterpretq_u8_p128(v1_128));
	poly128_t v3_128 = vmull_p64(GET_P64_1(bconsts), GET_P64_0(v1_64));

	x0 = veorq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p128(v3_128));

	/* Fold in the high half of the quotient product (shifted left 8 bytes). */
	uint8x16_t ext_v2 = vextq_u8(vreinterpretq_u8_p64(v7),
				     vreinterpretq_u8_p128(v1_128), 8);

	x0 = veorq_u8(x0, ext_v2);

	v0 = vreinterpretq_p64_u8(x0);
	/* The reduced 64-bit CRC ends up in the high lane. */
	return vgetq_lane_u64(vreinterpretq_u64_p64(v0), 1);
}

lib/crc/arm64/crc64.h

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * CRC64 using ARM64 PMULL instructions
 */

#include <linux/cpufeature.h>
#include <asm/simd.h>
#include <linux/minmax.h>
#include <linux/sizes.h>

/* NEON/PMULL folding core, defined in arm64/crc64-neon-inner.c. */
u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len);

/* No accelerated CRC-64/BE implementation on arm64; use the generic one. */
#define crc64_be_arch crc64_be_generic

/*
 * crc64_nvme_arch() - arch-accelerated CRC-64 (NVMe) update.
 * @crc: CRC value accumulated so far.
 * @p:   input buffer.
 * @len: number of bytes in @p.
 *
 * Uses the PMULL-based NEON core when the CPU supports PMULL, SIMD is
 * currently usable, and the buffer is large enough (>= 128 bytes) to
 * amortize the SIMD entry/exit cost.  Any unhandled tail (fewer than 128
 * bytes, including the sub-16-byte remainder left by the 16-byte-aligned
 * chunking) falls through to crc64_nvme_generic().
 *
 * Returns the updated 64-bit CRC.
 */
static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
{
	if (len >= 128 && cpu_have_named_feature(PMULL) &&
	    likely(may_use_simd())) {
		do {
			/*
			 * Round down to a multiple of 16 (the NEON core
			 * requires 16-byte-multiple lengths) and cap at 4 KiB.
			 * NOTE(review): the 4 KiB cap presumably bounds how
			 * long SIMD state is held per scoped_ksimd() section
			 * (preemption latency) — confirm.
			 */
			size_t chunk = min_t(size_t, len & ~15, SZ_4K);

			scoped_ksimd()
				crc = crc64_nvme_arm64_c(crc, p, chunk);

			p += chunk;
			len -= chunk;
		} while (len >= 128);
	}
	return crc64_nvme_generic(crc, p, len);
}

0 commit comments

Comments
 (0)