crypto: arm64/crct10dif - preparatory refactor for 8x8 PMULL version
Reorganize the CRC-T10DIF asm routine so we can easily instantiate an
alternative version based on 8x8 polynomial multiplication in a
subsequent patch.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
commit 6c1b0da13e
parent 598b7d41e5
committed by Herbert Xu
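
Note on the shape of the refactor: the body of the old crc_t10dif_pmull
function becomes a GAS macro parameterized by \p, every pmull/pmull2 pair
goes through a __pmull_\p helper, and the p64 entry point is just one
instantiation of that macro. A follow-up can then add an 8x8 PMULL variant
without duplicating the routine; roughly (the __pmull_p8 helper is assumed
here, it is not part of this patch):

	// hypothetical second instantiation, possible once a __pmull_p8
	// macro with the same interface as __pmull_p64 exists
	ENTRY(crc_t10dif_pmull_p8)
		crc_t10dif_pmull	p8
	ENDPROC(crc_t10dif_pmull_p8)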
--- a/arch/arm64/crypto/crct10dif-ce-core.S
+++ b/arch/arm64/crypto/crct10dif-ce-core.S
@@ -80,7 +80,46 @@
 
 	vzr		.req	v13
 
-ENTRY(crc_t10dif_pmull)
+	.macro		fold64, p, reg1, reg2
+	ldp		q11, q12, [arg2], #0x20
+
+	__pmull_\p	v8, \reg1, v10, 2
+	__pmull_\p	\reg1, \reg1, v10
+
+CPU_LE(	rev64		v11.16b, v11.16b		)
+CPU_LE(	rev64		v12.16b, v12.16b		)
+
+	__pmull_\p	v9, \reg2, v10, 2
+	__pmull_\p	\reg2, \reg2, v10
+
+CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
+CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
+
+	eor		\reg1\().16b, \reg1\().16b, v8.16b
+	eor		\reg2\().16b, \reg2\().16b, v9.16b
+	eor		\reg1\().16b, \reg1\().16b, v11.16b
+	eor		\reg2\().16b, \reg2\().16b, v12.16b
+	.endm
+
+	.macro		fold16, p, reg, rk
+	__pmull_\p	v8, \reg, v10
+	__pmull_\p	\reg, \reg, v10, 2
+	.ifnb		\rk
+	ldr_l		q10, \rk, x8
+	.endif
+	eor		v7.16b, v7.16b, v8.16b
+	eor		v7.16b, v7.16b, \reg\().16b
+	.endm
+
+	.macro		__pmull_p64, rd, rn, rm, n
+	.ifb		\n
+	pmull		\rd\().1q, \rn\().1d, \rm\().1d
+	.else
+	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
+	.endif
+	.endm
+
+	.macro		crc_t10dif_pmull, p
 	frame_push	3, 128
 
 	mov		arg1_low32, w0
@@ -96,7 +135,7 @@ ENTRY(crc_t10dif_pmull)
 	cmp		arg3, #256
 
 	// for sizes less than 128, we can't fold 64B at a time...
-	b.lt		_less_than_128
+	b.lt		.L_less_than_128_\@
 
 	// load the initial crc value
 	// crc value does not need to be byte-reflected, but it needs
@@ -147,41 +186,19 @@ CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
 	// buffer. The _fold_64_B_loop will fold 64B at a time
 	// until we have 64+y Bytes of buffer
 
-
 	// fold 64B at a time. This section of the code folds 4 vector
 	// registers in parallel
-_fold_64_B_loop:
+.L_fold_64_B_loop_\@:
 
-	.macro		fold64, reg1, reg2
-	ldp		q11, q12, [arg2], #0x20
-
-	pmull2		v8.1q, \reg1\().2d, v10.2d
-	pmull		\reg1\().1q, \reg1\().1d, v10.1d
-
-CPU_LE(	rev64		v11.16b, v11.16b		)
-CPU_LE(	rev64		v12.16b, v12.16b		)
-
-	pmull2		v9.1q, \reg2\().2d, v10.2d
-	pmull		\reg2\().1q, \reg2\().1d, v10.1d
-
-CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
-CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
-
-	eor		\reg1\().16b, \reg1\().16b, v8.16b
-	eor		\reg2\().16b, \reg2\().16b, v9.16b
-	eor		\reg1\().16b, \reg1\().16b, v11.16b
-	eor		\reg2\().16b, \reg2\().16b, v12.16b
-	.endm
-
-	fold64		v0, v1
-	fold64		v2, v3
-	fold64		v4, v5
-	fold64		v6, v7
+	fold64		\p, v0, v1
+	fold64		\p, v2, v3
+	fold64		\p, v4, v5
+	fold64		\p, v6, v7
 
 	subs		arg3, arg3, #128
 
 	// check if there is another 64B in the buffer to be able to fold
-	b.lt		_fold_64_B_end
+	b.lt		.L_fold_64_B_end_\@
 
 	if_will_cond_yield_neon
 	stp		q0, q1, [sp, #.Lframe_local_offset]
@@ -197,9 +214,9 @@ CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
 	movi		vzr.16b, #0		// init zero register
 	endif_yield_neon
 
-	b		_fold_64_B_loop
+	b		.L_fold_64_B_loop_\@
 
-_fold_64_B_end:
+.L_fold_64_B_end_\@:
 	// at this point, the buffer pointer is pointing at the last y Bytes
 	// of the buffer the 64B of folded data is in 4 of the vector
 	// registers: v0, v1, v2, v3
@@ -209,37 +226,27 @@ _fold_64_B_end:
 
 	ldr_l		q10, rk9, x8
 
-	.macro		fold16, reg, rk
-	pmull		v8.1q, \reg\().1d, v10.1d
-	pmull2		\reg\().1q, \reg\().2d, v10.2d
-	.ifnb		\rk
-	ldr_l		q10, \rk, x8
-	.endif
-	eor		v7.16b, v7.16b, v8.16b
-	eor		v7.16b, v7.16b, \reg\().16b
-	.endm
-
-	fold16		v0, rk11
-	fold16		v1, rk13
-	fold16		v2, rk15
-	fold16		v3, rk17
-	fold16		v4, rk19
-	fold16		v5, rk1
-	fold16		v6
+	fold16		\p, v0, rk11
+	fold16		\p, v1, rk13
+	fold16		\p, v2, rk15
+	fold16		\p, v3, rk17
+	fold16		\p, v4, rk19
+	fold16		\p, v5, rk1
+	fold16		\p, v6
 
 	// instead of 64, we add 48 to the loop counter to save 1 instruction
 	// from the loop instead of a cmp instruction, we use the negative
 	// flag with the jl instruction
 	adds		arg3, arg3, #(128-16)
-	b.lt		_final_reduction_for_128
+	b.lt		.L_final_reduction_for_128_\@
 
 	// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
 	// and the rest is in memory. We can fold 16 bytes at a time if y>=16
 	// continue folding 16B at a time
 
-_16B_reduction_loop:
-	pmull		v8.1q, v7.1d, v10.1d
-	pmull2		v7.1q, v7.2d, v10.2d
+.L_16B_reduction_loop_\@:
+	__pmull_\p	v8, v7, v10
+	__pmull_\p	v7, v7, v10, 2
 	eor		v7.16b, v7.16b, v8.16b
 
 	ldr		q0, [arg2], #16
@@ -251,22 +258,22 @@ CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
 	// instead of a cmp instruction, we utilize the flags with the
 	// jge instruction equivalent of: cmp arg3, 16-16
 	// check if there is any more 16B in the buffer to be able to fold
-	b.ge		_16B_reduction_loop
+	b.ge		.L_16B_reduction_loop_\@
 
 	// now we have 16+z bytes left to reduce, where 0<= z < 16.
 	// first, we reduce the data in the xmm7 register
 
-_final_reduction_for_128:
+.L_final_reduction_for_128_\@:
 	// check if any more data to fold. If not, compute the CRC of
 	// the final 128 bits
 	adds		arg3, arg3, #16
-	b.eq		_128_done
+	b.eq		.L_128_done_\@
 
 	// here we are getting data that is less than 16 bytes.
 	// since we know that there was data before the pointer, we can
 	// offset the input pointer before the actual point, to receive
 	// exactly 16 bytes. after that the registers need to be adjusted.
-_get_last_two_regs:
+.L_get_last_two_regs_\@:
 	add		arg2, arg2, arg3
 	ldr		q1, [arg2, #-16]
 CPU_LE(	rev64		v1.16b, v1.16b	)
@@ -291,47 +298,46 @@ CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
 	bsl		v0.16b, v2.16b, v1.16b
 
 	// fold 16 Bytes
-	pmull		v8.1q, v7.1d, v10.1d
-	pmull2		v7.1q, v7.2d, v10.2d
+	__pmull_\p	v8, v7, v10
+	__pmull_\p	v7, v7, v10, 2
 	eor		v7.16b, v7.16b, v8.16b
 	eor		v7.16b, v7.16b, v0.16b
 
-_128_done:
+.L_128_done_\@:
 	// compute crc of a 128-bit value
 	ldr_l		q10, rk5, x8		// rk5 and rk6 in xmm10
 
 	// 64b fold
 	ext		v0.16b, vzr.16b, v7.16b, #8
 	mov		v7.d[0], v7.d[1]
-	pmull		v7.1q, v7.1d, v10.1d
+	__pmull_\p	v7, v7, v10
 	eor		v7.16b, v7.16b, v0.16b
 
 	// 32b fold
 	ext		v0.16b, v7.16b, vzr.16b, #4
 	mov		v7.s[3], vzr.s[0]
-	pmull2		v0.1q, v0.2d, v10.2d
+	__pmull_\p	v0, v0, v10, 2
 	eor		v7.16b, v7.16b, v0.16b
 
 	// barrett reduction
-_barrett:
 	ldr_l		q10, rk7, x8
 	mov		v0.d[0], v7.d[1]
 
-	pmull		v0.1q, v0.1d, v10.1d
+	__pmull_\p	v0, v0, v10
 	ext		v0.16b, vzr.16b, v0.16b, #12
-	pmull2		v0.1q, v0.2d, v10.2d
+	__pmull_\p	v0, v0, v10, 2
 	ext		v0.16b, vzr.16b, v0.16b, #12
 	eor		v7.16b, v7.16b, v0.16b
 	mov		w0, v7.s[1]
 
-_cleanup:
+.L_cleanup_\@:
 	// scale the result back to 16 bits
 	lsr		x0, x0, #16
 	frame_pop
 	ret
 
-_less_than_128:
-	cbz		arg3, _cleanup
+.L_less_than_128_\@:
+	cbz		arg3, .L_cleanup_\@
 
 	movi		v0.16b, #0
 	mov		v0.s[3], arg1_low32	// get the initial crc value
@@ -342,20 +348,20 @@ CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
 	eor		v7.16b, v7.16b, v0.16b	// xor the initial crc value
 
 	cmp		arg3, #16
-	b.eq		_128_done		// exactly 16 left
-	b.lt		_less_than_16_left
+	b.eq		.L_128_done_\@		// exactly 16 left
+	b.lt		.L_less_than_16_left_\@
 
 	ldr_l		q10, rk1, x8		// rk1 and rk2 in xmm10
 
 	// update the counter. subtract 32 instead of 16 to save one
 	// instruction from the loop
	subs		arg3, arg3, #32
-	b.ge		_16B_reduction_loop
+	b.ge		.L_16B_reduction_loop_\@
 
 	add		arg3, arg3, #16
-	b		_get_last_two_regs
+	b		.L_get_last_two_regs_\@
 
-_less_than_16_left:
+.L_less_than_16_left_\@:
 	// shl r9, 4
 	adr_l		x0, tbl_shf_table + 16
 	sub		x0, x0, arg3
@@ -363,8 +369,12 @@ _less_than_16_left:
 	movi		v9.16b, #0x80
 	eor		v0.16b, v0.16b, v9.16b
 	tbl		v7.16b, {v7.16b}, v0.16b
-	b		_128_done
-ENDPROC(crc_t10dif_pmull)
+	b		.L_128_done_\@
+	.endm
+
+ENTRY(crc_t10dif_pmull_p64)
+	crc_t10dif_pmull	p64
+ENDPROC(crc_t10dif_pmull_p64)
 
 // precomputed constants
 // these constants are precomputed from the poly:
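
Two assembler idioms carry the refactor above. First, \@ is GAS's running
count of macro expansions, so a label spelled .L_foo_\@ comes out unique in
each instantiation of crc_t10dif_pmull, and the .L prefix keeps it out of
the symbol table; this is why every _foo: label is renamed. Second,
__pmull_\p takes an optional trailing argument: blank selects pmull (low
64-bit halves), 2 selects pmull2 (high halves), so the shared body never
has to spell out the multiplier width. A minimal standalone sketch of the
\@ idiom (illustrative only, not from the patch):

	.macro	clamp255, reg
	cmp	\reg, #255
	b.le	.L_done_\@		// unique per expansion: .L_done_0, .L_done_1, ...
	mov	\reg, #255
.L_done_\@:
	.endm

	clamp255	x0		// first expansion
	clamp255	x1		// second expansion, no duplicate label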
--- a/arch/arm64/crypto/crct10dif-ce-glue.c
+++ b/arch/arm64/crypto/crct10dif-ce-glue.c
@@ -22,7 +22,9 @@
 
 #define CRC_T10DIF_PMULL_CHUNK_SIZE	16U
 
-asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 buf[], u64 len);
+asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 buf[], u64 len);
+
+static u16 (*crc_t10dif_pmull)(u16 init_crc, const u8 buf[], u64 len);
 
 static int crct10dif_init(struct shash_desc *desc)
 {
@@ -85,6 +87,8 @@ static struct shash_alg crc_t10dif_alg = {
 
 static int __init crc_t10dif_mod_init(void)
 {
+	crc_t10dif_pmull = crc_t10dif_pmull_p64;
+
 	return crypto_register_shash(&crc_t10dif_alg);
 }
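
On the glue side, the key change is that crc_t10dif_pmull is no longer the
asm entry point itself but a function pointer resolved once at module init,
which is what makes a runtime choice between implementations possible. For
now the init unconditionally picks the p64 version; a sketch of how a
follow-up could select a variant (the crc_t10dif_pmull_p8 prototype and the
HWCAP test are assumptions here, not part of this patch):

	asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 buf[], u64 len);
	asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 buf[], u64 len);	/* hypothetical */

	static u16 (*crc_t10dif_pmull)(u16 init_crc, const u8 buf[], u64 len);

	static int __init crc_t10dif_mod_init(void)
	{
		if (elf_hwcap & HWCAP_PMULL)	/* 64x64 PMULL available */
			crc_t10dif_pmull = crc_t10dif_pmull_p64;
		else				/* assumed 8x8 fallback */
			crc_t10dif_pmull = crc_t10dif_pmull_p8;

		return crypto_register_shash(&crc_t10dif_alg);
	}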