crypto: arm64/crct10dif - preparatory refactor for 8x8 PMULL version
Reorganize the CRC-T10DIF asm routine so we can easily instantiate an
alternative version based on 8x8 polynomial multiplication in a
subsequent patch.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
commit 6c1b0da13e
parent 598b7d41e5
committed by Herbert Xu
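
Note on the shape of the refactor: the body of the old crc_t10dif_pmull
function becomes a GAS macro parameterized by \p, every pmull/pmull2 pair
goes through a __pmull_\p helper, and the p64 entry point is just one
instantiation of that macro. A follow-up can then add an 8x8 PMULL variant
without duplicating the routine; roughly (the __pmull_p8 helper is assumed
here, it is not part of this patch):

	// hypothetical second instantiation, possible once a __pmull_p8
	// macro with the same interface as __pmull_p64 exists
	ENTRY(crc_t10dif_pmull_p8)
		crc_t10dif_pmull	p8
	ENDPROC(crc_t10dif_pmull_p8)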
--- a/arch/arm64/crypto/crct10dif-ce-core.S
+++ b/arch/arm64/crypto/crct10dif-ce-core.S
@@ -80,7 +80,46 @@
 
 	vzr		.req	v13
 
-ENTRY(crc_t10dif_pmull)
+	.macro		fold64, p, reg1, reg2
+	ldp		q11, q12, [arg2], #0x20
+
+	__pmull_\p	v8, \reg1, v10, 2
+	__pmull_\p	\reg1, \reg1, v10
+
+CPU_LE(	rev64		v11.16b, v11.16b		)
+CPU_LE(	rev64		v12.16b, v12.16b		)
+
+	__pmull_\p	v9, \reg2, v10, 2
+	__pmull_\p	\reg2, \reg2, v10
+
+CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
+CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
+
+	eor		\reg1\().16b, \reg1\().16b, v8.16b
+	eor		\reg2\().16b, \reg2\().16b, v9.16b
+	eor		\reg1\().16b, \reg1\().16b, v11.16b
+	eor		\reg2\().16b, \reg2\().16b, v12.16b
+	.endm
+
+	.macro		fold16, p, reg, rk
+	__pmull_\p	v8, \reg, v10
+	__pmull_\p	\reg, \reg, v10, 2
+	.ifnb		\rk
+	ldr_l		q10, \rk, x8
+	.endif
+	eor		v7.16b, v7.16b, v8.16b
+	eor		v7.16b, v7.16b, \reg\().16b
+	.endm
+
+	.macro		__pmull_p64, rd, rn, rm, n
+	.ifb		\n
+	pmull		\rd\().1q, \rn\().1d, \rm\().1d
+	.else
+	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
+	.endif
+	.endm
+
+	.macro		crc_t10dif_pmull, p
 	frame_push	3, 128
 
 	mov		arg1_low32, w0
@@ -96,7 +135,7 @@ ENTRY(crc_t10dif_pmull)
 	cmp		arg3, #256
 
 	// for sizes less than 128, we can't fold 64B at a time...
-	b.lt		_less_than_128
+	b.lt		.L_less_than_128_\@
 
 	// load the initial crc value
 	// crc value does not need to be byte-reflected, but it needs
@@ -147,41 +186,19 @@ CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
 	// buffer. The _fold_64_B_loop will fold 64B at a time
 	// until we have 64+y Bytes of buffer
 
-
 	// fold 64B at a time. This section of the code folds 4 vector
 	// registers in parallel
-_fold_64_B_loop:
+.L_fold_64_B_loop_\@:
 
-	.macro		fold64, reg1, reg2
-	ldp		q11, q12, [arg2], #0x20
-
-	pmull2		v8.1q, \reg1\().2d, v10.2d
-	pmull		\reg1\().1q, \reg1\().1d, v10.1d
-
-CPU_LE(	rev64		v11.16b, v11.16b		)
-CPU_LE(	rev64		v12.16b, v12.16b		)
-
-	pmull2		v9.1q, \reg2\().2d, v10.2d
-	pmull		\reg2\().1q, \reg2\().1d, v10.1d
-
-CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
-CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
-
-	eor		\reg1\().16b, \reg1\().16b, v8.16b
-	eor		\reg2\().16b, \reg2\().16b, v9.16b
-	eor		\reg1\().16b, \reg1\().16b, v11.16b
-	eor		\reg2\().16b, \reg2\().16b, v12.16b
-	.endm
-
-	fold64		v0, v1
-	fold64		v2, v3
-	fold64		v4, v5
-	fold64		v6, v7
+	fold64		\p, v0, v1
+	fold64		\p, v2, v3
+	fold64		\p, v4, v5
+	fold64		\p, v6, v7
 
 	subs		arg3, arg3, #128
 
 	// check if there is another 64B in the buffer to be able to fold
-	b.lt		_fold_64_B_end
+	b.lt		.L_fold_64_B_end_\@
 
 	if_will_cond_yield_neon
 	stp		q0, q1, [sp, #.Lframe_local_offset]
@@ -197,9 +214,9 @@ CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
 	movi		vzr.16b, #0		// init zero register
 	endif_yield_neon
 
-	b		_fold_64_B_loop
+	b		.L_fold_64_B_loop_\@
 
-_fold_64_B_end:
+.L_fold_64_B_end_\@:
 	// at this point, the buffer pointer is pointing at the last y Bytes
 	// of the buffer the 64B of folded data is in 4 of the vector
 	// registers: v0, v1, v2, v3
@@ -209,37 +226,27 @@ _fold_64_B_end:
 
 	ldr_l		q10, rk9, x8
 
-	.macro		fold16, reg, rk
-	pmull		v8.1q, \reg\().1d, v10.1d
-	pmull2		\reg\().1q, \reg\().2d, v10.2d
-	.ifnb		\rk
-	ldr_l		q10, \rk, x8
-	.endif
-	eor		v7.16b, v7.16b, v8.16b
-	eor		v7.16b, v7.16b, \reg\().16b
-	.endm
-
-	fold16		v0, rk11
-	fold16		v1, rk13
-	fold16		v2, rk15
-	fold16		v3, rk17
-	fold16		v4, rk19
-	fold16		v5, rk1
-	fold16		v6
+	fold16		\p, v0, rk11
+	fold16		\p, v1, rk13
+	fold16		\p, v2, rk15
+	fold16		\p, v3, rk17
+	fold16		\p, v4, rk19
+	fold16		\p, v5, rk1
+	fold16		\p, v6
 
 	// instead of 64, we add 48 to the loop counter to save 1 instruction
 	// from the loop instead of a cmp instruction, we use the negative
 	// flag with the jl instruction
 	adds		arg3, arg3, #(128-16)
-	b.lt		_final_reduction_for_128
+	b.lt		.L_final_reduction_for_128_\@
 
 	// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
 	// and the rest is in memory. We can fold 16 bytes at a time if y>=16
 	// continue folding 16B at a time
 
-_16B_reduction_loop:
-	pmull		v8.1q, v7.1d, v10.1d
-	pmull2		v7.1q, v7.2d, v10.2d
+.L_16B_reduction_loop_\@:
+	__pmull_\p	v8, v7, v10
+	__pmull_\p	v7, v7, v10, 2
 	eor		v7.16b, v7.16b, v8.16b
 
 	ldr		q0, [arg2], #16
@@ -251,22 +258,22 @@ CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
 	// instead of a cmp instruction, we utilize the flags with the
 	// jge instruction equivalent of: cmp arg3, 16-16
 	// check if there is any more 16B in the buffer to be able to fold
-	b.ge		_16B_reduction_loop
+	b.ge		.L_16B_reduction_loop_\@
 
 	// now we have 16+z bytes left to reduce, where 0<= z < 16.
 	// first, we reduce the data in the xmm7 register
 
-_final_reduction_for_128:
+.L_final_reduction_for_128_\@:
 	// check if any more data to fold. If not, compute the CRC of
 	// the final 128 bits
 	adds		arg3, arg3, #16
-	b.eq		_128_done
+	b.eq		.L_128_done_\@
 
 	// here we are getting data that is less than 16 bytes.
 	// since we know that there was data before the pointer, we can
 	// offset the input pointer before the actual point, to receive
 	// exactly 16 bytes. after that the registers need to be adjusted.
-_get_last_two_regs:
+.L_get_last_two_regs_\@:
 	add		arg2, arg2, arg3
 	ldr		q1, [arg2, #-16]
 CPU_LE(	rev64		v1.16b, v1.16b	)
@@ -291,47 +298,46 @@ CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
 	bsl		v0.16b, v2.16b, v1.16b
 
 	// fold 16 Bytes
-	pmull		v8.1q, v7.1d, v10.1d
-	pmull2		v7.1q, v7.2d, v10.2d
+	__pmull_\p	v8, v7, v10
+	__pmull_\p	v7, v7, v10, 2
 	eor		v7.16b, v7.16b, v8.16b
 	eor		v7.16b, v7.16b, v0.16b
 
-_128_done:
+.L_128_done_\@:
 	// compute crc of a 128-bit value
 	ldr_l		q10, rk5, x8		// rk5 and rk6 in xmm10
 
 	// 64b fold
 	ext		v0.16b, vzr.16b, v7.16b, #8
 	mov		v7.d[0], v7.d[1]
-	pmull		v7.1q, v7.1d, v10.1d
+	__pmull_\p	v7, v7, v10
 	eor		v7.16b, v7.16b, v0.16b
 
 	// 32b fold
 	ext		v0.16b, v7.16b, vzr.16b, #4
 	mov		v7.s[3], vzr.s[0]
-	pmull2		v0.1q, v0.2d, v10.2d
+	__pmull_\p	v0, v0, v10, 2
 	eor		v7.16b, v7.16b, v0.16b
 
 	// barrett reduction
-_barrett:
 	ldr_l		q10, rk7, x8
 	mov		v0.d[0], v7.d[1]
 
-	pmull		v0.1q, v0.1d, v10.1d
+	__pmull_\p	v0, v0, v10
 	ext		v0.16b, vzr.16b, v0.16b, #12
-	pmull2		v0.1q, v0.2d, v10.2d
+	__pmull_\p	v0, v0, v10, 2
 	ext		v0.16b, vzr.16b, v0.16b, #12
 	eor		v7.16b, v7.16b, v0.16b
 	mov		w0, v7.s[1]
 
-_cleanup:
+.L_cleanup_\@:
 	// scale the result back to 16 bits
 	lsr		x0, x0, #16
 	frame_pop
 	ret
 
-_less_than_128:
-	cbz		arg3, _cleanup
+.L_less_than_128_\@:
+	cbz		arg3, .L_cleanup_\@
 
 	movi		v0.16b, #0
 	mov		v0.s[3], arg1_low32	// get the initial crc value
@@ -342,20 +348,20 @@ CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
 	eor		v7.16b, v7.16b, v0.16b	// xor the initial crc value
 
 	cmp		arg3, #16
-	b.eq		_128_done		// exactly 16 left
-	b.lt		_less_than_16_left
+	b.eq		.L_128_done_\@		// exactly 16 left
+	b.lt		.L_less_than_16_left_\@
 
 	ldr_l		q10, rk1, x8		// rk1 and rk2 in xmm10
 
 	// update the counter. subtract 32 instead of 16 to save one
 	// instruction from the loop
	subs		arg3, arg3, #32
-	b.ge		_16B_reduction_loop
+	b.ge		.L_16B_reduction_loop_\@
 
 	add		arg3, arg3, #16
-	b		_get_last_two_regs
+	b		.L_get_last_two_regs_\@
 
-_less_than_16_left:
+.L_less_than_16_left_\@:
 	// shl r9, 4
 	adr_l		x0, tbl_shf_table + 16
 	sub		x0, x0, arg3
@@ -363,8 +369,12 @@ _less_than_16_left:
 	movi		v9.16b, #0x80
 	eor		v0.16b, v0.16b, v9.16b
 	tbl		v7.16b, {v7.16b}, v0.16b
-	b		_128_done
-ENDPROC(crc_t10dif_pmull)
+	b		.L_128_done_\@
+	.endm
+
+ENTRY(crc_t10dif_pmull_p64)
+	crc_t10dif_pmull	p64
+ENDPROC(crc_t10dif_pmull_p64)
 
 // precomputed constants
 // these constants are precomputed from the poly:
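
Two assembler idioms carry the refactor above. First, \@ is GAS's running
count of macro expansions, so a label spelled .L_foo_\@ comes out unique in
each instantiation of crc_t10dif_pmull, and the .L prefix keeps it out of
the symbol table; this is why every _foo: label is renamed. Second,
__pmull_\p takes an optional trailing argument: blank selects pmull (low
64-bit halves), 2 selects pmull2 (high halves), so the shared body never
has to spell out the multiplier width. A minimal standalone sketch of the
\@ idiom (illustrative only, not from the patch):

	.macro	clamp255, reg
	cmp	\reg, #255
	b.le	.L_done_\@		// unique per expansion: .L_done_0, .L_done_1, ...
	mov	\reg, #255
.L_done_\@:
	.endm

	clamp255	x0		// first expansion
	clamp255	x1		// second expansion, no duplicate label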
--- a/arch/arm64/crypto/crct10dif-ce-glue.c
+++ b/arch/arm64/crypto/crct10dif-ce-glue.c
@@ -22,7 +22,9 @@
 
 #define CRC_T10DIF_PMULL_CHUNK_SIZE	16U
 
-asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 buf[], u64 len);
+asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 buf[], u64 len);
+
+static u16 (*crc_t10dif_pmull)(u16 init_crc, const u8 buf[], u64 len);
 
 static int crct10dif_init(struct shash_desc *desc)
 {
@@ -85,6 +87,8 @@ static struct shash_alg crc_t10dif_alg = {
 
 static int __init crc_t10dif_mod_init(void)
 {
+	crc_t10dif_pmull = crc_t10dif_pmull_p64;
+
 	return crypto_register_shash(&crc_t10dif_alg);
 }
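
On the glue side, the key change is that crc_t10dif_pmull is no longer the
asm entry point itself but a function pointer resolved once at module init,
which is what makes a runtime choice between implementations possible. For
now the init unconditionally picks the p64 version; a sketch of how a
follow-up could select a variant (the crc_t10dif_pmull_p8 prototype and the
HWCAP test are assumptions here, not part of this patch):

	asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 buf[], u64 len);
	asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 buf[], u64 len);	/* hypothetical */

	static u16 (*crc_t10dif_pmull)(u16 init_crc, const u8 buf[], u64 len);

	static int __init crc_t10dif_mod_init(void)
	{
		if (elf_hwcap & HWCAP_PMULL)	/* 64x64 PMULL available */
			crc_t10dif_pmull = crc_t10dif_pmull_p64;
		else				/* assumed 8x8 fallback */
			crc_t10dif_pmull = crc_t10dif_pmull_p8;

		return crypto_register_shash(&crc_t10dif_alg);
	}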