crypto: aesni-intel - Add AES-NI accelerated CTR mode
To take advantage of the hardware pipeline implementation of the AES-NI instructions, CTR mode encryption/decryption is implemented in assembly so that multiple AES-NI instructions are scheduled one after another. This way, some of the latency of the AES-NI instructions can be hidden. Performance testing based on dm-crypt shows a 50% reduction in encryption/decryption time. Signed-off-by: Huang Ying <ying.huang@intel.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
@@ -32,6 +32,9 @@
|
||||
#define IN IN1
|
||||
#define KEY %xmm2
|
||||
#define IV %xmm3
|
||||
/* CTR mode: byte-order swapping shuffle mask, loaded from .Lbswap_mask */
#define BSWAP_MASK %xmm10
|
||||
/* CTR mode: the counter block (IV) kept in little-endian byte order */
#define CTR %xmm11
|
||||
/* CTR mode: the constant 1 in the low qword, added to CTR on each step */
#define INC %xmm12
|
||||
|
||||
#define KEYP %rdi
|
||||
#define OUTP %rsi
|
||||
@@ -42,6 +45,7 @@
|
||||
#define T1 %r10
|
||||
#define TKEYP T1
|
||||
#define T2 %r11
|
||||
/* CTR mode: general-purpose copy of the low qword of CTR (carry detection) */
#define TCTR_LOW T2
|
||||
|
||||
_key_expansion_128:
|
||||
_key_expansion_256a:
|
||||
@@ -724,3 +728,114 @@ ENTRY(aesni_cbc_dec)
|
||||
movups IV, (IVP)
|
||||
.Lcbc_dec_just_ret:
|
||||
ret
|
||||
|
||||
/*
 * PSHUFB control mask that reverses the order of all 16 bytes of an XMM
 * register.  Kept 16-byte aligned because it is loaded with movaps.
 */
.align 16
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8
	.byte 7, 6, 5, 4, 3, 2, 1, 0
|
||||
|
||||
/*
 * _aesni_inc_init:	internal ABI
 *	Prepare the registers that _aesni_inc relies on.  Must be called once
 *	after IV has been loaded and before the first call to _aesni_inc.
 * input:
 *	IV:		counter block, big endian
 * output:
 *	CTR:		== IV, byte-reversed (little endian)
 *	TCTR_LOW:	== lower qword of CTR
 *	INC:		== 1 in the low qword, upper qword zero
 *	BSWAP_MASK:	== endian swapping mask (.Lbswap_mask)
 */
_aesni_inc_init:
	/*
	 * RIP-relative load of the 16-byte aligned shuffle mask; avoids an
	 * absolute relocation for static data in 64-bit code.
	 */
	movaps .Lbswap_mask(%rip), BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR	/* CTR = byte-reversed IV */
	mov $1, TCTR_LOW		/* TCTR_LOW is only a scratch here ... */
	movq TCTR_LOW, INC		/* ... INC = 1 (movq clears the high qword) */
	movq CTR, TCTR_LOW		/* now TCTR_LOW = low qword of CTR */
	ret
|
||||
|
||||
/*
 * _aesni_inc:		internal ABI
 *	Advance the 128-bit counter by one and regenerate the big-endian IV.
 * input (all as set up by _aesni_inc_init or a previous _aesni_inc):
 *	CTR:		current counter, little endian
 *	TCTR_LOW:	== lower qword of CTR
 *	INC:		== 1 in the low qword
 *	BSWAP_MASK:	== endian swapping mask
 * output:
 *	IV:		previous IV + 1, big endian
 * changed:
 *	CTR:		== output IV, in little endian
 *	TCTR_LOW:	== lower qword of CTR
 *	INC:		shifted and restored on the carry path, so unchanged
 *			on return
 */
_aesni_inc:
	add $1, TCTR_LOW		/* CF set iff the low qword wraps */
	paddq INC, CTR			/* SSE add leaves CF untouched */
	jnc .Linc_low
	/* low qword wrapped: propagate the carry into the high qword */
	pslldq $8, INC			/* move the 1 into the high qword */
	paddq INC, CTR
	psrldq $8, INC			/* restore INC == 1 in the low qword */
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV	/* back to big endian */
	ret
|
||||
|
||||
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR mode: each 16-byte block read from INP is XORed with the result of
 * running the current counter block through _aesni_enc1/_aesni_enc4 and
 * stored to OUTP; the counter is advanced with _aesni_inc after each block.
 *
 * - Returns immediately, without touching *iv, when len < 16.
 * - Handles 64 bytes per iteration while len >= 64, then 16 bytes per
 *   iteration while len >= 16.  A trailing partial block (len % 16 bytes)
 *   is not processed here.
 * - On exit the next counter value is stored back to *iv.
 *
 * LEN is an unsigned quantity, so every length comparison uses the unsigned
 * condition codes (jb/jae).
 */
ENTRY(aesni_ctr_enc)
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	/* NOTE(review): offset 480 is presumably ctx->key_length - confirm */
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	/* stage four consecutive counter blocks and four input blocks */
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	/* XOR the processed counter blocks with the input and store */
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lctr_enc_loop4		/* unsigned: LEN >= 64 */
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lctr_enc_loop1		/* unsigned: LEN >= 16 */
.Lctr_enc_ret:
	movups IV, (IVP)		/* hand the next counter back to the caller */
.Lctr_enc_just_ret:
	ret
|
||||
|
Reference in New Issue
Block a user