crypto: aesni_intel - add more optimized XTS mode for x86-64

Add a more optimized XTS implementation for aesni_intel in 64-bit mode, with
smaller stack usage and a boost in speed.

tcrypt results, with Intel i5-2450M:
256-bit key
        enc     dec
16B     0.98x   0.99x
64B     0.64x   0.63x
256B    1.29x   1.32x
1024B   1.54x   1.58x
8192B   1.57x   1.60x

512-bit key
        enc     dec
16B     0.98x   0.99x
64B     0.60x   0.59x
256B    1.24x   1.25x
1024B   1.39x   1.42x
8192B   1.38x   1.42x

I chose not to optimize for block sizes smaller than 256 bytes, since XTS is
practically always used with data blocks of 512 bytes. This is why performance
is reduced in tcrypt for 64-byte blocks.
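
To make the 8-block granularity concrete, here is a minimal caller-side sketch
in plain C. This is not the glue code added by this patch; xts_walk and
xts_crypt_one are hypothetical names, kernel types are replaced with stdint
ones, and the context argument is simplified. It only shows that the fast path
consumes 8 AES blocks (128 bytes) per call, so a 512-byte sector is covered in
four calls while a 64-byte request never reaches it:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* The 8-block routine added by this patch (the real prototype takes a
 * struct crypto_aes_ctx *). */
void aesni_xts_crypt8(void *ctx, uint8_t *dst, const uint8_t *src,
                      bool enc, uint8_t *iv);
/* Hypothetical single-block fallback for short tails. */
void xts_crypt_one(void *ctx, uint8_t *dst, const uint8_t *src,
                   bool enc, uint8_t *iv);

static void xts_walk(void *ctx, uint8_t *dst, const uint8_t *src,
                     size_t len, bool enc, uint8_t iv[16])
{
        while (len >= 128) {            /* fast path: 8 blocks per call */
                aesni_xts_crypt8(ctx, dst, src, enc, iv);
                src += 128; dst += 128; len -= 128;
        }
        while (len >= 16) {             /* tail: one block at a time */
                xts_crypt_one(ctx, dst, src, enc, iv);
                src += 16; dst += 16; len -= 16;
        }
}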

Cc: Huang Ying <ying.huang@intel.com>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

@@ -34,6 +34,10 @@
#ifdef __x86_64__
.data
.align 16
.Lgf128mul_x_ble_mask:
.octa 0x00000000000000010000000000000087
POLY: .octa 0xC2000000000000000000000000000001
TWOONE: .octa 0x00000001000000000000000000000001
@@ -105,6 +109,8 @@ enc: .octa 0x2
#define CTR %xmm11
#define INC %xmm12
#define GF128MUL_MASK %xmm10
#ifdef __x86_64__
#define AREG %rax
#define KEYP %rdi
@@ -2636,4 +2642,115 @@ ENTRY(aesni_ctr_enc)
.Lctr_enc_just_ret:
ret
ENDPROC(aesni_ctr_enc)
/*
* _aesni_gf128mul_x_ble: internal ABI
* Multiply in GF(2^128) for XTS IVs
* input:
* IV: current IV
* GF128MUL_MASK == mask with 0x87 and 0x01
* output:
* IV: next IV
* changed:
* CTR: == temporary value
*/
#define _aesni_gf128mul_x_ble() \
pshufd $0x13, IV, CTR; \
paddq IV, IV; \
psrad $31, CTR; \
pand GF128MUL_MASK, CTR; \
pxor CTR, IV;
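
For readers decoding the pshufd/psrad/pand trick above, here is a plain-C
sketch of the same tweak update, in the spirit of the kernel's
gf128mul_x_ble() helper; the function name and types below are illustrative
and not part of this patch:

#include <stdint.h>

/* Multiply the XTS tweak by x in GF(2^128), little-endian block convention
 * ("ble"). t[0] holds bits 0..63 of the tweak, t[1] bits 64..127. */
static void gf128mul_x_ble_sketch(uint64_t t[2])
{
        uint64_t carry_lo = t[0] >> 63;  /* bit 63 carries into high qword  */
        uint64_t carry_hi = t[1] >> 63;  /* bit 127 falls out: reduce mod P */

        t[1] = (t[1] << 1) | carry_lo;
        t[0] = (t[0] << 1) ^ (carry_hi * 0x87);  /* x^128 = x^7+x^2+x+1 */
}

In the assembly, pshufd $0x13 and psrad $31 replicate the top bit of each
64-bit half of IV into the dword lanes where the mask holds 0x87 and 0x01,
paddq performs the two independent qword doublings, and pxor folds the
selected carry and reduction values back into IV.
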
/*
* void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
*                       bool enc, u8 *iv)
*/
ENTRY(aesni_xts_crypt8)
cmpb $0, %cl
movl $0, %ecx
movl $240, %r10d
leaq _aesni_enc4, %r11
leaq _aesni_dec4, %rax
cmovel %r10d, %ecx
cmoveq %rax, %r11
movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
movups (IVP), IV
mov 480(KEYP), KLEN
addq %rcx, KEYP
movdqa IV, STATE1
pxor 0x00(INP), STATE1
movdqu IV, 0x00(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE2
pxor 0x10(INP), STATE2
movdqu IV, 0x10(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE3
pxor 0x20(INP), STATE3
movdqu IV, 0x20(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE4
pxor 0x30(INP), STATE4
movdqu IV, 0x30(OUTP)
call *%r11
pxor 0x00(OUTP), STATE1
movdqu STATE1, 0x00(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE1
pxor 0x40(INP), STATE1
movdqu IV, 0x40(OUTP)
pxor 0x10(OUTP), STATE2
movdqu STATE2, 0x10(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE2
pxor 0x50(INP), STATE2
movdqu IV, 0x50(OUTP)
pxor 0x20(OUTP), STATE3
movdqu STATE3, 0x20(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE3
pxor 0x60(INP), STATE3
movdqu IV, 0x60(OUTP)
pxor 0x30(OUTP), STATE4
movdqu STATE4, 0x30(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE4
pxor 0x70(INP), STATE4
movdqu IV, 0x70(OUTP)
_aesni_gf128mul_x_ble()
movups IV, (IVP)
call *%r11
pxor 0x40(OUTP), STATE1
movdqu STATE1, 0x40(OUTP)
pxor 0x50(OUTP), STATE2
movdqu STATE2, 0x50(OUTP)
pxor 0x60(OUTP), STATE3
movdqu STATE3, 0x60(OUTP)
pxor 0x70(OUTP), STATE4
movdqu STATE4, 0x70(OUTP)
ret
ENDPROC(aesni_xts_crypt8)
#endif
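
Finally, a structural plain-C sketch of what aesni_xts_crypt8 computes, with
hypothetical helper names; the AES core is left as an extern stand-in for
_aesni_enc4/_aesni_dec4, register allocation is ignored, and little-endian
layout (as on x86) is assumed. The point to notice is that the per-block
tweaks are stashed in dst and folded back in after encryption, which is how
the assembly avoids stack buffers:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* Stand-in for the AES-NI round sequence (_aesni_enc4/_aesni_dec4). */
void aes_crypt_block(const void *ctx, uint8_t blk[16], bool enc);

static void xor16(uint8_t *dst, const uint8_t *a, const uint8_t *b)
{
        for (int i = 0; i < 16; i++)
                dst[i] = a[i] ^ b[i];
}

/* Same tweak update as sketched after the macro above. */
static void gf128mul_x_ble_sketch(uint64_t t[2])
{
        uint64_t carry_lo = t[0] >> 63;
        uint64_t carry_hi = t[1] >> 63;

        t[1] = (t[1] << 1) | carry_lo;
        t[0] = (t[0] << 1) ^ (carry_hi * 0x87);
}

/* Structural equivalent of aesni_xts_crypt8(): tweaks are parked in dst as
 * scratch space, then folded back in after the block cipher runs. */
static void xts_crypt8_sketch(const void *ctx, uint8_t *dst,
                              const uint8_t *src, bool enc, uint8_t iv[16])
{
        uint8_t state[8][16];   /* the asm keeps these in STATE1..4, twice  */
        uint64_t t[2];
        int i;

        memcpy(t, iv, 16);

        for (i = 0; i < 8; i++) {
                memcpy(dst + 16 * i, t, 16);                 /* stash T_i   */
                xor16(state[i], src + 16 * i, dst + 16 * i); /* P_i ^ T_i   */
                gf128mul_x_ble_sketch(t);                    /* next tweak  */
        }
        memcpy(iv, t, 16);      /* hand the next tweak back to the caller   */

        for (i = 0; i < 8; i++) {
                aes_crypt_block(ctx, state[i], enc);         /* 4 at a time */
                xor16(dst + 16 * i, state[i], dst + 16 * i); /* ^ T_i again */
        }
}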