crypto: chacha20 - Add an eight block AVX2 variant for x86_64
Extends the x86_64 ChaCha20 implementation by a function processing eight
ChaCha20 blocks in parallel using AVX2.

For large messages, throughput increases by ~55-70% compared to four block
SSSE3:

testing speed of chacha20 (chacha20-simd) encryption
test 0 (256 bit key, 16 byte blocks): 42249230 operations in 10 seconds (675987680 bytes)
test 1 (256 bit key, 64 byte blocks): 46441641 operations in 10 seconds (2972265024 bytes)
test 2 (256 bit key, 256 byte blocks): 33028112 operations in 10 seconds (8455196672 bytes)
test 3 (256 bit key, 1024 byte blocks): 11568759 operations in 10 seconds (11846409216 bytes)
test 4 (256 bit key, 8192 byte blocks): 1448761 operations in 10 seconds (11868250112 bytes)

testing speed of chacha20 (chacha20-simd) encryption
test 0 (256 bit key, 16 byte blocks): 41999675 operations in 10 seconds (671994800 bytes)
test 1 (256 bit key, 64 byte blocks): 45805908 operations in 10 seconds (2931578112 bytes)
test 2 (256 bit key, 256 byte blocks): 32814947 operations in 10 seconds (8400626432 bytes)
test 3 (256 bit key, 1024 byte blocks): 19777167 operations in 10 seconds (20251819008 bytes)
test 4 (256 bit key, 8192 byte blocks): 2279321 operations in 10 seconds (18672197632 bytes)

Benchmark results from a Core i5-4670T.

Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
@@ -21,12 +21,27 @@
|
||||
|
||||
/*
 * SIMD routines declared asmlinkage (presumably implemented in assembly --
 * confirm against the accompanying .S files). Each takes the cipher state,
 * a destination buffer and a source buffer.
 */
asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
|
||||
/* Called by chacha20_dosimd() once per CHACHA20_BLOCK_SIZE * 4 bytes of input. */
asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
/*
 * Called by chacha20_dosimd() once per CHACHA20_BLOCK_SIZE * 8 bytes of input;
 * the caller then adds 8 to state[12] itself.
 */
asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
|
||||
/*
 * Gates the eight-block path in chacha20_dosimd(). Assigned in
 * chacha20_simd_mod_init() from cpu_has_avx, cpu_has_avx2 and
 * cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL).
 */
static bool chacha20_use_avx2;
|
||||
#endif
|
||||
|
||||
static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int bytes)
|
||||
{
|
||||
u8 buf[CHACHA20_BLOCK_SIZE];
|
||||
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
if (chacha20_use_avx2) {
|
||||
while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
|
||||
chacha20_8block_xor_avx2(state, dst, src);
|
||||
bytes -= CHACHA20_BLOCK_SIZE * 8;
|
||||
src += CHACHA20_BLOCK_SIZE * 8;
|
||||
dst += CHACHA20_BLOCK_SIZE * 8;
|
||||
state[12] += 8;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
|
||||
chacha20_4block_xor_ssse3(state, dst, src);
|
||||
bytes -= CHACHA20_BLOCK_SIZE * 4;
|
||||
@@ -113,6 +128,10 @@ static int __init chacha20_simd_mod_init(void)
|
||||
if (!cpu_has_ssse3)
|
||||
return -ENODEV;
|
||||
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
chacha20_use_avx2 = cpu_has_avx && cpu_has_avx2 &&
|
||||
cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL);
|
||||
#endif
|
||||
return crypto_register_alg(&alg);
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user