Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto updates from Herbert Xu:
 "API:
   - Add 1472-byte test to tcrypt for IPsec
   - Reintroduced crypto stats interface with numerous changes
   - Support incremental algorithm dumps

  Algorithms:
   - Add xchacha12/20
   - Add nhpoly1305
   - Add adiantum
   - Add streebog hash
   - Mark cts(cbc(aes)) as FIPS allowed

  Drivers:
   - Improve performance of arm64/chacha20
   - Improve performance of x86/chacha20
   - Add NEON-accelerated nhpoly1305
   - Add SSE2 accelerated nhpoly1305
   - Add AVX2 accelerated nhpoly1305
   - Add support for 192/256-bit keys in gcmaes AVX
   - Add SG support in gcmaes AVX
   - ESN for inline IPsec tx in chcr
   - Add support for CryptoCell 703 in ccree
   - Add support for CryptoCell 713 in ccree
   - Add SM4 support in ccree
   - Add SM3 support in ccree
   - Add support for chacha20 in caam/qi2
   - Add support for chacha20 + poly1305 in caam/jr
   - Add support for chacha20 + poly1305 in caam/qi2
   - Add AEAD cipher support in cavium/nitrox"

* 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (130 commits)
  crypto: skcipher - remove remnants of internal IV generators
  crypto: cavium/nitrox - Fix build with !CONFIG_DEBUG_FS
  crypto: salsa20-generic - don't unnecessarily use atomic walk
  crypto: skcipher - add might_sleep() to skcipher_walk_virt()
  crypto: x86/chacha - avoid sleeping under kernel_fpu_begin()
  crypto: cavium/nitrox - Added AEAD cipher support
  crypto: mxc-scc - fix build warnings on ARM64
  crypto: api - document missing stats member
  crypto: user - remove unused dump functions
  crypto: chelsio - Fix wrong error counter increments
  crypto: chelsio - Reset counters on cxgb4 Detach
  crypto: chelsio - Handle PCI shutdown event
  crypto: chelsio - cleanup:send addr as value in function argument
  crypto: chelsio - Use same value for both channel in single WR
  crypto: chelsio - Swap location of AAD and IV sent in WR
  crypto: chelsio - remove set but not used variable 'kctx_len'
  crypto: ux500 - Use proper enum in hash_set_dma_transfer
  crypto: ux500 - Use proper enum in cryp_set_dma_transfer
  crypto: aesni - Add scatter/gather avx stubs, and use them in C
  crypto: aesni - Introduce partial block macro
  ..
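The gcmaes AVX scatter/gather work in this pull replaces the one-shot AVX entry points with an ops table (struct aesni_gcm_tfm_s) whose init/enc_update/dec_update/finalize hooks are selected once at module init and then driven once per data segment. As a rough user-space sketch of that dispatch pattern only; the struct gcm_ops type, the generic_* stubs and the selection below are illustrative stand-ins, not the kernel's API:

#include <stdio.h>
#include <stddef.h>

/*
 * Illustrative sketch of the dispatch-table pattern used by the gcmaes
 * changes below: a struct of init/update/finalize hooks, picked once at
 * startup and then fed data incrementally, segment by segment.  All names
 * here are made up for the example; they are not the kernel's API.
 */
struct gcm_ops {
	const char *name;
	void (*init)(void *ctx);
	void (*update)(void *ctx, const unsigned char *in, size_t len);
	void (*finalize)(void *ctx, unsigned char *tag, size_t tag_len);
};

static void generic_init(void *ctx) { (void)ctx; }

static void generic_update(void *ctx, const unsigned char *in, size_t len)
{
	/* a real implementation would absorb 'len' bytes of this segment */
	(void)ctx; (void)in; (void)len;
}

static void generic_finalize(void *ctx, unsigned char *tag, size_t tag_len)
{
	(void)ctx;
	for (size_t i = 0; i < tag_len; i++)
		tag[i] = 0;	/* placeholder authentication tag */
}

static const struct gcm_ops generic_ops = {
	.name = "generic",
	.init = generic_init,
	.update = generic_update,
	.finalize = generic_finalize,
};

/* Chosen once, e.g. from CPU feature detection; always generic here. */
static const struct gcm_ops *gcm_ops = &generic_ops;

int main(void)
{
	unsigned char msg[] = "example payload";
	unsigned char tag[16];

	gcm_ops->init(NULL);
	gcm_ops->update(NULL, msg, sizeof(msg) - 1);	/* once per SG segment */
	gcm_ops->finalize(NULL, tag, sizeof(tag));
	printf("used %s ops, tag[0]=%d\n", gcm_ops->name, (int)tag[0]);
	return 0;
}

In the gcmaes_crypt_by_sg hunks further down, the kernel additionally drops from the avx_gen4 table to avx_gen2 or SSE when the request is shorter than the AVX_GEN4_OPTSIZE/AVX_GEN2_OPTSIZE thresholds.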
@@ -8,6 +8,7 @@ OBJECT_FILES_NON_STANDARD := y
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
			$(comma)4)$(comma)%ymm2,yes,no)
avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)

@@ -23,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
@@ -46,6 +47,9 @@ obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o
obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o
obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o

obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o

# These modules require assembler to support AVX.
ifeq ($(avx_supported),yes)
	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
@@ -74,7 +78,7 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o
serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o

aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
@@ -84,6 +88,8 @@ aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o
morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o
morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o

nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o

ifeq ($(avx_supported),yes)
	camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
					camellia_aesni_avx_glue.o
@@ -97,10 +103,16 @@ endif

ifeq ($(avx2_supported),yes)
	camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
	chacha20-x86_64-y += chacha20-avx2-x86_64.o
	chacha-x86_64-y += chacha-avx2-x86_64.o
	serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o

	morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o

	nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o
endif

ifeq ($(avx512_supported),yes)
	chacha-x86_64-y += chacha-avx512vl-x86_64.o
endif

aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o

File diff suppressed because it is too large
@@ -84,7 +84,7 @@ struct gcm_context_data {
	u8 current_counter[GCM_BLOCK_LEN];
	u64 partial_block_len;
	u64 unused;
	u8 hash_keys[GCM_BLOCK_LEN * 8];
	u8 hash_keys[GCM_BLOCK_LEN * 16];
};

asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
@@ -175,6 +175,32 @@ asmlinkage void aesni_gcm_finalize(void *ctx,
				   struct gcm_context_data *gdata,
				   u8 *auth_tag, unsigned long auth_tag_len);

static struct aesni_gcm_tfm_s {
	void (*init)(void *ctx,
		     struct gcm_context_data *gdata,
		     u8 *iv,
		     u8 *hash_subkey, const u8 *aad,
		     unsigned long aad_len);
	void (*enc_update)(void *ctx,
			   struct gcm_context_data *gdata, u8 *out,
			   const u8 *in,
			   unsigned long plaintext_len);
	void (*dec_update)(void *ctx,
			   struct gcm_context_data *gdata, u8 *out,
			   const u8 *in,
			   unsigned long ciphertext_len);
	void (*finalize)(void *ctx,
			 struct gcm_context_data *gdata,
			 u8 *auth_tag, unsigned long auth_tag_len);
} *aesni_gcm_tfm;

struct aesni_gcm_tfm_s aesni_gcm_tfm_sse = {
	.init = &aesni_gcm_init,
	.enc_update = &aesni_gcm_enc_update,
	.dec_update = &aesni_gcm_dec_update,
	.finalize = &aesni_gcm_finalize,
};

#ifdef CONFIG_AS_AVX
asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
		void *keys, u8 *out, unsigned int num_bytes);
@@ -183,136 +209,94 @@ asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
|
||||
asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
|
||||
void *keys, u8 *out, unsigned int num_bytes);
|
||||
/*
|
||||
* asmlinkage void aesni_gcm_precomp_avx_gen2()
|
||||
* asmlinkage void aesni_gcm_init_avx_gen2()
|
||||
* gcm_data *my_ctx_data, context data
|
||||
* u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
|
||||
*/
|
||||
asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey);
|
||||
asmlinkage void aesni_gcm_init_avx_gen2(void *my_ctx_data,
|
||||
struct gcm_context_data *gdata,
|
||||
u8 *iv,
|
||||
u8 *hash_subkey,
|
||||
const u8 *aad,
|
||||
unsigned long aad_len);
|
||||
|
||||
asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out,
|
||||
asmlinkage void aesni_gcm_enc_update_avx_gen2(void *ctx,
|
||||
struct gcm_context_data *gdata, u8 *out,
|
||||
const u8 *in, unsigned long plaintext_len);
|
||||
asmlinkage void aesni_gcm_dec_update_avx_gen2(void *ctx,
|
||||
struct gcm_context_data *gdata, u8 *out,
|
||||
const u8 *in,
|
||||
unsigned long ciphertext_len);
|
||||
asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx,
|
||||
struct gcm_context_data *gdata,
|
||||
u8 *auth_tag, unsigned long auth_tag_len);
|
||||
|
||||
asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx,
|
||||
struct gcm_context_data *gdata, u8 *out,
|
||||
const u8 *in, unsigned long plaintext_len, u8 *iv,
|
||||
const u8 *aad, unsigned long aad_len,
|
||||
u8 *auth_tag, unsigned long auth_tag_len);
|
||||
|
||||
asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out,
|
||||
asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx,
|
||||
struct gcm_context_data *gdata, u8 *out,
|
||||
const u8 *in, unsigned long ciphertext_len, u8 *iv,
|
||||
const u8 *aad, unsigned long aad_len,
|
||||
u8 *auth_tag, unsigned long auth_tag_len);
|
||||
|
||||
static void aesni_gcm_enc_avx(void *ctx,
|
||||
struct gcm_context_data *data, u8 *out,
|
||||
const u8 *in, unsigned long plaintext_len, u8 *iv,
|
||||
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
|
||||
u8 *auth_tag, unsigned long auth_tag_len)
|
||||
{
|
||||
struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
|
||||
if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)){
|
||||
aesni_gcm_enc(ctx, data, out, in,
|
||||
plaintext_len, iv, hash_subkey, aad,
|
||||
aad_len, auth_tag, auth_tag_len);
|
||||
} else {
|
||||
aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
|
||||
aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
|
||||
aad_len, auth_tag, auth_tag_len);
|
||||
}
|
||||
}
|
||||
struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = {
|
||||
.init = &aesni_gcm_init_avx_gen2,
|
||||
.enc_update = &aesni_gcm_enc_update_avx_gen2,
|
||||
.dec_update = &aesni_gcm_dec_update_avx_gen2,
|
||||
.finalize = &aesni_gcm_finalize_avx_gen2,
|
||||
};
|
||||
|
||||
static void aesni_gcm_dec_avx(void *ctx,
|
||||
struct gcm_context_data *data, u8 *out,
|
||||
const u8 *in, unsigned long ciphertext_len, u8 *iv,
|
||||
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
|
||||
u8 *auth_tag, unsigned long auth_tag_len)
|
||||
{
|
||||
struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
|
||||
if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
|
||||
aesni_gcm_dec(ctx, data, out, in,
|
||||
ciphertext_len, iv, hash_subkey, aad,
|
||||
aad_len, auth_tag, auth_tag_len);
|
||||
} else {
|
||||
aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
|
||||
aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
|
||||
aad_len, auth_tag, auth_tag_len);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
/*
|
||||
* asmlinkage void aesni_gcm_precomp_avx_gen4()
|
||||
* asmlinkage void aesni_gcm_init_avx_gen4()
|
||||
* gcm_data *my_ctx_data, context data
|
||||
* u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
|
||||
*/
|
||||
asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey);
|
||||
asmlinkage void aesni_gcm_init_avx_gen4(void *my_ctx_data,
|
||||
struct gcm_context_data *gdata,
|
||||
u8 *iv,
|
||||
u8 *hash_subkey,
|
||||
const u8 *aad,
|
||||
unsigned long aad_len);
|
||||
|
||||
asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, u8 *out,
|
||||
asmlinkage void aesni_gcm_enc_update_avx_gen4(void *ctx,
|
||||
struct gcm_context_data *gdata, u8 *out,
|
||||
const u8 *in, unsigned long plaintext_len);
|
||||
asmlinkage void aesni_gcm_dec_update_avx_gen4(void *ctx,
|
||||
struct gcm_context_data *gdata, u8 *out,
|
||||
const u8 *in,
|
||||
unsigned long ciphertext_len);
|
||||
asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx,
|
||||
struct gcm_context_data *gdata,
|
||||
u8 *auth_tag, unsigned long auth_tag_len);
|
||||
|
||||
asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx,
|
||||
struct gcm_context_data *gdata, u8 *out,
|
||||
const u8 *in, unsigned long plaintext_len, u8 *iv,
|
||||
const u8 *aad, unsigned long aad_len,
|
||||
u8 *auth_tag, unsigned long auth_tag_len);
|
||||
|
||||
asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out,
|
||||
asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx,
|
||||
struct gcm_context_data *gdata, u8 *out,
|
||||
const u8 *in, unsigned long ciphertext_len, u8 *iv,
|
||||
const u8 *aad, unsigned long aad_len,
|
||||
u8 *auth_tag, unsigned long auth_tag_len);
|
||||
|
||||
static void aesni_gcm_enc_avx2(void *ctx,
|
||||
struct gcm_context_data *data, u8 *out,
|
||||
const u8 *in, unsigned long plaintext_len, u8 *iv,
|
||||
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
|
||||
u8 *auth_tag, unsigned long auth_tag_len)
|
||||
{
|
||||
struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
|
||||
if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
|
||||
aesni_gcm_enc(ctx, data, out, in,
|
||||
plaintext_len, iv, hash_subkey, aad,
|
||||
aad_len, auth_tag, auth_tag_len);
|
||||
} else if (plaintext_len < AVX_GEN4_OPTSIZE) {
|
||||
aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
|
||||
aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
|
||||
aad_len, auth_tag, auth_tag_len);
|
||||
} else {
|
||||
aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
|
||||
aesni_gcm_enc_avx_gen4(ctx, out, in, plaintext_len, iv, aad,
|
||||
aad_len, auth_tag, auth_tag_len);
|
||||
}
|
||||
}
|
||||
struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen4 = {
|
||||
.init = &aesni_gcm_init_avx_gen4,
|
||||
.enc_update = &aesni_gcm_enc_update_avx_gen4,
|
||||
.dec_update = &aesni_gcm_dec_update_avx_gen4,
|
||||
.finalize = &aesni_gcm_finalize_avx_gen4,
|
||||
};
|
||||
|
||||
static void aesni_gcm_dec_avx2(void *ctx,
|
||||
struct gcm_context_data *data, u8 *out,
|
||||
const u8 *in, unsigned long ciphertext_len, u8 *iv,
|
||||
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
|
||||
u8 *auth_tag, unsigned long auth_tag_len)
|
||||
{
|
||||
struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
|
||||
if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
|
||||
aesni_gcm_dec(ctx, data, out, in,
|
||||
ciphertext_len, iv, hash_subkey,
|
||||
aad, aad_len, auth_tag, auth_tag_len);
|
||||
} else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
|
||||
aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
|
||||
aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
|
||||
aad_len, auth_tag, auth_tag_len);
|
||||
} else {
|
||||
aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
|
||||
aesni_gcm_dec_avx_gen4(ctx, out, in, ciphertext_len, iv, aad,
|
||||
aad_len, auth_tag, auth_tag_len);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
static void (*aesni_gcm_enc_tfm)(void *ctx,
|
||||
struct gcm_context_data *data, u8 *out,
|
||||
const u8 *in, unsigned long plaintext_len,
|
||||
u8 *iv, u8 *hash_subkey, const u8 *aad,
|
||||
unsigned long aad_len, u8 *auth_tag,
|
||||
unsigned long auth_tag_len);
|
||||
|
||||
static void (*aesni_gcm_dec_tfm)(void *ctx,
|
||||
struct gcm_context_data *data, u8 *out,
|
||||
const u8 *in, unsigned long ciphertext_len,
|
||||
u8 *iv, u8 *hash_subkey, const u8 *aad,
|
||||
unsigned long aad_len, u8 *auth_tag,
|
||||
unsigned long auth_tag_len);
|
||||
|
||||
static inline struct
|
||||
aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
|
||||
{
|
||||
@@ -794,6 +778,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
|
||||
{
|
||||
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
|
||||
unsigned long auth_tag_len = crypto_aead_authsize(tfm);
|
||||
struct aesni_gcm_tfm_s *gcm_tfm = aesni_gcm_tfm;
|
||||
struct gcm_context_data data AESNI_ALIGN_ATTR;
|
||||
struct scatter_walk dst_sg_walk = {};
|
||||
unsigned long left = req->cryptlen;
|
||||
@@ -811,6 +796,15 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
|
||||
if (!enc)
|
||||
left -= auth_tag_len;
|
||||
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
if (left < AVX_GEN4_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen4)
|
||||
gcm_tfm = &aesni_gcm_tfm_avx_gen2;
|
||||
#endif
|
||||
#ifdef CONFIG_AS_AVX
|
||||
if (left < AVX_GEN2_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen2)
|
||||
gcm_tfm = &aesni_gcm_tfm_sse;
|
||||
#endif
|
||||
|
||||
/* Linearize assoc, if not already linear */
|
||||
if (req->src->length >= assoclen && req->src->length &&
|
||||
(!PageHighMem(sg_page(req->src)) ||
|
||||
@@ -835,7 +829,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
|
||||
}
|
||||
|
||||
kernel_fpu_begin();
|
||||
aesni_gcm_init(aes_ctx, &data, iv,
|
||||
gcm_tfm->init(aes_ctx, &data, iv,
|
||||
hash_subkey, assoc, assoclen);
|
||||
if (req->src != req->dst) {
|
||||
while (left) {
|
||||
@@ -846,10 +840,10 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
|
||||
len = min(srclen, dstlen);
|
||||
if (len) {
|
||||
if (enc)
|
||||
aesni_gcm_enc_update(aes_ctx, &data,
|
||||
gcm_tfm->enc_update(aes_ctx, &data,
|
||||
dst, src, len);
|
||||
else
|
||||
aesni_gcm_dec_update(aes_ctx, &data,
|
||||
gcm_tfm->dec_update(aes_ctx, &data,
|
||||
dst, src, len);
|
||||
}
|
||||
left -= len;
|
||||
@@ -867,10 +861,10 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
|
||||
len = scatterwalk_clamp(&src_sg_walk, left);
|
||||
if (len) {
|
||||
if (enc)
|
||||
aesni_gcm_enc_update(aes_ctx, &data,
|
||||
gcm_tfm->enc_update(aes_ctx, &data,
|
||||
src, src, len);
|
||||
else
|
||||
aesni_gcm_dec_update(aes_ctx, &data,
|
||||
gcm_tfm->dec_update(aes_ctx, &data,
|
||||
src, src, len);
|
||||
}
|
||||
left -= len;
|
||||
@@ -879,7 +873,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
|
||||
scatterwalk_done(&src_sg_walk, 1, left);
|
||||
}
|
||||
}
|
||||
aesni_gcm_finalize(aes_ctx, &data, authTag, auth_tag_len);
|
||||
gcm_tfm->finalize(aes_ctx, &data, authTag, auth_tag_len);
|
||||
kernel_fpu_end();
|
||||
|
||||
if (!assocmem)
|
||||
@@ -912,147 +906,15 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
|
||||
static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen,
|
||||
u8 *hash_subkey, u8 *iv, void *aes_ctx)
|
||||
{
|
||||
u8 one_entry_in_sg = 0;
|
||||
u8 *src, *dst, *assoc;
|
||||
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
|
||||
unsigned long auth_tag_len = crypto_aead_authsize(tfm);
|
||||
struct scatter_walk src_sg_walk;
|
||||
struct scatter_walk dst_sg_walk = {};
|
||||
struct gcm_context_data data AESNI_ALIGN_ATTR;
|
||||
|
||||
if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 ||
|
||||
aesni_gcm_enc_tfm == aesni_gcm_enc ||
|
||||
req->cryptlen < AVX_GEN2_OPTSIZE) {
|
||||
return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv,
|
||||
aes_ctx);
|
||||
}
|
||||
if (sg_is_last(req->src) &&
|
||||
(!PageHighMem(sg_page(req->src)) ||
|
||||
req->src->offset + req->src->length <= PAGE_SIZE) &&
|
||||
sg_is_last(req->dst) &&
|
||||
(!PageHighMem(sg_page(req->dst)) ||
|
||||
req->dst->offset + req->dst->length <= PAGE_SIZE)) {
|
||||
one_entry_in_sg = 1;
|
||||
scatterwalk_start(&src_sg_walk, req->src);
|
||||
assoc = scatterwalk_map(&src_sg_walk);
|
||||
src = assoc + req->assoclen;
|
||||
dst = src;
|
||||
if (unlikely(req->src != req->dst)) {
|
||||
scatterwalk_start(&dst_sg_walk, req->dst);
|
||||
dst = scatterwalk_map(&dst_sg_walk) + req->assoclen;
|
||||
}
|
||||
} else {
|
||||
/* Allocate memory for src, dst, assoc */
|
||||
assoc = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
|
||||
GFP_ATOMIC);
|
||||
if (unlikely(!assoc))
|
||||
return -ENOMEM;
|
||||
scatterwalk_map_and_copy(assoc, req->src, 0,
|
||||
req->assoclen + req->cryptlen, 0);
|
||||
src = assoc + req->assoclen;
|
||||
dst = src;
|
||||
}
|
||||
|
||||
kernel_fpu_begin();
|
||||
aesni_gcm_enc_tfm(aes_ctx, &data, dst, src, req->cryptlen, iv,
|
||||
hash_subkey, assoc, assoclen,
|
||||
dst + req->cryptlen, auth_tag_len);
|
||||
kernel_fpu_end();
|
||||
|
||||
/* The authTag (aka the Integrity Check Value) needs to be written
|
||||
* back to the packet. */
|
||||
if (one_entry_in_sg) {
|
||||
if (unlikely(req->src != req->dst)) {
|
||||
scatterwalk_unmap(dst - req->assoclen);
|
||||
scatterwalk_advance(&dst_sg_walk, req->dst->length);
|
||||
scatterwalk_done(&dst_sg_walk, 1, 0);
|
||||
}
|
||||
scatterwalk_unmap(assoc);
|
||||
scatterwalk_advance(&src_sg_walk, req->src->length);
|
||||
scatterwalk_done(&src_sg_walk, req->src == req->dst, 0);
|
||||
} else {
|
||||
scatterwalk_map_and_copy(dst, req->dst, req->assoclen,
|
||||
req->cryptlen + auth_tag_len, 1);
|
||||
kfree(assoc);
|
||||
}
|
||||
return 0;
|
||||
return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv,
|
||||
aes_ctx);
|
||||
}
|
||||
|
||||
static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen,
|
||||
u8 *hash_subkey, u8 *iv, void *aes_ctx)
|
||||
{
|
||||
u8 one_entry_in_sg = 0;
|
||||
u8 *src, *dst, *assoc;
|
||||
unsigned long tempCipherLen = 0;
|
||||
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
|
||||
unsigned long auth_tag_len = crypto_aead_authsize(tfm);
|
||||
u8 authTag[16];
|
||||
struct scatter_walk src_sg_walk;
|
||||
struct scatter_walk dst_sg_walk = {};
|
||||
struct gcm_context_data data AESNI_ALIGN_ATTR;
|
||||
int retval = 0;
|
||||
|
||||
if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 ||
|
||||
aesni_gcm_enc_tfm == aesni_gcm_enc ||
|
||||
req->cryptlen < AVX_GEN2_OPTSIZE) {
|
||||
return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv,
|
||||
aes_ctx);
|
||||
}
|
||||
tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
|
||||
|
||||
if (sg_is_last(req->src) &&
|
||||
(!PageHighMem(sg_page(req->src)) ||
|
||||
req->src->offset + req->src->length <= PAGE_SIZE) &&
|
||||
sg_is_last(req->dst) && req->dst->length &&
|
||||
(!PageHighMem(sg_page(req->dst)) ||
|
||||
req->dst->offset + req->dst->length <= PAGE_SIZE)) {
|
||||
one_entry_in_sg = 1;
|
||||
scatterwalk_start(&src_sg_walk, req->src);
|
||||
assoc = scatterwalk_map(&src_sg_walk);
|
||||
src = assoc + req->assoclen;
|
||||
dst = src;
|
||||
if (unlikely(req->src != req->dst)) {
|
||||
scatterwalk_start(&dst_sg_walk, req->dst);
|
||||
dst = scatterwalk_map(&dst_sg_walk) + req->assoclen;
|
||||
}
|
||||
} else {
|
||||
/* Allocate memory for src, dst, assoc */
|
||||
assoc = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
|
||||
if (!assoc)
|
||||
return -ENOMEM;
|
||||
scatterwalk_map_and_copy(assoc, req->src, 0,
|
||||
req->assoclen + req->cryptlen, 0);
|
||||
src = assoc + req->assoclen;
|
||||
dst = src;
|
||||
}
|
||||
|
||||
|
||||
kernel_fpu_begin();
|
||||
aesni_gcm_dec_tfm(aes_ctx, &data, dst, src, tempCipherLen, iv,
|
||||
hash_subkey, assoc, assoclen,
|
||||
authTag, auth_tag_len);
|
||||
kernel_fpu_end();
|
||||
|
||||
/* Compare generated tag with passed in tag. */
|
||||
retval = crypto_memneq(src + tempCipherLen, authTag, auth_tag_len) ?
|
||||
-EBADMSG : 0;
|
||||
|
||||
if (one_entry_in_sg) {
|
||||
if (unlikely(req->src != req->dst)) {
|
||||
scatterwalk_unmap(dst - req->assoclen);
|
||||
scatterwalk_advance(&dst_sg_walk, req->dst->length);
|
||||
scatterwalk_done(&dst_sg_walk, 1, 0);
|
||||
}
|
||||
scatterwalk_unmap(assoc);
|
||||
scatterwalk_advance(&src_sg_walk, req->src->length);
|
||||
scatterwalk_done(&src_sg_walk, req->src == req->dst, 0);
|
||||
} else {
|
||||
scatterwalk_map_and_copy(dst, req->dst, req->assoclen,
|
||||
tempCipherLen, 1);
|
||||
kfree(assoc);
|
||||
}
|
||||
return retval;
|
||||
|
||||
return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv,
|
||||
aes_ctx);
|
||||
}
|
||||
|
||||
static int helper_rfc4106_encrypt(struct aead_request *req)
|
||||
@@ -1420,21 +1282,18 @@ static int __init aesni_init(void)
#ifdef CONFIG_AS_AVX2
	if (boot_cpu_has(X86_FEATURE_AVX2)) {
		pr_info("AVX2 version of gcm_enc/dec engaged.\n");
		aesni_gcm_enc_tfm = aesni_gcm_enc_avx2;
		aesni_gcm_dec_tfm = aesni_gcm_dec_avx2;
		aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen4;
	} else
#endif
#ifdef CONFIG_AS_AVX
	if (boot_cpu_has(X86_FEATURE_AVX)) {
		pr_info("AVX version of gcm_enc/dec engaged.\n");
		aesni_gcm_enc_tfm = aesni_gcm_enc_avx;
		aesni_gcm_dec_tfm = aesni_gcm_dec_avx;
		aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen2;
	} else
#endif
	{
		pr_info("SSE version of gcm_enc/dec engaged.\n");
		aesni_gcm_enc_tfm = aesni_gcm_enc;
		aesni_gcm_dec_tfm = aesni_gcm_dec;
		aesni_gcm_tfm = &aesni_gcm_tfm_sse;
	}
	aesni_ctr_enc_tfm = aesni_ctr_enc;
#ifdef CONFIG_AS_AVX

arch/x86/crypto/chacha-avx2-x86_64.S (new file, 1025 lines): file diff suppressed because it is too large
arch/x86/crypto/chacha-avx512vl-x86_64.S (new file, 836 lines):
@@ -0,0 +1,836 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0+ */
|
||||
/*
|
||||
* ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
|
||||
*
|
||||
* Copyright (C) 2018 Martin Willi
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
|
||||
.align 32
|
||||
CTR2BL: .octa 0x00000000000000000000000000000000
|
||||
.octa 0x00000000000000000000000000000001
|
||||
|
||||
.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
|
||||
.align 32
|
||||
CTR4BL: .octa 0x00000000000000000000000000000002
|
||||
.octa 0x00000000000000000000000000000003
|
||||
|
||||
.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
|
||||
.align 32
|
||||
CTR8BL: .octa 0x00000003000000020000000100000000
|
||||
.octa 0x00000007000000060000000500000004
|
||||
|
||||
.text
|
||||
|
||||
ENTRY(chacha_2block_xor_avx512vl)
|
||||
# %rdi: Input state matrix, s
|
||||
# %rsi: up to 2 data blocks output, o
|
||||
# %rdx: up to 2 data blocks input, i
|
||||
# %rcx: input/output length in bytes
|
||||
# %r8d: nrounds
|
||||
|
||||
# This function encrypts two ChaCha blocks by loading the state
|
||||
# matrix twice across four AVX registers. It performs matrix operations
|
||||
# on four words in each matrix in parallel, but requires shuffling to
|
||||
# rearrange the words after each round.
|
||||
|
||||
vzeroupper
|
||||
|
||||
# x0..3[0-2] = s0..3
|
||||
vbroadcasti128 0x00(%rdi),%ymm0
|
||||
vbroadcasti128 0x10(%rdi),%ymm1
|
||||
vbroadcasti128 0x20(%rdi),%ymm2
|
||||
vbroadcasti128 0x30(%rdi),%ymm3
|
||||
|
||||
vpaddd CTR2BL(%rip),%ymm3,%ymm3
|
||||
|
||||
vmovdqa %ymm0,%ymm8
|
||||
vmovdqa %ymm1,%ymm9
|
||||
vmovdqa %ymm2,%ymm10
|
||||
vmovdqa %ymm3,%ymm11
|
||||
|
||||
.Ldoubleround:
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $16,%ymm3,%ymm3
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $12,%ymm1,%ymm1
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $8,%ymm3,%ymm3
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $7,%ymm1,%ymm1
|
||||
|
||||
# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
|
||||
vpshufd $0x39,%ymm1,%ymm1
|
||||
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||
vpshufd $0x4e,%ymm2,%ymm2
|
||||
# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
|
||||
vpshufd $0x93,%ymm3,%ymm3
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $16,%ymm3,%ymm3
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $12,%ymm1,%ymm1
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $8,%ymm3,%ymm3
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $7,%ymm1,%ymm1
|
||||
|
||||
# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
|
||||
vpshufd $0x93,%ymm1,%ymm1
|
||||
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||
vpshufd $0x4e,%ymm2,%ymm2
|
||||
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
||||
vpshufd $0x39,%ymm3,%ymm3
|
||||
|
||||
sub $2,%r8d
|
||||
jnz .Ldoubleround
|
||||
|
||||
# o0 = i0 ^ (x0 + s0)
|
||||
vpaddd %ymm8,%ymm0,%ymm7
|
||||
cmp $0x10,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x00(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x00(%rsi)
|
||||
vextracti128 $1,%ymm7,%xmm0
|
||||
# o1 = i1 ^ (x1 + s1)
|
||||
vpaddd %ymm9,%ymm1,%ymm7
|
||||
cmp $0x20,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x10(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x10(%rsi)
|
||||
vextracti128 $1,%ymm7,%xmm1
|
||||
# o2 = i2 ^ (x2 + s2)
|
||||
vpaddd %ymm10,%ymm2,%ymm7
|
||||
cmp $0x30,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x20(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x20(%rsi)
|
||||
vextracti128 $1,%ymm7,%xmm2
|
||||
# o3 = i3 ^ (x3 + s3)
|
||||
vpaddd %ymm11,%ymm3,%ymm7
|
||||
cmp $0x40,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x30(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x30(%rsi)
|
||||
vextracti128 $1,%ymm7,%xmm3
|
||||
|
||||
# xor and write second block
|
||||
vmovdqa %xmm0,%xmm7
|
||||
cmp $0x50,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x40(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x40(%rsi)
|
||||
|
||||
vmovdqa %xmm1,%xmm7
|
||||
cmp $0x60,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x50(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x50(%rsi)
|
||||
|
||||
vmovdqa %xmm2,%xmm7
|
||||
cmp $0x70,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x60(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x60(%rsi)
|
||||
|
||||
vmovdqa %xmm3,%xmm7
|
||||
cmp $0x80,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x70(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x70(%rsi)
|
||||
|
||||
.Ldone2:
|
||||
vzeroupper
|
||||
ret
|
||||
|
||||
.Lxorpart2:
|
||||
# xor remaining bytes from partial register into output
|
||||
mov %rcx,%rax
|
||||
and $0xf,%rcx
|
||||
jz .Ldone8
|
||||
mov %rax,%r9
|
||||
and $~0xf,%r9
|
||||
|
||||
mov $1,%rax
|
||||
shld %cl,%rax,%rax
|
||||
sub $1,%rax
|
||||
kmovq %rax,%k1
|
||||
|
||||
vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
|
||||
vpxord %xmm7,%xmm1,%xmm1
|
||||
vmovdqu8 %xmm1,(%rsi,%r9){%k1}
|
||||
|
||||
jmp .Ldone2
|
||||
|
||||
ENDPROC(chacha_2block_xor_avx512vl)
|
||||
|
||||
ENTRY(chacha_4block_xor_avx512vl)
|
||||
# %rdi: Input state matrix, s
|
||||
# %rsi: up to 4 data blocks output, o
|
||||
# %rdx: up to 4 data blocks input, i
|
||||
# %rcx: input/output length in bytes
|
||||
# %r8d: nrounds
|
||||
|
||||
# This function encrypts four ChaCha blocks by loading the state
|
||||
# matrix four times across eight AVX registers. It performs matrix
|
||||
# operations on four words in two matrices in parallel, sequentially
|
||||
# to the operations on the four words of the other two matrices. The
|
||||
# required word shuffling has a rather high latency, so we can do the
|
||||
# arithmetic on two matrix-pairs without much slowdown.
|
||||
|
||||
vzeroupper
|
||||
|
||||
# x0..3[0-4] = s0..3
|
||||
vbroadcasti128 0x00(%rdi),%ymm0
|
||||
vbroadcasti128 0x10(%rdi),%ymm1
|
||||
vbroadcasti128 0x20(%rdi),%ymm2
|
||||
vbroadcasti128 0x30(%rdi),%ymm3
|
||||
|
||||
vmovdqa %ymm0,%ymm4
|
||||
vmovdqa %ymm1,%ymm5
|
||||
vmovdqa %ymm2,%ymm6
|
||||
vmovdqa %ymm3,%ymm7
|
||||
|
||||
vpaddd CTR2BL(%rip),%ymm3,%ymm3
|
||||
vpaddd CTR4BL(%rip),%ymm7,%ymm7
|
||||
|
||||
vmovdqa %ymm0,%ymm11
|
||||
vmovdqa %ymm1,%ymm12
|
||||
vmovdqa %ymm2,%ymm13
|
||||
vmovdqa %ymm3,%ymm14
|
||||
vmovdqa %ymm7,%ymm15
|
||||
|
||||
.Ldoubleround4:
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $16,%ymm3,%ymm3
|
||||
|
||||
vpaddd %ymm5,%ymm4,%ymm4
|
||||
vpxord %ymm4,%ymm7,%ymm7
|
||||
vprold $16,%ymm7,%ymm7
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $12,%ymm1,%ymm1
|
||||
|
||||
vpaddd %ymm7,%ymm6,%ymm6
|
||||
vpxord %ymm6,%ymm5,%ymm5
|
||||
vprold $12,%ymm5,%ymm5
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $8,%ymm3,%ymm3
|
||||
|
||||
vpaddd %ymm5,%ymm4,%ymm4
|
||||
vpxord %ymm4,%ymm7,%ymm7
|
||||
vprold $8,%ymm7,%ymm7
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $7,%ymm1,%ymm1
|
||||
|
||||
vpaddd %ymm7,%ymm6,%ymm6
|
||||
vpxord %ymm6,%ymm5,%ymm5
|
||||
vprold $7,%ymm5,%ymm5
|
||||
|
||||
# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
|
||||
vpshufd $0x39,%ymm1,%ymm1
|
||||
vpshufd $0x39,%ymm5,%ymm5
|
||||
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||
vpshufd $0x4e,%ymm2,%ymm2
|
||||
vpshufd $0x4e,%ymm6,%ymm6
|
||||
# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
|
||||
vpshufd $0x93,%ymm3,%ymm3
|
||||
vpshufd $0x93,%ymm7,%ymm7
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $16,%ymm3,%ymm3
|
||||
|
||||
vpaddd %ymm5,%ymm4,%ymm4
|
||||
vpxord %ymm4,%ymm7,%ymm7
|
||||
vprold $16,%ymm7,%ymm7
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $12,%ymm1,%ymm1
|
||||
|
||||
vpaddd %ymm7,%ymm6,%ymm6
|
||||
vpxord %ymm6,%ymm5,%ymm5
|
||||
vprold $12,%ymm5,%ymm5
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $8,%ymm3,%ymm3
|
||||
|
||||
vpaddd %ymm5,%ymm4,%ymm4
|
||||
vpxord %ymm4,%ymm7,%ymm7
|
||||
vprold $8,%ymm7,%ymm7
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $7,%ymm1,%ymm1
|
||||
|
||||
vpaddd %ymm7,%ymm6,%ymm6
|
||||
vpxord %ymm6,%ymm5,%ymm5
|
||||
vprold $7,%ymm5,%ymm5
|
||||
|
||||
# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
|
||||
vpshufd $0x93,%ymm1,%ymm1
|
||||
vpshufd $0x93,%ymm5,%ymm5
|
||||
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||
vpshufd $0x4e,%ymm2,%ymm2
|
||||
vpshufd $0x4e,%ymm6,%ymm6
|
||||
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
||||
vpshufd $0x39,%ymm3,%ymm3
|
||||
vpshufd $0x39,%ymm7,%ymm7
|
||||
|
||||
sub $2,%r8d
|
||||
jnz .Ldoubleround4
|
||||
|
||||
# o0 = i0 ^ (x0 + s0), first block
|
||||
vpaddd %ymm11,%ymm0,%ymm10
|
||||
cmp $0x10,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x00(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x00(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm0
|
||||
# o1 = i1 ^ (x1 + s1), first block
|
||||
vpaddd %ymm12,%ymm1,%ymm10
|
||||
cmp $0x20,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x10(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x10(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm1
|
||||
# o2 = i2 ^ (x2 + s2), first block
|
||||
vpaddd %ymm13,%ymm2,%ymm10
|
||||
cmp $0x30,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x20(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x20(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm2
|
||||
# o3 = i3 ^ (x3 + s3), first block
|
||||
vpaddd %ymm14,%ymm3,%ymm10
|
||||
cmp $0x40,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x30(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x30(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm3
|
||||
|
||||
# xor and write second block
|
||||
vmovdqa %xmm0,%xmm10
|
||||
cmp $0x50,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x40(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x40(%rsi)
|
||||
|
||||
vmovdqa %xmm1,%xmm10
|
||||
cmp $0x60,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x50(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x50(%rsi)
|
||||
|
||||
vmovdqa %xmm2,%xmm10
|
||||
cmp $0x70,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x60(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x60(%rsi)
|
||||
|
||||
vmovdqa %xmm3,%xmm10
|
||||
cmp $0x80,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x70(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x70(%rsi)
|
||||
|
||||
# o0 = i0 ^ (x0 + s0), third block
|
||||
vpaddd %ymm11,%ymm4,%ymm10
|
||||
cmp $0x90,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x80(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x80(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm4
|
||||
# o1 = i1 ^ (x1 + s1), third block
|
||||
vpaddd %ymm12,%ymm5,%ymm10
|
||||
cmp $0xa0,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x90(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x90(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm5
|
||||
# o2 = i2 ^ (x2 + s2), third block
|
||||
vpaddd %ymm13,%ymm6,%ymm10
|
||||
cmp $0xb0,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0xa0(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0xa0(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm6
|
||||
# o3 = i3 ^ (x3 + s3), third block
|
||||
vpaddd %ymm15,%ymm7,%ymm10
|
||||
cmp $0xc0,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0xb0(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0xb0(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm7
|
||||
|
||||
# xor and write fourth block
|
||||
vmovdqa %xmm4,%xmm10
|
||||
cmp $0xd0,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0xc0(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0xc0(%rsi)
|
||||
|
||||
vmovdqa %xmm5,%xmm10
|
||||
cmp $0xe0,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0xd0(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0xd0(%rsi)
|
||||
|
||||
vmovdqa %xmm6,%xmm10
|
||||
cmp $0xf0,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0xe0(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0xe0(%rsi)
|
||||
|
||||
vmovdqa %xmm7,%xmm10
|
||||
cmp $0x100,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0xf0(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0xf0(%rsi)
|
||||
|
||||
.Ldone4:
|
||||
vzeroupper
|
||||
ret
|
||||
|
||||
.Lxorpart4:
|
||||
# xor remaining bytes from partial register into output
|
||||
mov %rcx,%rax
|
||||
and $0xf,%rcx
|
||||
jz .Ldone8
|
||||
mov %rax,%r9
|
||||
and $~0xf,%r9
|
||||
|
||||
mov $1,%rax
|
||||
shld %cl,%rax,%rax
|
||||
sub $1,%rax
|
||||
kmovq %rax,%k1
|
||||
|
||||
vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
|
||||
vpxord %xmm10,%xmm1,%xmm1
|
||||
vmovdqu8 %xmm1,(%rsi,%r9){%k1}
|
||||
|
||||
jmp .Ldone4
|
||||
|
||||
ENDPROC(chacha_4block_xor_avx512vl)
|
||||
|
||||
ENTRY(chacha_8block_xor_avx512vl)
|
||||
# %rdi: Input state matrix, s
|
||||
# %rsi: up to 8 data blocks output, o
|
||||
# %rdx: up to 8 data blocks input, i
|
||||
# %rcx: input/output length in bytes
|
||||
# %r8d: nrounds
|
||||
|
||||
# This function encrypts eight consecutive ChaCha blocks by loading
|
||||
# the state matrix in AVX registers eight times. Compared to AVX2, this
|
||||
# mostly benefits from the new rotate instructions in VL and the
|
||||
# additional registers.
|
||||
|
||||
vzeroupper
|
||||
|
||||
# x0..15[0-7] = s[0..15]
|
||||
vpbroadcastd 0x00(%rdi),%ymm0
|
||||
vpbroadcastd 0x04(%rdi),%ymm1
|
||||
vpbroadcastd 0x08(%rdi),%ymm2
|
||||
vpbroadcastd 0x0c(%rdi),%ymm3
|
||||
vpbroadcastd 0x10(%rdi),%ymm4
|
||||
vpbroadcastd 0x14(%rdi),%ymm5
|
||||
vpbroadcastd 0x18(%rdi),%ymm6
|
||||
vpbroadcastd 0x1c(%rdi),%ymm7
|
||||
vpbroadcastd 0x20(%rdi),%ymm8
|
||||
vpbroadcastd 0x24(%rdi),%ymm9
|
||||
vpbroadcastd 0x28(%rdi),%ymm10
|
||||
vpbroadcastd 0x2c(%rdi),%ymm11
|
||||
vpbroadcastd 0x30(%rdi),%ymm12
|
||||
vpbroadcastd 0x34(%rdi),%ymm13
|
||||
vpbroadcastd 0x38(%rdi),%ymm14
|
||||
vpbroadcastd 0x3c(%rdi),%ymm15
|
||||
|
||||
# x12 += counter values 0-3
|
||||
vpaddd CTR8BL(%rip),%ymm12,%ymm12
|
||||
|
||||
vmovdqa64 %ymm0,%ymm16
|
||||
vmovdqa64 %ymm1,%ymm17
|
||||
vmovdqa64 %ymm2,%ymm18
|
||||
vmovdqa64 %ymm3,%ymm19
|
||||
vmovdqa64 %ymm4,%ymm20
|
||||
vmovdqa64 %ymm5,%ymm21
|
||||
vmovdqa64 %ymm6,%ymm22
|
||||
vmovdqa64 %ymm7,%ymm23
|
||||
vmovdqa64 %ymm8,%ymm24
|
||||
vmovdqa64 %ymm9,%ymm25
|
||||
vmovdqa64 %ymm10,%ymm26
|
||||
vmovdqa64 %ymm11,%ymm27
|
||||
vmovdqa64 %ymm12,%ymm28
|
||||
vmovdqa64 %ymm13,%ymm29
|
||||
vmovdqa64 %ymm14,%ymm30
|
||||
vmovdqa64 %ymm15,%ymm31
|
||||
|
||||
.Ldoubleround8:
|
||||
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
||||
vpaddd %ymm0,%ymm4,%ymm0
|
||||
vpxord %ymm0,%ymm12,%ymm12
|
||||
vprold $16,%ymm12,%ymm12
|
||||
# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
|
||||
vpaddd %ymm1,%ymm5,%ymm1
|
||||
vpxord %ymm1,%ymm13,%ymm13
|
||||
vprold $16,%ymm13,%ymm13
|
||||
# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
|
||||
vpaddd %ymm2,%ymm6,%ymm2
|
||||
vpxord %ymm2,%ymm14,%ymm14
|
||||
vprold $16,%ymm14,%ymm14
|
||||
# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
|
||||
vpaddd %ymm3,%ymm7,%ymm3
|
||||
vpxord %ymm3,%ymm15,%ymm15
|
||||
vprold $16,%ymm15,%ymm15
|
||||
|
||||
# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
|
||||
vpaddd %ymm12,%ymm8,%ymm8
|
||||
vpxord %ymm8,%ymm4,%ymm4
|
||||
vprold $12,%ymm4,%ymm4
|
||||
# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
|
||||
vpaddd %ymm13,%ymm9,%ymm9
|
||||
vpxord %ymm9,%ymm5,%ymm5
|
||||
vprold $12,%ymm5,%ymm5
|
||||
# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
|
||||
vpaddd %ymm14,%ymm10,%ymm10
|
||||
vpxord %ymm10,%ymm6,%ymm6
|
||||
vprold $12,%ymm6,%ymm6
|
||||
# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
|
||||
vpaddd %ymm15,%ymm11,%ymm11
|
||||
vpxord %ymm11,%ymm7,%ymm7
|
||||
vprold $12,%ymm7,%ymm7
|
||||
|
||||
# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
|
||||
vpaddd %ymm0,%ymm4,%ymm0
|
||||
vpxord %ymm0,%ymm12,%ymm12
|
||||
vprold $8,%ymm12,%ymm12
|
||||
# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
|
||||
vpaddd %ymm1,%ymm5,%ymm1
|
||||
vpxord %ymm1,%ymm13,%ymm13
|
||||
vprold $8,%ymm13,%ymm13
|
||||
# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
|
||||
vpaddd %ymm2,%ymm6,%ymm2
|
||||
vpxord %ymm2,%ymm14,%ymm14
|
||||
vprold $8,%ymm14,%ymm14
|
||||
# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
|
||||
vpaddd %ymm3,%ymm7,%ymm3
|
||||
vpxord %ymm3,%ymm15,%ymm15
|
||||
vprold $8,%ymm15,%ymm15
|
||||
|
||||
# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
|
||||
vpaddd %ymm12,%ymm8,%ymm8
|
||||
vpxord %ymm8,%ymm4,%ymm4
|
||||
vprold $7,%ymm4,%ymm4
|
||||
# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
|
||||
vpaddd %ymm13,%ymm9,%ymm9
|
||||
vpxord %ymm9,%ymm5,%ymm5
|
||||
vprold $7,%ymm5,%ymm5
|
||||
# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
|
||||
vpaddd %ymm14,%ymm10,%ymm10
|
||||
vpxord %ymm10,%ymm6,%ymm6
|
||||
vprold $7,%ymm6,%ymm6
|
||||
# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
|
||||
vpaddd %ymm15,%ymm11,%ymm11
|
||||
vpxord %ymm11,%ymm7,%ymm7
|
||||
vprold $7,%ymm7,%ymm7
|
||||
|
||||
# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
|
||||
vpaddd %ymm0,%ymm5,%ymm0
|
||||
vpxord %ymm0,%ymm15,%ymm15
|
||||
vprold $16,%ymm15,%ymm15
|
||||
# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
|
||||
vpaddd %ymm1,%ymm6,%ymm1
|
||||
vpxord %ymm1,%ymm12,%ymm12
|
||||
vprold $16,%ymm12,%ymm12
|
||||
# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
|
||||
vpaddd %ymm2,%ymm7,%ymm2
|
||||
vpxord %ymm2,%ymm13,%ymm13
|
||||
vprold $16,%ymm13,%ymm13
|
||||
# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
|
||||
vpaddd %ymm3,%ymm4,%ymm3
|
||||
vpxord %ymm3,%ymm14,%ymm14
|
||||
vprold $16,%ymm14,%ymm14
|
||||
|
||||
# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
|
||||
vpaddd %ymm15,%ymm10,%ymm10
|
||||
vpxord %ymm10,%ymm5,%ymm5
|
||||
vprold $12,%ymm5,%ymm5
|
||||
# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
|
||||
vpaddd %ymm12,%ymm11,%ymm11
|
||||
vpxord %ymm11,%ymm6,%ymm6
|
||||
vprold $12,%ymm6,%ymm6
|
||||
# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
|
||||
vpaddd %ymm13,%ymm8,%ymm8
|
||||
vpxord %ymm8,%ymm7,%ymm7
|
||||
vprold $12,%ymm7,%ymm7
|
||||
# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
|
||||
vpaddd %ymm14,%ymm9,%ymm9
|
||||
vpxord %ymm9,%ymm4,%ymm4
|
||||
vprold $12,%ymm4,%ymm4
|
||||
|
||||
# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
|
||||
vpaddd %ymm0,%ymm5,%ymm0
|
||||
vpxord %ymm0,%ymm15,%ymm15
|
||||
vprold $8,%ymm15,%ymm15
|
||||
# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
|
||||
vpaddd %ymm1,%ymm6,%ymm1
|
||||
vpxord %ymm1,%ymm12,%ymm12
|
||||
vprold $8,%ymm12,%ymm12
|
||||
# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
|
||||
vpaddd %ymm2,%ymm7,%ymm2
|
||||
vpxord %ymm2,%ymm13,%ymm13
|
||||
vprold $8,%ymm13,%ymm13
|
||||
# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
|
||||
vpaddd %ymm3,%ymm4,%ymm3
|
||||
vpxord %ymm3,%ymm14,%ymm14
|
||||
vprold $8,%ymm14,%ymm14
|
||||
|
||||
# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
|
||||
vpaddd %ymm15,%ymm10,%ymm10
|
||||
vpxord %ymm10,%ymm5,%ymm5
|
||||
vprold $7,%ymm5,%ymm5
|
||||
# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
|
||||
vpaddd %ymm12,%ymm11,%ymm11
|
||||
vpxord %ymm11,%ymm6,%ymm6
|
||||
vprold $7,%ymm6,%ymm6
|
||||
# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
|
||||
vpaddd %ymm13,%ymm8,%ymm8
|
||||
vpxord %ymm8,%ymm7,%ymm7
|
||||
vprold $7,%ymm7,%ymm7
|
||||
# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
|
||||
vpaddd %ymm14,%ymm9,%ymm9
|
||||
vpxord %ymm9,%ymm4,%ymm4
|
||||
vprold $7,%ymm4,%ymm4
|
||||
|
||||
sub $2,%r8d
|
||||
jnz .Ldoubleround8
|
||||
|
||||
# x0..15[0-3] += s[0..15]
|
||||
vpaddd %ymm16,%ymm0,%ymm0
|
||||
vpaddd %ymm17,%ymm1,%ymm1
|
||||
vpaddd %ymm18,%ymm2,%ymm2
|
||||
vpaddd %ymm19,%ymm3,%ymm3
|
||||
vpaddd %ymm20,%ymm4,%ymm4
|
||||
vpaddd %ymm21,%ymm5,%ymm5
|
||||
vpaddd %ymm22,%ymm6,%ymm6
|
||||
vpaddd %ymm23,%ymm7,%ymm7
|
||||
vpaddd %ymm24,%ymm8,%ymm8
|
||||
vpaddd %ymm25,%ymm9,%ymm9
|
||||
vpaddd %ymm26,%ymm10,%ymm10
|
||||
vpaddd %ymm27,%ymm11,%ymm11
|
||||
vpaddd %ymm28,%ymm12,%ymm12
|
||||
vpaddd %ymm29,%ymm13,%ymm13
|
||||
vpaddd %ymm30,%ymm14,%ymm14
|
||||
vpaddd %ymm31,%ymm15,%ymm15
|
||||
|
||||
# interleave 32-bit words in state n, n+1
|
||||
vpunpckldq %ymm1,%ymm0,%ymm16
|
||||
vpunpckhdq %ymm1,%ymm0,%ymm17
|
||||
vpunpckldq %ymm3,%ymm2,%ymm18
|
||||
vpunpckhdq %ymm3,%ymm2,%ymm19
|
||||
vpunpckldq %ymm5,%ymm4,%ymm20
|
||||
vpunpckhdq %ymm5,%ymm4,%ymm21
|
||||
vpunpckldq %ymm7,%ymm6,%ymm22
|
||||
vpunpckhdq %ymm7,%ymm6,%ymm23
|
||||
vpunpckldq %ymm9,%ymm8,%ymm24
|
||||
vpunpckhdq %ymm9,%ymm8,%ymm25
|
||||
vpunpckldq %ymm11,%ymm10,%ymm26
|
||||
vpunpckhdq %ymm11,%ymm10,%ymm27
|
||||
vpunpckldq %ymm13,%ymm12,%ymm28
|
||||
vpunpckhdq %ymm13,%ymm12,%ymm29
|
||||
vpunpckldq %ymm15,%ymm14,%ymm30
|
||||
vpunpckhdq %ymm15,%ymm14,%ymm31
|
||||
|
||||
# interleave 64-bit words in state n, n+2
|
||||
vpunpcklqdq %ymm18,%ymm16,%ymm0
|
||||
vpunpcklqdq %ymm19,%ymm17,%ymm1
|
||||
vpunpckhqdq %ymm18,%ymm16,%ymm2
|
||||
vpunpckhqdq %ymm19,%ymm17,%ymm3
|
||||
vpunpcklqdq %ymm22,%ymm20,%ymm4
|
||||
vpunpcklqdq %ymm23,%ymm21,%ymm5
|
||||
vpunpckhqdq %ymm22,%ymm20,%ymm6
|
||||
vpunpckhqdq %ymm23,%ymm21,%ymm7
|
||||
vpunpcklqdq %ymm26,%ymm24,%ymm8
|
||||
vpunpcklqdq %ymm27,%ymm25,%ymm9
|
||||
vpunpckhqdq %ymm26,%ymm24,%ymm10
|
||||
vpunpckhqdq %ymm27,%ymm25,%ymm11
|
||||
vpunpcklqdq %ymm30,%ymm28,%ymm12
|
||||
vpunpcklqdq %ymm31,%ymm29,%ymm13
|
||||
vpunpckhqdq %ymm30,%ymm28,%ymm14
|
||||
vpunpckhqdq %ymm31,%ymm29,%ymm15
|
||||
|
||||
# interleave 128-bit words in state n, n+4
|
||||
# xor/write first four blocks
|
||||
vmovdqa64 %ymm0,%ymm16
|
||||
vperm2i128 $0x20,%ymm4,%ymm0,%ymm0
|
||||
cmp $0x0020,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x0000(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x0000(%rsi)
|
||||
vmovdqa64 %ymm16,%ymm0
|
||||
vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
|
||||
|
||||
vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
|
||||
cmp $0x0040,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x0020(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x0020(%rsi)
|
||||
vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
|
||||
|
||||
vperm2i128 $0x20,%ymm6,%ymm2,%ymm0
|
||||
cmp $0x0060,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x0040(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x0040(%rsi)
|
||||
vperm2i128 $0x31,%ymm6,%ymm2,%ymm6
|
||||
|
||||
vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
|
||||
cmp $0x0080,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x0060(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x0060(%rsi)
|
||||
vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
|
||||
|
||||
vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
|
||||
cmp $0x00a0,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x0080(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x0080(%rsi)
|
||||
vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
|
||||
|
||||
vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
|
||||
cmp $0x00c0,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x00a0(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x00a0(%rsi)
|
||||
vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
|
||||
|
||||
vperm2i128 $0x20,%ymm7,%ymm3,%ymm0
|
||||
cmp $0x00e0,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x00c0(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x00c0(%rsi)
|
||||
vperm2i128 $0x31,%ymm7,%ymm3,%ymm7
|
||||
|
||||
vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
|
||||
cmp $0x0100,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x00e0(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x00e0(%rsi)
|
||||
vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
|
||||
|
||||
# xor remaining blocks, write to output
|
||||
vmovdqa64 %ymm4,%ymm0
|
||||
cmp $0x0120,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x0100(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x0100(%rsi)
|
||||
|
||||
vmovdqa64 %ymm12,%ymm0
|
||||
cmp $0x0140,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x0120(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x0120(%rsi)
|
||||
|
||||
vmovdqa64 %ymm6,%ymm0
|
||||
cmp $0x0160,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x0140(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x0140(%rsi)
|
||||
|
||||
vmovdqa64 %ymm14,%ymm0
|
||||
cmp $0x0180,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x0160(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x0160(%rsi)
|
||||
|
||||
vmovdqa64 %ymm5,%ymm0
|
||||
cmp $0x01a0,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x0180(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x0180(%rsi)
|
||||
|
||||
vmovdqa64 %ymm13,%ymm0
|
||||
cmp $0x01c0,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x01a0(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x01a0(%rsi)
|
||||
|
||||
vmovdqa64 %ymm7,%ymm0
|
||||
cmp $0x01e0,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x01c0(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x01c0(%rsi)
|
||||
|
||||
vmovdqa64 %ymm15,%ymm0
|
||||
cmp $0x0200,%rcx
|
||||
jl .Lxorpart8
|
||||
vpxord 0x01e0(%rdx),%ymm0,%ymm0
|
||||
vmovdqu64 %ymm0,0x01e0(%rsi)
|
||||
|
||||
.Ldone8:
|
||||
vzeroupper
|
||||
ret
|
||||
|
||||
.Lxorpart8:
|
||||
# xor remaining bytes from partial register into output
|
||||
mov %rcx,%rax
|
||||
and $0x1f,%rcx
|
||||
jz .Ldone8
|
||||
mov %rax,%r9
|
||||
and $~0x1f,%r9
|
||||
|
||||
mov $1,%rax
|
||||
shld %cl,%rax,%rax
|
||||
sub $1,%rax
|
||||
kmovq %rax,%k1
|
||||
|
||||
vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z}
|
||||
vpxord %ymm0,%ymm1,%ymm1
|
||||
vmovdqu8 %ymm1,(%rsi,%r9){%k1}
|
||||
|
||||
jmp .Ldone8
|
||||
|
||||
ENDPROC(chacha_8block_xor_avx512vl)
|
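For reference while reading the SSSE3 file that follows (and the AVX-512VL routines above), the double round those functions vectorize looks like this in scalar C. This is a hedged illustrative reimplementation of the standard ChaCha permutation, mirroring the rotl32 annotations in the assembly comments; the chacha_permute name is reused only to echo the label in the assembly and is not the kernel's generic code:

#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

static inline uint32_t rotl32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

/* One ChaCha quarter-round: the add/xor/rotate-by-16/12/8/7 sequence the
 * SIMD code applies to whole vectors of words at once. */
static void quarterround(uint32_t x[16], int a, int b, int c, int d)
{
	x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
	x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
	x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
	x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
}

/* nrounds is 20, 12 or 8; each loop iteration is one column round plus one
 * diagonal round, i.e. one pass of the ".Ldoubleround" loops above. */
static void chacha_permute(uint32_t x[16], int nrounds)
{
	for (int i = 0; i < nrounds; i += 2) {
		/* column rounds */
		quarterround(x, 0, 4,  8, 12);
		quarterround(x, 1, 5,  9, 13);
		quarterround(x, 2, 6, 10, 14);
		quarterround(x, 3, 7, 11, 15);
		/* diagonal rounds */
		quarterround(x, 0, 5, 10, 15);
		quarterround(x, 1, 6, 11, 12);
		quarterround(x, 2, 7,  8, 13);
		quarterround(x, 3, 4,  9, 14);
	}
}

int main(void)
{
	/* first four words are the ChaCha constant; key/counter/nonce zero */
	uint32_t state[16] = { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 };

	chacha_permute(state, 20);
	printf("%08" PRIx32 "\n", state[0]);
	return 0;
}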
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
|
||||
* ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
|
||||
*
|
||||
* Copyright (C) 2015 Martin Willi
|
||||
*
|
||||
@@ -10,6 +10,7 @@
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include <asm/frame.h>
|
||||
|
||||
.section .rodata.cst16.ROT8, "aM", @progbits, 16
|
||||
.align 16
|
||||
@@ -23,35 +24,25 @@ CTRINC: .octa 0x00000003000000020000000100000000
|
||||
|
||||
.text
|
||||
|
||||
ENTRY(chacha20_block_xor_ssse3)
|
||||
# %rdi: Input state matrix, s
|
||||
# %rsi: 1 data block output, o
|
||||
# %rdx: 1 data block input, i
|
||||
|
||||
# This function encrypts one ChaCha20 block by loading the state matrix
|
||||
# in four SSE registers. It performs matrix operation on four words in
|
||||
# parallel, but requires shuffling to rearrange the words after each
|
||||
# round. 8/16-bit word rotation is done with the slightly better
|
||||
# performing SSSE3 byte shuffling, 7/12-bit word rotation uses
|
||||
# traditional shift+OR.
|
||||
|
||||
# x0..3 = s0..3
|
||||
movdqa 0x00(%rdi),%xmm0
|
||||
movdqa 0x10(%rdi),%xmm1
|
||||
movdqa 0x20(%rdi),%xmm2
|
||||
movdqa 0x30(%rdi),%xmm3
|
||||
movdqa %xmm0,%xmm8
|
||||
movdqa %xmm1,%xmm9
|
||||
movdqa %xmm2,%xmm10
|
||||
movdqa %xmm3,%xmm11
|
||||
/*
|
||||
* chacha_permute - permute one block
|
||||
*
|
||||
* Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
|
||||
* function performs matrix operations on four words in parallel, but requires
|
||||
* shuffling to rearrange the words after each round. 8/16-bit word rotation is
|
||||
* done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
|
||||
* rotation uses traditional shift+OR.
|
||||
*
|
||||
* The round count is given in %r8d.
|
||||
*
|
||||
* Clobbers: %r8d, %xmm4-%xmm7
|
||||
*/
|
||||
chacha_permute:
|
||||
|
||||
movdqa ROT8(%rip),%xmm4
|
||||
movdqa ROT16(%rip),%xmm5
|
||||
|
||||
mov $10,%ecx
|
||||
|
||||
.Ldoubleround:
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||
paddd %xmm1,%xmm0
|
||||
pxor %xmm0,%xmm3
|
||||
@@ -118,39 +109,129 @@ ENTRY(chacha20_block_xor_ssse3)
|
||||
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
||||
pshufd $0x39,%xmm3,%xmm3
|
||||
|
||||
dec %ecx
|
||||
sub $2,%r8d
|
||||
jnz .Ldoubleround
|
||||
|
||||
ret
|
||||
ENDPROC(chacha_permute)
|
||||
|
||||
ENTRY(chacha_block_xor_ssse3)
|
||||
# %rdi: Input state matrix, s
|
||||
# %rsi: up to 1 data block output, o
|
||||
# %rdx: up to 1 data block input, i
|
||||
# %rcx: input/output length in bytes
|
||||
# %r8d: nrounds
|
||||
FRAME_BEGIN
|
||||
|
||||
# x0..3 = s0..3
|
||||
movdqa 0x00(%rdi),%xmm0
|
||||
movdqa 0x10(%rdi),%xmm1
|
||||
movdqa 0x20(%rdi),%xmm2
|
||||
movdqa 0x30(%rdi),%xmm3
|
||||
movdqa %xmm0,%xmm8
|
||||
movdqa %xmm1,%xmm9
|
||||
movdqa %xmm2,%xmm10
|
||||
movdqa %xmm3,%xmm11
|
||||
|
||||
mov %rcx,%rax
|
||||
call chacha_permute
|
||||
|
||||
# o0 = i0 ^ (x0 + s0)
|
||||
movdqu 0x00(%rdx),%xmm4
|
||||
paddd %xmm8,%xmm0
|
||||
cmp $0x10,%rax
|
||||
jl .Lxorpart
|
||||
movdqu 0x00(%rdx),%xmm4
|
||||
pxor %xmm4,%xmm0
|
||||
movdqu %xmm0,0x00(%rsi)
|
||||
# o1 = i1 ^ (x1 + s1)
|
||||
movdqu 0x10(%rdx),%xmm5
|
||||
paddd %xmm9,%xmm1
|
||||
pxor %xmm5,%xmm1
|
||||
movdqu %xmm1,0x10(%rsi)
|
||||
movdqa %xmm1,%xmm0
|
||||
cmp $0x20,%rax
|
||||
jl .Lxorpart
|
||||
movdqu 0x10(%rdx),%xmm0
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0x10(%rsi)
|
||||
# o2 = i2 ^ (x2 + s2)
|
||||
movdqu 0x20(%rdx),%xmm6
|
||||
paddd %xmm10,%xmm2
|
||||
pxor %xmm6,%xmm2
|
||||
movdqu %xmm2,0x20(%rsi)
|
||||
movdqa %xmm2,%xmm0
|
||||
cmp $0x30,%rax
|
||||
jl .Lxorpart
|
||||
movdqu 0x20(%rdx),%xmm0
|
||||
pxor %xmm2,%xmm0
|
||||
movdqu %xmm0,0x20(%rsi)
|
||||
# o3 = i3 ^ (x3 + s3)
|
||||
movdqu 0x30(%rdx),%xmm7
|
||||
paddd %xmm11,%xmm3
|
||||
pxor %xmm7,%xmm3
|
||||
movdqu %xmm3,0x30(%rsi)
|
||||
movdqa %xmm3,%xmm0
|
||||
cmp $0x40,%rax
|
||||
jl .Lxorpart
|
||||
movdqu 0x30(%rdx),%xmm0
|
||||
pxor %xmm3,%xmm0
|
||||
movdqu %xmm0,0x30(%rsi)
|
||||
|
||||
.Ldone:
|
||||
FRAME_END
|
||||
ret
|
||||
ENDPROC(chacha20_block_xor_ssse3)
|
||||
|
||||
ENTRY(chacha20_4block_xor_ssse3)
|
||||
.Lxorpart:
|
||||
# xor remaining bytes from partial register into output
|
||||
mov %rax,%r9
|
||||
and $0x0f,%r9
|
||||
jz .Ldone
|
||||
and $~0x0f,%rax
|
||||
|
||||
mov %rsi,%r11
|
||||
|
||||
lea 8(%rsp),%r10
|
||||
sub $0x10,%rsp
|
||||
and $~31,%rsp
|
||||
|
||||
lea (%rdx,%rax),%rsi
|
||||
mov %rsp,%rdi
|
||||
mov %r9,%rcx
|
||||
rep movsb
|
||||
|
||||
pxor 0x00(%rsp),%xmm0
|
||||
movdqa %xmm0,0x00(%rsp)
|
||||
|
||||
mov %rsp,%rsi
|
||||
lea (%r11,%rax),%rdi
|
||||
mov %r9,%rcx
|
||||
rep movsb
|
||||
|
||||
lea -8(%r10),%rsp
|
||||
jmp .Ldone
|
||||
|
||||
ENDPROC(chacha_block_xor_ssse3)
|
||||
|
||||
ENTRY(hchacha_block_ssse3)
|
||||
# %rdi: Input state matrix, s
|
||||
# %rsi: 4 data blocks output, o
|
||||
# %rdx: 4 data blocks input, i
|
||||
# %rsi: output (8 32-bit words)
|
||||
# %edx: nrounds
|
||||
FRAME_BEGIN
|
||||
|
||||
# This function encrypts four consecutive ChaCha20 blocks by loading the
|
||||
movdqa 0x00(%rdi),%xmm0
|
||||
movdqa 0x10(%rdi),%xmm1
|
||||
movdqa 0x20(%rdi),%xmm2
|
||||
movdqa 0x30(%rdi),%xmm3
|
||||
|
||||
mov %edx,%r8d
|
||||
call chacha_permute
|
||||
|
||||
movdqu %xmm0,0x00(%rsi)
|
||||
movdqu %xmm3,0x10(%rsi)
|
||||
|
||||
FRAME_END
|
||||
ret
|
||||
ENDPROC(hchacha_block_ssse3)
|
||||
|
||||
ENTRY(chacha_4block_xor_ssse3)
|
||||
# %rdi: Input state matrix, s
|
||||
# %rsi: up to 4 data blocks output, o
|
||||
# %rdx: up to 4 data blocks input, i
|
||||
# %rcx: input/output length in bytes
|
||||
# %r8d: nrounds
|
||||
|
||||
# This function encrypts four consecutive ChaCha blocks by loading the
|
||||
# state matrix in SSE registers four times. As we need some scratch
|
||||
# registers, we save the first four registers on the stack. The
|
||||
# algorithm performs each operation on the corresponding word of each
|
||||
@@ -163,6 +244,7 @@ ENTRY(chacha20_4block_xor_ssse3)
|
||||
lea 8(%rsp),%r10
|
||||
sub $0x80,%rsp
|
||||
and $~63,%rsp
|
||||
mov %rcx,%rax
|
||||
|
||||
# x0..15[0-3] = s0..3[0..3]
|
||||
movq 0x00(%rdi),%xmm1
|
||||
@@ -202,8 +284,6 @@ ENTRY(chacha20_4block_xor_ssse3)
|
||||
# x12 += counter values 0-3
|
||||
paddd %xmm1,%xmm12
|
||||
|
||||
mov $10,%ecx
|
||||
|
||||
.Ldoubleround4:
|
||||
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
||||
movdqa 0x00(%rsp),%xmm0
|
||||
@@ -421,7 +501,7 @@ ENTRY(chacha20_4block_xor_ssse3)
|
||||
psrld $25,%xmm4
|
||||
por %xmm0,%xmm4
|
||||
|
||||
dec %ecx
|
||||
sub $2,%r8d
|
||||
jnz .Ldoubleround4
|
||||
|
||||
# x0[0-3] += s0[0]
|
||||
@@ -573,58 +653,143 @@ ENTRY(chacha20_4block_xor_ssse3)
|
||||
|
||||
# xor with corresponding input, write to output
|
||||
movdqa 0x00(%rsp),%xmm0
|
||||
cmp $0x10,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0x00(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0x00(%rsi)
|
||||
movdqa 0x10(%rsp),%xmm0
|
||||
movdqu 0x80(%rdx),%xmm1
|
||||
|
||||
movdqu %xmm4,%xmm0
|
||||
cmp $0x20,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0x10(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0x80(%rsi)
|
||||
movdqu %xmm0,0x10(%rsi)
|
||||
|
||||
movdqu %xmm8,%xmm0
|
||||
cmp $0x30,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0x20(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0x20(%rsi)
|
||||
|
||||
movdqu %xmm12,%xmm0
|
||||
cmp $0x40,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0x30(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0x30(%rsi)
|
||||
|
||||
movdqa 0x20(%rsp),%xmm0
|
||||
cmp $0x50,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0x40(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0x40(%rsi)
|
||||
|
||||
movdqu %xmm6,%xmm0
|
||||
cmp $0x60,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0x50(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0x50(%rsi)
|
||||
|
||||
movdqu %xmm10,%xmm0
|
||||
cmp $0x70,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0x60(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0x60(%rsi)
|
||||
|
||||
movdqu %xmm14,%xmm0
|
||||
cmp $0x80,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0x70(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0x70(%rsi)
|
||||
|
||||
movdqa 0x10(%rsp),%xmm0
|
||||
cmp $0x90,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0x80(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0x80(%rsi)
|
||||
|
||||
movdqu %xmm5,%xmm0
|
||||
cmp $0xa0,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0x90(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0x90(%rsi)
|
||||
|
||||
movdqu %xmm9,%xmm0
|
||||
cmp $0xb0,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0xa0(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0xa0(%rsi)
|
||||
|
||||
movdqu %xmm13,%xmm0
|
||||
cmp $0xc0,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0xb0(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0xb0(%rsi)
|
||||
|
||||
movdqa 0x30(%rsp),%xmm0
|
||||
cmp $0xd0,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0xc0(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0xc0(%rsi)
|
||||
movdqu 0x10(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm4
|
||||
movdqu %xmm4,0x10(%rsi)
|
||||
movdqu 0x90(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm5
|
||||
movdqu %xmm5,0x90(%rsi)
|
||||
movdqu 0x50(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm6
|
||||
movdqu %xmm6,0x50(%rsi)
|
||||
movdqu 0xd0(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm7
|
||||
movdqu %xmm7,0xd0(%rsi)
|
||||
movdqu 0x20(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm8
|
||||
movdqu %xmm8,0x20(%rsi)
|
||||
movdqu 0xa0(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm9
|
||||
movdqu %xmm9,0xa0(%rsi)
|
||||
movdqu 0x60(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm10
|
||||
movdqu %xmm10,0x60(%rsi)
|
||||
movdqu 0xe0(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm11
|
||||
movdqu %xmm11,0xe0(%rsi)
|
||||
movdqu 0x30(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm12
|
||||
movdqu %xmm12,0x30(%rsi)
|
||||
movdqu 0xb0(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm13
|
||||
movdqu %xmm13,0xb0(%rsi)
|
||||
movdqu 0x70(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm14
|
||||
movdqu %xmm14,0x70(%rsi)
|
||||
movdqu 0xf0(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm15
|
||||
movdqu %xmm15,0xf0(%rsi)
|
||||
|
||||
movdqu %xmm7,%xmm0
|
||||
cmp $0xe0,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0xd0(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0xd0(%rsi)
|
||||
|
||||
movdqu %xmm11,%xmm0
|
||||
cmp $0xf0,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0xe0(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0xe0(%rsi)
|
||||
|
||||
movdqu %xmm15,%xmm0
|
||||
cmp $0x100,%rax
|
||||
jl .Lxorpart4
|
||||
movdqu 0xf0(%rdx),%xmm1
|
||||
pxor %xmm1,%xmm0
|
||||
movdqu %xmm0,0xf0(%rsi)
|
||||
|
||||
.Ldone4:
|
||||
lea -8(%r10),%rsp
|
||||
ret
|
||||
ENDPROC(chacha20_4block_xor_ssse3)
|
||||
|
||||
.Lxorpart4:
|
||||
# xor remaining bytes from partial register into output
|
||||
mov %rax,%r9
|
||||
and $0x0f,%r9
|
||||
jz .Ldone4
|
||||
and $~0x0f,%rax
|
||||
|
||||
mov %rsi,%r11
|
||||
|
||||
lea (%rdx,%rax),%rsi
|
||||
mov %rsp,%rdi
|
||||
mov %r9,%rcx
|
||||
rep movsb
|
||||
|
||||
pxor 0x00(%rsp),%xmm0
|
||||
movdqa %xmm0,0x00(%rsp)
|
||||
|
||||
mov %rsp,%rsi
|
||||
lea (%r11,%rax),%rdi
|
||||
mov %r9,%rcx
|
||||
rep movsb
|
||||
|
||||
jmp .Ldone4
|
||||
|
||||
ENDPROC(chacha_4block_xor_ssse3)
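The rotl32 annotations and the .Ldoubleround4 loop above are the vectorized form of the standard scalar ChaCha quarter round, applied to the same word of four blocks at once; the loop runs nrounds/2 times because each pass is a double round, which is why the round counter in %r8d is decremented by 2. A scalar reference for orientation (this is the textbook ChaCha definition, not code lifted from this patch):

#include <stdint.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* One ChaCha quarter round on words a, b, c, d of the 4x4 state. */
static void chacha_quarterround(uint32_t x[16], int a, int b, int c, int d)
{
        x[a] += x[b]; x[d] = ROTL32(x[d] ^ x[a], 16);
        x[c] += x[d]; x[b] = ROTL32(x[b] ^ x[c], 12);
        x[a] += x[b]; x[d] = ROTL32(x[d] ^ x[a], 8);
        x[c] += x[d]; x[b] = ROTL32(x[b] ^ x[c], 7);
}

/* One double round: four column rounds, then four diagonal rounds. */
static void chacha_doubleround(uint32_t x[16])
{
        chacha_quarterround(x, 0, 4,  8, 12);
        chacha_quarterround(x, 1, 5,  9, 13);
        chacha_quarterround(x, 2, 6, 10, 14);
        chacha_quarterround(x, 3, 7, 11, 15);
        chacha_quarterround(x, 0, 5, 10, 15);
        chacha_quarterround(x, 1, 6, 11, 12);
        chacha_quarterround(x, 2, 7,  8, 13);
        chacha_quarterround(x, 3, 4,  9, 14);
}

ChaCha20 runs ten of these double rounds and ChaCha12 six, before the original state is added back in, which is the paddd-against-saved-state block near the end of the routine above.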
|
@@ -1,448 +0,0 @@
|
||||
/*
|
||||
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
|
||||
*
|
||||
* Copyright (C) 2015 Martin Willi
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
.section .rodata.cst32.ROT8, "aM", @progbits, 32
|
||||
.align 32
|
||||
ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
|
||||
.octa 0x0e0d0c0f0a09080b0605040702010003
|
||||
|
||||
.section .rodata.cst32.ROT16, "aM", @progbits, 32
|
||||
.align 32
|
||||
ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
|
||||
.octa 0x0d0c0f0e09080b0a0504070601000302
|
||||
|
||||
.section .rodata.cst32.CTRINC, "aM", @progbits, 32
|
||||
.align 32
|
||||
CTRINC: .octa 0x00000003000000020000000100000000
|
||||
.octa 0x00000007000000060000000500000004
|
||||
|
||||
.text
|
||||
|
||||
ENTRY(chacha20_8block_xor_avx2)
|
||||
# %rdi: Input state matrix, s
|
||||
# %rsi: 8 data blocks output, o
|
||||
# %rdx: 8 data blocks input, i
|
||||
|
||||
# This function encrypts eight consecutive ChaCha20 blocks by loading
|
||||
# the state matrix in AVX registers eight times. As we need some
|
||||
# scratch registers, we save the first four registers on the stack. The
|
||||
# algorithm performs each operation on the corresponding word of each
|
||||
# state matrix, hence requires no word shuffling. For final XORing step
|
||||
# we transpose the matrix by interleaving 32-, 64- and then 128-bit
|
||||
# words, which allows us to do XOR in AVX registers. 8/16-bit word
|
||||
# rotation is done with the slightly better performing byte shuffling,
|
||||
# 7/12-bit word rotation uses traditional shift+OR.
|
||||
|
||||
vzeroupper
|
||||
# 4 * 32 byte stack, 32-byte aligned
|
||||
lea 8(%rsp),%r10
|
||||
and $~31, %rsp
|
||||
sub $0x80, %rsp
|
||||
|
||||
# x0..15[0-7] = s[0..15]
|
||||
vpbroadcastd 0x00(%rdi),%ymm0
|
||||
vpbroadcastd 0x04(%rdi),%ymm1
|
||||
vpbroadcastd 0x08(%rdi),%ymm2
|
||||
vpbroadcastd 0x0c(%rdi),%ymm3
|
||||
vpbroadcastd 0x10(%rdi),%ymm4
|
||||
vpbroadcastd 0x14(%rdi),%ymm5
|
||||
vpbroadcastd 0x18(%rdi),%ymm6
|
||||
vpbroadcastd 0x1c(%rdi),%ymm7
|
||||
vpbroadcastd 0x20(%rdi),%ymm8
|
||||
vpbroadcastd 0x24(%rdi),%ymm9
|
||||
vpbroadcastd 0x28(%rdi),%ymm10
|
||||
vpbroadcastd 0x2c(%rdi),%ymm11
|
||||
vpbroadcastd 0x30(%rdi),%ymm12
|
||||
vpbroadcastd 0x34(%rdi),%ymm13
|
||||
vpbroadcastd 0x38(%rdi),%ymm14
|
||||
vpbroadcastd 0x3c(%rdi),%ymm15
|
||||
# x0..3 on stack
|
||||
vmovdqa %ymm0,0x00(%rsp)
|
||||
vmovdqa %ymm1,0x20(%rsp)
|
||||
vmovdqa %ymm2,0x40(%rsp)
|
||||
vmovdqa %ymm3,0x60(%rsp)
|
||||
|
||||
vmovdqa CTRINC(%rip),%ymm1
|
||||
vmovdqa ROT8(%rip),%ymm2
|
||||
vmovdqa ROT16(%rip),%ymm3
|
||||
|
||||
# x12 += counter values 0-3
|
||||
vpaddd %ymm1,%ymm12,%ymm12
|
||||
|
||||
mov $10,%ecx
|
||||
|
||||
.Ldoubleround8:
|
||||
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
||||
vpaddd 0x00(%rsp),%ymm4,%ymm0
|
||||
vmovdqa %ymm0,0x00(%rsp)
|
||||
vpxor %ymm0,%ymm12,%ymm12
|
||||
vpshufb %ymm3,%ymm12,%ymm12
|
||||
# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
|
||||
vpaddd 0x20(%rsp),%ymm5,%ymm0
|
||||
vmovdqa %ymm0,0x20(%rsp)
|
||||
vpxor %ymm0,%ymm13,%ymm13
|
||||
vpshufb %ymm3,%ymm13,%ymm13
|
||||
# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
|
||||
vpaddd 0x40(%rsp),%ymm6,%ymm0
|
||||
vmovdqa %ymm0,0x40(%rsp)
|
||||
vpxor %ymm0,%ymm14,%ymm14
|
||||
vpshufb %ymm3,%ymm14,%ymm14
|
||||
# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
|
||||
vpaddd 0x60(%rsp),%ymm7,%ymm0
|
||||
vmovdqa %ymm0,0x60(%rsp)
|
||||
vpxor %ymm0,%ymm15,%ymm15
|
||||
vpshufb %ymm3,%ymm15,%ymm15
|
||||
|
||||
# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
|
||||
vpaddd %ymm12,%ymm8,%ymm8
|
||||
vpxor %ymm8,%ymm4,%ymm4
|
||||
vpslld $12,%ymm4,%ymm0
|
||||
vpsrld $20,%ymm4,%ymm4
|
||||
vpor %ymm0,%ymm4,%ymm4
|
||||
# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
|
||||
vpaddd %ymm13,%ymm9,%ymm9
|
||||
vpxor %ymm9,%ymm5,%ymm5
|
||||
vpslld $12,%ymm5,%ymm0
|
||||
vpsrld $20,%ymm5,%ymm5
|
||||
vpor %ymm0,%ymm5,%ymm5
|
||||
# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
|
||||
vpaddd %ymm14,%ymm10,%ymm10
|
||||
vpxor %ymm10,%ymm6,%ymm6
|
||||
vpslld $12,%ymm6,%ymm0
|
||||
vpsrld $20,%ymm6,%ymm6
|
||||
vpor %ymm0,%ymm6,%ymm6
|
||||
# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
|
||||
vpaddd %ymm15,%ymm11,%ymm11
|
||||
vpxor %ymm11,%ymm7,%ymm7
|
||||
vpslld $12,%ymm7,%ymm0
|
||||
vpsrld $20,%ymm7,%ymm7
|
||||
vpor %ymm0,%ymm7,%ymm7
|
||||
|
||||
# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
|
||||
vpaddd 0x00(%rsp),%ymm4,%ymm0
|
||||
vmovdqa %ymm0,0x00(%rsp)
|
||||
vpxor %ymm0,%ymm12,%ymm12
|
||||
vpshufb %ymm2,%ymm12,%ymm12
|
||||
# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
|
||||
vpaddd 0x20(%rsp),%ymm5,%ymm0
|
||||
vmovdqa %ymm0,0x20(%rsp)
|
||||
vpxor %ymm0,%ymm13,%ymm13
|
||||
vpshufb %ymm2,%ymm13,%ymm13
|
||||
# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
|
||||
vpaddd 0x40(%rsp),%ymm6,%ymm0
|
||||
vmovdqa %ymm0,0x40(%rsp)
|
||||
vpxor %ymm0,%ymm14,%ymm14
|
||||
vpshufb %ymm2,%ymm14,%ymm14
|
||||
# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
|
||||
vpaddd 0x60(%rsp),%ymm7,%ymm0
|
||||
vmovdqa %ymm0,0x60(%rsp)
|
||||
vpxor %ymm0,%ymm15,%ymm15
|
||||
vpshufb %ymm2,%ymm15,%ymm15
|
||||
|
||||
# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
|
||||
vpaddd %ymm12,%ymm8,%ymm8
|
||||
vpxor %ymm8,%ymm4,%ymm4
|
||||
vpslld $7,%ymm4,%ymm0
|
||||
vpsrld $25,%ymm4,%ymm4
|
||||
vpor %ymm0,%ymm4,%ymm4
|
||||
# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
|
||||
vpaddd %ymm13,%ymm9,%ymm9
|
||||
vpxor %ymm9,%ymm5,%ymm5
|
||||
vpslld $7,%ymm5,%ymm0
|
||||
vpsrld $25,%ymm5,%ymm5
|
||||
vpor %ymm0,%ymm5,%ymm5
|
||||
# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
|
||||
vpaddd %ymm14,%ymm10,%ymm10
|
||||
vpxor %ymm10,%ymm6,%ymm6
|
||||
vpslld $7,%ymm6,%ymm0
|
||||
vpsrld $25,%ymm6,%ymm6
|
||||
vpor %ymm0,%ymm6,%ymm6
|
||||
# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
|
||||
vpaddd %ymm15,%ymm11,%ymm11
|
||||
vpxor %ymm11,%ymm7,%ymm7
|
||||
vpslld $7,%ymm7,%ymm0
|
||||
vpsrld $25,%ymm7,%ymm7
|
||||
vpor %ymm0,%ymm7,%ymm7
|
||||
|
||||
# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
|
||||
vpaddd 0x00(%rsp),%ymm5,%ymm0
|
||||
vmovdqa %ymm0,0x00(%rsp)
|
||||
vpxor %ymm0,%ymm15,%ymm15
|
||||
vpshufb %ymm3,%ymm15,%ymm15
|
||||
# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
|
||||
vpaddd 0x20(%rsp),%ymm6,%ymm0
|
||||
vmovdqa %ymm0,0x20(%rsp)
|
||||
vpxor %ymm0,%ymm12,%ymm12
|
||||
vpshufb %ymm3,%ymm12,%ymm12
|
||||
# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
|
||||
vpaddd 0x40(%rsp),%ymm7,%ymm0
|
||||
vmovdqa %ymm0,0x40(%rsp)
|
||||
vpxor %ymm0,%ymm13,%ymm13
|
||||
vpshufb %ymm3,%ymm13,%ymm13
|
||||
# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
|
||||
vpaddd 0x60(%rsp),%ymm4,%ymm0
|
||||
vmovdqa %ymm0,0x60(%rsp)
|
||||
vpxor %ymm0,%ymm14,%ymm14
|
||||
vpshufb %ymm3,%ymm14,%ymm14
|
||||
|
||||
# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
|
||||
vpaddd %ymm15,%ymm10,%ymm10
|
||||
vpxor %ymm10,%ymm5,%ymm5
|
||||
vpslld $12,%ymm5,%ymm0
|
||||
vpsrld $20,%ymm5,%ymm5
|
||||
vpor %ymm0,%ymm5,%ymm5
|
||||
# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
|
||||
vpaddd %ymm12,%ymm11,%ymm11
|
||||
vpxor %ymm11,%ymm6,%ymm6
|
||||
vpslld $12,%ymm6,%ymm0
|
||||
vpsrld $20,%ymm6,%ymm6
|
||||
vpor %ymm0,%ymm6,%ymm6
|
||||
# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
|
||||
vpaddd %ymm13,%ymm8,%ymm8
|
||||
vpxor %ymm8,%ymm7,%ymm7
|
||||
vpslld $12,%ymm7,%ymm0
|
||||
vpsrld $20,%ymm7,%ymm7
|
||||
vpor %ymm0,%ymm7,%ymm7
|
||||
# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
|
||||
vpaddd %ymm14,%ymm9,%ymm9
|
||||
vpxor %ymm9,%ymm4,%ymm4
|
||||
vpslld $12,%ymm4,%ymm0
|
||||
vpsrld $20,%ymm4,%ymm4
|
||||
vpor %ymm0,%ymm4,%ymm4
|
||||
|
||||
# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
|
||||
vpaddd 0x00(%rsp),%ymm5,%ymm0
|
||||
vmovdqa %ymm0,0x00(%rsp)
|
||||
vpxor %ymm0,%ymm15,%ymm15
|
||||
vpshufb %ymm2,%ymm15,%ymm15
|
||||
# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
|
||||
vpaddd 0x20(%rsp),%ymm6,%ymm0
|
||||
vmovdqa %ymm0,0x20(%rsp)
|
||||
vpxor %ymm0,%ymm12,%ymm12
|
||||
vpshufb %ymm2,%ymm12,%ymm12
|
||||
# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
|
||||
vpaddd 0x40(%rsp),%ymm7,%ymm0
|
||||
vmovdqa %ymm0,0x40(%rsp)
|
||||
vpxor %ymm0,%ymm13,%ymm13
|
||||
vpshufb %ymm2,%ymm13,%ymm13
|
||||
# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
|
||||
vpaddd 0x60(%rsp),%ymm4,%ymm0
|
||||
vmovdqa %ymm0,0x60(%rsp)
|
||||
vpxor %ymm0,%ymm14,%ymm14
|
||||
vpshufb %ymm2,%ymm14,%ymm14
|
||||
|
||||
# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
|
||||
vpaddd %ymm15,%ymm10,%ymm10
|
||||
vpxor %ymm10,%ymm5,%ymm5
|
||||
vpslld $7,%ymm5,%ymm0
|
||||
vpsrld $25,%ymm5,%ymm5
|
||||
vpor %ymm0,%ymm5,%ymm5
|
||||
# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
|
||||
vpaddd %ymm12,%ymm11,%ymm11
|
||||
vpxor %ymm11,%ymm6,%ymm6
|
||||
vpslld $7,%ymm6,%ymm0
|
||||
vpsrld $25,%ymm6,%ymm6
|
||||
vpor %ymm0,%ymm6,%ymm6
|
||||
# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
|
||||
vpaddd %ymm13,%ymm8,%ymm8
|
||||
vpxor %ymm8,%ymm7,%ymm7
|
||||
vpslld $7,%ymm7,%ymm0
|
||||
vpsrld $25,%ymm7,%ymm7
|
||||
vpor %ymm0,%ymm7,%ymm7
|
||||
# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
|
||||
vpaddd %ymm14,%ymm9,%ymm9
|
||||
vpxor %ymm9,%ymm4,%ymm4
|
||||
vpslld $7,%ymm4,%ymm0
|
||||
vpsrld $25,%ymm4,%ymm4
|
||||
vpor %ymm0,%ymm4,%ymm4
|
||||
|
||||
dec %ecx
|
||||
jnz .Ldoubleround8
|
||||
|
||||
# x0..15[0-3] += s[0..15]
|
||||
vpbroadcastd 0x00(%rdi),%ymm0
|
||||
vpaddd 0x00(%rsp),%ymm0,%ymm0
|
||||
vmovdqa %ymm0,0x00(%rsp)
|
||||
vpbroadcastd 0x04(%rdi),%ymm0
|
||||
vpaddd 0x20(%rsp),%ymm0,%ymm0
|
||||
vmovdqa %ymm0,0x20(%rsp)
|
||||
vpbroadcastd 0x08(%rdi),%ymm0
|
||||
vpaddd 0x40(%rsp),%ymm0,%ymm0
|
||||
vmovdqa %ymm0,0x40(%rsp)
|
||||
vpbroadcastd 0x0c(%rdi),%ymm0
|
||||
vpaddd 0x60(%rsp),%ymm0,%ymm0
|
||||
vmovdqa %ymm0,0x60(%rsp)
|
||||
vpbroadcastd 0x10(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm4,%ymm4
|
||||
vpbroadcastd 0x14(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm5,%ymm5
|
||||
vpbroadcastd 0x18(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm6,%ymm6
|
||||
vpbroadcastd 0x1c(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm7,%ymm7
|
||||
vpbroadcastd 0x20(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm8,%ymm8
|
||||
vpbroadcastd 0x24(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm9,%ymm9
|
||||
vpbroadcastd 0x28(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm10,%ymm10
|
||||
vpbroadcastd 0x2c(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm11,%ymm11
|
||||
vpbroadcastd 0x30(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm12,%ymm12
|
||||
vpbroadcastd 0x34(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm13,%ymm13
|
||||
vpbroadcastd 0x38(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm14,%ymm14
|
||||
vpbroadcastd 0x3c(%rdi),%ymm0
|
||||
vpaddd %ymm0,%ymm15,%ymm15
|
||||
|
||||
# x12 += counter values 0-3
|
||||
vpaddd %ymm1,%ymm12,%ymm12
|
||||
|
||||
# interleave 32-bit words in state n, n+1
|
||||
vmovdqa 0x00(%rsp),%ymm0
|
||||
vmovdqa 0x20(%rsp),%ymm1
|
||||
vpunpckldq %ymm1,%ymm0,%ymm2
|
||||
vpunpckhdq %ymm1,%ymm0,%ymm1
|
||||
vmovdqa %ymm2,0x00(%rsp)
|
||||
vmovdqa %ymm1,0x20(%rsp)
|
||||
vmovdqa 0x40(%rsp),%ymm0
|
||||
vmovdqa 0x60(%rsp),%ymm1
|
||||
vpunpckldq %ymm1,%ymm0,%ymm2
|
||||
vpunpckhdq %ymm1,%ymm0,%ymm1
|
||||
vmovdqa %ymm2,0x40(%rsp)
|
||||
vmovdqa %ymm1,0x60(%rsp)
|
||||
vmovdqa %ymm4,%ymm0
|
||||
vpunpckldq %ymm5,%ymm0,%ymm4
|
||||
vpunpckhdq %ymm5,%ymm0,%ymm5
|
||||
vmovdqa %ymm6,%ymm0
|
||||
vpunpckldq %ymm7,%ymm0,%ymm6
|
||||
vpunpckhdq %ymm7,%ymm0,%ymm7
|
||||
vmovdqa %ymm8,%ymm0
|
||||
vpunpckldq %ymm9,%ymm0,%ymm8
|
||||
vpunpckhdq %ymm9,%ymm0,%ymm9
|
||||
vmovdqa %ymm10,%ymm0
|
||||
vpunpckldq %ymm11,%ymm0,%ymm10
|
||||
vpunpckhdq %ymm11,%ymm0,%ymm11
|
||||
vmovdqa %ymm12,%ymm0
|
||||
vpunpckldq %ymm13,%ymm0,%ymm12
|
||||
vpunpckhdq %ymm13,%ymm0,%ymm13
|
||||
vmovdqa %ymm14,%ymm0
|
||||
vpunpckldq %ymm15,%ymm0,%ymm14
|
||||
vpunpckhdq %ymm15,%ymm0,%ymm15
|
||||
|
||||
# interleave 64-bit words in state n, n+2
|
||||
vmovdqa 0x00(%rsp),%ymm0
|
||||
vmovdqa 0x40(%rsp),%ymm2
|
||||
vpunpcklqdq %ymm2,%ymm0,%ymm1
|
||||
vpunpckhqdq %ymm2,%ymm0,%ymm2
|
||||
vmovdqa %ymm1,0x00(%rsp)
|
||||
vmovdqa %ymm2,0x40(%rsp)
|
||||
vmovdqa 0x20(%rsp),%ymm0
|
||||
vmovdqa 0x60(%rsp),%ymm2
|
||||
vpunpcklqdq %ymm2,%ymm0,%ymm1
|
||||
vpunpckhqdq %ymm2,%ymm0,%ymm2
|
||||
vmovdqa %ymm1,0x20(%rsp)
|
||||
vmovdqa %ymm2,0x60(%rsp)
|
||||
vmovdqa %ymm4,%ymm0
|
||||
vpunpcklqdq %ymm6,%ymm0,%ymm4
|
||||
vpunpckhqdq %ymm6,%ymm0,%ymm6
|
||||
vmovdqa %ymm5,%ymm0
|
||||
vpunpcklqdq %ymm7,%ymm0,%ymm5
|
||||
vpunpckhqdq %ymm7,%ymm0,%ymm7
|
||||
vmovdqa %ymm8,%ymm0
|
||||
vpunpcklqdq %ymm10,%ymm0,%ymm8
|
||||
vpunpckhqdq %ymm10,%ymm0,%ymm10
|
||||
vmovdqa %ymm9,%ymm0
|
||||
vpunpcklqdq %ymm11,%ymm0,%ymm9
|
||||
vpunpckhqdq %ymm11,%ymm0,%ymm11
|
||||
vmovdqa %ymm12,%ymm0
|
||||
vpunpcklqdq %ymm14,%ymm0,%ymm12
|
||||
vpunpckhqdq %ymm14,%ymm0,%ymm14
|
||||
vmovdqa %ymm13,%ymm0
|
||||
vpunpcklqdq %ymm15,%ymm0,%ymm13
|
||||
vpunpckhqdq %ymm15,%ymm0,%ymm15
|
||||
|
||||
# interleave 128-bit words in state n, n+4
|
||||
vmovdqa 0x00(%rsp),%ymm0
|
||||
vperm2i128 $0x20,%ymm4,%ymm0,%ymm1
|
||||
vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
|
||||
vmovdqa %ymm1,0x00(%rsp)
|
||||
vmovdqa 0x20(%rsp),%ymm0
|
||||
vperm2i128 $0x20,%ymm5,%ymm0,%ymm1
|
||||
vperm2i128 $0x31,%ymm5,%ymm0,%ymm5
|
||||
vmovdqa %ymm1,0x20(%rsp)
|
||||
vmovdqa 0x40(%rsp),%ymm0
|
||||
vperm2i128 $0x20,%ymm6,%ymm0,%ymm1
|
||||
vperm2i128 $0x31,%ymm6,%ymm0,%ymm6
|
||||
vmovdqa %ymm1,0x40(%rsp)
|
||||
vmovdqa 0x60(%rsp),%ymm0
|
||||
vperm2i128 $0x20,%ymm7,%ymm0,%ymm1
|
||||
vperm2i128 $0x31,%ymm7,%ymm0,%ymm7
|
||||
vmovdqa %ymm1,0x60(%rsp)
|
||||
vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
|
||||
vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
|
||||
vmovdqa %ymm0,%ymm8
|
||||
vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
|
||||
vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
|
||||
vmovdqa %ymm0,%ymm9
|
||||
vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
|
||||
vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
|
||||
vmovdqa %ymm0,%ymm10
|
||||
vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
|
||||
vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
|
||||
vmovdqa %ymm0,%ymm11
|
||||
|
||||
# xor with corresponding input, write to output
|
||||
vmovdqa 0x00(%rsp),%ymm0
|
||||
vpxor 0x0000(%rdx),%ymm0,%ymm0
|
||||
vmovdqu %ymm0,0x0000(%rsi)
|
||||
vmovdqa 0x20(%rsp),%ymm0
|
||||
vpxor 0x0080(%rdx),%ymm0,%ymm0
|
||||
vmovdqu %ymm0,0x0080(%rsi)
|
||||
vmovdqa 0x40(%rsp),%ymm0
|
||||
vpxor 0x0040(%rdx),%ymm0,%ymm0
|
||||
vmovdqu %ymm0,0x0040(%rsi)
|
||||
vmovdqa 0x60(%rsp),%ymm0
|
||||
vpxor 0x00c0(%rdx),%ymm0,%ymm0
|
||||
vmovdqu %ymm0,0x00c0(%rsi)
|
||||
vpxor 0x0100(%rdx),%ymm4,%ymm4
|
||||
vmovdqu %ymm4,0x0100(%rsi)
|
||||
vpxor 0x0180(%rdx),%ymm5,%ymm5
|
||||
vmovdqu %ymm5,0x0180(%rsi)
|
||||
vpxor 0x0140(%rdx),%ymm6,%ymm6
|
||||
vmovdqu %ymm6,0x0140(%rsi)
|
||||
vpxor 0x01c0(%rdx),%ymm7,%ymm7
|
||||
vmovdqu %ymm7,0x01c0(%rsi)
|
||||
vpxor 0x0020(%rdx),%ymm8,%ymm8
|
||||
vmovdqu %ymm8,0x0020(%rsi)
|
||||
vpxor 0x00a0(%rdx),%ymm9,%ymm9
|
||||
vmovdqu %ymm9,0x00a0(%rsi)
|
||||
vpxor 0x0060(%rdx),%ymm10,%ymm10
|
||||
vmovdqu %ymm10,0x0060(%rsi)
|
||||
vpxor 0x00e0(%rdx),%ymm11,%ymm11
|
||||
vmovdqu %ymm11,0x00e0(%rsi)
|
||||
vpxor 0x0120(%rdx),%ymm12,%ymm12
|
||||
vmovdqu %ymm12,0x0120(%rsi)
|
||||
vpxor 0x01a0(%rdx),%ymm13,%ymm13
|
||||
vmovdqu %ymm13,0x01a0(%rsi)
|
||||
vpxor 0x0160(%rdx),%ymm14,%ymm14
|
||||
vmovdqu %ymm14,0x0160(%rsi)
|
||||
vpxor 0x01e0(%rdx),%ymm15,%ymm15
|
||||
vmovdqu %ymm15,0x01e0(%rsi)
|
||||
|
||||
vzeroupper
|
||||
lea -8(%r10),%rsp
|
||||
ret
|
||||
ENDPROC(chacha20_8block_xor_avx2)
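The "interleave 32-bit / 64-bit / 128-bit words" stages above amount to a matrix transpose: after the rounds, register n holds word n of every block, and the interleaves regroup the data so that whole blocks can be XORed against contiguous input. The same reshuffle is shown in plain C below for a 4x4 group of words, as an illustration only; the SIMD code reaches the same layout with punpck{l,h}dq, punpck{l,h}qdq and, in this eight-block AVX2 version, vperm2i128 lane swaps:

#include <stdint.h>

/* Transpose a 4x4 matrix of 32-bit words in place. */
static void transpose4x4(uint32_t x[4][4])
{
        int i, j;

        for (i = 0; i < 4; i++) {
                for (j = i + 1; j < 4; j++) {
                        uint32_t t = x[i][j];

                        x[i][j] = x[j][i];
                        x[j][i] = t;
                }
        }
}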
|
@@ -1,146 +0,0 @@
|
||||
/*
|
||||
* ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
|
||||
*
|
||||
* Copyright (C) 2015 Martin Willi
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/chacha20.h>
|
||||
#include <crypto/internal/skcipher.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <asm/fpu/api.h>
|
||||
#include <asm/simd.h>
|
||||
|
||||
#define CHACHA20_STATE_ALIGN 16
|
||||
|
||||
asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
|
||||
asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
|
||||
static bool chacha20_use_avx2;
|
||||
#endif
|
||||
|
||||
static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int bytes)
|
||||
{
|
||||
u8 buf[CHACHA20_BLOCK_SIZE];
|
||||
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
if (chacha20_use_avx2) {
|
||||
while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
|
||||
chacha20_8block_xor_avx2(state, dst, src);
|
||||
bytes -= CHACHA20_BLOCK_SIZE * 8;
|
||||
src += CHACHA20_BLOCK_SIZE * 8;
|
||||
dst += CHACHA20_BLOCK_SIZE * 8;
|
||||
state[12] += 8;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
|
||||
chacha20_4block_xor_ssse3(state, dst, src);
|
||||
bytes -= CHACHA20_BLOCK_SIZE * 4;
|
||||
src += CHACHA20_BLOCK_SIZE * 4;
|
||||
dst += CHACHA20_BLOCK_SIZE * 4;
|
||||
state[12] += 4;
|
||||
}
|
||||
while (bytes >= CHACHA20_BLOCK_SIZE) {
|
||||
chacha20_block_xor_ssse3(state, dst, src);
|
||||
bytes -= CHACHA20_BLOCK_SIZE;
|
||||
src += CHACHA20_BLOCK_SIZE;
|
||||
dst += CHACHA20_BLOCK_SIZE;
|
||||
state[12]++;
|
||||
}
|
||||
if (bytes) {
|
||||
memcpy(buf, src, bytes);
|
||||
chacha20_block_xor_ssse3(state, buf, buf);
|
||||
memcpy(dst, buf, bytes);
|
||||
}
|
||||
}
|
||||
|
||||
static int chacha20_simd(struct skcipher_request *req)
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
u32 *state, state_buf[16 + 2] __aligned(8);
|
||||
struct skcipher_walk walk;
|
||||
int err;
|
||||
|
||||
BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
|
||||
state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
|
||||
|
||||
if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd())
|
||||
return crypto_chacha20_crypt(req);
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, true);
|
||||
|
||||
crypto_chacha20_init(state, ctx, walk.iv);
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
|
||||
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
|
||||
rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
|
||||
err = skcipher_walk_done(&walk,
|
||||
walk.nbytes % CHACHA20_BLOCK_SIZE);
|
||||
}
|
||||
|
||||
if (walk.nbytes) {
|
||||
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
|
||||
walk.nbytes);
|
||||
err = skcipher_walk_done(&walk, 0);
|
||||
}
|
||||
|
||||
kernel_fpu_end();
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static struct skcipher_alg alg = {
|
||||
.base.cra_name = "chacha20",
|
||||
.base.cra_driver_name = "chacha20-simd",
|
||||
.base.cra_priority = 300,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct chacha20_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.min_keysize = CHACHA20_KEY_SIZE,
|
||||
.max_keysize = CHACHA20_KEY_SIZE,
|
||||
.ivsize = CHACHA20_IV_SIZE,
|
||||
.chunksize = CHACHA20_BLOCK_SIZE,
|
||||
.setkey = crypto_chacha20_setkey,
|
||||
.encrypt = chacha20_simd,
|
||||
.decrypt = chacha20_simd,
|
||||
};
|
||||
|
||||
static int __init chacha20_simd_mod_init(void)
|
||||
{
|
||||
if (!boot_cpu_has(X86_FEATURE_SSSE3))
|
||||
return -ENODEV;
|
||||
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
|
||||
boot_cpu_has(X86_FEATURE_AVX2) &&
|
||||
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
|
||||
#endif
|
||||
return crypto_register_skcipher(&alg);
|
||||
}
|
||||
|
||||
static void __exit chacha20_simd_mod_fini(void)
|
||||
{
|
||||
crypto_unregister_skcipher(&alg);
|
||||
}
|
||||
|
||||
module_init(chacha20_simd_mod_init);
|
||||
module_exit(chacha20_simd_mod_fini);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
|
||||
MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated");
|
||||
MODULE_ALIAS_CRYPTO("chacha20");
|
||||
MODULE_ALIAS_CRYPTO("chacha20-simd");
|
arch/x86/crypto/chacha_glue.c (new file, 304 lines)
@@ -0,0 +1,304 @@
|
||||
/*
|
||||
* x64 SIMD accelerated ChaCha and XChaCha stream ciphers,
|
||||
* including ChaCha20 (RFC7539)
|
||||
*
|
||||
* Copyright (C) 2015 Martin Willi
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/chacha.h>
|
||||
#include <crypto/internal/skcipher.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <asm/fpu/api.h>
|
||||
#include <asm/simd.h>
|
||||
|
||||
#define CHACHA_STATE_ALIGN 16
|
||||
|
||||
asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int len, int nrounds);
|
||||
asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int len, int nrounds);
|
||||
asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int len, int nrounds);
|
||||
asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int len, int nrounds);
|
||||
asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int len, int nrounds);
|
||||
static bool chacha_use_avx2;
|
||||
#ifdef CONFIG_AS_AVX512
|
||||
asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int len, int nrounds);
|
||||
asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int len, int nrounds);
|
||||
asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int len, int nrounds);
|
||||
static bool chacha_use_avx512vl;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
|
||||
{
|
||||
len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
|
||||
return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
|
||||
}
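chacha_advance() turns the number of tail bytes handed to a multi-block kernel into the number of whole blocks the counter in state[12] must advance by, capped at that kernel's width. A standalone check of the arithmetic; the helper below is a local restatement for illustration, with BLK standing in for CHACHA_BLOCK_SIZE (64):

#include <assert.h>

#define BLK 64u /* assumed CHACHA_BLOCK_SIZE */

static unsigned int advance(unsigned int len, unsigned int maxblocks)
{
        if (len > maxblocks * BLK)
                len = maxblocks * BLK;          /* min(len, maxblocks * BLK) */
        return (len + BLK - 1) / BLK;           /* round_up(len, BLK) / BLK */
}

int main(void)
{
        assert(advance(65, 2) == 2);    /* one full block plus 1 byte */
        assert(advance(300, 4) == 4);   /* capped at the 4-block kernel */
        assert(advance(64, 4) == 1);    /* exactly one block */
        return 0;
}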
|
||||
|
||||
static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int bytes, int nrounds)
|
||||
{
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
#ifdef CONFIG_AS_AVX512
|
||||
if (chacha_use_avx512vl) {
|
||||
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
|
||||
chacha_8block_xor_avx512vl(state, dst, src, bytes,
|
||||
nrounds);
|
||||
bytes -= CHACHA_BLOCK_SIZE * 8;
|
||||
src += CHACHA_BLOCK_SIZE * 8;
|
||||
dst += CHACHA_BLOCK_SIZE * 8;
|
||||
state[12] += 8;
|
||||
}
|
||||
if (bytes > CHACHA_BLOCK_SIZE * 4) {
|
||||
chacha_8block_xor_avx512vl(state, dst, src, bytes,
|
||||
nrounds);
|
||||
state[12] += chacha_advance(bytes, 8);
|
||||
return;
|
||||
}
|
||||
if (bytes > CHACHA_BLOCK_SIZE * 2) {
|
||||
chacha_4block_xor_avx512vl(state, dst, src, bytes,
|
||||
nrounds);
|
||||
state[12] += chacha_advance(bytes, 4);
|
||||
return;
|
||||
}
|
||||
if (bytes) {
|
||||
chacha_2block_xor_avx512vl(state, dst, src, bytes,
|
||||
nrounds);
|
||||
state[12] += chacha_advance(bytes, 2);
|
||||
return;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (chacha_use_avx2) {
|
||||
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
|
||||
chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
|
||||
bytes -= CHACHA_BLOCK_SIZE * 8;
|
||||
src += CHACHA_BLOCK_SIZE * 8;
|
||||
dst += CHACHA_BLOCK_SIZE * 8;
|
||||
state[12] += 8;
|
||||
}
|
||||
if (bytes > CHACHA_BLOCK_SIZE * 4) {
|
||||
chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
|
||||
state[12] += chacha_advance(bytes, 8);
|
||||
return;
|
||||
}
|
||||
if (bytes > CHACHA_BLOCK_SIZE * 2) {
|
||||
chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
|
||||
state[12] += chacha_advance(bytes, 4);
|
||||
return;
|
||||
}
|
||||
if (bytes > CHACHA_BLOCK_SIZE) {
|
||||
chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
|
||||
state[12] += chacha_advance(bytes, 2);
|
||||
return;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
|
||||
chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
|
||||
bytes -= CHACHA_BLOCK_SIZE * 4;
|
||||
src += CHACHA_BLOCK_SIZE * 4;
|
||||
dst += CHACHA_BLOCK_SIZE * 4;
|
||||
state[12] += 4;
|
||||
}
|
||||
if (bytes > CHACHA_BLOCK_SIZE) {
|
||||
chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
|
||||
state[12] += chacha_advance(bytes, 4);
|
||||
return;
|
||||
}
|
||||
if (bytes) {
|
||||
chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
|
||||
state[12]++;
|
||||
}
|
||||
}
|
||||
|
||||
static int chacha_simd_stream_xor(struct skcipher_walk *walk,
|
||||
struct chacha_ctx *ctx, u8 *iv)
|
||||
{
|
||||
u32 *state, state_buf[16 + 2] __aligned(8);
|
||||
int next_yield = 4096; /* bytes until next FPU yield */
|
||||
int err = 0;
|
||||
|
||||
BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
|
||||
state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
|
||||
|
||||
crypto_chacha_init(state, ctx, iv);
|
||||
|
||||
while (walk->nbytes > 0) {
|
||||
unsigned int nbytes = walk->nbytes;
|
||||
|
||||
if (nbytes < walk->total) {
|
||||
nbytes = round_down(nbytes, walk->stride);
|
||||
next_yield -= nbytes;
|
||||
}
|
||||
|
||||
chacha_dosimd(state, walk->dst.virt.addr, walk->src.virt.addr,
|
||||
nbytes, ctx->nrounds);
|
||||
|
||||
if (next_yield <= 0) {
|
||||
/* temporarily allow preemption */
|
||||
kernel_fpu_end();
|
||||
kernel_fpu_begin();
|
||||
next_yield = 4096;
|
||||
}
|
||||
|
||||
err = skcipher_walk_done(walk, walk->nbytes - nbytes);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int chacha_simd(struct skcipher_request *req)
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct skcipher_walk walk;
|
||||
int err;
|
||||
|
||||
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
|
||||
return crypto_chacha_crypt(req);
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, true);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
kernel_fpu_begin();
|
||||
err = chacha_simd_stream_xor(&walk, ctx, req->iv);
|
||||
kernel_fpu_end();
|
||||
return err;
|
||||
}
|
||||
|
||||
static int xchacha_simd(struct skcipher_request *req)
|
||||
{
|
||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||
struct skcipher_walk walk;
|
||||
struct chacha_ctx subctx;
|
||||
u32 *state, state_buf[16 + 2] __aligned(8);
|
||||
u8 real_iv[16];
|
||||
int err;
|
||||
|
||||
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
|
||||
return crypto_xchacha_crypt(req);
|
||||
|
||||
err = skcipher_walk_virt(&walk, req, true);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
|
||||
state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
|
||||
crypto_chacha_init(state, ctx, req->iv);
|
||||
|
||||
kernel_fpu_begin();
|
||||
|
||||
hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
|
||||
subctx.nrounds = ctx->nrounds;
|
||||
|
||||
memcpy(&real_iv[0], req->iv + 24, 8);
|
||||
memcpy(&real_iv[8], req->iv + 16, 8);
|
||||
err = chacha_simd_stream_xor(&walk, &subctx, real_iv);
|
||||
|
||||
kernel_fpu_end();
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static struct skcipher_alg algs[] = {
|
||||
{
|
||||
.base.cra_name = "chacha20",
|
||||
.base.cra_driver_name = "chacha20-simd",
|
||||
.base.cra_priority = 300,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.min_keysize = CHACHA_KEY_SIZE,
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = CHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.setkey = crypto_chacha20_setkey,
|
||||
.encrypt = chacha_simd,
|
||||
.decrypt = chacha_simd,
|
||||
}, {
|
||||
.base.cra_name = "xchacha20",
|
||||
.base.cra_driver_name = "xchacha20-simd",
|
||||
.base.cra_priority = 300,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.min_keysize = CHACHA_KEY_SIZE,
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = XCHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.setkey = crypto_chacha20_setkey,
|
||||
.encrypt = xchacha_simd,
|
||||
.decrypt = xchacha_simd,
|
||||
}, {
|
||||
.base.cra_name = "xchacha12",
|
||||
.base.cra_driver_name = "xchacha12-simd",
|
||||
.base.cra_priority = 300,
|
||||
.base.cra_blocksize = 1,
|
||||
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
|
||||
.min_keysize = CHACHA_KEY_SIZE,
|
||||
.max_keysize = CHACHA_KEY_SIZE,
|
||||
.ivsize = XCHACHA_IV_SIZE,
|
||||
.chunksize = CHACHA_BLOCK_SIZE,
|
||||
.setkey = crypto_chacha12_setkey,
|
||||
.encrypt = xchacha_simd,
|
||||
.decrypt = xchacha_simd,
|
||||
},
|
||||
};
|
||||
|
||||
static int __init chacha_simd_mod_init(void)
|
||||
{
|
||||
if (!boot_cpu_has(X86_FEATURE_SSSE3))
|
||||
return -ENODEV;
|
||||
|
||||
#ifdef CONFIG_AS_AVX2
|
||||
chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
|
||||
boot_cpu_has(X86_FEATURE_AVX2) &&
|
||||
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
|
||||
#ifdef CONFIG_AS_AVX512
|
||||
chacha_use_avx512vl = chacha_use_avx2 &&
|
||||
boot_cpu_has(X86_FEATURE_AVX512VL) &&
|
||||
boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
|
||||
#endif
|
||||
#endif
|
||||
return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
|
||||
}
|
||||
|
||||
static void __exit chacha_simd_mod_fini(void)
|
||||
{
|
||||
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
|
||||
}
|
||||
|
||||
module_init(chacha_simd_mod_init);
|
||||
module_exit(chacha_simd_mod_fini);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
|
||||
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)");
|
||||
MODULE_ALIAS_CRYPTO("chacha20");
|
||||
MODULE_ALIAS_CRYPTO("chacha20-simd");
|
||||
MODULE_ALIAS_CRYPTO("xchacha20");
|
||||
MODULE_ALIAS_CRYPTO("xchacha20-simd");
|
||||
MODULE_ALIAS_CRYPTO("xchacha12");
|
||||
MODULE_ALIAS_CRYPTO("xchacha12-simd");
|
arch/x86/crypto/nh-avx2-x86_64.S (new file, 157 lines)
@@ -0,0 +1,157 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* NH - ε-almost-universal hash function, x86_64 AVX2 accelerated
|
||||
*
|
||||
* Copyright 2018 Google LLC
|
||||
*
|
||||
* Author: Eric Biggers <ebiggers@google.com>
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
#define PASS0_SUMS %ymm0
|
||||
#define PASS1_SUMS %ymm1
|
||||
#define PASS2_SUMS %ymm2
|
||||
#define PASS3_SUMS %ymm3
|
||||
#define K0 %ymm4
|
||||
#define K0_XMM %xmm4
|
||||
#define K1 %ymm5
|
||||
#define K1_XMM %xmm5
|
||||
#define K2 %ymm6
|
||||
#define K2_XMM %xmm6
|
||||
#define K3 %ymm7
|
||||
#define K3_XMM %xmm7
|
||||
#define T0 %ymm8
|
||||
#define T1 %ymm9
|
||||
#define T2 %ymm10
|
||||
#define T2_XMM %xmm10
|
||||
#define T3 %ymm11
|
||||
#define T3_XMM %xmm11
|
||||
#define T4 %ymm12
|
||||
#define T5 %ymm13
|
||||
#define T6 %ymm14
|
||||
#define T7 %ymm15
|
||||
#define KEY %rdi
|
||||
#define MESSAGE %rsi
|
||||
#define MESSAGE_LEN %rdx
|
||||
#define HASH %rcx
|
||||
|
||||
.macro _nh_2xstride k0, k1, k2, k3
|
||||
|
||||
// Add message words to key words
|
||||
vpaddd \k0, T3, T0
|
||||
vpaddd \k1, T3, T1
|
||||
vpaddd \k2, T3, T2
|
||||
vpaddd \k3, T3, T3
|
||||
|
||||
// Multiply 32x32 => 64 and accumulate
|
||||
vpshufd $0x10, T0, T4
|
||||
vpshufd $0x32, T0, T0
|
||||
vpshufd $0x10, T1, T5
|
||||
vpshufd $0x32, T1, T1
|
||||
vpshufd $0x10, T2, T6
|
||||
vpshufd $0x32, T2, T2
|
||||
vpshufd $0x10, T3, T7
|
||||
vpshufd $0x32, T3, T3
|
||||
vpmuludq T4, T0, T0
|
||||
vpmuludq T5, T1, T1
|
||||
vpmuludq T6, T2, T2
|
||||
vpmuludq T7, T3, T3
|
||||
vpaddq T0, PASS0_SUMS, PASS0_SUMS
|
||||
vpaddq T1, PASS1_SUMS, PASS1_SUMS
|
||||
vpaddq T2, PASS2_SUMS, PASS2_SUMS
|
||||
vpaddq T3, PASS3_SUMS, PASS3_SUMS
|
||||
.endm
|
||||
|
||||
/*
|
||||
* void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
|
||||
* u8 hash[NH_HASH_BYTES])
|
||||
*
|
||||
* It's guaranteed that message_len % 16 == 0.
|
||||
*/
|
||||
ENTRY(nh_avx2)
|
||||
|
||||
vmovdqu 0x00(KEY), K0
|
||||
vmovdqu 0x10(KEY), K1
|
||||
add $0x20, KEY
|
||||
vpxor PASS0_SUMS, PASS0_SUMS, PASS0_SUMS
|
||||
vpxor PASS1_SUMS, PASS1_SUMS, PASS1_SUMS
|
||||
vpxor PASS2_SUMS, PASS2_SUMS, PASS2_SUMS
|
||||
vpxor PASS3_SUMS, PASS3_SUMS, PASS3_SUMS
|
||||
|
||||
sub $0x40, MESSAGE_LEN
|
||||
jl .Lloop4_done
|
||||
.Lloop4:
|
||||
vmovdqu (MESSAGE), T3
|
||||
vmovdqu 0x00(KEY), K2
|
||||
vmovdqu 0x10(KEY), K3
|
||||
_nh_2xstride K0, K1, K2, K3
|
||||
|
||||
vmovdqu 0x20(MESSAGE), T3
|
||||
vmovdqu 0x20(KEY), K0
|
||||
vmovdqu 0x30(KEY), K1
|
||||
_nh_2xstride K2, K3, K0, K1
|
||||
|
||||
add $0x40, MESSAGE
|
||||
add $0x40, KEY
|
||||
sub $0x40, MESSAGE_LEN
|
||||
jge .Lloop4
|
||||
|
||||
.Lloop4_done:
|
||||
and $0x3f, MESSAGE_LEN
|
||||
jz .Ldone
|
||||
|
||||
cmp $0x20, MESSAGE_LEN
|
||||
jl .Llast
|
||||
|
||||
// 2 or 3 strides remain; do 2 more.
|
||||
vmovdqu (MESSAGE), T3
|
||||
vmovdqu 0x00(KEY), K2
|
||||
vmovdqu 0x10(KEY), K3
|
||||
_nh_2xstride K0, K1, K2, K3
|
||||
add $0x20, MESSAGE
|
||||
add $0x20, KEY
|
||||
sub $0x20, MESSAGE_LEN
|
||||
jz .Ldone
|
||||
vmovdqa K2, K0
|
||||
vmovdqa K3, K1
|
||||
.Llast:
|
||||
// Last stride. Zero the high 128 bits of the message and keys so they
|
||||
// don't affect the result when processing them like 2 strides.
|
||||
vmovdqu (MESSAGE), T3_XMM
|
||||
vmovdqa K0_XMM, K0_XMM
|
||||
vmovdqa K1_XMM, K1_XMM
|
||||
vmovdqu 0x00(KEY), K2_XMM
|
||||
vmovdqu 0x10(KEY), K3_XMM
|
||||
_nh_2xstride K0, K1, K2, K3
|
||||
|
||||
.Ldone:
|
||||
// Sum the accumulators for each pass, then store the sums to 'hash'
|
||||
|
||||
// PASS0_SUMS is (0A 0B 0C 0D)
|
||||
// PASS1_SUMS is (1A 1B 1C 1D)
|
||||
// PASS2_SUMS is (2A 2B 2C 2D)
|
||||
// PASS3_SUMS is (3A 3B 3C 3D)
|
||||
// We need the horizontal sums:
|
||||
// (0A + 0B + 0C + 0D,
|
||||
// 1A + 1B + 1C + 1D,
|
||||
// 2A + 2B + 2C + 2D,
|
||||
// 3A + 3B + 3C + 3D)
|
||||
//
|
||||
|
||||
vpunpcklqdq PASS1_SUMS, PASS0_SUMS, T0 // T0 = (0A 1A 0C 1C)
|
||||
vpunpckhqdq PASS1_SUMS, PASS0_SUMS, T1 // T1 = (0B 1B 0D 1D)
|
||||
vpunpcklqdq PASS3_SUMS, PASS2_SUMS, T2 // T2 = (2A 3A 2C 3C)
|
||||
vpunpckhqdq PASS3_SUMS, PASS2_SUMS, T3 // T3 = (2B 3B 2D 3D)
|
||||
|
||||
vinserti128 $0x1, T2_XMM, T0, T4 // T4 = (0A 1A 2A 3A)
|
||||
vinserti128 $0x1, T3_XMM, T1, T5 // T5 = (0B 1B 2B 3B)
|
||||
vperm2i128 $0x31, T2, T0, T0 // T0 = (0C 1C 2C 3C)
|
||||
vperm2i128 $0x31, T3, T1, T1 // T1 = (0D 1D 2D 3D)
|
||||
|
||||
vpaddq T5, T4, T4
|
||||
vpaddq T1, T0, T0
|
||||
vpaddq T4, T0, T0
|
||||
vmovdqu T0, (HASH)
|
||||
ret
|
||||
ENDPROC(nh_avx2)
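nh_avx2() above (and nh_sse2() further down) vectorize the NH pass structure used by Adiantum's NHPoly1305: every 16-byte message unit is added, word by word, to four successive 16-byte windows of the key, and each pass accumulates two 32x32->64-bit products. The portable sketch below restates that computation from memory of the generic fallback, with the unaligned little-endian loads open-coded, so treat the names and index details as approximate rather than authoritative:

#include <stddef.h>
#include <stdint.h>

static uint32_t load_le32(const uint8_t *p)
{
        return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
               ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

/* NH over a message whose length is a multiple of 16 bytes. The key must
 * extend 12 extra 32-bit words past message_len / 4, matching the sliding
 * window used by the fourth pass on the final unit. */
static void nh_ref(const uint32_t *key, const uint8_t *message,
                   size_t message_len, uint64_t hash[4])
{
        uint64_t sums[4] = { 0, 0, 0, 0 };
        int i;

        while (message_len >= 16) {
                uint32_t m0 = load_le32(message + 0);
                uint32_t m1 = load_le32(message + 4);
                uint32_t m2 = load_le32(message + 8);
                uint32_t m3 = load_le32(message + 12);

                for (i = 0; i < 4; i++) {       /* one iteration per pass */
                        sums[i] += (uint64_t)(uint32_t)(m0 + key[4 * i + 0]) *
                                   (uint32_t)(m2 + key[4 * i + 2]);
                        sums[i] += (uint64_t)(uint32_t)(m1 + key[4 * i + 1]) *
                                   (uint32_t)(m3 + key[4 * i + 3]);
                }
                key += 4;               /* key window slides by 16 bytes */
                message += 16;
                message_len -= 16;
        }

        for (i = 0; i < 4; i++)
                hash[i] = sums[i];
}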
|
arch/x86/crypto/nh-sse2-x86_64.S (new file, 123 lines)
@@ -0,0 +1,123 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
|
||||
*
|
||||
* Copyright 2018 Google LLC
|
||||
*
|
||||
* Author: Eric Biggers <ebiggers@google.com>
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
#define PASS0_SUMS %xmm0
|
||||
#define PASS1_SUMS %xmm1
|
||||
#define PASS2_SUMS %xmm2
|
||||
#define PASS3_SUMS %xmm3
|
||||
#define K0 %xmm4
|
||||
#define K1 %xmm5
|
||||
#define K2 %xmm6
|
||||
#define K3 %xmm7
|
||||
#define T0 %xmm8
|
||||
#define T1 %xmm9
|
||||
#define T2 %xmm10
|
||||
#define T3 %xmm11
|
||||
#define T4 %xmm12
|
||||
#define T5 %xmm13
|
||||
#define T6 %xmm14
|
||||
#define T7 %xmm15
|
||||
#define KEY %rdi
|
||||
#define MESSAGE %rsi
|
||||
#define MESSAGE_LEN %rdx
|
||||
#define HASH %rcx
|
||||
|
||||
.macro _nh_stride k0, k1, k2, k3, offset
|
||||
|
||||
// Load next message stride
|
||||
movdqu \offset(MESSAGE), T1
|
||||
|
||||
// Load next key stride
|
||||
movdqu \offset(KEY), \k3
|
||||
|
||||
// Add message words to key words
|
||||
movdqa T1, T2
|
||||
movdqa T1, T3
|
||||
paddd T1, \k0 // reuse k0 to avoid a move
|
||||
paddd \k1, T1
|
||||
paddd \k2, T2
|
||||
paddd \k3, T3
|
||||
|
||||
// Multiply 32x32 => 64 and accumulate
|
||||
pshufd $0x10, \k0, T4
|
||||
pshufd $0x32, \k0, \k0
|
||||
pshufd $0x10, T1, T5
|
||||
pshufd $0x32, T1, T1
|
||||
pshufd $0x10, T2, T6
|
||||
pshufd $0x32, T2, T2
|
||||
pshufd $0x10, T3, T7
|
||||
pshufd $0x32, T3, T3
|
||||
pmuludq T4, \k0
|
||||
pmuludq T5, T1
|
||||
pmuludq T6, T2
|
||||
pmuludq T7, T3
|
||||
paddq \k0, PASS0_SUMS
|
||||
paddq T1, PASS1_SUMS
|
||||
paddq T2, PASS2_SUMS
|
||||
paddq T3, PASS3_SUMS
|
||||
.endm
|
||||
|
||||
/*
|
||||
* void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
|
||||
* u8 hash[NH_HASH_BYTES])
|
||||
*
|
||||
* It's guaranteed that message_len % 16 == 0.
|
||||
*/
|
||||
ENTRY(nh_sse2)
|
||||
|
||||
movdqu 0x00(KEY), K0
|
||||
movdqu 0x10(KEY), K1
|
||||
movdqu 0x20(KEY), K2
|
||||
add $0x30, KEY
|
||||
pxor PASS0_SUMS, PASS0_SUMS
|
||||
pxor PASS1_SUMS, PASS1_SUMS
|
||||
pxor PASS2_SUMS, PASS2_SUMS
|
||||
pxor PASS3_SUMS, PASS3_SUMS
|
||||
|
||||
sub $0x40, MESSAGE_LEN
|
||||
jl .Lloop4_done
|
||||
.Lloop4:
|
||||
_nh_stride K0, K1, K2, K3, 0x00
|
||||
_nh_stride K1, K2, K3, K0, 0x10
|
||||
_nh_stride K2, K3, K0, K1, 0x20
|
||||
_nh_stride K3, K0, K1, K2, 0x30
|
||||
add $0x40, KEY
|
||||
add $0x40, MESSAGE
|
||||
sub $0x40, MESSAGE_LEN
|
||||
jge .Lloop4
|
||||
|
||||
.Lloop4_done:
|
||||
and $0x3f, MESSAGE_LEN
|
||||
jz .Ldone
|
||||
_nh_stride K0, K1, K2, K3, 0x00
|
||||
|
||||
sub $0x10, MESSAGE_LEN
|
||||
jz .Ldone
|
||||
_nh_stride K1, K2, K3, K0, 0x10
|
||||
|
||||
sub $0x10, MESSAGE_LEN
|
||||
jz .Ldone
|
||||
_nh_stride K2, K3, K0, K1, 0x20
|
||||
|
||||
.Ldone:
|
||||
// Sum the accumulators for each pass, then store the sums to 'hash'
|
||||
movdqa PASS0_SUMS, T0
|
||||
movdqa PASS2_SUMS, T1
|
||||
punpcklqdq PASS1_SUMS, T0 // => (PASS0_SUM_A PASS1_SUM_A)
|
||||
punpcklqdq PASS3_SUMS, T1 // => (PASS2_SUM_A PASS3_SUM_A)
|
||||
punpckhqdq PASS1_SUMS, PASS0_SUMS // => (PASS0_SUM_B PASS1_SUM_B)
|
||||
punpckhqdq PASS3_SUMS, PASS2_SUMS // => (PASS2_SUM_B PASS3_SUM_B)
|
||||
paddq PASS0_SUMS, T0
|
||||
paddq PASS2_SUMS, T1
|
||||
movdqu T0, 0x00(HASH)
|
||||
movdqu T1, 0x10(HASH)
|
||||
ret
|
||||
ENDPROC(nh_sse2)
|
arch/x86/crypto/nhpoly1305-avx2-glue.c (new file, 77 lines)
@@ -0,0 +1,77 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
|
||||
* (AVX2 accelerated version)
|
||||
*
|
||||
* Copyright 2018 Google LLC
|
||||
*/
|
||||
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <crypto/nhpoly1305.h>
|
||||
#include <linux/module.h>
|
||||
#include <asm/fpu/api.h>
|
||||
|
||||
asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
|
||||
u8 hash[NH_HASH_BYTES]);
|
||||
|
||||
/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
|
||||
static void _nh_avx2(const u32 *key, const u8 *message, size_t message_len,
|
||||
__le64 hash[NH_NUM_PASSES])
|
||||
{
|
||||
nh_avx2(key, message, message_len, (u8 *)hash);
|
||||
}
|
||||
|
||||
static int nhpoly1305_avx2_update(struct shash_desc *desc,
|
||||
const u8 *src, unsigned int srclen)
|
||||
{
|
||||
if (srclen < 64 || !irq_fpu_usable())
|
||||
return crypto_nhpoly1305_update(desc, src, srclen);
|
||||
|
||||
do {
|
||||
unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
|
||||
|
||||
kernel_fpu_begin();
|
||||
crypto_nhpoly1305_update_helper(desc, src, n, _nh_avx2);
|
||||
kernel_fpu_end();
|
||||
src += n;
|
||||
srclen -= n;
|
||||
} while (srclen);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct shash_alg nhpoly1305_alg = {
|
||||
.base.cra_name = "nhpoly1305",
|
||||
.base.cra_driver_name = "nhpoly1305-avx2",
|
||||
.base.cra_priority = 300,
|
||||
.base.cra_ctxsize = sizeof(struct nhpoly1305_key),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
.digestsize = POLY1305_DIGEST_SIZE,
|
||||
.init = crypto_nhpoly1305_init,
|
||||
.update = nhpoly1305_avx2_update,
|
||||
.final = crypto_nhpoly1305_final,
|
||||
.setkey = crypto_nhpoly1305_setkey,
|
||||
.descsize = sizeof(struct nhpoly1305_state),
|
||||
};
|
||||
|
||||
static int __init nhpoly1305_mod_init(void)
|
||||
{
|
||||
if (!boot_cpu_has(X86_FEATURE_AVX2) ||
|
||||
!boot_cpu_has(X86_FEATURE_OSXSAVE))
|
||||
return -ENODEV;
|
||||
|
||||
return crypto_register_shash(&nhpoly1305_alg);
|
||||
}
|
||||
|
||||
static void __exit nhpoly1305_mod_exit(void)
|
||||
{
|
||||
crypto_unregister_shash(&nhpoly1305_alg);
|
||||
}
|
||||
|
||||
module_init(nhpoly1305_mod_init);
|
||||
module_exit(nhpoly1305_mod_exit);
|
||||
|
||||
MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (AVX2-accelerated)");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
|
||||
MODULE_ALIAS_CRYPTO("nhpoly1305");
|
||||
MODULE_ALIAS_CRYPTO("nhpoly1305-avx2");
|
arch/x86/crypto/nhpoly1305-sse2-glue.c (new file, 76 lines)
@@ -0,0 +1,76 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
|
||||
* (SSE2 accelerated version)
|
||||
*
|
||||
* Copyright 2018 Google LLC
|
||||
*/
|
||||
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <crypto/nhpoly1305.h>
|
||||
#include <linux/module.h>
|
||||
#include <asm/fpu/api.h>
|
||||
|
||||
asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
|
||||
u8 hash[NH_HASH_BYTES]);
|
||||
|
||||
/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
|
||||
static void _nh_sse2(const u32 *key, const u8 *message, size_t message_len,
|
||||
__le64 hash[NH_NUM_PASSES])
|
||||
{
|
||||
nh_sse2(key, message, message_len, (u8 *)hash);
|
||||
}
|
||||
|
||||
static int nhpoly1305_sse2_update(struct shash_desc *desc,
|
||||
const u8 *src, unsigned int srclen)
|
||||
{
|
||||
if (srclen < 64 || !irq_fpu_usable())
|
||||
return crypto_nhpoly1305_update(desc, src, srclen);
|
||||
|
||||
do {
|
||||
unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
|
||||
|
||||
kernel_fpu_begin();
|
||||
crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2);
|
||||
kernel_fpu_end();
|
||||
src += n;
|
||||
srclen -= n;
|
||||
} while (srclen);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct shash_alg nhpoly1305_alg = {
|
||||
.base.cra_name = "nhpoly1305",
|
||||
.base.cra_driver_name = "nhpoly1305-sse2",
|
||||
.base.cra_priority = 200,
|
||||
.base.cra_ctxsize = sizeof(struct nhpoly1305_key),
|
||||
.base.cra_module = THIS_MODULE,
|
||||
.digestsize = POLY1305_DIGEST_SIZE,
|
||||
.init = crypto_nhpoly1305_init,
|
||||
.update = nhpoly1305_sse2_update,
|
||||
.final = crypto_nhpoly1305_final,
|
||||
.setkey = crypto_nhpoly1305_setkey,
|
||||
.descsize = sizeof(struct nhpoly1305_state),
|
||||
};
|
||||
|
||||
static int __init nhpoly1305_mod_init(void)
|
||||
{
|
||||
if (!boot_cpu_has(X86_FEATURE_XMM2))
|
||||
return -ENODEV;
|
||||
|
||||
return crypto_register_shash(&nhpoly1305_alg);
|
||||
}
|
||||
|
||||
static void __exit nhpoly1305_mod_exit(void)
|
||||
{
|
||||
crypto_unregister_shash(&nhpoly1305_alg);
|
||||
}
|
||||
|
||||
module_init(nhpoly1305_mod_init);
|
||||
module_exit(nhpoly1305_mod_exit);
|
||||
|
||||
MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (SSE2-accelerated)");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
|
||||
MODULE_ALIAS_CRYPTO("nhpoly1305");
|
||||
MODULE_ALIAS_CRYPTO("nhpoly1305-sse2");
|
@@ -83,35 +83,37 @@ static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
|
||||
if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) {
|
||||
if (unlikely(!sctx->wset)) {
|
||||
if (!sctx->uset) {
|
||||
memcpy(sctx->u, dctx->r, sizeof(sctx->u));
|
||||
poly1305_simd_mult(sctx->u, dctx->r);
|
||||
memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
|
||||
poly1305_simd_mult(sctx->u, dctx->r.r);
|
||||
sctx->uset = true;
|
||||
}
|
||||
memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u));
|
||||
poly1305_simd_mult(sctx->u + 5, dctx->r);
|
||||
poly1305_simd_mult(sctx->u + 5, dctx->r.r);
|
||||
memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u));
|
||||
poly1305_simd_mult(sctx->u + 10, dctx->r);
|
||||
poly1305_simd_mult(sctx->u + 10, dctx->r.r);
|
||||
sctx->wset = true;
|
||||
}
|
||||
blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
|
||||
poly1305_4block_avx2(dctx->h, src, dctx->r, blocks, sctx->u);
|
||||
poly1305_4block_avx2(dctx->h.h, src, dctx->r.r, blocks,
|
||||
sctx->u);
|
||||
src += POLY1305_BLOCK_SIZE * 4 * blocks;
|
||||
srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
|
||||
}
|
||||
#endif
|
||||
if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
|
||||
if (unlikely(!sctx->uset)) {
|
||||
memcpy(sctx->u, dctx->r, sizeof(sctx->u));
|
||||
poly1305_simd_mult(sctx->u, dctx->r);
|
||||
memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
|
||||
poly1305_simd_mult(sctx->u, dctx->r.r);
|
||||
sctx->uset = true;
|
||||
}
|
||||
blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
|
||||
poly1305_2block_sse2(dctx->h, src, dctx->r, blocks, sctx->u);
|
||||
poly1305_2block_sse2(dctx->h.h, src, dctx->r.r, blocks,
|
||||
sctx->u);
|
||||
src += POLY1305_BLOCK_SIZE * 2 * blocks;
|
||||
srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
|
||||
}
|
||||
if (srclen >= POLY1305_BLOCK_SIZE) {
|
||||
poly1305_block_sse2(dctx->h, src, dctx->r, 1);
|
||||
poly1305_block_sse2(dctx->h.h, src, dctx->r.r, 1);
|
||||
srclen -= POLY1305_BLOCK_SIZE;
|
||||
}
|
||||
return srclen;
|