Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto updates from Herbert Xu:
 "API:
   - Add 1472-byte test to tcrypt for IPsec
   - Reintroduced crypto stats interface with numerous changes
   - Support incremental algorithm dumps

  Algorithms:
   - Add xchacha12/20
   - Add nhpoly1305
   - Add adiantum
   - Add streebog hash
   - Mark cts(cbc(aes)) as FIPS allowed

  Drivers:
   - Improve performance of arm64/chacha20
   - Improve performance of x86/chacha20
   - Add NEON-accelerated nhpoly1305
   - Add SSE2 accelerated nhpoly1305
   - Add AVX2 accelerated nhpoly1305
   - Add support for 192/256-bit keys in gcmaes AVX
   - Add SG support in gcmaes AVX
   - ESN for inline IPsec tx in chcr
   - Add support for CryptoCell 703 in ccree
   - Add support for CryptoCell 713 in ccree
   - Add SM4 support in ccree
   - Add SM3 support in ccree
   - Add support for chacha20 in caam/qi2
   - Add support for chacha20 + poly1305 in caam/jr
   - Add support for chacha20 + poly1305 in caam/qi2
   - Add AEAD cipher support in cavium/nitrox"

* 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (130 commits)
  crypto: skcipher - remove remnants of internal IV generators
  crypto: cavium/nitrox - Fix build with !CONFIG_DEBUG_FS
  crypto: salsa20-generic - don't unnecessarily use atomic walk
  crypto: skcipher - add might_sleep() to skcipher_walk_virt()
  crypto: x86/chacha - avoid sleeping under kernel_fpu_begin()
  crypto: cavium/nitrox - Added AEAD cipher support
  crypto: mxc-scc - fix build warnings on ARM64
  crypto: api - document missing stats member
  crypto: user - remove unused dump functions
  crypto: chelsio - Fix wrong error counter increments
  crypto: chelsio - Reset counters on cxgb4 Detach
  crypto: chelsio - Handle PCI shutdown event
  crypto: chelsio - cleanup:send addr as value in function argument
  crypto: chelsio - Use same value for both channel in single WR
  crypto: chelsio - Swap location of AAD and IV sent in WR
  crypto: chelsio - remove set but not used variable 'kctx_len'
  crypto: ux500 - Use proper enum in hash_set_dma_transfer
  crypto: ux500 - Use proper enum in cryp_set_dma_transfer
  crypto: aesni - Add scatter/gather avx stubs, and use them in C
  crypto: aesni - Introduce partial block macro
  ..
This commit is contained in:
Linus Torvalds
2018-12-27 13:53:32 -08:00
183 changed files with 15860 additions and 5111 deletions

View File

@@ -8,6 +8,7 @@ OBJECT_FILES_NON_STANDARD := y
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
@@ -23,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
@@ -46,6 +47,9 @@ obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o
obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o
obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
# These modules require assembler to support AVX.
ifeq ($(avx_supported),yes)
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
@@ -74,7 +78,7 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o
serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
@@ -84,6 +88,8 @@ aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o
morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o
morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o
nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
ifeq ($(avx_supported),yes)
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
camellia_aesni_avx_glue.o
@@ -97,10 +103,16 @@ endif
ifeq ($(avx2_supported),yes)
camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
chacha20-x86_64-y += chacha20-avx2-x86_64.o
chacha-x86_64-y += chacha-avx2-x86_64.o
serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o
endif
ifeq ($(avx512_supported),yes)
chacha-x86_64-y += chacha-avx512vl-x86_64.o
endif
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o

File diff suppressed because it is too large Load Diff

View File

@@ -84,7 +84,7 @@ struct gcm_context_data {
u8 current_counter[GCM_BLOCK_LEN];
u64 partial_block_len;
u64 unused;
u8 hash_keys[GCM_BLOCK_LEN * 8];
u8 hash_keys[GCM_BLOCK_LEN * 16];
};
asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
@@ -175,6 +175,32 @@ asmlinkage void aesni_gcm_finalize(void *ctx,
struct gcm_context_data *gdata,
u8 *auth_tag, unsigned long auth_tag_len);
static struct aesni_gcm_tfm_s {
void (*init)(void *ctx,
struct gcm_context_data *gdata,
u8 *iv,
u8 *hash_subkey, const u8 *aad,
unsigned long aad_len);
void (*enc_update)(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in,
unsigned long plaintext_len);
void (*dec_update)(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in,
unsigned long ciphertext_len);
void (*finalize)(void *ctx,
struct gcm_context_data *gdata,
u8 *auth_tag, unsigned long auth_tag_len);
} *aesni_gcm_tfm;
struct aesni_gcm_tfm_s aesni_gcm_tfm_sse = {
.init = &aesni_gcm_init,
.enc_update = &aesni_gcm_enc_update,
.dec_update = &aesni_gcm_dec_update,
.finalize = &aesni_gcm_finalize,
};
#ifdef CONFIG_AS_AVX
asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
@@ -183,136 +209,94 @@ asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
/*
* asmlinkage void aesni_gcm_precomp_avx_gen2()
* asmlinkage void aesni_gcm_init_avx_gen2()
* gcm_data *my_ctx_data, context data
* u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
*/
asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey);
asmlinkage void aesni_gcm_init_avx_gen2(void *my_ctx_data,
struct gcm_context_data *gdata,
u8 *iv,
u8 *hash_subkey,
const u8 *aad,
unsigned long aad_len);
asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out,
asmlinkage void aesni_gcm_enc_update_avx_gen2(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long plaintext_len);
asmlinkage void aesni_gcm_dec_update_avx_gen2(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in,
unsigned long ciphertext_len);
asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx,
struct gcm_context_data *gdata,
u8 *auth_tag, unsigned long auth_tag_len);
asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long plaintext_len, u8 *iv,
const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out,
asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long ciphertext_len, u8 *iv,
const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
static void aesni_gcm_enc_avx(void *ctx,
struct gcm_context_data *data, u8 *out,
const u8 *in, unsigned long plaintext_len, u8 *iv,
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len)
{
struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)){
aesni_gcm_enc(ctx, data, out, in,
plaintext_len, iv, hash_subkey, aad,
aad_len, auth_tag, auth_tag_len);
} else {
aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
aad_len, auth_tag, auth_tag_len);
}
}
struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = {
.init = &aesni_gcm_init_avx_gen2,
.enc_update = &aesni_gcm_enc_update_avx_gen2,
.dec_update = &aesni_gcm_dec_update_avx_gen2,
.finalize = &aesni_gcm_finalize_avx_gen2,
};
static void aesni_gcm_dec_avx(void *ctx,
struct gcm_context_data *data, u8 *out,
const u8 *in, unsigned long ciphertext_len, u8 *iv,
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len)
{
struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
aesni_gcm_dec(ctx, data, out, in,
ciphertext_len, iv, hash_subkey, aad,
aad_len, auth_tag, auth_tag_len);
} else {
aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
aad_len, auth_tag, auth_tag_len);
}
}
#endif
#ifdef CONFIG_AS_AVX2
/*
* asmlinkage void aesni_gcm_precomp_avx_gen4()
* asmlinkage void aesni_gcm_init_avx_gen4()
* gcm_data *my_ctx_data, context data
* u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
*/
asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey);
asmlinkage void aesni_gcm_init_avx_gen4(void *my_ctx_data,
struct gcm_context_data *gdata,
u8 *iv,
u8 *hash_subkey,
const u8 *aad,
unsigned long aad_len);
asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, u8 *out,
asmlinkage void aesni_gcm_enc_update_avx_gen4(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long plaintext_len);
asmlinkage void aesni_gcm_dec_update_avx_gen4(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in,
unsigned long ciphertext_len);
asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx,
struct gcm_context_data *gdata,
u8 *auth_tag, unsigned long auth_tag_len);
asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long plaintext_len, u8 *iv,
const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out,
asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long ciphertext_len, u8 *iv,
const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
static void aesni_gcm_enc_avx2(void *ctx,
struct gcm_context_data *data, u8 *out,
const u8 *in, unsigned long plaintext_len, u8 *iv,
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len)
{
struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
aesni_gcm_enc(ctx, data, out, in,
plaintext_len, iv, hash_subkey, aad,
aad_len, auth_tag, auth_tag_len);
} else if (plaintext_len < AVX_GEN4_OPTSIZE) {
aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
aad_len, auth_tag, auth_tag_len);
} else {
aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
aesni_gcm_enc_avx_gen4(ctx, out, in, plaintext_len, iv, aad,
aad_len, auth_tag, auth_tag_len);
}
}
struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen4 = {
.init = &aesni_gcm_init_avx_gen4,
.enc_update = &aesni_gcm_enc_update_avx_gen4,
.dec_update = &aesni_gcm_dec_update_avx_gen4,
.finalize = &aesni_gcm_finalize_avx_gen4,
};
static void aesni_gcm_dec_avx2(void *ctx,
struct gcm_context_data *data, u8 *out,
const u8 *in, unsigned long ciphertext_len, u8 *iv,
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len)
{
struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
aesni_gcm_dec(ctx, data, out, in,
ciphertext_len, iv, hash_subkey,
aad, aad_len, auth_tag, auth_tag_len);
} else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
aad_len, auth_tag, auth_tag_len);
} else {
aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
aesni_gcm_dec_avx_gen4(ctx, out, in, ciphertext_len, iv, aad,
aad_len, auth_tag, auth_tag_len);
}
}
#endif
static void (*aesni_gcm_enc_tfm)(void *ctx,
struct gcm_context_data *data, u8 *out,
const u8 *in, unsigned long plaintext_len,
u8 *iv, u8 *hash_subkey, const u8 *aad,
unsigned long aad_len, u8 *auth_tag,
unsigned long auth_tag_len);
static void (*aesni_gcm_dec_tfm)(void *ctx,
struct gcm_context_data *data, u8 *out,
const u8 *in, unsigned long ciphertext_len,
u8 *iv, u8 *hash_subkey, const u8 *aad,
unsigned long aad_len, u8 *auth_tag,
unsigned long auth_tag_len);
static inline struct
aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
{
@@ -794,6 +778,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
{
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
unsigned long auth_tag_len = crypto_aead_authsize(tfm);
struct aesni_gcm_tfm_s *gcm_tfm = aesni_gcm_tfm;
struct gcm_context_data data AESNI_ALIGN_ATTR;
struct scatter_walk dst_sg_walk = {};
unsigned long left = req->cryptlen;
@@ -811,6 +796,15 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
if (!enc)
left -= auth_tag_len;
#ifdef CONFIG_AS_AVX2
if (left < AVX_GEN4_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen4)
gcm_tfm = &aesni_gcm_tfm_avx_gen2;
#endif
#ifdef CONFIG_AS_AVX
if (left < AVX_GEN2_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen2)
gcm_tfm = &aesni_gcm_tfm_sse;
#endif
/* Linearize assoc, if not already linear */
if (req->src->length >= assoclen && req->src->length &&
(!PageHighMem(sg_page(req->src)) ||
@@ -835,7 +829,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
}
kernel_fpu_begin();
aesni_gcm_init(aes_ctx, &data, iv,
gcm_tfm->init(aes_ctx, &data, iv,
hash_subkey, assoc, assoclen);
if (req->src != req->dst) {
while (left) {
@@ -846,10 +840,10 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
len = min(srclen, dstlen);
if (len) {
if (enc)
aesni_gcm_enc_update(aes_ctx, &data,
gcm_tfm->enc_update(aes_ctx, &data,
dst, src, len);
else
aesni_gcm_dec_update(aes_ctx, &data,
gcm_tfm->dec_update(aes_ctx, &data,
dst, src, len);
}
left -= len;
@@ -867,10 +861,10 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
len = scatterwalk_clamp(&src_sg_walk, left);
if (len) {
if (enc)
aesni_gcm_enc_update(aes_ctx, &data,
gcm_tfm->enc_update(aes_ctx, &data,
src, src, len);
else
aesni_gcm_dec_update(aes_ctx, &data,
gcm_tfm->dec_update(aes_ctx, &data,
src, src, len);
}
left -= len;
@@ -879,7 +873,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
scatterwalk_done(&src_sg_walk, 1, left);
}
}
aesni_gcm_finalize(aes_ctx, &data, authTag, auth_tag_len);
gcm_tfm->finalize(aes_ctx, &data, authTag, auth_tag_len);
kernel_fpu_end();
if (!assocmem)
@@ -912,147 +906,15 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen,
u8 *hash_subkey, u8 *iv, void *aes_ctx)
{
u8 one_entry_in_sg = 0;
u8 *src, *dst, *assoc;
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
unsigned long auth_tag_len = crypto_aead_authsize(tfm);
struct scatter_walk src_sg_walk;
struct scatter_walk dst_sg_walk = {};
struct gcm_context_data data AESNI_ALIGN_ATTR;
if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 ||
aesni_gcm_enc_tfm == aesni_gcm_enc ||
req->cryptlen < AVX_GEN2_OPTSIZE) {
return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv,
aes_ctx);
}
if (sg_is_last(req->src) &&
(!PageHighMem(sg_page(req->src)) ||
req->src->offset + req->src->length <= PAGE_SIZE) &&
sg_is_last(req->dst) &&
(!PageHighMem(sg_page(req->dst)) ||
req->dst->offset + req->dst->length <= PAGE_SIZE)) {
one_entry_in_sg = 1;
scatterwalk_start(&src_sg_walk, req->src);
assoc = scatterwalk_map(&src_sg_walk);
src = assoc + req->assoclen;
dst = src;
if (unlikely(req->src != req->dst)) {
scatterwalk_start(&dst_sg_walk, req->dst);
dst = scatterwalk_map(&dst_sg_walk) + req->assoclen;
}
} else {
/* Allocate memory for src, dst, assoc */
assoc = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
GFP_ATOMIC);
if (unlikely(!assoc))
return -ENOMEM;
scatterwalk_map_and_copy(assoc, req->src, 0,
req->assoclen + req->cryptlen, 0);
src = assoc + req->assoclen;
dst = src;
}
kernel_fpu_begin();
aesni_gcm_enc_tfm(aes_ctx, &data, dst, src, req->cryptlen, iv,
hash_subkey, assoc, assoclen,
dst + req->cryptlen, auth_tag_len);
kernel_fpu_end();
/* The authTag (aka the Integrity Check Value) needs to be written
* back to the packet. */
if (one_entry_in_sg) {
if (unlikely(req->src != req->dst)) {
scatterwalk_unmap(dst - req->assoclen);
scatterwalk_advance(&dst_sg_walk, req->dst->length);
scatterwalk_done(&dst_sg_walk, 1, 0);
}
scatterwalk_unmap(assoc);
scatterwalk_advance(&src_sg_walk, req->src->length);
scatterwalk_done(&src_sg_walk, req->src == req->dst, 0);
} else {
scatterwalk_map_and_copy(dst, req->dst, req->assoclen,
req->cryptlen + auth_tag_len, 1);
kfree(assoc);
}
return 0;
return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv,
aes_ctx);
}
static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen,
u8 *hash_subkey, u8 *iv, void *aes_ctx)
{
u8 one_entry_in_sg = 0;
u8 *src, *dst, *assoc;
unsigned long tempCipherLen = 0;
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
unsigned long auth_tag_len = crypto_aead_authsize(tfm);
u8 authTag[16];
struct scatter_walk src_sg_walk;
struct scatter_walk dst_sg_walk = {};
struct gcm_context_data data AESNI_ALIGN_ATTR;
int retval = 0;
if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 ||
aesni_gcm_enc_tfm == aesni_gcm_enc ||
req->cryptlen < AVX_GEN2_OPTSIZE) {
return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv,
aes_ctx);
}
tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
if (sg_is_last(req->src) &&
(!PageHighMem(sg_page(req->src)) ||
req->src->offset + req->src->length <= PAGE_SIZE) &&
sg_is_last(req->dst) && req->dst->length &&
(!PageHighMem(sg_page(req->dst)) ||
req->dst->offset + req->dst->length <= PAGE_SIZE)) {
one_entry_in_sg = 1;
scatterwalk_start(&src_sg_walk, req->src);
assoc = scatterwalk_map(&src_sg_walk);
src = assoc + req->assoclen;
dst = src;
if (unlikely(req->src != req->dst)) {
scatterwalk_start(&dst_sg_walk, req->dst);
dst = scatterwalk_map(&dst_sg_walk) + req->assoclen;
}
} else {
/* Allocate memory for src, dst, assoc */
assoc = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
if (!assoc)
return -ENOMEM;
scatterwalk_map_and_copy(assoc, req->src, 0,
req->assoclen + req->cryptlen, 0);
src = assoc + req->assoclen;
dst = src;
}
kernel_fpu_begin();
aesni_gcm_dec_tfm(aes_ctx, &data, dst, src, tempCipherLen, iv,
hash_subkey, assoc, assoclen,
authTag, auth_tag_len);
kernel_fpu_end();
/* Compare generated tag with passed in tag. */
retval = crypto_memneq(src + tempCipherLen, authTag, auth_tag_len) ?
-EBADMSG : 0;
if (one_entry_in_sg) {
if (unlikely(req->src != req->dst)) {
scatterwalk_unmap(dst - req->assoclen);
scatterwalk_advance(&dst_sg_walk, req->dst->length);
scatterwalk_done(&dst_sg_walk, 1, 0);
}
scatterwalk_unmap(assoc);
scatterwalk_advance(&src_sg_walk, req->src->length);
scatterwalk_done(&src_sg_walk, req->src == req->dst, 0);
} else {
scatterwalk_map_and_copy(dst, req->dst, req->assoclen,
tempCipherLen, 1);
kfree(assoc);
}
return retval;
return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv,
aes_ctx);
}
static int helper_rfc4106_encrypt(struct aead_request *req)
@@ -1420,21 +1282,18 @@ static int __init aesni_init(void)
#ifdef CONFIG_AS_AVX2
if (boot_cpu_has(X86_FEATURE_AVX2)) {
pr_info("AVX2 version of gcm_enc/dec engaged.\n");
aesni_gcm_enc_tfm = aesni_gcm_enc_avx2;
aesni_gcm_dec_tfm = aesni_gcm_dec_avx2;
aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen4;
} else
#endif
#ifdef CONFIG_AS_AVX
if (boot_cpu_has(X86_FEATURE_AVX)) {
pr_info("AVX version of gcm_enc/dec engaged.\n");
aesni_gcm_enc_tfm = aesni_gcm_enc_avx;
aesni_gcm_dec_tfm = aesni_gcm_dec_avx;
aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen2;
} else
#endif
{
pr_info("SSE version of gcm_enc/dec engaged.\n");
aesni_gcm_enc_tfm = aesni_gcm_enc;
aesni_gcm_dec_tfm = aesni_gcm_dec;
aesni_gcm_tfm = &aesni_gcm_tfm_sse;
}
aesni_ctr_enc_tfm = aesni_ctr_enc;
#ifdef CONFIG_AS_AVX

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,836 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
* ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
*
* Copyright (C) 2018 Martin Willi
*/
#include <linux/linkage.h>
.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL: .octa 0x00000000000000000000000000000000
.octa 0x00000000000000000000000000000001
.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL: .octa 0x00000000000000000000000000000002
.octa 0x00000000000000000000000000000003
.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
.align 32
CTR8BL: .octa 0x00000003000000020000000100000000
.octa 0x00000007000000060000000500000004
.text
ENTRY(chacha_2block_xor_avx512vl)
# %rdi: Input state matrix, s
# %rsi: up to 2 data blocks output, o
# %rdx: up to 2 data blocks input, i
# %rcx: input/output length in bytes
# %r8d: nrounds
# This function encrypts two ChaCha blocks by loading the state
# matrix twice across four AVX registers. It performs matrix operations
# on four words in each matrix in parallel, but requires shuffling to
# rearrange the words after each round.
vzeroupper
# x0..3[0-2] = s0..3
vbroadcasti128 0x00(%rdi),%ymm0
vbroadcasti128 0x10(%rdi),%ymm1
vbroadcasti128 0x20(%rdi),%ymm2
vbroadcasti128 0x30(%rdi),%ymm3
vpaddd CTR2BL(%rip),%ymm3,%ymm3
vmovdqa %ymm0,%ymm8
vmovdqa %ymm1,%ymm9
vmovdqa %ymm2,%ymm10
vmovdqa %ymm3,%ymm11
.Ldoubleround:
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $16,%ymm3,%ymm3
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $12,%ymm1,%ymm1
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $8,%ymm3,%ymm3
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $7,%ymm1,%ymm1
# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
vpshufd $0x39,%ymm1,%ymm1
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
vpshufd $0x4e,%ymm2,%ymm2
# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
vpshufd $0x93,%ymm3,%ymm3
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $16,%ymm3,%ymm3
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $12,%ymm1,%ymm1
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $8,%ymm3,%ymm3
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $7,%ymm1,%ymm1
# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
vpshufd $0x93,%ymm1,%ymm1
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
vpshufd $0x4e,%ymm2,%ymm2
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
vpshufd $0x39,%ymm3,%ymm3
sub $2,%r8d
jnz .Ldoubleround
# o0 = i0 ^ (x0 + s0)
vpaddd %ymm8,%ymm0,%ymm7
cmp $0x10,%rcx
jl .Lxorpart2
vpxord 0x00(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x00(%rsi)
vextracti128 $1,%ymm7,%xmm0
# o1 = i1 ^ (x1 + s1)
vpaddd %ymm9,%ymm1,%ymm7
cmp $0x20,%rcx
jl .Lxorpart2
vpxord 0x10(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x10(%rsi)
vextracti128 $1,%ymm7,%xmm1
# o2 = i2 ^ (x2 + s2)
vpaddd %ymm10,%ymm2,%ymm7
cmp $0x30,%rcx
jl .Lxorpart2
vpxord 0x20(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x20(%rsi)
vextracti128 $1,%ymm7,%xmm2
# o3 = i3 ^ (x3 + s3)
vpaddd %ymm11,%ymm3,%ymm7
cmp $0x40,%rcx
jl .Lxorpart2
vpxord 0x30(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x30(%rsi)
vextracti128 $1,%ymm7,%xmm3
# xor and write second block
vmovdqa %xmm0,%xmm7
cmp $0x50,%rcx
jl .Lxorpart2
vpxord 0x40(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x40(%rsi)
vmovdqa %xmm1,%xmm7
cmp $0x60,%rcx
jl .Lxorpart2
vpxord 0x50(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x50(%rsi)
vmovdqa %xmm2,%xmm7
cmp $0x70,%rcx
jl .Lxorpart2
vpxord 0x60(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x60(%rsi)
vmovdqa %xmm3,%xmm7
cmp $0x80,%rcx
jl .Lxorpart2
vpxord 0x70(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x70(%rsi)
.Ldone2:
vzeroupper
ret
.Lxorpart2:
# xor remaining bytes from partial register into output
mov %rcx,%rax
and $0xf,%rcx
jz .Ldone8
mov %rax,%r9
and $~0xf,%r9
mov $1,%rax
shld %cl,%rax,%rax
sub $1,%rax
kmovq %rax,%k1
vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
vpxord %xmm7,%xmm1,%xmm1
vmovdqu8 %xmm1,(%rsi,%r9){%k1}
jmp .Ldone2
ENDPROC(chacha_2block_xor_avx512vl)
ENTRY(chacha_4block_xor_avx512vl)
# %rdi: Input state matrix, s
# %rsi: up to 4 data blocks output, o
# %rdx: up to 4 data blocks input, i
# %rcx: input/output length in bytes
# %r8d: nrounds
# This function encrypts four ChaCha blocks by loading the state
# matrix four times across eight AVX registers. It performs matrix
# operations on four words in two matrices in parallel, sequentially
# to the operations on the four words of the other two matrices. The
# required word shuffling has a rather high latency, we can do the
# arithmetic on two matrix-pairs without much slowdown.
vzeroupper
# x0..3[0-4] = s0..3
vbroadcasti128 0x00(%rdi),%ymm0
vbroadcasti128 0x10(%rdi),%ymm1
vbroadcasti128 0x20(%rdi),%ymm2
vbroadcasti128 0x30(%rdi),%ymm3
vmovdqa %ymm0,%ymm4
vmovdqa %ymm1,%ymm5
vmovdqa %ymm2,%ymm6
vmovdqa %ymm3,%ymm7
vpaddd CTR2BL(%rip),%ymm3,%ymm3
vpaddd CTR4BL(%rip),%ymm7,%ymm7
vmovdqa %ymm0,%ymm11
vmovdqa %ymm1,%ymm12
vmovdqa %ymm2,%ymm13
vmovdqa %ymm3,%ymm14
vmovdqa %ymm7,%ymm15
.Ldoubleround4:
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $16,%ymm3,%ymm3
vpaddd %ymm5,%ymm4,%ymm4
vpxord %ymm4,%ymm7,%ymm7
vprold $16,%ymm7,%ymm7
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $12,%ymm1,%ymm1
vpaddd %ymm7,%ymm6,%ymm6
vpxord %ymm6,%ymm5,%ymm5
vprold $12,%ymm5,%ymm5
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $8,%ymm3,%ymm3
vpaddd %ymm5,%ymm4,%ymm4
vpxord %ymm4,%ymm7,%ymm7
vprold $8,%ymm7,%ymm7
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $7,%ymm1,%ymm1
vpaddd %ymm7,%ymm6,%ymm6
vpxord %ymm6,%ymm5,%ymm5
vprold $7,%ymm5,%ymm5
# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
vpshufd $0x39,%ymm1,%ymm1
vpshufd $0x39,%ymm5,%ymm5
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
vpshufd $0x4e,%ymm2,%ymm2
vpshufd $0x4e,%ymm6,%ymm6
# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
vpshufd $0x93,%ymm3,%ymm3
vpshufd $0x93,%ymm7,%ymm7
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $16,%ymm3,%ymm3
vpaddd %ymm5,%ymm4,%ymm4
vpxord %ymm4,%ymm7,%ymm7
vprold $16,%ymm7,%ymm7
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $12,%ymm1,%ymm1
vpaddd %ymm7,%ymm6,%ymm6
vpxord %ymm6,%ymm5,%ymm5
vprold $12,%ymm5,%ymm5
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $8,%ymm3,%ymm3
vpaddd %ymm5,%ymm4,%ymm4
vpxord %ymm4,%ymm7,%ymm7
vprold $8,%ymm7,%ymm7
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $7,%ymm1,%ymm1
vpaddd %ymm7,%ymm6,%ymm6
vpxord %ymm6,%ymm5,%ymm5
vprold $7,%ymm5,%ymm5
# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
vpshufd $0x93,%ymm1,%ymm1
vpshufd $0x93,%ymm5,%ymm5
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
vpshufd $0x4e,%ymm2,%ymm2
vpshufd $0x4e,%ymm6,%ymm6
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
vpshufd $0x39,%ymm3,%ymm3
vpshufd $0x39,%ymm7,%ymm7
sub $2,%r8d
jnz .Ldoubleround4
# o0 = i0 ^ (x0 + s0), first block
vpaddd %ymm11,%ymm0,%ymm10
cmp $0x10,%rcx
jl .Lxorpart4
vpxord 0x00(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x00(%rsi)
vextracti128 $1,%ymm10,%xmm0
# o1 = i1 ^ (x1 + s1), first block
vpaddd %ymm12,%ymm1,%ymm10
cmp $0x20,%rcx
jl .Lxorpart4
vpxord 0x10(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x10(%rsi)
vextracti128 $1,%ymm10,%xmm1
# o2 = i2 ^ (x2 + s2), first block
vpaddd %ymm13,%ymm2,%ymm10
cmp $0x30,%rcx
jl .Lxorpart4
vpxord 0x20(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x20(%rsi)
vextracti128 $1,%ymm10,%xmm2
# o3 = i3 ^ (x3 + s3), first block
vpaddd %ymm14,%ymm3,%ymm10
cmp $0x40,%rcx
jl .Lxorpart4
vpxord 0x30(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x30(%rsi)
vextracti128 $1,%ymm10,%xmm3
# xor and write second block
vmovdqa %xmm0,%xmm10
cmp $0x50,%rcx
jl .Lxorpart4
vpxord 0x40(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x40(%rsi)
vmovdqa %xmm1,%xmm10
cmp $0x60,%rcx
jl .Lxorpart4
vpxord 0x50(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x50(%rsi)
vmovdqa %xmm2,%xmm10
cmp $0x70,%rcx
jl .Lxorpart4
vpxord 0x60(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x60(%rsi)
vmovdqa %xmm3,%xmm10
cmp $0x80,%rcx
jl .Lxorpart4
vpxord 0x70(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x70(%rsi)
# o0 = i0 ^ (x0 + s0), third block
vpaddd %ymm11,%ymm4,%ymm10
cmp $0x90,%rcx
jl .Lxorpart4
vpxord 0x80(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x80(%rsi)
vextracti128 $1,%ymm10,%xmm4
# o1 = i1 ^ (x1 + s1), third block
vpaddd %ymm12,%ymm5,%ymm10
cmp $0xa0,%rcx
jl .Lxorpart4
vpxord 0x90(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x90(%rsi)
vextracti128 $1,%ymm10,%xmm5
# o2 = i2 ^ (x2 + s2), third block
vpaddd %ymm13,%ymm6,%ymm10
cmp $0xb0,%rcx
jl .Lxorpart4
vpxord 0xa0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xa0(%rsi)
vextracti128 $1,%ymm10,%xmm6
# o3 = i3 ^ (x3 + s3), third block
vpaddd %ymm15,%ymm7,%ymm10
cmp $0xc0,%rcx
jl .Lxorpart4
vpxord 0xb0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xb0(%rsi)
vextracti128 $1,%ymm10,%xmm7
# xor and write fourth block
vmovdqa %xmm4,%xmm10
cmp $0xd0,%rcx
jl .Lxorpart4
vpxord 0xc0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xc0(%rsi)
vmovdqa %xmm5,%xmm10
cmp $0xe0,%rcx
jl .Lxorpart4
vpxord 0xd0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xd0(%rsi)
vmovdqa %xmm6,%xmm10
cmp $0xf0,%rcx
jl .Lxorpart4
vpxord 0xe0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xe0(%rsi)
vmovdqa %xmm7,%xmm10
cmp $0x100,%rcx
jl .Lxorpart4
vpxord 0xf0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xf0(%rsi)
.Ldone4:
vzeroupper
ret
.Lxorpart4:
# xor remaining bytes from partial register into output
mov %rcx,%rax
and $0xf,%rcx
jz .Ldone8
mov %rax,%r9
and $~0xf,%r9
mov $1,%rax
shld %cl,%rax,%rax
sub $1,%rax
kmovq %rax,%k1
vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
vpxord %xmm10,%xmm1,%xmm1
vmovdqu8 %xmm1,(%rsi,%r9){%k1}
jmp .Ldone4
ENDPROC(chacha_4block_xor_avx512vl)
ENTRY(chacha_8block_xor_avx512vl)
# %rdi: Input state matrix, s
# %rsi: up to 8 data blocks output, o
# %rdx: up to 8 data blocks input, i
# %rcx: input/output length in bytes
# %r8d: nrounds
# This function encrypts eight consecutive ChaCha blocks by loading
# the state matrix in AVX registers eight times. Compared to AVX2, this
# mostly benefits from the new rotate instructions in VL and the
# additional registers.
vzeroupper
# x0..15[0-7] = s[0..15]
vpbroadcastd 0x00(%rdi),%ymm0
vpbroadcastd 0x04(%rdi),%ymm1
vpbroadcastd 0x08(%rdi),%ymm2
vpbroadcastd 0x0c(%rdi),%ymm3
vpbroadcastd 0x10(%rdi),%ymm4
vpbroadcastd 0x14(%rdi),%ymm5
vpbroadcastd 0x18(%rdi),%ymm6
vpbroadcastd 0x1c(%rdi),%ymm7
vpbroadcastd 0x20(%rdi),%ymm8
vpbroadcastd 0x24(%rdi),%ymm9
vpbroadcastd 0x28(%rdi),%ymm10
vpbroadcastd 0x2c(%rdi),%ymm11
vpbroadcastd 0x30(%rdi),%ymm12
vpbroadcastd 0x34(%rdi),%ymm13
vpbroadcastd 0x38(%rdi),%ymm14
vpbroadcastd 0x3c(%rdi),%ymm15
# x12 += counter values 0-3
vpaddd CTR8BL(%rip),%ymm12,%ymm12
vmovdqa64 %ymm0,%ymm16
vmovdqa64 %ymm1,%ymm17
vmovdqa64 %ymm2,%ymm18
vmovdqa64 %ymm3,%ymm19
vmovdqa64 %ymm4,%ymm20
vmovdqa64 %ymm5,%ymm21
vmovdqa64 %ymm6,%ymm22
vmovdqa64 %ymm7,%ymm23
vmovdqa64 %ymm8,%ymm24
vmovdqa64 %ymm9,%ymm25
vmovdqa64 %ymm10,%ymm26
vmovdqa64 %ymm11,%ymm27
vmovdqa64 %ymm12,%ymm28
vmovdqa64 %ymm13,%ymm29
vmovdqa64 %ymm14,%ymm30
vmovdqa64 %ymm15,%ymm31
.Ldoubleround8:
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
vpaddd %ymm0,%ymm4,%ymm0
vpxord %ymm0,%ymm12,%ymm12
vprold $16,%ymm12,%ymm12
# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
vpaddd %ymm1,%ymm5,%ymm1
vpxord %ymm1,%ymm13,%ymm13
vprold $16,%ymm13,%ymm13
# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
vpaddd %ymm2,%ymm6,%ymm2
vpxord %ymm2,%ymm14,%ymm14
vprold $16,%ymm14,%ymm14
# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
vpaddd %ymm3,%ymm7,%ymm3
vpxord %ymm3,%ymm15,%ymm15
vprold $16,%ymm15,%ymm15
# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
vpaddd %ymm12,%ymm8,%ymm8
vpxord %ymm8,%ymm4,%ymm4
vprold $12,%ymm4,%ymm4
# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
vpaddd %ymm13,%ymm9,%ymm9
vpxord %ymm9,%ymm5,%ymm5
vprold $12,%ymm5,%ymm5
# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
vpaddd %ymm14,%ymm10,%ymm10
vpxord %ymm10,%ymm6,%ymm6
vprold $12,%ymm6,%ymm6
# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
vpaddd %ymm15,%ymm11,%ymm11
vpxord %ymm11,%ymm7,%ymm7
vprold $12,%ymm7,%ymm7
# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
vpaddd %ymm0,%ymm4,%ymm0
vpxord %ymm0,%ymm12,%ymm12
vprold $8,%ymm12,%ymm12
# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
vpaddd %ymm1,%ymm5,%ymm1
vpxord %ymm1,%ymm13,%ymm13
vprold $8,%ymm13,%ymm13
# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
vpaddd %ymm2,%ymm6,%ymm2
vpxord %ymm2,%ymm14,%ymm14
vprold $8,%ymm14,%ymm14
# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
vpaddd %ymm3,%ymm7,%ymm3
vpxord %ymm3,%ymm15,%ymm15
vprold $8,%ymm15,%ymm15
# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
vpaddd %ymm12,%ymm8,%ymm8
vpxord %ymm8,%ymm4,%ymm4
vprold $7,%ymm4,%ymm4
# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
vpaddd %ymm13,%ymm9,%ymm9
vpxord %ymm9,%ymm5,%ymm5
vprold $7,%ymm5,%ymm5
# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
vpaddd %ymm14,%ymm10,%ymm10
vpxord %ymm10,%ymm6,%ymm6
vprold $7,%ymm6,%ymm6
# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
vpaddd %ymm15,%ymm11,%ymm11
vpxord %ymm11,%ymm7,%ymm7
vprold $7,%ymm7,%ymm7
# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
vpaddd %ymm0,%ymm5,%ymm0
vpxord %ymm0,%ymm15,%ymm15
vprold $16,%ymm15,%ymm15
# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
vpaddd %ymm1,%ymm6,%ymm1
vpxord %ymm1,%ymm12,%ymm12
vprold $16,%ymm12,%ymm12
# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
vpaddd %ymm2,%ymm7,%ymm2
vpxord %ymm2,%ymm13,%ymm13
vprold $16,%ymm13,%ymm13
# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
vpaddd %ymm3,%ymm4,%ymm3
vpxord %ymm3,%ymm14,%ymm14
vprold $16,%ymm14,%ymm14
# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
vpaddd %ymm15,%ymm10,%ymm10
vpxord %ymm10,%ymm5,%ymm5
vprold $12,%ymm5,%ymm5
# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
vpaddd %ymm12,%ymm11,%ymm11
vpxord %ymm11,%ymm6,%ymm6
vprold $12,%ymm6,%ymm6
# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
vpaddd %ymm13,%ymm8,%ymm8
vpxord %ymm8,%ymm7,%ymm7
vprold $12,%ymm7,%ymm7
# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
vpaddd %ymm14,%ymm9,%ymm9
vpxord %ymm9,%ymm4,%ymm4
vprold $12,%ymm4,%ymm4
# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
vpaddd %ymm0,%ymm5,%ymm0
vpxord %ymm0,%ymm15,%ymm15
vprold $8,%ymm15,%ymm15
# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
vpaddd %ymm1,%ymm6,%ymm1
vpxord %ymm1,%ymm12,%ymm12
vprold $8,%ymm12,%ymm12
# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
vpaddd %ymm2,%ymm7,%ymm2
vpxord %ymm2,%ymm13,%ymm13
vprold $8,%ymm13,%ymm13
# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
vpaddd %ymm3,%ymm4,%ymm3
vpxord %ymm3,%ymm14,%ymm14
vprold $8,%ymm14,%ymm14
# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
vpaddd %ymm15,%ymm10,%ymm10
vpxord %ymm10,%ymm5,%ymm5
vprold $7,%ymm5,%ymm5
# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
vpaddd %ymm12,%ymm11,%ymm11
vpxord %ymm11,%ymm6,%ymm6
vprold $7,%ymm6,%ymm6
# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
vpaddd %ymm13,%ymm8,%ymm8
vpxord %ymm8,%ymm7,%ymm7
vprold $7,%ymm7,%ymm7
# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
vpaddd %ymm14,%ymm9,%ymm9
vpxord %ymm9,%ymm4,%ymm4
vprold $7,%ymm4,%ymm4
sub $2,%r8d
jnz .Ldoubleround8
# x0..15[0-3] += s[0..15]
vpaddd %ymm16,%ymm0,%ymm0
vpaddd %ymm17,%ymm1,%ymm1
vpaddd %ymm18,%ymm2,%ymm2
vpaddd %ymm19,%ymm3,%ymm3
vpaddd %ymm20,%ymm4,%ymm4
vpaddd %ymm21,%ymm5,%ymm5
vpaddd %ymm22,%ymm6,%ymm6
vpaddd %ymm23,%ymm7,%ymm7
vpaddd %ymm24,%ymm8,%ymm8
vpaddd %ymm25,%ymm9,%ymm9
vpaddd %ymm26,%ymm10,%ymm10
vpaddd %ymm27,%ymm11,%ymm11
vpaddd %ymm28,%ymm12,%ymm12
vpaddd %ymm29,%ymm13,%ymm13
vpaddd %ymm30,%ymm14,%ymm14
vpaddd %ymm31,%ymm15,%ymm15
# interleave 32-bit words in state n, n+1
vpunpckldq %ymm1,%ymm0,%ymm16
vpunpckhdq %ymm1,%ymm0,%ymm17
vpunpckldq %ymm3,%ymm2,%ymm18
vpunpckhdq %ymm3,%ymm2,%ymm19
vpunpckldq %ymm5,%ymm4,%ymm20
vpunpckhdq %ymm5,%ymm4,%ymm21
vpunpckldq %ymm7,%ymm6,%ymm22
vpunpckhdq %ymm7,%ymm6,%ymm23
vpunpckldq %ymm9,%ymm8,%ymm24
vpunpckhdq %ymm9,%ymm8,%ymm25
vpunpckldq %ymm11,%ymm10,%ymm26
vpunpckhdq %ymm11,%ymm10,%ymm27
vpunpckldq %ymm13,%ymm12,%ymm28
vpunpckhdq %ymm13,%ymm12,%ymm29
vpunpckldq %ymm15,%ymm14,%ymm30
vpunpckhdq %ymm15,%ymm14,%ymm31
# interleave 64-bit words in state n, n+2
vpunpcklqdq %ymm18,%ymm16,%ymm0
vpunpcklqdq %ymm19,%ymm17,%ymm1
vpunpckhqdq %ymm18,%ymm16,%ymm2
vpunpckhqdq %ymm19,%ymm17,%ymm3
vpunpcklqdq %ymm22,%ymm20,%ymm4
vpunpcklqdq %ymm23,%ymm21,%ymm5
vpunpckhqdq %ymm22,%ymm20,%ymm6
vpunpckhqdq %ymm23,%ymm21,%ymm7
vpunpcklqdq %ymm26,%ymm24,%ymm8
vpunpcklqdq %ymm27,%ymm25,%ymm9
vpunpckhqdq %ymm26,%ymm24,%ymm10
vpunpckhqdq %ymm27,%ymm25,%ymm11
vpunpcklqdq %ymm30,%ymm28,%ymm12
vpunpcklqdq %ymm31,%ymm29,%ymm13
vpunpckhqdq %ymm30,%ymm28,%ymm14
vpunpckhqdq %ymm31,%ymm29,%ymm15
# interleave 128-bit words in state n, n+4
# xor/write first four blocks
vmovdqa64 %ymm0,%ymm16
vperm2i128 $0x20,%ymm4,%ymm0,%ymm0
cmp $0x0020,%rcx
jl .Lxorpart8
vpxord 0x0000(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0000(%rsi)
vmovdqa64 %ymm16,%ymm0
vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
cmp $0x0040,%rcx
jl .Lxorpart8
vpxord 0x0020(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0020(%rsi)
vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
vperm2i128 $0x20,%ymm6,%ymm2,%ymm0
cmp $0x0060,%rcx
jl .Lxorpart8
vpxord 0x0040(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0040(%rsi)
vperm2i128 $0x31,%ymm6,%ymm2,%ymm6
vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
cmp $0x0080,%rcx
jl .Lxorpart8
vpxord 0x0060(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0060(%rsi)
vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
cmp $0x00a0,%rcx
jl .Lxorpart8
vpxord 0x0080(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0080(%rsi)
vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
cmp $0x00c0,%rcx
jl .Lxorpart8
vpxord 0x00a0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x00a0(%rsi)
vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
vperm2i128 $0x20,%ymm7,%ymm3,%ymm0
cmp $0x00e0,%rcx
jl .Lxorpart8
vpxord 0x00c0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x00c0(%rsi)
vperm2i128 $0x31,%ymm7,%ymm3,%ymm7
vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
cmp $0x0100,%rcx
jl .Lxorpart8
vpxord 0x00e0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x00e0(%rsi)
vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
# xor remaining blocks, write to output
vmovdqa64 %ymm4,%ymm0
cmp $0x0120,%rcx
jl .Lxorpart8
vpxord 0x0100(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0100(%rsi)
vmovdqa64 %ymm12,%ymm0
cmp $0x0140,%rcx
jl .Lxorpart8
vpxord 0x0120(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0120(%rsi)
vmovdqa64 %ymm6,%ymm0
cmp $0x0160,%rcx
jl .Lxorpart8
vpxord 0x0140(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0140(%rsi)
vmovdqa64 %ymm14,%ymm0
cmp $0x0180,%rcx
jl .Lxorpart8
vpxord 0x0160(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0160(%rsi)
vmovdqa64 %ymm5,%ymm0
cmp $0x01a0,%rcx
jl .Lxorpart8
vpxord 0x0180(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0180(%rsi)
vmovdqa64 %ymm13,%ymm0
cmp $0x01c0,%rcx
jl .Lxorpart8
vpxord 0x01a0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x01a0(%rsi)
vmovdqa64 %ymm7,%ymm0
cmp $0x01e0,%rcx
jl .Lxorpart8
vpxord 0x01c0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x01c0(%rsi)
vmovdqa64 %ymm15,%ymm0
cmp $0x0200,%rcx
jl .Lxorpart8
vpxord 0x01e0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x01e0(%rsi)
.Ldone8:
vzeroupper
ret
.Lxorpart8:
# xor remaining bytes from partial register into output
mov %rcx,%rax
and $0x1f,%rcx
jz .Ldone8
mov %rax,%r9
and $~0x1f,%r9
mov $1,%rax
shld %cl,%rax,%rax
sub $1,%rax
kmovq %rax,%k1
vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z}
vpxord %ymm0,%ymm1,%ymm1
vmovdqu8 %ymm1,(%rsi,%r9){%k1}
jmp .Ldone8
ENDPROC(chacha_8block_xor_avx512vl)

View File

@@ -1,5 +1,5 @@
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
* ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
*
* Copyright (C) 2015 Martin Willi
*
@@ -10,6 +10,7 @@
*/
#include <linux/linkage.h>
#include <asm/frame.h>
.section .rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
@@ -23,35 +24,25 @@ CTRINC: .octa 0x00000003000000020000000100000000
.text
ENTRY(chacha20_block_xor_ssse3)
# %rdi: Input state matrix, s
# %rsi: 1 data block output, o
# %rdx: 1 data block input, i
# This function encrypts one ChaCha20 block by loading the state matrix
# in four SSE registers. It performs matrix operation on four words in
# parallel, but requireds shuffling to rearrange the words after each
# round. 8/16-bit word rotation is done with the slightly better
# performing SSSE3 byte shuffling, 7/12-bit word rotation uses
# traditional shift+OR.
# x0..3 = s0..3
movdqa 0x00(%rdi),%xmm0
movdqa 0x10(%rdi),%xmm1
movdqa 0x20(%rdi),%xmm2
movdqa 0x30(%rdi),%xmm3
movdqa %xmm0,%xmm8
movdqa %xmm1,%xmm9
movdqa %xmm2,%xmm10
movdqa %xmm3,%xmm11
/*
* chacha_permute - permute one block
*
* Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
* function performs matrix operations on four words in parallel, but requires
* shuffling to rearrange the words after each round. 8/16-bit word rotation is
* done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
* rotation uses traditional shift+OR.
*
* The round count is given in %r8d.
*
* Clobbers: %r8d, %xmm4-%xmm7
*/
chacha_permute:
movdqa ROT8(%rip),%xmm4
movdqa ROT16(%rip),%xmm5
mov $10,%ecx
.Ldoubleround:
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
@@ -118,39 +109,129 @@ ENTRY(chacha20_block_xor_ssse3)
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
pshufd $0x39,%xmm3,%xmm3
dec %ecx
sub $2,%r8d
jnz .Ldoubleround
ret
ENDPROC(chacha_permute)
ENTRY(chacha_block_xor_ssse3)
# %rdi: Input state matrix, s
# %rsi: up to 1 data block output, o
# %rdx: up to 1 data block input, i
# %rcx: input/output length in bytes
# %r8d: nrounds
FRAME_BEGIN
# x0..3 = s0..3
movdqa 0x00(%rdi),%xmm0
movdqa 0x10(%rdi),%xmm1
movdqa 0x20(%rdi),%xmm2
movdqa 0x30(%rdi),%xmm3
movdqa %xmm0,%xmm8
movdqa %xmm1,%xmm9
movdqa %xmm2,%xmm10
movdqa %xmm3,%xmm11
mov %rcx,%rax
call chacha_permute
# o0 = i0 ^ (x0 + s0)
movdqu 0x00(%rdx),%xmm4
paddd %xmm8,%xmm0
cmp $0x10,%rax
jl .Lxorpart
movdqu 0x00(%rdx),%xmm4
pxor %xmm4,%xmm0
movdqu %xmm0,0x00(%rsi)
# o1 = i1 ^ (x1 + s1)
movdqu 0x10(%rdx),%xmm5
paddd %xmm9,%xmm1
pxor %xmm5,%xmm1
movdqu %xmm1,0x10(%rsi)
movdqa %xmm1,%xmm0
cmp $0x20,%rax
jl .Lxorpart
movdqu 0x10(%rdx),%xmm0
pxor %xmm1,%xmm0
movdqu %xmm0,0x10(%rsi)
# o2 = i2 ^ (x2 + s2)
movdqu 0x20(%rdx),%xmm6
paddd %xmm10,%xmm2
pxor %xmm6,%xmm2
movdqu %xmm2,0x20(%rsi)
movdqa %xmm2,%xmm0
cmp $0x30,%rax
jl .Lxorpart
movdqu 0x20(%rdx),%xmm0
pxor %xmm2,%xmm0
movdqu %xmm0,0x20(%rsi)
# o3 = i3 ^ (x3 + s3)
movdqu 0x30(%rdx),%xmm7
paddd %xmm11,%xmm3
pxor %xmm7,%xmm3
movdqu %xmm3,0x30(%rsi)
movdqa %xmm3,%xmm0
cmp $0x40,%rax
jl .Lxorpart
movdqu 0x30(%rdx),%xmm0
pxor %xmm3,%xmm0
movdqu %xmm0,0x30(%rsi)
.Ldone:
FRAME_END
ret
ENDPROC(chacha20_block_xor_ssse3)
ENTRY(chacha20_4block_xor_ssse3)
.Lxorpart:
# xor remaining bytes from partial register into output
mov %rax,%r9
and $0x0f,%r9
jz .Ldone
and $~0x0f,%rax
mov %rsi,%r11
lea 8(%rsp),%r10
sub $0x10,%rsp
and $~31,%rsp
lea (%rdx,%rax),%rsi
mov %rsp,%rdi
mov %r9,%rcx
rep movsb
pxor 0x00(%rsp),%xmm0
movdqa %xmm0,0x00(%rsp)
mov %rsp,%rsi
lea (%r11,%rax),%rdi
mov %r9,%rcx
rep movsb
lea -8(%r10),%rsp
jmp .Ldone
ENDPROC(chacha_block_xor_ssse3)
ENTRY(hchacha_block_ssse3)
# %rdi: Input state matrix, s
# %rsi: 4 data blocks output, o
# %rdx: 4 data blocks input, i
# %rsi: output (8 32-bit words)
# %edx: nrounds
FRAME_BEGIN
# This function encrypts four consecutive ChaCha20 blocks by loading the
movdqa 0x00(%rdi),%xmm0
movdqa 0x10(%rdi),%xmm1
movdqa 0x20(%rdi),%xmm2
movdqa 0x30(%rdi),%xmm3
mov %edx,%r8d
call chacha_permute
movdqu %xmm0,0x00(%rsi)
movdqu %xmm3,0x10(%rsi)
FRAME_END
ret
ENDPROC(hchacha_block_ssse3)
ENTRY(chacha_4block_xor_ssse3)
# %rdi: Input state matrix, s
# %rsi: up to 4 data blocks output, o
# %rdx: up to 4 data blocks input, i
# %rcx: input/output length in bytes
# %r8d: nrounds
# This function encrypts four consecutive ChaCha blocks by loading the
# the state matrix in SSE registers four times. As we need some scratch
# registers, we save the first four registers on the stack. The
# algorithm performs each operation on the corresponding word of each
@@ -163,6 +244,7 @@ ENTRY(chacha20_4block_xor_ssse3)
lea 8(%rsp),%r10
sub $0x80,%rsp
and $~63,%rsp
mov %rcx,%rax
# x0..15[0-3] = s0..3[0..3]
movq 0x00(%rdi),%xmm1
@@ -202,8 +284,6 @@ ENTRY(chacha20_4block_xor_ssse3)
# x12 += counter values 0-3
paddd %xmm1,%xmm12
mov $10,%ecx
.Ldoubleround4:
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
movdqa 0x00(%rsp),%xmm0
@@ -421,7 +501,7 @@ ENTRY(chacha20_4block_xor_ssse3)
psrld $25,%xmm4
por %xmm0,%xmm4
dec %ecx
sub $2,%r8d
jnz .Ldoubleround4
# x0[0-3] += s0[0]
@@ -573,58 +653,143 @@ ENTRY(chacha20_4block_xor_ssse3)
# xor with corresponding input, write to output
movdqa 0x00(%rsp),%xmm0
cmp $0x10,%rax
jl .Lxorpart4
movdqu 0x00(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x00(%rsi)
movdqa 0x10(%rsp),%xmm0
movdqu 0x80(%rdx),%xmm1
movdqu %xmm4,%xmm0
cmp $0x20,%rax
jl .Lxorpart4
movdqu 0x10(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x80(%rsi)
movdqu %xmm0,0x10(%rsi)
movdqu %xmm8,%xmm0
cmp $0x30,%rax
jl .Lxorpart4
movdqu 0x20(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x20(%rsi)
movdqu %xmm12,%xmm0
cmp $0x40,%rax
jl .Lxorpart4
movdqu 0x30(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x30(%rsi)
movdqa 0x20(%rsp),%xmm0
cmp $0x50,%rax
jl .Lxorpart4
movdqu 0x40(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x40(%rsi)
movdqu %xmm6,%xmm0
cmp $0x60,%rax
jl .Lxorpart4
movdqu 0x50(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x50(%rsi)
movdqu %xmm10,%xmm0
cmp $0x70,%rax
jl .Lxorpart4
movdqu 0x60(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x60(%rsi)
movdqu %xmm14,%xmm0
cmp $0x80,%rax
jl .Lxorpart4
movdqu 0x70(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x70(%rsi)
movdqa 0x10(%rsp),%xmm0
cmp $0x90,%rax
jl .Lxorpart4
movdqu 0x80(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x80(%rsi)
movdqu %xmm5,%xmm0
cmp $0xa0,%rax
jl .Lxorpart4
movdqu 0x90(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x90(%rsi)
movdqu %xmm9,%xmm0
cmp $0xb0,%rax
jl .Lxorpart4
movdqu 0xa0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xa0(%rsi)
movdqu %xmm13,%xmm0
cmp $0xc0,%rax
jl .Lxorpart4
movdqu 0xb0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xb0(%rsi)
movdqa 0x30(%rsp),%xmm0
cmp $0xd0,%rax
jl .Lxorpart4
movdqu 0xc0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xc0(%rsi)
movdqu 0x10(%rdx),%xmm1
pxor %xmm1,%xmm4
movdqu %xmm4,0x10(%rsi)
movdqu 0x90(%rdx),%xmm1
pxor %xmm1,%xmm5
movdqu %xmm5,0x90(%rsi)
movdqu 0x50(%rdx),%xmm1
pxor %xmm1,%xmm6
movdqu %xmm6,0x50(%rsi)
movdqu 0xd0(%rdx),%xmm1
pxor %xmm1,%xmm7
movdqu %xmm7,0xd0(%rsi)
movdqu 0x20(%rdx),%xmm1
pxor %xmm1,%xmm8
movdqu %xmm8,0x20(%rsi)
movdqu 0xa0(%rdx),%xmm1
pxor %xmm1,%xmm9
movdqu %xmm9,0xa0(%rsi)
movdqu 0x60(%rdx),%xmm1
pxor %xmm1,%xmm10
movdqu %xmm10,0x60(%rsi)
movdqu 0xe0(%rdx),%xmm1
pxor %xmm1,%xmm11
movdqu %xmm11,0xe0(%rsi)
movdqu 0x30(%rdx),%xmm1
pxor %xmm1,%xmm12
movdqu %xmm12,0x30(%rsi)
movdqu 0xb0(%rdx),%xmm1
pxor %xmm1,%xmm13
movdqu %xmm13,0xb0(%rsi)
movdqu 0x70(%rdx),%xmm1
pxor %xmm1,%xmm14
movdqu %xmm14,0x70(%rsi)
movdqu 0xf0(%rdx),%xmm1
pxor %xmm1,%xmm15
movdqu %xmm15,0xf0(%rsi)
movdqu %xmm7,%xmm0
cmp $0xe0,%rax
jl .Lxorpart4
movdqu 0xd0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xd0(%rsi)
movdqu %xmm11,%xmm0
cmp $0xf0,%rax
jl .Lxorpart4
movdqu 0xe0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xe0(%rsi)
movdqu %xmm15,%xmm0
cmp $0x100,%rax
jl .Lxorpart4
movdqu 0xf0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xf0(%rsi)
.Ldone4:
lea -8(%r10),%rsp
ret
ENDPROC(chacha20_4block_xor_ssse3)
.Lxorpart4:
# xor remaining bytes from partial register into output
mov %rax,%r9
and $0x0f,%r9
jz .Ldone4
and $~0x0f,%rax
mov %rsi,%r11
lea (%rdx,%rax),%rsi
mov %rsp,%rdi
mov %r9,%rcx
rep movsb
pxor 0x00(%rsp),%xmm0
movdqa %xmm0,0x00(%rsp)
mov %rsp,%rsi
lea (%r11,%rax),%rdi
mov %r9,%rcx
rep movsb
jmp .Ldone4
ENDPROC(chacha_4block_xor_ssse3)

View File

@@ -1,448 +0,0 @@
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <linux/linkage.h>
.section .rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
.octa 0x0e0d0c0f0a09080b0605040702010003
.section .rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
.octa 0x0d0c0f0e09080b0a0504070601000302
.section .rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC: .octa 0x00000003000000020000000100000000
.octa 0x00000007000000060000000500000004
.text
ENTRY(chacha20_8block_xor_avx2)
# %rdi: Input state matrix, s
# %rsi: 8 data blocks output, o
# %rdx: 8 data blocks input, i
# This function encrypts eight consecutive ChaCha20 blocks by loading
# the state matrix in AVX registers eight times. As we need some
# scratch registers, we save the first four registers on the stack. The
# algorithm performs each operation on the corresponding word of each
# state matrix, hence requires no word shuffling. For final XORing step
# we transpose the matrix by interleaving 32-, 64- and then 128-bit
# words, which allows us to do XOR in AVX registers. 8/16-bit word
# rotation is done with the slightly better performing byte shuffling,
# 7/12-bit word rotation uses traditional shift+OR.
vzeroupper
# 4 * 32 byte stack, 32-byte aligned
lea 8(%rsp),%r10
and $~31, %rsp
sub $0x80, %rsp
# x0..15[0-7] = s[0..15]
vpbroadcastd 0x00(%rdi),%ymm0
vpbroadcastd 0x04(%rdi),%ymm1
vpbroadcastd 0x08(%rdi),%ymm2
vpbroadcastd 0x0c(%rdi),%ymm3
vpbroadcastd 0x10(%rdi),%ymm4
vpbroadcastd 0x14(%rdi),%ymm5
vpbroadcastd 0x18(%rdi),%ymm6
vpbroadcastd 0x1c(%rdi),%ymm7
vpbroadcastd 0x20(%rdi),%ymm8
vpbroadcastd 0x24(%rdi),%ymm9
vpbroadcastd 0x28(%rdi),%ymm10
vpbroadcastd 0x2c(%rdi),%ymm11
vpbroadcastd 0x30(%rdi),%ymm12
vpbroadcastd 0x34(%rdi),%ymm13
vpbroadcastd 0x38(%rdi),%ymm14
vpbroadcastd 0x3c(%rdi),%ymm15
# x0..3 on stack
vmovdqa %ymm0,0x00(%rsp)
vmovdqa %ymm1,0x20(%rsp)
vmovdqa %ymm2,0x40(%rsp)
vmovdqa %ymm3,0x60(%rsp)
vmovdqa CTRINC(%rip),%ymm1
vmovdqa ROT8(%rip),%ymm2
vmovdqa ROT16(%rip),%ymm3
# x12 += counter values 0-3
vpaddd %ymm1,%ymm12,%ymm12
mov $10,%ecx
.Ldoubleround8:
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
vpaddd 0x00(%rsp),%ymm4,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpxor %ymm0,%ymm12,%ymm12
vpshufb %ymm3,%ymm12,%ymm12
# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
vpaddd 0x20(%rsp),%ymm5,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpxor %ymm0,%ymm13,%ymm13
vpshufb %ymm3,%ymm13,%ymm13
# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
vpaddd 0x40(%rsp),%ymm6,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpxor %ymm0,%ymm14,%ymm14
vpshufb %ymm3,%ymm14,%ymm14
# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
vpaddd 0x60(%rsp),%ymm7,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpxor %ymm0,%ymm15,%ymm15
vpshufb %ymm3,%ymm15,%ymm15
# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
vpaddd %ymm12,%ymm8,%ymm8
vpxor %ymm8,%ymm4,%ymm4
vpslld $12,%ymm4,%ymm0
vpsrld $20,%ymm4,%ymm4
vpor %ymm0,%ymm4,%ymm4
# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
vpaddd %ymm13,%ymm9,%ymm9
vpxor %ymm9,%ymm5,%ymm5
vpslld $12,%ymm5,%ymm0
vpsrld $20,%ymm5,%ymm5
vpor %ymm0,%ymm5,%ymm5
# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
vpaddd %ymm14,%ymm10,%ymm10
vpxor %ymm10,%ymm6,%ymm6
vpslld $12,%ymm6,%ymm0
vpsrld $20,%ymm6,%ymm6
vpor %ymm0,%ymm6,%ymm6
# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
vpaddd %ymm15,%ymm11,%ymm11
vpxor %ymm11,%ymm7,%ymm7
vpslld $12,%ymm7,%ymm0
vpsrld $20,%ymm7,%ymm7
vpor %ymm0,%ymm7,%ymm7
# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
vpaddd 0x00(%rsp),%ymm4,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpxor %ymm0,%ymm12,%ymm12
vpshufb %ymm2,%ymm12,%ymm12
# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
vpaddd 0x20(%rsp),%ymm5,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpxor %ymm0,%ymm13,%ymm13
vpshufb %ymm2,%ymm13,%ymm13
# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
vpaddd 0x40(%rsp),%ymm6,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpxor %ymm0,%ymm14,%ymm14
vpshufb %ymm2,%ymm14,%ymm14
# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
vpaddd 0x60(%rsp),%ymm7,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpxor %ymm0,%ymm15,%ymm15
vpshufb %ymm2,%ymm15,%ymm15
# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
vpaddd %ymm12,%ymm8,%ymm8
vpxor %ymm8,%ymm4,%ymm4
vpslld $7,%ymm4,%ymm0
vpsrld $25,%ymm4,%ymm4
vpor %ymm0,%ymm4,%ymm4
# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
vpaddd %ymm13,%ymm9,%ymm9
vpxor %ymm9,%ymm5,%ymm5
vpslld $7,%ymm5,%ymm0
vpsrld $25,%ymm5,%ymm5
vpor %ymm0,%ymm5,%ymm5
# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
vpaddd %ymm14,%ymm10,%ymm10
vpxor %ymm10,%ymm6,%ymm6
vpslld $7,%ymm6,%ymm0
vpsrld $25,%ymm6,%ymm6
vpor %ymm0,%ymm6,%ymm6
# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
vpaddd %ymm15,%ymm11,%ymm11
vpxor %ymm11,%ymm7,%ymm7
vpslld $7,%ymm7,%ymm0
vpsrld $25,%ymm7,%ymm7
vpor %ymm0,%ymm7,%ymm7
# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
vpaddd 0x00(%rsp),%ymm5,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpxor %ymm0,%ymm15,%ymm15
vpshufb %ymm3,%ymm15,%ymm15
# x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0
vpaddd 0x20(%rsp),%ymm6,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpxor %ymm0,%ymm12,%ymm12
vpshufb %ymm3,%ymm12,%ymm12
# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
vpaddd 0x40(%rsp),%ymm7,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpxor %ymm0,%ymm13,%ymm13
vpshufb %ymm3,%ymm13,%ymm13
# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
vpaddd 0x60(%rsp),%ymm4,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpxor %ymm0,%ymm14,%ymm14
vpshufb %ymm3,%ymm14,%ymm14
# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
vpaddd %ymm15,%ymm10,%ymm10
vpxor %ymm10,%ymm5,%ymm5
vpslld $12,%ymm5,%ymm0
vpsrld $20,%ymm5,%ymm5
vpor %ymm0,%ymm5,%ymm5
# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
vpaddd %ymm12,%ymm11,%ymm11
vpxor %ymm11,%ymm6,%ymm6
vpslld $12,%ymm6,%ymm0
vpsrld $20,%ymm6,%ymm6
vpor %ymm0,%ymm6,%ymm6
# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
vpaddd %ymm13,%ymm8,%ymm8
vpxor %ymm8,%ymm7,%ymm7
vpslld $12,%ymm7,%ymm0
vpsrld $20,%ymm7,%ymm7
vpor %ymm0,%ymm7,%ymm7
# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
vpaddd %ymm14,%ymm9,%ymm9
vpxor %ymm9,%ymm4,%ymm4
vpslld $12,%ymm4,%ymm0
vpsrld $20,%ymm4,%ymm4
vpor %ymm0,%ymm4,%ymm4
# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
vpaddd 0x00(%rsp),%ymm5,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpxor %ymm0,%ymm15,%ymm15
vpshufb %ymm2,%ymm15,%ymm15
# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
vpaddd 0x20(%rsp),%ymm6,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpxor %ymm0,%ymm12,%ymm12
vpshufb %ymm2,%ymm12,%ymm12
# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
vpaddd 0x40(%rsp),%ymm7,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpxor %ymm0,%ymm13,%ymm13
vpshufb %ymm2,%ymm13,%ymm13
# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
vpaddd 0x60(%rsp),%ymm4,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpxor %ymm0,%ymm14,%ymm14
vpshufb %ymm2,%ymm14,%ymm14
# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
vpaddd %ymm15,%ymm10,%ymm10
vpxor %ymm10,%ymm5,%ymm5
vpslld $7,%ymm5,%ymm0
vpsrld $25,%ymm5,%ymm5
vpor %ymm0,%ymm5,%ymm5
# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
vpaddd %ymm12,%ymm11,%ymm11
vpxor %ymm11,%ymm6,%ymm6
vpslld $7,%ymm6,%ymm0
vpsrld $25,%ymm6,%ymm6
vpor %ymm0,%ymm6,%ymm6
# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
vpaddd %ymm13,%ymm8,%ymm8
vpxor %ymm8,%ymm7,%ymm7
vpslld $7,%ymm7,%ymm0
vpsrld $25,%ymm7,%ymm7
vpor %ymm0,%ymm7,%ymm7
# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
vpaddd %ymm14,%ymm9,%ymm9
vpxor %ymm9,%ymm4,%ymm4
vpslld $7,%ymm4,%ymm0
vpsrld $25,%ymm4,%ymm4
vpor %ymm0,%ymm4,%ymm4
dec %ecx
jnz .Ldoubleround8
# x0..15[0-3] += s[0..15]
vpbroadcastd 0x00(%rdi),%ymm0
vpaddd 0x00(%rsp),%ymm0,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpbroadcastd 0x04(%rdi),%ymm0
vpaddd 0x20(%rsp),%ymm0,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpbroadcastd 0x08(%rdi),%ymm0
vpaddd 0x40(%rsp),%ymm0,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpbroadcastd 0x0c(%rdi),%ymm0
vpaddd 0x60(%rsp),%ymm0,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpbroadcastd 0x10(%rdi),%ymm0
vpaddd %ymm0,%ymm4,%ymm4
vpbroadcastd 0x14(%rdi),%ymm0
vpaddd %ymm0,%ymm5,%ymm5
vpbroadcastd 0x18(%rdi),%ymm0
vpaddd %ymm0,%ymm6,%ymm6
vpbroadcastd 0x1c(%rdi),%ymm0
vpaddd %ymm0,%ymm7,%ymm7
vpbroadcastd 0x20(%rdi),%ymm0
vpaddd %ymm0,%ymm8,%ymm8
vpbroadcastd 0x24(%rdi),%ymm0
vpaddd %ymm0,%ymm9,%ymm9
vpbroadcastd 0x28(%rdi),%ymm0
vpaddd %ymm0,%ymm10,%ymm10
vpbroadcastd 0x2c(%rdi),%ymm0
vpaddd %ymm0,%ymm11,%ymm11
vpbroadcastd 0x30(%rdi),%ymm0
vpaddd %ymm0,%ymm12,%ymm12
vpbroadcastd 0x34(%rdi),%ymm0
vpaddd %ymm0,%ymm13,%ymm13
vpbroadcastd 0x38(%rdi),%ymm0
vpaddd %ymm0,%ymm14,%ymm14
vpbroadcastd 0x3c(%rdi),%ymm0
vpaddd %ymm0,%ymm15,%ymm15
# x12 += counter values 0-3
vpaddd %ymm1,%ymm12,%ymm12
# interleave 32-bit words in state n, n+1
vmovdqa 0x00(%rsp),%ymm0
vmovdqa 0x20(%rsp),%ymm1
vpunpckldq %ymm1,%ymm0,%ymm2
vpunpckhdq %ymm1,%ymm0,%ymm1
vmovdqa %ymm2,0x00(%rsp)
vmovdqa %ymm1,0x20(%rsp)
vmovdqa 0x40(%rsp),%ymm0
vmovdqa 0x60(%rsp),%ymm1
vpunpckldq %ymm1,%ymm0,%ymm2
vpunpckhdq %ymm1,%ymm0,%ymm1
vmovdqa %ymm2,0x40(%rsp)
vmovdqa %ymm1,0x60(%rsp)
vmovdqa %ymm4,%ymm0
vpunpckldq %ymm5,%ymm0,%ymm4
vpunpckhdq %ymm5,%ymm0,%ymm5
vmovdqa %ymm6,%ymm0
vpunpckldq %ymm7,%ymm0,%ymm6
vpunpckhdq %ymm7,%ymm0,%ymm7
vmovdqa %ymm8,%ymm0
vpunpckldq %ymm9,%ymm0,%ymm8
vpunpckhdq %ymm9,%ymm0,%ymm9
vmovdqa %ymm10,%ymm0
vpunpckldq %ymm11,%ymm0,%ymm10
vpunpckhdq %ymm11,%ymm0,%ymm11
vmovdqa %ymm12,%ymm0
vpunpckldq %ymm13,%ymm0,%ymm12
vpunpckhdq %ymm13,%ymm0,%ymm13
vmovdqa %ymm14,%ymm0
vpunpckldq %ymm15,%ymm0,%ymm14
vpunpckhdq %ymm15,%ymm0,%ymm15
# interleave 64-bit words in state n, n+2
vmovdqa 0x00(%rsp),%ymm0
vmovdqa 0x40(%rsp),%ymm2
vpunpcklqdq %ymm2,%ymm0,%ymm1
vpunpckhqdq %ymm2,%ymm0,%ymm2
vmovdqa %ymm1,0x00(%rsp)
vmovdqa %ymm2,0x40(%rsp)
vmovdqa 0x20(%rsp),%ymm0
vmovdqa 0x60(%rsp),%ymm2
vpunpcklqdq %ymm2,%ymm0,%ymm1
vpunpckhqdq %ymm2,%ymm0,%ymm2
vmovdqa %ymm1,0x20(%rsp)
vmovdqa %ymm2,0x60(%rsp)
vmovdqa %ymm4,%ymm0
vpunpcklqdq %ymm6,%ymm0,%ymm4
vpunpckhqdq %ymm6,%ymm0,%ymm6
vmovdqa %ymm5,%ymm0
vpunpcklqdq %ymm7,%ymm0,%ymm5
vpunpckhqdq %ymm7,%ymm0,%ymm7
vmovdqa %ymm8,%ymm0
vpunpcklqdq %ymm10,%ymm0,%ymm8
vpunpckhqdq %ymm10,%ymm0,%ymm10
vmovdqa %ymm9,%ymm0
vpunpcklqdq %ymm11,%ymm0,%ymm9
vpunpckhqdq %ymm11,%ymm0,%ymm11
vmovdqa %ymm12,%ymm0
vpunpcklqdq %ymm14,%ymm0,%ymm12
vpunpckhqdq %ymm14,%ymm0,%ymm14
vmovdqa %ymm13,%ymm0
vpunpcklqdq %ymm15,%ymm0,%ymm13
vpunpckhqdq %ymm15,%ymm0,%ymm15
# interleave 128-bit words in state n, n+4
vmovdqa 0x00(%rsp),%ymm0
vperm2i128 $0x20,%ymm4,%ymm0,%ymm1
vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
vmovdqa %ymm1,0x00(%rsp)
vmovdqa 0x20(%rsp),%ymm0
vperm2i128 $0x20,%ymm5,%ymm0,%ymm1
vperm2i128 $0x31,%ymm5,%ymm0,%ymm5
vmovdqa %ymm1,0x20(%rsp)
vmovdqa 0x40(%rsp),%ymm0
vperm2i128 $0x20,%ymm6,%ymm0,%ymm1
vperm2i128 $0x31,%ymm6,%ymm0,%ymm6
vmovdqa %ymm1,0x40(%rsp)
vmovdqa 0x60(%rsp),%ymm0
vperm2i128 $0x20,%ymm7,%ymm0,%ymm1
vperm2i128 $0x31,%ymm7,%ymm0,%ymm7
vmovdqa %ymm1,0x60(%rsp)
vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
vmovdqa %ymm0,%ymm8
vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
vmovdqa %ymm0,%ymm9
vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
vmovdqa %ymm0,%ymm10
vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
vmovdqa %ymm0,%ymm11
# xor with corresponding input, write to output
vmovdqa 0x00(%rsp),%ymm0
vpxor 0x0000(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0000(%rsi)
vmovdqa 0x20(%rsp),%ymm0
vpxor 0x0080(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0080(%rsi)
vmovdqa 0x40(%rsp),%ymm0
vpxor 0x0040(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0040(%rsi)
vmovdqa 0x60(%rsp),%ymm0
vpxor 0x00c0(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x00c0(%rsi)
vpxor 0x0100(%rdx),%ymm4,%ymm4
vmovdqu %ymm4,0x0100(%rsi)
vpxor 0x0180(%rdx),%ymm5,%ymm5
vmovdqu %ymm5,0x00180(%rsi)
vpxor 0x0140(%rdx),%ymm6,%ymm6
vmovdqu %ymm6,0x0140(%rsi)
vpxor 0x01c0(%rdx),%ymm7,%ymm7
vmovdqu %ymm7,0x01c0(%rsi)
vpxor 0x0020(%rdx),%ymm8,%ymm8
vmovdqu %ymm8,0x0020(%rsi)
vpxor 0x00a0(%rdx),%ymm9,%ymm9
vmovdqu %ymm9,0x00a0(%rsi)
vpxor 0x0060(%rdx),%ymm10,%ymm10
vmovdqu %ymm10,0x0060(%rsi)
vpxor 0x00e0(%rdx),%ymm11,%ymm11
vmovdqu %ymm11,0x00e0(%rsi)
vpxor 0x0120(%rdx),%ymm12,%ymm12
vmovdqu %ymm12,0x0120(%rsi)
vpxor 0x01a0(%rdx),%ymm13,%ymm13
vmovdqu %ymm13,0x01a0(%rsi)
vpxor 0x0160(%rdx),%ymm14,%ymm14
vmovdqu %ymm14,0x0160(%rsi)
vpxor 0x01e0(%rdx),%ymm15,%ymm15
vmovdqu %ymm15,0x01e0(%rsi)
vzeroupper
lea -8(%r10),%rsp
ret
ENDPROC(chacha20_8block_xor_avx2)

View File

@@ -1,146 +0,0 @@
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <crypto/algapi.h>
#include <crypto/chacha20.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/fpu/api.h>
#include <asm/simd.h>
#define CHACHA20_STATE_ALIGN 16
asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
#ifdef CONFIG_AS_AVX2
asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
static bool chacha20_use_avx2;
#endif
static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes)
{
u8 buf[CHACHA20_BLOCK_SIZE];
#ifdef CONFIG_AS_AVX2
if (chacha20_use_avx2) {
while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
chacha20_8block_xor_avx2(state, dst, src);
bytes -= CHACHA20_BLOCK_SIZE * 8;
src += CHACHA20_BLOCK_SIZE * 8;
dst += CHACHA20_BLOCK_SIZE * 8;
state[12] += 8;
}
}
#endif
while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
chacha20_4block_xor_ssse3(state, dst, src);
bytes -= CHACHA20_BLOCK_SIZE * 4;
src += CHACHA20_BLOCK_SIZE * 4;
dst += CHACHA20_BLOCK_SIZE * 4;
state[12] += 4;
}
while (bytes >= CHACHA20_BLOCK_SIZE) {
chacha20_block_xor_ssse3(state, dst, src);
bytes -= CHACHA20_BLOCK_SIZE;
src += CHACHA20_BLOCK_SIZE;
dst += CHACHA20_BLOCK_SIZE;
state[12]++;
}
if (bytes) {
memcpy(buf, src, bytes);
chacha20_block_xor_ssse3(state, buf, buf);
memcpy(dst, buf, bytes);
}
}
static int chacha20_simd(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
u32 *state, state_buf[16 + 2] __aligned(8);
struct skcipher_walk walk;
int err;
BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd())
return crypto_chacha20_crypt(req);
err = skcipher_walk_virt(&walk, req, true);
crypto_chacha20_init(state, ctx, walk.iv);
kernel_fpu_begin();
while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
err = skcipher_walk_done(&walk,
walk.nbytes % CHACHA20_BLOCK_SIZE);
}
if (walk.nbytes) {
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
walk.nbytes);
err = skcipher_walk_done(&walk, 0);
}
kernel_fpu_end();
return err;
}
static struct skcipher_alg alg = {
.base.cra_name = "chacha20",
.base.cra_driver_name = "chacha20-simd",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha20_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA20_KEY_SIZE,
.max_keysize = CHACHA20_KEY_SIZE,
.ivsize = CHACHA20_IV_SIZE,
.chunksize = CHACHA20_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.encrypt = chacha20_simd,
.decrypt = chacha20_simd,
};
static int __init chacha20_simd_mod_init(void)
{
if (!boot_cpu_has(X86_FEATURE_SSSE3))
return -ENODEV;
#ifdef CONFIG_AS_AVX2
chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_AVX2) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
#endif
return crypto_register_skcipher(&alg);
}
static void __exit chacha20_simd_mod_fini(void)
{
crypto_unregister_skcipher(&alg);
}
module_init(chacha20_simd_mod_init);
module_exit(chacha20_simd_mod_fini);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated");
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-simd");

View File

@@ -0,0 +1,304 @@
/*
* x64 SIMD accelerated ChaCha and XChaCha stream ciphers,
* including ChaCha20 (RFC7539)
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <crypto/algapi.h>
#include <crypto/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/fpu/api.h>
#include <asm/simd.h>
#define CHACHA_STATE_ALIGN 16
asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
#ifdef CONFIG_AS_AVX2
asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
static bool chacha_use_avx2;
#ifdef CONFIG_AS_AVX512
asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
static bool chacha_use_avx512vl;
#endif
#endif
static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
{
len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
}
static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes, int nrounds)
{
#ifdef CONFIG_AS_AVX2
#ifdef CONFIG_AS_AVX512
if (chacha_use_avx512vl) {
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
chacha_8block_xor_avx512vl(state, dst, src, bytes,
nrounds);
bytes -= CHACHA_BLOCK_SIZE * 8;
src += CHACHA_BLOCK_SIZE * 8;
dst += CHACHA_BLOCK_SIZE * 8;
state[12] += 8;
}
if (bytes > CHACHA_BLOCK_SIZE * 4) {
chacha_8block_xor_avx512vl(state, dst, src, bytes,
nrounds);
state[12] += chacha_advance(bytes, 8);
return;
}
if (bytes > CHACHA_BLOCK_SIZE * 2) {
chacha_4block_xor_avx512vl(state, dst, src, bytes,
nrounds);
state[12] += chacha_advance(bytes, 4);
return;
}
if (bytes) {
chacha_2block_xor_avx512vl(state, dst, src, bytes,
nrounds);
state[12] += chacha_advance(bytes, 2);
return;
}
}
#endif
if (chacha_use_avx2) {
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
bytes -= CHACHA_BLOCK_SIZE * 8;
src += CHACHA_BLOCK_SIZE * 8;
dst += CHACHA_BLOCK_SIZE * 8;
state[12] += 8;
}
if (bytes > CHACHA_BLOCK_SIZE * 4) {
chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
state[12] += chacha_advance(bytes, 8);
return;
}
if (bytes > CHACHA_BLOCK_SIZE * 2) {
chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
state[12] += chacha_advance(bytes, 4);
return;
}
if (bytes > CHACHA_BLOCK_SIZE) {
chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
state[12] += chacha_advance(bytes, 2);
return;
}
}
#endif
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
bytes -= CHACHA_BLOCK_SIZE * 4;
src += CHACHA_BLOCK_SIZE * 4;
dst += CHACHA_BLOCK_SIZE * 4;
state[12] += 4;
}
if (bytes > CHACHA_BLOCK_SIZE) {
chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
state[12] += chacha_advance(bytes, 4);
return;
}
if (bytes) {
chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
state[12]++;
}
}
static int chacha_simd_stream_xor(struct skcipher_walk *walk,
struct chacha_ctx *ctx, u8 *iv)
{
u32 *state, state_buf[16 + 2] __aligned(8);
int next_yield = 4096; /* bytes until next FPU yield */
int err = 0;
BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
crypto_chacha_init(state, ctx, iv);
while (walk->nbytes > 0) {
unsigned int nbytes = walk->nbytes;
if (nbytes < walk->total) {
nbytes = round_down(nbytes, walk->stride);
next_yield -= nbytes;
}
chacha_dosimd(state, walk->dst.virt.addr, walk->src.virt.addr,
nbytes, ctx->nrounds);
if (next_yield <= 0) {
/* temporarily allow preemption */
kernel_fpu_end();
kernel_fpu_begin();
next_yield = 4096;
}
err = skcipher_walk_done(walk, walk->nbytes - nbytes);
}
return err;
}
static int chacha_simd(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
int err;
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
return crypto_chacha_crypt(req);
err = skcipher_walk_virt(&walk, req, true);
if (err)
return err;
kernel_fpu_begin();
err = chacha_simd_stream_xor(&walk, ctx, req->iv);
kernel_fpu_end();
return err;
}
static int xchacha_simd(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
struct chacha_ctx subctx;
u32 *state, state_buf[16 + 2] __aligned(8);
u8 real_iv[16];
int err;
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
return crypto_xchacha_crypt(req);
err = skcipher_walk_virt(&walk, req, true);
if (err)
return err;
BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
crypto_chacha_init(state, ctx, req->iv);
kernel_fpu_begin();
hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
subctx.nrounds = ctx->nrounds;
memcpy(&real_iv[0], req->iv + 24, 8);
memcpy(&real_iv[8], req->iv + 16, 8);
err = chacha_simd_stream_xor(&walk, &subctx, real_iv);
kernel_fpu_end();
return err;
}
static struct skcipher_alg algs[] = {
{
.base.cra_name = "chacha20",
.base.cra_driver_name = "chacha20-simd",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.encrypt = chacha_simd,
.decrypt = chacha_simd,
}, {
.base.cra_name = "xchacha20",
.base.cra_driver_name = "xchacha20-simd",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.encrypt = xchacha_simd,
.decrypt = xchacha_simd,
}, {
.base.cra_name = "xchacha12",
.base.cra_driver_name = "xchacha12-simd",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha12_setkey,
.encrypt = xchacha_simd,
.decrypt = xchacha_simd,
},
};
static int __init chacha_simd_mod_init(void)
{
if (!boot_cpu_has(X86_FEATURE_SSSE3))
return -ENODEV;
#ifdef CONFIG_AS_AVX2
chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_AVX2) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
#ifdef CONFIG_AS_AVX512
chacha_use_avx512vl = chacha_use_avx2 &&
boot_cpu_has(X86_FEATURE_AVX512VL) &&
boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
#endif
#endif
return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
}
static void __exit chacha_simd_mod_fini(void)
{
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
module_init(chacha_simd_mod_init);
module_exit(chacha_simd_mod_fini);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)");
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-simd");
MODULE_ALIAS_CRYPTO("xchacha20");
MODULE_ALIAS_CRYPTO("xchacha20-simd");
MODULE_ALIAS_CRYPTO("xchacha12");
MODULE_ALIAS_CRYPTO("xchacha12-simd");

View File

@@ -0,0 +1,157 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* NH - ε-almost-universal hash function, x86_64 AVX2 accelerated
*
* Copyright 2018 Google LLC
*
* Author: Eric Biggers <ebiggers@google.com>
*/
#include <linux/linkage.h>
#define PASS0_SUMS %ymm0
#define PASS1_SUMS %ymm1
#define PASS2_SUMS %ymm2
#define PASS3_SUMS %ymm3
#define K0 %ymm4
#define K0_XMM %xmm4
#define K1 %ymm5
#define K1_XMM %xmm5
#define K2 %ymm6
#define K2_XMM %xmm6
#define K3 %ymm7
#define K3_XMM %xmm7
#define T0 %ymm8
#define T1 %ymm9
#define T2 %ymm10
#define T2_XMM %xmm10
#define T3 %ymm11
#define T3_XMM %xmm11
#define T4 %ymm12
#define T5 %ymm13
#define T6 %ymm14
#define T7 %ymm15
#define KEY %rdi
#define MESSAGE %rsi
#define MESSAGE_LEN %rdx
#define HASH %rcx
.macro _nh_2xstride k0, k1, k2, k3
// Add message words to key words
vpaddd \k0, T3, T0
vpaddd \k1, T3, T1
vpaddd \k2, T3, T2
vpaddd \k3, T3, T3
// Multiply 32x32 => 64 and accumulate
vpshufd $0x10, T0, T4
vpshufd $0x32, T0, T0
vpshufd $0x10, T1, T5
vpshufd $0x32, T1, T1
vpshufd $0x10, T2, T6
vpshufd $0x32, T2, T2
vpshufd $0x10, T3, T7
vpshufd $0x32, T3, T3
vpmuludq T4, T0, T0
vpmuludq T5, T1, T1
vpmuludq T6, T2, T2
vpmuludq T7, T3, T3
vpaddq T0, PASS0_SUMS, PASS0_SUMS
vpaddq T1, PASS1_SUMS, PASS1_SUMS
vpaddq T2, PASS2_SUMS, PASS2_SUMS
vpaddq T3, PASS3_SUMS, PASS3_SUMS
.endm
/*
* void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
* u8 hash[NH_HASH_BYTES])
*
* It's guaranteed that message_len % 16 == 0.
*/
ENTRY(nh_avx2)
vmovdqu 0x00(KEY), K0
vmovdqu 0x10(KEY), K1
add $0x20, KEY
vpxor PASS0_SUMS, PASS0_SUMS, PASS0_SUMS
vpxor PASS1_SUMS, PASS1_SUMS, PASS1_SUMS
vpxor PASS2_SUMS, PASS2_SUMS, PASS2_SUMS
vpxor PASS3_SUMS, PASS3_SUMS, PASS3_SUMS
sub $0x40, MESSAGE_LEN
jl .Lloop4_done
.Lloop4:
vmovdqu (MESSAGE), T3
vmovdqu 0x00(KEY), K2
vmovdqu 0x10(KEY), K3
_nh_2xstride K0, K1, K2, K3
vmovdqu 0x20(MESSAGE), T3
vmovdqu 0x20(KEY), K0
vmovdqu 0x30(KEY), K1
_nh_2xstride K2, K3, K0, K1
add $0x40, MESSAGE
add $0x40, KEY
sub $0x40, MESSAGE_LEN
jge .Lloop4
.Lloop4_done:
and $0x3f, MESSAGE_LEN
jz .Ldone
cmp $0x20, MESSAGE_LEN
jl .Llast
// 2 or 3 strides remain; do 2 more.
vmovdqu (MESSAGE), T3
vmovdqu 0x00(KEY), K2
vmovdqu 0x10(KEY), K3
_nh_2xstride K0, K1, K2, K3
add $0x20, MESSAGE
add $0x20, KEY
sub $0x20, MESSAGE_LEN
jz .Ldone
vmovdqa K2, K0
vmovdqa K3, K1
.Llast:
// Last stride. Zero the high 128 bits of the message and keys so they
// don't affect the result when processing them like 2 strides.
vmovdqu (MESSAGE), T3_XMM
vmovdqa K0_XMM, K0_XMM
vmovdqa K1_XMM, K1_XMM
vmovdqu 0x00(KEY), K2_XMM
vmovdqu 0x10(KEY), K3_XMM
_nh_2xstride K0, K1, K2, K3
.Ldone:
// Sum the accumulators for each pass, then store the sums to 'hash'
// PASS0_SUMS is (0A 0B 0C 0D)
// PASS1_SUMS is (1A 1B 1C 1D)
// PASS2_SUMS is (2A 2B 2C 2D)
// PASS3_SUMS is (3A 3B 3C 3D)
// We need the horizontal sums:
// (0A + 0B + 0C + 0D,
// 1A + 1B + 1C + 1D,
// 2A + 2B + 2C + 2D,
// 3A + 3B + 3C + 3D)
//
vpunpcklqdq PASS1_SUMS, PASS0_SUMS, T0 // T0 = (0A 1A 0C 1C)
vpunpckhqdq PASS1_SUMS, PASS0_SUMS, T1 // T1 = (0B 1B 0D 1D)
vpunpcklqdq PASS3_SUMS, PASS2_SUMS, T2 // T2 = (2A 3A 2C 3C)
vpunpckhqdq PASS3_SUMS, PASS2_SUMS, T3 // T3 = (2B 3B 2D 3D)
vinserti128 $0x1, T2_XMM, T0, T4 // T4 = (0A 1A 2A 3A)
vinserti128 $0x1, T3_XMM, T1, T5 // T5 = (0B 1B 2B 3B)
vperm2i128 $0x31, T2, T0, T0 // T0 = (0C 1C 2C 3C)
vperm2i128 $0x31, T3, T1, T1 // T1 = (0D 1D 2D 3D)
vpaddq T5, T4, T4
vpaddq T1, T0, T0
vpaddq T4, T0, T0
vmovdqu T0, (HASH)
ret
ENDPROC(nh_avx2)

View File

@@ -0,0 +1,123 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
*
* Copyright 2018 Google LLC
*
* Author: Eric Biggers <ebiggers@google.com>
*/
#include <linux/linkage.h>
#define PASS0_SUMS %xmm0
#define PASS1_SUMS %xmm1
#define PASS2_SUMS %xmm2
#define PASS3_SUMS %xmm3
#define K0 %xmm4
#define K1 %xmm5
#define K2 %xmm6
#define K3 %xmm7
#define T0 %xmm8
#define T1 %xmm9
#define T2 %xmm10
#define T3 %xmm11
#define T4 %xmm12
#define T5 %xmm13
#define T6 %xmm14
#define T7 %xmm15
#define KEY %rdi
#define MESSAGE %rsi
#define MESSAGE_LEN %rdx
#define HASH %rcx
.macro _nh_stride k0, k1, k2, k3, offset
// Load next message stride
movdqu \offset(MESSAGE), T1
// Load next key stride
movdqu \offset(KEY), \k3
// Add message words to key words
movdqa T1, T2
movdqa T1, T3
paddd T1, \k0 // reuse k0 to avoid a move
paddd \k1, T1
paddd \k2, T2
paddd \k3, T3
// Multiply 32x32 => 64 and accumulate
pshufd $0x10, \k0, T4
pshufd $0x32, \k0, \k0
pshufd $0x10, T1, T5
pshufd $0x32, T1, T1
pshufd $0x10, T2, T6
pshufd $0x32, T2, T2
pshufd $0x10, T3, T7
pshufd $0x32, T3, T3
pmuludq T4, \k0
pmuludq T5, T1
pmuludq T6, T2
pmuludq T7, T3
paddq \k0, PASS0_SUMS
paddq T1, PASS1_SUMS
paddq T2, PASS2_SUMS
paddq T3, PASS3_SUMS
.endm
/*
* void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
* u8 hash[NH_HASH_BYTES])
*
* It's guaranteed that message_len % 16 == 0.
*/
ENTRY(nh_sse2)
movdqu 0x00(KEY), K0
movdqu 0x10(KEY), K1
movdqu 0x20(KEY), K2
add $0x30, KEY
pxor PASS0_SUMS, PASS0_SUMS
pxor PASS1_SUMS, PASS1_SUMS
pxor PASS2_SUMS, PASS2_SUMS
pxor PASS3_SUMS, PASS3_SUMS
sub $0x40, MESSAGE_LEN
jl .Lloop4_done
.Lloop4:
_nh_stride K0, K1, K2, K3, 0x00
_nh_stride K1, K2, K3, K0, 0x10
_nh_stride K2, K3, K0, K1, 0x20
_nh_stride K3, K0, K1, K2, 0x30
add $0x40, KEY
add $0x40, MESSAGE
sub $0x40, MESSAGE_LEN
jge .Lloop4
.Lloop4_done:
and $0x3f, MESSAGE_LEN
jz .Ldone
_nh_stride K0, K1, K2, K3, 0x00
sub $0x10, MESSAGE_LEN
jz .Ldone
_nh_stride K1, K2, K3, K0, 0x10
sub $0x10, MESSAGE_LEN
jz .Ldone
_nh_stride K2, K3, K0, K1, 0x20
.Ldone:
// Sum the accumulators for each pass, then store the sums to 'hash'
movdqa PASS0_SUMS, T0
movdqa PASS2_SUMS, T1
punpcklqdq PASS1_SUMS, T0 // => (PASS0_SUM_A PASS1_SUM_A)
punpcklqdq PASS3_SUMS, T1 // => (PASS2_SUM_A PASS3_SUM_A)
punpckhqdq PASS1_SUMS, PASS0_SUMS // => (PASS0_SUM_B PASS1_SUM_B)
punpckhqdq PASS3_SUMS, PASS2_SUMS // => (PASS2_SUM_B PASS3_SUM_B)
paddq PASS0_SUMS, T0
paddq PASS2_SUMS, T1
movdqu T0, 0x00(HASH)
movdqu T1, 0x10(HASH)
ret
ENDPROC(nh_sse2)

View File

@@ -0,0 +1,77 @@
// SPDX-License-Identifier: GPL-2.0
/*
* NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
* (AVX2 accelerated version)
*
* Copyright 2018 Google LLC
*/
#include <crypto/internal/hash.h>
#include <crypto/nhpoly1305.h>
#include <linux/module.h>
#include <asm/fpu/api.h>
asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
u8 hash[NH_HASH_BYTES]);
/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
static void _nh_avx2(const u32 *key, const u8 *message, size_t message_len,
__le64 hash[NH_NUM_PASSES])
{
nh_avx2(key, message, message_len, (u8 *)hash);
}
static int nhpoly1305_avx2_update(struct shash_desc *desc,
const u8 *src, unsigned int srclen)
{
if (srclen < 64 || !irq_fpu_usable())
return crypto_nhpoly1305_update(desc, src, srclen);
do {
unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
kernel_fpu_begin();
crypto_nhpoly1305_update_helper(desc, src, n, _nh_avx2);
kernel_fpu_end();
src += n;
srclen -= n;
} while (srclen);
return 0;
}
static struct shash_alg nhpoly1305_alg = {
.base.cra_name = "nhpoly1305",
.base.cra_driver_name = "nhpoly1305-avx2",
.base.cra_priority = 300,
.base.cra_ctxsize = sizeof(struct nhpoly1305_key),
.base.cra_module = THIS_MODULE,
.digestsize = POLY1305_DIGEST_SIZE,
.init = crypto_nhpoly1305_init,
.update = nhpoly1305_avx2_update,
.final = crypto_nhpoly1305_final,
.setkey = crypto_nhpoly1305_setkey,
.descsize = sizeof(struct nhpoly1305_state),
};
static int __init nhpoly1305_mod_init(void)
{
if (!boot_cpu_has(X86_FEATURE_AVX2) ||
!boot_cpu_has(X86_FEATURE_OSXSAVE))
return -ENODEV;
return crypto_register_shash(&nhpoly1305_alg);
}
static void __exit nhpoly1305_mod_exit(void)
{
crypto_unregister_shash(&nhpoly1305_alg);
}
module_init(nhpoly1305_mod_init);
module_exit(nhpoly1305_mod_exit);
MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (AVX2-accelerated)");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
MODULE_ALIAS_CRYPTO("nhpoly1305");
MODULE_ALIAS_CRYPTO("nhpoly1305-avx2");

View File

@@ -0,0 +1,76 @@
// SPDX-License-Identifier: GPL-2.0
/*
* NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
* (SSE2 accelerated version)
*
* Copyright 2018 Google LLC
*/
#include <crypto/internal/hash.h>
#include <crypto/nhpoly1305.h>
#include <linux/module.h>
#include <asm/fpu/api.h>
asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
u8 hash[NH_HASH_BYTES]);
/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
static void _nh_sse2(const u32 *key, const u8 *message, size_t message_len,
__le64 hash[NH_NUM_PASSES])
{
nh_sse2(key, message, message_len, (u8 *)hash);
}
static int nhpoly1305_sse2_update(struct shash_desc *desc,
const u8 *src, unsigned int srclen)
{
if (srclen < 64 || !irq_fpu_usable())
return crypto_nhpoly1305_update(desc, src, srclen);
do {
unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
kernel_fpu_begin();
crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2);
kernel_fpu_end();
src += n;
srclen -= n;
} while (srclen);
return 0;
}
static struct shash_alg nhpoly1305_alg = {
.base.cra_name = "nhpoly1305",
.base.cra_driver_name = "nhpoly1305-sse2",
.base.cra_priority = 200,
.base.cra_ctxsize = sizeof(struct nhpoly1305_key),
.base.cra_module = THIS_MODULE,
.digestsize = POLY1305_DIGEST_SIZE,
.init = crypto_nhpoly1305_init,
.update = nhpoly1305_sse2_update,
.final = crypto_nhpoly1305_final,
.setkey = crypto_nhpoly1305_setkey,
.descsize = sizeof(struct nhpoly1305_state),
};
static int __init nhpoly1305_mod_init(void)
{
if (!boot_cpu_has(X86_FEATURE_XMM2))
return -ENODEV;
return crypto_register_shash(&nhpoly1305_alg);
}
static void __exit nhpoly1305_mod_exit(void)
{
crypto_unregister_shash(&nhpoly1305_alg);
}
module_init(nhpoly1305_mod_init);
module_exit(nhpoly1305_mod_exit);
MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (SSE2-accelerated)");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
MODULE_ALIAS_CRYPTO("nhpoly1305");
MODULE_ALIAS_CRYPTO("nhpoly1305-sse2");

View File

@@ -83,35 +83,37 @@ static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) {
if (unlikely(!sctx->wset)) {
if (!sctx->uset) {
memcpy(sctx->u, dctx->r, sizeof(sctx->u));
poly1305_simd_mult(sctx->u, dctx->r);
memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
poly1305_simd_mult(sctx->u, dctx->r.r);
sctx->uset = true;
}
memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u));
poly1305_simd_mult(sctx->u + 5, dctx->r);
poly1305_simd_mult(sctx->u + 5, dctx->r.r);
memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u));
poly1305_simd_mult(sctx->u + 10, dctx->r);
poly1305_simd_mult(sctx->u + 10, dctx->r.r);
sctx->wset = true;
}
blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
poly1305_4block_avx2(dctx->h, src, dctx->r, blocks, sctx->u);
poly1305_4block_avx2(dctx->h.h, src, dctx->r.r, blocks,
sctx->u);
src += POLY1305_BLOCK_SIZE * 4 * blocks;
srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
}
#endif
if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
if (unlikely(!sctx->uset)) {
memcpy(sctx->u, dctx->r, sizeof(sctx->u));
poly1305_simd_mult(sctx->u, dctx->r);
memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
poly1305_simd_mult(sctx->u, dctx->r.r);
sctx->uset = true;
}
blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
poly1305_2block_sse2(dctx->h, src, dctx->r, blocks, sctx->u);
poly1305_2block_sse2(dctx->h.h, src, dctx->r.r, blocks,
sctx->u);
src += POLY1305_BLOCK_SIZE * 2 * blocks;
srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
}
if (srclen >= POLY1305_BLOCK_SIZE) {
poly1305_block_sse2(dctx->h, src, dctx->r, 1);
poly1305_block_sse2(dctx->h.h, src, dctx->r.r, 1);
srclen -= POLY1305_BLOCK_SIZE;
}
return srclen;