Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto update from Herbert Xu:
 - Added aesni/avx/x86_64 implementations for camellia.
 - Optimised AVX code for cast5/serpent/twofish/cast6.
 - Fixed vmac bug with unaligned input.
 - Allow compression algorithms in FIPS mode.
 - Optimised crc32c implementation for Intel.
 - Misc fixes.

* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (32 commits)
  crypto: caam - Updated SEC-4.0 device tree binding for ERA information.
  crypto: testmgr - remove superfluous initializers for xts(aes)
  crypto: testmgr - allow compression algs in fips mode
  crypto: testmgr - add larger crc32c test vector to test FPU path in crc32c_intel
  crypto: testmgr - clean alg_test_null entries in alg_test_descs[]
  crypto: testmgr - remove fips_allowed flag from camellia-aesni null-tests
  crypto: cast5/cast6 - move lookup tables to shared module
  padata: use __this_cpu_read per-cpu helper
  crypto: s5p-sss - Fix compilation error
  crypto: picoxcell - Add terminating entry for platform_device_id table
  crypto: omap-aes - select BLKCIPHER2
  crypto: camellia - add AES-NI/AVX/x86_64 assembler implementation of camellia cipher
  crypto: camellia-x86_64 - share common functions and move structures and function definitions to header file
  crypto: tcrypt - add async speed test for camellia cipher
  crypto: tegra-aes - fix error-valued pointer dereference
  crypto: tegra - fix missing unlock on error case
  crypto: cast5/avx - avoid using temporary stack buffers
  crypto: serpent/avx - avoid using temporary stack buffers
  crypto: twofish/avx - avoid using temporary stack buffers
  crypto: cast6/avx - avoid using temporary stack buffers
  ...
arch/x86/crypto/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
 obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
+obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
@@ -34,6 +35,8 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
 camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
+camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
+				camellia_aesni_avx_glue.o
 cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
 cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
@@ -47,3 +50,5 @@ serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
+crc32c-intel-y := crc32c-intel_glue.o
+crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o
arch/x86/crypto/camellia-aesni-avx-asm_64.S (new file, 1102 lines)
	File diff suppressed because it is too large

arch/x86/crypto/camellia_aesni_avx_glue.c (new file, 558 lines)
@@ -0,0 +1,558 @@
+/*
+ * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia
+ *
+ * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/ctr.h>
+#include <crypto/lrw.h>
+#include <crypto/xts.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <asm/crypto/camellia.h>
+#include <asm/crypto/ablk_helper.h>
+#include <asm/crypto/glue_helper.h>
+
+#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
+
+/* 16-way AES-NI parallel cipher functions */
+asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+
+asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst,
+				   const u8 *src, le128 *iv);
+
+static const struct common_glue_ctx camellia_enc = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) }
+	}, {
+		.num_blocks = 2,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
+	} }
+};
+
+static const struct common_glue_ctx camellia_ctr = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) }
+	}, {
+		.num_blocks = 2,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
+	} }
+};
+
+static const struct common_glue_ctx camellia_dec = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) }
+	}, {
+		.num_blocks = 2,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
+	} }
+};
+
+static const struct common_glue_ctx camellia_dec_cbc = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) }
+	}, {
+		.num_blocks = 2,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
+	} }
+};
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes);
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc,
+				       dst, src, nbytes);
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src,
+				       nbytes);
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes);
+}
+
+static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+	return glue_fpu_begin(CAMELLIA_BLOCK_SIZE,
+			      CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled,
+			      nbytes);
+}
+
+static inline void camellia_fpu_end(bool fpu_enabled)
+{
+	glue_fpu_end(fpu_enabled);
+}
+
+static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
+			   unsigned int key_len)
+{
+	return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len,
+				 &tfm->crt_flags);
+}
+
+struct crypt_priv {
+	struct camellia_ctx *ctx;
+	bool fpu_enabled;
+};
+
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
+		camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		camellia_enc_blk(ctx->ctx, srcdst, srcdst);
+}
+
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
+		camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		camellia_dec_blk(ctx->ctx, srcdst, srcdst);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->camellia_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	camellia_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->camellia_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	camellia_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->crypt_ctx,
+		.fpu_enabled = false,
+	};
+	struct xts_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.tweak_ctx = &ctx->tweak_ctx,
+		.tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = xts_crypt(desc, dst, src, nbytes, &req);
+	camellia_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->crypt_ctx,
+		.fpu_enabled = false,
+	};
+	struct xts_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.tweak_ctx = &ctx->tweak_ctx,
+		.tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = xts_crypt(desc, dst, src, nbytes, &req);
+	camellia_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static struct crypto_alg cmll_algs[10] = { {
+	.cra_name		= "__ecb-camellia-aesni",
+	.cra_driver_name	= "__driver-ecb-camellia-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.setkey		= camellia_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__cbc-camellia-aesni",
+	.cra_driver_name	= "__driver-cbc-camellia-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.setkey		= camellia_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__ctr-camellia-aesni",
+	.cra_driver_name	= "__driver-ctr-camellia-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct camellia_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= camellia_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+}, {
+	.cra_name		= "__lrw-camellia-aesni",
+	.cra_driver_name	= "__driver-lrw-camellia-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_lrw_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_exit		= lrw_camellia_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE +
+					  CAMELLIA_BLOCK_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE +
+					  CAMELLIA_BLOCK_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= lrw_camellia_setkey,
+			.encrypt	= lrw_encrypt,
+			.decrypt	= lrw_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__xts-camellia-aesni",
+	.cra_driver_name	= "__driver-xts-camellia-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_xts_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE * 2,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE * 2,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= xts_camellia_setkey,
+			.encrypt	= xts_encrypt,
+			.decrypt	= xts_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ecb(camellia)",
+	.cra_driver_name	= "ecb-camellia-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(camellia)",
+	.cra_driver_name	= "cbc-camellia-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= __ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(camellia)",
+	.cra_driver_name	= "ctr-camellia-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_encrypt,
+			.geniv		= "chainiv",
+		},
+	},
+}, {
+	.cra_name		= "lrw(camellia)",
+	.cra_driver_name	= "lrw-camellia-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE +
+					  CAMELLIA_BLOCK_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE +
+					  CAMELLIA_BLOCK_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "xts(camellia)",
+	.cra_driver_name	= "xts-camellia-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE * 2,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE * 2,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+} };
+
+static int __init camellia_aesni_init(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) {
+		pr_info("AVX or AES-NI instructions are not detected.\n");
+		return -ENODEV;
+	}
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		pr_info("AVX detected but unusable.\n");
+		return -ENODEV;
+	}
+
+	return crypto_register_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
+}
+
+static void __exit camellia_aesni_fini(void)
+{
+	crypto_unregister_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
+}
+
+module_init(camellia_aesni_init);
+module_exit(camellia_aesni_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX optimized");
+MODULE_ALIAS("camellia");
+MODULE_ALIAS("camellia-asm");
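The glue file above drives every mode through common_glue_ctx tables: the helper walks the .funcs array from the widest batch (16-way AES-NI) down through the 2-way and 1-way fallbacks. A minimal C sketch of that width-fallback dispatch, with simplified types and illustrative names (the real glue_ecb_crypt_128bit also manages the blkcipher page walk and the FPU begin/end batching):

	/* Sketch only: "glue_func_entry" and "glue_ecb_sketch" are
	 * illustrative names, not the kernel's. */
	typedef unsigned char u8;
	typedef void (*ecb_fn_t)(void *ctx, u8 *dst, const u8 *src);

	struct glue_func_entry {
		unsigned int num_blocks;	/* blocks processed per call */
		ecb_fn_t fn;
	};

	static void glue_ecb_sketch(const struct glue_func_entry *funcs,
				    unsigned int num_funcs, void *ctx, u8 *dst,
				    const u8 *src, unsigned int nblocks,
				    unsigned int bsize)
	{
		unsigned int i;

		while (nblocks) {
			/* entries are sorted widest first (16, 2, 1); pick
			 * the widest routine that still fits the data left */
			for (i = 0; i < num_funcs; i++)
				if (funcs[i].num_blocks <= nblocks)
					break;

			funcs[i].fn(ctx, dst, src);
			dst += funcs[i].num_blocks * bsize;
			src += funcs[i].num_blocks * bsize;
			nblocks -= funcs[i].num_blocks;
		}
	}

Because the last entry always handles a single block, the loop terminates for any request length.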
arch/x86/crypto/camellia_glue.c
@@ -32,53 +32,24 @@
 #include <crypto/algapi.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
+#include <asm/crypto/camellia.h>
 #include <asm/crypto/glue_helper.h>
 
-#define CAMELLIA_MIN_KEY_SIZE	16
-#define CAMELLIA_MAX_KEY_SIZE	32
-#define CAMELLIA_BLOCK_SIZE	16
-#define CAMELLIA_TABLE_BYTE_LEN	272
-
-struct camellia_ctx {
-	u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)];
-	u32 key_length;
-};
-
 /* regular block cipher functions */
 asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
 				   const u8 *src, bool xor);
+EXPORT_SYMBOL_GPL(__camellia_enc_blk);
 asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst,
 				 const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_dec_blk);
 
 /* 2-way parallel cipher functions */
 asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
 					const u8 *src, bool xor);
+EXPORT_SYMBOL_GPL(__camellia_enc_blk_2way);
 asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,
 				      const u8 *src);
-
-static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
-				    const u8 *src)
-{
-	__camellia_enc_blk(ctx, dst, src, false);
-}
-
-static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst,
-					const u8 *src)
-{
-	__camellia_enc_blk(ctx, dst, src, true);
-}
-
-static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
-					 const u8 *src)
-{
-	__camellia_enc_blk_2way(ctx, dst, src, false);
-}
-
-static inline void camellia_enc_blk_xor_2way(struct camellia_ctx *ctx, u8 *dst,
-					     const u8 *src)
-{
-	__camellia_enc_blk_2way(ctx, dst, src, true);
-}
+EXPORT_SYMBOL_GPL(camellia_dec_blk_2way);
 
 static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
@@ -1275,9 +1246,8 @@ static void camellia_setup192(const unsigned char *key, u64 *subkey)
 	camellia_setup256(kk, subkey);
 }
 
-static int __camellia_setkey(struct camellia_ctx *cctx,
-			     const unsigned char *key,
-			     unsigned int key_len, u32 *flags)
+int __camellia_setkey(struct camellia_ctx *cctx, const unsigned char *key,
+		      unsigned int key_len, u32 *flags)
 {
 	if (key_len != 16 && key_len != 24 && key_len != 32) {
 		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
@@ -1300,6 +1270,7 @@ static int __camellia_setkey(struct camellia_ctx *cctx,
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(__camellia_setkey);
 
 static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
 			   unsigned int key_len)
@@ -1308,7 +1279,7 @@ static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
 				 &tfm->crt_flags);
 }
 
-static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
+void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
 {
 	u128 iv = *src;
 
@@ -1316,22 +1287,23 @@ static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
 
 	u128_xor(&dst[1], &dst[1], &iv);
 }
+EXPORT_SYMBOL_GPL(camellia_decrypt_cbc_2way);
 
-static void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
+void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	be128 ctrblk;
 
 	if (dst != src)
 		*dst = *src;
 
-	u128_to_be128(&ctrblk, iv);
-	u128_inc(iv);
+	le128_to_be128(&ctrblk, iv);
+	le128_inc(iv);
 
 	camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
 }
+EXPORT_SYMBOL_GPL(camellia_crypt_ctr);
 
-static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
-				    u128 *iv)
+void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	be128 ctrblks[2];
 
@@ -1340,13 +1312,14 @@ static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
 		dst[1] = src[1];
 	}
 
-	u128_to_be128(&ctrblks[0], iv);
-	u128_inc(iv);
-	u128_to_be128(&ctrblks[1], iv);
-	u128_inc(iv);
+	le128_to_be128(&ctrblks[0], iv);
+	le128_inc(iv);
+	le128_to_be128(&ctrblks[1], iv);
+	le128_inc(iv);
 
 	camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks);
 }
+EXPORT_SYMBOL_GPL(camellia_crypt_ctr_2way);
 
 static const struct common_glue_ctx camellia_enc = {
 	.num_funcs = 2,
@@ -1464,13 +1437,8 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 		camellia_dec_blk(ctx, srcdst, srcdst);
 }
 
-struct camellia_lrw_ctx {
-	struct lrw_table_ctx lrw_table;
-	struct camellia_ctx camellia_ctx;
-};
-
-static int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
-			       unsigned int keylen)
+int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
+			unsigned int keylen)
 {
 	struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
 	int err;
@@ -1484,6 +1452,7 @@ static int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
 	return lrw_init_table(&ctx->lrw_table,
 			      key + keylen - CAMELLIA_BLOCK_SIZE);
 }
+EXPORT_SYMBOL_GPL(lrw_camellia_setkey);
 
 static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
@@ -1519,20 +1488,16 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return lrw_crypt(desc, dst, src, nbytes, &req);
 }
 
-static void lrw_exit_tfm(struct crypto_tfm *tfm)
+void lrw_camellia_exit_tfm(struct crypto_tfm *tfm)
 {
 	struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
 
 	lrw_free_table(&ctx->lrw_table);
 }
+EXPORT_SYMBOL_GPL(lrw_camellia_exit_tfm);
 
-struct camellia_xts_ctx {
-	struct camellia_ctx tweak_ctx;
-	struct camellia_ctx crypt_ctx;
-};
-
-static int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
-			       unsigned int keylen)
+int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
+			unsigned int keylen)
 {
 	struct camellia_xts_ctx *ctx = crypto_tfm_ctx(tfm);
 	u32 *flags = &tfm->crt_flags;
@@ -1555,6 +1520,7 @@ static int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
 	return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
 				flags);
 }
+EXPORT_SYMBOL_GPL(xts_camellia_setkey);
 
 static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
@@ -1679,7 +1645,7 @@ static struct crypto_alg camellia_algs[6] = { {
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_blkcipher_type,
 	.cra_module		= THIS_MODULE,
-	.cra_exit		= lrw_exit_tfm,
+	.cra_exit		= lrw_camellia_exit_tfm,
 	.cra_u = {
 		.blkcipher = {
 			.min_keysize	= CAMELLIA_MIN_KEY_SIZE +
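The camellia_glue.c hunks above also switch the exported CTR helpers from u128 to le128 counters, matching the glue_helper conversion elsewhere in this series. A standalone sketch of the arithmetic those helpers perform, using hypothetical types and names rather than the kernel's le128/be128 definitions:

	#include <stdint.h>

	/* Sketch of the counter handling behind camellia_crypt_ctr(): the IV
	 * is a 128-bit little-endian counter; each block it is converted to a
	 * big-endian block, encrypted, XORed into the data, then incremented
	 * with carry.  "_sketch" names are illustrative. */
	struct le128_sketch { uint64_t b, a; };	/* b = low word, a = high word */

	static void le128_inc_sketch(struct le128_sketch *i)
	{
		i->b++;
		if (i->b == 0)	/* carry from the low into the high 64 bits */
			i->a++;
	}

	static void le128_to_be128_sketch(uint64_t out_be[2],
					  const struct le128_sketch *in)
	{
		/* big-endian block: high word first, each word byte-swapped
		 * (assuming a little-endian CPU, as on x86) */
		out_be[0] = __builtin_bswap64(in->a);
		out_be[1] = __builtin_bswap64(in->b);
	}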
arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -25,10 +25,10 @@
 
 .file "cast5-avx-x86_64-asm_64.S"
 
-.extern cast5_s1
-.extern cast5_s2
-.extern cast5_s3
-.extern cast5_s4
+.extern cast_s1
+.extern cast_s2
+.extern cast_s3
+.extern cast_s4
 
 /* structure of crypto context */
 #define km	0
@@ -36,10 +36,10 @@
 #define rr	((16*4)+16)
 
 /* s-boxes */
-#define s1	cast5_s1
-#define s2	cast5_s2
-#define s3	cast5_s3
-#define s4	cast5_s4
+#define s1	cast_s1
+#define s2	cast_s2
+#define s3	cast_s3
+#define s4	cast_s4
 
 /**********************************************************************
   16-way AVX cast5
@@ -180,31 +180,17 @@
 	vpunpcklqdq		t1, t0, x0; \
 	vpunpckhqdq		t1, t0, x1;
 
-#define inpack_blocks(in, x0, x1, t0, t1, rmask) \
-	vmovdqu (0*4*4)(in),	x0; \
-	vmovdqu (1*4*4)(in),	x1; \
+#define inpack_blocks(x0, x1, t0, t1, rmask) \
 	vpshufb rmask,	x0,	x0; \
 	vpshufb rmask,	x1,	x1; \
 	\
 	transpose_2x4(x0, x1, t0, t1)
 
-#define outunpack_blocks(out, x0, x1, t0, t1, rmask) \
+#define outunpack_blocks(x0, x1, t0, t1, rmask) \
 	transpose_2x4(x0, x1, t0, t1) \
 	\
 	vpshufb rmask,	x0, x0; \
-	vpshufb rmask,	x1, x1; \
-	vmovdqu x0,	(0*4*4)(out); \
-	vmovdqu x1,	(1*4*4)(out);
-
-#define outunpack_xor_blocks(out, x0, x1, t0, t1, rmask) \
-	transpose_2x4(x0, x1, t0, t1) \
-	\
-	vpshufb rmask,	x0, x0; \
-	vpshufb rmask,	x1, x1; \
-	vpxor (0*4*4)(out),	x0, x0; \
-	vmovdqu x0,	(0*4*4)(out); \
-	vpxor (1*4*4)(out),	x1, x1; \
-	vmovdqu x1,	(1*4*4)(out);
+	vpshufb rmask,	x1, x1;
@@ -213,6 +199,8 @@
 	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lbswap_iv_mask:
+	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
 .L16_mask:
 	.byte 16, 16, 16, 16
 .L32_mask:
@@ -223,35 +211,42 @@
 .text
 
 .align 16
-.global __cast5_enc_blk_16way
-.type __cast5_enc_blk_16way,@function;
+.type __cast5_enc_blk16,@function;
 
-__cast5_enc_blk_16way:
+__cast5_enc_blk16:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 *	%rcx: bool, if true: xor output
+	 *	RL1: blocks 1 and 2
+	 *	RR1: blocks 3 and 4
+	 *	RL2: blocks 5 and 6
+	 *	RR2: blocks 7 and 8
+	 *	RL3: blocks 9 and 10
+	 *	RR3: blocks 11 and 12
+	 *	RL4: blocks 13 and 14
+	 *	RR4: blocks 15 and 16
+	 * output:
+	 *	RL1: encrypted blocks 1 and 2
+	 *	RR1: encrypted blocks 3 and 4
+	 *	RL2: encrypted blocks 5 and 6
+	 *	RR2: encrypted blocks 7 and 8
+	 *	RL3: encrypted blocks 9 and 10
+	 *	RR3: encrypted blocks 11 and 12
+	 *	RL4: encrypted blocks 13 and 14
+	 *	RR4: encrypted blocks 15 and 16
 	 */
 
 	pushq %rbp;
 	pushq %rbx;
-	pushq %rcx;
 
 	vmovdqa .Lbswap_mask, RKM;
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
 	enc_preload_rkr();
 
-	leaq 1*(2*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM);
-	inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM);
-	leaq 2*(2*4*4)(%rdx), %rax;
-	inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM);
-	leaq 3*(2*4*4)(%rdx), %rax;
-	inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM);
-
-	movq %rsi, %r11;
+	inpack_blocks(RL1, RR1, RTMP, RX, RKM);
+	inpack_blocks(RL2, RR2, RTMP, RX, RKM);
+	inpack_blocks(RL3, RR3, RTMP, RX, RKM);
+	inpack_blocks(RL4, RR4, RTMP, RX, RKM);
 
 	round(RL, RR, 0, 1);
 	round(RR, RL, 1, 2);
@@ -276,44 +271,41 @@ __cast5_enc_blk_16way:
 	round(RR, RL, 15, 1);
 
 __skip_enc:
-	popq %rcx;
 	popq %rbx;
 	popq %rbp;
 
 	vmovdqa .Lbswap_mask, RKM;
-	leaq 1*(2*4*4)(%r11), %rax;
-
-	testb %cl, %cl;
-	jnz __enc_xor16;
-
-	outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
-	outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
-	leaq 2*(2*4*4)(%r11), %rax;
-	outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
-	leaq 3*(2*4*4)(%r11), %rax;
-	outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
-
-	ret;
-
-__enc_xor16:
-	outunpack_xor_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
-	outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
-	leaq 2*(2*4*4)(%r11), %rax;
-	outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
-	leaq 3*(2*4*4)(%r11), %rax;
-	outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
+	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
+	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
+	outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
+	outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
 
 	ret;
 
 .align 16
-.global cast5_dec_blk_16way
-.type cast5_dec_blk_16way,@function;
+.type __cast5_dec_blk16,@function;
 
-cast5_dec_blk_16way:
+__cast5_dec_blk16:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
+	 *	RL1: encrypted blocks 1 and 2
+	 *	RR1: encrypted blocks 3 and 4
+	 *	RL2: encrypted blocks 5 and 6
+	 *	RR2: encrypted blocks 7 and 8
+	 *	RL3: encrypted blocks 9 and 10
+	 *	RR3: encrypted blocks 11 and 12
+	 *	RL4: encrypted blocks 13 and 14
+	 *	RR4: encrypted blocks 15 and 16
+	 * output:
+	 *	RL1: decrypted blocks 1 and 2
+	 *	RR1: decrypted blocks 3 and 4
+	 *	RL2: decrypted blocks 5 and 6
+	 *	RR2: decrypted blocks 7 and 8
+	 *	RL3: decrypted blocks 9 and 10
+	 *	RR3: decrypted blocks 11 and 12
+	 *	RL4: decrypted blocks 13 and 14
+	 *	RR4: decrypted blocks 15 and 16
 	 */
 
 	pushq %rbp;
@@ -324,15 +316,10 @@ cast5_dec_blk_16way:
 	vmovd .L32_mask, R32;
 	dec_preload_rkr();
 
-	leaq 1*(2*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM);
-	inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM);
-	leaq 2*(2*4*4)(%rdx), %rax;
-	inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM);
-	leaq 3*(2*4*4)(%rdx), %rax;
-	inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM);
-
-	movq %rsi, %r11;
+	inpack_blocks(RL1, RR1, RTMP, RX, RKM);
+	inpack_blocks(RL2, RR2, RTMP, RX, RKM);
+	inpack_blocks(RL3, RR3, RTMP, RX, RKM);
+	inpack_blocks(RL4, RR4, RTMP, RX, RKM);
 
 	movzbl rr(CTX), %eax;
 	testl %eax, %eax;
@@ -361,16 +348,211 @@ __dec_tail:
 	popq %rbx;
 	popq %rbp;
 
-	leaq 1*(2*4*4)(%r11), %rax;
-	outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
-	outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
-	leaq 2*(2*4*4)(%r11), %rax;
-	outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
-	leaq 3*(2*4*4)(%r11), %rax;
-	outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
+	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
+	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
+	outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
+	outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
 
 	ret;
 
 __skip_dec:
 	vpsrldq $4, RKR, RKR;
 	jmp __dec_tail;
+
+.align 16
+.global cast5_ecb_enc_16way
+.type cast5_ecb_enc_16way,@function;
+
+cast5_ecb_enc_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	vmovdqu (0*4*4)(%rdx), RL1;
+	vmovdqu (1*4*4)(%rdx), RR1;
+	vmovdqu (2*4*4)(%rdx), RL2;
+	vmovdqu (3*4*4)(%rdx), RR2;
+	vmovdqu (4*4*4)(%rdx), RL3;
+	vmovdqu (5*4*4)(%rdx), RR3;
+	vmovdqu (6*4*4)(%rdx), RL4;
+	vmovdqu (7*4*4)(%rdx), RR4;
+
+	call __cast5_enc_blk16;
+
+	vmovdqu RR1, (0*4*4)(%r11);
+	vmovdqu RL1, (1*4*4)(%r11);
+	vmovdqu RR2, (2*4*4)(%r11);
+	vmovdqu RL2, (3*4*4)(%r11);
+	vmovdqu RR3, (4*4*4)(%r11);
+	vmovdqu RL3, (5*4*4)(%r11);
+	vmovdqu RR4, (6*4*4)(%r11);
+	vmovdqu RL4, (7*4*4)(%r11);
+
+	ret;
+
+.align 16
+.global cast5_ecb_dec_16way
+.type cast5_ecb_dec_16way,@function;
+
+cast5_ecb_dec_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	vmovdqu (0*4*4)(%rdx), RL1;
+	vmovdqu (1*4*4)(%rdx), RR1;
+	vmovdqu (2*4*4)(%rdx), RL2;
+	vmovdqu (3*4*4)(%rdx), RR2;
+	vmovdqu (4*4*4)(%rdx), RL3;
+	vmovdqu (5*4*4)(%rdx), RR3;
+	vmovdqu (6*4*4)(%rdx), RL4;
+	vmovdqu (7*4*4)(%rdx), RR4;
+
+	call __cast5_dec_blk16;
+
+	vmovdqu RR1, (0*4*4)(%r11);
+	vmovdqu RL1, (1*4*4)(%r11);
+	vmovdqu RR2, (2*4*4)(%r11);
+	vmovdqu RL2, (3*4*4)(%r11);
+	vmovdqu RR3, (4*4*4)(%r11);
+	vmovdqu RL3, (5*4*4)(%r11);
+	vmovdqu RR4, (6*4*4)(%r11);
+	vmovdqu RL4, (7*4*4)(%r11);
+
+	ret;
+
+.align 16
+.global cast5_cbc_dec_16way
+.type cast5_cbc_dec_16way,@function;
+
+cast5_cbc_dec_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	vmovdqu (0*16)(%rdx), RL1;
+	vmovdqu (1*16)(%rdx), RR1;
+	vmovdqu (2*16)(%rdx), RL2;
+	vmovdqu (3*16)(%rdx), RR2;
+	vmovdqu (4*16)(%rdx), RL3;
+	vmovdqu (5*16)(%rdx), RR3;
+	vmovdqu (6*16)(%rdx), RL4;
+	vmovdqu (7*16)(%rdx), RR4;
+
+	call __cast5_dec_blk16;
+
+	/* xor with src */
+	vmovq (%r12), RX;
+	vpshufd $0x4f, RX, RX;
+	vpxor RX, RR1, RR1;
+	vpxor 0*16+8(%r12), RL1, RL1;
+	vpxor 1*16+8(%r12), RR2, RR2;
+	vpxor 2*16+8(%r12), RL2, RL2;
+	vpxor 3*16+8(%r12), RR3, RR3;
+	vpxor 4*16+8(%r12), RL3, RL3;
+	vpxor 5*16+8(%r12), RR4, RR4;
+	vpxor 6*16+8(%r12), RL4, RL4;
+
+	vmovdqu RR1, (0*16)(%r11);
+	vmovdqu RL1, (1*16)(%r11);
+	vmovdqu RR2, (2*16)(%r11);
+	vmovdqu RL2, (3*16)(%r11);
+	vmovdqu RR3, (4*16)(%r11);
+	vmovdqu RL3, (5*16)(%r11);
+	vmovdqu RR4, (6*16)(%r11);
+	vmovdqu RL4, (7*16)(%r11);
+
+	popq %r12;
+
+	ret;
+
+.align 16
+.global cast5_ctr_16way
+.type cast5_ctr_16way,@function;
+
+cast5_ctr_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (big endian, 64bit)
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	vpcmpeqd RTMP, RTMP, RTMP;
+	vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */
+
+	vpcmpeqd RKR, RKR, RKR;
+	vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
+	vmovdqa .Lbswap_iv_mask, R1ST;
+	vmovdqa .Lbswap128_mask, RKM;
+
+	/* load IV and byteswap */
+	vmovq (%rcx), RX;
+	vpshufb R1ST, RX, RX;
+
+	/* construct IVs */
+	vpsubq RTMP, RX, RX; /* le: IV1, IV0 */
+	vpshufb RKM, RX, RL1; /* be: IV0, IV1 */
+	vpsubq RKR, RX, RX;
+	vpshufb RKM, RX, RR1; /* be: IV2, IV3 */
+	vpsubq RKR, RX, RX;
+	vpshufb RKM, RX, RL2; /* be: IV4, IV5 */
+	vpsubq RKR, RX, RX;
+	vpshufb RKM, RX, RR2; /* be: IV6, IV7 */
+	vpsubq RKR, RX, RX;
+	vpshufb RKM, RX, RL3; /* be: IV8, IV9 */
+	vpsubq RKR, RX, RX;
+	vpshufb RKM, RX, RR3; /* be: IV10, IV11 */
+	vpsubq RKR, RX, RX;
+	vpshufb RKM, RX, RL4; /* be: IV12, IV13 */
+	vpsubq RKR, RX, RX;
+	vpshufb RKM, RX, RR4; /* be: IV14, IV15 */
+
+	/* store last IV */
+	vpsubq RTMP, RX, RX; /* le: IV16, IV14 */
+	vpshufb R1ST, RX, RX; /* be: IV16, IV16 */
+	vmovq RX, (%rcx);
+
+	call __cast5_enc_blk16;
+
+	/* dst = src ^ iv */
+	vpxor (0*16)(%r12), RR1, RR1;
+	vpxor (1*16)(%r12), RL1, RL1;
+	vpxor (2*16)(%r12), RR2, RR2;
+	vpxor (3*16)(%r12), RL2, RL2;
+	vpxor (4*16)(%r12), RR3, RR3;
+	vpxor (5*16)(%r12), RL3, RL3;
+	vpxor (6*16)(%r12), RR4, RR4;
+	vpxor (7*16)(%r12), RL4, RL4;
+	vmovdqu RR1, (0*16)(%r11);
+	vmovdqu RL1, (1*16)(%r11);
+	vmovdqu RR2, (2*16)(%r11);
+	vmovdqu RL2, (3*16)(%r11);
+	vmovdqu RR3, (4*16)(%r11);
+	vmovdqu RL3, (5*16)(%r11);
+	vmovdqu RR4, (6*16)(%r11);
+	vmovdqu RL4, (7*16)(%r11);
+
+	popq %r12;
+
+	ret;
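cast5_ctr_16way above builds its sixteen 64-bit counter blocks entirely in xmm registers: vpsubq with the constants -1 and -2 adds 1 or 2 to two lanes at once, and vpshufb byte-swaps the results to big endian. A scalar C sketch of what that IV setup computes (helper names are illustrative, not the kernel's):

	#include <stdint.h>

	/* Expand one 64-bit big-endian counter into 16 consecutive
	 * big-endian counter blocks and write back iv+16, as the asm's
	 * vector code does. */
	static uint64_t bswap64_sketch(uint64_t v)
	{
		return __builtin_bswap64(v); /* stands in for vpshufb */
	}

	static void cast5_ctr_iv_expand_sketch(uint64_t *iv_be,
					       uint64_t out_be[16])
	{
		uint64_t ctr = bswap64_sketch(*iv_be); /* to CPU endian */
		int i;

		for (i = 0; i < 16; i++)
			out_be[i] = bswap64_sketch(ctr + i);

		*iv_be = bswap64_sketch(ctr + 16); /* store last IV */
	}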
arch/x86/crypto/cast5_avx_glue.c
@@ -37,29 +37,14 @@
 
 #define CAST5_PARALLEL_BLOCKS 16
 
-asmlinkage void __cast5_enc_blk_16way(struct cast5_ctx *ctx, u8 *dst,
-				      const u8 *src, bool xor);
-asmlinkage void cast5_dec_blk_16way(struct cast5_ctx *ctx, u8 *dst,
+asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst,
 				    const u8 *src);
-
-static inline void cast5_enc_blk_xway(struct cast5_ctx *ctx, u8 *dst,
-				      const u8 *src)
-{
-	__cast5_enc_blk_16way(ctx, dst, src, false);
-}
-
-static inline void cast5_enc_blk_xway_xor(struct cast5_ctx *ctx, u8 *dst,
-					  const u8 *src)
-{
-	__cast5_enc_blk_16way(ctx, dst, src, true);
-}
-
-static inline void cast5_dec_blk_xway(struct cast5_ctx *ctx, u8 *dst,
-				      const u8 *src)
-{
-	cast5_dec_blk_16way(ctx, dst, src);
-}
-
+asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst,
+				    const u8 *src);
+asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst,
+				    const u8 *src);
+asmlinkage void cast5_ctr_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src,
+				__be64 *iv);
 
 static inline bool cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes)
 {
@@ -79,8 +64,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
 	struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
 	const unsigned int bsize = CAST5_BLOCK_SIZE;
 	unsigned int nbytes;
+	void (*fn)(struct cast5_ctx *ctx, u8 *dst, const u8 *src);
 	int err;
 
+	fn = (enc) ? cast5_ecb_enc_16way : cast5_ecb_dec_16way;
+
 	err = blkcipher_walk_virt(desc, walk);
 	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
 
@@ -93,10 +81,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
 		/* Process multi-block batch */
 		if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
 			do {
-				if (enc)
-					cast5_enc_blk_xway(ctx, wdst, wsrc);
-				else
-					cast5_dec_blk_xway(ctx, wdst, wsrc);
+				fn(ctx, wdst, wsrc);
 
 				wsrc += bsize * CAST5_PARALLEL_BLOCKS;
 				wdst += bsize * CAST5_PARALLEL_BLOCKS;
@@ -107,12 +92,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
 			goto done;
 		}
 
+		fn = (enc) ? __cast5_encrypt : __cast5_decrypt;
+
 		/* Handle leftovers */
 		do {
-			if (enc)
-				__cast5_encrypt(ctx, wdst, wsrc);
-			else
-				__cast5_decrypt(ctx, wdst, wsrc);
+			fn(ctx, wdst, wsrc);
 
 			wsrc += bsize;
 			wdst += bsize;
@@ -194,9 +178,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
 	unsigned int nbytes = walk->nbytes;
 	u64 *src = (u64 *)walk->src.virt.addr;
 	u64 *dst = (u64 *)walk->dst.virt.addr;
-	u64 ivs[CAST5_PARALLEL_BLOCKS - 1];
 	u64 last_iv;
-	int i;
 
 	/* Start of the last block. */
 	src += nbytes / bsize - 1;
@@ -211,13 +193,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
 			src -= CAST5_PARALLEL_BLOCKS - 1;
 			dst -= CAST5_PARALLEL_BLOCKS - 1;
 
-			for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
-				ivs[i] = src[i];
-
-			cast5_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
-
-			for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
-				*(dst + (i + 1)) ^= *(ivs + i);
+			cast5_cbc_dec_16way(ctx, (u8 *)dst, (u8 *)src);
 
 			nbytes -= bsize;
 			if (nbytes < bsize)
@@ -298,23 +274,12 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
 	unsigned int nbytes = walk->nbytes;
 	u64 *src = (u64 *)walk->src.virt.addr;
 	u64 *dst = (u64 *)walk->dst.virt.addr;
-	u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
-	__be64 ctrblocks[CAST5_PARALLEL_BLOCKS];
-	int i;
 
 	/* Process multi-block batch */
 	if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
 		do {
-			/* create ctrblks for parallel encrypt */
-			for (i = 0; i < CAST5_PARALLEL_BLOCKS; i++) {
-				if (dst != src)
-					dst[i] = src[i];
-
-				ctrblocks[i] = cpu_to_be64(ctrblk++);
-			}
-
-			cast5_enc_blk_xway_xor(ctx, (u8 *)dst,
-					       (u8 *)ctrblocks);
+			cast5_ctr_16way(ctx, (u8 *)dst, (u8 *)src,
+					(__be64 *)walk->iv);
 
 			src += CAST5_PARALLEL_BLOCKS;
 			dst += CAST5_PARALLEL_BLOCKS;
@@ -327,13 +292,16 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
 
 	/* Handle leftovers */
 	do {
+		u64 ctrblk;
+
 		if (dst != src)
 			*dst = *src;
 
-		ctrblocks[0] = cpu_to_be64(ctrblk++);
+		ctrblk = *(u64 *)walk->iv;
+		be64_add_cpu((__be64 *)walk->iv, 1);
 
-		__cast5_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
-		*dst ^= ctrblocks[0];
+		__cast5_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+		*dst ^= ctrblk;
 
 		src += 1;
 		dst += 1;
@@ -341,7 +309,6 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
 	} while (nbytes >= bsize);
 
 done:
-	*(__be64 *)walk->iv = cpu_to_be64(ctrblk);
 	return nbytes;
 }
 
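One recurring theme in the cast5 glue changes above: ecb_crypt() now hoists the enc/dec choice into a function pointer once per width instead of branching inside the hot loop. A hedged sketch of that pattern, with illustrative names:

	/* Choose the routine once, outside the per-block loop, instead of
	 * testing "enc" on every iteration.  Sketch only; not the kernel's
	 * exact code. */
	typedef void (*blk_fn_t)(void *ctx, unsigned char *dst,
				 const unsigned char *src);

	static void ecb_loop_sketch(void *ctx, unsigned char *dst,
				    const unsigned char *src,
				    unsigned int nblocks, unsigned int bsize,
				    blk_fn_t enc_fn, blk_fn_t dec_fn, int enc)
	{
		blk_fn_t fn = enc ? enc_fn : dec_fn;

		while (nblocks--) {
			fn(ctx, dst, src);
			dst += bsize;
			src += bsize;
		}
	}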
arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -23,22 +23,24 @@
  *
  */
 
+#include "glue_helper-asm-avx.S"
+
 .file "cast6-avx-x86_64-asm_64.S"
 
-.extern cast6_s1
-.extern cast6_s2
-.extern cast6_s3
-.extern cast6_s4
+.extern cast_s1
+.extern cast_s2
+.extern cast_s3
+.extern cast_s4
 
 /* structure of crypto context */
 #define km	0
 #define kr	(12*4*4)
 
 /* s-boxes */
-#define s1	cast6_s1
-#define s2	cast6_s2
-#define s3	cast6_s3
-#define s4	cast6_s4
+#define s1	cast_s1
+#define s2	cast_s2
+#define s3	cast_s3
+#define s4	cast_s4
 
 /**********************************************************************
   8-way AVX cast6
@@ -205,11 +207,7 @@
 	vpunpcklqdq		x3, t2, x2; \
 	vpunpckhqdq		x3, t2, x3;
 
-#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2, rmask) \
-	vmovdqu (0*4*4)(in),	x0; \
-	vmovdqu (1*4*4)(in),	x1; \
-	vmovdqu (2*4*4)(in),	x2; \
-	vmovdqu (3*4*4)(in),	x3; \
+#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
 	vpshufb rmask,		x0, x0; \
 	vpshufb rmask,		x1, x1; \
 	vpshufb rmask,		x2, x2; \
@@ -217,39 +215,21 @@
 	\
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 
-#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
+#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 	\
 	vpshufb rmask,		x0, x0; \
 	vpshufb rmask,		x1, x1; \
 	vpshufb rmask,		x2, x2; \
-	vpshufb rmask,		x3, x3; \
-	vmovdqu x0,		(0*4*4)(out); \
-	vmovdqu x1,		(1*4*4)(out); \
-	vmovdqu x2,		(2*4*4)(out); \
-	vmovdqu x3,		(3*4*4)(out);
-
-#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
-	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
-	\
-	vpshufb rmask,		x0, x0; \
-	vpshufb rmask,		x1, x1; \
-	vpshufb rmask,		x2, x2; \
-	vpshufb rmask,		x3, x3; \
-	vpxor (0*4*4)(out),	x0, x0; \
-	vmovdqu x0,		(0*4*4)(out); \
-	vpxor (1*4*4)(out),	x1, x1; \
-	vmovdqu x1,		(1*4*4)(out); \
-	vpxor (2*4*4)(out),	x2, x2; \
-	vmovdqu x2,		(2*4*4)(out); \
-	vpxor (3*4*4)(out),	x3, x3; \
-	vmovdqu x3,		(3*4*4)(out);
+	vpshufb rmask,		x3, x3;
 
 .data
 
 .align 16
 .Lbswap_mask:
 	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 .Lrkr_enc_Q_Q_QBAR_QBAR:
 	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
 .Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
@@ -269,31 +249,26 @@
 
 .text
 
-.align 16
-.global __cast6_enc_blk_8way
-.type __cast6_enc_blk_8way,@function;
+.align 8
+.type __cast6_enc_blk8,@function;
 
-__cast6_enc_blk_8way:
+__cast6_enc_blk8:
	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 *	%rcx: bool, if true: xor output
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+	 * output:
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
 	 */
 
 	pushq %rbp;
 	pushq %rbx;
-	pushq %rcx;
 
 	vmovdqa .Lbswap_mask, RKM;
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
 
-	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
-	movq %rsi, %r11;
+	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	preload_rkr(0, dummy, none);
 	Q(0);
@@ -311,36 +286,25 @@ __cast6_enc_blk_8way:
 	QBAR(10);
 	QBAR(11);
 
-	popq %rcx;
 	popq %rbx;
 	popq %rbp;
 
 	vmovdqa .Lbswap_mask, RKM;
-	leaq (4*4*4)(%r11), %rax;
-
-	testb %cl, %cl;
-	jnz __enc_xor8;
-
-	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
-	ret;
-
-__enc_xor8:
-	outunpack_xor_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	ret;
 
-.align 16
-.global cast6_dec_blk_8way
-.type cast6_dec_blk_8way,@function;
+.align 8
+.type __cast6_dec_blk8,@function;
 
-cast6_dec_blk_8way:
+__cast6_dec_blk8:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+	 * output:
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
 	 */
 
 	pushq %rbp;
@@ -350,11 +314,8 @@ cast6_dec_blk_8way:
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
 
-	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
-	movq %rsi, %r11;
+	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
 	Q(11);
@@ -376,8 +337,103 @@ cast6_dec_blk_8way:
 	popq %rbp;
 
 	vmovdqa .Lbswap_mask, RKM;
-	leaq (4*4*4)(%r11), %rax;
-	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	ret;
+
+.align 8
+.global cast6_ecb_enc_8way
+.type cast6_ecb_enc_8way,@function;
+
+cast6_ecb_enc_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __cast6_enc_blk8;
+
+	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+
+.align 8
+.global cast6_ecb_dec_8way
+.type cast6_ecb_dec_8way,@function;
+
+cast6_ecb_dec_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __cast6_dec_blk8;
+
+	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+
+.align 8
+.global cast6_cbc_dec_8way
+.type cast6_cbc_dec_8way,@function;
+
+cast6_cbc_dec_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __cast6_dec_blk8;
+
+	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	popq %r12;
+
+	ret;
+
+.align 8
+.global cast6_ctr_8way
+.type cast6_ctr_8way,@function;
+
+cast6_ctr_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (little endian, 128bit)
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		      RD2, RX, RKR, RKM);
+
+	call __cast6_enc_blk8;
+
+	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	popq %r12;
+
+	ret;
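cast6_ctr_8way relies on load_ctr_8way from glue_helper-asm-avx.S to expand a 128-bit big-endian counter into eight consecutive blocks. Per block, the increment it implements is, in scalar form, a plain 128-bit add-with-carry (sketch with an illustrative helper name; the asm tracks the carry with vector compare/subtract tricks):

	#include <stdint.h>

	/* Scalar sketch of the 128-bit big-endian counter increment done
	 * per block by load_ctr_8way; not the kernel's code. */
	static void be128_inc_sketch(uint64_t *hi, uint64_t *lo)
	{
		if (++(*lo) == 0)	/* carry out of the low 64 bits */
			++(*hi);
	}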
@@ -40,79 +40,34 @@
|
||||
|
||||
#define CAST6_PARALLEL_BLOCKS 8
|
||||
|
||||
asmlinkage void __cast6_enc_blk_8way(struct cast6_ctx *ctx, u8 *dst,
|
||||
const u8 *src, bool xor);
|
||||
asmlinkage void cast6_dec_blk_8way(struct cast6_ctx *ctx, u8 *dst,
|
||||
asmlinkage void cast6_ecb_enc_8way(struct cast6_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
asmlinkage void cast6_ecb_dec_8way(struct cast6_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
|
||||
static inline void cast6_enc_blk_xway(struct cast6_ctx *ctx, u8 *dst,
|
||||
const u8 *src)
|
||||
{
|
||||
__cast6_enc_blk_8way(ctx, dst, src, false);
|
||||
}
|
||||
asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst,
|
||||
const u8 *src);
|
||||
asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src,
|
||||
le128 *iv);
|
||||
|
||||
static inline void cast6_enc_blk_xway_xor(struct cast6_ctx *ctx, u8 *dst,
|
||||
const u8 *src)
|
||||
{
|
||||
__cast6_enc_blk_8way(ctx, dst, src, true);
|
||||
}
|
||||
|
||||
static inline void cast6_dec_blk_xway(struct cast6_ctx *ctx, u8 *dst,
|
||||
const u8 *src)
|
||||
{
|
||||
cast6_dec_blk_8way(ctx, dst, src);
|
||||
}
|
||||
|
||||
|
||||
static void cast6_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
|
||||
{
|
||||
u128 ivs[CAST6_PARALLEL_BLOCKS - 1];
|
||||
unsigned int j;
|
||||
|
||||
for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
|
||||
ivs[j] = src[j];
|
||||
|
||||
cast6_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
|
||||
|
||||
for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
|
||||
u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
|
||||
}
|
||||
|
||||
static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
|
||||
static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
|
||||
{
|
||||
be128 ctrblk;
|
||||
|
||||
u128_to_be128(&ctrblk, iv);
|
||||
u128_inc(iv);
|
||||
le128_to_be128(&ctrblk, iv);
|
||||
le128_inc(iv);
|
||||
|
||||
__cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
|
||||
u128_xor(dst, src, (u128 *)&ctrblk);
|
||||
}
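
The single-block fallback above is the textbook CTR construction: serialize the counter as a big-endian block, encrypt it to get keystream, XOR into the data, then bump the little-endian counter. A minimal C sketch of the same steps, assuming stand-in types and helpers (none of these names are the kernel API):

#include <stdint.h>

/* Stand-in for the kernel's le128; field names are hypothetical. */
struct ctr128 { uint64_t lo, hi; };

typedef void (*blk_enc_t)(void *ctx, uint8_t *dst, const uint8_t *src);

/* Serialize the counter as a big-endian 128-bit block (le128_to_be128's job). */
static void store_be128(uint8_t out[16], const struct ctr128 *c)
{
	for (int i = 0; i < 8; i++) {
		out[7 - i]  = (uint8_t)(c->hi >> (8 * i));
		out[15 - i] = (uint8_t)(c->lo >> (8 * i));
	}
}

/* One CTR step: keystream = E_k(counter), dst = src ^ keystream,
 * then increment the little-endian counter (le128_inc's job). */
static void ctr_one_block(void *ctx, blk_enc_t enc, uint8_t dst[16],
			  const uint8_t src[16], struct ctr128 *ctr)
{
	uint8_t ks[16];

	store_be128(ks, ctr);
	enc(ctx, ks, ks);
	for (int i = 0; i < 16; i++)
		dst[i] = src[i] ^ ks[i];
	if (++ctr->lo == 0)	/* carry into the high qword */
		ctr->hi++;
}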

static void cast6_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
u128 *iv)
{
be128 ctrblks[CAST6_PARALLEL_BLOCKS];
unsigned int i;

for (i = 0; i < CAST6_PARALLEL_BLOCKS; i++) {
if (dst != src)
dst[i] = src[i];

u128_to_be128(&ctrblks[i], iv);
u128_inc(iv);
}

cast6_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
}

static const struct common_glue_ctx cast6_enc = {
.num_funcs = 2,
.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,

.funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS,
.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_enc_blk_xway) }
.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_enc_8way) }
}, {
.num_blocks = 1,
.fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) }
@@ -125,7 +80,7 @@ static const struct common_glue_ctx cast6_ctr = {

.funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr_xway) }
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_ctr_8way) }
}, {
.num_blocks = 1,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) }
@@ -138,7 +93,7 @@ static const struct common_glue_ctx cast6_dec = {

.funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS,
.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_dec_blk_xway) }
.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_dec_8way) }
}, {
.num_blocks = 1,
.fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) }
@@ -151,7 +106,7 @@ static const struct common_glue_ctx cast6_dec_cbc = {

.funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_decrypt_cbc_xway) }
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_cbc_dec_8way) }
}, {
.num_blocks = 1,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) }
@@ -215,7 +170,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);

if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
cast6_enc_blk_xway(ctx->ctx, srcdst, srcdst);
cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
return;
}

@@ -232,7 +187,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);

if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
cast6_dec_blk_xway(ctx->ctx, srcdst, srcdst);
cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
return;
}

@@ -32,6 +32,8 @@

#include <asm/cpufeature.h>
#include <asm/cpu_device_id.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>

#define CHKSUM_BLOCK_SIZE 1
#define CHKSUM_DIGEST_SIZE 4
@@ -44,6 +46,31 @@
#define REX_PRE
#endif

#ifdef CONFIG_X86_64
/*
 * use carryless multiply version of crc32c when buffer
 * size is >= 512 (when eager fpu is enabled) or
 * >= 1024 (when eager fpu is disabled) to account
 * for fpu state save/restore overhead.
 */
#define CRC32C_PCL_BREAKEVEN_EAGERFPU 512
#define CRC32C_PCL_BREAKEVEN_NOEAGERFPU 1024

asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
unsigned int crc_init);
static int crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_EAGERFPU;
#if defined(X86_FEATURE_EAGER_FPU)
#define set_pcl_breakeven_point() \
do { \
if (!use_eager_fpu()) \
crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU; \
} while (0)
#else
#define set_pcl_breakeven_point() \
(crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU)
#endif
#endif /* CONFIG_X86_64 */

static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
{
while (length--) {
@@ -154,6 +181,52 @@ static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
return 0;
}

#ifdef CONFIG_X86_64
static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
u32 *crcp = shash_desc_ctx(desc);

/*
 * use faster PCL version if datasize is large enough to
 * overcome kernel fpu state save/restore overhead
 */
if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) {
kernel_fpu_begin();
*crcp = crc_pcl(data, len, *crcp);
kernel_fpu_end();
} else
*crcp = crc32c_intel_le_hw(*crcp, data, len);
return 0;
}

static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
u8 *out)
{
if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) {
kernel_fpu_begin();
*(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
kernel_fpu_end();
} else
*(__le32 *)out =
~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
return 0;
}

static int crc32c_pcl_intel_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return __crc32c_pcl_intel_finup(shash_desc_ctx(desc), data, len, out);
}

static int crc32c_pcl_intel_digest(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return __crc32c_pcl_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
out);
}
#endif /* CONFIG_X86_64 */

static struct shash_alg alg = {
.setkey = crc32c_intel_setkey,
.init = crc32c_intel_init,
@@ -184,6 +257,14 @@ static int __init crc32c_intel_mod_init(void)
{
if (!x86_match_cpu(crc32c_cpu_id))
return -ENODEV;
#ifdef CONFIG_X86_64
if (cpu_has_pclmulqdq) {
alg.update = crc32c_pcl_intel_update;
alg.finup = crc32c_pcl_intel_finup;
alg.digest = crc32c_pcl_intel_digest;
set_pcl_breakeven_point();
}
#endif
return crypto_register_shash(&alg);
}

460
arch/x86/crypto/crc32c-pcl-intel-asm_64.S
Normal file
@@ -0,0 +1,460 @@
/*
 * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
 *
 * The white paper on CRC32C calculations with PCLMULQDQ instruction can be
 * downloaded from:
 * http://download.intel.com/design/intarch/papers/323405.pdf
 *
 * Copyright (C) 2012 Intel Corporation.
 *
 * Authors:
 * Wajdi Feghali <wajdi.k.feghali@intel.com>
 * James Guilford <james.guilford@intel.com>
 * David Cote <david.m.cote@intel.com>
 * Tim Chen <tim.c.chen@linux.intel.com>
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * - Redistributions of source code must retain the above
 * copyright notice, this list of conditions and the following
 * disclaimer.
 *
 * - Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following
 * disclaimer in the documentation and/or other materials
 * provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction

.macro LABEL prefix n
\prefix\n\():
.endm

.macro JMPTBL_ENTRY i
.word crc_\i - crc_array
.endm

.macro JNC_LESS_THAN j
jnc less_than_\j
.endm

# Define threshold where buffers are considered "small" and routed to more
# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so
# SMALL_SIZE can be no larger than 255.

#define SMALL_SIZE 200

.if (SMALL_SIZE > 255)
.error "SMALL_ SIZE must be < 256"
|
||||
.endif

# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);

.global crc_pcl
crc_pcl:
#define bufp %rdi
#define bufp_dw %edi
#define bufp_w %di
#define bufp_b %dil
#define bufptmp %rcx
#define block_0 %rcx
#define block_1 %rdx
#define block_2 %r11
#define len %rsi
#define len_dw %esi
#define len_w %si
#define len_b %sil
#define crc_init_arg %rdx
#define tmp %rbx
#define crc_init %r8
#define crc_init_dw %r8d
#define crc1 %r9
#define crc2 %r10

pushq %rbx
pushq %rdi
pushq %rsi

## Move crc_init for Linux to a different register
mov crc_init_arg, crc_init

################################################################
## 1) ALIGN:
################################################################

mov bufp, bufptmp # rdi = *buf
neg bufp
and $7, bufp # calculate the unalignment amount of
# the address
je proc_block # Skip if aligned

## If len is less than 8 and we're unaligned, we need to jump
## to special code to avoid reading beyond the end of the buffer
cmp $8, len
jae do_align
# less_than_8 expects length in upper 3 bits of len_dw
# less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
shl $32-3+1, len_dw
jmp less_than_8_post_shl1

do_align:
#### Calculate CRC of unaligned bytes of the buffer (if any)
movq (bufptmp), tmp # load a quadword from the buffer
add bufp, bufptmp # align buffer pointer for quadword
# processing
sub bufp, len # update buffer length
align_loop:
crc32b %bl, crc_init_dw # compute crc32 of 1-byte
shr $8, tmp # get next byte
dec bufp
jne align_loop

proc_block:

################################################################
## 2) PROCESS BLOCKS:
################################################################

## compute num of bytes to be processed
movq len, tmp # save num bytes in tmp

cmpq $128*24, len
jae full_block

continue_block:
cmpq $SMALL_SIZE, len
jb small

## len < 128*24
movq $2731, %rax # 2731 = ceil(2^16 / 24)
mul len_dw
shrq $16, %rax

## eax contains floor(bytes / 24) = num 24-byte chunks to do

## process rax 24-byte chunks (128 >= rax >= 0)

## compute end address of each block
## block 0 (base addr + RAX * 8)
## block 1 (base addr + RAX * 16)
## block 2 (base addr + RAX * 24)
lea (bufptmp, %rax, 8), block_0
lea (block_0, %rax, 8), block_1
lea (block_1, %rax, 8), block_2

xor crc1, crc1
xor crc2, crc2

## branch into array
lea jump_table(%rip), bufp
movzxw (bufp, %rax, 2), len
offset=crc_array-jump_table
lea offset(bufp, len, 1), bufp
jmp *bufp

################################################################
## 2a) PROCESS FULL BLOCKS:
################################################################
full_block:
movq $128,%rax
lea 128*8*2(block_0), block_1
lea 128*8*3(block_0), block_2
add $128*8*1, block_0

xor crc1,crc1
xor crc2,crc2

# Fall through into top of crc array (crc_128)

################################################################
## 3) CRC Array:
################################################################

crc_array:
i=128
.rept 128-1
.altmacro
LABEL crc_ %i
.noaltmacro
crc32q -i*8(block_0), crc_init
crc32q -i*8(block_1), crc1
crc32q -i*8(block_2), crc2
i=(i-1)
.endr

.altmacro
LABEL crc_ %i
.noaltmacro
crc32q -i*8(block_0), crc_init
crc32q -i*8(block_1), crc1
# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet

mov block_2, block_0
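
For orientation, the unrolled array above is equivalent to the scalar C loop below (a sketch using the SSE4.2 intrinsic, not kernel code). The jump table simply enters the unrolled body at the depth matching the chunk count; three independent crc32q dependency chains run per iteration, and the asm defers the very last crc2 step so the combine stage can fold in the final quadword:

#include <stdint.h>
#include <string.h>
#include <nmmintrin.h>	/* _mm_crc32_u64, requires SSE4.2 */

/* b0/b1/b2 are end pointers of three adjacent sub-blocks, like
 * block_0/block_1/block_2 above; chunks is the 8-byte step count. */
static void crc_triple(const uint8_t *b0, const uint8_t *b1,
		       const uint8_t *b2, int chunks,
		       uint64_t *crc0, uint64_t *crc1, uint64_t *crc2)
{
	for (int i = chunks; i >= 1; i--) {
		uint64_t q0, q1, q2;

		memcpy(&q0, b0 - i * 8, 8);
		memcpy(&q1, b1 - i * 8, 8);
		memcpy(&q2, b2 - i * 8, 8);
		*crc0 = _mm_crc32_u64(*crc0, q0);
		*crc1 = _mm_crc32_u64(*crc1, q1);
		*crc2 = _mm_crc32_u64(*crc2, q2);
	}
}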

################################################################
## 4) Combine three results:
################################################################

lea (K_table-16)(%rip), bufp # first entry is for idx 1
shlq $3, %rax # rax *= 8
subq %rax, tmp # tmp -= rax*8
shlq $1, %rax
subq %rax, tmp # tmp -= rax*16
# (total tmp -= rax*24)
addq %rax, bufp

movdqa (bufp), %xmm0 # 2 consts: K1:K2

movq crc_init, %xmm1 # CRC for block 1
pclmulqdq $0x00,%xmm0,%xmm1 # Multiply by K2

movq crc1, %xmm2 # CRC for block 2
pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1

pxor %xmm2,%xmm1
movq %xmm1, %rax
xor -i*8(block_2), %rax
mov crc2, crc_init
crc32 %rax, crc_init
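
The combine step rests on CRC linearity: with crc1 and crc2 started at zero, CRC(A||B||C) = shift(CRC(A), |B|+|C|) XOR shift(CRC(B), |C|) XOR CRC(C), where shift(c, n) multiplies c by x^(8n) mod P. K_table precomputes those multipliers for every possible chunk length, so each shift collapses into a single pclmulqdq. A slow but runnable C sketch of the same identity, with hypothetical helper names:

#include <stdint.h>
#include <stddef.h>

/* One byte of bit-reflected CRC32C (poly 0x82F63B78), zero init and no
 * final inversion -- the linear core the identity below depends on. */
static uint32_t crc32c_byte(uint32_t crc, uint8_t b)
{
	crc ^= b;
	for (int i = 0; i < 8; i++)
		crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1));
	return crc;
}

/* shift(c, n): advance c over n zero bytes, i.e. multiply by x^(8n) mod P.
 * This is what one pclmulqdq against a K_table constant computes. */
static uint32_t crc32c_shift(uint32_t crc, size_t n)
{
	while (n--)
		crc = crc32c_byte(crc, 0);
	return crc;
}

/* Combine per-stream CRCs of three equal-length chunks A, B, C. */
static uint32_t crc32c_combine3(uint32_t ca, uint32_t cb, uint32_t cc,
				size_t chunk_len)
{
	return crc32c_shift(ca, 2 * chunk_len) ^
	       crc32c_shift(cb, chunk_len) ^ cc;
}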

################################################################
## 5) Check for end:
################################################################

LABEL crc_ 0
mov tmp, len
cmp $128*24, tmp
jae full_block
cmp $24, tmp
jae continue_block

less_than_24:
shl $32-4, len_dw # less_than_16 expects length
# in upper 4 bits of len_dw
jnc less_than_16
crc32q (bufptmp), crc_init
crc32q 8(bufptmp), crc_init
jz do_return
add $16, bufptmp
# len is less than 8 if we got here
# less_than_8 expects length in upper 3 bits of len_dw
# less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
shl $2, len_dw
jmp less_than_8_post_shl1

#######################################################################
## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
#######################################################################
small:
shl $32-8, len_dw # Prepare len_dw for less_than_256
j=256
.rept 5 # j = {256, 128, 64, 32, 16}
.altmacro
LABEL less_than_ %j # less_than_j: Length should be in
# upper lg(j) bits of len_dw
j=(j/2)
shl $1, len_dw # Get next MSB
JNC_LESS_THAN %j
.noaltmacro
i=0
.rept (j/8)
crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data
i=i+8
.endr
jz do_return # Return if remaining length is zero
add $j, bufptmp # Advance buf
.endr

less_than_8: # Length should be stored in
# upper 3 bits of len_dw
shl $1, len_dw
less_than_8_post_shl1:
jnc less_than_4
crc32l (bufptmp), crc_init_dw # CRC of 4 bytes
jz do_return # return if remaining data is zero
add $4, bufptmp
less_than_4: # Length should be stored in
# upper 2 bits of len_dw
shl $1, len_dw
jnc less_than_2
crc32w (bufptmp), crc_init_dw # CRC of 2 bytes
jz do_return # return if remaining data is zero
add $2, bufptmp
less_than_2: # Length should be stored in the MSB
# of len_dw
shl $1, len_dw
jnc less_than_1
crc32b (bufptmp), crc_init_dw # CRC of 1 byte
less_than_1: # Length should be zero
do_return:
movq crc_init, %rax
popq %rsi
popq %rdi
popq %rbx
ret

################################################################
## jump table: 129 entries x 2 bytes each
################################################################
.align 4
jump_table:
i=0
.rept 129
.altmacro
JMPTBL_ENTRY %i
.noaltmacro
i=i+1
.endr
################################################################
## PCLMULQDQ tables
## Table is 128 entries x 2 quad words each
################################################################
.data
.align 64
K_table:
.quad 0x14cd00bd6,0x105ec76f0
.quad 0x0ba4fc28e,0x14cd00bd6
.quad 0x1d82c63da,0x0f20c0dfe
.quad 0x09e4addf8,0x0ba4fc28e
.quad 0x039d3b296,0x1384aa63a
.quad 0x102f9b8a2,0x1d82c63da
.quad 0x14237f5e6,0x01c291d04
.quad 0x00d3b6092,0x09e4addf8
.quad 0x0c96cfdc0,0x0740eef02
.quad 0x18266e456,0x039d3b296
.quad 0x0daece73e,0x0083a6eec
.quad 0x0ab7aff2a,0x102f9b8a2
.quad 0x1248ea574,0x1c1733996
.quad 0x083348832,0x14237f5e6
.quad 0x12c743124,0x02ad91c30
.quad 0x0b9e02b86,0x00d3b6092
.quad 0x018b33a4e,0x06992cea2
.quad 0x1b331e26a,0x0c96cfdc0
.quad 0x17d35ba46,0x07e908048
.quad 0x1bf2e8b8a,0x18266e456
.quad 0x1a3e0968a,0x11ed1f9d8
.quad 0x0ce7f39f4,0x0daece73e
.quad 0x061d82e56,0x0f1d0f55e
.quad 0x0d270f1a2,0x0ab7aff2a
.quad 0x1c3f5f66c,0x0a87ab8a8
.quad 0x12ed0daac,0x1248ea574
.quad 0x065863b64,0x08462d800
.quad 0x11eef4f8e,0x083348832
.quad 0x1ee54f54c,0x071d111a8
.quad 0x0b3e32c28,0x12c743124
.quad 0x0064f7f26,0x0ffd852c6
.quad 0x0dd7e3b0c,0x0b9e02b86
.quad 0x0f285651c,0x0dcb17aa4
.quad 0x010746f3c,0x018b33a4e
.quad 0x1c24afea4,0x0f37c5aee
.quad 0x0271d9844,0x1b331e26a
.quad 0x08e766a0c,0x06051d5a2
.quad 0x093a5f730,0x17d35ba46
.quad 0x06cb08e5c,0x11d5ca20e
.quad 0x06b749fb2,0x1bf2e8b8a
.quad 0x1167f94f2,0x021f3d99c
.quad 0x0cec3662e,0x1a3e0968a
.quad 0x19329634a,0x08f158014
.quad 0x0e6fc4e6a,0x0ce7f39f4
.quad 0x08227bb8a,0x1a5e82106
.quad 0x0b0cd4768,0x061d82e56
.quad 0x13c2b89c4,0x188815ab2
.quad 0x0d7a4825c,0x0d270f1a2
.quad 0x10f5ff2ba,0x105405f3e
.quad 0x00167d312,0x1c3f5f66c
.quad 0x0f6076544,0x0e9adf796
.quad 0x026f6a60a,0x12ed0daac
.quad 0x1a2adb74e,0x096638b34
.quad 0x19d34af3a,0x065863b64
.quad 0x049c3cc9c,0x1e50585a0
.quad 0x068bce87a,0x11eef4f8e
.quad 0x1524fa6c6,0x19f1c69dc
.quad 0x16cba8aca,0x1ee54f54c
.quad 0x042d98888,0x12913343e
.quad 0x1329d9f7e,0x0b3e32c28
.quad 0x1b1c69528,0x088f25a3a
.quad 0x02178513a,0x0064f7f26
.quad 0x0e0ac139e,0x04e36f0b0
.quad 0x0170076fa,0x0dd7e3b0c
.quad 0x141a1a2e2,0x0bd6f81f8
.quad 0x16ad828b4,0x0f285651c
.quad 0x041d17b64,0x19425cbba
.quad 0x1fae1cc66,0x010746f3c
.quad 0x1a75b4b00,0x18db37e8a
.quad 0x0f872e54c,0x1c24afea4
.quad 0x01e41e9fc,0x04c144932
.quad 0x086d8e4d2,0x0271d9844
.quad 0x160f7af7a,0x052148f02
.quad 0x05bb8f1bc,0x08e766a0c
.quad 0x0a90fd27a,0x0a3c6f37a
.quad 0x0b3af077a,0x093a5f730
.quad 0x04984d782,0x1d22c238e
.quad 0x0ca6ef3ac,0x06cb08e5c
.quad 0x0234e0b26,0x063ded06a
.quad 0x1d88abd4a,0x06b749fb2
.quad 0x04597456a,0x04d56973c
.quad 0x0e9e28eb4,0x1167f94f2
.quad 0x07b3ff57a,0x19385bf2e
.quad 0x0c9c8b782,0x0cec3662e
.quad 0x13a9cba9e,0x0e417f38a
.quad 0x093e106a4,0x19329634a
.quad 0x167001a9c,0x14e727980
.quad 0x1ddffc5d4,0x0e6fc4e6a
.quad 0x00df04680,0x0d104b8fc
.quad 0x02342001e,0x08227bb8a
.quad 0x00a2a8d7e,0x05b397730
.quad 0x168763fa6,0x0b0cd4768
.quad 0x1ed5a407a,0x0e78eb416
.quad 0x0d2c3ed1a,0x13c2b89c4
.quad 0x0995a5724,0x1641378f0
.quad 0x19b1afbc4,0x0d7a4825c
.quad 0x109ffedc0,0x08d96551c
.quad 0x0f2271e60,0x10f5ff2ba
.quad 0x00b0bf8ca,0x00bf80dd2
.quad 0x123888b7a,0x00167d312
.quad 0x1e888f7dc,0x18dcddd1c
.quad 0x002ee03b2,0x0f6076544
.quad 0x183e8d8fe,0x06a45d2b2
.quad 0x133d7a042,0x026f6a60a
.quad 0x116b0f50c,0x1dd3e10e8
.quad 0x05fabe670,0x1a2adb74e
.quad 0x130004488,0x0de87806c
.quad 0x000bcf5f6,0x19d34af3a
.quad 0x18f0c7078,0x014338754
.quad 0x017f27698,0x049c3cc9c
.quad 0x058ca5f00,0x15e3e77ee
.quad 0x1af900c24,0x068bce87a
.quad 0x0b5cfca28,0x0dd07448e
.quad 0x0ded288f8,0x1524fa6c6
.quad 0x059f229bc,0x1d8048348
.quad 0x06d390dec,0x16cba8aca
.quad 0x037170390,0x0a3e3e02c
.quad 0x06353c1cc,0x042d98888
.quad 0x0c4584f5c,0x0d73c7bea
.quad 0x1f16a3418,0x1329d9f7e
.quad 0x0531377e2,0x185137662
.quad 0x1d8d9ca7c,0x1b1c69528
.quad 0x0b25b29f2,0x18a08b5bc
.quad 0x19fb2a8b0,0x02178513a
.quad 0x1a08fe6ac,0x1da758ae0
.quad 0x045cddf4e,0x0e0ac139e
.quad 0x1a91647f2,0x169cf9eb0
.quad 0x1a0f717c4,0x0170076fa

91
arch/x86/crypto/glue_helper-asm-avx.S
Normal file
@@ -0,0 +1,91 @@
/*
 * Shared glue code for 128bit block ciphers, AVX assembler macros
 *
 * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 */

#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
vmovdqu (0*16)(src), x0; \
vmovdqu (1*16)(src), x1; \
vmovdqu (2*16)(src), x2; \
vmovdqu (3*16)(src), x3; \
vmovdqu (4*16)(src), x4; \
vmovdqu (5*16)(src), x5; \
vmovdqu (6*16)(src), x6; \
vmovdqu (7*16)(src), x7;

#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
vmovdqu x0, (0*16)(dst); \
vmovdqu x1, (1*16)(dst); \
vmovdqu x2, (2*16)(dst); \
vmovdqu x3, (3*16)(dst); \
vmovdqu x4, (4*16)(dst); \
vmovdqu x5, (5*16)(dst); \
vmovdqu x6, (6*16)(dst); \
vmovdqu x7, (7*16)(dst);

#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
vpxor (0*16)(src), x1, x1; \
vpxor (1*16)(src), x2, x2; \
vpxor (2*16)(src), x3, x3; \
vpxor (3*16)(src), x4, x4; \
vpxor (4*16)(src), x5, x5; \
vpxor (5*16)(src), x6, x6; \
vpxor (6*16)(src), x7, x7; \
store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
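
store_cbc_8way is the chaining half of CBC decryption, P[i] = D(C[i]) XOR C[i-1]: blocks 1..7 are XORed with the previous ciphertext read straight from the source buffer, so no stack copies of saved IVs are needed; block 0 still gets the chain IV from the C glue. A scalar sketch of the same pattern (all names here are illustrative, not kernel API):

#include <stdint.h>

struct blk16 { uint64_t q[2]; };	/* one 128-bit block */

/* After dec[] holds D(C[0..n-1]), finish CBC: P[i] = dec[i] ^ C[i-1].
 * dec[0] still needs the previous chain IV, applied by the caller. */
static void cbc_chain_tail(struct blk16 *dec, const struct blk16 *ct, int n)
{
	for (int i = n - 1; i >= 1; i--) {
		dec[i].q[0] ^= ct[i - 1].q[0];
		dec[i].q[1] ^= ct[i - 1].q[1];
	}
}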

#define inc_le128(x, minus_one, tmp) \
vpcmpeqq minus_one, x, tmp; \
vpsubq minus_one, x, x; \
vpslldq $8, tmp, tmp; \
vpsubq tmp, x, x;
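
inc_le128 is a branch-free 128-bit little-endian increment: vpcmpeqq records (before the add) whether the low qword is all-ones, vpsubq of minus_one adds 1 to the low qword, and the byte-shifted compare mask then subtracts -1 from (adds 1 to) the high qword exactly when a carry occurred. In scalar C, a sketch with an assumed layout:

#include <stdint.h>

struct le128_sk { uint64_t lo, hi; };	/* assumed layout */

static void inc_le128_sk(struct le128_sk *x)
{
	if (++x->lo == 0)	/* low qword wrapped: propagate carry */
		x->hi++;
}

load_ctr_8way below applies this eight times, byte-reversing each intermediate counter into a big-endian block with vpshufb and writing the advanced counter back through the iv pointer for the next call.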

#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \
vpcmpeqd t0, t0, t0; \
vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \
vmovdqa bswap, t1; \
\
/* load IV and byteswap */ \
vmovdqu (iv), x7; \
vpshufb t1, x7, x0; \
\
/* construct IVs */ \
inc_le128(x7, t0, t2); \
vpshufb t1, x7, x1; \
inc_le128(x7, t0, t2); \
vpshufb t1, x7, x2; \
inc_le128(x7, t0, t2); \
vpshufb t1, x7, x3; \
inc_le128(x7, t0, t2); \
vpshufb t1, x7, x4; \
inc_le128(x7, t0, t2); \
vpshufb t1, x7, x5; \
inc_le128(x7, t0, t2); \
vpshufb t1, x7, x6; \
inc_le128(x7, t0, t2); \
vmovdqa x7, t2; \
vpshufb t1, x7, x7; \
inc_le128(t2, t0, t1); \
vmovdqu t2, (iv);

#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
vpxor (0*16)(src), x0, x0; \
vpxor (1*16)(src), x1, x1; \
vpxor (2*16)(src), x2, x2; \
vpxor (3*16)(src), x3, x3; \
vpxor (4*16)(src), x4, x4; \
vpxor (5*16)(src), x5, x5; \
vpxor (6*16)(src), x6, x6; \
vpxor (7*16)(src), x7, x7; \
store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

@@ -221,16 +221,16 @@ static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr,
u8 *src = (u8 *)walk->src.virt.addr;
u8 *dst = (u8 *)walk->dst.virt.addr;
unsigned int nbytes = walk->nbytes;
u128 ctrblk;
le128 ctrblk;
u128 tmp;

be128_to_u128(&ctrblk, (be128 *)walk->iv);
be128_to_le128(&ctrblk, (be128 *)walk->iv);

memcpy(&tmp, src, nbytes);
fn_ctr(ctx, &tmp, &tmp, &ctrblk);
memcpy(dst, &tmp, nbytes);

u128_to_be128((be128 *)walk->iv, &ctrblk);
le128_to_be128((be128 *)walk->iv, &ctrblk);
}
EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit);
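
The series-wide u128-to-le128 switch visible in this hunk keeps the CTR counter in little-endian form so the SIMD increment above works with plain 64-bit adds; it is byte-reversed into the cipher's big-endian block only at the point of use. A sketch of plausible helper semantics (the real definitions live in the kernel's 128-bit helpers, so treat names, layout, and the glibc htobe64 as assumptions):

#include <stdint.h>
#include <endian.h>	/* htobe64 (glibc); an assumption, not kernel API */

typedef struct { uint64_t b, a; } le128_sk;	/* b = low, a = high */
typedef struct { uint64_t a, b; } be128_sk;	/* a = high, b = low */

static void le128_to_be128_sk(be128_sk *d, const le128_sk *s)
{
	d->a = htobe64(s->a);	/* byte-swap each half on LE hosts */
	d->b = htobe64(s->b);
}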

@@ -243,11 +243,11 @@ static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
unsigned int nbytes = walk->nbytes;
u128 *src = (u128 *)walk->src.virt.addr;
u128 *dst = (u128 *)walk->dst.virt.addr;
u128 ctrblk;
le128 ctrblk;
unsigned int num_blocks, func_bytes;
unsigned int i;

be128_to_u128(&ctrblk, (be128 *)walk->iv);
be128_to_le128(&ctrblk, (be128 *)walk->iv);

/* Process multi-block batch */
for (i = 0; i < gctx->num_funcs; i++) {
@@ -269,7 +269,7 @@ static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
}

done:
u128_to_be128((be128 *)walk->iv, &ctrblk);
le128_to_be128((be128 *)walk->iv, &ctrblk);
return nbytes;
}

@@ -24,7 +24,16 @@
 *
 */

#include "glue_helper-asm-avx.S"

.file "serpent-avx-x86_64-asm_64.S"

.data
.align 16

.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.text

#define CTX %rdi
@@ -550,51 +559,27 @@
vpunpcklqdq x3, t2, x2; \
vpunpckhqdq x3, t2, x3;

#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
vmovdqu (0*4*4)(in), x0; \
vmovdqu (1*4*4)(in), x1; \
vmovdqu (2*4*4)(in), x2; \
vmovdqu (3*4*4)(in), x3; \
\
#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
\
vmovdqu x0, (0*4*4)(out); \
vmovdqu x1, (1*4*4)(out); \
vmovdqu x2, (2*4*4)(out); \
vmovdqu x3, (3*4*4)(out);

#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
\
vpxor (0*4*4)(out), x0, x0; \
vmovdqu x0, (0*4*4)(out); \
vpxor (1*4*4)(out), x1, x1; \
vmovdqu x1, (1*4*4)(out); \
vpxor (2*4*4)(out), x2, x2; \
vmovdqu x2, (2*4*4)(out); \
vpxor (3*4*4)(out), x3, x3; \
vmovdqu x3, (3*4*4)(out);
#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

.align 8
.global __serpent_enc_blk_8way_avx
.type __serpent_enc_blk_8way_avx,@function;
.type __serpent_enc_blk8_avx,@function;

__serpent_enc_blk_8way_avx:
__serpent_enc_blk8_avx:
/* input:
 * %rdi: ctx, CTX
 * %rsi: dst
 * %rdx: src
 * %rcx: bool, if true: xor output
 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
 * output:
 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
 */

vpcmpeqd RNOT, RNOT, RNOT;

leaq (4*4*4)(%rdx), %rax;
read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

K2(RA, RB, RC, RD, RE, 0);
S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
@@ -630,38 +615,26 @@ __serpent_enc_blk_8way_avx:
S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);

leaq (4*4*4)(%rsi), %rax;

testb %cl, %cl;
jnz __enc_xor8;

write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);

ret;

__enc_xor8:
xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

ret;

.align 8
.global serpent_dec_blk_8way_avx
.type serpent_dec_blk_8way_avx,@function;
.type __serpent_dec_blk8_avx,@function;

serpent_dec_blk_8way_avx:
__serpent_dec_blk8_avx:
/* input:
 * %rdi: ctx, CTX
 * %rsi: dst
 * %rdx: src
 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
 * output:
 * RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
 */

vpcmpeqd RNOT, RNOT, RNOT;

leaq (4*4*4)(%rdx), %rax;
read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

K2(RA, RB, RC, RD, RE, 32);
SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
@@ -697,8 +670,85 @@ serpent_dec_blk_8way_avx:
SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);

leaq (4*4*4)(%rsi), %rax;
write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

ret;

.align 8
.global serpent_ecb_enc_8way_avx
.type serpent_ecb_enc_8way_avx,@function;

serpent_ecb_enc_8way_avx:
/* input:
 * %rdi: ctx, CTX
 * %rsi: dst
 * %rdx: src
 */

load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

call __serpent_enc_blk8_avx;

store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

ret;

.align 8
.global serpent_ecb_dec_8way_avx
.type serpent_ecb_dec_8way_avx,@function;

serpent_ecb_dec_8way_avx:
/* input:
 * %rdi: ctx, CTX
 * %rsi: dst
 * %rdx: src
 */

load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

call __serpent_dec_blk8_avx;

store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

ret;

.align 8
.global serpent_cbc_dec_8way_avx
.type serpent_cbc_dec_8way_avx,@function;

serpent_cbc_dec_8way_avx:
/* input:
 * %rdi: ctx, CTX
 * %rsi: dst
 * %rdx: src
 */

load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

call __serpent_dec_blk8_avx;

store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

ret;

.align 8
.global serpent_ctr_8way_avx
.type serpent_ctr_8way_avx,@function;

serpent_ctr_8way_avx:
/* input:
 * %rdi: ctx, CTX
 * %rsi: dst
 * %rdx: src
 * %rcx: iv (little endian, 128bit)
 */

load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
RD2, RK0, RK1, RK2);

call __serpent_enc_blk8_avx;

store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

ret;

@@ -42,55 +42,24 @@
#include <asm/crypto/ablk_helper.h>
#include <asm/crypto/glue_helper.h>

static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
{
u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
unsigned int j;

for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
ivs[j] = src[j];

serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);

for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
}

static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
be128 ctrblk;

u128_to_be128(&ctrblk, iv);
u128_inc(iv);
le128_to_be128(&ctrblk, iv);
le128_inc(iv);

__serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
u128_xor(dst, src, (u128 *)&ctrblk);
}

static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
u128 *iv)
{
be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
unsigned int i;

for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
if (dst != src)
dst[i] = src[i];

u128_to_be128(&ctrblks[i], iv);
u128_inc(iv);
}

serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
}

static const struct common_glue_ctx serpent_enc = {
.num_funcs = 2,
.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,

.funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS,
.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) }
.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) }
}, {
.num_blocks = 1,
.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
@@ -103,7 +72,7 @@ static const struct common_glue_ctx serpent_ctr = {

.funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) }
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
}, {
.num_blocks = 1,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
@@ -116,7 +85,7 @@ static const struct common_glue_ctx serpent_dec = {

.funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS,
.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) }
.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) }
}, {
.num_blocks = 1,
.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
@@ -129,7 +98,7 @@ static const struct common_glue_ctx serpent_dec_cbc = {

.funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) }
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) }
}, {
.num_blocks = 1,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
@@ -193,7 +162,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);

if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
return;
}

@@ -210,7 +179,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);

if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
return;
}

@@ -59,19 +59,19 @@ static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
}

static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
be128 ctrblk;

u128_to_be128(&ctrblk, iv);
u128_inc(iv);
le128_to_be128(&ctrblk, iv);
le128_inc(iv);

__serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
u128_xor(dst, src, (u128 *)&ctrblk);
}

static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
u128 *iv)
le128 *iv)
{
be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
unsigned int i;
@@ -80,8 +80,8 @@ static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
if (dst != src)
dst[i] = src[i];

u128_to_be128(&ctrblks[i], iv);
u128_inc(iv);
le128_to_be128(&ctrblks[i], iv);
le128_inc(iv);
}

serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);

@@ -23,7 +23,16 @@
 *
 */

#include "glue_helper-asm-avx.S"

.file "twofish-avx-x86_64-asm_64.S"

.data
.align 16

.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.text

/* structure of crypto context */
@@ -217,69 +226,45 @@
vpunpcklqdq x3, t2, x2; \
vpunpckhqdq x3, t2, x3;

#define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \
vpxor (0*4*4)(in), wkey, x0; \
vpxor (1*4*4)(in), wkey, x1; \
vpxor (2*4*4)(in), wkey, x2; \
vpxor (3*4*4)(in), wkey, x3; \
#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
vpxor x0, wkey, x0; \
vpxor x1, wkey, x1; \
vpxor x2, wkey, x2; \
vpxor x3, wkey, x3; \
\
transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
\
vpxor x0, wkey, x0; \
vmovdqu x0, (0*4*4)(out); \
vpxor x1, wkey, x1; \
vmovdqu x1, (1*4*4)(out); \
vpxor x2, wkey, x2; \
vmovdqu x2, (2*4*4)(out); \
vpxor x3, wkey, x3; \
vmovdqu x3, (3*4*4)(out);

#define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
\
vpxor x0, wkey, x0; \
vpxor (0*4*4)(out), x0, x0; \
vmovdqu x0, (0*4*4)(out); \
vpxor x1, wkey, x1; \
vpxor (1*4*4)(out), x1, x1; \
vmovdqu x1, (1*4*4)(out); \
vpxor x2, wkey, x2; \
vpxor (2*4*4)(out), x2, x2; \
vmovdqu x2, (2*4*4)(out); \
vpxor x3, wkey, x3; \
vpxor (3*4*4)(out), x3, x3; \
vmovdqu x3, (3*4*4)(out);
vpxor x0, wkey, x0; \
vpxor x1, wkey, x1; \
vpxor x2, wkey, x2; \
vpxor x3, wkey, x3;
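
The slimmed-down inpack/outunpack macros now perform only Twofish's input/output whitening (plus the 4x4 transpose): each block's four 32-bit words are XORed with the four whitening words carried in wkey, one vpxor per xmm register. A scalar sketch of that step (array shapes are illustrative):

#include <stdint.h>

/* XOR the four whitening words into each of four 16-byte blocks,
 * mirroring one vpxor per register in the macros above. */
static void whiten_blocks(uint32_t blk[4][4], const uint32_t w[4])
{
	for (int i = 0; i < 4; i++)
		for (int j = 0; j < 4; j++)
			blk[i][j] ^= w[j];
}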

.align 8
.global __twofish_enc_blk_8way
.type __twofish_enc_blk_8way,@function;
.type __twofish_enc_blk8,@function;

__twofish_enc_blk_8way:
__twofish_enc_blk8:
/* input:
 * %rdi: ctx, CTX
 * %rsi: dst
 * %rdx: src
 * %rcx: bool, if true: xor output
 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
 * output:
 * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
 */

vmovdqu w(CTX), RK1;

pushq %rbp;
pushq %rbx;
pushq %rcx;

vmovdqu w(CTX), RK1;

leaq (4*4*4)(%rdx), %rax;
inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
preload_rgi(RA1);
rotate_1l(RD1);
inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
rotate_1l(RD2);

movq %rsi, %r11;

encrypt_cycle(0);
encrypt_cycle(1);
encrypt_cycle(2);
@@ -295,47 +280,33 @@ __twofish_enc_blk_8way:
popq %rbx;
popq %rbp;

leaq (4*4*4)(%r11), %rax;

testb %cl, %cl;
jnz __enc_xor8;

outunpack_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);

ret;

__enc_xor8:
outunpack_xor_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);

ret;

.align 8
.global twofish_dec_blk_8way
.type twofish_dec_blk_8way,@function;
.type __twofish_dec_blk8,@function;

twofish_dec_blk_8way:
__twofish_dec_blk8:
/* input:
 * %rdi: ctx, CTX
 * %rsi: dst
 * %rdx: src
 * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
 * output:
 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
 */

vmovdqu (w+4*4)(CTX), RK1;

pushq %rbp;
pushq %rbx;

vmovdqu (w+4*4)(CTX), RK1;

leaq (4*4*4)(%rdx), %rax;
inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
preload_rgi(RC1);
rotate_1l(RA1);
inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
rotate_1l(RA2);

movq %rsi, %r11;

decrypt_cycle(7);
decrypt_cycle(6);
decrypt_cycle(5);
@@ -350,8 +321,103 @@ twofish_dec_blk_8way:
popq %rbx;
popq %rbp;

leaq (4*4*4)(%r11), %rax;
outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);

ret;

.align 8
.global twofish_ecb_enc_8way
.type twofish_ecb_enc_8way,@function;

twofish_ecb_enc_8way:
/* input:
 * %rdi: ctx, CTX
 * %rsi: dst
 * %rdx: src
 */

movq %rsi, %r11;

load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

call __twofish_enc_blk8;

store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

ret;

.align 8
.global twofish_ecb_dec_8way
.type twofish_ecb_dec_8way,@function;

twofish_ecb_dec_8way:
/* input:
 * %rdi: ctx, CTX
 * %rsi: dst
 * %rdx: src
 */

movq %rsi, %r11;

load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

call __twofish_dec_blk8;

store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

ret;

.align 8
.global twofish_cbc_dec_8way
.type twofish_cbc_dec_8way,@function;

twofish_cbc_dec_8way:
/* input:
 * %rdi: ctx, CTX
 * %rsi: dst
 * %rdx: src
 */

pushq %r12;

movq %rsi, %r11;
movq %rdx, %r12;

load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

call __twofish_dec_blk8;

store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

popq %r12;

ret;

.align 8
.global twofish_ctr_8way
.type twofish_ctr_8way,@function;

twofish_ctr_8way:
/* input:
 * %rdi: ctx, CTX
 * %rsi: dst
 * %rdx: src
 * %rcx: iv (little endian, 128bit)
 */

pushq %r12;

movq %rsi, %r11;
movq %rdx, %r12;

load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
RD2, RX0, RX1, RY0);

call __twofish_enc_blk8;

store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

popq %r12;

ret;

@@ -45,66 +45,23 @@

#define TWOFISH_PARALLEL_BLOCKS 8

/* 8-way parallel cipher functions */
asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src);

asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src, le128 *iv);

static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src)
{
__twofish_enc_blk_3way(ctx, dst, src, false);
}

/* 8-way parallel cipher functions */
asmlinkage void __twofish_enc_blk_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src, bool xor);
asmlinkage void twofish_dec_blk_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src);

static inline void twofish_enc_blk_xway(struct twofish_ctx *ctx, u8 *dst,
const u8 *src)
{
__twofish_enc_blk_8way(ctx, dst, src, false);
}

static inline void twofish_enc_blk_xway_xor(struct twofish_ctx *ctx, u8 *dst,
const u8 *src)
{
__twofish_enc_blk_8way(ctx, dst, src, true);
}

static inline void twofish_dec_blk_xway(struct twofish_ctx *ctx, u8 *dst,
const u8 *src)
{
twofish_dec_blk_8way(ctx, dst, src);
}

static void twofish_dec_blk_cbc_xway(void *ctx, u128 *dst, const u128 *src)
{
u128 ivs[TWOFISH_PARALLEL_BLOCKS - 1];
unsigned int j;

for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
ivs[j] = src[j];

twofish_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);

for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
}

static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src,
u128 *iv)
{
be128 ctrblks[TWOFISH_PARALLEL_BLOCKS];
unsigned int i;

for (i = 0; i < TWOFISH_PARALLEL_BLOCKS; i++) {
if (dst != src)
dst[i] = src[i];

u128_to_be128(&ctrblks[i], iv);
u128_inc(iv);
}

twofish_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
}

static const struct common_glue_ctx twofish_enc = {
.num_funcs = 3,
@@ -112,7 +69,7 @@ static const struct common_glue_ctx twofish_enc = {

.funcs = { {
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_xway) }
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }
}, {
.num_blocks = 3,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
@@ -128,7 +85,7 @@ static const struct common_glue_ctx twofish_ctr = {

.funcs = { {
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_xway) }
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }
}, {
.num_blocks = 3,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
@@ -144,7 +101,7 @@ static const struct common_glue_ctx twofish_dec = {

.funcs = { {
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_xway) }
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }
}, {
.num_blocks = 3,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
@@ -160,7 +117,7 @@ static const struct common_glue_ctx twofish_dec_cbc = {

.funcs = { {
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_xway) }
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }
}, {
.num_blocks = 3,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
@@ -227,7 +184,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);

if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
twofish_enc_blk_xway(ctx->ctx, srcdst, srcdst);
twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
return;
}

@@ -249,7 +206,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);

if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
twofish_dec_blk_xway(ctx->ctx, srcdst, srcdst);
twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
return;
}

@@ -62,15 +62,15 @@ void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src)
}
EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way);

void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
be128 ctrblk;

if (dst != src)
*dst = *src;

u128_to_be128(&ctrblk, iv);
u128_inc(iv);
le128_to_be128(&ctrblk, iv);
le128_inc(iv);

twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
u128_xor(dst, dst, (u128 *)&ctrblk);
@@ -78,7 +78,7 @@ void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr);

void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
u128 *iv)
le128 *iv)
{
be128 ctrblks[3];

@@ -88,12 +88,12 @@ void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
dst[2] = src[2];
}

u128_to_be128(&ctrblks[0], iv);
u128_inc(iv);
u128_to_be128(&ctrblks[1], iv);
u128_inc(iv);
u128_to_be128(&ctrblks[2], iv);
u128_inc(iv);
le128_to_be128(&ctrblks[0], iv);
le128_inc(iv);
le128_to_be128(&ctrblks[1], iv);
le128_inc(iv);
le128_to_be128(&ctrblks[2], iv);
le128_inc(iv);

twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks);
}