chacha_glue.c

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * x64 SIMD accelerated ChaCha and XChaCha stream ciphers,
 * including ChaCha20 (RFC7539)
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <crypto/algapi.h>
#include <crypto/internal/chacha.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sizes.h>
#include <asm/simd.h>
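
/*
 * Assembly backends. Each *block_xor* routine generates the ChaCha keystream
 * for up to the stated number of 64-byte blocks and XORs it into dst; a
 * trailing partial block is handled by the routine itself, so 'len' need not
 * be a multiple of the block size.
 */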
asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
                                       unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
                                        unsigned int len, int nrounds);
asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);

asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
                                       unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
                                       unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
                                       unsigned int len, int nrounds);

asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
                                           unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
                                           unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
                                           unsigned int len, int nrounds);
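
/*
 * Static keys flipped on once at module init, based on the CPU features
 * detected in chacha_simd_mod_init() below.
 */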
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
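
/*
 * Number of 64-byte blocks consumed when encrypting 'len' bytes with a
 * routine that processes at most 'maxblocks' blocks per call; a trailing
 * partial block still advances the counter by one.
 */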
static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
{
        len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
        return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
}
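
/*
 * SIMD bulk path: use the widest enabled backend. The AVX-512VL and AVX2
 * paths loop over whole multiples of 8 blocks, then hand the tail to the
 * smallest routine that covers it; the SSSE3 fallback works 4 blocks at a
 * time. state[12] is the 32-bit block counter and is advanced as we go.
 */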
static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
                          unsigned int bytes, int nrounds)
{
        if (IS_ENABLED(CONFIG_AS_AVX512) &&
            static_branch_likely(&chacha_use_avx512vl)) {
                while (bytes >= CHACHA_BLOCK_SIZE * 8) {
                        chacha_8block_xor_avx512vl(state, dst, src, bytes,
                                                   nrounds);
                        bytes -= CHACHA_BLOCK_SIZE * 8;
                        src += CHACHA_BLOCK_SIZE * 8;
                        dst += CHACHA_BLOCK_SIZE * 8;
                        state[12] += 8;
                }
                if (bytes > CHACHA_BLOCK_SIZE * 4) {
                        chacha_8block_xor_avx512vl(state, dst, src, bytes,
                                                   nrounds);
                        state[12] += chacha_advance(bytes, 8);
                        return;
                }
                if (bytes > CHACHA_BLOCK_SIZE * 2) {
                        chacha_4block_xor_avx512vl(state, dst, src, bytes,
                                                   nrounds);
                        state[12] += chacha_advance(bytes, 4);
                        return;
                }
                if (bytes) {
                        chacha_2block_xor_avx512vl(state, dst, src, bytes,
                                                   nrounds);
                        state[12] += chacha_advance(bytes, 2);
                        return;
                }
        }

        if (static_branch_likely(&chacha_use_avx2)) {
                while (bytes >= CHACHA_BLOCK_SIZE * 8) {
                        chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
                        bytes -= CHACHA_BLOCK_SIZE * 8;
                        src += CHACHA_BLOCK_SIZE * 8;
                        dst += CHACHA_BLOCK_SIZE * 8;
                        state[12] += 8;
                }
                if (bytes > CHACHA_BLOCK_SIZE * 4) {
                        chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
                        state[12] += chacha_advance(bytes, 8);
                        return;
                }
                if (bytes > CHACHA_BLOCK_SIZE * 2) {
                        chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
                        state[12] += chacha_advance(bytes, 4);
                        return;
                }
                if (bytes > CHACHA_BLOCK_SIZE) {
                        chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
                        state[12] += chacha_advance(bytes, 2);
                        return;
                }
        }

        while (bytes >= CHACHA_BLOCK_SIZE * 4) {
                chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
                bytes -= CHACHA_BLOCK_SIZE * 4;
                src += CHACHA_BLOCK_SIZE * 4;
                dst += CHACHA_BLOCK_SIZE * 4;
                state[12] += 4;
        }
        if (bytes > CHACHA_BLOCK_SIZE) {
                chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
                state[12] += chacha_advance(bytes, 4);
                return;
        }
        if (bytes) {
                chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
                state[12]++;
        }
}
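
/*
 * Arch hooks for the ChaCha library interface: fall back to the generic C
 * implementation whenever kernel-mode SIMD cannot be used in the current
 * context.
 */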
void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
{
        if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) {
                hchacha_block_generic(state, stream, nrounds);
        } else {
                kernel_fpu_begin();
                hchacha_block_ssse3(state, stream, nrounds);
                kernel_fpu_end();
        }
}
EXPORT_SYMBOL(hchacha_block_arch);

void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
{
        chacha_init_generic(state, key, iv);
}
EXPORT_SYMBOL(chacha_init_arch);
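
/*
 * Bulk library entry point. Short requests (one block or less) and contexts
 * where SIMD is unusable go through the generic code; everything else is
 * processed in chunks of at most SZ_4K so that each kernel_fpu_begin()/
 * kernel_fpu_end() section, which disables preemption, stays bounded.
 */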
void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
                       int nrounds)
{
        if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() ||
            bytes <= CHACHA_BLOCK_SIZE)
                return chacha_crypt_generic(state, dst, src, bytes, nrounds);

        do {
                unsigned int todo = min_t(unsigned int, bytes, SZ_4K);

                kernel_fpu_begin();
                chacha_dosimd(state, dst, src, todo, nrounds);
                kernel_fpu_end();

                bytes -= todo;
                src += todo;
                dst += todo;
        } while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);
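
/*
 * Common skcipher walk loop for the chacha20/xchacha20/xchacha12 algorithms
 * below. Every step except the last is rounded down to the walk stride so
 * that partial blocks only ever occur at the very end of the request.
 */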
static int chacha_simd_stream_xor(struct skcipher_request *req,
                                  const struct chacha_ctx *ctx, const u8 *iv)
{
        u32 state[CHACHA_STATE_WORDS] __aligned(8);
        struct skcipher_walk walk;
        int err;

        err = skcipher_walk_virt(&walk, req, false);

        chacha_init_generic(state, ctx->key, iv);

        while (walk.nbytes > 0) {
                unsigned int nbytes = walk.nbytes;

                if (nbytes < walk.total)
                        nbytes = round_down(nbytes, walk.stride);

                if (!static_branch_likely(&chacha_use_simd) ||
                    !crypto_simd_usable()) {
                        chacha_crypt_generic(state, walk.dst.virt.addr,
                                             walk.src.virt.addr, nbytes,
                                             ctx->nrounds);
                } else {
                        kernel_fpu_begin();
                        chacha_dosimd(state, walk.dst.virt.addr,
                                      walk.src.virt.addr, nbytes,
                                      ctx->nrounds);
                        kernel_fpu_end();
                }
                err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
        }

        return err;
}

static int chacha_simd(struct skcipher_request *req)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);

        return chacha_simd_stream_xor(req, ctx, req->iv);
}
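
/*
 * XChaCha: run HChaCha over the key and the first 16 IV bytes to derive the
 * subkey, then encrypt with plain ChaCha using an IV rebuilt from the last
 * 16 IV bytes (bytes 24..31 followed by bytes 16..23).
 */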
static int xchacha_simd(struct skcipher_request *req)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
        u32 state[CHACHA_STATE_WORDS] __aligned(8);
        struct chacha_ctx subctx;
        u8 real_iv[16];

        chacha_init_generic(state, ctx->key, req->iv);

        if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) {
                kernel_fpu_begin();
                hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
                kernel_fpu_end();
        } else {
                hchacha_block_generic(state, subctx.key, ctx->nrounds);
        }
        subctx.nrounds = ctx->nrounds;

        memcpy(&real_iv[0], req->iv + 24, 8);
        memcpy(&real_iv[8], req->iv + 16, 8);
        return chacha_simd_stream_xor(req, &subctx, real_iv);
}

static struct skcipher_alg algs[] = {
        {
                .base.cra_name          = "chacha20",
                .base.cra_driver_name   = "chacha20-simd",
                .base.cra_priority      = 300,
                .base.cra_blocksize     = 1,
                .base.cra_ctxsize       = sizeof(struct chacha_ctx),
                .base.cra_module        = THIS_MODULE,

                .min_keysize            = CHACHA_KEY_SIZE,
                .max_keysize            = CHACHA_KEY_SIZE,
                .ivsize                 = CHACHA_IV_SIZE,
                .chunksize              = CHACHA_BLOCK_SIZE,
                .setkey                 = chacha20_setkey,
                .encrypt                = chacha_simd,
                .decrypt                = chacha_simd,
        }, {
                .base.cra_name          = "xchacha20",
                .base.cra_driver_name   = "xchacha20-simd",
                .base.cra_priority      = 300,
                .base.cra_blocksize     = 1,
                .base.cra_ctxsize       = sizeof(struct chacha_ctx),
                .base.cra_module        = THIS_MODULE,

                .min_keysize            = CHACHA_KEY_SIZE,
                .max_keysize            = CHACHA_KEY_SIZE,
                .ivsize                 = XCHACHA_IV_SIZE,
                .chunksize              = CHACHA_BLOCK_SIZE,
                .setkey                 = chacha20_setkey,
                .encrypt                = xchacha_simd,
                .decrypt                = xchacha_simd,
        }, {
                .base.cra_name          = "xchacha12",
                .base.cra_driver_name   = "xchacha12-simd",
                .base.cra_priority      = 300,
                .base.cra_blocksize     = 1,
                .base.cra_ctxsize       = sizeof(struct chacha_ctx),
                .base.cra_module        = THIS_MODULE,

                .min_keysize            = CHACHA_KEY_SIZE,
                .max_keysize            = CHACHA_KEY_SIZE,
                .ivsize                 = XCHACHA_IV_SIZE,
                .chunksize              = CHACHA_BLOCK_SIZE,
                .setkey                 = chacha12_setkey,
                .encrypt                = xchacha_simd,
                .decrypt                = xchacha_simd,
        },
};
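
/*
 * Feature detection: SSSE3 is the baseline. The AVX2 path additionally needs
 * AVX, AVX2 and OS support for the SSE/YMM xstate; the AVX-512VL path on top
 * of that needs AVX512VL plus AVX512BW (for kmovq).
 */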
static int __init chacha_simd_mod_init(void)
{
        if (!boot_cpu_has(X86_FEATURE_SSSE3))
                return 0;

        static_branch_enable(&chacha_use_simd);

        if (boot_cpu_has(X86_FEATURE_AVX) &&
            boot_cpu_has(X86_FEATURE_AVX2) &&
            cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
                static_branch_enable(&chacha_use_avx2);

                if (IS_ENABLED(CONFIG_AS_AVX512) &&
                    boot_cpu_has(X86_FEATURE_AVX512VL) &&
                    boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
                        static_branch_enable(&chacha_use_avx512vl);
        }
        return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ?
                crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
}

static void __exit chacha_simd_mod_fini(void)
{
        if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) && boot_cpu_has(X86_FEATURE_SSSE3))
                crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}

module_init(chacha_simd_mod_init);
module_exit(chacha_simd_mod_fini);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Martin Willi <[email protected]>");
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)");
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-simd");
MODULE_ALIAS_CRYPTO("xchacha20");
MODULE_ALIAS_CRYPTO("xchacha20-simd");
MODULE_ALIAS_CRYPTO("xchacha12");
MODULE_ALIAS_CRYPTO("xchacha12-simd");