polyval-clmulni_asm.S

/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright 2021 Google LLC
 */

/*
 * This is an efficient implementation of POLYVAL using Intel PCLMULQDQ-NI
 * instructions. It works on 8 blocks at a time by precomputing the first 8
 * key powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation
 * allows us to split finite field multiplication into two steps.
 *
 * In the first step, we consider h^i, m_i as normal polynomials of degree less
 * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7, where multiplication
 * is simply polynomial multiplication.
 *
 * In the second step, we compute the reduction of p(x) modulo the finite field
 * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
 *
 * This two-step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
 * multiplication is finite field multiplication. The advantage is that the
 * two-step process only requires one finite field reduction for every 8
 * polynomial multiplications. Further parallelism is gained by interleaving the
 * multiplications and polynomial reductions.
 */
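
/*
 * The key powers are stored with h^8 first and h^1 last, so that message
 * block m_i is multiplied by the power at offset 16*i (see
 * schoolbook1_iteration and the clmul_polyval_update interface below).
 */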

#include <linux/linkage.h>
#include <asm/frame.h>

#define STRIDE_BLOCKS 8

#define GSTAR %xmm7
#define PL %xmm8
#define PH %xmm9
#define TMP_XMM %xmm11
#define LO %xmm12
#define HI %xmm13
#define MI %xmm14
#define SUM %xmm15

#define KEY_POWERS %rdi
#define MSG %rsi
#define BLOCKS_LEFT %rdx
#define ACCUMULATOR %rcx
#define TMP %rax
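
/*
 * g*(x), i.e. bits 64-127 of g(x) = x^128 + x^127 + x^126 + x^121 + 1, is
 * x^63 + x^62 + x^57, which is the bit pattern 0xc200000000000000. It is
 * stored in both halves of .Lgstar below so that either qword can be
 * selected by the PCLMULQDQ immediate during the reduction.
 */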

.section .rodata.cst16.gstar, "aM", @progbits, 16
.align 16
.Lgstar:
	.quad 0xc200000000000000, 0xc200000000000000

.text

/*
 * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length
 * count pointed to by MSG and KEY_POWERS.
 */
.macro schoolbook1 count
	.set i, 0
	.rept (\count)
		schoolbook1_iteration i 0
		.set i, (i + 1)
	.endr
.endm

/*
 * Computes the product of two 128-bit polynomials at the memory locations
 * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of
 * the 256-bit product into LO, MI, HI.
 *
 * Given:
 * X = [X_1 : X_0]
 * Y = [Y_1 : Y_0]
 *
 * We compute:
 * LO += X_0 * Y_0
 * MI += X_0 * Y_1 + X_1 * Y_0
 * HI += X_1 * Y_1
 *
 * Later, the 256-bit result can be extracted as:
 * [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
 * This step is done when computing the polynomial reduction for efficiency
 * reasons.
 *
 * If xor_sum == 1, then also XOR the value of SUM into m_0. This avoids an
 * extra multiplication of SUM and h^8.
 */
.macro schoolbook1_iteration i xor_sum
	movups (16*\i)(MSG), %xmm0
	.if (\i == 0 && \xor_sum == 1)
	pxor SUM, %xmm0
	.endif
	vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2	# cross term: msg_hi * key_lo
	vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1	# msg_lo * key_lo
	vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3	# cross term: msg_lo * key_hi
	vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4	# msg_hi * key_hi
	vpxor %xmm2, MI, MI
	vpxor %xmm1, LO, LO
	vpxor %xmm4, HI, HI
	vpxor %xmm3, MI, MI
.endm

/*
 * Performs the same computation as schoolbook1_iteration, except we expect the
 * arguments to already be loaded into xmm0 and xmm1 and we set the result
 * registers LO, MI, and HI directly rather than XOR'ing into them.
 */
.macro schoolbook1_noload
	vpclmulqdq $0x01, %xmm0, %xmm1, MI
	vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2
	vpclmulqdq $0x00, %xmm0, %xmm1, LO
	vpclmulqdq $0x11, %xmm0, %xmm1, HI
	vpxor %xmm2, MI, MI	# MI = sum of both cross terms
.endm

/*
 * Combines LO, MI, HI into the 256-bit polynomial product. Stores
 * the result in PL, PH.
 * [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
 */
.macro schoolbook2
	vpslldq $8, MI, PL	# PL = [MI_0 : 0]
	vpsrldq $8, MI, PH	# PH = [0 : MI_1]
	pxor LO, PL		# PL = [LO_1 + MI_0 : LO_0]
	pxor HI, PH		# PH = [HI_1 : HI_0 + MI_1]
.endm
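
/*
 * For illustration only, a rough C sketch of the schoolbook split used by
 * the two macros above (this is not kernel code; the u64 typedef, struct
 * u128, and clmul64() are made-up helpers for the sketch):
 *
 *	typedef unsigned long long u64;
 *	struct u128 { u64 lo, hi; };
 *
 *	// Carry-less multiply of two 64-bit polynomials into a 128-bit result.
 *	static struct u128 clmul64(u64 a, u64 b)
 *	{
 *		struct u128 r = { 0, 0 };
 *		int i;
 *
 *		for (i = 0; i < 64; i++) {
 *			if ((b >> i) & 1) {
 *				r.lo ^= a << i;
 *				if (i)
 *					r.hi ^= a >> (64 - i);
 *			}
 *		}
 *		return r;
 *	}
 *
 *	// 256-bit product of X and Y, least significant word first:
 *	// out = { LO_0, LO_1 + MI_0, HI_0 + MI_1, HI_1 }
 *	static void schoolbook(u64 out[4], struct u128 x, struct u128 y)
 *	{
 *		struct u128 lo = clmul64(x.lo, y.lo);
 *		struct u128 hi = clmul64(x.hi, y.hi);
 *		struct u128 a = clmul64(x.lo, y.hi);
 *		struct u128 b = clmul64(x.hi, y.lo);
 *		struct u128 mi = { a.lo ^ b.lo, a.hi ^ b.hi };
 *
 *		out[0] = lo.lo;
 *		out[1] = lo.hi ^ mi.lo;
 *		out[2] = hi.lo ^ mi.hi;
 *		out[3] = hi.hi;
 *	}
 */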

/*
 * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
 *
 * This macro computes p(x) mod g(x) where p(x) is in Montgomery form and g(x) =
 * x^128 + x^127 + x^126 + x^121 + 1.
 *
 * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
 * product of two 128-bit polynomials in Montgomery form. We need to reduce it
 * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
 * of x^128, this product has two extra factors of x^128. To get it back into
 * Montgomery form, we need to remove one of these factors by dividing by x^128.
 *
 * To accomplish both of these goals, we add multiples of g(x) that cancel out
 * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
 * bits are zero, the polynomial division by x^128 can be done by right shifting.
 *
 * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
 * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
 * only do 64x64-bit carry-less multiplications, so split P_0 * g(x) into
 * x^128 * P_0 + x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x).
 * Adding this to the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 +
 * T_0 : 0, where T = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 get "folded"
 * into bits 64-191.
 *
 * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
 * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
 * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
 * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
 * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
 *
 * So our final computation is:
 * T = T_1 : T_0 = g*(x) * P_0
 * V = V_1 : V_0 = g*(x) * (P_1 + T_0)
 * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
 *
 * The implementation below saves an XOR instruction by computing P_1 + T_0 :
 * P_0 + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and
 * T_0 : T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
 */
.macro montgomery_reduction dest
	vpclmulqdq $0x00, PL, GSTAR, TMP_XMM	# TMP_XMM = T_1 : T_0 = P_0 * g*(x)
	pshufd $0b01001110, TMP_XMM, TMP_XMM	# TMP_XMM = T_0 : T_1
	pxor PL, TMP_XMM			# TMP_XMM = P_1 + T_0 : P_0 + T_1
	pxor TMP_XMM, PH			# PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
	pclmulqdq $0x11, GSTAR, TMP_XMM		# TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)]
	vpxor TMP_XMM, PH, \dest
.endm
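
/*
 * For illustration only, the same reduction in rough C, reusing the made-up
 * struct u128 and clmul64() helpers from the sketch after schoolbook2
 * (p[] holds P_0..P_3, least significant word first):
 *
 *	static struct u128 montgomery_reduce(const u64 p[4])
 *	{
 *		const u64 gstar = 0xc200000000000000ULL;	// x^63 + x^62 + x^57
 *		struct u128 t = clmul64(p[0], gstar);		// T = g*(x) * P_0
 *		struct u128 v = clmul64(p[1] ^ t.lo, gstar);	// V = g*(x) * (P_1 + T_0)
 *		struct u128 r;
 *
 *		r.lo = p[2] ^ p[0] ^ t.hi ^ v.lo;	// P_2 + P_0 + T_1 + V_0
 *		r.hi = p[3] ^ p[1] ^ t.lo ^ v.hi;	// P_3 + P_1 + T_0 + V_1
 *		return r;
 *	}
 */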

/*
 * Compute schoolbook multiplication for 8 blocks
 * m_0h^8 + ... + m_7h^1
 *
 * If reduce is set, also computes the Montgomery reduction of the
 * previous full_stride call and XORs it with the first message block.
 * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
 * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
 * The reduction steps are the same instructions as in montgomery_reduction,
 * spread across the iterations so that they overlap with the multiplications.
 */
.macro full_stride reduce
	pxor LO, LO
	pxor HI, HI
	pxor MI, MI

	schoolbook1_iteration 7 0
	.if \reduce
	vpclmulqdq $0x00, PL, GSTAR, TMP_XMM	# TMP_XMM = T_1 : T_0 = P_0 * g*(x)
	.endif

	schoolbook1_iteration 6 0
	.if \reduce
	pshufd $0b01001110, TMP_XMM, TMP_XMM	# TMP_XMM = T_0 : T_1
	.endif

	schoolbook1_iteration 5 0
	.if \reduce
	pxor PL, TMP_XMM			# TMP_XMM = P_1 + T_0 : P_0 + T_1
	.endif

	schoolbook1_iteration 4 0
	.if \reduce
	pxor TMP_XMM, PH			# PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
	.endif

	schoolbook1_iteration 3 0
	.if \reduce
	pclmulqdq $0x11, GSTAR, TMP_XMM		# TMP_XMM = V = (P_1 + T_0) * g*(x)
	.endif

	schoolbook1_iteration 2 0
	.if \reduce
	vpxor TMP_XMM, PH, SUM			# SUM = reduction of the previous stride
	.endif

	schoolbook1_iteration 1 0

	schoolbook1_iteration 0 1

	addq $(8*16), MSG
	schoolbook2
.endm

/*
 * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS
 */
.macro partial_stride
	mov BLOCKS_LEFT, TMP
	shlq $4, TMP				# TMP = 16 * BLOCKS_LEFT
	addq $(16*STRIDE_BLOCKS), KEY_POWERS
	subq TMP, KEY_POWERS			# KEY_POWERS points to h^BLOCKS_LEFT

	/* Multiply (m_0 + SUM) by the largest key power needed, h^BLOCKS_LEFT. */
	movups (MSG), %xmm0
	pxor SUM, %xmm0
	movaps (KEY_POWERS), %xmm1
	schoolbook1_noload
	dec BLOCKS_LEFT
	addq $16, MSG
	addq $16, KEY_POWERS

	/* Handle the remaining blocks in groups of 4, 2, and 1. */
	test $4, BLOCKS_LEFT
	jz .Lpartial4BlocksDone
	schoolbook1 4
	addq $(4*16), MSG
	addq $(4*16), KEY_POWERS
.Lpartial4BlocksDone:
	test $2, BLOCKS_LEFT
	jz .Lpartial2BlocksDone
	schoolbook1 2
	addq $(2*16), MSG
	addq $(2*16), KEY_POWERS
.Lpartial2BlocksDone:
	test $1, BLOCKS_LEFT
	jz .LpartialDone
	schoolbook1 1
.LpartialDone:
	schoolbook2
	montgomery_reduction SUM
.endm

/*
 * Perform Montgomery multiplication in GF(2^128) and store result in op1.
 *
 * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1
 * If op1, op2 are in Montgomery form, this computes the Montgomery
 * form of op1*op2.
 *
 * void clmul_polyval_mul(u8 *op1, const u8 *op2);
 */
SYM_FUNC_START(clmul_polyval_mul)
	FRAME_BEGIN
	vmovdqa .Lgstar(%rip), GSTAR
	movups (%rdi), %xmm0
	movups (%rsi), %xmm1
	schoolbook1_noload
	schoolbook2
	montgomery_reduction SUM
	movups SUM, (%rdi)
	FRAME_END
	RET
SYM_FUNC_END(clmul_polyval_mul)

/*
 * Perform polynomial evaluation as specified by POLYVAL. This computes:
 * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
 * where n=nblocks, h is the hash key, and m_i are the message blocks.
 *
 * rdi - pointer to precomputed key powers h^8 ... h^1
 * rsi - pointer to message blocks
 * rdx - number of blocks to hash
 * rcx - pointer to the accumulator
 *
 * void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
 *			     const u8 *in, size_t nblocks, u8 *accumulator);
 */
SYM_FUNC_START(clmul_polyval_update)
	FRAME_BEGIN
	vmovdqa .Lgstar(%rip), GSTAR
	movups (ACCUMULATOR), SUM
	subq $STRIDE_BLOCKS, BLOCKS_LEFT
	js .LstrideLoopExit			# fewer than STRIDE_BLOCKS blocks
	full_stride 0				# first stride: no pending reduction
	subq $STRIDE_BLOCKS, BLOCKS_LEFT
	js .LstrideLoopExitReduce
.LstrideLoop:
	full_stride 1				# fold in the previous stride's reduction
	subq $STRIDE_BLOCKS, BLOCKS_LEFT
	jns .LstrideLoop
.LstrideLoopExitReduce:
	montgomery_reduction SUM		# reduce the last full stride
.LstrideLoopExit:
	add $STRIDE_BLOCKS, BLOCKS_LEFT		# BLOCKS_LEFT = leftover block count
	jz .LskipPartial
	partial_stride
.LskipPartial:
	movups SUM, (ACCUMULATOR)
	FRAME_END
	RET
SYM_FUNC_END(clmul_polyval_update)