ghash-clmulni-intel_asm.S 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. /* SPDX-License-Identifier: GPL-2.0-only */
  2. /*
  3. * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
  4. * instructions. This file contains the accelerated part of the GHASH
  5. * implementation. More information about PCLMULQDQ can be found at:
  6. *
  7. * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
  8. *
  9. * Copyright (c) 2009 Intel Corp.
  10. * Author: Huang Ying <[email protected]>
  11. * Vinodh Gopal
  12. * Erdinc Ozturk
  13. * Deniz Karakoyunlu
  14. */
  15. #include <linux/linkage.h>
  16. #include <asm/frame.h>
  /*
   * Byte-reversal control mask for pshufb: destination byte i is taken from
   * source byte 15 - i. Kept in a mergeable, 16-byte-aligned read-only
   * constant section so it can be loaded with movaps.
   */
  .section .rodata.cst16.bswap_mask, "aM", @progbits, 16
  .align 16
  .Lbswap_mask:
          .octa 0x000102030405060708090a0b0c0d0e0f

  /* Fixed XMM register roles shared by the routines in this file. */
  #define DATA    %xmm0   /* running value: operand1 on entry, product on exit */
  #define SHASH   %xmm1   /* operand2: the hash key value */
  #define T1      %xmm2   /* scratch, clobbered by __clmul_gf128mul_ble */
  #define T2      %xmm3   /* scratch, clobbered by __clmul_gf128mul_ble */
  #define T3      %xmm4   /* scratch, clobbered by __clmul_gf128mul_ble */
  #define BSWAP   %xmm5   /* holds .Lbswap_mask */
  #define IN1     %xmm6   /* current 16-byte input block */

  .text
  /*
   * __clmul_gf128mul_ble: carry-less 128 x 128 bit multiply followed by a
   * reduction. Internal ABI: operands and result are passed in the XMM
   * registers named below, not in the C argument registers.
   *
   * input:
   *      DATA:   operand1 (a = <a1:a0>)
   *      SHASH:  operand2 (b = <b1:b0>), hash_key << 1 mod poly
   * output:
   *      DATA:   operand1 * operand2 mod poly
   * changed:
   *      T1
   *      T2
   *      T3
   */
  SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble)
          /* Karatsuba inputs: low qword of T2 = a1 + a0, of T3 = b1 + b0 */
          pshufd $0x4e, DATA, T2          # T2 = a with 64-bit halves swapped
          pxor DATA, T2
          pshufd $0x4e, SHASH, T3         # T3 = b with 64-bit halves swapped
          pxor SHASH, T3
          movaps DATA, T1                 # copy of a; DATA is overwritten next

          /* three 64 x 64 carry-less products */
          pclmulqdq $0x00, SHASH, DATA    # DATA = a0 * b0
          pclmulqdq $0x11, SHASH, T1      # T1 = a1 * b1
          pclmulqdq $0x00, T3, T2         # T2 = (a1 + a0) * (b1 + b0)
          pxor T1, T2
          pxor DATA, T2                   # T2 = a0 * b1 + a1 * b0

          /* add the middle term, shifted by 64 bits, into <T1:DATA> */
          movaps T2, T3
          pslldq $8, T3                   # low qword of T2 -> high qword
          pxor T3, DATA
          psrldq $8, T2                   # high qword of T2 -> low qword
          pxor T2, T1                     # <T1:DATA> is result of
                                          # carry-less multiplication

          /*
           * first phase of the reduction: per 64-bit lane,
           * T3 = (DATA << 63) ^ (DATA << 62) ^ (DATA << 57)
           */
          movaps DATA, T3
          psllq $1, T3
          pxor DATA, T3
          psllq $5, T3
          pxor DATA, T3
          psllq $57, T3
          movaps T3, T2
          pslldq $8, T2                   # low qword of T3 -> high qword
          pxor T2, DATA
          psrldq $8, T3                   # high qword of T3 -> low qword
          pxor T3, T1

          /*
           * second phase of the reduction: per 64-bit lane,
           * T2 = (DATA >> 1) ^ (DATA >> 2) ^ (DATA >> 7)
           */
          movaps DATA, T2
          psrlq $5, T2
          pxor DATA, T2
          psrlq $1, T2
          pxor DATA, T2
          psrlq $1, T2
          pxor T2, T1
          pxor T1, DATA                   # DATA = reduced 128-bit result
          RET
  SYM_FUNC_END(__clmul_gf128mul_ble)
  /*
   * void clmul_ghash_mul(char *dst, const u128 *shash)
   *
   * Multiply the 16-byte value at dst by the value at shash and store the
   * product back to dst.
   *
   * input:
   *      %rdi:   dst, 16 bytes, read and written (no alignment required)
   *      %rsi:   shash, 16 bytes, read only (no alignment required); passed
   *              to __clmul_gf128mul_ble as operand2, which expects
   *              hash_key << 1 mod poly
   * changed:
   *      DATA, SHASH, T1, T2, T3, BSWAP (%xmm0 - %xmm5)
   */
  SYM_FUNC_START(clmul_ghash_mul)
          FRAME_BEGIN
          movups (%rdi), DATA
          movups (%rsi), SHASH
          # RIP-relative load keeps the code position independent and
          # avoids an absolute relocation against .Lbswap_mask
          movaps .Lbswap_mask(%rip), BSWAP
          pshufb BSWAP, DATA              # reverse byte order for the multiply
          call __clmul_gf128mul_ble
          pshufb BSWAP, DATA              # restore the byte order used in memory
          movups DATA, (%rdi)
          FRAME_END
          RET
  SYM_FUNC_END(clmul_ghash_mul)
  /*
   * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
   *                         const u128 *shash);
   *
   * For every complete 16-byte block of src: XOR the block into the running
   * value loaded from dst, then multiply by the value at shash. The final
   * value is stored back to dst.
   *
   * input:
   *      %rdi:   dst, 16 bytes, read and written (no alignment required)
   *      %rsi:   src, input bytes (no alignment required)
   *      %edx:   srclen in bytes. Only the low 32 bits are used: the argument
   *              is a 32-bit unsigned int, so bits 63:32 of %rdx are not
   *              defined by the calling convention.
   *      %rcx:   shash, 16 bytes, read only; passed to __clmul_gf128mul_ble
   *              as operand2
   *
   * If srclen < 16 nothing is read from src and dst is left untouched.
   * A trailing partial block (srclen % 16 bytes) is not consumed.
   *
   * changed:
   *      %rsi, %rdx, flags, DATA, SHASH, T1, T2, T3, BSWAP, IN1
   *      (%xmm0 - %xmm6)
   */
  SYM_FUNC_START(clmul_ghash_update)
          FRAME_BEGIN
          cmp $16, %edx
          jb .Lupdate_just_ret            # check length
          # RIP-relative load keeps the code position independent and
          # avoids an absolute relocation against .Lbswap_mask
          movaps .Lbswap_mask(%rip), BSWAP
          movups (%rdi), DATA
          movups (%rcx), SHASH
          pshufb BSWAP, DATA              # reverse byte order for the multiply
  .align 4
  .Lupdate_loop:
          # invariant: %edx = bytes remaining, at least 16 here
          movups (%rsi), IN1
          pshufb BSWAP, IN1
          pxor IN1, DATA
          call __clmul_gf128mul_ble
          sub $16, %edx
          add $16, %rsi
          cmp $16, %edx
          jae .Lupdate_loop               # unsigned: srclen is unsigned int
          pshufb BSWAP, DATA              # restore the byte order used in memory
          movups DATA, (%rdi)
  .Lupdate_just_ret:
          FRAME_END
          RET
  SYM_FUNC_END(clmul_ghash_update)
  121. SYM_FUNC_END(clmul_ghash_update)