crc32-pclmul_asm.S

/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using the hardware-provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is an instruction introduced alongside Intel SSE4.2; the
 * reference can be found at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors: Gregory Prestas <[email protected]>
 *          Alexander Boyko <[email protected]>
 */
#include <linux/linkage.h>

.section .rodata
.align 16
/*
 * [(x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
 * #define CONSTANT_R1 0x154442bd4LL
 *
 * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
 * #define CONSTANT_R2 0x1c6e41596LL
 */
.Lconstant_R2R1:
	.octa 0x00000001c6e415960000000154442bd4
/*
 * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
 * #define CONSTANT_R3 0x1751997d0LL
 *
 * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
 * #define CONSTANT_R4 0x0ccaa009eLL
 */
.Lconstant_R4R3:
	.octa 0x00000000ccaa009e00000001751997d0
/*
 * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
 * #define CONSTANT_R5 0x163cd6124LL
 */
.Lconstant_R5:
	.octa 0x00000000000000000000000163cd6124
.Lconstant_mask32:
	.octa 0x000000000000000000000000FFFFFFFF
/*
 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
 *
 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
 * #define CONSTANT_RU 0x1F7011641LL
 */
.Lconstant_RUpoly:
	.octa 0x00000001F701164100000001DB710641
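
/*
 * Layout note: each .octa above packs two of the 64-bit constants from
 * the comments, the first one in its low quadword and the second in its
 * high quadword.  pclmulqdq selects the halves through its immediate:
 * bit 0 picks the quadword of the destination register and bit 4 picks
 * the quadword of the source, so $0x00 is low*low, $0x11 is high*high,
 * and $0x10 is destination-low * source-high.
 */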

#define CONSTANT %xmm0

#ifdef __x86_64__
#define BUF	%rdi
#define LEN	%rsi
#define CRC	%edx
#else
#define BUF	%eax
#define LEN	%edx
#define CRC	%ecx
#endif
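
/*
 * Argument registers: on x86_64 the three arguments arrive in
 * %rdi/%rsi/%edx per the SysV calling convention; 32-bit kernels are
 * built with -mregparm=3, so they arrive in %eax/%edx/%ecx instead.
 */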

.text
/**
 *	Calculate crc32
 *	BUF - buffer (16 bytes aligned)
 *	LEN - length of buffer in bytes (16 bytes aligned), must be greater than 63
 *	CRC - initial crc32
 *	return %eax crc32
 *	uint crc32_pclmul_le_16(unsigned char const *buffer,
 *				size_t len, uint crc32)
 */
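/*
 * Caller contract (a sketch of how the C glue code is expected to drive
 * this routine; the snippet is illustrative, not the actual glue): only
 * the 16-byte-aligned portion of the data, with a length that is a
 * multiple of 16 and at least 64, is passed here, and the call must sit
 * inside kernel_fpu_begin()/kernel_fpu_end() because XMM registers are
 * clobbered.  Roughly:
 *
 *	kernel_fpu_begin();
 *	crc = crc32_pclmul_le_16(p, len & ~15UL, crc);
 *	kernel_fpu_end();
 *	// any unaligned head and <16 byte tail go through crc32_le()
 */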
SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */
	movdqa	(BUF), %xmm1
	movdqa	0x10(BUF), %xmm2
	movdqa	0x20(BUF), %xmm3
	movdqa	0x30(BUF), %xmm4
	movd	CRC, CONSTANT
	pxor	CONSTANT, %xmm1
	sub	$0x40, LEN
	add	$0x40, BUF
	cmp	$0x40, LEN
	jb	less_64
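
	/*
	 * Prologue: the first 64 bytes are live in %xmm1-%xmm4 and the
	 * initial CRC has been xor'ed into the low 32 bits of the first
	 * block.  BUF/LEN already point past these 64 bytes; if fewer
	 * than 64 bytes remain, skip the 4-way folding loop.
	 */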

#ifdef __x86_64__
	movdqa	.Lconstant_R2R1(%rip), CONSTANT
#else
	movdqa	.Lconstant_R2R1, CONSTANT
#endif
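
/*
 * CONSTANT now holds R1 in its low quadword and R2 in its high quadword.
 * Each pass of loop_64 folds the four 128-bit accumulators forward
 * across the next 64 bytes of input.
 */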
loop_64:/* 64 bytes Full cache line folding */
	prefetchnta	0x40(BUF)
	movdqa	%xmm1, %xmm5
	movdqa	%xmm2, %xmm6
	movdqa	%xmm3, %xmm7
#ifdef __x86_64__
	movdqa	%xmm4, %xmm8
#endif
	pclmulqdq	$0x00, CONSTANT, %xmm1
	pclmulqdq	$0x00, CONSTANT, %xmm2
	pclmulqdq	$0x00, CONSTANT, %xmm3
#ifdef __x86_64__
	pclmulqdq	$0x00, CONSTANT, %xmm4
#endif
	pclmulqdq	$0x11, CONSTANT, %xmm5
	pclmulqdq	$0x11, CONSTANT, %xmm6
	pclmulqdq	$0x11, CONSTANT, %xmm7
#ifdef __x86_64__
	pclmulqdq	$0x11, CONSTANT, %xmm8
#endif
	pxor	%xmm5, %xmm1
	pxor	%xmm6, %xmm2
	pxor	%xmm7, %xmm3
#ifdef __x86_64__
	pxor	%xmm8, %xmm4
#else
	/* %xmm8 is not available in 32-bit mode */
	movdqa	%xmm4, %xmm5
	pclmulqdq	$0x00, CONSTANT, %xmm4
	pclmulqdq	$0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm4
#endif
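
	/*
	 * Each accumulator now holds (low half * R1) xor (high half * R2);
	 * xor in the next 64 bytes of data to complete the fold.
	 */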
	pxor	(BUF), %xmm1
	pxor	0x10(BUF), %xmm2
	pxor	0x20(BUF), %xmm3
	pxor	0x30(BUF), %xmm4

	sub	$0x40, LEN
	add	$0x40, BUF
	cmp	$0x40, LEN
	jge	loop_64

less_64:/* Folding cache line into 128bit */
#ifdef __x86_64__
	movdqa	.Lconstant_R4R3(%rip), CONSTANT
#else
	movdqa	.Lconstant_R4R3, CONSTANT
#endif
	prefetchnta	(BUF)

	movdqa	%xmm1, %xmm5
	pclmulqdq	$0x00, CONSTANT, %xmm1
	pclmulqdq	$0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm1
	pxor	%xmm2, %xmm1

	movdqa	%xmm1, %xmm5
	pclmulqdq	$0x00, CONSTANT, %xmm1
	pclmulqdq	$0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm1
	pxor	%xmm3, %xmm1

	movdqa	%xmm1, %xmm5
	pclmulqdq	$0x00, CONSTANT, %xmm1
	pclmulqdq	$0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm1
	pxor	%xmm4, %xmm1
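
	/*
	 * The four accumulators are now folded into a single 128-bit
	 * value in %xmm1, using R3 (low quadword of CONSTANT) and R4
	 * (high quadword); the same constants fold in any remaining
	 * whole 16-byte blocks below.
	 */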

	cmp	$0x10, LEN
	jb	fold_64
loop_16:/* Folding rest buffer into 128bit */
	movdqa	%xmm1, %xmm5
	pclmulqdq	$0x00, CONSTANT, %xmm1
	pclmulqdq	$0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm1
	pxor	(BUF), %xmm1
	sub	$0x10, LEN
	add	$0x10, BUF
	cmp	$0x10, LEN
	jge	loop_16

fold_64:
	/* perform the last 64 bit fold, also adds 32 zeroes
	 * to the input stream */
	pclmulqdq	$0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
	psrldq	$0x08, %xmm1
	pxor	CONSTANT, %xmm1
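
	/* %xmm1 = (old high quadword) xor (R4 * old low quadword) */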

	/* final 32-bit fold */
	movdqa	%xmm1, %xmm2
#ifdef __x86_64__
	movdqa	.Lconstant_R5(%rip), CONSTANT
	movdqa	.Lconstant_mask32(%rip), %xmm3
#else
	movdqa	.Lconstant_R5, CONSTANT
	movdqa	.Lconstant_mask32, %xmm3
#endif
	psrldq	$0x04, %xmm2
	pand	%xmm3, %xmm1
	pclmulqdq	$0x00, CONSTANT, %xmm1
	pxor	%xmm2, %xmm1
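
	/*
	 * The low 32 bits were multiplied by R5 and xor'ed with the upper
	 * bits shifted right by 32, so the low 64 bits of %xmm1 now hold
	 * the value to be reduced modulo P(x).
	 */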

	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
#ifdef __x86_64__
	movdqa	.Lconstant_RUpoly(%rip), CONSTANT
#else
	movdqa	.Lconstant_RUpoly, CONSTANT
#endif
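
	/*
	 * The sequence below computes T1 = (R & 0xffffffff) * RU, then
	 * T2 = (T1 & 0xffffffff) * P(x), and finally C = R xor T2.  With
	 * the bit-reflected convention the reduced 32-bit CRC lands in
	 * bits 32-63 of %xmm1, hence the pextrd of dword 1 at the end.
	 */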
	movdqa	%xmm1, %xmm2
	pand	%xmm3, %xmm1
	pclmulqdq	$0x10, CONSTANT, %xmm1
	pand	%xmm3, %xmm1
	pclmulqdq	$0x00, CONSTANT, %xmm1
	pxor	%xmm2, %xmm1
	pextrd	$0x01, %xmm1, %eax

	RET
SYM_FUNC_END(crc32_pclmul_le_16)