nh-sse2-x86_64.S 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123
  1. /* SPDX-License-Identifier: GPL-2.0 */
  2. /*
  3. * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
  4. *
  5. * Copyright 2018 Google LLC
  6. *
  7. * Author: Eric Biggers <[email protected]>
  8. */
  9. #include <linux/linkage.h>
  /*
   * Register aliases used by the code in this file.
   *
   *   %xmm0  - %xmm3    running 64-bit sums, one vector register per pass
   *   %xmm4  - %xmm7    key strides
   *   %xmm8  - %xmm15   temporaries
   *   %rdi, %rsi, %rdx, %rcx
   *                     the four arguments of nh_sse2(), in declaration order
   */

/* Per-pass accumulators */
#define PASS0_SUMS      %xmm0
#define PASS1_SUMS      %xmm1
#define PASS2_SUMS      %xmm2
#define PASS3_SUMS      %xmm3

/* Key strides */
#define K0              %xmm4
#define K1              %xmm5
#define K2              %xmm6
#define K3              %xmm7

/* Temporaries */
#define T0              %xmm8
#define T1              %xmm9
#define T2              %xmm10
#define T3              %xmm11
#define T4              %xmm12
#define T5              %xmm13
#define T6              %xmm14
#define T7              %xmm15

/* Function arguments */
#define KEY             %rdi
#define MESSAGE         %rsi
#define MESSAGE_LEN     %rdx
#define HASH            %rcx
  /*
   * _nh_stride k0, k1, k2, k3, offset
   *
   * Fold one 16-byte message stride into all four pass accumulators.
   *
   *   k0, k1, k2  registers holding the key strides for passes 0, 1 and 2
   *   k3          register that receives the 16 bytes at offset(KEY) and is
   *               then used as the key stride for pass 3
   *   offset      byte displacement applied to both MESSAGE and KEY
   *
   * For each pass the four 32-bit sums w0..w3 = message words + key words
   * are formed, and the 64-bit products w0*w2 and w1*w3 are added to that
   * pass's accumulator.
   *
   * On exit k0 holds pass 0's products (its key words are gone), k3 holds
   * the newly loaded key stride, and k1/k2 are unchanged.  T1-T7 are
   * clobbered.
   */
.macro _nh_stride k0, k1, k2, k3, offset

        // T1 = the 16 message bytes of this stride
        movdqu          \offset(MESSAGE), T1

        // k3 = the 16 key bytes at the same displacement from KEY
        movdqu          \offset(KEY), \k3

        // message + key, one result register per pass:
        // pass 0 -> k0, pass 1 -> T1, pass 2 -> T2, pass 3 -> T3
        movdqa          T1, T2
        movdqa          T1, T3
        paddd           T1, \k0         // sum lands in k0, which saves a copy
        paddd           \k1, T1
        paddd           \k2, T2
        paddd           \k3, T3

        // pmuludq only multiplies dwords 0 and 2 of its operands.  Shuffle
        // $0x10 places source dwords 0 and 1 in those lanes and shuffle
        // $0x32 places source dwords 2 and 3 there, so each pmuludq below
        // yields the two 64-bit products w0*w2 and w1*w3.
        pshufd          $0x10, \k0, T4
        pshufd          $0x32, \k0, \k0
        pshufd          $0x10, T1, T5
        pshufd          $0x32, T1, T1
        pshufd          $0x10, T2, T6
        pshufd          $0x32, T2, T2
        pshufd          $0x10, T3, T7
        pshufd          $0x32, T3, T3

        pmuludq         T4, \k0
        pmuludq         T5, T1
        pmuludq         T6, T2
        pmuludq         T7, T3

        // Add each pass's pair of products to its running sums
        paddq           \k0, PASS0_SUMS
        paddq           T1, PASS1_SUMS
        paddq           T2, PASS2_SUMS
        paddq           T3, PASS3_SUMS
.endm
  /*
   * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
   *              u8 hash[NH_HASH_BYTES])
   *
   * It's guaranteed that message_len % 16 == 0.
   *
   * In:   KEY (%rdi)          key; message_len + 0x30 bytes of it are read
   *       MESSAGE (%rsi)      message; consumed in 16-byte strides
   *       MESSAGE_LEN (%rdx)  message length in bytes
   *       HASH (%rcx)         output buffer; 32 bytes are written
   * Out:  four 64-bit sums stored at HASH, pass 0 first, pass 3 last
   * Modifies: %rdi, %rsi, %rdx, %xmm0-%xmm15, flags
   */
SYM_FUNC_START(nh_sse2)

        // Preload the key strides for passes 0-2.  Each _nh_stride loads
        // the one it still needs from offset(KEY), so KEY is moved past
        // the three strides that are already in registers.
        movdqu          0x00(KEY), K0
        movdqu          0x10(KEY), K1
        movdqu          0x20(KEY), K2
        add             $0x30, KEY

        // Start every pass with zeroed accumulators
        pxor            PASS0_SUMS, PASS0_SUMS
        pxor            PASS1_SUMS, PASS1_SUMS
        pxor            PASS2_SUMS, PASS2_SUMS
        pxor            PASS3_SUMS, PASS3_SUMS

        // MESSAGE_LEN is kept 0x40 below the number of unprocessed bytes,
        // so a negative value means no full group of four strides is left.
        sub             $0x40, MESSAGE_LEN
        jl              .Ltail

.Lstride4_loop:
        // Four strides per iteration.  The key register roles rotate by
        // one on every stride and are back in their starting positions
        // after the fourth.
        _nh_stride      K0, K1, K2, K3, 0x00
        _nh_stride      K1, K2, K3, K0, 0x10
        _nh_stride      K2, K3, K0, K1, 0x20
        _nh_stride      K3, K0, K1, K2, 0x30
        add             $0x40, KEY
        add             $0x40, MESSAGE
        sub             $0x40, MESSAGE_LEN
        jge             .Lstride4_loop

.Ltail:
        // Subtracting multiples of 0x40 never touched the low six bits,
        // so masking recovers the leftover byte count.  Given the length
        // guarantee that is 0, 16, 32 or 48, i.e. at most three strides.
        and             $0x3f, MESSAGE_LEN
        jz              .Lfinish
        _nh_stride      K0, K1, K2, K3, 0x00

        sub             $0x10, MESSAGE_LEN
        jz              .Lfinish
        _nh_stride      K1, K2, K3, K0, 0x10

        sub             $0x10, MESSAGE_LEN
        jz              .Lfinish
        _nh_stride      K2, K3, K0, K1, 0x20

.Lfinish:
        // Each accumulator holds two partial sums (A = low qword,
        // B = high qword).  Pair up the A halves and the B halves of
        // neighbouring passes, add them, and write the totals to 'hash'.
        movdqa          PASS0_SUMS, T0
        movdqa          PASS2_SUMS, T1
        punpcklqdq      PASS1_SUMS, T0          // T0 = (PASS0_SUM_A PASS1_SUM_A)
        punpcklqdq      PASS3_SUMS, T1          // T1 = (PASS2_SUM_A PASS3_SUM_A)
        punpckhqdq      PASS1_SUMS, PASS0_SUMS  // => (PASS0_SUM_B PASS1_SUM_B)
        punpckhqdq      PASS3_SUMS, PASS2_SUMS  // => (PASS2_SUM_B PASS3_SUM_B)
        paddq           PASS0_SUMS, T0
        paddq           PASS2_SUMS, T1
        movdqu          T0, 0x00(HASH)
        movdqu          T1, 0x10(HASH)
        RET
SYM_FUNC_END(nh_sse2)