// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <[email protected]>
 * Copyright (C) 2018, Tianjin KYLIN Information Technology Co., Ltd.
 */

#include <linux/raid/xor.h>
#include <linux/module.h>
#include <asm/neon-intrinsics.h>
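
/*
 * Plain NEON XOR helpers: each loop iteration loads 64 bytes (four
 * 128-bit vectors) from every source buffer and XORs them into p1
 * in place.
 */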
void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1,
		      const unsigned long * __restrict p2)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
	} while (--lines > 0);
}
void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}
void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3,
		      const unsigned long * __restrict p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}
void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3,
		      const unsigned long * __restrict p4,
		      const unsigned long * __restrict p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* p1 ^= p5 */
		v0 = veorq_u64(v0, vld1q_u64(dp5 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp5 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp5 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}
struct xor_block_template xor_block_inner_neon __ro_after_init = {
	.name	= "__inner_neon__",
	.do_2	= xor_arm64_neon_2,
	.do_3	= xor_arm64_neon_3,
	.do_4	= xor_arm64_neon_4,
	.do_5	= xor_arm64_neon_5,
};
EXPORT_SYMBOL(xor_block_inner_neon);
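
/*
 * EOR3 variants: on CPUs with the SHA3 extension, the EOR3 instruction
 * XORs three vectors in a single operation, cutting down the number of
 * XOR instructions needed in the 3-, 4- and 5-source loops.
 */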
static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
{
	uint64x2_t res;

	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
	    : "=w"(res) : "w"(p), "w"(q), "w"(r));

	return res;
}
static void xor_arm64_eor3_3(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}
static void xor_arm64_eor3_4(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3,
			     const unsigned long * __restrict p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}
static void xor_arm64_eor3_5(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3,
			     const unsigned long * __restrict p4,
			     const unsigned long * __restrict p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 ^ p5 */
		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}
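
/*
 * At init time, switch the 3-, 4- and 5-source helpers over to the EOR3
 * versions when the assembler supports SHA3 (CONFIG_AS_HAS_SHA3) and the
 * CPU advertises the SHA3 feature.
 */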
static int __init xor_neon_init(void)
{
	if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
		xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
		xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
		xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
	}
	return 0;
}
module_init(xor_neon_init);

static void __exit xor_neon_exit(void)
{
}
module_exit(xor_neon_exit);

MODULE_AUTHOR("Jackie Liu <[email protected]>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");