xor.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502
  1. /* SPDX-License-Identifier: GPL-2.0-or-later */
  2. #ifndef _ASM_X86_XOR_H
  3. #define _ASM_X86_XOR_H
  4. /*
  5. * Optimized RAID-5 checksumming functions for SSE.
  6. */
  7. /*
  8. * Cache avoiding checksumming functions utilizing KNI instructions
  9. * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
  10. */
  11. /*
  12. * Based on
  13. * High-speed RAID5 checksumming functions utilizing SSE instructions.
  14. * Copyright (C) 1998 Ingo Molnar.
  15. */
  16. /*
  17. * x86-64 changes / gcc fixes from Andi Kleen.
  18. * Copyright 2002 Andi Kleen, SuSE Labs.
  19. *
  20. * This hasn't been optimized for the hammer yet, but there are likely
  21. * no advantages to be gotten from x86-64 here anyways.
  22. */
  23. #include <asm/fpu/api.h>
  24. #ifdef CONFIG_X86_32
  25. /* reduce register pressure */
  26. # define XOR_CONSTANT_CONSTRAINT "i"
  27. #else
  28. # define XOR_CONSTANT_CONSTRAINT "re"
  29. #endif
  30. #define OFFS(x) "16*("#x")"
  31. #define PF_OFFS(x) "256+16*("#x")"
  32. #define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
  33. #define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
  34. #define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
  35. #define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
  36. #define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
  37. #define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
  38. #define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
  39. #define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
  40. #define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
  41. #define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
  42. #define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
  43. #define NOP(x)
  44. #define BLK64(pf, op, i) \
  45. pf(i) \
  46. op(i, 0) \
  47. op(i + 1, 1) \
  48. op(i + 2, 2) \
  49. op(i + 3, 3)
  50. static void
  51. xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
  52. const unsigned long * __restrict p2)
  53. {
  54. unsigned long lines = bytes >> 8;
  55. kernel_fpu_begin();
  56. asm volatile(
  57. #undef BLOCK
  58. #define BLOCK(i) \
  59. LD(i, 0) \
  60. LD(i + 1, 1) \
  61. PF1(i) \
  62. PF1(i + 2) \
  63. LD(i + 2, 2) \
  64. LD(i + 3, 3) \
  65. PF0(i + 4) \
  66. PF0(i + 6) \
  67. XO1(i, 0) \
  68. XO1(i + 1, 1) \
  69. XO1(i + 2, 2) \
  70. XO1(i + 3, 3) \
  71. ST(i, 0) \
  72. ST(i + 1, 1) \
  73. ST(i + 2, 2) \
  74. ST(i + 3, 3) \
  75. PF0(0)
  76. PF0(2)
  77. " .align 32 ;\n"
  78. " 1: ;\n"
  79. BLOCK(0)
  80. BLOCK(4)
  81. BLOCK(8)
  82. BLOCK(12)
  83. " add %[inc], %[p1] ;\n"
  84. " add %[inc], %[p2] ;\n"
  85. " dec %[cnt] ;\n"
  86. " jnz 1b ;\n"
  87. : [cnt] "+r" (lines),
  88. [p1] "+r" (p1), [p2] "+r" (p2)
  89. : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
  90. : "memory");
  91. kernel_fpu_end();
  92. }
  93. static void
  94. xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
  95. const unsigned long * __restrict p2)
  96. {
  97. unsigned long lines = bytes >> 8;
  98. kernel_fpu_begin();
  99. asm volatile(
  100. #undef BLOCK
  101. #define BLOCK(i) \
  102. BLK64(PF0, LD, i) \
  103. BLK64(PF1, XO1, i) \
  104. BLK64(NOP, ST, i) \
  105. " .align 32 ;\n"
  106. " 1: ;\n"
  107. BLOCK(0)
  108. BLOCK(4)
  109. BLOCK(8)
  110. BLOCK(12)
  111. " add %[inc], %[p1] ;\n"
  112. " add %[inc], %[p2] ;\n"
  113. " dec %[cnt] ;\n"
  114. " jnz 1b ;\n"
  115. : [cnt] "+r" (lines),
  116. [p1] "+r" (p1), [p2] "+r" (p2)
  117. : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
  118. : "memory");
  119. kernel_fpu_end();
  120. }
  121. static void
  122. xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
  123. const unsigned long * __restrict p2,
  124. const unsigned long * __restrict p3)
  125. {
  126. unsigned long lines = bytes >> 8;
  127. kernel_fpu_begin();
  128. asm volatile(
  129. #undef BLOCK
  130. #define BLOCK(i) \
  131. PF1(i) \
  132. PF1(i + 2) \
  133. LD(i, 0) \
  134. LD(i + 1, 1) \
  135. LD(i + 2, 2) \
  136. LD(i + 3, 3) \
  137. PF2(i) \
  138. PF2(i + 2) \
  139. PF0(i + 4) \
  140. PF0(i + 6) \
  141. XO1(i, 0) \
  142. XO1(i + 1, 1) \
  143. XO1(i + 2, 2) \
  144. XO1(i + 3, 3) \
  145. XO2(i, 0) \
  146. XO2(i + 1, 1) \
  147. XO2(i + 2, 2) \
  148. XO2(i + 3, 3) \
  149. ST(i, 0) \
  150. ST(i + 1, 1) \
  151. ST(i + 2, 2) \
  152. ST(i + 3, 3) \
  153. PF0(0)
  154. PF0(2)
  155. " .align 32 ;\n"
  156. " 1: ;\n"
  157. BLOCK(0)
  158. BLOCK(4)
  159. BLOCK(8)
  160. BLOCK(12)
  161. " add %[inc], %[p1] ;\n"
  162. " add %[inc], %[p2] ;\n"
  163. " add %[inc], %[p3] ;\n"
  164. " dec %[cnt] ;\n"
  165. " jnz 1b ;\n"
  166. : [cnt] "+r" (lines),
  167. [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
  168. : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
  169. : "memory");
  170. kernel_fpu_end();
  171. }
  172. static void
  173. xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
  174. const unsigned long * __restrict p2,
  175. const unsigned long * __restrict p3)
  176. {
  177. unsigned long lines = bytes >> 8;
  178. kernel_fpu_begin();
  179. asm volatile(
  180. #undef BLOCK
  181. #define BLOCK(i) \
  182. BLK64(PF0, LD, i) \
  183. BLK64(PF1, XO1, i) \
  184. BLK64(PF2, XO2, i) \
  185. BLK64(NOP, ST, i) \
  186. " .align 32 ;\n"
  187. " 1: ;\n"
  188. BLOCK(0)
  189. BLOCK(4)
  190. BLOCK(8)
  191. BLOCK(12)
  192. " add %[inc], %[p1] ;\n"
  193. " add %[inc], %[p2] ;\n"
  194. " add %[inc], %[p3] ;\n"
  195. " dec %[cnt] ;\n"
  196. " jnz 1b ;\n"
  197. : [cnt] "+r" (lines),
  198. [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
  199. : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
  200. : "memory");
  201. kernel_fpu_end();
  202. }
  203. static void
  204. xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
  205. const unsigned long * __restrict p2,
  206. const unsigned long * __restrict p3,
  207. const unsigned long * __restrict p4)
  208. {
  209. unsigned long lines = bytes >> 8;
  210. kernel_fpu_begin();
  211. asm volatile(
  212. #undef BLOCK
  213. #define BLOCK(i) \
  214. PF1(i) \
  215. PF1(i + 2) \
  216. LD(i, 0) \
  217. LD(i + 1, 1) \
  218. LD(i + 2, 2) \
  219. LD(i + 3, 3) \
  220. PF2(i) \
  221. PF2(i + 2) \
  222. XO1(i, 0) \
  223. XO1(i + 1, 1) \
  224. XO1(i + 2, 2) \
  225. XO1(i + 3, 3) \
  226. PF3(i) \
  227. PF3(i + 2) \
  228. PF0(i + 4) \
  229. PF0(i + 6) \
  230. XO2(i, 0) \
  231. XO2(i + 1, 1) \
  232. XO2(i + 2, 2) \
  233. XO2(i + 3, 3) \
  234. XO3(i, 0) \
  235. XO3(i + 1, 1) \
  236. XO3(i + 2, 2) \
  237. XO3(i + 3, 3) \
  238. ST(i, 0) \
  239. ST(i + 1, 1) \
  240. ST(i + 2, 2) \
  241. ST(i + 3, 3) \
  242. PF0(0)
  243. PF0(2)
  244. " .align 32 ;\n"
  245. " 1: ;\n"
  246. BLOCK(0)
  247. BLOCK(4)
  248. BLOCK(8)
  249. BLOCK(12)
  250. " add %[inc], %[p1] ;\n"
  251. " add %[inc], %[p2] ;\n"
  252. " add %[inc], %[p3] ;\n"
  253. " add %[inc], %[p4] ;\n"
  254. " dec %[cnt] ;\n"
  255. " jnz 1b ;\n"
  256. : [cnt] "+r" (lines), [p1] "+r" (p1),
  257. [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
  258. : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
  259. : "memory");
  260. kernel_fpu_end();
  261. }
  262. static void
  263. xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
  264. const unsigned long * __restrict p2,
  265. const unsigned long * __restrict p3,
  266. const unsigned long * __restrict p4)
  267. {
  268. unsigned long lines = bytes >> 8;
  269. kernel_fpu_begin();
  270. asm volatile(
  271. #undef BLOCK
  272. #define BLOCK(i) \
  273. BLK64(PF0, LD, i) \
  274. BLK64(PF1, XO1, i) \
  275. BLK64(PF2, XO2, i) \
  276. BLK64(PF3, XO3, i) \
  277. BLK64(NOP, ST, i) \
  278. " .align 32 ;\n"
  279. " 1: ;\n"
  280. BLOCK(0)
  281. BLOCK(4)
  282. BLOCK(8)
  283. BLOCK(12)
  284. " add %[inc], %[p1] ;\n"
  285. " add %[inc], %[p2] ;\n"
  286. " add %[inc], %[p3] ;\n"
  287. " add %[inc], %[p4] ;\n"
  288. " dec %[cnt] ;\n"
  289. " jnz 1b ;\n"
  290. : [cnt] "+r" (lines), [p1] "+r" (p1),
  291. [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
  292. : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
  293. : "memory");
  294. kernel_fpu_end();
  295. }
  296. static void
  297. xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
  298. const unsigned long * __restrict p2,
  299. const unsigned long * __restrict p3,
  300. const unsigned long * __restrict p4,
  301. const unsigned long * __restrict p5)
  302. {
  303. unsigned long lines = bytes >> 8;
  304. kernel_fpu_begin();
  305. asm volatile(
  306. #undef BLOCK
  307. #define BLOCK(i) \
  308. PF1(i) \
  309. PF1(i + 2) \
  310. LD(i, 0) \
  311. LD(i + 1, 1) \
  312. LD(i + 2, 2) \
  313. LD(i + 3, 3) \
  314. PF2(i) \
  315. PF2(i + 2) \
  316. XO1(i, 0) \
  317. XO1(i + 1, 1) \
  318. XO1(i + 2, 2) \
  319. XO1(i + 3, 3) \
  320. PF3(i) \
  321. PF3(i + 2) \
  322. XO2(i, 0) \
  323. XO2(i + 1, 1) \
  324. XO2(i + 2, 2) \
  325. XO2(i + 3, 3) \
  326. PF4(i) \
  327. PF4(i + 2) \
  328. PF0(i + 4) \
  329. PF0(i + 6) \
  330. XO3(i, 0) \
  331. XO3(i + 1, 1) \
  332. XO3(i + 2, 2) \
  333. XO3(i + 3, 3) \
  334. XO4(i, 0) \
  335. XO4(i + 1, 1) \
  336. XO4(i + 2, 2) \
  337. XO4(i + 3, 3) \
  338. ST(i, 0) \
  339. ST(i + 1, 1) \
  340. ST(i + 2, 2) \
  341. ST(i + 3, 3) \
  342. PF0(0)
  343. PF0(2)
  344. " .align 32 ;\n"
  345. " 1: ;\n"
  346. BLOCK(0)
  347. BLOCK(4)
  348. BLOCK(8)
  349. BLOCK(12)
  350. " add %[inc], %[p1] ;\n"
  351. " add %[inc], %[p2] ;\n"
  352. " add %[inc], %[p3] ;\n"
  353. " add %[inc], %[p4] ;\n"
  354. " add %[inc], %[p5] ;\n"
  355. " dec %[cnt] ;\n"
  356. " jnz 1b ;\n"
  357. : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
  358. [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
  359. : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
  360. : "memory");
  361. kernel_fpu_end();
  362. }
  363. static void
  364. xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
  365. const unsigned long * __restrict p2,
  366. const unsigned long * __restrict p3,
  367. const unsigned long * __restrict p4,
  368. const unsigned long * __restrict p5)
  369. {
  370. unsigned long lines = bytes >> 8;
  371. kernel_fpu_begin();
  372. asm volatile(
  373. #undef BLOCK
  374. #define BLOCK(i) \
  375. BLK64(PF0, LD, i) \
  376. BLK64(PF1, XO1, i) \
  377. BLK64(PF2, XO2, i) \
  378. BLK64(PF3, XO3, i) \
  379. BLK64(PF4, XO4, i) \
  380. BLK64(NOP, ST, i) \
  381. " .align 32 ;\n"
  382. " 1: ;\n"
  383. BLOCK(0)
  384. BLOCK(4)
  385. BLOCK(8)
  386. BLOCK(12)
  387. " add %[inc], %[p1] ;\n"
  388. " add %[inc], %[p2] ;\n"
  389. " add %[inc], %[p3] ;\n"
  390. " add %[inc], %[p4] ;\n"
  391. " add %[inc], %[p5] ;\n"
  392. " dec %[cnt] ;\n"
  393. " jnz 1b ;\n"
  394. : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
  395. [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
  396. : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
  397. : "memory");
  398. kernel_fpu_end();
  399. }
  400. static struct xor_block_template xor_block_sse_pf64 = {
  401. .name = "prefetch64-sse",
  402. .do_2 = xor_sse_2_pf64,
  403. .do_3 = xor_sse_3_pf64,
  404. .do_4 = xor_sse_4_pf64,
  405. .do_5 = xor_sse_5_pf64,
  406. };
  407. #undef LD
  408. #undef XO1
  409. #undef XO2
  410. #undef XO3
  411. #undef XO4
  412. #undef ST
  413. #undef NOP
  414. #undef BLK64
  415. #undef BLOCK
  416. #undef XOR_CONSTANT_CONSTRAINT
  417. #ifdef CONFIG_X86_32
  418. # include <asm/xor_32.h>
  419. #else
  420. # include <asm/xor_64.h>
  421. #endif
  422. #define XOR_SELECT_TEMPLATE(FASTEST) \
  423. AVX_SELECT(FASTEST)
  424. #endif /* _ASM_X86_XOR_H */