copyuser_power7.S

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif
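
/*
 * SELFTEST_CASE feeds test_feature below.  In the kernel build it
 * defaults to 0 and the VMX/non-VMX choice is made at runtime by the
 * CPU feature section; the powerpc copyloops selftests redefine it to
 * force one path or the other.
 */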

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif
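
/*
 * LVS/VPERM hide the endian difference in the unaligned VMX copy: the
 * permute control vector generated by lvsl (big endian) or lvsr (little
 * endian, with the vperm source operands swapped) selects the bytes of
 * an unaligned 16B quantity out of two adjacent aligned vector loads.
 */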

	.macro err1
100:
	EX_TABLE(100b,.Ldo_err1)
	.endm

	.macro err2
200:
	EX_TABLE(200b,.Ldo_err2)
	.endm
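
/*
 * Each errN macro attaches an exception table entry to the user access
 * that follows it.  A fault sends control to the matching .Ldo_errN
 * fixup, which undoes whatever state that stage of the copy had built
 * up (saved non-volatile registers, stack frame, VMX state) before
 * restarting the copy with the generic __copy_tofrom_user_base, which
 * handles the fault and returns the number of bytes not copied.
 */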

#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	EX_TABLE(300b,.Ldo_err3)
	.endm

	.macro err4
400:
	EX_TABLE(400b,.Ldo_err4)
	.endm


.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl	exit_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b	__copy_tofrom_user_base
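
/*
 * __copy_tofrom_user_power7(to=r3, from=r4, len=r5)
 *
 * Copies of less than 16 bytes use the scalar short-copy tail.  Larger
 * copies use the scalar cacheline loop, or the VMX loops for copies of
 * more than 3328 bytes when Altivec is usable.  Returns 0 on success;
 * on a fault the saved arguments are reloaded and the copy is redone
 * by __copy_tofrom_user_base so the residual count is returned.
 */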
_GLOBAL(__copy_tofrom_user_power7)
	cmpldi	r5,16
	cmpldi	cr1,r5,3328

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1,.Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif
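
/*
 * Scalar fallback: align the source to 8 bytes, copy full 128B
 * cachelines with an unrolled GPR loop, then mop up the remainder
 * with progressively smaller power-of-two sized copies.
 */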
.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f
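
	/*
	 * At least 128 bytes remain: create a stack frame and save the
	 * non-volatile GPRs used by the unrolled loop.  Faults in this
	 * region use the err2 fixup, which restores them on the way out.
	 */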
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_usercopy
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
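	/*
	 * Each stream is described by its start address, its length in
	 * 128B cachelines (capped at 0x3FF) and a prefetch depth of 7;
	 * a final touch with the GO bit set starts both streams at once.
	 */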
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

	/* setup read stream 0 */
	dcbt	0,r6,0b01000	/* addr from */
	dcbt	0,r7,0b01010	/* length and depth from */
	/* setup write stream 1 */
	dcbtst	0,r9,0b01000	/* addr to */
	dcbtst	0,r10,0b01010	/* length and depth to */
	eieio
	dcbt	0,r8,0b01010	/* all streams GO */
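
	/*
	 * enter_vmx_usercopy returned 0 (tested into cr1 above) if VMX
	 * cannot be used here; pop the frame and fall back to the
	 * scalar copy.
	 */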
	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
err4;	lvx	v6,r4,r9
err4;	lvx	v5,r4,r10
err4;	lvx	v4,r4,r11
err4;	lvx	v3,r4,r12
err4;	lvx	v2,r4,r14
err4;	lvx	v1,r4,r15
err4;	lvx	v0,r4,r16
	addi	r4,r4,128
err4;	stvx	v7,0,r3
err4;	stvx	v6,r3,r9
err4;	stvx	v5,r3,r10
err4;	stvx	v4,r3,r11
err4;	stvx	v3,r3,r12
err4;	stvx	v2,r3,r14
err4;	stvx	v1,r3,r15
err4;	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */
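
/*
 * Source and destination are not 16B aligned relative to each other.
 * Each aligned 16B store is assembled from two consecutive aligned
 * vector loads with VPERM, using the control vector set up by LVS.
 * The previous source vector is carried between steps, so r4 runs
 * 16 bytes ahead of the data consumed and is wound back before the
 * final scalar tail.
 */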
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
err3;	lvx	v0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
err4;	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
err4;	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
err4;	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
err4;	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
err4;	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
err4;	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
err4;	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
err4;	stvx	v8,0,r3
err4;	stvx	v9,r3,r9
err4;	stvx	v10,r3,r10
err4;	stvx	v11,r3,r11
err4;	stvx	v12,r3,r12
err4;	stvx	v13,r3,r14
err4;	stvx	v14,r3,r15
err4;	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */