/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/asm-compat.h>
#include <asm/feature-fixups.h>

#ifndef SELFTEST_CASE
/* 0 == most CPUs, 1 == POWER6, 2 == Cell */
#define SELFTEST_CASE	0
#endif

#ifdef __BIG_ENDIAN__
#define sLd sld		/* Shift towards low-numbered address. */
#define sHd srd		/* Shift towards high-numbered address. */
#else
#define sLd srd		/* Shift towards low-numbered address. */
#define sHd sld		/* Shift towards high-numbered address. */
#endif
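/*
 * Rough worked example of the macros above (illustrative values, not
 * from the original source): on big-endian, suppose the source starts
 * 3 bytes past an 8-byte boundary and the aligned doubleword there
 * holds bytes AA BB CC DD EE FF GG HH.  sLd (sld) by 3*8 = 24 bits
 * yields DD EE FF GG HH 00 00 00; sHd (srd) of the next doubleword
 * II JJ ... by 64-24 = 40 bits yields 00 00 00 00 00 II JJ KK, and
 * or-ing the two gives 8 contiguous source bytes ready for an aligned
 * store.  On little-endian the shift directions swap, which is all
 * these macros encode.
 */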
/*
 * These macros are used to generate exception table entries.
 * The exception handlers below use the original arguments
 * (stored on the stack) and the point where we're up to in
 * the destination buffer, i.e. the address of the first
 * unmodified byte.  Generally r3 points into the destination
 * buffer, but the first unmodified byte is at a variable
 * offset from r3.  In the code below, the symbol r3_offset
 * is set to indicate the current offset at each point in
 * the code.  This offset is then used as a negative offset
 * from the exception handler code, and those instructions
 * before the exception handlers are addi instructions that
 * adjust r3 to point to the correct place.
 */
	.macro	lex		/* exception handler for load */
100:	EX_TABLE(100b, .Lld_exc - r3_offset)
	.endm

	.macro	stex		/* exception handler for store */
100:	EX_TABLE(100b, .Lst_exc - r3_offset)
	.endm
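/*
 * For example, following the comment above: if a load faults at a
 * point where r3_offset = 16, the entry generated by "lex" vectors the
 * fault to .Lld_exc - 16.  The 16 bytes of code immediately before
 * .Lld_exc are addi/nop pairs that add a total of 16 to r3, so by the
 * time execution falls into .Lld_exc, r3 once again points at the
 * first unmodified destination byte.
 */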
	.align	7
_GLOBAL_TOC(__copy_tofrom_user)
#ifdef CONFIG_PPC_BOOK3S_64
BEGIN_FTR_SECTION
	nop
FTR_SECTION_ELSE
	b	__copy_tofrom_user_power7
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
#endif
_GLOBAL(__copy_tofrom_user_base)
	/* first check for a 4kB copy on a 4kB boundary */
	cmpldi	cr1,r5,16
	cmpdi	cr6,r5,4096
	or	r0,r3,r4
	neg	r6,r3		/* LS 3 bits = # bytes to 8-byte dest bdry */
	andi.	r0,r0,4095
	std	r3,-24(r1)
	crand	cr0*4+2,cr0*4+2,cr6*4+2
	std	r4,-16(r1)
	std	r5,-8(r1)
	dcbt	0,r4
	beq	.Lcopy_page_4K
	andi.	r6,r6,7
	PPC_MTOCRF(0x01,r5)
	blt	cr1,.Lshort_copy

/*
 * Below we want to nop out the bne if we're on a CPU that has the
 * CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
 * cleared.
 * At the time of writing the only CPU that has this combination of bits
 * set is POWER6.
 */
test_feature = (SELFTEST_CASE == 1)
BEGIN_FTR_SECTION
	nop
FTR_SECTION_ELSE
	bne	.Ldst_unaligned
ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
		    CPU_FTR_UNALIGNED_LD_STD)
.Ldst_aligned:
	addi	r3,r3,-16
r3_offset = 16
test_feature = (SELFTEST_CASE == 0)
BEGIN_FTR_SECTION
	andi.	r0,r4,7
	bne	.Lsrc_unaligned
END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
	blt	cr1,.Ldo_tail		/* if < 16 bytes to copy */
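/*
 * Main aligned loop: 32 bytes per iteration, software pipelined so
 * that each pass stores the pair of doublewords loaded on the
 * previous pass while the next two loads run ahead of the stores.
 */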
	srdi	r0,r5,5
	cmpdi	cr1,r0,0
lex;	ld	r7,0(r4)
lex;	ld	r6,8(r4)
	addi	r4,r4,16
	mtctr	r0
	andi.	r0,r5,0x10
	beq	22f
	addi	r3,r3,16
r3_offset = 0
	addi	r4,r4,-16
	mr	r9,r7
	mr	r8,r6
	beq	cr1,72f
21:
lex;	ld	r7,16(r4)
lex;	ld	r6,24(r4)
	addi	r4,r4,32
stex;	std	r9,0(r3)
r3_offset = 8
stex;	std	r8,8(r3)
r3_offset = 16
22:
lex;	ld	r9,0(r4)
lex;	ld	r8,8(r4)
stex;	std	r7,16(r3)
r3_offset = 24
stex;	std	r6,24(r3)
	addi	r3,r3,32
r3_offset = 0
	bdnz	21b
72:
stex;	std	r9,0(r3)
r3_offset = 8
stex;	std	r8,8(r3)
r3_offset = 16
	andi.	r5,r5,0xf
	beq+	3f
	addi	r4,r4,16
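/*
 * Tail of the copy: PPC_MTOCRF(0x01,r5) above put the low 4 bits of
 * the length into cr7, so each "bf" below skips one chunk; bit 0 of
 * cr7 gates the 8-byte move, bit 1 the 4-byte move, bit 2 the 2-byte
 * move, and bit 3 the final byte.
 */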
.Ldo_tail:
	addi	r3,r3,16
r3_offset = 0
	bf	cr7*4+0,246f
lex;	ld	r9,0(r4)
	addi	r4,r4,8
stex;	std	r9,0(r3)
	addi	r3,r3,8
246:	bf	cr7*4+1,1f
lex;	lwz	r9,0(r4)
	addi	r4,r4,4
stex;	stw	r9,0(r3)
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
lex;	lhz	r9,0(r4)
	addi	r4,r4,2
stex;	sth	r9,0(r3)
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
lex;	lbz	r9,0(r4)
stex;	stb	r9,0(r3)
3:	li	r3,0
	blr
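/*
 * Unaligned source: round r4 down to an 8-byte boundary and issue only
 * aligned loads, assembling each output doubleword as, roughly,
 *	dst = sLd(dw[i], 8*(src & 7)) | sHd(dw[i+1], 64 - 8*(src & 7))
 * using the endian-neutral shift macros defined at the top of the
 * file.  Loads stay aligned at the cost of two shifts and an "or" per
 * doubleword.
 */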
.Lsrc_unaligned:
r3_offset = 16
	srdi	r6,r5,3
	addi	r5,r5,-16
	subf	r4,r0,r4
	srdi	r7,r5,4
	sldi	r10,r0,3
	cmpldi	cr6,r6,3
	andi.	r5,r5,7
	mtctr	r7
	subfic	r11,r10,64
	add	r5,r5,r0
	bt	cr7*4+0,28f

lex;	ld	r9,0(r4)	/* 3+2n loads, 2+2n stores */
lex;	ld	r0,8(r4)
	sLd	r6,r9,r10
lex;	ldu	r9,16(r4)
	sHd	r7,r0,r11
	sLd	r8,r0,r10
	or	r7,r7,r6
	blt	cr6,79f
lex;	ld	r0,8(r4)
	b	2f

28:
lex;	ld	r0,0(r4)	/* 4+2n loads, 3+2n stores */
lex;	ldu	r9,8(r4)
	sLd	r8,r0,r10
	addi	r3,r3,-8
r3_offset = 24
	blt	cr6,5f
lex;	ld	r0,8(r4)
	sHd	r12,r9,r11
	sLd	r6,r9,r10
lex;	ldu	r9,16(r4)
	or	r12,r8,r12
	sHd	r7,r0,r11
	sLd	r8,r0,r10
	addi	r3,r3,16
r3_offset = 8
	beq	cr6,78f

1:	or	r7,r7,r6
lex;	ld	r0,8(r4)
stex;	std	r12,8(r3)
r3_offset = 16
2:	sHd	r12,r9,r11
	sLd	r6,r9,r10
lex;	ldu	r9,16(r4)
	or	r12,r8,r12
stex;	stdu	r7,16(r3)
r3_offset = 8
	sHd	r7,r0,r11
	sLd	r8,r0,r10
	bdnz	1b

78:
stex;	std	r12,8(r3)
r3_offset = 16
	or	r7,r7,r6
79:
stex;	std	r7,16(r3)
r3_offset = 24
5:	sHd	r12,r9,r11
	or	r12,r8,r12
stex;	std	r12,24(r3)
r3_offset = 32
	bne	6f
	li	r3,0
	blr
6:	cmpwi	cr1,r5,8
	addi	r3,r3,32
r3_offset = 0
	sLd	r9,r9,r10
	ble	cr1,7f
lex;	ld	r0,8(r4)
	sHd	r7,r0,r11
	or	r9,r7,r9
7:
	bf	cr7*4+1,1f
#ifdef __BIG_ENDIAN__
	rotldi	r9,r9,32
#endif
stex;	stw	r9,0(r3)
#ifdef __LITTLE_ENDIAN__
	rotrdi	r9,r9,32
#endif
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
#ifdef __BIG_ENDIAN__
	rotldi	r9,r9,16
#endif
stex;	sth	r9,0(r3)
#ifdef __LITTLE_ENDIAN__
	rotrdi	r9,r9,16
#endif
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
#ifdef __BIG_ENDIAN__
	rotldi	r9,r9,8
#endif
stex;	stb	r9,0(r3)
#ifdef __LITTLE_ENDIAN__
	rotrdi	r9,r9,8
#endif
3:	li	r3,0
	blr
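/*
 * Unaligned destination: copy up to 7 leading bytes (1, 2 and/or 4,
 * selected by the bits of r6 held in cr7) to reach an 8-byte boundary.
 * r7 counts the bytes copied here, so the dedicated .Lld_exc_r7 and
 * .Lst_exc_r7 handlers can add it to r3 before entering the common
 * fixup code.
 */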
.Ldst_unaligned:
r3_offset = 0
	PPC_MTOCRF(0x01,r6)	/* put #bytes to 8B bdry into cr7 */
	subf	r5,r6,r5
	li	r7,0
	cmpldi	cr1,r5,16
	bf	cr7*4+3,1f
100:	EX_TABLE(100b, .Lld_exc_r7)
	lbz	r0,0(r4)
100:	EX_TABLE(100b, .Lst_exc_r7)
	stb	r0,0(r3)
	addi	r7,r7,1
1:	bf	cr7*4+2,2f
100:	EX_TABLE(100b, .Lld_exc_r7)
	lhzx	r0,r7,r4
100:	EX_TABLE(100b, .Lst_exc_r7)
	sthx	r0,r7,r3
	addi	r7,r7,2
2:	bf	cr7*4+1,3f
100:	EX_TABLE(100b, .Lld_exc_r7)
	lwzx	r0,r7,r4
100:	EX_TABLE(100b, .Lst_exc_r7)
	stwx	r0,r7,r3
3:	PPC_MTOCRF(0x01,r5)
	add	r4,r6,r4
	add	r3,r6,r3
	b	.Ldst_aligned
.Lshort_copy:
r3_offset = 0
	bf	cr7*4+0,1f
lex;	lwz	r0,0(r4)
lex;	lwz	r9,4(r4)
	addi	r4,r4,8
stex;	stw	r0,0(r3)
stex;	stw	r9,4(r3)
	addi	r3,r3,8
1:	bf	cr7*4+1,2f
lex;	lwz	r0,0(r4)
	addi	r4,r4,4
stex;	stw	r0,0(r3)
	addi	r3,r3,4
2:	bf	cr7*4+2,3f
lex;	lhz	r0,0(r4)
	addi	r4,r4,2
stex;	sth	r0,0(r3)
	addi	r3,r3,2
3:	bf	cr7*4+3,4f
lex;	lbz	r0,0(r4)
stex;	stb	r0,0(r3)
4:	li	r3,0
	blr
/*
 * exception handlers follow
 * we have to return the number of bytes not copied
 * for an exception on a load, we keep copying byte-by-byte until we
 * fault again (see .Lld_exc below) rather than zeroing the rest of
 * the destination.
 * Note that the number of bytes of instructions for adjusting r3 needs
 * to equal the amount of the adjustment, due to the trick of using
 * .Lld_exc - r3_offset as the handler address.
 */
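/*
 * In C terms the contract is roughly (prototype as declared in the
 * kernel's powerpc uaccess headers):
 *
 *	unsigned long __copy_tofrom_user(void __user *to,
 *					 const void __user *from,
 *					 unsigned long n);
 *
 * returns 0 on success, else the number of the n bytes that were NOT
 * copied; the fixup code below derives this from the address of the
 * first unmodified destination byte.
 */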
.Lld_exc_r7:
	add	r3,r3,r7
	b	.Lld_exc

/* adjust by 24 */
	addi	r3,r3,8
	nop
/* adjust by 16 */
	addi	r3,r3,8
	nop
/* adjust by 8 */
	addi	r3,r3,8
	nop

/*
 * Here we have had a fault on a load and r3 points to the first
 * unmodified byte of the destination.  We use the original arguments
 * and r3 to work out how much wasn't copied.  Since we load some
 * distance ahead of the stores, we continue copying byte-by-byte until
 * we hit the load fault again in order to copy as much as possible.
 */
.Lld_exc:
	ld	r6,-24(r1)
	ld	r4,-16(r1)
	ld	r5,-8(r1)
	subf	r6,r6,r3
	add	r4,r4,r6
	subf	r5,r6,r5	/* #bytes left to go */

/*
 * first see if we can copy any more bytes before hitting another exception
 */
	mtctr	r5
r3_offset = 0
100:	EX_TABLE(100b, .Ldone)
43:	lbz	r0,0(r4)
	addi	r4,r4,1
stex;	stb	r0,0(r3)
	addi	r3,r3,1
	bdnz	43b
	li	r3,0		/* huh? all copied successfully this time? */
	blr

/*
 * here we have trapped again, amount remaining is in ctr.
 */
.Ldone:
	mfctr	r3
	blr
/*
 * exception handlers for stores: we need to work out how many bytes
 * weren't copied, and we may need to copy some more.
 * Note that the number of bytes of instructions for adjusting r3 needs
 * to equal the amount of the adjustment, due to the trick of using
 * .Lst_exc - r3_offset as the handler address.
 */
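/*
 * Worked example of the trick: with r3_offset = 16, a faulting store
 * vectors to .Lst_exc - 16, i.e. to the "adjust by 16" point below.
 * The four instructions from there to .Lst_exc (addi 8, nop, addi 4,
 * addi 4) occupy exactly 16 bytes and add exactly 16 to r3 on the
 * way in.
 */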
.Lst_exc_r7:
	add	r3,r3,r7
	b	.Lst_exc

/* adjust by 24 */
	addi	r3,r3,8
	nop
/* adjust by 16 */
	addi	r3,r3,8
	nop
/* adjust by 8 */
	addi	r3,r3,4
/* adjust by 4 */
	addi	r3,r3,4
.Lst_exc:
	ld	r6,-24(r1)	/* original destination pointer */
	ld	r4,-16(r1)	/* original source pointer */
	ld	r5,-8(r1)	/* original number of bytes */
	add	r7,r6,r5
/*
 * If the destination pointer isn't 8-byte aligned,
 * we may have got the exception as a result of a
 * store that overlapped a page boundary, so we may be
 * able to copy a few more bytes.
 */
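/*
 * For example, on a CPU taking the unaligned-store path, an 8-byte std
 * starting 3 bytes before an unwritable page may fault without having
 * written the bytes that lie in the writable page; the byte loop below
 * retries those individually so the count of bytes not copied stays
 * exact.
 */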
17:	andi.	r0,r3,7
	beq	19f
	subf	r8,r6,r3	/* #bytes copied */
100:	EX_TABLE(100b,19f)
	lbzx	r0,r8,r4
100:	EX_TABLE(100b,19f)
	stb	r0,0(r3)
	addi	r3,r3,1
	cmpld	r3,r7
	blt	17b
19:	subf	r3,r3,r7	/* #bytes not copied in r3 */
	blr
/*
 * Routine to copy a whole page of data, optimized for POWER4.
 * On POWER4 it is more than 50% faster than the simple loop
 * above (following the .Ldst_aligned label).
 */
	.macro	exc
100:	EX_TABLE(100b, .Labort)
	.endm
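/*
 * The loop below walks six streams of 8-byte loads spaced 128 bytes
 * apart (offsets 0, 128, 256, 384, 512, 640) and advances them all
 * together; interleaving the streams keeps several cache-line fetches
 * in flight at once, which is where the POWER4 speedup comes from.
 */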
.Lcopy_page_4K:
	std	r31,-32(1)
	std	r30,-40(1)
	std	r29,-48(1)
	std	r28,-56(1)
	std	r27,-64(1)
	std	r26,-72(1)
	std	r25,-80(1)
	std	r24,-88(1)
	std	r23,-96(1)
	std	r22,-104(1)
	std	r21,-112(1)
	std	r20,-120(1)
	li	r5,4096/32 - 1
	addi	r3,r3,-8
	li	r0,5
0:	addi	r5,r5,-24
	mtctr	r0
exc;	ld	r22,640(4)
exc;	ld	r21,512(4)
exc;	ld	r20,384(4)
exc;	ld	r11,256(4)
exc;	ld	r9,128(4)
exc;	ld	r7,0(4)
exc;	ld	r25,648(4)
exc;	ld	r24,520(4)
exc;	ld	r23,392(4)
exc;	ld	r10,264(4)
exc;	ld	r8,136(4)
exc;	ldu	r6,8(4)
	cmpwi	r5,24
1:
exc;	std	r22,648(3)
exc;	std	r21,520(3)
exc;	std	r20,392(3)
exc;	std	r11,264(3)
exc;	std	r9,136(3)
exc;	std	r7,8(3)
exc;	ld	r28,648(4)
exc;	ld	r27,520(4)
exc;	ld	r26,392(4)
exc;	ld	r31,264(4)
exc;	ld	r30,136(4)
exc;	ld	r29,8(4)
exc;	std	r25,656(3)
exc;	std	r24,528(3)
exc;	std	r23,400(3)
exc;	std	r10,272(3)
exc;	std	r8,144(3)
exc;	std	r6,16(3)
exc;	ld	r22,656(4)
exc;	ld	r21,528(4)
exc;	ld	r20,400(4)
exc;	ld	r11,272(4)
exc;	ld	r9,144(4)
exc;	ld	r7,16(4)
exc;	std	r28,664(3)
exc;	std	r27,536(3)
exc;	std	r26,408(3)
exc;	std	r31,280(3)
exc;	std	r30,152(3)
exc;	stdu	r29,24(3)
exc;	ld	r25,664(4)
exc;	ld	r24,536(4)
exc;	ld	r23,408(4)
exc;	ld	r10,280(4)
exc;	ld	r8,152(4)
exc;	ldu	r6,24(4)
	bdnz	1b
exc;	std	r22,648(3)
exc;	std	r21,520(3)
exc;	std	r20,392(3)
exc;	std	r11,264(3)
exc;	std	r9,136(3)
exc;	std	r7,8(3)
	addi	r4,r4,640
	addi	r3,r3,648
	bge	0b
	mtctr	r5
exc;	ld	r7,0(4)
exc;	ld	r8,8(4)
exc;	ldu	r9,16(4)
3:
exc;	ld	r10,8(4)
exc;	std	r7,8(3)
exc;	ld	r7,16(4)
exc;	std	r8,16(3)
exc;	ld	r8,24(4)
exc;	std	r9,24(3)
exc;	ldu	r9,32(4)
exc;	stdu	r10,32(3)
	bdnz	3b
4:
exc;	ld	r10,8(4)
exc;	std	r7,8(3)
exc;	std	r8,16(3)
exc;	std	r9,24(3)
exc;	std	r10,32(3)
9:	ld	r20,-120(1)
	ld	r21,-112(1)
	ld	r22,-104(1)
	ld	r23,-96(1)
	ld	r24,-88(1)
	ld	r25,-80(1)
	ld	r26,-72(1)
	ld	r27,-64(1)
	ld	r28,-56(1)
	ld	r29,-48(1)
	ld	r30,-40(1)
	ld	r31,-32(1)
	li	r3,0
	blr
/*
 * on an exception, reset to the beginning and jump back into the
 * standard __copy_tofrom_user
 */
.Labort:
	ld	r20,-120(1)
	ld	r21,-112(1)
	ld	r22,-104(1)
	ld	r23,-96(1)
	ld	r24,-88(1)
	ld	r25,-80(1)
	ld	r26,-72(1)
	ld	r27,-64(1)
	ld	r28,-56(1)
	ld	r29,-48(1)
	ld	r30,-40(1)
	ld	r31,-32(1)
	ld	r3,-24(r1)
	ld	r4,-16(r1)
	li	r5,4096
	b	.Ldst_aligned
EXPORT_SYMBOL(__copy_tofrom_user)