lusercopy.S

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * User Space Access Routines
 *
 * Copyright (C) 2000-2002 Hewlett-Packard (John Marvin)
 * Copyright (C) 2000 Richard Hirst <rhirst with parisc-linux.org>
 * Copyright (C) 2001 Matthieu Delahaye <delahaym at esiee.fr>
 * Copyright (C) 2003 Randolph Chung <tausq with parisc-linux.org>
 * Copyright (C) 2017 Helge Deller <[email protected]>
 * Copyright (C) 2017 John David Anglin <[email protected]>
 */

/*
 * These routines still have plenty of room for optimization
 * (word & doubleword load/store, dual issue, store hints, etc.).
 */
/*
 * The following routines assume that space register 3 (sr3) contains
 * the space id associated with the current user's address space.
 */
	.text

#include <asm/assembly.h>
#include <asm/errno.h>
#include <linux/linkage.h>
/*
 * unsigned long lclear_user(void *to, unsigned long n)
 *
 * Returns 0 for success; otherwise, returns the number of bytes
 * not transferred.
 */
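/*
 * For illustration: this routine is typically reached through the
 * architecture's clear_user()/__clear_user() wrappers, whose callers
 * treat a nonzero return as a fault, roughly:
 *
 *	if (clear_user(ubuf, len))	// returns bytes NOT zeroed
 *		return -EFAULT;
 *
 * where ubuf and len stand for the caller's user buffer and size.
 */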
ENTRY_CFI(lclear_user)
	comib,=,n	0,%r25,$lclu_done
$lclu_loop:
	addib,<>	-1,%r25,$lclu_loop
1:	stbs,ma		%r0,1(%sr3,%r26)

$lclu_done:
	bv		%r0(%r2)
	copy		%r25,%r28

	/*
	 * fault fixup: the store at 1: faulted after addib had already
	 * decremented %r25, so add that byte back before returning the
	 * not-cleared count.
	 */
2:	b		$lclu_done
	ldo		1(%r25),%r25

	ASM_EXCEPTIONTABLE_ENTRY(1b,2b)
ENDPROC_CFI(lclear_user)
/*
 * unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
 *
 * Inputs:
 * - sr1 already contains the space of the source region
 * - sr2 already contains the space of the destination region
 *
 * Returns:
 * - number of bytes that could not be copied.
 *   On success, this will be zero.
 *
 * This code is based on a C implementation of a copy routine written by
 * Randolph Chung, which in turn was derived from glibc.
 *
 * Several strategies are tried to get the best performance under various
 * conditions. In the optimal case, we copy in loops that move 32 or 16
 * bytes at a time using general registers. Unaligned copies are handled
 * either by aligning the destination and then using a shift-and-write
 * method, or in a few cases by falling back to a byte-at-a-time copy.
 *
 * Testing with various alignments and buffer sizes shows that this code
 * is often more than 10x faster than a simple byte-at-a-time copy, even
 * for strangely aligned operands. It is interesting to note that the
 * glibc version of memcpy (written in C) is actually quite fast already.
 * This routine is able to beat it by 30-40% for aligned copies because
 * of the loop unrolling, but in some cases the glibc version is still
 * slightly faster. This lends credibility to the idea that gcc can
 * generate very good code as long as we are careful.
 *
 * Possible optimizations:
 * - add cache prefetching
 * - try not to use the post-increment address modifiers; they may create
 *   additional interlocks. The assumption is that these were only
 *   efficient on older machines (pre-PA8000 processors).
 */
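/*
 * For orientation only -- a simplified C model of the strategy described
 * above (no space registers, no fault handling; memcpy_model is an
 * illustrative name, not a kernel interface):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static size_t memcpy_model(unsigned char *dst, const unsigned char *src,
 *				   size_t len)
 *	{
 *		// 1. Short copies: not worth any alignment setup.
 *		if (len < 16)
 *			goto byte_copy;
 *
 *		// 2. Same alignment: align dst, then move whole words
 *		//    (the assembly below unrolls this to 16 or 32 bytes
 *		//    per iteration).  Kernel-style word access; assumes
 *		//    -fno-strict-aliasing.
 *		if ((((uintptr_t)src ^ (uintptr_t)dst) % sizeof(long)) == 0) {
 *			while ((uintptr_t)dst % sizeof(long)) {
 *				*dst++ = *src++;
 *				len--;
 *			}
 *			while (len >= sizeof(long)) {
 *				*(long *)dst = *(const long *)src;
 *				dst += sizeof(long);
 *				src += sizeof(long);
 *				len -= sizeof(long);
 *			}
 *		}
 *		// 3. Different alignment: the assembly aligns dst and merges
 *		//    shifted source words (see .Lunaligned_copy); modelled
 *		//    here, and used for the tail, as a byte copy.
 *	byte_copy:
 *		while (len--)
 *			*dst++ = *src++;
 *		return 0;	// bytes not copied (faults handled in asm only)
 *	}
 */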
	dst	= arg0
	src	= arg1
	len	= arg2
	end	= arg3
	t1	= r19
	t2	= r20
	t3	= r21
	t4	= r22
	srcspc	= sr1
	dstspc	= sr2

	t0	= r1
	a1	= t1
	a2	= t2
	a3	= t3
	a0	= t4

	save_src = ret0
	save_dst = ret1
	save_len = r31
ENTRY_CFI(pa_memcpy)
	/* Last destination address */
	add	dst,len,end

	/* short copy with less than 16 bytes? */
	cmpib,COND(>>=),n 15,len,.Lbyte_loop

	/* same alignment? */
	xor	src,dst,t0
	extru	t0,31,2,t1
	cmpib,<>,n	0,t1,.Lunaligned_copy

#ifdef CONFIG_64BIT
	/* only do 64-bit copies if we can get aligned. */
	extru	t0,31,3,t1
	cmpib,<>,n	0,t1,.Lalign_loop32

	/* loop until we are 64-bit aligned */
.Lalign_loop64:
	extru	dst,31,3,t1
	cmpib,=,n	0,t1,.Lcopy_loop_16_start
20:	ldb,ma	1(srcspc,src),t1
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lalign_loop64
	ldo	-1(len),len
	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

.Lcopy_loop_16_start:
	ldi	31,t0
.Lcopy_loop_16:
	cmpb,COND(>>=),n t0,len,.Lword_loop

10:	ldd	0(srcspc,src),t1
11:	ldd	8(srcspc,src),t2
	ldo	16(src),src
12:	std,ma	t1,8(dstspc,dst)
13:	std,ma	t2,8(dstspc,dst)
14:	ldd	0(srcspc,src),t1
15:	ldd	8(srcspc,src),t2
	ldo	16(src),src
16:	std,ma	t1,8(dstspc,dst)
17:	std,ma	t2,8(dstspc,dst)

	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(11b,.Lcopy16_fault)
	ASM_EXCEPTIONTABLE_ENTRY(12b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(13b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(14b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(15b,.Lcopy16_fault)
	ASM_EXCEPTIONTABLE_ENTRY(16b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(17b,.Lcopy_done)

	b	.Lcopy_loop_16
	ldo	-32(len),len

.Lword_loop:
	cmpib,COND(>>=),n 3,len,.Lbyte_loop
20:	ldw,ma	4(srcspc,src),t1
21:	stw,ma	t1,4(dstspc,dst)
	b	.Lword_loop
	ldo	-4(len),len
	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

#endif /* CONFIG_64BIT */
	/* loop until we are 32-bit aligned */
.Lalign_loop32:
	extru	dst,31,2,t1
	cmpib,=,n	0,t1,.Lcopy_loop_8
20:	ldb,ma	1(srcspc,src),t1
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lalign_loop32
	ldo	-1(len),len
	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

.Lcopy_loop_8:
	cmpib,COND(>>=),n 15,len,.Lbyte_loop

10:	ldw	0(srcspc,src),t1
11:	ldw	4(srcspc,src),t2
12:	stw,ma	t1,4(dstspc,dst)
13:	stw,ma	t2,4(dstspc,dst)
14:	ldw	8(srcspc,src),t1
15:	ldw	12(srcspc,src),t2
	ldo	16(src),src
16:	stw,ma	t1,4(dstspc,dst)
17:	stw,ma	t2,4(dstspc,dst)

	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(11b,.Lcopy8_fault)
	ASM_EXCEPTIONTABLE_ENTRY(12b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(13b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(14b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(15b,.Lcopy8_fault)
	ASM_EXCEPTIONTABLE_ENTRY(16b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(17b,.Lcopy_done)

	b	.Lcopy_loop_8
	ldo	-16(len),len

.Lbyte_loop:
	cmpclr,COND(<>) len,%r0,%r0
	b,n	.Lcopy_done
20:	ldb	0(srcspc,src),t1
	ldo	1(src),src
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lbyte_loop
	ldo	-1(len),len
	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

.Lcopy_done:
	bv	%r0(%r2)
	sub	end,dst,ret0
	/*
	 * src and dst are not aligned the same way;
	 * we need to go the hard way.
	 */
.Lunaligned_copy:
	/* align until dst is 32bit-word-aligned */
	extru	dst,31,2,t1
	cmpib,=,n	0,t1,.Lcopy_dstaligned
20:	ldb	0(srcspc,src),t1
	ldo	1(src),src
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lunaligned_copy
	ldo	-1(len),len
	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

.Lcopy_dstaligned:

	/* store src, dst and len in a safe place */
	copy	src,save_src
	copy	dst,save_dst
	copy	len,save_len

	/* len now needs to hold the number of words to copy */
	SHRREG	len,2,len
	/*
	 * Copy from an unaligned src to an aligned dst using shifts.
	 * Handles 4 words per loop.
	 */
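	/*
	 * How the merge works (sketch, for orientation only): the SAR is
	 * loaded with 32 - 8*(src & 3), src is rounded down to a word
	 * boundary, and each shrpw below concatenates two consecutive
	 * source words and extracts one aligned destination word.  The
	 * .Lcase0-.Lcase3 labels are Duff's-device style entry points
	 * into the 4-words-per-iteration loop, selected by len % 4.
	 *
	 * A rough C model of one merge step (off = src & 3, which is
	 * 1..3 on this path; big-endian 32-bit words):
	 *
	 *	uint32_t merge(uint32_t prev, uint32_t next, unsigned int off)
	 *	{
	 *		unsigned int sh = 8 * off;	// bits already consumed from prev
	 *
	 *		return (prev << sh) | (next >> (32 - sh));
	 *	}
	 */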
	depw,z	src,28,2,t0
	subi	32,t0,t0
	mtsar	t0
	extru	len,31,2,t0
	cmpib,=	2,t0,.Lcase2
	/* Make src aligned by rounding it down. */
	depi	0,31,2,src

	cmpiclr,<> 3,t0,%r0
	b,n	.Lcase3
	cmpiclr,<> 1,t0,%r0
	b,n	.Lcase1
.Lcase0:
	cmpb,COND(=) %r0,len,.Lcda_finish
	nop

1:	ldw,ma	4(srcspc,src), a3
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma	4(srcspc,src), a0
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	b,n	.Ldo3
.Lcase1:
1:	ldw,ma	4(srcspc,src), a2
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma	4(srcspc,src), a3
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	ldo	-1(len),len
	cmpb,COND(=),n %r0,len,.Ldo0
.Ldo4:
1:	ldw,ma	4(srcspc,src), a0
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw	a2, a3, %sar, t0
1:	stw,ma	t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
.Ldo3:
1:	ldw,ma	4(srcspc,src), a1
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw	a3, a0, %sar, t0
1:	stw,ma	t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
.Ldo2:
1:	ldw,ma	4(srcspc,src), a2
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw	a0, a1, %sar, t0
1:	stw,ma	t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
.Ldo1:
1:	ldw,ma	4(srcspc,src), a3
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw	a1, a2, %sar, t0
1:	stw,ma	t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
	ldo	-4(len),len
	cmpb,COND(<>) %r0,len,.Ldo4
	nop
.Ldo0:
	shrpw	a2, a3, %sar, t0
1:	stw,ma	t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)

.Lcda_rdfault:
.Lcda_finish:
	/* calculate new src, dst and len and jump to byte-copy loop */
	sub	dst,save_dst,t0
	add	save_src,t0,src
	b	.Lbyte_loop
	sub	save_len,t0,len

.Lcase3:
1:	ldw,ma	4(srcspc,src), a0
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma	4(srcspc,src), a1
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	b	.Ldo2
	ldo	1(len),len
.Lcase2:
1:	ldw,ma	4(srcspc,src), a1
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma	4(srcspc,src), a2
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	b	.Ldo1
	ldo	2(len),len

	/* fault exception fixup handlers: */
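	/*
	 * If the second load of a pair in the unrolled loops above faults,
	 * the first word has already been fetched; these handlers store it
	 * (advancing dst) before branching to .Lcopy_done, so the returned
	 * count (end - dst) only reports bytes that were really not copied.
	 */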
#ifdef CONFIG_64BIT
.Lcopy16_fault:
	b	.Lcopy_done
10:	std,ma	t1,8(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
#endif

.Lcopy8_fault:
	b	.Lcopy_done
10:	stw,ma	t1,4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
ENDPROC_CFI(pa_memcpy)

	.end