/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2012-2021, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */
#define L(label) .L ## label

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14
/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration. The destination pointer is 16-byte aligned to minimize
   unaligned accesses. The loop tail is handled by always copying 64 bytes
   from the end.
*/
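/* Small copies (0..32 bytes) are handled inline after the entry point,
   medium copies (33..128 bytes) by L(copy32_128), and large copies by
   L(copy_long), which falls back to L(copy_long_backwards) when the
   destination starts inside the source buffer. */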

SYM_FUNC_START(__pi_memcpy)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes. */
	cmp	count, 16
	b.lo	L(copy16)
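	/* 16..32 bytes: copy 16 bytes from the start and 16 bytes from the
	   end; the two stores may overlap in the middle. */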
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes. */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes. */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence. */
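	/* tmp1 = count / 2 selects a "middle" byte: for count 1 all three
	   accesses hit the same byte, for count 2 they cover both bytes,
	   and for count 3 they cover the first, middle and last byte. */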
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes. */
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes. */
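	/* 65..96 bytes: 64 bytes from the start plus 32 bytes from the end.
	   97..128 bytes additionally copy the 32 bytes at [srcend - 64, srcend - 32). */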
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes. */
L(copy_long):
	/* Use backwards copy if there is an overlap. */
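	/* The unsigned comparison (dstin - src) < count is true only when
	   the destination starts inside the source buffer, in which case a
	   forward copy would overwrite source bytes before they are read. */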
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment. */
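	/* src is moved back by the same amount as dst so the paired offsets
	   below stay in sync; count grows by the misalignment (0..15). */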
	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large. */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count. */
	b.ls	L(copy64_from_end)
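
	/* Software pipelined loop: each iteration stores the 64 bytes loaded
	   by the previous one while loading the next 64 bytes. */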
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end. */
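	/* The final 64 bytes come from srcend - 64 and may overlap the
	   stores above, covering any remaining length without a branch. */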
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4
	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dst to 16-byte alignment. */
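	/* Mirror of the forward path: dstend is aligned down to 16 bytes
	   and srcend/count are adjusted by the same amount. */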
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start. */
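	/* Mirrors L(copy64_from_end): the lowest-addressed 64 bytes are
	   copied from [src, src + 64). */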
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret
SYM_FUNC_END(__pi_memcpy)

SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)
EXPORT_SYMBOL(__memcpy)
SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)
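
/* Since the routine above handles overlapping buffers, memmove can simply
   alias memcpy. */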
SYM_FUNC_ALIAS(__pi_memmove, __pi_memcpy)

SYM_FUNC_ALIAS(__memmove, __pi_memmove)
EXPORT_SYMBOL(__memmove)
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
EXPORT_SYMBOL(memmove)