memmove.S

/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2022 Michael T. Kloos <[email protected]>
 */

#include <linux/linkage.h>
#include <asm/asm.h>

SYM_FUNC_START(__memmove)
SYM_FUNC_START_WEAK(memmove)
	/*
	 * Returns
	 *   a0 - dest
	 *
	 * Parameters
	 *   a0 - Inclusive first byte of dest
	 *   a1 - Inclusive first byte of src
	 *   a2 - Length of copy n
	 *
	 * Because the return matches the parameter register a0,
	 * we will not clobber or modify that register.
	 *
	 * Note: This currently only works on little-endian.
	 * To port to big-endian, reverse the direction of shifts
	 * in the 2 misaligned fixup copy loops.
	 */
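	/*
	 * For reference, the C-level contract implemented here is the
	 * standard one (illustrative sketch only, not generated code):
	 *
	 *	void *memmove(void *dest, const void *src, size_t n);
	 *
	 * i.e. copy n bytes from src to dest, tolerate overlapping
	 * regions, and return dest (which is why a0 is never clobbered).
	 */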

	/* Return if nothing to do */
	beq	a0, a1, return_from_memmove
	beqz	a2, return_from_memmove

	/*
	 * Register Uses
	 *      Forward Copy: a1 - Index counter of src
	 *      Reverse Copy: a4 - Index counter of src
	 *      Forward Copy: t3 - Index counter of dest
	 *      Reverse Copy: t4 - Index counter of dest
	 *   Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest
	 *   Both Copy Modes: t6 - Non-Inclusive last multibyte/aligned of dest
	 *   Both Copy Modes: t0 - Link / Temporary for load-store
	 *   Both Copy Modes: t1 - Temporary for load-store
	 *   Both Copy Modes: t2 - Temporary for load-store
	 *   Both Copy Modes: a5 - dest to src alignment offset
	 *   Both Copy Modes: a6 - Shift amount
	 *   Both Copy Modes: a7 - Inverse shift amount
	 *   Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
	 */

	/*
	 * Solve for some register values now.
	 * Byte copy does not need t5 or t6.
	 */
	mv	t3, a0
	add	t4, a0, a2
	add	a4, a1, a2

	/*
	 * Byte copy if copying less than (2 * SZREG) bytes. This can
	 * cause problems with the bulk copy implementation and is
	 * small enough not to bother.
	 */
	andi	t0, a2, -(2 * SZREG)
	beqz	t0, byte_copy
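	/*
	 * Rough C equivalent of the size check above (illustrative sketch):
	 * masking n with -(2 * SZREG) clears the low bits, so the result is
	 * zero exactly when n < 2 * SZREG.
	 *
	 *	if ((n & ~(size_t)(2 * SZREG - 1)) == 0)
	 *		goto byte_copy;
	 */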

	/*
	 * Now solve for t5 and t6.
	 */
	andi	t5, t3, -SZREG
	andi	t6, t4, -SZREG
	/*
	 * If dest (register t3) rounded down to the nearest naturally
	 * aligned SZREG address does not equal dest, then add SZREG
	 * to find the low-bound of SZREG alignment in the dest memory
	 * region.  Note that this could overshoot the dest memory
	 * region if n is less than SZREG.  This is one reason why
	 * we always byte copy if n is less than SZREG.
	 * Otherwise, dest is already naturally aligned to SZREG.
	 */
	beq	t5, t3, 1f
		addi	t5, t5, SZREG
	1:
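	/*
	 * The rounding above, written as C (illustrative sketch):
	 *
	 *	t5 = dest & ~(uintptr_t)(SZREG - 1);	// round down
	 *	if (t5 != dest)
	 *		t5 += SZREG;	// net effect: round dest up to SZREG
	 */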

	/*
	 * If the dest and src are co-aligned to SZREG, then there is
	 * no need for the full rigmarole of a misaligned fixup copy.
	 * Instead, do a simpler co-aligned copy.
	 */
	xor	t0, a0, a1
	andi	t1, t0, (SZREG - 1)
	beqz	t1, coaligned_copy
	/* Fall through to misaligned fixup copy */
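	/*
	 * The dispatch above, roughly in C (illustrative sketch): dest and
	 * src are co-aligned when they share the same offset within an
	 * SZREG-sized word.
	 *
	 *	if ((((uintptr_t)dest ^ (uintptr_t)src) & (SZREG - 1)) == 0)
	 *		goto coaligned_copy;
	 *	// otherwise fall through to the misaligned fixup copy
	 */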

misaligned_fixup_copy:
	bltu	a1, a0, misaligned_fixup_copy_reverse

misaligned_fixup_copy_forward:
	jal	t0, byte_copy_until_aligned_forward

	andi	a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
	slli	a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub	a5, a1, t3 /* Find the difference between src and dest */
	andi	a1, a1, -SZREG /* Align the src pointer */
	addi	a2, t6, SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * Two's complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN.  XLEN = SZREG * 8.
	 */
	not	a7, a6
	addi	a7, a7, (SZREG * 8 + 1)
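	/*
	 * Worked example (RV64, SZREG = 8, XLEN = 64; values assumed for
	 * illustration only): if src sits 3 bytes past an aligned address,
	 * then a5 = 3, a6 = 24, and a7 = ~24 + 64 + 1 = 40, so each stored
	 * word is (low_word >> 24) | (high_word << 40); the two shift
	 * amounts always total 64 bits.
	 */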

	/*
	 * Fix Misalignment Copy Loop - Forward
	 * load_val0 = load_ptr[0];
	 * do {
	 *	load_val1 = load_ptr[1];
	 *	store_ptr += 2;
	 *	store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 *	if (store_ptr == {a2})
	 *		break;
	 *
	 *	load_val0 = load_ptr[2];
	 *	load_ptr += 2;
	 *	store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L	t0, (0 * SZREG)(a1)
	1:
	REG_L	t1, (1 * SZREG)(a1)
	addi	t3, t3, (2 * SZREG)
	srl	t0, t0, a6
	sll	t2, t1, a7
	or	t2, t0, t2
	REG_S	t2, ((0 * SZREG) - (2 * SZREG))(t3)
	beq	t3, a2, 2f
	REG_L	t0, (2 * SZREG)(a1)
	addi	a1, a1, (2 * SZREG)
	srl	t1, t1, a6
	sll	t2, t0, a7
	or	t2, t1, t2
	REG_S	t2, ((1 * SZREG) - (2 * SZREG))(t3)
	bne	t3, t6, 1b
	2:
	mv	t3, t6 /* Fix the dest pointer in case the loop was broken */

	add	a1, t3, a5 /* Restore the src pointer */
	j	byte_copy_forward /* Copy any remaining bytes */

misaligned_fixup_copy_reverse:
	jal	t0, byte_copy_until_aligned_reverse

	andi	a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
	slli	a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub	a5, a4, t4 /* Find the difference between src and dest */
	andi	a4, a4, -SZREG /* Align the src pointer */
	addi	a2, t5, -SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * Two's complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN.  XLEN = SZREG * 8.
	 */
	not	a7, a6
	addi	a7, a7, (SZREG * 8 + 1)
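	/*
	 * Same shift computation as the forward path; with the assumed
	 * 3-byte misalignment on RV64 this again gives a6 = 24 and a7 = 40,
	 * only the loop below walks the buffers downward.
	 */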

	/*
	 * Fix Misalignment Copy Loop - Reverse
	 * load_val1 = load_ptr[0];
	 * do {
	 *	load_val0 = load_ptr[-1];
	 *	store_ptr -= 2;
	 *	store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 *	if (store_ptr == {a2})
	 *		break;
	 *
	 *	load_val1 = load_ptr[-2];
	 *	load_ptr -= 2;
	 *	store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L	t1, ( 0 * SZREG)(a4)
	1:
	REG_L	t0, (-1 * SZREG)(a4)
	addi	t4, t4, (-2 * SZREG)
	sll	t1, t1, a7
	srl	t2, t0, a6
	or	t2, t1, t2
	REG_S	t2, ( 1 * SZREG)(t4)
	beq	t4, a2, 2f
	REG_L	t1, (-2 * SZREG)(a4)
	addi	a4, a4, (-2 * SZREG)
	sll	t0, t0, a7
	srl	t2, t1, a6
	or	t2, t0, t2
	REG_S	t2, ( 0 * SZREG)(t4)
	bne	t4, t5, 1b
	2:
	mv	t4, t5 /* Fix the dest pointer in case the loop was broken */

	add	a4, t4, a5 /* Restore the src pointer */
	j	byte_copy_reverse /* Copy any remaining bytes */

	/*
	 * Simple copy loops for SZREG co-aligned memory locations.
	 * These also make calls to do byte copies for any unaligned
	 * data at their terminations.
	 */
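	/*
	 * Shape of the co-aligned loops below, as C (illustrative sketch;
	 * pointer names are hypothetical):
	 *
	 *	do {
	 *		*dest_w++ = *src_w++;	// one SZREG-sized word per step
	 *	} while (dest_w != dest_w_end);
	 *
	 * The reverse variant pre-decrements and walks downward instead.
	 */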
coaligned_copy:
	bltu	a1, a0, coaligned_copy_reverse

coaligned_copy_forward:
	jal	t0, byte_copy_until_aligned_forward

	1:
	REG_L	t1, ( 0 * SZREG)(a1)
	addi	a1, a1, SZREG
	addi	t3, t3, SZREG
	REG_S	t1, (-1 * SZREG)(t3)
	bne	t3, t6, 1b

	j	byte_copy_forward /* Copy any remaining bytes */

coaligned_copy_reverse:
	jal	t0, byte_copy_until_aligned_reverse

	1:
	REG_L	t1, (-1 * SZREG)(a4)
	addi	a4, a4, -SZREG
	addi	t4, t4, -SZREG
	REG_S	t1, ( 0 * SZREG)(t4)
	bne	t4, t5, 1b

	j	byte_copy_reverse /* Copy any remaining bytes */

	/*
	 * These are basically sub-functions within the function. They
	 * are used to byte copy until the dest pointer is in alignment,
	 * at which point a bulk copy method can be used by the
	 * calling code. These work on the same registers as the bulk
	 * copy loops. Therefore, the register values can be picked
	 * up from where they were left, and we avoid code duplication
	 * without any overhead except the call-in and return jumps.
	 */
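	/*
	 * In C terms, the forward helper below amounts to (illustrative
	 * sketch; pointer names are hypothetical):
	 *
	 *	while (dest_b != dest_aligned_start)	// i.e. t3 != t5
	 *		*dest_b++ = *src_b++;
	 *
	 * before jumping back through t0 to the bulk copy code.
	 */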
byte_copy_until_aligned_forward:
	beq	t3, t5, 2f
	1:
	lb	t1, 0(a1)
	addi	a1, a1, 1
	addi	t3, t3, 1
	sb	t1, -1(t3)
	bne	t3, t5, 1b
	2:
	jalr	zero, 0x0(t0) /* Return to multibyte copy loop */

byte_copy_until_aligned_reverse:
	beq	t4, t6, 2f
	1:
	lb	t1, -1(a4)
	addi	a4, a4, -1
	addi	t4, t4, -1
	sb	t1, 0(t4)
	bne	t4, t6, 1b
	2:
	jalr	zero, 0x0(t0) /* Return to multibyte copy loop */

	/*
	 * Simple byte copy loops.
	 * These will byte copy until they reach the end of data to copy.
	 * At that point, they return from memmove.
	 */
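	/*
	 * Both byte copy loops below follow the same pattern (illustrative
	 * C sketch; pointer names are hypothetical):
	 *
	 *	while (dest_b != dest_end)	// t3 != t4 (reverse: t4 != t3)
	 *		*dest_b++ = *src_b++;	// reverse variant decrements
	 */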
byte_copy:
	bltu	a1, a0, byte_copy_reverse

byte_copy_forward:
	beq	t3, t4, 2f
	1:
	lb	t1, 0(a1)
	addi	a1, a1, 1
	addi	t3, t3, 1
	sb	t1, -1(t3)
	bne	t3, t4, 1b
	2:
	ret

byte_copy_reverse:
	beq	t4, t3, 2f
	1:
	lb	t1, -1(a4)
	addi	a4, a4, -1
	addi	t4, t4, -1
	sb	t1, 0(t4)
	bne	t4, t3, 1b
	2:

return_from_memmove:
	ret

SYM_FUNC_END(memmove)
SYM_FUNC_END(__memmove)