/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex-strings work originally authored by
 * Linaro and can be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Fill in the buffer with character c (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - buf
 *	x1 - c
 *	x2 - n
 * Returns:
 *	x0 - buf
 */

dstin		.req	x0
val		.req	w1
count		.req	x2
tmp1		.req	x3
tmp1w		.req	w3
tmp2		.req	x4
tmp2w		.req	w4
zva_len_x	.req	x5
zva_len		.req	w5
zva_bits_x	.req	x6
A_l		.req	x7
A_lw		.req	w7
dst		.req	x8
tmp3w		.req	w9
tmp3		.req	x9

SYM_FUNC_START(__pi_memset)
	mov	dst, dstin		/* Preserve return value. */
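	/* Replicate the low byte of the fill value across all 64 bits of A_l. */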
	and	A_lw, val, #255
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32

	cmp	count, #15
	b.hi	.Lover16_proc
	/*
	 * At most 15 bytes: store 8/4/2/1 bytes according to bits 3..0 of
	 * count. Any of these stores may be unaligned.
	 */
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 2f
	str	A_lw, [dst], #4
2:
	tbz	count, #1, 3f
	strh	A_lw, [dst], #2
3:
	tbz	count, #0, 4f
	strb	A_lw, [dst]
4:
	ret

.Lover16_proc:
	/* Is the start address 16-byte aligned? */
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	.Laligned
	/*
	 * Count is at least 16, so a single stp can store the first 16 bytes;
	 * then advance dst to the next 16-byte boundary. Some of these bytes
	 * will be written again by the aligned stores that follow.
	 */
	stp	A_l, A_l, [dst]		/* unaligned store */
	/* Make dst 16-byte aligned. */
	sub	count, count, tmp2
	add	dst, dst, tmp2

.Laligned:
	cbz	A_l, .Lzero_mem

.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short
.Ltail63:
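	/*
	 * 1 to 63 bytes remain. Bits [5:4] of count select how many of the
	 * three 16-byte stores below execute; the final 0-15 bytes are
	 * handled at label 3.
	 */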
	ands	tmp1, count, #0x30
	b.eq	3f
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst], #16
1:
	stp	A_l, A_l, [dst], #16
2:
	stp	A_l, A_l, [dst], #16
	/*
	 * Fewer than 16 bytes are left: use one stp to write the final 16
	 * bytes ending at dst + count. Some bytes may be written twice and
	 * the access may be unaligned.
	 */
3:
	ands	count, count, #15
	cbz	count, 4f
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
4:
	ret

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lnot_short:
	sub	dst, dst, #16		/* Pre-bias. */
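	/*
	 * dst is biased down by 16 so that the pre-indexed stp at the end of
	 * the loop both stores the last 16 bytes of the block and advances
	 * dst for the next iteration.
	 */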
	sub	count, count, #64
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
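	/* Undo the pre-bias; branch to .Ltail63 if 1-63 bytes remain. */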
	tst	count, #0x3f
	add	dst, dst, #16
	b.ne	.Ltail63
.Lexitfunc:
	ret

	/*
	 * For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.
	 */
.Lzero_mem:
	cmp	count, #63
	b.le	.Ltail63
	/*
	 * For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.
	 */
	cmp	count, #128
	b.lt	.Lnot_short		/* Fall through: count is at least 128 bytes. */
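	/*
	 * DCZID_EL0: bit 4 (DZP) set means DC ZVA is prohibited; bits [3:0]
	 * hold log2 of the block size in words, so the size in bytes is
	 * 4 << BS.
	 */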
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved. */
	lsl	zva_len, tmp3w, zva_len

	ands	tmp3w, zva_len, #63
	/*
	 * Fall back to the plain store loop unless the ZVA block size is at
	 * least 64 bytes; ZVA is not worthwhile for smaller blocks.
	 */
	b.ne	.Lnot_short
.Lzero_by_line:
	/*
	 * Compute how far we need to go to become suitably aligned. We're
	 * already at quad-word alignment.
	 */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment. */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	2f			/* Already aligned. */
	/* Not aligned; check that there is enough left to zero after aligning. */
	sub	tmp1, count, tmp2
	/*
	 * Guarantee that the length remaining after alignment is at least 64
	 * bytes and at least one ZVA block, so that the code at 2: cannot run
	 * past the end of the memory range.
	 */
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
	b.lt	.Lnot_short
	/*
	 * We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.
	 */
	mov	count, tmp1
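	/*
	 * Store 64 bytes per iteration until dst passes the ZVA alignment
	 * point; the overshoot is corrected just after the loop.
	 */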
1:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	1b

	/* We've overrun a bit, so adjust dst downwards. */
	add	dst, dst, tmp2
2:
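	/*
	 * dst is now ZVA-aligned: pre-bias count, then zero one whole block
	 * per DC ZVA until fewer than zva_len bytes remain.
	 */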
	sub	count, count, zva_len_x
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
SYM_FUNC_END(__pi_memset)

SYM_FUNC_ALIAS(__memset, __pi_memset)
EXPORT_SYMBOL(__memset)

SYM_FUNC_ALIAS_WEAK(memset, __pi_memset)
EXPORT_SYMBOL(memset)