/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2013-2021, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/98e4d6a5c13c8e54/string/aarch64/strlen.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/mte-def.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
 */

#define L(label) .L ## label

/* Arguments and results.  */
#define srcin		x0	/* in:  pointer to the string */
#define len		x0	/* out: computed length (reuses x0) */

/* Locals and temporaries.  Note tmp1/tmp2 alias has_nul1/has_nul2:
   the temporaries are dead once the NUL syndromes are live.  */
#define src		x1
#define data1		x2
#define data2		x3
#define has_nul1	x4
#define has_nul2	x5
#define tmp1		x4
#define tmp2		x5
#define tmp3		x6
#define tmp4		x7
#define zeroones	x8

/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
   can be done in parallel across the entire word. A faster check
   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
   false hits for characters 129..255.	*/

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

/*
 * When KASAN_HW_TAGS is in use, memory is checked at MTE_GRANULE_SIZE
 * (16-byte) granularity, and we must ensure that no access straddles this
 * alignment boundary.
 */
#ifdef CONFIG_KASAN_HW_TAGS
#define MIN_PAGE_SIZE MTE_GRANULE_SIZE
#else
#define MIN_PAGE_SIZE 4096
#endif

/* Since strings are short on average, we check the first 16 bytes
   of the string for a NUL character.  In order to do an unaligned ldp
   safely we have to do a page cross check first.  If there is a NUL
   byte we calculate the length from the 2 8-byte words using
   conditional select to reduce branch mispredictions (it is unlikely
   strlen will be repeatedly called on strings with the same length).

   If the string is longer than 16 bytes, we align src so don't need
   further page cross checks, and process 32 bytes per iteration
   using the fast NUL check.  If we encounter non-ASCII characters,
   fallback to a second loop using the full NUL check.

   If the page cross check fails, we read 16 bytes from an aligned
   address, remove any characters before the string, and continue
   in the main loop using aligned loads.  Since strings crossing a
   page in the first 16 bytes are rare (probability of
   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.

   AArch64 systems have a minimum page size of 4k.  We don't bother
   checking for larger page sizes - the cost of setting up the correct
   page size is just not worth the extra gain from a small reduction in
   the cases taking the slow path.  Note that we only care about
   whether the first fetch, which may be misaligned, crosses a page
   boundary.  */
  69. SYM_FUNC_START(__pi_strlen)
  70. and tmp1, srcin, MIN_PAGE_SIZE - 1
  71. mov zeroones, REP8_01
  72. cmp tmp1, MIN_PAGE_SIZE - 16
  73. b.gt L(page_cross)
  74. ldp data1, data2, [srcin]
  75. #ifdef __AARCH64EB__
  76. /* For big-endian, carry propagation (if the final byte in the
  77. string is 0x01) means we cannot use has_nul1/2 directly.
  78. Since we expect strings to be small and early-exit,
  79. byte-swap the data now so has_null1/2 will be correct. */
  80. rev data1, data1
  81. rev data2, data2
  82. #endif
  83. sub tmp1, data1, zeroones
  84. orr tmp2, data1, REP8_7f
  85. sub tmp3, data2, zeroones
  86. orr tmp4, data2, REP8_7f
  87. bics has_nul1, tmp1, tmp2
  88. bic has_nul2, tmp3, tmp4
  89. ccmp has_nul2, 0, 0, eq
  90. beq L(main_loop_entry)
  91. /* Enter with C = has_nul1 == 0. */
  92. csel has_nul1, has_nul1, has_nul2, cc
  93. mov len, 8
  94. rev has_nul1, has_nul1
  95. clz tmp1, has_nul1
  96. csel len, xzr, len, cc
  97. add len, len, tmp1, lsr 3
  98. ret
  99. /* The inner loop processes 32 bytes per iteration and uses the fast
  100. NUL check. If we encounter non-ASCII characters, use a second
  101. loop with the accurate NUL check. */
  102. .p2align 4
  103. L(main_loop_entry):
  104. bic src, srcin, 15
  105. sub src, src, 16
  106. L(main_loop):
  107. ldp data1, data2, [src, 32]!
  108. L(page_cross_entry):
  109. sub tmp1, data1, zeroones
  110. sub tmp3, data2, zeroones
  111. orr tmp2, tmp1, tmp3
  112. tst tmp2, zeroones, lsl 7
  113. bne 1f
  114. ldp data1, data2, [src, 16]
  115. sub tmp1, data1, zeroones
  116. sub tmp3, data2, zeroones
  117. orr tmp2, tmp1, tmp3
  118. tst tmp2, zeroones, lsl 7
  119. beq L(main_loop)
  120. add src, src, 16
  121. 1:
  122. /* The fast check failed, so do the slower, accurate NUL check. */
  123. orr tmp2, data1, REP8_7f
  124. orr tmp4, data2, REP8_7f
  125. bics has_nul1, tmp1, tmp2
  126. bic has_nul2, tmp3, tmp4
  127. ccmp has_nul2, 0, 0, eq
  128. beq L(nonascii_loop)
  129. /* Enter with C = has_nul1 == 0. */
  130. L(tail):
  131. #ifdef __AARCH64EB__
  132. /* For big-endian, carry propagation (if the final byte in the
  133. string is 0x01) means we cannot use has_nul1/2 directly. The
  134. easiest way to get the correct byte is to byte-swap the data
  135. and calculate the syndrome a second time. */
  136. csel data1, data1, data2, cc
  137. rev data1, data1
  138. sub tmp1, data1, zeroones
  139. orr tmp2, data1, REP8_7f
  140. bic has_nul1, tmp1, tmp2
  141. #else
  142. csel has_nul1, has_nul1, has_nul2, cc
  143. #endif
  144. sub len, src, srcin
  145. rev has_nul1, has_nul1
  146. add tmp2, len, 8
  147. clz tmp1, has_nul1
  148. csel len, len, tmp2, cc
  149. add len, len, tmp1, lsr 3
  150. ret
  151. L(nonascii_loop):
  152. ldp data1, data2, [src, 16]!
  153. sub tmp1, data1, zeroones
  154. orr tmp2, data1, REP8_7f
  155. sub tmp3, data2, zeroones
  156. orr tmp4, data2, REP8_7f
  157. bics has_nul1, tmp1, tmp2
  158. bic has_nul2, tmp3, tmp4
  159. ccmp has_nul2, 0, 0, eq
  160. bne L(tail)
  161. ldp data1, data2, [src, 16]!
  162. sub tmp1, data1, zeroones
  163. orr tmp2, data1, REP8_7f
  164. sub tmp3, data2, zeroones
  165. orr tmp4, data2, REP8_7f
  166. bics has_nul1, tmp1, tmp2
  167. bic has_nul2, tmp3, tmp4
  168. ccmp has_nul2, 0, 0, eq
  169. beq L(nonascii_loop)
  170. b L(tail)
  171. /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
  172. srcin to 0x7f, so we ignore any NUL bytes before the string.
  173. Then continue in the aligned loop. */
  174. L(page_cross):
  175. bic src, srcin, 15
  176. ldp data1, data2, [src]
  177. lsl tmp1, srcin, 3
  178. mov tmp4, -1
  179. #ifdef __AARCH64EB__
  180. /* Big-endian. Early bytes are at MSB. */
  181. lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
  182. #else
  183. /* Little-endian. Early bytes are at LSB. */
  184. lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
  185. #endif
  186. orr tmp1, tmp1, REP8_80
  187. orn data1, data1, tmp1
  188. orn tmp2, data2, tmp1
  189. tst srcin, 8
  190. csel data1, data1, tmp4, eq
  191. csel data2, data2, tmp2, eq
  192. b L(page_cross_entry)
  193. SYM_FUNC_END(__pi_strlen)
  194. SYM_FUNC_ALIAS_WEAK(strlen, __pi_strlen)
  195. EXPORT_SYMBOL_NOKASAN(strlen)