/* strncmp.S */
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2013-2022, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strncmp.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64.
 * MTE compatible.
 */

/* Expand to an assembler-local label (.Lname), kept out of the symbol table.  */
#define L(label) .L ## label

/* 0x01 / 0x7f replicated into every byte of a 64-bit word; together they
   implement the parallel NUL-byte test (X - 0x01..01) & ~(X | 0x7f..7f).  */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Parameters and result.  result shares x0 with src1.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		x0

/* Internal variables.  Note the deliberate aliases below: count reuses
   mask (x13) and offset reuses pos (x12); their live ranges do not overlap.  */
#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define has_nul		x5
#define diff		x6
#define syndrome	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define zeroones	x11
#define pos		x12
#define mask		x13
#define endloop		x14
#define count		mask
#define offset		pos
#define neg_offset	x15

/* Define endian dependent shift operations.
   On big-endian early bytes are at MSB and on little-endian LSB.
   LS_FW means shifting towards early bytes.
   LS_BK means shifting towards later bytes.
   */
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif
/*
 * int strncmp(const char *s1, const char *s2, size_t n)
 *
 * In:  src1 (x0) = s1, src2 (x1) = s2, limit (x2) = n
 * Out: result (x0) = 0 if the strings are equal within the first n bytes,
 *      otherwise the (zero-extended) byte difference at the first mismatch,
 *      matching the C strncmp sign convention.
 */
SYM_FUNC_START(__pi_strncmp)
	cbz	limit, L(ret0)		/* n == 0: compare equal.  */
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7		/* Same alignment mod 8?  */
	and	count, src1, #7		/* count = bytes past 8-byte alignment.  */
	b.ne	L(misaligned8)
	cbnz	count, L(mutual_align)

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
	.p2align 4
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	subs	limit, limit, #8
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	ccmp	endloop, #0, #0, eq
	b.eq	L(loop_aligned)
	/* End of main loop */

L(full_check):
#ifndef __AARCH64EB__
	orr	syndrome, diff, has_nul
	add	limit, limit, 8	/* Rewind limit to before last subs.  */
L(syndrome_check):
	/* Limit was reached.  Check if the NUL byte or the difference
	   is before the limit.  */
	rev	syndrome, syndrome	/* LE: byte-reverse so clz finds the
					   earliest marked byte.  */
	rev	data1, data1
	clz	pos, syndrome		/* pos = bit index of first event.  */
	rev	data2, data2
	lsl	data1, data1, pos	/* Bring the critical byte to the top.  */
	cmp	limit, pos, lsr #3	/* Did the event land within limit?  */
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	csel	result, result, xzr, hi	/* Past the limit: report equal.  */
	ret
#else
	/* Not reached the limit, must have found the end or a diff.  */
	tbz	limit, #63, L(not_limit)	/* limit >= 0: within bounds.  */
	add	tmp1, limit, 8
	cbz	limit, L(not_limit)
	lsl	limit, tmp1, #3	/* Bits -> bytes.  */
	mov	mask, #~0
	lsr	mask, mask, limit	/* Mask covering bytes past the limit.  */
	bic	data1, data1, mask
	bic	data2, data2, mask
	/* Make sure that the NUL byte is marked in the syndrome.  */
	orr	has_nul, has_nul, mask
L(not_limit):
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01.  */
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly.  We can't just subtract the bytes as the
	   MSB might be significant.  */
	cbnz	has_nul, 1f
	cmp	data1, data2		/* BE: whole-word compare gives byte order.  */
	cset	result, ne
	cneg	result, result, lo
	ret
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The most-significant-non-zero bit of the syndrome marks either the
	   first bit that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
L(end_quick):
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.
	   We also need to adjust the limit calculations, but without
	   overflowing if the limit is near ULONG_MAX.  */
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align).  */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
	/* Adjust the limit and ensure it doesn't overflow.  */
	adds	limit, limit, count
	csinv	limit, limit, xzr, lo	/* Saturate to ULONG_MAX on carry out.  */
	orr	data1, data1, tmp2	/* Force pre-start bytes to 0xff so they
					   compare equal and are never NUL.  */
	orr	data2, data2, tmp2
	b	L(start_realigned)

	.p2align 4
	/* Don't bother with dwords for up to 16 bytes.  */
L(misaligned8):
	cmp	limit, #16
	b.hs	L(try_misaligned_words)

L(byte_loop):
	/* Perhaps we can do better than this.  */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	/* Continue while: limit remains (hi), byte is non-NUL (data1w >= 1,
	   cs) and the bytes match (eq).  A failed ccmp forces NZCV=0 so the
	   loop exits.  */
	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.eq	L(byte_loop)
L(done):
	/* ldrb zero-extended both bytes, so this is the C return value.  */
	sub	result, data1, data2
	ret

	/* Align the SRC1 to a dword by doing a bytewise compare and then do
	   the dword loop.  */
L(try_misaligned_words):
	cbz	count, L(src1_aligned)

	neg	count, count
	and	count, count, #7	/* Bytes until src1 is 8-byte aligned.  */
	sub	limit, limit, count

L(page_end_loop):
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1		/* cs iff data1w != 0.  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.ne	L(done)			/* NUL or mismatch found.  */
	subs	count, count, #1
	b.hi	L(page_end_loop)

	/* The following diagram explains the comparison of misaligned strings.
	   The bytes are shown in natural order.  For little-endian, it is
	   reversed in the registers.  The "x" bytes are before the string.
	   The "|" separates data that is loaded at one time.
	   src1 | a a a a a a a a | b b b c c c c c | . . .
	   src2 | x x x x x a a a   a a a a a b b b | c c c c c . . .

	   After shifting in each step, the data looks like this:
	                STEP_A              STEP_B              STEP_C
	   data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
	   data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c

	   The bytes with "0" are eliminated from the syndrome via mask.

	   Align SRC2 down to 16 bytes.  This way we can read 16 bytes at a
	   time from SRC2.  The comparison happens in 3 steps.  After each step
	   the loop can exit, or read from SRC1 or SRC2.  */
L(src1_aligned):
	/* Calculate offset from 8 byte alignment to string start in bits.  No
	   need to mask offset since shifts are ignoring upper bits.  */
	lsl	offset, src2, #3
	bic	src2, src2, #0xf
	mov	mask, -1
	neg	neg_offset, offset
	ldr	data1, [src1], #8
	ldp	tmp1, tmp2, [src2], #16
	LS_BK	mask, mask, neg_offset
	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later.  */
	/* Skip the first compare if data in tmp1 is irrelevant.  */
	tbnz	offset, 6, L(misaligned_mid_loop)

L(loop_misaligned):
	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.  */
	LS_FW	data2, tmp1, offset
	LS_BK	tmp1, tmp2, neg_offset
	subs	limit, limit, #8
	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.  */
	sub	has_nul, data1, zeroones
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	orr	tmp3, data1, #REP8_7f
	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1.  */
	orr	tmp3, endloop, has_nul
	cbnz	tmp3, L(full_check)

	ldr	data1, [src1], #8
L(misaligned_mid_loop):
	/* STEP_B: Compare first part of data1 to second part of tmp2.  */
	LS_FW	data2, tmp2, offset
#ifdef __AARCH64EB__
	/* For big-endian we do a byte reverse to avoid carry-propagation
	   problem described above.  This way we can reuse the has_nul in the
	   next step and also use syndrome value trick at the end.  */
	rev	tmp3, data1
#define data1_fixed tmp3
#else
#define data1_fixed data1
#endif
	sub	has_nul, data1_fixed, zeroones
	orr	tmp3, data1_fixed, #REP8_7f
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	cmp	limit, neg_offset, lsr #3
	orr	syndrome, diff, has_nul
	bic	syndrome, syndrome, mask	/* Ignore later bytes.  */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
	cbnz	tmp3, L(syndrome_check)

	/* STEP_C: Compare second part of data1 to first part of tmp1.  */
	ldp	tmp1, tmp2, [src2], #16
	cmp	limit, #8
	LS_BK	data2, tmp1, neg_offset
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	orr	syndrome, diff, has_nul
	and	syndrome, syndrome, mask	/* Ignore earlier bytes.  */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
	cbnz	tmp3, L(syndrome_check)

	ldr	data1, [src1], #8
	sub	limit, limit, #8
	b	L(loop_misaligned)

#ifdef __AARCH64EB__
L(syndrome_check):
	clz	pos, syndrome
	cmp	pos, limit, lsl #3	/* Event past the limit => equal.  */
	b.lo	L(end_quick)
#endif

L(ret0):
	mov	result, #0
	ret
SYM_FUNC_END(__pi_strncmp)
SYM_FUNC_ALIAS_WEAK(strncmp, __pi_strncmp)
EXPORT_SYMBOL_NOKASAN(strncmp)