/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2012-2022, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strcmp.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64.
 * MTE compatible.
 */
#define L(label) .L ## label

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Parameters and result.  */
#define src1		x0
#define src2		x1
#define result		x0

/* Internal variables.  */
#define data1		x2
#define data1w		w2
#define data2		x3
#define data2w		w3
#define has_nul		x4
#define diff		x5
#define off1		x5
#define syndrome	x6
#define tmp		x6
#define data3		x7
#define zeroones	x8
#define shift		x9
#define off2		x10
/* On big-endian, early bytes are at the MSB; on little-endian, at the LSB.
   LS_FW means shifting towards early bytes.  */
#ifdef __AARCH64EB__
# define LS_FW lsl
#else
# define LS_FW lsr
#endif
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
   can be done in parallel across the entire word.
   Since carry propagation makes 0x1 bytes before a NUL byte appear
   NUL too in big-endian, byte-reverse the data before the NUL check.  */
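/* Worked example for a single byte X:
     X = 0x00: (X - 1) & ~(X | 0x7f) = 0xff & 0x80 = 0x80  (NUL found)
     X = 0x01: 0x00 & 0x80 = 0x00
     X = 0x80: 0x7f & 0x00 = 0x00
   Word-wide, the borrow out of a 0x00 byte in "data - REP8_01" can
   make a more significant 0x01 byte flag as NUL as well.  In
   little-endian order, more significant means later in the string, so
   the false positive is harmless; big-endian data is byte-reversed
   first for the same reason.  */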
SYM_FUNC_START(__pi_strcmp)
	sub	off2, src2, src1
	mov	zeroones, REP8_01
	and	tmp, src1, 7
	tst	off2, 7
	b.ne	L(misaligned8)
	cbnz	tmp, L(mutual_align)
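	/* Fall through: both sources are 8-byte aligned.  Compare one
	   word per iteration until the words differ or data1 contains
	   a NUL byte.  */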
	.p2align 4

L(loop_aligned):
	ldr	data2, [src1, off2]
	ldr	data1, [src1], 8
L(start_realigned):
#ifdef __AARCH64EB__
	rev	tmp, data1
	sub	has_nul, tmp, zeroones
	orr	tmp, tmp, REP8_7f
#else
	sub	has_nul, data1, zeroones
	orr	tmp, data1, REP8_7f
#endif
	bics	has_nul, has_nul, tmp	/* Non-zero if NUL terminator.  */
	ccmp	data1, data2, 0, eq
	b.eq	L(loop_aligned)
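	/* Only reached when the words differ or data1 has a NUL byte:
	   the ccmp forces "not equal" whenever has_nul is non-zero.
	   Merge both conditions into one syndrome word so a single clz
	   in L(end) can locate the first mismatch or NUL.  */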
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	eor	diff, data1, data2
	orr	syndrome, diff, has_nul
L(end):
#ifndef __AARCH64EB__
	rev	syndrome, syndrome
	rev	data1, data1
	rev	data2, data2
#endif
	clz	shift, syndrome
	/* The most-significant-non-zero bit of the syndrome marks either the
	   first bit that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
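	/* Example: comparing "ab\0..." against "abc..." leaves the
	   first marked bit in the third byte; with early bytes at the
	   top that is bit 47, so shift = 16 and the code below extracts
	   0x00 from data1 and 'c' (0x63) from data2: a negative result.  */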
	lsl	data1, data1, shift
	lsl	data2, data2, shift
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, 56
	sub	result, data1, data2, lsr 56
	ret
	.p2align 4

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.  */
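	/* Example: if both sources start at offset 5 within a word,
	   shift = -(5 * 8) mod 64 = 24, and on little-endian
	   tmp = 0xffffffffffffffff lsr 24 = 0x000000ffffffffff.  Orring
	   that in forces the five bytes before the start point to 0xff
	   in both words, so they compare equal and are never NUL.  */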
	bic	src1, src1, 7
	ldr	data2, [src1, off2]
	ldr	data1, [src1], 8
	neg	shift, src2, lsl 3	/* Bits to alignment -64.  */
	mov	tmp, -1
	LS_FW	tmp, tmp, shift
	orr	data1, data1, tmp
	orr	data2, data2, tmp
	b	L(start_realigned)
L(misaligned8):
	/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
	   checking to make sure that we don't access beyond the end of SRC2.  */
	cbz	tmp, L(src1_aligned)
L(do_misaligned):
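	/* Byte-at-a-time loop: stop at the first NUL in SRC1 or the
	   first mismatch.  When data1w is NUL the ccmp condition fails
	   and NZCV is forced to 0b0000 ("not equal"), so one b.ne
	   covers both exit conditions.  */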
	ldrb	data1w, [src1], 1
	ldrb	data2w, [src2], 1
	cmp	data1w, 0
	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
	b.ne	L(done)
	tst	src1, 7
	b.ne	L(do_misaligned)

L(src1_aligned):
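	/* SRC1 is now 8-byte aligned but SRC2 is not.  Load the first
	   aligned word covering SRC2 and force an 0x01 into each byte
	   that precedes the real start, so those bytes can never pass
	   the NUL test.  */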
	neg	shift, src2, lsl 3
	bic	src2, src2, 7
	ldr	data3, [src2], 8
#ifdef __AARCH64EB__
	rev	data3, data3
#endif
	lsr	tmp, zeroones, shift
	orr	data3, data3, tmp
	sub	has_nul, data3, zeroones
	orr	tmp, data3, REP8_7f
	bics	has_nul, has_nul, tmp
	b.ne	L(tail)

	sub	off1, src2, src1

	.p2align 4
L(loop_unaligned):
	ldr	data3, [src1, off1]
	ldr	data2, [src1, off2]
#ifdef __AARCH64EB__
	rev	data3, data3
#endif
	sub	has_nul, data3, zeroones
	orr	tmp, data3, REP8_7f
	ldr	data1, [src1], 8
	bics	has_nul, has_nul, tmp
	ccmp	data1, data2, 0, eq
	b.eq	L(loop_unaligned)
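	/* data3 is the aligned SRC2 word, which runs (8 - misalignment)
	   bytes ahead of data2.  Shift its NUL marks back into data2's
	   byte positions; marks that shift out belong to bytes not yet
	   compared and are handled by L(tail).  */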
	lsl	tmp, has_nul, shift
#ifdef __AARCH64EB__
	rev	tmp, tmp
#endif
	eor	diff, data1, data2
	orr	syndrome, diff, tmp
	cbnz	syndrome, L(end)

L(tail):
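	/* A NUL byte was found in data3, the aligned SRC2 word that runs
	   ahead of the bytes compared so far.  Shift data3 (and its NUL
	   marks) down so its remaining upper bytes line up with one
	   final word of SRC1, then compare as usual.  */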
	ldr	data1, [src1]
	neg	shift, shift
	lsr	data2, data3, shift
	lsr	has_nul, has_nul, shift
#ifdef __AARCH64EB__
	rev	data2, data2
	rev	has_nul, has_nul
#endif
	eor	diff, data1, data2
	orr	syndrome, diff, has_nul
	b	L(end)

L(done):
	sub	result, data1, data2
	ret

SYM_FUNC_END(__pi_strcmp)
SYM_FUNC_ALIAS_WEAK(strcmp, __pi_strcmp)
EXPORT_SYMBOL_NOKASAN(strcmp)