strnlen.S 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162
  1. /* SPDX-License-Identifier: GPL-2.0-only */
  2. /*
  3. * Copyright (C) 2013 ARM Ltd.
  4. * Copyright (C) 2013 Linaro.
  5. *
  6. * This code is based on glibc cortex strings work originally authored by Linaro;
  7. * the original code can be found @
  8. *
  9. * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
  10. * files/head:/src/aarch64/
  11. */
  12. #include <linux/linkage.h>
  13. #include <asm/assembler.h>
  14. /*
  15. * determine the length of a fixed-size string
  16. *
  17. * Parameters:
  18. * x0 - const string pointer
  19. * x1 - maximal string length
  20. * Returns:
  21. * x0 - the length of the string, or the limit if no NUL is found within it
  22. */
  23. /* Arguments and results. */
  24. srcin .req x0 /* String pointer as passed in. */
  25. len .req x0 /* Result; shares x0 with srcin. */
  26. limit .req x1 /* Maximal string length. */
  27. /* Locals and temporaries. */
  28. src .req x2 /* Read cursor; advanced 16 bytes by each ldp. */
  29. data1 .req x3 /* First Dword of the current Qword. */
  30. data2 .req x4 /* Second Dword of the current Qword. */
  31. data2a .req x5 /* Masked copy of data2 (misaligned entry only). */
  32. has_nul1 .req x6 /* NUL syndrome for data1. */
  33. has_nul2 .req x7 /* NUL syndrome for data2. */
  34. tmp1 .req x8
  35. tmp2 .req x9
  36. tmp3 .req x10
  37. tmp4 .req x11
  38. zeroones .req x12 /* Holds REP8_01. */
  39. pos .req x13 /* clz result: bit position of the NUL marker. */
  40. limit_wd .req x14 /* Qwords left to examine, minus one. */
  41. #define REP8_01 0x0101010101010101 /* 0x01 in every byte. */
  42. #define REP8_7f 0x7f7f7f7f7f7f7f7f /* 0x7f in every byte. */
  43. #define REP8_80 0x8080808080808080 /* 0x80 in every byte. */
  44. SYM_FUNC_START(__pi_strnlen)
  45. cbz limit, .Lhit_limit /* limit == 0: return limit (0). */
  46. mov zeroones, #REP8_01
  47. bic src, srcin, #15 /* src = srcin rounded down to 16 bytes. */
  48. ands tmp1, srcin, #15 /* tmp1 = offset of srcin within that Qword. */
  49. b.ne .Lmisaligned
  50. /* Calculate the number of full and partial words -1. */
  51. sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
  52. lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */
  53. /*
  54. * NUL detection works on the principle that (X - 1) & (~X) & 0x80
  55. * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
  56. * can be done in parallel across the entire word.
  57. */
  58. /*
  59. * The inner loop deals with two Dwords at a time. This has a
  60. * slightly higher start-up cost, but we should win quite quickly,
  61. * especially on cores with a high number of issue slots per
  62. * cycle, as we get much better parallelism out of the operations.
  63. */
  64. .Lloop:
  65. ldp data1, data2, [src], #16 /* Post-indexed: src ends up past this Qword. */
  66. .Lrealigned:
  67. sub tmp1, data1, zeroones /* X - 1 per byte, first Dword. */
  68. orr tmp2, data1, #REP8_7f /* X | 0x7f per byte, first Dword. */
  69. sub tmp3, data2, zeroones /* Same for the second Dword. */
  70. orr tmp4, data2, #REP8_7f
  71. bic has_nul1, tmp1, tmp2 /* (X - 1) & ~(X | 0x7f). */
  72. bic has_nul2, tmp3, tmp4
  73. subs limit_wd, limit_wd, #1 /* Goes negative (mi) once the last Qword within limit is done. */
  74. orr tmp1, has_nul1, has_nul2 /* Non-zero if either Dword has a NUL. */
  75. ccmp tmp1, #0, #0, pl /* If pl: compare tmp1 with 0; else NZCV = 0000 (ne). */
  76. b.eq .Lloop /* Loop while no NUL found and Qwords remain. */
  77. cbz tmp1, .Lhit_limit /* No null in final Qword. */
  78. /*
  79. * We know there's a null in the final Qword. The easiest thing
  80. * to do now is work out the length of the string and return
  81. * MIN (len, limit).
  82. */
  83. sub len, src, srcin /* Bytes from srcin to the end of the final Qword. */
  84. cbz has_nul1, .Lnul_in_data2
  85. CPU_BE( mov data2, data1 ) /* prepare data to re-calculate the syndrome */
  86. sub len, len, #8 /* NUL is in data1: step back over data2. */
  87. mov has_nul2, has_nul1 /* Common tail below works on has_nul2. */
  88. .Lnul_in_data2:
  89. /*
  90. * For big-endian, carry propagation (if the final byte in the
  91. * string is 0x01) means we cannot use has_nul directly. The
  92. * easiest way to get the correct byte is to byte-swap the data
  93. * and calculate the syndrome a second time.
  94. */
  95. CPU_BE( rev data2, data2 )
  96. CPU_BE( sub tmp1, data2, zeroones )
  97. CPU_BE( orr tmp2, data2, #REP8_7f )
  98. CPU_BE( bic has_nul2, tmp1, tmp2 )
  99. sub len, len, #8 /* len = offset of the Dword that holds the NUL. */
  100. rev has_nul2, has_nul2 /* Byte-reverse so clz counts from the first byte. */
  101. clz pos, has_nul2 /* Bit index of the first NUL marker. */
  102. add len, len, pos, lsr #3 /* Bits to bytes. */
  103. cmp len, limit
  104. csel len, len, limit, ls /* Return the lower value. */
  105. ret
  106. .Lmisaligned:
  107. /*
  108. * Deal with a partial first word.
  109. * We're doing two things in parallel here;
  110. * 1) Calculate the number of words (but avoiding overflow if
  111. * limit is near ULONG_MAX) - to do this we need to work out
  112. * limit + tmp1 - 1 as a 65-bit value before shifting it;
  113. * 2) Load and mask the initial data words - we force the bytes
  114. * before the ones we are interested in to 0xff - this ensures
  115. * early bytes will not hit any zero detection.
  116. */
  117. ldp data1, data2, [src], #16 /* Load the aligned Qword containing srcin. */
  118. sub limit_wd, limit, #1
  119. and tmp3, limit_wd, #15 /* Low 4 bits of (limit - 1). */
  120. lsr limit_wd, limit_wd, #4 /* (limit - 1) in whole Qwords. */
  121. add tmp3, tmp3, tmp1 /* Low bits plus the misalignment. */
  122. add limit_wd, limit_wd, tmp3, lsr #4 /* Add the carry out of the low part. */
  123. neg tmp4, tmp1
  124. lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */
  125. mov tmp2, #~0
  126. /* Big-endian. Early bytes are at MSB. */
  127. CPU_BE( lsl tmp2, tmp2, tmp4 ) /* Shift (tmp4 & 63). */
  128. /* Little-endian. Early bytes are at LSB. */
  129. CPU_LE( lsr tmp2, tmp2, tmp4 ) /* Shift (tmp4 & 63). */
  130. cmp tmp1, #8 /* Does the string start in data1 (le) or in data2? */
  131. orr data1, data1, tmp2 /* Mask applied to data1 (used when tmp1 <= 8). */
  132. orr data2a, data2, tmp2 /* Mask applied to data2 (used when tmp1 > 8). */
  133. csinv data1, data1, xzr, le /* tmp1 > 8: data1 = ~0, all of it precedes the string. */
  134. csel data2, data2, data2a, le /* tmp1 > 8: take the masked second Dword. */
  135. b .Lrealigned
  136. .Lhit_limit:
  137. mov len, limit /* No NUL within limit bytes: return limit. */
  138. ret
  139. SYM_FUNC_END(__pi_strnlen)
  140. SYM_FUNC_ALIAS_WEAK(strnlen, __pi_strnlen)
  141. EXPORT_SYMBOL_NOKASAN(strnlen)