123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213 |
- /* SPDX-License-Identifier: GPL-2.0-only */
- /*
- * Copyright (c) 2013-2021, Arm Limited.
- *
- * Adapted from the original at:
- * https://github.com/ARM-software/optimized-routines/blob/98e4d6a5c13c8e54/string/aarch64/strlen.S
- */
- #include <linux/linkage.h>
- #include <asm/assembler.h>
- #include <asm/mte-def.h>
- /* Assumptions:
- *
- * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
- */
- #define L(label) .L ## label /* L(foo) expands to .Lfoo; the .L prefix keeps the label local (non-exported) in GAS */
- /* Arguments and results. */
- #define srcin x0 /* in: address of the string */
- #define len x0 /* out: result; same register as srcin, so writing len destroys srcin */
- /* Locals and temporaries. */
- #define src x1 /* 16-byte-aligned cursor into the string */
- #define data1 x2 /* first 8-byte word of the 16 bytes just loaded */
- #define data2 x3 /* second 8-byte word of the 16 bytes just loaded */
- #define has_nul1 x4 /* NUL syndrome for data1; shares x4 with tmp1 */
- #define has_nul2 x5 /* NUL syndrome for data2; shares x5 with tmp2 */
- #define tmp1 x4 /* scratch; same register as has_nul1 */
- #define tmp2 x5 /* scratch; same register as has_nul2 */
- #define tmp3 x6 /* scratch */
- #define tmp4 x7 /* scratch */
- #define zeroones x8 /* loaded with REP8_01 at function entry and never rewritten */
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. A faster check
- (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
- false hits for characters 129..255.
- The REP8_* constants below replicate one byte value into all 8 bytes
- of a 64-bit word so the formula can be applied to a whole word. */
- #define REP8_01 0x0101010101010101 /* 0x01 in every byte: the "1" of X - 1 */
- #define REP8_7f 0x7f7f7f7f7f7f7f7f /* 0x7f in every byte: the mask of X | 0x7f */
- #define REP8_80 0x8080808080808080 /* 0x80 in every byte: bit 7 of each byte */
- /*
- * When KASAN_HW_TAGS is in use, memory is checked at MTE_GRANULE_SIZE
- * (16-byte) granularity, and we must ensure that no access straddles this
- * alignment boundary.
- *
- * MIN_PAGE_SIZE is the boundary that the first, possibly unaligned,
- * 16-byte load of the function must not cross.
- */
- #ifdef CONFIG_KASAN_HW_TAGS
- #define MIN_PAGE_SIZE MTE_GRANULE_SIZE /* presumably provided by <asm/mte-def.h> - confirm; makes every string that is not 16-byte aligned take the page_cross path */
- #else
- #define MIN_PAGE_SIZE 4096 /* the 4k minimum page size listed in the assumptions above */
- #endif
- /* Since strings are short on average, we check the first 16 bytes
- of the string for a NUL character. In order to do an unaligned ldp
- safely we have to do a page cross check first. If there is a NUL
- byte we calculate the length from the 2 8-byte words using
- conditional select to reduce branch mispredictions (it is unlikely
- strlen will be repeatedly called on strings with the same length).
- If the string is longer than 16 bytes, we align src so we don't need
- further page cross checks, and process 32 bytes per iteration
- using the fast NUL check. If we encounter non-ASCII characters,
- fall back to a second loop using the full NUL check.
- If the page cross check fails, we read 16 bytes from an aligned
- address, remove any characters before the string, and continue
- in the main loop using aligned loads. Since strings crossing a
- page in the first 16 bytes are rare (probability of
- 16/MIN_PAGE_SIZE ~= 0.4% when MIN_PAGE_SIZE is 4096), this case does
- not need to be optimized. (When MIN_PAGE_SIZE is the 16-byte
- MTE_GRANULE_SIZE, every string that is not 16-byte aligned takes
- this path instead.)
- AArch64 systems have a minimum page size of 4k. We don't bother
- checking for larger page sizes - the cost of setting up the correct
- page size is just not worth the extra gain from a small reduction in
- the cases taking the slow path. Note that we only care about
- whether the first fetch, which may be misaligned, crosses a page
- boundary. */
- SYM_FUNC_START(__pi_strlen) /* In: srcin (x0) = address of the string. Out: len (x0) = number of bytes before the first NUL.
- Overwrites x1-x8 and the condition flags; only loads from memory, never stores, and does not use sp. */
- and tmp1, srcin, MIN_PAGE_SIZE - 1 /* tmp1 = offset of srcin inside its MIN_PAGE_SIZE-sized block */
- mov zeroones, REP8_01 /* 0x01 in every byte, used by every NUL check below */
- cmp tmp1, MIN_PAGE_SIZE - 16
- b.gt L(page_cross) /* fewer than 16 bytes remain before the boundary: a 16-byte load from srcin would cross it */
- ldp data1, data2, [srcin] /* first 16 bytes of the string; srcin may be unaligned */
- #ifdef __AARCH64EB__
- /* For big-endian, carry propagation (if the final byte in the
- string is 0x01) means we cannot use has_nul1/2 directly.
- Since we expect strings to be small and early-exit,
- byte-swap the data now so has_nul1/2 will be correct. */
- rev data1, data1
- rev data2, data2
- #endif
- sub tmp1, data1, zeroones /* data1 - 0x0101..01 */
- orr tmp2, data1, REP8_7f /* data1 | 0x7f7f..7f */
- sub tmp3, data2, zeroones /* same two terms for the second word */
- orr tmp4, data2, REP8_7f
- bics has_nul1, tmp1, tmp2 /* syndrome of word 1 = tmp1 & ~tmp2; sets the flags (Z when no NUL in word 1) */
- bic has_nul2, tmp3, tmp4 /* syndrome of word 2; does not set the flags */
- ccmp has_nul2, 0, 0, eq /* if word 1 had no NUL, compare has_nul2 with 0; otherwise force NZCV = 0 */
- beq L(main_loop_entry) /* Z is set only when both syndromes are zero: no NUL in the first 16 bytes */
- /* Enter with C = has_nul1 == 0, so "cc" below means: the NUL is in the first word. */
- csel has_nul1, has_nul1, has_nul2, cc /* keep the syndrome of the word that holds the first NUL */
- mov len, 8 /* len shares x0 with srcin; srcin is not needed on this path any more */
- rev has_nul1, has_nul1 /* byte-reverse so the earliest string byte becomes the most significant one */
- clz tmp1, has_nul1 /* leading zero bits = 8 * (index of the first NUL inside the word) */
- csel len, xzr, len, cc /* word offset: 0 if the NUL is in word 1, else 8 */
- add len, len, tmp1, lsr 3 /* add the byte index inside the word (bit count / 8) */
- ret
- /* The inner loop processes 32 bytes per iteration and uses the fast
- NUL check. If we encounter non-ASCII characters, use a second
- loop with the accurate NUL check. */
- .p2align 4
- L(main_loop_entry):
- bic src, srcin, 15 /* round srcin down to a 16-byte boundary */
- sub src, src, 16 /* bias by -16 so the pre-indexed +32 load below starts at (srcin & ~15) + 16 */
- L(main_loop):
- ldp data1, data2, [src, 32]! /* src += 32, then load 16 bytes from the new src */
- L(page_cross_entry): /* also entered from L(page_cross) with src = srcin & ~15 and data1/data2 already masked */
- sub tmp1, data1, zeroones
- sub tmp3, data2, zeroones
- orr tmp2, tmp1, tmp3 /* merge both words so one test covers all 16 bytes */
- tst tmp2, zeroones, lsl 7 /* fast check: zeroones << 7 is 0x80 in every byte */
- bne 1f /* a byte is NUL or non-ASCII: confirm with the accurate check */
- ldp data1, data2, [src, 16] /* second 16 bytes of this iteration; src is not updated */
- sub tmp1, data1, zeroones
- sub tmp3, data2, zeroones
- orr tmp2, tmp1, tmp3
- tst tmp2, zeroones, lsl 7
- beq L(main_loop) /* all 32 bytes passed the fast check */
- add src, src, 16 /* make src the address of the 16 bytes now held in data1/data2 */
- 1:
- /* The fast check failed, so do the slower, accurate NUL check.
- tmp1 and tmp3 still hold data1 - 0x01.. and data2 - 0x01.. from the fast check. */
- orr tmp2, data1, REP8_7f
- orr tmp4, data2, REP8_7f
- bics has_nul1, tmp1, tmp2
- bic has_nul2, tmp3, tmp4
- ccmp has_nul2, 0, 0, eq /* same flag protocol as at function entry */
- beq L(nonascii_loop) /* no real NUL: the fast-check hit came from a non-ASCII byte */
- /* Enter with C = has_nul1 == 0. src is the address of the 16 bytes held in data1/data2. */
- L(tail):
- #ifdef __AARCH64EB__
- /* For big-endian, carry propagation (if the final byte in the
- string is 0x01) means we cannot use has_nul1/2 directly. The
- easiest way to get the correct byte is to byte-swap the data
- and calculate the syndrome a second time. */
- csel data1, data1, data2, cc
- rev data1, data1
- sub tmp1, data1, zeroones
- orr tmp2, data1, REP8_7f
- bic has_nul1, tmp1, tmp2
- #else
- csel has_nul1, has_nul1, has_nul2, cc /* keep the syndrome of the word that holds the first NUL */
- #endif
- sub len, src, srcin /* distance from the string start to this block; overwrites srcin (same register) */
- rev has_nul1, has_nul1 /* earliest string byte becomes the most significant one */
- add tmp2, len, 8 /* candidate result base if the NUL is in the second word */
- clz tmp1, has_nul1 /* 8 * (index of the first NUL inside the word) */
- csel len, len, tmp2, cc /* pick the base for word 1 (cc) or word 2 */
- add len, len, tmp1, lsr 3 /* add the byte index inside the word */
- ret
- L(nonascii_loop): /* accurate check on 16 bytes per load, unrolled twice */
- ldp data1, data2, [src, 16]! /* src += 16, then load the next 16 bytes */
- sub tmp1, data1, zeroones
- orr tmp2, data1, REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, REP8_7f
- bics has_nul1, tmp1, tmp2
- bic has_nul2, tmp3, tmp4
- ccmp has_nul2, 0, 0, eq
- bne L(tail) /* NUL found; C is already set up as L(tail) expects */
- ldp data1, data2, [src, 16]! /* second copy of the same step */
- sub tmp1, data1, zeroones
- orr tmp2, data1, REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, REP8_7f
- bics has_nul1, tmp1, tmp2
- bic has_nul2, tmp3, tmp4
- ccmp has_nul2, 0, 0, eq
- beq L(nonascii_loop) /* still no NUL: keep scanning */
- b L(tail)
- /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
- srcin to 0x7f, so we ignore any NUL bytes before the string.
- Then continue in the aligned loop. */
- L(page_cross):
- bic src, srcin, 15 /* aligned address of the 16-byte block containing srcin */
- ldp data1, data2, [src] /* aligned 16-byte load, so it cannot straddle a 16-byte-aligned boundary */
- lsl tmp1, srcin, 3 /* byte offset converted to a bit count; only the low 6 bits are used by the shift below */
- mov tmp4, -1 /* all-ones word */
- #ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
- #else
- /* Little-endian. Early bytes are at LSB. */
- lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
- #endif
- orr tmp1, tmp1, REP8_80 /* now ~tmp1 is 0x7f in each byte before srcin and 0 in the others */
- orn data1, data1, tmp1 /* data1 |= ~tmp1: bytes before srcin can no longer look like NUL */
- orn tmp2, data2, tmp1 /* same masking for the second word, used if srcin lies in it */
- tst srcin, 8 /* bit 3 tells which 8-byte half of the block contains srcin */
- csel data1, data1, tmp4, eq /* srcin in word 1: keep masked data1; otherwise all of word 1 precedes the string, so use all-ones */
- csel data2, data2, tmp2, eq /* srcin in word 1: word 2 is all string bytes; otherwise use the masked copy */
- b L(page_cross_entry)
- SYM_FUNC_END(__pi_strlen)
- SYM_FUNC_ALIAS_WEAK(strlen, __pi_strlen) /* NOTE(review): presumably defines strlen as a weak alias of __pi_strlen - macro is defined outside this file, confirm */
- EXPORT_SYMBOL_NOKASAN(strlen) /* NOTE(review): presumably exports strlen only in non-KASAN builds - macro is defined outside this file, confirm */
|