- /* SPDX-License-Identifier: GPL-2.0-or-later */
- /*
- * Author: Anton Blanchard <[email protected]>
- * Copyright 2015 IBM Corporation.
- */
- #include <asm/ppc_asm.h>
- #include <asm/export.h>
- #include <asm/ppc-opcode.h>
- #define off8 r6
- #define off16 r7
- #define off24 r8
- #define rA r9
- #define rB r10
- #define rC r11
- #define rD r27
- #define rE r28
- #define rF r29
- #define rG r30
- #define rH r31
- #ifdef __LITTLE_ENDIAN__
- #define LH lhbrx
- #define LW lwbrx
- #define LD ldbrx
- #define LVS lvsr
- #define VPERM(_VRT,_VRA,_VRB,_VRC) \
- vperm _VRT,_VRB,_VRA,_VRC
- #else
- #define LH lhzx
- #define LW lwzx
- #define LD ldx
- #define LVS lvsl
- #define VPERM(_VRT,_VRA,_VRB,_VRC) \
- vperm _VRT,_VRA,_VRB,_VRC
- #endif
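- /*
- * All scalar loads go through LD/LW/LH. On little endian the byte-reversed
- * forms (ldbrx and friends) place the byte that memcmp() compares first into
- * the most significant position, so an unsigned compare (cmpld) orders the
- * buffers the same way a byte-by-byte memcmp() would. LVS and the VPERM
- * operand order are likewise swapped so that the unaligned-load permute
- * produces the bytes in the right order on little endian.
- */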
- #define VMX_THRESH 4096
- #define ENTER_VMX_OPS \
- mflr r0; \
- std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
- std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
- std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
- std r0,16(r1); \
- stdu r1,-STACKFRAMESIZE(r1); \
- bl enter_vmx_ops; \
- cmpwi cr1,r3,0; \
- ld r0,STACKFRAMESIZE+16(r1); \
- ld r3,STK_REG(R31)(r1); \
- ld r4,STK_REG(R30)(r1); \
- ld r5,STK_REG(R29)(r1); \
- addi r1,r1,STACKFRAMESIZE; \
- mtlr r0
- #define EXIT_VMX_OPS \
- mflr r0; \
- std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
- std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
- std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
- std r0,16(r1); \
- stdu r1,-STACKFRAMESIZE(r1); \
- bl exit_vmx_ops; \
- ld r0,STACKFRAMESIZE+16(r1); \
- ld r3,STK_REG(R31)(r1); \
- ld r4,STK_REG(R30)(r1); \
- ld r5,STK_REG(R29)(r1); \
- addi r1,r1,STACKFRAMESIZE; \
- mtlr r0
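- /*
- * ENTER_VMX_OPS/EXIT_VMX_OPS preserve the live argument registers (r3, r4,
- * r5) and the link register around the calls to enter_vmx_ops() and
- * exit_vmx_ops(). ENTER_VMX_OPS also records enter_vmx_ops()'s return value
- * in cr1: a zero return means VMX cannot be used here, and the callers below
- * test cr1 to fall back to the integer path.
- */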
- /*
- * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned to
- * a 16-byte boundary, and permutes the result with the 1st 16 bytes.
- * | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
- * ^ ^ ^
- * 0xbbbb10 0xbbbb20 0xbbbb30
- * ^
- * _vaddr
- *
- *
- * _vmask is the mask generated by LVS.
- * _v1st_qw is the 1st aligned QW of the current addr, which is already loaded,
- * for example 0xyyyyyyyyyyyyy012 for big endian.
- * _v2nd_qw is the 2nd aligned QW of the current _vaddr, to be loaded,
- * for example 0x3456789abcdefzzz for big endian.
- * The permute result is saved in _v_res,
- * for example 0x0123456789abcdef for big endian.
- */
- #define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
- lvx _v2nd_qw,_vaddr,off16; \
- VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
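- /*
- * Roughly what LD_VSR_CROSS16B achieves, as an illustrative C sketch only
- * (not the generated code; the helper below is made up for clarity):
- *
- *	static void unaligned_16b(const unsigned char *p, unsigned char out[16])
- *	{
- *		const unsigned char *q = (const unsigned char *)((unsigned long)p & ~15UL);
- *		unsigned long off = (unsigned long)p & 15;
- *		int i;
- *
- *		// q[0..15] is the 1st aligned quadword (_v1st_qw, already loaded),
- *		// q[16..31] is the 2nd (_v2nd_qw); vperm with the LVS mask picks
- *		// out the 16 bytes that start at the unaligned address p.
- *		for (i = 0; i < 16; i++)
- *			out[i] = q[off + i];
- *	}
- */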
- /*
- * There are 2 categories for memcmp:
- * 1) src/dst have the same offset relative to an 8-byte boundary. The
- * handlers are named like .Lsameoffset_xxxx
- * 2) src/dst have different offsets relative to an 8-byte boundary. The
- * handlers are named like .Ldiffoffset_xxxx
- */
- _GLOBAL_TOC(memcmp)
- cmpdi cr1,r5,0
- /* Use the short loop if the src/dst addresses do not have the
- * same offset relative to an 8-byte alignment boundary.
- */
- xor r6,r3,r4
- andi. r6,r6,7
- /* Also fall back to the short loop when comparing fewer than
- * 8 bytes, even at matching offsets.
- */
- cmpdi cr6,r5,7
- beq cr1,.Lzero
- bgt cr6,.Lno_short
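- /* Byte-at-a-time compare, unrolled by 4: used for short lengths, for loop
- * remainders, and when an 8-byte tail load might cross a page boundary.
- */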
- .Lshort:
- mtctr r5
- 1: lbz rA,0(r3)
- lbz rB,0(r4)
- subf. rC,rB,rA
- bne .Lnon_zero
- bdz .Lzero
- lbz rA,1(r3)
- lbz rB,1(r4)
- subf. rC,rB,rA
- bne .Lnon_zero
- bdz .Lzero
- lbz rA,2(r3)
- lbz rB,2(r4)
- subf. rC,rB,rA
- bne .Lnon_zero
- bdz .Lzero
- lbz rA,3(r3)
- lbz rB,3(r4)
- subf. rC,rB,rA
- bne .Lnon_zero
- addi r3,r3,4
- addi r4,r4,4
- bdnz 1b
- .Lzero:
- li r3,0
- blr
- .Lno_short:
- dcbt 0,r3
- dcbt 0,r4
- bne .Ldiffoffset_8bytes_make_align_start
- .Lsameoffset_8bytes_make_align_start:
- /* Compare the leading bytes up to the next 8-byte boundary so that
- * the rest of the comparison can run on 8-byte aligned addresses.
- */
- andi. r6,r3,7
- /* Try to compare the first double word, which is not 8-byte aligned:
- * load the double word at (src & ~7UL) and shift it left by the
- * appropriate number of bits before the comparison. The rlwinm below
- * puts that shift count, (src & 0x7) * 8, into r6.
- */
- rlwinm r6,r3,3,26,28
- beq .Lsameoffset_8bytes_aligned
- clrrdi r3,r3,3
- clrrdi r4,r4,3
- LD rA,0,r3
- LD rB,0,r4
- sld rA,rA,r6
- sld rB,rB,r6
- cmpld cr0,rA,rB
- srwi r6,r6,3
- bne cr0,.LcmpAB_lightweight
- subfic r6,r6,8
- subf. r5,r6,r5
- addi r3,r3,8
- addi r4,r4,8
- beq .Lzero
- .Lsameoffset_8bytes_aligned:
- /* Now we are 8-byte aligned.
- * Use the .Llong loop if 32 or more bytes remain to be compared.
- */
- cmpdi cr6,r5,31
- bgt cr6,.Llong
- .Lcmp_lt32bytes:
- /* compare 1 ~ 31 bytes; at least the r3 address is 8-byte aligned now */
- cmpdi cr5,r5,7
- srdi r0,r5,3
- ble cr5,.Lcmp_rest_lt8bytes
- /* handle 8 ~ 31 bytes */
- clrldi r5,r5,61
- mtctr r0
- 2:
- LD rA,0,r3
- LD rB,0,r4
- cmpld cr0,rA,rB
- addi r3,r3,8
- addi r4,r4,8
- bne cr0,.LcmpAB_lightweight
- bdnz 2b
- cmpwi r5,0
- beq .Lzero
- .Lcmp_rest_lt8bytes:
- /*
- * Here we have less than 8 bytes to compare. At least s1 is aligned to
- * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
- * page boundary, otherwise we might read past the end of the buffer and
- * trigger a page fault. We use 4K as the conservative minimum page
- * size. If we detect that case we go to the byte-by-byte loop.
- *
- * Otherwise the next double word is loaded from s1 and s2, and shifted
- * right to compare the appropriate bits.
- */
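- /*
- * Roughly, in C (an illustrative sketch only, not the generated code;
- * load8() is a stand-in for the byte-reversed-on-LE doubleword load
- * done by LD):
- *
- *	if (((unsigned long)s2 & 0xfff) > 0xff8)
- *		goto byte_by_byte;		// an 8-byte load could fault
- *	shift = (8 - n) * 8;
- *	a = load8(s1) >> shift;			// keep only the n valid bytes
- *	b = load8(s2) >> shift;
- *	if (a != b)
- *		return a > b ? 1 : -1;
- *	return 0;
- */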
- clrldi r6,r4,(64-12) // r6 = r4 & 0xfff
- cmpdi r6,0xff8
- bgt .Lshort
- subfic r6,r5,8
- slwi r6,r6,3
- LD rA,0,r3
- LD rB,0,r4
- srd rA,rA,r6
- srd rB,rB,r6
- cmpld cr0,rA,rB
- bne cr0,.LcmpAB_lightweight
- b .Lzero
- .Lnon_zero:
- mr r3,rC
- blr
- .Llong:
- #ifdef CONFIG_ALTIVEC
- BEGIN_FTR_SECTION
- /* Use the VMX loop if the length is 4K bytes or more */
- cmpldi cr6,r5,VMX_THRESH
- bge cr6,.Lsameoffset_vmx_cmp
- END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
- .Llong_novmx_cmp:
- #endif
- /* At least the s1 address is 8-byte aligned */
- li off8,8
- li off16,16
- li off24,24
- std r31,-8(r1)
- std r30,-16(r1)
- std r29,-24(r1)
- std r28,-32(r1)
- std r27,-40(r1)
- srdi r0,r5,5
- mtctr r0
- andi. r5,r5,31
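- /*
- * Main loop: 32 bytes per iteration. The loop is software pipelined, so
- * the loads for the next group overlap the compares of the previous one,
- * and the four compare results are kept in cr0/cr1/cr6/cr7 so a mismatch
- * is reported from whichever pair differed first.
- */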
- LD rA,0,r3
- LD rB,0,r4
- LD rC,off8,r3
- LD rD,off8,r4
- LD rE,off16,r3
- LD rF,off16,r4
- LD rG,off24,r3
- LD rH,off24,r4
- cmpld cr0,rA,rB
- addi r3,r3,32
- addi r4,r4,32
- bdz .Lfirst32
- LD rA,0,r3
- LD rB,0,r4
- cmpld cr1,rC,rD
- LD rC,off8,r3
- LD rD,off8,r4
- cmpld cr6,rE,rF
- LD rE,off16,r3
- LD rF,off16,r4
- cmpld cr7,rG,rH
- bne cr0,.LcmpAB
- LD rG,off24,r3
- LD rH,off24,r4
- cmpld cr0,rA,rB
- bne cr1,.LcmpCD
- addi r3,r3,32
- addi r4,r4,32
- bdz .Lsecond32
- .balign 16
- 1: LD rA,0,r3
- LD rB,0,r4
- cmpld cr1,rC,rD
- bne cr6,.LcmpEF
- LD rC,off8,r3
- LD rD,off8,r4
- cmpld cr6,rE,rF
- bne cr7,.LcmpGH
- LD rE,off16,r3
- LD rF,off16,r4
- cmpld cr7,rG,rH
- bne cr0,.LcmpAB
- LD rG,off24,r3
- LD rH,off24,r4
- cmpld cr0,rA,rB
- bne cr1,.LcmpCD
- addi r3,r3,32
- addi r4,r4,32
- bdnz 1b
- .Lsecond32:
- cmpld cr1,rC,rD
- bne cr6,.LcmpEF
- cmpld cr6,rE,rF
- bne cr7,.LcmpGH
- cmpld cr7,rG,rH
- bne cr0,.LcmpAB
- bne cr1,.LcmpCD
- bne cr6,.LcmpEF
- bne cr7,.LcmpGH
- .Ltail:
- ld r31,-8(r1)
- ld r30,-16(r1)
- ld r29,-24(r1)
- ld r28,-32(r1)
- ld r27,-40(r1)
- cmpdi r5,0
- beq .Lzero
- b .Lshort
- .Lfirst32:
- cmpld cr1,rC,rD
- cmpld cr6,rE,rF
- cmpld cr7,rG,rH
- bne cr0,.LcmpAB
- bne cr1,.LcmpCD
- bne cr6,.LcmpEF
- bne cr7,.LcmpGH
- b .Ltail
- .LcmpAB:
- li r3,1
- bgt cr0,.Lout
- li r3,-1
- b .Lout
- .LcmpCD:
- li r3,1
- bgt cr1,.Lout
- li r3,-1
- b .Lout
- .LcmpEF:
- li r3,1
- bgt cr6,.Lout
- li r3,-1
- b .Lout
- .LcmpGH:
- li r3,1
- bgt cr7,.Lout
- li r3,-1
- .Lout:
- ld r31,-8(r1)
- ld r30,-16(r1)
- ld r29,-24(r1)
- ld r28,-32(r1)
- ld r27,-40(r1)
- blr
- .LcmpAB_lightweight: /* skip NV GPRS restore */
- li r3,1
- bgtlr
- li r3,-1
- blr
- #ifdef CONFIG_ALTIVEC
- .Lsameoffset_vmx_cmp:
- /* Entered with src/dst addresses that have the same offset relative
- * to an 8-byte alignment boundary.
- *
- * There is an optimization based on the following fact: memcmp()
- * tends to fail early, within the first 32 bytes.
- * Before using VMX instructions, which incur the penalty of a
- * 32 x 128-bit VMX register save/restore, we compare the first
- * 32 bytes so that we catch the ~80% of cases that fail early.
- */
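- /*
- * The pre-check below, as an illustrative C sketch (load8() is a
- * stand-in for the byte-reversed-on-LE doubleword load done by LD):
- *
- *	for (i = 0; i < 4; i++) {		// 4 x 8 = 32 bytes
- *		unsigned long a = load8(s1), b = load8(s2);
- *		if (a != b)
- *			return a > b ? 1 : -1;
- *		s1 += 8; s2 += 8; n -= 8;
- *	}
- *	// only now pay the cost of enabling VMX
- */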
- li r0,4
- mtctr r0
- .Lsameoffset_prechk_32B_loop:
- LD rA,0,r3
- LD rB,0,r4
- cmpld cr0,rA,rB
- addi r3,r3,8
- addi r4,r4,8
- bne cr0,.LcmpAB_lightweight
- addi r5,r5,-8
- bdnz .Lsameoffset_prechk_32B_loop
- ENTER_VMX_OPS
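- /* cr1 was set by ENTER_VMX_OPS from enter_vmx_ops()'s return value;
- * zero means VMX is not usable here, so take the integer path.
- */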
- beq cr1,.Llong_novmx_cmp
- 3:
- /* Check whether r4 has the same offset as r3 relative to a
- * 16-byte boundary.
- */
- xor r0,r3,r4
- andi. r0,r0,0xf
- bne .Ldiffoffset_vmx_cmp_start
- /* The length is no less than 4KB. Align further, to 16 bytes.
- */
- andi. rA,r3,8
- LD rA,0,r3
- beq 4f
- LD rB,0,r4
- cmpld cr0,rA,rB
- addi r3,r3,8
- addi r4,r4,8
- addi r5,r5,-8
- beq cr0,4f
- /* save cr0 across EXIT_VMX_OPS (r5 is free to hold it) and restore it */
- mfocrf r5,128
- EXIT_VMX_OPS
- mtocrf 128,r5
- b .LcmpAB_lightweight
- 4:
- /* compare 32 bytes per loop iteration */
- srdi r0,r5,5
- mtctr r0
- clrldi r5,r5,59
- li off16,16
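- /* Two quadwords (32 bytes) per iteration. vcmpequd. sets the
- * "all elements equal" bit in cr6, so "bnl cr6" is taken as soon as one
- * of the compared doublewords differs.
- */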
- .balign 16
- 5:
- lvx v0,0,r3
- lvx v1,0,r4
- VCMPEQUD_RC(v0,v0,v1)
- bnl cr6,7f
- lvx v0,off16,r3
- lvx v1,off16,r4
- VCMPEQUD_RC(v0,v0,v1)
- bnl cr6,6f
- addi r3,r3,32
- addi r4,r4,32
- bdnz 5b
- EXIT_VMX_OPS
- cmpdi r5,0
- beq .Lzero
- b .Lcmp_lt32bytes
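- /*
- * A mismatching quadword was found above. Leave VMX and recompare that
- * 16-byte chunk with scalar loads so the ordering of the result can be
- * taken from a plain unsigned compare.
- */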
- 6:
- addi r3,r3,16
- addi r4,r4,16
- 7:
- /* find the difference within this last 16-byte chunk */
- EXIT_VMX_OPS
- LD rA,0,r3
- LD rB,0,r4
- cmpld cr0,rA,rB
- li off8,8
- bne cr0,.LcmpAB_lightweight
- LD rA,off8,r3
- LD rB,off8,r4
- cmpld cr0,rA,rB
- bne cr0,.LcmpAB_lightweight
- b .Lzero
- #endif
- .Ldiffoffset_8bytes_make_align_start:
- /* now try to align s1 to 8 bytes */
- rlwinm r6,r3,3,26,28
- beq .Ldiffoffset_align_s1_8bytes
- clrrdi r3,r3,3
- LD rA,0,r3
- LD rB,0,r4 /* unaligned load */
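- /* rA was loaded from the rounded-down address, so shift it left and back
- * right by r6 bits to zero the leading bytes that precede s1. rB came from
- * the real (unaligned) s2, so only shift it right to line its first bytes
- * up with rA's.
- */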
- sld rA,rA,r6
- srd rA,rA,r6
- srd rB,rB,r6
- cmpld cr0,rA,rB
- srwi r6,r6,3
- bne cr0,.LcmpAB_lightweight
- subfic r6,r6,8
- subf. r5,r6,r5
- addi r3,r3,8
- add r4,r4,r6
- beq .Lzero
- .Ldiffoffset_align_s1_8bytes:
- /* now s1 is 8-byte aligned. */
- #ifdef CONFIG_ALTIVEC
- BEGIN_FTR_SECTION
- /* only use VMX ops when the size is 4K bytes or more */
- cmpdi cr5,r5,VMX_THRESH
- bge cr5,.Ldiffoffset_vmx_cmp
- END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
- .Ldiffoffset_novmx_cmp:
- #endif
- cmpdi cr5,r5,31
- ble cr5,.Lcmp_lt32bytes
- #ifdef CONFIG_ALTIVEC
- b .Llong_novmx_cmp
- #else
- b .Llong
- #endif
- #ifdef CONFIG_ALTIVEC
- .Ldiffoffset_vmx_cmp:
- /* perform a 32-byte pre-check before enabling
- * VMX operations.
- */
- li r0,4
- mtctr r0
- .Ldiffoffset_prechk_32B_loop:
- LD rA,0,r3
- LD rB,0,r4
- cmpld cr0,rA,rB
- addi r3,r3,8
- addi r4,r4,8
- bne cr0,.LcmpAB_lightweight
- addi r5,r5,-8
- bdnz .Ldiffoffset_prechk_32B_loop
- ENTER_VMX_OPS
- beq cr1,.Ldiffoffset_novmx_cmp
- .Ldiffoffset_vmx_cmp_start:
- /* First try to align r3 to 16 bytes */
- andi. r6,r3,0xf
- li off16,16
- beq .Ldiffoffset_vmx_s1_16bytes_align
- LVS v3,0,r3
- LVS v4,0,r4
- lvx v5,0,r3
- lvx v6,0,r4
- LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
- LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
- VCMPEQUB_RC(v7,v9,v10)
- bnl cr6,.Ldiffoffset_vmx_diff_found
- subfic r6,r6,16
- subf r5,r6,r5
- add r3,r3,r6
- add r4,r4,r6
- .Ldiffoffset_vmx_s1_16bytes_align:
- /* now s1 is 16-byte aligned */
- lvx v6,0,r4
- LVS v4,0,r4
- srdi r6,r5,5 /* loop for 32 bytes each */
- clrldi r5,r5,59
- mtctr r6
- .balign 16
- .Ldiffoffset_vmx_32bytesloop:
- /* the 1st qw of r4 was saved in v6 by the setup or the previous iteration */
- lvx v9,0,r3
- LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
- VCMPEQUB_RC(v7,v9,v10)
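- /* vor vD,vA,vA is the VMX register-copy idiom: keep the freshly loaded
- * 2nd quadword of r4 in v6 as the next iteration's 1st quadword.
- */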
- vor v6,v8,v8
- bnl cr6,.Ldiffoffset_vmx_diff_found
- addi r3,r3,16
- addi r4,r4,16
- lvx v9,0,r3
- LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
- VCMPEQUB_RC(v7,v9,v10)
- vor v6,v8,v8
- bnl cr6,.Ldiffoffset_vmx_diff_found
- addi r3,r3,16
- addi r4,r4,16
- bdnz .Ldiffoffset_vmx_32bytesloop
- EXIT_VMX_OPS
- cmpdi r5,0
- beq .Lzero
- b .Lcmp_lt32bytes
- .Ldiffoffset_vmx_diff_found:
- EXIT_VMX_OPS
- /* either way, the difference lies within the next 16 bytes */
- li r5,16
- b .Lcmp_lt32bytes
- #endif
- EXPORT_SYMBOL(memcmp)