// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <[email protected]>
 * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
 */

#include <linux/raid/xor.h>
#include <linux/module.h>
#include <asm/neon-intrinsics.h>

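/*
 * Plain NEON implementations: each loop iteration loads four 128-bit
 * vectors (64 bytes) from every source buffer, XORs them into p1 and
 * stores the result back, so 'bytes' must be a non-zero multiple of 64.
 */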
void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1,
		      const unsigned long * __restrict p2)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3,
		      const unsigned long * __restrict p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3,
		      const unsigned long * __restrict p4,
		      const unsigned long * __restrict p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* p1 ^= p5 */
		v0 = veorq_u64(v0, vld1q_u64(dp5 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp5 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp5 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

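/*
 * Template consumed by the generic XOR calibration code; the arm64
 * <asm/xor.h> wrappers call these routines between kernel_neon_begin()
 * and kernel_neon_end().
 */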
struct xor_block_template xor_block_inner_neon __ro_after_init = {
	.name	= "__inner_neon__",
	.do_2	= xor_arm64_neon_2,
	.do_3	= xor_arm64_neon_3,
	.do_4	= xor_arm64_neon_4,
	.do_5	= xor_arm64_neon_5,
};
EXPORT_SYMBOL(xor_block_inner_neon);

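/*
 * The ARMv8.2 SHA3 extension provides EOR3, which XORs three vectors in
 * a single instruction.  It is emitted via inline asm here so the rest
 * of the file can be built without the SHA3 target feature; the faster
 * variants below are only selected at runtime in xor_neon_init().
 */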
static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
{
	uint64x2_t res;

	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
	    : "=w"(res) : "w"(p), "w"(q), "w"(r));

	return res;
}

static void xor_arm64_eor3_3(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

static void xor_arm64_eor3_4(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3,
			     const unsigned long * __restrict p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

static void xor_arm64_eor3_5(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3,
			     const unsigned long * __restrict p4,
			     const unsigned long * __restrict p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 ^ p5 */
		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

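/*
 * At module init, switch the 3-, 4- and 5-source hooks over to the EOR3
 * variants when the toolchain can assemble SHA3 instructions
 * (CONFIG_AS_HAS_SHA3) and the running CPU advertises the feature.
 */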
static int __init xor_neon_init(void)
{
	if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
		xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
		xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
		xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
	}
	return 0;
}
module_init(xor_neon_init);

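/* An (empty) exit hook is required for the module to be unloadable. */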
static void __exit xor_neon_exit(void)
{
}
module_exit(xor_neon_exit);

MODULE_AUTHOR("Jackie Liu <[email protected]>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");