Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto updates from Herbert Xu: "Here is the crypto update for 4.12: API: - Add batch registration for acomp/scomp - Change acomp testing to non-unique compressed result - Extend algorithm name limit to 128 bytes - Require setkey before accept(2) in algif_aead Algorithms: - Add support for deflate rfc1950 (zlib) Drivers: - Add accelerated crct10dif for powerpc - Add crc32 in stm32 - Add sha384/sha512 in ccp - Add 3des/gcm(aes) for v5 devices in ccp - Add Queue Interface (QI) backend support in caam - Add new Exynos RNG driver - Add ThunderX ZIP driver - Add driver for hardware random generator on MT7623 SoC" * 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (101 commits) crypto: stm32 - Fix OF module alias information crypto: algif_aead - Require setkey before accept(2) crypto: scomp - add support for deflate rfc1950 (zlib) crypto: scomp - allow registration of multiple scomps crypto: ccp - Change ISR handler method for a v5 CCP crypto: ccp - Change ISR handler method for a v3 CCP crypto: crypto4xx - rename ce_ring_contol to ce_ring_control crypto: testmgr - Allow ecb(cipher_null) in FIPS mode Revert "crypto: arm64/sha - Add constant operand modifier to ASM_EXPORT" crypto: ccp - Disable interrupts early on unload crypto: ccp - Use only the relevant interrupt bits hwrng: mtk - Add driver for hardware random generator on MT7623 SoC dt-bindings: hwrng: Add Mediatek hardware random generator bindings crypto: crct10dif-vpmsum - Fix missing preempt_disable() crypto: testmgr - replace compression known answer test crypto: acomp - allow registration of multiple acomps hwrng: n2 - Use devm_kcalloc() in n2rng_probe() crypto: chcr - Fix error handling related to 'chcr_alloc_shash' padata: get_next is never NULL crypto: exynos - Add new Exynos RNG driver ...
This commit is contained in:
@@ -10,6 +10,8 @@ obj-$(CONFIG_CRYPTO_SHA1_PPC) += sha1-powerpc.o
|
||||
obj-$(CONFIG_CRYPTO_SHA1_PPC_SPE) += sha1-ppc-spe.o
|
||||
obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o
|
||||
obj-$(CONFIG_CRYPTO_CRC32C_VPMSUM) += crc32c-vpmsum.o
|
||||
obj-$(CONFIG_CRYPTO_CRCT10DIF_VPMSUM) += crct10dif-vpmsum.o
|
||||
obj-$(CONFIG_CRYPTO_VPMSUM_TESTER) += crc-vpmsum_test.o
|
||||
|
||||
aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o
|
||||
md5-ppc-y := md5-asm.o md5-glue.o
|
||||
@@ -17,3 +19,4 @@ sha1-powerpc-y := sha1-powerpc-asm.o sha1.o
|
||||
sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o
|
||||
sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o
|
||||
crc32c-vpmsum-y := crc32c-vpmsum_asm.o crc32c-vpmsum_glue.o
|
||||
crct10dif-vpmsum-y := crct10dif-vpmsum_asm.o crct10dif-vpmsum_glue.o
|
||||
|
137
arch/powerpc/crypto/crc-vpmsum_test.c
Normal file
137
arch/powerpc/crypto/crc-vpmsum_test.c
Normal file
@@ -0,0 +1,137 @@
|
||||
/*
|
||||
* CRC vpmsum tester
|
||||
* Copyright 2017 Daniel Axtens, IBM Corporation.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/crc-t10dif.h>
#include <linux/crc32.h>
#include <crypto/internal/hash.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/cpufeature.h>
#include <asm/switch_to.h>
|
||||
|
||||
static unsigned long iterations = 10000;
|
||||
|
||||
#define MAX_CRC_LENGTH 65535
|
||||
|
||||
|
||||
/*
 * crc_test_init() - run the CRC self-test once at module load.
 *
 * Allocates the "crct10dif" and "crc32c" shash transforms from the crypto
 * API and, for 'iterations' rounds, checksums a randomly sized and randomly
 * aligned slice of a random buffer with each of them.  Every result is
 * compared against crc_t10dif_generic() / __crc32c_le() computed over the
 * same bytes; the first mismatch is logged and ends the run.
 *
 * Returns -ENODEV if the CPU lacks CPU_FTR_ARCH_207S, -ENOMEM if the data
 * buffer cannot be allocated, the crypto API's error code if a transform
 * cannot be set up or fails while hashing, and 0 once the test loop has run
 * (CRC mismatches are reported through the kernel log only).
 */
static int __init crc_test_init(void)
{
	u16 crc16 = 0, verify16 = 0;
	u32 crc32 = 0, verify32 = 0;
	__le32 verify32le = 0;
	unsigned char *data;
	unsigned long i;
	int ret;

	struct crypto_shash *crct10dif_tfm;
	struct crypto_shash *crc32c_tfm;

	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
		return -ENODEV;

	data = kmalloc(MAX_CRC_LENGTH, GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	crct10dif_tfm = crypto_alloc_shash("crct10dif", 0, 0);
	if (IS_ERR(crct10dif_tfm)) {
		pr_err("Error allocating crc-t10dif\n");
		ret = PTR_ERR(crct10dif_tfm);
		goto free_buf;
	}

	crc32c_tfm = crypto_alloc_shash("crc32c", 0, 0);
	if (IS_ERR(crc32c_tfm)) {
		pr_err("Error allocating crc32c\n");
		ret = PTR_ERR(crc32c_tfm);
		goto free_16;
	}

	/*
	 * The descriptors live in their own scope so that the error gotos
	 * above never jump into the lifetime of the on-stack descriptors.
	 */
	do {
		SHASH_DESC_ON_STACK(crct10dif_shash, crct10dif_tfm);
		SHASH_DESC_ON_STACK(crc32c_shash, crc32c_tfm);

		crct10dif_shash->tfm = crct10dif_tfm;
		ret = crypto_shash_init(crct10dif_shash);
		if (ret) {
			pr_err("Error initing crc-t10dif\n");
			goto free_32;
		}

		crc32c_shash->tfm = crc32c_tfm;
		ret = crypto_shash_init(crc32c_shash);
		if (ret) {
			pr_err("Error initing crc32c\n");
			goto free_32;
		}

		pr_info("crc-vpmsum_test begins, %lu iterations\n", iterations);
		for (i = 0; i < iterations; i++) {
			size_t len, offset;

			get_random_bytes(data, MAX_CRC_LENGTH);
			get_random_bytes(&len, sizeof(len));
			get_random_bytes(&offset, sizeof(offset));

			/* Random length below the buffer size, random start 0..15. */
			len %= MAX_CRC_LENGTH;
			offset &= 15;
			if (len <= offset)
				continue;
			len -= offset;

			ret = crypto_shash_update(crct10dif_shash, data+offset, len);
			if (!ret)
				ret = crypto_shash_final(crct10dif_shash, (u8 *)(&crc16));
			if (ret) {
				pr_err("Error computing crc-t10dif\n");
				goto free_32;
			}

			/*
			 * The shash descriptors are not re-initialised between
			 * rounds; likewise the reference value is seeded with
			 * the previous round's result.
			 */
			verify16 = crc_t10dif_generic(verify16, data+offset, len);

			if (crc16 != verify16) {
				pr_err("FAILURE in CRC16: got 0x%04x expected 0x%04x (len %zu)\n",
				       crc16, verify16, len);
				break;
			}

			ret = crypto_shash_update(crc32c_shash, data+offset, len);
			if (!ret)
				ret = crypto_shash_final(crc32c_shash, (u8 *)(&crc32));
			if (ret) {
				pr_err("Error computing crc32c\n");
				goto free_32;
			}

			/*
			 * Seed with the inverse of the previous reference value
			 * and invert the new result, keeping it in little-endian
			 * form for comparison with the raw digest bytes.
			 */
			verify32 = le32_to_cpu(verify32le);
			verify32le = ~cpu_to_le32(__crc32c_le(~verify32, data+offset, len));
			if (crc32 != (u32)verify32le) {
				/* Report the value that was actually compared, not the seed. */
				pr_err("FAILURE in CRC32: got 0x%08x expected 0x%08x (len %zu)\n",
				       crc32, (u32)verify32le, len);
				break;
			}
		}
		pr_info("crc-vpmsum_test done, completed %lu iterations\n", i);

		/* The test ran; mismatches are reported via the log only. */
		ret = 0;
	} while (0);

free_32:
	crypto_free_shash(crc32c_tfm);

free_16:
	crypto_free_shash(crct10dif_tfm);

free_buf:
	kfree(data);

	return ret;
}
|
||||
|
||||
/* Module exit hook: intentionally empty, it performs no work on unload. */
static void __exit crc_test_exit(void)
{
}
|
||||
|
||||
module_init(crc_test_init);
|
||||
module_exit(crc_test_exit);
|
||||
module_param(iterations, long, 0400);
|
||||
|
||||
MODULE_AUTHOR("Daniel Axtens <dja@axtens.net>");
|
||||
MODULE_DESCRIPTION("Vector polynomial multiply-sum CRC tester");
|
||||
MODULE_LICENSE("GPL");
|
755
arch/powerpc/crypto/crc32-vpmsum_core.S
Normal file
755
arch/powerpc/crypto/crc32-vpmsum_core.S
Normal file
@@ -0,0 +1,755 @@
|
||||
/*
|
||||
* Core of the accelerated CRC algorithm.
|
||||
* In your file, define the constants and CRC_FUNCTION_NAME
|
||||
* Then include this file.
|
||||
*
|
||||
* Calculate the checksum of data that is 16 byte aligned and a multiple of
|
||||
* 16 bytes.
|
||||
*
|
||||
* The first step is to reduce it to 1024 bits. We do this in 8 parallel
|
||||
* chunks in order to mask the latency of the vpmsum instructions. If we
|
||||
* have more than 32 kB of data to checksum we repeat this step multiple
|
||||
* times, passing in the previous 1024 bits.
|
||||
*
|
||||
* The next step is to reduce the 1024 bits to 64 bits. This step adds
|
||||
* 32 bits of 0s to the end - this matches what a CRC does. We just
|
||||
* calculate constants that land the data in this 32 bits.
|
||||
*
|
||||
* We then use fixed point Barrett reduction to compute a mod n over GF(2)
|
||||
* for n = CRC using POWER8 instructions. We use x = 32.
|
||||
*
|
||||
* http://en.wikipedia.org/wiki/Barrett_reduction
|
||||
*
|
||||
* Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <asm/ppc_asm.h>
|
||||
#include <asm/ppc-opcode.h>
|
||||
|
||||
#define MAX_SIZE 32768
|
||||
|
||||
.text
|
||||
|
||||
#if defined(__BIG_ENDIAN__) && defined(REFLECT)
|
||||
#define BYTESWAP_DATA
|
||||
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
|
||||
#define BYTESWAP_DATA
|
||||
#else
|
||||
#undef BYTESWAP_DATA
|
||||
#endif
|
||||
|
||||
#define off16 r25
|
||||
#define off32 r26
|
||||
#define off48 r27
|
||||
#define off64 r28
|
||||
#define off80 r29
|
||||
#define off96 r30
|
||||
#define off112 r31
|
||||
|
||||
#define const1 v24
|
||||
#define const2 v25
|
||||
|
||||
#define byteswap v26
|
||||
#define mask_32bit v27
|
||||
#define mask_64bit v28
|
||||
#define zeroes v29
|
||||
|
||||
#ifdef BYTESWAP_DATA
|
||||
#define VPERM(A, B, C, D) vperm A, B, C, D
|
||||
#else
|
||||
#define VPERM(A, B, C, D)
|
||||
#endif
|
||||
|
||||
/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
|
||||
FUNC_START(CRC_FUNCTION_NAME)
|
||||
std r31,-8(r1)
|
||||
std r30,-16(r1)
|
||||
std r29,-24(r1)
|
||||
std r28,-32(r1)
|
||||
std r27,-40(r1)
|
||||
std r26,-48(r1)
|
||||
std r25,-56(r1)
|
||||
|
||||
li off16,16
|
||||
li off32,32
|
||||
li off48,48
|
||||
li off64,64
|
||||
li off80,80
|
||||
li off96,96
|
||||
li off112,112
|
||||
li r0,0
|
||||
|
||||
/* Enough room for saving 10 non volatile VMX registers */
|
||||
subi r6,r1,56+10*16
|
||||
subi r7,r1,56+2*16
|
||||
|
||||
stvx v20,0,r6
|
||||
stvx v21,off16,r6
|
||||
stvx v22,off32,r6
|
||||
stvx v23,off48,r6
|
||||
stvx v24,off64,r6
|
||||
stvx v25,off80,r6
|
||||
stvx v26,off96,r6
|
||||
stvx v27,off112,r6
|
||||
stvx v28,0,r7
|
||||
stvx v29,off16,r7
|
||||
|
||||
mr r10,r3
|
||||
|
||||
vxor zeroes,zeroes,zeroes
|
||||
vspltisw v0,-1
|
||||
|
||||
vsldoi mask_32bit,zeroes,v0,4
|
||||
vsldoi mask_64bit,zeroes,v0,8
|
||||
|
||||
/* Get the initial value into v8 */
|
||||
vxor v8,v8,v8
|
||||
MTVRD(v8, R3)
|
||||
#ifdef REFLECT
|
||||
vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */
|
||||
#else
|
||||
vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */
|
||||
#endif
|
||||
|
||||
#ifdef BYTESWAP_DATA
|
||||
addis r3,r2,.byteswap_constant@toc@ha
|
||||
addi r3,r3,.byteswap_constant@toc@l
|
||||
|
||||
lvx byteswap,0,r3
|
||||
addi r3,r3,16
|
||||
#endif
|
||||
|
||||
cmpdi r5,256
|
||||
blt .Lshort
|
||||
|
||||
rldicr r6,r5,0,56
|
||||
|
||||
/* Checksum in blocks of MAX_SIZE */
|
||||
1: lis r7,MAX_SIZE@h
|
||||
ori r7,r7,MAX_SIZE@l
|
||||
mr r9,r7
|
||||
cmpd r6,r7
|
||||
bgt 2f
|
||||
mr r7,r6
|
||||
2: subf r6,r7,r6
|
||||
|
||||
/* our main loop does 128 bytes at a time */
|
||||
srdi r7,r7,7
|
||||
|
||||
/*
|
||||
* Work out the offset into the constants table to start at. Each
|
||||
* constant is 16 bytes, and it is used against 128 bytes of input
|
||||
* data - 128 / 16 = 8
|
||||
*/
|
||||
sldi r8,r7,4
|
||||
srdi r9,r9,3
|
||||
subf r8,r8,r9
|
||||
|
||||
/* We reduce our final 128 bytes in a separate step */
|
||||
addi r7,r7,-1
|
||||
mtctr r7
|
||||
|
||||
addis r3,r2,.constants@toc@ha
|
||||
addi r3,r3,.constants@toc@l
|
||||
|
||||
/* Find the start of our constants */
|
||||
add r3,r3,r8
|
||||
|
||||
/* zero v0-v7 which will contain our checksums */
|
||||
vxor v0,v0,v0
|
||||
vxor v1,v1,v1
|
||||
vxor v2,v2,v2
|
||||
vxor v3,v3,v3
|
||||
vxor v4,v4,v4
|
||||
vxor v5,v5,v5
|
||||
vxor v6,v6,v6
|
||||
vxor v7,v7,v7
|
||||
|
||||
lvx const1,0,r3
|
||||
|
||||
/*
|
||||
* If we are looping back to consume more data we use the values
|
||||
* already in v16-v23.
|
||||
*/
|
||||
cmpdi r0,1
|
||||
beq 2f
|
||||
|
||||
/* First warm up pass */
|
||||
lvx v16,0,r4
|
||||
lvx v17,off16,r4
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPERM(v17,v17,v17,byteswap)
|
||||
lvx v18,off32,r4
|
||||
lvx v19,off48,r4
|
||||
VPERM(v18,v18,v18,byteswap)
|
||||
VPERM(v19,v19,v19,byteswap)
|
||||
lvx v20,off64,r4
|
||||
lvx v21,off80,r4
|
||||
VPERM(v20,v20,v20,byteswap)
|
||||
VPERM(v21,v21,v21,byteswap)
|
||||
lvx v22,off96,r4
|
||||
lvx v23,off112,r4
|
||||
VPERM(v22,v22,v22,byteswap)
|
||||
VPERM(v23,v23,v23,byteswap)
|
||||
addi r4,r4,8*16
|
||||
|
||||
/* xor in initial value */
|
||||
vxor v16,v16,v8
|
||||
|
||||
2: bdz .Lfirst_warm_up_done
|
||||
|
||||
addi r3,r3,16
|
||||
lvx const2,0,r3
|
||||
|
||||
/* Second warm up pass */
|
||||
VPMSUMD(v8,v16,const1)
|
||||
lvx v16,0,r4
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v9,v17,const1)
|
||||
lvx v17,off16,r4
|
||||
VPERM(v17,v17,v17,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v10,v18,const1)
|
||||
lvx v18,off32,r4
|
||||
VPERM(v18,v18,v18,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v11,v19,const1)
|
||||
lvx v19,off48,r4
|
||||
VPERM(v19,v19,v19,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v12,v20,const1)
|
||||
lvx v20,off64,r4
|
||||
VPERM(v20,v20,v20,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v13,v21,const1)
|
||||
lvx v21,off80,r4
|
||||
VPERM(v21,v21,v21,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v14,v22,const1)
|
||||
lvx v22,off96,r4
|
||||
VPERM(v22,v22,v22,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v15,v23,const1)
|
||||
lvx v23,off112,r4
|
||||
VPERM(v23,v23,v23,byteswap)
|
||||
|
||||
addi r4,r4,8*16
|
||||
|
||||
bdz .Lfirst_cool_down
|
||||
|
||||
/*
|
||||
* main loop. We modulo schedule it such that it takes three iterations
|
||||
* to complete - first iteration load, second iteration vpmsum, third
|
||||
* iteration xor.
|
||||
*/
|
||||
.balign 16
|
||||
4: lvx const1,0,r3
|
||||
addi r3,r3,16
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v0,v0,v8
|
||||
VPMSUMD(v8,v16,const2)
|
||||
lvx v16,0,r4
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v1,v1,v9
|
||||
VPMSUMD(v9,v17,const2)
|
||||
lvx v17,off16,r4
|
||||
VPERM(v17,v17,v17,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v2,v2,v10
|
||||
VPMSUMD(v10,v18,const2)
|
||||
lvx v18,off32,r4
|
||||
VPERM(v18,v18,v18,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v3,v3,v11
|
||||
VPMSUMD(v11,v19,const2)
|
||||
lvx v19,off48,r4
|
||||
VPERM(v19,v19,v19,byteswap)
|
||||
lvx const2,0,r3
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v4,v4,v12
|
||||
VPMSUMD(v12,v20,const1)
|
||||
lvx v20,off64,r4
|
||||
VPERM(v20,v20,v20,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v5,v5,v13
|
||||
VPMSUMD(v13,v21,const1)
|
||||
lvx v21,off80,r4
|
||||
VPERM(v21,v21,v21,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v6,v6,v14
|
||||
VPMSUMD(v14,v22,const1)
|
||||
lvx v22,off96,r4
|
||||
VPERM(v22,v22,v22,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v7,v7,v15
|
||||
VPMSUMD(v15,v23,const1)
|
||||
lvx v23,off112,r4
|
||||
VPERM(v23,v23,v23,byteswap)
|
||||
|
||||
addi r4,r4,8*16
|
||||
|
||||
bdnz 4b
|
||||
|
||||
.Lfirst_cool_down:
|
||||
/* First cool down pass */
|
||||
lvx const1,0,r3
|
||||
addi r3,r3,16
|
||||
|
||||
vxor v0,v0,v8
|
||||
VPMSUMD(v8,v16,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v1,v1,v9
|
||||
VPMSUMD(v9,v17,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v2,v2,v10
|
||||
VPMSUMD(v10,v18,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v3,v3,v11
|
||||
VPMSUMD(v11,v19,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v4,v4,v12
|
||||
VPMSUMD(v12,v20,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v5,v5,v13
|
||||
VPMSUMD(v13,v21,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v6,v6,v14
|
||||
VPMSUMD(v14,v22,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v7,v7,v15
|
||||
VPMSUMD(v15,v23,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
.Lsecond_cool_down:
|
||||
/* Second cool down pass */
|
||||
vxor v0,v0,v8
|
||||
vxor v1,v1,v9
|
||||
vxor v2,v2,v10
|
||||
vxor v3,v3,v11
|
||||
vxor v4,v4,v12
|
||||
vxor v5,v5,v13
|
||||
vxor v6,v6,v14
|
||||
vxor v7,v7,v15
|
||||
|
||||
#ifdef REFLECT
|
||||
/*
|
||||
* vpmsumd produces a 96 bit result in the least significant bits
|
||||
* of the register. Since we are bit reflected we have to shift it
|
||||
* left 32 bits so it occupies the least significant bits in the
|
||||
* bit reflected domain.
|
||||
*/
|
||||
vsldoi v0,v0,zeroes,4
|
||||
vsldoi v1,v1,zeroes,4
|
||||
vsldoi v2,v2,zeroes,4
|
||||
vsldoi v3,v3,zeroes,4
|
||||
vsldoi v4,v4,zeroes,4
|
||||
vsldoi v5,v5,zeroes,4
|
||||
vsldoi v6,v6,zeroes,4
|
||||
vsldoi v7,v7,zeroes,4
|
||||
#endif
|
||||
|
||||
/* xor with last 1024 bits */
|
||||
lvx v8,0,r4
|
||||
lvx v9,off16,r4
|
||||
VPERM(v8,v8,v8,byteswap)
|
||||
VPERM(v9,v9,v9,byteswap)
|
||||
lvx v10,off32,r4
|
||||
lvx v11,off48,r4
|
||||
VPERM(v10,v10,v10,byteswap)
|
||||
VPERM(v11,v11,v11,byteswap)
|
||||
lvx v12,off64,r4
|
||||
lvx v13,off80,r4
|
||||
VPERM(v12,v12,v12,byteswap)
|
||||
VPERM(v13,v13,v13,byteswap)
|
||||
lvx v14,off96,r4
|
||||
lvx v15,off112,r4
|
||||
VPERM(v14,v14,v14,byteswap)
|
||||
VPERM(v15,v15,v15,byteswap)
|
||||
|
||||
addi r4,r4,8*16
|
||||
|
||||
vxor v16,v0,v8
|
||||
vxor v17,v1,v9
|
||||
vxor v18,v2,v10
|
||||
vxor v19,v3,v11
|
||||
vxor v20,v4,v12
|
||||
vxor v21,v5,v13
|
||||
vxor v22,v6,v14
|
||||
vxor v23,v7,v15
|
||||
|
||||
li r0,1
|
||||
cmpdi r6,0
|
||||
addi r6,r6,128
|
||||
bne 1b
|
||||
|
||||
/* Work out how many bytes we have left */
|
||||
andi. r5,r5,127
|
||||
|
||||
/* Calculate where in the constant table we need to start */
|
||||
subfic r6,r5,128
|
||||
add r3,r3,r6
|
||||
|
||||
/* How many 16 byte chunks are in the tail */
|
||||
srdi r7,r5,4
|
||||
mtctr r7
|
||||
|
||||
/*
|
||||
* Reduce the previously calculated 1024 bits to 64 bits, shifting
|
||||
* 32 bits to include the trailing 32 bits of zeros
|
||||
*/
|
||||
lvx v0,0,r3
|
||||
lvx v1,off16,r3
|
||||
lvx v2,off32,r3
|
||||
lvx v3,off48,r3
|
||||
lvx v4,off64,r3
|
||||
lvx v5,off80,r3
|
||||
lvx v6,off96,r3
|
||||
lvx v7,off112,r3
|
||||
addi r3,r3,8*16
|
||||
|
||||
VPMSUMW(v0,v16,v0)
|
||||
VPMSUMW(v1,v17,v1)
|
||||
VPMSUMW(v2,v18,v2)
|
||||
VPMSUMW(v3,v19,v3)
|
||||
VPMSUMW(v4,v20,v4)
|
||||
VPMSUMW(v5,v21,v5)
|
||||
VPMSUMW(v6,v22,v6)
|
||||
VPMSUMW(v7,v23,v7)
|
||||
|
||||
/* Now reduce the tail (0 - 112 bytes) */
|
||||
cmpdi r7,0
|
||||
beq 1f
|
||||
|
||||
lvx v16,0,r4
|
||||
lvx v17,0,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off16,r4
|
||||
lvx v17,off16,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off32,r4
|
||||
lvx v17,off32,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off48,r4
|
||||
lvx v17,off48,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off64,r4
|
||||
lvx v17,off64,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off80,r4
|
||||
lvx v17,off80,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off96,r4
|
||||
lvx v17,off96,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
|
||||
/* Now xor all the parallel chunks together */
|
||||
1: vxor v0,v0,v1
|
||||
vxor v2,v2,v3
|
||||
vxor v4,v4,v5
|
||||
vxor v6,v6,v7
|
||||
|
||||
vxor v0,v0,v2
|
||||
vxor v4,v4,v6
|
||||
|
||||
vxor v0,v0,v4
|
||||
|
||||
.Lbarrett_reduction:
|
||||
/* Barrett constants */
|
||||
addis r3,r2,.barrett_constants@toc@ha
|
||||
addi r3,r3,.barrett_constants@toc@l
|
||||
|
||||
lvx const1,0,r3
|
||||
lvx const2,off16,r3
|
||||
|
||||
vsldoi v1,v0,v0,8
|
||||
vxor v0,v0,v1 /* xor two 64 bit results together */
|
||||
|
||||
#ifdef REFLECT
|
||||
/* shift left one bit */
|
||||
vspltisb v1,1
|
||||
vsl v0,v0,v1
|
||||
#endif
|
||||
|
||||
vand v0,v0,mask_64bit
|
||||
#ifndef REFLECT
|
||||
/*
|
||||
* Now for the Barrett reduction algorithm. The idea is to calculate q,
|
||||
* the multiple of our polynomial that we need to subtract. By
|
||||
* doing the computation 2x bits higher (ie 64 bits) and shifting the
|
||||
* result back down 2x bits, we round down to the nearest multiple.
|
||||
*/
|
||||
VPMSUMD(v1,v0,const1) /* ma */
|
||||
vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */
|
||||
VPMSUMD(v1,v1,const2) /* qn */
|
||||
vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
|
||||
|
||||
/*
|
||||
* Get the result into r3. We need to shift it left 8 bytes:
|
||||
* V0 [ 0 1 2 X ]
|
||||
* V0 [ 0 X 2 3 ]
|
||||
*/
|
||||
vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */
|
||||
#else
|
||||
/*
|
||||
* The reflected version of Barrett reduction. Instead of bit
|
||||
* reflecting our data (which is expensive to do), we bit reflect our
|
||||
* constants and our algorithm, which means the intermediate data in
|
||||
* our vector registers goes from 0-63 instead of 63-0. We can reflect
|
||||
* the algorithm because we don't carry in mod 2 arithmetic.
|
||||
*/
|
||||
vand v1,v0,mask_32bit /* bottom 32 bits of a */
|
||||
VPMSUMD(v1,v1,const1) /* ma */
|
||||
vand v1,v1,mask_32bit /* bottom 32bits of ma */
|
||||
VPMSUMD(v1,v1,const2) /* qn */
|
||||
vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
|
||||
|
||||
/*
|
||||
* Since we are bit reflected, the result (ie the low 32 bits) is in
|
||||
* the high 32 bits. We just need to shift it left 4 bytes
|
||||
* V0 [ 0 1 X 3 ]
|
||||
* V0 [ 0 X 2 3 ]
|
||||
*/
|
||||
vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits */
|
||||
#endif
|
||||
|
||||
/* Get it into r3 */
|
||||
MFVRD(R3, v0)
|
||||
|
||||
.Lout:
|
||||
subi r6,r1,56+10*16
|
||||
subi r7,r1,56+2*16
|
||||
|
||||
lvx v20,0,r6
|
||||
lvx v21,off16,r6
|
||||
lvx v22,off32,r6
|
||||
lvx v23,off48,r6
|
||||
lvx v24,off64,r6
|
||||
lvx v25,off80,r6
|
||||
lvx v26,off96,r6
|
||||
lvx v27,off112,r6
|
||||
lvx v28,0,r7
|
||||
lvx v29,off16,r7
|
||||
|
||||
ld r31,-8(r1)
|
||||
ld r30,-16(r1)
|
||||
ld r29,-24(r1)
|
||||
ld r28,-32(r1)
|
||||
ld r27,-40(r1)
|
||||
ld r26,-48(r1)
|
||||
ld r25,-56(r1)
|
||||
|
||||
blr
|
||||
|
||||
.Lfirst_warm_up_done:
|
||||
lvx const1,0,r3
|
||||
addi r3,r3,16
|
||||
|
||||
VPMSUMD(v8,v16,const1)
|
||||
VPMSUMD(v9,v17,const1)
|
||||
VPMSUMD(v10,v18,const1)
|
||||
VPMSUMD(v11,v19,const1)
|
||||
VPMSUMD(v12,v20,const1)
|
||||
VPMSUMD(v13,v21,const1)
|
||||
VPMSUMD(v14,v22,const1)
|
||||
VPMSUMD(v15,v23,const1)
|
||||
|
||||
b .Lsecond_cool_down
|
||||
|
||||
.Lshort:
|
||||
cmpdi r5,0
|
||||
beq .Lzero
|
||||
|
||||
addis r3,r2,.short_constants@toc@ha
|
||||
addi r3,r3,.short_constants@toc@l
|
||||
|
||||
/* Calculate where in the constant table we need to start */
|
||||
subfic r6,r5,256
|
||||
add r3,r3,r6
|
||||
|
||||
/* How many 16 byte chunks? */
|
||||
srdi r7,r5,4
|
||||
mtctr r7
|
||||
|
||||
vxor v19,v19,v19
|
||||
vxor v20,v20,v20
|
||||
|
||||
lvx v0,0,r4
|
||||
lvx v16,0,r3
|
||||
VPERM(v0,v0,v16,byteswap)
|
||||
vxor v0,v0,v8 /* xor in initial value */
|
||||
VPMSUMW(v0,v0,v16)
|
||||
bdz .Lv0
|
||||
|
||||
lvx v1,off16,r4
|
||||
lvx v17,off16,r3
|
||||
VPERM(v1,v1,v17,byteswap)
|
||||
VPMSUMW(v1,v1,v17)
|
||||
bdz .Lv1
|
||||
|
||||
lvx v2,off32,r4
|
||||
lvx v16,off32,r3
|
||||
VPERM(v2,v2,v16,byteswap)
|
||||
VPMSUMW(v2,v2,v16)
|
||||
bdz .Lv2
|
||||
|
||||
lvx v3,off48,r4
|
||||
lvx v17,off48,r3
|
||||
VPERM(v3,v3,v17,byteswap)
|
||||
VPMSUMW(v3,v3,v17)
|
||||
bdz .Lv3
|
||||
|
||||
lvx v4,off64,r4
|
||||
lvx v16,off64,r3
|
||||
VPERM(v4,v4,v16,byteswap)
|
||||
VPMSUMW(v4,v4,v16)
|
||||
bdz .Lv4
|
||||
|
||||
lvx v5,off80,r4
|
||||
lvx v17,off80,r3
|
||||
VPERM(v5,v5,v17,byteswap)
|
||||
VPMSUMW(v5,v5,v17)
|
||||
bdz .Lv5
|
||||
|
||||
lvx v6,off96,r4
|
||||
lvx v16,off96,r3
|
||||
VPERM(v6,v6,v16,byteswap)
|
||||
VPMSUMW(v6,v6,v16)
|
||||
bdz .Lv6
|
||||
|
||||
lvx v7,off112,r4
|
||||
lvx v17,off112,r3
|
||||
VPERM(v7,v7,v17,byteswap)
|
||||
VPMSUMW(v7,v7,v17)
|
||||
bdz .Lv7
|
||||
|
||||
addi r3,r3,128
|
||||
addi r4,r4,128
|
||||
|
||||
lvx v8,0,r4
|
||||
lvx v16,0,r3
|
||||
VPERM(v8,v8,v16,byteswap)
|
||||
VPMSUMW(v8,v8,v16)
|
||||
bdz .Lv8
|
||||
|
||||
lvx v9,off16,r4
|
||||
lvx v17,off16,r3
|
||||
VPERM(v9,v9,v17,byteswap)
|
||||
VPMSUMW(v9,v9,v17)
|
||||
bdz .Lv9
|
||||
|
||||
lvx v10,off32,r4
|
||||
lvx v16,off32,r3
|
||||
VPERM(v10,v10,v16,byteswap)
|
||||
VPMSUMW(v10,v10,v16)
|
||||
bdz .Lv10
|
||||
|
||||
lvx v11,off48,r4
|
||||
lvx v17,off48,r3
|
||||
VPERM(v11,v11,v17,byteswap)
|
||||
VPMSUMW(v11,v11,v17)
|
||||
bdz .Lv11
|
||||
|
||||
lvx v12,off64,r4
|
||||
lvx v16,off64,r3
|
||||
VPERM(v12,v12,v16,byteswap)
|
||||
VPMSUMW(v12,v12,v16)
|
||||
bdz .Lv12
|
||||
|
||||
lvx v13,off80,r4
|
||||
lvx v17,off80,r3
|
||||
VPERM(v13,v13,v17,byteswap)
|
||||
VPMSUMW(v13,v13,v17)
|
||||
bdz .Lv13
|
||||
|
||||
lvx v14,off96,r4
|
||||
lvx v16,off96,r3
|
||||
VPERM(v14,v14,v16,byteswap)
|
||||
VPMSUMW(v14,v14,v16)
|
||||
bdz .Lv14
|
||||
|
||||
lvx v15,off112,r4
|
||||
lvx v17,off112,r3
|
||||
VPERM(v15,v15,v17,byteswap)
|
||||
VPMSUMW(v15,v15,v17)
|
||||
|
||||
.Lv15: vxor v19,v19,v15
|
||||
.Lv14: vxor v20,v20,v14
|
||||
.Lv13: vxor v19,v19,v13
|
||||
.Lv12: vxor v20,v20,v12
|
||||
.Lv11: vxor v19,v19,v11
|
||||
.Lv10: vxor v20,v20,v10
|
||||
.Lv9: vxor v19,v19,v9
|
||||
.Lv8: vxor v20,v20,v8
|
||||
.Lv7: vxor v19,v19,v7
|
||||
.Lv6: vxor v20,v20,v6
|
||||
.Lv5: vxor v19,v19,v5
|
||||
.Lv4: vxor v20,v20,v4
|
||||
.Lv3: vxor v19,v19,v3
|
||||
.Lv2: vxor v20,v20,v2
|
||||
.Lv1: vxor v19,v19,v1
|
||||
.Lv0: vxor v20,v20,v0
|
||||
|
||||
vxor v0,v19,v20
|
||||
|
||||
b .Lbarrett_reduction
|
||||
|
||||
.Lzero:
|
||||
mr r3,r10
|
||||
b .Lout
|
||||
|
||||
FUNC_END(CRC_FUNCTION_NAME)
|
@@ -1,20 +1,5 @@
|
||||
/*
|
||||
* Calculate the checksum of data that is 16 byte aligned and a multiple of
|
||||
* 16 bytes.
|
||||
*
|
||||
* The first step is to reduce it to 1024 bits. We do this in 8 parallel
|
||||
* chunks in order to mask the latency of the vpmsum instructions. If we
|
||||
* have more than 32 kB of data to checksum we repeat this step multiple
|
||||
* times, passing in the previous 1024 bits.
|
||||
*
|
||||
* The next step is to reduce the 1024 bits to 64 bits. This step adds
|
||||
* 32 bits of 0s to the end - this matches what a CRC does. We just
|
||||
* calculate constants that land the data in this 32 bits.
|
||||
*
|
||||
* We then use fixed point Barrett reduction to compute a mod n over GF(2)
|
||||
* for n = CRC using POWER8 instructions. We use x = 32.
|
||||
*
|
||||
* http://en.wikipedia.org/wiki/Barrett_reduction
|
||||
* Calculate a crc32c with vpmsum acceleration
|
||||
*
|
||||
* Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
|
||||
*
|
||||
@@ -23,9 +8,6 @@
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*/
|
||||
#include <asm/ppc_asm.h>
|
||||
#include <asm/ppc-opcode.h>
|
||||
|
||||
.section .rodata
|
||||
.balign 16
|
||||
|
||||
@@ -33,7 +15,6 @@
|
||||
/* byte reverse permute constant */
|
||||
.octa 0x0F0E0D0C0B0A09080706050403020100
|
||||
|
||||
#define MAX_SIZE 32768
|
||||
.constants:
|
||||
|
||||
/* Reduce 262144 bits to 1024 bits */
|
||||
@@ -860,694 +841,6 @@
|
||||
/* 33 bit reflected Barrett constant n */
|
||||
.octa 0x00000000000000000000000105ec76f1
|
||||
|
||||
.text
|
||||
|
||||
#if defined(__BIG_ENDIAN__)
|
||||
#define BYTESWAP_DATA
|
||||
#else
|
||||
#undef BYTESWAP_DATA
|
||||
#endif
|
||||
|
||||
#define off16 r25
|
||||
#define off32 r26
|
||||
#define off48 r27
|
||||
#define off64 r28
|
||||
#define off80 r29
|
||||
#define off96 r30
|
||||
#define off112 r31
|
||||
|
||||
#define const1 v24
|
||||
#define const2 v25
|
||||
|
||||
#define byteswap v26
|
||||
#define mask_32bit v27
|
||||
#define mask_64bit v28
|
||||
#define zeroes v29
|
||||
|
||||
#ifdef BYTESWAP_DATA
|
||||
#define VPERM(A, B, C, D) vperm A, B, C, D
|
||||
#else
|
||||
#define VPERM(A, B, C, D)
|
||||
#endif
|
||||
|
||||
/*
 * Entry and setup. Going by the prototype below and the uses in this
 * routine, r3 carries crc, r4 the data pointer and r5 the length
 * (NOTE(review): standard argument registers assumed - confirm).
 * Saves the non-volatile registers it uses, builds the helper vectors,
 * and sends lengths below 256 to .Lshort.
 */
/* unsigned int __crc32c_vpmsum(unsigned int crc, void *p, unsigned long len) */
|
||||
FUNC_START(__crc32c_vpmsum)
|
||||
/* Save r25-r31 at negative offsets from r1; r1 itself is not moved. */
std r31,-8(r1)
|
||||
std r30,-16(r1)
|
||||
std r29,-24(r1)
|
||||
std r28,-32(r1)
|
||||
std r27,-40(r1)
|
||||
std r26,-48(r1)
|
||||
std r25,-56(r1)
|
||||
|
||||
/* Index registers for lvx/stvx: offN = N. */
li off16,16
|
||||
li off32,32
|
||||
li off48,48
|
||||
li off64,64
|
||||
li off80,80
|
||||
li off96,96
|
||||
li off112,112
|
||||
/* r0 is the "looping back" flag tested with cmpdi r0,1 before the first warm up pass. */
li r0,0
|
||||
|
||||
/* Enough room for saving 10 non volatile VMX registers */
|
||||
/* r6 = base for v20-v27, r7 = base for v28-v29, both below the GPR save area. */
subi r6,r1,56+10*16
|
||||
subi r7,r1,56+2*16
|
||||
|
||||
stvx v20,0,r6
|
||||
stvx v21,off16,r6
|
||||
stvx v22,off32,r6
|
||||
stvx v23,off48,r6
|
||||
stvx v24,off64,r6
|
||||
stvx v25,off80,r6
|
||||
stvx v26,off96,r6
|
||||
stvx v27,off112,r6
|
||||
stvx v28,0,r7
|
||||
stvx v29,off16,r7
|
||||
|
||||
/* Keep the incoming r3 in r10; .Lzero returns it unchanged and r3 is reused as a table pointer. */
mr r10,r3
|
||||
|
||||
vxor zeroes,zeroes,zeroes
|
||||
/* v0 = all ones, used only to build the two masks below. */
vspltisw v0,-1
|
||||
|
||||
/* mask_32bit: ones in the last 4 bytes only; mask_64bit: ones in the last 8 bytes only. */
vsldoi mask_32bit,zeroes,v0,4
|
||||
vsldoi mask_64bit,zeroes,v0,8
|
||||
|
||||
/* Get the initial value into v8 */
|
||||
vxor v8,v8,v8
|
||||
MTVRD(v8, R3)
|
||||
vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */
|
||||
|
||||
#ifdef BYTESWAP_DATA
|
||||
addis r3,r2,.byteswap_constant@toc@ha
|
||||
addi r3,r3,.byteswap_constant@toc@l
|
||||
|
||||
lvx byteswap,0,r3
|
||||
addi r3,r3,16
|
||||
#endif
|
||||
|
||||
/* Fewer than 256 bytes: use the fully unrolled short path. */
cmpdi r5,256
|
||||
blt .Lshort
|
||||
|
||||
/* r6 = len with the low 7 bits cleared, i.e. the part handled in 128-byte steps. */
rldicr r6,r5,0,56
|
||||
|
||||
/*
 * Per-block setup. Takes min(r6, MAX_SIZE) bytes from the remaining
 * 128-byte-aligned length in r6, positions r3 in the .constants table for
 * that block size, clears the accumulators v0-v7 and, on the first pass
 * only, loads the first 128 bytes of data into v16-v23.
 */
/* Checksum in blocks of MAX_SIZE */
|
||||
1: lis r7,MAX_SIZE@h
|
||||
ori r7,r7,MAX_SIZE@l
|
||||
mr r9,r7
|
||||
cmpd r6,r7
|
||||
bgt 2f
|
||||
mr r7,r6
|
||||
/* r7 = size of this block, r6 = bytes left after it. */
2: subf r6,r7,r6
|
||||
|
||||
/* our main loop does 128 bytes at a time */
|
||||
srdi r7,r7,7
|
||||
|
||||
/*
|
||||
* Work out the offset into the constants table to start at. Each
|
||||
* constant is 16 bytes, and it is used against 128 bytes of input
|
||||
* data - 128 / 16 = 8
|
||||
*/
|
||||
/* r8 = MAX_SIZE/8 - chunks*16: smaller blocks start further into the table. */
sldi r8,r7,4
|
||||
srdi r9,r9,3
|
||||
subf r8,r8,r9
|
||||
|
||||
/* We reduce our final 128 bytes in a separate step */
|
||||
addi r7,r7,-1
|
||||
mtctr r7
|
||||
|
||||
addis r3,r2,.constants@toc@ha
|
||||
addi r3,r3,.constants@toc@l
|
||||
|
||||
/* Find the start of our constants */
|
||||
add r3,r3,r8
|
||||
|
||||
/* zero v0-v7 which will contain our checksums */
|
||||
vxor v0,v0,v0
|
||||
vxor v1,v1,v1
|
||||
vxor v2,v2,v2
|
||||
vxor v3,v3,v3
|
||||
vxor v4,v4,v4
|
||||
vxor v5,v5,v5
|
||||
vxor v6,v6,v6
|
||||
vxor v7,v7,v7
|
||||
|
||||
lvx const1,0,r3
|
||||
|
||||
/*
|
||||
* If we are looping back to consume more data we use the values
|
||||
* already in v16-v23.
|
||||
*/
|
||||
/* r0 is 0 on the first pass and set to 1 before every branch back to 1:. */
cmpdi r0,1
|
||||
beq 2f
|
||||
|
||||
/* First warm up pass */
|
||||
lvx v16,0,r4
|
||||
lvx v17,off16,r4
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPERM(v17,v17,v17,byteswap)
|
||||
lvx v18,off32,r4
|
||||
lvx v19,off48,r4
|
||||
VPERM(v18,v18,v18,byteswap)
|
||||
VPERM(v19,v19,v19,byteswap)
|
||||
lvx v20,off64,r4
|
||||
lvx v21,off80,r4
|
||||
VPERM(v20,v20,v20,byteswap)
|
||||
VPERM(v21,v21,v21,byteswap)
|
||||
lvx v22,off96,r4
|
||||
lvx v23,off112,r4
|
||||
VPERM(v22,v22,v22,byteswap)
|
||||
VPERM(v23,v23,v23,byteswap)
|
||||
addi r4,r4,8*16
|
||||
|
||||
/* xor in initial value */
|
||||
vxor v16,v16,v8
|
||||
|
||||
/*
 * Second warm up: multiply the eight vectors already in v16-v23 by const1
 * into v8-v15 while loading the next 128 bytes into v16-v23. Nothing is
 * accumulated into v0-v7 yet. If the count register reaches zero at the
 * first bdz there is no further data to load for this block, so the
 * multiplies are done at .Lfirst_warm_up_done instead.
 * The "ori r2,r2,0" lines leave r2 unchanged; presumably they are
 * scheduling padding - confirm.
 */
2: bdz .Lfirst_warm_up_done
|
||||
|
||||
/* Pre-load the next table entry into const2 for the first main-loop pass. */
addi r3,r3,16
|
||||
lvx const2,0,r3
|
||||
|
||||
/* Second warm up pass */
|
||||
VPMSUMD(v8,v16,const1)
|
||||
lvx v16,0,r4
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v9,v17,const1)
|
||||
lvx v17,off16,r4
|
||||
VPERM(v17,v17,v17,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v10,v18,const1)
|
||||
lvx v18,off32,r4
|
||||
VPERM(v18,v18,v18,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v11,v19,const1)
|
||||
lvx v19,off48,r4
|
||||
VPERM(v19,v19,v19,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v12,v20,const1)
|
||||
lvx v20,off64,r4
|
||||
VPERM(v20,v20,v20,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v13,v21,const1)
|
||||
lvx v21,off80,r4
|
||||
VPERM(v21,v21,v21,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v14,v22,const1)
|
||||
lvx v22,off96,r4
|
||||
VPERM(v22,v22,v22,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
VPMSUMD(v15,v23,const1)
|
||||
lvx v23,off112,r4
|
||||
VPERM(v23,v23,v23,byteswap)
|
||||
|
||||
addi r4,r4,8*16
|
||||
|
||||
/* No more data to load for this block: skip the steady-state loop. */
bdz .Lfirst_cool_down
|
||||
|
||||
/*
 * Steady-state loop, one 128-byte chunk per iteration, counted by CTR.
 * For each of the eight lanes: fold the previous product (v8-v15) into the
 * accumulator (v0-v7), start the next product from the data loaded last
 * time (v16-v23), then load fresh data into that data register.
 * Lanes 0-3 multiply by const2 and lanes 4-7 by const1; const1 is reloaded
 * at the top and const2 after lane 3, each from the table entry at r3.
 */
/*
|
||||
* main loop. We modulo schedule it such that it takes three iterations
|
||||
* to complete - first iteration load, second iteration vpmsum, third
|
||||
* iteration xor.
|
||||
*/
|
||||
.balign 16
|
||||
4: lvx const1,0,r3
|
||||
addi r3,r3,16
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v0,v0,v8
|
||||
VPMSUMD(v8,v16,const2)
|
||||
lvx v16,0,r4
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v1,v1,v9
|
||||
VPMSUMD(v9,v17,const2)
|
||||
lvx v17,off16,r4
|
||||
VPERM(v17,v17,v17,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v2,v2,v10
|
||||
VPMSUMD(v10,v18,const2)
|
||||
lvx v18,off32,r4
|
||||
VPERM(v18,v18,v18,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v3,v3,v11
|
||||
VPMSUMD(v11,v19,const2)
|
||||
lvx v19,off48,r4
|
||||
VPERM(v19,v19,v19,byteswap)
|
||||
/* const2 is free from here on: fetch the entry the next iteration will use. */
lvx const2,0,r3
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v4,v4,v12
|
||||
VPMSUMD(v12,v20,const1)
|
||||
lvx v20,off64,r4
|
||||
VPERM(v20,v20,v20,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v5,v5,v13
|
||||
VPMSUMD(v13,v21,const1)
|
||||
lvx v21,off80,r4
|
||||
VPERM(v21,v21,v21,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v6,v6,v14
|
||||
VPMSUMD(v14,v22,const1)
|
||||
lvx v22,off96,r4
|
||||
VPERM(v22,v22,v22,byteswap)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v7,v7,v15
|
||||
VPMSUMD(v15,v23,const1)
|
||||
lvx v23,off112,r4
|
||||
VPERM(v23,v23,v23,byteswap)
|
||||
|
||||
addi r4,r4,8*16
|
||||
|
||||
bdnz 4b
|
||||
|
||||
/*
 * First cool down: same xor-then-multiply step as the main loop for all
 * eight lanes, but with no further loads. Folds the pending products
 * v8-v15 into v0-v7 and multiplies the last loaded data v16-v23 by the
 * next table entry. Falls through into .Lsecond_cool_down.
 */
.Lfirst_cool_down:
|
||||
/* First cool down pass */
|
||||
lvx const1,0,r3
|
||||
addi r3,r3,16
|
||||
|
||||
vxor v0,v0,v8
|
||||
VPMSUMD(v8,v16,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v1,v1,v9
|
||||
VPMSUMD(v9,v17,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v2,v2,v10
|
||||
VPMSUMD(v10,v18,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v3,v3,v11
|
||||
VPMSUMD(v11,v19,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v4,v4,v12
|
||||
VPMSUMD(v12,v20,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v5,v5,v13
|
||||
VPMSUMD(v13,v21,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v6,v6,v14
|
||||
VPMSUMD(v14,v22,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
vxor v7,v7,v15
|
||||
VPMSUMD(v15,v23,const1)
|
||||
ori r2,r2,0
|
||||
|
||||
/*
 * Second cool down and block hand-over. Folds the last products into
 * v0-v7, shifts each accumulator left by 4 bytes, xors in the final
 * 128 bytes of this block and leaves the result in v16-v23. If more
 * 128-byte-aligned data remains (r6 != 0), branches back to 1: with
 * r0 = 1 so those registers are used as the next block's first chunk.
 * Also entered directly from .Lfirst_warm_up_done.
 */
.Lsecond_cool_down:
|
||||
/* Second cool down pass */
|
||||
vxor v0,v0,v8
|
||||
vxor v1,v1,v9
|
||||
vxor v2,v2,v10
|
||||
vxor v3,v3,v11
|
||||
vxor v4,v4,v12
|
||||
vxor v5,v5,v13
|
||||
vxor v6,v6,v14
|
||||
vxor v7,v7,v15
|
||||
|
||||
/*
|
||||
* vpmsumd produces a 96 bit result in the least significant bits
|
||||
* of the register. Since we are bit reflected we have to shift it
|
||||
* left 32 bits so it occupies the least significant bits in the
|
||||
* bit reflected domain.
|
||||
*/
|
||||
vsldoi v0,v0,zeroes,4
|
||||
vsldoi v1,v1,zeroes,4
|
||||
vsldoi v2,v2,zeroes,4
|
||||
vsldoi v3,v3,zeroes,4
|
||||
vsldoi v4,v4,zeroes,4
|
||||
vsldoi v5,v5,zeroes,4
|
||||
vsldoi v6,v6,zeroes,4
|
||||
vsldoi v7,v7,zeroes,4
|
||||
|
||||
/* xor with last 1024 bits */
|
||||
lvx v8,0,r4
|
||||
lvx v9,off16,r4
|
||||
VPERM(v8,v8,v8,byteswap)
|
||||
VPERM(v9,v9,v9,byteswap)
|
||||
lvx v10,off32,r4
|
||||
lvx v11,off48,r4
|
||||
VPERM(v10,v10,v10,byteswap)
|
||||
VPERM(v11,v11,v11,byteswap)
|
||||
lvx v12,off64,r4
|
||||
lvx v13,off80,r4
|
||||
VPERM(v12,v12,v12,byteswap)
|
||||
VPERM(v13,v13,v13,byteswap)
|
||||
lvx v14,off96,r4
|
||||
lvx v15,off112,r4
|
||||
VPERM(v14,v14,v14,byteswap)
|
||||
VPERM(v15,v15,v15,byteswap)
|
||||
|
||||
addi r4,r4,8*16
|
||||
|
||||
vxor v16,v0,v8
|
||||
vxor v17,v1,v9
|
||||
vxor v18,v2,v10
|
||||
vxor v19,v3,v11
|
||||
vxor v20,v4,v12
|
||||
vxor v21,v5,v13
|
||||
vxor v22,v6,v14
|
||||
vxor v23,v7,v15
|
||||
|
||||
/* Mark that v16-v23 already hold data for the next block. */
li r0,1
|
||||
/*
 * The compare sees r6 before the add; addi does not touch the condition
 * register. The extra 128 presumably accounts for the chunk carried over
 * in v16-v23 - confirm.
 */
cmpdi r6,0
|
||||
addi r6,r6,128
|
||||
bne 1b
|
||||
|
||||
/*
 * Tail. Multiplies the eight carried vectors v16-v23 by eight table
 * entries (VPMSUMW), then processes up to seven further 16-byte chunks
 * of input, each against its own table entry, xoring them into v0.
 * Finally xors v0-v7 together into v0 and falls through into the
 * Barrett reduction.
 * NOTE(review): only (len & 127) >> 4 whole 16-byte chunks are consumed
 * here; any remaining len & 15 bytes are not touched by this code -
 * presumably the caller guarantees a multiple of 16. Confirm.
 */
/* Work out how many bytes we have left */
|
||||
andi. r5,r5,127
|
||||
|
||||
/* Calculate where in the constant table we need to start */
|
||||
/* r6 = 128 - tail bytes; a shorter tail starts further into the table. */
subfic r6,r5,128
|
||||
add r3,r3,r6
|
||||
|
||||
/* How many 16 byte chunks are in the tail */
|
||||
srdi r7,r5,4
|
||||
mtctr r7
|
||||
|
||||
/*
|
||||
* Reduce the previously calculated 1024 bits to 64 bits, shifting
|
||||
* 32 bits to include the trailing 32 bits of zeros
|
||||
*/
|
||||
lvx v0,0,r3
|
||||
lvx v1,off16,r3
|
||||
lvx v2,off32,r3
|
||||
lvx v3,off48,r3
|
||||
lvx v4,off64,r3
|
||||
lvx v5,off80,r3
|
||||
lvx v6,off96,r3
|
||||
lvx v7,off112,r3
|
||||
addi r3,r3,8*16
|
||||
|
||||
VPMSUMW(v0,v16,v0)
|
||||
VPMSUMW(v1,v17,v1)
|
||||
VPMSUMW(v2,v18,v2)
|
||||
VPMSUMW(v3,v19,v3)
|
||||
VPMSUMW(v4,v20,v4)
|
||||
VPMSUMW(v5,v21,v5)
|
||||
VPMSUMW(v6,v22,v6)
|
||||
VPMSUMW(v7,v23,v7)
|
||||
|
||||
/* Now reduce the tail (0 - 112 bytes) */
|
||||
/* CTR may be zero here, so test r7 explicitly before the first bdz. */
cmpdi r7,0
|
||||
beq 1f
|
||||
|
||||
lvx v16,0,r4
|
||||
lvx v17,0,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off16,r4
|
||||
lvx v17,off16,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off32,r4
|
||||
lvx v17,off32,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off48,r4
|
||||
lvx v17,off48,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off64,r4
|
||||
lvx v17,off64,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
lvx v16,off80,r4
|
||||
lvx v17,off80,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
bdz 1f
|
||||
|
||||
/* Seventh and last possible chunk: no bdz needed, fall through. */
lvx v16,off96,r4
|
||||
lvx v17,off96,r3
|
||||
VPERM(v16,v16,v16,byteswap)
|
||||
VPMSUMW(v16,v16,v17)
|
||||
vxor v0,v0,v16
|
||||
|
||||
/* Now xor all the parallel chunks together */
|
||||
1: vxor v0,v0,v1
|
||||
vxor v2,v2,v3
|
||||
vxor v4,v4,v5
|
||||
vxor v6,v6,v7
|
||||
|
||||
vxor v0,v0,v2
|
||||
vxor v4,v4,v6
|
||||
|
||||
vxor v0,v0,v4
|
||||
|
||||
/*
 * Final reduction of the folded value in v0 to the result returned in r3.
 * Reached by fall-through from the tail code and by branch from .Lshort.
 * Combines the two 64-bit halves of v0, then applies the two-multiply
 * Barrett step with the constants at .barrett_constants.
 */
.Lbarrett_reduction:
|
||||
/* Barrett constants */
|
||||
addis r3,r2,.barrett_constants@toc@ha
|
||||
addi r3,r3,.barrett_constants@toc@l
|
||||
|
||||
lvx const1,0,r3
|
||||
lvx const2,off16,r3
|
||||
|
||||
/* v1 = v0 with its two doublewords swapped. */
vsldoi v1,v0,v0,8
|
||||
vxor v0,v0,v1 /* xor two 64 bit results together */
|
||||
|
||||
/* shift left one bit */
|
||||
vspltisb v1,1
|
||||
vsl v0,v0,v1
|
||||
|
||||
/* Keep only the part of v0 selected by mask_64bit. */
vand v0,v0,mask_64bit
|
||||
|
||||
/*
|
||||
* The reflected version of Barrett reduction. Instead of bit
|
||||
* reflecting our data (which is expensive to do), we bit reflect our
|
||||
* constants and our algorithm, which means the intermediate data in
|
||||
* our vector registers goes from 0-63 instead of 63-0. We can reflect
|
||||
* the algorithm because we don't carry in mod 2 arithmetic.
|
||||
*/
|
||||
vand v1,v0,mask_32bit /* bottom 32 bits of a */
|
||||
VPMSUMD(v1,v1,const1) /* ma */
|
||||
vand v1,v1,mask_32bit /* bottom 32bits of ma */
|
||||
VPMSUMD(v1,v1,const2) /* qn */
|
||||
vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
|
||||
|
||||
/*
|
||||
* Since we are bit reflected, the result (ie the low 32 bits) is in
|
||||
* the high 32 bits. We just need to shift it left 4 bytes
|
||||
* V0 [ 0 1 X 3 ]
|
||||
* V0 [ 0 X 2 3 ]
|
||||
*/
|
||||
vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of v0 */
|
||||
|
||||
/* Get it into r3 */
|
||||
MFVRD(R3, v0)
|
||||
|
||||
/*
 * Common exit. Reloads v20-v29 and r25-r31 from the same r1-relative
 * slots the entry code stored them to, then returns with the result
 * already in r3.
 */
.Lout:
|
||||
/* Recompute the two vector save-area bases exactly as at entry. */
subi r6,r1,56+10*16
|
||||
subi r7,r1,56+2*16
|
||||
|
||||
/* offN (r25-r31) are still needed as indices here, so vectors are restored first. */
lvx v20,0,r6
|
||||
lvx v21,off16,r6
|
||||
lvx v22,off32,r6
|
||||
lvx v23,off48,r6
|
||||
lvx v24,off64,r6
|
||||
lvx v25,off80,r6
|
||||
lvx v26,off96,r6
|
||||
lvx v27,off112,r6
|
||||
lvx v28,0,r7
|
||||
lvx v29,off16,r7
|
||||
|
||||
ld r31,-8(r1)
|
||||
ld r30,-16(r1)
|
||||
ld r29,-24(r1)
|
||||
ld r28,-32(r1)
|
||||
ld r27,-40(r1)
|
||||
ld r26,-48(r1)
|
||||
ld r25,-56(r1)
|
||||
|
||||
blr
|
||||
|
||||
/*
 * Taken from the first bdz after block setup when the count register
 * reaches zero, i.e. there is no further data to load for this block.
 * Multiplies the eight vectors in v16-v23 by the next table entry into
 * v8-v15 and joins the normal path at .Lsecond_cool_down, which folds
 * them into v0-v7.
 */
.Lfirst_warm_up_done:
|
||||
lvx const1,0,r3
|
||||
addi r3,r3,16
|
||||
|
||||
VPMSUMD(v8,v16,const1)
|
||||
VPMSUMD(v9,v17,const1)
|
||||
VPMSUMD(v10,v18,const1)
|
||||
VPMSUMD(v11,v19,const1)
|
||||
VPMSUMD(v12,v20,const1)
|
||||
VPMSUMD(v13,v21,const1)
|
||||
VPMSUMD(v14,v22,const1)
|
||||
VPMSUMD(v15,v23,const1)
|
||||
|
||||
b .Lsecond_cool_down
|
||||
|
||||
/*
 * Short path, entered when len (r5) is below 256. Each 16-byte chunk of
 * input is loaded into its own register (v0 upward), multiplied by its
 * own entry of .short_constants (VPMSUMW), and the count register decides
 * where to jump into the xor chain at .LvN. The chain folds the products
 * alternately into v19 and v20, which are combined into v0 before the
 * branch to .Lbarrett_reduction. A zero length goes to .Lzero.
 * NOTE(review): CTR is len >> 4. For len < 16 it is zero and the first
 * bdz would decrement past zero and not branch - presumably callers never
 * pass 1..15. Confirm.
 * NOTE(review): VPERM here passes the constant register as the second
 * source, unlike the main path which passes the data register twice.
 * This is harmless only if every index in the byteswap vector is below
 * 16 - confirm against .byteswap_constant.
 */
.Lshort:
|
||||
cmpdi r5,0
|
||||
beq .Lzero
|
||||
|
||||
addis r3,r2,.short_constants@toc@ha
|
||||
addi r3,r3,.short_constants@toc@l
|
||||
|
||||
/* Calculate where in the constant table we need to start */
|
||||
/* r6 = 256 - len; shorter inputs start further into the table. */
subfic r6,r5,256
|
||||
add r3,r3,r6
|
||||
|
||||
/* How many 16 byte chunks? */
|
||||
srdi r7,r5,4
|
||||
mtctr r7
|
||||
|
||||
/* v19 and v20 are the two partial accumulators of the xor chain. */
vxor v19,v19,v19
|
||||
vxor v20,v20,v20
|
||||
|
||||
lvx v0,0,r4
|
||||
lvx v16,0,r3
|
||||
VPERM(v0,v0,v16,byteswap)
|
||||
vxor v0,v0,v8 /* xor in initial value */
|
||||
VPMSUMW(v0,v0,v16)
|
||||
bdz .Lv0
|
||||
|
||||
lvx v1,off16,r4
|
||||
lvx v17,off16,r3
|
||||
VPERM(v1,v1,v17,byteswap)
|
||||
VPMSUMW(v1,v1,v17)
|
||||
bdz .Lv1
|
||||
|
||||
lvx v2,off32,r4
|
||||
lvx v16,off32,r3
|
||||
VPERM(v2,v2,v16,byteswap)
|
||||
VPMSUMW(v2,v2,v16)
|
||||
bdz .Lv2
|
||||
|
||||
lvx v3,off48,r4
|
||||
lvx v17,off48,r3
|
||||
VPERM(v3,v3,v17,byteswap)
|
||||
VPMSUMW(v3,v3,v17)
|
||||
bdz .Lv3
|
||||
|
||||
lvx v4,off64,r4
|
||||
lvx v16,off64,r3
|
||||
VPERM(v4,v4,v16,byteswap)
|
||||
VPMSUMW(v4,v4,v16)
|
||||
bdz .Lv4
|
||||
|
||||
lvx v5,off80,r4
|
||||
lvx v17,off80,r3
|
||||
VPERM(v5,v5,v17,byteswap)
|
||||
VPMSUMW(v5,v5,v17)
|
||||
bdz .Lv5
|
||||
|
||||
lvx v6,off96,r4
|
||||
lvx v16,off96,r3
|
||||
VPERM(v6,v6,v16,byteswap)
|
||||
VPMSUMW(v6,v6,v16)
|
||||
bdz .Lv6
|
||||
|
||||
lvx v7,off112,r4
|
||||
lvx v17,off112,r3
|
||||
VPERM(v7,v7,v17,byteswap)
|
||||
VPMSUMW(v7,v7,v17)
|
||||
bdz .Lv7
|
||||
|
||||
/* Second group of eight: advance both pointers so the offN indices can be reused. */
addi r3,r3,128
|
||||
addi r4,r4,128
|
||||
|
||||
/* v8 held the initial value; it was consumed above and is reused for data from here. */
lvx v8,0,r4
|
||||
lvx v16,0,r3
|
||||
VPERM(v8,v8,v16,byteswap)
|
||||
VPMSUMW(v8,v8,v16)
|
||||
bdz .Lv8
|
||||
|
||||
lvx v9,off16,r4
|
||||
lvx v17,off16,r3
|
||||
VPERM(v9,v9,v17,byteswap)
|
||||
VPMSUMW(v9,v9,v17)
|
||||
bdz .Lv9
|
||||
|
||||
lvx v10,off32,r4
|
||||
lvx v16,off32,r3
|
||||
VPERM(v10,v10,v16,byteswap)
|
||||
VPMSUMW(v10,v10,v16)
|
||||
bdz .Lv10
|
||||
|
||||
lvx v11,off48,r4
|
||||
lvx v17,off48,r3
|
||||
VPERM(v11,v11,v17,byteswap)
|
||||
VPMSUMW(v11,v11,v17)
|
||||
bdz .Lv11
|
||||
|
||||
lvx v12,off64,r4
|
||||
lvx v16,off64,r3
|
||||
VPERM(v12,v12,v16,byteswap)
|
||||
VPMSUMW(v12,v12,v16)
|
||||
bdz .Lv12
|
||||
|
||||
lvx v13,off80,r4
|
||||
lvx v17,off80,r3
|
||||
VPERM(v13,v13,v17,byteswap)
|
||||
VPMSUMW(v13,v13,v17)
|
||||
bdz .Lv13
|
||||
|
||||
lvx v14,off96,r4
|
||||
lvx v16,off96,r3
|
||||
VPERM(v14,v14,v16,byteswap)
|
||||
VPMSUMW(v14,v14,v16)
|
||||
bdz .Lv14
|
||||
|
||||
lvx v15,off112,r4
|
||||
lvx v17,off112,r3
|
||||
VPERM(v15,v15,v17,byteswap)
|
||||
VPMSUMW(v15,v15,v17)
|
||||
|
||||
/* Xor chain: entering at .LvN folds vN down to v0, odd N into v19 and even N into v20. */
.Lv15: vxor v19,v19,v15
|
||||
.Lv14: vxor v20,v20,v14
|
||||
.Lv13: vxor v19,v19,v13
|
||||
.Lv12: vxor v20,v20,v12
|
||||
.Lv11: vxor v19,v19,v11
|
||||
.Lv10: vxor v20,v20,v10
|
||||
.Lv9: vxor v19,v19,v9
|
||||
.Lv8: vxor v20,v20,v8
|
||||
.Lv7: vxor v19,v19,v7
|
||||
.Lv6: vxor v20,v20,v6
|
||||
.Lv5: vxor v19,v19,v5
|
||||
.Lv4: vxor v20,v20,v4
|
||||
.Lv3: vxor v19,v19,v3
|
||||
.Lv2: vxor v20,v20,v2
|
||||
.Lv1: vxor v19,v19,v1
|
||||
.Lv0: vxor v20,v20,v0
|
||||
|
||||
vxor v0,v19,v20
|
||||
|
||||
b .Lbarrett_reduction
|
||||
|
||||
/*
 * Zero-length input: return the incoming crc unchanged. r10 holds the
 * copy of r3 taken at entry; .Lout restores the saved registers and
 * returns.
 */
.Lzero:
|
||||
mr r3,r10
|
||||
b .Lout
|
||||
|
||||
/* The name given to FUNC_END must match FUNC_START(__crc32c_vpmsum). */
FUNC_END(__crc32c_vpmsum)
|
||||
/*
 * Configure and pull in the shared implementation file.
 * NOTE(review): presumably crc32-vpmsum_core.S emits a routine named by
 * CRC_FUNCTION_NAME and selects its bit-reflected variant when REFLECT
 * is defined - confirm against that file.
 */
#define CRC_FUNCTION_NAME __crc32c_vpmsum
|
||||
#define REFLECT
|
||||
#include "crc32-vpmsum_core.S"
|
||||
|
850
arch/powerpc/crypto/crct10dif-vpmsum_asm.S
Normal file
850
arch/powerpc/crypto/crct10dif-vpmsum_asm.S
Normal file
@@ -0,0 +1,850 @@
|
||||
/*
|
||||
* Calculate a CRC T10DIF with vpmsum acceleration
|
||||
*
|
||||
* Constants generated by crc32-vpmsum, available at
|
||||
* https://github.com/antonblanchard/crc32-vpmsum
|
||||
*
|
||||
* crc32-vpmsum is
|
||||
* Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
|
||||
* and is available under the GPL v2 or later.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*/
|
||||
.section .rodata
|
||||
.balign 16
|
||||
|
||||
.byteswap_constant:
|
||||
/* byte reverse permute constant */
|
||||
.octa 0x0F0E0D0C0B0A09080706050403020100
|
||||
|
||||
.constants:
|
||||
|
||||
/* Reduce 262144 bits to 1024 bits */
|
||||
/* x^261184 mod p(x), x^261120 mod p(x) */
|
||||
.octa 0x0000000056d300000000000052550000
|
||||
|
||||
/* x^260160 mod p(x), x^260096 mod p(x) */
|
||||
.octa 0x00000000ee67000000000000a1e40000
|
||||
|
||||
/* x^259136 mod p(x), x^259072 mod p(x) */
|
||||
.octa 0x0000000060830000000000004ad10000
|
||||
|
||||
/* x^258112 mod p(x), x^258048 mod p(x) */
|
||||
.octa 0x000000008cfe0000000000009ab40000
|
||||
|
||||
/* x^257088 mod p(x), x^257024 mod p(x) */
|
||||
.octa 0x000000003e93000000000000fdb50000
|
||||
|
||||
/* x^256064 mod p(x), x^256000 mod p(x) */
|
||||
.octa 0x000000003c2000000000000045480000
|
||||
|
||||
/* x^255040 mod p(x), x^254976 mod p(x) */
|
||||
.octa 0x00000000b1fc0000000000008d690000
|
||||
|
||||
/* x^254016 mod p(x), x^253952 mod p(x) */
|
||||
.octa 0x00000000f82b00000000000024ad0000
|
||||
|
||||
/* x^252992 mod p(x), x^252928 mod p(x) */
|
||||
.octa 0x0000000044420000000000009f1a0000
|
||||
|
||||
/* x^251968 mod p(x), x^251904 mod p(x) */
|
||||
.octa 0x00000000e88c00000000000066ec0000
|
||||
|
||||
/* x^250944 mod p(x), x^250880 mod p(x) */
|
||||
.octa 0x00000000385c000000000000c87d0000
|
||||
|
||||
/* x^249920 mod p(x), x^249856 mod p(x) */
|
||||
.octa 0x000000003227000000000000c8ff0000
|
||||
|
||||
/* x^248896 mod p(x), x^248832 mod p(x) */
|
||||
.octa 0x00000000a9a900000000000033440000
|
||||
|
||||
/* x^247872 mod p(x), x^247808 mod p(x) */
|
||||
.octa 0x00000000abaa00000000000066eb0000
|
||||
|
||||
/* x^246848 mod p(x), x^246784 mod p(x) */
|
||||
.octa 0x000000001ac3000000000000c4ef0000
|
||||
|
||||
/* x^245824 mod p(x), x^245760 mod p(x) */
|
||||
.octa 0x0000000063f000000000000056f30000
|
||||
|
||||
/* x^244800 mod p(x), x^244736 mod p(x) */
|
||||
.octa 0x0000000032cc00000000000002050000
|
||||
|
||||
/* x^243776 mod p(x), x^243712 mod p(x) */
|
||||
.octa 0x00000000f8b5000000000000568e0000
|
||||
|
||||
/* x^242752 mod p(x), x^242688 mod p(x) */
|
||||
.octa 0x000000008db100000000000064290000
|
||||
|
||||
/* x^241728 mod p(x), x^241664 mod p(x) */
|
||||
.octa 0x0000000059ca0000000000006b660000
|
||||
|
||||
/* x^240704 mod p(x), x^240640 mod p(x) */
|
||||
.octa 0x000000005f5c00000000000018f80000
|
||||
|
||||
/* x^239680 mod p(x), x^239616 mod p(x) */
|
||||
.octa 0x0000000061af000000000000b6090000
|
||||
|
||||
/* x^238656 mod p(x), x^238592 mod p(x) */
|
||||
.octa 0x00000000e29e000000000000099a0000
|
||||
|
||||
/* x^237632 mod p(x), x^237568 mod p(x) */
|
||||
.octa 0x000000000975000000000000a8360000
|
||||
|
||||
/* x^236608 mod p(x), x^236544 mod p(x) */
|
||||
.octa 0x0000000043900000000000004f570000
|
||||
|
||||
/* x^235584 mod p(x), x^235520 mod p(x) */
|
||||
.octa 0x00000000f9cd000000000000134c0000
|
||||
|
||||
/* x^234560 mod p(x), x^234496 mod p(x) */
|
||||
.octa 0x000000007c29000000000000ec380000
|
||||
|
||||
/* x^233536 mod p(x), x^233472 mod p(x) */
|
||||
.octa 0x000000004c6a000000000000b0d10000
|
||||
|
||||
/* x^232512 mod p(x), x^232448 mod p(x) */
|
||||
.octa 0x00000000e7290000000000007d3e0000
|
||||
|
||||
/* x^231488 mod p(x), x^231424 mod p(x) */
|
||||
.octa 0x00000000f1ab000000000000f0b20000
|
||||
|
||||
/* x^230464 mod p(x), x^230400 mod p(x) */
|
||||
.octa 0x0000000039db0000000000009c270000
|
||||
|
||||
/* x^229440 mod p(x), x^229376 mod p(x) */
|
||||
.octa 0x000000005e2800000000000092890000
|
||||
|
||||
/* x^228416 mod p(x), x^228352 mod p(x) */
|
||||
.octa 0x00000000d44e000000000000d5ee0000
|
||||
|
||||
/* x^227392 mod p(x), x^227328 mod p(x) */
|
||||
.octa 0x00000000cd0a00000000000041f50000
|
||||
|
||||
/* x^226368 mod p(x), x^226304 mod p(x) */
|
||||
.octa 0x00000000c5b400000000000010520000
|
||||
|
||||
/* x^225344 mod p(x), x^225280 mod p(x) */
|
||||
.octa 0x00000000fd2100000000000042170000
|
||||
|
||||
/* x^224320 mod p(x), x^224256 mod p(x) */
|
||||
.octa 0x000000002f2500000000000095c20000
|
||||
|
||||
/* x^223296 mod p(x), x^223232 mod p(x) */
|
||||
.octa 0x000000001b0100000000000001ce0000
|
||||
|
||||
/* x^222272 mod p(x), x^222208 mod p(x) */
|
||||
.octa 0x000000000d430000000000002aca0000
|
||||
|
||||
/* x^221248 mod p(x), x^221184 mod p(x) */
|
||||
.octa 0x0000000030a6000000000000385e0000
|
||||
|
||||
/* x^220224 mod p(x), x^220160 mod p(x) */
|
||||
.octa 0x00000000e37b0000000000006f7a0000
|
||||
|
||||
/* x^219200 mod p(x), x^219136 mod p(x) */
|
||||
.octa 0x00000000873600000000000024320000
|
||||
|
||||
/* x^218176 mod p(x), x^218112 mod p(x) */
|
||||
.octa 0x00000000e9fb000000000000bd9c0000
|
||||
|
||||
/* x^217152 mod p(x), x^217088 mod p(x) */
|
||||
.octa 0x000000003b9500000000000054bc0000
|
||||
|
||||
/* x^216128 mod p(x), x^216064 mod p(x) */
|
||||
.octa 0x00000000133e000000000000a4660000
|
||||
|
||||
/* x^215104 mod p(x), x^215040 mod p(x) */
|
||||
.octa 0x00000000784500000000000079930000
|
||||
|
||||
/* x^214080 mod p(x), x^214016 mod p(x) */
|
||||
.octa 0x00000000b9800000000000001bb80000
|
||||
|
||||
/* x^213056 mod p(x), x^212992 mod p(x) */
|
||||
.octa 0x00000000687600000000000024400000
|
||||
|
||||
/* x^212032 mod p(x), x^211968 mod p(x) */
|
||||
.octa 0x00000000aff300000000000029e10000
|
||||
|
||||
/* x^211008 mod p(x), x^210944 mod p(x) */
|
||||
.octa 0x0000000024b50000000000005ded0000
|
||||
|
||||
/* x^209984 mod p(x), x^209920 mod p(x) */
|
||||
.octa 0x0000000017e8000000000000b12e0000
|
||||
|
||||
/* x^208960 mod p(x), x^208896 mod p(x) */
|
||||
.octa 0x00000000128400000000000026d20000
|
||||
|
||||
/* x^207936 mod p(x), x^207872 mod p(x) */
|
||||
.octa 0x000000002115000000000000a32a0000
|
||||
|
||||
/* x^206912 mod p(x), x^206848 mod p(x) */
|
||||
.octa 0x000000009595000000000000a1210000
|
||||
|
||||
/* x^205888 mod p(x), x^205824 mod p(x) */
|
||||
.octa 0x00000000281e000000000000ee8b0000
|
||||
|
||||
/* x^204864 mod p(x), x^204800 mod p(x) */
|
||||
.octa 0x0000000006010000000000003d0d0000
|
||||
|
||||
/* x^203840 mod p(x), x^203776 mod p(x) */
|
||||
.octa 0x00000000e2b600000000000034e90000
|
||||
|
||||
/* x^202816 mod p(x), x^202752 mod p(x) */
|
||||
.octa 0x000000001bd40000000000004cdb0000
|
||||
|
||||
/* x^201792 mod p(x), x^201728 mod p(x) */
|
||||
.octa 0x00000000df2800000000000030e90000
|
||||
|
||||
/* x^200768 mod p(x), x^200704 mod p(x) */
|
||||
.octa 0x0000000049c200000000000042590000
|
||||
|
||||
/* x^199744 mod p(x), x^199680 mod p(x) */
|
||||
.octa 0x000000009b97000000000000df950000
|
||||
|
||||
/* x^198720 mod p(x), x^198656 mod p(x) */
|
||||
.octa 0x000000006184000000000000da7b0000
|
||||
|
||||
/* x^197696 mod p(x), x^197632 mod p(x) */
|
||||
.octa 0x00000000461700000000000012510000
|
||||
|
||||
/* x^196672 mod p(x), x^196608 mod p(x) */
|
||||
.octa 0x000000009b40000000000000f37e0000
|
||||
|
||||
/* x^195648 mod p(x), x^195584 mod p(x) */
|
||||
.octa 0x00000000eeb2000000000000ecf10000
|
||||
|
||||
/* x^194624 mod p(x), x^194560 mod p(x) */
|
||||
.octa 0x00000000b2e800000000000050f20000
|
||||
|
||||
/* x^193600 mod p(x), x^193536 mod p(x) */
|
||||
.octa 0x00000000f59a000000000000e0b30000
|
||||
|
||||
/* x^192576 mod p(x), x^192512 mod p(x) */
|
||||
.octa 0x00000000467f0000000000004d5a0000
|
||||
|
||||
/* x^191552 mod p(x), x^191488 mod p(x) */
|
||||
.octa 0x00000000da92000000000000bb010000
|
||||
|
||||
/* x^190528 mod p(x), x^190464 mod p(x) */
|
||||
.octa 0x000000001e1000000000000022a40000
|
||||
|
||||
/* x^189504 mod p(x), x^189440 mod p(x) */
|
||||
.octa 0x0000000058fe000000000000836f0000
|
||||
|
||||
/* x^188480 mod p(x), x^188416 mod p(x) */
|
||||
.octa 0x00000000b9ce000000000000d78d0000
|
||||
|
||||
/* x^187456 mod p(x), x^187392 mod p(x) */
|
||||
.octa 0x0000000022210000000000004f8d0000
|
||||
|
||||
/* x^186432 mod p(x), x^186368 mod p(x) */
|
||||
.octa 0x00000000744600000000000033760000
|
||||
|
||||
/* x^185408 mod p(x), x^185344 mod p(x) */
|
||||
.octa 0x000000001c2e000000000000a1e50000
|
||||
|
||||
/* x^184384 mod p(x), x^184320 mod p(x) */
|
||||
.octa 0x00000000dcc8000000000000a1a40000
|
||||
|
||||
/* x^183360 mod p(x), x^183296 mod p(x) */
|
||||
.octa 0x00000000910f00000000000019a20000
|
||||
|
||||
/* x^182336 mod p(x), x^182272 mod p(x) */
|
||||
.octa 0x0000000055d5000000000000f6ae0000
|
||||
|
||||
/* x^181312 mod p(x), x^181248 mod p(x) */
|
||||
.octa 0x00000000c8ba000000000000a7ac0000
|
||||
|
||||
/* x^180288 mod p(x), x^180224 mod p(x) */
|
||||
.octa 0x0000000031f8000000000000eea20000
|
||||
|
||||
/* x^179264 mod p(x), x^179200 mod p(x) */
|
||||
.octa 0x000000001966000000000000c4d90000
|
||||
|
||||
/* x^178240 mod p(x), x^178176 mod p(x) */
|
||||
.octa 0x00000000b9810000000000002b470000
|
||||
|
||||
/* x^177216 mod p(x), x^177152 mod p(x) */
|
||||
.octa 0x000000008303000000000000f7cf0000
|
||||
|
||||
/* x^176192 mod p(x), x^176128 mod p(x) */
|
||||
.octa 0x000000002ce500000000000035b30000
|
||||
|
||||
/* x^175168 mod p(x), x^175104 mod p(x) */
|
||||
.octa 0x000000002fae0000000000000c7c0000
|
||||
|
||||
/* x^174144 mod p(x), x^174080 mod p(x) */
|
||||
.octa 0x00000000f50c0000000000009edf0000
|
||||
|
||||
/* x^173120 mod p(x), x^173056 mod p(x) */
|
||||
.octa 0x00000000714f00000000000004cd0000
|
||||
|
||||
/* x^172096 mod p(x), x^172032 mod p(x) */
|
||||
.octa 0x00000000c161000000000000541b0000
|
||||
|
||||
/* x^171072 mod p(x), x^171008 mod p(x) */
|
||||
.octa 0x0000000021c8000000000000e2700000
|
||||
|
||||
/* x^170048 mod p(x), x^169984 mod p(x) */
|
||||
.octa 0x00000000b93d00000000000009a60000
|
||||
|
||||
/* x^169024 mod p(x), x^168960 mod p(x) */
|
||||
.octa 0x00000000fbcf000000000000761c0000
|
||||
|
||||
/* x^168000 mod p(x), x^167936 mod p(x) */
|
||||
.octa 0x0000000026350000000000009db30000
|
||||
|
||||
/* x^166976 mod p(x), x^166912 mod p(x) */
|
||||
.octa 0x00000000b64f0000000000003e9f0000
|
||||
|
||||
/* x^165952 mod p(x), x^165888 mod p(x) */
|
||||
.octa 0x00000000bd0e00000000000078590000
|
||||
|
||||
/* x^164928 mod p(x), x^164864 mod p(x) */
|
||||
.octa 0x00000000d9360000000000008bc80000
|
||||
|
||||
/* x^163904 mod p(x), x^163840 mod p(x) */
|
||||
.octa 0x000000002f140000000000008c9f0000
|
||||
|
||||
/* x^162880 mod p(x), x^162816 mod p(x) */
|
||||
.octa 0x000000006a270000000000006af70000
|
||||
|
||||
/* x^161856 mod p(x), x^161792 mod p(x) */
|
||||
.octa 0x000000006685000000000000e5210000
|
||||
|
||||
/* x^160832 mod p(x), x^160768 mod p(x) */
|
||||
.octa 0x0000000062da00000000000008290000
|
||||
|
||||
/* x^159808 mod p(x), x^159744 mod p(x) */
|
||||
.octa 0x00000000bb4b000000000000e4d00000
|
||||
|
||||
/* x^158784 mod p(x), x^158720 mod p(x) */
|
||||
.octa 0x00000000d2490000000000004ae10000
|
||||
|
||||
/* x^157760 mod p(x), x^157696 mod p(x) */
|
||||
.octa 0x00000000c85b00000000000000e70000
|
||||
|
||||
/* x^156736 mod p(x), x^156672 mod p(x) */
|
||||
.octa 0x00000000c37a00000000000015650000
|
||||
|
||||
/* x^155712 mod p(x), x^155648 mod p(x) */
|
||||
.octa 0x0000000018530000000000001c2f0000
|
||||
|
||||
/* x^154688 mod p(x), x^154624 mod p(x) */
|
||||
.octa 0x00000000b46600000000000037bd0000
|
||||
|
||||
/* x^153664 mod p(x), x^153600 mod p(x) */
|
||||
.octa 0x00000000439b00000000000012190000
|
||||
|
||||
/* x^152640 mod p(x), x^152576 mod p(x) */
|
||||
.octa 0x00000000b1260000000000005ece0000
|
||||
|
||||
/* x^151616 mod p(x), x^151552 mod p(x) */
|
||||
.octa 0x00000000d8110000000000002a5e0000
|
||||
|
||||
/* x^150592 mod p(x), x^150528 mod p(x) */
|
||||
.octa 0x00000000099f00000000000052330000
|
||||
|
||||
/* x^149568 mod p(x), x^149504 mod p(x) */
|
||||
.octa 0x00000000f9f9000000000000f9120000
|
||||
|
||||
/* x^148544 mod p(x), x^148480 mod p(x) */
|
||||
.octa 0x000000005cc00000000000000ddc0000
|
||||
|
||||
/* x^147520 mod p(x), x^147456 mod p(x) */
|
||||
.octa 0x00000000343b00000000000012200000
|
||||
|
||||
/* x^146496 mod p(x), x^146432 mod p(x) */
|
||||
.octa 0x000000009222000000000000d12b0000
|
||||
|
||||
/* x^145472 mod p(x), x^145408 mod p(x) */
|
||||
.octa 0x00000000d781000000000000eb2d0000
|
||||
|
||||
/* x^144448 mod p(x), x^144384 mod p(x) */
|
||||
.octa 0x000000000bf400000000000058970000
|
||||
|
||||
/* x^143424 mod p(x), x^143360 mod p(x) */
|
||||
.octa 0x00000000094200000000000013690000
|
||||
|
||||
/* x^142400 mod p(x), x^142336 mod p(x) */
|
||||
.octa 0x00000000d55100000000000051950000
|
||||
|
||||
/* x^141376 mod p(x), x^141312 mod p(x) */
|
||||
.octa 0x000000008f11000000000000954b0000
|
||||
|
||||
/* x^140352 mod p(x), x^140288 mod p(x) */
|
||||
.octa 0x00000000140f000000000000b29e0000
|
||||
|
||||
/* x^139328 mod p(x), x^139264 mod p(x) */
|
||||
.octa 0x00000000c6db000000000000db5d0000
|
||||
|
||||
/* x^138304 mod p(x), x^138240 mod p(x) */
|
||||
.octa 0x00000000715b000000000000dfaf0000
|
||||
|
||||
/* x^137280 mod p(x), x^137216 mod p(x) */
|
||||
.octa 0x000000000dea000000000000e3b60000
|
||||
|
||||
/* x^136256 mod p(x), x^136192 mod p(x) */
|
||||
.octa 0x000000006f94000000000000ddaf0000
|
||||
|
||||
/* x^135232 mod p(x), x^135168 mod p(x) */
|
||||
.octa 0x0000000024e1000000000000e4f70000
|
||||
|
||||
/* x^134208 mod p(x), x^134144 mod p(x) */
|
||||
.octa 0x000000008810000000000000aa110000
|
||||
|
||||
/* x^133184 mod p(x), x^133120 mod p(x) */
|
||||
.octa 0x0000000030c2000000000000a8e60000
|
||||
|
||||
/* x^132160 mod p(x), x^132096 mod p(x) */
|
||||
.octa 0x00000000e6d0000000000000ccf30000
|
||||
|
||||
/* x^131136 mod p(x), x^131072 mod p(x) */
|
||||
.octa 0x000000004da000000000000079bf0000
|
||||
|
||||
/* x^130112 mod p(x), x^130048 mod p(x) */
|
||||
.octa 0x000000007759000000000000b3a30000
|
||||
|
||||
/* x^129088 mod p(x), x^129024 mod p(x) */
|
||||
.octa 0x00000000597400000000000028790000
|
||||
|
||||
/* x^128064 mod p(x), x^128000 mod p(x) */
|
||||
.octa 0x000000007acd000000000000b5820000
|
||||
|
||||
/* x^127040 mod p(x), x^126976 mod p(x) */
|
||||
.octa 0x00000000e6e400000000000026ad0000
|
||||
|
||||
/* x^126016 mod p(x), x^125952 mod p(x) */
|
||||
.octa 0x000000006d49000000000000985b0000
|
||||
|
||||
/* x^124992 mod p(x), x^124928 mod p(x) */
|
||||
.octa 0x000000000f0800000000000011520000
|
||||
|
||||
/* x^123968 mod p(x), x^123904 mod p(x) */
|
||||
.octa 0x000000002c7f000000000000846c0000
|
||||
|
||||
/* x^122944 mod p(x), x^122880 mod p(x) */
|
||||
.octa 0x000000005ce7000000000000ae1d0000
|
||||
|
||||
/* x^121920 mod p(x), x^121856 mod p(x) */
|
||||
.octa 0x00000000d4cb000000000000e21d0000
|
||||
|
||||
/* x^120896 mod p(x), x^120832 mod p(x) */
|
||||
.octa 0x000000003a2300000000000019bb0000
|
||||
|
||||
/* x^119872 mod p(x), x^119808 mod p(x) */
|
||||
.octa 0x000000000e1700000000000095290000
|
||||
|
||||
/* x^118848 mod p(x), x^118784 mod p(x) */
|
||||
.octa 0x000000006e6400000000000050d20000
|
||||
|
||||
/* x^117824 mod p(x), x^117760 mod p(x) */
|
||||
.octa 0x000000008d5c0000000000000cd10000
|
||||
|
||||
/* x^116800 mod p(x), x^116736 mod p(x) */
|
||||
.octa 0x00000000ef310000000000007b570000
|
||||
|
||||
/* x^115776 mod p(x), x^115712 mod p(x) */
|
||||
.octa 0x00000000645d00000000000053d60000
|
||||
|
||||
/* x^114752 mod p(x), x^114688 mod p(x) */
|
||||
.octa 0x0000000018fc00000000000077510000
|
||||
|
||||
/* x^113728 mod p(x), x^113664 mod p(x) */
|
||||
.octa 0x000000000cb3000000000000a7b70000
|
||||
|
||||
/* x^112704 mod p(x), x^112640 mod p(x) */
|
||||
.octa 0x00000000991b000000000000d0780000
|
||||
|
||||
/* x^111680 mod p(x), x^111616 mod p(x) */
|
||||
.octa 0x00000000845a000000000000be3c0000
|
||||
|
||||
/* x^110656 mod p(x), x^110592 mod p(x) */
|
||||
.octa 0x00000000d3a9000000000000df020000
|
||||
|
||||
/* x^109632 mod p(x), x^109568 mod p(x) */
|
||||
.octa 0x0000000017d7000000000000063e0000
|
||||
|
||||
/* x^108608 mod p(x), x^108544 mod p(x) */
|
||||
.octa 0x000000007a860000000000008ab40000
|
||||
|
||||
/* x^107584 mod p(x), x^107520 mod p(x) */
|
||||
.octa 0x00000000fd7c000000000000c7bd0000
|
||||
|
||||
/* x^106560 mod p(x), x^106496 mod p(x) */
|
||||
.octa 0x00000000a56b000000000000efd60000
|
||||
|
||||
/* x^105536 mod p(x), x^105472 mod p(x) */
|
||||
.octa 0x0000000010e400000000000071380000
|
||||
|
||||
/* x^104512 mod p(x), x^104448 mod p(x) */
|
||||
.octa 0x00000000994500000000000004d30000
|
||||
|
||||
/* x^103488 mod p(x), x^103424 mod p(x) */
|
||||
.octa 0x00000000b83c0000000000003b0e0000
|
||||
|
||||
/* x^102464 mod p(x), x^102400 mod p(x) */
|
||||
.octa 0x00000000d6c10000000000008b020000
|
||||
|
||||
/* x^101440 mod p(x), x^101376 mod p(x) */
|
||||
.octa 0x000000009efc000000000000da940000
|
||||
|
||||
/* x^100416 mod p(x), x^100352 mod p(x) */
|
||||
.octa 0x000000005e87000000000000f9f70000
|
||||
|
||||
/* x^99392 mod p(x), x^99328 mod p(x) */
|
||||
.octa 0x000000006c9b00000000000045e40000
|
||||
|
||||
/* x^98368 mod p(x), x^98304 mod p(x) */
|
||||
.octa 0x00000000178a00000000000083940000
|
||||
|
||||
/* x^97344 mod p(x), x^97280 mod p(x) */
|
||||
.octa 0x00000000f0c8000000000000f0a00000
|
||||
|
||||
/* x^96320 mod p(x), x^96256 mod p(x) */
|
||||
.octa 0x00000000f699000000000000b74b0000
|
||||
|
||||
/* x^95296 mod p(x), x^95232 mod p(x) */
|
||||
.octa 0x00000000316d000000000000c1cf0000
|
||||
|
||||
/* x^94272 mod p(x), x^94208 mod p(x) */
|
||||
.octa 0x00000000987e00000000000072680000
|
||||
|
||||
/* x^93248 mod p(x), x^93184 mod p(x) */
|
||||
.octa 0x00000000acff000000000000e0ab0000
|
||||
|
||||
/* x^92224 mod p(x), x^92160 mod p(x) */
|
||||
.octa 0x00000000a1f6000000000000c5a80000
|
||||
|
||||
/* x^91200 mod p(x), x^91136 mod p(x) */
|
||||
.octa 0x0000000061bd000000000000cf690000
|
||||
|
||||
/* x^90176 mod p(x), x^90112 mod p(x) */
|
||||
.octa 0x00000000c9f2000000000000cbcc0000
|
||||
|
||||
/* x^89152 mod p(x), x^89088 mod p(x) */
|
||||
.octa 0x000000005a33000000000000de050000
|
||||
|
||||
/* x^88128 mod p(x), x^88064 mod p(x) */
|
||||
.octa 0x00000000e416000000000000ccd70000
|
||||
|
||||
/* x^87104 mod p(x), x^87040 mod p(x) */
|
||||
.octa 0x0000000058930000000000002f670000
|
||||
|
||||
/* x^86080 mod p(x), x^86016 mod p(x) */
|
||||
.octa 0x00000000a9d3000000000000152f0000
|
||||
|
||||
/* x^85056 mod p(x), x^84992 mod p(x) */
|
||||
.octa 0x00000000c114000000000000ecc20000
|
||||
|
||||
/* x^84032 mod p(x), x^83968 mod p(x) */
|
||||
.octa 0x00000000b9270000000000007c890000
|
||||
|
||||
/* x^83008 mod p(x), x^82944 mod p(x) */
|
||||
.octa 0x000000002e6000000000000006ee0000
|
||||
|
||||
/* x^81984 mod p(x), x^81920 mod p(x) */
|
||||
.octa 0x00000000dfc600000000000009100000
|
||||
|
||||
/* x^80960 mod p(x), x^80896 mod p(x) */
|
||||
.octa 0x000000004911000000000000ad4e0000
|
||||
|
||||
/* x^79936 mod p(x), x^79872 mod p(x) */
|
||||
.octa 0x00000000ae1b000000000000b04d0000
|
||||
|
||||
/* x^78912 mod p(x), x^78848 mod p(x) */
|
||||
.octa 0x0000000005fa000000000000e9900000
|
||||
|
||||
/* x^77888 mod p(x), x^77824 mod p(x) */
|
||||
.octa 0x0000000004a1000000000000cc6f0000
|
||||
|
||||
/* x^76864 mod p(x), x^76800 mod p(x) */
|
||||
.octa 0x00000000af73000000000000ed110000
|
||||
|
||||
/* x^75840 mod p(x), x^75776 mod p(x) */
|
||||
.octa 0x0000000082530000000000008f7e0000
|
||||
|
||||
/* x^74816 mod p(x), x^74752 mod p(x) */
|
||||
.octa 0x00000000cfdc000000000000594f0000
|
||||
|
||||
/* x^73792 mod p(x), x^73728 mod p(x) */
|
||||
.octa 0x00000000a6b6000000000000a8750000
|
||||
|
||||
/* x^72768 mod p(x), x^72704 mod p(x) */
|
||||
.octa 0x00000000fd76000000000000aa0c0000
|
||||
|
||||
/* x^71744 mod p(x), x^71680 mod p(x) */
|
||||
.octa 0x0000000006f500000000000071db0000
|
||||
|
||||
/* x^70720 mod p(x), x^70656 mod p(x) */
|
||||
.octa 0x0000000037ca000000000000ab0c0000
|
||||
|
||||
/* x^69696 mod p(x), x^69632 mod p(x) */
|
||||
.octa 0x00000000d7ab000000000000b7a00000
|
||||
|
||||
/* x^68672 mod p(x), x^68608 mod p(x) */
|
||||
.octa 0x00000000440800000000000090d30000
|
||||
|
||||
/* x^67648 mod p(x), x^67584 mod p(x) */
|
||||
.octa 0x00000000186100000000000054730000
|
||||
|
||||
/* x^66624 mod p(x), x^66560 mod p(x) */
|
||||
.octa 0x000000007368000000000000a3a20000
|
||||
|
||||
/* x^65600 mod p(x), x^65536 mod p(x) */
|
||||
.octa 0x0000000026d0000000000000f9040000
|
||||
|
||||
/* x^64576 mod p(x), x^64512 mod p(x) */
|
||||
.octa 0x00000000fe770000000000009c0a0000
|
||||
|
||||
/* x^63552 mod p(x), x^63488 mod p(x) */
|
||||
.octa 0x000000002cba000000000000d1e70000
|
||||
|
||||
/* x^62528 mod p(x), x^62464 mod p(x) */
|
||||
.octa 0x00000000f8bd0000000000005ac10000
|
||||
|
||||
/* x^61504 mod p(x), x^61440 mod p(x) */
|
||||
.octa 0x000000007372000000000000d68d0000
|
||||
|
||||
/* x^60480 mod p(x), x^60416 mod p(x) */
|
||||
.octa 0x00000000f37f00000000000089f60000
|
||||
|
||||
/* x^59456 mod p(x), x^59392 mod p(x) */
|
||||
.octa 0x00000000078400000000000008a90000
|
||||
|
||||
/* x^58432 mod p(x), x^58368 mod p(x) */
|
||||
.octa 0x00000000d3e400000000000042360000
|
||||
|
||||
/* x^57408 mod p(x), x^57344 mod p(x) */
|
||||
.octa 0x00000000eba800000000000092d50000
|
||||
|
||||
/* x^56384 mod p(x), x^56320 mod p(x) */
|
||||
.octa 0x00000000afbe000000000000b4d50000
|
||||
|
||||
/* x^55360 mod p(x), x^55296 mod p(x) */
|
||||
.octa 0x00000000d8ca000000000000c9060000
|
||||
|
||||
/* x^54336 mod p(x), x^54272 mod p(x) */
|
||||
.octa 0x00000000c2d00000000000008f4f0000
|
||||
|
||||
/* x^53312 mod p(x), x^53248 mod p(x) */
|
||||
.octa 0x00000000373200000000000028690000
|
||||
|
||||
/* x^52288 mod p(x), x^52224 mod p(x) */
|
||||
.octa 0x0000000046ae000000000000c3b30000
|
||||
|
||||
/* x^51264 mod p(x), x^51200 mod p(x) */
|
||||
.octa 0x00000000b243000000000000f8700000
|
||||
|
||||
/* x^50240 mod p(x), x^50176 mod p(x) */
|
||||
.octa 0x00000000f7f500000000000029eb0000
|
||||
|
||||
/* x^49216 mod p(x), x^49152 mod p(x) */
|
||||
.octa 0x000000000c7e000000000000fe730000
|
||||
|
||||
/* x^48192 mod p(x), x^48128 mod p(x) */
|
||||
.octa 0x00000000c38200000000000096000000
|
||||
|
||||
/* x^47168 mod p(x), x^47104 mod p(x) */
|
||||
.octa 0x000000008956000000000000683c0000
|
||||
|
||||
/* x^46144 mod p(x), x^46080 mod p(x) */
|
||||
.octa 0x00000000422d0000000000005f1e0000
|
||||
|
||||
/* x^45120 mod p(x), x^45056 mod p(x) */
|
||||
.octa 0x00000000ac0f0000000000006f810000
|
||||
|
||||
/* x^44096 mod p(x), x^44032 mod p(x) */
|
||||
.octa 0x00000000ce30000000000000031f0000
|
||||
|
||||
/* x^43072 mod p(x), x^43008 mod p(x) */
|
||||
.octa 0x000000003d43000000000000455a0000
|
||||
|
||||
/* x^42048 mod p(x), x^41984 mod p(x) */
|
||||
.octa 0x000000007ebe000000000000a6050000
|
||||
|
||||
/* x^41024 mod p(x), x^40960 mod p(x) */
|
||||
.octa 0x00000000976e00000000000077eb0000
|
||||
|
||||
/* x^40000 mod p(x), x^39936 mod p(x) */
|
||||
.octa 0x000000000872000000000000389c0000
|
||||
|
||||
/* x^38976 mod p(x), x^38912 mod p(x) */
|
||||
.octa 0x000000008979000000000000c7b20000
|
||||
|
||||
/* x^37952 mod p(x), x^37888 mod p(x) */
|
||||
.octa 0x000000005c1e0000000000001d870000
|
||||
|
||||
/* x^36928 mod p(x), x^36864 mod p(x) */
|
||||
.octa 0x00000000aebb00000000000045810000
|
||||
|
||||
/* x^35904 mod p(x), x^35840 mod p(x) */
|
||||
.octa 0x000000004f7e0000000000006d4a0000
|
||||
|
||||
/* x^34880 mod p(x), x^34816 mod p(x) */
|
||||
.octa 0x00000000ea98000000000000b9200000
|
||||
|
||||
/* x^33856 mod p(x), x^33792 mod p(x) */
|
||||
.octa 0x00000000f39600000000000022f20000
|
||||
|
||||
/* x^32832 mod p(x), x^32768 mod p(x) */
|
||||
.octa 0x000000000bc500000000000041ca0000
|
||||
|
||||
/* x^31808 mod p(x), x^31744 mod p(x) */
|
||||
.octa 0x00000000786400000000000078500000
|
||||
|
||||
/* x^30784 mod p(x), x^30720 mod p(x) */
|
||||
.octa 0x00000000be970000000000009e7e0000
|
||||
|
||||
/* x^29760 mod p(x), x^29696 mod p(x) */
|
||||
.octa 0x00000000dd6d000000000000a53c0000
|
||||
|
||||
/* x^28736 mod p(x), x^28672 mod p(x) */
|
||||
.octa 0x000000004c3f00000000000039340000
|
||||
|
||||
/* x^27712 mod p(x), x^27648 mod p(x) */
|
||||
.octa 0x0000000093a4000000000000b58e0000
|
||||
|
||||
/* x^26688 mod p(x), x^26624 mod p(x) */
|
||||
.octa 0x0000000050fb00000000000062d40000
|
||||
|
||||
/* x^25664 mod p(x), x^25600 mod p(x) */
|
||||
.octa 0x00000000f505000000000000a26f0000
|
||||
|
||||
/* x^24640 mod p(x), x^24576 mod p(x) */
|
||||
.octa 0x0000000064f900000000000065e60000
|
||||
|
||||
/* x^23616 mod p(x), x^23552 mod p(x) */
|
||||
.octa 0x00000000e8c2000000000000aad90000
|
||||
|
||||
/* x^22592 mod p(x), x^22528 mod p(x) */
|
||||
.octa 0x00000000720b000000000000a3b00000
|
||||
|
||||
/* x^21568 mod p(x), x^21504 mod p(x) */
|
||||
.octa 0x00000000e992000000000000d2680000
|
||||
|
||||
/* x^20544 mod p(x), x^20480 mod p(x) */
|
||||
.octa 0x000000009132000000000000cf4c0000
|
||||
|
||||
/* x^19520 mod p(x), x^19456 mod p(x) */
|
||||
.octa 0x00000000608a00000000000076610000
|
||||
|
||||
/* x^18496 mod p(x), x^18432 mod p(x) */
|
||||
.octa 0x000000009948000000000000fb9f0000
|
||||
|
||||
/* x^17472 mod p(x), x^17408 mod p(x) */
|
||||
.octa 0x00000000173000000000000003770000
|
||||
|
||||
/* x^16448 mod p(x), x^16384 mod p(x) */
|
||||
.octa 0x000000006fe300000000000004880000
|
||||
|
||||
/* x^15424 mod p(x), x^15360 mod p(x) */
|
||||
.octa 0x00000000e15300000000000056a70000
|
||||
|
||||
/* x^14400 mod p(x), x^14336 mod p(x) */
|
||||
.octa 0x0000000092d60000000000009dfd0000
|
||||
|
||||
/* x^13376 mod p(x), x^13312 mod p(x) */
|
||||
.octa 0x0000000002fd00000000000074c80000
|
||||
|
||||
/* x^12352 mod p(x), x^12288 mod p(x) */
|
||||
.octa 0x00000000c78b000000000000a3ec0000
|
||||
|
||||
/* x^11328 mod p(x), x^11264 mod p(x) */
|
||||
.octa 0x000000009262000000000000b3530000
|
||||
|
||||
/* x^10304 mod p(x), x^10240 mod p(x) */
|
||||
.octa 0x0000000084f200000000000047bf0000
|
||||
|
||||
/* x^9280 mod p(x), x^9216 mod p(x) */
|
||||
.octa 0x0000000067ee000000000000e97c0000
|
||||
|
||||
/* x^8256 mod p(x), x^8192 mod p(x) */
|
||||
.octa 0x00000000535b00000000000091e10000
|
||||
|
||||
/* x^7232 mod p(x), x^7168 mod p(x) */
|
||||
.octa 0x000000007ebb00000000000055060000
|
||||
|
||||
/* x^6208 mod p(x), x^6144 mod p(x) */
|
||||
.octa 0x00000000c6a1000000000000fd360000
|
||||
|
||||
/* x^5184 mod p(x), x^5120 mod p(x) */
|
||||
.octa 0x000000001be500000000000055860000
|
||||
|
||||
/* x^4160 mod p(x), x^4096 mod p(x) */
|
||||
.octa 0x00000000ae0e0000000000005bd00000
|
||||
|
||||
/* x^3136 mod p(x), x^3072 mod p(x) */
|
||||
.octa 0x0000000022040000000000008db20000
|
||||
|
||||
/* x^2112 mod p(x), x^2048 mod p(x) */
|
||||
.octa 0x00000000c9eb000000000000efe20000
|
||||
|
||||
/* x^1088 mod p(x), x^1024 mod p(x) */
|
||||
.octa 0x0000000039b400000000000051d10000
|
||||
|
||||
.short_constants:
|
||||
|
||||
/* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
|
||||
/* x^2048 mod p(x), x^2016 mod p(x), x^1984 mod p(x), x^1952 mod p(x) */
|
||||
.octa 0xefe20000dccf00009440000033590000
|
||||
|
||||
/* x^1920 mod p(x), x^1888 mod p(x), x^1856 mod p(x), x^1824 mod p(x) */
|
||||
.octa 0xee6300002f3f000062180000e0ed0000
|
||||
|
||||
/* x^1792 mod p(x), x^1760 mod p(x), x^1728 mod p(x), x^1696 mod p(x) */
|
||||
.octa 0xcf5f000017ef0000ccbe000023d30000
|
||||
|
||||
/* x^1664 mod p(x), x^1632 mod p(x), x^1600 mod p(x), x^1568 mod p(x) */
|
||||
.octa 0x6d0c0000a30e00000920000042630000
|
||||
|
||||
/* x^1536 mod p(x), x^1504 mod p(x), x^1472 mod p(x), x^1440 mod p(x) */
|
||||
.octa 0x21d30000932b0000a7a00000efcc0000
|
||||
|
||||
/* x^1408 mod p(x), x^1376 mod p(x), x^1344 mod p(x), x^1312 mod p(x) */
|
||||
.octa 0x10be00000b310000666f00000d1c0000
|
||||
|
||||
/* x^1280 mod p(x), x^1248 mod p(x), x^1216 mod p(x), x^1184 mod p(x) */
|
||||
.octa 0x1f240000ce9e0000caad0000589e0000
|
||||
|
||||
/* x^1152 mod p(x), x^1120 mod p(x), x^1088 mod p(x), x^1056 mod p(x) */
|
||||
.octa 0x29610000d02b000039b400007cf50000
|
||||
|
||||
/* x^1024 mod p(x), x^992 mod p(x), x^960 mod p(x), x^928 mod p(x) */
|
||||
.octa 0x51d100009d9d00003c0e0000bfd60000
|
||||
|
||||
/* x^896 mod p(x), x^864 mod p(x), x^832 mod p(x), x^800 mod p(x) */
|
||||
.octa 0xda390000ceae000013830000713c0000
|
||||
|
||||
/* x^768 mod p(x), x^736 mod p(x), x^704 mod p(x), x^672 mod p(x) */
|
||||
.octa 0xb67800001e16000085c0000080a60000
|
||||
|
||||
/* x^640 mod p(x), x^608 mod p(x), x^576 mod p(x), x^544 mod p(x) */
|
||||
.octa 0x0db40000f7f90000371d0000e6580000
|
||||
|
||||
/* x^512 mod p(x), x^480 mod p(x), x^448 mod p(x), x^416 mod p(x) */
|
||||
.octa 0x87e70000044c0000aadb0000a4970000
|
||||
|
||||
/* x^384 mod p(x), x^352 mod p(x), x^320 mod p(x), x^288 mod p(x) */
|
||||
.octa 0x1f990000ad180000d8b30000e7b50000
|
||||
|
||||
/* x^256 mod p(x), x^224 mod p(x), x^192 mod p(x), x^160 mod p(x) */
|
||||
.octa 0xbe6c00006ee300004c1a000006df0000
|
||||
|
||||
/* x^128 mod p(x), x^96 mod p(x), x^64 mod p(x), x^32 mod p(x) */
|
||||
.octa 0xfb0b00002d560000136800008bb70000
|
||||
|
||||
|
||||
.barrett_constants:
|
||||
/* Barrett constant m = (4^32)/n */
|
||||
.octa 0x000000000000000000000001f65a57f8 /* x^64 div p(x) */
|
||||
/* Barrett constant n */
|
||||
.octa 0x0000000000000000000000018bb70000
|
||||
|
||||
#define CRC_FUNCTION_NAME __crct10dif_vpmsum
|
||||
#include "crc32-vpmsum_core.S"
|
128
arch/powerpc/crypto/crct10dif-vpmsum_glue.c
Normal file
128
arch/powerpc/crypto/crct10dif-vpmsum_glue.c
Normal file
@@ -0,0 +1,128 @@
|
||||
/*
|
||||
* Calculate a CRC T10-DIF with vpmsum acceleration
|
||||
*
|
||||
* Copyright 2017, Daniel Axtens, IBM Corporation.
|
||||
* [based on crc32c-vpmsum_glue.c]
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version 2
|
||||
* of the License, or (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <linux/crc-t10dif.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/cpufeature.h>
|
||||
#include <asm/switch_to.h>
|
||||
|
||||
#define VMX_ALIGN 16
|
||||
#define VMX_ALIGN_MASK (VMX_ALIGN-1)
|
||||
|
||||
#define VECTOR_BREAKPOINT 64
|
||||
|
||||
u32 __crct10dif_vpmsum(u32 crc, unsigned char const *p, size_t len);
|
||||
|
||||
/*
 * crct10dif_vpmsum - CRC T10-DIF of a buffer, using the vector routine
 *                    for the aligned middle part
 * @crci: CRC value to continue from
 * @p:    start of the data
 * @len:  number of bytes at @p
 *
 * Buffers shorter than VECTOR_BREAKPOINT + VMX_ALIGN bytes, and every call
 * for which in_interrupt() is true, are handled entirely by
 * crc_t10dif_generic().  Otherwise the buffer is split in up to three
 * pieces: leading bytes up to the next VMX_ALIGN boundary and trailing
 * bytes beyond the last multiple of VMX_ALIGN go through
 * crc_t10dif_generic(); the part in between goes to __crct10dif_vpmsum().
 *
 * Returns the updated CRC, masked to 16 bits.
 */
static u16 crct10dif_vpmsum(u16 crci, unsigned char const *p, size_t len)
{
	unsigned int head;
	unsigned int rest;
	size_t bulk;
	u32 acc = crci;

	if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || in_interrupt())
		return crc_t10dif_generic(acc, p, len);

	/* Consume leading bytes until p sits on a VMX_ALIGN boundary. */
	head = (unsigned long)p & VMX_ALIGN_MASK;
	if (head) {
		head = VMX_ALIGN - head;
		acc = crc_t10dif_generic(acc, p, head);
		p += head;
		len -= head;
	}

	bulk = len & ~VMX_ALIGN_MASK;	/* whole VMX_ALIGN-sized chunks */
	rest = len & VMX_ALIGN_MASK;	/* bytes left after those chunks */

	if (bulk) {
		/*
		 * The CRC is moved into the upper half of the word for the
		 * vector routine and moved back down afterwards.
		 */
		acc <<= 16;

		/*
		 * Preemption and page faults stay disabled for as long as
		 * AltiVec is enabled for kernel use.
		 */
		preempt_disable();
		pagefault_disable();
		enable_kernel_altivec();
		acc = __crct10dif_vpmsum(acc, p, bulk);
		disable_kernel_altivec();
		pagefault_enable();
		preempt_enable();

		acc >>= 16;
	}

	if (rest)
		acc = crc_t10dif_generic(acc, p + bulk, rest);

	return acc & 0xffff;
}
|
||||
|
||||
static int crct10dif_vpmsum_init(struct shash_desc *desc)
|
||||
{
|
||||
u16 *crc = shash_desc_ctx(desc);
|
||||
|
||||
*crc = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int crct10dif_vpmsum_update(struct shash_desc *desc, const u8 *data,
|
||||
unsigned int length)
|
||||
{
|
||||
u16 *crc = shash_desc_ctx(desc);
|
||||
|
||||
*crc = crct10dif_vpmsum(*crc, data, length);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
 * shash .final hook: copy the running CRC from the descriptor context to
 * @out, stored as a native u16.  Always returns 0.
 */
static int crct10dif_vpmsum_final(struct shash_desc *desc, u8 *out)
{
	const u16 *state = shash_desc_ctx(desc);
	u16 *digest = (u16 *)out;

	*digest = *state;

	return 0;
}
|
||||
|
||||
static struct shash_alg alg = {
|
||||
.init = crct10dif_vpmsum_init,
|
||||
.update = crct10dif_vpmsum_update,
|
||||
.final = crct10dif_vpmsum_final,
|
||||
.descsize = CRC_T10DIF_DIGEST_SIZE,
|
||||
.digestsize = CRC_T10DIF_DIGEST_SIZE,
|
||||
.base = {
|
||||
.cra_name = "crct10dif",
|
||||
.cra_driver_name = "crct10dif-vpmsum",
|
||||
.cra_priority = 200,
|
||||
.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
|
||||
.cra_module = THIS_MODULE,
|
||||
}
|
||||
};
|
||||
|
||||
/*
 * Module init: register the shash when the CPU reports CPU_FTR_ARCH_207S
 * and return the registration result; otherwise return -ENODEV without
 * registering anything.
 */
static int __init crct10dif_vpmsum_mod_init(void)
{
	if (cpu_has_feature(CPU_FTR_ARCH_207S))
		return crypto_register_shash(&alg);

	return -ENODEV;
}
|
||||
|
||||
/* Module exit: unregister the shash that crct10dif_vpmsum_mod_init() registered. */
static void __exit crct10dif_vpmsum_mod_fini(void)
{
	crypto_unregister_shash(&alg);
}
|
||||
|
||||
module_cpu_feature_match(PPC_MODULE_FEATURE_VEC_CRYPTO, crct10dif_vpmsum_mod_init);
|
||||
module_exit(crct10dif_vpmsum_mod_fini);
|
||||
|
||||
MODULE_AUTHOR("Daniel Axtens <dja@axtens.net>");
|
||||
MODULE_DESCRIPTION("CRCT10DIF using vector polynomial multiply-sum instructions");
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_ALIAS_CRYPTO("crct10dif");
|
||||
MODULE_ALIAS_CRYPTO("crct10dif-vpmsum");
|
Reference in New Issue
Block a user