poly1305-armv4.pl 29 KB


  1. #!/usr/bin/env perl
  2. # SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
  3. #
  4. # ====================================================================
  5. # Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
  6. # project.
  7. # ====================================================================
  8. #
  9. # IALU(*)/gcc-4.4 NEON
  10. #
  11. # ARM11xx(ARMv6) 7.78/+100% -
  12. # Cortex-A5 6.35/+130% 3.00
  13. # Cortex-A8 6.25/+115% 2.36
  14. # Cortex-A9 5.10/+95% 2.55
  15. # Cortex-A15 3.85/+85% 1.25(**)
  16. # Snapdragon S4 5.70/+100% 1.48(**)
  17. #
  18. # (*) this is for -march=armv6, i.e. with a bunch of ldrb instructions loading data;
  19. # (**) these are trade-off results, they can be improved by ~8% but at
  20. # the cost of 15/12% regression on Cortex-A5/A7, it's even possible
  21. # to improve Cortex-A9 result, but then A5/A7 lose more than 20%;
  # Command-line handling and output redirection.
  #
  # Usage: poly1305-armv4.pl [flavour] [output]
  #
  # An argument that looks like a file name (it ends in ".<ext>") is taken
  # as the output file; a leading argument that does not is the flavour.
  # With a real flavour the generated text is piped through arm-xlate.pl,
  # otherwise (no flavour, or "void") it is written directly.  When no
  # output file is given STDOUT is left untouched, so the script can still
  # be run as "perl poly1305-armv4.pl > out.S".
  $flavour = shift;
  if (defined $flavour && $flavour =~ /\w[\w\-]*\.\w+$/) {
      # The first argument is already the output file: no flavour given.
      $output = $flavour;
      undef $flavour;
  } else {
      # Skip forward to the first argument that looks like a file name.
      while (($output = shift) && ($output !~ /\w[\w\-]*\.\w+$/)) {}
  }

  if ($flavour && $flavour ne "void") {
      # Look for the translator next to this script, then in perlasm/.
      # Take the capture from the match itself so a failed match (script
      # invoked without a directory part) cannot pick up a stale $1.
      my ($dir) = $0 =~ m/(.*[\/\\])[^\/\\]+$/;
      $dir = "" unless defined $dir;

      my $xlate;
      ( $xlate = "${dir}arm-xlate.pl" and -f $xlate ) or
      ( $xlate = "${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
          die "can't locate arm-xlate.pl";

      # Quote the output path so that names containing blanks survive the
      # shell, and omit it entirely when no output file was requested.
      my $cmd = "| \"$^X\" $xlate $flavour";
      $cmd .= " \"$output\"" if $output;
      open STDOUT, $cmd
          or die "can't call $xlate: $!";
  } elsif ($output) {
      # A failed open used to be ignored, which silently sent the assembly
      # to the inherited STDOUT instead of the requested file.
      open STDOUT, '>', $output
          or die "can't open $output: $!";
  }
  # Symbolic names for core registers r0-r3.  The assembly templates below
  # use them as the context pointer, the input (or key) pointer, the byte
  # count and the pad-bit flag, in that order.
  ($ctx, $inp, $len, $padbit) = qw(r0 r1 r2 r3);
  35. $code.=<<___;
  36. #ifndef __KERNEL__
  37. # include "arm_arch.h"
  38. #else
  39. # define __ARM_ARCH__ __LINUX_ARM_ARCH__
  40. # define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
  41. # define poly1305_init poly1305_init_arm
  42. # define poly1305_blocks poly1305_blocks_arm
  43. # define poly1305_emit poly1305_emit_arm
  44. .globl poly1305_blocks_neon
  45. #endif
  46. #if defined(__thumb2__)
  47. .syntax unified
  48. .thumb
  49. #else
  50. .code 32
  51. #endif
  52. .text
  53. .globl poly1305_emit
  54. .globl poly1305_blocks
  55. .globl poly1305_init
  56. .type poly1305_init,%function
  57. .align 5
  58. poly1305_init:
  59. .Lpoly1305_init:
  60. stmdb sp!,{r4-r11}
  61. eor r3,r3,r3
  62. cmp $inp,#0
  63. str r3,[$ctx,#0] @ zero hash value
  64. str r3,[$ctx,#4]
  65. str r3,[$ctx,#8]
  66. str r3,[$ctx,#12]
  67. str r3,[$ctx,#16]
  68. str r3,[$ctx,#36] @ clear is_base2_26
  69. add $ctx,$ctx,#20
  70. #ifdef __thumb2__
  71. it eq
  72. #endif
  73. moveq r0,#0
  74. beq .Lno_key
  75. #if __ARM_MAX_ARCH__>=7
  76. mov r3,#-1
  77. str r3,[$ctx,#28] @ impossible key power value
  78. # ifndef __KERNEL__
  79. adr r11,.Lpoly1305_init
  80. ldr r12,.LOPENSSL_armcap
  81. # endif
  82. #endif
  83. ldrb r4,[$inp,#0]
  84. mov r10,#0x0fffffff
  85. ldrb r5,[$inp,#1]
  86. and r3,r10,#-4 @ 0x0ffffffc
  87. ldrb r6,[$inp,#2]
  88. ldrb r7,[$inp,#3]
  89. orr r4,r4,r5,lsl#8
  90. ldrb r5,[$inp,#4]
  91. orr r4,r4,r6,lsl#16
  92. ldrb r6,[$inp,#5]
  93. orr r4,r4,r7,lsl#24
  94. ldrb r7,[$inp,#6]
  95. and r4,r4,r10
  96. #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
  97. # if !defined(_WIN32)
  98. ldr r12,[r11,r12] @ OPENSSL_armcap_P
  99. # endif
  100. # if defined(__APPLE__) || defined(_WIN32)
  101. ldr r12,[r12]
  102. # endif
  103. #endif
  104. ldrb r8,[$inp,#7]
  105. orr r5,r5,r6,lsl#8
  106. ldrb r6,[$inp,#8]
  107. orr r5,r5,r7,lsl#16
  108. ldrb r7,[$inp,#9]
  109. orr r5,r5,r8,lsl#24
  110. ldrb r8,[$inp,#10]
  111. and r5,r5,r3
  112. #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
  113. tst r12,#ARMV7_NEON @ check for NEON
  114. # ifdef __thumb2__
  115. adr r9,.Lpoly1305_blocks_neon
  116. adr r11,.Lpoly1305_blocks
  117. it ne
  118. movne r11,r9
  119. adr r12,.Lpoly1305_emit
  120. orr r11,r11,#1 @ thumb-ify addresses
  121. orr r12,r12,#1
  122. # else
  123. add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
  124. ite eq
  125. addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
  126. addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
  127. # endif
  128. #endif
  129. ldrb r9,[$inp,#11]
  130. orr r6,r6,r7,lsl#8
  131. ldrb r7,[$inp,#12]
  132. orr r6,r6,r8,lsl#16
  133. ldrb r8,[$inp,#13]
  134. orr r6,r6,r9,lsl#24
  135. ldrb r9,[$inp,#14]
  136. and r6,r6,r3
  137. ldrb r10,[$inp,#15]
  138. orr r7,r7,r8,lsl#8
  139. str r4,[$ctx,#0]
  140. orr r7,r7,r9,lsl#16
  141. str r5,[$ctx,#4]
  142. orr r7,r7,r10,lsl#24
  143. str r6,[$ctx,#8]
  144. and r7,r7,r3
  145. str r7,[$ctx,#12]
  146. #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
  147. stmia r2,{r11,r12} @ fill functions table
  148. mov r0,#1
  149. #else
  150. mov r0,#0
  151. #endif
  152. .Lno_key:
  153. ldmia sp!,{r4-r11}
  154. #if __ARM_ARCH__>=5
  155. ret @ bx lr
  156. #else
  157. tst lr,#1
  158. moveq pc,lr @ be binary compatible with V4, yet
  159. bx lr @ interoperable with Thumb ISA:-)
  160. #endif
  161. .size poly1305_init,.-poly1305_init
  162. ___
  163. {
  # Register allocation for the poly1305_blocks template that follows:
  # the five hash words live in r4-r8 and the four key words in r9-r12.
  my ($h0, $h1, $h2, $h3, $h4) = map { "r$_" } 4 .. 8;
  my ($r0, $r1, $r2, $r3)      = map { "r$_" } 9 .. 12;
  # $s1-$s3 name the very same registers as $r1-$r3: the template computes
  # r+(r>>2) into them and reloads the plain key words from the stack.
  my ($s1, $s2, $s3) = ($r1, $r2, $r3);
  166. $code.=<<___;
  167. .type poly1305_blocks,%function
  168. .align 5
  169. poly1305_blocks:
  170. .Lpoly1305_blocks:
  171. stmdb sp!,{r3-r11,lr}
  172. ands $len,$len,#-16
  173. beq .Lno_data
  174. add $len,$len,$inp @ end pointer
  175. sub sp,sp,#32
  176. #if __ARM_ARCH__<7
  177. ldmia $ctx,{$h0-$r3} @ load context
  178. add $ctx,$ctx,#20
  179. str $len,[sp,#16] @ offload stuff
  180. str $ctx,[sp,#12]
  181. #else
  182. ldr lr,[$ctx,#36] @ is_base2_26
  183. ldmia $ctx!,{$h0-$h4} @ load hash value
  184. str $len,[sp,#16] @ offload stuff
  185. str $ctx,[sp,#12]
  186. adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
  187. mov $r1,$h1,lsr#6
  188. adcs $r1,$r1,$h2,lsl#20
  189. mov $r2,$h2,lsr#12
  190. adcs $r2,$r2,$h3,lsl#14
  191. mov $r3,$h3,lsr#18
  192. adcs $r3,$r3,$h4,lsl#8
  193. mov $len,#0
  194. teq lr,#0
  195. str $len,[$ctx,#16] @ clear is_base2_26
  196. adc $len,$len,$h4,lsr#24
  197. itttt ne
  198. movne $h0,$r0 @ choose between radixes
  199. movne $h1,$r1
  200. movne $h2,$r2
  201. movne $h3,$r3
  202. ldmia $ctx,{$r0-$r3} @ load key
  203. it ne
  204. movne $h4,$len
  205. #endif
  206. mov lr,$inp
  207. cmp $padbit,#0
  208. str $r1,[sp,#20]
  209. str $r2,[sp,#24]
  210. str $r3,[sp,#28]
  211. b .Loop
  212. .align 4
  213. .Loop:
  214. #if __ARM_ARCH__<7
  215. ldrb r0,[lr],#16 @ load input
  216. # ifdef __thumb2__
  217. it hi
  218. # endif
  219. addhi $h4,$h4,#1 @ 1<<128
  220. ldrb r1,[lr,#-15]
  221. ldrb r2,[lr,#-14]
  222. ldrb r3,[lr,#-13]
  223. orr r1,r0,r1,lsl#8
  224. ldrb r0,[lr,#-12]
  225. orr r2,r1,r2,lsl#16
  226. ldrb r1,[lr,#-11]
  227. orr r3,r2,r3,lsl#24
  228. ldrb r2,[lr,#-10]
  229. adds $h0,$h0,r3 @ accumulate input
  230. ldrb r3,[lr,#-9]
  231. orr r1,r0,r1,lsl#8
  232. ldrb r0,[lr,#-8]
  233. orr r2,r1,r2,lsl#16
  234. ldrb r1,[lr,#-7]
  235. orr r3,r2,r3,lsl#24
  236. ldrb r2,[lr,#-6]
  237. adcs $h1,$h1,r3
  238. ldrb r3,[lr,#-5]
  239. orr r1,r0,r1,lsl#8
  240. ldrb r0,[lr,#-4]
  241. orr r2,r1,r2,lsl#16
  242. ldrb r1,[lr,#-3]
  243. orr r3,r2,r3,lsl#24
  244. ldrb r2,[lr,#-2]
  245. adcs $h2,$h2,r3
  246. ldrb r3,[lr,#-1]
  247. orr r1,r0,r1,lsl#8
  248. str lr,[sp,#8] @ offload input pointer
  249. orr r2,r1,r2,lsl#16
  250. add $s1,$r1,$r1,lsr#2
  251. orr r3,r2,r3,lsl#24
  252. #else
  253. ldr r0,[lr],#16 @ load input
  254. it hi
  255. addhi $h4,$h4,#1 @ padbit
  256. ldr r1,[lr,#-12]
  257. ldr r2,[lr,#-8]
  258. ldr r3,[lr,#-4]
  259. # ifdef __ARMEB__
  260. rev r0,r0
  261. rev r1,r1
  262. rev r2,r2
  263. rev r3,r3
  264. # endif
  265. adds $h0,$h0,r0 @ accumulate input
  266. str lr,[sp,#8] @ offload input pointer
  267. adcs $h1,$h1,r1
  268. add $s1,$r1,$r1,lsr#2
  269. adcs $h2,$h2,r2
  270. #endif
  271. add $s2,$r2,$r2,lsr#2
  272. adcs $h3,$h3,r3
  273. add $s3,$r3,$r3,lsr#2
  274. umull r2,r3,$h1,$r0
  275. adc $h4,$h4,#0
  276. umull r0,r1,$h0,$r0
  277. umlal r2,r3,$h4,$s1
  278. umlal r0,r1,$h3,$s1
  279. ldr $r1,[sp,#20] @ reload $r1
  280. umlal r2,r3,$h2,$s3
  281. umlal r0,r1,$h1,$s3
  282. umlal r2,r3,$h3,$s2
  283. umlal r0,r1,$h2,$s2
  284. umlal r2,r3,$h0,$r1
  285. str r0,[sp,#0] @ future $h0
  286. mul r0,$s2,$h4
  287. ldr $r2,[sp,#24] @ reload $r2
  288. adds r2,r2,r1 @ d1+=d0>>32
  289. eor r1,r1,r1
  290. adc lr,r3,#0 @ future $h2
  291. str r2,[sp,#4] @ future $h1
  292. mul r2,$s3,$h4
  293. eor r3,r3,r3
  294. umlal r0,r1,$h3,$s3
  295. ldr $r3,[sp,#28] @ reload $r3
  296. umlal r2,r3,$h3,$r0
  297. umlal r0,r1,$h2,$r0
  298. umlal r2,r3,$h2,$r1
  299. umlal r0,r1,$h1,$r1
  300. umlal r2,r3,$h1,$r2
  301. umlal r0,r1,$h0,$r2
  302. umlal r2,r3,$h0,$r3
  303. ldr $h0,[sp,#0]
  304. mul $h4,$r0,$h4
  305. ldr $h1,[sp,#4]
  306. adds $h2,lr,r0 @ d2+=d1>>32
  307. ldr lr,[sp,#8] @ reload input pointer
  308. adc r1,r1,#0
  309. adds $h3,r2,r1 @ d3+=d2>>32
  310. ldr r0,[sp,#16] @ reload end pointer
  311. adc r3,r3,#0
  312. add $h4,$h4,r3 @ h4+=d3>>32
  313. and r1,$h4,#-4
  314. and $h4,$h4,#3
  315. add r1,r1,r1,lsr#2 @ *=5
  316. adds $h0,$h0,r1
  317. adcs $h1,$h1,#0
  318. adcs $h2,$h2,#0
  319. adcs $h3,$h3,#0
  320. adc $h4,$h4,#0
  321. cmp r0,lr @ done yet?
  322. bhi .Loop
  323. ldr $ctx,[sp,#12]
  324. add sp,sp,#32
  325. stmdb $ctx,{$h0-$h4} @ store the result
  326. .Lno_data:
  327. #if __ARM_ARCH__>=5
  328. ldmia sp!,{r3-r11,pc}
  329. #else
  330. ldmia sp!,{r3-r11,lr}
  331. tst lr,#1
  332. moveq pc,lr @ be binary compatible with V4, yet
  333. bx lr @ interoperable with Thumb ISA:-)
  334. #endif
  335. .size poly1305_blocks,.-poly1305_blocks
  336. ___
  337. }
  338. {
  # Register allocation for the poly1305_emit template that follows:
  # arguments in r0-r2, hash words in r3-r7, the "g" candidate in r8-r11.
  my ($ctx, $mac, $nonce)      = qw(r0 r1 r2);
  my ($h0, $h1, $h2, $h3, $h4) = map { "r$_" } 3 .. 7;
  my ($g0, $g1, $g2, $g3)      = map { "r$_" } 8 .. 11;
  # The fifth "g" word reuses r0: the template writes it only after its
  # last read through the context pointer.
  my $g4 = $ctx;
  342. $code.=<<___;
  343. .type poly1305_emit,%function
  344. .align 5
  345. poly1305_emit:
  346. .Lpoly1305_emit:
  347. stmdb sp!,{r4-r11}
  348. ldmia $ctx,{$h0-$h4}
  349. #if __ARM_ARCH__>=7
  350. ldr ip,[$ctx,#36] @ is_base2_26
  351. adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
  352. mov $g1,$h1,lsr#6
  353. adcs $g1,$g1,$h2,lsl#20
  354. mov $g2,$h2,lsr#12
  355. adcs $g2,$g2,$h3,lsl#14
  356. mov $g3,$h3,lsr#18
  357. adcs $g3,$g3,$h4,lsl#8
  358. mov $g4,#0
  359. adc $g4,$g4,$h4,lsr#24
  360. tst ip,ip
  361. itttt ne
  362. movne $h0,$g0
  363. movne $h1,$g1
  364. movne $h2,$g2
  365. movne $h3,$g3
  366. it ne
  367. movne $h4,$g4
  368. #endif
  369. adds $g0,$h0,#5 @ compare to modulus
  370. adcs $g1,$h1,#0
  371. adcs $g2,$h2,#0
  372. adcs $g3,$h3,#0
  373. adc $g4,$h4,#0
  374. tst $g4,#4 @ did it carry/borrow?
  375. #ifdef __thumb2__
  376. it ne
  377. #endif
  378. movne $h0,$g0
  379. ldr $g0,[$nonce,#0]
  380. #ifdef __thumb2__
  381. it ne
  382. #endif
  383. movne $h1,$g1
  384. ldr $g1,[$nonce,#4]
  385. #ifdef __thumb2__
  386. it ne
  387. #endif
  388. movne $h2,$g2
  389. ldr $g2,[$nonce,#8]
  390. #ifdef __thumb2__
  391. it ne
  392. #endif
  393. movne $h3,$g3
  394. ldr $g3,[$nonce,#12]
  395. adds $h0,$h0,$g0
  396. adcs $h1,$h1,$g1
  397. adcs $h2,$h2,$g2
  398. adc $h3,$h3,$g3
  399. #if __ARM_ARCH__>=7
  400. # ifdef __ARMEB__
  401. rev $h0,$h0
  402. rev $h1,$h1
  403. rev $h2,$h2
  404. rev $h3,$h3
  405. # endif
  406. str $h0,[$mac,#0]
  407. str $h1,[$mac,#4]
  408. str $h2,[$mac,#8]
  409. str $h3,[$mac,#12]
  410. #else
  411. strb $h0,[$mac,#0]
  412. mov $h0,$h0,lsr#8
  413. strb $h1,[$mac,#4]
  414. mov $h1,$h1,lsr#8
  415. strb $h2,[$mac,#8]
  416. mov $h2,$h2,lsr#8
  417. strb $h3,[$mac,#12]
  418. mov $h3,$h3,lsr#8
  419. strb $h0,[$mac,#1]
  420. mov $h0,$h0,lsr#8
  421. strb $h1,[$mac,#5]
  422. mov $h1,$h1,lsr#8
  423. strb $h2,[$mac,#9]
  424. mov $h2,$h2,lsr#8
  425. strb $h3,[$mac,#13]
  426. mov $h3,$h3,lsr#8
  427. strb $h0,[$mac,#2]
  428. mov $h0,$h0,lsr#8
  429. strb $h1,[$mac,#6]
  430. mov $h1,$h1,lsr#8
  431. strb $h2,[$mac,#10]
  432. mov $h2,$h2,lsr#8
  433. strb $h3,[$mac,#14]
  434. mov $h3,$h3,lsr#8
  435. strb $h0,[$mac,#3]
  436. strb $h1,[$mac,#7]
  437. strb $h2,[$mac,#11]
  438. strb $h3,[$mac,#15]
  439. #endif
  440. ldmia sp!,{r4-r11}
  441. #if __ARM_ARCH__>=5
  442. ret @ bx lr
  443. #else
  444. tst lr,#1
  445. moveq pc,lr @ be binary compatible with V4, yet
  446. bx lr @ interoperable with Thumb ISA:-)
  447. #endif
  448. .size poly1305_emit,.-poly1305_emit
  449. ___
  450. {
  # Register allocation for the NEON templates that follow.
  # Key-power limbs ($R0-$R4) and their multiples of five ($S1-$S4) occupy
  # d0-d8; the list runs to d9, which is left unassigned.
  my ($R0, $R1, $S1, $R2, $S2, $R3, $S3, $R4, $S4) = map { "d$_" } 0 .. 9;
  # $D0-$D4 receive the vmull/vmlal products, $H0-$H4 hold the input limbs.
  my ($D0, $D1, $D2, $D3, $D4) = map { "q$_" } 5 .. 9;
  my ($H0, $H1, $H2, $H3, $H4) = map { "q$_" } 10 .. 14;
  # Two temporaries and the limb mask.
  my ($T0, $T1, $MASK) = qw(q15 q4 q0);
  # Core registers: second input pointer, pointer to the .Lzeros block and
  # two pointers into the key-power table.
  my ($in2, $zeros, $tbl0, $tbl1) = map { "r$_" } 4 .. 7;
  455. $code.=<<___;
  456. #if __ARM_MAX_ARCH__>=7
  457. .fpu neon
  458. .type poly1305_init_neon,%function
  459. .align 5
  460. poly1305_init_neon:
  461. .Lpoly1305_init_neon:
  462. ldr r3,[$ctx,#48] @ first table element
  463. cmp r3,#-1 @ is value impossible?
  464. bne .Lno_init_neon
  465. ldr r4,[$ctx,#20] @ load key base 2^32
  466. ldr r5,[$ctx,#24]
  467. ldr r6,[$ctx,#28]
  468. ldr r7,[$ctx,#32]
  469. and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
  470. mov r3,r4,lsr#26
  471. mov r4,r5,lsr#20
  472. orr r3,r3,r5,lsl#6
  473. mov r5,r6,lsr#14
  474. orr r4,r4,r6,lsl#12
  475. mov r6,r7,lsr#8
  476. orr r5,r5,r7,lsl#18
  477. and r3,r3,#0x03ffffff
  478. and r4,r4,#0x03ffffff
  479. and r5,r5,#0x03ffffff
  480. vdup.32 $R0,r2 @ r^1 in both lanes
  481. add r2,r3,r3,lsl#2 @ *5
  482. vdup.32 $R1,r3
  483. add r3,r4,r4,lsl#2
  484. vdup.32 $S1,r2
  485. vdup.32 $R2,r4
  486. add r4,r5,r5,lsl#2
  487. vdup.32 $S2,r3
  488. vdup.32 $R3,r5
  489. add r5,r6,r6,lsl#2
  490. vdup.32 $S3,r4
  491. vdup.32 $R4,r6
  492. vdup.32 $S4,r5
  493. mov $zeros,#2 @ counter
  494. .Lsquare_neon:
  495. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  496. @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  497. @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  498. @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  499. @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  500. @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  501. vmull.u32 $D0,$R0,${R0}[1]
  502. vmull.u32 $D1,$R1,${R0}[1]
  503. vmull.u32 $D2,$R2,${R0}[1]
  504. vmull.u32 $D3,$R3,${R0}[1]
  505. vmull.u32 $D4,$R4,${R0}[1]
  506. vmlal.u32 $D0,$R4,${S1}[1]
  507. vmlal.u32 $D1,$R0,${R1}[1]
  508. vmlal.u32 $D2,$R1,${R1}[1]
  509. vmlal.u32 $D3,$R2,${R1}[1]
  510. vmlal.u32 $D4,$R3,${R1}[1]
  511. vmlal.u32 $D0,$R3,${S2}[1]
  512. vmlal.u32 $D1,$R4,${S2}[1]
  513. vmlal.u32 $D3,$R1,${R2}[1]
  514. vmlal.u32 $D2,$R0,${R2}[1]
  515. vmlal.u32 $D4,$R2,${R2}[1]
  516. vmlal.u32 $D0,$R2,${S3}[1]
  517. vmlal.u32 $D3,$R0,${R3}[1]
  518. vmlal.u32 $D1,$R3,${S3}[1]
  519. vmlal.u32 $D2,$R4,${S3}[1]
  520. vmlal.u32 $D4,$R1,${R3}[1]
  521. vmlal.u32 $D3,$R4,${S4}[1]
  522. vmlal.u32 $D0,$R1,${S4}[1]
  523. vmlal.u32 $D1,$R2,${S4}[1]
  524. vmlal.u32 $D2,$R3,${S4}[1]
  525. vmlal.u32 $D4,$R0,${R4}[1]
  526. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  527. @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
  528. @ and P. Schwabe
  529. @
  530. @ H0>>+H1>>+H2>>+H3>>+H4
  531. @ H3>>+H4>>*5+H0>>+H1
  532. @
  533. @ Trivia.
  534. @
  535. @ Result of multiplication of n-bit number by m-bit number is
  536. @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
  537. @ m-bit number multiplied by 2^n is still n+m bits wide.
  538. @
  539. @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
  540. @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
  541. @ one is n+1 bits wide.
  542. @
  543. @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
  544. @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
  545. @ can be 27. However! In cases when their width exceeds 26 bits
  546. @ they are limited by 2^26+2^6. This in turn means that *sum*
  547. @ of the products with these values can still be viewed as sum
  548. @ of 52-bit numbers as long as the amount of addends is not a
  549. @ power of 2. For example,
  550. @
  551. @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
  552. @
  553. @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
  554. @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
  555. @ 8 * (2^52) or 2^55. However, the value is then multiplied by
  556. @ 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
  557. @ which is less than 32 * (2^52) or 2^57. And when processing
  558. @ data we are looking at triple as many addends...
  559. @
  560. @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
  561. @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
  562. @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
  563. @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
  564. @ instruction accepts 2x32-bit input and writes 2x64-bit result.
  565. @ This means that the result of reduction has to be compressed upon
  566. @ loop wrap-around. This can be done in the process of reduction
  567. @ to minimize amount of instructions [as well as amount of
  568. @ 128-bit instructions, which benefits low-end processors], but
  569. @ one has to watch for H2 (which is narrower than H0) and 5*H4
  570. @ not being wider than 58 bits, so that result of right shift
  571. @ by 26 bits fits in 32 bits. This is also useful on x86,
  572. @ because it allows paddd to be used in place of paddq, which
  573. @ benefits Atom, where paddq is ridiculously slow.
  574. vshr.u64 $T0,$D3,#26
  575. vmovn.i64 $D3#lo,$D3
  576. vshr.u64 $T1,$D0,#26
  577. vmovn.i64 $D0#lo,$D0
  578. vadd.i64 $D4,$D4,$T0 @ h3 -> h4
  579. vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
  580. vadd.i64 $D1,$D1,$T1 @ h0 -> h1
  581. vbic.i32 $D0#lo,#0xfc000000
  582. vshrn.u64 $T0#lo,$D4,#26
  583. vmovn.i64 $D4#lo,$D4
  584. vshr.u64 $T1,$D1,#26
  585. vmovn.i64 $D1#lo,$D1
  586. vadd.i64 $D2,$D2,$T1 @ h1 -> h2
  587. vbic.i32 $D4#lo,#0xfc000000
  588. vbic.i32 $D1#lo,#0xfc000000
  589. vadd.i32 $D0#lo,$D0#lo,$T0#lo
  590. vshl.u32 $T0#lo,$T0#lo,#2
  591. vshrn.u64 $T1#lo,$D2,#26
  592. vmovn.i64 $D2#lo,$D2
  593. vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
  594. vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
  595. vbic.i32 $D2#lo,#0xfc000000
  596. vshr.u32 $T0#lo,$D0#lo,#26
  597. vbic.i32 $D0#lo,#0xfc000000
  598. vshr.u32 $T1#lo,$D3#lo,#26
  599. vbic.i32 $D3#lo,#0xfc000000
  600. vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
  601. vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
  602. subs $zeros,$zeros,#1
  603. beq .Lsquare_break_neon
  604. add $tbl0,$ctx,#(48+0*9*4)
  605. add $tbl1,$ctx,#(48+1*9*4)
  606. vtrn.32 $R0,$D0#lo @ r^2:r^1
  607. vtrn.32 $R2,$D2#lo
  608. vtrn.32 $R3,$D3#lo
  609. vtrn.32 $R1,$D1#lo
  610. vtrn.32 $R4,$D4#lo
  611. vshl.u32 $S2,$R2,#2 @ *5
  612. vshl.u32 $S3,$R3,#2
  613. vshl.u32 $S1,$R1,#2
  614. vshl.u32 $S4,$R4,#2
  615. vadd.i32 $S2,$S2,$R2
  616. vadd.i32 $S1,$S1,$R1
  617. vadd.i32 $S3,$S3,$R3
  618. vadd.i32 $S4,$S4,$R4
  619. vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
  620. vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
  621. vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
  622. vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
  623. vst1.32 {${S4}[0]},[$tbl0,:32]
  624. vst1.32 {${S4}[1]},[$tbl1,:32]
  625. b .Lsquare_neon
  626. .align 4
  627. .Lsquare_break_neon:
  628. add $tbl0,$ctx,#(48+2*4*9)
  629. add $tbl1,$ctx,#(48+3*4*9)
  630. vmov $R0,$D0#lo @ r^4:r^3
  631. vshl.u32 $S1,$D1#lo,#2 @ *5
  632. vmov $R1,$D1#lo
  633. vshl.u32 $S2,$D2#lo,#2
  634. vmov $R2,$D2#lo
  635. vshl.u32 $S3,$D3#lo,#2
  636. vmov $R3,$D3#lo
  637. vshl.u32 $S4,$D4#lo,#2
  638. vmov $R4,$D4#lo
  639. vadd.i32 $S1,$S1,$D1#lo
  640. vadd.i32 $S2,$S2,$D2#lo
  641. vadd.i32 $S3,$S3,$D3#lo
  642. vadd.i32 $S4,$S4,$D4#lo
  643. vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
  644. vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
  645. vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
  646. vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
  647. vst1.32 {${S4}[0]},[$tbl0]
  648. vst1.32 {${S4}[1]},[$tbl1]
  649. .Lno_init_neon:
  650. ret @ bx lr
  651. .size poly1305_init_neon,.-poly1305_init_neon
  652. .type poly1305_blocks_neon,%function
  653. .align 5
  654. poly1305_blocks_neon:
  655. .Lpoly1305_blocks_neon:
  656. ldr ip,[$ctx,#36] @ is_base2_26
  657. cmp $len,#64
  658. blo .Lpoly1305_blocks
  659. stmdb sp!,{r4-r7}
  660. vstmdb sp!,{d8-d15} @ ABI specification says so
  661. tst ip,ip @ is_base2_26?
  662. bne .Lbase2_26_neon
  663. stmdb sp!,{r1-r3,lr}
  664. bl .Lpoly1305_init_neon
  665. ldr r4,[$ctx,#0] @ load hash value base 2^32
  666. ldr r5,[$ctx,#4]
  667. ldr r6,[$ctx,#8]
  668. ldr r7,[$ctx,#12]
  669. ldr ip,[$ctx,#16]
  670. and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
  671. mov r3,r4,lsr#26
  672. veor $D0#lo,$D0#lo,$D0#lo
  673. mov r4,r5,lsr#20
  674. orr r3,r3,r5,lsl#6
  675. veor $D1#lo,$D1#lo,$D1#lo
  676. mov r5,r6,lsr#14
  677. orr r4,r4,r6,lsl#12
  678. veor $D2#lo,$D2#lo,$D2#lo
  679. mov r6,r7,lsr#8
  680. orr r5,r5,r7,lsl#18
  681. veor $D3#lo,$D3#lo,$D3#lo
  682. and r3,r3,#0x03ffffff
  683. orr r6,r6,ip,lsl#24
  684. veor $D4#lo,$D4#lo,$D4#lo
  685. and r4,r4,#0x03ffffff
  686. mov r1,#1
  687. and r5,r5,#0x03ffffff
  688. str r1,[$ctx,#36] @ set is_base2_26
  689. vmov.32 $D0#lo[0],r2
  690. vmov.32 $D1#lo[0],r3
  691. vmov.32 $D2#lo[0],r4
  692. vmov.32 $D3#lo[0],r5
  693. vmov.32 $D4#lo[0],r6
  694. adr $zeros,.Lzeros
  695. ldmia sp!,{r1-r3,lr}
  696. b .Lhash_loaded
  697. .align 4
  698. .Lbase2_26_neon:
  699. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  700. @ load hash value
  701. veor $D0#lo,$D0#lo,$D0#lo
  702. veor $D1#lo,$D1#lo,$D1#lo
  703. veor $D2#lo,$D2#lo,$D2#lo
  704. veor $D3#lo,$D3#lo,$D3#lo
  705. veor $D4#lo,$D4#lo,$D4#lo
  706. vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
  707. adr $zeros,.Lzeros
  708. vld1.32 {$D4#lo[0]},[$ctx]
  709. sub $ctx,$ctx,#16 @ rewind
  710. .Lhash_loaded:
  711. add $in2,$inp,#32
  712. mov $padbit,$padbit,lsl#24
  713. tst $len,#31
  714. beq .Leven
  715. vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
  716. vmov.32 $H4#lo[0],$padbit
  717. sub $len,$len,#16
  718. add $in2,$inp,#32
  719. # ifdef __ARMEB__
  720. vrev32.8 $H0,$H0
  721. vrev32.8 $H3,$H3
  722. vrev32.8 $H1,$H1
  723. vrev32.8 $H2,$H2
  724. # endif
  725. vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
  726. vshl.u32 $H3#lo,$H3#lo,#18
  727. vsri.u32 $H3#lo,$H2#lo,#14
  728. vshl.u32 $H2#lo,$H2#lo,#12
  729. vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
  730. vbic.i32 $H3#lo,#0xfc000000
  731. vsri.u32 $H2#lo,$H1#lo,#20
  732. vshl.u32 $H1#lo,$H1#lo,#6
  733. vbic.i32 $H2#lo,#0xfc000000
  734. vsri.u32 $H1#lo,$H0#lo,#26
  735. vadd.i32 $H3#hi,$H3#lo,$D3#lo
  736. vbic.i32 $H0#lo,#0xfc000000
  737. vbic.i32 $H1#lo,#0xfc000000
  738. vadd.i32 $H2#hi,$H2#lo,$D2#lo
  739. vadd.i32 $H0#hi,$H0#lo,$D0#lo
  740. vadd.i32 $H1#hi,$H1#lo,$D1#lo
  741. mov $tbl1,$zeros
  742. add $tbl0,$ctx,#48
  743. cmp $len,$len
  744. b .Long_tail
  745. .align 4
  746. .Leven:
  747. subs $len,$len,#64
  748. it lo
  749. movlo $in2,$zeros
  750. vmov.i32 $H4,#1<<24 @ padbit, yes, always
  751. vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
  752. add $inp,$inp,#64
  753. vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
  754. add $in2,$in2,#64
  755. itt hi
  756. addhi $tbl1,$ctx,#(48+1*9*4)
  757. addhi $tbl0,$ctx,#(48+3*9*4)
  758. # ifdef __ARMEB__
  759. vrev32.8 $H0,$H0
  760. vrev32.8 $H3,$H3
  761. vrev32.8 $H1,$H1
  762. vrev32.8 $H2,$H2
  763. # endif
  764. vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
  765. vshl.u32 $H3,$H3,#18
  766. vsri.u32 $H3,$H2,#14
  767. vshl.u32 $H2,$H2,#12
  768. vbic.i32 $H3,#0xfc000000
  769. vsri.u32 $H2,$H1,#20
  770. vshl.u32 $H1,$H1,#6
  771. vbic.i32 $H2,#0xfc000000
  772. vsri.u32 $H1,$H0,#26
  773. vbic.i32 $H0,#0xfc000000
  774. vbic.i32 $H1,#0xfc000000
  775. bls .Lskip_loop
  776. vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
  777. vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
  778. vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
  779. vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
  780. b .Loop_neon
  781. .align 5
  782. .Loop_neon:
  783. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  784. @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
  785. @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
  786. @ \___________________/
  787. @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
  788. @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
  789. @ \___________________/ \____________________/
  790. @
  791. @ Note that we start with inp[2:3]*r^2. This is because it
  792. @ doesn't depend on reduction in previous iteration.
  793. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  794. @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  795. @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  796. @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  797. @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  798. @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  799. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  800. @ inp[2:3]*r^2
  801. vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
  802. vmull.u32 $D2,$H2#hi,${R0}[1]
  803. vadd.i32 $H0#lo,$H0#lo,$D0#lo
  804. vmull.u32 $D0,$H0#hi,${R0}[1]
  805. vadd.i32 $H3#lo,$H3#lo,$D3#lo
  806. vmull.u32 $D3,$H3#hi,${R0}[1]
  807. vmlal.u32 $D2,$H1#hi,${R1}[1]
  808. vadd.i32 $H1#lo,$H1#lo,$D1#lo
  809. vmull.u32 $D1,$H1#hi,${R0}[1]
  810. vadd.i32 $H4#lo,$H4#lo,$D4#lo
  811. vmull.u32 $D4,$H4#hi,${R0}[1]
  812. subs $len,$len,#64
  813. vmlal.u32 $D0,$H4#hi,${S1}[1]
  814. it lo
  815. movlo $in2,$zeros
  816. vmlal.u32 $D3,$H2#hi,${R1}[1]
  817. vld1.32 ${S4}[1],[$tbl1,:32]
  818. vmlal.u32 $D1,$H0#hi,${R1}[1]
  819. vmlal.u32 $D4,$H3#hi,${R1}[1]
  820. vmlal.u32 $D0,$H3#hi,${S2}[1]
  821. vmlal.u32 $D3,$H1#hi,${R2}[1]
  822. vmlal.u32 $D4,$H2#hi,${R2}[1]
  823. vmlal.u32 $D1,$H4#hi,${S2}[1]
  824. vmlal.u32 $D2,$H0#hi,${R2}[1]
  825. vmlal.u32 $D3,$H0#hi,${R3}[1]
  826. vmlal.u32 $D0,$H2#hi,${S3}[1]
  827. vmlal.u32 $D4,$H1#hi,${R3}[1]
  828. vmlal.u32 $D1,$H3#hi,${S3}[1]
  829. vmlal.u32 $D2,$H4#hi,${S3}[1]
  830. vmlal.u32 $D3,$H4#hi,${S4}[1]
  831. vmlal.u32 $D0,$H1#hi,${S4}[1]
  832. vmlal.u32 $D4,$H0#hi,${R4}[1]
  833. vmlal.u32 $D1,$H2#hi,${S4}[1]
  834. vmlal.u32 $D2,$H3#hi,${S4}[1]
  835. vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
  836. add $in2,$in2,#64
  837. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  838. @ (hash+inp[0:1])*r^4 and accumulate
  839. vmlal.u32 $D3,$H3#lo,${R0}[0]
  840. vmlal.u32 $D0,$H0#lo,${R0}[0]
  841. vmlal.u32 $D4,$H4#lo,${R0}[0]
  842. vmlal.u32 $D1,$H1#lo,${R0}[0]
  843. vmlal.u32 $D2,$H2#lo,${R0}[0]
  844. vld1.32 ${S4}[0],[$tbl0,:32]
  845. vmlal.u32 $D3,$H2#lo,${R1}[0]
  846. vmlal.u32 $D0,$H4#lo,${S1}[0]
  847. vmlal.u32 $D4,$H3#lo,${R1}[0]
  848. vmlal.u32 $D1,$H0#lo,${R1}[0]
  849. vmlal.u32 $D2,$H1#lo,${R1}[0]
  850. vmlal.u32 $D3,$H1#lo,${R2}[0]
  851. vmlal.u32 $D0,$H3#lo,${S2}[0]
  852. vmlal.u32 $D4,$H2#lo,${R2}[0]
  853. vmlal.u32 $D1,$H4#lo,${S2}[0]
  854. vmlal.u32 $D2,$H0#lo,${R2}[0]
  855. vmlal.u32 $D3,$H0#lo,${R3}[0]
  856. vmlal.u32 $D0,$H2#lo,${S3}[0]
  857. vmlal.u32 $D4,$H1#lo,${R3}[0]
  858. vmlal.u32 $D1,$H3#lo,${S3}[0]
  859. vmlal.u32 $D3,$H4#lo,${S4}[0]
  860. vmlal.u32 $D2,$H4#lo,${S3}[0]
  861. vmlal.u32 $D0,$H1#lo,${S4}[0]
  862. vmlal.u32 $D4,$H0#lo,${R4}[0]
  863. vmov.i32 $H4,#1<<24 @ padbit, yes, always
  864. vmlal.u32 $D1,$H2#lo,${S4}[0]
  865. vmlal.u32 $D2,$H3#lo,${S4}[0]
  866. vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
  867. add $inp,$inp,#64
  868. # ifdef __ARMEB__
  869. vrev32.8 $H0,$H0
  870. vrev32.8 $H1,$H1
  871. vrev32.8 $H2,$H2
  872. vrev32.8 $H3,$H3
  873. # endif
  874. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  875. @ lazy reduction interleaved with base 2^32 -> base 2^26 of
  876. @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
  877. vshr.u64 $T0,$D3,#26
  878. vmovn.i64 $D3#lo,$D3
  879. vshr.u64 $T1,$D0,#26
  880. vmovn.i64 $D0#lo,$D0
  881. vadd.i64 $D4,$D4,$T0 @ h3 -> h4
  882. vbic.i32 $D3#lo,#0xfc000000
  883. vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
  884. vadd.i64 $D1,$D1,$T1 @ h0 -> h1
  885. vshl.u32 $H3,$H3,#18
  886. vbic.i32 $D0#lo,#0xfc000000
  887. vshrn.u64 $T0#lo,$D4,#26
  888. vmovn.i64 $D4#lo,$D4
  889. vshr.u64 $T1,$D1,#26
  890. vmovn.i64 $D1#lo,$D1
  891. vadd.i64 $D2,$D2,$T1 @ h1 -> h2
  892. vsri.u32 $H3,$H2,#14
  893. vbic.i32 $D4#lo,#0xfc000000
  894. vshl.u32 $H2,$H2,#12
  895. vbic.i32 $D1#lo,#0xfc000000
  896. vadd.i32 $D0#lo,$D0#lo,$T0#lo
  897. vshl.u32 $T0#lo,$T0#lo,#2
  898. vbic.i32 $H3,#0xfc000000
  899. vshrn.u64 $T1#lo,$D2,#26
  900. vmovn.i64 $D2#lo,$D2
  901. vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec]
  902. vsri.u32 $H2,$H1,#20
  903. vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
  904. vshl.u32 $H1,$H1,#6
  905. vbic.i32 $D2#lo,#0xfc000000
  906. vbic.i32 $H2,#0xfc000000
  907. vshrn.u64 $T0#lo,$D0,#26 @ re-narrow
  908. vmovn.i64 $D0#lo,$D0
  909. vsri.u32 $H1,$H0,#26
  910. vbic.i32 $H0,#0xfc000000
  911. vshr.u32 $T1#lo,$D3#lo,#26
  912. vbic.i32 $D3#lo,#0xfc000000
  913. vbic.i32 $D0#lo,#0xfc000000
  914. vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
  915. vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
  916. vbic.i32 $H1,#0xfc000000
  917. bhi .Loop_neon
  918. .Lskip_loop:
  919. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  920. @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
  921. add $tbl1,$ctx,#(48+0*9*4)
  922. add $tbl0,$ctx,#(48+1*9*4)
  923. adds $len,$len,#32
  924. it ne
  925. movne $len,#0
  926. bne .Long_tail
  927. vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
  928. vadd.i32 $H0#hi,$H0#lo,$D0#lo
  929. vadd.i32 $H3#hi,$H3#lo,$D3#lo
  930. vadd.i32 $H1#hi,$H1#lo,$D1#lo
  931. vadd.i32 $H4#hi,$H4#lo,$D4#lo
  932. .Long_tail:
  933. vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
  934. vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
  935. vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
  936. vmull.u32 $D2,$H2#hi,$R0
  937. vadd.i32 $H0#lo,$H0#lo,$D0#lo
  938. vmull.u32 $D0,$H0#hi,$R0
  939. vadd.i32 $H3#lo,$H3#lo,$D3#lo
  940. vmull.u32 $D3,$H3#hi,$R0
  941. vadd.i32 $H1#lo,$H1#lo,$D1#lo
  942. vmull.u32 $D1,$H1#hi,$R0
  943. vadd.i32 $H4#lo,$H4#lo,$D4#lo
  944. vmull.u32 $D4,$H4#hi,$R0
  945. vmlal.u32 $D0,$H4#hi,$S1
  946. vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
  947. vmlal.u32 $D3,$H2#hi,$R1
  948. vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
  949. vmlal.u32 $D1,$H0#hi,$R1
  950. vmlal.u32 $D4,$H3#hi,$R1
  951. vmlal.u32 $D2,$H1#hi,$R1
  952. vmlal.u32 $D3,$H1#hi,$R2
  953. vld1.32 ${S4}[1],[$tbl1,:32]
  954. vmlal.u32 $D0,$H3#hi,$S2
  955. vld1.32 ${S4}[0],[$tbl0,:32]
  956. vmlal.u32 $D4,$H2#hi,$R2
  957. vmlal.u32 $D1,$H4#hi,$S2
  958. vmlal.u32 $D2,$H0#hi,$R2
  959. vmlal.u32 $D3,$H0#hi,$R3
  960. it ne
  961. addne $tbl1,$ctx,#(48+2*9*4)
  962. vmlal.u32 $D0,$H2#hi,$S3
  963. it ne
  964. addne $tbl0,$ctx,#(48+3*9*4)
  965. vmlal.u32 $D4,$H1#hi,$R3
  966. vmlal.u32 $D1,$H3#hi,$S3
  967. vmlal.u32 $D2,$H4#hi,$S3
  968. vmlal.u32 $D3,$H4#hi,$S4
  969. vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
  970. vmlal.u32 $D0,$H1#hi,$S4
  971. vshr.u64 $MASK,$MASK,#38
  972. vmlal.u32 $D4,$H0#hi,$R4
  973. vmlal.u32 $D1,$H2#hi,$S4
  974. vmlal.u32 $D2,$H3#hi,$S4
  975. beq .Lshort_tail
  976. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  977. @ (hash+inp[0:1])*r^4:r^3 and accumulate
  978. vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
  979. vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
  980. vmlal.u32 $D2,$H2#lo,$R0
  981. vmlal.u32 $D0,$H0#lo,$R0
  982. vmlal.u32 $D3,$H3#lo,$R0
  983. vmlal.u32 $D1,$H1#lo,$R0
  984. vmlal.u32 $D4,$H4#lo,$R0
  985. vmlal.u32 $D0,$H4#lo,$S1
  986. vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
  987. vmlal.u32 $D3,$H2#lo,$R1
  988. vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
  989. vmlal.u32 $D1,$H0#lo,$R1
  990. vmlal.u32 $D4,$H3#lo,$R1
  991. vmlal.u32 $D2,$H1#lo,$R1
  992. vmlal.u32 $D3,$H1#lo,$R2
  993. vld1.32 ${S4}[1],[$tbl1,:32]
  994. vmlal.u32 $D0,$H3#lo,$S2
  995. vld1.32 ${S4}[0],[$tbl0,:32]
  996. vmlal.u32 $D4,$H2#lo,$R2
  997. vmlal.u32 $D1,$H4#lo,$S2
  998. vmlal.u32 $D2,$H0#lo,$R2
  999. vmlal.u32 $D3,$H0#lo,$R3
  1000. vmlal.u32 $D0,$H2#lo,$S3
  1001. vmlal.u32 $D4,$H1#lo,$R3
  1002. vmlal.u32 $D1,$H3#lo,$S3
  1003. vmlal.u32 $D2,$H4#lo,$S3
  1004. vmlal.u32 $D3,$H4#lo,$S4
  1005. vorn $MASK,$MASK,$MASK @ all-ones
  1006. vmlal.u32 $D0,$H1#lo,$S4
  1007. vshr.u64 $MASK,$MASK,#38
  1008. vmlal.u32 $D4,$H0#lo,$R4
  1009. vmlal.u32 $D1,$H2#lo,$S4
  1010. vmlal.u32 $D2,$H3#lo,$S4
  1011. .Lshort_tail:
  1012. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  1013. @ horizontal addition
  1014. vadd.i64 $D3#lo,$D3#lo,$D3#hi
  1015. vadd.i64 $D0#lo,$D0#lo,$D0#hi
  1016. vadd.i64 $D4#lo,$D4#lo,$D4#hi
  1017. vadd.i64 $D1#lo,$D1#lo,$D1#hi
  1018. vadd.i64 $D2#lo,$D2#lo,$D2#hi
  1019. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  1020. @ lazy reduction, but without narrowing
  1021. vshr.u64 $T0,$D3,#26
  1022. vand.i64 $D3,$D3,$MASK
  1023. vshr.u64 $T1,$D0,#26
  1024. vand.i64 $D0,$D0,$MASK
  1025. vadd.i64 $D4,$D4,$T0 @ h3 -> h4
  1026. vadd.i64 $D1,$D1,$T1 @ h0 -> h1
  1027. vshr.u64 $T0,$D4,#26
  1028. vand.i64 $D4,$D4,$MASK
  1029. vshr.u64 $T1,$D1,#26
  1030. vand.i64 $D1,$D1,$MASK
  1031. vadd.i64 $D2,$D2,$T1 @ h1 -> h2
  1032. vadd.i64 $D0,$D0,$T0
  1033. vshl.u64 $T0,$T0,#2
  1034. vshr.u64 $T1,$D2,#26
  1035. vand.i64 $D2,$D2,$MASK
  1036. vadd.i64 $D0,$D0,$T0 @ h4 -> h0
  1037. vadd.i64 $D3,$D3,$T1 @ h2 -> h3
  1038. vshr.u64 $T0,$D0,#26
  1039. vand.i64 $D0,$D0,$MASK
  1040. vshr.u64 $T1,$D3,#26
  1041. vand.i64 $D3,$D3,$MASK
  1042. vadd.i64 $D1,$D1,$T0 @ h0 -> h1
  1043. vadd.i64 $D4,$D4,$T1 @ h3 -> h4
  1044. cmp $len,#0
  1045. bne .Leven
  1046. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  1047. @ store hash value
  1048. vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
  1049. vst1.32 {$D4#lo[0]},[$ctx]
  1050. vldmia sp!,{d8-d15} @ epilogue
  1051. ldmia sp!,{r4-r7}
  1052. ret @ bx lr
  1053. .size poly1305_blocks_neon,.-poly1305_blocks_neon
  1054. .align 5
  1055. .Lzeros:
  1056. .long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
  1057. #ifndef __KERNEL__
  1058. .LOPENSSL_armcap:
  1059. # ifdef _WIN32
  1060. .word OPENSSL_armcap_P
  1061. # else
  1062. .word OPENSSL_armcap_P-.Lpoly1305_init
  1063. # endif
  1064. .comm OPENSSL_armcap_P,4,4
  1065. .hidden OPENSSL_armcap_P
  1066. #endif
  1067. #endif
  1068. ___
  1069. } }
  1070. $code.=<<___;
  1071. .asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
  1072. .align 2
  1073. ___
  # Post-process the accumulated assembly text line by line and write it
  # to STDOUT (a file or the arm-xlate.pl pipe, if one was set up earlier):
  #   1. `...` spans are replaced by the result of evaluating them as Perl;
  #   2. then at most one of the following rewrites is applied per line,
  #      the first that matches:
  #        qN#lo / qN#hi  ->  d(2N) / d(2N+1), the halves of a q register;
  #        ret            ->  bx lr;
  #        bx lr          ->  its raw encoding as a .word, which makes it
  #                           possible to compile with -march=armv4.
  #      A "ret" therefore ends up as a literal "bx lr", while a "bx lr"
  #      written out in a template ends up as the .word.
  foreach (split("\n",$code)) {
      s/\`([^\`]*)\`/eval $1/ge;

      s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge or
      s/\bret\b/bx lr/g                                        or
      s/\bbx\s+lr\b/.word\t0xe12fff1e/g;

      print $_,"\n" or die "error writing output: $!";
  }

  # Closing enforces the flush.  For a piped STDOUT it also waits for the
  # translator and returns false if that exited with an error, so a failed
  # translation no longer passes for success.
  close STDOUT or die "error closing STDOUT: $!";