#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "absolute"
# terms is ~2250 cycles per 64-byte block or ~35 cycles per byte
# [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output" or die "can't open $output: $!";
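
# Register assignments: r0-r3 do double duty as the function arguments
# (ctx, inp, len) plus $T1 and as scratch temporaries t0/t4/t1/t3; the eight
# working variables a..h live in r4-r11, r12 is a further temporary and r14
# walks the K256 table. The Sigma*/sigma* arrays below are the SHA-256
# rotate/shift amounts (the third entry of the lower-case sigma arrays is a
# logical shift, not a rotate).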
$ctx="r0"; $t0="r0";
$inp="r1"; $t4="r1";
$len="r2"; $t1="r2";
$T1="r3"; $t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
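
# BODY_00_15 emits one integer-only SHA-256 round. The h+=Maj(a,b,c) of the
# previous round is deferred and folded in here ("from the past"), and
# $t2/$t3 are swapped on exit so the b^c value prepared this round is
# consumed by the next one. For rounds 0..15 it also picks up the next input
# word (byte-swapping it on little-endian).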
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
@ ldr $t1,[$inp],#4 @ $i
# if $i==15
str $inp,[sp,#17*4] @ make room for $t4
# endif
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
# ifndef __ARMEB__
rev $t1,$t1
# endif
#else
@ ldrb $t1,[$inp,#3] @ $i
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
ldrb $t2,[$inp,#2]
ldrb $t0,[$inp,#1]
orr $t1,$t1,$t2,lsl#8
ldrb $t2,[$inp],#4
orr $t1,$t1,$t0,lsl#16
# if $i==15
str $inp,[sp,#17*4] @ make room for $t4
# endif
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
orr $t1,$t1,$t2,lsl#24
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
#endif
___
$code.=<<___;
ldr $t2,[$Ktbl],#4 @ *K256++
add $h,$h,$t1 @ h+=X[i]
str $t1,[sp,#`$i%16`*4]
eor $t1,$f,$g
add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
and $t1,$t1,$e
add $h,$h,$t2 @ h+=K256[i]
eor $t1,$t1,$g @ Ch(e,f,g)
eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
add $h,$h,$t1 @ h+=Ch(e,f,g)
#if $i==31
and $t2,$t2,#0xff
cmp $t2,#0xf2 @ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
ldr $t1,[$inp],#4 @ prefetch
# else
ldrb $t1,[$inp,#3]
# endif
eor $t2,$a,$b @ a^b, b^c in next round
#else
ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
eor $t2,$a,$b @ a^b, b^c in next round
ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
#endif
eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
and $t3,$t3,$t2 @ (b^c)&=(a^b)
add $d,$d,$h @ d+=h
eor $t3,$t3,$b @ Maj(a,b,c)
add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
@ add $h,$h,$t3 @ h+=Maj(a,b,c)
___
($t2,$t3)=($t3,$t2);
}
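
# BODY_16_XX expands the message schedule for rounds 16..63:
#   X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
# (indices taken mod 16 in the on-stack window), then falls through to
# BODY_00_15 for the round computation itself.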
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
@ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
@ ldr $t4,[sp,#`($i+14)%16`*4]
mov $t0,$t1,ror#$sigma0[0]
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
mov $t2,$t4,ror#$sigma1[0]
eor $t0,$t0,$t1,ror#$sigma0[1]
eor $t2,$t2,$t4,ror#$sigma1[1]
eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
ldr $t1,[sp,#`($i+0)%16`*4]
eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
ldr $t4,[sp,#`($i+9)%16`*4]
add $t2,$t2,$t0
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
add $t1,$t1,$t2
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
add $t1,$t1,$t4 @ X[i]
___
&BODY_00_15(@_);
}
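
# Assembly preamble: arch/feature macros, the K256 constant table and the
# integer-only entry point sha256_block_data_order. Outside the kernel the
# entry point also dispatches at run time to the NEON or ARMv8 code paths
# below, based on the OPENSSL_armcap_P feature word.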
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif
.text
#if __ARM_ARCH__<7
.code 32
#else
.syntax unified
# ifdef __thumb2__
.thumb
# else
.code 32
# endif
#endif
.type K256,%object
.align 5
K256:
.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-sha256_block_data_order
#endif
.align 5
.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7
sub r3,pc,#8 @ sha256_block_data_order
#else
adr r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
tst r12,#ARMV8_SHA256
bne .LARMv8
tst r12,#ARMV7_NEON
bne .LNEON
#endif
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
sub $Ktbl,r3,#256+32 @ K256
sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
ldr $t1,[$inp],#4
# else
ldrb $t1,[$inp,#3]
# endif
eor $t3,$B,$C @ magic
eor $t2,$t2,$t2
___
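# Rounds 0..15 are emitted fully unrolled; rounds 16..63 share one 16-round
# block (.Lrounds_16_xx) that is executed three times per input block. The
# cmp #0xf2 emitted at i==31 tests the low byte of the K256 word just
# fetched, which is 0xf2 only for the final constant 0xc67178f2, signalling
# the last pass.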
for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
ite eq @ Thumb2 thing, sanity check in ARM
#endif
ldreq $t3,[sp,#16*4] @ pull ctx
bne .Lrounds_16_xx
add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
ldr $t0,[$t3,#0]
ldr $t1,[$t3,#4]
ldr $t2,[$t3,#8]
add $A,$A,$t0
ldr $t0,[$t3,#12]
add $B,$B,$t1
ldr $t1,[$t3,#16]
add $C,$C,$t2
ldr $t2,[$t3,#20]
add $D,$D,$t0
ldr $t0,[$t3,#24]
add $E,$E,$t1
ldr $t1,[$t3,#28]
add $F,$F,$t2
ldr $inp,[sp,#17*4] @ pull inp
ldr $t2,[sp,#18*4] @ pull inp+len
add $G,$G,$t0
add $H,$H,$t1
stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
cmp $inp,$t2
sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
bne .Loop
add sp,sp,#`16+3`*4 @ destroy frame
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
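
# AUTOLOAD turns calls to otherwise-undefined subs such as &vshr_u32() or
# &vadd_i32() into literal instruction text (the underscore becomes a dot,
# a numeric final argument gets a '#' prefix); a simplified x86-style
# perlasm thunk.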
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
my $arg = pop;
$arg = "#$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
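
# Xupdate emits the NEON message-schedule update for four X[] words at a
# time: sigma0/sigma1 are built from vshr/vsli pairs (shift plus
# shift-and-insert acts as a rotate), the next K256 quadword is loaded, and
# the resulting X+K values are stored to the stack for the scalar rounds.
# The scalar round instructions supplied via $body are interleaved
# throughout.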
sub Xupdate()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
&vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T2,$T0,$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T1,$T0,$sigma0[2]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T2,$T0,32-$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T3,$T0,$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T1,$T1,$T2);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T3,$T0,32-$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T1,$T1,$T3); # sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4); # sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$T0}","[$Ktbl,:128]!");
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4); # sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 ($T0,$T0,@X[0]);
while($#insns>=2) { eval(shift(@insns)); }
&vst1_32 ("{$T0}","[$Xfer,:128]!");
eval(shift(@insns));
eval(shift(@insns));
push(@X,shift(@X)); # "rotate" X[]
}
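
# Xpreload covers the final 16 rounds of each block: no schedule update is
# needed, so it just byte-swaps the freshly loaded input words, adds the
# round constants and stores X+K, again interleaving the scalar rounds.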
sub Xpreload()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$T0}","[$Ktbl,:128]!");
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vrev32_8 (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 ($T0,$T0,@X[0]);
foreach (@insns) { eval; } # remaining instructions
&vst1_32 ("{$T0}","[$Xfer,:128]!");
push(@X,shift(@X)); # "rotate" X[]
}
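
# body_00_15 returns the scalar SHA-256 round as a list of Perl snippets so
# that Xupdate/Xpreload can dribble them out, a few at a time, between NEON
# instructions. It uses the same deferred h+=Maj(a,b,c) trick as the
# integer-only path; $j counts rounds to pick the right X[]/K loads.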
sub body_00_15 () {
(
'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
'&add ($h,$h,$t1)', # h+=X[i]+K[i]
'&eor ($t1,$f,$g)',
'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
'&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
'&and ($t1,$t1,$e)',
'&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
'&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
'&eor ($t1,$t1,$g)', # Ch(e,f,g)
'&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
'&eor ($t2,$a,$b)', # a^b, b^c in next round
'&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
'&add ($h,$h,$t1)', # h+=Ch(e,f,g)
'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
'&ldr ($t1,"[sp,#64]") if ($j==31)',
'&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
'&add ($d,$d,$h)', # d+=h
'&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
'&eor ($t3,$t3,$b)', # Maj(a,b,c)
'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
)
}
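
# NEON entry point: carve out an aligned scratch area below sp for the
# 16-word X[] window plus saved ctx/inp/len/original sp, load and byte-swap
# the first 64-byte block, pre-add the first four K256 quadwords, then
# iterate .L_00_48 (four schedule updates per pass) until the K256
# terminator, finish the block with the Xpreload rounds, and repeat until
# inp reaches inp+len.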
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function
.align 4
sha256_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
sub $H,sp,#16*4+16
adr $Ktbl,.Lsha256_block_data_order
sub $Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
bic $H,$H,#15 @ align for 128-bit stores
mov $t2,sp
mov sp,$H @ alloca
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
vld1.8 {@X[0]},[$inp]!
vld1.8 {@X[1]},[$inp]!
vld1.8 {@X[2]},[$inp]!
vld1.8 {@X[3]},[$inp]!
vld1.32 {$T0},[$Ktbl,:128]!
vld1.32 {$T1},[$Ktbl,:128]!
vld1.32 {$T2},[$Ktbl,:128]!
vld1.32 {$T3},[$Ktbl,:128]!
vrev32.8 @X[0],@X[0] @ yes, even on
str $ctx,[sp,#64]
vrev32.8 @X[1],@X[1] @ big-endian
str $inp,[sp,#68]
mov $Xfer,sp
vrev32.8 @X[2],@X[2]
str $len,[sp,#72]
vrev32.8 @X[3],@X[3]
str $t2,[sp,#76] @ save original sp
vadd.i32 $T0,$T0,@X[0]
vadd.i32 $T1,$T1,@X[1]
vst1.32 {$T0},[$Xfer,:128]!
vadd.i32 $T2,$T2,@X[2]
vst1.32 {$T1},[$Xfer,:128]!
vadd.i32 $T3,$T3,@X[3]
vst1.32 {$T2},[$Xfer,:128]!
vst1.32 {$T3},[$Xfer,:128]!
ldmia $ctx,{$A-$H}
sub $Xfer,$Xfer,#64
ldr $t1,[sp,#0]
eor $t2,$t2,$t2
eor $t3,$B,$C
b .L_00_48
.align 4
.L_00_48:
___
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
$code.=<<___;
teq $t1,#0 @ check for K256 terminator
ldr $t1,[sp,#0]
sub $Xfer,$Xfer,#64
bne .L_00_48
ldr $inp,[sp,#68]
ldr $t0,[sp,#72]
sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
teq $inp,$t0
it eq
subeq $inp,$inp,#64 @ avoid SEGV
vld1.8 {@X[0]},[$inp]! @ load next input block
vld1.8 {@X[1]},[$inp]!
vld1.8 {@X[2]},[$inp]!
vld1.8 {@X[3]},[$inp]!
it ne
strne $inp,[sp,#68]
mov $Xfer,sp
___
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
$code.=<<___;
ldr $t0,[$t1,#0]
add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
ldr $t2,[$t1,#4]
ldr $t3,[$t1,#8]
ldr $t4,[$t1,#12]
add $A,$A,$t0 @ accumulate
ldr $t0,[$t1,#16]
add $B,$B,$t2
ldr $t2,[$t1,#20]
add $C,$C,$t3
ldr $t3,[$t1,#24]
add $D,$D,$t4
ldr $t4,[$t1,#28]
add $E,$E,$t0
str $A,[$t1],#4
add $F,$F,$t2
str $B,[$t1],#4
add $G,$G,$t3
str $C,[$t1],#4
add $H,$H,$t4
str $D,[$t1],#4
stmia $t1,{$E-$H}
ittte ne
movne $Xfer,sp
ldrne $t1,[sp,#0]
eorne $t2,$t2,$t2
ldreq sp,[sp,#76] @ restore original sp
itt ne
eorne $t3,$B,$C
bne .L_00_48
ldmia sp!,{r4-r12,pc}
.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
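
# ARMv8 Crypto Extensions path: each sha256h/sha256h2 pair performs four
# rounds using one quadword of schedule, and sha256su0/sha256su1 update the
# message schedule. The mnemonics are emitted through the INST() byte macro
# (see unsha256() below) so the file still builds with assemblers that do
# not know the SHA-256 instructions.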
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
# ifdef __thumb2__
# define INST(a,b,c,d) .byte c,d|0xc,a,b
# else
# define INST(a,b,c,d) .byte a,b,c,d
# endif
.type sha256_block_data_order_armv8,%function
.align 5
sha256_block_data_order_armv8:
.LARMv8:
vld1.32 {$ABCD,$EFGH},[$ctx]
# ifdef __thumb2__
adr $Ktbl,.LARMv8
sub $Ktbl,$Ktbl,#.LARMv8-K256
# else
adrl $Ktbl,K256
# endif
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
.Loop_v8:
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
vld1.32 {$W0},[$Ktbl]!
vrev32.8 @MSG[0],@MSG[0]
vrev32.8 @MSG[1],@MSG[1]
vrev32.8 @MSG[2],@MSG[2]
vrev32.8 @MSG[3],@MSG[3]
vmov $ABCD_SAVE,$ABCD @ offload
vmov $EFGH_SAVE,$EFGH
teq $inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
vld1.32 {$W1},[$Ktbl]!
vadd.i32 $W0,$W0,@MSG[0]
sha256su0 @MSG[0],@MSG[1]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
$code.=<<___;
vld1.32 {$W1},[$Ktbl]!
vadd.i32 $W0,$W0,@MSG[0]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
vld1.32 {$W0},[$Ktbl]!
vadd.i32 $W1,$W1,@MSG[1]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
vld1.32 {$W1},[$Ktbl]
vadd.i32 $W0,$W0,@MSG[2]
sub $Ktbl,$Ktbl,#256-16 @ rewind
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
vadd.i32 $W1,$W1,@MSG[3]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
it ne
bne .Loop_v8
vst1.32 {$ABCD,$EFGH},[$ctx]
ret @ bx lr
.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
#endif
___
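
# Re-emit this script's own leading comment block (the license headers) at
# the top of the generated assembly, converting the Perl '#' comment leader
# into the assembler's '@'.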
open SELF,$0;
while(<SELF>) {
next if (/^#!/);
last if (!s/^#/@/ and !/^$/);
print;
}
close SELF;
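
# unsha256() hand-assembles the SHA-256 mnemonics into their 32-bit ARM
# encodings and wraps them in the INST() macro, since older binutils do not
# recognise the crypto-extension instructions by name.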
{ my %opcode = (
"sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
"sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
sub unsha256 {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<17)|(($2&8)<<4)
|(($3&7)<<1) |(($3&8)<<2);
# Emit the encoding byte by byte, since ARMv7 instructions are always
# encoded little-endian. The correct solution would be the .inst
# directive, but older assemblers don't implement it:-(
sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
}
}
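
# Final pass over the generated text: evaluate `...` arithmetic, translate
# the sha256* mnemonics via unsha256(), and rewrite ret/bx lr so that the
# output also assembles with -march=armv4.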
foreach (split($/,$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
s/\bret\b/bx lr/go or
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT; # enforce flush