/* SPDX-License-Identifier: GPL-2.0-or-later */
/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
 *
 * Copyright © 2013-2014 Jussi Kivilinna <[email protected]>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

.syntax unified
.fpu neon

.text

/* Context structure: byte offsets of the five 32-bit SHA-1 chaining
 * variables (h0..h4) inside the state block passed in as 'ctx' (r0). */
#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16

/* The four SHA-1 round constants (FIPS 180: K for rounds 0-19, 20-39,
 * 40-59, 60-79 respectively). */
#define K1  0x5A827999
#define K2  0x6ED9EBA1
#define K3  0x8F1BBCDC
#define K4  0xCA62C1D6

/* Each constant replicated into all four lanes of a 128-bit vector so a
 * single vadd.u32 applies K to four message words at once. */
.align 4
.LK_VEC:
.LK1:	.long K1, K1, K1, K1
.LK2:	.long K2, K2, K2, K2
.LK3:	.long K3, K3, K3, K3
.LK4:	.long K4, K4, K4, K4

/* Register macros */
#define RSTATE r0	/* pointer to the SHA-1 state (chaining variables) */
#define RDATA r1	/* pointer to input data; advanced as blocks are read */
#define RNBLKS r2	/* number of 64-byte blocks remaining */
#define ROLDSTACK r3	/* saved sp from before stack alignment */
#define RWK lr		/* pointer into the on-stack W[]+K work area */

/* The five SHA-1 working variables a..e (roles rotate each round). */
#define _a r4
#define _b r5
#define _c r6
#define _d r7
#define _e r8

/* Scratch registers for the scalar round computation. */
#define RT0 r9
#define RT1 r10
#define RT2 r11
#define RT3 r12

/* NEON registers holding the rolling window of message-schedule words.
 * The W0..W7 -> q-register mapping is deliberately non-sequential
 * (register allocation chosen by the original author). */
#define W0 q0
#define W1 q7
#define W2 q2
#define W3 q3
#define W4 q4
#define W5 q6
#define W6 q5
#define W7 q1

/* NEON temporaries for schedule expansion. */
#define tmp0 q8
#define tmp1 q9
#define tmp2 q10
#define tmp3 q11

/* Round constants kept resident in the top four q registers. */
#define qK1 q12
#define qK2 q13
#define qK3 q14
#define qK4 q15

/* On big-endian kernels the input bytes are already in the order SHA-1
 * wants, so the little-endian byte-reversal instructions are elided. */
#ifdef CONFIG_CPU_BIG_ENDIAN
#define ARM_LE(code...)
#else
#define ARM_LE(code...)		code
#endif
/* Round function macros.
 *
 * Each _R_Fn macro performs one scalar SHA-1 round:
 *   e += rol(a,5) + Fn(b,c,d) + W[i] + K;  b = rol(b,30);
 * where W[i]+K is loaded from the 16-entry on-stack ring buffer (RWK
 * area) that the NEON precalc code fills ahead of time.  The pre1/pre2/
 * pre3 arguments are NEON schedule-precalc fragments interleaved between
 * the scalar instructions to hide their latency.
 */

/* Offset of W[i]+K within the 16-word on-stack ring buffer. */
#define WK_offs(i) (((i) & 15) * 4)

/* F1 (rounds 0-19): Ch(b,c,d) = (b & c) | (~b & d),
 * computed here as (d & ~b) + (c & b) — valid because the two terms
 * are bitwise disjoint. */
#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	bic RT0, d, b; \
	add e, e, a, ror #(32 - 5); \
	and RT1, c, b; \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add RT0, RT0, RT3; \
	add e, e, RT1; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT0;

/* F2 (rounds 20-39): parity, b ^ c ^ d. */
#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	eor RT0, d, b; \
	add e, e, a, ror #(32 - 5); \
	eor RT0, RT0, c; \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT3; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT0; \

/* F3 (rounds 40-59): Maj(b,c,d) = (b&c) | (b&d) | (c&d),
 * computed as (b & c) + ((b ^ c) & d) — the terms are disjoint. */
#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	eor RT0, b, c; \
	and RT1, b, c; \
	add e, e, a, ror #(32 - 5); \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	and RT0, RT0, d; \
	add RT1, RT1, RT3; \
	add e, e, RT0; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT1;

/* F4 (rounds 60-79) is the same parity function as F2. */
#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	_R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

/* Dispatch on the round-function selector f (F1..F4). */
#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
	   W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	_R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

/* Plain round with no interleaved precalc work (the trailing i16/W...
 * tokens are passed through unexpanded; the dummy pre-macros discard
 * everything). */
#define R(a,b,c,d,e,f,i) \
	_R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

/* No-op macro used to fill unused precalc slots. */
#define dummy(...)
/* Input expansion macros.
 *
 * The message schedule W[t] is computed four words at a time with NEON
 * and stored (already with K added) into the 16-entry ring buffer on the
 * stack, from which the scalar rounds read.  Each multi-step expansion
 * is split into numbered fragments (…_0, …_1, …) so fragments can be
 * interleaved with scalar round code via the pre1/pre2/pre3 hooks.
 */

/********* Precalc macros for rounds 0-15 *************************************/

/* Load the first 64-byte block, byte-swap on little-endian, add curK,
 * and store W[0..15]+K into the work area in one straight-line sequence
 * (used once, before entering the main loop). */
#define W_PRECALC_00_15() \
	add       RWK, sp, #(WK_offs(0)); \
	\
	vld1.32   {W0, W7}, [RDATA]!; \
 ARM_LE(vrev32.8  W0, W0;	)	/* big => little */ \
	vld1.32   {W6, W5}, [RDATA]!; \
	vadd.u32  tmp0, W0, curK; \
 ARM_LE(vrev32.8  W7, W7;	)	/* big => little */ \
 ARM_LE(vrev32.8  W6, W6;	)	/* big => little */ \
	vadd.u32  tmp1, W7, curK; \
 ARM_LE(vrev32.8  W5, W5;	)	/* big => little */ \
	vadd.u32  tmp2, W6, curK; \
	vst1.32   {tmp0, tmp1}, [RWK]!; \
	vadd.u32  tmp3, W5, curK; \
	vst1.32   {tmp2, tmp3}, [RWK]; \

/* The same sequence broken into fragments, interleaved with the final
 * rounds of the previous block when another block follows. */
#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vld1.32   {W0, W7}, [RDATA]!; \

#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add       RWK, sp, #(WK_offs(0)); \

#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W0, W0;	)	/* big => little */ \

#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vld1.32   {W6, W5}, [RDATA]!; \

#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp0, W0, curK; \

#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W7, W7;	)	/* big => little */ \

#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W6, W6;	)	/* big => little */ \

#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp1, W7, curK; \

#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W5, W5;	)	/* big => little */ \

#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp2, W6, curK; \

#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp0, tmp1}, [RWK]!; \

#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp3, W5, curK; \

#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp2, tmp3}, [RWK]; \

/********* Precalc macros for rounds 16-31 ************************************
 * W[t] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1), four lanes at a
 * time.  Because W[t-3] overlaps the vector being produced, the fourth
 * lane is handled with the extra rol-by-2 correction in tmp1 (fragments
 * _3.._9). */
#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp0, tmp0; \
	vext.8    W, W_m16, W_m12, #8; \

#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add       RWK, sp, #(WK_offs(i)); \
	vext.8    tmp0, W_m04, tmp0, #4; \

#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp0, tmp0, W_m16; \
	veor.32   W, W, W_m08; \

#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp1, tmp1; \
	veor      W, W, tmp0; \

#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32  tmp0, W, #1; \

#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vext.8    tmp1, tmp1, W, #(16-12); \
	vshr.u32  W, W, #31; \

#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vorr      tmp0, tmp0, W; \
	vshr.u32  W, tmp1, #30; \

#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32  tmp1, tmp1, #2; \

#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp0, tmp0, W; \

#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      W, tmp0, tmp1; \

#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp0, W, curK; \

#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp0}, [RWK];

/********* Precalc macros for rounds 32-79 ************************************
 * Uses the equivalent recurrence
 *   W[t] = rol(W[t-6] ^ W[t-16] ^ W[t-28] ^ W[t-32], 2)
 * which has no intra-vector dependency, so four lanes expand cleanly. */
#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      W, W_m28; \

#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vext.8    tmp0, W_m08, W_m04, #8; \

#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      W, W_m16; \

#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      W, tmp0; \

#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add       RWK, sp, #(WK_offs(i&~3)); \

#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32  tmp1, W, #2; \

#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshr.u32  tmp0, W, #30; \

#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vorr      W, tmp0, tmp1; \

#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp0, W, curK; \

#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp0}, [RWK];
/*
 * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
 *
 * unsigned int
 * sha1_transform_neon (void *ctx, const unsigned char *data,
 *                      unsigned int nblks)
 *
 * The message schedule for the NEXT block is precomputed with NEON while
 * the scalar unit runs the 80 rounds of the CURRENT block; the `curK`
 * macro is re-#defined along the way so each group of rounds adds the
 * correct constant.
 */
.align 3
ENTRY(sha1_transform_neon)
  /* input:
   *	r0: ctx, CTX
   *	r1: data (64*nblks bytes)
   *	r2: nblks
   */

  cmp RNBLKS, #0;
  beq .Ldo_nothing;

  push {r4-r12, lr};
  /*vpush {q4-q7};*/	/* NOTE(review): q4-q7 are callee-saved per AAPCS but
			 * are deliberately not saved here — presumably the
			 * kernel's NEON begin/end wrappers preserve the
			 * vector state for us; confirm against callers. */

  adr RT3, .LK_VEC;

  mov ROLDSTACK, sp;	/* remember unaligned sp for the epilogue */

  /* Align stack to 16 bytes and reserve the 16-word W[]+K work area. */
  sub RT0, sp, #(16*4);
  and RT0, #(~(16-1));
  mov sp, RT0;

  vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */

  /* Get the values of the chaining variables. */
  ldm RSTATE, {_a-_e};

  vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */

#undef curK
#define curK qK1
  /* Precalc 0-15. */
  W_PRECALC_00_15();

.Loop:
  /* Transform 0-15 + Precalc 16-31. */
  _R( _a, _b, _c, _d, _e, F1,  0,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1,  1,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _d, _e, _a, _b, _c, F1,  2,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _c, _d, _e, _a, _b, F1,  3,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
      W4, W5, W6, W7, W0, _, _, _ );

  /* Rounds 4..23 use K2's precalc constant for W[20..27]. */
#undef curK
#define curK qK2
  _R( _b, _c, _d, _e, _a, F1,  4,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1,  5,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1,  6,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _d, _e, _a, _b, _c, F1,  7,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
      W3, W4, W5, W6, W7, _, _, _ );

  _R( _c, _d, _e, _a, _b, F1,  8,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _b, _c, _d, _e, _a, F1,  9,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1, 10,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1, 11,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
      W2, W3, W4, W5, W6, _, _, _ );

  _R( _d, _e, _a, _b, _c, F1, 12,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _c, _d, _e, _a, _b, F1, 13,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _b, _c, _d, _e, _a, F1, 14,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1, 15,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
      W1, W2, W3, W4, W5, _, _, _ );

  /* Transform 16-63 + Precalc 32-79. */
  _R( _e, _a, _b, _c, _d, F1, 16,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _d, _e, _a, _b, _c, F1, 17,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _c, _d, _e, _a, _b, F1, 18,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _b, _c, _d, _e, _a, F1, 19,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);

  _R( _a, _b, _c, _d, _e, F2, 20,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _e, _a, _b, _c, _d, F2, 21,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _d, _e, _a, _b, _c, F2, 22,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _c, _d, _e, _a, _b, F2, 23,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);

  /* Rounds 24..43 precalc W[40..55] with K3. */
#undef curK
#define curK qK3
  _R( _b, _c, _d, _e, _a, F2, 24,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _a, _b, _c, _d, _e, F2, 25,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _e, _a, _b, _c, _d, F2, 26,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _d, _e, _a, _b, _c, F2, 27,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);

  _R( _c, _d, _e, _a, _b, F2, 28,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _b, _c, _d, _e, _a, F2, 29,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _a, _b, _c, _d, _e, F2, 30,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _e, _a, _b, _c, _d, F2, 31,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);

  _R( _d, _e, _a, _b, _c, F2, 32,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _c, _d, _e, _a, _b, F2, 33,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _b, _c, _d, _e, _a, F2, 34,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _a, _b, _c, _d, _e, F2, 35,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);

  _R( _e, _a, _b, _c, _d, F2, 36,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _d, _e, _a, _b, _c, F2, 37,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _c, _d, _e, _a, _b, F2, 38,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _b, _c, _d, _e, _a, F2, 39,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);

  _R( _a, _b, _c, _d, _e, F3, 40,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _e, _a, _b, _c, _d, F3, 41,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _d, _e, _a, _b, _c, F3, 42,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _c, _d, _e, _a, _b, F3, 43,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);

  /* Rounds 44..63 precalc W[60..79] with K4. */
#undef curK
#define curK qK4
  _R( _b, _c, _d, _e, _a, F3, 44,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _a, _b, _c, _d, _e, F3, 45,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _e, _a, _b, _c, _d, F3, 46,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _d, _e, _a, _b, _c, F3, 47,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);

  _R( _c, _d, _e, _a, _b, F3, 48,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _b, _c, _d, _e, _a, F3, 49,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _a, _b, _c, _d, _e, F3, 50,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _e, _a, _b, _c, _d, F3, 51,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);

  _R( _d, _e, _a, _b, _c, F3, 52,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _c, _d, _e, _a, _b, F3, 53,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _b, _c, _d, _e, _a, F3, 54,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _a, _b, _c, _d, _e, F3, 55,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);

  _R( _e, _a, _b, _c, _d, F3, 56,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _d, _e, _a, _b, _c, F3, 57,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _c, _d, _e, _a, _b, F3, 58,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _b, _c, _d, _e, _a, F3, 59,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);

  /* Decrement block count here so the flags are ready for the
   * beq below; the _R macros between do not touch the flags. */
  subs RNBLKS, #1;

  _R( _a, _b, _c, _d, _e, F4, 60,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _e, _a, _b, _c, _d, F4, 61,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _d, _e, _a, _b, _c, F4, 62,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _c, _d, _e, _a, _b, F4, 63,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);

  beq .Lend;

  /* Transform 64-79 + Precalc 0-15 of next block. */
#undef curK
#define curK qK1
  _R( _b, _c, _d, _e, _a, F4, 64,
      WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 65,
      WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _e, _a, _b, _c, _d, F4, 66,
      WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _d, _e, _a, _b, _c, F4, 67,
      WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _c, _d, _e, _a, _b, F4, 68,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 69,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 70,
      WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _e, _a, _b, _c, _d, F4, 71,
      WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _d, _e, _a, _b, _c, F4, 72,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _c, _d, _e, _a, _b, F4, 73,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 74,
      WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 75,
      WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _e, _a, _b, _c, _d, F4, 76,
      WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _d, _e, _a, _b, _c, F4, 77,
      WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _c, _d, _e, _a, _b, F4, 78,
      WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 79,
      WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );

  /* Update the chaining variables (h0..h3 via RT0-RT3, h4 reloaded
   * into RT0 because only four scratch registers are available). */
  ldm RSTATE, {RT0-RT3};
  add _a, RT0;
  ldr RT0, [RSTATE, #state_h4];
  add _b, RT1;
  add _c, RT2;
  add _d, RT3;
  add _e, RT0;
  stm RSTATE, {_a-_e};

  b .Loop;

.Lend:
  /* Transform 64-79 (last block: no next-block precalc to interleave). */
  R( _b, _c, _d, _e, _a, F4, 64 );
  R( _a, _b, _c, _d, _e, F4, 65 );
  R( _e, _a, _b, _c, _d, F4, 66 );
  R( _d, _e, _a, _b, _c, F4, 67 );
  R( _c, _d, _e, _a, _b, F4, 68 );
  R( _b, _c, _d, _e, _a, F4, 69 );
  R( _a, _b, _c, _d, _e, F4, 70 );
  R( _e, _a, _b, _c, _d, F4, 71 );
  R( _d, _e, _a, _b, _c, F4, 72 );
  R( _c, _d, _e, _a, _b, F4, 73 );
  R( _b, _c, _d, _e, _a, F4, 74 );
  R( _a, _b, _c, _d, _e, F4, 75 );
  R( _e, _a, _b, _c, _d, F4, 76 );
  R( _d, _e, _a, _b, _c, F4, 77 );
  R( _c, _d, _e, _a, _b, F4, 78 );
  R( _b, _c, _d, _e, _a, F4, 79 );

  /* Restore the caller's (unaligned) stack pointer. */
  mov sp, ROLDSTACK;

  /* Update the chaining variables. */
  ldm RSTATE, {RT0-RT3};
  add _a, RT0;
  ldr RT0, [RSTATE, #state_h4];
  add _b, RT1;
  add _c, RT2;
  add _d, RT3;
  /*vpop {q4-q7};*/
  add _e, RT0;
  stm RSTATE, {_a-_e};

  pop {r4-r12, pc};

.Ldo_nothing:
  bx lr
ENDPROC(sha1_transform_neon)