########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <[email protected]>
#     Kirk Yap <[email protected]>
#     Tim Chen <[email protected]>
#
# This software is available to you under a choice of one of two
# licenses. You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or
# without modification, are permitted provided that the following
# conditions are met:
#
# - Redistributions of source code must retain the above
#   copyright notice, this list of conditions and the following
#   disclaimer.
#
# - Redistributions in binary form must reproduce the above
#   copyright notice, this list of conditions and the following
#   disclaimer in the documentation and/or other materials
#   provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
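# For reference only: a minimal scalar C sketch (not part of this file; the
# names ror32() and sha256_round() are illustrative) of the round function and
# message schedule that the vector code below computes, two blocks at a time:
#
#	/* uses uint32_t from <stdint.h> */
#	static uint32_t ror32(uint32_t x, int n)
#	{
#		return (x >> n) | (x << (32 - n));
#	}
#
#	/* one round; k_plus_w is K[i] + W[i], the "k + w" of the comments */
#	static void sha256_round(uint32_t s[8], uint32_t k_plus_w)
#	{
#		uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
#		uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
#		uint32_t S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
#		uint32_t ch  = ((f ^ g) & e) ^ g;
#		uint32_t t1  = h + S1 + ch + k_plus_w;
#		uint32_t S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
#		uint32_t maj = ((a | c) & b) | (a & c);
#		s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
#		s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + S0 + maj;
#	}
#
#	/* message schedule, for i >= 16:
#	 *   s0   = ror32(W[i-15], 7) ^ ror32(W[i-15], 18) ^ (W[i-15] >> 3)
#	 *   s1   = ror32(W[i-2], 17) ^ ror32(W[i-2], 19)  ^ (W[i-2] >> 10)
#	 *   W[i] = W[i-16] + s0 + W[i-7] + s1
#	 */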
#include <linux/linkage.h>
#include <linux/cfi_types.h>

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm
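# i.e. addm [mem], reg does: reg += [mem]; [mem] = reg.  It is used at the end
# of each block to fold the working variables back into the digest in memory.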
################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13
X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx # 3rd arg
INP = %rsi # 2nd arg
CTX = %rdi # 1st arg
c = %ecx
d = %r8d
e = %edx # clobbers NUM_BLKS
y3 = %esi # clobbers INP
SRND = CTX # SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d

_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE = 0
_INP_END_SIZE = 8
_INP_SIZE = 8
_CTX_SIZE = 8

_XFER = 0
_XMM_SAVE = _XFER + _XFER_SIZE
_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
_INP = _INP_END + _INP_END_SIZE
_CTX = _INP + _INP_SIZE
STACK_SIZE = _CTX + _CTX_SIZE
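# Resulting byte offsets from the 32-byte-aligned %rsp (computed from the sizes
# above): _XFER = 0 (512 bytes of saved K+W data), _XMM_SAVE = 512 (0 bytes,
# unused), _INP_END = 512, _INP = 520, _CTX = 528, STACK_SIZE = 536.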
# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm
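# These rotations are assembler-time renamings, not data moves: after one
# ROTATE_ARGS the symbol a names the register that held h, b the one that held
# a, and so on; eight rotations restore the original binding.  old_h keeps
# naming the previous round's h, so its final adds can be deferred into the
# next round (see DO_4ROUNDS below).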
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################
	mov a, y3 # y3 = a # MAJA
	rorx $25, e, y0 # y0 = e >> 25 # S1A
	rorx $11, e, y1 # y1 = e >> 11 # S1B
	addl \disp(%rsp, SRND), h # h = k + w + h # --
	or c, y3 # y3 = a|c # MAJA
	vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
	mov f, y2 # y2 = f # CH
	rorx $13, a, T1 # T1 = a >> 13 # S0B
	xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
	xor g, y2 # y2 = f^g # CH
	vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
	rorx $6, e, y1 # y1 = (e >> 6) # S1
	and e, y2 # y2 = (f^g)&e # CH
	xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx $22, a, y1 # y1 = a >> 22 # S0A
	add h, d # d = k + w + h + d # --
	and b, y3 # y3 = (a|c)&b # MAJA
	vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
	rorx $2, a, T1 # T1 = (a >> 2) # S0
	xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
	vpsrld $7, XTMP1, XTMP2
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1 # T1 = a # MAJB
	and c, T1 # T1 = a&c # MAJB
	add y0, y2 # y2 = S1 + CH # --
	vpslld $(32-7), XTMP1, XTMP3
	or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add y1, h # h = k + w + h + S0 # --
	add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
	vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
	vpsrld $18, XTMP1, XTMP2
	add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add y3, h # h = t1 + S0 + MAJ # --

	ROTATE_ARGS

################################### RND N + 1 ############################
	mov a, y3 # y3 = a # MAJA
	rorx $25, e, y0 # y0 = e >> 25 # S1A
	rorx $11, e, y1 # y1 = e >> 11 # S1B
	offset = \disp + 1*4
	addl offset(%rsp, SRND), h # h = k + w + h # --
	or c, y3 # y3 = a|c # MAJA
	vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
	mov f, y2 # y2 = f # CH
	rorx $13, a, T1 # T1 = a >> 13 # S0B
	xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
	xor g, y2 # y2 = f^g # CH
	rorx $6, e, y1 # y1 = (e >> 6) # S1
	xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx $22, a, y1 # y1 = a >> 22 # S0A
	and e, y2 # y2 = (f^g)&e # CH
	add h, d # d = k + w + h + d # --
	vpslld $(32-18), XTMP1, XTMP1
	and b, y3 # y3 = (a|c)&b # MAJA
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
	vpxor XTMP1, XTMP3, XTMP3
	rorx $2, a, T1 # T1 = (a >> 2) # S0
	xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
	vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1 # T1 = a # MAJB
	and c, T1 # T1 = a&c # MAJB
	add y0, y2 # y2 = S1 + CH # --
	vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
	vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
	or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add y1, h # h = k + w + h + S0 # --
	vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
	add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
	add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add y3, h # h = t1 + S0 + MAJ # --
	vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}

	ROTATE_ARGS

################################### RND N + 2 ############################
	mov a, y3 # y3 = a # MAJA
	rorx $25, e, y0 # y0 = e >> 25 # S1A
	offset = \disp + 2*4
	addl offset(%rsp, SRND), h # h = k + w + h # --
	vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
	rorx $11, e, y1 # y1 = e >> 11 # S1B
	or c, y3 # y3 = a|c # MAJA
	mov f, y2 # y2 = f # CH
	xor g, y2 # y2 = f^g # CH
	rorx $13, a, T1 # T1 = a >> 13 # S0B
	xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
	vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
	and e, y2 # y2 = (f^g)&e # CH
	rorx $6, e, y1 # y1 = (e >> 6) # S1
	vpxor XTMP3, XTMP2, XTMP2
	add h, d # d = k + w + h + d # --
	and b, y3 # y3 = (a|c)&b # MAJA
	xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx $22, a, y1 # y1 = a >> 22 # S0A
	vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
	xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
	vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
	rorx $2, a, T1 # T1 = (a >> 2) # S0
	vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1 # T1 = a # MAJB
	and c, T1 # T1 = a&c # MAJB
	add y0, y2 # y2 = S1 + CH # --
	vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add y1, h # h = k + w + h + S0 # --
	add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
	add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add y3, h # h = t1 + S0 + MAJ # --

	ROTATE_ARGS

################################### RND N + 3 ############################
	mov a, y3 # y3 = a # MAJA
	rorx $25, e, y0 # y0 = e >> 25 # S1A
	rorx $11, e, y1 # y1 = e >> 11 # S1B
	offset = \disp + 3*4
	addl offset(%rsp, SRND), h # h = k + w + h # --
	or c, y3 # y3 = a|c # MAJA
	vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
	mov f, y2 # y2 = f # CH
	rorx $13, a, T1 # T1 = a >> 13 # S0B
	xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
	xor g, y2 # y2 = f^g # CH
	vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
	rorx $6, e, y1 # y1 = (e >> 6) # S1
	and e, y2 # y2 = (f^g)&e # CH
	add h, d # d = k + w + h + d # --
	and b, y3 # y3 = (a|c)&b # MAJA
	vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
	xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
	vpxor XTMP3, XTMP2, XTMP2
	rorx $22, a, y1 # y1 = a >> 22 # S0A
	add y0, y2 # y2 = S1 + CH # --
	vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
	add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
	rorx $2, a, T1 # T1 = (a >> 2) # S0
	vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
	vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1 # T1 = a # MAJB
	and c, T1 # T1 = a&c # MAJB
	or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add y1, h # h = k + w + h + S0 # --
	add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add y3, h # h = t1 + S0 + MAJ # --

	ROTATE_ARGS
	rotate_Xs
.endm
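# Each FOUR_ROUNDS_AND_SCHED invocation runs four rounds of the first block
# (from the low-lane K+W dwords saved at \disp) while the interleaved vector
# instructions compute the next four schedule words for both blocks into X0;
# rotate_Xs then renames X0..X3 so that X3 always holds the newest words.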
.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################
	mov f, y2 # y2 = f # CH
	rorx $25, e, y0 # y0 = e >> 25 # S1A
	rorx $11, e, y1 # y1 = e >> 11 # S1B
	xor g, y2 # y2 = f^g # CH
	xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
	rorx $6, e, y1 # y1 = (e >> 6) # S1
	and e, y2 # y2 = (f^g)&e # CH
	xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx $13, a, T1 # T1 = a >> 13 # S0B
	xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
	rorx $22, a, y1 # y1 = a >> 22 # S0A
	mov a, y3 # y3 = a # MAJA
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
	rorx $2, a, T1 # T1 = (a >> 2) # S0
	addl \disp(%rsp, SRND), h # h = k + w + h # --
	or c, y3 # y3 = a|c # MAJA
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1 # T1 = a # MAJB
	and b, y3 # y3 = (a|c)&b # MAJA
	and c, T1 # T1 = a&c # MAJB
	add y0, y2 # y2 = S1 + CH # --
	add h, d # d = k + w + h + d # --
	or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add y1, h # h = k + w + h + S0 # --
	add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --

	ROTATE_ARGS

################################### RND N + 1 ###########################
	add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	mov f, y2 # y2 = f # CH
	rorx $25, e, y0 # y0 = e >> 25 # S1A
	rorx $11, e, y1 # y1 = e >> 11 # S1B
	xor g, y2 # y2 = f^g # CH
	xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
	rorx $6, e, y1 # y1 = (e >> 6) # S1
	and e, y2 # y2 = (f^g)&e # CH
	add y3, old_h # h = t1 + S0 + MAJ # --
	xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx $13, a, T1 # T1 = a >> 13 # S0B
	xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
	rorx $22, a, y1 # y1 = a >> 22 # S0A
	mov a, y3 # y3 = a # MAJA
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
	rorx $2, a, T1 # T1 = (a >> 2) # S0
	offset = 4*1 + \disp
	addl offset(%rsp, SRND), h # h = k + w + h # --
	or c, y3 # y3 = a|c # MAJA
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1 # T1 = a # MAJB
	and b, y3 # y3 = (a|c)&b # MAJA
	and c, T1 # T1 = a&c # MAJB
	add y0, y2 # y2 = S1 + CH # --
	add h, d # d = k + w + h + d # --
	or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add y1, h # h = k + w + h + S0 # --
	add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --

	ROTATE_ARGS

################################### RND N + 2 ###########################
	add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	mov f, y2 # y2 = f # CH
	rorx $25, e, y0 # y0 = e >> 25 # S1A
	rorx $11, e, y1 # y1 = e >> 11 # S1B
	xor g, y2 # y2 = f^g # CH
	xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
	rorx $6, e, y1 # y1 = (e >> 6) # S1
	and e, y2 # y2 = (f^g)&e # CH
	add y3, old_h # h = t1 + S0 + MAJ # --
	xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx $13, a, T1 # T1 = a >> 13 # S0B
	xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
	rorx $22, a, y1 # y1 = a >> 22 # S0A
	mov a, y3 # y3 = a # MAJA
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
	rorx $2, a, T1 # T1 = (a >> 2) # S0
	offset = 4*2 + \disp
	addl offset(%rsp, SRND), h # h = k + w + h # --
	or c, y3 # y3 = a|c # MAJA
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1 # T1 = a # MAJB
	and b, y3 # y3 = (a|c)&b # MAJA
	and c, T1 # T1 = a&c # MAJB
	add y0, y2 # y2 = S1 + CH # --
	add h, d # d = k + w + h + d # --
	or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add y1, h # h = k + w + h + S0 # --
	add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --

	ROTATE_ARGS

################################### RND N + 3 ###########################
	add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	mov f, y2 # y2 = f # CH
	rorx $25, e, y0 # y0 = e >> 25 # S1A
	rorx $11, e, y1 # y1 = e >> 11 # S1B
	xor g, y2 # y2 = f^g # CH
	xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
	rorx $6, e, y1 # y1 = (e >> 6) # S1
	and e, y2 # y2 = (f^g)&e # CH
	add y3, old_h # h = t1 + S0 + MAJ # --
	xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx $13, a, T1 # T1 = a >> 13 # S0B
	xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
	rorx $22, a, y1 # y1 = a >> 22 # S0A
	mov a, y3 # y3 = a # MAJA
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
	rorx $2, a, T1 # T1 = (a >> 2) # S0
	offset = 4*3 + \disp
	addl offset(%rsp, SRND), h # h = k + w + h # --
	or c, y3 # y3 = a|c # MAJA
	xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1 # T1 = a # MAJB
	and b, y3 # y3 = (a|c)&b # MAJA
	and c, T1 # T1 = a&c # MAJB
	add y0, y2 # y2 = S1 + CH # --
	add h, d # d = k + w + h + d # --
	or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add y1, h # h = k + w + h + S0 # --
	add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
	add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add y3, h # h = t1 + S0 + MAJ # --

	ROTATE_ARGS
.endm
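# DO_4ROUNDS is the same round logic without message scheduling.  It is used
# for the last 16 rounds of the first block and, via the "+ 16" displacements
# in loop3 below, to replay all 64 rounds of the second block from the
# high-lane K+W words already saved on the stack.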
########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
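# A hypothetical C-side view of this entry point (illustrative only, not the
# kernel glue code); "blocks" counts whole 64-byte SHA-256 blocks, and message
# padding/finalization is the caller's job:
#
#	void sha256_transform_rorx(struct sha256_state *state,
#				   const u8 *data, int blocks);
#	/* e.g. hash a buffer that is a multiple of 64 bytes long: */
#	sha256_transform_rorx(state, data, nbytes / 64);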
.text
SYM_TYPED_FUNC_START(sha256_transform_rorx)
.align 32
	pushq %rbx
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15

	push %rbp
	mov %rsp, %rbp

	subq $STACK_SIZE, %rsp
	and $-32, %rsp # align rsp to 32 byte boundary

	shl $6, NUM_BLKS # convert to bytes
	jz done_hash
	lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
	mov NUM_BLKS, _INP_END(%rsp)

	cmp NUM_BLKS, INP
	je only_one_block

	## load initial digest
	mov (CTX), a
	mov 4*1(CTX), b
	mov 4*2(CTX), c
	mov 4*3(CTX), d
	mov 4*4(CTX), e
	mov 4*5(CTX), f
	mov 4*6(CTX), g
	mov 4*7(CTX), h

	vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa _SHUF_DC00(%rip), SHUF_DC00

	mov CTX, _CTX(%rsp)

loop0:
	## Load first 16 dwords from two blocks
	VMOVDQ 0*32(INP),XTMP0
	VMOVDQ 1*32(INP),XTMP1
	VMOVDQ 2*32(INP),XTMP2
	VMOVDQ 3*32(INP),XTMP3

	## byte swap data
	vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128 $0x20, XTMP2, XTMP0, X0
	vperm2i128 $0x31, XTMP2, XTMP0, X1
	vperm2i128 $0x20, XTMP3, XTMP1, X2
	vperm2i128 $0x31, XTMP3, XTMP1, X3

last_block_enter:
	add $64, INP
	mov INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 rounds of 12 each
	xor SRND, SRND

.align 16
loop1:
	vpaddd K256+0*32(SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED _XFER + 0*32

	vpaddd K256+1*32(SRND), X0, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED _XFER + 1*32

	vpaddd K256+2*32(SRND), X0, XFER
	vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED _XFER + 2*32

	vpaddd K256+3*32(SRND), X0, XFER
	vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED _XFER + 3*32

	add $4*32, SRND
	cmp $3*4*32, SRND
	jb loop1

loop2:
	## Do last 16 rounds with no scheduling
	vpaddd K256+0*32(SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS _XFER + 0*32
	vpaddd K256+1*32(SRND), X1, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS _XFER + 1*32
	add $2*32, SRND

	vmovdqa X2, X0
	vmovdqa X3, X1

	cmp $4*4*32, SRND
	jb loop2

	mov _CTX(%rsp), CTX
	mov _INP(%rsp), INP

	addm (4*0)(CTX),a
	addm (4*1)(CTX),b
	addm (4*2)(CTX),c
	addm (4*3)(CTX),d
	addm (4*4)(CTX),e
	addm (4*5)(CTX),f
	addm (4*6)(CTX),g
	addm (4*7)(CTX),h

	cmp _INP_END(%rsp), INP
	ja done_hash

	#### Do second block using previously scheduled results
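	#### (the "+ 16" displacements in loop3 select the upper 128-bit lane
	#### of each 32-byte XFER save slot, i.e. the K+W words that were
	#### computed for the second block)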
	xor SRND, SRND
.align 16
loop3:
	DO_4ROUNDS _XFER + 0*32 + 16
	DO_4ROUNDS _XFER + 1*32 + 16
	add $2*32, SRND
	cmp $4*4*32, SRND
	jb loop3

	mov _CTX(%rsp), CTX
	mov _INP(%rsp), INP
	add $64, INP

	addm (4*0)(CTX),a
	addm (4*1)(CTX),b
	addm (4*2)(CTX),c
	addm (4*3)(CTX),d
	addm (4*4)(CTX),e
	addm (4*5)(CTX),f
	addm (4*6)(CTX),g
	addm (4*7)(CTX),h

	cmp _INP_END(%rsp), INP
	jb loop0
	ja done_hash

do_last_block:
	VMOVDQ 0*16(INP),XWORD0
	VMOVDQ 1*16(INP),XWORD1
	VMOVDQ 2*16(INP),XWORD2
	VMOVDQ 3*16(INP),XWORD3

	vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp last_block_enter

only_one_block:
	## load initial digest
	mov (4*0)(CTX),a
	mov (4*1)(CTX),b
	mov (4*2)(CTX),c
	mov (4*3)(CTX),d
	mov (4*4)(CTX),e
	mov (4*5)(CTX),f
	mov (4*6)(CTX),g
	mov (4*7)(CTX),h

	vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa _SHUF_DC00(%rip), SHUF_DC00

	mov CTX, _CTX(%rsp)
	jmp do_last_block

done_hash:
	mov %rbp, %rsp
	pop %rbp

	popq %r15
	popq %r14
	popq %r13
	popq %r12
	popq %rbx
	RET
SYM_FUNC_END(sha256_transform_rorx)
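# Each row of four round constants below is stored twice so that one 32-byte
# load yields the same constants in both 128-bit lanes: the low lane feeds the
# first block, the high lane the second.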
.section .rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
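# vpshufb mask that byte-swaps each 32-bit dword, converting the big-endian
# message words of the input to the CPU's little-endian order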
.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF