########################################################################
# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <[email protected]>
#     Kirk Yap <[email protected]>
#     Tim Chen <[email protected]>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
  48. #include <linux/linkage.h>
  49. #include <linux/cfi_types.h>
  50. ## assume buffers not aligned
  51. #define MOVDQ movdqu
  52. ################################ Define Macros
  53. # addm [mem], reg
  54. # Add reg to mem using reg-mem add and store
  55. .macro addm p1 p2
  56. add \p1, \p2
  57. mov \p2, \p1
  58. .endm
  59. ################################
  60. # COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
  61. # Load xmm with mem and byte swap each dword
  62. .macro COPY_XMM_AND_BSWAP p1 p2 p3
  63. MOVDQ \p2, \p1
  64. pshufb \p3, \p1
  65. .endm
  66. ################################
  67. X0 = %xmm4
  68. X1 = %xmm5
  69. X2 = %xmm6
  70. X3 = %xmm7
  71. XTMP0 = %xmm0
  72. XTMP1 = %xmm1
  73. XTMP2 = %xmm2
  74. XTMP3 = %xmm3
  75. XTMP4 = %xmm8
  76. XFER = %xmm9
  77. SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
  78. SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00
  79. BYTE_FLIP_MASK = %xmm12
  80. NUM_BLKS = %rdx # 3rd arg
  81. INP = %rsi # 2nd arg
  82. CTX = %rdi # 1st arg
  83. SRND = %rsi # clobbers INP
  84. c = %ecx
  85. d = %r8d
  86. e = %edx
  87. TBL = %r12
  88. a = %eax
  89. b = %ebx
  90. f = %r9d
  91. g = %r10d
  92. h = %r11d
  93. y0 = %r13d
  94. y1 = %r14d
  95. y2 = %r15d
  96. _INP_END_SIZE = 8
  97. _INP_SIZE = 8
  98. _XFER_SIZE = 16
  99. _XMM_SAVE_SIZE = 0
  100. _INP_END = 0
  101. _INP = _INP_END + _INP_END_SIZE
  102. _XFER = _INP + _INP_SIZE
  103. _XMM_SAVE = _XFER + _XFER_SIZE
  104. STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
  105. # rotate_Xs
  106. # Rotate values of symbols X0...X3
  107. .macro rotate_Xs
  108. X_ = X0
  109. X0 = X1
  110. X1 = X2
  111. X2 = X3
  112. X3 = X_
  113. .endm
  114. # ROTATE_ARGS
  115. # Rotate values of symbols a...h
  116. .macro ROTATE_ARGS
  117. TMP_ = h
  118. h = g
  119. g = f
  120. f = e
  121. e = d
  122. d = c
  123. c = b
  124. b = a
  125. a = TMP_
  126. .endm
  127. .macro FOUR_ROUNDS_AND_SCHED
  128. ## compute s0 four at a time and s1 two at a time
  129. ## compute W[-16] + W[-7] 4 at a time
  130. movdqa X3, XTMP0
  131. mov e, y0 # y0 = e
  132. ror $(25-11), y0 # y0 = e >> (25-11)
  133. mov a, y1 # y1 = a
  134. palignr $4, X2, XTMP0 # XTMP0 = W[-7]
  135. ror $(22-13), y1 # y1 = a >> (22-13)
  136. xor e, y0 # y0 = e ^ (e >> (25-11))
  137. mov f, y2 # y2 = f
  138. ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
  139. movdqa X1, XTMP1
  140. xor a, y1 # y1 = a ^ (a >> (22-13)
  141. xor g, y2 # y2 = f^g
  142. paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16]
  143. xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
  144. and e, y2 # y2 = (f^g)&e
  145. ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
  146. ## compute s0
  147. palignr $4, X0, XTMP1 # XTMP1 = W[-15]
  148. xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
  149. ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
  150. xor g, y2 # y2 = CH = ((f^g)&e)^g
  151. movdqa XTMP1, XTMP2 # XTMP2 = W[-15]
  152. ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
  153. add y0, y2 # y2 = S1 + CH
  154. add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH
  155. movdqa XTMP1, XTMP3 # XTMP3 = W[-15]
  156. mov a, y0 # y0 = a
  157. add y2, h # h = h + S1 + CH + k + w
  158. mov a, y2 # y2 = a
  159. pslld $(32-7), XTMP1 #
  160. or c, y0 # y0 = a|c
  161. add h, d # d = d + h + S1 + CH + k + w
  162. and c, y2 # y2 = a&c
  163. psrld $7, XTMP2 #
  164. and b, y0 # y0 = (a|c)&b
  165. add y1, h # h = h + S1 + CH + k + w + S0
  166. por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7
  167. or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
  168. add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
  169. #
  170. ROTATE_ARGS #
  171. movdqa XTMP3, XTMP2 # XTMP2 = W[-15]
  172. mov e, y0 # y0 = e
  173. mov a, y1 # y1 = a
  174. movdqa XTMP3, XTMP4 # XTMP4 = W[-15]
  175. ror $(25-11), y0 # y0 = e >> (25-11)
  176. xor e, y0 # y0 = e ^ (e >> (25-11))
  177. mov f, y2 # y2 = f
  178. ror $(22-13), y1 # y1 = a >> (22-13)
  179. pslld $(32-18), XTMP3 #
  180. xor a, y1 # y1 = a ^ (a >> (22-13)
  181. ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
  182. xor g, y2 # y2 = f^g
  183. psrld $18, XTMP2 #
  184. ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
  185. xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
  186. and e, y2 # y2 = (f^g)&e
  187. ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
  188. pxor XTMP3, XTMP1
  189. xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
  190. xor g, y2 # y2 = CH = ((f^g)&e)^g
  191. psrld $3, XTMP4 # XTMP4 = W[-15] >> 3
  192. add y0, y2 # y2 = S1 + CH
  193. add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
  194. ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
  195. pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
  196. mov a, y0 # y0 = a
  197. add y2, h # h = h + S1 + CH + k + w
  198. mov a, y2 # y2 = a
  199. pxor XTMP4, XTMP1 # XTMP1 = s0
  200. or c, y0 # y0 = a|c
  201. add h, d # d = d + h + S1 + CH + k + w
  202. and c, y2 # y2 = a&c
  203. ## compute low s1
  204. pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
  205. and b, y0 # y0 = (a|c)&b
  206. add y1, h # h = h + S1 + CH + k + w + S0
  207. paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
  208. or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
  209. add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
  210. ROTATE_ARGS
  211. movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA}
  212. mov e, y0 # y0 = e
  213. mov a, y1 # y1 = a
  214. ror $(25-11), y0 # y0 = e >> (25-11)
  215. movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA}
  216. xor e, y0 # y0 = e ^ (e >> (25-11))
  217. ror $(22-13), y1 # y1 = a >> (22-13)
  218. mov f, y2 # y2 = f
  219. xor a, y1 # y1 = a ^ (a >> (22-13)
  220. ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
  221. psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
  222. xor g, y2 # y2 = f^g
  223. psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
  224. xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
  225. and e, y2 # y2 = (f^g)&e
  226. psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
  227. ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
  228. xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
  229. xor g, y2 # y2 = CH = ((f^g)&e)^g
  230. ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
  231. pxor XTMP3, XTMP2
  232. add y0, y2 # y2 = S1 + CH
  233. ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
  234. add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
  235. pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA}
  236. mov a, y0 # y0 = a
  237. add y2, h # h = h + S1 + CH + k + w
  238. mov a, y2 # y2 = a
  239. pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA}
  240. or c, y0 # y0 = a|c
  241. add h, d # d = d + h + S1 + CH + k + w
  242. and c, y2 # y2 = a&c
  243. paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
  244. and b, y0 # y0 = (a|c)&b
  245. add y1, h # h = h + S1 + CH + k + w + S0
  246. ## compute high s1
  247. pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {BBAA}
  248. or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
  249. add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
  250. #
  251. ROTATE_ARGS #
  252. movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC}
  253. mov e, y0 # y0 = e
  254. ror $(25-11), y0 # y0 = e >> (25-11)
  255. mov a, y1 # y1 = a
  256. movdqa XTMP2, X0 # X0 = W[-2] {DDCC}
  257. ror $(22-13), y1 # y1 = a >> (22-13)
  258. xor e, y0 # y0 = e ^ (e >> (25-11))
  259. mov f, y2 # y2 = f
  260. ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
  261. psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
  262. xor a, y1 # y1 = a ^ (a >> (22-13)
  263. xor g, y2 # y2 = f^g
  264. psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
  265. xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25
  266. and e, y2 # y2 = (f^g)&e
  267. ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
  268. psrld $10, X0 # X0 = W[-2] >> 10 {DDCC}
  269. xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22
  270. ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>2
  271. xor g, y2 # y2 = CH = ((f^g)&e)^g
  272. pxor XTMP3, XTMP2 #
  273. ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>2
  274. add y0, y2 # y2 = S1 + CH
  275. add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
  276. pxor XTMP2, X0 # X0 = s1 {xDxC}
  277. mov a, y0 # y0 = a
  278. add y2, h # h = h + S1 + CH + k + w
  279. mov a, y2 # y2 = a
  280. pshufb SHUF_DC00, X0 # X0 = s1 {DC00}
  281. or c, y0 # y0 = a|c
  282. add h, d # d = d + h + S1 + CH + k + w
  283. and c, y2 # y2 = a&c
  284. paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]}
  285. and b, y0 # y0 = (a|c)&b
  286. add y1, h # h = h + S1 + CH + k + w + S0
  287. or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
  288. add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
  289. ROTATE_ARGS
  290. rotate_Xs
  291. .endm
  292. ## input is [rsp + _XFER + %1 * 4]
  293. .macro DO_ROUND round
  294. mov e, y0 # y0 = e
  295. ror $(25-11), y0 # y0 = e >> (25-11)
  296. mov a, y1 # y1 = a
  297. xor e, y0 # y0 = e ^ (e >> (25-11))
  298. ror $(22-13), y1 # y1 = a >> (22-13)
  299. mov f, y2 # y2 = f
  300. xor a, y1 # y1 = a ^ (a >> (22-13)
  301. ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
  302. xor g, y2 # y2 = f^g
  303. xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
  304. ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
  305. and e, y2 # y2 = (f^g)&e
  306. xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
  307. ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
  308. xor g, y2 # y2 = CH = ((f^g)&e)^g
  309. add y0, y2 # y2 = S1 + CH
  310. ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
  311. offset = \round * 4 + _XFER
  312. add offset(%rsp), y2 # y2 = k + w + S1 + CH
  313. mov a, y0 # y0 = a
  314. add y2, h # h = h + S1 + CH + k + w
  315. mov a, y2 # y2 = a
  316. or c, y0 # y0 = a|c
  317. add h, d # d = d + h + S1 + CH + k + w
  318. and c, y2 # y2 = a&c
  319. and b, y0 # y0 = (a|c)&b
  320. add y1, h # h = h + S1 + CH + k + w + S0
  321. or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
  322. add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
  323. ROTATE_ARGS
  324. .endm
  325. ########################################################################
  326. ## void sha256_transform_ssse3(struct sha256_state *state, const u8 *data,
  327. ## int blocks);
  328. ## arg 1 : pointer to state
  329. ## (struct sha256_state is assumed to begin with u32 state[8])
  330. ## arg 2 : pointer to input data
  331. ## arg 3 : Num blocks
  332. ########################################################################
  333. .text
  334. SYM_TYPED_FUNC_START(sha256_transform_ssse3)
  335. .align 32
  336. pushq %rbx
  337. pushq %r12
  338. pushq %r13
  339. pushq %r14
  340. pushq %r15
  341. pushq %rbp
  342. mov %rsp, %rbp
  343. subq $STACK_SIZE, %rsp
  344. and $~15, %rsp
  345. shl $6, NUM_BLKS # convert to bytes
  346. jz done_hash
  347. add INP, NUM_BLKS
  348. mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data
  349. ## load initial digest
  350. mov 4*0(CTX), a
  351. mov 4*1(CTX), b
  352. mov 4*2(CTX), c
  353. mov 4*3(CTX), d
  354. mov 4*4(CTX), e
  355. mov 4*5(CTX), f
  356. mov 4*6(CTX), g
  357. mov 4*7(CTX), h
  358. movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
  359. movdqa _SHUF_00BA(%rip), SHUF_00BA
  360. movdqa _SHUF_DC00(%rip), SHUF_DC00
  361. loop0:
  362. lea K256(%rip), TBL
  363. ## byte swap first 16 dwords
  364. COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
  365. COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
  366. COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
  367. COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
  368. mov INP, _INP(%rsp)
  369. ## schedule 48 input dwords, by doing 3 rounds of 16 each
  370. mov $3, SRND
  371. .align 16
  372. loop1:
  373. movdqa (TBL), XFER
  374. paddd X0, XFER
  375. movdqa XFER, _XFER(%rsp)
  376. FOUR_ROUNDS_AND_SCHED
  377. movdqa 1*16(TBL), XFER
  378. paddd X0, XFER
  379. movdqa XFER, _XFER(%rsp)
  380. FOUR_ROUNDS_AND_SCHED
  381. movdqa 2*16(TBL), XFER
  382. paddd X0, XFER
  383. movdqa XFER, _XFER(%rsp)
  384. FOUR_ROUNDS_AND_SCHED
  385. movdqa 3*16(TBL), XFER
  386. paddd X0, XFER
  387. movdqa XFER, _XFER(%rsp)
  388. add $4*16, TBL
  389. FOUR_ROUNDS_AND_SCHED
  390. sub $1, SRND
  391. jne loop1
  392. mov $2, SRND
  393. loop2:
  394. paddd (TBL), X0
  395. movdqa X0, _XFER(%rsp)
  396. DO_ROUND 0
  397. DO_ROUND 1
  398. DO_ROUND 2
  399. DO_ROUND 3
  400. paddd 1*16(TBL), X1
  401. movdqa X1, _XFER(%rsp)
  402. add $2*16, TBL
  403. DO_ROUND 0
  404. DO_ROUND 1
  405. DO_ROUND 2
  406. DO_ROUND 3
  407. movdqa X2, X0
  408. movdqa X3, X1
  409. sub $1, SRND
  410. jne loop2
  411. addm (4*0)(CTX),a
  412. addm (4*1)(CTX),b
  413. addm (4*2)(CTX),c
  414. addm (4*3)(CTX),d
  415. addm (4*4)(CTX),e
  416. addm (4*5)(CTX),f
  417. addm (4*6)(CTX),g
  418. addm (4*7)(CTX),h
  419. mov _INP(%rsp), INP
  420. add $64, INP
  421. cmp _INP_END(%rsp), INP
  422. jne loop0
  423. done_hash:
  424. mov %rbp, %rsp
  425. popq %rbp
  426. popq %r15
  427. popq %r14
  428. popq %r13
  429. popq %r12
  430. popq %rbx
  431. RET
  432. SYM_FUNC_END(sha256_transform_ssse3)
  433. .section .rodata.cst256.K256, "aM", @progbits, 256
  434. .align 64
  435. K256:
  436. .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
  437. .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
  438. .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
  439. .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
  440. .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
  441. .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
  442. .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
  443. .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
  444. .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
  445. .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
  446. .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
  447. .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
  448. .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
  449. .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
  450. .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
  451. .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
  452. .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
  453. .align 16
  454. PSHUFFLE_BYTE_FLIP_MASK:
  455. .octa 0x0c0d0e0f08090a0b0405060700010203
  456. .section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
  457. .align 16
  458. # shuffle xBxA -> 00BA
  459. _SHUF_00BA:
  460. .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
  461. .section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
  462. .align 16
  463. # shuffle xDxC -> DC00
  464. _SHUF_DC00:
  465. .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF