/*
 * M7memcpy: Optimized SPARC M7 memcpy
 *
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 */

	.file	"M7memcpy.S"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *			char *s1 = s;
 *			const char *s2 = s0;
 *			do {
 *				*s1++ = *s2++;
 *			} while (--n != 0);
 *		}
 *		return (s);
 *	}
 *
 *
 * SPARC T7/M7 Flow :
 *
 * if (count < SMALL_MAX) {
 *   if count < SHORTCOPY		(SHORTCOPY=3)
 *	copy bytes; exit with dst addr
 *   if src & dst aligned on word boundary but not long word boundary,
 *	copy with ldw/stw; branch to finish_up
 *   if src & dst aligned on long word boundary
 *	copy with ldx/stx; branch to finish_up
 *   if src & dst not aligned and length <= SHORTCHECK	(SHORTCHECK=14)
 *	copy bytes; exit with dst addr
 *   move enough bytes to get src to word boundary
 *   if dst now on word boundary
 * move_words:
 *	copy words; branch to finish_up
 *   if dst now on half word boundary
 *	load words, shift half words, store words; branch to finish_up
 *   if dst on byte 1
 *	load words, shift 3 bytes, store words; branch to finish_up
 *   if dst on byte 3
 *	load words, shift 1 byte, store words; branch to finish_up
 * finish_up:
 *	copy bytes; exit with dst addr
 * } else {				More than SMALL_MAX bytes
 *   move bytes until dst is on long word boundary
 *   if( src is on long word boundary ) {
 *     if (count < MED_MAX) {
 * finish_long:				src/dst aligned on 8 bytes
 *	copy with ldx/stx in 8-way unrolled loop;
 *	copy final 0-63 bytes; exit with dst addr
 *     } else {				src/dst aligned; count > MED_MAX
 *	align dst on 64 byte boundary; for main data movement:
 *	prefetch src data to L2 cache; let HW prefetch move data to L1 cache
 *	Use BIS (block initializing store) to avoid copying store cache
 *	lines from memory. But pre-store first element of each cache line
 *	ST_CHUNK lines in advance of the rest of that cache line. That
 *	gives time for replacement cache lines to be written back without
 *	excess STQ and Miss Buffer filling. Repeat until near the end,
 *	then finish up storing before going to finish_long.
 *     }
 *   } else {				src/dst not aligned on 8 bytes
 *     if src is word aligned and count < MED_WMAX
 *	move words in 8-way unrolled loop
 *	move final 0-31 bytes; exit with dst addr
 *     if count < MED_UMAX
 *	use alignaddr/faligndata combined with ldd/std in 8-way
 *	unrolled loop to move data.
 *	go to unalign_done
 *     else
 *	setup alignaddr for faligndata instructions
 *	align dst on 64 byte boundary; prefetch src data to L1 cache
 *	loadx8, falign, block-store, prefetch loop
 *	(only use block-init-store when src/dst on 8 byte boundaries.)
 * unalign_done:
 *	move remaining bytes for unaligned cases. exit with dst addr.
 * }
 *
 */
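/*
 * For reference, a minimal C sketch (illustrative only, not part of this
 * file or its build) of the shift-and-merge idea used by the unaligned
 * source paths below (see .Lmedium_unaligned_cp and .Lunalignhalf): each
 * aligned 8-byte store to the destination is assembled from two aligned
 * source loads, so no misaligned load is ever issued. The function and
 * variable names are invented for the example; it assumes a big-endian
 * machine (as SPARC is), an 8-byte-aligned dst, and a src pointer that is
 * genuinely misaligned (so the shift is never 0). The real code below also
 * takes care not to read past the end of the source buffer, which this
 * sketch ignores.
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *
 *	static void shift_merge_copy(uint64_t *dst, const unsigned char *src,
 *				     size_t ndoublewords)
 *	{
 *		const uint64_t *asrc = (const uint64_t *)((uintptr_t)src & ~7UL);
 *		unsigned int lshift = ((uintptr_t)src & 7) * 8;	// 8..56
 *		unsigned int rshift = 64 - lshift;
 *		uint64_t prev = *asrc++;		// aligned load
 *
 *		while (ndoublewords--) {
 *			uint64_t next = *asrc++;	// aligned load
 *			*dst++ = (prev << lshift) | (next >> rshift);
 *			prev = next;
 *		}
 *	}
 */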
#include <asm/visasm.h>
#include <asm/asi.h>

#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif
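/*
 * With the default definitions above, the second macro argument is unused:
 * for example, EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2) expands to just
 * "ldx [%o1], %o4". The handler name (memcpy_retl_o2 and friends) only
 * matters when EX_LD/EX_ST are overridden by the user-copy wrappers that
 * include this file; those wrappers are expected to attach the named
 * fixup/return path to the access so that a fault returns the number of
 * bytes left to copy (this is a summary of the surrounding convention, not
 * code defined in this file).
 */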
/*
 * ASI_BLK_INIT_QUAD_LDD_P/ASI_BLK_INIT_QUAD_LDD_S marks the cache
 * line as "least recently used" which means if many threads are
 * active, it has a high probability of being pushed out of the cache
 * between the first initializing store and the final stores.
 * Thus, we use ASI_ST_BLKINIT_MRU_P/ASI_ST_BLKINIT_MRU_S which
 * marks the cache line as "most recently used" for all
 * but the last cache line.
 */
#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef STORE_MRU_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_MRU_ASI	ASI_ST_BLKINIT_MRU_P
#else
#define STORE_MRU_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif
#ifndef STORE_INIT_MRU
#define STORE_INIT_MRU(src,addr)	stxa src, [addr] STORE_MRU_ASI
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	M7memcpy
#endif
#ifndef PREAMBLE
#define PREAMBLE
#endif

#define BLOCK_SIZE	64
#define SHORTCOPY	3
#define SHORTCHECK	14
#define SHORT_LONG	64	/* max copy for short longword-aligned case */
				/* must be at least 64 */
#define SMALL_MAX	128
#define MED_UMAX	1024	/* max copy for medium un-aligned case */
#define MED_WMAX	1024	/* max copy for medium word-aligned case */
#define MED_MAX		1024	/* max copy for medium longword-aligned case */
#define ST_CHUNK	24	/* ST_CHUNK - block of values for BIS Store */
#define ALIGN_PRE	24	/* distance for aligned prefetch loop */
	.register %g2,#scratch

	.section ".text"
	.global FUNC_NAME
	.type FUNC_NAME, #function
	.align 16
FUNC_NAME:
	srlx %o2, 31, %g2
	cmp %g2, 0
	tne %xcc, 5
	PREAMBLE
	mov %o0, %g1 ! save %o0
	brz,pn %o2, .Lsmallx
	cmp %o2, 3
	ble,pn %icc, .Ltiny_cp
	cmp %o2, 19
	ble,pn %icc, .Lsmall_cp
	or %o0, %o1, %g2
	cmp %o2, SMALL_MAX
	bl,pn %icc, .Lmedium_cp
	nop
.Lmedium:
	neg %o0, %o5
	andcc %o5, 7, %o5 ! bytes till DST 8 byte aligned
	brz,pt %o5, .Ldst_aligned_on_8
	! %o5 has the bytes to be written in partial store.
	sub %o2, %o5, %o2
	sub %o1, %o0, %o1 ! %o1 gets the difference
7:	! dst aligning loop
	add %o1, %o0, %o4
	EX_LD(LOAD(ldub, %o4, %o4), memcpy_retl_o2_plus_o5) ! load one byte
	subcc %o5, 1, %o5
	EX_ST(STORE(stb, %o4, %o0), memcpy_retl_o2_plus_o5_plus_1)
	bgu,pt %xcc, 7b
	add %o0, 1, %o0 ! advance dst
	add %o1, %o0, %o1 ! restore %o1
.Ldst_aligned_on_8:
	andcc %o1, 7, %o5
	brnz,pt %o5, .Lsrc_dst_unaligned_on_8
	nop
.Lsrc_dst_aligned_on_8:
	! check if we are copying MED_MAX or more bytes
	set MED_MAX, %o3
	cmp %o2, %o3 ! limit to store buffer size
	bgu,pn %xcc, .Llarge_align8_copy
	nop
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than MED_MAX bytes
 */
.Lmedlong:
	subcc %o2, 63, %o2 ! adjust length to allow cc test
	ble,pn %xcc, .Lmedl63 ! skip big loop if less than 64 bytes
	nop
.Lmedl64:
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_63) ! load
	subcc %o2, 64, %o2 ! decrement length count
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_63_64) ! and store
	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_63_56) ! a block of 64
	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_63_56)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_63_48)
	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_63_48)
	EX_LD(LOAD(ldx, %o1+24, %o3), memcpy_retl_o2_plus_63_40)
	EX_ST(STORE(stx, %o3, %o0+24), memcpy_retl_o2_plus_63_40)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_63_32)! load and store
	EX_ST(STORE(stx, %o4, %o0+32), memcpy_retl_o2_plus_63_32)
	EX_LD(LOAD(ldx, %o1+40, %o3), memcpy_retl_o2_plus_63_24)! a block of 64
	add %o1, 64, %o1 ! increase src ptr by 64
	EX_ST(STORE(stx, %o3, %o0+40), memcpy_retl_o2_plus_63_24)
	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_63_16)
	add %o0, 64, %o0 ! increase dst ptr by 64
	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_63_16)
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_63_8)
	bgu,pt %xcc, .Lmedl64 ! repeat if at least 64 bytes left
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_63_8)
.Lmedl63:
	addcc %o2, 32, %o2 ! adjust remaining count
	ble,pt %xcc, .Lmedl31 ! to skip if 31 or fewer bytes left
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_31) ! load
	sub %o2, 32, %o2 ! decrement length count
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_31_32) ! and store
	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_31_24) ! a block of 32
	add %o1, 32, %o1 ! increase src ptr by 32
	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_31_24)
	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
	add %o0, 32, %o0 ! increase dst ptr by 32
	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_31_16)
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_31_8)
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedl31:
	addcc %o2, 16, %o2 ! adjust remaining count
	ble,pt %xcc, .Lmedl15 ! skip if 15 or fewer bytes left
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_15)
	add %o1, 16, %o1 ! increase src ptr by 16
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_15)
	sub %o2, 16, %o2 ! decrease count by 16
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_15_8)
	add %o0, 16, %o0 ! increase dst ptr by 16
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_15_8)
.Lmedl15:
	addcc %o2, 15, %o2 ! restore count
	bz,pt %xcc, .Lsmallx ! exit if finished
	cmp %o2, 8
	blt,pt %xcc, .Lmedw7 ! skip if 7 or fewer bytes left
	tst %o2
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2) ! load 8 bytes
	add %o1, 8, %o1 ! increase src ptr by 8
	add %o0, 8, %o0 ! increase dst ptr by 8
	subcc %o2, 8, %o2 ! decrease count by 8
	bnz,pn %xcc, .Lmedw7
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8) ! and store 8
	retl
	mov EX_RETVAL(%g1), %o0 ! restore %o0

	.align 16
.Lsrc_dst_unaligned_on_8:
	! DST is 8-byte aligned, src is not
2:
	andcc %o1, 0x3, %o5 ! test word alignment
	bnz,pt %xcc, .Lunalignsetup ! branch to skip if not word aligned
	nop
/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over the standard large data move when
 * source and destination are in cache, for .Lmedium
 * to short data moves.
 */
	set MED_WMAX, %o3
	cmp %o2, %o3 ! limit to store buffer size
	bge,pt %xcc, .Lunalignrejoin ! otherwise rejoin main loop
	nop
	subcc %o2, 31, %o2 ! adjust length to allow cc test
			! for end of loop
	ble,pt %xcc, .Lmedw31 ! skip big loop if fewer than 32 bytes
.Lmedw32:
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_31)! move a block of 32
	sllx %o4, 32, %o5
	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_31)
	or %o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_31)
	subcc %o2, 32, %o2 ! decrement length count
	EX_LD(LOAD(ld, %o1+8, %o4), memcpy_retl_o2_plus_31_24)
	sllx %o4, 32, %o5
	EX_LD(LOAD(ld, %o1+12, %o4), memcpy_retl_o2_plus_31_24)
	or %o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0+8), memcpy_retl_o2_plus_31_24)
	add %o1, 32, %o1 ! increase src ptr by 32
	EX_LD(LOAD(ld, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
	sllx %o4, 32, %o5
	EX_LD(LOAD(ld, %o1-12, %o4), memcpy_retl_o2_plus_31_16)
	or %o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0+16), memcpy_retl_o2_plus_31_16)
	add %o0, 32, %o0 ! increase dst ptr by 32
	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_31_8)
	sllx %o4, 32, %o5
	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_31_8)
	or %o4, %o5, %o5
	bgu,pt %xcc, .Lmedw32 ! repeat if at least 32 bytes left
	EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedw31:
	addcc %o2, 31, %o2 ! restore count
	bz,pt %xcc, .Lsmallx ! exit if finished
	nop
	cmp %o2, 16
	blt,pt %xcc, .Lmedw15
	nop
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)! move a block of 16 bytes
	sllx %o4, 32, %o5
	subcc %o2, 16, %o2 ! decrement length count
	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_16)
	or %o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_16)
	add %o1, 16, %o1 ! increase src ptr by 16
	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add %o0, 16, %o0 ! increase dst ptr by 16
	sllx %o4, 32, %o5
	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_8)
	or %o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_8)
.Lmedw15:
	bz,pt %xcc, .Lsmallx ! exit if finished
	cmp %o2, 8
	blt,pn %xcc, .Lmedw7 ! skip if 7 or fewer bytes left
	tst %o2
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2) ! load 4 bytes
	subcc %o2, 8, %o2 ! decrease count by 8
	EX_ST(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_8)! and store 4 bytes
	add %o1, 8, %o1 ! increase src ptr by 8
	EX_LD(LOAD(ld, %o1-4, %o3), memcpy_retl_o2_plus_4) ! load 4 bytes
	add %o0, 8, %o0 ! increase dst ptr by 8
	EX_ST(STORE(stw, %o3, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
	bz,pt %xcc, .Lsmallx ! exit if finished
.Lmedw7: ! count is ge 1, less than 8
	cmp %o2, 4 ! check for 4 bytes left
	blt,pn %xcc, .Lsmallleft3 ! skip if 3 or fewer bytes left
	nop
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2) ! load 4 bytes
	add %o1, 4, %o1 ! increase src ptr by 4
	add %o0, 4, %o0 ! increase dst ptr by 4
	subcc %o2, 4, %o2 ! decrease count by 4
	bnz .Lsmallleft3
	EX_ST(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
	retl
	mov EX_RETVAL(%g1), %o0

	.align 16
.Llarge_align8_copy: ! Src and dst share 8 byte alignment
	! align dst to 64 byte boundary
	andcc %o0, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned
	brz,pn %o3, .Laligned_to_64
	andcc %o0, 8, %o3 ! odd long words to move?
	brz,pt %o3, .Laligned_to_16
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub %o2, 8, %o2
	add %o1, 8, %o1 ! increment src ptr
	add %o0, 8, %o0 ! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_16:
	andcc %o0, 16, %o3 ! pair of long words to move?
	brz,pt %o3, .Laligned_to_32
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub %o2, 16, %o2
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_16)
	add %o1, 16, %o1 ! increment src ptr
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add %o0, 16, %o0 ! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_32:
	andcc %o0, 32, %o3 ! four long words to move?
	brz,pt %o3, .Laligned_to_64
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub %o2, 32, %o2
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_32)
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_24)
	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_24)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_16)
	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_16)
	add %o1, 32, %o1 ! increment src ptr
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add %o0, 32, %o0 ! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_64:
	!
	! Using block init store (BIS) instructions to avoid fetching cache
	! lines from memory. Use ST_CHUNK stores to first element of each cache
	! line (similar to prefetching) to avoid overfilling STQ or miss buffers.
	! Gives existing cache lines time to be moved out of L1/L2/L3 cache.
	! Initial stores using MRU version of BIS to keep cache line in
	! cache until we are ready to store final element of cache line.
	! Then store last element using the LRU version of BIS.
	!
	andn %o2, 0x3f, %o5 ! %o5 is multiple of block size
	and %o2, 0x3f, %o2 ! residue bytes in %o2
	!
	! We use STORE_MRU_ASI for the first seven stores to each cache line
	! followed by STORE_ASI (mark as LRU) for the last store. That
	! mixed approach reduces the probability that the cache line is removed
	! before we finish setting it, while minimizing the effects on
	! other cached values during a large memcpy
	!
	! ST_CHUNK batches up initial BIS operations for several cache lines
	! to allow multiple requests to not be blocked by overflowing the
	! store miss buffer. Then the matching stores for all those
	! BIS operations are executed.
	!
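/*
 * Illustrative sketch (not part of this file or its build) of the ST_CHUNK
 * batching performed by .Lalign_loop_start/.Lalign_loop_rest below, written
 * in plain C. Only the ordering of the stores can be shown this way: the
 * BIS (block-init-store) ASIs, the MRU/LRU marking and the prefetches have
 * no C equivalent and appear here as ordinary assignments. The helper name
 * and the C_ST_CHUNK/LINE_DWORDS constants are invented for the example;
 * dst is assumed 64-byte aligned and nlines >= C_ST_CHUNK on entry.
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *
 *	#define C_ST_CHUNK	24	// mirrors ST_CHUNK above
 *	#define LINE_DWORDS	8	// one 64-byte line = 8 doublewords
 *
 *	static void chunked_line_copy(uint64_t *dst, const uint64_t *src,
 *				      size_t nlines)
 *	{
 *		while (nlines >= C_ST_CHUNK) {
 *			size_t i, j;
 *
 *			// Phase 1: touch the first doubleword of each of the
 *			// next C_ST_CHUNK destination lines (BIS-MRU store in
 *			// the real code) so those lines are allocated early.
 *			for (i = 0; i < C_ST_CHUNK; i++)
 *				dst[i * LINE_DWORDS] = src[i * LINE_DWORDS];
 *
 *			// Phase 2: fill in the remaining doublewords of each
 *			// line; the real code stores the last doubleword of a
 *			// line with the LRU flavour of BIS.
 *			for (i = 0; i < C_ST_CHUNK; i++)
 *				for (j = 1; j < LINE_DWORDS; j++)
 *					dst[i * LINE_DWORDS + j] =
 *					    src[i * LINE_DWORDS + j];
 *
 *			dst += C_ST_CHUNK * LINE_DWORDS;
 *			src += C_ST_CHUNK * LINE_DWORDS;
 *			nlines -= C_ST_CHUNK;
 *		}
 *		// Fewer than C_ST_CHUNK lines remain; the real code finishes
 *		// them with a simple load/store loop (.Lalign_loop_fin).
 *	}
 */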
	sub %o0, 8, %o0 ! adjust %o0 for ASI alignment
.Lalign_loop:
	cmp %o5, ST_CHUNK*64
	blu,pt %xcc, .Lalign_loop_fin
	mov ST_CHUNK,%o3
.Lalign_loop_start:
	prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
	subcc %o3, 1, %o3
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
	add %o1, 64, %o1
	add %o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	bgu %xcc,.Lalign_loop_start
	add %o0, 56, %o0
	mov ST_CHUNK,%o3
	sllx %o3, 6, %o4 ! ST_CHUNK*64
	sub %o1, %o4, %o1 ! reset %o1
	sub %o0, %o4, %o0 ! reset %o0
.Lalign_loop_rest:
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
	add %o0, 16, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
	add %o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	subcc %o3, 1, %o3
	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5)
	add %o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5)
	add %o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5)
	add %o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5)
	add %o1, 64, %o1
	add %o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	add %o0, 8, %o0
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5)
	sub %o5, 64, %o5
	bgu %xcc,.Lalign_loop_rest
	! mark cache line as LRU
	EX_ST(STORE_INIT(%o4, %o0), memcpy_retl_o2_plus_o5_plus_64)
	cmp %o5, ST_CHUNK*64
	bgu,pt %xcc, .Lalign_loop_start
	mov ST_CHUNK,%o3
	cmp %o5, 0
	beq .Lalign_done
	nop
.Lalign_loop_fin:
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8+8), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8+16), memcpy_retl_o2_plus_o5)
	subcc %o5, 64, %o5
	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+24), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+32), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+40), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5_64)
	add %o1, 64, %o1
	EX_ST(STORE(stx, %o4, %o0+8+48), memcpy_retl_o2_plus_o5_64)
	add %o0, 64, %o0
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5_64)
	bgu %xcc,.Lalign_loop_fin
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_o5_64)
.Lalign_done:
	add %o0, 8, %o0 ! restore %o0 from ASI alignment
	membar #StoreStore
	sub %o2, 63, %o2 ! adjust length to allow cc test
	ba .Lmedl63 ! in .Lmedl63
	nop

	.align 16
	! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
.Lunalignsetup:
.Lunalignrejoin:
	mov %g1, %o3 ! save %g1 as VISEntryHalf clobbers it
#ifdef NON_USER_COPY
	VISEntryHalfFast(.Lmedium_vis_entry_fail_cp)
#else
	VISEntryHalf
#endif
	mov %o3, %g1 ! restore %g1
	set MED_UMAX, %o3
	cmp %o2, %o3 ! check for .Lmedium unaligned limit
	bge,pt %xcc,.Lunalign_large
	prefetch [%o1 + (4 * BLOCK_SIZE)], 20
	andn %o2, 0x3f, %o5 ! %o5 is multiple of block size
	and %o2, 0x3f, %o2 ! residue bytes in %o2
	cmp %o2, 8 ! ensure we do not load beyond
	bgt .Lunalign_adjust ! end of source buffer
	andn %o1, 0x7, %o4 ! %o4 has long word aligned src address
	add %o2, 64, %o2 ! adjust to leave loop
	sub %o5, 64, %o5 ! early if necessary
.Lunalign_adjust:
	alignaddr %o1, %g0, %g0 ! generate %gsr
	add %o1, %o5, %o1 ! advance %o1 to after blocks
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)
.Lunalign_loop:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
	faligndata %f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o4+16, %f4), memcpy_retl_o2_plus_o5)
	subcc %o5, BLOCK_SIZE, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_64)
	faligndata %f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o4+24, %f6), memcpy_retl_o2_plus_o5_plus_56)
	EX_ST_FP(STORE(std, %f18, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
	faligndata %f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o4+32, %f8), memcpy_retl_o2_plus_o5_plus_48)
	EX_ST_FP(STORE(std, %f20, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
	faligndata %f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o4+40, %f10), memcpy_retl_o2_plus_o5_plus_40)
	EX_ST_FP(STORE(std, %f22, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o4+48, %f12), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST_FP(STORE(std, %f24, %o0+32), memcpy_retl_o2_plus_o5_plus_32)
	faligndata %f10, %f12, %f26
	EX_LD_FP(LOAD(ldd, %o4+56, %f14), memcpy_retl_o2_plus_o5_plus_24)
	add %o4, BLOCK_SIZE, %o4
	EX_ST_FP(STORE(std, %f26, %o0+40), memcpy_retl_o2_plus_o5_plus_24)
	faligndata %f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5_plus_16)
	EX_ST_FP(STORE(std, %f28, %o0+48), memcpy_retl_o2_plus_o5_plus_16)
	faligndata %f14, %f0, %f30
	EX_ST_FP(STORE(std, %f30, %o0+56), memcpy_retl_o2_plus_o5_plus_8)
	add %o0, BLOCK_SIZE, %o0
	bgu,pt %xcc, .Lunalign_loop
	prefetch [%o4 + (5 * BLOCK_SIZE)], 20
	ba .Lunalign_done
	nop
.Lunalign_large:
	andcc %o0, 0x3f, %o3 ! is dst 64-byte block aligned?
	bz %xcc, .Lunalignsrc
	sub %o3, 64, %o3 ! %o3 will be multiple of 8
	neg %o3 ! bytes until dest is 64 byte aligned
	sub %o2, %o3, %o2 ! update cnt with bytes to be moved
	! Move bytes according to source alignment
	andcc %o1, 0x1, %o5
	bnz %xcc, .Lunalignbyte ! check for byte alignment
	nop
	andcc %o1, 2, %o5 ! check for half word alignment
	bnz %xcc, .Lunalignhalf
	nop
	! Src is word aligned
.Lunalignword:
	EX_LD_FP(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_o3) ! load 4 bytes
	add %o1, 8, %o1 ! increase src ptr by 8
	EX_ST_FP(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_o3) ! and store 4
	subcc %o3, 8, %o3 ! decrease count by 8
	EX_LD_FP(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_o3_plus_4)! load 4
	add %o0, 8, %o0 ! increase dst ptr by 8
	bnz %xcc, .Lunalignword
	EX_ST_FP(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_o3_plus_4)
	ba .Lunalignsrc
	nop
	! Src is half-word aligned
.Lunalignhalf:
	EX_LD_FP(LOAD(lduh, %o1, %o4), memcpy_retl_o2_plus_o3) ! load 2 bytes
	sllx %o4, 32, %o5 ! shift left
	EX_LD_FP(LOAD(lduw, %o1+2, %o4), memcpy_retl_o2_plus_o3)
	or %o4, %o5, %o5
	sllx %o5, 16, %o5
	EX_LD_FP(LOAD(lduh, %o1+6, %o4), memcpy_retl_o2_plus_o3)
	or %o4, %o5, %o5
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
	add %o1, 8, %o1
	subcc %o3, 8, %o3
	bnz %xcc, .Lunalignhalf
	add %o0, 8, %o0
	ba .Lunalignsrc
	nop
	! Src is Byte aligned
.Lunalignbyte:
	sub %o0, %o1, %o0 ! share pointer advance
.Lunalignbyte_loop:
	EX_LD_FP(LOAD(ldub, %o1, %o4), memcpy_retl_o2_plus_o3)
	sllx %o4, 56, %o5
	EX_LD_FP(LOAD(lduh, %o1+1, %o4), memcpy_retl_o2_plus_o3)
	sllx %o4, 40, %o4
	or %o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+3, %o4), memcpy_retl_o2_plus_o3)
	sllx %o4, 24, %o4
	or %o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+5, %o4), memcpy_retl_o2_plus_o3)
	sllx %o4, 8, %o4
	or %o4, %o5, %o5
	EX_LD_FP(LOAD(ldub, %o1+7, %o4), memcpy_retl_o2_plus_o3)
	or %o4, %o5, %o5
	add %o0, %o1, %o0
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
	sub %o0, %o1, %o0
	subcc %o3, 8, %o3
	bnz %xcc, .Lunalignbyte_loop
	add %o1, 8, %o1
	add %o0, %o1, %o0 ! restore pointer
	! Destination is now block (64 byte) aligned
.Lunalignsrc:
	andn %o2, 0x3f, %o5 ! %o5 is multiple of block size
	and %o2, 0x3f, %o2 ! residue bytes in %o2
	add %o2, 64, %o2 ! ensure we do not load beyond
	sub %o5, 64, %o5 ! end of source buffer
	andn %o1, 0x7, %o4 ! %o4 has long word aligned src address
	alignaddr %o1, %g0, %g0 ! generate %gsr
	add %o1, %o5, %o1 ! advance %o1 to after blocks
	EX_LD_FP(LOAD(ldd, %o4, %f14), memcpy_retl_o2_plus_o5)
	add %o4, 8, %o4
.Lunalign_sloop:
	EX_LD_FP(LOAD(ldd, %o4, %f16), memcpy_retl_o2_plus_o5)
	faligndata %f14, %f16, %f0
	EX_LD_FP(LOAD(ldd, %o4+8, %f18), memcpy_retl_o2_plus_o5)
	faligndata %f16, %f18, %f2
	EX_LD_FP(LOAD(ldd, %o4+16, %f20), memcpy_retl_o2_plus_o5)
	faligndata %f18, %f20, %f4
	EX_ST_FP(STORE(std, %f0, %o0), memcpy_retl_o2_plus_o5)
	subcc %o5, 64, %o5
	EX_LD_FP(LOAD(ldd, %o4+24, %f22), memcpy_retl_o2_plus_o5_plus_56)
	faligndata %f20, %f22, %f6
	EX_ST_FP(STORE(std, %f2, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
	EX_LD_FP(LOAD(ldd, %o4+32, %f24), memcpy_retl_o2_plus_o5_plus_48)
	faligndata %f22, %f24, %f8
	EX_ST_FP(STORE(std, %f4, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
	EX_LD_FP(LOAD(ldd, %o4+40, %f26), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f24, %f26, %f10
	EX_ST_FP(STORE(std, %f6, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	EX_LD_FP(LOAD(ldd, %o4+48, %f28), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f26, %f28, %f12
	EX_ST_FP(STORE(std, %f8, %o0+32), memcpy_retl_o2_plus_o5_plus_40)
	add %o4, 64, %o4
	EX_LD_FP(LOAD(ldd, %o4-8, %f30), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f28, %f30, %f14
	EX_ST_FP(STORE(std, %f10, %o0+40), memcpy_retl_o2_plus_o5_plus_40)
	EX_ST_FP(STORE(std, %f12, %o0+48), memcpy_retl_o2_plus_o5_plus_40)
	add %o0, 64, %o0
	EX_ST_FP(STORE(std, %f14, %o0-8), memcpy_retl_o2_plus_o5_plus_40)
	fsrc2 %f30, %f14
	bgu,pt %xcc, .Lunalign_sloop
	prefetch [%o4 + (8 * BLOCK_SIZE)], 20
.Lunalign_done:
	! Handle trailing bytes, 64 to 127
	! Dest long word aligned, Src not long word aligned
	cmp %o2, 15
	bleu %xcc, .Lunalign_short
	andn %o2, 0x7, %o5 ! %o5 is multiple of 8
	and %o2, 0x7, %o2 ! residue bytes in %o2
	add %o2, 8, %o2
	sub %o5, 8, %o5 ! ensure we do not load past end of src
	andn %o1, 0x7, %o4 ! %o4 has long word aligned src address
	add %o1, %o5, %o1 ! advance %o1 to after multiple of 8
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)! fetch partialword
.Lunalign_by8:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
	add %o4, 8, %o4
	faligndata %f0, %f2, %f16
	subcc %o5, 8, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5)
	fsrc2 %f2, %f0
	bgu,pt %xcc, .Lunalign_by8
	add %o0, 8, %o0
.Lunalign_short:
#ifdef NON_USER_COPY
	VISExitHalfFast
#else
	VISExitHalf
#endif
	ba .Lsmallrest
	nop
/*
 * This is a special case of nested memcpy. It can happen when the kernel
 * calls an unaligned memcpy back to back without saving the FP registers.
 * We need a trap (context switch) to save/restore the FP registers. If the
 * kernel calls memcpy without this trap sequence we will hit FP corruption.
 * Use the normal integer load/store method in this case.
 */
#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail_cp:
	or %o0, %o1, %g2
#endif
.Lmedium_cp:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc %g2, 0x7, %g0
	bne,pn %xcc, .Lmedium_unaligned_cp
	nop
.Lmedium_noprefetch_cp:
	andncc %o2, 0x20 - 1, %o5
	be,pn %xcc, 2f
	sub %o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x10, %g7), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5)
	add %o1, 0x20, %o1
	subcc %o5, 0x20, %o5
	EX_ST(STORE(stx, %o3, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %g7, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8)
	bne,pt %xcc, 1b
	add %o0, 0x20, %o0
2:	andcc %o2, 0x18, %o5
	be,pt %xcc, 3f
	sub %o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add %o1, 0x08, %o1
	add %o0, 0x08, %o0
	subcc %o5, 0x08, %o5
	bne,pt %xcc, 1b
	EX_ST(STORE(stx, %o3, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8)
3:	brz,pt %o2, .Lexit_cp
	cmp %o2, 0x04
	bl,pn %xcc, .Ltiny_cp
	nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2)
	add %o1, 0x04, %o1
	add %o0, 0x04, %o0
	subcc %o2, 0x04, %o2
	bne,pn %xcc, .Ltiny_cp
	EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_4)
	ba,a,pt %xcc, .Lexit_cp
.Lmedium_unaligned_cp:
	/* First get dest 8 byte aligned. */
	sub %g0, %o0, %o3
	and %o3, 0x7, %o3
	brz,pt %o3, 2f
	sub %o2, %o3, %o2
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1)
	add %o1, 1, %o1
	subcc %o3, 1, %o3
	add %o0, 1, %o0
	bne,pt %xcc, 1b
	EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1)
2:
	and %o1, 0x7, %o3
	brz,pn %o3, .Lmedium_noprefetch_cp
	sll %o3, 3, %o3
	mov 64, %g2
	sub %g2, %o3, %g2
	andn %o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2)
	sllx %o4, %o3, %o4
	andn %o2, 0x08 - 1, %o5
	sub %o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5)
	add %o1, 0x08, %o1
	subcc %o5, 0x08, %o5
	srlx %g3, %g2, %g7
	or %g7, %o4, %g7
	EX_ST(STORE(stx, %g7, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8)
	add %o0, 0x08, %o0
	bne,pt %xcc, 1b
	sllx %g3, %o3, %o4
	srl %o3, 3, %o3
	add %o1, %o3, %o1
	brz,pn %o2, .Lexit_cp
	nop
	ba,pt %xcc, .Lsmall_unaligned_cp
.Ltiny_cp:
	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	subcc %o2, 1, %o2
	be,pn %xcc, .Lexit_cp
	EX_ST(STORE(stb, %o3, %o0 + 0x00), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x01, %o3), memcpy_retl_o2)
	subcc %o2, 1, %o2
	be,pn %xcc, .Lexit_cp
	EX_ST(STORE(stb, %o3, %o0 + 0x01), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x02, %o3), memcpy_retl_o2)
	ba,pt %xcc, .Lexit_cp
	EX_ST(STORE(stb, %o3, %o0 + 0x02), memcpy_retl_o2)
.Lsmall_cp:
	andcc %g2, 0x3, %g0
	bne,pn %xcc, .Lsmall_unaligned_cp
	andn %o2, 0x4 - 1, %o5
	sub %o2, %o5, %o2
1:
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add %o1, 0x04, %o1
	subcc %o5, 0x04, %o5
	add %o0, 0x04, %o0
	bne,pt %xcc, 1b
	EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4)
	brz,pt %o2, .Lexit_cp
	nop
	ba,a,pt %xcc, .Ltiny_cp
.Lsmall_unaligned_cp:
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	add %o1, 1, %o1
	add %o0, 1, %o0
	subcc %o2, 1, %o2
	bne,pt %xcc, 1b
	EX_ST(STORE(stb, %o3, %o0 - 0x01), memcpy_retl_o2_plus_1)
	ba,a,pt %xcc, .Lexit_cp
.Lsmallrest:
	tst %o2
	bz,pt %xcc, .Lsmallx
	cmp %o2, 4
	blt,pn %xcc, .Lsmallleft3
	nop
	sub %o2, 3, %o2
.Lsmallnotalign4:
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_3)! read byte
	subcc %o2, 4, %o2 ! reduce count by 4
	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_7)! write byte & repeat
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2_plus_6)! for total of 4
	add %o1, 4, %o1 ! advance SRC by 4
	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_6)
	EX_LD(LOAD(ldub, %o1-2, %o3), memcpy_retl_o2_plus_5)
	add %o0, 4, %o0 ! advance DST by 4
	EX_ST(STORE(stb, %o3, %o0-2), memcpy_retl_o2_plus_5)
	EX_LD(LOAD(ldub, %o1-1, %o3), memcpy_retl_o2_plus_4)
	bgu,pt %xcc, .Lsmallnotalign4 ! loop til 3 or fewer bytes remain
	EX_ST(STORE(stb, %o3, %o0-1), memcpy_retl_o2_plus_4)
	addcc %o2, 3, %o2 ! restore count
	bz,pt %xcc, .Lsmallx
.Lsmallleft3: ! 1, 2, or 3 bytes remain
	subcc %o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_1) ! load one byte
	bz,pt %xcc, .Lsmallx
	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_1) ! store one byte
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2) ! load second byte
	subcc %o2, 1, %o2
	bz,pt %xcc, .Lsmallx
	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_1)! store second byte
	EX_LD(LOAD(ldub, %o1+2, %o3), memcpy_retl_o2) ! load third byte
	EX_ST(STORE(stb, %o3, %o0+2), memcpy_retl_o2) ! store third byte
.Lsmallx:
	retl
	mov EX_RETVAL(%g1), %o0
.Lsmallfin:
	tst %o2
	bnz,pn %xcc, .Lsmallleft3
	nop
	retl
	mov EX_RETVAL(%g1), %o0 ! restore %o0
.Lexit_cp:
	retl
	mov EX_RETVAL(%g1), %o0

	.size FUNC_NAME, .-FUNC_NAME