ev6-memcpy.S 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250
  1. /* SPDX-License-Identifier: GPL-2.0 */
  2. /*
  3. * arch/alpha/lib/ev6-memcpy.S
  4. * 21264 version by Rick Gorton <[email protected]>
  5. *
  6. * Reasonably optimized memcpy() routine for the Alpha 21264
  7. *
  8. * - bulk data is moved as aligned quadwords; leading/trailing bytes use ldbu/stb
  9. * - a source misaligned relative to dest is read with ldq_u and merged via extql/extqh
  10. *
  11. * Much of the information about 21264 scheduling/coding comes from:
  12. * Compiler Writer's Guide for the Alpha 21264
  13. * abbreviated as 'CWG' in other comments here
  14. * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
  15. * Scheduling notation:
  16. * E - either cluster
  17. * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
  18. * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
  19. *
  20. * Temp usage notes:
  21. * $1-$7 - scratch ($7 carries the wh64 address; $16 is reused as scratch in the misaligned path)
  22. */
  23. #include <asm/export.h>
  24. .set noreorder
  25. .set noat
  26. .align 4
  /*
   * memcpy - copy $18 bytes from the address in $17 to the address in $16.
   *
   * In:     $16 = dest, $17 = src, $18 = byte count (tested as a signed
   *         value; a count <= 0 returns at once without copying).
   * Out:    $0 = the dest pointer as passed in.
   * Writes: $0-$7 and $16-$18. No stack access is made (frame size 0).
   *
   * Paths:
   *   src and dest equal mod 8: byte copies up to an 8-byte boundary, then
   *   (when 128 or more bytes remain) single quads up to a 64-byte dest
   *   boundary, a 64-bytes-per-trip unrolled loop issuing wh64 hints, a
   *   quad-at-a-time loop and finally a byte loop for the remainder.
   *   src and dest differ mod 8 ($misaligned): byte copies until dest is
   *   0mod8, then aligned quad stores built from two ldq_u loads merged
   *   with extql/extqh, then a byte loop for the remainder.
   *
   * NOTE(review): the bare nops look like issue-slot padding for the
   * scheduling annotated in the trailing comments, and .set noreorder
   * presumably keeps the assembler from moving them - confirm against the
   * CWG and the gas documentation.
   */
  27. .globl memcpy
  28. .ent memcpy
  29. memcpy:
  30. .frame $30,0,$26,0
  31. .prologue 0
  32. mov $16, $0 # E : return value = dest as passed in
  33. ble $18, $nomoredata # U : count <= 0: nothing to copy
  34. xor $16, $17, $1 # E : are source and dest alignments the same?
  35. and $1, 7, $1 # E : are they the same mod 8?
  36. bne $1, $misaligned # U : Nope - gotta do this the slow way
  37. /* source and dest are same mod 8 address */
  38. and $16, 7, $1 # E : Are both 0mod8?
  39. beq $1, $both_0mod8 # U : Yes
  40. nop # E :
  41. /*
  42. * source and dest are same misalignment. move a byte at a time
  43. * until a 0mod8 alignment for both is reached.
  44. * At least one byte more to move
  45. */
  46. $head_align:
  47. ldbu $1, 0($17) # L : grab a byte
  48. subq $18, 1, $18 # E : count--
  49. addq $17, 1, $17 # E : src++
  50. stb $1, 0($16) # L : store the byte
  51. addq $16, 1, $16 # E : dest++
  52. and $16, 7, $1 # E : Are we at 0mod8 yet?
  53. ble $18, $nomoredata # U : done with the copy?
  54. bne $1, $head_align # U : not 0mod8 yet: copy another byte
  /* src and dest are both 0mod8 from here to the end of the co-aligned path */
  55. $both_0mod8:
  56. cmple $18, 127, $1 # E : Can we unroll the loop?
  57. bne $1, $no_unroll # U : count <= 127: skip the unrolled loop
  58. and $16, 63, $1 # E : get mod64 alignment
  59. beq $1, $do_unroll # U : no single quads to fiddle
  60. $single_head_quad:
  61. ldq $1, 0($17) # L : get 8 bytes
  62. subq $18, 8, $18 # E : count -= 8
  63. addq $17, 8, $17 # E : src += 8
  64. nop # E :
  65. stq $1, 0($16) # L : store
  66. addq $16, 8, $16 # E : dest += 8
  67. and $16, 63, $1 # E : get mod64 alignment
  68. bne $1, $single_head_quad # U : still not fully aligned
  /* dest is 0mod64 here; $7 tracks the address handed to wh64 */
  69. $do_unroll:
  70. addq $16, 64, $7 # E : Initial (+1 trip) wh64 address
  71. cmple $18, 127, $1 # E : Can we go through the unrolled loop?
  72. bne $1, $tail_quads # U : Nope
  73. nop # E :
  /* each trip copies 64 bytes as two 32-byte halves through $6,$4,$5,$3 */
  74. $unroll_body:
  75. wh64 ($7) # L1 : memory subsystem hint: 64 bytes at
  76. # ($7) are about to be over-written
  77. ldq $6, 0($17) # L0 : bytes 0..7
  78. nop # E :
  79. nop # E :
  80. ldq $4, 8($17) # L : bytes 8..15
  81. ldq $5, 16($17) # L : bytes 16..23
  82. addq $7, 64, $7 # E : Update next wh64 address
  83. nop # E :
  84. ldq $3, 24($17) # L : bytes 24..31
  85. addq $16, 64, $1 # E : fallback value for wh64
  86. nop # E :
  87. nop # E :
  88. addq $17, 32, $17 # E : src += 32 bytes
  89. stq $6, 0($16) # L : bytes 0..7
  90. nop # E :
  91. nop # E :
  92. stq $4, 8($16) # L : bytes 8..15
  93. stq $5, 16($16) # L : bytes 16..23
  94. subq $18, 192, $2 # E : At least two more trips to go?
  95. nop # E :
  96. stq $3, 24($16) # L : bytes 24..31
  97. addq $16, 32, $16 # E : dest += 32 bytes
  98. nop # E :
  99. nop # E :
  100. ldq $6, 0($17) # L : bytes 0..7 (second 32-byte half)
  101. ldq $4, 8($17) # L : bytes 8..15
  102. cmovlt $2, $1, $7 # E : Latency 2, extra map slot - Use
  103. # fallback wh64 address if < 2 more trips
  104. nop # E :
  105. ldq $5, 16($17) # L : bytes 16..23
  106. ldq $3, 24($17) # L : bytes 24..31
  107. addq $16, 32, $16 # E : dest += 32
  108. subq $18, 64, $18 # E : count -= 64
  109. addq $17, 32, $17 # E : src += 32
  110. stq $6, -32($16) # L : bytes 0..7 (dest already advanced, hence negative offsets)
  111. stq $4, -24($16) # L : bytes 8..15
  112. cmple $18, 63, $1 # E : At least one more trip?
  113. stq $5, -16($16) # L : bytes 16..23
  114. stq $3, -8($16) # L : bytes 24..31
  115. nop # E :
  116. beq $1, $unroll_body # U : count >= 64: go around again
  117. $tail_quads:
  118. $no_unroll:
  119. .align 4
  120. subq $18, 8, $18 # E : At least a quad left?
  121. blt $18, $less_than_8 # U : Nope
  122. nop # E :
  123. nop # E :
  /* count is kept biased by -8 inside this loop; $less_than_8 adds it back */
  124. $move_a_quad:
  125. ldq $1, 0($17) # L : fetch 8
  126. subq $18, 8, $18 # E : count -= 8
  127. addq $17, 8, $17 # E : src += 8
  128. nop # E :
  129. stq $1, 0($16) # L : store 8
  130. addq $16, 8, $16 # E : dest += 8
  131. bge $18, $move_a_quad # U : loop while a full quad remains
  132. nop # E :
  133. $less_than_8:
  134. .align 4
  135. addq $18, 8, $18 # E : add back for trailing bytes
  136. ble $18, $nomoredata # U : All-done
  137. nop # E :
  138. nop # E :
  139. /* Trailing bytes */
  140. $tail_bytes:
  141. subq $18, 1, $18 # E : count--
  142. ldbu $1, 0($17) # L : fetch a byte
  143. addq $17, 1, $17 # E : src++
  144. nop # E :
  145. stb $1, 0($16) # L : store a byte
  146. addq $16, 1, $16 # E : dest++
  147. bgt $18, $tail_bytes # U : more to be done?
  148. nop # E :
  149. /* branching to exit takes 3 extra cycles, so replicate exit here */
  150. ret $31, ($26), 1 # L0 : return to caller
  151. nop # E :
  152. nop # E :
  153. nop # E :
  /* src and dest differ mod 8; $4 is the running dest pointer on this path */
  154. $misaligned:
  155. mov $0, $4 # E : dest temp
  156. and $0, 7, $1 # E : dest alignment mod8
  157. beq $1, $dest_0mod8 # U : dest already 0mod8: skip the byte loop
  158. nop
  159. $aligndest:
  160. ble $18, $nomoredata # U : ran out of bytes while aligning dest
  161. ldbu $1, 0($17) # L : fetch a byte
  162. subq $18, 1, $18 # E : count--
  163. addq $17, 1, $17 # E : src++
  164. stb $1, 0($4) # L : store it
  165. addq $4, 1, $4 # E : dest++
  166. and $4, 7, $1 # E : dest 0mod8 yet?
  167. bne $1, $aligndest # U : go until we are aligned.
  168. /* Source has unknown alignment, but dest is known to be 0mod8 */
  169. $dest_0mod8:
  170. subq $18, 8, $18 # E : At least a quad left?
  171. blt $18, $misalign_tail # U : Nope
  172. ldq_u $3, 0($17) # L : seed (rotating load) of 8 bytes
  173. nop # E :
  /* $3 = previously loaded source quad, $16 = next one (dest lives in $4) */
  174. $mis_quad:
  175. ldq_u $16, 8($17) # L : Fetch next 8
  176. extql $3, $17, $3 # U : masking (previous quad's contribution)
  177. extqh $16, $17, $1 # U : masking (next quad's contribution)
  178. bis $3, $1, $1 # E : merged bytes to store
  179. subq $18, 8, $18 # E : count -= 8
  180. addq $17, 8, $17 # E : src += 8
  181. stq $1, 0($4) # L : store 8 (aligned)
  182. mov $16, $3 # E : "rotate" source data
  183. addq $4, 8, $4 # E : dest += 8
  184. bge $18, $mis_quad # U : More quads to move
  185. nop
  186. nop
  187. $misalign_tail:
  188. addq $18, 8, $18 # E : account for tail stuff
  189. ble $18, $nomoredata # U : no trailing bytes: done
  190. nop
  191. nop
  192. $misalign_byte:
  193. ldbu $1, 0($17) # L : fetch 1
  194. subq $18, 1, $18 # E : count--
  195. addq $17, 1, $17 # E : src++
  196. nop # E :
  197. stb $1, 0($4) # L : store
  198. addq $4, 1, $4 # E : dest++
  199. bgt $18, $misalign_byte # U : more to go?
  200. nop
  /* common exit: $0 still holds the dest pointer saved on entry */
  201. $nomoredata:
  202. ret $31, ($26), 1 # L0 : return to caller
  203. nop # E :
  204. nop # E :
  205. nop # E :
  206. .end memcpy
  207. EXPORT_SYMBOL(memcpy)
  208. /* For backwards module compatibility: __memcpy is a global alias for memcpy. */
  209. __memcpy = memcpy
  210. .globl __memcpy