
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-memset.S
 *
 * This is an efficient (and relatively small) implementation of the C library
 * "memset()" function for the 21264 implementation of Alpha.
 *
 * 21264 version contributed by Rick Gorton <[email protected]>
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * The algorithm for the leading and trailing quadwords remains the same,
 * however the loop has been unrolled to enable better memory throughput,
 * and the code has been replicated for each of the entry points: __memset
 * and __memset16 to permit better scheduling to eliminate the stalling
 * encountered during the mask replication.
 * A future enhancement might be to put in a byte store loop for really
 * small (say < 32 bytes) memset()s.  Whether or not that change would be
 * a win in the kernel would depend upon the contextual usage.
 * WARNING: Maintaining this is going to be more work than the above version,
 * as fixes will need to be made in multiple places.  The performance gain
 * is worth it.
 */
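/*
 * Reader's orientation (added note, not original commentary): all three
 * entry points below share the same head/body/tail shape.  A rough C
 * sketch of that shape, for reference only -- the real code handles the
 * head and tail with masked quadword merges rather than byte stores, and
 * unrolls the body 8x with wh64 for large counts:
 *
 *	void *memset_sketch(void *dst, int c, unsigned long n)
 *	{
 *		unsigned char *p = dst;
 *		unsigned long fill = (unsigned char)c;
 *
 *		fill |= fill << 8;	// replicate the byte into all 8 lanes
 *		fill |= fill << 16;
 *		fill |= fill << 32;
 *		while (n && ((unsigned long)p & 7)) {	// head: align to 8 bytes
 *			*p++ = (unsigned char)c;
 *			n--;
 *		}
 *		while (n >= 8) {	// body: aligned quadword stores
 *			*(unsigned long *)p = fill;
 *			p += 8;
 *			n -= 8;
 *		}
 *		while (n--)		// tail: 0..7 bytes
 *			*p++ = (unsigned char)c;
 *		return dst;
 *	}
 */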
#include <asm/export.h>
	.set noat
	.set noreorder
	.text
	.globl memset
	.globl __memset
	.globl ___memset
	.globl __memset16
	.globl __constant_c_memset
	.ent ___memset
	.align 5
___memset:
	.frame $30,0,$26,0
	.prologue 0

	/*
	 * Serious stalling happens.  The only way to mitigate this is to
	 * undertake a major re-write to interleave the constant materialization
	 * with other parts of the fall-through code.  This is important, even
	 * though it makes maintenance tougher.
	 *	Do this later.
	 */
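	/*
	 * Reader's note on the pattern build just below: instead of the
	 * serial shift-and-or chain (fill |= fill << 8; fill |= fill << 16;
	 * fill |= fill << 32;), the byte is inserted into disjoint byte
	 * positions with insbl/inswl and the partial results are combined
	 * with a tree of ORs, which keeps the dependency chain short and,
	 * per the header comment, helps scheduling around the replication
	 * stalls.
	 */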
	and $17,255,$1		# E : 00000000000000ch
	insbl $17,1,$2		# U : 000000000000ch00
	bis $16,$16,$0		# E : return value
	ble $18,end_b		# U : zero length requested?
	addq $18,$16,$6		# E : max address to write to
	bis $1,$2,$17		# E : 000000000000chch
	insbl $1,2,$3		# U : 0000000000ch0000
	insbl $1,3,$4		# U : 00000000ch000000
	or $3,$4,$3		# E : 00000000chch0000
	inswl $17,4,$5		# U : 0000chch00000000
	xor $16,$6,$1		# E : will complete write be within one quadword?
	inswl $17,6,$2		# U : chch000000000000
	or $17,$3,$17		# E : 00000000chchchch
	or $2,$5,$2		# E : chchchch00000000
	bic $1,7,$1		# E : fit within a single quadword?
	and $16,7,$3		# E : Target addr misalignment
	or $17,$2,$17		# E : chchchchchchchch
	beq $1,within_quad_b	# U :
	nop			# E :
	beq $3,aligned_b	# U : target is 0mod8
	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
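	/*
	 * How the merge below works (reader's sketch for the byte-fill case,
	 * with off = dst & 7, in 1..7 here; not part of the original text):
	 *	ldq_u fetches the quadword containing the first target byte;
	 *	mskql keeps only its bytes below off (bytes we must not touch);
	 *	insql shifts the fill so its bytes land at positions off..7;
	 *	bis merges the two and stq_u writes the result back.
	 * In C terms, roughly:
	 *	unsigned long old  = *(unsigned long *)((unsigned long)dst & ~7UL);
	 *	unsigned long keep = old  & ~(~0UL << (8 * off));	// bytes < off
	 *	unsigned long newb = fill &  (~0UL << (8 * off));	// bytes >= off
	 *	*(unsigned long *)((unsigned long)dst & ~7UL) = keep | newb;
	 */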
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)
	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes
	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop
	.align 4
aligned_b:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */
	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_b	# U : tail stuff only
	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 *	$16	Current destination address
	 *	$5	A copy of $16
	 *	$6	The max quadword address to write to
	 *	$18	Number trailer bytes
	 *	$3	Number quads to write
	 */
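	/*
	 * Reader's note on the three set-up instructions that follow:
	 * $2 = dst & 63 is the offset into the current 64-byte block, and
	 * $1 = (dst & 63) - 64 is a negative byte count used as a bias
	 * counter.  The $alignmod64_b loop below stores one quad at a time,
	 * adding 8 to $1 each trip, until $1 reaches zero, at which point
	 * $5 sits on a 64-byte boundary for the wh64-unrolled loop.
	 * $4 = (quads remaining) - 16 gates the whole thing: the unrolled
	 * path is only taken when at least 16 quads (128 bytes) remain.
	 */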
	and $16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq $3, 16, $4		# E : Only try to unroll if > 128 bytes
	subq $2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt $4, loop_b		# U :
	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 */
	nop			# E :
	nop			# E :
	nop			# E :
	beq $1, $bigalign_b	# U :
$alignmod64_b:
	stq $17, 0($5)		# L :
	subq $3, 1, $3		# E : For consistency later
	addq $1, 8, $1		# E : Increment towards zero for alignment
	addq $5, 8, $4		# E : Initial wh64 address (filler instruction)
	nop
	nop
	addq $5, 8, $5		# E : Inc address
	blt $1, $alignmod64_b	# U :
$bigalign_b:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * We know that we'll be taking a minimum of one trip through the loop.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future.
	 * The wh64 is issued for the starting destination address of trip +2
	 * through the loop, and if there are fewer than two trips left, the target
	 * address will be for the current trip.
	 */
$do_wh64_b:
	wh64 ($4)		# L1 : memory subsystem write hint
	subq $3, 24, $2		# E : For determining future wh64 addresses
	stq $17, 0($5)		# L :
	nop			# E :
	addq $5, 128, $4	# E : speculative target of next wh64
	stq $17, 8($5)		# L :
	stq $17, 16($5)		# L :
	addq $5, 64, $7		# E : Fallback address for wh64 (== next trip addr)
	stq $17, 24($5)		# L :
	stq $17, 32($5)		# L :
	cmovlt $2, $7, $4	# E : Latency 2, extra mapping cycle
	nop
	stq $17, 40($5)		# L :
	stq $17, 48($5)		# L :
	subq $3, 16, $2		# E : Repeat the loop at least once more?
	nop
	stq $17, 56($5)		# L :
	addq $5, 64, $5		# E :
	subq $3, 8, $3		# E :
	bge $2, $do_wh64_b	# U :
	nop
	nop
	nop
	beq $3, no_quad_b	# U : Might have finished already
	.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_b:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_b		# U : more?
no_quad_b:
	/*
	 * Write 0..7 trailing bytes.
	 */
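	/*
	 * Reader's note on the merge below (toff = end & 7, in 1..7 here
	 * since $18 == 0 has already branched away; $6 is the end address):
	 *	mskqh keeps only the bytes of the loaded quadword at and above
	 *	toff (memory past the requested region), insqh places the fill
	 *	pattern in the bytes below toff, and bis combines them.
	 * Roughly, in C, for the byte-fill case:
	 *	unsigned long old  = *(unsigned long *)p;
	 *	unsigned long mask = ~0UL << (8 * toff);
	 *	*(unsigned long *)p = (old & mask) | (fill & ~mask);
	 */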
	nop			# E :
	beq $18,end_b		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad
	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :
within_quad_b:
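	/*
	 * Reader's note: the whole request lives inside one quadword.  The
	 * fill is first merged in above the start offset (insql/mskql/bis,
	 * as in the misaligned head), then the bytes at and above the end
	 * offset are restored from the original memory contents
	 * (mskql/mskqh/bis), so only bytes [dst & 7, end & 7) change.
	 */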
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result
	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :
end_b:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end ___memset
	EXPORT_SYMBOL(___memset)

	/*
	 * This is the original body of code, prior to replication and
	 * rescheduling.  Leave it here, as there may be calls to this
	 * entry point.
	 */
	.align 4
	.ent __constant_c_memset
__constant_c_memset:
	.frame $30,0,$26,0
	.prologue 0

	addq $18,$16,$6		# E : max address to write to
	bis $16,$16,$0		# E : return value
	xor $16,$6,$1		# E : will complete write be within one quadword?
	ble $18,end		# U : zero length requested?
	bic $1,7,$1		# E : fit within a single quadword
	beq $1,within_one_quad	# U :
	and $16,7,$3		# E : Target addr misalignment
	beq $3,aligned		# U : target is 0mod8
	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)
	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes
	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop
	.align 4
aligned:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */
	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad		# U : tail stuff only
	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 *	$16	Current destination address
	 *	$5	A copy of $16
	 *	$6	The max quadword address to write to
	 *	$18	Number trailer bytes
	 *	$3	Number quads to write
	 */
	and $16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq $3, 16, $4		# E : Only try to unroll if > 128 bytes
	subq $2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt $4, loop		# U :
	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 */
	nop			# E :
	nop			# E :
	nop			# E :
	beq $1, $bigalign	# U :
$alignmod64:
	stq $17, 0($5)		# L :
	subq $3, 1, $3		# E : For consistency later
	addq $1, 8, $1		# E : Increment towards zero for alignment
	addq $5, 8, $4		# E : Initial wh64 address (filler instruction)
	nop
	nop
	addq $5, 8, $5		# E : Inc address
	blt $1, $alignmod64	# U :
$bigalign:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * We know that we'll be taking a minimum of one trip through the loop.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future.
	 * The wh64 is issued for the starting destination address of trip +2
	 * through the loop, and if there are fewer than two trips left, the target
	 * address will be for the current trip.
	 */
$do_wh64:
	wh64 ($4)		# L1 : memory subsystem write hint
	subq $3, 24, $2		# E : For determining future wh64 addresses
	stq $17, 0($5)		# L :
	nop			# E :
	addq $5, 128, $4	# E : speculative target of next wh64
	stq $17, 8($5)		# L :
	stq $17, 16($5)		# L :
	addq $5, 64, $7		# E : Fallback address for wh64 (== next trip addr)
	stq $17, 24($5)		# L :
	stq $17, 32($5)		# L :
	cmovlt $2, $7, $4	# E : Latency 2, extra mapping cycle
	nop
	stq $17, 40($5)		# L :
	stq $17, 48($5)		# L :
	subq $3, 16, $2		# E : Repeat the loop at least once more?
	nop
	stq $17, 56($5)		# L :
	addq $5, 64, $5		# E :
	subq $3, 8, $3		# E :
	bge $2, $do_wh64	# U :
	nop
	nop
	nop
	beq $3, no_quad		# U : Might have finished already
	.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop		# U : more?
no_quad:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad
	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :
within_one_quad:
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result
	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :
end:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end __constant_c_memset
	EXPORT_SYMBOL(__constant_c_memset)

	/*
	 * This is a replicant of the __constant_c_memset code, rescheduled
	 * to mask stalls.  Note that entry point names also had to change
	 */
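	/*
	 * Reader's note: __memset16 takes a 16-bit fill value and replicates
	 * it across the quadword, so the inswl/or sequence below builds the
	 * equivalent of (illustrative C, not part of the original text):
	 *	unsigned long fill = (unsigned short)c;
	 *	fill |= fill << 16;
	 *	fill |= fill << 32;	// c1c2c1c2c1c2c1c2
	 */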
	.align 5
	.ent __memset16
__memset16:
	.frame $30,0,$26,0
	.prologue 0

	inswl $17,0,$5		# U : 000000000000c1c2
	inswl $17,2,$2		# U : 00000000c1c20000
	bis $16,$16,$0		# E : return value
	addq $18,$16,$6		# E : max address to write to
	ble $18, end_w		# U : zero length requested?
	inswl $17,4,$3		# U : 0000c1c200000000
	inswl $17,6,$4		# U : c1c2000000000000
	xor $16,$6,$1		# E : will complete write be within one quadword?
	or $2,$5,$2		# E : 00000000c1c2c1c2
	or $3,$4,$17		# E : c1c2c1c200000000
	bic $1,7,$1		# E : fit within a single quadword
	and $16,7,$3		# E : Target addr misalignment
	or $17,$2,$17		# E : c1c2c1c2c1c2c1c2
	beq $1,within_quad_w	# U :
	nop
	beq $3,aligned_w	# U : target is 0mod8
	/*
	 * Target address is misaligned, and won't fit within a quadword
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)
	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes
	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop
	.align 4
aligned_w:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */
	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_w	# U : tail stuff only
	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 *	$16	Current destination address
	 *	$5	A copy of $16
	 *	$6	The max quadword address to write to
	 *	$18	Number trailer bytes
	 *	$3	Number quads to write
	 */
	and $16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq $3, 16, $4		# E : Only try to unroll if > 128 bytes
	subq $2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt $4, loop_w		# U :
	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 */
	nop			# E :
	nop			# E :
	nop			# E :
	beq $1, $bigalign_w	# U :
$alignmod64_w:
	stq $17, 0($5)		# L :
	subq $3, 1, $3		# E : For consistency later
	addq $1, 8, $1		# E : Increment towards zero for alignment
	addq $5, 8, $4		# E : Initial wh64 address (filler instruction)
	nop
	nop
	addq $5, 8, $5		# E : Inc address
	blt $1, $alignmod64_w	# U :
$bigalign_w:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * We know that we'll be taking a minimum of one trip through the loop.
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future.
	 * The wh64 is issued for the starting destination address of trip +2
	 * through the loop, and if there are fewer than two trips left, the target
	 * address will be for the current trip.
	 */
$do_wh64_w:
	wh64 ($4)		# L1 : memory subsystem write hint
	subq $3, 24, $2		# E : For determining future wh64 addresses
	stq $17, 0($5)		# L :
	nop			# E :
	addq $5, 128, $4	# E : speculative target of next wh64
	stq $17, 8($5)		# L :
	stq $17, 16($5)		# L :
	addq $5, 64, $7		# E : Fallback address for wh64 (== next trip addr)
	stq $17, 24($5)		# L :
	stq $17, 32($5)		# L :
	cmovlt $2, $7, $4	# E : Latency 2, extra mapping cycle
	nop
	stq $17, 40($5)		# L :
	stq $17, 48($5)		# L :
	subq $3, 16, $2		# E : Repeat the loop at least once more?
	nop
	stq $17, 56($5)		# L :
	addq $5, 64, $5		# E :
	subq $3, 8, $3		# E :
	bge $2, $do_wh64_w	# U :
	nop
	nop
	nop
	beq $3, no_quad_w	# U : Might have finished already
	.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_w:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_w		# U : more?
no_quad_w:
	/*
	 * Write 0..7 trailing bytes.
	 */
	nop			# E :
	beq $18,end_w		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad
	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :
within_quad_w:
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result
	mskql $2,$6,$4		# U :
	mskqh $1,$6,$2		# U :
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :
end_w:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end __memset16
	EXPORT_SYMBOL(__memset16)

memset = ___memset
__memset = ___memset
	EXPORT_SYMBOL(memset)
	EXPORT_SYMBOL(__memset)