octeon-memcpy.S

/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle ([email protected])
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/export.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2
/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * arch/mips/include/asm/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
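/*
 * Illustrative C-level sketch of the contract above (a reader's aid
 * only, not part of the build; names follow the #defines above):
 *
 *	void *memcpy(void *dst, const void *src, size_t len);
 *		returns dst; copies exactly len bytes
 *
 *	__copy_user(dst, src, len);
 *		on return, len (a2) holds the number of bytes NOT
 *		copied; 0 means complete success
 */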
/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR_WD	9b, handler;			\
	.previous
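/*
 * Each EXC() access thus gets a "9:" local label on the instruction
 * itself and an __ex_table entry pairing that address with a fixup
 * handler; if the access faults, the exception code looks up the
 * faulting PC in __ex_table and branches to the handler.
 */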
/*
 * Only the 64-bit kernel can make use of 64-bit registers.
 */
#define LOAD	ld
#define LOADL	ldl
#define LOADR	ldr
#define STOREL	sdl
#define STORER	sdr
#define STORE	sd
#define ADD	daddu
#define SUB	dsubu
#define SRL	dsrl
#define SRA	dsra
#define SLL	dsll
#define SLLV	dsllv
#define SRLV	dsrlv
#define NBYTES	8
#define LOG_NBYTES 3
/*
 * As we are sharing code base with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the register definitions from
 * the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST	LOADL
#define STFIRST STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST	LOADR
#define STFIRST STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#endif
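/*
 * An unaligned NBYTES-wide load is split into an LDFIRST/LDREST pair
 * (ldr/ldl or ldl/ldr, depending on endianness); together the two
 * instructions assemble one full doubleword from a misaligned address.
 */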
#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)

	.text
	.set	noreorder
	.set	noat

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(memcpy)					/* a0=dst a1=src a2=len */
EXPORT_SYMBOL(memcpy)
	move	v0, dst				/* return value */
__memcpy:
FEXPORT(__raw_copy_from_user)
EXPORT_SYMBOL(__raw_copy_from_user)
FEXPORT(__raw_copy_to_user)
EXPORT_SYMBOL(__raw_copy_to_user)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
	#
	# Octeon doesn't care if the destination is unaligned. The hardware
	# can fix it faster than we can special case the assembly.
	#
	pref	0, 0(src)
	sltu	t0, len, NBYTES		# Check if < 1 word
	bnez	t0, copy_bytes_checklen
	and	t0, src, ADDRMASK	# Check if src unaligned
	bnez	t0, src_unaligned
	sltu	t0, len, 4*NBYTES	# Check if < 4 words
	bnez	t0, less_than_4units
	sltu	t0, len, 8*NBYTES	# Check if < 8 words
	bnez	t0, less_than_8units
	sltu	t0, len, 16*NBYTES	# Check if < 16 words
	bnez	t0, cleanup_both_aligned
	sltu	t0, len, 128+1		# Check if len < 129
	bnez	t0, 1f			# Skip prefetch if len is too short
	sltu	t0, len, 256+1		# Check if len < 257
	bnez	t0, 1f			# Skip prefetch if len is too short
	pref	0, 128(src)		# We must not prefetch invalid addresses
	#
	# This is where we loop if there is more than 128 bytes left
2:	pref	0, 256(src)		# We must not prefetch invalid addresses
	#
	# This is where we loop if we can't prefetch anymore
1:
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 16*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p16u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p15u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p14u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p13u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p12u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p11u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p10u)
	ADD	src, src, 16*NBYTES
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p9u)
	ADD	dst, dst, 16*NBYTES
EXC(	LOAD	t0, UNIT(-8)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t1, UNIT(-7)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t2, UNIT(-6)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t3, UNIT(-5)(src),	l_exc_copy_rewind16)
EXC(	STORE	t0, UNIT(-8)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(-7)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(-6)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(-5)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(-4)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t1, UNIT(-3)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t2, UNIT(-2)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t3, UNIT(-1)(src),	l_exc_copy_rewind16)
EXC(	STORE	t0, UNIT(-4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(-3)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(-2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(-1)(dst),	s_exc_p1u)
	sltu	t0, len, 256+1		# See if we can prefetch more
	beqz	t0, 2b
	sltu	t0, len, 128		# See if we can loop once more
	beqz	t0, 1b
	nop
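	#
	# The loop above moves 16*NBYTES (128 bytes) per iteration,
	# alternating groups of four loads with groups of four stores so
	# each store consumes data loaded in the group just before it.
	#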
	#
	# Jump here if there are less than 16*NBYTES left.
	#
cleanup_both_aligned:
	beqz	len, done
	sltu	t0, len, 8*NBYTES
	bnez	t0, less_than_8units
	nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p1u)
	ADD	src, src, 8*NBYTES
	beqz	len, done
	ADD	dst, dst, 8*NBYTES
	#
	# Jump here if there are less than 8*NBYTES left.
	#
less_than_8units:
	sltu	t0, len, 4*NBYTES
	bnez	t0, less_than_4units
	nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	ADD	src, src, 4*NBYTES
	beqz	len, done
	ADD	dst, dst, 4*NBYTES
	#
	# Jump here if there are less than 4*NBYTES left. This means
	# we may need to copy up to 3 NBYTES words.
	#
less_than_4units:
	sltu	t0, len, 1*NBYTES
	bnez	t0, copy_bytes_checklen
	nop
	#
	# 1) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	ADD	dst, dst, NBYTES
	#
	# 2) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	ADD	dst, dst, NBYTES
	#
	# 3) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	ADD	src, src, NBYTES
	ADD	dst, dst, NBYTES
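	#
	# The final store goes in the branch delay slot; dst has already
	# been advanced, hence the -8 (i.e. -NBYTES) offset below.
	#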
	b	copy_bytes_checklen
EXC(	STORE	t0, -8(dst),		s_exc_p1u)
src_unaligned:
#define rem t8
	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
	beqz	t0, cleanup_src_unaligned
	and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
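	#
	# The loop below runs until len == rem, i.e. t0 full iterations
	# of 4*NBYTES each; the remainder (< 4*NBYTES) is finished by
	# cleanup_src_unaligned.
	#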
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(	LDFIRST t0, FIRST(0)(src),	l_exc)
EXC(	LDFIRST t1, FIRST(1)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	l_exc_copy)
EXC(	LDFIRST t2, FIRST(2)(src),	l_exc_copy)
EXC(	LDFIRST t3, FIRST(3)(src),	l_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	l_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
	ADD	src, src, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	bne	len, rem, 1b
	ADD	dst, dst, 4*NBYTES

cleanup_src_unaligned:
	beqz	len, done
	and	rem, len, NBYTES-1	# rem = len % NBYTES
	beq	rem, len, copy_bytes
	nop
1:
EXC(	LDFIRST t0, FIRST(0)(src),	l_exc)
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bne	len, rem, 1b
	ADD	dst, dst, NBYTES
copy_bytes_checklen:
	beqz	len, done
	nop
copy_bytes:
	/* 0 < len < NBYTES */
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), l_exc);	\
	SUB	len, len, 1;		\
	beqz	len, done;		\
EXC(	sb	t0, N(dst), s_exc_p1)
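	#
	# Note: each sb sits in the beqz delay slot, so with .set
	# noreorder the byte is still stored when the branch to done
	# is taken.
	#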
	COPY_BYTE(0)
	COPY_BYTE(1)
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
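	#
	# len was < NBYTES (8) on entry, so after bytes 0..5 at most one
	# byte, at offset 6 == NBYTES-2, remains.
	#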
EXC(	lb	t0, NBYTES-2(src), l_exc)
	SUB	len, len, 1
	jr	ra
EXC(	sb	t0, NBYTES-2(dst), s_exc_p1)
done:
	jr	ra
	nop
	END(memcpy)
l_exc_copy_rewind16:
	/* Rewind src and dst by 16*NBYTES for l_exc_copy */
	SUB	src, src, 16*NBYTES
	SUB	dst, dst, 16*NBYTES
l_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src), l_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	bne	src, t0, 1b
	ADD	dst, dst, 1
l_exc:
	LOAD	t0, TI_TASK($28)
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
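	# AT still holds the address one byte past the end of src
	# (invariant (1) above), so AT - t0 bounds the uncopied bytes.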
	SUB	len, AT, t0		# len = number of uncopied bytes
	jr	ra
	nop
#define SEXC(n)				\
s_exc_p ## n ## u:			\
	jr	ra;			\
	ADD	len, len, n*NBYTES
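/*
 * A store fault at s_exc_p<n>u means len was already decremented for
 * the whole group but n*NBYTES of it were never stored; add them back
 * so len remains an upper bound on the uncopied bytes.
 */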
SEXC(16)
SEXC(15)
SEXC(14)
SEXC(13)
SEXC(12)
SEXC(11)
SEXC(10)
SEXC(9)
SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
	jr	ra
	ADD	len, len, 1
s_exc:
	jr	ra
	nop
	.align	5
LEAF(memmove)
EXPORT_SYMBOL(memmove)
	ADD	t0, a0, a2
	ADD	t1, a1, a2
	sltu	t0, a1, t0		# dst + len <= src -> memcpy
	sltu	t1, a0, t1		# dst >= src + len -> memcpy
	and	t0, t1
	beqz	t0, __memcpy
	move	v0, a0			/* return value */
	beqz	a2, r_out
	END(memmove)

	/* fall through to __rmemcpy */
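/*
 * __rmemcpy: byte-at-a-time copy used by memmove when the regions
 * overlap; it copies backwards (r_end_bytes) when dst > src and
 * forwards (r_end_bytes_up) otherwise, so the overlap is safe.
 */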
LEAF(__rmemcpy)					/* a0=dst a1=src a2=len */
	sltu	t0, a1, a0
	beqz	t0, r_end_bytes_up	# src >= dst
	nop
	ADD	a0, a2			# dst = dst + len
	ADD	a1, a2			# src = src + len

r_end_bytes:
	lb	t0, -1(a1)
	SUB	a2, a2, 0x1
	sb	t0, -1(a0)
	SUB	a1, a1, 0x1
	bnez	a2, r_end_bytes
	SUB	a0, a0, 0x1

r_out:
	jr	ra
	move	a2, zero

r_end_bytes_up:
	lb	t0, (a1)
	SUB	a2, a2, 0x1
	sb	t0, (a0)
	ADD	a1, a1, 0x1
	bnez	a2, r_end_bytes_up
	ADD	a0, a0, 0x1

	jr	ra
	move	a2, zero
	END(__rmemcpy)