memcpy-inatomic.S

/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */
/*
 * Hack to resolve longstanding prefetch issue
 *
 * Prefetching may be fatal on some systems if we're prefetching beyond the
 * end of memory.  It's also a seriously bad idea on non dma-coherent
 * systems.
 */
#if !defined(CONFIG_DMA_COHERENT) || !defined(CONFIG_DMA_IP27)
#undef CONFIG_CPU_HAS_PREFETCH
#endif
#ifdef CONFIG_MIPS_MALTA
#undef CONFIG_CPU_HAS_PREFETCH
#endif

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2
/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * include/asm-mips/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
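
/*
 * EXC(insn reg, addr, handler) emits the access at a numbered local label
 * and records a __ex_table entry for it, so a fault on that instruction
 * transfers control to the named handler.
 */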
#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous
/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD	ld
#define LOADL	ldl
#define LOADR	ldr
#define STOREL	sdl
#define STORER	sdr
#define STORE	sd
#define ADD	daddu
#define SUB	dsubu
#define SRL	dsrl
#define SRA	dsra
#define SLL	dsll
#define SLLV	dsllv
#define SRLV	dsrlv
#define NBYTES	8
#define LOG_NBYTES 3
/*
 * As we are sharing the code base with the mips32 tree (which uses the o32
 * ABI register definitions), we need to redefine the register definitions
 * from the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#else

#define LOAD	lw
#define LOADL	lwl
#define LOADR	lwr
#define STOREL	swl
#define STORER	swr
#define STORE	sw
#define ADD	addu
#define SUB	subu
#define SRL	srl
#define SLL	sll
#define SRA	sra
#define SLLV	sllv
#define SRLV	srlv
#define NBYTES	4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */
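
/*
 * LDFIRST/STFIRST access the lowest-addressed bytes of an unaligned word,
 * LDREST/STREST the remaining ones; which of the lwl/lwr (ldl/ldr) pair
 * that is depends on endianness.  SHIFT_DISCARD shifts out the bytes of a
 * partially filled register that must not reach memory.
 */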
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST	LOADR
#define LDREST	LOADL
#define STFIRST	STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST	LOADL
#define LDREST	LOADR
#define STFIRST	STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#endif
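
/*
 * FIRST(unit) is the offset of the first byte of word number `unit';
 * REST(unit) is the offset of its last byte, as required by the
 * lwl/lwr-style partial loads and stores above.
 */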
#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
	.text
	.set	noreorder
	.set	noat
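/*
 * noreorder: branch delay slots are filled by hand below.
 * noat: AT holds the end-of-source address set up by the caller (see the
 * exception-handler rules above) and must not be clobbered by the assembler.
 */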
/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(__copy_user_inatomic)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
#define rem t8

	/*
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
	PREF(	0, 0(src) )
	PREF(	1, 0(dst) )
	sltu	t2, len, NBYTES
	and	t1, dst, ADDRMASK
	PREF(	0, 1*32(src) )
	PREF(	1, 1*32(dst) )
	bnez	t2, copy_bytes_checklen
	and	t0, src, ADDRMASK
	PREF(	0, 2*32(src) )
	PREF(	1, 2*32(dst) )
	bnez	t1, dst_unaligned
	nop
	bnez	t0, src_unaligned_dst_aligned
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
both_aligned:
	SRL	t0, len, LOG_NBYTES+3	# +3 for 8 units/iter
	beqz	t0, cleanup_both_aligned # len < 8*NBYTES
	and	rem, len, (8*NBYTES-1)	# rem = len % (8*NBYTES)
	PREF(	0, 3*32(src) )
	PREF(	1, 3*32(dst) )
	.align	4
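	/*
	 * Main loop: 8*NBYTES per iteration.  Loads are issued well ahead
	 * of the stores that consume them so cache-fill latency overlaps
	 * with useful work; t0/t1 are reloaded only after their previous
	 * values have been stored.
	 */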
1:
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	LOAD	t4, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t7, UNIT(5)(src),	l_exc_copy)
	STORE	t0, UNIT(0)(dst)
	STORE	t1, UNIT(1)(dst)
EXC(	LOAD	t0, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(7)(src),	l_exc_copy)
	ADD	src, src, 8*NBYTES
	ADD	dst, dst, 8*NBYTES
	STORE	t2, UNIT(-6)(dst)
	STORE	t3, UNIT(-5)(dst)
	STORE	t4, UNIT(-4)(dst)
	STORE	t7, UNIT(-3)(dst)
	STORE	t0, UNIT(-2)(dst)
	STORE	t1, UNIT(-1)(dst)
	PREF(	0, 8*32(src) )
	PREF(	1, 8*32(dst) )
	bne	len, rem, 1b
	nop
	/*
	 * len == rem == the number of bytes left to copy < 8*NBYTES
	 */
cleanup_both_aligned:
	beqz	len, done
	sltu	t0, len, 4*NBYTES
	bnez	t0, less_than_4units
	and	rem, len, (NBYTES-1)	# rem = len % NBYTES
	/*
	 * len >= 4*NBYTES
	 */
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
	ADD	src, src, 4*NBYTES
	STORE	t0, UNIT(0)(dst)
	STORE	t1, UNIT(1)(dst)
	STORE	t2, UNIT(2)(dst)
	STORE	t3, UNIT(3)(dst)
	beqz	len, done
	ADD	dst, dst, 4*NBYTES
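	/*
	 * Fewer than 4*NBYTES bytes remain: copy NBYTES at a time until
	 * only the sub-word remainder (rem) is left.
	 */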
less_than_4units:
	/*
	 * rem = len % NBYTES
	 */
	beq	rem, len, copy_bytes
	nop
1:
EXC(	LOAD	t0, 0(src),		l_exc)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE	t0, 0(dst)
	bne	rem, len, 1b
	ADD	dst, dst, NBYTES
	/*
	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	 * A loop would do only a byte at a time with possible branch
	 * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
	 * because can't assume read-access to dst.  Instead, use
	 * STREST dst, which doesn't require read access to dst.
	 *
	 * This code should perform better than a simple loop on modern,
	 * wide-issue mips processors because the code has fewer branches and
	 * more instruction-level parallelism.
	 */
#define bits t2
	beqz	len, done
	ADD	t1, dst, len	# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3	# rem = number of bits to keep
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	bits, bits, rem	# bits = number of bits to discard
	SHIFT_DISCARD t0, t0, bits
	STREST	t0, -1(t1)
	jr	ra
	move	len, zero
dst_unaligned:
	/*
	 * dst is unaligned
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; t1 > 0
	 * len >= NBYTES
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
#define match rem
EXC(	LDFIRST	t3, FIRST(0)(src),	l_exc)
	ADD	t2, zero, NBYTES
EXC(	LDREST	t3, REST(0)(src),	l_exc_copy)
	SUB	t2, t2, t1	# t2 = number of bytes copied
	xor	match, t0, t1
	STFIRST	t3, FIRST(0)(dst)
	beq	len, t2, done
	SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, both_aligned
	ADD	src, src, t2
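	/*
	 * dst is now NBYTES-aligned but src is not: assemble each aligned
	 * destination word from an unaligned LDFIRST/LDREST pair, four
	 * words per iteration while len >= 4*NBYTES.
	 */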
src_unaligned_dst_aligned:
	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
	PREF(	0, 3*32(src) )
	beqz	t0, cleanup_src_unaligned
	and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
	PREF(	1, 3*32(dst) )
1:
	/*
	 * Avoid consecutive LD*'s to the same register since some mips
	 * implementations can't issue them in the same cycle.
	 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
	 * are to the same unit (unless src is aligned, but it's not).
	 */
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDFIRST	t1, FIRST(1)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	l_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src),	l_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src),	l_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	l_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
	PREF(	0, 9*32(src) )		# 0 is PREF_LOAD  (not streamed)
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop				# improves slotting
#endif
	STORE	t0, UNIT(0)(dst)
	STORE	t1, UNIT(1)(dst)
	STORE	t2, UNIT(2)(dst)
	STORE	t3, UNIT(3)(dst)
	PREF(	1, 9*32(dst) )		# 1 is PREF_STORE (not streamed)
	bne	len, rem, 1b
	ADD	dst, dst, 4*NBYTES
cleanup_src_unaligned:
	beqz	len, done
	and	rem, len, NBYTES-1	# rem = len % NBYTES
	beq	rem, len, copy_bytes
	nop
1:
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
	STORE	t0, 0(dst)
	bne	len, rem, 1b
	ADD	dst, dst, NBYTES
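	/*
	 * Byte-at-a-time tail for the final sub-word remainder; when entered
	 * directly with a short length, check for len == 0 first.
	 */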
copy_bytes_checklen:
	beqz	len, done
	nop
copy_bytes:
	/* 0 < len < NBYTES */
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), l_exc);	\
	SUB	len, len, 1;		\
	beqz	len, done;		\
	sb	t0, N(dst)

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
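	/* If we get here, exactly one byte (at offset NBYTES-2) remains. */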
EXC(	lb	t0, NBYTES-2(src), l_exc)
	SUB	len, len, 1
	jr	ra
	sb	t0, NBYTES-2(dst)
done:
	jr	ra
	nop
	END(__copy_user_inatomic)
l_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	nop
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src),	l_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	bne	src, t0, 1b
	ADD	dst, dst, 1
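	/*
	 * Common load-fault exit: AT still holds the address just past the
	 * end of the source (set up by the caller), so AT - THREAD_BUADDR
	 * is an upper bound on the number of bytes left uncopied.
	 */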
l_exc:
	LOAD	t0, TI_TASK($28)
	nop
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	nop
	SUB	len, AT, t0		# len = number of uncopied bytes
	jr	ra
	nop