memcpy-inatomic.S

/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 * Copyright (C) 2007 Maciej W. Rozycki
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */
/*
 * Hack to resolve longstanding prefetch issue
 *
 * Prefetching may be fatal on some systems if we're prefetching beyond the
 * end of memory.  It's also a seriously bad idea on non-dma-coherent
 * systems.
 */
#ifdef CONFIG_DMA_NONCOHERENT
#undef CONFIG_CPU_HAS_PREFETCH
#endif
#ifdef CONFIG_MIPS_MALTA
#undef CONFIG_CPU_HAS_PREFETCH
#endif
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2
/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * include/asm-mips/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
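/*
 * Informally, the convention above can be sketched in C terms.  This is
 * illustrative only: __copy_user uses a non-standard calling convention and
 * cannot actually be called like this, and the function name below is made
 * up for the example.
 *
 *	size_t copy_semantics(void *dst, const void *src, size_t n)
 *	{
 *		size_t done;
 *
 *		for (done = 0; done < n; done++) {
 *			// either access may fault; on a fault, stop and
 *			// report how much was left undone
 *			((char *)dst)[done] = ((const char *)src)[done];
 *		}
 *		return n - done;	// 0 when everything was copied
 *	}
 */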
/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
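/*
 * Worked example of the invariants above (addresses chosen purely for
 * illustration): suppose __copy_user is entered with src_entry = 0x1000 and
 * len = 0x100, so uaccess.h sets AT = 0x1100.  If a load later faults when
 * 0x40 bytes have already been copied and the bad address is reported as
 * 0x1040, the load exception handler can recover the uncopied count as
 * AT - bad_address = 0x1100 - 0x1040 = 0xc0, which is exactly what .Ll_exc
 * at the end of this file computes with "SUB len, AT, t0".
 */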
#define EXC(inst_reg,addr,handler)              \
9:      inst_reg, addr;                         \
        .section __ex_table,"a";                \
        PTR     9b, handler;                    \
        .previous
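/*
 * For reference, a use such as "EXC( LOAD t0, UNIT(0)(src), .Ll_exc)"
 * expands (on a 64-bit kernel, where LOAD is ld and UNIT(0) is 0) to
 * roughly:
 *
 *	9:	ld	t0, 0(src)
 *		.section __ex_table,"a"
 *		PTR	9b, .Ll_exc
 *		.previous
 *
 * i.e. the access itself plus an __ex_table entry that redirects a fault
 * on that instruction to the named fixup label.
 */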
/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif
#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we are sharing the code base with the mips32 tree (which uses the o32
 * ABI register definitions), we need to redefine the register definitions
 * from the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0      $8
#define t1      $9
#define t2      $10
#define t3      $11
#define t4      $12
#define t5      $13
#define t6      $14
#define t7      $15

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SRA    sra
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
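/*
 * Example of the unit addressing above (64-bit kernel, NBYTES = 8):
 * FIRST(1) = 8 and REST(1) = 15, so the pair
 * "LDFIRST t1, FIRST(1)(src)" / "LDREST t1, REST(1)(src)" assembles the
 * second doubleword of a possibly unaligned source from bytes src+8
 * through src+15.  ADDRMASK = 7, so "and t0, src, ADDRMASK" below extracts
 * the sub-doubleword misalignment of src.
 */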
        .text
        .set    noreorder
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
        .set    noat
#else
        .set    at=v1
#endif

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
        .align  5
LEAF(__copy_user_inatomic)
        /*
         * Note: dst & src may be unaligned, len may be 0
         * Temps
         */
#define rem t8
        /*
         * The "issue break"s below are very approximate.
         * Issue delays for dcache fills will perturb the schedule, as will
         * load queue full replay traps, etc.
         *
         * If len < NBYTES use byte operations.
         */
        PREF(   0, 0(src) )
        PREF(   1, 0(dst) )
        sltu    t2, len, NBYTES
        and     t1, dst, ADDRMASK
        PREF(   0, 1*32(src) )
        PREF(   1, 1*32(dst) )
        bnez    t2, .Lcopy_bytes_checklen
         and    t0, src, ADDRMASK
        PREF(   0, 2*32(src) )
        PREF(   1, 2*32(dst) )
        bnez    t1, .Ldst_unaligned
         nop
        bnez    t0, .Lsrc_unaligned_dst_aligned
        /*
         * use delay slot for fall-through
         * src and dst are aligned; need to compute rem
         */
.Lboth_aligned:
         SRL    t0, len, LOG_NBYTES+3   # +3 for 8 units/iter
        beqz    t0, .Lcleanup_both_aligned # len < 8*NBYTES
         and    rem, len, (8*NBYTES-1)  # rem = len % (8*NBYTES)
        PREF(   0, 3*32(src) )
        PREF(   1, 3*32(dst) )
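        /*
         * For scale (64-bit kernel, NBYTES = 8): each pass of the unrolled
         * loop below moves 8*NBYTES = 64 bytes.  With len = 200, for
         * example, t0 = 200 >> 6 = 3 full iterations and rem = 200 & 63 = 8,
         * so the loop runs until len drops to rem = 8 and the tail is
         * finished by .Lcleanup_both_aligned.
         */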
        .align  4
1:
EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
EXC(    LOAD    t1, UNIT(1)(src),       .Ll_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       .Ll_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       .Ll_exc_copy)
        SUB     len, len, 8*NBYTES
EXC(    LOAD    t4, UNIT(4)(src),       .Ll_exc_copy)
EXC(    LOAD    t7, UNIT(5)(src),       .Ll_exc_copy)
        STORE   t0, UNIT(0)(dst)
        STORE   t1, UNIT(1)(dst)
EXC(    LOAD    t0, UNIT(6)(src),       .Ll_exc_copy)
EXC(    LOAD    t1, UNIT(7)(src),       .Ll_exc_copy)
        ADD     src, src, 8*NBYTES
        ADD     dst, dst, 8*NBYTES
        STORE   t2, UNIT(-6)(dst)
        STORE   t3, UNIT(-5)(dst)
        STORE   t4, UNIT(-4)(dst)
        STORE   t7, UNIT(-3)(dst)
        STORE   t0, UNIT(-2)(dst)
        STORE   t1, UNIT(-1)(dst)
        PREF(   0, 8*32(src) )
        PREF(   1, 8*32(dst) )
        bne     len, rem, 1b
         nop
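        /*
         * Note on the ordering above: all eight loads of a batch are issued
         * before src and dst are advanced, which preserves invariant (3)
         * from the header (no loads between the paired increments of src
         * and dst); the six remaining stores are then done with negative
         * UNIT offsets relative to the already-updated dst.
         */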
        /*
         * len == rem == the number of bytes left to copy < 8*NBYTES
         */
.Lcleanup_both_aligned:
        beqz    len, .Ldone
         sltu   t0, len, 4*NBYTES
        bnez    t0, .Lless_than_4units
         and    rem, len, (NBYTES-1)    # rem = len % NBYTES
        /*
         * len >= 4*NBYTES
         */
EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
EXC(    LOAD    t1, UNIT(1)(src),       .Ll_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       .Ll_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       .Ll_exc_copy)
        SUB     len, len, 4*NBYTES
        ADD     src, src, 4*NBYTES
        STORE   t0, UNIT(0)(dst)
        STORE   t1, UNIT(1)(dst)
        STORE   t2, UNIT(2)(dst)
        STORE   t3, UNIT(3)(dst)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, 4*NBYTES
        beqz    len, .Ldone
        .set    noreorder
.Lless_than_4units:
        /*
         * rem = len % NBYTES
         */
        beq     rem, len, .Lcopy_bytes
         nop
1:
EXC(    LOAD    t0, 0(src),             .Ll_exc)
        ADD     src, src, NBYTES
        SUB     len, len, NBYTES
        STORE   t0, 0(dst)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, NBYTES
        bne     rem, len, 1b
        .set    noreorder
        /*
         * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
         * A loop would do only a byte at a time with possible branch
         * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
         * because can't assume read-access to dst.  Instead, use
         * STREST dst, which doesn't require read access to dst.
         *
         * This code should perform better than a simple loop on modern,
         * wide-issue mips processors because the code has fewer branches and
         * more instruction-level parallelism.
         */
#define bits t2
        beqz    len, .Ldone
         ADD    t1, dst, len    # t1 is just past last byte of dst
        li      bits, 8*NBYTES
        SLL     rem, len, 3     # rem = number of bits to keep
EXC(    LOAD    t0, 0(src),             .Ll_exc)
        SUB     bits, bits, rem # bits = number of bits to discard
        SHIFT_DISCARD t0, t0, bits
        STREST  t0, -1(t1)
        jr      ra
         move   len, zero
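        /*
         * Worked example of the trick above (64-bit kernel, 8*NBYTES = 64):
         * with len = 3, rem = 3*8 = 24 bits are kept and
         * bits = 64 - 24 = 40 bits are discarded by SHIFT_DISCARD; the
         * unaligned STREST at -1(t1) then writes exactly those 3 bytes,
         * ending at the last byte of dst, without ever reading dst.
         */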
.Ldst_unaligned:
        /*
         * dst is unaligned
         * t0 = src & ADDRMASK
         * t1 = dst & ADDRMASK; t1 > 0
         * len >= NBYTES
         *
         * Copy enough bytes to align dst
         * Set match = (src and dst have same alignment)
         */
#define match rem
EXC(    LDFIRST t3, FIRST(0)(src),      .Ll_exc)
        ADD     t2, zero, NBYTES
EXC(    LDREST  t3, REST(0)(src),       .Ll_exc_copy)
        SUB     t2, t2, t1      # t2 = number of bytes copied
        xor     match, t0, t1
        STFIRST t3, FIRST(0)(dst)
        beq     len, t2, .Ldone
         SUB    len, len, t2
        ADD     dst, dst, t2
        beqz    match, .Lboth_aligned
         ADD    src, src, t2
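        /*
         * Example (64-bit kernel): if dst & ADDRMASK = 3, then
         * t2 = NBYTES - 3 = 5 bytes are moved by the LDFIRST/LDREST +
         * STFIRST sequence above, leaving dst doubleword aligned.  If src
         * had the same misalignment, match = 3 ^ 3 = 0 and we branch back
         * to .Lboth_aligned; otherwise we fall through to the
         * source-unaligned loop below.
         */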
.Lsrc_unaligned_dst_aligned:
        SRL     t0, len, LOG_NBYTES+2   # +2 for 4 units/iter
        PREF(   0, 3*32(src) )
        beqz    t0, .Lcleanup_src_unaligned
         and    rem, len, (4*NBYTES-1)  # rem = len % 4*NBYTES
        PREF(   1, 3*32(dst) )
1:
        /*
         * Avoid consecutive LD*'s to the same register since some mips
         * implementations can't issue them in the same cycle.
         * It's OK to load FIRST(N+1) before REST(N) because the two addresses
         * are to the same unit (unless src is aligned, but it's not).
         */
EXC(    LDFIRST t0, FIRST(0)(src),      .Ll_exc)
EXC(    LDFIRST t1, FIRST(1)(src),      .Ll_exc_copy)
        SUB     len, len, 4*NBYTES
EXC(    LDREST  t0, REST(0)(src),       .Ll_exc_copy)
EXC(    LDREST  t1, REST(1)(src),       .Ll_exc_copy)
EXC(    LDFIRST t2, FIRST(2)(src),      .Ll_exc_copy)
EXC(    LDFIRST t3, FIRST(3)(src),      .Ll_exc_copy)
EXC(    LDREST  t2, REST(2)(src),       .Ll_exc_copy)
EXC(    LDREST  t3, REST(3)(src),       .Ll_exc_copy)
        PREF(   0, 9*32(src) )          # 0 is PREF_LOAD  (not streamed)
        ADD     src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
        nop                             # improves slotting
#endif
        STORE   t0, UNIT(0)(dst)
        STORE   t1, UNIT(1)(dst)
        STORE   t2, UNIT(2)(dst)
        STORE   t3, UNIT(3)(dst)
        PREF(   1, 9*32(dst) )          # 1 is PREF_STORE (not streamed)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, 4*NBYTES
        bne     len, rem, 1b
        .set    noreorder
.Lcleanup_src_unaligned:
        beqz    len, .Ldone
         and    rem, len, NBYTES-1      # rem = len % NBYTES
        beq     rem, len, .Lcopy_bytes
         nop
1:
EXC(    LDFIRST t0, FIRST(0)(src),      .Ll_exc)
EXC(    LDREST  t0, REST(0)(src),       .Ll_exc_copy)
        ADD     src, src, NBYTES
        SUB     len, len, NBYTES
        STORE   t0, 0(dst)
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, NBYTES
        bne     len, rem, 1b
        .set    noreorder
.Lcopy_bytes_checklen:
        beqz    len, .Ldone
         nop
.Lcopy_bytes:
        /* 0 < len < NBYTES */
#define COPY_BYTE(N)                    \
EXC(    lb      t0, N(src), .Ll_exc);   \
        SUB     len, len, 1;            \
        beqz    len, .Ldone;            \
         sb     t0, N(dst)
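        /*
         * At this point 0 < len < NBYTES, i.e. at most NBYTES-1 bytes
         * remain.  Each COPY_BYTE(N) copies byte N and returns through
         * .Ldone as soon as len reaches zero (the sb sits in the beqz delay
         * slot).  On a 64-bit kernel, COPY_BYTE(0)..COPY_BYTE(5) cover
         * bytes 0-5 and the final lb/sb at offset NBYTES-2 = 6 covers the
         * seventh and last possible byte.
         */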
        COPY_BYTE(0)
        COPY_BYTE(1)
#ifdef USE_DOUBLE
        COPY_BYTE(2)
        COPY_BYTE(3)
        COPY_BYTE(4)
        COPY_BYTE(5)
#endif
EXC(    lb      t0, NBYTES-2(src), .Ll_exc)
        SUB     len, len, 1
        jr      ra
         sb     t0, NBYTES-2(dst)
.Ldone:
        jr      ra
         nop
        END(__copy_user_inatomic)
.Ll_exc_copy:
        /*
         * Copy bytes from src until faulting load address (or until a
         * lb faults)
         *
         * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
         * may be more than a byte beyond the last address.
         * Hence, the lb below may get an exception.
         *
         * Assumes src < THREAD_BUADDR($28)
         */
        LOAD    t0, TI_TASK($28)
         nop
        LOAD    t0, THREAD_BUADDR(t0)
1:
EXC(    lb      t1, 0(src),     .Ll_exc)
        ADD     src, src, 1
        sb      t1, 0(dst)      # can't fault -- we're copy_from_user
        .set    reorder                         /* DADDI_WAR */
        ADD     dst, dst, 1
        bne     src, t0, 1b
        .set    noreorder
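        /*
         * Note: the loop above copies byte by byte from the current src up
         * to the recorded bad address (t0).  Each lb is itself EXC-protected,
         * so if one of them hits the first genuinely inaccessible byte
         * before src reaches t0, we end up in .Ll_exc below via its
         * exception-table entry, which then computes the final uncopied
         * count.
         */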
.Ll_exc:
        LOAD    t0, TI_TASK($28)
         nop
        LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
         nop
        SUB     len, AT, t0             # len number of uncopied bytes
        jr      ra
         nop