octeon-memcpy.S
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License. See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2
/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * arch/mips/include/asm/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
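/*
 * For illustration (hypothetical numbers, not taken from the code below):
 * if __copy_user is asked to copy 100 bytes and a fault is taken after 60
 * bytes have already been written, it returns with len set to an upper
 * bound on the uncopied bytes: at least 40 in this case, possibly more,
 * never less.
 */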
/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contains the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
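/*
 * Worked example of the invariants above (illustrative values): with
 * src_entry = 0x1000 and len = 0x40, uaccess.h sets AT = 0x1040.  If a
 * load later faults at address 0x1028, the handler computes
 * len = AT - 0x1028 = 0x18 uncopied bytes, and, since dst and src were
 * incremented in tandem, dst + (0x1028 - src) is the first destination
 * byte still to be cleared (see l_exc below).
 */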
#define EXC(inst_reg,addr,handler)              \
9:      inst_reg, addr;                         \
        .section __ex_table,"a";                \
        PTR     9b, handler;                    \
        .previous
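/*
 * For example, on a 64-bit kernel EXC(LOAD t0, UNIT(0)(src), l_exc)
 * expands to:
 *
 * 9:      ld      t0, 0(a1)
 *         .section __ex_table,"a"
 *         PTR     9b, l_exc
 *         .previous
 *
 * i.e. the potentially faulting instruction gets an __ex_table entry
 * pointing at its fixup handler.
 */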
/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we are sharing the code base with the mips32 tree (which uses the o32
 * ABI register definitions), we need to redefine the register names from
 * the n64 ABI naming back to the o32 ABI naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0      $8
#define t1      $9
#define t2      $10
#define t3      $11
#define t4      $12
#define t5      $13
#define t6      $14
#define t7      $15

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SRA    sra
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
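/*
 * Example (big-endian 64-bit kernel, so LDFIRST = ldl and LDREST = ldr):
 * the pair
 *
 *         LDFIRST t0, FIRST(0)(src)       # ldl t0, 0(a1)
 *         LDREST  t0, REST(0)(src)        # ldr t0, 7(a1)
 *
 * assembles one possibly unaligned doubleword from src into t0.  The
 * STFIRST/STREST counterparts are defined for symmetry but are not used
 * below, since Octeon handles unaligned stores in hardware.
 */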
        .text
        .set    noreorder
        .set    noat

/*
 * t7 is used as a flag to note inatomic mode.
 */
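/*
 * (When t7 is non-zero, the load-fault handler l_exc below skips zeroing
 * the remainder of the destination buffer.)
 */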
LEAF(__copy_user_inatomic)
        b       __copy_user_common
        li      t7, 1
        END(__copy_user_inatomic)

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
        .align  5
LEAF(memcpy)                                    /* a0=dst a1=src a2=len */
        move    v0, dst                         /* return value */
__memcpy:
FEXPORT(__copy_user)
        li      t7, 0                           /* not inatomic */
__copy_user_common:
        /*
         * Note: dst & src may be unaligned, len may be 0
         * Temps
         */
        #
        # Octeon doesn't care if the destination is unaligned. The hardware
        # can fix it faster than we can special case the assembly.
        #
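        #
        # (This is why the stores below use plain STORE even when dst is
        # not NBYTES-aligned; only source alignment is special-cased.)
        #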
        pref    0, 0(src)
        sltu    t0, len, NBYTES                 # Check if < 1 word
        bnez    t0, copy_bytes_checklen
        and     t0, src, ADDRMASK               # Check if src unaligned
        bnez    t0, src_unaligned
        sltu    t0, len, 4*NBYTES               # Check if < 4 words
        bnez    t0, less_than_4units
        sltu    t0, len, 8*NBYTES               # Check if < 8 words
        bnez    t0, less_than_8units
        sltu    t0, len, 16*NBYTES              # Check if < 16 words
        bnez    t0, cleanup_both_aligned
        sltu    t0, len, 128+1                  # Check if len < 129
        bnez    t0, 1f                          # Skip prefetch if len is too short
        sltu    t0, len, 256+1                  # Check if len < 257
        bnez    t0, 1f                          # Skip prefetch if len is too short
        pref    0, 128(src)                     # We must not prefetch invalid addresses
        #
        # This is where we loop if there are more than 128 bytes left
2:      pref    0, 256(src)                     # We must not prefetch invalid addresses
        #
        # This is where we loop if we can't prefetch anymore
1:
EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
        SUB     len, len, 16*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p16u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p15u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p14u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p13u)
EXC(    LOAD    t0, UNIT(4)(src),       l_exc_copy)
EXC(    LOAD    t1, UNIT(5)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(6)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(7)(src),       l_exc_copy)
EXC(    STORE   t0, UNIT(4)(dst),       s_exc_p12u)
EXC(    STORE   t1, UNIT(5)(dst),       s_exc_p11u)
EXC(    STORE   t2, UNIT(6)(dst),       s_exc_p10u)
        ADD     src, src, 16*NBYTES
EXC(    STORE   t3, UNIT(7)(dst),       s_exc_p9u)
        ADD     dst, dst, 16*NBYTES
EXC(    LOAD    t0, UNIT(-8)(src),      l_exc_copy)
EXC(    LOAD    t1, UNIT(-7)(src),      l_exc_copy)
EXC(    LOAD    t2, UNIT(-6)(src),      l_exc_copy)
EXC(    LOAD    t3, UNIT(-5)(src),      l_exc_copy)
EXC(    STORE   t0, UNIT(-8)(dst),      s_exc_p8u)
EXC(    STORE   t1, UNIT(-7)(dst),      s_exc_p7u)
EXC(    STORE   t2, UNIT(-6)(dst),      s_exc_p6u)
EXC(    STORE   t3, UNIT(-5)(dst),      s_exc_p5u)
EXC(    LOAD    t0, UNIT(-4)(src),      l_exc_copy)
EXC(    LOAD    t1, UNIT(-3)(src),      l_exc_copy)
EXC(    LOAD    t2, UNIT(-2)(src),      l_exc_copy)
EXC(    LOAD    t3, UNIT(-1)(src),      l_exc_copy)
EXC(    STORE   t0, UNIT(-4)(dst),      s_exc_p4u)
EXC(    STORE   t1, UNIT(-3)(dst),      s_exc_p3u)
EXC(    STORE   t2, UNIT(-2)(dst),      s_exc_p2u)
EXC(    STORE   t3, UNIT(-1)(dst),      s_exc_p1u)
        sltu    t0, len, 256+1                  # See if we can prefetch more
        beqz    t0, 2b
        sltu    t0, len, 128                    # See if we can loop one more time
        beqz    t0, 1b
        nop
        #
        # Jump here if there are less than 16*NBYTES left.
        #
cleanup_both_aligned:
        beqz    len, done
        sltu    t0, len, 8*NBYTES
        bnez    t0, less_than_8units
        nop
EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
        SUB     len, len, 8*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p8u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p7u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p6u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p5u)
EXC(    LOAD    t0, UNIT(4)(src),       l_exc_copy)
EXC(    LOAD    t1, UNIT(5)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(6)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(7)(src),       l_exc_copy)
EXC(    STORE   t0, UNIT(4)(dst),       s_exc_p4u)
EXC(    STORE   t1, UNIT(5)(dst),       s_exc_p3u)
EXC(    STORE   t2, UNIT(6)(dst),       s_exc_p2u)
EXC(    STORE   t3, UNIT(7)(dst),       s_exc_p1u)
        ADD     src, src, 8*NBYTES
        beqz    len, done
        ADD     dst, dst, 8*NBYTES
        #
        # Jump here if there are less than 8*NBYTES left.
        #
less_than_8units:
        sltu    t0, len, 4*NBYTES
        bnez    t0, less_than_4units
        nop
EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
        SUB     len, len, 4*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
        ADD     src, src, 4*NBYTES
        beqz    len, done
        ADD     dst, dst, 4*NBYTES
        #
        # Jump here if there are less than 4*NBYTES left. This means
        # we may need to copy up to 3 NBYTES words.
        #
less_than_4units:
        sltu    t0, len, 1*NBYTES
        bnez    t0, copy_bytes_checklen
        nop
        #
        # 1) Copy NBYTES, then check length again
        #
EXC(    LOAD    t0, 0(src),             l_exc)
        SUB     len, len, NBYTES
        sltu    t1, len, 8
EXC(    STORE   t0, 0(dst),             s_exc_p1u)
        ADD     src, src, NBYTES
        bnez    t1, copy_bytes_checklen
        ADD     dst, dst, NBYTES
        #
        # 2) Copy NBYTES, then check length again
        #
EXC(    LOAD    t0, 0(src),             l_exc)
        SUB     len, len, NBYTES
        sltu    t1, len, 8
EXC(    STORE   t0, 0(dst),             s_exc_p1u)
        ADD     src, src, NBYTES
        bnez    t1, copy_bytes_checklen
        ADD     dst, dst, NBYTES
        #
        # 3) Copy NBYTES, then check length again
        #
EXC(    LOAD    t0, 0(src),             l_exc)
        SUB     len, len, NBYTES
        ADD     src, src, NBYTES
        ADD     dst, dst, NBYTES
        b       copy_bytes_checklen
EXC(    STORE   t0, -8(dst),            s_exc_p1u)

src_unaligned:
#define rem t8
        SRL     t0, len, LOG_NBYTES+2   # +2 for 4 units/iter
        beqz    t0, cleanup_src_unaligned
        and     rem, len, (4*NBYTES-1)  # rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
EXC(    LDFIRST t1, FIRST(1)(src),      l_exc_copy)
        SUB     len, len, 4*NBYTES
EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
EXC(    LDREST  t1, REST(1)(src),       l_exc_copy)
EXC(    LDFIRST t2, FIRST(2)(src),      l_exc_copy)
EXC(    LDFIRST t3, FIRST(3)(src),      l_exc_copy)
EXC(    LDREST  t2, REST(2)(src),       l_exc_copy)
EXC(    LDREST  t3, REST(3)(src),       l_exc_copy)
        ADD     src, src, 4*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
        bne     len, rem, 1b
        ADD     dst, dst, 4*NBYTES

cleanup_src_unaligned:
        beqz    len, done
        and     rem, len, NBYTES-1      # rem = len % NBYTES
        beq     rem, len, copy_bytes
        nop
1:
EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
        SUB     len, len, NBYTES
EXC(    STORE   t0, 0(dst),             s_exc_p1u)
        ADD     src, src, NBYTES
        bne     len, rem, 1b
        ADD     dst, dst, NBYTES

copy_bytes_checklen:
        beqz    len, done
        nop
copy_bytes:
        /* 0 < len < NBYTES */
#define COPY_BYTE(N)                    \
EXC(    lb      t0, N(src), l_exc);     \
        SUB     len, len, 1;            \
        beqz    len, done;              \
EXC(    sb      t0, N(dst), s_exc_p1)

        COPY_BYTE(0)
        COPY_BYTE(1)
#ifdef USE_DOUBLE
        COPY_BYTE(2)
        COPY_BYTE(3)
        COPY_BYTE(4)
        COPY_BYTE(5)
#endif
EXC(    lb      t0, NBYTES-2(src), l_exc)
        SUB     len, len, 1
        jr      ra
EXC(    sb      t0, NBYTES-2(dst), s_exc_p1)
done:
        jr      ra
        nop
        END(memcpy)

l_exc_copy:
        /*
         * Copy bytes from src until faulting load address (or until a
         * lb faults)
         *
         * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
         * may be more than a byte beyond the last address.
         * Hence, the lb below may get an exception.
         *
         * Assumes src < THREAD_BUADDR($28)
         */
        LOAD    t0, TI_TASK($28)
        LOAD    t0, THREAD_BUADDR(t0)
1:
EXC(    lb      t1, 0(src), l_exc)
        ADD     src, src, 1
        sb      t1, 0(dst)              # can't fault -- we're copy_from_user
        bne     src, t0, 1b
        ADD     dst, dst, 1
l_exc:
        LOAD    t0, TI_TASK($28)
        LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
        SUB     len, AT, t0             # len = number of uncopied bytes
        bnez    t7, 2f                  /* Skip the zeroing out part if inatomic */
        /*
         * Here's where we rely on src and dst being incremented in tandem,
         * see (3) above.
         * dst += (fault addr - src) to put dst at first byte to clear
         */
        ADD     dst, t0                 # compute start address (in dst) of bytes to clear
        SUB     dst, src
        /*
         * Clear len bytes starting at dst. Can't call __bzero because it
         * might modify len. An inefficient loop for these rare times...
         */
        beqz    len, done
        SUB     src, len, 1
1:      sb      zero, 0(dst)
        ADD     dst, dst, 1
        bnez    src, 1b
        SUB     src, src, 1
2:      jr      ra
        nop
#define SEXC(n)                         \
s_exc_p ## n ## u:                      \
        jr      ra;                     \
        ADD     len, len, n*NBYTES
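/*
 * For example, SEXC(16) expands to
 *
 * s_exc_p16u:
 *         jr      ra
 *         ADD     len, len, 16*NBYTES
 *
 * i.e. a store fault taken while 16 units were still outstanding adds
 * those 16*NBYTES back onto len before returning.
 */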
SEXC(16)
SEXC(15)
SEXC(14)
SEXC(13)
SEXC(12)
SEXC(11)
SEXC(10)
SEXC(9)
SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
        jr      ra
        ADD     len, len, 1
s_exc:
        jr      ra
        nop
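/*
 * memmove: when the regions cannot overlap in the copy direction, fall
 * back to the forward memcpy above; otherwise copy one byte at a time
 * (backwards when dst > src).  Roughly, as illustrative C:
 *
 *      if (!(src < dst + len && dst < src + len))
 *              return memcpy(dst, src, len);   // regions don't overlap
 *      // else fall through to __rmemcpy
 */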
        .align  5
LEAF(memmove)
        ADD     t0, a0, a2
        ADD     t1, a1, a2
        sltu    t0, a1, t0              # dst + len <= src -> memcpy
        sltu    t1, a0, t1              # dst >= src + len -> memcpy
        and     t0, t1
        beqz    t0, __memcpy
        move    v0, a0                  /* return value */
        beqz    a2, r_out
        END(memmove)

        /* fall through to __rmemcpy */
LEAF(__rmemcpy)                                 /* a0=dst a1=src a2=len */
        sltu    t0, a1, a0
        beqz    t0, r_end_bytes_up      # src >= dst
        nop
        ADD     a0, a2                  # dst = dst + len
        ADD     a1, a2                  # src = src + len

r_end_bytes:
        lb      t0, -1(a1)
        SUB     a2, a2, 0x1
        sb      t0, -1(a0)
        SUB     a1, a1, 0x1
        bnez    a2, r_end_bytes
        SUB     a0, a0, 0x1

r_out:
        jr      ra
        move    a2, zero

r_end_bytes_up:
        lb      t0, (a1)
        SUB     a2, a2, 0x1
        sb      t0, (a0)
        ADD     a1, a1, 0x1
        bnez    a2, r_end_bytes_up
        ADD     a0, a0, 0x1
        jr      ra
        move    a2, zero
        END(__rmemcpy)