memcpy_mck.S — Itanium 2 (McKinley) optimized memcpy / __copy_user.
  1. /*
  2. * Itanium 2-optimized version of memcpy and copy_user function
  3. *
  4. * Inputs:
  5. * in0: destination address
  6. * in1: source address
  7. * in2: number of bytes to copy
  8. * Output:
  9. * for memcpy: return dest
  10. * for copy_user: return 0 if success,
  11. * or number of byte NOT copied if error occurred.
  12. *
  13. * Copyright (C) 2002 Intel Corp.
  14. * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
  15. */
#include <linux/config.h>
#include <asm/asmmacro.h>	/* GLOBAL_ENTRY/END, EX() exception-table annotation, TEXT_ALIGN */
#include <asm/page.h>		/* PAGE_SHIFT, used by the fault handlers below */

/* EK() is a second exception-table annotation identical to EX(); it exists so
 * the paired loads/stores of a bundle can be visually distinguished. */
#define EK(y...) EX(y)

/* McKinley specific optimization */

/* Symbolic names for the scratch/static registers used throughout. */
#define retval		r8	// function return value
#define saved_pfs	r31	// caller's ar.pfs, restored on exit and in fault paths
#define saved_lc	r10	// caller's ar.lc (loop count register)
#define saved_pr	r11	// caller's predicate registers
#define saved_in0	r14	// original dest pointer (needed for fault recovery)
#define saved_in1	r15	// original src pointer
#define saved_in2	r16	// original byte count
#define src0		r2	// "even" source pointer
#define src1		r3	// "odd" source pointer
#define dst0		r17	// "even" dest pointer
#define dst1		r18	// "odd" dest pointer
#define cnt		r9	// trip count staged into ar.lc

/* r19-r30 are temp for each code section */
#define PREFETCH_DIST	8	// cache lines of lookahead in .long_copy
#define src_pre_mem	r19	// prefetch-from-memory source pointer
#define dst_pre_mem	r20	// prefetch-from-memory dest pointer
#define src_pre_l2	r21	// prefetch-from-L2 source pointer
#define dst_pre_l2	r22	// prefetch-from-L2 dest pointer
#define t1		r23
#define t2		r24
#define t3		r25
#define t4		r26
/* t5..t14 deliberately reuse earlier temps: by the time the alias is written,
 * the original value has already been stored (see .line_copy scheduling). */
#define t5		t1	// alias!
#define t6		t2	// alias!
#define t7		t3	// alias!
#define n8		r27
#define t9		t5	// alias!
#define t10		t4	// alias!
#define t11		t7	// alias!
#define t12		t6	// alias!
#define t14		t10	// alias!
#define t13		r28
#define t15		r29
#define tmp		r30

/* defines for long_copy block: software-pipeline stage indices into p[].
 * A = memory-prefetch load stage, B = memory-prefetch store stage,
 * C = L2-prefetch stage, D = copy stage, N = total stages. */
#define	A	0
#define B	(PREFETCH_DIST)
#define C	(B + PREFETCH_DIST)
#define D	(C + 1)
#define N	(D + 1)
#define Nrot	((N + 7) & ~7)	// rotating-register count, rounded to multiple of 8

/* alias: the three stacked input registers */
#define in0		r32	// dest
#define in1		r33	// src
#define in2		r34	// length
/*
 * void *memcpy(void *dst, const void *src, size_t len)
 * Sets up the memcpy-specific state and falls into the shared copy engine.
 * f6=f0 (0.0) tags this call as memcpy; .ex_handler_short tests f6 with
 * fcmp to decide whether a fault should oops (memcpy) or recover (copy_user).
 */
GLOBAL_ENTRY(memcpy)
	and	r28=0x7,in0		// dest alignment (low 3 bits)
	and	r29=0x7,in1		// src alignment
	mov	f6=f0			// mark as memcpy (vs copy_user's f1)
	mov	retval=in0		// memcpy returns the dest pointer
	br.cond.sptk .common_code
	;;
END(memcpy)
/*
 * unsigned long __copy_user(void *dst, const void *src, unsigned long len)
 * Returns 0 on success, or the number of bytes NOT copied on fault.
 * Saves the original arguments so the exception handlers can compute
 * progress, then dispatches on size and alignment:
 *   len < 8          -> .memcpy_short (byte loop)
 *   dst misaligned   -> .align_dest   (crawl dest to 8-byte boundary)
 *   src misaligned   -> .unaligned_src (ld8/ld8/shrp/st8 jump table)
 *   both aligned     -> fall through to .aligned_src
 */
GLOBAL_ENTRY(__copy_user)
	.prologue
// check dest alignment
	and	r28=0x7,in0
	and	r29=0x7,in1
	mov	f6=f1			// mark as copy_user (vs memcpy's f0)
	mov	saved_in0=in0		// save dest pointer
	mov	saved_in1=in1		// save src pointer
	mov	retval=r0		// initialize return value (0 = success)
	;;
.common_code:
	cmp.gt	p15,p0=8,in2		// check for small size (< 8 bytes)
	cmp.ne	p13,p0=0,r28		// check dest alignment
	cmp.ne	p14,p0=0,r29		// check src alignment
	add	src0=0,in1
	sub	r30=8,r28		// bytes to crawl, for .align_dest
	mov	saved_in2=in2		// save len
	;;
	add	dst0=0,in0
	add	dst1=1,in0		// dest odd index
	cmp.le	p6,p0 = 1,r30		// at least 1 crawl byte, for .align_dest
(p15)	br.cond.dpnt .memcpy_short
(p13)	br.cond.dpnt .align_dest
(p14)	br.cond.dpnt .unaligned_src
	;;
// both dest and src are aligned on 8-byte boundary.
// Strategy: lfetch both streams a cache line (128B) at a time, then copy in
// a 2-stage pipelined loop moving 32 bytes/iteration, then handle the
// sub-32-byte tail.  Copies >= 2*PREFETCH_DIST cache lines divert to
// .long_copy instead.
.aligned_src:
	.save ar.pfs, saved_pfs
	alloc	saved_pfs=ar.pfs,3,Nrot-3,0,Nrot	// rotating regs for .long_copy
	.save pr, saved_pr
	mov	saved_pr=pr
	shr.u	cnt=in2,7		// this many 128-byte cache lines
	;;
	cmp.lt	p6,p0=2*PREFETCH_DIST,cnt	// big enough for .long_copy?
	cmp.lt	p7,p8=1,cnt
	.save ar.lc, saved_lc
	mov	saved_lc=ar.lc
	.body
	add	cnt=-1,cnt
	add	src_pre_mem=0,in1	// prefetch src pointer
	add	dst_pre_mem=0,in0	// prefetch dest pointer
	;;
(p7)	mov	ar.lc=cnt		// prefetch count
(p8)	mov	ar.lc=r0		// < 2 lines: run prefetch loop once
(p6)	br.cond.dpnt .long_copy
	;;

// touch every cache line of both buffers up front (faulting lfetch so
// copy_user faults are taken here under .ex_handler coverage via the copy).
.prefetch:
	lfetch.fault	[src_pre_mem], 128
	lfetch.fault.excl [dst_pre_mem], 128
	br.cloop.dptk.few .prefetch
	;;

// 32 bytes per iteration via two interleaved 16-byte streams.
.medium_copy:
	and	tmp=31,in2		// copy length after iteration
	shr.u	r29=in2,5		// number of 32-byte iterations
	add	dst1=8,dst0		// 2nd dest pointer
	;;
	add	cnt=-1,r29		// ctop iteration adjustment
	cmp.eq	p10,p0=r29,r0		// do we really need to loop?
	add	src1=8,src0		// 2nd src pointer
	cmp.le	p6,p0=8,tmp		// tail has a 1st 8-byte word?
	;;
	cmp.le	p7,p0=16,tmp		// tail has a 2nd 8-byte word?
	mov	ar.lc=cnt		// loop setup
	cmp.eq	p16,p17 = r0,r0		// p16=1, p17=0: start pipeline
	mov	ar.ec=2			// 2-stage pipeline (load, then store)
(p10)	br.dpnt.few .aligned_src_tail
	;;
	TEXT_ALIGN(32)
1:	// stage p16 loads, stage p17 stores the previous iteration's data
EX(.ex_handler, (p16)	ld8	r34=[src0],16)
EK(.ex_handler, (p16)	ld8	r38=[src1],16)
EX(.ex_handler, (p17)	st8	[dst0]=r33,16)
EK(.ex_handler, (p17)	st8	[dst1]=r37,16)
	;;
EX(.ex_handler, (p16)	ld8	r32=[src0],16)
EK(.ex_handler, (p16)	ld8	r36=[src1],16)
EX(.ex_handler, (p16)	st8	[dst0]=r34,16)
EK(.ex_handler, (p16)	st8	[dst1]=r38,16)
	br.ctop.dptk.few 1b
	;;

// copy up to three remaining 8-byte words, then branch to .memcpy_short
// for the final 0-7 bytes.
.aligned_src_tail:
EX(.ex_handler, (p6)	ld8	t1=[src0])
	mov	ar.lc=saved_lc
	mov	ar.pfs=saved_pfs
EX(.ex_hndlr_s, (p7)	ld8	t2=[src1],8)
	cmp.le	p8,p0=24,tmp		// tail has a 3rd 8-byte word?
	and	r21=-8,tmp		// tail length rounded down to 8
	;;
EX(.ex_hndlr_s, (p8)	ld8	t3=[src1])
EX(.ex_handler, (p6)	st8	[dst0]=t1)	// store word 1
	and	in2=7,tmp		// remaining length for .memcpy_short
EX(.ex_hndlr_d, (p7)	st8	[dst1]=t2,8)	// store word 2
	add	src0=src0,r21		// setting up src pointer
	add	dst0=dst0,r21		// setting up dest pointer
	;;
EX(.ex_handler, (p8)	st8	[dst1]=t3)	// store word 3
	mov	pr=saved_pr,-1
	br.dptk.many .memcpy_short
	;;
/* code taken from copy_page_mck */
/* Software-pipelined copy of one 128-byte cache line per iteration with a
 * 4-stage rotating-predicate pipeline:
 *   p[A]: ld8 from memory (prefetch into v[] rotating regs)
 *   p[B]: st8 of v[] (prefetches dst line, also stores 8 real bytes)
 *   p[C]: ld8 from L2 (n8) + early t1/t3 loads
 *   p[D]: the main 16x ld8/st8 copy of the line
 * The prologue below primes stages A/B for 2*PREFETCH_DIST iterations, then
 * the full .line_copy loop runs for the remaining lines.  Leftover bytes
 * (in2 & 127) are finished by branching back to .medium_copy.
 */
.long_copy:
	.rotr v[2*PREFETCH_DIST]	// rotating buffer for prefetched words
	.rotp p[N]			// rotating stage predicates

	mov src_pre_mem = src0
	mov pr.rot = 0x10000		// only p16 (= p[A]) set initially
	mov ar.ec = 1			// special unrolled loop

	mov dst_pre_mem = dst0

	add src_pre_l2 = 8*8, src0	// L2 prefetch runs 64 bytes ahead
	add dst_pre_l2 = 8*8, dst0
	;;
	add src0 = 8, src_pre_mem	// first t1 src
	mov ar.lc = 2*PREFETCH_DIST - 1
	shr.u cnt=in2,7			// number of lines
	add src1 = 3*8, src_pre_mem	// first t3 src
	add dst0 = 8, dst_pre_mem	// first t1 dst
	add dst1 = 3*8, dst_pre_mem	// first t3 dst
	;;
	and tmp=127,in2			// remaining bytes after this block
	add cnt = -(2*PREFETCH_DIST) - 1, cnt
	// same as .line_copy loop, but with all predicated-off instructions removed:
.prefetch_loop:
EX(.ex_hndlr_lcpy_1, (p[A])	ld8 v[A] = [src_pre_mem], 128)		// M0
EK(.ex_hndlr_lcpy_1, (p[B])	st8 [dst_pre_mem] = v[B], 128)		// M2
	br.ctop.sptk .prefetch_loop
	;;
	cmp.eq p16, p0 = r0, r0		// reset p16 to 1
	mov ar.lc = cnt
	mov ar.ec = N			// # of stages in pipeline
	;;
.line_copy:
EX(.ex_handler,	(p[D])	ld8 t2 = [src0], 3*8)			// M0
EK(.ex_handler,	(p[D])	ld8 t4 = [src1], 3*8)			// M1
EX(.ex_handler_lcpy,	(p[B])	st8 [dst_pre_mem] = v[B], 128)	// M2 prefetch dst from memory
EK(.ex_handler_lcpy,	(p[D])	st8 [dst_pre_l2] = n8, 128)	// M3 prefetch dst from L2
	;;
EX(.ex_handler_lcpy,	(p[A])	ld8 v[A] = [src_pre_mem], 128)	// M0 prefetch src from memory
EK(.ex_handler_lcpy,	(p[C])	ld8 n8 = [src_pre_l2], 128)	// M1 prefetch src from L2
EX(.ex_handler,	(p[D])	st8 [dst0] = t1, 8)			// M2
EK(.ex_handler,	(p[D])	st8 [dst1] = t3, 8)			// M3
	;;
	// the t5..t15 temps alias t1..t7/t4 etc.; each alias is only written
	// after the aliased value has been stored (see #define block above).
EX(.ex_handler,	(p[D])	ld8 t5 = [src0], 8)
EK(.ex_handler,	(p[D])	ld8 t7 = [src1], 3*8)
EX(.ex_handler,	(p[D])	st8 [dst0] = t2, 3*8)
EK(.ex_handler,	(p[D])	st8 [dst1] = t4, 3*8)
	;;
EX(.ex_handler,	(p[D])	ld8 t6 = [src0], 3*8)
EK(.ex_handler,	(p[D])	ld8 t10 = [src1], 8)
EX(.ex_handler,	(p[D])	st8 [dst0] = t5, 8)
EK(.ex_handler,	(p[D])	st8 [dst1] = t7, 3*8)
	;;
EX(.ex_handler,	(p[D])	ld8 t9 = [src0], 3*8)
EK(.ex_handler,	(p[D])	ld8 t11 = [src1], 3*8)
EX(.ex_handler,	(p[D])	st8 [dst0] = t6, 3*8)
EK(.ex_handler,	(p[D])	st8 [dst1] = t10, 8)
	;;
EX(.ex_handler,	(p[D])	ld8 t12 = [src0], 8)
EK(.ex_handler,	(p[D])	ld8 t14 = [src1], 8)
EX(.ex_handler,	(p[D])	st8 [dst0] = t9, 3*8)
EK(.ex_handler,	(p[D])	st8 [dst1] = t11, 3*8)
	;;
EX(.ex_handler,	(p[D])	ld8 t13 = [src0], 4*8)
EK(.ex_handler,	(p[D])	ld8 t15 = [src1], 4*8)
EX(.ex_handler,	(p[D])	st8 [dst0] = t12, 8)
EK(.ex_handler,	(p[D])	st8 [dst1] = t14, 8)
	;;
EX(.ex_handler,	(p[C])	ld8 t1 = [src0], 8)	// load next line's t1/t3 early
EK(.ex_handler,	(p[C])	ld8 t3 = [src1], 8)
EX(.ex_handler,	(p[D])	st8 [dst0] = t13, 4*8)
EK(.ex_handler,	(p[D])	st8 [dst1] = t15, 4*8)
	br.ctop.sptk .line_copy
	;;
	// undo the +8 bias applied for the "first t1" pointers, then let
	// .medium_copy finish the remaining (in2 & 127) bytes.
	add dst0=-8,dst0
	add src0=-8,src0
	mov in2=tmp
	.restore sp
	br.sptk.many .medium_copy
	;;
#define BLOCK_SIZE	128*32	// 4 KB: work unit so prefetch stays bounded
#define blocksize	r23
#define curlen		r24

// dest is on 8-byte boundary, src is not. We need to do
// ld8-ld8, shrp, then st8.  Max 8 byte copy per cycle.
// The copy is done in BLOCK_SIZE chunks (.4k_block loops back via
// .unaligned_src_tail while more than blocksize remains).
.unaligned_src:
	.prologue
	.save ar.pfs, saved_pfs
	alloc	saved_pfs=ar.pfs,3,5,0,8	// 8 rotating regs for COPYU pipeline
	.save ar.lc, saved_lc
	mov	saved_lc=ar.lc
	.save pr, saved_pr
	mov	saved_pr=pr
	.body
.4k_block:
	mov	saved_in0=dst0		// need to save all input arguments
	mov	saved_in2=in2
	mov	blocksize=BLOCK_SIZE
	;;
	cmp.lt	p6,p7=blocksize,in2
	mov	saved_in1=src0
	;;
(p6)	mov	in2=blocksize		// clamp this pass to one block
	;;
	shr.u	r21=in2,7		// this many 128-byte cache lines
	shr.u	r22=in2,4		// number of 16-byte iterations
	and	curlen=15,in2		// copy length after iteration
	and	r30=7,src0		// source alignment (shift selector)
	;;
	cmp.lt	p7,p8=1,r21
	add	cnt=-1,r21
	;;
	add	src_pre_mem=0,src0	// prefetch src pointer
	add	dst_pre_mem=0,dst0	// prefetch dest pointer
	and	src0=-8,src0		// 1st src pointer, rounded down to 8
(p7)	mov	ar.lc = cnt
(p8)	mov	ar.lc = r0
	;;
	TEXT_ALIGN(32)
1:	lfetch.fault	[src_pre_mem], 128
	lfetch.fault.excl [dst_pre_mem], 128
	br.cloop.dptk.few 1b
	;;
	shladd	dst1=r22,3,dst0		// 2nd dest pointer (past the loop)
	shladd	src1=r22,3,src0		// 2nd src pointer
	cmp.eq	p8,p9=r22,r0		// do we really need to loop?
	cmp.le	p6,p7=8,curlen;		// have at least 8 byte remaining?
	add	cnt=-1,r22		// ctop iteration adjustment
	;;
EX(.ex_handler, (p9)	ld8	r33=[src0],8)	// loop primer
EK(.ex_handler, (p9)	ld8	r37=[src1],8)
(p8)	br.dpnt.few .noloop
	;;
// The jump address is calculated based on src alignment. The COPYU
// macro below needs to confine its size to a power of two, so an entry
// can be calculated using shl instead of an expensive multiply. The
// size is then hard coded by the following #define to match the
// actual size.  This makes it somewhat tedious when the COPYU macro gets
// changed and this needs to be adjusted to match.
#define LOOP_SIZE 6	// each COPYU instance is 2^6 = 64 bytes
1:
	mov	r29=ip			// jmp_table thread
	mov	ar.lc=cnt
	;;
	// ip-relative address of the COPYU entry: table base minus one slot
	// (r30 is 1..7, COPYU(8) occupies the slot "before" .jmp1)
	add	r29=.jump_table - 1b - (.jmp1-.jump_table), r29
	shl	r28=r30, LOOP_SIZE	// jmp_table thread
	mov	ar.ec=2			// loop setup
	;;
	add	r29=r29,r28		// jmp_table thread
	cmp.eq	p16,p17=r0,r0
	;;
	mov	b6=r29			// jmp_table thread
	;;
	br.cond.sptk.few b6

// for 8-15 byte case
// We will skip the loop, but need to replicate the side effect
// that the loop produces (r21 = next 8 merged bytes, pointers advanced).
.noloop:
EX(.ex_handler, (p6)	ld8	r37=[src1],8)
	add	src0=8,src0
(p6)	shl	r25=r30,3		// src misalignment in bits
	;;
EX(.ex_handler, (p6)	ld8	r27=[src1])
(p6)	shr.u	r28=r37,r25
(p6)	sub	r26=64,r25
	;;
(p6)	shl	r27=r27,r26
	;;
(p6)	or	r21=r28,r27		// merge the two halves into one word

.unaligned_src_tail:
/* check if we have more than blocksize to copy, if so go back */
	cmp.gt	p8,p0=saved_in2,blocksize
	;;
(p8)	add	dst0=saved_in0,blocksize
(p8)	add	src0=saved_in1,blocksize
(p8)	sub	in2=saved_in2,blocksize
(p8)	br.dpnt .4k_block
	;;

/* we have up to 15 byte to copy in the tail.
 * part of work is already done in the jump table code
 * we are at the following state.
 * src side:
 *
 *   xxxxxx xx                   <----- r21 has xxxxxxxx already
 * -------- -------- --------
 * 0        8        16
 *          ^
 *          |
 *          src1
 *
 * dst
 * -------- -------- --------
 * ^
 * |
 * dst1
 */
EX(.ex_handler, (p6)	st8	[dst1]=r21,8)	// more than 8 byte to copy
(p6)	add	curlen=-8,curlen	// update length
	mov	ar.pfs=saved_pfs
	;;
	mov	ar.lc=saved_lc
	mov	pr=saved_pr,-1
	mov	in2=curlen	// remaining length
	mov	dst0=dst1	// dest pointer
	add	src0=src1,r30	// forward by src alignment
	;;		// fall through to .memcpy_short for the last 0-7 bytes
// 7 byte or smaller.
// Fully unrolled byte copy: two interleaved byte streams (src0/dst0 even
// offsets, src1/dst1 odd offsets), with a predicated early return after
// each count is satisfied.  pN pairs encode "len >= k" / "len < k".
.memcpy_short:
	cmp.le	p8,p9   = 1,in2
	cmp.le	p10,p11 = 2,in2
	cmp.le	p12,p13 = 3,in2
	cmp.le	p14,p15 = 4,in2
	add	src1=1,src0	// second src pointer
	add	dst1=1,dst0	// second dest pointer
	;;

EX(.ex_handler_short, (p8)	ld1	t1=[src0],2)
EK(.ex_handler_short, (p10)	ld1	t2=[src1],2)
(p9)	br.ret.dpnt rp		// 0 byte copy
	;;

EX(.ex_handler_short, (p8)	st1	[dst0]=t1,2)
EK(.ex_handler_short, (p10)	st1	[dst1]=t2,2)
(p11)	br.ret.dpnt rp		// 1 byte copy

EX(.ex_handler_short, (p12)	ld1	t3=[src0],2)
EK(.ex_handler_short, (p14)	ld1	t4=[src1],2)
(p13)	br.ret.dpnt rp		// 2 byte copy
	;;

	cmp.le	p6,p7   = 5,in2
	cmp.le	p8,p9   = 6,in2
	cmp.le	p10,p11 = 7,in2

EX(.ex_handler_short, (p12)	st1	[dst0]=t3,2)
EK(.ex_handler_short, (p14)	st1	[dst1]=t4,2)
(p15)	br.ret.dpnt rp		// 3 byte copy
	;;

EX(.ex_handler_short, (p6)	ld1	t5=[src0],2)
EK(.ex_handler_short, (p8)	ld1	t6=[src1],2)
(p7)	br.ret.dpnt rp		// 4 byte copy
	;;

EX(.ex_handler_short, (p6)	st1	[dst0]=t5,2)
EK(.ex_handler_short, (p8)	st1	[dst1]=t6,2)
(p9)	br.ret.dptk rp		// 5 byte copy

EX(.ex_handler_short, (p10)	ld1	t7=[src0],2)
(p11)	br.ret.dptk rp		// 6 byte copy
	;;

EX(.ex_handler_short, (p10)	st1	[dst0]=t7,2)
	br.ret.dptk rp		// done all cases (7 bytes)
/* Align dest to nearest 8-byte boundary. We know we have at
 * least 7 bytes to copy, enough to crawl to 8-byte boundary.
 * Actual number of byte to crawl depend on the dest alignment.
 * 7 byte or less is taken care at .memcpy_short

 * src0 - source even index
 * src1 - source odd index
 * dst0 - dest even index
 * dst1 - dest odd index
 * r30  - distance to 8-byte boundary (set at .common_code)
 *
 * After the crawl, r28/r29 (the original dest/src misalignments) are
 * compared: equal means both pointers are now 8-byte aligned
 * (.aligned_src), otherwise the src is still misaligned (.unaligned_src).
 */
.align_dest:
	add	src1=1,in1	// source odd index
	cmp.le	p7,p0 = 2,r30	// for .align_dest
	cmp.le	p8,p0 = 3,r30	// for .align_dest
EX(.ex_handler_short, (p6)	ld1	t1=[src0],2)
	cmp.le	p9,p0 = 4,r30	// for .align_dest
	cmp.le	p10,p0 = 5,r30
	;;
EX(.ex_handler_short, (p7)	ld1	t2=[src1],2)
EK(.ex_handler_short, (p8)	ld1	t3=[src0],2)
	cmp.le	p11,p0 = 6,r30
EX(.ex_handler_short, (p6)	st1	[dst0] = t1,2)
	cmp.le	p12,p0 = 7,r30
	;;
EX(.ex_handler_short, (p9)	ld1	t4=[src1],2)
EK(.ex_handler_short, (p10)	ld1	t5=[src0],2)
EX(.ex_handler_short, (p7)	st1	[dst1] = t2,2)
EK(.ex_handler_short, (p8)	st1	[dst0] = t3,2)
	;;
EX(.ex_handler_short, (p11)	ld1	t6=[src1],2)
EK(.ex_handler_short, (p12)	ld1	t7=[src0],2)
	cmp.eq	p6,p7=r28,r29	// same misalignment -> now both aligned
EX(.ex_handler_short, (p9)	st1	[dst1] = t4,2)
EK(.ex_handler_short, (p10)	st1	[dst0] = t5,2)
	sub	in2=in2,r30	// account for the crawled bytes
	;;
EX(.ex_handler_short, (p11)	st1	[dst1] = t6,2)
EK(.ex_handler_short, (p12)	st1	[dst0] = t7)
	add	dst0=in0,r30	// setup arguments
	add	src0=in1,r30
(p6)	br.cond.dptk .aligned_src
(p7)	br.cond.dpnt .unaligned_src
	;;
/* main loop body in jump table format.
 * One instance per source misalignment (shift = 8*misalign bits).
 * 2-stage pipeline: stage p16 loads two 8-byte words and funnel-shifts the
 * previous pair (shrp) into an aligned word; stage p17 stores the results.
 * On exit it primes r22/r21 for .unaligned_src_tail.  Each instance must
 * assemble to exactly 2^LOOP_SIZE (64) bytes — see the dispatch code above.
 */
#define COPYU(shift)									\
1:											\
EX(.ex_handler,  (p16)	ld8	r32=[src0],8);		/* 1 */				\
EK(.ex_handler,  (p16)	ld8	r36=[src1],8);						\
		 (p17)	shrp	r35=r33,r34,shift;;	/* 1 */				\
EX(.ex_handler,  (p6)	ld8	r22=[src1]);	/* common, prime for tail section */	\
		 nop.m	0;								\
		 (p16)	shrp	r38=r36,r37,shift;					\
EX(.ex_handler,  (p17)	st8	[dst0]=r35,8);		/* 1 */				\
EK(.ex_handler,  (p17)	st8	[dst1]=r39,8);						\
		 br.ctop.dptk.few 1b;;							\
		 (p7)	add	src1=-8,src1;	/* back out for <8 byte case */		\
		 shrp	r21=r22,r38,shift;	/* speculative work */			\
		 br.sptk.few .unaligned_src_tail /* branch out of jump table */		\
		 ;;

	TEXT_ALIGN(32)
.jump_table:
	COPYU(8)	// unaligned cases: src misaligned by 1..7 bytes
.jmp1:
	COPYU(16)
	COPYU(24)
	COPYU(32)
	COPYU(40)
	COPYU(48)
	COPYU(56)
/* A..D were pipeline-stage indices above; reuse the names as registers
 * for the fault-recovery arithmetic below. */
#undef A
#undef B
#undef C
#undef D

/*
 * Due to lack of local tag support in gcc 2.x assembler, it is not clear which
 * instruction failed in the bundle.  The exception algorithm is that we
 * first figure out the faulting address, then detect if there is any
 * progress made on the copy, if so, redo the copy from last known copied
 * location up to the faulting address (exclusive). In the copy_from_user
 * case, remaining byte in kernel buffer will be zeroed.
 *
 * Take copy_from_user as an example, in the code there are multiple loads
 * in a bundle and those multiple loads could span over two pages, the
 * faulting address is calculated as page_round_down(max(src0, src1)).
 * This is based on knowledge that if we can access one byte in a page, we
 * can access any byte in that page.
 *
 * predicate used in the exception handler:
 * p6-p7: direction
 * p10-p11: src faulting addr calculation
 * p12-p13: dst faulting addr calculation
 */
#define A	r19
#define B	r20
#define C	r21
#define D	r22
#define F	r28

#define memset_arg0	r32
#define memset_arg2	r33

#define saved_retval		loc0
#define saved_rtlink		loc1
#define saved_pfs_stack		loc2

/* entry for faults in .aligned_src_tail: the tail loads use a src1 that has
 * not yet been post-incremented past src0; compensate before common code. */
.ex_hndlr_s:
	add	src0=8,src0
	br.sptk .ex_handler
	;;

/* same idea for the tail's dst1 store */
.ex_hndlr_d:
	add	dst0=8,dst0
	br.sptk .ex_handler
	;;

/* fault in .prefetch_loop: src0/dst0 were never advanced there, so
 * reconstruct plausible src0/src1 and dst0/dst1 from the prefetch
 * pointers and the original arguments. */
.ex_hndlr_lcpy_1:
	mov	src1=src_pre_mem
	mov	dst1=dst_pre_mem
	cmp.gtu	p10,p11=src_pre_mem,saved_in1
	cmp.gtu	p12,p13=dst_pre_mem,saved_in0
	;;
(p10)	add	src0=8,saved_in1
(p11)	mov	src0=saved_in1
(p12)	add	dst0=8,saved_in0
(p13)	mov	dst0=saved_in0
	br.sptk .ex_handler

.ex_handler_lcpy:
	// in line_copy block, the preload addresses should always ahead
	// of the other two src/dst pointers.  Furthermore, src1/dst1 should
	// always ahead of src0/dst0.
	mov	src1=src_pre_mem
	mov	dst1=dst_pre_mem

.ex_handler:
	mov	pr=saved_pr,-1		// first restore pr, lc, and pfs
	mov	ar.lc=saved_lc
	mov	ar.pfs=saved_pfs
	;;

.ex_handler_short: // fault occurred in these sections didn't change pr, lc, pfs
	cmp.ltu	p6,p7=saved_in0, saved_in1	// get the copy direction
	cmp.ltu	p10,p11=src0,src1
	cmp.ltu	p12,p13=dst0,dst1
	fcmp.eq	p8,p0=f6,f0		// is it memcpy?
	mov	tmp = dst0
	;;
(p11)	mov	src1 = src0		// pick the larger of the two
(p13)	mov	dst0 = dst1		// make dst0 the smaller one
(p13)	mov	dst1 = tmp		// and dst1 the larger one
	;;
(p6)	dep	F = r0,dst1,0,PAGE_SHIFT // usr dst round down to page boundary
(p7)	dep	F = r0,src1,0,PAGE_SHIFT // usr src round down to page boundary
	;;
(p6)	cmp.le	p14,p0=dst0,saved_in0	// no progress has been made on store
(p7)	cmp.le	p14,p0=src0,saved_in1	// no progress has been made on load
	mov	retval=saved_in2	// no progress: nothing was copied
(p8)	ld1	tmp=[src1]		// force an oops for memcpy call
(p8)	st1	[dst1]=r0		// force an oops for memcpy call
(p14)	br.ret.sptk.many rp

/*
 * The remaining byte to copy is calculated as:
 *
 * A =	(faulting_addr - orig_src)	-> len to faulting ld address
 *	or
 * 	(faulting_addr - orig_dst)	-> len to faulting st address
 * B =	(cur_dst - orig_dst)		-> len copied so far
 * C =	A - B				-> len need to be copied
 * D =	orig_len - A			-> len need to be zeroed
 */
(p6)	sub	A = F, saved_in0
(p7)	sub	A = F, saved_in1
	clrrrb				// leave rotating-register mode
	;;
	alloc	saved_pfs_stack=ar.pfs,3,3,3,0	// fresh frame for the calls below
	cmp.lt	p8,p0=A,r0
	sub	B = dst0, saved_in0	// how many byte copied so far
	;;
(p8)	mov	A = 0;			// A shouldn't be negative, cap it
	;;
	sub	C = A, B
	sub	D = saved_in2, A
	;;
	cmp.gt	p8,p0=C,r0		// more than 1 byte?
	add	memset_arg0=saved_in0, A
(p6)	mov	memset_arg2=0		// copy_to_user should not call memset
(p7)	mov	memset_arg2=D		// copy_from_user need to have kbuf zeroed
	mov	r8=0
	mov	saved_retval = D
	mov	saved_rtlink = b0

	add	out0=saved_in0, B
	add	out1=saved_in1, B
	mov	out2=C
(p8)	br.call.sptk.few b0=__copy_user	// recursive call to redo the copy

	;;
	add	saved_retval=saved_retval,r8	// above might return non-zero value
	cmp.gt	p8,p0=memset_arg2,r0	// more than 1 byte?
	mov	out0=memset_arg0	// *s
	mov	out1=r0			// c
	mov	out2=memset_arg2	// n
(p8)	br.call.sptk.few b0=memset	// zero the uncopied kernel-buffer tail
	;;

	mov	retval=saved_retval
	mov	ar.pfs=saved_pfs_stack
	mov	b0=saved_rtlink
	br.ret.sptk.many rp

/* end of McKinley specific optimization */
END(__copy_user)