copyuser_power7.S

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>
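
/*
 * Each errN macro plants a local label on the user access that follows it
 * and records a <label, fixup> pair in the __ex_table section.  If that
 * access faults, the exception handler branches to the matching .Ldo_errN
 * fixup below, which restores whatever had been set up at that depth
 * (non-volatile GPRs, the VMX context, the stack frame) and falls through
 * to .Ldo_err1.  There the original dst/src/len are reloaded and the copy
 * is redone with __copy_tofrom_user_base, which works out how many bytes
 * were left uncopied.
 */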
	.macro err1
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Ldo_err1
	.previous
	.endm

	.macro err2
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldo_err2
	.previous
	.endm

#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	.section __ex_table,"a"
	.align 3
	.llong 300b,.Ldo_err3
	.previous
	.endm

	.macro err4
400:
	.section __ex_table,"a"
	.align 3
	.llong 400b,.Ldo_err4
	.previous
	.endm

.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl	.exit_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,48(r1)
	ld	r4,56(r1)
	ld	r5,64(r1)
	b	__copy_tofrom_user_base

_GLOBAL(__copy_tofrom_user_power7)
#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096

	std	r3,48(r1)
	std	r4,56(r1)
	std	r5,64(r1)

	blt	.Lshort_copy
	bgt	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	std	r3,48(r1)
	std	r4,56(r1)
	std	r5,64(r1)

	blt	.Lshort_copy
#endif
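
/*
 * dst/src/len were stashed at 48/56/64(r1) above so that the fault
 * handlers and the VMX setup path can reload the original arguments.
 */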

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)
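	/*
	 * r6 = -src, so its low bits give the number of bytes needed to
	 * reach 8B alignment.  mtocrf 0x01 moves those bits into cr7:
	 * cr7*4+3 selects a byte copy, +2 a halfword and +1 a word.
	 */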
	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6
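
	/*
	 * r6 = len >> 7 is the number of whole 128B cache lines; the loop
	 * below moves one line per iteration using r0, r6-r12 and r14-r21,
	 * which is why the non-volatile GPRs were saved above.
	 */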
	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6
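
	/*
	 * Fewer than 128 bytes remain.  With r6 = len >> 4 in cr7,
	 * cr7*4+1 selects a 64B chunk, +2 a 32B chunk and +3 a 16B chunk;
	 * the final 0-15 bytes fall through to .Lshort_copy.
	 */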
6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	.enter_vmx_usercopy
	cmpwi	r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STACKFRAMESIZE+48(r1)
	ld	r4,STACKFRAMESIZE+56(r1)
	ld	r5,STACKFRAMESIZE+64(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	cr1,r7,0x3FF
	ble	cr1,1f
	li	r7,0x3FF

1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

.machine push
.machine "power4"
	dcbt	r0,r6,0b01000
	dcbt	r0,r7,0b01010
	dcbtst	r0,r9,0b01000
	dcbtst	r0,r10,0b01010
	eieio
	dcbt	r0,r8,0b01010	/* GO */
.machine pop
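
	/*
	 * The dcbt/dcbtst pairs above program two hardware prefetch
	 * streams (Power ISA enhanced data stream encoding): the
	 * TH=0b01000 forms supply each stream's start address and the
	 * TH=0b01010 forms its length in cachelines, depth and stream ID.
	 * The eieio orders that setup before the final dcbt, whose GO bit
	 * (r8) starts the streams.
	 */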

	beq	.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
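	/*
	 * rldicl. keeps just the low 4 bits of (src ^ dst): if they are
	 * non-zero the two pointers differ modulo 16, can never both be
	 * 16B aligned, and the vperm-based copy must be used.
	 */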
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)
	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	vr1,r0,r4
	addi	r4,r4,16
err3;	stvx	vr1,r0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	vr1,r0,r4
err3;	lvx	vr0,r4,r9
	addi	r4,r4,32
err3;	stvx	vr1,r0,r3
err3;	stvx	vr0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	vr3,r0,r4
err3;	lvx	vr2,r4,r9
err3;	lvx	vr1,r4,r10
err3;	lvx	vr0,r4,r11
	addi	r4,r4,64
err3;	stvx	vr3,r0,r3
err3;	stvx	vr2,r3,r9
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
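	/*
	 * r9-r12 and r14-r16 hold the offsets 16,32,...,112, so the eight
	 * lvx/stvx pairs below move one full 128B cacheline per iteration.
	 */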
	.align	5
8:
err4;	lvx	vr7,r0,r4
err4;	lvx	vr6,r4,r9
err4;	lvx	vr5,r4,r10
err4;	lvx	vr4,r4,r11
err4;	lvx	vr3,r4,r12
err4;	lvx	vr2,r4,r14
err4;	lvx	vr1,r4,r15
err4;	lvx	vr0,r4,r16
	addi	r4,r4,128
err4;	stvx	vr7,r0,r3
err4;	stvx	vr6,r3,r9
err4;	stvx	vr5,r3,r10
err4;	stvx	vr4,r3,r11
err4;	stvx	vr3,r3,r12
err4;	stvx	vr2,r3,r14
err4;	stvx	vr1,r3,r15
err4;	stvx	vr0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	vr3,r0,r4
err3;	lvx	vr2,r4,r9
err3;	lvx	vr1,r4,r10
err3;	lvx	vr0,r4,r11
	addi	r4,r4,64
err3;	stvx	vr3,r0,r3
err3;	stvx	vr2,r3,r9
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	vr1,r0,r4
err3;	lvx	vr0,r4,r9
	addi	r4,r4,32
err3;	stvx	vr1,r0,r3
err3;	stvx	vr0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	vr1,r0,r4
	addi	r4,r4,16
err3;	stvx	vr1,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
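	/*
	 * exit_vmx_usercopy() returns 0, so the tail call below also
	 * provides the "no bytes left uncopied" return value.
	 */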
	b	.exit_vmx_usercopy	/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)
	li	r9,16
	li	r10,32
	li	r11,48

	lvsl	vr16,0,r4	/* Setup permute control vector */
err3;	lvx	vr0,0,r4
	addi	r4,r4,16
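
	/*
	 * vr16 (from lvsl) encodes the source misalignment.  lvx ignores
	 * the low 4 bits of its address, so every load fetches an aligned
	 * quadword; each vperm below splices the previous aligned
	 * quadword (carried in vr0) with the next one to produce 16B of
	 * destination-aligned data.  r4 runs 16 bytes ahead here, which
	 * is undone at label 11.
	 */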

	bf	cr7*4+3,5f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
err3;	stvx	vr8,r0,r3
	addi	r3,r3,16
	vor	vr0,vr1,vr1

5:	bf	cr7*4+2,6f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
err3;	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
err3;	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
err3;	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
err3;	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	vr7,r0,r4
	vperm	vr8,vr0,vr7,vr16
err4;	lvx	vr6,r4,r9
	vperm	vr9,vr7,vr6,vr16
err4;	lvx	vr5,r4,r10
	vperm	vr10,vr6,vr5,vr16
err4;	lvx	vr4,r4,r11
	vperm	vr11,vr5,vr4,vr16
err4;	lvx	vr3,r4,r12
	vperm	vr12,vr4,vr3,vr16
err4;	lvx	vr2,r4,r14
	vperm	vr13,vr3,vr2,vr16
err4;	lvx	vr1,r4,r15
	vperm	vr14,vr2,vr1,vr16
err4;	lvx	vr0,r4,r16
	vperm	vr15,vr1,vr0,vr16
	addi	r4,r4,128
err4;	stvx	vr8,r0,r3
err4;	stvx	vr9,r3,r9
err4;	stvx	vr10,r3,r10
err4;	stvx	vr11,r3,r11
err4;	stvx	vr12,r3,r12
err4;	stvx	vr13,r3,r14
err4;	stvx	vr14,r3,r15
err4;	stvx	vr15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
err3;	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
err3;	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
err3;	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
err3;	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
err3;	stvx	vr8,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */