nva3_copy.fuc 20 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870
  1. /* fuc microcode for copy engine on nva3- chipsets
  2. *
  3. * Copyright 2011 Red Hat Inc.
  4. *
  5. * Permission is hereby granted, free of charge, to any person obtaining a
  6. * copy of this software and associated documentation files (the "Software"),
  7. * to deal in the Software without restriction, including without limitation
  8. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  9. * and/or sell copies of the Software, and to permit persons to whom the
  10. * Software is furnished to do so, subject to the following conditions:
  11. *
  12. * The above copyright notice and this permission notice shall be included in
  13. * all copies or substantial portions of the Software.
  14. *
  15. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  18. * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  19. * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  20. * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  21. * OTHER DEALINGS IN THE SOFTWARE.
  22. *
  23. * Authors: Ben Skeggs
  24. */
  25. /* To build for nva3:nvc0
  26. * m4 -DNVA3 nva3_copy.fuc | envyas -a -w -m fuc -V nva3 -o nva3_copy.fuc.h
  27. *
  28. * To build for nvc0-
  29. * m4 -DNVC0 nva3_copy.fuc | envyas -a -w -m fuc -V nva3 -o nvc0_copy.fuc.h
  30. */
  // Data segment
  //
  // The words from ctx_object up to the alignment directive form the channel
  // context.  swctx saves and restores a 256-byte block starting at data
  // offset 0, which is where these words live.
  ifdef(`NVA3',
  .section nva3_pcopy_data,
  .section nvc0_pcopy_data
  )

  // written by method 0x0000
  ctx_object:                   .b32 0

  // DMA object slots, only emitted for NVA3 builds.  cmd_dma fills them and
  // chsw writes them back to the hardware after a context load.
  ifdef(`NVA3',
  ctx_dma:
  ctx_dma_query:                .b32 0
  ctx_dma_src:                  .b32 0
  ctx_dma_dst:                  .b32 0
  ,)
  .equ ctx_dma_count 3

  // query (semaphore-style write back) state
  ctx_query_address_high:       .b32 0
  ctx_query_address_low:        .b32 0
  ctx_query_counter:            .b32 0

  // Source surface state.  The destination group below must keep exactly the
  // same field order: the surface helpers reach the destination fields by
  // adding (ctx_dst_address_high - ctx_src_address_high) to a source offset.
  ctx_src_address_high:         .b32 0
  ctx_src_address_low:          .b32 0
  ctx_src_pitch:                .b32 0
  ctx_src_tile_mode:            .b32 0
  ctx_src_xsize:                .b32 0
  ctx_src_ysize:                .b32 0
  ctx_src_zsize:                .b32 0
  ctx_src_zoff:                 .b32 0
  ctx_src_xoff:                 .b32 0
  ctx_src_yoff:                 .b32 0
  // not a method target: filled in by cmd_exec / cmd_exec_set_format
  ctx_src_cpp:                  .b32 0

  // destination surface state, same layout as the source group
  ctx_dst_address_high:         .b32 0
  ctx_dst_address_low:          .b32 0
  ctx_dst_pitch:                .b32 0
  ctx_dst_tile_mode:            .b32 0
  ctx_dst_xsize:                .b32 0
  ctx_dst_ysize:                .b32 0
  ctx_dst_zsize:                .b32 0
  ctx_dst_zoff:                 .b32 0
  ctx_dst_xoff:                 .b32 0
  ctx_dst_yoff:                 .b32 0
  // not a method target: filled in by cmd_exec / cmd_exec_set_format
  ctx_dst_cpp:                  .b32 0

  // copy format, swizzle constants and copy extent
  ctx_format:                   .b32 0
  ctx_swz_const0:               .b32 0
  ctx_swz_const1:               .b32 0
  ctx_xcnt:                     .b32 0
  ctx_ycnt:                     .b32 0
  .align 256
  // Method dispatch table, walked by dispatch
  //
  // The table is a sequence of groups.  Each group starts with a header
  //    .b16 first_index count
  // where an index is the method offset divided by 4.  The header is followed
  // by count entries of 8 bytes each:
  //    .b32 target   low 16 bits:  data address or handler address
  //                  high 16 bits: zero = store the method data at the address
  //                                nonzero = call the handler
  //    .b32 mask     bits that must be clear in the method data
  // Groups must stay sorted by ascending index because the lookup gives up as
  // soon as it meets a group that starts above the requested method.
  dispatch_table:
  // mthd 0x0000, NAME
  .b16 0x000 1
  .b32 ctx_object                       ~0xffffffff

  // mthd 0x0100, NOP
  .b16 0x040 1
  .b32 0x00010000 + cmd_nop             ~0xffffffff

  // mthd 0x0140, PM_TRIGGER
  .b16 0x050 1
  .b32 0x00010000 + cmd_pm_trigger      ~0xffffffff

  ifdef(`NVA3', `
  // mthd 0x0180-0x0188, DMA_
  // cmd_dma derives the DMA slot from the entry address relative to
  // dispatch_dma, so these entries must directly follow the label.
  .b16 0x060 ctx_dma_count
  dispatch_dma:
  .b32 0x00010000 + cmd_dma             ~0xffffffff
  .b32 0x00010000 + cmd_dma             ~0xffffffff
  .b32 0x00010000 + cmd_dma             ~0xffffffff
  ',)

  // mthd 0x0200-0x0218, SRC_TILE
  .b16 0x80 7
  .b32 ctx_src_tile_mode                ~0x00000fff
  .b32 ctx_src_xsize                    ~0x0007ffff
  .b32 ctx_src_ysize                    ~0x00001fff
  .b32 ctx_src_zsize                    ~0x000007ff
  .b32 ctx_src_zoff                     ~0x00000fff
  .b32 ctx_src_xoff                     ~0x0007ffff
  .b32 ctx_src_yoff                     ~0x00001fff

  // mthd 0x0220-0x0238, DST_TILE
  .b16 0x88 7
  .b32 ctx_dst_tile_mode                ~0x00000fff
  .b32 ctx_dst_xsize                    ~0x0007ffff
  .b32 ctx_dst_ysize                    ~0x00001fff
  .b32 ctx_dst_zsize                    ~0x000007ff
  .b32 ctx_dst_zoff                     ~0x00000fff
  .b32 ctx_dst_xoff                     ~0x0007ffff
  .b32 ctx_dst_yoff                     ~0x00001fff

  // mthd 0x0300-0x0304, EXEC, WRCACHE_FLUSH
  .b16 0xc0 2
  .b32 0x00010000 + cmd_exec            ~0xffffffff
  .b32 0x00010000 + cmd_wrcache_flush   ~0xffffffff

  // mthd 0x030c-0x0340, various stuff
  .b16 0xc3 14
  .b32 ctx_src_address_high             ~0x000000ff
  .b32 ctx_src_address_low              ~0xfffffff0
  .b32 ctx_dst_address_high             ~0x000000ff
  .b32 ctx_dst_address_low              ~0xfffffff0
  .b32 ctx_src_pitch                    ~0x0007ffff
  .b32 ctx_dst_pitch                    ~0x0007ffff
  .b32 ctx_xcnt                         ~0x0000ffff
  .b32 ctx_ycnt                         ~0x00001fff
  .b32 ctx_format                       ~0x0333ffff
  .b32 ctx_swz_const0                   ~0xffffffff
  .b32 ctx_swz_const1                   ~0xffffffff
  .b32 ctx_query_address_high           ~0x000000ff
  .b32 ctx_query_address_low            ~0xffffffff
  .b32 ctx_query_counter                ~0xffffffff

  // end marker: index 0x800 lies above every 11-bit method index, so the
  // lookup always stops here with ILLEGAL_MTHD
  .b16 0x800 0
  ifdef(`NVA3',
  .section nva3_pcopy_code,
  .section nvc0_pcopy_code
  )

  // Entry point: one-time setup, then idle until an interrupt arrives
  //
  // $r0 is cleared here and is used as a zero base / zero operand by the rest
  // of the microcode.  All real work is done from the i0 handler (ih).
  main:
      clear b32 $r0
      mov $sp $r0

      // install ih as the i0 interrupt vector
      mov $r1 ih
      mov $iv0 $r1

      // Route fifo and ctxswitch to that handler: bits 2 and 3 are the only
      // bits clear in 0xfff3.  The routing register is addressed relative to
      // the interrupt enable register held in $r1 (0x400 + 0x300).  $r2 only
      // carries the value, so it must not be used as the base: the previous
      // I[$r2 + 0x300] form wrote to I/O address 0xfff3 + 0x300 instead.
      mov $r1 0x400
      movw $r2 0xfff3
      sethi $r2 0
      iowr I[$r1 + 0x300] $r2

      // enable interrupts ($r2 becomes 0xffff once bits 2 and 3 are added)
      or $r2 0xc
      iowr I[$r1] $r2
      bset $flags ie0

      // enable fifo access and context switching
      mov $r1 0x1200
      mov $r2 3
      iowr I[$r1] $r2

      // sleep forever, waking for interrupts
      bset $flags $p0
  main_idle:
      sleep $p0
      bra main_idle
  // i0 interrupt handler
  //
  // Reads the pending interrupt bits from I[0x200], services a context switch
  // request (bit 3) before a fifo method (bit 2), then writes those two bits
  // back to I[0x100] (presumably the acknowledge register - confirm).
  //
  // $r1 holds the pending bits across both calls, so chsw and dispatch and
  // everything they call must leave $r1 untouched.  $r2 is scratch.
  ih:
      iord $r1 I[$r0 + 0x200]

      // bit 3: context switch request
      and $r2 $r1 0x00000008
      bra e ih_skip_chsw
      call chsw

  ih_skip_chsw:
      // bit 2: fifo method pending
      and $r2 $r1 0x00000004
      bra e ih_ack
      call dispatch

  ih_ack:
      // only the two sources handled above are written back
      and $r1 $r1 0x0000000c
      iowr I[$r0 + 0x100] $r1
      iret
  // Save or restore the channel context
  //
  // Inputs:
  // $p1: direction (0 = unload, 1 = load)
  // $r3: channel
  // Clobbers:
  // $r4 on NVA3; $r4-$r6, $r14 and $r15 on NVC0
  swctx:
      mov $r4 0x7700
      mov $xtargets $r4
  ifdef(`NVA3', `
      // target 7 hardcoded to ctx dma object
      mov $xdbase $r0
  ', ` // NVC0
      // read SCRATCH3 to decide if we are PCOPY0 or PCOPY1; the result is the
      // offset (0x30 or 0x40) of the PCOPYn info inside the channel
      mov $r4 0x2100
      iord $r4 I[$r4 + 0]
      and $r4 1
      shl b32 $r4 4
      add b32 $r4 0x30

      // channel is in vram
      mov $r15 0x61c
      shl b32 $r15 6
      mov $r5 0x114
      iowrs I[$r15] $r5

      // read 16-byte PCOPYn info, containing context pointer, from channel
      shl b32 $r5 $r3 4
      add b32 $r5 2
      mov $xdbase $r5

      // get a chunk of stack space, aligned to 256 byte boundary, to receive
      // the info block
      mov $r5 $sp
      sub b32 $r5 0x100
      mov $r6 0xff
      not b32 $r6
      and $r5 $r6
      sethi $r5 0x00020000
      xdld $r4 $r5
      xdwait
      // drop the transfer size bits again so $r5 is a plain data address
      sethi $r5 0

      // set context pointer, from within channel VM
      mov $r14 0
      iowrs I[$r15] $r14
      ld b32 $r4 D[$r5 + 0]
      shr b32 $r4 8
      ld b32 $r6 D[$r5 + 4]
      shl b32 $r6 24
      or $r4 $r6
      mov $xdbase $r4
  ')
      // 256-byte context, at start of data segment
      mov b32 $r4 $r0
      sethi $r4 0x60000

      // swap!
      bra $p1 swctx_fetch
      xdst $r0 $r4
      bra swctx_sync
  swctx_fetch:
      xdld $r0 $r4
  swctx_sync:
      // do not return before the transfer has completed
      xdwait
      ret
  // Handle a context switch request
  //
  // If a channel is currently active it is saved and marked inactive, and the
  // function returns.  Otherwise the pending channel, if there is one, is
  // loaded.  Either way a completion code is written to I[0x1600].
  //
  // Clobbers $r2-$r8, $r13 and $r15, plus whatever swctx clobbers.  $r1 is
  // left alone for ih.
  chsw:
      // read current channel
      mov $r2 0x1400
      iord $r3 I[$r2]

      // bit 30 set: the channel is active, so unload it and return
      xbit $r15 $r3 0x1e
      bra e chsw_try_load
      bclr $flags $p1
      call swctx
      bclr $r3 0x1e
      iowr I[$r2] $r3
      mov $r4 1
      iowr I[$r2 + 0x200] $r4
      ret

  chsw_try_load:
      // read next channel
      iord $r3 I[$r2 + 0x100]

      // is there a channel waiting to be loaded?
      xbit $r13 $r3 0x1e
      bra e chsw_load_done
      bset $flags $p1
      call swctx
  ifdef(`NVA3',
      // load dma objects back into TARGET regs
      // $r6 counts down from ctx_dma_count to 0 inclusive and the loop
      // ends when the subtraction borrows
      // NOTE review: the first pass therefore uses slot ctx_dma_count which
      // is one word past ctx_dma_dst - confirm this is intended
      mov $r5 ctx_dma
      mov $r6 ctx_dma_count
  chsw_reload_dma:
      ld b32 $r7 D[$r5 + $r6 * 4]
      add b32 $r8 $r6 0x180
      shl b32 $r8 8
      iowr I[$r8] $r7
      sub b32 $r6 1
      bra nc chsw_reload_dma
  ,)
  chsw_load_done:
      mov $r3 2
      iowr I[$r2 + 0x200] $r3
      ret
  // Fetch one fifo method and either store its data or run its handler
  //
  // The method is looked up in dispatch_table.  Errors are reported to the
  // host through SCRATCH0/SCRATCH1 and a host interrupt:
  //    method not in the table          code 1
  //    reserved bits set in the data    code 3 (the code falls through both
  //                                     or instructions below)
  //    handler returned with $p1 set    whatever the handler left in $r2
  //
  // Register use while a handler runs:
  // $r2: method word shifted left by 16, used for exception data
  // $r3: method data
  // $r4: address of the matching dispatch table entry
  // Clobbers $r2-$r7.  $r1 is left alone for ih.
  dispatch:
      // read incoming fifo command
      mov $r3 0x1900
      iord $r2 I[$r3 + 0x100]
      iord $r3 I[$r3 + 0x000]
      and $r4 $r2 0x7ff
      // $r2 will be used to store exception data
      shl b32 $r2 0x10

      // lookup method in the dispatch table, ILLEGAL_MTHD if not found
      mov $r5 dispatch_table
      // the 16-bit loads below only fill the low halves
      clear b32 $r7
      clear b32 $r6
  dispatch_scan:
      // group header: $r6 = first method index, $r7 = number of entries
      ld b16 $r6 D[$r5 + 0]
      ld b16 $r7 D[$r5 + 2]
      add b32 $r5 4
      // method below this group: groups are sorted, so it cannot match
      cmpu b32 $r4 $r6
      bra c dispatch_illegal_mthd
      // method below first + count: it belongs to this group
      add b32 $r7 $r6
      cmpu b32 $r4 $r7
      bra c dispatch_valid_mthd
      // otherwise skip the 8-byte entries of this group
      sub b32 $r7 $r6
      shl b32 $r7 3
      add b32 $r5 $r7
      bra dispatch_scan

  // ensure no bits set in reserved fields, INVALID_BITFIELD
  dispatch_valid_mthd:
      // $r4 = address of the entry for this method
      sub b32 $r4 $r6
      shl b32 $r4 3
      add b32 $r4 $r5
      ld b32 $r5 D[$r4 + 4]
      and $r5 $r3
      cmpu b32 $r5 0
      bra ne dispatch_invalid_bitfield

      // depending on dispatch flags: execute method, or save data as state
      ld b16 $r5 D[$r4 + 0]
      ld b16 $r6 D[$r4 + 2]
      cmpu b32 $r6 0
      bra ne dispatch_call
      st b32 D[$r5] $r3
      bra dispatch_done

  dispatch_call:
      // handlers report failure by setting $p1
      bclr $flags $p1
      call $r5
      bra $p1 dispatch_error
      bra dispatch_done

  dispatch_invalid_bitfield:
      or $r2 2
  dispatch_illegal_mthd:
      or $r2 1

  // store exception data in SCRATCH0/SCRATCH1, signal hostirq
  dispatch_error:
      mov $r4 0x1000
      iowr I[$r4 + 0x000] $r2
      iowr I[$r4 + 0x100] $r3
      mov $r2 0x40
      iowr I[$r0] $r2
  dispatch_hostirq_wait:
      // spin until bit 6 reads back clear from I[0x200]
      iord $r2 I[$r0 + 0x200]
      and $r2 0x40
      cmpu b32 $r2 0
      bra ne dispatch_hostirq_wait

  dispatch_done:
      // write 1 to I[0x1d00]; presumably this releases the fifo entry so the
      // next method can be fetched - confirm
      mov $r2 0x1d00
      mov $r3 1
      iowr I[$r2] $r3
      ret
  // NOP method handler: accepts the method and does nothing
  //
  // Inputs:
  // $r1: irqh state
  // $r2: hostirq state
  // $r3: data
  // $r4: dispatch table entry
  // Outputs:
  // $r1: irqh state
  // $p1: set on error (never set here)
  // $r2: hostirq state
  // $r3: data
  cmd_nop:
      ret
  // PM_TRIGGER method handler
  //
  // Writes 0x00020000 to I[0x2200].  cmd_wrcache_flush uses the same
  // register with a different bit.
  //
  // Inputs:
  // $r1: irqh state
  // $r2: hostirq state
  // $r3: data
  // $r4: dispatch table entry
  // Outputs:
  // $r1: irqh state
  // $p1: set on error (never set here)
  // $r2: overwritten with the register address
  // $r3: overwritten with the value written
  cmd_pm_trigger:
      mov $r2 0x2200
      // build 0x00020000 in $r3: zero the register, then set the high half
      clear b32 $r3
      sethi $r3 0x20000
      iowr I[$r2] $r3
      ret
  ifdef(`NVA3',
  // SET_DMA_* method handler - NVA3 builds only
  //
  // Remembers the object in the matching ctx_dma slot and writes it to the
  // hardware register for that slot
  //
  // Inputs:
  // $r1: irqh state
  // $r2: hostirq state
  // $r3: data
  // $r4: dispatch table entry
  // Outputs:
  // $r1: irqh state
  // $p1: set on error - never set here
  // $r2: hostirq state
  // $r3: data with bit 30 set
  cmd_dma:
      // table entries are 8 bytes apart and context slots 4 bytes apart so
      // half the entry offset is the byte offset of the slot
      sub b32 $r4 dispatch_dma
      shr b32 $r4 1
      bset $r3 0x1e
      st b32 D[$r4 + ctx_dma] $r3
      // same register chsw reloads after a context load
      add b32 $r4 0x600
      shl b32 $r4 6
      iowr I[$r4] $r3
      ret
  ,)
  // Calculates the hw swizzle mask and adjusts the surface's xcnt to match
  //
  // Reads ctx_format, ctx_xcnt, ctx_swz_const0 and ctx_swz_const1.
  // Writes ctx_src_cpp and ctx_dst_cpp, and programs the xcnt, cpp, swizzle
  // and swizzle constant registers.
  //
  // Uses 16 bytes of stack as a scratch swizzle table.
  // Clobbers $r4-$r12 and $p2.
  cmd_exec_set_format:
      // zero out a chunk of the stack to store the swizzle into
      add $sp -0x10
      st b32 D[$sp + 0x00] $r0
      st b32 D[$sp + 0x04] $r0
      st b32 D[$sp + 0x08] $r0
      st b32 D[$sp + 0x0c] $r0

      // extract cpp, src_ncomp and dst_ncomp from FORMAT; each field is
      // stored as (value - 1)
      //    $r5 = bytes per component
      //    $r6 = source components
      //    $r7 = destination components
      ld b32 $r4 D[$r0 + ctx_format]
      extr $r5 $r4 16:17
      add b32 $r5 1
      extr $r6 $r4 20:21
      add b32 $r6 1
      extr $r7 $r4 24:25
      add b32 $r7 1

      // convert FORMAT swizzle mask to hw swizzle mask
      //    $p2  = set once any source component is referenced
      //    $r8  = byte position in the swizzle table
      //    $r9  = destination component being processed
      bclr $flags $p2
      clear b32 $r8
      clear b32 $r9
  swz_comp_loop:
      // next 4-bit selector from FORMAT
      and $r10 $r4 0xf
      shr b32 $r4 4
      // $r11 = byte within the component
      clear b32 $r11
  swz_byte_loop:
      // selector 0-3: byte comes from that source component
      cmpu b8 $r10 4
      bra nc swz_sel_const0
      mulu $r12 $r10 $r5
      add b32 $r12 $r11
      bset $flags $p2
      bra swz_store
  swz_sel_const0:
      // flags are still those of the compare against 4
      // selector 4: hw codes from 0x10 up
      bra ne swz_sel_const1
      mov $r12 0x10
      add b32 $r12 $r11
      bra swz_store
  swz_sel_const1:
      // selector 5: hw codes from 0x14 up
      cmpu b8 $r10 6
      bra nc swz_sel_zero
      mov $r12 0x14
      add b32 $r12 $r11
      bra swz_store
  swz_sel_zero:
      // anything else: hw code 0x80
      mov $r12 0x80
  swz_store:
      st b8 D[$sp + $r8] $r12
      add b32 $r8 1
      add b32 $r11 1
      cmpu b32 $r11 $r5
      bra c swz_byte_loop
      add b32 $r9 1
      cmpu b32 $r9 $r7
      bra c swz_comp_loop

      // SRC_XCNT = (xcnt * src_cpp), or 0 if no src ref in swz (hw will hang)
      mulu $r6 $r5
      st b32 D[$r0 + ctx_src_cpp] $r6
      ld b32 $r8 D[$r0 + ctx_xcnt]
      mulu $r6 $r8
      bra $p2 swz_dst_xcnt
      clear b32 $r6

  swz_dst_xcnt:
      // DST_XCNT = (xcnt * dst_cpp)
      mulu $r7 $r5
      st b32 D[$r0 + ctx_dst_cpp] $r7
      mulu $r7 $r8

      // source and destination xcnt
      mov $r5 0x810
      shl b32 $r5 6
      iowr I[$r5 + 0x000] $r6
      iowr I[$r5 + 0x100] $r7

      // ((dst_cpp - 1) << 8) | (src_cpp - 1)
      add b32 $r5 0x800
      ld b32 $r6 D[$r0 + ctx_dst_cpp]
      sub b32 $r6 1
      shl b32 $r6 8
      ld b32 $r7 D[$r0 + ctx_src_cpp]
      sub b32 $r7 1
      or $r6 $r7
      iowr I[$r5 + 0x000] $r6

      // the four swizzle words built on the stack
      add b32 $r5 0x100
      ld b32 $r6 D[$sp + 0x00]
      iowr I[$r5 + 0x000] $r6
      ld b32 $r6 D[$sp + 0x04]
      iowr I[$r5 + 0x100] $r6
      ld b32 $r6 D[$sp + 0x08]
      iowr I[$r5 + 0x200] $r6
      ld b32 $r6 D[$sp + 0x0c]
      iowr I[$r5 + 0x300] $r6

      // swizzle constants
      add b32 $r5 0x400
      ld b32 $r6 D[$r0 + ctx_swz_const0]
      iowr I[$r5 + 0x000] $r6
      ld b32 $r6 D[$r0 + ctx_swz_const1]
      iowr I[$r5 + 0x100] $r6

      // release the scratch swizzle table
      add $sp 0x10
      ret
  // Setup to handle a tiled surface
  //
  // Calculates a number of parameters the hardware requires in order
  // to correctly handle tiling.
  //
  // Offset calculation is performed as follows (Tp/Th/Td from TILE_MODE):
  //    nTx = round_up(w * cpp, 1 << Tp) >> Tp
  //    nTy = round_up(h, 1 << Th) >> Th
  //    Txo = (x * cpp) & ((1 << Tp) - 1)
  //    Tx  = (x * cpp) >> Tp
  //    Tyo = y & ((1 << Th) - 1)
  //    Ty  = y >> Th
  //    Tzo = z & ((1 << Td) - 1)
  //    Tz  = z >> Td
  //
  //    off  = (Tzo << Tp << Th) + (Tyo << Tp) + Txo
  //    off += ((Tz * nTy * nTx) + (Ty * nTx) + Tx) << Td << Th << Tp
  //
  // In the code comments below, Op is the first line of that sum (offset
  // within a tile) and Ot the second (offset of the tile itself).
  //
  // Inputs:
  // $r4: hw command word being assembled by the caller; a flag bit is set in
  //      it here when the tile width is not the 16-byte variant
  // $r5: ctx offset adjustment for src/dst selection
  // $p2: set if dst surface
  // Clobbers:
  // $r6-$r15; also pushes and pops two words of stack
  cmd_exec_set_surface_tiled:
      // translate TILE_MODE into Tp, Th, Td shift values
      //    $r7 = Tp, $r8 = Th, $r9 = Td
      ld b32 $r7 D[$r5 + ctx_src_tile_mode]
      extr $r9 $r7 8:11
      extr $r8 $r7 4:7
  ifdef(`NVA3',
      add b32 $r8 2
  ,
      add b32 $r8 3
  )
      extr $r7 $r7 0:3
      // low nibble 0xe selects 16-byte wide tiles (Tp = 4)
      cmp b32 $r7 0xe
      bra ne tiled_width64
      mov $r7 4
      bra tiled_width_done
  tiled_width64:
      // otherwise 64-byte wide tiles (Tp = 6); flag it in the command word
      // using bit 17 for the source and bit 18 for the destination
      xbit $r7 $flags $p2
      add b32 $r7 17
      bset $r4 $r7
      mov $r7 6
  tiled_width_done:

      // Op = (x * cpp) & ((1 << Tp) - 1)       -> $r12
      // Tx = (x * cpp) >> Tp                   -> $r10
      ld b32 $r10 D[$r5 + ctx_src_xoff]
      ld b32 $r11 D[$r5 + ctx_src_cpp]
      mulu $r10 $r11
      mov $r11 1
      shl b32 $r11 $r7
      sub b32 $r11 1
      and $r12 $r10 $r11
      shr b32 $r10 $r7

      // Tyo = y & ((1 << Th) - 1)              -> $r11
      // Ty = y >> Th                           -> $r13
      ld b32 $r13 D[$r5 + ctx_src_yoff]
      mov $r14 1
      shl b32 $r14 $r8
      sub b32 $r14 1
      and $r11 $r13 $r14
      shr b32 $r13 $r8

      // YTILE = ((1 << Th) << 12) | ((1 << Th) - Tyo)
      add b32 $r14 1
      shl b32 $r15 $r14 12
      sub b32 $r14 $r11
      or $r15 $r14
      // $r6 = per-surface register base: 0x20800 for src, 0x20900 for dst
      xbit $r6 $flags $p2
      add b32 $r6 0x208
      shl b32 $r6 8
      iowr I[$r6 + 0x000] $r15

      // Op += Tyo << Tp
      shl b32 $r11 $r7
      add b32 $r12 $r11

      // nTx = ((w * cpp) + ((1 << Tp) - 1)) >> Tp, kept on the stack
      ld b32 $r15 D[$r5 + ctx_src_xsize]
      ld b32 $r11 D[$r5 + ctx_src_cpp]
      mulu $r15 $r11
      mov $r11 1
      shl b32 $r11 $r7
      sub b32 $r11 1
      add b32 $r15 $r11
      shr b32 $r15 $r7
      push $r15

      // nTy = (h + ((1 << Th) - 1)) >> Th, kept on the stack
      ld b32 $r15 D[$r5 + ctx_src_ysize]
      mov $r11 1
      shl b32 $r11 $r8
      sub b32 $r11 1
      add b32 $r15 $r11
      shr b32 $r15 $r8
      push $r15

      // Tys = Tp + Th                              -> $r7
      // CFG_YZ_TILE_SIZE = ((1 << Th) >> 2) << Td  -> $r11
      add b32 $r7 $r8
      sub b32 $r8 2
      mov $r11 1
      shl b32 $r11 $r8
      shl b32 $r11 $r9

      // Tzo = z & ((1 << Td) - 1)
      // Tz = z >> Td                               -> $r8
      // Op += Tzo << Tys
      // Ts = Tys + Td                              -> $r7
      ld b32 $r8 D[$r5 + ctx_src_zoff]
      mov $r14 1
      shl b32 $r14 $r9
      sub b32 $r14 1
      and $r15 $r8 $r14
      shl b32 $r15 $r7
      add b32 $r12 $r15
      add b32 $r7 $r9
      shr b32 $r8 $r9

      // Ot = ((Tz * nTy * nTx) + (Ty * nTx) + Tx) << Ts   -> $r10
      // the pops return nTy first, then nTx
      pop $r15
      pop $r9
      mulu $r13 $r9
      add b32 $r10 $r13
      mulu $r8 $r9
      mulu $r8 $r15
      add b32 $r10 $r8
      shl b32 $r10 $r7

      // PITCH = (nTx - 1) << Ts
      sub b32 $r9 1
      shl b32 $r9 $r7
      iowr I[$r6 + 0x200] $r9

      // SRC_ADDRESS_LOW = (Ot + Op) & 0xffffffff
      // CFG_ADDRESS_HIGH |= ((Ot + Op) >> 32) << 16
      // the carry out of the low word is folded into the high word, which
      // shares its register with CFG_YZ_TILE_SIZE
      ld b32 $r7 D[$r5 + ctx_src_address_low]
      ld b32 $r8 D[$r5 + ctx_src_address_high]
      add b32 $r10 $r12
      add b32 $r7 $r10
      adc b32 $r8 0
      shl b32 $r8 16
      or $r8 $r11
      sub b32 $r6 0x600
      iowr I[$r6 + 0x000] $r7
      add b32 $r6 0x400
      iowr I[$r6 + 0x000] $r8
      ret
  // Setup to handle a linear surface
  //
  // Copies ADDRESS_LOW, ADDRESS_HIGH (shifted into the upper half of its
  // register) and PITCH from the context straight into the hardware.
  //
  // Inputs:
  // $r5: ctx offset adjustment for src/dst selection
  // $p2: set if dst surface
  // Clobbers:
  // $r6, $r7
  cmd_exec_set_surface_linear:
      // $r6 = address low register: 0x20200 for src, 0x20300 for dst
      xbit $r6 $flags $p2
      add b32 $r6 0x202
      shl b32 $r6 8
      ld b32 $r7 D[$r5 + ctx_src_address_low]
      iowr I[$r6 + 0x000] $r7

      // address high, 0x400 further up
      add b32 $r6 0x400
      ld b32 $r7 D[$r5 + ctx_src_address_high]
      shl b32 $r7 16
      iowr I[$r6 + 0x000] $r7

      // pitch, another 0x400 further up
      add b32 $r6 0x400
      ld b32 $r7 D[$r5 + ctx_src_pitch]
      iowr I[$r6 + 0x000] $r7
      ret
  // wait for regs to be available for use
  //
  // Polls I[0x20000] until bit 0 reads back clear.  $r0 and $r1 are borrowed
  // as scratch and restored before returning, so no register is clobbered;
  // in particular $r0 is zero again for the caller.
  cmd_exec_wait:
      push $r0
      push $r1
      mov $r0 0x800
      shl b32 $r0 6
  cmd_exec_wait_busy:
      iord $r1 I[$r0]
      and $r1 1
      bra ne cmd_exec_wait_busy
      pop $r1
      pop $r0
      ret
  // Queue the QUERY write that follows a copy
  //
  // Each write is issued as a small copy of its own through the destination
  // surface registers, with the source xcnt set to 0 and the payload selected
  // through the swizzle registers.
  //
  // Inputs:
  // $r3: EXEC method data (bit 13 = QUERY_SHORT)
  // Clobbers:
  // $r4, $r5
  cmd_exec_query:
      // if QUERY_SHORT not set, write out { -, 0, TIME_LO, TIME_HI }
      xbit $r4 $r3 13
      bra ne query_write_counter
      call cmd_exec_wait

      // destination = query address + 4, 12 bytes, no source data
      mov $r4 0x80c
      shl b32 $r4 6
      ld b32 $r5 D[$r0 + ctx_query_address_low]
      add b32 $r5 4
      iowr I[$r4 + 0x000] $r5
      iowr I[$r4 + 0x100] $r0
      mov $r5 0xc
      iowr I[$r4 + 0x200] $r5
      add b32 $r4 0x400
      ld b32 $r5 D[$r0 + ctx_query_address_high]
      shl b32 $r5 16
      iowr I[$r4 + 0x000] $r5

      // cpp register, then three swizzle words: four 0x80 bytes followed by
      // codes 0x10-0x13 and 0x14-0x17.  Bit 16 of the cpp register value
      // presumably makes those codes pick up the timestamp - confirm
      add b32 $r4 0x500
      mov $r5 0x00000b00
      sethi $r5 0x00010000
      iowr I[$r4 + 0x000] $r5
      mov $r5 0x00004040
      shl b32 $r5 1
      sethi $r5 0x80800000
      iowr I[$r4 + 0x100] $r5
      mov $r5 0x00001110
      sethi $r5 0x13120000
      iowr I[$r4 + 0x200] $r5
      mov $r5 0x00001514
      sethi $r5 0x17160000
      iowr I[$r4 + 0x300] $r5

      // kick it off
      mov $r5 0x00002601
      sethi $r5 0x00010000
      mov $r4 0x800
      shl b32 $r4 6
      iowr I[$r4 + 0x000] $r5

  // write COUNTER
  query_write_counter:
      call cmd_exec_wait

      // destination = query address, 4 bytes, no source data
      mov $r4 0x80c
      shl b32 $r4 6
      ld b32 $r5 D[$r0 + ctx_query_address_low]
      iowr I[$r4 + 0x000] $r5
      iowr I[$r4 + 0x100] $r0
      mov $r5 0x4
      iowr I[$r4 + 0x200] $r5
      add b32 $r4 0x400
      ld b32 $r5 D[$r0 + ctx_query_address_high]
      shl b32 $r5 16
      iowr I[$r4 + 0x000] $r5

      // cpp register, then one swizzle word selecting codes 0x10-0x13
      add b32 $r4 0x500
      mov $r5 0x00000300
      iowr I[$r4 + 0x000] $r5
      mov $r5 0x00001110
      sethi $r5 0x13120000
      iowr I[$r4 + 0x100] $r5

      // the counter value goes into the first swizzle constant register
      ld b32 $r5 D[$r0 + ctx_query_counter]
      add b32 $r4 0x500
      iowr I[$r4 + 0x000] $r5

      // kick it off
      mov $r5 0x00002601
      sethi $r5 0x00010000
      mov $r4 0x800
      shl b32 $r4 6
      iowr I[$r4 + 0x000] $r5
      ret
  // Execute a copy operation
  //
  // Inputs:
  // $r1: irqh state
  // $r2: hostirq state
  // $r3: data
  //    000002000 QUERY_SHORT
  //    000001000 QUERY
  //    000000100 DST_LINEAR
  //    000000010 SRC_LINEAR
  //    000000001 FORMAT
  // $r4: dispatch table entry
  // Outputs:
  // $r1: irqh state
  // $p1: set on error
  // $r2: hostirq state
  // $r3: data
  //
  // $r4 is reused as the hw command word: the helpers and this function OR
  // flag bits into it before it is written to start the copy.  $r5-$r15 and
  // $p2 are scratch.
  cmd_exec:
      call cmd_exec_wait

      // if format requested, call function to calculate it, otherwise
      // fill in cpp/xcnt for both surfaces as if (cpp == 1)
      xbit $r15 $r3 0
      bra e cmd_exec_plain_format
      call cmd_exec_set_format
      mov $r4 0x200
      bra cmd_exec_src_surface
  cmd_exec_plain_format:
      mov $r7 1
      st b32 D[$r0 + ctx_src_cpp] $r7
      st b32 D[$r0 + ctx_dst_cpp] $r7
      mov $r6 0x810
      shl b32 $r6 6
      ld b32 $r7 D[$r0 + ctx_xcnt]
      iowr I[$r6 + 0x000] $r7
      iowr I[$r6 + 0x100] $r7
      clear b32 $r4

  cmd_exec_src_surface:
      // source: $p2 clear, no context offset adjustment
      bclr $flags $p2
      clear b32 $r5
      xbit $r15 $r3 4
      bra e cmd_exec_src_tiled
      call cmd_exec_set_surface_linear
      bra cmd_exec_dst_surface
  cmd_exec_src_tiled:
      call cmd_exec_set_surface_tiled
      bset $r4 7

  cmd_exec_dst_surface:
      // destination: $p2 set, $r5 = distance from the src to the dst fields
      bset $flags $p2
      mov $r5 ctx_dst_address_high - ctx_src_address_high
      xbit $r15 $r3 8
      bra e cmd_exec_dst_tiled
      call cmd_exec_set_surface_linear
      bra cmd_exec_kick
  cmd_exec_dst_tiled:
      call cmd_exec_set_surface_tiled
      bset $r4 8

  cmd_exec_kick:
      // line count, then the command word that starts the copy
      mov $r5 0x800
      shl b32 $r5 6
      ld b32 $r6 D[$r0 + ctx_ycnt]
      iowr I[$r5 + 0x100] $r6
      mov $r6 0x0041
      // SRC_TARGET = 1, DST_TARGET = 2
      sethi $r6 0x44000000
      or $r4 $r6
      iowr I[$r5] $r4

      // if requested, queue up a QUERY write after the copy has completed
      xbit $r15 $r3 12
      bra e cmd_exec_done
      call cmd_exec_query

  cmd_exec_done:
      ret
  // WRCACHE_FLUSH method handler: flush write cache
  //
  // Writes 0x00010000 to I[0x2200].  cmd_pm_trigger uses the same register
  // with a different bit.
  //
  // Inputs:
  // $r1: irqh state
  // $r2: hostirq state
  // $r3: data
  // $r4: dispatch table entry
  // Outputs:
  // $r1: irqh state
  // $p1: set on error (never set here)
  // $r2: overwritten with the register address
  // $r3: overwritten with the value written
  cmd_wrcache_flush:
      mov $r2 0x2200
      // build 0x00010000 in $r3: zero the register, then set the high half
      clear b32 $r3
      sethi $r3 0x10000
      iowr I[$r2] $r3
      ret

  // pad the code segment to a multiple of 256 bytes
  .align 0x100