nva3_copy.fuc 21 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872
  1. /* fuc microcode for copy engine on nva3- chipsets
  2. *
  3. * Copyright 2011 Red Hat Inc.
  4. *
  5. * Permission is hereby granted, free of charge, to any person obtaining a
  6. * copy of this software and associated documentation files (the "Software"),
  7. * to deal in the Software without restriction, including without limitation
  8. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  9. * and/or sell copies of the Software, and to permit persons to whom the
  10. * Software is furnished to do so, subject to the following conditions:
  11. *
  12. * The above copyright notice and this permission notice shall be included in
  13. * all copies or substantial portions of the Software.
  14. *
  15. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  18. * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  19. * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  20. * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  21. * OTHER DEALINGS IN THE SOFTWARE.
  22. *
  23. * Authors: Ben Skeggs
  24. */
  25. /* To build for nva3:nvc0
  26. * m4 -DNVA3 nva3_copy.fuc | envyas -a -w -m fuc -V nva3 -o nva3_copy.fuc.h
  27. *
  28. * To build for nvc0-
  29. * m4 -DNVC0 nva3_copy.fuc | envyas -a -w -m fuc -V nva3 -o nvc0_copy.fuc.h
  30. */
  // Channel context image.
  //
  // swctx copies the 256 bytes at the start of the data segment to and from
  // the channel context, so every variable below is per-channel state.
  // Method data is stored here by dispatch through the dispatch table;
  // ctx_src_cpp and ctx_dst_cpp have no method and are filled in by the
  // EXEC handler.
  //
  // The src and dst groups must keep the same member order: the surface
  // setup routines reach the dst members by adding
  // (#ctx_dst_address_high - #ctx_src_address_high) to the src offsets.
  ifdef(`NVA3',
  .section #nva3_pcopy_data
  ,
  .section #nvc0_pcopy_data
  )

  ctx_object:                   .b32 0

  // DMA object slots, NVA3 only. cmd_dma and chsw address them relative to
  // ctx_dma; the count is defined for both chipsets.
  ifdef(`NVA3',
  ctx_dma:
  ctx_dma_query:                .b32 0
  ctx_dma_src:                  .b32 0
  ctx_dma_dst:                  .b32 0
  ,)
  .equ #ctx_dma_count 3

  // query state
  ctx_query_address_high:       .b32 0
  ctx_query_address_low:        .b32 0
  ctx_query_counter:            .b32 0

  // source surface
  ctx_src_address_high:         .b32 0
  ctx_src_address_low:          .b32 0
  ctx_src_pitch:                .b32 0
  ctx_src_tile_mode:            .b32 0
  ctx_src_xsize:                .b32 0
  ctx_src_ysize:                .b32 0
  ctx_src_zsize:                .b32 0
  ctx_src_zoff:                 .b32 0
  ctx_src_xoff:                 .b32 0
  ctx_src_yoff:                 .b32 0
  ctx_src_cpp:                  .b32 0

  // destination surface, same layout as the source group
  ctx_dst_address_high:         .b32 0
  ctx_dst_address_low:          .b32 0
  ctx_dst_pitch:                .b32 0
  ctx_dst_tile_mode:            .b32 0
  ctx_dst_xsize:                .b32 0
  ctx_dst_ysize:                .b32 0
  ctx_dst_zsize:                .b32 0
  ctx_dst_zoff:                 .b32 0
  ctx_dst_xoff:                 .b32 0
  ctx_dst_yoff:                 .b32 0
  ctx_dst_cpp:                  .b32 0

  // format, swizzle constants and copy extents
  ctx_format:                   .b32 0
  ctx_swz_const0:               .b32 0
  ctx_swz_const1:               .b32 0
  ctx_xcnt:                     .b32 0
  ctx_ycnt:                     .b32 0
  // Method dispatch table, scanned linearly by dispatch.
  //
  // The table is a list of groups. Each group starts with a 4-byte header
  //    .b16 <first method index> <entry count>
  // followed by <entry count> entries of 8 bytes each
  //    .b32 <target> <reserved-bit mask>
  //
  // - The method index is the method offset divided by 4 (index 0x040 is
  //   method 0x0100).
  // - target bits 0-15 hold an address. If bits 16-31 are non-zero
  //   (the 0x00010000 marker) the address is a code label that dispatch
  //   calls; otherwise it is a data address the method data is stored to.
  // - Any data bit that is also set in the mask raises INVALID_BITFIELD.
  //
  // Groups must be listed in ascending method order: dispatch reports
  // ILLEGAL_MTHD as soon as the method is below the start of the group it
  // is looking at. The final header (index 0x800, no entries) can never be
  // reached by an 11-bit method index, so it terminates the scan.
  .align 256
  dispatch_table:
  // mthd 0x0000, NAME
  .b16 0x000 1
  .b32 #ctx_object                     ~0xffffffff
  // mthd 0x0100, NOP
  .b16 0x040 1
  .b32 0x00010000 + #cmd_nop           ~0xffffffff
  // mthd 0x0140, PM_TRIGGER
  .b16 0x050 1
  .b32 0x00010000 + #cmd_pm_trigger    ~0xffffffff
  ifdef(`NVA3', `
  // mthd 0x0180-0x0188, DMA_
  .b16 0x060 #ctx_dma_count
  dispatch_dma:
  .b32 0x00010000 + #cmd_dma           ~0xffffffff
  .b32 0x00010000 + #cmd_dma           ~0xffffffff
  .b32 0x00010000 + #cmd_dma           ~0xffffffff
  ',)
  // mthd 0x0200-0x0218, SRC_TILE
  .b16 0x80 7
  .b32 #ctx_src_tile_mode              ~0x00000fff
  .b32 #ctx_src_xsize                  ~0x0007ffff
  .b32 #ctx_src_ysize                  ~0x00001fff
  .b32 #ctx_src_zsize                  ~0x000007ff
  .b32 #ctx_src_zoff                   ~0x00000fff
  .b32 #ctx_src_xoff                   ~0x0007ffff
  .b32 #ctx_src_yoff                   ~0x00001fff
  // mthd 0x0220-0x0238, DST_TILE
  .b16 0x88 7
  .b32 #ctx_dst_tile_mode              ~0x00000fff
  .b32 #ctx_dst_xsize                  ~0x0007ffff
  .b32 #ctx_dst_ysize                  ~0x00001fff
  .b32 #ctx_dst_zsize                  ~0x000007ff
  .b32 #ctx_dst_zoff                   ~0x00000fff
  .b32 #ctx_dst_xoff                   ~0x0007ffff
  .b32 #ctx_dst_yoff                   ~0x00001fff
  // mthd 0x0300-0x0304, EXEC, WRCACHE_FLUSH
  .b16 0xc0 2
  .b32 0x00010000 + #cmd_exec          ~0xffffffff
  .b32 0x00010000 + #cmd_wrcache_flush ~0xffffffff
  // mthd 0x030c-0x0340, addresses, pitches, counts, format, swizzle
  // constants and query state
  .b16 0xc3 14
  .b32 #ctx_src_address_high           ~0x000000ff
  .b32 #ctx_src_address_low            ~0xfffffff0
  .b32 #ctx_dst_address_high           ~0x000000ff
  .b32 #ctx_dst_address_low            ~0xfffffff0
  .b32 #ctx_src_pitch                  ~0x0007ffff
  .b32 #ctx_dst_pitch                  ~0x0007ffff
  .b32 #ctx_xcnt                       ~0x0000ffff
  .b32 #ctx_ycnt                       ~0x00001fff
  .b32 #ctx_format                     ~0x0333ffff
  .b32 #ctx_swz_const0                 ~0xffffffff
  .b32 #ctx_swz_const1                 ~0xffffffff
  .b32 #ctx_query_address_high         ~0x000000ff
  .b32 #ctx_query_address_low          ~0xffffffff
  .b32 #ctx_query_counter              ~0xffffffff
  // end of table
  .b16 0x800 0
  ifdef(`NVA3',
  .section #nva3_pcopy_code
  ,
  .section #nvc0_pcopy_code
  )

  // Firmware entry point.
  //
  // Installs ih as the interrupt 0 handler, enables the fifo and context
  // switch interrupts, enables fifo access and context switching, then
  // sleeps. All real work happens from ih. Never returns.
  main:
     // $r0 is zeroed once here; the rest of the firmware uses it as a zero
     // base register and zero value (I[$r0 + ...], D[$r0 + ...])
     clear b32 $r0
     mov $sp $r0

     // setup i0 handler and route fifo and ctxswitch to it
     mov $r1 #ih
     mov $iv0 $r1
     mov $r1 0x400
     movw $r2 0xfff3
     sethi $r2 0
     iowr I[$r1 + 0x300] $r2

     // enable interrupts; 0xc covers the two status bits ih services
     or $r2 0xc
     iowr I[$r1] $r2
     bset $flags ie0

     // enable fifo access and context switching
     mov $r1 0x1200
     mov $r2 3
     iowr I[$r1] $r2

     // sleep forever, waking for interrupts; $p0 is never cleared so every
     // wakeup falls straight back into the sleep
     bset $flags $p0
  main_idle:
     sleep $p0
     bra #main_idle
  // Interrupt 0 handler.
  //
  // Reads the interrupt status from I[0x200], runs chsw if bit 3 is set and
  // dispatch if bit 2 is set, then writes those two bits back to I[0x100]
  // to acknowledge them and returns from the interrupt.
  //
  // The status stays in $r1 across both calls; this is the "irqh state"
  // that the method handlers are documented as preserving.
  ih:
     iord $r1 I[$r0 + 0x200]

     // context switch requested?
     and $r2 $r1 0x00000008
     bra e #ih_skip_chsw
        call #chsw
  ih_skip_chsw:
     // fifo command pending?
     and $r2 $r1 0x00000004
     bra e #ih_skip_cmd
        call #dispatch
  ih_skip_cmd:
     // acknowledge only the two sources handled above
     and $r1 $r1 0x0000000c
     iowr I[$r0 + 0x100] $r1
     iret
  // Save or restore the 256-byte channel context held at the start of the
  // data segment.
  //
  // Inputs:
  //   $p1: direction (0 = unload, 1 = load)
  //   $r3: channel
  // Clobbers:
  //   $r4 on both chipsets; additionally $r5, $r6, $r14 and $r15 on NVC0.
  //   The NVC0 path also uses 16 bytes of scratch space below $sp without
  //   moving $sp.
  //
  // Both m4 branches below are quoted, so comments inside them must not
  // contain quote characters.
  swctx:
     mov $r4 0x7700
     mov $xtargets $r4
  ifdef(`NVA3', `
     // target 7 hardcoded to ctx dma object
     mov $xdbase $r0
  ', ` // NVC0
     // read SCRATCH3 to decide if we are PCOPY0 or PCOPY1
     mov $r4 0x2100
     iord $r4 I[$r4 + 0]
     and $r4 1
     shl b32 $r4 4
     add b32 $r4 0x30

     // channel is in vram
     mov $r15 0x61c
     shl b32 $r15 6
     mov $r5 0x114
     iowrs I[$r15] $r5

     // read 16-byte PCOPYn info, containing context pointer, from channel
     shl b32 $r5 $r3 4
     add b32 $r5 2
     mov $xdbase $r5
     mov $r5 $sp
     // get a chunk of stack space, aligned to 256 byte boundary
     sub b32 $r5 0x100
     mov $r6 0xff
     not b32 $r6
     and $r5 $r6
     sethi $r5 0x00020000
     xdld $r4 $r5
     xdwait
     // drop the transfer size bits again so $r5 is a plain data address
     sethi $r5 0

     // set context pointer, from within channel VM
     mov $r14 0
     iowrs I[$r15] $r14
     // base = (low word >> 8) | (high word << 24)
     ld b32 $r4 D[$r5 + 0]
     shr b32 $r4 8
     ld b32 $r6 D[$r5 + 4]
     shl b32 $r6 24
     or $r4 $r6
     mov $xdbase $r4
  ')
     // 256-byte context, at start of data segment
     mov b32 $r4 $r0
     sethi $r4 0x60000

     // swap!
     bra $p1 #swctx_do_load
        xdst $r0 $r4
        bra #swctx_wait
  swctx_do_load:
        xdld $r0 $r4
  swctx_wait:
     // block until the transfer has completed
     xdwait
     ret
  // Context switch request handler, called from ih.
  //
  // If the current channel (I[0x1400]) has bit 30 set it is unloaded: the
  // context is saved, bit 30 is cleared in the register and 1 is written to
  // I[0x1600]. The routine then returns without looking at the next channel.
  //
  // Otherwise the next channel (I[0x1500]) is examined. If its bit 30 is set
  // the context is loaded and, on NVA3, the saved DMA objects are written
  // back to the hardware. In either case 2 is written to I[0x1600].
  //
  // Clobbers $r2-$r8, $r13, $r15, $p1 plus whatever swctx clobbers.
  chsw:
     // read current channel
     mov $r2 0x1400
     iord $r3 I[$r2]

     // if it is active, unload it and return
     xbit $r15 $r3 0x1e
     bra e #chsw_try_load
        bclr $flags $p1
        call #swctx
        bclr $r3 0x1e
        iowr I[$r2] $r3
        mov $r4 1
        iowr I[$r2 + 0x200] $r4
        ret

     // read next channel
  chsw_try_load:
     iord $r3 I[$r2 + 0x100]

     // is there a channel waiting to be loaded?
     xbit $r13 $r3 0x1e
     bra e #chsw_ack_load
        bset $flags $p1
        call #swctx
     // NOTE(review): the loop below starts at index #ctx_dma_count and runs
     // down to 0 inclusive, so it appears to perform count + 1 writes; the
     // first one reads the word that follows ctx_dma_dst and writes it to
     // I[0x18300]. Confirm against hardware documentation whether that extra
     // write is intended before changing it.
     //
     // The m4 argument below is unquoted: keep commas and parentheses out of
     // any comment placed inside it.
  ifdef(`NVA3',
        // load dma objects back into TARGET regs
        mov $r5 #ctx_dma
        mov $r6 #ctx_dma_count
  chsw_restore_dma:
           ld b32 $r7 D[$r5 + $r6 * 4]
           add b32 $r8 $r6 0x180
           shl b32 $r8 8
           iowr I[$r8] $r7
           sub b32 $r6 1
           bra nc #chsw_restore_dma
  ,)

  chsw_ack_load:
     mov $r3 2
     iowr I[$r2 + 0x200] $r3
     ret
  // Fifo command handler, called from ih.
  //
  // Reads one command (I[0x1a00] = method word, I[0x1900] = data), looks the
  // method up in dispatch_table and either stores the data into the context
  // or calls the method handler.
  //
  // Errors are reported to the host: $r2 (method word shifted left by 16,
  // with an error code ORed into the low bits) goes to I[0x1000] and the
  // data goes to I[0x1100]. Then 0x40 is written to I[0x000] and the routine
  // spins until bit 6 of I[0x200] reads back as zero.
  //   code 1: ILLEGAL_MTHD, method not in the table
  //   code 3: INVALID_BITFIELD, data has bits set that the table mask forbids
  //   other : whatever a handler left in $r2 when it returned with $p1 set
  //
  // Handler calling convention (see the handler headers):
  //   $r1 irqh state, $r2 hostirq state, $r3 data, $r4 table entry address,
  //   $p1 cleared before the call and set by the handler on error.
  dispatch:
     // read incoming fifo command
     mov $r3 0x1900
     iord $r2 I[$r3 + 0x100]
     iord $r3 I[$r3 + 0x000]
     // low 11 bits select the method
     and $r4 $r2 0x7ff
     // $r2 will be used to store exception data
     shl b32 $r2 0x10

     // lookup method in the dispatch table, ILLEGAL_MTHD if not found
     mov $r5 #dispatch_table
     clear b32 $r6
     clear b32 $r7
  dispatch_scan:
        // $r6 = first method of the group, $r7 = entry count; $r5 is left
        // pointing at the first entry of the group
        ld b16 $r6 D[$r5 + 0]
        ld b16 $r7 D[$r5 + 2]
        add b32 $r5 4
        // below the start of this group: not in the table
        cmpu b32 $r4 $r6
        bra c #dispatch_illegal_mthd
        // below the end of this group: found
        add b32 $r7 $r6
        cmpu b32 $r4 $r7
        bra c #dispatch_valid_mthd
        // skip the entries of this group, 8 bytes each
        sub b32 $r7 $r6
        shl b32 $r7 3
        add b32 $r5 $r7
        bra #dispatch_scan

     // ensure no bits set in reserved fields, INVALID_BITFIELD
  dispatch_valid_mthd:
     // $r4 = address of the table entry for this method
     sub b32 $r4 $r6
     shl b32 $r4 3
     add b32 $r4 $r5
     ld b32 $r5 D[$r4 + 4]
     and $r5 $r3
     cmpu b32 $r5 0
     bra ne #dispatch_invalid_bitfield

     // depending on dispatch flags: execute method, or save data as state
     // ($r5 and $r6 have zero upper halves here, so the 16-bit loads give
     // clean 32-bit values)
     ld b16 $r5 D[$r4 + 0]
     ld b16 $r6 D[$r4 + 2]
     cmpu b32 $r6 0
     bra ne #dispatch_cmd
        st b32 D[$r5] $r3
        bra #dispatch_done
  dispatch_cmd:
        bclr $flags $p1
        call $r5
        bra $p1 #dispatch_error
        bra #dispatch_done

     // the two error entry points fall through into each other, so an
     // invalid bitfield ends up with code 2 | 1 = 3
  dispatch_invalid_bitfield:
     or $r2 2
  dispatch_illegal_mthd:
     or $r2 1

     // store exception data in SCRATCH0/SCRATCH1, signal hostirq
  dispatch_error:
     mov $r4 0x1000
     iowr I[$r4 + 0x000] $r2
     iowr I[$r4 + 0x100] $r3
     mov $r2 0x40
     iowr I[$r0] $r2
  dispatch_hostirq_wait:
        iord $r2 I[$r0 + 0x200]
        and $r2 0x40
        cmpu b32 $r2 0
        bra ne #dispatch_hostirq_wait

  dispatch_done:
     // presumably tells the fifo that the command has been consumed;
     // TODO confirm the meaning of I[0x1d00] against hardware documentation
     mov $r2 0x1d00
     mov $r3 1
     iowr I[$r2] $r3
     ret
  // NOP method handler: accepts the method and does nothing.
  //
  // Inputs:
  //   $r1: irqh state
  //   $r2: hostirq state
  //   $r3: data
  //   $r4: dispatch table entry
  // Outputs:
  //   $r1: irqh state
  //   $p1: set on error (never set here)
  //   $r2: hostirq state
  //   $r3: data
  cmd_nop:
     ret
  // PM_TRIGGER method handler: writes 0x00020000 to I[0x2200].
  //
  // Inputs:
  //   $r1: irqh state
  //   $r2: hostirq state
  //   $r3: data
  //   $r4: dispatch table entry
  // Outputs:
  //   $r1: irqh state
  //   $p1: set on error (never set here)
  //   $r2: hostirq state
  //   $r3: data
  //
  // Note: $r2 and $r3 are in fact overwritten. That is harmless as long as
  // $p1 stays clear, because dispatch only reports them on the error path
  // and reloads both registers on its normal exit.
  cmd_pm_trigger:
     mov $r2 0x2200
     clear b32 $r3
     sethi $r3 0x20000
     iowr I[$r2] $r3
     ret
  ifdef(`NVA3',
  // SET_DMA_* method handler
  //
  // Stores the DMA object for the query or src or dst slot in the context
  // and writes it to the matching hardware TARGET register.
  //
  // This routine lives inside an unquoted m4 argument so comments in here
  // must stay free of commas and parentheses.
  //
  // Inputs:
  //   $r1: irqh state
  //   $r2: hostirq state
  //   $r3: data
  //   $r4: dispatch table entry
  // Outputs:
  //   $r1: irqh state
  //   $p1: set on error - this handler never sets it
  //   $r2: hostirq state
  //   $r3: data with bit 30 set
  cmd_dma:
     // table entries are 8 bytes and context slots are 4 bytes so half the
     // distance from dispatch_dma is the byte offset of the slot in ctx_dma
     sub b32 $r4 #dispatch_dma
     shr b32 $r4 1
     // bit 30 is set in both the stored copy and the value sent to hardware
     bset $r3 0x1e
     st b32 D[$r4 + #ctx_dma] $r3
     // I/O address is 0x18000 + slot offset * 0x40
     add b32 $r4 0x600
     shl b32 $r4 6
     iowr I[$r4] $r3
     ret
  ,)
  // Calculates the hw swizzle mask from FORMAT and programs the per-surface
  // xcnt and cpp values to match.
  //
  // FORMAT layout as decoded here:
  //   bits  0-15: one 4-bit selector per destination component
  //                 0-3 : copy that source component
  //                 4   : take the bytes from swizzle constant 0
  //                 5   : take the bytes from swizzle constant 1
  //                 6+  : write zero
  //   bits 16-17: bytes per component - 1
  //   bits 20-21: source components - 1
  //   bits 24-25: destination components - 1
  //
  // One swizzle byte is produced per destination byte, at most 4 * 4 = 16,
  // and collected in a 16-byte scratch buffer on the stack.
  //
  // Side effects: stores ctx_src_cpp and ctx_dst_cpp (bytes per pixel).
  // Clobbers $r4-$r12 and $p2.
  cmd_exec_set_format:
     // zero out a chunk of the stack to store the swizzle into
     add $sp -0x10
     st b32 D[$sp + 0x00] $r0
     st b32 D[$sp + 0x04] $r0
     st b32 D[$sp + 0x08] $r0
     st b32 D[$sp + 0x0c] $r0

     // extract cpp, src_ncomp and dst_ncomp from FORMAT
     //   $r5 = bytes per component, $r6 = src_ncomp, $r7 = dst_ncomp
     ld b32 $r4 D[$r0 + #ctx_format]
     extr $r5 $r4 16:17
     add b32 $r5 1
     extr $r6 $r4 20:21
     add b32 $r6 1
     extr $r7 $r4 24:25
     add b32 $r7 1

     // convert FORMAT swizzle mask to hw swizzle mask
     //   $p2 = set once any source component is referenced
     //   $r8 = output byte index, $r9 = destination component index
     bclr $flags $p2
     clear b32 $r8
     clear b32 $r9
  fmt_comp_loop:
        // $r10 = selector for this component, $r11 = byte within component
        and $r10 $r4 0xf
        shr b32 $r4 4
        clear b32 $r11
  fmt_byte_loop:
           cmpu b8 $r10 4
           bra nc #fmt_sel_const0
              // source component: byte index = selector * cpp + byte
              mulu $r12 $r10 $r5
              add b32 $r12 $r11
              bset $flags $p2
              bra #fmt_byte_store
  fmt_sel_const0:
           // still testing the flags of the compare against 4 above
           bra ne #fmt_sel_const1
              mov $r12 0x10
              add b32 $r12 $r11
              bra #fmt_byte_store
  fmt_sel_const1:
           cmpu b8 $r10 6
           bra nc #fmt_sel_zero
              mov $r12 0x14
              add b32 $r12 $r11
              bra #fmt_byte_store
  fmt_sel_zero:
              mov $r12 0x80
  fmt_byte_store:
           st b8 D[$sp + $r8] $r12
           add b32 $r8 1
           add b32 $r11 1
           cmpu b32 $r11 $r5
           bra c #fmt_byte_loop
        add b32 $r9 1
        cmpu b32 $r9 $r7
        bra c #fmt_comp_loop

     // SRC_XCNT = (xcnt * src_cpp), or 0 if no src ref in swz (hw will hang)
     mulu $r6 $r5
     st b32 D[$r0 + #ctx_src_cpp] $r6
     ld b32 $r8 D[$r0 + #ctx_xcnt]
     mulu $r6 $r8
     bra $p2 #fmt_dst_xcnt
     clear b32 $r6

     // DST_XCNT = (xcnt * dst_cpp)
  fmt_dst_xcnt:
     mulu $r7 $r5
     st b32 D[$r0 + #ctx_dst_cpp] $r7
     mulu $r7 $r8

     // I[0x20400] = SRC_XCNT, I[0x20500] = DST_XCNT
     mov $r5 0x810
     shl b32 $r5 6
     iowr I[$r5 + 0x000] $r6
     iowr I[$r5 + 0x100] $r7
     // I[0x20c00] = ((dst_cpp - 1) << 8) | (src_cpp - 1)
     add b32 $r5 0x800
     ld b32 $r6 D[$r0 + #ctx_dst_cpp]
     sub b32 $r6 1
     shl b32 $r6 8
     ld b32 $r7 D[$r0 + #ctx_src_cpp]
     sub b32 $r7 1
     or $r6 $r7
     iowr I[$r5 + 0x000] $r6
     // I[0x20d00..0x21000] = the four swizzle words built above
     add b32 $r5 0x100
     ld b32 $r6 D[$sp + 0x00]
     iowr I[$r5 + 0x000] $r6
     ld b32 $r6 D[$sp + 0x04]
     iowr I[$r5 + 0x100] $r6
     ld b32 $r6 D[$sp + 0x08]
     iowr I[$r5 + 0x200] $r6
     ld b32 $r6 D[$sp + 0x0c]
     iowr I[$r5 + 0x300] $r6
     // I[0x21100], I[0x21200] = swizzle constants 0 and 1
     add b32 $r5 0x400
     ld b32 $r6 D[$r0 + #ctx_swz_const0]
     iowr I[$r5 + 0x000] $r6
     ld b32 $r6 D[$r0 + #ctx_swz_const1]
     iowr I[$r5 + 0x100] $r6

     // release the scratch buffer
     add $sp 0x10
     ret
  // Setup to handle a tiled surface
  //
  // Calculates a number of parameters the hardware requires in order
  // to correctly handle tiling.
  //
  // Offset calculation is performed as follows (Tp/Th/Td from TILE_MODE):
  //    nTx = round_up(w * cpp, 1 << Tp) >> Tp
  //    nTy = round_up(h, 1 << Th) >> Th
  //    Txo = (x * cpp) & ((1 << Tp) - 1)
  //    Tx  = (x * cpp) >> Tp
  //    Tyo = y & ((1 << Th) - 1)
  //    Ty  = y >> Th
  //    Tzo = z & ((1 << Td) - 1)
  //    Tz  = z >> Td
  //
  //    off  = (Tzo << Tp << Th) + (Tyo << Tp) + Txo
  //    off += ((Tz * nTy * nTx) + (Ty * nTx) + Tx) << Td << Th << Tp
  //
  // In the code the first sum is accumulated as Op (starting from Txo) and
  // the second as Ot.
  //
  // Inputs:
  //   $r4: hw command (0x104800)
  //   $r5: ctx offset adjustment for src/dst selection
  //   $p2: set if dst surface
  // Outputs:
  //   $r4: hw command, with bit 17 (src) or 18 (dst) set when TILE_MODE
  //        bits 0-3 are not 0xe
  // Clobbers $r6-$r15. Uses two words of stack temporarily.
  cmd_exec_set_surface_tiled:
     // translate TILE_MODE into Tp, Th, Td shift values
     //   $r9 = Td (bits 8-11), $r8 = Th (bits 4-7 plus a chipset bias),
     //   $r7 = Tp (4 when bits 0-3 are 0xe, otherwise 6)
     ld b32 $r7 D[$r5 + #ctx_src_tile_mode]
     extr $r9 $r7 8:11
     extr $r8 $r7 4:7
  ifdef(`NVA3',
     add b32 $r8 2
  ,
     add b32 $r8 3
  )
     extr $r7 $r7 0:3
     cmp b32 $r7 0xe
     bra ne #tiled_xtile64
     mov $r7 4
     bra #tiled_xtile_done
  tiled_xtile64:
     // flag the 64-byte wide tile in the hw command: bit 17 + $p2
     xbit $r7 $flags $p2
     add b32 $r7 17
     bset $r4 $r7
     mov $r7 6
  tiled_xtile_done:

     // Op = (x * cpp) & ((1 << Tp) - 1)
     // Tx = (x * cpp) >> Tp
     //   $r12 = Op, $r10 = Tx
     ld b32 $r10 D[$r5 + #ctx_src_xoff]
     ld b32 $r11 D[$r5 + #ctx_src_cpp]
     mulu $r10 $r11
     mov $r11 1
     shl b32 $r11 $r7
     sub b32 $r11 1
     and $r12 $r10 $r11
     shr b32 $r10 $r7

     // Tyo = y & ((1 << Th) - 1)
     // Ty  = y >> Th
     //   $r11 = Tyo, $r13 = Ty, $r14 = (1 << Th) - 1
     ld b32 $r13 D[$r5 + #ctx_src_yoff]
     mov $r14 1
     shl b32 $r14 $r8
     sub b32 $r14 1
     and $r11 $r13 $r14
     shr b32 $r13 $r8

     // YTILE = ((1 << Th) << 12) | ((1 << Th) - Tyo)
     //   $r6 = I/O base for this surface: 0x20800 (src) or 0x20900 (dst)
     add b32 $r14 1
     shl b32 $r15 $r14 12
     sub b32 $r14 $r11
     or $r15 $r14
     xbit $r6 $flags $p2
     add b32 $r6 0x208
     shl b32 $r6 8
     iowr I[$r6 + 0x000] $r15

     // Op += Tyo << Tp
     shl b32 $r11 $r7
     add b32 $r12 $r11

     // nTx = ((w * cpp) + ((1 << Tp) - 1)) >> Tp, saved on the stack
     ld b32 $r15 D[$r5 + #ctx_src_xsize]
     ld b32 $r11 D[$r5 + #ctx_src_cpp]
     mulu $r15 $r11
     mov $r11 1
     shl b32 $r11 $r7
     sub b32 $r11 1
     add b32 $r15 $r11
     shr b32 $r15 $r7
     push $r15

     // nTy = (h + ((1 << Th) - 1)) >> Th, saved on the stack
     ld b32 $r15 D[$r5 + #ctx_src_ysize]
     mov $r11 1
     shl b32 $r11 $r8
     sub b32 $r11 1
     add b32 $r15 $r11
     shr b32 $r15 $r8
     push $r15

     // Tys = Tp + Th
     // CFG_YZ_TILE_SIZE = ((1 << Th) >> 2) << Td
     //   $r7 = Tys, $r11 = CFG_YZ_TILE_SIZE (kept until the final write)
     add b32 $r7 $r8
     sub b32 $r8 2
     mov $r11 1
     shl b32 $r11 $r8
     shl b32 $r11 $r9

     // Tzo = z & ((1 << Td) - 1)
     // Tz  = z >> Td
     // Op += Tzo << Tys
     // Ts  = Tys + Td
     //   $r8 = Tz, $r7 = Ts
     ld b32 $r8 D[$r5 + #ctx_src_zoff]
     mov $r14 1
     shl b32 $r14 $r9
     sub b32 $r14 1
     and $r15 $r8 $r14
     shl b32 $r15 $r7
     add b32 $r12 $r15
     add b32 $r7 $r9
     shr b32 $r8 $r9

     // Ot = ((Tz * nTy * nTx) + (Ty * nTx) + Tx) << Ts
     //   popped in reverse push order: $r15 = nTy, $r9 = nTx; $r10 = Ot
     pop $r15
     pop $r9
     mulu $r13 $r9
     add b32 $r10 $r13
     mulu $r8 $r9
     mulu $r8 $r15
     add b32 $r10 $r8
     shl b32 $r10 $r7

     // PITCH = (nTx - 1) << Ts
     sub b32 $r9 1
     shl b32 $r9 $r7
     iowr I[$r6 + 0x200] $r9

     // SRC_ADDRESS_LOW   = (Ot + Op) & 0xffffffff
     // CFG_ADDRESS_HIGH |= ((Ot + Op) >> 32) << 16
     //   the carry out of the low-word add is folded into the high word,
     //   which also carries CFG_YZ_TILE_SIZE in its low bits
     ld b32 $r7 D[$r5 + #ctx_src_address_low]
     ld b32 $r8 D[$r5 + #ctx_src_address_high]
     add b32 $r10 $r12
     add b32 $r7 $r10
     adc b32 $r8 0
     shl b32 $r8 16
     or $r8 $r11
     // base - 0x600: address low; then + 0x400: address high
     sub b32 $r6 0x600
     iowr I[$r6 + 0x000] $r7
     add b32 $r6 0x400
     iowr I[$r6 + 0x000] $r8
     ret
  // Setup to handle a linear surface
  //
  // Copies ADDRESS_LOW, ADDRESS_HIGH (shifted into bits 16 and up) and PITCH
  // from the context straight into the hardware registers of the selected
  // surface.
  //
  // Inputs:
  //   $r5: ctx offset adjustment for src/dst selection
  //   $p2: set if dst surface
  // Clobbers $r6 and $r7.
  cmd_exec_set_surface_linear:
     // $r6 = I/O base for this surface: 0x20200 (src) or 0x20300 (dst)
     xbit $r6 $flags $p2
     add b32 $r6 0x202
     shl b32 $r6 8

     // address low at base
     ld b32 $r7 D[$r5 + #ctx_src_address_low]
     iowr I[$r6 + 0x000] $r7

     // address high at base + 0x400
     add b32 $r6 0x400
     ld b32 $r7 D[$r5 + #ctx_src_address_high]
     shl b32 $r7 16
     iowr I[$r6 + 0x000] $r7

     // pitch at base + 0x800
     add b32 $r6 0x400
     ld b32 $r7 D[$r5 + #ctx_src_pitch]
     iowr I[$r6 + 0x000] $r7
     ret
  // Wait for the copy registers to be available for use.
  //
  // Polls I[0x20000] until bit 0 reads as zero. Preserves every general
  // register: $r0 and $r1 are borrowed and restored from the stack, which
  // matters because the rest of the firmware relies on $r0 being zero.
  cmd_exec_wait:
     push $r0
     push $r1
     mov $r0 0x800
     shl b32 $r0 6
  cmd_exec_wait_busy:
        iord $r1 I[$r0]
        and $r1 1
        bra ne #cmd_exec_wait_busy
     pop $r1
     pop $r0
     ret
  // Queue the QUERY result write that follows a copy.
  //
  // The result is produced by the copy hardware itself: each part programs a
  // small one-line copy whose swizzle pulls bytes from the swizzle constants
  // and zero, then kicks it through I[0x20000].
  //
  //   long form  (QUERY_SHORT clear): 12 bytes at query address + 4,
  //                                   { 0, TIME_LO, TIME_HI }
  //   both forms                    : 4 bytes at the query address, COUNTER
  //
  // Inputs:
  //   $r3: EXEC data (bit 13 = QUERY_SHORT)
  // Clobbers $r4 and $r5.
  cmd_exec_query:
     // if QUERY_SHORT not set, write out { -, 0, TIME_LO, TIME_HI }
     xbit $r4 $r3 13
     bra ne #cmd_exec_query_counter
        call #cmd_exec_wait
        // dst address low = query address + 4; the register at + 0x100
        // gets 0 and the one at + 0x200 gets the byte count (12)
        mov $r4 0x80c
        shl b32 $r4 6
        ld b32 $r5 D[$r0 + #ctx_query_address_low]
        add b32 $r5 4
        iowr I[$r4 + 0x000] $r5
        iowr I[$r4 + 0x100] $r0
        mov $r5 0xc
        iowr I[$r4 + 0x200] $r5
        // dst address high, shifted into bits 16 and up
        add b32 $r4 0x400
        ld b32 $r5 D[$r0 + #ctx_query_address_high]
        shl b32 $r5 16
        iowr I[$r4 + 0x000] $r5
        // cpp config followed by three swizzle words:
        // 0x80808080 (zero), 0x13121110 (const0), 0x17161514 (const1)
        add b32 $r4 0x500
        mov $r5 0x00000b00
        sethi $r5 0x00010000
        iowr I[$r4 + 0x000] $r5
        // 0x8080 is built as 0x4040 << 1, presumably because a 16-bit
        // immediate with the top bit set would be sign-extended by mov
        mov $r5 0x00004040
        shl b32 $r5 1
        sethi $r5 0x80800000
        iowr I[$r4 + 0x100] $r5
        mov $r5 0x00001110
        sethi $r5 0x13120000
        iowr I[$r4 + 0x200] $r5
        mov $r5 0x00001514
        sethi $r5 0x17160000
        iowr I[$r4 + 0x300] $r5
        // kick
        mov $r5 0x00002601
        sethi $r5 0x00010000
        mov $r4 0x800
        shl b32 $r4 6
        iowr I[$r4 + 0x000] $r5

     // write COUNTER
  cmd_exec_query_counter:
     call #cmd_exec_wait
     // dst address low = query address; byte count 4
     mov $r4 0x80c
     shl b32 $r4 6
     ld b32 $r5 D[$r0 + #ctx_query_address_low]
     iowr I[$r4 + 0x000] $r5
     iowr I[$r4 + 0x100] $r0
     mov $r5 0x4
     iowr I[$r4 + 0x200] $r5
     // dst address high, shifted into bits 16 and up
     add b32 $r4 0x400
     ld b32 $r5 D[$r0 + #ctx_query_address_high]
     shl b32 $r5 16
     iowr I[$r4 + 0x000] $r5
     // cpp config and a single swizzle word selecting const0
     add b32 $r4 0x500
     mov $r5 0x00000300
     iowr I[$r4 + 0x000] $r5
     mov $r5 0x00001110
     sethi $r5 0x13120000
     iowr I[$r4 + 0x100] $r5
     // swizzle constant 0 = COUNTER
     ld b32 $r5 D[$r0 + #ctx_query_counter]
     add b32 $r4 0x500
     iowr I[$r4 + 0x000] $r5
     // kick
     mov $r5 0x00002601
     sethi $r5 0x00010000
     mov $r4 0x800
     shl b32 $r4 6
     iowr I[$r4 + 0x000] $r5
     ret
  // EXEC method handler: execute a copy operation
  //
  // Inputs:
  //   $r1: irqh state
  //   $r2: hostirq state
  //   $r3: data
  //          bit 13 (0x2000): QUERY_SHORT
  //          bit 12 (0x1000): QUERY
  //          bit  8 (0x0100): DST_LINEAR
  //          bit  4 (0x0010): SRC_LINEAR
  //          bit  0 (0x0001): FORMAT
  //   $r4: dispatch table entry
  // Outputs:
  //   $r1: irqh state
  //   $p1: set on error (never set here)
  //   $r2: hostirq state
  //   $r3: data
  //
  // $r4 is reused as the hw command word that is built up step by step and
  // written to I[0x20000] to start the copy. Clobbers $r4-$r15 and $p2.
  cmd_exec:
     call #cmd_exec_wait

     // if format requested, call function to calculate it, otherwise
     // fill in cpp/xcnt for both surfaces as if (cpp == 1)
     xbit $r15 $r3 0
     bra e #cmd_exec_no_format
        call #cmd_exec_set_format
        // command starts out with bit 9 set when a format is in use
        mov $r4 0x200
        bra #cmd_exec_init_src_surface
  cmd_exec_no_format:
        mov $r6 0x810
        shl b32 $r6 6
        mov $r7 1
        st b32 D[$r0 + #ctx_src_cpp] $r7
        st b32 D[$r0 + #ctx_dst_cpp] $r7
        ld b32 $r7 D[$r0 + #ctx_xcnt]
        iowr I[$r6 + 0x000] $r7
        iowr I[$r6 + 0x100] $r7
        clear b32 $r4

     // source surface: $p2 clear, no context offset adjustment
  cmd_exec_init_src_surface:
     bclr $flags $p2
     clear b32 $r5
     xbit $r15 $r3 4
     bra e #cmd_exec_src_tiled
        call #cmd_exec_set_surface_linear
        bra #cmd_exec_init_dst_surface
  cmd_exec_src_tiled:
        call #cmd_exec_set_surface_tiled
        bset $r4 7

     // destination surface: $p2 set, $r5 = distance from the src variables
     // to the matching dst variables in the context
  cmd_exec_init_dst_surface:
     bset $flags $p2
     mov $r5 #ctx_dst_address_high - #ctx_src_address_high
     xbit $r15 $r3 8
     bra e #cmd_exec_dst_tiled
        call #cmd_exec_set_surface_linear
        bra #cmd_exec_kick
  cmd_exec_dst_tiled:
        call #cmd_exec_set_surface_tiled
        bset $r4 8

     // program the line count and start the copy
  cmd_exec_kick:
     mov $r5 0x800
     shl b32 $r5 6
     ld b32 $r6 D[$r0 + #ctx_ycnt]
     iowr I[$r5 + 0x100] $r6
     mov $r6 0x0041
     // SRC_TARGET = 1, DST_TARGET = 2
     sethi $r6 0x44000000
     or $r4 $r6
     iowr I[$r5] $r4

     // if requested, queue up a QUERY write after the copy has completed
     xbit $r15 $r3 12
     bra e #cmd_exec_done
        call #cmd_exec_query

  cmd_exec_done:
     ret
  // WRCACHE_FLUSH method handler: flush the write cache by writing
  // 0x00010000 to I[0x2200].
  //
  // Inputs:
  //   $r1: irqh state
  //   $r2: hostirq state
  //   $r3: data
  //   $r4: dispatch table entry
  // Outputs:
  //   $r1: irqh state
  //   $p1: set on error (never set here)
  //   $r2: hostirq state
  //   $r3: data
  //
  // Note: $r2 and $r3 are in fact overwritten. That is harmless as long as
  // $p1 stays clear, because dispatch only reports them on the error path
  // and reloads both registers on its normal exit.
  cmd_wrcache_flush:
     mov $r2 0x2200
     clear b32 $r3
     sethi $r3 0x10000
     iowr I[$r2] $r3
     ret

  // pad the code segment to a 256-byte boundary
  .align 0x100