/*
 * arch/alpha/lib/ev6-strncpy_from_user.S
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Just like strncpy except in the return value:
 *
 *	-EFAULT if an exception occurs before the terminator is copied.
 *	N if the buffer filled.
 *
 * Otherwise the length of the string is returned.
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * A bunch of instructions got moved and temp registers were changed
 * to aid in scheduling.  Control flow was also re-arranged to eliminate
 * branches, and to provide longer code sequences to enable better scheduling.
 * A total rewrite (using byte load/stores for start & tail sequences)
 * is desirable, but very difficult to do without a from-scratch rewrite.
 * Save that for the future.
 */
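
/*
 * For reference, a rough C sketch of the contract described above
 * (illustrative only, not part of the build; the byte-at-a-time loop and
 * the use of get_user() merely model the semantics, not the
 * quadword-at-a-time implementation below):
 *
 *	long __strncpy_from_user(char *dst, const char __user *src, long count)
 *	{
 *		long len = 0;
 *
 *		while (len < count) {
 *			char c;
 *
 *			if (get_user(c, src + len))
 *				return -EFAULT;	// faulted before the terminator
 *			dst[len] = c;
 *			if (c == '\0')
 *				return len;	// length, excluding the NUL
 *			len++;
 *		}
 *		return count;			// buffer filled: N
 *	}
 */
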
#include <asm/errno.h>
#include <asm/regdef.h>

/* Allow an exception for an insn; exit if we get one.  */
#define EX(x,y...)			\
	99: x,##y;			\
	.section __ex_table,"a";	\
	.long 99b - .;			\
	lda $31, $exception-99b($0); 	\
	.previous
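
/*
 * The __ex_table entry above records the faulting instruction together
 * with a fixup encoded as an lda: per the usual alpha exception-fixup
 * convention the error register ($0, i.e. v0) is expected to be loaded
 * with -EFAULT and execution resumed at $exception, which simply returns.
 */
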
	.set noat
	.set noreorder

	.text

	.globl __strncpy_from_user
	.ent __strncpy_from_user
	.frame $30, 0, $26
	.prologue 0

	.align 4
__strncpy_from_user:
	and	a0, 7, t3	# E : find dest misalignment
	beq	a2, $zerolength	# U :

	/* Are source and destination co-aligned?  */
	mov	a0, v0		# E : save the string start
	xor	a0, a1, t4	# E :
	EX( ldq_u t1, 0(a1) )	# L : Latency=3 load first quadword
	ldq_u	t0, 0(a0)	# L : load first (partial) aligned dest quadword
	addq	a2, t3, a2	# E : bias count by dest misalignment
	subq	a2, 1, a3	# E :
	addq	zero, 1, t10	# E :
	and	t4, 7, t4	# E : misalignment between the two
	and	a3, 7, t6	# E : number of tail bytes
	sll	t10, t6, t10	# E : t10 = bitmask of last count byte
	bne	t4, $unaligned	# U :
	lda	t2, -1		# E : build a mask against false zero
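
	/*
	 * Note that t10 is now a one-hot "end-of-count" bit: with the count
	 * biased by the destination misalignment, 1 << ((count - 1) & 7) is
	 * the position of the last buffer byte within its quadword, so the
	 * bit can simply be or'ed into a cmpbge result when the count runs
	 * out (see $a_eoc and $u_eocfin below).
	 */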

	/*
	 * We are co-aligned; take care of a partial first word.
	 * On entry to this basic block:
	 * t0 == the first destination word for masking back in
	 * t1 == the first source word.
	 */
	srl	a3, 3, a2	# E : a2 = loop counter = (count - 1)/8
	addq	a1, 8, a1	# E :
	mskqh	t2, a1, t2	# U : detection in the src word
	nop

	/* Create the 1st output word and detect 0's in the 1st input word.  */
	mskqh	t1, a1, t3	# U :
	mskql	t0, a1, t0	# U : assemble the first output word
	ornot	t1, t2, t2	# E :
	nop
	cmpbge	zero, t2, t8	# E : bits set iff null found
	or	t0, t3, t0	# E :
	beq	a2, $a_eoc	# U :
	bne	t8, $a_eos	# U : 2nd branch in a quad. Bad.
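
	/*
	 * The zero detection above relies on cmpbge: with a zero first
	 * operand, each result bit is set exactly when the corresponding
	 * byte of the second operand is 0.  A rough C model of that test
	 * (illustrative only; the helper name is made up):
	 *
	 *	static unsigned long cmpbge_zero(unsigned long x)
	 *	{
	 *		unsigned long mask = 0;
	 *		int i;
	 *
	 *		for (i = 0; i < 8; i++)
	 *			if (((x >> (8 * i)) & 0xff) == 0)
	 *				mask |= 1UL << i;	// bit i <=> byte i is NUL
	 *		return mask;
	 *	}
	 *
	 * The ornot above first forces the bytes before the string start to
	 * 0xff so that they can never produce a false hit.
	 */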

	/* On entry to this basic block:
	 * t0 == a source quad not containing a null.
	 * a0 - current aligned destination address
	 * a1 - current aligned source address
	 * a2 - count of quadwords to move.
	 * NOTE: Loop improvement - unrolling this is going to be
	 *	a huge win, since we're going to stall otherwise.
	 *	Fix this later.  For _really_ large copies, look
	 *	at using wh64 on a look-ahead basis.  See the code
	 *	in clear_user.S and copy_user.S.
	 *	Presumably such look-ahead is safe, since (a0) and (a1)
	 *	do not overlap (by C definition).
	 * Lots of nops here:
	 *	- Separate loads from stores
	 *	- Keep it to 1 branch/quadpack so the branch predictor
	 *	  can train.
	 */
$a_loop:
	stq_u	t0, 0(a0)	# L :
	addq	a0, 8, a0	# E :
	nop
	subq	a2, 1, a2	# E :
	EX( ldq_u t0, 0(a1) )	# L :
	addq	a1, 8, a1	# E :
	cmpbge	zero, t0, t8	# E : Stall 2 cycles on t0
	beq	a2, $a_eoc	# U :
	beq	t8, $a_loop	# U :
	nop
	nop
	nop

	/* Take care of the final (partial) word store.  At this point
	 * the end-of-count bit is set in t8 iff it applies.
	 *
	 * On entry to this basic block we have:
	 * t0 == the source word containing the null
	 * t8 == the cmpbge mask that found it.
	 */
$a_eos:
	negq	t8, t12		# E : find low bit set
	and	t8, t12, t12	# E :

	/* We're doing a partial word store and so need to combine
	   our source and original destination words.  */
	ldq_u	t1, 0(a0)	# L :
	subq	t12, 1, t6	# E :
	or	t12, t6, t8	# E :
	zapnot	t0, t8, t0	# U : clear src bytes > null
	zap	t1, t8, t1	# U : clear dst bytes <= null
	or	t0, t1, t0	# E :
	stq_u	t0, 0(a0)	# L :
	br	$finish_up	# L0 :
	nop
	nop
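
	/*
	 * The mask construction above, in rough C (illustrative only; all
	 * names are made up, and bytemask() stands for the bit-to-byte
	 * expansion that zapnot/zap perform in hardware):
	 *
	 *	static unsigned long bytemask(unsigned long m)
	 *	{
	 *		unsigned long r = 0;
	 *		int i;
	 *
	 *		for (i = 0; i < 8; i++)
	 *			if (m & (1UL << i))
	 *				r |= 0xffUL << (8 * i);
	 *		return r;
	 *	}
	 *
	 *	// t8 has a bit set for the null (and/or end-of-count) byte
	 *	unsigned long low  = t8 & -t8;		// lowest set bit
	 *	unsigned long keep = low | (low - 1);	// that byte and below
	 *	t0 = (t0 & bytemask(keep)) |		// src up to the null...
	 *	     (t1 & ~bytemask(keep));		// ...dst bytes after it
	 */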

	/* Add the end-of-count bit to the eos detection bitmask.  */
	.align 4
$a_eoc:
	or	t10, t8, t8
	br	$a_eos
	nop
	nop

	/* The source and destination are not co-aligned.  Align the
	   destination and cope.  We have to be very careful about not
	   reading too much and causing a SEGV.  */

	.align 4
$u_head:
	/* We know just enough now to be able to assemble the first
	   full source word.  We can still find a zero at the end of it
	   that prevents us from outputting the whole thing.
	   On entry to this basic block:
	   t0 == the first dest word, unmasked
	   t1 == the shifted low bits of the first source word
	   t6 == bytemask that is -1 in dest word bytes */

	EX( ldq_u t2, 8(a1) )	# L : load second src word
	addq	a1, 8, a1	# E :
	mskql	t0, a0, t0	# U : mask trailing garbage in dst
	extqh	t2, a1, t4	# U :
	or	t1, t4, t1	# E : first aligned src word complete
	mskqh	t1, a0, t1	# U : mask leading garbage in src
	or	t0, t1, t0	# E : first output word complete
	or	t0, t6, t6	# E : mask original data for zero test
	cmpbge	zero, t6, t8	# E :
	beq	a2, $u_eocfin	# U :
	bne	t8, $u_final	# U : bad news - 2nd branch in a quad
	lda	t6, -1		# E : mask out the bits we have
	mskql	t6, a1, t6	# U : already seen
	stq_u	t0, 0(a0)	# L : store first output word
	or	t6, t2, t2	# E :
	cmpbge	zero, t2, t8	# E : find nulls in second partial
	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :
	bne	t8, $u_late_head_exit	# U :
	nop

	/* Finally, we've got all the stupid leading edge cases taken care
	   of and we can set up to enter the main loop.  */
	extql	t2, a1, t1	# U : position hi-bits of lo word
	EX( ldq_u t2, 8(a1) )	# L : read next high-order source word
	addq	a1, 8, a1	# E :
	cmpbge	zero, t2, t8	# E :
	beq	a2, $u_eoc	# U :
	bne	t8, $u_eos	# U :
	nop
	nop

	/* Unaligned copy main loop.  In order to avoid reading too much,
	   the loop is structured to detect zeros in aligned source words.
	   This has, unfortunately, effectively pulled half of a loop
	   iteration out into the head and half into the tail, but it does
	   prevent nastiness from accumulating in the very thing we want
	   to run as fast as possible.
	   On entry to this basic block:
	   t1 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word
	   We further know that t2 does not contain a null terminator.  */

	/*
	 * Extra nops here:
	 *	separate load quads from store quads
	 *	only one branch/quad to permit predictor training
	 */

	.align 4
$u_loop:
	extqh	t2, a1, t0	# U : extract high bits for current word
	addq	a1, 8, a1	# E :
	extql	t2, a1, t3	# U : extract low bits for next time
	addq	a0, 8, a0	# E :
	or	t0, t1, t0	# E : current dst word now complete
	EX( ldq_u t2, 0(a1) )	# L : load high word for next time
	subq	a2, 1, a2	# E :
	nop
	stq_u	t0, -8(a0)	# L : save the current word
	mov	t3, t1		# E :
	cmpbge	zero, t2, t8	# E : test new word for eos
	beq	a2, $u_eoc	# U :
	beq	t8, $u_loop	# U :
	nop
	nop
	nop
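
	/*
	 * The extqh/extql pair is the usual alpha idiom for reassembling an
	 * unaligned quadword from two aligned loads.  For the non-zero byte
	 * offset k = a1 & 7 that holds throughout this loop, a rough C model
	 * (illustrative only; the helper names are made up) is:
	 *
	 *	static unsigned long extql_model(unsigned long x, unsigned long addr)
	 *	{
	 *		return x >> (8 * (addr & 7));		// low piece
	 *	}
	 *
	 *	static unsigned long extqh_model(unsigned long x, unsigned long addr)
	 *	{
	 *		return x << (64 - 8 * (addr & 7));	// high piece (offset != 0)
	 *	}
	 *
	 * so each iteration stores extqh_model(cur, a1) | extql_model(prev, a1):
	 * the top bytes of the previous source quad joined with the bottom
	 * bytes of the current one, shifted into destination alignment.
	 */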

	/* We've found a zero somewhere in the source word we just read.
	   If it resides in the lower half, we have one (probably partial)
	   word to write out, and if it resides in the upper half, we
	   have one full and one partial word left to write out.
	   On entry to this basic block:
	   t1 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word.  */

	.align 4
$u_eos:
	extqh	t2, a1, t0	# U :
	or	t0, t1, t0	# E : first (partial) source word complete
	cmpbge	zero, t0, t8	# E : is the null in this first bit?
	nop
	bne	t8, $u_final	# U :
	stq_u	t0, 0(a0)	# L : the null was in the high-order bits
	addq	a0, 8, a0	# E :
	subq	a2, 1, a2	# E :

	.align 4
$u_late_head_exit:
	extql	t2, a1, t0	# U :
	cmpbge	zero, t0, t8	# E :
	or	t8, t10, t6	# E :
	cmoveq	a2, t6, t8	# E :

	/* Take care of a final (probably partial) result word.
	   On entry to this basic block:
	   t0 == assembled source word
	   t8 == cmpbge mask that found the null.  */

	.align 4
$u_final:
	negq	t8, t6		# E : isolate low bit set
	and	t6, t8, t12	# E :
	ldq_u	t1, 0(a0)	# L :
	subq	t12, 1, t6	# E :
	or	t6, t12, t8	# E :
	zapnot	t0, t8, t0	# U : kill source bytes > null
	zap	t1, t8, t1	# U : kill dest bytes <= null
	or	t0, t1, t0	# E :
	stq_u	t0, 0(a0)	# E :
	br	$finish_up	# U :
	nop
	nop

	.align 4
$u_eoc:			# end-of-count
	extqh	t2, a1, t0	# U :
	or	t0, t1, t0	# E :
	cmpbge	zero, t0, t8	# E :
	nop

	.align 4
$u_eocfin:		# end-of-count, final word
	or	t10, t8, t8	# E :
	br	$u_final	# U :
	nop
	nop

	/* Unaligned copy entry point.  */
	.align 4
$unaligned:
	srl	a3, 3, a2	# U : a2 = loop counter = (count - 1)/8
	and	a0, 7, t4	# E : find dest misalignment
	and	a1, 7, t5	# E : find src misalignment
	mov	zero, t0	# E :

	/* Conditionally load the first destination word and a bytemask
	   with 0xff indicating that the destination byte is sacrosanct.  */
	mov	zero, t6	# E :
	beq	t4, 1f		# U :
	ldq_u	t0, 0(a0)	# L :
	lda	t6, -1		# E :
	mskql	t6, a0, t6	# E :
	nop
	nop
	nop

	.align 4
1:
	subq	a1, t4, a1	# E : sub dest misalignment from src addr

	/* If source misalignment is larger than dest misalignment, we need
	   extra startup checks to avoid SEGV.  */
	cmplt	t4, t5, t12	# E :
	extql	t1, a1, t1	# U : shift src into place
	lda	t2, -1		# E : for creating masks later
	beq	t12, $u_head	# U :
	mskqh	t2, t5, t2	# U : begin src byte validity mask
	cmpbge	zero, t1, t8	# E : is there a zero?
	nop
	extql	t2, a1, t2	# U :
	or	t8, t10, t5	# E : test for end-of-count too
	cmpbge	zero, t2, t3	# E :
	cmoveq	a2, t5, t8	# E : Latency=2, extra map slot
	nop			# E : goes with cmov
	andnot	t8, t3, t8	# E :
	beq	t8, $u_head	# U :
	nop

	/* At this point we've found a zero in the first partial word of
	   the source.  We need to isolate the valid source data and mask
	   it into the original destination data.  (Incidentally, we know
	   that we'll need at least one byte of that original dest word.) */
	ldq_u	t0, 0(a0)	# L :
	negq	t8, t6		# E : build bitmask of bytes <= zero
	mskqh	t1, t4, t1	# U :
	and	t6, t8, t12	# E :
	subq	t12, 1, t6	# E :
	or	t6, t12, t8	# E :
	zapnot	t2, t8, t2	# U : prepare source word; mirror changes
	zapnot	t1, t8, t1	# U : to source validity mask
	andnot	t0, t2, t0	# E : zero place for source to reside
	or	t0, t1, t0	# E : and put it there
	stq_u	t0, 0(a0)	# L :
	nop

	.align 4
$finish_up:
	zapnot	t0, t12, t4	# U : was last byte written null?
	and	t12, 0xf0, t3	# E : binary search for the address of the
	cmovne	t4, 1, t4	# E : Latency=2, extra map slot
	nop			# E : with cmovne
	and	t12, 0xcc, t2	# E : last byte written
	and	t12, 0xaa, t1	# E :
	cmovne	t3, 4, t3	# E : Latency=2, extra map slot
	nop			# E : with cmovne
	bic	a0, 7, t0
	cmovne	t2, 2, t2	# E : Latency=2, extra map slot
	nop			# E : with cmovne
	nop
	cmovne	t1, 1, t1	# E : Latency=2, extra map slot
	nop			# E : with cmovne
	addq	t0, t3, t0	# E :
	addq	t1, t2, t1	# E :
	addq	t0, t1, t0	# E :
	addq	t0, t4, t0	# add one if we filled the buffer
	subq	t0, v0, v0	# find string length
	ret			# L0 :
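
	/*
	 * The cmovne cascade above is a branch-free binary search for the
	 * bit index of t12 (a one-hot mask selecting the last byte written),
	 * which is then turned into the returned length.  A rough C model
	 * (illustrative only; all names are made up):
	 *
	 *	static long finish_up_model(unsigned long t12, unsigned long last_word,
	 *				    unsigned long a0, unsigned long start)
	 *	{
	 *		long idx = 0;
	 *		long len;
	 *
	 *		if (t12 & 0xf0) idx += 4;	// which nibble of the mask
	 *		if (t12 & 0xcc) idx += 2;	// which pair within it
	 *		if (t12 & 0xaa) idx += 1;	// which bit within the pair
	 *
	 *		// address of the last byte written = aligned word base + idx
	 *		len = (long)((a0 & ~7UL) + idx - start);
	 *
	 *		// if that byte is not the NUL, the buffer filled: count it too
	 *		if (last_word & (0xffUL << (8 * idx)))
	 *			len += 1;
	 *		return len;
	 *	}
	 */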

	.align 4
$zerolength:
	nop
	nop
	nop
	clr	v0

$exception:
	nop
	nop
	nop
	ret

	.end __strncpy_from_user