@ csum_partial_copy_generic.S (arm26) — extracted listing; pagination
@ line-number artifacts removed.
/*
 * linux/arch/arm26/lib/csum_partial_copy_generic.S
 *
 * Copyright (C) 1995-2001 Russell King
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * JMA 01/06/03 Commented out some shl0s; probably irrelevant to arm26
 *
 */
/*
 * unsigned int
 * csum_partial_copy_xxx(const char *src, char *dst, int len, int sum)
 * r0 = src, r1 = dst, r2 = len, r3 = sum
 * Returns : r0 = checksum
 *
 * Note that 'tst' and 'teq' preserve the carry flag.
 */
  21. /* Quick hack */
  22. .macro save_regs
  23. stmfd sp!, {r1, r4 - r8, fp, ip, lr, pc}
  24. .endm
  25. /* end Quick Hack */
  26. src .req r0
  27. dst .req r1
  28. len .req r2
  29. sum .req r3
  30. .zero: mov r0, sum
  31. load_regs ea
  32. /*
  33. * Align an unaligned destination pointer. We know that
  34. * we have >= 8 bytes here, so we don't need to check
  35. * the length. Note that the source pointer hasn't been
  36. * aligned yet.
  37. */
  38. .dst_unaligned: tst dst, #1
  39. beq .dst_16bit
  40. load1b ip
  41. sub len, len, #1
  42. adcs sum, sum, ip, lsl #byte(1) @ update checksum
  43. strb ip, [dst], #1
  44. tst dst, #2
  45. moveq pc, lr @ dst is now 32bit aligned
  46. .dst_16bit: load2b r8, ip
  47. sub len, len, #2
  48. adcs sum, sum, r8, lsl #byte(0)
  49. strb r8, [dst], #1
  50. adcs sum, sum, ip, lsl #byte(1)
  51. strb ip, [dst], #1
  52. mov pc, lr @ dst is now 32bit aligned
  53. /*
  54. * Handle 0 to 7 bytes, with any alignment of source and
  55. * destination pointers. Note that when we get here, C = 0
  56. */
  57. .less8: teq len, #0 @ check for zero count
  58. beq .zero
  59. /* we must have at least one byte. */
  60. tst dst, #1 @ dst 16-bit aligned
  61. beq .less8_aligned
  62. /* Align dst */
  63. load1b ip
  64. sub len, len, #1
  65. adcs sum, sum, ip, lsl #byte(1) @ update checksum
  66. strb ip, [dst], #1
  67. tst len, #6
  68. beq .less8_byteonly
  69. 1: load2b r8, ip
  70. sub len, len, #2
  71. adcs sum, sum, r8, lsl #byte(0)
  72. strb r8, [dst], #1
  73. adcs sum, sum, ip, lsl #byte(1)
  74. strb ip, [dst], #1
  75. .less8_aligned: tst len, #6
  76. bne 1b
  77. .less8_byteonly:
  78. tst len, #1
  79. beq .done
  80. load1b r8
  81. adcs sum, sum, r8, lsl #byte(0) @ update checksum
  82. strb r8, [dst], #1
  83. b .done
  84. FN_ENTRY
  85. mov ip, sp
  86. save_regs
  87. sub fp, ip, #4
  88. cmp len, #8 @ Ensure that we have at least
  89. blo .less8 @ 8 bytes to copy.
  90. adds sum, sum, #0 @ C = 0
  91. tst dst, #3 @ Test destination alignment
  92. blne .dst_unaligned @ align destination, return here
  93. /*
  94. * Ok, the dst pointer is now 32bit aligned, and we know
  95. * that we must have more than 4 bytes to copy. Note
  96. * that C contains the carry from the dst alignment above.
  97. */
  98. tst src, #3 @ Test source alignment
  99. bne .src_not_aligned
  100. /* Routine for src & dst aligned */
  101. bics ip, len, #15
  102. beq 2f
  103. 1: load4l r4, r5, r6, r7
  104. stmia dst!, {r4, r5, r6, r7}
  105. adcs sum, sum, r4
  106. adcs sum, sum, r5
  107. adcs sum, sum, r6
  108. adcs sum, sum, r7
  109. sub ip, ip, #16
  110. teq ip, #0
  111. bne 1b
  112. 2: ands ip, len, #12
  113. beq 4f
  114. tst ip, #8
  115. beq 3f
  116. load2l r4, r5
  117. stmia dst!, {r4, r5}
  118. adcs sum, sum, r4
  119. adcs sum, sum, r5
  120. tst ip, #4
  121. beq 4f
  122. 3: load1l r4
  123. str r4, [dst], #4
  124. adcs sum, sum, r4
  125. 4: ands len, len, #3
  126. beq .done
  127. load1l r4
  128. tst len, #2
  129. /* mov r5, r4, lsr #byte(0)
  130. FIXME? 0 Shift anyhow!
  131. */
  132. beq .exit
  133. adcs sum, sum, r4, push #16
  134. strb r5, [dst], #1
  135. mov r5, r4, lsr #byte(1)
  136. strb r5, [dst], #1
  137. mov r5, r4, lsr #byte(2)
  138. .exit: tst len, #1
  139. strneb r5, [dst], #1
  140. andne r5, r5, #255
  141. adcnes sum, sum, r5, lsl #byte(0)
  142. /*
  143. * If the dst pointer was not 16-bit aligned, we
  144. * need to rotate the checksum here to get around
  145. * the inefficient byte manipulations in the
  146. * architecture independent code.
  147. */
  148. .done: adc r0, sum, #0
  149. ldr sum, [sp, #0] @ dst
  150. tst sum, #1
  151. movne sum, r0, lsl #8
  152. orrne r0, sum, r0, lsr #24
  153. load_regs ea
  154. .src_not_aligned:
  155. adc sum, sum, #0 @ include C from dst alignment
  156. and ip, src, #3
  157. bic src, src, #3
  158. load1l r5
  159. cmp ip, #2
  160. beq .src2_aligned
  161. bhi .src3_aligned
  162. mov r4, r5, pull #8 @ C = 0
  163. bics ip, len, #15
  164. beq 2f
  165. 1: load4l r5, r6, r7, r8
  166. orr r4, r4, r5, push #24
  167. mov r5, r5, pull #8
  168. orr r5, r5, r6, push #24
  169. mov r6, r6, pull #8
  170. orr r6, r6, r7, push #24
  171. mov r7, r7, pull #8
  172. orr r7, r7, r8, push #24
  173. stmia dst!, {r4, r5, r6, r7}
  174. adcs sum, sum, r4
  175. adcs sum, sum, r5
  176. adcs sum, sum, r6
  177. adcs sum, sum, r7
  178. mov r4, r8, pull #8
  179. sub ip, ip, #16
  180. teq ip, #0
  181. bne 1b
  182. 2: ands ip, len, #12
  183. beq 4f
  184. tst ip, #8
  185. beq 3f
  186. load2l r5, r6
  187. orr r4, r4, r5, push #24
  188. mov r5, r5, pull #8
  189. orr r5, r5, r6, push #24
  190. stmia dst!, {r4, r5}
  191. adcs sum, sum, r4
  192. adcs sum, sum, r5
  193. mov r4, r6, pull #8
  194. tst ip, #4
  195. beq 4f
  196. 3: load1l r5
  197. orr r4, r4, r5, push #24
  198. str r4, [dst], #4
  199. adcs sum, sum, r4
  200. mov r4, r5, pull #8
  201. 4: ands len, len, #3
  202. beq .done
  203. /* mov r5, r4, lsr #byte(0)
  204. FIXME? 0 Shift anyhow
  205. */
  206. tst len, #2
  207. beq .exit
  208. adcs sum, sum, r4, push #16
  209. strb r5, [dst], #1
  210. mov r5, r4, lsr #byte(1)
  211. strb r5, [dst], #1
  212. mov r5, r4, lsr #byte(2)
  213. b .exit
  214. .src2_aligned: mov r4, r5, pull #16
  215. adds sum, sum, #0
  216. bics ip, len, #15
  217. beq 2f
  218. 1: load4l r5, r6, r7, r8
  219. orr r4, r4, r5, push #16
  220. mov r5, r5, pull #16
  221. orr r5, r5, r6, push #16
  222. mov r6, r6, pull #16
  223. orr r6, r6, r7, push #16
  224. mov r7, r7, pull #16
  225. orr r7, r7, r8, push #16
  226. stmia dst!, {r4, r5, r6, r7}
  227. adcs sum, sum, r4
  228. adcs sum, sum, r5
  229. adcs sum, sum, r6
  230. adcs sum, sum, r7
  231. mov r4, r8, pull #16
  232. sub ip, ip, #16
  233. teq ip, #0
  234. bne 1b
  235. 2: ands ip, len, #12
  236. beq 4f
  237. tst ip, #8
  238. beq 3f
  239. load2l r5, r6
  240. orr r4, r4, r5, push #16
  241. mov r5, r5, pull #16
  242. orr r5, r5, r6, push #16
  243. stmia dst!, {r4, r5}
  244. adcs sum, sum, r4
  245. adcs sum, sum, r5
  246. mov r4, r6, pull #16
  247. tst ip, #4
  248. beq 4f
  249. 3: load1l r5
  250. orr r4, r4, r5, push #16
  251. str r4, [dst], #4
  252. adcs sum, sum, r4
  253. mov r4, r5, pull #16
  254. 4: ands len, len, #3
  255. beq .done
  256. /* mov r5, r4, lsr #byte(0)
  257. FIXME? 0 Shift anyhow
  258. */
  259. tst len, #2
  260. beq .exit
  261. adcs sum, sum, r4
  262. strb r5, [dst], #1
  263. mov r5, r4, lsr #byte(1)
  264. strb r5, [dst], #1
  265. tst len, #1
  266. beq .done
  267. load1b r5
  268. b .exit
  269. .src3_aligned: mov r4, r5, pull #24
  270. adds sum, sum, #0
  271. bics ip, len, #15
  272. beq 2f
  273. 1: load4l r5, r6, r7, r8
  274. orr r4, r4, r5, push #8
  275. mov r5, r5, pull #24
  276. orr r5, r5, r6, push #8
  277. mov r6, r6, pull #24
  278. orr r6, r6, r7, push #8
  279. mov r7, r7, pull #24
  280. orr r7, r7, r8, push #8
  281. stmia dst!, {r4, r5, r6, r7}
  282. adcs sum, sum, r4
  283. adcs sum, sum, r5
  284. adcs sum, sum, r6
  285. adcs sum, sum, r7
  286. mov r4, r8, pull #24
  287. sub ip, ip, #16
  288. teq ip, #0
  289. bne 1b
  290. 2: ands ip, len, #12
  291. beq 4f
  292. tst ip, #8
  293. beq 3f
  294. load2l r5, r6
  295. orr r4, r4, r5, push #8
  296. mov r5, r5, pull #24
  297. orr r5, r5, r6, push #8
  298. stmia dst!, {r4, r5}
  299. adcs sum, sum, r4
  300. adcs sum, sum, r5
  301. mov r4, r6, pull #24
  302. tst ip, #4
  303. beq 4f
  304. 3: load1l r5
  305. orr r4, r4, r5, push #8
  306. str r4, [dst], #4
  307. adcs sum, sum, r4
  308. mov r4, r5, pull #24
  309. 4: ands len, len, #3
  310. beq .done
  311. /* mov r5, r4, lsr #byte(0)
  312. FIXME? 0 Shift anyhow
  313. */
  314. tst len, #2
  315. beq .exit
  316. strb r5, [dst], #1
  317. adcs sum, sum, r4
  318. load1l r4
  319. /* mov r5, r4, lsr #byte(0)
  320. FIXME? 0 Shift anyhow
  321. */
  322. strb r5, [dst], #1
  323. adcs sum, sum, r4, push #24
  324. mov r5, r4, lsr #byte(1)
  325. b .exit