xprtsock.c 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156
  1. /*
  2. * linux/net/sunrpc/xprtsock.c
  3. *
  4. * Client-side transport implementation for sockets.
  5. *
  6. * TCP callback races fixes (C) 1998 Red Hat Software <alan@redhat.com>
  7. * TCP send fixes (C) 1998 Red Hat Software <alan@redhat.com>
  8. * TCP NFS related read + write fixes
  9. * (C) 1999 Dave Airlie, University of Limerick, Ireland <airlied@linux.ie>
  10. *
  11. * Rewrite of large parts of the code in order to stabilize TCP stuff.
  12. * Fix behaviour when socket buffer is full.
  13. * (C) 1999 Trond Myklebust <trond.myklebust@fys.uio.no>
  14. *
  15. * IP socket transport implementation, (C) 2005 Chuck Lever <cel@netapp.com>
  16. */
  17. #include <linux/types.h>
  18. #include <linux/slab.h>
  19. #include <linux/capability.h>
  20. #include <linux/sched.h>
  21. #include <linux/pagemap.h>
  22. #include <linux/errno.h>
  23. #include <linux/socket.h>
  24. #include <linux/in.h>
  25. #include <linux/net.h>
  26. #include <linux/mm.h>
  27. #include <linux/udp.h>
  28. #include <linux/tcp.h>
  29. #include <linux/sunrpc/clnt.h>
  30. #include <linux/file.h>
  31. #include <net/sock.h>
  32. #include <net/checksum.h>
  33. #include <net/udp.h>
  34. #include <net/tcp.h>
  35. /*
  36. * Maximum port number to use when requesting a reserved port.
  37. */
  38. #define XS_MAX_RESVPORT (800U)
  39. /*
  40. * How many times to try sending a request on a socket before waiting
  41. * for the socket buffer to clear.
  42. */
  43. #define XS_SENDMSG_RETRY (10U)
  44. #ifdef RPC_DEBUG
  45. # undef RPC_DEBUG_DATA
  46. # define RPCDBG_FACILITY RPCDBG_TRANS
  47. #endif
  48. #ifdef RPC_DEBUG_DATA
  49. static void xs_pktdump(char *msg, u32 *packet, unsigned int count)
  50. {
  51. u8 *buf = (u8 *) packet;
  52. int j;
  53. dprintk("RPC: %s\n", msg);
  54. for (j = 0; j < count && j < 128; j += 4) {
  55. if (!(j & 31)) {
  56. if (j)
  57. dprintk("\n");
  58. dprintk("0x%04x ", j);
  59. }
  60. dprintk("%02x%02x%02x%02x ",
  61. buf[j], buf[j+1], buf[j+2], buf[j+3]);
  62. }
  63. dprintk("\n");
  64. }
  65. #else
  66. static inline void xs_pktdump(char *msg, u32 *packet, unsigned int count)
  67. {
  68. /* NOP */
  69. }
  70. #endif
  71. #define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL)
  72. static inline int xs_send_head(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, unsigned int len)
  73. {
  74. struct kvec iov = {
  75. .iov_base = xdr->head[0].iov_base + base,
  76. .iov_len = len - base,
  77. };
  78. struct msghdr msg = {
  79. .msg_name = addr,
  80. .msg_namelen = addrlen,
  81. .msg_flags = XS_SENDMSG_FLAGS,
  82. };
  83. if (xdr->len > len)
  84. msg.msg_flags |= MSG_MORE;
  85. if (likely(iov.iov_len))
  86. return kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len);
  87. return kernel_sendmsg(sock, &msg, NULL, 0, 0);
  88. }
  89. static int xs_send_tail(struct socket *sock, struct xdr_buf *xdr, unsigned int base, unsigned int len)
  90. {
  91. struct kvec iov = {
  92. .iov_base = xdr->tail[0].iov_base + base,
  93. .iov_len = len - base,
  94. };
  95. struct msghdr msg = {
  96. .msg_flags = XS_SENDMSG_FLAGS,
  97. };
  98. return kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len);
  99. }
  100. /**
  101. * xs_sendpages - write pages directly to a socket
  102. * @sock: socket to send on
  103. * @addr: UDP only -- address of destination
  104. * @addrlen: UDP only -- length of destination address
  105. * @xdr: buffer containing this request
  106. * @base: starting position in the buffer
  107. *
  108. */
  109. static inline int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base)
  110. {
  111. struct page **ppage = xdr->pages;
  112. unsigned int len, pglen = xdr->page_len;
  113. int err, ret = 0;
  114. ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int);
  115. if (unlikely(!sock))
  116. return -ENOTCONN;
  117. clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
  118. len = xdr->head[0].iov_len;
  119. if (base < len || (addr != NULL && base == 0)) {
  120. err = xs_send_head(sock, addr, addrlen, xdr, base, len);
  121. if (ret == 0)
  122. ret = err;
  123. else if (err > 0)
  124. ret += err;
  125. if (err != (len - base))
  126. goto out;
  127. base = 0;
  128. } else
  129. base -= len;
  130. if (unlikely(pglen == 0))
  131. goto copy_tail;
  132. if (unlikely(base >= pglen)) {
  133. base -= pglen;
  134. goto copy_tail;
  135. }
  136. if (base || xdr->page_base) {
  137. pglen -= base;
  138. base += xdr->page_base;
  139. ppage += base >> PAGE_CACHE_SHIFT;
  140. base &= ~PAGE_CACHE_MASK;
  141. }
  142. sendpage = sock->ops->sendpage ? : sock_no_sendpage;
  143. do {
  144. int flags = XS_SENDMSG_FLAGS;
  145. len = PAGE_CACHE_SIZE;
  146. if (base)
  147. len -= base;
  148. if (pglen < len)
  149. len = pglen;
  150. if (pglen != len || xdr->tail[0].iov_len != 0)
  151. flags |= MSG_MORE;
  152. /* Hmm... We might be dealing with highmem pages */
  153. if (PageHighMem(*ppage))
  154. sendpage = sock_no_sendpage;
  155. err = sendpage(sock, *ppage, base, len, flags);
  156. if (ret == 0)
  157. ret = err;
  158. else if (err > 0)
  159. ret += err;
  160. if (err != len)
  161. goto out;
  162. base = 0;
  163. ppage++;
  164. } while ((pglen -= len) != 0);
  165. copy_tail:
  166. len = xdr->tail[0].iov_len;
  167. if (base < len) {
  168. err = xs_send_tail(sock, xdr, base, len);
  169. if (ret == 0)
  170. ret = err;
  171. else if (err > 0)
  172. ret += err;
  173. }
  174. out:
  175. return ret;
  176. }
  177. /**
  178. * xs_nospace - place task on wait queue if transmit was incomplete
  179. * @task: task to put to sleep
  180. *
  181. */
  182. static void xs_nospace(struct rpc_task *task)
  183. {
  184. struct rpc_rqst *req = task->tk_rqstp;
  185. struct rpc_xprt *xprt = req->rq_xprt;
  186. dprintk("RPC: %4d xmit incomplete (%u left of %u)\n",
  187. task->tk_pid, req->rq_slen - req->rq_bytes_sent,
  188. req->rq_slen);
  189. if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) {
  190. /* Protect against races with write_space */
  191. spin_lock_bh(&xprt->transport_lock);
  192. /* Don't race with disconnect */
  193. if (!xprt_connected(xprt))
  194. task->tk_status = -ENOTCONN;
  195. else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags))
  196. xprt_wait_for_buffer_space(task);
  197. spin_unlock_bh(&xprt->transport_lock);
  198. } else
  199. /* Keep holding the socket if it is blocked */
  200. rpc_delay(task, HZ>>4);
  201. }
  /**
   * xs_udp_send_request - write an RPC request to a UDP socket
   * @task: address of RPC task that manages the state of an RPC request
   *
   * Return values:
   *        0:	The request has been sent
   *   EAGAIN:	The socket was blocked, please call again later to
   *		complete the request
   * ENOTCONN:	Caller needs to invoke connect logic then call again
   *    other:	Some other error occurred, the request was not sent
   */
  static int xs_udp_send_request(struct rpc_task *task)
  {
  	struct rpc_rqst *rqst = task->tk_rqstp;
  	struct rpc_xprt *xprt = rqst->rq_xprt;
  	struct xdr_buf *sndbuf = &rqst->rq_snd_buf;
  	int status;

  	xs_pktdump("packet data:",
  			rqst->rq_svec->iov_base,
  			rqst->rq_svec->iov_len);

  	rqst->rq_xtime = jiffies;
  	status = xs_sendpages(xprt->sock, (struct sockaddr *) &xprt->addr,
  			sizeof(xprt->addr), sndbuf, rqst->rq_bytes_sent);

  	dprintk("RPC: xs_udp_send_request(%u) = %d\n",
  			sndbuf->len - rqst->rq_bytes_sent, status);

  	if (likely(status >= (int) rqst->rq_slen))
  		return 0;

  	/* Still some bytes left; set up for a retry later. */
  	if (status > 0)
  		status = -EAGAIN;

  	switch (status) {
  	case -EAGAIN:
  		xs_nospace(task);
  		break;
  	case -ECONNREFUSED:
  		/* When the server has died, an ICMP port unreachable message
  		 * prompts ECONNREFUSED. */
  	case -ENETUNREACH:
  	case -EPIPE:
  		/* Handed back to the caller as-is */
  		break;
  	default:
  		dprintk("RPC: sendmsg returned unrecognized error %d\n",
  			-status);
  		break;
  	}

  	return status;
  }
  249. static inline void xs_encode_tcp_record_marker(struct xdr_buf *buf)
  250. {
  251. u32 reclen = buf->len - sizeof(rpc_fraghdr);
  252. rpc_fraghdr *base = buf->head[0].iov_base;
  253. *base = htonl(RPC_LAST_STREAM_FRAGMENT | reclen);
  254. }
  /**
   * xs_tcp_send_request - write an RPC request to a TCP socket
   * @task: address of RPC task that manages the state of an RPC request
   *
   * Return values:
   *        0:	The request has been sent
   *   EAGAIN:	The socket was blocked, please call again later to
   *		complete the request
   * ENOTCONN:	Caller needs to invoke connect logic then call again
   *    other:	Some other error occurred, the request was not sent
   *
   * XXX: In the case of soft timeouts, should we eventually give up
   *	if sendmsg is not able to make progress?
   */
  static int xs_tcp_send_request(struct rpc_task *task)
  {
  	struct rpc_rqst *rqst = task->tk_rqstp;
  	struct rpc_xprt *xprt = rqst->rq_xprt;
  	struct xdr_buf *sndbuf = &rqst->rq_snd_buf;
  	int status;
  	int attempt;

  	xs_encode_tcp_record_marker(&rqst->rq_snd_buf);

  	xs_pktdump("packet data:",
  			rqst->rq_svec->iov_base,
  			rqst->rq_svec->iov_len);

  	/* Continue transmitting the packet/record. We must be careful
  	 * to cope with writespace callbacks arriving _after_ we have
  	 * called sendmsg(). */
  	for (attempt = 0; ; attempt++) {
  		rqst->rq_xtime = jiffies;
  		status = xs_sendpages(xprt->sock, NULL, 0, sndbuf,
  				rqst->rq_bytes_sent);

  		dprintk("RPC: xs_tcp_send_request(%u) = %d\n",
  				sndbuf->len - rqst->rq_bytes_sent, status);

  		if (unlikely(status < 0))
  			break;

  		rqst->rq_bytes_sent += status;

  		/* If we've sent the entire packet, immediately
  		 * reset the count of bytes sent. */
  		if (likely(rqst->rq_bytes_sent >= rqst->rq_slen)) {
  			rqst->rq_bytes_sent = 0;
  			return 0;
  		}

  		/* Partial send: try again until the retry budget runs out */
  		status = -EAGAIN;
  		if (attempt > XS_SENDMSG_RETRY)
  			break;
  	}

  	switch (status) {
  	case -EAGAIN:
  		xs_nospace(task);
  		break;
  	case -ENOTCONN:
  	case -ECONNRESET:
  	case -ECONNREFUSED:
  	case -EPIPE:
  		/* All of these are reported to the caller as -ENOTCONN */
  		status = -ENOTCONN;
  		break;
  	default:
  		dprintk("RPC: sendmsg returned unrecognized error %d\n",
  			-status);
  		xprt_disconnect(xprt);
  		break;
  	}

  	return status;
  }
  319. /**
  320. * xs_close - close a socket
  321. * @xprt: transport
  322. *
  323. */
  324. static void xs_close(struct rpc_xprt *xprt)
  325. {
  326. struct socket *sock = xprt->sock;
  327. struct sock *sk = xprt->inet;
  328. if (!sk)
  329. return;
  330. dprintk("RPC: xs_close xprt %p\n", xprt);
  331. write_lock_bh(&sk->sk_callback_lock);
  332. xprt->inet = NULL;
  333. xprt->sock = NULL;
  334. sk->sk_user_data = NULL;
  335. sk->sk_data_ready = xprt->old_data_ready;
  336. sk->sk_state_change = xprt->old_state_change;
  337. sk->sk_write_space = xprt->old_write_space;
  338. write_unlock_bh(&sk->sk_callback_lock);
  339. sk->sk_no_check = 0;
  340. sock_release(sock);
  341. }
  342. /**
  343. * xs_destroy - prepare to shutdown a transport
  344. * @xprt: doomed transport
  345. *
  346. */
  347. static void xs_destroy(struct rpc_xprt *xprt)
  348. {
  349. dprintk("RPC: xs_destroy xprt %p\n", xprt);
  350. cancel_delayed_work(&xprt->connect_worker);
  351. flush_scheduled_work();
  352. xprt_disconnect(xprt);
  353. xs_close(xprt);
  354. kfree(xprt->slot);
  355. }
  356. static inline struct rpc_xprt *xprt_from_sock(struct sock *sk)
  357. {
  358. return (struct rpc_xprt *) sk->sk_user_data;
  359. }
  360. /**
  361. * xs_udp_data_ready - "data ready" callback for UDP sockets
  362. * @sk: socket with data to read
  363. * @len: how much data to read
  364. *
  365. */
  366. static void xs_udp_data_ready(struct sock *sk, int len)
  367. {
  368. struct rpc_task *task;
  369. struct rpc_xprt *xprt;
  370. struct rpc_rqst *rovr;
  371. struct sk_buff *skb;
  372. int err, repsize, copied;
  373. u32 _xid, *xp;
  374. read_lock(&sk->sk_callback_lock);
  375. dprintk("RPC: xs_udp_data_ready...\n");
  376. if (!(xprt = xprt_from_sock(sk)))
  377. goto out;
  378. if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL)
  379. goto out;
  380. if (xprt->shutdown)
  381. goto dropit;
  382. repsize = skb->len - sizeof(struct udphdr);
  383. if (repsize < 4) {
  384. dprintk("RPC: impossible RPC reply size %d!\n", repsize);
  385. goto dropit;
  386. }
  387. /* Copy the XID from the skb... */
  388. xp = skb_header_pointer(skb, sizeof(struct udphdr),
  389. sizeof(_xid), &_xid);
  390. if (xp == NULL)
  391. goto dropit;
  392. /* Look up and lock the request corresponding to the given XID */
  393. spin_lock(&xprt->transport_lock);
  394. rovr = xprt_lookup_rqst(xprt, *xp);
  395. if (!rovr)
  396. goto out_unlock;
  397. task = rovr->rq_task;
  398. if ((copied = rovr->rq_private_buf.buflen) > repsize)
  399. copied = repsize;
  400. /* Suck it into the iovec, verify checksum if not done by hw. */
  401. if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb))
  402. goto out_unlock;
  403. /* Something worked... */
  404. dst_confirm(skb->dst);
  405. xprt_adjust_cwnd(task, copied);
  406. xprt_update_rtt(task);
  407. xprt_complete_rqst(task, copied);
  408. out_unlock:
  409. spin_unlock(&xprt->transport_lock);
  410. dropit:
  411. skb_free_datagram(sk, skb);
  412. out:
  413. read_unlock(&sk->sk_callback_lock);
  414. }
  415. static inline size_t xs_tcp_copy_data(skb_reader_t *desc, void *p, size_t len)
  416. {
  417. if (len > desc->count)
  418. len = desc->count;
  419. if (skb_copy_bits(desc->skb, desc->offset, p, len)) {
  420. dprintk("RPC: failed to copy %zu bytes from skb. %zu bytes remain\n",
  421. len, desc->count);
  422. return 0;
  423. }
  424. desc->offset += len;
  425. desc->count -= len;
  426. dprintk("RPC: copied %zu bytes from skb. %zu bytes remain\n",
  427. len, desc->count);
  428. return len;
  429. }
  430. static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc)
  431. {
  432. size_t len, used;
  433. char *p;
  434. p = ((char *) &xprt->tcp_recm) + xprt->tcp_offset;
  435. len = sizeof(xprt->tcp_recm) - xprt->tcp_offset;
  436. used = xs_tcp_copy_data(desc, p, len);
  437. xprt->tcp_offset += used;
  438. if (used != len)
  439. return;
  440. xprt->tcp_reclen = ntohl(xprt->tcp_recm);
  441. if (xprt->tcp_reclen & RPC_LAST_STREAM_FRAGMENT)
  442. xprt->tcp_flags |= XPRT_LAST_FRAG;
  443. else
  444. xprt->tcp_flags &= ~XPRT_LAST_FRAG;
  445. xprt->tcp_reclen &= RPC_FRAGMENT_SIZE_MASK;
  446. xprt->tcp_flags &= ~XPRT_COPY_RECM;
  447. xprt->tcp_offset = 0;
  448. /* Sanity check of the record length */
  449. if (unlikely(xprt->tcp_reclen < 4)) {
  450. dprintk("RPC: invalid TCP record fragment length\n");
  451. xprt_disconnect(xprt);
  452. return;
  453. }
  454. dprintk("RPC: reading TCP record fragment of length %d\n",
  455. xprt->tcp_reclen);
  456. }
  457. static void xs_tcp_check_recm(struct rpc_xprt *xprt)
  458. {
  459. dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u, tcp_flags = %lx\n",
  460. xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen, xprt->tcp_flags);
  461. if (xprt->tcp_offset == xprt->tcp_reclen) {
  462. xprt->tcp_flags |= XPRT_COPY_RECM;
  463. xprt->tcp_offset = 0;
  464. if (xprt->tcp_flags & XPRT_LAST_FRAG) {
  465. xprt->tcp_flags &= ~XPRT_COPY_DATA;
  466. xprt->tcp_flags |= XPRT_COPY_XID;
  467. xprt->tcp_copied = 0;
  468. }
  469. }
  470. }
  471. static inline void xs_tcp_read_xid(struct rpc_xprt *xprt, skb_reader_t *desc)
  472. {
  473. size_t len, used;
  474. char *p;
  475. len = sizeof(xprt->tcp_xid) - xprt->tcp_offset;
  476. dprintk("RPC: reading XID (%Zu bytes)\n", len);
  477. p = ((char *) &xprt->tcp_xid) + xprt->tcp_offset;
  478. used = xs_tcp_copy_data(desc, p, len);
  479. xprt->tcp_offset += used;
  480. if (used != len)
  481. return;
  482. xprt->tcp_flags &= ~XPRT_COPY_XID;
  483. xprt->tcp_flags |= XPRT_COPY_DATA;
  484. xprt->tcp_copied = 4;
  485. dprintk("RPC: reading reply for XID %08x\n",
  486. ntohl(xprt->tcp_xid));
  487. xs_tcp_check_recm(xprt);
  488. }
  489. static inline void xs_tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
  490. {
  491. struct rpc_rqst *req;
  492. struct xdr_buf *rcvbuf;
  493. size_t len;
  494. ssize_t r;
  495. /* Find and lock the request corresponding to this xid */
  496. spin_lock(&xprt->transport_lock);
  497. req = xprt_lookup_rqst(xprt, xprt->tcp_xid);
  498. if (!req) {
  499. xprt->tcp_flags &= ~XPRT_COPY_DATA;
  500. dprintk("RPC: XID %08x request not found!\n",
  501. ntohl(xprt->tcp_xid));
  502. spin_unlock(&xprt->transport_lock);
  503. return;
  504. }
  505. rcvbuf = &req->rq_private_buf;
  506. len = desc->count;
  507. if (len > xprt->tcp_reclen - xprt->tcp_offset) {
  508. skb_reader_t my_desc;
  509. len = xprt->tcp_reclen - xprt->tcp_offset;
  510. memcpy(&my_desc, desc, sizeof(my_desc));
  511. my_desc.count = len;
  512. r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
  513. &my_desc, xs_tcp_copy_data);
  514. desc->count -= r;
  515. desc->offset += r;
  516. } else
  517. r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
  518. desc, xs_tcp_copy_data);
  519. if (r > 0) {
  520. xprt->tcp_copied += r;
  521. xprt->tcp_offset += r;
  522. }
  523. if (r != len) {
  524. /* Error when copying to the receive buffer,
  525. * usually because we weren't able to allocate
  526. * additional buffer pages. All we can do now
  527. * is turn off XPRT_COPY_DATA, so the request
  528. * will not receive any additional updates,
  529. * and time out.
  530. * Any remaining data from this record will
  531. * be discarded.
  532. */
  533. xprt->tcp_flags &= ~XPRT_COPY_DATA;
  534. dprintk("RPC: XID %08x truncated request\n",
  535. ntohl(xprt->tcp_xid));
  536. dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
  537. xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
  538. goto out;
  539. }
  540. dprintk("RPC: XID %08x read %Zd bytes\n",
  541. ntohl(xprt->tcp_xid), r);
  542. dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
  543. xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
  544. if (xprt->tcp_copied == req->rq_private_buf.buflen)
  545. xprt->tcp_flags &= ~XPRT_COPY_DATA;
  546. else if (xprt->tcp_offset == xprt->tcp_reclen) {
  547. if (xprt->tcp_flags & XPRT_LAST_FRAG)
  548. xprt->tcp_flags &= ~XPRT_COPY_DATA;
  549. }
  550. out:
  551. if (!(xprt->tcp_flags & XPRT_COPY_DATA))
  552. xprt_complete_rqst(req->rq_task, xprt->tcp_copied);
  553. spin_unlock(&xprt->transport_lock);
  554. xs_tcp_check_recm(xprt);
  555. }
  556. static inline void xs_tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc)
  557. {
  558. size_t len;
  559. len = xprt->tcp_reclen - xprt->tcp_offset;
  560. if (len > desc->count)
  561. len = desc->count;
  562. desc->count -= len;
  563. desc->offset += len;
  564. xprt->tcp_offset += len;
  565. dprintk("RPC: discarded %Zu bytes\n", len);
  566. xs_tcp_check_recm(xprt);
  567. }
  568. static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len)
  569. {
  570. struct rpc_xprt *xprt = rd_desc->arg.data;
  571. skb_reader_t desc = {
  572. .skb = skb,
  573. .offset = offset,
  574. .count = len,
  575. .csum = 0
  576. };
  577. dprintk("RPC: xs_tcp_data_recv started\n");
  578. do {
  579. /* Read in a new fragment marker if necessary */
  580. /* Can we ever really expect to get completely empty fragments? */
  581. if (xprt->tcp_flags & XPRT_COPY_RECM) {
  582. xs_tcp_read_fraghdr(xprt, &desc);
  583. continue;
  584. }
  585. /* Read in the xid if necessary */
  586. if (xprt->tcp_flags & XPRT_COPY_XID) {
  587. xs_tcp_read_xid(xprt, &desc);
  588. continue;
  589. }
  590. /* Read in the request data */
  591. if (xprt->tcp_flags & XPRT_COPY_DATA) {
  592. xs_tcp_read_request(xprt, &desc);
  593. continue;
  594. }
  595. /* Skip over any trailing bytes on short reads */
  596. xs_tcp_read_discard(xprt, &desc);
  597. } while (desc.count);
  598. dprintk("RPC: xs_tcp_data_recv done\n");
  599. return len - desc.count;
  600. }
  601. /**
  602. * xs_tcp_data_ready - "data ready" callback for TCP sockets
  603. * @sk: socket with data to read
  604. * @bytes: how much data to read
  605. *
  606. */
  607. static void xs_tcp_data_ready(struct sock *sk, int bytes)
  608. {
  609. struct rpc_xprt *xprt;
  610. read_descriptor_t rd_desc;
  611. read_lock(&sk->sk_callback_lock);
  612. dprintk("RPC: xs_tcp_data_ready...\n");
  613. if (!(xprt = xprt_from_sock(sk)))
  614. goto out;
  615. if (xprt->shutdown)
  616. goto out;
  617. /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */
  618. rd_desc.arg.data = xprt;
  619. rd_desc.count = 65536;
  620. tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
  621. out:
  622. read_unlock(&sk->sk_callback_lock);
  623. }
  624. /**
  625. * xs_tcp_state_change - callback to handle TCP socket state changes
  626. * @sk: socket whose state has changed
  627. *
  628. */
  629. static void xs_tcp_state_change(struct sock *sk)
  630. {
  631. struct rpc_xprt *xprt;
  632. read_lock(&sk->sk_callback_lock);
  633. if (!(xprt = xprt_from_sock(sk)))
  634. goto out;
  635. dprintk("RPC: xs_tcp_state_change client %p...\n", xprt);
  636. dprintk("RPC: state %x conn %d dead %d zapped %d\n",
  637. sk->sk_state, xprt_connected(xprt),
  638. sock_flag(sk, SOCK_DEAD),
  639. sock_flag(sk, SOCK_ZAPPED));
  640. switch (sk->sk_state) {
  641. case TCP_ESTABLISHED:
  642. spin_lock_bh(&xprt->transport_lock);
  643. if (!xprt_test_and_set_connected(xprt)) {
  644. /* Reset TCP record info */
  645. xprt->tcp_offset = 0;
  646. xprt->tcp_reclen = 0;
  647. xprt->tcp_copied = 0;
  648. xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID;
  649. xprt_wake_pending_tasks(xprt, 0);
  650. }
  651. spin_unlock_bh(&xprt->transport_lock);
  652. break;
  653. case TCP_SYN_SENT:
  654. case TCP_SYN_RECV:
  655. break;
  656. default:
  657. xprt_disconnect(xprt);
  658. break;
  659. }
  660. out:
  661. read_unlock(&sk->sk_callback_lock);
  662. }
  663. /**
  664. * xs_udp_write_space - callback invoked when socket buffer space
  665. * becomes available
  666. * @sk: socket whose state has changed
  667. *
  668. * Called when more output buffer space is available for this socket.
  669. * We try not to wake our writers until they can make "significant"
  670. * progress, otherwise we'll waste resources thrashing kernel_sendmsg
  671. * with a bunch of small requests.
  672. */
  673. static void xs_udp_write_space(struct sock *sk)
  674. {
  675. read_lock(&sk->sk_callback_lock);
  676. /* from net/core/sock.c:sock_def_write_space */
  677. if (sock_writeable(sk)) {
  678. struct socket *sock;
  679. struct rpc_xprt *xprt;
  680. if (unlikely(!(sock = sk->sk_socket)))
  681. goto out;
  682. if (unlikely(!(xprt = xprt_from_sock(sk))))
  683. goto out;
  684. if (unlikely(!test_and_clear_bit(SOCK_NOSPACE, &sock->flags)))
  685. goto out;
  686. xprt_write_space(xprt);
  687. }
  688. out:
  689. read_unlock(&sk->sk_callback_lock);
  690. }
  691. /**
  692. * xs_tcp_write_space - callback invoked when socket buffer space
  693. * becomes available
  694. * @sk: socket whose state has changed
  695. *
  696. * Called when more output buffer space is available for this socket.
  697. * We try not to wake our writers until they can make "significant"
  698. * progress, otherwise we'll waste resources thrashing kernel_sendmsg
  699. * with a bunch of small requests.
  700. */
  701. static void xs_tcp_write_space(struct sock *sk)
  702. {
  703. read_lock(&sk->sk_callback_lock);
  704. /* from net/core/stream.c:sk_stream_write_space */
  705. if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
  706. struct socket *sock;
  707. struct rpc_xprt *xprt;
  708. if (unlikely(!(sock = sk->sk_socket)))
  709. goto out;
  710. if (unlikely(!(xprt = xprt_from_sock(sk))))
  711. goto out;
  712. if (unlikely(!test_and_clear_bit(SOCK_NOSPACE, &sock->flags)))
  713. goto out;
  714. xprt_write_space(xprt);
  715. }
  716. out:
  717. read_unlock(&sk->sk_callback_lock);
  718. }
  719. /**
  720. * xs_udp_set_buffer_size - set send and receive limits
  721. * @xprt: generic transport
  722. *
  723. * Set socket send and receive limits based on the
  724. * sndsize and rcvsize fields in the generic transport
  725. * structure.
  726. */
  727. static void xs_udp_set_buffer_size(struct rpc_xprt *xprt)
  728. {
  729. struct sock *sk = xprt->inet;
  730. if (xprt->rcvsize) {
  731. sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
  732. sk->sk_rcvbuf = xprt->rcvsize * xprt->max_reqs * 2;
  733. }
  734. if (xprt->sndsize) {
  735. sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
  736. sk->sk_sndbuf = xprt->sndsize * xprt->max_reqs * 2;
  737. sk->sk_write_space(sk);
  738. }
  739. }
  /**
   * xs_tcp_set_buffer_size - set send and receive limits
   * @xprt: generic transport
   *
   * Nothing to do for TCP.
   */
  static void xs_tcp_set_buffer_size(struct rpc_xprt *xprt)
  {
  	/* Intentionally empty */
  }
  750. /**
  751. * xs_udp_timer - called when a retransmit timeout occurs on a UDP transport
  752. * @task: task that timed out
  753. *
  754. * Adjust the congestion window after a retransmit timeout has occurred.
  755. */
  756. static void xs_udp_timer(struct rpc_task *task)
  757. {
  758. xprt_adjust_cwnd(task, -ETIMEDOUT);
  759. }
  760. static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock)
  761. {
  762. struct sockaddr_in myaddr = {
  763. .sin_family = AF_INET,
  764. };
  765. int err, port;
  766. /* Were we already bound to a given port? Try to reuse it */
  767. port = xprt->port;
  768. do {
  769. myaddr.sin_port = htons(port);
  770. err = sock->ops->bind(sock, (struct sockaddr *) &myaddr,
  771. sizeof(myaddr));
  772. if (err == 0) {
  773. xprt->port = port;
  774. dprintk("RPC: xs_bindresvport bound to port %u\n",
  775. port);
  776. return 0;
  777. }
  778. if (--port == 0)
  779. port = XS_MAX_RESVPORT;
  780. } while (err == -EADDRINUSE && port != xprt->port);
  781. dprintk("RPC: can't bind to reserved port (%d).\n", -err);
  782. return err;
  783. }
  784. /**
  785. * xs_udp_connect_worker - set up a UDP socket
  786. * @args: RPC transport to connect
  787. *
  788. * Invoked by a work queue tasklet.
  789. */
  790. static void xs_udp_connect_worker(void *args)
  791. {
  792. struct rpc_xprt *xprt = (struct rpc_xprt *) args;
  793. struct socket *sock = xprt->sock;
  794. int err, status = -EIO;
  795. if (xprt->shutdown || xprt->addr.sin_port == 0)
  796. goto out;
  797. dprintk("RPC: xs_udp_connect_worker for xprt %p\n", xprt);
  798. /* Start by resetting any existing state */
  799. xs_close(xprt);
  800. if ((err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) {
  801. dprintk("RPC: can't create UDP transport socket (%d).\n", -err);
  802. goto out;
  803. }
  804. if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) {
  805. sock_release(sock);
  806. goto out;
  807. }
  808. if (!xprt->inet) {
  809. struct sock *sk = sock->sk;
  810. write_lock_bh(&sk->sk_callback_lock);
  811. sk->sk_user_data = xprt;
  812. xprt->old_data_ready = sk->sk_data_ready;
  813. xprt->old_state_change = sk->sk_state_change;
  814. xprt->old_write_space = sk->sk_write_space;
  815. sk->sk_data_ready = xs_udp_data_ready;
  816. sk->sk_write_space = xs_udp_write_space;
  817. sk->sk_no_check = UDP_CSUM_NORCV;
  818. xprt_set_connected(xprt);
  819. /* Reset to new socket */
  820. xprt->sock = sock;
  821. xprt->inet = sk;
  822. write_unlock_bh(&sk->sk_callback_lock);
  823. }
  824. xs_udp_set_buffer_size(xprt);
  825. status = 0;
  826. out:
  827. xprt_wake_pending_tasks(xprt, status);
  828. xprt_clear_connecting(xprt);
  829. }
  830. /**
  831. * xs_tcp_connect_worker - connect a TCP socket to a remote endpoint
  832. * @args: RPC transport to connect
  833. *
  834. * Invoked by a work queue tasklet.
  835. */
  836. static void xs_tcp_connect_worker(void *args)
  837. {
  838. struct rpc_xprt *xprt = (struct rpc_xprt *)args;
  839. struct socket *sock = xprt->sock;
  840. int err, status = -EIO;
  841. if (xprt->shutdown || xprt->addr.sin_port == 0)
  842. goto out;
  843. dprintk("RPC: xs_tcp_connect_worker for xprt %p\n", xprt);
  844. /* Start by resetting any existing socket state */
  845. xs_close(xprt);
  846. if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
  847. dprintk("RPC: can't create TCP transport socket (%d).\n", -err);
  848. goto out;
  849. }
  850. if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) {
  851. sock_release(sock);
  852. goto out;
  853. }
  854. if (!xprt->inet) {
  855. struct sock *sk = sock->sk;
  856. write_lock_bh(&sk->sk_callback_lock);
  857. sk->sk_user_data = xprt;
  858. xprt->old_data_ready = sk->sk_data_ready;
  859. xprt->old_state_change = sk->sk_state_change;
  860. xprt->old_write_space = sk->sk_write_space;
  861. sk->sk_data_ready = xs_tcp_data_ready;
  862. sk->sk_state_change = xs_tcp_state_change;
  863. sk->sk_write_space = xs_tcp_write_space;
  864. tcp_sk(sk)->nonagle = 1;
  865. xprt_clear_connected(xprt);
  866. /* Reset to new socket */
  867. xprt->sock = sock;
  868. xprt->inet = sk;
  869. write_unlock_bh(&sk->sk_callback_lock);
  870. }
  871. /* Tell the socket layer to start connecting... */
  872. status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr,
  873. sizeof(xprt->addr), O_NONBLOCK);
  874. dprintk("RPC: %p connect status %d connected %d sock state %d\n",
  875. xprt, -status, xprt_connected(xprt), sock->sk->sk_state);
  876. if (status < 0) {
  877. switch (status) {
  878. case -EINPROGRESS:
  879. case -EALREADY:
  880. goto out_clear;
  881. }
  882. }
  883. out:
  884. xprt_wake_pending_tasks(xprt, status);
  885. out_clear:
  886. xprt_clear_connecting(xprt);
  887. }
  888. /**
  889. * xs_connect - connect a socket to a remote endpoint
  890. * @task: address of RPC task that manages state of connect request
  891. *
  892. * TCP: If the remote end dropped the connection, delay reconnecting.
  893. */
  894. static void xs_connect(struct rpc_task *task)
  895. {
  896. struct rpc_xprt *xprt = task->tk_xprt;
  897. if (xprt_test_and_set_connecting(xprt))
  898. return;
  899. if (xprt->sock != NULL) {
  900. dprintk("RPC: xs_connect delayed xprt %p\n", xprt);
  901. schedule_delayed_work(&xprt->connect_worker,
  902. RPC_REESTABLISH_TIMEOUT);
  903. } else {
  904. dprintk("RPC: xs_connect scheduled xprt %p\n", xprt);
  905. schedule_work(&xprt->connect_worker);
  906. /* flush_scheduled_work can sleep... */
  907. if (!RPC_IS_ASYNC(task))
  908. flush_scheduled_work();
  909. }
  910. }
  911. static struct rpc_xprt_ops xs_udp_ops = {
  912. .set_buffer_size = xs_udp_set_buffer_size,
  913. .reserve_xprt = xprt_reserve_xprt_cong,
  914. .release_xprt = xprt_release_xprt_cong,
  915. .connect = xs_connect,
  916. .send_request = xs_udp_send_request,
  917. .set_retrans_timeout = xprt_set_retrans_timeout_rtt,
  918. .timer = xs_udp_timer,
  919. .release_request = xprt_release_rqst_cong,
  920. .close = xs_close,
  921. .destroy = xs_destroy,
  922. };
  923. static struct rpc_xprt_ops xs_tcp_ops = {
  924. .set_buffer_size = xs_tcp_set_buffer_size,
  925. .reserve_xprt = xprt_reserve_xprt,
  926. .release_xprt = xprt_release_xprt,
  927. .connect = xs_connect,
  928. .send_request = xs_tcp_send_request,
  929. .set_retrans_timeout = xprt_set_retrans_timeout_def,
  930. .close = xs_close,
  931. .destroy = xs_destroy,
  932. };
  933. extern unsigned int xprt_udp_slot_table_entries;
  934. extern unsigned int xprt_tcp_slot_table_entries;
  935. /**
  936. * xs_setup_udp - Set up transport to use a UDP socket
  937. * @xprt: transport to set up
  938. * @to: timeout parameters
  939. *
  940. */
  941. int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to)
  942. {
  943. size_t slot_table_size;
  944. dprintk("RPC: setting up udp-ipv4 transport...\n");
  945. xprt->max_reqs = xprt_udp_slot_table_entries;
  946. slot_table_size = xprt->max_reqs * sizeof(xprt->slot[0]);
  947. xprt->slot = kmalloc(slot_table_size, GFP_KERNEL);
  948. if (xprt->slot == NULL)
  949. return -ENOMEM;
  950. memset(xprt->slot, 0, slot_table_size);
  951. xprt->prot = IPPROTO_UDP;
  952. xprt->port = XS_MAX_RESVPORT;
  953. xprt->tsh_size = 0;
  954. xprt->cwnd = RPC_INITCWND;
  955. xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0;
  956. /* XXX: header size can vary due to auth type, IPv6, etc. */
  957. xprt->max_payload = (1U << 16) - (MAX_HEADER << 3);
  958. INIT_WORK(&xprt->connect_worker, xs_udp_connect_worker, xprt);
  959. xprt->ops = &xs_udp_ops;
  960. if (to)
  961. xprt->timeout = *to;
  962. else
  963. xprt_set_timeout(&xprt->timeout, 5, 5 * HZ);
  964. return 0;
  965. }
  966. /**
  967. * xs_setup_tcp - Set up transport to use a TCP socket
  968. * @xprt: transport to set up
  969. * @to: timeout parameters
  970. *
  971. */
  972. int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to)
  973. {
  974. size_t slot_table_size;
  975. dprintk("RPC: setting up tcp-ipv4 transport...\n");
  976. xprt->max_reqs = xprt_tcp_slot_table_entries;
  977. slot_table_size = xprt->max_reqs * sizeof(xprt->slot[0]);
  978. xprt->slot = kmalloc(slot_table_size, GFP_KERNEL);
  979. if (xprt->slot == NULL)
  980. return -ENOMEM;
  981. memset(xprt->slot, 0, slot_table_size);
  982. xprt->prot = IPPROTO_TCP;
  983. xprt->port = XS_MAX_RESVPORT;
  984. xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
  985. xprt->cwnd = RPC_MAXCWND(xprt);
  986. xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0;
  987. xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
  988. INIT_WORK(&xprt->connect_worker, xs_tcp_connect_worker, xprt);
  989. xprt->ops = &xs_tcp_ops;
  990. if (to)
  991. xprt->timeout = *to;
  992. else
  993. xprt_set_timeout(&xprt->timeout, 2, 60 * HZ);
  994. return 0;
  995. }