xprtsock.c 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160
  1. /*
  2. * linux/net/sunrpc/xprtsock.c
  3. *
  4. * Client-side transport implementation for sockets.
  5. *
  6. * TCP callback races fixes (C) 1998 Red Hat Software <alan@redhat.com>
  7. * TCP send fixes (C) 1998 Red Hat Software <alan@redhat.com>
  8. * TCP NFS related read + write fixes
  9. * (C) 1999 Dave Airlie, University of Limerick, Ireland <airlied@linux.ie>
  10. *
  11. * Rewrite of large parts of the code in order to stabilize TCP stuff.
  12. * Fix behaviour when socket buffer is full.
  13. * (C) 1999 Trond Myklebust <trond.myklebust@fys.uio.no>
  14. *
  15. * IP socket transport implementation, (C) 2005 Chuck Lever <cel@netapp.com>
  16. */
  17. #include <linux/types.h>
  18. #include <linux/slab.h>
  19. #include <linux/capability.h>
  20. #include <linux/sched.h>
  21. #include <linux/pagemap.h>
  22. #include <linux/errno.h>
  23. #include <linux/socket.h>
  24. #include <linux/in.h>
  25. #include <linux/net.h>
  26. #include <linux/mm.h>
  27. #include <linux/udp.h>
  28. #include <linux/tcp.h>
  29. #include <linux/sunrpc/clnt.h>
  30. #include <linux/file.h>
  31. #include <net/sock.h>
  32. #include <net/checksum.h>
  33. #include <net/udp.h>
  34. #include <net/tcp.h>
  /*
   * Highest port number tried when binding the transport socket to a
   * reserved port.
   */
  #define XS_MAX_RESVPORT 800U

  /*
   * Bound on repeated send attempts made on a socket before the sender
   * stops and waits for socket buffer space to become available.
   */
  #define XS_SENDMSG_RETRY 10U

  /*
   * With RPC debugging enabled, packet dumping is switched off and the
   * debug messages in this file are assigned to the transport facility.
   */
  #ifdef RPC_DEBUG
  # undef RPC_DEBUG_DATA
  # define RPCDBG_FACILITY RPCDBG_TRANS
  #endif
  48. #ifdef RPC_DEBUG_DATA
  49. static void xs_pktdump(char *msg, u32 *packet, unsigned int count)
  50. {
  51. u8 *buf = (u8 *) packet;
  52. int j;
  53. dprintk("RPC: %s\n", msg);
  54. for (j = 0; j < count && j < 128; j += 4) {
  55. if (!(j & 31)) {
  56. if (j)
  57. dprintk("\n");
  58. dprintk("0x%04x ", j);
  59. }
  60. dprintk("%02x%02x%02x%02x ",
  61. buf[j], buf[j+1], buf[j+2], buf[j+3]);
  62. }
  63. dprintk("\n");
  64. }
  65. #else
  66. static inline void xs_pktdump(char *msg, u32 *packet, unsigned int count)
  67. {
  68. /* NOP */
  69. }
  70. #endif
  71. #define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL)
  72. static inline int xs_send_head(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, unsigned int len)
  73. {
  74. struct kvec iov = {
  75. .iov_base = xdr->head[0].iov_base + base,
  76. .iov_len = len - base,
  77. };
  78. struct msghdr msg = {
  79. .msg_name = addr,
  80. .msg_namelen = addrlen,
  81. .msg_flags = XS_SENDMSG_FLAGS,
  82. };
  83. if (xdr->len > len)
  84. msg.msg_flags |= MSG_MORE;
  85. if (likely(iov.iov_len))
  86. return kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len);
  87. return kernel_sendmsg(sock, &msg, NULL, 0, 0);
  88. }
  89. static int xs_send_tail(struct socket *sock, struct xdr_buf *xdr, unsigned int base, unsigned int len)
  90. {
  91. struct kvec iov = {
  92. .iov_base = xdr->tail[0].iov_base + base,
  93. .iov_len = len - base,
  94. };
  95. struct msghdr msg = {
  96. .msg_flags = XS_SENDMSG_FLAGS,
  97. };
  98. return kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len);
  99. }
  100. /**
  101. * xs_sendpages - write pages directly to a socket
  102. * @sock: socket to send on
  103. * @addr: UDP only -- address of destination
  104. * @addrlen: UDP only -- length of destination address
  105. * @xdr: buffer containing this request
  106. * @base: starting position in the buffer
  107. *
  108. */
  109. static inline int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base)
  110. {
  111. struct page **ppage = xdr->pages;
  112. unsigned int len, pglen = xdr->page_len;
  113. int err, ret = 0;
  114. ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int);
  115. if (unlikely(!sock))
  116. return -ENOTCONN;
  117. clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
  118. len = xdr->head[0].iov_len;
  119. if (base < len || (addr != NULL && base == 0)) {
  120. err = xs_send_head(sock, addr, addrlen, xdr, base, len);
  121. if (ret == 0)
  122. ret = err;
  123. else if (err > 0)
  124. ret += err;
  125. if (err != (len - base))
  126. goto out;
  127. base = 0;
  128. } else
  129. base -= len;
  130. if (unlikely(pglen == 0))
  131. goto copy_tail;
  132. if (unlikely(base >= pglen)) {
  133. base -= pglen;
  134. goto copy_tail;
  135. }
  136. if (base || xdr->page_base) {
  137. pglen -= base;
  138. base += xdr->page_base;
  139. ppage += base >> PAGE_CACHE_SHIFT;
  140. base &= ~PAGE_CACHE_MASK;
  141. }
  142. sendpage = sock->ops->sendpage ? : sock_no_sendpage;
  143. do {
  144. int flags = XS_SENDMSG_FLAGS;
  145. len = PAGE_CACHE_SIZE;
  146. if (base)
  147. len -= base;
  148. if (pglen < len)
  149. len = pglen;
  150. if (pglen != len || xdr->tail[0].iov_len != 0)
  151. flags |= MSG_MORE;
  152. /* Hmm... We might be dealing with highmem pages */
  153. if (PageHighMem(*ppage))
  154. sendpage = sock_no_sendpage;
  155. err = sendpage(sock, *ppage, base, len, flags);
  156. if (ret == 0)
  157. ret = err;
  158. else if (err > 0)
  159. ret += err;
  160. if (err != len)
  161. goto out;
  162. base = 0;
  163. ppage++;
  164. } while ((pglen -= len) != 0);
  165. copy_tail:
  166. len = xdr->tail[0].iov_len;
  167. if (base < len) {
  168. err = xs_send_tail(sock, xdr, base, len);
  169. if (ret == 0)
  170. ret = err;
  171. else if (err > 0)
  172. ret += err;
  173. }
  174. out:
  175. return ret;
  176. }
  177. /**
  178. * xs_nospace - place task on wait queue if transmit was incomplete
  179. * @task: task to put to sleep
  180. *
  181. */
  182. static void xs_nospace(struct rpc_task *task)
  183. {
  184. struct rpc_rqst *req = task->tk_rqstp;
  185. struct rpc_xprt *xprt = req->rq_xprt;
  186. dprintk("RPC: %4d xmit incomplete (%u left of %u)\n",
  187. task->tk_pid, req->rq_slen - req->rq_bytes_sent,
  188. req->rq_slen);
  189. if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) {
  190. /* Protect against races with write_space */
  191. spin_lock_bh(&xprt->transport_lock);
  192. /* Don't race with disconnect */
  193. if (!xprt_connected(xprt))
  194. task->tk_status = -ENOTCONN;
  195. else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags))
  196. xprt_wait_for_buffer_space(task);
  197. spin_unlock_bh(&xprt->transport_lock);
  198. } else
  199. /* Keep holding the socket if it is blocked */
  200. rpc_delay(task, HZ>>4);
  201. }
  /**
   * xs_udp_send_request - write an RPC request to a UDP socket
   * @task: address of RPC task that manages the state of an RPC request
   *
   * Return values:
   *   0:       The request has been sent
   *   -EAGAIN: Only part of the request was sent, or the send reported
   *            -EAGAIN; xs_nospace() has been called for the task
   *   other:   Any other result of xs_sendpages() is returned unchanged
   *            (for example -ENOTCONN when there is no socket)
   */
  static int xs_udp_send_request(struct rpc_task *task)
  {
      struct rpc_rqst *rqst = task->tk_rqstp;
      struct rpc_xprt *xprt = rqst->rq_xprt;
      struct xdr_buf *sndbuf = &rqst->rq_snd_buf;
      int rc;

      xs_pktdump("packet data:",
                 rqst->rq_svec->iov_base,
                 rqst->rq_svec->iov_len);

      rqst->rq_xtime = jiffies;
      rc = xs_sendpages(xprt->sock, (struct sockaddr *) &xprt->addr,
                        sizeof(xprt->addr), sndbuf, rqst->rq_bytes_sent);

      dprintk("RPC: xs_udp_send_request(%u) = %d\n",
              sndbuf->len - rqst->rq_bytes_sent, rc);

      if (likely(rc >= (int) rqst->rq_slen))
          return 0;

      /* Still some bytes left; set up for a retry later. */
      if (rc > 0)
          rc = -EAGAIN;

      if (rc == -EAGAIN) {
          xs_nospace(task);
      } else if (rc != -ENETUNREACH && rc != -EPIPE && rc != -ECONNREFUSED) {
          /*
           * The three codes excluded above are expected: when the
           * server has died, an ICMP port unreachable message
           * prompts ECONNREFUSED.
           */
          dprintk("RPC: sendmsg returned unrecognized error %d\n",
                  -rc);
      }

      return rc;
  }
  249. static inline void xs_encode_tcp_record_marker(struct xdr_buf *buf)
  250. {
  251. u32 reclen = buf->len - sizeof(rpc_fraghdr);
  252. rpc_fraghdr *base = buf->head[0].iov_base;
  253. *base = htonl(RPC_LAST_STREAM_FRAGMENT | reclen);
  254. }
  /**
   * xs_tcp_send_request - write an RPC request to a TCP socket
   * @task: address of RPC task that manages the state of an RPC request
   *
   * Return values:
   *   0:         The request has been sent
   *   -EAGAIN:   The request could not be completed within the retry
   *              limit, or the send reported -EAGAIN; xs_nospace() has
   *              been called for the task
   *   -ENOTCONN: The send failed with -ECONNREFUSED, -ECONNRESET,
   *              -ENOTCONN or -EPIPE
   *   other:     Any other error is returned unchanged after the
   *              transport has been disconnected
   *
   * XXX: In the case of soft timeouts, should we eventually give up
   *      if sendmsg is not able to make progress?
   */
  static int xs_tcp_send_request(struct rpc_task *task)
  {
      struct rpc_rqst *rqst = task->tk_rqstp;
      struct rpc_xprt *xprt = rqst->rq_xprt;
      struct xdr_buf *sndbuf = &rqst->rq_snd_buf;
      int attempts = 0;
      int rc;

      xs_encode_tcp_record_marker(sndbuf);

      xs_pktdump("packet data:",
                 rqst->rq_svec->iov_base,
                 rqst->rq_svec->iov_len);

      /*
       * Continue transmitting the packet/record. We must be careful
       * to cope with writespace callbacks arriving _after_ we have
       * called sendmsg().
       */
      for (;;) {
          rqst->rq_xtime = jiffies;
          rc = xs_sendpages(xprt->sock, NULL, 0, sndbuf,
                            rqst->rq_bytes_sent);

          dprintk("RPC: xs_tcp_send_request(%u) = %d\n",
                  sndbuf->len - rqst->rq_bytes_sent, rc);

          if (unlikely(rc < 0))
              break;

          /*
           * If we've sent the entire packet, immediately
           * reset the count of bytes sent.
           */
          rqst->rq_bytes_sent += rc;
          if (likely(rqst->rq_bytes_sent >= rqst->rq_slen)) {
              rqst->rq_bytes_sent = 0;
              return 0;
          }

          rc = -EAGAIN;
          if (attempts++ > XS_SENDMSG_RETRY)
              break;
      }

      if (rc == -EAGAIN) {
          xs_nospace(task);
      } else if (rc == -ECONNREFUSED || rc == -ECONNRESET ||
                 rc == -ENOTCONN || rc == -EPIPE) {
          rc = -ENOTCONN;
      } else {
          dprintk("RPC: sendmsg returned unrecognized error %d\n",
                  -rc);
          xprt_disconnect(xprt);
      }

      return rc;
  }
  319. /**
  320. * xs_close - close a socket
  321. * @xprt: transport
  322. *
  323. */
  324. static void xs_close(struct rpc_xprt *xprt)
  325. {
  326. struct socket *sock = xprt->sock;
  327. struct sock *sk = xprt->inet;
  328. if (!sk)
  329. return;
  330. dprintk("RPC: xs_close xprt %p\n", xprt);
  331. write_lock_bh(&sk->sk_callback_lock);
  332. xprt->inet = NULL;
  333. xprt->sock = NULL;
  334. sk->sk_user_data = NULL;
  335. sk->sk_data_ready = xprt->old_data_ready;
  336. sk->sk_state_change = xprt->old_state_change;
  337. sk->sk_write_space = xprt->old_write_space;
  338. write_unlock_bh(&sk->sk_callback_lock);
  339. sk->sk_no_check = 0;
  340. sock_release(sock);
  341. }
  342. /**
  343. * xs_destroy - prepare to shutdown a transport
  344. * @xprt: doomed transport
  345. *
  346. */
  347. static void xs_destroy(struct rpc_xprt *xprt)
  348. {
  349. dprintk("RPC: xs_destroy xprt %p\n", xprt);
  350. cancel_delayed_work(&xprt->connect_worker);
  351. flush_scheduled_work();
  352. xprt_disconnect(xprt);
  353. xs_close(xprt);
  354. kfree(xprt->slot);
  355. }
  356. static inline struct rpc_xprt *xprt_from_sock(struct sock *sk)
  357. {
  358. return (struct rpc_xprt *) sk->sk_user_data;
  359. }
  360. /**
  361. * xs_udp_data_ready - "data ready" callback for UDP sockets
  362. * @sk: socket with data to read
  363. * @len: how much data to read
  364. *
  365. */
  366. static void xs_udp_data_ready(struct sock *sk, int len)
  367. {
  368. struct rpc_task *task;
  369. struct rpc_xprt *xprt;
  370. struct rpc_rqst *rovr;
  371. struct sk_buff *skb;
  372. int err, repsize, copied;
  373. u32 _xid, *xp;
  374. read_lock(&sk->sk_callback_lock);
  375. dprintk("RPC: xs_udp_data_ready...\n");
  376. if (!(xprt = xprt_from_sock(sk)))
  377. goto out;
  378. if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL)
  379. goto out;
  380. if (xprt->shutdown)
  381. goto dropit;
  382. repsize = skb->len - sizeof(struct udphdr);
  383. if (repsize < 4) {
  384. dprintk("RPC: impossible RPC reply size %d!\n", repsize);
  385. goto dropit;
  386. }
  387. /* Copy the XID from the skb... */
  388. xp = skb_header_pointer(skb, sizeof(struct udphdr),
  389. sizeof(_xid), &_xid);
  390. if (xp == NULL)
  391. goto dropit;
  392. /* Look up and lock the request corresponding to the given XID */
  393. spin_lock(&xprt->transport_lock);
  394. rovr = xprt_lookup_rqst(xprt, *xp);
  395. if (!rovr)
  396. goto out_unlock;
  397. task = rovr->rq_task;
  398. dprintk("RPC: %4d received reply\n", task->tk_pid);
  399. if ((copied = rovr->rq_private_buf.buflen) > repsize)
  400. copied = repsize;
  401. /* Suck it into the iovec, verify checksum if not done by hw. */
  402. if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb))
  403. goto out_unlock;
  404. /* Something worked... */
  405. dst_confirm(skb->dst);
  406. xprt_complete_rqst(xprt, rovr, copied);
  407. out_unlock:
  408. spin_unlock(&xprt->transport_lock);
  409. dropit:
  410. skb_free_datagram(sk, skb);
  411. out:
  412. read_unlock(&sk->sk_callback_lock);
  413. }
  414. static inline size_t xs_tcp_copy_data(skb_reader_t *desc, void *p, size_t len)
  415. {
  416. if (len > desc->count)
  417. len = desc->count;
  418. if (skb_copy_bits(desc->skb, desc->offset, p, len)) {
  419. dprintk("RPC: failed to copy %zu bytes from skb. %zu bytes remain\n",
  420. len, desc->count);
  421. return 0;
  422. }
  423. desc->offset += len;
  424. desc->count -= len;
  425. dprintk("RPC: copied %zu bytes from skb. %zu bytes remain\n",
  426. len, desc->count);
  427. return len;
  428. }
  429. static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc)
  430. {
  431. size_t len, used;
  432. char *p;
  433. p = ((char *) &xprt->tcp_recm) + xprt->tcp_offset;
  434. len = sizeof(xprt->tcp_recm) - xprt->tcp_offset;
  435. used = xs_tcp_copy_data(desc, p, len);
  436. xprt->tcp_offset += used;
  437. if (used != len)
  438. return;
  439. xprt->tcp_reclen = ntohl(xprt->tcp_recm);
  440. if (xprt->tcp_reclen & RPC_LAST_STREAM_FRAGMENT)
  441. xprt->tcp_flags |= XPRT_LAST_FRAG;
  442. else
  443. xprt->tcp_flags &= ~XPRT_LAST_FRAG;
  444. xprt->tcp_reclen &= RPC_FRAGMENT_SIZE_MASK;
  445. xprt->tcp_flags &= ~XPRT_COPY_RECM;
  446. xprt->tcp_offset = 0;
  447. /* Sanity check of the record length */
  448. if (unlikely(xprt->tcp_reclen < 4)) {
  449. dprintk("RPC: invalid TCP record fragment length\n");
  450. xprt_disconnect(xprt);
  451. return;
  452. }
  453. dprintk("RPC: reading TCP record fragment of length %d\n",
  454. xprt->tcp_reclen);
  455. }
  456. static void xs_tcp_check_recm(struct rpc_xprt *xprt)
  457. {
  458. dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u, tcp_flags = %lx\n",
  459. xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen, xprt->tcp_flags);
  460. if (xprt->tcp_offset == xprt->tcp_reclen) {
  461. xprt->tcp_flags |= XPRT_COPY_RECM;
  462. xprt->tcp_offset = 0;
  463. if (xprt->tcp_flags & XPRT_LAST_FRAG) {
  464. xprt->tcp_flags &= ~XPRT_COPY_DATA;
  465. xprt->tcp_flags |= XPRT_COPY_XID;
  466. xprt->tcp_copied = 0;
  467. }
  468. }
  469. }
  470. static inline void xs_tcp_read_xid(struct rpc_xprt *xprt, skb_reader_t *desc)
  471. {
  472. size_t len, used;
  473. char *p;
  474. len = sizeof(xprt->tcp_xid) - xprt->tcp_offset;
  475. dprintk("RPC: reading XID (%Zu bytes)\n", len);
  476. p = ((char *) &xprt->tcp_xid) + xprt->tcp_offset;
  477. used = xs_tcp_copy_data(desc, p, len);
  478. xprt->tcp_offset += used;
  479. if (used != len)
  480. return;
  481. xprt->tcp_flags &= ~XPRT_COPY_XID;
  482. xprt->tcp_flags |= XPRT_COPY_DATA;
  483. xprt->tcp_copied = 4;
  484. dprintk("RPC: reading reply for XID %08x\n",
  485. ntohl(xprt->tcp_xid));
  486. xs_tcp_check_recm(xprt);
  487. }
  488. static inline void xs_tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
  489. {
  490. struct rpc_rqst *req;
  491. struct xdr_buf *rcvbuf;
  492. size_t len;
  493. ssize_t r;
  494. /* Find and lock the request corresponding to this xid */
  495. spin_lock(&xprt->transport_lock);
  496. req = xprt_lookup_rqst(xprt, xprt->tcp_xid);
  497. if (!req) {
  498. xprt->tcp_flags &= ~XPRT_COPY_DATA;
  499. dprintk("RPC: XID %08x request not found!\n",
  500. ntohl(xprt->tcp_xid));
  501. spin_unlock(&xprt->transport_lock);
  502. return;
  503. }
  504. rcvbuf = &req->rq_private_buf;
  505. len = desc->count;
  506. if (len > xprt->tcp_reclen - xprt->tcp_offset) {
  507. skb_reader_t my_desc;
  508. len = xprt->tcp_reclen - xprt->tcp_offset;
  509. memcpy(&my_desc, desc, sizeof(my_desc));
  510. my_desc.count = len;
  511. r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
  512. &my_desc, xs_tcp_copy_data);
  513. desc->count -= r;
  514. desc->offset += r;
  515. } else
  516. r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
  517. desc, xs_tcp_copy_data);
  518. if (r > 0) {
  519. xprt->tcp_copied += r;
  520. xprt->tcp_offset += r;
  521. }
  522. if (r != len) {
  523. /* Error when copying to the receive buffer,
  524. * usually because we weren't able to allocate
  525. * additional buffer pages. All we can do now
  526. * is turn off XPRT_COPY_DATA, so the request
  527. * will not receive any additional updates,
  528. * and time out.
  529. * Any remaining data from this record will
  530. * be discarded.
  531. */
  532. xprt->tcp_flags &= ~XPRT_COPY_DATA;
  533. dprintk("RPC: XID %08x truncated request\n",
  534. ntohl(xprt->tcp_xid));
  535. dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
  536. xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
  537. goto out;
  538. }
  539. dprintk("RPC: XID %08x read %Zd bytes\n",
  540. ntohl(xprt->tcp_xid), r);
  541. dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
  542. xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
  543. if (xprt->tcp_copied == req->rq_private_buf.buflen)
  544. xprt->tcp_flags &= ~XPRT_COPY_DATA;
  545. else if (xprt->tcp_offset == xprt->tcp_reclen) {
  546. if (xprt->tcp_flags & XPRT_LAST_FRAG)
  547. xprt->tcp_flags &= ~XPRT_COPY_DATA;
  548. }
  549. out:
  550. if (!(xprt->tcp_flags & XPRT_COPY_DATA)) {
  551. dprintk("RPC: %4d received reply complete\n",
  552. req->rq_task->tk_pid);
  553. xprt_complete_rqst(xprt, req, xprt->tcp_copied);
  554. }
  555. spin_unlock(&xprt->transport_lock);
  556. xs_tcp_check_recm(xprt);
  557. }
  558. static inline void xs_tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc)
  559. {
  560. size_t len;
  561. len = xprt->tcp_reclen - xprt->tcp_offset;
  562. if (len > desc->count)
  563. len = desc->count;
  564. desc->count -= len;
  565. desc->offset += len;
  566. xprt->tcp_offset += len;
  567. dprintk("RPC: discarded %Zu bytes\n", len);
  568. xs_tcp_check_recm(xprt);
  569. }
  570. static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len)
  571. {
  572. struct rpc_xprt *xprt = rd_desc->arg.data;
  573. skb_reader_t desc = {
  574. .skb = skb,
  575. .offset = offset,
  576. .count = len,
  577. .csum = 0
  578. };
  579. dprintk("RPC: xs_tcp_data_recv started\n");
  580. do {
  581. /* Read in a new fragment marker if necessary */
  582. /* Can we ever really expect to get completely empty fragments? */
  583. if (xprt->tcp_flags & XPRT_COPY_RECM) {
  584. xs_tcp_read_fraghdr(xprt, &desc);
  585. continue;
  586. }
  587. /* Read in the xid if necessary */
  588. if (xprt->tcp_flags & XPRT_COPY_XID) {
  589. xs_tcp_read_xid(xprt, &desc);
  590. continue;
  591. }
  592. /* Read in the request data */
  593. if (xprt->tcp_flags & XPRT_COPY_DATA) {
  594. xs_tcp_read_request(xprt, &desc);
  595. continue;
  596. }
  597. /* Skip over any trailing bytes on short reads */
  598. xs_tcp_read_discard(xprt, &desc);
  599. } while (desc.count);
  600. dprintk("RPC: xs_tcp_data_recv done\n");
  601. return len - desc.count;
  602. }
  603. /**
  604. * xs_tcp_data_ready - "data ready" callback for TCP sockets
  605. * @sk: socket with data to read
  606. * @bytes: how much data to read
  607. *
  608. */
  609. static void xs_tcp_data_ready(struct sock *sk, int bytes)
  610. {
  611. struct rpc_xprt *xprt;
  612. read_descriptor_t rd_desc;
  613. read_lock(&sk->sk_callback_lock);
  614. dprintk("RPC: xs_tcp_data_ready...\n");
  615. if (!(xprt = xprt_from_sock(sk)))
  616. goto out;
  617. if (xprt->shutdown)
  618. goto out;
  619. /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */
  620. rd_desc.arg.data = xprt;
  621. rd_desc.count = 65536;
  622. tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
  623. out:
  624. read_unlock(&sk->sk_callback_lock);
  625. }
  626. /**
  627. * xs_tcp_state_change - callback to handle TCP socket state changes
  628. * @sk: socket whose state has changed
  629. *
  630. */
  631. static void xs_tcp_state_change(struct sock *sk)
  632. {
  633. struct rpc_xprt *xprt;
  634. read_lock(&sk->sk_callback_lock);
  635. if (!(xprt = xprt_from_sock(sk)))
  636. goto out;
  637. dprintk("RPC: xs_tcp_state_change client %p...\n", xprt);
  638. dprintk("RPC: state %x conn %d dead %d zapped %d\n",
  639. sk->sk_state, xprt_connected(xprt),
  640. sock_flag(sk, SOCK_DEAD),
  641. sock_flag(sk, SOCK_ZAPPED));
  642. switch (sk->sk_state) {
  643. case TCP_ESTABLISHED:
  644. spin_lock_bh(&xprt->transport_lock);
  645. if (!xprt_test_and_set_connected(xprt)) {
  646. /* Reset TCP record info */
  647. xprt->tcp_offset = 0;
  648. xprt->tcp_reclen = 0;
  649. xprt->tcp_copied = 0;
  650. xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID;
  651. xprt_wake_pending_tasks(xprt, 0);
  652. }
  653. spin_unlock_bh(&xprt->transport_lock);
  654. break;
  655. case TCP_SYN_SENT:
  656. case TCP_SYN_RECV:
  657. break;
  658. default:
  659. xprt_disconnect(xprt);
  660. break;
  661. }
  662. out:
  663. read_unlock(&sk->sk_callback_lock);
  664. }
  665. /**
  666. * xs_udp_write_space - callback invoked when socket buffer space
  667. * becomes available
  668. * @sk: socket whose state has changed
  669. *
  670. * Called when more output buffer space is available for this socket.
  671. * We try not to wake our writers until they can make "significant"
  672. * progress, otherwise we'll waste resources thrashing kernel_sendmsg
  673. * with a bunch of small requests.
  674. */
  675. static void xs_udp_write_space(struct sock *sk)
  676. {
  677. read_lock(&sk->sk_callback_lock);
  678. /* from net/core/sock.c:sock_def_write_space */
  679. if (sock_writeable(sk)) {
  680. struct socket *sock;
  681. struct rpc_xprt *xprt;
  682. if (unlikely(!(sock = sk->sk_socket)))
  683. goto out;
  684. if (unlikely(!(xprt = xprt_from_sock(sk))))
  685. goto out;
  686. if (unlikely(!test_and_clear_bit(SOCK_NOSPACE, &sock->flags)))
  687. goto out;
  688. xprt_write_space(xprt);
  689. }
  690. out:
  691. read_unlock(&sk->sk_callback_lock);
  692. }
  693. /**
  694. * xs_tcp_write_space - callback invoked when socket buffer space
  695. * becomes available
  696. * @sk: socket whose state has changed
  697. *
  698. * Called when more output buffer space is available for this socket.
  699. * We try not to wake our writers until they can make "significant"
  700. * progress, otherwise we'll waste resources thrashing kernel_sendmsg
  701. * with a bunch of small requests.
  702. */
  703. static void xs_tcp_write_space(struct sock *sk)
  704. {
  705. read_lock(&sk->sk_callback_lock);
  706. /* from net/core/stream.c:sk_stream_write_space */
  707. if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
  708. struct socket *sock;
  709. struct rpc_xprt *xprt;
  710. if (unlikely(!(sock = sk->sk_socket)))
  711. goto out;
  712. if (unlikely(!(xprt = xprt_from_sock(sk))))
  713. goto out;
  714. if (unlikely(!test_and_clear_bit(SOCK_NOSPACE, &sock->flags)))
  715. goto out;
  716. xprt_write_space(xprt);
  717. }
  718. out:
  719. read_unlock(&sk->sk_callback_lock);
  720. }
  721. /**
  722. * xs_udp_set_buffer_size - set send and receive limits
  723. * @xprt: generic transport
  724. *
  725. * Set socket send and receive limits based on the
  726. * sndsize and rcvsize fields in the generic transport
  727. * structure.
  728. */
  729. static void xs_udp_set_buffer_size(struct rpc_xprt *xprt)
  730. {
  731. struct sock *sk = xprt->inet;
  732. if (xprt->rcvsize) {
  733. sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
  734. sk->sk_rcvbuf = xprt->rcvsize * xprt->max_reqs * 2;
  735. }
  736. if (xprt->sndsize) {
  737. sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
  738. sk->sk_sndbuf = xprt->sndsize * xprt->max_reqs * 2;
  739. sk->sk_write_space(sk);
  740. }
  741. }
  /**
   * xs_tcp_set_buffer_size - set send and receive limits
   * @xprt: generic transport
   *
   * Intentionally empty: nothing to do for TCP.
   */
  static void xs_tcp_set_buffer_size(struct rpc_xprt *xprt)
  {
  }
  752. /**
  753. * xs_udp_timer - called when a retransmit timeout occurs on a UDP transport
  754. * @task: task that timed out
  755. *
  756. * Adjust the congestion window after a retransmit timeout has occurred.
  757. */
  758. static void xs_udp_timer(struct rpc_task *task)
  759. {
  760. xprt_adjust_cwnd(task, -ETIMEDOUT);
  761. }
  762. static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock)
  763. {
  764. struct sockaddr_in myaddr = {
  765. .sin_family = AF_INET,
  766. };
  767. int err, port;
  768. /* Were we already bound to a given port? Try to reuse it */
  769. port = xprt->port;
  770. do {
  771. myaddr.sin_port = htons(port);
  772. err = sock->ops->bind(sock, (struct sockaddr *) &myaddr,
  773. sizeof(myaddr));
  774. if (err == 0) {
  775. xprt->port = port;
  776. dprintk("RPC: xs_bindresvport bound to port %u\n",
  777. port);
  778. return 0;
  779. }
  780. if (--port == 0)
  781. port = XS_MAX_RESVPORT;
  782. } while (err == -EADDRINUSE && port != xprt->port);
  783. dprintk("RPC: can't bind to reserved port (%d).\n", -err);
  784. return err;
  785. }
  786. /**
  787. * xs_udp_connect_worker - set up a UDP socket
  788. * @args: RPC transport to connect
  789. *
  790. * Invoked by a work queue tasklet.
  791. */
  792. static void xs_udp_connect_worker(void *args)
  793. {
  794. struct rpc_xprt *xprt = (struct rpc_xprt *) args;
  795. struct socket *sock = xprt->sock;
  796. int err, status = -EIO;
  797. if (xprt->shutdown || xprt->addr.sin_port == 0)
  798. goto out;
  799. dprintk("RPC: xs_udp_connect_worker for xprt %p\n", xprt);
  800. /* Start by resetting any existing state */
  801. xs_close(xprt);
  802. if ((err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) {
  803. dprintk("RPC: can't create UDP transport socket (%d).\n", -err);
  804. goto out;
  805. }
  806. if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) {
  807. sock_release(sock);
  808. goto out;
  809. }
  810. if (!xprt->inet) {
  811. struct sock *sk = sock->sk;
  812. write_lock_bh(&sk->sk_callback_lock);
  813. sk->sk_user_data = xprt;
  814. xprt->old_data_ready = sk->sk_data_ready;
  815. xprt->old_state_change = sk->sk_state_change;
  816. xprt->old_write_space = sk->sk_write_space;
  817. sk->sk_data_ready = xs_udp_data_ready;
  818. sk->sk_write_space = xs_udp_write_space;
  819. sk->sk_no_check = UDP_CSUM_NORCV;
  820. xprt_set_connected(xprt);
  821. /* Reset to new socket */
  822. xprt->sock = sock;
  823. xprt->inet = sk;
  824. write_unlock_bh(&sk->sk_callback_lock);
  825. }
  826. xs_udp_set_buffer_size(xprt);
  827. status = 0;
  828. out:
  829. xprt_wake_pending_tasks(xprt, status);
  830. xprt_clear_connecting(xprt);
  831. }
  832. /**
  833. * xs_tcp_connect_worker - connect a TCP socket to a remote endpoint
  834. * @args: RPC transport to connect
  835. *
  836. * Invoked by a work queue tasklet.
  837. */
  838. static void xs_tcp_connect_worker(void *args)
  839. {
  840. struct rpc_xprt *xprt = (struct rpc_xprt *)args;
  841. struct socket *sock = xprt->sock;
  842. int err, status = -EIO;
  843. if (xprt->shutdown || xprt->addr.sin_port == 0)
  844. goto out;
  845. dprintk("RPC: xs_tcp_connect_worker for xprt %p\n", xprt);
  846. /* Start by resetting any existing socket state */
  847. xs_close(xprt);
  848. if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
  849. dprintk("RPC: can't create TCP transport socket (%d).\n", -err);
  850. goto out;
  851. }
  852. if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) {
  853. sock_release(sock);
  854. goto out;
  855. }
  856. if (!xprt->inet) {
  857. struct sock *sk = sock->sk;
  858. write_lock_bh(&sk->sk_callback_lock);
  859. sk->sk_user_data = xprt;
  860. xprt->old_data_ready = sk->sk_data_ready;
  861. xprt->old_state_change = sk->sk_state_change;
  862. xprt->old_write_space = sk->sk_write_space;
  863. sk->sk_data_ready = xs_tcp_data_ready;
  864. sk->sk_state_change = xs_tcp_state_change;
  865. sk->sk_write_space = xs_tcp_write_space;
  866. tcp_sk(sk)->nonagle = 1;
  867. xprt_clear_connected(xprt);
  868. /* Reset to new socket */
  869. xprt->sock = sock;
  870. xprt->inet = sk;
  871. write_unlock_bh(&sk->sk_callback_lock);
  872. }
  873. /* Tell the socket layer to start connecting... */
  874. status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr,
  875. sizeof(xprt->addr), O_NONBLOCK);
  876. dprintk("RPC: %p connect status %d connected %d sock state %d\n",
  877. xprt, -status, xprt_connected(xprt), sock->sk->sk_state);
  878. if (status < 0) {
  879. switch (status) {
  880. case -EINPROGRESS:
  881. case -EALREADY:
  882. goto out_clear;
  883. }
  884. }
  885. out:
  886. xprt_wake_pending_tasks(xprt, status);
  887. out_clear:
  888. xprt_clear_connecting(xprt);
  889. }
  890. /**
  891. * xs_connect - connect a socket to a remote endpoint
  892. * @task: address of RPC task that manages state of connect request
  893. *
  894. * TCP: If the remote end dropped the connection, delay reconnecting.
  895. */
  896. static void xs_connect(struct rpc_task *task)
  897. {
  898. struct rpc_xprt *xprt = task->tk_xprt;
  899. if (xprt_test_and_set_connecting(xprt))
  900. return;
  901. if (xprt->sock != NULL) {
  902. dprintk("RPC: xs_connect delayed xprt %p\n", xprt);
  903. schedule_delayed_work(&xprt->connect_worker,
  904. RPC_REESTABLISH_TIMEOUT);
  905. } else {
  906. dprintk("RPC: xs_connect scheduled xprt %p\n", xprt);
  907. schedule_work(&xprt->connect_worker);
  908. /* flush_scheduled_work can sleep... */
  909. if (!RPC_IS_ASYNC(task))
  910. flush_scheduled_work();
  911. }
  912. }
  913. static struct rpc_xprt_ops xs_udp_ops = {
  914. .set_buffer_size = xs_udp_set_buffer_size,
  915. .reserve_xprt = xprt_reserve_xprt_cong,
  916. .release_xprt = xprt_release_xprt_cong,
  917. .connect = xs_connect,
  918. .send_request = xs_udp_send_request,
  919. .set_retrans_timeout = xprt_set_retrans_timeout_rtt,
  920. .timer = xs_udp_timer,
  921. .close = xs_close,
  922. .destroy = xs_destroy,
  923. };
  924. static struct rpc_xprt_ops xs_tcp_ops = {
  925. .set_buffer_size = xs_tcp_set_buffer_size,
  926. .reserve_xprt = xprt_reserve_xprt,
  927. .release_xprt = xprt_release_xprt,
  928. .connect = xs_connect,
  929. .send_request = xs_tcp_send_request,
  930. .set_retrans_timeout = xprt_set_retrans_timeout_def,
  931. .close = xs_close,
  932. .destroy = xs_destroy,
  933. };
  934. extern unsigned int xprt_udp_slot_table_entries;
  935. extern unsigned int xprt_tcp_slot_table_entries;
  936. /**
  937. * xs_setup_udp - Set up transport to use a UDP socket
  938. * @xprt: transport to set up
  939. * @to: timeout parameters
  940. *
  941. */
  942. int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to)
  943. {
  944. size_t slot_table_size;
  945. dprintk("RPC: setting up udp-ipv4 transport...\n");
  946. xprt->max_reqs = xprt_udp_slot_table_entries;
  947. slot_table_size = xprt->max_reqs * sizeof(xprt->slot[0]);
  948. xprt->slot = kmalloc(slot_table_size, GFP_KERNEL);
  949. if (xprt->slot == NULL)
  950. return -ENOMEM;
  951. memset(xprt->slot, 0, slot_table_size);
  952. xprt->prot = IPPROTO_UDP;
  953. xprt->port = XS_MAX_RESVPORT;
  954. xprt->tsh_size = 0;
  955. xprt->nocong = 0;
  956. xprt->cwnd = RPC_INITCWND;
  957. xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0;
  958. /* XXX: header size can vary due to auth type, IPv6, etc. */
  959. xprt->max_payload = (1U << 16) - (MAX_HEADER << 3);
  960. INIT_WORK(&xprt->connect_worker, xs_udp_connect_worker, xprt);
  961. xprt->ops = &xs_udp_ops;
  962. if (to)
  963. xprt->timeout = *to;
  964. else
  965. xprt_set_timeout(&xprt->timeout, 5, 5 * HZ);
  966. return 0;
  967. }
  968. /**
  969. * xs_setup_tcp - Set up transport to use a TCP socket
  970. * @xprt: transport to set up
  971. * @to: timeout parameters
  972. *
  973. */
  974. int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to)
  975. {
  976. size_t slot_table_size;
  977. dprintk("RPC: setting up tcp-ipv4 transport...\n");
  978. xprt->max_reqs = xprt_tcp_slot_table_entries;
  979. slot_table_size = xprt->max_reqs * sizeof(xprt->slot[0]);
  980. xprt->slot = kmalloc(slot_table_size, GFP_KERNEL);
  981. if (xprt->slot == NULL)
  982. return -ENOMEM;
  983. memset(xprt->slot, 0, slot_table_size);
  984. xprt->prot = IPPROTO_TCP;
  985. xprt->port = XS_MAX_RESVPORT;
  986. xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
  987. xprt->nocong = 1;
  988. xprt->cwnd = RPC_MAXCWND(xprt);
  989. xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0;
  990. xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
  991. INIT_WORK(&xprt->connect_worker, xs_tcp_connect_worker, xprt);
  992. xprt->ops = &xs_tcp_ops;
  993. if (to)
  994. xprt->timeout = *to;
  995. else
  996. xprt_set_timeout(&xprt->timeout, 2, 60 * HZ);
  997. return 0;
  998. }