ip_vs_proto_tcp.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640
  1. /*
  2. * ip_vs_proto_tcp.c: TCP load balancing support for IPVS
  3. *
  4. * Version: $Id: ip_vs_proto_tcp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $
  5. *
  6. * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
  7. * Julian Anastasov <ja@ssi.bg>
  8. *
  9. * This program is free software; you can redistribute it and/or
  10. * modify it under the terms of the GNU General Public License
  11. * as published by the Free Software Foundation; either version
  12. * 2 of the License, or (at your option) any later version.
  13. *
  14. * Changes:
  15. *
  16. */
  17. #include <linux/kernel.h>
  18. #include <linux/ip.h>
  19. #include <linux/tcp.h> /* for tcphdr */
  20. #include <net/ip.h>
  21. #include <net/tcp.h> /* for csum_tcpudp_magic */
  22. #include <linux/netfilter_ipv4.h>
  23. #include <net/ip_vs.h>
  24. static struct ip_vs_conn *
  25. tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
  26. const struct iphdr *iph, unsigned int proto_off, int inverse)
  27. {
  28. __u16 _ports[2], *pptr;
  29. pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
  30. if (pptr == NULL)
  31. return NULL;
  32. if (likely(!inverse)) {
  33. return ip_vs_conn_in_get(iph->protocol,
  34. iph->saddr, pptr[0],
  35. iph->daddr, pptr[1]);
  36. } else {
  37. return ip_vs_conn_in_get(iph->protocol,
  38. iph->daddr, pptr[1],
  39. iph->saddr, pptr[0]);
  40. }
  41. }
  42. static struct ip_vs_conn *
  43. tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
  44. const struct iphdr *iph, unsigned int proto_off, int inverse)
  45. {
  46. __u16 _ports[2], *pptr;
  47. pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
  48. if (pptr == NULL)
  49. return NULL;
  50. if (likely(!inverse)) {
  51. return ip_vs_conn_out_get(iph->protocol,
  52. iph->saddr, pptr[0],
  53. iph->daddr, pptr[1]);
  54. } else {
  55. return ip_vs_conn_out_get(iph->protocol,
  56. iph->daddr, pptr[1],
  57. iph->saddr, pptr[0]);
  58. }
  59. }
  60. static int
  61. tcp_conn_schedule(struct sk_buff *skb,
  62. struct ip_vs_protocol *pp,
  63. int *verdict, struct ip_vs_conn **cpp)
  64. {
  65. struct ip_vs_service *svc;
  66. struct tcphdr _tcph, *th;
  67. th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
  68. sizeof(_tcph), &_tcph);
  69. if (th == NULL) {
  70. *verdict = NF_DROP;
  71. return 0;
  72. }
  73. if (th->syn &&
  74. (svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol,
  75. skb->nh.iph->daddr, th->dest))) {
  76. if (ip_vs_todrop()) {
  77. /*
  78. * It seems that we are very loaded.
  79. * We have to drop this packet :(
  80. */
  81. ip_vs_service_put(svc);
  82. *verdict = NF_DROP;
  83. return 0;
  84. }
  85. /*
  86. * Let the virtual server select a real server for the
  87. * incoming connection, and create a connection entry.
  88. */
  89. *cpp = ip_vs_schedule(svc, skb);
  90. if (!*cpp) {
  91. *verdict = ip_vs_leave(svc, skb, pp);
  92. return 0;
  93. }
  94. ip_vs_service_put(svc);
  95. }
  96. return 1;
  97. }
  98. static inline void
  99. tcp_fast_csum_update(struct tcphdr *tcph, u32 oldip, u32 newip,
  100. u16 oldport, u16 newport)
  101. {
  102. tcph->check =
  103. ip_vs_check_diff(~oldip, newip,
  104. ip_vs_check_diff(oldport ^ 0xFFFF,
  105. newport, tcph->check));
  106. }
  107. static int
  108. tcp_snat_handler(struct sk_buff **pskb,
  109. struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
  110. {
  111. struct tcphdr *tcph;
  112. unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;
  113. /* csum_check requires unshared skb */
  114. if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
  115. return 0;
  116. if (unlikely(cp->app != NULL)) {
  117. /* Some checks before mangling */
  118. if (pp->csum_check && !pp->csum_check(*pskb, pp))
  119. return 0;
  120. /* Call application helper if needed */
  121. if (!ip_vs_app_pkt_out(cp, pskb))
  122. return 0;
  123. }
  124. tcph = (void *)(*pskb)->nh.iph + tcphoff;
  125. tcph->source = cp->vport;
  126. /* Adjust TCP checksums */
  127. if (!cp->app) {
  128. /* Only port and addr are changed, do fast csum update */
  129. tcp_fast_csum_update(tcph, cp->daddr, cp->vaddr,
  130. cp->dport, cp->vport);
  131. if ((*pskb)->ip_summed == CHECKSUM_HW)
  132. (*pskb)->ip_summed = CHECKSUM_NONE;
  133. } else {
  134. /* full checksum calculation */
  135. tcph->check = 0;
  136. (*pskb)->csum = skb_checksum(*pskb, tcphoff,
  137. (*pskb)->len - tcphoff, 0);
  138. tcph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
  139. (*pskb)->len - tcphoff,
  140. cp->protocol,
  141. (*pskb)->csum);
  142. IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
  143. pp->name, tcph->check,
  144. (char*)&(tcph->check) - (char*)tcph);
  145. }
  146. return 1;
  147. }
  148. static int
  149. tcp_dnat_handler(struct sk_buff **pskb,
  150. struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
  151. {
  152. struct tcphdr *tcph;
  153. unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;
  154. /* csum_check requires unshared skb */
  155. if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
  156. return 0;
  157. if (unlikely(cp->app != NULL)) {
  158. /* Some checks before mangling */
  159. if (pp->csum_check && !pp->csum_check(*pskb, pp))
  160. return 0;
  161. /*
  162. * Attempt ip_vs_app call.
  163. * It will fix ip_vs_conn and iph ack_seq stuff
  164. */
  165. if (!ip_vs_app_pkt_in(cp, pskb))
  166. return 0;
  167. }
  168. tcph = (void *)(*pskb)->nh.iph + tcphoff;
  169. tcph->dest = cp->dport;
  170. /*
  171. * Adjust TCP checksums
  172. */
  173. if (!cp->app) {
  174. /* Only port and addr are changed, do fast csum update */
  175. tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr,
  176. cp->vport, cp->dport);
  177. if ((*pskb)->ip_summed == CHECKSUM_HW)
  178. (*pskb)->ip_summed = CHECKSUM_NONE;
  179. } else {
  180. /* full checksum calculation */
  181. tcph->check = 0;
  182. (*pskb)->csum = skb_checksum(*pskb, tcphoff,
  183. (*pskb)->len - tcphoff, 0);
  184. tcph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
  185. (*pskb)->len - tcphoff,
  186. cp->protocol,
  187. (*pskb)->csum);
  188. (*pskb)->ip_summed = CHECKSUM_UNNECESSARY;
  189. }
  190. return 1;
  191. }
  192. static int
  193. tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
  194. {
  195. unsigned int tcphoff = skb->nh.iph->ihl*4;
  196. switch (skb->ip_summed) {
  197. case CHECKSUM_NONE:
  198. skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
  199. case CHECKSUM_HW:
  200. if (csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr,
  201. skb->len - tcphoff,
  202. skb->nh.iph->protocol, skb->csum)) {
  203. IP_VS_DBG_RL_PKT(0, pp, skb, 0,
  204. "Failed checksum for");
  205. return 0;
  206. }
  207. break;
  208. default:
  209. /* CHECKSUM_UNNECESSARY */
  210. break;
  211. }
  212. return 1;
  213. }
/*
 * Row offsets into the TCP state tables (tcp_states / tcp_states_dos).
 * Each direction owns four consecutive rows (syn, fin, ack, rst), hence
 * the step of 4 between the offsets.
 */
#define TCP_DIR_INPUT		0
#define TCP_DIR_OUTPUT		4
#define TCP_DIR_INPUT_ONLY	8

/* Maps an IP_VS_DIR_* direction to its row offset in the state tables. */
static int tcp_state_off[IP_VS_DIR_LAST] = {
	[IP_VS_DIR_INPUT]		=	TCP_DIR_INPUT,
	[IP_VS_DIR_OUTPUT]		=	TCP_DIR_OUTPUT,
	[IP_VS_DIR_INPUT_ONLY]		=	TCP_DIR_INPUT_ONLY,
};
/*
 *	Timeout table[state]
 *
 *	Indexed by IP_VS_TCP_S_* state; installed as pp->timeout_table by
 *	tcp_init() and read by set_tcp_state() to set cp->timeout.
 *	Values are multiples of HZ (presumably jiffies, i.e. N*HZ == N
 *	seconds -- confirm against the kernel's HZ definition).
 */
static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
	[IP_VS_TCP_S_NONE]		=	2*HZ,
	[IP_VS_TCP_S_ESTABLISHED]	=	15*60*HZ,
	[IP_VS_TCP_S_SYN_SENT]		=	2*60*HZ,
	[IP_VS_TCP_S_SYN_RECV]		=	1*60*HZ,
	[IP_VS_TCP_S_FIN_WAIT]		=	2*60*HZ,
	[IP_VS_TCP_S_TIME_WAIT]		=	2*60*HZ,
	[IP_VS_TCP_S_CLOSE]		=	10*HZ,
	[IP_VS_TCP_S_CLOSE_WAIT]	=	60*HZ,
	[IP_VS_TCP_S_LAST_ACK]		=	30*HZ,
	[IP_VS_TCP_S_LISTEN]		=	2*60*HZ,
	[IP_VS_TCP_S_SYNACK]		=	120*HZ,
	[IP_VS_TCP_S_LAST]		=	2*HZ,
};
/*
 * Alternative timeout table with shorter values for several states.
 * Compiled out (#if 0) and not referenced anywhere in this file.
 */
#if 0

/* FIXME: This is going to die */

static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = {
	[IP_VS_TCP_S_NONE]		=	2*HZ,
	[IP_VS_TCP_S_ESTABLISHED]	=	8*60*HZ,
	[IP_VS_TCP_S_SYN_SENT]		=	60*HZ,
	[IP_VS_TCP_S_SYN_RECV]		=	10*HZ,
	[IP_VS_TCP_S_FIN_WAIT]		=	60*HZ,
	[IP_VS_TCP_S_TIME_WAIT]		=	60*HZ,
	[IP_VS_TCP_S_CLOSE]		=	10*HZ,
	[IP_VS_TCP_S_CLOSE_WAIT]	=	60*HZ,
	[IP_VS_TCP_S_LAST_ACK]		=	30*HZ,
	[IP_VS_TCP_S_LISTEN]		=	2*60*HZ,
	[IP_VS_TCP_S_SYNACK]		=	100*HZ,
	[IP_VS_TCP_S_LAST]		=	2*HZ,
};

#endif
/*
 * Printable name for each IP_VS_TCP_S_* state.  Used by tcp_state_name()
 * for debug output and passed to ip_vs_set_state_timeout() by
 * tcp_set_state_timeout() to resolve a state name given as a string.
 * The IP_VS_TCP_S_LAST slot is a sentinel and is never returned by
 * tcp_state_name().
 */
static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
	[IP_VS_TCP_S_NONE]		=	"NONE",
	[IP_VS_TCP_S_ESTABLISHED]	=	"ESTABLISHED",
	[IP_VS_TCP_S_SYN_SENT]		=	"SYN_SENT",
	[IP_VS_TCP_S_SYN_RECV]		=	"SYN_RECV",
	[IP_VS_TCP_S_FIN_WAIT]		=	"FIN_WAIT",
	[IP_VS_TCP_S_TIME_WAIT]		=	"TIME_WAIT",
	[IP_VS_TCP_S_CLOSE]		=	"CLOSE",
	[IP_VS_TCP_S_CLOSE_WAIT]	=	"CLOSE_WAIT",
	[IP_VS_TCP_S_LAST_ACK]		=	"LAST_ACK",
	[IP_VS_TCP_S_LISTEN]		=	"LISTEN",
	[IP_VS_TCP_S_SYNACK]		=	"SYNACK",
	[IP_VS_TCP_S_LAST]		=	"BUG!",
};
/*
 * Two-letter shorthands for the IP_VS_TCP_S_* states, used to keep the
 * transition tables below readable.
 */
#define sNO IP_VS_TCP_S_NONE
#define sES IP_VS_TCP_S_ESTABLISHED
#define sSS IP_VS_TCP_S_SYN_SENT
#define sSR IP_VS_TCP_S_SYN_RECV
#define sFW IP_VS_TCP_S_FIN_WAIT
#define sTW IP_VS_TCP_S_TIME_WAIT
#define sCL IP_VS_TCP_S_CLOSE
#define sCW IP_VS_TCP_S_CLOSE_WAIT
#define sLA IP_VS_TCP_S_LAST_ACK
#define sLI IP_VS_TCP_S_LISTEN
#define sSA IP_VS_TCP_S_SYNACK

/*
 * One row of a transition table: next_state[] is indexed by the current
 * connection state and yields the state to move to.
 */
struct tcp_states_t {
	int next_state[IP_VS_TCP_S_LAST];
};
  284. static const char * tcp_state_name(int state)
  285. {
  286. if (state >= IP_VS_TCP_S_LAST)
  287. return "ERR!";
  288. return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
  289. }
/*
 * Default TCP state transition table.
 *
 * Layout: three sections of four rows each (INPUT, OUTPUT, INPUT-ONLY),
 * selected by the TCP_DIR_* offset; within a section the row is chosen by
 * tcp_state_idx() (0 = syn, 1 = fin, 2 = ack, 3 = rst).  Each column is
 * the current state, in the order given by the column header comments;
 * the cell is the next state.  See set_tcp_state() for the lookup.
 */
static struct tcp_states_t tcp_states [] = {
/*	INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},

/*	OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},

/*	INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
/*
 * Alternative TCP state transition table, selected by tcp_timeout_change()
 * when bit 0 of its flags argument ("secure_tcp") is set.
 *
 * Same layout as tcp_states: three sections of four rows (INPUT, OUTPUT,
 * INPUT-ONLY; rows syn, fin, ack, rst), columns indexed by current state.
 */
static struct tcp_states_t tcp_states_dos [] = {
/*	INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},

/*	OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},

/*	INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
/*
 * Transition table currently in effect; starts as tcp_states and is
 * switched by tcp_timeout_change().  Read by set_tcp_state().
 */
static struct tcp_states_t *tcp_state_table = tcp_states;
  331. static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
  332. {
  333. int on = (flags & 1); /* secure_tcp */
  334. /*
  335. ** FIXME: change secure_tcp to independent sysctl var
  336. ** or make it per-service or per-app because it is valid
  337. ** for most if not for all of the applications. Something
  338. ** like "capabilities" (flags) for each object.
  339. */
  340. tcp_state_table = (on? tcp_states_dos : tcp_states);
  341. }
/*
 * Set the timeout of the TCP state named @sname to @to.
 *
 * Thin wrapper: delegates to ip_vs_set_state_timeout() with this
 * protocol's timeout table (pp->timeout_table), the number of TCP states
 * and the state name table, and returns its result unchanged.
 */
static int
tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
{
	return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
				       tcp_state_name_table, sname, to);
}
  348. static inline int tcp_state_idx(struct tcphdr *th)
  349. {
  350. if (th->rst)
  351. return 3;
  352. if (th->syn)
  353. return 0;
  354. if (th->fin)
  355. return 1;
  356. if (th->ack)
  357. return 2;
  358. return -1;
  359. }
  360. static inline void
  361. set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
  362. int direction, struct tcphdr *th)
  363. {
  364. int state_idx;
  365. int new_state = IP_VS_TCP_S_CLOSE;
  366. int state_off = tcp_state_off[direction];
  367. /*
  368. * Update state offset to INPUT_ONLY if necessary
  369. * or delete NO_OUTPUT flag if output packet detected
  370. */
  371. if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
  372. if (state_off == TCP_DIR_OUTPUT)
  373. cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
  374. else
  375. state_off = TCP_DIR_INPUT_ONLY;
  376. }
  377. if ((state_idx = tcp_state_idx(th)) < 0) {
  378. IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
  379. goto tcp_state_out;
  380. }
  381. new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
  382. tcp_state_out:
  383. if (new_state != cp->state) {
  384. struct ip_vs_dest *dest = cp->dest;
  385. IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
  386. "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
  387. pp->name,
  388. (state_off==TCP_DIR_OUTPUT)?"output ":"input ",
  389. th->syn? 'S' : '.',
  390. th->fin? 'F' : '.',
  391. th->ack? 'A' : '.',
  392. th->rst? 'R' : '.',
  393. NIPQUAD(cp->daddr), ntohs(cp->dport),
  394. NIPQUAD(cp->caddr), ntohs(cp->cport),
  395. tcp_state_name(cp->state),
  396. tcp_state_name(new_state),
  397. atomic_read(&cp->refcnt));
  398. if (dest) {
  399. if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
  400. (new_state != IP_VS_TCP_S_ESTABLISHED)) {
  401. atomic_dec(&dest->activeconns);
  402. atomic_inc(&dest->inactconns);
  403. cp->flags |= IP_VS_CONN_F_INACTIVE;
  404. } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
  405. (new_state == IP_VS_TCP_S_ESTABLISHED)) {
  406. atomic_inc(&dest->activeconns);
  407. atomic_dec(&dest->inactconns);
  408. cp->flags &= ~IP_VS_CONN_F_INACTIVE;
  409. }
  410. }
  411. }
  412. cp->timeout = pp->timeout_table[cp->state = new_state];
  413. }
  414. /*
  415. * Handle state transitions
  416. */
  417. static int
  418. tcp_state_transition(struct ip_vs_conn *cp, int direction,
  419. const struct sk_buff *skb,
  420. struct ip_vs_protocol *pp)
  421. {
  422. struct tcphdr _tcph, *th;
  423. th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
  424. sizeof(_tcph), &_tcph);
  425. if (th == NULL)
  426. return 0;
  427. spin_lock(&cp->lock);
  428. set_tcp_state(pp, cp, direction, th);
  429. spin_unlock(&cp->lock);
  430. return 1;
  431. }
/*
 *	Hash table for TCP application incarnations
 *
 *	2^TCP_APP_TAB_BITS buckets, each a list of struct ip_vs_app linked
 *	through p_list and keyed by port via tcp_app_hashkey().  The bucket
 *	heads are initialised in tcp_init(); list walks and modifications
 *	in this file are done with tcp_app_lock held.
 */
#define	TCP_APP_TAB_BITS	4
#define	TCP_APP_TAB_SIZE	(1 << TCP_APP_TAB_BITS)
#define	TCP_APP_TAB_MASK	(TCP_APP_TAB_SIZE - 1)

static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
static DEFINE_SPINLOCK(tcp_app_lock);
  440. static inline __u16 tcp_app_hashkey(__u16 port)
  441. {
  442. return ((port >> TCP_APP_TAB_BITS) ^ port) & TCP_APP_TAB_MASK;
  443. }
  444. static int tcp_register_app(struct ip_vs_app *inc)
  445. {
  446. struct ip_vs_app *i;
  447. __u16 hash, port = inc->port;
  448. int ret = 0;
  449. hash = tcp_app_hashkey(port);
  450. spin_lock_bh(&tcp_app_lock);
  451. list_for_each_entry(i, &tcp_apps[hash], p_list) {
  452. if (i->port == port) {
  453. ret = -EEXIST;
  454. goto out;
  455. }
  456. }
  457. list_add(&inc->p_list, &tcp_apps[hash]);
  458. atomic_inc(&ip_vs_protocol_tcp.appcnt);
  459. out:
  460. spin_unlock_bh(&tcp_app_lock);
  461. return ret;
  462. }
/*
 * Remove an application incarnation from the TCP application hash table.
 *
 * Under tcp_app_lock (taken with spin_lock_bh) the protocol's appcnt is
 * decremented and @inc is unlinked from its bucket list.  No check is made
 * that @inc is currently registered.
 */
static void
tcp_unregister_app(struct ip_vs_app *inc)
{
	spin_lock_bh(&tcp_app_lock);
	atomic_dec(&ip_vs_protocol_tcp.appcnt);
	list_del(&inc->p_list);
	spin_unlock_bh(&tcp_app_lock);
}
  471. static int
  472. tcp_app_conn_bind(struct ip_vs_conn *cp)
  473. {
  474. int hash;
  475. struct ip_vs_app *inc;
  476. int result = 0;
  477. /* Default binding: bind app only for NAT */
  478. if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
  479. return 0;
  480. /* Lookup application incarnations and bind the right one */
  481. hash = tcp_app_hashkey(cp->vport);
  482. spin_lock(&tcp_app_lock);
  483. list_for_each_entry(inc, &tcp_apps[hash], p_list) {
  484. if (inc->port == cp->vport) {
  485. if (unlikely(!ip_vs_app_inc_get(inc)))
  486. break;
  487. spin_unlock(&tcp_app_lock);
  488. IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
  489. "%u.%u.%u.%u:%u to app %s on port %u\n",
  490. __FUNCTION__,
  491. NIPQUAD(cp->caddr), ntohs(cp->cport),
  492. NIPQUAD(cp->vaddr), ntohs(cp->vport),
  493. inc->name, ntohs(inc->port));
  494. cp->app = inc;
  495. if (inc->init_conn)
  496. result = inc->init_conn(inc, cp);
  497. goto out;
  498. }
  499. }
  500. spin_unlock(&tcp_app_lock);
  501. out:
  502. return result;
  503. }
/*
 *	Set LISTEN timeout. (ip_vs_conn_put will setup timer)
 *
 *	Under cp->lock, forces the connection into IP_VS_TCP_S_LISTEN and
 *	loads the matching timeout from the TCP protocol's timeout table.
 *	Non-static: callable from outside this file.
 */
void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
{
	spin_lock(&cp->lock);
	cp->state = IP_VS_TCP_S_LISTEN;
	cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
	spin_unlock(&cp->lock);
}
/*
 * Protocol init hook: initialises the tcp_apps hash buckets and installs
 * tcp_timeouts as the protocol's timeout table.
 */
static void tcp_init(struct ip_vs_protocol *pp)
{
	IP_VS_INIT_HASH_TABLE(tcp_apps);
	pp->timeout_table = tcp_timeouts;
}
/* Protocol exit hook: intentionally empty, there is nothing to release. */
static void tcp_exit(struct ip_vs_protocol *pp)
{
}
/*
 * TCP protocol descriptor for IPVS: wires the handlers defined in this
 * file into the generic struct ip_vs_protocol operations table.
 * debug_packet uses the shared ip_vs_tcpudp_debug_packet helper, which is
 * defined outside this file.  timeout_table is not set here; tcp_init()
 * fills it in.
 */
struct ip_vs_protocol ip_vs_protocol_tcp = {
	.name =			"TCP",
	.protocol =		IPPROTO_TCP,
	.dont_defrag =		0,
	.appcnt =		ATOMIC_INIT(0),
	.init =			tcp_init,
	.exit =			tcp_exit,
	.register_app =		tcp_register_app,
	.unregister_app =	tcp_unregister_app,
	.conn_schedule =	tcp_conn_schedule,
	.conn_in_get =		tcp_conn_in_get,
	.conn_out_get =		tcp_conn_out_get,
	.snat_handler =		tcp_snat_handler,
	.dnat_handler =		tcp_dnat_handler,
	.csum_check =		tcp_csum_check,
	.state_name =		tcp_state_name,
	.state_transition =	tcp_state_transition,
	.app_conn_bind =	tcp_app_conn_bind,
	.debug_packet =		ip_vs_tcpudp_debug_packet,
	.timeout_change =	tcp_timeout_change,
	.set_state_timeout =	tcp_set_state_timeout,
};