ip_vs_sync.c 22 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892
  1. /*
  2. * IPVS An implementation of the IP virtual server support for the
  3. * LINUX operating system. IPVS is now implemented as a module
  4. * over the NetFilter framework. IPVS can be used to build a
  5. * high-performance and highly available server based on a
  6. * cluster of servers.
  7. *
  8. * Version: $Id: ip_vs_sync.c,v 1.13 2003/06/08 09:31:19 wensong Exp $
  9. *
  10. * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
  11. *
  12. * ip_vs_sync: sync connection info from master load balancer to backups
  13. * through multicast
  14. *
  15. * Changes:
  16. * Alexandre Cassen : Added master & backup support at a time.
  17. * Alexandre Cassen : Added SyncID support for incoming sync
  18. * messages filtering.
  19. * Justin Ossevoort : Fix endian problem on sync message size.
  20. */
  21. #include <linux/module.h>
  22. #include <linux/slab.h>
  23. #include <linux/net.h>
  24. #include <linux/completion.h>
  25. #include <linux/delay.h>
  26. #include <linux/skbuff.h>
  27. #include <linux/in.h>
  28. #include <linux/igmp.h> /* for ip_mc_join_group */
  29. #include <net/ip.h>
  30. #include <net/sock.h>
  31. #include <asm/uaccess.h> /* for get_fs and set_fs */
  32. #include <net/ip_vs.h>
  /* multicast group address used for sync traffic: 224.0.0.81 */
  #define IP_VS_SYNC_GROUP 0xe0000051
  /* multicast port used for sync traffic */
  #define IP_VS_SYNC_PORT 8848
  35. /*
  36. * IPVS sync connection entry
  37. */
  38. struct ip_vs_sync_conn {
  39. __u8 reserved;
  40. /* Protocol, addresses and port numbers */
  41. __u8 protocol; /* Which protocol (TCP/UDP) */
  42. __u16 cport;
  43. __u16 vport;
  44. __u16 dport;
  45. __u32 caddr; /* client address */
  46. __u32 vaddr; /* virtual address */
  47. __u32 daddr; /* destination address */
  48. /* Flags and state transition */
  49. __u16 flags; /* status flags */
  50. __u16 state; /* state info */
  51. /* The sequence options start here */
  52. };
  53. struct ip_vs_sync_conn_options {
  54. struct ip_vs_seq in_seq; /* incoming seq. struct */
  55. struct ip_vs_seq out_seq; /* outgoing seq. struct */
  56. };
  /* timeout assigned to connections created/refreshed from sync messages */
  #define IP_VS_SYNC_CONN_TIMEOUT (3*60*HZ)
  /* size of an entry without the sequence options */
  #define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn))
  /* size of an entry followed by its sequence options */
  #define FULL_CONN_SIZE \
      (sizeof(struct ip_vs_sync_conn) + \
       sizeof(struct ip_vs_sync_conn_options))
  /*
    The master multicasts messages to the backup load balancers in the
    following format.

         0                   1                   2                   3
         0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
        +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
        |  Count Conns  |    SyncID     |            Size               |
        +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
        |                                                               |
        |                    IPVS Sync Connection (1)                   |
        +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
        |                            .                                  |
        |                            .                                  |
        |                            .                                  |
        +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
        |                                                               |
        |                    IPVS Sync Connection (n)                   |
        +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  */
  80. #define SYNC_MESG_HEADER_LEN 4
  81. struct ip_vs_sync_mesg {
  82. __u8 nr_conns;
  83. __u8 syncid;
  84. __u16 size;
  85. /* ip_vs_sync_conn entries start here */
  86. };
  /*
   * Maximum length of a sync message, kept separately for the sending
   * and the receiving side.
   */
  static int sync_send_mesg_maxlen;
  static int sync_recv_mesg_maxlen;
  90. struct ip_vs_sync_buff {
  91. struct list_head list;
  92. unsigned long firstuse;
  93. /* pointers for the message data */
  94. struct ip_vs_sync_mesg *mesg;
  95. unsigned char *head;
  96. unsigned char *end;
  97. };
  98. /* the sync_buff list head and the lock */
  99. static LIST_HEAD(ip_vs_sync_queue);
  100. static DEFINE_SPINLOCK(ip_vs_sync_lock);
  101. /* current sync_buff for accepting new conn entries */
  102. static struct ip_vs_sync_buff *curr_sb = NULL;
  103. static DEFINE_SPINLOCK(curr_sb_lock);
  104. /* ipvs sync daemon state */
  105. volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
  106. volatile int ip_vs_master_syncid = 0;
  107. volatile int ip_vs_backup_syncid = 0;
  108. /* multicast interface name */
  109. char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
  110. char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
  111. /* multicast addr */
  112. static struct sockaddr_in mcast_addr;
  113. static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
  114. {
  115. spin_lock(&ip_vs_sync_lock);
  116. list_add_tail(&sb->list, &ip_vs_sync_queue);
  117. spin_unlock(&ip_vs_sync_lock);
  118. }
  119. static inline struct ip_vs_sync_buff * sb_dequeue(void)
  120. {
  121. struct ip_vs_sync_buff *sb;
  122. spin_lock_bh(&ip_vs_sync_lock);
  123. if (list_empty(&ip_vs_sync_queue)) {
  124. sb = NULL;
  125. } else {
  126. sb = list_entry(ip_vs_sync_queue.next,
  127. struct ip_vs_sync_buff,
  128. list);
  129. list_del(&sb->list);
  130. }
  131. spin_unlock_bh(&ip_vs_sync_lock);
  132. return sb;
  133. }
  134. static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
  135. {
  136. struct ip_vs_sync_buff *sb;
  137. if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
  138. return NULL;
  139. if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
  140. kfree(sb);
  141. return NULL;
  142. }
  143. sb->mesg->nr_conns = 0;
  144. sb->mesg->syncid = ip_vs_master_syncid;
  145. sb->mesg->size = 4;
  146. sb->head = (unsigned char *)sb->mesg + 4;
  147. sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
  148. sb->firstuse = jiffies;
  149. return sb;
  150. }
  151. static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
  152. {
  153. kfree(sb->mesg);
  154. kfree(sb);
  155. }
  156. /*
  157. * Get the current sync buffer if it has been created for more
  158. * than the specified time or the specified time is zero.
  159. */
  160. static inline struct ip_vs_sync_buff *
  161. get_curr_sync_buff(unsigned long time)
  162. {
  163. struct ip_vs_sync_buff *sb;
  164. spin_lock_bh(&curr_sb_lock);
  165. if (curr_sb && (time == 0 ||
  166. time_before(jiffies - curr_sb->firstuse, time))) {
  167. sb = curr_sb;
  168. curr_sb = NULL;
  169. } else
  170. sb = NULL;
  171. spin_unlock_bh(&curr_sb_lock);
  172. return sb;
  173. }
  174. /*
  175. * Add an ip_vs_conn information into the current sync_buff.
  176. * Called by ip_vs_in.
  177. */
  178. void ip_vs_sync_conn(struct ip_vs_conn *cp)
  179. {
  180. struct ip_vs_sync_mesg *m;
  181. struct ip_vs_sync_conn *s;
  182. int len;
  183. spin_lock(&curr_sb_lock);
  184. if (!curr_sb) {
  185. if (!(curr_sb=ip_vs_sync_buff_create())) {
  186. spin_unlock(&curr_sb_lock);
  187. IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
  188. return;
  189. }
  190. }
  191. len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
  192. SIMPLE_CONN_SIZE;
  193. m = curr_sb->mesg;
  194. s = (struct ip_vs_sync_conn *)curr_sb->head;
  195. /* copy members */
  196. s->protocol = cp->protocol;
  197. s->cport = cp->cport;
  198. s->vport = cp->vport;
  199. s->dport = cp->dport;
  200. s->caddr = cp->caddr;
  201. s->vaddr = cp->vaddr;
  202. s->daddr = cp->daddr;
  203. s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
  204. s->state = htons(cp->state);
  205. if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
  206. struct ip_vs_sync_conn_options *opt =
  207. (struct ip_vs_sync_conn_options *)&s[1];
  208. memcpy(opt, &cp->in_seq, sizeof(*opt));
  209. }
  210. m->nr_conns++;
  211. m->size += len;
  212. curr_sb->head += len;
  213. /* check if there is a space for next one */
  214. if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
  215. sb_queue_tail(curr_sb);
  216. curr_sb = NULL;
  217. }
  218. spin_unlock(&curr_sb_lock);
  219. /* synchronize its controller if it has */
  220. if (cp->control)
  221. ip_vs_sync_conn(cp->control);
  222. }
  223. /*
  224. * Process received multicast message and create the corresponding
  225. * ip_vs_conn entries.
  226. */
  227. static void ip_vs_process_message(const char *buffer, const size_t buflen)
  228. {
  229. struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
  230. struct ip_vs_sync_conn *s;
  231. struct ip_vs_sync_conn_options *opt;
  232. struct ip_vs_conn *cp;
  233. char *p;
  234. int i;
  235. /* Convert size back to host byte order */
  236. m->size = ntohs(m->size);
  237. if (buflen != m->size) {
  238. IP_VS_ERR("bogus message\n");
  239. return;
  240. }
  241. /* SyncID sanity check */
  242. if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
  243. IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
  244. m->syncid);
  245. return;
  246. }
  247. p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
  248. for (i=0; i<m->nr_conns; i++) {
  249. s = (struct ip_vs_sync_conn *)p;
  250. cp = ip_vs_conn_in_get(s->protocol,
  251. s->caddr, s->cport,
  252. s->vaddr, s->vport);
  253. if (!cp) {
  254. cp = ip_vs_conn_new(s->protocol,
  255. s->caddr, s->cport,
  256. s->vaddr, s->vport,
  257. s->daddr, s->dport,
  258. ntohs(s->flags), NULL);
  259. if (!cp) {
  260. IP_VS_ERR("ip_vs_conn_new failed\n");
  261. return;
  262. }
  263. cp->state = ntohs(s->state);
  264. } else if (!cp->dest) {
  265. /* it is an entry created by the synchronization */
  266. cp->state = ntohs(s->state);
  267. cp->flags = ntohs(s->flags) | IP_VS_CONN_F_HASHED;
  268. } /* Note that we don't touch its state and flags
  269. if it is a normal entry. */
  270. if (ntohs(s->flags) & IP_VS_CONN_F_SEQ_MASK) {
  271. opt = (struct ip_vs_sync_conn_options *)&s[1];
  272. memcpy(&cp->in_seq, opt, sizeof(*opt));
  273. p += FULL_CONN_SIZE;
  274. } else
  275. p += SIMPLE_CONN_SIZE;
  276. atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
  277. cp->timeout = IP_VS_SYNC_CONN_TIMEOUT;
  278. ip_vs_conn_put(cp);
  279. if (p > buffer+buflen) {
  280. IP_VS_ERR("bogus message\n");
  281. return;
  282. }
  283. }
  284. }
  285. /*
  286. * Setup loopback of outgoing multicasts on a sending socket
  287. */
  288. static void set_mcast_loop(struct sock *sk, u_char loop)
  289. {
  290. struct inet_sock *inet = inet_sk(sk);
  291. /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
  292. lock_sock(sk);
  293. inet->mc_loop = loop ? 1 : 0;
  294. release_sock(sk);
  295. }
  296. /*
  297. * Specify TTL for outgoing multicasts on a sending socket
  298. */
  299. static void set_mcast_ttl(struct sock *sk, u_char ttl)
  300. {
  301. struct inet_sock *inet = inet_sk(sk);
  302. /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
  303. lock_sock(sk);
  304. inet->mc_ttl = ttl;
  305. release_sock(sk);
  306. }
  307. /*
  308. * Specifiy default interface for outgoing multicasts
  309. */
  310. static int set_mcast_if(struct sock *sk, char *ifname)
  311. {
  312. struct net_device *dev;
  313. struct inet_sock *inet = inet_sk(sk);
  314. if ((dev = __dev_get_by_name(ifname)) == NULL)
  315. return -ENODEV;
  316. if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
  317. return -EINVAL;
  318. lock_sock(sk);
  319. inet->mc_index = dev->ifindex;
  320. /* inet->mc_addr = 0; */
  321. release_sock(sk);
  322. return 0;
  323. }
  324. /*
  325. * Set the maximum length of sync message according to the
  326. * specified interface's MTU.
  327. */
  328. static int set_sync_mesg_maxlen(int sync_state)
  329. {
  330. struct net_device *dev;
  331. int num;
  332. if (sync_state == IP_VS_STATE_MASTER) {
  333. if ((dev = __dev_get_by_name(ip_vs_master_mcast_ifn)) == NULL)
  334. return -ENODEV;
  335. num = (dev->mtu - sizeof(struct iphdr) -
  336. sizeof(struct udphdr) -
  337. SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
  338. sync_send_mesg_maxlen =
  339. SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num;
  340. IP_VS_DBG(7, "setting the maximum length of sync sending "
  341. "message %d.\n", sync_send_mesg_maxlen);
  342. } else if (sync_state == IP_VS_STATE_BACKUP) {
  343. if ((dev = __dev_get_by_name(ip_vs_backup_mcast_ifn)) == NULL)
  344. return -ENODEV;
  345. sync_recv_mesg_maxlen = dev->mtu -
  346. sizeof(struct iphdr) - sizeof(struct udphdr);
  347. IP_VS_DBG(7, "setting the maximum length of sync receiving "
  348. "message %d.\n", sync_recv_mesg_maxlen);
  349. }
  350. return 0;
  351. }
  352. /*
  353. * Join a multicast group.
  354. * the group is specified by a class D multicast address 224.0.0.0/8
  355. * in the in_addr structure passed in as a parameter.
  356. */
  357. static int
  358. join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
  359. {
  360. struct ip_mreqn mreq;
  361. struct net_device *dev;
  362. int ret;
  363. memset(&mreq, 0, sizeof(mreq));
  364. memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
  365. if ((dev = __dev_get_by_name(ifname)) == NULL)
  366. return -ENODEV;
  367. if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
  368. return -EINVAL;
  369. mreq.imr_ifindex = dev->ifindex;
  370. lock_sock(sk);
  371. ret = ip_mc_join_group(sk, &mreq);
  372. release_sock(sk);
  373. return ret;
  374. }
  375. static int bind_mcastif_addr(struct socket *sock, char *ifname)
  376. {
  377. struct net_device *dev;
  378. u32 addr;
  379. struct sockaddr_in sin;
  380. if ((dev = __dev_get_by_name(ifname)) == NULL)
  381. return -ENODEV;
  382. addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
  383. if (!addr)
  384. IP_VS_ERR("You probably need to specify IP address on "
  385. "multicast interface.\n");
  386. IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
  387. ifname, NIPQUAD(addr));
  388. /* Now bind the socket with the address of multicast interface */
  389. sin.sin_family = AF_INET;
  390. sin.sin_addr.s_addr = addr;
  391. sin.sin_port = 0;
  392. return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
  393. }
  394. /*
  395. * Set up sending multicast socket over UDP
  396. */
  397. static struct socket * make_send_sock(void)
  398. {
  399. struct socket *sock;
  400. /* First create a socket */
  401. if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
  402. IP_VS_ERR("Error during creation of socket; terminating\n");
  403. return NULL;
  404. }
  405. if (set_mcast_if(sock->sk, ip_vs_master_mcast_ifn) < 0) {
  406. IP_VS_ERR("Error setting outbound mcast interface\n");
  407. goto error;
  408. }
  409. set_mcast_loop(sock->sk, 0);
  410. set_mcast_ttl(sock->sk, 1);
  411. if (bind_mcastif_addr(sock, ip_vs_master_mcast_ifn) < 0) {
  412. IP_VS_ERR("Error binding address of the mcast interface\n");
  413. goto error;
  414. }
  415. if (sock->ops->connect(sock,
  416. (struct sockaddr*)&mcast_addr,
  417. sizeof(struct sockaddr), 0) < 0) {
  418. IP_VS_ERR("Error connecting to the multicast addr\n");
  419. goto error;
  420. }
  421. return sock;
  422. error:
  423. sock_release(sock);
  424. return NULL;
  425. }
  426. /*
  427. * Set up receiving multicast socket over UDP
  428. */
  429. static struct socket * make_receive_sock(void)
  430. {
  431. struct socket *sock;
  432. /* First create a socket */
  433. if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
  434. IP_VS_ERR("Error during creation of socket; terminating\n");
  435. return NULL;
  436. }
  437. /* it is equivalent to the REUSEADDR option in user-space */
  438. sock->sk->sk_reuse = 1;
  439. if (sock->ops->bind(sock,
  440. (struct sockaddr*)&mcast_addr,
  441. sizeof(struct sockaddr)) < 0) {
  442. IP_VS_ERR("Error binding to the multicast addr\n");
  443. goto error;
  444. }
  445. /* join the multicast group */
  446. if (join_mcast_group(sock->sk,
  447. (struct in_addr*)&mcast_addr.sin_addr,
  448. ip_vs_backup_mcast_ifn) < 0) {
  449. IP_VS_ERR("Error joining to the multicast group\n");
  450. goto error;
  451. }
  452. return sock;
  453. error:
  454. sock_release(sock);
  455. return NULL;
  456. }
  457. static int
  458. ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
  459. {
  460. struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
  461. struct kvec iov;
  462. int len;
  463. EnterFunction(7);
  464. iov.iov_base = (void *)buffer;
  465. iov.iov_len = length;
  466. len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
  467. LeaveFunction(7);
  468. return len;
  469. }
  470. static void
  471. ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
  472. {
  473. int msize;
  474. msize = msg->size;
  475. /* Put size in network byte order */
  476. msg->size = htons(msg->size);
  477. if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
  478. IP_VS_ERR("ip_vs_send_async error\n");
  479. }
  480. static int
  481. ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
  482. {
  483. struct msghdr msg = {NULL,};
  484. struct kvec iov;
  485. int len;
  486. EnterFunction(7);
  487. /* Receive a packet */
  488. iov.iov_base = buffer;
  489. iov.iov_len = (size_t)buflen;
  490. len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
  491. if (len < 0)
  492. return -1;
  493. LeaveFunction(7);
  494. return len;
  495. }
  496. static DECLARE_WAIT_QUEUE_HEAD(sync_wait);
  497. static pid_t sync_master_pid = 0;
  498. static pid_t sync_backup_pid = 0;
  499. static DECLARE_WAIT_QUEUE_HEAD(stop_sync_wait);
  500. static int stop_master_sync = 0;
  501. static int stop_backup_sync = 0;
  502. static void sync_master_loop(void)
  503. {
  504. struct socket *sock;
  505. struct ip_vs_sync_buff *sb;
  506. /* create the sending multicast socket */
  507. sock = make_send_sock();
  508. if (!sock)
  509. return;
  510. IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
  511. "syncid = %d\n",
  512. ip_vs_master_mcast_ifn, ip_vs_master_syncid);
  513. for (;;) {
  514. while ((sb=sb_dequeue())) {
  515. ip_vs_send_sync_msg(sock, sb->mesg);
  516. ip_vs_sync_buff_release(sb);
  517. }
  518. /* check if entries stay in curr_sb for 2 seconds */
  519. if ((sb = get_curr_sync_buff(2*HZ))) {
  520. ip_vs_send_sync_msg(sock, sb->mesg);
  521. ip_vs_sync_buff_release(sb);
  522. }
  523. if (stop_master_sync)
  524. break;
  525. ssleep(1);
  526. }
  527. /* clean up the sync_buff queue */
  528. while ((sb=sb_dequeue())) {
  529. ip_vs_sync_buff_release(sb);
  530. }
  531. /* clean up the current sync_buff */
  532. if ((sb = get_curr_sync_buff(0))) {
  533. ip_vs_sync_buff_release(sb);
  534. }
  535. /* release the sending multicast socket */
  536. sock_release(sock);
  537. }
  538. static void sync_backup_loop(void)
  539. {
  540. struct socket *sock;
  541. char *buf;
  542. int len;
  543. if (!(buf = kmalloc(sync_recv_mesg_maxlen, GFP_ATOMIC))) {
  544. IP_VS_ERR("sync_backup_loop: kmalloc error\n");
  545. return;
  546. }
  547. /* create the receiving multicast socket */
  548. sock = make_receive_sock();
  549. if (!sock)
  550. goto out;
  551. IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
  552. "syncid = %d\n",
  553. ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
  554. for (;;) {
  555. /* do you have data now? */
  556. while (!skb_queue_empty(&(sock->sk->sk_receive_queue))) {
  557. if ((len =
  558. ip_vs_receive(sock, buf,
  559. sync_recv_mesg_maxlen)) <= 0) {
  560. IP_VS_ERR("receiving message error\n");
  561. break;
  562. }
  563. /* disable bottom half, because it accessed the data
  564. shared by softirq while getting/creating conns */
  565. local_bh_disable();
  566. ip_vs_process_message(buf, len);
  567. local_bh_enable();
  568. }
  569. if (stop_backup_sync)
  570. break;
  571. ssleep(1);
  572. }
  573. /* release the sending multicast socket */
  574. sock_release(sock);
  575. out:
  576. kfree(buf);
  577. }
  578. static void set_sync_pid(int sync_state, pid_t sync_pid)
  579. {
  580. if (sync_state == IP_VS_STATE_MASTER)
  581. sync_master_pid = sync_pid;
  582. else if (sync_state == IP_VS_STATE_BACKUP)
  583. sync_backup_pid = sync_pid;
  584. }
  585. static void set_stop_sync(int sync_state, int set)
  586. {
  587. if (sync_state == IP_VS_STATE_MASTER)
  588. stop_master_sync = set;
  589. else if (sync_state == IP_VS_STATE_BACKUP)
  590. stop_backup_sync = set;
  591. else {
  592. stop_master_sync = set;
  593. stop_backup_sync = set;
  594. }
  595. }
  596. static int sync_thread(void *startup)
  597. {
  598. DECLARE_WAITQUEUE(wait, current);
  599. mm_segment_t oldmm;
  600. int state;
  601. const char *name;
  602. /* increase the module use count */
  603. ip_vs_use_count_inc();
  604. if (ip_vs_sync_state & IP_VS_STATE_MASTER && !sync_master_pid) {
  605. state = IP_VS_STATE_MASTER;
  606. name = "ipvs_syncmaster";
  607. } else if (ip_vs_sync_state & IP_VS_STATE_BACKUP && !sync_backup_pid) {
  608. state = IP_VS_STATE_BACKUP;
  609. name = "ipvs_syncbackup";
  610. } else {
  611. IP_VS_BUG();
  612. ip_vs_use_count_dec();
  613. return -EINVAL;
  614. }
  615. daemonize(name);
  616. oldmm = get_fs();
  617. set_fs(KERNEL_DS);
  618. /* Block all signals */
  619. spin_lock_irq(&current->sighand->siglock);
  620. siginitsetinv(&current->blocked, 0);
  621. recalc_sigpending();
  622. spin_unlock_irq(&current->sighand->siglock);
  623. /* set the maximum length of sync message */
  624. set_sync_mesg_maxlen(state);
  625. /* set up multicast address */
  626. mcast_addr.sin_family = AF_INET;
  627. mcast_addr.sin_port = htons(IP_VS_SYNC_PORT);
  628. mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP);
  629. add_wait_queue(&sync_wait, &wait);
  630. set_sync_pid(state, current->pid);
  631. complete((struct completion *)startup);
  632. /* processing master/backup loop here */
  633. if (state == IP_VS_STATE_MASTER)
  634. sync_master_loop();
  635. else if (state == IP_VS_STATE_BACKUP)
  636. sync_backup_loop();
  637. else IP_VS_BUG();
  638. remove_wait_queue(&sync_wait, &wait);
  639. /* thread exits */
  640. set_sync_pid(state, 0);
  641. IP_VS_INFO("sync thread stopped!\n");
  642. set_fs(oldmm);
  643. /* decrease the module use count */
  644. ip_vs_use_count_dec();
  645. set_stop_sync(state, 0);
  646. wake_up(&stop_sync_wait);
  647. return 0;
  648. }
  649. static int fork_sync_thread(void *startup)
  650. {
  651. pid_t pid;
  652. /* fork the sync thread here, then the parent process of the
  653. sync thread is the init process after this thread exits. */
  654. repeat:
  655. if ((pid = kernel_thread(sync_thread, startup, 0)) < 0) {
  656. IP_VS_ERR("could not create sync_thread due to %d... "
  657. "retrying.\n", pid);
  658. ssleep(1);
  659. goto repeat;
  660. }
  661. return 0;
  662. }
  663. int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
  664. {
  665. DECLARE_COMPLETION(startup);
  666. pid_t pid;
  667. if ((state == IP_VS_STATE_MASTER && sync_master_pid) ||
  668. (state == IP_VS_STATE_BACKUP && sync_backup_pid))
  669. return -EEXIST;
  670. IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
  671. IP_VS_DBG(7, "Each ip_vs_sync_conn entry need %Zd bytes\n",
  672. sizeof(struct ip_vs_sync_conn));
  673. ip_vs_sync_state |= state;
  674. if (state == IP_VS_STATE_MASTER) {
  675. strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, sizeof(ip_vs_master_mcast_ifn));
  676. ip_vs_master_syncid = syncid;
  677. } else {
  678. strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, sizeof(ip_vs_backup_mcast_ifn));
  679. ip_vs_backup_syncid = syncid;
  680. }
  681. repeat:
  682. if ((pid = kernel_thread(fork_sync_thread, &startup, 0)) < 0) {
  683. IP_VS_ERR("could not create fork_sync_thread due to %d... "
  684. "retrying.\n", pid);
  685. ssleep(1);
  686. goto repeat;
  687. }
  688. wait_for_completion(&startup);
  689. return 0;
  690. }
  691. int stop_sync_thread(int state)
  692. {
  693. DECLARE_WAITQUEUE(wait, current);
  694. if ((state == IP_VS_STATE_MASTER && !sync_master_pid) ||
  695. (state == IP_VS_STATE_BACKUP && !sync_backup_pid))
  696. return -ESRCH;
  697. IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
  698. IP_VS_INFO("stopping sync thread %d ...\n",
  699. (state == IP_VS_STATE_MASTER) ? sync_master_pid : sync_backup_pid);
  700. __set_current_state(TASK_UNINTERRUPTIBLE);
  701. add_wait_queue(&stop_sync_wait, &wait);
  702. set_stop_sync(state, 1);
  703. ip_vs_sync_state -= state;
  704. wake_up(&sync_wait);
  705. schedule();
  706. __set_current_state(TASK_RUNNING);
  707. remove_wait_queue(&stop_sync_wait, &wait);
  708. /* Note: no need to reap the sync thread, because its parent
  709. process is the init process */
  710. if ((state == IP_VS_STATE_MASTER && stop_master_sync) ||
  711. (state == IP_VS_STATE_BACKUP && stop_backup_sync))
  712. IP_VS_BUG();
  713. return 0;
  714. }