@@ -50,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1;
 */
 int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
 
+/* Default TSQ limit of two TSO segments */
+int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
+
 /* This limits the percentage of the congestion window which we
 * will allow a single TSO frame to consume. Building TSO frames
 * which are too large can cause TCP streams to be bursty.
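
A note on the hunk above: it only defines the knob and its default value.
Making it writable under /proc/sys/net/ipv4 happens outside this file; below
is a sketch of the usual ctl_table entry (the file and exact placement are
assumptions, not part of this diff):

	{
		.procname	= "tcp_limit_output_bytes",	/* assumed name */
		.data		= &sysctl_tcp_limit_output_bytes,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},

The default of 131072 bytes is 2 * 65536: room for two maximal 64KB TSO
frames, which is what the "two TSO segments" comment refers to.
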
@@ -65,6 +68,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
 EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
 
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+			   int push_one, gfp_t gfp);
 
 /* Account for new data that has been sent to the network. */
 static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
@@ -783,6 +788,140 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 	return size;
 }
 
+
+/* TCP SMALL QUEUES (TSQ)
+ *
+ * TSQ's goal is to keep a small number of skbs per tcp flow in the tx
+ * queues (qdisc + device) to reduce RTT and bufferbloat.
+ * We do this using a special skb destructor (tcp_wfree).
+ *
+ * It's important that tcp_wfree() can be replaced by sock_wfree() in the
+ * event the skb needs to be reallocated in a driver.
+ * The invariant is that skb->truesize is subtracted from sk->sk_wmem_alloc.
+ *
+ * Since transmit from skb destructor is forbidden, we use a tasklet
+ * to process all sockets that eventually need to send more skbs.
+ * We use one tasklet per cpu, with its own queue of sockets.
+ */
+struct tsq_tasklet {
+	struct tasklet_struct	tasklet;
+	struct list_head	head; /* queue of tcp sockets */
+};
+static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
+
+/*
+ * One tasklet per cpu tries to send more skbs.
+ * We run in tasklet context but need to disable irqs when
+ * transferring tsq->head because tcp_wfree() might
+ * interrupt us (non-NAPI drivers)
+ */
+static void tcp_tasklet_func(unsigned long data)
+{
+	struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
+	LIST_HEAD(list);
+	unsigned long flags;
+	struct list_head *q, *n;
+	struct tcp_sock *tp;
+	struct sock *sk;
+
+	local_irq_save(flags);
+	list_splice_init(&tsq->head, &list);
+	local_irq_restore(flags);
+
+	list_for_each_safe(q, n, &list) {
+		tp = list_entry(q, struct tcp_sock, tsq_node);
+		list_del(&tp->tsq_node);
+
+		sk = (struct sock *)tp;
+		bh_lock_sock(sk);
+
+		if (!sock_owned_by_user(sk)) {
+			if ((1 << sk->sk_state) &
+			    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
+			     TCPF_CLOSING | TCPF_CLOSE_WAIT))
+				tcp_write_xmit(sk,
+					       tcp_current_mss(sk),
+					       0, 0,
+					       GFP_ATOMIC);
+		} else {
+			/* defer the work to tcp_release_cb() */
+			set_bit(TSQ_OWNED, &tp->tsq_flags);
+		}
+		bh_unlock_sock(sk);
+
+		clear_bit(TSQ_QUEUED, &tp->tsq_flags);
+		sk_free(sk);
+	}
+}
+
+/**
+ * tcp_release_cb - tcp release_sock() callback
+ * @sk: socket
+ *
+ * Called from release_sock() to perform protocol-dependent
+ * actions before socket release.
+ */
+void tcp_release_cb(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (test_and_clear_bit(TSQ_OWNED, &tp->tsq_flags)) {
+		if ((1 << sk->sk_state) &
+		    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
+		     TCPF_CLOSING | TCPF_CLOSE_WAIT))
+			tcp_write_xmit(sk,
+				       tcp_current_mss(sk),
+				       0, 0,
+				       GFP_ATOMIC);
+	}
+}
+EXPORT_SYMBOL(tcp_release_cb);
+
+void __init tcp_tasklet_init(void)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
+
+		INIT_LIST_HEAD(&tsq->head);
+		tasklet_init(&tsq->tasklet,
+			     tcp_tasklet_func,
+			     (unsigned long)tsq);
+	}
+}
+
+/*
+ * Write buffer destructor automatically called from kfree_skb.
+ * We can't xmit new skbs from this context, as we might already
+ * hold the qdisc lock.
+ */
+void tcp_wfree(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
+	    !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
+		unsigned long flags;
+		struct tsq_tasklet *tsq;
+
+		/* Keep a ref on socket.
+		 * This last ref will be released in tcp_tasklet_func()
+		 */
+		atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);
+
+		/* queue this socket to tasklet queue */
+		local_irq_save(flags);
+		tsq = &__get_cpu_var(tsq_tasklet);
+		list_add(&tp->tsq_node, &tsq->head);
+		tasklet_schedule(&tsq->tasklet);
+		local_irq_restore(flags);
+	} else {
+		sock_wfree(skb);
+	}
+}
+
 /* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg(). This is used by both the initial
 * transmission and possible later retransmissions.
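
The tsq_node list anchor, the tsq_flags word and the TSQ_* bit numbers used
by the functions above live in struct tcp_sock, and release_sock() has to be
taught to call tcp_release_cb(); both belong to companion hunks outside this
file. A rough sketch of those declarations, with names inferred from the code
above and placement assumed:

	/* assumed to land in include/linux/tcp.h */
	enum tsq_flags {
		TSQ_THROTTLED,	/* tcp_write_xmit() hit sysctl_tcp_limit_output_bytes */
		TSQ_QUEUED,	/* socket already sits on a tsq_tasklet queue */
		TSQ_OWNED,	/* socket was owned by user: defer to tcp_release_cb() */
	};

	struct tcp_sock {
		/* ... existing fields ... */
		unsigned long	tsq_flags;
		struct list_head tsq_node;	/* anchor in tsq_tasklet.head */
		/* ... */
	};

release_sock() would then invoke tcp_release_cb(), presumably through a new
per-protocol callback, before handing the locked socket back to its owner.
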
@@ -844,7 +983,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
-	skb_set_owner_w(skb, sk);
+
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
+			  tcp_wfree : sock_wfree;
+	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
 	/* Build TCP header and checksum it. */
 	th = tcp_hdr(skb);
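
The open-coded block above replaces a single skb_set_owner_w() call so that
the destructor can be picked at transmit time. For comparison, a sketch of
what skb_set_owner_w() does (quoted from memory of include/net/sock.h, so
treat the details as assumptions):

	static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
	{
		skb_orphan(skb);		/* drop any previous owner */
		skb->sk = sk;
		skb->destructor = sock_wfree;	/* hard-wired destructor */
		atomic_add(skb->truesize, &sk->sk_wmem_alloc);
	}

When sysctl_tcp_limit_output_bytes is zero or negative, the new code picks
sock_wfree and the assignment is equivalent to the original call.
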
@@ -1780,6 +1924,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
+
 		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 		BUG_ON(!tso_segs);
 
@@ -1800,6 +1945,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			break;
 		}
 
+		/* TSQ : sk_wmem_alloc accounts skb truesize,
+		 * including skb overhead. But that's OK.
+		 */
+		if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
+			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+			break;
+		}
 		limit = mss_now;
 		if (tso_segs > 1 && !tcp_urg_mode(tp))
 			limit = tcp_mss_split_point(sk, skb, mss_now,
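
Putting the pieces together, a worked trace of the throttle/unthrottle loop.
The truesize figure is illustrative (real values depend on allocation
overhead), and sk_wmem_alloc starts at 1 for a fresh socket; note the check
above runs before each transmit, while tcp_transmit_skb() adds the truesize:

	default limit 131072; 64KB TSO skbs, truesize 66560 each (illustrative)

	send skb1:  wmem_alloc      1 < 131072 -> transmit, wmem_alloc = 66561
	send skb2:  wmem_alloc  66561 < 131072 -> transmit, wmem_alloc = 133121
	send skb3:  wmem_alloc 133121 >= 131072 -> set TSQ_THROTTLED and break
	free skb1:  tcp_wfree() clears THROTTLED, sets QUEUED, subtracts
	            truesize - 1 (the last unit keeps the socket referenced),
	            queues the socket on this cpu's list, schedules the tasklet
	tasklet:    tcp_write_xmit() resumes the flow (or tcp_release_cb() does
	            if userspace holds the lock); sk_free() drops the kept unit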