|
@@ -100,6 +100,8 @@ do { \
|
|
|
} while (0)
|
|
|
#endif
|
|
|
|
|
|
+#define GOODCOPY_LEN 128
|
|
|
+
|
|
|
#define FLT_EXACT_COUNT 8
|
|
|
struct tap_filter {
|
|
|
unsigned int count; /* Number of addrs. Zero means disabled */
|
|
@@ -604,19 +606,100 @@ static struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
|
|
|
return skb;
|
|
|
}
|
|
|
|
|
|
+/* set skb frags from iovec, this can move to core network code for reuse */
|
|
|
+static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
|
|
|
+ int offset, size_t count)
|
|
|
+{
|
|
|
+ int len = iov_length(from, count) - offset;
|
|
|
+ int copy = skb_headlen(skb);
|
|
|
+ int size, offset1 = 0;
|
|
|
+ int i = 0;
|
|
|
+
|
|
|
+ /* Skip over from offset */
|
|
|
+ while (count && (offset >= from->iov_len)) {
|
|
|
+ offset -= from->iov_len;
|
|
|
+ ++from;
|
|
|
+ --count;
|
|
|
+ }
|
|
|
+
|
|
|
+ /* copy up to skb headlen */
|
|
|
+ while (count && (copy > 0)) {
|
|
|
+ size = min_t(unsigned int, copy, from->iov_len - offset);
|
|
|
+ if (copy_from_user(skb->data + offset1, from->iov_base + offset,
|
|
|
+ size))
|
|
|
+ return -EFAULT;
|
|
|
+ if (copy > size) {
|
|
|
+ ++from;
|
|
|
+ --count;
|
|
|
+ offset = 0;
|
|
|
+ } else
|
|
|
+ offset += size;
|
|
|
+ copy -= size;
|
|
|
+ offset1 += size;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (len == offset1)
|
|
|
+ return 0;
|
|
|
+
|
|
|
+ while (count--) {
|
|
|
+ struct page *page[MAX_SKB_FRAGS];
|
|
|
+ int num_pages;
|
|
|
+ unsigned long base;
|
|
|
+ unsigned long truesize;
|
|
|
+
|
|
|
+ len = from->iov_len - offset;
|
|
|
+ if (!len) {
|
|
|
+ offset = 0;
|
|
|
+ ++from;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ base = (unsigned long)from->iov_base + offset;
|
|
|
+ size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
|
|
|
+ if (i + size > MAX_SKB_FRAGS)
|
|
|
+ return -EMSGSIZE;
|
|
|
+ num_pages = get_user_pages_fast(base, size, 0, &page[i]);
|
|
|
+ if (num_pages != size) {
|
|
|
+ for (i = 0; i < num_pages; i++)
|
|
|
+ put_page(page[i]);
|
|
|
+ return -EFAULT;
|
|
|
+ }
|
|
|
+ truesize = size * PAGE_SIZE;
|
|
|
+ skb->data_len += len;
|
|
|
+ skb->len += len;
|
|
|
+ skb->truesize += truesize;
|
|
|
+ atomic_add(truesize, &skb->sk->sk_wmem_alloc);
|
|
|
+ while (len) {
|
|
|
+ int off = base & ~PAGE_MASK;
|
|
|
+ int size = min_t(int, len, PAGE_SIZE - off);
|
|
|
+ __skb_fill_page_desc(skb, i, page[i], off, size);
|
|
|
+ skb_shinfo(skb)->nr_frags++;
|
|
|
+ /* increase sk_wmem_alloc */
|
|
|
+ base += size;
|
|
|
+ len -= size;
|
|
|
+ i++;
|
|
|
+ }
|
|
|
+ offset = 0;
|
|
|
+ ++from;
|
|
|
+ }
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
/* Get packet from user space buffer */
|
|
|
-static ssize_t tun_get_user(struct tun_struct *tun,
|
|
|
- const struct iovec *iv, size_t count,
|
|
|
- int noblock)
|
|
|
+static ssize_t tun_get_user(struct tun_struct *tun, void *msg_control,
|
|
|
+ const struct iovec *iv, size_t total_len,
|
|
|
+ size_t count, int noblock)
|
|
|
{
|
|
|
struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
|
|
|
struct sk_buff *skb;
|
|
|
- size_t len = count, align = NET_SKB_PAD;
|
|
|
+ size_t len = total_len, align = NET_SKB_PAD;
|
|
|
struct virtio_net_hdr gso = { 0 };
|
|
|
int offset = 0;
|
|
|
+ int copylen;
|
|
|
+ bool zerocopy = false;
|
|
|
+ int err;
|
|
|
|
|
|
if (!(tun->flags & TUN_NO_PI)) {
|
|
|
- if ((len -= sizeof(pi)) > count)
|
|
|
+ if ((len -= sizeof(pi)) > total_len)
|
|
|
return -EINVAL;
|
|
|
|
|
|
if (memcpy_fromiovecend((void *)&pi, iv, 0, sizeof(pi)))
|
|
@@ -625,7 +708,7 @@ static ssize_t tun_get_user(struct tun_struct *tun,
|
|
|
}
|
|
|
|
|
|
if (tun->flags & TUN_VNET_HDR) {
|
|
|
- if ((len -= tun->vnet_hdr_sz) > count)
|
|
|
+ if ((len -= tun->vnet_hdr_sz) > total_len)
|
|
|
return -EINVAL;
|
|
|
|
|
|
if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso)))
|
|
@@ -647,14 +730,46 @@ static ssize_t tun_get_user(struct tun_struct *tun,
|
|
|
return -EINVAL;
|
|
|
}
|
|
|
|
|
|
- skb = tun_alloc_skb(tun, align, len, gso.hdr_len, noblock);
|
|
|
+ if (msg_control)
|
|
|
+ zerocopy = true;
|
|
|
+
|
|
|
+ if (zerocopy) {
|
|
|
+ /* Userspace may produce vectors with count greater than
|
|
|
+ * MAX_SKB_FRAGS, so we need to linearize parts of the skb
|
|
|
+ * to let the rest of data to be fit in the frags.
|
|
|
+ */
|
|
|
+ if (count > MAX_SKB_FRAGS) {
|
|
|
+ copylen = iov_length(iv, count - MAX_SKB_FRAGS);
|
|
|
+ if (copylen < offset)
|
|
|
+ copylen = 0;
|
|
|
+ else
|
|
|
+ copylen -= offset;
|
|
|
+ } else
|
|
|
+ copylen = 0;
|
|
|
+ /* There are 256 bytes to be copied in skb, so there is enough
|
|
|
+ * room for skb expand head in case it is used.
|
|
|
+ * The rest of the buffer is mapped from userspace.
|
|
|
+ */
|
|
|
+ if (copylen < gso.hdr_len)
|
|
|
+ copylen = gso.hdr_len;
|
|
|
+ if (!copylen)
|
|
|
+ copylen = GOODCOPY_LEN;
|
|
|
+ } else
|
|
|
+ copylen = len;
|
|
|
+
|
|
|
+ skb = tun_alloc_skb(tun, align, copylen, gso.hdr_len, noblock);
|
|
|
if (IS_ERR(skb)) {
|
|
|
if (PTR_ERR(skb) != -EAGAIN)
|
|
|
tun->dev->stats.rx_dropped++;
|
|
|
return PTR_ERR(skb);
|
|
|
}
|
|
|
|
|
|
- if (skb_copy_datagram_from_iovec(skb, 0, iv, offset, len)) {
|
|
|
+ if (zerocopy)
|
|
|
+ err = zerocopy_sg_from_iovec(skb, iv, offset, count);
|
|
|
+ else
|
|
|
+ err = skb_copy_datagram_from_iovec(skb, 0, iv, offset, len);
|
|
|
+
|
|
|
+ if (err) {
|
|
|
tun->dev->stats.rx_dropped++;
|
|
|
kfree_skb(skb);
|
|
|
return -EFAULT;
|
|
@@ -728,12 +843,18 @@ static ssize_t tun_get_user(struct tun_struct *tun,
|
|
|
skb_shinfo(skb)->gso_segs = 0;
|
|
|
}
|
|
|
|
|
|
+ /* copy skb_ubuf_info for callback when skb has no error */
|
|
|
+ if (zerocopy) {
|
|
|
+ skb_shinfo(skb)->destructor_arg = msg_control;
|
|
|
+ skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
|
|
|
+ }
|
|
|
+
|
|
|
netif_rx_ni(skb);
|
|
|
|
|
|
tun->dev->stats.rx_packets++;
|
|
|
tun->dev->stats.rx_bytes += len;
|
|
|
|
|
|
- return count;
|
|
|
+ return total_len;
|
|
|
}
|
|
|
|
|
|
static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
|
|
@@ -748,7 +869,7 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
|
|
|
|
|
|
tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count);
|
|
|
|
|
|
- result = tun_get_user(tun, iv, iov_length(iv, count),
|
|
|
+ result = tun_get_user(tun, NULL, iv, iov_length(iv, count), count,
|
|
|
file->f_flags & O_NONBLOCK);
|
|
|
|
|
|
tun_put(tun);
|
|
@@ -962,8 +1083,8 @@ static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
|
|
|
struct msghdr *m, size_t total_len)
|
|
|
{
|
|
|
struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
|
|
|
- return tun_get_user(tun, m->msg_iov, total_len,
|
|
|
- m->msg_flags & MSG_DONTWAIT);
|
|
|
+ return tun_get_user(tun, m->msg_control, m->msg_iov, total_len,
|
|
|
+ m->msg_iovlen, m->msg_flags & MSG_DONTWAIT);
|
|
|
}
|
|
|
|
|
|
static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
|
|
@@ -1133,6 +1254,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
|
|
|
sock_init_data(&tun->socket, sk);
|
|
|
sk->sk_write_space = tun_sock_write_space;
|
|
|
sk->sk_sndbuf = INT_MAX;
|
|
|
+ sock_set_flag(sk, SOCK_ZEROCOPY);
|
|
|
|
|
|
tun_sk(sk)->tun = tun;
|
|
|
|