#include <asm/uaccess.h>
#include <trace/events/skb.h>
+#include <linux/highmem.h>
-#include "kmap_skb.h"
-
-static struct kmem_cache *skbuff_head_cache __read_mostly;
+struct kmem_cache *skbuff_head_cache __read_mostly;
static struct kmem_cache *skbuff_fclone_cache __read_mostly;
static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
/**
* build_skb - build a network buffer
* @data: data buffer provided by caller
+ * @frag_size: size of fragment, or 0 if head was kmalloced
*
* Allocate a new &sk_buff. Caller provides space holding head and
* skb_shared_info. @data must have been allocated by kmalloc()
* before giving packet to stack.
* RX rings only contains data buffers, not full skbs.
*/
-struct sk_buff *build_skb(void *data)
+struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
struct skb_shared_info *shinfo;
struct sk_buff *skb;
- unsigned int size;
+ unsigned int size = frag_size ? : ksize(data);
skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
if (!skb)
return NULL;
- size = ksize(data) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
memset(skb, 0, offsetof(struct sk_buff, tail));
skb->truesize = SKB_TRUESIZE(size);
+ skb->head_frag = frag_size != 0;
atomic_set(&skb->users, 1);
skb->head = data;
skb->data = data;
skb_get(list);
}
+static void skb_free_head(struct sk_buff *skb)
+{
+ if (skb->head_frag)
+ put_page(virt_to_head_page(skb->head));
+ else
+ kfree(skb->head);
+}
+
static void skb_release_data(struct sk_buff *skb)
{
if (!skb->cloned ||
if (skb_has_frag_list(skb))
skb_drop_fraglist(skb);
- kfree(skb->head);
+ skb_free_head(skb);
}
}
C(tail);
C(end);
C(head);
+ C(head_frag);
C(data);
C(truesize);
atomic_set(&n->users, 1);
}
return -ENOMEM;
}
- vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
+ vaddr = kmap_atomic(skb_frag_page(f));
memcpy(page_address(page),
vaddr + f->page_offset, skb_frag_size(f));
- kunmap_skb_frag(vaddr);
+ kunmap_atomic(vaddr);
page->private = (unsigned long)head;
head = page;
}
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
int headerlen = skb_headroom(skb);
- unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len;
+ unsigned int size = skb_end_offset(skb) + skb->data_len;
struct sk_buff *n = alloc_skb(size, gfp_mask);
if (!n)
{
int i;
u8 *data;
- int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail;
+ int size = nhead + skb_end_offset(skb) + ntail;
long off;
- bool fastpath;
BUG_ON(nhead < 0);
size = SKB_DATA_ALIGN(size);
- /* Check if we can avoid taking references on fragments if we own
- * the last reference on skb->head. (see skb_release_data())
- */
- if (!skb->cloned)
- fastpath = true;
- else {
- int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1;
- fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta;
- }
-
- if (fastpath &&
- size + sizeof(struct skb_shared_info) <= ksize(skb->head)) {
- memmove(skb->head + size, skb_shinfo(skb),
- offsetof(struct skb_shared_info,
- frags[skb_shinfo(skb)->nr_frags]));
- memmove(skb->head + nhead, skb->head,
- skb_tail_pointer(skb) - skb->head);
- off = nhead;
- goto adjust_others;
- }
-
data = kmalloc(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
gfp_mask);
if (!data)
skb_shinfo(skb),
offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
- if (fastpath) {
- kfree(skb->head);
- } else {
+ /*
+ * if shinfo is shared we must drop the old head gracefully, but if it
+ * is not we can just drop the old head and let the existing refcount
+ * be since all we did is relocate the values
+ */
+ if (skb_cloned(skb)) {
/* copy this zero copy skb frags */
if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
if (skb_copy_ubufs(skb, gfp_mask))
skb_clone_fraglist(skb);
skb_release_data(skb);
+ } else {
+ skb_free_head(skb);
}
off = (data + nhead) - skb->head;
skb->head = data;
-adjust_others:
+ skb->head_frag = 0;
skb->data += off;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
skb->end = size;
return -ENOMEM;
nfrag->next = frag->next;
- kfree_skb(frag);
+ consume_skb(frag);
frag = nfrag;
*fragp = frag;
}
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ skb_frag_t *f = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len);
- end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ end = start + skb_frag_size(f);
if ((copy = end - offset) > 0) {
u8 *vaddr;
if (copy > len)
copy = len;
- vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
+ vaddr = kmap_atomic(skb_frag_page(f));
memcpy(to,
- vaddr + skb_shinfo(skb)->frags[i].page_offset+
- offset - start, copy);
- kunmap_skb_frag(vaddr);
+ vaddr + f->page_offset + offset - start,
+ copy);
+ kunmap_atomic(vaddr);
if ((len -= copy) == 0)
return 0;
put_page(spd->pages[i]);
}
-static inline struct page *linear_to_page(struct page *page, unsigned int *len,
- unsigned int *offset,
- struct sk_buff *skb, struct sock *sk)
+static struct page *linear_to_page(struct page *page, unsigned int *len,
+ unsigned int *offset,
+ struct sk_buff *skb, struct sock *sk)
{
struct page *p = sk->sk_sndmsg_page;
unsigned int off;
} else {
unsigned int mlen;
+ /* If we are the only user of the page, we can reset offset */
+ if (page_count(p) == 1)
+ sk->sk_sndmsg_off = 0;
off = sk->sk_sndmsg_off;
mlen = PAGE_SIZE - off;
if (mlen < 64 && mlen < *len) {
memcpy(page_address(p) + off, page_address(page) + *offset, *len);
sk->sk_sndmsg_off += *len;
*offset = off;
- get_page(p);
return p;
}
+static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
+ struct page *page,
+ unsigned int offset)
+{
+ return spd->nr_pages &&
+ spd->pages[spd->nr_pages - 1] == page &&
+ (spd->partial[spd->nr_pages - 1].offset +
+ spd->partial[spd->nr_pages - 1].len == offset);
+}
+
/*
* Fill page/offset/length into spd, if it can hold more pages.
*/
-static inline int spd_fill_page(struct splice_pipe_desc *spd,
- struct pipe_inode_info *pipe, struct page *page,
- unsigned int *len, unsigned int offset,
- struct sk_buff *skb, int linear,
- struct sock *sk)
+static bool spd_fill_page(struct splice_pipe_desc *spd,
+ struct pipe_inode_info *pipe, struct page *page,
+ unsigned int *len, unsigned int offset,
+ struct sk_buff *skb, bool linear,
+ struct sock *sk)
{
- if (unlikely(spd->nr_pages == pipe->buffers))
- return 1;
+ if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
+ return true;
if (linear) {
page = linear_to_page(page, len, &offset, skb, sk);
if (!page)
- return 1;
- } else
- get_page(page);
-
+ return true;
+ }
+ if (spd_can_coalesce(spd, page, offset)) {
+ spd->partial[spd->nr_pages - 1].len += *len;
+ return false;
+ }
+ get_page(page);
spd->pages[spd->nr_pages] = page;
spd->partial[spd->nr_pages].len = *len;
spd->partial[spd->nr_pages].offset = offset;
spd->nr_pages++;
- return 0;
+ return false;
}
static inline void __segment_seek(struct page **page, unsigned int *poff,
*plen -= off;
}
-static inline int __splice_segment(struct page *page, unsigned int poff,
- unsigned int plen, unsigned int *off,
- unsigned int *len, struct sk_buff *skb,
- struct splice_pipe_desc *spd, int linear,
- struct sock *sk,
- struct pipe_inode_info *pipe)
+static bool __splice_segment(struct page *page, unsigned int poff,
+ unsigned int plen, unsigned int *off,
+ unsigned int *len, struct sk_buff *skb,
+ struct splice_pipe_desc *spd, bool linear,
+ struct sock *sk,
+ struct pipe_inode_info *pipe)
{
if (!*len)
- return 1;
+ return true;
/* skip this segment if already processed */
if (*off >= plen) {
*off -= plen;
- return 0;
+ return false;
}
/* ignore any bits we already processed */
flen = min_t(unsigned int, flen, PAGE_SIZE - poff);
if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk))
- return 1;
+ return true;
__segment_seek(&page, &poff, &plen, flen);
*len -= flen;
} while (*len && plen);
- return 0;
+ return false;
}
/*
- * Map linear and fragment data from the skb to spd. It reports failure if the
+ * Map linear and fragment data from the skb to spd. It reports true if the
* pipe is full or if we already spliced the requested length.
*/
-static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
- unsigned int *offset, unsigned int *len,
- struct splice_pipe_desc *spd, struct sock *sk)
+static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
+ unsigned int *offset, unsigned int *len,
+ struct splice_pipe_desc *spd, struct sock *sk)
{
int seg;
- /*
- * map the linear part
+ /* map the linear part :
+ * If skb->head_frag is set, this 'linear' part is backed by a
+ * fragment, and if the head is not shared with any clones then
+ * we can avoid a copy since we own the head portion of this page.
*/
if (__splice_segment(virt_to_page(skb->data),
(unsigned long) skb->data & (PAGE_SIZE - 1),
skb_headlen(skb),
- offset, len, skb, spd, 1, sk, pipe))
- return 1;
+ offset, len, skb, spd,
+ skb_head_is_locked(skb),
+ sk, pipe))
+ return true;
/*
* then map the fragments
if (__splice_segment(skb_frag_page(f),
f->page_offset, skb_frag_size(f),
- offset, len, skb, spd, 0, sk, pipe))
- return 1;
+ offset, len, skb, spd, false, sk, pipe))
+ return true;
}
- return 0;
+ return false;
}
/*
struct pipe_inode_info *pipe, unsigned int tlen,
unsigned int flags)
{
- struct partial_page partial[PIPE_DEF_BUFFERS];
- struct page *pages[PIPE_DEF_BUFFERS];
+ struct partial_page partial[MAX_SKB_FRAGS];
+ struct page *pages[MAX_SKB_FRAGS];
struct splice_pipe_desc spd = {
.pages = pages,
.partial = partial,
struct sock *sk = skb->sk;
int ret = 0;
- if (splice_grow_spd(pipe, &spd))
- return -ENOMEM;
-
/*
* __skb_splice_bits() only fails if the output has no room left,
* so no point in going over the frag_list for the error case.
lock_sock(sk);
}
- splice_shrink_spd(pipe, &spd);
return ret;
}
if (copy > len)
copy = len;
- vaddr = kmap_skb_frag(frag);
+ vaddr = kmap_atomic(skb_frag_page(frag));
memcpy(vaddr + frag->page_offset + offset - start,
from, copy);
- kunmap_skb_frag(vaddr);
+ kunmap_atomic(vaddr);
if ((len -= copy) == 0)
return 0;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len);
- end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
__wsum csum2;
u8 *vaddr;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
if (copy > len)
copy = len;
- vaddr = kmap_skb_frag(frag);
+ vaddr = kmap_atomic(skb_frag_page(frag));
csum2 = csum_partial(vaddr + frag->page_offset +
offset - start, copy, 0);
- kunmap_skb_frag(vaddr);
+ kunmap_atomic(vaddr);
csum = csum_block_add(csum, csum2, pos);
if (!(len -= copy))
return csum;
if (copy > len)
copy = len;
- vaddr = kmap_skb_frag(frag);
+ vaddr = kmap_atomic(skb_frag_page(frag));
csum2 = csum_partial_copy_nocheck(vaddr +
frag->page_offset +
offset - start, to,
copy, 0);
- kunmap_skb_frag(vaddr);
+ kunmap_atomic(vaddr);
csum = csum_block_add(csum, csum2, pos);
if (!(len -= copy))
return csum;
if (abs_offset < block_limit) {
if (!st->frag_data)
- st->frag_data = kmap_skb_frag(frag);
+ st->frag_data = kmap_atomic(skb_frag_page(frag));
*data = (u8 *) st->frag_data + frag->page_offset +
(abs_offset - st->stepped_offset);
}
if (st->frag_data) {
- kunmap_skb_frag(st->frag_data);
+ kunmap_atomic(st->frag_data);
st->frag_data = NULL;
}
}
if (st->frag_data) {
- kunmap_skb_frag(st->frag_data);
+ kunmap_atomic(st->frag_data);
st->frag_data = NULL;
}
void skb_abort_seq_read(struct skb_seq_state *st)
{
if (st->frag_data)
- kunmap_skb_frag(st->frag_data);
+ kunmap_atomic(st->frag_data);
}
EXPORT_SYMBOL(skb_abort_seq_read);
if (unlikely(!nskb))
goto err;
- hsize = skb_end_pointer(nskb) - nskb->head;
+ hsize = skb_end_offset(nskb);
if (skb_cow_head(nskb, doffset + headroom)) {
kfree_skb(nskb);
goto err;
}
- nskb->truesize += skb_end_pointer(nskb) - nskb->head -
- hsize;
+ nskb->truesize += skb_end_offset(nskb) - hsize;
skb_release_head_state(nskb);
__skb_push(nskb, doffset);
} else {
unsigned int len = skb_gro_len(skb);
unsigned int offset = skb_gro_offset(skb);
unsigned int headlen = skb_headlen(skb);
+ unsigned int delta_truesize;
if (p->len + len >= 65536)
return -E2BIG;
frag->page_offset += offset;
skb_frag_size_sub(frag, offset);
+ /* all fragments truesize : remove (head size + sk_buff) */
+ delta_truesize = skb->truesize -
+ SKB_TRUESIZE(skb_end_offset(skb));
+
skb->truesize -= skb->data_len;
skb->len -= skb->data_len;
skb->data_len = 0;
- NAPI_GRO_CB(skb)->free = 1;
+ NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
+ goto done;
+ } else if (skb->head_frag) {
+ int nr_frags = pinfo->nr_frags;
+ skb_frag_t *frag = pinfo->frags + nr_frags;
+ struct page *page = virt_to_head_page(skb->head);
+ unsigned int first_size = headlen - offset;
+ unsigned int first_offset;
+
+ if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
+ return -E2BIG;
+
+ first_offset = skb->data -
+ (unsigned char *)page_address(page) +
+ offset;
+
+ pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
+
+ frag->page.p = page;
+ frag->page_offset = first_offset;
+ skb_frag_size_set(frag, first_size);
+
+ memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
+ /* We dont need to clear skbinfo->nr_frags here */
+
+ delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
+ NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
goto done;
} else if (skb_gro_len(p) != pinfo->gso_size)
return -E2BIG;
p = nskb;
merge:
- p->truesize += skb->truesize - len;
+ delta_truesize = skb->truesize;
if (offset > headlen) {
unsigned int eat = offset - headlen;
done:
NAPI_GRO_CB(p)->count++;
p->data_len += len;
- p->truesize += len;
+ p->truesize += delta_truesize;
p->len += len;
NAPI_GRO_CB(skb)->same_flow = 1;
int len = skb->len;
if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
- (unsigned)sk->sk_rcvbuf)
+ (unsigned int)sk->sk_rcvbuf)
return -ENOMEM;
skb_orphan(skb);
{
if (unlikely(start > skb_headlen(skb)) ||
unlikely((int)start + off > skb_headlen(skb) - 2)) {
- if (net_ratelimit())
- printk(KERN_WARNING
- "bad partial csum: csum=%u/%u len=%u\n",
- start, off, skb_headlen(skb));
+ net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n",
+ start, off, skb_headlen(skb));
return false;
}
skb->ip_summed = CHECKSUM_PARTIAL;
void __skb_warn_lro_forwarding(const struct sk_buff *skb)
{
- if (net_ratelimit())
- pr_warning("%s: received packets cannot be forwarded"
- " while LRO is enabled\n", skb->dev->name);
+ net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
+ skb->dev->name);
}
EXPORT_SYMBOL(__skb_warn_lro_forwarding);