// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *      Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *      Authors:        Alan Cox <alan@lxorguk.ukuu.org.uk>
 *                      Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *      Fixes:
 *              Alan Cox        :       Fixed the worst of the load
 *                                      balancer bugs.
 *              Dave Platt      :       Interrupt stacking fix.
 *              Richard Kooijman:       Timestamp fixes.
 *              Alan Cox        :       Changed buffer format.
 *              Alan Cox        :       destructor hook for AF_UNIX etc.
 *              Linus Torvalds  :       Better skb_clone.
 *              Alan Cox        :       Added skb_copy.
 *              Alan Cox        :       Added all the changed routines Linus
 *                                      only put in the headers
 *              Ray VanTassle   :       Fixed --skb->lock in free
 *              Alan Cox        :       skb_copy copy arp field
 *              Andi Kleen      :       slabified it.
 *              Robert Olsson   :       Removed skb_head_pool
 *
 *      NOTE:
 *              The __skb_ routines should be called with interrupts
 *      disabled, or you better be *real* sure that the operation is atomic
 *      with respect to whatever list is being frobbed (e.g. via lock_sock()
 *      or via disabling bottom half handlers, etc).
 */

/*
 *      The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/kcov.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/page_pool.h>

#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>

#include "datagram.h"
#include "sock_destructor.h"

struct kmem_cache *skbuff_head_cache __ro_after_init;
static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);

/**
 *      skb_panic - private function for out-of-line support
 *      @skb:   buffer
 *      @sz:    size
 *      @addr:  address
 *      @msg:   skb_over_panic or skb_under_panic
 *
 *      Out-of-line support for skb_put() and skb_push().
 *      Called via the wrapper skb_over_panic() or skb_under_panic().
 *      Keep out of line to prevent kernel bloat.
 *      __builtin_return_address is not used because it is not always reliable.
 */
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
                      const char msg[])
{
        pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
                 msg, addr, skb->len, sz, skb->head, skb->data,
                 (unsigned long)skb->tail, (unsigned long)skb->end,
                 skb->dev ? skb->dev->name : "<NULL>");
        BUG();
}

static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
        skb_panic(skb, sz, addr, __func__);
}

static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
        skb_panic(skb, sz, addr, __func__);
}

#define NAPI_SKB_CACHE_SIZE     64
#define NAPI_SKB_CACHE_BULK     16
#define NAPI_SKB_CACHE_HALF     (NAPI_SKB_CACHE_SIZE / 2)

struct napi_alloc_cache {
        struct page_frag_cache page;
        unsigned int skb_count;
        void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);

static void *__alloc_frag_align(unsigned int fragsz, gfp_t gfp_mask,
                                unsigned int align_mask)
{
        struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

        return page_frag_alloc_align(&nc->page, fragsz, gfp_mask, align_mask);
}

void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
        fragsz = SKB_DATA_ALIGN(fragsz);

        return __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask);
}
EXPORT_SYMBOL(__napi_alloc_frag_align);

void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
        struct page_frag_cache *nc;
        void *data;

        fragsz = SKB_DATA_ALIGN(fragsz);
        if (in_hardirq() || irqs_disabled()) {
                nc = this_cpu_ptr(&netdev_alloc_cache);
                data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
        } else {
                local_bh_disable();
                data = __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask);
                local_bh_enable();
        }
        return data;
}
EXPORT_SYMBOL(__netdev_alloc_frag_align);
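
/* Example (illustrative sketch, not part of this file): a driver that needs a
 * small, cache-aligned buffer outside of an sk_buff can use the per-CPU frag
 * caches above through the netdev_alloc_frag() wrapper from <linux/skbuff.h>.
 * The "rx_desc_len" size below is hypothetical.
 *
 *      void *buf = netdev_alloc_frag(rx_desc_len);
 *
 *      if (!buf)
 *              return -ENOMEM;
 *      // ... fill and use the buffer ...
 *      skb_free_frag(buf);     // returns the page fragment reference
 */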

static struct sk_buff *napi_skb_cache_get(void)
{
        struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
        struct sk_buff *skb;

        if (unlikely(!nc->skb_count))
                nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
                                                      GFP_ATOMIC,
                                                      NAPI_SKB_CACHE_BULK,
                                                      nc->skb_cache);
        if (unlikely(!nc->skb_count))
                return NULL;

        skb = nc->skb_cache[--nc->skb_count];
        kasan_unpoison_object_data(skbuff_head_cache, skb);

        return skb;
}

/* Caller must provide SKB that is memset cleared */
static void __build_skb_around(struct sk_buff *skb, void *data,
                               unsigned int frag_size)
{
        struct skb_shared_info *shinfo;
        unsigned int size = frag_size ? : ksize(data);

        size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

        /* Assumes caller memset cleared SKB */
        skb->truesize = SKB_TRUESIZE(size);
        refcount_set(&skb->users, 1);
        skb->head = data;
        skb->data = data;
        skb_reset_tail_pointer(skb);
        skb_set_end_offset(skb, size);
        skb->mac_header = (typeof(skb->mac_header))~0U;
        skb->transport_header = (typeof(skb->transport_header))~0U;

        /* make sure we initialize shinfo sequentially */
        shinfo = skb_shinfo(skb);
        memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
        atomic_set(&shinfo->dataref, 1);

        skb_set_kcov_handle(skb, kcov_common_handle());
}

/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 *
 * Allocate a new &sk_buff. Caller provides space holding head and
 * skb_shared_info. @data must have been allocated by kmalloc() only if
 * @frag_size is 0, otherwise data should come from the page allocator
 * or vmalloc().
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes:
 * Before IO, the driver allocates only the data buffer where the NIC puts
 * the incoming frame. The driver should add room at the head (NET_SKB_PAD)
 * and MUST add room at the tail (SKB_DATA_ALIGN(skb_shared_info)).
 * After IO, the driver calls build_skb() to allocate the sk_buff and
 * populate it before giving the packet to the stack.
 * RX rings only contain data buffers, not full skbs.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *skb;

        skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
        if (unlikely(!skb))
                return NULL;

        memset(skb, 0, offsetof(struct sk_buff, tail));
        __build_skb_around(skb, data, frag_size);

        return skb;
}

/* build_skb() is a wrapper over __build_skb() that specifically
 * takes care of skb->head and skb->pfmemalloc.
 * This means that if @frag_size is not zero, then @data must be backed
 * by a page fragment, not kmalloc() or vmalloc().
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *skb = __build_skb(data, frag_size);

        if (skb && frag_size) {
                skb->head_frag = 1;
                if (page_is_pfmemalloc(virt_to_head_page(data)))
                        skb->pfmemalloc = 1;
        }
        return skb;
}
EXPORT_SYMBOL(build_skb);
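
/* Example (illustrative sketch of the RX flow described above; the function
 * and parameter names are hypothetical, not a real driver API):
 *
 *      static void example_rx_one(struct napi_struct *napi, void *buf,
 *                                 unsigned int buf_size, unsigned int frame_len)
 *      {
 *              struct sk_buff *skb;
 *
 *              // buf was sized at refill time to hold NET_SKB_PAD of headroom,
 *              // the frame itself, and SKB_DATA_ALIGN(sizeof(struct
 *              // skb_shared_info)) of tailroom, and came from a page fragment
 *              skb = build_skb(buf, buf_size);
 *              if (unlikely(!skb))
 *                      return;         // on failure, buf is not freed for us
 *              skb_reserve(skb, NET_SKB_PAD);
 *              skb_put(skb, frame_len);
 *              skb->protocol = eth_type_trans(skb, napi->dev);
 *              napi_gro_receive(napi, skb);
 *      }
 */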

/**
 * build_skb_around - build a network buffer around provided skb
 * @skb: sk_buff provided by caller, must be memset cleared
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 */
struct sk_buff *build_skb_around(struct sk_buff *skb,
                                 void *data, unsigned int frag_size)
{
        if (unlikely(!skb))
                return NULL;

        __build_skb_around(skb, data, frag_size);

        if (frag_size) {
                skb->head_frag = 1;
                if (page_is_pfmemalloc(virt_to_head_page(data)))
                        skb->pfmemalloc = 1;
        }
        return skb;
}
EXPORT_SYMBOL(build_skb_around);

/**
 * __napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 *
 * Version of __build_skb() that uses NAPI percpu caches to obtain
 * skbuff_head instead of inplace allocation.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *skb;

        skb = napi_skb_cache_get();
        if (unlikely(!skb))
                return NULL;

        memset(skb, 0, offsetof(struct sk_buff, tail));
        __build_skb_around(skb, data, frag_size);

        return skb;
}

/**
 * napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 *
 * Version of __napi_build_skb() that takes care of skb->head_frag
 * and skb->pfmemalloc when the data is a page or page fragment.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *skb = __napi_build_skb(data, frag_size);

        if (likely(skb) && frag_size) {
                skb->head_frag = 1;
                skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
        }

        return skb;
}
EXPORT_SYMBOL(napi_build_skb);
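
/* Example (illustrative sketch, hypothetical names): from softirq/NAPI
 * context, napi_build_skb() is the preferred counterpart of build_skb(),
 * since the skbuff_head comes from the per-CPU NAPI cache:
 *
 *      skb = napi_build_skb(buf, buf_size);
 *      if (unlikely(!skb))
 *              return;
 *      skb_reserve(skb, headroom);
 *      skb_put(skb, frame_len);
 */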

/*
 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 * the caller if emergency pfmemalloc reserves are being used. If it is and
 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 * may be used. Otherwise, the packet data may be discarded until enough
 * memory is free.
 */
static void *kmalloc_reserve(size_t size, gfp_t flags, int node,
                             bool *pfmemalloc)
{
        void *obj;
        bool ret_pfmemalloc = false;

        /*
         * Try a regular allocation; if that fails and we're not entitled
         * to the reserves, fail.
         */
        obj = kmalloc_node_track_caller(size,
                                        flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
                                        node);
        if (obj || !(gfp_pfmemalloc_allowed(flags)))
                goto out;

        /* Try again, but now we are using pfmemalloc reserves */
        ret_pfmemalloc = true;
        obj = kmalloc_node_track_caller(size, flags, node);

out:
        if (pfmemalloc)
                *pfmemalloc = ret_pfmemalloc;

        return obj;
}

/* Allocate a new skbuff. We do this ourselves so we can fill in a few
 * 'private' fields and also do memory statistics to find all the
 * [BEEP] leaks.
 */

/**
 * __alloc_skb - allocate a network buffer
 * @size: size to allocate
 * @gfp_mask: allocation mask
 * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 *      instead of head cache and allocate a cloned (child) skb.
 *      If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 *      allocations in case the data is required for writeback
 * @node: numa node to allocate memory on
 *
 * Allocate a new &sk_buff. The returned buffer has no headroom and a
 * tail room of at least size bytes. The object has a reference count
 * of one. The return is the buffer. On a failure the return is %NULL.
 *
 * Buffers may only be allocated from interrupts using a @gfp_mask of
 * %GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
                            int flags, int node)
{
        struct kmem_cache *cache;
        struct sk_buff *skb;
        u8 *data;
        bool pfmemalloc;

        cache = (flags & SKB_ALLOC_FCLONE)
                ? skbuff_fclone_cache : skbuff_head_cache;

        if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
                gfp_mask |= __GFP_MEMALLOC;

        /* Get the HEAD */
        if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
            likely(node == NUMA_NO_NODE || node == numa_mem_id()))
                skb = napi_skb_cache_get();
        else
                skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
        if (unlikely(!skb))
                return NULL;
        prefetchw(skb);

        /* We do our best to align skb_shared_info on a separate cache
         * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
         * aligned memory blocks, unless SLUB/SLAB debug is enabled.
         * Both skb->head and skb_shared_info are cache line aligned.
         */
        size = SKB_DATA_ALIGN(size);
        size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
        data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
        if (unlikely(!data))
                goto nodata;
        /* kmalloc(size) might give us more room than requested.
         * Put skb_shared_info exactly at the end of allocated zone,
         * to allow max possible filling before reallocation.
         */
        size = SKB_WITH_OVERHEAD(ksize(data));
        prefetchw(data + size);

        /*
         * Only clear those fields we need to clear, not those that we will
         * actually initialise below. Hence, don't put any more fields after
         * the tail pointer in struct sk_buff!
         */
        memset(skb, 0, offsetof(struct sk_buff, tail));
        __build_skb_around(skb, data, 0);
        skb->pfmemalloc = pfmemalloc;

        if (flags & SKB_ALLOC_FCLONE) {
                struct sk_buff_fclones *fclones;

                fclones = container_of(skb, struct sk_buff_fclones, skb1);

                skb->fclone = SKB_FCLONE_ORIG;
                refcount_set(&fclones->fclone_ref, 1);

                fclones->skb2.fclone = SKB_FCLONE_CLONE;
        }

        return skb;

nodata:
        kmem_cache_free(cache, skb);
        return NULL;
}
EXPORT_SYMBOL(__alloc_skb);
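
/* Example (illustrative sketch, hypothetical sizes and names): most callers
 * use the alloc_skb() wrapper from <linux/skbuff.h>, which calls
 * __alloc_skb() with no special flags, then carve headroom and tailroom out
 * of the requested size:
 *
 *      struct sk_buff *skb = alloc_skb(hdr_len + payload_len, GFP_KERNEL);
 *
 *      if (!skb)
 *              return -ENOMEM;
 *      skb_reserve(skb, hdr_len);                      // headroom for headers
 *      skb_put_data(skb, payload, payload_len);        // fills the tail room
 */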

/**
 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
 * @dev: network device to receive on
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 * Allocate a new &sk_buff and assign it a usage count of one. The
 * buffer has NET_SKB_PAD headroom built in. Users should allocate
 * the headroom they think they need without accounting for the
 * built in space. The built in space is used for optimisations.
 *
 * %NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
                                   gfp_t gfp_mask)
{
        struct page_frag_cache *nc;
        struct sk_buff *skb;
        bool pfmemalloc;
        void *data;

        len += NET_SKB_PAD;

        /* If requested length is either too small or too big,
         * we use kmalloc() for skb->head allocation.
         */
        if (len <= SKB_WITH_OVERHEAD(1024) ||
            len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
            (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
                skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
                if (!skb)
                        goto skb_fail;
                goto skb_success;
        }

        len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
        len = SKB_DATA_ALIGN(len);

        if (sk_memalloc_socks())
                gfp_mask |= __GFP_MEMALLOC;

        if (in_hardirq() || irqs_disabled()) {
                nc = this_cpu_ptr(&netdev_alloc_cache);
                data = page_frag_alloc(nc, len, gfp_mask);
                pfmemalloc = nc->pfmemalloc;
        } else {
                local_bh_disable();
                nc = this_cpu_ptr(&napi_alloc_cache.page);
                data = page_frag_alloc(nc, len, gfp_mask);
                pfmemalloc = nc->pfmemalloc;
                local_bh_enable();
        }

        if (unlikely(!data))
                return NULL;

        skb = __build_skb(data, len);
        if (unlikely(!skb)) {
                skb_free_frag(data);
                return NULL;
        }

        if (pfmemalloc)
                skb->pfmemalloc = 1;
        skb->head_frag = 1;

skb_success:
        skb_reserve(skb, NET_SKB_PAD);
        skb->dev = dev;

skb_fail:
        return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);
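
/* Example (illustrative sketch, hypothetical "pkt_len"/"rx_buf"): drivers
 * usually call the netdev_alloc_skb() wrapper, which passes GFP_ATOMIC; the
 * NET_SKB_PAD headroom is already reserved when it returns.
 *
 *      struct sk_buff *skb = netdev_alloc_skb(dev, pkt_len + NET_IP_ALIGN);
 *
 *      if (unlikely(!skb))
 *              return NULL;
 *      skb_reserve(skb, NET_IP_ALIGN);         // align the IP header
 *      skb_put_data(skb, rx_buf, pkt_len);
 *      skb->protocol = eth_type_trans(skb, dev);
 */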

/**
 * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 * @napi: napi instance this buffer was allocated for
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
 *
 * Allocate a new sk_buff for use in NAPI receive. This buffer will
 * attempt to allocate the head from a special reserved region used
 * only for NAPI Rx allocation. By doing this we can save several
 * CPU cycles by avoiding having to disable and re-enable IRQs.
 *
 * %NULL is returned if there is no free memory.
 */
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
                                 gfp_t gfp_mask)
{
        struct napi_alloc_cache *nc;
        struct sk_buff *skb;
        void *data;

        len += NET_SKB_PAD + NET_IP_ALIGN;

        /* If requested length is either too small or too big,
         * we use kmalloc() for skb->head allocation.
         */
        if (len <= SKB_WITH_OVERHEAD(1024) ||
            len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
            (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
                skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
                                  NUMA_NO_NODE);
                if (!skb)
                        goto skb_fail;
                goto skb_success;
        }

        nc = this_cpu_ptr(&napi_alloc_cache);
        len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
        len = SKB_DATA_ALIGN(len);

        if (sk_memalloc_socks())
                gfp_mask |= __GFP_MEMALLOC;

        data = page_frag_alloc(&nc->page, len, gfp_mask);
        if (unlikely(!data))
                return NULL;

        skb = __napi_build_skb(data, len);
        if (unlikely(!skb)) {
                skb_free_frag(data);
                return NULL;
        }

        if (nc->page.pfmemalloc)
                skb->pfmemalloc = 1;
        skb->head_frag = 1;

skb_success:
        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
        skb->dev = napi->dev;

skb_fail:
        return skb;
}
EXPORT_SYMBOL(__napi_alloc_skb);
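
/* Example (illustrative sketch, hypothetical names): small-packet copybreak
 * in a NAPI poll handler typically uses the napi_alloc_skb() wrapper, which
 * passes GFP_ATOMIC and reserves NET_SKB_PAD + NET_IP_ALIGN of headroom:
 *
 *      skb = napi_alloc_skb(napi, pkt_len);
 *      if (unlikely(!skb))
 *              break;
 *      skb_put_data(skb, rx_buf, pkt_len);
 *      skb->protocol = eth_type_trans(skb, napi->dev);
 *      napi_gro_receive(napi, skb);
 */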

void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
                     int size, unsigned int truesize)
{
        skb_fill_page_desc(skb, i, page, off, size);
        skb->len += size;
        skb->data_len += size;
        skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_add_rx_frag);

void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
                          unsigned int truesize)
{
        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

        skb_frag_size_add(frag, size);
        skb->len += size;
        skb->data_len += size;
        skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_coalesce_rx_frag);
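
/* Example (illustrative sketch, hypothetical "page"/"offset"/"len" and buffer
 * size): a multi-buffer RX path can attach page fragments to a header skb
 * instead of copying; the truesize argument should reflect the full buffer
 * the fragment was carved from, not just the fragment length:
 *
 *      skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset, len,
 *                      rx_buf_size);
 */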

static void skb_drop_list(struct sk_buff **listp)
{
        kfree_skb_list(*listp);
        *listp = NULL;
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
        skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
        struct sk_buff *list;

        skb_walk_frags(skb, list)
                skb_get(list);
}

static void skb_free_head(struct sk_buff *skb)
{
        unsigned char *head = skb->head;

        if (skb->head_frag) {
                if (skb_pp_recycle(skb, head))
                        return;
                skb_free_frag(head);
        } else {
                kfree(head);
        }
}

static void skb_release_data(struct sk_buff *skb)
{
        struct skb_shared_info *shinfo = skb_shinfo(skb);
        int i;

        if (skb->cloned &&
            atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
                              &shinfo->dataref))
                goto exit;

        skb_zcopy_clear(skb, true);

        for (i = 0; i < shinfo->nr_frags; i++)
                __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);

        if (shinfo->frag_list)
                kfree_skb_list(shinfo->frag_list);

        skb_free_head(skb);
exit:
        /* When we clone an SKB we copy the recycling bit. The pp_recycle
         * bit is only set on the head though, so in order to avoid races
         * while trying to recycle fragments on __skb_frag_unref() we need
         * to make one SKB responsible for triggering the recycle path.
         * So disable the recycling bit if an SKB is cloned and we have
         * additional references to the fragmented part of the SKB.
         * Eventually the last SKB will have the recycling bit set and its
         * dataref set to 0, which will trigger the recycling.
         */
        skb->pp_recycle = 0;
}

/*
 * Free an skbuff's memory without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
        struct sk_buff_fclones *fclones;

        switch (skb->fclone) {
        case SKB_FCLONE_UNAVAILABLE:
                kmem_cache_free(skbuff_head_cache, skb);
                return;

        case SKB_FCLONE_ORIG:
                fclones = container_of(skb, struct sk_buff_fclones, skb1);

                /* We usually free the clone (TX completion) before the
                 * original skb. This test would have no chance to be true
                 * for the clone, while here, branch prediction will be good.
                 */
                if (refcount_read(&fclones->fclone_ref) == 1)
                        goto fastpath;
                break;

        default: /* SKB_FCLONE_CLONE */
                fclones = container_of(skb, struct sk_buff_fclones, skb2);
                break;
        }
        if (!refcount_dec_and_test(&fclones->fclone_ref))
                return;
fastpath:
        kmem_cache_free(skbuff_fclone_cache, fclones);
}

void skb_release_head_state(struct sk_buff *skb)
{
        skb_dst_drop(skb);
        if (skb->destructor) {
                WARN_ON(in_hardirq());
                skb->destructor(skb);
        }
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        nf_conntrack_put(skb_nfct(skb));
#endif
        skb_ext_put(skb);
}

/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb)
{
        skb_release_head_state(skb);
        if (likely(skb->head))
                skb_release_data(skb);
}

/**
 * __kfree_skb - private function
 * @skb: buffer
 *
 * Free an sk_buff. Release anything attached to the buffer.
 * Clean the state. This is an internal helper function. Users should
 * always call kfree_skb().
 */

void __kfree_skb(struct sk_buff *skb)
{
        skb_release_all(skb);
        kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);
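
/* Example (illustrative sketch): outside of this file, callers drop packets
 * with kfree_skb() (or consume_skb() on a successful delivery path) rather
 * than calling __kfree_skb() directly:
 *
 *      if (unlikely(err)) {
 *              kfree_skb(skb);         // drop the packet
 *              return err;
 *      }
 */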