]> git.proxmox.com Git - mirror_ubuntu-focal-kernel.git/blame - net/packet/af_packet.c
net/packet: annotate accesses to po->bind
[mirror_ubuntu-focal-kernel.git] / net / packet / af_packet.c
CommitLineData
2874c5fd 1// SPDX-License-Identifier: GPL-2.0-or-later
1da177e4
LT
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * PACKET - implements raw packet sockets.
8 *
02c30a84 9 * Authors: Ross Biro
1da177e4
LT
10 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 * Alan Cox, <gw4pts@gw4pts.ampr.org>
12 *
1ce4f28b 13 * Fixes:
1da177e4
LT
14 * Alan Cox : verify_area() now used correctly
15 * Alan Cox : new skbuff lists, look ma no backlogs!
16 * Alan Cox : tidied skbuff lists.
17 * Alan Cox : Now uses generic datagram routines I
18 * added. Also fixed the peek/read crash
19 * from all old Linux datagram code.
20 * Alan Cox : Uses the improved datagram code.
21 * Alan Cox : Added NULL's for socket options.
22 * Alan Cox : Re-commented the code.
23 * Alan Cox : Use new kernel side addressing
24 * Rob Janssen : Correct MTU usage.
25 * Dave Platt : Counter leaks caused by incorrect
26 * interrupt locking and some slightly
27 * dubious gcc output. Can you read
28 * compiler: it said _VOLATILE_
29 * Richard Kooijman : Timestamp fixes.
30 * Alan Cox : New buffers. Use sk->mac.raw.
31 * Alan Cox : sendmsg/recvmsg support.
32 * Alan Cox : Protocol setting support
33 * Alexey Kuznetsov : Untied from IPv4 stack.
34 * Cyrus Durgin : Fixed kerneld for kmod.
35 * Michal Ostrowski : Module initialization cleanup.
1ce4f28b 36 * Ulises Alonso : Frame number limit removal and
1da177e4 37 * packet_set_ring memory leak.
0fb375fb
EB
38 * Eric Biederman : Allow for > 8 byte hardware addresses.
39 * The convention is that longer addresses
40 * will simply extend the hardware address
1ce4f28b 41 * byte arrays at the end of sockaddr_ll
0fb375fb 42 * and packet_mreq.
69e3c75f 43 * Johann Baudy : Added TX RING.
f6fb8f10 44 * Chetan Loke : Implemented TPACKET_V3 block abstraction
45 * layer.
46 * Copyright (C) 2011, <lokec@ccs.neu.edu>
1da177e4 47 */
1ce4f28b 48
1da177e4 49#include <linux/types.h>
1da177e4 50#include <linux/mm.h>
4fc268d2 51#include <linux/capability.h>
1da177e4
LT
52#include <linux/fcntl.h>
53#include <linux/socket.h>
54#include <linux/in.h>
55#include <linux/inet.h>
56#include <linux/netdevice.h>
57#include <linux/if_packet.h>
58#include <linux/wireless.h>
ffbc6111 59#include <linux/kernel.h>
1da177e4 60#include <linux/kmod.h>
5a0e3ad6 61#include <linux/slab.h>
0e3125c7 62#include <linux/vmalloc.h>
457c4cbc 63#include <net/net_namespace.h>
1da177e4
LT
64#include <net/ip.h>
65#include <net/protocol.h>
66#include <linux/skbuff.h>
67#include <net/sock.h>
68#include <linux/errno.h>
69#include <linux/timer.h>
7c0f6ba6 70#include <linux/uaccess.h>
1da177e4
LT
71#include <asm/ioctls.h>
72#include <asm/page.h>
a1f8e7f7 73#include <asm/cacheflush.h>
1da177e4
LT
74#include <asm/io.h>
75#include <linux/proc_fs.h>
76#include <linux/seq_file.h>
77#include <linux/poll.h>
78#include <linux/module.h>
79#include <linux/init.h>
905db440 80#include <linux/mutex.h>
05423b24 81#include <linux/if_vlan.h>
bfd5f4a3 82#include <linux/virtio_net.h>
ed85b565 83#include <linux/errqueue.h>
614f60fa 84#include <linux/net_tstamp.h>
b0138408 85#include <linux/percpu.h>
1da177e4
LT
86#ifdef CONFIG_INET
87#include <net/inet_common.h>
88#endif
47dceb8e 89#include <linux/bpf.h>
719c44d3 90#include <net/compat.h>
1da177e4 91
2787b04b
PE
92#include "internal.h"
93
1da177e4
LT
94/*
95 Assumptions:
96 - if device has no dev->hard_header routine, it adds and removes ll header
97 inside itself. In this case ll header is invisible outside of device,
98 but higher levels still should reserve dev->hard_header_len.
99 Some devices are enough clever to reallocate skb, when header
100 will not fit to reserved space (tunnel), another ones are silly
101 (PPP).
102 - packet socket receives packets with pulled ll header,
103 so that SOCK_RAW should push it back.
104
105On receive:
106-----------
107
108Incoming, dev->hard_header!=NULL
b0e380b1
ACM
109 mac_header -> ll header
110 data -> data
1da177e4
LT
111
112Outgoing, dev->hard_header!=NULL
b0e380b1
ACM
113 mac_header -> ll header
114 data -> ll header
1da177e4
LT
115
116Incoming, dev->hard_header==NULL
b0e380b1
ACM
117 mac_header -> UNKNOWN position. It is very likely, that it points to ll
118 header. PPP makes it, that is wrong, because introduce
db0c58f9 119 assymetry between rx and tx paths.
b0e380b1 120 data -> data
1da177e4
LT
121
122Outgoing, dev->hard_header==NULL
b0e380b1
ACM
123 mac_header -> data. ll header is still not built!
124 data -> data
1da177e4
LT
125
126Resume
127 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
128
129
130On transmit:
131------------
132
133dev->hard_header != NULL
b0e380b1
ACM
134 mac_header -> ll header
135 data -> ll header
1da177e4
LT
136
137dev->hard_header == NULL (ll header is added by device, we cannot control it)
b0e380b1
ACM
138 mac_header -> data
139 data -> data
1da177e4
LT
140
141 We should set nh.raw on output to correct posistion,
142 packet classifier depends on it.
143 */
144
1da177e4
LT
145/* Private packet socket structures. */
146
0fb375fb
EB
147/* identical to struct packet_mreq except it has
148 * a longer address field.
149 */
40d4e3df 150struct packet_mreq_max {
0fb375fb
EB
151 int mr_ifindex;
152 unsigned short mr_type;
153 unsigned short mr_alen;
154 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 155};
a2efcfa0 156
184f489e
DB
157union tpacket_uhdr {
158 struct tpacket_hdr *h1;
159 struct tpacket2_hdr *h2;
160 struct tpacket3_hdr *h3;
161 void *raw;
162};
163
f6fb8f10 164static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f
JB
165 int closing, int tx_ring);
166
f6fb8f10 167#define V3_ALIGNMENT (8)
168
bc59ba39 169#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
f6fb8f10 170
171#define BLK_PLUS_PRIV(sz_of_priv) \
172 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
173
f6fb8f10 174#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
175#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
176#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
177#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
178#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
179#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
180#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
181
69e3c75f 182struct packet_sock;
77f65ebd
WB
183static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
184 struct packet_type *pt, struct net_device *orig_dev);
1da177e4 185
f6fb8f10 186static void *packet_previous_frame(struct packet_sock *po,
187 struct packet_ring_buffer *rb,
188 int status);
189static void packet_increment_head(struct packet_ring_buffer *buff);
878cd3ba 190static int prb_curr_blk_in_use(struct tpacket_block_desc *);
bc59ba39 191static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 192 struct packet_sock *);
bc59ba39 193static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 194 struct packet_sock *, unsigned int status);
bc59ba39 195static int prb_queue_frozen(struct tpacket_kbdq_core *);
196static void prb_open_block(struct tpacket_kbdq_core *,
197 struct tpacket_block_desc *);
17bfd8c8 198static void prb_retire_rx_blk_timer_expired(struct timer_list *);
bc59ba39 199static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
bc59ba39 200static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
201static void prb_clear_rxhash(struct tpacket_kbdq_core *,
202 struct tpacket3_hdr *);
203static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
204 struct tpacket3_hdr *);
1da177e4 205static void packet_flush_mclist(struct sock *sk);
865b03f2 206static u16 packet_pick_tx_queue(struct sk_buff *skb);
1da177e4 207
ffbc6111 208struct packet_skb_cb {
ffbc6111
HX
209 union {
210 struct sockaddr_pkt pkt;
2472d761
EB
211 union {
212 /* Trick: alias skb original length with
213 * ll.sll_family and ll.protocol in order
214 * to save room.
215 */
216 unsigned int origlen;
217 struct sockaddr_ll ll;
218 };
ffbc6111
HX
219 } sa;
220};
221
d3869efe
DW
222#define vio_le() virtio_legacy_is_little_endian()
223
ffbc6111 224#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 225
bc59ba39 226#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 227#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 228 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 229#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 230 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 231#define GET_NEXT_PRB_BLK_NUM(x) \
232 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
233 ((x)->kactive_blk_num+1) : 0)
234
dc99f600
DM
235static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
236static void __fanout_link(struct sock *sk, struct packet_sock *po);
237
d346a3fa
DB
238static int packet_direct_xmit(struct sk_buff *skb)
239{
865b03f2 240 return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
d346a3fa
DB
241}
242
66e56cd4
DB
243static struct net_device *packet_cached_dev_get(struct packet_sock *po)
244{
245 struct net_device *dev;
246
247 rcu_read_lock();
248 dev = rcu_dereference(po->cached_dev);
249 if (likely(dev))
250 dev_hold(dev);
251 rcu_read_unlock();
252
253 return dev;
254}
255
256static void packet_cached_dev_assign(struct packet_sock *po,
257 struct net_device *dev)
258{
259 rcu_assign_pointer(po->cached_dev, dev);
260}
261
262static void packet_cached_dev_reset(struct packet_sock *po)
263{
264 RCU_INIT_POINTER(po->cached_dev, NULL);
265}
266
d346a3fa
DB
267static bool packet_use_direct_xmit(const struct packet_sock *po)
268{
269 return po->xmit == packet_direct_xmit;
270}
271
865b03f2 272static u16 packet_pick_tx_queue(struct sk_buff *skb)
0fd5d57b 273{
865b03f2 274 struct net_device *dev = skb->dev;
0fd5d57b 275 const struct net_device_ops *ops = dev->netdev_ops;
b71b5837 276 int cpu = raw_smp_processor_id();
0fd5d57b
DB
277 u16 queue_index;
278
b71b5837
PA
279#ifdef CONFIG_XPS
280 skb->sender_cpu = cpu + 1;
281#endif
282 skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
0fd5d57b 283 if (ops->ndo_select_queue) {
a350ecce 284 queue_index = ops->ndo_select_queue(dev, skb, NULL);
0fd5d57b
DB
285 queue_index = netdev_cap_txqueue(dev, queue_index);
286 } else {
b71b5837 287 queue_index = netdev_pick_tx(dev, skb, NULL);
0fd5d57b
DB
288 }
289
865b03f2 290 return queue_index;
0fd5d57b
DB
291}
292
a6361f0c 293/* __register_prot_hook must be invoked through register_prot_hook
ce06b03e
DM
294 * or from a context in which asynchronous accesses to the packet
295 * socket is not possible (packet_create()).
296 */
a6361f0c 297static void __register_prot_hook(struct sock *sk)
ce06b03e
DM
298{
299 struct packet_sock *po = pkt_sk(sk);
e40526cb 300
ce06b03e 301 if (!po->running) {
66e56cd4 302 if (po->fanout)
dc99f600 303 __fanout_link(sk, po);
66e56cd4 304 else
dc99f600 305 dev_add_pack(&po->prot_hook);
e40526cb 306
ce06b03e
DM
307 sock_hold(sk);
308 po->running = 1;
309 }
310}
311
a6361f0c
WB
312static void register_prot_hook(struct sock *sk)
313{
314 lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
315 __register_prot_hook(sk);
316}
317
318/* If the sync parameter is true, we will temporarily drop
ce06b03e
DM
319 * the po->bind_lock and do a synchronize_net to make sure no
320 * asynchronous packet processing paths still refer to the elements
321 * of po->prot_hook. If the sync parameter is false, it is the
322 * callers responsibility to take care of this.
323 */
324static void __unregister_prot_hook(struct sock *sk, bool sync)
325{
326 struct packet_sock *po = pkt_sk(sk);
327
a6361f0c
WB
328 lockdep_assert_held_once(&po->bind_lock);
329
ce06b03e 330 po->running = 0;
66e56cd4
DB
331
332 if (po->fanout)
dc99f600 333 __fanout_unlink(sk, po);
66e56cd4 334 else
dc99f600 335 __dev_remove_pack(&po->prot_hook);
e40526cb 336
ce06b03e
DM
337 __sock_put(sk);
338
339 if (sync) {
340 spin_unlock(&po->bind_lock);
341 synchronize_net();
342 spin_lock(&po->bind_lock);
343 }
344}
345
346static void unregister_prot_hook(struct sock *sk, bool sync)
347{
348 struct packet_sock *po = pkt_sk(sk);
349
350 if (po->running)
351 __unregister_prot_hook(sk, sync);
352}
353
6e58040b 354static inline struct page * __pure pgv_to_page(void *addr)
0af55bb5
CG
355{
356 if (is_vmalloc_addr(addr))
357 return vmalloc_to_page(addr);
358 return virt_to_page(addr);
359}
360
69e3c75f 361static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 362{
184f489e 363 union tpacket_uhdr h;
1da177e4 364
69e3c75f 365 h.raw = frame;
bbd6ef87
PM
366 switch (po->tp_version) {
367 case TPACKET_V1:
69e3c75f 368 h.h1->tp_status = status;
0af55bb5 369 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
370 break;
371 case TPACKET_V2:
69e3c75f 372 h.h2->tp_status = status;
0af55bb5 373 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 374 break;
f6fb8f10 375 case TPACKET_V3:
7f953ab2
SV
376 h.h3->tp_status = status;
377 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
378 break;
69e3c75f 379 default:
f6fb8f10 380 WARN(1, "TPACKET version not supported.\n");
69e3c75f 381 BUG();
bbd6ef87 382 }
69e3c75f
JB
383
384 smp_wmb();
bbd6ef87
PM
385}
386
96f657e6 387static int __packet_get_status(const struct packet_sock *po, void *frame)
bbd6ef87 388{
184f489e 389 union tpacket_uhdr h;
bbd6ef87 390
69e3c75f
JB
391 smp_rmb();
392
bbd6ef87
PM
393 h.raw = frame;
394 switch (po->tp_version) {
395 case TPACKET_V1:
0af55bb5 396 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 397 return h.h1->tp_status;
bbd6ef87 398 case TPACKET_V2:
0af55bb5 399 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 400 return h.h2->tp_status;
f6fb8f10 401 case TPACKET_V3:
7f953ab2
SV
402 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
403 return h.h3->tp_status;
69e3c75f 404 default:
f6fb8f10 405 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
406 BUG();
407 return 0;
bbd6ef87 408 }
1da177e4 409}
69e3c75f 410
b9c32fb2
DB
411static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
412 unsigned int flags)
7a51384c
DB
413{
414 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
415
68a360e8
WB
416 if (shhwtstamps &&
417 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
418 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
419 return TP_STATUS_TS_RAW_HARDWARE;
7a51384c
DB
420
421 if (ktime_to_timespec_cond(skb->tstamp, ts))
b9c32fb2 422 return TP_STATUS_TS_SOFTWARE;
7a51384c 423
b9c32fb2 424 return 0;
7a51384c
DB
425}
426
b9c32fb2
DB
427static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
428 struct sk_buff *skb)
2e31396f
WB
429{
430 union tpacket_uhdr h;
431 struct timespec ts;
b9c32fb2 432 __u32 ts_status;
2e31396f 433
b9c32fb2
DB
434 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
435 return 0;
2e31396f
WB
436
437 h.raw = frame;
438 switch (po->tp_version) {
439 case TPACKET_V1:
440 h.h1->tp_sec = ts.tv_sec;
441 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
442 break;
443 case TPACKET_V2:
444 h.h2->tp_sec = ts.tv_sec;
445 h.h2->tp_nsec = ts.tv_nsec;
446 break;
447 case TPACKET_V3:
57ea884b
DB
448 h.h3->tp_sec = ts.tv_sec;
449 h.h3->tp_nsec = ts.tv_nsec;
450 break;
2e31396f
WB
451 default:
452 WARN(1, "TPACKET version not supported.\n");
453 BUG();
454 }
455
456 /* one flush is safe, as both fields always lie on the same cacheline */
457 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
458 smp_wmb();
b9c32fb2
DB
459
460 return ts_status;
2e31396f
WB
461}
462
d4b5bd98
ED
463static void *packet_lookup_frame(const struct packet_sock *po,
464 const struct packet_ring_buffer *rb,
465 unsigned int position,
466 int status)
69e3c75f
JB
467{
468 unsigned int pg_vec_pos, frame_offset;
184f489e 469 union tpacket_uhdr h;
69e3c75f
JB
470
471 pg_vec_pos = position / rb->frames_per_block;
472 frame_offset = position % rb->frames_per_block;
473
0e3125c7
NH
474 h.raw = rb->pg_vec[pg_vec_pos].buffer +
475 (frame_offset * rb->frame_size);
69e3c75f
JB
476
477 if (status != __packet_get_status(po, h.raw))
478 return NULL;
479
480 return h.raw;
481}
482
eea49cc9 483static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
484 struct packet_ring_buffer *rb,
485 int status)
486{
487 return packet_lookup_frame(po, rb, rb->head, status);
488}
489
bc59ba39 490static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 491{
492 del_timer_sync(&pkc->retire_blk_timer);
493}
494
495static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
f6fb8f10 496 struct sk_buff_head *rb_queue)
497{
bc59ba39 498 struct tpacket_kbdq_core *pkc;
f6fb8f10 499
73d0fcf2 500 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 501
ec6f809f 502 spin_lock_bh(&rb_queue->lock);
f6fb8f10 503 pkc->delete_blk_timer = 1;
ec6f809f 504 spin_unlock_bh(&rb_queue->lock);
f6fb8f10 505
506 prb_del_retire_blk_timer(pkc);
507}
508
e8e85cc5 509static void prb_setup_retire_blk_timer(struct packet_sock *po)
f6fb8f10 510{
bc59ba39 511 struct tpacket_kbdq_core *pkc;
f6fb8f10 512
e8e85cc5 513 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
17bfd8c8
KC
514 timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
515 0);
516 pkc->retire_blk_timer.expires = jiffies;
f6fb8f10 517}
518
519static int prb_calc_retire_blk_tmo(struct packet_sock *po,
520 int blk_size_in_bytes)
521{
522 struct net_device *dev;
523 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
7cad1bac 524 struct ethtool_link_ksettings ecmd;
4bc71cb9 525 int err;
f6fb8f10 526
4bc71cb9
JP
527 rtnl_lock();
528 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
529 if (unlikely(!dev)) {
530 rtnl_unlock();
f6fb8f10 531 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9 532 }
7cad1bac 533 err = __ethtool_get_link_ksettings(dev, &ecmd);
4bc71cb9
JP
534 rtnl_unlock();
535 if (!err) {
4bc71cb9
JP
536 /*
537 * If the link speed is so slow you don't really
538 * need to worry about perf anyways
539 */
7cad1bac
DD
540 if (ecmd.base.speed < SPEED_1000 ||
541 ecmd.base.speed == SPEED_UNKNOWN) {
4bc71cb9 542 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 543 } else {
544 msec = 1;
7cad1bac 545 div = ecmd.base.speed / 1000;
f6fb8f10 546 }
fcfcfe0b
MW
547 } else
548 return DEFAULT_PRB_RETIRE_TOV;
f6fb8f10 549
550 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
551
552 if (div)
553 mbits /= div;
554
555 tmo = mbits * msec;
556
557 if (div)
558 return tmo+1;
559 return tmo;
560}
561
bc59ba39 562static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 563 union tpacket_req_u *req_u)
564{
565 p1->feature_req_word = req_u->req3.tp_feature_req_word;
566}
567
568static void init_prb_bdqc(struct packet_sock *po,
569 struct packet_ring_buffer *rb,
570 struct pgv *pg_vec,
e8e85cc5 571 union tpacket_req_u *req_u)
f6fb8f10 572{
22781a5b 573 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
bc59ba39 574 struct tpacket_block_desc *pbd;
f6fb8f10 575
576 memset(p1, 0x0, sizeof(*p1));
577
578 p1->knxt_seq_num = 1;
579 p1->pkbdq = pg_vec;
bc59ba39 580 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 581 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 582 p1->kblk_size = req_u->req3.tp_block_size;
583 p1->knum_blocks = req_u->req3.tp_block_nr;
584 p1->hdrlen = po->tp_hdrlen;
585 p1->version = po->tp_version;
586 p1->last_kactive_blk_num = 0;
ee80fbf3 587 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 588 if (req_u->req3.tp_retire_blk_tov)
589 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
590 else
591 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
592 req_u->req3.tp_block_size);
593 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
594 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
595
dc808110 596 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
f6fb8f10 597 prb_init_ft_ops(p1, req_u);
e8e85cc5 598 prb_setup_retire_blk_timer(po);
f6fb8f10 599 prb_open_block(p1, pbd);
600}
601
602/* Do NOT update the last_blk_num first.
603 * Assumes sk_buff_head lock is held.
604 */
bc59ba39 605static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 606{
607 mod_timer(&pkc->retire_blk_timer,
608 jiffies + pkc->tov_in_jiffies);
609 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
610}
611
612/*
613 * Timer logic:
614 * 1) We refresh the timer only when we open a block.
615 * By doing this we don't waste cycles refreshing the timer
616 * on packet-by-packet basis.
617 *
618 * With a 1MB block-size, on a 1Gbps line, it will take
619 * i) ~8 ms to fill a block + ii) memcpy etc.
620 * In this cut we are not accounting for the memcpy time.
621 *
622 * So, if the user sets the 'tmo' to 10ms then the timer
623 * will never fire while the block is still getting filled
624 * (which is what we want). However, the user could choose
625 * to close a block early and that's fine.
626 *
627 * But when the timer does fire, we check whether or not to refresh it.
628 * Since the tmo granularity is in msecs, it is not too expensive
629 * to refresh the timer, lets say every '8' msecs.
630 * Either the user can set the 'tmo' or we can derive it based on
631 * a) line-speed and b) block-size.
632 * prb_calc_retire_blk_tmo() calculates the tmo.
633 *
634 */
17bfd8c8 635static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
f6fb8f10 636{
17bfd8c8
KC
637 struct packet_sock *po =
638 from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
22781a5b 639 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 640 unsigned int frozen;
bc59ba39 641 struct tpacket_block_desc *pbd;
f6fb8f10 642
643 spin_lock(&po->sk.sk_receive_queue.lock);
644
645 frozen = prb_queue_frozen(pkc);
646 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
647
648 if (unlikely(pkc->delete_blk_timer))
649 goto out;
650
651 /* We only need to plug the race when the block is partially filled.
652 * tpacket_rcv:
653 * lock(); increment BLOCK_NUM_PKTS; unlock()
654 * copy_bits() is in progress ...
655 * timer fires on other cpu:
656 * we can't retire the current block because copy_bits
657 * is in progress.
658 *
659 */
660 if (BLOCK_NUM_PKTS(pbd)) {
661 while (atomic_read(&pkc->blk_fill_in_prog)) {
662 /* Waiting for skb_copy_bits to finish... */
663 cpu_relax();
664 }
665 }
666
667 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
668 if (!frozen) {
41a50d62
AD
669 if (!BLOCK_NUM_PKTS(pbd)) {
670 /* An empty block. Just refresh the timer. */
671 goto refresh_timer;
672 }
f6fb8f10 673 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
674 if (!prb_dispatch_next_block(pkc, po))
675 goto refresh_timer;
676 else
677 goto out;
678 } else {
679 /* Case 1. Queue was frozen because user-space was
680 * lagging behind.
681 */
878cd3ba 682 if (prb_curr_blk_in_use(pbd)) {
f6fb8f10 683 /*
684 * Ok, user-space is still behind.
685 * So just refresh the timer.
686 */
687 goto refresh_timer;
688 } else {
689 /* Case 2. queue was frozen,user-space caught up,
690 * now the link went idle && the timer fired.
691 * We don't have a block to close.So we open this
692 * block and restart the timer.
693 * opening a block thaws the queue,restarts timer
694 * Thawing/timer-refresh is a side effect.
695 */
696 prb_open_block(pkc, pbd);
697 goto out;
698 }
699 }
700 }
701
702refresh_timer:
703 _prb_refresh_rx_retire_blk_timer(pkc);
704
705out:
706 spin_unlock(&po->sk.sk_receive_queue.lock);
707}
708
eea49cc9 709static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 710 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 711{
712 /* Flush everything minus the block header */
713
714#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
715 u8 *start, *end;
716
717 start = (u8 *)pbd1;
718
719 /* Skip the block header(we know header WILL fit in 4K) */
720 start += PAGE_SIZE;
721
722 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
723 for (; start < end; start += PAGE_SIZE)
724 flush_dcache_page(pgv_to_page(start));
725
726 smp_wmb();
727#endif
728
729 /* Now update the block status. */
730
731 BLOCK_STATUS(pbd1) = status;
732
733 /* Flush the block header */
734
735#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
736 start = (u8 *)pbd1;
737 flush_dcache_page(pgv_to_page(start));
738
739 smp_wmb();
740#endif
741}
742
743/*
744 * Side effect:
745 *
746 * 1) flush the block
747 * 2) Increment active_blk_num
748 *
749 * Note:We DONT refresh the timer on purpose.
750 * Because almost always the next block will be opened.
751 */
bc59ba39 752static void prb_close_block(struct tpacket_kbdq_core *pkc1,
753 struct tpacket_block_desc *pbd1,
f6fb8f10 754 struct packet_sock *po, unsigned int stat)
755{
756 __u32 status = TP_STATUS_USER | stat;
757
758 struct tpacket3_hdr *last_pkt;
bc59ba39 759 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
da413eec 760 struct sock *sk = &po->sk;
f6fb8f10 761
8e8e2951 762 if (atomic_read(&po->tp_drops))
f6fb8f10 763 status |= TP_STATUS_LOSING;
764
765 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
766 last_pkt->tp_next_offset = 0;
767
768 /* Get the ts of the last pkt */
769 if (BLOCK_NUM_PKTS(pbd1)) {
770 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
771 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
772 } else {
41a50d62
AD
773 /* Ok, we tmo'd - so get the current time.
774 *
775 * It shouldn't really happen as we don't close empty
776 * blocks. See prb_retire_rx_blk_timer_expired().
777 */
f6fb8f10 778 struct timespec ts;
779 getnstimeofday(&ts);
780 h1->ts_last_pkt.ts_sec = ts.tv_sec;
781 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
782 }
783
784 smp_wmb();
785
786 /* Flush the block */
787 prb_flush_block(pkc1, pbd1, status);
788
da413eec
DC
789 sk->sk_data_ready(sk);
790
f6fb8f10 791 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
792}
793
eea49cc9 794static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 795{
796 pkc->reset_pending_on_curr_blk = 0;
797}
798
799/*
800 * Side effect of opening a block:
801 *
802 * 1) prb_queue is thawed.
803 * 2) retire_blk_timer is refreshed.
804 *
805 */
bc59ba39 806static void prb_open_block(struct tpacket_kbdq_core *pkc1,
807 struct tpacket_block_desc *pbd1)
f6fb8f10 808{
809 struct timespec ts;
bc59ba39 810 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 811
812 smp_rmb();
813
8da3056c
DB
814 /* We could have just memset this but we will lose the
815 * flexibility of making the priv area sticky
816 */
f6fb8f10 817
8da3056c
DB
818 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
819 BLOCK_NUM_PKTS(pbd1) = 0;
820 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 821
8da3056c
DB
822 getnstimeofday(&ts);
823
824 h1->ts_first_pkt.ts_sec = ts.tv_sec;
825 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
f6fb8f10 826
8da3056c
DB
827 pkc1->pkblk_start = (char *)pbd1;
828 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
829
830 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
831 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
832
833 pbd1->version = pkc1->version;
834 pkc1->prev = pkc1->nxt_offset;
835 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
836
837 prb_thaw_queue(pkc1);
838 _prb_refresh_rx_retire_blk_timer(pkc1);
839
840 smp_wmb();
f6fb8f10 841}
842
843/*
844 * Queue freeze logic:
845 * 1) Assume tp_block_nr = 8 blocks.
846 * 2) At time 't0', user opens Rx ring.
847 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
848 * 4) user-space is either sleeping or processing block '0'.
849 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
850 * it will close block-7,loop around and try to fill block '0'.
851 * call-flow:
852 * __packet_lookup_frame_in_block
853 * prb_retire_current_block()
854 * prb_dispatch_next_block()
855 * |->(BLOCK_STATUS == USER) evaluates to true
856 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
857 * 6) Now there are two cases:
858 * 6.1) Link goes idle right after the queue is frozen.
859 * But remember, the last open_block() refreshed the timer.
860 * When this timer expires,it will refresh itself so that we can
861 * re-open block-0 in near future.
862 * 6.2) Link is busy and keeps on receiving packets. This is a simple
863 * case and __packet_lookup_frame_in_block will check if block-0
864 * is free and can now be re-used.
865 */
eea49cc9 866static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 867 struct packet_sock *po)
868{
869 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 870 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 871}
872
873#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
874
875/*
876 * If the next block is free then we will dispatch it
877 * and return a good offset.
878 * Else, we will freeze the queue.
879 * So, caller must check the return value.
880 */
bc59ba39 881static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 882 struct packet_sock *po)
883{
bc59ba39 884 struct tpacket_block_desc *pbd;
f6fb8f10 885
886 smp_rmb();
887
888 /* 1. Get current block num */
889 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
890
891 /* 2. If this block is currently in_use then freeze the queue */
892 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
893 prb_freeze_queue(pkc, po);
894 return NULL;
895 }
896
897 /*
898 * 3.
899 * open this block and return the offset where the first packet
900 * needs to get stored.
901 */
902 prb_open_block(pkc, pbd);
903 return (void *)pkc->nxt_offset;
904}
905
bc59ba39 906static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 907 struct packet_sock *po, unsigned int status)
908{
bc59ba39 909 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 910
911 /* retire/close the current block */
912 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
913 /*
914 * Plug the case where copy_bits() is in progress on
915 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
916 * have space to copy the pkt in the current block and
917 * called prb_retire_current_block()
918 *
919 * We don't need to worry about the TMO case because
920 * the timer-handler already handled this case.
921 */
922 if (!(status & TP_STATUS_BLK_TMO)) {
923 while (atomic_read(&pkc->blk_fill_in_prog)) {
924 /* Waiting for skb_copy_bits to finish... */
925 cpu_relax();
926 }
927 }
928 prb_close_block(pkc, pbd, po, status);
929 return;
930 }
f6fb8f10 931}
932
878cd3ba 933static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
f6fb8f10 934{
935 return TP_STATUS_USER & BLOCK_STATUS(pbd);
936}
937
eea49cc9 938static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 939{
940 return pkc->reset_pending_on_curr_blk;
941}
942
eea49cc9 943static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
014dc8b9 944 __releases(&pkc->blk_fill_in_prog_lock)
f6fb8f10 945{
bc59ba39 946 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 947 atomic_dec(&pkc->blk_fill_in_prog);
948}
949
eea49cc9 950static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 951 struct tpacket3_hdr *ppd)
952{
3958afa1 953 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
f6fb8f10 954}
955
eea49cc9 956static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 957 struct tpacket3_hdr *ppd)
958{
959 ppd->hv1.tp_rxhash = 0;
960}
961
eea49cc9 962static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 963 struct tpacket3_hdr *ppd)
964{
df8a39de
JP
965 if (skb_vlan_tag_present(pkc->skb)) {
966 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
a0cdfcf3
AW
967 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
968 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
f6fb8f10 969 } else {
9e67030a 970 ppd->hv1.tp_vlan_tci = 0;
a0cdfcf3 971 ppd->hv1.tp_vlan_tpid = 0;
9e67030a 972 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 973 }
974}
975
bc59ba39 976static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 977 struct tpacket3_hdr *ppd)
978{
a0cdfcf3 979 ppd->hv1.tp_padding = 0;
f6fb8f10 980 prb_fill_vlan_info(pkc, ppd);
981
982 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
983 prb_fill_rxhash(pkc, ppd);
984 else
985 prb_clear_rxhash(pkc, ppd);
986}
987
eea49cc9 988static void prb_fill_curr_block(char *curr,
bc59ba39 989 struct tpacket_kbdq_core *pkc,
990 struct tpacket_block_desc *pbd,
f6fb8f10 991 unsigned int len)
014dc8b9 992 __acquires(&pkc->blk_fill_in_prog_lock)
f6fb8f10 993{
994 struct tpacket3_hdr *ppd;
995
996 ppd = (struct tpacket3_hdr *)curr;
997 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
998 pkc->prev = curr;
999 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1000 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1001 BLOCK_NUM_PKTS(pbd) += 1;
1002 atomic_inc(&pkc->blk_fill_in_prog);
1003 prb_run_all_ft_ops(pkc, ppd);
1004}
1005
1006/* Assumes caller has the sk->rx_queue.lock */
1007static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1008 struct sk_buff *skb,
f6fb8f10 1009 unsigned int len
1010 )
1011{
bc59ba39 1012 struct tpacket_kbdq_core *pkc;
1013 struct tpacket_block_desc *pbd;
f6fb8f10 1014 char *curr, *end;
1015
e3192690 1016 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 1017 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1018
1019 /* Queue is frozen when user space is lagging behind */
1020 if (prb_queue_frozen(pkc)) {
1021 /*
1022 * Check if that last block which caused the queue to freeze,
1023 * is still in_use by user-space.
1024 */
878cd3ba 1025 if (prb_curr_blk_in_use(pbd)) {
f6fb8f10 1026 /* Can't record this packet */
1027 return NULL;
1028 } else {
1029 /*
1030 * Ok, the block was released by user-space.
1031 * Now let's open that block.
1032 * opening a block also thaws the queue.
1033 * Thawing is a side effect.
1034 */
1035 prb_open_block(pkc, pbd);
1036 }
1037 }
1038
1039 smp_mb();
1040 curr = pkc->nxt_offset;
1041 pkc->skb = skb;
e3192690 1042 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 1043
1044 /* first try the current block */
1045 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1046 prb_fill_curr_block(curr, pkc, pbd, len);
1047 return (void *)curr;
1048 }
1049
1050 /* Ok, close the current block */
1051 prb_retire_current_block(pkc, po, 0);
1052
1053 /* Now, try to dispatch the next block */
1054 curr = (char *)prb_dispatch_next_block(pkc, po);
1055 if (curr) {
1056 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1057 prb_fill_curr_block(curr, pkc, pbd, len);
1058 return (void *)curr;
1059 }
1060
1061 /*
1062 * No free blocks are available.user_space hasn't caught up yet.
1063 * Queue was just frozen and now this packet will get dropped.
1064 */
1065 return NULL;
1066}
1067
eea49cc9 1068static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1069 struct sk_buff *skb,
1070 int status, unsigned int len)
1071{
1072 char *curr = NULL;
1073 switch (po->tp_version) {
1074 case TPACKET_V1:
1075 case TPACKET_V2:
1076 curr = packet_lookup_frame(po, &po->rx_ring,
1077 po->rx_ring.head, status);
1078 return curr;
1079 case TPACKET_V3:
46088059 1080 return __packet_lookup_frame_in_block(po, skb, len);
f6fb8f10 1081 default:
1082 WARN(1, "TPACKET version not supported\n");
1083 BUG();
99aa3473 1084 return NULL;
f6fb8f10 1085 }
1086}
1087
dcf70cef
ED
1088static void *prb_lookup_block(const struct packet_sock *po,
1089 const struct packet_ring_buffer *rb,
1090 unsigned int idx,
1091 int status)
f6fb8f10 1092{
bc59ba39 1093 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1094 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1095
1096 if (status != BLOCK_STATUS(pbd))
1097 return NULL;
1098 return pbd;
1099}
1100
eea49cc9 1101static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1102{
1103 unsigned int prev;
1104 if (rb->prb_bdqc.kactive_blk_num)
1105 prev = rb->prb_bdqc.kactive_blk_num-1;
1106 else
1107 prev = rb->prb_bdqc.knum_blocks-1;
1108 return prev;
1109}
1110
1111/* Assumes caller has held the rx_queue.lock */
eea49cc9 1112static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1113 struct packet_ring_buffer *rb,
1114 int status)
1115{
1116 unsigned int previous = prb_previous_blk_num(rb);
1117 return prb_lookup_block(po, rb, previous, status);
1118}
1119
eea49cc9 1120static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1121 struct packet_ring_buffer *rb,
1122 int status)
1123{
1124 if (po->tp_version <= TPACKET_V2)
1125 return packet_previous_frame(po, rb, status);
1126
1127 return __prb_previous_block(po, rb, status);
1128}
1129
eea49cc9 1130static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1131 struct packet_ring_buffer *rb)
1132{
1133 switch (po->tp_version) {
1134 case TPACKET_V1:
1135 case TPACKET_V2:
1136 return packet_increment_head(rb);
1137 case TPACKET_V3:
1138 default:
1139 WARN(1, "TPACKET version not supported.\n");
1140 BUG();
1141 return;
1142 }
1143}
1144
eea49cc9 1145static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1146 struct packet_ring_buffer *rb,
1147 int status)
1148{
1149 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1150 return packet_lookup_frame(po, rb, previous, status);
1151}
1152
eea49cc9 1153static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1154{
1155 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1156}
1157
b0138408
DB
1158static void packet_inc_pending(struct packet_ring_buffer *rb)
1159{
1160 this_cpu_inc(*rb->pending_refcnt);
1161}
1162
1163static void packet_dec_pending(struct packet_ring_buffer *rb)
1164{
1165 this_cpu_dec(*rb->pending_refcnt);
1166}
1167
1168static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1169{
1170 unsigned int refcnt = 0;
1171 int cpu;
1172
1173 /* We don't use pending refcount in rx_ring. */
1174 if (rb->pending_refcnt == NULL)
1175 return 0;
1176
1177 for_each_possible_cpu(cpu)
1178 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1179
1180 return refcnt;
1181}
1182
1183static int packet_alloc_pending(struct packet_sock *po)
1184{
1185 po->rx_ring.pending_refcnt = NULL;
1186
1187 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1188 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1189 return -ENOBUFS;
1190
1191 return 0;
1192}
1193
1194static void packet_free_pending(struct packet_sock *po)
1195{
1196 free_percpu(po->tx_ring.pending_refcnt);
1197}
1198
9954729b
WB
1199#define ROOM_POW_OFF 2
1200#define ROOM_NONE 0x0
1201#define ROOM_LOW 0x1
1202#define ROOM_NORMAL 0x2
1203
d4b5bd98 1204static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
77f65ebd 1205{
9954729b
WB
1206 int idx, len;
1207
d4b5bd98
ED
1208 len = READ_ONCE(po->rx_ring.frame_max) + 1;
1209 idx = READ_ONCE(po->rx_ring.head);
9954729b
WB
1210 if (pow_off)
1211 idx += len >> pow_off;
1212 if (idx >= len)
1213 idx -= len;
1214 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1215}
1216
dcf70cef 1217static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
9954729b
WB
1218{
1219 int idx, len;
1220
dcf70cef
ED
1221 len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
1222 idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
9954729b
WB
1223 if (pow_off)
1224 idx += len >> pow_off;
1225 if (idx >= len)
1226 idx -= len;
1227 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1228}
77f65ebd 1229
0338a145
ED
1230static int __packet_rcv_has_room(const struct packet_sock *po,
1231 const struct sk_buff *skb)
9954729b 1232{
0338a145 1233 const struct sock *sk = &po->sk;
9954729b
WB
1234 int ret = ROOM_NONE;
1235
1236 if (po->prot_hook.func != tpacket_rcv) {
0338a145
ED
1237 int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
1238 int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
1239 - (skb ? skb->truesize : 0);
1240
1241 if (avail > (rcvbuf >> ROOM_POW_OFF))
9954729b
WB
1242 return ROOM_NORMAL;
1243 else if (avail > 0)
1244 return ROOM_LOW;
1245 else
1246 return ROOM_NONE;
1247 }
77f65ebd 1248
9954729b
WB
1249 if (po->tp_version == TPACKET_V3) {
1250 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1251 ret = ROOM_NORMAL;
1252 else if (__tpacket_v3_has_room(po, 0))
1253 ret = ROOM_LOW;
1254 } else {
1255 if (__tpacket_has_room(po, ROOM_POW_OFF))
1256 ret = ROOM_NORMAL;
1257 else if (__tpacket_has_room(po, 0))
1258 ret = ROOM_LOW;
1259 }
2ccdbaa6
WB
1260
1261 return ret;
1262}
1263
1264static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1265{
3a2bb84e 1266 int pressure, ret;
2ccdbaa6 1267
54d7c01d 1268 ret = __packet_rcv_has_room(po, skb);
3a2bb84e
ED
1269 pressure = ret != ROOM_NORMAL;
1270
1271 if (READ_ONCE(po->pressure) != pressure)
1272 WRITE_ONCE(po->pressure, pressure);
77f65ebd 1273
9954729b 1274 return ret;
77f65ebd
WB
1275}
1276
9bb6cd65
ED
1277static void packet_rcv_try_clear_pressure(struct packet_sock *po)
1278{
1279 if (READ_ONCE(po->pressure) &&
1280 __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
1281 WRITE_ONCE(po->pressure, 0);
1282}
1283
1da177e4
LT
1284static void packet_sock_destruct(struct sock *sk)
1285{
ed85b565
RC
1286 skb_queue_purge(&sk->sk_error_queue);
1287
547b792c 1288 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
14afee4b 1289 WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1da177e4
LT
1290
1291 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1292 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1293 return;
1294 }
1295
17ab56a2 1296 sk_refcnt_debug_dec(sk);
1da177e4
LT
1297}
1298
3b3a5b0a
WB
1299static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1300{
f6cec329
ED
1301 u32 *history = po->rollover->history;
1302 u32 victim, rxhash;
3b3a5b0a
WB
1303 int i, count = 0;
1304
1305 rxhash = skb_get_hash(skb);
1306 for (i = 0; i < ROLLOVER_HLEN; i++)
f6cec329 1307 if (READ_ONCE(history[i]) == rxhash)
3b3a5b0a
WB
1308 count++;
1309
f6cec329
ED
1310 victim = prandom_u32() % ROLLOVER_HLEN;
1311
1312 /* Avoid dirtying the cache line if possible */
1313 if (READ_ONCE(history[victim]) != rxhash)
1314 WRITE_ONCE(history[victim], rxhash);
1315
3b3a5b0a
WB
1316 return count > (ROLLOVER_HLEN >> 1);
1317}
1318
77f65ebd
WB
1319static unsigned int fanout_demux_hash(struct packet_fanout *f,
1320 struct sk_buff *skb,
1321 unsigned int num)
dc99f600 1322{
eb70db87 1323 return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
dc99f600
DM
1324}
1325
77f65ebd
WB
1326static unsigned int fanout_demux_lb(struct packet_fanout *f,
1327 struct sk_buff *skb,
1328 unsigned int num)
dc99f600 1329{
468479e6 1330 unsigned int val = atomic_inc_return(&f->rr_cur);
dc99f600 1331
468479e6 1332 return val % num;
77f65ebd
WB
1333}
1334
1335static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1336 struct sk_buff *skb,
1337 unsigned int num)
1338{
1339 return smp_processor_id() % num;
dc99f600
DM
1340}
1341
5df0ddfb
DB
1342static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1343 struct sk_buff *skb,
1344 unsigned int num)
1345{
f337db64 1346 return prandom_u32_max(num);
5df0ddfb
DB
1347}
1348
77f65ebd
WB
1349static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1350 struct sk_buff *skb,
ad377cab 1351 unsigned int idx, bool try_self,
77f65ebd 1352 unsigned int num)
95ec3eb4 1353{
4633c9e0 1354 struct packet_sock *po, *po_next, *po_skip = NULL;
a9b63918 1355 unsigned int i, j, room = ROOM_NONE;
95ec3eb4 1356
0648ab70 1357 po = pkt_sk(f->arr[idx]);
3b3a5b0a
WB
1358
1359 if (try_self) {
1360 room = packet_rcv_has_room(po, skb);
1361 if (room == ROOM_NORMAL ||
1362 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1363 return idx;
4633c9e0 1364 po_skip = po;
3b3a5b0a 1365 }
ad377cab 1366
0648ab70 1367 i = j = min_t(int, po->rollover->sock, num - 1);
77f65ebd 1368 do {
2ccdbaa6 1369 po_next = pkt_sk(f->arr[i]);
3a2bb84e 1370 if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
2ccdbaa6 1371 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
77f65ebd 1372 if (i != j)
0648ab70 1373 po->rollover->sock = i;
a9b63918
WB
1374 atomic_long_inc(&po->rollover->num);
1375 if (room == ROOM_LOW)
1376 atomic_long_inc(&po->rollover->num_huge);
77f65ebd
WB
1377 return i;
1378 }
ad377cab 1379
77f65ebd
WB
1380 if (++i == num)
1381 i = 0;
1382 } while (i != j);
1383
a9b63918 1384 atomic_long_inc(&po->rollover->num_failed);
77f65ebd
WB
1385 return idx;
1386}
1387
2d36097d
NH
1388static unsigned int fanout_demux_qm(struct packet_fanout *f,
1389 struct sk_buff *skb,
1390 unsigned int num)
1391{
1392 return skb_get_queue_mapping(skb) % num;
1393}
1394
47dceb8e
WB
1395static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1396 struct sk_buff *skb,
1397 unsigned int num)
1398{
1399 struct bpf_prog *prog;
1400 unsigned int ret = 0;
1401
1402 rcu_read_lock();
1403 prog = rcu_dereference(f->bpf_prog);
1404 if (prog)
ff936a04 1405 ret = bpf_prog_run_clear_cb(prog, skb) % num;
47dceb8e
WB
1406 rcu_read_unlock();
1407
1408 return ret;
1409}
1410
77f65ebd
WB
1411static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1412{
1413 return f->flags & (flag >> 8);
95ec3eb4
DM
1414}
1415
95ec3eb4
DM
1416static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1417 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1418{
1419 struct packet_fanout *f = pt->af_packet_priv;
f98f4514 1420 unsigned int num = READ_ONCE(f->num_members);
19bcf9f2 1421 struct net *net = read_pnet(&f->net);
dc99f600 1422 struct packet_sock *po;
77f65ebd 1423 unsigned int idx;
dc99f600 1424
19bcf9f2 1425 if (!net_eq(dev_net(dev), net) || !num) {
dc99f600
DM
1426 kfree_skb(skb);
1427 return 0;
1428 }
1429
3f34b24a 1430 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
19bcf9f2 1431 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
3f34b24a
AD
1432 if (!skb)
1433 return 0;
1434 }
95ec3eb4
DM
1435 switch (f->type) {
1436 case PACKET_FANOUT_HASH:
1437 default:
77f65ebd 1438 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1439 break;
1440 case PACKET_FANOUT_LB:
77f65ebd 1441 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1442 break;
1443 case PACKET_FANOUT_CPU:
77f65ebd
WB
1444 idx = fanout_demux_cpu(f, skb, num);
1445 break;
5df0ddfb
DB
1446 case PACKET_FANOUT_RND:
1447 idx = fanout_demux_rnd(f, skb, num);
1448 break;
2d36097d
NH
1449 case PACKET_FANOUT_QM:
1450 idx = fanout_demux_qm(f, skb, num);
1451 break;
77f65ebd 1452 case PACKET_FANOUT_ROLLOVER:
ad377cab 1453 idx = fanout_demux_rollover(f, skb, 0, false, num);
95ec3eb4 1454 break;
47dceb8e 1455 case PACKET_FANOUT_CBPF:
f2e52095 1456 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1457 idx = fanout_demux_bpf(f, skb, num);
1458 break;
dc99f600
DM
1459 }
1460
ad377cab
WB
1461 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1462 idx = fanout_demux_rollover(f, skb, idx, true, num);
dc99f600 1463
ad377cab 1464 po = pkt_sk(f->arr[idx]);
dc99f600
DM
1465 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1466}
1467
fff3321d
PE
1468DEFINE_MUTEX(fanout_mutex);
1469EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600 1470static LIST_HEAD(fanout_list);
4a69a864 1471static u16 fanout_next_id;
dc99f600
DM
1472
1473static void __fanout_link(struct sock *sk, struct packet_sock *po)
1474{
1475 struct packet_fanout *f = po->fanout;
1476
1477 spin_lock(&f->lock);
1478 f->arr[f->num_members] = sk;
1479 smp_wmb();
1480 f->num_members++;
2bd624b4
AS
1481 if (f->num_members == 1)
1482 dev_add_pack(&f->prot_hook);
dc99f600
DM
1483 spin_unlock(&f->lock);
1484}
1485
1486static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1487{
1488 struct packet_fanout *f = po->fanout;
1489 int i;
1490
1491 spin_lock(&f->lock);
1492 for (i = 0; i < f->num_members; i++) {
1493 if (f->arr[i] == sk)
1494 break;
1495 }
1496 BUG_ON(i >= f->num_members);
1497 f->arr[i] = f->arr[f->num_members - 1];
1498 f->num_members--;
2bd624b4
AS
1499 if (f->num_members == 0)
1500 __dev_remove_pack(&f->prot_hook);
dc99f600
DM
1501 spin_unlock(&f->lock);
1502}
1503
d4dd8aee 1504static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
c0de08d0 1505{
161642e2
ED
1506 if (sk->sk_family != PF_PACKET)
1507 return false;
c0de08d0 1508
161642e2 1509 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
c0de08d0
EL
1510}
1511
47dceb8e
WB
1512static void fanout_init_data(struct packet_fanout *f)
1513{
1514 switch (f->type) {
1515 case PACKET_FANOUT_LB:
1516 atomic_set(&f->rr_cur, 0);
1517 break;
1518 case PACKET_FANOUT_CBPF:
f2e52095 1519 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1520 RCU_INIT_POINTER(f->bpf_prog, NULL);
1521 break;
1522 }
1523}
1524
1525static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1526{
1527 struct bpf_prog *old;
1528
1529 spin_lock(&f->lock);
1530 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1531 rcu_assign_pointer(f->bpf_prog, new);
1532 spin_unlock(&f->lock);
1533
1534 if (old) {
1535 synchronize_net();
1536 bpf_prog_destroy(old);
1537 }
1538}
1539
1540static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1541 unsigned int len)
1542{
1543 struct bpf_prog *new;
1544 struct sock_fprog fprog;
1545 int ret;
1546
1547 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1548 return -EPERM;
1549 if (len != sizeof(fprog))
1550 return -EINVAL;
1551 if (copy_from_user(&fprog, data, len))
1552 return -EFAULT;
1553
bab18991 1554 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
47dceb8e
WB
1555 if (ret)
1556 return ret;
1557
1558 __fanout_set_data_bpf(po->fanout, new);
1559 return 0;
1560}
1561
f2e52095
WB
1562static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1563 unsigned int len)
1564{
1565 struct bpf_prog *new;
1566 u32 fd;
1567
1568 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1569 return -EPERM;
1570 if (len != sizeof(fd))
1571 return -EINVAL;
1572 if (copy_from_user(&fd, data, len))
1573 return -EFAULT;
1574
113214be 1575 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
f2e52095
WB
1576 if (IS_ERR(new))
1577 return PTR_ERR(new);
f2e52095
WB
1578
1579 __fanout_set_data_bpf(po->fanout, new);
1580 return 0;
1581}
1582
47dceb8e
WB
1583static int fanout_set_data(struct packet_sock *po, char __user *data,
1584 unsigned int len)
1585{
1586 switch (po->fanout->type) {
1587 case PACKET_FANOUT_CBPF:
1588 return fanout_set_data_cbpf(po, data, len);
f2e52095
WB
1589 case PACKET_FANOUT_EBPF:
1590 return fanout_set_data_ebpf(po, data, len);
47dceb8e
WB
1591 default:
1592 return -EINVAL;
07d53ae4 1593 }
47dceb8e
WB
1594}
1595
1596static void fanout_release_data(struct packet_fanout *f)
1597{
1598 switch (f->type) {
1599 case PACKET_FANOUT_CBPF:
f2e52095 1600 case PACKET_FANOUT_EBPF:
47dceb8e 1601 __fanout_set_data_bpf(f, NULL);
07d53ae4 1602 }
47dceb8e
WB
1603}
1604
4a69a864
MM
1605static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1606{
1607 struct packet_fanout *f;
1608
1609 list_for_each_entry(f, &fanout_list, list) {
1610 if (f->id == candidate_id &&
1611 read_pnet(&f->net) == sock_net(sk)) {
1612 return false;
1613 }
1614 }
1615 return true;
1616}
1617
1618static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1619{
1620 u16 id = fanout_next_id;
1621
1622 do {
1623 if (__fanout_id_is_free(sk, id)) {
1624 *new_id = id;
1625 fanout_next_id = id + 1;
1626 return true;
1627 }
1628
1629 id++;
1630 } while (id != fanout_next_id);
1631
1632 return false;
1633}
1634
7736d33f 1635static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600 1636{
d199fab6 1637 struct packet_rollover *rollover = NULL;
dc99f600
DM
1638 struct packet_sock *po = pkt_sk(sk);
1639 struct packet_fanout *f, *match;
7736d33f 1640 u8 type = type_flags & 0xff;
77f65ebd 1641 u8 flags = type_flags >> 8;
dc99f600
DM
1642 int err;
1643
1644 switch (type) {
77f65ebd
WB
1645 case PACKET_FANOUT_ROLLOVER:
1646 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1647 return -EINVAL;
dc99f600
DM
1648 case PACKET_FANOUT_HASH:
1649 case PACKET_FANOUT_LB:
95ec3eb4 1650 case PACKET_FANOUT_CPU:
5df0ddfb 1651 case PACKET_FANOUT_RND:
2d36097d 1652 case PACKET_FANOUT_QM:
47dceb8e 1653 case PACKET_FANOUT_CBPF:
f2e52095 1654 case PACKET_FANOUT_EBPF:
dc99f600
DM
1655 break;
1656 default:
1657 return -EINVAL;
1658 }
1659
d199fab6
ED
1660 mutex_lock(&fanout_mutex);
1661
d199fab6 1662 err = -EALREADY;
dc99f600 1663 if (po->fanout)
d199fab6 1664 goto out;
dc99f600 1665
4633c9e0
WB
1666 if (type == PACKET_FANOUT_ROLLOVER ||
1667 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
d199fab6
ED
1668 err = -ENOMEM;
1669 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1670 if (!rollover)
1671 goto out;
1672 atomic_long_set(&rollover->num, 0);
1673 atomic_long_set(&rollover->num_huge, 0);
1674 atomic_long_set(&rollover->num_failed, 0);
0648ab70
WB
1675 }
1676
4a69a864
MM
1677 if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1678 if (id != 0) {
1679 err = -EINVAL;
1680 goto out;
1681 }
1682 if (!fanout_find_new_id(sk, &id)) {
1683 err = -ENOMEM;
1684 goto out;
1685 }
1686 /* ephemeral flag for the first socket in the group: drop it */
1687 flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
1688 }
1689
dc99f600
DM
1690 match = NULL;
1691 list_for_each_entry(f, &fanout_list, list) {
1692 if (f->id == id &&
1693 read_pnet(&f->net) == sock_net(sk)) {
1694 match = f;
1695 break;
1696 }
1697 }
afe62c68 1698 err = -EINVAL;
77f65ebd 1699 if (match && match->flags != flags)
afe62c68 1700 goto out;
dc99f600 1701 if (!match) {
afe62c68 1702 err = -ENOMEM;
dc99f600 1703 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1704 if (!match)
1705 goto out;
1706 write_pnet(&match->net, sock_net(sk));
1707 match->id = id;
1708 match->type = type;
77f65ebd 1709 match->flags = flags;
afe62c68
ED
1710 INIT_LIST_HEAD(&match->list);
1711 spin_lock_init(&match->lock);
fb5c2c17 1712 refcount_set(&match->sk_ref, 0);
47dceb8e 1713 fanout_init_data(match);
afe62c68
ED
1714 match->prot_hook.type = po->prot_hook.type;
1715 match->prot_hook.dev = po->prot_hook.dev;
1716 match->prot_hook.func = packet_rcv_fanout;
1717 match->prot_hook.af_packet_priv = match;
c0de08d0 1718 match->prot_hook.id_match = match_fanout_group;
afe62c68 1719 list_add(&match->list, &fanout_list);
dc99f600 1720 }
afe62c68 1721 err = -EINVAL;
008ba2a1
WB
1722
1723 spin_lock(&po->bind_lock);
1724 if (po->running &&
1725 match->type == type &&
afe62c68
ED
1726 match->prot_hook.type == po->prot_hook.type &&
1727 match->prot_hook.dev == po->prot_hook.dev) {
1728 err = -ENOSPC;
fb5c2c17 1729 if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
afe62c68
ED
1730 __dev_remove_pack(&po->prot_hook);
1731 po->fanout = match;
57f015f5
MM
1732 po->rollover = rollover;
1733 rollover = NULL;
fb5c2c17 1734 refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
afe62c68
ED
1735 __fanout_link(sk, po);
1736 err = 0;
dc99f600
DM
1737 }
1738 }
008ba2a1
WB
1739 spin_unlock(&po->bind_lock);
1740
1741 if (err && !refcount_read(&match->sk_ref)) {
1742 list_del(&match->list);
1743 kfree(match);
1744 }
1745
afe62c68 1746out:
57f015f5 1747 kfree(rollover);
d199fab6 1748 mutex_unlock(&fanout_mutex);
dc99f600
DM
1749 return err;
1750}
1751
2bd624b4
AS
1752/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1753 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1754 * It is the responsibility of the caller to call fanout_release_data() and
1755 * free the returned packet_fanout (after synchronize_net())
1756 */
1757static struct packet_fanout *fanout_release(struct sock *sk)
dc99f600
DM
1758{
1759 struct packet_sock *po = pkt_sk(sk);
1760 struct packet_fanout *f;
1761
fff3321d 1762 mutex_lock(&fanout_mutex);
d199fab6
ED
1763 f = po->fanout;
1764 if (f) {
1765 po->fanout = NULL;
1766
fb5c2c17 1767 if (refcount_dec_and_test(&f->sk_ref))
d199fab6 1768 list_del(&f->list);
2bd624b4
AS
1769 else
1770 f = NULL;
dc99f600
DM
1771 }
1772 mutex_unlock(&fanout_mutex);
2bd624b4
AS
1773
1774 return f;
dc99f600 1775}
1da177e4 1776
3c70c132
DB
1777static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1778 struct sk_buff *skb)
1779{
1780 /* Earlier code assumed this would be a VLAN pkt, double-check
1781 * this now that we have the actual packet in hand. We can only
1782 * do this check on Ethernet devices.
1783 */
1784 if (unlikely(dev->type != ARPHRD_ETHER))
1785 return false;
1786
1787 skb_reset_mac_header(skb);
1788 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1789}
1790
90ddc4f0 1791static const struct proto_ops packet_ops;
1da177e4 1792
90ddc4f0 1793static const struct proto_ops packet_ops_spkt;
1da177e4 1794
40d4e3df
ED
1795static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1796 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1797{
1798 struct sock *sk;
1799 struct sockaddr_pkt *spkt;
1800
1801 /*
1802 * When we registered the protocol we saved the socket in the data
1803 * field for just this event.
1804 */
1805
1806 sk = pt->af_packet_priv;
1ce4f28b 1807
1da177e4
LT
1808 /*
1809 * Yank back the headers [hope the device set this
1810 * right or kerboom...]
1811 *
1812 * Incoming packets have ll header pulled,
1813 * push it back.
1814 *
98e399f8 1815 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1816 * so that this procedure is noop.
1817 */
1818
1819 if (skb->pkt_type == PACKET_LOOPBACK)
1820 goto out;
1821
09ad9bc7 1822 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1823 goto out;
1824
40d4e3df
ED
1825 skb = skb_share_check(skb, GFP_ATOMIC);
1826 if (skb == NULL)
1da177e4
LT
1827 goto oom;
1828
1829 /* drop any routing info */
adf30907 1830 skb_dst_drop(skb);
1da177e4 1831
84531c24 1832 /* drop conntrack reference */
895b5c9f 1833 nf_reset_ct(skb);
84531c24 1834
ffbc6111 1835 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1836
98e399f8 1837 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1838
1839 /*
1840 * The SOCK_PACKET socket receives _all_ frames.
1841 */
1842
1843 spkt->spkt_family = dev->type;
1844 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1845 spkt->spkt_protocol = skb->protocol;
1846
1847 /*
1848 * Charge the memory to the socket. This is done specifically
1849 * to prevent sockets using all the memory up.
1850 */
1851
40d4e3df 1852 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1853 return 0;
1854
1855out:
1856 kfree_skb(skb);
1857oom:
1858 return 0;
1859}
1860
75c65772
MM
1861static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
1862{
18bed891
YK
1863 if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
1864 sock->type == SOCK_RAW) {
75c65772
MM
1865 skb_reset_mac_header(skb);
1866 skb->protocol = dev_parse_header_protocol(skb);
1867 }
1868
1869 skb_probe_transport_header(skb);
1870}
1da177e4
LT
1871
1872/*
1873 * Output a raw packet to a device layer. This bypasses all the other
1874 * protocol layers and you must therefore supply it with a complete frame
1875 */
1ce4f28b 1876
1b784140
YX
1877static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1878 size_t len)
1da177e4
LT
1879{
1880 struct sock *sk = sock->sk;
342dfc30 1881 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1882 struct sk_buff *skb = NULL;
1da177e4 1883 struct net_device *dev;
c14ac945 1884 struct sockcm_cookie sockc;
40d4e3df 1885 __be16 proto = 0;
1da177e4 1886 int err;
3bdc0eba 1887 int extra_len = 0;
1ce4f28b 1888
1da177e4 1889 /*
1ce4f28b 1890 * Get and verify the address.
1da177e4
LT
1891 */
1892
40d4e3df 1893 if (saddr) {
1da177e4 1894 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1895 return -EINVAL;
1896 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1897 proto = saddr->spkt_protocol;
1898 } else
1899 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1900
1901 /*
1ce4f28b 1902 * Find the device first to size check it
1da177e4
LT
1903 */
1904
de74e92a 1905 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1906retry:
654d1f8a
ED
1907 rcu_read_lock();
1908 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1909 err = -ENODEV;
1910 if (dev == NULL)
1911 goto out_unlock;
1ce4f28b 1912
d5e76b0a
DM
1913 err = -ENETDOWN;
1914 if (!(dev->flags & IFF_UP))
1915 goto out_unlock;
1916
1da177e4 1917 /*
40d4e3df
ED
1918 * You may not queue a frame bigger than the mtu. This is the lowest level
1919 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1920 */
1ce4f28b 1921
3bdc0eba
BG
1922 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1923 if (!netif_supports_nofcs(dev)) {
1924 err = -EPROTONOSUPPORT;
1925 goto out_unlock;
1926 }
1927 extra_len = 4; /* We're doing our own CRC */
1928 }
1929
1da177e4 1930 err = -EMSGSIZE;
3bdc0eba 1931 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1932 goto out_unlock;
1933
1a35ca80
ED
1934 if (!skb) {
1935 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1936 int tlen = dev->needed_tailroom;
1a35ca80
ED
1937 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1938
1939 rcu_read_unlock();
4ce40912 1940 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1941 if (skb == NULL)
1942 return -ENOBUFS;
1943 /* FIXME: Save some space for broken drivers that write a hard
1944 * header at transmission time by themselves. PPP is the notable
1945 * one here. This should really be fixed at the driver level.
1946 */
1947 skb_reserve(skb, reserved);
1948 skb_reset_network_header(skb);
1949
1950 /* Try to align data part correctly */
1951 if (hhlen) {
1952 skb->data -= hhlen;
1953 skb->tail -= hhlen;
1954 if (len < hhlen)
1955 skb_reset_network_header(skb);
1956 }
6ce8e9ce 1957 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1958 if (err)
1959 goto out_free;
1960 goto retry;
1da177e4
LT
1961 }
1962
9ed988cd
WB
1963 if (!dev_validate_header(dev, skb->data, len)) {
1964 err = -EINVAL;
1965 goto out_unlock;
1966 }
3c70c132
DB
1967 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1968 !packet_extra_vlan_len_allowed(dev, skb)) {
1969 err = -EMSGSIZE;
1970 goto out_unlock;
57f89bfa 1971 }
1a35ca80 1972
657a0667 1973 sockcm_init(&sockc, sk);
c14ac945
SHY
1974 if (msg->msg_controllen) {
1975 err = sock_cmsg_send(sk, msg, &sockc);
f8e7718c 1976 if (unlikely(err))
c14ac945 1977 goto out_unlock;
c14ac945
SHY
1978 }
1979
1da177e4
LT
1980 skb->protocol = proto;
1981 skb->dev = dev;
1982 skb->priority = sk->sk_priority;
2d37a186 1983 skb->mark = sk->sk_mark;
3d0ba8c0 1984 skb->tstamp = sockc.transmit_time;
bf84a010 1985
8f932f76 1986 skb_setup_tx_timestamp(skb, sockc.tsflags);
1da177e4 1987
3bdc0eba
BG
1988 if (unlikely(extra_len == 4))
1989 skb->no_fcs = 1;
1990
75c65772 1991 packet_parse_headers(skb, sock);
c1aad275 1992
1da177e4 1993 dev_queue_xmit(skb);
654d1f8a 1994 rcu_read_unlock();
40d4e3df 1995 return len;
1da177e4 1996
1da177e4 1997out_unlock:
654d1f8a 1998 rcu_read_unlock();
1a35ca80
ED
1999out_free:
2000 kfree_skb(skb);
1da177e4
LT
2001 return err;
2002}
1da177e4 2003
ff936a04
AS
2004static unsigned int run_filter(struct sk_buff *skb,
2005 const struct sock *sk,
2006 unsigned int res)
1da177e4
LT
2007{
2008 struct sk_filter *filter;
fda9ef5d 2009
80f8f102
ED
2010 rcu_read_lock();
2011 filter = rcu_dereference(sk->sk_filter);
dbcb5855 2012 if (filter != NULL)
ff936a04 2013 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 2014 rcu_read_unlock();
1da177e4 2015
dbcb5855 2016 return res;
1da177e4
LT
2017}
2018
16cc1400
WB
2019static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2020 size_t *len)
2021{
2022 struct virtio_net_hdr vnet_hdr;
2023
2024 if (*len < sizeof(vnet_hdr))
2025 return -EINVAL;
2026 *len -= sizeof(vnet_hdr);
2027
fd3a8862 2028 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
16cc1400
WB
2029 return -EINVAL;
2030
2031 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2032}
2033
1da177e4 2034/*
62ab0812
ED
2035 * This function makes lazy skb cloning in hope that most of packets
2036 * are discarded by BPF.
2037 *
2038 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
2039 * and skb->cb are mangled. It works because (and until) packets
2040 * falling here are owned by current CPU. Output packets are cloned
2041 * by dev_queue_xmit_nit(), input packets are processed by net_bh
2042 * sequencially, so that if we return skb to original state on exit,
2043 * we will not harm anyone.
1da177e4
LT
2044 */
2045
40d4e3df
ED
2046static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2047 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2048{
2049 struct sock *sk;
2050 struct sockaddr_ll *sll;
2051 struct packet_sock *po;
40d4e3df 2052 u8 *skb_head = skb->data;
1da177e4 2053 int skb_len = skb->len;
dbcb5855 2054 unsigned int snaplen, res;
da37845f 2055 bool is_drop_n_account = false;
1da177e4
LT
2056
2057 if (skb->pkt_type == PACKET_LOOPBACK)
2058 goto drop;
2059
2060 sk = pt->af_packet_priv;
2061 po = pkt_sk(sk);
2062
09ad9bc7 2063 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2064 goto drop;
2065
1da177e4
LT
2066 skb->dev = dev;
2067
3b04ddde 2068 if (dev->header_ops) {
1da177e4 2069 /* The device has an explicit notion of ll header,
62ab0812
ED
2070 * exported to higher levels.
2071 *
2072 * Otherwise, the device hides details of its frame
2073 * structure, so that corresponding packet head is
2074 * never delivered to user.
1da177e4
LT
2075 */
2076 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2077 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2078 else if (skb->pkt_type == PACKET_OUTGOING) {
2079 /* Special case: outgoing packets have ll header at head */
bbe735e4 2080 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2081 }
2082 }
2083
2084 snaplen = skb->len;
2085
dbcb5855
DM
2086 res = run_filter(skb, sk, snaplen);
2087 if (!res)
fda9ef5d 2088 goto drop_n_restore;
dbcb5855
DM
2089 if (snaplen > res)
2090 snaplen = res;
1da177e4 2091
0fd7bac6 2092 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2093 goto drop_n_acct;
2094
2095 if (skb_shared(skb)) {
2096 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2097 if (nskb == NULL)
2098 goto drop_n_acct;
2099
2100 if (skb_head != skb->data) {
2101 skb->data = skb_head;
2102 skb->len = skb_len;
2103 }
abc4e4fa 2104 consume_skb(skb);
1da177e4
LT
2105 skb = nskb;
2106 }
2107
b4772ef8 2108 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2109
2110 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2111 sll->sll_hatype = dev->type;
1da177e4 2112 sll->sll_pkttype = skb->pkt_type;
8032b464 2113 if (unlikely(po->origdev))
80feaacb
PWJ
2114 sll->sll_ifindex = orig_dev->ifindex;
2115 else
2116 sll->sll_ifindex = dev->ifindex;
1da177e4 2117
b95cce35 2118 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2119
2472d761
EB
2120 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2121 * Use their space for storing the original skb length.
2122 */
2123 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2124
1da177e4
LT
2125 if (pskb_trim(skb, snaplen))
2126 goto drop_n_acct;
2127
2128 skb_set_owner_r(skb, sk);
2129 skb->dev = NULL;
adf30907 2130 skb_dst_drop(skb);
1da177e4 2131
84531c24 2132 /* drop conntrack reference */
895b5c9f 2133 nf_reset_ct(skb);
84531c24 2134
1da177e4 2135 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2136 po->stats.stats1.tp_packets++;
3bc3b96f 2137 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
2138 __skb_queue_tail(&sk->sk_receive_queue, skb);
2139 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2140 sk->sk_data_ready(sk);
1da177e4
LT
2141 return 0;
2142
2143drop_n_acct:
da37845f 2144 is_drop_n_account = true;
8e8e2951 2145 atomic_inc(&po->tp_drops);
7091fbd8 2146 atomic_inc(&sk->sk_drops);
1da177e4
LT
2147
2148drop_n_restore:
2149 if (skb_head != skb->data && skb_shared(skb)) {
2150 skb->data = skb_head;
2151 skb->len = skb_len;
2152 }
2153drop:
da37845f
WJ
2154 if (!is_drop_n_account)
2155 consume_skb(skb);
2156 else
2157 kfree_skb(skb);
1da177e4
LT
2158 return 0;
2159}
2160
40d4e3df
ED
2161static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2162 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2163{
2164 struct sock *sk;
2165 struct packet_sock *po;
2166 struct sockaddr_ll *sll;
184f489e 2167 union tpacket_uhdr h;
40d4e3df 2168 u8 *skb_head = skb->data;
1da177e4 2169 int skb_len = skb->len;
dbcb5855 2170 unsigned int snaplen, res;
f6fb8f10 2171 unsigned long status = TP_STATUS_USER;
7b52b013
OC
2172 unsigned short macoff, hdrlen;
2173 unsigned int netoff;
1da177e4 2174 struct sk_buff *copy_skb = NULL;
bbd6ef87 2175 struct timespec ts;
b9c32fb2 2176 __u32 ts_status;
da37845f 2177 bool is_drop_n_account = false;
f897d759 2178 unsigned int slot_id = 0;
edbd58be 2179 bool do_vnet = false;
1da177e4 2180
51846355
AW
2181 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2182 * We may add members to them until current aligned size without forcing
2183 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2184 */
2185 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2186 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2187
1da177e4
LT
2188 if (skb->pkt_type == PACKET_LOOPBACK)
2189 goto drop;
2190
2191 sk = pt->af_packet_priv;
2192 po = pkt_sk(sk);
2193
09ad9bc7 2194 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2195 goto drop;
2196
3b04ddde 2197 if (dev->header_ops) {
1da177e4 2198 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2199 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2200 else if (skb->pkt_type == PACKET_OUTGOING) {
2201 /* Special case: outgoing packets have ll header at head */
bbe735e4 2202 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2203 }
2204 }
2205
2206 snaplen = skb->len;
2207
dbcb5855
DM
2208 res = run_filter(skb, sk, snaplen);
2209 if (!res)
fda9ef5d 2210 goto drop_n_restore;
68c2e5de 2211
2c51c627
ED
2212 /* If we are flooded, just give up */
2213 if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
2214 atomic_inc(&po->tp_drops);
2215 goto drop_n_restore;
2216 }
2217
68c2e5de
AD
2218 if (skb->ip_summed == CHECKSUM_PARTIAL)
2219 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2220 else if (skb->pkt_type != PACKET_OUTGOING &&
2221 (skb->ip_summed == CHECKSUM_COMPLETE ||
2222 skb_csum_unnecessary(skb)))
2223 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2224
dbcb5855
DM
2225 if (snaplen > res)
2226 snaplen = res;
1da177e4
LT
2227
2228 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2229 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2230 po->tp_reserve;
1da177e4 2231 } else {
95c96174 2232 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2233 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2234 (maclen < 16 ? 16 : maclen)) +
58d19b19 2235 po->tp_reserve;
edbd58be 2236 if (po->has_vnet_hdr) {
58d19b19 2237 netoff += sizeof(struct virtio_net_hdr);
edbd58be
BP
2238 do_vnet = true;
2239 }
1da177e4
LT
2240 macoff = netoff - maclen;
2241 }
7b52b013
OC
2242 if (netoff > USHRT_MAX) {
2243 atomic_inc(&po->tp_drops);
2244 goto drop_n_restore;
2245 }
f6fb8f10 2246 if (po->tp_version <= TPACKET_V2) {
2247 if (macoff + snaplen > po->rx_ring.frame_size) {
2248 if (po->copy_thresh &&
0fd7bac6 2249 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2250 if (skb_shared(skb)) {
2251 copy_skb = skb_clone(skb, GFP_ATOMIC);
2252 } else {
2253 copy_skb = skb_get(skb);
2254 skb_head = skb->data;
2255 }
2256 if (copy_skb)
2257 skb_set_owner_r(copy_skb, sk);
1da177e4 2258 }
f6fb8f10 2259 snaplen = po->rx_ring.frame_size - macoff;
edbd58be 2260 if ((int)snaplen < 0) {
f6fb8f10 2261 snaplen = 0;
edbd58be
BP
2262 do_vnet = false;
2263 }
1da177e4 2264 }
dc808110
ED
2265 } else if (unlikely(macoff + snaplen >
2266 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2267 u32 nval;
2268
2269 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2270 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2271 snaplen, nval, macoff);
2272 snaplen = nval;
2273 if (unlikely((int)snaplen < 0)) {
2274 snaplen = 0;
2275 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
edbd58be 2276 do_vnet = false;
dc808110 2277 }
1da177e4 2278 }
1da177e4 2279 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2280 h.raw = packet_current_rx_frame(po, skb,
2281 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2282 if (!h.raw)
58d19b19 2283 goto drop_n_account;
41442444 2284
f897d759
WB
2285 if (po->tp_version <= TPACKET_V2) {
2286 slot_id = po->rx_ring.head;
2287 if (test_bit(slot_id, po->rx_ring.rx_owner_map))
2288 goto drop_n_account;
2289 __set_bit(slot_id, po->rx_ring.rx_owner_map);
2290 }
2291
41442444
WB
2292 if (do_vnet &&
2293 virtio_net_hdr_from_skb(skb, h.raw + macoff -
2294 sizeof(struct virtio_net_hdr),
014dc8b9
JO
2295 vio_le(), true, 0)) {
2296 if (po->tp_version == TPACKET_V3)
2297 prb_clear_blk_fill_status(&po->rx_ring);
41442444 2298 goto drop_n_account;
014dc8b9 2299 }
41442444 2300
f6fb8f10 2301 if (po->tp_version <= TPACKET_V2) {
2302 packet_increment_rx_head(po, &po->rx_ring);
2303 /*
2304 * LOSING will be reported till you read the stats,
2305 * because it's COR - Clear On Read.
2306 * Anyways, moving it for V1/V2 only as V3 doesn't need this
2307 * at packet level.
2308 */
8e8e2951 2309 if (atomic_read(&po->tp_drops))
f6fb8f10 2310 status |= TP_STATUS_LOSING;
2311 }
945d015e 2312
ee80fbf3 2313 po->stats.stats1.tp_packets++;
1da177e4
LT
2314 if (copy_skb) {
2315 status |= TP_STATUS_COPY;
2316 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2317 }
1da177e4
LT
2318 spin_unlock(&sk->sk_receive_queue.lock);
2319
bbd6ef87 2320 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2321
2322 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2323 getnstimeofday(&ts);
1da177e4 2324
b9c32fb2
DB
2325 status |= ts_status;
2326
bbd6ef87
PM
2327 switch (po->tp_version) {
2328 case TPACKET_V1:
2329 h.h1->tp_len = skb->len;
2330 h.h1->tp_snaplen = snaplen;
2331 h.h1->tp_mac = macoff;
2332 h.h1->tp_net = netoff;
4b457bdf
DB
2333 h.h1->tp_sec = ts.tv_sec;
2334 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2335 hdrlen = sizeof(*h.h1);
2336 break;
2337 case TPACKET_V2:
2338 h.h2->tp_len = skb->len;
2339 h.h2->tp_snaplen = snaplen;
2340 h.h2->tp_mac = macoff;
2341 h.h2->tp_net = netoff;
bbd6ef87
PM
2342 h.h2->tp_sec = ts.tv_sec;
2343 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2344 if (skb_vlan_tag_present(skb)) {
2345 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2346 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2347 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2348 } else {
2349 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2350 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2351 }
e4d26f4b 2352 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2353 hdrlen = sizeof(*h.h2);
2354 break;
f6fb8f10 2355 case TPACKET_V3:
2356 /* tp_nxt_offset,vlan are already populated above.
2357 * So DONT clear those fields here
2358 */
2359 h.h3->tp_status |= status;
2360 h.h3->tp_len = skb->len;
2361 h.h3->tp_snaplen = snaplen;
2362 h.h3->tp_mac = macoff;
2363 h.h3->tp_net = netoff;
f6fb8f10 2364 h.h3->tp_sec = ts.tv_sec;
2365 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2366 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2367 hdrlen = sizeof(*h.h3);
2368 break;
bbd6ef87
PM
2369 default:
2370 BUG();
2371 }
1da177e4 2372
bbd6ef87 2373 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2374 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2375 sll->sll_family = AF_PACKET;
2376 sll->sll_hatype = dev->type;
2377 sll->sll_protocol = skb->protocol;
2378 sll->sll_pkttype = skb->pkt_type;
8032b464 2379 if (unlikely(po->origdev))
80feaacb
PWJ
2380 sll->sll_ifindex = orig_dev->ifindex;
2381 else
2382 sll->sll_ifindex = dev->ifindex;
1da177e4 2383
e16aa207 2384 smp_mb();
f0d4eb29 2385
f6dafa95 2386#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2387 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2388 u8 *start, *end;
2389
f0d4eb29
DB
2390 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2391 macoff + snaplen);
2392
2393 for (start = h.raw; start < end; start += PAGE_SIZE)
2394 flush_dcache_page(pgv_to_page(start));
1da177e4 2395 }
f0d4eb29 2396 smp_wmb();
f6dafa95 2397#endif
f0d4eb29 2398
da413eec 2399 if (po->tp_version <= TPACKET_V2) {
f897d759 2400 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2401 __packet_set_status(po, h.raw, status);
f897d759
WB
2402 __clear_bit(slot_id, po->rx_ring.rx_owner_map);
2403 spin_unlock(&sk->sk_receive_queue.lock);
da413eec 2404 sk->sk_data_ready(sk);
014dc8b9 2405 } else if (po->tp_version == TPACKET_V3) {
f6fb8f10 2406 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2407 }
1da177e4
LT
2408
2409drop_n_restore:
2410 if (skb_head != skb->data && skb_shared(skb)) {
2411 skb->data = skb_head;
2412 skb->len = skb_len;
2413 }
2414drop:
da37845f
WJ
2415 if (!is_drop_n_account)
2416 consume_skb(skb);
2417 else
2418 kfree_skb(skb);
1da177e4
LT
2419 return 0;
2420
58d19b19 2421drop_n_account:
1da177e4 2422 spin_unlock(&sk->sk_receive_queue.lock);
8e8e2951
ED
2423 atomic_inc(&po->tp_drops);
2424 is_drop_n_account = true;
1da177e4 2425
676d2369 2426 sk->sk_data_ready(sk);
acb5d75b 2427 kfree_skb(copy_skb);
1da177e4
LT
2428 goto drop_n_restore;
2429}
2430
69e3c75f
JB
2431static void tpacket_destruct_skb(struct sk_buff *skb)
2432{
2433 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2434
69e3c75f 2435 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2436 void *ph;
b9c32fb2
DB
2437 __u32 ts;
2438
5cd8d46e 2439 ph = skb_zcopy_get_nouarg(skb);
b0138408 2440 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2441
2442 ts = __packet_set_timestamp(po, ph, skb);
2443 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
89ed5b51
NH
2444
2445 if (!packet_read_pending(&po->tx_ring))
2446 complete(&po->skb_completion);
69e3c75f
JB
2447 }
2448
2449 sock_wfree(skb);
2450}
2451
16cc1400
WB
2452static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2453{
16cc1400
WB
2454 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2455 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2456 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2457 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2458 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2459 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2460 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2461
2462 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2463 return -EINVAL;
2464
16cc1400
WB
2465 return 0;
2466}
2467
2468static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2469 struct virtio_net_hdr *vnet_hdr)
2470{
16cc1400
WB
2471 if (*len < sizeof(*vnet_hdr))
2472 return -EINVAL;
2473 *len -= sizeof(*vnet_hdr);
2474
cbbd26b8 2475 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
16cc1400
WB
2476 return -EFAULT;
2477
2478 return __packet_snd_vnet_parse(vnet_hdr, *len);
2479}
2480
40d4e3df 2481static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2482 void *frame, struct net_device *dev, void *data, int tp_len,
c14ac945
SHY
2483 __be16 proto, unsigned char *addr, int hlen, int copylen,
2484 const struct sockcm_cookie *sockc)
69e3c75f 2485{
184f489e 2486 union tpacket_uhdr ph;
8d39b4a6 2487 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2488 struct socket *sock = po->sk.sk_socket;
2489 struct page *page;
69e3c75f
JB
2490 int err;
2491
2492 ph.raw = frame;
2493
2494 skb->protocol = proto;
2495 skb->dev = dev;
2496 skb->priority = po->sk.sk_priority;
2d37a186 2497 skb->mark = po->sk.sk_mark;
3d0ba8c0 2498 skb->tstamp = sockc->transmit_time;
8f932f76 2499 skb_setup_tx_timestamp(skb, sockc->tsflags);
5cd8d46e 2500 skb_zcopy_set_nouarg(skb, ph.raw);
69e3c75f 2501
ae641949 2502 skb_reserve(skb, hlen);
69e3c75f 2503 skb_reset_network_header(skb);
c1aad275 2504
69e3c75f
JB
2505 to_write = tp_len;
2506
2507 if (sock->type == SOCK_DGRAM) {
2508 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2509 NULL, tp_len);
2510 if (unlikely(err < 0))
2511 return -EINVAL;
1d036d25 2512 } else if (copylen) {
9ed988cd
WB
2513 int hdrlen = min_t(int, copylen, tp_len);
2514
69e3c75f 2515 skb_push(skb, dev->hard_header_len);
1d036d25 2516 skb_put(skb, copylen - dev->hard_header_len);
9ed988cd 2517 err = skb_store_bits(skb, 0, data, hdrlen);
69e3c75f
JB
2518 if (unlikely(err))
2519 return err;
9ed988cd
WB
2520 if (!dev_validate_header(dev, skb->data, hdrlen))
2521 return -EINVAL;
69e3c75f 2522
9ed988cd
WB
2523 data += hdrlen;
2524 to_write -= hdrlen;
69e3c75f
JB
2525 }
2526
69e3c75f
JB
2527 offset = offset_in_page(data);
2528 len_max = PAGE_SIZE - offset;
2529 len = ((to_write > len_max) ? len_max : to_write);
2530
2531 skb->data_len = to_write;
2532 skb->len += to_write;
2533 skb->truesize += to_write;
14afee4b 2534 refcount_add(to_write, &po->sk.sk_wmem_alloc);
69e3c75f
JB
2535
2536 while (likely(to_write)) {
2537 nr_frags = skb_shinfo(skb)->nr_frags;
2538
2539 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2540 pr_err("Packet exceed the number of skb frags(%lu)\n",
2541 MAX_SKB_FRAGS);
69e3c75f
JB
2542 return -EFAULT;
2543 }
2544
0af55bb5
CG
2545 page = pgv_to_page(data);
2546 data += len;
69e3c75f
JB
2547 flush_dcache_page(page);
2548 get_page(page);
0af55bb5 2549 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2550 to_write -= len;
2551 offset = 0;
2552 len_max = PAGE_SIZE;
2553 len = ((to_write > len_max) ? len_max : to_write);
2554 }
2555
75c65772 2556 packet_parse_headers(skb, sock);
efdfa2f7 2557
69e3c75f
JB
2558 return tp_len;
2559}
2560
8d39b4a6
WB
2561static int tpacket_parse_header(struct packet_sock *po, void *frame,
2562 int size_max, void **data)
2563{
2564 union tpacket_uhdr ph;
2565 int tp_len, off;
2566
2567 ph.raw = frame;
2568
2569 switch (po->tp_version) {
7f953ab2
SV
2570 case TPACKET_V3:
2571 if (ph.h3->tp_next_offset != 0) {
2572 pr_warn_once("variable sized slot not supported");
2573 return -EINVAL;
2574 }
2575 tp_len = ph.h3->tp_len;
2576 break;
8d39b4a6
WB
2577 case TPACKET_V2:
2578 tp_len = ph.h2->tp_len;
2579 break;
2580 default:
2581 tp_len = ph.h1->tp_len;
2582 break;
2583 }
2584 if (unlikely(tp_len > size_max)) {
2585 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2586 return -EMSGSIZE;
2587 }
2588
2589 if (unlikely(po->tp_tx_has_off)) {
2590 int off_min, off_max;
2591
2592 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2593 off_max = po->tx_ring.frame_size - tp_len;
2594 if (po->sk.sk_type == SOCK_DGRAM) {
2595 switch (po->tp_version) {
7f953ab2
SV
2596 case TPACKET_V3:
2597 off = ph.h3->tp_net;
2598 break;
8d39b4a6
WB
2599 case TPACKET_V2:
2600 off = ph.h2->tp_net;
2601 break;
2602 default:
2603 off = ph.h1->tp_net;
2604 break;
2605 }
2606 } else {
2607 switch (po->tp_version) {
7f953ab2
SV
2608 case TPACKET_V3:
2609 off = ph.h3->tp_mac;
2610 break;
8d39b4a6
WB
2611 case TPACKET_V2:
2612 off = ph.h2->tp_mac;
2613 break;
2614 default:
2615 off = ph.h1->tp_mac;
2616 break;
2617 }
2618 }
2619 if (unlikely((off < off_min) || (off_max < off)))
2620 return -EINVAL;
2621 } else {
2622 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2623 }
2624
2625 *data = frame + off;
2626 return tp_len;
2627}
2628
69e3c75f
JB
2629static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2630{
89ed5b51 2631 struct sk_buff *skb = NULL;
69e3c75f 2632 struct net_device *dev;
1d036d25 2633 struct virtio_net_hdr *vnet_hdr = NULL;
c14ac945 2634 struct sockcm_cookie sockc;
69e3c75f 2635 __be16 proto;
09effa67 2636 int err, reserve = 0;
40d4e3df 2637 void *ph;
342dfc30 2638 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2639 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
486efdc8 2640 unsigned char *addr = NULL;
69e3c75f 2641 int tp_len, size_max;
8d39b4a6 2642 void *data;
69e3c75f 2643 int len_sum = 0;
9e67030a 2644 int status = TP_STATUS_AVAILABLE;
1d036d25 2645 int hlen, tlen, copylen = 0;
89ed5b51 2646 long timeo = 0;
69e3c75f 2647
69e3c75f
JB
2648 mutex_lock(&po->pg_vec_lock);
2649
32d3182c
ED
2650 /* packet_sendmsg() check on tx_ring.pg_vec was lockless,
2651 * we need to confirm it under protection of pg_vec_lock.
2652 */
2653 if (unlikely(!po->tx_ring.pg_vec)) {
2654 err = -EBUSY;
2655 goto out;
2656 }
66e56cd4 2657 if (likely(saddr == NULL)) {
e40526cb 2658 dev = packet_cached_dev_get(po);
414b5431 2659 proto = READ_ONCE(po->num);
69e3c75f
JB
2660 } else {
2661 err = -EINVAL;
2662 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2663 goto out;
2664 if (msg->msg_namelen < (saddr->sll_halen
2665 + offsetof(struct sockaddr_ll,
2666 sll_addr)))
2667 goto out;
69e3c75f 2668 proto = saddr->sll_protocol;
827d9780 2669 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
486efdc8
WB
2670 if (po->sk.sk_socket->type == SOCK_DGRAM) {
2671 if (dev && msg->msg_namelen < dev->addr_len +
2672 offsetof(struct sockaddr_ll, sll_addr))
2673 goto out_put;
2674 addr = saddr->sll_addr;
2675 }
69e3c75f
JB
2676 }
2677
69e3c75f
JB
2678 err = -ENXIO;
2679 if (unlikely(dev == NULL))
2680 goto out;
69e3c75f
JB
2681 err = -ENETDOWN;
2682 if (unlikely(!(dev->flags & IFF_UP)))
2683 goto out_put;
2684
657a0667 2685 sockcm_init(&sockc, &po->sk);
d19b183c
DCS
2686 if (msg->msg_controllen) {
2687 err = sock_cmsg_send(&po->sk, msg, &sockc);
2688 if (unlikely(err))
2689 goto out_put;
2690 }
2691
5cfb4c8d
DB
2692 if (po->sk.sk_socket->type == SOCK_RAW)
2693 reserve = dev->hard_header_len;
69e3c75f 2694 size_max = po->tx_ring.frame_size
b5dd884e 2695 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2696
1d036d25 2697 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
5cfb4c8d 2698 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2699
89ed5b51
NH
2700 reinit_completion(&po->skb_completion);
2701
69e3c75f
JB
2702 do {
2703 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2704 TP_STATUS_SEND_REQUEST);
69e3c75f 2705 if (unlikely(ph == NULL)) {
89ed5b51
NH
2706 if (need_wait && skb) {
2707 timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
2708 timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
2709 if (timeo <= 0) {
2710 err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
2711 goto out_put;
2712 }
2713 }
2714 /* check for additional frames */
69e3c75f
JB
2715 continue;
2716 }
2717
8d39b4a6
WB
2718 skb = NULL;
2719 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2720 if (tp_len < 0)
2721 goto tpacket_error;
2722
69e3c75f 2723 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2724 hlen = LL_RESERVED_SPACE(dev);
2725 tlen = dev->needed_tailroom;
1d036d25
WB
2726 if (po->has_vnet_hdr) {
2727 vnet_hdr = data;
2728 data += sizeof(*vnet_hdr);
2729 tp_len -= sizeof(*vnet_hdr);
2730 if (tp_len < 0 ||
2731 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2732 tp_len = -EINVAL;
2733 goto tpacket_error;
2734 }
2735 copylen = __virtio16_to_cpu(vio_le(),
2736 vnet_hdr->hdr_len);
2737 }
9ed988cd 2738 copylen = max_t(int, copylen, dev->hard_header_len);
69e3c75f 2739 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2740 hlen + tlen + sizeof(struct sockaddr_ll) +
2741 (copylen - dev->hard_header_len),
fbf33a28 2742 !need_wait, &err);
69e3c75f 2743
fbf33a28
KM
2744 if (unlikely(skb == NULL)) {
2745 /* we assume the socket was initially writeable ... */
2746 if (likely(len_sum > 0))
2747 err = len_sum;
69e3c75f 2748 goto out_status;
fbf33a28 2749 }
8d39b4a6 2750 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
c14ac945 2751 addr, hlen, copylen, &sockc);
dbd46ab4 2752 if (likely(tp_len >= 0) &&
5cfb4c8d 2753 tp_len > dev->mtu + reserve &&
1d036d25 2754 !po->has_vnet_hdr &&
3c70c132
DB
2755 !packet_extra_vlan_len_allowed(dev, skb))
2756 tp_len = -EMSGSIZE;
69e3c75f
JB
2757
2758 if (unlikely(tp_len < 0)) {
8d39b4a6 2759tpacket_error:
69e3c75f
JB
2760 if (po->tp_loss) {
2761 __packet_set_status(po, ph,
2762 TP_STATUS_AVAILABLE);
2763 packet_increment_head(&po->tx_ring);
2764 kfree_skb(skb);
2765 continue;
2766 } else {
2767 status = TP_STATUS_WRONG_FORMAT;
2768 err = tp_len;
2769 goto out_status;
2770 }
2771 }
2772
9d2f67e4
JT
2773 if (po->has_vnet_hdr) {
2774 if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
2775 tp_len = -EINVAL;
2776 goto tpacket_error;
2777 }
2778 virtio_net_hdr_set_proto(skb, vnet_hdr);
1d036d25
WB
2779 }
2780
69e3c75f
JB
2781 skb->destructor = tpacket_destruct_skb;
2782 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2783 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2784
2785 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2786 err = po->xmit(skb);
eb70df13
JP
2787 if (unlikely(err > 0)) {
2788 err = net_xmit_errno(err);
2789 if (err && __packet_get_status(po, ph) ==
2790 TP_STATUS_AVAILABLE) {
2791 /* skb was destructed already */
2792 skb = NULL;
2793 goto out_status;
2794 }
2795 /*
2796 * skb was dropped but not destructed yet;
2797 * let's treat it like congestion or err < 0
2798 */
2799 err = 0;
2800 }
69e3c75f
JB
2801 packet_increment_head(&po->tx_ring);
2802 len_sum += tp_len;
b0138408
DB
2803 } while (likely((ph != NULL) ||
2804 /* Note: packet_read_pending() might be slow if we have
2805 * to call it as it's per_cpu variable, but in fast-path
2806 * we already short-circuit the loop with the first
2807 * condition, and luckily don't have to go that path
2808 * anyway.
2809 */
2810 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2811
2812 err = len_sum;
2813 goto out_put;
2814
69e3c75f
JB
2815out_status:
2816 __packet_set_status(po, ph, status);
2817 kfree_skb(skb);
2818out_put:
e40526cb 2819 dev_put(dev);
69e3c75f
JB
2820out:
2821 mutex_unlock(&po->pg_vec_lock);
2822 return err;
2823}
69e3c75f 2824
eea49cc9
OJ
2825static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2826 size_t reserve, size_t len,
2827 size_t linear, int noblock,
2828 int *err)
bfd5f4a3
SS
2829{
2830 struct sk_buff *skb;
2831
2832 /* Under a page? Don't bother with paged skb. */
2833 if (prepad + len < PAGE_SIZE || !linear)
2834 linear = len;
2835
2836 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2837 err, 0);
bfd5f4a3
SS
2838 if (!skb)
2839 return NULL;
2840
2841 skb_reserve(skb, reserve);
2842 skb_put(skb, linear);
2843 skb->data_len = len - linear;
2844 skb->len += len - linear;
2845
2846 return skb;
2847}
2848
d346a3fa 2849static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2850{
2851 struct sock *sk = sock->sk;
342dfc30 2852 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2853 struct sk_buff *skb;
2854 struct net_device *dev;
0e11c91e 2855 __be16 proto;
486efdc8 2856 unsigned char *addr = NULL;
827d9780 2857 int err, reserve = 0;
c7d39e32 2858 struct sockcm_cookie sockc;
bfd5f4a3
SS
2859 struct virtio_net_hdr vnet_hdr = { 0 };
2860 int offset = 0;
bfd5f4a3 2861 struct packet_sock *po = pkt_sk(sk);
da7c9561 2862 bool has_vnet_hdr = false;
57031eb7 2863 int hlen, tlen, linear;
3bdc0eba 2864 int extra_len = 0;
1da177e4
LT
2865
2866 /*
1ce4f28b 2867 * Get and verify the address.
1da177e4 2868 */
1ce4f28b 2869
66e56cd4 2870 if (likely(saddr == NULL)) {
e40526cb 2871 dev = packet_cached_dev_get(po);
414b5431 2872 proto = READ_ONCE(po->num);
1da177e4
LT
2873 } else {
2874 err = -EINVAL;
2875 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2876 goto out;
0fb375fb
EB
2877 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2878 goto out;
1da177e4 2879 proto = saddr->sll_protocol;
827d9780 2880 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
486efdc8
WB
2881 if (sock->type == SOCK_DGRAM) {
2882 if (dev && msg->msg_namelen < dev->addr_len +
2883 offsetof(struct sockaddr_ll, sll_addr))
2884 goto out_unlock;
2885 addr = saddr->sll_addr;
2886 }
1da177e4
LT
2887 }
2888
1da177e4 2889 err = -ENXIO;
e40526cb 2890 if (unlikely(dev == NULL))
1da177e4 2891 goto out_unlock;
d5e76b0a 2892 err = -ENETDOWN;
e40526cb 2893 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2894 goto out_unlock;
2895
657a0667 2896 sockcm_init(&sockc, sk);
c7d39e32
EJ
2897 sockc.mark = sk->sk_mark;
2898 if (msg->msg_controllen) {
2899 err = sock_cmsg_send(sk, msg, &sockc);
2900 if (unlikely(err))
2901 goto out_unlock;
2902 }
2903
e40526cb
DB
2904 if (sock->type == SOCK_RAW)
2905 reserve = dev->hard_header_len;
bfd5f4a3 2906 if (po->has_vnet_hdr) {
16cc1400
WB
2907 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2908 if (err)
bfd5f4a3 2909 goto out_unlock;
da7c9561 2910 has_vnet_hdr = true;
bfd5f4a3
SS
2911 }
2912
3bdc0eba
BG
2913 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2914 if (!netif_supports_nofcs(dev)) {
2915 err = -EPROTONOSUPPORT;
2916 goto out_unlock;
2917 }
2918 extra_len = 4; /* We're doing our own CRC */
2919 }
2920
1da177e4 2921 err = -EMSGSIZE;
16cc1400
WB
2922 if (!vnet_hdr.gso_type &&
2923 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2924 goto out_unlock;
2925
bfd5f4a3 2926 err = -ENOBUFS;
ae641949
HX
2927 hlen = LL_RESERVED_SPACE(dev);
2928 tlen = dev->needed_tailroom;
57031eb7
WB
2929 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2930 linear = max(linear, min_t(int, len, dev->hard_header_len));
2931 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
bfd5f4a3 2932 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2933 if (skb == NULL)
1da177e4
LT
2934 goto out_unlock;
2935
b84bbaf7 2936 skb_reset_network_header(skb);
1da177e4 2937
0c4e8581 2938 err = -EINVAL;
9c707762
WB
2939 if (sock->type == SOCK_DGRAM) {
2940 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2941 if (unlikely(offset < 0))
9c707762 2942 goto out_free;
b84bbaf7 2943 } else if (reserve) {
9aad13b0 2944 skb_reserve(skb, -reserve);
88a8121d
ND
2945 if (len < reserve + sizeof(struct ipv6hdr) &&
2946 dev->min_header_len != dev->hard_header_len)
993675a3 2947 skb_reset_network_header(skb);
9c707762 2948 }
1da177e4
LT
2949
2950 /* Returns -EFAULT on error */
c0371da6 2951 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2952 if (err)
2953 goto out_free;
bf84a010 2954
9ed988cd
WB
2955 if (sock->type == SOCK_RAW &&
2956 !dev_validate_header(dev, skb->data, len)) {
2957 err = -EINVAL;
2958 goto out_free;
2959 }
2960
8f932f76 2961 skb_setup_tx_timestamp(skb, sockc.tsflags);
1da177e4 2962
16cc1400 2963 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
2964 !packet_extra_vlan_len_allowed(dev, skb)) {
2965 err = -EMSGSIZE;
2966 goto out_free;
57f89bfa
BG
2967 }
2968
09effa67
DM
2969 skb->protocol = proto;
2970 skb->dev = dev;
1da177e4 2971 skb->priority = sk->sk_priority;
c7d39e32 2972 skb->mark = sockc.mark;
3d0ba8c0 2973 skb->tstamp = sockc.transmit_time;
0fd5d57b 2974
da7c9561 2975 if (has_vnet_hdr) {
db60eb5f 2976 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
16cc1400
WB
2977 if (err)
2978 goto out_free;
2979 len += sizeof(vnet_hdr);
9d2f67e4 2980 virtio_net_hdr_set_proto(skb, &vnet_hdr);
bfd5f4a3
SS
2981 }
2982
75c65772 2983 packet_parse_headers(skb, sock);
8fd6c80d 2984
3bdc0eba
BG
2985 if (unlikely(extra_len == 4))
2986 skb->no_fcs = 1;
2987
d346a3fa 2988 err = po->xmit(skb);
1da177e4
LT
2989 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2990 goto out_unlock;
2991
e40526cb 2992 dev_put(dev);
1da177e4 2993
40d4e3df 2994 return len;
1da177e4
LT
2995
2996out_free:
2997 kfree_skb(skb);
2998out_unlock:
e40526cb 2999 if (dev)
1da177e4
LT
3000 dev_put(dev);
3001out:
3002 return err;
3003}
3004
1b784140 3005static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 3006{
69e3c75f
JB
3007 struct sock *sk = sock->sk;
3008 struct packet_sock *po = pkt_sk(sk);
d346a3fa 3009
69e3c75f
JB
3010 if (po->tx_ring.pg_vec)
3011 return tpacket_snd(po, msg);
3012 else
69e3c75f
JB
3013 return packet_snd(sock, msg, len);
3014}
3015
1da177e4
LT
3016/*
3017 * Close a PACKET socket. This is fairly simple. We immediately go
3018 * to 'closed' state and remove our protocol entry in the device list.
3019 */
3020
3021static int packet_release(struct socket *sock)
3022{
3023 struct sock *sk = sock->sk;
3024 struct packet_sock *po;
2bd624b4 3025 struct packet_fanout *f;
d12d01d6 3026 struct net *net;
f6fb8f10 3027 union tpacket_req_u req_u;
1da177e4
LT
3028
3029 if (!sk)
3030 return 0;
3031
3b1e0a65 3032 net = sock_net(sk);
1da177e4
LT
3033 po = pkt_sk(sk);
3034
0fa7fa98 3035 mutex_lock(&net->packet.sklist_lock);
808f5114 3036 sk_del_node_init_rcu(sk);
0fa7fa98
PE
3037 mutex_unlock(&net->packet.sklist_lock);
3038
3039 preempt_disable();
920de804 3040 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 3041 preempt_enable();
1da177e4 3042
808f5114 3043 spin_lock(&po->bind_lock);
ce06b03e 3044 unregister_prot_hook(sk, false);
66e56cd4
DB
3045 packet_cached_dev_reset(po);
3046
160ff18a
BG
3047 if (po->prot_hook.dev) {
3048 dev_put(po->prot_hook.dev);
3049 po->prot_hook.dev = NULL;
3050 }
808f5114 3051 spin_unlock(&po->bind_lock);
1da177e4 3052
1da177e4 3053 packet_flush_mclist(sk);
1da177e4 3054
5171b37d 3055 lock_sock(sk);
9665d5d6
PS
3056 if (po->rx_ring.pg_vec) {
3057 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3058 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 3059 }
69e3c75f 3060
9665d5d6
PS
3061 if (po->tx_ring.pg_vec) {
3062 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3063 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 3064 }
5171b37d 3065 release_sock(sk);
1da177e4 3066
2bd624b4 3067 f = fanout_release(sk);
dc99f600 3068
808f5114 3069 synchronize_net();
2bd624b4 3070
afa0925c 3071 kfree(po->rollover);
2bd624b4
AS
3072 if (f) {
3073 fanout_release_data(f);
3074 kfree(f);
3075 }
1da177e4
LT
3076 /*
3077 * Now the socket is dead. No more input will appear.
3078 */
1da177e4
LT
3079 sock_orphan(sk);
3080 sock->sk = NULL;
3081
3082 /* Purge queues */
3083
3084 skb_queue_purge(&sk->sk_receive_queue);
b0138408 3085 packet_free_pending(po);
17ab56a2 3086 sk_refcnt_debug_release(sk);
1da177e4
LT
3087
3088 sock_put(sk);
3089 return 0;
3090}
3091
3092/*
3093 * Attach a packet hook.
3094 */
3095
30f7ea1c
FR
3096static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3097 __be16 proto)
1da177e4
LT
3098{
3099 struct packet_sock *po = pkt_sk(sk);
158cd4af 3100 struct net_device *dev_curr;
902fefb8
DB
3101 __be16 proto_curr;
3102 bool need_rehook;
30f7ea1c
FR
3103 struct net_device *dev = NULL;
3104 int ret = 0;
3105 bool unlisted = false;
dc99f600 3106
1da177e4 3107 lock_sock(sk);
1da177e4 3108 spin_lock(&po->bind_lock);
30f7ea1c
FR
3109 rcu_read_lock();
3110
4971613c
WB
3111 if (po->fanout) {
3112 ret = -EINVAL;
3113 goto out_unlock;
3114 }
3115
30f7ea1c
FR
3116 if (name) {
3117 dev = dev_get_by_name_rcu(sock_net(sk), name);
3118 if (!dev) {
3119 ret = -ENODEV;
3120 goto out_unlock;
3121 }
3122 } else if (ifindex) {
3123 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3124 if (!dev) {
3125 ret = -ENODEV;
3126 goto out_unlock;
3127 }
3128 }
3129
3130 if (dev)
3131 dev_hold(dev);
66e56cd4 3132
902fefb8
DB
3133 proto_curr = po->prot_hook.type;
3134 dev_curr = po->prot_hook.dev;
3135
3136 need_rehook = proto_curr != proto || dev_curr != dev;
3137
3138 if (need_rehook) {
30f7ea1c
FR
3139 if (po->running) {
3140 rcu_read_unlock();
15fe076e
ED
3141 /* prevents packet_notifier() from calling
3142 * register_prot_hook()
3143 */
414b5431 3144 WRITE_ONCE(po->num, 0);
30f7ea1c
FR
3145 __unregister_prot_hook(sk, true);
3146 rcu_read_lock();
3147 dev_curr = po->prot_hook.dev;
3148 if (dev)
3149 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3150 dev->ifindex);
3151 }
1da177e4 3152
15fe076e 3153 BUG_ON(po->running);
414b5431 3154 WRITE_ONCE(po->num, proto);
902fefb8 3155 po->prot_hook.type = proto;
902fefb8 3156
30f7ea1c
FR
3157 if (unlikely(unlisted)) {
3158 dev_put(dev);
3159 po->prot_hook.dev = NULL;
3160 po->ifindex = -1;
3161 packet_cached_dev_reset(po);
3162 } else {
3163 po->prot_hook.dev = dev;
3164 po->ifindex = dev ? dev->ifindex : 0;
3165 packet_cached_dev_assign(po, dev);
3166 }
902fefb8 3167 }
158cd4af
LW
3168 if (dev_curr)
3169 dev_put(dev_curr);
66e56cd4 3170
902fefb8 3171 if (proto == 0 || !need_rehook)
1da177e4
LT
3172 goto out_unlock;
3173
30f7ea1c 3174 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3175 register_prot_hook(sk);
be85d4ad
UT
3176 } else {
3177 sk->sk_err = ENETDOWN;
3178 if (!sock_flag(sk, SOCK_DEAD))
3179 sk->sk_error_report(sk);
1da177e4
LT
3180 }
3181
3182out_unlock:
30f7ea1c 3183 rcu_read_unlock();
1da177e4
LT
3184 spin_unlock(&po->bind_lock);
3185 release_sock(sk);
30f7ea1c 3186 return ret;
1da177e4
LT
3187}
3188
3189/*
3190 * Bind a packet socket to a device
3191 */
3192
40d4e3df
ED
3193static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3194 int addr_len)
1da177e4 3195{
40d4e3df 3196 struct sock *sk = sock->sk;
540e2894 3197 char name[sizeof(uaddr->sa_data) + 1];
1ce4f28b 3198
1da177e4
LT
3199 /*
3200 * Check legality
3201 */
1ce4f28b 3202
8ae55f04 3203 if (addr_len != sizeof(struct sockaddr))
1da177e4 3204 return -EINVAL;
540e2894
AP
3205 /* uaddr->sa_data comes from the userspace, it's not guaranteed to be
3206 * zero-terminated.
3207 */
3208 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3209 name[sizeof(uaddr->sa_data)] = 0;
1da177e4 3210
30f7ea1c 3211 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3212}
1da177e4
LT
3213
3214static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3215{
40d4e3df
ED
3216 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3217 struct sock *sk = sock->sk;
1da177e4
LT
3218
3219 /*
3220 * Check legality
3221 */
1ce4f28b 3222
1da177e4
LT
3223 if (addr_len < sizeof(struct sockaddr_ll))
3224 return -EINVAL;
3225 if (sll->sll_family != AF_PACKET)
3226 return -EINVAL;
3227
30f7ea1c
FR
3228 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3229 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3230}
3231
3232static struct proto packet_proto = {
3233 .name = "PACKET",
3234 .owner = THIS_MODULE,
3235 .obj_size = sizeof(struct packet_sock),
3236};
3237
3238/*
1ce4f28b 3239 * Create a packet of type SOCK_PACKET.
1da177e4
LT
3240 */
3241
3f378b68
EP
3242static int packet_create(struct net *net, struct socket *sock, int protocol,
3243 int kern)
1da177e4
LT
3244{
3245 struct sock *sk;
3246 struct packet_sock *po;
0e11c91e 3247 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3248 int err;
3249
df008c91 3250 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3251 return -EPERM;
be02097c
DM
3252 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3253 sock->type != SOCK_PACKET)
1da177e4
LT
3254 return -ESOCKTNOSUPPORT;
3255
3256 sock->state = SS_UNCONNECTED;
3257
3258 err = -ENOBUFS;
11aa9c28 3259 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3260 if (sk == NULL)
3261 goto out;
3262
3263 sock->ops = &packet_ops;
1da177e4
LT
3264 if (sock->type == SOCK_PACKET)
3265 sock->ops = &packet_ops_spkt;
be02097c 3266
1da177e4
LT
3267 sock_init_data(sock, sk);
3268
3269 po = pkt_sk(sk);
89ed5b51 3270 init_completion(&po->skb_completion);
1da177e4 3271 sk->sk_family = PF_PACKET;
0e11c91e 3272 po->num = proto;
d346a3fa 3273 po->xmit = dev_queue_xmit;
66e56cd4 3274
b0138408
DB
3275 err = packet_alloc_pending(po);
3276 if (err)
3277 goto out2;
3278
66e56cd4 3279 packet_cached_dev_reset(po);
1da177e4
LT
3280
3281 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3282 sk_refcnt_debug_inc(sk);
1da177e4
LT
3283
3284 /*
3285 * Attach a protocol block
3286 */
3287
3288 spin_lock_init(&po->bind_lock);
905db440 3289 mutex_init(&po->pg_vec_lock);
0648ab70 3290 po->rollover = NULL;
1da177e4 3291 po->prot_hook.func = packet_rcv;
be02097c 3292
1da177e4
LT
3293 if (sock->type == SOCK_PACKET)
3294 po->prot_hook.func = packet_rcv_spkt;
be02097c 3295
1da177e4
LT
3296 po->prot_hook.af_packet_priv = sk;
3297
0e11c91e
AV
3298 if (proto) {
3299 po->prot_hook.type = proto;
a6361f0c 3300 __register_prot_hook(sk);
1da177e4
LT
3301 }
3302
0fa7fa98 3303 mutex_lock(&net->packet.sklist_lock);
a4dc6a49 3304 sk_add_node_tail_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3305 mutex_unlock(&net->packet.sklist_lock);
3306
3307 preempt_disable();
3680453c 3308 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3309 preempt_enable();
808f5114 3310
40d4e3df 3311 return 0;
b0138408
DB
3312out2:
3313 sk_free(sk);
1da177e4
LT
3314out:
3315 return err;
3316}
3317
3318/*
3319 * Pull a packet from our receive queue and hand it to the user.
3320 * If necessary we block.
3321 */
3322
1b784140
YX
3323static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3324 int flags)
1da177e4
LT
3325{
3326 struct sock *sk = sock->sk;
3327 struct sk_buff *skb;
3328 int copied, err;
bfd5f4a3 3329 int vnet_hdr_len = 0;
2472d761 3330 unsigned int origlen = 0;
1da177e4
LT
3331
3332 err = -EINVAL;
ed85b565 3333 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3334 goto out;
3335
3336#if 0
3337 /* What error should we return now? EUNATTACH? */
3338 if (pkt_sk(sk)->ifindex < 0)
3339 return -ENODEV;
3340#endif
3341
ed85b565 3342 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3343 err = sock_recv_errqueue(sk, msg, len,
3344 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3345 goto out;
3346 }
3347
1da177e4
LT
3348 /*
3349 * Call the generic datagram receiver. This handles all sorts
3350 * of horrible races and re-entrancy so we can forget about it
3351 * in the protocol layers.
3352 *
3353 * Now it will return ENETDOWN, if device have just gone down,
3354 * but then it will block.
3355 */
3356
40d4e3df 3357 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3358
3359 /*
1ce4f28b 3360 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
3361 * handles the blocking we don't see and worry about blocking
3362 * retries.
3363 */
3364
8ae55f04 3365 if (skb == NULL)
1da177e4
LT
3366 goto out;
3367
9bb6cd65 3368 packet_rcv_try_clear_pressure(pkt_sk(sk));
2ccdbaa6 3369
bfd5f4a3 3370 if (pkt_sk(sk)->has_vnet_hdr) {
16cc1400
WB
3371 err = packet_rcv_vnet(msg, skb, &len);
3372 if (err)
bfd5f4a3 3373 goto out_free;
16cc1400 3374 vnet_hdr_len = sizeof(struct virtio_net_hdr);
bfd5f4a3
SS
3375 }
3376
f3d33426
HFS
3377 /* You lose any data beyond the buffer you gave. If it worries
3378 * a user program they can ask the device for its MTU
3379 * anyway.
1da177e4 3380 */
1da177e4 3381 copied = skb->len;
40d4e3df
ED
3382 if (copied > len) {
3383 copied = len;
3384 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3385 }
3386
51f3d02b 3387 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3388 if (err)
3389 goto out_free;
3390
2472d761
EB
3391 if (sock->type != SOCK_PACKET) {
3392 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3393
3394 /* Original length was stored in sockaddr_ll fields */
3395 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3396 sll->sll_family = AF_PACKET;
3397 sll->sll_protocol = skb->protocol;
3398 }
3399
3b885787 3400 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3401
f3d33426 3402 if (msg->msg_name) {
b2cf86e1
WB
3403 int copy_len;
3404
f3d33426
HFS
3405 /* If the address length field is there to be filled
3406 * in, we fill it in now.
3407 */
3408 if (sock->type == SOCK_PACKET) {
342dfc30 3409 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426 3410 msg->msg_namelen = sizeof(struct sockaddr_pkt);
b2cf86e1 3411 copy_len = msg->msg_namelen;
f3d33426
HFS
3412 } else {
3413 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3414
f3d33426
HFS
3415 msg->msg_namelen = sll->sll_halen +
3416 offsetof(struct sockaddr_ll, sll_addr);
b2cf86e1
WB
3417 copy_len = msg->msg_namelen;
3418 if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
3419 memset(msg->msg_name +
3420 offsetof(struct sockaddr_ll, sll_addr),
3421 0, sizeof(sll->sll_addr));
3422 msg->msg_namelen = sizeof(struct sockaddr_ll);
3423 }
f3d33426 3424 }
b2cf86e1 3425 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
f3d33426 3426 }
1da177e4 3427
8dc41944 3428 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3429 struct tpacket_auxdata aux;
3430
3431 aux.tp_status = TP_STATUS_USER;
3432 if (skb->ip_summed == CHECKSUM_PARTIAL)
3433 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3434 else if (skb->pkt_type != PACKET_OUTGOING &&
3435 (skb->ip_summed == CHECKSUM_COMPLETE ||
3436 skb_csum_unnecessary(skb)))
3437 aux.tp_status |= TP_STATUS_CSUM_VALID;
3438
2472d761 3439 aux.tp_len = origlen;
ffbc6111
HX
3440 aux.tp_snaplen = skb->len;
3441 aux.tp_mac = 0;
bbe735e4 3442 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3443 if (skb_vlan_tag_present(skb)) {
3444 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3445 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3446 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3447 } else {
3448 aux.tp_vlan_tci = 0;
a0cdfcf3 3449 aux.tp_vlan_tpid = 0;
a3bcc23e 3450 }
ffbc6111 3451 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3452 }
3453
1da177e4
LT
3454 /*
3455 * Free or return the buffer as appropriate. Again this
3456 * hides all the races and re-entrancy issues from us.
3457 */
bfd5f4a3 3458 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3459
3460out_free:
3461 skb_free_datagram(sk, skb);
3462out:
3463 return err;
3464}
3465
1da177e4 3466static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3467 int peer)
1da177e4
LT
3468{
3469 struct net_device *dev;
3470 struct sock *sk = sock->sk;
3471
3472 if (peer)
3473 return -EOPNOTSUPP;
3474
3475 uaddr->sa_family = AF_PACKET;
2dc85bf3 3476 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3477 rcu_read_lock();
3478 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3479 if (dev)
2dc85bf3 3480 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3481 rcu_read_unlock();
1da177e4 3482
9b2c45d4 3483 return sizeof(*uaddr);
1da177e4 3484}
1da177e4
LT
3485
3486static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3487 int peer)
1da177e4
LT
3488{
3489 struct net_device *dev;
3490 struct sock *sk = sock->sk;
3491 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3492 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3493
3494 if (peer)
3495 return -EOPNOTSUPP;
3496
3497 sll->sll_family = AF_PACKET;
3498 sll->sll_ifindex = po->ifindex;
414b5431 3499 sll->sll_protocol = READ_ONCE(po->num);
67286640 3500 sll->sll_pkttype = 0;
654d1f8a
ED
3501 rcu_read_lock();
3502 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3503 if (dev) {
3504 sll->sll_hatype = dev->type;
3505 sll->sll_halen = dev->addr_len;
3506 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3507 } else {
3508 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3509 sll->sll_halen = 0;
3510 }
654d1f8a 3511 rcu_read_unlock();
1da177e4 3512
9b2c45d4 3513 return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3514}
3515
2aeb0b88
WC
3516static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3517 int what)
1da177e4
LT
3518{
3519 switch (i->type) {
3520 case PACKET_MR_MULTICAST:
1162563f
JP
3521 if (i->alen != dev->addr_len)
3522 return -EINVAL;
1da177e4 3523 if (what > 0)
22bedad3 3524 return dev_mc_add(dev, i->addr);
1da177e4 3525 else
22bedad3 3526 return dev_mc_del(dev, i->addr);
1da177e4
LT
3527 break;
3528 case PACKET_MR_PROMISC:
2aeb0b88 3529 return dev_set_promiscuity(dev, what);
1da177e4 3530 case PACKET_MR_ALLMULTI:
2aeb0b88 3531 return dev_set_allmulti(dev, what);
d95ed927 3532 case PACKET_MR_UNICAST:
1162563f
JP
3533 if (i->alen != dev->addr_len)
3534 return -EINVAL;
d95ed927 3535 if (what > 0)
a748ee24 3536 return dev_uc_add(dev, i->addr);
d95ed927 3537 else
a748ee24 3538 return dev_uc_del(dev, i->addr);
d95ed927 3539 break;
40d4e3df
ED
3540 default:
3541 break;
1da177e4 3542 }
2aeb0b88 3543 return 0;
1da177e4
LT
3544}
3545
82f17091
FR
3546static void packet_dev_mclist_delete(struct net_device *dev,
3547 struct packet_mclist **mlp)
1da177e4 3548{
82f17091
FR
3549 struct packet_mclist *ml;
3550
3551 while ((ml = *mlp) != NULL) {
3552 if (ml->ifindex == dev->ifindex) {
3553 packet_dev_mc(dev, ml, -1);
3554 *mlp = ml->next;
3555 kfree(ml);
3556 } else
3557 mlp = &ml->next;
1da177e4
LT
3558 }
3559}
3560
0fb375fb 3561static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3562{
3563 struct packet_sock *po = pkt_sk(sk);
3564 struct packet_mclist *ml, *i;
3565 struct net_device *dev;
3566 int err;
3567
3568 rtnl_lock();
3569
3570 err = -ENODEV;
3b1e0a65 3571 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3572 if (!dev)
3573 goto done;
3574
3575 err = -EINVAL;
1162563f 3576 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3577 goto done;
3578
3579 err = -ENOBUFS;
8b3a7005 3580 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3581 if (i == NULL)
3582 goto done;
3583
3584 err = 0;
3585 for (ml = po->mclist; ml; ml = ml->next) {
3586 if (ml->ifindex == mreq->mr_ifindex &&
3587 ml->type == mreq->mr_type &&
3588 ml->alen == mreq->mr_alen &&
3589 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3590 ml->count++;
3591 /* Free the new element ... */
3592 kfree(i);
3593 goto done;
3594 }
3595 }
3596
3597 i->type = mreq->mr_type;
3598 i->ifindex = mreq->mr_ifindex;
3599 i->alen = mreq->mr_alen;
3600 memcpy(i->addr, mreq->mr_address, i->alen);
309cf37f 3601 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3602 i->count = 1;
3603 i->next = po->mclist;
3604 po->mclist = i;
2aeb0b88
WC
3605 err = packet_dev_mc(dev, i, 1);
3606 if (err) {
3607 po->mclist = i->next;
3608 kfree(i);
3609 }
1da177e4
LT
3610
3611done:
3612 rtnl_unlock();
3613 return err;
3614}
3615
0fb375fb 3616static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3617{
3618 struct packet_mclist *ml, **mlp;
3619
3620 rtnl_lock();
3621
3622 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3623 if (ml->ifindex == mreq->mr_ifindex &&
3624 ml->type == mreq->mr_type &&
3625 ml->alen == mreq->mr_alen &&
3626 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3627 if (--ml->count == 0) {
3628 struct net_device *dev;
3629 *mlp = ml->next;
ad959e76
ED
3630 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3631 if (dev)
1da177e4 3632 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3633 kfree(ml);
3634 }
82f17091 3635 break;
1da177e4
LT
3636 }
3637 }
3638 rtnl_unlock();
82f17091 3639 return 0;
1da177e4
LT
3640}
3641
3642static void packet_flush_mclist(struct sock *sk)
3643{
3644 struct packet_sock *po = pkt_sk(sk);
3645 struct packet_mclist *ml;
3646
3647 if (!po->mclist)
3648 return;
3649
3650 rtnl_lock();
3651 while ((ml = po->mclist) != NULL) {
3652 struct net_device *dev;
3653
3654 po->mclist = ml->next;
ad959e76
ED
3655 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3656 if (dev != NULL)
1da177e4 3657 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3658 kfree(ml);
3659 }
3660 rtnl_unlock();
3661}
1da177e4
LT
3662
3663static int
b7058842 3664packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3665{
3666 struct sock *sk = sock->sk;
8dc41944 3667 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3668 int ret;
3669
3670 if (level != SOL_PACKET)
3671 return -ENOPROTOOPT;
3672
69e3c75f 3673 switch (optname) {
1ce4f28b 3674 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3675 case PACKET_DROP_MEMBERSHIP:
3676 {
0fb375fb
EB
3677 struct packet_mreq_max mreq;
3678 int len = optlen;
3679 memset(&mreq, 0, sizeof(mreq));
3680 if (len < sizeof(struct packet_mreq))
1da177e4 3681 return -EINVAL;
0fb375fb
EB
3682 if (len > sizeof(mreq))
3683 len = sizeof(mreq);
40d4e3df 3684 if (copy_from_user(&mreq, optval, len))
1da177e4 3685 return -EFAULT;
0fb375fb
EB
3686 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3687 return -EINVAL;
1da177e4
LT
3688 if (optname == PACKET_ADD_MEMBERSHIP)
3689 ret = packet_mc_add(sk, &mreq);
3690 else
3691 ret = packet_mc_drop(sk, &mreq);
3692 return ret;
3693 }
a2efcfa0 3694
1da177e4 3695 case PACKET_RX_RING:
69e3c75f 3696 case PACKET_TX_RING:
1da177e4 3697 {
f6fb8f10 3698 union tpacket_req_u req_u;
3699 int len;
1da177e4 3700
5171b37d 3701 lock_sock(sk);
f6fb8f10 3702 switch (po->tp_version) {
3703 case TPACKET_V1:
3704 case TPACKET_V2:
3705 len = sizeof(req_u.req);
3706 break;
3707 case TPACKET_V3:
3708 default:
3709 len = sizeof(req_u.req3);
3710 break;
3711 }
5171b37d
ED
3712 if (optlen < len) {
3713 ret = -EINVAL;
3714 } else {
3715 if (copy_from_user(&req_u.req, optval, len))
3716 ret = -EFAULT;
3717 else
3718 ret = packet_set_ring(sk, &req_u, 0,
3719 optname == PACKET_TX_RING);
3720 }
3721 release_sock(sk);
3722 return ret;
1da177e4
LT
3723 }
3724 case PACKET_COPY_THRESH:
3725 {
3726 int val;
3727
40d4e3df 3728 if (optlen != sizeof(val))
1da177e4 3729 return -EINVAL;
40d4e3df 3730 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3731 return -EFAULT;
3732
3733 pkt_sk(sk)->copy_thresh = val;
3734 return 0;
3735 }
bbd6ef87
PM
3736 case PACKET_VERSION:
3737 {
3738 int val;
3739
3740 if (optlen != sizeof(val))
3741 return -EINVAL;
bbd6ef87
PM
3742 if (copy_from_user(&val, optval, sizeof(val)))
3743 return -EFAULT;
3744 switch (val) {
3745 case TPACKET_V1:
3746 case TPACKET_V2:
f6fb8f10 3747 case TPACKET_V3:
84ac7260 3748 break;
bbd6ef87
PM
3749 default:
3750 return -EINVAL;
3751 }
84ac7260
PP
3752 lock_sock(sk);
3753 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3754 ret = -EBUSY;
3755 } else {
3756 po->tp_version = val;
3757 ret = 0;
3758 }
3759 release_sock(sk);
3760 return ret;
bbd6ef87 3761 }
8913336a
PM
3762 case PACKET_RESERVE:
3763 {
3764 unsigned int val;
3765
3766 if (optlen != sizeof(val))
3767 return -EINVAL;
8913336a
PM
3768 if (copy_from_user(&val, optval, sizeof(val)))
3769 return -EFAULT;
bcc5364b
AK
3770 if (val > INT_MAX)
3771 return -EINVAL;
c27927e3
WB
3772 lock_sock(sk);
3773 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3774 ret = -EBUSY;
3775 } else {
3776 po->tp_reserve = val;
3777 ret = 0;
3778 }
3779 release_sock(sk);
3780 return ret;
8913336a 3781 }
69e3c75f
JB
3782 case PACKET_LOSS:
3783 {
3784 unsigned int val;
3785
3786 if (optlen != sizeof(val))
3787 return -EINVAL;
69e3c75f
JB
3788 if (copy_from_user(&val, optval, sizeof(val)))
3789 return -EFAULT;
a6361f0c
WB
3790
3791 lock_sock(sk);
3792 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3793 ret = -EBUSY;
3794 } else {
3795 po->tp_loss = !!val;
3796 ret = 0;
3797 }
3798 release_sock(sk);
3799 return ret;
69e3c75f 3800 }
8dc41944
HX
3801 case PACKET_AUXDATA:
3802 {
3803 int val;
3804
3805 if (optlen < sizeof(val))
3806 return -EINVAL;
3807 if (copy_from_user(&val, optval, sizeof(val)))
3808 return -EFAULT;
3809
a6361f0c 3810 lock_sock(sk);
8dc41944 3811 po->auxdata = !!val;
a6361f0c 3812 release_sock(sk);
8dc41944
HX
3813 return 0;
3814 }
80feaacb
PWJ
3815 case PACKET_ORIGDEV:
3816 {
3817 int val;
3818
3819 if (optlen < sizeof(val))
3820 return -EINVAL;
3821 if (copy_from_user(&val, optval, sizeof(val)))
3822 return -EFAULT;
3823
a6361f0c 3824 lock_sock(sk);
80feaacb 3825 po->origdev = !!val;
a6361f0c 3826 release_sock(sk);
80feaacb
PWJ
3827 return 0;
3828 }
bfd5f4a3
SS
3829 case PACKET_VNET_HDR:
3830 {
3831 int val;
3832
3833 if (sock->type != SOCK_RAW)
3834 return -EINVAL;
bfd5f4a3
SS
3835 if (optlen < sizeof(val))
3836 return -EINVAL;
3837 if (copy_from_user(&val, optval, sizeof(val)))
3838 return -EFAULT;
3839
a6361f0c
WB
3840 lock_sock(sk);
3841 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3842 ret = -EBUSY;
3843 } else {
3844 po->has_vnet_hdr = !!val;
3845 ret = 0;
3846 }
3847 release_sock(sk);
3848 return ret;
bfd5f4a3 3849 }
614f60fa
SM
3850 case PACKET_TIMESTAMP:
3851 {
3852 int val;
3853
3854 if (optlen != sizeof(val))
3855 return -EINVAL;
3856 if (copy_from_user(&val, optval, sizeof(val)))
3857 return -EFAULT;
3858
3859 po->tp_tstamp = val;
3860 return 0;
3861 }
dc99f600
DM
3862 case PACKET_FANOUT:
3863 {
3864 int val;
3865
3866 if (optlen != sizeof(val))
3867 return -EINVAL;
3868 if (copy_from_user(&val, optval, sizeof(val)))
3869 return -EFAULT;
3870
3871 return fanout_add(sk, val & 0xffff, val >> 16);
3872 }
47dceb8e
WB
3873 case PACKET_FANOUT_DATA:
3874 {
3875 if (!po->fanout)
3876 return -EINVAL;
3877
3878 return fanout_set_data(po, optval, optlen);
3879 }
fa788d98
VW
3880 case PACKET_IGNORE_OUTGOING:
3881 {
3882 int val;
3883
3884 if (optlen != sizeof(val))
3885 return -EINVAL;
3886 if (copy_from_user(&val, optval, sizeof(val)))
3887 return -EFAULT;
3888 if (val < 0 || val > 1)
3889 return -EINVAL;
3890
3891 po->prot_hook.ignore_outgoing = !!val;
3892 return 0;
3893 }
5920cd3a
PC
3894 case PACKET_TX_HAS_OFF:
3895 {
3896 unsigned int val;
3897
3898 if (optlen != sizeof(val))
3899 return -EINVAL;
5920cd3a
PC
3900 if (copy_from_user(&val, optval, sizeof(val)))
3901 return -EFAULT;
a6361f0c
WB
3902
3903 lock_sock(sk);
3904 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3905 ret = -EBUSY;
3906 } else {
3907 po->tp_tx_has_off = !!val;
3908 ret = 0;
3909 }
3910 release_sock(sk);
5920cd3a
PC
3911 return 0;
3912 }
d346a3fa
DB
3913 case PACKET_QDISC_BYPASS:
3914 {
3915 int val;
3916
3917 if (optlen != sizeof(val))
3918 return -EINVAL;
3919 if (copy_from_user(&val, optval, sizeof(val)))
3920 return -EFAULT;
3921
3922 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3923 return 0;
3924 }
1da177e4
LT
3925 default:
3926 return -ENOPROTOOPT;
3927 }
3928}
3929
3930static int packet_getsockopt(struct socket *sock, int level, int optname,
3931 char __user *optval, int __user *optlen)
3932{
3933 int len;
c06fff6e 3934 int val, lv = sizeof(val);
1da177e4
LT
3935 struct sock *sk = sock->sk;
3936 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3937 void *data = &val;
ee80fbf3 3938 union tpacket_stats_u st;
a9b63918 3939 struct tpacket_rollover_stats rstats;
8e8e2951 3940 int drops;
1da177e4
LT
3941
3942 if (level != SOL_PACKET)
3943 return -ENOPROTOOPT;
3944
8ae55f04
KK
3945 if (get_user(len, optlen))
3946 return -EFAULT;
1da177e4
LT
3947
3948 if (len < 0)
3949 return -EINVAL;
1ce4f28b 3950
69e3c75f 3951 switch (optname) {
1da177e4 3952 case PACKET_STATISTICS:
1da177e4 3953 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3954 memcpy(&st, &po->stats, sizeof(st));
3955 memset(&po->stats, 0, sizeof(po->stats));
3956 spin_unlock_bh(&sk->sk_receive_queue.lock);
8e8e2951 3957 drops = atomic_xchg(&po->tp_drops, 0);
ee80fbf3 3958
f6fb8f10 3959 if (po->tp_version == TPACKET_V3) {
c06fff6e 3960 lv = sizeof(struct tpacket_stats_v3);
8e8e2951
ED
3961 st.stats3.tp_drops = drops;
3962 st.stats3.tp_packets += drops;
ee80fbf3 3963 data = &st.stats3;
f6fb8f10 3964 } else {
c06fff6e 3965 lv = sizeof(struct tpacket_stats);
8e8e2951
ED
3966 st.stats1.tp_drops = drops;
3967 st.stats1.tp_packets += drops;
ee80fbf3 3968 data = &st.stats1;
f6fb8f10 3969 }
ee80fbf3 3970
8dc41944
HX
3971 break;
3972 case PACKET_AUXDATA:
8dc41944 3973 val = po->auxdata;
80feaacb
PWJ
3974 break;
3975 case PACKET_ORIGDEV:
80feaacb 3976 val = po->origdev;
bfd5f4a3
SS
3977 break;
3978 case PACKET_VNET_HDR:
bfd5f4a3 3979 val = po->has_vnet_hdr;
1da177e4 3980 break;
bbd6ef87 3981 case PACKET_VERSION:
bbd6ef87 3982 val = po->tp_version;
bbd6ef87
PM
3983 break;
3984 case PACKET_HDRLEN:
3985 if (len > sizeof(int))
3986 len = sizeof(int);
fd2c83b3
AP
3987 if (len < sizeof(int))
3988 return -EINVAL;
bbd6ef87
PM
3989 if (copy_from_user(&val, optval, len))
3990 return -EFAULT;
3991 switch (val) {
3992 case TPACKET_V1:
3993 val = sizeof(struct tpacket_hdr);
3994 break;
3995 case TPACKET_V2:
3996 val = sizeof(struct tpacket2_hdr);
3997 break;
f6fb8f10 3998 case TPACKET_V3:
3999 val = sizeof(struct tpacket3_hdr);
4000 break;
bbd6ef87
PM
4001 default:
4002 return -EINVAL;
4003 }
bbd6ef87 4004 break;
8913336a 4005 case PACKET_RESERVE:
8913336a 4006 val = po->tp_reserve;
8913336a 4007 break;
69e3c75f 4008 case PACKET_LOSS:
69e3c75f 4009 val = po->tp_loss;
69e3c75f 4010 break;
614f60fa 4011 case PACKET_TIMESTAMP:
614f60fa 4012 val = po->tp_tstamp;
614f60fa 4013 break;
dc99f600 4014 case PACKET_FANOUT:
dc99f600
DM
4015 val = (po->fanout ?
4016 ((u32)po->fanout->id |
77f65ebd
WB
4017 ((u32)po->fanout->type << 16) |
4018 ((u32)po->fanout->flags << 24)) :
dc99f600 4019 0);
dc99f600 4020 break;
fa788d98
VW
4021 case PACKET_IGNORE_OUTGOING:
4022 val = po->prot_hook.ignore_outgoing;
4023 break;
a9b63918 4024 case PACKET_ROLLOVER_STATS:
57f015f5 4025 if (!po->rollover)
a9b63918 4026 return -EINVAL;
57f015f5
MM
4027 rstats.tp_all = atomic_long_read(&po->rollover->num);
4028 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
4029 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
4030 data = &rstats;
4031 lv = sizeof(rstats);
a9b63918 4032 break;
5920cd3a
PC
4033 case PACKET_TX_HAS_OFF:
4034 val = po->tp_tx_has_off;
4035 break;
d346a3fa
DB
4036 case PACKET_QDISC_BYPASS:
4037 val = packet_use_direct_xmit(po);
4038 break;
1da177e4
LT
4039 default:
4040 return -ENOPROTOOPT;
4041 }
4042
c06fff6e
ED
4043 if (len > lv)
4044 len = lv;
8ae55f04
KK
4045 if (put_user(len, optlen))
4046 return -EFAULT;
8dc41944
HX
4047 if (copy_to_user(optval, data, len))
4048 return -EFAULT;
8ae55f04 4049 return 0;
1da177e4
LT
4050}
4051
4052
719c44d3
WB
4053#ifdef CONFIG_COMPAT
4054static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
4055 char __user *optval, unsigned int optlen)
4056{
4057 struct packet_sock *po = pkt_sk(sock->sk);
4058
4059 if (level != SOL_PACKET)
4060 return -ENOPROTOOPT;
4061
4062 if (optname == PACKET_FANOUT_DATA &&
4063 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
4064 optval = (char __user *)get_compat_bpf_fprog(optval);
4065 if (!optval)
4066 return -EFAULT;
4067 optlen = sizeof(struct sock_fprog);
4068 }
4069
4070 return packet_setsockopt(sock, level, optname, optval, optlen);
4071}
4072#endif
4073
351638e7
JP
4074static int packet_notifier(struct notifier_block *this,
4075 unsigned long msg, void *ptr)
1da177e4
LT
4076{
4077 struct sock *sk;
351638e7 4078 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 4079 struct net *net = dev_net(dev);
1da177e4 4080
808f5114 4081 rcu_read_lock();
b67bfe0d 4082 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
4083 struct packet_sock *po = pkt_sk(sk);
4084
4085 switch (msg) {
4086 case NETDEV_UNREGISTER:
1da177e4 4087 if (po->mclist)
82f17091 4088 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
4089 /* fallthrough */
4090
1da177e4
LT
4091 case NETDEV_DOWN:
4092 if (dev->ifindex == po->ifindex) {
4093 spin_lock(&po->bind_lock);
4094 if (po->running) {
ce06b03e 4095 __unregister_prot_hook(sk, false);
1da177e4
LT
4096 sk->sk_err = ENETDOWN;
4097 if (!sock_flag(sk, SOCK_DEAD))
4098 sk->sk_error_report(sk);
4099 }
4100 if (msg == NETDEV_UNREGISTER) {
66e56cd4 4101 packet_cached_dev_reset(po);
1da177e4 4102 po->ifindex = -1;
160ff18a
BG
4103 if (po->prot_hook.dev)
4104 dev_put(po->prot_hook.dev);
1da177e4
LT
4105 po->prot_hook.dev = NULL;
4106 }
4107 spin_unlock(&po->bind_lock);
4108 }
4109 break;
4110 case NETDEV_UP:
808f5114 4111 if (dev->ifindex == po->ifindex) {
4112 spin_lock(&po->bind_lock);
ce06b03e
DM
4113 if (po->num)
4114 register_prot_hook(sk);
808f5114 4115 spin_unlock(&po->bind_lock);
1da177e4 4116 }
1da177e4
LT
4117 break;
4118 }
4119 }
808f5114 4120 rcu_read_unlock();
1da177e4
LT
4121 return NOTIFY_DONE;
4122}
4123
4124
4125static int packet_ioctl(struct socket *sock, unsigned int cmd,
4126 unsigned long arg)
4127{
4128 struct sock *sk = sock->sk;
4129
69e3c75f 4130 switch (cmd) {
40d4e3df
ED
4131 case SIOCOUTQ:
4132 {
4133 int amount = sk_wmem_alloc_get(sk);
31e6d363 4134
40d4e3df
ED
4135 return put_user(amount, (int __user *)arg);
4136 }
4137 case SIOCINQ:
4138 {
4139 struct sk_buff *skb;
4140 int amount = 0;
4141
4142 spin_lock_bh(&sk->sk_receive_queue.lock);
4143 skb = skb_peek(&sk->sk_receive_queue);
4144 if (skb)
4145 amount = skb->len;
4146 spin_unlock_bh(&sk->sk_receive_queue.lock);
4147 return put_user(amount, (int __user *)arg);
4148 }
1da177e4 4149#ifdef CONFIG_INET
40d4e3df
ED
4150 case SIOCADDRT:
4151 case SIOCDELRT:
4152 case SIOCDARP:
4153 case SIOCGARP:
4154 case SIOCSARP:
4155 case SIOCGIFADDR:
4156 case SIOCSIFADDR:
4157 case SIOCGIFBRDADDR:
4158 case SIOCSIFBRDADDR:
4159 case SIOCGIFNETMASK:
4160 case SIOCSIFNETMASK:
4161 case SIOCGIFDSTADDR:
4162 case SIOCSIFDSTADDR:
4163 case SIOCSIFFLAGS:
40d4e3df 4164 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
4165#endif
4166
40d4e3df
ED
4167 default:
4168 return -ENOIOCTLCMD;
1da177e4
LT
4169 }
4170 return 0;
4171}
4172
a11e1d43
LT
4173static __poll_t packet_poll(struct file *file, struct socket *sock,
4174 poll_table *wait)
1da177e4
LT
4175{
4176 struct sock *sk = sock->sk;
4177 struct packet_sock *po = pkt_sk(sk);
a11e1d43 4178 __poll_t mask = datagram_poll(file, sock, wait);
1da177e4
LT
4179
4180 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 4181 if (po->rx_ring.pg_vec) {
f6fb8f10 4182 if (!packet_previous_rx_frame(po, &po->rx_ring,
4183 TP_STATUS_KERNEL))
a9a08845 4184 mask |= EPOLLIN | EPOLLRDNORM;
1da177e4 4185 }
9bb6cd65 4186 packet_rcv_try_clear_pressure(po);
1da177e4 4187 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
4188 spin_lock_bh(&sk->sk_write_queue.lock);
4189 if (po->tx_ring.pg_vec) {
4190 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
a9a08845 4191 mask |= EPOLLOUT | EPOLLWRNORM;
69e3c75f
JB
4192 }
4193 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
4194 return mask;
4195}
4196
4197
4198/* Dirty? Well, I still did not learn better way to account
4199 * for user mmaps.
4200 */
4201
4202static void packet_mm_open(struct vm_area_struct *vma)
4203{
4204 struct file *file = vma->vm_file;
40d4e3df 4205 struct socket *sock = file->private_data;
1da177e4 4206 struct sock *sk = sock->sk;
1ce4f28b 4207
1da177e4
LT
4208 if (sk)
4209 atomic_inc(&pkt_sk(sk)->mapped);
4210}
4211
4212static void packet_mm_close(struct vm_area_struct *vma)
4213{
4214 struct file *file = vma->vm_file;
40d4e3df 4215 struct socket *sock = file->private_data;
1da177e4 4216 struct sock *sk = sock->sk;
1ce4f28b 4217
1da177e4
LT
4218 if (sk)
4219 atomic_dec(&pkt_sk(sk)->mapped);
4220}
4221
f0f37e2f 4222static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
4223 .open = packet_mm_open,
4224 .close = packet_mm_close,
1da177e4
LT
4225};
4226
3a7ad063
ED
4227static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4228 unsigned int len)
1da177e4
LT
4229{
4230 int i;
4231
4ebf0ae2 4232 for (i = 0; i < len; i++) {
0e3125c7 4233 if (likely(pg_vec[i].buffer)) {
3a7ad063
ED
4234 if (is_vmalloc_addr(pg_vec[i].buffer))
4235 vfree(pg_vec[i].buffer);
4236 else
4237 free_pages((unsigned long)pg_vec[i].buffer,
4238 order);
0e3125c7
NH
4239 pg_vec[i].buffer = NULL;
4240 }
1da177e4
LT
4241 }
4242 kfree(pg_vec);
4243}
4244
3a7ad063 4245static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 4246{
f0d4eb29 4247 char *buffer;
3a7ad063
ED
4248 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4249 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
0e3125c7 4250
3a7ad063 4251 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4252 if (buffer)
4253 return buffer;
4254
3a7ad063
ED
4255 /* __get_free_pages failed, fall back to vmalloc */
4256 buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
4257 if (buffer)
4258 return buffer;
0e3125c7 4259
3a7ad063
ED
4260 /* vmalloc failed, lets dig into swap here */
4261 gfp_flags &= ~__GFP_NORETRY;
4262 buffer = (char *) __get_free_pages(gfp_flags, order);
4263 if (buffer)
4264 return buffer;
4265
4266 /* complete and utter failure */
4267 return NULL;
4ebf0ae2
DM
4268}
4269
3a7ad063 4270static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
4271{
4272 unsigned int block_nr = req->tp_block_nr;
0e3125c7 4273 struct pgv *pg_vec;
4ebf0ae2
DM
4274 int i;
4275
398f0132 4276 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
4ebf0ae2
DM
4277 if (unlikely(!pg_vec))
4278 goto out;
4279
4280 for (i = 0; i < block_nr; i++) {
3a7ad063 4281 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 4282 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
4283 goto out_free_pgvec;
4284 }
4285
4286out:
4287 return pg_vec;
4288
4289out_free_pgvec:
3a7ad063 4290 free_pg_vec(pg_vec, order, block_nr);
4ebf0ae2
DM
4291 pg_vec = NULL;
4292 goto out;
4293}
1da177e4 4294
f6fb8f10 4295static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 4296 int closing, int tx_ring)
1da177e4 4297{
0e3125c7 4298 struct pgv *pg_vec = NULL;
1da177e4 4299 struct packet_sock *po = pkt_sk(sk);
f897d759 4300 unsigned long *rx_owner_map = NULL;
3a7ad063 4301 int was_running, order = 0;
69e3c75f
JB
4302 struct packet_ring_buffer *rb;
4303 struct sk_buff_head *rb_queue;
0e11c91e 4304 __be16 num;
f6fb8f10 4305 int err = -EINVAL;
4306 /* Added to avoid minimal code churn */
4307 struct tpacket_req *req = &req_u->req;
4308
69e3c75f
JB
4309 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4310 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 4311
69e3c75f
JB
4312 err = -EBUSY;
4313 if (!closing) {
4314 if (atomic_read(&po->mapped))
4315 goto out;
b0138408 4316 if (packet_read_pending(rb))
69e3c75f
JB
4317 goto out;
4318 }
1da177e4 4319
69e3c75f 4320 if (req->tp_block_nr) {
4576cd46
WB
4321 unsigned int min_frame_size;
4322
69e3c75f
JB
4323 /* Sanity tests and some calculations */
4324 err = -EBUSY;
4325 if (unlikely(rb->pg_vec))
4326 goto out;
1da177e4 4327
bbd6ef87
PM
4328 switch (po->tp_version) {
4329 case TPACKET_V1:
4330 po->tp_hdrlen = TPACKET_HDRLEN;
4331 break;
4332 case TPACKET_V2:
4333 po->tp_hdrlen = TPACKET2_HDRLEN;
4334 break;
f6fb8f10 4335 case TPACKET_V3:
4336 po->tp_hdrlen = TPACKET3_HDRLEN;
4337 break;
bbd6ef87
PM
4338 }
4339
69e3c75f 4340 err = -EINVAL;
4ebf0ae2 4341 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 4342 goto out;
90836b67 4343 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
69e3c75f 4344 goto out;
4576cd46 4345 min_frame_size = po->tp_hdrlen + po->tp_reserve;
dc808110 4346 if (po->tp_version >= TPACKET_V3 &&
4576cd46
WB
4347 req->tp_block_size <
4348 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
dc808110 4349 goto out;
4576cd46 4350 if (unlikely(req->tp_frame_size < min_frame_size))
69e3c75f 4351 goto out;
4ebf0ae2 4352 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 4353 goto out;
1da177e4 4354
4194b491
TK
4355 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4356 if (unlikely(rb->frames_per_block == 0))
69e3c75f 4357 goto out;
fc62814d 4358 if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
8f8d28e4 4359 goto out;
69e3c75f
JB
4360 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4361 req->tp_frame_nr))
4362 goto out;
1da177e4
LT
4363
4364 err = -ENOMEM;
3a7ad063
ED
4365 order = get_order(req->tp_block_size);
4366 pg_vec = alloc_pg_vec(req, order);
4ebf0ae2 4367 if (unlikely(!pg_vec))
1da177e4 4368 goto out;
f6fb8f10 4369 switch (po->tp_version) {
4370 case TPACKET_V3:
7f953ab2
SV
4371 /* Block transmit is not supported yet */
4372 if (!tx_ring) {
e8e85cc5 4373 init_prb_bdqc(po, rb, pg_vec, req_u);
7f953ab2
SV
4374 } else {
4375 struct tpacket_req3 *req3 = &req_u->req3;
4376
4377 if (req3->tp_retire_blk_tov ||
4378 req3->tp_sizeof_priv ||
4379 req3->tp_feature_req_word) {
4380 err = -EINVAL;
55655e3d 4381 goto out_free_pg_vec;
7f953ab2
SV
4382 }
4383 }
d7cf0c34 4384 break;
f6fb8f10 4385 default:
f897d759
WB
4386 if (!tx_ring) {
4387 rx_owner_map = bitmap_alloc(req->tp_frame_nr,
4388 GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
4389 if (!rx_owner_map)
4390 goto out_free_pg_vec;
4391 }
f6fb8f10 4392 break;
4393 }
69e3c75f
JB
4394 }
4395 /* Done */
4396 else {
4397 err = -EINVAL;
4ebf0ae2 4398 if (unlikely(req->tp_frame_nr))
69e3c75f 4399 goto out;
1da177e4
LT
4400 }
4401
1da177e4
LT
4402
4403 /* Detach socket from network */
4404 spin_lock(&po->bind_lock);
4405 was_running = po->running;
4406 num = po->num;
4407 if (was_running) {
414b5431 4408 WRITE_ONCE(po->num, 0);
ce06b03e 4409 __unregister_prot_hook(sk, false);
1da177e4
LT
4410 }
4411 spin_unlock(&po->bind_lock);
1ce4f28b 4412
1da177e4
LT
4413 synchronize_net();
4414
4415 err = -EBUSY;
905db440 4416 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
4417 if (closing || atomic_read(&po->mapped) == 0) {
4418 err = 0;
69e3c75f 4419 spin_lock_bh(&rb_queue->lock);
c053fd96 4420 swap(rb->pg_vec, pg_vec);
f897d759
WB
4421 if (po->tp_version <= TPACKET_V2)
4422 swap(rb->rx_owner_map, rx_owner_map);
69e3c75f
JB
4423 rb->frame_max = (req->tp_frame_nr - 1);
4424 rb->head = 0;
4425 rb->frame_size = req->tp_frame_size;
4426 spin_unlock_bh(&rb_queue->lock);
4427
3a7ad063 4428 swap(rb->pg_vec_order, order);
c053fd96 4429 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
4430
4431 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4432 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4433 tpacket_rcv : packet_rcv;
4434 skb_queue_purge(rb_queue);
1da177e4 4435 if (atomic_read(&po->mapped))
40d4e3df
ED
4436 pr_err("packet_mmap: vma is busy: %d\n",
4437 atomic_read(&po->mapped));
1da177e4 4438 }
905db440 4439 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4440
4441 spin_lock(&po->bind_lock);
ce06b03e 4442 if (was_running) {
414b5431 4443 WRITE_ONCE(po->num, num);
ce06b03e 4444 register_prot_hook(sk);
1da177e4
LT
4445 }
4446 spin_unlock(&po->bind_lock);
c800aaf8 4447 if (pg_vec && (po->tp_version > TPACKET_V2)) {
f6fb8f10 4448 /* Because we don't support block-based V3 on tx-ring */
4449 if (!tx_ring)
73d0fcf2 4450 prb_shutdown_retire_blk_timer(po, rb_queue);
f6fb8f10 4451 }
1da177e4 4452
55655e3d 4453out_free_pg_vec:
f897d759 4454 bitmap_free(rx_owner_map);
1da177e4 4455 if (pg_vec)
3a7ad063 4456 free_pg_vec(pg_vec, order, req->tp_block_nr);
1da177e4
LT
4457out:
4458 return err;
4459}
4460
69e3c75f
JB
4461static int packet_mmap(struct file *file, struct socket *sock,
4462 struct vm_area_struct *vma)
1da177e4
LT
4463{
4464 struct sock *sk = sock->sk;
4465 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
4466 unsigned long size, expected_size;
4467 struct packet_ring_buffer *rb;
1da177e4
LT
4468 unsigned long start;
4469 int err = -EINVAL;
4470 int i;
4471
4472 if (vma->vm_pgoff)
4473 return -EINVAL;
4474
905db440 4475 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
4476
4477 expected_size = 0;
4478 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4479 if (rb->pg_vec) {
4480 expected_size += rb->pg_vec_len
4481 * rb->pg_vec_pages
4482 * PAGE_SIZE;
4483 }
4484 }
4485
4486 if (expected_size == 0)
1da177e4 4487 goto out;
69e3c75f
JB
4488
4489 size = vma->vm_end - vma->vm_start;
4490 if (size != expected_size)
1da177e4
LT
4491 goto out;
4492
1da177e4 4493 start = vma->vm_start;
69e3c75f
JB
4494 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4495 if (rb->pg_vec == NULL)
4496 continue;
4497
4498 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
4499 struct page *page;
4500 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4501 int pg_num;
4502
c56b4d90
CG
4503 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4504 page = pgv_to_page(kaddr);
69e3c75f
JB
4505 err = vm_insert_page(vma, start, page);
4506 if (unlikely(err))
4507 goto out;
4508 start += PAGE_SIZE;
0e3125c7 4509 kaddr += PAGE_SIZE;
69e3c75f 4510 }
4ebf0ae2 4511 }
1da177e4 4512 }
69e3c75f 4513
4ebf0ae2 4514 atomic_inc(&po->mapped);
1da177e4
LT
4515 vma->vm_ops = &packet_mmap_ops;
4516 err = 0;
4517
4518out:
905db440 4519 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4520 return err;
4521}
1da177e4 4522
90ddc4f0 4523static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4524 .family = PF_PACKET,
4525 .owner = THIS_MODULE,
4526 .release = packet_release,
4527 .bind = packet_bind_spkt,
4528 .connect = sock_no_connect,
4529 .socketpair = sock_no_socketpair,
4530 .accept = sock_no_accept,
4531 .getname = packet_getname_spkt,
a11e1d43 4532 .poll = datagram_poll,
1da177e4 4533 .ioctl = packet_ioctl,
c7cbdbf2 4534 .gettstamp = sock_gettstamp,
1da177e4
LT
4535 .listen = sock_no_listen,
4536 .shutdown = sock_no_shutdown,
4537 .setsockopt = sock_no_setsockopt,
4538 .getsockopt = sock_no_getsockopt,
4539 .sendmsg = packet_sendmsg_spkt,
4540 .recvmsg = packet_recvmsg,
4541 .mmap = sock_no_mmap,
4542 .sendpage = sock_no_sendpage,
4543};
1da177e4 4544
90ddc4f0 4545static const struct proto_ops packet_ops = {
1da177e4
LT
4546 .family = PF_PACKET,
4547 .owner = THIS_MODULE,
4548 .release = packet_release,
4549 .bind = packet_bind,
4550 .connect = sock_no_connect,
4551 .socketpair = sock_no_socketpair,
4552 .accept = sock_no_accept,
1ce4f28b 4553 .getname = packet_getname,
a11e1d43 4554 .poll = packet_poll,
1da177e4 4555 .ioctl = packet_ioctl,
c7cbdbf2 4556 .gettstamp = sock_gettstamp,
1da177e4
LT
4557 .listen = sock_no_listen,
4558 .shutdown = sock_no_shutdown,
4559 .setsockopt = packet_setsockopt,
4560 .getsockopt = packet_getsockopt,
719c44d3
WB
4561#ifdef CONFIG_COMPAT
4562 .compat_setsockopt = compat_packet_setsockopt,
4563#endif
1da177e4
LT
4564 .sendmsg = packet_sendmsg,
4565 .recvmsg = packet_recvmsg,
4566 .mmap = packet_mmap,
4567 .sendpage = sock_no_sendpage,
4568};
4569
ec1b4cf7 4570static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4571 .family = PF_PACKET,
4572 .create = packet_create,
4573 .owner = THIS_MODULE,
4574};
4575
4576static struct notifier_block packet_netdev_notifier = {
40d4e3df 4577 .notifier_call = packet_notifier,
1da177e4
LT
4578};
4579
4580#ifdef CONFIG_PROC_FS
1da177e4
LT
4581
4582static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4583 __acquires(RCU)
1da177e4 4584{
e372c414 4585 struct net *net = seq_file_net(seq);
808f5114 4586
4587 rcu_read_lock();
4588 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4589}
4590
4591static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4592{
1bf40954 4593 struct net *net = seq_file_net(seq);
808f5114 4594 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4595}
4596
4597static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4598 __releases(RCU)
1da177e4 4599{
808f5114 4600 rcu_read_unlock();
1da177e4
LT
4601}
4602
1ce4f28b 4603static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4604{
4605 if (v == SEQ_START_TOKEN)
4606 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4607 else {
b7ceabd9 4608 struct sock *s = sk_entry(v);
1da177e4
LT
4609 const struct packet_sock *po = pkt_sk(s);
4610
4611 seq_printf(seq,
71338aa7 4612 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4 4613 s,
41c6d650 4614 refcount_read(&s->sk_refcnt),
1da177e4 4615 s->sk_type,
414b5431 4616 ntohs(READ_ONCE(po->num)),
1da177e4
LT
4617 po->ifindex,
4618 po->running,
4619 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4620 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4621 sock_i_ino(s));
1da177e4
LT
4622 }
4623
4624 return 0;
4625}
4626
56b3d975 4627static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4628 .start = packet_seq_start,
4629 .next = packet_seq_next,
4630 .stop = packet_seq_stop,
4631 .show = packet_seq_show,
4632};
1da177e4
LT
4633#endif
4634
2c8c1e72 4635static int __net_init packet_net_init(struct net *net)
d12d01d6 4636{
0fa7fa98 4637 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4638 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4639
c3506372
CH
4640 if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
4641 sizeof(struct seq_net_private)))
d12d01d6
DL
4642 return -ENOMEM;
4643
4644 return 0;
4645}
4646
2c8c1e72 4647static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4648{
ece31ffd 4649 remove_proc_entry("packet", net->proc_net);
669f8f1a 4650 WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
d12d01d6
DL
4651}
4652
4653static struct pernet_operations packet_net_ops = {
4654 .init = packet_net_init,
4655 .exit = packet_net_exit,
4656};
4657
4658
1da177e4
LT
4659static void __exit packet_exit(void)
4660{
1da177e4 4661 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4662 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4663 sock_unregister(PF_PACKET);
4664 proto_unregister(&packet_proto);
4665}
4666
4667static int __init packet_init(void)
4668{
36096f2f 4669 int rc;
1da177e4 4670
36096f2f
Y
4671 rc = proto_register(&packet_proto, 0);
4672 if (rc)
1da177e4 4673 goto out;
36096f2f
Y
4674 rc = sock_register(&packet_family_ops);
4675 if (rc)
4676 goto out_proto;
4677 rc = register_pernet_subsys(&packet_net_ops);
4678 if (rc)
4679 goto out_sock;
4680 rc = register_netdevice_notifier(&packet_netdev_notifier);
4681 if (rc)
4682 goto out_pernet;
1da177e4 4683
36096f2f
Y
4684 return 0;
4685
4686out_pernet:
4687 unregister_pernet_subsys(&packet_net_ops);
4688out_sock:
4689 sock_unregister(PF_PACKET);
4690out_proto:
4691 proto_unregister(&packet_proto);
1da177e4
LT
4692out:
4693 return rc;
4694}
4695
4696module_init(packet_init);
4697module_exit(packet_exit);
4698MODULE_LICENSE("GPL");
4699MODULE_ALIAS_NETPROTO(PF_PACKET);