]> git.proxmox.com Git - mirror_ubuntu-focal-kernel.git/blame - net/packet/af_packet.c
packet: fix data-race in fanout_flow_is_huge()
[mirror_ubuntu-focal-kernel.git] / net / packet / af_packet.c
CommitLineData
2874c5fd 1// SPDX-License-Identifier: GPL-2.0-or-later
1da177e4
LT
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * PACKET - implements raw packet sockets.
8 *
02c30a84 9 * Authors: Ross Biro
1da177e4
LT
10 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 * Alan Cox, <gw4pts@gw4pts.ampr.org>
12 *
1ce4f28b 13 * Fixes:
1da177e4
LT
14 * Alan Cox : verify_area() now used correctly
15 * Alan Cox : new skbuff lists, look ma no backlogs!
16 * Alan Cox : tidied skbuff lists.
17 * Alan Cox : Now uses generic datagram routines I
18 * added. Also fixed the peek/read crash
19 * from all old Linux datagram code.
20 * Alan Cox : Uses the improved datagram code.
21 * Alan Cox : Added NULL's for socket options.
22 * Alan Cox : Re-commented the code.
23 * Alan Cox : Use new kernel side addressing
24 * Rob Janssen : Correct MTU usage.
25 * Dave Platt : Counter leaks caused by incorrect
26 * interrupt locking and some slightly
27 * dubious gcc output. Can you read
28 * compiler: it said _VOLATILE_
29 * Richard Kooijman : Timestamp fixes.
30 * Alan Cox : New buffers. Use sk->mac.raw.
31 * Alan Cox : sendmsg/recvmsg support.
32 * Alan Cox : Protocol setting support
33 * Alexey Kuznetsov : Untied from IPv4 stack.
34 * Cyrus Durgin : Fixed kerneld for kmod.
35 * Michal Ostrowski : Module initialization cleanup.
1ce4f28b 36 * Ulises Alonso : Frame number limit removal and
1da177e4 37 * packet_set_ring memory leak.
0fb375fb
EB
38 * Eric Biederman : Allow for > 8 byte hardware addresses.
39 * The convention is that longer addresses
40 * will simply extend the hardware address
1ce4f28b 41 * byte arrays at the end of sockaddr_ll
0fb375fb 42 * and packet_mreq.
69e3c75f 43 * Johann Baudy : Added TX RING.
f6fb8f10 44 * Chetan Loke : Implemented TPACKET_V3 block abstraction
45 * layer.
46 * Copyright (C) 2011, <lokec@ccs.neu.edu>
1da177e4 47 */
1ce4f28b 48
1da177e4 49#include <linux/types.h>
1da177e4 50#include <linux/mm.h>
4fc268d2 51#include <linux/capability.h>
1da177e4
LT
52#include <linux/fcntl.h>
53#include <linux/socket.h>
54#include <linux/in.h>
55#include <linux/inet.h>
56#include <linux/netdevice.h>
57#include <linux/if_packet.h>
58#include <linux/wireless.h>
ffbc6111 59#include <linux/kernel.h>
1da177e4 60#include <linux/kmod.h>
5a0e3ad6 61#include <linux/slab.h>
0e3125c7 62#include <linux/vmalloc.h>
457c4cbc 63#include <net/net_namespace.h>
1da177e4
LT
64#include <net/ip.h>
65#include <net/protocol.h>
66#include <linux/skbuff.h>
67#include <net/sock.h>
68#include <linux/errno.h>
69#include <linux/timer.h>
7c0f6ba6 70#include <linux/uaccess.h>
1da177e4
LT
71#include <asm/ioctls.h>
72#include <asm/page.h>
a1f8e7f7 73#include <asm/cacheflush.h>
1da177e4
LT
74#include <asm/io.h>
75#include <linux/proc_fs.h>
76#include <linux/seq_file.h>
77#include <linux/poll.h>
78#include <linux/module.h>
79#include <linux/init.h>
905db440 80#include <linux/mutex.h>
05423b24 81#include <linux/if_vlan.h>
bfd5f4a3 82#include <linux/virtio_net.h>
ed85b565 83#include <linux/errqueue.h>
614f60fa 84#include <linux/net_tstamp.h>
b0138408 85#include <linux/percpu.h>
1da177e4
LT
86#ifdef CONFIG_INET
87#include <net/inet_common.h>
88#endif
47dceb8e 89#include <linux/bpf.h>
719c44d3 90#include <net/compat.h>
1da177e4 91
2787b04b
PE
92#include "internal.h"
93
1da177e4
LT
94/*
95 Assumptions:
96 - if device has no dev->hard_header routine, it adds and removes ll header
97 inside itself. In this case ll header is invisible outside of device,
98 but higher levels still should reserve dev->hard_header_len.
99 Some devices are enough clever to reallocate skb, when header
100 will not fit to reserved space (tunnel), another ones are silly
101 (PPP).
102 - packet socket receives packets with pulled ll header,
103 so that SOCK_RAW should push it back.
104
105On receive:
106-----------
107
108Incoming, dev->hard_header!=NULL
b0e380b1
ACM
109 mac_header -> ll header
110 data -> data
1da177e4
LT
111
112Outgoing, dev->hard_header!=NULL
b0e380b1
ACM
113 mac_header -> ll header
114 data -> ll header
1da177e4
LT
115
116Incoming, dev->hard_header==NULL
b0e380b1
ACM
117 mac_header -> UNKNOWN position. It is very likely, that it points to ll
118 header. PPP makes it, that is wrong, because introduce
db0c58f9 119 assymetry between rx and tx paths.
b0e380b1 120 data -> data
1da177e4
LT
121
122Outgoing, dev->hard_header==NULL
b0e380b1
ACM
123 mac_header -> data. ll header is still not built!
124 data -> data
1da177e4
LT
125
126Resume
127 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
128
129
130On transmit:
131------------
132
133dev->hard_header != NULL
b0e380b1
ACM
134 mac_header -> ll header
135 data -> ll header
1da177e4
LT
136
137dev->hard_header == NULL (ll header is added by device, we cannot control it)
b0e380b1
ACM
138 mac_header -> data
139 data -> data
1da177e4
LT
140
141 We should set nh.raw on output to correct posistion,
142 packet classifier depends on it.
143 */
144
1da177e4
LT
145/* Private packet socket structures. */
146
0fb375fb
EB
147/* identical to struct packet_mreq except it has
148 * a longer address field.
149 */
40d4e3df 150struct packet_mreq_max {
0fb375fb
EB
151 int mr_ifindex;
152 unsigned short mr_type;
153 unsigned short mr_alen;
154 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 155};
a2efcfa0 156
184f489e
DB
157union tpacket_uhdr {
158 struct tpacket_hdr *h1;
159 struct tpacket2_hdr *h2;
160 struct tpacket3_hdr *h3;
161 void *raw;
162};
163
f6fb8f10 164static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f
JB
165 int closing, int tx_ring);
166
f6fb8f10 167#define V3_ALIGNMENT (8)
168
bc59ba39 169#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
f6fb8f10 170
171#define BLK_PLUS_PRIV(sz_of_priv) \
172 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
173
f6fb8f10 174#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
175#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
176#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
177#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
178#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
179#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
180#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
181
69e3c75f 182struct packet_sock;
77f65ebd
WB
183static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
184 struct packet_type *pt, struct net_device *orig_dev);
1da177e4 185
f6fb8f10 186static void *packet_previous_frame(struct packet_sock *po,
187 struct packet_ring_buffer *rb,
188 int status);
189static void packet_increment_head(struct packet_ring_buffer *buff);
878cd3ba 190static int prb_curr_blk_in_use(struct tpacket_block_desc *);
bc59ba39 191static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 192 struct packet_sock *);
bc59ba39 193static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 194 struct packet_sock *, unsigned int status);
bc59ba39 195static int prb_queue_frozen(struct tpacket_kbdq_core *);
196static void prb_open_block(struct tpacket_kbdq_core *,
197 struct tpacket_block_desc *);
17bfd8c8 198static void prb_retire_rx_blk_timer_expired(struct timer_list *);
bc59ba39 199static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
bc59ba39 200static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
201static void prb_clear_rxhash(struct tpacket_kbdq_core *,
202 struct tpacket3_hdr *);
203static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
204 struct tpacket3_hdr *);
1da177e4 205static void packet_flush_mclist(struct sock *sk);
865b03f2 206static u16 packet_pick_tx_queue(struct sk_buff *skb);
1da177e4 207
ffbc6111 208struct packet_skb_cb {
ffbc6111
HX
209 union {
210 struct sockaddr_pkt pkt;
2472d761
EB
211 union {
212 /* Trick: alias skb original length with
213 * ll.sll_family and ll.protocol in order
214 * to save room.
215 */
216 unsigned int origlen;
217 struct sockaddr_ll ll;
218 };
ffbc6111
HX
219 } sa;
220};
221
d3869efe
DW
222#define vio_le() virtio_legacy_is_little_endian()
223
ffbc6111 224#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 225
bc59ba39 226#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 227#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 228 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 229#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 230 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 231#define GET_NEXT_PRB_BLK_NUM(x) \
232 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
233 ((x)->kactive_blk_num+1) : 0)
234
dc99f600
DM
235static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
236static void __fanout_link(struct sock *sk, struct packet_sock *po);
237
d346a3fa
DB
238static int packet_direct_xmit(struct sk_buff *skb)
239{
865b03f2 240 return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
d346a3fa
DB
241}
242
66e56cd4
DB
243static struct net_device *packet_cached_dev_get(struct packet_sock *po)
244{
245 struct net_device *dev;
246
247 rcu_read_lock();
248 dev = rcu_dereference(po->cached_dev);
249 if (likely(dev))
250 dev_hold(dev);
251 rcu_read_unlock();
252
253 return dev;
254}
255
256static void packet_cached_dev_assign(struct packet_sock *po,
257 struct net_device *dev)
258{
259 rcu_assign_pointer(po->cached_dev, dev);
260}
261
262static void packet_cached_dev_reset(struct packet_sock *po)
263{
264 RCU_INIT_POINTER(po->cached_dev, NULL);
265}
266
d346a3fa
DB
267static bool packet_use_direct_xmit(const struct packet_sock *po)
268{
269 return po->xmit == packet_direct_xmit;
270}
271
865b03f2 272static u16 packet_pick_tx_queue(struct sk_buff *skb)
0fd5d57b 273{
865b03f2 274 struct net_device *dev = skb->dev;
0fd5d57b 275 const struct net_device_ops *ops = dev->netdev_ops;
b71b5837 276 int cpu = raw_smp_processor_id();
0fd5d57b
DB
277 u16 queue_index;
278
b71b5837
PA
279#ifdef CONFIG_XPS
280 skb->sender_cpu = cpu + 1;
281#endif
282 skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
0fd5d57b 283 if (ops->ndo_select_queue) {
a350ecce 284 queue_index = ops->ndo_select_queue(dev, skb, NULL);
0fd5d57b
DB
285 queue_index = netdev_cap_txqueue(dev, queue_index);
286 } else {
b71b5837 287 queue_index = netdev_pick_tx(dev, skb, NULL);
0fd5d57b
DB
288 }
289
865b03f2 290 return queue_index;
0fd5d57b
DB
291}
292
a6361f0c 293/* __register_prot_hook must be invoked through register_prot_hook
ce06b03e
DM
294 * or from a context in which asynchronous accesses to the packet
295 * socket is not possible (packet_create()).
296 */
a6361f0c 297static void __register_prot_hook(struct sock *sk)
ce06b03e
DM
298{
299 struct packet_sock *po = pkt_sk(sk);
e40526cb 300
ce06b03e 301 if (!po->running) {
66e56cd4 302 if (po->fanout)
dc99f600 303 __fanout_link(sk, po);
66e56cd4 304 else
dc99f600 305 dev_add_pack(&po->prot_hook);
e40526cb 306
ce06b03e
DM
307 sock_hold(sk);
308 po->running = 1;
309 }
310}
311
a6361f0c
WB
312static void register_prot_hook(struct sock *sk)
313{
314 lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
315 __register_prot_hook(sk);
316}
317
318/* If the sync parameter is true, we will temporarily drop
ce06b03e
DM
319 * the po->bind_lock and do a synchronize_net to make sure no
320 * asynchronous packet processing paths still refer to the elements
321 * of po->prot_hook. If the sync parameter is false, it is the
322 * callers responsibility to take care of this.
323 */
324static void __unregister_prot_hook(struct sock *sk, bool sync)
325{
326 struct packet_sock *po = pkt_sk(sk);
327
a6361f0c
WB
328 lockdep_assert_held_once(&po->bind_lock);
329
ce06b03e 330 po->running = 0;
66e56cd4
DB
331
332 if (po->fanout)
dc99f600 333 __fanout_unlink(sk, po);
66e56cd4 334 else
dc99f600 335 __dev_remove_pack(&po->prot_hook);
e40526cb 336
ce06b03e
DM
337 __sock_put(sk);
338
339 if (sync) {
340 spin_unlock(&po->bind_lock);
341 synchronize_net();
342 spin_lock(&po->bind_lock);
343 }
344}
345
346static void unregister_prot_hook(struct sock *sk, bool sync)
347{
348 struct packet_sock *po = pkt_sk(sk);
349
350 if (po->running)
351 __unregister_prot_hook(sk, sync);
352}
353
6e58040b 354static inline struct page * __pure pgv_to_page(void *addr)
0af55bb5
CG
355{
356 if (is_vmalloc_addr(addr))
357 return vmalloc_to_page(addr);
358 return virt_to_page(addr);
359}
360
69e3c75f 361static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 362{
184f489e 363 union tpacket_uhdr h;
1da177e4 364
69e3c75f 365 h.raw = frame;
bbd6ef87
PM
366 switch (po->tp_version) {
367 case TPACKET_V1:
69e3c75f 368 h.h1->tp_status = status;
0af55bb5 369 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
370 break;
371 case TPACKET_V2:
69e3c75f 372 h.h2->tp_status = status;
0af55bb5 373 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 374 break;
f6fb8f10 375 case TPACKET_V3:
7f953ab2
SV
376 h.h3->tp_status = status;
377 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
378 break;
69e3c75f 379 default:
f6fb8f10 380 WARN(1, "TPACKET version not supported.\n");
69e3c75f 381 BUG();
bbd6ef87 382 }
69e3c75f
JB
383
384 smp_wmb();
bbd6ef87
PM
385}
386
96f657e6 387static int __packet_get_status(const struct packet_sock *po, void *frame)
bbd6ef87 388{
184f489e 389 union tpacket_uhdr h;
bbd6ef87 390
69e3c75f
JB
391 smp_rmb();
392
bbd6ef87
PM
393 h.raw = frame;
394 switch (po->tp_version) {
395 case TPACKET_V1:
0af55bb5 396 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 397 return h.h1->tp_status;
bbd6ef87 398 case TPACKET_V2:
0af55bb5 399 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 400 return h.h2->tp_status;
f6fb8f10 401 case TPACKET_V3:
7f953ab2
SV
402 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
403 return h.h3->tp_status;
69e3c75f 404 default:
f6fb8f10 405 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
406 BUG();
407 return 0;
bbd6ef87 408 }
1da177e4 409}
69e3c75f 410
b9c32fb2
DB
411static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
412 unsigned int flags)
7a51384c
DB
413{
414 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
415
68a360e8
WB
416 if (shhwtstamps &&
417 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
418 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
419 return TP_STATUS_TS_RAW_HARDWARE;
7a51384c
DB
420
421 if (ktime_to_timespec_cond(skb->tstamp, ts))
b9c32fb2 422 return TP_STATUS_TS_SOFTWARE;
7a51384c 423
b9c32fb2 424 return 0;
7a51384c
DB
425}
426
b9c32fb2
DB
427static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
428 struct sk_buff *skb)
2e31396f
WB
429{
430 union tpacket_uhdr h;
431 struct timespec ts;
b9c32fb2 432 __u32 ts_status;
2e31396f 433
b9c32fb2
DB
434 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
435 return 0;
2e31396f
WB
436
437 h.raw = frame;
438 switch (po->tp_version) {
439 case TPACKET_V1:
440 h.h1->tp_sec = ts.tv_sec;
441 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
442 break;
443 case TPACKET_V2:
444 h.h2->tp_sec = ts.tv_sec;
445 h.h2->tp_nsec = ts.tv_nsec;
446 break;
447 case TPACKET_V3:
57ea884b
DB
448 h.h3->tp_sec = ts.tv_sec;
449 h.h3->tp_nsec = ts.tv_nsec;
450 break;
2e31396f
WB
451 default:
452 WARN(1, "TPACKET version not supported.\n");
453 BUG();
454 }
455
456 /* one flush is safe, as both fields always lie on the same cacheline */
457 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
458 smp_wmb();
b9c32fb2
DB
459
460 return ts_status;
2e31396f
WB
461}
462
d4b5bd98
ED
463static void *packet_lookup_frame(const struct packet_sock *po,
464 const struct packet_ring_buffer *rb,
465 unsigned int position,
466 int status)
69e3c75f
JB
467{
468 unsigned int pg_vec_pos, frame_offset;
184f489e 469 union tpacket_uhdr h;
69e3c75f
JB
470
471 pg_vec_pos = position / rb->frames_per_block;
472 frame_offset = position % rb->frames_per_block;
473
0e3125c7
NH
474 h.raw = rb->pg_vec[pg_vec_pos].buffer +
475 (frame_offset * rb->frame_size);
69e3c75f
JB
476
477 if (status != __packet_get_status(po, h.raw))
478 return NULL;
479
480 return h.raw;
481}
482
eea49cc9 483static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
484 struct packet_ring_buffer *rb,
485 int status)
486{
487 return packet_lookup_frame(po, rb, rb->head, status);
488}
489
bc59ba39 490static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 491{
492 del_timer_sync(&pkc->retire_blk_timer);
493}
494
495static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
f6fb8f10 496 struct sk_buff_head *rb_queue)
497{
bc59ba39 498 struct tpacket_kbdq_core *pkc;
f6fb8f10 499
73d0fcf2 500 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 501
ec6f809f 502 spin_lock_bh(&rb_queue->lock);
f6fb8f10 503 pkc->delete_blk_timer = 1;
ec6f809f 504 spin_unlock_bh(&rb_queue->lock);
f6fb8f10 505
506 prb_del_retire_blk_timer(pkc);
507}
508
e8e85cc5 509static void prb_setup_retire_blk_timer(struct packet_sock *po)
f6fb8f10 510{
bc59ba39 511 struct tpacket_kbdq_core *pkc;
f6fb8f10 512
e8e85cc5 513 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
17bfd8c8
KC
514 timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
515 0);
516 pkc->retire_blk_timer.expires = jiffies;
f6fb8f10 517}
518
519static int prb_calc_retire_blk_tmo(struct packet_sock *po,
520 int blk_size_in_bytes)
521{
522 struct net_device *dev;
523 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
7cad1bac 524 struct ethtool_link_ksettings ecmd;
4bc71cb9 525 int err;
f6fb8f10 526
4bc71cb9
JP
527 rtnl_lock();
528 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
529 if (unlikely(!dev)) {
530 rtnl_unlock();
f6fb8f10 531 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9 532 }
7cad1bac 533 err = __ethtool_get_link_ksettings(dev, &ecmd);
4bc71cb9
JP
534 rtnl_unlock();
535 if (!err) {
4bc71cb9
JP
536 /*
537 * If the link speed is so slow you don't really
538 * need to worry about perf anyways
539 */
7cad1bac
DD
540 if (ecmd.base.speed < SPEED_1000 ||
541 ecmd.base.speed == SPEED_UNKNOWN) {
4bc71cb9 542 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 543 } else {
544 msec = 1;
7cad1bac 545 div = ecmd.base.speed / 1000;
f6fb8f10 546 }
fcfcfe0b
MW
547 } else
548 return DEFAULT_PRB_RETIRE_TOV;
f6fb8f10 549
550 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
551
552 if (div)
553 mbits /= div;
554
555 tmo = mbits * msec;
556
557 if (div)
558 return tmo+1;
559 return tmo;
560}
561
bc59ba39 562static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 563 union tpacket_req_u *req_u)
564{
565 p1->feature_req_word = req_u->req3.tp_feature_req_word;
566}
567
568static void init_prb_bdqc(struct packet_sock *po,
569 struct packet_ring_buffer *rb,
570 struct pgv *pg_vec,
e8e85cc5 571 union tpacket_req_u *req_u)
f6fb8f10 572{
22781a5b 573 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
bc59ba39 574 struct tpacket_block_desc *pbd;
f6fb8f10 575
576 memset(p1, 0x0, sizeof(*p1));
577
578 p1->knxt_seq_num = 1;
579 p1->pkbdq = pg_vec;
bc59ba39 580 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 581 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 582 p1->kblk_size = req_u->req3.tp_block_size;
583 p1->knum_blocks = req_u->req3.tp_block_nr;
584 p1->hdrlen = po->tp_hdrlen;
585 p1->version = po->tp_version;
586 p1->last_kactive_blk_num = 0;
ee80fbf3 587 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 588 if (req_u->req3.tp_retire_blk_tov)
589 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
590 else
591 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
592 req_u->req3.tp_block_size);
593 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
594 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
595
dc808110 596 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
f6fb8f10 597 prb_init_ft_ops(p1, req_u);
e8e85cc5 598 prb_setup_retire_blk_timer(po);
f6fb8f10 599 prb_open_block(p1, pbd);
600}
601
602/* Do NOT update the last_blk_num first.
603 * Assumes sk_buff_head lock is held.
604 */
bc59ba39 605static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 606{
607 mod_timer(&pkc->retire_blk_timer,
608 jiffies + pkc->tov_in_jiffies);
609 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
610}
611
612/*
613 * Timer logic:
614 * 1) We refresh the timer only when we open a block.
615 * By doing this we don't waste cycles refreshing the timer
616 * on packet-by-packet basis.
617 *
618 * With a 1MB block-size, on a 1Gbps line, it will take
619 * i) ~8 ms to fill a block + ii) memcpy etc.
620 * In this cut we are not accounting for the memcpy time.
621 *
622 * So, if the user sets the 'tmo' to 10ms then the timer
623 * will never fire while the block is still getting filled
624 * (which is what we want). However, the user could choose
625 * to close a block early and that's fine.
626 *
627 * But when the timer does fire, we check whether or not to refresh it.
628 * Since the tmo granularity is in msecs, it is not too expensive
629 * to refresh the timer, lets say every '8' msecs.
630 * Either the user can set the 'tmo' or we can derive it based on
631 * a) line-speed and b) block-size.
632 * prb_calc_retire_blk_tmo() calculates the tmo.
633 *
634 */
17bfd8c8 635static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
f6fb8f10 636{
17bfd8c8
KC
637 struct packet_sock *po =
638 from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
22781a5b 639 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 640 unsigned int frozen;
bc59ba39 641 struct tpacket_block_desc *pbd;
f6fb8f10 642
643 spin_lock(&po->sk.sk_receive_queue.lock);
644
645 frozen = prb_queue_frozen(pkc);
646 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
647
648 if (unlikely(pkc->delete_blk_timer))
649 goto out;
650
651 /* We only need to plug the race when the block is partially filled.
652 * tpacket_rcv:
653 * lock(); increment BLOCK_NUM_PKTS; unlock()
654 * copy_bits() is in progress ...
655 * timer fires on other cpu:
656 * we can't retire the current block because copy_bits
657 * is in progress.
658 *
659 */
660 if (BLOCK_NUM_PKTS(pbd)) {
661 while (atomic_read(&pkc->blk_fill_in_prog)) {
662 /* Waiting for skb_copy_bits to finish... */
663 cpu_relax();
664 }
665 }
666
667 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
668 if (!frozen) {
41a50d62
AD
669 if (!BLOCK_NUM_PKTS(pbd)) {
670 /* An empty block. Just refresh the timer. */
671 goto refresh_timer;
672 }
f6fb8f10 673 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
674 if (!prb_dispatch_next_block(pkc, po))
675 goto refresh_timer;
676 else
677 goto out;
678 } else {
679 /* Case 1. Queue was frozen because user-space was
680 * lagging behind.
681 */
878cd3ba 682 if (prb_curr_blk_in_use(pbd)) {
f6fb8f10 683 /*
684 * Ok, user-space is still behind.
685 * So just refresh the timer.
686 */
687 goto refresh_timer;
688 } else {
689 /* Case 2. queue was frozen,user-space caught up,
690 * now the link went idle && the timer fired.
691 * We don't have a block to close.So we open this
692 * block and restart the timer.
693 * opening a block thaws the queue,restarts timer
694 * Thawing/timer-refresh is a side effect.
695 */
696 prb_open_block(pkc, pbd);
697 goto out;
698 }
699 }
700 }
701
702refresh_timer:
703 _prb_refresh_rx_retire_blk_timer(pkc);
704
705out:
706 spin_unlock(&po->sk.sk_receive_queue.lock);
707}
708
eea49cc9 709static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 710 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 711{
712 /* Flush everything minus the block header */
713
714#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
715 u8 *start, *end;
716
717 start = (u8 *)pbd1;
718
719 /* Skip the block header(we know header WILL fit in 4K) */
720 start += PAGE_SIZE;
721
722 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
723 for (; start < end; start += PAGE_SIZE)
724 flush_dcache_page(pgv_to_page(start));
725
726 smp_wmb();
727#endif
728
729 /* Now update the block status. */
730
731 BLOCK_STATUS(pbd1) = status;
732
733 /* Flush the block header */
734
735#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
736 start = (u8 *)pbd1;
737 flush_dcache_page(pgv_to_page(start));
738
739 smp_wmb();
740#endif
741}
742
743/*
744 * Side effect:
745 *
746 * 1) flush the block
747 * 2) Increment active_blk_num
748 *
749 * Note:We DONT refresh the timer on purpose.
750 * Because almost always the next block will be opened.
751 */
bc59ba39 752static void prb_close_block(struct tpacket_kbdq_core *pkc1,
753 struct tpacket_block_desc *pbd1,
f6fb8f10 754 struct packet_sock *po, unsigned int stat)
755{
756 __u32 status = TP_STATUS_USER | stat;
757
758 struct tpacket3_hdr *last_pkt;
bc59ba39 759 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
da413eec 760 struct sock *sk = &po->sk;
f6fb8f10 761
8e8e2951 762 if (atomic_read(&po->tp_drops))
f6fb8f10 763 status |= TP_STATUS_LOSING;
764
765 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
766 last_pkt->tp_next_offset = 0;
767
768 /* Get the ts of the last pkt */
769 if (BLOCK_NUM_PKTS(pbd1)) {
770 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
771 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
772 } else {
41a50d62
AD
773 /* Ok, we tmo'd - so get the current time.
774 *
775 * It shouldn't really happen as we don't close empty
776 * blocks. See prb_retire_rx_blk_timer_expired().
777 */
f6fb8f10 778 struct timespec ts;
779 getnstimeofday(&ts);
780 h1->ts_last_pkt.ts_sec = ts.tv_sec;
781 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
782 }
783
784 smp_wmb();
785
786 /* Flush the block */
787 prb_flush_block(pkc1, pbd1, status);
788
da413eec
DC
789 sk->sk_data_ready(sk);
790
f6fb8f10 791 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
792}
793
eea49cc9 794static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 795{
796 pkc->reset_pending_on_curr_blk = 0;
797}
798
799/*
800 * Side effect of opening a block:
801 *
802 * 1) prb_queue is thawed.
803 * 2) retire_blk_timer is refreshed.
804 *
805 */
bc59ba39 806static void prb_open_block(struct tpacket_kbdq_core *pkc1,
807 struct tpacket_block_desc *pbd1)
f6fb8f10 808{
809 struct timespec ts;
bc59ba39 810 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 811
812 smp_rmb();
813
8da3056c
DB
814 /* We could have just memset this but we will lose the
815 * flexibility of making the priv area sticky
816 */
f6fb8f10 817
8da3056c
DB
818 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
819 BLOCK_NUM_PKTS(pbd1) = 0;
820 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 821
8da3056c
DB
822 getnstimeofday(&ts);
823
824 h1->ts_first_pkt.ts_sec = ts.tv_sec;
825 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
f6fb8f10 826
8da3056c
DB
827 pkc1->pkblk_start = (char *)pbd1;
828 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
829
830 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
831 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
832
833 pbd1->version = pkc1->version;
834 pkc1->prev = pkc1->nxt_offset;
835 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
836
837 prb_thaw_queue(pkc1);
838 _prb_refresh_rx_retire_blk_timer(pkc1);
839
840 smp_wmb();
f6fb8f10 841}
842
843/*
844 * Queue freeze logic:
845 * 1) Assume tp_block_nr = 8 blocks.
846 * 2) At time 't0', user opens Rx ring.
847 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
848 * 4) user-space is either sleeping or processing block '0'.
849 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
850 * it will close block-7,loop around and try to fill block '0'.
851 * call-flow:
852 * __packet_lookup_frame_in_block
853 * prb_retire_current_block()
854 * prb_dispatch_next_block()
855 * |->(BLOCK_STATUS == USER) evaluates to true
856 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
857 * 6) Now there are two cases:
858 * 6.1) Link goes idle right after the queue is frozen.
859 * But remember, the last open_block() refreshed the timer.
860 * When this timer expires,it will refresh itself so that we can
861 * re-open block-0 in near future.
862 * 6.2) Link is busy and keeps on receiving packets. This is a simple
863 * case and __packet_lookup_frame_in_block will check if block-0
864 * is free and can now be re-used.
865 */
eea49cc9 866static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 867 struct packet_sock *po)
868{
869 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 870 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 871}
872
873#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
874
875/*
876 * If the next block is free then we will dispatch it
877 * and return a good offset.
878 * Else, we will freeze the queue.
879 * So, caller must check the return value.
880 */
bc59ba39 881static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 882 struct packet_sock *po)
883{
bc59ba39 884 struct tpacket_block_desc *pbd;
f6fb8f10 885
886 smp_rmb();
887
888 /* 1. Get current block num */
889 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
890
891 /* 2. If this block is currently in_use then freeze the queue */
892 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
893 prb_freeze_queue(pkc, po);
894 return NULL;
895 }
896
897 /*
898 * 3.
899 * open this block and return the offset where the first packet
900 * needs to get stored.
901 */
902 prb_open_block(pkc, pbd);
903 return (void *)pkc->nxt_offset;
904}
905
bc59ba39 906static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 907 struct packet_sock *po, unsigned int status)
908{
bc59ba39 909 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 910
911 /* retire/close the current block */
912 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
913 /*
914 * Plug the case where copy_bits() is in progress on
915 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
916 * have space to copy the pkt in the current block and
917 * called prb_retire_current_block()
918 *
919 * We don't need to worry about the TMO case because
920 * the timer-handler already handled this case.
921 */
922 if (!(status & TP_STATUS_BLK_TMO)) {
923 while (atomic_read(&pkc->blk_fill_in_prog)) {
924 /* Waiting for skb_copy_bits to finish... */
925 cpu_relax();
926 }
927 }
928 prb_close_block(pkc, pbd, po, status);
929 return;
930 }
f6fb8f10 931}
932
878cd3ba 933static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
f6fb8f10 934{
935 return TP_STATUS_USER & BLOCK_STATUS(pbd);
936}
937
eea49cc9 938static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 939{
940 return pkc->reset_pending_on_curr_blk;
941}
942
eea49cc9 943static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 944{
bc59ba39 945 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 946 atomic_dec(&pkc->blk_fill_in_prog);
947}
948
eea49cc9 949static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 950 struct tpacket3_hdr *ppd)
951{
3958afa1 952 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
f6fb8f10 953}
954
eea49cc9 955static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 956 struct tpacket3_hdr *ppd)
957{
958 ppd->hv1.tp_rxhash = 0;
959}
960
eea49cc9 961static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 962 struct tpacket3_hdr *ppd)
963{
df8a39de
JP
964 if (skb_vlan_tag_present(pkc->skb)) {
965 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
a0cdfcf3
AW
966 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
967 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
f6fb8f10 968 } else {
9e67030a 969 ppd->hv1.tp_vlan_tci = 0;
a0cdfcf3 970 ppd->hv1.tp_vlan_tpid = 0;
9e67030a 971 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 972 }
973}
974
bc59ba39 975static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 976 struct tpacket3_hdr *ppd)
977{
a0cdfcf3 978 ppd->hv1.tp_padding = 0;
f6fb8f10 979 prb_fill_vlan_info(pkc, ppd);
980
981 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
982 prb_fill_rxhash(pkc, ppd);
983 else
984 prb_clear_rxhash(pkc, ppd);
985}
986
eea49cc9 987static void prb_fill_curr_block(char *curr,
bc59ba39 988 struct tpacket_kbdq_core *pkc,
989 struct tpacket_block_desc *pbd,
f6fb8f10 990 unsigned int len)
991{
992 struct tpacket3_hdr *ppd;
993
994 ppd = (struct tpacket3_hdr *)curr;
995 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
996 pkc->prev = curr;
997 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
998 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
999 BLOCK_NUM_PKTS(pbd) += 1;
1000 atomic_inc(&pkc->blk_fill_in_prog);
1001 prb_run_all_ft_ops(pkc, ppd);
1002}
1003
1004/* Assumes caller has the sk->rx_queue.lock */
1005static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1006 struct sk_buff *skb,
f6fb8f10 1007 unsigned int len
1008 )
1009{
bc59ba39 1010 struct tpacket_kbdq_core *pkc;
1011 struct tpacket_block_desc *pbd;
f6fb8f10 1012 char *curr, *end;
1013
e3192690 1014 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 1015 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1016
1017 /* Queue is frozen when user space is lagging behind */
1018 if (prb_queue_frozen(pkc)) {
1019 /*
1020 * Check if that last block which caused the queue to freeze,
1021 * is still in_use by user-space.
1022 */
878cd3ba 1023 if (prb_curr_blk_in_use(pbd)) {
f6fb8f10 1024 /* Can't record this packet */
1025 return NULL;
1026 } else {
1027 /*
1028 * Ok, the block was released by user-space.
1029 * Now let's open that block.
1030 * opening a block also thaws the queue.
1031 * Thawing is a side effect.
1032 */
1033 prb_open_block(pkc, pbd);
1034 }
1035 }
1036
1037 smp_mb();
1038 curr = pkc->nxt_offset;
1039 pkc->skb = skb;
e3192690 1040 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 1041
1042 /* first try the current block */
1043 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1044 prb_fill_curr_block(curr, pkc, pbd, len);
1045 return (void *)curr;
1046 }
1047
1048 /* Ok, close the current block */
1049 prb_retire_current_block(pkc, po, 0);
1050
1051 /* Now, try to dispatch the next block */
1052 curr = (char *)prb_dispatch_next_block(pkc, po);
1053 if (curr) {
1054 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1055 prb_fill_curr_block(curr, pkc, pbd, len);
1056 return (void *)curr;
1057 }
1058
1059 /*
1060 * No free blocks are available.user_space hasn't caught up yet.
1061 * Queue was just frozen and now this packet will get dropped.
1062 */
1063 return NULL;
1064}
1065
eea49cc9 1066static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1067 struct sk_buff *skb,
1068 int status, unsigned int len)
1069{
1070 char *curr = NULL;
1071 switch (po->tp_version) {
1072 case TPACKET_V1:
1073 case TPACKET_V2:
1074 curr = packet_lookup_frame(po, &po->rx_ring,
1075 po->rx_ring.head, status);
1076 return curr;
1077 case TPACKET_V3:
46088059 1078 return __packet_lookup_frame_in_block(po, skb, len);
f6fb8f10 1079 default:
1080 WARN(1, "TPACKET version not supported\n");
1081 BUG();
99aa3473 1082 return NULL;
f6fb8f10 1083 }
1084}
1085
dcf70cef
ED
1086static void *prb_lookup_block(const struct packet_sock *po,
1087 const struct packet_ring_buffer *rb,
1088 unsigned int idx,
1089 int status)
f6fb8f10 1090{
bc59ba39 1091 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1092 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1093
1094 if (status != BLOCK_STATUS(pbd))
1095 return NULL;
1096 return pbd;
1097}
1098
eea49cc9 1099static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1100{
1101 unsigned int prev;
1102 if (rb->prb_bdqc.kactive_blk_num)
1103 prev = rb->prb_bdqc.kactive_blk_num-1;
1104 else
1105 prev = rb->prb_bdqc.knum_blocks-1;
1106 return prev;
1107}
1108
1109/* Assumes caller has held the rx_queue.lock */
eea49cc9 1110static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1111 struct packet_ring_buffer *rb,
1112 int status)
1113{
1114 unsigned int previous = prb_previous_blk_num(rb);
1115 return prb_lookup_block(po, rb, previous, status);
1116}
1117
eea49cc9 1118static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1119 struct packet_ring_buffer *rb,
1120 int status)
1121{
1122 if (po->tp_version <= TPACKET_V2)
1123 return packet_previous_frame(po, rb, status);
1124
1125 return __prb_previous_block(po, rb, status);
1126}
1127
eea49cc9 1128static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1129 struct packet_ring_buffer *rb)
1130{
1131 switch (po->tp_version) {
1132 case TPACKET_V1:
1133 case TPACKET_V2:
1134 return packet_increment_head(rb);
1135 case TPACKET_V3:
1136 default:
1137 WARN(1, "TPACKET version not supported.\n");
1138 BUG();
1139 return;
1140 }
1141}
1142
eea49cc9 1143static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1144 struct packet_ring_buffer *rb,
1145 int status)
1146{
1147 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1148 return packet_lookup_frame(po, rb, previous, status);
1149}
1150
eea49cc9 1151static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1152{
1153 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1154}
1155
b0138408
DB
1156static void packet_inc_pending(struct packet_ring_buffer *rb)
1157{
1158 this_cpu_inc(*rb->pending_refcnt);
1159}
1160
1161static void packet_dec_pending(struct packet_ring_buffer *rb)
1162{
1163 this_cpu_dec(*rb->pending_refcnt);
1164}
1165
1166static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1167{
1168 unsigned int refcnt = 0;
1169 int cpu;
1170
1171 /* We don't use pending refcount in rx_ring. */
1172 if (rb->pending_refcnt == NULL)
1173 return 0;
1174
1175 for_each_possible_cpu(cpu)
1176 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1177
1178 return refcnt;
1179}
1180
1181static int packet_alloc_pending(struct packet_sock *po)
1182{
1183 po->rx_ring.pending_refcnt = NULL;
1184
1185 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1186 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1187 return -ENOBUFS;
1188
1189 return 0;
1190}
1191
1192static void packet_free_pending(struct packet_sock *po)
1193{
1194 free_percpu(po->tx_ring.pending_refcnt);
1195}
1196
9954729b
WB
1197#define ROOM_POW_OFF 2
1198#define ROOM_NONE 0x0
1199#define ROOM_LOW 0x1
1200#define ROOM_NORMAL 0x2
1201
d4b5bd98 1202static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
77f65ebd 1203{
9954729b
WB
1204 int idx, len;
1205
d4b5bd98
ED
1206 len = READ_ONCE(po->rx_ring.frame_max) + 1;
1207 idx = READ_ONCE(po->rx_ring.head);
9954729b
WB
1208 if (pow_off)
1209 idx += len >> pow_off;
1210 if (idx >= len)
1211 idx -= len;
1212 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1213}
1214
dcf70cef 1215static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
9954729b
WB
1216{
1217 int idx, len;
1218
dcf70cef
ED
1219 len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
1220 idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
9954729b
WB
1221 if (pow_off)
1222 idx += len >> pow_off;
1223 if (idx >= len)
1224 idx -= len;
1225 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1226}
77f65ebd 1227
0338a145
ED
1228static int __packet_rcv_has_room(const struct packet_sock *po,
1229 const struct sk_buff *skb)
9954729b 1230{
0338a145 1231 const struct sock *sk = &po->sk;
9954729b
WB
1232 int ret = ROOM_NONE;
1233
1234 if (po->prot_hook.func != tpacket_rcv) {
0338a145
ED
1235 int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
1236 int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
1237 - (skb ? skb->truesize : 0);
1238
1239 if (avail > (rcvbuf >> ROOM_POW_OFF))
9954729b
WB
1240 return ROOM_NORMAL;
1241 else if (avail > 0)
1242 return ROOM_LOW;
1243 else
1244 return ROOM_NONE;
1245 }
77f65ebd 1246
9954729b
WB
1247 if (po->tp_version == TPACKET_V3) {
1248 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1249 ret = ROOM_NORMAL;
1250 else if (__tpacket_v3_has_room(po, 0))
1251 ret = ROOM_LOW;
1252 } else {
1253 if (__tpacket_has_room(po, ROOM_POW_OFF))
1254 ret = ROOM_NORMAL;
1255 else if (__tpacket_has_room(po, 0))
1256 ret = ROOM_LOW;
1257 }
2ccdbaa6
WB
1258
1259 return ret;
1260}
1261
1262static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1263{
3a2bb84e 1264 int pressure, ret;
2ccdbaa6 1265
54d7c01d 1266 ret = __packet_rcv_has_room(po, skb);
3a2bb84e
ED
1267 pressure = ret != ROOM_NORMAL;
1268
1269 if (READ_ONCE(po->pressure) != pressure)
1270 WRITE_ONCE(po->pressure, pressure);
77f65ebd 1271
9954729b 1272 return ret;
77f65ebd
WB
1273}
1274
9bb6cd65
ED
1275static void packet_rcv_try_clear_pressure(struct packet_sock *po)
1276{
1277 if (READ_ONCE(po->pressure) &&
1278 __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
1279 WRITE_ONCE(po->pressure, 0);
1280}
1281
1da177e4
LT
1282static void packet_sock_destruct(struct sock *sk)
1283{
ed85b565
RC
1284 skb_queue_purge(&sk->sk_error_queue);
1285
547b792c 1286 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
14afee4b 1287 WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1da177e4
LT
1288
1289 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1290 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1291 return;
1292 }
1293
17ab56a2 1294 sk_refcnt_debug_dec(sk);
1da177e4
LT
1295}
1296
3b3a5b0a
WB
1297static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1298{
f6cec329
ED
1299 u32 *history = po->rollover->history;
1300 u32 victim, rxhash;
3b3a5b0a
WB
1301 int i, count = 0;
1302
1303 rxhash = skb_get_hash(skb);
1304 for (i = 0; i < ROLLOVER_HLEN; i++)
f6cec329 1305 if (READ_ONCE(history[i]) == rxhash)
3b3a5b0a
WB
1306 count++;
1307
f6cec329
ED
1308 victim = prandom_u32() % ROLLOVER_HLEN;
1309
1310 /* Avoid dirtying the cache line if possible */
1311 if (READ_ONCE(history[victim]) != rxhash)
1312 WRITE_ONCE(history[victim], rxhash);
1313
3b3a5b0a
WB
1314 return count > (ROLLOVER_HLEN >> 1);
1315}
1316
77f65ebd
WB
1317static unsigned int fanout_demux_hash(struct packet_fanout *f,
1318 struct sk_buff *skb,
1319 unsigned int num)
dc99f600 1320{
eb70db87 1321 return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
dc99f600
DM
1322}
1323
77f65ebd
WB
1324static unsigned int fanout_demux_lb(struct packet_fanout *f,
1325 struct sk_buff *skb,
1326 unsigned int num)
dc99f600 1327{
468479e6 1328 unsigned int val = atomic_inc_return(&f->rr_cur);
dc99f600 1329
468479e6 1330 return val % num;
77f65ebd
WB
1331}
1332
1333static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1334 struct sk_buff *skb,
1335 unsigned int num)
1336{
1337 return smp_processor_id() % num;
dc99f600
DM
1338}
1339
5df0ddfb
DB
1340static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1341 struct sk_buff *skb,
1342 unsigned int num)
1343{
f337db64 1344 return prandom_u32_max(num);
5df0ddfb
DB
1345}
1346
77f65ebd
WB
1347static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1348 struct sk_buff *skb,
ad377cab 1349 unsigned int idx, bool try_self,
77f65ebd 1350 unsigned int num)
95ec3eb4 1351{
4633c9e0 1352 struct packet_sock *po, *po_next, *po_skip = NULL;
a9b63918 1353 unsigned int i, j, room = ROOM_NONE;
95ec3eb4 1354
0648ab70 1355 po = pkt_sk(f->arr[idx]);
3b3a5b0a
WB
1356
1357 if (try_self) {
1358 room = packet_rcv_has_room(po, skb);
1359 if (room == ROOM_NORMAL ||
1360 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1361 return idx;
4633c9e0 1362 po_skip = po;
3b3a5b0a 1363 }
ad377cab 1364
0648ab70 1365 i = j = min_t(int, po->rollover->sock, num - 1);
77f65ebd 1366 do {
2ccdbaa6 1367 po_next = pkt_sk(f->arr[i]);
3a2bb84e 1368 if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
2ccdbaa6 1369 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
77f65ebd 1370 if (i != j)
0648ab70 1371 po->rollover->sock = i;
a9b63918
WB
1372 atomic_long_inc(&po->rollover->num);
1373 if (room == ROOM_LOW)
1374 atomic_long_inc(&po->rollover->num_huge);
77f65ebd
WB
1375 return i;
1376 }
ad377cab 1377
77f65ebd
WB
1378 if (++i == num)
1379 i = 0;
1380 } while (i != j);
1381
a9b63918 1382 atomic_long_inc(&po->rollover->num_failed);
77f65ebd
WB
1383 return idx;
1384}
1385
2d36097d
NH
1386static unsigned int fanout_demux_qm(struct packet_fanout *f,
1387 struct sk_buff *skb,
1388 unsigned int num)
1389{
1390 return skb_get_queue_mapping(skb) % num;
1391}
1392
47dceb8e
WB
1393static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1394 struct sk_buff *skb,
1395 unsigned int num)
1396{
1397 struct bpf_prog *prog;
1398 unsigned int ret = 0;
1399
1400 rcu_read_lock();
1401 prog = rcu_dereference(f->bpf_prog);
1402 if (prog)
ff936a04 1403 ret = bpf_prog_run_clear_cb(prog, skb) % num;
47dceb8e
WB
1404 rcu_read_unlock();
1405
1406 return ret;
1407}
1408
77f65ebd
WB
1409static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1410{
1411 return f->flags & (flag >> 8);
95ec3eb4
DM
1412}
1413
95ec3eb4
DM
1414static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1415 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1416{
1417 struct packet_fanout *f = pt->af_packet_priv;
f98f4514 1418 unsigned int num = READ_ONCE(f->num_members);
19bcf9f2 1419 struct net *net = read_pnet(&f->net);
dc99f600 1420 struct packet_sock *po;
77f65ebd 1421 unsigned int idx;
dc99f600 1422
19bcf9f2 1423 if (!net_eq(dev_net(dev), net) || !num) {
dc99f600
DM
1424 kfree_skb(skb);
1425 return 0;
1426 }
1427
3f34b24a 1428 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
19bcf9f2 1429 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
3f34b24a
AD
1430 if (!skb)
1431 return 0;
1432 }
95ec3eb4
DM
1433 switch (f->type) {
1434 case PACKET_FANOUT_HASH:
1435 default:
77f65ebd 1436 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1437 break;
1438 case PACKET_FANOUT_LB:
77f65ebd 1439 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1440 break;
1441 case PACKET_FANOUT_CPU:
77f65ebd
WB
1442 idx = fanout_demux_cpu(f, skb, num);
1443 break;
5df0ddfb
DB
1444 case PACKET_FANOUT_RND:
1445 idx = fanout_demux_rnd(f, skb, num);
1446 break;
2d36097d
NH
1447 case PACKET_FANOUT_QM:
1448 idx = fanout_demux_qm(f, skb, num);
1449 break;
77f65ebd 1450 case PACKET_FANOUT_ROLLOVER:
ad377cab 1451 idx = fanout_demux_rollover(f, skb, 0, false, num);
95ec3eb4 1452 break;
47dceb8e 1453 case PACKET_FANOUT_CBPF:
f2e52095 1454 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1455 idx = fanout_demux_bpf(f, skb, num);
1456 break;
dc99f600
DM
1457 }
1458
ad377cab
WB
1459 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1460 idx = fanout_demux_rollover(f, skb, idx, true, num);
dc99f600 1461
ad377cab 1462 po = pkt_sk(f->arr[idx]);
dc99f600
DM
1463 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1464}
1465
fff3321d
PE
1466DEFINE_MUTEX(fanout_mutex);
1467EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600 1468static LIST_HEAD(fanout_list);
4a69a864 1469static u16 fanout_next_id;
dc99f600
DM
1470
1471static void __fanout_link(struct sock *sk, struct packet_sock *po)
1472{
1473 struct packet_fanout *f = po->fanout;
1474
1475 spin_lock(&f->lock);
1476 f->arr[f->num_members] = sk;
1477 smp_wmb();
1478 f->num_members++;
2bd624b4
AS
1479 if (f->num_members == 1)
1480 dev_add_pack(&f->prot_hook);
dc99f600
DM
1481 spin_unlock(&f->lock);
1482}
1483
1484static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1485{
1486 struct packet_fanout *f = po->fanout;
1487 int i;
1488
1489 spin_lock(&f->lock);
1490 for (i = 0; i < f->num_members; i++) {
1491 if (f->arr[i] == sk)
1492 break;
1493 }
1494 BUG_ON(i >= f->num_members);
1495 f->arr[i] = f->arr[f->num_members - 1];
1496 f->num_members--;
2bd624b4
AS
1497 if (f->num_members == 0)
1498 __dev_remove_pack(&f->prot_hook);
dc99f600
DM
1499 spin_unlock(&f->lock);
1500}
1501
d4dd8aee 1502static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
c0de08d0 1503{
161642e2
ED
1504 if (sk->sk_family != PF_PACKET)
1505 return false;
c0de08d0 1506
161642e2 1507 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
c0de08d0
EL
1508}
1509
47dceb8e
WB
1510static void fanout_init_data(struct packet_fanout *f)
1511{
1512 switch (f->type) {
1513 case PACKET_FANOUT_LB:
1514 atomic_set(&f->rr_cur, 0);
1515 break;
1516 case PACKET_FANOUT_CBPF:
f2e52095 1517 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1518 RCU_INIT_POINTER(f->bpf_prog, NULL);
1519 break;
1520 }
1521}
1522
1523static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1524{
1525 struct bpf_prog *old;
1526
1527 spin_lock(&f->lock);
1528 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1529 rcu_assign_pointer(f->bpf_prog, new);
1530 spin_unlock(&f->lock);
1531
1532 if (old) {
1533 synchronize_net();
1534 bpf_prog_destroy(old);
1535 }
1536}
1537
1538static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1539 unsigned int len)
1540{
1541 struct bpf_prog *new;
1542 struct sock_fprog fprog;
1543 int ret;
1544
1545 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1546 return -EPERM;
1547 if (len != sizeof(fprog))
1548 return -EINVAL;
1549 if (copy_from_user(&fprog, data, len))
1550 return -EFAULT;
1551
bab18991 1552 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
47dceb8e
WB
1553 if (ret)
1554 return ret;
1555
1556 __fanout_set_data_bpf(po->fanout, new);
1557 return 0;
1558}
1559
f2e52095
WB
1560static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1561 unsigned int len)
1562{
1563 struct bpf_prog *new;
1564 u32 fd;
1565
1566 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1567 return -EPERM;
1568 if (len != sizeof(fd))
1569 return -EINVAL;
1570 if (copy_from_user(&fd, data, len))
1571 return -EFAULT;
1572
113214be 1573 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
f2e52095
WB
1574 if (IS_ERR(new))
1575 return PTR_ERR(new);
f2e52095
WB
1576
1577 __fanout_set_data_bpf(po->fanout, new);
1578 return 0;
1579}
1580
47dceb8e
WB
1581static int fanout_set_data(struct packet_sock *po, char __user *data,
1582 unsigned int len)
1583{
1584 switch (po->fanout->type) {
1585 case PACKET_FANOUT_CBPF:
1586 return fanout_set_data_cbpf(po, data, len);
f2e52095
WB
1587 case PACKET_FANOUT_EBPF:
1588 return fanout_set_data_ebpf(po, data, len);
47dceb8e
WB
1589 default:
1590 return -EINVAL;
07d53ae4 1591 }
47dceb8e
WB
1592}
1593
1594static void fanout_release_data(struct packet_fanout *f)
1595{
1596 switch (f->type) {
1597 case PACKET_FANOUT_CBPF:
f2e52095 1598 case PACKET_FANOUT_EBPF:
47dceb8e 1599 __fanout_set_data_bpf(f, NULL);
07d53ae4 1600 }
47dceb8e
WB
1601}
1602
4a69a864
MM
1603static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1604{
1605 struct packet_fanout *f;
1606
1607 list_for_each_entry(f, &fanout_list, list) {
1608 if (f->id == candidate_id &&
1609 read_pnet(&f->net) == sock_net(sk)) {
1610 return false;
1611 }
1612 }
1613 return true;
1614}
1615
1616static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1617{
1618 u16 id = fanout_next_id;
1619
1620 do {
1621 if (__fanout_id_is_free(sk, id)) {
1622 *new_id = id;
1623 fanout_next_id = id + 1;
1624 return true;
1625 }
1626
1627 id++;
1628 } while (id != fanout_next_id);
1629
1630 return false;
1631}
1632
7736d33f 1633static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600 1634{
d199fab6 1635 struct packet_rollover *rollover = NULL;
dc99f600
DM
1636 struct packet_sock *po = pkt_sk(sk);
1637 struct packet_fanout *f, *match;
7736d33f 1638 u8 type = type_flags & 0xff;
77f65ebd 1639 u8 flags = type_flags >> 8;
dc99f600
DM
1640 int err;
1641
1642 switch (type) {
77f65ebd
WB
1643 case PACKET_FANOUT_ROLLOVER:
1644 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1645 return -EINVAL;
dc99f600
DM
1646 case PACKET_FANOUT_HASH:
1647 case PACKET_FANOUT_LB:
95ec3eb4 1648 case PACKET_FANOUT_CPU:
5df0ddfb 1649 case PACKET_FANOUT_RND:
2d36097d 1650 case PACKET_FANOUT_QM:
47dceb8e 1651 case PACKET_FANOUT_CBPF:
f2e52095 1652 case PACKET_FANOUT_EBPF:
dc99f600
DM
1653 break;
1654 default:
1655 return -EINVAL;
1656 }
1657
d199fab6
ED
1658 mutex_lock(&fanout_mutex);
1659
d199fab6 1660 err = -EALREADY;
dc99f600 1661 if (po->fanout)
d199fab6 1662 goto out;
dc99f600 1663
4633c9e0
WB
1664 if (type == PACKET_FANOUT_ROLLOVER ||
1665 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
d199fab6
ED
1666 err = -ENOMEM;
1667 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1668 if (!rollover)
1669 goto out;
1670 atomic_long_set(&rollover->num, 0);
1671 atomic_long_set(&rollover->num_huge, 0);
1672 atomic_long_set(&rollover->num_failed, 0);
0648ab70
WB
1673 }
1674
4a69a864
MM
1675 if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1676 if (id != 0) {
1677 err = -EINVAL;
1678 goto out;
1679 }
1680 if (!fanout_find_new_id(sk, &id)) {
1681 err = -ENOMEM;
1682 goto out;
1683 }
1684 /* ephemeral flag for the first socket in the group: drop it */
1685 flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
1686 }
1687
dc99f600
DM
1688 match = NULL;
1689 list_for_each_entry(f, &fanout_list, list) {
1690 if (f->id == id &&
1691 read_pnet(&f->net) == sock_net(sk)) {
1692 match = f;
1693 break;
1694 }
1695 }
afe62c68 1696 err = -EINVAL;
77f65ebd 1697 if (match && match->flags != flags)
afe62c68 1698 goto out;
dc99f600 1699 if (!match) {
afe62c68 1700 err = -ENOMEM;
dc99f600 1701 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1702 if (!match)
1703 goto out;
1704 write_pnet(&match->net, sock_net(sk));
1705 match->id = id;
1706 match->type = type;
77f65ebd 1707 match->flags = flags;
afe62c68
ED
1708 INIT_LIST_HEAD(&match->list);
1709 spin_lock_init(&match->lock);
fb5c2c17 1710 refcount_set(&match->sk_ref, 0);
47dceb8e 1711 fanout_init_data(match);
afe62c68
ED
1712 match->prot_hook.type = po->prot_hook.type;
1713 match->prot_hook.dev = po->prot_hook.dev;
1714 match->prot_hook.func = packet_rcv_fanout;
1715 match->prot_hook.af_packet_priv = match;
c0de08d0 1716 match->prot_hook.id_match = match_fanout_group;
afe62c68 1717 list_add(&match->list, &fanout_list);
dc99f600 1718 }
afe62c68 1719 err = -EINVAL;
008ba2a1
WB
1720
1721 spin_lock(&po->bind_lock);
1722 if (po->running &&
1723 match->type == type &&
afe62c68
ED
1724 match->prot_hook.type == po->prot_hook.type &&
1725 match->prot_hook.dev == po->prot_hook.dev) {
1726 err = -ENOSPC;
fb5c2c17 1727 if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
afe62c68
ED
1728 __dev_remove_pack(&po->prot_hook);
1729 po->fanout = match;
57f015f5
MM
1730 po->rollover = rollover;
1731 rollover = NULL;
fb5c2c17 1732 refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
afe62c68
ED
1733 __fanout_link(sk, po);
1734 err = 0;
dc99f600
DM
1735 }
1736 }
008ba2a1
WB
1737 spin_unlock(&po->bind_lock);
1738
1739 if (err && !refcount_read(&match->sk_ref)) {
1740 list_del(&match->list);
1741 kfree(match);
1742 }
1743
afe62c68 1744out:
57f015f5 1745 kfree(rollover);
d199fab6 1746 mutex_unlock(&fanout_mutex);
dc99f600
DM
1747 return err;
1748}
1749
2bd624b4
AS
1750/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1751 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1752 * It is the responsibility of the caller to call fanout_release_data() and
1753 * free the returned packet_fanout (after synchronize_net())
1754 */
1755static struct packet_fanout *fanout_release(struct sock *sk)
dc99f600
DM
1756{
1757 struct packet_sock *po = pkt_sk(sk);
1758 struct packet_fanout *f;
1759
fff3321d 1760 mutex_lock(&fanout_mutex);
d199fab6
ED
1761 f = po->fanout;
1762 if (f) {
1763 po->fanout = NULL;
1764
fb5c2c17 1765 if (refcount_dec_and_test(&f->sk_ref))
d199fab6 1766 list_del(&f->list);
2bd624b4
AS
1767 else
1768 f = NULL;
dc99f600
DM
1769 }
1770 mutex_unlock(&fanout_mutex);
2bd624b4
AS
1771
1772 return f;
dc99f600 1773}
1da177e4 1774
3c70c132
DB
1775static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1776 struct sk_buff *skb)
1777{
1778 /* Earlier code assumed this would be a VLAN pkt, double-check
1779 * this now that we have the actual packet in hand. We can only
1780 * do this check on Ethernet devices.
1781 */
1782 if (unlikely(dev->type != ARPHRD_ETHER))
1783 return false;
1784
1785 skb_reset_mac_header(skb);
1786 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1787}
1788
90ddc4f0 1789static const struct proto_ops packet_ops;
1da177e4 1790
90ddc4f0 1791static const struct proto_ops packet_ops_spkt;
1da177e4 1792
40d4e3df
ED
1793static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1794 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1795{
1796 struct sock *sk;
1797 struct sockaddr_pkt *spkt;
1798
1799 /*
1800 * When we registered the protocol we saved the socket in the data
1801 * field for just this event.
1802 */
1803
1804 sk = pt->af_packet_priv;
1ce4f28b 1805
1da177e4
LT
1806 /*
1807 * Yank back the headers [hope the device set this
1808 * right or kerboom...]
1809 *
1810 * Incoming packets have ll header pulled,
1811 * push it back.
1812 *
98e399f8 1813 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1814 * so that this procedure is noop.
1815 */
1816
1817 if (skb->pkt_type == PACKET_LOOPBACK)
1818 goto out;
1819
09ad9bc7 1820 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1821 goto out;
1822
40d4e3df
ED
1823 skb = skb_share_check(skb, GFP_ATOMIC);
1824 if (skb == NULL)
1da177e4
LT
1825 goto oom;
1826
1827 /* drop any routing info */
adf30907 1828 skb_dst_drop(skb);
1da177e4 1829
84531c24 1830 /* drop conntrack reference */
895b5c9f 1831 nf_reset_ct(skb);
84531c24 1832
ffbc6111 1833 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1834
98e399f8 1835 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1836
1837 /*
1838 * The SOCK_PACKET socket receives _all_ frames.
1839 */
1840
1841 spkt->spkt_family = dev->type;
1842 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1843 spkt->spkt_protocol = skb->protocol;
1844
1845 /*
1846 * Charge the memory to the socket. This is done specifically
1847 * to prevent sockets using all the memory up.
1848 */
1849
40d4e3df 1850 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1851 return 0;
1852
1853out:
1854 kfree_skb(skb);
1855oom:
1856 return 0;
1857}
1858
75c65772
MM
1859static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
1860{
18bed891
YK
1861 if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
1862 sock->type == SOCK_RAW) {
75c65772
MM
1863 skb_reset_mac_header(skb);
1864 skb->protocol = dev_parse_header_protocol(skb);
1865 }
1866
1867 skb_probe_transport_header(skb);
1868}
1da177e4
LT
1869
1870/*
1871 * Output a raw packet to a device layer. This bypasses all the other
1872 * protocol layers and you must therefore supply it with a complete frame
1873 */
1ce4f28b 1874
1b784140
YX
1875static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1876 size_t len)
1da177e4
LT
1877{
1878 struct sock *sk = sock->sk;
342dfc30 1879 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1880 struct sk_buff *skb = NULL;
1da177e4 1881 struct net_device *dev;
c14ac945 1882 struct sockcm_cookie sockc;
40d4e3df 1883 __be16 proto = 0;
1da177e4 1884 int err;
3bdc0eba 1885 int extra_len = 0;
1ce4f28b 1886
1da177e4 1887 /*
1ce4f28b 1888 * Get and verify the address.
1da177e4
LT
1889 */
1890
40d4e3df 1891 if (saddr) {
1da177e4 1892 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1893 return -EINVAL;
1894 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1895 proto = saddr->spkt_protocol;
1896 } else
1897 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1898
1899 /*
1ce4f28b 1900 * Find the device first to size check it
1da177e4
LT
1901 */
1902
de74e92a 1903 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1904retry:
654d1f8a
ED
1905 rcu_read_lock();
1906 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1907 err = -ENODEV;
1908 if (dev == NULL)
1909 goto out_unlock;
1ce4f28b 1910
d5e76b0a
DM
1911 err = -ENETDOWN;
1912 if (!(dev->flags & IFF_UP))
1913 goto out_unlock;
1914
1da177e4 1915 /*
40d4e3df
ED
1916 * You may not queue a frame bigger than the mtu. This is the lowest level
1917 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1918 */
1ce4f28b 1919
3bdc0eba
BG
1920 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1921 if (!netif_supports_nofcs(dev)) {
1922 err = -EPROTONOSUPPORT;
1923 goto out_unlock;
1924 }
1925 extra_len = 4; /* We're doing our own CRC */
1926 }
1927
1da177e4 1928 err = -EMSGSIZE;
3bdc0eba 1929 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1930 goto out_unlock;
1931
1a35ca80
ED
1932 if (!skb) {
1933 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1934 int tlen = dev->needed_tailroom;
1a35ca80
ED
1935 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1936
1937 rcu_read_unlock();
4ce40912 1938 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1939 if (skb == NULL)
1940 return -ENOBUFS;
1941 /* FIXME: Save some space for broken drivers that write a hard
1942 * header at transmission time by themselves. PPP is the notable
1943 * one here. This should really be fixed at the driver level.
1944 */
1945 skb_reserve(skb, reserved);
1946 skb_reset_network_header(skb);
1947
1948 /* Try to align data part correctly */
1949 if (hhlen) {
1950 skb->data -= hhlen;
1951 skb->tail -= hhlen;
1952 if (len < hhlen)
1953 skb_reset_network_header(skb);
1954 }
6ce8e9ce 1955 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1956 if (err)
1957 goto out_free;
1958 goto retry;
1da177e4
LT
1959 }
1960
9ed988cd
WB
1961 if (!dev_validate_header(dev, skb->data, len)) {
1962 err = -EINVAL;
1963 goto out_unlock;
1964 }
3c70c132
DB
1965 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1966 !packet_extra_vlan_len_allowed(dev, skb)) {
1967 err = -EMSGSIZE;
1968 goto out_unlock;
57f89bfa 1969 }
1a35ca80 1970
657a0667 1971 sockcm_init(&sockc, sk);
c14ac945
SHY
1972 if (msg->msg_controllen) {
1973 err = sock_cmsg_send(sk, msg, &sockc);
f8e7718c 1974 if (unlikely(err))
c14ac945 1975 goto out_unlock;
c14ac945
SHY
1976 }
1977
1da177e4
LT
1978 skb->protocol = proto;
1979 skb->dev = dev;
1980 skb->priority = sk->sk_priority;
2d37a186 1981 skb->mark = sk->sk_mark;
3d0ba8c0 1982 skb->tstamp = sockc.transmit_time;
bf84a010 1983
8f932f76 1984 skb_setup_tx_timestamp(skb, sockc.tsflags);
1da177e4 1985
3bdc0eba
BG
1986 if (unlikely(extra_len == 4))
1987 skb->no_fcs = 1;
1988
75c65772 1989 packet_parse_headers(skb, sock);
c1aad275 1990
1da177e4 1991 dev_queue_xmit(skb);
654d1f8a 1992 rcu_read_unlock();
40d4e3df 1993 return len;
1da177e4 1994
1da177e4 1995out_unlock:
654d1f8a 1996 rcu_read_unlock();
1a35ca80
ED
1997out_free:
1998 kfree_skb(skb);
1da177e4
LT
1999 return err;
2000}
1da177e4 2001
ff936a04
AS
2002static unsigned int run_filter(struct sk_buff *skb,
2003 const struct sock *sk,
2004 unsigned int res)
1da177e4
LT
2005{
2006 struct sk_filter *filter;
fda9ef5d 2007
80f8f102
ED
2008 rcu_read_lock();
2009 filter = rcu_dereference(sk->sk_filter);
dbcb5855 2010 if (filter != NULL)
ff936a04 2011 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 2012 rcu_read_unlock();
1da177e4 2013
dbcb5855 2014 return res;
1da177e4
LT
2015}
2016
16cc1400
WB
2017static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2018 size_t *len)
2019{
2020 struct virtio_net_hdr vnet_hdr;
2021
2022 if (*len < sizeof(vnet_hdr))
2023 return -EINVAL;
2024 *len -= sizeof(vnet_hdr);
2025
fd3a8862 2026 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
16cc1400
WB
2027 return -EINVAL;
2028
2029 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2030}
2031
1da177e4 2032/*
62ab0812
ED
2033 * This function makes lazy skb cloning in hope that most of packets
2034 * are discarded by BPF.
2035 *
2036 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
2037 * and skb->cb are mangled. It works because (and until) packets
2038 * falling here are owned by current CPU. Output packets are cloned
2039 * by dev_queue_xmit_nit(), input packets are processed by net_bh
2040 * sequencially, so that if we return skb to original state on exit,
2041 * we will not harm anyone.
1da177e4
LT
2042 */
2043
40d4e3df
ED
2044static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2045 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2046{
2047 struct sock *sk;
2048 struct sockaddr_ll *sll;
2049 struct packet_sock *po;
40d4e3df 2050 u8 *skb_head = skb->data;
1da177e4 2051 int skb_len = skb->len;
dbcb5855 2052 unsigned int snaplen, res;
da37845f 2053 bool is_drop_n_account = false;
1da177e4
LT
2054
2055 if (skb->pkt_type == PACKET_LOOPBACK)
2056 goto drop;
2057
2058 sk = pt->af_packet_priv;
2059 po = pkt_sk(sk);
2060
09ad9bc7 2061 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2062 goto drop;
2063
1da177e4
LT
2064 skb->dev = dev;
2065
3b04ddde 2066 if (dev->header_ops) {
1da177e4 2067 /* The device has an explicit notion of ll header,
62ab0812
ED
2068 * exported to higher levels.
2069 *
2070 * Otherwise, the device hides details of its frame
2071 * structure, so that corresponding packet head is
2072 * never delivered to user.
1da177e4
LT
2073 */
2074 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2075 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2076 else if (skb->pkt_type == PACKET_OUTGOING) {
2077 /* Special case: outgoing packets have ll header at head */
bbe735e4 2078 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2079 }
2080 }
2081
2082 snaplen = skb->len;
2083
dbcb5855
DM
2084 res = run_filter(skb, sk, snaplen);
2085 if (!res)
fda9ef5d 2086 goto drop_n_restore;
dbcb5855
DM
2087 if (snaplen > res)
2088 snaplen = res;
1da177e4 2089
0fd7bac6 2090 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2091 goto drop_n_acct;
2092
2093 if (skb_shared(skb)) {
2094 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2095 if (nskb == NULL)
2096 goto drop_n_acct;
2097
2098 if (skb_head != skb->data) {
2099 skb->data = skb_head;
2100 skb->len = skb_len;
2101 }
abc4e4fa 2102 consume_skb(skb);
1da177e4
LT
2103 skb = nskb;
2104 }
2105
b4772ef8 2106 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2107
2108 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2109 sll->sll_hatype = dev->type;
1da177e4 2110 sll->sll_pkttype = skb->pkt_type;
8032b464 2111 if (unlikely(po->origdev))
80feaacb
PWJ
2112 sll->sll_ifindex = orig_dev->ifindex;
2113 else
2114 sll->sll_ifindex = dev->ifindex;
1da177e4 2115
b95cce35 2116 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2117
2472d761
EB
2118 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2119 * Use their space for storing the original skb length.
2120 */
2121 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2122
1da177e4
LT
2123 if (pskb_trim(skb, snaplen))
2124 goto drop_n_acct;
2125
2126 skb_set_owner_r(skb, sk);
2127 skb->dev = NULL;
adf30907 2128 skb_dst_drop(skb);
1da177e4 2129
84531c24 2130 /* drop conntrack reference */
895b5c9f 2131 nf_reset_ct(skb);
84531c24 2132
1da177e4 2133 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2134 po->stats.stats1.tp_packets++;
3bc3b96f 2135 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
2136 __skb_queue_tail(&sk->sk_receive_queue, skb);
2137 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2138 sk->sk_data_ready(sk);
1da177e4
LT
2139 return 0;
2140
2141drop_n_acct:
da37845f 2142 is_drop_n_account = true;
8e8e2951 2143 atomic_inc(&po->tp_drops);
7091fbd8 2144 atomic_inc(&sk->sk_drops);
1da177e4
LT
2145
2146drop_n_restore:
2147 if (skb_head != skb->data && skb_shared(skb)) {
2148 skb->data = skb_head;
2149 skb->len = skb_len;
2150 }
2151drop:
da37845f
WJ
2152 if (!is_drop_n_account)
2153 consume_skb(skb);
2154 else
2155 kfree_skb(skb);
1da177e4
LT
2156 return 0;
2157}
2158
40d4e3df
ED
2159static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2160 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2161{
2162 struct sock *sk;
2163 struct packet_sock *po;
2164 struct sockaddr_ll *sll;
184f489e 2165 union tpacket_uhdr h;
40d4e3df 2166 u8 *skb_head = skb->data;
1da177e4 2167 int skb_len = skb->len;
dbcb5855 2168 unsigned int snaplen, res;
f6fb8f10 2169 unsigned long status = TP_STATUS_USER;
bbd6ef87 2170 unsigned short macoff, netoff, hdrlen;
1da177e4 2171 struct sk_buff *copy_skb = NULL;
bbd6ef87 2172 struct timespec ts;
b9c32fb2 2173 __u32 ts_status;
da37845f 2174 bool is_drop_n_account = false;
edbd58be 2175 bool do_vnet = false;
1da177e4 2176
51846355
AW
2177 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2178 * We may add members to them until current aligned size without forcing
2179 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2180 */
2181 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2182 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2183
1da177e4
LT
2184 if (skb->pkt_type == PACKET_LOOPBACK)
2185 goto drop;
2186
2187 sk = pt->af_packet_priv;
2188 po = pkt_sk(sk);
2189
09ad9bc7 2190 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2191 goto drop;
2192
3b04ddde 2193 if (dev->header_ops) {
1da177e4 2194 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2195 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2196 else if (skb->pkt_type == PACKET_OUTGOING) {
2197 /* Special case: outgoing packets have ll header at head */
bbe735e4 2198 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2199 }
2200 }
2201
2202 snaplen = skb->len;
2203
dbcb5855
DM
2204 res = run_filter(skb, sk, snaplen);
2205 if (!res)
fda9ef5d 2206 goto drop_n_restore;
68c2e5de 2207
2c51c627
ED
2208 /* If we are flooded, just give up */
2209 if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
2210 atomic_inc(&po->tp_drops);
2211 goto drop_n_restore;
2212 }
2213
68c2e5de
AD
2214 if (skb->ip_summed == CHECKSUM_PARTIAL)
2215 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2216 else if (skb->pkt_type != PACKET_OUTGOING &&
2217 (skb->ip_summed == CHECKSUM_COMPLETE ||
2218 skb_csum_unnecessary(skb)))
2219 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2220
dbcb5855
DM
2221 if (snaplen > res)
2222 snaplen = res;
1da177e4
LT
2223
2224 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2225 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2226 po->tp_reserve;
1da177e4 2227 } else {
95c96174 2228 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2229 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2230 (maclen < 16 ? 16 : maclen)) +
58d19b19 2231 po->tp_reserve;
edbd58be 2232 if (po->has_vnet_hdr) {
58d19b19 2233 netoff += sizeof(struct virtio_net_hdr);
edbd58be
BP
2234 do_vnet = true;
2235 }
1da177e4
LT
2236 macoff = netoff - maclen;
2237 }
f6fb8f10 2238 if (po->tp_version <= TPACKET_V2) {
2239 if (macoff + snaplen > po->rx_ring.frame_size) {
2240 if (po->copy_thresh &&
0fd7bac6 2241 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2242 if (skb_shared(skb)) {
2243 copy_skb = skb_clone(skb, GFP_ATOMIC);
2244 } else {
2245 copy_skb = skb_get(skb);
2246 skb_head = skb->data;
2247 }
2248 if (copy_skb)
2249 skb_set_owner_r(copy_skb, sk);
1da177e4 2250 }
f6fb8f10 2251 snaplen = po->rx_ring.frame_size - macoff;
edbd58be 2252 if ((int)snaplen < 0) {
f6fb8f10 2253 snaplen = 0;
edbd58be
BP
2254 do_vnet = false;
2255 }
1da177e4 2256 }
dc808110
ED
2257 } else if (unlikely(macoff + snaplen >
2258 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2259 u32 nval;
2260
2261 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2262 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2263 snaplen, nval, macoff);
2264 snaplen = nval;
2265 if (unlikely((int)snaplen < 0)) {
2266 snaplen = 0;
2267 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
edbd58be 2268 do_vnet = false;
dc808110 2269 }
1da177e4 2270 }
1da177e4 2271 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2272 h.raw = packet_current_rx_frame(po, skb,
2273 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2274 if (!h.raw)
58d19b19 2275 goto drop_n_account;
f6fb8f10 2276 if (po->tp_version <= TPACKET_V2) {
2277 packet_increment_rx_head(po, &po->rx_ring);
2278 /*
2279 * LOSING will be reported till you read the stats,
2280 * because it's COR - Clear On Read.
2281 * Anyways, moving it for V1/V2 only as V3 doesn't need this
2282 * at packet level.
2283 */
8e8e2951 2284 if (atomic_read(&po->tp_drops))
f6fb8f10 2285 status |= TP_STATUS_LOSING;
2286 }
945d015e
ED
2287
2288 if (do_vnet &&
2289 virtio_net_hdr_from_skb(skb, h.raw + macoff -
2290 sizeof(struct virtio_net_hdr),
2291 vio_le(), true, 0))
2292 goto drop_n_account;
2293
ee80fbf3 2294 po->stats.stats1.tp_packets++;
1da177e4
LT
2295 if (copy_skb) {
2296 status |= TP_STATUS_COPY;
2297 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2298 }
1da177e4
LT
2299 spin_unlock(&sk->sk_receive_queue.lock);
2300
bbd6ef87 2301 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2302
2303 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2304 getnstimeofday(&ts);
1da177e4 2305
b9c32fb2
DB
2306 status |= ts_status;
2307
bbd6ef87
PM
2308 switch (po->tp_version) {
2309 case TPACKET_V1:
2310 h.h1->tp_len = skb->len;
2311 h.h1->tp_snaplen = snaplen;
2312 h.h1->tp_mac = macoff;
2313 h.h1->tp_net = netoff;
4b457bdf
DB
2314 h.h1->tp_sec = ts.tv_sec;
2315 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2316 hdrlen = sizeof(*h.h1);
2317 break;
2318 case TPACKET_V2:
2319 h.h2->tp_len = skb->len;
2320 h.h2->tp_snaplen = snaplen;
2321 h.h2->tp_mac = macoff;
2322 h.h2->tp_net = netoff;
bbd6ef87
PM
2323 h.h2->tp_sec = ts.tv_sec;
2324 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2325 if (skb_vlan_tag_present(skb)) {
2326 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2327 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2328 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2329 } else {
2330 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2331 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2332 }
e4d26f4b 2333 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2334 hdrlen = sizeof(*h.h2);
2335 break;
f6fb8f10 2336 case TPACKET_V3:
2337 /* tp_nxt_offset,vlan are already populated above.
2338 * So DONT clear those fields here
2339 */
2340 h.h3->tp_status |= status;
2341 h.h3->tp_len = skb->len;
2342 h.h3->tp_snaplen = snaplen;
2343 h.h3->tp_mac = macoff;
2344 h.h3->tp_net = netoff;
f6fb8f10 2345 h.h3->tp_sec = ts.tv_sec;
2346 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2347 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2348 hdrlen = sizeof(*h.h3);
2349 break;
bbd6ef87
PM
2350 default:
2351 BUG();
2352 }
1da177e4 2353
bbd6ef87 2354 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2355 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2356 sll->sll_family = AF_PACKET;
2357 sll->sll_hatype = dev->type;
2358 sll->sll_protocol = skb->protocol;
2359 sll->sll_pkttype = skb->pkt_type;
8032b464 2360 if (unlikely(po->origdev))
80feaacb
PWJ
2361 sll->sll_ifindex = orig_dev->ifindex;
2362 else
2363 sll->sll_ifindex = dev->ifindex;
1da177e4 2364
e16aa207 2365 smp_mb();
f0d4eb29 2366
f6dafa95 2367#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2368 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2369 u8 *start, *end;
2370
f0d4eb29
DB
2371 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2372 macoff + snaplen);
2373
2374 for (start = h.raw; start < end; start += PAGE_SIZE)
2375 flush_dcache_page(pgv_to_page(start));
1da177e4 2376 }
f0d4eb29 2377 smp_wmb();
f6dafa95 2378#endif
f0d4eb29 2379
da413eec 2380 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2381 __packet_set_status(po, h.raw, status);
da413eec
DC
2382 sk->sk_data_ready(sk);
2383 } else {
f6fb8f10 2384 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2385 }
1da177e4
LT
2386
2387drop_n_restore:
2388 if (skb_head != skb->data && skb_shared(skb)) {
2389 skb->data = skb_head;
2390 skb->len = skb_len;
2391 }
2392drop:
da37845f
WJ
2393 if (!is_drop_n_account)
2394 consume_skb(skb);
2395 else
2396 kfree_skb(skb);
1da177e4
LT
2397 return 0;
2398
58d19b19 2399drop_n_account:
1da177e4 2400 spin_unlock(&sk->sk_receive_queue.lock);
8e8e2951
ED
2401 atomic_inc(&po->tp_drops);
2402 is_drop_n_account = true;
1da177e4 2403
676d2369 2404 sk->sk_data_ready(sk);
acb5d75b 2405 kfree_skb(copy_skb);
1da177e4
LT
2406 goto drop_n_restore;
2407}
2408
69e3c75f
JB
2409static void tpacket_destruct_skb(struct sk_buff *skb)
2410{
2411 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2412
69e3c75f 2413 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2414 void *ph;
b9c32fb2
DB
2415 __u32 ts;
2416
5cd8d46e 2417 ph = skb_zcopy_get_nouarg(skb);
b0138408 2418 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2419
2420 ts = __packet_set_timestamp(po, ph, skb);
2421 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
89ed5b51
NH
2422
2423 if (!packet_read_pending(&po->tx_ring))
2424 complete(&po->skb_completion);
69e3c75f
JB
2425 }
2426
2427 sock_wfree(skb);
2428}
2429
16cc1400
WB
2430static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2431{
16cc1400
WB
2432 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2433 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2434 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2435 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2436 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2437 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2438 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2439
2440 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2441 return -EINVAL;
2442
16cc1400
WB
2443 return 0;
2444}
2445
2446static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2447 struct virtio_net_hdr *vnet_hdr)
2448{
16cc1400
WB
2449 if (*len < sizeof(*vnet_hdr))
2450 return -EINVAL;
2451 *len -= sizeof(*vnet_hdr);
2452
cbbd26b8 2453 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
16cc1400
WB
2454 return -EFAULT;
2455
2456 return __packet_snd_vnet_parse(vnet_hdr, *len);
2457}
2458
40d4e3df 2459static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2460 void *frame, struct net_device *dev, void *data, int tp_len,
c14ac945
SHY
2461 __be16 proto, unsigned char *addr, int hlen, int copylen,
2462 const struct sockcm_cookie *sockc)
69e3c75f 2463{
184f489e 2464 union tpacket_uhdr ph;
8d39b4a6 2465 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2466 struct socket *sock = po->sk.sk_socket;
2467 struct page *page;
69e3c75f
JB
2468 int err;
2469
2470 ph.raw = frame;
2471
2472 skb->protocol = proto;
2473 skb->dev = dev;
2474 skb->priority = po->sk.sk_priority;
2d37a186 2475 skb->mark = po->sk.sk_mark;
3d0ba8c0 2476 skb->tstamp = sockc->transmit_time;
8f932f76 2477 skb_setup_tx_timestamp(skb, sockc->tsflags);
5cd8d46e 2478 skb_zcopy_set_nouarg(skb, ph.raw);
69e3c75f 2479
ae641949 2480 skb_reserve(skb, hlen);
69e3c75f 2481 skb_reset_network_header(skb);
c1aad275 2482
69e3c75f
JB
2483 to_write = tp_len;
2484
2485 if (sock->type == SOCK_DGRAM) {
2486 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2487 NULL, tp_len);
2488 if (unlikely(err < 0))
2489 return -EINVAL;
1d036d25 2490 } else if (copylen) {
9ed988cd
WB
2491 int hdrlen = min_t(int, copylen, tp_len);
2492
69e3c75f 2493 skb_push(skb, dev->hard_header_len);
1d036d25 2494 skb_put(skb, copylen - dev->hard_header_len);
9ed988cd 2495 err = skb_store_bits(skb, 0, data, hdrlen);
69e3c75f
JB
2496 if (unlikely(err))
2497 return err;
9ed988cd
WB
2498 if (!dev_validate_header(dev, skb->data, hdrlen))
2499 return -EINVAL;
69e3c75f 2500
9ed988cd
WB
2501 data += hdrlen;
2502 to_write -= hdrlen;
69e3c75f
JB
2503 }
2504
69e3c75f
JB
2505 offset = offset_in_page(data);
2506 len_max = PAGE_SIZE - offset;
2507 len = ((to_write > len_max) ? len_max : to_write);
2508
2509 skb->data_len = to_write;
2510 skb->len += to_write;
2511 skb->truesize += to_write;
14afee4b 2512 refcount_add(to_write, &po->sk.sk_wmem_alloc);
69e3c75f
JB
2513
2514 while (likely(to_write)) {
2515 nr_frags = skb_shinfo(skb)->nr_frags;
2516
2517 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2518 pr_err("Packet exceed the number of skb frags(%lu)\n",
2519 MAX_SKB_FRAGS);
69e3c75f
JB
2520 return -EFAULT;
2521 }
2522
0af55bb5
CG
2523 page = pgv_to_page(data);
2524 data += len;
69e3c75f
JB
2525 flush_dcache_page(page);
2526 get_page(page);
0af55bb5 2527 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2528 to_write -= len;
2529 offset = 0;
2530 len_max = PAGE_SIZE;
2531 len = ((to_write > len_max) ? len_max : to_write);
2532 }
2533
75c65772 2534 packet_parse_headers(skb, sock);
efdfa2f7 2535
69e3c75f
JB
2536 return tp_len;
2537}
2538
8d39b4a6
WB
2539static int tpacket_parse_header(struct packet_sock *po, void *frame,
2540 int size_max, void **data)
2541{
2542 union tpacket_uhdr ph;
2543 int tp_len, off;
2544
2545 ph.raw = frame;
2546
2547 switch (po->tp_version) {
7f953ab2
SV
2548 case TPACKET_V3:
2549 if (ph.h3->tp_next_offset != 0) {
2550 pr_warn_once("variable sized slot not supported");
2551 return -EINVAL;
2552 }
2553 tp_len = ph.h3->tp_len;
2554 break;
8d39b4a6
WB
2555 case TPACKET_V2:
2556 tp_len = ph.h2->tp_len;
2557 break;
2558 default:
2559 tp_len = ph.h1->tp_len;
2560 break;
2561 }
2562 if (unlikely(tp_len > size_max)) {
2563 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2564 return -EMSGSIZE;
2565 }
2566
2567 if (unlikely(po->tp_tx_has_off)) {
2568 int off_min, off_max;
2569
2570 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2571 off_max = po->tx_ring.frame_size - tp_len;
2572 if (po->sk.sk_type == SOCK_DGRAM) {
2573 switch (po->tp_version) {
7f953ab2
SV
2574 case TPACKET_V3:
2575 off = ph.h3->tp_net;
2576 break;
8d39b4a6
WB
2577 case TPACKET_V2:
2578 off = ph.h2->tp_net;
2579 break;
2580 default:
2581 off = ph.h1->tp_net;
2582 break;
2583 }
2584 } else {
2585 switch (po->tp_version) {
7f953ab2
SV
2586 case TPACKET_V3:
2587 off = ph.h3->tp_mac;
2588 break;
8d39b4a6
WB
2589 case TPACKET_V2:
2590 off = ph.h2->tp_mac;
2591 break;
2592 default:
2593 off = ph.h1->tp_mac;
2594 break;
2595 }
2596 }
2597 if (unlikely((off < off_min) || (off_max < off)))
2598 return -EINVAL;
2599 } else {
2600 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2601 }
2602
2603 *data = frame + off;
2604 return tp_len;
2605}
2606
69e3c75f
JB
2607static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2608{
89ed5b51 2609 struct sk_buff *skb = NULL;
69e3c75f 2610 struct net_device *dev;
1d036d25 2611 struct virtio_net_hdr *vnet_hdr = NULL;
c14ac945 2612 struct sockcm_cookie sockc;
69e3c75f 2613 __be16 proto;
09effa67 2614 int err, reserve = 0;
40d4e3df 2615 void *ph;
342dfc30 2616 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2617 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
486efdc8 2618 unsigned char *addr = NULL;
69e3c75f 2619 int tp_len, size_max;
8d39b4a6 2620 void *data;
69e3c75f 2621 int len_sum = 0;
9e67030a 2622 int status = TP_STATUS_AVAILABLE;
1d036d25 2623 int hlen, tlen, copylen = 0;
89ed5b51 2624 long timeo = 0;
69e3c75f 2625
69e3c75f
JB
2626 mutex_lock(&po->pg_vec_lock);
2627
32d3182c
ED
2628 /* packet_sendmsg() check on tx_ring.pg_vec was lockless,
2629 * we need to confirm it under protection of pg_vec_lock.
2630 */
2631 if (unlikely(!po->tx_ring.pg_vec)) {
2632 err = -EBUSY;
2633 goto out;
2634 }
66e56cd4 2635 if (likely(saddr == NULL)) {
e40526cb 2636 dev = packet_cached_dev_get(po);
69e3c75f 2637 proto = po->num;
69e3c75f
JB
2638 } else {
2639 err = -EINVAL;
2640 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2641 goto out;
2642 if (msg->msg_namelen < (saddr->sll_halen
2643 + offsetof(struct sockaddr_ll,
2644 sll_addr)))
2645 goto out;
69e3c75f 2646 proto = saddr->sll_protocol;
827d9780 2647 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
486efdc8
WB
2648 if (po->sk.sk_socket->type == SOCK_DGRAM) {
2649 if (dev && msg->msg_namelen < dev->addr_len +
2650 offsetof(struct sockaddr_ll, sll_addr))
2651 goto out_put;
2652 addr = saddr->sll_addr;
2653 }
69e3c75f
JB
2654 }
2655
69e3c75f
JB
2656 err = -ENXIO;
2657 if (unlikely(dev == NULL))
2658 goto out;
69e3c75f
JB
2659 err = -ENETDOWN;
2660 if (unlikely(!(dev->flags & IFF_UP)))
2661 goto out_put;
2662
657a0667 2663 sockcm_init(&sockc, &po->sk);
d19b183c
DCS
2664 if (msg->msg_controllen) {
2665 err = sock_cmsg_send(&po->sk, msg, &sockc);
2666 if (unlikely(err))
2667 goto out_put;
2668 }
2669
5cfb4c8d
DB
2670 if (po->sk.sk_socket->type == SOCK_RAW)
2671 reserve = dev->hard_header_len;
69e3c75f 2672 size_max = po->tx_ring.frame_size
b5dd884e 2673 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2674
1d036d25 2675 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
5cfb4c8d 2676 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2677
89ed5b51
NH
2678 reinit_completion(&po->skb_completion);
2679
69e3c75f
JB
2680 do {
2681 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2682 TP_STATUS_SEND_REQUEST);
69e3c75f 2683 if (unlikely(ph == NULL)) {
89ed5b51
NH
2684 if (need_wait && skb) {
2685 timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
2686 timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
2687 if (timeo <= 0) {
2688 err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
2689 goto out_put;
2690 }
2691 }
2692 /* check for additional frames */
69e3c75f
JB
2693 continue;
2694 }
2695
8d39b4a6
WB
2696 skb = NULL;
2697 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2698 if (tp_len < 0)
2699 goto tpacket_error;
2700
69e3c75f 2701 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2702 hlen = LL_RESERVED_SPACE(dev);
2703 tlen = dev->needed_tailroom;
1d036d25
WB
2704 if (po->has_vnet_hdr) {
2705 vnet_hdr = data;
2706 data += sizeof(*vnet_hdr);
2707 tp_len -= sizeof(*vnet_hdr);
2708 if (tp_len < 0 ||
2709 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2710 tp_len = -EINVAL;
2711 goto tpacket_error;
2712 }
2713 copylen = __virtio16_to_cpu(vio_le(),
2714 vnet_hdr->hdr_len);
2715 }
9ed988cd 2716 copylen = max_t(int, copylen, dev->hard_header_len);
69e3c75f 2717 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2718 hlen + tlen + sizeof(struct sockaddr_ll) +
2719 (copylen - dev->hard_header_len),
fbf33a28 2720 !need_wait, &err);
69e3c75f 2721
fbf33a28
KM
2722 if (unlikely(skb == NULL)) {
2723 /* we assume the socket was initially writeable ... */
2724 if (likely(len_sum > 0))
2725 err = len_sum;
69e3c75f 2726 goto out_status;
fbf33a28 2727 }
8d39b4a6 2728 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
c14ac945 2729 addr, hlen, copylen, &sockc);
dbd46ab4 2730 if (likely(tp_len >= 0) &&
5cfb4c8d 2731 tp_len > dev->mtu + reserve &&
1d036d25 2732 !po->has_vnet_hdr &&
3c70c132
DB
2733 !packet_extra_vlan_len_allowed(dev, skb))
2734 tp_len = -EMSGSIZE;
69e3c75f
JB
2735
2736 if (unlikely(tp_len < 0)) {
8d39b4a6 2737tpacket_error:
69e3c75f
JB
2738 if (po->tp_loss) {
2739 __packet_set_status(po, ph,
2740 TP_STATUS_AVAILABLE);
2741 packet_increment_head(&po->tx_ring);
2742 kfree_skb(skb);
2743 continue;
2744 } else {
2745 status = TP_STATUS_WRONG_FORMAT;
2746 err = tp_len;
2747 goto out_status;
2748 }
2749 }
2750
9d2f67e4
JT
2751 if (po->has_vnet_hdr) {
2752 if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
2753 tp_len = -EINVAL;
2754 goto tpacket_error;
2755 }
2756 virtio_net_hdr_set_proto(skb, vnet_hdr);
1d036d25
WB
2757 }
2758
69e3c75f
JB
2759 skb->destructor = tpacket_destruct_skb;
2760 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2761 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2762
2763 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2764 err = po->xmit(skb);
eb70df13
JP
2765 if (unlikely(err > 0)) {
2766 err = net_xmit_errno(err);
2767 if (err && __packet_get_status(po, ph) ==
2768 TP_STATUS_AVAILABLE) {
2769 /* skb was destructed already */
2770 skb = NULL;
2771 goto out_status;
2772 }
2773 /*
2774 * skb was dropped but not destructed yet;
2775 * let's treat it like congestion or err < 0
2776 */
2777 err = 0;
2778 }
69e3c75f
JB
2779 packet_increment_head(&po->tx_ring);
2780 len_sum += tp_len;
b0138408
DB
2781 } while (likely((ph != NULL) ||
2782 /* Note: packet_read_pending() might be slow if we have
2783 * to call it as it's per_cpu variable, but in fast-path
2784 * we already short-circuit the loop with the first
2785 * condition, and luckily don't have to go that path
2786 * anyway.
2787 */
2788 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2789
2790 err = len_sum;
2791 goto out_put;
2792
69e3c75f
JB
2793out_status:
2794 __packet_set_status(po, ph, status);
2795 kfree_skb(skb);
2796out_put:
e40526cb 2797 dev_put(dev);
69e3c75f
JB
2798out:
2799 mutex_unlock(&po->pg_vec_lock);
2800 return err;
2801}
69e3c75f 2802
eea49cc9
OJ
2803static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2804 size_t reserve, size_t len,
2805 size_t linear, int noblock,
2806 int *err)
bfd5f4a3
SS
2807{
2808 struct sk_buff *skb;
2809
2810 /* Under a page? Don't bother with paged skb. */
2811 if (prepad + len < PAGE_SIZE || !linear)
2812 linear = len;
2813
2814 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2815 err, 0);
bfd5f4a3
SS
2816 if (!skb)
2817 return NULL;
2818
2819 skb_reserve(skb, reserve);
2820 skb_put(skb, linear);
2821 skb->data_len = len - linear;
2822 skb->len += len - linear;
2823
2824 return skb;
2825}
2826
d346a3fa 2827static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2828{
2829 struct sock *sk = sock->sk;
342dfc30 2830 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2831 struct sk_buff *skb;
2832 struct net_device *dev;
0e11c91e 2833 __be16 proto;
486efdc8 2834 unsigned char *addr = NULL;
827d9780 2835 int err, reserve = 0;
c7d39e32 2836 struct sockcm_cookie sockc;
bfd5f4a3
SS
2837 struct virtio_net_hdr vnet_hdr = { 0 };
2838 int offset = 0;
bfd5f4a3 2839 struct packet_sock *po = pkt_sk(sk);
da7c9561 2840 bool has_vnet_hdr = false;
57031eb7 2841 int hlen, tlen, linear;
3bdc0eba 2842 int extra_len = 0;
1da177e4
LT
2843
2844 /*
1ce4f28b 2845 * Get and verify the address.
1da177e4 2846 */
1ce4f28b 2847
66e56cd4 2848 if (likely(saddr == NULL)) {
e40526cb 2849 dev = packet_cached_dev_get(po);
1da177e4 2850 proto = po->num;
1da177e4
LT
2851 } else {
2852 err = -EINVAL;
2853 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2854 goto out;
0fb375fb
EB
2855 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2856 goto out;
1da177e4 2857 proto = saddr->sll_protocol;
827d9780 2858 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
486efdc8
WB
2859 if (sock->type == SOCK_DGRAM) {
2860 if (dev && msg->msg_namelen < dev->addr_len +
2861 offsetof(struct sockaddr_ll, sll_addr))
2862 goto out_unlock;
2863 addr = saddr->sll_addr;
2864 }
1da177e4
LT
2865 }
2866
1da177e4 2867 err = -ENXIO;
e40526cb 2868 if (unlikely(dev == NULL))
1da177e4 2869 goto out_unlock;
d5e76b0a 2870 err = -ENETDOWN;
e40526cb 2871 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2872 goto out_unlock;
2873
657a0667 2874 sockcm_init(&sockc, sk);
c7d39e32
EJ
2875 sockc.mark = sk->sk_mark;
2876 if (msg->msg_controllen) {
2877 err = sock_cmsg_send(sk, msg, &sockc);
2878 if (unlikely(err))
2879 goto out_unlock;
2880 }
2881
e40526cb
DB
2882 if (sock->type == SOCK_RAW)
2883 reserve = dev->hard_header_len;
bfd5f4a3 2884 if (po->has_vnet_hdr) {
16cc1400
WB
2885 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2886 if (err)
bfd5f4a3 2887 goto out_unlock;
da7c9561 2888 has_vnet_hdr = true;
bfd5f4a3
SS
2889 }
2890
3bdc0eba
BG
2891 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2892 if (!netif_supports_nofcs(dev)) {
2893 err = -EPROTONOSUPPORT;
2894 goto out_unlock;
2895 }
2896 extra_len = 4; /* We're doing our own CRC */
2897 }
2898
1da177e4 2899 err = -EMSGSIZE;
16cc1400
WB
2900 if (!vnet_hdr.gso_type &&
2901 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2902 goto out_unlock;
2903
bfd5f4a3 2904 err = -ENOBUFS;
ae641949
HX
2905 hlen = LL_RESERVED_SPACE(dev);
2906 tlen = dev->needed_tailroom;
57031eb7
WB
2907 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2908 linear = max(linear, min_t(int, len, dev->hard_header_len));
2909 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
bfd5f4a3 2910 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2911 if (skb == NULL)
1da177e4
LT
2912 goto out_unlock;
2913
b84bbaf7 2914 skb_reset_network_header(skb);
1da177e4 2915
0c4e8581 2916 err = -EINVAL;
9c707762
WB
2917 if (sock->type == SOCK_DGRAM) {
2918 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2919 if (unlikely(offset < 0))
9c707762 2920 goto out_free;
b84bbaf7 2921 } else if (reserve) {
9aad13b0 2922 skb_reserve(skb, -reserve);
88a8121d
ND
2923 if (len < reserve + sizeof(struct ipv6hdr) &&
2924 dev->min_header_len != dev->hard_header_len)
993675a3 2925 skb_reset_network_header(skb);
9c707762 2926 }
1da177e4
LT
2927
2928 /* Returns -EFAULT on error */
c0371da6 2929 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2930 if (err)
2931 goto out_free;
bf84a010 2932
9ed988cd
WB
2933 if (sock->type == SOCK_RAW &&
2934 !dev_validate_header(dev, skb->data, len)) {
2935 err = -EINVAL;
2936 goto out_free;
2937 }
2938
8f932f76 2939 skb_setup_tx_timestamp(skb, sockc.tsflags);
1da177e4 2940
16cc1400 2941 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
2942 !packet_extra_vlan_len_allowed(dev, skb)) {
2943 err = -EMSGSIZE;
2944 goto out_free;
57f89bfa
BG
2945 }
2946
09effa67
DM
2947 skb->protocol = proto;
2948 skb->dev = dev;
1da177e4 2949 skb->priority = sk->sk_priority;
c7d39e32 2950 skb->mark = sockc.mark;
3d0ba8c0 2951 skb->tstamp = sockc.transmit_time;
0fd5d57b 2952
da7c9561 2953 if (has_vnet_hdr) {
db60eb5f 2954 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
16cc1400
WB
2955 if (err)
2956 goto out_free;
2957 len += sizeof(vnet_hdr);
9d2f67e4 2958 virtio_net_hdr_set_proto(skb, &vnet_hdr);
bfd5f4a3
SS
2959 }
2960
75c65772 2961 packet_parse_headers(skb, sock);
8fd6c80d 2962
3bdc0eba
BG
2963 if (unlikely(extra_len == 4))
2964 skb->no_fcs = 1;
2965
d346a3fa 2966 err = po->xmit(skb);
1da177e4
LT
2967 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2968 goto out_unlock;
2969
e40526cb 2970 dev_put(dev);
1da177e4 2971
40d4e3df 2972 return len;
1da177e4
LT
2973
2974out_free:
2975 kfree_skb(skb);
2976out_unlock:
e40526cb 2977 if (dev)
1da177e4
LT
2978 dev_put(dev);
2979out:
2980 return err;
2981}
2982
1b784140 2983static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2984{
69e3c75f
JB
2985 struct sock *sk = sock->sk;
2986 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2987
69e3c75f
JB
2988 if (po->tx_ring.pg_vec)
2989 return tpacket_snd(po, msg);
2990 else
69e3c75f
JB
2991 return packet_snd(sock, msg, len);
2992}
2993
1da177e4
LT
2994/*
2995 * Close a PACKET socket. This is fairly simple. We immediately go
2996 * to 'closed' state and remove our protocol entry in the device list.
2997 */
2998
2999static int packet_release(struct socket *sock)
3000{
3001 struct sock *sk = sock->sk;
3002 struct packet_sock *po;
2bd624b4 3003 struct packet_fanout *f;
d12d01d6 3004 struct net *net;
f6fb8f10 3005 union tpacket_req_u req_u;
1da177e4
LT
3006
3007 if (!sk)
3008 return 0;
3009
3b1e0a65 3010 net = sock_net(sk);
1da177e4
LT
3011 po = pkt_sk(sk);
3012
0fa7fa98 3013 mutex_lock(&net->packet.sklist_lock);
808f5114 3014 sk_del_node_init_rcu(sk);
0fa7fa98
PE
3015 mutex_unlock(&net->packet.sklist_lock);
3016
3017 preempt_disable();
920de804 3018 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 3019 preempt_enable();
1da177e4 3020
808f5114 3021 spin_lock(&po->bind_lock);
ce06b03e 3022 unregister_prot_hook(sk, false);
66e56cd4
DB
3023 packet_cached_dev_reset(po);
3024
160ff18a
BG
3025 if (po->prot_hook.dev) {
3026 dev_put(po->prot_hook.dev);
3027 po->prot_hook.dev = NULL;
3028 }
808f5114 3029 spin_unlock(&po->bind_lock);
1da177e4 3030
1da177e4 3031 packet_flush_mclist(sk);
1da177e4 3032
5171b37d 3033 lock_sock(sk);
9665d5d6
PS
3034 if (po->rx_ring.pg_vec) {
3035 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3036 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 3037 }
69e3c75f 3038
9665d5d6
PS
3039 if (po->tx_ring.pg_vec) {
3040 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3041 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 3042 }
5171b37d 3043 release_sock(sk);
1da177e4 3044
2bd624b4 3045 f = fanout_release(sk);
dc99f600 3046
808f5114 3047 synchronize_net();
2bd624b4 3048
afa0925c 3049 kfree(po->rollover);
2bd624b4
AS
3050 if (f) {
3051 fanout_release_data(f);
3052 kfree(f);
3053 }
1da177e4
LT
3054 /*
3055 * Now the socket is dead. No more input will appear.
3056 */
1da177e4
LT
3057 sock_orphan(sk);
3058 sock->sk = NULL;
3059
3060 /* Purge queues */
3061
3062 skb_queue_purge(&sk->sk_receive_queue);
b0138408 3063 packet_free_pending(po);
17ab56a2 3064 sk_refcnt_debug_release(sk);
1da177e4
LT
3065
3066 sock_put(sk);
3067 return 0;
3068}
3069
3070/*
3071 * Attach a packet hook.
3072 */
3073
30f7ea1c
FR
3074static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3075 __be16 proto)
1da177e4
LT
3076{
3077 struct packet_sock *po = pkt_sk(sk);
158cd4af 3078 struct net_device *dev_curr;
902fefb8
DB
3079 __be16 proto_curr;
3080 bool need_rehook;
30f7ea1c
FR
3081 struct net_device *dev = NULL;
3082 int ret = 0;
3083 bool unlisted = false;
dc99f600 3084
1da177e4 3085 lock_sock(sk);
1da177e4 3086 spin_lock(&po->bind_lock);
30f7ea1c
FR
3087 rcu_read_lock();
3088
4971613c
WB
3089 if (po->fanout) {
3090 ret = -EINVAL;
3091 goto out_unlock;
3092 }
3093
30f7ea1c
FR
3094 if (name) {
3095 dev = dev_get_by_name_rcu(sock_net(sk), name);
3096 if (!dev) {
3097 ret = -ENODEV;
3098 goto out_unlock;
3099 }
3100 } else if (ifindex) {
3101 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3102 if (!dev) {
3103 ret = -ENODEV;
3104 goto out_unlock;
3105 }
3106 }
3107
3108 if (dev)
3109 dev_hold(dev);
66e56cd4 3110
902fefb8
DB
3111 proto_curr = po->prot_hook.type;
3112 dev_curr = po->prot_hook.dev;
3113
3114 need_rehook = proto_curr != proto || dev_curr != dev;
3115
3116 if (need_rehook) {
30f7ea1c
FR
3117 if (po->running) {
3118 rcu_read_unlock();
15fe076e
ED
3119 /* prevents packet_notifier() from calling
3120 * register_prot_hook()
3121 */
3122 po->num = 0;
30f7ea1c
FR
3123 __unregister_prot_hook(sk, true);
3124 rcu_read_lock();
3125 dev_curr = po->prot_hook.dev;
3126 if (dev)
3127 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3128 dev->ifindex);
3129 }
1da177e4 3130
15fe076e 3131 BUG_ON(po->running);
902fefb8
DB
3132 po->num = proto;
3133 po->prot_hook.type = proto;
902fefb8 3134
30f7ea1c
FR
3135 if (unlikely(unlisted)) {
3136 dev_put(dev);
3137 po->prot_hook.dev = NULL;
3138 po->ifindex = -1;
3139 packet_cached_dev_reset(po);
3140 } else {
3141 po->prot_hook.dev = dev;
3142 po->ifindex = dev ? dev->ifindex : 0;
3143 packet_cached_dev_assign(po, dev);
3144 }
902fefb8 3145 }
158cd4af
LW
3146 if (dev_curr)
3147 dev_put(dev_curr);
66e56cd4 3148
902fefb8 3149 if (proto == 0 || !need_rehook)
1da177e4
LT
3150 goto out_unlock;
3151
30f7ea1c 3152 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3153 register_prot_hook(sk);
be85d4ad
UT
3154 } else {
3155 sk->sk_err = ENETDOWN;
3156 if (!sock_flag(sk, SOCK_DEAD))
3157 sk->sk_error_report(sk);
1da177e4
LT
3158 }
3159
3160out_unlock:
30f7ea1c 3161 rcu_read_unlock();
1da177e4
LT
3162 spin_unlock(&po->bind_lock);
3163 release_sock(sk);
30f7ea1c 3164 return ret;
1da177e4
LT
3165}
3166
3167/*
3168 * Bind a packet socket to a device
3169 */
3170
40d4e3df
ED
3171static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3172 int addr_len)
1da177e4 3173{
40d4e3df 3174 struct sock *sk = sock->sk;
540e2894 3175 char name[sizeof(uaddr->sa_data) + 1];
1ce4f28b 3176
1da177e4
LT
3177 /*
3178 * Check legality
3179 */
1ce4f28b 3180
8ae55f04 3181 if (addr_len != sizeof(struct sockaddr))
1da177e4 3182 return -EINVAL;
540e2894
AP
3183 /* uaddr->sa_data comes from the userspace, it's not guaranteed to be
3184 * zero-terminated.
3185 */
3186 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3187 name[sizeof(uaddr->sa_data)] = 0;
1da177e4 3188
30f7ea1c 3189 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3190}
1da177e4
LT
3191
3192static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3193{
40d4e3df
ED
3194 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3195 struct sock *sk = sock->sk;
1da177e4
LT
3196
3197 /*
3198 * Check legality
3199 */
1ce4f28b 3200
1da177e4
LT
3201 if (addr_len < sizeof(struct sockaddr_ll))
3202 return -EINVAL;
3203 if (sll->sll_family != AF_PACKET)
3204 return -EINVAL;
3205
30f7ea1c
FR
3206 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3207 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3208}
3209
3210static struct proto packet_proto = {
3211 .name = "PACKET",
3212 .owner = THIS_MODULE,
3213 .obj_size = sizeof(struct packet_sock),
3214};
3215
3216/*
1ce4f28b 3217 * Create a packet of type SOCK_PACKET.
1da177e4
LT
3218 */
3219
3f378b68
EP
3220static int packet_create(struct net *net, struct socket *sock, int protocol,
3221 int kern)
1da177e4
LT
3222{
3223 struct sock *sk;
3224 struct packet_sock *po;
0e11c91e 3225 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3226 int err;
3227
df008c91 3228 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3229 return -EPERM;
be02097c
DM
3230 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3231 sock->type != SOCK_PACKET)
1da177e4
LT
3232 return -ESOCKTNOSUPPORT;
3233
3234 sock->state = SS_UNCONNECTED;
3235
3236 err = -ENOBUFS;
11aa9c28 3237 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3238 if (sk == NULL)
3239 goto out;
3240
3241 sock->ops = &packet_ops;
1da177e4
LT
3242 if (sock->type == SOCK_PACKET)
3243 sock->ops = &packet_ops_spkt;
be02097c 3244
1da177e4
LT
3245 sock_init_data(sock, sk);
3246
3247 po = pkt_sk(sk);
89ed5b51 3248 init_completion(&po->skb_completion);
1da177e4 3249 sk->sk_family = PF_PACKET;
0e11c91e 3250 po->num = proto;
d346a3fa 3251 po->xmit = dev_queue_xmit;
66e56cd4 3252
b0138408
DB
3253 err = packet_alloc_pending(po);
3254 if (err)
3255 goto out2;
3256
66e56cd4 3257 packet_cached_dev_reset(po);
1da177e4
LT
3258
3259 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3260 sk_refcnt_debug_inc(sk);
1da177e4
LT
3261
3262 /*
3263 * Attach a protocol block
3264 */
3265
3266 spin_lock_init(&po->bind_lock);
905db440 3267 mutex_init(&po->pg_vec_lock);
0648ab70 3268 po->rollover = NULL;
1da177e4 3269 po->prot_hook.func = packet_rcv;
be02097c 3270
1da177e4
LT
3271 if (sock->type == SOCK_PACKET)
3272 po->prot_hook.func = packet_rcv_spkt;
be02097c 3273
1da177e4
LT
3274 po->prot_hook.af_packet_priv = sk;
3275
0e11c91e
AV
3276 if (proto) {
3277 po->prot_hook.type = proto;
a6361f0c 3278 __register_prot_hook(sk);
1da177e4
LT
3279 }
3280
0fa7fa98 3281 mutex_lock(&net->packet.sklist_lock);
a4dc6a49 3282 sk_add_node_tail_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3283 mutex_unlock(&net->packet.sklist_lock);
3284
3285 preempt_disable();
3680453c 3286 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3287 preempt_enable();
808f5114 3288
40d4e3df 3289 return 0;
b0138408
DB
3290out2:
3291 sk_free(sk);
1da177e4
LT
3292out:
3293 return err;
3294}
3295
3296/*
3297 * Pull a packet from our receive queue and hand it to the user.
3298 * If necessary we block.
3299 */
3300
1b784140
YX
3301static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3302 int flags)
1da177e4
LT
3303{
3304 struct sock *sk = sock->sk;
3305 struct sk_buff *skb;
3306 int copied, err;
bfd5f4a3 3307 int vnet_hdr_len = 0;
2472d761 3308 unsigned int origlen = 0;
1da177e4
LT
3309
3310 err = -EINVAL;
ed85b565 3311 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3312 goto out;
3313
3314#if 0
3315 /* What error should we return now? EUNATTACH? */
3316 if (pkt_sk(sk)->ifindex < 0)
3317 return -ENODEV;
3318#endif
3319
ed85b565 3320 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3321 err = sock_recv_errqueue(sk, msg, len,
3322 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3323 goto out;
3324 }
3325
1da177e4
LT
3326 /*
3327 * Call the generic datagram receiver. This handles all sorts
3328 * of horrible races and re-entrancy so we can forget about it
3329 * in the protocol layers.
3330 *
3331 * Now it will return ENETDOWN, if device have just gone down,
3332 * but then it will block.
3333 */
3334
40d4e3df 3335 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3336
3337 /*
1ce4f28b 3338 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
3339 * handles the blocking we don't see and worry about blocking
3340 * retries.
3341 */
3342
8ae55f04 3343 if (skb == NULL)
1da177e4
LT
3344 goto out;
3345
9bb6cd65 3346 packet_rcv_try_clear_pressure(pkt_sk(sk));
2ccdbaa6 3347
bfd5f4a3 3348 if (pkt_sk(sk)->has_vnet_hdr) {
16cc1400
WB
3349 err = packet_rcv_vnet(msg, skb, &len);
3350 if (err)
bfd5f4a3 3351 goto out_free;
16cc1400 3352 vnet_hdr_len = sizeof(struct virtio_net_hdr);
bfd5f4a3
SS
3353 }
3354
f3d33426
HFS
3355 /* You lose any data beyond the buffer you gave. If it worries
3356 * a user program they can ask the device for its MTU
3357 * anyway.
1da177e4 3358 */
1da177e4 3359 copied = skb->len;
40d4e3df
ED
3360 if (copied > len) {
3361 copied = len;
3362 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3363 }
3364
51f3d02b 3365 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3366 if (err)
3367 goto out_free;
3368
2472d761
EB
3369 if (sock->type != SOCK_PACKET) {
3370 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3371
3372 /* Original length was stored in sockaddr_ll fields */
3373 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3374 sll->sll_family = AF_PACKET;
3375 sll->sll_protocol = skb->protocol;
3376 }
3377
3b885787 3378 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3379
f3d33426 3380 if (msg->msg_name) {
b2cf86e1
WB
3381 int copy_len;
3382
f3d33426
HFS
3383 /* If the address length field is there to be filled
3384 * in, we fill it in now.
3385 */
3386 if (sock->type == SOCK_PACKET) {
342dfc30 3387 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426 3388 msg->msg_namelen = sizeof(struct sockaddr_pkt);
b2cf86e1 3389 copy_len = msg->msg_namelen;
f3d33426
HFS
3390 } else {
3391 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3392
f3d33426
HFS
3393 msg->msg_namelen = sll->sll_halen +
3394 offsetof(struct sockaddr_ll, sll_addr);
b2cf86e1
WB
3395 copy_len = msg->msg_namelen;
3396 if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
3397 memset(msg->msg_name +
3398 offsetof(struct sockaddr_ll, sll_addr),
3399 0, sizeof(sll->sll_addr));
3400 msg->msg_namelen = sizeof(struct sockaddr_ll);
3401 }
f3d33426 3402 }
b2cf86e1 3403 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
f3d33426 3404 }
1da177e4 3405
8dc41944 3406 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3407 struct tpacket_auxdata aux;
3408
3409 aux.tp_status = TP_STATUS_USER;
3410 if (skb->ip_summed == CHECKSUM_PARTIAL)
3411 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3412 else if (skb->pkt_type != PACKET_OUTGOING &&
3413 (skb->ip_summed == CHECKSUM_COMPLETE ||
3414 skb_csum_unnecessary(skb)))
3415 aux.tp_status |= TP_STATUS_CSUM_VALID;
3416
2472d761 3417 aux.tp_len = origlen;
ffbc6111
HX
3418 aux.tp_snaplen = skb->len;
3419 aux.tp_mac = 0;
bbe735e4 3420 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3421 if (skb_vlan_tag_present(skb)) {
3422 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3423 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3424 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3425 } else {
3426 aux.tp_vlan_tci = 0;
a0cdfcf3 3427 aux.tp_vlan_tpid = 0;
a3bcc23e 3428 }
ffbc6111 3429 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3430 }
3431
1da177e4
LT
3432 /*
3433 * Free or return the buffer as appropriate. Again this
3434 * hides all the races and re-entrancy issues from us.
3435 */
bfd5f4a3 3436 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3437
3438out_free:
3439 skb_free_datagram(sk, skb);
3440out:
3441 return err;
3442}
3443
1da177e4 3444static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3445 int peer)
1da177e4
LT
3446{
3447 struct net_device *dev;
3448 struct sock *sk = sock->sk;
3449
3450 if (peer)
3451 return -EOPNOTSUPP;
3452
3453 uaddr->sa_family = AF_PACKET;
2dc85bf3 3454 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3455 rcu_read_lock();
3456 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3457 if (dev)
2dc85bf3 3458 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3459 rcu_read_unlock();
1da177e4 3460
9b2c45d4 3461 return sizeof(*uaddr);
1da177e4 3462}
1da177e4
LT
3463
3464static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3465 int peer)
1da177e4
LT
3466{
3467 struct net_device *dev;
3468 struct sock *sk = sock->sk;
3469 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3470 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3471
3472 if (peer)
3473 return -EOPNOTSUPP;
3474
3475 sll->sll_family = AF_PACKET;
3476 sll->sll_ifindex = po->ifindex;
3477 sll->sll_protocol = po->num;
67286640 3478 sll->sll_pkttype = 0;
654d1f8a
ED
3479 rcu_read_lock();
3480 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3481 if (dev) {
3482 sll->sll_hatype = dev->type;
3483 sll->sll_halen = dev->addr_len;
3484 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3485 } else {
3486 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3487 sll->sll_halen = 0;
3488 }
654d1f8a 3489 rcu_read_unlock();
1da177e4 3490
9b2c45d4 3491 return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3492}
3493
2aeb0b88
WC
3494static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3495 int what)
1da177e4
LT
3496{
3497 switch (i->type) {
3498 case PACKET_MR_MULTICAST:
1162563f
JP
3499 if (i->alen != dev->addr_len)
3500 return -EINVAL;
1da177e4 3501 if (what > 0)
22bedad3 3502 return dev_mc_add(dev, i->addr);
1da177e4 3503 else
22bedad3 3504 return dev_mc_del(dev, i->addr);
1da177e4
LT
3505 break;
3506 case PACKET_MR_PROMISC:
2aeb0b88 3507 return dev_set_promiscuity(dev, what);
1da177e4 3508 case PACKET_MR_ALLMULTI:
2aeb0b88 3509 return dev_set_allmulti(dev, what);
d95ed927 3510 case PACKET_MR_UNICAST:
1162563f
JP
3511 if (i->alen != dev->addr_len)
3512 return -EINVAL;
d95ed927 3513 if (what > 0)
a748ee24 3514 return dev_uc_add(dev, i->addr);
d95ed927 3515 else
a748ee24 3516 return dev_uc_del(dev, i->addr);
d95ed927 3517 break;
40d4e3df
ED
3518 default:
3519 break;
1da177e4 3520 }
2aeb0b88 3521 return 0;
1da177e4
LT
3522}
3523
82f17091
FR
3524static void packet_dev_mclist_delete(struct net_device *dev,
3525 struct packet_mclist **mlp)
1da177e4 3526{
82f17091
FR
3527 struct packet_mclist *ml;
3528
3529 while ((ml = *mlp) != NULL) {
3530 if (ml->ifindex == dev->ifindex) {
3531 packet_dev_mc(dev, ml, -1);
3532 *mlp = ml->next;
3533 kfree(ml);
3534 } else
3535 mlp = &ml->next;
1da177e4
LT
3536 }
3537}
3538
0fb375fb 3539static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3540{
3541 struct packet_sock *po = pkt_sk(sk);
3542 struct packet_mclist *ml, *i;
3543 struct net_device *dev;
3544 int err;
3545
3546 rtnl_lock();
3547
3548 err = -ENODEV;
3b1e0a65 3549 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3550 if (!dev)
3551 goto done;
3552
3553 err = -EINVAL;
1162563f 3554 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3555 goto done;
3556
3557 err = -ENOBUFS;
8b3a7005 3558 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3559 if (i == NULL)
3560 goto done;
3561
3562 err = 0;
3563 for (ml = po->mclist; ml; ml = ml->next) {
3564 if (ml->ifindex == mreq->mr_ifindex &&
3565 ml->type == mreq->mr_type &&
3566 ml->alen == mreq->mr_alen &&
3567 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3568 ml->count++;
3569 /* Free the new element ... */
3570 kfree(i);
3571 goto done;
3572 }
3573 }
3574
3575 i->type = mreq->mr_type;
3576 i->ifindex = mreq->mr_ifindex;
3577 i->alen = mreq->mr_alen;
3578 memcpy(i->addr, mreq->mr_address, i->alen);
309cf37f 3579 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3580 i->count = 1;
3581 i->next = po->mclist;
3582 po->mclist = i;
2aeb0b88
WC
3583 err = packet_dev_mc(dev, i, 1);
3584 if (err) {
3585 po->mclist = i->next;
3586 kfree(i);
3587 }
1da177e4
LT
3588
3589done:
3590 rtnl_unlock();
3591 return err;
3592}
3593
0fb375fb 3594static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3595{
3596 struct packet_mclist *ml, **mlp;
3597
3598 rtnl_lock();
3599
3600 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3601 if (ml->ifindex == mreq->mr_ifindex &&
3602 ml->type == mreq->mr_type &&
3603 ml->alen == mreq->mr_alen &&
3604 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3605 if (--ml->count == 0) {
3606 struct net_device *dev;
3607 *mlp = ml->next;
ad959e76
ED
3608 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3609 if (dev)
1da177e4 3610 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3611 kfree(ml);
3612 }
82f17091 3613 break;
1da177e4
LT
3614 }
3615 }
3616 rtnl_unlock();
82f17091 3617 return 0;
1da177e4
LT
3618}
3619
3620static void packet_flush_mclist(struct sock *sk)
3621{
3622 struct packet_sock *po = pkt_sk(sk);
3623 struct packet_mclist *ml;
3624
3625 if (!po->mclist)
3626 return;
3627
3628 rtnl_lock();
3629 while ((ml = po->mclist) != NULL) {
3630 struct net_device *dev;
3631
3632 po->mclist = ml->next;
ad959e76
ED
3633 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3634 if (dev != NULL)
1da177e4 3635 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3636 kfree(ml);
3637 }
3638 rtnl_unlock();
3639}
1da177e4
LT
3640
3641static int
b7058842 3642packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3643{
3644 struct sock *sk = sock->sk;
8dc41944 3645 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3646 int ret;
3647
3648 if (level != SOL_PACKET)
3649 return -ENOPROTOOPT;
3650
69e3c75f 3651 switch (optname) {
1ce4f28b 3652 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3653 case PACKET_DROP_MEMBERSHIP:
3654 {
0fb375fb
EB
3655 struct packet_mreq_max mreq;
3656 int len = optlen;
3657 memset(&mreq, 0, sizeof(mreq));
3658 if (len < sizeof(struct packet_mreq))
1da177e4 3659 return -EINVAL;
0fb375fb
EB
3660 if (len > sizeof(mreq))
3661 len = sizeof(mreq);
40d4e3df 3662 if (copy_from_user(&mreq, optval, len))
1da177e4 3663 return -EFAULT;
0fb375fb
EB
3664 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3665 return -EINVAL;
1da177e4
LT
3666 if (optname == PACKET_ADD_MEMBERSHIP)
3667 ret = packet_mc_add(sk, &mreq);
3668 else
3669 ret = packet_mc_drop(sk, &mreq);
3670 return ret;
3671 }
a2efcfa0 3672
1da177e4 3673 case PACKET_RX_RING:
69e3c75f 3674 case PACKET_TX_RING:
1da177e4 3675 {
f6fb8f10 3676 union tpacket_req_u req_u;
3677 int len;
1da177e4 3678
5171b37d 3679 lock_sock(sk);
f6fb8f10 3680 switch (po->tp_version) {
3681 case TPACKET_V1:
3682 case TPACKET_V2:
3683 len = sizeof(req_u.req);
3684 break;
3685 case TPACKET_V3:
3686 default:
3687 len = sizeof(req_u.req3);
3688 break;
3689 }
5171b37d
ED
3690 if (optlen < len) {
3691 ret = -EINVAL;
3692 } else {
3693 if (copy_from_user(&req_u.req, optval, len))
3694 ret = -EFAULT;
3695 else
3696 ret = packet_set_ring(sk, &req_u, 0,
3697 optname == PACKET_TX_RING);
3698 }
3699 release_sock(sk);
3700 return ret;
1da177e4
LT
3701 }
3702 case PACKET_COPY_THRESH:
3703 {
3704 int val;
3705
40d4e3df 3706 if (optlen != sizeof(val))
1da177e4 3707 return -EINVAL;
40d4e3df 3708 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3709 return -EFAULT;
3710
3711 pkt_sk(sk)->copy_thresh = val;
3712 return 0;
3713 }
bbd6ef87
PM
3714 case PACKET_VERSION:
3715 {
3716 int val;
3717
3718 if (optlen != sizeof(val))
3719 return -EINVAL;
bbd6ef87
PM
3720 if (copy_from_user(&val, optval, sizeof(val)))
3721 return -EFAULT;
3722 switch (val) {
3723 case TPACKET_V1:
3724 case TPACKET_V2:
f6fb8f10 3725 case TPACKET_V3:
84ac7260 3726 break;
bbd6ef87
PM
3727 default:
3728 return -EINVAL;
3729 }
84ac7260
PP
3730 lock_sock(sk);
3731 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3732 ret = -EBUSY;
3733 } else {
3734 po->tp_version = val;
3735 ret = 0;
3736 }
3737 release_sock(sk);
3738 return ret;
bbd6ef87 3739 }
8913336a
PM
3740 case PACKET_RESERVE:
3741 {
3742 unsigned int val;
3743
3744 if (optlen != sizeof(val))
3745 return -EINVAL;
8913336a
PM
3746 if (copy_from_user(&val, optval, sizeof(val)))
3747 return -EFAULT;
bcc5364b
AK
3748 if (val > INT_MAX)
3749 return -EINVAL;
c27927e3
WB
3750 lock_sock(sk);
3751 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3752 ret = -EBUSY;
3753 } else {
3754 po->tp_reserve = val;
3755 ret = 0;
3756 }
3757 release_sock(sk);
3758 return ret;
8913336a 3759 }
69e3c75f
JB
3760 case PACKET_LOSS:
3761 {
3762 unsigned int val;
3763
3764 if (optlen != sizeof(val))
3765 return -EINVAL;
69e3c75f
JB
3766 if (copy_from_user(&val, optval, sizeof(val)))
3767 return -EFAULT;
a6361f0c
WB
3768
3769 lock_sock(sk);
3770 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3771 ret = -EBUSY;
3772 } else {
3773 po->tp_loss = !!val;
3774 ret = 0;
3775 }
3776 release_sock(sk);
3777 return ret;
69e3c75f 3778 }
8dc41944
HX
3779 case PACKET_AUXDATA:
3780 {
3781 int val;
3782
3783 if (optlen < sizeof(val))
3784 return -EINVAL;
3785 if (copy_from_user(&val, optval, sizeof(val)))
3786 return -EFAULT;
3787
a6361f0c 3788 lock_sock(sk);
8dc41944 3789 po->auxdata = !!val;
a6361f0c 3790 release_sock(sk);
8dc41944
HX
3791 return 0;
3792 }
80feaacb
PWJ
3793 case PACKET_ORIGDEV:
3794 {
3795 int val;
3796
3797 if (optlen < sizeof(val))
3798 return -EINVAL;
3799 if (copy_from_user(&val, optval, sizeof(val)))
3800 return -EFAULT;
3801
a6361f0c 3802 lock_sock(sk);
80feaacb 3803 po->origdev = !!val;
a6361f0c 3804 release_sock(sk);
80feaacb
PWJ
3805 return 0;
3806 }
bfd5f4a3
SS
3807 case PACKET_VNET_HDR:
3808 {
3809 int val;
3810
3811 if (sock->type != SOCK_RAW)
3812 return -EINVAL;
bfd5f4a3
SS
3813 if (optlen < sizeof(val))
3814 return -EINVAL;
3815 if (copy_from_user(&val, optval, sizeof(val)))
3816 return -EFAULT;
3817
a6361f0c
WB
3818 lock_sock(sk);
3819 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3820 ret = -EBUSY;
3821 } else {
3822 po->has_vnet_hdr = !!val;
3823 ret = 0;
3824 }
3825 release_sock(sk);
3826 return ret;
bfd5f4a3 3827 }
614f60fa
SM
3828 case PACKET_TIMESTAMP:
3829 {
3830 int val;
3831
3832 if (optlen != sizeof(val))
3833 return -EINVAL;
3834 if (copy_from_user(&val, optval, sizeof(val)))
3835 return -EFAULT;
3836
3837 po->tp_tstamp = val;
3838 return 0;
3839 }
dc99f600
DM
3840 case PACKET_FANOUT:
3841 {
3842 int val;
3843
3844 if (optlen != sizeof(val))
3845 return -EINVAL;
3846 if (copy_from_user(&val, optval, sizeof(val)))
3847 return -EFAULT;
3848
3849 return fanout_add(sk, val & 0xffff, val >> 16);
3850 }
47dceb8e
WB
3851 case PACKET_FANOUT_DATA:
3852 {
3853 if (!po->fanout)
3854 return -EINVAL;
3855
3856 return fanout_set_data(po, optval, optlen);
3857 }
fa788d98
VW
3858 case PACKET_IGNORE_OUTGOING:
3859 {
3860 int val;
3861
3862 if (optlen != sizeof(val))
3863 return -EINVAL;
3864 if (copy_from_user(&val, optval, sizeof(val)))
3865 return -EFAULT;
3866 if (val < 0 || val > 1)
3867 return -EINVAL;
3868
3869 po->prot_hook.ignore_outgoing = !!val;
3870 return 0;
3871 }
5920cd3a
PC
3872 case PACKET_TX_HAS_OFF:
3873 {
3874 unsigned int val;
3875
3876 if (optlen != sizeof(val))
3877 return -EINVAL;
5920cd3a
PC
3878 if (copy_from_user(&val, optval, sizeof(val)))
3879 return -EFAULT;
a6361f0c
WB
3880
3881 lock_sock(sk);
3882 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3883 ret = -EBUSY;
3884 } else {
3885 po->tp_tx_has_off = !!val;
3886 ret = 0;
3887 }
3888 release_sock(sk);
5920cd3a
PC
3889 return 0;
3890 }
d346a3fa
DB
3891 case PACKET_QDISC_BYPASS:
3892 {
3893 int val;
3894
3895 if (optlen != sizeof(val))
3896 return -EINVAL;
3897 if (copy_from_user(&val, optval, sizeof(val)))
3898 return -EFAULT;
3899
3900 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3901 return 0;
3902 }
1da177e4
LT
3903 default:
3904 return -ENOPROTOOPT;
3905 }
3906}
3907
3908static int packet_getsockopt(struct socket *sock, int level, int optname,
3909 char __user *optval, int __user *optlen)
3910{
3911 int len;
c06fff6e 3912 int val, lv = sizeof(val);
1da177e4
LT
3913 struct sock *sk = sock->sk;
3914 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3915 void *data = &val;
ee80fbf3 3916 union tpacket_stats_u st;
a9b63918 3917 struct tpacket_rollover_stats rstats;
8e8e2951 3918 int drops;
1da177e4
LT
3919
3920 if (level != SOL_PACKET)
3921 return -ENOPROTOOPT;
3922
8ae55f04
KK
3923 if (get_user(len, optlen))
3924 return -EFAULT;
1da177e4
LT
3925
3926 if (len < 0)
3927 return -EINVAL;
1ce4f28b 3928
69e3c75f 3929 switch (optname) {
1da177e4 3930 case PACKET_STATISTICS:
1da177e4 3931 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3932 memcpy(&st, &po->stats, sizeof(st));
3933 memset(&po->stats, 0, sizeof(po->stats));
3934 spin_unlock_bh(&sk->sk_receive_queue.lock);
8e8e2951 3935 drops = atomic_xchg(&po->tp_drops, 0);
ee80fbf3 3936
f6fb8f10 3937 if (po->tp_version == TPACKET_V3) {
c06fff6e 3938 lv = sizeof(struct tpacket_stats_v3);
8e8e2951
ED
3939 st.stats3.tp_drops = drops;
3940 st.stats3.tp_packets += drops;
ee80fbf3 3941 data = &st.stats3;
f6fb8f10 3942 } else {
c06fff6e 3943 lv = sizeof(struct tpacket_stats);
8e8e2951
ED
3944 st.stats1.tp_drops = drops;
3945 st.stats1.tp_packets += drops;
ee80fbf3 3946 data = &st.stats1;
f6fb8f10 3947 }
ee80fbf3 3948
8dc41944
HX
3949 break;
3950 case PACKET_AUXDATA:
8dc41944 3951 val = po->auxdata;
80feaacb
PWJ
3952 break;
3953 case PACKET_ORIGDEV:
80feaacb 3954 val = po->origdev;
bfd5f4a3
SS
3955 break;
3956 case PACKET_VNET_HDR:
bfd5f4a3 3957 val = po->has_vnet_hdr;
1da177e4 3958 break;
bbd6ef87 3959 case PACKET_VERSION:
bbd6ef87 3960 val = po->tp_version;
bbd6ef87
PM
3961 break;
3962 case PACKET_HDRLEN:
3963 if (len > sizeof(int))
3964 len = sizeof(int);
fd2c83b3
AP
3965 if (len < sizeof(int))
3966 return -EINVAL;
bbd6ef87
PM
3967 if (copy_from_user(&val, optval, len))
3968 return -EFAULT;
3969 switch (val) {
3970 case TPACKET_V1:
3971 val = sizeof(struct tpacket_hdr);
3972 break;
3973 case TPACKET_V2:
3974 val = sizeof(struct tpacket2_hdr);
3975 break;
f6fb8f10 3976 case TPACKET_V3:
3977 val = sizeof(struct tpacket3_hdr);
3978 break;
bbd6ef87
PM
3979 default:
3980 return -EINVAL;
3981 }
bbd6ef87 3982 break;
8913336a 3983 case PACKET_RESERVE:
8913336a 3984 val = po->tp_reserve;
8913336a 3985 break;
69e3c75f 3986 case PACKET_LOSS:
69e3c75f 3987 val = po->tp_loss;
69e3c75f 3988 break;
614f60fa 3989 case PACKET_TIMESTAMP:
614f60fa 3990 val = po->tp_tstamp;
614f60fa 3991 break;
dc99f600 3992 case PACKET_FANOUT:
dc99f600
DM
3993 val = (po->fanout ?
3994 ((u32)po->fanout->id |
77f65ebd
WB
3995 ((u32)po->fanout->type << 16) |
3996 ((u32)po->fanout->flags << 24)) :
dc99f600 3997 0);
dc99f600 3998 break;
fa788d98
VW
3999 case PACKET_IGNORE_OUTGOING:
4000 val = po->prot_hook.ignore_outgoing;
4001 break;
a9b63918 4002 case PACKET_ROLLOVER_STATS:
57f015f5 4003 if (!po->rollover)
a9b63918 4004 return -EINVAL;
57f015f5
MM
4005 rstats.tp_all = atomic_long_read(&po->rollover->num);
4006 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
4007 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
4008 data = &rstats;
4009 lv = sizeof(rstats);
a9b63918 4010 break;
5920cd3a
PC
4011 case PACKET_TX_HAS_OFF:
4012 val = po->tp_tx_has_off;
4013 break;
d346a3fa
DB
4014 case PACKET_QDISC_BYPASS:
4015 val = packet_use_direct_xmit(po);
4016 break;
1da177e4
LT
4017 default:
4018 return -ENOPROTOOPT;
4019 }
4020
c06fff6e
ED
4021 if (len > lv)
4022 len = lv;
8ae55f04
KK
4023 if (put_user(len, optlen))
4024 return -EFAULT;
8dc41944
HX
4025 if (copy_to_user(optval, data, len))
4026 return -EFAULT;
8ae55f04 4027 return 0;
1da177e4
LT
4028}
4029
4030
719c44d3
WB
4031#ifdef CONFIG_COMPAT
4032static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
4033 char __user *optval, unsigned int optlen)
4034{
4035 struct packet_sock *po = pkt_sk(sock->sk);
4036
4037 if (level != SOL_PACKET)
4038 return -ENOPROTOOPT;
4039
4040 if (optname == PACKET_FANOUT_DATA &&
4041 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
4042 optval = (char __user *)get_compat_bpf_fprog(optval);
4043 if (!optval)
4044 return -EFAULT;
4045 optlen = sizeof(struct sock_fprog);
4046 }
4047
4048 return packet_setsockopt(sock, level, optname, optval, optlen);
4049}
4050#endif
4051
351638e7
JP
4052static int packet_notifier(struct notifier_block *this,
4053 unsigned long msg, void *ptr)
1da177e4
LT
4054{
4055 struct sock *sk;
351638e7 4056 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 4057 struct net *net = dev_net(dev);
1da177e4 4058
808f5114 4059 rcu_read_lock();
b67bfe0d 4060 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
4061 struct packet_sock *po = pkt_sk(sk);
4062
4063 switch (msg) {
4064 case NETDEV_UNREGISTER:
1da177e4 4065 if (po->mclist)
82f17091 4066 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
4067 /* fallthrough */
4068
1da177e4
LT
4069 case NETDEV_DOWN:
4070 if (dev->ifindex == po->ifindex) {
4071 spin_lock(&po->bind_lock);
4072 if (po->running) {
ce06b03e 4073 __unregister_prot_hook(sk, false);
1da177e4
LT
4074 sk->sk_err = ENETDOWN;
4075 if (!sock_flag(sk, SOCK_DEAD))
4076 sk->sk_error_report(sk);
4077 }
4078 if (msg == NETDEV_UNREGISTER) {
66e56cd4 4079 packet_cached_dev_reset(po);
1da177e4 4080 po->ifindex = -1;
160ff18a
BG
4081 if (po->prot_hook.dev)
4082 dev_put(po->prot_hook.dev);
1da177e4
LT
4083 po->prot_hook.dev = NULL;
4084 }
4085 spin_unlock(&po->bind_lock);
4086 }
4087 break;
4088 case NETDEV_UP:
808f5114 4089 if (dev->ifindex == po->ifindex) {
4090 spin_lock(&po->bind_lock);
ce06b03e
DM
4091 if (po->num)
4092 register_prot_hook(sk);
808f5114 4093 spin_unlock(&po->bind_lock);
1da177e4 4094 }
1da177e4
LT
4095 break;
4096 }
4097 }
808f5114 4098 rcu_read_unlock();
1da177e4
LT
4099 return NOTIFY_DONE;
4100}
4101
4102
4103static int packet_ioctl(struct socket *sock, unsigned int cmd,
4104 unsigned long arg)
4105{
4106 struct sock *sk = sock->sk;
4107
69e3c75f 4108 switch (cmd) {
40d4e3df
ED
4109 case SIOCOUTQ:
4110 {
4111 int amount = sk_wmem_alloc_get(sk);
31e6d363 4112
40d4e3df
ED
4113 return put_user(amount, (int __user *)arg);
4114 }
4115 case SIOCINQ:
4116 {
4117 struct sk_buff *skb;
4118 int amount = 0;
4119
4120 spin_lock_bh(&sk->sk_receive_queue.lock);
4121 skb = skb_peek(&sk->sk_receive_queue);
4122 if (skb)
4123 amount = skb->len;
4124 spin_unlock_bh(&sk->sk_receive_queue.lock);
4125 return put_user(amount, (int __user *)arg);
4126 }
1da177e4 4127#ifdef CONFIG_INET
40d4e3df
ED
4128 case SIOCADDRT:
4129 case SIOCDELRT:
4130 case SIOCDARP:
4131 case SIOCGARP:
4132 case SIOCSARP:
4133 case SIOCGIFADDR:
4134 case SIOCSIFADDR:
4135 case SIOCGIFBRDADDR:
4136 case SIOCSIFBRDADDR:
4137 case SIOCGIFNETMASK:
4138 case SIOCSIFNETMASK:
4139 case SIOCGIFDSTADDR:
4140 case SIOCSIFDSTADDR:
4141 case SIOCSIFFLAGS:
40d4e3df 4142 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
4143#endif
4144
40d4e3df
ED
4145 default:
4146 return -ENOIOCTLCMD;
1da177e4
LT
4147 }
4148 return 0;
4149}
4150
a11e1d43
LT
4151static __poll_t packet_poll(struct file *file, struct socket *sock,
4152 poll_table *wait)
1da177e4
LT
4153{
4154 struct sock *sk = sock->sk;
4155 struct packet_sock *po = pkt_sk(sk);
a11e1d43 4156 __poll_t mask = datagram_poll(file, sock, wait);
1da177e4
LT
4157
4158 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 4159 if (po->rx_ring.pg_vec) {
f6fb8f10 4160 if (!packet_previous_rx_frame(po, &po->rx_ring,
4161 TP_STATUS_KERNEL))
a9a08845 4162 mask |= EPOLLIN | EPOLLRDNORM;
1da177e4 4163 }
9bb6cd65 4164 packet_rcv_try_clear_pressure(po);
1da177e4 4165 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
4166 spin_lock_bh(&sk->sk_write_queue.lock);
4167 if (po->tx_ring.pg_vec) {
4168 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
a9a08845 4169 mask |= EPOLLOUT | EPOLLWRNORM;
69e3c75f
JB
4170 }
4171 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
4172 return mask;
4173}
4174
4175
4176/* Dirty? Well, I still did not learn better way to account
4177 * for user mmaps.
4178 */
4179
4180static void packet_mm_open(struct vm_area_struct *vma)
4181{
4182 struct file *file = vma->vm_file;
40d4e3df 4183 struct socket *sock = file->private_data;
1da177e4 4184 struct sock *sk = sock->sk;
1ce4f28b 4185
1da177e4
LT
4186 if (sk)
4187 atomic_inc(&pkt_sk(sk)->mapped);
4188}
4189
4190static void packet_mm_close(struct vm_area_struct *vma)
4191{
4192 struct file *file = vma->vm_file;
40d4e3df 4193 struct socket *sock = file->private_data;
1da177e4 4194 struct sock *sk = sock->sk;
1ce4f28b 4195
1da177e4
LT
4196 if (sk)
4197 atomic_dec(&pkt_sk(sk)->mapped);
4198}
4199
f0f37e2f 4200static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
4201 .open = packet_mm_open,
4202 .close = packet_mm_close,
1da177e4
LT
4203};
4204
3a7ad063
ED
4205static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4206 unsigned int len)
1da177e4
LT
4207{
4208 int i;
4209
4ebf0ae2 4210 for (i = 0; i < len; i++) {
0e3125c7 4211 if (likely(pg_vec[i].buffer)) {
3a7ad063
ED
4212 if (is_vmalloc_addr(pg_vec[i].buffer))
4213 vfree(pg_vec[i].buffer);
4214 else
4215 free_pages((unsigned long)pg_vec[i].buffer,
4216 order);
0e3125c7
NH
4217 pg_vec[i].buffer = NULL;
4218 }
1da177e4
LT
4219 }
4220 kfree(pg_vec);
4221}
4222
3a7ad063 4223static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 4224{
f0d4eb29 4225 char *buffer;
3a7ad063
ED
4226 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4227 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
0e3125c7 4228
3a7ad063 4229 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4230 if (buffer)
4231 return buffer;
4232
3a7ad063
ED
4233 /* __get_free_pages failed, fall back to vmalloc */
4234 buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
4235 if (buffer)
4236 return buffer;
0e3125c7 4237
3a7ad063
ED
4238 /* vmalloc failed, lets dig into swap here */
4239 gfp_flags &= ~__GFP_NORETRY;
4240 buffer = (char *) __get_free_pages(gfp_flags, order);
4241 if (buffer)
4242 return buffer;
4243
4244 /* complete and utter failure */
4245 return NULL;
4ebf0ae2
DM
4246}
4247
3a7ad063 4248static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
4249{
4250 unsigned int block_nr = req->tp_block_nr;
0e3125c7 4251 struct pgv *pg_vec;
4ebf0ae2
DM
4252 int i;
4253
398f0132 4254 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
4ebf0ae2
DM
4255 if (unlikely(!pg_vec))
4256 goto out;
4257
4258 for (i = 0; i < block_nr; i++) {
3a7ad063 4259 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 4260 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
4261 goto out_free_pgvec;
4262 }
4263
4264out:
4265 return pg_vec;
4266
4267out_free_pgvec:
3a7ad063 4268 free_pg_vec(pg_vec, order, block_nr);
4ebf0ae2
DM
4269 pg_vec = NULL;
4270 goto out;
4271}
1da177e4 4272
f6fb8f10 4273static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 4274 int closing, int tx_ring)
1da177e4 4275{
0e3125c7 4276 struct pgv *pg_vec = NULL;
1da177e4 4277 struct packet_sock *po = pkt_sk(sk);
3a7ad063 4278 int was_running, order = 0;
69e3c75f
JB
4279 struct packet_ring_buffer *rb;
4280 struct sk_buff_head *rb_queue;
0e11c91e 4281 __be16 num;
f6fb8f10 4282 int err = -EINVAL;
4283 /* Added to avoid minimal code churn */
4284 struct tpacket_req *req = &req_u->req;
4285
69e3c75f
JB
4286 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4287 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 4288
69e3c75f
JB
4289 err = -EBUSY;
4290 if (!closing) {
4291 if (atomic_read(&po->mapped))
4292 goto out;
b0138408 4293 if (packet_read_pending(rb))
69e3c75f
JB
4294 goto out;
4295 }
1da177e4 4296
69e3c75f 4297 if (req->tp_block_nr) {
4576cd46
WB
4298 unsigned int min_frame_size;
4299
69e3c75f
JB
4300 /* Sanity tests and some calculations */
4301 err = -EBUSY;
4302 if (unlikely(rb->pg_vec))
4303 goto out;
1da177e4 4304
bbd6ef87
PM
4305 switch (po->tp_version) {
4306 case TPACKET_V1:
4307 po->tp_hdrlen = TPACKET_HDRLEN;
4308 break;
4309 case TPACKET_V2:
4310 po->tp_hdrlen = TPACKET2_HDRLEN;
4311 break;
f6fb8f10 4312 case TPACKET_V3:
4313 po->tp_hdrlen = TPACKET3_HDRLEN;
4314 break;
bbd6ef87
PM
4315 }
4316
69e3c75f 4317 err = -EINVAL;
4ebf0ae2 4318 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 4319 goto out;
90836b67 4320 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
69e3c75f 4321 goto out;
4576cd46 4322 min_frame_size = po->tp_hdrlen + po->tp_reserve;
dc808110 4323 if (po->tp_version >= TPACKET_V3 &&
4576cd46
WB
4324 req->tp_block_size <
4325 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
dc808110 4326 goto out;
4576cd46 4327 if (unlikely(req->tp_frame_size < min_frame_size))
69e3c75f 4328 goto out;
4ebf0ae2 4329 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 4330 goto out;
1da177e4 4331
4194b491
TK
4332 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4333 if (unlikely(rb->frames_per_block == 0))
69e3c75f 4334 goto out;
fc62814d 4335 if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
8f8d28e4 4336 goto out;
69e3c75f
JB
4337 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4338 req->tp_frame_nr))
4339 goto out;
1da177e4
LT
4340
4341 err = -ENOMEM;
3a7ad063
ED
4342 order = get_order(req->tp_block_size);
4343 pg_vec = alloc_pg_vec(req, order);
4ebf0ae2 4344 if (unlikely(!pg_vec))
1da177e4 4345 goto out;
f6fb8f10 4346 switch (po->tp_version) {
4347 case TPACKET_V3:
7f953ab2
SV
4348 /* Block transmit is not supported yet */
4349 if (!tx_ring) {
e8e85cc5 4350 init_prb_bdqc(po, rb, pg_vec, req_u);
7f953ab2
SV
4351 } else {
4352 struct tpacket_req3 *req3 = &req_u->req3;
4353
4354 if (req3->tp_retire_blk_tov ||
4355 req3->tp_sizeof_priv ||
4356 req3->tp_feature_req_word) {
4357 err = -EINVAL;
55655e3d 4358 goto out_free_pg_vec;
7f953ab2
SV
4359 }
4360 }
d7cf0c34 4361 break;
f6fb8f10 4362 default:
4363 break;
4364 }
69e3c75f
JB
4365 }
4366 /* Done */
4367 else {
4368 err = -EINVAL;
4ebf0ae2 4369 if (unlikely(req->tp_frame_nr))
69e3c75f 4370 goto out;
1da177e4
LT
4371 }
4372
1da177e4
LT
4373
4374 /* Detach socket from network */
4375 spin_lock(&po->bind_lock);
4376 was_running = po->running;
4377 num = po->num;
4378 if (was_running) {
1da177e4 4379 po->num = 0;
ce06b03e 4380 __unregister_prot_hook(sk, false);
1da177e4
LT
4381 }
4382 spin_unlock(&po->bind_lock);
1ce4f28b 4383
1da177e4
LT
4384 synchronize_net();
4385
4386 err = -EBUSY;
905db440 4387 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
4388 if (closing || atomic_read(&po->mapped) == 0) {
4389 err = 0;
69e3c75f 4390 spin_lock_bh(&rb_queue->lock);
c053fd96 4391 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
4392 rb->frame_max = (req->tp_frame_nr - 1);
4393 rb->head = 0;
4394 rb->frame_size = req->tp_frame_size;
4395 spin_unlock_bh(&rb_queue->lock);
4396
3a7ad063 4397 swap(rb->pg_vec_order, order);
c053fd96 4398 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
4399
4400 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4401 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4402 tpacket_rcv : packet_rcv;
4403 skb_queue_purge(rb_queue);
1da177e4 4404 if (atomic_read(&po->mapped))
40d4e3df
ED
4405 pr_err("packet_mmap: vma is busy: %d\n",
4406 atomic_read(&po->mapped));
1da177e4 4407 }
905db440 4408 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4409
4410 spin_lock(&po->bind_lock);
ce06b03e 4411 if (was_running) {
1da177e4 4412 po->num = num;
ce06b03e 4413 register_prot_hook(sk);
1da177e4
LT
4414 }
4415 spin_unlock(&po->bind_lock);
c800aaf8 4416 if (pg_vec && (po->tp_version > TPACKET_V2)) {
f6fb8f10 4417 /* Because we don't support block-based V3 on tx-ring */
4418 if (!tx_ring)
73d0fcf2 4419 prb_shutdown_retire_blk_timer(po, rb_queue);
f6fb8f10 4420 }
1da177e4 4421
55655e3d 4422out_free_pg_vec:
1da177e4 4423 if (pg_vec)
3a7ad063 4424 free_pg_vec(pg_vec, order, req->tp_block_nr);
1da177e4
LT
4425out:
4426 return err;
4427}
4428
69e3c75f
JB
4429static int packet_mmap(struct file *file, struct socket *sock,
4430 struct vm_area_struct *vma)
1da177e4
LT
4431{
4432 struct sock *sk = sock->sk;
4433 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
4434 unsigned long size, expected_size;
4435 struct packet_ring_buffer *rb;
1da177e4
LT
4436 unsigned long start;
4437 int err = -EINVAL;
4438 int i;
4439
4440 if (vma->vm_pgoff)
4441 return -EINVAL;
4442
905db440 4443 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
4444
4445 expected_size = 0;
4446 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4447 if (rb->pg_vec) {
4448 expected_size += rb->pg_vec_len
4449 * rb->pg_vec_pages
4450 * PAGE_SIZE;
4451 }
4452 }
4453
4454 if (expected_size == 0)
1da177e4 4455 goto out;
69e3c75f
JB
4456
4457 size = vma->vm_end - vma->vm_start;
4458 if (size != expected_size)
1da177e4
LT
4459 goto out;
4460
1da177e4 4461 start = vma->vm_start;
69e3c75f
JB
4462 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4463 if (rb->pg_vec == NULL)
4464 continue;
4465
4466 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
4467 struct page *page;
4468 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4469 int pg_num;
4470
c56b4d90
CG
4471 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4472 page = pgv_to_page(kaddr);
69e3c75f
JB
4473 err = vm_insert_page(vma, start, page);
4474 if (unlikely(err))
4475 goto out;
4476 start += PAGE_SIZE;
0e3125c7 4477 kaddr += PAGE_SIZE;
69e3c75f 4478 }
4ebf0ae2 4479 }
1da177e4 4480 }
69e3c75f 4481
4ebf0ae2 4482 atomic_inc(&po->mapped);
1da177e4
LT
4483 vma->vm_ops = &packet_mmap_ops;
4484 err = 0;
4485
4486out:
905db440 4487 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4488 return err;
4489}
1da177e4 4490
90ddc4f0 4491static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4492 .family = PF_PACKET,
4493 .owner = THIS_MODULE,
4494 .release = packet_release,
4495 .bind = packet_bind_spkt,
4496 .connect = sock_no_connect,
4497 .socketpair = sock_no_socketpair,
4498 .accept = sock_no_accept,
4499 .getname = packet_getname_spkt,
a11e1d43 4500 .poll = datagram_poll,
1da177e4 4501 .ioctl = packet_ioctl,
c7cbdbf2 4502 .gettstamp = sock_gettstamp,
1da177e4
LT
4503 .listen = sock_no_listen,
4504 .shutdown = sock_no_shutdown,
4505 .setsockopt = sock_no_setsockopt,
4506 .getsockopt = sock_no_getsockopt,
4507 .sendmsg = packet_sendmsg_spkt,
4508 .recvmsg = packet_recvmsg,
4509 .mmap = sock_no_mmap,
4510 .sendpage = sock_no_sendpage,
4511};
1da177e4 4512
90ddc4f0 4513static const struct proto_ops packet_ops = {
1da177e4
LT
4514 .family = PF_PACKET,
4515 .owner = THIS_MODULE,
4516 .release = packet_release,
4517 .bind = packet_bind,
4518 .connect = sock_no_connect,
4519 .socketpair = sock_no_socketpair,
4520 .accept = sock_no_accept,
1ce4f28b 4521 .getname = packet_getname,
a11e1d43 4522 .poll = packet_poll,
1da177e4 4523 .ioctl = packet_ioctl,
c7cbdbf2 4524 .gettstamp = sock_gettstamp,
1da177e4
LT
4525 .listen = sock_no_listen,
4526 .shutdown = sock_no_shutdown,
4527 .setsockopt = packet_setsockopt,
4528 .getsockopt = packet_getsockopt,
719c44d3
WB
4529#ifdef CONFIG_COMPAT
4530 .compat_setsockopt = compat_packet_setsockopt,
4531#endif
1da177e4
LT
4532 .sendmsg = packet_sendmsg,
4533 .recvmsg = packet_recvmsg,
4534 .mmap = packet_mmap,
4535 .sendpage = sock_no_sendpage,
4536};
4537
ec1b4cf7 4538static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4539 .family = PF_PACKET,
4540 .create = packet_create,
4541 .owner = THIS_MODULE,
4542};
4543
4544static struct notifier_block packet_netdev_notifier = {
40d4e3df 4545 .notifier_call = packet_notifier,
1da177e4
LT
4546};
4547
4548#ifdef CONFIG_PROC_FS
1da177e4
LT
4549
4550static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4551 __acquires(RCU)
1da177e4 4552{
e372c414 4553 struct net *net = seq_file_net(seq);
808f5114 4554
4555 rcu_read_lock();
4556 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4557}
4558
4559static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4560{
1bf40954 4561 struct net *net = seq_file_net(seq);
808f5114 4562 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4563}
4564
4565static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4566 __releases(RCU)
1da177e4 4567{
808f5114 4568 rcu_read_unlock();
1da177e4
LT
4569}
4570
1ce4f28b 4571static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4572{
4573 if (v == SEQ_START_TOKEN)
4574 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4575 else {
b7ceabd9 4576 struct sock *s = sk_entry(v);
1da177e4
LT
4577 const struct packet_sock *po = pkt_sk(s);
4578
4579 seq_printf(seq,
71338aa7 4580 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4 4581 s,
41c6d650 4582 refcount_read(&s->sk_refcnt),
1da177e4
LT
4583 s->sk_type,
4584 ntohs(po->num),
4585 po->ifindex,
4586 po->running,
4587 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4588 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4589 sock_i_ino(s));
1da177e4
LT
4590 }
4591
4592 return 0;
4593}
4594
56b3d975 4595static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4596 .start = packet_seq_start,
4597 .next = packet_seq_next,
4598 .stop = packet_seq_stop,
4599 .show = packet_seq_show,
4600};
1da177e4
LT
4601#endif
4602
2c8c1e72 4603static int __net_init packet_net_init(struct net *net)
d12d01d6 4604{
0fa7fa98 4605 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4606 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4607
c3506372
CH
4608 if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
4609 sizeof(struct seq_net_private)))
d12d01d6
DL
4610 return -ENOMEM;
4611
4612 return 0;
4613}
4614
2c8c1e72 4615static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4616{
ece31ffd 4617 remove_proc_entry("packet", net->proc_net);
669f8f1a 4618 WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
d12d01d6
DL
4619}
4620
4621static struct pernet_operations packet_net_ops = {
4622 .init = packet_net_init,
4623 .exit = packet_net_exit,
4624};
4625
4626
1da177e4
LT
4627static void __exit packet_exit(void)
4628{
1da177e4 4629 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4630 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4631 sock_unregister(PF_PACKET);
4632 proto_unregister(&packet_proto);
4633}
4634
4635static int __init packet_init(void)
4636{
36096f2f 4637 int rc;
1da177e4 4638
36096f2f
Y
4639 rc = proto_register(&packet_proto, 0);
4640 if (rc)
1da177e4 4641 goto out;
36096f2f
Y
4642 rc = sock_register(&packet_family_ops);
4643 if (rc)
4644 goto out_proto;
4645 rc = register_pernet_subsys(&packet_net_ops);
4646 if (rc)
4647 goto out_sock;
4648 rc = register_netdevice_notifier(&packet_netdev_notifier);
4649 if (rc)
4650 goto out_pernet;
1da177e4 4651
36096f2f
Y
4652 return 0;
4653
4654out_pernet:
4655 unregister_pernet_subsys(&packet_net_ops);
4656out_sock:
4657 sock_unregister(PF_PACKET);
4658out_proto:
4659 proto_unregister(&packet_proto);
1da177e4
LT
4660out:
4661 return rc;
4662}
4663
4664module_init(packet_init);
4665module_exit(packet_exit);
4666MODULE_LICENSE("GPL");
4667MODULE_ALIAS_NETPROTO(PF_PACKET);