git.proxmox.com - mirror_ubuntu-focal-kernel.git - blame: net/packet/af_packet.c
net/packet: tpacket_rcv: do not increment ring index on drop
2874c5fd 1// SPDX-License-Identifier: GPL-2.0-or-later
1da177e4
LT
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * PACKET - implements raw packet sockets.
8 *
02c30a84 9 * Authors: Ross Biro
1da177e4
LT
10 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 * Alan Cox, <gw4pts@gw4pts.ampr.org>
12 *
1ce4f28b 13 * Fixes:
1da177e4
LT
14 * Alan Cox : verify_area() now used correctly
15 * Alan Cox : new skbuff lists, look ma no backlogs!
16 * Alan Cox : tidied skbuff lists.
17 * Alan Cox : Now uses generic datagram routines I
18 * added. Also fixed the peek/read crash
19 * from all old Linux datagram code.
20 * Alan Cox : Uses the improved datagram code.
21 * Alan Cox : Added NULL's for socket options.
22 * Alan Cox : Re-commented the code.
23 * Alan Cox : Use new kernel side addressing
24 * Rob Janssen : Correct MTU usage.
25 * Dave Platt : Counter leaks caused by incorrect
26 * interrupt locking and some slightly
27 * dubious gcc output. Can you read
28 * compiler: it said _VOLATILE_
29 * Richard Kooijman : Timestamp fixes.
30 * Alan Cox : New buffers. Use sk->mac.raw.
31 * Alan Cox : sendmsg/recvmsg support.
32 * Alan Cox : Protocol setting support
33 * Alexey Kuznetsov : Untied from IPv4 stack.
34 * Cyrus Durgin : Fixed kerneld for kmod.
35 * Michal Ostrowski : Module initialization cleanup.
1ce4f28b 36 * Ulises Alonso : Frame number limit removal and
1da177e4 37 * packet_set_ring memory leak.
0fb375fb
EB
38 * Eric Biederman : Allow for > 8 byte hardware addresses.
39 * The convention is that longer addresses
40 * will simply extend the hardware address
1ce4f28b 41 * byte arrays at the end of sockaddr_ll
0fb375fb 42 * and packet_mreq.
69e3c75f 43 * Johann Baudy : Added TX RING.
f6fb8f10 44 * Chetan Loke : Implemented TPACKET_V3 block abstraction
45 * layer.
46 * Copyright (C) 2011, <lokec@ccs.neu.edu>
1da177e4 47 */
1ce4f28b 48
1da177e4 49#include <linux/types.h>
1da177e4 50#include <linux/mm.h>
4fc268d2 51#include <linux/capability.h>
1da177e4
LT
52#include <linux/fcntl.h>
53#include <linux/socket.h>
54#include <linux/in.h>
55#include <linux/inet.h>
56#include <linux/netdevice.h>
57#include <linux/if_packet.h>
58#include <linux/wireless.h>
ffbc6111 59#include <linux/kernel.h>
1da177e4 60#include <linux/kmod.h>
5a0e3ad6 61#include <linux/slab.h>
0e3125c7 62#include <linux/vmalloc.h>
457c4cbc 63#include <net/net_namespace.h>
1da177e4
LT
64#include <net/ip.h>
65#include <net/protocol.h>
66#include <linux/skbuff.h>
67#include <net/sock.h>
68#include <linux/errno.h>
69#include <linux/timer.h>
7c0f6ba6 70#include <linux/uaccess.h>
1da177e4
LT
71#include <asm/ioctls.h>
72#include <asm/page.h>
a1f8e7f7 73#include <asm/cacheflush.h>
1da177e4
LT
74#include <asm/io.h>
75#include <linux/proc_fs.h>
76#include <linux/seq_file.h>
77#include <linux/poll.h>
78#include <linux/module.h>
79#include <linux/init.h>
905db440 80#include <linux/mutex.h>
05423b24 81#include <linux/if_vlan.h>
bfd5f4a3 82#include <linux/virtio_net.h>
ed85b565 83#include <linux/errqueue.h>
614f60fa 84#include <linux/net_tstamp.h>
b0138408 85#include <linux/percpu.h>
1da177e4
LT
86#ifdef CONFIG_INET
87#include <net/inet_common.h>
88#endif
47dceb8e 89#include <linux/bpf.h>
719c44d3 90#include <net/compat.h>
1da177e4 91
2787b04b
PE
92#include "internal.h"
93
1da177e4
LT
94/*
95 Assumptions:
 96 - if the device has no dev->hard_header routine, it adds and removes the ll
 97 header inside itself. In this case the ll header is invisible outside of the
 98 device, but higher levels should still reserve dev->hard_header_len.
 99 Some devices are clever enough to reallocate the skb when the header
 100 will not fit into the reserved space (tunnel); others are silly
 101 (PPP).
 102 - a packet socket receives packets with the ll header already pulled,
 103 so SOCK_RAW should push it back.
104
105On receive:
106-----------
107
108Incoming, dev->hard_header!=NULL
b0e380b1
ACM
109 mac_header -> ll header
110 data -> data
1da177e4
LT
111
112Outgoing, dev->hard_header!=NULL
b0e380b1
ACM
113 mac_header -> ll header
114 data -> ll header
1da177e4
LT
115
116Incoming, dev->hard_header==NULL
b0e380b1
ACM
 117 mac_header -> UNKNOWN position. It is very likely that it points to the ll
 118 header. PPP does this, which is wrong, because it introduces
db0c58f9 119 asymmetry between the rx and tx paths.
b0e380b1 120 data -> data
1da177e4
LT
121
122Outgoing, dev->hard_header==NULL
b0e380b1
ACM
123 mac_header -> data. ll header is still not built!
124 data -> data
1da177e4
LT
125
 126Summary
127 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
128
129
130On transmit:
131------------
132
133dev->hard_header != NULL
b0e380b1
ACM
134 mac_header -> ll header
135 data -> ll header
1da177e4
LT
136
137dev->hard_header == NULL (ll header is added by device, we cannot control it)
b0e380b1
ACM
138 mac_header -> data
139 data -> data
1da177e4
LT
140
 141 We should set nh.raw on output to the correct position,
 142 the packet classifier depends on it.
143 */
144
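/*
 * A minimal userspace sketch of the SOCK_RAW behaviour described above:
 * received frames start at the link-layer header, so a plain recv() on a
 * bound packet socket sees the full Ethernet frame. This is illustrative
 * only; it assumes an interface named "eth0", requires CAP_NET_RAW, and
 * omits error handling.
 */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <net/if.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	struct sockaddr_ll sll;
	unsigned char buf[2048];
	ssize_t n;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex = if_nametoindex("eth0");	/* assumed interface */
	bind(fd, (struct sockaddr *)&sll, sizeof(sll));

	n = recv(fd, buf, sizeof(buf), 0);	/* buf[0] is the first byte of the ll header */
	printf("received %zd bytes\n", n);
	close(fd);
	return 0;
}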
1da177e4
LT
145/* Private packet socket structures. */
146
0fb375fb
EB
147/* identical to struct packet_mreq except it has
148 * a longer address field.
149 */
40d4e3df 150struct packet_mreq_max {
0fb375fb
EB
151 int mr_ifindex;
152 unsigned short mr_type;
153 unsigned short mr_alen;
154 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 155};
a2efcfa0 156
184f489e
DB
157union tpacket_uhdr {
158 struct tpacket_hdr *h1;
159 struct tpacket2_hdr *h2;
160 struct tpacket3_hdr *h3;
161 void *raw;
162};
163
f6fb8f10 164static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f
JB
165 int closing, int tx_ring);
166
f6fb8f10 167#define V3_ALIGNMENT (8)
168
bc59ba39 169#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
f6fb8f10 170
171#define BLK_PLUS_PRIV(sz_of_priv) \
172 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
173
f6fb8f10 174#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
175#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
176#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
177#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
178#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
179#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
180#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
181
69e3c75f 182struct packet_sock;
77f65ebd
WB
183static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
184 struct packet_type *pt, struct net_device *orig_dev);
1da177e4 185
f6fb8f10 186static void *packet_previous_frame(struct packet_sock *po,
187 struct packet_ring_buffer *rb,
188 int status);
189static void packet_increment_head(struct packet_ring_buffer *buff);
878cd3ba 190static int prb_curr_blk_in_use(struct tpacket_block_desc *);
bc59ba39 191static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 192 struct packet_sock *);
bc59ba39 193static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 194 struct packet_sock *, unsigned int status);
bc59ba39 195static int prb_queue_frozen(struct tpacket_kbdq_core *);
196static void prb_open_block(struct tpacket_kbdq_core *,
197 struct tpacket_block_desc *);
17bfd8c8 198static void prb_retire_rx_blk_timer_expired(struct timer_list *);
bc59ba39 199static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
bc59ba39 200static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
201static void prb_clear_rxhash(struct tpacket_kbdq_core *,
202 struct tpacket3_hdr *);
203static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
204 struct tpacket3_hdr *);
1da177e4 205static void packet_flush_mclist(struct sock *sk);
865b03f2 206static u16 packet_pick_tx_queue(struct sk_buff *skb);
1da177e4 207
ffbc6111 208struct packet_skb_cb {
ffbc6111
HX
209 union {
210 struct sockaddr_pkt pkt;
2472d761
EB
211 union {
212 /* Trick: alias skb original length with
213 * ll.sll_family and ll.protocol in order
214 * to save room.
215 */
216 unsigned int origlen;
217 struct sockaddr_ll ll;
218 };
ffbc6111
HX
219 } sa;
220};
221
d3869efe
DW
222#define vio_le() virtio_legacy_is_little_endian()
223
ffbc6111 224#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 225
bc59ba39 226#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 227#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 228 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 229#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 230 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 231#define GET_NEXT_PRB_BLK_NUM(x) \
232 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
233 ((x)->kactive_blk_num+1) : 0)
234
dc99f600
DM
235static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
236static void __fanout_link(struct sock *sk, struct packet_sock *po);
237
d346a3fa
DB
238static int packet_direct_xmit(struct sk_buff *skb)
239{
865b03f2 240 return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
d346a3fa
DB
241}
242
66e56cd4
DB
243static struct net_device *packet_cached_dev_get(struct packet_sock *po)
244{
245 struct net_device *dev;
246
247 rcu_read_lock();
248 dev = rcu_dereference(po->cached_dev);
249 if (likely(dev))
250 dev_hold(dev);
251 rcu_read_unlock();
252
253 return dev;
254}
255
256static void packet_cached_dev_assign(struct packet_sock *po,
257 struct net_device *dev)
258{
259 rcu_assign_pointer(po->cached_dev, dev);
260}
261
262static void packet_cached_dev_reset(struct packet_sock *po)
263{
264 RCU_INIT_POINTER(po->cached_dev, NULL);
265}
266
d346a3fa
DB
267static bool packet_use_direct_xmit(const struct packet_sock *po)
268{
269 return po->xmit == packet_direct_xmit;
270}
271
865b03f2 272static u16 packet_pick_tx_queue(struct sk_buff *skb)
0fd5d57b 273{
865b03f2 274 struct net_device *dev = skb->dev;
0fd5d57b 275 const struct net_device_ops *ops = dev->netdev_ops;
b71b5837 276 int cpu = raw_smp_processor_id();
0fd5d57b
DB
277 u16 queue_index;
278
b71b5837
PA
279#ifdef CONFIG_XPS
280 skb->sender_cpu = cpu + 1;
281#endif
282 skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
0fd5d57b 283 if (ops->ndo_select_queue) {
a350ecce 284 queue_index = ops->ndo_select_queue(dev, skb, NULL);
0fd5d57b
DB
285 queue_index = netdev_cap_txqueue(dev, queue_index);
286 } else {
b71b5837 287 queue_index = netdev_pick_tx(dev, skb, NULL);
0fd5d57b
DB
288 }
289
865b03f2 290 return queue_index;
0fd5d57b
DB
291}
292
a6361f0c 293/* __register_prot_hook must be invoked through register_prot_hook
ce06b03e
DM
294 * or from a context in which asynchronous accesses to the packet
295 * socket is not possible (packet_create()).
296 */
a6361f0c 297static void __register_prot_hook(struct sock *sk)
ce06b03e
DM
298{
299 struct packet_sock *po = pkt_sk(sk);
e40526cb 300
ce06b03e 301 if (!po->running) {
66e56cd4 302 if (po->fanout)
dc99f600 303 __fanout_link(sk, po);
66e56cd4 304 else
dc99f600 305 dev_add_pack(&po->prot_hook);
e40526cb 306
ce06b03e
DM
307 sock_hold(sk);
308 po->running = 1;
309 }
310}
311
a6361f0c
WB
312static void register_prot_hook(struct sock *sk)
313{
314 lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
315 __register_prot_hook(sk);
316}
317
318/* If the sync parameter is true, we will temporarily drop
ce06b03e
DM
319 * the po->bind_lock and do a synchronize_net to make sure no
320 * asynchronous packet processing paths still refer to the elements
321 * of po->prot_hook. If the sync parameter is false, it is the
 322 * caller's responsibility to take care of this.
323 */
324static void __unregister_prot_hook(struct sock *sk, bool sync)
325{
326 struct packet_sock *po = pkt_sk(sk);
327
a6361f0c
WB
328 lockdep_assert_held_once(&po->bind_lock);
329
ce06b03e 330 po->running = 0;
66e56cd4
DB
331
332 if (po->fanout)
dc99f600 333 __fanout_unlink(sk, po);
66e56cd4 334 else
dc99f600 335 __dev_remove_pack(&po->prot_hook);
e40526cb 336
ce06b03e
DM
337 __sock_put(sk);
338
339 if (sync) {
340 spin_unlock(&po->bind_lock);
341 synchronize_net();
342 spin_lock(&po->bind_lock);
343 }
344}
345
346static void unregister_prot_hook(struct sock *sk, bool sync)
347{
348 struct packet_sock *po = pkt_sk(sk);
349
350 if (po->running)
351 __unregister_prot_hook(sk, sync);
352}
353
6e58040b 354static inline struct page * __pure pgv_to_page(void *addr)
0af55bb5
CG
355{
356 if (is_vmalloc_addr(addr))
357 return vmalloc_to_page(addr);
358 return virt_to_page(addr);
359}
360
69e3c75f 361static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 362{
184f489e 363 union tpacket_uhdr h;
1da177e4 364
69e3c75f 365 h.raw = frame;
bbd6ef87
PM
366 switch (po->tp_version) {
367 case TPACKET_V1:
69e3c75f 368 h.h1->tp_status = status;
0af55bb5 369 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
370 break;
371 case TPACKET_V2:
69e3c75f 372 h.h2->tp_status = status;
0af55bb5 373 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 374 break;
f6fb8f10 375 case TPACKET_V3:
7f953ab2
SV
376 h.h3->tp_status = status;
377 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
378 break;
69e3c75f 379 default:
f6fb8f10 380 WARN(1, "TPACKET version not supported.\n");
69e3c75f 381 BUG();
bbd6ef87 382 }
69e3c75f
JB
383
384 smp_wmb();
bbd6ef87
PM
385}
386
96f657e6 387static int __packet_get_status(const struct packet_sock *po, void *frame)
bbd6ef87 388{
184f489e 389 union tpacket_uhdr h;
bbd6ef87 390
69e3c75f
JB
391 smp_rmb();
392
bbd6ef87
PM
393 h.raw = frame;
394 switch (po->tp_version) {
395 case TPACKET_V1:
0af55bb5 396 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 397 return h.h1->tp_status;
bbd6ef87 398 case TPACKET_V2:
0af55bb5 399 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 400 return h.h2->tp_status;
f6fb8f10 401 case TPACKET_V3:
7f953ab2
SV
402 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
403 return h.h3->tp_status;
69e3c75f 404 default:
f6fb8f10 405 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
406 BUG();
407 return 0;
bbd6ef87 408 }
1da177e4 409}
69e3c75f 410
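/*
 * Rough userspace counterpart of the tp_status handshake implemented by
 * __packet_set_status()/__packet_get_status(): the kernel hands a frame to
 * userspace by setting TP_STATUS_USER, and userspace returns the slot by
 * writing TP_STATUS_KERNEL. A sketch only: it assumes an already-bound
 * packet socket, picks arbitrary ring dimensions, and omits error handling
 * and the memory barriers a production ring reader would use.
 */
#include <linux/if_packet.h>
#include <poll.h>
#include <sys/mman.h>
#include <sys/socket.h>

static void rx_ring_loop(int fd)
{
	struct tpacket_req req = {
		.tp_block_size = 4096,
		.tp_frame_size = 2048,
		.tp_block_nr   = 64,
		.tp_frame_nr   = 128,	/* (block_size / frame_size) * block_nr */
	};
	int ver = TPACKET_V2;
	unsigned int i;
	char *ring;

	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	for (i = 0; ; i = (i + 1) % req.tp_frame_nr) {
		struct tpacket2_hdr *hdr =
			(struct tpacket2_hdr *)(ring + (size_t)i * req.tp_frame_size);

		while (!(hdr->tp_status & TP_STATUS_USER)) {
			struct pollfd pfd = { .fd = fd, .events = POLLIN };

			poll(&pfd, 1, -1);
		}
		/* packet data lives at (char *)hdr + hdr->tp_mac */
		hdr->tp_status = TP_STATUS_KERNEL;	/* give the slot back */
	}
}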
b9c32fb2
DB
411static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
412 unsigned int flags)
7a51384c
DB
413{
414 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
415
68a360e8
WB
416 if (shhwtstamps &&
417 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
418 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
419 return TP_STATUS_TS_RAW_HARDWARE;
7a51384c
DB
420
421 if (ktime_to_timespec_cond(skb->tstamp, ts))
b9c32fb2 422 return TP_STATUS_TS_SOFTWARE;
7a51384c 423
b9c32fb2 424 return 0;
7a51384c
DB
425}
426
b9c32fb2
DB
427static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
428 struct sk_buff *skb)
2e31396f
WB
429{
430 union tpacket_uhdr h;
431 struct timespec ts;
b9c32fb2 432 __u32 ts_status;
2e31396f 433
b9c32fb2
DB
434 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
435 return 0;
2e31396f
WB
436
437 h.raw = frame;
438 switch (po->tp_version) {
439 case TPACKET_V1:
440 h.h1->tp_sec = ts.tv_sec;
441 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
442 break;
443 case TPACKET_V2:
444 h.h2->tp_sec = ts.tv_sec;
445 h.h2->tp_nsec = ts.tv_nsec;
446 break;
447 case TPACKET_V3:
57ea884b
DB
448 h.h3->tp_sec = ts.tv_sec;
449 h.h3->tp_nsec = ts.tv_nsec;
450 break;
2e31396f
WB
451 default:
452 WARN(1, "TPACKET version not supported.\n");
453 BUG();
454 }
455
456 /* one flush is safe, as both fields always lie on the same cacheline */
457 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
458 smp_wmb();
b9c32fb2
DB
459
460 return ts_status;
2e31396f
WB
461}
462
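/*
 * Userspace selects which of the timestamps filled in above it wants via the
 * PACKET_TIMESTAMP socket option. A sketch, assuming "fd" is a packet
 * socket and error handling is omitted:
 *
 *	int req = SOF_TIMESTAMPING_SOFTWARE | SOF_TIMESTAMPING_RAW_HARDWARE;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, &req, sizeof(req));
 *
 * The TP_STATUS_TS_* bit set in tp_status then tells userspace which clock
 * produced the tp_sec/tp_nsec values of a ring frame.
 */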
d4b5bd98
ED
463static void *packet_lookup_frame(const struct packet_sock *po,
464 const struct packet_ring_buffer *rb,
465 unsigned int position,
466 int status)
69e3c75f
JB
467{
468 unsigned int pg_vec_pos, frame_offset;
184f489e 469 union tpacket_uhdr h;
69e3c75f
JB
470
471 pg_vec_pos = position / rb->frames_per_block;
472 frame_offset = position % rb->frames_per_block;
473
0e3125c7
NH
474 h.raw = rb->pg_vec[pg_vec_pos].buffer +
475 (frame_offset * rb->frame_size);
69e3c75f
JB
476
477 if (status != __packet_get_status(po, h.raw))
478 return NULL;
479
480 return h.raw;
481}
482
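/*
 * Worked example of the lookup arithmetic above (illustrative values): with
 * tp_block_size = 4096 and tp_frame_size = 2048, frames_per_block is 2, so
 * frame position 5 maps to pg_vec_pos = 5 / 2 = 2 (the third block) and
 * frame_offset = 5 % 2 = 1, i.e. byte offset 1 * 2048 = 2048 into that
 * block's buffer.
 */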
eea49cc9 483static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
484 struct packet_ring_buffer *rb,
485 int status)
486{
487 return packet_lookup_frame(po, rb, rb->head, status);
488}
489
bc59ba39 490static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 491{
492 del_timer_sync(&pkc->retire_blk_timer);
493}
494
495static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
f6fb8f10 496 struct sk_buff_head *rb_queue)
497{
bc59ba39 498 struct tpacket_kbdq_core *pkc;
f6fb8f10 499
73d0fcf2 500 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 501
ec6f809f 502 spin_lock_bh(&rb_queue->lock);
f6fb8f10 503 pkc->delete_blk_timer = 1;
ec6f809f 504 spin_unlock_bh(&rb_queue->lock);
f6fb8f10 505
506 prb_del_retire_blk_timer(pkc);
507}
508
e8e85cc5 509static void prb_setup_retire_blk_timer(struct packet_sock *po)
f6fb8f10 510{
bc59ba39 511 struct tpacket_kbdq_core *pkc;
f6fb8f10 512
e8e85cc5 513 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
17bfd8c8
KC
514 timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
515 0);
516 pkc->retire_blk_timer.expires = jiffies;
f6fb8f10 517}
518
519static int prb_calc_retire_blk_tmo(struct packet_sock *po,
520 int blk_size_in_bytes)
521{
522 struct net_device *dev;
523 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
7cad1bac 524 struct ethtool_link_ksettings ecmd;
4bc71cb9 525 int err;
f6fb8f10 526
4bc71cb9
JP
527 rtnl_lock();
528 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
529 if (unlikely(!dev)) {
530 rtnl_unlock();
f6fb8f10 531 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9 532 }
7cad1bac 533 err = __ethtool_get_link_ksettings(dev, &ecmd);
4bc71cb9
JP
534 rtnl_unlock();
535 if (!err) {
4bc71cb9
JP
536 /*
 537 * If the link speed is so slow, you don't really
 538 * need to worry about perf anyway
539 */
7cad1bac
DD
540 if (ecmd.base.speed < SPEED_1000 ||
541 ecmd.base.speed == SPEED_UNKNOWN) {
4bc71cb9 542 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 543 } else {
544 msec = 1;
7cad1bac 545 div = ecmd.base.speed / 1000;
f6fb8f10 546 }
fcfcfe0b
MW
547 } else
548 return DEFAULT_PRB_RETIRE_TOV;
f6fb8f10 549
550 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
551
552 if (div)
553 mbits /= div;
554
555 tmo = mbits * msec;
556
557 if (div)
558 return tmo+1;
559 return tmo;
560}
561
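/*
 * Worked example for prb_calc_retire_blk_tmo() (illustrative numbers): with
 * a 1 MiB block on a 1 Gbit/s link, mbits = (1048576 * 8) / (1024 * 1024) = 8,
 * div = 1000 / 1000 = 1 and msec = 1, so tmo = 8 and the function returns
 * 9 ms, consistent with the "~8 ms to fill a block" estimate in the timer
 * comment below.
 */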
bc59ba39 562static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 563 union tpacket_req_u *req_u)
564{
565 p1->feature_req_word = req_u->req3.tp_feature_req_word;
566}
567
568static void init_prb_bdqc(struct packet_sock *po,
569 struct packet_ring_buffer *rb,
570 struct pgv *pg_vec,
e8e85cc5 571 union tpacket_req_u *req_u)
f6fb8f10 572{
22781a5b 573 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
bc59ba39 574 struct tpacket_block_desc *pbd;
f6fb8f10 575
576 memset(p1, 0x0, sizeof(*p1));
577
578 p1->knxt_seq_num = 1;
579 p1->pkbdq = pg_vec;
bc59ba39 580 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 581 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 582 p1->kblk_size = req_u->req3.tp_block_size;
583 p1->knum_blocks = req_u->req3.tp_block_nr;
584 p1->hdrlen = po->tp_hdrlen;
585 p1->version = po->tp_version;
586 p1->last_kactive_blk_num = 0;
ee80fbf3 587 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 588 if (req_u->req3.tp_retire_blk_tov)
589 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
590 else
591 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
592 req_u->req3.tp_block_size);
593 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
594 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
595
dc808110 596 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
f6fb8f10 597 prb_init_ft_ops(p1, req_u);
e8e85cc5 598 prb_setup_retire_blk_timer(po);
f6fb8f10 599 prb_open_block(p1, pbd);
600}
601
602/* Do NOT update the last_blk_num first.
603 * Assumes sk_buff_head lock is held.
604 */
bc59ba39 605static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 606{
607 mod_timer(&pkc->retire_blk_timer,
608 jiffies + pkc->tov_in_jiffies);
609 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
610}
611
612/*
613 * Timer logic:
614 * 1) We refresh the timer only when we open a block.
615 * By doing this we don't waste cycles refreshing the timer
 616 * on a packet-by-packet basis.
617 *
618 * With a 1MB block-size, on a 1Gbps line, it will take
619 * i) ~8 ms to fill a block + ii) memcpy etc.
620 * In this cut we are not accounting for the memcpy time.
621 *
622 * So, if the user sets the 'tmo' to 10ms then the timer
623 * will never fire while the block is still getting filled
624 * (which is what we want). However, the user could choose
625 * to close a block early and that's fine.
626 *
627 * But when the timer does fire, we check whether or not to refresh it.
628 * Since the tmo granularity is in msecs, it is not too expensive
 629 * to refresh the timer, let's say every '8' msecs.
630 * Either the user can set the 'tmo' or we can derive it based on
631 * a) line-speed and b) block-size.
632 * prb_calc_retire_blk_tmo() calculates the tmo.
633 *
634 */
17bfd8c8 635static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
f6fb8f10 636{
17bfd8c8
KC
637 struct packet_sock *po =
638 from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
22781a5b 639 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 640 unsigned int frozen;
bc59ba39 641 struct tpacket_block_desc *pbd;
f6fb8f10 642
643 spin_lock(&po->sk.sk_receive_queue.lock);
644
645 frozen = prb_queue_frozen(pkc);
646 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
647
648 if (unlikely(pkc->delete_blk_timer))
649 goto out;
650
651 /* We only need to plug the race when the block is partially filled.
652 * tpacket_rcv:
653 * lock(); increment BLOCK_NUM_PKTS; unlock()
654 * copy_bits() is in progress ...
655 * timer fires on other cpu:
656 * we can't retire the current block because copy_bits
657 * is in progress.
658 *
659 */
660 if (BLOCK_NUM_PKTS(pbd)) {
661 while (atomic_read(&pkc->blk_fill_in_prog)) {
662 /* Waiting for skb_copy_bits to finish... */
663 cpu_relax();
664 }
665 }
666
667 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
668 if (!frozen) {
41a50d62
AD
669 if (!BLOCK_NUM_PKTS(pbd)) {
670 /* An empty block. Just refresh the timer. */
671 goto refresh_timer;
672 }
f6fb8f10 673 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
674 if (!prb_dispatch_next_block(pkc, po))
675 goto refresh_timer;
676 else
677 goto out;
678 } else {
679 /* Case 1. Queue was frozen because user-space was
680 * lagging behind.
681 */
878cd3ba 682 if (prb_curr_blk_in_use(pbd)) {
f6fb8f10 683 /*
684 * Ok, user-space is still behind.
685 * So just refresh the timer.
686 */
687 goto refresh_timer;
688 } else {
 689 /* Case 2. The queue was frozen, user-space caught up,
 690 * now the link went idle && the timer fired.
 691 * We don't have a block to close. So we open this
 692 * block and restart the timer.
 693 * Opening a block thaws the queue and restarts the timer;
 694 * thawing/timer-refresh is a side effect.
695 */
696 prb_open_block(pkc, pbd);
697 goto out;
698 }
699 }
700 }
701
702refresh_timer:
703 _prb_refresh_rx_retire_blk_timer(pkc);
704
705out:
706 spin_unlock(&po->sk.sk_receive_queue.lock);
707}
708
eea49cc9 709static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 710 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 711{
712 /* Flush everything minus the block header */
713
714#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
715 u8 *start, *end;
716
717 start = (u8 *)pbd1;
718
 719 /* Skip the block header (we know the header WILL fit in 4K) */
720 start += PAGE_SIZE;
721
722 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
723 for (; start < end; start += PAGE_SIZE)
724 flush_dcache_page(pgv_to_page(start));
725
726 smp_wmb();
727#endif
728
729 /* Now update the block status. */
730
731 BLOCK_STATUS(pbd1) = status;
732
733 /* Flush the block header */
734
735#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
736 start = (u8 *)pbd1;
737 flush_dcache_page(pgv_to_page(start));
738
739 smp_wmb();
740#endif
741}
742
743/*
744 * Side effect:
745 *
746 * 1) flush the block
747 * 2) Increment active_blk_num
748 *
 749 * Note: We DON'T refresh the timer on purpose,
 750 * because almost always the next block will be opened.
751 */
bc59ba39 752static void prb_close_block(struct tpacket_kbdq_core *pkc1,
753 struct tpacket_block_desc *pbd1,
f6fb8f10 754 struct packet_sock *po, unsigned int stat)
755{
756 __u32 status = TP_STATUS_USER | stat;
757
758 struct tpacket3_hdr *last_pkt;
bc59ba39 759 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
da413eec 760 struct sock *sk = &po->sk;
f6fb8f10 761
8e8e2951 762 if (atomic_read(&po->tp_drops))
f6fb8f10 763 status |= TP_STATUS_LOSING;
764
765 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
766 last_pkt->tp_next_offset = 0;
767
768 /* Get the ts of the last pkt */
769 if (BLOCK_NUM_PKTS(pbd1)) {
770 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
771 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
772 } else {
41a50d62
AD
773 /* Ok, we tmo'd - so get the current time.
774 *
775 * It shouldn't really happen as we don't close empty
776 * blocks. See prb_retire_rx_blk_timer_expired().
777 */
f6fb8f10 778 struct timespec ts;
779 getnstimeofday(&ts);
780 h1->ts_last_pkt.ts_sec = ts.tv_sec;
781 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
782 }
783
784 smp_wmb();
785
786 /* Flush the block */
787 prb_flush_block(pkc1, pbd1, status);
788
da413eec
DC
789 sk->sk_data_ready(sk);
790
f6fb8f10 791 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
792}
793
eea49cc9 794static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 795{
796 pkc->reset_pending_on_curr_blk = 0;
797}
798
799/*
800 * Side effect of opening a block:
801 *
802 * 1) prb_queue is thawed.
803 * 2) retire_blk_timer is refreshed.
804 *
805 */
bc59ba39 806static void prb_open_block(struct tpacket_kbdq_core *pkc1,
807 struct tpacket_block_desc *pbd1)
f6fb8f10 808{
809 struct timespec ts;
bc59ba39 810 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 811
812 smp_rmb();
813
8da3056c
DB
814 /* We could have just memset this but we will lose the
815 * flexibility of making the priv area sticky
816 */
f6fb8f10 817
8da3056c
DB
818 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
819 BLOCK_NUM_PKTS(pbd1) = 0;
820 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 821
8da3056c
DB
822 getnstimeofday(&ts);
823
824 h1->ts_first_pkt.ts_sec = ts.tv_sec;
825 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
f6fb8f10 826
8da3056c
DB
827 pkc1->pkblk_start = (char *)pbd1;
828 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
829
830 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
831 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
832
833 pbd1->version = pkc1->version;
834 pkc1->prev = pkc1->nxt_offset;
835 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
836
837 prb_thaw_queue(pkc1);
838 _prb_refresh_rx_retire_blk_timer(pkc1);
839
840 smp_wmb();
f6fb8f10 841}
842
843/*
844 * Queue freeze logic:
845 * 1) Assume tp_block_nr = 8 blocks.
846 * 2) At time 't0', user opens Rx ring.
847 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
848 * 4) user-space is either sleeping or processing block '0'.
849 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 850 * it will close block-7, loop around and try to fill block '0'.
851 * call-flow:
852 * __packet_lookup_frame_in_block
853 * prb_retire_current_block()
854 * prb_dispatch_next_block()
855 * |->(BLOCK_STATUS == USER) evaluates to true
856 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
857 * 6) Now there are two cases:
858 * 6.1) Link goes idle right after the queue is frozen.
859 * But remember, the last open_block() refreshed the timer.
 860 * When this timer expires, it will refresh itself so that we can
 861 * re-open block-0 in the near future.
862 * 6.2) Link is busy and keeps on receiving packets. This is a simple
863 * case and __packet_lookup_frame_in_block will check if block-0
864 * is free and can now be re-used.
865 */
eea49cc9 866static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 867 struct packet_sock *po)
868{
869 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 870 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 871}
872
873#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
874
875/*
876 * If the next block is free then we will dispatch it
877 * and return a good offset.
878 * Else, we will freeze the queue.
879 * So, caller must check the return value.
880 */
bc59ba39 881static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 882 struct packet_sock *po)
883{
bc59ba39 884 struct tpacket_block_desc *pbd;
f6fb8f10 885
886 smp_rmb();
887
888 /* 1. Get current block num */
889 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
890
891 /* 2. If this block is currently in_use then freeze the queue */
892 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
893 prb_freeze_queue(pkc, po);
894 return NULL;
895 }
896
897 /*
898 * 3.
899 * open this block and return the offset where the first packet
900 * needs to get stored.
901 */
902 prb_open_block(pkc, pbd);
903 return (void *)pkc->nxt_offset;
904}
905
bc59ba39 906static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 907 struct packet_sock *po, unsigned int status)
908{
bc59ba39 909 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 910
911 /* retire/close the current block */
912 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
913 /*
914 * Plug the case where copy_bits() is in progress on
915 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
916 * have space to copy the pkt in the current block and
917 * called prb_retire_current_block()
918 *
919 * We don't need to worry about the TMO case because
920 * the timer-handler already handled this case.
921 */
922 if (!(status & TP_STATUS_BLK_TMO)) {
923 while (atomic_read(&pkc->blk_fill_in_prog)) {
924 /* Waiting for skb_copy_bits to finish... */
925 cpu_relax();
926 }
927 }
928 prb_close_block(pkc, pbd, po, status);
929 return;
930 }
f6fb8f10 931}
932
878cd3ba 933static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
f6fb8f10 934{
935 return TP_STATUS_USER & BLOCK_STATUS(pbd);
936}
937
eea49cc9 938static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 939{
940 return pkc->reset_pending_on_curr_blk;
941}
942
eea49cc9 943static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 944{
bc59ba39 945 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 946 atomic_dec(&pkc->blk_fill_in_prog);
947}
948
eea49cc9 949static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 950 struct tpacket3_hdr *ppd)
951{
3958afa1 952 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
f6fb8f10 953}
954
eea49cc9 955static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 956 struct tpacket3_hdr *ppd)
957{
958 ppd->hv1.tp_rxhash = 0;
959}
960
eea49cc9 961static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 962 struct tpacket3_hdr *ppd)
963{
df8a39de
JP
964 if (skb_vlan_tag_present(pkc->skb)) {
965 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
a0cdfcf3
AW
966 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
967 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
f6fb8f10 968 } else {
9e67030a 969 ppd->hv1.tp_vlan_tci = 0;
a0cdfcf3 970 ppd->hv1.tp_vlan_tpid = 0;
9e67030a 971 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 972 }
973}
974
bc59ba39 975static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 976 struct tpacket3_hdr *ppd)
977{
a0cdfcf3 978 ppd->hv1.tp_padding = 0;
f6fb8f10 979 prb_fill_vlan_info(pkc, ppd);
980
981 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
982 prb_fill_rxhash(pkc, ppd);
983 else
984 prb_clear_rxhash(pkc, ppd);
985}
986
eea49cc9 987static void prb_fill_curr_block(char *curr,
bc59ba39 988 struct tpacket_kbdq_core *pkc,
989 struct tpacket_block_desc *pbd,
f6fb8f10 990 unsigned int len)
991{
992 struct tpacket3_hdr *ppd;
993
994 ppd = (struct tpacket3_hdr *)curr;
995 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
996 pkc->prev = curr;
997 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
998 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
999 BLOCK_NUM_PKTS(pbd) += 1;
1000 atomic_inc(&pkc->blk_fill_in_prog);
1001 prb_run_all_ft_ops(pkc, ppd);
1002}
1003
1004/* Assumes caller has the sk->rx_queue.lock */
1005static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1006 struct sk_buff *skb,
f6fb8f10 1007 unsigned int len
1008 )
1009{
bc59ba39 1010 struct tpacket_kbdq_core *pkc;
1011 struct tpacket_block_desc *pbd;
f6fb8f10 1012 char *curr, *end;
1013
e3192690 1014 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 1015 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1016
1017 /* Queue is frozen when user space is lagging behind */
1018 if (prb_queue_frozen(pkc)) {
1019 /*
 1020 * Check if that last block which caused the queue to freeze
1021 * is still in_use by user-space.
1022 */
878cd3ba 1023 if (prb_curr_blk_in_use(pbd)) {
f6fb8f10 1024 /* Can't record this packet */
1025 return NULL;
1026 } else {
1027 /*
1028 * Ok, the block was released by user-space.
1029 * Now let's open that block.
1030 * opening a block also thaws the queue.
1031 * Thawing is a side effect.
1032 */
1033 prb_open_block(pkc, pbd);
1034 }
1035 }
1036
1037 smp_mb();
1038 curr = pkc->nxt_offset;
1039 pkc->skb = skb;
e3192690 1040 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 1041
1042 /* first try the current block */
1043 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1044 prb_fill_curr_block(curr, pkc, pbd, len);
1045 return (void *)curr;
1046 }
1047
1048 /* Ok, close the current block */
1049 prb_retire_current_block(pkc, po, 0);
1050
1051 /* Now, try to dispatch the next block */
1052 curr = (char *)prb_dispatch_next_block(pkc, po);
1053 if (curr) {
1054 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1055 prb_fill_curr_block(curr, pkc, pbd, len);
1056 return (void *)curr;
1057 }
1058
1059 /*
 1060 * No free blocks are available. user_space hasn't caught up yet.
1061 * Queue was just frozen and now this packet will get dropped.
1062 */
1063 return NULL;
1064}
1065
eea49cc9 1066static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1067 struct sk_buff *skb,
1068 int status, unsigned int len)
1069{
1070 char *curr = NULL;
1071 switch (po->tp_version) {
1072 case TPACKET_V1:
1073 case TPACKET_V2:
1074 curr = packet_lookup_frame(po, &po->rx_ring,
1075 po->rx_ring.head, status);
1076 return curr;
1077 case TPACKET_V3:
46088059 1078 return __packet_lookup_frame_in_block(po, skb, len);
f6fb8f10 1079 default:
1080 WARN(1, "TPACKET version not supported\n");
1081 BUG();
99aa3473 1082 return NULL;
f6fb8f10 1083 }
1084}
1085
dcf70cef
ED
1086static void *prb_lookup_block(const struct packet_sock *po,
1087 const struct packet_ring_buffer *rb,
1088 unsigned int idx,
1089 int status)
f6fb8f10 1090{
bc59ba39 1091 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1092 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1093
1094 if (status != BLOCK_STATUS(pbd))
1095 return NULL;
1096 return pbd;
1097}
1098
eea49cc9 1099static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1100{
1101 unsigned int prev;
1102 if (rb->prb_bdqc.kactive_blk_num)
1103 prev = rb->prb_bdqc.kactive_blk_num-1;
1104 else
1105 prev = rb->prb_bdqc.knum_blocks-1;
1106 return prev;
1107}
1108
1109/* Assumes caller has held the rx_queue.lock */
eea49cc9 1110static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1111 struct packet_ring_buffer *rb,
1112 int status)
1113{
1114 unsigned int previous = prb_previous_blk_num(rb);
1115 return prb_lookup_block(po, rb, previous, status);
1116}
1117
eea49cc9 1118static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1119 struct packet_ring_buffer *rb,
1120 int status)
1121{
1122 if (po->tp_version <= TPACKET_V2)
1123 return packet_previous_frame(po, rb, status);
1124
1125 return __prb_previous_block(po, rb, status);
1126}
1127
eea49cc9 1128static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1129 struct packet_ring_buffer *rb)
1130{
1131 switch (po->tp_version) {
1132 case TPACKET_V1:
1133 case TPACKET_V2:
1134 return packet_increment_head(rb);
1135 case TPACKET_V3:
1136 default:
1137 WARN(1, "TPACKET version not supported.\n");
1138 BUG();
1139 return;
1140 }
1141}
1142
eea49cc9 1143static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1144 struct packet_ring_buffer *rb,
1145 int status)
1146{
1147 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1148 return packet_lookup_frame(po, rb, previous, status);
1149}
1150
eea49cc9 1151static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1152{
1153 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1154}
1155
b0138408
DB
1156static void packet_inc_pending(struct packet_ring_buffer *rb)
1157{
1158 this_cpu_inc(*rb->pending_refcnt);
1159}
1160
1161static void packet_dec_pending(struct packet_ring_buffer *rb)
1162{
1163 this_cpu_dec(*rb->pending_refcnt);
1164}
1165
1166static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1167{
1168 unsigned int refcnt = 0;
1169 int cpu;
1170
1171 /* We don't use pending refcount in rx_ring. */
1172 if (rb->pending_refcnt == NULL)
1173 return 0;
1174
1175 for_each_possible_cpu(cpu)
1176 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1177
1178 return refcnt;
1179}
1180
1181static int packet_alloc_pending(struct packet_sock *po)
1182{
1183 po->rx_ring.pending_refcnt = NULL;
1184
1185 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1186 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1187 return -ENOBUFS;
1188
1189 return 0;
1190}
1191
1192static void packet_free_pending(struct packet_sock *po)
1193{
1194 free_percpu(po->tx_ring.pending_refcnt);
1195}
1196
9954729b
WB
1197#define ROOM_POW_OFF 2
1198#define ROOM_NONE 0x0
1199#define ROOM_LOW 0x1
1200#define ROOM_NORMAL 0x2
1201
d4b5bd98 1202static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
77f65ebd 1203{
9954729b
WB
1204 int idx, len;
1205
d4b5bd98
ED
1206 len = READ_ONCE(po->rx_ring.frame_max) + 1;
1207 idx = READ_ONCE(po->rx_ring.head);
9954729b
WB
1208 if (pow_off)
1209 idx += len >> pow_off;
1210 if (idx >= len)
1211 idx -= len;
1212 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1213}
1214
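/*
 * In other words (illustrative numbers): with a 128-frame ring and
 * ROOM_POW_OFF = 2, __tpacket_has_room() probes the slot 128 >> 2 = 32
 * frames ahead of rb->head; if that slot is still TP_STATUS_KERNEL, roughly
 * a quarter of the ring is still available and the caller reports
 * ROOM_NORMAL.
 */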
dcf70cef 1215static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
9954729b
WB
1216{
1217 int idx, len;
1218
dcf70cef
ED
1219 len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
1220 idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
9954729b
WB
1221 if (pow_off)
1222 idx += len >> pow_off;
1223 if (idx >= len)
1224 idx -= len;
1225 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1226}
77f65ebd 1227
0338a145
ED
1228static int __packet_rcv_has_room(const struct packet_sock *po,
1229 const struct sk_buff *skb)
9954729b 1230{
0338a145 1231 const struct sock *sk = &po->sk;
9954729b
WB
1232 int ret = ROOM_NONE;
1233
1234 if (po->prot_hook.func != tpacket_rcv) {
0338a145
ED
1235 int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
1236 int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
1237 - (skb ? skb->truesize : 0);
1238
1239 if (avail > (rcvbuf >> ROOM_POW_OFF))
9954729b
WB
1240 return ROOM_NORMAL;
1241 else if (avail > 0)
1242 return ROOM_LOW;
1243 else
1244 return ROOM_NONE;
1245 }
77f65ebd 1246
9954729b
WB
1247 if (po->tp_version == TPACKET_V3) {
1248 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1249 ret = ROOM_NORMAL;
1250 else if (__tpacket_v3_has_room(po, 0))
1251 ret = ROOM_LOW;
1252 } else {
1253 if (__tpacket_has_room(po, ROOM_POW_OFF))
1254 ret = ROOM_NORMAL;
1255 else if (__tpacket_has_room(po, 0))
1256 ret = ROOM_LOW;
1257 }
2ccdbaa6
WB
1258
1259 return ret;
1260}
1261
1262static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1263{
3a2bb84e 1264 int pressure, ret;
2ccdbaa6 1265
54d7c01d 1266 ret = __packet_rcv_has_room(po, skb);
3a2bb84e
ED
1267 pressure = ret != ROOM_NORMAL;
1268
1269 if (READ_ONCE(po->pressure) != pressure)
1270 WRITE_ONCE(po->pressure, pressure);
77f65ebd 1271
9954729b 1272 return ret;
77f65ebd
WB
1273}
1274
9bb6cd65
ED
1275static void packet_rcv_try_clear_pressure(struct packet_sock *po)
1276{
1277 if (READ_ONCE(po->pressure) &&
1278 __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
1279 WRITE_ONCE(po->pressure, 0);
1280}
1281
1da177e4
LT
1282static void packet_sock_destruct(struct sock *sk)
1283{
ed85b565
RC
1284 skb_queue_purge(&sk->sk_error_queue);
1285
547b792c 1286 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
14afee4b 1287 WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1da177e4
LT
1288
1289 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1290 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1291 return;
1292 }
1293
17ab56a2 1294 sk_refcnt_debug_dec(sk);
1da177e4
LT
1295}
1296
3b3a5b0a
WB
1297static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1298{
f6cec329
ED
1299 u32 *history = po->rollover->history;
1300 u32 victim, rxhash;
3b3a5b0a
WB
1301 int i, count = 0;
1302
1303 rxhash = skb_get_hash(skb);
1304 for (i = 0; i < ROLLOVER_HLEN; i++)
f6cec329 1305 if (READ_ONCE(history[i]) == rxhash)
3b3a5b0a
WB
1306 count++;
1307
f6cec329
ED
1308 victim = prandom_u32() % ROLLOVER_HLEN;
1309
1310 /* Avoid dirtying the cache line if possible */
1311 if (READ_ONCE(history[victim]) != rxhash)
1312 WRITE_ONCE(history[victim], rxhash);
1313
3b3a5b0a
WB
1314 return count > (ROLLOVER_HLEN >> 1);
1315}
1316
77f65ebd
WB
1317static unsigned int fanout_demux_hash(struct packet_fanout *f,
1318 struct sk_buff *skb,
1319 unsigned int num)
dc99f600 1320{
eb70db87 1321 return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
dc99f600
DM
1322}
1323
77f65ebd
WB
1324static unsigned int fanout_demux_lb(struct packet_fanout *f,
1325 struct sk_buff *skb,
1326 unsigned int num)
dc99f600 1327{
468479e6 1328 unsigned int val = atomic_inc_return(&f->rr_cur);
dc99f600 1329
468479e6 1330 return val % num;
77f65ebd
WB
1331}
1332
1333static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1334 struct sk_buff *skb,
1335 unsigned int num)
1336{
1337 return smp_processor_id() % num;
dc99f600
DM
1338}
1339
5df0ddfb
DB
1340static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1341 struct sk_buff *skb,
1342 unsigned int num)
1343{
f337db64 1344 return prandom_u32_max(num);
5df0ddfb
DB
1345}
1346
77f65ebd
WB
1347static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1348 struct sk_buff *skb,
ad377cab 1349 unsigned int idx, bool try_self,
77f65ebd 1350 unsigned int num)
95ec3eb4 1351{
4633c9e0 1352 struct packet_sock *po, *po_next, *po_skip = NULL;
a9b63918 1353 unsigned int i, j, room = ROOM_NONE;
95ec3eb4 1354
0648ab70 1355 po = pkt_sk(f->arr[idx]);
3b3a5b0a
WB
1356
1357 if (try_self) {
1358 room = packet_rcv_has_room(po, skb);
1359 if (room == ROOM_NORMAL ||
1360 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1361 return idx;
4633c9e0 1362 po_skip = po;
3b3a5b0a 1363 }
ad377cab 1364
0648ab70 1365 i = j = min_t(int, po->rollover->sock, num - 1);
77f65ebd 1366 do {
2ccdbaa6 1367 po_next = pkt_sk(f->arr[i]);
3a2bb84e 1368 if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
2ccdbaa6 1369 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
77f65ebd 1370 if (i != j)
0648ab70 1371 po->rollover->sock = i;
a9b63918
WB
1372 atomic_long_inc(&po->rollover->num);
1373 if (room == ROOM_LOW)
1374 atomic_long_inc(&po->rollover->num_huge);
77f65ebd
WB
1375 return i;
1376 }
ad377cab 1377
77f65ebd
WB
1378 if (++i == num)
1379 i = 0;
1380 } while (i != j);
1381
a9b63918 1382 atomic_long_inc(&po->rollover->num_failed);
77f65ebd
WB
1383 return idx;
1384}
1385
2d36097d
NH
1386static unsigned int fanout_demux_qm(struct packet_fanout *f,
1387 struct sk_buff *skb,
1388 unsigned int num)
1389{
1390 return skb_get_queue_mapping(skb) % num;
1391}
1392
47dceb8e
WB
1393static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1394 struct sk_buff *skb,
1395 unsigned int num)
1396{
1397 struct bpf_prog *prog;
1398 unsigned int ret = 0;
1399
1400 rcu_read_lock();
1401 prog = rcu_dereference(f->bpf_prog);
1402 if (prog)
ff936a04 1403 ret = bpf_prog_run_clear_cb(prog, skb) % num;
47dceb8e
WB
1404 rcu_read_unlock();
1405
1406 return ret;
1407}
1408
77f65ebd
WB
1409static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1410{
1411 return f->flags & (flag >> 8);
95ec3eb4
DM
1412}
1413
95ec3eb4
DM
1414static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1415 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1416{
1417 struct packet_fanout *f = pt->af_packet_priv;
f98f4514 1418 unsigned int num = READ_ONCE(f->num_members);
19bcf9f2 1419 struct net *net = read_pnet(&f->net);
dc99f600 1420 struct packet_sock *po;
77f65ebd 1421 unsigned int idx;
dc99f600 1422
19bcf9f2 1423 if (!net_eq(dev_net(dev), net) || !num) {
dc99f600
DM
1424 kfree_skb(skb);
1425 return 0;
1426 }
1427
3f34b24a 1428 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
19bcf9f2 1429 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
3f34b24a
AD
1430 if (!skb)
1431 return 0;
1432 }
95ec3eb4
DM
1433 switch (f->type) {
1434 case PACKET_FANOUT_HASH:
1435 default:
77f65ebd 1436 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1437 break;
1438 case PACKET_FANOUT_LB:
77f65ebd 1439 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1440 break;
1441 case PACKET_FANOUT_CPU:
77f65ebd
WB
1442 idx = fanout_demux_cpu(f, skb, num);
1443 break;
5df0ddfb
DB
1444 case PACKET_FANOUT_RND:
1445 idx = fanout_demux_rnd(f, skb, num);
1446 break;
2d36097d
NH
1447 case PACKET_FANOUT_QM:
1448 idx = fanout_demux_qm(f, skb, num);
1449 break;
77f65ebd 1450 case PACKET_FANOUT_ROLLOVER:
ad377cab 1451 idx = fanout_demux_rollover(f, skb, 0, false, num);
95ec3eb4 1452 break;
47dceb8e 1453 case PACKET_FANOUT_CBPF:
f2e52095 1454 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1455 idx = fanout_demux_bpf(f, skb, num);
1456 break;
dc99f600
DM
1457 }
1458
ad377cab
WB
1459 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1460 idx = fanout_demux_rollover(f, skb, idx, true, num);
dc99f600 1461
ad377cab 1462 po = pkt_sk(f->arr[idx]);
dc99f600
DM
1463 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1464}
1465
fff3321d
PE
1466DEFINE_MUTEX(fanout_mutex);
1467EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600 1468static LIST_HEAD(fanout_list);
4a69a864 1469static u16 fanout_next_id;
dc99f600
DM
1470
1471static void __fanout_link(struct sock *sk, struct packet_sock *po)
1472{
1473 struct packet_fanout *f = po->fanout;
1474
1475 spin_lock(&f->lock);
1476 f->arr[f->num_members] = sk;
1477 smp_wmb();
1478 f->num_members++;
2bd624b4
AS
1479 if (f->num_members == 1)
1480 dev_add_pack(&f->prot_hook);
dc99f600
DM
1481 spin_unlock(&f->lock);
1482}
1483
1484static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1485{
1486 struct packet_fanout *f = po->fanout;
1487 int i;
1488
1489 spin_lock(&f->lock);
1490 for (i = 0; i < f->num_members; i++) {
1491 if (f->arr[i] == sk)
1492 break;
1493 }
1494 BUG_ON(i >= f->num_members);
1495 f->arr[i] = f->arr[f->num_members - 1];
1496 f->num_members--;
2bd624b4
AS
1497 if (f->num_members == 0)
1498 __dev_remove_pack(&f->prot_hook);
dc99f600
DM
1499 spin_unlock(&f->lock);
1500}
1501
d4dd8aee 1502static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
c0de08d0 1503{
161642e2
ED
1504 if (sk->sk_family != PF_PACKET)
1505 return false;
c0de08d0 1506
161642e2 1507 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
c0de08d0
EL
1508}
1509
47dceb8e
WB
1510static void fanout_init_data(struct packet_fanout *f)
1511{
1512 switch (f->type) {
1513 case PACKET_FANOUT_LB:
1514 atomic_set(&f->rr_cur, 0);
1515 break;
1516 case PACKET_FANOUT_CBPF:
f2e52095 1517 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1518 RCU_INIT_POINTER(f->bpf_prog, NULL);
1519 break;
1520 }
1521}
1522
1523static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1524{
1525 struct bpf_prog *old;
1526
1527 spin_lock(&f->lock);
1528 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1529 rcu_assign_pointer(f->bpf_prog, new);
1530 spin_unlock(&f->lock);
1531
1532 if (old) {
1533 synchronize_net();
1534 bpf_prog_destroy(old);
1535 }
1536}
1537
1538static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1539 unsigned int len)
1540{
1541 struct bpf_prog *new;
1542 struct sock_fprog fprog;
1543 int ret;
1544
1545 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1546 return -EPERM;
1547 if (len != sizeof(fprog))
1548 return -EINVAL;
1549 if (copy_from_user(&fprog, data, len))
1550 return -EFAULT;
1551
bab18991 1552 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
47dceb8e
WB
1553 if (ret)
1554 return ret;
1555
1556 __fanout_set_data_bpf(po->fanout, new);
1557 return 0;
1558}
1559
f2e52095
WB
1560static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1561 unsigned int len)
1562{
1563 struct bpf_prog *new;
1564 u32 fd;
1565
1566 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1567 return -EPERM;
1568 if (len != sizeof(fd))
1569 return -EINVAL;
1570 if (copy_from_user(&fd, data, len))
1571 return -EFAULT;
1572
113214be 1573 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
f2e52095
WB
1574 if (IS_ERR(new))
1575 return PTR_ERR(new);
f2e52095
WB
1576
1577 __fanout_set_data_bpf(po->fanout, new);
1578 return 0;
1579}
1580
47dceb8e
WB
1581static int fanout_set_data(struct packet_sock *po, char __user *data,
1582 unsigned int len)
1583{
1584 switch (po->fanout->type) {
1585 case PACKET_FANOUT_CBPF:
1586 return fanout_set_data_cbpf(po, data, len);
f2e52095
WB
1587 case PACKET_FANOUT_EBPF:
1588 return fanout_set_data_ebpf(po, data, len);
47dceb8e
WB
1589 default:
1590 return -EINVAL;
07d53ae4 1591 }
47dceb8e
WB
1592}
1593
1594static void fanout_release_data(struct packet_fanout *f)
1595{
1596 switch (f->type) {
1597 case PACKET_FANOUT_CBPF:
f2e52095 1598 case PACKET_FANOUT_EBPF:
47dceb8e 1599 __fanout_set_data_bpf(f, NULL);
07d53ae4 1600 }
47dceb8e
WB
1601}
1602
4a69a864
MM
1603static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1604{
1605 struct packet_fanout *f;
1606
1607 list_for_each_entry(f, &fanout_list, list) {
1608 if (f->id == candidate_id &&
1609 read_pnet(&f->net) == sock_net(sk)) {
1610 return false;
1611 }
1612 }
1613 return true;
1614}
1615
1616static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1617{
1618 u16 id = fanout_next_id;
1619
1620 do {
1621 if (__fanout_id_is_free(sk, id)) {
1622 *new_id = id;
1623 fanout_next_id = id + 1;
1624 return true;
1625 }
1626
1627 id++;
1628 } while (id != fanout_next_id);
1629
1630 return false;
1631}
1632
7736d33f 1633static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600 1634{
d199fab6 1635 struct packet_rollover *rollover = NULL;
dc99f600
DM
1636 struct packet_sock *po = pkt_sk(sk);
1637 struct packet_fanout *f, *match;
7736d33f 1638 u8 type = type_flags & 0xff;
77f65ebd 1639 u8 flags = type_flags >> 8;
dc99f600
DM
1640 int err;
1641
1642 switch (type) {
77f65ebd
WB
1643 case PACKET_FANOUT_ROLLOVER:
1644 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1645 return -EINVAL;
dc99f600
DM
1646 case PACKET_FANOUT_HASH:
1647 case PACKET_FANOUT_LB:
95ec3eb4 1648 case PACKET_FANOUT_CPU:
5df0ddfb 1649 case PACKET_FANOUT_RND:
2d36097d 1650 case PACKET_FANOUT_QM:
47dceb8e 1651 case PACKET_FANOUT_CBPF:
f2e52095 1652 case PACKET_FANOUT_EBPF:
dc99f600
DM
1653 break;
1654 default:
1655 return -EINVAL;
1656 }
1657
d199fab6
ED
1658 mutex_lock(&fanout_mutex);
1659
d199fab6 1660 err = -EALREADY;
dc99f600 1661 if (po->fanout)
d199fab6 1662 goto out;
dc99f600 1663
4633c9e0
WB
1664 if (type == PACKET_FANOUT_ROLLOVER ||
1665 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
d199fab6
ED
1666 err = -ENOMEM;
1667 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1668 if (!rollover)
1669 goto out;
1670 atomic_long_set(&rollover->num, 0);
1671 atomic_long_set(&rollover->num_huge, 0);
1672 atomic_long_set(&rollover->num_failed, 0);
0648ab70
WB
1673 }
1674
4a69a864
MM
1675 if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1676 if (id != 0) {
1677 err = -EINVAL;
1678 goto out;
1679 }
1680 if (!fanout_find_new_id(sk, &id)) {
1681 err = -ENOMEM;
1682 goto out;
1683 }
1684 /* ephemeral flag for the first socket in the group: drop it */
1685 flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
1686 }
1687
dc99f600
DM
1688 match = NULL;
1689 list_for_each_entry(f, &fanout_list, list) {
1690 if (f->id == id &&
1691 read_pnet(&f->net) == sock_net(sk)) {
1692 match = f;
1693 break;
1694 }
1695 }
afe62c68 1696 err = -EINVAL;
77f65ebd 1697 if (match && match->flags != flags)
afe62c68 1698 goto out;
dc99f600 1699 if (!match) {
afe62c68 1700 err = -ENOMEM;
dc99f600 1701 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1702 if (!match)
1703 goto out;
1704 write_pnet(&match->net, sock_net(sk));
1705 match->id = id;
1706 match->type = type;
77f65ebd 1707 match->flags = flags;
afe62c68
ED
1708 INIT_LIST_HEAD(&match->list);
1709 spin_lock_init(&match->lock);
fb5c2c17 1710 refcount_set(&match->sk_ref, 0);
47dceb8e 1711 fanout_init_data(match);
afe62c68
ED
1712 match->prot_hook.type = po->prot_hook.type;
1713 match->prot_hook.dev = po->prot_hook.dev;
1714 match->prot_hook.func = packet_rcv_fanout;
1715 match->prot_hook.af_packet_priv = match;
c0de08d0 1716 match->prot_hook.id_match = match_fanout_group;
afe62c68 1717 list_add(&match->list, &fanout_list);
dc99f600 1718 }
afe62c68 1719 err = -EINVAL;
008ba2a1
WB
1720
1721 spin_lock(&po->bind_lock);
1722 if (po->running &&
1723 match->type == type &&
afe62c68
ED
1724 match->prot_hook.type == po->prot_hook.type &&
1725 match->prot_hook.dev == po->prot_hook.dev) {
1726 err = -ENOSPC;
fb5c2c17 1727 if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
afe62c68
ED
1728 __dev_remove_pack(&po->prot_hook);
1729 po->fanout = match;
57f015f5
MM
1730 po->rollover = rollover;
1731 rollover = NULL;
fb5c2c17 1732 refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
afe62c68
ED
1733 __fanout_link(sk, po);
1734 err = 0;
dc99f600
DM
1735 }
1736 }
008ba2a1
WB
1737 spin_unlock(&po->bind_lock);
1738
1739 if (err && !refcount_read(&match->sk_ref)) {
1740 list_del(&match->list);
1741 kfree(match);
1742 }
1743
afe62c68 1744out:
57f015f5 1745 kfree(rollover);
d199fab6 1746 mutex_unlock(&fanout_mutex);
dc99f600
DM
1747 return err;
1748}
1749
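/*
 * Userspace joins a fanout group with a single setsockopt() call: the low
 * 16 bits of the integer argument carry the group id, the high 16 bits the
 * mode and flags that fanout_add() parses above. A minimal sketch (assumes
 * "fd" is an already-bound packet socket, group id 7, hash mode; needs
 * <sys/socket.h> and <linux/if_packet.h>; error handling omitted):
 */
static int join_fanout_group(int fd)
{
	int arg = 7 | (PACKET_FANOUT_HASH << 16);

	return setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
}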
2bd624b4
AS
1750/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1751 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1752 * It is the responsibility of the caller to call fanout_release_data() and
1753 * free the returned packet_fanout (after synchronize_net())
1754 */
1755static struct packet_fanout *fanout_release(struct sock *sk)
dc99f600
DM
1756{
1757 struct packet_sock *po = pkt_sk(sk);
1758 struct packet_fanout *f;
1759
fff3321d 1760 mutex_lock(&fanout_mutex);
d199fab6
ED
1761 f = po->fanout;
1762 if (f) {
1763 po->fanout = NULL;
1764
fb5c2c17 1765 if (refcount_dec_and_test(&f->sk_ref))
d199fab6 1766 list_del(&f->list);
2bd624b4
AS
1767 else
1768 f = NULL;
dc99f600
DM
1769 }
1770 mutex_unlock(&fanout_mutex);
2bd624b4
AS
1771
1772 return f;
dc99f600 1773}
1da177e4 1774
3c70c132
DB
1775static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1776 struct sk_buff *skb)
1777{
1778 /* Earlier code assumed this would be a VLAN pkt, double-check
1779 * this now that we have the actual packet in hand. We can only
1780 * do this check on Ethernet devices.
1781 */
1782 if (unlikely(dev->type != ARPHRD_ETHER))
1783 return false;
1784
1785 skb_reset_mac_header(skb);
1786 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1787}
1788
90ddc4f0 1789static const struct proto_ops packet_ops;
1da177e4 1790
90ddc4f0 1791static const struct proto_ops packet_ops_spkt;
1da177e4 1792
40d4e3df
ED
1793static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1794 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1795{
1796 struct sock *sk;
1797 struct sockaddr_pkt *spkt;
1798
1799 /*
1800 * When we registered the protocol we saved the socket in the data
1801 * field for just this event.
1802 */
1803
1804 sk = pt->af_packet_priv;
1ce4f28b 1805
1da177e4
LT
1806 /*
1807 * Yank back the headers [hope the device set this
1808 * right or kerboom...]
1809 *
1810 * Incoming packets have ll header pulled,
1811 * push it back.
1812 *
98e399f8 1813 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1814 * so that this procedure is a no-op.
1815 */
1816
1817 if (skb->pkt_type == PACKET_LOOPBACK)
1818 goto out;
1819
09ad9bc7 1820 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1821 goto out;
1822
40d4e3df
ED
1823 skb = skb_share_check(skb, GFP_ATOMIC);
1824 if (skb == NULL)
1da177e4
LT
1825 goto oom;
1826
1827 /* drop any routing info */
adf30907 1828 skb_dst_drop(skb);
1da177e4 1829
84531c24 1830 /* drop conntrack reference */
895b5c9f 1831 nf_reset_ct(skb);
84531c24 1832
ffbc6111 1833 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1834
98e399f8 1835 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1836
1837 /*
1838 * The SOCK_PACKET socket receives _all_ frames.
1839 */
1840
1841 spkt->spkt_family = dev->type;
1842 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1843 spkt->spkt_protocol = skb->protocol;
1844
1845 /*
1846 * Charge the memory to the socket. This is done specifically
1847 * to prevent sockets using all the memory up.
1848 */
1849
40d4e3df 1850 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1851 return 0;
1852
1853out:
1854 kfree_skb(skb);
1855oom:
1856 return 0;
1857}
1858
75c65772
MM
1859static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
1860{
18bed891
YK
1861 if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
1862 sock->type == SOCK_RAW) {
75c65772
MM
1863 skb_reset_mac_header(skb);
1864 skb->protocol = dev_parse_header_protocol(skb);
1865 }
1866
1867 skb_probe_transport_header(skb);
1868}
1da177e4
LT
1869
1870/*
1871 * Output a raw packet to a device layer. This bypasses all the other
1872 * protocol layers and you must therefore supply it with a complete frame
1873 */
1ce4f28b 1874
1b784140
YX
1875static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1876 size_t len)
1da177e4
LT
1877{
1878 struct sock *sk = sock->sk;
342dfc30 1879 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1880 struct sk_buff *skb = NULL;
1da177e4 1881 struct net_device *dev;
c14ac945 1882 struct sockcm_cookie sockc;
40d4e3df 1883 __be16 proto = 0;
1da177e4 1884 int err;
3bdc0eba 1885 int extra_len = 0;
1ce4f28b 1886
1da177e4 1887 /*
1ce4f28b 1888 * Get and verify the address.
1da177e4
LT
1889 */
1890
40d4e3df 1891 if (saddr) {
1da177e4 1892 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1893 return -EINVAL;
1894 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1895 proto = saddr->spkt_protocol;
1896 } else
1897 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1898
1899 /*
1ce4f28b 1900 * Find the device first to size check it
1da177e4
LT
1901 */
1902
de74e92a 1903 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1904retry:
654d1f8a
ED
1905 rcu_read_lock();
1906 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1907 err = -ENODEV;
1908 if (dev == NULL)
1909 goto out_unlock;
1ce4f28b 1910
d5e76b0a
DM
1911 err = -ENETDOWN;
1912 if (!(dev->flags & IFF_UP))
1913 goto out_unlock;
1914
1da177e4 1915 /*
40d4e3df
ED
1916 * You may not queue a frame bigger than the mtu. This is the lowest level
1917 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1918 */
1ce4f28b 1919
3bdc0eba
BG
1920 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1921 if (!netif_supports_nofcs(dev)) {
1922 err = -EPROTONOSUPPORT;
1923 goto out_unlock;
1924 }
1925 extra_len = 4; /* We're doing our own CRC */
1926 }
1927
1da177e4 1928 err = -EMSGSIZE;
3bdc0eba 1929 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1930 goto out_unlock;
1931
1a35ca80
ED
1932 if (!skb) {
1933 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1934 int tlen = dev->needed_tailroom;
1a35ca80
ED
1935 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1936
1937 rcu_read_unlock();
4ce40912 1938 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1939 if (skb == NULL)
1940 return -ENOBUFS;
1941 /* FIXME: Save some space for broken drivers that write a hard
1942 * header at transmission time by themselves. PPP is the notable
1943 * one here. This should really be fixed at the driver level.
1944 */
1945 skb_reserve(skb, reserved);
1946 skb_reset_network_header(skb);
1947
1948 /* Try to align data part correctly */
1949 if (hhlen) {
1950 skb->data -= hhlen;
1951 skb->tail -= hhlen;
1952 if (len < hhlen)
1953 skb_reset_network_header(skb);
1954 }
6ce8e9ce 1955 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1956 if (err)
1957 goto out_free;
1958 goto retry;
1da177e4
LT
1959 }
1960
9ed988cd
WB
1961 if (!dev_validate_header(dev, skb->data, len)) {
1962 err = -EINVAL;
1963 goto out_unlock;
1964 }
3c70c132
DB
1965 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1966 !packet_extra_vlan_len_allowed(dev, skb)) {
1967 err = -EMSGSIZE;
1968 goto out_unlock;
57f89bfa 1969 }
1a35ca80 1970
657a0667 1971 sockcm_init(&sockc, sk);
c14ac945
SHY
1972 if (msg->msg_controllen) {
1973 err = sock_cmsg_send(sk, msg, &sockc);
f8e7718c 1974 if (unlikely(err))
c14ac945 1975 goto out_unlock;
c14ac945
SHY
1976 }
1977
1da177e4
LT
1978 skb->protocol = proto;
1979 skb->dev = dev;
1980 skb->priority = sk->sk_priority;
2d37a186 1981 skb->mark = sk->sk_mark;
3d0ba8c0 1982 skb->tstamp = sockc.transmit_time;
bf84a010 1983
8f932f76 1984 skb_setup_tx_timestamp(skb, sockc.tsflags);
1da177e4 1985
3bdc0eba
BG
1986 if (unlikely(extra_len == 4))
1987 skb->no_fcs = 1;
1988
75c65772 1989 packet_parse_headers(skb, sock);
c1aad275 1990
1da177e4 1991 dev_queue_xmit(skb);
654d1f8a 1992 rcu_read_unlock();
40d4e3df 1993 return len;
1da177e4 1994
1da177e4 1995out_unlock:
654d1f8a 1996 rcu_read_unlock();
1a35ca80
ED
1997out_free:
1998 kfree_skb(skb);
1da177e4
LT
1999 return err;
2000}
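
/* Editor's illustration, not part of af_packet.c: a sketch of the legacy
 * SOCK_PACKET transmit path served by packet_sendmsg_spkt() above. The
 * caller supplies a complete link-layer frame and names the device in a
 * struct sockaddr_pkt; "eth0" and the protocol are placeholders.
 */
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

static ssize_t send_spkt(int fd, const void *frame, size_t len)
{
        struct sockaddr_pkt spkt = {
                .spkt_family   = AF_PACKET,
                .spkt_protocol = htons(ETH_P_802_3),
        };

        strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device) - 1);

        return sendto(fd, frame, len, 0,
                      (struct sockaddr *)&spkt, sizeof(spkt));
}
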
1da177e4 2001
ff936a04
AS
2002static unsigned int run_filter(struct sk_buff *skb,
2003 const struct sock *sk,
2004 unsigned int res)
1da177e4
LT
2005{
2006 struct sk_filter *filter;
fda9ef5d 2007
80f8f102
ED
2008 rcu_read_lock();
2009 filter = rcu_dereference(sk->sk_filter);
dbcb5855 2010 if (filter != NULL)
ff936a04 2011 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 2012 rcu_read_unlock();
1da177e4 2013
dbcb5855 2014 return res;
1da177e4
LT
2015}
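
/* Editor's illustration, not part of af_packet.c: run_filter() above executes
 * whatever classic-BPF program userspace attached with SO_ATTACH_FILTER. The
 * one-instruction "accept everything, snap to 0xffff bytes" program below is
 * only a placeholder for real tcpdump-style bytecode.
 */
#include <sys/socket.h>
#include <linux/filter.h>

static int attach_accept_all_filter(int fd)
{
        struct sock_filter code[] = {
                { BPF_RET | BPF_K, 0, 0, 0x0000ffff },
        };
        struct sock_fprog prog = {
                .len    = sizeof(code) / sizeof(code[0]),
                .filter = code,
        };

        return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
}
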
2016
16cc1400
WB
2017static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2018 size_t *len)
2019{
2020 struct virtio_net_hdr vnet_hdr;
2021
2022 if (*len < sizeof(vnet_hdr))
2023 return -EINVAL;
2024 *len -= sizeof(vnet_hdr);
2025
fd3a8862 2026 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
16cc1400
WB
2027 return -EINVAL;
2028
2029 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2030}
2031
1da177e4 2032/*
62ab0812
ED
2033 * This function does lazy skb cloning in the hope that most packets
2034 * are discarded by BPF.
2035 *
2036 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
2037 * and skb->cb are mangled. It works because (and until) packets
2038 * falling here are owned by current CPU. Output packets are cloned
2039 * by dev_queue_xmit_nit(), input packets are processed by net_bh
2040 * sequentially, so that if we return skb to original state on exit,
2041 * we will not harm anyone.
1da177e4
LT
2042 */
2043
40d4e3df
ED
2044static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2045 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2046{
2047 struct sock *sk;
2048 struct sockaddr_ll *sll;
2049 struct packet_sock *po;
40d4e3df 2050 u8 *skb_head = skb->data;
1da177e4 2051 int skb_len = skb->len;
dbcb5855 2052 unsigned int snaplen, res;
da37845f 2053 bool is_drop_n_account = false;
1da177e4
LT
2054
2055 if (skb->pkt_type == PACKET_LOOPBACK)
2056 goto drop;
2057
2058 sk = pt->af_packet_priv;
2059 po = pkt_sk(sk);
2060
09ad9bc7 2061 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2062 goto drop;
2063
1da177e4
LT
2064 skb->dev = dev;
2065
3b04ddde 2066 if (dev->header_ops) {
1da177e4 2067 /* The device has an explicit notion of ll header,
62ab0812
ED
2068 * exported to higher levels.
2069 *
2070 * Otherwise, the device hides details of its frame
2071 * structure, so that corresponding packet head is
2072 * never delivered to user.
1da177e4
LT
2073 */
2074 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2075 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2076 else if (skb->pkt_type == PACKET_OUTGOING) {
2077 /* Special case: outgoing packets have ll header at head */
bbe735e4 2078 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2079 }
2080 }
2081
2082 snaplen = skb->len;
2083
dbcb5855
DM
2084 res = run_filter(skb, sk, snaplen);
2085 if (!res)
fda9ef5d 2086 goto drop_n_restore;
dbcb5855
DM
2087 if (snaplen > res)
2088 snaplen = res;
1da177e4 2089
0fd7bac6 2090 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2091 goto drop_n_acct;
2092
2093 if (skb_shared(skb)) {
2094 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2095 if (nskb == NULL)
2096 goto drop_n_acct;
2097
2098 if (skb_head != skb->data) {
2099 skb->data = skb_head;
2100 skb->len = skb_len;
2101 }
abc4e4fa 2102 consume_skb(skb);
1da177e4
LT
2103 skb = nskb;
2104 }
2105
b4772ef8 2106 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2107
2108 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2109 sll->sll_hatype = dev->type;
1da177e4 2110 sll->sll_pkttype = skb->pkt_type;
8032b464 2111 if (unlikely(po->origdev))
80feaacb
PWJ
2112 sll->sll_ifindex = orig_dev->ifindex;
2113 else
2114 sll->sll_ifindex = dev->ifindex;
1da177e4 2115
b95cce35 2116 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2117
2472d761
EB
2118 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2119 * Use their space for storing the original skb length.
2120 */
2121 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2122
1da177e4
LT
2123 if (pskb_trim(skb, snaplen))
2124 goto drop_n_acct;
2125
2126 skb_set_owner_r(skb, sk);
2127 skb->dev = NULL;
adf30907 2128 skb_dst_drop(skb);
1da177e4 2129
84531c24 2130 /* drop conntrack reference */
895b5c9f 2131 nf_reset_ct(skb);
84531c24 2132
1da177e4 2133 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2134 po->stats.stats1.tp_packets++;
3bc3b96f 2135 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
2136 __skb_queue_tail(&sk->sk_receive_queue, skb);
2137 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2138 sk->sk_data_ready(sk);
1da177e4
LT
2139 return 0;
2140
2141drop_n_acct:
da37845f 2142 is_drop_n_account = true;
8e8e2951 2143 atomic_inc(&po->tp_drops);
7091fbd8 2144 atomic_inc(&sk->sk_drops);
1da177e4
LT
2145
2146drop_n_restore:
2147 if (skb_head != skb->data && skb_shared(skb)) {
2148 skb->data = skb_head;
2149 skb->len = skb_len;
2150 }
2151drop:
da37845f
WJ
2152 if (!is_drop_n_account)
2153 consume_skb(skb);
2154 else
2155 kfree_skb(skb);
1da177e4
LT
2156 return 0;
2157}
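
/* Editor's illustration, not part of af_packet.c: frames that packet_rcv()
 * queues on the socket are read back with ordinary datagram calls; the
 * sockaddr_ll filled in above tells the reader which interface the frame
 * arrived on. Buffer size and the output format are arbitrary.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void rx_loop(int fd)
{
        unsigned char buf[2048];
        struct sockaddr_ll sll;
        socklen_t slen = sizeof(sll);
        ssize_t n;

        while ((n = recvfrom(fd, buf, sizeof(buf), 0,
                             (struct sockaddr *)&sll, &slen)) >= 0) {
                printf("ifindex %d, %zd bytes, pkttype %u\n",
                       sll.sll_ifindex, n, sll.sll_pkttype);
                slen = sizeof(sll);
        }
}
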
2158
40d4e3df
ED
2159static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2160 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2161{
2162 struct sock *sk;
2163 struct packet_sock *po;
2164 struct sockaddr_ll *sll;
184f489e 2165 union tpacket_uhdr h;
40d4e3df 2166 u8 *skb_head = skb->data;
1da177e4 2167 int skb_len = skb->len;
dbcb5855 2168 unsigned int snaplen, res;
f6fb8f10 2169 unsigned long status = TP_STATUS_USER;
bbd6ef87 2170 unsigned short macoff, netoff, hdrlen;
1da177e4 2171 struct sk_buff *copy_skb = NULL;
bbd6ef87 2172 struct timespec ts;
b9c32fb2 2173 __u32 ts_status;
da37845f 2174 bool is_drop_n_account = false;
edbd58be 2175 bool do_vnet = false;
1da177e4 2176
51846355
AW
2177 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2178 * We may add members to them until current aligned size without forcing
2179 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2180 */
2181 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2182 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2183
1da177e4
LT
2184 if (skb->pkt_type == PACKET_LOOPBACK)
2185 goto drop;
2186
2187 sk = pt->af_packet_priv;
2188 po = pkt_sk(sk);
2189
09ad9bc7 2190 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2191 goto drop;
2192
3b04ddde 2193 if (dev->header_ops) {
1da177e4 2194 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2195 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2196 else if (skb->pkt_type == PACKET_OUTGOING) {
2197 /* Special case: outgoing packets have ll header at head */
bbe735e4 2198 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2199 }
2200 }
2201
2202 snaplen = skb->len;
2203
dbcb5855
DM
2204 res = run_filter(skb, sk, snaplen);
2205 if (!res)
fda9ef5d 2206 goto drop_n_restore;
68c2e5de 2207
2c51c627
ED
2208 /* If we are flooded, just give up */
2209 if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
2210 atomic_inc(&po->tp_drops);
2211 goto drop_n_restore;
2212 }
2213
68c2e5de
AD
2214 if (skb->ip_summed == CHECKSUM_PARTIAL)
2215 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2216 else if (skb->pkt_type != PACKET_OUTGOING &&
2217 (skb->ip_summed == CHECKSUM_COMPLETE ||
2218 skb_csum_unnecessary(skb)))
2219 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2220
dbcb5855
DM
2221 if (snaplen > res)
2222 snaplen = res;
1da177e4
LT
2223
2224 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2225 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2226 po->tp_reserve;
1da177e4 2227 } else {
95c96174 2228 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2229 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2230 (maclen < 16 ? 16 : maclen)) +
58d19b19 2231 po->tp_reserve;
edbd58be 2232 if (po->has_vnet_hdr) {
58d19b19 2233 netoff += sizeof(struct virtio_net_hdr);
edbd58be
BP
2234 do_vnet = true;
2235 }
1da177e4
LT
2236 macoff = netoff - maclen;
2237 }
f6fb8f10 2238 if (po->tp_version <= TPACKET_V2) {
2239 if (macoff + snaplen > po->rx_ring.frame_size) {
2240 if (po->copy_thresh &&
0fd7bac6 2241 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2242 if (skb_shared(skb)) {
2243 copy_skb = skb_clone(skb, GFP_ATOMIC);
2244 } else {
2245 copy_skb = skb_get(skb);
2246 skb_head = skb->data;
2247 }
2248 if (copy_skb)
2249 skb_set_owner_r(copy_skb, sk);
1da177e4 2250 }
f6fb8f10 2251 snaplen = po->rx_ring.frame_size - macoff;
edbd58be 2252 if ((int)snaplen < 0) {
f6fb8f10 2253 snaplen = 0;
edbd58be
BP
2254 do_vnet = false;
2255 }
1da177e4 2256 }
dc808110
ED
2257 } else if (unlikely(macoff + snaplen >
2258 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2259 u32 nval;
2260
2261 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2262 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2263 snaplen, nval, macoff);
2264 snaplen = nval;
2265 if (unlikely((int)snaplen < 0)) {
2266 snaplen = 0;
2267 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
edbd58be 2268 do_vnet = false;
dc808110 2269 }
1da177e4 2270 }
1da177e4 2271 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2272 h.raw = packet_current_rx_frame(po, skb,
2273 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2274 if (!h.raw)
58d19b19 2275 goto drop_n_account;
41442444
WB
2276
2277 if (do_vnet &&
2278 virtio_net_hdr_from_skb(skb, h.raw + macoff -
2279 sizeof(struct virtio_net_hdr),
2280 vio_le(), true, 0))
2281 goto drop_n_account;
2282
f6fb8f10 2283 if (po->tp_version <= TPACKET_V2) {
2284 packet_increment_rx_head(po, &po->rx_ring);
2285 /*
2286 * LOSING will be reported till you read the stats,
2287 * because it's COR - Clear On Read.
2288 * Anyways, moving it for V1/V2 only as V3 doesn't need this
2289 * at packet level.
2290 */
8e8e2951 2291 if (atomic_read(&po->tp_drops))
f6fb8f10 2292 status |= TP_STATUS_LOSING;
2293 }
945d015e 2294
ee80fbf3 2295 po->stats.stats1.tp_packets++;
1da177e4
LT
2296 if (copy_skb) {
2297 status |= TP_STATUS_COPY;
2298 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2299 }
1da177e4
LT
2300 spin_unlock(&sk->sk_receive_queue.lock);
2301
bbd6ef87 2302 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2303
2304 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2305 getnstimeofday(&ts);
1da177e4 2306
b9c32fb2
DB
2307 status |= ts_status;
2308
bbd6ef87
PM
2309 switch (po->tp_version) {
2310 case TPACKET_V1:
2311 h.h1->tp_len = skb->len;
2312 h.h1->tp_snaplen = snaplen;
2313 h.h1->tp_mac = macoff;
2314 h.h1->tp_net = netoff;
4b457bdf
DB
2315 h.h1->tp_sec = ts.tv_sec;
2316 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2317 hdrlen = sizeof(*h.h1);
2318 break;
2319 case TPACKET_V2:
2320 h.h2->tp_len = skb->len;
2321 h.h2->tp_snaplen = snaplen;
2322 h.h2->tp_mac = macoff;
2323 h.h2->tp_net = netoff;
bbd6ef87
PM
2324 h.h2->tp_sec = ts.tv_sec;
2325 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2326 if (skb_vlan_tag_present(skb)) {
2327 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2328 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2329 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2330 } else {
2331 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2332 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2333 }
e4d26f4b 2334 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2335 hdrlen = sizeof(*h.h2);
2336 break;
f6fb8f10 2337 case TPACKET_V3:
2338 /* tp_next_offset, vlan are already populated above.
2339 * So DONT clear those fields here
2340 */
2341 h.h3->tp_status |= status;
2342 h.h3->tp_len = skb->len;
2343 h.h3->tp_snaplen = snaplen;
2344 h.h3->tp_mac = macoff;
2345 h.h3->tp_net = netoff;
f6fb8f10 2346 h.h3->tp_sec = ts.tv_sec;
2347 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2348 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2349 hdrlen = sizeof(*h.h3);
2350 break;
bbd6ef87
PM
2351 default:
2352 BUG();
2353 }
1da177e4 2354
bbd6ef87 2355 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2356 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2357 sll->sll_family = AF_PACKET;
2358 sll->sll_hatype = dev->type;
2359 sll->sll_protocol = skb->protocol;
2360 sll->sll_pkttype = skb->pkt_type;
8032b464 2361 if (unlikely(po->origdev))
80feaacb
PWJ
2362 sll->sll_ifindex = orig_dev->ifindex;
2363 else
2364 sll->sll_ifindex = dev->ifindex;
1da177e4 2365
e16aa207 2366 smp_mb();
f0d4eb29 2367
f6dafa95 2368#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2369 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2370 u8 *start, *end;
2371
f0d4eb29
DB
2372 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2373 macoff + snaplen);
2374
2375 for (start = h.raw; start < end; start += PAGE_SIZE)
2376 flush_dcache_page(pgv_to_page(start));
1da177e4 2377 }
f0d4eb29 2378 smp_wmb();
f6dafa95 2379#endif
f0d4eb29 2380
da413eec 2381 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2382 __packet_set_status(po, h.raw, status);
da413eec
DC
2383 sk->sk_data_ready(sk);
2384 } else {
f6fb8f10 2385 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2386 }
1da177e4
LT
2387
2388drop_n_restore:
2389 if (skb_head != skb->data && skb_shared(skb)) {
2390 skb->data = skb_head;
2391 skb->len = skb_len;
2392 }
2393drop:
da37845f
WJ
2394 if (!is_drop_n_account)
2395 consume_skb(skb);
2396 else
2397 kfree_skb(skb);
1da177e4
LT
2398 return 0;
2399
58d19b19 2400drop_n_account:
1da177e4 2401 spin_unlock(&sk->sk_receive_queue.lock);
8e8e2951
ED
2402 atomic_inc(&po->tp_drops);
2403 is_drop_n_account = true;
1da177e4 2404
676d2369 2405 sk->sk_data_ready(sk);
acb5d75b 2406 kfree_skb(copy_skb);
1da177e4
LT
2407 goto drop_n_restore;
2408}
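
/* Editor's illustration, not part of af_packet.c: tpacket_rcv() above copies
 * frames straight into a memory-mapped RX ring. A sketch of the userspace
 * side, assuming TPACKET_V2 and an arbitrary 64 x 4 KiB geometry; real
 * applications size the ring to their traffic.
 */
#include <string.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void *setup_rx_ring(int fd, struct tpacket_req *req)
{
        int ver = TPACKET_V2;

        memset(req, 0, sizeof(*req));
        req->tp_block_size = 4096;              /* page sized, multiple of frame size */
        req->tp_frame_size = 2048;
        req->tp_block_nr   = 64;
        req->tp_frame_nr   = req->tp_block_nr *
                             (req->tp_block_size / req->tp_frame_size);

        if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)) ||
            setsockopt(fd, SOL_PACKET, PACKET_RX_RING, req, sizeof(*req)))
                return MAP_FAILED;

        return mmap(NULL, (size_t)req->tp_block_size * req->tp_block_nr,
                    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}

/* Consuming one slot: wait for TP_STATUS_USER, read the frame at tp_mac,
 * then hand the slot back to the kernel.
 */
static void consume_frame(struct tpacket2_hdr *hdr)
{
        if (hdr->tp_status & TP_STATUS_USER) {
                /* frame data lives at (unsigned char *)hdr + hdr->tp_mac */
                hdr->tp_status = TP_STATUS_KERNEL;
        }
}
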
2409
69e3c75f
JB
2410static void tpacket_destruct_skb(struct sk_buff *skb)
2411{
2412 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2413
69e3c75f 2414 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2415 void *ph;
b9c32fb2
DB
2416 __u32 ts;
2417
5cd8d46e 2418 ph = skb_zcopy_get_nouarg(skb);
b0138408 2419 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2420
2421 ts = __packet_set_timestamp(po, ph, skb);
2422 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
89ed5b51
NH
2423
2424 if (!packet_read_pending(&po->tx_ring))
2425 complete(&po->skb_completion);
69e3c75f
JB
2426 }
2427
2428 sock_wfree(skb);
2429}
2430
16cc1400
WB
2431static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2432{
16cc1400
WB
2433 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2434 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2435 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2436 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2437 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2438 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2439 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2440
2441 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2442 return -EINVAL;
2443
16cc1400
WB
2444 return 0;
2445}
2446
2447static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2448 struct virtio_net_hdr *vnet_hdr)
2449{
16cc1400
WB
2450 if (*len < sizeof(*vnet_hdr))
2451 return -EINVAL;
2452 *len -= sizeof(*vnet_hdr);
2453
cbbd26b8 2454 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
16cc1400
WB
2455 return -EFAULT;
2456
2457 return __packet_snd_vnet_parse(vnet_hdr, *len);
2458}
2459
40d4e3df 2460static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2461 void *frame, struct net_device *dev, void *data, int tp_len,
c14ac945
SHY
2462 __be16 proto, unsigned char *addr, int hlen, int copylen,
2463 const struct sockcm_cookie *sockc)
69e3c75f 2464{
184f489e 2465 union tpacket_uhdr ph;
8d39b4a6 2466 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2467 struct socket *sock = po->sk.sk_socket;
2468 struct page *page;
69e3c75f
JB
2469 int err;
2470
2471 ph.raw = frame;
2472
2473 skb->protocol = proto;
2474 skb->dev = dev;
2475 skb->priority = po->sk.sk_priority;
2d37a186 2476 skb->mark = po->sk.sk_mark;
3d0ba8c0 2477 skb->tstamp = sockc->transmit_time;
8f932f76 2478 skb_setup_tx_timestamp(skb, sockc->tsflags);
5cd8d46e 2479 skb_zcopy_set_nouarg(skb, ph.raw);
69e3c75f 2480
ae641949 2481 skb_reserve(skb, hlen);
69e3c75f 2482 skb_reset_network_header(skb);
c1aad275 2483
69e3c75f
JB
2484 to_write = tp_len;
2485
2486 if (sock->type == SOCK_DGRAM) {
2487 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2488 NULL, tp_len);
2489 if (unlikely(err < 0))
2490 return -EINVAL;
1d036d25 2491 } else if (copylen) {
9ed988cd
WB
2492 int hdrlen = min_t(int, copylen, tp_len);
2493
69e3c75f 2494 skb_push(skb, dev->hard_header_len);
1d036d25 2495 skb_put(skb, copylen - dev->hard_header_len);
9ed988cd 2496 err = skb_store_bits(skb, 0, data, hdrlen);
69e3c75f
JB
2497 if (unlikely(err))
2498 return err;
9ed988cd
WB
2499 if (!dev_validate_header(dev, skb->data, hdrlen))
2500 return -EINVAL;
69e3c75f 2501
9ed988cd
WB
2502 data += hdrlen;
2503 to_write -= hdrlen;
69e3c75f
JB
2504 }
2505
69e3c75f
JB
2506 offset = offset_in_page(data);
2507 len_max = PAGE_SIZE - offset;
2508 len = ((to_write > len_max) ? len_max : to_write);
2509
2510 skb->data_len = to_write;
2511 skb->len += to_write;
2512 skb->truesize += to_write;
14afee4b 2513 refcount_add(to_write, &po->sk.sk_wmem_alloc);
69e3c75f
JB
2514
2515 while (likely(to_write)) {
2516 nr_frags = skb_shinfo(skb)->nr_frags;
2517
2518 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2519 pr_err("Packet exceed the number of skb frags(%lu)\n",
2520 MAX_SKB_FRAGS);
69e3c75f
JB
2521 return -EFAULT;
2522 }
2523
0af55bb5
CG
2524 page = pgv_to_page(data);
2525 data += len;
69e3c75f
JB
2526 flush_dcache_page(page);
2527 get_page(page);
0af55bb5 2528 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2529 to_write -= len;
2530 offset = 0;
2531 len_max = PAGE_SIZE;
2532 len = ((to_write > len_max) ? len_max : to_write);
2533 }
2534
75c65772 2535 packet_parse_headers(skb, sock);
efdfa2f7 2536
69e3c75f
JB
2537 return tp_len;
2538}
2539
8d39b4a6
WB
2540static int tpacket_parse_header(struct packet_sock *po, void *frame,
2541 int size_max, void **data)
2542{
2543 union tpacket_uhdr ph;
2544 int tp_len, off;
2545
2546 ph.raw = frame;
2547
2548 switch (po->tp_version) {
7f953ab2
SV
2549 case TPACKET_V3:
2550 if (ph.h3->tp_next_offset != 0) {
2551 pr_warn_once("variable sized slot not supported");
2552 return -EINVAL;
2553 }
2554 tp_len = ph.h3->tp_len;
2555 break;
8d39b4a6
WB
2556 case TPACKET_V2:
2557 tp_len = ph.h2->tp_len;
2558 break;
2559 default:
2560 tp_len = ph.h1->tp_len;
2561 break;
2562 }
2563 if (unlikely(tp_len > size_max)) {
2564 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2565 return -EMSGSIZE;
2566 }
2567
2568 if (unlikely(po->tp_tx_has_off)) {
2569 int off_min, off_max;
2570
2571 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2572 off_max = po->tx_ring.frame_size - tp_len;
2573 if (po->sk.sk_type == SOCK_DGRAM) {
2574 switch (po->tp_version) {
7f953ab2
SV
2575 case TPACKET_V3:
2576 off = ph.h3->tp_net;
2577 break;
8d39b4a6
WB
2578 case TPACKET_V2:
2579 off = ph.h2->tp_net;
2580 break;
2581 default:
2582 off = ph.h1->tp_net;
2583 break;
2584 }
2585 } else {
2586 switch (po->tp_version) {
7f953ab2
SV
2587 case TPACKET_V3:
2588 off = ph.h3->tp_mac;
2589 break;
8d39b4a6
WB
2590 case TPACKET_V2:
2591 off = ph.h2->tp_mac;
2592 break;
2593 default:
2594 off = ph.h1->tp_mac;
2595 break;
2596 }
2597 }
2598 if (unlikely((off < off_min) || (off_max < off)))
2599 return -EINVAL;
2600 } else {
2601 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2602 }
2603
2604 *data = frame + off;
2605 return tp_len;
2606}
2607
69e3c75f
JB
2608static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2609{
89ed5b51 2610 struct sk_buff *skb = NULL;
69e3c75f 2611 struct net_device *dev;
1d036d25 2612 struct virtio_net_hdr *vnet_hdr = NULL;
c14ac945 2613 struct sockcm_cookie sockc;
69e3c75f 2614 __be16 proto;
09effa67 2615 int err, reserve = 0;
40d4e3df 2616 void *ph;
342dfc30 2617 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2618 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
486efdc8 2619 unsigned char *addr = NULL;
69e3c75f 2620 int tp_len, size_max;
8d39b4a6 2621 void *data;
69e3c75f 2622 int len_sum = 0;
9e67030a 2623 int status = TP_STATUS_AVAILABLE;
1d036d25 2624 int hlen, tlen, copylen = 0;
89ed5b51 2625 long timeo = 0;
69e3c75f 2626
69e3c75f
JB
2627 mutex_lock(&po->pg_vec_lock);
2628
32d3182c
ED
2629 /* packet_sendmsg() check on tx_ring.pg_vec was lockless,
2630 * we need to confirm it under protection of pg_vec_lock.
2631 */
2632 if (unlikely(!po->tx_ring.pg_vec)) {
2633 err = -EBUSY;
2634 goto out;
2635 }
66e56cd4 2636 if (likely(saddr == NULL)) {
e40526cb 2637 dev = packet_cached_dev_get(po);
69e3c75f 2638 proto = po->num;
69e3c75f
JB
2639 } else {
2640 err = -EINVAL;
2641 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2642 goto out;
2643 if (msg->msg_namelen < (saddr->sll_halen
2644 + offsetof(struct sockaddr_ll,
2645 sll_addr)))
2646 goto out;
69e3c75f 2647 proto = saddr->sll_protocol;
827d9780 2648 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
486efdc8
WB
2649 if (po->sk.sk_socket->type == SOCK_DGRAM) {
2650 if (dev && msg->msg_namelen < dev->addr_len +
2651 offsetof(struct sockaddr_ll, sll_addr))
2652 goto out_put;
2653 addr = saddr->sll_addr;
2654 }
69e3c75f
JB
2655 }
2656
69e3c75f
JB
2657 err = -ENXIO;
2658 if (unlikely(dev == NULL))
2659 goto out;
69e3c75f
JB
2660 err = -ENETDOWN;
2661 if (unlikely(!(dev->flags & IFF_UP)))
2662 goto out_put;
2663
657a0667 2664 sockcm_init(&sockc, &po->sk);
d19b183c
DCS
2665 if (msg->msg_controllen) {
2666 err = sock_cmsg_send(&po->sk, msg, &sockc);
2667 if (unlikely(err))
2668 goto out_put;
2669 }
2670
5cfb4c8d
DB
2671 if (po->sk.sk_socket->type == SOCK_RAW)
2672 reserve = dev->hard_header_len;
69e3c75f 2673 size_max = po->tx_ring.frame_size
b5dd884e 2674 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2675
1d036d25 2676 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
5cfb4c8d 2677 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2678
89ed5b51
NH
2679 reinit_completion(&po->skb_completion);
2680
69e3c75f
JB
2681 do {
2682 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2683 TP_STATUS_SEND_REQUEST);
69e3c75f 2684 if (unlikely(ph == NULL)) {
89ed5b51
NH
2685 if (need_wait && skb) {
2686 timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
2687 timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
2688 if (timeo <= 0) {
2689 err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
2690 goto out_put;
2691 }
2692 }
2693 /* check for additional frames */
69e3c75f
JB
2694 continue;
2695 }
2696
8d39b4a6
WB
2697 skb = NULL;
2698 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2699 if (tp_len < 0)
2700 goto tpacket_error;
2701
69e3c75f 2702 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2703 hlen = LL_RESERVED_SPACE(dev);
2704 tlen = dev->needed_tailroom;
1d036d25
WB
2705 if (po->has_vnet_hdr) {
2706 vnet_hdr = data;
2707 data += sizeof(*vnet_hdr);
2708 tp_len -= sizeof(*vnet_hdr);
2709 if (tp_len < 0 ||
2710 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2711 tp_len = -EINVAL;
2712 goto tpacket_error;
2713 }
2714 copylen = __virtio16_to_cpu(vio_le(),
2715 vnet_hdr->hdr_len);
2716 }
9ed988cd 2717 copylen = max_t(int, copylen, dev->hard_header_len);
69e3c75f 2718 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2719 hlen + tlen + sizeof(struct sockaddr_ll) +
2720 (copylen - dev->hard_header_len),
fbf33a28 2721 !need_wait, &err);
69e3c75f 2722
fbf33a28
KM
2723 if (unlikely(skb == NULL)) {
2724 /* we assume the socket was initially writeable ... */
2725 if (likely(len_sum > 0))
2726 err = len_sum;
69e3c75f 2727 goto out_status;
fbf33a28 2728 }
8d39b4a6 2729 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
c14ac945 2730 addr, hlen, copylen, &sockc);
dbd46ab4 2731 if (likely(tp_len >= 0) &&
5cfb4c8d 2732 tp_len > dev->mtu + reserve &&
1d036d25 2733 !po->has_vnet_hdr &&
3c70c132
DB
2734 !packet_extra_vlan_len_allowed(dev, skb))
2735 tp_len = -EMSGSIZE;
69e3c75f
JB
2736
2737 if (unlikely(tp_len < 0)) {
8d39b4a6 2738tpacket_error:
69e3c75f
JB
2739 if (po->tp_loss) {
2740 __packet_set_status(po, ph,
2741 TP_STATUS_AVAILABLE);
2742 packet_increment_head(&po->tx_ring);
2743 kfree_skb(skb);
2744 continue;
2745 } else {
2746 status = TP_STATUS_WRONG_FORMAT;
2747 err = tp_len;
2748 goto out_status;
2749 }
2750 }
2751
9d2f67e4
JT
2752 if (po->has_vnet_hdr) {
2753 if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
2754 tp_len = -EINVAL;
2755 goto tpacket_error;
2756 }
2757 virtio_net_hdr_set_proto(skb, vnet_hdr);
1d036d25
WB
2758 }
2759
69e3c75f
JB
2760 skb->destructor = tpacket_destruct_skb;
2761 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2762 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2763
2764 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2765 err = po->xmit(skb);
eb70df13
JP
2766 if (unlikely(err > 0)) {
2767 err = net_xmit_errno(err);
2768 if (err && __packet_get_status(po, ph) ==
2769 TP_STATUS_AVAILABLE) {
2770 /* skb was destructed already */
2771 skb = NULL;
2772 goto out_status;
2773 }
2774 /*
2775 * skb was dropped but not destructed yet;
2776 * let's treat it like congestion or err < 0
2777 */
2778 err = 0;
2779 }
69e3c75f
JB
2780 packet_increment_head(&po->tx_ring);
2781 len_sum += tp_len;
b0138408
DB
2782 } while (likely((ph != NULL) ||
2783 /* Note: packet_read_pending() might be slow if we have
2784 * to call it as it's per_cpu variable, but in fast-path
2785 * we already short-circuit the loop with the first
2786 * condition, and luckily don't have to go that path
2787 * anyway.
2788 */
2789 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2790
2791 err = len_sum;
2792 goto out_put;
2793
69e3c75f
JB
2794out_status:
2795 __packet_set_status(po, ph, status);
2796 kfree_skb(skb);
2797out_put:
e40526cb 2798 dev_put(dev);
69e3c75f
JB
2799out:
2800 mutex_unlock(&po->pg_vec_lock);
2801 return err;
2802}
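
/* Editor's illustration, not part of af_packet.c: tpacket_snd() above walks
 * the TX ring and transmits every slot marked TP_STATUS_SEND_REQUEST.
 * Userspace fills a slot, flips the status, then kicks the kernel with a
 * plain send(). Offsets assume TPACKET_V2, a SOCK_RAW socket and
 * tp_tx_has_off disabled.
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int tx_ring_send(int fd, void *slot, const void *frame, unsigned int len)
{
        struct tpacket2_hdr *hdr = slot;

        if (hdr->tp_status != TP_STATUS_AVAILABLE)
                return -1;                      /* slot still owned by the kernel */

        memcpy((char *)slot + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll),
               frame, len);
        hdr->tp_len    = len;
        hdr->tp_status = TP_STATUS_SEND_REQUEST;

        /* MSG_DONTWAIT lets tpacket_snd() drain the ring without blocking us */
        return send(fd, NULL, 0, MSG_DONTWAIT) < 0 ? -1 : 0;
}
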
69e3c75f 2803
eea49cc9
OJ
2804static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2805 size_t reserve, size_t len,
2806 size_t linear, int noblock,
2807 int *err)
bfd5f4a3
SS
2808{
2809 struct sk_buff *skb;
2810
2811 /* Under a page? Don't bother with paged skb. */
2812 if (prepad + len < PAGE_SIZE || !linear)
2813 linear = len;
2814
2815 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2816 err, 0);
bfd5f4a3
SS
2817 if (!skb)
2818 return NULL;
2819
2820 skb_reserve(skb, reserve);
2821 skb_put(skb, linear);
2822 skb->data_len = len - linear;
2823 skb->len += len - linear;
2824
2825 return skb;
2826}
2827
d346a3fa 2828static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2829{
2830 struct sock *sk = sock->sk;
342dfc30 2831 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2832 struct sk_buff *skb;
2833 struct net_device *dev;
0e11c91e 2834 __be16 proto;
486efdc8 2835 unsigned char *addr = NULL;
827d9780 2836 int err, reserve = 0;
c7d39e32 2837 struct sockcm_cookie sockc;
bfd5f4a3
SS
2838 struct virtio_net_hdr vnet_hdr = { 0 };
2839 int offset = 0;
bfd5f4a3 2840 struct packet_sock *po = pkt_sk(sk);
da7c9561 2841 bool has_vnet_hdr = false;
57031eb7 2842 int hlen, tlen, linear;
3bdc0eba 2843 int extra_len = 0;
1da177e4
LT
2844
2845 /*
1ce4f28b 2846 * Get and verify the address.
1da177e4 2847 */
1ce4f28b 2848
66e56cd4 2849 if (likely(saddr == NULL)) {
e40526cb 2850 dev = packet_cached_dev_get(po);
1da177e4 2851 proto = po->num;
1da177e4
LT
2852 } else {
2853 err = -EINVAL;
2854 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2855 goto out;
0fb375fb
EB
2856 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2857 goto out;
1da177e4 2858 proto = saddr->sll_protocol;
827d9780 2859 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
486efdc8
WB
2860 if (sock->type == SOCK_DGRAM) {
2861 if (dev && msg->msg_namelen < dev->addr_len +
2862 offsetof(struct sockaddr_ll, sll_addr))
2863 goto out_unlock;
2864 addr = saddr->sll_addr;
2865 }
1da177e4
LT
2866 }
2867
1da177e4 2868 err = -ENXIO;
e40526cb 2869 if (unlikely(dev == NULL))
1da177e4 2870 goto out_unlock;
d5e76b0a 2871 err = -ENETDOWN;
e40526cb 2872 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2873 goto out_unlock;
2874
657a0667 2875 sockcm_init(&sockc, sk);
c7d39e32
EJ
2876 sockc.mark = sk->sk_mark;
2877 if (msg->msg_controllen) {
2878 err = sock_cmsg_send(sk, msg, &sockc);
2879 if (unlikely(err))
2880 goto out_unlock;
2881 }
2882
e40526cb
DB
2883 if (sock->type == SOCK_RAW)
2884 reserve = dev->hard_header_len;
bfd5f4a3 2885 if (po->has_vnet_hdr) {
16cc1400
WB
2886 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2887 if (err)
bfd5f4a3 2888 goto out_unlock;
da7c9561 2889 has_vnet_hdr = true;
bfd5f4a3
SS
2890 }
2891
3bdc0eba
BG
2892 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2893 if (!netif_supports_nofcs(dev)) {
2894 err = -EPROTONOSUPPORT;
2895 goto out_unlock;
2896 }
2897 extra_len = 4; /* We're doing our own CRC */
2898 }
2899
1da177e4 2900 err = -EMSGSIZE;
16cc1400
WB
2901 if (!vnet_hdr.gso_type &&
2902 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2903 goto out_unlock;
2904
bfd5f4a3 2905 err = -ENOBUFS;
ae641949
HX
2906 hlen = LL_RESERVED_SPACE(dev);
2907 tlen = dev->needed_tailroom;
57031eb7
WB
2908 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2909 linear = max(linear, min_t(int, len, dev->hard_header_len));
2910 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
bfd5f4a3 2911 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2912 if (skb == NULL)
1da177e4
LT
2913 goto out_unlock;
2914
b84bbaf7 2915 skb_reset_network_header(skb);
1da177e4 2916
0c4e8581 2917 err = -EINVAL;
9c707762
WB
2918 if (sock->type == SOCK_DGRAM) {
2919 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2920 if (unlikely(offset < 0))
9c707762 2921 goto out_free;
b84bbaf7 2922 } else if (reserve) {
9aad13b0 2923 skb_reserve(skb, -reserve);
88a8121d
ND
2924 if (len < reserve + sizeof(struct ipv6hdr) &&
2925 dev->min_header_len != dev->hard_header_len)
993675a3 2926 skb_reset_network_header(skb);
9c707762 2927 }
1da177e4
LT
2928
2929 /* Returns -EFAULT on error */
c0371da6 2930 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2931 if (err)
2932 goto out_free;
bf84a010 2933
9ed988cd
WB
2934 if (sock->type == SOCK_RAW &&
2935 !dev_validate_header(dev, skb->data, len)) {
2936 err = -EINVAL;
2937 goto out_free;
2938 }
2939
8f932f76 2940 skb_setup_tx_timestamp(skb, sockc.tsflags);
1da177e4 2941
16cc1400 2942 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
2943 !packet_extra_vlan_len_allowed(dev, skb)) {
2944 err = -EMSGSIZE;
2945 goto out_free;
57f89bfa
BG
2946 }
2947
09effa67
DM
2948 skb->protocol = proto;
2949 skb->dev = dev;
1da177e4 2950 skb->priority = sk->sk_priority;
c7d39e32 2951 skb->mark = sockc.mark;
3d0ba8c0 2952 skb->tstamp = sockc.transmit_time;
0fd5d57b 2953
da7c9561 2954 if (has_vnet_hdr) {
db60eb5f 2955 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
16cc1400
WB
2956 if (err)
2957 goto out_free;
2958 len += sizeof(vnet_hdr);
9d2f67e4 2959 virtio_net_hdr_set_proto(skb, &vnet_hdr);
bfd5f4a3
SS
2960 }
2961
75c65772 2962 packet_parse_headers(skb, sock);
8fd6c80d 2963
3bdc0eba
BG
2964 if (unlikely(extra_len == 4))
2965 skb->no_fcs = 1;
2966
d346a3fa 2967 err = po->xmit(skb);
1da177e4
LT
2968 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2969 goto out_unlock;
2970
e40526cb 2971 dev_put(dev);
1da177e4 2972
40d4e3df 2973 return len;
1da177e4
LT
2974
2975out_free:
2976 kfree_skb(skb);
2977out_unlock:
e40526cb 2978 if (dev)
1da177e4
LT
2979 dev_put(dev);
2980out:
2981 return err;
2982}
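
/* Editor's illustration, not part of af_packet.c: the non-ring transmit path
 * above. On a SOCK_DGRAM packet socket the caller names the egress ifindex
 * and destination MAC in a sockaddr_ll, and the kernel builds the link-layer
 * header via dev_hard_header(); ifindex, dest and ETH_P_IP are placeholders.
 */
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

static ssize_t send_dgram(int fd, int ifindex,
                          const unsigned char dest[ETH_ALEN],
                          const void *payload, size_t len)
{
        struct sockaddr_ll sll = {
                .sll_family   = AF_PACKET,
                .sll_protocol = htons(ETH_P_IP),
                .sll_ifindex  = ifindex,
                .sll_halen    = ETH_ALEN,
        };

        memcpy(sll.sll_addr, dest, ETH_ALEN);

        return sendto(fd, payload, len, 0,
                      (struct sockaddr *)&sll, sizeof(sll));
}
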
2983
1b784140 2984static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2985{
69e3c75f
JB
2986 struct sock *sk = sock->sk;
2987 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2988
69e3c75f
JB
2989 if (po->tx_ring.pg_vec)
2990 return tpacket_snd(po, msg);
2991 else
69e3c75f
JB
2992 return packet_snd(sock, msg, len);
2993}
2994
1da177e4
LT
2995/*
2996 * Close a PACKET socket. This is fairly simple. We immediately go
2997 * to 'closed' state and remove our protocol entry in the device list.
2998 */
2999
3000static int packet_release(struct socket *sock)
3001{
3002 struct sock *sk = sock->sk;
3003 struct packet_sock *po;
2bd624b4 3004 struct packet_fanout *f;
d12d01d6 3005 struct net *net;
f6fb8f10 3006 union tpacket_req_u req_u;
1da177e4
LT
3007
3008 if (!sk)
3009 return 0;
3010
3b1e0a65 3011 net = sock_net(sk);
1da177e4
LT
3012 po = pkt_sk(sk);
3013
0fa7fa98 3014 mutex_lock(&net->packet.sklist_lock);
808f5114 3015 sk_del_node_init_rcu(sk);
0fa7fa98
PE
3016 mutex_unlock(&net->packet.sklist_lock);
3017
3018 preempt_disable();
920de804 3019 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 3020 preempt_enable();
1da177e4 3021
808f5114 3022 spin_lock(&po->bind_lock);
ce06b03e 3023 unregister_prot_hook(sk, false);
66e56cd4
DB
3024 packet_cached_dev_reset(po);
3025
160ff18a
BG
3026 if (po->prot_hook.dev) {
3027 dev_put(po->prot_hook.dev);
3028 po->prot_hook.dev = NULL;
3029 }
808f5114 3030 spin_unlock(&po->bind_lock);
1da177e4 3031
1da177e4 3032 packet_flush_mclist(sk);
1da177e4 3033
5171b37d 3034 lock_sock(sk);
9665d5d6
PS
3035 if (po->rx_ring.pg_vec) {
3036 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3037 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 3038 }
69e3c75f 3039
9665d5d6
PS
3040 if (po->tx_ring.pg_vec) {
3041 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3042 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 3043 }
5171b37d 3044 release_sock(sk);
1da177e4 3045
2bd624b4 3046 f = fanout_release(sk);
dc99f600 3047
808f5114 3048 synchronize_net();
2bd624b4 3049
afa0925c 3050 kfree(po->rollover);
2bd624b4
AS
3051 if (f) {
3052 fanout_release_data(f);
3053 kfree(f);
3054 }
1da177e4
LT
3055 /*
3056 * Now the socket is dead. No more input will appear.
3057 */
1da177e4
LT
3058 sock_orphan(sk);
3059 sock->sk = NULL;
3060
3061 /* Purge queues */
3062
3063 skb_queue_purge(&sk->sk_receive_queue);
b0138408 3064 packet_free_pending(po);
17ab56a2 3065 sk_refcnt_debug_release(sk);
1da177e4
LT
3066
3067 sock_put(sk);
3068 return 0;
3069}
3070
3071/*
3072 * Attach a packet hook.
3073 */
3074
30f7ea1c
FR
3075static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3076 __be16 proto)
1da177e4
LT
3077{
3078 struct packet_sock *po = pkt_sk(sk);
158cd4af 3079 struct net_device *dev_curr;
902fefb8
DB
3080 __be16 proto_curr;
3081 bool need_rehook;
30f7ea1c
FR
3082 struct net_device *dev = NULL;
3083 int ret = 0;
3084 bool unlisted = false;
dc99f600 3085
1da177e4 3086 lock_sock(sk);
1da177e4 3087 spin_lock(&po->bind_lock);
30f7ea1c
FR
3088 rcu_read_lock();
3089
4971613c
WB
3090 if (po->fanout) {
3091 ret = -EINVAL;
3092 goto out_unlock;
3093 }
3094
30f7ea1c
FR
3095 if (name) {
3096 dev = dev_get_by_name_rcu(sock_net(sk), name);
3097 if (!dev) {
3098 ret = -ENODEV;
3099 goto out_unlock;
3100 }
3101 } else if (ifindex) {
3102 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3103 if (!dev) {
3104 ret = -ENODEV;
3105 goto out_unlock;
3106 }
3107 }
3108
3109 if (dev)
3110 dev_hold(dev);
66e56cd4 3111
902fefb8
DB
3112 proto_curr = po->prot_hook.type;
3113 dev_curr = po->prot_hook.dev;
3114
3115 need_rehook = proto_curr != proto || dev_curr != dev;
3116
3117 if (need_rehook) {
30f7ea1c
FR
3118 if (po->running) {
3119 rcu_read_unlock();
15fe076e
ED
3120 /* prevents packet_notifier() from calling
3121 * register_prot_hook()
3122 */
3123 po->num = 0;
30f7ea1c
FR
3124 __unregister_prot_hook(sk, true);
3125 rcu_read_lock();
3126 dev_curr = po->prot_hook.dev;
3127 if (dev)
3128 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3129 dev->ifindex);
3130 }
1da177e4 3131
15fe076e 3132 BUG_ON(po->running);
902fefb8
DB
3133 po->num = proto;
3134 po->prot_hook.type = proto;
902fefb8 3135
30f7ea1c
FR
3136 if (unlikely(unlisted)) {
3137 dev_put(dev);
3138 po->prot_hook.dev = NULL;
3139 po->ifindex = -1;
3140 packet_cached_dev_reset(po);
3141 } else {
3142 po->prot_hook.dev = dev;
3143 po->ifindex = dev ? dev->ifindex : 0;
3144 packet_cached_dev_assign(po, dev);
3145 }
902fefb8 3146 }
158cd4af
LW
3147 if (dev_curr)
3148 dev_put(dev_curr);
66e56cd4 3149
902fefb8 3150 if (proto == 0 || !need_rehook)
1da177e4
LT
3151 goto out_unlock;
3152
30f7ea1c 3153 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3154 register_prot_hook(sk);
be85d4ad
UT
3155 } else {
3156 sk->sk_err = ENETDOWN;
3157 if (!sock_flag(sk, SOCK_DEAD))
3158 sk->sk_error_report(sk);
1da177e4
LT
3159 }
3160
3161out_unlock:
30f7ea1c 3162 rcu_read_unlock();
1da177e4
LT
3163 spin_unlock(&po->bind_lock);
3164 release_sock(sk);
30f7ea1c 3165 return ret;
1da177e4
LT
3166}
3167
3168/*
3169 * Bind a packet socket to a device
3170 */
3171
40d4e3df
ED
3172static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3173 int addr_len)
1da177e4 3174{
40d4e3df 3175 struct sock *sk = sock->sk;
540e2894 3176 char name[sizeof(uaddr->sa_data) + 1];
1ce4f28b 3177
1da177e4
LT
3178 /*
3179 * Check legality
3180 */
1ce4f28b 3181
8ae55f04 3182 if (addr_len != sizeof(struct sockaddr))
1da177e4 3183 return -EINVAL;
540e2894
AP
3184 /* uaddr->sa_data comes from the userspace, it's not guaranteed to be
3185 * zero-terminated.
3186 */
3187 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3188 name[sizeof(uaddr->sa_data)] = 0;
1da177e4 3189
30f7ea1c 3190 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3191}
1da177e4
LT
3192
3193static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3194{
40d4e3df
ED
3195 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3196 struct sock *sk = sock->sk;
1da177e4
LT
3197
3198 /*
3199 * Check legality
3200 */
1ce4f28b 3201
1da177e4
LT
3202 if (addr_len < sizeof(struct sockaddr_ll))
3203 return -EINVAL;
3204 if (sll->sll_family != AF_PACKET)
3205 return -EINVAL;
3206
30f7ea1c
FR
3207 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3208 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3209}
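
/* Editor's illustration, not part of af_packet.c: packet_bind() above expects
 * a sockaddr_ll naming the interface (and, optionally, a new protocol).
 * Creating the socket requires CAP_NET_RAW; "ifname" is supplied by the
 * caller and "eth0" would be a typical placeholder.
 */
#include <net/if.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

static int open_and_bind(const char *ifname)
{
        struct sockaddr_ll sll;
        int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

        if (fd < 0)
                return -1;

        memset(&sll, 0, sizeof(sll));
        sll.sll_family   = AF_PACKET;
        sll.sll_protocol = htons(ETH_P_ALL);
        sll.sll_ifindex  = if_nametoindex(ifname);

        if (bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
                close(fd);
                return -1;
        }
        return fd;
}
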
3210
3211static struct proto packet_proto = {
3212 .name = "PACKET",
3213 .owner = THIS_MODULE,
3214 .obj_size = sizeof(struct packet_sock),
3215};
3216
3217/*
1ce4f28b 3218 * Create a packet of type SOCK_PACKET.
1da177e4
LT
3219 */
3220
3f378b68
EP
3221static int packet_create(struct net *net, struct socket *sock, int protocol,
3222 int kern)
1da177e4
LT
3223{
3224 struct sock *sk;
3225 struct packet_sock *po;
0e11c91e 3226 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3227 int err;
3228
df008c91 3229 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3230 return -EPERM;
be02097c
DM
3231 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3232 sock->type != SOCK_PACKET)
1da177e4
LT
3233 return -ESOCKTNOSUPPORT;
3234
3235 sock->state = SS_UNCONNECTED;
3236
3237 err = -ENOBUFS;
11aa9c28 3238 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3239 if (sk == NULL)
3240 goto out;
3241
3242 sock->ops = &packet_ops;
1da177e4
LT
3243 if (sock->type == SOCK_PACKET)
3244 sock->ops = &packet_ops_spkt;
be02097c 3245
1da177e4
LT
3246 sock_init_data(sock, sk);
3247
3248 po = pkt_sk(sk);
89ed5b51 3249 init_completion(&po->skb_completion);
1da177e4 3250 sk->sk_family = PF_PACKET;
0e11c91e 3251 po->num = proto;
d346a3fa 3252 po->xmit = dev_queue_xmit;
66e56cd4 3253
b0138408
DB
3254 err = packet_alloc_pending(po);
3255 if (err)
3256 goto out2;
3257
66e56cd4 3258 packet_cached_dev_reset(po);
1da177e4
LT
3259
3260 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3261 sk_refcnt_debug_inc(sk);
1da177e4
LT
3262
3263 /*
3264 * Attach a protocol block
3265 */
3266
3267 spin_lock_init(&po->bind_lock);
905db440 3268 mutex_init(&po->pg_vec_lock);
0648ab70 3269 po->rollover = NULL;
1da177e4 3270 po->prot_hook.func = packet_rcv;
be02097c 3271
1da177e4
LT
3272 if (sock->type == SOCK_PACKET)
3273 po->prot_hook.func = packet_rcv_spkt;
be02097c 3274
1da177e4
LT
3275 po->prot_hook.af_packet_priv = sk;
3276
0e11c91e
AV
3277 if (proto) {
3278 po->prot_hook.type = proto;
a6361f0c 3279 __register_prot_hook(sk);
1da177e4
LT
3280 }
3281
0fa7fa98 3282 mutex_lock(&net->packet.sklist_lock);
a4dc6a49 3283 sk_add_node_tail_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3284 mutex_unlock(&net->packet.sklist_lock);
3285
3286 preempt_disable();
3680453c 3287 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3288 preempt_enable();
808f5114 3289
40d4e3df 3290 return 0;
b0138408
DB
3291out2:
3292 sk_free(sk);
1da177e4
LT
3293out:
3294 return err;
3295}
3296
3297/*
3298 * Pull a packet from our receive queue and hand it to the user.
3299 * If necessary we block.
3300 */
3301
1b784140
YX
3302static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3303 int flags)
1da177e4
LT
3304{
3305 struct sock *sk = sock->sk;
3306 struct sk_buff *skb;
3307 int copied, err;
bfd5f4a3 3308 int vnet_hdr_len = 0;
2472d761 3309 unsigned int origlen = 0;
1da177e4
LT
3310
3311 err = -EINVAL;
ed85b565 3312 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3313 goto out;
3314
3315#if 0
3316 /* What error should we return now? EUNATTACH? */
3317 if (pkt_sk(sk)->ifindex < 0)
3318 return -ENODEV;
3319#endif
3320
ed85b565 3321 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3322 err = sock_recv_errqueue(sk, msg, len,
3323 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3324 goto out;
3325 }
3326
1da177e4
LT
3327 /*
3328 * Call the generic datagram receiver. This handles all sorts
3329 * of horrible races and re-entrancy so we can forget about it
3330 * in the protocol layers.
3331 *
3332 * Now it will return ENETDOWN, if the device has just gone down,
3333 * but then it will block.
3334 */
3335
40d4e3df 3336 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3337
3338 /*
1ce4f28b 3339 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
3340 * handles the blocking we don't see and worry about blocking
3341 * retries.
3342 */
3343
8ae55f04 3344 if (skb == NULL)
1da177e4
LT
3345 goto out;
3346
9bb6cd65 3347 packet_rcv_try_clear_pressure(pkt_sk(sk));
2ccdbaa6 3348
bfd5f4a3 3349 if (pkt_sk(sk)->has_vnet_hdr) {
16cc1400
WB
3350 err = packet_rcv_vnet(msg, skb, &len);
3351 if (err)
bfd5f4a3 3352 goto out_free;
16cc1400 3353 vnet_hdr_len = sizeof(struct virtio_net_hdr);
bfd5f4a3
SS
3354 }
3355
f3d33426
HFS
3356 /* You lose any data beyond the buffer you gave. If it worries
3357 * a user program they can ask the device for its MTU
3358 * anyway.
1da177e4 3359 */
1da177e4 3360 copied = skb->len;
40d4e3df
ED
3361 if (copied > len) {
3362 copied = len;
3363 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3364 }
3365
51f3d02b 3366 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3367 if (err)
3368 goto out_free;
3369
2472d761
EB
3370 if (sock->type != SOCK_PACKET) {
3371 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3372
3373 /* Original length was stored in sockaddr_ll fields */
3374 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3375 sll->sll_family = AF_PACKET;
3376 sll->sll_protocol = skb->protocol;
3377 }
3378
3b885787 3379 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3380
f3d33426 3381 if (msg->msg_name) {
b2cf86e1
WB
3382 int copy_len;
3383
f3d33426
HFS
3384 /* If the address length field is there to be filled
3385 * in, we fill it in now.
3386 */
3387 if (sock->type == SOCK_PACKET) {
342dfc30 3388 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426 3389 msg->msg_namelen = sizeof(struct sockaddr_pkt);
b2cf86e1 3390 copy_len = msg->msg_namelen;
f3d33426
HFS
3391 } else {
3392 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3393
f3d33426
HFS
3394 msg->msg_namelen = sll->sll_halen +
3395 offsetof(struct sockaddr_ll, sll_addr);
b2cf86e1
WB
3396 copy_len = msg->msg_namelen;
3397 if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
3398 memset(msg->msg_name +
3399 offsetof(struct sockaddr_ll, sll_addr),
3400 0, sizeof(sll->sll_addr));
3401 msg->msg_namelen = sizeof(struct sockaddr_ll);
3402 }
f3d33426 3403 }
b2cf86e1 3404 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
f3d33426 3405 }
1da177e4 3406
8dc41944 3407 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3408 struct tpacket_auxdata aux;
3409
3410 aux.tp_status = TP_STATUS_USER;
3411 if (skb->ip_summed == CHECKSUM_PARTIAL)
3412 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3413 else if (skb->pkt_type != PACKET_OUTGOING &&
3414 (skb->ip_summed == CHECKSUM_COMPLETE ||
3415 skb_csum_unnecessary(skb)))
3416 aux.tp_status |= TP_STATUS_CSUM_VALID;
3417
2472d761 3418 aux.tp_len = origlen;
ffbc6111
HX
3419 aux.tp_snaplen = skb->len;
3420 aux.tp_mac = 0;
bbe735e4 3421 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3422 if (skb_vlan_tag_present(skb)) {
3423 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3424 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3425 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3426 } else {
3427 aux.tp_vlan_tci = 0;
a0cdfcf3 3428 aux.tp_vlan_tpid = 0;
a3bcc23e 3429 }
ffbc6111 3430 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3431 }
3432
1da177e4
LT
3433 /*
3434 * Free or return the buffer as appropriate. Again this
3435 * hides all the races and re-entrancy issues from us.
3436 */
bfd5f4a3 3437 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3438
3439out_free:
3440 skb_free_datagram(sk, skb);
3441out:
3442 return err;
3443}
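
/* Editor's illustration, not part of af_packet.c: when PACKET_AUXDATA has
 * been enabled with setsockopt(), packet_recvmsg() above attaches a
 * tpacket_auxdata control message (snaplen, VLAN tag, checksum status).
 * A sketch of how a non-ring reader retrieves it.
 */
#include <string.h>
#include <sys/uio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static ssize_t recv_with_auxdata(int fd, void *buf, size_t len,
                                 struct tpacket_auxdata *aux)
{
        char cbuf[CMSG_SPACE(sizeof(*aux))];
        struct iovec iov = { .iov_base = buf, .iov_len = len };
        struct msghdr msg = {
                .msg_iov        = &iov,
                .msg_iovlen     = 1,
                .msg_control    = cbuf,
                .msg_controllen = sizeof(cbuf),
        };
        struct cmsghdr *cmsg;
        ssize_t n = recvmsg(fd, &msg, 0);

        for (cmsg = CMSG_FIRSTHDR(&msg); n >= 0 && cmsg;
             cmsg = CMSG_NXTHDR(&msg, cmsg))
                if (cmsg->cmsg_level == SOL_PACKET &&
                    cmsg->cmsg_type == PACKET_AUXDATA)
                        memcpy(aux, CMSG_DATA(cmsg), sizeof(*aux));

        return n;
}
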
3444
1da177e4 3445static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3446 int peer)
1da177e4
LT
3447{
3448 struct net_device *dev;
3449 struct sock *sk = sock->sk;
3450
3451 if (peer)
3452 return -EOPNOTSUPP;
3453
3454 uaddr->sa_family = AF_PACKET;
2dc85bf3 3455 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3456 rcu_read_lock();
3457 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3458 if (dev)
2dc85bf3 3459 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3460 rcu_read_unlock();
1da177e4 3461
9b2c45d4 3462 return sizeof(*uaddr);
1da177e4 3463}
1da177e4
LT
3464
3465static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3466 int peer)
1da177e4
LT
3467{
3468 struct net_device *dev;
3469 struct sock *sk = sock->sk;
3470 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3471 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3472
3473 if (peer)
3474 return -EOPNOTSUPP;
3475
3476 sll->sll_family = AF_PACKET;
3477 sll->sll_ifindex = po->ifindex;
3478 sll->sll_protocol = po->num;
67286640 3479 sll->sll_pkttype = 0;
654d1f8a
ED
3480 rcu_read_lock();
3481 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3482 if (dev) {
3483 sll->sll_hatype = dev->type;
3484 sll->sll_halen = dev->addr_len;
3485 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3486 } else {
3487 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3488 sll->sll_halen = 0;
3489 }
654d1f8a 3490 rcu_read_unlock();
1da177e4 3491
9b2c45d4 3492 return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3493}
3494
2aeb0b88
WC
3495static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3496 int what)
1da177e4
LT
3497{
3498 switch (i->type) {
3499 case PACKET_MR_MULTICAST:
1162563f
JP
3500 if (i->alen != dev->addr_len)
3501 return -EINVAL;
1da177e4 3502 if (what > 0)
22bedad3 3503 return dev_mc_add(dev, i->addr);
1da177e4 3504 else
22bedad3 3505 return dev_mc_del(dev, i->addr);
1da177e4
LT
3506 break;
3507 case PACKET_MR_PROMISC:
2aeb0b88 3508 return dev_set_promiscuity(dev, what);
1da177e4 3509 case PACKET_MR_ALLMULTI:
2aeb0b88 3510 return dev_set_allmulti(dev, what);
d95ed927 3511 case PACKET_MR_UNICAST:
1162563f
JP
3512 if (i->alen != dev->addr_len)
3513 return -EINVAL;
d95ed927 3514 if (what > 0)
a748ee24 3515 return dev_uc_add(dev, i->addr);
d95ed927 3516 else
a748ee24 3517 return dev_uc_del(dev, i->addr);
d95ed927 3518 break;
40d4e3df
ED
3519 default:
3520 break;
1da177e4 3521 }
2aeb0b88 3522 return 0;
1da177e4
LT
3523}
3524
82f17091
FR
3525static void packet_dev_mclist_delete(struct net_device *dev,
3526 struct packet_mclist **mlp)
1da177e4 3527{
82f17091
FR
3528 struct packet_mclist *ml;
3529
3530 while ((ml = *mlp) != NULL) {
3531 if (ml->ifindex == dev->ifindex) {
3532 packet_dev_mc(dev, ml, -1);
3533 *mlp = ml->next;
3534 kfree(ml);
3535 } else
3536 mlp = &ml->next;
1da177e4
LT
3537 }
3538}
3539
0fb375fb 3540static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3541{
3542 struct packet_sock *po = pkt_sk(sk);
3543 struct packet_mclist *ml, *i;
3544 struct net_device *dev;
3545 int err;
3546
3547 rtnl_lock();
3548
3549 err = -ENODEV;
3b1e0a65 3550 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3551 if (!dev)
3552 goto done;
3553
3554 err = -EINVAL;
1162563f 3555 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3556 goto done;
3557
3558 err = -ENOBUFS;
8b3a7005 3559 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3560 if (i == NULL)
3561 goto done;
3562
3563 err = 0;
3564 for (ml = po->mclist; ml; ml = ml->next) {
3565 if (ml->ifindex == mreq->mr_ifindex &&
3566 ml->type == mreq->mr_type &&
3567 ml->alen == mreq->mr_alen &&
3568 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3569 ml->count++;
3570 /* Free the new element ... */
3571 kfree(i);
3572 goto done;
3573 }
3574 }
3575
3576 i->type = mreq->mr_type;
3577 i->ifindex = mreq->mr_ifindex;
3578 i->alen = mreq->mr_alen;
3579 memcpy(i->addr, mreq->mr_address, i->alen);
309cf37f 3580 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3581 i->count = 1;
3582 i->next = po->mclist;
3583 po->mclist = i;
2aeb0b88
WC
3584 err = packet_dev_mc(dev, i, 1);
3585 if (err) {
3586 po->mclist = i->next;
3587 kfree(i);
3588 }
1da177e4
LT
3589
3590done:
3591 rtnl_unlock();
3592 return err;
3593}
3594
0fb375fb 3595static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3596{
3597 struct packet_mclist *ml, **mlp;
3598
3599 rtnl_lock();
3600
3601 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3602 if (ml->ifindex == mreq->mr_ifindex &&
3603 ml->type == mreq->mr_type &&
3604 ml->alen == mreq->mr_alen &&
3605 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3606 if (--ml->count == 0) {
3607 struct net_device *dev;
3608 *mlp = ml->next;
ad959e76
ED
3609 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3610 if (dev)
1da177e4 3611 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3612 kfree(ml);
3613 }
82f17091 3614 break;
1da177e4
LT
3615 }
3616 }
3617 rtnl_unlock();
82f17091 3618 return 0;
1da177e4
LT
3619}
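
/* Editor's illustration, not part of af_packet.c: packet_mc_add() and
 * packet_mc_drop() above back the PACKET_ADD_MEMBERSHIP /
 * PACKET_DROP_MEMBERSHIP options. PACKET_MR_PROMISC is the reference-counted
 * way to hold an interface in promiscuous mode for the life of the socket.
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int set_promisc(int fd, int ifindex, int on)
{
        struct packet_mreq mreq;

        memset(&mreq, 0, sizeof(mreq));
        mreq.mr_ifindex = ifindex;
        mreq.mr_type    = PACKET_MR_PROMISC;

        return setsockopt(fd, SOL_PACKET,
                          on ? PACKET_ADD_MEMBERSHIP : PACKET_DROP_MEMBERSHIP,
                          &mreq, sizeof(mreq));
}
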
3620
3621static void packet_flush_mclist(struct sock *sk)
3622{
3623 struct packet_sock *po = pkt_sk(sk);
3624 struct packet_mclist *ml;
3625
3626 if (!po->mclist)
3627 return;
3628
3629 rtnl_lock();
3630 while ((ml = po->mclist) != NULL) {
3631 struct net_device *dev;
3632
3633 po->mclist = ml->next;
ad959e76
ED
3634 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3635 if (dev != NULL)
1da177e4 3636 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3637 kfree(ml);
3638 }
3639 rtnl_unlock();
3640}
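/* A minimal userspace sketch (not part of af_packet.c) of how the membership
 * machinery above is typically driven: PACKET_MR_PROMISC puts the interface
 * into promiscuous mode for as long as the membership is held, and
 * PACKET_DROP_MEMBERSHIP with the same packet_mreq releases it again.
 * Assumes "fd" is an AF_PACKET socket and "eth0" a stand-in interface name;
 * needs <sys/socket.h>, <linux/if_packet.h>, <net/if.h> and <string.h>.
 *
 *	struct packet_mreq mreq;
 *
 *	memset(&mreq, 0, sizeof(mreq));
 *	mreq.mr_ifindex = if_nametoindex("eth0");
 *	mreq.mr_type = PACKET_MR_PROMISC;
 *	if (setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
 *		perror("PACKET_ADD_MEMBERSHIP");
 */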
1da177e4
LT
3641
3642static int
b7058842 3643packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3644{
3645 struct sock *sk = sock->sk;
8dc41944 3646 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3647 int ret;
3648
3649 if (level != SOL_PACKET)
3650 return -ENOPROTOOPT;
3651
69e3c75f 3652 switch (optname) {
1ce4f28b 3653 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3654 case PACKET_DROP_MEMBERSHIP:
3655 {
0fb375fb
EB
3656 struct packet_mreq_max mreq;
3657 int len = optlen;
3658 memset(&mreq, 0, sizeof(mreq));
3659 if (len < sizeof(struct packet_mreq))
1da177e4 3660 return -EINVAL;
0fb375fb
EB
3661 if (len > sizeof(mreq))
3662 len = sizeof(mreq);
40d4e3df 3663 if (copy_from_user(&mreq, optval, len))
1da177e4 3664 return -EFAULT;
0fb375fb
EB
3665 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3666 return -EINVAL;
1da177e4
LT
3667 if (optname == PACKET_ADD_MEMBERSHIP)
3668 ret = packet_mc_add(sk, &mreq);
3669 else
3670 ret = packet_mc_drop(sk, &mreq);
3671 return ret;
3672 }
a2efcfa0 3673
1da177e4 3674 case PACKET_RX_RING:
69e3c75f 3675 case PACKET_TX_RING:
1da177e4 3676 {
f6fb8f10 3677 union tpacket_req_u req_u;
3678 int len;
1da177e4 3679
5171b37d 3680 lock_sock(sk);
f6fb8f10 3681 switch (po->tp_version) {
3682 case TPACKET_V1:
3683 case TPACKET_V2:
3684 len = sizeof(req_u.req);
3685 break;
3686 case TPACKET_V3:
3687 default:
3688 len = sizeof(req_u.req3);
3689 break;
3690 }
5171b37d
ED
3691 if (optlen < len) {
3692 ret = -EINVAL;
3693 } else {
3694 if (copy_from_user(&req_u.req, optval, len))
3695 ret = -EFAULT;
3696 else
3697 ret = packet_set_ring(sk, &req_u, 0,
3698 optname == PACKET_TX_RING);
3699 }
3700 release_sock(sk);
3701 return ret;
1da177e4
LT
3702 }
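/* A hedged userspace sketch (not part of af_packet.c) of the TPACKET_V3
 * receive-ring request handled above. The sizes are illustrative only and must
 * satisfy the checks in packet_set_ring(): page-aligned block size, frame size
 * a multiple of TPACKET_ALIGNMENT, and tp_frame_nr equal to frames-per-block
 * times tp_block_nr. PACKET_VERSION has to be set before the ring is created.
 *
 *	int version = TPACKET_V3;
 *	struct tpacket_req3 req3;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version));
 *	memset(&req3, 0, sizeof(req3));
 *	req3.tp_block_size = 1 << 22;
 *	req3.tp_block_nr = 64;
 *	req3.tp_frame_size = 1 << 11;
 *	req3.tp_frame_nr = (req3.tp_block_size / req3.tp_frame_size) * req3.tp_block_nr;
 *	req3.tp_retire_blk_tov = 60;
 *	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req3, sizeof(req3)) < 0)
 *		perror("PACKET_RX_RING");
 */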
3703 case PACKET_COPY_THRESH:
3704 {
3705 int val;
3706
40d4e3df 3707 if (optlen != sizeof(val))
1da177e4 3708 return -EINVAL;
40d4e3df 3709 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3710 return -EFAULT;
3711
3712 pkt_sk(sk)->copy_thresh = val;
3713 return 0;
3714 }
bbd6ef87
PM
3715 case PACKET_VERSION:
3716 {
3717 int val;
3718
3719 if (optlen != sizeof(val))
3720 return -EINVAL;
bbd6ef87
PM
3721 if (copy_from_user(&val, optval, sizeof(val)))
3722 return -EFAULT;
3723 switch (val) {
3724 case TPACKET_V1:
3725 case TPACKET_V2:
f6fb8f10 3726 case TPACKET_V3:
84ac7260 3727 break;
bbd6ef87
PM
3728 default:
3729 return -EINVAL;
3730 }
84ac7260
PP
3731 lock_sock(sk);
3732 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3733 ret = -EBUSY;
3734 } else {
3735 po->tp_version = val;
3736 ret = 0;
3737 }
3738 release_sock(sk);
3739 return ret;
bbd6ef87 3740 }
8913336a
PM
3741 case PACKET_RESERVE:
3742 {
3743 unsigned int val;
3744
3745 if (optlen != sizeof(val))
3746 return -EINVAL;
8913336a
PM
3747 if (copy_from_user(&val, optval, sizeof(val)))
3748 return -EFAULT;
bcc5364b
AK
3749 if (val > INT_MAX)
3750 return -EINVAL;
c27927e3
WB
3751 lock_sock(sk);
3752 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3753 ret = -EBUSY;
3754 } else {
3755 po->tp_reserve = val;
3756 ret = 0;
3757 }
3758 release_sock(sk);
3759 return ret;
8913336a 3760 }
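/* Illustrative userspace use of PACKET_RESERVE (not part of af_packet.c): the
 * value reserves extra headroom in every ring frame between the tpacket header
 * and the packet data, and must be configured before the ring exists, exactly
 * as the pg_vec check above enforces.
 *
 *	unsigned int reserve = 16;
 *	setsockopt(fd, SOL_PACKET, PACKET_RESERVE, &reserve, sizeof(reserve));
 */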
69e3c75f
JB
3761 case PACKET_LOSS:
3762 {
3763 unsigned int val;
3764
3765 if (optlen != sizeof(val))
3766 return -EINVAL;
69e3c75f
JB
3767 if (copy_from_user(&val, optval, sizeof(val)))
3768 return -EFAULT;
a6361f0c
WB
3769
3770 lock_sock(sk);
3771 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3772 ret = -EBUSY;
3773 } else {
3774 po->tp_loss = !!val;
3775 ret = 0;
3776 }
3777 release_sock(sk);
3778 return ret;
69e3c75f 3779 }
8dc41944
HX
3780 case PACKET_AUXDATA:
3781 {
3782 int val;
3783
3784 if (optlen < sizeof(val))
3785 return -EINVAL;
3786 if (copy_from_user(&val, optval, sizeof(val)))
3787 return -EFAULT;
3788
a6361f0c 3789 lock_sock(sk);
8dc41944 3790 po->auxdata = !!val;
a6361f0c 3791 release_sock(sk);
8dc41944
HX
3792 return 0;
3793 }
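/* A userspace sketch (not part of af_packet.c) of the PACKET_AUXDATA side
 * channel: once enabled, recvmsg() delivers a SOL_PACKET/PACKET_AUXDATA control
 * message carrying a struct tpacket_auxdata (snapshot and original lengths,
 * VLAN tag, and so on). "msg" is assumed to be a caller-prepared struct msghdr
 * with a control buffer attached.
 *
 *	int one = 1;
 *	struct cmsghdr *cmsg;
 *	struct tpacket_auxdata *aux;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
 *	recvmsg(fd, &msg, 0);
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *		if (cmsg->cmsg_level == SOL_PACKET && cmsg->cmsg_type == PACKET_AUXDATA)
 *			aux = (struct tpacket_auxdata *)CMSG_DATA(cmsg);
 */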
80feaacb
PWJ
3794 case PACKET_ORIGDEV:
3795 {
3796 int val;
3797
3798 if (optlen < sizeof(val))
3799 return -EINVAL;
3800 if (copy_from_user(&val, optval, sizeof(val)))
3801 return -EFAULT;
3802
a6361f0c 3803 lock_sock(sk);
80feaacb 3804 po->origdev = !!val;
a6361f0c 3805 release_sock(sk);
80feaacb
PWJ
3806 return 0;
3807 }
bfd5f4a3
SS
3808 case PACKET_VNET_HDR:
3809 {
3810 int val;
3811
3812 if (sock->type != SOCK_RAW)
3813 return -EINVAL;
bfd5f4a3
SS
3814 if (optlen < sizeof(val))
3815 return -EINVAL;
3816 if (copy_from_user(&val, optval, sizeof(val)))
3817 return -EFAULT;
3818
a6361f0c
WB
3819 lock_sock(sk);
3820 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3821 ret = -EBUSY;
3822 } else {
3823 po->has_vnet_hdr = !!val;
3824 ret = 0;
3825 }
3826 release_sock(sk);
3827 return ret;
bfd5f4a3 3828 }
614f60fa
SM
3829 case PACKET_TIMESTAMP:
3830 {
3831 int val;
3832
3833 if (optlen != sizeof(val))
3834 return -EINVAL;
3835 if (copy_from_user(&val, optval, sizeof(val)))
3836 return -EFAULT;
3837
3838 po->tp_tstamp = val;
3839 return 0;
3840 }
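/* Illustrative use of PACKET_TIMESTAMP (not part of af_packet.c). The value is
 * a mask of SOF_TIMESTAMPING_* flags from <linux/net_tstamp.h> selecting which
 * timestamp source is reported for ring frames.
 *
 *	int tstamp = SOF_TIMESTAMPING_RAW_HARDWARE;
 *	setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, &tstamp, sizeof(tstamp));
 */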
dc99f600
DM
3841 case PACKET_FANOUT:
3842 {
3843 int val;
3844
3845 if (optlen != sizeof(val))
3846 return -EINVAL;
3847 if (copy_from_user(&val, optval, sizeof(val)))
3848 return -EFAULT;
3849
3850 return fanout_add(sk, val & 0xffff, val >> 16);
3851 }
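/* A userspace sketch (not part of af_packet.c) matching the "val & 0xffff" /
 * "val >> 16" split above: the low 16 bits carry the fanout group id, the high
 * 16 bits the fanout mode and flags. The id 42 is an arbitrary example shared
 * by every socket that should join the same group.
 *
 *	int fanout_arg = 42 | (PACKET_FANOUT_HASH << 16);
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &fanout_arg, sizeof(fanout_arg));
 */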
47dceb8e
WB
3852 case PACKET_FANOUT_DATA:
3853 {
3854 if (!po->fanout)
3855 return -EINVAL;
3856
3857 return fanout_set_data(po, optval, optlen);
3858 }
fa788d98
VW
3859 case PACKET_IGNORE_OUTGOING:
3860 {
3861 int val;
3862
3863 if (optlen != sizeof(val))
3864 return -EINVAL;
3865 if (copy_from_user(&val, optval, sizeof(val)))
3866 return -EFAULT;
3867 if (val < 0 || val > 1)
3868 return -EINVAL;
3869
3870 po->prot_hook.ignore_outgoing = !!val;
3871 return 0;
3872 }
5920cd3a
PC
3873 case PACKET_TX_HAS_OFF:
3874 {
3875 unsigned int val;
3876
3877 if (optlen != sizeof(val))
3878 return -EINVAL;
5920cd3a
PC
3879 if (copy_from_user(&val, optval, sizeof(val)))
3880 return -EFAULT;
a6361f0c
WB
3881
3882 lock_sock(sk);
3883 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3884 ret = -EBUSY;
3885 } else {
3886 po->tp_tx_has_off = !!val;
3887 ret = 0;
3888 }
3889 release_sock(sk);
5920cd3a
PC
3890 return ret;
3891 }
d346a3fa
DB
3892 case PACKET_QDISC_BYPASS:
3893 {
3894 int val;
3895
3896 if (optlen != sizeof(val))
3897 return -EINVAL;
3898 if (copy_from_user(&val, optval, sizeof(val)))
3899 return -EFAULT;
3900
3901 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3902 return 0;
3903 }
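/* Illustrative use of PACKET_QDISC_BYPASS (not part of af_packet.c): a non-zero
 * value switches po->xmit to packet_direct_xmit(), so transmitted frames skip
 * the qdisc layer and go straight to the driver, where they may be dropped if
 * the device queue is busy.
 *
 *	int bypass = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &bypass, sizeof(bypass));
 */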
1da177e4
LT
3904 default:
3905 return -ENOPROTOOPT;
3906 }
3907}
3908
3909static int packet_getsockopt(struct socket *sock, int level, int optname,
3910 char __user *optval, int __user *optlen)
3911{
3912 int len;
c06fff6e 3913 int val, lv = sizeof(val);
1da177e4
LT
3914 struct sock *sk = sock->sk;
3915 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3916 void *data = &val;
ee80fbf3 3917 union tpacket_stats_u st;
a9b63918 3918 struct tpacket_rollover_stats rstats;
8e8e2951 3919 int drops;
1da177e4
LT
3920
3921 if (level != SOL_PACKET)
3922 return -ENOPROTOOPT;
3923
8ae55f04
KK
3924 if (get_user(len, optlen))
3925 return -EFAULT;
1da177e4
LT
3926
3927 if (len < 0)
3928 return -EINVAL;
1ce4f28b 3929
69e3c75f 3930 switch (optname) {
1da177e4 3931 case PACKET_STATISTICS:
1da177e4 3932 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3933 memcpy(&st, &po->stats, sizeof(st));
3934 memset(&po->stats, 0, sizeof(po->stats));
3935 spin_unlock_bh(&sk->sk_receive_queue.lock);
8e8e2951 3936 drops = atomic_xchg(&po->tp_drops, 0);
ee80fbf3 3937
f6fb8f10 3938 if (po->tp_version == TPACKET_V3) {
c06fff6e 3939 lv = sizeof(struct tpacket_stats_v3);
8e8e2951
ED
3940 st.stats3.tp_drops = drops;
3941 st.stats3.tp_packets += drops;
ee80fbf3 3942 data = &st.stats3;
f6fb8f10 3943 } else {
c06fff6e 3944 lv = sizeof(struct tpacket_stats);
8e8e2951
ED
3945 st.stats1.tp_drops = drops;
3946 st.stats1.tp_packets += drops;
ee80fbf3 3947 data = &st.stats1;
f6fb8f10 3948 }
ee80fbf3 3949
8dc41944
HX
3950 break;
3951 case PACKET_AUXDATA:
8dc41944 3952 val = po->auxdata;
80feaacb
PWJ
3953 break;
3954 case PACKET_ORIGDEV:
80feaacb 3955 val = po->origdev;
bfd5f4a3
SS
3956 break;
3957 case PACKET_VNET_HDR:
bfd5f4a3 3958 val = po->has_vnet_hdr;
1da177e4 3959 break;
bbd6ef87 3960 case PACKET_VERSION:
bbd6ef87 3961 val = po->tp_version;
bbd6ef87
PM
3962 break;
3963 case PACKET_HDRLEN:
3964 if (len > sizeof(int))
3965 len = sizeof(int);
fd2c83b3
AP
3966 if (len < sizeof(int))
3967 return -EINVAL;
bbd6ef87
PM
3968 if (copy_from_user(&val, optval, len))
3969 return -EFAULT;
3970 switch (val) {
3971 case TPACKET_V1:
3972 val = sizeof(struct tpacket_hdr);
3973 break;
3974 case TPACKET_V2:
3975 val = sizeof(struct tpacket2_hdr);
3976 break;
f6fb8f10 3977 case TPACKET_V3:
3978 val = sizeof(struct tpacket3_hdr);
3979 break;
bbd6ef87
PM
3980 default:
3981 return -EINVAL;
3982 }
bbd6ef87 3983 break;
8913336a 3984 case PACKET_RESERVE:
8913336a 3985 val = po->tp_reserve;
8913336a 3986 break;
69e3c75f 3987 case PACKET_LOSS:
69e3c75f 3988 val = po->tp_loss;
69e3c75f 3989 break;
614f60fa 3990 case PACKET_TIMESTAMP:
614f60fa 3991 val = po->tp_tstamp;
614f60fa 3992 break;
dc99f600 3993 case PACKET_FANOUT:
dc99f600
DM
3994 val = (po->fanout ?
3995 ((u32)po->fanout->id |
77f65ebd
WB
3996 ((u32)po->fanout->type << 16) |
3997 ((u32)po->fanout->flags << 24)) :
dc99f600 3998 0);
dc99f600 3999 break;
fa788d98
VW
4000 case PACKET_IGNORE_OUTGOING:
4001 val = po->prot_hook.ignore_outgoing;
4002 break;
a9b63918 4003 case PACKET_ROLLOVER_STATS:
57f015f5 4004 if (!po->rollover)
a9b63918 4005 return -EINVAL;
57f015f5
MM
4006 rstats.tp_all = atomic_long_read(&po->rollover->num);
4007 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
4008 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
4009 data = &rstats;
4010 lv = sizeof(rstats);
a9b63918 4011 break;
5920cd3a
PC
4012 case PACKET_TX_HAS_OFF:
4013 val = po->tp_tx_has_off;
4014 break;
d346a3fa
DB
4015 case PACKET_QDISC_BYPASS:
4016 val = packet_use_direct_xmit(po);
4017 break;
1da177e4
LT
4018 default:
4019 return -ENOPROTOOPT;
4020 }
4021
c06fff6e
ED
4022 if (len > lv)
4023 len = lv;
8ae55f04
KK
4024 if (put_user(len, optlen))
4025 return -EFAULT;
8dc41944
HX
4026 if (copy_to_user(optval, data, len))
4027 return -EFAULT;
8ae55f04 4028 return 0;
1da177e4
LT
4029}
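/* A userspace sketch (not part of af_packet.c) for PACKET_STATISTICS. As
 * implemented above, reading the counters also resets them, and the structure
 * returned depends on the negotiated TPACKET version (struct tpacket_stats for
 * V1/V2 sockets).
 *
 *	struct tpacket_stats_v3 st;
 *	socklen_t len = sizeof(st);
 *
 *	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
 *		printf("received %u, dropped %u\n", st.tp_packets, st.tp_drops);
 */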
4030
4031
719c44d3
WB
4032#ifdef CONFIG_COMPAT
4033static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
4034 char __user *optval, unsigned int optlen)
4035{
4036 struct packet_sock *po = pkt_sk(sock->sk);
4037
4038 if (level != SOL_PACKET)
4039 return -ENOPROTOOPT;
4040
4041 if (optname == PACKET_FANOUT_DATA &&
4042 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
4043 optval = (char __user *)get_compat_bpf_fprog(optval);
4044 if (!optval)
4045 return -EFAULT;
4046 optlen = sizeof(struct sock_fprog);
4047 }
4048
4049 return packet_setsockopt(sock, level, optname, optval, optlen);
4050}
4051#endif
4052
351638e7
JP
4053static int packet_notifier(struct notifier_block *this,
4054 unsigned long msg, void *ptr)
1da177e4
LT
4055{
4056 struct sock *sk;
351638e7 4057 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 4058 struct net *net = dev_net(dev);
1da177e4 4059
808f5114 4060 rcu_read_lock();
b67bfe0d 4061 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
4062 struct packet_sock *po = pkt_sk(sk);
4063
4064 switch (msg) {
4065 case NETDEV_UNREGISTER:
1da177e4 4066 if (po->mclist)
82f17091 4067 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
4068 /* fallthrough */
4069
1da177e4
LT
4070 case NETDEV_DOWN:
4071 if (dev->ifindex == po->ifindex) {
4072 spin_lock(&po->bind_lock);
4073 if (po->running) {
ce06b03e 4074 __unregister_prot_hook(sk, false);
1da177e4
LT
4075 sk->sk_err = ENETDOWN;
4076 if (!sock_flag(sk, SOCK_DEAD))
4077 sk->sk_error_report(sk);
4078 }
4079 if (msg == NETDEV_UNREGISTER) {
66e56cd4 4080 packet_cached_dev_reset(po);
1da177e4 4081 po->ifindex = -1;
160ff18a
BG
4082 if (po->prot_hook.dev)
4083 dev_put(po->prot_hook.dev);
1da177e4
LT
4084 po->prot_hook.dev = NULL;
4085 }
4086 spin_unlock(&po->bind_lock);
4087 }
4088 break;
4089 case NETDEV_UP:
808f5114 4090 if (dev->ifindex == po->ifindex) {
4091 spin_lock(&po->bind_lock);
ce06b03e
DM
4092 if (po->num)
4093 register_prot_hook(sk);
808f5114 4094 spin_unlock(&po->bind_lock);
1da177e4 4095 }
1da177e4
LT
4096 break;
4097 }
4098 }
808f5114 4099 rcu_read_unlock();
1da177e4
LT
4100 return NOTIFY_DONE;
4101}
4102
4103
4104static int packet_ioctl(struct socket *sock, unsigned int cmd,
4105 unsigned long arg)
4106{
4107 struct sock *sk = sock->sk;
4108
69e3c75f 4109 switch (cmd) {
40d4e3df
ED
4110 case SIOCOUTQ:
4111 {
4112 int amount = sk_wmem_alloc_get(sk);
31e6d363 4113
40d4e3df
ED
4114 return put_user(amount, (int __user *)arg);
4115 }
4116 case SIOCINQ:
4117 {
4118 struct sk_buff *skb;
4119 int amount = 0;
4120
4121 spin_lock_bh(&sk->sk_receive_queue.lock);
4122 skb = skb_peek(&sk->sk_receive_queue);
4123 if (skb)
4124 amount = skb->len;
4125 spin_unlock_bh(&sk->sk_receive_queue.lock);
4126 return put_user(amount, (int __user *)arg);
4127 }
1da177e4 4128#ifdef CONFIG_INET
40d4e3df
ED
4129 case SIOCADDRT:
4130 case SIOCDELRT:
4131 case SIOCDARP:
4132 case SIOCGARP:
4133 case SIOCSARP:
4134 case SIOCGIFADDR:
4135 case SIOCSIFADDR:
4136 case SIOCGIFBRDADDR:
4137 case SIOCSIFBRDADDR:
4138 case SIOCGIFNETMASK:
4139 case SIOCSIFNETMASK:
4140 case SIOCGIFDSTADDR:
4141 case SIOCSIFDSTADDR:
4142 case SIOCSIFFLAGS:
40d4e3df 4143 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
4144#endif
4145
40d4e3df
ED
4146 default:
4147 return -ENOIOCTLCMD;
1da177e4
LT
4148 }
4149 return 0;
4150}
4151
a11e1d43
LT
4152static __poll_t packet_poll(struct file *file, struct socket *sock,
4153 poll_table *wait)
1da177e4
LT
4154{
4155 struct sock *sk = sock->sk;
4156 struct packet_sock *po = pkt_sk(sk);
a11e1d43 4157 __poll_t mask = datagram_poll(file, sock, wait);
1da177e4
LT
4158
4159 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 4160 if (po->rx_ring.pg_vec) {
f6fb8f10 4161 if (!packet_previous_rx_frame(po, &po->rx_ring,
4162 TP_STATUS_KERNEL))
a9a08845 4163 mask |= EPOLLIN | EPOLLRDNORM;
1da177e4 4164 }
9bb6cd65 4165 packet_rcv_try_clear_pressure(po);
1da177e4 4166 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
4167 spin_lock_bh(&sk->sk_write_queue.lock);
4168 if (po->tx_ring.pg_vec) {
4169 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
a9a08845 4170 mask |= EPOLLOUT | EPOLLWRNORM;
69e3c75f
JB
4171 }
4172 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
4173 return mask;
4174}
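/* A hedged userspace sketch (not part of af_packet.c) of the poll() pattern the
 * rx-ring branch above serves: EPOLLIN is reported once the most recently
 * filled frame has left TP_STATUS_KERNEL. "ring" is assumed to be a char
 * pointer to a mmap()ed TPACKET_V2 ring, "frame_size", "frame_nr" and
 * "frame_idx" the reader's bookkeeping, and consume_frame() a hypothetical
 * helper.
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLERR };
 *	struct tpacket2_hdr *hdr;
 *
 *	poll(&pfd, 1, -1);
 *	hdr = (struct tpacket2_hdr *)(ring + frame_idx * frame_size);
 *	while (hdr->tp_status & TP_STATUS_USER) {
 *		consume_frame(hdr);
 *		hdr->tp_status = TP_STATUS_KERNEL;
 *		frame_idx = (frame_idx + 1) % frame_nr;
 *		hdr = (struct tpacket2_hdr *)(ring + frame_idx * frame_size);
 *	}
 */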
4175
4176
4177/* Dirty? Well, I still have not found a better way to account
4178 * for user mmaps.
4179 */
4180
4181static void packet_mm_open(struct vm_area_struct *vma)
4182{
4183 struct file *file = vma->vm_file;
40d4e3df 4184 struct socket *sock = file->private_data;
1da177e4 4185 struct sock *sk = sock->sk;
1ce4f28b 4186
1da177e4
LT
4187 if (sk)
4188 atomic_inc(&pkt_sk(sk)->mapped);
4189}
4190
4191static void packet_mm_close(struct vm_area_struct *vma)
4192{
4193 struct file *file = vma->vm_file;
40d4e3df 4194 struct socket *sock = file->private_data;
1da177e4 4195 struct sock *sk = sock->sk;
1ce4f28b 4196
1da177e4
LT
4197 if (sk)
4198 atomic_dec(&pkt_sk(sk)->mapped);
4199}
4200
f0f37e2f 4201static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
4202 .open = packet_mm_open,
4203 .close = packet_mm_close,
1da177e4
LT
4204};
4205
3a7ad063
ED
4206static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4207 unsigned int len)
1da177e4
LT
4208{
4209 int i;
4210
4ebf0ae2 4211 for (i = 0; i < len; i++) {
0e3125c7 4212 if (likely(pg_vec[i].buffer)) {
3a7ad063
ED
4213 if (is_vmalloc_addr(pg_vec[i].buffer))
4214 vfree(pg_vec[i].buffer);
4215 else
4216 free_pages((unsigned long)pg_vec[i].buffer,
4217 order);
0e3125c7
NH
4218 pg_vec[i].buffer = NULL;
4219 }
1da177e4
LT
4220 }
4221 kfree(pg_vec);
4222}
4223
3a7ad063 4224static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 4225{
f0d4eb29 4226 char *buffer;
3a7ad063
ED
4227 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4228 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
0e3125c7 4229
3a7ad063 4230 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4231 if (buffer)
4232 return buffer;
4233
3a7ad063
ED
4234 /* __get_free_pages failed, fall back to vmalloc */
4235 buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
4236 if (buffer)
4237 return buffer;
0e3125c7 4238
3a7ad063
ED
4239 /* vmalloc failed, let's dig into swap here */
4240 gfp_flags &= ~__GFP_NORETRY;
4241 buffer = (char *) __get_free_pages(gfp_flags, order);
4242 if (buffer)
4243 return buffer;
4244
4245 /* complete and utter failure */
4246 return NULL;
4ebf0ae2
DM
4247}
4248
3a7ad063 4249static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
4250{
4251 unsigned int block_nr = req->tp_block_nr;
0e3125c7 4252 struct pgv *pg_vec;
4ebf0ae2
DM
4253 int i;
4254
398f0132 4255 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
4ebf0ae2
DM
4256 if (unlikely(!pg_vec))
4257 goto out;
4258
4259 for (i = 0; i < block_nr; i++) {
3a7ad063 4260 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 4261 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
4262 goto out_free_pgvec;
4263 }
4264
4265out:
4266 return pg_vec;
4267
4268out_free_pgvec:
3a7ad063 4269 free_pg_vec(pg_vec, order, block_nr);
4ebf0ae2
DM
4270 pg_vec = NULL;
4271 goto out;
4272}
1da177e4 4273
f6fb8f10 4274static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 4275 int closing, int tx_ring)
1da177e4 4276{
0e3125c7 4277 struct pgv *pg_vec = NULL;
1da177e4 4278 struct packet_sock *po = pkt_sk(sk);
3a7ad063 4279 int was_running, order = 0;
69e3c75f
JB
4280 struct packet_ring_buffer *rb;
4281 struct sk_buff_head *rb_queue;
0e11c91e 4282 __be16 num;
f6fb8f10 4283 int err = -EINVAL;
4284 /* Alias added to keep code churn minimal */
4285 struct tpacket_req *req = &req_u->req;
4286
69e3c75f
JB
4287 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4288 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 4289
69e3c75f
JB
4290 err = -EBUSY;
4291 if (!closing) {
4292 if (atomic_read(&po->mapped))
4293 goto out;
b0138408 4294 if (packet_read_pending(rb))
69e3c75f
JB
4295 goto out;
4296 }
1da177e4 4297
69e3c75f 4298 if (req->tp_block_nr) {
4576cd46
WB
4299 unsigned int min_frame_size;
4300
69e3c75f
JB
4301 /* Sanity tests and some calculations */
4302 err = -EBUSY;
4303 if (unlikely(rb->pg_vec))
4304 goto out;
1da177e4 4305
bbd6ef87
PM
4306 switch (po->tp_version) {
4307 case TPACKET_V1:
4308 po->tp_hdrlen = TPACKET_HDRLEN;
4309 break;
4310 case TPACKET_V2:
4311 po->tp_hdrlen = TPACKET2_HDRLEN;
4312 break;
f6fb8f10 4313 case TPACKET_V3:
4314 po->tp_hdrlen = TPACKET3_HDRLEN;
4315 break;
bbd6ef87
PM
4316 }
4317
69e3c75f 4318 err = -EINVAL;
4ebf0ae2 4319 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 4320 goto out;
90836b67 4321 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
69e3c75f 4322 goto out;
4576cd46 4323 min_frame_size = po->tp_hdrlen + po->tp_reserve;
dc808110 4324 if (po->tp_version >= TPACKET_V3 &&
4576cd46
WB
4325 req->tp_block_size <
4326 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
dc808110 4327 goto out;
4576cd46 4328 if (unlikely(req->tp_frame_size < min_frame_size))
69e3c75f 4329 goto out;
4ebf0ae2 4330 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 4331 goto out;
1da177e4 4332
4194b491
TK
4333 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4334 if (unlikely(rb->frames_per_block == 0))
69e3c75f 4335 goto out;
fc62814d 4336 if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
8f8d28e4 4337 goto out;
69e3c75f
JB
4338 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4339 req->tp_frame_nr))
4340 goto out;
1da177e4
LT
4341
4342 err = -ENOMEM;
3a7ad063
ED
4343 order = get_order(req->tp_block_size);
4344 pg_vec = alloc_pg_vec(req, order);
4ebf0ae2 4345 if (unlikely(!pg_vec))
1da177e4 4346 goto out;
f6fb8f10 4347 switch (po->tp_version) {
4348 case TPACKET_V3:
7f953ab2
SV
4349 /* Block transmit is not supported yet */
4350 if (!tx_ring) {
e8e85cc5 4351 init_prb_bdqc(po, rb, pg_vec, req_u);
7f953ab2
SV
4352 } else {
4353 struct tpacket_req3 *req3 = &req_u->req3;
4354
4355 if (req3->tp_retire_blk_tov ||
4356 req3->tp_sizeof_priv ||
4357 req3->tp_feature_req_word) {
4358 err = -EINVAL;
55655e3d 4359 goto out_free_pg_vec;
7f953ab2
SV
4360 }
4361 }
d7cf0c34 4362 break;
f6fb8f10 4363 default:
4364 break;
4365 }
69e3c75f
JB
4366 }
4367 /* Done */
4368 else {
4369 err = -EINVAL;
4ebf0ae2 4370 if (unlikely(req->tp_frame_nr))
69e3c75f 4371 goto out;
1da177e4
LT
4372 }
4373
1da177e4
LT
4374
4375 /* Detach socket from network */
4376 spin_lock(&po->bind_lock);
4377 was_running = po->running;
4378 num = po->num;
4379 if (was_running) {
1da177e4 4380 po->num = 0;
ce06b03e 4381 __unregister_prot_hook(sk, false);
1da177e4
LT
4382 }
4383 spin_unlock(&po->bind_lock);
1ce4f28b 4384
1da177e4
LT
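	/* Wait for receive paths that may still be running on the old prot hook
	 * before the ring buffers are swapped below.
	 */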
4385 synchronize_net();
4386
4387 err = -EBUSY;
905db440 4388 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
4389 if (closing || atomic_read(&po->mapped) == 0) {
4390 err = 0;
69e3c75f 4391 spin_lock_bh(&rb_queue->lock);
c053fd96 4392 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
4393 rb->frame_max = (req->tp_frame_nr - 1);
4394 rb->head = 0;
4395 rb->frame_size = req->tp_frame_size;
4396 spin_unlock_bh(&rb_queue->lock);
4397
3a7ad063 4398 swap(rb->pg_vec_order, order);
c053fd96 4399 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
4400
4401 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4402 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4403 tpacket_rcv : packet_rcv;
4404 skb_queue_purge(rb_queue);
1da177e4 4405 if (atomic_read(&po->mapped))
40d4e3df
ED
4406 pr_err("packet_mmap: vma is busy: %d\n",
4407 atomic_read(&po->mapped));
1da177e4 4408 }
905db440 4409 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4410
4411 spin_lock(&po->bind_lock);
ce06b03e 4412 if (was_running) {
1da177e4 4413 po->num = num;
ce06b03e 4414 register_prot_hook(sk);
1da177e4
LT
4415 }
4416 spin_unlock(&po->bind_lock);
c800aaf8 4417 if (pg_vec && (po->tp_version > TPACKET_V2)) {
f6fb8f10 4418 /* Because we don't support block-based V3 on tx-ring */
4419 if (!tx_ring)
73d0fcf2 4420 prb_shutdown_retire_blk_timer(po, rb_queue);
f6fb8f10 4421 }
1da177e4 4422
55655e3d 4423out_free_pg_vec:
1da177e4 4424 if (pg_vec)
3a7ad063 4425 free_pg_vec(pg_vec, order, req->tp_block_nr);
1da177e4
LT
4426out:
4427 return err;
4428}
4429
69e3c75f
JB
4430static int packet_mmap(struct file *file, struct socket *sock,
4431 struct vm_area_struct *vma)
1da177e4
LT
4432{
4433 struct sock *sk = sock->sk;
4434 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
4435 unsigned long size, expected_size;
4436 struct packet_ring_buffer *rb;
1da177e4
LT
4437 unsigned long start;
4438 int err = -EINVAL;
4439 int i;
4440
4441 if (vma->vm_pgoff)
4442 return -EINVAL;
4443
905db440 4444 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
4445
4446 expected_size = 0;
4447 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4448 if (rb->pg_vec) {
4449 expected_size += rb->pg_vec_len
4450 * rb->pg_vec_pages
4451 * PAGE_SIZE;
4452 }
4453 }
4454
4455 if (expected_size == 0)
1da177e4 4456 goto out;
69e3c75f
JB
4457
4458 size = vma->vm_end - vma->vm_start;
4459 if (size != expected_size)
1da177e4
LT
4460 goto out;
4461
1da177e4 4462 start = vma->vm_start;
69e3c75f
JB
4463 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4464 if (rb->pg_vec == NULL)
4465 continue;
4466
4467 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
4468 struct page *page;
4469 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4470 int pg_num;
4471
c56b4d90
CG
4472 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4473 page = pgv_to_page(kaddr);
69e3c75f
JB
4474 err = vm_insert_page(vma, start, page);
4475 if (unlikely(err))
4476 goto out;
4477 start += PAGE_SIZE;
0e3125c7 4478 kaddr += PAGE_SIZE;
69e3c75f 4479 }
4ebf0ae2 4480 }
1da177e4 4481 }
69e3c75f 4482
4ebf0ae2 4483 atomic_inc(&po->mapped);
1da177e4
LT
4484 vma->vm_ops = &packet_mmap_ops;
4485 err = 0;
4486
4487out:
905db440 4488 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4489 return err;
4490}
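/* A userspace sketch (not part of af_packet.c) of mapping the ring(s) created
 * earlier. The requested length must equal the combined size of the configured
 * rx and tx rings and the offset must be zero; when both rings exist, the rx
 * ring is mapped first, immediately followed by the tx ring. "req3" refers to
 * the request used when the ring was set up.
 *
 *	size_t ring_len = (size_t)req3.tp_block_size * req3.tp_block_nr;
 *	char *ring = mmap(NULL, ring_len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 *	if (ring == MAP_FAILED)
 *		perror("mmap");
 */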
1da177e4 4491
90ddc4f0 4492static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4493 .family = PF_PACKET,
4494 .owner = THIS_MODULE,
4495 .release = packet_release,
4496 .bind = packet_bind_spkt,
4497 .connect = sock_no_connect,
4498 .socketpair = sock_no_socketpair,
4499 .accept = sock_no_accept,
4500 .getname = packet_getname_spkt,
a11e1d43 4501 .poll = datagram_poll,
1da177e4 4502 .ioctl = packet_ioctl,
c7cbdbf2 4503 .gettstamp = sock_gettstamp,
1da177e4
LT
4504 .listen = sock_no_listen,
4505 .shutdown = sock_no_shutdown,
4506 .setsockopt = sock_no_setsockopt,
4507 .getsockopt = sock_no_getsockopt,
4508 .sendmsg = packet_sendmsg_spkt,
4509 .recvmsg = packet_recvmsg,
4510 .mmap = sock_no_mmap,
4511 .sendpage = sock_no_sendpage,
4512};
1da177e4 4513
90ddc4f0 4514static const struct proto_ops packet_ops = {
1da177e4
LT
4515 .family = PF_PACKET,
4516 .owner = THIS_MODULE,
4517 .release = packet_release,
4518 .bind = packet_bind,
4519 .connect = sock_no_connect,
4520 .socketpair = sock_no_socketpair,
4521 .accept = sock_no_accept,
1ce4f28b 4522 .getname = packet_getname,
a11e1d43 4523 .poll = packet_poll,
1da177e4 4524 .ioctl = packet_ioctl,
c7cbdbf2 4525 .gettstamp = sock_gettstamp,
1da177e4
LT
4526 .listen = sock_no_listen,
4527 .shutdown = sock_no_shutdown,
4528 .setsockopt = packet_setsockopt,
4529 .getsockopt = packet_getsockopt,
719c44d3
WB
4530#ifdef CONFIG_COMPAT
4531 .compat_setsockopt = compat_packet_setsockopt,
4532#endif
1da177e4
LT
4533 .sendmsg = packet_sendmsg,
4534 .recvmsg = packet_recvmsg,
4535 .mmap = packet_mmap,
4536 .sendpage = sock_no_sendpage,
4537};
4538
ec1b4cf7 4539static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4540 .family = PF_PACKET,
4541 .create = packet_create,
4542 .owner = THIS_MODULE,
4543};
4544
4545static struct notifier_block packet_netdev_notifier = {
40d4e3df 4546 .notifier_call = packet_notifier,
1da177e4
LT
4547};
4548
4549#ifdef CONFIG_PROC_FS
1da177e4
LT
4550
4551static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4552 __acquires(RCU)
1da177e4 4553{
e372c414 4554 struct net *net = seq_file_net(seq);
808f5114 4555
4556 rcu_read_lock();
4557 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4558}
4559
4560static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4561{
1bf40954 4562 struct net *net = seq_file_net(seq);
808f5114 4563 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4564}
4565
4566static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4567 __releases(RCU)
1da177e4 4568{
808f5114 4569 rcu_read_unlock();
1da177e4
LT
4570}
4571
1ce4f28b 4572static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4573{
4574 if (v == SEQ_START_TOKEN)
4575 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4576 else {
b7ceabd9 4577 struct sock *s = sk_entry(v);
1da177e4
LT
4578 const struct packet_sock *po = pkt_sk(s);
4579
4580 seq_printf(seq,
71338aa7 4581 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4 4582 s,
41c6d650 4583 refcount_read(&s->sk_refcnt),
1da177e4
LT
4584 s->sk_type,
4585 ntohs(po->num),
4586 po->ifindex,
4587 po->running,
4588 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4589 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4590 sock_i_ino(s));
1da177e4
LT
4591 }
4592
4593 return 0;
4594}
4595
56b3d975 4596static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4597 .start = packet_seq_start,
4598 .next = packet_seq_next,
4599 .stop = packet_seq_stop,
4600 .show = packet_seq_show,
4601};
1da177e4
LT
4602#endif
4603
2c8c1e72 4604static int __net_init packet_net_init(struct net *net)
d12d01d6 4605{
0fa7fa98 4606 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4607 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4608
c3506372
CH
4609 if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
4610 sizeof(struct seq_net_private)))
d12d01d6
DL
4611 return -ENOMEM;
4612
4613 return 0;
4614}
4615
2c8c1e72 4616static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4617{
ece31ffd 4618 remove_proc_entry("packet", net->proc_net);
669f8f1a 4619 WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
d12d01d6
DL
4620}
4621
4622static struct pernet_operations packet_net_ops = {
4623 .init = packet_net_init,
4624 .exit = packet_net_exit,
4625};
4626
4627
1da177e4
LT
4628static void __exit packet_exit(void)
4629{
1da177e4 4630 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4631 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4632 sock_unregister(PF_PACKET);
4633 proto_unregister(&packet_proto);
4634}
4635
4636static int __init packet_init(void)
4637{
36096f2f 4638 int rc;
1da177e4 4639
36096f2f
Y
4640 rc = proto_register(&packet_proto, 0);
4641 if (rc)
1da177e4 4642 goto out;
36096f2f
Y
4643 rc = sock_register(&packet_family_ops);
4644 if (rc)
4645 goto out_proto;
4646 rc = register_pernet_subsys(&packet_net_ops);
4647 if (rc)
4648 goto out_sock;
4649 rc = register_netdevice_notifier(&packet_netdev_notifier);
4650 if (rc)
4651 goto out_pernet;
1da177e4 4652
36096f2f
Y
4653 return 0;
4654
4655out_pernet:
4656 unregister_pernet_subsys(&packet_net_ops);
4657out_sock:
4658 sock_unregister(PF_PACKET);
4659out_proto:
4660 proto_unregister(&packet_proto);
1da177e4
LT
4661out:
4662 return rc;
4663}
4664
4665module_init(packet_init);
4666module_exit(packet_exit);
4667MODULE_LICENSE("GPL");
4668MODULE_ALIAS_NETPROTO(PF_PACKET);