/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
#include <net/compat.h>

#include "internal.h"

/*
   Assumptions:
   - If a device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside of
     the device, but higher levels should still reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     does not fit in the reserved space (tunnels); others are not (PPP).
   - A packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header != NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header == NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
                 header. PPP does this, which is wrong, because it introduces
                 asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header == NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Summary:
   If dev->hard_header == NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by the device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position;
   the packet classifier depends on it.
 */
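
/*
 * Illustrative user-space view of the ll-header handling described above
 * (a sketch only, assuming the usual packet(7) interface): a SOCK_RAW
 * packet socket hands applications the frame including the link-layer
 * header, while SOCK_DGRAM strips it.
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	// recvfrom() on fd now returns frames starting at the ll header.
 */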

/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};
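
/*
 * User space fills in the shorter struct packet_mreq and passes it through
 * the PACKET_ADD_MEMBERSHIP / PACKET_DROP_MEMBERSHIP socket options; the
 * kernel copies it into this larger structure. A minimal illustrative call
 * (the interface name is an example only):
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = if_nametoindex("eth0"),
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 */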

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

struct packet_sock;
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(struct timer_list *);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);
static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb);

struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};

#define vio_le() virtio_legacy_is_little_endian()

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

static int packet_direct_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct sk_buff *orig_skb = skb;
	struct netdev_queue *txq;
	int ret = NETDEV_TX_BUSY;

	if (unlikely(!netif_running(dev) ||
		     !netif_carrier_ok(dev)))
		goto drop;

	skb = validate_xmit_skb_list(skb, dev);
	if (skb != orig_skb)
		goto drop;

	packet_pick_tx_queue(dev, skb);
	txq = skb_get_tx_queue(dev, skb);

	local_bh_disable();

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_xmit_frozen_or_drv_stopped(txq))
		ret = netdev_start_xmit(skb, dev, txq, false);
	HARD_TX_UNLOCK(dev, txq);

	local_bh_enable();

	if (!dev_xmit_complete(ret))
		kfree_skb(skb);

	return ret;
drop:
	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return NET_XMIT_DROP;
}

static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}

static bool packet_use_direct_xmit(const struct packet_sock *po)
{
	return po->xmit == packet_direct_xmit;
}

static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
}

static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	u16 queue_index;

	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL,
						    __packet_pick_tx_queue);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = __packet_pick_tx_queue(dev, skb);
	}

	skb_set_queue_mapping(skb, queue_index);
}

/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held. If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook. If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}

static inline struct page * __pure pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
		h.h3->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
		return h.h3->tp_status;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}
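
/*
 * tp_status is the hand-off word between kernel and user space: the kernel
 * fills a frame and sets TP_STATUS_USER, user space consumes it and writes
 * TP_STATUS_KERNEL back. A simplified, illustrative user-space consumer
 * (TPACKET_V2 layout assumed):
 *
 *	struct tpacket2_hdr *hdr = frame;
 *
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);
 *	... process the packet ...
 *	hdr->tp_status = TP_STATUS_KERNEL;
 */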

static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
		h.h3->tp_sec = ts.tv_sec;
		h.h3->tp_nsec = ts.tv_nsec;
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}

static void *packet_lookup_frame(struct packet_sock *po,
				 struct packet_ring_buffer *rb,
				 unsigned int position,
				 int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}
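
/*
 * Worked example for the lookup arithmetic above (numbers illustrative
 * only): with tp_block_size = 4096 and tp_frame_size = 2048,
 * frames_per_block is 2, so frame number 5 resolves to pg_vec[2] at byte
 * offset 1 * 2048 (5 / 2 = 2, 5 % 2 = 1).
 */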

static void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
		    0);
	pkc->retire_blk_timer.expires = jiffies;
}

static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_link_ksettings ecmd;
	int err;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_link_ksettings(dev, &ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (ecmd.base.speed < SPEED_1000 ||
		    ecmd.base.speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = ecmd.base.speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}
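
/*
 * Worked example for the timeout above (illustrative numbers only): a 1 MiB
 * block is 8 Mbit, so on a 1 Gb/s link (div = 1) it takes roughly 8 ms to
 * fill and the function returns 8 + 1 = 9 ms; on a 10 Gb/s link (div = 10)
 * the integer division collapses to 0 and the result is 1 ms.
 */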

static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start = pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks = req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po);
	prb_open_block(p1, pbd);
}

/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
{
	struct packet_sock *po =
		from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 *
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			if (!BLOCK_NUM_PKTS(pbd)) {
				/* An empty block. Just refresh the timer. */
				goto refresh_timer;
			}
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 * lagging behind.
			 */
			if (prb_curr_blk_in_use(pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. Queue was frozen, user-space caught
				 * up, now the link went idle && the timer
				 * fired. We don't have a block to close, so we
				 * open this block and restart the timer.
				 * Opening a block thaws the queue; the
				 * thaw/timer-refresh is a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}

static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header (we know the header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}

/*
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note: We DON'T refresh the timer on purpose,
 *	because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
	struct sock *sk = &po->sk;

	if (po->stats.stats3.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time.
		 *
		 * It shouldn't really happen as we don't close empty
		 * blocks. See prb_retire_rx_blk_timer_expired().
		 */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	getnstimeofday(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}

/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7'; since there is no space
 *    left, it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
			  struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, the caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}

static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}

static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (skb_vlan_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->hv1.tp_vlan_tpid = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_padding = 0;
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}

/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status,
					    unsigned int len
					    )
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if the last block, which caused the queue to freeze,
		 * is still in use by user space.
		 */
		if (prb_curr_blk_in_use(pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user space.
			 * Now let's open that block.
			 * Opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available. User space hasn't caught up yet.
	 * The queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}

static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, status, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return NULL;
	}
}

static void *prb_lookup_block(struct packet_sock *po,
				     struct packet_ring_buffer *rb,
				     unsigned int idx,
				     int status)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;
	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}

/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}

static void packet_increment_rx_head(struct packet_sock *po,
					    struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

static void packet_inc_pending(struct packet_ring_buffer *rb)
{
	this_cpu_inc(*rb->pending_refcnt);
}

static void packet_dec_pending(struct packet_ring_buffer *rb)
{
	this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
	unsigned int refcnt = 0;
	int cpu;

	/* We don't use pending refcount in rx_ring. */
	if (rb->pending_refcnt == NULL)
		return 0;

	for_each_possible_cpu(cpu)
		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

	return refcnt;
}

static int packet_alloc_pending(struct packet_sock *po)
{
	po->rx_ring.pending_refcnt = NULL;

	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
	if (unlikely(po->tx_ring.pending_refcnt == NULL))
		return -ENOBUFS;

	return 0;
}

static void packet_free_pending(struct packet_sock *po)
{
	free_percpu(po->tx_ring.pending_refcnt);
}

#define ROOM_POW_OFF	2
#define ROOM_NONE	0x0
#define ROOM_LOW	0x1
#define ROOM_NORMAL	0x2

static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.frame_max + 1;
	idx = po->rx_ring.head;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.prb_bdqc.knum_blocks;
	idx = po->rx_ring.prb_bdqc.kactive_blk_num;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	struct sock *sk = &po->sk;
	int ret = ROOM_NONE;

	if (po->prot_hook.func != tpacket_rcv) {
		int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
					  - (skb ? skb->truesize : 0);
		if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
			return ROOM_NORMAL;
		else if (avail > 0)
			return ROOM_LOW;
		else
			return ROOM_NONE;
	}

	if (po->tp_version == TPACKET_V3) {
		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_v3_has_room(po, 0))
			ret = ROOM_LOW;
	} else {
		if (__tpacket_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_has_room(po, 0))
			ret = ROOM_LOW;
	}

	return ret;
}

static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	int ret;
	bool has_room;

	spin_lock_bh(&po->sk.sk_receive_queue.lock);
	ret = __packet_rcv_has_room(po, skb);
	has_room = ret == ROOM_NORMAL;
	if (po->pressure == has_room)
		po->pressure = !has_room;
	spin_unlock_bh(&po->sk.sk_receive_queue.lock);

	return ret;
}
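
/*
 * Worked example for the ROOM_POW_OFF check above (numbers illustrative
 * only): with a 64-frame ring, ROOM_NORMAL means the frame 64 >> 2 = 16
 * slots ahead of the current head is still owned by the kernel, ROOM_LOW
 * means only the frame at the head itself is, and ROOM_NONE means even
 * that one is still held by user space.
 */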

static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(refcount_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}

static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
{
	u32 rxhash;
	int i, count = 0;

	rxhash = skb_get_hash(skb);
	for (i = 0; i < ROLLOVER_HLEN; i++)
		if (po->rollover->history[i] == rxhash)
			count++;

	po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
	return count > (ROLLOVER_HLEN >> 1);
}

static unsigned int fanout_demux_hash(struct packet_fanout *f,
				      struct sk_buff *skb,
				      unsigned int num)
{
	return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
}

static unsigned int fanout_demux_lb(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	unsigned int val = atomic_inc_return(&f->rr_cur);

	return val % num;
}

static unsigned int fanout_demux_cpu(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return smp_processor_id() % num;
}

static unsigned int fanout_demux_rnd(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return prandom_u32_max(num);
}

static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, bool try_self,
					  unsigned int num)
{
	struct packet_sock *po, *po_next, *po_skip = NULL;
	unsigned int i, j, room = ROOM_NONE;

	po = pkt_sk(f->arr[idx]);

	if (try_self) {
		room = packet_rcv_has_room(po, skb);
		if (room == ROOM_NORMAL ||
		    (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
			return idx;
		po_skip = po;
	}

	i = j = min_t(int, po->rollover->sock, num - 1);
	do {
		po_next = pkt_sk(f->arr[i]);
		if (po_next != po_skip && !po_next->pressure &&
		    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
			if (i != j)
				po->rollover->sock = i;
			atomic_long_inc(&po->rollover->num);
			if (room == ROOM_LOW)
				atomic_long_inc(&po->rollover->num_huge);
			return i;
		}

		if (++i == num)
			i = 0;
	} while (i != j);

	atomic_long_inc(&po->rollover->num_failed);
	return idx;
}

static unsigned int fanout_demux_qm(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	return skb_get_queue_mapping(skb) % num;
}

static unsigned int fanout_demux_bpf(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	struct bpf_prog *prog;
	unsigned int ret = 0;

	rcu_read_lock();
	prog = rcu_dereference(f->bpf_prog);
	if (prog)
		ret = bpf_prog_run_clear_cb(prog, skb) % num;
	rcu_read_unlock();

	return ret;
}

static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
	return f->flags & (flag >> 8);
}

static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = READ_ONCE(f->num_members);
	struct net *net = read_pnet(&f->net);
	struct packet_sock *po;
	unsigned int idx;

	if (!net_eq(dev_net(dev), net) || !num) {
		kfree_skb(skb);
		return 0;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
		skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
		if (!skb)
			return 0;
	}
	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		idx = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		idx = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		idx = fanout_demux_cpu(f, skb, num);
		break;
	case PACKET_FANOUT_RND:
		idx = fanout_demux_rnd(f, skb, num);
		break;
	case PACKET_FANOUT_QM:
		idx = fanout_demux_qm(f, skb, num);
		break;
	case PACKET_FANOUT_ROLLOVER:
		idx = fanout_demux_rollover(f, skb, 0, false, num);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		idx = fanout_demux_bpf(f, skb, num);
		break;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
		idx = fanout_demux_rollover(f, skb, idx, true, num);

	po = pkt_sk(f->arr[idx]);
	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}
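
/*
 * Illustrative user-space setup for the fanout demux above (values are an
 * example only): each member socket joins a group with
 *
 *	int val = group_id | (PACKET_FANOUT_HASH << 16);
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
 *
 * after which packet_rcv_fanout() picks one member per packet according to
 * the configured policy.
 */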

DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);
static u16 fanout_next_id;

static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	if (f->num_members == 1)
		dev_add_pack(&f->prot_hook);
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	if (f->num_members == 0)
		__dev_remove_pack(&f->prot_hook);
	spin_unlock(&f->lock);
}

static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
	if (sk->sk_family != PF_PACKET)
		return false;

	return ptype->af_packet_priv == pkt_sk(sk)->fanout;
}

static void fanout_init_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_LB:
		atomic_set(&f->rr_cur, 0);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		RCU_INIT_POINTER(f->bpf_prog, NULL);
		break;
	}
}

static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
{
	struct bpf_prog *old;

	spin_lock(&f->lock);
	old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
	rcu_assign_pointer(f->bpf_prog, new);
	spin_unlock(&f->lock);

	if (old) {
		synchronize_net();
		bpf_prog_destroy(old);
	}
}

static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
				unsigned int len)
{
	struct bpf_prog *new;
	struct sock_fprog fprog;
	int ret;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;
	if (len != sizeof(fprog))
		return -EINVAL;
	if (copy_from_user(&fprog, data, len))
		return -EFAULT;

	ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
	if (ret)
		return ret;

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
				unsigned int len)
{
	struct bpf_prog *new;
	u32 fd;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;
	if (len != sizeof(fd))
		return -EINVAL;
	if (copy_from_user(&fd, data, len))
		return -EFAULT;

	new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
	if (IS_ERR(new))
		return PTR_ERR(new);

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data(struct packet_sock *po, char __user *data,
			   unsigned int len)
{
	switch (po->fanout->type) {
	case PACKET_FANOUT_CBPF:
		return fanout_set_data_cbpf(po, data, len);
	case PACKET_FANOUT_EBPF:
		return fanout_set_data_ebpf(po, data, len);
	default:
		return -EINVAL;
	};
}

static void fanout_release_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		__fanout_set_data_bpf(f, NULL);
	};
}

static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
{
	struct packet_fanout *f;

	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == candidate_id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			return false;
		}
	}
	return true;
}

static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
{
	u16 id = fanout_next_id;

	do {
		if (__fanout_id_is_free(sk, id)) {
			*new_id = id;
			fanout_next_id = id + 1;
			return true;
		}

		id++;
	} while (id != fanout_next_id);

	return false;
}

static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
	struct packet_rollover *rollover = NULL;
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 flags = type_flags >> 8;
	int err;

	switch (type) {
	case PACKET_FANOUT_ROLLOVER:
		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
			return -EINVAL;
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
	case PACKET_FANOUT_RND:
	case PACKET_FANOUT_QM:
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		break;
	default:
		return -EINVAL;
	}

	mutex_lock(&fanout_mutex);

	err = -EALREADY;
	if (po->fanout)
		goto out;

	if (type == PACKET_FANOUT_ROLLOVER ||
	    (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
		err = -ENOMEM;
		rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
		if (!rollover)
			goto out;
		atomic_long_set(&rollover->num, 0);
		atomic_long_set(&rollover->num_huge, 0);
		atomic_long_set(&rollover->num_failed, 0);
	}

	if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
		if (id != 0) {
			err = -EINVAL;
			goto out;
		}
		if (!fanout_find_new_id(sk, &id)) {
			err = -ENOMEM;
			goto out;
		}
		/* ephemeral flag for the first socket in the group: drop it */
		flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
	}

	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match && match->flags != flags)
		goto out;
	if (!match) {
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		match->flags = flags;
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		refcount_set(&match->sk_ref, 0);
		fanout_init_data(match);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		match->prot_hook.id_match = match_fanout_group;
		list_add(&match->list, &fanout_list);
	}
	err = -EINVAL;

	spin_lock(&po->bind_lock);
	if (po->running &&
	    match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
			__dev_remove_pack(&po->prot_hook);
			po->fanout = match;
			po->rollover = rollover;
			rollover = NULL;
			refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
			__fanout_link(sk, po);
			err = 0;
		}
	}
	spin_unlock(&po->bind_lock);

	if (err && !refcount_read(&match->sk_ref)) {
		list_del(&match->list);
		kfree(match);
	}

out:
	kfree(rollover);
	mutex_unlock(&fanout_mutex);
	return err;
}

/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
 * It is the responsibility of the caller to call fanout_release_data() and
 * free the returned packet_fanout (after synchronize_net())
 */
static struct packet_fanout *fanout_release(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f;

	mutex_lock(&fanout_mutex);
	f = po->fanout;
	if (f) {
		po->fanout = NULL;

		if (refcount_dec_and_test(&f->sk_ref))
			list_del(&f->list);
		else
			f = NULL;
	}
	mutex_unlock(&fanout_mutex);

	return f;
}

3c70c132
DB
1792static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1793 struct sk_buff *skb)
1794{
1795 /* Earlier code assumed this would be a VLAN pkt, double-check
1796 * this now that we have the actual packet in hand. We can only
1797 * do this check on Ethernet devices.
1798 */
1799 if (unlikely(dev->type != ARPHRD_ETHER))
1800 return false;
1801
1802 skb_reset_mac_header(skb);
1803 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1804}
1805
90ddc4f0 1806static const struct proto_ops packet_ops;
1da177e4 1807
90ddc4f0 1808static const struct proto_ops packet_ops_spkt;
1da177e4 1809
40d4e3df
ED
1810static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1811 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1812{
1813 struct sock *sk;
1814 struct sockaddr_pkt *spkt;
1815
1816 /*
1817 * When we registered the protocol we saved the socket in the data
1818 * field for just this event.
1819 */
1820
1821 sk = pt->af_packet_priv;
1ce4f28b 1822
1da177e4
LT
1823 /*
1824 * Yank back the headers [hope the device set this
1825 * right or kerboom...]
1826 *
1827 * Incoming packets have ll header pulled,
1828 * push it back.
1829 *
98e399f8 1830 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1831 * so that this procedure is a no-op.
1832 */
1833
1834 if (skb->pkt_type == PACKET_LOOPBACK)
1835 goto out;
1836
09ad9bc7 1837 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1838 goto out;
1839
40d4e3df
ED
1840 skb = skb_share_check(skb, GFP_ATOMIC);
1841 if (skb == NULL)
1da177e4
LT
1842 goto oom;
1843
1844 /* drop any routing info */
adf30907 1845 skb_dst_drop(skb);
1da177e4 1846
84531c24
PO
1847 /* drop conntrack reference */
1848 nf_reset(skb);
1849
ffbc6111 1850 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1851
98e399f8 1852 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1853
1854 /*
1855 * The SOCK_PACKET socket receives _all_ frames.
1856 */
1857
1858 spkt->spkt_family = dev->type;
1859 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1860 spkt->spkt_protocol = skb->protocol;
1861
1862 /*
1863 * Charge the memory to the socket. This is done specifically
1864 * to prevent sockets from using up all the memory.
1865 */
1866
40d4e3df 1867 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1868 return 0;
1869
1870out:
1871 kfree_skb(skb);
1872oom:
1873 return 0;
1874}
1875
1876
1877/*
1878 * Output a raw packet to a device layer. This bypasses all the other
1879 * protocol layers and you must therefore supply it with a complete frame
1880 */
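/* A minimal, hypothetical userspace sketch of this path (not part of this
 * file): open a SOCK_PACKET socket and hand it one complete frame, naming
 * the output device through sockaddr_pkt as packet_sendmsg_spkt() expects.
 * "eth0", "frame" and "frame_len" are placeholders.
 *
 *	int fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 *	struct sockaddr_pkt spkt = { .spkt_family = AF_PACKET };
 *
 *	strncpy(spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *	spkt.spkt_protocol = htons(ETH_P_IP);
 *	sendto(fd, frame, frame_len, 0,
 *	       (struct sockaddr *)&spkt, sizeof(spkt));
 *
 * The buffer must already carry its link-layer header, and the socket
 * needs CAP_NET_RAW, as packet_create() enforces.
 */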
1ce4f28b 1881
1b784140
YX
1882static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1883 size_t len)
1da177e4
LT
1884{
1885 struct sock *sk = sock->sk;
342dfc30 1886 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1887 struct sk_buff *skb = NULL;
1da177e4 1888 struct net_device *dev;
c14ac945 1889 struct sockcm_cookie sockc;
40d4e3df 1890 __be16 proto = 0;
1da177e4 1891 int err;
3bdc0eba 1892 int extra_len = 0;
1ce4f28b 1893
1da177e4 1894 /*
1ce4f28b 1895 * Get and verify the address.
1da177e4
LT
1896 */
1897
40d4e3df 1898 if (saddr) {
1da177e4 1899 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1900 return -EINVAL;
1901 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1902 proto = saddr->spkt_protocol;
1903 } else
1904 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1905
1906 /*
1ce4f28b 1907 * Find the device first to size check it
1da177e4
LT
1908 */
1909
de74e92a 1910 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1911retry:
654d1f8a
ED
1912 rcu_read_lock();
1913 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1914 err = -ENODEV;
1915 if (dev == NULL)
1916 goto out_unlock;
1ce4f28b 1917
d5e76b0a
DM
1918 err = -ENETDOWN;
1919 if (!(dev->flags & IFF_UP))
1920 goto out_unlock;
1921
1da177e4 1922 /*
40d4e3df
ED
1923 * You may not queue a frame bigger than the mtu. This is the lowest level
1924 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1925 */
1ce4f28b 1926
3bdc0eba
BG
1927 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1928 if (!netif_supports_nofcs(dev)) {
1929 err = -EPROTONOSUPPORT;
1930 goto out_unlock;
1931 }
1932 extra_len = 4; /* We're doing our own CRC */
1933 }
1934
1da177e4 1935 err = -EMSGSIZE;
3bdc0eba 1936 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1937 goto out_unlock;
1938
1a35ca80
ED
1939 if (!skb) {
1940 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1941 int tlen = dev->needed_tailroom;
1a35ca80
ED
1942 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1943
1944 rcu_read_unlock();
4ce40912 1945 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1946 if (skb == NULL)
1947 return -ENOBUFS;
1948 /* FIXME: Save some space for broken drivers that write a hard
1949 * header at transmission time by themselves. PPP is the notable
1950 * one here. This should really be fixed at the driver level.
1951 */
1952 skb_reserve(skb, reserved);
1953 skb_reset_network_header(skb);
1954
1955 /* Try to align data part correctly */
1956 if (hhlen) {
1957 skb->data -= hhlen;
1958 skb->tail -= hhlen;
1959 if (len < hhlen)
1960 skb_reset_network_header(skb);
1961 }
6ce8e9ce 1962 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1963 if (err)
1964 goto out_free;
1965 goto retry;
1da177e4
LT
1966 }
1967
9ed988cd
WB
1968 if (!dev_validate_header(dev, skb->data, len)) {
1969 err = -EINVAL;
1970 goto out_unlock;
1971 }
3c70c132
DB
1972 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1973 !packet_extra_vlan_len_allowed(dev, skb)) {
1974 err = -EMSGSIZE;
1975 goto out_unlock;
57f89bfa 1976 }
1a35ca80 1977
edbe7746 1978 sockc.tsflags = sk->sk_tsflags;
c14ac945
SHY
1979 if (msg->msg_controllen) {
1980 err = sock_cmsg_send(sk, msg, &sockc);
f8e7718c 1981 if (unlikely(err))
c14ac945 1982 goto out_unlock;
c14ac945
SHY
1983 }
1984
1da177e4
LT
1985 skb->protocol = proto;
1986 skb->dev = dev;
1987 skb->priority = sk->sk_priority;
2d37a186 1988 skb->mark = sk->sk_mark;
bf84a010 1989
c14ac945 1990 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
1da177e4 1991
3bdc0eba
BG
1992 if (unlikely(extra_len == 4))
1993 skb->no_fcs = 1;
1994
40893fd0 1995 skb_probe_transport_header(skb, 0);
c1aad275 1996
1da177e4 1997 dev_queue_xmit(skb);
654d1f8a 1998 rcu_read_unlock();
40d4e3df 1999 return len;
1da177e4 2000
1da177e4 2001out_unlock:
654d1f8a 2002 rcu_read_unlock();
1a35ca80
ED
2003out_free:
2004 kfree_skb(skb);
1da177e4
LT
2005 return err;
2006}
1da177e4 2007
ff936a04
AS
2008static unsigned int run_filter(struct sk_buff *skb,
2009 const struct sock *sk,
2010 unsigned int res)
1da177e4
LT
2011{
2012 struct sk_filter *filter;
fda9ef5d 2013
80f8f102
ED
2014 rcu_read_lock();
2015 filter = rcu_dereference(sk->sk_filter);
dbcb5855 2016 if (filter != NULL)
ff936a04 2017 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 2018 rcu_read_unlock();
1da177e4 2019
dbcb5855 2020 return res;
1da177e4
LT
2021}
2022
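/* run_filter() above executes whatever BPF program is attached to the
 * socket; its return value caps "snaplen" in packet_rcv()/tpacket_rcv(),
 * and 0 drops the packet. A minimal, hypothetical userspace sketch of
 * attaching such a filter (not part of this file) - a one-instruction
 * classic BPF program that accepts everything; "fd" is a placeholder:
 *
 *	struct sock_filter code[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 0xffffffff },
 *	};
 *	struct sock_fprog prog = { .len = 1, .filter = code };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */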
16cc1400
WB
2023static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2024 size_t *len)
2025{
2026 struct virtio_net_hdr vnet_hdr;
2027
2028 if (*len < sizeof(vnet_hdr))
2029 return -EINVAL;
2030 *len -= sizeof(vnet_hdr);
2031
6391a448 2032 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true))
16cc1400
WB
2033 return -EINVAL;
2034
2035 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2036}
2037
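/* packet_rcv_vnet() above is what prepends a struct virtio_net_hdr to the
 * data returned by recvmsg() once PACKET_VNET_HDR is enabled. A minimal,
 * hypothetical userspace sketch (not part of this file); "fd" is a
 * placeholder for a SOCK_RAW packet socket:
 *
 *	int on = 1;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &on, sizeof(on));
 *
 * As the PACKET_VNET_HDR case in packet_setsockopt() below enforces, this
 * only works on SOCK_RAW sockets and only before any ring is mapped.
 */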
1da177e4 2038/*
62ab0812
ED
2039 * This function makes lazy skb cloning in the hope that most packets
2040 * are discarded by BPF.
2041 *
2042 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
2043 * and skb->cb are mangled. It works because (and until) packets
2044 * falling here are owned by the current CPU. Output packets are cloned
2045 * by dev_queue_xmit_nit(), input packets are processed by net_bh
2046 * sequentially, so that if we return the skb to its original state on exit,
2047 * we will not harm anyone.
1da177e4
LT
2048 */
2049
40d4e3df
ED
2050static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2051 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2052{
2053 struct sock *sk;
2054 struct sockaddr_ll *sll;
2055 struct packet_sock *po;
40d4e3df 2056 u8 *skb_head = skb->data;
1da177e4 2057 int skb_len = skb->len;
dbcb5855 2058 unsigned int snaplen, res;
da37845f 2059 bool is_drop_n_account = false;
1da177e4
LT
2060
2061 if (skb->pkt_type == PACKET_LOOPBACK)
2062 goto drop;
2063
2064 sk = pt->af_packet_priv;
2065 po = pkt_sk(sk);
2066
09ad9bc7 2067 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2068 goto drop;
2069
1da177e4
LT
2070 skb->dev = dev;
2071
3b04ddde 2072 if (dev->header_ops) {
1da177e4 2073 /* The device has an explicit notion of ll header,
62ab0812
ED
2074 * exported to higher levels.
2075 *
2076 * Otherwise, the device hides details of its frame
2077 * structure, so that the corresponding packet head is
2078 * never delivered to the user.
1da177e4
LT
2079 */
2080 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2081 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2082 else if (skb->pkt_type == PACKET_OUTGOING) {
2083 /* Special case: outgoing packets have ll header at head */
bbe735e4 2084 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2085 }
2086 }
2087
2088 snaplen = skb->len;
2089
dbcb5855
DM
2090 res = run_filter(skb, sk, snaplen);
2091 if (!res)
fda9ef5d 2092 goto drop_n_restore;
dbcb5855
DM
2093 if (snaplen > res)
2094 snaplen = res;
1da177e4 2095
0fd7bac6 2096 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2097 goto drop_n_acct;
2098
2099 if (skb_shared(skb)) {
2100 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2101 if (nskb == NULL)
2102 goto drop_n_acct;
2103
2104 if (skb_head != skb->data) {
2105 skb->data = skb_head;
2106 skb->len = skb_len;
2107 }
abc4e4fa 2108 consume_skb(skb);
1da177e4
LT
2109 skb = nskb;
2110 }
2111
b4772ef8 2112 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2113
2114 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2115 sll->sll_hatype = dev->type;
1da177e4 2116 sll->sll_pkttype = skb->pkt_type;
8032b464 2117 if (unlikely(po->origdev))
80feaacb
PWJ
2118 sll->sll_ifindex = orig_dev->ifindex;
2119 else
2120 sll->sll_ifindex = dev->ifindex;
1da177e4 2121
b95cce35 2122 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2123
2472d761
EB
2124 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2125 * Use their space for storing the original skb length.
2126 */
2127 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2128
1da177e4
LT
2129 if (pskb_trim(skb, snaplen))
2130 goto drop_n_acct;
2131
2132 skb_set_owner_r(skb, sk);
2133 skb->dev = NULL;
adf30907 2134 skb_dst_drop(skb);
1da177e4 2135
84531c24
PO
2136 /* drop conntrack reference */
2137 nf_reset(skb);
2138
1da177e4 2139 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2140 po->stats.stats1.tp_packets++;
3bc3b96f 2141 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
2142 __skb_queue_tail(&sk->sk_receive_queue, skb);
2143 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2144 sk->sk_data_ready(sk);
1da177e4
LT
2145 return 0;
2146
2147drop_n_acct:
da37845f 2148 is_drop_n_account = true;
7091fbd8 2149 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2150 po->stats.stats1.tp_drops++;
7091fbd8
WB
2151 atomic_inc(&sk->sk_drops);
2152 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
2153
2154drop_n_restore:
2155 if (skb_head != skb->data && skb_shared(skb)) {
2156 skb->data = skb_head;
2157 skb->len = skb_len;
2158 }
2159drop:
da37845f
WJ
2160 if (!is_drop_n_account)
2161 consume_skb(skb);
2162 else
2163 kfree_skb(skb);
1da177e4
LT
2164 return 0;
2165}
2166
40d4e3df
ED
2167static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2168 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2169{
2170 struct sock *sk;
2171 struct packet_sock *po;
2172 struct sockaddr_ll *sll;
184f489e 2173 union tpacket_uhdr h;
40d4e3df 2174 u8 *skb_head = skb->data;
1da177e4 2175 int skb_len = skb->len;
dbcb5855 2176 unsigned int snaplen, res;
f6fb8f10 2177 unsigned long status = TP_STATUS_USER;
bbd6ef87 2178 unsigned short macoff, netoff, hdrlen;
1da177e4 2179 struct sk_buff *copy_skb = NULL;
bbd6ef87 2180 struct timespec ts;
b9c32fb2 2181 __u32 ts_status;
da37845f 2182 bool is_drop_n_account = false;
edbd58be 2183 bool do_vnet = false;
1da177e4 2184
51846355
AW
2185 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2186 * We may add members to them up to the current aligned size without forcing
2187 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2188 */
2189 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2190 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2191
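	/* A minimal, hypothetical userspace sketch of the getsockopt()
	 * mentioned above (not part of this file): ask the kernel for the
	 * header length of a given ring version instead of hard-coding it.
	 * "fd" is a placeholder for a packet socket.
	 *
	 *	int val = TPACKET_V2;
	 *	socklen_t len = sizeof(val);
	 *
	 *	getsockopt(fd, SOL_PACKET, PACKET_HDRLEN, &val, &len);
	 *
	 * On return, val holds the header length for that version.
	 */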
1da177e4
LT
2192 if (skb->pkt_type == PACKET_LOOPBACK)
2193 goto drop;
2194
2195 sk = pt->af_packet_priv;
2196 po = pkt_sk(sk);
2197
09ad9bc7 2198 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2199 goto drop;
2200
3b04ddde 2201 if (dev->header_ops) {
1da177e4 2202 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2203 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2204 else if (skb->pkt_type == PACKET_OUTGOING) {
2205 /* Special case: outgoing packets have ll header at head */
bbe735e4 2206 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2207 }
2208 }
2209
2210 snaplen = skb->len;
2211
dbcb5855
DM
2212 res = run_filter(skb, sk, snaplen);
2213 if (!res)
fda9ef5d 2214 goto drop_n_restore;
68c2e5de
AD
2215
2216 if (skb->ip_summed == CHECKSUM_PARTIAL)
2217 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2218 else if (skb->pkt_type != PACKET_OUTGOING &&
2219 (skb->ip_summed == CHECKSUM_COMPLETE ||
2220 skb_csum_unnecessary(skb)))
2221 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2222
dbcb5855
DM
2223 if (snaplen > res)
2224 snaplen = res;
1da177e4
LT
2225
2226 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2227 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2228 po->tp_reserve;
1da177e4 2229 } else {
95c96174 2230 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2231 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2232 (maclen < 16 ? 16 : maclen)) +
58d19b19 2233 po->tp_reserve;
edbd58be 2234 if (po->has_vnet_hdr) {
58d19b19 2235 netoff += sizeof(struct virtio_net_hdr);
edbd58be
BP
2236 do_vnet = true;
2237 }
1da177e4
LT
2238 macoff = netoff - maclen;
2239 }
f6fb8f10 2240 if (po->tp_version <= TPACKET_V2) {
2241 if (macoff + snaplen > po->rx_ring.frame_size) {
2242 if (po->copy_thresh &&
0fd7bac6 2243 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2244 if (skb_shared(skb)) {
2245 copy_skb = skb_clone(skb, GFP_ATOMIC);
2246 } else {
2247 copy_skb = skb_get(skb);
2248 skb_head = skb->data;
2249 }
2250 if (copy_skb)
2251 skb_set_owner_r(copy_skb, sk);
1da177e4 2252 }
f6fb8f10 2253 snaplen = po->rx_ring.frame_size - macoff;
edbd58be 2254 if ((int)snaplen < 0) {
f6fb8f10 2255 snaplen = 0;
edbd58be
BP
2256 do_vnet = false;
2257 }
1da177e4 2258 }
dc808110
ED
2259 } else if (unlikely(macoff + snaplen >
2260 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2261 u32 nval;
2262
2263 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2264 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2265 snaplen, nval, macoff);
2266 snaplen = nval;
2267 if (unlikely((int)snaplen < 0)) {
2268 snaplen = 0;
2269 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
edbd58be 2270 do_vnet = false;
dc808110 2271 }
1da177e4 2272 }
1da177e4 2273 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2274 h.raw = packet_current_rx_frame(po, skb,
2275 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2276 if (!h.raw)
58d19b19 2277 goto drop_n_account;
f6fb8f10 2278 if (po->tp_version <= TPACKET_V2) {
2279 packet_increment_rx_head(po, &po->rx_ring);
2280 /*
2281 * LOSING will be reported until you read the stats,
2282 * because it's COR - Clear On Read.
2283 * Anyway, this is done for V1/V2 only, as V3 doesn't need it
2284 * at the packet level.
2285 */
ee80fbf3 2286 if (po->stats.stats1.tp_drops)
f6fb8f10 2287 status |= TP_STATUS_LOSING;
2288 }
ee80fbf3 2289 po->stats.stats1.tp_packets++;
1da177e4
LT
2290 if (copy_skb) {
2291 status |= TP_STATUS_COPY;
2292 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2293 }
1da177e4
LT
2294 spin_unlock(&sk->sk_receive_queue.lock);
2295
edbd58be 2296 if (do_vnet) {
5a213881
JR
2297 if (virtio_net_hdr_from_skb(skb, h.raw + macoff -
2298 sizeof(struct virtio_net_hdr),
6391a448 2299 vio_le(), true)) {
58d19b19
WB
2300 spin_lock(&sk->sk_receive_queue.lock);
2301 goto drop_n_account;
2302 }
2303 }
2304
bbd6ef87 2305 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2306
2307 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2308 getnstimeofday(&ts);
1da177e4 2309
b9c32fb2
DB
2310 status |= ts_status;
2311
bbd6ef87
PM
2312 switch (po->tp_version) {
2313 case TPACKET_V1:
2314 h.h1->tp_len = skb->len;
2315 h.h1->tp_snaplen = snaplen;
2316 h.h1->tp_mac = macoff;
2317 h.h1->tp_net = netoff;
4b457bdf
DB
2318 h.h1->tp_sec = ts.tv_sec;
2319 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2320 hdrlen = sizeof(*h.h1);
2321 break;
2322 case TPACKET_V2:
2323 h.h2->tp_len = skb->len;
2324 h.h2->tp_snaplen = snaplen;
2325 h.h2->tp_mac = macoff;
2326 h.h2->tp_net = netoff;
bbd6ef87
PM
2327 h.h2->tp_sec = ts.tv_sec;
2328 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2329 if (skb_vlan_tag_present(skb)) {
2330 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2331 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2332 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2333 } else {
2334 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2335 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2336 }
e4d26f4b 2337 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2338 hdrlen = sizeof(*h.h2);
2339 break;
f6fb8f10 2340 case TPACKET_V3:
2341 /* tp_next_offset and vlan are already populated above,
2342 * so DON'T clear those fields here.
2343 */
2344 h.h3->tp_status |= status;
2345 h.h3->tp_len = skb->len;
2346 h.h3->tp_snaplen = snaplen;
2347 h.h3->tp_mac = macoff;
2348 h.h3->tp_net = netoff;
f6fb8f10 2349 h.h3->tp_sec = ts.tv_sec;
2350 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2351 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2352 hdrlen = sizeof(*h.h3);
2353 break;
bbd6ef87
PM
2354 default:
2355 BUG();
2356 }
1da177e4 2357
bbd6ef87 2358 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2359 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2360 sll->sll_family = AF_PACKET;
2361 sll->sll_hatype = dev->type;
2362 sll->sll_protocol = skb->protocol;
2363 sll->sll_pkttype = skb->pkt_type;
8032b464 2364 if (unlikely(po->origdev))
80feaacb
PWJ
2365 sll->sll_ifindex = orig_dev->ifindex;
2366 else
2367 sll->sll_ifindex = dev->ifindex;
1da177e4 2368
e16aa207 2369 smp_mb();
f0d4eb29 2370
f6dafa95 2371#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2372 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2373 u8 *start, *end;
2374
f0d4eb29
DB
2375 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2376 macoff + snaplen);
2377
2378 for (start = h.raw; start < end; start += PAGE_SIZE)
2379 flush_dcache_page(pgv_to_page(start));
1da177e4 2380 }
f0d4eb29 2381 smp_wmb();
f6dafa95 2382#endif
f0d4eb29 2383
da413eec 2384 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2385 __packet_set_status(po, h.raw, status);
da413eec
DC
2386 sk->sk_data_ready(sk);
2387 } else {
f6fb8f10 2388 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2389 }
1da177e4
LT
2390
2391drop_n_restore:
2392 if (skb_head != skb->data && skb_shared(skb)) {
2393 skb->data = skb_head;
2394 skb->len = skb_len;
2395 }
2396drop:
da37845f
WJ
2397 if (!is_drop_n_account)
2398 consume_skb(skb);
2399 else
2400 kfree_skb(skb);
1da177e4
LT
2401 return 0;
2402
58d19b19 2403drop_n_account:
da37845f 2404 is_drop_n_account = true;
ee80fbf3 2405 po->stats.stats1.tp_drops++;
1da177e4
LT
2406 spin_unlock(&sk->sk_receive_queue.lock);
2407
676d2369 2408 sk->sk_data_ready(sk);
acb5d75b 2409 kfree_skb(copy_skb);
1da177e4
LT
2410 goto drop_n_restore;
2411}
2412
69e3c75f
JB
2413static void tpacket_destruct_skb(struct sk_buff *skb)
2414{
2415 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2416
69e3c75f 2417 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2418 void *ph;
b9c32fb2
DB
2419 __u32 ts;
2420
69e3c75f 2421 ph = skb_shinfo(skb)->destructor_arg;
b0138408 2422 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2423
2424 ts = __packet_set_timestamp(po, ph, skb);
2425 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2426 }
2427
2428 sock_wfree(skb);
2429}
2430
c72219b7
DB
2431static void tpacket_set_protocol(const struct net_device *dev,
2432 struct sk_buff *skb)
2433{
2434 if (dev->type == ARPHRD_ETHER) {
2435 skb_reset_mac_header(skb);
2436 skb->protocol = eth_hdr(skb)->h_proto;
2437 }
2438}
2439
16cc1400
WB
2440static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2441{
16cc1400
WB
2442 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2443 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2444 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2445 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2446 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2447 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2448 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2449
2450 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2451 return -EINVAL;
2452
16cc1400
WB
2453 return 0;
2454}
2455
2456static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2457 struct virtio_net_hdr *vnet_hdr)
2458{
16cc1400
WB
2459 if (*len < sizeof(*vnet_hdr))
2460 return -EINVAL;
2461 *len -= sizeof(*vnet_hdr);
2462
cbbd26b8 2463 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
16cc1400
WB
2464 return -EFAULT;
2465
2466 return __packet_snd_vnet_parse(vnet_hdr, *len);
2467}
2468
40d4e3df 2469static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2470 void *frame, struct net_device *dev, void *data, int tp_len,
c14ac945
SHY
2471 __be16 proto, unsigned char *addr, int hlen, int copylen,
2472 const struct sockcm_cookie *sockc)
69e3c75f 2473{
184f489e 2474 union tpacket_uhdr ph;
8d39b4a6 2475 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2476 struct socket *sock = po->sk.sk_socket;
2477 struct page *page;
69e3c75f
JB
2478 int err;
2479
2480 ph.raw = frame;
2481
2482 skb->protocol = proto;
2483 skb->dev = dev;
2484 skb->priority = po->sk.sk_priority;
2d37a186 2485 skb->mark = po->sk.sk_mark;
c14ac945 2486 sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2487 skb_shinfo(skb)->destructor_arg = ph.raw;
2488
ae641949 2489 skb_reserve(skb, hlen);
69e3c75f 2490 skb_reset_network_header(skb);
c1aad275 2491
69e3c75f
JB
2492 to_write = tp_len;
2493
2494 if (sock->type == SOCK_DGRAM) {
2495 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2496 NULL, tp_len);
2497 if (unlikely(err < 0))
2498 return -EINVAL;
1d036d25 2499 } else if (copylen) {
9ed988cd
WB
2500 int hdrlen = min_t(int, copylen, tp_len);
2501
69e3c75f 2502 skb_push(skb, dev->hard_header_len);
1d036d25 2503 skb_put(skb, copylen - dev->hard_header_len);
9ed988cd 2504 err = skb_store_bits(skb, 0, data, hdrlen);
69e3c75f
JB
2505 if (unlikely(err))
2506 return err;
9ed988cd
WB
2507 if (!dev_validate_header(dev, skb->data, hdrlen))
2508 return -EINVAL;
c72219b7
DB
2509 if (!skb->protocol)
2510 tpacket_set_protocol(dev, skb);
69e3c75f 2511
9ed988cd
WB
2512 data += hdrlen;
2513 to_write -= hdrlen;
69e3c75f
JB
2514 }
2515
69e3c75f
JB
2516 offset = offset_in_page(data);
2517 len_max = PAGE_SIZE - offset;
2518 len = ((to_write > len_max) ? len_max : to_write);
2519
2520 skb->data_len = to_write;
2521 skb->len += to_write;
2522 skb->truesize += to_write;
14afee4b 2523 refcount_add(to_write, &po->sk.sk_wmem_alloc);
69e3c75f
JB
2524
2525 while (likely(to_write)) {
2526 nr_frags = skb_shinfo(skb)->nr_frags;
2527
2528 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2529 pr_err("Packet exceeds the number of skb frags (%lu)\n",
2530 MAX_SKB_FRAGS);
69e3c75f
JB
2531 return -EFAULT;
2532 }
2533
0af55bb5
CG
2534 page = pgv_to_page(data);
2535 data += len;
69e3c75f
JB
2536 flush_dcache_page(page);
2537 get_page(page);
0af55bb5 2538 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2539 to_write -= len;
2540 offset = 0;
2541 len_max = PAGE_SIZE;
2542 len = ((to_write > len_max) ? len_max : to_write);
2543 }
2544
8fd6c80d 2545 skb_probe_transport_header(skb, 0);
efdfa2f7 2546
69e3c75f
JB
2547 return tp_len;
2548}
2549
8d39b4a6
WB
2550static int tpacket_parse_header(struct packet_sock *po, void *frame,
2551 int size_max, void **data)
2552{
2553 union tpacket_uhdr ph;
2554 int tp_len, off;
2555
2556 ph.raw = frame;
2557
2558 switch (po->tp_version) {
7f953ab2
SV
2559 case TPACKET_V3:
2560 if (ph.h3->tp_next_offset != 0) {
2561 pr_warn_once("variable sized slot not supported");
2562 return -EINVAL;
2563 }
2564 tp_len = ph.h3->tp_len;
2565 break;
8d39b4a6
WB
2566 case TPACKET_V2:
2567 tp_len = ph.h2->tp_len;
2568 break;
2569 default:
2570 tp_len = ph.h1->tp_len;
2571 break;
2572 }
2573 if (unlikely(tp_len > size_max)) {
2574 pr_err("packet size is too big (%d > %d)\n", tp_len, size_max);
2575 return -EMSGSIZE;
2576 }
2577
2578 if (unlikely(po->tp_tx_has_off)) {
2579 int off_min, off_max;
2580
2581 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2582 off_max = po->tx_ring.frame_size - tp_len;
2583 if (po->sk.sk_type == SOCK_DGRAM) {
2584 switch (po->tp_version) {
7f953ab2
SV
2585 case TPACKET_V3:
2586 off = ph.h3->tp_net;
2587 break;
8d39b4a6
WB
2588 case TPACKET_V2:
2589 off = ph.h2->tp_net;
2590 break;
2591 default:
2592 off = ph.h1->tp_net;
2593 break;
2594 }
2595 } else {
2596 switch (po->tp_version) {
7f953ab2
SV
2597 case TPACKET_V3:
2598 off = ph.h3->tp_mac;
2599 break;
8d39b4a6
WB
2600 case TPACKET_V2:
2601 off = ph.h2->tp_mac;
2602 break;
2603 default:
2604 off = ph.h1->tp_mac;
2605 break;
2606 }
2607 }
2608 if (unlikely((off < off_min) || (off_max < off)))
2609 return -EINVAL;
2610 } else {
2611 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2612 }
2613
2614 *data = frame + off;
2615 return tp_len;
2616}
2617
69e3c75f
JB
2618static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2619{
69e3c75f
JB
2620 struct sk_buff *skb;
2621 struct net_device *dev;
1d036d25 2622 struct virtio_net_hdr *vnet_hdr = NULL;
c14ac945 2623 struct sockcm_cookie sockc;
69e3c75f 2624 __be16 proto;
09effa67 2625 int err, reserve = 0;
40d4e3df 2626 void *ph;
342dfc30 2627 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2628 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2629 int tp_len, size_max;
2630 unsigned char *addr;
8d39b4a6 2631 void *data;
69e3c75f 2632 int len_sum = 0;
9e67030a 2633 int status = TP_STATUS_AVAILABLE;
1d036d25 2634 int hlen, tlen, copylen = 0;
69e3c75f 2635
69e3c75f
JB
2636 mutex_lock(&po->pg_vec_lock);
2637
66e56cd4 2638 if (likely(saddr == NULL)) {
e40526cb 2639 dev = packet_cached_dev_get(po);
69e3c75f
JB
2640 proto = po->num;
2641 addr = NULL;
2642 } else {
2643 err = -EINVAL;
2644 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2645 goto out;
2646 if (msg->msg_namelen < (saddr->sll_halen
2647 + offsetof(struct sockaddr_ll,
2648 sll_addr)))
2649 goto out;
69e3c75f
JB
2650 proto = saddr->sll_protocol;
2651 addr = saddr->sll_addr;
827d9780 2652 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2653 }
2654
69e3c75f
JB
2655 err = -ENXIO;
2656 if (unlikely(dev == NULL))
2657 goto out;
69e3c75f
JB
2658 err = -ENETDOWN;
2659 if (unlikely(!(dev->flags & IFF_UP)))
2660 goto out_put;
2661
d19b183c
DCS
2662 sockc.tsflags = po->sk.sk_tsflags;
2663 if (msg->msg_controllen) {
2664 err = sock_cmsg_send(&po->sk, msg, &sockc);
2665 if (unlikely(err))
2666 goto out_put;
2667 }
2668
5cfb4c8d
DB
2669 if (po->sk.sk_socket->type == SOCK_RAW)
2670 reserve = dev->hard_header_len;
69e3c75f 2671 size_max = po->tx_ring.frame_size
b5dd884e 2672 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2673
1d036d25 2674 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
5cfb4c8d 2675 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2676
69e3c75f
JB
2677 do {
2678 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2679 TP_STATUS_SEND_REQUEST);
69e3c75f 2680 if (unlikely(ph == NULL)) {
87a2fd28
DB
2681 if (need_wait && need_resched())
2682 schedule();
69e3c75f
JB
2683 continue;
2684 }
2685
8d39b4a6
WB
2686 skb = NULL;
2687 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2688 if (tp_len < 0)
2689 goto tpacket_error;
2690
69e3c75f 2691 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2692 hlen = LL_RESERVED_SPACE(dev);
2693 tlen = dev->needed_tailroom;
1d036d25
WB
2694 if (po->has_vnet_hdr) {
2695 vnet_hdr = data;
2696 data += sizeof(*vnet_hdr);
2697 tp_len -= sizeof(*vnet_hdr);
2698 if (tp_len < 0 ||
2699 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2700 tp_len = -EINVAL;
2701 goto tpacket_error;
2702 }
2703 copylen = __virtio16_to_cpu(vio_le(),
2704 vnet_hdr->hdr_len);
2705 }
9ed988cd 2706 copylen = max_t(int, copylen, dev->hard_header_len);
69e3c75f 2707 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2708 hlen + tlen + sizeof(struct sockaddr_ll) +
2709 (copylen - dev->hard_header_len),
fbf33a28 2710 !need_wait, &err);
69e3c75f 2711
fbf33a28
KM
2712 if (unlikely(skb == NULL)) {
2713 /* we assume the socket was initially writeable ... */
2714 if (likely(len_sum > 0))
2715 err = len_sum;
69e3c75f 2716 goto out_status;
fbf33a28 2717 }
8d39b4a6 2718 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
c14ac945 2719 addr, hlen, copylen, &sockc);
dbd46ab4 2720 if (likely(tp_len >= 0) &&
5cfb4c8d 2721 tp_len > dev->mtu + reserve &&
1d036d25 2722 !po->has_vnet_hdr &&
3c70c132
DB
2723 !packet_extra_vlan_len_allowed(dev, skb))
2724 tp_len = -EMSGSIZE;
69e3c75f
JB
2725
2726 if (unlikely(tp_len < 0)) {
8d39b4a6 2727tpacket_error:
69e3c75f
JB
2728 if (po->tp_loss) {
2729 __packet_set_status(po, ph,
2730 TP_STATUS_AVAILABLE);
2731 packet_increment_head(&po->tx_ring);
2732 kfree_skb(skb);
2733 continue;
2734 } else {
2735 status = TP_STATUS_WRONG_FORMAT;
2736 err = tp_len;
2737 goto out_status;
2738 }
2739 }
2740
db60eb5f
JR
2741 if (po->has_vnet_hdr && virtio_net_hdr_to_skb(skb, vnet_hdr,
2742 vio_le())) {
1d036d25
WB
2743 tp_len = -EINVAL;
2744 goto tpacket_error;
2745 }
2746
69e3c75f
JB
2747 skb->destructor = tpacket_destruct_skb;
2748 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2749 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2750
2751 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2752 err = po->xmit(skb);
eb70df13
JP
2753 if (unlikely(err > 0)) {
2754 err = net_xmit_errno(err);
2755 if (err && __packet_get_status(po, ph) ==
2756 TP_STATUS_AVAILABLE) {
2757 /* skb was destructed already */
2758 skb = NULL;
2759 goto out_status;
2760 }
2761 /*
2762 * skb was dropped but not destructed yet;
2763 * let's treat it like congestion or err < 0
2764 */
2765 err = 0;
2766 }
69e3c75f
JB
2767 packet_increment_head(&po->tx_ring);
2768 len_sum += tp_len;
b0138408
DB
2769 } while (likely((ph != NULL) ||
2770 /* Note: packet_read_pending() might be slow if we have
2771 * to call it, as it's a per-CPU variable, but in the fast path
2772 * we already short-circuit the loop with the first
2773 * condition, and luckily don't have to take that path
2774 * anyway.
2775 */
2776 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2777
2778 err = len_sum;
2779 goto out_put;
2780
69e3c75f
JB
2781out_status:
2782 __packet_set_status(po, ph, status);
2783 kfree_skb(skb);
2784out_put:
e40526cb 2785 dev_put(dev);
69e3c75f
JB
2786out:
2787 mutex_unlock(&po->pg_vec_lock);
2788 return err;
2789}
69e3c75f 2790
eea49cc9
OJ
2791static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2792 size_t reserve, size_t len,
2793 size_t linear, int noblock,
2794 int *err)
bfd5f4a3
SS
2795{
2796 struct sk_buff *skb;
2797
2798 /* Under a page? Don't bother with paged skb. */
2799 if (prepad + len < PAGE_SIZE || !linear)
2800 linear = len;
2801
2802 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2803 err, 0);
bfd5f4a3
SS
2804 if (!skb)
2805 return NULL;
2806
2807 skb_reserve(skb, reserve);
2808 skb_put(skb, linear);
2809 skb->data_len = len - linear;
2810 skb->len += len - linear;
2811
2812 return skb;
2813}
2814
d346a3fa 2815static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2816{
2817 struct sock *sk = sock->sk;
342dfc30 2818 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2819 struct sk_buff *skb;
2820 struct net_device *dev;
0e11c91e 2821 __be16 proto;
1da177e4 2822 unsigned char *addr;
827d9780 2823 int err, reserve = 0;
c7d39e32 2824 struct sockcm_cookie sockc;
bfd5f4a3
SS
2825 struct virtio_net_hdr vnet_hdr = { 0 };
2826 int offset = 0;
bfd5f4a3 2827 struct packet_sock *po = pkt_sk(sk);
da7c9561 2828 bool has_vnet_hdr = false;
57031eb7 2829 int hlen, tlen, linear;
3bdc0eba 2830 int extra_len = 0;
1da177e4
LT
2831
2832 /*
1ce4f28b 2833 * Get and verify the address.
1da177e4 2834 */
1ce4f28b 2835
66e56cd4 2836 if (likely(saddr == NULL)) {
e40526cb 2837 dev = packet_cached_dev_get(po);
1da177e4
LT
2838 proto = po->num;
2839 addr = NULL;
2840 } else {
2841 err = -EINVAL;
2842 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2843 goto out;
0fb375fb
EB
2844 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2845 goto out;
1da177e4
LT
2846 proto = saddr->sll_protocol;
2847 addr = saddr->sll_addr;
827d9780 2848 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2849 }
2850
1da177e4 2851 err = -ENXIO;
e40526cb 2852 if (unlikely(dev == NULL))
1da177e4 2853 goto out_unlock;
d5e76b0a 2854 err = -ENETDOWN;
e40526cb 2855 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2856 goto out_unlock;
2857
edbe7746 2858 sockc.tsflags = sk->sk_tsflags;
c7d39e32
EJ
2859 sockc.mark = sk->sk_mark;
2860 if (msg->msg_controllen) {
2861 err = sock_cmsg_send(sk, msg, &sockc);
2862 if (unlikely(err))
2863 goto out_unlock;
2864 }
2865
e40526cb
DB
2866 if (sock->type == SOCK_RAW)
2867 reserve = dev->hard_header_len;
bfd5f4a3 2868 if (po->has_vnet_hdr) {
16cc1400
WB
2869 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2870 if (err)
bfd5f4a3 2871 goto out_unlock;
da7c9561 2872 has_vnet_hdr = true;
bfd5f4a3
SS
2873 }
2874
3bdc0eba
BG
2875 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2876 if (!netif_supports_nofcs(dev)) {
2877 err = -EPROTONOSUPPORT;
2878 goto out_unlock;
2879 }
2880 extra_len = 4; /* We're doing our own CRC */
2881 }
2882
1da177e4 2883 err = -EMSGSIZE;
16cc1400
WB
2884 if (!vnet_hdr.gso_type &&
2885 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2886 goto out_unlock;
2887
bfd5f4a3 2888 err = -ENOBUFS;
ae641949
HX
2889 hlen = LL_RESERVED_SPACE(dev);
2890 tlen = dev->needed_tailroom;
57031eb7
WB
2891 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2892 linear = max(linear, min_t(int, len, dev->hard_header_len));
2893 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
bfd5f4a3 2894 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2895 if (skb == NULL)
1da177e4
LT
2896 goto out_unlock;
2897
bfd5f4a3 2898 skb_set_network_header(skb, reserve);
1da177e4 2899
0c4e8581 2900 err = -EINVAL;
9c707762
WB
2901 if (sock->type == SOCK_DGRAM) {
2902 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2903 if (unlikely(offset < 0))
9c707762 2904 goto out_free;
9c707762 2905 }
1da177e4
LT
2906
2907 /* Returns -EFAULT on error */
c0371da6 2908 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2909 if (err)
2910 goto out_free;
bf84a010 2911
9ed988cd
WB
2912 if (sock->type == SOCK_RAW &&
2913 !dev_validate_header(dev, skb->data, len)) {
2914 err = -EINVAL;
2915 goto out_free;
2916 }
2917
c14ac945 2918 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
1da177e4 2919
16cc1400 2920 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
2921 !packet_extra_vlan_len_allowed(dev, skb)) {
2922 err = -EMSGSIZE;
2923 goto out_free;
57f89bfa
BG
2924 }
2925
09effa67
DM
2926 skb->protocol = proto;
2927 skb->dev = dev;
1da177e4 2928 skb->priority = sk->sk_priority;
c7d39e32 2929 skb->mark = sockc.mark;
0fd5d57b 2930
da7c9561 2931 if (has_vnet_hdr) {
db60eb5f 2932 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
16cc1400
WB
2933 if (err)
2934 goto out_free;
2935 len += sizeof(vnet_hdr);
bfd5f4a3
SS
2936 }
2937
8fd6c80d
DB
2938 skb_probe_transport_header(skb, reserve);
2939
3bdc0eba
BG
2940 if (unlikely(extra_len == 4))
2941 skb->no_fcs = 1;
2942
d346a3fa 2943 err = po->xmit(skb);
1da177e4
LT
2944 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2945 goto out_unlock;
2946
e40526cb 2947 dev_put(dev);
1da177e4 2948
40d4e3df 2949 return len;
1da177e4
LT
2950
2951out_free:
2952 kfree_skb(skb);
2953out_unlock:
e40526cb 2954 if (dev)
1da177e4
LT
2955 dev_put(dev);
2956out:
2957 return err;
2958}
2959
1b784140 2960static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2961{
69e3c75f
JB
2962 struct sock *sk = sock->sk;
2963 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2964
69e3c75f
JB
2965 if (po->tx_ring.pg_vec)
2966 return tpacket_snd(po, msg);
2967 else
69e3c75f
JB
2968 return packet_snd(sock, msg, len);
2969}
2970
1da177e4
LT
2971/*
2972 * Close a PACKET socket. This is fairly simple. We immediately go
2973 * to 'closed' state and remove our protocol entry in the device list.
2974 */
2975
2976static int packet_release(struct socket *sock)
2977{
2978 struct sock *sk = sock->sk;
2979 struct packet_sock *po;
2bd624b4 2980 struct packet_fanout *f;
d12d01d6 2981 struct net *net;
f6fb8f10 2982 union tpacket_req_u req_u;
1da177e4
LT
2983
2984 if (!sk)
2985 return 0;
2986
3b1e0a65 2987 net = sock_net(sk);
1da177e4
LT
2988 po = pkt_sk(sk);
2989
0fa7fa98 2990 mutex_lock(&net->packet.sklist_lock);
808f5114 2991 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2992 mutex_unlock(&net->packet.sklist_lock);
2993
2994 preempt_disable();
920de804 2995 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2996 preempt_enable();
1da177e4 2997
808f5114 2998 spin_lock(&po->bind_lock);
ce06b03e 2999 unregister_prot_hook(sk, false);
66e56cd4
DB
3000 packet_cached_dev_reset(po);
3001
160ff18a
BG
3002 if (po->prot_hook.dev) {
3003 dev_put(po->prot_hook.dev);
3004 po->prot_hook.dev = NULL;
3005 }
808f5114 3006 spin_unlock(&po->bind_lock);
1da177e4 3007
1da177e4 3008 packet_flush_mclist(sk);
1da177e4 3009
9665d5d6
PS
3010 if (po->rx_ring.pg_vec) {
3011 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3012 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 3013 }
69e3c75f 3014
9665d5d6
PS
3015 if (po->tx_ring.pg_vec) {
3016 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3017 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 3018 }
1da177e4 3019
2bd624b4 3020 f = fanout_release(sk);
dc99f600 3021
808f5114 3022 synchronize_net();
2bd624b4
AS
3023
3024 if (f) {
57f015f5 3025 kfree(po->rollover);
2bd624b4
AS
3026 fanout_release_data(f);
3027 kfree(f);
3028 }
1da177e4
LT
3029 /*
3030 * Now the socket is dead. No more input will appear.
3031 */
1da177e4
LT
3032 sock_orphan(sk);
3033 sock->sk = NULL;
3034
3035 /* Purge queues */
3036
3037 skb_queue_purge(&sk->sk_receive_queue);
b0138408 3038 packet_free_pending(po);
17ab56a2 3039 sk_refcnt_debug_release(sk);
1da177e4
LT
3040
3041 sock_put(sk);
3042 return 0;
3043}
3044
3045/*
3046 * Attach a packet hook.
3047 */
3048
30f7ea1c
FR
3049static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3050 __be16 proto)
1da177e4
LT
3051{
3052 struct packet_sock *po = pkt_sk(sk);
158cd4af 3053 struct net_device *dev_curr;
902fefb8
DB
3054 __be16 proto_curr;
3055 bool need_rehook;
30f7ea1c
FR
3056 struct net_device *dev = NULL;
3057 int ret = 0;
3058 bool unlisted = false;
dc99f600 3059
1da177e4 3060 lock_sock(sk);
1da177e4 3061 spin_lock(&po->bind_lock);
30f7ea1c
FR
3062 rcu_read_lock();
3063
4971613c
WB
3064 if (po->fanout) {
3065 ret = -EINVAL;
3066 goto out_unlock;
3067 }
3068
30f7ea1c
FR
3069 if (name) {
3070 dev = dev_get_by_name_rcu(sock_net(sk), name);
3071 if (!dev) {
3072 ret = -ENODEV;
3073 goto out_unlock;
3074 }
3075 } else if (ifindex) {
3076 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3077 if (!dev) {
3078 ret = -ENODEV;
3079 goto out_unlock;
3080 }
3081 }
3082
3083 if (dev)
3084 dev_hold(dev);
66e56cd4 3085
902fefb8
DB
3086 proto_curr = po->prot_hook.type;
3087 dev_curr = po->prot_hook.dev;
3088
3089 need_rehook = proto_curr != proto || dev_curr != dev;
3090
3091 if (need_rehook) {
30f7ea1c
FR
3092 if (po->running) {
3093 rcu_read_unlock();
15fe076e
ED
3094 /* prevents packet_notifier() from calling
3095 * register_prot_hook()
3096 */
3097 po->num = 0;
30f7ea1c
FR
3098 __unregister_prot_hook(sk, true);
3099 rcu_read_lock();
3100 dev_curr = po->prot_hook.dev;
3101 if (dev)
3102 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3103 dev->ifindex);
3104 }
1da177e4 3105
15fe076e 3106 BUG_ON(po->running);
902fefb8
DB
3107 po->num = proto;
3108 po->prot_hook.type = proto;
902fefb8 3109
30f7ea1c
FR
3110 if (unlikely(unlisted)) {
3111 dev_put(dev);
3112 po->prot_hook.dev = NULL;
3113 po->ifindex = -1;
3114 packet_cached_dev_reset(po);
3115 } else {
3116 po->prot_hook.dev = dev;
3117 po->ifindex = dev ? dev->ifindex : 0;
3118 packet_cached_dev_assign(po, dev);
3119 }
902fefb8 3120 }
158cd4af
LW
3121 if (dev_curr)
3122 dev_put(dev_curr);
66e56cd4 3123
902fefb8 3124 if (proto == 0 || !need_rehook)
1da177e4
LT
3125 goto out_unlock;
3126
30f7ea1c 3127 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3128 register_prot_hook(sk);
be85d4ad
UT
3129 } else {
3130 sk->sk_err = ENETDOWN;
3131 if (!sock_flag(sk, SOCK_DEAD))
3132 sk->sk_error_report(sk);
1da177e4
LT
3133 }
3134
3135out_unlock:
30f7ea1c 3136 rcu_read_unlock();
1da177e4
LT
3137 spin_unlock(&po->bind_lock);
3138 release_sock(sk);
30f7ea1c 3139 return ret;
1da177e4
LT
3140}
3141
3142/*
3143 * Bind a packet socket to a device
3144 */
3145
40d4e3df
ED
3146static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3147 int addr_len)
1da177e4 3148{
40d4e3df 3149 struct sock *sk = sock->sk;
540e2894 3150 char name[sizeof(uaddr->sa_data) + 1];
1ce4f28b 3151
1da177e4
LT
3152 /*
3153 * Check legality
3154 */
1ce4f28b 3155
8ae55f04 3156 if (addr_len != sizeof(struct sockaddr))
1da177e4 3157 return -EINVAL;
540e2894
AP
3158 /* uaddr->sa_data comes from the userspace, it's not guaranteed to be
3159 * zero-terminated.
3160 */
3161 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3162 name[sizeof(uaddr->sa_data)] = 0;
1da177e4 3163
30f7ea1c 3164 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3165}
1da177e4
LT
3166
3167static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3168{
40d4e3df
ED
3169 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3170 struct sock *sk = sock->sk;
1da177e4
LT
3171
3172 /*
3173 * Check legality
3174 */
1ce4f28b 3175
1da177e4
LT
3176 if (addr_len < sizeof(struct sockaddr_ll))
3177 return -EINVAL;
3178 if (sll->sll_family != AF_PACKET)
3179 return -EINVAL;
3180
30f7ea1c
FR
3181 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3182 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3183}
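/* A minimal, hypothetical userspace sketch of the bind() path handled by
 * packet_bind() above (not part of this file): restrict a packet socket
 * to one interface. "eth0" and "fd" are placeholders.
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 *
 * As packet_bind() shows, a zero sll_protocol falls back to the protocol
 * the socket already uses.
 */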
3184
3185static struct proto packet_proto = {
3186 .name = "PACKET",
3187 .owner = THIS_MODULE,
3188 .obj_size = sizeof(struct packet_sock),
3189};
3190
3191/*
1ce4f28b 3192 * Create a packet of type SOCK_PACKET.
1da177e4
LT
3193 */
3194
3f378b68
EP
3195static int packet_create(struct net *net, struct socket *sock, int protocol,
3196 int kern)
1da177e4
LT
3197{
3198 struct sock *sk;
3199 struct packet_sock *po;
0e11c91e 3200 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3201 int err;
3202
df008c91 3203 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3204 return -EPERM;
be02097c
DM
3205 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3206 sock->type != SOCK_PACKET)
1da177e4
LT
3207 return -ESOCKTNOSUPPORT;
3208
3209 sock->state = SS_UNCONNECTED;
3210
3211 err = -ENOBUFS;
11aa9c28 3212 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3213 if (sk == NULL)
3214 goto out;
3215
3216 sock->ops = &packet_ops;
1da177e4
LT
3217 if (sock->type == SOCK_PACKET)
3218 sock->ops = &packet_ops_spkt;
be02097c 3219
1da177e4
LT
3220 sock_init_data(sock, sk);
3221
3222 po = pkt_sk(sk);
3223 sk->sk_family = PF_PACKET;
0e11c91e 3224 po->num = proto;
d346a3fa 3225 po->xmit = dev_queue_xmit;
66e56cd4 3226
b0138408
DB
3227 err = packet_alloc_pending(po);
3228 if (err)
3229 goto out2;
3230
66e56cd4 3231 packet_cached_dev_reset(po);
1da177e4
LT
3232
3233 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3234 sk_refcnt_debug_inc(sk);
1da177e4
LT
3235
3236 /*
3237 * Attach a protocol block
3238 */
3239
3240 spin_lock_init(&po->bind_lock);
905db440 3241 mutex_init(&po->pg_vec_lock);
0648ab70 3242 po->rollover = NULL;
1da177e4 3243 po->prot_hook.func = packet_rcv;
be02097c 3244
1da177e4
LT
3245 if (sock->type == SOCK_PACKET)
3246 po->prot_hook.func = packet_rcv_spkt;
be02097c 3247
1da177e4
LT
3248 po->prot_hook.af_packet_priv = sk;
3249
0e11c91e
AV
3250 if (proto) {
3251 po->prot_hook.type = proto;
ce06b03e 3252 register_prot_hook(sk);
1da177e4
LT
3253 }
3254
0fa7fa98 3255 mutex_lock(&net->packet.sklist_lock);
808f5114 3256 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3257 mutex_unlock(&net->packet.sklist_lock);
3258
3259 preempt_disable();
3680453c 3260 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3261 preempt_enable();
808f5114 3262
40d4e3df 3263 return 0;
b0138408
DB
3264out2:
3265 sk_free(sk);
1da177e4
LT
3266out:
3267 return err;
3268}
3269
3270/*
3271 * Pull a packet from our receive queue and hand it to the user.
3272 * If necessary we block.
3273 */
3274
1b784140
YX
3275static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3276 int flags)
1da177e4
LT
3277{
3278 struct sock *sk = sock->sk;
3279 struct sk_buff *skb;
3280 int copied, err;
bfd5f4a3 3281 int vnet_hdr_len = 0;
2472d761 3282 unsigned int origlen = 0;
1da177e4
LT
3283
3284 err = -EINVAL;
ed85b565 3285 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3286 goto out;
3287
3288#if 0
3289 /* What error should we return now? EUNATTACH? */
3290 if (pkt_sk(sk)->ifindex < 0)
3291 return -ENODEV;
3292#endif
3293
ed85b565 3294 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3295 err = sock_recv_errqueue(sk, msg, len,
3296 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3297 goto out;
3298 }
3299
1da177e4
LT
3300 /*
3301 * Call the generic datagram receiver. This handles all sorts
3302 * of horrible races and re-entrancy so we can forget about it
3303 * in the protocol layers.
3304 *
3305 * Now it will return ENETDOWN if the device has just gone down,
3306 * but then it will block.
3307 */
3308
40d4e3df 3309 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3310
3311 /*
1ce4f28b 3312 * An error occurred, so return it. Because skb_recv_datagram()
1da177e4
LT
3313 * handles the blocking, we don't have to see or worry about
3314 * blocking retries.
3315 */
3316
8ae55f04 3317 if (skb == NULL)
1da177e4
LT
3318 goto out;
3319
2ccdbaa6
WB
3320 if (pkt_sk(sk)->pressure)
3321 packet_rcv_has_room(pkt_sk(sk), NULL);
3322
bfd5f4a3 3323 if (pkt_sk(sk)->has_vnet_hdr) {
16cc1400
WB
3324 err = packet_rcv_vnet(msg, skb, &len);
3325 if (err)
bfd5f4a3 3326 goto out_free;
16cc1400 3327 vnet_hdr_len = sizeof(struct virtio_net_hdr);
bfd5f4a3
SS
3328 }
3329
f3d33426
HFS
3330 /* You lose any data beyond the buffer you gave. If it worries
3331 * a user program they can ask the device for its MTU
3332 * anyway.
1da177e4 3333 */
1da177e4 3334 copied = skb->len;
40d4e3df
ED
3335 if (copied > len) {
3336 copied = len;
3337 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3338 }
3339
51f3d02b 3340 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3341 if (err)
3342 goto out_free;
3343
2472d761
EB
3344 if (sock->type != SOCK_PACKET) {
3345 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3346
3347 /* Original length was stored in sockaddr_ll fields */
3348 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3349 sll->sll_family = AF_PACKET;
3350 sll->sll_protocol = skb->protocol;
3351 }
3352
3b885787 3353 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3354
f3d33426
HFS
3355 if (msg->msg_name) {
3356 /* If the address length field is there to be filled
3357 * in, we fill it in now.
3358 */
3359 if (sock->type == SOCK_PACKET) {
342dfc30 3360 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3361 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3362 } else {
3363 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3364
f3d33426
HFS
3365 msg->msg_namelen = sll->sll_halen +
3366 offsetof(struct sockaddr_ll, sll_addr);
3367 }
ffbc6111
HX
3368 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3369 msg->msg_namelen);
f3d33426 3370 }
1da177e4 3371
8dc41944 3372 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3373 struct tpacket_auxdata aux;
3374
3375 aux.tp_status = TP_STATUS_USER;
3376 if (skb->ip_summed == CHECKSUM_PARTIAL)
3377 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3378 else if (skb->pkt_type != PACKET_OUTGOING &&
3379 (skb->ip_summed == CHECKSUM_COMPLETE ||
3380 skb_csum_unnecessary(skb)))
3381 aux.tp_status |= TP_STATUS_CSUM_VALID;
3382
2472d761 3383 aux.tp_len = origlen;
ffbc6111
HX
3384 aux.tp_snaplen = skb->len;
3385 aux.tp_mac = 0;
bbe735e4 3386 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3387 if (skb_vlan_tag_present(skb)) {
3388 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3389 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3390 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3391 } else {
3392 aux.tp_vlan_tci = 0;
a0cdfcf3 3393 aux.tp_vlan_tpid = 0;
a3bcc23e 3394 }
ffbc6111 3395 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3396 }
3397
1da177e4
LT
3398 /*
3399 * Free or return the buffer as appropriate. Again this
3400 * hides all the races and re-entrancy issues from us.
3401 */
bfd5f4a3 3402 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3403
3404out_free:
3405 skb_free_datagram(sk, skb);
3406out:
3407 return err;
3408}
3409
1da177e4
LT
3410static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3411 int *uaddr_len, int peer)
3412{
3413 struct net_device *dev;
3414 struct sock *sk = sock->sk;
3415
3416 if (peer)
3417 return -EOPNOTSUPP;
3418
3419 uaddr->sa_family = AF_PACKET;
2dc85bf3 3420 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3421 rcu_read_lock();
3422 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3423 if (dev)
2dc85bf3 3424 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3425 rcu_read_unlock();
1da177e4
LT
3426 *uaddr_len = sizeof(*uaddr);
3427
3428 return 0;
3429}
1da177e4
LT
3430
3431static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3432 int *uaddr_len, int peer)
3433{
3434 struct net_device *dev;
3435 struct sock *sk = sock->sk;
3436 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3437 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3438
3439 if (peer)
3440 return -EOPNOTSUPP;
3441
3442 sll->sll_family = AF_PACKET;
3443 sll->sll_ifindex = po->ifindex;
3444 sll->sll_protocol = po->num;
67286640 3445 sll->sll_pkttype = 0;
654d1f8a
ED
3446 rcu_read_lock();
3447 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3448 if (dev) {
3449 sll->sll_hatype = dev->type;
3450 sll->sll_halen = dev->addr_len;
3451 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3452 } else {
3453 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3454 sll->sll_halen = 0;
3455 }
654d1f8a 3456 rcu_read_unlock();
0fb375fb 3457 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3458
3459 return 0;
3460}
3461
2aeb0b88
WC
3462static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3463 int what)
1da177e4
LT
3464{
3465 switch (i->type) {
3466 case PACKET_MR_MULTICAST:
1162563f
JP
3467 if (i->alen != dev->addr_len)
3468 return -EINVAL;
1da177e4 3469 if (what > 0)
22bedad3 3470 return dev_mc_add(dev, i->addr);
1da177e4 3471 else
22bedad3 3472 return dev_mc_del(dev, i->addr);
1da177e4
LT
3473 break;
3474 case PACKET_MR_PROMISC:
2aeb0b88 3475 return dev_set_promiscuity(dev, what);
1da177e4 3476 case PACKET_MR_ALLMULTI:
2aeb0b88 3477 return dev_set_allmulti(dev, what);
d95ed927 3478 case PACKET_MR_UNICAST:
1162563f
JP
3479 if (i->alen != dev->addr_len)
3480 return -EINVAL;
d95ed927 3481 if (what > 0)
a748ee24 3482 return dev_uc_add(dev, i->addr);
d95ed927 3483 else
a748ee24 3484 return dev_uc_del(dev, i->addr);
d95ed927 3485 break;
40d4e3df
ED
3486 default:
3487 break;
1da177e4 3488 }
2aeb0b88 3489 return 0;
1da177e4
LT
3490}
3491
82f17091
FR
3492static void packet_dev_mclist_delete(struct net_device *dev,
3493 struct packet_mclist **mlp)
1da177e4 3494{
82f17091
FR
3495 struct packet_mclist *ml;
3496
3497 while ((ml = *mlp) != NULL) {
3498 if (ml->ifindex == dev->ifindex) {
3499 packet_dev_mc(dev, ml, -1);
3500 *mlp = ml->next;
3501 kfree(ml);
3502 } else
3503 mlp = &ml->next;
1da177e4
LT
3504 }
3505}
3506
0fb375fb 3507static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3508{
3509 struct packet_sock *po = pkt_sk(sk);
3510 struct packet_mclist *ml, *i;
3511 struct net_device *dev;
3512 int err;
3513
3514 rtnl_lock();
3515
3516 err = -ENODEV;
3b1e0a65 3517 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3518 if (!dev)
3519 goto done;
3520
3521 err = -EINVAL;
1162563f 3522 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3523 goto done;
3524
3525 err = -ENOBUFS;
8b3a7005 3526 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3527 if (i == NULL)
3528 goto done;
3529
3530 err = 0;
3531 for (ml = po->mclist; ml; ml = ml->next) {
3532 if (ml->ifindex == mreq->mr_ifindex &&
3533 ml->type == mreq->mr_type &&
3534 ml->alen == mreq->mr_alen &&
3535 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3536 ml->count++;
3537 /* Free the new element ... */
3538 kfree(i);
3539 goto done;
3540 }
3541 }
3542
3543 i->type = mreq->mr_type;
3544 i->ifindex = mreq->mr_ifindex;
3545 i->alen = mreq->mr_alen;
3546 memcpy(i->addr, mreq->mr_address, i->alen);
309cf37f 3547 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3548 i->count = 1;
3549 i->next = po->mclist;
3550 po->mclist = i;
2aeb0b88
WC
3551 err = packet_dev_mc(dev, i, 1);
3552 if (err) {
3553 po->mclist = i->next;
3554 kfree(i);
3555 }
1da177e4
LT
3556
3557done:
3558 rtnl_unlock();
3559 return err;
3560}
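/* A minimal, hypothetical userspace sketch of the membership machinery
 * above (not part of this file): put one interface into promiscuous mode
 * for this socket via PACKET_ADD_MEMBERSHIP, which ends up in
 * packet_mc_add()/packet_dev_mc(). "eth0" and "fd" are placeholders.
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = if_nametoindex("eth0"),
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		   &mreq, sizeof(mreq));
 *
 * PACKET_DROP_MEMBERSHIP with the same mreq undoes it.
 */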
3561
0fb375fb 3562static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3563{
3564 struct packet_mclist *ml, **mlp;
3565
3566 rtnl_lock();
3567
3568 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3569 if (ml->ifindex == mreq->mr_ifindex &&
3570 ml->type == mreq->mr_type &&
3571 ml->alen == mreq->mr_alen &&
3572 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3573 if (--ml->count == 0) {
3574 struct net_device *dev;
3575 *mlp = ml->next;
ad959e76
ED
3576 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3577 if (dev)
1da177e4 3578 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3579 kfree(ml);
3580 }
82f17091 3581 break;
1da177e4
LT
3582 }
3583 }
3584 rtnl_unlock();
82f17091 3585 return 0;
1da177e4
LT
3586}
3587
3588static void packet_flush_mclist(struct sock *sk)
3589{
3590 struct packet_sock *po = pkt_sk(sk);
3591 struct packet_mclist *ml;
3592
3593 if (!po->mclist)
3594 return;
3595
3596 rtnl_lock();
3597 while ((ml = po->mclist) != NULL) {
3598 struct net_device *dev;
3599
3600 po->mclist = ml->next;
ad959e76
ED
3601 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3602 if (dev != NULL)
1da177e4 3603 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3604 kfree(ml);
3605 }
3606 rtnl_unlock();
3607}
1da177e4
LT
3608
3609static int
b7058842 3610packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3611{
3612 struct sock *sk = sock->sk;
8dc41944 3613 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3614 int ret;
3615
3616 if (level != SOL_PACKET)
3617 return -ENOPROTOOPT;
3618
69e3c75f 3619 switch (optname) {
1ce4f28b 3620 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3621 case PACKET_DROP_MEMBERSHIP:
3622 {
0fb375fb
EB
3623 struct packet_mreq_max mreq;
3624 int len = optlen;
3625 memset(&mreq, 0, sizeof(mreq));
3626 if (len < sizeof(struct packet_mreq))
1da177e4 3627 return -EINVAL;
0fb375fb
EB
3628 if (len > sizeof(mreq))
3629 len = sizeof(mreq);
40d4e3df 3630 if (copy_from_user(&mreq, optval, len))
1da177e4 3631 return -EFAULT;
0fb375fb
EB
3632 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3633 return -EINVAL;
1da177e4
LT
3634 if (optname == PACKET_ADD_MEMBERSHIP)
3635 ret = packet_mc_add(sk, &mreq);
3636 else
3637 ret = packet_mc_drop(sk, &mreq);
3638 return ret;
3639 }
a2efcfa0 3640
1da177e4 3641 case PACKET_RX_RING:
69e3c75f 3642 case PACKET_TX_RING:
1da177e4 3643 {
f6fb8f10 3644 union tpacket_req_u req_u;
3645 int len;
1da177e4 3646
f6fb8f10 3647 switch (po->tp_version) {
3648 case TPACKET_V1:
3649 case TPACKET_V2:
3650 len = sizeof(req_u.req);
3651 break;
3652 case TPACKET_V3:
3653 default:
3654 len = sizeof(req_u.req3);
3655 break;
3656 }
3657 if (optlen < len)
1da177e4 3658 return -EINVAL;
f6fb8f10 3659 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3660 return -EFAULT;
f6fb8f10 3661 return packet_set_ring(sk, &req_u, 0,
3662 optname == PACKET_TX_RING);
1da177e4
LT
3663 }
3664 case PACKET_COPY_THRESH:
3665 {
3666 int val;
3667
40d4e3df 3668 if (optlen != sizeof(val))
1da177e4 3669 return -EINVAL;
40d4e3df 3670 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3671 return -EFAULT;
3672
3673 pkt_sk(sk)->copy_thresh = val;
3674 return 0;
3675 }
bbd6ef87
PM
3676 case PACKET_VERSION:
3677 {
3678 int val;
3679
3680 if (optlen != sizeof(val))
3681 return -EINVAL;
bbd6ef87
PM
3682 if (copy_from_user(&val, optval, sizeof(val)))
3683 return -EFAULT;
3684 switch (val) {
3685 case TPACKET_V1:
3686 case TPACKET_V2:
f6fb8f10 3687 case TPACKET_V3:
84ac7260 3688 break;
bbd6ef87
PM
3689 default:
3690 return -EINVAL;
3691 }
84ac7260
PP
3692 lock_sock(sk);
3693 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3694 ret = -EBUSY;
3695 } else {
3696 po->tp_version = val;
3697 ret = 0;
3698 }
3699 release_sock(sk);
3700 return ret;
bbd6ef87 3701 }
8913336a
PM
3702 case PACKET_RESERVE:
3703 {
3704 unsigned int val;
3705
3706 if (optlen != sizeof(val))
3707 return -EINVAL;
8913336a
PM
3708 if (copy_from_user(&val, optval, sizeof(val)))
3709 return -EFAULT;
bcc5364b
AK
3710 if (val > INT_MAX)
3711 return -EINVAL;
c27927e3
WB
3712 lock_sock(sk);
3713 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3714 ret = -EBUSY;
3715 } else {
3716 po->tp_reserve = val;
3717 ret = 0;
3718 }
3719 release_sock(sk);
3720 return ret;
8913336a 3721 }
69e3c75f
JB
3722 case PACKET_LOSS:
3723 {
3724 unsigned int val;
3725
3726 if (optlen != sizeof(val))
3727 return -EINVAL;
3728 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3729 return -EBUSY;
3730 if (copy_from_user(&val, optval, sizeof(val)))
3731 return -EFAULT;
3732 po->tp_loss = !!val;
3733 return 0;
3734 }
8dc41944
HX
3735 case PACKET_AUXDATA:
3736 {
3737 int val;
3738
3739 if (optlen < sizeof(val))
3740 return -EINVAL;
3741 if (copy_from_user(&val, optval, sizeof(val)))
3742 return -EFAULT;
3743
3744 po->auxdata = !!val;
3745 return 0;
3746 }
80feaacb
PWJ
3747 case PACKET_ORIGDEV:
3748 {
3749 int val;
3750
3751 if (optlen < sizeof(val))
3752 return -EINVAL;
3753 if (copy_from_user(&val, optval, sizeof(val)))
3754 return -EFAULT;
3755
3756 po->origdev = !!val;
3757 return 0;
3758 }
bfd5f4a3
SS
3759 case PACKET_VNET_HDR:
3760 {
3761 int val;
3762
3763 if (sock->type != SOCK_RAW)
3764 return -EINVAL;
3765 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3766 return -EBUSY;
3767 if (optlen < sizeof(val))
3768 return -EINVAL;
3769 if (copy_from_user(&val, optval, sizeof(val)))
3770 return -EFAULT;
3771
3772 po->has_vnet_hdr = !!val;
3773 return 0;
3774 }
614f60fa
SM
3775 case PACKET_TIMESTAMP:
3776 {
3777 int val;
3778
3779 if (optlen != sizeof(val))
3780 return -EINVAL;
3781 if (copy_from_user(&val, optval, sizeof(val)))
3782 return -EFAULT;
3783
3784 po->tp_tstamp = val;
3785 return 0;
3786 }
dc99f600
DM
3787 case PACKET_FANOUT:
3788 {
3789 int val;
3790
3791 if (optlen != sizeof(val))
3792 return -EINVAL;
3793 if (copy_from_user(&val, optval, sizeof(val)))
3794 return -EFAULT;
3795
3796 return fanout_add(sk, val & 0xffff, val >> 16);
3797 }
47dceb8e
WB
3798 case PACKET_FANOUT_DATA:
3799 {
3800 if (!po->fanout)
3801 return -EINVAL;
3802
3803 return fanout_set_data(po, optval, optlen);
3804 }
5920cd3a
PC
3805 case PACKET_TX_HAS_OFF:
3806 {
3807 unsigned int val;
3808
3809 if (optlen != sizeof(val))
3810 return -EINVAL;
3811 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3812 return -EBUSY;
3813 if (copy_from_user(&val, optval, sizeof(val)))
3814 return -EFAULT;
3815 po->tp_tx_has_off = !!val;
3816 return 0;
3817 }
d346a3fa
DB
3818 case PACKET_QDISC_BYPASS:
3819 {
3820 int val;
3821
3822 if (optlen != sizeof(val))
3823 return -EINVAL;
3824 if (copy_from_user(&val, optval, sizeof(val)))
3825 return -EFAULT;
3826
3827 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3828 return 0;
3829 }
1da177e4
LT
3830 default:
3831 return -ENOPROTOOPT;
3832 }
3833}
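
As the PACKET_FANOUT case above shows, the option value packs the fanout group id into the low 16 bits and the mode plus flags into the upper 16 bits before calling fanout_add(). A minimal user-space sketch (illustrative only; the group id 42 is arbitrary) that adds a socket to a flow-hash fanout group:

/* Illustrative sketch: join fanout group 42 in flow-hash mode.  Every
 * socket that joins the same group id receives a share of the traffic. */
#include <sys/socket.h>
#include <linux/if_packet.h>

static int join_fanout(int fd)
{
        int group_id = 42;                              /* arbitrary 16-bit id */
        int val = group_id | (PACKET_FANOUT_HASH << 16);

        return setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
}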
3834
3835static int packet_getsockopt(struct socket *sock, int level, int optname,
3836 char __user *optval, int __user *optlen)
3837{
3838 int len;
c06fff6e 3839 int val, lv = sizeof(val);
1da177e4
LT
3840 struct sock *sk = sock->sk;
3841 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3842 void *data = &val;
ee80fbf3 3843 union tpacket_stats_u st;
a9b63918 3844 struct tpacket_rollover_stats rstats;
1da177e4
LT
3845
3846 if (level != SOL_PACKET)
3847 return -ENOPROTOOPT;
3848
8ae55f04
KK
3849 if (get_user(len, optlen))
3850 return -EFAULT;
1da177e4
LT
3851
3852 if (len < 0)
3853 return -EINVAL;
1ce4f28b 3854
69e3c75f 3855 switch (optname) {
1da177e4 3856 case PACKET_STATISTICS:
1da177e4 3857 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3858 memcpy(&st, &po->stats, sizeof(st));
3859 memset(&po->stats, 0, sizeof(po->stats));
3860 spin_unlock_bh(&sk->sk_receive_queue.lock);
3861
f6fb8f10 3862 if (po->tp_version == TPACKET_V3) {
c06fff6e 3863 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3864 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3865 data = &st.stats3;
f6fb8f10 3866 } else {
c06fff6e 3867 lv = sizeof(struct tpacket_stats);
8bcdeaff 3868 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3869 data = &st.stats1;
f6fb8f10 3870 }
ee80fbf3 3871
8dc41944
HX
3872 break;
3873 case PACKET_AUXDATA:
8dc41944 3874 val = po->auxdata;
80feaacb
PWJ
3875 break;
3876 case PACKET_ORIGDEV:
80feaacb 3877 val = po->origdev;
bfd5f4a3
SS
3878 break;
3879 case PACKET_VNET_HDR:
bfd5f4a3 3880 val = po->has_vnet_hdr;
1da177e4 3881 break;
bbd6ef87 3882 case PACKET_VERSION:
bbd6ef87 3883 val = po->tp_version;
bbd6ef87
PM
3884 break;
3885 case PACKET_HDRLEN:
3886 if (len > sizeof(int))
3887 len = sizeof(int);
fd2c83b3
AP
3888 if (len < sizeof(int))
3889 return -EINVAL;
bbd6ef87
PM
3890 if (copy_from_user(&val, optval, len))
3891 return -EFAULT;
3892 switch (val) {
3893 case TPACKET_V1:
3894 val = sizeof(struct tpacket_hdr);
3895 break;
3896 case TPACKET_V2:
3897 val = sizeof(struct tpacket2_hdr);
3898 break;
f6fb8f10 3899 case TPACKET_V3:
3900 val = sizeof(struct tpacket3_hdr);
3901 break;
bbd6ef87
PM
3902 default:
3903 return -EINVAL;
3904 }
bbd6ef87 3905 break;
8913336a 3906 case PACKET_RESERVE:
8913336a 3907 val = po->tp_reserve;
8913336a 3908 break;
69e3c75f 3909 case PACKET_LOSS:
69e3c75f 3910 val = po->tp_loss;
69e3c75f 3911 break;
614f60fa 3912 case PACKET_TIMESTAMP:
614f60fa 3913 val = po->tp_tstamp;
614f60fa 3914 break;
dc99f600 3915 case PACKET_FANOUT:
dc99f600
DM
3916 val = (po->fanout ?
3917 ((u32)po->fanout->id |
77f65ebd
WB
3918 ((u32)po->fanout->type << 16) |
3919 ((u32)po->fanout->flags << 24)) :
dc99f600 3920 0);
dc99f600 3921 break;
a9b63918 3922 case PACKET_ROLLOVER_STATS:
57f015f5 3923 if (!po->rollover)
a9b63918 3924 return -EINVAL;
57f015f5
MM
3925 rstats.tp_all = atomic_long_read(&po->rollover->num);
3926 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3927 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3928 data = &rstats;
3929 lv = sizeof(rstats);
a9b63918 3930 break;
5920cd3a
PC
3931 case PACKET_TX_HAS_OFF:
3932 val = po->tp_tx_has_off;
3933 break;
d346a3fa
DB
3934 case PACKET_QDISC_BYPASS:
3935 val = packet_use_direct_xmit(po);
3936 break;
1da177e4
LT
3937 default:
3938 return -ENOPROTOOPT;
3939 }
3940
c06fff6e
ED
3941 if (len > lv)
3942 len = lv;
8ae55f04
KK
3943 if (put_user(len, optlen))
3944 return -EFAULT;
8dc41944
HX
3945 if (copy_to_user(optval, data, len))
3946 return -EFAULT;
8ae55f04 3947 return 0;
1da177e4
LT
3948}
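
The PACKET_STATISTICS branch above copies out struct tpacket_stats (or tpacket_stats_v3 when TPACKET_V3 is in use) and zeroes the in-kernel counters on every read, so each call reports the delta since the previous one. A user-space sketch for the V1/V2 layout (dump_stats is an illustrative name):

/* Illustrative sketch: read, and implicitly reset, the receive/drop
 * counters of a TPACKET_V1/V2 packet socket. */
#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void dump_stats(int fd)
{
        struct tpacket_stats st;
        socklen_t len = sizeof(st);

        if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
                /* tp_packets already includes tp_drops, as arranged above */
                printf("received %u, dropped %u\n", st.tp_packets, st.tp_drops);
}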
3949
3950
719c44d3
WB
3951#ifdef CONFIG_COMPAT
3952static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
3953 char __user *optval, unsigned int optlen)
3954{
3955 struct packet_sock *po = pkt_sk(sock->sk);
3956
3957 if (level != SOL_PACKET)
3958 return -ENOPROTOOPT;
3959
3960 if (optname == PACKET_FANOUT_DATA &&
3961 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
3962 optval = (char __user *)get_compat_bpf_fprog(optval);
3963 if (!optval)
3964 return -EFAULT;
3965 optlen = sizeof(struct sock_fprog);
3966 }
3967
3968 return packet_setsockopt(sock, level, optname, optval, optlen);
3969}
3970#endif
3971
351638e7
JP
3972static int packet_notifier(struct notifier_block *this,
3973 unsigned long msg, void *ptr)
1da177e4
LT
3974{
3975 struct sock *sk;
351638e7 3976 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 3977 struct net *net = dev_net(dev);
1da177e4 3978
808f5114 3979 rcu_read_lock();
b67bfe0d 3980 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3981 struct packet_sock *po = pkt_sk(sk);
3982
3983 switch (msg) {
3984 case NETDEV_UNREGISTER:
1da177e4 3985 if (po->mclist)
82f17091 3986 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
3987 /* fallthrough */
3988
1da177e4
LT
3989 case NETDEV_DOWN:
3990 if (dev->ifindex == po->ifindex) {
3991 spin_lock(&po->bind_lock);
3992 if (po->running) {
ce06b03e 3993 __unregister_prot_hook(sk, false);
1da177e4
LT
3994 sk->sk_err = ENETDOWN;
3995 if (!sock_flag(sk, SOCK_DEAD))
3996 sk->sk_error_report(sk);
3997 }
3998 if (msg == NETDEV_UNREGISTER) {
66e56cd4 3999 packet_cached_dev_reset(po);
1da177e4 4000 po->ifindex = -1;
160ff18a
BG
4001 if (po->prot_hook.dev)
4002 dev_put(po->prot_hook.dev);
1da177e4
LT
4003 po->prot_hook.dev = NULL;
4004 }
4005 spin_unlock(&po->bind_lock);
4006 }
4007 break;
4008 case NETDEV_UP:
808f5114 4009 if (dev->ifindex == po->ifindex) {
4010 spin_lock(&po->bind_lock);
ce06b03e
DM
4011 if (po->num)
4012 register_prot_hook(sk);
808f5114 4013 spin_unlock(&po->bind_lock);
1da177e4 4014 }
1da177e4
LT
4015 break;
4016 }
4017 }
808f5114 4018 rcu_read_unlock();
1da177e4
LT
4019 return NOTIFY_DONE;
4020}
4021
4022
4023static int packet_ioctl(struct socket *sock, unsigned int cmd,
4024 unsigned long arg)
4025{
4026 struct sock *sk = sock->sk;
4027
69e3c75f 4028 switch (cmd) {
40d4e3df
ED
4029 case SIOCOUTQ:
4030 {
4031 int amount = sk_wmem_alloc_get(sk);
31e6d363 4032
40d4e3df
ED
4033 return put_user(amount, (int __user *)arg);
4034 }
4035 case SIOCINQ:
4036 {
4037 struct sk_buff *skb;
4038 int amount = 0;
4039
4040 spin_lock_bh(&sk->sk_receive_queue.lock);
4041 skb = skb_peek(&sk->sk_receive_queue);
4042 if (skb)
4043 amount = skb->len;
4044 spin_unlock_bh(&sk->sk_receive_queue.lock);
4045 return put_user(amount, (int __user *)arg);
4046 }
4047 case SIOCGSTAMP:
4048 return sock_get_timestamp(sk, (struct timeval __user *)arg);
4049 case SIOCGSTAMPNS:
4050 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 4051
1da177e4 4052#ifdef CONFIG_INET
40d4e3df
ED
4053 case SIOCADDRT:
4054 case SIOCDELRT:
4055 case SIOCDARP:
4056 case SIOCGARP:
4057 case SIOCSARP:
4058 case SIOCGIFADDR:
4059 case SIOCSIFADDR:
4060 case SIOCGIFBRDADDR:
4061 case SIOCSIFBRDADDR:
4062 case SIOCGIFNETMASK:
4063 case SIOCSIFNETMASK:
4064 case SIOCGIFDSTADDR:
4065 case SIOCSIFDSTADDR:
4066 case SIOCSIFFLAGS:
40d4e3df 4067 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
4068#endif
4069
40d4e3df
ED
4070 default:
4071 return -ENOIOCTLCMD;
1da177e4
LT
4072 }
4073 return 0;
4074}
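
Note that SIOCINQ on a packet socket reports the length of the next queued frame rather than the whole backlog, while SIOCOUTQ reports bytes still accounted to the write queue. A small illustrative sketch of both queries:

/* Illustrative sketch: query pending byte counts on a packet socket. */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>

static void show_queues(int fd)
{
        int inq = 0, outq = 0;

        if (ioctl(fd, SIOCINQ, &inq) == 0)      /* length of the next queued frame */
                printf("next frame: %d bytes\n", inq);
        if (ioctl(fd, SIOCOUTQ, &outq) == 0)    /* unsent bytes in the write queue */
                printf("pending tx: %d bytes\n", outq);
}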
4075
40d4e3df 4076static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
4077 poll_table *wait)
4078{
4079 struct sock *sk = sock->sk;
4080 struct packet_sock *po = pkt_sk(sk);
4081 unsigned int mask = datagram_poll(file, sock, wait);
4082
4083 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 4084 if (po->rx_ring.pg_vec) {
f6fb8f10 4085 if (!packet_previous_rx_frame(po, &po->rx_ring,
4086 TP_STATUS_KERNEL))
1da177e4
LT
4087 mask |= POLLIN | POLLRDNORM;
4088 }
2ccdbaa6 4089 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
54d7c01d 4090 po->pressure = 0;
1da177e4 4091 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
4092 spin_lock_bh(&sk->sk_write_queue.lock);
4093 if (po->tx_ring.pg_vec) {
4094 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
4095 mask |= POLLOUT | POLLWRNORM;
4096 }
4097 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
4098 return mask;
4099}
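
packet_poll() above layers ring awareness on top of datagram_poll(): POLLIN is raised once the previously written RX frame has been handed to user space, and POLLOUT once a TX ring frame is marked TP_STATUS_AVAILABLE. An mmap-based consumer therefore typically blocks in poll() between ring sweeps; a minimal sketch (wait_for_frame is an illustrative name):

/* Illustrative sketch: block until the RX ring has a frame ready. */
#include <poll.h>

static int wait_for_frame(int fd)
{
        struct pollfd pfd = {
                .fd     = fd,
                .events = POLLIN | POLLRDNORM,
        };

        return poll(&pfd, 1, -1);       /* > 0 once a frame is readable */
}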
4100
4101
4102/* Dirty? Well, I still have not found a better way to account
4103 * for user mmaps.
4104 */
4105
4106static void packet_mm_open(struct vm_area_struct *vma)
4107{
4108 struct file *file = vma->vm_file;
40d4e3df 4109 struct socket *sock = file->private_data;
1da177e4 4110 struct sock *sk = sock->sk;
1ce4f28b 4111
1da177e4
LT
4112 if (sk)
4113 atomic_inc(&pkt_sk(sk)->mapped);
4114}
4115
4116static void packet_mm_close(struct vm_area_struct *vma)
4117{
4118 struct file *file = vma->vm_file;
40d4e3df 4119 struct socket *sock = file->private_data;
1da177e4 4120 struct sock *sk = sock->sk;
1ce4f28b 4121
1da177e4
LT
4122 if (sk)
4123 atomic_dec(&pkt_sk(sk)->mapped);
4124}
4125
f0f37e2f 4126static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
4127 .open = packet_mm_open,
4128 .close = packet_mm_close,
1da177e4
LT
4129};
4130
0e3125c7
NH
4131static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4132 unsigned int len)
1da177e4
LT
4133{
4134 int i;
4135
4ebf0ae2 4136 for (i = 0; i < len; i++) {
0e3125c7 4137 if (likely(pg_vec[i].buffer)) {
c56b4d90 4138 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
4139 vfree(pg_vec[i].buffer);
4140 else
4141 free_pages((unsigned long)pg_vec[i].buffer,
4142 order);
4143 pg_vec[i].buffer = NULL;
4144 }
1da177e4
LT
4145 }
4146 kfree(pg_vec);
4147}
4148
eea49cc9 4149static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 4150{
f0d4eb29 4151 char *buffer;
0e3125c7
NH
4152 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4153 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4154
4155 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4156 if (buffer)
4157 return buffer;
4158
f0d4eb29 4159 /* __get_free_pages failed, fall back to vmalloc */
bbce5a59 4160 buffer = vzalloc((1 << order) * PAGE_SIZE);
0e3125c7
NH
4161 if (buffer)
4162 return buffer;
4163
f0d4eb29 4164 /* vmalloc failed, let's dig into swap here */
0e3125c7 4165 gfp_flags &= ~__GFP_NORETRY;
f0d4eb29 4166 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4167 if (buffer)
4168 return buffer;
4169
f0d4eb29 4170 /* complete and utter failure */
0e3125c7 4171 return NULL;
4ebf0ae2
DM
4172}
4173
0e3125c7 4174static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
4175{
4176 unsigned int block_nr = req->tp_block_nr;
0e3125c7 4177 struct pgv *pg_vec;
4ebf0ae2
DM
4178 int i;
4179
0e3125c7 4180 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
4181 if (unlikely(!pg_vec))
4182 goto out;
4183
4184 for (i = 0; i < block_nr; i++) {
c56b4d90 4185 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 4186 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
4187 goto out_free_pgvec;
4188 }
4189
4190out:
4191 return pg_vec;
4192
4193out_free_pgvec:
4194 free_pg_vec(pg_vec, order, block_nr);
4195 pg_vec = NULL;
4196 goto out;
4197}
1da177e4 4198
f6fb8f10 4199static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 4200 int closing, int tx_ring)
1da177e4 4201{
0e3125c7 4202 struct pgv *pg_vec = NULL;
1da177e4 4203 struct packet_sock *po = pkt_sk(sk);
0e11c91e 4204 int was_running, order = 0;
69e3c75f
JB
4205 struct packet_ring_buffer *rb;
4206 struct sk_buff_head *rb_queue;
0e11c91e 4207 __be16 num;
f6fb8f10 4208 int err = -EINVAL;
4209 /* Alias added to keep code churn minimal */
4210 struct tpacket_req *req = &req_u->req;
4211
84ac7260 4212 lock_sock(sk);
1ce4f28b 4213
69e3c75f
JB
4214 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4215 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 4216
69e3c75f
JB
4217 err = -EBUSY;
4218 if (!closing) {
4219 if (atomic_read(&po->mapped))
4220 goto out;
b0138408 4221 if (packet_read_pending(rb))
69e3c75f
JB
4222 goto out;
4223 }
1da177e4 4224
69e3c75f
JB
4225 if (req->tp_block_nr) {
4226 /* Sanity tests and some calculations */
4227 err = -EBUSY;
4228 if (unlikely(rb->pg_vec))
4229 goto out;
1da177e4 4230
bbd6ef87
PM
4231 switch (po->tp_version) {
4232 case TPACKET_V1:
4233 po->tp_hdrlen = TPACKET_HDRLEN;
4234 break;
4235 case TPACKET_V2:
4236 po->tp_hdrlen = TPACKET2_HDRLEN;
4237 break;
f6fb8f10 4238 case TPACKET_V3:
4239 po->tp_hdrlen = TPACKET3_HDRLEN;
4240 break;
bbd6ef87
PM
4241 }
4242
69e3c75f 4243 err = -EINVAL;
4ebf0ae2 4244 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 4245 goto out;
90836b67 4246 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
69e3c75f 4247 goto out;
dc808110 4248 if (po->tp_version >= TPACKET_V3 &&
2b6867c2
AK
4249 req->tp_block_size <=
4250 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv))
dc808110 4251 goto out;
8913336a 4252 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
4253 po->tp_reserve))
4254 goto out;
4ebf0ae2 4255 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 4256 goto out;
1da177e4 4257
4194b491
TK
4258 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4259 if (unlikely(rb->frames_per_block == 0))
69e3c75f 4260 goto out;
8f8d28e4
AK
4261 if (unlikely(req->tp_block_size > UINT_MAX / req->tp_block_nr))
4262 goto out;
69e3c75f
JB
4263 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4264 req->tp_frame_nr))
4265 goto out;
1da177e4
LT
4266
4267 err = -ENOMEM;
4ebf0ae2
DM
4268 order = get_order(req->tp_block_size);
4269 pg_vec = alloc_pg_vec(req, order);
4270 if (unlikely(!pg_vec))
1da177e4 4271 goto out;
f6fb8f10 4272 switch (po->tp_version) {
4273 case TPACKET_V3:
7f953ab2
SV
4274 /* Block transmit is not supported yet */
4275 if (!tx_ring) {
e8e85cc5 4276 init_prb_bdqc(po, rb, pg_vec, req_u);
7f953ab2
SV
4277 } else {
4278 struct tpacket_req3 *req3 = &req_u->req3;
4279
4280 if (req3->tp_retire_blk_tov ||
4281 req3->tp_sizeof_priv ||
4282 req3->tp_feature_req_word) {
4283 err = -EINVAL;
4284 goto out;
4285 }
4286 }
d7cf0c34 4287 break;
f6fb8f10 4288 default:
4289 break;
4290 }
69e3c75f
JB
4291 }
4292 /* Done */
4293 else {
4294 err = -EINVAL;
4ebf0ae2 4295 if (unlikely(req->tp_frame_nr))
69e3c75f 4296 goto out;
1da177e4
LT
4297 }
4298
1da177e4
LT
4299
4300 /* Detach socket from network */
4301 spin_lock(&po->bind_lock);
4302 was_running = po->running;
4303 num = po->num;
4304 if (was_running) {
1da177e4 4305 po->num = 0;
ce06b03e 4306 __unregister_prot_hook(sk, false);
1da177e4
LT
4307 }
4308 spin_unlock(&po->bind_lock);
1ce4f28b 4309
1da177e4
LT
4310 synchronize_net();
4311
4312 err = -EBUSY;
905db440 4313 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
4314 if (closing || atomic_read(&po->mapped) == 0) {
4315 err = 0;
69e3c75f 4316 spin_lock_bh(&rb_queue->lock);
c053fd96 4317 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
4318 rb->frame_max = (req->tp_frame_nr - 1);
4319 rb->head = 0;
4320 rb->frame_size = req->tp_frame_size;
4321 spin_unlock_bh(&rb_queue->lock);
4322
c053fd96
CG
4323 swap(rb->pg_vec_order, order);
4324 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
4325
4326 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4327 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4328 tpacket_rcv : packet_rcv;
4329 skb_queue_purge(rb_queue);
1da177e4 4330 if (atomic_read(&po->mapped))
40d4e3df
ED
4331 pr_err("packet_mmap: vma is busy: %d\n",
4332 atomic_read(&po->mapped));
1da177e4 4333 }
905db440 4334 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4335
4336 spin_lock(&po->bind_lock);
ce06b03e 4337 if (was_running) {
1da177e4 4338 po->num = num;
ce06b03e 4339 register_prot_hook(sk);
1da177e4
LT
4340 }
4341 spin_unlock(&po->bind_lock);
c800aaf8 4342 if (pg_vec && (po->tp_version > TPACKET_V2)) {
f6fb8f10 4343 /* Because we don't support block-based V3 on tx-ring */
4344 if (!tx_ring)
73d0fcf2 4345 prb_shutdown_retire_blk_timer(po, rb_queue);
f6fb8f10 4346 }
1da177e4 4347
1da177e4
LT
4348 if (pg_vec)
4349 free_pg_vec(pg_vec, order, req->tp_block_nr);
4350out:
84ac7260 4351 release_sock(sk);
1da177e4
LT
4352 return err;
4353}
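
Taken together, the sanity checks above require a TPACKET_V1/V2 request to use a page-aligned tp_block_size, a tp_frame_size that is a multiple of TPACKET_ALIGNMENT and at least tp_hdrlen + tp_reserve, and a tp_frame_nr equal to frames-per-block times tp_block_nr. A user-space sketch that satisfies those constraints (geometry values are arbitrary; setup_rx_ring is illustrative):

/* Illustrative sketch: negotiate TPACKET_V2 and request an RX ring whose
 * geometry passes the checks in packet_set_ring(). */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int setup_rx_ring(int fd)
{
        int version = TPACKET_V2;
        struct tpacket_req req;

        if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
                       &version, sizeof(version)) < 0)
                return -1;

        memset(&req, 0, sizeof(req));
        req.tp_block_size = 4096;       /* page aligned */
        req.tp_block_nr   = 64;
        req.tp_frame_size = 2048;       /* multiple of TPACKET_ALIGNMENT */
        req.tp_frame_nr   = (req.tp_block_size / req.tp_frame_size)
                            * req.tp_block_nr;

        return setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
}

PACKET_VERSION has to be set before the ring is created, since the version handler above returns -EBUSY once pg_vec is allocated.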
4354
69e3c75f
JB
4355static int packet_mmap(struct file *file, struct socket *sock,
4356 struct vm_area_struct *vma)
1da177e4
LT
4357{
4358 struct sock *sk = sock->sk;
4359 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
4360 unsigned long size, expected_size;
4361 struct packet_ring_buffer *rb;
1da177e4
LT
4362 unsigned long start;
4363 int err = -EINVAL;
4364 int i;
4365
4366 if (vma->vm_pgoff)
4367 return -EINVAL;
4368
905db440 4369 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
4370
4371 expected_size = 0;
4372 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4373 if (rb->pg_vec) {
4374 expected_size += rb->pg_vec_len
4375 * rb->pg_vec_pages
4376 * PAGE_SIZE;
4377 }
4378 }
4379
4380 if (expected_size == 0)
1da177e4 4381 goto out;
69e3c75f
JB
4382
4383 size = vma->vm_end - vma->vm_start;
4384 if (size != expected_size)
1da177e4
LT
4385 goto out;
4386
1da177e4 4387 start = vma->vm_start;
69e3c75f
JB
4388 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4389 if (rb->pg_vec == NULL)
4390 continue;
4391
4392 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
4393 struct page *page;
4394 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4395 int pg_num;
4396
c56b4d90
CG
4397 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4398 page = pgv_to_page(kaddr);
69e3c75f
JB
4399 err = vm_insert_page(vma, start, page);
4400 if (unlikely(err))
4401 goto out;
4402 start += PAGE_SIZE;
0e3125c7 4403 kaddr += PAGE_SIZE;
69e3c75f 4404 }
4ebf0ae2 4405 }
1da177e4 4406 }
69e3c75f 4407
4ebf0ae2 4408 atomic_inc(&po->mapped);
1da177e4
LT
4409 vma->vm_ops = &packet_mmap_ops;
4410 err = 0;
4411
4412out:
905db440 4413 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4414 return err;
4415}
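
packet_mmap() above accepts only a single mapping at offset zero whose length equals the combined size of whatever RX and TX rings exist, with the RX ring mapped first. A sketch of mapping the ring configured earlier and consuming its first frame (TPACKET_V2 layout assumed; read_one_frame is illustrative, and a production consumer would also insert the appropriate memory barriers and walk every frame rather than just frame 0):

/* Illustrative sketch: map the RX ring and hand frame 0 back to the
 * kernel after inspecting it. */
#include <poll.h>
#include <stdio.h>
#include <sys/mman.h>
#include <linux/if_packet.h>

static void read_one_frame(int fd, size_t ring_size)
{
        void *ring = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
                          MAP_SHARED, fd, 0);
        struct pollfd pfd = { .fd = fd, .events = POLLIN };
        struct tpacket2_hdr *hdr;

        if (ring == MAP_FAILED)
                return;
        hdr = ring;                     /* first frame of the RX ring */

        while (!(hdr->tp_status & TP_STATUS_USER))
                poll(&pfd, 1, -1);      /* wait until the kernel fills frame 0 */

        printf("captured %u bytes\n", hdr->tp_len);
        hdr->tp_status = TP_STATUS_KERNEL;      /* return the frame */
        munmap(ring, ring_size);
}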
1da177e4 4416
90ddc4f0 4417static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4418 .family = PF_PACKET,
4419 .owner = THIS_MODULE,
4420 .release = packet_release,
4421 .bind = packet_bind_spkt,
4422 .connect = sock_no_connect,
4423 .socketpair = sock_no_socketpair,
4424 .accept = sock_no_accept,
4425 .getname = packet_getname_spkt,
4426 .poll = datagram_poll,
4427 .ioctl = packet_ioctl,
4428 .listen = sock_no_listen,
4429 .shutdown = sock_no_shutdown,
4430 .setsockopt = sock_no_setsockopt,
4431 .getsockopt = sock_no_getsockopt,
4432 .sendmsg = packet_sendmsg_spkt,
4433 .recvmsg = packet_recvmsg,
4434 .mmap = sock_no_mmap,
4435 .sendpage = sock_no_sendpage,
4436};
1da177e4 4437
90ddc4f0 4438static const struct proto_ops packet_ops = {
1da177e4
LT
4439 .family = PF_PACKET,
4440 .owner = THIS_MODULE,
4441 .release = packet_release,
4442 .bind = packet_bind,
4443 .connect = sock_no_connect,
4444 .socketpair = sock_no_socketpair,
4445 .accept = sock_no_accept,
1ce4f28b 4446 .getname = packet_getname,
1da177e4
LT
4447 .poll = packet_poll,
4448 .ioctl = packet_ioctl,
4449 .listen = sock_no_listen,
4450 .shutdown = sock_no_shutdown,
4451 .setsockopt = packet_setsockopt,
4452 .getsockopt = packet_getsockopt,
719c44d3
WB
4453#ifdef CONFIG_COMPAT
4454 .compat_setsockopt = compat_packet_setsockopt,
4455#endif
1da177e4
LT
4456 .sendmsg = packet_sendmsg,
4457 .recvmsg = packet_recvmsg,
4458 .mmap = packet_mmap,
4459 .sendpage = sock_no_sendpage,
4460};
4461
ec1b4cf7 4462static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4463 .family = PF_PACKET,
4464 .create = packet_create,
4465 .owner = THIS_MODULE,
4466};
4467
4468static struct notifier_block packet_netdev_notifier = {
40d4e3df 4469 .notifier_call = packet_notifier,
1da177e4
LT
4470};
4471
4472#ifdef CONFIG_PROC_FS
1da177e4
LT
4473
4474static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4475 __acquires(RCU)
1da177e4 4476{
e372c414 4477 struct net *net = seq_file_net(seq);
808f5114 4478
4479 rcu_read_lock();
4480 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4481}
4482
4483static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4484{
1bf40954 4485 struct net *net = seq_file_net(seq);
808f5114 4486 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4487}
4488
4489static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4490 __releases(RCU)
1da177e4 4491{
808f5114 4492 rcu_read_unlock();
1da177e4
LT
4493}
4494
1ce4f28b 4495static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4496{
4497 if (v == SEQ_START_TOKEN)
4498 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4499 else {
b7ceabd9 4500 struct sock *s = sk_entry(v);
1da177e4
LT
4501 const struct packet_sock *po = pkt_sk(s);
4502
4503 seq_printf(seq,
71338aa7 4504 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4 4505 s,
41c6d650 4506 refcount_read(&s->sk_refcnt),
1da177e4
LT
4507 s->sk_type,
4508 ntohs(po->num),
4509 po->ifindex,
4510 po->running,
4511 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4512 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4513 sock_i_ino(s));
1da177e4
LT
4514 }
4515
4516 return 0;
4517}
4518
56b3d975 4519static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4520 .start = packet_seq_start,
4521 .next = packet_seq_next,
4522 .stop = packet_seq_stop,
4523 .show = packet_seq_show,
4524};
4525
4526static int packet_seq_open(struct inode *inode, struct file *file)
4527{
e372c414
DL
4528 return seq_open_net(inode, file, &packet_seq_ops,
4529 sizeof(struct seq_net_private));
1da177e4
LT
4530}
4531
da7071d7 4532static const struct file_operations packet_seq_fops = {
1da177e4
LT
4533 .owner = THIS_MODULE,
4534 .open = packet_seq_open,
4535 .read = seq_read,
4536 .llseek = seq_lseek,
e372c414 4537 .release = seq_release_net,
1da177e4
LT
4538};
4539
4540#endif
4541
2c8c1e72 4542static int __net_init packet_net_init(struct net *net)
d12d01d6 4543{
0fa7fa98 4544 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4545 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4546
d4beaa66 4547 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
d12d01d6
DL
4548 return -ENOMEM;
4549
4550 return 0;
4551}
4552
2c8c1e72 4553static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4554{
ece31ffd 4555 remove_proc_entry("packet", net->proc_net);
669f8f1a 4556 WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
d12d01d6
DL
4557}
4558
4559static struct pernet_operations packet_net_ops = {
4560 .init = packet_net_init,
4561 .exit = packet_net_exit,
4562};
4563
4564
1da177e4
LT
4565static void __exit packet_exit(void)
4566{
1da177e4 4567 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4568 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4569 sock_unregister(PF_PACKET);
4570 proto_unregister(&packet_proto);
4571}
4572
4573static int __init packet_init(void)
4574{
4575 int rc = proto_register(&packet_proto, 0);
4576
4577 if (rc != 0)
4578 goto out;
4579
4580 sock_register(&packet_family_ops);
d12d01d6 4581 register_pernet_subsys(&packet_net_ops);
1da177e4 4582 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
4583out:
4584 return rc;
4585}
4586
4587module_init(packet_init);
4588module_exit(packet_exit);
4589MODULE_LICENSE("GPL");
4590MODULE_ALIAS_NETPROTO(PF_PACKET);