/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
#include <net/compat.h>

#include "internal.h"

/*
   Assumptions:
   - if the device has no dev->hard_header routine, it adds and removes the
     ll header inside itself. In this case the ll header is invisible outside
     of the device, but higher levels should still reserve
     dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit in the reserved space (tunnel); others are silly (PPP).
   - a packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It very likely points to the ll header.
		 PPP does this, which is wrong because it introduces
		 asymmetry between rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Resume
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */

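/* Editor's illustration of the SOCK_RAW vs SOCK_DGRAM distinction described
 * above (a sketch, not part of the original file): a SOCK_RAW packet socket
 * hands the application the frame with the ll header pushed back, while
 * SOCK_DGRAM strips it and reports it via sockaddr_ll instead:
 *
 *	int raw = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
 *	int dgr = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 *	// recvfrom(raw, ...) -> buffer starts at the Ethernet header
 *	// recvfrom(dgr, ...) -> buffer starts at the network header;
 *	//                       sll_addr/sll_halen carry the ll address
 *
 * Both sockets require CAP_NET_RAW; the variable names here are assumptions
 * made only for this sketch.
 */
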
/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

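/* Editor's illustrative arithmetic (assumed numbers, not part of the original
 * file): with V3_ALIGNMENT == 8 and, say, sizeof(struct tpacket_block_desc)
 * == 48, BLK_HDR_LEN is ALIGN(48, 8) == 48, and a user-requested
 * tp_sizeof_priv of 13 bytes gives
 *	BLK_PLUS_PRIV(13) == 48 + ALIGN(13, 8) == 48 + 16 == 64
 * bytes reserved at the start of every block before packet data begins.
 * The struct size is only an example value; the 8-byte rounding behaviour
 * is the point.
 */
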
#define PGV_FROM_VMALLOC 1

#define BLOCK_STATUS(x)	((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)	((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);

struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};

#define vio_le() virtio_legacy_is_little_endian()

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

static int packet_direct_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct sk_buff *orig_skb = skb;
	struct netdev_queue *txq;
	int ret = NETDEV_TX_BUSY;

	if (unlikely(!netif_running(dev) ||
		     !netif_carrier_ok(dev)))
		goto drop;

	skb = validate_xmit_skb_list(skb, dev);
	if (skb != orig_skb)
		goto drop;

	txq = skb_get_tx_queue(dev, skb);

	local_bh_disable();

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_xmit_frozen_or_drv_stopped(txq))
		ret = netdev_start_xmit(skb, dev, txq, false);
	HARD_TX_UNLOCK(dev, txq);

	local_bh_enable();

	if (!dev_xmit_complete(ret))
		kfree_skb(skb);

	return ret;
drop:
	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return NET_XMIT_DROP;
}

static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}

static bool packet_use_direct_xmit(const struct packet_sock *po)
{
	return po->xmit == packet_direct_xmit;
}

static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
}

static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	u16 queue_index;

	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL,
						    __packet_pick_tx_queue);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = __packet_pick_tx_queue(dev, skb);
	}

	skb_set_queue_mapping(skb, queue_index);
}

/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held. If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook. If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}

static inline struct page * __pure pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}

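/* Editor's illustration (a sketch, not part of the original file): the
 * tp_status word written/read above is the ownership handshake for
 * TPACKET_V1/V2 ring frames. A user-space reader of the mmap()ed ring
 * might poll it roughly like:
 *
 *	struct tpacket2_hdr *hdr = frame;	// frame points into the ring
 *
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);		// wait for the kernel to hand it over
 *	consume(hdr);				// hypothetical user helper
 *	hdr->tp_status = TP_STATUS_KERNEL;	// give the frame back
 *	__sync_synchronize();			// pairs with the kernel's smp_wmb()/smp_rmb()
 *
 * consume() and pfd are assumed names; the barrier pairing with
 * __packet_set_status()/__packet_get_status() above is the point here.
 */
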
static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}

static void *packet_lookup_frame(struct packet_sock *po,
				 struct packet_ring_buffer *rb,
				 unsigned int position,
				 int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

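/* Editor's worked example (illustrative values, not from the original file):
 * with tp_block_size = 4096 and tp_frame_size = 2048 the ring has
 * frames_per_block = 2. Looking up frame position 5 then gives
 * pg_vec_pos = 5 / 2 = 2 and frame_offset = 5 % 2 = 1, i.e. the frame
 * starts 1 * 2048 bytes into the third pg_vec block.
 */
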
static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_init_blk_timer(struct packet_sock *po,
		struct tpacket_kbdq_core *pkc,
		void (*func) (unsigned long))
{
	init_timer(&pkc->retire_blk_timer);
	pkc->retire_blk_timer.data = (long)po;
	pkc->retire_blk_timer.function = func;
	pkc->retire_blk_timer.expires = jiffies;
}

static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}

static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_link_ksettings ecmd;
	int err;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_link_ksettings(dev, &ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (ecmd.base.speed < SPEED_1000 ||
		    ecmd.base.speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = ecmd.base.speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}

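/* Editor's worked example (assumed values, not part of the original file):
 * for a 1 MiB block on a 1 Gbps link, msec = 1 and div = 1000 / 1000 = 1,
 * mbits = (1048576 * 8) / (1024 * 1024) = 8, so tmo = 8 ms and the function
 * returns 9 ms. On a 10 Gbps link div = 10, mbits becomes 0 and the returned
 * timeout collapses to 1 ms.
 */
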
static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po);
	prb_open_block(p1, pbd);
}

/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
static void prb_retire_rx_blk_timer_expired(unsigned long data)
{
	struct packet_sock *po = (struct packet_sock *)data;
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 *
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			if (!BLOCK_NUM_PKTS(pbd)) {
				/* An empty block. Just refresh the timer. */
				goto refresh_timer;
			}
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 *	   lagging behind.
			 */
			if (prb_curr_blk_in_use(pkc, pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. Queue was frozen, user-space caught
				 * up, now the link went idle && the timer
				 * fired. We don't have a block to close, so we
				 * open this block and restart the timer.
				 * Opening a block thaws the queue and restarts
				 * the timer; thawing/timer-refresh is a side
				 * effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}

static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header (we know the header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}

/*
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note: We DON'T refresh the timer on purpose,
 *	 because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
	struct sock *sk = &po->sk;

	if (po->stats.stats3.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time.
		 *
		 * It shouldn't really happen as we don't close empty
		 * blocks. See prb_retire_rx_blk_timer_expired().
		 */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	getnstimeofday(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}

/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7'; since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
				  struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, the caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}

static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}

static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
				      struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (skb_vlan_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->hv1.tp_vlan_tpid = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_padding = 0;
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}

/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status,
					    unsigned int len)
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if that last block which caused the queue to freeze
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pkc, pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * Opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available. user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}

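/* Editor's sketch of the user-space side this block-fill path feeds
 * (illustrative only, not part of the original file): a TPACKET_V3 reader
 * walks the block it was handed and then returns it, which is what
 * un-freezes a frozen queue:
 *
 *	char *ring = mmap(...);			// assumed: blocks mapped contiguously
 *	struct tpacket_block_desc *pbd =
 *		(struct tpacket_block_desc *)(ring + blk_num * req.tp_block_size);
 *
 *	while (!(pbd->hdr.bh1.block_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);
 *	walk_block(pbd);			// hypothetical helper: follows
 *						// tp_next_offset over num_pkts frames
 *	pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;	// hand the block back
 *	blk_num = (blk_num + 1) % req.tp_block_nr;
 *
 * ring, req, pfd, blk_num and walk_block() are assumed names; the
 * TP_STATUS_USER/TP_STATUS_KERNEL handover matches prb_close_block() and
 * prb_dispatch_next_block() above.
 */
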
static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, status, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return NULL;
	}
}

static void *prb_lookup_block(struct packet_sock *po,
				     struct packet_ring_buffer *rb,
				     unsigned int idx,
				     int status)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;
	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}

/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}

static void packet_increment_rx_head(struct packet_sock *po,
					    struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

static void packet_inc_pending(struct packet_ring_buffer *rb)
{
	this_cpu_inc(*rb->pending_refcnt);
}

static void packet_dec_pending(struct packet_ring_buffer *rb)
{
	this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
	unsigned int refcnt = 0;
	int cpu;

	/* We don't use pending refcount in rx_ring. */
	if (rb->pending_refcnt == NULL)
		return 0;

	for_each_possible_cpu(cpu)
		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

	return refcnt;
}

static int packet_alloc_pending(struct packet_sock *po)
{
	po->rx_ring.pending_refcnt = NULL;

	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
	if (unlikely(po->tx_ring.pending_refcnt == NULL))
		return -ENOBUFS;

	return 0;
}

static void packet_free_pending(struct packet_sock *po)
{
	free_percpu(po->tx_ring.pending_refcnt);
}

#define ROOM_POW_OFF	2
#define ROOM_NONE	0x0
#define ROOM_LOW	0x1
#define ROOM_NORMAL	0x2

static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.frame_max + 1;
	idx = po->rx_ring.head;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.prb_bdqc.knum_blocks;
	idx = po->rx_ring.prb_bdqc.kactive_blk_num;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	struct sock *sk = &po->sk;
	int ret = ROOM_NONE;

	if (po->prot_hook.func != tpacket_rcv) {
		int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
					  - (skb ? skb->truesize : 0);
		if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
			return ROOM_NORMAL;
		else if (avail > 0)
			return ROOM_LOW;
		else
			return ROOM_NONE;
	}

	if (po->tp_version == TPACKET_V3) {
		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_v3_has_room(po, 0))
			ret = ROOM_LOW;
	} else {
		if (__tpacket_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_has_room(po, 0))
			ret = ROOM_LOW;
	}

	return ret;
}

static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	int ret;
	bool has_room;

	spin_lock_bh(&po->sk.sk_receive_queue.lock);
	ret = __packet_rcv_has_room(po, skb);
	has_room = ret == ROOM_NORMAL;
	if (po->pressure == has_room)
		po->pressure = !has_room;
	spin_unlock_bh(&po->sk.sk_receive_queue.lock);

	return ret;
}

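/* Editor's illustration (assumed numbers, not part of the original file):
 * for a non-mmap()ed socket with sk_rcvbuf = 212992 bytes, ROOM_POW_OFF = 2
 * makes the "normal" threshold 212992 >> 2 = 53248 bytes of free space.
 * For a ring socket the same shift is applied to the ring length instead:
 * with 8 frames, __tpacket_has_room(po, ROOM_POW_OFF) probes the frame at
 * head + (8 >> 2) = head + 2, i.e. ROOM_NORMAL is reported only if at least
 * a quarter of the ring ahead of head is still owned by the kernel.
 */
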
static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}

static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
{
	u32 rxhash;
	int i, count = 0;

	rxhash = skb_get_hash(skb);
	for (i = 0; i < ROLLOVER_HLEN; i++)
		if (po->rollover->history[i] == rxhash)
			count++;

	po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
	return count > (ROLLOVER_HLEN >> 1);
}

static unsigned int fanout_demux_hash(struct packet_fanout *f,
				      struct sk_buff *skb,
				      unsigned int num)
{
	return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
}

static unsigned int fanout_demux_lb(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	unsigned int val = atomic_inc_return(&f->rr_cur);

	return val % num;
}

static unsigned int fanout_demux_cpu(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return smp_processor_id() % num;
}

static unsigned int fanout_demux_rnd(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return prandom_u32_max(num);
}

static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, bool try_self,
					  unsigned int num)
{
	struct packet_sock *po, *po_next, *po_skip = NULL;
	unsigned int i, j, room = ROOM_NONE;

	po = pkt_sk(f->arr[idx]);

	if (try_self) {
		room = packet_rcv_has_room(po, skb);
		if (room == ROOM_NORMAL ||
		    (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
			return idx;
		po_skip = po;
	}

	i = j = min_t(int, po->rollover->sock, num - 1);
	do {
		po_next = pkt_sk(f->arr[i]);
		if (po_next != po_skip && !po_next->pressure &&
		    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
			if (i != j)
				po->rollover->sock = i;
			atomic_long_inc(&po->rollover->num);
			if (room == ROOM_LOW)
				atomic_long_inc(&po->rollover->num_huge);
			return i;
		}

		if (++i == num)
			i = 0;
	} while (i != j);

	atomic_long_inc(&po->rollover->num_failed);
	return idx;
}

static unsigned int fanout_demux_qm(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	return skb_get_queue_mapping(skb) % num;
}

static unsigned int fanout_demux_bpf(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	struct bpf_prog *prog;
	unsigned int ret = 0;

	rcu_read_lock();
	prog = rcu_dereference(f->bpf_prog);
	if (prog)
		ret = bpf_prog_run_clear_cb(prog, skb) % num;
	rcu_read_unlock();

	return ret;
}

static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
	return f->flags & (flag >> 8);
}

static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = READ_ONCE(f->num_members);
	struct net *net = read_pnet(&f->net);
	struct packet_sock *po;
	unsigned int idx;

	if (!net_eq(dev_net(dev), net) || !num) {
		kfree_skb(skb);
		return 0;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
		skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
		if (!skb)
			return 0;
	}
	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		idx = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		idx = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		idx = fanout_demux_cpu(f, skb, num);
		break;
	case PACKET_FANOUT_RND:
		idx = fanout_demux_rnd(f, skb, num);
		break;
	case PACKET_FANOUT_QM:
		idx = fanout_demux_qm(f, skb, num);
		break;
	case PACKET_FANOUT_ROLLOVER:
		idx = fanout_demux_rollover(f, skb, 0, false, num);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		idx = fanout_demux_bpf(f, skb, num);
		break;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
		idx = fanout_demux_rollover(f, skb, idx, true, num);

	po = pkt_sk(f->arr[idx]);
	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

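/* Editor's sketch of how a fanout group is set up from user space
 * (illustrative only, not part of the original file): each member socket
 * joins the same group id, and packet_rcv_fanout() then spreads traffic
 * across the members according to the chosen mode:
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	// bind(fd, ...) to an interface first, so the prot hook is running
 *	int arg = fanout_group_id | (PACKET_FANOUT_HASH << 16);
 *	if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg)) < 0)
 *		perror("PACKET_FANOUT");
 *
 * fanout_group_id is an assumed variable (any 16-bit id shared by the
 * group); the low 16 bits of arg carry the id and the high 16 bits the
 * type/flags word handled by fanout_add() below.
 */
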
DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);

static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	if (f->num_members == 1)
		dev_add_pack(&f->prot_hook);
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	if (f->num_members == 0)
		__dev_remove_pack(&f->prot_hook);
	spin_unlock(&f->lock);
}

static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
	if (sk->sk_family != PF_PACKET)
		return false;

	return ptype->af_packet_priv == pkt_sk(sk)->fanout;
}

static void fanout_init_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_LB:
		atomic_set(&f->rr_cur, 0);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		RCU_INIT_POINTER(f->bpf_prog, NULL);
		break;
	}
}

static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
{
	struct bpf_prog *old;

	spin_lock(&f->lock);
	old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
	rcu_assign_pointer(f->bpf_prog, new);
	spin_unlock(&f->lock);

	if (old) {
		synchronize_net();
		bpf_prog_destroy(old);
	}
}

static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
				unsigned int len)
{
	struct bpf_prog *new;
	struct sock_fprog fprog;
	int ret;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;
	if (len != sizeof(fprog))
		return -EINVAL;
	if (copy_from_user(&fprog, data, len))
		return -EFAULT;

	ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
	if (ret)
		return ret;

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
				unsigned int len)
{
	struct bpf_prog *new;
	u32 fd;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;
	if (len != sizeof(fd))
		return -EINVAL;
	if (copy_from_user(&fd, data, len))
		return -EFAULT;

	new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
	if (IS_ERR(new))
		return PTR_ERR(new);

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data(struct packet_sock *po, char __user *data,
			   unsigned int len)
{
	switch (po->fanout->type) {
	case PACKET_FANOUT_CBPF:
		return fanout_set_data_cbpf(po, data, len);
	case PACKET_FANOUT_EBPF:
		return fanout_set_data_ebpf(po, data, len);
	default:
		return -EINVAL;
	};
}

static void fanout_release_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		__fanout_set_data_bpf(f, NULL);
	};
}

static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
	struct packet_rollover *rollover = NULL;
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 flags = type_flags >> 8;
	int err;

	switch (type) {
	case PACKET_FANOUT_ROLLOVER:
		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
			return -EINVAL;
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
	case PACKET_FANOUT_RND:
	case PACKET_FANOUT_QM:
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		break;
	default:
		return -EINVAL;
	}

	mutex_lock(&fanout_mutex);

	err = -EINVAL;
	if (!po->running)
		goto out;

	err = -EALREADY;
	if (po->fanout)
		goto out;

	if (type == PACKET_FANOUT_ROLLOVER ||
	    (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
		err = -ENOMEM;
		rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
		if (!rollover)
			goto out;
		atomic_long_set(&rollover->num, 0);
		atomic_long_set(&rollover->num_huge, 0);
		atomic_long_set(&rollover->num_failed, 0);
		po->rollover = rollover;
	}

	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match && match->flags != flags)
		goto out;
	if (!match) {
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		match->flags = flags;
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		atomic_set(&match->sk_ref, 0);
		fanout_init_data(match);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		match->prot_hook.id_match = match_fanout_group;
		list_add(&match->list, &fanout_list);
	}
	err = -EINVAL;
	if (match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
			__dev_remove_pack(&po->prot_hook);
			po->fanout = match;
			atomic_inc(&match->sk_ref);
			__fanout_link(sk, po);
			err = 0;
		}
	}
out:
	if (err && rollover) {
		kfree(rollover);
		po->rollover = NULL;
	}
	mutex_unlock(&fanout_mutex);
	return err;
}

/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
 * It is the responsibility of the caller to call fanout_release_data() and
 * free the returned packet_fanout (after synchronize_net())
 */
static struct packet_fanout *fanout_release(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f;

	mutex_lock(&fanout_mutex);
	f = po->fanout;
	if (f) {
		po->fanout = NULL;

		if (atomic_dec_and_test(&f->sk_ref))
			list_del(&f->list);
		else
			f = NULL;

		if (po->rollover)
			kfree_rcu(po->rollover, rcu);
	}
	mutex_unlock(&fanout_mutex);

	return f;
}
1da177e4 1751
3c70c132
DB
1752static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1753 struct sk_buff *skb)
1754{
1755 /* Earlier code assumed this would be a VLAN pkt, double-check
1756 * this now that we have the actual packet in hand. We can only
1757 * do this check on Ethernet devices.
1758 */
1759 if (unlikely(dev->type != ARPHRD_ETHER))
1760 return false;
1761
1762 skb_reset_mac_header(skb);
1763 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1764}
1765
90ddc4f0 1766static const struct proto_ops packet_ops;
1da177e4 1767
90ddc4f0 1768static const struct proto_ops packet_ops_spkt;
1da177e4 1769
40d4e3df
ED
1770static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1771 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1772{
1773 struct sock *sk;
1774 struct sockaddr_pkt *spkt;
1775
1776 /*
1777 * When we registered the protocol we saved the socket in the data
1778 * field for just this event.
1779 */
1780
1781 sk = pt->af_packet_priv;
1ce4f28b 1782
1da177e4
LT
1783 /*
1784 * Yank back the headers [hope the device set this
1785 * right or kerboom...]
1786 *
1787 * Incoming packets have ll header pulled,
1788 * push it back.
1789 *
98e399f8 1790 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1791 * so that this procedure is noop.
1792 */
1793
1794 if (skb->pkt_type == PACKET_LOOPBACK)
1795 goto out;
1796
09ad9bc7 1797 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1798 goto out;
1799
40d4e3df
ED
1800 skb = skb_share_check(skb, GFP_ATOMIC);
1801 if (skb == NULL)
1da177e4
LT
1802 goto oom;
1803
1804 /* drop any routing info */
adf30907 1805 skb_dst_drop(skb);
1da177e4 1806
84531c24
PO
1807 /* drop conntrack reference */
1808 nf_reset(skb);
1809
ffbc6111 1810 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1811
98e399f8 1812 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1813
1814 /*
1815 * The SOCK_PACKET socket receives _all_ frames.
1816 */
1817
1818 spkt->spkt_family = dev->type;
1819 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1820 spkt->spkt_protocol = skb->protocol;
1821
1822 /*
1823 * Charge the memory to the socket. This is done specifically
 1824	 * to prevent sockets from using up all the memory.
1825 */
1826
40d4e3df 1827 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1828 return 0;
1829
1830out:
1831 kfree_skb(skb);
1832oom:
1833 return 0;
1834}
1835
1836
1837/*
1838 * Output a raw packet to a device layer. This bypasses all the other
1839 * protocol layers and you must therefore supply it with a complete frame
1840 */
1ce4f28b 1841
1b784140
YX
1842static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1843 size_t len)
1da177e4
LT
1844{
1845 struct sock *sk = sock->sk;
342dfc30 1846 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1847 struct sk_buff *skb = NULL;
1da177e4 1848 struct net_device *dev;
c14ac945 1849 struct sockcm_cookie sockc;
40d4e3df 1850 __be16 proto = 0;
1da177e4 1851 int err;
3bdc0eba 1852 int extra_len = 0;
1ce4f28b 1853
1da177e4 1854 /*
1ce4f28b 1855 * Get and verify the address.
1da177e4
LT
1856 */
1857
40d4e3df 1858 if (saddr) {
1da177e4 1859 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1860 return -EINVAL;
1861 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1862 proto = saddr->spkt_protocol;
1863 } else
1864 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1865
1866 /*
1ce4f28b 1867 * Find the device first to size check it
1da177e4
LT
1868 */
1869
de74e92a 1870 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1871retry:
654d1f8a
ED
1872 rcu_read_lock();
1873 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1874 err = -ENODEV;
1875 if (dev == NULL)
1876 goto out_unlock;
1ce4f28b 1877
d5e76b0a
DM
1878 err = -ENETDOWN;
1879 if (!(dev->flags & IFF_UP))
1880 goto out_unlock;
1881
1da177e4 1882 /*
40d4e3df
ED
1883 * You may not queue a frame bigger than the mtu. This is the lowest level
1884 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1885 */
1ce4f28b 1886
3bdc0eba
BG
1887 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1888 if (!netif_supports_nofcs(dev)) {
1889 err = -EPROTONOSUPPORT;
1890 goto out_unlock;
1891 }
1892 extra_len = 4; /* We're doing our own CRC */
1893 }
1894
1da177e4 1895 err = -EMSGSIZE;
3bdc0eba 1896 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1897 goto out_unlock;
1898
1a35ca80
ED
1899 if (!skb) {
1900 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1901 int tlen = dev->needed_tailroom;
1a35ca80
ED
1902 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1903
1904 rcu_read_unlock();
4ce40912 1905 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1906 if (skb == NULL)
1907 return -ENOBUFS;
1908 /* FIXME: Save some space for broken drivers that write a hard
1909 * header at transmission time by themselves. PPP is the notable
1910 * one here. This should really be fixed at the driver level.
1911 */
1912 skb_reserve(skb, reserved);
1913 skb_reset_network_header(skb);
1914
1915 /* Try to align data part correctly */
1916 if (hhlen) {
1917 skb->data -= hhlen;
1918 skb->tail -= hhlen;
1919 if (len < hhlen)
1920 skb_reset_network_header(skb);
1921 }
6ce8e9ce 1922 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1923 if (err)
1924 goto out_free;
1925 goto retry;
1da177e4
LT
1926 }
1927
9ed988cd
WB
1928 if (!dev_validate_header(dev, skb->data, len)) {
1929 err = -EINVAL;
1930 goto out_unlock;
1931 }
3c70c132
DB
1932 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1933 !packet_extra_vlan_len_allowed(dev, skb)) {
1934 err = -EMSGSIZE;
1935 goto out_unlock;
57f89bfa 1936 }
1a35ca80 1937
edbe7746 1938 sockc.tsflags = sk->sk_tsflags;
c14ac945
SHY
1939 if (msg->msg_controllen) {
1940 err = sock_cmsg_send(sk, msg, &sockc);
f8e7718c 1941 if (unlikely(err))
c14ac945 1942 goto out_unlock;
c14ac945
SHY
1943 }
1944
1da177e4
LT
1945 skb->protocol = proto;
1946 skb->dev = dev;
1947 skb->priority = sk->sk_priority;
2d37a186 1948 skb->mark = sk->sk_mark;
bf84a010 1949
c14ac945 1950 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
1da177e4 1951
3bdc0eba
BG
1952 if (unlikely(extra_len == 4))
1953 skb->no_fcs = 1;
1954
40893fd0 1955 skb_probe_transport_header(skb, 0);
c1aad275 1956
1da177e4 1957 dev_queue_xmit(skb);
654d1f8a 1958 rcu_read_unlock();
40d4e3df 1959 return len;
1da177e4 1960
1da177e4 1961out_unlock:
654d1f8a 1962 rcu_read_unlock();
1a35ca80
ED
1963out_free:
1964 kfree_skb(skb);
1da177e4
LT
1965 return err;
1966}
1da177e4 1967
ff936a04
AS
1968static unsigned int run_filter(struct sk_buff *skb,
1969 const struct sock *sk,
1970 unsigned int res)
1da177e4
LT
1971{
1972 struct sk_filter *filter;
fda9ef5d 1973
80f8f102
ED
1974 rcu_read_lock();
1975 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1976 if (filter != NULL)
ff936a04 1977 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 1978 rcu_read_unlock();
1da177e4 1979
dbcb5855 1980 return res;
1da177e4
LT
1981}
1982
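run_filter() above applies whatever classic BPF program the socket owner attached with SO_ATTACH_FILTER; packets for which the filter returns 0 are dropped before being queued or copied into a ring. A hedged userspace sketch follows; the four-instruction ethertype match is a standard tcpdump-style filter chosen for illustration, not something taken from this file.

/* Hedged sketch: attach a classic BPF filter that accepts only IPv4 frames. */
#include <linux/filter.h>
#include <linux/if_ether.h>
#include <sys/socket.h>

int attach_ipv4_filter(int fd)
{
	/* Load the 16-bit ethertype at offset 12; return the whole packet
	 * if it equals ETH_P_IP, otherwise return 0 (drop).
	 */
	struct sock_filter code[] = {
		BPF_STMT(BPF_LD  | BPF_H   | BPF_ABS, 12),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ETH_P_IP, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
		BPF_STMT(BPF_RET | BPF_K, 0),
	};
	struct sock_fprog prog = {
		.len    = sizeof(code) / sizeof(code[0]),
		.filter = code,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
			  &prog, sizeof(prog));
}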
16cc1400
WB
1983static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
1984 size_t *len)
1985{
1986 struct virtio_net_hdr vnet_hdr;
1987
1988 if (*len < sizeof(vnet_hdr))
1989 return -EINVAL;
1990 *len -= sizeof(vnet_hdr);
1991
6391a448 1992 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true))
16cc1400
WB
1993 return -EINVAL;
1994
1995 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
1996}
1997
1da177e4 1998/*
62ab0812
ED
 1999	 * This function does lazy skb cloning in the hope that most packets
 2000	 * are discarded by BPF.
 2001	 *
 2002	 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
 2003	 * and skb->cb are mangled. It works because (and until) packets
 2004	 * falling here are owned by the current CPU. Output packets are cloned
 2005	 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 2006	 * sequentially, so if we return the skb to its original state on exit,
 2007	 * we will not harm anyone.
1da177e4
LT
2008 */
2009
40d4e3df
ED
2010static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2011 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2012{
2013 struct sock *sk;
2014 struct sockaddr_ll *sll;
2015 struct packet_sock *po;
40d4e3df 2016 u8 *skb_head = skb->data;
1da177e4 2017 int skb_len = skb->len;
dbcb5855 2018 unsigned int snaplen, res;
da37845f 2019 bool is_drop_n_account = false;
1da177e4
LT
2020
2021 if (skb->pkt_type == PACKET_LOOPBACK)
2022 goto drop;
2023
2024 sk = pt->af_packet_priv;
2025 po = pkt_sk(sk);
2026
09ad9bc7 2027 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2028 goto drop;
2029
1da177e4
LT
2030 skb->dev = dev;
2031
3b04ddde 2032 if (dev->header_ops) {
1da177e4 2033 /* The device has an explicit notion of ll header,
62ab0812
ED
2034 * exported to higher levels.
2035 *
2036 * Otherwise, the device hides details of its frame
 2037	 * structure, so that the corresponding packet head is
 2038	 * never delivered to the user.
1da177e4
LT
2039 */
2040 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2041 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2042 else if (skb->pkt_type == PACKET_OUTGOING) {
2043 /* Special case: outgoing packets have ll header at head */
bbe735e4 2044 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2045 }
2046 }
2047
2048 snaplen = skb->len;
2049
dbcb5855
DM
2050 res = run_filter(skb, sk, snaplen);
2051 if (!res)
fda9ef5d 2052 goto drop_n_restore;
dbcb5855
DM
2053 if (snaplen > res)
2054 snaplen = res;
1da177e4 2055
0fd7bac6 2056 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2057 goto drop_n_acct;
2058
2059 if (skb_shared(skb)) {
2060 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2061 if (nskb == NULL)
2062 goto drop_n_acct;
2063
2064 if (skb_head != skb->data) {
2065 skb->data = skb_head;
2066 skb->len = skb_len;
2067 }
abc4e4fa 2068 consume_skb(skb);
1da177e4
LT
2069 skb = nskb;
2070 }
2071
b4772ef8 2072 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2073
2074 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2075 sll->sll_hatype = dev->type;
1da177e4 2076 sll->sll_pkttype = skb->pkt_type;
8032b464 2077 if (unlikely(po->origdev))
80feaacb
PWJ
2078 sll->sll_ifindex = orig_dev->ifindex;
2079 else
2080 sll->sll_ifindex = dev->ifindex;
1da177e4 2081
b95cce35 2082 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2083
2472d761
EB
2084 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2085 * Use their space for storing the original skb length.
2086 */
2087 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2088
1da177e4
LT
2089 if (pskb_trim(skb, snaplen))
2090 goto drop_n_acct;
2091
2092 skb_set_owner_r(skb, sk);
2093 skb->dev = NULL;
adf30907 2094 skb_dst_drop(skb);
1da177e4 2095
84531c24
PO
2096 /* drop conntrack reference */
2097 nf_reset(skb);
2098
1da177e4 2099 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2100 po->stats.stats1.tp_packets++;
3bc3b96f 2101 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
2102 __skb_queue_tail(&sk->sk_receive_queue, skb);
2103 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2104 sk->sk_data_ready(sk);
1da177e4
LT
2105 return 0;
2106
2107drop_n_acct:
da37845f 2108 is_drop_n_account = true;
7091fbd8 2109 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2110 po->stats.stats1.tp_drops++;
7091fbd8
WB
2111 atomic_inc(&sk->sk_drops);
2112 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
2113
2114drop_n_restore:
2115 if (skb_head != skb->data && skb_shared(skb)) {
2116 skb->data = skb_head;
2117 skb->len = skb_len;
2118 }
2119drop:
da37845f
WJ
2120 if (!is_drop_n_account)
2121 consume_skb(skb);
2122 else
2123 kfree_skb(skb);
1da177e4
LT
2124 return 0;
2125}
2126
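The metadata that packet_rcv() stores in PACKET_SKB_CB (interface index, hardware type, packet type, link-layer address) is exactly what userspace gets back in the sockaddr_ll filled in by recvfrom(). A minimal, hedged receive loop against an unbound ETH_P_ALL socket might look like the sketch below (it needs CAP_NET_RAW, and error handling is trimmed).

/* Hedged sketch: receive raw frames and print the sockaddr_ll metadata. */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	unsigned char frame[ETH_FRAME_LEN];

	if (fd < 0)
		return 1;

	for (;;) {
		struct sockaddr_ll from;
		socklen_t fromlen = sizeof(from);
		ssize_t len = recvfrom(fd, frame, sizeof(frame), 0,
				       (struct sockaddr *)&from, &fromlen);

		if (len < 0)
			break;
		printf("ifindex=%d hatype=%u pkttype=%u len=%zd\n",
		       from.sll_ifindex, from.sll_hatype,
		       from.sll_pkttype, len);
	}
	return 0;
}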
40d4e3df
ED
2127static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2128 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2129{
2130 struct sock *sk;
2131 struct packet_sock *po;
2132 struct sockaddr_ll *sll;
184f489e 2133 union tpacket_uhdr h;
40d4e3df 2134 u8 *skb_head = skb->data;
1da177e4 2135 int skb_len = skb->len;
dbcb5855 2136 unsigned int snaplen, res;
f6fb8f10 2137 unsigned long status = TP_STATUS_USER;
bbd6ef87 2138 unsigned short macoff, netoff, hdrlen;
1da177e4 2139 struct sk_buff *copy_skb = NULL;
bbd6ef87 2140 struct timespec ts;
b9c32fb2 2141 __u32 ts_status;
da37845f 2142 bool is_drop_n_account = false;
1da177e4 2143
51846355
AW
2144 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2145 * We may add members to them until current aligned size without forcing
2146 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2147 */
2148 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2149 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2150
1da177e4
LT
2151 if (skb->pkt_type == PACKET_LOOPBACK)
2152 goto drop;
2153
2154 sk = pt->af_packet_priv;
2155 po = pkt_sk(sk);
2156
09ad9bc7 2157 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2158 goto drop;
2159
3b04ddde 2160 if (dev->header_ops) {
1da177e4 2161 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2162 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2163 else if (skb->pkt_type == PACKET_OUTGOING) {
2164 /* Special case: outgoing packets have ll header at head */
bbe735e4 2165 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2166 }
2167 }
2168
2169 snaplen = skb->len;
2170
dbcb5855
DM
2171 res = run_filter(skb, sk, snaplen);
2172 if (!res)
fda9ef5d 2173 goto drop_n_restore;
68c2e5de
AD
2174
2175 if (skb->ip_summed == CHECKSUM_PARTIAL)
2176 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2177 else if (skb->pkt_type != PACKET_OUTGOING &&
2178 (skb->ip_summed == CHECKSUM_COMPLETE ||
2179 skb_csum_unnecessary(skb)))
2180 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2181
dbcb5855
DM
2182 if (snaplen > res)
2183 snaplen = res;
1da177e4
LT
2184
2185 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2186 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2187 po->tp_reserve;
1da177e4 2188 } else {
95c96174 2189 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2190 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2191 (maclen < 16 ? 16 : maclen)) +
58d19b19
WB
2192 po->tp_reserve;
2193 if (po->has_vnet_hdr)
2194 netoff += sizeof(struct virtio_net_hdr);
1da177e4
LT
2195 macoff = netoff - maclen;
2196 }
f6fb8f10 2197 if (po->tp_version <= TPACKET_V2) {
2198 if (macoff + snaplen > po->rx_ring.frame_size) {
2199 if (po->copy_thresh &&
0fd7bac6 2200 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2201 if (skb_shared(skb)) {
2202 copy_skb = skb_clone(skb, GFP_ATOMIC);
2203 } else {
2204 copy_skb = skb_get(skb);
2205 skb_head = skb->data;
2206 }
2207 if (copy_skb)
2208 skb_set_owner_r(copy_skb, sk);
1da177e4 2209 }
f6fb8f10 2210 snaplen = po->rx_ring.frame_size - macoff;
2211 if ((int)snaplen < 0)
2212 snaplen = 0;
1da177e4 2213 }
dc808110
ED
2214 } else if (unlikely(macoff + snaplen >
2215 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2216 u32 nval;
2217
2218 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2219 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2220 snaplen, nval, macoff);
2221 snaplen = nval;
2222 if (unlikely((int)snaplen < 0)) {
2223 snaplen = 0;
2224 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2225 }
1da177e4 2226 }
1da177e4 2227 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2228 h.raw = packet_current_rx_frame(po, skb,
2229 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2230 if (!h.raw)
58d19b19 2231 goto drop_n_account;
f6fb8f10 2232 if (po->tp_version <= TPACKET_V2) {
2233 packet_increment_rx_head(po, &po->rx_ring);
2234 /*
2235 * LOSING will be reported till you read the stats,
2236 * because it's COR - Clear On Read.
 2237	 * Anyway, this is done for V1/V2 only, as V3 doesn't need it
2238 * at packet level.
2239 */
ee80fbf3 2240 if (po->stats.stats1.tp_drops)
f6fb8f10 2241 status |= TP_STATUS_LOSING;
2242 }
ee80fbf3 2243 po->stats.stats1.tp_packets++;
1da177e4
LT
2244 if (copy_skb) {
2245 status |= TP_STATUS_COPY;
2246 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2247 }
1da177e4
LT
2248 spin_unlock(&sk->sk_receive_queue.lock);
2249
58d19b19 2250 if (po->has_vnet_hdr) {
5a213881
JR
2251 if (virtio_net_hdr_from_skb(skb, h.raw + macoff -
2252 sizeof(struct virtio_net_hdr),
6391a448 2253 vio_le(), true)) {
58d19b19
WB
2254 spin_lock(&sk->sk_receive_queue.lock);
2255 goto drop_n_account;
2256 }
2257 }
2258
bbd6ef87 2259 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2260
2261 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2262 getnstimeofday(&ts);
1da177e4 2263
b9c32fb2
DB
2264 status |= ts_status;
2265
bbd6ef87
PM
2266 switch (po->tp_version) {
2267 case TPACKET_V1:
2268 h.h1->tp_len = skb->len;
2269 h.h1->tp_snaplen = snaplen;
2270 h.h1->tp_mac = macoff;
2271 h.h1->tp_net = netoff;
4b457bdf
DB
2272 h.h1->tp_sec = ts.tv_sec;
2273 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2274 hdrlen = sizeof(*h.h1);
2275 break;
2276 case TPACKET_V2:
2277 h.h2->tp_len = skb->len;
2278 h.h2->tp_snaplen = snaplen;
2279 h.h2->tp_mac = macoff;
2280 h.h2->tp_net = netoff;
bbd6ef87
PM
2281 h.h2->tp_sec = ts.tv_sec;
2282 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2283 if (skb_vlan_tag_present(skb)) {
2284 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2285 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2286 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2287 } else {
2288 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2289 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2290 }
e4d26f4b 2291 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2292 hdrlen = sizeof(*h.h2);
2293 break;
f6fb8f10 2294 case TPACKET_V3:
 2295		/* tp_nxt_offset and vlan are already populated above,
 2296		 * so DON'T clear those fields here.
2297 */
2298 h.h3->tp_status |= status;
2299 h.h3->tp_len = skb->len;
2300 h.h3->tp_snaplen = snaplen;
2301 h.h3->tp_mac = macoff;
2302 h.h3->tp_net = netoff;
f6fb8f10 2303 h.h3->tp_sec = ts.tv_sec;
2304 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2305 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2306 hdrlen = sizeof(*h.h3);
2307 break;
bbd6ef87
PM
2308 default:
2309 BUG();
2310 }
1da177e4 2311
bbd6ef87 2312 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2313 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2314 sll->sll_family = AF_PACKET;
2315 sll->sll_hatype = dev->type;
2316 sll->sll_protocol = skb->protocol;
2317 sll->sll_pkttype = skb->pkt_type;
8032b464 2318 if (unlikely(po->origdev))
80feaacb
PWJ
2319 sll->sll_ifindex = orig_dev->ifindex;
2320 else
2321 sll->sll_ifindex = dev->ifindex;
1da177e4 2322
e16aa207 2323 smp_mb();
f0d4eb29 2324
f6dafa95 2325#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2326 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2327 u8 *start, *end;
2328
f0d4eb29
DB
2329 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2330 macoff + snaplen);
2331
2332 for (start = h.raw; start < end; start += PAGE_SIZE)
2333 flush_dcache_page(pgv_to_page(start));
1da177e4 2334 }
f0d4eb29 2335 smp_wmb();
f6dafa95 2336#endif
f0d4eb29 2337
da413eec 2338 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2339 __packet_set_status(po, h.raw, status);
da413eec
DC
2340 sk->sk_data_ready(sk);
2341 } else {
f6fb8f10 2342 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2343 }
1da177e4
LT
2344
2345drop_n_restore:
2346 if (skb_head != skb->data && skb_shared(skb)) {
2347 skb->data = skb_head;
2348 skb->len = skb_len;
2349 }
2350drop:
da37845f
WJ
2351 if (!is_drop_n_account)
2352 consume_skb(skb);
2353 else
2354 kfree_skb(skb);
1da177e4
LT
2355 return 0;
2356
58d19b19 2357drop_n_account:
da37845f 2358 is_drop_n_account = true;
ee80fbf3 2359 po->stats.stats1.tp_drops++;
1da177e4
LT
2360 spin_unlock(&sk->sk_receive_queue.lock);
2361
676d2369 2362 sk->sk_data_ready(sk);
acb5d75b 2363 kfree_skb(copy_skb);
1da177e4
LT
2364 goto drop_n_restore;
2365}
2366
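tpacket_rcv() is the kernel half of the memory-mapped RX path: it copies each frame into the next free ring slot and flips that slot's tp_status from TP_STATUS_KERNEL to TP_STATUS_USER. Below is a hedged sketch of the matching TPACKET_V2 consumer; the block and frame sizes are arbitrary example values, not anything mandated by this file.

/* Hedged sketch: map a TPACKET_V2 RX ring and walk the frames. */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <poll.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/socket.h>

#define FRAME_SIZE  2048
#define FRAME_NR    256
#define BLOCK_SIZE  (FRAME_SIZE * 64)

int main(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	int ver = TPACKET_V2;
	struct tpacket_req req = {
		.tp_block_size = BLOCK_SIZE,
		.tp_block_nr   = FRAME_NR * FRAME_SIZE / BLOCK_SIZE,
		.tp_frame_size = FRAME_SIZE,
		.tp_frame_nr   = FRAME_NR,
	};
	unsigned char *ring;
	unsigned int idx = 0;

	if (fd < 0)
		return 1;
	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));

	ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED)
		return 1;

	for (;;) {
		struct tpacket2_hdr *hdr =
			(struct tpacket2_hdr *)(ring + idx * FRAME_SIZE);

		if (!(hdr->tp_status & TP_STATUS_USER)) {
			struct pollfd pfd = { .fd = fd, .events = POLLIN };
			poll(&pfd, 1, -1);
			continue;
		}
		printf("frame len=%u snaplen=%u\n", hdr->tp_len, hdr->tp_snaplen);

		/* Hand the slot back to the kernel. */
		hdr->tp_status = TP_STATUS_KERNEL;
		idx = (idx + 1) % FRAME_NR;
	}
}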
69e3c75f
JB
2367static void tpacket_destruct_skb(struct sk_buff *skb)
2368{
2369 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2370
69e3c75f 2371 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2372 void *ph;
b9c32fb2
DB
2373 __u32 ts;
2374
69e3c75f 2375 ph = skb_shinfo(skb)->destructor_arg;
b0138408 2376 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2377
2378 ts = __packet_set_timestamp(po, ph, skb);
2379 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2380 }
2381
2382 sock_wfree(skb);
2383}
2384
c72219b7
DB
2385static void tpacket_set_protocol(const struct net_device *dev,
2386 struct sk_buff *skb)
2387{
2388 if (dev->type == ARPHRD_ETHER) {
2389 skb_reset_mac_header(skb);
2390 skb->protocol = eth_hdr(skb)->h_proto;
2391 }
2392}
2393
16cc1400
WB
2394static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2395{
16cc1400
WB
2396 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2397 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2398 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2399 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2400 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2401 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2402 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2403
2404 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2405 return -EINVAL;
2406
16cc1400
WB
2407 return 0;
2408}
2409
2410static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2411 struct virtio_net_hdr *vnet_hdr)
2412{
16cc1400
WB
2413 if (*len < sizeof(*vnet_hdr))
2414 return -EINVAL;
2415 *len -= sizeof(*vnet_hdr);
2416
cbbd26b8 2417 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
16cc1400
WB
2418 return -EFAULT;
2419
2420 return __packet_snd_vnet_parse(vnet_hdr, *len);
2421}
2422
40d4e3df 2423static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2424 void *frame, struct net_device *dev, void *data, int tp_len,
c14ac945
SHY
2425 __be16 proto, unsigned char *addr, int hlen, int copylen,
2426 const struct sockcm_cookie *sockc)
69e3c75f 2427{
184f489e 2428 union tpacket_uhdr ph;
8d39b4a6 2429 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2430 struct socket *sock = po->sk.sk_socket;
2431 struct page *page;
69e3c75f
JB
2432 int err;
2433
2434 ph.raw = frame;
2435
2436 skb->protocol = proto;
2437 skb->dev = dev;
2438 skb->priority = po->sk.sk_priority;
2d37a186 2439 skb->mark = po->sk.sk_mark;
c14ac945 2440 sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2441 skb_shinfo(skb)->destructor_arg = ph.raw;
2442
ae641949 2443 skb_reserve(skb, hlen);
69e3c75f 2444 skb_reset_network_header(skb);
c1aad275 2445
69e3c75f
JB
2446 to_write = tp_len;
2447
2448 if (sock->type == SOCK_DGRAM) {
2449 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2450 NULL, tp_len);
2451 if (unlikely(err < 0))
2452 return -EINVAL;
1d036d25 2453 } else if (copylen) {
9ed988cd
WB
2454 int hdrlen = min_t(int, copylen, tp_len);
2455
69e3c75f 2456 skb_push(skb, dev->hard_header_len);
1d036d25 2457 skb_put(skb, copylen - dev->hard_header_len);
9ed988cd 2458 err = skb_store_bits(skb, 0, data, hdrlen);
69e3c75f
JB
2459 if (unlikely(err))
2460 return err;
9ed988cd
WB
2461 if (!dev_validate_header(dev, skb->data, hdrlen))
2462 return -EINVAL;
c72219b7
DB
2463 if (!skb->protocol)
2464 tpacket_set_protocol(dev, skb);
69e3c75f 2465
9ed988cd
WB
2466 data += hdrlen;
2467 to_write -= hdrlen;
69e3c75f
JB
2468 }
2469
69e3c75f
JB
2470 offset = offset_in_page(data);
2471 len_max = PAGE_SIZE - offset;
2472 len = ((to_write > len_max) ? len_max : to_write);
2473
2474 skb->data_len = to_write;
2475 skb->len += to_write;
2476 skb->truesize += to_write;
2477 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2478
2479 while (likely(to_write)) {
2480 nr_frags = skb_shinfo(skb)->nr_frags;
2481
2482 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
 2483			pr_err("Packet exceeds the number of skb frags (%lu)\n",
2484 MAX_SKB_FRAGS);
69e3c75f
JB
2485 return -EFAULT;
2486 }
2487
0af55bb5
CG
2488 page = pgv_to_page(data);
2489 data += len;
69e3c75f
JB
2490 flush_dcache_page(page);
2491 get_page(page);
0af55bb5 2492 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2493 to_write -= len;
2494 offset = 0;
2495 len_max = PAGE_SIZE;
2496 len = ((to_write > len_max) ? len_max : to_write);
2497 }
2498
8fd6c80d 2499 skb_probe_transport_header(skb, 0);
efdfa2f7 2500
69e3c75f
JB
2501 return tp_len;
2502}
2503
8d39b4a6
WB
2504static int tpacket_parse_header(struct packet_sock *po, void *frame,
2505 int size_max, void **data)
2506{
2507 union tpacket_uhdr ph;
2508 int tp_len, off;
2509
2510 ph.raw = frame;
2511
2512 switch (po->tp_version) {
2513 case TPACKET_V2:
2514 tp_len = ph.h2->tp_len;
2515 break;
2516 default:
2517 tp_len = ph.h1->tp_len;
2518 break;
2519 }
2520 if (unlikely(tp_len > size_max)) {
 2521		pr_err("packet size is too big (%d > %d)\n", tp_len, size_max);
2522 return -EMSGSIZE;
2523 }
2524
2525 if (unlikely(po->tp_tx_has_off)) {
2526 int off_min, off_max;
2527
2528 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2529 off_max = po->tx_ring.frame_size - tp_len;
2530 if (po->sk.sk_type == SOCK_DGRAM) {
2531 switch (po->tp_version) {
2532 case TPACKET_V2:
2533 off = ph.h2->tp_net;
2534 break;
2535 default:
2536 off = ph.h1->tp_net;
2537 break;
2538 }
2539 } else {
2540 switch (po->tp_version) {
2541 case TPACKET_V2:
2542 off = ph.h2->tp_mac;
2543 break;
2544 default:
2545 off = ph.h1->tp_mac;
2546 break;
2547 }
2548 }
2549 if (unlikely((off < off_min) || (off_max < off)))
2550 return -EINVAL;
2551 } else {
2552 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2553 }
2554
2555 *data = frame + off;
2556 return tp_len;
2557}
2558
69e3c75f
JB
2559static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2560{
69e3c75f
JB
2561 struct sk_buff *skb;
2562 struct net_device *dev;
1d036d25 2563 struct virtio_net_hdr *vnet_hdr = NULL;
c14ac945 2564 struct sockcm_cookie sockc;
69e3c75f 2565 __be16 proto;
09effa67 2566 int err, reserve = 0;
40d4e3df 2567 void *ph;
342dfc30 2568 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2569 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2570 int tp_len, size_max;
2571 unsigned char *addr;
8d39b4a6 2572 void *data;
69e3c75f 2573 int len_sum = 0;
9e67030a 2574 int status = TP_STATUS_AVAILABLE;
1d036d25 2575 int hlen, tlen, copylen = 0;
69e3c75f 2576
69e3c75f
JB
2577 mutex_lock(&po->pg_vec_lock);
2578
66e56cd4 2579 if (likely(saddr == NULL)) {
e40526cb 2580 dev = packet_cached_dev_get(po);
69e3c75f
JB
2581 proto = po->num;
2582 addr = NULL;
2583 } else {
2584 err = -EINVAL;
2585 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2586 goto out;
2587 if (msg->msg_namelen < (saddr->sll_halen
2588 + offsetof(struct sockaddr_ll,
2589 sll_addr)))
2590 goto out;
69e3c75f
JB
2591 proto = saddr->sll_protocol;
2592 addr = saddr->sll_addr;
827d9780 2593 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2594 }
2595
edbe7746 2596 sockc.tsflags = po->sk.sk_tsflags;
c14ac945
SHY
2597 if (msg->msg_controllen) {
2598 err = sock_cmsg_send(&po->sk, msg, &sockc);
2599 if (unlikely(err))
2600 goto out;
2601 }
2602
69e3c75f
JB
2603 err = -ENXIO;
2604 if (unlikely(dev == NULL))
2605 goto out;
69e3c75f
JB
2606 err = -ENETDOWN;
2607 if (unlikely(!(dev->flags & IFF_UP)))
2608 goto out_put;
2609
5cfb4c8d
DB
2610 if (po->sk.sk_socket->type == SOCK_RAW)
2611 reserve = dev->hard_header_len;
69e3c75f 2612 size_max = po->tx_ring.frame_size
b5dd884e 2613 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2614
1d036d25 2615 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
5cfb4c8d 2616 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2617
69e3c75f
JB
2618 do {
2619 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2620 TP_STATUS_SEND_REQUEST);
69e3c75f 2621 if (unlikely(ph == NULL)) {
87a2fd28
DB
2622 if (need_wait && need_resched())
2623 schedule();
69e3c75f
JB
2624 continue;
2625 }
2626
8d39b4a6
WB
2627 skb = NULL;
2628 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2629 if (tp_len < 0)
2630 goto tpacket_error;
2631
69e3c75f 2632 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2633 hlen = LL_RESERVED_SPACE(dev);
2634 tlen = dev->needed_tailroom;
1d036d25
WB
2635 if (po->has_vnet_hdr) {
2636 vnet_hdr = data;
2637 data += sizeof(*vnet_hdr);
2638 tp_len -= sizeof(*vnet_hdr);
2639 if (tp_len < 0 ||
2640 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2641 tp_len = -EINVAL;
2642 goto tpacket_error;
2643 }
2644 copylen = __virtio16_to_cpu(vio_le(),
2645 vnet_hdr->hdr_len);
2646 }
9ed988cd 2647 copylen = max_t(int, copylen, dev->hard_header_len);
69e3c75f 2648 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2649 hlen + tlen + sizeof(struct sockaddr_ll) +
2650 (copylen - dev->hard_header_len),
fbf33a28 2651 !need_wait, &err);
69e3c75f 2652
fbf33a28
KM
2653 if (unlikely(skb == NULL)) {
2654 /* we assume the socket was initially writeable ... */
2655 if (likely(len_sum > 0))
2656 err = len_sum;
69e3c75f 2657 goto out_status;
fbf33a28 2658 }
8d39b4a6 2659 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
c14ac945 2660 addr, hlen, copylen, &sockc);
dbd46ab4 2661 if (likely(tp_len >= 0) &&
5cfb4c8d 2662 tp_len > dev->mtu + reserve &&
1d036d25 2663 !po->has_vnet_hdr &&
3c70c132
DB
2664 !packet_extra_vlan_len_allowed(dev, skb))
2665 tp_len = -EMSGSIZE;
69e3c75f
JB
2666
2667 if (unlikely(tp_len < 0)) {
8d39b4a6 2668tpacket_error:
69e3c75f
JB
2669 if (po->tp_loss) {
2670 __packet_set_status(po, ph,
2671 TP_STATUS_AVAILABLE);
2672 packet_increment_head(&po->tx_ring);
2673 kfree_skb(skb);
2674 continue;
2675 } else {
2676 status = TP_STATUS_WRONG_FORMAT;
2677 err = tp_len;
2678 goto out_status;
2679 }
2680 }
2681
db60eb5f
JR
2682 if (po->has_vnet_hdr && virtio_net_hdr_to_skb(skb, vnet_hdr,
2683 vio_le())) {
1d036d25
WB
2684 tp_len = -EINVAL;
2685 goto tpacket_error;
2686 }
2687
0fd5d57b
DB
2688 packet_pick_tx_queue(dev, skb);
2689
69e3c75f
JB
2690 skb->destructor = tpacket_destruct_skb;
2691 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2692 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2693
2694 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2695 err = po->xmit(skb);
eb70df13
JP
2696 if (unlikely(err > 0)) {
2697 err = net_xmit_errno(err);
2698 if (err && __packet_get_status(po, ph) ==
2699 TP_STATUS_AVAILABLE) {
2700 /* skb was destructed already */
2701 skb = NULL;
2702 goto out_status;
2703 }
2704 /*
2705 * skb was dropped but not destructed yet;
2706 * let's treat it like congestion or err < 0
2707 */
2708 err = 0;
2709 }
69e3c75f
JB
2710 packet_increment_head(&po->tx_ring);
2711 len_sum += tp_len;
b0138408
DB
2712 } while (likely((ph != NULL) ||
2713 /* Note: packet_read_pending() might be slow if we have
 2714		 * to call it as it's a per-cpu variable, but in the fast path
2715 * we already short-circuit the loop with the first
2716 * condition, and luckily don't have to go that path
2717 * anyway.
2718 */
2719 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2720
2721 err = len_sum;
2722 goto out_put;
2723
69e3c75f
JB
2724out_status:
2725 __packet_set_status(po, ph, status);
2726 kfree_skb(skb);
2727out_put:
e40526cb 2728 dev_put(dev);
69e3c75f
JB
2729out:
2730 mutex_unlock(&po->pg_vec_lock);
2731 return err;
2732}
69e3c75f 2733
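tpacket_snd() is driven entirely from the TX ring: userspace fills a frame, marks its slot TP_STATUS_SEND_REQUEST, and kicks the kernel with an ordinary send(); the status then moves through TP_STATUS_SENDING back to TP_STATUS_AVAILABLE in tpacket_destruct_skb() above. A hedged TPACKET_V2 producer sketch follows; it assumes the socket is already bound and its PACKET_TX_RING already mapped with the same 2048-byte frame size used in the RX example, and tx_one_frame() itself is an illustrative helper, not an API from this file.

/* Hedged sketch: queue one frame on a TPACKET_V2 TX ring and kick it out. */
#include <linux/if_packet.h>
#include <string.h>
#include <sys/socket.h>

#define FRAME_SIZE 2048

/* 'ring' is the mmap()ed PACKET_TX_RING area, 'idx' the slot to use. */
int tx_one_frame(int fd, unsigned char *ring, unsigned int idx,
		 const void *frame, unsigned int len)
{
	struct tpacket2_hdr *hdr =
		(struct tpacket2_hdr *)(ring + idx * FRAME_SIZE);

	if (hdr->tp_status != TP_STATUS_AVAILABLE)
		return -1;	/* slot still owned by the kernel */

	/* Frame data starts after the aligned header (tp_hdrlen minus the
	 * sockaddr_ll area), matching tpacket_parse_header() above.
	 */
	memcpy((unsigned char *)hdr + TPACKET2_HDRLEN -
	       sizeof(struct sockaddr_ll), frame, len);
	hdr->tp_len = len;
	hdr->tp_status = TP_STATUS_SEND_REQUEST;

	/* A zero-length send() flushes all TP_STATUS_SEND_REQUEST slots. */
	return send(fd, NULL, 0, 0);
}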
eea49cc9
OJ
2734static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2735 size_t reserve, size_t len,
2736 size_t linear, int noblock,
2737 int *err)
bfd5f4a3
SS
2738{
2739 struct sk_buff *skb;
2740
2741 /* Under a page? Don't bother with paged skb. */
2742 if (prepad + len < PAGE_SIZE || !linear)
2743 linear = len;
2744
2745 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2746 err, 0);
bfd5f4a3
SS
2747 if (!skb)
2748 return NULL;
2749
2750 skb_reserve(skb, reserve);
2751 skb_put(skb, linear);
2752 skb->data_len = len - linear;
2753 skb->len += len - linear;
2754
2755 return skb;
2756}
2757
d346a3fa 2758static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2759{
2760 struct sock *sk = sock->sk;
342dfc30 2761 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2762 struct sk_buff *skb;
2763 struct net_device *dev;
0e11c91e 2764 __be16 proto;
1da177e4 2765 unsigned char *addr;
827d9780 2766 int err, reserve = 0;
c7d39e32 2767 struct sockcm_cookie sockc;
bfd5f4a3
SS
2768 struct virtio_net_hdr vnet_hdr = { 0 };
2769 int offset = 0;
bfd5f4a3 2770 struct packet_sock *po = pkt_sk(sk);
57031eb7 2771 int hlen, tlen, linear;
3bdc0eba 2772 int extra_len = 0;
1da177e4
LT
2773
2774 /*
1ce4f28b 2775 * Get and verify the address.
1da177e4 2776 */
1ce4f28b 2777
66e56cd4 2778 if (likely(saddr == NULL)) {
e40526cb 2779 dev = packet_cached_dev_get(po);
1da177e4
LT
2780 proto = po->num;
2781 addr = NULL;
2782 } else {
2783 err = -EINVAL;
2784 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2785 goto out;
0fb375fb
EB
2786 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2787 goto out;
1da177e4
LT
2788 proto = saddr->sll_protocol;
2789 addr = saddr->sll_addr;
827d9780 2790 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2791 }
2792
1da177e4 2793 err = -ENXIO;
e40526cb 2794 if (unlikely(dev == NULL))
1da177e4 2795 goto out_unlock;
d5e76b0a 2796 err = -ENETDOWN;
e40526cb 2797 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2798 goto out_unlock;
2799
edbe7746 2800 sockc.tsflags = sk->sk_tsflags;
c7d39e32
EJ
2801 sockc.mark = sk->sk_mark;
2802 if (msg->msg_controllen) {
2803 err = sock_cmsg_send(sk, msg, &sockc);
2804 if (unlikely(err))
2805 goto out_unlock;
2806 }
2807
e40526cb
DB
2808 if (sock->type == SOCK_RAW)
2809 reserve = dev->hard_header_len;
bfd5f4a3 2810 if (po->has_vnet_hdr) {
16cc1400
WB
2811 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2812 if (err)
bfd5f4a3 2813 goto out_unlock;
bfd5f4a3
SS
2814 }
2815
3bdc0eba
BG
2816 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2817 if (!netif_supports_nofcs(dev)) {
2818 err = -EPROTONOSUPPORT;
2819 goto out_unlock;
2820 }
2821 extra_len = 4; /* We're doing our own CRC */
2822 }
2823
1da177e4 2824 err = -EMSGSIZE;
16cc1400
WB
2825 if (!vnet_hdr.gso_type &&
2826 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2827 goto out_unlock;
2828
bfd5f4a3 2829 err = -ENOBUFS;
ae641949
HX
2830 hlen = LL_RESERVED_SPACE(dev);
2831 tlen = dev->needed_tailroom;
57031eb7
WB
2832 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2833 linear = max(linear, min_t(int, len, dev->hard_header_len));
2834 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
bfd5f4a3 2835 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2836 if (skb == NULL)
1da177e4
LT
2837 goto out_unlock;
2838
bfd5f4a3 2839 skb_set_network_header(skb, reserve);
1da177e4 2840
0c4e8581 2841 err = -EINVAL;
9c707762
WB
2842 if (sock->type == SOCK_DGRAM) {
2843 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2844 if (unlikely(offset < 0))
9c707762 2845 goto out_free;
9c707762 2846 }
1da177e4
LT
2847
2848 /* Returns -EFAULT on error */
c0371da6 2849 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2850 if (err)
2851 goto out_free;
bf84a010 2852
9ed988cd
WB
2853 if (sock->type == SOCK_RAW &&
2854 !dev_validate_header(dev, skb->data, len)) {
2855 err = -EINVAL;
2856 goto out_free;
2857 }
2858
c14ac945 2859 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
1da177e4 2860
16cc1400 2861 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
2862 !packet_extra_vlan_len_allowed(dev, skb)) {
2863 err = -EMSGSIZE;
2864 goto out_free;
57f89bfa
BG
2865 }
2866
09effa67
DM
2867 skb->protocol = proto;
2868 skb->dev = dev;
1da177e4 2869 skb->priority = sk->sk_priority;
c7d39e32 2870 skb->mark = sockc.mark;
0fd5d57b
DB
2871
2872 packet_pick_tx_queue(dev, skb);
1da177e4 2873
bfd5f4a3 2874 if (po->has_vnet_hdr) {
db60eb5f 2875 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
16cc1400
WB
2876 if (err)
2877 goto out_free;
2878 len += sizeof(vnet_hdr);
bfd5f4a3
SS
2879 }
2880
8fd6c80d
DB
2881 skb_probe_transport_header(skb, reserve);
2882
3bdc0eba
BG
2883 if (unlikely(extra_len == 4))
2884 skb->no_fcs = 1;
2885
d346a3fa 2886 err = po->xmit(skb);
1da177e4
LT
2887 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2888 goto out_unlock;
2889
e40526cb 2890 dev_put(dev);
1da177e4 2891
40d4e3df 2892 return len;
1da177e4
LT
2893
2894out_free:
2895 kfree_skb(skb);
2896out_unlock:
e40526cb 2897 if (dev)
1da177e4
LT
2898 dev_put(dev);
2899out:
2900 return err;
2901}
2902
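From userspace the non-ring TX path above is just a sendto() with a sockaddr_ll naming the egress interface: on a SOCK_RAW socket the buffer must already be a complete link-layer frame (dev_validate_header() rejects anything shorter than a valid header), while SOCK_DGRAM lets the kernel build the header from sll_addr. A hedged SOCK_RAW sketch; the frame contents and interface name passed in are placeholders supplied by the caller.

/* Hedged sketch: transmit one pre-built Ethernet frame on a given interface. */
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>

ssize_t send_raw_frame(int fd, const char *ifname,
		       const void *frame, size_t len)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_ifindex  = if_nametoindex(ifname);
	sll.sll_halen    = ETH_ALEN;
	/* Destination MAC; only consulted for SOCK_DGRAM, harmless here. */
	memcpy(sll.sll_addr, frame, ETH_ALEN);

	return sendto(fd, frame, len, 0,
		      (struct sockaddr *)&sll, sizeof(sll));
}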
1b784140 2903static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2904{
69e3c75f
JB
2905 struct sock *sk = sock->sk;
2906 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2907
69e3c75f
JB
2908 if (po->tx_ring.pg_vec)
2909 return tpacket_snd(po, msg);
2910 else
69e3c75f
JB
2911 return packet_snd(sock, msg, len);
2912}
2913
1da177e4
LT
2914/*
2915 * Close a PACKET socket. This is fairly simple. We immediately go
2916 * to 'closed' state and remove our protocol entry in the device list.
2917 */
2918
2919static int packet_release(struct socket *sock)
2920{
2921 struct sock *sk = sock->sk;
2922 struct packet_sock *po;
2bd624b4 2923 struct packet_fanout *f;
d12d01d6 2924 struct net *net;
f6fb8f10 2925 union tpacket_req_u req_u;
1da177e4
LT
2926
2927 if (!sk)
2928 return 0;
2929
3b1e0a65 2930 net = sock_net(sk);
1da177e4
LT
2931 po = pkt_sk(sk);
2932
0fa7fa98 2933 mutex_lock(&net->packet.sklist_lock);
808f5114 2934 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2935 mutex_unlock(&net->packet.sklist_lock);
2936
2937 preempt_disable();
920de804 2938 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2939 preempt_enable();
1da177e4 2940
808f5114 2941 spin_lock(&po->bind_lock);
ce06b03e 2942 unregister_prot_hook(sk, false);
66e56cd4
DB
2943 packet_cached_dev_reset(po);
2944
160ff18a
BG
2945 if (po->prot_hook.dev) {
2946 dev_put(po->prot_hook.dev);
2947 po->prot_hook.dev = NULL;
2948 }
808f5114 2949 spin_unlock(&po->bind_lock);
1da177e4 2950
1da177e4 2951 packet_flush_mclist(sk);
1da177e4 2952
9665d5d6
PS
2953 if (po->rx_ring.pg_vec) {
2954 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2955 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2956 }
69e3c75f 2957
9665d5d6
PS
2958 if (po->tx_ring.pg_vec) {
2959 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2960 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2961 }
1da177e4 2962
2bd624b4 2963 f = fanout_release(sk);
dc99f600 2964
808f5114 2965 synchronize_net();
2bd624b4
AS
2966
2967 if (f) {
2968 fanout_release_data(f);
2969 kfree(f);
2970 }
1da177e4
LT
2971 /*
2972 * Now the socket is dead. No more input will appear.
2973 */
1da177e4
LT
2974 sock_orphan(sk);
2975 sock->sk = NULL;
2976
2977 /* Purge queues */
2978
2979 skb_queue_purge(&sk->sk_receive_queue);
b0138408 2980 packet_free_pending(po);
17ab56a2 2981 sk_refcnt_debug_release(sk);
1da177e4
LT
2982
2983 sock_put(sk);
2984 return 0;
2985}
2986
2987/*
2988 * Attach a packet hook.
2989 */
2990
30f7ea1c
FR
2991static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
2992 __be16 proto)
1da177e4
LT
2993{
2994 struct packet_sock *po = pkt_sk(sk);
158cd4af 2995 struct net_device *dev_curr;
902fefb8
DB
2996 __be16 proto_curr;
2997 bool need_rehook;
30f7ea1c
FR
2998 struct net_device *dev = NULL;
2999 int ret = 0;
3000 bool unlisted = false;
dc99f600 3001
30f7ea1c 3002 if (po->fanout)
dc99f600 3003 return -EINVAL;
1da177e4
LT
3004
3005 lock_sock(sk);
1da177e4 3006 spin_lock(&po->bind_lock);
30f7ea1c
FR
3007 rcu_read_lock();
3008
3009 if (name) {
3010 dev = dev_get_by_name_rcu(sock_net(sk), name);
3011 if (!dev) {
3012 ret = -ENODEV;
3013 goto out_unlock;
3014 }
3015 } else if (ifindex) {
3016 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3017 if (!dev) {
3018 ret = -ENODEV;
3019 goto out_unlock;
3020 }
3021 }
3022
3023 if (dev)
3024 dev_hold(dev);
66e56cd4 3025
902fefb8
DB
3026 proto_curr = po->prot_hook.type;
3027 dev_curr = po->prot_hook.dev;
3028
3029 need_rehook = proto_curr != proto || dev_curr != dev;
3030
3031 if (need_rehook) {
30f7ea1c
FR
3032 if (po->running) {
3033 rcu_read_unlock();
3034 __unregister_prot_hook(sk, true);
3035 rcu_read_lock();
3036 dev_curr = po->prot_hook.dev;
3037 if (dev)
3038 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3039 dev->ifindex);
3040 }
1da177e4 3041
902fefb8
DB
3042 po->num = proto;
3043 po->prot_hook.type = proto;
902fefb8 3044
30f7ea1c
FR
3045 if (unlikely(unlisted)) {
3046 dev_put(dev);
3047 po->prot_hook.dev = NULL;
3048 po->ifindex = -1;
3049 packet_cached_dev_reset(po);
3050 } else {
3051 po->prot_hook.dev = dev;
3052 po->ifindex = dev ? dev->ifindex : 0;
3053 packet_cached_dev_assign(po, dev);
3054 }
902fefb8 3055 }
158cd4af
LW
3056 if (dev_curr)
3057 dev_put(dev_curr);
66e56cd4 3058
902fefb8 3059 if (proto == 0 || !need_rehook)
1da177e4
LT
3060 goto out_unlock;
3061
30f7ea1c 3062 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3063 register_prot_hook(sk);
be85d4ad
UT
3064 } else {
3065 sk->sk_err = ENETDOWN;
3066 if (!sock_flag(sk, SOCK_DEAD))
3067 sk->sk_error_report(sk);
1da177e4
LT
3068 }
3069
3070out_unlock:
30f7ea1c 3071 rcu_read_unlock();
1da177e4
LT
3072 spin_unlock(&po->bind_lock);
3073 release_sock(sk);
30f7ea1c 3074 return ret;
1da177e4
LT
3075}
3076
3077/*
3078 * Bind a packet socket to a device
3079 */
3080
40d4e3df
ED
3081static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3082 int addr_len)
1da177e4 3083{
40d4e3df 3084 struct sock *sk = sock->sk;
c87838f6 3085 char name[sizeof(uaddr->sa_data) + 1];
1ce4f28b 3086
1da177e4
LT
3087 /*
3088 * Check legality
3089 */
1ce4f28b 3090
8ae55f04 3091 if (addr_len != sizeof(struct sockaddr))
1da177e4 3092 return -EINVAL;
c87838f6
AP
 3093	/* uaddr->sa_data comes from userspace; it's not guaranteed to be
3094 * zero-terminated.
3095 */
3096 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3097 name[sizeof(uaddr->sa_data)] = 0;
1da177e4 3098
30f7ea1c 3099 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3100}
1da177e4
LT
3101
3102static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3103{
40d4e3df
ED
3104 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3105 struct sock *sk = sock->sk;
1da177e4
LT
3106
3107 /*
3108 * Check legality
3109 */
1ce4f28b 3110
1da177e4
LT
3111 if (addr_len < sizeof(struct sockaddr_ll))
3112 return -EINVAL;
3113 if (sll->sll_family != AF_PACKET)
3114 return -EINVAL;
3115
30f7ea1c
FR
3116 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3117 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3118}
3119
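packet_do_bind() above accepts either an interface name (the legacy SOCK_PACKET form) or an ifindex plus protocol from the sockaddr_ll form, and re-registers the prot_hook so that only traffic from that device and ethertype reaches the socket. A hedged sketch of the sockaddr_ll form; the ETH_P_IP restriction and the interface name a caller would pass are example values only.

/* Hedged sketch: bind a packet socket to one interface and one ethertype. */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>

int bind_packet_socket(int fd, const char *ifname)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_IP);	/* only IPv4 frames */
	sll.sll_ifindex  = if_nametoindex(ifname);

	return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}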
3120static struct proto packet_proto = {
3121 .name = "PACKET",
3122 .owner = THIS_MODULE,
3123 .obj_size = sizeof(struct packet_sock),
3124};
3125
3126/*
1ce4f28b 3127 * Create a packet of type SOCK_PACKET.
1da177e4
LT
3128 */
3129
3f378b68
EP
3130static int packet_create(struct net *net, struct socket *sock, int protocol,
3131 int kern)
1da177e4
LT
3132{
3133 struct sock *sk;
3134 struct packet_sock *po;
0e11c91e 3135 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3136 int err;
3137
df008c91 3138 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3139 return -EPERM;
be02097c
DM
3140 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3141 sock->type != SOCK_PACKET)
1da177e4
LT
3142 return -ESOCKTNOSUPPORT;
3143
3144 sock->state = SS_UNCONNECTED;
3145
3146 err = -ENOBUFS;
11aa9c28 3147 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3148 if (sk == NULL)
3149 goto out;
3150
3151 sock->ops = &packet_ops;
1da177e4
LT
3152 if (sock->type == SOCK_PACKET)
3153 sock->ops = &packet_ops_spkt;
be02097c 3154
1da177e4
LT
3155 sock_init_data(sock, sk);
3156
3157 po = pkt_sk(sk);
3158 sk->sk_family = PF_PACKET;
0e11c91e 3159 po->num = proto;
d346a3fa 3160 po->xmit = dev_queue_xmit;
66e56cd4 3161
b0138408
DB
3162 err = packet_alloc_pending(po);
3163 if (err)
3164 goto out2;
3165
66e56cd4 3166 packet_cached_dev_reset(po);
1da177e4
LT
3167
3168 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3169 sk_refcnt_debug_inc(sk);
1da177e4
LT
3170
3171 /*
3172 * Attach a protocol block
3173 */
3174
3175 spin_lock_init(&po->bind_lock);
905db440 3176 mutex_init(&po->pg_vec_lock);
0648ab70 3177 po->rollover = NULL;
1da177e4 3178 po->prot_hook.func = packet_rcv;
be02097c 3179
1da177e4
LT
3180 if (sock->type == SOCK_PACKET)
3181 po->prot_hook.func = packet_rcv_spkt;
be02097c 3182
1da177e4
LT
3183 po->prot_hook.af_packet_priv = sk;
3184
0e11c91e
AV
3185 if (proto) {
3186 po->prot_hook.type = proto;
ce06b03e 3187 register_prot_hook(sk);
1da177e4
LT
3188 }
3189
0fa7fa98 3190 mutex_lock(&net->packet.sklist_lock);
808f5114 3191 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3192 mutex_unlock(&net->packet.sklist_lock);
3193
3194 preempt_disable();
3680453c 3195 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3196 preempt_enable();
808f5114 3197
40d4e3df 3198 return 0;
b0138408
DB
3199out2:
3200 sk_free(sk);
1da177e4
LT
3201out:
3202 return err;
3203}
3204
3205/*
3206 * Pull a packet from our receive queue and hand it to the user.
3207 * If necessary we block.
3208 */
3209
1b784140
YX
3210static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3211 int flags)
1da177e4
LT
3212{
3213 struct sock *sk = sock->sk;
3214 struct sk_buff *skb;
3215 int copied, err;
bfd5f4a3 3216 int vnet_hdr_len = 0;
2472d761 3217 unsigned int origlen = 0;
1da177e4
LT
3218
3219 err = -EINVAL;
ed85b565 3220 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3221 goto out;
3222
3223#if 0
3224 /* What error should we return now? EUNATTACH? */
3225 if (pkt_sk(sk)->ifindex < 0)
3226 return -ENODEV;
3227#endif
3228
ed85b565 3229 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3230 err = sock_recv_errqueue(sk, msg, len,
3231 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3232 goto out;
3233 }
3234
1da177e4
LT
3235 /*
3236 * Call the generic datagram receiver. This handles all sorts
3237 * of horrible races and re-entrancy so we can forget about it
3238 * in the protocol layers.
3239 *
 3240	 *	Now it will return ENETDOWN if the device has just gone down,
3241 * but then it will block.
3242 */
3243
40d4e3df 3244 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3245
3246 /*
1ce4f28b 3247 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
 3248	 *	handles the blocking, we don't need to see or worry about
 3249	 *	blocking retries.
3250 */
3251
8ae55f04 3252 if (skb == NULL)
1da177e4
LT
3253 goto out;
3254
2ccdbaa6
WB
3255 if (pkt_sk(sk)->pressure)
3256 packet_rcv_has_room(pkt_sk(sk), NULL);
3257
bfd5f4a3 3258 if (pkt_sk(sk)->has_vnet_hdr) {
16cc1400
WB
3259 err = packet_rcv_vnet(msg, skb, &len);
3260 if (err)
bfd5f4a3 3261 goto out_free;
16cc1400 3262 vnet_hdr_len = sizeof(struct virtio_net_hdr);
bfd5f4a3
SS
3263 }
3264
f3d33426
HFS
3265 /* You lose any data beyond the buffer you gave. If it worries
3266 * a user program they can ask the device for its MTU
3267 * anyway.
1da177e4 3268 */
1da177e4 3269 copied = skb->len;
40d4e3df
ED
3270 if (copied > len) {
3271 copied = len;
3272 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3273 }
3274
51f3d02b 3275 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3276 if (err)
3277 goto out_free;
3278
2472d761
EB
3279 if (sock->type != SOCK_PACKET) {
3280 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3281
3282 /* Original length was stored in sockaddr_ll fields */
3283 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3284 sll->sll_family = AF_PACKET;
3285 sll->sll_protocol = skb->protocol;
3286 }
3287
3b885787 3288 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3289
f3d33426
HFS
3290 if (msg->msg_name) {
3291 /* If the address length field is there to be filled
3292 * in, we fill it in now.
3293 */
3294 if (sock->type == SOCK_PACKET) {
342dfc30 3295 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3296 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3297 } else {
3298 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3299
f3d33426
HFS
3300 msg->msg_namelen = sll->sll_halen +
3301 offsetof(struct sockaddr_ll, sll_addr);
3302 }
ffbc6111
HX
3303 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3304 msg->msg_namelen);
f3d33426 3305 }
1da177e4 3306
8dc41944 3307 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3308 struct tpacket_auxdata aux;
3309
3310 aux.tp_status = TP_STATUS_USER;
3311 if (skb->ip_summed == CHECKSUM_PARTIAL)
3312 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3313 else if (skb->pkt_type != PACKET_OUTGOING &&
3314 (skb->ip_summed == CHECKSUM_COMPLETE ||
3315 skb_csum_unnecessary(skb)))
3316 aux.tp_status |= TP_STATUS_CSUM_VALID;
3317
2472d761 3318 aux.tp_len = origlen;
ffbc6111
HX
3319 aux.tp_snaplen = skb->len;
3320 aux.tp_mac = 0;
bbe735e4 3321 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3322 if (skb_vlan_tag_present(skb)) {
3323 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3324 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3325 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3326 } else {
3327 aux.tp_vlan_tci = 0;
a0cdfcf3 3328 aux.tp_vlan_tpid = 0;
a3bcc23e 3329 }
ffbc6111 3330 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3331 }
3332
1da177e4
LT
3333 /*
3334 * Free or return the buffer as appropriate. Again this
3335 * hides all the races and re-entrancy issues from us.
3336 */
bfd5f4a3 3337 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3338
3339out_free:
3340 skb_free_datagram(sk, skb);
3341out:
3342 return err;
3343}
3344
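When PACKET_AUXDATA is enabled, packet_recvmsg() attaches the tpacket_auxdata block built above as a control message, which is how a non-mmap reader learns the original (untruncated) length, checksum status and VLAN tag. A hedged sketch of reading it back; buffer sizes are arbitrary and recv_with_auxdata() is an illustrative helper name.

/* Hedged sketch: recvmsg() with PACKET_AUXDATA to recover VLAN/len metadata. */
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

int recv_with_auxdata(int fd)
{
	unsigned char frame[ETH_FRAME_LEN];
	char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
	struct iovec iov = { .iov_base = frame, .iov_len = sizeof(frame) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg;
	int one = 1;
	ssize_t len;

	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));

	len = recvmsg(fd, &msg, 0);
	if (len < 0)
		return -1;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_PACKET &&
		    cmsg->cmsg_type == PACKET_AUXDATA) {
			struct tpacket_auxdata aux;

			memcpy(&aux, CMSG_DATA(cmsg), sizeof(aux));
			printf("orig len=%u snaplen=%u vlan_tci=%u\n",
			       aux.tp_len, aux.tp_snaplen, aux.tp_vlan_tci);
		}
	}
	return 0;
}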
1da177e4
LT
3345static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3346 int *uaddr_len, int peer)
3347{
3348 struct net_device *dev;
3349 struct sock *sk = sock->sk;
3350
3351 if (peer)
3352 return -EOPNOTSUPP;
3353
3354 uaddr->sa_family = AF_PACKET;
2dc85bf3 3355 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3356 rcu_read_lock();
3357 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3358 if (dev)
2dc85bf3 3359 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3360 rcu_read_unlock();
1da177e4
LT
3361 *uaddr_len = sizeof(*uaddr);
3362
3363 return 0;
3364}
1da177e4
LT
3365
3366static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3367 int *uaddr_len, int peer)
3368{
3369 struct net_device *dev;
3370 struct sock *sk = sock->sk;
3371 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3372 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3373
3374 if (peer)
3375 return -EOPNOTSUPP;
3376
3377 sll->sll_family = AF_PACKET;
3378 sll->sll_ifindex = po->ifindex;
3379 sll->sll_protocol = po->num;
67286640 3380 sll->sll_pkttype = 0;
654d1f8a
ED
3381 rcu_read_lock();
3382 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3383 if (dev) {
3384 sll->sll_hatype = dev->type;
3385 sll->sll_halen = dev->addr_len;
3386 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3387 } else {
3388 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3389 sll->sll_halen = 0;
3390 }
654d1f8a 3391 rcu_read_unlock();
0fb375fb 3392 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3393
3394 return 0;
3395}
3396
2aeb0b88
WC
3397static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3398 int what)
1da177e4
LT
3399{
3400 switch (i->type) {
3401 case PACKET_MR_MULTICAST:
1162563f
JP
3402 if (i->alen != dev->addr_len)
3403 return -EINVAL;
1da177e4 3404 if (what > 0)
22bedad3 3405 return dev_mc_add(dev, i->addr);
1da177e4 3406 else
22bedad3 3407 return dev_mc_del(dev, i->addr);
1da177e4
LT
3408 break;
3409 case PACKET_MR_PROMISC:
2aeb0b88 3410 return dev_set_promiscuity(dev, what);
1da177e4 3411 case PACKET_MR_ALLMULTI:
2aeb0b88 3412 return dev_set_allmulti(dev, what);
d95ed927 3413 case PACKET_MR_UNICAST:
1162563f
JP
3414 if (i->alen != dev->addr_len)
3415 return -EINVAL;
d95ed927 3416 if (what > 0)
a748ee24 3417 return dev_uc_add(dev, i->addr);
d95ed927 3418 else
a748ee24 3419 return dev_uc_del(dev, i->addr);
d95ed927 3420 break;
40d4e3df
ED
3421 default:
3422 break;
1da177e4 3423 }
2aeb0b88 3424 return 0;
1da177e4
LT
3425}
3426
82f17091
FR
3427static void packet_dev_mclist_delete(struct net_device *dev,
3428 struct packet_mclist **mlp)
1da177e4 3429{
82f17091
FR
3430 struct packet_mclist *ml;
3431
3432 while ((ml = *mlp) != NULL) {
3433 if (ml->ifindex == dev->ifindex) {
3434 packet_dev_mc(dev, ml, -1);
3435 *mlp = ml->next;
3436 kfree(ml);
3437 } else
3438 mlp = &ml->next;
1da177e4
LT
3439 }
3440}
3441
0fb375fb 3442static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3443{
3444 struct packet_sock *po = pkt_sk(sk);
3445 struct packet_mclist *ml, *i;
3446 struct net_device *dev;
3447 int err;
3448
3449 rtnl_lock();
3450
3451 err = -ENODEV;
3b1e0a65 3452 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3453 if (!dev)
3454 goto done;
3455
3456 err = -EINVAL;
1162563f 3457 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3458 goto done;
3459
3460 err = -ENOBUFS;
8b3a7005 3461 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3462 if (i == NULL)
3463 goto done;
3464
3465 err = 0;
3466 for (ml = po->mclist; ml; ml = ml->next) {
3467 if (ml->ifindex == mreq->mr_ifindex &&
3468 ml->type == mreq->mr_type &&
3469 ml->alen == mreq->mr_alen &&
3470 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3471 ml->count++;
3472 /* Free the new element ... */
3473 kfree(i);
3474 goto done;
3475 }
3476 }
3477
3478 i->type = mreq->mr_type;
3479 i->ifindex = mreq->mr_ifindex;
3480 i->alen = mreq->mr_alen;
3481 memcpy(i->addr, mreq->mr_address, i->alen);
309cf37f 3482 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3483 i->count = 1;
3484 i->next = po->mclist;
3485 po->mclist = i;
2aeb0b88
WC
3486 err = packet_dev_mc(dev, i, 1);
3487 if (err) {
3488 po->mclist = i->next;
3489 kfree(i);
3490 }
1da177e4
LT
3491
3492done:
3493 rtnl_unlock();
3494 return err;
3495}
3496
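packet_mc_add() backs the PACKET_ADD_MEMBERSHIP socket option handled further down; besides real multicast addresses it also accepts PACKET_MR_PROMISC and PACKET_MR_ALLMULTI (dispatched in packet_dev_mc() above), so a sniffer can enable promiscuous mode without touching the interface flags directly. A hedged sketch; the interface name a caller would pass is a placeholder.

/* Hedged sketch: enable promiscuous mode via PACKET_ADD_MEMBERSHIP. */
#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>

int enable_promisc(int fd, const char *ifname)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = if_nametoindex(ifname);
	mreq.mr_type    = PACKET_MR_PROMISC;

	return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
			  &mreq, sizeof(mreq));
}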
0fb375fb 3497static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3498{
3499 struct packet_mclist *ml, **mlp;
3500
3501 rtnl_lock();
3502
3503 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3504 if (ml->ifindex == mreq->mr_ifindex &&
3505 ml->type == mreq->mr_type &&
3506 ml->alen == mreq->mr_alen &&
3507 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3508 if (--ml->count == 0) {
3509 struct net_device *dev;
3510 *mlp = ml->next;
ad959e76
ED
3511 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3512 if (dev)
1da177e4 3513 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3514 kfree(ml);
3515 }
82f17091 3516 break;
1da177e4
LT
3517 }
3518 }
3519 rtnl_unlock();
82f17091 3520 return 0;
1da177e4
LT
3521}
3522
3523static void packet_flush_mclist(struct sock *sk)
3524{
3525 struct packet_sock *po = pkt_sk(sk);
3526 struct packet_mclist *ml;
3527
3528 if (!po->mclist)
3529 return;
3530
3531 rtnl_lock();
3532 while ((ml = po->mclist) != NULL) {
3533 struct net_device *dev;
3534
3535 po->mclist = ml->next;
ad959e76
ED
3536 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3537 if (dev != NULL)
1da177e4 3538 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3539 kfree(ml);
3540 }
3541 rtnl_unlock();
3542}
1da177e4
LT
3543
3544static int
b7058842 3545packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3546{
3547 struct sock *sk = sock->sk;
8dc41944 3548 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3549 int ret;
3550
3551 if (level != SOL_PACKET)
3552 return -ENOPROTOOPT;
3553
69e3c75f 3554 switch (optname) {
1ce4f28b 3555 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3556 case PACKET_DROP_MEMBERSHIP:
3557 {
0fb375fb
EB
3558 struct packet_mreq_max mreq;
3559 int len = optlen;
3560 memset(&mreq, 0, sizeof(mreq));
3561 if (len < sizeof(struct packet_mreq))
1da177e4 3562 return -EINVAL;
0fb375fb
EB
3563 if (len > sizeof(mreq))
3564 len = sizeof(mreq);
40d4e3df 3565 if (copy_from_user(&mreq, optval, len))
1da177e4 3566 return -EFAULT;
0fb375fb
EB
3567 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3568 return -EINVAL;
1da177e4
LT
3569 if (optname == PACKET_ADD_MEMBERSHIP)
3570 ret = packet_mc_add(sk, &mreq);
3571 else
3572 ret = packet_mc_drop(sk, &mreq);
3573 return ret;
3574 }
a2efcfa0 3575
1da177e4 3576 case PACKET_RX_RING:
69e3c75f 3577 case PACKET_TX_RING:
1da177e4 3578 {
f6fb8f10 3579 union tpacket_req_u req_u;
3580 int len;
1da177e4 3581
f6fb8f10 3582 switch (po->tp_version) {
3583 case TPACKET_V1:
3584 case TPACKET_V2:
3585 len = sizeof(req_u.req);
3586 break;
3587 case TPACKET_V3:
3588 default:
3589 len = sizeof(req_u.req3);
3590 break;
3591 }
3592 if (optlen < len)
1da177e4 3593 return -EINVAL;
f6fb8f10 3594 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3595 return -EFAULT;
f6fb8f10 3596 return packet_set_ring(sk, &req_u, 0,
3597 optname == PACKET_TX_RING);
1da177e4
LT
3598 }
3599 case PACKET_COPY_THRESH:
3600 {
3601 int val;
3602
40d4e3df 3603 if (optlen != sizeof(val))
1da177e4 3604 return -EINVAL;
40d4e3df 3605 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3606 return -EFAULT;
3607
3608 pkt_sk(sk)->copy_thresh = val;
3609 return 0;
3610 }
bbd6ef87
PM
3611 case PACKET_VERSION:
3612 {
3613 int val;
3614
3615 if (optlen != sizeof(val))
3616 return -EINVAL;
bbd6ef87
PM
3617 if (copy_from_user(&val, optval, sizeof(val)))
3618 return -EFAULT;
3619 switch (val) {
3620 case TPACKET_V1:
3621 case TPACKET_V2:
f6fb8f10 3622 case TPACKET_V3:
84ac7260 3623 break;
bbd6ef87
PM
3624 default:
3625 return -EINVAL;
3626 }
84ac7260
PP
3627 lock_sock(sk);
3628 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3629 ret = -EBUSY;
3630 } else {
3631 po->tp_version = val;
3632 ret = 0;
3633 }
3634 release_sock(sk);
3635 return ret;
bbd6ef87 3636 }
8913336a
PM
3637 case PACKET_RESERVE:
3638 {
3639 unsigned int val;
3640
3641 if (optlen != sizeof(val))
3642 return -EINVAL;
69e3c75f 3643 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3644 return -EBUSY;
3645 if (copy_from_user(&val, optval, sizeof(val)))
3646 return -EFAULT;
3647 po->tp_reserve = val;
3648 return 0;
3649 }
69e3c75f
JB
3650 case PACKET_LOSS:
3651 {
3652 unsigned int val;
3653
3654 if (optlen != sizeof(val))
3655 return -EINVAL;
3656 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3657 return -EBUSY;
3658 if (copy_from_user(&val, optval, sizeof(val)))
3659 return -EFAULT;
3660 po->tp_loss = !!val;
3661 return 0;
3662 }
8dc41944
HX
3663 case PACKET_AUXDATA:
3664 {
3665 int val;
3666
3667 if (optlen < sizeof(val))
3668 return -EINVAL;
3669 if (copy_from_user(&val, optval, sizeof(val)))
3670 return -EFAULT;
3671
3672 po->auxdata = !!val;
3673 return 0;
3674 }
80feaacb
PWJ
3675 case PACKET_ORIGDEV:
3676 {
3677 int val;
3678
3679 if (optlen < sizeof(val))
3680 return -EINVAL;
3681 if (copy_from_user(&val, optval, sizeof(val)))
3682 return -EFAULT;
3683
3684 po->origdev = !!val;
3685 return 0;
3686 }
bfd5f4a3
SS
3687 case PACKET_VNET_HDR:
3688 {
3689 int val;
3690
3691 if (sock->type != SOCK_RAW)
3692 return -EINVAL;
3693 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3694 return -EBUSY;
3695 if (optlen < sizeof(val))
3696 return -EINVAL;
3697 if (copy_from_user(&val, optval, sizeof(val)))
3698 return -EFAULT;
3699
3700 po->has_vnet_hdr = !!val;
3701 return 0;
3702 }
614f60fa
SM
3703 case PACKET_TIMESTAMP:
3704 {
3705 int val;
3706
3707 if (optlen != sizeof(val))
3708 return -EINVAL;
3709 if (copy_from_user(&val, optval, sizeof(val)))
3710 return -EFAULT;
3711
3712 po->tp_tstamp = val;
3713 return 0;
3714 }
dc99f600
DM
3715 case PACKET_FANOUT:
3716 {
3717 int val;
3718
3719 if (optlen != sizeof(val))
3720 return -EINVAL;
3721 if (copy_from_user(&val, optval, sizeof(val)))
3722 return -EFAULT;
3723
3724 return fanout_add(sk, val & 0xffff, val >> 16);
3725 }
47dceb8e
WB
3726 case PACKET_FANOUT_DATA:
3727 {
3728 if (!po->fanout)
3729 return -EINVAL;
3730
3731 return fanout_set_data(po, optval, optlen);
3732 }
5920cd3a
PC
3733 case PACKET_TX_HAS_OFF:
3734 {
3735 unsigned int val;
3736
3737 if (optlen != sizeof(val))
3738 return -EINVAL;
3739 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3740 return -EBUSY;
3741 if (copy_from_user(&val, optval, sizeof(val)))
3742 return -EFAULT;
3743 po->tp_tx_has_off = !!val;
3744 return 0;
3745 }
d346a3fa
DB
3746 case PACKET_QDISC_BYPASS:
3747 {
3748 int val;
3749
3750 if (optlen != sizeof(val))
3751 return -EINVAL;
3752 if (copy_from_user(&val, optval, sizeof(val)))
3753 return -EFAULT;
3754
3755 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3756 return 0;
3757 }
1da177e4
LT
3758 default:
3759 return -ENOPROTOOPT;
3760 }
3761}
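
A minimal userspace sketch of the setsockopt paths handled above (illustrative only, not part of af_packet.c; error handling is trimmed, the fanout group id 42 is an arbitrary example value, and the interface index is assumed to come from if_nametoindex()):

/* Illustrative userspace sketch, not part of af_packet.c. */
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>

static int setup_packet_sock(int fd, const char *ifname)
{
        int ver = TPACKET_V3;   /* must be set before any ring exists (EBUSY otherwise) */
        struct packet_mreq mreq;
        /* PACKET_FANOUT: low 16 bits carry the group id, high 16 bits the mode/flags */
        int fanout = 42 | (PACKET_FANOUT_HASH << 16);

        if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)) < 0)
                return -1;

        memset(&mreq, 0, sizeof(mreq));
        mreq.mr_ifindex = if_nametoindex(ifname);
        mreq.mr_type = PACKET_MR_PROMISC;   /* mr_alen/mr_address unused for PROMISC */
        if (setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
                return -1;

        return setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &fanout, sizeof(fanout));
}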
3762
3763static int packet_getsockopt(struct socket *sock, int level, int optname,
3764 char __user *optval, int __user *optlen)
3765{
3766 int len;
c06fff6e 3767 int val, lv = sizeof(val);
1da177e4
LT
3768 struct sock *sk = sock->sk;
3769 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3770 void *data = &val;
ee80fbf3 3771 union tpacket_stats_u st;
a9b63918 3772 struct tpacket_rollover_stats rstats;
1da177e4
LT
3773
3774 if (level != SOL_PACKET)
3775 return -ENOPROTOOPT;
3776
8ae55f04
KK
3777 if (get_user(len, optlen))
3778 return -EFAULT;
1da177e4
LT
3779
3780 if (len < 0)
3781 return -EINVAL;
1ce4f28b 3782
69e3c75f 3783 switch (optname) {
1da177e4 3784 case PACKET_STATISTICS:
1da177e4 3785 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3786 memcpy(&st, &po->stats, sizeof(st));
3787 memset(&po->stats, 0, sizeof(po->stats));
3788 spin_unlock_bh(&sk->sk_receive_queue.lock);
3789
f6fb8f10 3790 if (po->tp_version == TPACKET_V3) {
c06fff6e 3791 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3792 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3793 data = &st.stats3;
f6fb8f10 3794 } else {
c06fff6e 3795 lv = sizeof(struct tpacket_stats);
8bcdeaff 3796 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3797 data = &st.stats1;
f6fb8f10 3798 }
ee80fbf3 3799
8dc41944
HX
3800 break;
3801 case PACKET_AUXDATA:
8dc41944 3802 val = po->auxdata;
80feaacb
PWJ
3803 break;
3804 case PACKET_ORIGDEV:
80feaacb 3805 val = po->origdev;
bfd5f4a3
SS
3806 break;
3807 case PACKET_VNET_HDR:
bfd5f4a3 3808 val = po->has_vnet_hdr;
1da177e4 3809 break;
bbd6ef87 3810 case PACKET_VERSION:
bbd6ef87 3811 val = po->tp_version;
bbd6ef87
PM
3812 break;
3813 case PACKET_HDRLEN:
3814 if (len > sizeof(int))
3815 len = sizeof(int);
3816 if (copy_from_user(&val, optval, len))
3817 return -EFAULT;
3818 switch (val) {
3819 case TPACKET_V1:
3820 val = sizeof(struct tpacket_hdr);
3821 break;
3822 case TPACKET_V2:
3823 val = sizeof(struct tpacket2_hdr);
3824 break;
f6fb8f10 3825 case TPACKET_V3:
3826 val = sizeof(struct tpacket3_hdr);
3827 break;
bbd6ef87
PM
3828 default:
3829 return -EINVAL;
3830 }
bbd6ef87 3831 break;
8913336a 3832 case PACKET_RESERVE:
8913336a 3833 val = po->tp_reserve;
8913336a 3834 break;
69e3c75f 3835 case PACKET_LOSS:
69e3c75f 3836 val = po->tp_loss;
69e3c75f 3837 break;
614f60fa 3838 case PACKET_TIMESTAMP:
614f60fa 3839 val = po->tp_tstamp;
614f60fa 3840 break;
dc99f600 3841 case PACKET_FANOUT:
dc99f600
DM
3842 val = (po->fanout ?
3843 ((u32)po->fanout->id |
77f65ebd
WB
3844 ((u32)po->fanout->type << 16) |
3845 ((u32)po->fanout->flags << 24)) :
dc99f600 3846 0);
dc99f600 3847 break;
a9b63918
WB
3848 case PACKET_ROLLOVER_STATS:
3849 if (!po->rollover)
3850 return -EINVAL;
3851 rstats.tp_all = atomic_long_read(&po->rollover->num);
3852 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3853 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3854 data = &rstats;
3855 lv = sizeof(rstats);
3856 break;
5920cd3a
PC
3857 case PACKET_TX_HAS_OFF:
3858 val = po->tp_tx_has_off;
3859 break;
d346a3fa
DB
3860 case PACKET_QDISC_BYPASS:
3861 val = packet_use_direct_xmit(po);
3862 break;
1da177e4
LT
3863 default:
3864 return -ENOPROTOOPT;
3865 }
3866
c06fff6e
ED
3867 if (len > lv)
3868 len = lv;
8ae55f04
KK
3869 if (put_user(len, optlen))
3870 return -EFAULT;
8dc41944
HX
3871 if (copy_to_user(optval, data, len))
3872 return -EFAULT;
8ae55f04 3873 return 0;
1da177e4
LT
3874}
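
A userspace counterpart to the PACKET_STATISTICS branch above, sketched under the assumption that the socket runs TPACKET_V3 (the kernel then returns struct tpacket_stats_v3 and reading the option resets the counters):

/* Illustrative userspace sketch, not part of af_packet.c. */
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <stdio.h>

static void dump_stats_v3(int fd)
{
        struct tpacket_stats_v3 st;
        socklen_t len = sizeof(st);

        /* Kernel trims len to 8 bytes for TPACKET_V1/V2, 12 bytes for V3;
         * tp_packets already includes tp_drops (added in packet_getsockopt). */
        if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
                printf("packets=%u drops=%u freeze_q=%u (len=%u)\n",
                       st.tp_packets, st.tp_drops,
                       len >= sizeof(st) ? st.tp_freeze_q_cnt : 0,
                       (unsigned int)len);
}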
3875
3876
719c44d3
WB
3877#ifdef CONFIG_COMPAT
3878static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
3879 char __user *optval, unsigned int optlen)
3880{
3881 struct packet_sock *po = pkt_sk(sock->sk);
3882
3883 if (level != SOL_PACKET)
3884 return -ENOPROTOOPT;
3885
3886 if (optname == PACKET_FANOUT_DATA &&
3887 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
3888 optval = (char __user *)get_compat_bpf_fprog(optval);
3889 if (!optval)
3890 return -EFAULT;
3891 optlen = sizeof(struct sock_fprog);
3892 }
3893
3894 return packet_setsockopt(sock, level, optname, optval, optlen);
3895}
3896#endif
3897
351638e7
JP
3898static int packet_notifier(struct notifier_block *this,
3899 unsigned long msg, void *ptr)
1da177e4
LT
3900{
3901 struct sock *sk;
351638e7 3902 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 3903 struct net *net = dev_net(dev);
1da177e4 3904
808f5114 3905 rcu_read_lock();
b67bfe0d 3906 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3907 struct packet_sock *po = pkt_sk(sk);
3908
3909 switch (msg) {
3910 case NETDEV_UNREGISTER:
1da177e4 3911 if (po->mclist)
82f17091 3912 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
3913 /* fallthrough */
3914
1da177e4
LT
3915 case NETDEV_DOWN:
3916 if (dev->ifindex == po->ifindex) {
3917 spin_lock(&po->bind_lock);
3918 if (po->running) {
ce06b03e 3919 __unregister_prot_hook(sk, false);
1da177e4
LT
3920 sk->sk_err = ENETDOWN;
3921 if (!sock_flag(sk, SOCK_DEAD))
3922 sk->sk_error_report(sk);
3923 }
3924 if (msg == NETDEV_UNREGISTER) {
66e56cd4 3925 packet_cached_dev_reset(po);
1da177e4 3926 po->ifindex = -1;
160ff18a
BG
3927 if (po->prot_hook.dev)
3928 dev_put(po->prot_hook.dev);
1da177e4
LT
3929 po->prot_hook.dev = NULL;
3930 }
3931 spin_unlock(&po->bind_lock);
3932 }
3933 break;
3934 case NETDEV_UP:
808f5114 3935 if (dev->ifindex == po->ifindex) {
3936 spin_lock(&po->bind_lock);
ce06b03e
DM
3937 if (po->num)
3938 register_prot_hook(sk);
808f5114 3939 spin_unlock(&po->bind_lock);
1da177e4 3940 }
1da177e4
LT
3941 break;
3942 }
3943 }
808f5114 3944 rcu_read_unlock();
1da177e4
LT
3945 return NOTIFY_DONE;
3946}
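
When the notifier above handles NETDEV_DOWN it stores ENETDOWN in sk_err and wakes the socket; one way userspace can observe that, sketched here on the assumption that the pending error is fetched (and cleared) with SO_ERROR:

/* Illustrative userspace sketch, not part of af_packet.c. */
#include <sys/socket.h>
#include <string.h>
#include <stdio.h>

static void report_pending_error(int fd)
{
        int err = 0;
        socklen_t len = sizeof(err);

        if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) == 0 && err)
                fprintf(stderr, "packet socket error: %s\n", strerror(err));
        /* err is ENETDOWN after the bound interface went down */
}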
3947
3948
3949static int packet_ioctl(struct socket *sock, unsigned int cmd,
3950 unsigned long arg)
3951{
3952 struct sock *sk = sock->sk;
3953
69e3c75f 3954 switch (cmd) {
40d4e3df
ED
3955 case SIOCOUTQ:
3956 {
3957 int amount = sk_wmem_alloc_get(sk);
31e6d363 3958
40d4e3df
ED
3959 return put_user(amount, (int __user *)arg);
3960 }
3961 case SIOCINQ:
3962 {
3963 struct sk_buff *skb;
3964 int amount = 0;
3965
3966 spin_lock_bh(&sk->sk_receive_queue.lock);
3967 skb = skb_peek(&sk->sk_receive_queue);
3968 if (skb)
3969 amount = skb->len;
3970 spin_unlock_bh(&sk->sk_receive_queue.lock);
3971 return put_user(amount, (int __user *)arg);
3972 }
3973 case SIOCGSTAMP:
3974 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3975 case SIOCGSTAMPNS:
3976 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3977
1da177e4 3978#ifdef CONFIG_INET
40d4e3df
ED
3979 case SIOCADDRT:
3980 case SIOCDELRT:
3981 case SIOCDARP:
3982 case SIOCGARP:
3983 case SIOCSARP:
3984 case SIOCGIFADDR:
3985 case SIOCSIFADDR:
3986 case SIOCGIFBRDADDR:
3987 case SIOCSIFBRDADDR:
3988 case SIOCGIFNETMASK:
3989 case SIOCSIFNETMASK:
3990 case SIOCGIFDSTADDR:
3991 case SIOCSIFDSTADDR:
3992 case SIOCSIFFLAGS:
40d4e3df 3993 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3994#endif
3995
40d4e3df
ED
3996 default:
3997 return -ENOIOCTLCMD;
1da177e4
LT
3998 }
3999 return 0;
4000}
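
The two queue-size ioctls take a pointer to an int, as the put_user() calls above show; a minimal sketch:

/* Illustrative userspace sketch, not part of af_packet.c. */
#include <sys/ioctl.h>
#include <linux/sockios.h>
#include <stdio.h>

static void show_queue_sizes(int fd)
{
        int inq = 0, outq = 0;

        ioctl(fd, SIOCINQ, &inq);    /* length of the skb at the head of the receive queue */
        ioctl(fd, SIOCOUTQ, &outq);  /* bytes of transmit data still accounted to the socket */
        printf("inq=%d outq=%d\n", inq, outq);
}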
4001
40d4e3df 4002static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
4003 poll_table *wait)
4004{
4005 struct sock *sk = sock->sk;
4006 struct packet_sock *po = pkt_sk(sk);
4007 unsigned int mask = datagram_poll(file, sock, wait);
4008
4009 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 4010 if (po->rx_ring.pg_vec) {
f6fb8f10 4011 if (!packet_previous_rx_frame(po, &po->rx_ring,
4012 TP_STATUS_KERNEL))
1da177e4
LT
4013 mask |= POLLIN | POLLRDNORM;
4014 }
2ccdbaa6 4015 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
54d7c01d 4016 po->pressure = 0;
1da177e4 4017 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
4018 spin_lock_bh(&sk->sk_write_queue.lock);
4019 if (po->tx_ring.pg_vec) {
4020 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
4021 mask |= POLLOUT | POLLWRNORM;
4022 }
4023 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
4024 return mask;
4025}
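
packet_poll() above raises POLLIN when the rx ring holds a frame for userspace and POLLOUT when a tx-ring frame is TP_STATUS_AVAILABLE; a matching userspace wait might look like this sketch:

/* Illustrative userspace sketch, not part of af_packet.c. */
#include <poll.h>

/* Block until the rx ring has a frame to read or the tx ring has room. */
static int wait_for_ring(int fd, int timeout_ms)
{
        struct pollfd pfd = {
                .fd = fd,
                .events = POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM,
        };

        return poll(&pfd, 1, timeout_ms);   /* >0: pfd.revents tells which ring is ready */
}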
4026
4027
4028/* Dirty? Well, I still did not learn a better way to account
4029 * for user mmaps.
4030 */
4031
4032static void packet_mm_open(struct vm_area_struct *vma)
4033{
4034 struct file *file = vma->vm_file;
40d4e3df 4035 struct socket *sock = file->private_data;
1da177e4 4036 struct sock *sk = sock->sk;
1ce4f28b 4037
1da177e4
LT
4038 if (sk)
4039 atomic_inc(&pkt_sk(sk)->mapped);
4040}
4041
4042static void packet_mm_close(struct vm_area_struct *vma)
4043{
4044 struct file *file = vma->vm_file;
40d4e3df 4045 struct socket *sock = file->private_data;
1da177e4 4046 struct sock *sk = sock->sk;
1ce4f28b 4047
1da177e4
LT
4048 if (sk)
4049 atomic_dec(&pkt_sk(sk)->mapped);
4050}
4051
f0f37e2f 4052static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
4053 .open = packet_mm_open,
4054 .close = packet_mm_close,
1da177e4
LT
4055};
4056
0e3125c7
NH
4057static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4058 unsigned int len)
1da177e4
LT
4059{
4060 int i;
4061
4ebf0ae2 4062 for (i = 0; i < len; i++) {
0e3125c7 4063 if (likely(pg_vec[i].buffer)) {
c56b4d90 4064 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
4065 vfree(pg_vec[i].buffer);
4066 else
4067 free_pages((unsigned long)pg_vec[i].buffer,
4068 order);
4069 pg_vec[i].buffer = NULL;
4070 }
1da177e4
LT
4071 }
4072 kfree(pg_vec);
4073}
4074
eea49cc9 4075static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 4076{
f0d4eb29 4077 char *buffer;
0e3125c7
NH
4078 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4079 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4080
4081 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4082 if (buffer)
4083 return buffer;
4084
f0d4eb29 4085 /* __get_free_pages failed, fall back to vmalloc */
bbce5a59 4086 buffer = vzalloc((1 << order) * PAGE_SIZE);
0e3125c7
NH
4087 if (buffer)
4088 return buffer;
4089
f0d4eb29 4090 /* vmalloc failed, let's dig into swap here */
0e3125c7 4091 gfp_flags &= ~__GFP_NORETRY;
f0d4eb29 4092 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4093 if (buffer)
4094 return buffer;
4095
f0d4eb29 4096 /* complete and utter failure */
0e3125c7 4097 return NULL;
4ebf0ae2
DM
4098}
4099
0e3125c7 4100static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
4101{
4102 unsigned int block_nr = req->tp_block_nr;
0e3125c7 4103 struct pgv *pg_vec;
4ebf0ae2
DM
4104 int i;
4105
0e3125c7 4106 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
4107 if (unlikely(!pg_vec))
4108 goto out;
4109
4110 for (i = 0; i < block_nr; i++) {
c56b4d90 4111 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 4112 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
4113 goto out_free_pgvec;
4114 }
4115
4116out:
4117 return pg_vec;
4118
4119out_free_pgvec:
4120 free_pg_vec(pg_vec, order, block_nr);
4121 pg_vec = NULL;
4122 goto out;
4123}
1da177e4 4124
f6fb8f10 4125static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 4126 int closing, int tx_ring)
1da177e4 4127{
0e3125c7 4128 struct pgv *pg_vec = NULL;
1da177e4 4129 struct packet_sock *po = pkt_sk(sk);
0e11c91e 4130 int was_running, order = 0;
69e3c75f
JB
4131 struct packet_ring_buffer *rb;
4132 struct sk_buff_head *rb_queue;
0e11c91e 4133 __be16 num;
f6fb8f10 4134 int err = -EINVAL;
4135 /* Added to minimize code churn */
4136 struct tpacket_req *req = &req_u->req;
4137
84ac7260 4138 lock_sock(sk);
f6fb8f10 4139 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
4140 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
6ae81ced 4141 net_warn_ratelimited("Tx-ring is not supported.\n");
f6fb8f10 4142 goto out;
4143 }
1ce4f28b 4144
69e3c75f
JB
4145 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4146 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 4147
69e3c75f
JB
4148 err = -EBUSY;
4149 if (!closing) {
4150 if (atomic_read(&po->mapped))
4151 goto out;
b0138408 4152 if (packet_read_pending(rb))
69e3c75f
JB
4153 goto out;
4154 }
1da177e4 4155
69e3c75f
JB
4156 if (req->tp_block_nr) {
4157 /* Sanity tests and some calculations */
4158 err = -EBUSY;
4159 if (unlikely(rb->pg_vec))
4160 goto out;
1da177e4 4161
bbd6ef87
PM
4162 switch (po->tp_version) {
4163 case TPACKET_V1:
4164 po->tp_hdrlen = TPACKET_HDRLEN;
4165 break;
4166 case TPACKET_V2:
4167 po->tp_hdrlen = TPACKET2_HDRLEN;
4168 break;
f6fb8f10 4169 case TPACKET_V3:
4170 po->tp_hdrlen = TPACKET3_HDRLEN;
4171 break;
bbd6ef87
PM
4172 }
4173
69e3c75f 4174 err = -EINVAL;
4ebf0ae2 4175 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 4176 goto out;
90836b67 4177 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
69e3c75f 4178 goto out;
dc808110 4179 if (po->tp_version >= TPACKET_V3 &&
500e91e0
AK
4180 req->tp_block_size <=
4181 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv))
dc808110 4182 goto out;
8913336a 4183 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
4184 po->tp_reserve))
4185 goto out;
4ebf0ae2 4186 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 4187 goto out;
1da177e4 4188
4194b491
TK
4189 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4190 if (unlikely(rb->frames_per_block == 0))
69e3c75f
JB
4191 goto out;
4192 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4193 req->tp_frame_nr))
4194 goto out;
1da177e4
LT
4195
4196 err = -ENOMEM;
4ebf0ae2
DM
4197 order = get_order(req->tp_block_size);
4198 pg_vec = alloc_pg_vec(req, order);
4199 if (unlikely(!pg_vec))
1da177e4 4200 goto out;
f6fb8f10 4201 switch (po->tp_version) {
4202 case TPACKET_V3:
4203 /* Transmit path is not supported. We checked
4204 * it above but just being paranoid
4205 */
4206 if (!tx_ring)
e8e85cc5 4207 init_prb_bdqc(po, rb, pg_vec, req_u);
d7cf0c34 4208 break;
f6fb8f10 4209 default:
4210 break;
4211 }
69e3c75f
JB
4212 }
4213 /* Done */
4214 else {
4215 err = -EINVAL;
4ebf0ae2 4216 if (unlikely(req->tp_frame_nr))
69e3c75f 4217 goto out;
1da177e4
LT
4218 }
4219
1da177e4
LT
4220
4221 /* Detach socket from network */
4222 spin_lock(&po->bind_lock);
4223 was_running = po->running;
4224 num = po->num;
4225 if (was_running) {
1da177e4 4226 po->num = 0;
ce06b03e 4227 __unregister_prot_hook(sk, false);
1da177e4
LT
4228 }
4229 spin_unlock(&po->bind_lock);
1ce4f28b 4230
1da177e4
LT
4231 synchronize_net();
4232
4233 err = -EBUSY;
905db440 4234 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
4235 if (closing || atomic_read(&po->mapped) == 0) {
4236 err = 0;
69e3c75f 4237 spin_lock_bh(&rb_queue->lock);
c053fd96 4238 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
4239 rb->frame_max = (req->tp_frame_nr - 1);
4240 rb->head = 0;
4241 rb->frame_size = req->tp_frame_size;
4242 spin_unlock_bh(&rb_queue->lock);
4243
c053fd96
CG
4244 swap(rb->pg_vec_order, order);
4245 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
4246
4247 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4248 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4249 tpacket_rcv : packet_rcv;
4250 skb_queue_purge(rb_queue);
1da177e4 4251 if (atomic_read(&po->mapped))
40d4e3df
ED
4252 pr_err("packet_mmap: vma is busy: %d\n",
4253 atomic_read(&po->mapped));
1da177e4 4254 }
905db440 4255 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4256
4257 spin_lock(&po->bind_lock);
ce06b03e 4258 if (was_running) {
1da177e4 4259 po->num = num;
ce06b03e 4260 register_prot_hook(sk);
1da177e4
LT
4261 }
4262 spin_unlock(&po->bind_lock);
f6fb8f10 4263 if (closing && (po->tp_version > TPACKET_V2)) {
4264 /* Because we don't support block-based V3 on tx-ring */
4265 if (!tx_ring)
73d0fcf2 4266 prb_shutdown_retire_blk_timer(po, rb_queue);
f6fb8f10 4267 }
1da177e4 4268
1da177e4
LT
4269 if (pg_vec)
4270 free_pg_vec(pg_vec, order, req->tp_block_nr);
4271out:
84ac7260 4272 release_sock(sk);
1da177e4
LT
4273 return err;
4274}
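
A ring request that satisfies the sanity checks in packet_set_ring() above (page-aligned, non-zero block size strictly greater than BLK_PLUS_PRIV(tp_sizeof_priv), frame size a multiple of TPACKET_ALIGNMENT and at least tp_hdrlen + tp_reserve, and tp_frame_nr equal to frames-per-block times tp_block_nr) could be built as in this sketch; the sizes are arbitrary example values:

/* Illustrative userspace sketch, not part of af_packet.c. */
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <string.h>

static int setup_v3_rx_ring(int fd)
{
        struct tpacket_req3 req;

        memset(&req, 0, sizeof(req));
        req.tp_block_size = 1 << 22;      /* 4 MiB, page aligned */
        req.tp_block_nr = 64;
        req.tp_frame_size = 2048;         /* multiple of TPACKET_ALIGNMENT (16) */
        req.tp_frame_nr = (req.tp_block_size / req.tp_frame_size) * req.tp_block_nr;
        req.tp_retire_blk_tov = 60;       /* block retire timeout, milliseconds */
        req.tp_sizeof_priv = 0;           /* block_size must stay > BLK_PLUS_PRIV(priv) */

        return setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
}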
4275
69e3c75f
JB
4276static int packet_mmap(struct file *file, struct socket *sock,
4277 struct vm_area_struct *vma)
1da177e4
LT
4278{
4279 struct sock *sk = sock->sk;
4280 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
4281 unsigned long size, expected_size;
4282 struct packet_ring_buffer *rb;
1da177e4
LT
4283 unsigned long start;
4284 int err = -EINVAL;
4285 int i;
4286
4287 if (vma->vm_pgoff)
4288 return -EINVAL;
4289
905db440 4290 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
4291
4292 expected_size = 0;
4293 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4294 if (rb->pg_vec) {
4295 expected_size += rb->pg_vec_len
4296 * rb->pg_vec_pages
4297 * PAGE_SIZE;
4298 }
4299 }
4300
4301 if (expected_size == 0)
1da177e4 4302 goto out;
69e3c75f
JB
4303
4304 size = vma->vm_end - vma->vm_start;
4305 if (size != expected_size)
1da177e4
LT
4306 goto out;
4307
1da177e4 4308 start = vma->vm_start;
69e3c75f
JB
4309 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4310 if (rb->pg_vec == NULL)
4311 continue;
4312
4313 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
4314 struct page *page;
4315 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4316 int pg_num;
4317
c56b4d90
CG
4318 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4319 page = pgv_to_page(kaddr);
69e3c75f
JB
4320 err = vm_insert_page(vma, start, page);
4321 if (unlikely(err))
4322 goto out;
4323 start += PAGE_SIZE;
0e3125c7 4324 kaddr += PAGE_SIZE;
69e3c75f 4325 }
4ebf0ae2 4326 }
1da177e4 4327 }
69e3c75f 4328
4ebf0ae2 4329 atomic_inc(&po->mapped);
1da177e4
LT
4330 vma->vm_ops = &packet_mmap_ops;
4331 err = 0;
4332
4333out:
905db440 4334 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4335 return err;
4336}
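
packet_mmap() above insists on vm_pgoff == 0 and a mapping length equal to the combined rx and tx ring sizes; the matching userspace call is a single mmap() covering both rings, sketched here:

/* Illustrative userspace sketch, not part of af_packet.c. */
#include <sys/mman.h>
#include <stddef.h>

/* rx_bytes/tx_bytes must equal tp_block_size * tp_block_nr of each configured ring. */
static void *map_rings(int fd, size_t rx_bytes, size_t tx_bytes)
{
        void *ring = mmap(NULL, rx_bytes + tx_bytes, PROT_READ | PROT_WRITE,
                          MAP_SHARED, fd, 0);      /* offset must be 0 */

        return ring == MAP_FAILED ? NULL : ring;   /* rx ring first, tx ring follows */
}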
1da177e4 4337
90ddc4f0 4338static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4339 .family = PF_PACKET,
4340 .owner = THIS_MODULE,
4341 .release = packet_release,
4342 .bind = packet_bind_spkt,
4343 .connect = sock_no_connect,
4344 .socketpair = sock_no_socketpair,
4345 .accept = sock_no_accept,
4346 .getname = packet_getname_spkt,
4347 .poll = datagram_poll,
4348 .ioctl = packet_ioctl,
4349 .listen = sock_no_listen,
4350 .shutdown = sock_no_shutdown,
4351 .setsockopt = sock_no_setsockopt,
4352 .getsockopt = sock_no_getsockopt,
4353 .sendmsg = packet_sendmsg_spkt,
4354 .recvmsg = packet_recvmsg,
4355 .mmap = sock_no_mmap,
4356 .sendpage = sock_no_sendpage,
4357};
1da177e4 4358
90ddc4f0 4359static const struct proto_ops packet_ops = {
1da177e4
LT
4360 .family = PF_PACKET,
4361 .owner = THIS_MODULE,
4362 .release = packet_release,
4363 .bind = packet_bind,
4364 .connect = sock_no_connect,
4365 .socketpair = sock_no_socketpair,
4366 .accept = sock_no_accept,
1ce4f28b 4367 .getname = packet_getname,
1da177e4
LT
4368 .poll = packet_poll,
4369 .ioctl = packet_ioctl,
4370 .listen = sock_no_listen,
4371 .shutdown = sock_no_shutdown,
4372 .setsockopt = packet_setsockopt,
4373 .getsockopt = packet_getsockopt,
719c44d3
WB
4374#ifdef CONFIG_COMPAT
4375 .compat_setsockopt = compat_packet_setsockopt,
4376#endif
1da177e4
LT
4377 .sendmsg = packet_sendmsg,
4378 .recvmsg = packet_recvmsg,
4379 .mmap = packet_mmap,
4380 .sendpage = sock_no_sendpage,
4381};
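
The packet_ops table above backs SOCK_RAW and SOCK_DGRAM packet sockets (packet_ops_spkt serves the obsolete SOCK_PACKET type); creating one from userspace is the usual sketch:

/* Illustrative userspace sketch, not part of af_packet.c. */
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static int open_packet_socket(void)
{
        /* ETH_P_ALL: receive every protocol; requires CAP_NET_RAW */
        return socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
}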
4382
ec1b4cf7 4383static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4384 .family = PF_PACKET,
4385 .create = packet_create,
4386 .owner = THIS_MODULE,
4387};
4388
4389static struct notifier_block packet_netdev_notifier = {
40d4e3df 4390 .notifier_call = packet_notifier,
1da177e4
LT
4391};
4392
4393#ifdef CONFIG_PROC_FS
1da177e4
LT
4394
4395static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4396 __acquires(RCU)
1da177e4 4397{
e372c414 4398 struct net *net = seq_file_net(seq);
808f5114 4399
4400 rcu_read_lock();
4401 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4402}
4403
4404static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4405{
1bf40954 4406 struct net *net = seq_file_net(seq);
808f5114 4407 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4408}
4409
4410static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4411 __releases(RCU)
1da177e4 4412{
808f5114 4413 rcu_read_unlock();
1da177e4
LT
4414}
4415
1ce4f28b 4416static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4417{
4418 if (v == SEQ_START_TOKEN)
4419 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4420 else {
b7ceabd9 4421 struct sock *s = sk_entry(v);
1da177e4
LT
4422 const struct packet_sock *po = pkt_sk(s);
4423
4424 seq_printf(seq,
71338aa7 4425 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4
LT
4426 s,
4427 atomic_read(&s->sk_refcnt),
4428 s->sk_type,
4429 ntohs(po->num),
4430 po->ifindex,
4431 po->running,
4432 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4433 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4434 sock_i_ino(s));
1da177e4
LT
4435 }
4436
4437 return 0;
4438}
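
The table produced by packet_seq_show() is exposed as /proc/net/packet (registered in packet_net_init() below); a small sketch that simply echoes it:

/* Illustrative userspace sketch, not part of af_packet.c. */
#include <stdio.h>

static void dump_packet_sockets(void)
{
        char line[256];
        FILE *f = fopen("/proc/net/packet", "r");

        if (!f)
                return;
        /* First line is the header: sk RefCnt Type Proto Iface R Rmem User Inode */
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
}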
4439
56b3d975 4440static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4441 .start = packet_seq_start,
4442 .next = packet_seq_next,
4443 .stop = packet_seq_stop,
4444 .show = packet_seq_show,
4445};
4446
4447static int packet_seq_open(struct inode *inode, struct file *file)
4448{
e372c414
DL
4449 return seq_open_net(inode, file, &packet_seq_ops,
4450 sizeof(struct seq_net_private));
1da177e4
LT
4451}
4452
da7071d7 4453static const struct file_operations packet_seq_fops = {
1da177e4
LT
4454 .owner = THIS_MODULE,
4455 .open = packet_seq_open,
4456 .read = seq_read,
4457 .llseek = seq_lseek,
e372c414 4458 .release = seq_release_net,
1da177e4
LT
4459};
4460
4461#endif
4462
2c8c1e72 4463static int __net_init packet_net_init(struct net *net)
d12d01d6 4464{
0fa7fa98 4465 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4466 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4467
d4beaa66 4468 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
d12d01d6
DL
4469 return -ENOMEM;
4470
4471 return 0;
4472}
4473
2c8c1e72 4474static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4475{
ece31ffd 4476 remove_proc_entry("packet", net->proc_net);
d12d01d6
DL
4477}
4478
4479static struct pernet_operations packet_net_ops = {
4480 .init = packet_net_init,
4481 .exit = packet_net_exit,
4482};
4483
4484
1da177e4
LT
4485static void __exit packet_exit(void)
4486{
1da177e4 4487 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4488 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4489 sock_unregister(PF_PACKET);
4490 proto_unregister(&packet_proto);
4491}
4492
4493static int __init packet_init(void)
4494{
4495 int rc = proto_register(&packet_proto, 0);
4496
4497 if (rc != 0)
4498 goto out;
4499
4500 sock_register(&packet_family_ops);
d12d01d6 4501 register_pernet_subsys(&packet_net_ops);
1da177e4 4502 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
4503out:
4504 return rc;
4505}
4506
4507module_init(packet_init);
4508module_exit(packet_exit);
4509MODULE_LICENSE("GPL");
4510MODULE_ALIAS_NETPROTO(PF_PACKET);