/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *	Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *	Alexey Kuznetsov	:	Untied from IPv4 stack.
 *	Cyrus Durgin		:	Fixed kerneld for kmod.
 *	Michal Ostrowski	:	Module initialization cleanup.
 *	Ulises Alonso		:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *	Eric Biederman		:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *	Johann Baudy		:	Added TX RING.
 *	Chetan Loke		:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
#include <net/compat.h>

#include "internal.h"

/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit in the reserved space (tunnels); others are silly
     (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
		 header. PPP does this, which is wrong, because it introduces
		 asymmetry between rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Resume
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */

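/* Illustrative user-space sketch (not part of this file) of the ll-header
 * convention described above: with SOCK_RAW the link-layer header is part
 * of the data handed to the caller, while with SOCK_DGRAM it has already
 * been pulled.  The buffer size is an arbitrary assumption and error
 * handling is omitted; CAP_NET_RAW is required.
 *
 *	#include <sys/socket.h>
 *	#include <arpa/inet.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	unsigned char buf[2048];
 *	struct sockaddr_ll from;
 *	socklen_t fromlen = sizeof(from);
 *	ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
 *			     (struct sockaddr *)&from, &fromlen);
 *	// SOCK_RAW: buf starts at the Ethernet header (dst, src, ethertype).
 *	// SOCK_DGRAM: the same call would return data starting at the payload.
 */
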
/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define PGV_FROM_VMALLOC 1

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);

struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};

#define vio_le() virtio_legacy_is_little_endian()

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

static int packet_direct_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct sk_buff *orig_skb = skb;
	struct netdev_queue *txq;
	int ret = NETDEV_TX_BUSY;

	if (unlikely(!netif_running(dev) ||
		     !netif_carrier_ok(dev)))
		goto drop;

	skb = validate_xmit_skb_list(skb, dev);
	if (skb != orig_skb)
		goto drop;

	txq = skb_get_tx_queue(dev, skb);

	local_bh_disable();

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_xmit_frozen_or_drv_stopped(txq))
		ret = netdev_start_xmit(skb, dev, txq, false);
	HARD_TX_UNLOCK(dev, txq);

	local_bh_enable();

	if (!dev_xmit_complete(ret))
		kfree_skb(skb);

	return ret;
drop:
	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return NET_XMIT_DROP;
}

static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}

static bool packet_use_direct_xmit(const struct packet_sock *po)
{
	return po->xmit == packet_direct_xmit;
}

static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
}

static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	u16 queue_index;

	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL,
						    __packet_pick_tx_queue);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = __packet_pick_tx_queue(dev, skb);
	}

	skb_set_queue_mapping(skb, queue_index);
}

/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held.  If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}

static inline struct page * __pure pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}

static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}

static void *packet_lookup_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		unsigned int position,
		int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_init_blk_timer(struct packet_sock *po,
		struct tpacket_kbdq_core *pkc,
		void (*func) (unsigned long))
{
	init_timer(&pkc->retire_blk_timer);
	pkc->retire_blk_timer.data = (long)po;
	pkc->retire_blk_timer.function = func;
	pkc->retire_blk_timer.expires = jiffies;
}

static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}

static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_link_ksettings ecmd;
	int err;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_link_ksettings(dev, &ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (ecmd.base.speed < SPEED_1000 ||
		    ecmd.base.speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = ecmd.base.speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}

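/* Worked example of the derivation above (illustrative values only):
 * with tp_block_size = 1 MiB on a 1 Gbit/s link,
 * mbits = (1048576 * 8) / (1024 * 1024) = 8, div = 1000 / 1000 = 1 and
 * msec = 1, so tmo = 8 and the function returns 9 ms -- roughly the time
 * needed to fill one block, matching the "~8 ms" figure quoted in the
 * timer-logic comment below.  On a 10 Gbit/s link div = 10, the integer
 * division 8 / 10 truncates to 0 and the function returns 1 ms.
 */
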
static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po);
	prb_open_block(p1, pbd);
}

/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
static void prb_retire_rx_blk_timer_expired(unsigned long data)
{
	struct packet_sock *po = (struct packet_sock *)data;
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 *
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			if (!BLOCK_NUM_PKTS(pbd)) {
				/* An empty block. Just refresh the timer. */
				goto refresh_timer;
			}
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 *	   lagging behind.
			 */
			if (prb_curr_blk_in_use(pkc, pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. Queue was frozen, user-space caught up,
				 * now the link went idle && the timer fired.
				 * We don't have a block to close, so we open this
				 * block and restart the timer.
				 * Opening a block thaws the queue and restarts the
				 * timer; thawing/timer-refresh is a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}

static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header (we know the header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}

/*
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note: We DONT refresh the timer on purpose.
 *	Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
	struct sock *sk = &po->sk;

	if (po->stats.stats3.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time.
		 *
		 * It shouldn't really happen as we don't close empty
		 * blocks. See prb_retire_rx_blk_timer_expired().
		 */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	getnstimeofday(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}

/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7'; since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
				  struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, the caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}

static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}

static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
				      struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (skb_vlan_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->hv1.tp_vlan_tpid = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_padding = 0;
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd  = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}

/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status,
					    unsigned int len
					    )
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if that last block which caused the queue to freeze
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pkc, pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * Opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available. user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}

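/* Illustrative user-space sketch (not part of this file) of the TPACKET_V3
 * block cycle that the lookup/retire/dispatch helpers above implement: the
 * consumer maps the ring, waits for a block to be handed over with
 * TP_STATUS_USER, walks its packets and then releases it by writing
 * TP_STATUS_KERNEL, which is what lets a frozen queue thaw.  Block/frame
 * sizes are arbitrary assumptions and error handling is omitted.
 *
 *	int ver = TPACKET_V3;
 *	struct tpacket_req3 req = {
 *		.tp_block_size	= 1 << 20,
 *		.tp_block_nr	= 8,
 *		.tp_frame_size	= 1 << 11,
 *		.tp_frame_nr	= 8 * ((1 << 20) / (1 << 11)),
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	uint8_t *map = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *			    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 *	struct tpacket_block_desc *pbd =
 *		(struct tpacket_block_desc *)(map + i * req.tp_block_size);
 *	if (pbd->hdr.bh1.block_status & TP_STATUS_USER) {
 *		struct tpacket3_hdr *ppd = (struct tpacket3_hdr *)
 *			((uint8_t *)pbd + pbd->hdr.bh1.offset_to_first_pkt);
 *		for (unsigned int j = 0; j < pbd->hdr.bh1.num_pkts; j++) {
 *			// frame data: ppd->tp_snaplen bytes at (uint8_t *)ppd + ppd->tp_mac
 *			ppd = (struct tpacket3_hdr *)
 *				((uint8_t *)ppd + ppd->tp_next_offset);
 *		}
 *		pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;
 *	}
 */
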
static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, status, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return NULL;
	}
}

static void *prb_lookup_block(struct packet_sock *po,
				     struct packet_ring_buffer *rb,
				     unsigned int idx,
				     int status)
{
	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;
	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}

/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}

static void packet_increment_rx_head(struct packet_sock *po,
					    struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

static void packet_inc_pending(struct packet_ring_buffer *rb)
{
	this_cpu_inc(*rb->pending_refcnt);
}

static void packet_dec_pending(struct packet_ring_buffer *rb)
{
	this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
	unsigned int refcnt = 0;
	int cpu;

	/* We don't use pending refcount in rx_ring. */
	if (rb->pending_refcnt == NULL)
		return 0;

	for_each_possible_cpu(cpu)
		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

	return refcnt;
}

static int packet_alloc_pending(struct packet_sock *po)
{
	po->rx_ring.pending_refcnt = NULL;

	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
	if (unlikely(po->tx_ring.pending_refcnt == NULL))
		return -ENOBUFS;

	return 0;
}

static void packet_free_pending(struct packet_sock *po)
{
	free_percpu(po->tx_ring.pending_refcnt);
}

#define ROOM_POW_OFF	2
#define ROOM_NONE	0x0
#define ROOM_LOW	0x1
#define ROOM_NORMAL	0x2

static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.frame_max + 1;
	idx = po->rx_ring.head;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.prb_bdqc.knum_blocks;
	idx = po->rx_ring.prb_bdqc.kactive_blk_num;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	struct sock *sk = &po->sk;
	int ret = ROOM_NONE;

	if (po->prot_hook.func != tpacket_rcv) {
		int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
					- (skb ? skb->truesize : 0);
		if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
			return ROOM_NORMAL;
		else if (avail > 0)
			return ROOM_LOW;
		else
			return ROOM_NONE;
	}

	if (po->tp_version == TPACKET_V3) {
		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_v3_has_room(po, 0))
			ret = ROOM_LOW;
	} else {
		if (__tpacket_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_has_room(po, 0))
			ret = ROOM_LOW;
	}

	return ret;
}

static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	int ret;
	bool has_room;

	spin_lock_bh(&po->sk.sk_receive_queue.lock);
	ret = __packet_rcv_has_room(po, skb);
	has_room = ret == ROOM_NORMAL;
	if (po->pressure == has_room)
		po->pressure = !has_room;
	spin_unlock_bh(&po->sk.sk_receive_queue.lock);

	return ret;
}

static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}

static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
{
	u32 rxhash;
	int i, count = 0;

	rxhash = skb_get_hash(skb);
	for (i = 0; i < ROLLOVER_HLEN; i++)
		if (po->rollover->history[i] == rxhash)
			count++;

	po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
	return count > (ROLLOVER_HLEN >> 1);
}

static unsigned int fanout_demux_hash(struct packet_fanout *f,
				      struct sk_buff *skb,
				      unsigned int num)
{
	return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
}

static unsigned int fanout_demux_lb(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	unsigned int val = atomic_inc_return(&f->rr_cur);

	return val % num;
}

static unsigned int fanout_demux_cpu(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return smp_processor_id() % num;
}

static unsigned int fanout_demux_rnd(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return prandom_u32_max(num);
}

static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, bool try_self,
					  unsigned int num)
{
	struct packet_sock *po, *po_next, *po_skip = NULL;
	unsigned int i, j, room = ROOM_NONE;

	po = pkt_sk(f->arr[idx]);

	if (try_self) {
		room = packet_rcv_has_room(po, skb);
		if (room == ROOM_NORMAL ||
		    (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
			return idx;
		po_skip = po;
	}

	i = j = min_t(int, po->rollover->sock, num - 1);
	do {
		po_next = pkt_sk(f->arr[i]);
		if (po_next != po_skip && !po_next->pressure &&
		    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
			if (i != j)
				po->rollover->sock = i;
			atomic_long_inc(&po->rollover->num);
			if (room == ROOM_LOW)
				atomic_long_inc(&po->rollover->num_huge);
			return i;
		}

		if (++i == num)
			i = 0;
	} while (i != j);

	atomic_long_inc(&po->rollover->num_failed);
	return idx;
}

static unsigned int fanout_demux_qm(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	return skb_get_queue_mapping(skb) % num;
}

static unsigned int fanout_demux_bpf(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	struct bpf_prog *prog;
	unsigned int ret = 0;

	rcu_read_lock();
	prog = rcu_dereference(f->bpf_prog);
	if (prog)
		ret = bpf_prog_run_clear_cb(prog, skb) % num;
	rcu_read_unlock();

	return ret;
}

static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
	return f->flags & (flag >> 8);
}

static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = READ_ONCE(f->num_members);
	struct net *net = read_pnet(&f->net);
	struct packet_sock *po;
	unsigned int idx;

	if (!net_eq(dev_net(dev), net) || !num) {
		kfree_skb(skb);
		return 0;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
		skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
		if (!skb)
			return 0;
	}
	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		idx = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		idx = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		idx = fanout_demux_cpu(f, skb, num);
		break;
	case PACKET_FANOUT_RND:
		idx = fanout_demux_rnd(f, skb, num);
		break;
	case PACKET_FANOUT_QM:
		idx = fanout_demux_qm(f, skb, num);
		break;
	case PACKET_FANOUT_ROLLOVER:
		idx = fanout_demux_rollover(f, skb, 0, false, num);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		idx = fanout_demux_bpf(f, skb, num);
		break;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
		idx = fanout_demux_rollover(f, skb, idx, true, num);

	po = pkt_sk(f->arr[idx]);
	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);

static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	if (f->num_members == 1)
		dev_add_pack(&f->prot_hook);
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	if (f->num_members == 0)
		__dev_remove_pack(&f->prot_hook);
	spin_unlock(&f->lock);
}

static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
	if (sk->sk_family != PF_PACKET)
		return false;

	return ptype->af_packet_priv == pkt_sk(sk)->fanout;
}

static void fanout_init_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_LB:
		atomic_set(&f->rr_cur, 0);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		RCU_INIT_POINTER(f->bpf_prog, NULL);
		break;
	}
}

static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
{
	struct bpf_prog *old;

	spin_lock(&f->lock);
	old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
	rcu_assign_pointer(f->bpf_prog, new);
	spin_unlock(&f->lock);

	if (old) {
		synchronize_net();
		bpf_prog_destroy(old);
	}
}

static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
				unsigned int len)
{
	struct bpf_prog *new;
	struct sock_fprog fprog;
	int ret;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;
	if (len != sizeof(fprog))
		return -EINVAL;
	if (copy_from_user(&fprog, data, len))
		return -EFAULT;

	ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
	if (ret)
		return ret;

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
				unsigned int len)
{
	struct bpf_prog *new;
	u32 fd;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;
	if (len != sizeof(fd))
		return -EINVAL;
	if (copy_from_user(&fd, data, len))
		return -EFAULT;

	new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
	if (IS_ERR(new))
		return PTR_ERR(new);

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data(struct packet_sock *po, char __user *data,
			   unsigned int len)
{
	switch (po->fanout->type) {
	case PACKET_FANOUT_CBPF:
		return fanout_set_data_cbpf(po, data, len);
	case PACKET_FANOUT_EBPF:
		return fanout_set_data_ebpf(po, data, len);
	default:
		return -EINVAL;
	};
}

static void fanout_release_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		__fanout_set_data_bpf(f, NULL);
	};
}

static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
	struct packet_rollover *rollover = NULL;
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 flags = type_flags >> 8;
	int err;

	switch (type) {
	case PACKET_FANOUT_ROLLOVER:
		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
			return -EINVAL;
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
	case PACKET_FANOUT_RND:
	case PACKET_FANOUT_QM:
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		break;
	default:
		return -EINVAL;
	}

	mutex_lock(&fanout_mutex);

	err = -EINVAL;
	if (!po->running)
		goto out;

	err = -EALREADY;
	if (po->fanout)
		goto out;

	if (type == PACKET_FANOUT_ROLLOVER ||
	    (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
		err = -ENOMEM;
		rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
		if (!rollover)
			goto out;
		atomic_long_set(&rollover->num, 0);
		atomic_long_set(&rollover->num_huge, 0);
		atomic_long_set(&rollover->num_failed, 0);
		po->rollover = rollover;
	}

	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match && match->flags != flags)
		goto out;
	if (!match) {
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		match->flags = flags;
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		atomic_set(&match->sk_ref, 0);
		fanout_init_data(match);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		match->prot_hook.id_match = match_fanout_group;
		list_add(&match->list, &fanout_list);
	}
	err = -EINVAL;
	if (match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
			__dev_remove_pack(&po->prot_hook);
			po->fanout = match;
			atomic_inc(&match->sk_ref);
			__fanout_link(sk, po);
			err = 0;
		}
	}
out:
	if (err && rollover) {
		kfree(rollover);
		po->rollover = NULL;
	}
	mutex_unlock(&fanout_mutex);
	return err;
}

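/* Illustrative user-space sketch (not part of this file) of joining a
 * fanout group through the path above.  The socket must already be bound
 * and running; the group id (42) and the hash mode are arbitrary
 * assumptions.  Sockets issuing the same call share one group, and
 * packet_rcv_fanout() then spreads incoming frames across them.
 *
 *	int fanout_arg = 42 | (PACKET_FANOUT_HASH << 16);
 *	if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT,
 *		       &fanout_arg, sizeof(fanout_arg)) < 0)
 *		perror("PACKET_FANOUT");
 */
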
2bd624b4
AS
1724/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1725 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1726 * It is the responsibility of the caller to call fanout_release_data() and
1727 * free the returned packet_fanout (after synchronize_net())
1728 */
1729static struct packet_fanout *fanout_release(struct sock *sk)
dc99f600
DM
1730{
1731 struct packet_sock *po = pkt_sk(sk);
1732 struct packet_fanout *f;
1733
fff3321d 1734 mutex_lock(&fanout_mutex);
d199fab6
ED
1735 f = po->fanout;
1736 if (f) {
1737 po->fanout = NULL;
1738
2bd624b4 1739 if (atomic_dec_and_test(&f->sk_ref))
d199fab6 1740 list_del(&f->list);
2bd624b4
AS
1741 else
1742 f = NULL;
dc99f600 1743
d199fab6
ED
1744 if (po->rollover)
1745 kfree_rcu(po->rollover, rcu);
dc99f600
DM
1746 }
1747 mutex_unlock(&fanout_mutex);
2bd624b4
AS
1748
1749 return f;
dc99f600 1750}
1da177e4 1751
3c70c132
DB
1752static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1753 struct sk_buff *skb)
1754{
1755 /* Earlier code assumed this would be a VLAN pkt, double-check
1756 * this now that we have the actual packet in hand. We can only
1757 * do this check on Ethernet devices.
1758 */
1759 if (unlikely(dev->type != ARPHRD_ETHER))
1760 return false;
1761
1762 skb_reset_mac_header(skb);
1763 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1764}
1765
90ddc4f0 1766static const struct proto_ops packet_ops;
1da177e4 1767
90ddc4f0 1768static const struct proto_ops packet_ops_spkt;
1da177e4 1769
40d4e3df
ED
1770static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1771 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1772{
1773 struct sock *sk;
1774 struct sockaddr_pkt *spkt;
1775
1776 /*
1777 * When we registered the protocol we saved the socket in the data
1778 * field for just this event.
1779 */
1780
1781 sk = pt->af_packet_priv;
1ce4f28b 1782
1da177e4
LT
1783 /*
1784 * Yank back the headers [hope the device set this
1785 * right or kerboom...]
1786 *
1787 * Incoming packets have ll header pulled,
1788 * push it back.
1789 *
98e399f8 1790 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1791 * so that this procedure is noop.
1792 */
1793
1794 if (skb->pkt_type == PACKET_LOOPBACK)
1795 goto out;
1796
09ad9bc7 1797 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1798 goto out;
1799
40d4e3df
ED
1800 skb = skb_share_check(skb, GFP_ATOMIC);
1801 if (skb == NULL)
1da177e4
LT
1802 goto oom;
1803
1804 /* drop any routing info */
adf30907 1805 skb_dst_drop(skb);
1da177e4 1806
84531c24
PO
1807 /* drop conntrack reference */
1808 nf_reset(skb);
1809
ffbc6111 1810 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1811
98e399f8 1812 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1813
1814 /*
1815 * The SOCK_PACKET socket receives _all_ frames.
1816 */
1817
1818 spkt->spkt_family = dev->type;
1819 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1820 spkt->spkt_protocol = skb->protocol;
1821
1822 /*
1823 * Charge the memory to the socket. This is done specifically
1824 * to prevent sockets from using up all the memory.
1825 */
1826
40d4e3df 1827 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1828 return 0;
1829
1830out:
1831 kfree_skb(skb);
1832oom:
1833 return 0;
1834}
1835
1836
1837/*
1838 * Output a raw packet to a device layer. This bypasses all the other
1839 * protocol layers and you must therefore supply it with a complete frame
1840 */
1ce4f28b 1841
1b784140
YX
1842static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1843 size_t len)
1da177e4
LT
1844{
1845 struct sock *sk = sock->sk;
342dfc30 1846 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1847 struct sk_buff *skb = NULL;
1da177e4 1848 struct net_device *dev;
c14ac945 1849 struct sockcm_cookie sockc;
40d4e3df 1850 __be16 proto = 0;
1da177e4 1851 int err;
3bdc0eba 1852 int extra_len = 0;
1ce4f28b 1853
1da177e4 1854 /*
1ce4f28b 1855 * Get and verify the address.
1da177e4
LT
1856 */
1857
40d4e3df 1858 if (saddr) {
1da177e4 1859 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1860 return -EINVAL;
1861 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1862 proto = saddr->spkt_protocol;
1863 } else
1864 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1865
1866 /*
1ce4f28b 1867 * Find the device first to size check it
1da177e4
LT
1868 */
1869
de74e92a 1870 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1871retry:
654d1f8a
ED
1872 rcu_read_lock();
1873 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1874 err = -ENODEV;
1875 if (dev == NULL)
1876 goto out_unlock;
1ce4f28b 1877
d5e76b0a
DM
1878 err = -ENETDOWN;
1879 if (!(dev->flags & IFF_UP))
1880 goto out_unlock;
1881
1da177e4 1882 /*
40d4e3df
ED
1883 * You may not queue a frame bigger than the mtu. This is the lowest level
1884 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1885 */
1ce4f28b 1886
3bdc0eba
BG
1887 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1888 if (!netif_supports_nofcs(dev)) {
1889 err = -EPROTONOSUPPORT;
1890 goto out_unlock;
1891 }
1892 extra_len = 4; /* We're doing our own CRC */
1893 }
1894
1da177e4 1895 err = -EMSGSIZE;
3bdc0eba 1896 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1897 goto out_unlock;
1898
1a35ca80
ED
1899 if (!skb) {
1900 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1901 int tlen = dev->needed_tailroom;
1a35ca80
ED
1902 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1903
1904 rcu_read_unlock();
4ce40912 1905 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1906 if (skb == NULL)
1907 return -ENOBUFS;
1908 /* FIXME: Save some space for broken drivers that write a hard
1909 * header at transmission time by themselves. PPP is the notable
1910 * one here. This should really be fixed at the driver level.
1911 */
1912 skb_reserve(skb, reserved);
1913 skb_reset_network_header(skb);
1914
1915 /* Try to align data part correctly */
1916 if (hhlen) {
1917 skb->data -= hhlen;
1918 skb->tail -= hhlen;
1919 if (len < hhlen)
1920 skb_reset_network_header(skb);
1921 }
6ce8e9ce 1922 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1923 if (err)
1924 goto out_free;
1925 goto retry;
1da177e4
LT
1926 }
1927
9ed988cd
WB
1928 if (!dev_validate_header(dev, skb->data, len)) {
1929 err = -EINVAL;
1930 goto out_unlock;
1931 }
3c70c132
DB
1932 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1933 !packet_extra_vlan_len_allowed(dev, skb)) {
1934 err = -EMSGSIZE;
1935 goto out_unlock;
57f89bfa 1936 }
1a35ca80 1937
edbe7746 1938 sockc.tsflags = sk->sk_tsflags;
c14ac945
SHY
1939 if (msg->msg_controllen) {
1940 err = sock_cmsg_send(sk, msg, &sockc);
f8e7718c 1941 if (unlikely(err))
c14ac945 1942 goto out_unlock;
c14ac945
SHY
1943 }
1944
1da177e4
LT
1945 skb->protocol = proto;
1946 skb->dev = dev;
1947 skb->priority = sk->sk_priority;
2d37a186 1948 skb->mark = sk->sk_mark;
bf84a010 1949
c14ac945 1950 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
1da177e4 1951
3bdc0eba
BG
1952 if (unlikely(extra_len == 4))
1953 skb->no_fcs = 1;
1954
40893fd0 1955 skb_probe_transport_header(skb, 0);
c1aad275 1956
1da177e4 1957 dev_queue_xmit(skb);
654d1f8a 1958 rcu_read_unlock();
40d4e3df 1959 return len;
1da177e4 1960
1da177e4 1961out_unlock:
654d1f8a 1962 rcu_read_unlock();
1a35ca80
ED
1963out_free:
1964 kfree_skb(skb);
1da177e4
LT
1965 return err;
1966}
1da177e4 1967
ff936a04
AS
1968static unsigned int run_filter(struct sk_buff *skb,
1969 const struct sock *sk,
1970 unsigned int res)
1da177e4
LT
1971{
1972 struct sk_filter *filter;
fda9ef5d 1973
80f8f102
ED
1974 rcu_read_lock();
1975 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1976 if (filter != NULL)
ff936a04 1977 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 1978 rcu_read_unlock();
1da177e4 1979
dbcb5855 1980 return res;
1da177e4
LT
1981}
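/* Example (userspace sketch, not part of this file): the filter consulted by
 * run_filter() above is a classic BPF program attached with SO_ATTACH_FILTER.
 * Its return value becomes the snap length; returning 0 drops the packet.
 * The single-instruction program below accepts every packet in full.
 */
#include <sys/socket.h>
#include <linux/filter.h>

static int attach_accept_all_filter(int fd)
{
	struct sock_filter code[] = {
		{ BPF_RET | BPF_K, 0, 0, 0xffffffff },	/* return "whole packet" */
	};
	struct sock_fprog prog = {
		.len	= sizeof(code) / sizeof(code[0]),
		.filter	= code,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
}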
1982
16cc1400
WB
1983static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
1984 size_t *len)
1985{
1986 struct virtio_net_hdr vnet_hdr;
1987
1988 if (*len < sizeof(vnet_hdr))
1989 return -EINVAL;
1990 *len -= sizeof(vnet_hdr);
1991
6391a448 1992 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true))
16cc1400
WB
1993 return -EINVAL;
1994
1995 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
1996}
1997
1da177e4 1998/*
62ab0812
ED
1999 * This function makes lazy skb cloning in the hope that most packets
2000 * are discarded by BPF.
2001 *
2002 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
2003 * and skb->cb are mangled. It works because (and until) packets
2004 * falling here are owned by current CPU. Output packets are cloned
2005 * by dev_queue_xmit_nit(), input packets are processed by net_bh
2006 * sequentially, so that if we return skb to original state on exit,
2007 * we will not harm anyone.
1da177e4
LT
2008 */
2009
40d4e3df
ED
2010static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2011 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2012{
2013 struct sock *sk;
2014 struct sockaddr_ll *sll;
2015 struct packet_sock *po;
40d4e3df 2016 u8 *skb_head = skb->data;
1da177e4 2017 int skb_len = skb->len;
dbcb5855 2018 unsigned int snaplen, res;
da37845f 2019 bool is_drop_n_account = false;
1da177e4
LT
2020
2021 if (skb->pkt_type == PACKET_LOOPBACK)
2022 goto drop;
2023
2024 sk = pt->af_packet_priv;
2025 po = pkt_sk(sk);
2026
09ad9bc7 2027 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2028 goto drop;
2029
1da177e4
LT
2030 skb->dev = dev;
2031
3b04ddde 2032 if (dev->header_ops) {
1da177e4 2033 /* The device has an explicit notion of ll header,
62ab0812
ED
2034 * exported to higher levels.
2035 *
2036 * Otherwise, the device hides details of its frame
2037 * structure, so that corresponding packet head is
2038 * never delivered to user.
1da177e4
LT
2039 */
2040 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2041 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2042 else if (skb->pkt_type == PACKET_OUTGOING) {
2043 /* Special case: outgoing packets have ll header at head */
bbe735e4 2044 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2045 }
2046 }
2047
2048 snaplen = skb->len;
2049
dbcb5855
DM
2050 res = run_filter(skb, sk, snaplen);
2051 if (!res)
fda9ef5d 2052 goto drop_n_restore;
dbcb5855
DM
2053 if (snaplen > res)
2054 snaplen = res;
1da177e4 2055
0fd7bac6 2056 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2057 goto drop_n_acct;
2058
2059 if (skb_shared(skb)) {
2060 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2061 if (nskb == NULL)
2062 goto drop_n_acct;
2063
2064 if (skb_head != skb->data) {
2065 skb->data = skb_head;
2066 skb->len = skb_len;
2067 }
abc4e4fa 2068 consume_skb(skb);
1da177e4
LT
2069 skb = nskb;
2070 }
2071
b4772ef8 2072 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2073
2074 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2075 sll->sll_hatype = dev->type;
1da177e4 2076 sll->sll_pkttype = skb->pkt_type;
8032b464 2077 if (unlikely(po->origdev))
80feaacb
PWJ
2078 sll->sll_ifindex = orig_dev->ifindex;
2079 else
2080 sll->sll_ifindex = dev->ifindex;
1da177e4 2081
b95cce35 2082 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2083
2472d761
EB
2084 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2085 * Use their space for storing the original skb length.
2086 */
2087 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2088
1da177e4
LT
2089 if (pskb_trim(skb, snaplen))
2090 goto drop_n_acct;
2091
2092 skb_set_owner_r(skb, sk);
2093 skb->dev = NULL;
adf30907 2094 skb_dst_drop(skb);
1da177e4 2095
84531c24
PO
2096 /* drop conntrack reference */
2097 nf_reset(skb);
2098
1da177e4 2099 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2100 po->stats.stats1.tp_packets++;
3bc3b96f 2101 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
2102 __skb_queue_tail(&sk->sk_receive_queue, skb);
2103 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2104 sk->sk_data_ready(sk);
1da177e4
LT
2105 return 0;
2106
2107drop_n_acct:
da37845f 2108 is_drop_n_account = true;
7091fbd8 2109 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2110 po->stats.stats1.tp_drops++;
7091fbd8
WB
2111 atomic_inc(&sk->sk_drops);
2112 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
2113
2114drop_n_restore:
2115 if (skb_head != skb->data && skb_shared(skb)) {
2116 skb->data = skb_head;
2117 skb->len = skb_len;
2118 }
2119drop:
da37845f
WJ
2120 if (!is_drop_n_account)
2121 consume_skb(skb);
2122 else
2123 kfree_skb(skb);
1da177e4
LT
2124 return 0;
2125}
2126
40d4e3df
ED
2127static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2128 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2129{
2130 struct sock *sk;
2131 struct packet_sock *po;
2132 struct sockaddr_ll *sll;
184f489e 2133 union tpacket_uhdr h;
40d4e3df 2134 u8 *skb_head = skb->data;
1da177e4 2135 int skb_len = skb->len;
dbcb5855 2136 unsigned int snaplen, res;
f6fb8f10 2137 unsigned long status = TP_STATUS_USER;
bbd6ef87 2138 unsigned short macoff, netoff, hdrlen;
1da177e4 2139 struct sk_buff *copy_skb = NULL;
bbd6ef87 2140 struct timespec ts;
b9c32fb2 2141 __u32 ts_status;
da37845f 2142 bool is_drop_n_account = false;
1da177e4 2143
51846355
AW
2144 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2145 * We may add members to them until current aligned size without forcing
2146 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2147 */
2148 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2149 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2150
1da177e4
LT
2151 if (skb->pkt_type == PACKET_LOOPBACK)
2152 goto drop;
2153
2154 sk = pt->af_packet_priv;
2155 po = pkt_sk(sk);
2156
09ad9bc7 2157 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2158 goto drop;
2159
3b04ddde 2160 if (dev->header_ops) {
1da177e4 2161 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2162 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2163 else if (skb->pkt_type == PACKET_OUTGOING) {
2164 /* Special case: outgoing packets have ll header at head */
bbe735e4 2165 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2166 }
2167 }
2168
2169 snaplen = skb->len;
2170
dbcb5855
DM
2171 res = run_filter(skb, sk, snaplen);
2172 if (!res)
fda9ef5d 2173 goto drop_n_restore;
68c2e5de
AD
2174
2175 if (skb->ip_summed == CHECKSUM_PARTIAL)
2176 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2177 else if (skb->pkt_type != PACKET_OUTGOING &&
2178 (skb->ip_summed == CHECKSUM_COMPLETE ||
2179 skb_csum_unnecessary(skb)))
2180 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2181
dbcb5855
DM
2182 if (snaplen > res)
2183 snaplen = res;
1da177e4
LT
2184
2185 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2186 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2187 po->tp_reserve;
1da177e4 2188 } else {
95c96174 2189 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2190 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2191 (maclen < 16 ? 16 : maclen)) +
58d19b19
WB
2192 po->tp_reserve;
2193 if (po->has_vnet_hdr)
2194 netoff += sizeof(struct virtio_net_hdr);
1da177e4
LT
2195 macoff = netoff - maclen;
2196 }
f6fb8f10 2197 if (po->tp_version <= TPACKET_V2) {
2198 if (macoff + snaplen > po->rx_ring.frame_size) {
2199 if (po->copy_thresh &&
0fd7bac6 2200 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2201 if (skb_shared(skb)) {
2202 copy_skb = skb_clone(skb, GFP_ATOMIC);
2203 } else {
2204 copy_skb = skb_get(skb);
2205 skb_head = skb->data;
2206 }
2207 if (copy_skb)
2208 skb_set_owner_r(copy_skb, sk);
1da177e4 2209 }
f6fb8f10 2210 snaplen = po->rx_ring.frame_size - macoff;
2211 if ((int)snaplen < 0)
2212 snaplen = 0;
1da177e4 2213 }
dc808110
ED
2214 } else if (unlikely(macoff + snaplen >
2215 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2216 u32 nval;
2217
2218 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2219 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2220 snaplen, nval, macoff);
2221 snaplen = nval;
2222 if (unlikely((int)snaplen < 0)) {
2223 snaplen = 0;
2224 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2225 }
1da177e4 2226 }
1da177e4 2227 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2228 h.raw = packet_current_rx_frame(po, skb,
2229 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2230 if (!h.raw)
58d19b19 2231 goto drop_n_account;
f6fb8f10 2232 if (po->tp_version <= TPACKET_V2) {
2233 packet_increment_rx_head(po, &po->rx_ring);
2234 /*
2235 * LOSING will be reported till you read the stats,
2236 * because it's COR - Clear On Read.
2237 * Anyway, moving it for V1/V2 only as V3 doesn't need this
2238 * at packet level.
2239 */
ee80fbf3 2240 if (po->stats.stats1.tp_drops)
f6fb8f10 2241 status |= TP_STATUS_LOSING;
2242 }
ee80fbf3 2243 po->stats.stats1.tp_packets++;
1da177e4
LT
2244 if (copy_skb) {
2245 status |= TP_STATUS_COPY;
2246 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2247 }
1da177e4
LT
2248 spin_unlock(&sk->sk_receive_queue.lock);
2249
58d19b19 2250 if (po->has_vnet_hdr) {
5a213881
JR
2251 if (virtio_net_hdr_from_skb(skb, h.raw + macoff -
2252 sizeof(struct virtio_net_hdr),
6391a448 2253 vio_le(), true)) {
58d19b19
WB
2254 spin_lock(&sk->sk_receive_queue.lock);
2255 goto drop_n_account;
2256 }
2257 }
2258
bbd6ef87 2259 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2260
2261 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2262 getnstimeofday(&ts);
1da177e4 2263
b9c32fb2
DB
2264 status |= ts_status;
2265
bbd6ef87
PM
2266 switch (po->tp_version) {
2267 case TPACKET_V1:
2268 h.h1->tp_len = skb->len;
2269 h.h1->tp_snaplen = snaplen;
2270 h.h1->tp_mac = macoff;
2271 h.h1->tp_net = netoff;
4b457bdf
DB
2272 h.h1->tp_sec = ts.tv_sec;
2273 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2274 hdrlen = sizeof(*h.h1);
2275 break;
2276 case TPACKET_V2:
2277 h.h2->tp_len = skb->len;
2278 h.h2->tp_snaplen = snaplen;
2279 h.h2->tp_mac = macoff;
2280 h.h2->tp_net = netoff;
bbd6ef87
PM
2281 h.h2->tp_sec = ts.tv_sec;
2282 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2283 if (skb_vlan_tag_present(skb)) {
2284 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2285 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2286 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2287 } else {
2288 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2289 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2290 }
e4d26f4b 2291 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2292 hdrlen = sizeof(*h.h2);
2293 break;
f6fb8f10 2294 case TPACKET_V3:
2295 /* tp_nxt_offset,vlan are already populated above.
2296 * So DONT clear those fields here
2297 */
2298 h.h3->tp_status |= status;
2299 h.h3->tp_len = skb->len;
2300 h.h3->tp_snaplen = snaplen;
2301 h.h3->tp_mac = macoff;
2302 h.h3->tp_net = netoff;
f6fb8f10 2303 h.h3->tp_sec = ts.tv_sec;
2304 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2305 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2306 hdrlen = sizeof(*h.h3);
2307 break;
bbd6ef87
PM
2308 default:
2309 BUG();
2310 }
1da177e4 2311
bbd6ef87 2312 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2313 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2314 sll->sll_family = AF_PACKET;
2315 sll->sll_hatype = dev->type;
2316 sll->sll_protocol = skb->protocol;
2317 sll->sll_pkttype = skb->pkt_type;
8032b464 2318 if (unlikely(po->origdev))
80feaacb
PWJ
2319 sll->sll_ifindex = orig_dev->ifindex;
2320 else
2321 sll->sll_ifindex = dev->ifindex;
1da177e4 2322
e16aa207 2323 smp_mb();
f0d4eb29 2324
f6dafa95 2325#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2326 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2327 u8 *start, *end;
2328
f0d4eb29
DB
2329 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2330 macoff + snaplen);
2331
2332 for (start = h.raw; start < end; start += PAGE_SIZE)
2333 flush_dcache_page(pgv_to_page(start));
1da177e4 2334 }
f0d4eb29 2335 smp_wmb();
f6dafa95 2336#endif
f0d4eb29 2337
da413eec 2338 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2339 __packet_set_status(po, h.raw, status);
da413eec
DC
2340 sk->sk_data_ready(sk);
2341 } else {
f6fb8f10 2342 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2343 }
1da177e4
LT
2344
2345drop_n_restore:
2346 if (skb_head != skb->data && skb_shared(skb)) {
2347 skb->data = skb_head;
2348 skb->len = skb_len;
2349 }
2350drop:
da37845f
WJ
2351 if (!is_drop_n_account)
2352 consume_skb(skb);
2353 else
2354 kfree_skb(skb);
1da177e4
LT
2355 return 0;
2356
58d19b19 2357drop_n_account:
da37845f 2358 is_drop_n_account = true;
ee80fbf3 2359 po->stats.stats1.tp_drops++;
1da177e4
LT
2360 spin_unlock(&sk->sk_receive_queue.lock);
2361
676d2369 2362 sk->sk_data_ready(sk);
acb5d75b 2363 kfree_skb(copy_skb);
1da177e4
LT
2364 goto drop_n_restore;
2365}
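/* Example (userspace sketch, not part of this file): mapping the TPACKET_V2
 * receive ring that tpacket_rcv() above fills. The block/frame geometry is
 * illustrative; real code must respect the size and alignment constraints
 * enforced by packet_set_ring(). Error handling is omitted.
 */
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void *map_rx_ring(int fd, struct tpacket_req *req)
{
	int ver = TPACKET_V2;

	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));

	memset(req, 0, sizeof(*req));
	req->tp_block_size = 4096;		/* multiple of the page size */
	req->tp_frame_size = 2048;		/* multiple of TPACKET_ALIGNMENT */
	req->tp_block_nr   = 64;
	req->tp_frame_nr   = (req->tp_block_size / req->tp_frame_size) *
			     req->tp_block_nr;
	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, req, sizeof(*req));

	return mmap(NULL, (size_t)req->tp_block_size * req->tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}

/* One frame: consume it once the kernel has flipped the status to
 * TP_STATUS_USER, then hand it back by writing TP_STATUS_KERNEL.
 */
static void consume_frame(struct tpacket2_hdr *hdr)
{
	if (hdr->tp_status & TP_STATUS_USER) {
		uint8_t *pkt = (uint8_t *)hdr + hdr->tp_mac;

		(void)pkt;			/* process hdr->tp_snaplen bytes */
		hdr->tp_status = TP_STATUS_KERNEL;
	}
}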
2366
69e3c75f
JB
2367static void tpacket_destruct_skb(struct sk_buff *skb)
2368{
2369 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2370
69e3c75f 2371 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2372 void *ph;
b9c32fb2
DB
2373 __u32 ts;
2374
69e3c75f 2375 ph = skb_shinfo(skb)->destructor_arg;
b0138408 2376 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2377
2378 ts = __packet_set_timestamp(po, ph, skb);
2379 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2380 }
2381
2382 sock_wfree(skb);
2383}
2384
c72219b7
DB
2385static void tpacket_set_protocol(const struct net_device *dev,
2386 struct sk_buff *skb)
2387{
2388 if (dev->type == ARPHRD_ETHER) {
2389 skb_reset_mac_header(skb);
2390 skb->protocol = eth_hdr(skb)->h_proto;
2391 }
2392}
2393
16cc1400
WB
2394static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2395{
16cc1400
WB
2396 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2397 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2398 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2399 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2400 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2401 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2402 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2403
2404 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2405 return -EINVAL;
2406
16cc1400
WB
2407 return 0;
2408}
2409
2410static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2411 struct virtio_net_hdr *vnet_hdr)
2412{
16cc1400
WB
2413 if (*len < sizeof(*vnet_hdr))
2414 return -EINVAL;
2415 *len -= sizeof(*vnet_hdr);
2416
cbbd26b8 2417 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
16cc1400
WB
2418 return -EFAULT;
2419
2420 return __packet_snd_vnet_parse(vnet_hdr, *len);
2421}
2422
40d4e3df 2423static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2424 void *frame, struct net_device *dev, void *data, int tp_len,
c14ac945
SHY
2425 __be16 proto, unsigned char *addr, int hlen, int copylen,
2426 const struct sockcm_cookie *sockc)
69e3c75f 2427{
184f489e 2428 union tpacket_uhdr ph;
8d39b4a6 2429 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2430 struct socket *sock = po->sk.sk_socket;
2431 struct page *page;
69e3c75f
JB
2432 int err;
2433
2434 ph.raw = frame;
2435
2436 skb->protocol = proto;
2437 skb->dev = dev;
2438 skb->priority = po->sk.sk_priority;
2d37a186 2439 skb->mark = po->sk.sk_mark;
c14ac945 2440 sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2441 skb_shinfo(skb)->destructor_arg = ph.raw;
2442
ae641949 2443 skb_reserve(skb, hlen);
69e3c75f 2444 skb_reset_network_header(skb);
c1aad275 2445
69e3c75f
JB
2446 to_write = tp_len;
2447
2448 if (sock->type == SOCK_DGRAM) {
2449 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2450 NULL, tp_len);
2451 if (unlikely(err < 0))
2452 return -EINVAL;
1d036d25 2453 } else if (copylen) {
9ed988cd
WB
2454 int hdrlen = min_t(int, copylen, tp_len);
2455
69e3c75f 2456 skb_push(skb, dev->hard_header_len);
1d036d25 2457 skb_put(skb, copylen - dev->hard_header_len);
9ed988cd 2458 err = skb_store_bits(skb, 0, data, hdrlen);
69e3c75f
JB
2459 if (unlikely(err))
2460 return err;
9ed988cd
WB
2461 if (!dev_validate_header(dev, skb->data, hdrlen))
2462 return -EINVAL;
c72219b7
DB
2463 if (!skb->protocol)
2464 tpacket_set_protocol(dev, skb);
69e3c75f 2465
9ed988cd
WB
2466 data += hdrlen;
2467 to_write -= hdrlen;
69e3c75f
JB
2468 }
2469
69e3c75f
JB
2470 offset = offset_in_page(data);
2471 len_max = PAGE_SIZE - offset;
2472 len = ((to_write > len_max) ? len_max : to_write);
2473
2474 skb->data_len = to_write;
2475 skb->len += to_write;
2476 skb->truesize += to_write;
2477 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2478
2479 while (likely(to_write)) {
2480 nr_frags = skb_shinfo(skb)->nr_frags;
2481
2482 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2483 pr_err("Packet exceeds the number of skb frags (%lu)\n",
2484 MAX_SKB_FRAGS);
69e3c75f
JB
2485 return -EFAULT;
2486 }
2487
0af55bb5
CG
2488 page = pgv_to_page(data);
2489 data += len;
69e3c75f
JB
2490 flush_dcache_page(page);
2491 get_page(page);
0af55bb5 2492 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2493 to_write -= len;
2494 offset = 0;
2495 len_max = PAGE_SIZE;
2496 len = ((to_write > len_max) ? len_max : to_write);
2497 }
2498
8fd6c80d 2499 skb_probe_transport_header(skb, 0);
efdfa2f7 2500
69e3c75f
JB
2501 return tp_len;
2502}
2503
8d39b4a6
WB
2504static int tpacket_parse_header(struct packet_sock *po, void *frame,
2505 int size_max, void **data)
2506{
2507 union tpacket_uhdr ph;
2508 int tp_len, off;
2509
2510 ph.raw = frame;
2511
2512 switch (po->tp_version) {
2513 case TPACKET_V2:
2514 tp_len = ph.h2->tp_len;
2515 break;
2516 default:
2517 tp_len = ph.h1->tp_len;
2518 break;
2519 }
2520 if (unlikely(tp_len > size_max)) {
2521 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2522 return -EMSGSIZE;
2523 }
2524
2525 if (unlikely(po->tp_tx_has_off)) {
2526 int off_min, off_max;
2527
2528 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2529 off_max = po->tx_ring.frame_size - tp_len;
2530 if (po->sk.sk_type == SOCK_DGRAM) {
2531 switch (po->tp_version) {
2532 case TPACKET_V2:
2533 off = ph.h2->tp_net;
2534 break;
2535 default:
2536 off = ph.h1->tp_net;
2537 break;
2538 }
2539 } else {
2540 switch (po->tp_version) {
2541 case TPACKET_V2:
2542 off = ph.h2->tp_mac;
2543 break;
2544 default:
2545 off = ph.h1->tp_mac;
2546 break;
2547 }
2548 }
2549 if (unlikely((off < off_min) || (off_max < off)))
2550 return -EINVAL;
2551 } else {
2552 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2553 }
2554
2555 *data = frame + off;
2556 return tp_len;
2557}
2558
69e3c75f
JB
2559static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2560{
69e3c75f
JB
2561 struct sk_buff *skb;
2562 struct net_device *dev;
1d036d25 2563 struct virtio_net_hdr *vnet_hdr = NULL;
c14ac945 2564 struct sockcm_cookie sockc;
69e3c75f 2565 __be16 proto;
09effa67 2566 int err, reserve = 0;
40d4e3df 2567 void *ph;
342dfc30 2568 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2569 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2570 int tp_len, size_max;
2571 unsigned char *addr;
8d39b4a6 2572 void *data;
69e3c75f 2573 int len_sum = 0;
9e67030a 2574 int status = TP_STATUS_AVAILABLE;
1d036d25 2575 int hlen, tlen, copylen = 0;
69e3c75f 2576
69e3c75f
JB
2577 mutex_lock(&po->pg_vec_lock);
2578
66e56cd4 2579 if (likely(saddr == NULL)) {
e40526cb 2580 dev = packet_cached_dev_get(po);
69e3c75f
JB
2581 proto = po->num;
2582 addr = NULL;
2583 } else {
2584 err = -EINVAL;
2585 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2586 goto out;
2587 if (msg->msg_namelen < (saddr->sll_halen
2588 + offsetof(struct sockaddr_ll,
2589 sll_addr)))
2590 goto out;
69e3c75f
JB
2591 proto = saddr->sll_protocol;
2592 addr = saddr->sll_addr;
827d9780 2593 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2594 }
2595
edbe7746 2596 sockc.tsflags = po->sk.sk_tsflags;
c14ac945
SHY
2597 if (msg->msg_controllen) {
2598 err = sock_cmsg_send(&po->sk, msg, &sockc);
2599 if (unlikely(err))
2600 goto out;
2601 }
2602
69e3c75f
JB
2603 err = -ENXIO;
2604 if (unlikely(dev == NULL))
2605 goto out;
69e3c75f
JB
2606 err = -ENETDOWN;
2607 if (unlikely(!(dev->flags & IFF_UP)))
2608 goto out_put;
2609
5cfb4c8d
DB
2610 if (po->sk.sk_socket->type == SOCK_RAW)
2611 reserve = dev->hard_header_len;
69e3c75f 2612 size_max = po->tx_ring.frame_size
b5dd884e 2613 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2614
1d036d25 2615 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
5cfb4c8d 2616 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2617
69e3c75f
JB
2618 do {
2619 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2620 TP_STATUS_SEND_REQUEST);
69e3c75f 2621 if (unlikely(ph == NULL)) {
87a2fd28
DB
2622 if (need_wait && need_resched())
2623 schedule();
69e3c75f
JB
2624 continue;
2625 }
2626
8d39b4a6
WB
2627 skb = NULL;
2628 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2629 if (tp_len < 0)
2630 goto tpacket_error;
2631
69e3c75f 2632 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2633 hlen = LL_RESERVED_SPACE(dev);
2634 tlen = dev->needed_tailroom;
1d036d25
WB
2635 if (po->has_vnet_hdr) {
2636 vnet_hdr = data;
2637 data += sizeof(*vnet_hdr);
2638 tp_len -= sizeof(*vnet_hdr);
2639 if (tp_len < 0 ||
2640 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2641 tp_len = -EINVAL;
2642 goto tpacket_error;
2643 }
2644 copylen = __virtio16_to_cpu(vio_le(),
2645 vnet_hdr->hdr_len);
2646 }
9ed988cd 2647 copylen = max_t(int, copylen, dev->hard_header_len);
69e3c75f 2648 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2649 hlen + tlen + sizeof(struct sockaddr_ll) +
2650 (copylen - dev->hard_header_len),
fbf33a28 2651 !need_wait, &err);
69e3c75f 2652
fbf33a28
KM
2653 if (unlikely(skb == NULL)) {
2654 /* we assume the socket was initially writeable ... */
2655 if (likely(len_sum > 0))
2656 err = len_sum;
69e3c75f 2657 goto out_status;
fbf33a28 2658 }
8d39b4a6 2659 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
c14ac945 2660 addr, hlen, copylen, &sockc);
dbd46ab4 2661 if (likely(tp_len >= 0) &&
5cfb4c8d 2662 tp_len > dev->mtu + reserve &&
1d036d25 2663 !po->has_vnet_hdr &&
3c70c132
DB
2664 !packet_extra_vlan_len_allowed(dev, skb))
2665 tp_len = -EMSGSIZE;
69e3c75f
JB
2666
2667 if (unlikely(tp_len < 0)) {
8d39b4a6 2668tpacket_error:
69e3c75f
JB
2669 if (po->tp_loss) {
2670 __packet_set_status(po, ph,
2671 TP_STATUS_AVAILABLE);
2672 packet_increment_head(&po->tx_ring);
2673 kfree_skb(skb);
2674 continue;
2675 } else {
2676 status = TP_STATUS_WRONG_FORMAT;
2677 err = tp_len;
2678 goto out_status;
2679 }
2680 }
2681
db60eb5f
JR
2682 if (po->has_vnet_hdr && virtio_net_hdr_to_skb(skb, vnet_hdr,
2683 vio_le())) {
1d036d25
WB
2684 tp_len = -EINVAL;
2685 goto tpacket_error;
2686 }
2687
0fd5d57b
DB
2688 packet_pick_tx_queue(dev, skb);
2689
69e3c75f
JB
2690 skb->destructor = tpacket_destruct_skb;
2691 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2692 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2693
2694 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2695 err = po->xmit(skb);
eb70df13
JP
2696 if (unlikely(err > 0)) {
2697 err = net_xmit_errno(err);
2698 if (err && __packet_get_status(po, ph) ==
2699 TP_STATUS_AVAILABLE) {
2700 /* skb was destructed already */
2701 skb = NULL;
2702 goto out_status;
2703 }
2704 /*
2705 * skb was dropped but not destructed yet;
2706 * let's treat it like congestion or err < 0
2707 */
2708 err = 0;
2709 }
69e3c75f
JB
2710 packet_increment_head(&po->tx_ring);
2711 len_sum += tp_len;
b0138408
DB
2712 } while (likely((ph != NULL) ||
2713 /* Note: packet_read_pending() might be slow if we have
2714 * to call it as it's per_cpu variable, but in fast-path
2715 * we already short-circuit the loop with the first
2716 * condition, and luckily don't have to go that path
2717 * anyway.
2718 */
2719 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2720
2721 err = len_sum;
2722 goto out_put;
2723
69e3c75f
JB
2724out_status:
2725 __packet_set_status(po, ph, status);
2726 kfree_skb(skb);
2727out_put:
e40526cb 2728 dev_put(dev);
69e3c75f
JB
2729out:
2730 mutex_unlock(&po->pg_vec_lock);
2731 return err;
2732}
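/* Example (userspace sketch, not part of this file): queueing one frame on
 * the transmit ring that tpacket_snd() above drains. With TPACKET_V2 and
 * PACKET_TX_HAS_OFF unset, tpacket_parse_header() expects the payload at
 * tp_hdrlen - sizeof(struct sockaddr_ll) from the frame base, i.e.
 * TPACKET2_HDRLEN - sizeof(struct sockaddr_ll) in userspace terms. The ring
 * itself is assumed to have been set up with PACKET_TX_RING and mmap().
 */
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int queue_and_send_frame(int fd, void *frame,
				const void *pkt, unsigned int len)
{
	struct tpacket2_hdr *hdr = frame;
	uint8_t *data = (uint8_t *)frame +
			TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);

	if (hdr->tp_status != TP_STATUS_AVAILABLE)
		return -1;			/* kernel still owns this frame */

	memcpy(data, pkt, len);
	hdr->tp_len = len;
	hdr->tp_status = TP_STATUS_SEND_REQUEST;

	/* Kick tpacket_snd(); once the skb is consumed the status goes back
	 * to TP_STATUS_AVAILABLE (or TP_STATUS_WRONG_FORMAT on error).
	 */
	return send(fd, NULL, 0, 0);
}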
69e3c75f 2733
eea49cc9
OJ
2734static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2735 size_t reserve, size_t len,
2736 size_t linear, int noblock,
2737 int *err)
bfd5f4a3
SS
2738{
2739 struct sk_buff *skb;
2740
2741 /* Under a page? Don't bother with paged skb. */
2742 if (prepad + len < PAGE_SIZE || !linear)
2743 linear = len;
2744
2745 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2746 err, 0);
bfd5f4a3
SS
2747 if (!skb)
2748 return NULL;
2749
2750 skb_reserve(skb, reserve);
2751 skb_put(skb, linear);
2752 skb->data_len = len - linear;
2753 skb->len += len - linear;
2754
2755 return skb;
2756}
2757
d346a3fa 2758static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2759{
2760 struct sock *sk = sock->sk;
342dfc30 2761 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2762 struct sk_buff *skb;
2763 struct net_device *dev;
0e11c91e 2764 __be16 proto;
1da177e4 2765 unsigned char *addr;
827d9780 2766 int err, reserve = 0;
c7d39e32 2767 struct sockcm_cookie sockc;
bfd5f4a3
SS
2768 struct virtio_net_hdr vnet_hdr = { 0 };
2769 int offset = 0;
bfd5f4a3 2770 struct packet_sock *po = pkt_sk(sk);
57031eb7 2771 int hlen, tlen, linear;
3bdc0eba 2772 int extra_len = 0;
1da177e4
LT
2773
2774 /*
1ce4f28b 2775 * Get and verify the address.
1da177e4 2776 */
1ce4f28b 2777
66e56cd4 2778 if (likely(saddr == NULL)) {
e40526cb 2779 dev = packet_cached_dev_get(po);
1da177e4
LT
2780 proto = po->num;
2781 addr = NULL;
2782 } else {
2783 err = -EINVAL;
2784 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2785 goto out;
0fb375fb
EB
2786 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2787 goto out;
1da177e4
LT
2788 proto = saddr->sll_protocol;
2789 addr = saddr->sll_addr;
827d9780 2790 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2791 }
2792
1da177e4 2793 err = -ENXIO;
e40526cb 2794 if (unlikely(dev == NULL))
1da177e4 2795 goto out_unlock;
d5e76b0a 2796 err = -ENETDOWN;
e40526cb 2797 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2798 goto out_unlock;
2799
edbe7746 2800 sockc.tsflags = sk->sk_tsflags;
c7d39e32
EJ
2801 sockc.mark = sk->sk_mark;
2802 if (msg->msg_controllen) {
2803 err = sock_cmsg_send(sk, msg, &sockc);
2804 if (unlikely(err))
2805 goto out_unlock;
2806 }
2807
e40526cb
DB
2808 if (sock->type == SOCK_RAW)
2809 reserve = dev->hard_header_len;
bfd5f4a3 2810 if (po->has_vnet_hdr) {
16cc1400
WB
2811 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2812 if (err)
bfd5f4a3 2813 goto out_unlock;
bfd5f4a3
SS
2814 }
2815
3bdc0eba
BG
2816 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2817 if (!netif_supports_nofcs(dev)) {
2818 err = -EPROTONOSUPPORT;
2819 goto out_unlock;
2820 }
2821 extra_len = 4; /* We're doing our own CRC */
2822 }
2823
1da177e4 2824 err = -EMSGSIZE;
16cc1400
WB
2825 if (!vnet_hdr.gso_type &&
2826 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2827 goto out_unlock;
2828
bfd5f4a3 2829 err = -ENOBUFS;
ae641949
HX
2830 hlen = LL_RESERVED_SPACE(dev);
2831 tlen = dev->needed_tailroom;
57031eb7
WB
2832 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2833 linear = max(linear, min_t(int, len, dev->hard_header_len));
2834 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
bfd5f4a3 2835 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2836 if (skb == NULL)
1da177e4
LT
2837 goto out_unlock;
2838
bfd5f4a3 2839 skb_set_network_header(skb, reserve);
1da177e4 2840
0c4e8581 2841 err = -EINVAL;
9c707762
WB
2842 if (sock->type == SOCK_DGRAM) {
2843 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2844 if (unlikely(offset < 0))
9c707762 2845 goto out_free;
9c707762 2846 }
1da177e4
LT
2847
2848 /* Returns -EFAULT on error */
c0371da6 2849 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2850 if (err)
2851 goto out_free;
bf84a010 2852
9ed988cd
WB
2853 if (sock->type == SOCK_RAW &&
2854 !dev_validate_header(dev, skb->data, len)) {
2855 err = -EINVAL;
2856 goto out_free;
2857 }
2858
c14ac945 2859 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
1da177e4 2860
16cc1400 2861 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
2862 !packet_extra_vlan_len_allowed(dev, skb)) {
2863 err = -EMSGSIZE;
2864 goto out_free;
57f89bfa
BG
2865 }
2866
09effa67
DM
2867 skb->protocol = proto;
2868 skb->dev = dev;
1da177e4 2869 skb->priority = sk->sk_priority;
c7d39e32 2870 skb->mark = sockc.mark;
0fd5d57b
DB
2871
2872 packet_pick_tx_queue(dev, skb);
1da177e4 2873
bfd5f4a3 2874 if (po->has_vnet_hdr) {
db60eb5f 2875 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
16cc1400
WB
2876 if (err)
2877 goto out_free;
2878 len += sizeof(vnet_hdr);
bfd5f4a3
SS
2879 }
2880
8fd6c80d
DB
2881 skb_probe_transport_header(skb, reserve);
2882
3bdc0eba
BG
2883 if (unlikely(extra_len == 4))
2884 skb->no_fcs = 1;
2885
d346a3fa 2886 err = po->xmit(skb);
1da177e4
LT
2887 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2888 goto out_unlock;
2889
e40526cb 2890 dev_put(dev);
1da177e4 2891
40d4e3df 2892 return len;
1da177e4
LT
2893
2894out_free:
2895 kfree_skb(skb);
2896out_unlock:
e40526cb 2897 if (dev)
1da177e4
LT
2898 dev_put(dev);
2899out:
2900 return err;
2901}
2902
1b784140 2903static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2904{
69e3c75f
JB
2905 struct sock *sk = sock->sk;
2906 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2907
69e3c75f
JB
2908 if (po->tx_ring.pg_vec)
2909 return tpacket_snd(po, msg);
2910 else
69e3c75f
JB
2911 return packet_snd(sock, msg, len);
2912}
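/* Example (userspace sketch, not part of this file): the non-ring transmit
 * path served by packet_snd() above. A complete Ethernet frame is handed
 * over with sendto() and a sockaddr_ll naming the egress interface; on a
 * SOCK_RAW socket the link-layer header is part of the supplied frame, so
 * sll_addr is left empty here. The ifindex and frame are assumed to be
 * prepared by the caller.
 */
#include <string.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

static int send_frame(int fd, int ifindex, const void *frame, size_t len)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family	 = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex	 = ifindex;
	sll.sll_halen	 = ETH_ALEN;	/* namelen must cover halen + sll_addr offset */

	return sendto(fd, frame, len, 0,
		      (struct sockaddr *)&sll, sizeof(sll));
}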
2913
1da177e4
LT
2914/*
2915 * Close a PACKET socket. This is fairly simple. We immediately go
2916 * to 'closed' state and remove our protocol entry in the device list.
2917 */
2918
2919static int packet_release(struct socket *sock)
2920{
2921 struct sock *sk = sock->sk;
2922 struct packet_sock *po;
2bd624b4 2923 struct packet_fanout *f;
d12d01d6 2924 struct net *net;
f6fb8f10 2925 union tpacket_req_u req_u;
1da177e4
LT
2926
2927 if (!sk)
2928 return 0;
2929
3b1e0a65 2930 net = sock_net(sk);
1da177e4
LT
2931 po = pkt_sk(sk);
2932
0fa7fa98 2933 mutex_lock(&net->packet.sklist_lock);
808f5114 2934 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2935 mutex_unlock(&net->packet.sklist_lock);
2936
2937 preempt_disable();
920de804 2938 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2939 preempt_enable();
1da177e4 2940
808f5114 2941 spin_lock(&po->bind_lock);
ce06b03e 2942 unregister_prot_hook(sk, false);
66e56cd4
DB
2943 packet_cached_dev_reset(po);
2944
160ff18a
BG
2945 if (po->prot_hook.dev) {
2946 dev_put(po->prot_hook.dev);
2947 po->prot_hook.dev = NULL;
2948 }
808f5114 2949 spin_unlock(&po->bind_lock);
1da177e4 2950
1da177e4 2951 packet_flush_mclist(sk);
1da177e4 2952
9665d5d6
PS
2953 if (po->rx_ring.pg_vec) {
2954 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2955 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2956 }
69e3c75f 2957
9665d5d6
PS
2958 if (po->tx_ring.pg_vec) {
2959 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2960 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2961 }
1da177e4 2962
2bd624b4 2963 f = fanout_release(sk);
dc99f600 2964
808f5114 2965 synchronize_net();
2bd624b4
AS
2966
2967 if (f) {
2968 fanout_release_data(f);
2969 kfree(f);
2970 }
1da177e4
LT
2971 /*
2972 * Now the socket is dead. No more input will appear.
2973 */
1da177e4
LT
2974 sock_orphan(sk);
2975 sock->sk = NULL;
2976
2977 /* Purge queues */
2978
2979 skb_queue_purge(&sk->sk_receive_queue);
b0138408 2980 packet_free_pending(po);
17ab56a2 2981 sk_refcnt_debug_release(sk);
1da177e4
LT
2982
2983 sock_put(sk);
2984 return 0;
2985}
2986
2987/*
2988 * Attach a packet hook.
2989 */
2990
30f7ea1c
FR
2991static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
2992 __be16 proto)
1da177e4
LT
2993{
2994 struct packet_sock *po = pkt_sk(sk);
158cd4af 2995 struct net_device *dev_curr;
902fefb8
DB
2996 __be16 proto_curr;
2997 bool need_rehook;
30f7ea1c
FR
2998 struct net_device *dev = NULL;
2999 int ret = 0;
3000 bool unlisted = false;
dc99f600 3001
30f7ea1c 3002 if (po->fanout)
dc99f600 3003 return -EINVAL;
1da177e4
LT
3004
3005 lock_sock(sk);
1da177e4 3006 spin_lock(&po->bind_lock);
30f7ea1c
FR
3007 rcu_read_lock();
3008
3009 if (name) {
3010 dev = dev_get_by_name_rcu(sock_net(sk), name);
3011 if (!dev) {
3012 ret = -ENODEV;
3013 goto out_unlock;
3014 }
3015 } else if (ifindex) {
3016 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3017 if (!dev) {
3018 ret = -ENODEV;
3019 goto out_unlock;
3020 }
3021 }
3022
3023 if (dev)
3024 dev_hold(dev);
66e56cd4 3025
902fefb8
DB
3026 proto_curr = po->prot_hook.type;
3027 dev_curr = po->prot_hook.dev;
3028
3029 need_rehook = proto_curr != proto || dev_curr != dev;
3030
3031 if (need_rehook) {
30f7ea1c
FR
3032 if (po->running) {
3033 rcu_read_unlock();
3034 __unregister_prot_hook(sk, true);
3035 rcu_read_lock();
3036 dev_curr = po->prot_hook.dev;
3037 if (dev)
3038 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3039 dev->ifindex);
3040 }
1da177e4 3041
902fefb8
DB
3042 po->num = proto;
3043 po->prot_hook.type = proto;
902fefb8 3044
30f7ea1c
FR
3045 if (unlikely(unlisted)) {
3046 dev_put(dev);
3047 po->prot_hook.dev = NULL;
3048 po->ifindex = -1;
3049 packet_cached_dev_reset(po);
3050 } else {
3051 po->prot_hook.dev = dev;
3052 po->ifindex = dev ? dev->ifindex : 0;
3053 packet_cached_dev_assign(po, dev);
3054 }
902fefb8 3055 }
158cd4af
LW
3056 if (dev_curr)
3057 dev_put(dev_curr);
66e56cd4 3058
902fefb8 3059 if (proto == 0 || !need_rehook)
1da177e4
LT
3060 goto out_unlock;
3061
30f7ea1c 3062 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3063 register_prot_hook(sk);
be85d4ad
UT
3064 } else {
3065 sk->sk_err = ENETDOWN;
3066 if (!sock_flag(sk, SOCK_DEAD))
3067 sk->sk_error_report(sk);
1da177e4
LT
3068 }
3069
3070out_unlock:
30f7ea1c 3071 rcu_read_unlock();
1da177e4
LT
3072 spin_unlock(&po->bind_lock);
3073 release_sock(sk);
30f7ea1c 3074 return ret;
1da177e4
LT
3075}
3076
3077/*
3078 * Bind a packet socket to a device
3079 */
3080
40d4e3df
ED
3081static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3082 int addr_len)
1da177e4 3083{
40d4e3df 3084 struct sock *sk = sock->sk;
c87838f6 3085 char name[sizeof(uaddr->sa_data) + 1];
1ce4f28b 3086
1da177e4
LT
3087 /*
3088 * Check legality
3089 */
1ce4f28b 3090
8ae55f04 3091 if (addr_len != sizeof(struct sockaddr))
1da177e4 3092 return -EINVAL;
c87838f6
AP
3093 /* uaddr->sa_data comes from the userspace, it's not guaranteed to be
3094 * zero-terminated.
3095 */
3096 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3097 name[sizeof(uaddr->sa_data)] = 0;
1da177e4 3098
30f7ea1c 3099 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3100}
1da177e4
LT
3101
3102static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3103{
40d4e3df
ED
3104 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3105 struct sock *sk = sock->sk;
1da177e4
LT
3106
3107 /*
3108 * Check legality
3109 */
1ce4f28b 3110
1da177e4
LT
3111 if (addr_len < sizeof(struct sockaddr_ll))
3112 return -EINVAL;
3113 if (sll->sll_family != AF_PACKET)
3114 return -EINVAL;
3115
30f7ea1c
FR
3116 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3117 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3118}
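/* Example (userspace sketch, not part of this file): creating a packet socket
 * and binding it to one interface, which is what packet_bind() above resolves
 * into packet_do_bind(). CAP_NET_RAW is required (see packet_create()); the
 * name-to-ifindex lookup via if_nametoindex() is just one way to obtain the
 * index.
 */
#include <string.h>
#include <unistd.h>
#include <net/if.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

static int open_bound_packet_socket(const char *ifname)
{
	struct sockaddr_ll sll;
	int fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0)
		return -1;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family	 = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex	 = if_nametoindex(ifname);

	if (bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}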
3119
3120static struct proto packet_proto = {
3121 .name = "PACKET",
3122 .owner = THIS_MODULE,
3123 .obj_size = sizeof(struct packet_sock),
3124};
3125
3126/*
1ce4f28b 3127 * Create a packet of type SOCK_PACKET.
1da177e4
LT
3128 */
3129
3f378b68
EP
3130static int packet_create(struct net *net, struct socket *sock, int protocol,
3131 int kern)
1da177e4
LT
3132{
3133 struct sock *sk;
3134 struct packet_sock *po;
0e11c91e 3135 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3136 int err;
3137
df008c91 3138 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3139 return -EPERM;
be02097c
DM
3140 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3141 sock->type != SOCK_PACKET)
1da177e4
LT
3142 return -ESOCKTNOSUPPORT;
3143
3144 sock->state = SS_UNCONNECTED;
3145
3146 err = -ENOBUFS;
11aa9c28 3147 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3148 if (sk == NULL)
3149 goto out;
3150
3151 sock->ops = &packet_ops;
1da177e4
LT
3152 if (sock->type == SOCK_PACKET)
3153 sock->ops = &packet_ops_spkt;
be02097c 3154
1da177e4
LT
3155 sock_init_data(sock, sk);
3156
3157 po = pkt_sk(sk);
3158 sk->sk_family = PF_PACKET;
0e11c91e 3159 po->num = proto;
d346a3fa 3160 po->xmit = dev_queue_xmit;
66e56cd4 3161
b0138408
DB
3162 err = packet_alloc_pending(po);
3163 if (err)
3164 goto out2;
3165
66e56cd4 3166 packet_cached_dev_reset(po);
1da177e4
LT
3167
3168 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3169 sk_refcnt_debug_inc(sk);
1da177e4
LT
3170
3171 /*
3172 * Attach a protocol block
3173 */
3174
3175 spin_lock_init(&po->bind_lock);
905db440 3176 mutex_init(&po->pg_vec_lock);
0648ab70 3177 po->rollover = NULL;
1da177e4 3178 po->prot_hook.func = packet_rcv;
be02097c 3179
1da177e4
LT
3180 if (sock->type == SOCK_PACKET)
3181 po->prot_hook.func = packet_rcv_spkt;
be02097c 3182
1da177e4
LT
3183 po->prot_hook.af_packet_priv = sk;
3184
0e11c91e
AV
3185 if (proto) {
3186 po->prot_hook.type = proto;
ce06b03e 3187 register_prot_hook(sk);
1da177e4
LT
3188 }
3189
0fa7fa98 3190 mutex_lock(&net->packet.sklist_lock);
808f5114 3191 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3192 mutex_unlock(&net->packet.sklist_lock);
3193
3194 preempt_disable();
3680453c 3195 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3196 preempt_enable();
808f5114 3197
40d4e3df 3198 return 0;
b0138408
DB
3199out2:
3200 sk_free(sk);
1da177e4
LT
3201out:
3202 return err;
3203}
3204
3205/*
3206 * Pull a packet from our receive queue and hand it to the user.
3207 * If necessary we block.
3208 */
3209
1b784140
YX
3210static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3211 int flags)
1da177e4
LT
3212{
3213 struct sock *sk = sock->sk;
3214 struct sk_buff *skb;
3215 int copied, err;
bfd5f4a3 3216 int vnet_hdr_len = 0;
2472d761 3217 unsigned int origlen = 0;
1da177e4
LT
3218
3219 err = -EINVAL;
ed85b565 3220 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3221 goto out;
3222
3223#if 0
3224 /* What error should we return now? EUNATTACH? */
3225 if (pkt_sk(sk)->ifindex < 0)
3226 return -ENODEV;
3227#endif
3228
ed85b565 3229 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3230 err = sock_recv_errqueue(sk, msg, len,
3231 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3232 goto out;
3233 }
3234
1da177e4
LT
3235 /*
3236 * Call the generic datagram receiver. This handles all sorts
3237 * of horrible races and re-entrancy so we can forget about it
3238 * in the protocol layers.
3239 *
3240 * Now it will return ENETDOWN, if the device has just gone down,
3241 * but then it will block.
3242 */
3243
40d4e3df 3244 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3245
3246 /*
1ce4f28b 3247 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
3248 * handles the blocking we don't see and worry about blocking
3249 * retries.
3250 */
3251
8ae55f04 3252 if (skb == NULL)
1da177e4
LT
3253 goto out;
3254
2ccdbaa6
WB
3255 if (pkt_sk(sk)->pressure)
3256 packet_rcv_has_room(pkt_sk(sk), NULL);
3257
bfd5f4a3 3258 if (pkt_sk(sk)->has_vnet_hdr) {
16cc1400
WB
3259 err = packet_rcv_vnet(msg, skb, &len);
3260 if (err)
bfd5f4a3 3261 goto out_free;
16cc1400 3262 vnet_hdr_len = sizeof(struct virtio_net_hdr);
bfd5f4a3
SS
3263 }
3264
f3d33426
HFS
3265 /* You lose any data beyond the buffer you gave. If it worries
3266 * a user program they can ask the device for its MTU
3267 * anyway.
1da177e4 3268 */
1da177e4 3269 copied = skb->len;
40d4e3df
ED
3270 if (copied > len) {
3271 copied = len;
3272 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3273 }
3274
51f3d02b 3275 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3276 if (err)
3277 goto out_free;
3278
2472d761
EB
3279 if (sock->type != SOCK_PACKET) {
3280 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3281
3282 /* Original length was stored in sockaddr_ll fields */
3283 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3284 sll->sll_family = AF_PACKET;
3285 sll->sll_protocol = skb->protocol;
3286 }
3287
3b885787 3288 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3289
f3d33426
HFS
3290 if (msg->msg_name) {
3291 /* If the address length field is there to be filled
3292 * in, we fill it in now.
3293 */
3294 if (sock->type == SOCK_PACKET) {
342dfc30 3295 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3296 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3297 } else {
3298 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3299
f3d33426
HFS
3300 msg->msg_namelen = sll->sll_halen +
3301 offsetof(struct sockaddr_ll, sll_addr);
3302 }
ffbc6111
HX
3303 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3304 msg->msg_namelen);
f3d33426 3305 }
1da177e4 3306
8dc41944 3307 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3308 struct tpacket_auxdata aux;
3309
3310 aux.tp_status = TP_STATUS_USER;
3311 if (skb->ip_summed == CHECKSUM_PARTIAL)
3312 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3313 else if (skb->pkt_type != PACKET_OUTGOING &&
3314 (skb->ip_summed == CHECKSUM_COMPLETE ||
3315 skb_csum_unnecessary(skb)))
3316 aux.tp_status |= TP_STATUS_CSUM_VALID;
3317
2472d761 3318 aux.tp_len = origlen;
ffbc6111
HX
3319 aux.tp_snaplen = skb->len;
3320 aux.tp_mac = 0;
bbe735e4 3321 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3322 if (skb_vlan_tag_present(skb)) {
3323 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3324 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3325 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3326 } else {
3327 aux.tp_vlan_tci = 0;
a0cdfcf3 3328 aux.tp_vlan_tpid = 0;
a3bcc23e 3329 }
ffbc6111 3330 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3331 }
3332
1da177e4
LT
3333 /*
3334 * Free or return the buffer as appropriate. Again this
3335 * hides all the races and re-entrancy issues from us.
3336 */
bfd5f4a3 3337 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3338
3339out_free:
3340 skb_free_datagram(sk, skb);
3341out:
3342 return err;
3343}
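/* Example (userspace sketch, not part of this file): reading the auxiliary
 * data that packet_recvmsg() above attaches as a SOL_PACKET/PACKET_AUXDATA
 * control message once the option is enabled. Enabling the option per call
 * is only for brevity; it would normally be done once at setup.
 */
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/if_packet.h>

static ssize_t recv_with_auxdata(int fd, void *buf, size_t buflen,
				 struct tpacket_auxdata *aux_out)
{
	int one = 1;
	struct iovec iov = { .iov_base = buf, .iov_len = buflen };
	union {
		char buf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
		struct cmsghdr align;
	} control;
	struct msghdr msg = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= &control,
		.msg_controllen	= sizeof(control),
	};
	struct cmsghdr *cmsg;
	ssize_t len;

	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
	len = recvmsg(fd, &msg, 0);
	if (len < 0)
		return len;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_PACKET &&
		    cmsg->cmsg_type == PACKET_AUXDATA)
			memcpy(aux_out, CMSG_DATA(cmsg), sizeof(*aux_out));
	}
	return len;
}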
3344
1da177e4
LT
3345static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3346 int *uaddr_len, int peer)
3347{
3348 struct net_device *dev;
3349 struct sock *sk = sock->sk;
3350
3351 if (peer)
3352 return -EOPNOTSUPP;
3353
3354 uaddr->sa_family = AF_PACKET;
2dc85bf3 3355 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3356 rcu_read_lock();
3357 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3358 if (dev)
2dc85bf3 3359 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3360 rcu_read_unlock();
1da177e4
LT
3361 *uaddr_len = sizeof(*uaddr);
3362
3363 return 0;
3364}
1da177e4
LT
3365
3366static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3367 int *uaddr_len, int peer)
3368{
3369 struct net_device *dev;
3370 struct sock *sk = sock->sk;
3371 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3372 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3373
3374 if (peer)
3375 return -EOPNOTSUPP;
3376
3377 sll->sll_family = AF_PACKET;
3378 sll->sll_ifindex = po->ifindex;
3379 sll->sll_protocol = po->num;
67286640 3380 sll->sll_pkttype = 0;
654d1f8a
ED
3381 rcu_read_lock();
3382 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3383 if (dev) {
3384 sll->sll_hatype = dev->type;
3385 sll->sll_halen = dev->addr_len;
3386 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3387 } else {
3388 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3389 sll->sll_halen = 0;
3390 }
654d1f8a 3391 rcu_read_unlock();
0fb375fb 3392 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3393
3394 return 0;
3395}
3396
2aeb0b88
WC
3397static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3398 int what)
1da177e4
LT
3399{
3400 switch (i->type) {
3401 case PACKET_MR_MULTICAST:
1162563f
JP
3402 if (i->alen != dev->addr_len)
3403 return -EINVAL;
1da177e4 3404 if (what > 0)
22bedad3 3405 return dev_mc_add(dev, i->addr);
1da177e4 3406 else
22bedad3 3407 return dev_mc_del(dev, i->addr);
1da177e4
LT
3408 break;
3409 case PACKET_MR_PROMISC:
2aeb0b88 3410 return dev_set_promiscuity(dev, what);
1da177e4 3411 case PACKET_MR_ALLMULTI:
2aeb0b88 3412 return dev_set_allmulti(dev, what);
d95ed927 3413 case PACKET_MR_UNICAST:
1162563f
JP
3414 if (i->alen != dev->addr_len)
3415 return -EINVAL;
d95ed927 3416 if (what > 0)
a748ee24 3417 return dev_uc_add(dev, i->addr);
d95ed927 3418 else
a748ee24 3419 return dev_uc_del(dev, i->addr);
d95ed927 3420 break;
40d4e3df
ED
3421 default:
3422 break;
1da177e4 3423 }
2aeb0b88 3424 return 0;
1da177e4
LT
3425}
3426
82f17091
FR
3427static void packet_dev_mclist_delete(struct net_device *dev,
3428 struct packet_mclist **mlp)
1da177e4 3429{
82f17091
FR
3430 struct packet_mclist *ml;
3431
3432 while ((ml = *mlp) != NULL) {
3433 if (ml->ifindex == dev->ifindex) {
3434 packet_dev_mc(dev, ml, -1);
3435 *mlp = ml->next;
3436 kfree(ml);
3437 } else
3438 mlp = &ml->next;
1da177e4
LT
3439 }
3440}
3441
0fb375fb 3442static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3443{
3444 struct packet_sock *po = pkt_sk(sk);
3445 struct packet_mclist *ml, *i;
3446 struct net_device *dev;
3447 int err;
3448
3449 rtnl_lock();
3450
3451 err = -ENODEV;
3b1e0a65 3452 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3453 if (!dev)
3454 goto done;
3455
3456 err = -EINVAL;
1162563f 3457 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3458 goto done;
3459
3460 err = -ENOBUFS;
8b3a7005 3461 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3462 if (i == NULL)
3463 goto done;
3464
3465 err = 0;
3466 for (ml = po->mclist; ml; ml = ml->next) {
3467 if (ml->ifindex == mreq->mr_ifindex &&
3468 ml->type == mreq->mr_type &&
3469 ml->alen == mreq->mr_alen &&
3470 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3471 ml->count++;
3472 /* Free the new element ... */
3473 kfree(i);
3474 goto done;
3475 }
3476 }
3477
3478 i->type = mreq->mr_type;
3479 i->ifindex = mreq->mr_ifindex;
3480 i->alen = mreq->mr_alen;
3481 memcpy(i->addr, mreq->mr_address, i->alen);
309cf37f 3482 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3483 i->count = 1;
3484 i->next = po->mclist;
3485 po->mclist = i;
2aeb0b88
WC
3486 err = packet_dev_mc(dev, i, 1);
3487 if (err) {
3488 po->mclist = i->next;
3489 kfree(i);
3490 }
1da177e4
LT
3491
3492done:
3493 rtnl_unlock();
3494 return err;
3495}
3496
0fb375fb 3497static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3498{
3499 struct packet_mclist *ml, **mlp;
3500
3501 rtnl_lock();
3502
3503 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3504 if (ml->ifindex == mreq->mr_ifindex &&
3505 ml->type == mreq->mr_type &&
3506 ml->alen == mreq->mr_alen &&
3507 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3508 if (--ml->count == 0) {
3509 struct net_device *dev;
3510 *mlp = ml->next;
ad959e76
ED
3511 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3512 if (dev)
1da177e4 3513 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3514 kfree(ml);
3515 }
82f17091 3516 break;
1da177e4
LT
3517 }
3518 }
3519 rtnl_unlock();
82f17091 3520 return 0;
1da177e4
LT
3521}
3522
3523static void packet_flush_mclist(struct sock *sk)
3524{
3525 struct packet_sock *po = pkt_sk(sk);
3526 struct packet_mclist *ml;
3527
3528 if (!po->mclist)
3529 return;
3530
3531 rtnl_lock();
3532 while ((ml = po->mclist) != NULL) {
3533 struct net_device *dev;
3534
3535 po->mclist = ml->next;
ad959e76
ED
3536 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3537 if (dev != NULL)
1da177e4 3538 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3539 kfree(ml);
3540 }
3541 rtnl_unlock();
3542}
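/* Example (userspace sketch, not part of this file): the membership path
 * above (packet_mc_add() -> packet_dev_mc() -> dev_set_promiscuity()) driven
 * from userspace. The promiscuous reference is held for as long as the socket
 * keeps the membership (or until it is closed and packet_flush_mclist() runs).
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int enable_promisc(int fd, int ifindex)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = ifindex;
	mreq.mr_type	= PACKET_MR_PROMISC;	/* no hardware address: mr_alen = 0 */

	return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
			  &mreq, sizeof(mreq));
}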
1da177e4
LT
3543
3544static int
b7058842 3545packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3546{
3547 struct sock *sk = sock->sk;
8dc41944 3548 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3549 int ret;
3550
3551 if (level != SOL_PACKET)
3552 return -ENOPROTOOPT;
3553
69e3c75f 3554 switch (optname) {
1ce4f28b 3555 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3556 case PACKET_DROP_MEMBERSHIP:
3557 {
0fb375fb
EB
3558 struct packet_mreq_max mreq;
3559 int len = optlen;
3560 memset(&mreq, 0, sizeof(mreq));
3561 if (len < sizeof(struct packet_mreq))
1da177e4 3562 return -EINVAL;
0fb375fb
EB
3563 if (len > sizeof(mreq))
3564 len = sizeof(mreq);
40d4e3df 3565 if (copy_from_user(&mreq, optval, len))
1da177e4 3566 return -EFAULT;
0fb375fb
EB
3567 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3568 return -EINVAL;
1da177e4
LT
3569 if (optname == PACKET_ADD_MEMBERSHIP)
3570 ret = packet_mc_add(sk, &mreq);
3571 else
3572 ret = packet_mc_drop(sk, &mreq);
3573 return ret;
3574 }
a2efcfa0 3575
1da177e4 3576 case PACKET_RX_RING:
69e3c75f 3577 case PACKET_TX_RING:
1da177e4 3578 {
f6fb8f10 3579 union tpacket_req_u req_u;
3580 int len;
1da177e4 3581
f6fb8f10 3582 switch (po->tp_version) {
3583 case TPACKET_V1:
3584 case TPACKET_V2:
3585 len = sizeof(req_u.req);
3586 break;
3587 case TPACKET_V3:
3588 default:
3589 len = sizeof(req_u.req3);
3590 break;
3591 }
3592 if (optlen < len)
1da177e4 3593 return -EINVAL;
f6fb8f10 3594 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3595 return -EFAULT;
f6fb8f10 3596 return packet_set_ring(sk, &req_u, 0,
3597 optname == PACKET_TX_RING);
1da177e4
LT
3598 }
3599 case PACKET_COPY_THRESH:
3600 {
3601 int val;
3602
40d4e3df 3603 if (optlen != sizeof(val))
1da177e4 3604 return -EINVAL;
40d4e3df 3605 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3606 return -EFAULT;
3607
3608 pkt_sk(sk)->copy_thresh = val;
3609 return 0;
3610 }
bbd6ef87
PM
3611 case PACKET_VERSION:
3612 {
3613 int val;
3614
3615 if (optlen != sizeof(val))
3616 return -EINVAL;
bbd6ef87
PM
3617 if (copy_from_user(&val, optval, sizeof(val)))
3618 return -EFAULT;
3619 switch (val) {
3620 case TPACKET_V1:
3621 case TPACKET_V2:
f6fb8f10 3622 case TPACKET_V3:
84ac7260 3623 break;
bbd6ef87
PM
3624 default:
3625 return -EINVAL;
3626 }
84ac7260
PP
3627 lock_sock(sk);
3628 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3629 ret = -EBUSY;
3630 } else {
3631 po->tp_version = val;
3632 ret = 0;
3633 }
3634 release_sock(sk);
3635 return ret;
bbd6ef87 3636 }
8913336a
PM
3637 case PACKET_RESERVE:
3638 {
3639 unsigned int val;
3640
3641 if (optlen != sizeof(val))
3642 return -EINVAL;
6bb37937
SB
3643 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3644 return -EBUSY;
8913336a
PM
3645 if (copy_from_user(&val, optval, sizeof(val)))
3646 return -EFAULT;
1d27b680
AK
3647 if (val > INT_MAX)
3648 return -EINVAL;
6bb37937
SB
3649 po->tp_reserve = val;
3650 return 0;
8913336a 3651 }
69e3c75f
JB
3652 case PACKET_LOSS:
3653 {
3654 unsigned int val;
3655
3656 if (optlen != sizeof(val))
3657 return -EINVAL;
3658 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3659 return -EBUSY;
3660 if (copy_from_user(&val, optval, sizeof(val)))
3661 return -EFAULT;
3662 po->tp_loss = !!val;
3663 return 0;
3664 }
8dc41944
HX
3665 case PACKET_AUXDATA:
3666 {
3667 int val;
3668
3669 if (optlen < sizeof(val))
3670 return -EINVAL;
3671 if (copy_from_user(&val, optval, sizeof(val)))
3672 return -EFAULT;
3673
3674 po->auxdata = !!val;
3675 return 0;
3676 }
80feaacb
PWJ
3677 case PACKET_ORIGDEV:
3678 {
3679 int val;
3680
3681 if (optlen < sizeof(val))
3682 return -EINVAL;
3683 if (copy_from_user(&val, optval, sizeof(val)))
3684 return -EFAULT;
3685
3686 po->origdev = !!val;
3687 return 0;
3688 }
bfd5f4a3
SS
3689 case PACKET_VNET_HDR:
3690 {
3691 int val;
3692
3693 if (sock->type != SOCK_RAW)
3694 return -EINVAL;
3695 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3696 return -EBUSY;
3697 if (optlen < sizeof(val))
3698 return -EINVAL;
3699 if (copy_from_user(&val, optval, sizeof(val)))
3700 return -EFAULT;
3701
3702 po->has_vnet_hdr = !!val;
3703 return 0;
3704 }
614f60fa
SM
3705 case PACKET_TIMESTAMP:
3706 {
3707 int val;
3708
3709 if (optlen != sizeof(val))
3710 return -EINVAL;
3711 if (copy_from_user(&val, optval, sizeof(val)))
3712 return -EFAULT;
3713
3714 po->tp_tstamp = val;
3715 return 0;
3716 }
dc99f600
DM
3717 case PACKET_FANOUT:
3718 {
3719 int val;
3720
3721 if (optlen != sizeof(val))
3722 return -EINVAL;
3723 if (copy_from_user(&val, optval, sizeof(val)))
3724 return -EFAULT;
3725
3726 return fanout_add(sk, val & 0xffff, val >> 16);
3727 }
47dceb8e
WB
3728 case PACKET_FANOUT_DATA:
3729 {
3730 if (!po->fanout)
3731 return -EINVAL;
3732
3733 return fanout_set_data(po, optval, optlen);
3734 }
5920cd3a
PC
3735 case PACKET_TX_HAS_OFF:
3736 {
3737 unsigned int val;
3738
3739 if (optlen != sizeof(val))
3740 return -EINVAL;
3741 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3742 return -EBUSY;
3743 if (copy_from_user(&val, optval, sizeof(val)))
3744 return -EFAULT;
3745 po->tp_tx_has_off = !!val;
3746 return 0;
3747 }
d346a3fa
DB
3748 case PACKET_QDISC_BYPASS:
3749 {
3750 int val;
3751
3752 if (optlen != sizeof(val))
3753 return -EINVAL;
3754 if (copy_from_user(&val, optval, sizeof(val)))
3755 return -EFAULT;
3756
3757 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3758 return 0;
3759 }
1da177e4
LT
3760 default:
3761 return -ENOPROTOOPT;
3762 }
3763}
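/*
 * Illustrative userspace sketch (not part of this file): the switch above
 * refuses PACKET_VERSION and PACKET_RESERVE with -EBUSY once a ring
 * exists, so both must be set before PACKET_RX_RING/PACKET_TX_RING.
 * A hedged example with arbitrary geometry values; error handling omitted:
 *
 *     int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *     int ver = TPACKET_V2;
 *     unsigned int reserve = 16;
 *     struct tpacket_req req = {
 *             .tp_block_size = 4096,
 *             .tp_block_nr   = 64,
 *             .tp_frame_size = 2048,
 *             .tp_frame_nr   = 64 * (4096 / 2048),
 *     };
 *
 *     setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *     setsockopt(fd, SOL_PACKET, PACKET_RESERVE, &reserve, sizeof(reserve));
 *     setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */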
3764
3765static int packet_getsockopt(struct socket *sock, int level, int optname,
3766 char __user *optval, int __user *optlen)
3767{
3768 int len;
c06fff6e 3769 int val, lv = sizeof(val);
1da177e4
LT
3770 struct sock *sk = sock->sk;
3771 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3772 void *data = &val;
ee80fbf3 3773 union tpacket_stats_u st;
a9b63918 3774 struct tpacket_rollover_stats rstats;
1da177e4
LT
3775
3776 if (level != SOL_PACKET)
3777 return -ENOPROTOOPT;
3778
8ae55f04
KK
3779 if (get_user(len, optlen))
3780 return -EFAULT;
1da177e4
LT
3781
3782 if (len < 0)
3783 return -EINVAL;
1ce4f28b 3784
69e3c75f 3785 switch (optname) {
1da177e4 3786 case PACKET_STATISTICS:
1da177e4 3787 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3788 memcpy(&st, &po->stats, sizeof(st));
3789 memset(&po->stats, 0, sizeof(po->stats));
3790 spin_unlock_bh(&sk->sk_receive_queue.lock);
3791
f6fb8f10 3792 if (po->tp_version == TPACKET_V3) {
c06fff6e 3793 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3794 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3795 data = &st.stats3;
f6fb8f10 3796 } else {
c06fff6e 3797 lv = sizeof(struct tpacket_stats);
8bcdeaff 3798 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3799 data = &st.stats1;
f6fb8f10 3800 }
ee80fbf3 3801
8dc41944
HX
3802 break;
3803 case PACKET_AUXDATA:
8dc41944 3804 val = po->auxdata;
80feaacb
PWJ
3805 break;
3806 case PACKET_ORIGDEV:
80feaacb 3807 val = po->origdev;
bfd5f4a3
SS
3808 break;
3809 case PACKET_VNET_HDR:
bfd5f4a3 3810 val = po->has_vnet_hdr;
1da177e4 3811 break;
bbd6ef87 3812 case PACKET_VERSION:
bbd6ef87 3813 val = po->tp_version;
bbd6ef87
PM
3814 break;
3815 case PACKET_HDRLEN:
3816 if (len > sizeof(int))
3817 len = sizeof(int);
3818 if (copy_from_user(&val, optval, len))
3819 return -EFAULT;
3820 switch (val) {
3821 case TPACKET_V1:
3822 val = sizeof(struct tpacket_hdr);
3823 break;
3824 case TPACKET_V2:
3825 val = sizeof(struct tpacket2_hdr);
3826 break;
f6fb8f10 3827 case TPACKET_V3:
3828 val = sizeof(struct tpacket3_hdr);
3829 break;
bbd6ef87
PM
3830 default:
3831 return -EINVAL;
3832 }
bbd6ef87 3833 break;
8913336a 3834 case PACKET_RESERVE:
8913336a 3835 val = po->tp_reserve;
8913336a 3836 break;
69e3c75f 3837 case PACKET_LOSS:
69e3c75f 3838 val = po->tp_loss;
69e3c75f 3839 break;
614f60fa 3840 case PACKET_TIMESTAMP:
614f60fa 3841 val = po->tp_tstamp;
614f60fa 3842 break;
dc99f600 3843 case PACKET_FANOUT:
dc99f600
DM
3844 val = (po->fanout ?
3845 ((u32)po->fanout->id |
77f65ebd
WB
3846 ((u32)po->fanout->type << 16) |
3847 ((u32)po->fanout->flags << 24)) :
dc99f600 3848 0);
dc99f600 3849 break;
a9b63918
WB
3850 case PACKET_ROLLOVER_STATS:
3851 if (!po->rollover)
3852 return -EINVAL;
3853 rstats.tp_all = atomic_long_read(&po->rollover->num);
3854 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3855 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3856 data = &rstats;
3857 lv = sizeof(rstats);
3858 break;
5920cd3a
PC
3859 case PACKET_TX_HAS_OFF:
3860 val = po->tp_tx_has_off;
3861 break;
d346a3fa
DB
3862 case PACKET_QDISC_BYPASS:
3863 val = packet_use_direct_xmit(po);
3864 break;
1da177e4
LT
3865 default:
3866 return -ENOPROTOOPT;
3867 }
3868
c06fff6e
ED
3869 if (len > lv)
3870 len = lv;
8ae55f04
KK
3871 if (put_user(len, optlen))
3872 return -EFAULT;
8dc41944
HX
3873 if (copy_to_user(optval, data, len))
3874 return -EFAULT;
8ae55f04 3875 return 0;
1da177e4
LT
3876}
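/*
 * Illustrative userspace sketch (not part of this file): PACKET_STATISTICS
 * above copies and then zeroes po->stats under the receive-queue lock, so
 * the counters restart from zero after every read, and tp_packets has the
 * drop count folded in.  A hedged way to poll them on a TPACKET_V1/V2
 * socket:
 *
 *     struct tpacket_stats st;
 *     socklen_t len = sizeof(st);
 *
 *     if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
 *             printf("seen %u, dropped %u\n", st.tp_packets, st.tp_drops);
 */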
3877
3878
719c44d3
WB
3879#ifdef CONFIG_COMPAT
3880static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
3881 char __user *optval, unsigned int optlen)
3882{
3883 struct packet_sock *po = pkt_sk(sock->sk);
3884
3885 if (level != SOL_PACKET)
3886 return -ENOPROTOOPT;
3887
3888 if (optname == PACKET_FANOUT_DATA &&
3889 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
3890 optval = (char __user *)get_compat_bpf_fprog(optval);
3891 if (!optval)
3892 return -EFAULT;
3893 optlen = sizeof(struct sock_fprog);
3894 }
3895
3896 return packet_setsockopt(sock, level, optname, optval, optlen);
3897}
3898#endif
3899
351638e7
JP
3900static int packet_notifier(struct notifier_block *this,
3901 unsigned long msg, void *ptr)
1da177e4
LT
3902{
3903 struct sock *sk;
351638e7 3904 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 3905 struct net *net = dev_net(dev);
1da177e4 3906
808f5114 3907 rcu_read_lock();
b67bfe0d 3908 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3909 struct packet_sock *po = pkt_sk(sk);
3910
3911 switch (msg) {
3912 case NETDEV_UNREGISTER:
1da177e4 3913 if (po->mclist)
82f17091 3914 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
3915 /* fallthrough */
3916
1da177e4
LT
3917 case NETDEV_DOWN:
3918 if (dev->ifindex == po->ifindex) {
3919 spin_lock(&po->bind_lock);
3920 if (po->running) {
ce06b03e 3921 __unregister_prot_hook(sk, false);
1da177e4
LT
3922 sk->sk_err = ENETDOWN;
3923 if (!sock_flag(sk, SOCK_DEAD))
3924 sk->sk_error_report(sk);
3925 }
3926 if (msg == NETDEV_UNREGISTER) {
66e56cd4 3927 packet_cached_dev_reset(po);
1da177e4 3928 po->ifindex = -1;
160ff18a
BG
3929 if (po->prot_hook.dev)
3930 dev_put(po->prot_hook.dev);
1da177e4
LT
3931 po->prot_hook.dev = NULL;
3932 }
3933 spin_unlock(&po->bind_lock);
3934 }
3935 break;
3936 case NETDEV_UP:
808f5114 3937 if (dev->ifindex == po->ifindex) {
3938 spin_lock(&po->bind_lock);
ce06b03e
DM
3939 if (po->num)
3940 register_prot_hook(sk);
808f5114 3941 spin_unlock(&po->bind_lock);
1da177e4 3942 }
1da177e4
LT
3943 break;
3944 }
3945 }
808f5114 3946 rcu_read_unlock();
1da177e4
LT
3947 return NOTIFY_DONE;
3948}
3949
3950
3951static int packet_ioctl(struct socket *sock, unsigned int cmd,
3952 unsigned long arg)
3953{
3954 struct sock *sk = sock->sk;
3955
69e3c75f 3956 switch (cmd) {
40d4e3df
ED
3957 case SIOCOUTQ:
3958 {
3959 int amount = sk_wmem_alloc_get(sk);
31e6d363 3960
40d4e3df
ED
3961 return put_user(amount, (int __user *)arg);
3962 }
3963 case SIOCINQ:
3964 {
3965 struct sk_buff *skb;
3966 int amount = 0;
3967
3968 spin_lock_bh(&sk->sk_receive_queue.lock);
3969 skb = skb_peek(&sk->sk_receive_queue);
3970 if (skb)
3971 amount = skb->len;
3972 spin_unlock_bh(&sk->sk_receive_queue.lock);
3973 return put_user(amount, (int __user *)arg);
3974 }
3975 case SIOCGSTAMP:
3976 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3977 case SIOCGSTAMPNS:
3978 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3979
1da177e4 3980#ifdef CONFIG_INET
40d4e3df
ED
3981 case SIOCADDRT:
3982 case SIOCDELRT:
3983 case SIOCDARP:
3984 case SIOCGARP:
3985 case SIOCSARP:
3986 case SIOCGIFADDR:
3987 case SIOCSIFADDR:
3988 case SIOCGIFBRDADDR:
3989 case SIOCSIFBRDADDR:
3990 case SIOCGIFNETMASK:
3991 case SIOCSIFNETMASK:
3992 case SIOCGIFDSTADDR:
3993 case SIOCSIFDSTADDR:
3994 case SIOCSIFFLAGS:
40d4e3df 3995 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3996#endif
3997
40d4e3df
ED
3998 default:
3999 return -ENOIOCTLCMD;
1da177e4
LT
4000 }
4001 return 0;
4002}
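/*
 * Illustrative userspace sketch (not part of this file): per the handlers
 * above, SIOCINQ reports the length of the next queued frame and SIOCOUTQ
 * the transmit memory still charged to the socket.  Assumed usage:
 *
 *     int pending;
 *
 *     if (ioctl(fd, SIOCINQ, &pending) == 0)
 *             printf("next frame is %d bytes\n", pending);
 */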
4003
40d4e3df 4004static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
4005 poll_table *wait)
4006{
4007 struct sock *sk = sock->sk;
4008 struct packet_sock *po = pkt_sk(sk);
4009 unsigned int mask = datagram_poll(file, sock, wait);
4010
4011 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 4012 if (po->rx_ring.pg_vec) {
f6fb8f10 4013 if (!packet_previous_rx_frame(po, &po->rx_ring,
4014 TP_STATUS_KERNEL))
1da177e4
LT
4015 mask |= POLLIN | POLLRDNORM;
4016 }
2ccdbaa6 4017 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
54d7c01d 4018 po->pressure = 0;
1da177e4 4019 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
4020 spin_lock_bh(&sk->sk_write_queue.lock);
4021 if (po->tx_ring.pg_vec) {
4022 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
4023 mask |= POLLOUT | POLLWRNORM;
4024 }
4025 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
4026 return mask;
4027}
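/*
 * Illustrative userspace sketch (not part of this file): packet_poll()
 * above raises POLLIN from the ownership bit of the previous RX frame, so
 * a ring consumer typically alternates poll() with a scan of the mapped
 * frames.  A hedged TPACKET_V2-style loop; ring (a char * to the mapped
 * area), idx, frame_size, frame_nr and handle_frame() are assumptions,
 * and tp_block_size is assumed to be a multiple of tp_frame_size so the
 * frames are contiguous:
 *
 *     struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLERR };
 *
 *     for (;;) {
 *             struct tpacket2_hdr *hdr =
 *                     (struct tpacket2_hdr *)(ring + idx * frame_size);
 *
 *             if (!(hdr->tp_status & TP_STATUS_USER)) {
 *                     poll(&pfd, 1, -1);
 *                     continue;
 *             }
 *             handle_frame((char *)hdr + hdr->tp_mac, hdr->tp_snaplen);
 *             hdr->tp_status = TP_STATUS_KERNEL;
 *             idx = (idx + 1) % frame_nr;
 *     }
 */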
4028
4029
4030/* Dirty? Well, I still did not learn a better way to account
4031 * for user mmaps.
4032 */
4033
4034static void packet_mm_open(struct vm_area_struct *vma)
4035{
4036 struct file *file = vma->vm_file;
40d4e3df 4037 struct socket *sock = file->private_data;
1da177e4 4038 struct sock *sk = sock->sk;
1ce4f28b 4039
1da177e4
LT
4040 if (sk)
4041 atomic_inc(&pkt_sk(sk)->mapped);
4042}
4043
4044static void packet_mm_close(struct vm_area_struct *vma)
4045{
4046 struct file *file = vma->vm_file;
40d4e3df 4047 struct socket *sock = file->private_data;
1da177e4 4048 struct sock *sk = sock->sk;
1ce4f28b 4049
1da177e4
LT
4050 if (sk)
4051 atomic_dec(&pkt_sk(sk)->mapped);
4052}
4053
f0f37e2f 4054static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
4055 .open = packet_mm_open,
4056 .close = packet_mm_close,
1da177e4
LT
4057};
4058
0e3125c7
NH
4059static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4060 unsigned int len)
1da177e4
LT
4061{
4062 int i;
4063
4ebf0ae2 4064 for (i = 0; i < len; i++) {
0e3125c7 4065 if (likely(pg_vec[i].buffer)) {
c56b4d90 4066 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
4067 vfree(pg_vec[i].buffer);
4068 else
4069 free_pages((unsigned long)pg_vec[i].buffer,
4070 order);
4071 pg_vec[i].buffer = NULL;
4072 }
1da177e4
LT
4073 }
4074 kfree(pg_vec);
4075}
4076
eea49cc9 4077static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 4078{
f0d4eb29 4079 char *buffer;
0e3125c7
NH
4080 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4081 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4082
4083 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4084 if (buffer)
4085 return buffer;
4086
f0d4eb29 4087 /* __get_free_pages failed, fall back to vmalloc */
bbce5a59 4088 buffer = vzalloc((1 << order) * PAGE_SIZE);
0e3125c7
NH
4089 if (buffer)
4090 return buffer;
4091
f0d4eb29 4092 /* vmalloc failed, let's dig into swap here */
0e3125c7 4093 gfp_flags &= ~__GFP_NORETRY;
f0d4eb29 4094 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4095 if (buffer)
4096 return buffer;
4097
f0d4eb29 4098 /* complete and utter failure */
0e3125c7 4099 return NULL;
4ebf0ae2
DM
4100}
4101
0e3125c7 4102static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
4103{
4104 unsigned int block_nr = req->tp_block_nr;
0e3125c7 4105 struct pgv *pg_vec;
4ebf0ae2
DM
4106 int i;
4107
0e3125c7 4108 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
4109 if (unlikely(!pg_vec))
4110 goto out;
4111
4112 for (i = 0; i < block_nr; i++) {
c56b4d90 4113 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 4114 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
4115 goto out_free_pgvec;
4116 }
4117
4118out:
4119 return pg_vec;
4120
4121out_free_pgvec:
4122 free_pg_vec(pg_vec, order, block_nr);
4123 pg_vec = NULL;
4124 goto out;
4125}
1da177e4 4126
f6fb8f10 4127static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 4128 int closing, int tx_ring)
1da177e4 4129{
0e3125c7 4130 struct pgv *pg_vec = NULL;
1da177e4 4131 struct packet_sock *po = pkt_sk(sk);
0e11c91e 4132 int was_running, order = 0;
69e3c75f
JB
4133 struct packet_ring_buffer *rb;
4134 struct sk_buff_head *rb_queue;
0e11c91e 4135 __be16 num;
f6fb8f10 4136 int err = -EINVAL;
4137 /* Aliased so the existing code needs minimal churn */
4138 struct tpacket_req *req = &req_u->req;
4139
84ac7260 4140 lock_sock(sk);
f6fb8f10 4141 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
4142 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
6ae81ced 4143 net_warn_ratelimited("Tx-ring is not supported.\n");
f6fb8f10 4144 goto out;
4145 }
1ce4f28b 4146
69e3c75f
JB
4147 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4148 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 4149
69e3c75f
JB
4150 err = -EBUSY;
4151 if (!closing) {
4152 if (atomic_read(&po->mapped))
4153 goto out;
b0138408 4154 if (packet_read_pending(rb))
69e3c75f
JB
4155 goto out;
4156 }
1da177e4 4157
69e3c75f
JB
4158 if (req->tp_block_nr) {
4159 /* Sanity tests and some calculations */
4160 err = -EBUSY;
4161 if (unlikely(rb->pg_vec))
4162 goto out;
1da177e4 4163
bbd6ef87
PM
4164 switch (po->tp_version) {
4165 case TPACKET_V1:
4166 po->tp_hdrlen = TPACKET_HDRLEN;
4167 break;
4168 case TPACKET_V2:
4169 po->tp_hdrlen = TPACKET2_HDRLEN;
4170 break;
f6fb8f10 4171 case TPACKET_V3:
4172 po->tp_hdrlen = TPACKET3_HDRLEN;
4173 break;
bbd6ef87
PM
4174 }
4175
69e3c75f 4176 err = -EINVAL;
4ebf0ae2 4177 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 4178 goto out;
90836b67 4179 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
69e3c75f 4180 goto out;
dc808110 4181 if (po->tp_version >= TPACKET_V3 &&
500e91e0
AK
4182 req->tp_block_size <=
4183 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv))
dc808110 4184 goto out;
8913336a 4185 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
4186 po->tp_reserve))
4187 goto out;
4ebf0ae2 4188 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 4189 goto out;
1da177e4 4190
4194b491
TK
4191 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4192 if (unlikely(rb->frames_per_block == 0))
69e3c75f 4193 goto out;
213e19c7
AK
4194 if (unlikely(req->tp_block_size > UINT_MAX / req->tp_block_nr))
4195 goto out;
69e3c75f
JB
4196 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4197 req->tp_frame_nr))
4198 goto out;
1da177e4
LT
4199
4200 err = -ENOMEM;
4ebf0ae2
DM
4201 order = get_order(req->tp_block_size);
4202 pg_vec = alloc_pg_vec(req, order);
4203 if (unlikely(!pg_vec))
1da177e4 4204 goto out;
f6fb8f10 4205 switch (po->tp_version) {
4206 case TPACKET_V3:
4207 /* Transmit path is not supported. We checked
4208 * it above, but stay paranoid anyway.
4209 */
4210 if (!tx_ring)
e8e85cc5 4211 init_prb_bdqc(po, rb, pg_vec, req_u);
d7cf0c34 4212 break;
f6fb8f10 4213 default:
4214 break;
4215 }
69e3c75f
JB
4216 }
4217 /* Done */
4218 else {
4219 err = -EINVAL;
4ebf0ae2 4220 if (unlikely(req->tp_frame_nr))
69e3c75f 4221 goto out;
1da177e4
LT
4222 }
4223
1da177e4
LT
4224
4225 /* Detach socket from network */
4226 spin_lock(&po->bind_lock);
4227 was_running = po->running;
4228 num = po->num;
4229 if (was_running) {
1da177e4 4230 po->num = 0;
ce06b03e 4231 __unregister_prot_hook(sk, false);
1da177e4
LT
4232 }
4233 spin_unlock(&po->bind_lock);
1ce4f28b 4234
1da177e4
LT
4235 synchronize_net();
4236
4237 err = -EBUSY;
905db440 4238 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
4239 if (closing || atomic_read(&po->mapped) == 0) {
4240 err = 0;
69e3c75f 4241 spin_lock_bh(&rb_queue->lock);
c053fd96 4242 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
4243 rb->frame_max = (req->tp_frame_nr - 1);
4244 rb->head = 0;
4245 rb->frame_size = req->tp_frame_size;
4246 spin_unlock_bh(&rb_queue->lock);
4247
c053fd96
CG
4248 swap(rb->pg_vec_order, order);
4249 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
4250
4251 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4252 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4253 tpacket_rcv : packet_rcv;
4254 skb_queue_purge(rb_queue);
1da177e4 4255 if (atomic_read(&po->mapped))
40d4e3df
ED
4256 pr_err("packet_mmap: vma is busy: %d\n",
4257 atomic_read(&po->mapped));
1da177e4 4258 }
905db440 4259 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4260
4261 spin_lock(&po->bind_lock);
ce06b03e 4262 if (was_running) {
1da177e4 4263 po->num = num;
ce06b03e 4264 register_prot_hook(sk);
1da177e4
LT
4265 }
4266 spin_unlock(&po->bind_lock);
f6fb8f10 4267 if (closing && (po->tp_version > TPACKET_V2)) {
4268 /* Because we don't support block-based V3 on tx-ring */
4269 if (!tx_ring)
73d0fcf2 4270 prb_shutdown_retire_blk_timer(po, rb_queue);
f6fb8f10 4271 }
1da177e4 4272
1da177e4
LT
4273 if (pg_vec)
4274 free_pg_vec(pg_vec, order, req->tp_block_nr);
4275out:
84ac7260 4276 release_sock(sk);
1da177e4
LT
4277 return err;
4278}
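/*
 * Illustrative userspace sketch (not part of this file): the sanity tests
 * above want tp_block_size page-aligned and at least one frame long, and
 * tp_frame_nr equal to frames-per-block times tp_block_nr.  A hedged
 * helper (name and parameters are assumptions) that derives a consistent
 * geometry; the caller still has to keep frame_size a multiple of
 * TPACKET_ALIGNMENT and large enough for tp_hdrlen plus tp_reserve:
 *
 *     static int fill_req(struct tpacket_req *req,
 *                         unsigned int frame_size, unsigned int total_size)
 *     {
 *             req->tp_frame_size = frame_size;
 *             req->tp_block_size = getpagesize();
 *             while (req->tp_block_size < frame_size)
 *                     req->tp_block_size <<= 1;
 *             req->tp_block_nr = total_size / req->tp_block_size;
 *             req->tp_frame_nr = (req->tp_block_size / frame_size) *
 *                                req->tp_block_nr;
 *             return req->tp_block_nr ? 0 : -1;
 *     }
 */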
4279
69e3c75f
JB
4280static int packet_mmap(struct file *file, struct socket *sock,
4281 struct vm_area_struct *vma)
1da177e4
LT
4282{
4283 struct sock *sk = sock->sk;
4284 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
4285 unsigned long size, expected_size;
4286 struct packet_ring_buffer *rb;
1da177e4
LT
4287 unsigned long start;
4288 int err = -EINVAL;
4289 int i;
4290
4291 if (vma->vm_pgoff)
4292 return -EINVAL;
4293
905db440 4294 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
4295
4296 expected_size = 0;
4297 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4298 if (rb->pg_vec) {
4299 expected_size += rb->pg_vec_len
4300 * rb->pg_vec_pages
4301 * PAGE_SIZE;
4302 }
4303 }
4304
4305 if (expected_size == 0)
1da177e4 4306 goto out;
69e3c75f
JB
4307
4308 size = vma->vm_end - vma->vm_start;
4309 if (size != expected_size)
1da177e4
LT
4310 goto out;
4311
1da177e4 4312 start = vma->vm_start;
69e3c75f
JB
4313 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4314 if (rb->pg_vec == NULL)
4315 continue;
4316
4317 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
4318 struct page *page;
4319 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4320 int pg_num;
4321
c56b4d90
CG
4322 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4323 page = pgv_to_page(kaddr);
69e3c75f
JB
4324 err = vm_insert_page(vma, start, page);
4325 if (unlikely(err))
4326 goto out;
4327 start += PAGE_SIZE;
0e3125c7 4328 kaddr += PAGE_SIZE;
69e3c75f 4329 }
4ebf0ae2 4330 }
1da177e4 4331 }
69e3c75f 4332
4ebf0ae2 4333 atomic_inc(&po->mapped);
1da177e4
LT
4334 vma->vm_ops = &packet_mmap_ops;
4335 err = 0;
4336
4337out:
905db440 4338 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4339 return err;
4340}
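/*
 * Illustrative userspace sketch (not part of this file): packet_mmap()
 * above only accepts a mapping at offset 0 whose size equals the RX ring
 * plus the TX ring, RX first.  Assumed usage once the ring(s) are set up
 * (req_rx/req_tx are the structs passed to PACKET_RX_RING/PACKET_TX_RING):
 *
 *     size_t rx_size = (size_t)req_rx.tp_block_size * req_rx.tp_block_nr;
 *     size_t tx_size = (size_t)req_tx.tp_block_size * req_tx.tp_block_nr;
 *     char *ring = mmap(NULL, rx_size + tx_size, PROT_READ | PROT_WRITE,
 *                       MAP_SHARED, fd, 0);
 *
 * RX frames then start at ring and TX frames at ring + rx_size.
 */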
1da177e4 4341
90ddc4f0 4342static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4343 .family = PF_PACKET,
4344 .owner = THIS_MODULE,
4345 .release = packet_release,
4346 .bind = packet_bind_spkt,
4347 .connect = sock_no_connect,
4348 .socketpair = sock_no_socketpair,
4349 .accept = sock_no_accept,
4350 .getname = packet_getname_spkt,
4351 .poll = datagram_poll,
4352 .ioctl = packet_ioctl,
4353 .listen = sock_no_listen,
4354 .shutdown = sock_no_shutdown,
4355 .setsockopt = sock_no_setsockopt,
4356 .getsockopt = sock_no_getsockopt,
4357 .sendmsg = packet_sendmsg_spkt,
4358 .recvmsg = packet_recvmsg,
4359 .mmap = sock_no_mmap,
4360 .sendpage = sock_no_sendpage,
4361};
1da177e4 4362
90ddc4f0 4363static const struct proto_ops packet_ops = {
1da177e4
LT
4364 .family = PF_PACKET,
4365 .owner = THIS_MODULE,
4366 .release = packet_release,
4367 .bind = packet_bind,
4368 .connect = sock_no_connect,
4369 .socketpair = sock_no_socketpair,
4370 .accept = sock_no_accept,
1ce4f28b 4371 .getname = packet_getname,
1da177e4
LT
4372 .poll = packet_poll,
4373 .ioctl = packet_ioctl,
4374 .listen = sock_no_listen,
4375 .shutdown = sock_no_shutdown,
4376 .setsockopt = packet_setsockopt,
4377 .getsockopt = packet_getsockopt,
719c44d3
WB
4378#ifdef CONFIG_COMPAT
4379 .compat_setsockopt = compat_packet_setsockopt,
4380#endif
1da177e4
LT
4381 .sendmsg = packet_sendmsg,
4382 .recvmsg = packet_recvmsg,
4383 .mmap = packet_mmap,
4384 .sendpage = sock_no_sendpage,
4385};
4386
ec1b4cf7 4387static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4388 .family = PF_PACKET,
4389 .create = packet_create,
4390 .owner = THIS_MODULE,
4391};
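/*
 * Illustrative userspace sketch (not part of this file): packet_create(),
 * reached through this family table by a plain socket(2) call, hands back
 * a socket served by packet_ops above.  Assumed minimal usage ("eth0" is
 * a placeholder interface name):
 *
 *     int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *     struct sockaddr_ll sll = {
 *             .sll_family   = AF_PACKET,
 *             .sll_protocol = htons(ETH_P_ALL),
 *             .sll_ifindex  = if_nametoindex("eth0"),
 *     };
 *
 *     bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */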
4392
4393static struct notifier_block packet_netdev_notifier = {
40d4e3df 4394 .notifier_call = packet_notifier,
1da177e4
LT
4395};
4396
4397#ifdef CONFIG_PROC_FS
1da177e4
LT
4398
4399static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4400 __acquires(RCU)
1da177e4 4401{
e372c414 4402 struct net *net = seq_file_net(seq);
808f5114 4403
4404 rcu_read_lock();
4405 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4406}
4407
4408static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4409{
1bf40954 4410 struct net *net = seq_file_net(seq);
808f5114 4411 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4412}
4413
4414static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4415 __releases(RCU)
1da177e4 4416{
808f5114 4417 rcu_read_unlock();
1da177e4
LT
4418}
4419
1ce4f28b 4420static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4421{
4422 if (v == SEQ_START_TOKEN)
4423 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4424 else {
b7ceabd9 4425 struct sock *s = sk_entry(v);
1da177e4
LT
4426 const struct packet_sock *po = pkt_sk(s);
4427
4428 seq_printf(seq,
71338aa7 4429 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4
LT
4430 s,
4431 atomic_read(&s->sk_refcnt),
4432 s->sk_type,
4433 ntohs(po->num),
4434 po->ifindex,
4435 po->running,
4436 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4437 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4438 sock_i_ino(s));
1da177e4
LT
4439 }
4440
4441 return 0;
4442}
4443
56b3d975 4444static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4445 .start = packet_seq_start,
4446 .next = packet_seq_next,
4447 .stop = packet_seq_stop,
4448 .show = packet_seq_show,
4449};
4450
4451static int packet_seq_open(struct inode *inode, struct file *file)
4452{
e372c414
DL
4453 return seq_open_net(inode, file, &packet_seq_ops,
4454 sizeof(struct seq_net_private));
1da177e4
LT
4455}
4456
da7071d7 4457static const struct file_operations packet_seq_fops = {
1da177e4
LT
4458 .owner = THIS_MODULE,
4459 .open = packet_seq_open,
4460 .read = seq_read,
4461 .llseek = seq_lseek,
e372c414 4462 .release = seq_release_net,
1da177e4
LT
4463};
4464
4465#endif
4466
2c8c1e72 4467static int __net_init packet_net_init(struct net *net)
d12d01d6 4468{
0fa7fa98 4469 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4470 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4471
d4beaa66 4472 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
d12d01d6
DL
4473 return -ENOMEM;
4474
4475 return 0;
4476}
4477
2c8c1e72 4478static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4479{
ece31ffd 4480 remove_proc_entry("packet", net->proc_net);
d12d01d6
DL
4481}
4482
4483static struct pernet_operations packet_net_ops = {
4484 .init = packet_net_init,
4485 .exit = packet_net_exit,
4486};
4487
4488
1da177e4
LT
4489static void __exit packet_exit(void)
4490{
1da177e4 4491 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4492 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4493 sock_unregister(PF_PACKET);
4494 proto_unregister(&packet_proto);
4495}
4496
4497static int __init packet_init(void)
4498{
4499 int rc = proto_register(&packet_proto, 0);
4500
4501 if (rc != 0)
4502 goto out;
4503
4504 sock_register(&packet_family_ops);
d12d01d6 4505 register_pernet_subsys(&packet_net_ops);
1da177e4 4506 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
4507out:
4508 return rc;
4509}
4510
4511module_init(packet_init);
4512module_exit(packet_exit);
4513MODULE_LICENSE("GPL");
4514MODULE_ALIAS_NETPROTO(PF_PACKET);