/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
#include <net/compat.h>

#include "internal.h"

/*
   Assumptions:
   - if the device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside of
     the device, but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit in the reserved space (tunnels); other ones are silly
     (PPP).
   - a packet socket receives packets with the ll header already pulled,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
                 header. PPP does this, which is wrong, because it introduces
                 asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Resume
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by the device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */

/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define PGV_FROM_VMALLOC 1

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);

struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};

#define vio_le() virtio_legacy_is_little_endian()

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

static int packet_direct_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct sk_buff *orig_skb = skb;
	struct netdev_queue *txq;
	int ret = NETDEV_TX_BUSY;

	if (unlikely(!netif_running(dev) ||
		     !netif_carrier_ok(dev)))
		goto drop;

	skb = validate_xmit_skb_list(skb, dev);
	if (skb != orig_skb)
		goto drop;

	txq = skb_get_tx_queue(dev, skb);

	local_bh_disable();

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_xmit_frozen_or_drv_stopped(txq))
		ret = netdev_start_xmit(skb, dev, txq, false);
	HARD_TX_UNLOCK(dev, txq);

	local_bh_enable();

	if (!dev_xmit_complete(ret))
		kfree_skb(skb);

	return ret;
drop:
	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return NET_XMIT_DROP;
}
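
/* Illustrative note (not part of the original source): packet_direct_xmit()
 * is only used as po->xmit when the socket requests qdisc bypass, e.g. from
 * userspace (hypothetical snippet):
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));
 *
 * Frames sent on such a socket skip the traffic-control layer and are handed
 * straight to the driver by the path above.
 */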

static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}

static bool packet_use_direct_xmit(const struct packet_sock *po)
{
	return po->xmit == packet_direct_xmit;
}

static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
}

static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	u16 queue_index;

	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL,
						    __packet_pick_tx_queue);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = __packet_pick_tx_queue(dev, skb);
	}

	skb_set_queue_mapping(skb, queue_index);
}

/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held.  If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}

static inline struct page * __pure pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}
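
/* Illustrative note (not part of the original source): tp_status is the word
 * the kernel and userspace use to hand ring frames back and forth over the
 * mmap()ed ring.  For the RX ring a typical (hypothetical) reader looks like:
 *
 *	struct tpacket2_hdr *hdr = frame;
 *
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);		// wait for the kernel
 *	// ... consume the frame ...
 *	hdr->tp_status = TP_STATUS_KERNEL;	// hand the frame back
 *
 * The smp_wmb()/smp_rmb() pair in __packet_set_status()/__packet_get_status()
 * orders the frame contents against this status word on both sides.
 */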

static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}

static void *packet_lookup_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		unsigned int position,
		int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}
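
/* Illustrative example (not part of the original source): with
 * tp_frame_size = 2048 and tp_block_size = 8192 there are four frames per
 * block, so position 10 resolves to pg_vec_pos = 10 / 4 = 2 and
 * frame_offset = 10 % 4 = 2, i.e. the frame at
 * pg_vec[2].buffer + 2 * 2048.
 */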

static void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_init_blk_timer(struct packet_sock *po,
		struct tpacket_kbdq_core *pkc,
		void (*func) (unsigned long))
{
	init_timer(&pkc->retire_blk_timer);
	pkc->retire_blk_timer.data = (long)po;
	pkc->retire_blk_timer.function = func;
	pkc->retire_blk_timer.expires = jiffies;
}

static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}

static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_link_ksettings ecmd;
	int err;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_link_ksettings(dev, &ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (ecmd.base.speed < SPEED_1000 ||
		    ecmd.base.speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = ecmd.base.speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}
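
/* Illustrative example (not part of the original source): for a 1 MiB block
 * on a 1 Gbit/s link, mbits = (1048576 * 8) / (1024 * 1024) = 8,
 * msec = 1 and div = 1000 / 1000 = 1, so the function returns 8 + 1 = 9 ms,
 * roughly the time it takes to fill the block (see the timer-logic comment
 * below).
 */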

static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po);
	prb_open_block(p1, pbd);
}

/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
static void prb_retire_rx_blk_timer_expired(unsigned long data)
{
	struct packet_sock *po = (struct packet_sock *)data;
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 *
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			if (!BLOCK_NUM_PKTS(pbd)) {
				/* An empty block. Just refresh the timer. */
				goto refresh_timer;
			}
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 * lagging behind.
			 */
			if (prb_curr_blk_in_use(pkc, pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. Queue was frozen, user-space caught up,
				 * now the link went idle && the timer fired.
				 * We don't have a block to close. So we open this
				 * block and restart the timer.
				 * Opening a block thaws the queue and restarts
				 * the timer. Thawing/timer-refresh is a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}

static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header (we know the header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}

/*
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note: We DON'T refresh the timer on purpose.
 *	Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
	struct sock *sk = &po->sk;

	if (po->stats.stats3.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time.
		 *
		 * It shouldn't really happen as we don't close empty
		 * blocks. See prb_retire_rx_blk_timer_expired().
		 */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	getnstimeofday(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}

/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
				  struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}

static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}

static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
				      struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (skb_vlan_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->hv1.tp_vlan_tpid = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_padding = 0;
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd  = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}

/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status,
					    unsigned int len
					    )
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if the last block which caused the queue to freeze
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pkc, pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * Opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available. user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}

static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, status, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return NULL;
	}
}

static void *prb_lookup_block(struct packet_sock *po,
				     struct packet_ring_buffer *rb,
				     unsigned int idx,
				     int status)
{
	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;
	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}

/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}

static void packet_increment_rx_head(struct packet_sock *po,
					    struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

static void packet_inc_pending(struct packet_ring_buffer *rb)
{
	this_cpu_inc(*rb->pending_refcnt);
}

static void packet_dec_pending(struct packet_ring_buffer *rb)
{
	this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
	unsigned int refcnt = 0;
	int cpu;

	/* We don't use pending refcount in rx_ring. */
	if (rb->pending_refcnt == NULL)
		return 0;

	for_each_possible_cpu(cpu)
		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

	return refcnt;
}

static int packet_alloc_pending(struct packet_sock *po)
{
	po->rx_ring.pending_refcnt = NULL;

	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
	if (unlikely(po->tx_ring.pending_refcnt == NULL))
		return -ENOBUFS;

	return 0;
}

static void packet_free_pending(struct packet_sock *po)
{
	free_percpu(po->tx_ring.pending_refcnt);
}

#define ROOM_POW_OFF	2
#define ROOM_NONE	0x0
#define ROOM_LOW	0x1
#define ROOM_NORMAL	0x2

static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.frame_max + 1;
	idx = po->rx_ring.head;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.prb_bdqc.knum_blocks;
	idx = po->rx_ring.prb_bdqc.kactive_blk_num;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}
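
/* Illustrative example (not part of the original source): ROOM_POW_OFF = 2
 * means the "normal" check probes the slot one quarter of the ring ahead of
 * the current head (idx = head + len / 4, wrapping at len).  If that frame
 * or block is still TP_STATUS_KERNEL, roughly a quarter of the ring is free
 * and the caller reports ROOM_NORMAL; with pow_off == 0 only the slot at the
 * head itself is checked, which maps to ROOM_LOW.
 */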
77f65ebd 1263
2ccdbaa6 1264static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
9954729b
WB
1265{
1266 struct sock *sk = &po->sk;
1267 int ret = ROOM_NONE;
1268
1269 if (po->prot_hook.func != tpacket_rcv) {
1270 int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
2ccdbaa6 1271 - (skb ? skb->truesize : 0);
9954729b
WB
1272 if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
1273 return ROOM_NORMAL;
1274 else if (avail > 0)
1275 return ROOM_LOW;
1276 else
1277 return ROOM_NONE;
1278 }
77f65ebd 1279
9954729b
WB
1280 if (po->tp_version == TPACKET_V3) {
1281 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1282 ret = ROOM_NORMAL;
1283 else if (__tpacket_v3_has_room(po, 0))
1284 ret = ROOM_LOW;
1285 } else {
1286 if (__tpacket_has_room(po, ROOM_POW_OFF))
1287 ret = ROOM_NORMAL;
1288 else if (__tpacket_has_room(po, 0))
1289 ret = ROOM_LOW;
1290 }
2ccdbaa6
WB
1291
1292 return ret;
1293}
1294
1295static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1296{
1297 int ret;
1298 bool has_room;
1299
54d7c01d
WB
1300 spin_lock_bh(&po->sk.sk_receive_queue.lock);
1301 ret = __packet_rcv_has_room(po, skb);
2ccdbaa6
WB
1302 has_room = ret == ROOM_NORMAL;
1303 if (po->pressure == has_room)
54d7c01d
WB
1304 po->pressure = !has_room;
1305 spin_unlock_bh(&po->sk.sk_receive_queue.lock);
77f65ebd 1306
9954729b 1307 return ret;
77f65ebd
WB
1308}
1309
1da177e4
LT
1310static void packet_sock_destruct(struct sock *sk)
1311{
ed85b565
RC
1312 skb_queue_purge(&sk->sk_error_queue);
1313
547b792c
IJ
1314 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1315 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1da177e4
LT
1316
1317 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1318 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1319 return;
1320 }
1321
17ab56a2 1322 sk_refcnt_debug_dec(sk);
1da177e4
LT
1323}
1324
3b3a5b0a
WB
1325static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1326{
1327 u32 rxhash;
1328 int i, count = 0;
1329
1330 rxhash = skb_get_hash(skb);
1331 for (i = 0; i < ROLLOVER_HLEN; i++)
1332 if (po->rollover->history[i] == rxhash)
1333 count++;
1334
1335 po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
1336 return count > (ROLLOVER_HLEN >> 1);
1337}
1338
77f65ebd
WB
1339static unsigned int fanout_demux_hash(struct packet_fanout *f,
1340 struct sk_buff *skb,
1341 unsigned int num)
dc99f600 1342{
eb70db87 1343 return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
dc99f600
DM
1344}
1345
77f65ebd
WB
1346static unsigned int fanout_demux_lb(struct packet_fanout *f,
1347 struct sk_buff *skb,
1348 unsigned int num)
dc99f600 1349{
468479e6 1350 unsigned int val = atomic_inc_return(&f->rr_cur);
dc99f600 1351
468479e6 1352 return val % num;
77f65ebd
WB
1353}
1354
1355static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1356 struct sk_buff *skb,
1357 unsigned int num)
1358{
1359 return smp_processor_id() % num;
dc99f600
DM
1360}
1361
5df0ddfb
DB
1362static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1363 struct sk_buff *skb,
1364 unsigned int num)
1365{
f337db64 1366 return prandom_u32_max(num);
5df0ddfb
DB
1367}
1368
77f65ebd
WB
1369static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1370 struct sk_buff *skb,
ad377cab 1371 unsigned int idx, bool try_self,
77f65ebd 1372 unsigned int num)
95ec3eb4 1373{
4633c9e0 1374 struct packet_sock *po, *po_next, *po_skip = NULL;
a9b63918 1375 unsigned int i, j, room = ROOM_NONE;
95ec3eb4 1376
0648ab70 1377 po = pkt_sk(f->arr[idx]);
3b3a5b0a
WB
1378
1379 if (try_self) {
1380 room = packet_rcv_has_room(po, skb);
1381 if (room == ROOM_NORMAL ||
1382 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1383 return idx;
4633c9e0 1384 po_skip = po;
3b3a5b0a 1385 }
ad377cab 1386
0648ab70 1387 i = j = min_t(int, po->rollover->sock, num - 1);
77f65ebd 1388 do {
2ccdbaa6 1389 po_next = pkt_sk(f->arr[i]);
4633c9e0 1390 if (po_next != po_skip && !po_next->pressure &&
2ccdbaa6 1391 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
77f65ebd 1392 if (i != j)
0648ab70 1393 po->rollover->sock = i;
a9b63918
WB
1394 atomic_long_inc(&po->rollover->num);
1395 if (room == ROOM_LOW)
1396 atomic_long_inc(&po->rollover->num_huge);
77f65ebd
WB
1397 return i;
1398 }
ad377cab 1399
77f65ebd
WB
1400 if (++i == num)
1401 i = 0;
1402 } while (i != j);
1403
a9b63918 1404 atomic_long_inc(&po->rollover->num_failed);
77f65ebd
WB
1405 return idx;
1406}
1407
2d36097d
NH
1408static unsigned int fanout_demux_qm(struct packet_fanout *f,
1409 struct sk_buff *skb,
1410 unsigned int num)
1411{
1412 return skb_get_queue_mapping(skb) % num;
1413}
1414
47dceb8e
WB
1415static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1416 struct sk_buff *skb,
1417 unsigned int num)
1418{
1419 struct bpf_prog *prog;
1420 unsigned int ret = 0;
1421
1422 rcu_read_lock();
1423 prog = rcu_dereference(f->bpf_prog);
1424 if (prog)
ff936a04 1425 ret = bpf_prog_run_clear_cb(prog, skb) % num;
47dceb8e
WB
1426 rcu_read_unlock();
1427
1428 return ret;
1429}
1430
77f65ebd
WB
1431static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1432{
1433 return f->flags & (flag >> 8);
95ec3eb4
DM
1434}
1435
95ec3eb4
DM
1436static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1437 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1438{
1439 struct packet_fanout *f = pt->af_packet_priv;
f98f4514 1440 unsigned int num = READ_ONCE(f->num_members);
19bcf9f2 1441 struct net *net = read_pnet(&f->net);
dc99f600 1442 struct packet_sock *po;
77f65ebd 1443 unsigned int idx;
dc99f600 1444
19bcf9f2 1445 if (!net_eq(dev_net(dev), net) || !num) {
dc99f600
DM
1446 kfree_skb(skb);
1447 return 0;
1448 }
1449
3f34b24a 1450 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
19bcf9f2 1451 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
3f34b24a
AD
1452 if (!skb)
1453 return 0;
1454 }
95ec3eb4
DM
1455 switch (f->type) {
1456 case PACKET_FANOUT_HASH:
1457 default:
77f65ebd 1458 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1459 break;
1460 case PACKET_FANOUT_LB:
77f65ebd 1461 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1462 break;
1463 case PACKET_FANOUT_CPU:
77f65ebd
WB
1464 idx = fanout_demux_cpu(f, skb, num);
1465 break;
5df0ddfb
DB
1466 case PACKET_FANOUT_RND:
1467 idx = fanout_demux_rnd(f, skb, num);
1468 break;
2d36097d
NH
1469 case PACKET_FANOUT_QM:
1470 idx = fanout_demux_qm(f, skb, num);
1471 break;
77f65ebd 1472 case PACKET_FANOUT_ROLLOVER:
ad377cab 1473 idx = fanout_demux_rollover(f, skb, 0, false, num);
95ec3eb4 1474 break;
47dceb8e 1475 case PACKET_FANOUT_CBPF:
f2e52095 1476 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1477 idx = fanout_demux_bpf(f, skb, num);
1478 break;
dc99f600
DM
1479 }
1480
ad377cab
WB
1481 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1482 idx = fanout_demux_rollover(f, skb, idx, true, num);
dc99f600 1483
ad377cab 1484 po = pkt_sk(f->arr[idx]);
dc99f600
DM
1485 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1486}
1487
fff3321d
PE
1488DEFINE_MUTEX(fanout_mutex);
1489EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600
DM
1490static LIST_HEAD(fanout_list);
1491
1492static void __fanout_link(struct sock *sk, struct packet_sock *po)
1493{
1494 struct packet_fanout *f = po->fanout;
1495
1496 spin_lock(&f->lock);
1497 f->arr[f->num_members] = sk;
1498 smp_wmb();
1499 f->num_members++;
2bd624b4
AS
1500 if (f->num_members == 1)
1501 dev_add_pack(&f->prot_hook);
dc99f600
DM
1502 spin_unlock(&f->lock);
1503}
1504
1505static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1506{
1507 struct packet_fanout *f = po->fanout;
1508 int i;
1509
1510 spin_lock(&f->lock);
1511 for (i = 0; i < f->num_members; i++) {
1512 if (f->arr[i] == sk)
1513 break;
1514 }
1515 BUG_ON(i >= f->num_members);
1516 f->arr[i] = f->arr[f->num_members - 1];
1517 f->num_members--;
2bd624b4
AS
1518 if (f->num_members == 0)
1519 __dev_remove_pack(&f->prot_hook);
dc99f600
DM
1520 spin_unlock(&f->lock);
1521}
1522
d4dd8aee 1523static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
c0de08d0 1524{
161642e2
ED
1525 if (sk->sk_family != PF_PACKET)
1526 return false;
c0de08d0 1527
161642e2 1528 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
c0de08d0
EL
1529}
1530
47dceb8e
WB
1531static void fanout_init_data(struct packet_fanout *f)
1532{
1533 switch (f->type) {
1534 case PACKET_FANOUT_LB:
1535 atomic_set(&f->rr_cur, 0);
1536 break;
1537 case PACKET_FANOUT_CBPF:
f2e52095 1538 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1539 RCU_INIT_POINTER(f->bpf_prog, NULL);
1540 break;
1541 }
1542}
1543
1544static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1545{
1546 struct bpf_prog *old;
1547
1548 spin_lock(&f->lock);
1549 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1550 rcu_assign_pointer(f->bpf_prog, new);
1551 spin_unlock(&f->lock);
1552
1553 if (old) {
1554 synchronize_net();
1555 bpf_prog_destroy(old);
1556 }
1557}
1558
1559static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1560 unsigned int len)
1561{
1562 struct bpf_prog *new;
1563 struct sock_fprog fprog;
1564 int ret;
1565
1566 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1567 return -EPERM;
1568 if (len != sizeof(fprog))
1569 return -EINVAL;
1570 if (copy_from_user(&fprog, data, len))
1571 return -EFAULT;
1572
bab18991 1573 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
47dceb8e
WB
1574 if (ret)
1575 return ret;
1576
1577 __fanout_set_data_bpf(po->fanout, new);
1578 return 0;
1579}
1580
f2e52095
WB
1581static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1582 unsigned int len)
1583{
1584 struct bpf_prog *new;
1585 u32 fd;
1586
1587 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1588 return -EPERM;
1589 if (len != sizeof(fd))
1590 return -EINVAL;
1591 if (copy_from_user(&fd, data, len))
1592 return -EFAULT;
1593
113214be 1594 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
f2e52095
WB
1595 if (IS_ERR(new))
1596 return PTR_ERR(new);
f2e52095
WB
1597
1598 __fanout_set_data_bpf(po->fanout, new);
1599 return 0;
1600}
1601
47dceb8e
WB
1602static int fanout_set_data(struct packet_sock *po, char __user *data,
1603 unsigned int len)
1604{
1605 switch (po->fanout->type) {
1606 case PACKET_FANOUT_CBPF:
1607 return fanout_set_data_cbpf(po, data, len);
f2e52095
WB
1608 case PACKET_FANOUT_EBPF:
1609 return fanout_set_data_ebpf(po, data, len);
47dceb8e
WB
1610 default:
1611 return -EINVAL;
1612 };
1613}
1614
1615static void fanout_release_data(struct packet_fanout *f)
1616{
1617 switch (f->type) {
1618 case PACKET_FANOUT_CBPF:
f2e52095 1619 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1620 __fanout_set_data_bpf(f, NULL);
1621 };
1622}
1623
7736d33f 1624static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600 1625{
d199fab6 1626 struct packet_rollover *rollover = NULL;
dc99f600
DM
1627 struct packet_sock *po = pkt_sk(sk);
1628 struct packet_fanout *f, *match;
7736d33f 1629 u8 type = type_flags & 0xff;
77f65ebd 1630 u8 flags = type_flags >> 8;
dc99f600
DM
1631 int err;
1632
1633 switch (type) {
77f65ebd
WB
1634 case PACKET_FANOUT_ROLLOVER:
1635 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1636 return -EINVAL;
dc99f600
DM
1637 case PACKET_FANOUT_HASH:
1638 case PACKET_FANOUT_LB:
95ec3eb4 1639 case PACKET_FANOUT_CPU:
5df0ddfb 1640 case PACKET_FANOUT_RND:
2d36097d 1641 case PACKET_FANOUT_QM:
47dceb8e 1642 case PACKET_FANOUT_CBPF:
f2e52095 1643 case PACKET_FANOUT_EBPF:
dc99f600
DM
1644 break;
1645 default:
1646 return -EINVAL;
1647 }
1648
d199fab6
ED
1649 mutex_lock(&fanout_mutex);
1650
1651 err = -EINVAL;
dc99f600 1652 if (!po->running)
d199fab6 1653 goto out;
dc99f600 1654
d199fab6 1655 err = -EALREADY;
dc99f600 1656 if (po->fanout)
d199fab6 1657 goto out;
dc99f600 1658
4633c9e0
WB
1659 if (type == PACKET_FANOUT_ROLLOVER ||
1660 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
d199fab6
ED
1661 err = -ENOMEM;
1662 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1663 if (!rollover)
1664 goto out;
1665 atomic_long_set(&rollover->num, 0);
1666 atomic_long_set(&rollover->num_huge, 0);
1667 atomic_long_set(&rollover->num_failed, 0);
1668 po->rollover = rollover;
0648ab70
WB
1669 }
1670
dc99f600
DM
1671 match = NULL;
1672 list_for_each_entry(f, &fanout_list, list) {
1673 if (f->id == id &&
1674 read_pnet(&f->net) == sock_net(sk)) {
1675 match = f;
1676 break;
1677 }
1678 }
afe62c68 1679 err = -EINVAL;
77f65ebd 1680 if (match && match->flags != flags)
afe62c68 1681 goto out;
dc99f600 1682 if (!match) {
afe62c68 1683 err = -ENOMEM;
dc99f600 1684 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1685 if (!match)
1686 goto out;
1687 write_pnet(&match->net, sock_net(sk));
1688 match->id = id;
1689 match->type = type;
77f65ebd 1690 match->flags = flags;
afe62c68
ED
1691 INIT_LIST_HEAD(&match->list);
1692 spin_lock_init(&match->lock);
1693 atomic_set(&match->sk_ref, 0);
47dceb8e 1694 fanout_init_data(match);
afe62c68
ED
1695 match->prot_hook.type = po->prot_hook.type;
1696 match->prot_hook.dev = po->prot_hook.dev;
1697 match->prot_hook.func = packet_rcv_fanout;
1698 match->prot_hook.af_packet_priv = match;
c0de08d0 1699 match->prot_hook.id_match = match_fanout_group;
afe62c68 1700 list_add(&match->list, &fanout_list);
dc99f600 1701 }
afe62c68
ED
1702 err = -EINVAL;
1703 if (match->type == type &&
1704 match->prot_hook.type == po->prot_hook.type &&
1705 match->prot_hook.dev == po->prot_hook.dev) {
1706 err = -ENOSPC;
1707 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1708 __dev_remove_pack(&po->prot_hook);
1709 po->fanout = match;
1710 atomic_inc(&match->sk_ref);
1711 __fanout_link(sk, po);
1712 err = 0;
dc99f600
DM
1713 }
1714 }
afe62c68 1715out:
d199fab6
ED
1716 if (err && rollover) {
1717 kfree(rollover);
0648ab70
WB
1718 po->rollover = NULL;
1719 }
d199fab6 1720 mutex_unlock(&fanout_mutex);
dc99f600
DM
1721 return err;
1722}
1723
2bd624b4
AS
1724/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1725 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1726 * It is the responsibility of the caller to call fanout_release_data() and
1727 * free the returned packet_fanout (after synchronize_net())
1728 */
1729static struct packet_fanout *fanout_release(struct sock *sk)
dc99f600
DM
1730{
1731 struct packet_sock *po = pkt_sk(sk);
1732 struct packet_fanout *f;
1733
fff3321d 1734 mutex_lock(&fanout_mutex);
d199fab6
ED
1735 f = po->fanout;
1736 if (f) {
1737 po->fanout = NULL;
1738
2bd624b4 1739 if (atomic_dec_and_test(&f->sk_ref))
d199fab6 1740 list_del(&f->list);
2bd624b4
AS
1741 else
1742 f = NULL;
dc99f600 1743
d199fab6
ED
1744 if (po->rollover)
1745 kfree_rcu(po->rollover, rcu);
dc99f600
DM
1746 }
1747 mutex_unlock(&fanout_mutex);
2bd624b4
AS
1748
1749 return f;
dc99f600 1750}
1da177e4 1751
3c70c132
DB
1752static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1753 struct sk_buff *skb)
1754{
1755 /* Earlier code assumed this would be a VLAN pkt, double-check
1756 * this now that we have the actual packet in hand. We can only
1757 * do this check on Ethernet devices.
1758 */
1759 if (unlikely(dev->type != ARPHRD_ETHER))
1760 return false;
1761
1762 skb_reset_mac_header(skb);
1763 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1764}
1765
90ddc4f0 1766static const struct proto_ops packet_ops;
1da177e4 1767
90ddc4f0 1768static const struct proto_ops packet_ops_spkt;
1da177e4 1769
40d4e3df
ED
1770static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1771 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1772{
1773 struct sock *sk;
1774 struct sockaddr_pkt *spkt;
1775
1776 /*
1777 * When we registered the protocol we saved the socket in the data
1778 * field for just this event.
1779 */
1780
1781 sk = pt->af_packet_priv;
1ce4f28b 1782
1da177e4
LT
1783 /*
1784 * Yank back the headers [hope the device set this
1785 * right or kerboom...]
1786 *
1787 * Incoming packets have ll header pulled,
1788 * push it back.
1789 *
98e399f8 1790 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1791 * so that this procedure is noop.
1792 */
1793
1794 if (skb->pkt_type == PACKET_LOOPBACK)
1795 goto out;
1796
09ad9bc7 1797 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1798 goto out;
1799
40d4e3df
ED
1800 skb = skb_share_check(skb, GFP_ATOMIC);
1801 if (skb == NULL)
1da177e4
LT
1802 goto oom;
1803
1804 /* drop any routing info */
adf30907 1805 skb_dst_drop(skb);
1da177e4 1806
84531c24
PO
1807 /* drop conntrack reference */
1808 nf_reset(skb);
1809
ffbc6111 1810 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1811
98e399f8 1812 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1813
1814 /*
1815 * The SOCK_PACKET socket receives _all_ frames.
1816 */
1817
1818 spkt->spkt_family = dev->type;
1819 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1820 spkt->spkt_protocol = skb->protocol;
1821
1822 /*
1823 * Charge the memory to the socket. This is done specifically
1824 * to prevent sockets from using up all the memory.
1825 */
1826
40d4e3df 1827 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1828 return 0;
1829
1830out:
1831 kfree_skb(skb);
1832oom:
1833 return 0;
1834}
1835
1836
1837/*
1838 * Output a raw packet to the device layer. This bypasses all the other
1839 * protocol layers, so you must supply it with a complete frame.
1840 */
1ce4f28b 1841
1b784140
YX
1842static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1843 size_t len)
1da177e4
LT
1844{
1845 struct sock *sk = sock->sk;
342dfc30 1846 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1847 struct sk_buff *skb = NULL;
1da177e4 1848 struct net_device *dev;
c14ac945 1849 struct sockcm_cookie sockc;
40d4e3df 1850 __be16 proto = 0;
1da177e4 1851 int err;
3bdc0eba 1852 int extra_len = 0;
1ce4f28b 1853
1da177e4 1854 /*
1ce4f28b 1855 * Get and verify the address.
1da177e4
LT
1856 */
1857
40d4e3df 1858 if (saddr) {
1da177e4 1859 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1860 return -EINVAL;
1861 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1862 proto = saddr->spkt_protocol;
1863 } else
1864 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1865
1866 /*
1ce4f28b 1867 * Find the device first to size check it
1da177e4
LT
1868 */
1869
de74e92a 1870 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1871retry:
654d1f8a
ED
1872 rcu_read_lock();
1873 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1874 err = -ENODEV;
1875 if (dev == NULL)
1876 goto out_unlock;
1ce4f28b 1877
d5e76b0a
DM
1878 err = -ENETDOWN;
1879 if (!(dev->flags & IFF_UP))
1880 goto out_unlock;
1881
1da177e4 1882 /*
40d4e3df
ED
1883 * You may not queue a frame bigger than the mtu. This is the lowest level
1884 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1885 */
1ce4f28b 1886
3bdc0eba
BG
1887 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1888 if (!netif_supports_nofcs(dev)) {
1889 err = -EPROTONOSUPPORT;
1890 goto out_unlock;
1891 }
1892 extra_len = 4; /* We're doing our own CRC */
1893 }
1894
1da177e4 1895 err = -EMSGSIZE;
3bdc0eba 1896 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1897 goto out_unlock;
1898
1a35ca80
ED
1899 if (!skb) {
1900 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1901 int tlen = dev->needed_tailroom;
1a35ca80
ED
1902 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1903
1904 rcu_read_unlock();
4ce40912 1905 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1906 if (skb == NULL)
1907 return -ENOBUFS;
1908 /* FIXME: Save some space for broken drivers that write a hard
1909 * header at transmission time by themselves. PPP is the notable
1910 * one here. This should really be fixed at the driver level.
1911 */
1912 skb_reserve(skb, reserved);
1913 skb_reset_network_header(skb);
1914
1915 /* Try to align data part correctly */
1916 if (hhlen) {
1917 skb->data -= hhlen;
1918 skb->tail -= hhlen;
1919 if (len < hhlen)
1920 skb_reset_network_header(skb);
1921 }
6ce8e9ce 1922 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1923 if (err)
1924 goto out_free;
1925 goto retry;
1da177e4
LT
1926 }
1927
9ed988cd
WB
1928 if (!dev_validate_header(dev, skb->data, len)) {
1929 err = -EINVAL;
1930 goto out_unlock;
1931 }
3c70c132
DB
1932 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1933 !packet_extra_vlan_len_allowed(dev, skb)) {
1934 err = -EMSGSIZE;
1935 goto out_unlock;
57f89bfa 1936 }
1a35ca80 1937
edbe7746 1938 sockc.tsflags = sk->sk_tsflags;
c14ac945
SHY
1939 if (msg->msg_controllen) {
1940 err = sock_cmsg_send(sk, msg, &sockc);
f8e7718c 1941 if (unlikely(err))
c14ac945 1942 goto out_unlock;
c14ac945
SHY
1943 }
1944
1da177e4
LT
1945 skb->protocol = proto;
1946 skb->dev = dev;
1947 skb->priority = sk->sk_priority;
2d37a186 1948 skb->mark = sk->sk_mark;
bf84a010 1949
c14ac945 1950 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
1da177e4 1951
3bdc0eba
BG
1952 if (unlikely(extra_len == 4))
1953 skb->no_fcs = 1;
1954
40893fd0 1955 skb_probe_transport_header(skb, 0);
c1aad275 1956
1da177e4 1957 dev_queue_xmit(skb);
654d1f8a 1958 rcu_read_unlock();
40d4e3df 1959 return len;
1da177e4 1960
1da177e4 1961out_unlock:
654d1f8a 1962 rcu_read_unlock();
1a35ca80
ED
1963out_free:
1964 kfree_skb(skb);
1da177e4
LT
1965 return err;
1966}
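
A minimal userspace sketch of the legacy SOCK_PACKET transmit path served by packet_sendmsg_spkt(): the caller names the device in a struct sockaddr_pkt and must supply a complete link-layer frame. The descriptor fd (a socket(AF_PACKET, SOCK_PACKET, ...) socket), the device name "eth0" and the frame buffer are placeholders.

#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static ssize_t spkt_send(int fd, const void *frame, size_t frame_len)
{
        struct sockaddr_pkt spkt;

        memset(&spkt, 0, sizeof(spkt));
        spkt.spkt_family = AF_PACKET;
        strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device) - 1);
        spkt.spkt_protocol = htons(ETH_P_IP);   /* honoured because namelen == sizeof(spkt) */

        return sendto(fd, frame, frame_len, 0,
                      (struct sockaddr *)&spkt, sizeof(spkt));
}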
1da177e4 1967
ff936a04
AS
1968static unsigned int run_filter(struct sk_buff *skb,
1969 const struct sock *sk,
1970 unsigned int res)
1da177e4
LT
1971{
1972 struct sk_filter *filter;
fda9ef5d 1973
80f8f102
ED
1974 rcu_read_lock();
1975 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1976 if (filter != NULL)
ff936a04 1977 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 1978 rcu_read_unlock();
1da177e4 1979
dbcb5855 1980 return res;
1da177e4
LT
1981}
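
run_filter() honours whatever classic BPF program the user attached with SO_ATTACH_FILTER: a return value of 0 drops the packet, any other value caps the snap length. A minimal userspace sketch (fd is a placeholder packet socket) that truncates every delivered packet to 96 bytes:

#include <sys/socket.h>
#include <linux/filter.h>

static int attach_trunc_filter(int fd)
{
        struct sock_filter code[] = {
                { BPF_RET | BPF_K, 0, 0, 96 },  /* accept, snaplen = 96 */
        };
        struct sock_fprog prog = {
                .len = sizeof(code) / sizeof(code[0]),
                .filter = code,
        };

        return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
}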
1982
16cc1400
WB
1983static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
1984 size_t *len)
1985{
1986 struct virtio_net_hdr vnet_hdr;
1987
1988 if (*len < sizeof(vnet_hdr))
1989 return -EINVAL;
1990 *len -= sizeof(vnet_hdr);
1991
6391a448 1992 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true))
16cc1400
WB
1993 return -EINVAL;
1994
1995 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
1996}
1997
1da177e4 1998/*
62ab0812
ED
1999 * This function clones skbs lazily, in the hope that most packets
2000 * are discarded by BPF.
2001 *
2002 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
2003 * and skb->cb are mangled. It works because (and until) packets
2004 * falling here are owned by current CPU. Output packets are cloned
2005 * by dev_queue_xmit_nit(), input packets are processed by net_bh
2006 * sequentially, so that if we return skb to original state on exit,
2007 * we will not harm anyone.
1da177e4
LT
2008 */
2009
40d4e3df
ED
2010static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2011 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2012{
2013 struct sock *sk;
2014 struct sockaddr_ll *sll;
2015 struct packet_sock *po;
40d4e3df 2016 u8 *skb_head = skb->data;
1da177e4 2017 int skb_len = skb->len;
dbcb5855 2018 unsigned int snaplen, res;
da37845f 2019 bool is_drop_n_account = false;
1da177e4
LT
2020
2021 if (skb->pkt_type == PACKET_LOOPBACK)
2022 goto drop;
2023
2024 sk = pt->af_packet_priv;
2025 po = pkt_sk(sk);
2026
09ad9bc7 2027 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2028 goto drop;
2029
1da177e4
LT
2030 skb->dev = dev;
2031
3b04ddde 2032 if (dev->header_ops) {
1da177e4 2033 /* The device has an explicit notion of ll header,
62ab0812
ED
2034 * exported to higher levels.
2035 *
2036 * Otherwise, the device hides details of its frame
2037 * structure, so that the corresponding packet header is
2038 * never delivered to the user.
1da177e4
LT
2039 */
2040 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2041 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2042 else if (skb->pkt_type == PACKET_OUTGOING) {
2043 /* Special case: outgoing packets have ll header at head */
bbe735e4 2044 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2045 }
2046 }
2047
2048 snaplen = skb->len;
2049
dbcb5855
DM
2050 res = run_filter(skb, sk, snaplen);
2051 if (!res)
fda9ef5d 2052 goto drop_n_restore;
dbcb5855
DM
2053 if (snaplen > res)
2054 snaplen = res;
1da177e4 2055
0fd7bac6 2056 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2057 goto drop_n_acct;
2058
2059 if (skb_shared(skb)) {
2060 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2061 if (nskb == NULL)
2062 goto drop_n_acct;
2063
2064 if (skb_head != skb->data) {
2065 skb->data = skb_head;
2066 skb->len = skb_len;
2067 }
abc4e4fa 2068 consume_skb(skb);
1da177e4
LT
2069 skb = nskb;
2070 }
2071
b4772ef8 2072 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2073
2074 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2075 sll->sll_hatype = dev->type;
1da177e4 2076 sll->sll_pkttype = skb->pkt_type;
8032b464 2077 if (unlikely(po->origdev))
80feaacb
PWJ
2078 sll->sll_ifindex = orig_dev->ifindex;
2079 else
2080 sll->sll_ifindex = dev->ifindex;
1da177e4 2081
b95cce35 2082 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2083
2472d761
EB
2084 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2085 * Use their space for storing the original skb length.
2086 */
2087 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2088
1da177e4
LT
2089 if (pskb_trim(skb, snaplen))
2090 goto drop_n_acct;
2091
2092 skb_set_owner_r(skb, sk);
2093 skb->dev = NULL;
adf30907 2094 skb_dst_drop(skb);
1da177e4 2095
84531c24
PO
2096 /* drop conntrack reference */
2097 nf_reset(skb);
2098
1da177e4 2099 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2100 po->stats.stats1.tp_packets++;
3bc3b96f 2101 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
2102 __skb_queue_tail(&sk->sk_receive_queue, skb);
2103 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2104 sk->sk_data_ready(sk);
1da177e4
LT
2105 return 0;
2106
2107drop_n_acct:
da37845f 2108 is_drop_n_account = true;
7091fbd8 2109 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2110 po->stats.stats1.tp_drops++;
7091fbd8
WB
2111 atomic_inc(&sk->sk_drops);
2112 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
2113
2114drop_n_restore:
2115 if (skb_head != skb->data && skb_shared(skb)) {
2116 skb->data = skb_head;
2117 skb->len = skb_len;
2118 }
2119drop:
da37845f
WJ
2120 if (!is_drop_n_account)
2121 consume_skb(skb);
2122 else
2123 kfree_skb(skb);
1da177e4
LT
2124 return 0;
2125}
2126
40d4e3df
ED
2127static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2128 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2129{
2130 struct sock *sk;
2131 struct packet_sock *po;
2132 struct sockaddr_ll *sll;
184f489e 2133 union tpacket_uhdr h;
40d4e3df 2134 u8 *skb_head = skb->data;
1da177e4 2135 int skb_len = skb->len;
dbcb5855 2136 unsigned int snaplen, res;
f6fb8f10 2137 unsigned long status = TP_STATUS_USER;
bbd6ef87 2138 unsigned short macoff, netoff, hdrlen;
1da177e4 2139 struct sk_buff *copy_skb = NULL;
bbd6ef87 2140 struct timespec ts;
b9c32fb2 2141 __u32 ts_status;
da37845f 2142 bool is_drop_n_account = false;
1da177e4 2143
51846355
AW
2144 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2145 * We may add members to them up to the current aligned size without forcing
2146 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2147 */
2148 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2149 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2150
1da177e4
LT
2151 if (skb->pkt_type == PACKET_LOOPBACK)
2152 goto drop;
2153
2154 sk = pt->af_packet_priv;
2155 po = pkt_sk(sk);
2156
09ad9bc7 2157 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2158 goto drop;
2159
3b04ddde 2160 if (dev->header_ops) {
1da177e4 2161 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2162 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2163 else if (skb->pkt_type == PACKET_OUTGOING) {
2164 /* Special case: outgoing packets have ll header at head */
bbe735e4 2165 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2166 }
2167 }
2168
2169 snaplen = skb->len;
2170
dbcb5855
DM
2171 res = run_filter(skb, sk, snaplen);
2172 if (!res)
fda9ef5d 2173 goto drop_n_restore;
68c2e5de
AD
2174
2175 if (skb->ip_summed == CHECKSUM_PARTIAL)
2176 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2177 else if (skb->pkt_type != PACKET_OUTGOING &&
2178 (skb->ip_summed == CHECKSUM_COMPLETE ||
2179 skb_csum_unnecessary(skb)))
2180 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2181
dbcb5855
DM
2182 if (snaplen > res)
2183 snaplen = res;
1da177e4
LT
2184
2185 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2186 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2187 po->tp_reserve;
1da177e4 2188 } else {
95c96174 2189 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2190 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2191 (maclen < 16 ? 16 : maclen)) +
58d19b19
WB
2192 po->tp_reserve;
2193 if (po->has_vnet_hdr)
2194 netoff += sizeof(struct virtio_net_hdr);
1da177e4
LT
2195 macoff = netoff - maclen;
2196 }
f6fb8f10 2197 if (po->tp_version <= TPACKET_V2) {
2198 if (macoff + snaplen > po->rx_ring.frame_size) {
2199 if (po->copy_thresh &&
0fd7bac6 2200 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2201 if (skb_shared(skb)) {
2202 copy_skb = skb_clone(skb, GFP_ATOMIC);
2203 } else {
2204 copy_skb = skb_get(skb);
2205 skb_head = skb->data;
2206 }
2207 if (copy_skb)
2208 skb_set_owner_r(copy_skb, sk);
1da177e4 2209 }
f6fb8f10 2210 snaplen = po->rx_ring.frame_size - macoff;
2211 if ((int)snaplen < 0)
2212 snaplen = 0;
1da177e4 2213 }
dc808110
ED
2214 } else if (unlikely(macoff + snaplen >
2215 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2216 u32 nval;
2217
2218 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2219 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2220 snaplen, nval, macoff);
2221 snaplen = nval;
2222 if (unlikely((int)snaplen < 0)) {
2223 snaplen = 0;
2224 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2225 }
1da177e4 2226 }
1da177e4 2227 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2228 h.raw = packet_current_rx_frame(po, skb,
2229 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2230 if (!h.raw)
58d19b19 2231 goto drop_n_account;
f6fb8f10 2232 if (po->tp_version <= TPACKET_V2) {
2233 packet_increment_rx_head(po, &po->rx_ring);
2234 /*
2235 * LOSING will be reported until you read the stats,
2236 * because it's COR - Clear On Read.
2237 * Anyway, this is done for V1/V2 only, as V3 doesn't need it
2238 * at the packet level.
2239 */
ee80fbf3 2240 if (po->stats.stats1.tp_drops)
f6fb8f10 2241 status |= TP_STATUS_LOSING;
2242 }
ee80fbf3 2243 po->stats.stats1.tp_packets++;
1da177e4
LT
2244 if (copy_skb) {
2245 status |= TP_STATUS_COPY;
2246 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2247 }
1da177e4
LT
2248 spin_unlock(&sk->sk_receive_queue.lock);
2249
58d19b19 2250 if (po->has_vnet_hdr) {
5a213881
JR
2251 if (virtio_net_hdr_from_skb(skb, h.raw + macoff -
2252 sizeof(struct virtio_net_hdr),
6391a448 2253 vio_le(), true)) {
58d19b19
WB
2254 spin_lock(&sk->sk_receive_queue.lock);
2255 goto drop_n_account;
2256 }
2257 }
2258
bbd6ef87 2259 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2260
2261 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2262 getnstimeofday(&ts);
1da177e4 2263
b9c32fb2
DB
2264 status |= ts_status;
2265
bbd6ef87
PM
2266 switch (po->tp_version) {
2267 case TPACKET_V1:
2268 h.h1->tp_len = skb->len;
2269 h.h1->tp_snaplen = snaplen;
2270 h.h1->tp_mac = macoff;
2271 h.h1->tp_net = netoff;
4b457bdf
DB
2272 h.h1->tp_sec = ts.tv_sec;
2273 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2274 hdrlen = sizeof(*h.h1);
2275 break;
2276 case TPACKET_V2:
2277 h.h2->tp_len = skb->len;
2278 h.h2->tp_snaplen = snaplen;
2279 h.h2->tp_mac = macoff;
2280 h.h2->tp_net = netoff;
bbd6ef87
PM
2281 h.h2->tp_sec = ts.tv_sec;
2282 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2283 if (skb_vlan_tag_present(skb)) {
2284 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2285 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2286 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2287 } else {
2288 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2289 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2290 }
e4d26f4b 2291 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2292 hdrlen = sizeof(*h.h2);
2293 break;
f6fb8f10 2294 case TPACKET_V3:
2295 /* tp_nxt_offset and vlan are already populated above,
2296 * so DON'T clear those fields here.
2297 */
2298 h.h3->tp_status |= status;
2299 h.h3->tp_len = skb->len;
2300 h.h3->tp_snaplen = snaplen;
2301 h.h3->tp_mac = macoff;
2302 h.h3->tp_net = netoff;
f6fb8f10 2303 h.h3->tp_sec = ts.tv_sec;
2304 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2305 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2306 hdrlen = sizeof(*h.h3);
2307 break;
bbd6ef87
PM
2308 default:
2309 BUG();
2310 }
1da177e4 2311
bbd6ef87 2312 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2313 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2314 sll->sll_family = AF_PACKET;
2315 sll->sll_hatype = dev->type;
2316 sll->sll_protocol = skb->protocol;
2317 sll->sll_pkttype = skb->pkt_type;
8032b464 2318 if (unlikely(po->origdev))
80feaacb
PWJ
2319 sll->sll_ifindex = orig_dev->ifindex;
2320 else
2321 sll->sll_ifindex = dev->ifindex;
1da177e4 2322
e16aa207 2323 smp_mb();
f0d4eb29 2324
f6dafa95 2325#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2326 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2327 u8 *start, *end;
2328
f0d4eb29
DB
2329 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2330 macoff + snaplen);
2331
2332 for (start = h.raw; start < end; start += PAGE_SIZE)
2333 flush_dcache_page(pgv_to_page(start));
1da177e4 2334 }
f0d4eb29 2335 smp_wmb();
f6dafa95 2336#endif
f0d4eb29 2337
da413eec 2338 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2339 __packet_set_status(po, h.raw, status);
da413eec
DC
2340 sk->sk_data_ready(sk);
2341 } else {
f6fb8f10 2342 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2343 }
1da177e4
LT
2344
2345drop_n_restore:
2346 if (skb_head != skb->data && skb_shared(skb)) {
2347 skb->data = skb_head;
2348 skb->len = skb_len;
2349 }
2350drop:
da37845f
WJ
2351 if (!is_drop_n_account)
2352 consume_skb(skb);
2353 else
2354 kfree_skb(skb);
1da177e4
LT
2355 return 0;
2356
58d19b19 2357drop_n_account:
da37845f 2358 is_drop_n_account = true;
ee80fbf3 2359 po->stats.stats1.tp_drops++;
1da177e4
LT
2360 spin_unlock(&sk->sk_receive_queue.lock);
2361
676d2369 2362 sk->sk_data_ready(sk);
acb5d75b 2363 kfree_skb(copy_skb);
1da177e4
LT
2364 goto drop_n_restore;
2365}
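
tpacket_rcv() hands ownership of a ring frame to userspace by setting TP_STATUS_USER; a minimal TPACKET_V2 consumer sketch follows, assuming the ring was configured earlier with PACKET_RX_RING and mmap(). ring, nframes, frame_size and the process_frame() handler are placeholders.

#include <stdint.h>
#include <poll.h>
#include <linux/if_packet.h>

extern void process_frame(const uint8_t *data, unsigned int len);

static void rx_ring_loop(int fd, uint8_t *ring, unsigned int nframes,
                         unsigned int frame_size)
{
        unsigned int i = 0;

        for (;;) {
                struct tpacket2_hdr *hdr = (void *)(ring + i * frame_size);

                if (!(hdr->tp_status & TP_STATUS_USER)) {
                        struct pollfd pfd = { .fd = fd, .events = POLLIN };

                        poll(&pfd, 1, -1);      /* wait for the kernel to fill a frame */
                        continue;
                }

                /* tp_mac is the offset of the link-layer header,
                 * tp_snaplen the number of bytes actually captured */
                process_frame((uint8_t *)hdr + hdr->tp_mac, hdr->tp_snaplen);

                hdr->tp_status = TP_STATUS_KERNEL;      /* hand the slot back */
                i = (i + 1) % nframes;
        }
}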
2366
69e3c75f
JB
2367static void tpacket_destruct_skb(struct sk_buff *skb)
2368{
2369 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2370
69e3c75f 2371 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2372 void *ph;
b9c32fb2
DB
2373 __u32 ts;
2374
69e3c75f 2375 ph = skb_shinfo(skb)->destructor_arg;
b0138408 2376 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2377
2378 ts = __packet_set_timestamp(po, ph, skb);
2379 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2380 }
2381
2382 sock_wfree(skb);
2383}
2384
c72219b7
DB
2385static void tpacket_set_protocol(const struct net_device *dev,
2386 struct sk_buff *skb)
2387{
2388 if (dev->type == ARPHRD_ETHER) {
2389 skb_reset_mac_header(skb);
2390 skb->protocol = eth_hdr(skb)->h_proto;
2391 }
2392}
2393
16cc1400
WB
2394static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2395{
16cc1400
WB
2396 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2397 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2398 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2399 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2400 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2401 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2402 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2403
2404 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2405 return -EINVAL;
2406
16cc1400
WB
2407 return 0;
2408}
2409
2410static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2411 struct virtio_net_hdr *vnet_hdr)
2412{
16cc1400
WB
2413 if (*len < sizeof(*vnet_hdr))
2414 return -EINVAL;
2415 *len -= sizeof(*vnet_hdr);
2416
cbbd26b8 2417 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
16cc1400
WB
2418 return -EFAULT;
2419
2420 return __packet_snd_vnet_parse(vnet_hdr, *len);
2421}
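
A sketch of the header a userspace sender might prepend when PACKET_VNET_HDR is enabled, asking the kernel to finish an IPv4/UDP checksum. The offsets assume an untagged Ethernet frame and a little-endian host (the fields are little-endian __virtio16 values); the checks in __packet_snd_vnet_parse() above are satisfied since csum_start + csum_offset + 2 == hdr_len.

#include <string.h>
#include <linux/virtio_net.h>

static void fill_udp_csum_hdr(struct virtio_net_hdr *h)
{
        memset(h, 0, sizeof(*h));
        h->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
        h->gso_type = VIRTIO_NET_HDR_GSO_NONE;
        h->csum_start = 14 + 20;        /* Ethernet + IPv4 header */
        h->csum_offset = 6;             /* UDP checksum field within the UDP header */
        h->hdr_len = 14 + 20 + 8;       /* headers up to and including UDP */
}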
2422
40d4e3df 2423static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2424 void *frame, struct net_device *dev, void *data, int tp_len,
c14ac945
SHY
2425 __be16 proto, unsigned char *addr, int hlen, int copylen,
2426 const struct sockcm_cookie *sockc)
69e3c75f 2427{
184f489e 2428 union tpacket_uhdr ph;
8d39b4a6 2429 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2430 struct socket *sock = po->sk.sk_socket;
2431 struct page *page;
69e3c75f
JB
2432 int err;
2433
2434 ph.raw = frame;
2435
2436 skb->protocol = proto;
2437 skb->dev = dev;
2438 skb->priority = po->sk.sk_priority;
2d37a186 2439 skb->mark = po->sk.sk_mark;
c14ac945 2440 sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2441 skb_shinfo(skb)->destructor_arg = ph.raw;
2442
ae641949 2443 skb_reserve(skb, hlen);
69e3c75f 2444 skb_reset_network_header(skb);
c1aad275 2445
69e3c75f
JB
2446 to_write = tp_len;
2447
2448 if (sock->type == SOCK_DGRAM) {
2449 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2450 NULL, tp_len);
2451 if (unlikely(err < 0))
2452 return -EINVAL;
1d036d25 2453 } else if (copylen) {
9ed988cd
WB
2454 int hdrlen = min_t(int, copylen, tp_len);
2455
69e3c75f 2456 skb_push(skb, dev->hard_header_len);
1d036d25 2457 skb_put(skb, copylen - dev->hard_header_len);
9ed988cd 2458 err = skb_store_bits(skb, 0, data, hdrlen);
69e3c75f
JB
2459 if (unlikely(err))
2460 return err;
9ed988cd
WB
2461 if (!dev_validate_header(dev, skb->data, hdrlen))
2462 return -EINVAL;
c72219b7
DB
2463 if (!skb->protocol)
2464 tpacket_set_protocol(dev, skb);
69e3c75f 2465
9ed988cd
WB
2466 data += hdrlen;
2467 to_write -= hdrlen;
69e3c75f
JB
2468 }
2469
69e3c75f
JB
2470 offset = offset_in_page(data);
2471 len_max = PAGE_SIZE - offset;
2472 len = ((to_write > len_max) ? len_max : to_write);
2473
2474 skb->data_len = to_write;
2475 skb->len += to_write;
2476 skb->truesize += to_write;
2477 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2478
2479 while (likely(to_write)) {
2480 nr_frags = skb_shinfo(skb)->nr_frags;
2481
2482 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2483 pr_err("Packet exceeds the number of skb frags (%lu)\n",
2484 MAX_SKB_FRAGS);
69e3c75f
JB
2485 return -EFAULT;
2486 }
2487
0af55bb5
CG
2488 page = pgv_to_page(data);
2489 data += len;
69e3c75f
JB
2490 flush_dcache_page(page);
2491 get_page(page);
0af55bb5 2492 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2493 to_write -= len;
2494 offset = 0;
2495 len_max = PAGE_SIZE;
2496 len = ((to_write > len_max) ? len_max : to_write);
2497 }
2498
8fd6c80d 2499 skb_probe_transport_header(skb, 0);
efdfa2f7 2500
69e3c75f
JB
2501 return tp_len;
2502}
2503
8d39b4a6
WB
2504static int tpacket_parse_header(struct packet_sock *po, void *frame,
2505 int size_max, void **data)
2506{
2507 union tpacket_uhdr ph;
2508 int tp_len, off;
2509
2510 ph.raw = frame;
2511
2512 switch (po->tp_version) {
2513 case TPACKET_V2:
2514 tp_len = ph.h2->tp_len;
2515 break;
2516 default:
2517 tp_len = ph.h1->tp_len;
2518 break;
2519 }
2520 if (unlikely(tp_len > size_max)) {
2521 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2522 return -EMSGSIZE;
2523 }
2524
2525 if (unlikely(po->tp_tx_has_off)) {
2526 int off_min, off_max;
2527
2528 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2529 off_max = po->tx_ring.frame_size - tp_len;
2530 if (po->sk.sk_type == SOCK_DGRAM) {
2531 switch (po->tp_version) {
2532 case TPACKET_V2:
2533 off = ph.h2->tp_net;
2534 break;
2535 default:
2536 off = ph.h1->tp_net;
2537 break;
2538 }
2539 } else {
2540 switch (po->tp_version) {
2541 case TPACKET_V2:
2542 off = ph.h2->tp_mac;
2543 break;
2544 default:
2545 off = ph.h1->tp_mac;
2546 break;
2547 }
2548 }
2549 if (unlikely((off < off_min) || (off_max < off)))
2550 return -EINVAL;
2551 } else {
2552 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2553 }
2554
2555 *data = frame + off;
2556 return tp_len;
2557}
2558
69e3c75f
JB
2559static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2560{
69e3c75f
JB
2561 struct sk_buff *skb;
2562 struct net_device *dev;
1d036d25 2563 struct virtio_net_hdr *vnet_hdr = NULL;
c14ac945 2564 struct sockcm_cookie sockc;
69e3c75f 2565 __be16 proto;
09effa67 2566 int err, reserve = 0;
40d4e3df 2567 void *ph;
342dfc30 2568 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2569 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2570 int tp_len, size_max;
2571 unsigned char *addr;
8d39b4a6 2572 void *data;
69e3c75f 2573 int len_sum = 0;
9e67030a 2574 int status = TP_STATUS_AVAILABLE;
1d036d25 2575 int hlen, tlen, copylen = 0;
69e3c75f 2576
69e3c75f
JB
2577 mutex_lock(&po->pg_vec_lock);
2578
66e56cd4 2579 if (likely(saddr == NULL)) {
e40526cb 2580 dev = packet_cached_dev_get(po);
69e3c75f
JB
2581 proto = po->num;
2582 addr = NULL;
2583 } else {
2584 err = -EINVAL;
2585 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2586 goto out;
2587 if (msg->msg_namelen < (saddr->sll_halen
2588 + offsetof(struct sockaddr_ll,
2589 sll_addr)))
2590 goto out;
69e3c75f
JB
2591 proto = saddr->sll_protocol;
2592 addr = saddr->sll_addr;
827d9780 2593 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2594 }
2595
edbe7746 2596 sockc.tsflags = po->sk.sk_tsflags;
c14ac945
SHY
2597 if (msg->msg_controllen) {
2598 err = sock_cmsg_send(&po->sk, msg, &sockc);
2599 if (unlikely(err))
2600 goto out;
2601 }
2602
69e3c75f
JB
2603 err = -ENXIO;
2604 if (unlikely(dev == NULL))
2605 goto out;
69e3c75f
JB
2606 err = -ENETDOWN;
2607 if (unlikely(!(dev->flags & IFF_UP)))
2608 goto out_put;
2609
5cfb4c8d
DB
2610 if (po->sk.sk_socket->type == SOCK_RAW)
2611 reserve = dev->hard_header_len;
69e3c75f 2612 size_max = po->tx_ring.frame_size
b5dd884e 2613 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2614
1d036d25 2615 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
5cfb4c8d 2616 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2617
69e3c75f
JB
2618 do {
2619 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2620 TP_STATUS_SEND_REQUEST);
69e3c75f 2621 if (unlikely(ph == NULL)) {
87a2fd28
DB
2622 if (need_wait && need_resched())
2623 schedule();
69e3c75f
JB
2624 continue;
2625 }
2626
8d39b4a6
WB
2627 skb = NULL;
2628 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2629 if (tp_len < 0)
2630 goto tpacket_error;
2631
69e3c75f 2632 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2633 hlen = LL_RESERVED_SPACE(dev);
2634 tlen = dev->needed_tailroom;
1d036d25
WB
2635 if (po->has_vnet_hdr) {
2636 vnet_hdr = data;
2637 data += sizeof(*vnet_hdr);
2638 tp_len -= sizeof(*vnet_hdr);
2639 if (tp_len < 0 ||
2640 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2641 tp_len = -EINVAL;
2642 goto tpacket_error;
2643 }
2644 copylen = __virtio16_to_cpu(vio_le(),
2645 vnet_hdr->hdr_len);
2646 }
9ed988cd 2647 copylen = max_t(int, copylen, dev->hard_header_len);
69e3c75f 2648 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2649 hlen + tlen + sizeof(struct sockaddr_ll) +
2650 (copylen - dev->hard_header_len),
fbf33a28 2651 !need_wait, &err);
69e3c75f 2652
fbf33a28
KM
2653 if (unlikely(skb == NULL)) {
2654 /* we assume the socket was initially writeable ... */
2655 if (likely(len_sum > 0))
2656 err = len_sum;
69e3c75f 2657 goto out_status;
fbf33a28 2658 }
8d39b4a6 2659 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
c14ac945 2660 addr, hlen, copylen, &sockc);
dbd46ab4 2661 if (likely(tp_len >= 0) &&
5cfb4c8d 2662 tp_len > dev->mtu + reserve &&
1d036d25 2663 !po->has_vnet_hdr &&
3c70c132
DB
2664 !packet_extra_vlan_len_allowed(dev, skb))
2665 tp_len = -EMSGSIZE;
69e3c75f
JB
2666
2667 if (unlikely(tp_len < 0)) {
8d39b4a6 2668tpacket_error:
69e3c75f
JB
2669 if (po->tp_loss) {
2670 __packet_set_status(po, ph,
2671 TP_STATUS_AVAILABLE);
2672 packet_increment_head(&po->tx_ring);
2673 kfree_skb(skb);
2674 continue;
2675 } else {
2676 status = TP_STATUS_WRONG_FORMAT;
2677 err = tp_len;
2678 goto out_status;
2679 }
2680 }
2681
db60eb5f
JR
2682 if (po->has_vnet_hdr && virtio_net_hdr_to_skb(skb, vnet_hdr,
2683 vio_le())) {
1d036d25
WB
2684 tp_len = -EINVAL;
2685 goto tpacket_error;
2686 }
2687
0fd5d57b
DB
2688 packet_pick_tx_queue(dev, skb);
2689
69e3c75f
JB
2690 skb->destructor = tpacket_destruct_skb;
2691 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2692 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2693
2694 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2695 err = po->xmit(skb);
eb70df13
JP
2696 if (unlikely(err > 0)) {
2697 err = net_xmit_errno(err);
2698 if (err && __packet_get_status(po, ph) ==
2699 TP_STATUS_AVAILABLE) {
2700 /* skb was destructed already */
2701 skb = NULL;
2702 goto out_status;
2703 }
2704 /*
2705 * skb was dropped but not destructed yet;
2706 * let's treat it like congestion or err < 0
2707 */
2708 err = 0;
2709 }
69e3c75f
JB
2710 packet_increment_head(&po->tx_ring);
2711 len_sum += tp_len;
b0138408
DB
2712 } while (likely((ph != NULL) ||
2713 /* Note: packet_read_pending() might be slow if we have
2714 * to call it as it's per_cpu variable, but in fast-path
2715 * we already short-circuit the loop with the first
2716 * condition, and luckily don't have to go that path
2717 * anyway.
2718 */
2719 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2720
2721 err = len_sum;
2722 goto out_put;
2723
69e3c75f
JB
2724out_status:
2725 __packet_set_status(po, ph, status);
2726 kfree_skb(skb);
2727out_put:
e40526cb 2728 dev_put(dev);
69e3c75f
JB
2729out:
2730 mutex_unlock(&po->pg_vec_lock);
2731 return err;
2732}
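
A minimal userspace sketch of queueing one frame into a TPACKET_V2 transmit ring and kicking tpacket_snd() with an empty send(). ring, frame_nr and frame_size come from an earlier PACKET_TX_RING setup and are placeholders; the data offset is the default one computed in tpacket_parse_header() when PACKET_TX_HAS_OFF is not set.

#include <string.h>
#include <stdint.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int tx_ring_send_one(int fd, uint8_t *ring, unsigned int frame_nr,
                            unsigned int frame_size,
                            const void *pkt, unsigned int pkt_len)
{
        struct tpacket2_hdr *hdr = (void *)(ring + frame_nr * frame_size);
        uint8_t *data;

        if (hdr->tp_status != TP_STATUS_AVAILABLE)
                return -1;              /* slot still owned by the kernel */

        data = (uint8_t *)hdr + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);
        memcpy(data, pkt, pkt_len);
        hdr->tp_len = pkt_len;
        hdr->tp_status = TP_STATUS_SEND_REQUEST;

        return send(fd, NULL, 0, 0);    /* pass MSG_DONTWAIT to avoid blocking */
}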
69e3c75f 2733
eea49cc9
OJ
2734static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2735 size_t reserve, size_t len,
2736 size_t linear, int noblock,
2737 int *err)
bfd5f4a3
SS
2738{
2739 struct sk_buff *skb;
2740
2741 /* Under a page? Don't bother with paged skb. */
2742 if (prepad + len < PAGE_SIZE || !linear)
2743 linear = len;
2744
2745 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2746 err, 0);
bfd5f4a3
SS
2747 if (!skb)
2748 return NULL;
2749
2750 skb_reserve(skb, reserve);
2751 skb_put(skb, linear);
2752 skb->data_len = len - linear;
2753 skb->len += len - linear;
2754
2755 return skb;
2756}
2757
d346a3fa 2758static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2759{
2760 struct sock *sk = sock->sk;
342dfc30 2761 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2762 struct sk_buff *skb;
2763 struct net_device *dev;
0e11c91e 2764 __be16 proto;
1da177e4 2765 unsigned char *addr;
827d9780 2766 int err, reserve = 0;
c7d39e32 2767 struct sockcm_cookie sockc;
bfd5f4a3
SS
2768 struct virtio_net_hdr vnet_hdr = { 0 };
2769 int offset = 0;
bfd5f4a3 2770 struct packet_sock *po = pkt_sk(sk);
57031eb7 2771 int hlen, tlen, linear;
3bdc0eba 2772 int extra_len = 0;
1da177e4
LT
2773
2774 /*
1ce4f28b 2775 * Get and verify the address.
1da177e4 2776 */
1ce4f28b 2777
66e56cd4 2778 if (likely(saddr == NULL)) {
e40526cb 2779 dev = packet_cached_dev_get(po);
1da177e4
LT
2780 proto = po->num;
2781 addr = NULL;
2782 } else {
2783 err = -EINVAL;
2784 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2785 goto out;
0fb375fb
EB
2786 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2787 goto out;
1da177e4
LT
2788 proto = saddr->sll_protocol;
2789 addr = saddr->sll_addr;
827d9780 2790 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2791 }
2792
1da177e4 2793 err = -ENXIO;
e40526cb 2794 if (unlikely(dev == NULL))
1da177e4 2795 goto out_unlock;
d5e76b0a 2796 err = -ENETDOWN;
e40526cb 2797 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2798 goto out_unlock;
2799
edbe7746 2800 sockc.tsflags = sk->sk_tsflags;
c7d39e32
EJ
2801 sockc.mark = sk->sk_mark;
2802 if (msg->msg_controllen) {
2803 err = sock_cmsg_send(sk, msg, &sockc);
2804 if (unlikely(err))
2805 goto out_unlock;
2806 }
2807
e40526cb
DB
2808 if (sock->type == SOCK_RAW)
2809 reserve = dev->hard_header_len;
bfd5f4a3 2810 if (po->has_vnet_hdr) {
16cc1400
WB
2811 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2812 if (err)
bfd5f4a3 2813 goto out_unlock;
bfd5f4a3
SS
2814 }
2815
3bdc0eba
BG
2816 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2817 if (!netif_supports_nofcs(dev)) {
2818 err = -EPROTONOSUPPORT;
2819 goto out_unlock;
2820 }
2821 extra_len = 4; /* We're doing our own CRC */
2822 }
2823
1da177e4 2824 err = -EMSGSIZE;
16cc1400
WB
2825 if (!vnet_hdr.gso_type &&
2826 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2827 goto out_unlock;
2828
bfd5f4a3 2829 err = -ENOBUFS;
ae641949
HX
2830 hlen = LL_RESERVED_SPACE(dev);
2831 tlen = dev->needed_tailroom;
57031eb7
WB
2832 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2833 linear = max(linear, min_t(int, len, dev->hard_header_len));
2834 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
bfd5f4a3 2835 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2836 if (skb == NULL)
1da177e4
LT
2837 goto out_unlock;
2838
bfd5f4a3 2839 skb_set_network_header(skb, reserve);
1da177e4 2840
0c4e8581 2841 err = -EINVAL;
9c707762
WB
2842 if (sock->type == SOCK_DGRAM) {
2843 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2844 if (unlikely(offset < 0))
9c707762 2845 goto out_free;
9c707762 2846 }
1da177e4
LT
2847
2848 /* Returns -EFAULT on error */
c0371da6 2849 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2850 if (err)
2851 goto out_free;
bf84a010 2852
9ed988cd
WB
2853 if (sock->type == SOCK_RAW &&
2854 !dev_validate_header(dev, skb->data, len)) {
2855 err = -EINVAL;
2856 goto out_free;
2857 }
2858
c14ac945 2859 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
1da177e4 2860
16cc1400 2861 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
2862 !packet_extra_vlan_len_allowed(dev, skb)) {
2863 err = -EMSGSIZE;
2864 goto out_free;
57f89bfa
BG
2865 }
2866
09effa67
DM
2867 skb->protocol = proto;
2868 skb->dev = dev;
1da177e4 2869 skb->priority = sk->sk_priority;
c7d39e32 2870 skb->mark = sockc.mark;
0fd5d57b
DB
2871
2872 packet_pick_tx_queue(dev, skb);
1da177e4 2873
bfd5f4a3 2874 if (po->has_vnet_hdr) {
db60eb5f 2875 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
16cc1400
WB
2876 if (err)
2877 goto out_free;
2878 len += sizeof(vnet_hdr);
bfd5f4a3
SS
2879 }
2880
8fd6c80d
DB
2881 skb_probe_transport_header(skb, reserve);
2882
3bdc0eba
BG
2883 if (unlikely(extra_len == 4))
2884 skb->no_fcs = 1;
2885
d346a3fa 2886 err = po->xmit(skb);
1da177e4
LT
2887 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2888 goto out_unlock;
2889
e40526cb 2890 dev_put(dev);
1da177e4 2891
40d4e3df 2892 return len;
1da177e4
LT
2893
2894out_free:
2895 kfree_skb(skb);
2896out_unlock:
e40526cb 2897 if (dev)
1da177e4
LT
2898 dev_put(dev);
2899out:
2900 return err;
2901}
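
A minimal userspace sketch of the non-ring transmit path served by packet_snd() on a SOCK_DGRAM packet socket: the destination is named in a struct sockaddr_ll and the kernel builds the link-layer header via dev_hard_header(). fd, "eth0" and dst_mac are placeholders.

#include <string.h>
#include <net/if.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static ssize_t dgram_send(int fd, const void *payload, size_t len,
                          const unsigned char dst_mac[ETH_ALEN])
{
        struct sockaddr_ll sll;

        memset(&sll, 0, sizeof(sll));
        sll.sll_family = AF_PACKET;
        sll.sll_ifindex = if_nametoindex("eth0");
        sll.sll_protocol = htons(ETH_P_IP);
        sll.sll_halen = ETH_ALEN;
        memcpy(sll.sll_addr, dst_mac, ETH_ALEN);

        return sendto(fd, payload, len, 0,
                      (struct sockaddr *)&sll, sizeof(sll));
}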
2902
1b784140 2903static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2904{
69e3c75f
JB
2905 struct sock *sk = sock->sk;
2906 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2907
69e3c75f
JB
2908 if (po->tx_ring.pg_vec)
2909 return tpacket_snd(po, msg);
2910 else
69e3c75f
JB
2911 return packet_snd(sock, msg, len);
2912}
2913
1da177e4
LT
2914/*
2915 * Close a PACKET socket. This is fairly simple. We immediately go
2916 * to 'closed' state and remove our protocol entry in the device list.
2917 */
2918
2919static int packet_release(struct socket *sock)
2920{
2921 struct sock *sk = sock->sk;
2922 struct packet_sock *po;
2bd624b4 2923 struct packet_fanout *f;
d12d01d6 2924 struct net *net;
f6fb8f10 2925 union tpacket_req_u req_u;
1da177e4
LT
2926
2927 if (!sk)
2928 return 0;
2929
3b1e0a65 2930 net = sock_net(sk);
1da177e4
LT
2931 po = pkt_sk(sk);
2932
0fa7fa98 2933 mutex_lock(&net->packet.sklist_lock);
808f5114 2934 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2935 mutex_unlock(&net->packet.sklist_lock);
2936
2937 preempt_disable();
920de804 2938 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2939 preempt_enable();
1da177e4 2940
808f5114 2941 spin_lock(&po->bind_lock);
ce06b03e 2942 unregister_prot_hook(sk, false);
66e56cd4
DB
2943 packet_cached_dev_reset(po);
2944
160ff18a
BG
2945 if (po->prot_hook.dev) {
2946 dev_put(po->prot_hook.dev);
2947 po->prot_hook.dev = NULL;
2948 }
808f5114 2949 spin_unlock(&po->bind_lock);
1da177e4 2950
1da177e4 2951 packet_flush_mclist(sk);
1da177e4 2952
9665d5d6
PS
2953 if (po->rx_ring.pg_vec) {
2954 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2955 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2956 }
69e3c75f 2957
9665d5d6
PS
2958 if (po->tx_ring.pg_vec) {
2959 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2960 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2961 }
1da177e4 2962
2bd624b4 2963 f = fanout_release(sk);
dc99f600 2964
808f5114 2965 synchronize_net();
2bd624b4
AS
2966
2967 if (f) {
2968 fanout_release_data(f);
2969 kfree(f);
2970 }
1da177e4
LT
2971 /*
2972 * Now the socket is dead. No more input will appear.
2973 */
1da177e4
LT
2974 sock_orphan(sk);
2975 sock->sk = NULL;
2976
2977 /* Purge queues */
2978
2979 skb_queue_purge(&sk->sk_receive_queue);
b0138408 2980 packet_free_pending(po);
17ab56a2 2981 sk_refcnt_debug_release(sk);
1da177e4
LT
2982
2983 sock_put(sk);
2984 return 0;
2985}
2986
2987/*
2988 * Attach a packet hook.
2989 */
2990
30f7ea1c
FR
2991static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
2992 __be16 proto)
1da177e4
LT
2993{
2994 struct packet_sock *po = pkt_sk(sk);
158cd4af 2995 struct net_device *dev_curr;
902fefb8
DB
2996 __be16 proto_curr;
2997 bool need_rehook;
30f7ea1c
FR
2998 struct net_device *dev = NULL;
2999 int ret = 0;
3000 bool unlisted = false;
dc99f600 3001
30f7ea1c 3002 if (po->fanout)
dc99f600 3003 return -EINVAL;
1da177e4
LT
3004
3005 lock_sock(sk);
1da177e4 3006 spin_lock(&po->bind_lock);
30f7ea1c
FR
3007 rcu_read_lock();
3008
3009 if (name) {
3010 dev = dev_get_by_name_rcu(sock_net(sk), name);
3011 if (!dev) {
3012 ret = -ENODEV;
3013 goto out_unlock;
3014 }
3015 } else if (ifindex) {
3016 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3017 if (!dev) {
3018 ret = -ENODEV;
3019 goto out_unlock;
3020 }
3021 }
3022
3023 if (dev)
3024 dev_hold(dev);
66e56cd4 3025
902fefb8
DB
3026 proto_curr = po->prot_hook.type;
3027 dev_curr = po->prot_hook.dev;
3028
3029 need_rehook = proto_curr != proto || dev_curr != dev;
3030
3031 if (need_rehook) {
30f7ea1c
FR
3032 if (po->running) {
3033 rcu_read_unlock();
3034 __unregister_prot_hook(sk, true);
3035 rcu_read_lock();
3036 dev_curr = po->prot_hook.dev;
3037 if (dev)
3038 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3039 dev->ifindex);
3040 }
1da177e4 3041
902fefb8
DB
3042 po->num = proto;
3043 po->prot_hook.type = proto;
902fefb8 3044
30f7ea1c
FR
3045 if (unlikely(unlisted)) {
3046 dev_put(dev);
3047 po->prot_hook.dev = NULL;
3048 po->ifindex = -1;
3049 packet_cached_dev_reset(po);
3050 } else {
3051 po->prot_hook.dev = dev;
3052 po->ifindex = dev ? dev->ifindex : 0;
3053 packet_cached_dev_assign(po, dev);
3054 }
902fefb8 3055 }
158cd4af
LW
3056 if (dev_curr)
3057 dev_put(dev_curr);
66e56cd4 3058
902fefb8 3059 if (proto == 0 || !need_rehook)
1da177e4
LT
3060 goto out_unlock;
3061
30f7ea1c 3062 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3063 register_prot_hook(sk);
be85d4ad
UT
3064 } else {
3065 sk->sk_err = ENETDOWN;
3066 if (!sock_flag(sk, SOCK_DEAD))
3067 sk->sk_error_report(sk);
1da177e4
LT
3068 }
3069
3070out_unlock:
30f7ea1c 3071 rcu_read_unlock();
1da177e4
LT
3072 spin_unlock(&po->bind_lock);
3073 release_sock(sk);
30f7ea1c 3074 return ret;
1da177e4
LT
3075}
3076
3077/*
3078 * Bind a packet socket to a device
3079 */
3080
40d4e3df
ED
3081static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3082 int addr_len)
1da177e4 3083{
40d4e3df 3084 struct sock *sk = sock->sk;
1da177e4 3085 char name[15];
1ce4f28b 3086
1da177e4
LT
3087 /*
3088 * Check legality
3089 */
1ce4f28b 3090
8ae55f04 3091 if (addr_len != sizeof(struct sockaddr))
1da177e4 3092 return -EINVAL;
40d4e3df 3093 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 3094
30f7ea1c 3095 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3096}
1da177e4
LT
3097
3098static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3099{
40d4e3df
ED
3100 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3101 struct sock *sk = sock->sk;
1da177e4
LT
3102
3103 /*
3104 * Check legality
3105 */
1ce4f28b 3106
1da177e4
LT
3107 if (addr_len < sizeof(struct sockaddr_ll))
3108 return -EINVAL;
3109 if (sll->sll_family != AF_PACKET)
3110 return -EINVAL;
3111
30f7ea1c
FR
3112 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3113 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3114}
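
A minimal userspace sketch of binding a packet socket to one interface and protocol, the path handled by packet_bind() and packet_do_bind() above; only sll_family, sll_protocol and sll_ifindex matter here. fd and "eth0" are placeholders.

#include <string.h>
#include <net/if.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static int bind_to_device(int fd)
{
        struct sockaddr_ll sll;

        memset(&sll, 0, sizeof(sll));
        sll.sll_family = AF_PACKET;
        sll.sll_protocol = htons(ETH_P_ALL);
        sll.sll_ifindex = if_nametoindex("eth0");

        return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}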
3115
3116static struct proto packet_proto = {
3117 .name = "PACKET",
3118 .owner = THIS_MODULE,
3119 .obj_size = sizeof(struct packet_sock),
3120};
3121
3122/*
1ce4f28b 3123 * Create a packet of type SOCK_PACKET.
1da177e4
LT
3124 */
3125
3f378b68
EP
3126static int packet_create(struct net *net, struct socket *sock, int protocol,
3127 int kern)
1da177e4
LT
3128{
3129 struct sock *sk;
3130 struct packet_sock *po;
0e11c91e 3131 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3132 int err;
3133
df008c91 3134 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3135 return -EPERM;
be02097c
DM
3136 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3137 sock->type != SOCK_PACKET)
1da177e4
LT
3138 return -ESOCKTNOSUPPORT;
3139
3140 sock->state = SS_UNCONNECTED;
3141
3142 err = -ENOBUFS;
11aa9c28 3143 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3144 if (sk == NULL)
3145 goto out;
3146
3147 sock->ops = &packet_ops;
1da177e4
LT
3148 if (sock->type == SOCK_PACKET)
3149 sock->ops = &packet_ops_spkt;
be02097c 3150
1da177e4
LT
3151 sock_init_data(sock, sk);
3152
3153 po = pkt_sk(sk);
3154 sk->sk_family = PF_PACKET;
0e11c91e 3155 po->num = proto;
d346a3fa 3156 po->xmit = dev_queue_xmit;
66e56cd4 3157
b0138408
DB
3158 err = packet_alloc_pending(po);
3159 if (err)
3160 goto out2;
3161
66e56cd4 3162 packet_cached_dev_reset(po);
1da177e4
LT
3163
3164 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3165 sk_refcnt_debug_inc(sk);
1da177e4
LT
3166
3167 /*
3168 * Attach a protocol block
3169 */
3170
3171 spin_lock_init(&po->bind_lock);
905db440 3172 mutex_init(&po->pg_vec_lock);
0648ab70 3173 po->rollover = NULL;
1da177e4 3174 po->prot_hook.func = packet_rcv;
be02097c 3175
1da177e4
LT
3176 if (sock->type == SOCK_PACKET)
3177 po->prot_hook.func = packet_rcv_spkt;
be02097c 3178
1da177e4
LT
3179 po->prot_hook.af_packet_priv = sk;
3180
0e11c91e
AV
3181 if (proto) {
3182 po->prot_hook.type = proto;
ce06b03e 3183 register_prot_hook(sk);
1da177e4
LT
3184 }
3185
0fa7fa98 3186 mutex_lock(&net->packet.sklist_lock);
808f5114 3187 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3188 mutex_unlock(&net->packet.sklist_lock);
3189
3190 preempt_disable();
3680453c 3191 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3192 preempt_enable();
808f5114 3193
40d4e3df 3194 return 0;
b0138408
DB
3195out2:
3196 sk_free(sk);
1da177e4
LT
3197out:
3198 return err;
3199}
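
A minimal userspace sketch of the two common socket flavours created by packet_create(); both require CAP_NET_RAW, and a protocol of 0 would keep the socket silent until it is bound.

#include <sys/socket.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static int open_packet_sockets(int *raw_fd, int *dgram_fd)
{
        *raw_fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));       /* full frames */
        *dgram_fd = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));    /* cooked, header handled by the kernel */

        return (*raw_fd < 0 || *dgram_fd < 0) ? -1 : 0;
}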
3200
3201/*
3202 * Pull a packet from our receive queue and hand it to the user.
3203 * If necessary we block.
3204 */
3205
1b784140
YX
3206static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3207 int flags)
1da177e4
LT
3208{
3209 struct sock *sk = sock->sk;
3210 struct sk_buff *skb;
3211 int copied, err;
bfd5f4a3 3212 int vnet_hdr_len = 0;
2472d761 3213 unsigned int origlen = 0;
1da177e4
LT
3214
3215 err = -EINVAL;
ed85b565 3216 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3217 goto out;
3218
3219#if 0
3220 /* What error should we return now? EUNATTACH? */
3221 if (pkt_sk(sk)->ifindex < 0)
3222 return -ENODEV;
3223#endif
3224
ed85b565 3225 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3226 err = sock_recv_errqueue(sk, msg, len,
3227 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3228 goto out;
3229 }
3230
1da177e4
LT
3231 /*
3232 * Call the generic datagram receiver. This handles all sorts
3233 * of horrible races and re-entrancy so we can forget about it
3234 * in the protocol layers.
3235 *
3236 * Now it will return ENETDOWN if the device has just gone down,
3237 * but then it will block.
3238 */
3239
40d4e3df 3240 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3241
3242 /*
1ce4f28b 3243 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
3244 * handles the blocking we don't see and worry about blocking
3245 * retries.
3246 */
3247
8ae55f04 3248 if (skb == NULL)
1da177e4
LT
3249 goto out;
3250
2ccdbaa6
WB
3251 if (pkt_sk(sk)->pressure)
3252 packet_rcv_has_room(pkt_sk(sk), NULL);
3253
bfd5f4a3 3254 if (pkt_sk(sk)->has_vnet_hdr) {
16cc1400
WB
3255 err = packet_rcv_vnet(msg, skb, &len);
3256 if (err)
bfd5f4a3 3257 goto out_free;
16cc1400 3258 vnet_hdr_len = sizeof(struct virtio_net_hdr);
bfd5f4a3
SS
3259 }
3260
f3d33426
HFS
3261 /* You lose any data beyond the buffer you gave. If this worries
3262 * a user program, it can ask the device for its MTU
3263 * anyway.
1da177e4 3264 */
1da177e4 3265 copied = skb->len;
40d4e3df
ED
3266 if (copied > len) {
3267 copied = len;
3268 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3269 }
3270
51f3d02b 3271 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3272 if (err)
3273 goto out_free;
3274
2472d761
EB
3275 if (sock->type != SOCK_PACKET) {
3276 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3277
3278 /* Original length was stored in sockaddr_ll fields */
3279 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3280 sll->sll_family = AF_PACKET;
3281 sll->sll_protocol = skb->protocol;
3282 }
3283
3b885787 3284 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3285
f3d33426
HFS
3286 if (msg->msg_name) {
3287 /* If the address length field is there to be filled
3288 * in, we fill it in now.
3289 */
3290 if (sock->type == SOCK_PACKET) {
342dfc30 3291 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3292 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3293 } else {
3294 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3295
f3d33426
HFS
3296 msg->msg_namelen = sll->sll_halen +
3297 offsetof(struct sockaddr_ll, sll_addr);
3298 }
ffbc6111
HX
3299 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3300 msg->msg_namelen);
f3d33426 3301 }
1da177e4 3302
8dc41944 3303 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3304 struct tpacket_auxdata aux;
3305
3306 aux.tp_status = TP_STATUS_USER;
3307 if (skb->ip_summed == CHECKSUM_PARTIAL)
3308 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3309 else if (skb->pkt_type != PACKET_OUTGOING &&
3310 (skb->ip_summed == CHECKSUM_COMPLETE ||
3311 skb_csum_unnecessary(skb)))
3312 aux.tp_status |= TP_STATUS_CSUM_VALID;
3313
2472d761 3314 aux.tp_len = origlen;
ffbc6111
HX
3315 aux.tp_snaplen = skb->len;
3316 aux.tp_mac = 0;
bbe735e4 3317 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3318 if (skb_vlan_tag_present(skb)) {
3319 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3320 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3321 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3322 } else {
3323 aux.tp_vlan_tci = 0;
a0cdfcf3 3324 aux.tp_vlan_tpid = 0;
a3bcc23e 3325 }
ffbc6111 3326 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3327 }
3328
1da177e4
LT
3329 /*
3330 * Free or return the buffer as appropriate. Again this
3331 * hides all the races and re-entrancy issues from us.
3332 */
bfd5f4a3 3333 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3334
3335out_free:
3336 skb_free_datagram(sk, skb);
3337out:
3338 return err;
3339}
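
A minimal userspace sketch of receiving one packet together with the PACKET_AUXDATA control message filled in by packet_recvmsg() above; PACKET_AUXDATA is assumed to have been enabled earlier with setsockopt(). fd, buf and aux_out are placeholders.

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/if_packet.h>

static ssize_t recv_with_auxdata(int fd, void *buf, size_t buflen,
                                 struct tpacket_auxdata *aux_out)
{
        struct sockaddr_ll from;
        struct iovec iov = { .iov_base = buf, .iov_len = buflen };
        union {
                char buf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
                struct cmsghdr align;
        } control;
        struct msghdr msg = {
                .msg_name = &from,
                .msg_namelen = sizeof(from),
                .msg_iov = &iov,
                .msg_iovlen = 1,
                .msg_control = control.buf,
                .msg_controllen = sizeof(control.buf),
        };
        struct cmsghdr *cmsg;
        ssize_t n = recvmsg(fd, &msg, 0);

        if (n < 0)
                return n;

        for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
                if (cmsg->cmsg_level == SOL_PACKET &&
                    cmsg->cmsg_type == PACKET_AUXDATA)
                        memcpy(aux_out, CMSG_DATA(cmsg), sizeof(*aux_out));
        }

        return n;
}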
3340
1da177e4
LT
3341static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3342 int *uaddr_len, int peer)
3343{
3344 struct net_device *dev;
3345 struct sock *sk = sock->sk;
3346
3347 if (peer)
3348 return -EOPNOTSUPP;
3349
3350 uaddr->sa_family = AF_PACKET;
2dc85bf3 3351 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3352 rcu_read_lock();
3353 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3354 if (dev)
2dc85bf3 3355 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3356 rcu_read_unlock();
1da177e4
LT
3357 *uaddr_len = sizeof(*uaddr);
3358
3359 return 0;
3360}
1da177e4
LT
3361
3362static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3363 int *uaddr_len, int peer)
3364{
3365 struct net_device *dev;
3366 struct sock *sk = sock->sk;
3367 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3368 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3369
3370 if (peer)
3371 return -EOPNOTSUPP;
3372
3373 sll->sll_family = AF_PACKET;
3374 sll->sll_ifindex = po->ifindex;
3375 sll->sll_protocol = po->num;
67286640 3376 sll->sll_pkttype = 0;
654d1f8a
ED
3377 rcu_read_lock();
3378 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3379 if (dev) {
3380 sll->sll_hatype = dev->type;
3381 sll->sll_halen = dev->addr_len;
3382 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3383 } else {
3384 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3385 sll->sll_halen = 0;
3386 }
654d1f8a 3387 rcu_read_unlock();
0fb375fb 3388 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3389
3390 return 0;
3391}
3392
2aeb0b88
WC
3393static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3394 int what)
1da177e4
LT
3395{
3396 switch (i->type) {
3397 case PACKET_MR_MULTICAST:
1162563f
JP
3398 if (i->alen != dev->addr_len)
3399 return -EINVAL;
1da177e4 3400 if (what > 0)
22bedad3 3401 return dev_mc_add(dev, i->addr);
1da177e4 3402 else
22bedad3 3403 return dev_mc_del(dev, i->addr);
1da177e4
LT
3404 break;
3405 case PACKET_MR_PROMISC:
2aeb0b88 3406 return dev_set_promiscuity(dev, what);
1da177e4 3407 case PACKET_MR_ALLMULTI:
2aeb0b88 3408 return dev_set_allmulti(dev, what);
d95ed927 3409 case PACKET_MR_UNICAST:
1162563f
JP
3410 if (i->alen != dev->addr_len)
3411 return -EINVAL;
d95ed927 3412 if (what > 0)
a748ee24 3413 return dev_uc_add(dev, i->addr);
d95ed927 3414 else
a748ee24 3415 return dev_uc_del(dev, i->addr);
d95ed927 3416 break;
40d4e3df
ED
3417 default:
3418 break;
1da177e4 3419 }
2aeb0b88 3420 return 0;
1da177e4
LT
3421}
3422
82f17091
FR
3423static void packet_dev_mclist_delete(struct net_device *dev,
3424 struct packet_mclist **mlp)
1da177e4 3425{
82f17091
FR
3426 struct packet_mclist *ml;
3427
3428 while ((ml = *mlp) != NULL) {
3429 if (ml->ifindex == dev->ifindex) {
3430 packet_dev_mc(dev, ml, -1);
3431 *mlp = ml->next;
3432 kfree(ml);
3433 } else
3434 mlp = &ml->next;
1da177e4
LT
3435 }
3436}
3437
0fb375fb 3438static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3439{
3440 struct packet_sock *po = pkt_sk(sk);
3441 struct packet_mclist *ml, *i;
3442 struct net_device *dev;
3443 int err;
3444
3445 rtnl_lock();
3446
3447 err = -ENODEV;
3b1e0a65 3448 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3449 if (!dev)
3450 goto done;
3451
3452 err = -EINVAL;
1162563f 3453 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3454 goto done;
3455
3456 err = -ENOBUFS;
8b3a7005 3457 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3458 if (i == NULL)
3459 goto done;
3460
3461 err = 0;
3462 for (ml = po->mclist; ml; ml = ml->next) {
3463 if (ml->ifindex == mreq->mr_ifindex &&
3464 ml->type == mreq->mr_type &&
3465 ml->alen == mreq->mr_alen &&
3466 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3467 ml->count++;
3468 /* Free the new element ... */
3469 kfree(i);
3470 goto done;
3471 }
3472 }
3473
3474 i->type = mreq->mr_type;
3475 i->ifindex = mreq->mr_ifindex;
3476 i->alen = mreq->mr_alen;
3477 memcpy(i->addr, mreq->mr_address, i->alen);
309cf37f 3478 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3479 i->count = 1;
3480 i->next = po->mclist;
3481 po->mclist = i;
2aeb0b88
WC
3482 err = packet_dev_mc(dev, i, 1);
3483 if (err) {
3484 po->mclist = i->next;
3485 kfree(i);
3486 }
1da177e4
LT
3487
3488done:
3489 rtnl_unlock();
3490 return err;
3491}
3492
0fb375fb 3493static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3494{
3495 struct packet_mclist *ml, **mlp;
3496
3497 rtnl_lock();
3498
3499 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3500 if (ml->ifindex == mreq->mr_ifindex &&
3501 ml->type == mreq->mr_type &&
3502 ml->alen == mreq->mr_alen &&
3503 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3504 if (--ml->count == 0) {
3505 struct net_device *dev;
3506 *mlp = ml->next;
ad959e76
ED
3507 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3508 if (dev)
1da177e4 3509 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3510 kfree(ml);
3511 }
82f17091 3512 break;
1da177e4
LT
3513 }
3514 }
3515 rtnl_unlock();
82f17091 3516 return 0;
1da177e4
LT
3517}
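
A minimal userspace sketch of the PACKET_ADD_MEMBERSHIP path above, putting one interface into promiscuous mode for the lifetime of the socket; PACKET_MR_MULTICAST and PACKET_MR_ALLMULTI requests take the same shape. fd and "eth0" are placeholders.

#include <string.h>
#include <net/if.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int enable_promisc(int fd)
{
        struct packet_mreq mreq;

        memset(&mreq, 0, sizeof(mreq));
        mreq.mr_ifindex = if_nametoindex("eth0");
        mreq.mr_type = PACKET_MR_PROMISC;

        return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
                          &mreq, sizeof(mreq));
}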
3518
3519static void packet_flush_mclist(struct sock *sk)
3520{
3521 struct packet_sock *po = pkt_sk(sk);
3522 struct packet_mclist *ml;
3523
3524 if (!po->mclist)
3525 return;
3526
3527 rtnl_lock();
3528 while ((ml = po->mclist) != NULL) {
3529 struct net_device *dev;
3530
3531 po->mclist = ml->next;
ad959e76
ED
3532 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3533 if (dev != NULL)
1da177e4 3534 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3535 kfree(ml);
3536 }
3537 rtnl_unlock();
3538}
1da177e4
LT

static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch (optname) {
	case PACKET_ADD_MEMBERSHIP:
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq_max mreq;
		int len = optlen;
		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))
			return -EINVAL;
		if (len > sizeof(mreq))
			len = sizeof(mreq);
		if (copy_from_user(&mreq, optval, len))
			return -EFAULT;
		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
			return -EINVAL;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}

	case PACKET_RX_RING:
	case PACKET_TX_RING:
	{
		union tpacket_req_u req_u;
		int len;

		switch (po->tp_version) {
		case TPACKET_V1:
		case TPACKET_V2:
			len = sizeof(req_u.req);
			break;
		case TPACKET_V3:
		default:
			len = sizeof(req_u.req3);
			break;
		}
		if (optlen < len)
			return -EINVAL;
		if (copy_from_user(&req_u.req, optval, len))
			return -EFAULT;
		return packet_set_ring(sk, &req_u, 0,
			optname == PACKET_TX_RING);
	}
	case PACKET_COPY_THRESH:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
	case PACKET_VERSION:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
		case TPACKET_V2:
		case TPACKET_V3:
			break;
		default:
			return -EINVAL;
		}
		lock_sock(sk);
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
			ret = -EBUSY;
		} else {
			po->tp_version = val;
			ret = 0;
		}
		release_sock(sk);
		return ret;
	}
	case PACKET_RESERVE:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_reserve = val;
		return 0;
	}
	case PACKET_LOSS:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_loss = !!val;
		return 0;
	}
	case PACKET_AUXDATA:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->auxdata = !!val;
		return 0;
	}
	case PACKET_ORIGDEV:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->origdev = !!val;
		return 0;
	}
	case PACKET_VNET_HDR:
	{
		int val;

		if (sock->type != SOCK_RAW)
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->has_vnet_hdr = !!val;
		return 0;
	}
	case PACKET_TIMESTAMP:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->tp_tstamp = val;
		return 0;
	}
	case PACKET_FANOUT:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		return fanout_add(sk, val & 0xffff, val >> 16);
	}
	case PACKET_FANOUT_DATA:
	{
		if (!po->fanout)
			return -EINVAL;

		return fanout_set_data(po, optval, optlen);
	}
	case PACKET_TX_HAS_OFF:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_tx_has_off = !!val;
		return 0;
	}
	case PACKET_QDISC_BYPASS:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
		return 0;
	}
	default:
		return -ENOPROTOOPT;
	}
}
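
/*
 * Userspace view (illustrative sketch, not part of this file): every option
 * handled above is driven through setsockopt(2) at level SOL_PACKET. A
 * minimal example that selects TPACKET_V3 and joins a hash fanout group;
 * the group id 42 is an arbitrary assumption and error handling is reduced
 * to perror().
 *
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *	#include <arpa/inet.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *
 *	int packet_sock_setup(void)
 *	{
 *		int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *		int version = TPACKET_V3;
 *		// low 16 bits: group id, upper bits: fanout mode and flags
 *		int fanout_arg = 42 | (PACKET_FANOUT_HASH << 16);
 *
 *		if (fd < 0)
 *			return -1;
 *		// must happen before any ring is configured, else -EBUSY (see above)
 *		if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version)) < 0)
 *			perror("PACKET_VERSION");
 *		if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &fanout_arg, sizeof(fanout_arg)) < 0)
 *			perror("PACKET_FANOUT");
 *		return fd;
 *	}
 */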

static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	int len;
	int val, lv = sizeof(val);
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	void *data = &val;
	union tpacket_stats_u st;
	struct tpacket_rollover_stats rstats;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case PACKET_STATISTICS:
		spin_lock_bh(&sk->sk_receive_queue.lock);
		memcpy(&st, &po->stats, sizeof(st));
		memset(&po->stats, 0, sizeof(po->stats));
		spin_unlock_bh(&sk->sk_receive_queue.lock);

		if (po->tp_version == TPACKET_V3) {
			lv = sizeof(struct tpacket_stats_v3);
			st.stats3.tp_packets += st.stats3.tp_drops;
			data = &st.stats3;
		} else {
			lv = sizeof(struct tpacket_stats);
			st.stats1.tp_packets += st.stats1.tp_drops;
			data = &st.stats1;
		}

		break;
	case PACKET_AUXDATA:
		val = po->auxdata;
		break;
	case PACKET_ORIGDEV:
		val = po->origdev;
		break;
	case PACKET_VNET_HDR:
		val = po->has_vnet_hdr;
		break;
	case PACKET_VERSION:
		val = po->tp_version;
		break;
	case PACKET_HDRLEN:
		if (len > sizeof(int))
			len = sizeof(int);
		if (copy_from_user(&val, optval, len))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
			val = sizeof(struct tpacket_hdr);
			break;
		case TPACKET_V2:
			val = sizeof(struct tpacket2_hdr);
			break;
		case TPACKET_V3:
			val = sizeof(struct tpacket3_hdr);
			break;
		default:
			return -EINVAL;
		}
		break;
	case PACKET_RESERVE:
		val = po->tp_reserve;
		break;
	case PACKET_LOSS:
		val = po->tp_loss;
		break;
	case PACKET_TIMESTAMP:
		val = po->tp_tstamp;
		break;
	case PACKET_FANOUT:
		val = (po->fanout ?
		       ((u32)po->fanout->id |
			((u32)po->fanout->type << 16) |
			((u32)po->fanout->flags << 24)) :
		       0);
		break;
	case PACKET_ROLLOVER_STATS:
		if (!po->rollover)
			return -EINVAL;
		rstats.tp_all = atomic_long_read(&po->rollover->num);
		rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
		rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
		data = &rstats;
		lv = sizeof(rstats);
		break;
	case PACKET_TX_HAS_OFF:
		val = po->tp_tx_has_off;
		break;
	case PACKET_QDISC_BYPASS:
		val = packet_use_direct_xmit(po);
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, data, len))
		return -EFAULT;
	return 0;
}
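
/*
 * Userspace view (illustrative sketch): packet_getsockopt() above truncates
 * the reply to min(len, lv), so callers pass a buffer length in *optlen and
 * read the updated value back. PACKET_STATISTICS also clears the counters,
 * which is why the kernel copies and resets po->stats under the
 * receive-queue lock. Sketch for a socket still using the default
 * TPACKET_V1/V2 statistics layout:
 *
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *
 *	static void dump_packet_stats(int fd)
 *	{
 *		struct tpacket_stats st;
 *		socklen_t len = sizeof(st);
 *
 *		if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) < 0) {
 *			perror("PACKET_STATISTICS");
 *			return;
 *		}
 *		// tp_packets already includes tp_drops, see the += above
 *		printf("received %u, dropped %u\n", st.tp_packets, st.tp_drops);
 *	}
 */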


#ifdef CONFIG_COMPAT
static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
				    char __user *optval, unsigned int optlen)
{
	struct packet_sock *po = pkt_sk(sock->sk);

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (optname == PACKET_FANOUT_DATA &&
	    po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
		optval = (char __user *)get_compat_bpf_fprog(optval);
		if (!optval)
			return -EFAULT;
		optlen = sizeof(struct sock_fprog);
	}

	return packet_setsockopt(sock, level, optname, optval, optlen);
}
#endif

static int packet_notifier(struct notifier_block *this,
			   unsigned long msg, void *ptr)
{
	struct sock *sk;
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	rcu_read_lock();
	sk_for_each_rcu(sk, &net->packet.sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
			if (po->mclist)
				packet_dev_mclist_delete(dev, &po->mclist);
			/* fallthrough */

		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->running) {
					__unregister_prot_hook(sk, false);
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					packet_cached_dev_reset(po);
					po->ifindex = -1;
					if (po->prot_hook.dev)
						dev_put(po->prot_hook.dev);
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->num)
					register_prot_hook(sk);
				spin_unlock(&po->bind_lock);
			}
			break;
		}
	}
	rcu_read_unlock();
	return NOTIFY_DONE;
}


static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
	case SIOCOUTQ:
	{
		int amount = sk_wmem_alloc_get(sk);

		return put_user(amount, (int __user *)arg);
	}
	case SIOCINQ:
	{
		struct sk_buff *skb;
		int amount = 0;

		spin_lock_bh(&sk->sk_receive_queue.lock);
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		return put_user(amount, (int __user *)arg);
	}
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);

#ifdef CONFIG_INET
	case SIOCADDRT:
	case SIOCDELRT:
	case SIOCDARP:
	case SIOCGARP:
	case SIOCSARP:
	case SIOCGIFADDR:
	case SIOCSIFADDR:
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCSIFFLAGS:
		return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
	return 0;
}
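
/*
 * Userspace view (illustrative sketch): the SIOCINQ/SIOCOUTQ branches above
 * report the length of the next queued frame and the not-yet-sent bytes in
 * the write queue, both returned through an int passed to ioctl(2).
 *
 *	#include <stdio.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/sockios.h>
 *
 *	static void dump_queue_levels(int fd)
 *	{
 *		int inq = 0, outq = 0;
 *
 *		if (ioctl(fd, SIOCINQ, &inq) == 0 && ioctl(fd, SIOCOUTQ, &outq) == 0)
 *			printf("next frame: %d bytes, unsent: %d bytes\n", inq, outq);
 *	}
 */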

static unsigned int packet_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->rx_ring.pg_vec) {
		if (!packet_previous_rx_frame(po, &po->rx_ring,
			TP_STATUS_KERNEL))
			mask |= POLLIN | POLLRDNORM;
	}
	if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
		po->pressure = 0;
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	spin_lock_bh(&sk->sk_write_queue.lock);
	if (po->tx_ring.pg_vec) {
		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
			mask |= POLLOUT | POLLWRNORM;
	}
	spin_unlock_bh(&sk->sk_write_queue.lock);
	return mask;
}


/* Dirty? Well, I still did not learn better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&pkt_sk(sk)->mapped);
}

static const struct vm_operations_struct packet_mmap_ops = {
	.open = packet_mm_open,
	.close = packet_mm_close,
};

static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
			unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i].buffer)) {
			if (is_vmalloc_addr(pg_vec[i].buffer))
				vfree(pg_vec[i].buffer);
			else
				free_pages((unsigned long)pg_vec[i].buffer,
					   order);
			pg_vec[i].buffer = NULL;
		}
	}
	kfree(pg_vec);
}

static char *alloc_one_pg_vec_page(unsigned long order)
{
	char *buffer;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

	buffer = (char *) __get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/* __get_free_pages failed, fall back to vmalloc */
	buffer = vzalloc((1 << order) * PAGE_SIZE);
	if (buffer)
		return buffer;

	/* vmalloc failed, lets dig into swap here */
	gfp_flags &= ~__GFP_NORETRY;
	buffer = (char *) __get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/* complete and utter failure */
	return NULL;
}

static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
	unsigned int block_nr = req->tp_block_nr;
	struct pgv *pg_vec;
	int i;

	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
	if (unlikely(!pg_vec))
		goto out;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i].buffer = alloc_one_pg_vec_page(order);
		if (unlikely(!pg_vec[i].buffer))
			goto out_free_pgvec;
	}

out:
	return pg_vec;

out_free_pgvec:
	free_pg_vec(pg_vec, order, block_nr);
	pg_vec = NULL;
	goto out;
}

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring)
{
	struct pgv *pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;
	__be16 num;
	int err = -EINVAL;
	/* Added to avoid minimal code churn */
	struct tpacket_req *req = &req_u->req;

	lock_sock(sk);
	/* Opening a Tx-ring is NOT supported in TPACKET_V3 */
	if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
		net_warn_ratelimited("Tx-ring is not supported.\n");
		goto out;
	}

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	err = -EBUSY;
	if (!closing) {
		if (atomic_read(&po->mapped))
			goto out;
		if (packet_read_pending(rb))
			goto out;
	}

	if (req->tp_block_nr) {
		/* Sanity tests and some calculations */
		err = -EBUSY;
		if (unlikely(rb->pg_vec))
			goto out;

		switch (po->tp_version) {
		case TPACKET_V1:
			po->tp_hdrlen = TPACKET_HDRLEN;
			break;
		case TPACKET_V2:
			po->tp_hdrlen = TPACKET2_HDRLEN;
			break;
		case TPACKET_V3:
			po->tp_hdrlen = TPACKET3_HDRLEN;
			break;
		}

		err = -EINVAL;
		if (unlikely((int)req->tp_block_size <= 0))
			goto out;
		if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
			goto out;
		if (po->tp_version >= TPACKET_V3 &&
		    (int)(req->tp_block_size -
			  BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
			goto out;
		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
					po->tp_reserve))
			goto out;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			goto out;

		rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
		if (unlikely(rb->frames_per_block == 0))
			goto out;
		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
					req->tp_frame_nr))
			goto out;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;
		switch (po->tp_version) {
		case TPACKET_V3:
			/* Transmit path is not supported. We checked
			 * it above but just being paranoid
			 */
			if (!tx_ring)
				init_prb_bdqc(po, rb, pg_vec, req_u);
			break;
		default:
			break;
		}
	}
	/* Done */
	else {
		err = -EINVAL;
		if (unlikely(req->tp_frame_nr))
			goto out;
	}


	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		po->num = 0;
		__unregister_prot_hook(sk, false);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	mutex_lock(&po->pg_vec_lock);
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
		spin_lock_bh(&rb_queue->lock);
		swap(rb->pg_vec, pg_vec);
		rb->frame_max = (req->tp_frame_nr - 1);
		rb->head = 0;
		rb->frame_size = req->tp_frame_size;
		spin_unlock_bh(&rb_queue->lock);

		swap(rb->pg_vec_order, order);
		swap(rb->pg_vec_len, req->tp_block_nr);

		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
		skb_queue_purge(rb_queue);
		if (atomic_read(&po->mapped))
			pr_err("packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));
	}
	mutex_unlock(&po->pg_vec_lock);

	spin_lock(&po->bind_lock);
	if (was_running) {
		po->num = num;
		register_prot_hook(sk);
	}
	spin_unlock(&po->bind_lock);
	if (closing && (po->tp_version > TPACKET_V2)) {
		/* Because we don't support block-based V3 on tx-ring */
		if (!tx_ring)
			prb_shutdown_retire_blk_timer(po, rb_queue);
	}

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	release_sock(sk);
	return err;
}
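
/*
 * Userspace view (illustrative sketch): a ring request that passes the sanity
 * checks in packet_set_ring() above. tp_block_size must be positive and
 * page-aligned, tp_frame_size a multiple of TPACKET_ALIGNMENT and at least
 * tp_hdrlen + tp_reserve, and tp_frame_nr must equal frames_per_block *
 * tp_block_nr. The sizes below are arbitrary assumptions, and the socket is
 * assumed to have been switched to TPACKET_V3 beforehand (see the
 * PACKET_VERSION sketch earlier).
 *
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *
 *	static int setup_rx_ring_v3(int fd)
 *	{
 *		struct tpacket_req3 req;
 *
 *		memset(&req, 0, sizeof(req));
 *		req.tp_block_size = sysconf(_SC_PAGESIZE) * 4;	// page-aligned
 *		req.tp_frame_size = 2048;			// multiple of TPACKET_ALIGNMENT
 *		req.tp_block_nr = 64;
 *		req.tp_frame_nr = (req.tp_block_size / req.tp_frame_size) * req.tp_block_nr;
 *		req.tp_retire_blk_tov = 60;			// block retire timeout, ms
 *
 *		return setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	}
 */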

static int packet_mmap(struct file *file, struct socket *sock,
		struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&po->pg_vec_lock);

	expected_size = 0;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec) {
			expected_size += rb->pg_vec_len
						* rb->pg_vec_pages
						* PAGE_SIZE;
		}
	}

	if (expected_size == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)
		goto out;

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)
			continue;

		for (i = 0; i < rb->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = rb->pg_vec[i].buffer;
			int pg_num;

			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
				page = pgv_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (unlikely(err))
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
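
/*
 * Userspace view (illustrative sketch): packet_mmap() above accepts a single
 * mapping at offset 0 whose length is exactly the sum of the configured ring
 * sizes, RX ring pages first, then TX ring pages. A minimal sketch, assuming
 * only an RX ring was configured with the tpacket_req3 from the previous
 * example (headers as in that sketch, plus <sys/mman.h>):
 *
 *	#include <sys/mman.h>
 *
 *	static void *map_rx_ring(int fd, const struct tpacket_req3 *req)
 *	{
 *		size_t len = (size_t)req->tp_block_size * req->tp_block_nr;
 *		void *ring = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *				  MAP_SHARED, fd, 0);
 *
 *		return ring == MAP_FAILED ? NULL : ring;
 *	}
 */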

static const struct proto_ops packet_ops_spkt = {
	.family = PF_PACKET,
	.owner = THIS_MODULE,
	.release = packet_release,
	.bind = packet_bind_spkt,
	.connect = sock_no_connect,
	.socketpair = sock_no_socketpair,
	.accept = sock_no_accept,
	.getname = packet_getname_spkt,
	.poll = datagram_poll,
	.ioctl = packet_ioctl,
	.listen = sock_no_listen,
	.shutdown = sock_no_shutdown,
	.setsockopt = sock_no_setsockopt,
	.getsockopt = sock_no_getsockopt,
	.sendmsg = packet_sendmsg_spkt,
	.recvmsg = packet_recvmsg,
	.mmap = sock_no_mmap,
	.sendpage = sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
	.family = PF_PACKET,
	.owner = THIS_MODULE,
	.release = packet_release,
	.bind = packet_bind,
	.connect = sock_no_connect,
	.socketpair = sock_no_socketpair,
	.accept = sock_no_accept,
	.getname = packet_getname,
	.poll = packet_poll,
	.ioctl = packet_ioctl,
	.listen = sock_no_listen,
	.shutdown = sock_no_shutdown,
	.setsockopt = packet_setsockopt,
	.getsockopt = packet_getsockopt,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_packet_setsockopt,
#endif
	.sendmsg = packet_sendmsg,
	.recvmsg = packet_recvmsg,
	.mmap = packet_mmap,
	.sendpage = sock_no_sendpage,
};

static const struct net_proto_family packet_family_ops = {
	.family = PF_PACKET,
	.create = packet_create,
	.owner = THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call = packet_notifier,
};

#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net = seq_file_net(seq);

	rcu_read_lock();
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
	else {
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start = packet_seq_start,
	.next = packet_seq_next,
	.stop = packet_seq_stop,
	.show = packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
	.owner = THIS_MODULE,
	.open = packet_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};

#endif

static int __net_init packet_net_init(struct net *net)
{
	mutex_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
		return -ENOMEM;

	return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
	remove_proc_entry("packet", net->proc_net);
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};


static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	sock_register(&packet_family_ops);
	register_pernet_subsys(&packet_net_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);