/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
#include <net/compat.h>

#include "internal.h"

/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate skb when the header
     will not fit into the reserved space (tunnel), other ones are silly
     (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to ll
		 header.  PPP does this, which is wrong, because it introduces
		 asymmetry between rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Resume
  If dev->hard_header==NULL we are unlikely to restore sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */

/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define PGV_FROM_VMALLOC 1

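/* Accessors for the TPACKET_V3 block descriptor header fields (status,
 * packet count, offsets, length, sequence number) and for the user-private
 * area that follows the block header.
 */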
#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);

struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};

#define vio_le() virtio_legacy_is_little_endian()

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

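/* Helpers to map a packet_ring_buffer to its TPACKET_V3 block-descriptor
 * queue state and to the current or indexed block descriptor within pg_vec.
 */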
#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

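/* Send the skb straight to the device's transmit queue, bypassing the
 * qdisc layer; used as po->xmit when the socket enables PACKET_QDISC_BYPASS.
 */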
250static int packet_direct_xmit(struct sk_buff *skb)
251{
252 struct net_device *dev = skb->dev;
104ba78c 253 struct sk_buff *orig_skb = skb;
d346a3fa 254 struct netdev_queue *txq;
43279500 255 int ret = NETDEV_TX_BUSY;
d346a3fa
DB
256
257 if (unlikely(!netif_running(dev) ||
43279500
DB
258 !netif_carrier_ok(dev)))
259 goto drop;
d346a3fa 260
104ba78c
WB
261 skb = validate_xmit_skb_list(skb, dev);
262 if (skb != orig_skb)
43279500 263 goto drop;
d346a3fa 264
10c51b56 265 txq = skb_get_tx_queue(dev, skb);
d346a3fa 266
43279500
DB
267 local_bh_disable();
268
269 HARD_TX_LOCK(dev, txq, smp_processor_id());
10b3ad8c 270 if (!netif_xmit_frozen_or_drv_stopped(txq))
fa2dbdc2 271 ret = netdev_start_xmit(skb, dev, txq, false);
43279500 272 HARD_TX_UNLOCK(dev, txq);
d346a3fa 273
43279500
DB
274 local_bh_enable();
275
276 if (!dev_xmit_complete(ret))
d346a3fa 277 kfree_skb(skb);
43279500 278
d346a3fa 279 return ret;
43279500 280drop:
0f97ede4 281 atomic_long_inc(&dev->tx_dropped);
104ba78c 282 kfree_skb_list(skb);
43279500 283 return NET_XMIT_DROP;
d346a3fa
DB
284}
285
66e56cd4
DB
286static struct net_device *packet_cached_dev_get(struct packet_sock *po)
287{
288 struct net_device *dev;
289
290 rcu_read_lock();
291 dev = rcu_dereference(po->cached_dev);
292 if (likely(dev))
293 dev_hold(dev);
294 rcu_read_unlock();
295
296 return dev;
297}
298
299static void packet_cached_dev_assign(struct packet_sock *po,
300 struct net_device *dev)
301{
302 rcu_assign_pointer(po->cached_dev, dev);
303}
304
305static void packet_cached_dev_reset(struct packet_sock *po)
306{
307 RCU_INIT_POINTER(po->cached_dev, NULL);
308}
309
d346a3fa
DB
310static bool packet_use_direct_xmit(const struct packet_sock *po)
311{
312 return po->xmit == packet_direct_xmit;
313}
314
0fd5d57b 315static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
d346a3fa 316{
1cbac010 317 return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
d346a3fa
DB
318}
319
0fd5d57b
DB
320static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
321{
322 const struct net_device_ops *ops = dev->netdev_ops;
323 u16 queue_index;
324
325 if (ops->ndo_select_queue) {
326 queue_index = ops->ndo_select_queue(dev, skb, NULL,
327 __packet_pick_tx_queue);
328 queue_index = netdev_cap_txqueue(dev, queue_index);
329 } else {
330 queue_index = __packet_pick_tx_queue(dev, skb);
331 }
332
333 skb_set_queue_mapping(skb, queue_index);
334}
335
ce06b03e
DM
336/* register_prot_hook must be invoked with the po->bind_lock held,
337 * or from a context in which asynchronous accesses to the packet
338 * socket is not possible (packet_create()).
339 */
340static void register_prot_hook(struct sock *sk)
341{
342 struct packet_sock *po = pkt_sk(sk);
e40526cb 343
ce06b03e 344 if (!po->running) {
66e56cd4 345 if (po->fanout)
dc99f600 346 __fanout_link(sk, po);
66e56cd4 347 else
dc99f600 348 dev_add_pack(&po->prot_hook);
e40526cb 349
ce06b03e
DM
350 sock_hold(sk);
351 po->running = 1;
352 }
353}
354
355/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
356 * held. If the sync parameter is true, we will temporarily drop
357 * the po->bind_lock and do a synchronize_net to make sure no
358 * asynchronous packet processing paths still refer to the elements
359 * of po->prot_hook. If the sync parameter is false, it is the
360 * callers responsibility to take care of this.
361 */
362static void __unregister_prot_hook(struct sock *sk, bool sync)
363{
364 struct packet_sock *po = pkt_sk(sk);
365
366 po->running = 0;
66e56cd4
DB
367
368 if (po->fanout)
dc99f600 369 __fanout_unlink(sk, po);
66e56cd4 370 else
dc99f600 371 __dev_remove_pack(&po->prot_hook);
e40526cb 372
ce06b03e
DM
373 __sock_put(sk);
374
375 if (sync) {
376 spin_unlock(&po->bind_lock);
377 synchronize_net();
378 spin_lock(&po->bind_lock);
379 }
380}
381
382static void unregister_prot_hook(struct sock *sk, bool sync)
383{
384 struct packet_sock *po = pkt_sk(sk);
385
386 if (po->running)
387 __unregister_prot_hook(sk, sync);
388}
389
6e58040b 390static inline struct page * __pure pgv_to_page(void *addr)
0af55bb5
CG
391{
392 if (is_vmalloc_addr(addr))
393 return vmalloc_to_page(addr);
394 return virt_to_page(addr);
395}
396
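/* The tp_status word of each ring frame is the ownership flag shared with
 * user space: TP_STATUS_KERNEL frames belong to the kernel, TP_STATUS_USER
 * frames belong to the application.  The dcache flushes keep the status
 * visible on architectures with aliasing data caches.
 */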
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}

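/* Pick the timestamp to report for a received frame: a raw hardware
 * timestamp if requested and available, otherwise the software timestamp.
 * The return value is the TP_STATUS_TS_* flag describing the source.
 */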
static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}

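/* Translate a linear frame index into the page vector: each pg_vec entry
 * holds rb->frames_per_block frames of rb->frame_size bytes.  NULL is
 * returned when the frame at that position is not in the wanted status.
 */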
static void *packet_lookup_frame(struct packet_sock *po,
				 struct packet_ring_buffer *rb,
				 unsigned int position,
				 int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_init_blk_timer(struct packet_sock *po,
		struct tpacket_kbdq_core *pkc,
		void (*func) (unsigned long))
{
	init_timer(&pkc->retire_blk_timer);
	pkc->retire_blk_timer.data = (long)po;
	pkc->retire_blk_timer.function = func;
	pkc->retire_blk_timer.expires = jiffies;
}

static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}

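/* Derive the block-retire timeout from the link speed when the user did
 * not supply one.  For example, with a 1 MB block on a 1 Gbit/s link:
 * mbits = 8, div = 1, msec = 1, so the timeout is 8 + 1 = 9 ms, i.e. just
 * above the time it takes to fill the block.
 */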
static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_link_ksettings ecmd;
	int err;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_link_ksettings(dev, &ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (ecmd.base.speed < SPEED_1000 ||
		    ecmd.base.speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = ecmd.base.speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}

static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start = pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks = req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po);
	prb_open_block(p1, pbd);
}

/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, lets say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
static void prb_retire_rx_blk_timer_expired(unsigned long data)
{
	struct packet_sock *po = (struct packet_sock *)data;
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 *
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			if (!BLOCK_NUM_PKTS(pbd)) {
				/* An empty block. Just refresh the timer. */
				goto refresh_timer;
			}
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 *	   lagging behind.
			 */
			if (prb_curr_blk_in_use(pkc, pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. queue was frozen, user-space caught up,
				 * now the link went idle && the timer fired.
				 * We don't have a block to close. So we open this
				 * block and restart the timer.
				 * Opening a block thaws the queue and restarts the
				 * timer; thawing/timer-refresh is a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}

static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header (we know the header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}

/*
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note: We DONT refresh the timer on purpose.
 *	Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
	struct sock *sk = &po->sk;

	if (po->stats.stats3.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time.
		 *
		 * It shouldn't really happen as we don't close empty
		 * blocks. See prb_retire_rx_blk_timer_expired().
		 */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	getnstimeofday(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}

/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
				  struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}

static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}

static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
		struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (skb_vlan_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->hv1.tp_vlan_tpid = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_padding = 0;
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}

/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status,
					    unsigned int len)
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if that last block which caused the queue to freeze,
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pkc, pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available. user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}

static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, status, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return NULL;
	}
}

static void *prb_lookup_block(struct packet_sock *po,
				     struct packet_ring_buffer *rb,
				     unsigned int idx,
				     int status)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;
	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}

/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}

static void packet_increment_rx_head(struct packet_sock *po,
					    struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

static void packet_inc_pending(struct packet_ring_buffer *rb)
{
	this_cpu_inc(*rb->pending_refcnt);
}

static void packet_dec_pending(struct packet_ring_buffer *rb)
{
	this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
	unsigned int refcnt = 0;
	int cpu;

	/* We don't use pending refcount in rx_ring. */
	if (rb->pending_refcnt == NULL)
		return 0;

	for_each_possible_cpu(cpu)
		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

	return refcnt;
}

static int packet_alloc_pending(struct packet_sock *po)
{
	po->rx_ring.pending_refcnt = NULL;

	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
	if (unlikely(po->tx_ring.pending_refcnt == NULL))
		return -ENOBUFS;

	return 0;
}

static void packet_free_pending(struct packet_sock *po)
{
	free_percpu(po->tx_ring.pending_refcnt);
}

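/* Receive-queue occupancy levels reported by packet_rcv_has_room():
 * ROOM_NORMAL means at least a quarter of the ring (or receive buffer)
 * is still free, ROOM_LOW means some space is left, ROOM_NONE means the
 * next frame cannot be stored.
 */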
#define ROOM_POW_OFF	2
#define ROOM_NONE	0x0
#define ROOM_LOW	0x1
#define ROOM_NORMAL	0x2

static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.frame_max + 1;
	idx = po->rx_ring.head;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.prb_bdqc.knum_blocks;
	idx = po->rx_ring.prb_bdqc.kactive_blk_num;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	struct sock *sk = &po->sk;
	int ret = ROOM_NONE;

	if (po->prot_hook.func != tpacket_rcv) {
		int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
					  - (skb ? skb->truesize : 0);
		if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
			return ROOM_NORMAL;
		else if (avail > 0)
			return ROOM_LOW;
		else
			return ROOM_NONE;
	}

	if (po->tp_version == TPACKET_V3) {
		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_v3_has_room(po, 0))
			ret = ROOM_LOW;
	} else {
		if (__tpacket_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_has_room(po, 0))
			ret = ROOM_LOW;
	}

	return ret;
}

static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	int ret;
	bool has_room;

	spin_lock_bh(&po->sk.sk_receive_queue.lock);
	ret = __packet_rcv_has_room(po, skb);
	has_room = ret == ROOM_NORMAL;
	if (po->pressure == has_room)
		po->pressure = !has_room;
	spin_unlock_bh(&po->sk.sk_receive_queue.lock);

	return ret;
}

static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}

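/* Track recently seen rxhash values; a flow that fills more than half of
 * the rollover history is considered "huge" and is rolled over to another
 * socket even when the current one only reports ROOM_LOW.
 */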
static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
{
	u32 rxhash;
	int i, count = 0;

	rxhash = skb_get_hash(skb);
	for (i = 0; i < ROLLOVER_HLEN; i++)
		if (po->rollover->history[i] == rxhash)
			count++;

	po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
	return count > (ROLLOVER_HLEN >> 1);
}

static unsigned int fanout_demux_hash(struct packet_fanout *f,
				      struct sk_buff *skb,
				      unsigned int num)
{
	return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
}

static unsigned int fanout_demux_lb(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	unsigned int val = atomic_inc_return(&f->rr_cur);

	return val % num;
}

static unsigned int fanout_demux_cpu(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return smp_processor_id() % num;
}

static unsigned int fanout_demux_rnd(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return prandom_u32_max(num);
}

static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, bool try_self,
					  unsigned int num)
{
	struct packet_sock *po, *po_next, *po_skip = NULL;
	unsigned int i, j, room = ROOM_NONE;

	po = pkt_sk(f->arr[idx]);

	if (try_self) {
		room = packet_rcv_has_room(po, skb);
		if (room == ROOM_NORMAL ||
		    (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
			return idx;
		po_skip = po;
	}

	i = j = min_t(int, po->rollover->sock, num - 1);
	do {
		po_next = pkt_sk(f->arr[i]);
		if (po_next != po_skip && !po_next->pressure &&
		    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
			if (i != j)
				po->rollover->sock = i;
			atomic_long_inc(&po->rollover->num);
			if (room == ROOM_LOW)
				atomic_long_inc(&po->rollover->num_huge);
			return i;
		}

		if (++i == num)
			i = 0;
	} while (i != j);

	atomic_long_inc(&po->rollover->num_failed);
	return idx;
}

static unsigned int fanout_demux_qm(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	return skb_get_queue_mapping(skb) % num;
}

static unsigned int fanout_demux_bpf(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	struct bpf_prog *prog;
	unsigned int ret = 0;

	rcu_read_lock();
	prog = rcu_dereference(f->bpf_prog);
	if (prog)
		ret = bpf_prog_run_clear_cb(prog, skb) % num;
	rcu_read_unlock();

	return ret;
}

static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
	return f->flags & (flag >> 8);
}

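/* Entry point for all packets received by a fanout group: pick a member
 * socket according to the group's demux mode and hand the skb to that
 * socket's normal prot_hook handler.
 */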
static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = READ_ONCE(f->num_members);
	struct net *net = read_pnet(&f->net);
	struct packet_sock *po;
	unsigned int idx;

	if (!net_eq(dev_net(dev), net) || !num) {
		kfree_skb(skb);
		return 0;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
		skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
		if (!skb)
			return 0;
	}
	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		idx = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		idx = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		idx = fanout_demux_cpu(f, skb, num);
		break;
	case PACKET_FANOUT_RND:
		idx = fanout_demux_rnd(f, skb, num);
		break;
	case PACKET_FANOUT_QM:
		idx = fanout_demux_qm(f, skb, num);
		break;
	case PACKET_FANOUT_ROLLOVER:
		idx = fanout_demux_rollover(f, skb, 0, false, num);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		idx = fanout_demux_bpf(f, skb, num);
		break;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
		idx = fanout_demux_rollover(f, skb, idx, true, num);

	po = pkt_sk(f->arr[idx]);
	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);

static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	if (f->num_members == 1)
		dev_add_pack(&f->prot_hook);
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	if (f->num_members == 0)
		__dev_remove_pack(&f->prot_hook);
	spin_unlock(&f->lock);
}

static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
	if (sk->sk_family != PF_PACKET)
		return false;

	return ptype->af_packet_priv == pkt_sk(sk)->fanout;
}

static void fanout_init_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_LB:
		atomic_set(&f->rr_cur, 0);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		RCU_INIT_POINTER(f->bpf_prog, NULL);
		break;
	}
}

static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
{
	struct bpf_prog *old;

	spin_lock(&f->lock);
	old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
	rcu_assign_pointer(f->bpf_prog, new);
	spin_unlock(&f->lock);

	if (old) {
		synchronize_net();
		bpf_prog_destroy(old);
	}
}

static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
				unsigned int len)
{
	struct bpf_prog *new;
	struct sock_fprog fprog;
	int ret;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;
	if (len != sizeof(fprog))
		return -EINVAL;
	if (copy_from_user(&fprog, data, len))
		return -EFAULT;

	ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
	if (ret)
		return ret;

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
				unsigned int len)
{
	struct bpf_prog *new;
	u32 fd;

	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
		return -EPERM;
	if (len != sizeof(fd))
		return -EINVAL;
	if (copy_from_user(&fd, data, len))
		return -EFAULT;

	new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
	if (IS_ERR(new))
		return PTR_ERR(new);

	__fanout_set_data_bpf(po->fanout, new);
	return 0;
}

static int fanout_set_data(struct packet_sock *po, char __user *data,
			   unsigned int len)
{
	switch (po->fanout->type) {
	case PACKET_FANOUT_CBPF:
		return fanout_set_data_cbpf(po, data, len);
	case PACKET_FANOUT_EBPF:
		return fanout_set_data_ebpf(po, data, len);
	default:
		return -EINVAL;
	};
}

static void fanout_release_data(struct packet_fanout *f)
{
	switch (f->type) {
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		__fanout_set_data_bpf(f, NULL);
	};
}

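/* Attach a bound, running packet socket to the fanout group identified by
 * (id, current netns), creating the group on first use.  The low byte of
 * type_flags selects the demux mode, the high byte carries the
 * PACKET_FANOUT_FLAG_* modifiers.
 */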
7736d33f 1624static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600 1625{
d199fab6 1626 struct packet_rollover *rollover = NULL;
dc99f600
DM
1627 struct packet_sock *po = pkt_sk(sk);
1628 struct packet_fanout *f, *match;
7736d33f 1629 u8 type = type_flags & 0xff;
77f65ebd 1630 u8 flags = type_flags >> 8;
dc99f600
DM
1631 int err;
1632
1633 switch (type) {
77f65ebd
WB
1634 case PACKET_FANOUT_ROLLOVER:
1635 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1636 return -EINVAL;
dc99f600
DM
1637 case PACKET_FANOUT_HASH:
1638 case PACKET_FANOUT_LB:
95ec3eb4 1639 case PACKET_FANOUT_CPU:
5df0ddfb 1640 case PACKET_FANOUT_RND:
2d36097d 1641 case PACKET_FANOUT_QM:
47dceb8e 1642 case PACKET_FANOUT_CBPF:
f2e52095 1643 case PACKET_FANOUT_EBPF:
dc99f600
DM
1644 break;
1645 default:
1646 return -EINVAL;
1647 }
1648
d199fab6
ED
1649 mutex_lock(&fanout_mutex);
1650
1651 err = -EINVAL;
dc99f600 1652 if (!po->running)
d199fab6 1653 goto out;
dc99f600 1654
d199fab6 1655 err = -EALREADY;
dc99f600 1656 if (po->fanout)
d199fab6 1657 goto out;
dc99f600 1658
4633c9e0
WB
1659 if (type == PACKET_FANOUT_ROLLOVER ||
1660 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
d199fab6
ED
1661 err = -ENOMEM;
1662 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1663 if (!rollover)
1664 goto out;
1665 atomic_long_set(&rollover->num, 0);
1666 atomic_long_set(&rollover->num_huge, 0);
1667 atomic_long_set(&rollover->num_failed, 0);
1668 po->rollover = rollover;
0648ab70
WB
1669 }
1670
dc99f600
DM
1671 match = NULL;
1672 list_for_each_entry(f, &fanout_list, list) {
1673 if (f->id == id &&
1674 read_pnet(&f->net) == sock_net(sk)) {
1675 match = f;
1676 break;
1677 }
1678 }
afe62c68 1679 err = -EINVAL;
77f65ebd 1680 if (match && match->flags != flags)
afe62c68 1681 goto out;
dc99f600 1682 if (!match) {
afe62c68 1683 err = -ENOMEM;
dc99f600 1684 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1685 if (!match)
1686 goto out;
1687 write_pnet(&match->net, sock_net(sk));
1688 match->id = id;
1689 match->type = type;
77f65ebd 1690 match->flags = flags;
afe62c68
ED
1691 INIT_LIST_HEAD(&match->list);
1692 spin_lock_init(&match->lock);
1693 atomic_set(&match->sk_ref, 0);
47dceb8e 1694 fanout_init_data(match);
afe62c68
ED
1695 match->prot_hook.type = po->prot_hook.type;
1696 match->prot_hook.dev = po->prot_hook.dev;
1697 match->prot_hook.func = packet_rcv_fanout;
1698 match->prot_hook.af_packet_priv = match;
c0de08d0 1699 match->prot_hook.id_match = match_fanout_group;
afe62c68 1700 list_add(&match->list, &fanout_list);
dc99f600 1701 }
afe62c68
ED
1702 err = -EINVAL;
1703 if (match->type == type &&
1704 match->prot_hook.type == po->prot_hook.type &&
1705 match->prot_hook.dev == po->prot_hook.dev) {
1706 err = -ENOSPC;
1707 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1708 __dev_remove_pack(&po->prot_hook);
1709 po->fanout = match;
1710 atomic_inc(&match->sk_ref);
1711 __fanout_link(sk, po);
1712 err = 0;
dc99f600
DM
1713 }
1714 }
afe62c68 1715out:
d199fab6
ED
1716 if (err && rollover) {
1717 kfree(rollover);
0648ab70
WB
1718 po->rollover = NULL;
1719 }
d199fab6 1720 mutex_unlock(&fanout_mutex);
dc99f600
DM
1721 return err;
1722}
1723
2bd624b4
AS
1724/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1725 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1726 * It is the responsibility of the caller to call fanout_release_data() and
1727 * free the returned packet_fanout (after synchronize_net())
1728 */
1729static struct packet_fanout *fanout_release(struct sock *sk)
dc99f600
DM
1730{
1731 struct packet_sock *po = pkt_sk(sk);
1732 struct packet_fanout *f;
1733
fff3321d 1734 mutex_lock(&fanout_mutex);
d199fab6
ED
1735 f = po->fanout;
1736 if (f) {
1737 po->fanout = NULL;
1738
2bd624b4 1739 if (atomic_dec_and_test(&f->sk_ref))
d199fab6 1740 list_del(&f->list);
2bd624b4
AS
1741 else
1742 f = NULL;
dc99f600 1743
d199fab6
ED
1744 if (po->rollover)
1745 kfree_rcu(po->rollover, rcu);
dc99f600
DM
1746 }
1747 mutex_unlock(&fanout_mutex);
2bd624b4
AS
1748
1749 return f;
dc99f600 1750}
1da177e4 1751
3c70c132
DB
1752static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1753 struct sk_buff *skb)
1754{
1755 /* Earlier code assumed this would be a VLAN pkt, double-check
1756 * this now that we have the actual packet in hand. We can only
1757 * do this check on Ethernet devices.
1758 */
1759 if (unlikely(dev->type != ARPHRD_ETHER))
1760 return false;
1761
1762 skb_reset_mac_header(skb);
1763 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1764}
1765
90ddc4f0 1766static const struct proto_ops packet_ops;
1da177e4 1767
90ddc4f0 1768static const struct proto_ops packet_ops_spkt;
1da177e4 1769
40d4e3df
ED
1770static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1771 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1772{
1773 struct sock *sk;
1774 struct sockaddr_pkt *spkt;
1775
1776 /*
1777 * When we registered the protocol we saved the socket in the data
1778 * field for just this event.
1779 */
1780
1781 sk = pt->af_packet_priv;
1ce4f28b 1782
1da177e4
LT
1783 /*
1784 * Yank back the headers [hope the device set this
1785 * right or kerboom...]
1786 *
1787 * Incoming packets have ll header pulled,
1788 * push it back.
1789 *
98e399f8 1790 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1791 * so that this procedure is noop.
1792 */
1793
1794 if (skb->pkt_type == PACKET_LOOPBACK)
1795 goto out;
1796
09ad9bc7 1797 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1798 goto out;
1799
40d4e3df
ED
1800 skb = skb_share_check(skb, GFP_ATOMIC);
1801 if (skb == NULL)
1da177e4
LT
1802 goto oom;
1803
1804 /* drop any routing info */
adf30907 1805 skb_dst_drop(skb);
1da177e4 1806
84531c24
PO
1807 /* drop conntrack reference */
1808 nf_reset(skb);
1809
ffbc6111 1810 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1811
98e399f8 1812 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1813
1814 /*
1815 * The SOCK_PACKET socket receives _all_ frames.
1816 */
1817
1818 spkt->spkt_family = dev->type;
1819 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1820 spkt->spkt_protocol = skb->protocol;
1821
1822 /*
1823 * Charge the memory to the socket. This is done specifically
 1824 * to prevent sockets from using up all the memory.
1825 */
1826
40d4e3df 1827 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1828 return 0;
1829
1830out:
1831 kfree_skb(skb);
1832oom:
1833 return 0;
1834}
1835
1836
1837/*
1838 * Output a raw packet to a device layer. This bypasses all the other
1839 * protocol layers and you must therefore supply it with a complete frame
1840 */
1ce4f28b 1841
1b784140
YX
1842static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1843 size_t len)
1da177e4
LT
1844{
1845 struct sock *sk = sock->sk;
342dfc30 1846 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1847 struct sk_buff *skb = NULL;
1da177e4 1848 struct net_device *dev;
c14ac945 1849 struct sockcm_cookie sockc;
40d4e3df 1850 __be16 proto = 0;
1da177e4 1851 int err;
3bdc0eba 1852 int extra_len = 0;
1ce4f28b 1853
1da177e4 1854 /*
1ce4f28b 1855 * Get and verify the address.
1da177e4
LT
1856 */
1857
40d4e3df 1858 if (saddr) {
1da177e4 1859 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1860 return -EINVAL;
1861 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1862 proto = saddr->spkt_protocol;
1863 } else
1864 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1865
1866 /*
1ce4f28b 1867 * Find the device first to size check it
1da177e4
LT
1868 */
1869
de74e92a 1870 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1871retry:
654d1f8a
ED
1872 rcu_read_lock();
1873 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1874 err = -ENODEV;
1875 if (dev == NULL)
1876 goto out_unlock;
1ce4f28b 1877
d5e76b0a
DM
1878 err = -ENETDOWN;
1879 if (!(dev->flags & IFF_UP))
1880 goto out_unlock;
1881
1da177e4 1882 /*
40d4e3df
ED
1883 * You may not queue a frame bigger than the mtu. This is the lowest level
1884 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1885 */
1ce4f28b 1886
3bdc0eba
BG
1887 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1888 if (!netif_supports_nofcs(dev)) {
1889 err = -EPROTONOSUPPORT;
1890 goto out_unlock;
1891 }
1892 extra_len = 4; /* We're doing our own CRC */
1893 }
1894
1da177e4 1895 err = -EMSGSIZE;
3bdc0eba 1896 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1897 goto out_unlock;
1898
1a35ca80
ED
1899 if (!skb) {
1900 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1901 int tlen = dev->needed_tailroom;
1a35ca80
ED
1902 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1903
1904 rcu_read_unlock();
4ce40912 1905 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1906 if (skb == NULL)
1907 return -ENOBUFS;
1908 /* FIXME: Save some space for broken drivers that write a hard
1909 * header at transmission time by themselves. PPP is the notable
1910 * one here. This should really be fixed at the driver level.
1911 */
1912 skb_reserve(skb, reserved);
1913 skb_reset_network_header(skb);
1914
1915 /* Try to align data part correctly */
1916 if (hhlen) {
1917 skb->data -= hhlen;
1918 skb->tail -= hhlen;
1919 if (len < hhlen)
1920 skb_reset_network_header(skb);
1921 }
6ce8e9ce 1922 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1923 if (err)
1924 goto out_free;
1925 goto retry;
1da177e4
LT
1926 }
1927
9ed988cd
WB
1928 if (!dev_validate_header(dev, skb->data, len)) {
1929 err = -EINVAL;
1930 goto out_unlock;
1931 }
3c70c132
DB
1932 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1933 !packet_extra_vlan_len_allowed(dev, skb)) {
1934 err = -EMSGSIZE;
1935 goto out_unlock;
57f89bfa 1936 }
1a35ca80 1937
edbe7746 1938 sockc.tsflags = sk->sk_tsflags;
c14ac945
SHY
1939 if (msg->msg_controllen) {
1940 err = sock_cmsg_send(sk, msg, &sockc);
f8e7718c 1941 if (unlikely(err))
c14ac945 1942 goto out_unlock;
c14ac945
SHY
1943 }
1944
1da177e4
LT
1945 skb->protocol = proto;
1946 skb->dev = dev;
1947 skb->priority = sk->sk_priority;
2d37a186 1948 skb->mark = sk->sk_mark;
bf84a010 1949
c14ac945 1950 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
1da177e4 1951
3bdc0eba
BG
1952 if (unlikely(extra_len == 4))
1953 skb->no_fcs = 1;
1954
40893fd0 1955 skb_probe_transport_header(skb, 0);
c1aad275 1956
1da177e4 1957 dev_queue_xmit(skb);
654d1f8a 1958 rcu_read_unlock();
40d4e3df 1959 return len;
1da177e4 1960
1da177e4 1961out_unlock:
654d1f8a 1962 rcu_read_unlock();
1a35ca80
ED
1963out_free:
1964 kfree_skb(skb);
1da177e4
LT
1965 return err;
1966}
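/* Illustrative userspace sketch (not part of this file): sending one
 * complete link-layer frame through the legacy SOCK_PACKET path handled by
 * packet_sendmsg_spkt() above.  "eth0", ETH_P_IP and the frame contents are
 * example values; the caller must build the full frame itself.
 */
#if 0
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static int send_raw_frame(const void *frame, size_t len)
{
	struct sockaddr_pkt spkt;
	int fd;

	fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
	if (fd < 0)
		return -1;

	memset(&spkt, 0, sizeof(spkt));
	spkt.spkt_family = AF_PACKET;
	strncpy((char *)spkt.spkt_device, "eth0",
		sizeof(spkt.spkt_device) - 1);
	spkt.spkt_protocol = htons(ETH_P_IP);	/* example protocol */

	if (sendto(fd, frame, len, 0,
		   (struct sockaddr *)&spkt, sizeof(spkt)) < 0) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}
#endif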
1da177e4 1967
ff936a04
AS
1968static unsigned int run_filter(struct sk_buff *skb,
1969 const struct sock *sk,
1970 unsigned int res)
1da177e4
LT
1971{
1972 struct sk_filter *filter;
fda9ef5d 1973
80f8f102
ED
1974 rcu_read_lock();
1975 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1976 if (filter != NULL)
ff936a04 1977 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 1978 rcu_read_unlock();
1da177e4 1979
dbcb5855 1980 return res;
1da177e4
LT
1981}
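/* Illustrative userspace sketch (not part of this file): attaching a classic
 * BPF program with SO_ATTACH_FILTER; run_filter() above is where that
 * program is executed for each packet.  The single-instruction filter,
 * which accepts up to 96 bytes of every packet, is only an example.
 */
#if 0
#include <sys/socket.h>
#include <linux/filter.h>

static int attach_example_filter(int fd)
{
	struct sock_filter insns[] = {
		BPF_STMT(BPF_RET | BPF_K, 96),	/* accept, snap to 96 bytes */
	};
	struct sock_fprog prog = {
		.len	= 1,
		.filter	= insns,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
			  &prog, sizeof(prog));
}
#endif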
1982
16cc1400
WB
1983static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
1984 size_t *len)
1985{
1986 struct virtio_net_hdr vnet_hdr;
1987
1988 if (*len < sizeof(vnet_hdr))
1989 return -EINVAL;
1990 *len -= sizeof(vnet_hdr);
1991
6391a448 1992 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true))
16cc1400
WB
1993 return -EINVAL;
1994
1995 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
1996}
1997
1da177e4 1998/*
62ab0812
ED
 1999 * This function does lazy skb cloning in the hope that most packets
2000 * are discarded by BPF.
2001 *
2002 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
2003 * and skb->cb are mangled. It works because (and until) packets
2004 * falling here are owned by current CPU. Output packets are cloned
2005 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 2006 * sequentially, so that if we return skb to original state on exit,
2007 * we will not harm anyone.
1da177e4
LT
2008 */
2009
40d4e3df
ED
2010static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2011 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2012{
2013 struct sock *sk;
2014 struct sockaddr_ll *sll;
2015 struct packet_sock *po;
40d4e3df 2016 u8 *skb_head = skb->data;
1da177e4 2017 int skb_len = skb->len;
dbcb5855 2018 unsigned int snaplen, res;
da37845f 2019 bool is_drop_n_account = false;
1da177e4
LT
2020
2021 if (skb->pkt_type == PACKET_LOOPBACK)
2022 goto drop;
2023
2024 sk = pt->af_packet_priv;
2025 po = pkt_sk(sk);
2026
09ad9bc7 2027 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2028 goto drop;
2029
1da177e4
LT
2030 skb->dev = dev;
2031
3b04ddde 2032 if (dev->header_ops) {
1da177e4 2033 /* The device has an explicit notion of ll header,
62ab0812
ED
2034 * exported to higher levels.
2035 *
2036 * Otherwise, the device hides details of its frame
 2037 * structure, so that the corresponding packet header is
 2038 * never delivered to the user.
1da177e4
LT
2039 */
2040 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2041 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2042 else if (skb->pkt_type == PACKET_OUTGOING) {
2043 /* Special case: outgoing packets have ll header at head */
bbe735e4 2044 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2045 }
2046 }
2047
2048 snaplen = skb->len;
2049
dbcb5855
DM
2050 res = run_filter(skb, sk, snaplen);
2051 if (!res)
fda9ef5d 2052 goto drop_n_restore;
dbcb5855
DM
2053 if (snaplen > res)
2054 snaplen = res;
1da177e4 2055
0fd7bac6 2056 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2057 goto drop_n_acct;
2058
2059 if (skb_shared(skb)) {
2060 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2061 if (nskb == NULL)
2062 goto drop_n_acct;
2063
2064 if (skb_head != skb->data) {
2065 skb->data = skb_head;
2066 skb->len = skb_len;
2067 }
abc4e4fa 2068 consume_skb(skb);
1da177e4
LT
2069 skb = nskb;
2070 }
2071
b4772ef8 2072 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2073
2074 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2075 sll->sll_hatype = dev->type;
1da177e4 2076 sll->sll_pkttype = skb->pkt_type;
8032b464 2077 if (unlikely(po->origdev))
80feaacb
PWJ
2078 sll->sll_ifindex = orig_dev->ifindex;
2079 else
2080 sll->sll_ifindex = dev->ifindex;
1da177e4 2081
b95cce35 2082 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2083
2472d761
EB
2084 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2085 * Use their space for storing the original skb length.
2086 */
2087 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2088
1da177e4
LT
2089 if (pskb_trim(skb, snaplen))
2090 goto drop_n_acct;
2091
2092 skb_set_owner_r(skb, sk);
2093 skb->dev = NULL;
adf30907 2094 skb_dst_drop(skb);
1da177e4 2095
84531c24
PO
2096 /* drop conntrack reference */
2097 nf_reset(skb);
2098
1da177e4 2099 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2100 po->stats.stats1.tp_packets++;
3bc3b96f 2101 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
2102 __skb_queue_tail(&sk->sk_receive_queue, skb);
2103 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2104 sk->sk_data_ready(sk);
1da177e4
LT
2105 return 0;
2106
2107drop_n_acct:
da37845f 2108 is_drop_n_account = true;
7091fbd8 2109 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2110 po->stats.stats1.tp_drops++;
7091fbd8
WB
2111 atomic_inc(&sk->sk_drops);
2112 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
2113
2114drop_n_restore:
2115 if (skb_head != skb->data && skb_shared(skb)) {
2116 skb->data = skb_head;
2117 skb->len = skb_len;
2118 }
2119drop:
da37845f
WJ
2120 if (!is_drop_n_account)
2121 consume_skb(skb);
2122 else
2123 kfree_skb(skb);
1da177e4
LT
2124 return 0;
2125}
2126
40d4e3df
ED
2127static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2128 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2129{
2130 struct sock *sk;
2131 struct packet_sock *po;
2132 struct sockaddr_ll *sll;
184f489e 2133 union tpacket_uhdr h;
40d4e3df 2134 u8 *skb_head = skb->data;
1da177e4 2135 int skb_len = skb->len;
dbcb5855 2136 unsigned int snaplen, res;
f6fb8f10 2137 unsigned long status = TP_STATUS_USER;
bbd6ef87 2138 unsigned short macoff, netoff, hdrlen;
1da177e4 2139 struct sk_buff *copy_skb = NULL;
bbd6ef87 2140 struct timespec ts;
b9c32fb2 2141 __u32 ts_status;
da37845f 2142 bool is_drop_n_account = false;
1da177e4 2143
51846355
AW
2144 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2145 * We may add members to them until current aligned size without forcing
2146 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2147 */
2148 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2149 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2150
1da177e4
LT
2151 if (skb->pkt_type == PACKET_LOOPBACK)
2152 goto drop;
2153
2154 sk = pt->af_packet_priv;
2155 po = pkt_sk(sk);
2156
09ad9bc7 2157 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2158 goto drop;
2159
3b04ddde 2160 if (dev->header_ops) {
1da177e4 2161 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2162 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2163 else if (skb->pkt_type == PACKET_OUTGOING) {
2164 /* Special case: outgoing packets have ll header at head */
bbe735e4 2165 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2166 }
2167 }
2168
2169 snaplen = skb->len;
2170
dbcb5855
DM
2171 res = run_filter(skb, sk, snaplen);
2172 if (!res)
fda9ef5d 2173 goto drop_n_restore;
68c2e5de
AD
2174
2175 if (skb->ip_summed == CHECKSUM_PARTIAL)
2176 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2177 else if (skb->pkt_type != PACKET_OUTGOING &&
2178 (skb->ip_summed == CHECKSUM_COMPLETE ||
2179 skb_csum_unnecessary(skb)))
2180 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2181
dbcb5855
DM
2182 if (snaplen > res)
2183 snaplen = res;
1da177e4
LT
2184
2185 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2186 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2187 po->tp_reserve;
1da177e4 2188 } else {
95c96174 2189 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2190 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2191 (maclen < 16 ? 16 : maclen)) +
58d19b19
WB
2192 po->tp_reserve;
2193 if (po->has_vnet_hdr)
2194 netoff += sizeof(struct virtio_net_hdr);
1da177e4
LT
2195 macoff = netoff - maclen;
2196 }
f6fb8f10 2197 if (po->tp_version <= TPACKET_V2) {
2198 if (macoff + snaplen > po->rx_ring.frame_size) {
2199 if (po->copy_thresh &&
0fd7bac6 2200 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2201 if (skb_shared(skb)) {
2202 copy_skb = skb_clone(skb, GFP_ATOMIC);
2203 } else {
2204 copy_skb = skb_get(skb);
2205 skb_head = skb->data;
2206 }
2207 if (copy_skb)
2208 skb_set_owner_r(copy_skb, sk);
1da177e4 2209 }
f6fb8f10 2210 snaplen = po->rx_ring.frame_size - macoff;
2211 if ((int)snaplen < 0)
2212 snaplen = 0;
1da177e4 2213 }
dc808110
ED
2214 } else if (unlikely(macoff + snaplen >
2215 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2216 u32 nval;
2217
2218 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2219 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2220 snaplen, nval, macoff);
2221 snaplen = nval;
2222 if (unlikely((int)snaplen < 0)) {
2223 snaplen = 0;
2224 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2225 }
1da177e4 2226 }
1da177e4 2227 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2228 h.raw = packet_current_rx_frame(po, skb,
2229 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2230 if (!h.raw)
58d19b19 2231 goto drop_n_account;
f6fb8f10 2232 if (po->tp_version <= TPACKET_V2) {
2233 packet_increment_rx_head(po, &po->rx_ring);
2234 /*
 2235 * LOSING will be reported until you read the stats,
 2236 * because it's COR - Clear On Read.
 2237 * Anyway, moving it for V1/V2 only, as V3 doesn't need this
2238 * at packet level.
2239 */
ee80fbf3 2240 if (po->stats.stats1.tp_drops)
f6fb8f10 2241 status |= TP_STATUS_LOSING;
2242 }
ee80fbf3 2243 po->stats.stats1.tp_packets++;
1da177e4
LT
2244 if (copy_skb) {
2245 status |= TP_STATUS_COPY;
2246 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2247 }
1da177e4
LT
2248 spin_unlock(&sk->sk_receive_queue.lock);
2249
58d19b19 2250 if (po->has_vnet_hdr) {
5a213881
JR
2251 if (virtio_net_hdr_from_skb(skb, h.raw + macoff -
2252 sizeof(struct virtio_net_hdr),
6391a448 2253 vio_le(), true)) {
58d19b19
WB
2254 spin_lock(&sk->sk_receive_queue.lock);
2255 goto drop_n_account;
2256 }
2257 }
2258
bbd6ef87 2259 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2260
2261 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2262 getnstimeofday(&ts);
1da177e4 2263
b9c32fb2
DB
2264 status |= ts_status;
2265
bbd6ef87
PM
2266 switch (po->tp_version) {
2267 case TPACKET_V1:
2268 h.h1->tp_len = skb->len;
2269 h.h1->tp_snaplen = snaplen;
2270 h.h1->tp_mac = macoff;
2271 h.h1->tp_net = netoff;
4b457bdf
DB
2272 h.h1->tp_sec = ts.tv_sec;
2273 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2274 hdrlen = sizeof(*h.h1);
2275 break;
2276 case TPACKET_V2:
2277 h.h2->tp_len = skb->len;
2278 h.h2->tp_snaplen = snaplen;
2279 h.h2->tp_mac = macoff;
2280 h.h2->tp_net = netoff;
bbd6ef87
PM
2281 h.h2->tp_sec = ts.tv_sec;
2282 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2283 if (skb_vlan_tag_present(skb)) {
2284 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2285 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2286 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2287 } else {
2288 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2289 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2290 }
e4d26f4b 2291 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2292 hdrlen = sizeof(*h.h2);
2293 break;
f6fb8f10 2294 case TPACKET_V3:
 2295 /* tp_next_offset and vlan are already populated above,
 2296 * so DON'T clear those fields here.
2297 */
2298 h.h3->tp_status |= status;
2299 h.h3->tp_len = skb->len;
2300 h.h3->tp_snaplen = snaplen;
2301 h.h3->tp_mac = macoff;
2302 h.h3->tp_net = netoff;
f6fb8f10 2303 h.h3->tp_sec = ts.tv_sec;
2304 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2305 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2306 hdrlen = sizeof(*h.h3);
2307 break;
bbd6ef87
PM
2308 default:
2309 BUG();
2310 }
1da177e4 2311
bbd6ef87 2312 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2313 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2314 sll->sll_family = AF_PACKET;
2315 sll->sll_hatype = dev->type;
2316 sll->sll_protocol = skb->protocol;
2317 sll->sll_pkttype = skb->pkt_type;
8032b464 2318 if (unlikely(po->origdev))
80feaacb
PWJ
2319 sll->sll_ifindex = orig_dev->ifindex;
2320 else
2321 sll->sll_ifindex = dev->ifindex;
1da177e4 2322
e16aa207 2323 smp_mb();
f0d4eb29 2324
f6dafa95 2325#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2326 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2327 u8 *start, *end;
2328
f0d4eb29
DB
2329 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2330 macoff + snaplen);
2331
2332 for (start = h.raw; start < end; start += PAGE_SIZE)
2333 flush_dcache_page(pgv_to_page(start));
1da177e4 2334 }
f0d4eb29 2335 smp_wmb();
f6dafa95 2336#endif
f0d4eb29 2337
da413eec 2338 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2339 __packet_set_status(po, h.raw, status);
da413eec
DC
2340 sk->sk_data_ready(sk);
2341 } else {
f6fb8f10 2342 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2343 }
1da177e4
LT
2344
2345drop_n_restore:
2346 if (skb_head != skb->data && skb_shared(skb)) {
2347 skb->data = skb_head;
2348 skb->len = skb_len;
2349 }
2350drop:
da37845f
WJ
2351 if (!is_drop_n_account)
2352 consume_skb(skb);
2353 else
2354 kfree_skb(skb);
1da177e4
LT
2355 return 0;
2356
58d19b19 2357drop_n_account:
da37845f 2358 is_drop_n_account = true;
ee80fbf3 2359 po->stats.stats1.tp_drops++;
1da177e4
LT
2360 spin_unlock(&sk->sk_receive_queue.lock);
2361
676d2369 2362 sk->sk_data_ready(sk);
acb5d75b 2363 kfree_skb(copy_skb);
1da177e4
LT
2364 goto drop_n_restore;
2365}
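/* Illustrative userspace sketch (not part of this file): the mmap()ed RX
 * ring that tpacket_rcv() above fills for TPACKET_V2.  The ring geometry is
 * an arbitrary example, and fd is assumed to be an AF_PACKET socket with no
 * ring configured yet.
 */
#if 0
#include <string.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int map_rx_ring(int fd, void **ring, struct tpacket_req *req)
{
	int version = TPACKET_V2;

	/* the version must be chosen before the ring is created */
	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
		       &version, sizeof(version)) < 0)
		return -1;

	memset(req, 0, sizeof(*req));
	req->tp_block_size = 4096;		/* example geometry */
	req->tp_block_nr   = 64;
	req->tp_frame_size = 2048;
	req->tp_frame_nr   = req->tp_block_size / req->tp_frame_size *
			     req->tp_block_nr;
	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, req, sizeof(*req)) < 0)
		return -1;

	*ring = mmap(NULL, (size_t)req->tp_block_size * req->tp_block_nr,
		     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	return *ring == MAP_FAILED ? -1 : 0;
}

static void read_one_frame(void *ring, unsigned int idx,
			   const struct tpacket_req *req)
{
	struct tpacket2_hdr *hdr = (void *)((char *)ring +
					    (size_t)idx * req->tp_frame_size);

	if (!(hdr->tp_status & TP_STATUS_USER))
		return;				/* still owned by the kernel */
	/* packet data lives at (char *)hdr + hdr->tp_mac, hdr->tp_snaplen long */
	hdr->tp_status = TP_STATUS_KERNEL;	/* hand the slot back */
}
#endif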
2366
69e3c75f
JB
2367static void tpacket_destruct_skb(struct sk_buff *skb)
2368{
2369 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2370
69e3c75f 2371 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2372 void *ph;
b9c32fb2
DB
2373 __u32 ts;
2374
69e3c75f 2375 ph = skb_shinfo(skb)->destructor_arg;
b0138408 2376 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2377
2378 ts = __packet_set_timestamp(po, ph, skb);
2379 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2380 }
2381
2382 sock_wfree(skb);
2383}
2384
c72219b7
DB
2385static void tpacket_set_protocol(const struct net_device *dev,
2386 struct sk_buff *skb)
2387{
2388 if (dev->type == ARPHRD_ETHER) {
2389 skb_reset_mac_header(skb);
2390 skb->protocol = eth_hdr(skb)->h_proto;
2391 }
2392}
2393
16cc1400
WB
2394static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2395{
16cc1400
WB
2396 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2397 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2398 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2399 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2400 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2401 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2402 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2403
2404 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2405 return -EINVAL;
2406
16cc1400
WB
2407 return 0;
2408}
2409
2410static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2411 struct virtio_net_hdr *vnet_hdr)
2412{
16cc1400
WB
2413 if (*len < sizeof(*vnet_hdr))
2414 return -EINVAL;
2415 *len -= sizeof(*vnet_hdr);
2416
cbbd26b8 2417 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
16cc1400
WB
2418 return -EFAULT;
2419
2420 return __packet_snd_vnet_parse(vnet_hdr, *len);
2421}
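/* Illustrative userspace sketch (not part of this file): enabling
 * PACKET_VNET_HDR so that every packet sent or received on the socket is
 * prefixed with the struct virtio_net_hdr parsed above.  Per the setsockopt
 * handler later in this file, this only works on SOCK_RAW sockets and must
 * be done before any RX/TX ring is configured.
 */
#if 0
#include <sys/socket.h>
#include <linux/if_packet.h>

static int enable_vnet_hdr(int fd)
{
	int one = 1;

	return setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &one, sizeof(one));
}
#endif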
2422
40d4e3df 2423static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2424 void *frame, struct net_device *dev, void *data, int tp_len,
c14ac945
SHY
2425 __be16 proto, unsigned char *addr, int hlen, int copylen,
2426 const struct sockcm_cookie *sockc)
69e3c75f 2427{
184f489e 2428 union tpacket_uhdr ph;
8d39b4a6 2429 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2430 struct socket *sock = po->sk.sk_socket;
2431 struct page *page;
69e3c75f
JB
2432 int err;
2433
2434 ph.raw = frame;
2435
2436 skb->protocol = proto;
2437 skb->dev = dev;
2438 skb->priority = po->sk.sk_priority;
2d37a186 2439 skb->mark = po->sk.sk_mark;
c14ac945 2440 sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2441 skb_shinfo(skb)->destructor_arg = ph.raw;
2442
ae641949 2443 skb_reserve(skb, hlen);
69e3c75f 2444 skb_reset_network_header(skb);
c1aad275 2445
69e3c75f
JB
2446 to_write = tp_len;
2447
2448 if (sock->type == SOCK_DGRAM) {
2449 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2450 NULL, tp_len);
2451 if (unlikely(err < 0))
2452 return -EINVAL;
1d036d25 2453 } else if (copylen) {
9ed988cd
WB
2454 int hdrlen = min_t(int, copylen, tp_len);
2455
69e3c75f 2456 skb_push(skb, dev->hard_header_len);
1d036d25 2457 skb_put(skb, copylen - dev->hard_header_len);
9ed988cd 2458 err = skb_store_bits(skb, 0, data, hdrlen);
69e3c75f
JB
2459 if (unlikely(err))
2460 return err;
9ed988cd
WB
2461 if (!dev_validate_header(dev, skb->data, hdrlen))
2462 return -EINVAL;
c72219b7
DB
2463 if (!skb->protocol)
2464 tpacket_set_protocol(dev, skb);
69e3c75f 2465
9ed988cd
WB
2466 data += hdrlen;
2467 to_write -= hdrlen;
69e3c75f
JB
2468 }
2469
69e3c75f
JB
2470 offset = offset_in_page(data);
2471 len_max = PAGE_SIZE - offset;
2472 len = ((to_write > len_max) ? len_max : to_write);
2473
2474 skb->data_len = to_write;
2475 skb->len += to_write;
2476 skb->truesize += to_write;
2477 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2478
2479 while (likely(to_write)) {
2480 nr_frags = skb_shinfo(skb)->nr_frags;
2481
2482 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2483 pr_err("Packet exceed the number of skb frags(%lu)\n",
2484 MAX_SKB_FRAGS);
69e3c75f
JB
2485 return -EFAULT;
2486 }
2487
0af55bb5
CG
2488 page = pgv_to_page(data);
2489 data += len;
69e3c75f
JB
2490 flush_dcache_page(page);
2491 get_page(page);
0af55bb5 2492 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2493 to_write -= len;
2494 offset = 0;
2495 len_max = PAGE_SIZE;
2496 len = ((to_write > len_max) ? len_max : to_write);
2497 }
2498
8fd6c80d 2499 skb_probe_transport_header(skb, 0);
efdfa2f7 2500
69e3c75f
JB
2501 return tp_len;
2502}
2503
8d39b4a6
WB
2504static int tpacket_parse_header(struct packet_sock *po, void *frame,
2505 int size_max, void **data)
2506{
2507 union tpacket_uhdr ph;
2508 int tp_len, off;
2509
2510 ph.raw = frame;
2511
2512 switch (po->tp_version) {
2513 case TPACKET_V2:
2514 tp_len = ph.h2->tp_len;
2515 break;
2516 default:
2517 tp_len = ph.h1->tp_len;
2518 break;
2519 }
2520 if (unlikely(tp_len > size_max)) {
2521 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2522 return -EMSGSIZE;
2523 }
2524
2525 if (unlikely(po->tp_tx_has_off)) {
2526 int off_min, off_max;
2527
2528 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2529 off_max = po->tx_ring.frame_size - tp_len;
2530 if (po->sk.sk_type == SOCK_DGRAM) {
2531 switch (po->tp_version) {
2532 case TPACKET_V2:
2533 off = ph.h2->tp_net;
2534 break;
2535 default:
2536 off = ph.h1->tp_net;
2537 break;
2538 }
2539 } else {
2540 switch (po->tp_version) {
2541 case TPACKET_V2:
2542 off = ph.h2->tp_mac;
2543 break;
2544 default:
2545 off = ph.h1->tp_mac;
2546 break;
2547 }
2548 }
2549 if (unlikely((off < off_min) || (off_max < off)))
2550 return -EINVAL;
2551 } else {
2552 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2553 }
2554
2555 *data = frame + off;
2556 return tp_len;
2557}
2558
69e3c75f
JB
2559static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2560{
69e3c75f
JB
2561 struct sk_buff *skb;
2562 struct net_device *dev;
1d036d25 2563 struct virtio_net_hdr *vnet_hdr = NULL;
c14ac945 2564 struct sockcm_cookie sockc;
69e3c75f 2565 __be16 proto;
09effa67 2566 int err, reserve = 0;
40d4e3df 2567 void *ph;
342dfc30 2568 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2569 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2570 int tp_len, size_max;
2571 unsigned char *addr;
8d39b4a6 2572 void *data;
69e3c75f 2573 int len_sum = 0;
9e67030a 2574 int status = TP_STATUS_AVAILABLE;
1d036d25 2575 int hlen, tlen, copylen = 0;
69e3c75f 2576
69e3c75f
JB
2577 mutex_lock(&po->pg_vec_lock);
2578
66e56cd4 2579 if (likely(saddr == NULL)) {
e40526cb 2580 dev = packet_cached_dev_get(po);
69e3c75f
JB
2581 proto = po->num;
2582 addr = NULL;
2583 } else {
2584 err = -EINVAL;
2585 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2586 goto out;
2587 if (msg->msg_namelen < (saddr->sll_halen
2588 + offsetof(struct sockaddr_ll,
2589 sll_addr)))
2590 goto out;
69e3c75f
JB
2591 proto = saddr->sll_protocol;
2592 addr = saddr->sll_addr;
827d9780 2593 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2594 }
2595
edbe7746 2596 sockc.tsflags = po->sk.sk_tsflags;
c14ac945
SHY
2597 if (msg->msg_controllen) {
2598 err = sock_cmsg_send(&po->sk, msg, &sockc);
2599 if (unlikely(err))
2600 goto out;
2601 }
2602
69e3c75f
JB
2603 err = -ENXIO;
2604 if (unlikely(dev == NULL))
2605 goto out;
69e3c75f
JB
2606 err = -ENETDOWN;
2607 if (unlikely(!(dev->flags & IFF_UP)))
2608 goto out_put;
2609
5cfb4c8d
DB
2610 if (po->sk.sk_socket->type == SOCK_RAW)
2611 reserve = dev->hard_header_len;
69e3c75f 2612 size_max = po->tx_ring.frame_size
b5dd884e 2613 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2614
1d036d25 2615 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
5cfb4c8d 2616 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2617
69e3c75f
JB
2618 do {
2619 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2620 TP_STATUS_SEND_REQUEST);
69e3c75f 2621 if (unlikely(ph == NULL)) {
87a2fd28
DB
2622 if (need_wait && need_resched())
2623 schedule();
69e3c75f
JB
2624 continue;
2625 }
2626
8d39b4a6
WB
2627 skb = NULL;
2628 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2629 if (tp_len < 0)
2630 goto tpacket_error;
2631
69e3c75f 2632 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2633 hlen = LL_RESERVED_SPACE(dev);
2634 tlen = dev->needed_tailroom;
1d036d25
WB
2635 if (po->has_vnet_hdr) {
2636 vnet_hdr = data;
2637 data += sizeof(*vnet_hdr);
2638 tp_len -= sizeof(*vnet_hdr);
2639 if (tp_len < 0 ||
2640 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2641 tp_len = -EINVAL;
2642 goto tpacket_error;
2643 }
2644 copylen = __virtio16_to_cpu(vio_le(),
2645 vnet_hdr->hdr_len);
2646 }
9ed988cd 2647 copylen = max_t(int, copylen, dev->hard_header_len);
69e3c75f 2648 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2649 hlen + tlen + sizeof(struct sockaddr_ll) +
2650 (copylen - dev->hard_header_len),
fbf33a28 2651 !need_wait, &err);
69e3c75f 2652
fbf33a28
KM
2653 if (unlikely(skb == NULL)) {
2654 /* we assume the socket was initially writeable ... */
2655 if (likely(len_sum > 0))
2656 err = len_sum;
69e3c75f 2657 goto out_status;
fbf33a28 2658 }
8d39b4a6 2659 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
c14ac945 2660 addr, hlen, copylen, &sockc);
dbd46ab4 2661 if (likely(tp_len >= 0) &&
5cfb4c8d 2662 tp_len > dev->mtu + reserve &&
1d036d25 2663 !po->has_vnet_hdr &&
3c70c132
DB
2664 !packet_extra_vlan_len_allowed(dev, skb))
2665 tp_len = -EMSGSIZE;
69e3c75f
JB
2666
2667 if (unlikely(tp_len < 0)) {
8d39b4a6 2668tpacket_error:
69e3c75f
JB
2669 if (po->tp_loss) {
2670 __packet_set_status(po, ph,
2671 TP_STATUS_AVAILABLE);
2672 packet_increment_head(&po->tx_ring);
2673 kfree_skb(skb);
2674 continue;
2675 } else {
2676 status = TP_STATUS_WRONG_FORMAT;
2677 err = tp_len;
2678 goto out_status;
2679 }
2680 }
2681
db60eb5f
JR
2682 if (po->has_vnet_hdr && virtio_net_hdr_to_skb(skb, vnet_hdr,
2683 vio_le())) {
1d036d25
WB
2684 tp_len = -EINVAL;
2685 goto tpacket_error;
2686 }
2687
0fd5d57b
DB
2688 packet_pick_tx_queue(dev, skb);
2689
69e3c75f
JB
2690 skb->destructor = tpacket_destruct_skb;
2691 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2692 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2693
2694 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2695 err = po->xmit(skb);
eb70df13
JP
2696 if (unlikely(err > 0)) {
2697 err = net_xmit_errno(err);
2698 if (err && __packet_get_status(po, ph) ==
2699 TP_STATUS_AVAILABLE) {
2700 /* skb was destructed already */
2701 skb = NULL;
2702 goto out_status;
2703 }
2704 /*
2705 * skb was dropped but not destructed yet;
2706 * let's treat it like congestion or err < 0
2707 */
2708 err = 0;
2709 }
69e3c75f
JB
2710 packet_increment_head(&po->tx_ring);
2711 len_sum += tp_len;
b0138408
DB
2712 } while (likely((ph != NULL) ||
2713 /* Note: packet_read_pending() might be slow if we have
 2714 * to call it, as it's a per-CPU variable, but in the fast path
2715 * we already short-circuit the loop with the first
2716 * condition, and luckily don't have to go that path
2717 * anyway.
2718 */
2719 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2720
2721 err = len_sum;
2722 goto out_put;
2723
69e3c75f
JB
2724out_status:
2725 __packet_set_status(po, ph, status);
2726 kfree_skb(skb);
2727out_put:
e40526cb 2728 dev_put(dev);
69e3c75f
JB
2729out:
2730 mutex_unlock(&po->pg_vec_lock);
2731 return err;
2732}
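/* Illustrative userspace sketch (not part of this file): queueing one frame
 * through the PACKET_TX_RING path that tpacket_snd() above drains.  It
 * assumes a TPACKET_V2 TX ring was requested and mmap()ed like the RX
 * example earlier, that the socket is already bound, and that tp_tx_has_off
 * is not in use; frame_size and idx are example parameters.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int queue_tx_frame(int fd, void *ring, unsigned int idx,
			  unsigned int frame_size,
			  const void *frame, unsigned int len)
{
	struct tpacket2_hdr *hdr = (void *)((char *)ring +
					    (size_t)idx * frame_size);

	if (hdr->tp_status != TP_STATUS_AVAILABLE)
		return -1;		/* slot still owned by the kernel */

	/* frame data starts right after the aligned tpacket2_hdr */
	memcpy((char *)hdr + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll),
	       frame, len);
	hdr->tp_len = len;
	hdr->tp_status = TP_STATUS_SEND_REQUEST;

	/* kick the kernel; MSG_DONTWAIT returns before the ring is drained */
	return send(fd, NULL, 0, MSG_DONTWAIT);
}
#endif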
69e3c75f 2733
eea49cc9
OJ
2734static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2735 size_t reserve, size_t len,
2736 size_t linear, int noblock,
2737 int *err)
bfd5f4a3
SS
2738{
2739 struct sk_buff *skb;
2740
2741 /* Under a page? Don't bother with paged skb. */
2742 if (prepad + len < PAGE_SIZE || !linear)
2743 linear = len;
2744
2745 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2746 err, 0);
bfd5f4a3
SS
2747 if (!skb)
2748 return NULL;
2749
2750 skb_reserve(skb, reserve);
2751 skb_put(skb, linear);
2752 skb->data_len = len - linear;
2753 skb->len += len - linear;
2754
2755 return skb;
2756}
2757
d346a3fa 2758static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2759{
2760 struct sock *sk = sock->sk;
342dfc30 2761 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2762 struct sk_buff *skb;
2763 struct net_device *dev;
0e11c91e 2764 __be16 proto;
1da177e4 2765 unsigned char *addr;
827d9780 2766 int err, reserve = 0;
c7d39e32 2767 struct sockcm_cookie sockc;
bfd5f4a3
SS
2768 struct virtio_net_hdr vnet_hdr = { 0 };
2769 int offset = 0;
bfd5f4a3 2770 struct packet_sock *po = pkt_sk(sk);
57031eb7 2771 int hlen, tlen, linear;
3bdc0eba 2772 int extra_len = 0;
1da177e4
LT
2773
2774 /*
1ce4f28b 2775 * Get and verify the address.
1da177e4 2776 */
1ce4f28b 2777
66e56cd4 2778 if (likely(saddr == NULL)) {
e40526cb 2779 dev = packet_cached_dev_get(po);
1da177e4
LT
2780 proto = po->num;
2781 addr = NULL;
2782 } else {
2783 err = -EINVAL;
2784 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2785 goto out;
0fb375fb
EB
2786 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2787 goto out;
1da177e4
LT
2788 proto = saddr->sll_protocol;
2789 addr = saddr->sll_addr;
827d9780 2790 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2791 }
2792
1da177e4 2793 err = -ENXIO;
e40526cb 2794 if (unlikely(dev == NULL))
1da177e4 2795 goto out_unlock;
d5e76b0a 2796 err = -ENETDOWN;
e40526cb 2797 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2798 goto out_unlock;
2799
edbe7746 2800 sockc.tsflags = sk->sk_tsflags;
c7d39e32
EJ
2801 sockc.mark = sk->sk_mark;
2802 if (msg->msg_controllen) {
2803 err = sock_cmsg_send(sk, msg, &sockc);
2804 if (unlikely(err))
2805 goto out_unlock;
2806 }
2807
e40526cb
DB
2808 if (sock->type == SOCK_RAW)
2809 reserve = dev->hard_header_len;
bfd5f4a3 2810 if (po->has_vnet_hdr) {
16cc1400
WB
2811 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2812 if (err)
bfd5f4a3 2813 goto out_unlock;
bfd5f4a3
SS
2814 }
2815
3bdc0eba
BG
2816 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2817 if (!netif_supports_nofcs(dev)) {
2818 err = -EPROTONOSUPPORT;
2819 goto out_unlock;
2820 }
2821 extra_len = 4; /* We're doing our own CRC */
2822 }
2823
1da177e4 2824 err = -EMSGSIZE;
16cc1400
WB
2825 if (!vnet_hdr.gso_type &&
2826 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2827 goto out_unlock;
2828
bfd5f4a3 2829 err = -ENOBUFS;
ae641949
HX
2830 hlen = LL_RESERVED_SPACE(dev);
2831 tlen = dev->needed_tailroom;
57031eb7
WB
2832 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2833 linear = max(linear, min_t(int, len, dev->hard_header_len));
2834 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
bfd5f4a3 2835 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2836 if (skb == NULL)
1da177e4
LT
2837 goto out_unlock;
2838
bfd5f4a3 2839 skb_set_network_header(skb, reserve);
1da177e4 2840
0c4e8581 2841 err = -EINVAL;
9c707762
WB
2842 if (sock->type == SOCK_DGRAM) {
2843 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2844 if (unlikely(offset < 0))
9c707762 2845 goto out_free;
9c707762 2846 }
1da177e4
LT
2847
2848 /* Returns -EFAULT on error */
c0371da6 2849 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2850 if (err)
2851 goto out_free;
bf84a010 2852
9ed988cd
WB
2853 if (sock->type == SOCK_RAW &&
2854 !dev_validate_header(dev, skb->data, len)) {
2855 err = -EINVAL;
2856 goto out_free;
2857 }
2858
c14ac945 2859 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
1da177e4 2860
16cc1400 2861 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
2862 !packet_extra_vlan_len_allowed(dev, skb)) {
2863 err = -EMSGSIZE;
2864 goto out_free;
57f89bfa
BG
2865 }
2866
09effa67
DM
2867 skb->protocol = proto;
2868 skb->dev = dev;
1da177e4 2869 skb->priority = sk->sk_priority;
c7d39e32 2870 skb->mark = sockc.mark;
0fd5d57b
DB
2871
2872 packet_pick_tx_queue(dev, skb);
1da177e4 2873
bfd5f4a3 2874 if (po->has_vnet_hdr) {
db60eb5f 2875 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
16cc1400
WB
2876 if (err)
2877 goto out_free;
2878 len += sizeof(vnet_hdr);
bfd5f4a3
SS
2879 }
2880
8fd6c80d
DB
2881 skb_probe_transport_header(skb, reserve);
2882
3bdc0eba
BG
2883 if (unlikely(extra_len == 4))
2884 skb->no_fcs = 1;
2885
d346a3fa 2886 err = po->xmit(skb);
1da177e4
LT
2887 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2888 goto out_unlock;
2889
e40526cb 2890 dev_put(dev);
1da177e4 2891
40d4e3df 2892 return len;
1da177e4
LT
2893
2894out_free:
2895 kfree_skb(skb);
2896out_unlock:
e40526cb 2897 if (dev)
1da177e4
LT
2898 dev_put(dev);
2899out:
2900 return err;
2901}
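/* Illustrative userspace sketch (not part of this file): the cooked
 * SOCK_DGRAM transmit path of packet_snd() above, where dev_hard_header()
 * builds the link-layer header from the destination address supplied in
 * sockaddr_ll.  fd is assumed to be an AF_PACKET/SOCK_DGRAM socket; the
 * interface index, protocol and MAC address are example arguments.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static int send_cooked(int fd, int ifindex, const unsigned char dst[ETH_ALEN],
		       const void *payload, size_t len)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_ifindex  = ifindex;
	sll.sll_protocol = htons(ETH_P_IP);	/* example protocol */
	sll.sll_halen    = ETH_ALEN;
	memcpy(sll.sll_addr, dst, ETH_ALEN);

	/* the kernel prepends the link-layer header via dev_hard_header() */
	return sendto(fd, payload, len, 0,
		      (struct sockaddr *)&sll, sizeof(sll));
}
#endif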
2902
1b784140 2903static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2904{
69e3c75f
JB
2905 struct sock *sk = sock->sk;
2906 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2907
69e3c75f
JB
2908 if (po->tx_ring.pg_vec)
2909 return tpacket_snd(po, msg);
2910 else
69e3c75f
JB
2911 return packet_snd(sock, msg, len);
2912}
2913
1da177e4
LT
2914/*
2915 * Close a PACKET socket. This is fairly simple. We immediately go
2916 * to 'closed' state and remove our protocol entry in the device list.
2917 */
2918
2919static int packet_release(struct socket *sock)
2920{
2921 struct sock *sk = sock->sk;
2922 struct packet_sock *po;
2bd624b4 2923 struct packet_fanout *f;
d12d01d6 2924 struct net *net;
f6fb8f10 2925 union tpacket_req_u req_u;
1da177e4
LT
2926
2927 if (!sk)
2928 return 0;
2929
3b1e0a65 2930 net = sock_net(sk);
1da177e4
LT
2931 po = pkt_sk(sk);
2932
0fa7fa98 2933 mutex_lock(&net->packet.sklist_lock);
808f5114 2934 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2935 mutex_unlock(&net->packet.sklist_lock);
2936
2937 preempt_disable();
920de804 2938 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2939 preempt_enable();
1da177e4 2940
808f5114 2941 spin_lock(&po->bind_lock);
ce06b03e 2942 unregister_prot_hook(sk, false);
66e56cd4
DB
2943 packet_cached_dev_reset(po);
2944
160ff18a
BG
2945 if (po->prot_hook.dev) {
2946 dev_put(po->prot_hook.dev);
2947 po->prot_hook.dev = NULL;
2948 }
808f5114 2949 spin_unlock(&po->bind_lock);
1da177e4 2950
1da177e4 2951 packet_flush_mclist(sk);
1da177e4 2952
9665d5d6
PS
2953 if (po->rx_ring.pg_vec) {
2954 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2955 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2956 }
69e3c75f 2957
9665d5d6
PS
2958 if (po->tx_ring.pg_vec) {
2959 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2960 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2961 }
1da177e4 2962
2bd624b4 2963 f = fanout_release(sk);
dc99f600 2964
808f5114 2965 synchronize_net();
2bd624b4
AS
2966
2967 if (f) {
2968 fanout_release_data(f);
2969 kfree(f);
2970 }
1da177e4
LT
2971 /*
2972 * Now the socket is dead. No more input will appear.
2973 */
1da177e4
LT
2974 sock_orphan(sk);
2975 sock->sk = NULL;
2976
2977 /* Purge queues */
2978
2979 skb_queue_purge(&sk->sk_receive_queue);
b0138408 2980 packet_free_pending(po);
17ab56a2 2981 sk_refcnt_debug_release(sk);
1da177e4
LT
2982
2983 sock_put(sk);
2984 return 0;
2985}
2986
2987/*
2988 * Attach a packet hook.
2989 */
2990
30f7ea1c
FR
2991static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
2992 __be16 proto)
1da177e4
LT
2993{
2994 struct packet_sock *po = pkt_sk(sk);
158cd4af 2995 struct net_device *dev_curr;
902fefb8
DB
2996 __be16 proto_curr;
2997 bool need_rehook;
30f7ea1c
FR
2998 struct net_device *dev = NULL;
2999 int ret = 0;
3000 bool unlisted = false;
dc99f600 3001
30f7ea1c 3002 if (po->fanout)
dc99f600 3003 return -EINVAL;
1da177e4
LT
3004
3005 lock_sock(sk);
1da177e4 3006 spin_lock(&po->bind_lock);
30f7ea1c
FR
3007 rcu_read_lock();
3008
3009 if (name) {
3010 dev = dev_get_by_name_rcu(sock_net(sk), name);
3011 if (!dev) {
3012 ret = -ENODEV;
3013 goto out_unlock;
3014 }
3015 } else if (ifindex) {
3016 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3017 if (!dev) {
3018 ret = -ENODEV;
3019 goto out_unlock;
3020 }
3021 }
3022
3023 if (dev)
3024 dev_hold(dev);
66e56cd4 3025
902fefb8
DB
3026 proto_curr = po->prot_hook.type;
3027 dev_curr = po->prot_hook.dev;
3028
3029 need_rehook = proto_curr != proto || dev_curr != dev;
3030
3031 if (need_rehook) {
30f7ea1c
FR
3032 if (po->running) {
3033 rcu_read_unlock();
3034 __unregister_prot_hook(sk, true);
3035 rcu_read_lock();
3036 dev_curr = po->prot_hook.dev;
3037 if (dev)
3038 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3039 dev->ifindex);
3040 }
1da177e4 3041
902fefb8
DB
3042 po->num = proto;
3043 po->prot_hook.type = proto;
902fefb8 3044
30f7ea1c
FR
3045 if (unlikely(unlisted)) {
3046 dev_put(dev);
3047 po->prot_hook.dev = NULL;
3048 po->ifindex = -1;
3049 packet_cached_dev_reset(po);
3050 } else {
3051 po->prot_hook.dev = dev;
3052 po->ifindex = dev ? dev->ifindex : 0;
3053 packet_cached_dev_assign(po, dev);
3054 }
902fefb8 3055 }
158cd4af
LW
3056 if (dev_curr)
3057 dev_put(dev_curr);
66e56cd4 3058
902fefb8 3059 if (proto == 0 || !need_rehook)
1da177e4
LT
3060 goto out_unlock;
3061
30f7ea1c 3062 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3063 register_prot_hook(sk);
be85d4ad
UT
3064 } else {
3065 sk->sk_err = ENETDOWN;
3066 if (!sock_flag(sk, SOCK_DEAD))
3067 sk->sk_error_report(sk);
1da177e4
LT
3068 }
3069
3070out_unlock:
30f7ea1c 3071 rcu_read_unlock();
1da177e4
LT
3072 spin_unlock(&po->bind_lock);
3073 release_sock(sk);
30f7ea1c 3074 return ret;
1da177e4
LT
3075}
3076
3077/*
3078 * Bind a packet socket to a device
3079 */
3080
40d4e3df
ED
3081static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3082 int addr_len)
1da177e4 3083{
40d4e3df 3084 struct sock *sk = sock->sk;
c87838f6 3085 char name[sizeof(uaddr->sa_data) + 1];
1ce4f28b 3086
1da177e4
LT
3087 /*
3088 * Check legality
3089 */
1ce4f28b 3090
8ae55f04 3091 if (addr_len != sizeof(struct sockaddr))
1da177e4 3092 return -EINVAL;
c87838f6
AP
3093 /* uaddr->sa_data comes from the userspace, it's not guaranteed to be
3094 * zero-terminated.
3095 */
3096 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3097 name[sizeof(uaddr->sa_data)] = 0;
1da177e4 3098
30f7ea1c 3099 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3100}
1da177e4
LT
3101
3102static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3103{
40d4e3df
ED
3104 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3105 struct sock *sk = sock->sk;
1da177e4
LT
3106
3107 /*
3108 * Check legality
3109 */
1ce4f28b 3110
1da177e4
LT
3111 if (addr_len < sizeof(struct sockaddr_ll))
3112 return -EINVAL;
3113 if (sll->sll_family != AF_PACKET)
3114 return -EINVAL;
3115
30f7ea1c
FR
3116 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3117 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3118}
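/* Illustrative userspace sketch (not part of this file): binding a packet
 * socket to one interface via the sockaddr_ll path of packet_bind() above.
 * fd is assumed to be an AF_PACKET socket and "eth0"-style names are passed
 * in by the caller as example arguments.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static int bind_to_device(int fd, const char *ifname)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex  = if_nametoindex(ifname);	/* 0 means "any" */

	return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}
#endif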
3119
3120static struct proto packet_proto = {
3121 .name = "PACKET",
3122 .owner = THIS_MODULE,
3123 .obj_size = sizeof(struct packet_sock),
3124};
3125
3126/*
1ce4f28b 3127 * Create a packet of type SOCK_PACKET.
1da177e4
LT
3128 */
3129
3f378b68
EP
3130static int packet_create(struct net *net, struct socket *sock, int protocol,
3131 int kern)
1da177e4
LT
3132{
3133 struct sock *sk;
3134 struct packet_sock *po;
0e11c91e 3135 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3136 int err;
3137
df008c91 3138 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3139 return -EPERM;
be02097c
DM
3140 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3141 sock->type != SOCK_PACKET)
1da177e4
LT
3142 return -ESOCKTNOSUPPORT;
3143
3144 sock->state = SS_UNCONNECTED;
3145
3146 err = -ENOBUFS;
11aa9c28 3147 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3148 if (sk == NULL)
3149 goto out;
3150
3151 sock->ops = &packet_ops;
1da177e4
LT
3152 if (sock->type == SOCK_PACKET)
3153 sock->ops = &packet_ops_spkt;
be02097c 3154
1da177e4
LT
3155 sock_init_data(sock, sk);
3156
3157 po = pkt_sk(sk);
3158 sk->sk_family = PF_PACKET;
0e11c91e 3159 po->num = proto;
d346a3fa 3160 po->xmit = dev_queue_xmit;
66e56cd4 3161
b0138408
DB
3162 err = packet_alloc_pending(po);
3163 if (err)
3164 goto out2;
3165
66e56cd4 3166 packet_cached_dev_reset(po);
1da177e4
LT
3167
3168 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3169 sk_refcnt_debug_inc(sk);
1da177e4
LT
3170
3171 /*
3172 * Attach a protocol block
3173 */
3174
3175 spin_lock_init(&po->bind_lock);
905db440 3176 mutex_init(&po->pg_vec_lock);
0648ab70 3177 po->rollover = NULL;
1da177e4 3178 po->prot_hook.func = packet_rcv;
be02097c 3179
1da177e4
LT
3180 if (sock->type == SOCK_PACKET)
3181 po->prot_hook.func = packet_rcv_spkt;
be02097c 3182
1da177e4
LT
3183 po->prot_hook.af_packet_priv = sk;
3184
0e11c91e
AV
3185 if (proto) {
3186 po->prot_hook.type = proto;
ce06b03e 3187 register_prot_hook(sk);
1da177e4
LT
3188 }
3189
0fa7fa98 3190 mutex_lock(&net->packet.sklist_lock);
808f5114 3191 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3192 mutex_unlock(&net->packet.sklist_lock);
3193
3194 preempt_disable();
3680453c 3195 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3196 preempt_enable();
808f5114 3197
40d4e3df 3198 return 0;
b0138408
DB
3199out2:
3200 sk_free(sk);
1da177e4
LT
3201out:
3202 return err;
3203}
3204
3205/*
3206 * Pull a packet from our receive queue and hand it to the user.
3207 * If necessary we block.
3208 */
3209
1b784140
YX
3210static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3211 int flags)
1da177e4
LT
3212{
3213 struct sock *sk = sock->sk;
3214 struct sk_buff *skb;
3215 int copied, err;
bfd5f4a3 3216 int vnet_hdr_len = 0;
2472d761 3217 unsigned int origlen = 0;
1da177e4
LT
3218
3219 err = -EINVAL;
ed85b565 3220 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3221 goto out;
3222
3223#if 0
3224 /* What error should we return now? EUNATTACH? */
3225 if (pkt_sk(sk)->ifindex < 0)
3226 return -ENODEV;
3227#endif
3228
ed85b565 3229 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3230 err = sock_recv_errqueue(sk, msg, len,
3231 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3232 goto out;
3233 }
3234
1da177e4
LT
3235 /*
3236 * Call the generic datagram receiver. This handles all sorts
3237 * of horrible races and re-entrancy so we can forget about it
3238 * in the protocol layers.
3239 *
 3240 * Now it will return ENETDOWN if the device has just gone down,
3241 * but then it will block.
3242 */
3243
40d4e3df 3244 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3245
3246 /*
1ce4f28b 3247 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
 3248 * handles the blocking, we don't have to see or worry about blocking
3249 * retries.
3250 */
3251
8ae55f04 3252 if (skb == NULL)
1da177e4
LT
3253 goto out;
3254
2ccdbaa6
WB
3255 if (pkt_sk(sk)->pressure)
3256 packet_rcv_has_room(pkt_sk(sk), NULL);
3257
bfd5f4a3 3258 if (pkt_sk(sk)->has_vnet_hdr) {
16cc1400
WB
3259 err = packet_rcv_vnet(msg, skb, &len);
3260 if (err)
bfd5f4a3 3261 goto out_free;
16cc1400 3262 vnet_hdr_len = sizeof(struct virtio_net_hdr);
bfd5f4a3
SS
3263 }
3264
f3d33426
HFS
 3265 /* You lose any data beyond the buffer you gave. If that worries
 3266 * a user program, it can ask the device for its MTU
3267 * anyway.
1da177e4 3268 */
1da177e4 3269 copied = skb->len;
40d4e3df
ED
3270 if (copied > len) {
3271 copied = len;
3272 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3273 }
3274
51f3d02b 3275 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3276 if (err)
3277 goto out_free;
3278
2472d761
EB
3279 if (sock->type != SOCK_PACKET) {
3280 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3281
3282 /* Original length was stored in sockaddr_ll fields */
3283 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3284 sll->sll_family = AF_PACKET;
3285 sll->sll_protocol = skb->protocol;
3286 }
3287
3b885787 3288 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3289
f3d33426
HFS
3290 if (msg->msg_name) {
3291 /* If the address length field is there to be filled
3292 * in, we fill it in now.
3293 */
3294 if (sock->type == SOCK_PACKET) {
342dfc30 3295 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3296 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3297 } else {
3298 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3299
f3d33426
HFS
3300 msg->msg_namelen = sll->sll_halen +
3301 offsetof(struct sockaddr_ll, sll_addr);
3302 }
ffbc6111
HX
3303 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3304 msg->msg_namelen);
f3d33426 3305 }
1da177e4 3306
8dc41944 3307 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3308 struct tpacket_auxdata aux;
3309
3310 aux.tp_status = TP_STATUS_USER;
3311 if (skb->ip_summed == CHECKSUM_PARTIAL)
3312 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3313 else if (skb->pkt_type != PACKET_OUTGOING &&
3314 (skb->ip_summed == CHECKSUM_COMPLETE ||
3315 skb_csum_unnecessary(skb)))
3316 aux.tp_status |= TP_STATUS_CSUM_VALID;
3317
2472d761 3318 aux.tp_len = origlen;
ffbc6111
HX
3319 aux.tp_snaplen = skb->len;
3320 aux.tp_mac = 0;
bbe735e4 3321 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3322 if (skb_vlan_tag_present(skb)) {
3323 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3324 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3325 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3326 } else {
3327 aux.tp_vlan_tci = 0;
a0cdfcf3 3328 aux.tp_vlan_tpid = 0;
a3bcc23e 3329 }
ffbc6111 3330 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3331 }
3332
1da177e4
LT
3333 /*
3334 * Free or return the buffer as appropriate. Again this
3335 * hides all the races and re-entrancy issues from us.
3336 */
bfd5f4a3 3337 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3338
3339out_free:
3340 skb_free_datagram(sk, skb);
3341out:
3342 return err;
3343}
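/* Illustrative userspace sketch (not part of this file): reading the
 * tpacket_auxdata control message that packet_recvmsg() above emits once
 * PACKET_AUXDATA has been enabled.  fd is assumed to be a bound AF_PACKET
 * socket; buffer sizes are example values.
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void recv_with_auxdata(int fd)
{
	char data[2048];
	char control[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= control,
		.msg_controllen	= sizeof(control),
	};
	struct cmsghdr *cmsg;
	int one = 1;

	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));

	if (recvmsg(fd, &msg, 0) < 0)
		return;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		struct tpacket_auxdata aux;

		if (cmsg->cmsg_level != SOL_PACKET ||
		    cmsg->cmsg_type != PACKET_AUXDATA)
			continue;
		memcpy(&aux, CMSG_DATA(cmsg), sizeof(aux));
		printf("len=%u snaplen=%u vlan_tci=%u\n",
		       aux.tp_len, aux.tp_snaplen, aux.tp_vlan_tci);
	}
}
#endif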
3344
1da177e4
LT
3345static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3346 int *uaddr_len, int peer)
3347{
3348 struct net_device *dev;
3349 struct sock *sk = sock->sk;
3350
3351 if (peer)
3352 return -EOPNOTSUPP;
3353
3354 uaddr->sa_family = AF_PACKET;
2dc85bf3 3355 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3356 rcu_read_lock();
3357 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3358 if (dev)
2dc85bf3 3359 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3360 rcu_read_unlock();
1da177e4
LT
3361 *uaddr_len = sizeof(*uaddr);
3362
3363 return 0;
3364}
1da177e4
LT
3365
3366static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3367 int *uaddr_len, int peer)
3368{
3369 struct net_device *dev;
3370 struct sock *sk = sock->sk;
3371 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3372 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3373
3374 if (peer)
3375 return -EOPNOTSUPP;
3376
3377 sll->sll_family = AF_PACKET;
3378 sll->sll_ifindex = po->ifindex;
3379 sll->sll_protocol = po->num;
67286640 3380 sll->sll_pkttype = 0;
654d1f8a
ED
3381 rcu_read_lock();
3382 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3383 if (dev) {
3384 sll->sll_hatype = dev->type;
3385 sll->sll_halen = dev->addr_len;
3386 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3387 } else {
3388 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3389 sll->sll_halen = 0;
3390 }
654d1f8a 3391 rcu_read_unlock();
0fb375fb 3392 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3393
3394 return 0;
3395}
3396
2aeb0b88
WC
3397static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3398 int what)
1da177e4
LT
3399{
3400 switch (i->type) {
3401 case PACKET_MR_MULTICAST:
1162563f
JP
3402 if (i->alen != dev->addr_len)
3403 return -EINVAL;
1da177e4 3404 if (what > 0)
22bedad3 3405 return dev_mc_add(dev, i->addr);
1da177e4 3406 else
22bedad3 3407 return dev_mc_del(dev, i->addr);
1da177e4
LT
3408 break;
3409 case PACKET_MR_PROMISC:
2aeb0b88 3410 return dev_set_promiscuity(dev, what);
1da177e4 3411 case PACKET_MR_ALLMULTI:
2aeb0b88 3412 return dev_set_allmulti(dev, what);
d95ed927 3413 case PACKET_MR_UNICAST:
1162563f
JP
3414 if (i->alen != dev->addr_len)
3415 return -EINVAL;
d95ed927 3416 if (what > 0)
a748ee24 3417 return dev_uc_add(dev, i->addr);
d95ed927 3418 else
a748ee24 3419 return dev_uc_del(dev, i->addr);
d95ed927 3420 break;
40d4e3df
ED
3421 default:
3422 break;
1da177e4 3423 }
2aeb0b88 3424 return 0;
1da177e4
LT
3425}
3426
82f17091
FR
3427static void packet_dev_mclist_delete(struct net_device *dev,
3428 struct packet_mclist **mlp)
1da177e4 3429{
82f17091
FR
3430 struct packet_mclist *ml;
3431
3432 while ((ml = *mlp) != NULL) {
3433 if (ml->ifindex == dev->ifindex) {
3434 packet_dev_mc(dev, ml, -1);
3435 *mlp = ml->next;
3436 kfree(ml);
3437 } else
3438 mlp = &ml->next;
1da177e4
LT
3439 }
3440}
3441
0fb375fb 3442static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3443{
3444 struct packet_sock *po = pkt_sk(sk);
3445 struct packet_mclist *ml, *i;
3446 struct net_device *dev;
3447 int err;
3448
3449 rtnl_lock();
3450
3451 err = -ENODEV;
3b1e0a65 3452 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3453 if (!dev)
3454 goto done;
3455
3456 err = -EINVAL;
1162563f 3457 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3458 goto done;
3459
3460 err = -ENOBUFS;
8b3a7005 3461 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3462 if (i == NULL)
3463 goto done;
3464
3465 err = 0;
3466 for (ml = po->mclist; ml; ml = ml->next) {
3467 if (ml->ifindex == mreq->mr_ifindex &&
3468 ml->type == mreq->mr_type &&
3469 ml->alen == mreq->mr_alen &&
3470 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3471 ml->count++;
3472 /* Free the new element ... */
3473 kfree(i);
3474 goto done;
3475 }
3476 }
3477
3478 i->type = mreq->mr_type;
3479 i->ifindex = mreq->mr_ifindex;
3480 i->alen = mreq->mr_alen;
3481 memcpy(i->addr, mreq->mr_address, i->alen);
309cf37f 3482 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3483 i->count = 1;
3484 i->next = po->mclist;
3485 po->mclist = i;
2aeb0b88
WC
3486 err = packet_dev_mc(dev, i, 1);
3487 if (err) {
3488 po->mclist = i->next;
3489 kfree(i);
3490 }
1da177e4
LT
3491
3492done:
3493 rtnl_unlock();
3494 return err;
3495}
3496
0fb375fb 3497static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3498{
3499 struct packet_mclist *ml, **mlp;
3500
3501 rtnl_lock();
3502
3503 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3504 if (ml->ifindex == mreq->mr_ifindex &&
3505 ml->type == mreq->mr_type &&
3506 ml->alen == mreq->mr_alen &&
3507 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3508 if (--ml->count == 0) {
3509 struct net_device *dev;
3510 *mlp = ml->next;
ad959e76
ED
3511 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3512 if (dev)
1da177e4 3513 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3514 kfree(ml);
3515 }
82f17091 3516 break;
1da177e4
LT
3517 }
3518 }
3519 rtnl_unlock();
82f17091 3520 return 0;
1da177e4
LT
3521}
3522
3523static void packet_flush_mclist(struct sock *sk)
3524{
3525 struct packet_sock *po = pkt_sk(sk);
3526 struct packet_mclist *ml;
3527
3528 if (!po->mclist)
3529 return;
3530
3531 rtnl_lock();
3532 while ((ml = po->mclist) != NULL) {
3533 struct net_device *dev;
3534
3535 po->mclist = ml->next;
ad959e76
ED
3536 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3537 if (dev != NULL)
1da177e4 3538 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3539 kfree(ml);
3540 }
3541 rtnl_unlock();
3542}
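/* Illustrative userspace sketch (not part of this file): the
 * PACKET_ADD_MEMBERSHIP / PACKET_MR_PROMISC request that ends up in
 * packet_mc_add() above.  fd is assumed to be an AF_PACKET socket and the
 * interface index is an example argument.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int enable_promisc(int fd, int ifindex)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = ifindex;
	mreq.mr_type    = PACKET_MR_PROMISC;

	return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
			  &mreq, sizeof(mreq));
}
#endif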
1da177e4
LT
3543
3544static int
b7058842 3545packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3546{
3547 struct sock *sk = sock->sk;
8dc41944 3548 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3549 int ret;
3550
3551 if (level != SOL_PACKET)
3552 return -ENOPROTOOPT;
3553
69e3c75f 3554 switch (optname) {
1ce4f28b 3555 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3556 case PACKET_DROP_MEMBERSHIP:
3557 {
0fb375fb
EB
3558 struct packet_mreq_max mreq;
3559 int len = optlen;
3560 memset(&mreq, 0, sizeof(mreq));
3561 if (len < sizeof(struct packet_mreq))
1da177e4 3562 return -EINVAL;
0fb375fb
EB
3563 if (len > sizeof(mreq))
3564 len = sizeof(mreq);
40d4e3df 3565 if (copy_from_user(&mreq, optval, len))
1da177e4 3566 return -EFAULT;
0fb375fb
EB
3567 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3568 return -EINVAL;
1da177e4
LT
3569 if (optname == PACKET_ADD_MEMBERSHIP)
3570 ret = packet_mc_add(sk, &mreq);
3571 else
3572 ret = packet_mc_drop(sk, &mreq);
3573 return ret;
3574 }
a2efcfa0 3575
1da177e4 3576 case PACKET_RX_RING:
69e3c75f 3577 case PACKET_TX_RING:
1da177e4 3578 {
f6fb8f10 3579 union tpacket_req_u req_u;
3580 int len;
1da177e4 3581
f6fb8f10 3582 switch (po->tp_version) {
3583 case TPACKET_V1:
3584 case TPACKET_V2:
3585 len = sizeof(req_u.req);
3586 break;
3587 case TPACKET_V3:
3588 default:
3589 len = sizeof(req_u.req3);
3590 break;
3591 }
3592 if (optlen < len)
1da177e4 3593 return -EINVAL;
f6fb8f10 3594 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3595 return -EFAULT;
f6fb8f10 3596 return packet_set_ring(sk, &req_u, 0,
3597 optname == PACKET_TX_RING);
1da177e4
LT
3598 }
3599 case PACKET_COPY_THRESH:
3600 {
3601 int val;
3602
40d4e3df 3603 if (optlen != sizeof(val))
1da177e4 3604 return -EINVAL;
40d4e3df 3605 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3606 return -EFAULT;
3607
3608 pkt_sk(sk)->copy_thresh = val;
3609 return 0;
3610 }
bbd6ef87
PM
3611 case PACKET_VERSION:
3612 {
3613 int val;
3614
3615 if (optlen != sizeof(val))
3616 return -EINVAL;
bbd6ef87
PM
3617 if (copy_from_user(&val, optval, sizeof(val)))
3618 return -EFAULT;
3619 switch (val) {
3620 case TPACKET_V1:
3621 case TPACKET_V2:
f6fb8f10 3622 case TPACKET_V3:
84ac7260 3623 break;
bbd6ef87
PM
3624 default:
3625 return -EINVAL;
3626 }
84ac7260
PP
3627 lock_sock(sk);
3628 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3629 ret = -EBUSY;
3630 } else {
3631 po->tp_version = val;
3632 ret = 0;
3633 }
3634 release_sock(sk);
3635 return ret;
bbd6ef87 3636 }
8913336a
PM
3637 case PACKET_RESERVE:
3638 {
3639 unsigned int val;
3640
3641 if (optlen != sizeof(val))
3642 return -EINVAL;
8913336a
PM
3643 if (copy_from_user(&val, optval, sizeof(val)))
3644 return -EFAULT;
1d27b680
AK
3645 if (val > INT_MAX)
3646 return -EINVAL;
ad629e8e
WB
3647 lock_sock(sk);
3648 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3649 ret = -EBUSY;
3650 } else {
3651 po->tp_reserve = val;
3652 ret = 0;
3653 }
3654 release_sock(sk);
3655 return ret;
8913336a 3656 }
69e3c75f
JB
3657 case PACKET_LOSS:
3658 {
3659 unsigned int val;
3660
3661 if (optlen != sizeof(val))
3662 return -EINVAL;
3663 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3664 return -EBUSY;
3665 if (copy_from_user(&val, optval, sizeof(val)))
3666 return -EFAULT;
3667 po->tp_loss = !!val;
3668 return 0;
3669 }
8dc41944
HX
3670 case PACKET_AUXDATA:
3671 {
3672 int val;
3673
3674 if (optlen < sizeof(val))
3675 return -EINVAL;
3676 if (copy_from_user(&val, optval, sizeof(val)))
3677 return -EFAULT;
3678
3679 po->auxdata = !!val;
3680 return 0;
3681 }
80feaacb
PWJ
3682 case PACKET_ORIGDEV:
3683 {
3684 int val;
3685
3686 if (optlen < sizeof(val))
3687 return -EINVAL;
3688 if (copy_from_user(&val, optval, sizeof(val)))
3689 return -EFAULT;
3690
3691 po->origdev = !!val;
3692 return 0;
3693 }
bfd5f4a3
SS
3694 case PACKET_VNET_HDR:
3695 {
3696 int val;
3697
3698 if (sock->type != SOCK_RAW)
3699 return -EINVAL;
3700 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3701 return -EBUSY;
3702 if (optlen < sizeof(val))
3703 return -EINVAL;
3704 if (copy_from_user(&val, optval, sizeof(val)))
3705 return -EFAULT;
3706
3707 po->has_vnet_hdr = !!val;
3708 return 0;
3709 }
614f60fa
SM
3710 case PACKET_TIMESTAMP:
3711 {
3712 int val;
3713
3714 if (optlen != sizeof(val))
3715 return -EINVAL;
3716 if (copy_from_user(&val, optval, sizeof(val)))
3717 return -EFAULT;
3718
3719 po->tp_tstamp = val;
3720 return 0;
3721 }
dc99f600
DM
3722 case PACKET_FANOUT:
3723 {
3724 int val;
3725
3726 if (optlen != sizeof(val))
3727 return -EINVAL;
3728 if (copy_from_user(&val, optval, sizeof(val)))
3729 return -EFAULT;
3730
3731 return fanout_add(sk, val & 0xffff, val >> 16);
3732 }
47dceb8e
WB
3733 case PACKET_FANOUT_DATA:
3734 {
3735 if (!po->fanout)
3736 return -EINVAL;
3737
3738 return fanout_set_data(po, optval, optlen);
3739 }
5920cd3a
PC
3740 case PACKET_TX_HAS_OFF:
3741 {
3742 unsigned int val;
3743
3744 if (optlen != sizeof(val))
3745 return -EINVAL;
3746 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3747 return -EBUSY;
3748 if (copy_from_user(&val, optval, sizeof(val)))
3749 return -EFAULT;
3750 po->tp_tx_has_off = !!val;
3751 return 0;
3752 }
d346a3fa
DB
3753 case PACKET_QDISC_BYPASS:
3754 {
3755 int val;
3756
3757 if (optlen != sizeof(val))
3758 return -EINVAL;
3759 if (copy_from_user(&val, optval, sizeof(val)))
3760 return -EFAULT;
3761
3762 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3763 return 0;
3764 }
1da177e4
LT
3765 default:
3766 return -ENOPROTOOPT;
3767 }
3768}
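
Illustrative note (not part of this file): packet_setsockopt() above is reached through the ordinary setsockopt(2) path with level SOL_PACKET. A minimal userspace sketch, with a hypothetical helper name and no error reporting, that enables PACKET_AUXDATA and joins a promiscuous membership (the PACKET_ADD_MEMBERSHIP case, which ends up in packet_mc_add()):

#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

/* Sketch only: assumes fd is an open PF_PACKET socket and ifindex is valid. */
static int enable_auxdata_and_promisc(int fd, int ifindex)
{
	int one = 1;
	struct packet_mreq mreq;

	/* Handled by the PACKET_AUXDATA case above (copies a plain int). */
	if (setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one)) < 0)
		return -1;

	/* Handled by PACKET_ADD_MEMBERSHIP -> packet_mc_add(). */
	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = ifindex;
	mreq.mr_type = PACKET_MR_PROMISC;
	return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
			  &mreq, sizeof(mreq));
}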
3769
3770static int packet_getsockopt(struct socket *sock, int level, int optname,
3771 char __user *optval, int __user *optlen)
3772{
3773 int len;
c06fff6e 3774 int val, lv = sizeof(val);
1da177e4
LT
3775 struct sock *sk = sock->sk;
3776 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3777 void *data = &val;
ee80fbf3 3778 union tpacket_stats_u st;
a9b63918 3779 struct tpacket_rollover_stats rstats;
1da177e4
LT
3780
3781 if (level != SOL_PACKET)
3782 return -ENOPROTOOPT;
3783
8ae55f04
KK
3784 if (get_user(len, optlen))
3785 return -EFAULT;
1da177e4
LT
3786
3787 if (len < 0)
3788 return -EINVAL;
1ce4f28b 3789
69e3c75f 3790 switch (optname) {
1da177e4 3791 case PACKET_STATISTICS:
1da177e4 3792 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3793 memcpy(&st, &po->stats, sizeof(st));
3794 memset(&po->stats, 0, sizeof(po->stats));
3795 spin_unlock_bh(&sk->sk_receive_queue.lock);
3796
f6fb8f10 3797 if (po->tp_version == TPACKET_V3) {
c06fff6e 3798 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3799 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3800 data = &st.stats3;
f6fb8f10 3801 } else {
c06fff6e 3802 lv = sizeof(struct tpacket_stats);
8bcdeaff 3803 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3804 data = &st.stats1;
f6fb8f10 3805 }
ee80fbf3 3806
8dc41944
HX
3807 break;
3808 case PACKET_AUXDATA:
8dc41944 3809 val = po->auxdata;
80feaacb
PWJ
3810 break;
3811 case PACKET_ORIGDEV:
80feaacb 3812 val = po->origdev;
bfd5f4a3
SS
3813 break;
3814 case PACKET_VNET_HDR:
bfd5f4a3 3815 val = po->has_vnet_hdr;
1da177e4 3816 break;
bbd6ef87 3817 case PACKET_VERSION:
bbd6ef87 3818 val = po->tp_version;
bbd6ef87
PM
3819 break;
3820 case PACKET_HDRLEN:
3821 if (len > sizeof(int))
3822 len = sizeof(int);
3823 if (copy_from_user(&val, optval, len))
3824 return -EFAULT;
3825 switch (val) {
3826 case TPACKET_V1:
3827 val = sizeof(struct tpacket_hdr);
3828 break;
3829 case TPACKET_V2:
3830 val = sizeof(struct tpacket2_hdr);
3831 break;
f6fb8f10 3832 case TPACKET_V3:
3833 val = sizeof(struct tpacket3_hdr);
3834 break;
bbd6ef87
PM
3835 default:
3836 return -EINVAL;
3837 }
bbd6ef87 3838 break;
8913336a 3839 case PACKET_RESERVE:
8913336a 3840 val = po->tp_reserve;
8913336a 3841 break;
69e3c75f 3842 case PACKET_LOSS:
69e3c75f 3843 val = po->tp_loss;
69e3c75f 3844 break;
614f60fa 3845 case PACKET_TIMESTAMP:
614f60fa 3846 val = po->tp_tstamp;
614f60fa 3847 break;
dc99f600 3848 case PACKET_FANOUT:
dc99f600
DM
3849 val = (po->fanout ?
3850 ((u32)po->fanout->id |
77f65ebd
WB
3851 ((u32)po->fanout->type << 16) |
3852 ((u32)po->fanout->flags << 24)) :
dc99f600 3853 0);
dc99f600 3854 break;
a9b63918
WB
3855 case PACKET_ROLLOVER_STATS:
3856 if (!po->rollover)
3857 return -EINVAL;
3858 rstats.tp_all = atomic_long_read(&po->rollover->num);
3859 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3860 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3861 data = &rstats;
3862 lv = sizeof(rstats);
3863 break;
5920cd3a
PC
3864 case PACKET_TX_HAS_OFF:
3865 val = po->tp_tx_has_off;
3866 break;
d346a3fa
DB
3867 case PACKET_QDISC_BYPASS:
3868 val = packet_use_direct_xmit(po);
3869 break;
1da177e4
LT
3870 default:
3871 return -ENOPROTOOPT;
3872 }
3873
c06fff6e
ED
3874 if (len > lv)
3875 len = lv;
8ae55f04
KK
3876 if (put_user(len, optlen))
3877 return -EFAULT;
8dc41944
HX
3878 if (copy_to_user(optval, data, len))
3879 return -EFAULT;
8ae55f04 3880 return 0;
1da177e4
LT
3881}
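
The getsockopt() side is symmetric; for example, the PACKET_STATISTICS case above snapshots the packet/drop counters under the receive-queue lock and then clears them. A hedged userspace sketch (hypothetical helper name, assuming the socket is still on the default TPACKET_V1/V2 statistics layout):

#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

/* Sketch only: reads and implicitly resets the packet/drop counters. */
static void dump_stats(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
		printf("packets=%u drops=%u\n", st.tp_packets, st.tp_drops);
}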
3882
3883
719c44d3
WB
3884#ifdef CONFIG_COMPAT
3885static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
3886 char __user *optval, unsigned int optlen)
3887{
3888 struct packet_sock *po = pkt_sk(sock->sk);
3889
3890 if (level != SOL_PACKET)
3891 return -ENOPROTOOPT;
3892
3893 if (optname == PACKET_FANOUT_DATA &&
3894 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
3895 optval = (char __user *)get_compat_bpf_fprog(optval);
3896 if (!optval)
3897 return -EFAULT;
3898 optlen = sizeof(struct sock_fprog);
3899 }
3900
3901 return packet_setsockopt(sock, level, optname, optval, optlen);
3902}
3903#endif
3904
351638e7
JP
3905static int packet_notifier(struct notifier_block *this,
3906 unsigned long msg, void *ptr)
1da177e4
LT
3907{
3908 struct sock *sk;
351638e7 3909 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 3910 struct net *net = dev_net(dev);
1da177e4 3911
808f5114 3912 rcu_read_lock();
b67bfe0d 3913 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3914 struct packet_sock *po = pkt_sk(sk);
3915
3916 switch (msg) {
3917 case NETDEV_UNREGISTER:
1da177e4 3918 if (po->mclist)
82f17091 3919 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
3920 /* fallthrough */
3921
1da177e4
LT
3922 case NETDEV_DOWN:
3923 if (dev->ifindex == po->ifindex) {
3924 spin_lock(&po->bind_lock);
3925 if (po->running) {
ce06b03e 3926 __unregister_prot_hook(sk, false);
1da177e4
LT
3927 sk->sk_err = ENETDOWN;
3928 if (!sock_flag(sk, SOCK_DEAD))
3929 sk->sk_error_report(sk);
3930 }
3931 if (msg == NETDEV_UNREGISTER) {
66e56cd4 3932 packet_cached_dev_reset(po);
1da177e4 3933 po->ifindex = -1;
160ff18a
BG
3934 if (po->prot_hook.dev)
3935 dev_put(po->prot_hook.dev);
1da177e4
LT
3936 po->prot_hook.dev = NULL;
3937 }
3938 spin_unlock(&po->bind_lock);
3939 }
3940 break;
3941 case NETDEV_UP:
808f5114 3942 if (dev->ifindex == po->ifindex) {
3943 spin_lock(&po->bind_lock);
ce06b03e
DM
3944 if (po->num)
3945 register_prot_hook(sk);
808f5114 3946 spin_unlock(&po->bind_lock);
1da177e4 3947 }
1da177e4
LT
3948 break;
3949 }
3950 }
808f5114 3951 rcu_read_unlock();
1da177e4
LT
3952 return NOTIFY_DONE;
3953}
3954
3955
3956static int packet_ioctl(struct socket *sock, unsigned int cmd,
3957 unsigned long arg)
3958{
3959 struct sock *sk = sock->sk;
3960
69e3c75f 3961 switch (cmd) {
40d4e3df
ED
3962 case SIOCOUTQ:
3963 {
3964 int amount = sk_wmem_alloc_get(sk);
31e6d363 3965
40d4e3df
ED
3966 return put_user(amount, (int __user *)arg);
3967 }
3968 case SIOCINQ:
3969 {
3970 struct sk_buff *skb;
3971 int amount = 0;
3972
3973 spin_lock_bh(&sk->sk_receive_queue.lock);
3974 skb = skb_peek(&sk->sk_receive_queue);
3975 if (skb)
3976 amount = skb->len;
3977 spin_unlock_bh(&sk->sk_receive_queue.lock);
3978 return put_user(amount, (int __user *)arg);
3979 }
3980 case SIOCGSTAMP:
3981 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3982 case SIOCGSTAMPNS:
3983 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3984
1da177e4 3985#ifdef CONFIG_INET
40d4e3df
ED
3986 case SIOCADDRT:
3987 case SIOCDELRT:
3988 case SIOCDARP:
3989 case SIOCGARP:
3990 case SIOCSARP:
3991 case SIOCGIFADDR:
3992 case SIOCSIFADDR:
3993 case SIOCGIFBRDADDR:
3994 case SIOCSIFBRDADDR:
3995 case SIOCGIFNETMASK:
3996 case SIOCSIFNETMASK:
3997 case SIOCGIFDSTADDR:
3998 case SIOCSIFDSTADDR:
3999 case SIOCSIFFLAGS:
40d4e3df 4000 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
4001#endif
4002
40d4e3df
ED
4003 default:
4004 return -ENOIOCTLCMD;
1da177e4
LT
4005 }
4006 return 0;
4007}
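
SIOCINQ and SIOCOUTQ above report, respectively, the length of the next frame waiting in the receive queue and the amount of not-yet-transmitted data charged to the socket. A small userspace sketch (hypothetical helper name) of the SIOCINQ query:

#include <sys/ioctl.h>
#include <linux/sockios.h>

/* Sketch only: returns the size of the next queued frame, or 0 if the
 * receive queue is empty (mirrors the SIOCINQ case above). */
static int next_frame_len(int fd)
{
	int pending = 0;

	ioctl(fd, SIOCINQ, &pending);
	return pending;
}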
4008
40d4e3df 4009static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
4010 poll_table *wait)
4011{
4012 struct sock *sk = sock->sk;
4013 struct packet_sock *po = pkt_sk(sk);
4014 unsigned int mask = datagram_poll(file, sock, wait);
4015
4016 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 4017 if (po->rx_ring.pg_vec) {
f6fb8f10 4018 if (!packet_previous_rx_frame(po, &po->rx_ring,
4019 TP_STATUS_KERNEL))
1da177e4
LT
4020 mask |= POLLIN | POLLRDNORM;
4021 }
2ccdbaa6 4022 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
54d7c01d 4023 po->pressure = 0;
1da177e4 4024 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
4025 spin_lock_bh(&sk->sk_write_queue.lock);
4026 if (po->tx_ring.pg_vec) {
4027 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
4028 mask |= POLLOUT | POLLWRNORM;
4029 }
4030 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
4031 return mask;
4032}
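
packet_poll() reports POLLIN as soon as the previous RX ring frame is no longer marked TP_STATUS_KERNEL, so a ring consumer can sleep in poll(2) instead of busy-polling frame status words. A minimal sketch (hypothetical helper name; assumes a ring was configured as in packet_set_ring() further below):

#include <poll.h>

/* Sketch only: block until the kernel has published at least one frame
 * (or an error/hangup) on the ring-backed packet socket. */
static int wait_for_frame(int fd)
{
	struct pollfd pfd = {
		.fd = fd,
		.events = POLLIN | POLLERR,
	};

	return poll(&pfd, 1, -1);	/* -1: wait indefinitely */
}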
4033
4034
4035/* Dirty? Well, I still have not found a better way to account
4036 * for user mmaps.
4037 */
4038
4039static void packet_mm_open(struct vm_area_struct *vma)
4040{
4041 struct file *file = vma->vm_file;
40d4e3df 4042 struct socket *sock = file->private_data;
1da177e4 4043 struct sock *sk = sock->sk;
1ce4f28b 4044
1da177e4
LT
4045 if (sk)
4046 atomic_inc(&pkt_sk(sk)->mapped);
4047}
4048
4049static void packet_mm_close(struct vm_area_struct *vma)
4050{
4051 struct file *file = vma->vm_file;
40d4e3df 4052 struct socket *sock = file->private_data;
1da177e4 4053 struct sock *sk = sock->sk;
1ce4f28b 4054
1da177e4
LT
4055 if (sk)
4056 atomic_dec(&pkt_sk(sk)->mapped);
4057}
4058
f0f37e2f 4059static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
4060 .open = packet_mm_open,
4061 .close = packet_mm_close,
1da177e4
LT
4062};
4063
0e3125c7
NH
4064static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4065 unsigned int len)
1da177e4
LT
4066{
4067 int i;
4068
4ebf0ae2 4069 for (i = 0; i < len; i++) {
0e3125c7 4070 if (likely(pg_vec[i].buffer)) {
c56b4d90 4071 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
4072 vfree(pg_vec[i].buffer);
4073 else
4074 free_pages((unsigned long)pg_vec[i].buffer,
4075 order);
4076 pg_vec[i].buffer = NULL;
4077 }
1da177e4
LT
4078 }
4079 kfree(pg_vec);
4080}
4081
eea49cc9 4082static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 4083{
f0d4eb29 4084 char *buffer;
0e3125c7
NH
4085 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4086 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4087
4088 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4089 if (buffer)
4090 return buffer;
4091
f0d4eb29 4092 /* __get_free_pages failed, fall back to vmalloc */
bbce5a59 4093 buffer = vzalloc((1 << order) * PAGE_SIZE);
0e3125c7
NH
4094 if (buffer)
4095 return buffer;
4096
f0d4eb29 4097 /* vmalloc failed, let's dig into swap here */
0e3125c7 4098 gfp_flags &= ~__GFP_NORETRY;
f0d4eb29 4099 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4100 if (buffer)
4101 return buffer;
4102
f0d4eb29 4103 /* complete and utter failure */
0e3125c7 4104 return NULL;
4ebf0ae2
DM
4105}
4106
0e3125c7 4107static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
4108{
4109 unsigned int block_nr = req->tp_block_nr;
0e3125c7 4110 struct pgv *pg_vec;
4ebf0ae2
DM
4111 int i;
4112
0e3125c7 4113 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
4114 if (unlikely(!pg_vec))
4115 goto out;
4116
4117 for (i = 0; i < block_nr; i++) {
c56b4d90 4118 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 4119 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
4120 goto out_free_pgvec;
4121 }
4122
4123out:
4124 return pg_vec;
4125
4126out_free_pgvec:
4127 free_pg_vec(pg_vec, order, block_nr);
4128 pg_vec = NULL;
4129 goto out;
4130}
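
alloc_pg_vec() above allocates one (physically or virtually) contiguous buffer per ring block, where `order` comes from get_order(tp_block_size) in packet_set_ring() below; for instance, a 64 KiB block on a 4 KiB-page system is an order-4 allocation of 16 pages. A small userspace-arithmetic illustration of that relationship (hypothetical helper name, not kernel code):

#include <unistd.h>

/* Sketch only: mirrors the get_order() computation for a block size. */
static unsigned int block_order(unsigned long block_size)
{
	unsigned long pages = (block_size + getpagesize() - 1) / getpagesize();
	unsigned int order = 0;

	while ((1UL << order) < pages)
		order++;
	return order;	/* 64 KiB with 4 KiB pages -> order 4 */
}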
1da177e4 4131
f6fb8f10 4132static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 4133 int closing, int tx_ring)
1da177e4 4134{
0e3125c7 4135 struct pgv *pg_vec = NULL;
1da177e4 4136 struct packet_sock *po = pkt_sk(sk);
0e11c91e 4137 int was_running, order = 0;
69e3c75f
JB
4138 struct packet_ring_buffer *rb;
4139 struct sk_buff_head *rb_queue;
0e11c91e 4140 __be16 num;
f6fb8f10 4141 int err = -EINVAL;
4142 /* Aliased here to keep code churn minimal */
4143 struct tpacket_req *req = &req_u->req;
4144
84ac7260 4145 lock_sock(sk);
f6fb8f10 4146 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
4147 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
6ae81ced 4148 net_warn_ratelimited("Tx-ring is not supported.\n");
f6fb8f10 4149 goto out;
4150 }
1ce4f28b 4151
69e3c75f
JB
4152 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4153 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 4154
69e3c75f
JB
4155 err = -EBUSY;
4156 if (!closing) {
4157 if (atomic_read(&po->mapped))
4158 goto out;
b0138408 4159 if (packet_read_pending(rb))
69e3c75f
JB
4160 goto out;
4161 }
1da177e4 4162
69e3c75f
JB
4163 if (req->tp_block_nr) {
4164 /* Sanity tests and some calculations */
4165 err = -EBUSY;
4166 if (unlikely(rb->pg_vec))
4167 goto out;
1da177e4 4168
bbd6ef87
PM
4169 switch (po->tp_version) {
4170 case TPACKET_V1:
4171 po->tp_hdrlen = TPACKET_HDRLEN;
4172 break;
4173 case TPACKET_V2:
4174 po->tp_hdrlen = TPACKET2_HDRLEN;
4175 break;
f6fb8f10 4176 case TPACKET_V3:
4177 po->tp_hdrlen = TPACKET3_HDRLEN;
4178 break;
bbd6ef87
PM
4179 }
4180
69e3c75f 4181 err = -EINVAL;
4ebf0ae2 4182 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 4183 goto out;
90836b67 4184 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
69e3c75f 4185 goto out;
dc808110 4186 if (po->tp_version >= TPACKET_V3 &&
500e91e0
AK
4187 req->tp_block_size <=
4188 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv))
dc808110 4189 goto out;
8913336a 4190 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
4191 po->tp_reserve))
4192 goto out;
4ebf0ae2 4193 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 4194 goto out;
1da177e4 4195
4194b491
TK
4196 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4197 if (unlikely(rb->frames_per_block == 0))
69e3c75f 4198 goto out;
213e19c7
AK
4199 if (unlikely(req->tp_block_size > UINT_MAX / req->tp_block_nr))
4200 goto out;
69e3c75f
JB
4201 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4202 req->tp_frame_nr))
4203 goto out;
1da177e4
LT
4204
4205 err = -ENOMEM;
4ebf0ae2
DM
4206 order = get_order(req->tp_block_size);
4207 pg_vec = alloc_pg_vec(req, order);
4208 if (unlikely(!pg_vec))
1da177e4 4209 goto out;
f6fb8f10 4210 switch (po->tp_version) {
4211 case TPACKET_V3:
4212 /* Transmit path is not supported. We checked
4213 * it above, but stay paranoid here.
4214 */
4215 if (!tx_ring)
e8e85cc5 4216 init_prb_bdqc(po, rb, pg_vec, req_u);
d7cf0c34 4217 break;
f6fb8f10 4218 default:
4219 break;
4220 }
69e3c75f
JB
4221 }
4222 /* Done */
4223 else {
4224 err = -EINVAL;
4ebf0ae2 4225 if (unlikely(req->tp_frame_nr))
69e3c75f 4226 goto out;
1da177e4
LT
4227 }
4228
1da177e4
LT
4229
4230 /* Detach socket from network */
4231 spin_lock(&po->bind_lock);
4232 was_running = po->running;
4233 num = po->num;
4234 if (was_running) {
1da177e4 4235 po->num = 0;
ce06b03e 4236 __unregister_prot_hook(sk, false);
1da177e4
LT
4237 }
4238 spin_unlock(&po->bind_lock);
1ce4f28b 4239
1da177e4
LT
4240 synchronize_net();
4241
4242 err = -EBUSY;
905db440 4243 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
4244 if (closing || atomic_read(&po->mapped) == 0) {
4245 err = 0;
69e3c75f 4246 spin_lock_bh(&rb_queue->lock);
c053fd96 4247 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
4248 rb->frame_max = (req->tp_frame_nr - 1);
4249 rb->head = 0;
4250 rb->frame_size = req->tp_frame_size;
4251 spin_unlock_bh(&rb_queue->lock);
4252
c053fd96
CG
4253 swap(rb->pg_vec_order, order);
4254 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
4255
4256 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4257 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4258 tpacket_rcv : packet_rcv;
4259 skb_queue_purge(rb_queue);
1da177e4 4260 if (atomic_read(&po->mapped))
40d4e3df
ED
4261 pr_err("packet_mmap: vma is busy: %d\n",
4262 atomic_read(&po->mapped));
1da177e4 4263 }
905db440 4264 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4265
4266 spin_lock(&po->bind_lock);
ce06b03e 4267 if (was_running) {
1da177e4 4268 po->num = num;
ce06b03e 4269 register_prot_hook(sk);
1da177e4
LT
4270 }
4271 spin_unlock(&po->bind_lock);
f6fb8f10 4272 if (closing && (po->tp_version > TPACKET_V2)) {
4273 /* Because we don't support block-based V3 on tx-ring */
4274 if (!tx_ring)
73d0fcf2 4275 prb_shutdown_retire_blk_timer(po, rb_queue);
f6fb8f10 4276 }
1da177e4 4277
1da177e4
LT
4278 if (pg_vec)
4279 free_pg_vec(pg_vec, order, req->tp_block_nr);
4280out:
84ac7260 4281 release_sock(sk);
1da177e4
LT
4282 return err;
4283}
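
Putting the two setsockopt() cases together: a userspace TPACKET_V3 RX ring is normally configured by selecting the version first and then passing a struct tpacket_req3 through PACKET_RX_RING, which lands in packet_set_ring() above. A hedged sketch (hypothetical helper name, sizes purely illustrative rather than tuned values):

#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

/* Sketch only: sizes chosen to satisfy the sanity checks above. */
static int setup_v3_rx_ring(int fd)
{
	int ver = TPACKET_V3;
	struct tpacket_req3 req;

	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)) < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.tp_block_size = 1 << 16;		/* 64 KiB, page aligned */
	req.tp_block_nr   = 64;
	req.tp_frame_size = 1 << 11;		/* 2 KiB, TPACKET_ALIGNMENT ok */
	req.tp_frame_nr   = (req.tp_block_size / req.tp_frame_size) *
			    req.tp_block_nr;	/* must match the check above */
	req.tp_retire_blk_tov = 60;		/* ms until a block is retired */

	return setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
}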
4284
69e3c75f
JB
4285static int packet_mmap(struct file *file, struct socket *sock,
4286 struct vm_area_struct *vma)
1da177e4
LT
4287{
4288 struct sock *sk = sock->sk;
4289 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
4290 unsigned long size, expected_size;
4291 struct packet_ring_buffer *rb;
1da177e4
LT
4292 unsigned long start;
4293 int err = -EINVAL;
4294 int i;
4295
4296 if (vma->vm_pgoff)
4297 return -EINVAL;
4298
905db440 4299 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
4300
4301 expected_size = 0;
4302 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4303 if (rb->pg_vec) {
4304 expected_size += rb->pg_vec_len
4305 * rb->pg_vec_pages
4306 * PAGE_SIZE;
4307 }
4308 }
4309
4310 if (expected_size == 0)
1da177e4 4311 goto out;
69e3c75f
JB
4312
4313 size = vma->vm_end - vma->vm_start;
4314 if (size != expected_size)
1da177e4
LT
4315 goto out;
4316
1da177e4 4317 start = vma->vm_start;
69e3c75f
JB
4318 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4319 if (rb->pg_vec == NULL)
4320 continue;
4321
4322 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
4323 struct page *page;
4324 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4325 int pg_num;
4326
c56b4d90
CG
4327 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4328 page = pgv_to_page(kaddr);
69e3c75f
JB
4329 err = vm_insert_page(vma, start, page);
4330 if (unlikely(err))
4331 goto out;
4332 start += PAGE_SIZE;
0e3125c7 4333 kaddr += PAGE_SIZE;
69e3c75f 4334 }
4ebf0ae2 4335 }
1da177e4 4336 }
69e3c75f 4337
4ebf0ae2 4338 atomic_inc(&po->mapped);
1da177e4
LT
4339 vma->vm_ops = &packet_mmap_ops;
4340 err = 0;
4341
4342out:
905db440 4343 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4344 return err;
4345}
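
Once packet_set_ring() has attached the page vectors, packet_mmap() above exposes them as a single contiguous mapping (RX ring first, then TX ring if present), and rejects any non-zero mmap offset. A userspace sketch (hypothetical helper name) that maps a previously configured RX ring:

#include <stddef.h>
#include <sys/mman.h>

/* Sketch only: ring_size must equal tp_block_size * tp_block_nr used when
 * the ring was created, matching the expected_size check above. */
static void *map_ring(int fd, size_t ring_size)
{
	void *ring = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
			  MAP_SHARED, fd, 0);

	return ring == MAP_FAILED ? NULL : ring;
}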
1da177e4 4346
90ddc4f0 4347static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4348 .family = PF_PACKET,
4349 .owner = THIS_MODULE,
4350 .release = packet_release,
4351 .bind = packet_bind_spkt,
4352 .connect = sock_no_connect,
4353 .socketpair = sock_no_socketpair,
4354 .accept = sock_no_accept,
4355 .getname = packet_getname_spkt,
4356 .poll = datagram_poll,
4357 .ioctl = packet_ioctl,
4358 .listen = sock_no_listen,
4359 .shutdown = sock_no_shutdown,
4360 .setsockopt = sock_no_setsockopt,
4361 .getsockopt = sock_no_getsockopt,
4362 .sendmsg = packet_sendmsg_spkt,
4363 .recvmsg = packet_recvmsg,
4364 .mmap = sock_no_mmap,
4365 .sendpage = sock_no_sendpage,
4366};
1da177e4 4367
90ddc4f0 4368static const struct proto_ops packet_ops = {
1da177e4
LT
4369 .family = PF_PACKET,
4370 .owner = THIS_MODULE,
4371 .release = packet_release,
4372 .bind = packet_bind,
4373 .connect = sock_no_connect,
4374 .socketpair = sock_no_socketpair,
4375 .accept = sock_no_accept,
1ce4f28b 4376 .getname = packet_getname,
1da177e4
LT
4377 .poll = packet_poll,
4378 .ioctl = packet_ioctl,
4379 .listen = sock_no_listen,
4380 .shutdown = sock_no_shutdown,
4381 .setsockopt = packet_setsockopt,
4382 .getsockopt = packet_getsockopt,
719c44d3
WB
4383#ifdef CONFIG_COMPAT
4384 .compat_setsockopt = compat_packet_setsockopt,
4385#endif
1da177e4
LT
4386 .sendmsg = packet_sendmsg,
4387 .recvmsg = packet_recvmsg,
4388 .mmap = packet_mmap,
4389 .sendpage = sock_no_sendpage,
4390};
4391
ec1b4cf7 4392static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4393 .family = PF_PACKET,
4394 .create = packet_create,
4395 .owner = THIS_MODULE,
4396};
4397
4398static struct notifier_block packet_netdev_notifier = {
40d4e3df 4399 .notifier_call = packet_notifier,
1da177e4
LT
4400};
4401
4402#ifdef CONFIG_PROC_FS
1da177e4
LT
4403
4404static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4405 __acquires(RCU)
1da177e4 4406{
e372c414 4407 struct net *net = seq_file_net(seq);
808f5114 4408
4409 rcu_read_lock();
4410 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4411}
4412
4413static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4414{
1bf40954 4415 struct net *net = seq_file_net(seq);
808f5114 4416 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4417}
4418
4419static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4420 __releases(RCU)
1da177e4 4421{
808f5114 4422 rcu_read_unlock();
1da177e4
LT
4423}
4424
1ce4f28b 4425static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4426{
4427 if (v == SEQ_START_TOKEN)
4428 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4429 else {
b7ceabd9 4430 struct sock *s = sk_entry(v);
1da177e4
LT
4431 const struct packet_sock *po = pkt_sk(s);
4432
4433 seq_printf(seq,
71338aa7 4434 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4
LT
4435 s,
4436 atomic_read(&s->sk_refcnt),
4437 s->sk_type,
4438 ntohs(po->num),
4439 po->ifindex,
4440 po->running,
4441 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4442 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4443 sock_i_ino(s));
1da177e4
LT
4444 }
4445
4446 return 0;
4447}
4448
56b3d975 4449static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4450 .start = packet_seq_start,
4451 .next = packet_seq_next,
4452 .stop = packet_seq_stop,
4453 .show = packet_seq_show,
4454};
4455
4456static int packet_seq_open(struct inode *inode, struct file *file)
4457{
e372c414
DL
4458 return seq_open_net(inode, file, &packet_seq_ops,
4459 sizeof(struct seq_net_private));
1da177e4
LT
4460}
4461
da7071d7 4462static const struct file_operations packet_seq_fops = {
1da177e4
LT
4463 .owner = THIS_MODULE,
4464 .open = packet_seq_open,
4465 .read = seq_read,
4466 .llseek = seq_lseek,
e372c414 4467 .release = seq_release_net,
1da177e4
LT
4468};
4469
4470#endif
4471
2c8c1e72 4472static int __net_init packet_net_init(struct net *net)
d12d01d6 4473{
0fa7fa98 4474 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4475 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4476
d4beaa66 4477 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
d12d01d6
DL
4478 return -ENOMEM;
4479
4480 return 0;
4481}
4482
2c8c1e72 4483static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4484{
ece31ffd 4485 remove_proc_entry("packet", net->proc_net);
d12d01d6
DL
4486}
4487
4488static struct pernet_operations packet_net_ops = {
4489 .init = packet_net_init,
4490 .exit = packet_net_exit,
4491};
4492
4493
1da177e4
LT
4494static void __exit packet_exit(void)
4495{
1da177e4 4496 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4497 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4498 sock_unregister(PF_PACKET);
4499 proto_unregister(&packet_proto);
4500}
4501
4502static int __init packet_init(void)
4503{
4504 int rc = proto_register(&packet_proto, 0);
4505
4506 if (rc != 0)
4507 goto out;
4508
4509 sock_register(&packet_family_ops);
d12d01d6 4510 register_pernet_subsys(&packet_net_ops);
1da177e4 4511 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
4512out:
4513 return rc;
4514}
4515
4516module_init(packet_init);
4517module_exit(packet_exit);
4518MODULE_LICENSE("GPL");
4519MODULE_ALIAS_NETPROTO(PF_PACKET);