net/packet/af_packet.c (git blame, mirror_ubuntu-jammy-kernel.git)
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PACKET - implements raw packet sockets.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
1ce4f28b 12 * Fixes:
1da177e4
LT
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
1ce4f28b 35 * Ulises Alonso : Frame number limit removal and
1da177e4 36 * packet_set_ring memory leak.
0fb375fb
EB
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
1ce4f28b 40 * byte arrays at the end of sockaddr_ll
0fb375fb 41 * and packet_mreq.
69e3c75f 42 * Johann Baudy : Added TX RING.
f6fb8f10 43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
44 * layer.
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46 *
1da177e4
LT
47 *
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
52 *
53 */
1ce4f28b 54
1da177e4 55#include <linux/types.h>
1da177e4 56#include <linux/mm.h>
4fc268d2 57#include <linux/capability.h>
1da177e4
LT
58#include <linux/fcntl.h>
59#include <linux/socket.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_packet.h>
64#include <linux/wireless.h>
ffbc6111 65#include <linux/kernel.h>
1da177e4 66#include <linux/kmod.h>
5a0e3ad6 67#include <linux/slab.h>
0e3125c7 68#include <linux/vmalloc.h>
457c4cbc 69#include <net/net_namespace.h>
1da177e4
LT
70#include <net/ip.h>
71#include <net/protocol.h>
72#include <linux/skbuff.h>
73#include <net/sock.h>
74#include <linux/errno.h>
75#include <linux/timer.h>
7c0f6ba6 76#include <linux/uaccess.h>
1da177e4
LT
77#include <asm/ioctls.h>
78#include <asm/page.h>
a1f8e7f7 79#include <asm/cacheflush.h>
1da177e4
LT
80#include <asm/io.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
83#include <linux/poll.h>
84#include <linux/module.h>
85#include <linux/init.h>
905db440 86#include <linux/mutex.h>
05423b24 87#include <linux/if_vlan.h>
bfd5f4a3 88#include <linux/virtio_net.h>
ed85b565 89#include <linux/errqueue.h>
614f60fa 90#include <linux/net_tstamp.h>
b0138408 91#include <linux/percpu.h>
1da177e4
LT
92#ifdef CONFIG_INET
93#include <net/inet_common.h>
94#endif
47dceb8e 95#include <linux/bpf.h>
719c44d3 96#include <net/compat.h>
1da177e4 97
2787b04b
PE
98#include "internal.h"
99
1da177e4
LT
100/*
101 Assumptions:
102 - if device has no dev->hard_header routine, it adds and removes ll header
103 inside itself. In this case ll header is invisible outside of device,
104 but higher levels still should reserve dev->hard_header_len.
105 Some devices are clever enough to reallocate the skb when the header
106 will not fit in the reserved space (tunnel); others are silly
107 (PPP).
108 - packet socket receives packets with pulled ll header,
109 so that SOCK_RAW should push it back.
110
111On receive:
112-----------
113
114Incoming, dev->hard_header!=NULL
b0e380b1
ACM
115 mac_header -> ll header
116 data -> data
1da177e4
LT
117
118Outgoing, dev->hard_header!=NULL
b0e380b1
ACM
119 mac_header -> ll header
120 data -> ll header
1da177e4
LT
121
122Incoming, dev->hard_header==NULL
b0e380b1
ACM
123 mac_header -> UNKNOWN position. It is very likely that it points to the ll
124 header. PPP does this, which is wrong, because it introduces
db0c58f9 125 asymmetry between rx and tx paths.
b0e380b1 126 data -> data
1da177e4
LT
127
128Outgoing, dev->hard_header==NULL
b0e380b1
ACM
129 mac_header -> data. ll header is still not built!
130 data -> data
1da177e4
LT
131
132Summary
133 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
134
135
136On transmit:
137------------
138
139dev->hard_header != NULL
b0e380b1
ACM
140 mac_header -> ll header
141 data -> ll header
1da177e4
LT
142
143dev->hard_header == NULL (ll header is added by device, we cannot control it)
b0e380b1
ACM
144 mac_header -> data
145 data -> data
1da177e4
LT
146
147 We should set nh.raw on output to the correct position,
148 packet classifier depends on it.
149 */
150
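To make the ll-header rules above concrete: with SOCK_RAW the application sees and must build the link-layer header itself, while with SOCK_DGRAM the kernel strips it on receive and builds it on transmit from the sockaddr_ll. A minimal userspace sketch, assuming an interface named "eth0" and ETH_P_ALL, with error handling omitted:

#include <arpa/inet.h>          /* htons */
#include <linux/if_ether.h>     /* ETH_P_ALL */
#include <linux/if_packet.h>    /* struct sockaddr_ll */
#include <net/if.h>             /* if_nametoindex */
#include <string.h>
#include <sys/socket.h>

static int open_packet_sock(int type)
{
	/* type is SOCK_RAW (frames carry the ll header) or SOCK_DGRAM
	 * (the kernel adds/removes the ll header for us). */
	int fd = socket(AF_PACKET, type, htons(ETH_P_ALL));
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex  = if_nametoindex("eth0");	/* assumed name */
	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
	return fd;
}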
1da177e4
LT
151/* Private packet socket structures. */
152
0fb375fb
EB
153/* identical to struct packet_mreq except it has
154 * a longer address field.
155 */
40d4e3df 156struct packet_mreq_max {
0fb375fb
EB
157 int mr_ifindex;
158 unsigned short mr_type;
159 unsigned short mr_alen;
160 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 161};
a2efcfa0 162
184f489e
DB
163union tpacket_uhdr {
164 struct tpacket_hdr *h1;
165 struct tpacket2_hdr *h2;
166 struct tpacket3_hdr *h3;
167 void *raw;
168};
169
f6fb8f10 170static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f
JB
171 int closing, int tx_ring);
172
f6fb8f10 173#define V3_ALIGNMENT (8)
174
bc59ba39 175#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
f6fb8f10 176
177#define BLK_PLUS_PRIV(sz_of_priv) \
178 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
179
f6fb8f10 180#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
181#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
182#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
183#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
184#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
185#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
186#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
187
69e3c75f 188struct packet_sock;
77f65ebd
WB
189static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
190 struct packet_type *pt, struct net_device *orig_dev);
1da177e4 191
f6fb8f10 192static void *packet_previous_frame(struct packet_sock *po,
193 struct packet_ring_buffer *rb,
194 int status);
195static void packet_increment_head(struct packet_ring_buffer *buff);
878cd3ba 196static int prb_curr_blk_in_use(struct tpacket_block_desc *);
bc59ba39 197static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 198 struct packet_sock *);
bc59ba39 199static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 200 struct packet_sock *, unsigned int status);
bc59ba39 201static int prb_queue_frozen(struct tpacket_kbdq_core *);
202static void prb_open_block(struct tpacket_kbdq_core *,
203 struct tpacket_block_desc *);
17bfd8c8 204static void prb_retire_rx_blk_timer_expired(struct timer_list *);
bc59ba39 205static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
bc59ba39 206static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
207static void prb_clear_rxhash(struct tpacket_kbdq_core *,
208 struct tpacket3_hdr *);
209static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
210 struct tpacket3_hdr *);
1da177e4 211static void packet_flush_mclist(struct sock *sk);
865b03f2 212static u16 packet_pick_tx_queue(struct sk_buff *skb);
1da177e4 213
ffbc6111 214struct packet_skb_cb {
ffbc6111
HX
215 union {
216 struct sockaddr_pkt pkt;
2472d761
EB
217 union {
218 /* Trick: alias skb original length with
219 * ll.sll_family and ll.protocol in order
220 * to save room.
221 */
222 unsigned int origlen;
223 struct sockaddr_ll ll;
224 };
ffbc6111
HX
225 } sa;
226};
227
d3869efe
DW
228#define vio_le() virtio_legacy_is_little_endian()
229
ffbc6111 230#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 231
bc59ba39 232#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 233#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 234 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 235#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 236 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 237#define GET_NEXT_PRB_BLK_NUM(x) \
238 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
239 ((x)->kactive_blk_num+1) : 0)
240
dc99f600
DM
241static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
242static void __fanout_link(struct sock *sk, struct packet_sock *po);
243
d346a3fa
DB
244static int packet_direct_xmit(struct sk_buff *skb)
245{
865b03f2 246 return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
d346a3fa
DB
247}
248
66e56cd4
DB
249static struct net_device *packet_cached_dev_get(struct packet_sock *po)
250{
251 struct net_device *dev;
252
253 rcu_read_lock();
254 dev = rcu_dereference(po->cached_dev);
255 if (likely(dev))
256 dev_hold(dev);
257 rcu_read_unlock();
258
259 return dev;
260}
261
262static void packet_cached_dev_assign(struct packet_sock *po,
263 struct net_device *dev)
264{
265 rcu_assign_pointer(po->cached_dev, dev);
266}
267
268static void packet_cached_dev_reset(struct packet_sock *po)
269{
270 RCU_INIT_POINTER(po->cached_dev, NULL);
271}
272
d346a3fa
DB
273static bool packet_use_direct_xmit(const struct packet_sock *po)
274{
275 return po->xmit == packet_direct_xmit;
276}
277
0fd5d57b 278static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
d346a3fa 279{
1cbac010 280 return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
d346a3fa
DB
281}
282
865b03f2 283static u16 packet_pick_tx_queue(struct sk_buff *skb)
0fd5d57b 284{
865b03f2 285 struct net_device *dev = skb->dev;
0fd5d57b
DB
286 const struct net_device_ops *ops = dev->netdev_ops;
287 u16 queue_index;
288
289 if (ops->ndo_select_queue) {
290 queue_index = ops->ndo_select_queue(dev, skb, NULL,
291 __packet_pick_tx_queue);
292 queue_index = netdev_cap_txqueue(dev, queue_index);
293 } else {
294 queue_index = __packet_pick_tx_queue(dev, skb);
295 }
296
865b03f2 297 return queue_index;
0fd5d57b
DB
298}
299
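packet_direct_xmit() above is the transmit routine used when a socket opts out of the qdisc layer; in this version that choice is made per socket with the PACKET_QDISC_BYPASS option. A hedged userspace sketch (fd is an already-created AF_PACKET socket, error handling omitted):

	int one = 1;

	/* Send straight to the driver queue picked by packet_pick_tx_queue(),
	 * skipping the qdisc (selects po->xmit == packet_direct_xmit). */
	setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));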
a6361f0c 300/* __register_prot_hook must be invoked through register_prot_hook
ce06b03e
DM
301 * or from a context in which asynchronous accesses to the packet
302 * socket is not possible (packet_create()).
303 */
a6361f0c 304static void __register_prot_hook(struct sock *sk)
ce06b03e
DM
305{
306 struct packet_sock *po = pkt_sk(sk);
e40526cb 307
ce06b03e 308 if (!po->running) {
66e56cd4 309 if (po->fanout)
dc99f600 310 __fanout_link(sk, po);
66e56cd4 311 else
dc99f600 312 dev_add_pack(&po->prot_hook);
e40526cb 313
ce06b03e
DM
314 sock_hold(sk);
315 po->running = 1;
316 }
317}
318
a6361f0c
WB
319static void register_prot_hook(struct sock *sk)
320{
321 lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
322 __register_prot_hook(sk);
323}
324
325/* If the sync parameter is true, we will temporarily drop
ce06b03e
DM
326 * the po->bind_lock and do a synchronize_net to make sure no
327 * asynchronous packet processing paths still refer to the elements
328 * of po->prot_hook. If the sync parameter is false, it is the
329 * callers responsibility to take care of this.
330 */
331static void __unregister_prot_hook(struct sock *sk, bool sync)
332{
333 struct packet_sock *po = pkt_sk(sk);
334
a6361f0c
WB
335 lockdep_assert_held_once(&po->bind_lock);
336
ce06b03e 337 po->running = 0;
66e56cd4
DB
338
339 if (po->fanout)
dc99f600 340 __fanout_unlink(sk, po);
66e56cd4 341 else
dc99f600 342 __dev_remove_pack(&po->prot_hook);
e40526cb 343
ce06b03e
DM
344 __sock_put(sk);
345
346 if (sync) {
347 spin_unlock(&po->bind_lock);
348 synchronize_net();
349 spin_lock(&po->bind_lock);
350 }
351}
352
353static void unregister_prot_hook(struct sock *sk, bool sync)
354{
355 struct packet_sock *po = pkt_sk(sk);
356
357 if (po->running)
358 __unregister_prot_hook(sk, sync);
359}
360
6e58040b 361static inline struct page * __pure pgv_to_page(void *addr)
0af55bb5
CG
362{
363 if (is_vmalloc_addr(addr))
364 return vmalloc_to_page(addr);
365 return virt_to_page(addr);
366}
367
69e3c75f 368static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 369{
184f489e 370 union tpacket_uhdr h;
1da177e4 371
69e3c75f 372 h.raw = frame;
bbd6ef87
PM
373 switch (po->tp_version) {
374 case TPACKET_V1:
69e3c75f 375 h.h1->tp_status = status;
0af55bb5 376 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
377 break;
378 case TPACKET_V2:
69e3c75f 379 h.h2->tp_status = status;
0af55bb5 380 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 381 break;
f6fb8f10 382 case TPACKET_V3:
7f953ab2
SV
383 h.h3->tp_status = status;
384 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
385 break;
69e3c75f 386 default:
f6fb8f10 387 WARN(1, "TPACKET version not supported.\n");
69e3c75f 388 BUG();
bbd6ef87 389 }
69e3c75f
JB
390
391 smp_wmb();
bbd6ef87
PM
392}
393
69e3c75f 394static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87 395{
184f489e 396 union tpacket_uhdr h;
bbd6ef87 397
69e3c75f
JB
398 smp_rmb();
399
bbd6ef87
PM
400 h.raw = frame;
401 switch (po->tp_version) {
402 case TPACKET_V1:
0af55bb5 403 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 404 return h.h1->tp_status;
bbd6ef87 405 case TPACKET_V2:
0af55bb5 406 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 407 return h.h2->tp_status;
f6fb8f10 408 case TPACKET_V3:
7f953ab2
SV
409 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
410 return h.h3->tp_status;
69e3c75f 411 default:
f6fb8f10 412 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
413 BUG();
414 return 0;
bbd6ef87 415 }
1da177e4 416}
69e3c75f 417
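__packet_set_status() and __packet_get_status() are the kernel half of the tp_status handshake that a mapped ring relies on. A hedged sketch of the userspace half for a TPACKET_V2 RX ring ('ring' is the char * base of the mmap()ed area, frame_nr/frame_size come from the PACKET_RX_RING setup, and handle_frame() is a hypothetical callback):

	unsigned int i = 0;

	for (;;) {
		struct tpacket2_hdr *hdr = (void *)(ring + i * frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER)) {
			struct pollfd pfd = { .fd = fd, .events = POLLIN };

			poll(&pfd, 1, -1);	/* wait for the kernel */
			continue;		/* re-check the same slot */
		}
		handle_frame((char *)hdr + hdr->tp_mac, hdr->tp_snaplen);
		__sync_synchronize();			/* pairs with the kernel's barriers */
		hdr->tp_status = TP_STATUS_KERNEL;	/* hand the slot back */
		i = (i + 1) % frame_nr;
	}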
b9c32fb2
DB
418static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
419 unsigned int flags)
7a51384c
DB
420{
421 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
422
68a360e8
WB
423 if (shhwtstamps &&
424 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
425 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
426 return TP_STATUS_TS_RAW_HARDWARE;
7a51384c
DB
427
428 if (ktime_to_timespec_cond(skb->tstamp, ts))
b9c32fb2 429 return TP_STATUS_TS_SOFTWARE;
7a51384c 430
b9c32fb2 431 return 0;
7a51384c
DB
432}
433
b9c32fb2
DB
434static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
435 struct sk_buff *skb)
2e31396f
WB
436{
437 union tpacket_uhdr h;
438 struct timespec ts;
b9c32fb2 439 __u32 ts_status;
2e31396f 440
b9c32fb2
DB
441 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
442 return 0;
2e31396f
WB
443
444 h.raw = frame;
445 switch (po->tp_version) {
446 case TPACKET_V1:
447 h.h1->tp_sec = ts.tv_sec;
448 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
449 break;
450 case TPACKET_V2:
451 h.h2->tp_sec = ts.tv_sec;
452 h.h2->tp_nsec = ts.tv_nsec;
453 break;
454 case TPACKET_V3:
57ea884b
DB
455 h.h3->tp_sec = ts.tv_sec;
456 h.h3->tp_nsec = ts.tv_nsec;
457 break;
2e31396f
WB
458 default:
459 WARN(1, "TPACKET version not supported.\n");
460 BUG();
461 }
462
463 /* one flush is safe, as both fields always lie on the same cacheline */
464 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
465 smp_wmb();
b9c32fb2
DB
466
467 return ts_status;
2e31396f
WB
468}
469
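tpacket_get_timestamp() only reports a raw hardware timestamp if the socket asked for one; the request is stored in po->tp_tstamp. A hedged userspace sketch (requires <linux/net_tstamp.h>):

	int req = SOF_TIMESTAMPING_RAW_HARDWARE;

	/* Consulted by tpacket_get_timestamp(); without it only the
	 * software timestamp (skb->tstamp) ends up in tp_sec/tp_nsec. */
	setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, &req, sizeof(req));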
69e3c75f
JB
470static void *packet_lookup_frame(struct packet_sock *po,
471 struct packet_ring_buffer *rb,
472 unsigned int position,
473 int status)
474{
475 unsigned int pg_vec_pos, frame_offset;
184f489e 476 union tpacket_uhdr h;
69e3c75f
JB
477
478 pg_vec_pos = position / rb->frames_per_block;
479 frame_offset = position % rb->frames_per_block;
480
0e3125c7
NH
481 h.raw = rb->pg_vec[pg_vec_pos].buffer +
482 (frame_offset * rb->frame_size);
69e3c75f
JB
483
484 if (status != __packet_get_status(po, h.raw))
485 return NULL;
486
487 return h.raw;
488}
489
eea49cc9 490static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
491 struct packet_ring_buffer *rb,
492 int status)
493{
494 return packet_lookup_frame(po, rb, rb->head, status);
495}
496
bc59ba39 497static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 498{
499 del_timer_sync(&pkc->retire_blk_timer);
500}
501
502static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
f6fb8f10 503 struct sk_buff_head *rb_queue)
504{
bc59ba39 505 struct tpacket_kbdq_core *pkc;
f6fb8f10 506
73d0fcf2 507 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 508
ec6f809f 509 spin_lock_bh(&rb_queue->lock);
f6fb8f10 510 pkc->delete_blk_timer = 1;
ec6f809f 511 spin_unlock_bh(&rb_queue->lock);
f6fb8f10 512
513 prb_del_retire_blk_timer(pkc);
514}
515
e8e85cc5 516static void prb_setup_retire_blk_timer(struct packet_sock *po)
f6fb8f10 517{
bc59ba39 518 struct tpacket_kbdq_core *pkc;
f6fb8f10 519
e8e85cc5 520 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
17bfd8c8
KC
521 timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
522 0);
523 pkc->retire_blk_timer.expires = jiffies;
f6fb8f10 524}
525
526static int prb_calc_retire_blk_tmo(struct packet_sock *po,
527 int blk_size_in_bytes)
528{
529 struct net_device *dev;
530 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
7cad1bac 531 struct ethtool_link_ksettings ecmd;
4bc71cb9 532 int err;
f6fb8f10 533
4bc71cb9
JP
534 rtnl_lock();
535 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
536 if (unlikely(!dev)) {
537 rtnl_unlock();
f6fb8f10 538 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9 539 }
7cad1bac 540 err = __ethtool_get_link_ksettings(dev, &ecmd);
4bc71cb9
JP
541 rtnl_unlock();
542 if (!err) {
4bc71cb9
JP
543 /*
544 * If the link speed is so slow you don't really
545 * need to worry about perf anyways
546 */
7cad1bac
DD
547 if (ecmd.base.speed < SPEED_1000 ||
548 ecmd.base.speed == SPEED_UNKNOWN) {
4bc71cb9 549 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 550 } else {
551 msec = 1;
7cad1bac 552 div = ecmd.base.speed / 1000;
f6fb8f10 553 }
554 }
555
556 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
557
558 if (div)
559 mbits /= div;
560
561 tmo = mbits * msec;
562
563 if (div)
564 return tmo+1;
565 return tmo;
566}
567
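Worked example for prb_calc_retire_blk_tmo(): with a 1 MiB block on a 1000 Mb/s link, mbits = (1048576 * 8) / (1024 * 1024) = 8, msec = 1 and div = 1000 / 1000 = 1, so tmo = 8 and the function returns tmo + 1 = 9 ms, just above the ~8 ms it takes to fill the block, as the timer-logic comment further down assumes.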
bc59ba39 568static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 569 union tpacket_req_u *req_u)
570{
571 p1->feature_req_word = req_u->req3.tp_feature_req_word;
572}
573
574static void init_prb_bdqc(struct packet_sock *po,
575 struct packet_ring_buffer *rb,
576 struct pgv *pg_vec,
e8e85cc5 577 union tpacket_req_u *req_u)
f6fb8f10 578{
22781a5b 579 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
bc59ba39 580 struct tpacket_block_desc *pbd;
f6fb8f10 581
582 memset(p1, 0x0, sizeof(*p1));
583
584 p1->knxt_seq_num = 1;
585 p1->pkbdq = pg_vec;
bc59ba39 586 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 587 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 588 p1->kblk_size = req_u->req3.tp_block_size;
589 p1->knum_blocks = req_u->req3.tp_block_nr;
590 p1->hdrlen = po->tp_hdrlen;
591 p1->version = po->tp_version;
592 p1->last_kactive_blk_num = 0;
ee80fbf3 593 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 594 if (req_u->req3.tp_retire_blk_tov)
595 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
596 else
597 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
598 req_u->req3.tp_block_size);
599 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
600 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
601
dc808110 602 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
f6fb8f10 603 prb_init_ft_ops(p1, req_u);
e8e85cc5 604 prb_setup_retire_blk_timer(po);
f6fb8f10 605 prb_open_block(p1, pbd);
606}
607
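init_prb_bdqc() consumes the parameters user space passed in struct tpacket_req3. A hedged setup sketch (the sizes and timeout are examples, not recommendations; fd is an AF_PACKET socket and error handling is omitted):

	struct tpacket_req3 req = {
		.tp_block_size	     = 1 << 20,	/* 1 MiB per block */
		.tp_block_nr	     = 64,
		.tp_frame_size	     = 2048,	/* validated, though V3 packs variable-length frames */
		.tp_frame_nr	     = ((1 << 20) / 2048) * 64,
		.tp_retire_blk_tov   = 60,	/* ms; 0 = derive from link speed */
		.tp_sizeof_priv	     = 0,
		.tp_feature_req_word = TP_FT_REQ_FILL_RXHASH,
	};
	int ver = TPACKET_V3;

	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
	char *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);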
608/* Do NOT update the last_blk_num first.
609 * Assumes sk_buff_head lock is held.
610 */
bc59ba39 611static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 612{
613 mod_timer(&pkc->retire_blk_timer,
614 jiffies + pkc->tov_in_jiffies);
615 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
616}
617
618/*
619 * Timer logic:
620 * 1) We refresh the timer only when we open a block.
621 * By doing this we don't waste cycles refreshing the timer
622 * on packet-by-packet basis.
623 *
624 * With a 1MB block-size, on a 1Gbps line, it will take
625 * i) ~8 ms to fill a block + ii) memcpy etc.
626 * In this cut we are not accounting for the memcpy time.
627 *
628 * So, if the user sets the 'tmo' to 10ms then the timer
629 * will never fire while the block is still getting filled
630 * (which is what we want). However, the user could choose
631 * to close a block early and that's fine.
632 *
633 * But when the timer does fire, we check whether or not to refresh it.
634 * Since the tmo granularity is in msecs, it is not too expensive
635 * to refresh the timer, lets say every '8' msecs.
636 * Either the user can set the 'tmo' or we can derive it based on
637 * a) line-speed and b) block-size.
638 * prb_calc_retire_blk_tmo() calculates the tmo.
639 *
640 */
17bfd8c8 641static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
f6fb8f10 642{
17bfd8c8
KC
643 struct packet_sock *po =
644 from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
22781a5b 645 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 646 unsigned int frozen;
bc59ba39 647 struct tpacket_block_desc *pbd;
f6fb8f10 648
649 spin_lock(&po->sk.sk_receive_queue.lock);
650
651 frozen = prb_queue_frozen(pkc);
652 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
653
654 if (unlikely(pkc->delete_blk_timer))
655 goto out;
656
657 /* We only need to plug the race when the block is partially filled.
658 * tpacket_rcv:
659 * lock(); increment BLOCK_NUM_PKTS; unlock()
660 * copy_bits() is in progress ...
661 * timer fires on other cpu:
662 * we can't retire the current block because copy_bits
663 * is in progress.
664 *
665 */
666 if (BLOCK_NUM_PKTS(pbd)) {
667 while (atomic_read(&pkc->blk_fill_in_prog)) {
668 /* Waiting for skb_copy_bits to finish... */
669 cpu_relax();
670 }
671 }
672
673 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
674 if (!frozen) {
41a50d62
AD
675 if (!BLOCK_NUM_PKTS(pbd)) {
676 /* An empty block. Just refresh the timer. */
677 goto refresh_timer;
678 }
f6fb8f10 679 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
680 if (!prb_dispatch_next_block(pkc, po))
681 goto refresh_timer;
682 else
683 goto out;
684 } else {
685 /* Case 1. Queue was frozen because user-space was
686 * lagging behind.
687 */
878cd3ba 688 if (prb_curr_blk_in_use(pbd)) {
f6fb8f10 689 /*
690 * Ok, user-space is still behind.
691 * So just refresh the timer.
692 */
693 goto refresh_timer;
694 } else {
695 /* Case 2. Queue was frozen, user-space caught up,
696 * now the link went idle && the timer fired.
697 * We don't have a block to close, so we open this
698 * block and restart the timer.
699 * Opening a block thaws the queue and restarts the timer.
700 * Thawing/timer-refresh is a side effect.
701 */
702 prb_open_block(pkc, pbd);
703 goto out;
704 }
705 }
706 }
707
708refresh_timer:
709 _prb_refresh_rx_retire_blk_timer(pkc);
710
711out:
712 spin_unlock(&po->sk.sk_receive_queue.lock);
713}
714
eea49cc9 715static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 716 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 717{
718 /* Flush everything minus the block header */
719
720#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
721 u8 *start, *end;
722
723 start = (u8 *)pbd1;
724
725 /* Skip the block header (we know the header WILL fit in 4K) */
726 start += PAGE_SIZE;
727
728 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
729 for (; start < end; start += PAGE_SIZE)
730 flush_dcache_page(pgv_to_page(start));
731
732 smp_wmb();
733#endif
734
735 /* Now update the block status. */
736
737 BLOCK_STATUS(pbd1) = status;
738
739 /* Flush the block header */
740
741#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
742 start = (u8 *)pbd1;
743 flush_dcache_page(pgv_to_page(start));
744
745 smp_wmb();
746#endif
747}
748
749/*
750 * Side effect:
751 *
752 * 1) flush the block
753 * 2) Increment active_blk_num
754 *
755 * Note:We DONT refresh the timer on purpose.
756 * Because almost always the next block will be opened.
757 */
bc59ba39 758static void prb_close_block(struct tpacket_kbdq_core *pkc1,
759 struct tpacket_block_desc *pbd1,
f6fb8f10 760 struct packet_sock *po, unsigned int stat)
761{
762 __u32 status = TP_STATUS_USER | stat;
763
764 struct tpacket3_hdr *last_pkt;
bc59ba39 765 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
da413eec 766 struct sock *sk = &po->sk;
f6fb8f10 767
ee80fbf3 768 if (po->stats.stats3.tp_drops)
f6fb8f10 769 status |= TP_STATUS_LOSING;
770
771 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
772 last_pkt->tp_next_offset = 0;
773
774 /* Get the ts of the last pkt */
775 if (BLOCK_NUM_PKTS(pbd1)) {
776 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
777 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
778 } else {
41a50d62
AD
779 /* Ok, we tmo'd - so get the current time.
780 *
781 * It shouldn't really happen as we don't close empty
782 * blocks. See prb_retire_rx_blk_timer_expired().
783 */
f6fb8f10 784 struct timespec ts;
785 getnstimeofday(&ts);
786 h1->ts_last_pkt.ts_sec = ts.tv_sec;
787 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
788 }
789
790 smp_wmb();
791
792 /* Flush the block */
793 prb_flush_block(pkc1, pbd1, status);
794
da413eec
DC
795 sk->sk_data_ready(sk);
796
f6fb8f10 797 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
798}
799
eea49cc9 800static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 801{
802 pkc->reset_pending_on_curr_blk = 0;
803}
804
805/*
806 * Side effect of opening a block:
807 *
808 * 1) prb_queue is thawed.
809 * 2) retire_blk_timer is refreshed.
810 *
811 */
bc59ba39 812static void prb_open_block(struct tpacket_kbdq_core *pkc1,
813 struct tpacket_block_desc *pbd1)
f6fb8f10 814{
815 struct timespec ts;
bc59ba39 816 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 817
818 smp_rmb();
819
8da3056c
DB
820 /* We could have just memset this but we will lose the
821 * flexibility of making the priv area sticky
822 */
f6fb8f10 823
8da3056c
DB
824 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
825 BLOCK_NUM_PKTS(pbd1) = 0;
826 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 827
8da3056c
DB
828 getnstimeofday(&ts);
829
830 h1->ts_first_pkt.ts_sec = ts.tv_sec;
831 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
f6fb8f10 832
8da3056c
DB
833 pkc1->pkblk_start = (char *)pbd1;
834 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
835
836 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
837 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
838
839 pbd1->version = pkc1->version;
840 pkc1->prev = pkc1->nxt_offset;
841 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
842
843 prb_thaw_queue(pkc1);
844 _prb_refresh_rx_retire_blk_timer(pkc1);
845
846 smp_wmb();
f6fb8f10 847}
848
849/*
850 * Queue freeze logic:
851 * 1) Assume tp_block_nr = 8 blocks.
852 * 2) At time 't0', user opens Rx ring.
853 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
854 * 4) user-space is either sleeping or processing block '0'.
855 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
856 * it will close block-7,loop around and try to fill block '0'.
857 * call-flow:
858 * __packet_lookup_frame_in_block
859 * prb_retire_current_block()
860 * prb_dispatch_next_block()
861 * |->(BLOCK_STATUS == USER) evaluates to true
862 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
863 * 6) Now there are two cases:
864 * 6.1) Link goes idle right after the queue is frozen.
865 * But remember, the last open_block() refreshed the timer.
866 * When this timer expires,it will refresh itself so that we can
867 * re-open block-0 in near future.
868 * 6.2) Link is busy and keeps on receiving packets. This is a simple
869 * case and __packet_lookup_frame_in_block will check if block-0
870 * is free and can now be re-used.
871 */
eea49cc9 872static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 873 struct packet_sock *po)
874{
875 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 876 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 877}
878
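Every freeze bumps tp_freeze_q_cnt, which user space can read back to detect that it is lagging. A hedged sketch (on a TPACKET_V3 socket PACKET_STATISTICS fills struct tpacket_stats_v3; the counters are cleared on each read):

	struct tpacket_stats_v3 stats;
	socklen_t len = sizeof(stats);

	getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &stats, &len);
	printf("pkts=%u drops=%u freezes=%u\n",
	       stats.tp_packets, stats.tp_drops, stats.tp_freeze_q_cnt);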
879#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
880
881/*
882 * If the next block is free then we will dispatch it
883 * and return a good offset.
884 * Else, we will freeze the queue.
885 * So, caller must check the return value.
886 */
bc59ba39 887static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 888 struct packet_sock *po)
889{
bc59ba39 890 struct tpacket_block_desc *pbd;
f6fb8f10 891
892 smp_rmb();
893
894 /* 1. Get current block num */
895 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
896
897 /* 2. If this block is currently in_use then freeze the queue */
898 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
899 prb_freeze_queue(pkc, po);
900 return NULL;
901 }
902
903 /*
904 * 3.
905 * open this block and return the offset where the first packet
906 * needs to get stored.
907 */
908 prb_open_block(pkc, pbd);
909 return (void *)pkc->nxt_offset;
910}
911
bc59ba39 912static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 913 struct packet_sock *po, unsigned int status)
914{
bc59ba39 915 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 916
917 /* retire/close the current block */
918 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
919 /*
920 * Plug the case where copy_bits() is in progress on
921 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
922 * have space to copy the pkt in the current block and
923 * called prb_retire_current_block()
924 *
925 * We don't need to worry about the TMO case because
926 * the timer-handler already handled this case.
927 */
928 if (!(status & TP_STATUS_BLK_TMO)) {
929 while (atomic_read(&pkc->blk_fill_in_prog)) {
930 /* Waiting for skb_copy_bits to finish... */
931 cpu_relax();
932 }
933 }
934 prb_close_block(pkc, pbd, po, status);
935 return;
936 }
f6fb8f10 937}
938
878cd3ba 939static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
f6fb8f10 940{
941 return TP_STATUS_USER & BLOCK_STATUS(pbd);
942}
943
eea49cc9 944static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 945{
946 return pkc->reset_pending_on_curr_blk;
947}
948
eea49cc9 949static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 950{
bc59ba39 951 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 952 atomic_dec(&pkc->blk_fill_in_prog);
953}
954
eea49cc9 955static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 956 struct tpacket3_hdr *ppd)
957{
3958afa1 958 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
f6fb8f10 959}
960
eea49cc9 961static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 962 struct tpacket3_hdr *ppd)
963{
964 ppd->hv1.tp_rxhash = 0;
965}
966
eea49cc9 967static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 968 struct tpacket3_hdr *ppd)
969{
df8a39de
JP
970 if (skb_vlan_tag_present(pkc->skb)) {
971 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
a0cdfcf3
AW
972 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
973 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
f6fb8f10 974 } else {
9e67030a 975 ppd->hv1.tp_vlan_tci = 0;
a0cdfcf3 976 ppd->hv1.tp_vlan_tpid = 0;
9e67030a 977 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 978 }
979}
980
bc59ba39 981static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 982 struct tpacket3_hdr *ppd)
983{
a0cdfcf3 984 ppd->hv1.tp_padding = 0;
f6fb8f10 985 prb_fill_vlan_info(pkc, ppd);
986
987 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
988 prb_fill_rxhash(pkc, ppd);
989 else
990 prb_clear_rxhash(pkc, ppd);
991}
992
eea49cc9 993static void prb_fill_curr_block(char *curr,
bc59ba39 994 struct tpacket_kbdq_core *pkc,
995 struct tpacket_block_desc *pbd,
f6fb8f10 996 unsigned int len)
997{
998 struct tpacket3_hdr *ppd;
999
1000 ppd = (struct tpacket3_hdr *)curr;
1001 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1002 pkc->prev = curr;
1003 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1004 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1005 BLOCK_NUM_PKTS(pbd) += 1;
1006 atomic_inc(&pkc->blk_fill_in_prog);
1007 prb_run_all_ft_ops(pkc, ppd);
1008}
1009
1010/* Assumes caller has the sk->rx_queue.lock */
1011static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1012 struct sk_buff *skb,
1013 int status,
1014 unsigned int len
1015 )
1016{
bc59ba39 1017 struct tpacket_kbdq_core *pkc;
1018 struct tpacket_block_desc *pbd;
f6fb8f10 1019 char *curr, *end;
1020
e3192690 1021 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 1022 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1023
1024 /* Queue is frozen when user space is lagging behind */
1025 if (prb_queue_frozen(pkc)) {
1026 /*
1027 * Check if that last block which caused the queue to freeze,
1028 * is still in_use by user-space.
1029 */
878cd3ba 1030 if (prb_curr_blk_in_use(pbd)) {
f6fb8f10 1031 /* Can't record this packet */
1032 return NULL;
1033 } else {
1034 /*
1035 * Ok, the block was released by user-space.
1036 * Now let's open that block.
1037 * opening a block also thaws the queue.
1038 * Thawing is a side effect.
1039 */
1040 prb_open_block(pkc, pbd);
1041 }
1042 }
1043
1044 smp_mb();
1045 curr = pkc->nxt_offset;
1046 pkc->skb = skb;
e3192690 1047 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 1048
1049 /* first try the current block */
1050 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1051 prb_fill_curr_block(curr, pkc, pbd, len);
1052 return (void *)curr;
1053 }
1054
1055 /* Ok, close the current block */
1056 prb_retire_current_block(pkc, po, 0);
1057
1058 /* Now, try to dispatch the next block */
1059 curr = (char *)prb_dispatch_next_block(pkc, po);
1060 if (curr) {
1061 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1062 prb_fill_curr_block(curr, pkc, pbd, len);
1063 return (void *)curr;
1064 }
1065
1066 /*
1067 * No free blocks are available. user_space hasn't caught up yet.
1068 * Queue was just frozen and now this packet will get dropped.
1069 */
1070 return NULL;
1071}
1072
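The freeze/thaw behaviour above is driven entirely by block_status: a frozen queue stays frozen until user space hands the blocking block back. A hedged consumer sketch for one V3 block (block_addr points at the i-th block of the mmap()ed ring; handle_frame() is a hypothetical callback):

	struct tpacket_block_desc *pbd = block_addr;

	if (pbd->hdr.bh1.block_status & TP_STATUS_USER) {
		struct tpacket3_hdr *ppd =
			(void *)((char *)pbd + pbd->hdr.bh1.offset_to_first_pkt);
		unsigned int i;

		for (i = 0; i < pbd->hdr.bh1.num_pkts; i++) {
			handle_frame((char *)ppd + ppd->tp_mac, ppd->tp_snaplen);
			ppd = (void *)((char *)ppd + ppd->tp_next_offset);
		}
		__sync_synchronize();
		/* Releasing the block is what lets __packet_lookup_frame_in_block()
		 * re-open it and thaw a frozen queue. */
		pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;
	}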
eea49cc9 1073static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1074 struct sk_buff *skb,
1075 int status, unsigned int len)
1076{
1077 char *curr = NULL;
1078 switch (po->tp_version) {
1079 case TPACKET_V1:
1080 case TPACKET_V2:
1081 curr = packet_lookup_frame(po, &po->rx_ring,
1082 po->rx_ring.head, status);
1083 return curr;
1084 case TPACKET_V3:
1085 return __packet_lookup_frame_in_block(po, skb, status, len);
1086 default:
1087 WARN(1, "TPACKET version not supported\n");
1088 BUG();
99aa3473 1089 return NULL;
f6fb8f10 1090 }
1091}
1092
eea49cc9 1093static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1094 struct packet_ring_buffer *rb,
77f65ebd 1095 unsigned int idx,
f6fb8f10 1096 int status)
1097{
bc59ba39 1098 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1099 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1100
1101 if (status != BLOCK_STATUS(pbd))
1102 return NULL;
1103 return pbd;
1104}
1105
eea49cc9 1106static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1107{
1108 unsigned int prev;
1109 if (rb->prb_bdqc.kactive_blk_num)
1110 prev = rb->prb_bdqc.kactive_blk_num-1;
1111 else
1112 prev = rb->prb_bdqc.knum_blocks-1;
1113 return prev;
1114}
1115
1116/* Assumes caller has held the rx_queue.lock */
eea49cc9 1117static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1118 struct packet_ring_buffer *rb,
1119 int status)
1120{
1121 unsigned int previous = prb_previous_blk_num(rb);
1122 return prb_lookup_block(po, rb, previous, status);
1123}
1124
eea49cc9 1125static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1126 struct packet_ring_buffer *rb,
1127 int status)
1128{
1129 if (po->tp_version <= TPACKET_V2)
1130 return packet_previous_frame(po, rb, status);
1131
1132 return __prb_previous_block(po, rb, status);
1133}
1134
eea49cc9 1135static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1136 struct packet_ring_buffer *rb)
1137{
1138 switch (po->tp_version) {
1139 case TPACKET_V1:
1140 case TPACKET_V2:
1141 return packet_increment_head(rb);
1142 case TPACKET_V3:
1143 default:
1144 WARN(1, "TPACKET version not supported.\n");
1145 BUG();
1146 return;
1147 }
1148}
1149
eea49cc9 1150static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1151 struct packet_ring_buffer *rb,
1152 int status)
1153{
1154 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1155 return packet_lookup_frame(po, rb, previous, status);
1156}
1157
eea49cc9 1158static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1159{
1160 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1161}
1162
b0138408
DB
1163static void packet_inc_pending(struct packet_ring_buffer *rb)
1164{
1165 this_cpu_inc(*rb->pending_refcnt);
1166}
1167
1168static void packet_dec_pending(struct packet_ring_buffer *rb)
1169{
1170 this_cpu_dec(*rb->pending_refcnt);
1171}
1172
1173static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1174{
1175 unsigned int refcnt = 0;
1176 int cpu;
1177
1178 /* We don't use pending refcount in rx_ring. */
1179 if (rb->pending_refcnt == NULL)
1180 return 0;
1181
1182 for_each_possible_cpu(cpu)
1183 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1184
1185 return refcnt;
1186}
1187
1188static int packet_alloc_pending(struct packet_sock *po)
1189{
1190 po->rx_ring.pending_refcnt = NULL;
1191
1192 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1193 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1194 return -ENOBUFS;
1195
1196 return 0;
1197}
1198
1199static void packet_free_pending(struct packet_sock *po)
1200{
1201 free_percpu(po->tx_ring.pending_refcnt);
1202}
1203
9954729b
WB
1204#define ROOM_POW_OFF 2
1205#define ROOM_NONE 0x0
1206#define ROOM_LOW 0x1
1207#define ROOM_NORMAL 0x2
1208
1209static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
77f65ebd 1210{
9954729b
WB
1211 int idx, len;
1212
1213 len = po->rx_ring.frame_max + 1;
1214 idx = po->rx_ring.head;
1215 if (pow_off)
1216 idx += len >> pow_off;
1217 if (idx >= len)
1218 idx -= len;
1219 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1220}
1221
1222static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
1223{
1224 int idx, len;
1225
1226 len = po->rx_ring.prb_bdqc.knum_blocks;
1227 idx = po->rx_ring.prb_bdqc.kactive_blk_num;
1228 if (pow_off)
1229 idx += len >> pow_off;
1230 if (idx >= len)
1231 idx -= len;
1232 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1233}
77f65ebd 1234
2ccdbaa6 1235static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
9954729b
WB
1236{
1237 struct sock *sk = &po->sk;
1238 int ret = ROOM_NONE;
1239
1240 if (po->prot_hook.func != tpacket_rcv) {
1241 int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
2ccdbaa6 1242 - (skb ? skb->truesize : 0);
9954729b
WB
1243 if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
1244 return ROOM_NORMAL;
1245 else if (avail > 0)
1246 return ROOM_LOW;
1247 else
1248 return ROOM_NONE;
1249 }
77f65ebd 1250
9954729b
WB
1251 if (po->tp_version == TPACKET_V3) {
1252 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1253 ret = ROOM_NORMAL;
1254 else if (__tpacket_v3_has_room(po, 0))
1255 ret = ROOM_LOW;
1256 } else {
1257 if (__tpacket_has_room(po, ROOM_POW_OFF))
1258 ret = ROOM_NORMAL;
1259 else if (__tpacket_has_room(po, 0))
1260 ret = ROOM_LOW;
1261 }
2ccdbaa6
WB
1262
1263 return ret;
1264}
1265
1266static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1267{
1268 int ret;
1269 bool has_room;
1270
54d7c01d
WB
1271 spin_lock_bh(&po->sk.sk_receive_queue.lock);
1272 ret = __packet_rcv_has_room(po, skb);
2ccdbaa6
WB
1273 has_room = ret == ROOM_NORMAL;
1274 if (po->pressure == has_room)
54d7c01d
WB
1275 po->pressure = !has_room;
1276 spin_unlock_bh(&po->sk.sk_receive_queue.lock);
77f65ebd 1277
9954729b 1278 return ret;
77f65ebd
WB
1279}
1280
1da177e4
LT
1281static void packet_sock_destruct(struct sock *sk)
1282{
ed85b565
RC
1283 skb_queue_purge(&sk->sk_error_queue);
1284
547b792c 1285 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
14afee4b 1286 WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1da177e4
LT
1287
1288 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1289 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1290 return;
1291 }
1292
17ab56a2 1293 sk_refcnt_debug_dec(sk);
1da177e4
LT
1294}
1295
3b3a5b0a
WB
1296static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1297{
1298 u32 rxhash;
1299 int i, count = 0;
1300
1301 rxhash = skb_get_hash(skb);
1302 for (i = 0; i < ROLLOVER_HLEN; i++)
1303 if (po->rollover->history[i] == rxhash)
1304 count++;
1305
1306 po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
1307 return count > (ROLLOVER_HLEN >> 1);
1308}
1309
77f65ebd
WB
1310static unsigned int fanout_demux_hash(struct packet_fanout *f,
1311 struct sk_buff *skb,
1312 unsigned int num)
dc99f600 1313{
eb70db87 1314 return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
dc99f600
DM
1315}
1316
77f65ebd
WB
1317static unsigned int fanout_demux_lb(struct packet_fanout *f,
1318 struct sk_buff *skb,
1319 unsigned int num)
dc99f600 1320{
468479e6 1321 unsigned int val = atomic_inc_return(&f->rr_cur);
dc99f600 1322
468479e6 1323 return val % num;
77f65ebd
WB
1324}
1325
1326static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1327 struct sk_buff *skb,
1328 unsigned int num)
1329{
1330 return smp_processor_id() % num;
dc99f600
DM
1331}
1332
5df0ddfb
DB
1333static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1334 struct sk_buff *skb,
1335 unsigned int num)
1336{
f337db64 1337 return prandom_u32_max(num);
5df0ddfb
DB
1338}
1339
77f65ebd
WB
1340static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1341 struct sk_buff *skb,
ad377cab 1342 unsigned int idx, bool try_self,
77f65ebd 1343 unsigned int num)
95ec3eb4 1344{
4633c9e0 1345 struct packet_sock *po, *po_next, *po_skip = NULL;
a9b63918 1346 unsigned int i, j, room = ROOM_NONE;
95ec3eb4 1347
0648ab70 1348 po = pkt_sk(f->arr[idx]);
3b3a5b0a
WB
1349
1350 if (try_self) {
1351 room = packet_rcv_has_room(po, skb);
1352 if (room == ROOM_NORMAL ||
1353 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1354 return idx;
4633c9e0 1355 po_skip = po;
3b3a5b0a 1356 }
ad377cab 1357
0648ab70 1358 i = j = min_t(int, po->rollover->sock, num - 1);
77f65ebd 1359 do {
2ccdbaa6 1360 po_next = pkt_sk(f->arr[i]);
4633c9e0 1361 if (po_next != po_skip && !po_next->pressure &&
2ccdbaa6 1362 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
77f65ebd 1363 if (i != j)
0648ab70 1364 po->rollover->sock = i;
a9b63918
WB
1365 atomic_long_inc(&po->rollover->num);
1366 if (room == ROOM_LOW)
1367 atomic_long_inc(&po->rollover->num_huge);
77f65ebd
WB
1368 return i;
1369 }
ad377cab 1370
77f65ebd
WB
1371 if (++i == num)
1372 i = 0;
1373 } while (i != j);
1374
a9b63918 1375 atomic_long_inc(&po->rollover->num_failed);
77f65ebd
WB
1376 return idx;
1377}
1378
2d36097d
NH
1379static unsigned int fanout_demux_qm(struct packet_fanout *f,
1380 struct sk_buff *skb,
1381 unsigned int num)
1382{
1383 return skb_get_queue_mapping(skb) % num;
1384}
1385
47dceb8e
WB
1386static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1387 struct sk_buff *skb,
1388 unsigned int num)
1389{
1390 struct bpf_prog *prog;
1391 unsigned int ret = 0;
1392
1393 rcu_read_lock();
1394 prog = rcu_dereference(f->bpf_prog);
1395 if (prog)
ff936a04 1396 ret = bpf_prog_run_clear_cb(prog, skb) % num;
47dceb8e
WB
1397 rcu_read_unlock();
1398
1399 return ret;
1400}
1401
77f65ebd
WB
1402static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1403{
1404 return f->flags & (flag >> 8);
95ec3eb4
DM
1405}
1406
95ec3eb4
DM
1407static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1408 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1409{
1410 struct packet_fanout *f = pt->af_packet_priv;
f98f4514 1411 unsigned int num = READ_ONCE(f->num_members);
19bcf9f2 1412 struct net *net = read_pnet(&f->net);
dc99f600 1413 struct packet_sock *po;
77f65ebd 1414 unsigned int idx;
dc99f600 1415
19bcf9f2 1416 if (!net_eq(dev_net(dev), net) || !num) {
dc99f600
DM
1417 kfree_skb(skb);
1418 return 0;
1419 }
1420
3f34b24a 1421 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
19bcf9f2 1422 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
3f34b24a
AD
1423 if (!skb)
1424 return 0;
1425 }
95ec3eb4
DM
1426 switch (f->type) {
1427 case PACKET_FANOUT_HASH:
1428 default:
77f65ebd 1429 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1430 break;
1431 case PACKET_FANOUT_LB:
77f65ebd 1432 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1433 break;
1434 case PACKET_FANOUT_CPU:
77f65ebd
WB
1435 idx = fanout_demux_cpu(f, skb, num);
1436 break;
5df0ddfb
DB
1437 case PACKET_FANOUT_RND:
1438 idx = fanout_demux_rnd(f, skb, num);
1439 break;
2d36097d
NH
1440 case PACKET_FANOUT_QM:
1441 idx = fanout_demux_qm(f, skb, num);
1442 break;
77f65ebd 1443 case PACKET_FANOUT_ROLLOVER:
ad377cab 1444 idx = fanout_demux_rollover(f, skb, 0, false, num);
95ec3eb4 1445 break;
47dceb8e 1446 case PACKET_FANOUT_CBPF:
f2e52095 1447 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1448 idx = fanout_demux_bpf(f, skb, num);
1449 break;
dc99f600
DM
1450 }
1451
ad377cab
WB
1452 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1453 idx = fanout_demux_rollover(f, skb, idx, true, num);
dc99f600 1454
ad377cab 1455 po = pkt_sk(f->arr[idx]);
dc99f600
DM
1456 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1457}
1458
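packet_rcv_fanout() is what every member socket's prot_hook points at once it joins a group; joining is a single setsockopt per socket. A hedged sketch (group id 42 and the hash mode are arbitrary examples):

	/* Low 16 bits: group id. High 16 bits: mode, plus optional
	 * PACKET_FANOUT_FLAG_* bits. */
	int join = 42 | (PACKET_FANOUT_HASH << 16);

	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &join, sizeof(join));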
fff3321d
PE
1459DEFINE_MUTEX(fanout_mutex);
1460EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600 1461static LIST_HEAD(fanout_list);
4a69a864 1462static u16 fanout_next_id;
dc99f600
DM
1463
1464static void __fanout_link(struct sock *sk, struct packet_sock *po)
1465{
1466 struct packet_fanout *f = po->fanout;
1467
1468 spin_lock(&f->lock);
1469 f->arr[f->num_members] = sk;
1470 smp_wmb();
1471 f->num_members++;
2bd624b4
AS
1472 if (f->num_members == 1)
1473 dev_add_pack(&f->prot_hook);
dc99f600
DM
1474 spin_unlock(&f->lock);
1475}
1476
1477static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1478{
1479 struct packet_fanout *f = po->fanout;
1480 int i;
1481
1482 spin_lock(&f->lock);
1483 for (i = 0; i < f->num_members; i++) {
1484 if (f->arr[i] == sk)
1485 break;
1486 }
1487 BUG_ON(i >= f->num_members);
1488 f->arr[i] = f->arr[f->num_members - 1];
1489 f->num_members--;
2bd624b4
AS
1490 if (f->num_members == 0)
1491 __dev_remove_pack(&f->prot_hook);
dc99f600
DM
1492 spin_unlock(&f->lock);
1493}
1494
d4dd8aee 1495static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
c0de08d0 1496{
161642e2
ED
1497 if (sk->sk_family != PF_PACKET)
1498 return false;
c0de08d0 1499
161642e2 1500 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
c0de08d0
EL
1501}
1502
47dceb8e
WB
1503static void fanout_init_data(struct packet_fanout *f)
1504{
1505 switch (f->type) {
1506 case PACKET_FANOUT_LB:
1507 atomic_set(&f->rr_cur, 0);
1508 break;
1509 case PACKET_FANOUT_CBPF:
f2e52095 1510 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1511 RCU_INIT_POINTER(f->bpf_prog, NULL);
1512 break;
1513 }
1514}
1515
1516static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1517{
1518 struct bpf_prog *old;
1519
1520 spin_lock(&f->lock);
1521 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1522 rcu_assign_pointer(f->bpf_prog, new);
1523 spin_unlock(&f->lock);
1524
1525 if (old) {
1526 synchronize_net();
1527 bpf_prog_destroy(old);
1528 }
1529}
1530
1531static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1532 unsigned int len)
1533{
1534 struct bpf_prog *new;
1535 struct sock_fprog fprog;
1536 int ret;
1537
1538 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1539 return -EPERM;
1540 if (len != sizeof(fprog))
1541 return -EINVAL;
1542 if (copy_from_user(&fprog, data, len))
1543 return -EFAULT;
1544
bab18991 1545 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
47dceb8e
WB
1546 if (ret)
1547 return ret;
1548
1549 __fanout_set_data_bpf(po->fanout, new);
1550 return 0;
1551}
1552
f2e52095
WB
1553static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1554 unsigned int len)
1555{
1556 struct bpf_prog *new;
1557 u32 fd;
1558
1559 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1560 return -EPERM;
1561 if (len != sizeof(fd))
1562 return -EINVAL;
1563 if (copy_from_user(&fd, data, len))
1564 return -EFAULT;
1565
113214be 1566 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
f2e52095
WB
1567 if (IS_ERR(new))
1568 return PTR_ERR(new);
f2e52095
WB
1569
1570 __fanout_set_data_bpf(po->fanout, new);
1571 return 0;
1572}
1573
47dceb8e
WB
1574static int fanout_set_data(struct packet_sock *po, char __user *data,
1575 unsigned int len)
1576{
1577 switch (po->fanout->type) {
1578 case PACKET_FANOUT_CBPF:
1579 return fanout_set_data_cbpf(po, data, len);
f2e52095
WB
1580 case PACKET_FANOUT_EBPF:
1581 return fanout_set_data_ebpf(po, data, len);
47dceb8e
WB
1582 default:
1583 return -EINVAL;
1584 };
1585}
1586
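fanout_set_data() is reached through the PACKET_FANOUT_DATA socket option; for a PACKET_FANOUT_CBPF group the payload is an ordinary struct sock_fprog whose return value picks the member socket (modulo the member count). A hedged sketch that demuxes on the first byte of the packet (requires <linux/filter.h>):

	struct sock_filter insns[] = {
		{ BPF_LD | BPF_B | BPF_ABS, 0, 0, 0 },	/* A = pkt[0] */
		{ BPF_RET | BPF_A,          0, 0, 0 },	/* return A */
	};
	struct sock_fprog prog = { .len = 2, .filter = insns };

	/* Only meaningful once the socket is in a PACKET_FANOUT_CBPF group. */
	setsockopt(fd, SOL_PACKET, PACKET_FANOUT_DATA, &prog, sizeof(prog));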
1587static void fanout_release_data(struct packet_fanout *f)
1588{
1589 switch (f->type) {
1590 case PACKET_FANOUT_CBPF:
f2e52095 1591 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1592 __fanout_set_data_bpf(f, NULL);
1593 };
1594}
1595
4a69a864
MM
1596static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1597{
1598 struct packet_fanout *f;
1599
1600 list_for_each_entry(f, &fanout_list, list) {
1601 if (f->id == candidate_id &&
1602 read_pnet(&f->net) == sock_net(sk)) {
1603 return false;
1604 }
1605 }
1606 return true;
1607}
1608
1609static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1610{
1611 u16 id = fanout_next_id;
1612
1613 do {
1614 if (__fanout_id_is_free(sk, id)) {
1615 *new_id = id;
1616 fanout_next_id = id + 1;
1617 return true;
1618 }
1619
1620 id++;
1621 } while (id != fanout_next_id);
1622
1623 return false;
1624}
1625
7736d33f 1626static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600 1627{
d199fab6 1628 struct packet_rollover *rollover = NULL;
dc99f600
DM
1629 struct packet_sock *po = pkt_sk(sk);
1630 struct packet_fanout *f, *match;
7736d33f 1631 u8 type = type_flags & 0xff;
77f65ebd 1632 u8 flags = type_flags >> 8;
dc99f600
DM
1633 int err;
1634
1635 switch (type) {
77f65ebd
WB
1636 case PACKET_FANOUT_ROLLOVER:
1637 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1638 return -EINVAL;
dc99f600
DM
1639 case PACKET_FANOUT_HASH:
1640 case PACKET_FANOUT_LB:
95ec3eb4 1641 case PACKET_FANOUT_CPU:
5df0ddfb 1642 case PACKET_FANOUT_RND:
2d36097d 1643 case PACKET_FANOUT_QM:
47dceb8e 1644 case PACKET_FANOUT_CBPF:
f2e52095 1645 case PACKET_FANOUT_EBPF:
dc99f600
DM
1646 break;
1647 default:
1648 return -EINVAL;
1649 }
1650
d199fab6
ED
1651 mutex_lock(&fanout_mutex);
1652
d199fab6 1653 err = -EALREADY;
dc99f600 1654 if (po->fanout)
d199fab6 1655 goto out;
dc99f600 1656
4633c9e0
WB
1657 if (type == PACKET_FANOUT_ROLLOVER ||
1658 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
d199fab6
ED
1659 err = -ENOMEM;
1660 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1661 if (!rollover)
1662 goto out;
1663 atomic_long_set(&rollover->num, 0);
1664 atomic_long_set(&rollover->num_huge, 0);
1665 atomic_long_set(&rollover->num_failed, 0);
0648ab70
WB
1666 }
1667
4a69a864
MM
1668 if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1669 if (id != 0) {
1670 err = -EINVAL;
1671 goto out;
1672 }
1673 if (!fanout_find_new_id(sk, &id)) {
1674 err = -ENOMEM;
1675 goto out;
1676 }
1677 /* ephemeral flag for the first socket in the group: drop it */
1678 flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
1679 }
1680
dc99f600
DM
1681 match = NULL;
1682 list_for_each_entry(f, &fanout_list, list) {
1683 if (f->id == id &&
1684 read_pnet(&f->net) == sock_net(sk)) {
1685 match = f;
1686 break;
1687 }
1688 }
afe62c68 1689 err = -EINVAL;
77f65ebd 1690 if (match && match->flags != flags)
afe62c68 1691 goto out;
dc99f600 1692 if (!match) {
afe62c68 1693 err = -ENOMEM;
dc99f600 1694 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1695 if (!match)
1696 goto out;
1697 write_pnet(&match->net, sock_net(sk));
1698 match->id = id;
1699 match->type = type;
77f65ebd 1700 match->flags = flags;
afe62c68
ED
1701 INIT_LIST_HEAD(&match->list);
1702 spin_lock_init(&match->lock);
fb5c2c17 1703 refcount_set(&match->sk_ref, 0);
47dceb8e 1704 fanout_init_data(match);
afe62c68
ED
1705 match->prot_hook.type = po->prot_hook.type;
1706 match->prot_hook.dev = po->prot_hook.dev;
1707 match->prot_hook.func = packet_rcv_fanout;
1708 match->prot_hook.af_packet_priv = match;
c0de08d0 1709 match->prot_hook.id_match = match_fanout_group;
afe62c68 1710 list_add(&match->list, &fanout_list);
dc99f600 1711 }
afe62c68 1712 err = -EINVAL;
008ba2a1
WB
1713
1714 spin_lock(&po->bind_lock);
1715 if (po->running &&
1716 match->type == type &&
afe62c68
ED
1717 match->prot_hook.type == po->prot_hook.type &&
1718 match->prot_hook.dev == po->prot_hook.dev) {
1719 err = -ENOSPC;
fb5c2c17 1720 if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
afe62c68
ED
1721 __dev_remove_pack(&po->prot_hook);
1722 po->fanout = match;
57f015f5
MM
1723 po->rollover = rollover;
1724 rollover = NULL;
fb5c2c17 1725 refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
afe62c68
ED
1726 __fanout_link(sk, po);
1727 err = 0;
dc99f600
DM
1728 }
1729 }
008ba2a1
WB
1730 spin_unlock(&po->bind_lock);
1731
1732 if (err && !refcount_read(&match->sk_ref)) {
1733 list_del(&match->list);
1734 kfree(match);
1735 }
1736
afe62c68 1737out:
57f015f5 1738 kfree(rollover);
d199fab6 1739 mutex_unlock(&fanout_mutex);
dc99f600
DM
1740 return err;
1741}
1742
2bd624b4
AS
1743/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1744 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1745 * It is the responsibility of the caller to call fanout_release_data() and
1746 * free the returned packet_fanout (after synchronize_net())
1747 */
1748static struct packet_fanout *fanout_release(struct sock *sk)
dc99f600
DM
1749{
1750 struct packet_sock *po = pkt_sk(sk);
1751 struct packet_fanout *f;
1752
fff3321d 1753 mutex_lock(&fanout_mutex);
d199fab6
ED
1754 f = po->fanout;
1755 if (f) {
1756 po->fanout = NULL;
1757
fb5c2c17 1758 if (refcount_dec_and_test(&f->sk_ref))
d199fab6 1759 list_del(&f->list);
2bd624b4
AS
1760 else
1761 f = NULL;
dc99f600
DM
1762 }
1763 mutex_unlock(&fanout_mutex);
2bd624b4
AS
1764
1765 return f;
dc99f600 1766}
1da177e4 1767
3c70c132
DB
1768static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1769 struct sk_buff *skb)
1770{
1771 /* Earlier code assumed this would be a VLAN pkt, double-check
1772 * this now that we have the actual packet in hand. We can only
1773 * do this check on Ethernet devices.
1774 */
1775 if (unlikely(dev->type != ARPHRD_ETHER))
1776 return false;
1777
1778 skb_reset_mac_header(skb);
1779 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1780}
1781
90ddc4f0 1782static const struct proto_ops packet_ops;
1da177e4 1783
90ddc4f0 1784static const struct proto_ops packet_ops_spkt;
1da177e4 1785
40d4e3df
ED
1786static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1787 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1788{
1789 struct sock *sk;
1790 struct sockaddr_pkt *spkt;
1791
1792 /*
1793 * When we registered the protocol we saved the socket in the data
1794 * field for just this event.
1795 */
1796
1797 sk = pt->af_packet_priv;
1ce4f28b 1798
1da177e4
LT
1799 /*
1800 * Yank back the headers [hope the device set this
1801 * right or kerboom...]
1802 *
1803 * Incoming packets have ll header pulled,
1804 * push it back.
1805 *
98e399f8 1806 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1807 * so that this procedure is noop.
1808 */
1809
1810 if (skb->pkt_type == PACKET_LOOPBACK)
1811 goto out;
1812
09ad9bc7 1813 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1814 goto out;
1815
40d4e3df
ED
1816 skb = skb_share_check(skb, GFP_ATOMIC);
1817 if (skb == NULL)
1da177e4
LT
1818 goto oom;
1819
1820 /* drop any routing info */
adf30907 1821 skb_dst_drop(skb);
1da177e4 1822
84531c24
PO
1823 /* drop conntrack reference */
1824 nf_reset(skb);
1825
ffbc6111 1826 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1827
98e399f8 1828 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1829
1830 /*
1831 * The SOCK_PACKET socket receives _all_ frames.
1832 */
1833
1834 spkt->spkt_family = dev->type;
1835 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1836 spkt->spkt_protocol = skb->protocol;
1837
1838 /*
1839 * Charge the memory to the socket. This is done specifically
1840 * to prevent sockets using all the memory up.
1841 */
1842
40d4e3df 1843 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1844 return 0;
1845
1846out:
1847 kfree_skb(skb);
1848oom:
1849 return 0;
1850}
1851
1852
1853/*
1854 * Output a raw packet to a device layer. This bypasses all the other
1855 * protocol layers and you must therefore supply it with a complete frame
1856 */
1ce4f28b 1857
1b784140
YX
1858static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1859 size_t len)
1da177e4
LT
1860{
1861 struct sock *sk = sock->sk;
342dfc30 1862 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1863 struct sk_buff *skb = NULL;
1da177e4 1864 struct net_device *dev;
c14ac945 1865 struct sockcm_cookie sockc;
40d4e3df 1866 __be16 proto = 0;
1da177e4 1867 int err;
3bdc0eba 1868 int extra_len = 0;
1ce4f28b 1869
1da177e4 1870 /*
1ce4f28b 1871 * Get and verify the address.
1da177e4
LT
1872 */
1873
40d4e3df 1874 if (saddr) {
1da177e4 1875 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1876 return -EINVAL;
1877 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1878 proto = saddr->spkt_protocol;
1879 } else
1880 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1881
1882 /*
1ce4f28b 1883 * Find the device first to size check it
1da177e4
LT
1884 */
1885
de74e92a 1886 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1887retry:
654d1f8a
ED
1888 rcu_read_lock();
1889 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1890 err = -ENODEV;
1891 if (dev == NULL)
1892 goto out_unlock;
1ce4f28b 1893
d5e76b0a
DM
1894 err = -ENETDOWN;
1895 if (!(dev->flags & IFF_UP))
1896 goto out_unlock;
1897
1da177e4 1898 /*
40d4e3df
ED
1899 * You may not queue a frame bigger than the mtu. This is the lowest level
1900 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1901 */
1ce4f28b 1902
3bdc0eba
BG
1903 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1904 if (!netif_supports_nofcs(dev)) {
1905 err = -EPROTONOSUPPORT;
1906 goto out_unlock;
1907 }
1908 extra_len = 4; /* We're doing our own CRC */
1909 }
1910
1da177e4 1911 err = -EMSGSIZE;
3bdc0eba 1912 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1913 goto out_unlock;
1914
1a35ca80
ED
1915 if (!skb) {
1916 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1917 int tlen = dev->needed_tailroom;
1a35ca80
ED
1918 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1919
1920 rcu_read_unlock();
4ce40912 1921 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1922 if (skb == NULL)
1923 return -ENOBUFS;
1924 /* FIXME: Save some space for broken drivers that write a hard
1925 * header at transmission time by themselves. PPP is the notable
1926 * one here. This should really be fixed at the driver level.
1927 */
1928 skb_reserve(skb, reserved);
1929 skb_reset_network_header(skb);
1930
1931 /* Try to align data part correctly */
1932 if (hhlen) {
1933 skb->data -= hhlen;
1934 skb->tail -= hhlen;
1935 if (len < hhlen)
1936 skb_reset_network_header(skb);
1937 }
6ce8e9ce 1938 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1939 if (err)
1940 goto out_free;
1941 goto retry;
1da177e4
LT
1942 }
1943
9ed988cd
WB
1944 if (!dev_validate_header(dev, skb->data, len)) {
1945 err = -EINVAL;
1946 goto out_unlock;
1947 }
3c70c132
DB
1948 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1949 !packet_extra_vlan_len_allowed(dev, skb)) {
1950 err = -EMSGSIZE;
1951 goto out_unlock;
57f89bfa 1952 }
1a35ca80 1953
edbe7746 1954 sockc.tsflags = sk->sk_tsflags;
c14ac945
SHY
1955 if (msg->msg_controllen) {
1956 err = sock_cmsg_send(sk, msg, &sockc);
f8e7718c 1957 if (unlikely(err))
c14ac945 1958 goto out_unlock;
c14ac945
SHY
1959 }
1960
1da177e4
LT
1961 skb->protocol = proto;
1962 skb->dev = dev;
1963 skb->priority = sk->sk_priority;
2d37a186 1964 skb->mark = sk->sk_mark;
bf84a010 1965
c14ac945 1966 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
1da177e4 1967
3bdc0eba
BG
1968 if (unlikely(extra_len == 4))
1969 skb->no_fcs = 1;
1970
40893fd0 1971 skb_probe_transport_header(skb, 0);
c1aad275 1972
1da177e4 1973 dev_queue_xmit(skb);
654d1f8a 1974 rcu_read_unlock();
40d4e3df 1975 return len;
1da177e4 1976
1da177e4 1977out_unlock:
654d1f8a 1978 rcu_read_unlock();
1a35ca80
ED
1979out_free:
1980 kfree_skb(skb);
1da177e4
LT
1981 return err;
1982}
1da177e4 1983
ff936a04
AS
1984static unsigned int run_filter(struct sk_buff *skb,
1985 const struct sock *sk,
1986 unsigned int res)
1da177e4
LT
1987{
1988 struct sk_filter *filter;
fda9ef5d 1989
80f8f102
ED
1990 rcu_read_lock();
1991 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1992 if (filter != NULL)
ff936a04 1993 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 1994 rcu_read_unlock();
1da177e4 1995
dbcb5855 1996 return res;
1da177e4
LT
1997}
1998
16cc1400
WB
1999static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2000 size_t *len)
2001{
2002 struct virtio_net_hdr vnet_hdr;
2003
2004 if (*len < sizeof(vnet_hdr))
2005 return -EINVAL;
2006 *len -= sizeof(vnet_hdr);
2007
6391a448 2008 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true))
16cc1400
WB
2009 return -EINVAL;
2010
2011 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2012}
2013
1da177e4 2014/*
62ab0812
ED
2015 * This function does lazy skb cloning in the hope that most packets
2016 * are discarded by BPF.
2017 *
2018 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
2019 * and skb->cb are mangled. It works because (and until) packets
2020 * falling here are owned by current CPU. Output packets are cloned
2021 * by dev_queue_xmit_nit(), input packets are processed by net_bh
2022 * sequentially, so that if we return skb to original state on exit,
2023 * we will not harm anyone.
1da177e4
LT
2024 */
2025
40d4e3df
ED
2026static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2027 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2028{
2029 struct sock *sk;
2030 struct sockaddr_ll *sll;
2031 struct packet_sock *po;
40d4e3df 2032 u8 *skb_head = skb->data;
1da177e4 2033 int skb_len = skb->len;
dbcb5855 2034 unsigned int snaplen, res;
da37845f 2035 bool is_drop_n_account = false;
1da177e4
LT
2036
2037 if (skb->pkt_type == PACKET_LOOPBACK)
2038 goto drop;
2039
2040 sk = pt->af_packet_priv;
2041 po = pkt_sk(sk);
2042
09ad9bc7 2043 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2044 goto drop;
2045
1da177e4
LT
2046 skb->dev = dev;
2047
3b04ddde 2048 if (dev->header_ops) {
1da177e4 2049 /* The device has an explicit notion of ll header,
62ab0812
ED
2050 * exported to higher levels.
2051 *
2052 * Otherwise, the device hides details of its frame
2053 * structure, so that corresponding packet head is
2054 * never delivered to user.
1da177e4
LT
2055 */
2056 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2057 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2058 else if (skb->pkt_type == PACKET_OUTGOING) {
2059 /* Special case: outgoing packets have ll header at head */
bbe735e4 2060 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2061 }
2062 }
2063
2064 snaplen = skb->len;
2065
dbcb5855
DM
2066 res = run_filter(skb, sk, snaplen);
2067 if (!res)
fda9ef5d 2068 goto drop_n_restore;
dbcb5855
DM
2069 if (snaplen > res)
2070 snaplen = res;
1da177e4 2071
0fd7bac6 2072 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2073 goto drop_n_acct;
2074
2075 if (skb_shared(skb)) {
2076 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2077 if (nskb == NULL)
2078 goto drop_n_acct;
2079
2080 if (skb_head != skb->data) {
2081 skb->data = skb_head;
2082 skb->len = skb_len;
2083 }
abc4e4fa 2084 consume_skb(skb);
1da177e4
LT
2085 skb = nskb;
2086 }
2087
b4772ef8 2088 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2089
2090 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2091 sll->sll_hatype = dev->type;
1da177e4 2092 sll->sll_pkttype = skb->pkt_type;
8032b464 2093 if (unlikely(po->origdev))
80feaacb
PWJ
2094 sll->sll_ifindex = orig_dev->ifindex;
2095 else
2096 sll->sll_ifindex = dev->ifindex;
1da177e4 2097
b95cce35 2098 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2099
2472d761
EB
2100 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2101 * Use their space for storing the original skb length.
2102 */
2103 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2104
1da177e4
LT
2105 if (pskb_trim(skb, snaplen))
2106 goto drop_n_acct;
2107
2108 skb_set_owner_r(skb, sk);
2109 skb->dev = NULL;
adf30907 2110 skb_dst_drop(skb);
1da177e4 2111
84531c24
PO
2112 /* drop conntrack reference */
2113 nf_reset(skb);
2114
1da177e4 2115 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2116 po->stats.stats1.tp_packets++;
3bc3b96f 2117 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
2118 __skb_queue_tail(&sk->sk_receive_queue, skb);
2119 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2120 sk->sk_data_ready(sk);
1da177e4
LT
2121 return 0;
2122
2123drop_n_acct:
da37845f 2124 is_drop_n_account = true;
7091fbd8 2125 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2126 po->stats.stats1.tp_drops++;
7091fbd8
WB
2127 atomic_inc(&sk->sk_drops);
2128 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
2129
2130drop_n_restore:
2131 if (skb_head != skb->data && skb_shared(skb)) {
2132 skb->data = skb_head;
2133 skb->len = skb_len;
2134 }
2135drop:
da37845f
WJ
2136 if (!is_drop_n_account)
2137 consume_skb(skb);
2138 else
2139 kfree_skb(skb);
1da177e4
LT
2140 return 0;
2141}
2142
40d4e3df
ED
2143static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2144 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2145{
2146 struct sock *sk;
2147 struct packet_sock *po;
2148 struct sockaddr_ll *sll;
184f489e 2149 union tpacket_uhdr h;
40d4e3df 2150 u8 *skb_head = skb->data;
1da177e4 2151 int skb_len = skb->len;
dbcb5855 2152 unsigned int snaplen, res;
f6fb8f10 2153 unsigned long status = TP_STATUS_USER;
bbd6ef87 2154 unsigned short macoff, netoff, hdrlen;
1da177e4 2155 struct sk_buff *copy_skb = NULL;
bbd6ef87 2156 struct timespec ts;
b9c32fb2 2157 __u32 ts_status;
da37845f 2158 bool is_drop_n_account = false;
edbd58be 2159 bool do_vnet = false;
1da177e4 2160
51846355
AW
2161 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2162 * We may add members to them until current aligned size without forcing
2163 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2164 */
2165 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2166 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2167
1da177e4
LT
2168 if (skb->pkt_type == PACKET_LOOPBACK)
2169 goto drop;
2170
2171 sk = pt->af_packet_priv;
2172 po = pkt_sk(sk);
2173
09ad9bc7 2174 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2175 goto drop;
2176
3b04ddde 2177 if (dev->header_ops) {
1da177e4 2178 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2179 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2180 else if (skb->pkt_type == PACKET_OUTGOING) {
2181 /* Special case: outgoing packets have ll header at head */
bbe735e4 2182 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2183 }
2184 }
2185
2186 snaplen = skb->len;
2187
dbcb5855
DM
2188 res = run_filter(skb, sk, snaplen);
2189 if (!res)
fda9ef5d 2190 goto drop_n_restore;
68c2e5de
AD
2191
2192 if (skb->ip_summed == CHECKSUM_PARTIAL)
2193 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2194 else if (skb->pkt_type != PACKET_OUTGOING &&
2195 (skb->ip_summed == CHECKSUM_COMPLETE ||
2196 skb_csum_unnecessary(skb)))
2197 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2198
dbcb5855
DM
2199 if (snaplen > res)
2200 snaplen = res;
1da177e4
LT
2201
2202 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2203 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2204 po->tp_reserve;
1da177e4 2205 } else {
95c96174 2206 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2207 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2208 (maclen < 16 ? 16 : maclen)) +
58d19b19 2209 po->tp_reserve;
edbd58be 2210 if (po->has_vnet_hdr) {
58d19b19 2211 netoff += sizeof(struct virtio_net_hdr);
edbd58be
BP
2212 do_vnet = true;
2213 }
1da177e4
LT
2214 macoff = netoff - maclen;
2215 }
f6fb8f10 2216 if (po->tp_version <= TPACKET_V2) {
2217 if (macoff + snaplen > po->rx_ring.frame_size) {
2218 if (po->copy_thresh &&
0fd7bac6 2219 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2220 if (skb_shared(skb)) {
2221 copy_skb = skb_clone(skb, GFP_ATOMIC);
2222 } else {
2223 copy_skb = skb_get(skb);
2224 skb_head = skb->data;
2225 }
2226 if (copy_skb)
2227 skb_set_owner_r(copy_skb, sk);
1da177e4 2228 }
f6fb8f10 2229 snaplen = po->rx_ring.frame_size - macoff;
edbd58be 2230 if ((int)snaplen < 0) {
f6fb8f10 2231 snaplen = 0;
edbd58be
BP
2232 do_vnet = false;
2233 }
1da177e4 2234 }
dc808110
ED
2235 } else if (unlikely(macoff + snaplen >
2236 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2237 u32 nval;
2238
2239 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2240 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2241 snaplen, nval, macoff);
2242 snaplen = nval;
2243 if (unlikely((int)snaplen < 0)) {
2244 snaplen = 0;
2245 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
edbd58be 2246 do_vnet = false;
dc808110 2247 }
1da177e4 2248 }
1da177e4 2249 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2250 h.raw = packet_current_rx_frame(po, skb,
2251 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2252 if (!h.raw)
58d19b19 2253 goto drop_n_account;
f6fb8f10 2254 if (po->tp_version <= TPACKET_V2) {
2255 packet_increment_rx_head(po, &po->rx_ring);
2256 /*
2257 * LOSING will be reported till you read the stats,
2258 * because it's COR - Clear On Read.
2259 * Anyways, moving it for V1/V2 only as V3 doesn't need this
2260 * at packet level.
2261 */
ee80fbf3 2262 if (po->stats.stats1.tp_drops)
f6fb8f10 2263 status |= TP_STATUS_LOSING;
2264 }
ee80fbf3 2265 po->stats.stats1.tp_packets++;
1da177e4
LT
2266 if (copy_skb) {
2267 status |= TP_STATUS_COPY;
2268 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2269 }
1da177e4
LT
2270 spin_unlock(&sk->sk_receive_queue.lock);
2271
edbd58be 2272 if (do_vnet) {
5a213881
JR
2273 if (virtio_net_hdr_from_skb(skb, h.raw + macoff -
2274 sizeof(struct virtio_net_hdr),
6391a448 2275 vio_le(), true)) {
58d19b19
WB
2276 spin_lock(&sk->sk_receive_queue.lock);
2277 goto drop_n_account;
2278 }
2279 }
2280
bbd6ef87 2281 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2282
2283 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2284 getnstimeofday(&ts);
1da177e4 2285
b9c32fb2
DB
2286 status |= ts_status;
2287
bbd6ef87
PM
2288 switch (po->tp_version) {
2289 case TPACKET_V1:
2290 h.h1->tp_len = skb->len;
2291 h.h1->tp_snaplen = snaplen;
2292 h.h1->tp_mac = macoff;
2293 h.h1->tp_net = netoff;
4b457bdf
DB
2294 h.h1->tp_sec = ts.tv_sec;
2295 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2296 hdrlen = sizeof(*h.h1);
2297 break;
2298 case TPACKET_V2:
2299 h.h2->tp_len = skb->len;
2300 h.h2->tp_snaplen = snaplen;
2301 h.h2->tp_mac = macoff;
2302 h.h2->tp_net = netoff;
bbd6ef87
PM
2303 h.h2->tp_sec = ts.tv_sec;
2304 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2305 if (skb_vlan_tag_present(skb)) {
2306 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2307 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2308 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2309 } else {
2310 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2311 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2312 }
e4d26f4b 2313 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2314 hdrlen = sizeof(*h.h2);
2315 break;
f6fb8f10 2316 case TPACKET_V3:
2317 /* tp_nxt_offset,vlan are already populated above.
2318 * So DONT clear those fields here
2319 */
2320 h.h3->tp_status |= status;
2321 h.h3->tp_len = skb->len;
2322 h.h3->tp_snaplen = snaplen;
2323 h.h3->tp_mac = macoff;
2324 h.h3->tp_net = netoff;
f6fb8f10 2325 h.h3->tp_sec = ts.tv_sec;
2326 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2327 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2328 hdrlen = sizeof(*h.h3);
2329 break;
bbd6ef87
PM
2330 default:
2331 BUG();
2332 }
1da177e4 2333
bbd6ef87 2334 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2335 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2336 sll->sll_family = AF_PACKET;
2337 sll->sll_hatype = dev->type;
2338 sll->sll_protocol = skb->protocol;
2339 sll->sll_pkttype = skb->pkt_type;
8032b464 2340 if (unlikely(po->origdev))
80feaacb
PWJ
2341 sll->sll_ifindex = orig_dev->ifindex;
2342 else
2343 sll->sll_ifindex = dev->ifindex;
1da177e4 2344
e16aa207 2345 smp_mb();
f0d4eb29 2346
f6dafa95 2347#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2348 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2349 u8 *start, *end;
2350
f0d4eb29
DB
2351 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2352 macoff + snaplen);
2353
2354 for (start = h.raw; start < end; start += PAGE_SIZE)
2355 flush_dcache_page(pgv_to_page(start));
1da177e4 2356 }
f0d4eb29 2357 smp_wmb();
f6dafa95 2358#endif
f0d4eb29 2359
da413eec 2360 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2361 __packet_set_status(po, h.raw, status);
da413eec
DC
2362 sk->sk_data_ready(sk);
2363 } else {
f6fb8f10 2364 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2365 }
1da177e4
LT
2366
2367drop_n_restore:
2368 if (skb_head != skb->data && skb_shared(skb)) {
2369 skb->data = skb_head;
2370 skb->len = skb_len;
2371 }
2372drop:
da37845f
WJ
2373 if (!is_drop_n_account)
2374 consume_skb(skb);
2375 else
2376 kfree_skb(skb);
1da177e4
LT
2377 return 0;
2378
58d19b19 2379drop_n_account:
da37845f 2380 is_drop_n_account = true;
ee80fbf3 2381 po->stats.stats1.tp_drops++;
1da177e4
LT
2382 spin_unlock(&sk->sk_receive_queue.lock);
2383
676d2369 2384 sk->sk_data_ready(sk);
acb5d75b 2385 kfree_skb(copy_skb);
1da177e4
LT
2386 goto drop_n_restore;
2387}
2388
69e3c75f
JB
2389static void tpacket_destruct_skb(struct sk_buff *skb)
2390{
2391 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2392
69e3c75f 2393 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2394 void *ph;
b9c32fb2
DB
2395 __u32 ts;
2396
69e3c75f 2397 ph = skb_shinfo(skb)->destructor_arg;
b0138408 2398 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2399
2400 ts = __packet_set_timestamp(po, ph, skb);
2401 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2402 }
2403
2404 sock_wfree(skb);
2405}
2406
c72219b7
DB
2407static void tpacket_set_protocol(const struct net_device *dev,
2408 struct sk_buff *skb)
2409{
2410 if (dev->type == ARPHRD_ETHER) {
2411 skb_reset_mac_header(skb);
2412 skb->protocol = eth_hdr(skb)->h_proto;
2413 }
2414}
2415
16cc1400
WB
2416static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2417{
16cc1400
WB
2418 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2419 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2420 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2421 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2422 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2423 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2424 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2425
2426 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2427 return -EINVAL;
2428
16cc1400
WB
2429 return 0;
2430}
2431
2432static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2433 struct virtio_net_hdr *vnet_hdr)
2434{
16cc1400
WB
2435 if (*len < sizeof(*vnet_hdr))
2436 return -EINVAL;
2437 *len -= sizeof(*vnet_hdr);
2438
cbbd26b8 2439 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
16cc1400
WB
2440 return -EFAULT;
2441
2442 return __packet_snd_vnet_parse(vnet_hdr, *len);
2443}
2444
40d4e3df 2445static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2446 void *frame, struct net_device *dev, void *data, int tp_len,
c14ac945
SHY
2447 __be16 proto, unsigned char *addr, int hlen, int copylen,
2448 const struct sockcm_cookie *sockc)
69e3c75f 2449{
184f489e 2450 union tpacket_uhdr ph;
8d39b4a6 2451 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2452 struct socket *sock = po->sk.sk_socket;
2453 struct page *page;
69e3c75f
JB
2454 int err;
2455
2456 ph.raw = frame;
2457
2458 skb->protocol = proto;
2459 skb->dev = dev;
2460 skb->priority = po->sk.sk_priority;
2d37a186 2461 skb->mark = po->sk.sk_mark;
c14ac945 2462 sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2463 skb_shinfo(skb)->destructor_arg = ph.raw;
2464
ae641949 2465 skb_reserve(skb, hlen);
69e3c75f 2466 skb_reset_network_header(skb);
c1aad275 2467
69e3c75f
JB
2468 to_write = tp_len;
2469
2470 if (sock->type == SOCK_DGRAM) {
2471 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2472 NULL, tp_len);
2473 if (unlikely(err < 0))
2474 return -EINVAL;
1d036d25 2475 } else if (copylen) {
9ed988cd
WB
2476 int hdrlen = min_t(int, copylen, tp_len);
2477
69e3c75f 2478 skb_push(skb, dev->hard_header_len);
1d036d25 2479 skb_put(skb, copylen - dev->hard_header_len);
9ed988cd 2480 err = skb_store_bits(skb, 0, data, hdrlen);
69e3c75f
JB
2481 if (unlikely(err))
2482 return err;
9ed988cd
WB
2483 if (!dev_validate_header(dev, skb->data, hdrlen))
2484 return -EINVAL;
c72219b7
DB
2485 if (!skb->protocol)
2486 tpacket_set_protocol(dev, skb);
69e3c75f 2487
9ed988cd
WB
2488 data += hdrlen;
2489 to_write -= hdrlen;
69e3c75f
JB
2490 }
2491
69e3c75f
JB
2492 offset = offset_in_page(data);
2493 len_max = PAGE_SIZE - offset;
2494 len = ((to_write > len_max) ? len_max : to_write);
2495
2496 skb->data_len = to_write;
2497 skb->len += to_write;
2498 skb->truesize += to_write;
14afee4b 2499 refcount_add(to_write, &po->sk.sk_wmem_alloc);
69e3c75f
JB
2500
2501 while (likely(to_write)) {
2502 nr_frags = skb_shinfo(skb)->nr_frags;
2503
2504 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2505 pr_err("Packet exceed the number of skb frags(%lu)\n",
2506 MAX_SKB_FRAGS);
69e3c75f
JB
2507 return -EFAULT;
2508 }
2509
0af55bb5
CG
2510 page = pgv_to_page(data);
2511 data += len;
69e3c75f
JB
2512 flush_dcache_page(page);
2513 get_page(page);
0af55bb5 2514 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2515 to_write -= len;
2516 offset = 0;
2517 len_max = PAGE_SIZE;
2518 len = ((to_write > len_max) ? len_max : to_write);
2519 }
2520
8fd6c80d 2521 skb_probe_transport_header(skb, 0);
efdfa2f7 2522
69e3c75f
JB
2523 return tp_len;
2524}
2525
8d39b4a6
WB
2526static int tpacket_parse_header(struct packet_sock *po, void *frame,
2527 int size_max, void **data)
2528{
2529 union tpacket_uhdr ph;
2530 int tp_len, off;
2531
2532 ph.raw = frame;
2533
2534 switch (po->tp_version) {
7f953ab2
SV
2535 case TPACKET_V3:
2536 if (ph.h3->tp_next_offset != 0) {
2537 pr_warn_once("variable sized slot not supported");
2538 return -EINVAL;
2539 }
2540 tp_len = ph.h3->tp_len;
2541 break;
8d39b4a6
WB
2542 case TPACKET_V2:
2543 tp_len = ph.h2->tp_len;
2544 break;
2545 default:
2546 tp_len = ph.h1->tp_len;
2547 break;
2548 }
2549 if (unlikely(tp_len > size_max)) {
2550 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2551 return -EMSGSIZE;
2552 }
2553
2554 if (unlikely(po->tp_tx_has_off)) {
2555 int off_min, off_max;
2556
2557 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2558 off_max = po->tx_ring.frame_size - tp_len;
2559 if (po->sk.sk_type == SOCK_DGRAM) {
2560 switch (po->tp_version) {
7f953ab2
SV
2561 case TPACKET_V3:
2562 off = ph.h3->tp_net;
2563 break;
8d39b4a6
WB
2564 case TPACKET_V2:
2565 off = ph.h2->tp_net;
2566 break;
2567 default:
2568 off = ph.h1->tp_net;
2569 break;
2570 }
2571 } else {
2572 switch (po->tp_version) {
7f953ab2
SV
2573 case TPACKET_V3:
2574 off = ph.h3->tp_mac;
2575 break;
8d39b4a6
WB
2576 case TPACKET_V2:
2577 off = ph.h2->tp_mac;
2578 break;
2579 default:
2580 off = ph.h1->tp_mac;
2581 break;
2582 }
2583 }
2584 if (unlikely((off < off_min) || (off_max < off)))
2585 return -EINVAL;
2586 } else {
2587 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2588 }
2589
2590 *data = frame + off;
2591 return tp_len;
2592}
2593
69e3c75f
JB
2594static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2595{
69e3c75f
JB
2596 struct sk_buff *skb;
2597 struct net_device *dev;
1d036d25 2598 struct virtio_net_hdr *vnet_hdr = NULL;
c14ac945 2599 struct sockcm_cookie sockc;
69e3c75f 2600 __be16 proto;
09effa67 2601 int err, reserve = 0;
40d4e3df 2602 void *ph;
342dfc30 2603 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2604 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2605 int tp_len, size_max;
2606 unsigned char *addr;
8d39b4a6 2607 void *data;
69e3c75f 2608 int len_sum = 0;
9e67030a 2609 int status = TP_STATUS_AVAILABLE;
1d036d25 2610 int hlen, tlen, copylen = 0;
69e3c75f 2611
69e3c75f
JB
2612 mutex_lock(&po->pg_vec_lock);
2613
66e56cd4 2614 if (likely(saddr == NULL)) {
e40526cb 2615 dev = packet_cached_dev_get(po);
69e3c75f
JB
2616 proto = po->num;
2617 addr = NULL;
2618 } else {
2619 err = -EINVAL;
2620 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2621 goto out;
2622 if (msg->msg_namelen < (saddr->sll_halen
2623 + offsetof(struct sockaddr_ll,
2624 sll_addr)))
2625 goto out;
69e3c75f
JB
2626 proto = saddr->sll_protocol;
2627 addr = saddr->sll_addr;
827d9780 2628 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2629 }
2630
69e3c75f
JB
2631 err = -ENXIO;
2632 if (unlikely(dev == NULL))
2633 goto out;
69e3c75f
JB
2634 err = -ENETDOWN;
2635 if (unlikely(!(dev->flags & IFF_UP)))
2636 goto out_put;
2637
d19b183c
DCS
2638 sockc.tsflags = po->sk.sk_tsflags;
2639 if (msg->msg_controllen) {
2640 err = sock_cmsg_send(&po->sk, msg, &sockc);
2641 if (unlikely(err))
2642 goto out_put;
2643 }
2644
5cfb4c8d
DB
2645 if (po->sk.sk_socket->type == SOCK_RAW)
2646 reserve = dev->hard_header_len;
69e3c75f 2647 size_max = po->tx_ring.frame_size
b5dd884e 2648 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2649
1d036d25 2650 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
5cfb4c8d 2651 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2652
69e3c75f
JB
2653 do {
2654 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2655 TP_STATUS_SEND_REQUEST);
69e3c75f 2656 if (unlikely(ph == NULL)) {
87a2fd28
DB
2657 if (need_wait && need_resched())
2658 schedule();
69e3c75f
JB
2659 continue;
2660 }
2661
8d39b4a6
WB
2662 skb = NULL;
2663 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2664 if (tp_len < 0)
2665 goto tpacket_error;
2666
69e3c75f 2667 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2668 hlen = LL_RESERVED_SPACE(dev);
2669 tlen = dev->needed_tailroom;
1d036d25
WB
2670 if (po->has_vnet_hdr) {
2671 vnet_hdr = data;
2672 data += sizeof(*vnet_hdr);
2673 tp_len -= sizeof(*vnet_hdr);
2674 if (tp_len < 0 ||
2675 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2676 tp_len = -EINVAL;
2677 goto tpacket_error;
2678 }
2679 copylen = __virtio16_to_cpu(vio_le(),
2680 vnet_hdr->hdr_len);
2681 }
9ed988cd 2682 copylen = max_t(int, copylen, dev->hard_header_len);
69e3c75f 2683 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2684 hlen + tlen + sizeof(struct sockaddr_ll) +
2685 (copylen - dev->hard_header_len),
fbf33a28 2686 !need_wait, &err);
69e3c75f 2687
fbf33a28
KM
2688 if (unlikely(skb == NULL)) {
2689 /* we assume the socket was initially writeable ... */
2690 if (likely(len_sum > 0))
2691 err = len_sum;
69e3c75f 2692 goto out_status;
fbf33a28 2693 }
8d39b4a6 2694 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
c14ac945 2695 addr, hlen, copylen, &sockc);
dbd46ab4 2696 if (likely(tp_len >= 0) &&
5cfb4c8d 2697 tp_len > dev->mtu + reserve &&
1d036d25 2698 !po->has_vnet_hdr &&
3c70c132
DB
2699 !packet_extra_vlan_len_allowed(dev, skb))
2700 tp_len = -EMSGSIZE;
69e3c75f
JB
2701
2702 if (unlikely(tp_len < 0)) {
8d39b4a6 2703tpacket_error:
69e3c75f
JB
2704 if (po->tp_loss) {
2705 __packet_set_status(po, ph,
2706 TP_STATUS_AVAILABLE);
2707 packet_increment_head(&po->tx_ring);
2708 kfree_skb(skb);
2709 continue;
2710 } else {
2711 status = TP_STATUS_WRONG_FORMAT;
2712 err = tp_len;
2713 goto out_status;
2714 }
2715 }
2716
db60eb5f
JR
2717 if (po->has_vnet_hdr && virtio_net_hdr_to_skb(skb, vnet_hdr,
2718 vio_le())) {
1d036d25
WB
2719 tp_len = -EINVAL;
2720 goto tpacket_error;
2721 }
2722
69e3c75f
JB
2723 skb->destructor = tpacket_destruct_skb;
2724 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2725 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2726
2727 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2728 err = po->xmit(skb);
eb70df13
JP
2729 if (unlikely(err > 0)) {
2730 err = net_xmit_errno(err);
2731 if (err && __packet_get_status(po, ph) ==
2732 TP_STATUS_AVAILABLE) {
2733 /* skb was destructed already */
2734 skb = NULL;
2735 goto out_status;
2736 }
2737 /*
2738 * skb was dropped but not destructed yet;
2739 * let's treat it like congestion or err < 0
2740 */
2741 err = 0;
2742 }
69e3c75f
JB
2743 packet_increment_head(&po->tx_ring);
2744 len_sum += tp_len;
b0138408
DB
2745 } while (likely((ph != NULL) ||
2746 /* Note: packet_read_pending() might be slow if we have
2747 * to call it as it's a per_cpu variable, but in the fast path
2748 * we already short-circuit the loop with the first
2749 * condition, and luckily don't have to go that path
2750 * anyway.
2751 */
2752 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2753
2754 err = len_sum;
2755 goto out_put;
2756
69e3c75f
JB
2757out_status:
2758 __packet_set_status(po, ph, status);
2759 kfree_skb(skb);
2760out_put:
e40526cb 2761 dev_put(dev);
69e3c75f
JB
2762out:
2763 mutex_unlock(&po->pg_vec_lock);
2764 return err;
2765}
69e3c75f 2766
eea49cc9
OJ
2767static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2768 size_t reserve, size_t len,
2769 size_t linear, int noblock,
2770 int *err)
bfd5f4a3
SS
2771{
2772 struct sk_buff *skb;
2773
2774 /* Under a page? Don't bother with paged skb. */
2775 if (prepad + len < PAGE_SIZE || !linear)
2776 linear = len;
2777
2778 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2779 err, 0);
bfd5f4a3
SS
2780 if (!skb)
2781 return NULL;
2782
2783 skb_reserve(skb, reserve);
2784 skb_put(skb, linear);
2785 skb->data_len = len - linear;
2786 skb->len += len - linear;
2787
2788 return skb;
2789}
2790
d346a3fa 2791static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2792{
2793 struct sock *sk = sock->sk;
342dfc30 2794 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2795 struct sk_buff *skb;
2796 struct net_device *dev;
0e11c91e 2797 __be16 proto;
1da177e4 2798 unsigned char *addr;
827d9780 2799 int err, reserve = 0;
c7d39e32 2800 struct sockcm_cookie sockc;
bfd5f4a3
SS
2801 struct virtio_net_hdr vnet_hdr = { 0 };
2802 int offset = 0;
bfd5f4a3 2803 struct packet_sock *po = pkt_sk(sk);
da7c9561 2804 bool has_vnet_hdr = false;
57031eb7 2805 int hlen, tlen, linear;
3bdc0eba 2806 int extra_len = 0;
1da177e4
LT
2807
2808 /*
1ce4f28b 2809 * Get and verify the address.
1da177e4 2810 */
1ce4f28b 2811
66e56cd4 2812 if (likely(saddr == NULL)) {
e40526cb 2813 dev = packet_cached_dev_get(po);
1da177e4
LT
2814 proto = po->num;
2815 addr = NULL;
2816 } else {
2817 err = -EINVAL;
2818 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2819 goto out;
0fb375fb
EB
2820 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2821 goto out;
1da177e4
LT
2822 proto = saddr->sll_protocol;
2823 addr = saddr->sll_addr;
827d9780 2824 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2825 }
2826
1da177e4 2827 err = -ENXIO;
e40526cb 2828 if (unlikely(dev == NULL))
1da177e4 2829 goto out_unlock;
d5e76b0a 2830 err = -ENETDOWN;
e40526cb 2831 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2832 goto out_unlock;
2833
edbe7746 2834 sockc.tsflags = sk->sk_tsflags;
c7d39e32
EJ
2835 sockc.mark = sk->sk_mark;
2836 if (msg->msg_controllen) {
2837 err = sock_cmsg_send(sk, msg, &sockc);
2838 if (unlikely(err))
2839 goto out_unlock;
2840 }
2841
e40526cb
DB
2842 if (sock->type == SOCK_RAW)
2843 reserve = dev->hard_header_len;
bfd5f4a3 2844 if (po->has_vnet_hdr) {
16cc1400
WB
2845 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2846 if (err)
bfd5f4a3 2847 goto out_unlock;
da7c9561 2848 has_vnet_hdr = true;
bfd5f4a3
SS
2849 }
2850
3bdc0eba
BG
2851 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2852 if (!netif_supports_nofcs(dev)) {
2853 err = -EPROTONOSUPPORT;
2854 goto out_unlock;
2855 }
2856 extra_len = 4; /* We're doing our own CRC */
2857 }
2858
1da177e4 2859 err = -EMSGSIZE;
16cc1400
WB
2860 if (!vnet_hdr.gso_type &&
2861 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2862 goto out_unlock;
2863
bfd5f4a3 2864 err = -ENOBUFS;
ae641949
HX
2865 hlen = LL_RESERVED_SPACE(dev);
2866 tlen = dev->needed_tailroom;
57031eb7
WB
2867 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2868 linear = max(linear, min_t(int, len, dev->hard_header_len));
2869 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
bfd5f4a3 2870 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2871 if (skb == NULL)
1da177e4
LT
2872 goto out_unlock;
2873
b84bbaf7 2874 skb_reset_network_header(skb);
1da177e4 2875
0c4e8581 2876 err = -EINVAL;
9c707762
WB
2877 if (sock->type == SOCK_DGRAM) {
2878 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2879 if (unlikely(offset < 0))
9c707762 2880 goto out_free;
b84bbaf7 2881 } else if (reserve) {
9aad13b0 2882 skb_reserve(skb, -reserve);
9c707762 2883 }
1da177e4
LT
2884
2885 /* Returns -EFAULT on error */
c0371da6 2886 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2887 if (err)
2888 goto out_free;
bf84a010 2889
9ed988cd
WB
2890 if (sock->type == SOCK_RAW &&
2891 !dev_validate_header(dev, skb->data, len)) {
2892 err = -EINVAL;
2893 goto out_free;
2894 }
2895
c14ac945 2896 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
1da177e4 2897
16cc1400 2898 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
2899 !packet_extra_vlan_len_allowed(dev, skb)) {
2900 err = -EMSGSIZE;
2901 goto out_free;
57f89bfa
BG
2902 }
2903
09effa67
DM
2904 skb->protocol = proto;
2905 skb->dev = dev;
1da177e4 2906 skb->priority = sk->sk_priority;
c7d39e32 2907 skb->mark = sockc.mark;
0fd5d57b 2908
da7c9561 2909 if (has_vnet_hdr) {
db60eb5f 2910 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
16cc1400
WB
2911 if (err)
2912 goto out_free;
2913 len += sizeof(vnet_hdr);
bfd5f4a3
SS
2914 }
2915
8fd6c80d
DB
2916 skb_probe_transport_header(skb, reserve);
2917
3bdc0eba
BG
2918 if (unlikely(extra_len == 4))
2919 skb->no_fcs = 1;
2920
d346a3fa 2921 err = po->xmit(skb);
1da177e4
LT
2922 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2923 goto out_unlock;
2924
e40526cb 2925 dev_put(dev);
1da177e4 2926
40d4e3df 2927 return len;
1da177e4
LT
2928
2929out_free:
2930 kfree_skb(skb);
2931out_unlock:
e40526cb 2932 if (dev)
1da177e4
LT
2933 dev_put(dev);
2934out:
2935 return err;
2936}
2937
1b784140 2938static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2939{
69e3c75f
JB
2940 struct sock *sk = sock->sk;
2941 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2942
69e3c75f
JB
2943 if (po->tx_ring.pg_vec)
2944 return tpacket_snd(po, msg);
2945 else
69e3c75f
JB
2946 return packet_snd(sock, msg, len);
2947}
2948
1da177e4
LT
2949/*
2950 * Close a PACKET socket. This is fairly simple. We immediately go
2951 * to 'closed' state and remove our protocol entry in the device list.
2952 */
2953
2954static int packet_release(struct socket *sock)
2955{
2956 struct sock *sk = sock->sk;
2957 struct packet_sock *po;
2bd624b4 2958 struct packet_fanout *f;
d12d01d6 2959 struct net *net;
f6fb8f10 2960 union tpacket_req_u req_u;
1da177e4
LT
2961
2962 if (!sk)
2963 return 0;
2964
3b1e0a65 2965 net = sock_net(sk);
1da177e4
LT
2966 po = pkt_sk(sk);
2967
0fa7fa98 2968 mutex_lock(&net->packet.sklist_lock);
808f5114 2969 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2970 mutex_unlock(&net->packet.sklist_lock);
2971
2972 preempt_disable();
920de804 2973 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2974 preempt_enable();
1da177e4 2975
808f5114 2976 spin_lock(&po->bind_lock);
ce06b03e 2977 unregister_prot_hook(sk, false);
66e56cd4
DB
2978 packet_cached_dev_reset(po);
2979
160ff18a
BG
2980 if (po->prot_hook.dev) {
2981 dev_put(po->prot_hook.dev);
2982 po->prot_hook.dev = NULL;
2983 }
808f5114 2984 spin_unlock(&po->bind_lock);
1da177e4 2985
1da177e4 2986 packet_flush_mclist(sk);
1da177e4 2987
5171b37d 2988 lock_sock(sk);
9665d5d6
PS
2989 if (po->rx_ring.pg_vec) {
2990 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2991 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2992 }
69e3c75f 2993
9665d5d6
PS
2994 if (po->tx_ring.pg_vec) {
2995 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2996 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2997 }
5171b37d 2998 release_sock(sk);
1da177e4 2999
2bd624b4 3000 f = fanout_release(sk);
dc99f600 3001
808f5114 3002 synchronize_net();
2bd624b4
AS
3003
3004 if (f) {
57f015f5 3005 kfree(po->rollover);
2bd624b4
AS
3006 fanout_release_data(f);
3007 kfree(f);
3008 }
1da177e4
LT
3009 /*
3010 * Now the socket is dead. No more input will appear.
3011 */
1da177e4
LT
3012 sock_orphan(sk);
3013 sock->sk = NULL;
3014
3015 /* Purge queues */
3016
3017 skb_queue_purge(&sk->sk_receive_queue);
b0138408 3018 packet_free_pending(po);
17ab56a2 3019 sk_refcnt_debug_release(sk);
1da177e4
LT
3020
3021 sock_put(sk);
3022 return 0;
3023}
3024
3025/*
3026 * Attach a packet hook.
3027 */
3028
30f7ea1c
FR
3029static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3030 __be16 proto)
1da177e4
LT
3031{
3032 struct packet_sock *po = pkt_sk(sk);
158cd4af 3033 struct net_device *dev_curr;
902fefb8
DB
3034 __be16 proto_curr;
3035 bool need_rehook;
30f7ea1c
FR
3036 struct net_device *dev = NULL;
3037 int ret = 0;
3038 bool unlisted = false;
dc99f600 3039
1da177e4 3040 lock_sock(sk);
1da177e4 3041 spin_lock(&po->bind_lock);
30f7ea1c
FR
3042 rcu_read_lock();
3043
4971613c
WB
3044 if (po->fanout) {
3045 ret = -EINVAL;
3046 goto out_unlock;
3047 }
3048
30f7ea1c
FR
3049 if (name) {
3050 dev = dev_get_by_name_rcu(sock_net(sk), name);
3051 if (!dev) {
3052 ret = -ENODEV;
3053 goto out_unlock;
3054 }
3055 } else if (ifindex) {
3056 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3057 if (!dev) {
3058 ret = -ENODEV;
3059 goto out_unlock;
3060 }
3061 }
3062
3063 if (dev)
3064 dev_hold(dev);
66e56cd4 3065
902fefb8
DB
3066 proto_curr = po->prot_hook.type;
3067 dev_curr = po->prot_hook.dev;
3068
3069 need_rehook = proto_curr != proto || dev_curr != dev;
3070
3071 if (need_rehook) {
30f7ea1c
FR
3072 if (po->running) {
3073 rcu_read_unlock();
15fe076e
ED
3074 /* prevents packet_notifier() from calling
3075 * register_prot_hook()
3076 */
3077 po->num = 0;
30f7ea1c
FR
3078 __unregister_prot_hook(sk, true);
3079 rcu_read_lock();
3080 dev_curr = po->prot_hook.dev;
3081 if (dev)
3082 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3083 dev->ifindex);
3084 }
1da177e4 3085
15fe076e 3086 BUG_ON(po->running);
902fefb8
DB
3087 po->num = proto;
3088 po->prot_hook.type = proto;
902fefb8 3089
30f7ea1c
FR
3090 if (unlikely(unlisted)) {
3091 dev_put(dev);
3092 po->prot_hook.dev = NULL;
3093 po->ifindex = -1;
3094 packet_cached_dev_reset(po);
3095 } else {
3096 po->prot_hook.dev = dev;
3097 po->ifindex = dev ? dev->ifindex : 0;
3098 packet_cached_dev_assign(po, dev);
3099 }
902fefb8 3100 }
158cd4af
LW
3101 if (dev_curr)
3102 dev_put(dev_curr);
66e56cd4 3103
902fefb8 3104 if (proto == 0 || !need_rehook)
1da177e4
LT
3105 goto out_unlock;
3106
30f7ea1c 3107 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3108 register_prot_hook(sk);
be85d4ad
UT
3109 } else {
3110 sk->sk_err = ENETDOWN;
3111 if (!sock_flag(sk, SOCK_DEAD))
3112 sk->sk_error_report(sk);
1da177e4
LT
3113 }
3114
3115out_unlock:
30f7ea1c 3116 rcu_read_unlock();
1da177e4
LT
3117 spin_unlock(&po->bind_lock);
3118 release_sock(sk);
30f7ea1c 3119 return ret;
1da177e4
LT
3120}
3121
3122/*
3123 * Bind a packet socket to a device
3124 */
3125
40d4e3df
ED
3126static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3127 int addr_len)
1da177e4 3128{
40d4e3df 3129 struct sock *sk = sock->sk;
540e2894 3130 char name[sizeof(uaddr->sa_data) + 1];
1ce4f28b 3131
1da177e4
LT
3132 /*
3133 * Check legality
3134 */
1ce4f28b 3135
8ae55f04 3136 if (addr_len != sizeof(struct sockaddr))
1da177e4 3137 return -EINVAL;
540e2894
AP
3138 /* uaddr->sa_data comes from the userspace, it's not guaranteed to be
3139 * zero-terminated.
3140 */
3141 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3142 name[sizeof(uaddr->sa_data)] = 0;
1da177e4 3143
30f7ea1c 3144 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3145}
1da177e4
LT
3146
3147static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3148{
40d4e3df
ED
3149 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3150 struct sock *sk = sock->sk;
1da177e4
LT
3151
3152 /*
3153 * Check legality
3154 */
1ce4f28b 3155
1da177e4
LT
3156 if (addr_len < sizeof(struct sockaddr_ll))
3157 return -EINVAL;
3158 if (sll->sll_family != AF_PACKET)
3159 return -EINVAL;
3160
30f7ea1c
FR
3161 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3162 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3163}
3164
3165static struct proto packet_proto = {
3166 .name = "PACKET",
3167 .owner = THIS_MODULE,
3168 .obj_size = sizeof(struct packet_sock),
3169};
3170
3171/*
1ce4f28b 3172 * Create a packet of type SOCK_PACKET.
1da177e4
LT
3173 */
3174
3f378b68
EP
3175static int packet_create(struct net *net, struct socket *sock, int protocol,
3176 int kern)
1da177e4
LT
3177{
3178 struct sock *sk;
3179 struct packet_sock *po;
0e11c91e 3180 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3181 int err;
3182
df008c91 3183 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3184 return -EPERM;
be02097c
DM
3185 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3186 sock->type != SOCK_PACKET)
1da177e4
LT
3187 return -ESOCKTNOSUPPORT;
3188
3189 sock->state = SS_UNCONNECTED;
3190
3191 err = -ENOBUFS;
11aa9c28 3192 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3193 if (sk == NULL)
3194 goto out;
3195
3196 sock->ops = &packet_ops;
1da177e4
LT
3197 if (sock->type == SOCK_PACKET)
3198 sock->ops = &packet_ops_spkt;
be02097c 3199
1da177e4
LT
3200 sock_init_data(sock, sk);
3201
3202 po = pkt_sk(sk);
3203 sk->sk_family = PF_PACKET;
0e11c91e 3204 po->num = proto;
d346a3fa 3205 po->xmit = dev_queue_xmit;
66e56cd4 3206
b0138408
DB
3207 err = packet_alloc_pending(po);
3208 if (err)
3209 goto out2;
3210
66e56cd4 3211 packet_cached_dev_reset(po);
1da177e4
LT
3212
3213 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3214 sk_refcnt_debug_inc(sk);
1da177e4
LT
3215
3216 /*
3217 * Attach a protocol block
3218 */
3219
3220 spin_lock_init(&po->bind_lock);
905db440 3221 mutex_init(&po->pg_vec_lock);
0648ab70 3222 po->rollover = NULL;
1da177e4 3223 po->prot_hook.func = packet_rcv;
be02097c 3224
1da177e4
LT
3225 if (sock->type == SOCK_PACKET)
3226 po->prot_hook.func = packet_rcv_spkt;
be02097c 3227
1da177e4
LT
3228 po->prot_hook.af_packet_priv = sk;
3229
0e11c91e
AV
3230 if (proto) {
3231 po->prot_hook.type = proto;
a6361f0c 3232 __register_prot_hook(sk);
1da177e4
LT
3233 }
3234
0fa7fa98 3235 mutex_lock(&net->packet.sklist_lock);
808f5114 3236 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3237 mutex_unlock(&net->packet.sklist_lock);
3238
3239 preempt_disable();
3680453c 3240 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3241 preempt_enable();
808f5114 3242
40d4e3df 3243 return 0;
b0138408
DB
3244out2:
3245 sk_free(sk);
1da177e4
LT
3246out:
3247 return err;
3248}
3249
3250/*
3251 * Pull a packet from our receive queue and hand it to the user.
3252 * If necessary we block.
3253 */
3254
1b784140
YX
3255static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3256 int flags)
1da177e4
LT
3257{
3258 struct sock *sk = sock->sk;
3259 struct sk_buff *skb;
3260 int copied, err;
bfd5f4a3 3261 int vnet_hdr_len = 0;
2472d761 3262 unsigned int origlen = 0;
1da177e4
LT
3263
3264 err = -EINVAL;
ed85b565 3265 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3266 goto out;
3267
3268#if 0
3269 /* What error should we return now? EUNATTACH? */
3270 if (pkt_sk(sk)->ifindex < 0)
3271 return -ENODEV;
3272#endif
3273
ed85b565 3274 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3275 err = sock_recv_errqueue(sk, msg, len,
3276 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3277 goto out;
3278 }
3279
1da177e4
LT
3280 /*
3281 * Call the generic datagram receiver. This handles all sorts
3282 * of horrible races and re-entrancy so we can forget about it
3283 * in the protocol layers.
3284 *
3285 * Now it will return ENETDOWN, if the device has just gone down,
3286 * but then it will block.
3287 */
3288
40d4e3df 3289 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3290
3291 /*
1ce4f28b 3292 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
3293 * handles the blocking we don't see and worry about blocking
3294 * retries.
3295 */
3296
8ae55f04 3297 if (skb == NULL)
1da177e4
LT
3298 goto out;
3299
2ccdbaa6
WB
3300 if (pkt_sk(sk)->pressure)
3301 packet_rcv_has_room(pkt_sk(sk), NULL);
3302
bfd5f4a3 3303 if (pkt_sk(sk)->has_vnet_hdr) {
16cc1400
WB
3304 err = packet_rcv_vnet(msg, skb, &len);
3305 if (err)
bfd5f4a3 3306 goto out_free;
16cc1400 3307 vnet_hdr_len = sizeof(struct virtio_net_hdr);
bfd5f4a3
SS
3308 }
3309
f3d33426
HFS
3310 /* You lose any data beyond the buffer you gave. If this worries
3311 * a user program, it can ask the device for its MTU
3312 * anyway.
1da177e4 3313 */
1da177e4 3314 copied = skb->len;
40d4e3df
ED
3315 if (copied > len) {
3316 copied = len;
3317 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3318 }
3319
51f3d02b 3320 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3321 if (err)
3322 goto out_free;
3323
2472d761
EB
3324 if (sock->type != SOCK_PACKET) {
3325 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3326
3327 /* Original length was stored in sockaddr_ll fields */
3328 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3329 sll->sll_family = AF_PACKET;
3330 sll->sll_protocol = skb->protocol;
3331 }
3332
3b885787 3333 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3334
f3d33426
HFS
3335 if (msg->msg_name) {
3336 /* If the address length field is there to be filled
3337 * in, we fill it in now.
3338 */
3339 if (sock->type == SOCK_PACKET) {
342dfc30 3340 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3341 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3342 } else {
3343 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3344
f3d33426
HFS
3345 msg->msg_namelen = sll->sll_halen +
3346 offsetof(struct sockaddr_ll, sll_addr);
3347 }
ffbc6111
HX
3348 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3349 msg->msg_namelen);
f3d33426 3350 }
1da177e4 3351
8dc41944 3352 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3353 struct tpacket_auxdata aux;
3354
3355 aux.tp_status = TP_STATUS_USER;
3356 if (skb->ip_summed == CHECKSUM_PARTIAL)
3357 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3358 else if (skb->pkt_type != PACKET_OUTGOING &&
3359 (skb->ip_summed == CHECKSUM_COMPLETE ||
3360 skb_csum_unnecessary(skb)))
3361 aux.tp_status |= TP_STATUS_CSUM_VALID;
3362
2472d761 3363 aux.tp_len = origlen;
ffbc6111
HX
3364 aux.tp_snaplen = skb->len;
3365 aux.tp_mac = 0;
bbe735e4 3366 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3367 if (skb_vlan_tag_present(skb)) {
3368 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3369 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3370 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3371 } else {
3372 aux.tp_vlan_tci = 0;
a0cdfcf3 3373 aux.tp_vlan_tpid = 0;
a3bcc23e 3374 }
ffbc6111 3375 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3376 }
3377
1da177e4
LT
3378 /*
3379 * Free or return the buffer as appropriate. Again this
3380 * hides all the races and re-entrancy issues from us.
3381 */
bfd5f4a3 3382 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3383
3384out_free:
3385 skb_free_datagram(sk, skb);
3386out:
3387 return err;
3388}
3389
1da177e4 3390static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3391 int peer)
1da177e4
LT
3392{
3393 struct net_device *dev;
3394 struct sock *sk = sock->sk;
3395
3396 if (peer)
3397 return -EOPNOTSUPP;
3398
3399 uaddr->sa_family = AF_PACKET;
2dc85bf3 3400 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3401 rcu_read_lock();
3402 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3403 if (dev)
2dc85bf3 3404 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3405 rcu_read_unlock();
1da177e4 3406
9b2c45d4 3407 return sizeof(*uaddr);
1da177e4 3408}
1da177e4
LT
3409
3410static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3411 int peer)
1da177e4
LT
3412{
3413 struct net_device *dev;
3414 struct sock *sk = sock->sk;
3415 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3416 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3417
3418 if (peer)
3419 return -EOPNOTSUPP;
3420
3421 sll->sll_family = AF_PACKET;
3422 sll->sll_ifindex = po->ifindex;
3423 sll->sll_protocol = po->num;
67286640 3424 sll->sll_pkttype = 0;
654d1f8a
ED
3425 rcu_read_lock();
3426 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3427 if (dev) {
3428 sll->sll_hatype = dev->type;
3429 sll->sll_halen = dev->addr_len;
3430 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3431 } else {
3432 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3433 sll->sll_halen = 0;
3434 }
654d1f8a 3435 rcu_read_unlock();
1da177e4 3436
9b2c45d4 3437 return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3438}
3439
2aeb0b88
WC
3440static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3441 int what)
1da177e4
LT
3442{
3443 switch (i->type) {
3444 case PACKET_MR_MULTICAST:
1162563f
JP
3445 if (i->alen != dev->addr_len)
3446 return -EINVAL;
1da177e4 3447 if (what > 0)
22bedad3 3448 return dev_mc_add(dev, i->addr);
1da177e4 3449 else
22bedad3 3450 return dev_mc_del(dev, i->addr);
1da177e4
LT
3451 break;
3452 case PACKET_MR_PROMISC:
2aeb0b88 3453 return dev_set_promiscuity(dev, what);
1da177e4 3454 case PACKET_MR_ALLMULTI:
2aeb0b88 3455 return dev_set_allmulti(dev, what);
d95ed927 3456 case PACKET_MR_UNICAST:
1162563f
JP
3457 if (i->alen != dev->addr_len)
3458 return -EINVAL;
d95ed927 3459 if (what > 0)
a748ee24 3460 return dev_uc_add(dev, i->addr);
d95ed927 3461 else
a748ee24 3462 return dev_uc_del(dev, i->addr);
d95ed927 3463 break;
40d4e3df
ED
3464 default:
3465 break;
1da177e4 3466 }
2aeb0b88 3467 return 0;
1da177e4
LT
3468}
3469
82f17091
FR
3470static void packet_dev_mclist_delete(struct net_device *dev,
3471 struct packet_mclist **mlp)
1da177e4 3472{
82f17091
FR
3473 struct packet_mclist *ml;
3474
3475 while ((ml = *mlp) != NULL) {
3476 if (ml->ifindex == dev->ifindex) {
3477 packet_dev_mc(dev, ml, -1);
3478 *mlp = ml->next;
3479 kfree(ml);
3480 } else
3481 mlp = &ml->next;
1da177e4
LT
3482 }
3483}
3484
0fb375fb 3485static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3486{
3487 struct packet_sock *po = pkt_sk(sk);
3488 struct packet_mclist *ml, *i;
3489 struct net_device *dev;
3490 int err;
3491
3492 rtnl_lock();
3493
3494 err = -ENODEV;
3b1e0a65 3495 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3496 if (!dev)
3497 goto done;
3498
3499 err = -EINVAL;
1162563f 3500 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3501 goto done;
3502
3503 err = -ENOBUFS;
8b3a7005 3504 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3505 if (i == NULL)
3506 goto done;
3507
3508 err = 0;
3509 for (ml = po->mclist; ml; ml = ml->next) {
3510 if (ml->ifindex == mreq->mr_ifindex &&
3511 ml->type == mreq->mr_type &&
3512 ml->alen == mreq->mr_alen &&
3513 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3514 ml->count++;
3515 /* Free the new element ... */
3516 kfree(i);
3517 goto done;
3518 }
3519 }
3520
3521 i->type = mreq->mr_type;
3522 i->ifindex = mreq->mr_ifindex;
3523 i->alen = mreq->mr_alen;
3524 memcpy(i->addr, mreq->mr_address, i->alen);
309cf37f 3525 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3526 i->count = 1;
3527 i->next = po->mclist;
3528 po->mclist = i;
2aeb0b88
WC
3529 err = packet_dev_mc(dev, i, 1);
3530 if (err) {
3531 po->mclist = i->next;
3532 kfree(i);
3533 }
1da177e4
LT
3534
3535done:
3536 rtnl_unlock();
3537 return err;
3538}
3539
0fb375fb 3540static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3541{
3542 struct packet_mclist *ml, **mlp;
3543
3544 rtnl_lock();
3545
3546 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3547 if (ml->ifindex == mreq->mr_ifindex &&
3548 ml->type == mreq->mr_type &&
3549 ml->alen == mreq->mr_alen &&
3550 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3551 if (--ml->count == 0) {
3552 struct net_device *dev;
3553 *mlp = ml->next;
ad959e76
ED
3554 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3555 if (dev)
1da177e4 3556 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3557 kfree(ml);
3558 }
82f17091 3559 break;
1da177e4
LT
3560 }
3561 }
3562 rtnl_unlock();
82f17091 3563 return 0;
1da177e4
LT
3564}
3565
3566static void packet_flush_mclist(struct sock *sk)
3567{
3568 struct packet_sock *po = pkt_sk(sk);
3569 struct packet_mclist *ml;
3570
3571 if (!po->mclist)
3572 return;
3573
3574 rtnl_lock();
3575 while ((ml = po->mclist) != NULL) {
3576 struct net_device *dev;
3577
3578 po->mclist = ml->next;
ad959e76
ED
3579 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3580 if (dev != NULL)
1da177e4 3581 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3582 kfree(ml);
3583 }
3584 rtnl_unlock();
3585}
1da177e4
LT
3586
3587static int
b7058842 3588packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3589{
3590 struct sock *sk = sock->sk;
8dc41944 3591 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3592 int ret;
3593
3594 if (level != SOL_PACKET)
3595 return -ENOPROTOOPT;
3596
69e3c75f 3597 switch (optname) {
1ce4f28b 3598 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3599 case PACKET_DROP_MEMBERSHIP:
3600 {
0fb375fb
EB
3601 struct packet_mreq_max mreq;
3602 int len = optlen;
3603 memset(&mreq, 0, sizeof(mreq));
3604 if (len < sizeof(struct packet_mreq))
1da177e4 3605 return -EINVAL;
0fb375fb
EB
3606 if (len > sizeof(mreq))
3607 len = sizeof(mreq);
40d4e3df 3608 if (copy_from_user(&mreq, optval, len))
1da177e4 3609 return -EFAULT;
0fb375fb
EB
3610 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3611 return -EINVAL;
1da177e4
LT
3612 if (optname == PACKET_ADD_MEMBERSHIP)
3613 ret = packet_mc_add(sk, &mreq);
3614 else
3615 ret = packet_mc_drop(sk, &mreq);
3616 return ret;
3617 }
a2efcfa0 3618
1da177e4 3619 case PACKET_RX_RING:
69e3c75f 3620 case PACKET_TX_RING:
1da177e4 3621 {
f6fb8f10 3622 union tpacket_req_u req_u;
3623 int len;
1da177e4 3624
5171b37d 3625 lock_sock(sk);
f6fb8f10 3626 switch (po->tp_version) {
3627 case TPACKET_V1:
3628 case TPACKET_V2:
3629 len = sizeof(req_u.req);
3630 break;
3631 case TPACKET_V3:
3632 default:
3633 len = sizeof(req_u.req3);
3634 break;
3635 }
5171b37d
ED
3636 if (optlen < len) {
3637 ret = -EINVAL;
3638 } else {
3639 if (copy_from_user(&req_u.req, optval, len))
3640 ret = -EFAULT;
3641 else
3642 ret = packet_set_ring(sk, &req_u, 0,
3643 optname == PACKET_TX_RING);
3644 }
3645 release_sock(sk);
3646 return ret;
1da177e4
LT
3647 }
3648 case PACKET_COPY_THRESH:
3649 {
3650 int val;
3651
40d4e3df 3652 if (optlen != sizeof(val))
1da177e4 3653 return -EINVAL;
40d4e3df 3654 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3655 return -EFAULT;
3656
3657 pkt_sk(sk)->copy_thresh = val;
3658 return 0;
3659 }
bbd6ef87
PM
3660 case PACKET_VERSION:
3661 {
3662 int val;
3663
3664 if (optlen != sizeof(val))
3665 return -EINVAL;
bbd6ef87
PM
3666 if (copy_from_user(&val, optval, sizeof(val)))
3667 return -EFAULT;
3668 switch (val) {
3669 case TPACKET_V1:
3670 case TPACKET_V2:
f6fb8f10 3671 case TPACKET_V3:
84ac7260 3672 break;
bbd6ef87
PM
3673 default:
3674 return -EINVAL;
3675 }
84ac7260
PP
3676 lock_sock(sk);
3677 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3678 ret = -EBUSY;
3679 } else {
3680 po->tp_version = val;
3681 ret = 0;
3682 }
3683 release_sock(sk);
3684 return ret;
bbd6ef87 3685 }
8913336a
PM
3686 case PACKET_RESERVE:
3687 {
3688 unsigned int val;
3689
3690 if (optlen != sizeof(val))
3691 return -EINVAL;
8913336a
PM
3692 if (copy_from_user(&val, optval, sizeof(val)))
3693 return -EFAULT;
bcc5364b
AK
3694 if (val > INT_MAX)
3695 return -EINVAL;
c27927e3
WB
3696 lock_sock(sk);
3697 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3698 ret = -EBUSY;
3699 } else {
3700 po->tp_reserve = val;
3701 ret = 0;
3702 }
3703 release_sock(sk);
3704 return ret;
8913336a 3705 }
69e3c75f
JB
3706 case PACKET_LOSS:
3707 {
3708 unsigned int val;
3709
3710 if (optlen != sizeof(val))
3711 return -EINVAL;
69e3c75f
JB
3712 if (copy_from_user(&val, optval, sizeof(val)))
3713 return -EFAULT;
a6361f0c
WB
3714
3715 lock_sock(sk);
3716 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3717 ret = -EBUSY;
3718 } else {
3719 po->tp_loss = !!val;
3720 ret = 0;
3721 }
3722 release_sock(sk);
3723 return ret;
69e3c75f 3724 }
8dc41944
HX
3725 case PACKET_AUXDATA:
3726 {
3727 int val;
3728
3729 if (optlen < sizeof(val))
3730 return -EINVAL;
3731 if (copy_from_user(&val, optval, sizeof(val)))
3732 return -EFAULT;
3733
a6361f0c 3734 lock_sock(sk);
8dc41944 3735 po->auxdata = !!val;
a6361f0c 3736 release_sock(sk);
8dc41944
HX
3737 return 0;
3738 }
80feaacb
PWJ
3739 case PACKET_ORIGDEV:
3740 {
3741 int val;
3742
3743 if (optlen < sizeof(val))
3744 return -EINVAL;
3745 if (copy_from_user(&val, optval, sizeof(val)))
3746 return -EFAULT;
3747
a6361f0c 3748 lock_sock(sk);
80feaacb 3749 po->origdev = !!val;
a6361f0c 3750 release_sock(sk);
80feaacb
PWJ
3751 return 0;
3752 }
bfd5f4a3
SS
3753 case PACKET_VNET_HDR:
3754 {
3755 int val;
3756
3757 if (sock->type != SOCK_RAW)
3758 return -EINVAL;
bfd5f4a3
SS
3759 if (optlen < sizeof(val))
3760 return -EINVAL;
3761 if (copy_from_user(&val, optval, sizeof(val)))
3762 return -EFAULT;
3763
a6361f0c
WB
3764 lock_sock(sk);
3765 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3766 ret = -EBUSY;
3767 } else {
3768 po->has_vnet_hdr = !!val;
3769 ret = 0;
3770 }
3771 release_sock(sk);
3772 return ret;
bfd5f4a3 3773 }
614f60fa
SM
3774 case PACKET_TIMESTAMP:
3775 {
3776 int val;
3777
3778 if (optlen != sizeof(val))
3779 return -EINVAL;
3780 if (copy_from_user(&val, optval, sizeof(val)))
3781 return -EFAULT;
3782
3783 po->tp_tstamp = val;
3784 return 0;
3785 }
dc99f600
DM
3786 case PACKET_FANOUT:
3787 {
3788 int val;
3789
3790 if (optlen != sizeof(val))
3791 return -EINVAL;
3792 if (copy_from_user(&val, optval, sizeof(val)))
3793 return -EFAULT;
3794
3795 return fanout_add(sk, val & 0xffff, val >> 16);
3796 }
47dceb8e
WB
3797 case PACKET_FANOUT_DATA:
3798 {
3799 if (!po->fanout)
3800 return -EINVAL;
3801
3802 return fanout_set_data(po, optval, optlen);
3803 }
5920cd3a
PC
3804 case PACKET_TX_HAS_OFF:
3805 {
3806 unsigned int val;
3807
3808 if (optlen != sizeof(val))
3809 return -EINVAL;
5920cd3a
PC
3810 if (copy_from_user(&val, optval, sizeof(val)))
3811 return -EFAULT;
a6361f0c
WB
3812
3813 lock_sock(sk);
3814 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3815 ret = -EBUSY;
3816 } else {
3817 po->tp_tx_has_off = !!val;
3818 ret = 0;
3819 }
3820 release_sock(sk);
5920cd3a
PC
3821 return ret;
3822 }
d346a3fa
DB
3823 case PACKET_QDISC_BYPASS:
3824 {
3825 int val;
3826
3827 if (optlen != sizeof(val))
3828 return -EINVAL;
3829 if (copy_from_user(&val, optval, sizeof(val)))
3830 return -EFAULT;
3831
3832 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3833 return 0;
3834 }
1da177e4
LT
3835 default:
3836 return -ENOPROTOOPT;
3837 }
3838}
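As the PACKET_VERSION and PACKET_RX_RING cases above imply, user space has to pick the TPACKET version before creating a ring: the version switch returns -EBUSY once pg_vec exists, and packet_set_ring() sizes the frame headers from po->tp_version. A minimal sketch, assuming TPACKET_V2 and arbitrary example geometry:

#include <linux/if_packet.h>
#include <sys/socket.h>

/* Hedged sketch: select TPACKET_V2, then create an RX ring.  The block and
 * frame sizes are arbitrary example values, not recommendations. */
static int setup_rx_ring(int fd)
{
	int version = TPACKET_V2;
	struct tpacket_req req = {
		.tp_block_size = 1 << 16,	/* page-aligned */
		.tp_block_nr   = 64,
		.tp_frame_size = 1 << 11,	/* multiple of TPACKET_ALIGNMENT */
		.tp_frame_nr   = ((1 << 16) / (1 << 11)) * 64,
	};

	/* Must happen first: PACKET_VERSION is refused once a ring exists. */
	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
		       &version, sizeof(version)) < 0)
		return -1;

	return setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
}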
3839
3840static int packet_getsockopt(struct socket *sock, int level, int optname,
3841 char __user *optval, int __user *optlen)
3842{
3843 int len;
c06fff6e 3844 int val, lv = sizeof(val);
1da177e4
LT
3845 struct sock *sk = sock->sk;
3846 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3847 void *data = &val;
ee80fbf3 3848 union tpacket_stats_u st;
a9b63918 3849 struct tpacket_rollover_stats rstats;
1da177e4
LT
3850
3851 if (level != SOL_PACKET)
3852 return -ENOPROTOOPT;
3853
8ae55f04
KK
3854 if (get_user(len, optlen))
3855 return -EFAULT;
1da177e4
LT
3856
3857 if (len < 0)
3858 return -EINVAL;
1ce4f28b 3859
69e3c75f 3860 switch (optname) {
1da177e4 3861 case PACKET_STATISTICS:
1da177e4 3862 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3863 memcpy(&st, &po->stats, sizeof(st));
3864 memset(&po->stats, 0, sizeof(po->stats));
3865 spin_unlock_bh(&sk->sk_receive_queue.lock);
3866
f6fb8f10 3867 if (po->tp_version == TPACKET_V3) {
c06fff6e 3868 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3869 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3870 data = &st.stats3;
f6fb8f10 3871 } else {
c06fff6e 3872 lv = sizeof(struct tpacket_stats);
8bcdeaff 3873 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3874 data = &st.stats1;
f6fb8f10 3875 }
ee80fbf3 3876
8dc41944
HX
3877 break;
3878 case PACKET_AUXDATA:
8dc41944 3879 val = po->auxdata;
80feaacb
PWJ
3880 break;
3881 case PACKET_ORIGDEV:
80feaacb 3882 val = po->origdev;
bfd5f4a3
SS
3883 break;
3884 case PACKET_VNET_HDR:
bfd5f4a3 3885 val = po->has_vnet_hdr;
1da177e4 3886 break;
bbd6ef87 3887 case PACKET_VERSION:
bbd6ef87 3888 val = po->tp_version;
bbd6ef87
PM
3889 break;
3890 case PACKET_HDRLEN:
3891 if (len > sizeof(int))
3892 len = sizeof(int);
fd2c83b3
AP
3893 if (len < sizeof(int))
3894 return -EINVAL;
bbd6ef87
PM
3895 if (copy_from_user(&val, optval, len))
3896 return -EFAULT;
3897 switch (val) {
3898 case TPACKET_V1:
3899 val = sizeof(struct tpacket_hdr);
3900 break;
3901 case TPACKET_V2:
3902 val = sizeof(struct tpacket2_hdr);
3903 break;
f6fb8f10 3904 case TPACKET_V3:
3905 val = sizeof(struct tpacket3_hdr);
3906 break;
bbd6ef87
PM
3907 default:
3908 return -EINVAL;
3909 }
bbd6ef87 3910 break;
8913336a 3911 case PACKET_RESERVE:
8913336a 3912 val = po->tp_reserve;
8913336a 3913 break;
69e3c75f 3914 case PACKET_LOSS:
69e3c75f 3915 val = po->tp_loss;
69e3c75f 3916 break;
614f60fa 3917 case PACKET_TIMESTAMP:
614f60fa 3918 val = po->tp_tstamp;
614f60fa 3919 break;
dc99f600 3920 case PACKET_FANOUT:
dc99f600
DM
3921 val = (po->fanout ?
3922 ((u32)po->fanout->id |
77f65ebd
WB
3923 ((u32)po->fanout->type << 16) |
3924 ((u32)po->fanout->flags << 24)) :
dc99f600 3925 0);
dc99f600 3926 break;
a9b63918 3927 case PACKET_ROLLOVER_STATS:
57f015f5 3928 if (!po->rollover)
a9b63918 3929 return -EINVAL;
57f015f5
MM
3930 rstats.tp_all = atomic_long_read(&po->rollover->num);
3931 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3932 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3933 data = &rstats;
3934 lv = sizeof(rstats);
a9b63918 3935 break;
5920cd3a
PC
3936 case PACKET_TX_HAS_OFF:
3937 val = po->tp_tx_has_off;
3938 break;
d346a3fa
DB
3939 case PACKET_QDISC_BYPASS:
3940 val = packet_use_direct_xmit(po);
3941 break;
1da177e4
LT
3942 default:
3943 return -ENOPROTOOPT;
3944 }
3945
c06fff6e
ED
3946 if (len > lv)
3947 len = lv;
8ae55f04
KK
3948 if (put_user(len, optlen))
3949 return -EFAULT;
8dc41944
HX
3950 if (copy_to_user(optval, data, len))
3951 return -EFAULT;
8ae55f04 3952 return 0;
1da177e4
LT
3953}
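Note that PACKET_STATISTICS above is read-and-reset: the counters are zeroed under the receive-queue lock each time they are reported, and tp_drops is folded into tp_packets before the copy-out. A hedged sketch for the TPACKET_V1/V2 layout (a TPACKET_V3 socket would use struct tpacket_stats_v3 instead):

#include <linux/if_packet.h>
#include <stdio.h>
#include <sys/socket.h>

/* Hedged sketch: dump and implicitly reset the receive counters. */
static void dump_stats(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
		printf("tp_packets=%u (drops included) tp_drops=%u\n",
		       st.tp_packets, st.tp_drops);
}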
3954
3955
719c44d3
WB
3956#ifdef CONFIG_COMPAT
3957static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
3958 char __user *optval, unsigned int optlen)
3959{
3960 struct packet_sock *po = pkt_sk(sock->sk);
3961
3962 if (level != SOL_PACKET)
3963 return -ENOPROTOOPT;
3964
3965 if (optname == PACKET_FANOUT_DATA &&
3966 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
3967 optval = (char __user *)get_compat_bpf_fprog(optval);
3968 if (!optval)
3969 return -EFAULT;
3970 optlen = sizeof(struct sock_fprog);
3971 }
3972
3973 return packet_setsockopt(sock, level, optname, optval, optlen);
3974}
3975#endif
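The compat shim above only rewrites PACKET_FANOUT_DATA for classic-BPF fanout programs; plain PACKET_FANOUT takes a single int whose low 16 bits are the group id and whose upper bits carry the type and flags, matching the fanout_add(sk, val & 0xffff, val >> 16) call in packet_setsockopt(). A hedged sketch joining a hash-mode group (group id 42 is an arbitrary example):

#include <linux/if_packet.h>
#include <sys/socket.h>

/* Hedged sketch: join fanout group 42 (arbitrary id) in hash mode. */
static int join_fanout_group(int fd)
{
	int val = 42 | (PACKET_FANOUT_HASH << 16);

	return setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
}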
3976
351638e7
JP
3977static int packet_notifier(struct notifier_block *this,
3978 unsigned long msg, void *ptr)
1da177e4
LT
3979{
3980 struct sock *sk;
351638e7 3981 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 3982 struct net *net = dev_net(dev);
1da177e4 3983
808f5114 3984 rcu_read_lock();
b67bfe0d 3985 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3986 struct packet_sock *po = pkt_sk(sk);
3987
3988 switch (msg) {
3989 case NETDEV_UNREGISTER:
1da177e4 3990 if (po->mclist)
82f17091 3991 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
3992 /* fallthrough */
3993
1da177e4
LT
3994 case NETDEV_DOWN:
3995 if (dev->ifindex == po->ifindex) {
3996 spin_lock(&po->bind_lock);
3997 if (po->running) {
ce06b03e 3998 __unregister_prot_hook(sk, false);
1da177e4
LT
3999 sk->sk_err = ENETDOWN;
4000 if (!sock_flag(sk, SOCK_DEAD))
4001 sk->sk_error_report(sk);
4002 }
4003 if (msg == NETDEV_UNREGISTER) {
66e56cd4 4004 packet_cached_dev_reset(po);
1da177e4 4005 po->ifindex = -1;
160ff18a
BG
4006 if (po->prot_hook.dev)
4007 dev_put(po->prot_hook.dev);
1da177e4
LT
4008 po->prot_hook.dev = NULL;
4009 }
4010 spin_unlock(&po->bind_lock);
4011 }
4012 break;
4013 case NETDEV_UP:
808f5114 4014 if (dev->ifindex == po->ifindex) {
4015 spin_lock(&po->bind_lock);
ce06b03e
DM
4016 if (po->num)
4017 register_prot_hook(sk);
808f5114 4018 spin_unlock(&po->bind_lock);
1da177e4 4019 }
1da177e4
LT
4020 break;
4021 }
4022 }
808f5114 4023 rcu_read_unlock();
1da177e4
LT
4024 return NOTIFY_DONE;
4025}
4026
4027
4028static int packet_ioctl(struct socket *sock, unsigned int cmd,
4029 unsigned long arg)
4030{
4031 struct sock *sk = sock->sk;
4032
69e3c75f 4033 switch (cmd) {
40d4e3df
ED
4034 case SIOCOUTQ:
4035 {
4036 int amount = sk_wmem_alloc_get(sk);
31e6d363 4037
40d4e3df
ED
4038 return put_user(amount, (int __user *)arg);
4039 }
4040 case SIOCINQ:
4041 {
4042 struct sk_buff *skb;
4043 int amount = 0;
4044
4045 spin_lock_bh(&sk->sk_receive_queue.lock);
4046 skb = skb_peek(&sk->sk_receive_queue);
4047 if (skb)
4048 amount = skb->len;
4049 spin_unlock_bh(&sk->sk_receive_queue.lock);
4050 return put_user(amount, (int __user *)arg);
4051 }
4052 case SIOCGSTAMP:
4053 return sock_get_timestamp(sk, (struct timeval __user *)arg);
4054 case SIOCGSTAMPNS:
4055 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 4056
1da177e4 4057#ifdef CONFIG_INET
40d4e3df
ED
4058 case SIOCADDRT:
4059 case SIOCDELRT:
4060 case SIOCDARP:
4061 case SIOCGARP:
4062 case SIOCSARP:
4063 case SIOCGIFADDR:
4064 case SIOCSIFADDR:
4065 case SIOCGIFBRDADDR:
4066 case SIOCSIFBRDADDR:
4067 case SIOCGIFNETMASK:
4068 case SIOCSIFNETMASK:
4069 case SIOCGIFDSTADDR:
4070 case SIOCSIFDSTADDR:
4071 case SIOCSIFFLAGS:
40d4e3df 4072 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
4073#endif
4074
40d4e3df
ED
4075 default:
4076 return -ENOIOCTLCMD;
1da177e4
LT
4077 }
4078 return 0;
4079}
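As the handler above shows, SIOCINQ reports the length of the next queued packet (the head skb) rather than the total backlog, and SIOCOUTQ reports bytes still charged to the send queue. A hedged sketch:

#include <linux/sockios.h>
#include <stdio.h>
#include <sys/ioctl.h>

/* Hedged sketch: query the pending rx/tx byte counts via ioctl(). */
static void dump_queues(int fd)
{
	int next_rx = 0, unsent_tx = 0;

	if (ioctl(fd, SIOCINQ, &next_rx) == 0 &&
	    ioctl(fd, SIOCOUTQ, &unsent_tx) == 0)
		printf("next rx packet: %d bytes, unsent tx: %d bytes\n",
		       next_rx, unsent_tx);
}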
4080
db5051ea 4081static __poll_t packet_poll_mask(struct socket *sock, __poll_t events)
1da177e4
LT
4082{
4083 struct sock *sk = sock->sk;
4084 struct packet_sock *po = pkt_sk(sk);
db5051ea 4085 __poll_t mask = datagram_poll_mask(sock, events);
1da177e4
LT
4086
4087 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 4088 if (po->rx_ring.pg_vec) {
f6fb8f10 4089 if (!packet_previous_rx_frame(po, &po->rx_ring,
4090 TP_STATUS_KERNEL))
a9a08845 4091 mask |= EPOLLIN | EPOLLRDNORM;
1da177e4 4092 }
2ccdbaa6 4093 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
54d7c01d 4094 po->pressure = 0;
1da177e4 4095 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
4096 spin_lock_bh(&sk->sk_write_queue.lock);
4097 if (po->tx_ring.pg_vec) {
4098 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
a9a08845 4099 mask |= EPOLLOUT | EPOLLWRNORM;
69e3c75f
JB
4100 }
4101 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
4102 return mask;
4103}
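A ring consumer therefore typically sleeps in poll() and, on EPOLLIN, walks frames whose tp_status carries TP_STATUS_USER, returning each one with TP_STATUS_KERNEL so the poll mask above can clear again. A hedged TPACKET_V2 sketch; the ring pointer and geometry are assumed to come from the PACKET_RX_RING/mmap() setup, and it further assumes tp_block_size is a multiple of tp_frame_size so frames sit contiguously in the mapping.

#include <linux/if_packet.h>
#include <poll.h>
#include <stddef.h>
#include <stdint.h>

/* Hedged sketch: poll()-driven consumption of a mapped TPACKET_V2 RX ring.
 * `ring`, `frame_nr` and `frame_size` are assumed to describe the mapping
 * created earlier; a production reader would also need memory barriers. */
static void rx_loop(int fd, uint8_t *ring, unsigned int frame_nr,
		    unsigned int frame_size)
{
	unsigned int idx = 0;

	for (;;) {
		struct tpacket2_hdr *hdr =
			(struct tpacket2_hdr *)(ring + (size_t)idx * frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER)) {
			struct pollfd pfd = { .fd = fd, .events = POLLIN };

			poll(&pfd, 1, -1);	/* wait for the kernel to fill a frame */
			continue;
		}

		/* packet bytes start at (uint8_t *)hdr + hdr->tp_mac ... */

		hdr->tp_status = TP_STATUS_KERNEL;	/* hand the frame back */
		idx = (idx + 1) % frame_nr;
	}
}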
4104
4105
4106/* Dirty? Well, I still have not found a better way to account
4107 * for user mmaps.
4108 */
4109
4110static void packet_mm_open(struct vm_area_struct *vma)
4111{
4112 struct file *file = vma->vm_file;
40d4e3df 4113 struct socket *sock = file->private_data;
1da177e4 4114 struct sock *sk = sock->sk;
1ce4f28b 4115
1da177e4
LT
4116 if (sk)
4117 atomic_inc(&pkt_sk(sk)->mapped);
4118}
4119
4120static void packet_mm_close(struct vm_area_struct *vma)
4121{
4122 struct file *file = vma->vm_file;
40d4e3df 4123 struct socket *sock = file->private_data;
1da177e4 4124 struct sock *sk = sock->sk;
1ce4f28b 4125
1da177e4
LT
4126 if (sk)
4127 atomic_dec(&pkt_sk(sk)->mapped);
4128}
4129
f0f37e2f 4130static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
4131 .open = packet_mm_open,
4132 .close = packet_mm_close,
1da177e4
LT
4133};
4134
0e3125c7
NH
4135static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4136 unsigned int len)
1da177e4
LT
4137{
4138 int i;
4139
4ebf0ae2 4140 for (i = 0; i < len; i++) {
0e3125c7 4141 if (likely(pg_vec[i].buffer)) {
c56b4d90 4142 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
4143 vfree(pg_vec[i].buffer);
4144 else
4145 free_pages((unsigned long)pg_vec[i].buffer,
4146 order);
4147 pg_vec[i].buffer = NULL;
4148 }
1da177e4
LT
4149 }
4150 kfree(pg_vec);
4151}
4152
eea49cc9 4153static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 4154{
f0d4eb29 4155 char *buffer;
0e3125c7
NH
4156 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4157 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
4158
4159 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4160 if (buffer)
4161 return buffer;
4162
f0d4eb29 4163 /* __get_free_pages failed, fall back to vmalloc */
bbce5a59 4164 buffer = vzalloc((1 << order) * PAGE_SIZE);
0e3125c7
NH
4165 if (buffer)
4166 return buffer;
4167
f0d4eb29 4168	/* vmalloc failed, let's dig into swap here */
0e3125c7 4169 gfp_flags &= ~__GFP_NORETRY;
f0d4eb29 4170 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4171 if (buffer)
4172 return buffer;
4173
f0d4eb29 4174 /* complete and utter failure */
0e3125c7 4175 return NULL;
4ebf0ae2
DM
4176}
4177
0e3125c7 4178static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
4179{
4180 unsigned int block_nr = req->tp_block_nr;
0e3125c7 4181 struct pgv *pg_vec;
4ebf0ae2
DM
4182 int i;
4183
0e3125c7 4184 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
4185 if (unlikely(!pg_vec))
4186 goto out;
4187
4188 for (i = 0; i < block_nr; i++) {
c56b4d90 4189 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 4190 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
4191 goto out_free_pgvec;
4192 }
4193
4194out:
4195 return pg_vec;
4196
4197out_free_pgvec:
4198 free_pg_vec(pg_vec, order, block_nr);
4199 pg_vec = NULL;
4200 goto out;
4201}
1da177e4 4202
f6fb8f10 4203static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 4204 int closing, int tx_ring)
1da177e4 4205{
0e3125c7 4206 struct pgv *pg_vec = NULL;
1da177e4 4207 struct packet_sock *po = pkt_sk(sk);
0e11c91e 4208 int was_running, order = 0;
69e3c75f
JB
4209 struct packet_ring_buffer *rb;
4210 struct sk_buff_head *rb_queue;
0e11c91e 4211 __be16 num;
f6fb8f10 4212 int err = -EINVAL;
4213	/* Alias into req_u, kept to minimize code churn */
4214 struct tpacket_req *req = &req_u->req;
4215
69e3c75f
JB
4216 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4217 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 4218
69e3c75f
JB
4219 err = -EBUSY;
4220 if (!closing) {
4221 if (atomic_read(&po->mapped))
4222 goto out;
b0138408 4223 if (packet_read_pending(rb))
69e3c75f
JB
4224 goto out;
4225 }
1da177e4 4226
69e3c75f
JB
4227 if (req->tp_block_nr) {
4228 /* Sanity tests and some calculations */
4229 err = -EBUSY;
4230 if (unlikely(rb->pg_vec))
4231 goto out;
1da177e4 4232
bbd6ef87
PM
4233 switch (po->tp_version) {
4234 case TPACKET_V1:
4235 po->tp_hdrlen = TPACKET_HDRLEN;
4236 break;
4237 case TPACKET_V2:
4238 po->tp_hdrlen = TPACKET2_HDRLEN;
4239 break;
f6fb8f10 4240 case TPACKET_V3:
4241 po->tp_hdrlen = TPACKET3_HDRLEN;
4242 break;
bbd6ef87
PM
4243 }
4244
69e3c75f 4245 err = -EINVAL;
4ebf0ae2 4246 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 4247 goto out;
90836b67 4248 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
69e3c75f 4249 goto out;
dc808110 4250 if (po->tp_version >= TPACKET_V3 &&
2b6867c2 4251 req->tp_block_size <=
eb73190f 4252 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + sizeof(struct tpacket3_hdr))
dc808110 4253 goto out;
8913336a 4254 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
4255 po->tp_reserve))
4256 goto out;
4ebf0ae2 4257 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 4258 goto out;
1da177e4 4259
4194b491
TK
4260 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4261 if (unlikely(rb->frames_per_block == 0))
69e3c75f 4262 goto out;
8f8d28e4
AK
4263 if (unlikely(req->tp_block_size > UINT_MAX / req->tp_block_nr))
4264 goto out;
69e3c75f
JB
4265 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4266 req->tp_frame_nr))
4267 goto out;
1da177e4
LT
4268
4269 err = -ENOMEM;
4ebf0ae2
DM
4270 order = get_order(req->tp_block_size);
4271 pg_vec = alloc_pg_vec(req, order);
4272 if (unlikely(!pg_vec))
1da177e4 4273 goto out;
f6fb8f10 4274 switch (po->tp_version) {
4275 case TPACKET_V3:
7f953ab2
SV
4276 /* Block transmit is not supported yet */
4277 if (!tx_ring) {
e8e85cc5 4278 init_prb_bdqc(po, rb, pg_vec, req_u);
7f953ab2
SV
4279 } else {
4280 struct tpacket_req3 *req3 = &req_u->req3;
4281
4282 if (req3->tp_retire_blk_tov ||
4283 req3->tp_sizeof_priv ||
4284 req3->tp_feature_req_word) {
4285 err = -EINVAL;
4286 goto out;
4287 }
4288 }
d7cf0c34 4289 break;
f6fb8f10 4290 default:
4291 break;
4292 }
69e3c75f
JB
4293 }
4294 /* Done */
4295 else {
4296 err = -EINVAL;
4ebf0ae2 4297 if (unlikely(req->tp_frame_nr))
69e3c75f 4298 goto out;
1da177e4
LT
4299 }
4300
1da177e4
LT
4301
4302 /* Detach socket from network */
4303 spin_lock(&po->bind_lock);
4304 was_running = po->running;
4305 num = po->num;
4306 if (was_running) {
1da177e4 4307 po->num = 0;
ce06b03e 4308 __unregister_prot_hook(sk, false);
1da177e4
LT
4309 }
4310 spin_unlock(&po->bind_lock);
1ce4f28b 4311
1da177e4
LT
4312 synchronize_net();
4313
4314 err = -EBUSY;
905db440 4315 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
4316 if (closing || atomic_read(&po->mapped) == 0) {
4317 err = 0;
69e3c75f 4318 spin_lock_bh(&rb_queue->lock);
c053fd96 4319 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
4320 rb->frame_max = (req->tp_frame_nr - 1);
4321 rb->head = 0;
4322 rb->frame_size = req->tp_frame_size;
4323 spin_unlock_bh(&rb_queue->lock);
4324
c053fd96
CG
4325 swap(rb->pg_vec_order, order);
4326 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
4327
4328 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4329 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4330 tpacket_rcv : packet_rcv;
4331 skb_queue_purge(rb_queue);
1da177e4 4332 if (atomic_read(&po->mapped))
40d4e3df
ED
4333 pr_err("packet_mmap: vma is busy: %d\n",
4334 atomic_read(&po->mapped));
1da177e4 4335 }
905db440 4336 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4337
4338 spin_lock(&po->bind_lock);
ce06b03e 4339 if (was_running) {
1da177e4 4340 po->num = num;
ce06b03e 4341 register_prot_hook(sk);
1da177e4
LT
4342 }
4343 spin_unlock(&po->bind_lock);
c800aaf8 4344 if (pg_vec && (po->tp_version > TPACKET_V2)) {
f6fb8f10 4345		/* Block-based V3 tx is not supported, so only the rx ring has a retire timer */
4346 if (!tx_ring)
73d0fcf2 4347 prb_shutdown_retire_blk_timer(po, rb_queue);
f6fb8f10 4348 }
1da177e4 4349
1da177e4
LT
4350 if (pg_vec)
4351 free_pg_vec(pg_vec, order, req->tp_block_nr);
4352out:
4353 return err;
4354}
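The checks above admit only self-consistent geometries: tp_block_size must be page-aligned and positive as a signed int, tp_frame_size must be a multiple of TPACKET_ALIGNMENT and large enough for tp_hdrlen plus tp_reserve, and tp_frame_nr must equal frames_per_block times tp_block_nr. A small hedged helper that derives the frame count instead of hardcoding it (values are illustrative only):

#include <linux/if_packet.h>

/* Hedged sketch: build a tpacket_req whose geometry satisfies the
 * validation in packet_set_ring().  Sizes are illustrative only. */
static struct tpacket_req example_ring_geometry(void)
{
	struct tpacket_req req = {
		.tp_block_size = 1 << 16,	/* page-aligned on common configs */
		.tp_block_nr   = 128,
		.tp_frame_size = 1 << 11,	/* TPACKET_ALIGNMENT multiple */
	};

	/* frames_per_block * tp_block_nr must equal tp_frame_nr exactly */
	req.tp_frame_nr = (req.tp_block_size / req.tp_frame_size) *
			  req.tp_block_nr;
	return req;
}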
4355
69e3c75f
JB
4356static int packet_mmap(struct file *file, struct socket *sock,
4357 struct vm_area_struct *vma)
1da177e4
LT
4358{
4359 struct sock *sk = sock->sk;
4360 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
4361 unsigned long size, expected_size;
4362 struct packet_ring_buffer *rb;
1da177e4
LT
4363 unsigned long start;
4364 int err = -EINVAL;
4365 int i;
4366
4367 if (vma->vm_pgoff)
4368 return -EINVAL;
4369
905db440 4370 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
4371
4372 expected_size = 0;
4373 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4374 if (rb->pg_vec) {
4375 expected_size += rb->pg_vec_len
4376 * rb->pg_vec_pages
4377 * PAGE_SIZE;
4378 }
4379 }
4380
4381 if (expected_size == 0)
1da177e4 4382 goto out;
69e3c75f
JB
4383
4384 size = vma->vm_end - vma->vm_start;
4385 if (size != expected_size)
1da177e4
LT
4386 goto out;
4387
1da177e4 4388 start = vma->vm_start;
69e3c75f
JB
4389 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4390 if (rb->pg_vec == NULL)
4391 continue;
4392
4393 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
4394 struct page *page;
4395 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4396 int pg_num;
4397
c56b4d90
CG
4398 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4399 page = pgv_to_page(kaddr);
69e3c75f
JB
4400 err = vm_insert_page(vma, start, page);
4401 if (unlikely(err))
4402 goto out;
4403 start += PAGE_SIZE;
0e3125c7 4404 kaddr += PAGE_SIZE;
69e3c75f 4405 }
4ebf0ae2 4406 }
1da177e4 4407 }
69e3c75f 4408
4ebf0ae2 4409 atomic_inc(&po->mapped);
1da177e4
LT
4410 vma->vm_ops = &packet_mmap_ops;
4411 err = 0;
4412
4413out:
905db440 4414 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4415 return err;
4416}
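packet_mmap() only accepts a single mapping with vm_pgoff == 0 whose length equals the RX ring followed by the TX ring, so user space maps both in one call. A minimal hedged sketch; rx_req and tx_req are assumed to be the structures previously passed to PACKET_RX_RING and PACKET_TX_RING, and an unconfigured ring contributes zero bytes.

#include <linux/if_packet.h>
#include <stddef.h>
#include <sys/mman.h>

/* Hedged sketch: map the RX ring followed by the TX ring in one call,
 * exactly the layout packet_mmap() expects. */
static void *map_rings(int fd, const struct tpacket_req *rx_req,
		       const struct tpacket_req *tx_req)
{
	size_t len = (size_t)rx_req->tp_block_size * rx_req->tp_block_nr +
		     (size_t)tx_req->tp_block_size * tx_req->tp_block_nr;
	void *ring = mmap(NULL, len, PROT_READ | PROT_WRITE,
			  MAP_SHARED, fd, 0);

	return ring == MAP_FAILED ? NULL : ring;
}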
1da177e4 4417
90ddc4f0 4418static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4419 .family = PF_PACKET,
4420 .owner = THIS_MODULE,
4421 .release = packet_release,
4422 .bind = packet_bind_spkt,
4423 .connect = sock_no_connect,
4424 .socketpair = sock_no_socketpair,
4425 .accept = sock_no_accept,
4426 .getname = packet_getname_spkt,
db5051ea 4427 .poll_mask = datagram_poll_mask,
1da177e4
LT
4428 .ioctl = packet_ioctl,
4429 .listen = sock_no_listen,
4430 .shutdown = sock_no_shutdown,
4431 .setsockopt = sock_no_setsockopt,
4432 .getsockopt = sock_no_getsockopt,
4433 .sendmsg = packet_sendmsg_spkt,
4434 .recvmsg = packet_recvmsg,
4435 .mmap = sock_no_mmap,
4436 .sendpage = sock_no_sendpage,
4437};
1da177e4 4438
90ddc4f0 4439static const struct proto_ops packet_ops = {
1da177e4
LT
4440 .family = PF_PACKET,
4441 .owner = THIS_MODULE,
4442 .release = packet_release,
4443 .bind = packet_bind,
4444 .connect = sock_no_connect,
4445 .socketpair = sock_no_socketpair,
4446 .accept = sock_no_accept,
1ce4f28b 4447 .getname = packet_getname,
db5051ea 4448 .poll_mask = packet_poll_mask,
1da177e4
LT
4449 .ioctl = packet_ioctl,
4450 .listen = sock_no_listen,
4451 .shutdown = sock_no_shutdown,
4452 .setsockopt = packet_setsockopt,
4453 .getsockopt = packet_getsockopt,
719c44d3
WB
4454#ifdef CONFIG_COMPAT
4455 .compat_setsockopt = compat_packet_setsockopt,
4456#endif
1da177e4
LT
4457 .sendmsg = packet_sendmsg,
4458 .recvmsg = packet_recvmsg,
4459 .mmap = packet_mmap,
4460 .sendpage = sock_no_sendpage,
4461};
4462
ec1b4cf7 4463static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4464 .family = PF_PACKET,
4465 .create = packet_create,
4466 .owner = THIS_MODULE,
4467};
4468
4469static struct notifier_block packet_netdev_notifier = {
40d4e3df 4470 .notifier_call = packet_notifier,
1da177e4
LT
4471};
4472
4473#ifdef CONFIG_PROC_FS
1da177e4
LT
4474
4475static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4476 __acquires(RCU)
1da177e4 4477{
e372c414 4478 struct net *net = seq_file_net(seq);
808f5114 4479
4480 rcu_read_lock();
4481 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4482}
4483
4484static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4485{
1bf40954 4486 struct net *net = seq_file_net(seq);
808f5114 4487 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4488}
4489
4490static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4491 __releases(RCU)
1da177e4 4492{
808f5114 4493 rcu_read_unlock();
1da177e4
LT
4494}
4495
1ce4f28b 4496static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4497{
4498 if (v == SEQ_START_TOKEN)
4499 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4500 else {
b7ceabd9 4501 struct sock *s = sk_entry(v);
1da177e4
LT
4502 const struct packet_sock *po = pkt_sk(s);
4503
4504 seq_printf(seq,
71338aa7 4505 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4 4506 s,
41c6d650 4507 refcount_read(&s->sk_refcnt),
1da177e4
LT
4508 s->sk_type,
4509 ntohs(po->num),
4510 po->ifindex,
4511 po->running,
4512 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4513 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4514 sock_i_ino(s));
1da177e4
LT
4515 }
4516
4517 return 0;
4518}
4519
56b3d975 4520static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4521 .start = packet_seq_start,
4522 .next = packet_seq_next,
4523 .stop = packet_seq_stop,
4524 .show = packet_seq_show,
4525};
1da177e4
LT
4526#endif
4527
2c8c1e72 4528static int __net_init packet_net_init(struct net *net)
d12d01d6 4529{
0fa7fa98 4530 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4531 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4532
c3506372
CH
4533 if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
4534 sizeof(struct seq_net_private)))
d12d01d6
DL
4535 return -ENOMEM;
4536
4537 return 0;
4538}
4539
2c8c1e72 4540static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4541{
ece31ffd 4542 remove_proc_entry("packet", net->proc_net);
669f8f1a 4543 WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
d12d01d6
DL
4544}
4545
4546static struct pernet_operations packet_net_ops = {
4547 .init = packet_net_init,
4548 .exit = packet_net_exit,
4549};
4550
4551
1da177e4
LT
4552static void __exit packet_exit(void)
4553{
1da177e4 4554 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4555 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4556 sock_unregister(PF_PACKET);
4557 proto_unregister(&packet_proto);
4558}
4559
4560static int __init packet_init(void)
4561{
4562 int rc = proto_register(&packet_proto, 0);
4563
4564 if (rc != 0)
4565 goto out;
4566
4567 sock_register(&packet_family_ops);
d12d01d6 4568 register_pernet_subsys(&packet_net_ops);
1da177e4 4569 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
4570out:
4571 return rc;
4572}
4573
4574module_init(packet_init);
4575module_exit(packet_exit);
4576MODULE_LICENSE("GPL");
4577MODULE_ALIAS_NETPROTO(PF_PACKET);