/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *	Alexey Kuznetsov	:	Untied from IPv4 stack.
 *	Cyrus Durgin		:	Fixed kerneld for kmod.
 *	Michal Ostrowski	:	Module initialization cleanup.
 *	Ulises Alonso		:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *	Eric Biederman		:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *	Johann Baudy		:	Added TX RING.
 *	Chetan Loke		:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
#include <net/compat.h>

#include "internal.h"

/*
   Assumptions:
   - If a device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside of
     the device, but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit in the reserved space (tunnel); others are silly (PPP).
   - A packet socket receives packets with the ll header pulled, so SOCK_RAW
     should push it back.

On receive:
-----------

Incoming, dev->hard_header != NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header == NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
                 header. PPP makes it so, which is wrong, because it introduces
                 asymmetry between rx and tx paths.
   data       -> data

Outgoing, dev->hard_header == NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Resume
   If dev->hard_header == NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by the device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */

/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

struct packet_sock;
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(struct timer_list *);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);
static u16 packet_pick_tx_queue(struct sk_buff *skb);

struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};

#define vio_le() virtio_legacy_is_little_endian()

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

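/* Direct transmit path: frames are handed straight to the chosen device
 * queue via dev_direct_xmit(), bypassing the qdisc layer. This is the
 * po->xmit callback used when PACKET_QDISC_BYPASS is enabled on the socket.
 */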
static int packet_direct_xmit(struct sk_buff *skb)
{
	return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
}

static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}

static bool packet_use_direct_xmit(const struct packet_sock *po)
{
	return po->xmit == packet_direct_xmit;
}

static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb,
				  struct net_device *sb_dev)
{
	return dev_pick_tx_cpu_id(dev, skb, sb_dev, NULL);
}

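/* Pick the tx queue for an outgoing frame: honour the driver's
 * ndo_select_queue() hook when it provides one (capping the result to the
 * device's real queue count), otherwise fall back to a CPU-id based choice.
 */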
static u16 packet_pick_tx_queue(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	const struct net_device_ops *ops = dev->netdev_ops;
	u16 queue_index;

	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL,
						    __packet_pick_tx_queue);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = __packet_pick_tx_queue(dev, skb, NULL);
	}

	return queue_index;
}

/* __register_prot_hook must be invoked through register_prot_hook
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void __register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);

	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);

		sock_hold(sk);
		po->running = 1;
	}
}

static void register_prot_hook(struct sock *sk)
{
	lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
	__register_prot_hook(sk);
}

/* If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook. If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	lockdep_assert_held_once(&po->bind_lock);

	po->running = 0;

	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);

	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}

static inline struct page * __pure pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

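/* The tp_status word of each ring frame is the handshake between kernel and
 * user space: every update/read flushes the containing cacheline so a user
 * mapping on a non-coherent cache still observes it, and memory barriers
 * order the status change against the frame payload.
 */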
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
		h.h3->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
		return h.h3->tp_status;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}

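/* Pick the timestamp to report in the ring frame: prefer the NIC hardware
 * timestamp when the socket asked for it, otherwise fall back to the
 * software timestamp carried in skb->tstamp. Returns the matching
 * TP_STATUS_TS_* flag, or 0 if no timestamp is available.
 */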
static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
		h.h3->tp_sec = ts.tv_sec;
		h.h3->tp_nsec = ts.tv_nsec;
		break;
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}

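/* Map a frame index in the V1/V2 ring to its kernel address. The ring is a
 * set of equally sized blocks (pg_vec), each holding frames_per_block frames;
 * the frame is only returned when its tp_status matches the requested one.
 */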
static void *packet_lookup_frame(struct packet_sock *po,
				 struct packet_ring_buffer *rb,
				 unsigned int position,
				 int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
				  struct packet_ring_buffer *rb,
				  int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
					  struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
		    0);
	pkc->retire_blk_timer.expires = jiffies;
}

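/* Derive a default block-retire timeout for TPACKET_V3 from the link speed:
 * estimate roughly how many milliseconds it takes to fill one block at line
 * rate and use that, falling back to DEFAULT_PRB_RETIRE_TOV when the device
 * is gone, the speed is unknown, or the link is slower than 1 Gbit/s.
 */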
static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				   int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_link_ksettings ecmd;
	int err;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_link_ksettings(dev, &ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (ecmd.base.speed < SPEED_1000 ||
		    ecmd.base.speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = ecmd.base.speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}

static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			    union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			  struct packet_ring_buffer *rb,
			  struct pgv *pg_vec,
			  union tpacket_req_u *req_u)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start = pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks = req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po);
	prb_open_block(p1, pbd);
}

/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, lets say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
{
	struct packet_sock *po =
		from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 *
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			if (!BLOCK_NUM_PKTS(pbd)) {
				/* An empty block. Just refresh the timer. */
				goto refresh_timer;
			}
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 *	   lagging behind.
			 */
			if (prb_curr_blk_in_use(pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. Queue was frozen, user-space caught up,
				 * now the link went idle && the timer fired.
				 * We don't have a block to close. So we open this
				 * block and restart the timer.
				 * Opening a block thaws the queue and restarts the
				 * timer; thawing/timer-refresh is a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}

static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
			    struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header(we know header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}

/*
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note: We DONT refresh the timer on purpose.
 *	 Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
	struct sock *sk = &po->sk;

	if (po->stats.stats3.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time.
		 *
		 * It shouldn't really happen as we don't close empty
		 * blocks. See prb_retire_rx_blk_timer_expired().
		 */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	getnstimeofday(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}

/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
			     struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}

static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}

static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (skb_vlan_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->hv1.tp_vlan_tpid = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_padding = 0;
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}

/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status,
					    unsigned int len
					    )
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if that last block which caused the queue to freeze,
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available. user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}

static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, status, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return NULL;
	}
}

static void *prb_lookup_block(struct packet_sock *po,
			      struct packet_ring_buffer *rb,
			      unsigned int idx,
			      int status)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;
	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}

/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}

static void packet_increment_rx_head(struct packet_sock *po,
					    struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

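/* The TX ring keeps a per-cpu count of frames that have been handed to the
 * device but not yet freed; packet_read_pending() sums it across CPUs so
 * that teardown can wait for outstanding transmissions. The RX ring does
 * not use this counter.
 */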
static void packet_inc_pending(struct packet_ring_buffer *rb)
{
	this_cpu_inc(*rb->pending_refcnt);
}

static void packet_dec_pending(struct packet_ring_buffer *rb)
{
	this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
	unsigned int refcnt = 0;
	int cpu;

	/* We don't use pending refcount in rx_ring. */
	if (rb->pending_refcnt == NULL)
		return 0;

	for_each_possible_cpu(cpu)
		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

	return refcnt;
}

static int packet_alloc_pending(struct packet_sock *po)
{
	po->rx_ring.pending_refcnt = NULL;

	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
	if (unlikely(po->tx_ring.pending_refcnt == NULL))
		return -ENOBUFS;

	return 0;
}

static void packet_free_pending(struct packet_sock *po)
{
	free_percpu(po->tx_ring.pending_refcnt);
}

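/* Receive-room estimation used by fanout rollover: report ROOM_NORMAL when
 * at least a quarter of the ring (or of the receive buffer, for non-mmap
 * sockets) is still free, ROOM_LOW when some space is left, and ROOM_NONE
 * otherwise.
 */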
#define ROOM_POW_OFF	2
#define ROOM_NONE	0x0
#define ROOM_LOW	0x1
#define ROOM_NORMAL	0x2

static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.frame_max + 1;
	idx = po->rx_ring.head;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.prb_bdqc.knum_blocks;
	idx = po->rx_ring.prb_bdqc.kactive_blk_num;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	struct sock *sk = &po->sk;
	int ret = ROOM_NONE;

	if (po->prot_hook.func != tpacket_rcv) {
		int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
					  - (skb ? skb->truesize : 0);
		if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
			return ROOM_NORMAL;
		else if (avail > 0)
			return ROOM_LOW;
		else
			return ROOM_NONE;
	}

	if (po->tp_version == TPACKET_V3) {
		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_v3_has_room(po, 0))
			ret = ROOM_LOW;
	} else {
		if (__tpacket_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_has_room(po, 0))
			ret = ROOM_LOW;
	}

	return ret;
}

static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	int ret;
	bool has_room;

	spin_lock_bh(&po->sk.sk_receive_queue.lock);
	ret = __packet_rcv_has_room(po, skb);
	has_room = ret == ROOM_NORMAL;
	if (po->pressure == has_room)
		po->pressure = !has_room;
	spin_unlock_bh(&po->sk.sk_receive_queue.lock);

	return ret;
}

static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(refcount_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}

WB
1297static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1298{
1299 u32 rxhash;
1300 int i, count = 0;
1301
1302 rxhash = skb_get_hash(skb);
1303 for (i = 0; i < ROLLOVER_HLEN; i++)
1304 if (po->rollover->history[i] == rxhash)
1305 count++;
1306
1307 po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
1308 return count > (ROLLOVER_HLEN >> 1);
1309}
1310
77f65ebd
WB
1311static unsigned int fanout_demux_hash(struct packet_fanout *f,
1312 struct sk_buff *skb,
1313 unsigned int num)
dc99f600 1314{
eb70db87 1315 return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
dc99f600
DM
1316}
1317
77f65ebd
WB
1318static unsigned int fanout_demux_lb(struct packet_fanout *f,
1319 struct sk_buff *skb,
1320 unsigned int num)
dc99f600 1321{
468479e6 1322 unsigned int val = atomic_inc_return(&f->rr_cur);
dc99f600 1323
468479e6 1324 return val % num;
77f65ebd
WB
1325}
1326
1327static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1328 struct sk_buff *skb,
1329 unsigned int num)
1330{
1331 return smp_processor_id() % num;
dc99f600
DM
1332}
1333
5df0ddfb
DB
1334static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1335 struct sk_buff *skb,
1336 unsigned int num)
1337{
f337db64 1338 return prandom_u32_max(num);
5df0ddfb
DB
1339}
1340
77f65ebd
WB
1341static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1342 struct sk_buff *skb,
ad377cab 1343 unsigned int idx, bool try_self,
77f65ebd 1344 unsigned int num)
95ec3eb4 1345{
4633c9e0 1346 struct packet_sock *po, *po_next, *po_skip = NULL;
a9b63918 1347 unsigned int i, j, room = ROOM_NONE;
95ec3eb4 1348
0648ab70 1349 po = pkt_sk(f->arr[idx]);
3b3a5b0a
WB
1350
1351 if (try_self) {
1352 room = packet_rcv_has_room(po, skb);
1353 if (room == ROOM_NORMAL ||
1354 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1355 return idx;
4633c9e0 1356 po_skip = po;
3b3a5b0a 1357 }
ad377cab 1358
0648ab70 1359 i = j = min_t(int, po->rollover->sock, num - 1);
77f65ebd 1360 do {
2ccdbaa6 1361 po_next = pkt_sk(f->arr[i]);
4633c9e0 1362 if (po_next != po_skip && !po_next->pressure &&
2ccdbaa6 1363 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
77f65ebd 1364 if (i != j)
0648ab70 1365 po->rollover->sock = i;
a9b63918
WB
1366 atomic_long_inc(&po->rollover->num);
1367 if (room == ROOM_LOW)
1368 atomic_long_inc(&po->rollover->num_huge);
77f65ebd
WB
1369 return i;
1370 }
ad377cab 1371
77f65ebd
WB
1372 if (++i == num)
1373 i = 0;
1374 } while (i != j);
1375
a9b63918 1376 atomic_long_inc(&po->rollover->num_failed);
77f65ebd
WB
1377 return idx;
1378}
1379
2d36097d
NH
1380static unsigned int fanout_demux_qm(struct packet_fanout *f,
1381 struct sk_buff *skb,
1382 unsigned int num)
1383{
1384 return skb_get_queue_mapping(skb) % num;
1385}
1386
47dceb8e
WB
1387static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1388 struct sk_buff *skb,
1389 unsigned int num)
1390{
1391 struct bpf_prog *prog;
1392 unsigned int ret = 0;
1393
1394 rcu_read_lock();
1395 prog = rcu_dereference(f->bpf_prog);
1396 if (prog)
ff936a04 1397 ret = bpf_prog_run_clear_cb(prog, skb) % num;
47dceb8e
WB
1398 rcu_read_unlock();
1399
1400 return ret;
1401}
1402
77f65ebd
WB
1403static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1404{
1405 return f->flags & (flag >> 8);
95ec3eb4
DM
1406}
1407
95ec3eb4
DM
1408static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1409 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1410{
1411 struct packet_fanout *f = pt->af_packet_priv;
f98f4514 1412 unsigned int num = READ_ONCE(f->num_members);
19bcf9f2 1413 struct net *net = read_pnet(&f->net);
dc99f600 1414 struct packet_sock *po;
77f65ebd 1415 unsigned int idx;
dc99f600 1416
19bcf9f2 1417 if (!net_eq(dev_net(dev), net) || !num) {
dc99f600
DM
1418 kfree_skb(skb);
1419 return 0;
1420 }
1421
3f34b24a 1422 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
19bcf9f2 1423 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
3f34b24a
AD
1424 if (!skb)
1425 return 0;
1426 }
95ec3eb4
DM
1427 switch (f->type) {
1428 case PACKET_FANOUT_HASH:
1429 default:
77f65ebd 1430 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1431 break;
1432 case PACKET_FANOUT_LB:
77f65ebd 1433 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1434 break;
1435 case PACKET_FANOUT_CPU:
77f65ebd
WB
1436 idx = fanout_demux_cpu(f, skb, num);
1437 break;
5df0ddfb
DB
1438 case PACKET_FANOUT_RND:
1439 idx = fanout_demux_rnd(f, skb, num);
1440 break;
2d36097d
NH
1441 case PACKET_FANOUT_QM:
1442 idx = fanout_demux_qm(f, skb, num);
1443 break;
77f65ebd 1444 case PACKET_FANOUT_ROLLOVER:
ad377cab 1445 idx = fanout_demux_rollover(f, skb, 0, false, num);
95ec3eb4 1446 break;
47dceb8e 1447 case PACKET_FANOUT_CBPF:
f2e52095 1448 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1449 idx = fanout_demux_bpf(f, skb, num);
1450 break;
dc99f600
DM
1451 }
1452
ad377cab
WB
1453 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1454 idx = fanout_demux_rollover(f, skb, idx, true, num);
dc99f600 1455
ad377cab 1456 po = pkt_sk(f->arr[idx]);
dc99f600
DM
1457 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1458}
1459
fff3321d
PE
1460DEFINE_MUTEX(fanout_mutex);
1461EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600 1462static LIST_HEAD(fanout_list);
4a69a864 1463static u16 fanout_next_id;
dc99f600
DM
1464
1465static void __fanout_link(struct sock *sk, struct packet_sock *po)
1466{
1467 struct packet_fanout *f = po->fanout;
1468
1469 spin_lock(&f->lock);
1470 f->arr[f->num_members] = sk;
1471 smp_wmb();
1472 f->num_members++;
2bd624b4
AS
1473 if (f->num_members == 1)
1474 dev_add_pack(&f->prot_hook);
dc99f600
DM
1475 spin_unlock(&f->lock);
1476}
1477
1478static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1479{
1480 struct packet_fanout *f = po->fanout;
1481 int i;
1482
1483 spin_lock(&f->lock);
1484 for (i = 0; i < f->num_members; i++) {
1485 if (f->arr[i] == sk)
1486 break;
1487 }
1488 BUG_ON(i >= f->num_members);
1489 f->arr[i] = f->arr[f->num_members - 1];
1490 f->num_members--;
2bd624b4
AS
1491 if (f->num_members == 0)
1492 __dev_remove_pack(&f->prot_hook);
dc99f600
DM
1493 spin_unlock(&f->lock);
1494}
1495
d4dd8aee 1496static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
c0de08d0 1497{
161642e2
ED
1498 if (sk->sk_family != PF_PACKET)
1499 return false;
c0de08d0 1500
161642e2 1501 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
c0de08d0
EL
1502}
1503
47dceb8e
WB
1504static void fanout_init_data(struct packet_fanout *f)
1505{
1506 switch (f->type) {
1507 case PACKET_FANOUT_LB:
1508 atomic_set(&f->rr_cur, 0);
1509 break;
1510 case PACKET_FANOUT_CBPF:
f2e52095 1511 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1512 RCU_INIT_POINTER(f->bpf_prog, NULL);
1513 break;
1514 }
1515}
1516
1517static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1518{
1519 struct bpf_prog *old;
1520
1521 spin_lock(&f->lock);
1522 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1523 rcu_assign_pointer(f->bpf_prog, new);
1524 spin_unlock(&f->lock);
1525
1526 if (old) {
1527 synchronize_net();
1528 bpf_prog_destroy(old);
1529 }
1530}
1531
1532static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1533 unsigned int len)
1534{
1535 struct bpf_prog *new;
1536 struct sock_fprog fprog;
1537 int ret;
1538
1539 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1540 return -EPERM;
1541 if (len != sizeof(fprog))
1542 return -EINVAL;
1543 if (copy_from_user(&fprog, data, len))
1544 return -EFAULT;
1545
bab18991 1546 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
47dceb8e
WB
1547 if (ret)
1548 return ret;
1549
1550 __fanout_set_data_bpf(po->fanout, new);
1551 return 0;
1552}
1553
f2e52095
WB
1554static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1555 unsigned int len)
1556{
1557 struct bpf_prog *new;
1558 u32 fd;
1559
1560 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1561 return -EPERM;
1562 if (len != sizeof(fd))
1563 return -EINVAL;
1564 if (copy_from_user(&fd, data, len))
1565 return -EFAULT;
1566
113214be 1567 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
f2e52095
WB
1568 if (IS_ERR(new))
1569 return PTR_ERR(new);
f2e52095
WB
1570
1571 __fanout_set_data_bpf(po->fanout, new);
1572 return 0;
1573}
1574
47dceb8e
WB
1575static int fanout_set_data(struct packet_sock *po, char __user *data,
1576 unsigned int len)
1577{
1578 switch (po->fanout->type) {
1579 case PACKET_FANOUT_CBPF:
1580 return fanout_set_data_cbpf(po, data, len);
f2e52095
WB
1581 case PACKET_FANOUT_EBPF:
1582 return fanout_set_data_ebpf(po, data, len);
47dceb8e
WB
1583 default:
1584 return -EINVAL;
07d53ae4 1585 }
47dceb8e
WB
1586}
1587
1588static void fanout_release_data(struct packet_fanout *f)
1589{
1590 switch (f->type) {
1591 case PACKET_FANOUT_CBPF:
f2e52095 1592 case PACKET_FANOUT_EBPF:
47dceb8e 1593 __fanout_set_data_bpf(f, NULL);
07d53ae4 1594 }
47dceb8e
WB
1595}
1596
4a69a864
MM
1597static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1598{
1599 struct packet_fanout *f;
1600
1601 list_for_each_entry(f, &fanout_list, list) {
1602 if (f->id == candidate_id &&
1603 read_pnet(&f->net) == sock_net(sk)) {
1604 return false;
1605 }
1606 }
1607 return true;
1608}
1609
1610static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1611{
1612 u16 id = fanout_next_id;
1613
1614 do {
1615 if (__fanout_id_is_free(sk, id)) {
1616 *new_id = id;
1617 fanout_next_id = id + 1;
1618 return true;
1619 }
1620
1621 id++;
1622 } while (id != fanout_next_id);
1623
1624 return false;
1625}
1626
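/* Join (or create) the fanout group identified by 'id' in this netns for the
 * requested type. The first member creates the group; its prot_hook is
 * registered once the first socket is linked. Later members must match the
 * group's type, packet type and bound device, and the group is capped at
 * PACKET_FANOUT_MAX sockets.
 */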
static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
	struct packet_rollover *rollover = NULL;
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 flags = type_flags >> 8;
	int err;

	switch (type) {
	case PACKET_FANOUT_ROLLOVER:
		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
			return -EINVAL;
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
	case PACKET_FANOUT_RND:
	case PACKET_FANOUT_QM:
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		break;
	default:
		return -EINVAL;
	}

	mutex_lock(&fanout_mutex);

	err = -EALREADY;
	if (po->fanout)
		goto out;

	if (type == PACKET_FANOUT_ROLLOVER ||
	    (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
		err = -ENOMEM;
		rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
		if (!rollover)
			goto out;
		atomic_long_set(&rollover->num, 0);
		atomic_long_set(&rollover->num_huge, 0);
		atomic_long_set(&rollover->num_failed, 0);
	}

	if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
		if (id != 0) {
			err = -EINVAL;
			goto out;
		}
		if (!fanout_find_new_id(sk, &id)) {
			err = -ENOMEM;
			goto out;
		}
		/* ephemeral flag for the first socket in the group: drop it */
		flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
	}

	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match && match->flags != flags)
		goto out;
	if (!match) {
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		match->flags = flags;
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		refcount_set(&match->sk_ref, 0);
		fanout_init_data(match);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		match->prot_hook.id_match = match_fanout_group;
		list_add(&match->list, &fanout_list);
	}
	err = -EINVAL;

	spin_lock(&po->bind_lock);
	if (po->running &&
	    match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
			__dev_remove_pack(&po->prot_hook);
			po->fanout = match;
			po->rollover = rollover;
			rollover = NULL;
			refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
			__fanout_link(sk, po);
			err = 0;
		}
	}
	spin_unlock(&po->bind_lock);

	if (err && !refcount_read(&match->sk_ref)) {
		list_del(&match->list);
		kfree(match);
	}

out:
	kfree(rollover);
	mutex_unlock(&fanout_mutex);
	return err;
}

/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
 * It is the responsibility of the caller to call fanout_release_data() and
 * free the returned packet_fanout (after synchronize_net())
 */
static struct packet_fanout *fanout_release(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f;

	mutex_lock(&fanout_mutex);
	f = po->fanout;
	if (f) {
		po->fanout = NULL;

		if (refcount_dec_and_test(&f->sk_ref))
			list_del(&f->list);
		else
			f = NULL;
	}
	mutex_unlock(&fanout_mutex);

	return f;
}

static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
					  struct sk_buff *skb)
{
	/* Earlier code assumed this would be a VLAN pkt, double-check
	 * this now that we have the actual packet in hand. We can only
	 * do this check on Ethernet devices.
	 */
	if (unlikely(dev->type != ARPHRD_ETHER))
		return false;

	skb_reset_mac_header(skb);
	return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
}

static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 * When we registered the protocol we saved the socket in the data
	 * field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 * Yank back the headers [hope the device set this
	 * right or kerboom...]
	 *
	 * Incoming packets have ll header pulled,
	 * push it back.
	 *
	 * For outgoing ones skb->data == skb_mac_header(skb)
	 * so that this procedure is noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 * The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 * Charge the memory to the socket. This is done specifically
	 * to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}

static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
{
	if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
	    sock->type == SOCK_RAW) {
		skb_reset_mac_header(skb);
		skb->protocol = dev_parse_header_protocol(skb);
	}

	skb_probe_transport_header(skb);
}

/*
 * Output a raw packet to a device layer. This bypasses all the other
 * protocol layers and you must therefore supply it with a complete frame
 */

static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	struct sockcm_cookie sockc;
	__be16 proto = 0;
	int err;
	int extra_len = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */

	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
		goto out_unlock;

	if (!skb) {
		size_t reserved = LL_RESERVED_SPACE(dev);
		int tlen = dev->needed_tailroom;
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();
		skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1933 if (skb == NULL)
1934 return -ENOBUFS;
1935 /* FIXME: Save some space for broken drivers that write a hard
1936 * header at transmission time by themselves. PPP is the notable
1937 * one here. This should really be fixed at the driver level.
1938 */
1939 skb_reserve(skb, reserved);
1940 skb_reset_network_header(skb);
1941
1942 /* Try to align data part correctly */
1943 if (hhlen) {
1944 skb->data -= hhlen;
1945 skb->tail -= hhlen;
1946 if (len < hhlen)
1947 skb_reset_network_header(skb);
1948 }
6ce8e9ce 1949 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1950 if (err)
1951 goto out_free;
1952 goto retry;
1da177e4
LT
1953 }
1954
9ed988cd
WB
1955 if (!dev_validate_header(dev, skb->data, len)) {
1956 err = -EINVAL;
1957 goto out_unlock;
1958 }
3c70c132
DB
1959 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1960 !packet_extra_vlan_len_allowed(dev, skb)) {
1961 err = -EMSGSIZE;
1962 goto out_unlock;
57f89bfa 1963 }
1a35ca80 1964
657a0667 1965 sockcm_init(&sockc, sk);
c14ac945
SHY
1966 if (msg->msg_controllen) {
1967 err = sock_cmsg_send(sk, msg, &sockc);
f8e7718c 1968 if (unlikely(err))
c14ac945 1969 goto out_unlock;
c14ac945
SHY
1970 }
1971
1da177e4
LT
1972 skb->protocol = proto;
1973 skb->dev = dev;
1974 skb->priority = sk->sk_priority;
2d37a186 1975 skb->mark = sk->sk_mark;
3d0ba8c0 1976 skb->tstamp = sockc.transmit_time;
bf84a010 1977
8f932f76 1978 skb_setup_tx_timestamp(skb, sockc.tsflags);
1da177e4 1979
3bdc0eba
BG
1980 if (unlikely(extra_len == 4))
1981 skb->no_fcs = 1;
1982
75c65772 1983 packet_parse_headers(skb, sock);
c1aad275 1984
1da177e4 1985 dev_queue_xmit(skb);
654d1f8a 1986 rcu_read_unlock();
40d4e3df 1987 return len;
1da177e4 1988
1da177e4 1989out_unlock:
654d1f8a 1990 rcu_read_unlock();
1a35ca80
ED
1991out_free:
1992 kfree_skb(skb);
1da177e4
LT
1993 return err;
1994}
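
The function above implements the legacy SOCK_PACKET transmit path: userspace supplies a complete link-layer frame and names the outgoing device in a sockaddr_pkt. A minimal, hedged userspace sketch of driving it is shown below; the interface name and frame contents are illustrative assumptions, and CAP_NET_RAW is required.

/* Hedged sketch: transmit one pre-built frame through the legacy
 * SOCK_PACKET path (packet_sendmsg_spkt above).  "eth0" and the
 * all-zero frame are illustrative assumptions only.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

int main(void)
{
	int fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
	struct sockaddr_pkt spkt;
	unsigned char frame[ETH_ZLEN] = { 0 };	/* dst MAC, src MAC, type, pad */

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	memset(&spkt, 0, sizeof(spkt));
	spkt.spkt_family = AF_PACKET;
	strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device) - 1);
	spkt.spkt_protocol = htons(ETH_P_802_3);

	/* No headers are built for us; the frame must already be complete. */
	if (sendto(fd, frame, sizeof(frame), 0,
		   (struct sockaddr *)&spkt, sizeof(spkt)) < 0)
		perror("sendto");
	close(fd);
	return 0;
}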
1da177e4 1995
ff936a04
AS
1996static unsigned int run_filter(struct sk_buff *skb,
1997 const struct sock *sk,
1998 unsigned int res)
1da177e4
LT
1999{
2000 struct sk_filter *filter;
fda9ef5d 2001
80f8f102
ED
2002 rcu_read_lock();
2003 filter = rcu_dereference(sk->sk_filter);
dbcb5855 2004 if (filter != NULL)
ff936a04 2005 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 2006 rcu_read_unlock();
1da177e4 2007
dbcb5855 2008 return res;
1da177e4
LT
2009}
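
run_filter() is where a socket filter attached with SO_ATTACH_FILTER (or SO_ATTACH_BPF) gets executed; a return value of 0 drops the packet, anything larger caps the snap length. A hedged sketch of attaching the simplest possible classic BPF program, which accepts every packet up to 0xffff bytes, follows; the constants are assumed to come from the usual Linux headers.

/* Hedged sketch: attach an "accept everything" classic BPF filter;
 * run_filter() above evaluates it for each candidate packet.
 */
#include <sys/socket.h>
#include <linux/filter.h>

int attach_accept_all(int fd)
{
	struct sock_filter code[] = {
		BPF_STMT(BPF_RET | BPF_K, 0xffff),	/* accept, snap 64 KiB */
	};
	struct sock_fprog prog = {
		.len = sizeof(code) / sizeof(code[0]),
		.filter = code,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
			  &prog, sizeof(prog));
}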
2010
16cc1400
WB
2011static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2012 size_t *len)
2013{
2014 struct virtio_net_hdr vnet_hdr;
2015
2016 if (*len < sizeof(vnet_hdr))
2017 return -EINVAL;
2018 *len -= sizeof(vnet_hdr);
2019
fd3a8862 2020 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
16cc1400
WB
2021 return -EINVAL;
2022
2023 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2024}
2025
1da177e4 2026/*
62ab0812
ED
 2027 * This function makes lazy skb cloning in the hope that most packets
2028 * are discarded by BPF.
2029 *
2030 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
2031 * and skb->cb are mangled. It works because (and until) packets
2032 * falling here are owned by current CPU. Output packets are cloned
2033 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 2034 * sequentially, so that if we return skb to original state on exit,
2035 * we will not harm anyone.
1da177e4
LT
2036 */
2037
40d4e3df
ED
2038static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2039 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2040{
2041 struct sock *sk;
2042 struct sockaddr_ll *sll;
2043 struct packet_sock *po;
40d4e3df 2044 u8 *skb_head = skb->data;
1da177e4 2045 int skb_len = skb->len;
dbcb5855 2046 unsigned int snaplen, res;
da37845f 2047 bool is_drop_n_account = false;
1da177e4
LT
2048
2049 if (skb->pkt_type == PACKET_LOOPBACK)
2050 goto drop;
2051
2052 sk = pt->af_packet_priv;
2053 po = pkt_sk(sk);
2054
09ad9bc7 2055 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2056 goto drop;
2057
1da177e4
LT
2058 skb->dev = dev;
2059
3b04ddde 2060 if (dev->header_ops) {
1da177e4 2061 /* The device has an explicit notion of ll header,
62ab0812
ED
2062 * exported to higher levels.
2063 *
2064 * Otherwise, the device hides details of its frame
2065 * structure, so that corresponding packet head is
2066 * never delivered to user.
1da177e4
LT
2067 */
2068 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2069 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2070 else if (skb->pkt_type == PACKET_OUTGOING) {
2071 /* Special case: outgoing packets have ll header at head */
bbe735e4 2072 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2073 }
2074 }
2075
2076 snaplen = skb->len;
2077
dbcb5855
DM
2078 res = run_filter(skb, sk, snaplen);
2079 if (!res)
fda9ef5d 2080 goto drop_n_restore;
dbcb5855
DM
2081 if (snaplen > res)
2082 snaplen = res;
1da177e4 2083
0fd7bac6 2084 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2085 goto drop_n_acct;
2086
2087 if (skb_shared(skb)) {
2088 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2089 if (nskb == NULL)
2090 goto drop_n_acct;
2091
2092 if (skb_head != skb->data) {
2093 skb->data = skb_head;
2094 skb->len = skb_len;
2095 }
abc4e4fa 2096 consume_skb(skb);
1da177e4
LT
2097 skb = nskb;
2098 }
2099
b4772ef8 2100 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2101
2102 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2103 sll->sll_hatype = dev->type;
1da177e4 2104 sll->sll_pkttype = skb->pkt_type;
8032b464 2105 if (unlikely(po->origdev))
80feaacb
PWJ
2106 sll->sll_ifindex = orig_dev->ifindex;
2107 else
2108 sll->sll_ifindex = dev->ifindex;
1da177e4 2109
b95cce35 2110 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2111
2472d761
EB
2112 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2113 * Use their space for storing the original skb length.
2114 */
2115 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2116
1da177e4
LT
2117 if (pskb_trim(skb, snaplen))
2118 goto drop_n_acct;
2119
2120 skb_set_owner_r(skb, sk);
2121 skb->dev = NULL;
adf30907 2122 skb_dst_drop(skb);
1da177e4 2123
84531c24
PO
2124 /* drop conntrack reference */
2125 nf_reset(skb);
2126
1da177e4 2127 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2128 po->stats.stats1.tp_packets++;
3bc3b96f 2129 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
2130 __skb_queue_tail(&sk->sk_receive_queue, skb);
2131 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2132 sk->sk_data_ready(sk);
1da177e4
LT
2133 return 0;
2134
2135drop_n_acct:
da37845f 2136 is_drop_n_account = true;
7091fbd8 2137 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2138 po->stats.stats1.tp_drops++;
7091fbd8
WB
2139 atomic_inc(&sk->sk_drops);
2140 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
2141
2142drop_n_restore:
2143 if (skb_head != skb->data && skb_shared(skb)) {
2144 skb->data = skb_head;
2145 skb->len = skb_len;
2146 }
2147drop:
da37845f
WJ
2148 if (!is_drop_n_account)
2149 consume_skb(skb);
2150 else
2151 kfree_skb(skb);
1da177e4
LT
2152 return 0;
2153}
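
packet_rcv() serves the plain, non-mmap receive path: after the filter runs and the packet is possibly trimmed to the snap length, it is queued on sk_receive_queue for an ordinary recvfrom()/recvmsg(). A hedged capture loop against that path (buffer size illustrative, CAP_NET_RAW required):

/* Hedged sketch: simple capture loop fed by packet_rcv() above. */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

int main(void)
{
	unsigned char buf[2048];
	struct sockaddr_ll from;
	socklen_t fromlen;
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	for (;;) {
		ssize_t n;

		fromlen = sizeof(from);
		n = recvfrom(fd, buf, sizeof(buf), 0,
			     (struct sockaddr *)&from, &fromlen);
		if (n < 0) {
			perror("recvfrom");
			break;
		}
		printf("ifindex %d: %zd bytes\n", from.sll_ifindex, n);
	}
	close(fd);
	return 0;
}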
2154
40d4e3df
ED
2155static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2156 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2157{
2158 struct sock *sk;
2159 struct packet_sock *po;
2160 struct sockaddr_ll *sll;
184f489e 2161 union tpacket_uhdr h;
40d4e3df 2162 u8 *skb_head = skb->data;
1da177e4 2163 int skb_len = skb->len;
dbcb5855 2164 unsigned int snaplen, res;
f6fb8f10 2165 unsigned long status = TP_STATUS_USER;
bbd6ef87 2166 unsigned short macoff, netoff, hdrlen;
1da177e4 2167 struct sk_buff *copy_skb = NULL;
bbd6ef87 2168 struct timespec ts;
b9c32fb2 2169 __u32 ts_status;
da37845f 2170 bool is_drop_n_account = false;
edbd58be 2171 bool do_vnet = false;
1da177e4 2172
51846355
AW
2173 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2174 * We may add members to them until current aligned size without forcing
2175 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2176 */
2177 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2178 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2179
1da177e4
LT
2180 if (skb->pkt_type == PACKET_LOOPBACK)
2181 goto drop;
2182
2183 sk = pt->af_packet_priv;
2184 po = pkt_sk(sk);
2185
09ad9bc7 2186 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2187 goto drop;
2188
3b04ddde 2189 if (dev->header_ops) {
1da177e4 2190 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2191 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2192 else if (skb->pkt_type == PACKET_OUTGOING) {
2193 /* Special case: outgoing packets have ll header at head */
bbe735e4 2194 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2195 }
2196 }
2197
2198 snaplen = skb->len;
2199
dbcb5855
DM
2200 res = run_filter(skb, sk, snaplen);
2201 if (!res)
fda9ef5d 2202 goto drop_n_restore;
68c2e5de
AD
2203
2204 if (skb->ip_summed == CHECKSUM_PARTIAL)
2205 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2206 else if (skb->pkt_type != PACKET_OUTGOING &&
2207 (skb->ip_summed == CHECKSUM_COMPLETE ||
2208 skb_csum_unnecessary(skb)))
2209 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2210
dbcb5855
DM
2211 if (snaplen > res)
2212 snaplen = res;
1da177e4
LT
2213
2214 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2215 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2216 po->tp_reserve;
1da177e4 2217 } else {
95c96174 2218 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2219 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2220 (maclen < 16 ? 16 : maclen)) +
58d19b19 2221 po->tp_reserve;
edbd58be 2222 if (po->has_vnet_hdr) {
58d19b19 2223 netoff += sizeof(struct virtio_net_hdr);
edbd58be
BP
2224 do_vnet = true;
2225 }
1da177e4
LT
2226 macoff = netoff - maclen;
2227 }
f6fb8f10 2228 if (po->tp_version <= TPACKET_V2) {
2229 if (macoff + snaplen > po->rx_ring.frame_size) {
2230 if (po->copy_thresh &&
0fd7bac6 2231 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2232 if (skb_shared(skb)) {
2233 copy_skb = skb_clone(skb, GFP_ATOMIC);
2234 } else {
2235 copy_skb = skb_get(skb);
2236 skb_head = skb->data;
2237 }
2238 if (copy_skb)
2239 skb_set_owner_r(copy_skb, sk);
1da177e4 2240 }
f6fb8f10 2241 snaplen = po->rx_ring.frame_size - macoff;
edbd58be 2242 if ((int)snaplen < 0) {
f6fb8f10 2243 snaplen = 0;
edbd58be
BP
2244 do_vnet = false;
2245 }
1da177e4 2246 }
dc808110
ED
2247 } else if (unlikely(macoff + snaplen >
2248 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2249 u32 nval;
2250
2251 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2252 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2253 snaplen, nval, macoff);
2254 snaplen = nval;
2255 if (unlikely((int)snaplen < 0)) {
2256 snaplen = 0;
2257 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
edbd58be 2258 do_vnet = false;
dc808110 2259 }
1da177e4 2260 }
1da177e4 2261 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2262 h.raw = packet_current_rx_frame(po, skb,
2263 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2264 if (!h.raw)
58d19b19 2265 goto drop_n_account;
f6fb8f10 2266 if (po->tp_version <= TPACKET_V2) {
2267 packet_increment_rx_head(po, &po->rx_ring);
2268 /*
2269 * LOSING will be reported till you read the stats,
2270 * because it's COR - Clear On Read.
2271 * Anyways, moving it for V1/V2 only as V3 doesn't need this
2272 * at packet level.
2273 */
ee80fbf3 2274 if (po->stats.stats1.tp_drops)
f6fb8f10 2275 status |= TP_STATUS_LOSING;
2276 }
945d015e
ED
2277
2278 if (do_vnet &&
2279 virtio_net_hdr_from_skb(skb, h.raw + macoff -
2280 sizeof(struct virtio_net_hdr),
2281 vio_le(), true, 0))
2282 goto drop_n_account;
2283
ee80fbf3 2284 po->stats.stats1.tp_packets++;
1da177e4
LT
2285 if (copy_skb) {
2286 status |= TP_STATUS_COPY;
2287 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2288 }
1da177e4
LT
2289 spin_unlock(&sk->sk_receive_queue.lock);
2290
bbd6ef87 2291 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2292
2293 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2294 getnstimeofday(&ts);
1da177e4 2295
b9c32fb2
DB
2296 status |= ts_status;
2297
bbd6ef87
PM
2298 switch (po->tp_version) {
2299 case TPACKET_V1:
2300 h.h1->tp_len = skb->len;
2301 h.h1->tp_snaplen = snaplen;
2302 h.h1->tp_mac = macoff;
2303 h.h1->tp_net = netoff;
4b457bdf
DB
2304 h.h1->tp_sec = ts.tv_sec;
2305 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2306 hdrlen = sizeof(*h.h1);
2307 break;
2308 case TPACKET_V2:
2309 h.h2->tp_len = skb->len;
2310 h.h2->tp_snaplen = snaplen;
2311 h.h2->tp_mac = macoff;
2312 h.h2->tp_net = netoff;
bbd6ef87
PM
2313 h.h2->tp_sec = ts.tv_sec;
2314 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2315 if (skb_vlan_tag_present(skb)) {
2316 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2317 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2318 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2319 } else {
2320 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2321 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2322 }
e4d26f4b 2323 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2324 hdrlen = sizeof(*h.h2);
2325 break;
f6fb8f10 2326 case TPACKET_V3:
2327 /* tp_nxt_offset,vlan are already populated above.
2328 * So DONT clear those fields here
2329 */
2330 h.h3->tp_status |= status;
2331 h.h3->tp_len = skb->len;
2332 h.h3->tp_snaplen = snaplen;
2333 h.h3->tp_mac = macoff;
2334 h.h3->tp_net = netoff;
f6fb8f10 2335 h.h3->tp_sec = ts.tv_sec;
2336 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2337 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2338 hdrlen = sizeof(*h.h3);
2339 break;
bbd6ef87
PM
2340 default:
2341 BUG();
2342 }
1da177e4 2343
bbd6ef87 2344 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2345 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2346 sll->sll_family = AF_PACKET;
2347 sll->sll_hatype = dev->type;
2348 sll->sll_protocol = skb->protocol;
2349 sll->sll_pkttype = skb->pkt_type;
8032b464 2350 if (unlikely(po->origdev))
80feaacb
PWJ
2351 sll->sll_ifindex = orig_dev->ifindex;
2352 else
2353 sll->sll_ifindex = dev->ifindex;
1da177e4 2354
e16aa207 2355 smp_mb();
f0d4eb29 2356
f6dafa95 2357#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2358 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2359 u8 *start, *end;
2360
f0d4eb29
DB
2361 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2362 macoff + snaplen);
2363
2364 for (start = h.raw; start < end; start += PAGE_SIZE)
2365 flush_dcache_page(pgv_to_page(start));
1da177e4 2366 }
f0d4eb29 2367 smp_wmb();
f6dafa95 2368#endif
f0d4eb29 2369
da413eec 2370 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2371 __packet_set_status(po, h.raw, status);
da413eec
DC
2372 sk->sk_data_ready(sk);
2373 } else {
f6fb8f10 2374 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2375 }
1da177e4
LT
2376
2377drop_n_restore:
2378 if (skb_head != skb->data && skb_shared(skb)) {
2379 skb->data = skb_head;
2380 skb->len = skb_len;
2381 }
2382drop:
da37845f
WJ
2383 if (!is_drop_n_account)
2384 consume_skb(skb);
2385 else
2386 kfree_skb(skb);
1da177e4
LT
2387 return 0;
2388
58d19b19 2389drop_n_account:
da37845f 2390 is_drop_n_account = true;
ee80fbf3 2391 po->stats.stats1.tp_drops++;
1da177e4
LT
2392 spin_unlock(&sk->sk_receive_queue.lock);
2393
676d2369 2394 sk->sk_data_ready(sk);
acb5d75b 2395 kfree_skb(copy_skb);
1da177e4
LT
2396 goto drop_n_restore;
2397}
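
tpacket_rcv() is the memory-mapped variant: frames are written straight into PACKET_RX_RING slots and the tp_status word of each slot is flipped to TP_STATUS_USER once the frame is complete. A hedged sketch of the matching userspace setup for TPACKET_V2 (block and frame sizes are illustrative, error handling trimmed):

/* Hedged sketch: TPACKET_V2 RX ring that tpacket_rcv() above fills in.
 * Block/frame sizes are illustrative; check the mmap() result against
 * MAP_FAILED, not NULL.
 */
#include <string.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void *setup_rx_ring(int fd, struct tpacket_req *req)
{
	int ver = TPACKET_V2;

	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)))
		return MAP_FAILED;

	memset(req, 0, sizeof(*req));
	req->tp_block_size = 4096;		/* one page per block */
	req->tp_frame_size = 2048;
	req->tp_block_nr   = 64;
	req->tp_frame_nr   = (req->tp_block_size / req->tp_frame_size) *
			     req->tp_block_nr;

	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, req, sizeof(*req)))
		return MAP_FAILED;

	/* Each frame slot begins with a struct tpacket2_hdr; poll() the
	 * socket and consume slots whose tp_status has TP_STATUS_USER set,
	 * then write TP_STATUS_KERNEL back to release the slot.
	 */
	return mmap(NULL, (size_t)req->tp_block_size * req->tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}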
2398
69e3c75f
JB
2399static void tpacket_destruct_skb(struct sk_buff *skb)
2400{
2401 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2402
69e3c75f 2403 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2404 void *ph;
b9c32fb2
DB
2405 __u32 ts;
2406
5cd8d46e 2407 ph = skb_zcopy_get_nouarg(skb);
b0138408 2408 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2409
2410 ts = __packet_set_timestamp(po, ph, skb);
2411 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2412 }
2413
2414 sock_wfree(skb);
2415}
2416
16cc1400
WB
2417static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2418{
16cc1400
WB
2419 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2420 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2421 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2422 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2423 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2424 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2425 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2426
2427 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2428 return -EINVAL;
2429
16cc1400
WB
2430 return 0;
2431}
2432
2433static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2434 struct virtio_net_hdr *vnet_hdr)
2435{
16cc1400
WB
2436 if (*len < sizeof(*vnet_hdr))
2437 return -EINVAL;
2438 *len -= sizeof(*vnet_hdr);
2439
cbbd26b8 2440 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
16cc1400
WB
2441 return -EFAULT;
2442
2443 return __packet_snd_vnet_parse(vnet_hdr, *len);
2444}
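
The two helpers above consume the struct virtio_net_hdr that must lead every message once PACKET_VNET_HDR has been switched on for the socket. A hedged sketch of the corresponding userspace layout, with the offload fields left zero (no checksum or GSO requested):

/* Hedged sketch: message layout expected once PACKET_VNET_HDR is enabled
 * on the socket (setsockopt not shown); the zeroed header requests no
 * offloads, so the frame is sent as-is.
 */
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/virtio_net.h>

ssize_t send_with_vnet_hdr(int fd, const void *frame, size_t len)
{
	struct virtio_net_hdr vnet = { 0 };	/* no csum/GSO offload */
	struct iovec iov[2] = {
		{ .iov_base = &vnet,         .iov_len = sizeof(vnet) },
		{ .iov_base = (void *)frame, .iov_len = len },
	};
	struct msghdr msg = { .msg_iov = iov, .msg_iovlen = 2 };

	return sendmsg(fd, &msg, 0);
}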
2445
40d4e3df 2446static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2447 void *frame, struct net_device *dev, void *data, int tp_len,
c14ac945
SHY
2448 __be16 proto, unsigned char *addr, int hlen, int copylen,
2449 const struct sockcm_cookie *sockc)
69e3c75f 2450{
184f489e 2451 union tpacket_uhdr ph;
8d39b4a6 2452 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2453 struct socket *sock = po->sk.sk_socket;
2454 struct page *page;
69e3c75f
JB
2455 int err;
2456
2457 ph.raw = frame;
2458
2459 skb->protocol = proto;
2460 skb->dev = dev;
2461 skb->priority = po->sk.sk_priority;
2d37a186 2462 skb->mark = po->sk.sk_mark;
3d0ba8c0 2463 skb->tstamp = sockc->transmit_time;
8f932f76 2464 skb_setup_tx_timestamp(skb, sockc->tsflags);
5cd8d46e 2465 skb_zcopy_set_nouarg(skb, ph.raw);
69e3c75f 2466
ae641949 2467 skb_reserve(skb, hlen);
69e3c75f 2468 skb_reset_network_header(skb);
c1aad275 2469
69e3c75f
JB
2470 to_write = tp_len;
2471
2472 if (sock->type == SOCK_DGRAM) {
2473 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2474 NULL, tp_len);
2475 if (unlikely(err < 0))
2476 return -EINVAL;
1d036d25 2477 } else if (copylen) {
9ed988cd
WB
2478 int hdrlen = min_t(int, copylen, tp_len);
2479
69e3c75f 2480 skb_push(skb, dev->hard_header_len);
1d036d25 2481 skb_put(skb, copylen - dev->hard_header_len);
9ed988cd 2482 err = skb_store_bits(skb, 0, data, hdrlen);
69e3c75f
JB
2483 if (unlikely(err))
2484 return err;
9ed988cd
WB
2485 if (!dev_validate_header(dev, skb->data, hdrlen))
2486 return -EINVAL;
69e3c75f 2487
9ed988cd
WB
2488 data += hdrlen;
2489 to_write -= hdrlen;
69e3c75f
JB
2490 }
2491
69e3c75f
JB
2492 offset = offset_in_page(data);
2493 len_max = PAGE_SIZE - offset;
2494 len = ((to_write > len_max) ? len_max : to_write);
2495
2496 skb->data_len = to_write;
2497 skb->len += to_write;
2498 skb->truesize += to_write;
14afee4b 2499 refcount_add(to_write, &po->sk.sk_wmem_alloc);
69e3c75f
JB
2500
2501 while (likely(to_write)) {
2502 nr_frags = skb_shinfo(skb)->nr_frags;
2503
2504 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2505 pr_err("Packet exceed the number of skb frags(%lu)\n",
2506 MAX_SKB_FRAGS);
69e3c75f
JB
2507 return -EFAULT;
2508 }
2509
0af55bb5
CG
2510 page = pgv_to_page(data);
2511 data += len;
69e3c75f
JB
2512 flush_dcache_page(page);
2513 get_page(page);
0af55bb5 2514 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2515 to_write -= len;
2516 offset = 0;
2517 len_max = PAGE_SIZE;
2518 len = ((to_write > len_max) ? len_max : to_write);
2519 }
2520
75c65772 2521 packet_parse_headers(skb, sock);
efdfa2f7 2522
69e3c75f
JB
2523 return tp_len;
2524}
2525
8d39b4a6
WB
2526static int tpacket_parse_header(struct packet_sock *po, void *frame,
2527 int size_max, void **data)
2528{
2529 union tpacket_uhdr ph;
2530 int tp_len, off;
2531
2532 ph.raw = frame;
2533
2534 switch (po->tp_version) {
7f953ab2
SV
2535 case TPACKET_V3:
2536 if (ph.h3->tp_next_offset != 0) {
2537 pr_warn_once("variable sized slot not supported");
2538 return -EINVAL;
2539 }
2540 tp_len = ph.h3->tp_len;
2541 break;
8d39b4a6
WB
2542 case TPACKET_V2:
2543 tp_len = ph.h2->tp_len;
2544 break;
2545 default:
2546 tp_len = ph.h1->tp_len;
2547 break;
2548 }
2549 if (unlikely(tp_len > size_max)) {
2550 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2551 return -EMSGSIZE;
2552 }
2553
2554 if (unlikely(po->tp_tx_has_off)) {
2555 int off_min, off_max;
2556
2557 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2558 off_max = po->tx_ring.frame_size - tp_len;
2559 if (po->sk.sk_type == SOCK_DGRAM) {
2560 switch (po->tp_version) {
7f953ab2
SV
2561 case TPACKET_V3:
2562 off = ph.h3->tp_net;
2563 break;
8d39b4a6
WB
2564 case TPACKET_V2:
2565 off = ph.h2->tp_net;
2566 break;
2567 default:
2568 off = ph.h1->tp_net;
2569 break;
2570 }
2571 } else {
2572 switch (po->tp_version) {
7f953ab2
SV
2573 case TPACKET_V3:
2574 off = ph.h3->tp_mac;
2575 break;
8d39b4a6
WB
2576 case TPACKET_V2:
2577 off = ph.h2->tp_mac;
2578 break;
2579 default:
2580 off = ph.h1->tp_mac;
2581 break;
2582 }
2583 }
2584 if (unlikely((off < off_min) || (off_max < off)))
2585 return -EINVAL;
2586 } else {
2587 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2588 }
2589
2590 *data = frame + off;
2591 return tp_len;
2592}
2593
69e3c75f
JB
2594static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2595{
69e3c75f
JB
2596 struct sk_buff *skb;
2597 struct net_device *dev;
1d036d25 2598 struct virtio_net_hdr *vnet_hdr = NULL;
c14ac945 2599 struct sockcm_cookie sockc;
69e3c75f 2600 __be16 proto;
09effa67 2601 int err, reserve = 0;
40d4e3df 2602 void *ph;
342dfc30 2603 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2604 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2605 int tp_len, size_max;
2606 unsigned char *addr;
8d39b4a6 2607 void *data;
69e3c75f 2608 int len_sum = 0;
9e67030a 2609 int status = TP_STATUS_AVAILABLE;
1d036d25 2610 int hlen, tlen, copylen = 0;
69e3c75f 2611
69e3c75f
JB
2612 mutex_lock(&po->pg_vec_lock);
2613
66e56cd4 2614 if (likely(saddr == NULL)) {
e40526cb 2615 dev = packet_cached_dev_get(po);
69e3c75f
JB
2616 proto = po->num;
2617 addr = NULL;
2618 } else {
2619 err = -EINVAL;
2620 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2621 goto out;
2622 if (msg->msg_namelen < (saddr->sll_halen
2623 + offsetof(struct sockaddr_ll,
2624 sll_addr)))
2625 goto out;
69e3c75f 2626 proto = saddr->sll_protocol;
6b8d95f1 2627 addr = saddr->sll_halen ? saddr->sll_addr : NULL;
827d9780 2628 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
99137b78 2629 if (addr && dev && saddr->sll_halen < dev->addr_len)
d972f3dc 2630 goto out_put;
69e3c75f
JB
2631 }
2632
69e3c75f
JB
2633 err = -ENXIO;
2634 if (unlikely(dev == NULL))
2635 goto out;
69e3c75f
JB
2636 err = -ENETDOWN;
2637 if (unlikely(!(dev->flags & IFF_UP)))
2638 goto out_put;
2639
657a0667 2640 sockcm_init(&sockc, &po->sk);
d19b183c
DCS
2641 if (msg->msg_controllen) {
2642 err = sock_cmsg_send(&po->sk, msg, &sockc);
2643 if (unlikely(err))
2644 goto out_put;
2645 }
2646
5cfb4c8d
DB
2647 if (po->sk.sk_socket->type == SOCK_RAW)
2648 reserve = dev->hard_header_len;
69e3c75f 2649 size_max = po->tx_ring.frame_size
b5dd884e 2650 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2651
1d036d25 2652 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
5cfb4c8d 2653 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2654
69e3c75f
JB
2655 do {
2656 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2657 TP_STATUS_SEND_REQUEST);
69e3c75f 2658 if (unlikely(ph == NULL)) {
87a2fd28
DB
2659 if (need_wait && need_resched())
2660 schedule();
69e3c75f
JB
2661 continue;
2662 }
2663
8d39b4a6
WB
2664 skb = NULL;
2665 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2666 if (tp_len < 0)
2667 goto tpacket_error;
2668
69e3c75f 2669 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2670 hlen = LL_RESERVED_SPACE(dev);
2671 tlen = dev->needed_tailroom;
1d036d25
WB
2672 if (po->has_vnet_hdr) {
2673 vnet_hdr = data;
2674 data += sizeof(*vnet_hdr);
2675 tp_len -= sizeof(*vnet_hdr);
2676 if (tp_len < 0 ||
2677 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2678 tp_len = -EINVAL;
2679 goto tpacket_error;
2680 }
2681 copylen = __virtio16_to_cpu(vio_le(),
2682 vnet_hdr->hdr_len);
2683 }
9ed988cd 2684 copylen = max_t(int, copylen, dev->hard_header_len);
69e3c75f 2685 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2686 hlen + tlen + sizeof(struct sockaddr_ll) +
2687 (copylen - dev->hard_header_len),
fbf33a28 2688 !need_wait, &err);
69e3c75f 2689
fbf33a28
KM
2690 if (unlikely(skb == NULL)) {
2691 /* we assume the socket was initially writeable ... */
2692 if (likely(len_sum > 0))
2693 err = len_sum;
69e3c75f 2694 goto out_status;
fbf33a28 2695 }
8d39b4a6 2696 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
c14ac945 2697 addr, hlen, copylen, &sockc);
dbd46ab4 2698 if (likely(tp_len >= 0) &&
5cfb4c8d 2699 tp_len > dev->mtu + reserve &&
1d036d25 2700 !po->has_vnet_hdr &&
3c70c132
DB
2701 !packet_extra_vlan_len_allowed(dev, skb))
2702 tp_len = -EMSGSIZE;
69e3c75f
JB
2703
2704 if (unlikely(tp_len < 0)) {
8d39b4a6 2705tpacket_error:
69e3c75f
JB
2706 if (po->tp_loss) {
2707 __packet_set_status(po, ph,
2708 TP_STATUS_AVAILABLE);
2709 packet_increment_head(&po->tx_ring);
2710 kfree_skb(skb);
2711 continue;
2712 } else {
2713 status = TP_STATUS_WRONG_FORMAT;
2714 err = tp_len;
2715 goto out_status;
2716 }
2717 }
2718
9d2f67e4
JT
2719 if (po->has_vnet_hdr) {
2720 if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
2721 tp_len = -EINVAL;
2722 goto tpacket_error;
2723 }
2724 virtio_net_hdr_set_proto(skb, vnet_hdr);
1d036d25
WB
2725 }
2726
69e3c75f
JB
2727 skb->destructor = tpacket_destruct_skb;
2728 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2729 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2730
2731 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2732 err = po->xmit(skb);
eb70df13
JP
2733 if (unlikely(err > 0)) {
2734 err = net_xmit_errno(err);
2735 if (err && __packet_get_status(po, ph) ==
2736 TP_STATUS_AVAILABLE) {
2737 /* skb was destructed already */
2738 skb = NULL;
2739 goto out_status;
2740 }
2741 /*
2742 * skb was dropped but not destructed yet;
2743 * let's treat it like congestion or err < 0
2744 */
2745 err = 0;
2746 }
69e3c75f
JB
2747 packet_increment_head(&po->tx_ring);
2748 len_sum += tp_len;
b0138408
DB
2749 } while (likely((ph != NULL) ||
2750 /* Note: packet_read_pending() might be slow if we have
2751 * to call it as it's per_cpu variable, but in fast-path
2752 * we already short-circuit the loop with the first
2753 * condition, and luckily don't have to go that path
2754 * anyway.
2755 */
2756 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2757
2758 err = len_sum;
2759 goto out_put;
2760
69e3c75f
JB
2761out_status:
2762 __packet_set_status(po, ph, status);
2763 kfree_skb(skb);
2764out_put:
e40526cb 2765 dev_put(dev);
69e3c75f
JB
2766out:
2767 mutex_unlock(&po->pg_vec_lock);
2768 return err;
2769}
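
tpacket_snd() walks the PACKET_TX_RING, transmitting every slot marked TP_STATUS_SEND_REQUEST and letting tpacket_destruct_skb() flip it back to TP_STATUS_AVAILABLE. From userspace the loop is: write the frame into the next slot, publish the status, then kick the kernel with a zero-length send(). A hedged TPACKET_V2 sketch, assuming the ring has already been mapped much like the RX example earlier:

/* Hedged sketch: queue one frame in a mapped TPACKET_V2 TX ring and
 * trigger tpacket_snd().  "slot" is assumed to point at the current
 * frame slot in the mapped ring.
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

int tx_one_frame(int fd, void *slot, const void *frame, unsigned int len)
{
	struct tpacket2_hdr *hdr = slot;

	if (hdr->tp_status != TP_STATUS_AVAILABLE)
		return -1;			/* slot still owned by kernel */

	/* For SOCK_RAW the frame data starts after the aligned header. */
	memcpy((char *)slot + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll),
	       frame, len);
	hdr->tp_len = len;
	__sync_synchronize();			/* publish data before status */
	hdr->tp_status = TP_STATUS_SEND_REQUEST;

	/* A zero-length send() kicks the fill/transmit loop above. */
	return (int)send(fd, NULL, 0, 0);
}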
69e3c75f 2770
eea49cc9
OJ
2771static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2772 size_t reserve, size_t len,
2773 size_t linear, int noblock,
2774 int *err)
bfd5f4a3
SS
2775{
2776 struct sk_buff *skb;
2777
2778 /* Under a page? Don't bother with paged skb. */
2779 if (prepad + len < PAGE_SIZE || !linear)
2780 linear = len;
2781
2782 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2783 err, 0);
bfd5f4a3
SS
2784 if (!skb)
2785 return NULL;
2786
2787 skb_reserve(skb, reserve);
2788 skb_put(skb, linear);
2789 skb->data_len = len - linear;
2790 skb->len += len - linear;
2791
2792 return skb;
2793}
2794
d346a3fa 2795static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2796{
2797 struct sock *sk = sock->sk;
342dfc30 2798 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2799 struct sk_buff *skb;
2800 struct net_device *dev;
0e11c91e 2801 __be16 proto;
1da177e4 2802 unsigned char *addr;
827d9780 2803 int err, reserve = 0;
c7d39e32 2804 struct sockcm_cookie sockc;
bfd5f4a3
SS
2805 struct virtio_net_hdr vnet_hdr = { 0 };
2806 int offset = 0;
bfd5f4a3 2807 struct packet_sock *po = pkt_sk(sk);
da7c9561 2808 bool has_vnet_hdr = false;
57031eb7 2809 int hlen, tlen, linear;
3bdc0eba 2810 int extra_len = 0;
1da177e4
LT
2811
2812 /*
1ce4f28b 2813 * Get and verify the address.
1da177e4 2814 */
1ce4f28b 2815
66e56cd4 2816 if (likely(saddr == NULL)) {
e40526cb 2817 dev = packet_cached_dev_get(po);
1da177e4
LT
2818 proto = po->num;
2819 addr = NULL;
2820 } else {
2821 err = -EINVAL;
2822 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2823 goto out;
0fb375fb
EB
2824 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2825 goto out;
1da177e4 2826 proto = saddr->sll_protocol;
6b8d95f1 2827 addr = saddr->sll_halen ? saddr->sll_addr : NULL;
827d9780 2828 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
99137b78 2829 if (addr && dev && saddr->sll_halen < dev->addr_len)
d972f3dc 2830 goto out_unlock;
1da177e4
LT
2831 }
2832
1da177e4 2833 err = -ENXIO;
e40526cb 2834 if (unlikely(dev == NULL))
1da177e4 2835 goto out_unlock;
d5e76b0a 2836 err = -ENETDOWN;
e40526cb 2837 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2838 goto out_unlock;
2839
657a0667 2840 sockcm_init(&sockc, sk);
c7d39e32
EJ
2841 sockc.mark = sk->sk_mark;
2842 if (msg->msg_controllen) {
2843 err = sock_cmsg_send(sk, msg, &sockc);
2844 if (unlikely(err))
2845 goto out_unlock;
2846 }
2847
e40526cb
DB
2848 if (sock->type == SOCK_RAW)
2849 reserve = dev->hard_header_len;
bfd5f4a3 2850 if (po->has_vnet_hdr) {
16cc1400
WB
2851 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2852 if (err)
bfd5f4a3 2853 goto out_unlock;
da7c9561 2854 has_vnet_hdr = true;
bfd5f4a3
SS
2855 }
2856
3bdc0eba
BG
2857 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2858 if (!netif_supports_nofcs(dev)) {
2859 err = -EPROTONOSUPPORT;
2860 goto out_unlock;
2861 }
2862 extra_len = 4; /* We're doing our own CRC */
2863 }
2864
1da177e4 2865 err = -EMSGSIZE;
16cc1400
WB
2866 if (!vnet_hdr.gso_type &&
2867 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2868 goto out_unlock;
2869
bfd5f4a3 2870 err = -ENOBUFS;
ae641949
HX
2871 hlen = LL_RESERVED_SPACE(dev);
2872 tlen = dev->needed_tailroom;
57031eb7
WB
2873 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2874 linear = max(linear, min_t(int, len, dev->hard_header_len));
2875 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
bfd5f4a3 2876 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2877 if (skb == NULL)
1da177e4
LT
2878 goto out_unlock;
2879
b84bbaf7 2880 skb_reset_network_header(skb);
1da177e4 2881
0c4e8581 2882 err = -EINVAL;
9c707762
WB
2883 if (sock->type == SOCK_DGRAM) {
2884 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2885 if (unlikely(offset < 0))
9c707762 2886 goto out_free;
b84bbaf7 2887 } else if (reserve) {
9aad13b0 2888 skb_reserve(skb, -reserve);
88a8121d
ND
2889 if (len < reserve + sizeof(struct ipv6hdr) &&
2890 dev->min_header_len != dev->hard_header_len)
993675a3 2891 skb_reset_network_header(skb);
9c707762 2892 }
1da177e4
LT
2893
2894 /* Returns -EFAULT on error */
c0371da6 2895 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2896 if (err)
2897 goto out_free;
bf84a010 2898
9ed988cd
WB
2899 if (sock->type == SOCK_RAW &&
2900 !dev_validate_header(dev, skb->data, len)) {
2901 err = -EINVAL;
2902 goto out_free;
2903 }
2904
8f932f76 2905 skb_setup_tx_timestamp(skb, sockc.tsflags);
1da177e4 2906
16cc1400 2907 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
2908 !packet_extra_vlan_len_allowed(dev, skb)) {
2909 err = -EMSGSIZE;
2910 goto out_free;
57f89bfa
BG
2911 }
2912
09effa67
DM
2913 skb->protocol = proto;
2914 skb->dev = dev;
1da177e4 2915 skb->priority = sk->sk_priority;
c7d39e32 2916 skb->mark = sockc.mark;
3d0ba8c0 2917 skb->tstamp = sockc.transmit_time;
0fd5d57b 2918
da7c9561 2919 if (has_vnet_hdr) {
db60eb5f 2920 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
16cc1400
WB
2921 if (err)
2922 goto out_free;
2923 len += sizeof(vnet_hdr);
9d2f67e4 2924 virtio_net_hdr_set_proto(skb, &vnet_hdr);
bfd5f4a3
SS
2925 }
2926
75c65772 2927 packet_parse_headers(skb, sock);
8fd6c80d 2928
3bdc0eba
BG
2929 if (unlikely(extra_len == 4))
2930 skb->no_fcs = 1;
2931
d346a3fa 2932 err = po->xmit(skb);
1da177e4
LT
2933 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2934 goto out_unlock;
2935
e40526cb 2936 dev_put(dev);
1da177e4 2937
40d4e3df 2938 return len;
1da177e4
LT
2939
2940out_free:
2941 kfree_skb(skb);
2942out_unlock:
e40526cb 2943 if (dev)
1da177e4
LT
2944 dev_put(dev);
2945out:
2946 return err;
2947}
2948
1b784140 2949static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2950{
69e3c75f
JB
2951 struct sock *sk = sock->sk;
2952 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2953
69e3c75f
JB
2954 if (po->tx_ring.pg_vec)
2955 return tpacket_snd(po, msg);
2956 else
69e3c75f
JB
2957 return packet_snd(sock, msg, len);
2958}
2959
1da177e4
LT
2960/*
2961 * Close a PACKET socket. This is fairly simple. We immediately go
2962 * to 'closed' state and remove our protocol entry in the device list.
2963 */
2964
2965static int packet_release(struct socket *sock)
2966{
2967 struct sock *sk = sock->sk;
2968 struct packet_sock *po;
2bd624b4 2969 struct packet_fanout *f;
d12d01d6 2970 struct net *net;
f6fb8f10 2971 union tpacket_req_u req_u;
1da177e4
LT
2972
2973 if (!sk)
2974 return 0;
2975
3b1e0a65 2976 net = sock_net(sk);
1da177e4
LT
2977 po = pkt_sk(sk);
2978
0fa7fa98 2979 mutex_lock(&net->packet.sklist_lock);
808f5114 2980 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2981 mutex_unlock(&net->packet.sklist_lock);
2982
2983 preempt_disable();
920de804 2984 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2985 preempt_enable();
1da177e4 2986
808f5114 2987 spin_lock(&po->bind_lock);
ce06b03e 2988 unregister_prot_hook(sk, false);
66e56cd4
DB
2989 packet_cached_dev_reset(po);
2990
160ff18a
BG
2991 if (po->prot_hook.dev) {
2992 dev_put(po->prot_hook.dev);
2993 po->prot_hook.dev = NULL;
2994 }
808f5114 2995 spin_unlock(&po->bind_lock);
1da177e4 2996
1da177e4 2997 packet_flush_mclist(sk);
1da177e4 2998
5171b37d 2999 lock_sock(sk);
9665d5d6
PS
3000 if (po->rx_ring.pg_vec) {
3001 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3002 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 3003 }
69e3c75f 3004
9665d5d6
PS
3005 if (po->tx_ring.pg_vec) {
3006 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3007 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 3008 }
5171b37d 3009 release_sock(sk);
1da177e4 3010
2bd624b4 3011 f = fanout_release(sk);
dc99f600 3012
808f5114 3013 synchronize_net();
2bd624b4
AS
3014
3015 if (f) {
57f015f5 3016 kfree(po->rollover);
2bd624b4
AS
3017 fanout_release_data(f);
3018 kfree(f);
3019 }
1da177e4
LT
3020 /*
3021 * Now the socket is dead. No more input will appear.
3022 */
1da177e4
LT
3023 sock_orphan(sk);
3024 sock->sk = NULL;
3025
3026 /* Purge queues */
3027
3028 skb_queue_purge(&sk->sk_receive_queue);
b0138408 3029 packet_free_pending(po);
17ab56a2 3030 sk_refcnt_debug_release(sk);
1da177e4
LT
3031
3032 sock_put(sk);
3033 return 0;
3034}
3035
3036/*
3037 * Attach a packet hook.
3038 */
3039
30f7ea1c
FR
3040static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3041 __be16 proto)
1da177e4
LT
3042{
3043 struct packet_sock *po = pkt_sk(sk);
158cd4af 3044 struct net_device *dev_curr;
902fefb8
DB
3045 __be16 proto_curr;
3046 bool need_rehook;
30f7ea1c
FR
3047 struct net_device *dev = NULL;
3048 int ret = 0;
3049 bool unlisted = false;
dc99f600 3050
1da177e4 3051 lock_sock(sk);
1da177e4 3052 spin_lock(&po->bind_lock);
30f7ea1c
FR
3053 rcu_read_lock();
3054
4971613c
WB
3055 if (po->fanout) {
3056 ret = -EINVAL;
3057 goto out_unlock;
3058 }
3059
30f7ea1c
FR
3060 if (name) {
3061 dev = dev_get_by_name_rcu(sock_net(sk), name);
3062 if (!dev) {
3063 ret = -ENODEV;
3064 goto out_unlock;
3065 }
3066 } else if (ifindex) {
3067 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3068 if (!dev) {
3069 ret = -ENODEV;
3070 goto out_unlock;
3071 }
3072 }
3073
3074 if (dev)
3075 dev_hold(dev);
66e56cd4 3076
902fefb8
DB
3077 proto_curr = po->prot_hook.type;
3078 dev_curr = po->prot_hook.dev;
3079
3080 need_rehook = proto_curr != proto || dev_curr != dev;
3081
3082 if (need_rehook) {
30f7ea1c
FR
3083 if (po->running) {
3084 rcu_read_unlock();
15fe076e
ED
3085 /* prevents packet_notifier() from calling
3086 * register_prot_hook()
3087 */
3088 po->num = 0;
30f7ea1c
FR
3089 __unregister_prot_hook(sk, true);
3090 rcu_read_lock();
3091 dev_curr = po->prot_hook.dev;
3092 if (dev)
3093 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3094 dev->ifindex);
3095 }
1da177e4 3096
15fe076e 3097 BUG_ON(po->running);
902fefb8
DB
3098 po->num = proto;
3099 po->prot_hook.type = proto;
902fefb8 3100
30f7ea1c
FR
3101 if (unlikely(unlisted)) {
3102 dev_put(dev);
3103 po->prot_hook.dev = NULL;
3104 po->ifindex = -1;
3105 packet_cached_dev_reset(po);
3106 } else {
3107 po->prot_hook.dev = dev;
3108 po->ifindex = dev ? dev->ifindex : 0;
3109 packet_cached_dev_assign(po, dev);
3110 }
902fefb8 3111 }
158cd4af
LW
3112 if (dev_curr)
3113 dev_put(dev_curr);
66e56cd4 3114
902fefb8 3115 if (proto == 0 || !need_rehook)
1da177e4
LT
3116 goto out_unlock;
3117
30f7ea1c 3118 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3119 register_prot_hook(sk);
be85d4ad
UT
3120 } else {
3121 sk->sk_err = ENETDOWN;
3122 if (!sock_flag(sk, SOCK_DEAD))
3123 sk->sk_error_report(sk);
1da177e4
LT
3124 }
3125
3126out_unlock:
30f7ea1c 3127 rcu_read_unlock();
1da177e4
LT
3128 spin_unlock(&po->bind_lock);
3129 release_sock(sk);
30f7ea1c 3130 return ret;
1da177e4
LT
3131}
3132
3133/*
3134 * Bind a packet socket to a device
3135 */
3136
40d4e3df
ED
3137static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3138 int addr_len)
1da177e4 3139{
40d4e3df 3140 struct sock *sk = sock->sk;
540e2894 3141 char name[sizeof(uaddr->sa_data) + 1];
1ce4f28b 3142
1da177e4
LT
3143 /*
3144 * Check legality
3145 */
1ce4f28b 3146
8ae55f04 3147 if (addr_len != sizeof(struct sockaddr))
1da177e4 3148 return -EINVAL;
540e2894
AP
3149 /* uaddr->sa_data comes from the userspace, it's not guaranteed to be
3150 * zero-terminated.
3151 */
3152 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3153 name[sizeof(uaddr->sa_data)] = 0;
1da177e4 3154
30f7ea1c 3155 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3156}
1da177e4
LT
3157
3158static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3159{
40d4e3df
ED
3160 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3161 struct sock *sk = sock->sk;
1da177e4
LT
3162
3163 /*
3164 * Check legality
3165 */
1ce4f28b 3166
1da177e4
LT
3167 if (addr_len < sizeof(struct sockaddr_ll))
3168 return -EINVAL;
3169 if (sll->sll_family != AF_PACKET)
3170 return -EINVAL;
3171
30f7ea1c
FR
3172 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3173 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3174}
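
Both bind variants funnel into packet_do_bind(), which re-registers the protocol hook on the chosen device. A hedged sketch of the common sockaddr_ll form (the interface name is an illustrative assumption):

/* Hedged sketch: bind an AF_PACKET socket to one interface, as handled
 * by packet_bind()/packet_do_bind() above.  "eth0" is illustrative.
 */
#include <string.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <net/if.h>

int bind_to_ifname(int fd, const char *ifname)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex  = if_nametoindex(ifname);	/* 0 would mean "any" */

	return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}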
3175
3176static struct proto packet_proto = {
3177 .name = "PACKET",
3178 .owner = THIS_MODULE,
3179 .obj_size = sizeof(struct packet_sock),
3180};
3181
3182/*
1ce4f28b 3183 * Create a packet of type SOCK_PACKET.
1da177e4
LT
3184 */
3185
3f378b68
EP
3186static int packet_create(struct net *net, struct socket *sock, int protocol,
3187 int kern)
1da177e4
LT
3188{
3189 struct sock *sk;
3190 struct packet_sock *po;
0e11c91e 3191 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3192 int err;
3193
df008c91 3194 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3195 return -EPERM;
be02097c
DM
3196 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3197 sock->type != SOCK_PACKET)
1da177e4
LT
3198 return -ESOCKTNOSUPPORT;
3199
3200 sock->state = SS_UNCONNECTED;
3201
3202 err = -ENOBUFS;
11aa9c28 3203 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3204 if (sk == NULL)
3205 goto out;
3206
3207 sock->ops = &packet_ops;
1da177e4
LT
3208 if (sock->type == SOCK_PACKET)
3209 sock->ops = &packet_ops_spkt;
be02097c 3210
1da177e4
LT
3211 sock_init_data(sock, sk);
3212
3213 po = pkt_sk(sk);
3214 sk->sk_family = PF_PACKET;
0e11c91e 3215 po->num = proto;
d346a3fa 3216 po->xmit = dev_queue_xmit;
66e56cd4 3217
b0138408
DB
3218 err = packet_alloc_pending(po);
3219 if (err)
3220 goto out2;
3221
66e56cd4 3222 packet_cached_dev_reset(po);
1da177e4
LT
3223
3224 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3225 sk_refcnt_debug_inc(sk);
1da177e4
LT
3226
3227 /*
3228 * Attach a protocol block
3229 */
3230
3231 spin_lock_init(&po->bind_lock);
905db440 3232 mutex_init(&po->pg_vec_lock);
0648ab70 3233 po->rollover = NULL;
1da177e4 3234 po->prot_hook.func = packet_rcv;
be02097c 3235
1da177e4
LT
3236 if (sock->type == SOCK_PACKET)
3237 po->prot_hook.func = packet_rcv_spkt;
be02097c 3238
1da177e4
LT
3239 po->prot_hook.af_packet_priv = sk;
3240
0e11c91e
AV
3241 if (proto) {
3242 po->prot_hook.type = proto;
a6361f0c 3243 __register_prot_hook(sk);
1da177e4
LT
3244 }
3245
0fa7fa98 3246 mutex_lock(&net->packet.sklist_lock);
a4dc6a49 3247 sk_add_node_tail_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3248 mutex_unlock(&net->packet.sklist_lock);
3249
3250 preempt_disable();
3680453c 3251 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3252 preempt_enable();
808f5114 3253
40d4e3df 3254 return 0;
b0138408
DB
3255out2:
3256 sk_free(sk);
1da177e4
LT
3257out:
3258 return err;
3259}
3260
3261/*
3262 * Pull a packet from our receive queue and hand it to the user.
3263 * If necessary we block.
3264 */
3265
1b784140
YX
3266static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3267 int flags)
1da177e4
LT
3268{
3269 struct sock *sk = sock->sk;
3270 struct sk_buff *skb;
3271 int copied, err;
bfd5f4a3 3272 int vnet_hdr_len = 0;
2472d761 3273 unsigned int origlen = 0;
1da177e4
LT
3274
3275 err = -EINVAL;
ed85b565 3276 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3277 goto out;
3278
3279#if 0
3280 /* What error should we return now? EUNATTACH? */
3281 if (pkt_sk(sk)->ifindex < 0)
3282 return -ENODEV;
3283#endif
3284
ed85b565 3285 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3286 err = sock_recv_errqueue(sk, msg, len,
3287 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3288 goto out;
3289 }
3290
1da177e4
LT
3291 /*
3292 * Call the generic datagram receiver. This handles all sorts
3293 * of horrible races and re-entrancy so we can forget about it
3294 * in the protocol layers.
3295 *
 3296 * Now it will return ENETDOWN if the device has just gone down,
3297 * but then it will block.
3298 */
3299
40d4e3df 3300 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3301
3302 /*
1ce4f28b 3303 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
3304 * handles the blocking we don't see and worry about blocking
3305 * retries.
3306 */
3307
8ae55f04 3308 if (skb == NULL)
1da177e4
LT
3309 goto out;
3310
2ccdbaa6
WB
3311 if (pkt_sk(sk)->pressure)
3312 packet_rcv_has_room(pkt_sk(sk), NULL);
3313
bfd5f4a3 3314 if (pkt_sk(sk)->has_vnet_hdr) {
16cc1400
WB
3315 err = packet_rcv_vnet(msg, skb, &len);
3316 if (err)
bfd5f4a3 3317 goto out_free;
16cc1400 3318 vnet_hdr_len = sizeof(struct virtio_net_hdr);
bfd5f4a3
SS
3319 }
3320
f3d33426
HFS
3321 /* You lose any data beyond the buffer you gave. If it worries
3322 * a user program they can ask the device for its MTU
3323 * anyway.
1da177e4 3324 */
1da177e4 3325 copied = skb->len;
40d4e3df
ED
3326 if (copied > len) {
3327 copied = len;
3328 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3329 }
3330
51f3d02b 3331 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3332 if (err)
3333 goto out_free;
3334
2472d761
EB
3335 if (sock->type != SOCK_PACKET) {
3336 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3337
3338 /* Original length was stored in sockaddr_ll fields */
3339 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3340 sll->sll_family = AF_PACKET;
3341 sll->sll_protocol = skb->protocol;
3342 }
3343
3b885787 3344 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3345
f3d33426
HFS
3346 if (msg->msg_name) {
3347 /* If the address length field is there to be filled
3348 * in, we fill it in now.
3349 */
3350 if (sock->type == SOCK_PACKET) {
342dfc30 3351 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3352 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3353 } else {
3354 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3355
f3d33426
HFS
3356 msg->msg_namelen = sll->sll_halen +
3357 offsetof(struct sockaddr_ll, sll_addr);
3358 }
ffbc6111
HX
3359 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3360 msg->msg_namelen);
f3d33426 3361 }
1da177e4 3362
8dc41944 3363 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3364 struct tpacket_auxdata aux;
3365
3366 aux.tp_status = TP_STATUS_USER;
3367 if (skb->ip_summed == CHECKSUM_PARTIAL)
3368 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3369 else if (skb->pkt_type != PACKET_OUTGOING &&
3370 (skb->ip_summed == CHECKSUM_COMPLETE ||
3371 skb_csum_unnecessary(skb)))
3372 aux.tp_status |= TP_STATUS_CSUM_VALID;
3373
2472d761 3374 aux.tp_len = origlen;
ffbc6111
HX
3375 aux.tp_snaplen = skb->len;
3376 aux.tp_mac = 0;
bbe735e4 3377 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3378 if (skb_vlan_tag_present(skb)) {
3379 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3380 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3381 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3382 } else {
3383 aux.tp_vlan_tci = 0;
a0cdfcf3 3384 aux.tp_vlan_tpid = 0;
a3bcc23e 3385 }
ffbc6111 3386 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3387 }
3388
1da177e4
LT
3389 /*
3390 * Free or return the buffer as appropriate. Again this
3391 * hides all the races and re-entrancy issues from us.
3392 */
bfd5f4a3 3393 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3394
3395out_free:
3396 skb_free_datagram(sk, skb);
3397out:
3398 return err;
3399}
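
Besides copying the frame, packet_recvmsg() fills in the link-layer address and, when PACKET_AUXDATA is enabled, attaches a tpacket_auxdata control message carrying the original length, VLAN tag and checksum status. A hedged sketch of reading that cmsg:

/* Hedged sketch: receive one frame and pick up the PACKET_AUXDATA cmsg
 * that packet_recvmsg() above attaches when the option is enabled.
 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/if_packet.h>

void recv_with_auxdata(int fd)
{
	unsigned char frame[2048];
	char control[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
	struct iovec iov = { .iov_base = frame, .iov_len = sizeof(frame) };
	struct sockaddr_ll from;
	struct msghdr msg = {
		.msg_name = &from,      .msg_namelen = sizeof(from),
		.msg_iov = &iov,        .msg_iovlen = 1,
		.msg_control = control, .msg_controllen = sizeof(control),
	};
	struct cmsghdr *cmsg;
	int one = 1;

	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
	if (recvmsg(fd, &msg, 0) < 0)
		return;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_PACKET &&
		    cmsg->cmsg_type == PACKET_AUXDATA) {
			struct tpacket_auxdata aux;

			memcpy(&aux, CMSG_DATA(cmsg), sizeof(aux));
			printf("orig len %u, snaplen %u\n",
			       aux.tp_len, aux.tp_snaplen);
		}
	}
}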
3400
1da177e4 3401static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3402 int peer)
1da177e4
LT
3403{
3404 struct net_device *dev;
3405 struct sock *sk = sock->sk;
3406
3407 if (peer)
3408 return -EOPNOTSUPP;
3409
3410 uaddr->sa_family = AF_PACKET;
2dc85bf3 3411 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3412 rcu_read_lock();
3413 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3414 if (dev)
2dc85bf3 3415 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3416 rcu_read_unlock();
1da177e4 3417
9b2c45d4 3418 return sizeof(*uaddr);
1da177e4 3419}
1da177e4
LT
3420
3421static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3422 int peer)
1da177e4
LT
3423{
3424 struct net_device *dev;
3425 struct sock *sk = sock->sk;
3426 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3427 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3428
3429 if (peer)
3430 return -EOPNOTSUPP;
3431
3432 sll->sll_family = AF_PACKET;
3433 sll->sll_ifindex = po->ifindex;
3434 sll->sll_protocol = po->num;
67286640 3435 sll->sll_pkttype = 0;
654d1f8a
ED
3436 rcu_read_lock();
3437 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3438 if (dev) {
3439 sll->sll_hatype = dev->type;
3440 sll->sll_halen = dev->addr_len;
3441 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3442 } else {
3443 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3444 sll->sll_halen = 0;
3445 }
654d1f8a 3446 rcu_read_unlock();
1da177e4 3447
9b2c45d4 3448 return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3449}
3450
2aeb0b88
WC
3451static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3452 int what)
1da177e4
LT
3453{
3454 switch (i->type) {
3455 case PACKET_MR_MULTICAST:
1162563f
JP
3456 if (i->alen != dev->addr_len)
3457 return -EINVAL;
1da177e4 3458 if (what > 0)
22bedad3 3459 return dev_mc_add(dev, i->addr);
1da177e4 3460 else
22bedad3 3461 return dev_mc_del(dev, i->addr);
1da177e4
LT
3462 break;
3463 case PACKET_MR_PROMISC:
2aeb0b88 3464 return dev_set_promiscuity(dev, what);
1da177e4 3465 case PACKET_MR_ALLMULTI:
2aeb0b88 3466 return dev_set_allmulti(dev, what);
d95ed927 3467 case PACKET_MR_UNICAST:
1162563f
JP
3468 if (i->alen != dev->addr_len)
3469 return -EINVAL;
d95ed927 3470 if (what > 0)
a748ee24 3471 return dev_uc_add(dev, i->addr);
d95ed927 3472 else
a748ee24 3473 return dev_uc_del(dev, i->addr);
d95ed927 3474 break;
40d4e3df
ED
3475 default:
3476 break;
1da177e4 3477 }
2aeb0b88 3478 return 0;
1da177e4
LT
3479}
3480
82f17091
FR
3481static void packet_dev_mclist_delete(struct net_device *dev,
3482 struct packet_mclist **mlp)
1da177e4 3483{
82f17091
FR
3484 struct packet_mclist *ml;
3485
3486 while ((ml = *mlp) != NULL) {
3487 if (ml->ifindex == dev->ifindex) {
3488 packet_dev_mc(dev, ml, -1);
3489 *mlp = ml->next;
3490 kfree(ml);
3491 } else
3492 mlp = &ml->next;
1da177e4
LT
3493 }
3494}
3495
0fb375fb 3496static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3497{
3498 struct packet_sock *po = pkt_sk(sk);
3499 struct packet_mclist *ml, *i;
3500 struct net_device *dev;
3501 int err;
3502
3503 rtnl_lock();
3504
3505 err = -ENODEV;
3b1e0a65 3506 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3507 if (!dev)
3508 goto done;
3509
3510 err = -EINVAL;
1162563f 3511 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3512 goto done;
3513
3514 err = -ENOBUFS;
8b3a7005 3515 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3516 if (i == NULL)
3517 goto done;
3518
3519 err = 0;
3520 for (ml = po->mclist; ml; ml = ml->next) {
3521 if (ml->ifindex == mreq->mr_ifindex &&
3522 ml->type == mreq->mr_type &&
3523 ml->alen == mreq->mr_alen &&
3524 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3525 ml->count++;
3526 /* Free the new element ... */
3527 kfree(i);
3528 goto done;
3529 }
3530 }
3531
3532 i->type = mreq->mr_type;
3533 i->ifindex = mreq->mr_ifindex;
3534 i->alen = mreq->mr_alen;
3535 memcpy(i->addr, mreq->mr_address, i->alen);
309cf37f 3536 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3537 i->count = 1;
3538 i->next = po->mclist;
3539 po->mclist = i;
2aeb0b88
WC
3540 err = packet_dev_mc(dev, i, 1);
3541 if (err) {
3542 po->mclist = i->next;
3543 kfree(i);
3544 }
1da177e4
LT
3545
3546done:
3547 rtnl_unlock();
3548 return err;
3549}
3550
0fb375fb 3551static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3552{
3553 struct packet_mclist *ml, **mlp;
3554
3555 rtnl_lock();
3556
3557 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3558 if (ml->ifindex == mreq->mr_ifindex &&
3559 ml->type == mreq->mr_type &&
3560 ml->alen == mreq->mr_alen &&
3561 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3562 if (--ml->count == 0) {
3563 struct net_device *dev;
3564 *mlp = ml->next;
ad959e76
ED
3565 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3566 if (dev)
1da177e4 3567 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3568 kfree(ml);
3569 }
82f17091 3570 break;
1da177e4
LT
3571 }
3572 }
3573 rtnl_unlock();
82f17091 3574 return 0;
1da177e4
LT
3575}
3576
3577static void packet_flush_mclist(struct sock *sk)
3578{
3579 struct packet_sock *po = pkt_sk(sk);
3580 struct packet_mclist *ml;
3581
3582 if (!po->mclist)
3583 return;
3584
3585 rtnl_lock();
3586 while ((ml = po->mclist) != NULL) {
3587 struct net_device *dev;
3588
3589 po->mclist = ml->next;
ad959e76
ED
3590 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3591 if (dev != NULL)
1da177e4 3592 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3593 kfree(ml);
3594 }
3595 rtnl_unlock();
3596}
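
These membership helpers back the PACKET_ADD_MEMBERSHIP / PACKET_DROP_MEMBERSHIP cases in packet_setsockopt() below: PACKET_MR_PROMISC and PACKET_MR_ALLMULTI adjust interface flags, while PACKET_MR_MULTICAST and PACKET_MR_UNICAST add link-layer addresses. A hedged sketch of enabling promiscuous mode through this interface (interface name illustrative):

/* Hedged sketch: enable promiscuous mode on one interface through the
 * PACKET_ADD_MEMBERSHIP path handled by packet_mc_add() above.
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <net/if.h>

int enable_promisc(int fd, const char *ifname)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = if_nametoindex(ifname);
	mreq.mr_type    = PACKET_MR_PROMISC;

	return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
			  &mreq, sizeof(mreq));
}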
1da177e4
LT
3597
3598static int
b7058842 3599packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3600{
3601 struct sock *sk = sock->sk;
8dc41944 3602 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3603 int ret;
3604
3605 if (level != SOL_PACKET)
3606 return -ENOPROTOOPT;
3607
69e3c75f 3608 switch (optname) {
1ce4f28b 3609 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3610 case PACKET_DROP_MEMBERSHIP:
3611 {
0fb375fb
EB
3612 struct packet_mreq_max mreq;
3613 int len = optlen;
3614 memset(&mreq, 0, sizeof(mreq));
3615 if (len < sizeof(struct packet_mreq))
1da177e4 3616 return -EINVAL;
0fb375fb
EB
3617 if (len > sizeof(mreq))
3618 len = sizeof(mreq);
40d4e3df 3619 if (copy_from_user(&mreq, optval, len))
1da177e4 3620 return -EFAULT;
0fb375fb
EB
3621 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3622 return -EINVAL;
1da177e4
LT
3623 if (optname == PACKET_ADD_MEMBERSHIP)
3624 ret = packet_mc_add(sk, &mreq);
3625 else
3626 ret = packet_mc_drop(sk, &mreq);
3627 return ret;
3628 }
a2efcfa0 3629
1da177e4 3630 case PACKET_RX_RING:
69e3c75f 3631 case PACKET_TX_RING:
1da177e4 3632 {
f6fb8f10 3633 union tpacket_req_u req_u;
3634 int len;
1da177e4 3635
5171b37d 3636 lock_sock(sk);
f6fb8f10 3637 switch (po->tp_version) {
3638 case TPACKET_V1:
3639 case TPACKET_V2:
3640 len = sizeof(req_u.req);
3641 break;
3642 case TPACKET_V3:
3643 default:
3644 len = sizeof(req_u.req3);
3645 break;
3646 }
5171b37d
ED
3647 if (optlen < len) {
3648 ret = -EINVAL;
3649 } else {
3650 if (copy_from_user(&req_u.req, optval, len))
3651 ret = -EFAULT;
3652 else
3653 ret = packet_set_ring(sk, &req_u, 0,
3654 optname == PACKET_TX_RING);
3655 }
3656 release_sock(sk);
3657 return ret;
1da177e4
LT
3658 }
3659 case PACKET_COPY_THRESH:
3660 {
3661 int val;
3662
40d4e3df 3663 if (optlen != sizeof(val))
1da177e4 3664 return -EINVAL;
40d4e3df 3665 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3666 return -EFAULT;
3667
3668 pkt_sk(sk)->copy_thresh = val;
3669 return 0;
3670 }
bbd6ef87
PM
3671 case PACKET_VERSION:
3672 {
3673 int val;
3674
3675 if (optlen != sizeof(val))
3676 return -EINVAL;
bbd6ef87
PM
3677 if (copy_from_user(&val, optval, sizeof(val)))
3678 return -EFAULT;
3679 switch (val) {
3680 case TPACKET_V1:
3681 case TPACKET_V2:
f6fb8f10 3682 case TPACKET_V3:
84ac7260 3683 break;
bbd6ef87
PM
3684 default:
3685 return -EINVAL;
3686 }
84ac7260
PP
3687 lock_sock(sk);
3688 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3689 ret = -EBUSY;
3690 } else {
3691 po->tp_version = val;
3692 ret = 0;
3693 }
3694 release_sock(sk);
3695 return ret;
bbd6ef87 3696 }
8913336a
PM
3697 case PACKET_RESERVE:
3698 {
3699 unsigned int val;
3700
3701 if (optlen != sizeof(val))
3702 return -EINVAL;
8913336a
PM
3703 if (copy_from_user(&val, optval, sizeof(val)))
3704 return -EFAULT;
bcc5364b
AK
3705 if (val > INT_MAX)
3706 return -EINVAL;
c27927e3
WB
3707 lock_sock(sk);
3708 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3709 ret = -EBUSY;
3710 } else {
3711 po->tp_reserve = val;
3712 ret = 0;
3713 }
3714 release_sock(sk);
3715 return ret;
8913336a 3716 }
69e3c75f
JB
3717 case PACKET_LOSS:
3718 {
3719 unsigned int val;
3720
3721 if (optlen != sizeof(val))
3722 return -EINVAL;
69e3c75f
JB
3723 if (copy_from_user(&val, optval, sizeof(val)))
3724 return -EFAULT;
a6361f0c
WB
3725
3726 lock_sock(sk);
3727 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3728 ret = -EBUSY;
3729 } else {
3730 po->tp_loss = !!val;
3731 ret = 0;
3732 }
3733 release_sock(sk);
3734 return ret;
69e3c75f 3735 }
8dc41944
HX
3736 case PACKET_AUXDATA:
3737 {
3738 int val;
3739
3740 if (optlen < sizeof(val))
3741 return -EINVAL;
3742 if (copy_from_user(&val, optval, sizeof(val)))
3743 return -EFAULT;
3744
a6361f0c 3745 lock_sock(sk);
8dc41944 3746 po->auxdata = !!val;
a6361f0c 3747 release_sock(sk);
8dc41944
HX
3748 return 0;
3749 }
80feaacb
PWJ
3750 case PACKET_ORIGDEV:
3751 {
3752 int val;
3753
3754 if (optlen < sizeof(val))
3755 return -EINVAL;
3756 if (copy_from_user(&val, optval, sizeof(val)))
3757 return -EFAULT;
3758
a6361f0c 3759 lock_sock(sk);
80feaacb 3760 po->origdev = !!val;
a6361f0c 3761 release_sock(sk);
80feaacb
PWJ
3762 return 0;
3763 }
bfd5f4a3
SS
3764 case PACKET_VNET_HDR:
3765 {
3766 int val;
3767
3768 if (sock->type != SOCK_RAW)
3769 return -EINVAL;
bfd5f4a3
SS
3770 if (optlen < sizeof(val))
3771 return -EINVAL;
3772 if (copy_from_user(&val, optval, sizeof(val)))
3773 return -EFAULT;
3774
a6361f0c
WB
3775 lock_sock(sk);
3776 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3777 ret = -EBUSY;
3778 } else {
3779 po->has_vnet_hdr = !!val;
3780 ret = 0;
3781 }
3782 release_sock(sk);
3783 return ret;
bfd5f4a3 3784 }
614f60fa
SM
3785 case PACKET_TIMESTAMP:
3786 {
3787 int val;
3788
3789 if (optlen != sizeof(val))
3790 return -EINVAL;
3791 if (copy_from_user(&val, optval, sizeof(val)))
3792 return -EFAULT;
3793
3794 po->tp_tstamp = val;
3795 return 0;
3796 }
dc99f600
DM
3797 case PACKET_FANOUT:
3798 {
3799 int val;
3800
3801 if (optlen != sizeof(val))
3802 return -EINVAL;
3803 if (copy_from_user(&val, optval, sizeof(val)))
3804 return -EFAULT;
3805
3806 return fanout_add(sk, val & 0xffff, val >> 16);
3807 }
47dceb8e
WB
3808 case PACKET_FANOUT_DATA:
3809 {
3810 if (!po->fanout)
3811 return -EINVAL;
3812
3813 return fanout_set_data(po, optval, optlen);
3814 }
fa788d98
VW
3815 case PACKET_IGNORE_OUTGOING:
3816 {
3817 int val;
3818
3819 if (optlen != sizeof(val))
3820 return -EINVAL;
3821 if (copy_from_user(&val, optval, sizeof(val)))
3822 return -EFAULT;
3823 if (val < 0 || val > 1)
3824 return -EINVAL;
3825
3826 po->prot_hook.ignore_outgoing = !!val;
3827 return 0;
3828 }
5920cd3a
PC
3829 case PACKET_TX_HAS_OFF:
3830 {
3831 unsigned int val;
3832
3833 if (optlen != sizeof(val))
3834 return -EINVAL;
5920cd3a
PC
3835 if (copy_from_user(&val, optval, sizeof(val)))
3836 return -EFAULT;
a6361f0c
WB
3837
3838 lock_sock(sk);
3839 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3840 ret = -EBUSY;
3841 } else {
3842 po->tp_tx_has_off = !!val;
3843 ret = 0;
3844 }
3845 release_sock(sk);
5920cd3a
PC
3846 return 0;
3847 }
d346a3fa
DB
3848 case PACKET_QDISC_BYPASS:
3849 {
3850 int val;
3851
3852 if (optlen != sizeof(val))
3853 return -EINVAL;
3854 if (copy_from_user(&val, optval, sizeof(val)))
3855 return -EFAULT;
3856
3857 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3858 return 0;
3859 }
1da177e4
LT
3860 default:
3861 return -ENOPROTOOPT;
3862 }
3863}
3864
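/*
 * Editor's note: a minimal user-space sketch (not part of af_packet.c) of
 * exercising the PACKET_VERSION and PACKET_RX_RING branches handled by
 * packet_setsockopt() above.  The sizing constants are assumptions chosen to
 * satisfy the checks in packet_set_ring(): tp_block_size page-aligned,
 * tp_frame_size a multiple of TPACKET_ALIGNMENT, and
 * tp_frame_nr == tp_block_nr * (tp_block_size / tp_frame_size).
 */
#if 0	/* user-space example only */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int setup_rx_ring_v2(int fd)
{
	int version = TPACKET_V2;
	struct tpacket_req req;

	/* Must be set before the ring exists; see the -EBUSY check above. */
	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
		       &version, sizeof(version)) < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.tp_block_size = 4096;	/* assumed to equal PAGE_SIZE */
	req.tp_block_nr   = 64;
	req.tp_frame_size = 2048;
	req.tp_frame_nr   = req.tp_block_nr *
			    (req.tp_block_size / req.tp_frame_size);

	/* Lands in packet_set_ring() via the PACKET_RX_RING case above. */
	return setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
}
#endif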
3865static int packet_getsockopt(struct socket *sock, int level, int optname,
3866 char __user *optval, int __user *optlen)
3867{
3868 int len;
c06fff6e 3869 int val, lv = sizeof(val);
1da177e4
LT
3870 struct sock *sk = sock->sk;
3871 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3872 void *data = &val;
ee80fbf3 3873 union tpacket_stats_u st;
a9b63918 3874 struct tpacket_rollover_stats rstats;
1da177e4
LT
3875
3876 if (level != SOL_PACKET)
3877 return -ENOPROTOOPT;
3878
8ae55f04
KK
3879 if (get_user(len, optlen))
3880 return -EFAULT;
1da177e4
LT
3881
3882 if (len < 0)
3883 return -EINVAL;
1ce4f28b 3884
69e3c75f 3885 switch (optname) {
1da177e4 3886 case PACKET_STATISTICS:
1da177e4 3887 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3888 memcpy(&st, &po->stats, sizeof(st));
3889 memset(&po->stats, 0, sizeof(po->stats));
3890 spin_unlock_bh(&sk->sk_receive_queue.lock);
3891
f6fb8f10 3892 if (po->tp_version == TPACKET_V3) {
c06fff6e 3893 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3894 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3895 data = &st.stats3;
f6fb8f10 3896 } else {
c06fff6e 3897 lv = sizeof(struct tpacket_stats);
8bcdeaff 3898 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3899 data = &st.stats1;
f6fb8f10 3900 }
ee80fbf3 3901
8dc41944
HX
3902 break;
3903 case PACKET_AUXDATA:
8dc41944 3904 val = po->auxdata;
80feaacb
PWJ
3905 break;
3906 case PACKET_ORIGDEV:
80feaacb 3907 val = po->origdev;
bfd5f4a3
SS
3908 break;
3909 case PACKET_VNET_HDR:
bfd5f4a3 3910 val = po->has_vnet_hdr;
1da177e4 3911 break;
bbd6ef87 3912 case PACKET_VERSION:
bbd6ef87 3913 val = po->tp_version;
bbd6ef87
PM
3914 break;
3915 case PACKET_HDRLEN:
3916 if (len > sizeof(int))
3917 len = sizeof(int);
fd2c83b3
AP
3918 if (len < sizeof(int))
3919 return -EINVAL;
bbd6ef87
PM
3920 if (copy_from_user(&val, optval, len))
3921 return -EFAULT;
3922 switch (val) {
3923 case TPACKET_V1:
3924 val = sizeof(struct tpacket_hdr);
3925 break;
3926 case TPACKET_V2:
3927 val = sizeof(struct tpacket2_hdr);
3928 break;
f6fb8f10 3929 case TPACKET_V3:
3930 val = sizeof(struct tpacket3_hdr);
3931 break;
bbd6ef87
PM
3932 default:
3933 return -EINVAL;
3934 }
bbd6ef87 3935 break;
8913336a 3936 case PACKET_RESERVE:
8913336a 3937 val = po->tp_reserve;
8913336a 3938 break;
69e3c75f 3939 case PACKET_LOSS:
69e3c75f 3940 val = po->tp_loss;
69e3c75f 3941 break;
614f60fa 3942 case PACKET_TIMESTAMP:
614f60fa 3943 val = po->tp_tstamp;
614f60fa 3944 break;
dc99f600 3945 case PACKET_FANOUT:
dc99f600
DM
3946 val = (po->fanout ?
3947 ((u32)po->fanout->id |
77f65ebd
WB
3948 ((u32)po->fanout->type << 16) |
3949 ((u32)po->fanout->flags << 24)) :
dc99f600 3950 0);
dc99f600 3951 break;
fa788d98
VW
3952 case PACKET_IGNORE_OUTGOING:
3953 val = po->prot_hook.ignore_outgoing;
3954 break;
a9b63918 3955 case PACKET_ROLLOVER_STATS:
57f015f5 3956 if (!po->rollover)
a9b63918 3957 return -EINVAL;
57f015f5
MM
3958 rstats.tp_all = atomic_long_read(&po->rollover->num);
3959 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3960 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3961 data = &rstats;
3962 lv = sizeof(rstats);
a9b63918 3963 break;
5920cd3a
PC
3964 case PACKET_TX_HAS_OFF:
3965 val = po->tp_tx_has_off;
3966 break;
d346a3fa
DB
3967 case PACKET_QDISC_BYPASS:
3968 val = packet_use_direct_xmit(po);
3969 break;
1da177e4
LT
3970 default:
3971 return -ENOPROTOOPT;
3972 }
3973
c06fff6e
ED
3974 if (len > lv)
3975 len = lv;
8ae55f04
KK
3976 if (put_user(len, optlen))
3977 return -EFAULT;
8dc41944
HX
3978 if (copy_to_user(optval, data, len))
3979 return -EFAULT;
8ae55f04 3980 return 0;
1da177e4
LT
3981}
3982
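/*
 * Editor's note: illustrative user-space sketch (not part of af_packet.c) of
 * the PACKET_STATISTICS branch of packet_getsockopt() above, using the
 * TPACKET_V1/V2 layout.  As the code above shows, reading the statistics
 * also zeroes the kernel counters, so each call returns the deltas since the
 * previous call, and tp_packets already includes tp_drops.
 */
#if 0	/* user-space example only */
#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void dump_packet_stats(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
		printf("packets=%u drops=%u\n", st.tp_packets, st.tp_drops);
}
#endif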
3983
719c44d3
WB
3984#ifdef CONFIG_COMPAT
3985static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
3986 char __user *optval, unsigned int optlen)
3987{
3988 struct packet_sock *po = pkt_sk(sock->sk);
3989
3990 if (level != SOL_PACKET)
3991 return -ENOPROTOOPT;
3992
3993 if (optname == PACKET_FANOUT_DATA &&
3994 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
3995 optval = (char __user *)get_compat_bpf_fprog(optval);
3996 if (!optval)
3997 return -EFAULT;
3998 optlen = sizeof(struct sock_fprog);
3999 }
4000
4001 return packet_setsockopt(sock, level, optname, optval, optlen);
4002}
4003#endif
4004
351638e7
JP
4005static int packet_notifier(struct notifier_block *this,
4006 unsigned long msg, void *ptr)
1da177e4
LT
4007{
4008 struct sock *sk;
351638e7 4009 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 4010 struct net *net = dev_net(dev);
1da177e4 4011
808f5114 4012 rcu_read_lock();
b67bfe0d 4013 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
4014 struct packet_sock *po = pkt_sk(sk);
4015
4016 switch (msg) {
4017 case NETDEV_UNREGISTER:
1da177e4 4018 if (po->mclist)
82f17091 4019 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
4020 /* fallthrough */
4021
1da177e4
LT
4022 case NETDEV_DOWN:
4023 if (dev->ifindex == po->ifindex) {
4024 spin_lock(&po->bind_lock);
4025 if (po->running) {
ce06b03e 4026 __unregister_prot_hook(sk, false);
1da177e4
LT
4027 sk->sk_err = ENETDOWN;
4028 if (!sock_flag(sk, SOCK_DEAD))
4029 sk->sk_error_report(sk);
4030 }
4031 if (msg == NETDEV_UNREGISTER) {
66e56cd4 4032 packet_cached_dev_reset(po);
1da177e4 4033 po->ifindex = -1;
160ff18a
BG
4034 if (po->prot_hook.dev)
4035 dev_put(po->prot_hook.dev);
1da177e4
LT
4036 po->prot_hook.dev = NULL;
4037 }
4038 spin_unlock(&po->bind_lock);
4039 }
4040 break;
4041 case NETDEV_UP:
808f5114 4042 if (dev->ifindex == po->ifindex) {
4043 spin_lock(&po->bind_lock);
ce06b03e
DM
4044 if (po->num)
4045 register_prot_hook(sk);
808f5114 4046 spin_unlock(&po->bind_lock);
1da177e4 4047 }
1da177e4
LT
4048 break;
4049 }
4050 }
808f5114 4051 rcu_read_unlock();
1da177e4
LT
4052 return NOTIFY_DONE;
4053}
4054
4055
4056static int packet_ioctl(struct socket *sock, unsigned int cmd,
4057 unsigned long arg)
4058{
4059 struct sock *sk = sock->sk;
4060
69e3c75f 4061 switch (cmd) {
40d4e3df
ED
4062 case SIOCOUTQ:
4063 {
4064 int amount = sk_wmem_alloc_get(sk);
31e6d363 4065
40d4e3df
ED
4066 return put_user(amount, (int __user *)arg);
4067 }
4068 case SIOCINQ:
4069 {
4070 struct sk_buff *skb;
4071 int amount = 0;
4072
4073 spin_lock_bh(&sk->sk_receive_queue.lock);
4074 skb = skb_peek(&sk->sk_receive_queue);
4075 if (skb)
4076 amount = skb->len;
4077 spin_unlock_bh(&sk->sk_receive_queue.lock);
4078 return put_user(amount, (int __user *)arg);
4079 }
4080 case SIOCGSTAMP:
4081 return sock_get_timestamp(sk, (struct timeval __user *)arg);
4082 case SIOCGSTAMPNS:
4083 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 4084
1da177e4 4085#ifdef CONFIG_INET
40d4e3df
ED
4086 case SIOCADDRT:
4087 case SIOCDELRT:
4088 case SIOCDARP:
4089 case SIOCGARP:
4090 case SIOCSARP:
4091 case SIOCGIFADDR:
4092 case SIOCSIFADDR:
4093 case SIOCGIFBRDADDR:
4094 case SIOCSIFBRDADDR:
4095 case SIOCGIFNETMASK:
4096 case SIOCSIFNETMASK:
4097 case SIOCGIFDSTADDR:
4098 case SIOCSIFDSTADDR:
4099 case SIOCSIFFLAGS:
40d4e3df 4100 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
4101#endif
4102
40d4e3df
ED
4103 default:
4104 return -ENOIOCTLCMD;
1da177e4
LT
4105 }
4106 return 0;
4107}
4108
a11e1d43
LT
4109static __poll_t packet_poll(struct file *file, struct socket *sock,
4110 poll_table *wait)
1da177e4
LT
4111{
4112 struct sock *sk = sock->sk;
4113 struct packet_sock *po = pkt_sk(sk);
a11e1d43 4114 __poll_t mask = datagram_poll(file, sock, wait);
1da177e4
LT
4115
4116 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 4117 if (po->rx_ring.pg_vec) {
f6fb8f10 4118 if (!packet_previous_rx_frame(po, &po->rx_ring,
4119 TP_STATUS_KERNEL))
a9a08845 4120 mask |= EPOLLIN | EPOLLRDNORM;
1da177e4 4121 }
2ccdbaa6 4122 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
54d7c01d 4123 po->pressure = 0;
1da177e4 4124 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
4125 spin_lock_bh(&sk->sk_write_queue.lock);
4126 if (po->tx_ring.pg_vec) {
4127 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
a9a08845 4128 mask |= EPOLLOUT | EPOLLWRNORM;
69e3c75f
JB
4129 }
4130 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
4131 return mask;
4132}
4133
4134
4135/* Dirty? Well, I still have not found a better way to account
4136 * for user mmaps.
4137 */
4138
4139static void packet_mm_open(struct vm_area_struct *vma)
4140{
4141 struct file *file = vma->vm_file;
40d4e3df 4142 struct socket *sock = file->private_data;
1da177e4 4143 struct sock *sk = sock->sk;
1ce4f28b 4144
1da177e4
LT
4145 if (sk)
4146 atomic_inc(&pkt_sk(sk)->mapped);
4147}
4148
4149static void packet_mm_close(struct vm_area_struct *vma)
4150{
4151 struct file *file = vma->vm_file;
40d4e3df 4152 struct socket *sock = file->private_data;
1da177e4 4153 struct sock *sk = sock->sk;
1ce4f28b 4154
1da177e4
LT
4155 if (sk)
4156 atomic_dec(&pkt_sk(sk)->mapped);
4157}
4158
f0f37e2f 4159static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
4160 .open = packet_mm_open,
4161 .close = packet_mm_close,
1da177e4
LT
4162};
4163
3a7ad063
ED
4164static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4165 unsigned int len)
1da177e4
LT
4166{
4167 int i;
4168
4ebf0ae2 4169 for (i = 0; i < len; i++) {
0e3125c7 4170 if (likely(pg_vec[i].buffer)) {
3a7ad063
ED
4171 if (is_vmalloc_addr(pg_vec[i].buffer))
4172 vfree(pg_vec[i].buffer);
4173 else
4174 free_pages((unsigned long)pg_vec[i].buffer,
4175 order);
0e3125c7
NH
4176 pg_vec[i].buffer = NULL;
4177 }
1da177e4
LT
4178 }
4179 kfree(pg_vec);
4180}
4181
3a7ad063 4182static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 4183{
f0d4eb29 4184 char *buffer;
3a7ad063
ED
4185 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4186 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
0e3125c7 4187
3a7ad063 4188 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4189 if (buffer)
4190 return buffer;
4191
3a7ad063
ED
4192 /* __get_free_pages failed, fall back to vmalloc */
4193 buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
4194 if (buffer)
4195 return buffer;
0e3125c7 4196
3a7ad063
ED
4197 	/* vmalloc failed, let's dig into swap here */
4198 gfp_flags &= ~__GFP_NORETRY;
4199 buffer = (char *) __get_free_pages(gfp_flags, order);
4200 if (buffer)
4201 return buffer;
4202
4203 /* complete and utter failure */
4204 return NULL;
4ebf0ae2
DM
4205}
4206
3a7ad063 4207static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
4208{
4209 unsigned int block_nr = req->tp_block_nr;
0e3125c7 4210 struct pgv *pg_vec;
4ebf0ae2
DM
4211 int i;
4212
0e3125c7 4213 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
4214 if (unlikely(!pg_vec))
4215 goto out;
4216
4217 for (i = 0; i < block_nr; i++) {
3a7ad063 4218 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 4219 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
4220 goto out_free_pgvec;
4221 }
4222
4223out:
4224 return pg_vec;
4225
4226out_free_pgvec:
3a7ad063 4227 free_pg_vec(pg_vec, order, block_nr);
4ebf0ae2
DM
4228 pg_vec = NULL;
4229 goto out;
4230}
1da177e4 4231
f6fb8f10 4232static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 4233 int closing, int tx_ring)
1da177e4 4234{
0e3125c7 4235 struct pgv *pg_vec = NULL;
1da177e4 4236 struct packet_sock *po = pkt_sk(sk);
3a7ad063 4237 int was_running, order = 0;
69e3c75f
JB
4238 struct packet_ring_buffer *rb;
4239 struct sk_buff_head *rb_queue;
0e11c91e 4240 __be16 num;
f6fb8f10 4241 int err = -EINVAL;
4242 	/* Added to keep code churn minimal */
4243 struct tpacket_req *req = &req_u->req;
4244
69e3c75f
JB
4245 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4246 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 4247
69e3c75f
JB
4248 err = -EBUSY;
4249 if (!closing) {
4250 if (atomic_read(&po->mapped))
4251 goto out;
b0138408 4252 if (packet_read_pending(rb))
69e3c75f
JB
4253 goto out;
4254 }
1da177e4 4255
69e3c75f 4256 if (req->tp_block_nr) {
4576cd46
WB
4257 unsigned int min_frame_size;
4258
69e3c75f
JB
4259 /* Sanity tests and some calculations */
4260 err = -EBUSY;
4261 if (unlikely(rb->pg_vec))
4262 goto out;
1da177e4 4263
bbd6ef87
PM
4264 switch (po->tp_version) {
4265 case TPACKET_V1:
4266 po->tp_hdrlen = TPACKET_HDRLEN;
4267 break;
4268 case TPACKET_V2:
4269 po->tp_hdrlen = TPACKET2_HDRLEN;
4270 break;
f6fb8f10 4271 case TPACKET_V3:
4272 po->tp_hdrlen = TPACKET3_HDRLEN;
4273 break;
bbd6ef87
PM
4274 }
4275
69e3c75f 4276 err = -EINVAL;
4ebf0ae2 4277 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 4278 goto out;
90836b67 4279 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
69e3c75f 4280 goto out;
4576cd46 4281 min_frame_size = po->tp_hdrlen + po->tp_reserve;
dc808110 4282 if (po->tp_version >= TPACKET_V3 &&
4576cd46
WB
4283 req->tp_block_size <
4284 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
dc808110 4285 goto out;
4576cd46 4286 if (unlikely(req->tp_frame_size < min_frame_size))
69e3c75f 4287 goto out;
4ebf0ae2 4288 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 4289 goto out;
1da177e4 4290
4194b491
TK
4291 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4292 if (unlikely(rb->frames_per_block == 0))
69e3c75f 4293 goto out;
fc62814d 4294 if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
8f8d28e4 4295 goto out;
69e3c75f
JB
4296 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4297 req->tp_frame_nr))
4298 goto out;
1da177e4
LT
4299
4300 err = -ENOMEM;
3a7ad063
ED
4301 order = get_order(req->tp_block_size);
4302 pg_vec = alloc_pg_vec(req, order);
4ebf0ae2 4303 if (unlikely(!pg_vec))
1da177e4 4304 goto out;
f6fb8f10 4305 switch (po->tp_version) {
4306 case TPACKET_V3:
7f953ab2
SV
4307 /* Block transmit is not supported yet */
4308 if (!tx_ring) {
e8e85cc5 4309 init_prb_bdqc(po, rb, pg_vec, req_u);
7f953ab2
SV
4310 } else {
4311 struct tpacket_req3 *req3 = &req_u->req3;
4312
4313 if (req3->tp_retire_blk_tov ||
4314 req3->tp_sizeof_priv ||
4315 req3->tp_feature_req_word) {
4316 err = -EINVAL;
4317 goto out;
4318 }
4319 }
d7cf0c34 4320 break;
f6fb8f10 4321 default:
4322 break;
4323 }
69e3c75f
JB
4324 }
4325 /* Done */
4326 else {
4327 err = -EINVAL;
4ebf0ae2 4328 if (unlikely(req->tp_frame_nr))
69e3c75f 4329 goto out;
1da177e4
LT
4330 }
4331
1da177e4
LT
4332
4333 /* Detach socket from network */
4334 spin_lock(&po->bind_lock);
4335 was_running = po->running;
4336 num = po->num;
4337 if (was_running) {
1da177e4 4338 po->num = 0;
ce06b03e 4339 __unregister_prot_hook(sk, false);
1da177e4
LT
4340 }
4341 spin_unlock(&po->bind_lock);
1ce4f28b 4342
1da177e4
LT
4343 synchronize_net();
4344
4345 err = -EBUSY;
905db440 4346 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
4347 if (closing || atomic_read(&po->mapped) == 0) {
4348 err = 0;
69e3c75f 4349 spin_lock_bh(&rb_queue->lock);
c053fd96 4350 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
4351 rb->frame_max = (req->tp_frame_nr - 1);
4352 rb->head = 0;
4353 rb->frame_size = req->tp_frame_size;
4354 spin_unlock_bh(&rb_queue->lock);
4355
3a7ad063 4356 swap(rb->pg_vec_order, order);
c053fd96 4357 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
4358
4359 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4360 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4361 tpacket_rcv : packet_rcv;
4362 skb_queue_purge(rb_queue);
1da177e4 4363 if (atomic_read(&po->mapped))
40d4e3df
ED
4364 pr_err("packet_mmap: vma is busy: %d\n",
4365 atomic_read(&po->mapped));
1da177e4 4366 }
905db440 4367 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4368
4369 spin_lock(&po->bind_lock);
ce06b03e 4370 if (was_running) {
1da177e4 4371 po->num = num;
ce06b03e 4372 register_prot_hook(sk);
1da177e4
LT
4373 }
4374 spin_unlock(&po->bind_lock);
c800aaf8 4375 if (pg_vec && (po->tp_version > TPACKET_V2)) {
f6fb8f10 4376 /* Because we don't support block-based V3 on tx-ring */
4377 if (!tx_ring)
73d0fcf2 4378 prb_shutdown_retire_blk_timer(po, rb_queue);
f6fb8f10 4379 }
1da177e4 4380
1da177e4 4381 if (pg_vec)
3a7ad063 4382 free_pg_vec(pg_vec, order, req->tp_block_nr);
1da177e4
LT
4383out:
4384 return err;
4385}
4386
69e3c75f
JB
4387static int packet_mmap(struct file *file, struct socket *sock,
4388 struct vm_area_struct *vma)
1da177e4
LT
4389{
4390 struct sock *sk = sock->sk;
4391 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
4392 unsigned long size, expected_size;
4393 struct packet_ring_buffer *rb;
1da177e4
LT
4394 unsigned long start;
4395 int err = -EINVAL;
4396 int i;
4397
4398 if (vma->vm_pgoff)
4399 return -EINVAL;
4400
905db440 4401 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
4402
4403 expected_size = 0;
4404 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4405 if (rb->pg_vec) {
4406 expected_size += rb->pg_vec_len
4407 * rb->pg_vec_pages
4408 * PAGE_SIZE;
4409 }
4410 }
4411
4412 if (expected_size == 0)
1da177e4 4413 goto out;
69e3c75f
JB
4414
4415 size = vma->vm_end - vma->vm_start;
4416 if (size != expected_size)
1da177e4
LT
4417 goto out;
4418
1da177e4 4419 start = vma->vm_start;
69e3c75f
JB
4420 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4421 if (rb->pg_vec == NULL)
4422 continue;
4423
4424 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
4425 struct page *page;
4426 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4427 int pg_num;
4428
c56b4d90
CG
4429 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4430 page = pgv_to_page(kaddr);
69e3c75f
JB
4431 err = vm_insert_page(vma, start, page);
4432 if (unlikely(err))
4433 goto out;
4434 start += PAGE_SIZE;
0e3125c7 4435 kaddr += PAGE_SIZE;
69e3c75f 4436 }
4ebf0ae2 4437 }
1da177e4 4438 }
69e3c75f 4439
4ebf0ae2 4440 atomic_inc(&po->mapped);
1da177e4
LT
4441 vma->vm_ops = &packet_mmap_ops;
4442 err = 0;
4443
4444out:
905db440 4445 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4446 return err;
4447}
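/*
 * Editor's note: illustrative user-space sketch (not part of af_packet.c) of
 * consuming a TPACKET_V2 RX ring once packet_mmap() above has mapped it.
 * Frames whose tp_status has TP_STATUS_USER set belong to user space and are
 * handed back by writing TP_STATUS_KERNEL.  frame_size and frame_nr must
 * match the tpacket_req passed to PACKET_RX_RING, and this simple indexing
 * assumes tp_block_size is an exact multiple of tp_frame_size so frames are
 * contiguous in the mapping.
 */
#if 0	/* user-space example only */
#include <poll.h>
#include <sys/mman.h>
#include <linux/if_packet.h>

static void rx_loop_v2(int fd, unsigned int frame_size, unsigned int frame_nr)
{
	size_t map_len = (size_t)frame_size * frame_nr;
	char *ring = mmap(NULL, map_len, PROT_READ | PROT_WRITE,
			  MAP_SHARED, fd, 0);
	unsigned int idx = 0;

	if (ring == MAP_FAILED)
		return;

	for (;;) {
		struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)
			(ring + (size_t)idx * frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER)) {
			/* Nothing ready: wait, as packet_poll() above allows. */
			struct pollfd pfd = { .fd = fd, .events = POLLIN };

			poll(&pfd, 1, -1);
			continue;
		}

		/* Packet data starts at hdr + hdr->tp_mac, hdr->tp_snaplen bytes. */

		hdr->tp_status = TP_STATUS_KERNEL;	/* give the frame back */
		idx = (idx + 1) % frame_nr;
	}
}
#endif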
1da177e4 4448
90ddc4f0 4449static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4450 .family = PF_PACKET,
4451 .owner = THIS_MODULE,
4452 .release = packet_release,
4453 .bind = packet_bind_spkt,
4454 .connect = sock_no_connect,
4455 .socketpair = sock_no_socketpair,
4456 .accept = sock_no_accept,
4457 .getname = packet_getname_spkt,
a11e1d43 4458 .poll = datagram_poll,
1da177e4
LT
4459 .ioctl = packet_ioctl,
4460 .listen = sock_no_listen,
4461 .shutdown = sock_no_shutdown,
4462 .setsockopt = sock_no_setsockopt,
4463 .getsockopt = sock_no_getsockopt,
4464 .sendmsg = packet_sendmsg_spkt,
4465 .recvmsg = packet_recvmsg,
4466 .mmap = sock_no_mmap,
4467 .sendpage = sock_no_sendpage,
4468};
1da177e4 4469
90ddc4f0 4470static const struct proto_ops packet_ops = {
1da177e4
LT
4471 .family = PF_PACKET,
4472 .owner = THIS_MODULE,
4473 .release = packet_release,
4474 .bind = packet_bind,
4475 .connect = sock_no_connect,
4476 .socketpair = sock_no_socketpair,
4477 .accept = sock_no_accept,
1ce4f28b 4478 .getname = packet_getname,
a11e1d43 4479 .poll = packet_poll,
1da177e4
LT
4480 .ioctl = packet_ioctl,
4481 .listen = sock_no_listen,
4482 .shutdown = sock_no_shutdown,
4483 .setsockopt = packet_setsockopt,
4484 .getsockopt = packet_getsockopt,
719c44d3
WB
4485#ifdef CONFIG_COMPAT
4486 .compat_setsockopt = compat_packet_setsockopt,
4487#endif
1da177e4
LT
4488 .sendmsg = packet_sendmsg,
4489 .recvmsg = packet_recvmsg,
4490 .mmap = packet_mmap,
4491 .sendpage = sock_no_sendpage,
4492};
4493
ec1b4cf7 4494static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4495 .family = PF_PACKET,
4496 .create = packet_create,
4497 .owner = THIS_MODULE,
4498};
4499
4500static struct notifier_block packet_netdev_notifier = {
40d4e3df 4501 .notifier_call = packet_notifier,
1da177e4
LT
4502};
4503
4504#ifdef CONFIG_PROC_FS
1da177e4
LT
4505
4506static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4507 __acquires(RCU)
1da177e4 4508{
e372c414 4509 struct net *net = seq_file_net(seq);
808f5114 4510
4511 rcu_read_lock();
4512 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4513}
4514
4515static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4516{
1bf40954 4517 struct net *net = seq_file_net(seq);
808f5114 4518 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4519}
4520
4521static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4522 __releases(RCU)
1da177e4 4523{
808f5114 4524 rcu_read_unlock();
1da177e4
LT
4525}
4526
1ce4f28b 4527static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4528{
4529 if (v == SEQ_START_TOKEN)
4530 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4531 else {
b7ceabd9 4532 struct sock *s = sk_entry(v);
1da177e4
LT
4533 const struct packet_sock *po = pkt_sk(s);
4534
4535 seq_printf(seq,
71338aa7 4536 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4 4537 s,
41c6d650 4538 refcount_read(&s->sk_refcnt),
1da177e4
LT
4539 s->sk_type,
4540 ntohs(po->num),
4541 po->ifindex,
4542 po->running,
4543 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4544 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4545 sock_i_ino(s));
1da177e4
LT
4546 }
4547
4548 return 0;
4549}
4550
56b3d975 4551static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4552 .start = packet_seq_start,
4553 .next = packet_seq_next,
4554 .stop = packet_seq_stop,
4555 .show = packet_seq_show,
4556};
1da177e4
LT
4557#endif
4558
2c8c1e72 4559static int __net_init packet_net_init(struct net *net)
d12d01d6 4560{
0fa7fa98 4561 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4562 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4563
c3506372
CH
4564 if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
4565 sizeof(struct seq_net_private)))
d12d01d6
DL
4566 return -ENOMEM;
4567
4568 return 0;
4569}
4570
2c8c1e72 4571static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4572{
ece31ffd 4573 remove_proc_entry("packet", net->proc_net);
669f8f1a 4574 WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
d12d01d6
DL
4575}
4576
4577static struct pernet_operations packet_net_ops = {
4578 .init = packet_net_init,
4579 .exit = packet_net_exit,
4580};
4581
4582
1da177e4
LT
4583static void __exit packet_exit(void)
4584{
1da177e4 4585 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4586 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4587 sock_unregister(PF_PACKET);
4588 proto_unregister(&packet_proto);
4589}
4590
4591static int __init packet_init(void)
4592{
4593 int rc = proto_register(&packet_proto, 0);
4594
4595 if (rc != 0)
4596 goto out;
4597
4598 sock_register(&packet_family_ops);
d12d01d6 4599 register_pernet_subsys(&packet_net_ops);
1da177e4 4600 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
4601out:
4602 return rc;
4603}
4604
4605module_init(packet_init);
4606module_exit(packet_exit);
4607MODULE_LICENSE("GPL");
4608MODULE_ALIAS_NETPROTO(PF_PACKET);