]> git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/blame - net/packet/af_packet.c
packets: Always register packet sk in the same order
[mirror_ubuntu-jammy-kernel.git] / net / packet / af_packet.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PACKET - implements raw packet sockets.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
1ce4f28b 12 * Fixes:
1da177e4
LT
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
1ce4f28b 35 * Ulises Alonso : Frame number limit removal and
1da177e4 36 * packet_set_ring memory leak.
0fb375fb
EB
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
1ce4f28b 40 * byte arrays at the end of sockaddr_ll
0fb375fb 41 * and packet_mreq.
69e3c75f 42 * Johann Baudy : Added TX RING.
f6fb8f10 43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
44 * layer.
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46 *
1da177e4
LT
47 *
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
52 *
53 */
1ce4f28b 54
1da177e4 55#include <linux/types.h>
1da177e4 56#include <linux/mm.h>
4fc268d2 57#include <linux/capability.h>
1da177e4
LT
58#include <linux/fcntl.h>
59#include <linux/socket.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_packet.h>
64#include <linux/wireless.h>
ffbc6111 65#include <linux/kernel.h>
1da177e4 66#include <linux/kmod.h>
5a0e3ad6 67#include <linux/slab.h>
0e3125c7 68#include <linux/vmalloc.h>
457c4cbc 69#include <net/net_namespace.h>
1da177e4
LT
70#include <net/ip.h>
71#include <net/protocol.h>
72#include <linux/skbuff.h>
73#include <net/sock.h>
74#include <linux/errno.h>
75#include <linux/timer.h>
7c0f6ba6 76#include <linux/uaccess.h>
1da177e4
LT
77#include <asm/ioctls.h>
78#include <asm/page.h>
a1f8e7f7 79#include <asm/cacheflush.h>
1da177e4
LT
80#include <asm/io.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
83#include <linux/poll.h>
84#include <linux/module.h>
85#include <linux/init.h>
905db440 86#include <linux/mutex.h>
05423b24 87#include <linux/if_vlan.h>
bfd5f4a3 88#include <linux/virtio_net.h>
ed85b565 89#include <linux/errqueue.h>
614f60fa 90#include <linux/net_tstamp.h>
b0138408 91#include <linux/percpu.h>
1da177e4
LT
92#ifdef CONFIG_INET
93#include <net/inet_common.h>
94#endif
47dceb8e 95#include <linux/bpf.h>
719c44d3 96#include <net/compat.h>
1da177e4 97
2787b04b
PE
98#include "internal.h"
99
1da177e4
LT
100/*
101 Assumptions:
102 - if device has no dev->hard_header routine, it adds and removes ll header
103 inside itself. In this case ll header is invisible outside of device,
104 but higher levels still should reserve dev->hard_header_len.
105 Some devices are enough clever to reallocate skb, when header
106 will not fit to reserved space (tunnel), another ones are silly
107 (PPP).
108 - packet socket receives packets with pulled ll header,
109 so that SOCK_RAW should push it back.
110
111On receive:
112-----------
113
114Incoming, dev->hard_header!=NULL
b0e380b1
ACM
115 mac_header -> ll header
116 data -> data
1da177e4
LT
117
118Outgoing, dev->hard_header!=NULL
b0e380b1
ACM
119 mac_header -> ll header
120 data -> ll header
1da177e4
LT
121
122Incoming, dev->hard_header==NULL
b0e380b1
ACM
123 mac_header -> UNKNOWN position. It is very likely, that it points to ll
124 header. PPP makes it, that is wrong, because introduce
db0c58f9 125 assymetry between rx and tx paths.
b0e380b1 126 data -> data
1da177e4
LT
127
128Outgoing, dev->hard_header==NULL
b0e380b1
ACM
129 mac_header -> data. ll header is still not built!
130 data -> data
1da177e4
LT
131
132Resume
133 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
134
135
136On transmit:
137------------
138
139dev->hard_header != NULL
b0e380b1
ACM
140 mac_header -> ll header
141 data -> ll header
1da177e4
LT
142
143dev->hard_header == NULL (ll header is added by device, we cannot control it)
b0e380b1
ACM
144 mac_header -> data
145 data -> data
1da177e4
LT
146
147 We should set nh.raw on output to correct posistion,
148 packet classifier depends on it.
149 */
150
1da177e4
LT
151/* Private packet socket structures. */
152
0fb375fb
EB
153/* identical to struct packet_mreq except it has
154 * a longer address field.
155 */
40d4e3df 156struct packet_mreq_max {
0fb375fb
EB
157 int mr_ifindex;
158 unsigned short mr_type;
159 unsigned short mr_alen;
160 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 161};
a2efcfa0 162
184f489e
DB
163union tpacket_uhdr {
164 struct tpacket_hdr *h1;
165 struct tpacket2_hdr *h2;
166 struct tpacket3_hdr *h3;
167 void *raw;
168};
169
f6fb8f10 170static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f
JB
171 int closing, int tx_ring);
172
f6fb8f10 173#define V3_ALIGNMENT (8)
174
bc59ba39 175#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
f6fb8f10 176
177#define BLK_PLUS_PRIV(sz_of_priv) \
178 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
179
f6fb8f10 180#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
181#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
182#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
183#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
184#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
185#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
186#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
187
69e3c75f 188struct packet_sock;
77f65ebd
WB
189static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
190 struct packet_type *pt, struct net_device *orig_dev);
1da177e4 191
f6fb8f10 192static void *packet_previous_frame(struct packet_sock *po,
193 struct packet_ring_buffer *rb,
194 int status);
195static void packet_increment_head(struct packet_ring_buffer *buff);
878cd3ba 196static int prb_curr_blk_in_use(struct tpacket_block_desc *);
bc59ba39 197static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 198 struct packet_sock *);
bc59ba39 199static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 200 struct packet_sock *, unsigned int status);
bc59ba39 201static int prb_queue_frozen(struct tpacket_kbdq_core *);
202static void prb_open_block(struct tpacket_kbdq_core *,
203 struct tpacket_block_desc *);
17bfd8c8 204static void prb_retire_rx_blk_timer_expired(struct timer_list *);
bc59ba39 205static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
bc59ba39 206static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
207static void prb_clear_rxhash(struct tpacket_kbdq_core *,
208 struct tpacket3_hdr *);
209static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
210 struct tpacket3_hdr *);
1da177e4 211static void packet_flush_mclist(struct sock *sk);
865b03f2 212static u16 packet_pick_tx_queue(struct sk_buff *skb);
1da177e4 213
ffbc6111 214struct packet_skb_cb {
ffbc6111
HX
215 union {
216 struct sockaddr_pkt pkt;
2472d761
EB
217 union {
218 /* Trick: alias skb original length with
219 * ll.sll_family and ll.protocol in order
220 * to save room.
221 */
222 unsigned int origlen;
223 struct sockaddr_ll ll;
224 };
ffbc6111
HX
225 } sa;
226};
227
d3869efe
DW
228#define vio_le() virtio_legacy_is_little_endian()
229
ffbc6111 230#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 231
bc59ba39 232#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 233#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 234 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 235#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 236 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 237#define GET_NEXT_PRB_BLK_NUM(x) \
238 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
239 ((x)->kactive_blk_num+1) : 0)
240
dc99f600
DM
241static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
242static void __fanout_link(struct sock *sk, struct packet_sock *po);
243
d346a3fa
DB
244static int packet_direct_xmit(struct sk_buff *skb)
245{
865b03f2 246 return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
d346a3fa
DB
247}
248
66e56cd4
DB
249static struct net_device *packet_cached_dev_get(struct packet_sock *po)
250{
251 struct net_device *dev;
252
253 rcu_read_lock();
254 dev = rcu_dereference(po->cached_dev);
255 if (likely(dev))
256 dev_hold(dev);
257 rcu_read_unlock();
258
259 return dev;
260}
261
262static void packet_cached_dev_assign(struct packet_sock *po,
263 struct net_device *dev)
264{
265 rcu_assign_pointer(po->cached_dev, dev);
266}
267
268static void packet_cached_dev_reset(struct packet_sock *po)
269{
270 RCU_INIT_POINTER(po->cached_dev, NULL);
271}
272
d346a3fa
DB
273static bool packet_use_direct_xmit(const struct packet_sock *po)
274{
275 return po->xmit == packet_direct_xmit;
276}
277
8ec56fc3
AD
278static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb,
279 struct net_device *sb_dev)
d346a3fa 280{
8ec56fc3 281 return dev_pick_tx_cpu_id(dev, skb, sb_dev, NULL);
d346a3fa
DB
282}
283
865b03f2 284static u16 packet_pick_tx_queue(struct sk_buff *skb)
0fd5d57b 285{
865b03f2 286 struct net_device *dev = skb->dev;
0fd5d57b
DB
287 const struct net_device_ops *ops = dev->netdev_ops;
288 u16 queue_index;
289
290 if (ops->ndo_select_queue) {
291 queue_index = ops->ndo_select_queue(dev, skb, NULL,
292 __packet_pick_tx_queue);
293 queue_index = netdev_cap_txqueue(dev, queue_index);
294 } else {
8ec56fc3 295 queue_index = __packet_pick_tx_queue(dev, skb, NULL);
0fd5d57b
DB
296 }
297
865b03f2 298 return queue_index;
0fd5d57b
DB
299}
300
a6361f0c 301/* __register_prot_hook must be invoked through register_prot_hook
ce06b03e
DM
302 * or from a context in which asynchronous accesses to the packet
303 * socket is not possible (packet_create()).
304 */
a6361f0c 305static void __register_prot_hook(struct sock *sk)
ce06b03e
DM
306{
307 struct packet_sock *po = pkt_sk(sk);
e40526cb 308
ce06b03e 309 if (!po->running) {
66e56cd4 310 if (po->fanout)
dc99f600 311 __fanout_link(sk, po);
66e56cd4 312 else
dc99f600 313 dev_add_pack(&po->prot_hook);
e40526cb 314
ce06b03e
DM
315 sock_hold(sk);
316 po->running = 1;
317 }
318}
319
a6361f0c
WB
320static void register_prot_hook(struct sock *sk)
321{
322 lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
323 __register_prot_hook(sk);
324}
325
326/* If the sync parameter is true, we will temporarily drop
ce06b03e
DM
327 * the po->bind_lock and do a synchronize_net to make sure no
328 * asynchronous packet processing paths still refer to the elements
329 * of po->prot_hook. If the sync parameter is false, it is the
330 * callers responsibility to take care of this.
331 */
332static void __unregister_prot_hook(struct sock *sk, bool sync)
333{
334 struct packet_sock *po = pkt_sk(sk);
335
a6361f0c
WB
336 lockdep_assert_held_once(&po->bind_lock);
337
ce06b03e 338 po->running = 0;
66e56cd4
DB
339
340 if (po->fanout)
dc99f600 341 __fanout_unlink(sk, po);
66e56cd4 342 else
dc99f600 343 __dev_remove_pack(&po->prot_hook);
e40526cb 344
ce06b03e
DM
345 __sock_put(sk);
346
347 if (sync) {
348 spin_unlock(&po->bind_lock);
349 synchronize_net();
350 spin_lock(&po->bind_lock);
351 }
352}
353
354static void unregister_prot_hook(struct sock *sk, bool sync)
355{
356 struct packet_sock *po = pkt_sk(sk);
357
358 if (po->running)
359 __unregister_prot_hook(sk, sync);
360}
361
6e58040b 362static inline struct page * __pure pgv_to_page(void *addr)
0af55bb5
CG
363{
364 if (is_vmalloc_addr(addr))
365 return vmalloc_to_page(addr);
366 return virt_to_page(addr);
367}
368
69e3c75f 369static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 370{
184f489e 371 union tpacket_uhdr h;
1da177e4 372
69e3c75f 373 h.raw = frame;
bbd6ef87
PM
374 switch (po->tp_version) {
375 case TPACKET_V1:
69e3c75f 376 h.h1->tp_status = status;
0af55bb5 377 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
378 break;
379 case TPACKET_V2:
69e3c75f 380 h.h2->tp_status = status;
0af55bb5 381 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 382 break;
f6fb8f10 383 case TPACKET_V3:
7f953ab2
SV
384 h.h3->tp_status = status;
385 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
386 break;
69e3c75f 387 default:
f6fb8f10 388 WARN(1, "TPACKET version not supported.\n");
69e3c75f 389 BUG();
bbd6ef87 390 }
69e3c75f
JB
391
392 smp_wmb();
bbd6ef87
PM
393}
394
69e3c75f 395static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87 396{
184f489e 397 union tpacket_uhdr h;
bbd6ef87 398
69e3c75f
JB
399 smp_rmb();
400
bbd6ef87
PM
401 h.raw = frame;
402 switch (po->tp_version) {
403 case TPACKET_V1:
0af55bb5 404 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 405 return h.h1->tp_status;
bbd6ef87 406 case TPACKET_V2:
0af55bb5 407 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 408 return h.h2->tp_status;
f6fb8f10 409 case TPACKET_V3:
7f953ab2
SV
410 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
411 return h.h3->tp_status;
69e3c75f 412 default:
f6fb8f10 413 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
414 BUG();
415 return 0;
bbd6ef87 416 }
1da177e4 417}
69e3c75f 418
b9c32fb2
DB
419static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
420 unsigned int flags)
7a51384c
DB
421{
422 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
423
68a360e8
WB
424 if (shhwtstamps &&
425 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
426 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
427 return TP_STATUS_TS_RAW_HARDWARE;
7a51384c
DB
428
429 if (ktime_to_timespec_cond(skb->tstamp, ts))
b9c32fb2 430 return TP_STATUS_TS_SOFTWARE;
7a51384c 431
b9c32fb2 432 return 0;
7a51384c
DB
433}
434
b9c32fb2
DB
435static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
436 struct sk_buff *skb)
2e31396f
WB
437{
438 union tpacket_uhdr h;
439 struct timespec ts;
b9c32fb2 440 __u32 ts_status;
2e31396f 441
b9c32fb2
DB
442 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
443 return 0;
2e31396f
WB
444
445 h.raw = frame;
446 switch (po->tp_version) {
447 case TPACKET_V1:
448 h.h1->tp_sec = ts.tv_sec;
449 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
450 break;
451 case TPACKET_V2:
452 h.h2->tp_sec = ts.tv_sec;
453 h.h2->tp_nsec = ts.tv_nsec;
454 break;
455 case TPACKET_V3:
57ea884b
DB
456 h.h3->tp_sec = ts.tv_sec;
457 h.h3->tp_nsec = ts.tv_nsec;
458 break;
2e31396f
WB
459 default:
460 WARN(1, "TPACKET version not supported.\n");
461 BUG();
462 }
463
464 /* one flush is safe, as both fields always lie on the same cacheline */
465 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
466 smp_wmb();
b9c32fb2
DB
467
468 return ts_status;
2e31396f
WB
469}
470
69e3c75f
JB
471static void *packet_lookup_frame(struct packet_sock *po,
472 struct packet_ring_buffer *rb,
473 unsigned int position,
474 int status)
475{
476 unsigned int pg_vec_pos, frame_offset;
184f489e 477 union tpacket_uhdr h;
69e3c75f
JB
478
479 pg_vec_pos = position / rb->frames_per_block;
480 frame_offset = position % rb->frames_per_block;
481
0e3125c7
NH
482 h.raw = rb->pg_vec[pg_vec_pos].buffer +
483 (frame_offset * rb->frame_size);
69e3c75f
JB
484
485 if (status != __packet_get_status(po, h.raw))
486 return NULL;
487
488 return h.raw;
489}
490
eea49cc9 491static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
492 struct packet_ring_buffer *rb,
493 int status)
494{
495 return packet_lookup_frame(po, rb, rb->head, status);
496}
497
bc59ba39 498static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 499{
500 del_timer_sync(&pkc->retire_blk_timer);
501}
502
503static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
f6fb8f10 504 struct sk_buff_head *rb_queue)
505{
bc59ba39 506 struct tpacket_kbdq_core *pkc;
f6fb8f10 507
73d0fcf2 508 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 509
ec6f809f 510 spin_lock_bh(&rb_queue->lock);
f6fb8f10 511 pkc->delete_blk_timer = 1;
ec6f809f 512 spin_unlock_bh(&rb_queue->lock);
f6fb8f10 513
514 prb_del_retire_blk_timer(pkc);
515}
516
e8e85cc5 517static void prb_setup_retire_blk_timer(struct packet_sock *po)
f6fb8f10 518{
bc59ba39 519 struct tpacket_kbdq_core *pkc;
f6fb8f10 520
e8e85cc5 521 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
17bfd8c8
KC
522 timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
523 0);
524 pkc->retire_blk_timer.expires = jiffies;
f6fb8f10 525}
526
527static int prb_calc_retire_blk_tmo(struct packet_sock *po,
528 int blk_size_in_bytes)
529{
530 struct net_device *dev;
531 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
7cad1bac 532 struct ethtool_link_ksettings ecmd;
4bc71cb9 533 int err;
f6fb8f10 534
4bc71cb9
JP
535 rtnl_lock();
536 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
537 if (unlikely(!dev)) {
538 rtnl_unlock();
f6fb8f10 539 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9 540 }
7cad1bac 541 err = __ethtool_get_link_ksettings(dev, &ecmd);
4bc71cb9
JP
542 rtnl_unlock();
543 if (!err) {
4bc71cb9
JP
544 /*
545 * If the link speed is so slow you don't really
546 * need to worry about perf anyways
547 */
7cad1bac
DD
548 if (ecmd.base.speed < SPEED_1000 ||
549 ecmd.base.speed == SPEED_UNKNOWN) {
4bc71cb9 550 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 551 } else {
552 msec = 1;
7cad1bac 553 div = ecmd.base.speed / 1000;
f6fb8f10 554 }
555 }
556
557 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
558
559 if (div)
560 mbits /= div;
561
562 tmo = mbits * msec;
563
564 if (div)
565 return tmo+1;
566 return tmo;
567}
568
bc59ba39 569static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 570 union tpacket_req_u *req_u)
571{
572 p1->feature_req_word = req_u->req3.tp_feature_req_word;
573}
574
575static void init_prb_bdqc(struct packet_sock *po,
576 struct packet_ring_buffer *rb,
577 struct pgv *pg_vec,
e8e85cc5 578 union tpacket_req_u *req_u)
f6fb8f10 579{
22781a5b 580 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
bc59ba39 581 struct tpacket_block_desc *pbd;
f6fb8f10 582
583 memset(p1, 0x0, sizeof(*p1));
584
585 p1->knxt_seq_num = 1;
586 p1->pkbdq = pg_vec;
bc59ba39 587 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 588 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 589 p1->kblk_size = req_u->req3.tp_block_size;
590 p1->knum_blocks = req_u->req3.tp_block_nr;
591 p1->hdrlen = po->tp_hdrlen;
592 p1->version = po->tp_version;
593 p1->last_kactive_blk_num = 0;
ee80fbf3 594 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 595 if (req_u->req3.tp_retire_blk_tov)
596 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
597 else
598 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
599 req_u->req3.tp_block_size);
600 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
601 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
602
dc808110 603 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
f6fb8f10 604 prb_init_ft_ops(p1, req_u);
e8e85cc5 605 prb_setup_retire_blk_timer(po);
f6fb8f10 606 prb_open_block(p1, pbd);
607}
608
609/* Do NOT update the last_blk_num first.
610 * Assumes sk_buff_head lock is held.
611 */
bc59ba39 612static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 613{
614 mod_timer(&pkc->retire_blk_timer,
615 jiffies + pkc->tov_in_jiffies);
616 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
617}
618
619/*
620 * Timer logic:
621 * 1) We refresh the timer only when we open a block.
622 * By doing this we don't waste cycles refreshing the timer
623 * on packet-by-packet basis.
624 *
625 * With a 1MB block-size, on a 1Gbps line, it will take
626 * i) ~8 ms to fill a block + ii) memcpy etc.
627 * In this cut we are not accounting for the memcpy time.
628 *
629 * So, if the user sets the 'tmo' to 10ms then the timer
630 * will never fire while the block is still getting filled
631 * (which is what we want). However, the user could choose
632 * to close a block early and that's fine.
633 *
634 * But when the timer does fire, we check whether or not to refresh it.
635 * Since the tmo granularity is in msecs, it is not too expensive
636 * to refresh the timer, lets say every '8' msecs.
637 * Either the user can set the 'tmo' or we can derive it based on
638 * a) line-speed and b) block-size.
639 * prb_calc_retire_blk_tmo() calculates the tmo.
640 *
641 */
17bfd8c8 642static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
f6fb8f10 643{
17bfd8c8
KC
644 struct packet_sock *po =
645 from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
22781a5b 646 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 647 unsigned int frozen;
bc59ba39 648 struct tpacket_block_desc *pbd;
f6fb8f10 649
650 spin_lock(&po->sk.sk_receive_queue.lock);
651
652 frozen = prb_queue_frozen(pkc);
653 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
654
655 if (unlikely(pkc->delete_blk_timer))
656 goto out;
657
658 /* We only need to plug the race when the block is partially filled.
659 * tpacket_rcv:
660 * lock(); increment BLOCK_NUM_PKTS; unlock()
661 * copy_bits() is in progress ...
662 * timer fires on other cpu:
663 * we can't retire the current block because copy_bits
664 * is in progress.
665 *
666 */
667 if (BLOCK_NUM_PKTS(pbd)) {
668 while (atomic_read(&pkc->blk_fill_in_prog)) {
669 /* Waiting for skb_copy_bits to finish... */
670 cpu_relax();
671 }
672 }
673
674 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
675 if (!frozen) {
41a50d62
AD
676 if (!BLOCK_NUM_PKTS(pbd)) {
677 /* An empty block. Just refresh the timer. */
678 goto refresh_timer;
679 }
f6fb8f10 680 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
681 if (!prb_dispatch_next_block(pkc, po))
682 goto refresh_timer;
683 else
684 goto out;
685 } else {
686 /* Case 1. Queue was frozen because user-space was
687 * lagging behind.
688 */
878cd3ba 689 if (prb_curr_blk_in_use(pbd)) {
f6fb8f10 690 /*
691 * Ok, user-space is still behind.
692 * So just refresh the timer.
693 */
694 goto refresh_timer;
695 } else {
696 /* Case 2. queue was frozen,user-space caught up,
697 * now the link went idle && the timer fired.
698 * We don't have a block to close.So we open this
699 * block and restart the timer.
700 * opening a block thaws the queue,restarts timer
701 * Thawing/timer-refresh is a side effect.
702 */
703 prb_open_block(pkc, pbd);
704 goto out;
705 }
706 }
707 }
708
709refresh_timer:
710 _prb_refresh_rx_retire_blk_timer(pkc);
711
712out:
713 spin_unlock(&po->sk.sk_receive_queue.lock);
714}
715
eea49cc9 716static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 717 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 718{
719 /* Flush everything minus the block header */
720
721#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
722 u8 *start, *end;
723
724 start = (u8 *)pbd1;
725
726 /* Skip the block header(we know header WILL fit in 4K) */
727 start += PAGE_SIZE;
728
729 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
730 for (; start < end; start += PAGE_SIZE)
731 flush_dcache_page(pgv_to_page(start));
732
733 smp_wmb();
734#endif
735
736 /* Now update the block status. */
737
738 BLOCK_STATUS(pbd1) = status;
739
740 /* Flush the block header */
741
742#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
743 start = (u8 *)pbd1;
744 flush_dcache_page(pgv_to_page(start));
745
746 smp_wmb();
747#endif
748}
749
750/*
751 * Side effect:
752 *
753 * 1) flush the block
754 * 2) Increment active_blk_num
755 *
756 * Note:We DONT refresh the timer on purpose.
757 * Because almost always the next block will be opened.
758 */
bc59ba39 759static void prb_close_block(struct tpacket_kbdq_core *pkc1,
760 struct tpacket_block_desc *pbd1,
f6fb8f10 761 struct packet_sock *po, unsigned int stat)
762{
763 __u32 status = TP_STATUS_USER | stat;
764
765 struct tpacket3_hdr *last_pkt;
bc59ba39 766 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
da413eec 767 struct sock *sk = &po->sk;
f6fb8f10 768
ee80fbf3 769 if (po->stats.stats3.tp_drops)
f6fb8f10 770 status |= TP_STATUS_LOSING;
771
772 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
773 last_pkt->tp_next_offset = 0;
774
775 /* Get the ts of the last pkt */
776 if (BLOCK_NUM_PKTS(pbd1)) {
777 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
778 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
779 } else {
41a50d62
AD
780 /* Ok, we tmo'd - so get the current time.
781 *
782 * It shouldn't really happen as we don't close empty
783 * blocks. See prb_retire_rx_blk_timer_expired().
784 */
f6fb8f10 785 struct timespec ts;
786 getnstimeofday(&ts);
787 h1->ts_last_pkt.ts_sec = ts.tv_sec;
788 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
789 }
790
791 smp_wmb();
792
793 /* Flush the block */
794 prb_flush_block(pkc1, pbd1, status);
795
da413eec
DC
796 sk->sk_data_ready(sk);
797
f6fb8f10 798 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
799}
800
eea49cc9 801static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 802{
803 pkc->reset_pending_on_curr_blk = 0;
804}
805
806/*
807 * Side effect of opening a block:
808 *
809 * 1) prb_queue is thawed.
810 * 2) retire_blk_timer is refreshed.
811 *
812 */
bc59ba39 813static void prb_open_block(struct tpacket_kbdq_core *pkc1,
814 struct tpacket_block_desc *pbd1)
f6fb8f10 815{
816 struct timespec ts;
bc59ba39 817 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 818
819 smp_rmb();
820
8da3056c
DB
821 /* We could have just memset this but we will lose the
822 * flexibility of making the priv area sticky
823 */
f6fb8f10 824
8da3056c
DB
825 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
826 BLOCK_NUM_PKTS(pbd1) = 0;
827 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 828
8da3056c
DB
829 getnstimeofday(&ts);
830
831 h1->ts_first_pkt.ts_sec = ts.tv_sec;
832 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
f6fb8f10 833
8da3056c
DB
834 pkc1->pkblk_start = (char *)pbd1;
835 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
836
837 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
838 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
839
840 pbd1->version = pkc1->version;
841 pkc1->prev = pkc1->nxt_offset;
842 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
843
844 prb_thaw_queue(pkc1);
845 _prb_refresh_rx_retire_blk_timer(pkc1);
846
847 smp_wmb();
f6fb8f10 848}
849
850/*
851 * Queue freeze logic:
852 * 1) Assume tp_block_nr = 8 blocks.
853 * 2) At time 't0', user opens Rx ring.
854 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
855 * 4) user-space is either sleeping or processing block '0'.
856 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
857 * it will close block-7,loop around and try to fill block '0'.
858 * call-flow:
859 * __packet_lookup_frame_in_block
860 * prb_retire_current_block()
861 * prb_dispatch_next_block()
862 * |->(BLOCK_STATUS == USER) evaluates to true
863 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
864 * 6) Now there are two cases:
865 * 6.1) Link goes idle right after the queue is frozen.
866 * But remember, the last open_block() refreshed the timer.
867 * When this timer expires,it will refresh itself so that we can
868 * re-open block-0 in near future.
869 * 6.2) Link is busy and keeps on receiving packets. This is a simple
870 * case and __packet_lookup_frame_in_block will check if block-0
871 * is free and can now be re-used.
872 */
eea49cc9 873static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 874 struct packet_sock *po)
875{
876 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 877 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 878}
879
880#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
881
882/*
883 * If the next block is free then we will dispatch it
884 * and return a good offset.
885 * Else, we will freeze the queue.
886 * So, caller must check the return value.
887 */
bc59ba39 888static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 889 struct packet_sock *po)
890{
bc59ba39 891 struct tpacket_block_desc *pbd;
f6fb8f10 892
893 smp_rmb();
894
895 /* 1. Get current block num */
896 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
897
898 /* 2. If this block is currently in_use then freeze the queue */
899 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
900 prb_freeze_queue(pkc, po);
901 return NULL;
902 }
903
904 /*
905 * 3.
906 * open this block and return the offset where the first packet
907 * needs to get stored.
908 */
909 prb_open_block(pkc, pbd);
910 return (void *)pkc->nxt_offset;
911}
912
bc59ba39 913static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 914 struct packet_sock *po, unsigned int status)
915{
bc59ba39 916 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 917
918 /* retire/close the current block */
919 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
920 /*
921 * Plug the case where copy_bits() is in progress on
922 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
923 * have space to copy the pkt in the current block and
924 * called prb_retire_current_block()
925 *
926 * We don't need to worry about the TMO case because
927 * the timer-handler already handled this case.
928 */
929 if (!(status & TP_STATUS_BLK_TMO)) {
930 while (atomic_read(&pkc->blk_fill_in_prog)) {
931 /* Waiting for skb_copy_bits to finish... */
932 cpu_relax();
933 }
934 }
935 prb_close_block(pkc, pbd, po, status);
936 return;
937 }
f6fb8f10 938}
939
878cd3ba 940static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
f6fb8f10 941{
942 return TP_STATUS_USER & BLOCK_STATUS(pbd);
943}
944
eea49cc9 945static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 946{
947 return pkc->reset_pending_on_curr_blk;
948}
949
eea49cc9 950static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 951{
bc59ba39 952 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 953 atomic_dec(&pkc->blk_fill_in_prog);
954}
955
eea49cc9 956static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 957 struct tpacket3_hdr *ppd)
958{
3958afa1 959 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
f6fb8f10 960}
961
eea49cc9 962static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 963 struct tpacket3_hdr *ppd)
964{
965 ppd->hv1.tp_rxhash = 0;
966}
967
eea49cc9 968static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 969 struct tpacket3_hdr *ppd)
970{
df8a39de
JP
971 if (skb_vlan_tag_present(pkc->skb)) {
972 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
a0cdfcf3
AW
973 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
974 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
f6fb8f10 975 } else {
9e67030a 976 ppd->hv1.tp_vlan_tci = 0;
a0cdfcf3 977 ppd->hv1.tp_vlan_tpid = 0;
9e67030a 978 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 979 }
980}
981
bc59ba39 982static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 983 struct tpacket3_hdr *ppd)
984{
a0cdfcf3 985 ppd->hv1.tp_padding = 0;
f6fb8f10 986 prb_fill_vlan_info(pkc, ppd);
987
988 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
989 prb_fill_rxhash(pkc, ppd);
990 else
991 prb_clear_rxhash(pkc, ppd);
992}
993
eea49cc9 994static void prb_fill_curr_block(char *curr,
bc59ba39 995 struct tpacket_kbdq_core *pkc,
996 struct tpacket_block_desc *pbd,
f6fb8f10 997 unsigned int len)
998{
999 struct tpacket3_hdr *ppd;
1000
1001 ppd = (struct tpacket3_hdr *)curr;
1002 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1003 pkc->prev = curr;
1004 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1005 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1006 BLOCK_NUM_PKTS(pbd) += 1;
1007 atomic_inc(&pkc->blk_fill_in_prog);
1008 prb_run_all_ft_ops(pkc, ppd);
1009}
1010
1011/* Assumes caller has the sk->rx_queue.lock */
1012static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1013 struct sk_buff *skb,
1014 int status,
1015 unsigned int len
1016 )
1017{
bc59ba39 1018 struct tpacket_kbdq_core *pkc;
1019 struct tpacket_block_desc *pbd;
f6fb8f10 1020 char *curr, *end;
1021
e3192690 1022 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 1023 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1024
1025 /* Queue is frozen when user space is lagging behind */
1026 if (prb_queue_frozen(pkc)) {
1027 /*
1028 * Check if that last block which caused the queue to freeze,
1029 * is still in_use by user-space.
1030 */
878cd3ba 1031 if (prb_curr_blk_in_use(pbd)) {
f6fb8f10 1032 /* Can't record this packet */
1033 return NULL;
1034 } else {
1035 /*
1036 * Ok, the block was released by user-space.
1037 * Now let's open that block.
1038 * opening a block also thaws the queue.
1039 * Thawing is a side effect.
1040 */
1041 prb_open_block(pkc, pbd);
1042 }
1043 }
1044
1045 smp_mb();
1046 curr = pkc->nxt_offset;
1047 pkc->skb = skb;
e3192690 1048 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 1049
1050 /* first try the current block */
1051 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1052 prb_fill_curr_block(curr, pkc, pbd, len);
1053 return (void *)curr;
1054 }
1055
1056 /* Ok, close the current block */
1057 prb_retire_current_block(pkc, po, 0);
1058
1059 /* Now, try to dispatch the next block */
1060 curr = (char *)prb_dispatch_next_block(pkc, po);
1061 if (curr) {
1062 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1063 prb_fill_curr_block(curr, pkc, pbd, len);
1064 return (void *)curr;
1065 }
1066
1067 /*
1068 * No free blocks are available.user_space hasn't caught up yet.
1069 * Queue was just frozen and now this packet will get dropped.
1070 */
1071 return NULL;
1072}
1073
eea49cc9 1074static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1075 struct sk_buff *skb,
1076 int status, unsigned int len)
1077{
1078 char *curr = NULL;
1079 switch (po->tp_version) {
1080 case TPACKET_V1:
1081 case TPACKET_V2:
1082 curr = packet_lookup_frame(po, &po->rx_ring,
1083 po->rx_ring.head, status);
1084 return curr;
1085 case TPACKET_V3:
1086 return __packet_lookup_frame_in_block(po, skb, status, len);
1087 default:
1088 WARN(1, "TPACKET version not supported\n");
1089 BUG();
99aa3473 1090 return NULL;
f6fb8f10 1091 }
1092}
1093
eea49cc9 1094static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1095 struct packet_ring_buffer *rb,
77f65ebd 1096 unsigned int idx,
f6fb8f10 1097 int status)
1098{
bc59ba39 1099 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1100 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1101
1102 if (status != BLOCK_STATUS(pbd))
1103 return NULL;
1104 return pbd;
1105}
1106
eea49cc9 1107static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1108{
1109 unsigned int prev;
1110 if (rb->prb_bdqc.kactive_blk_num)
1111 prev = rb->prb_bdqc.kactive_blk_num-1;
1112 else
1113 prev = rb->prb_bdqc.knum_blocks-1;
1114 return prev;
1115}
1116
1117/* Assumes caller has held the rx_queue.lock */
eea49cc9 1118static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1119 struct packet_ring_buffer *rb,
1120 int status)
1121{
1122 unsigned int previous = prb_previous_blk_num(rb);
1123 return prb_lookup_block(po, rb, previous, status);
1124}
1125
eea49cc9 1126static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1127 struct packet_ring_buffer *rb,
1128 int status)
1129{
1130 if (po->tp_version <= TPACKET_V2)
1131 return packet_previous_frame(po, rb, status);
1132
1133 return __prb_previous_block(po, rb, status);
1134}
1135
eea49cc9 1136static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1137 struct packet_ring_buffer *rb)
1138{
1139 switch (po->tp_version) {
1140 case TPACKET_V1:
1141 case TPACKET_V2:
1142 return packet_increment_head(rb);
1143 case TPACKET_V3:
1144 default:
1145 WARN(1, "TPACKET version not supported.\n");
1146 BUG();
1147 return;
1148 }
1149}
1150
eea49cc9 1151static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1152 struct packet_ring_buffer *rb,
1153 int status)
1154{
1155 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1156 return packet_lookup_frame(po, rb, previous, status);
1157}
1158
eea49cc9 1159static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1160{
1161 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1162}
1163
b0138408
DB
1164static void packet_inc_pending(struct packet_ring_buffer *rb)
1165{
1166 this_cpu_inc(*rb->pending_refcnt);
1167}
1168
1169static void packet_dec_pending(struct packet_ring_buffer *rb)
1170{
1171 this_cpu_dec(*rb->pending_refcnt);
1172}
1173
1174static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1175{
1176 unsigned int refcnt = 0;
1177 int cpu;
1178
1179 /* We don't use pending refcount in rx_ring. */
1180 if (rb->pending_refcnt == NULL)
1181 return 0;
1182
1183 for_each_possible_cpu(cpu)
1184 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1185
1186 return refcnt;
1187}
1188
1189static int packet_alloc_pending(struct packet_sock *po)
1190{
1191 po->rx_ring.pending_refcnt = NULL;
1192
1193 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1194 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1195 return -ENOBUFS;
1196
1197 return 0;
1198}
1199
1200static void packet_free_pending(struct packet_sock *po)
1201{
1202 free_percpu(po->tx_ring.pending_refcnt);
1203}
1204
9954729b
WB
1205#define ROOM_POW_OFF 2
1206#define ROOM_NONE 0x0
1207#define ROOM_LOW 0x1
1208#define ROOM_NORMAL 0x2
1209
1210static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
77f65ebd 1211{
9954729b
WB
1212 int idx, len;
1213
1214 len = po->rx_ring.frame_max + 1;
1215 idx = po->rx_ring.head;
1216 if (pow_off)
1217 idx += len >> pow_off;
1218 if (idx >= len)
1219 idx -= len;
1220 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1221}
1222
1223static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
1224{
1225 int idx, len;
1226
1227 len = po->rx_ring.prb_bdqc.knum_blocks;
1228 idx = po->rx_ring.prb_bdqc.kactive_blk_num;
1229 if (pow_off)
1230 idx += len >> pow_off;
1231 if (idx >= len)
1232 idx -= len;
1233 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1234}
77f65ebd 1235
2ccdbaa6 1236static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
9954729b
WB
1237{
1238 struct sock *sk = &po->sk;
1239 int ret = ROOM_NONE;
1240
1241 if (po->prot_hook.func != tpacket_rcv) {
1242 int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
2ccdbaa6 1243 - (skb ? skb->truesize : 0);
9954729b
WB
1244 if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
1245 return ROOM_NORMAL;
1246 else if (avail > 0)
1247 return ROOM_LOW;
1248 else
1249 return ROOM_NONE;
1250 }
77f65ebd 1251
9954729b
WB
1252 if (po->tp_version == TPACKET_V3) {
1253 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1254 ret = ROOM_NORMAL;
1255 else if (__tpacket_v3_has_room(po, 0))
1256 ret = ROOM_LOW;
1257 } else {
1258 if (__tpacket_has_room(po, ROOM_POW_OFF))
1259 ret = ROOM_NORMAL;
1260 else if (__tpacket_has_room(po, 0))
1261 ret = ROOM_LOW;
1262 }
2ccdbaa6
WB
1263
1264 return ret;
1265}
1266
1267static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1268{
1269 int ret;
1270 bool has_room;
1271
54d7c01d
WB
1272 spin_lock_bh(&po->sk.sk_receive_queue.lock);
1273 ret = __packet_rcv_has_room(po, skb);
2ccdbaa6
WB
1274 has_room = ret == ROOM_NORMAL;
1275 if (po->pressure == has_room)
54d7c01d
WB
1276 po->pressure = !has_room;
1277 spin_unlock_bh(&po->sk.sk_receive_queue.lock);
77f65ebd 1278
9954729b 1279 return ret;
77f65ebd
WB
1280}
1281
1da177e4
LT
1282static void packet_sock_destruct(struct sock *sk)
1283{
ed85b565
RC
1284 skb_queue_purge(&sk->sk_error_queue);
1285
547b792c 1286 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
14afee4b 1287 WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1da177e4
LT
1288
1289 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1290 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1291 return;
1292 }
1293
17ab56a2 1294 sk_refcnt_debug_dec(sk);
1da177e4
LT
1295}
1296
3b3a5b0a
WB
1297static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1298{
1299 u32 rxhash;
1300 int i, count = 0;
1301
1302 rxhash = skb_get_hash(skb);
1303 for (i = 0; i < ROLLOVER_HLEN; i++)
1304 if (po->rollover->history[i] == rxhash)
1305 count++;
1306
1307 po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
1308 return count > (ROLLOVER_HLEN >> 1);
1309}
1310
77f65ebd
WB
1311static unsigned int fanout_demux_hash(struct packet_fanout *f,
1312 struct sk_buff *skb,
1313 unsigned int num)
dc99f600 1314{
eb70db87 1315 return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
dc99f600
DM
1316}
1317
77f65ebd
WB
1318static unsigned int fanout_demux_lb(struct packet_fanout *f,
1319 struct sk_buff *skb,
1320 unsigned int num)
dc99f600 1321{
468479e6 1322 unsigned int val = atomic_inc_return(&f->rr_cur);
dc99f600 1323
468479e6 1324 return val % num;
77f65ebd
WB
1325}
1326
1327static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1328 struct sk_buff *skb,
1329 unsigned int num)
1330{
1331 return smp_processor_id() % num;
dc99f600
DM
1332}
1333
5df0ddfb
DB
1334static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1335 struct sk_buff *skb,
1336 unsigned int num)
1337{
f337db64 1338 return prandom_u32_max(num);
5df0ddfb
DB
1339}
1340
77f65ebd
WB
1341static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1342 struct sk_buff *skb,
ad377cab 1343 unsigned int idx, bool try_self,
77f65ebd 1344 unsigned int num)
95ec3eb4 1345{
4633c9e0 1346 struct packet_sock *po, *po_next, *po_skip = NULL;
a9b63918 1347 unsigned int i, j, room = ROOM_NONE;
95ec3eb4 1348
0648ab70 1349 po = pkt_sk(f->arr[idx]);
3b3a5b0a
WB
1350
1351 if (try_self) {
1352 room = packet_rcv_has_room(po, skb);
1353 if (room == ROOM_NORMAL ||
1354 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1355 return idx;
4633c9e0 1356 po_skip = po;
3b3a5b0a 1357 }
ad377cab 1358
0648ab70 1359 i = j = min_t(int, po->rollover->sock, num - 1);
77f65ebd 1360 do {
2ccdbaa6 1361 po_next = pkt_sk(f->arr[i]);
4633c9e0 1362 if (po_next != po_skip && !po_next->pressure &&
2ccdbaa6 1363 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
77f65ebd 1364 if (i != j)
0648ab70 1365 po->rollover->sock = i;
a9b63918
WB
1366 atomic_long_inc(&po->rollover->num);
1367 if (room == ROOM_LOW)
1368 atomic_long_inc(&po->rollover->num_huge);
77f65ebd
WB
1369 return i;
1370 }
ad377cab 1371
77f65ebd
WB
1372 if (++i == num)
1373 i = 0;
1374 } while (i != j);
1375
a9b63918 1376 atomic_long_inc(&po->rollover->num_failed);
77f65ebd
WB
1377 return idx;
1378}
1379
2d36097d
NH
1380static unsigned int fanout_demux_qm(struct packet_fanout *f,
1381 struct sk_buff *skb,
1382 unsigned int num)
1383{
1384 return skb_get_queue_mapping(skb) % num;
1385}
1386
47dceb8e
WB
1387static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1388 struct sk_buff *skb,
1389 unsigned int num)
1390{
1391 struct bpf_prog *prog;
1392 unsigned int ret = 0;
1393
1394 rcu_read_lock();
1395 prog = rcu_dereference(f->bpf_prog);
1396 if (prog)
ff936a04 1397 ret = bpf_prog_run_clear_cb(prog, skb) % num;
47dceb8e
WB
1398 rcu_read_unlock();
1399
1400 return ret;
1401}
1402
77f65ebd
WB
1403static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1404{
1405 return f->flags & (flag >> 8);
95ec3eb4
DM
1406}
1407
95ec3eb4
DM
1408static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1409 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1410{
1411 struct packet_fanout *f = pt->af_packet_priv;
f98f4514 1412 unsigned int num = READ_ONCE(f->num_members);
19bcf9f2 1413 struct net *net = read_pnet(&f->net);
dc99f600 1414 struct packet_sock *po;
77f65ebd 1415 unsigned int idx;
dc99f600 1416
19bcf9f2 1417 if (!net_eq(dev_net(dev), net) || !num) {
dc99f600
DM
1418 kfree_skb(skb);
1419 return 0;
1420 }
1421
3f34b24a 1422 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
19bcf9f2 1423 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
3f34b24a
AD
1424 if (!skb)
1425 return 0;
1426 }
95ec3eb4
DM
1427 switch (f->type) {
1428 case PACKET_FANOUT_HASH:
1429 default:
77f65ebd 1430 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1431 break;
1432 case PACKET_FANOUT_LB:
77f65ebd 1433 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1434 break;
1435 case PACKET_FANOUT_CPU:
77f65ebd
WB
1436 idx = fanout_demux_cpu(f, skb, num);
1437 break;
5df0ddfb
DB
1438 case PACKET_FANOUT_RND:
1439 idx = fanout_demux_rnd(f, skb, num);
1440 break;
2d36097d
NH
1441 case PACKET_FANOUT_QM:
1442 idx = fanout_demux_qm(f, skb, num);
1443 break;
77f65ebd 1444 case PACKET_FANOUT_ROLLOVER:
ad377cab 1445 idx = fanout_demux_rollover(f, skb, 0, false, num);
95ec3eb4 1446 break;
47dceb8e 1447 case PACKET_FANOUT_CBPF:
f2e52095 1448 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1449 idx = fanout_demux_bpf(f, skb, num);
1450 break;
dc99f600
DM
1451 }
1452
ad377cab
WB
1453 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1454 idx = fanout_demux_rollover(f, skb, idx, true, num);
dc99f600 1455
ad377cab 1456 po = pkt_sk(f->arr[idx]);
dc99f600
DM
1457 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1458}
1459
fff3321d
PE
1460DEFINE_MUTEX(fanout_mutex);
1461EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600 1462static LIST_HEAD(fanout_list);
4a69a864 1463static u16 fanout_next_id;
dc99f600
DM
1464
1465static void __fanout_link(struct sock *sk, struct packet_sock *po)
1466{
1467 struct packet_fanout *f = po->fanout;
1468
1469 spin_lock(&f->lock);
1470 f->arr[f->num_members] = sk;
1471 smp_wmb();
1472 f->num_members++;
2bd624b4
AS
1473 if (f->num_members == 1)
1474 dev_add_pack(&f->prot_hook);
dc99f600
DM
1475 spin_unlock(&f->lock);
1476}
1477
1478static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1479{
1480 struct packet_fanout *f = po->fanout;
1481 int i;
1482
1483 spin_lock(&f->lock);
1484 for (i = 0; i < f->num_members; i++) {
1485 if (f->arr[i] == sk)
1486 break;
1487 }
1488 BUG_ON(i >= f->num_members);
1489 f->arr[i] = f->arr[f->num_members - 1];
1490 f->num_members--;
2bd624b4
AS
1491 if (f->num_members == 0)
1492 __dev_remove_pack(&f->prot_hook);
dc99f600
DM
1493 spin_unlock(&f->lock);
1494}
1495
d4dd8aee 1496static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
c0de08d0 1497{
161642e2
ED
1498 if (sk->sk_family != PF_PACKET)
1499 return false;
c0de08d0 1500
161642e2 1501 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
c0de08d0
EL
1502}
1503
47dceb8e
WB
1504static void fanout_init_data(struct packet_fanout *f)
1505{
1506 switch (f->type) {
1507 case PACKET_FANOUT_LB:
1508 atomic_set(&f->rr_cur, 0);
1509 break;
1510 case PACKET_FANOUT_CBPF:
f2e52095 1511 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1512 RCU_INIT_POINTER(f->bpf_prog, NULL);
1513 break;
1514 }
1515}
1516
1517static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1518{
1519 struct bpf_prog *old;
1520
1521 spin_lock(&f->lock);
1522 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1523 rcu_assign_pointer(f->bpf_prog, new);
1524 spin_unlock(&f->lock);
1525
1526 if (old) {
1527 synchronize_net();
1528 bpf_prog_destroy(old);
1529 }
1530}
1531
1532static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1533 unsigned int len)
1534{
1535 struct bpf_prog *new;
1536 struct sock_fprog fprog;
1537 int ret;
1538
1539 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1540 return -EPERM;
1541 if (len != sizeof(fprog))
1542 return -EINVAL;
1543 if (copy_from_user(&fprog, data, len))
1544 return -EFAULT;
1545
bab18991 1546 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
47dceb8e
WB
1547 if (ret)
1548 return ret;
1549
1550 __fanout_set_data_bpf(po->fanout, new);
1551 return 0;
1552}
1553
f2e52095
WB
1554static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1555 unsigned int len)
1556{
1557 struct bpf_prog *new;
1558 u32 fd;
1559
1560 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1561 return -EPERM;
1562 if (len != sizeof(fd))
1563 return -EINVAL;
1564 if (copy_from_user(&fd, data, len))
1565 return -EFAULT;
1566
113214be 1567 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
f2e52095
WB
1568 if (IS_ERR(new))
1569 return PTR_ERR(new);
f2e52095
WB
1570
1571 __fanout_set_data_bpf(po->fanout, new);
1572 return 0;
1573}
1574
47dceb8e
WB
1575static int fanout_set_data(struct packet_sock *po, char __user *data,
1576 unsigned int len)
1577{
1578 switch (po->fanout->type) {
1579 case PACKET_FANOUT_CBPF:
1580 return fanout_set_data_cbpf(po, data, len);
f2e52095
WB
1581 case PACKET_FANOUT_EBPF:
1582 return fanout_set_data_ebpf(po, data, len);
47dceb8e
WB
1583 default:
1584 return -EINVAL;
07d53ae4 1585 }
47dceb8e
WB
1586}
1587
1588static void fanout_release_data(struct packet_fanout *f)
1589{
1590 switch (f->type) {
1591 case PACKET_FANOUT_CBPF:
f2e52095 1592 case PACKET_FANOUT_EBPF:
47dceb8e 1593 __fanout_set_data_bpf(f, NULL);
07d53ae4 1594 }
47dceb8e
WB
1595}
1596
4a69a864
MM
1597static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1598{
1599 struct packet_fanout *f;
1600
1601 list_for_each_entry(f, &fanout_list, list) {
1602 if (f->id == candidate_id &&
1603 read_pnet(&f->net) == sock_net(sk)) {
1604 return false;
1605 }
1606 }
1607 return true;
1608}
1609
1610static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1611{
1612 u16 id = fanout_next_id;
1613
1614 do {
1615 if (__fanout_id_is_free(sk, id)) {
1616 *new_id = id;
1617 fanout_next_id = id + 1;
1618 return true;
1619 }
1620
1621 id++;
1622 } while (id != fanout_next_id);
1623
1624 return false;
1625}
1626
7736d33f 1627static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600 1628{
d199fab6 1629 struct packet_rollover *rollover = NULL;
dc99f600
DM
1630 struct packet_sock *po = pkt_sk(sk);
1631 struct packet_fanout *f, *match;
7736d33f 1632 u8 type = type_flags & 0xff;
77f65ebd 1633 u8 flags = type_flags >> 8;
dc99f600
DM
1634 int err;
1635
1636 switch (type) {
77f65ebd
WB
1637 case PACKET_FANOUT_ROLLOVER:
1638 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1639 return -EINVAL;
dc99f600
DM
1640 case PACKET_FANOUT_HASH:
1641 case PACKET_FANOUT_LB:
95ec3eb4 1642 case PACKET_FANOUT_CPU:
5df0ddfb 1643 case PACKET_FANOUT_RND:
2d36097d 1644 case PACKET_FANOUT_QM:
47dceb8e 1645 case PACKET_FANOUT_CBPF:
f2e52095 1646 case PACKET_FANOUT_EBPF:
dc99f600
DM
1647 break;
1648 default:
1649 return -EINVAL;
1650 }
1651
d199fab6
ED
1652 mutex_lock(&fanout_mutex);
1653
d199fab6 1654 err = -EALREADY;
dc99f600 1655 if (po->fanout)
d199fab6 1656 goto out;
dc99f600 1657
4633c9e0
WB
1658 if (type == PACKET_FANOUT_ROLLOVER ||
1659 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
d199fab6
ED
1660 err = -ENOMEM;
1661 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1662 if (!rollover)
1663 goto out;
1664 atomic_long_set(&rollover->num, 0);
1665 atomic_long_set(&rollover->num_huge, 0);
1666 atomic_long_set(&rollover->num_failed, 0);
0648ab70
WB
1667 }
1668
4a69a864
MM
1669 if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1670 if (id != 0) {
1671 err = -EINVAL;
1672 goto out;
1673 }
1674 if (!fanout_find_new_id(sk, &id)) {
1675 err = -ENOMEM;
1676 goto out;
1677 }
1678 /* ephemeral flag for the first socket in the group: drop it */
1679 flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
1680 }
1681
dc99f600
DM
1682 match = NULL;
1683 list_for_each_entry(f, &fanout_list, list) {
1684 if (f->id == id &&
1685 read_pnet(&f->net) == sock_net(sk)) {
1686 match = f;
1687 break;
1688 }
1689 }
afe62c68 1690 err = -EINVAL;
77f65ebd 1691 if (match && match->flags != flags)
afe62c68 1692 goto out;
dc99f600 1693 if (!match) {
afe62c68 1694 err = -ENOMEM;
dc99f600 1695 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1696 if (!match)
1697 goto out;
1698 write_pnet(&match->net, sock_net(sk));
1699 match->id = id;
1700 match->type = type;
77f65ebd 1701 match->flags = flags;
afe62c68
ED
1702 INIT_LIST_HEAD(&match->list);
1703 spin_lock_init(&match->lock);
fb5c2c17 1704 refcount_set(&match->sk_ref, 0);
47dceb8e 1705 fanout_init_data(match);
afe62c68
ED
1706 match->prot_hook.type = po->prot_hook.type;
1707 match->prot_hook.dev = po->prot_hook.dev;
1708 match->prot_hook.func = packet_rcv_fanout;
1709 match->prot_hook.af_packet_priv = match;
c0de08d0 1710 match->prot_hook.id_match = match_fanout_group;
afe62c68 1711 list_add(&match->list, &fanout_list);
dc99f600 1712 }
afe62c68 1713 err = -EINVAL;
008ba2a1
WB
1714
1715 spin_lock(&po->bind_lock);
1716 if (po->running &&
1717 match->type == type &&
afe62c68
ED
1718 match->prot_hook.type == po->prot_hook.type &&
1719 match->prot_hook.dev == po->prot_hook.dev) {
1720 err = -ENOSPC;
fb5c2c17 1721 if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
afe62c68
ED
1722 __dev_remove_pack(&po->prot_hook);
1723 po->fanout = match;
57f015f5
MM
1724 po->rollover = rollover;
1725 rollover = NULL;
fb5c2c17 1726 refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
afe62c68
ED
1727 __fanout_link(sk, po);
1728 err = 0;
dc99f600
DM
1729 }
1730 }
008ba2a1
WB
1731 spin_unlock(&po->bind_lock);
1732
1733 if (err && !refcount_read(&match->sk_ref)) {
1734 list_del(&match->list);
1735 kfree(match);
1736 }
1737
afe62c68 1738out:
57f015f5 1739 kfree(rollover);
d199fab6 1740 mutex_unlock(&fanout_mutex);
dc99f600
DM
1741 return err;
1742}
1743
2bd624b4
AS
1744/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1745 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1746 * It is the responsibility of the caller to call fanout_release_data() and
1747 * free the returned packet_fanout (after synchronize_net())
1748 */
1749static struct packet_fanout *fanout_release(struct sock *sk)
dc99f600
DM
1750{
1751 struct packet_sock *po = pkt_sk(sk);
1752 struct packet_fanout *f;
1753
fff3321d 1754 mutex_lock(&fanout_mutex);
d199fab6
ED
1755 f = po->fanout;
1756 if (f) {
1757 po->fanout = NULL;
1758
fb5c2c17 1759 if (refcount_dec_and_test(&f->sk_ref))
d199fab6 1760 list_del(&f->list);
2bd624b4
AS
1761 else
1762 f = NULL;
dc99f600
DM
1763 }
1764 mutex_unlock(&fanout_mutex);
2bd624b4
AS
1765
1766 return f;
dc99f600 1767}
1da177e4 1768
3c70c132
DB
1769static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1770 struct sk_buff *skb)
1771{
1772 /* Earlier code assumed this would be a VLAN pkt, double-check
1773 * this now that we have the actual packet in hand. We can only
1774 * do this check on Ethernet devices.
1775 */
1776 if (unlikely(dev->type != ARPHRD_ETHER))
1777 return false;
1778
1779 skb_reset_mac_header(skb);
1780 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1781}
1782
90ddc4f0 1783static const struct proto_ops packet_ops;
1da177e4 1784
90ddc4f0 1785static const struct proto_ops packet_ops_spkt;
1da177e4 1786
40d4e3df
ED
1787static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1788 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1789{
1790 struct sock *sk;
1791 struct sockaddr_pkt *spkt;
1792
1793 /*
1794 * When we registered the protocol we saved the socket in the data
1795 * field for just this event.
1796 */
1797
1798 sk = pt->af_packet_priv;
1ce4f28b 1799
1da177e4
LT
1800 /*
1801 * Yank back the headers [hope the device set this
1802 * right or kerboom...]
1803 *
1804 * Incoming packets have ll header pulled,
1805 * push it back.
1806 *
98e399f8 1807 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1808 * so that this procedure is noop.
1809 */
1810
1811 if (skb->pkt_type == PACKET_LOOPBACK)
1812 goto out;
1813
09ad9bc7 1814 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1815 goto out;
1816
40d4e3df
ED
1817 skb = skb_share_check(skb, GFP_ATOMIC);
1818 if (skb == NULL)
1da177e4
LT
1819 goto oom;
1820
1821 /* drop any routing info */
adf30907 1822 skb_dst_drop(skb);
1da177e4 1823
84531c24
PO
1824 /* drop conntrack reference */
1825 nf_reset(skb);
1826
ffbc6111 1827 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1828
98e399f8 1829 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1830
1831 /*
1832 * The SOCK_PACKET socket receives _all_ frames.
1833 */
1834
1835 spkt->spkt_family = dev->type;
1836 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1837 spkt->spkt_protocol = skb->protocol;
1838
1839 /*
1840 * Charge the memory to the socket. This is done specifically
1841 * to prevent sockets using all the memory up.
1842 */
1843
40d4e3df 1844 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1845 return 0;
1846
1847out:
1848 kfree_skb(skb);
1849oom:
1850 return 0;
1851}
1852
75c65772
MM
1853static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
1854{
1855 if (!skb->protocol && sock->type == SOCK_RAW) {
1856 skb_reset_mac_header(skb);
1857 skb->protocol = dev_parse_header_protocol(skb);
1858 }
1859
1860 skb_probe_transport_header(skb);
1861}
1da177e4
LT
1862
1863/*
1864 * Output a raw packet to a device layer. This bypasses all the other
1865 * protocol layers and you must therefore supply it with a complete frame
1866 */
1ce4f28b 1867
1b784140
YX
1868static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1869 size_t len)
1da177e4
LT
1870{
1871 struct sock *sk = sock->sk;
342dfc30 1872 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1873 struct sk_buff *skb = NULL;
1da177e4 1874 struct net_device *dev;
c14ac945 1875 struct sockcm_cookie sockc;
40d4e3df 1876 __be16 proto = 0;
1da177e4 1877 int err;
3bdc0eba 1878 int extra_len = 0;
1ce4f28b 1879
1da177e4 1880 /*
1ce4f28b 1881 * Get and verify the address.
1da177e4
LT
1882 */
1883
40d4e3df 1884 if (saddr) {
1da177e4 1885 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1886 return -EINVAL;
1887 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1888 proto = saddr->spkt_protocol;
1889 } else
1890 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1891
1892 /*
1ce4f28b 1893 * Find the device first to size check it
1da177e4
LT
1894 */
1895
de74e92a 1896 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1897retry:
654d1f8a
ED
1898 rcu_read_lock();
1899 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1900 err = -ENODEV;
1901 if (dev == NULL)
1902 goto out_unlock;
1ce4f28b 1903
d5e76b0a
DM
1904 err = -ENETDOWN;
1905 if (!(dev->flags & IFF_UP))
1906 goto out_unlock;
1907
1da177e4 1908 /*
40d4e3df
ED
1909 * You may not queue a frame bigger than the mtu. This is the lowest level
1910 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1911 */
1ce4f28b 1912
3bdc0eba
BG
1913 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1914 if (!netif_supports_nofcs(dev)) {
1915 err = -EPROTONOSUPPORT;
1916 goto out_unlock;
1917 }
1918 extra_len = 4; /* We're doing our own CRC */
1919 }
1920
1da177e4 1921 err = -EMSGSIZE;
3bdc0eba 1922 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1923 goto out_unlock;
1924
1a35ca80
ED
1925 if (!skb) {
1926 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1927 int tlen = dev->needed_tailroom;
1a35ca80
ED
1928 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1929
1930 rcu_read_unlock();
4ce40912 1931 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1932 if (skb == NULL)
1933 return -ENOBUFS;
1934 /* FIXME: Save some space for broken drivers that write a hard
1935 * header at transmission time by themselves. PPP is the notable
1936 * one here. This should really be fixed at the driver level.
1937 */
1938 skb_reserve(skb, reserved);
1939 skb_reset_network_header(skb);
1940
1941 /* Try to align data part correctly */
1942 if (hhlen) {
1943 skb->data -= hhlen;
1944 skb->tail -= hhlen;
1945 if (len < hhlen)
1946 skb_reset_network_header(skb);
1947 }
6ce8e9ce 1948 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1949 if (err)
1950 goto out_free;
1951 goto retry;
1da177e4
LT
1952 }
1953
9ed988cd
WB
1954 if (!dev_validate_header(dev, skb->data, len)) {
1955 err = -EINVAL;
1956 goto out_unlock;
1957 }
3c70c132
DB
1958 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1959 !packet_extra_vlan_len_allowed(dev, skb)) {
1960 err = -EMSGSIZE;
1961 goto out_unlock;
57f89bfa 1962 }
1a35ca80 1963
657a0667 1964 sockcm_init(&sockc, sk);
c14ac945
SHY
1965 if (msg->msg_controllen) {
1966 err = sock_cmsg_send(sk, msg, &sockc);
f8e7718c 1967 if (unlikely(err))
c14ac945 1968 goto out_unlock;
c14ac945
SHY
1969 }
1970
1da177e4
LT
1971 skb->protocol = proto;
1972 skb->dev = dev;
1973 skb->priority = sk->sk_priority;
2d37a186 1974 skb->mark = sk->sk_mark;
3d0ba8c0 1975 skb->tstamp = sockc.transmit_time;
bf84a010 1976
8f932f76 1977 skb_setup_tx_timestamp(skb, sockc.tsflags);
1da177e4 1978
3bdc0eba
BG
1979 if (unlikely(extra_len == 4))
1980 skb->no_fcs = 1;
1981
75c65772 1982 packet_parse_headers(skb, sock);
c1aad275 1983
1da177e4 1984 dev_queue_xmit(skb);
654d1f8a 1985 rcu_read_unlock();
40d4e3df 1986 return len;
1da177e4 1987
1da177e4 1988out_unlock:
654d1f8a 1989 rcu_read_unlock();
1a35ca80
ED
1990out_free:
1991 kfree_skb(skb);
1da177e4
LT
1992 return err;
1993}
1da177e4 1994
ff936a04
AS
1995static unsigned int run_filter(struct sk_buff *skb,
1996 const struct sock *sk,
1997 unsigned int res)
1da177e4
LT
1998{
1999 struct sk_filter *filter;
fda9ef5d 2000
80f8f102
ED
2001 rcu_read_lock();
2002 filter = rcu_dereference(sk->sk_filter);
dbcb5855 2003 if (filter != NULL)
ff936a04 2004 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 2005 rcu_read_unlock();
1da177e4 2006
dbcb5855 2007 return res;
1da177e4
LT
2008}
2009
16cc1400
WB
2010static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2011 size_t *len)
2012{
2013 struct virtio_net_hdr vnet_hdr;
2014
2015 if (*len < sizeof(vnet_hdr))
2016 return -EINVAL;
2017 *len -= sizeof(vnet_hdr);
2018
fd3a8862 2019 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
16cc1400
WB
2020 return -EINVAL;
2021
2022 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2023}
2024
1da177e4 2025/*
62ab0812
ED
2026 * This function makes lazy skb cloning in hope that most of packets
2027 * are discarded by BPF.
2028 *
2029 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
2030 * and skb->cb are mangled. It works because (and until) packets
2031 * falling here are owned by current CPU. Output packets are cloned
2032 * by dev_queue_xmit_nit(), input packets are processed by net_bh
2033 * sequencially, so that if we return skb to original state on exit,
2034 * we will not harm anyone.
1da177e4
LT
2035 */
2036
40d4e3df
ED
2037static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2038 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2039{
2040 struct sock *sk;
2041 struct sockaddr_ll *sll;
2042 struct packet_sock *po;
40d4e3df 2043 u8 *skb_head = skb->data;
1da177e4 2044 int skb_len = skb->len;
dbcb5855 2045 unsigned int snaplen, res;
da37845f 2046 bool is_drop_n_account = false;
1da177e4
LT
2047
2048 if (skb->pkt_type == PACKET_LOOPBACK)
2049 goto drop;
2050
2051 sk = pt->af_packet_priv;
2052 po = pkt_sk(sk);
2053
09ad9bc7 2054 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2055 goto drop;
2056
1da177e4
LT
2057 skb->dev = dev;
2058
3b04ddde 2059 if (dev->header_ops) {
1da177e4 2060 /* The device has an explicit notion of ll header,
62ab0812
ED
2061 * exported to higher levels.
2062 *
2063 * Otherwise, the device hides details of its frame
2064 * structure, so that corresponding packet head is
2065 * never delivered to user.
1da177e4
LT
2066 */
2067 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2068 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2069 else if (skb->pkt_type == PACKET_OUTGOING) {
2070 /* Special case: outgoing packets have ll header at head */
bbe735e4 2071 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2072 }
2073 }
2074
2075 snaplen = skb->len;
2076
dbcb5855
DM
2077 res = run_filter(skb, sk, snaplen);
2078 if (!res)
fda9ef5d 2079 goto drop_n_restore;
dbcb5855
DM
2080 if (snaplen > res)
2081 snaplen = res;
1da177e4 2082
0fd7bac6 2083 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2084 goto drop_n_acct;
2085
2086 if (skb_shared(skb)) {
2087 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2088 if (nskb == NULL)
2089 goto drop_n_acct;
2090
2091 if (skb_head != skb->data) {
2092 skb->data = skb_head;
2093 skb->len = skb_len;
2094 }
abc4e4fa 2095 consume_skb(skb);
1da177e4
LT
2096 skb = nskb;
2097 }
2098
b4772ef8 2099 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2100
2101 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2102 sll->sll_hatype = dev->type;
1da177e4 2103 sll->sll_pkttype = skb->pkt_type;
8032b464 2104 if (unlikely(po->origdev))
80feaacb
PWJ
2105 sll->sll_ifindex = orig_dev->ifindex;
2106 else
2107 sll->sll_ifindex = dev->ifindex;
1da177e4 2108
b95cce35 2109 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2110
2472d761
EB
2111 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2112 * Use their space for storing the original skb length.
2113 */
2114 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2115
1da177e4
LT
2116 if (pskb_trim(skb, snaplen))
2117 goto drop_n_acct;
2118
2119 skb_set_owner_r(skb, sk);
2120 skb->dev = NULL;
adf30907 2121 skb_dst_drop(skb);
1da177e4 2122
84531c24
PO
2123 /* drop conntrack reference */
2124 nf_reset(skb);
2125
1da177e4 2126 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2127 po->stats.stats1.tp_packets++;
3bc3b96f 2128 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
2129 __skb_queue_tail(&sk->sk_receive_queue, skb);
2130 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2131 sk->sk_data_ready(sk);
1da177e4
LT
2132 return 0;
2133
2134drop_n_acct:
da37845f 2135 is_drop_n_account = true;
7091fbd8 2136 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2137 po->stats.stats1.tp_drops++;
7091fbd8
WB
2138 atomic_inc(&sk->sk_drops);
2139 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
2140
2141drop_n_restore:
2142 if (skb_head != skb->data && skb_shared(skb)) {
2143 skb->data = skb_head;
2144 skb->len = skb_len;
2145 }
2146drop:
da37845f
WJ
2147 if (!is_drop_n_account)
2148 consume_skb(skb);
2149 else
2150 kfree_skb(skb);
1da177e4
LT
2151 return 0;
2152}
2153
40d4e3df
ED
2154static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2155 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2156{
2157 struct sock *sk;
2158 struct packet_sock *po;
2159 struct sockaddr_ll *sll;
184f489e 2160 union tpacket_uhdr h;
40d4e3df 2161 u8 *skb_head = skb->data;
1da177e4 2162 int skb_len = skb->len;
dbcb5855 2163 unsigned int snaplen, res;
f6fb8f10 2164 unsigned long status = TP_STATUS_USER;
bbd6ef87 2165 unsigned short macoff, netoff, hdrlen;
1da177e4 2166 struct sk_buff *copy_skb = NULL;
bbd6ef87 2167 struct timespec ts;
b9c32fb2 2168 __u32 ts_status;
da37845f 2169 bool is_drop_n_account = false;
edbd58be 2170 bool do_vnet = false;
1da177e4 2171
51846355
AW
2172 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2173 * We may add members to them until current aligned size without forcing
2174 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2175 */
2176 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2177 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2178
1da177e4
LT
2179 if (skb->pkt_type == PACKET_LOOPBACK)
2180 goto drop;
2181
2182 sk = pt->af_packet_priv;
2183 po = pkt_sk(sk);
2184
09ad9bc7 2185 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2186 goto drop;
2187
3b04ddde 2188 if (dev->header_ops) {
1da177e4 2189 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2190 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2191 else if (skb->pkt_type == PACKET_OUTGOING) {
2192 /* Special case: outgoing packets have ll header at head */
bbe735e4 2193 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2194 }
2195 }
2196
2197 snaplen = skb->len;
2198
dbcb5855
DM
2199 res = run_filter(skb, sk, snaplen);
2200 if (!res)
fda9ef5d 2201 goto drop_n_restore;
68c2e5de
AD
2202
2203 if (skb->ip_summed == CHECKSUM_PARTIAL)
2204 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2205 else if (skb->pkt_type != PACKET_OUTGOING &&
2206 (skb->ip_summed == CHECKSUM_COMPLETE ||
2207 skb_csum_unnecessary(skb)))
2208 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2209
dbcb5855
DM
2210 if (snaplen > res)
2211 snaplen = res;
1da177e4
LT
2212
2213 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2214 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2215 po->tp_reserve;
1da177e4 2216 } else {
95c96174 2217 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2218 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2219 (maclen < 16 ? 16 : maclen)) +
58d19b19 2220 po->tp_reserve;
edbd58be 2221 if (po->has_vnet_hdr) {
58d19b19 2222 netoff += sizeof(struct virtio_net_hdr);
edbd58be
BP
2223 do_vnet = true;
2224 }
1da177e4
LT
2225 macoff = netoff - maclen;
2226 }
f6fb8f10 2227 if (po->tp_version <= TPACKET_V2) {
2228 if (macoff + snaplen > po->rx_ring.frame_size) {
2229 if (po->copy_thresh &&
0fd7bac6 2230 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2231 if (skb_shared(skb)) {
2232 copy_skb = skb_clone(skb, GFP_ATOMIC);
2233 } else {
2234 copy_skb = skb_get(skb);
2235 skb_head = skb->data;
2236 }
2237 if (copy_skb)
2238 skb_set_owner_r(copy_skb, sk);
1da177e4 2239 }
f6fb8f10 2240 snaplen = po->rx_ring.frame_size - macoff;
edbd58be 2241 if ((int)snaplen < 0) {
f6fb8f10 2242 snaplen = 0;
edbd58be
BP
2243 do_vnet = false;
2244 }
1da177e4 2245 }
dc808110
ED
2246 } else if (unlikely(macoff + snaplen >
2247 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2248 u32 nval;
2249
2250 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2251 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2252 snaplen, nval, macoff);
2253 snaplen = nval;
2254 if (unlikely((int)snaplen < 0)) {
2255 snaplen = 0;
2256 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
edbd58be 2257 do_vnet = false;
dc808110 2258 }
1da177e4 2259 }
1da177e4 2260 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2261 h.raw = packet_current_rx_frame(po, skb,
2262 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2263 if (!h.raw)
58d19b19 2264 goto drop_n_account;
f6fb8f10 2265 if (po->tp_version <= TPACKET_V2) {
2266 packet_increment_rx_head(po, &po->rx_ring);
2267 /*
2268 * LOSING will be reported till you read the stats,
2269 * because it's COR - Clear On Read.
2270 * Anyways, moving it for V1/V2 only as V3 doesn't need this
2271 * at packet level.
2272 */
ee80fbf3 2273 if (po->stats.stats1.tp_drops)
f6fb8f10 2274 status |= TP_STATUS_LOSING;
2275 }
945d015e
ED
2276
2277 if (do_vnet &&
2278 virtio_net_hdr_from_skb(skb, h.raw + macoff -
2279 sizeof(struct virtio_net_hdr),
2280 vio_le(), true, 0))
2281 goto drop_n_account;
2282
ee80fbf3 2283 po->stats.stats1.tp_packets++;
1da177e4
LT
2284 if (copy_skb) {
2285 status |= TP_STATUS_COPY;
2286 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2287 }
1da177e4
LT
2288 spin_unlock(&sk->sk_receive_queue.lock);
2289
bbd6ef87 2290 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2291
2292 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2293 getnstimeofday(&ts);
1da177e4 2294
b9c32fb2
DB
2295 status |= ts_status;
2296
bbd6ef87
PM
2297 switch (po->tp_version) {
2298 case TPACKET_V1:
2299 h.h1->tp_len = skb->len;
2300 h.h1->tp_snaplen = snaplen;
2301 h.h1->tp_mac = macoff;
2302 h.h1->tp_net = netoff;
4b457bdf
DB
2303 h.h1->tp_sec = ts.tv_sec;
2304 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2305 hdrlen = sizeof(*h.h1);
2306 break;
2307 case TPACKET_V2:
2308 h.h2->tp_len = skb->len;
2309 h.h2->tp_snaplen = snaplen;
2310 h.h2->tp_mac = macoff;
2311 h.h2->tp_net = netoff;
bbd6ef87
PM
2312 h.h2->tp_sec = ts.tv_sec;
2313 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2314 if (skb_vlan_tag_present(skb)) {
2315 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2316 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2317 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2318 } else {
2319 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2320 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2321 }
e4d26f4b 2322 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2323 hdrlen = sizeof(*h.h2);
2324 break;
f6fb8f10 2325 case TPACKET_V3:
2326 /* tp_nxt_offset,vlan are already populated above.
2327 * So DONT clear those fields here
2328 */
2329 h.h3->tp_status |= status;
2330 h.h3->tp_len = skb->len;
2331 h.h3->tp_snaplen = snaplen;
2332 h.h3->tp_mac = macoff;
2333 h.h3->tp_net = netoff;
f6fb8f10 2334 h.h3->tp_sec = ts.tv_sec;
2335 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2336 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2337 hdrlen = sizeof(*h.h3);
2338 break;
bbd6ef87
PM
2339 default:
2340 BUG();
2341 }
1da177e4 2342
bbd6ef87 2343 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2344 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2345 sll->sll_family = AF_PACKET;
2346 sll->sll_hatype = dev->type;
2347 sll->sll_protocol = skb->protocol;
2348 sll->sll_pkttype = skb->pkt_type;
8032b464 2349 if (unlikely(po->origdev))
80feaacb
PWJ
2350 sll->sll_ifindex = orig_dev->ifindex;
2351 else
2352 sll->sll_ifindex = dev->ifindex;
1da177e4 2353
e16aa207 2354 smp_mb();
f0d4eb29 2355
f6dafa95 2356#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2357 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2358 u8 *start, *end;
2359
f0d4eb29
DB
2360 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2361 macoff + snaplen);
2362
2363 for (start = h.raw; start < end; start += PAGE_SIZE)
2364 flush_dcache_page(pgv_to_page(start));
1da177e4 2365 }
f0d4eb29 2366 smp_wmb();
f6dafa95 2367#endif
f0d4eb29 2368
da413eec 2369 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2370 __packet_set_status(po, h.raw, status);
da413eec
DC
2371 sk->sk_data_ready(sk);
2372 } else {
f6fb8f10 2373 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2374 }
1da177e4
LT
2375
2376drop_n_restore:
2377 if (skb_head != skb->data && skb_shared(skb)) {
2378 skb->data = skb_head;
2379 skb->len = skb_len;
2380 }
2381drop:
da37845f
WJ
2382 if (!is_drop_n_account)
2383 consume_skb(skb);
2384 else
2385 kfree_skb(skb);
1da177e4
LT
2386 return 0;
2387
58d19b19 2388drop_n_account:
da37845f 2389 is_drop_n_account = true;
ee80fbf3 2390 po->stats.stats1.tp_drops++;
1da177e4
LT
2391 spin_unlock(&sk->sk_receive_queue.lock);
2392
676d2369 2393 sk->sk_data_ready(sk);
acb5d75b 2394 kfree_skb(copy_skb);
1da177e4
LT
2395 goto drop_n_restore;
2396}
2397
69e3c75f
JB
2398static void tpacket_destruct_skb(struct sk_buff *skb)
2399{
2400 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2401
69e3c75f 2402 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2403 void *ph;
b9c32fb2
DB
2404 __u32 ts;
2405
5cd8d46e 2406 ph = skb_zcopy_get_nouarg(skb);
b0138408 2407 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2408
2409 ts = __packet_set_timestamp(po, ph, skb);
2410 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2411 }
2412
2413 sock_wfree(skb);
2414}
2415
16cc1400
WB
2416static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2417{
16cc1400
WB
2418 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2419 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2420 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2421 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2422 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2423 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2424 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2425
2426 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2427 return -EINVAL;
2428
16cc1400
WB
2429 return 0;
2430}
2431
2432static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2433 struct virtio_net_hdr *vnet_hdr)
2434{
16cc1400
WB
2435 if (*len < sizeof(*vnet_hdr))
2436 return -EINVAL;
2437 *len -= sizeof(*vnet_hdr);
2438
cbbd26b8 2439 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
16cc1400
WB
2440 return -EFAULT;
2441
2442 return __packet_snd_vnet_parse(vnet_hdr, *len);
2443}
2444
40d4e3df 2445static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2446 void *frame, struct net_device *dev, void *data, int tp_len,
c14ac945
SHY
2447 __be16 proto, unsigned char *addr, int hlen, int copylen,
2448 const struct sockcm_cookie *sockc)
69e3c75f 2449{
184f489e 2450 union tpacket_uhdr ph;
8d39b4a6 2451 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2452 struct socket *sock = po->sk.sk_socket;
2453 struct page *page;
69e3c75f
JB
2454 int err;
2455
2456 ph.raw = frame;
2457
2458 skb->protocol = proto;
2459 skb->dev = dev;
2460 skb->priority = po->sk.sk_priority;
2d37a186 2461 skb->mark = po->sk.sk_mark;
3d0ba8c0 2462 skb->tstamp = sockc->transmit_time;
8f932f76 2463 skb_setup_tx_timestamp(skb, sockc->tsflags);
5cd8d46e 2464 skb_zcopy_set_nouarg(skb, ph.raw);
69e3c75f 2465
ae641949 2466 skb_reserve(skb, hlen);
69e3c75f 2467 skb_reset_network_header(skb);
c1aad275 2468
69e3c75f
JB
2469 to_write = tp_len;
2470
2471 if (sock->type == SOCK_DGRAM) {
2472 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2473 NULL, tp_len);
2474 if (unlikely(err < 0))
2475 return -EINVAL;
1d036d25 2476 } else if (copylen) {
9ed988cd
WB
2477 int hdrlen = min_t(int, copylen, tp_len);
2478
69e3c75f 2479 skb_push(skb, dev->hard_header_len);
1d036d25 2480 skb_put(skb, copylen - dev->hard_header_len);
9ed988cd 2481 err = skb_store_bits(skb, 0, data, hdrlen);
69e3c75f
JB
2482 if (unlikely(err))
2483 return err;
9ed988cd
WB
2484 if (!dev_validate_header(dev, skb->data, hdrlen))
2485 return -EINVAL;
69e3c75f 2486
9ed988cd
WB
2487 data += hdrlen;
2488 to_write -= hdrlen;
69e3c75f
JB
2489 }
2490
69e3c75f
JB
2491 offset = offset_in_page(data);
2492 len_max = PAGE_SIZE - offset;
2493 len = ((to_write > len_max) ? len_max : to_write);
2494
2495 skb->data_len = to_write;
2496 skb->len += to_write;
2497 skb->truesize += to_write;
14afee4b 2498 refcount_add(to_write, &po->sk.sk_wmem_alloc);
69e3c75f
JB
2499
2500 while (likely(to_write)) {
2501 nr_frags = skb_shinfo(skb)->nr_frags;
2502
2503 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2504 pr_err("Packet exceed the number of skb frags(%lu)\n",
2505 MAX_SKB_FRAGS);
69e3c75f
JB
2506 return -EFAULT;
2507 }
2508
0af55bb5
CG
2509 page = pgv_to_page(data);
2510 data += len;
69e3c75f
JB
2511 flush_dcache_page(page);
2512 get_page(page);
0af55bb5 2513 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2514 to_write -= len;
2515 offset = 0;
2516 len_max = PAGE_SIZE;
2517 len = ((to_write > len_max) ? len_max : to_write);
2518 }
2519
75c65772 2520 packet_parse_headers(skb, sock);
efdfa2f7 2521
69e3c75f
JB
2522 return tp_len;
2523}
2524
8d39b4a6
WB
2525static int tpacket_parse_header(struct packet_sock *po, void *frame,
2526 int size_max, void **data)
2527{
2528 union tpacket_uhdr ph;
2529 int tp_len, off;
2530
2531 ph.raw = frame;
2532
2533 switch (po->tp_version) {
7f953ab2
SV
2534 case TPACKET_V3:
2535 if (ph.h3->tp_next_offset != 0) {
2536 pr_warn_once("variable sized slot not supported");
2537 return -EINVAL;
2538 }
2539 tp_len = ph.h3->tp_len;
2540 break;
8d39b4a6
WB
2541 case TPACKET_V2:
2542 tp_len = ph.h2->tp_len;
2543 break;
2544 default:
2545 tp_len = ph.h1->tp_len;
2546 break;
2547 }
2548 if (unlikely(tp_len > size_max)) {
2549 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2550 return -EMSGSIZE;
2551 }
2552
2553 if (unlikely(po->tp_tx_has_off)) {
2554 int off_min, off_max;
2555
2556 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2557 off_max = po->tx_ring.frame_size - tp_len;
2558 if (po->sk.sk_type == SOCK_DGRAM) {
2559 switch (po->tp_version) {
7f953ab2
SV
2560 case TPACKET_V3:
2561 off = ph.h3->tp_net;
2562 break;
8d39b4a6
WB
2563 case TPACKET_V2:
2564 off = ph.h2->tp_net;
2565 break;
2566 default:
2567 off = ph.h1->tp_net;
2568 break;
2569 }
2570 } else {
2571 switch (po->tp_version) {
7f953ab2
SV
2572 case TPACKET_V3:
2573 off = ph.h3->tp_mac;
2574 break;
8d39b4a6
WB
2575 case TPACKET_V2:
2576 off = ph.h2->tp_mac;
2577 break;
2578 default:
2579 off = ph.h1->tp_mac;
2580 break;
2581 }
2582 }
2583 if (unlikely((off < off_min) || (off_max < off)))
2584 return -EINVAL;
2585 } else {
2586 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2587 }
2588
2589 *data = frame + off;
2590 return tp_len;
2591}
2592
69e3c75f
JB
2593static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2594{
69e3c75f
JB
2595 struct sk_buff *skb;
2596 struct net_device *dev;
1d036d25 2597 struct virtio_net_hdr *vnet_hdr = NULL;
c14ac945 2598 struct sockcm_cookie sockc;
69e3c75f 2599 __be16 proto;
09effa67 2600 int err, reserve = 0;
40d4e3df 2601 void *ph;
342dfc30 2602 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2603 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2604 int tp_len, size_max;
2605 unsigned char *addr;
8d39b4a6 2606 void *data;
69e3c75f 2607 int len_sum = 0;
9e67030a 2608 int status = TP_STATUS_AVAILABLE;
1d036d25 2609 int hlen, tlen, copylen = 0;
69e3c75f 2610
69e3c75f
JB
2611 mutex_lock(&po->pg_vec_lock);
2612
66e56cd4 2613 if (likely(saddr == NULL)) {
e40526cb 2614 dev = packet_cached_dev_get(po);
69e3c75f
JB
2615 proto = po->num;
2616 addr = NULL;
2617 } else {
2618 err = -EINVAL;
2619 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2620 goto out;
2621 if (msg->msg_namelen < (saddr->sll_halen
2622 + offsetof(struct sockaddr_ll,
2623 sll_addr)))
2624 goto out;
69e3c75f 2625 proto = saddr->sll_protocol;
6b8d95f1 2626 addr = saddr->sll_halen ? saddr->sll_addr : NULL;
827d9780 2627 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
99137b78 2628 if (addr && dev && saddr->sll_halen < dev->addr_len)
d972f3dc 2629 goto out_put;
69e3c75f
JB
2630 }
2631
69e3c75f
JB
2632 err = -ENXIO;
2633 if (unlikely(dev == NULL))
2634 goto out;
69e3c75f
JB
2635 err = -ENETDOWN;
2636 if (unlikely(!(dev->flags & IFF_UP)))
2637 goto out_put;
2638
657a0667 2639 sockcm_init(&sockc, &po->sk);
d19b183c
DCS
2640 if (msg->msg_controllen) {
2641 err = sock_cmsg_send(&po->sk, msg, &sockc);
2642 if (unlikely(err))
2643 goto out_put;
2644 }
2645
5cfb4c8d
DB
2646 if (po->sk.sk_socket->type == SOCK_RAW)
2647 reserve = dev->hard_header_len;
69e3c75f 2648 size_max = po->tx_ring.frame_size
b5dd884e 2649 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2650
1d036d25 2651 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
5cfb4c8d 2652 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2653
69e3c75f
JB
2654 do {
2655 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2656 TP_STATUS_SEND_REQUEST);
69e3c75f 2657 if (unlikely(ph == NULL)) {
87a2fd28
DB
2658 if (need_wait && need_resched())
2659 schedule();
69e3c75f
JB
2660 continue;
2661 }
2662
8d39b4a6
WB
2663 skb = NULL;
2664 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2665 if (tp_len < 0)
2666 goto tpacket_error;
2667
69e3c75f 2668 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2669 hlen = LL_RESERVED_SPACE(dev);
2670 tlen = dev->needed_tailroom;
1d036d25
WB
2671 if (po->has_vnet_hdr) {
2672 vnet_hdr = data;
2673 data += sizeof(*vnet_hdr);
2674 tp_len -= sizeof(*vnet_hdr);
2675 if (tp_len < 0 ||
2676 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2677 tp_len = -EINVAL;
2678 goto tpacket_error;
2679 }
2680 copylen = __virtio16_to_cpu(vio_le(),
2681 vnet_hdr->hdr_len);
2682 }
9ed988cd 2683 copylen = max_t(int, copylen, dev->hard_header_len);
69e3c75f 2684 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2685 hlen + tlen + sizeof(struct sockaddr_ll) +
2686 (copylen - dev->hard_header_len),
fbf33a28 2687 !need_wait, &err);
69e3c75f 2688
fbf33a28
KM
2689 if (unlikely(skb == NULL)) {
2690 /* we assume the socket was initially writeable ... */
2691 if (likely(len_sum > 0))
2692 err = len_sum;
69e3c75f 2693 goto out_status;
fbf33a28 2694 }
8d39b4a6 2695 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
c14ac945 2696 addr, hlen, copylen, &sockc);
dbd46ab4 2697 if (likely(tp_len >= 0) &&
5cfb4c8d 2698 tp_len > dev->mtu + reserve &&
1d036d25 2699 !po->has_vnet_hdr &&
3c70c132
DB
2700 !packet_extra_vlan_len_allowed(dev, skb))
2701 tp_len = -EMSGSIZE;
69e3c75f
JB
2702
2703 if (unlikely(tp_len < 0)) {
8d39b4a6 2704tpacket_error:
69e3c75f
JB
2705 if (po->tp_loss) {
2706 __packet_set_status(po, ph,
2707 TP_STATUS_AVAILABLE);
2708 packet_increment_head(&po->tx_ring);
2709 kfree_skb(skb);
2710 continue;
2711 } else {
2712 status = TP_STATUS_WRONG_FORMAT;
2713 err = tp_len;
2714 goto out_status;
2715 }
2716 }
2717
9d2f67e4
JT
2718 if (po->has_vnet_hdr) {
2719 if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
2720 tp_len = -EINVAL;
2721 goto tpacket_error;
2722 }
2723 virtio_net_hdr_set_proto(skb, vnet_hdr);
1d036d25
WB
2724 }
2725
69e3c75f
JB
2726 skb->destructor = tpacket_destruct_skb;
2727 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2728 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2729
2730 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2731 err = po->xmit(skb);
eb70df13
JP
2732 if (unlikely(err > 0)) {
2733 err = net_xmit_errno(err);
2734 if (err && __packet_get_status(po, ph) ==
2735 TP_STATUS_AVAILABLE) {
2736 /* skb was destructed already */
2737 skb = NULL;
2738 goto out_status;
2739 }
2740 /*
2741 * skb was dropped but not destructed yet;
2742 * let's treat it like congestion or err < 0
2743 */
2744 err = 0;
2745 }
69e3c75f
JB
2746 packet_increment_head(&po->tx_ring);
2747 len_sum += tp_len;
b0138408
DB
2748 } while (likely((ph != NULL) ||
2749 /* Note: packet_read_pending() might be slow if we have
2750 * to call it as it's per_cpu variable, but in fast-path
2751 * we already short-circuit the loop with the first
2752 * condition, and luckily don't have to go that path
2753 * anyway.
2754 */
2755 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2756
2757 err = len_sum;
2758 goto out_put;
2759
69e3c75f
JB
2760out_status:
2761 __packet_set_status(po, ph, status);
2762 kfree_skb(skb);
2763out_put:
e40526cb 2764 dev_put(dev);
69e3c75f
JB
2765out:
2766 mutex_unlock(&po->pg_vec_lock);
2767 return err;
2768}
69e3c75f 2769
eea49cc9
OJ
2770static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2771 size_t reserve, size_t len,
2772 size_t linear, int noblock,
2773 int *err)
bfd5f4a3
SS
2774{
2775 struct sk_buff *skb;
2776
2777 /* Under a page? Don't bother with paged skb. */
2778 if (prepad + len < PAGE_SIZE || !linear)
2779 linear = len;
2780
2781 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2782 err, 0);
bfd5f4a3
SS
2783 if (!skb)
2784 return NULL;
2785
2786 skb_reserve(skb, reserve);
2787 skb_put(skb, linear);
2788 skb->data_len = len - linear;
2789 skb->len += len - linear;
2790
2791 return skb;
2792}
2793
d346a3fa 2794static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2795{
2796 struct sock *sk = sock->sk;
342dfc30 2797 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2798 struct sk_buff *skb;
2799 struct net_device *dev;
0e11c91e 2800 __be16 proto;
1da177e4 2801 unsigned char *addr;
827d9780 2802 int err, reserve = 0;
c7d39e32 2803 struct sockcm_cookie sockc;
bfd5f4a3
SS
2804 struct virtio_net_hdr vnet_hdr = { 0 };
2805 int offset = 0;
bfd5f4a3 2806 struct packet_sock *po = pkt_sk(sk);
da7c9561 2807 bool has_vnet_hdr = false;
57031eb7 2808 int hlen, tlen, linear;
3bdc0eba 2809 int extra_len = 0;
1da177e4
LT
2810
2811 /*
1ce4f28b 2812 * Get and verify the address.
1da177e4 2813 */
1ce4f28b 2814
66e56cd4 2815 if (likely(saddr == NULL)) {
e40526cb 2816 dev = packet_cached_dev_get(po);
1da177e4
LT
2817 proto = po->num;
2818 addr = NULL;
2819 } else {
2820 err = -EINVAL;
2821 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2822 goto out;
0fb375fb
EB
2823 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2824 goto out;
1da177e4 2825 proto = saddr->sll_protocol;
6b8d95f1 2826 addr = saddr->sll_halen ? saddr->sll_addr : NULL;
827d9780 2827 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
99137b78 2828 if (addr && dev && saddr->sll_halen < dev->addr_len)
d972f3dc 2829 goto out_unlock;
1da177e4
LT
2830 }
2831
1da177e4 2832 err = -ENXIO;
e40526cb 2833 if (unlikely(dev == NULL))
1da177e4 2834 goto out_unlock;
d5e76b0a 2835 err = -ENETDOWN;
e40526cb 2836 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2837 goto out_unlock;
2838
657a0667 2839 sockcm_init(&sockc, sk);
c7d39e32
EJ
2840 sockc.mark = sk->sk_mark;
2841 if (msg->msg_controllen) {
2842 err = sock_cmsg_send(sk, msg, &sockc);
2843 if (unlikely(err))
2844 goto out_unlock;
2845 }
2846
e40526cb
DB
2847 if (sock->type == SOCK_RAW)
2848 reserve = dev->hard_header_len;
bfd5f4a3 2849 if (po->has_vnet_hdr) {
16cc1400
WB
2850 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2851 if (err)
bfd5f4a3 2852 goto out_unlock;
da7c9561 2853 has_vnet_hdr = true;
bfd5f4a3
SS
2854 }
2855
3bdc0eba
BG
2856 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2857 if (!netif_supports_nofcs(dev)) {
2858 err = -EPROTONOSUPPORT;
2859 goto out_unlock;
2860 }
2861 extra_len = 4; /* We're doing our own CRC */
2862 }
2863
1da177e4 2864 err = -EMSGSIZE;
16cc1400
WB
2865 if (!vnet_hdr.gso_type &&
2866 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2867 goto out_unlock;
2868
bfd5f4a3 2869 err = -ENOBUFS;
ae641949
HX
2870 hlen = LL_RESERVED_SPACE(dev);
2871 tlen = dev->needed_tailroom;
57031eb7
WB
2872 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2873 linear = max(linear, min_t(int, len, dev->hard_header_len));
2874 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
bfd5f4a3 2875 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2876 if (skb == NULL)
1da177e4
LT
2877 goto out_unlock;
2878
b84bbaf7 2879 skb_reset_network_header(skb);
1da177e4 2880
0c4e8581 2881 err = -EINVAL;
9c707762
WB
2882 if (sock->type == SOCK_DGRAM) {
2883 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2884 if (unlikely(offset < 0))
9c707762 2885 goto out_free;
b84bbaf7 2886 } else if (reserve) {
9aad13b0 2887 skb_reserve(skb, -reserve);
88a8121d
ND
2888 if (len < reserve + sizeof(struct ipv6hdr) &&
2889 dev->min_header_len != dev->hard_header_len)
993675a3 2890 skb_reset_network_header(skb);
9c707762 2891 }
1da177e4
LT
2892
2893 /* Returns -EFAULT on error */
c0371da6 2894 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2895 if (err)
2896 goto out_free;
bf84a010 2897
9ed988cd
WB
2898 if (sock->type == SOCK_RAW &&
2899 !dev_validate_header(dev, skb->data, len)) {
2900 err = -EINVAL;
2901 goto out_free;
2902 }
2903
8f932f76 2904 skb_setup_tx_timestamp(skb, sockc.tsflags);
1da177e4 2905
16cc1400 2906 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
2907 !packet_extra_vlan_len_allowed(dev, skb)) {
2908 err = -EMSGSIZE;
2909 goto out_free;
57f89bfa
BG
2910 }
2911
09effa67
DM
2912 skb->protocol = proto;
2913 skb->dev = dev;
1da177e4 2914 skb->priority = sk->sk_priority;
c7d39e32 2915 skb->mark = sockc.mark;
3d0ba8c0 2916 skb->tstamp = sockc.transmit_time;
0fd5d57b 2917
da7c9561 2918 if (has_vnet_hdr) {
db60eb5f 2919 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
16cc1400
WB
2920 if (err)
2921 goto out_free;
2922 len += sizeof(vnet_hdr);
9d2f67e4 2923 virtio_net_hdr_set_proto(skb, &vnet_hdr);
bfd5f4a3
SS
2924 }
2925
75c65772 2926 packet_parse_headers(skb, sock);
8fd6c80d 2927
3bdc0eba
BG
2928 if (unlikely(extra_len == 4))
2929 skb->no_fcs = 1;
2930
d346a3fa 2931 err = po->xmit(skb);
1da177e4
LT
2932 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2933 goto out_unlock;
2934
e40526cb 2935 dev_put(dev);
1da177e4 2936
40d4e3df 2937 return len;
1da177e4
LT
2938
2939out_free:
2940 kfree_skb(skb);
2941out_unlock:
e40526cb 2942 if (dev)
1da177e4
LT
2943 dev_put(dev);
2944out:
2945 return err;
2946}
2947
1b784140 2948static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2949{
69e3c75f
JB
2950 struct sock *sk = sock->sk;
2951 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2952
69e3c75f
JB
2953 if (po->tx_ring.pg_vec)
2954 return tpacket_snd(po, msg);
2955 else
69e3c75f
JB
2956 return packet_snd(sock, msg, len);
2957}
2958
1da177e4
LT
2959/*
2960 * Close a PACKET socket. This is fairly simple. We immediately go
2961 * to 'closed' state and remove our protocol entry in the device list.
2962 */
2963
2964static int packet_release(struct socket *sock)
2965{
2966 struct sock *sk = sock->sk;
2967 struct packet_sock *po;
2bd624b4 2968 struct packet_fanout *f;
d12d01d6 2969 struct net *net;
f6fb8f10 2970 union tpacket_req_u req_u;
1da177e4
LT
2971
2972 if (!sk)
2973 return 0;
2974
3b1e0a65 2975 net = sock_net(sk);
1da177e4
LT
2976 po = pkt_sk(sk);
2977
0fa7fa98 2978 mutex_lock(&net->packet.sklist_lock);
808f5114 2979 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2980 mutex_unlock(&net->packet.sklist_lock);
2981
2982 preempt_disable();
920de804 2983 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2984 preempt_enable();
1da177e4 2985
808f5114 2986 spin_lock(&po->bind_lock);
ce06b03e 2987 unregister_prot_hook(sk, false);
66e56cd4
DB
2988 packet_cached_dev_reset(po);
2989
160ff18a
BG
2990 if (po->prot_hook.dev) {
2991 dev_put(po->prot_hook.dev);
2992 po->prot_hook.dev = NULL;
2993 }
808f5114 2994 spin_unlock(&po->bind_lock);
1da177e4 2995
1da177e4 2996 packet_flush_mclist(sk);
1da177e4 2997
5171b37d 2998 lock_sock(sk);
9665d5d6
PS
2999 if (po->rx_ring.pg_vec) {
3000 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3001 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 3002 }
69e3c75f 3003
9665d5d6
PS
3004 if (po->tx_ring.pg_vec) {
3005 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3006 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 3007 }
5171b37d 3008 release_sock(sk);
1da177e4 3009
2bd624b4 3010 f = fanout_release(sk);
dc99f600 3011
808f5114 3012 synchronize_net();
2bd624b4
AS
3013
3014 if (f) {
57f015f5 3015 kfree(po->rollover);
2bd624b4
AS
3016 fanout_release_data(f);
3017 kfree(f);
3018 }
1da177e4
LT
3019 /*
3020 * Now the socket is dead. No more input will appear.
3021 */
1da177e4
LT
3022 sock_orphan(sk);
3023 sock->sk = NULL;
3024
3025 /* Purge queues */
3026
3027 skb_queue_purge(&sk->sk_receive_queue);
b0138408 3028 packet_free_pending(po);
17ab56a2 3029 sk_refcnt_debug_release(sk);
1da177e4
LT
3030
3031 sock_put(sk);
3032 return 0;
3033}
3034
3035/*
3036 * Attach a packet hook.
3037 */
3038
30f7ea1c
FR
3039static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3040 __be16 proto)
1da177e4
LT
3041{
3042 struct packet_sock *po = pkt_sk(sk);
158cd4af 3043 struct net_device *dev_curr;
902fefb8
DB
3044 __be16 proto_curr;
3045 bool need_rehook;
30f7ea1c
FR
3046 struct net_device *dev = NULL;
3047 int ret = 0;
3048 bool unlisted = false;
dc99f600 3049
1da177e4 3050 lock_sock(sk);
1da177e4 3051 spin_lock(&po->bind_lock);
30f7ea1c
FR
3052 rcu_read_lock();
3053
4971613c
WB
3054 if (po->fanout) {
3055 ret = -EINVAL;
3056 goto out_unlock;
3057 }
3058
30f7ea1c
FR
3059 if (name) {
3060 dev = dev_get_by_name_rcu(sock_net(sk), name);
3061 if (!dev) {
3062 ret = -ENODEV;
3063 goto out_unlock;
3064 }
3065 } else if (ifindex) {
3066 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3067 if (!dev) {
3068 ret = -ENODEV;
3069 goto out_unlock;
3070 }
3071 }
3072
3073 if (dev)
3074 dev_hold(dev);
66e56cd4 3075
902fefb8
DB
3076 proto_curr = po->prot_hook.type;
3077 dev_curr = po->prot_hook.dev;
3078
3079 need_rehook = proto_curr != proto || dev_curr != dev;
3080
3081 if (need_rehook) {
30f7ea1c
FR
3082 if (po->running) {
3083 rcu_read_unlock();
15fe076e
ED
3084 /* prevents packet_notifier() from calling
3085 * register_prot_hook()
3086 */
3087 po->num = 0;
30f7ea1c
FR
3088 __unregister_prot_hook(sk, true);
3089 rcu_read_lock();
3090 dev_curr = po->prot_hook.dev;
3091 if (dev)
3092 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3093 dev->ifindex);
3094 }
1da177e4 3095
15fe076e 3096 BUG_ON(po->running);
902fefb8
DB
3097 po->num = proto;
3098 po->prot_hook.type = proto;
902fefb8 3099
30f7ea1c
FR
3100 if (unlikely(unlisted)) {
3101 dev_put(dev);
3102 po->prot_hook.dev = NULL;
3103 po->ifindex = -1;
3104 packet_cached_dev_reset(po);
3105 } else {
3106 po->prot_hook.dev = dev;
3107 po->ifindex = dev ? dev->ifindex : 0;
3108 packet_cached_dev_assign(po, dev);
3109 }
902fefb8 3110 }
158cd4af
LW
3111 if (dev_curr)
3112 dev_put(dev_curr);
66e56cd4 3113
902fefb8 3114 if (proto == 0 || !need_rehook)
1da177e4
LT
3115 goto out_unlock;
3116
30f7ea1c 3117 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3118 register_prot_hook(sk);
be85d4ad
UT
3119 } else {
3120 sk->sk_err = ENETDOWN;
3121 if (!sock_flag(sk, SOCK_DEAD))
3122 sk->sk_error_report(sk);
1da177e4
LT
3123 }
3124
3125out_unlock:
30f7ea1c 3126 rcu_read_unlock();
1da177e4
LT
3127 spin_unlock(&po->bind_lock);
3128 release_sock(sk);
30f7ea1c 3129 return ret;
1da177e4
LT
3130}
3131
3132/*
3133 * Bind a packet socket to a device
3134 */
3135
40d4e3df
ED
3136static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3137 int addr_len)
1da177e4 3138{
40d4e3df 3139 struct sock *sk = sock->sk;
540e2894 3140 char name[sizeof(uaddr->sa_data) + 1];
1ce4f28b 3141
1da177e4
LT
3142 /*
3143 * Check legality
3144 */
1ce4f28b 3145
8ae55f04 3146 if (addr_len != sizeof(struct sockaddr))
1da177e4 3147 return -EINVAL;
540e2894
AP
3148 /* uaddr->sa_data comes from the userspace, it's not guaranteed to be
3149 * zero-terminated.
3150 */
3151 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3152 name[sizeof(uaddr->sa_data)] = 0;
1da177e4 3153
30f7ea1c 3154 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3155}
1da177e4
LT
3156
3157static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3158{
40d4e3df
ED
3159 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3160 struct sock *sk = sock->sk;
1da177e4
LT
3161
3162 /*
3163 * Check legality
3164 */
1ce4f28b 3165
1da177e4
LT
3166 if (addr_len < sizeof(struct sockaddr_ll))
3167 return -EINVAL;
3168 if (sll->sll_family != AF_PACKET)
3169 return -EINVAL;
3170
30f7ea1c
FR
3171 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3172 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3173}
3174
3175static struct proto packet_proto = {
3176 .name = "PACKET",
3177 .owner = THIS_MODULE,
3178 .obj_size = sizeof(struct packet_sock),
3179};
3180
3181/*
1ce4f28b 3182 * Create a packet of type SOCK_PACKET.
1da177e4
LT
3183 */
3184
3f378b68
EP
3185static int packet_create(struct net *net, struct socket *sock, int protocol,
3186 int kern)
1da177e4
LT
3187{
3188 struct sock *sk;
3189 struct packet_sock *po;
0e11c91e 3190 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3191 int err;
3192
df008c91 3193 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3194 return -EPERM;
be02097c
DM
3195 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3196 sock->type != SOCK_PACKET)
1da177e4
LT
3197 return -ESOCKTNOSUPPORT;
3198
3199 sock->state = SS_UNCONNECTED;
3200
3201 err = -ENOBUFS;
11aa9c28 3202 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3203 if (sk == NULL)
3204 goto out;
3205
3206 sock->ops = &packet_ops;
1da177e4
LT
3207 if (sock->type == SOCK_PACKET)
3208 sock->ops = &packet_ops_spkt;
be02097c 3209
1da177e4
LT
3210 sock_init_data(sock, sk);
3211
3212 po = pkt_sk(sk);
3213 sk->sk_family = PF_PACKET;
0e11c91e 3214 po->num = proto;
d346a3fa 3215 po->xmit = dev_queue_xmit;
66e56cd4 3216
b0138408
DB
3217 err = packet_alloc_pending(po);
3218 if (err)
3219 goto out2;
3220
66e56cd4 3221 packet_cached_dev_reset(po);
1da177e4
LT
3222
3223 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3224 sk_refcnt_debug_inc(sk);
1da177e4
LT
3225
3226 /*
3227 * Attach a protocol block
3228 */
3229
3230 spin_lock_init(&po->bind_lock);
905db440 3231 mutex_init(&po->pg_vec_lock);
0648ab70 3232 po->rollover = NULL;
1da177e4 3233 po->prot_hook.func = packet_rcv;
be02097c 3234
1da177e4
LT
3235 if (sock->type == SOCK_PACKET)
3236 po->prot_hook.func = packet_rcv_spkt;
be02097c 3237
1da177e4
LT
3238 po->prot_hook.af_packet_priv = sk;
3239
0e11c91e
AV
3240 if (proto) {
3241 po->prot_hook.type = proto;
a6361f0c 3242 __register_prot_hook(sk);
1da177e4
LT
3243 }
3244
0fa7fa98 3245 mutex_lock(&net->packet.sklist_lock);
a4dc6a49 3246 sk_add_node_tail_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3247 mutex_unlock(&net->packet.sklist_lock);
3248
3249 preempt_disable();
3680453c 3250 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3251 preempt_enable();
808f5114 3252
40d4e3df 3253 return 0;
b0138408
DB
3254out2:
3255 sk_free(sk);
1da177e4
LT
3256out:
3257 return err;
3258}
3259
3260/*
3261 * Pull a packet from our receive queue and hand it to the user.
3262 * If necessary we block.
3263 */
3264
1b784140
YX
3265static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3266 int flags)
1da177e4
LT
3267{
3268 struct sock *sk = sock->sk;
3269 struct sk_buff *skb;
3270 int copied, err;
bfd5f4a3 3271 int vnet_hdr_len = 0;
2472d761 3272 unsigned int origlen = 0;
1da177e4
LT
3273
3274 err = -EINVAL;
ed85b565 3275 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3276 goto out;
3277
3278#if 0
3279 /* What error should we return now? EUNATTACH? */
3280 if (pkt_sk(sk)->ifindex < 0)
3281 return -ENODEV;
3282#endif
3283
ed85b565 3284 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3285 err = sock_recv_errqueue(sk, msg, len,
3286 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3287 goto out;
3288 }
3289
1da177e4
LT
3290 /*
3291 * Call the generic datagram receiver. This handles all sorts
3292 * of horrible races and re-entrancy so we can forget about it
3293 * in the protocol layers.
3294 *
3295 * Now it will return ENETDOWN, if device have just gone down,
3296 * but then it will block.
3297 */
3298
40d4e3df 3299 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3300
3301 /*
1ce4f28b 3302 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
3303 * handles the blocking we don't see and worry about blocking
3304 * retries.
3305 */
3306
8ae55f04 3307 if (skb == NULL)
1da177e4
LT
3308 goto out;
3309
2ccdbaa6
WB
3310 if (pkt_sk(sk)->pressure)
3311 packet_rcv_has_room(pkt_sk(sk), NULL);
3312
bfd5f4a3 3313 if (pkt_sk(sk)->has_vnet_hdr) {
16cc1400
WB
3314 err = packet_rcv_vnet(msg, skb, &len);
3315 if (err)
bfd5f4a3 3316 goto out_free;
16cc1400 3317 vnet_hdr_len = sizeof(struct virtio_net_hdr);
bfd5f4a3
SS
3318 }
3319
f3d33426
HFS
3320 /* You lose any data beyond the buffer you gave. If it worries
3321 * a user program they can ask the device for its MTU
3322 * anyway.
1da177e4 3323 */
1da177e4 3324 copied = skb->len;
40d4e3df
ED
3325 if (copied > len) {
3326 copied = len;
3327 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3328 }
3329
51f3d02b 3330 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3331 if (err)
3332 goto out_free;
3333
2472d761
EB
3334 if (sock->type != SOCK_PACKET) {
3335 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3336
3337 /* Original length was stored in sockaddr_ll fields */
3338 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3339 sll->sll_family = AF_PACKET;
3340 sll->sll_protocol = skb->protocol;
3341 }
3342
3b885787 3343 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3344
f3d33426
HFS
3345 if (msg->msg_name) {
3346 /* If the address length field is there to be filled
3347 * in, we fill it in now.
3348 */
3349 if (sock->type == SOCK_PACKET) {
342dfc30 3350 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3351 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3352 } else {
3353 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3354
f3d33426
HFS
3355 msg->msg_namelen = sll->sll_halen +
3356 offsetof(struct sockaddr_ll, sll_addr);
3357 }
ffbc6111
HX
3358 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3359 msg->msg_namelen);
f3d33426 3360 }
1da177e4 3361
8dc41944 3362 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3363 struct tpacket_auxdata aux;
3364
3365 aux.tp_status = TP_STATUS_USER;
3366 if (skb->ip_summed == CHECKSUM_PARTIAL)
3367 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3368 else if (skb->pkt_type != PACKET_OUTGOING &&
3369 (skb->ip_summed == CHECKSUM_COMPLETE ||
3370 skb_csum_unnecessary(skb)))
3371 aux.tp_status |= TP_STATUS_CSUM_VALID;
3372
2472d761 3373 aux.tp_len = origlen;
ffbc6111
HX
3374 aux.tp_snaplen = skb->len;
3375 aux.tp_mac = 0;
bbe735e4 3376 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3377 if (skb_vlan_tag_present(skb)) {
3378 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3379 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3380 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3381 } else {
3382 aux.tp_vlan_tci = 0;
a0cdfcf3 3383 aux.tp_vlan_tpid = 0;
a3bcc23e 3384 }
ffbc6111 3385 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3386 }
3387
1da177e4
LT
3388 /*
3389 * Free or return the buffer as appropriate. Again this
3390 * hides all the races and re-entrancy issues from us.
3391 */
bfd5f4a3 3392 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3393
3394out_free:
3395 skb_free_datagram(sk, skb);
3396out:
3397 return err;
3398}
3399
1da177e4 3400static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3401 int peer)
1da177e4
LT
3402{
3403 struct net_device *dev;
3404 struct sock *sk = sock->sk;
3405
3406 if (peer)
3407 return -EOPNOTSUPP;
3408
3409 uaddr->sa_family = AF_PACKET;
2dc85bf3 3410 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3411 rcu_read_lock();
3412 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3413 if (dev)
2dc85bf3 3414 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3415 rcu_read_unlock();
1da177e4 3416
9b2c45d4 3417 return sizeof(*uaddr);
1da177e4 3418}
1da177e4
LT
3419
3420static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3421 int peer)
1da177e4
LT
3422{
3423 struct net_device *dev;
3424 struct sock *sk = sock->sk;
3425 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3426 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3427
3428 if (peer)
3429 return -EOPNOTSUPP;
3430
3431 sll->sll_family = AF_PACKET;
3432 sll->sll_ifindex = po->ifindex;
3433 sll->sll_protocol = po->num;
67286640 3434 sll->sll_pkttype = 0;
654d1f8a
ED
3435 rcu_read_lock();
3436 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3437 if (dev) {
3438 sll->sll_hatype = dev->type;
3439 sll->sll_halen = dev->addr_len;
3440 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3441 } else {
3442 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3443 sll->sll_halen = 0;
3444 }
654d1f8a 3445 rcu_read_unlock();
1da177e4 3446
9b2c45d4 3447 return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3448}
3449
2aeb0b88
WC
3450static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3451 int what)
1da177e4
LT
3452{
3453 switch (i->type) {
3454 case PACKET_MR_MULTICAST:
1162563f
JP
3455 if (i->alen != dev->addr_len)
3456 return -EINVAL;
1da177e4 3457 if (what > 0)
22bedad3 3458 return dev_mc_add(dev, i->addr);
1da177e4 3459 else
22bedad3 3460 return dev_mc_del(dev, i->addr);
1da177e4
LT
3461 break;
3462 case PACKET_MR_PROMISC:
2aeb0b88 3463 return dev_set_promiscuity(dev, what);
1da177e4 3464 case PACKET_MR_ALLMULTI:
2aeb0b88 3465 return dev_set_allmulti(dev, what);
d95ed927 3466 case PACKET_MR_UNICAST:
1162563f
JP
3467 if (i->alen != dev->addr_len)
3468 return -EINVAL;
d95ed927 3469 if (what > 0)
a748ee24 3470 return dev_uc_add(dev, i->addr);
d95ed927 3471 else
a748ee24 3472 return dev_uc_del(dev, i->addr);
d95ed927 3473 break;
40d4e3df
ED
3474 default:
3475 break;
1da177e4 3476 }
2aeb0b88 3477 return 0;
1da177e4
LT
3478}
3479
82f17091
FR
3480static void packet_dev_mclist_delete(struct net_device *dev,
3481 struct packet_mclist **mlp)
1da177e4 3482{
82f17091
FR
3483 struct packet_mclist *ml;
3484
3485 while ((ml = *mlp) != NULL) {
3486 if (ml->ifindex == dev->ifindex) {
3487 packet_dev_mc(dev, ml, -1);
3488 *mlp = ml->next;
3489 kfree(ml);
3490 } else
3491 mlp = &ml->next;
1da177e4
LT
3492 }
3493}
3494
0fb375fb 3495static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3496{
3497 struct packet_sock *po = pkt_sk(sk);
3498 struct packet_mclist *ml, *i;
3499 struct net_device *dev;
3500 int err;
3501
3502 rtnl_lock();
3503
3504 err = -ENODEV;
3b1e0a65 3505 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3506 if (!dev)
3507 goto done;
3508
3509 err = -EINVAL;
1162563f 3510 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3511 goto done;
3512
3513 err = -ENOBUFS;
8b3a7005 3514 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3515 if (i == NULL)
3516 goto done;
3517
3518 err = 0;
3519 for (ml = po->mclist; ml; ml = ml->next) {
3520 if (ml->ifindex == mreq->mr_ifindex &&
3521 ml->type == mreq->mr_type &&
3522 ml->alen == mreq->mr_alen &&
3523 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3524 ml->count++;
3525 /* Free the new element ... */
3526 kfree(i);
3527 goto done;
3528 }
3529 }
3530
3531 i->type = mreq->mr_type;
3532 i->ifindex = mreq->mr_ifindex;
3533 i->alen = mreq->mr_alen;
3534 memcpy(i->addr, mreq->mr_address, i->alen);
309cf37f 3535 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3536 i->count = 1;
3537 i->next = po->mclist;
3538 po->mclist = i;
2aeb0b88
WC
3539 err = packet_dev_mc(dev, i, 1);
3540 if (err) {
3541 po->mclist = i->next;
3542 kfree(i);
3543 }
1da177e4
LT
3544
3545done:
3546 rtnl_unlock();
3547 return err;
3548}
3549
0fb375fb 3550static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3551{
3552 struct packet_mclist *ml, **mlp;
3553
3554 rtnl_lock();
3555
3556 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3557 if (ml->ifindex == mreq->mr_ifindex &&
3558 ml->type == mreq->mr_type &&
3559 ml->alen == mreq->mr_alen &&
3560 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3561 if (--ml->count == 0) {
3562 struct net_device *dev;
3563 *mlp = ml->next;
ad959e76
ED
3564 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3565 if (dev)
1da177e4 3566 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3567 kfree(ml);
3568 }
82f17091 3569 break;
1da177e4
LT
3570 }
3571 }
3572 rtnl_unlock();
82f17091 3573 return 0;
1da177e4
LT
3574}
3575
3576static void packet_flush_mclist(struct sock *sk)
3577{
3578 struct packet_sock *po = pkt_sk(sk);
3579 struct packet_mclist *ml;
3580
3581 if (!po->mclist)
3582 return;
3583
3584 rtnl_lock();
3585 while ((ml = po->mclist) != NULL) {
3586 struct net_device *dev;
3587
3588 po->mclist = ml->next;
ad959e76
ED
3589 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3590 if (dev != NULL)
1da177e4 3591 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3592 kfree(ml);
3593 }
3594 rtnl_unlock();
3595}
1da177e4
LT
3596
3597static int
b7058842 3598packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3599{
3600 struct sock *sk = sock->sk;
8dc41944 3601 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3602 int ret;
3603
3604 if (level != SOL_PACKET)
3605 return -ENOPROTOOPT;
3606
69e3c75f 3607 switch (optname) {
1ce4f28b 3608 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3609 case PACKET_DROP_MEMBERSHIP:
3610 {
0fb375fb
EB
3611 struct packet_mreq_max mreq;
3612 int len = optlen;
3613 memset(&mreq, 0, sizeof(mreq));
3614 if (len < sizeof(struct packet_mreq))
1da177e4 3615 return -EINVAL;
0fb375fb
EB
3616 if (len > sizeof(mreq))
3617 len = sizeof(mreq);
40d4e3df 3618 if (copy_from_user(&mreq, optval, len))
1da177e4 3619 return -EFAULT;
0fb375fb
EB
3620 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3621 return -EINVAL;
1da177e4
LT
3622 if (optname == PACKET_ADD_MEMBERSHIP)
3623 ret = packet_mc_add(sk, &mreq);
3624 else
3625 ret = packet_mc_drop(sk, &mreq);
3626 return ret;
3627 }
a2efcfa0 3628
1da177e4 3629 case PACKET_RX_RING:
69e3c75f 3630 case PACKET_TX_RING:
1da177e4 3631 {
f6fb8f10 3632 union tpacket_req_u req_u;
3633 int len;
1da177e4 3634
5171b37d 3635 lock_sock(sk);
f6fb8f10 3636 switch (po->tp_version) {
3637 case TPACKET_V1:
3638 case TPACKET_V2:
3639 len = sizeof(req_u.req);
3640 break;
3641 case TPACKET_V3:
3642 default:
3643 len = sizeof(req_u.req3);
3644 break;
3645 }
5171b37d
ED
3646 if (optlen < len) {
3647 ret = -EINVAL;
3648 } else {
3649 if (copy_from_user(&req_u.req, optval, len))
3650 ret = -EFAULT;
3651 else
3652 ret = packet_set_ring(sk, &req_u, 0,
3653 optname == PACKET_TX_RING);
3654 }
3655 release_sock(sk);
3656 return ret;
1da177e4
LT
3657 }
3658 case PACKET_COPY_THRESH:
3659 {
3660 int val;
3661
40d4e3df 3662 if (optlen != sizeof(val))
1da177e4 3663 return -EINVAL;
40d4e3df 3664 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3665 return -EFAULT;
3666
3667 pkt_sk(sk)->copy_thresh = val;
3668 return 0;
3669 }
bbd6ef87
PM
3670 case PACKET_VERSION:
3671 {
3672 int val;
3673
3674 if (optlen != sizeof(val))
3675 return -EINVAL;
bbd6ef87
PM
3676 if (copy_from_user(&val, optval, sizeof(val)))
3677 return -EFAULT;
3678 switch (val) {
3679 case TPACKET_V1:
3680 case TPACKET_V2:
f6fb8f10 3681 case TPACKET_V3:
84ac7260 3682 break;
bbd6ef87
PM
3683 default:
3684 return -EINVAL;
3685 }
84ac7260
PP
3686 lock_sock(sk);
3687 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3688 ret = -EBUSY;
3689 } else {
3690 po->tp_version = val;
3691 ret = 0;
3692 }
3693 release_sock(sk);
3694 return ret;
bbd6ef87 3695 }
8913336a
PM
3696 case PACKET_RESERVE:
3697 {
3698 unsigned int val;
3699
3700 if (optlen != sizeof(val))
3701 return -EINVAL;
8913336a
PM
3702 if (copy_from_user(&val, optval, sizeof(val)))
3703 return -EFAULT;
bcc5364b
AK
3704 if (val > INT_MAX)
3705 return -EINVAL;
c27927e3
WB
3706 lock_sock(sk);
3707 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3708 ret = -EBUSY;
3709 } else {
3710 po->tp_reserve = val;
3711 ret = 0;
3712 }
3713 release_sock(sk);
3714 return ret;
8913336a 3715 }
69e3c75f
JB
3716 case PACKET_LOSS:
3717 {
3718 unsigned int val;
3719
3720 if (optlen != sizeof(val))
3721 return -EINVAL;
69e3c75f
JB
3722 if (copy_from_user(&val, optval, sizeof(val)))
3723 return -EFAULT;
a6361f0c
WB
3724
3725 lock_sock(sk);
3726 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3727 ret = -EBUSY;
3728 } else {
3729 po->tp_loss = !!val;
3730 ret = 0;
3731 }
3732 release_sock(sk);
3733 return ret;
69e3c75f 3734 }
8dc41944
HX
3735 case PACKET_AUXDATA:
3736 {
3737 int val;
3738
3739 if (optlen < sizeof(val))
3740 return -EINVAL;
3741 if (copy_from_user(&val, optval, sizeof(val)))
3742 return -EFAULT;
3743
a6361f0c 3744 lock_sock(sk);
8dc41944 3745 po->auxdata = !!val;
a6361f0c 3746 release_sock(sk);
8dc41944
HX
3747 return 0;
3748 }
80feaacb
PWJ
3749 case PACKET_ORIGDEV:
3750 {
3751 int val;
3752
3753 if (optlen < sizeof(val))
3754 return -EINVAL;
3755 if (copy_from_user(&val, optval, sizeof(val)))
3756 return -EFAULT;
3757
a6361f0c 3758 lock_sock(sk);
80feaacb 3759 po->origdev = !!val;
a6361f0c 3760 release_sock(sk);
80feaacb
PWJ
3761 return 0;
3762 }
bfd5f4a3
SS
3763 case PACKET_VNET_HDR:
3764 {
3765 int val;
3766
3767 if (sock->type != SOCK_RAW)
3768 return -EINVAL;
bfd5f4a3
SS
3769 if (optlen < sizeof(val))
3770 return -EINVAL;
3771 if (copy_from_user(&val, optval, sizeof(val)))
3772 return -EFAULT;
3773
a6361f0c
WB
3774 lock_sock(sk);
3775 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3776 ret = -EBUSY;
3777 } else {
3778 po->has_vnet_hdr = !!val;
3779 ret = 0;
3780 }
3781 release_sock(sk);
3782 return ret;
bfd5f4a3 3783 }
614f60fa
SM
3784 case PACKET_TIMESTAMP:
3785 {
3786 int val;
3787
3788 if (optlen != sizeof(val))
3789 return -EINVAL;
3790 if (copy_from_user(&val, optval, sizeof(val)))
3791 return -EFAULT;
3792
3793 po->tp_tstamp = val;
3794 return 0;
3795 }
dc99f600
DM
3796 case PACKET_FANOUT:
3797 {
3798 int val;
3799
3800 if (optlen != sizeof(val))
3801 return -EINVAL;
3802 if (copy_from_user(&val, optval, sizeof(val)))
3803 return -EFAULT;
3804
3805 return fanout_add(sk, val & 0xffff, val >> 16);
3806 }
47dceb8e
WB
3807 case PACKET_FANOUT_DATA:
3808 {
3809 if (!po->fanout)
3810 return -EINVAL;
3811
3812 return fanout_set_data(po, optval, optlen);
3813 }
fa788d98
VW
3814 case PACKET_IGNORE_OUTGOING:
3815 {
3816 int val;
3817
3818 if (optlen != sizeof(val))
3819 return -EINVAL;
3820 if (copy_from_user(&val, optval, sizeof(val)))
3821 return -EFAULT;
3822 if (val < 0 || val > 1)
3823 return -EINVAL;
3824
3825 po->prot_hook.ignore_outgoing = !!val;
3826 return 0;
3827 }
5920cd3a
PC
3828 case PACKET_TX_HAS_OFF:
3829 {
3830 unsigned int val;
3831
3832 if (optlen != sizeof(val))
3833 return -EINVAL;
5920cd3a
PC
3834 if (copy_from_user(&val, optval, sizeof(val)))
3835 return -EFAULT;
a6361f0c
WB
3836
3837 lock_sock(sk);
3838 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3839 ret = -EBUSY;
3840 } else {
3841 po->tp_tx_has_off = !!val;
3842 ret = 0;
3843 }
3844 release_sock(sk);
5920cd3a
PC
3845 return 0;
3846 }
d346a3fa
DB
3847 case PACKET_QDISC_BYPASS:
3848 {
3849 int val;
3850
3851 if (optlen != sizeof(val))
3852 return -EINVAL;
3853 if (copy_from_user(&val, optval, sizeof(val)))
3854 return -EFAULT;
3855
3856 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3857 return 0;
3858 }
1da177e4
LT
3859 default:
3860 return -ENOPROTOOPT;
3861 }
3862}
3863
3864static int packet_getsockopt(struct socket *sock, int level, int optname,
3865 char __user *optval, int __user *optlen)
3866{
3867 int len;
c06fff6e 3868 int val, lv = sizeof(val);
1da177e4
LT
3869 struct sock *sk = sock->sk;
3870 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3871 void *data = &val;
ee80fbf3 3872 union tpacket_stats_u st;
a9b63918 3873 struct tpacket_rollover_stats rstats;
1da177e4
LT
3874
3875 if (level != SOL_PACKET)
3876 return -ENOPROTOOPT;
3877
8ae55f04
KK
3878 if (get_user(len, optlen))
3879 return -EFAULT;
1da177e4
LT
3880
3881 if (len < 0)
3882 return -EINVAL;
1ce4f28b 3883
69e3c75f 3884 switch (optname) {
1da177e4 3885 case PACKET_STATISTICS:
1da177e4 3886 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3887 memcpy(&st, &po->stats, sizeof(st));
3888 memset(&po->stats, 0, sizeof(po->stats));
3889 spin_unlock_bh(&sk->sk_receive_queue.lock);
3890
f6fb8f10 3891 if (po->tp_version == TPACKET_V3) {
c06fff6e 3892 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3893 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3894 data = &st.stats3;
f6fb8f10 3895 } else {
c06fff6e 3896 lv = sizeof(struct tpacket_stats);
8bcdeaff 3897 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3898 data = &st.stats1;
f6fb8f10 3899 }
ee80fbf3 3900
8dc41944
HX
3901 break;
3902 case PACKET_AUXDATA:
8dc41944 3903 val = po->auxdata;
80feaacb
PWJ
3904 break;
3905 case PACKET_ORIGDEV:
80feaacb 3906 val = po->origdev;
bfd5f4a3
SS
3907 break;
3908 case PACKET_VNET_HDR:
bfd5f4a3 3909 val = po->has_vnet_hdr;
1da177e4 3910 break;
bbd6ef87 3911 case PACKET_VERSION:
bbd6ef87 3912 val = po->tp_version;
bbd6ef87
PM
3913 break;
3914 case PACKET_HDRLEN:
3915 if (len > sizeof(int))
3916 len = sizeof(int);
fd2c83b3
AP
3917 if (len < sizeof(int))
3918 return -EINVAL;
bbd6ef87
PM
3919 if (copy_from_user(&val, optval, len))
3920 return -EFAULT;
3921 switch (val) {
3922 case TPACKET_V1:
3923 val = sizeof(struct tpacket_hdr);
3924 break;
3925 case TPACKET_V2:
3926 val = sizeof(struct tpacket2_hdr);
3927 break;
f6fb8f10 3928 case TPACKET_V3:
3929 val = sizeof(struct tpacket3_hdr);
3930 break;
bbd6ef87
PM
3931 default:
3932 return -EINVAL;
3933 }
bbd6ef87 3934 break;
8913336a 3935 case PACKET_RESERVE:
8913336a 3936 val = po->tp_reserve;
8913336a 3937 break;
69e3c75f 3938 case PACKET_LOSS:
69e3c75f 3939 val = po->tp_loss;
69e3c75f 3940 break;
614f60fa 3941 case PACKET_TIMESTAMP:
614f60fa 3942 val = po->tp_tstamp;
614f60fa 3943 break;
dc99f600 3944 case PACKET_FANOUT:
dc99f600
DM
3945 val = (po->fanout ?
3946 ((u32)po->fanout->id |
77f65ebd
WB
3947 ((u32)po->fanout->type << 16) |
3948 ((u32)po->fanout->flags << 24)) :
dc99f600 3949 0);
dc99f600 3950 break;
fa788d98
VW
3951 case PACKET_IGNORE_OUTGOING:
3952 val = po->prot_hook.ignore_outgoing;
3953 break;
a9b63918 3954 case PACKET_ROLLOVER_STATS:
57f015f5 3955 if (!po->rollover)
a9b63918 3956 return -EINVAL;
57f015f5
MM
3957 rstats.tp_all = atomic_long_read(&po->rollover->num);
3958 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3959 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3960 data = &rstats;
3961 lv = sizeof(rstats);
a9b63918 3962 break;
5920cd3a
PC
3963 case PACKET_TX_HAS_OFF:
3964 val = po->tp_tx_has_off;
3965 break;
d346a3fa
DB
3966 case PACKET_QDISC_BYPASS:
3967 val = packet_use_direct_xmit(po);
3968 break;
1da177e4
LT
3969 default:
3970 return -ENOPROTOOPT;
3971 }
3972
c06fff6e
ED
3973 if (len > lv)
3974 len = lv;
8ae55f04
KK
3975 if (put_user(len, optlen))
3976 return -EFAULT;
8dc41944
HX
3977 if (copy_to_user(optval, data, len))
3978 return -EFAULT;
8ae55f04 3979 return 0;
1da177e4
LT
3980}
3981
3982
719c44d3
WB
3983#ifdef CONFIG_COMPAT
3984static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
3985 char __user *optval, unsigned int optlen)
3986{
3987 struct packet_sock *po = pkt_sk(sock->sk);
3988
3989 if (level != SOL_PACKET)
3990 return -ENOPROTOOPT;
3991
3992 if (optname == PACKET_FANOUT_DATA &&
3993 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
3994 optval = (char __user *)get_compat_bpf_fprog(optval);
3995 if (!optval)
3996 return -EFAULT;
3997 optlen = sizeof(struct sock_fprog);
3998 }
3999
4000 return packet_setsockopt(sock, level, optname, optval, optlen);
4001}
4002#endif
4003
351638e7
JP
4004static int packet_notifier(struct notifier_block *this,
4005 unsigned long msg, void *ptr)
1da177e4
LT
4006{
4007 struct sock *sk;
351638e7 4008 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 4009 struct net *net = dev_net(dev);
1da177e4 4010
808f5114 4011 rcu_read_lock();
b67bfe0d 4012 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
4013 struct packet_sock *po = pkt_sk(sk);
4014
4015 switch (msg) {
4016 case NETDEV_UNREGISTER:
1da177e4 4017 if (po->mclist)
82f17091 4018 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
4019 /* fallthrough */
4020
1da177e4
LT
4021 case NETDEV_DOWN:
4022 if (dev->ifindex == po->ifindex) {
4023 spin_lock(&po->bind_lock);
4024 if (po->running) {
ce06b03e 4025 __unregister_prot_hook(sk, false);
1da177e4
LT
4026 sk->sk_err = ENETDOWN;
4027 if (!sock_flag(sk, SOCK_DEAD))
4028 sk->sk_error_report(sk);
4029 }
4030 if (msg == NETDEV_UNREGISTER) {
66e56cd4 4031 packet_cached_dev_reset(po);
1da177e4 4032 po->ifindex = -1;
160ff18a
BG
4033 if (po->prot_hook.dev)
4034 dev_put(po->prot_hook.dev);
1da177e4
LT
4035 po->prot_hook.dev = NULL;
4036 }
4037 spin_unlock(&po->bind_lock);
4038 }
4039 break;
4040 case NETDEV_UP:
808f5114 4041 if (dev->ifindex == po->ifindex) {
4042 spin_lock(&po->bind_lock);
ce06b03e
DM
4043 if (po->num)
4044 register_prot_hook(sk);
808f5114 4045 spin_unlock(&po->bind_lock);
1da177e4 4046 }
1da177e4
LT
4047 break;
4048 }
4049 }
808f5114 4050 rcu_read_unlock();
1da177e4
LT
4051 return NOTIFY_DONE;
4052}
4053
4054
4055static int packet_ioctl(struct socket *sock, unsigned int cmd,
4056 unsigned long arg)
4057{
4058 struct sock *sk = sock->sk;
4059
69e3c75f 4060 switch (cmd) {
40d4e3df
ED
4061 case SIOCOUTQ:
4062 {
4063 int amount = sk_wmem_alloc_get(sk);
31e6d363 4064
40d4e3df
ED
4065 return put_user(amount, (int __user *)arg);
4066 }
4067 case SIOCINQ:
4068 {
4069 struct sk_buff *skb;
4070 int amount = 0;
4071
4072 spin_lock_bh(&sk->sk_receive_queue.lock);
4073 skb = skb_peek(&sk->sk_receive_queue);
4074 if (skb)
4075 amount = skb->len;
4076 spin_unlock_bh(&sk->sk_receive_queue.lock);
4077 return put_user(amount, (int __user *)arg);
4078 }
4079 case SIOCGSTAMP:
4080 return sock_get_timestamp(sk, (struct timeval __user *)arg);
4081 case SIOCGSTAMPNS:
4082 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 4083
1da177e4 4084#ifdef CONFIG_INET
40d4e3df
ED
4085 case SIOCADDRT:
4086 case SIOCDELRT:
4087 case SIOCDARP:
4088 case SIOCGARP:
4089 case SIOCSARP:
4090 case SIOCGIFADDR:
4091 case SIOCSIFADDR:
4092 case SIOCGIFBRDADDR:
4093 case SIOCSIFBRDADDR:
4094 case SIOCGIFNETMASK:
4095 case SIOCSIFNETMASK:
4096 case SIOCGIFDSTADDR:
4097 case SIOCSIFDSTADDR:
4098 case SIOCSIFFLAGS:
40d4e3df 4099 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
4100#endif
4101
40d4e3df
ED
4102 default:
4103 return -ENOIOCTLCMD;
1da177e4
LT
4104 }
4105 return 0;
4106}
4107
a11e1d43
LT
4108static __poll_t packet_poll(struct file *file, struct socket *sock,
4109 poll_table *wait)
1da177e4
LT
4110{
4111 struct sock *sk = sock->sk;
4112 struct packet_sock *po = pkt_sk(sk);
a11e1d43 4113 __poll_t mask = datagram_poll(file, sock, wait);
1da177e4
LT
4114
4115 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 4116 if (po->rx_ring.pg_vec) {
f6fb8f10 4117 if (!packet_previous_rx_frame(po, &po->rx_ring,
4118 TP_STATUS_KERNEL))
a9a08845 4119 mask |= EPOLLIN | EPOLLRDNORM;
1da177e4 4120 }
2ccdbaa6 4121 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
54d7c01d 4122 po->pressure = 0;
1da177e4 4123 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
4124 spin_lock_bh(&sk->sk_write_queue.lock);
4125 if (po->tx_ring.pg_vec) {
4126 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
a9a08845 4127 mask |= EPOLLOUT | EPOLLWRNORM;
69e3c75f
JB
4128 }
4129 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
4130 return mask;
4131}
4132
4133
4134/* Dirty? Well, I still did not learn better way to account
4135 * for user mmaps.
4136 */
4137
4138static void packet_mm_open(struct vm_area_struct *vma)
4139{
4140 struct file *file = vma->vm_file;
40d4e3df 4141 struct socket *sock = file->private_data;
1da177e4 4142 struct sock *sk = sock->sk;
1ce4f28b 4143
1da177e4
LT
4144 if (sk)
4145 atomic_inc(&pkt_sk(sk)->mapped);
4146}
4147
4148static void packet_mm_close(struct vm_area_struct *vma)
4149{
4150 struct file *file = vma->vm_file;
40d4e3df 4151 struct socket *sock = file->private_data;
1da177e4 4152 struct sock *sk = sock->sk;
1ce4f28b 4153
1da177e4
LT
4154 if (sk)
4155 atomic_dec(&pkt_sk(sk)->mapped);
4156}
4157
f0f37e2f 4158static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
4159 .open = packet_mm_open,
4160 .close = packet_mm_close,
1da177e4
LT
4161};
4162
3a7ad063
ED
4163static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4164 unsigned int len)
1da177e4
LT
4165{
4166 int i;
4167
4ebf0ae2 4168 for (i = 0; i < len; i++) {
0e3125c7 4169 if (likely(pg_vec[i].buffer)) {
3a7ad063
ED
4170 if (is_vmalloc_addr(pg_vec[i].buffer))
4171 vfree(pg_vec[i].buffer);
4172 else
4173 free_pages((unsigned long)pg_vec[i].buffer,
4174 order);
0e3125c7
NH
4175 pg_vec[i].buffer = NULL;
4176 }
1da177e4
LT
4177 }
4178 kfree(pg_vec);
4179}
4180
3a7ad063 4181static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 4182{
f0d4eb29 4183 char *buffer;
3a7ad063
ED
4184 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4185 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
0e3125c7 4186
3a7ad063 4187 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4188 if (buffer)
4189 return buffer;
4190
3a7ad063
ED
4191 /* __get_free_pages failed, fall back to vmalloc */
4192 buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
4193 if (buffer)
4194 return buffer;
0e3125c7 4195
3a7ad063
ED
4196 /* vmalloc failed, lets dig into swap here */
4197 gfp_flags &= ~__GFP_NORETRY;
4198 buffer = (char *) __get_free_pages(gfp_flags, order);
4199 if (buffer)
4200 return buffer;
4201
4202 /* complete and utter failure */
4203 return NULL;
4ebf0ae2
DM
4204}
4205
3a7ad063 4206static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
4207{
4208 unsigned int block_nr = req->tp_block_nr;
0e3125c7 4209 struct pgv *pg_vec;
4ebf0ae2
DM
4210 int i;
4211
0e3125c7 4212 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
4213 if (unlikely(!pg_vec))
4214 goto out;
4215
4216 for (i = 0; i < block_nr; i++) {
3a7ad063 4217 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 4218 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
4219 goto out_free_pgvec;
4220 }
4221
4222out:
4223 return pg_vec;
4224
4225out_free_pgvec:
3a7ad063 4226 free_pg_vec(pg_vec, order, block_nr);
4ebf0ae2
DM
4227 pg_vec = NULL;
4228 goto out;
4229}
1da177e4 4230
f6fb8f10 4231static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 4232 int closing, int tx_ring)
1da177e4 4233{
0e3125c7 4234 struct pgv *pg_vec = NULL;
1da177e4 4235 struct packet_sock *po = pkt_sk(sk);
3a7ad063 4236 int was_running, order = 0;
69e3c75f
JB
4237 struct packet_ring_buffer *rb;
4238 struct sk_buff_head *rb_queue;
0e11c91e 4239 __be16 num;
f6fb8f10 4240 int err = -EINVAL;
4241 /* Added to avoid minimal code churn */
4242 struct tpacket_req *req = &req_u->req;
4243
69e3c75f
JB
4244 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4245 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 4246
69e3c75f
JB
4247 err = -EBUSY;
4248 if (!closing) {
4249 if (atomic_read(&po->mapped))
4250 goto out;
b0138408 4251 if (packet_read_pending(rb))
69e3c75f
JB
4252 goto out;
4253 }
1da177e4 4254
69e3c75f 4255 if (req->tp_block_nr) {
4576cd46
WB
4256 unsigned int min_frame_size;
4257
69e3c75f
JB
4258 /* Sanity tests and some calculations */
4259 err = -EBUSY;
4260 if (unlikely(rb->pg_vec))
4261 goto out;
1da177e4 4262
bbd6ef87
PM
4263 switch (po->tp_version) {
4264 case TPACKET_V1:
4265 po->tp_hdrlen = TPACKET_HDRLEN;
4266 break;
4267 case TPACKET_V2:
4268 po->tp_hdrlen = TPACKET2_HDRLEN;
4269 break;
f6fb8f10 4270 case TPACKET_V3:
4271 po->tp_hdrlen = TPACKET3_HDRLEN;
4272 break;
bbd6ef87
PM
4273 }
4274
69e3c75f 4275 err = -EINVAL;
4ebf0ae2 4276 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 4277 goto out;
90836b67 4278 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
69e3c75f 4279 goto out;
4576cd46 4280 min_frame_size = po->tp_hdrlen + po->tp_reserve;
dc808110 4281 if (po->tp_version >= TPACKET_V3 &&
4576cd46
WB
4282 req->tp_block_size <
4283 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
dc808110 4284 goto out;
4576cd46 4285 if (unlikely(req->tp_frame_size < min_frame_size))
69e3c75f 4286 goto out;
4ebf0ae2 4287 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 4288 goto out;
1da177e4 4289
4194b491
TK
4290 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4291 if (unlikely(rb->frames_per_block == 0))
69e3c75f 4292 goto out;
fc62814d 4293 if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
8f8d28e4 4294 goto out;
69e3c75f
JB
4295 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4296 req->tp_frame_nr))
4297 goto out;
1da177e4
LT
4298
4299 err = -ENOMEM;
3a7ad063
ED
4300 order = get_order(req->tp_block_size);
4301 pg_vec = alloc_pg_vec(req, order);
4ebf0ae2 4302 if (unlikely(!pg_vec))
1da177e4 4303 goto out;
f6fb8f10 4304 switch (po->tp_version) {
4305 case TPACKET_V3:
7f953ab2
SV
4306 /* Block transmit is not supported yet */
4307 if (!tx_ring) {
e8e85cc5 4308 init_prb_bdqc(po, rb, pg_vec, req_u);
7f953ab2
SV
4309 } else {
4310 struct tpacket_req3 *req3 = &req_u->req3;
4311
4312 if (req3->tp_retire_blk_tov ||
4313 req3->tp_sizeof_priv ||
4314 req3->tp_feature_req_word) {
4315 err = -EINVAL;
4316 goto out;
4317 }
4318 }
d7cf0c34 4319 break;
f6fb8f10 4320 default:
4321 break;
4322 }
69e3c75f
JB
4323 }
4324 /* Done */
4325 else {
4326 err = -EINVAL;
4ebf0ae2 4327 if (unlikely(req->tp_frame_nr))
69e3c75f 4328 goto out;
1da177e4
LT
4329 }
4330
1da177e4
LT
4331
4332 /* Detach socket from network */
4333 spin_lock(&po->bind_lock);
4334 was_running = po->running;
4335 num = po->num;
4336 if (was_running) {
1da177e4 4337 po->num = 0;
ce06b03e 4338 __unregister_prot_hook(sk, false);
1da177e4
LT
4339 }
4340 spin_unlock(&po->bind_lock);
1ce4f28b 4341
1da177e4
LT
4342 synchronize_net();
4343
4344 err = -EBUSY;
905db440 4345 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
4346 if (closing || atomic_read(&po->mapped) == 0) {
4347 err = 0;
69e3c75f 4348 spin_lock_bh(&rb_queue->lock);
c053fd96 4349 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
4350 rb->frame_max = (req->tp_frame_nr - 1);
4351 rb->head = 0;
4352 rb->frame_size = req->tp_frame_size;
4353 spin_unlock_bh(&rb_queue->lock);
4354
3a7ad063 4355 swap(rb->pg_vec_order, order);
c053fd96 4356 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
4357
4358 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4359 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4360 tpacket_rcv : packet_rcv;
4361 skb_queue_purge(rb_queue);
1da177e4 4362 if (atomic_read(&po->mapped))
40d4e3df
ED
4363 pr_err("packet_mmap: vma is busy: %d\n",
4364 atomic_read(&po->mapped));
1da177e4 4365 }
905db440 4366 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4367
4368 spin_lock(&po->bind_lock);
ce06b03e 4369 if (was_running) {
1da177e4 4370 po->num = num;
ce06b03e 4371 register_prot_hook(sk);
1da177e4
LT
4372 }
4373 spin_unlock(&po->bind_lock);
c800aaf8 4374 if (pg_vec && (po->tp_version > TPACKET_V2)) {
f6fb8f10 4375 /* Because we don't support block-based V3 on tx-ring */
4376 if (!tx_ring)
73d0fcf2 4377 prb_shutdown_retire_blk_timer(po, rb_queue);
f6fb8f10 4378 }
1da177e4 4379
1da177e4 4380 if (pg_vec)
3a7ad063 4381 free_pg_vec(pg_vec, order, req->tp_block_nr);
1da177e4
LT
4382out:
4383 return err;
4384}
4385
69e3c75f
JB
4386static int packet_mmap(struct file *file, struct socket *sock,
4387 struct vm_area_struct *vma)
1da177e4
LT
4388{
4389 struct sock *sk = sock->sk;
4390 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
4391 unsigned long size, expected_size;
4392 struct packet_ring_buffer *rb;
1da177e4
LT
4393 unsigned long start;
4394 int err = -EINVAL;
4395 int i;
4396
4397 if (vma->vm_pgoff)
4398 return -EINVAL;
4399
905db440 4400 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
4401
4402 expected_size = 0;
4403 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4404 if (rb->pg_vec) {
4405 expected_size += rb->pg_vec_len
4406 * rb->pg_vec_pages
4407 * PAGE_SIZE;
4408 }
4409 }
4410
4411 if (expected_size == 0)
1da177e4 4412 goto out;
69e3c75f
JB
4413
4414 size = vma->vm_end - vma->vm_start;
4415 if (size != expected_size)
1da177e4
LT
4416 goto out;
4417
1da177e4 4418 start = vma->vm_start;
69e3c75f
JB
4419 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4420 if (rb->pg_vec == NULL)
4421 continue;
4422
4423 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
4424 struct page *page;
4425 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4426 int pg_num;
4427
c56b4d90
CG
4428 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4429 page = pgv_to_page(kaddr);
69e3c75f
JB
4430 err = vm_insert_page(vma, start, page);
4431 if (unlikely(err))
4432 goto out;
4433 start += PAGE_SIZE;
0e3125c7 4434 kaddr += PAGE_SIZE;
69e3c75f 4435 }
4ebf0ae2 4436 }
1da177e4 4437 }
69e3c75f 4438
4ebf0ae2 4439 atomic_inc(&po->mapped);
1da177e4
LT
4440 vma->vm_ops = &packet_mmap_ops;
4441 err = 0;
4442
4443out:
905db440 4444 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4445 return err;
4446}
1da177e4 4447
90ddc4f0 4448static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4449 .family = PF_PACKET,
4450 .owner = THIS_MODULE,
4451 .release = packet_release,
4452 .bind = packet_bind_spkt,
4453 .connect = sock_no_connect,
4454 .socketpair = sock_no_socketpair,
4455 .accept = sock_no_accept,
4456 .getname = packet_getname_spkt,
a11e1d43 4457 .poll = datagram_poll,
1da177e4
LT
4458 .ioctl = packet_ioctl,
4459 .listen = sock_no_listen,
4460 .shutdown = sock_no_shutdown,
4461 .setsockopt = sock_no_setsockopt,
4462 .getsockopt = sock_no_getsockopt,
4463 .sendmsg = packet_sendmsg_spkt,
4464 .recvmsg = packet_recvmsg,
4465 .mmap = sock_no_mmap,
4466 .sendpage = sock_no_sendpage,
4467};
1da177e4 4468
90ddc4f0 4469static const struct proto_ops packet_ops = {
1da177e4
LT
4470 .family = PF_PACKET,
4471 .owner = THIS_MODULE,
4472 .release = packet_release,
4473 .bind = packet_bind,
4474 .connect = sock_no_connect,
4475 .socketpair = sock_no_socketpair,
4476 .accept = sock_no_accept,
1ce4f28b 4477 .getname = packet_getname,
a11e1d43 4478 .poll = packet_poll,
1da177e4
LT
4479 .ioctl = packet_ioctl,
4480 .listen = sock_no_listen,
4481 .shutdown = sock_no_shutdown,
4482 .setsockopt = packet_setsockopt,
4483 .getsockopt = packet_getsockopt,
719c44d3
WB
4484#ifdef CONFIG_COMPAT
4485 .compat_setsockopt = compat_packet_setsockopt,
4486#endif
1da177e4
LT
4487 .sendmsg = packet_sendmsg,
4488 .recvmsg = packet_recvmsg,
4489 .mmap = packet_mmap,
4490 .sendpage = sock_no_sendpage,
4491};
4492
ec1b4cf7 4493static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4494 .family = PF_PACKET,
4495 .create = packet_create,
4496 .owner = THIS_MODULE,
4497};
4498
4499static struct notifier_block packet_netdev_notifier = {
40d4e3df 4500 .notifier_call = packet_notifier,
1da177e4
LT
4501};
4502
4503#ifdef CONFIG_PROC_FS
1da177e4
LT
4504
4505static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4506 __acquires(RCU)
1da177e4 4507{
e372c414 4508 struct net *net = seq_file_net(seq);
808f5114 4509
4510 rcu_read_lock();
4511 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4512}
4513
4514static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4515{
1bf40954 4516 struct net *net = seq_file_net(seq);
808f5114 4517 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4518}
4519
4520static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4521 __releases(RCU)
1da177e4 4522{
808f5114 4523 rcu_read_unlock();
1da177e4
LT
4524}
4525
1ce4f28b 4526static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4527{
4528 if (v == SEQ_START_TOKEN)
4529 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4530 else {
b7ceabd9 4531 struct sock *s = sk_entry(v);
1da177e4
LT
4532 const struct packet_sock *po = pkt_sk(s);
4533
4534 seq_printf(seq,
71338aa7 4535 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4 4536 s,
41c6d650 4537 refcount_read(&s->sk_refcnt),
1da177e4
LT
4538 s->sk_type,
4539 ntohs(po->num),
4540 po->ifindex,
4541 po->running,
4542 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4543 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4544 sock_i_ino(s));
1da177e4
LT
4545 }
4546
4547 return 0;
4548}
4549
56b3d975 4550static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4551 .start = packet_seq_start,
4552 .next = packet_seq_next,
4553 .stop = packet_seq_stop,
4554 .show = packet_seq_show,
4555};
1da177e4
LT
4556#endif
4557
2c8c1e72 4558static int __net_init packet_net_init(struct net *net)
d12d01d6 4559{
0fa7fa98 4560 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4561 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4562
c3506372
CH
4563 if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
4564 sizeof(struct seq_net_private)))
d12d01d6
DL
4565 return -ENOMEM;
4566
4567 return 0;
4568}
4569
2c8c1e72 4570static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4571{
ece31ffd 4572 remove_proc_entry("packet", net->proc_net);
669f8f1a 4573 WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
d12d01d6
DL
4574}
4575
4576static struct pernet_operations packet_net_ops = {
4577 .init = packet_net_init,
4578 .exit = packet_net_exit,
4579};
4580
4581
1da177e4
LT
4582static void __exit packet_exit(void)
4583{
1da177e4 4584 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4585 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4586 sock_unregister(PF_PACKET);
4587 proto_unregister(&packet_proto);
4588}
4589
4590static int __init packet_init(void)
4591{
4592 int rc = proto_register(&packet_proto, 0);
4593
4594 if (rc != 0)
4595 goto out;
4596
4597 sock_register(&packet_family_ops);
d12d01d6 4598 register_pernet_subsys(&packet_net_ops);
1da177e4 4599 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
4600out:
4601 return rc;
4602}
4603
4604module_init(packet_init);
4605module_exit(packet_exit);
4606MODULE_LICENSE("GPL");
4607MODULE_ALIAS_NETPROTO(PF_PACKET);