]> git.proxmox.com Git - mirror_ubuntu-eoan-kernel.git/blame - net/packet/af_packet.c
Merge tag 'armsoc-late' of git://git.kernel.org/pub/scm/linux/kernel/git/arm/arm-soc
[mirror_ubuntu-eoan-kernel.git] / net / packet / af_packet.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PACKET - implements raw packet sockets.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
1ce4f28b 12 * Fixes:
1da177e4
LT
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
1ce4f28b 35 * Ulises Alonso : Frame number limit removal and
1da177e4 36 * packet_set_ring memory leak.
0fb375fb
EB
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
1ce4f28b 40 * byte arrays at the end of sockaddr_ll
0fb375fb 41 * and packet_mreq.
69e3c75f 42 * Johann Baudy : Added TX RING.
f6fb8f10 43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
44 * layer.
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46 *
1da177e4
LT
47 *
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
52 *
53 */
1ce4f28b 54
1da177e4 55#include <linux/types.h>
1da177e4 56#include <linux/mm.h>
4fc268d2 57#include <linux/capability.h>
1da177e4
LT
58#include <linux/fcntl.h>
59#include <linux/socket.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_packet.h>
64#include <linux/wireless.h>
ffbc6111 65#include <linux/kernel.h>
1da177e4 66#include <linux/kmod.h>
5a0e3ad6 67#include <linux/slab.h>
0e3125c7 68#include <linux/vmalloc.h>
457c4cbc 69#include <net/net_namespace.h>
1da177e4
LT
70#include <net/ip.h>
71#include <net/protocol.h>
72#include <linux/skbuff.h>
73#include <net/sock.h>
74#include <linux/errno.h>
75#include <linux/timer.h>
7c0f6ba6 76#include <linux/uaccess.h>
1da177e4
LT
77#include <asm/ioctls.h>
78#include <asm/page.h>
a1f8e7f7 79#include <asm/cacheflush.h>
1da177e4
LT
80#include <asm/io.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
83#include <linux/poll.h>
84#include <linux/module.h>
85#include <linux/init.h>
905db440 86#include <linux/mutex.h>
05423b24 87#include <linux/if_vlan.h>
bfd5f4a3 88#include <linux/virtio_net.h>
ed85b565 89#include <linux/errqueue.h>
614f60fa 90#include <linux/net_tstamp.h>
b0138408 91#include <linux/percpu.h>
1da177e4
LT
92#ifdef CONFIG_INET
93#include <net/inet_common.h>
94#endif
47dceb8e 95#include <linux/bpf.h>
719c44d3 96#include <net/compat.h>
1da177e4 97
2787b04b
PE
98#include "internal.h"
99
1da177e4
LT
100/*
101 Assumptions:
102 - if device has no dev->hard_header routine, it adds and removes ll header
103 inside itself. In this case ll header is invisible outside of device,
104 but higher levels still should reserve dev->hard_header_len.
105 Some devices are enough clever to reallocate skb, when header
106 will not fit to reserved space (tunnel), another ones are silly
107 (PPP).
108 - packet socket receives packets with pulled ll header,
109 so that SOCK_RAW should push it back.
110
111On receive:
112-----------
113
114Incoming, dev->hard_header!=NULL
b0e380b1
ACM
115 mac_header -> ll header
116 data -> data
1da177e4
LT
117
118Outgoing, dev->hard_header!=NULL
b0e380b1
ACM
119 mac_header -> ll header
120 data -> ll header
1da177e4
LT
121
122Incoming, dev->hard_header==NULL
b0e380b1
ACM
123 mac_header -> UNKNOWN position. It is very likely, that it points to ll
124 header. PPP makes it, that is wrong, because introduce
db0c58f9 125 assymetry between rx and tx paths.
b0e380b1 126 data -> data
1da177e4
LT
127
128Outgoing, dev->hard_header==NULL
b0e380b1
ACM
129 mac_header -> data. ll header is still not built!
130 data -> data
1da177e4
LT
131
132Resume
133 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
134
135
136On transmit:
137------------
138
139dev->hard_header != NULL
b0e380b1
ACM
140 mac_header -> ll header
141 data -> ll header
1da177e4
LT
142
143dev->hard_header == NULL (ll header is added by device, we cannot control it)
b0e380b1
ACM
144 mac_header -> data
145 data -> data
1da177e4
LT
146
147 We should set nh.raw on output to correct posistion,
148 packet classifier depends on it.
149 */
150
1da177e4
LT
151/* Private packet socket structures. */
152
0fb375fb
EB
153/* identical to struct packet_mreq except it has
154 * a longer address field.
155 */
40d4e3df 156struct packet_mreq_max {
0fb375fb
EB
157 int mr_ifindex;
158 unsigned short mr_type;
159 unsigned short mr_alen;
160 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 161};
a2efcfa0 162
184f489e
DB
163union tpacket_uhdr {
164 struct tpacket_hdr *h1;
165 struct tpacket2_hdr *h2;
166 struct tpacket3_hdr *h3;
167 void *raw;
168};
169
f6fb8f10 170static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f
JB
171 int closing, int tx_ring);
172
f6fb8f10 173#define V3_ALIGNMENT (8)
174
bc59ba39 175#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
f6fb8f10 176
177#define BLK_PLUS_PRIV(sz_of_priv) \
178 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
179
f6fb8f10 180#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
181#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
182#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
183#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
184#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
185#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
186#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
187
69e3c75f 188struct packet_sock;
77f65ebd
WB
189static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
190 struct packet_type *pt, struct net_device *orig_dev);
1da177e4 191
f6fb8f10 192static void *packet_previous_frame(struct packet_sock *po,
193 struct packet_ring_buffer *rb,
194 int status);
195static void packet_increment_head(struct packet_ring_buffer *buff);
878cd3ba 196static int prb_curr_blk_in_use(struct tpacket_block_desc *);
bc59ba39 197static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 198 struct packet_sock *);
bc59ba39 199static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 200 struct packet_sock *, unsigned int status);
bc59ba39 201static int prb_queue_frozen(struct tpacket_kbdq_core *);
202static void prb_open_block(struct tpacket_kbdq_core *,
203 struct tpacket_block_desc *);
17bfd8c8 204static void prb_retire_rx_blk_timer_expired(struct timer_list *);
bc59ba39 205static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
bc59ba39 206static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
207static void prb_clear_rxhash(struct tpacket_kbdq_core *,
208 struct tpacket3_hdr *);
209static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
210 struct tpacket3_hdr *);
1da177e4 211static void packet_flush_mclist(struct sock *sk);
865b03f2 212static u16 packet_pick_tx_queue(struct sk_buff *skb);
1da177e4 213
ffbc6111 214struct packet_skb_cb {
ffbc6111
HX
215 union {
216 struct sockaddr_pkt pkt;
2472d761
EB
217 union {
218 /* Trick: alias skb original length with
219 * ll.sll_family and ll.protocol in order
220 * to save room.
221 */
222 unsigned int origlen;
223 struct sockaddr_ll ll;
224 };
ffbc6111
HX
225 } sa;
226};
227
d3869efe
DW
228#define vio_le() virtio_legacy_is_little_endian()
229
ffbc6111 230#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 231
bc59ba39 232#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 233#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 234 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 235#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 236 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 237#define GET_NEXT_PRB_BLK_NUM(x) \
238 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
239 ((x)->kactive_blk_num+1) : 0)
240
dc99f600
DM
241static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
242static void __fanout_link(struct sock *sk, struct packet_sock *po);
243
d346a3fa
DB
244static int packet_direct_xmit(struct sk_buff *skb)
245{
865b03f2 246 return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
d346a3fa
DB
247}
248
66e56cd4
DB
249static struct net_device *packet_cached_dev_get(struct packet_sock *po)
250{
251 struct net_device *dev;
252
253 rcu_read_lock();
254 dev = rcu_dereference(po->cached_dev);
255 if (likely(dev))
256 dev_hold(dev);
257 rcu_read_unlock();
258
259 return dev;
260}
261
262static void packet_cached_dev_assign(struct packet_sock *po,
263 struct net_device *dev)
264{
265 rcu_assign_pointer(po->cached_dev, dev);
266}
267
268static void packet_cached_dev_reset(struct packet_sock *po)
269{
270 RCU_INIT_POINTER(po->cached_dev, NULL);
271}
272
d346a3fa
DB
273static bool packet_use_direct_xmit(const struct packet_sock *po)
274{
275 return po->xmit == packet_direct_xmit;
276}
277
8ec56fc3
AD
278static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb,
279 struct net_device *sb_dev)
d346a3fa 280{
8ec56fc3 281 return dev_pick_tx_cpu_id(dev, skb, sb_dev, NULL);
d346a3fa
DB
282}
283
865b03f2 284static u16 packet_pick_tx_queue(struct sk_buff *skb)
0fd5d57b 285{
865b03f2 286 struct net_device *dev = skb->dev;
0fd5d57b
DB
287 const struct net_device_ops *ops = dev->netdev_ops;
288 u16 queue_index;
289
290 if (ops->ndo_select_queue) {
291 queue_index = ops->ndo_select_queue(dev, skb, NULL,
292 __packet_pick_tx_queue);
293 queue_index = netdev_cap_txqueue(dev, queue_index);
294 } else {
8ec56fc3 295 queue_index = __packet_pick_tx_queue(dev, skb, NULL);
0fd5d57b
DB
296 }
297
865b03f2 298 return queue_index;
0fd5d57b
DB
299}
300
a6361f0c 301/* __register_prot_hook must be invoked through register_prot_hook
ce06b03e
DM
302 * or from a context in which asynchronous accesses to the packet
303 * socket is not possible (packet_create()).
304 */
a6361f0c 305static void __register_prot_hook(struct sock *sk)
ce06b03e
DM
306{
307 struct packet_sock *po = pkt_sk(sk);
e40526cb 308
ce06b03e 309 if (!po->running) {
66e56cd4 310 if (po->fanout)
dc99f600 311 __fanout_link(sk, po);
66e56cd4 312 else
dc99f600 313 dev_add_pack(&po->prot_hook);
e40526cb 314
ce06b03e
DM
315 sock_hold(sk);
316 po->running = 1;
317 }
318}
319
a6361f0c
WB
320static void register_prot_hook(struct sock *sk)
321{
322 lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
323 __register_prot_hook(sk);
324}
325
326/* If the sync parameter is true, we will temporarily drop
ce06b03e
DM
327 * the po->bind_lock and do a synchronize_net to make sure no
328 * asynchronous packet processing paths still refer to the elements
329 * of po->prot_hook. If the sync parameter is false, it is the
330 * callers responsibility to take care of this.
331 */
332static void __unregister_prot_hook(struct sock *sk, bool sync)
333{
334 struct packet_sock *po = pkt_sk(sk);
335
a6361f0c
WB
336 lockdep_assert_held_once(&po->bind_lock);
337
ce06b03e 338 po->running = 0;
66e56cd4
DB
339
340 if (po->fanout)
dc99f600 341 __fanout_unlink(sk, po);
66e56cd4 342 else
dc99f600 343 __dev_remove_pack(&po->prot_hook);
e40526cb 344
ce06b03e
DM
345 __sock_put(sk);
346
347 if (sync) {
348 spin_unlock(&po->bind_lock);
349 synchronize_net();
350 spin_lock(&po->bind_lock);
351 }
352}
353
354static void unregister_prot_hook(struct sock *sk, bool sync)
355{
356 struct packet_sock *po = pkt_sk(sk);
357
358 if (po->running)
359 __unregister_prot_hook(sk, sync);
360}
361
6e58040b 362static inline struct page * __pure pgv_to_page(void *addr)
0af55bb5
CG
363{
364 if (is_vmalloc_addr(addr))
365 return vmalloc_to_page(addr);
366 return virt_to_page(addr);
367}
368
69e3c75f 369static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 370{
184f489e 371 union tpacket_uhdr h;
1da177e4 372
69e3c75f 373 h.raw = frame;
bbd6ef87
PM
374 switch (po->tp_version) {
375 case TPACKET_V1:
69e3c75f 376 h.h1->tp_status = status;
0af55bb5 377 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
378 break;
379 case TPACKET_V2:
69e3c75f 380 h.h2->tp_status = status;
0af55bb5 381 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 382 break;
f6fb8f10 383 case TPACKET_V3:
7f953ab2
SV
384 h.h3->tp_status = status;
385 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
386 break;
69e3c75f 387 default:
f6fb8f10 388 WARN(1, "TPACKET version not supported.\n");
69e3c75f 389 BUG();
bbd6ef87 390 }
69e3c75f
JB
391
392 smp_wmb();
bbd6ef87
PM
393}
394
69e3c75f 395static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87 396{
184f489e 397 union tpacket_uhdr h;
bbd6ef87 398
69e3c75f
JB
399 smp_rmb();
400
bbd6ef87
PM
401 h.raw = frame;
402 switch (po->tp_version) {
403 case TPACKET_V1:
0af55bb5 404 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 405 return h.h1->tp_status;
bbd6ef87 406 case TPACKET_V2:
0af55bb5 407 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 408 return h.h2->tp_status;
f6fb8f10 409 case TPACKET_V3:
7f953ab2
SV
410 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
411 return h.h3->tp_status;
69e3c75f 412 default:
f6fb8f10 413 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
414 BUG();
415 return 0;
bbd6ef87 416 }
1da177e4 417}
69e3c75f 418
b9c32fb2
DB
419static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
420 unsigned int flags)
7a51384c
DB
421{
422 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
423
68a360e8
WB
424 if (shhwtstamps &&
425 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
426 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
427 return TP_STATUS_TS_RAW_HARDWARE;
7a51384c
DB
428
429 if (ktime_to_timespec_cond(skb->tstamp, ts))
b9c32fb2 430 return TP_STATUS_TS_SOFTWARE;
7a51384c 431
b9c32fb2 432 return 0;
7a51384c
DB
433}
434
b9c32fb2
DB
435static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
436 struct sk_buff *skb)
2e31396f
WB
437{
438 union tpacket_uhdr h;
439 struct timespec ts;
b9c32fb2 440 __u32 ts_status;
2e31396f 441
b9c32fb2
DB
442 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
443 return 0;
2e31396f
WB
444
445 h.raw = frame;
446 switch (po->tp_version) {
447 case TPACKET_V1:
448 h.h1->tp_sec = ts.tv_sec;
449 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
450 break;
451 case TPACKET_V2:
452 h.h2->tp_sec = ts.tv_sec;
453 h.h2->tp_nsec = ts.tv_nsec;
454 break;
455 case TPACKET_V3:
57ea884b
DB
456 h.h3->tp_sec = ts.tv_sec;
457 h.h3->tp_nsec = ts.tv_nsec;
458 break;
2e31396f
WB
459 default:
460 WARN(1, "TPACKET version not supported.\n");
461 BUG();
462 }
463
464 /* one flush is safe, as both fields always lie on the same cacheline */
465 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
466 smp_wmb();
b9c32fb2
DB
467
468 return ts_status;
2e31396f
WB
469}
470
69e3c75f
JB
471static void *packet_lookup_frame(struct packet_sock *po,
472 struct packet_ring_buffer *rb,
473 unsigned int position,
474 int status)
475{
476 unsigned int pg_vec_pos, frame_offset;
184f489e 477 union tpacket_uhdr h;
69e3c75f
JB
478
479 pg_vec_pos = position / rb->frames_per_block;
480 frame_offset = position % rb->frames_per_block;
481
0e3125c7
NH
482 h.raw = rb->pg_vec[pg_vec_pos].buffer +
483 (frame_offset * rb->frame_size);
69e3c75f
JB
484
485 if (status != __packet_get_status(po, h.raw))
486 return NULL;
487
488 return h.raw;
489}
490
eea49cc9 491static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
492 struct packet_ring_buffer *rb,
493 int status)
494{
495 return packet_lookup_frame(po, rb, rb->head, status);
496}
497
bc59ba39 498static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 499{
500 del_timer_sync(&pkc->retire_blk_timer);
501}
502
503static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
f6fb8f10 504 struct sk_buff_head *rb_queue)
505{
bc59ba39 506 struct tpacket_kbdq_core *pkc;
f6fb8f10 507
73d0fcf2 508 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 509
ec6f809f 510 spin_lock_bh(&rb_queue->lock);
f6fb8f10 511 pkc->delete_blk_timer = 1;
ec6f809f 512 spin_unlock_bh(&rb_queue->lock);
f6fb8f10 513
514 prb_del_retire_blk_timer(pkc);
515}
516
e8e85cc5 517static void prb_setup_retire_blk_timer(struct packet_sock *po)
f6fb8f10 518{
bc59ba39 519 struct tpacket_kbdq_core *pkc;
f6fb8f10 520
e8e85cc5 521 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
17bfd8c8
KC
522 timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
523 0);
524 pkc->retire_blk_timer.expires = jiffies;
f6fb8f10 525}
526
527static int prb_calc_retire_blk_tmo(struct packet_sock *po,
528 int blk_size_in_bytes)
529{
530 struct net_device *dev;
531 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
7cad1bac 532 struct ethtool_link_ksettings ecmd;
4bc71cb9 533 int err;
f6fb8f10 534
4bc71cb9
JP
535 rtnl_lock();
536 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
537 if (unlikely(!dev)) {
538 rtnl_unlock();
f6fb8f10 539 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9 540 }
7cad1bac 541 err = __ethtool_get_link_ksettings(dev, &ecmd);
4bc71cb9
JP
542 rtnl_unlock();
543 if (!err) {
4bc71cb9
JP
544 /*
545 * If the link speed is so slow you don't really
546 * need to worry about perf anyways
547 */
7cad1bac
DD
548 if (ecmd.base.speed < SPEED_1000 ||
549 ecmd.base.speed == SPEED_UNKNOWN) {
4bc71cb9 550 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 551 } else {
552 msec = 1;
7cad1bac 553 div = ecmd.base.speed / 1000;
f6fb8f10 554 }
555 }
556
557 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
558
559 if (div)
560 mbits /= div;
561
562 tmo = mbits * msec;
563
564 if (div)
565 return tmo+1;
566 return tmo;
567}
568
bc59ba39 569static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 570 union tpacket_req_u *req_u)
571{
572 p1->feature_req_word = req_u->req3.tp_feature_req_word;
573}
574
575static void init_prb_bdqc(struct packet_sock *po,
576 struct packet_ring_buffer *rb,
577 struct pgv *pg_vec,
e8e85cc5 578 union tpacket_req_u *req_u)
f6fb8f10 579{
22781a5b 580 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
bc59ba39 581 struct tpacket_block_desc *pbd;
f6fb8f10 582
583 memset(p1, 0x0, sizeof(*p1));
584
585 p1->knxt_seq_num = 1;
586 p1->pkbdq = pg_vec;
bc59ba39 587 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 588 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 589 p1->kblk_size = req_u->req3.tp_block_size;
590 p1->knum_blocks = req_u->req3.tp_block_nr;
591 p1->hdrlen = po->tp_hdrlen;
592 p1->version = po->tp_version;
593 p1->last_kactive_blk_num = 0;
ee80fbf3 594 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 595 if (req_u->req3.tp_retire_blk_tov)
596 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
597 else
598 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
599 req_u->req3.tp_block_size);
600 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
601 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
602
dc808110 603 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
f6fb8f10 604 prb_init_ft_ops(p1, req_u);
e8e85cc5 605 prb_setup_retire_blk_timer(po);
f6fb8f10 606 prb_open_block(p1, pbd);
607}
608
609/* Do NOT update the last_blk_num first.
610 * Assumes sk_buff_head lock is held.
611 */
bc59ba39 612static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 613{
614 mod_timer(&pkc->retire_blk_timer,
615 jiffies + pkc->tov_in_jiffies);
616 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
617}
618
619/*
620 * Timer logic:
621 * 1) We refresh the timer only when we open a block.
622 * By doing this we don't waste cycles refreshing the timer
623 * on packet-by-packet basis.
624 *
625 * With a 1MB block-size, on a 1Gbps line, it will take
626 * i) ~8 ms to fill a block + ii) memcpy etc.
627 * In this cut we are not accounting for the memcpy time.
628 *
629 * So, if the user sets the 'tmo' to 10ms then the timer
630 * will never fire while the block is still getting filled
631 * (which is what we want). However, the user could choose
632 * to close a block early and that's fine.
633 *
634 * But when the timer does fire, we check whether or not to refresh it.
635 * Since the tmo granularity is in msecs, it is not too expensive
636 * to refresh the timer, lets say every '8' msecs.
637 * Either the user can set the 'tmo' or we can derive it based on
638 * a) line-speed and b) block-size.
639 * prb_calc_retire_blk_tmo() calculates the tmo.
640 *
641 */
17bfd8c8 642static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
f6fb8f10 643{
17bfd8c8
KC
644 struct packet_sock *po =
645 from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
22781a5b 646 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 647 unsigned int frozen;
bc59ba39 648 struct tpacket_block_desc *pbd;
f6fb8f10 649
650 spin_lock(&po->sk.sk_receive_queue.lock);
651
652 frozen = prb_queue_frozen(pkc);
653 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
654
655 if (unlikely(pkc->delete_blk_timer))
656 goto out;
657
658 /* We only need to plug the race when the block is partially filled.
659 * tpacket_rcv:
660 * lock(); increment BLOCK_NUM_PKTS; unlock()
661 * copy_bits() is in progress ...
662 * timer fires on other cpu:
663 * we can't retire the current block because copy_bits
664 * is in progress.
665 *
666 */
667 if (BLOCK_NUM_PKTS(pbd)) {
668 while (atomic_read(&pkc->blk_fill_in_prog)) {
669 /* Waiting for skb_copy_bits to finish... */
670 cpu_relax();
671 }
672 }
673
674 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
675 if (!frozen) {
41a50d62
AD
676 if (!BLOCK_NUM_PKTS(pbd)) {
677 /* An empty block. Just refresh the timer. */
678 goto refresh_timer;
679 }
f6fb8f10 680 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
681 if (!prb_dispatch_next_block(pkc, po))
682 goto refresh_timer;
683 else
684 goto out;
685 } else {
686 /* Case 1. Queue was frozen because user-space was
687 * lagging behind.
688 */
878cd3ba 689 if (prb_curr_blk_in_use(pbd)) {
f6fb8f10 690 /*
691 * Ok, user-space is still behind.
692 * So just refresh the timer.
693 */
694 goto refresh_timer;
695 } else {
696 /* Case 2. queue was frozen,user-space caught up,
697 * now the link went idle && the timer fired.
698 * We don't have a block to close.So we open this
699 * block and restart the timer.
700 * opening a block thaws the queue,restarts timer
701 * Thawing/timer-refresh is a side effect.
702 */
703 prb_open_block(pkc, pbd);
704 goto out;
705 }
706 }
707 }
708
709refresh_timer:
710 _prb_refresh_rx_retire_blk_timer(pkc);
711
712out:
713 spin_unlock(&po->sk.sk_receive_queue.lock);
714}
715
eea49cc9 716static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 717 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 718{
719 /* Flush everything minus the block header */
720
721#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
722 u8 *start, *end;
723
724 start = (u8 *)pbd1;
725
726 /* Skip the block header(we know header WILL fit in 4K) */
727 start += PAGE_SIZE;
728
729 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
730 for (; start < end; start += PAGE_SIZE)
731 flush_dcache_page(pgv_to_page(start));
732
733 smp_wmb();
734#endif
735
736 /* Now update the block status. */
737
738 BLOCK_STATUS(pbd1) = status;
739
740 /* Flush the block header */
741
742#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
743 start = (u8 *)pbd1;
744 flush_dcache_page(pgv_to_page(start));
745
746 smp_wmb();
747#endif
748}
749
750/*
751 * Side effect:
752 *
753 * 1) flush the block
754 * 2) Increment active_blk_num
755 *
756 * Note:We DONT refresh the timer on purpose.
757 * Because almost always the next block will be opened.
758 */
bc59ba39 759static void prb_close_block(struct tpacket_kbdq_core *pkc1,
760 struct tpacket_block_desc *pbd1,
f6fb8f10 761 struct packet_sock *po, unsigned int stat)
762{
763 __u32 status = TP_STATUS_USER | stat;
764
765 struct tpacket3_hdr *last_pkt;
bc59ba39 766 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
da413eec 767 struct sock *sk = &po->sk;
f6fb8f10 768
ee80fbf3 769 if (po->stats.stats3.tp_drops)
f6fb8f10 770 status |= TP_STATUS_LOSING;
771
772 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
773 last_pkt->tp_next_offset = 0;
774
775 /* Get the ts of the last pkt */
776 if (BLOCK_NUM_PKTS(pbd1)) {
777 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
778 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
779 } else {
41a50d62
AD
780 /* Ok, we tmo'd - so get the current time.
781 *
782 * It shouldn't really happen as we don't close empty
783 * blocks. See prb_retire_rx_blk_timer_expired().
784 */
f6fb8f10 785 struct timespec ts;
786 getnstimeofday(&ts);
787 h1->ts_last_pkt.ts_sec = ts.tv_sec;
788 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
789 }
790
791 smp_wmb();
792
793 /* Flush the block */
794 prb_flush_block(pkc1, pbd1, status);
795
da413eec
DC
796 sk->sk_data_ready(sk);
797
f6fb8f10 798 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
799}
800
eea49cc9 801static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 802{
803 pkc->reset_pending_on_curr_blk = 0;
804}
805
806/*
807 * Side effect of opening a block:
808 *
809 * 1) prb_queue is thawed.
810 * 2) retire_blk_timer is refreshed.
811 *
812 */
bc59ba39 813static void prb_open_block(struct tpacket_kbdq_core *pkc1,
814 struct tpacket_block_desc *pbd1)
f6fb8f10 815{
816 struct timespec ts;
bc59ba39 817 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 818
819 smp_rmb();
820
8da3056c
DB
821 /* We could have just memset this but we will lose the
822 * flexibility of making the priv area sticky
823 */
f6fb8f10 824
8da3056c
DB
825 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
826 BLOCK_NUM_PKTS(pbd1) = 0;
827 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 828
8da3056c
DB
829 getnstimeofday(&ts);
830
831 h1->ts_first_pkt.ts_sec = ts.tv_sec;
832 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
f6fb8f10 833
8da3056c
DB
834 pkc1->pkblk_start = (char *)pbd1;
835 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
836
837 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
838 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
839
840 pbd1->version = pkc1->version;
841 pkc1->prev = pkc1->nxt_offset;
842 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
843
844 prb_thaw_queue(pkc1);
845 _prb_refresh_rx_retire_blk_timer(pkc1);
846
847 smp_wmb();
f6fb8f10 848}
849
850/*
851 * Queue freeze logic:
852 * 1) Assume tp_block_nr = 8 blocks.
853 * 2) At time 't0', user opens Rx ring.
854 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
855 * 4) user-space is either sleeping or processing block '0'.
856 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
857 * it will close block-7,loop around and try to fill block '0'.
858 * call-flow:
859 * __packet_lookup_frame_in_block
860 * prb_retire_current_block()
861 * prb_dispatch_next_block()
862 * |->(BLOCK_STATUS == USER) evaluates to true
863 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
864 * 6) Now there are two cases:
865 * 6.1) Link goes idle right after the queue is frozen.
866 * But remember, the last open_block() refreshed the timer.
867 * When this timer expires,it will refresh itself so that we can
868 * re-open block-0 in near future.
869 * 6.2) Link is busy and keeps on receiving packets. This is a simple
870 * case and __packet_lookup_frame_in_block will check if block-0
871 * is free and can now be re-used.
872 */
eea49cc9 873static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 874 struct packet_sock *po)
875{
876 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 877 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 878}
879
880#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
881
882/*
883 * If the next block is free then we will dispatch it
884 * and return a good offset.
885 * Else, we will freeze the queue.
886 * So, caller must check the return value.
887 */
bc59ba39 888static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 889 struct packet_sock *po)
890{
bc59ba39 891 struct tpacket_block_desc *pbd;
f6fb8f10 892
893 smp_rmb();
894
895 /* 1. Get current block num */
896 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
897
898 /* 2. If this block is currently in_use then freeze the queue */
899 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
900 prb_freeze_queue(pkc, po);
901 return NULL;
902 }
903
904 /*
905 * 3.
906 * open this block and return the offset where the first packet
907 * needs to get stored.
908 */
909 prb_open_block(pkc, pbd);
910 return (void *)pkc->nxt_offset;
911}
912
bc59ba39 913static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 914 struct packet_sock *po, unsigned int status)
915{
bc59ba39 916 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 917
918 /* retire/close the current block */
919 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
920 /*
921 * Plug the case where copy_bits() is in progress on
922 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
923 * have space to copy the pkt in the current block and
924 * called prb_retire_current_block()
925 *
926 * We don't need to worry about the TMO case because
927 * the timer-handler already handled this case.
928 */
929 if (!(status & TP_STATUS_BLK_TMO)) {
930 while (atomic_read(&pkc->blk_fill_in_prog)) {
931 /* Waiting for skb_copy_bits to finish... */
932 cpu_relax();
933 }
934 }
935 prb_close_block(pkc, pbd, po, status);
936 return;
937 }
f6fb8f10 938}
939
878cd3ba 940static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
f6fb8f10 941{
942 return TP_STATUS_USER & BLOCK_STATUS(pbd);
943}
944
eea49cc9 945static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 946{
947 return pkc->reset_pending_on_curr_blk;
948}
949
eea49cc9 950static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 951{
bc59ba39 952 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 953 atomic_dec(&pkc->blk_fill_in_prog);
954}
955
eea49cc9 956static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 957 struct tpacket3_hdr *ppd)
958{
3958afa1 959 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
f6fb8f10 960}
961
eea49cc9 962static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 963 struct tpacket3_hdr *ppd)
964{
965 ppd->hv1.tp_rxhash = 0;
966}
967
eea49cc9 968static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 969 struct tpacket3_hdr *ppd)
970{
df8a39de
JP
971 if (skb_vlan_tag_present(pkc->skb)) {
972 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
a0cdfcf3
AW
973 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
974 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
f6fb8f10 975 } else {
9e67030a 976 ppd->hv1.tp_vlan_tci = 0;
a0cdfcf3 977 ppd->hv1.tp_vlan_tpid = 0;
9e67030a 978 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 979 }
980}
981
bc59ba39 982static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 983 struct tpacket3_hdr *ppd)
984{
a0cdfcf3 985 ppd->hv1.tp_padding = 0;
f6fb8f10 986 prb_fill_vlan_info(pkc, ppd);
987
988 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
989 prb_fill_rxhash(pkc, ppd);
990 else
991 prb_clear_rxhash(pkc, ppd);
992}
993
eea49cc9 994static void prb_fill_curr_block(char *curr,
bc59ba39 995 struct tpacket_kbdq_core *pkc,
996 struct tpacket_block_desc *pbd,
f6fb8f10 997 unsigned int len)
998{
999 struct tpacket3_hdr *ppd;
1000
1001 ppd = (struct tpacket3_hdr *)curr;
1002 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1003 pkc->prev = curr;
1004 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1005 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1006 BLOCK_NUM_PKTS(pbd) += 1;
1007 atomic_inc(&pkc->blk_fill_in_prog);
1008 prb_run_all_ft_ops(pkc, ppd);
1009}
1010
1011/* Assumes caller has the sk->rx_queue.lock */
1012static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1013 struct sk_buff *skb,
1014 int status,
1015 unsigned int len
1016 )
1017{
bc59ba39 1018 struct tpacket_kbdq_core *pkc;
1019 struct tpacket_block_desc *pbd;
f6fb8f10 1020 char *curr, *end;
1021
e3192690 1022 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 1023 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1024
1025 /* Queue is frozen when user space is lagging behind */
1026 if (prb_queue_frozen(pkc)) {
1027 /*
1028 * Check if that last block which caused the queue to freeze,
1029 * is still in_use by user-space.
1030 */
878cd3ba 1031 if (prb_curr_blk_in_use(pbd)) {
f6fb8f10 1032 /* Can't record this packet */
1033 return NULL;
1034 } else {
1035 /*
1036 * Ok, the block was released by user-space.
1037 * Now let's open that block.
1038 * opening a block also thaws the queue.
1039 * Thawing is a side effect.
1040 */
1041 prb_open_block(pkc, pbd);
1042 }
1043 }
1044
1045 smp_mb();
1046 curr = pkc->nxt_offset;
1047 pkc->skb = skb;
e3192690 1048 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 1049
1050 /* first try the current block */
1051 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1052 prb_fill_curr_block(curr, pkc, pbd, len);
1053 return (void *)curr;
1054 }
1055
1056 /* Ok, close the current block */
1057 prb_retire_current_block(pkc, po, 0);
1058
1059 /* Now, try to dispatch the next block */
1060 curr = (char *)prb_dispatch_next_block(pkc, po);
1061 if (curr) {
1062 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1063 prb_fill_curr_block(curr, pkc, pbd, len);
1064 return (void *)curr;
1065 }
1066
1067 /*
1068 * No free blocks are available.user_space hasn't caught up yet.
1069 * Queue was just frozen and now this packet will get dropped.
1070 */
1071 return NULL;
1072}
1073
eea49cc9 1074static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1075 struct sk_buff *skb,
1076 int status, unsigned int len)
1077{
1078 char *curr = NULL;
1079 switch (po->tp_version) {
1080 case TPACKET_V1:
1081 case TPACKET_V2:
1082 curr = packet_lookup_frame(po, &po->rx_ring,
1083 po->rx_ring.head, status);
1084 return curr;
1085 case TPACKET_V3:
1086 return __packet_lookup_frame_in_block(po, skb, status, len);
1087 default:
1088 WARN(1, "TPACKET version not supported\n");
1089 BUG();
99aa3473 1090 return NULL;
f6fb8f10 1091 }
1092}
1093
eea49cc9 1094static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1095 struct packet_ring_buffer *rb,
77f65ebd 1096 unsigned int idx,
f6fb8f10 1097 int status)
1098{
bc59ba39 1099 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1100 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1101
1102 if (status != BLOCK_STATUS(pbd))
1103 return NULL;
1104 return pbd;
1105}
1106
eea49cc9 1107static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1108{
1109 unsigned int prev;
1110 if (rb->prb_bdqc.kactive_blk_num)
1111 prev = rb->prb_bdqc.kactive_blk_num-1;
1112 else
1113 prev = rb->prb_bdqc.knum_blocks-1;
1114 return prev;
1115}
1116
1117/* Assumes caller has held the rx_queue.lock */
eea49cc9 1118static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1119 struct packet_ring_buffer *rb,
1120 int status)
1121{
1122 unsigned int previous = prb_previous_blk_num(rb);
1123 return prb_lookup_block(po, rb, previous, status);
1124}
1125
eea49cc9 1126static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1127 struct packet_ring_buffer *rb,
1128 int status)
1129{
1130 if (po->tp_version <= TPACKET_V2)
1131 return packet_previous_frame(po, rb, status);
1132
1133 return __prb_previous_block(po, rb, status);
1134}
1135
eea49cc9 1136static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1137 struct packet_ring_buffer *rb)
1138{
1139 switch (po->tp_version) {
1140 case TPACKET_V1:
1141 case TPACKET_V2:
1142 return packet_increment_head(rb);
1143 case TPACKET_V3:
1144 default:
1145 WARN(1, "TPACKET version not supported.\n");
1146 BUG();
1147 return;
1148 }
1149}
1150
eea49cc9 1151static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1152 struct packet_ring_buffer *rb,
1153 int status)
1154{
1155 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1156 return packet_lookup_frame(po, rb, previous, status);
1157}
1158
eea49cc9 1159static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1160{
1161 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1162}
1163
b0138408
DB
1164static void packet_inc_pending(struct packet_ring_buffer *rb)
1165{
1166 this_cpu_inc(*rb->pending_refcnt);
1167}
1168
1169static void packet_dec_pending(struct packet_ring_buffer *rb)
1170{
1171 this_cpu_dec(*rb->pending_refcnt);
1172}
1173
1174static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1175{
1176 unsigned int refcnt = 0;
1177 int cpu;
1178
1179 /* We don't use pending refcount in rx_ring. */
1180 if (rb->pending_refcnt == NULL)
1181 return 0;
1182
1183 for_each_possible_cpu(cpu)
1184 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1185
1186 return refcnt;
1187}
1188
1189static int packet_alloc_pending(struct packet_sock *po)
1190{
1191 po->rx_ring.pending_refcnt = NULL;
1192
1193 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1194 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1195 return -ENOBUFS;
1196
1197 return 0;
1198}
1199
1200static void packet_free_pending(struct packet_sock *po)
1201{
1202 free_percpu(po->tx_ring.pending_refcnt);
1203}
1204
9954729b
WB
1205#define ROOM_POW_OFF 2
1206#define ROOM_NONE 0x0
1207#define ROOM_LOW 0x1
1208#define ROOM_NORMAL 0x2
1209
1210static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
77f65ebd 1211{
9954729b
WB
1212 int idx, len;
1213
1214 len = po->rx_ring.frame_max + 1;
1215 idx = po->rx_ring.head;
1216 if (pow_off)
1217 idx += len >> pow_off;
1218 if (idx >= len)
1219 idx -= len;
1220 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1221}
1222
1223static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
1224{
1225 int idx, len;
1226
1227 len = po->rx_ring.prb_bdqc.knum_blocks;
1228 idx = po->rx_ring.prb_bdqc.kactive_blk_num;
1229 if (pow_off)
1230 idx += len >> pow_off;
1231 if (idx >= len)
1232 idx -= len;
1233 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1234}
77f65ebd 1235
2ccdbaa6 1236static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
9954729b
WB
1237{
1238 struct sock *sk = &po->sk;
1239 int ret = ROOM_NONE;
1240
1241 if (po->prot_hook.func != tpacket_rcv) {
1242 int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
2ccdbaa6 1243 - (skb ? skb->truesize : 0);
9954729b
WB
1244 if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
1245 return ROOM_NORMAL;
1246 else if (avail > 0)
1247 return ROOM_LOW;
1248 else
1249 return ROOM_NONE;
1250 }
77f65ebd 1251
9954729b
WB
1252 if (po->tp_version == TPACKET_V3) {
1253 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1254 ret = ROOM_NORMAL;
1255 else if (__tpacket_v3_has_room(po, 0))
1256 ret = ROOM_LOW;
1257 } else {
1258 if (__tpacket_has_room(po, ROOM_POW_OFF))
1259 ret = ROOM_NORMAL;
1260 else if (__tpacket_has_room(po, 0))
1261 ret = ROOM_LOW;
1262 }
2ccdbaa6
WB
1263
1264 return ret;
1265}
1266
1267static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1268{
1269 int ret;
1270 bool has_room;
1271
54d7c01d
WB
1272 spin_lock_bh(&po->sk.sk_receive_queue.lock);
1273 ret = __packet_rcv_has_room(po, skb);
2ccdbaa6
WB
1274 has_room = ret == ROOM_NORMAL;
1275 if (po->pressure == has_room)
54d7c01d
WB
1276 po->pressure = !has_room;
1277 spin_unlock_bh(&po->sk.sk_receive_queue.lock);
77f65ebd 1278
9954729b 1279 return ret;
77f65ebd
WB
1280}
1281
1da177e4
LT
1282static void packet_sock_destruct(struct sock *sk)
1283{
ed85b565
RC
1284 skb_queue_purge(&sk->sk_error_queue);
1285
547b792c 1286 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
14afee4b 1287 WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1da177e4
LT
1288
1289 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1290 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1291 return;
1292 }
1293
17ab56a2 1294 sk_refcnt_debug_dec(sk);
1da177e4
LT
1295}
1296
3b3a5b0a
WB
1297static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1298{
1299 u32 rxhash;
1300 int i, count = 0;
1301
1302 rxhash = skb_get_hash(skb);
1303 for (i = 0; i < ROLLOVER_HLEN; i++)
1304 if (po->rollover->history[i] == rxhash)
1305 count++;
1306
1307 po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
1308 return count > (ROLLOVER_HLEN >> 1);
1309}
1310
77f65ebd
WB
1311static unsigned int fanout_demux_hash(struct packet_fanout *f,
1312 struct sk_buff *skb,
1313 unsigned int num)
dc99f600 1314{
eb70db87 1315 return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
dc99f600
DM
1316}
1317
77f65ebd
WB
1318static unsigned int fanout_demux_lb(struct packet_fanout *f,
1319 struct sk_buff *skb,
1320 unsigned int num)
dc99f600 1321{
468479e6 1322 unsigned int val = atomic_inc_return(&f->rr_cur);
dc99f600 1323
468479e6 1324 return val % num;
77f65ebd
WB
1325}
1326
1327static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1328 struct sk_buff *skb,
1329 unsigned int num)
1330{
1331 return smp_processor_id() % num;
dc99f600
DM
1332}
1333
5df0ddfb
DB
1334static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1335 struct sk_buff *skb,
1336 unsigned int num)
1337{
f337db64 1338 return prandom_u32_max(num);
5df0ddfb
DB
1339}
1340
77f65ebd
WB
1341static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1342 struct sk_buff *skb,
ad377cab 1343 unsigned int idx, bool try_self,
77f65ebd 1344 unsigned int num)
95ec3eb4 1345{
4633c9e0 1346 struct packet_sock *po, *po_next, *po_skip = NULL;
a9b63918 1347 unsigned int i, j, room = ROOM_NONE;
95ec3eb4 1348
0648ab70 1349 po = pkt_sk(f->arr[idx]);
3b3a5b0a
WB
1350
1351 if (try_self) {
1352 room = packet_rcv_has_room(po, skb);
1353 if (room == ROOM_NORMAL ||
1354 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1355 return idx;
4633c9e0 1356 po_skip = po;
3b3a5b0a 1357 }
ad377cab 1358
0648ab70 1359 i = j = min_t(int, po->rollover->sock, num - 1);
77f65ebd 1360 do {
2ccdbaa6 1361 po_next = pkt_sk(f->arr[i]);
4633c9e0 1362 if (po_next != po_skip && !po_next->pressure &&
2ccdbaa6 1363 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
77f65ebd 1364 if (i != j)
0648ab70 1365 po->rollover->sock = i;
a9b63918
WB
1366 atomic_long_inc(&po->rollover->num);
1367 if (room == ROOM_LOW)
1368 atomic_long_inc(&po->rollover->num_huge);
77f65ebd
WB
1369 return i;
1370 }
ad377cab 1371
77f65ebd
WB
1372 if (++i == num)
1373 i = 0;
1374 } while (i != j);
1375
a9b63918 1376 atomic_long_inc(&po->rollover->num_failed);
77f65ebd
WB
1377 return idx;
1378}
1379
2d36097d
NH
1380static unsigned int fanout_demux_qm(struct packet_fanout *f,
1381 struct sk_buff *skb,
1382 unsigned int num)
1383{
1384 return skb_get_queue_mapping(skb) % num;
1385}
1386
47dceb8e
WB
1387static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1388 struct sk_buff *skb,
1389 unsigned int num)
1390{
1391 struct bpf_prog *prog;
1392 unsigned int ret = 0;
1393
1394 rcu_read_lock();
1395 prog = rcu_dereference(f->bpf_prog);
1396 if (prog)
ff936a04 1397 ret = bpf_prog_run_clear_cb(prog, skb) % num;
47dceb8e
WB
1398 rcu_read_unlock();
1399
1400 return ret;
1401}
1402
77f65ebd
WB
1403static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1404{
1405 return f->flags & (flag >> 8);
95ec3eb4
DM
1406}
1407
95ec3eb4
DM
1408static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1409 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1410{
1411 struct packet_fanout *f = pt->af_packet_priv;
f98f4514 1412 unsigned int num = READ_ONCE(f->num_members);
19bcf9f2 1413 struct net *net = read_pnet(&f->net);
dc99f600 1414 struct packet_sock *po;
77f65ebd 1415 unsigned int idx;
dc99f600 1416
19bcf9f2 1417 if (!net_eq(dev_net(dev), net) || !num) {
dc99f600
DM
1418 kfree_skb(skb);
1419 return 0;
1420 }
1421
3f34b24a 1422 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
19bcf9f2 1423 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
3f34b24a
AD
1424 if (!skb)
1425 return 0;
1426 }
95ec3eb4
DM
1427 switch (f->type) {
1428 case PACKET_FANOUT_HASH:
1429 default:
77f65ebd 1430 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1431 break;
1432 case PACKET_FANOUT_LB:
77f65ebd 1433 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1434 break;
1435 case PACKET_FANOUT_CPU:
77f65ebd
WB
1436 idx = fanout_demux_cpu(f, skb, num);
1437 break;
5df0ddfb
DB
1438 case PACKET_FANOUT_RND:
1439 idx = fanout_demux_rnd(f, skb, num);
1440 break;
2d36097d
NH
1441 case PACKET_FANOUT_QM:
1442 idx = fanout_demux_qm(f, skb, num);
1443 break;
77f65ebd 1444 case PACKET_FANOUT_ROLLOVER:
ad377cab 1445 idx = fanout_demux_rollover(f, skb, 0, false, num);
95ec3eb4 1446 break;
47dceb8e 1447 case PACKET_FANOUT_CBPF:
f2e52095 1448 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1449 idx = fanout_demux_bpf(f, skb, num);
1450 break;
dc99f600
DM
1451 }
1452
ad377cab
WB
1453 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1454 idx = fanout_demux_rollover(f, skb, idx, true, num);
dc99f600 1455
ad377cab 1456 po = pkt_sk(f->arr[idx]);
dc99f600
DM
1457 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1458}
1459
fff3321d
PE
1460DEFINE_MUTEX(fanout_mutex);
1461EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600 1462static LIST_HEAD(fanout_list);
4a69a864 1463static u16 fanout_next_id;
dc99f600
DM
1464
1465static void __fanout_link(struct sock *sk, struct packet_sock *po)
1466{
1467 struct packet_fanout *f = po->fanout;
1468
1469 spin_lock(&f->lock);
1470 f->arr[f->num_members] = sk;
1471 smp_wmb();
1472 f->num_members++;
2bd624b4
AS
1473 if (f->num_members == 1)
1474 dev_add_pack(&f->prot_hook);
dc99f600
DM
1475 spin_unlock(&f->lock);
1476}
1477
1478static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1479{
1480 struct packet_fanout *f = po->fanout;
1481 int i;
1482
1483 spin_lock(&f->lock);
1484 for (i = 0; i < f->num_members; i++) {
1485 if (f->arr[i] == sk)
1486 break;
1487 }
1488 BUG_ON(i >= f->num_members);
1489 f->arr[i] = f->arr[f->num_members - 1];
1490 f->num_members--;
2bd624b4
AS
1491 if (f->num_members == 0)
1492 __dev_remove_pack(&f->prot_hook);
dc99f600
DM
1493 spin_unlock(&f->lock);
1494}
1495
d4dd8aee 1496static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
c0de08d0 1497{
161642e2
ED
1498 if (sk->sk_family != PF_PACKET)
1499 return false;
c0de08d0 1500
161642e2 1501 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
c0de08d0
EL
1502}
1503
47dceb8e
WB
1504static void fanout_init_data(struct packet_fanout *f)
1505{
1506 switch (f->type) {
1507 case PACKET_FANOUT_LB:
1508 atomic_set(&f->rr_cur, 0);
1509 break;
1510 case PACKET_FANOUT_CBPF:
f2e52095 1511 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1512 RCU_INIT_POINTER(f->bpf_prog, NULL);
1513 break;
1514 }
1515}
1516
1517static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1518{
1519 struct bpf_prog *old;
1520
1521 spin_lock(&f->lock);
1522 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1523 rcu_assign_pointer(f->bpf_prog, new);
1524 spin_unlock(&f->lock);
1525
1526 if (old) {
1527 synchronize_net();
1528 bpf_prog_destroy(old);
1529 }
1530}
1531
1532static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1533 unsigned int len)
1534{
1535 struct bpf_prog *new;
1536 struct sock_fprog fprog;
1537 int ret;
1538
1539 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1540 return -EPERM;
1541 if (len != sizeof(fprog))
1542 return -EINVAL;
1543 if (copy_from_user(&fprog, data, len))
1544 return -EFAULT;
1545
bab18991 1546 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
47dceb8e
WB
1547 if (ret)
1548 return ret;
1549
1550 __fanout_set_data_bpf(po->fanout, new);
1551 return 0;
1552}
1553
f2e52095
WB
1554static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1555 unsigned int len)
1556{
1557 struct bpf_prog *new;
1558 u32 fd;
1559
1560 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1561 return -EPERM;
1562 if (len != sizeof(fd))
1563 return -EINVAL;
1564 if (copy_from_user(&fd, data, len))
1565 return -EFAULT;
1566
113214be 1567 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
f2e52095
WB
1568 if (IS_ERR(new))
1569 return PTR_ERR(new);
f2e52095
WB
1570
1571 __fanout_set_data_bpf(po->fanout, new);
1572 return 0;
1573}
1574
47dceb8e
WB
1575static int fanout_set_data(struct packet_sock *po, char __user *data,
1576 unsigned int len)
1577{
1578 switch (po->fanout->type) {
1579 case PACKET_FANOUT_CBPF:
1580 return fanout_set_data_cbpf(po, data, len);
f2e52095
WB
1581 case PACKET_FANOUT_EBPF:
1582 return fanout_set_data_ebpf(po, data, len);
47dceb8e
WB
1583 default:
1584 return -EINVAL;
07d53ae4 1585 }
47dceb8e
WB
1586}
1587
1588static void fanout_release_data(struct packet_fanout *f)
1589{
1590 switch (f->type) {
1591 case PACKET_FANOUT_CBPF:
f2e52095 1592 case PACKET_FANOUT_EBPF:
47dceb8e 1593 __fanout_set_data_bpf(f, NULL);
07d53ae4 1594 }
47dceb8e
WB
1595}
1596
4a69a864
MM
1597static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1598{
1599 struct packet_fanout *f;
1600
1601 list_for_each_entry(f, &fanout_list, list) {
1602 if (f->id == candidate_id &&
1603 read_pnet(&f->net) == sock_net(sk)) {
1604 return false;
1605 }
1606 }
1607 return true;
1608}
1609
1610static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1611{
1612 u16 id = fanout_next_id;
1613
1614 do {
1615 if (__fanout_id_is_free(sk, id)) {
1616 *new_id = id;
1617 fanout_next_id = id + 1;
1618 return true;
1619 }
1620
1621 id++;
1622 } while (id != fanout_next_id);
1623
1624 return false;
1625}
1626
7736d33f 1627static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600 1628{
d199fab6 1629 struct packet_rollover *rollover = NULL;
dc99f600
DM
1630 struct packet_sock *po = pkt_sk(sk);
1631 struct packet_fanout *f, *match;
7736d33f 1632 u8 type = type_flags & 0xff;
77f65ebd 1633 u8 flags = type_flags >> 8;
dc99f600
DM
1634 int err;
1635
1636 switch (type) {
77f65ebd
WB
1637 case PACKET_FANOUT_ROLLOVER:
1638 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1639 return -EINVAL;
dc99f600
DM
1640 case PACKET_FANOUT_HASH:
1641 case PACKET_FANOUT_LB:
95ec3eb4 1642 case PACKET_FANOUT_CPU:
5df0ddfb 1643 case PACKET_FANOUT_RND:
2d36097d 1644 case PACKET_FANOUT_QM:
47dceb8e 1645 case PACKET_FANOUT_CBPF:
f2e52095 1646 case PACKET_FANOUT_EBPF:
dc99f600
DM
1647 break;
1648 default:
1649 return -EINVAL;
1650 }
1651
d199fab6
ED
1652 mutex_lock(&fanout_mutex);
1653
d199fab6 1654 err = -EALREADY;
dc99f600 1655 if (po->fanout)
d199fab6 1656 goto out;
dc99f600 1657
4633c9e0
WB
1658 if (type == PACKET_FANOUT_ROLLOVER ||
1659 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
d199fab6
ED
1660 err = -ENOMEM;
1661 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1662 if (!rollover)
1663 goto out;
1664 atomic_long_set(&rollover->num, 0);
1665 atomic_long_set(&rollover->num_huge, 0);
1666 atomic_long_set(&rollover->num_failed, 0);
0648ab70
WB
1667 }
1668
4a69a864
MM
1669 if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1670 if (id != 0) {
1671 err = -EINVAL;
1672 goto out;
1673 }
1674 if (!fanout_find_new_id(sk, &id)) {
1675 err = -ENOMEM;
1676 goto out;
1677 }
1678 /* ephemeral flag for the first socket in the group: drop it */
1679 flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
1680 }
1681
dc99f600
DM
1682 match = NULL;
1683 list_for_each_entry(f, &fanout_list, list) {
1684 if (f->id == id &&
1685 read_pnet(&f->net) == sock_net(sk)) {
1686 match = f;
1687 break;
1688 }
1689 }
afe62c68 1690 err = -EINVAL;
77f65ebd 1691 if (match && match->flags != flags)
afe62c68 1692 goto out;
dc99f600 1693 if (!match) {
afe62c68 1694 err = -ENOMEM;
dc99f600 1695 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1696 if (!match)
1697 goto out;
1698 write_pnet(&match->net, sock_net(sk));
1699 match->id = id;
1700 match->type = type;
77f65ebd 1701 match->flags = flags;
afe62c68
ED
1702 INIT_LIST_HEAD(&match->list);
1703 spin_lock_init(&match->lock);
fb5c2c17 1704 refcount_set(&match->sk_ref, 0);
47dceb8e 1705 fanout_init_data(match);
afe62c68
ED
1706 match->prot_hook.type = po->prot_hook.type;
1707 match->prot_hook.dev = po->prot_hook.dev;
1708 match->prot_hook.func = packet_rcv_fanout;
1709 match->prot_hook.af_packet_priv = match;
c0de08d0 1710 match->prot_hook.id_match = match_fanout_group;
afe62c68 1711 list_add(&match->list, &fanout_list);
dc99f600 1712 }
afe62c68 1713 err = -EINVAL;
008ba2a1
WB
1714
1715 spin_lock(&po->bind_lock);
1716 if (po->running &&
1717 match->type == type &&
afe62c68
ED
1718 match->prot_hook.type == po->prot_hook.type &&
1719 match->prot_hook.dev == po->prot_hook.dev) {
1720 err = -ENOSPC;
fb5c2c17 1721 if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
afe62c68
ED
1722 __dev_remove_pack(&po->prot_hook);
1723 po->fanout = match;
57f015f5
MM
1724 po->rollover = rollover;
1725 rollover = NULL;
fb5c2c17 1726 refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
afe62c68
ED
1727 __fanout_link(sk, po);
1728 err = 0;
dc99f600
DM
1729 }
1730 }
008ba2a1
WB
1731 spin_unlock(&po->bind_lock);
1732
1733 if (err && !refcount_read(&match->sk_ref)) {
1734 list_del(&match->list);
1735 kfree(match);
1736 }
1737
afe62c68 1738out:
57f015f5 1739 kfree(rollover);
d199fab6 1740 mutex_unlock(&fanout_mutex);
dc99f600
DM
1741 return err;
1742}
1743
2bd624b4
AS
1744/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1745 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1746 * It is the responsibility of the caller to call fanout_release_data() and
1747 * free the returned packet_fanout (after synchronize_net())
1748 */
1749static struct packet_fanout *fanout_release(struct sock *sk)
dc99f600
DM
1750{
1751 struct packet_sock *po = pkt_sk(sk);
1752 struct packet_fanout *f;
1753
fff3321d 1754 mutex_lock(&fanout_mutex);
d199fab6
ED
1755 f = po->fanout;
1756 if (f) {
1757 po->fanout = NULL;
1758
fb5c2c17 1759 if (refcount_dec_and_test(&f->sk_ref))
d199fab6 1760 list_del(&f->list);
2bd624b4
AS
1761 else
1762 f = NULL;
dc99f600
DM
1763 }
1764 mutex_unlock(&fanout_mutex);
2bd624b4
AS
1765
1766 return f;
dc99f600 1767}
1da177e4 1768
3c70c132
DB
1769static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1770 struct sk_buff *skb)
1771{
1772 /* Earlier code assumed this would be a VLAN pkt, double-check
1773 * this now that we have the actual packet in hand. We can only
1774 * do this check on Ethernet devices.
1775 */
1776 if (unlikely(dev->type != ARPHRD_ETHER))
1777 return false;
1778
1779 skb_reset_mac_header(skb);
1780 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1781}
1782
90ddc4f0 1783static const struct proto_ops packet_ops;
1da177e4 1784
90ddc4f0 1785static const struct proto_ops packet_ops_spkt;
1da177e4 1786
40d4e3df
ED
1787static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1788 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1789{
1790 struct sock *sk;
1791 struct sockaddr_pkt *spkt;
1792
1793 /*
1794 * When we registered the protocol we saved the socket in the data
1795 * field for just this event.
1796 */
1797
1798 sk = pt->af_packet_priv;
1ce4f28b 1799
1da177e4
LT
1800 /*
1801 * Yank back the headers [hope the device set this
1802 * right or kerboom...]
1803 *
1804 * Incoming packets have ll header pulled,
1805 * push it back.
1806 *
98e399f8 1807 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1808 * so that this procedure is noop.
1809 */
1810
1811 if (skb->pkt_type == PACKET_LOOPBACK)
1812 goto out;
1813
09ad9bc7 1814 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1815 goto out;
1816
40d4e3df
ED
1817 skb = skb_share_check(skb, GFP_ATOMIC);
1818 if (skb == NULL)
1da177e4
LT
1819 goto oom;
1820
1821 /* drop any routing info */
adf30907 1822 skb_dst_drop(skb);
1da177e4 1823
84531c24
PO
1824 /* drop conntrack reference */
1825 nf_reset(skb);
1826
ffbc6111 1827 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1828
98e399f8 1829 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1830
1831 /*
1832 * The SOCK_PACKET socket receives _all_ frames.
1833 */
1834
1835 spkt->spkt_family = dev->type;
1836 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1837 spkt->spkt_protocol = skb->protocol;
1838
1839 /*
1840 * Charge the memory to the socket. This is done specifically
1841 * to prevent sockets using all the memory up.
1842 */
1843
40d4e3df 1844 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1845 return 0;
1846
1847out:
1848 kfree_skb(skb);
1849oom:
1850 return 0;
1851}
1852
1853
1854/*
1855 * Output a raw packet to a device layer. This bypasses all the other
1856 * protocol layers and you must therefore supply it with a complete frame
1857 */
1ce4f28b 1858
1b784140
YX
1859static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1860 size_t len)
1da177e4
LT
1861{
1862 struct sock *sk = sock->sk;
342dfc30 1863 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1864 struct sk_buff *skb = NULL;
1da177e4 1865 struct net_device *dev;
c14ac945 1866 struct sockcm_cookie sockc;
40d4e3df 1867 __be16 proto = 0;
1da177e4 1868 int err;
3bdc0eba 1869 int extra_len = 0;
1ce4f28b 1870
1da177e4 1871 /*
1ce4f28b 1872 * Get and verify the address.
1da177e4
LT
1873 */
1874
40d4e3df 1875 if (saddr) {
1da177e4 1876 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1877 return -EINVAL;
1878 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1879 proto = saddr->spkt_protocol;
1880 } else
1881 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1882
1883 /*
1ce4f28b 1884 * Find the device first to size check it
1da177e4
LT
1885 */
1886
de74e92a 1887 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1888retry:
654d1f8a
ED
1889 rcu_read_lock();
1890 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1891 err = -ENODEV;
1892 if (dev == NULL)
1893 goto out_unlock;
1ce4f28b 1894
d5e76b0a
DM
1895 err = -ENETDOWN;
1896 if (!(dev->flags & IFF_UP))
1897 goto out_unlock;
1898
1da177e4 1899 /*
40d4e3df
ED
1900 * You may not queue a frame bigger than the mtu. This is the lowest level
1901 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1902 */
1ce4f28b 1903
3bdc0eba
BG
1904 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1905 if (!netif_supports_nofcs(dev)) {
1906 err = -EPROTONOSUPPORT;
1907 goto out_unlock;
1908 }
1909 extra_len = 4; /* We're doing our own CRC */
1910 }
1911
1da177e4 1912 err = -EMSGSIZE;
3bdc0eba 1913 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1914 goto out_unlock;
1915
1a35ca80
ED
1916 if (!skb) {
1917 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1918 int tlen = dev->needed_tailroom;
1a35ca80
ED
1919 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1920
1921 rcu_read_unlock();
4ce40912 1922 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1923 if (skb == NULL)
1924 return -ENOBUFS;
1925 /* FIXME: Save some space for broken drivers that write a hard
1926 * header at transmission time by themselves. PPP is the notable
1927 * one here. This should really be fixed at the driver level.
1928 */
1929 skb_reserve(skb, reserved);
1930 skb_reset_network_header(skb);
1931
1932 /* Try to align data part correctly */
1933 if (hhlen) {
1934 skb->data -= hhlen;
1935 skb->tail -= hhlen;
1936 if (len < hhlen)
1937 skb_reset_network_header(skb);
1938 }
6ce8e9ce 1939 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1940 if (err)
1941 goto out_free;
1942 goto retry;
1da177e4
LT
1943 }
1944
9ed988cd
WB
1945 if (!dev_validate_header(dev, skb->data, len)) {
1946 err = -EINVAL;
1947 goto out_unlock;
1948 }
3c70c132
DB
1949 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1950 !packet_extra_vlan_len_allowed(dev, skb)) {
1951 err = -EMSGSIZE;
1952 goto out_unlock;
57f89bfa 1953 }
1a35ca80 1954
657a0667 1955 sockcm_init(&sockc, sk);
c14ac945
SHY
1956 if (msg->msg_controllen) {
1957 err = sock_cmsg_send(sk, msg, &sockc);
f8e7718c 1958 if (unlikely(err))
c14ac945 1959 goto out_unlock;
c14ac945
SHY
1960 }
1961
1da177e4
LT
1962 skb->protocol = proto;
1963 skb->dev = dev;
1964 skb->priority = sk->sk_priority;
2d37a186 1965 skb->mark = sk->sk_mark;
3d0ba8c0 1966 skb->tstamp = sockc.transmit_time;
bf84a010 1967
8f932f76 1968 skb_setup_tx_timestamp(skb, sockc.tsflags);
1da177e4 1969
3bdc0eba
BG
1970 if (unlikely(extra_len == 4))
1971 skb->no_fcs = 1;
1972
40893fd0 1973 skb_probe_transport_header(skb, 0);
c1aad275 1974
1da177e4 1975 dev_queue_xmit(skb);
654d1f8a 1976 rcu_read_unlock();
40d4e3df 1977 return len;
1da177e4 1978
1da177e4 1979out_unlock:
654d1f8a 1980 rcu_read_unlock();
1a35ca80
ED
1981out_free:
1982 kfree_skb(skb);
1da177e4
LT
1983 return err;
1984}
1da177e4 1985
ff936a04
AS
1986static unsigned int run_filter(struct sk_buff *skb,
1987 const struct sock *sk,
1988 unsigned int res)
1da177e4
LT
1989{
1990 struct sk_filter *filter;
fda9ef5d 1991
80f8f102
ED
1992 rcu_read_lock();
1993 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1994 if (filter != NULL)
ff936a04 1995 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 1996 rcu_read_unlock();
1da177e4 1997
dbcb5855 1998 return res;
1da177e4
LT
1999}
2000
16cc1400
WB
2001static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2002 size_t *len)
2003{
2004 struct virtio_net_hdr vnet_hdr;
2005
2006 if (*len < sizeof(vnet_hdr))
2007 return -EINVAL;
2008 *len -= sizeof(vnet_hdr);
2009
fd3a8862 2010 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
16cc1400
WB
2011 return -EINVAL;
2012
2013 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2014}
2015
1da177e4 2016/*
62ab0812
ED
2017 * This function makes lazy skb cloning in hope that most of packets
2018 * are discarded by BPF.
2019 *
2020 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
2021 * and skb->cb are mangled. It works because (and until) packets
2022 * falling here are owned by current CPU. Output packets are cloned
2023 * by dev_queue_xmit_nit(), input packets are processed by net_bh
2024 * sequencially, so that if we return skb to original state on exit,
2025 * we will not harm anyone.
1da177e4
LT
2026 */
2027
40d4e3df
ED
2028static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2029 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2030{
2031 struct sock *sk;
2032 struct sockaddr_ll *sll;
2033 struct packet_sock *po;
40d4e3df 2034 u8 *skb_head = skb->data;
1da177e4 2035 int skb_len = skb->len;
dbcb5855 2036 unsigned int snaplen, res;
da37845f 2037 bool is_drop_n_account = false;
1da177e4
LT
2038
2039 if (skb->pkt_type == PACKET_LOOPBACK)
2040 goto drop;
2041
2042 sk = pt->af_packet_priv;
2043 po = pkt_sk(sk);
2044
09ad9bc7 2045 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2046 goto drop;
2047
1da177e4
LT
2048 skb->dev = dev;
2049
3b04ddde 2050 if (dev->header_ops) {
1da177e4 2051 /* The device has an explicit notion of ll header,
62ab0812
ED
2052 * exported to higher levels.
2053 *
2054 * Otherwise, the device hides details of its frame
2055 * structure, so that corresponding packet head is
2056 * never delivered to user.
1da177e4
LT
2057 */
2058 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2059 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2060 else if (skb->pkt_type == PACKET_OUTGOING) {
2061 /* Special case: outgoing packets have ll header at head */
bbe735e4 2062 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2063 }
2064 }
2065
2066 snaplen = skb->len;
2067
dbcb5855
DM
2068 res = run_filter(skb, sk, snaplen);
2069 if (!res)
fda9ef5d 2070 goto drop_n_restore;
dbcb5855
DM
2071 if (snaplen > res)
2072 snaplen = res;
1da177e4 2073
0fd7bac6 2074 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2075 goto drop_n_acct;
2076
2077 if (skb_shared(skb)) {
2078 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2079 if (nskb == NULL)
2080 goto drop_n_acct;
2081
2082 if (skb_head != skb->data) {
2083 skb->data = skb_head;
2084 skb->len = skb_len;
2085 }
abc4e4fa 2086 consume_skb(skb);
1da177e4
LT
2087 skb = nskb;
2088 }
2089
b4772ef8 2090 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2091
2092 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2093 sll->sll_hatype = dev->type;
1da177e4 2094 sll->sll_pkttype = skb->pkt_type;
8032b464 2095 if (unlikely(po->origdev))
80feaacb
PWJ
2096 sll->sll_ifindex = orig_dev->ifindex;
2097 else
2098 sll->sll_ifindex = dev->ifindex;
1da177e4 2099
b95cce35 2100 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2101
2472d761
EB
2102 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2103 * Use their space for storing the original skb length.
2104 */
2105 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2106
1da177e4
LT
2107 if (pskb_trim(skb, snaplen))
2108 goto drop_n_acct;
2109
2110 skb_set_owner_r(skb, sk);
2111 skb->dev = NULL;
adf30907 2112 skb_dst_drop(skb);
1da177e4 2113
84531c24
PO
2114 /* drop conntrack reference */
2115 nf_reset(skb);
2116
1da177e4 2117 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2118 po->stats.stats1.tp_packets++;
3bc3b96f 2119 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
2120 __skb_queue_tail(&sk->sk_receive_queue, skb);
2121 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2122 sk->sk_data_ready(sk);
1da177e4
LT
2123 return 0;
2124
2125drop_n_acct:
da37845f 2126 is_drop_n_account = true;
7091fbd8 2127 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2128 po->stats.stats1.tp_drops++;
7091fbd8
WB
2129 atomic_inc(&sk->sk_drops);
2130 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
2131
2132drop_n_restore:
2133 if (skb_head != skb->data && skb_shared(skb)) {
2134 skb->data = skb_head;
2135 skb->len = skb_len;
2136 }
2137drop:
da37845f
WJ
2138 if (!is_drop_n_account)
2139 consume_skb(skb);
2140 else
2141 kfree_skb(skb);
1da177e4
LT
2142 return 0;
2143}
2144
40d4e3df
ED
2145static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2146 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2147{
2148 struct sock *sk;
2149 struct packet_sock *po;
2150 struct sockaddr_ll *sll;
184f489e 2151 union tpacket_uhdr h;
40d4e3df 2152 u8 *skb_head = skb->data;
1da177e4 2153 int skb_len = skb->len;
dbcb5855 2154 unsigned int snaplen, res;
f6fb8f10 2155 unsigned long status = TP_STATUS_USER;
bbd6ef87 2156 unsigned short macoff, netoff, hdrlen;
1da177e4 2157 struct sk_buff *copy_skb = NULL;
bbd6ef87 2158 struct timespec ts;
b9c32fb2 2159 __u32 ts_status;
da37845f 2160 bool is_drop_n_account = false;
edbd58be 2161 bool do_vnet = false;
1da177e4 2162
51846355
AW
2163 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2164 * We may add members to them until current aligned size without forcing
2165 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2166 */
2167 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2168 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2169
1da177e4
LT
2170 if (skb->pkt_type == PACKET_LOOPBACK)
2171 goto drop;
2172
2173 sk = pt->af_packet_priv;
2174 po = pkt_sk(sk);
2175
09ad9bc7 2176 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2177 goto drop;
2178
3b04ddde 2179 if (dev->header_ops) {
1da177e4 2180 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2181 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2182 else if (skb->pkt_type == PACKET_OUTGOING) {
2183 /* Special case: outgoing packets have ll header at head */
bbe735e4 2184 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2185 }
2186 }
2187
2188 snaplen = skb->len;
2189
dbcb5855
DM
2190 res = run_filter(skb, sk, snaplen);
2191 if (!res)
fda9ef5d 2192 goto drop_n_restore;
68c2e5de
AD
2193
2194 if (skb->ip_summed == CHECKSUM_PARTIAL)
2195 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2196 else if (skb->pkt_type != PACKET_OUTGOING &&
2197 (skb->ip_summed == CHECKSUM_COMPLETE ||
2198 skb_csum_unnecessary(skb)))
2199 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2200
dbcb5855
DM
2201 if (snaplen > res)
2202 snaplen = res;
1da177e4
LT
2203
2204 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2205 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2206 po->tp_reserve;
1da177e4 2207 } else {
95c96174 2208 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2209 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2210 (maclen < 16 ? 16 : maclen)) +
58d19b19 2211 po->tp_reserve;
edbd58be 2212 if (po->has_vnet_hdr) {
58d19b19 2213 netoff += sizeof(struct virtio_net_hdr);
edbd58be
BP
2214 do_vnet = true;
2215 }
1da177e4
LT
2216 macoff = netoff - maclen;
2217 }
f6fb8f10 2218 if (po->tp_version <= TPACKET_V2) {
2219 if (macoff + snaplen > po->rx_ring.frame_size) {
2220 if (po->copy_thresh &&
0fd7bac6 2221 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2222 if (skb_shared(skb)) {
2223 copy_skb = skb_clone(skb, GFP_ATOMIC);
2224 } else {
2225 copy_skb = skb_get(skb);
2226 skb_head = skb->data;
2227 }
2228 if (copy_skb)
2229 skb_set_owner_r(copy_skb, sk);
1da177e4 2230 }
f6fb8f10 2231 snaplen = po->rx_ring.frame_size - macoff;
edbd58be 2232 if ((int)snaplen < 0) {
f6fb8f10 2233 snaplen = 0;
edbd58be
BP
2234 do_vnet = false;
2235 }
1da177e4 2236 }
dc808110
ED
2237 } else if (unlikely(macoff + snaplen >
2238 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2239 u32 nval;
2240
2241 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2242 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2243 snaplen, nval, macoff);
2244 snaplen = nval;
2245 if (unlikely((int)snaplen < 0)) {
2246 snaplen = 0;
2247 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
edbd58be 2248 do_vnet = false;
dc808110 2249 }
1da177e4 2250 }
1da177e4 2251 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2252 h.raw = packet_current_rx_frame(po, skb,
2253 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2254 if (!h.raw)
58d19b19 2255 goto drop_n_account;
f6fb8f10 2256 if (po->tp_version <= TPACKET_V2) {
2257 packet_increment_rx_head(po, &po->rx_ring);
2258 /*
2259 * LOSING will be reported till you read the stats,
2260 * because it's COR - Clear On Read.
2261 * Anyways, moving it for V1/V2 only as V3 doesn't need this
2262 * at packet level.
2263 */
ee80fbf3 2264 if (po->stats.stats1.tp_drops)
f6fb8f10 2265 status |= TP_STATUS_LOSING;
2266 }
945d015e
ED
2267
2268 if (do_vnet &&
2269 virtio_net_hdr_from_skb(skb, h.raw + macoff -
2270 sizeof(struct virtio_net_hdr),
2271 vio_le(), true, 0))
2272 goto drop_n_account;
2273
ee80fbf3 2274 po->stats.stats1.tp_packets++;
1da177e4
LT
2275 if (copy_skb) {
2276 status |= TP_STATUS_COPY;
2277 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2278 }
1da177e4
LT
2279 spin_unlock(&sk->sk_receive_queue.lock);
2280
bbd6ef87 2281 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2282
2283 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2284 getnstimeofday(&ts);
1da177e4 2285
b9c32fb2
DB
2286 status |= ts_status;
2287
bbd6ef87
PM
2288 switch (po->tp_version) {
2289 case TPACKET_V1:
2290 h.h1->tp_len = skb->len;
2291 h.h1->tp_snaplen = snaplen;
2292 h.h1->tp_mac = macoff;
2293 h.h1->tp_net = netoff;
4b457bdf
DB
2294 h.h1->tp_sec = ts.tv_sec;
2295 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2296 hdrlen = sizeof(*h.h1);
2297 break;
2298 case TPACKET_V2:
2299 h.h2->tp_len = skb->len;
2300 h.h2->tp_snaplen = snaplen;
2301 h.h2->tp_mac = macoff;
2302 h.h2->tp_net = netoff;
bbd6ef87
PM
2303 h.h2->tp_sec = ts.tv_sec;
2304 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2305 if (skb_vlan_tag_present(skb)) {
2306 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2307 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2308 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2309 } else {
2310 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2311 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2312 }
e4d26f4b 2313 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2314 hdrlen = sizeof(*h.h2);
2315 break;
f6fb8f10 2316 case TPACKET_V3:
2317 /* tp_nxt_offset,vlan are already populated above.
2318 * So DONT clear those fields here
2319 */
2320 h.h3->tp_status |= status;
2321 h.h3->tp_len = skb->len;
2322 h.h3->tp_snaplen = snaplen;
2323 h.h3->tp_mac = macoff;
2324 h.h3->tp_net = netoff;
f6fb8f10 2325 h.h3->tp_sec = ts.tv_sec;
2326 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2327 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2328 hdrlen = sizeof(*h.h3);
2329 break;
bbd6ef87
PM
2330 default:
2331 BUG();
2332 }
1da177e4 2333
bbd6ef87 2334 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2335 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2336 sll->sll_family = AF_PACKET;
2337 sll->sll_hatype = dev->type;
2338 sll->sll_protocol = skb->protocol;
2339 sll->sll_pkttype = skb->pkt_type;
8032b464 2340 if (unlikely(po->origdev))
80feaacb
PWJ
2341 sll->sll_ifindex = orig_dev->ifindex;
2342 else
2343 sll->sll_ifindex = dev->ifindex;
1da177e4 2344
e16aa207 2345 smp_mb();
f0d4eb29 2346
f6dafa95 2347#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2348 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2349 u8 *start, *end;
2350
f0d4eb29
DB
2351 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2352 macoff + snaplen);
2353
2354 for (start = h.raw; start < end; start += PAGE_SIZE)
2355 flush_dcache_page(pgv_to_page(start));
1da177e4 2356 }
f0d4eb29 2357 smp_wmb();
f6dafa95 2358#endif
f0d4eb29 2359
da413eec 2360 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2361 __packet_set_status(po, h.raw, status);
da413eec
DC
2362 sk->sk_data_ready(sk);
2363 } else {
f6fb8f10 2364 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2365 }
1da177e4
LT
2366
2367drop_n_restore:
2368 if (skb_head != skb->data && skb_shared(skb)) {
2369 skb->data = skb_head;
2370 skb->len = skb_len;
2371 }
2372drop:
da37845f
WJ
2373 if (!is_drop_n_account)
2374 consume_skb(skb);
2375 else
2376 kfree_skb(skb);
1da177e4
LT
2377 return 0;
2378
58d19b19 2379drop_n_account:
da37845f 2380 is_drop_n_account = true;
ee80fbf3 2381 po->stats.stats1.tp_drops++;
1da177e4
LT
2382 spin_unlock(&sk->sk_receive_queue.lock);
2383
676d2369 2384 sk->sk_data_ready(sk);
acb5d75b 2385 kfree_skb(copy_skb);
1da177e4
LT
2386 goto drop_n_restore;
2387}
2388
69e3c75f
JB
2389static void tpacket_destruct_skb(struct sk_buff *skb)
2390{
2391 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2392
69e3c75f 2393 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2394 void *ph;
b9c32fb2
DB
2395 __u32 ts;
2396
5cd8d46e 2397 ph = skb_zcopy_get_nouarg(skb);
b0138408 2398 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2399
2400 ts = __packet_set_timestamp(po, ph, skb);
2401 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2402 }
2403
2404 sock_wfree(skb);
2405}
2406
c72219b7
DB
2407static void tpacket_set_protocol(const struct net_device *dev,
2408 struct sk_buff *skb)
2409{
2410 if (dev->type == ARPHRD_ETHER) {
2411 skb_reset_mac_header(skb);
2412 skb->protocol = eth_hdr(skb)->h_proto;
2413 }
2414}
2415
16cc1400
WB
2416static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2417{
16cc1400
WB
2418 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2419 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2420 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2421 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2422 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2423 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2424 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2425
2426 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2427 return -EINVAL;
2428
16cc1400
WB
2429 return 0;
2430}
2431
2432static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2433 struct virtio_net_hdr *vnet_hdr)
2434{
16cc1400
WB
2435 if (*len < sizeof(*vnet_hdr))
2436 return -EINVAL;
2437 *len -= sizeof(*vnet_hdr);
2438
cbbd26b8 2439 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
16cc1400
WB
2440 return -EFAULT;
2441
2442 return __packet_snd_vnet_parse(vnet_hdr, *len);
2443}
2444
40d4e3df 2445static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2446 void *frame, struct net_device *dev, void *data, int tp_len,
c14ac945
SHY
2447 __be16 proto, unsigned char *addr, int hlen, int copylen,
2448 const struct sockcm_cookie *sockc)
69e3c75f 2449{
184f489e 2450 union tpacket_uhdr ph;
8d39b4a6 2451 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2452 struct socket *sock = po->sk.sk_socket;
2453 struct page *page;
69e3c75f
JB
2454 int err;
2455
2456 ph.raw = frame;
2457
2458 skb->protocol = proto;
2459 skb->dev = dev;
2460 skb->priority = po->sk.sk_priority;
2d37a186 2461 skb->mark = po->sk.sk_mark;
3d0ba8c0 2462 skb->tstamp = sockc->transmit_time;
8f932f76 2463 skb_setup_tx_timestamp(skb, sockc->tsflags);
5cd8d46e 2464 skb_zcopy_set_nouarg(skb, ph.raw);
69e3c75f 2465
ae641949 2466 skb_reserve(skb, hlen);
69e3c75f 2467 skb_reset_network_header(skb);
c1aad275 2468
69e3c75f
JB
2469 to_write = tp_len;
2470
2471 if (sock->type == SOCK_DGRAM) {
2472 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2473 NULL, tp_len);
2474 if (unlikely(err < 0))
2475 return -EINVAL;
1d036d25 2476 } else if (copylen) {
9ed988cd
WB
2477 int hdrlen = min_t(int, copylen, tp_len);
2478
69e3c75f 2479 skb_push(skb, dev->hard_header_len);
1d036d25 2480 skb_put(skb, copylen - dev->hard_header_len);
9ed988cd 2481 err = skb_store_bits(skb, 0, data, hdrlen);
69e3c75f
JB
2482 if (unlikely(err))
2483 return err;
9ed988cd
WB
2484 if (!dev_validate_header(dev, skb->data, hdrlen))
2485 return -EINVAL;
c72219b7
DB
2486 if (!skb->protocol)
2487 tpacket_set_protocol(dev, skb);
69e3c75f 2488
9ed988cd
WB
2489 data += hdrlen;
2490 to_write -= hdrlen;
69e3c75f
JB
2491 }
2492
69e3c75f
JB
2493 offset = offset_in_page(data);
2494 len_max = PAGE_SIZE - offset;
2495 len = ((to_write > len_max) ? len_max : to_write);
2496
2497 skb->data_len = to_write;
2498 skb->len += to_write;
2499 skb->truesize += to_write;
14afee4b 2500 refcount_add(to_write, &po->sk.sk_wmem_alloc);
69e3c75f
JB
2501
2502 while (likely(to_write)) {
2503 nr_frags = skb_shinfo(skb)->nr_frags;
2504
2505 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2506 pr_err("Packet exceed the number of skb frags(%lu)\n",
2507 MAX_SKB_FRAGS);
69e3c75f
JB
2508 return -EFAULT;
2509 }
2510
0af55bb5
CG
2511 page = pgv_to_page(data);
2512 data += len;
69e3c75f
JB
2513 flush_dcache_page(page);
2514 get_page(page);
0af55bb5 2515 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2516 to_write -= len;
2517 offset = 0;
2518 len_max = PAGE_SIZE;
2519 len = ((to_write > len_max) ? len_max : to_write);
2520 }
2521
8fd6c80d 2522 skb_probe_transport_header(skb, 0);
efdfa2f7 2523
69e3c75f
JB
2524 return tp_len;
2525}
2526
8d39b4a6
WB
2527static int tpacket_parse_header(struct packet_sock *po, void *frame,
2528 int size_max, void **data)
2529{
2530 union tpacket_uhdr ph;
2531 int tp_len, off;
2532
2533 ph.raw = frame;
2534
2535 switch (po->tp_version) {
7f953ab2
SV
2536 case TPACKET_V3:
2537 if (ph.h3->tp_next_offset != 0) {
2538 pr_warn_once("variable sized slot not supported");
2539 return -EINVAL;
2540 }
2541 tp_len = ph.h3->tp_len;
2542 break;
8d39b4a6
WB
2543 case TPACKET_V2:
2544 tp_len = ph.h2->tp_len;
2545 break;
2546 default:
2547 tp_len = ph.h1->tp_len;
2548 break;
2549 }
2550 if (unlikely(tp_len > size_max)) {
2551 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2552 return -EMSGSIZE;
2553 }
2554
2555 if (unlikely(po->tp_tx_has_off)) {
2556 int off_min, off_max;
2557
2558 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2559 off_max = po->tx_ring.frame_size - tp_len;
2560 if (po->sk.sk_type == SOCK_DGRAM) {
2561 switch (po->tp_version) {
7f953ab2
SV
2562 case TPACKET_V3:
2563 off = ph.h3->tp_net;
2564 break;
8d39b4a6
WB
2565 case TPACKET_V2:
2566 off = ph.h2->tp_net;
2567 break;
2568 default:
2569 off = ph.h1->tp_net;
2570 break;
2571 }
2572 } else {
2573 switch (po->tp_version) {
7f953ab2
SV
2574 case TPACKET_V3:
2575 off = ph.h3->tp_mac;
2576 break;
8d39b4a6
WB
2577 case TPACKET_V2:
2578 off = ph.h2->tp_mac;
2579 break;
2580 default:
2581 off = ph.h1->tp_mac;
2582 break;
2583 }
2584 }
2585 if (unlikely((off < off_min) || (off_max < off)))
2586 return -EINVAL;
2587 } else {
2588 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2589 }
2590
2591 *data = frame + off;
2592 return tp_len;
2593}
2594
69e3c75f
JB
2595static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2596{
69e3c75f
JB
2597 struct sk_buff *skb;
2598 struct net_device *dev;
1d036d25 2599 struct virtio_net_hdr *vnet_hdr = NULL;
c14ac945 2600 struct sockcm_cookie sockc;
69e3c75f 2601 __be16 proto;
09effa67 2602 int err, reserve = 0;
40d4e3df 2603 void *ph;
342dfc30 2604 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2605 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2606 int tp_len, size_max;
2607 unsigned char *addr;
8d39b4a6 2608 void *data;
69e3c75f 2609 int len_sum = 0;
9e67030a 2610 int status = TP_STATUS_AVAILABLE;
1d036d25 2611 int hlen, tlen, copylen = 0;
69e3c75f 2612
69e3c75f
JB
2613 mutex_lock(&po->pg_vec_lock);
2614
66e56cd4 2615 if (likely(saddr == NULL)) {
e40526cb 2616 dev = packet_cached_dev_get(po);
69e3c75f
JB
2617 proto = po->num;
2618 addr = NULL;
2619 } else {
2620 err = -EINVAL;
2621 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2622 goto out;
2623 if (msg->msg_namelen < (saddr->sll_halen
2624 + offsetof(struct sockaddr_ll,
2625 sll_addr)))
2626 goto out;
69e3c75f 2627 proto = saddr->sll_protocol;
6b8d95f1 2628 addr = saddr->sll_halen ? saddr->sll_addr : NULL;
827d9780 2629 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
99137b78
WB
2630 if (addr && dev && saddr->sll_halen < dev->addr_len)
2631 goto out;
69e3c75f
JB
2632 }
2633
69e3c75f
JB
2634 err = -ENXIO;
2635 if (unlikely(dev == NULL))
2636 goto out;
69e3c75f
JB
2637 err = -ENETDOWN;
2638 if (unlikely(!(dev->flags & IFF_UP)))
2639 goto out_put;
2640
657a0667 2641 sockcm_init(&sockc, &po->sk);
d19b183c
DCS
2642 if (msg->msg_controllen) {
2643 err = sock_cmsg_send(&po->sk, msg, &sockc);
2644 if (unlikely(err))
2645 goto out_put;
2646 }
2647
5cfb4c8d
DB
2648 if (po->sk.sk_socket->type == SOCK_RAW)
2649 reserve = dev->hard_header_len;
69e3c75f 2650 size_max = po->tx_ring.frame_size
b5dd884e 2651 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2652
1d036d25 2653 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
5cfb4c8d 2654 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2655
69e3c75f
JB
2656 do {
2657 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2658 TP_STATUS_SEND_REQUEST);
69e3c75f 2659 if (unlikely(ph == NULL)) {
87a2fd28
DB
2660 if (need_wait && need_resched())
2661 schedule();
69e3c75f
JB
2662 continue;
2663 }
2664
8d39b4a6
WB
2665 skb = NULL;
2666 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2667 if (tp_len < 0)
2668 goto tpacket_error;
2669
69e3c75f 2670 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2671 hlen = LL_RESERVED_SPACE(dev);
2672 tlen = dev->needed_tailroom;
1d036d25
WB
2673 if (po->has_vnet_hdr) {
2674 vnet_hdr = data;
2675 data += sizeof(*vnet_hdr);
2676 tp_len -= sizeof(*vnet_hdr);
2677 if (tp_len < 0 ||
2678 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2679 tp_len = -EINVAL;
2680 goto tpacket_error;
2681 }
2682 copylen = __virtio16_to_cpu(vio_le(),
2683 vnet_hdr->hdr_len);
2684 }
9ed988cd 2685 copylen = max_t(int, copylen, dev->hard_header_len);
69e3c75f 2686 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2687 hlen + tlen + sizeof(struct sockaddr_ll) +
2688 (copylen - dev->hard_header_len),
fbf33a28 2689 !need_wait, &err);
69e3c75f 2690
fbf33a28
KM
2691 if (unlikely(skb == NULL)) {
2692 /* we assume the socket was initially writeable ... */
2693 if (likely(len_sum > 0))
2694 err = len_sum;
69e3c75f 2695 goto out_status;
fbf33a28 2696 }
8d39b4a6 2697 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
c14ac945 2698 addr, hlen, copylen, &sockc);
dbd46ab4 2699 if (likely(tp_len >= 0) &&
5cfb4c8d 2700 tp_len > dev->mtu + reserve &&
1d036d25 2701 !po->has_vnet_hdr &&
3c70c132
DB
2702 !packet_extra_vlan_len_allowed(dev, skb))
2703 tp_len = -EMSGSIZE;
69e3c75f
JB
2704
2705 if (unlikely(tp_len < 0)) {
8d39b4a6 2706tpacket_error:
69e3c75f
JB
2707 if (po->tp_loss) {
2708 __packet_set_status(po, ph,
2709 TP_STATUS_AVAILABLE);
2710 packet_increment_head(&po->tx_ring);
2711 kfree_skb(skb);
2712 continue;
2713 } else {
2714 status = TP_STATUS_WRONG_FORMAT;
2715 err = tp_len;
2716 goto out_status;
2717 }
2718 }
2719
9d2f67e4
JT
2720 if (po->has_vnet_hdr) {
2721 if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
2722 tp_len = -EINVAL;
2723 goto tpacket_error;
2724 }
2725 virtio_net_hdr_set_proto(skb, vnet_hdr);
1d036d25
WB
2726 }
2727
69e3c75f
JB
2728 skb->destructor = tpacket_destruct_skb;
2729 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2730 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2731
2732 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2733 err = po->xmit(skb);
eb70df13
JP
2734 if (unlikely(err > 0)) {
2735 err = net_xmit_errno(err);
2736 if (err && __packet_get_status(po, ph) ==
2737 TP_STATUS_AVAILABLE) {
2738 /* skb was destructed already */
2739 skb = NULL;
2740 goto out_status;
2741 }
2742 /*
2743 * skb was dropped but not destructed yet;
2744 * let's treat it like congestion or err < 0
2745 */
2746 err = 0;
2747 }
69e3c75f
JB
2748 packet_increment_head(&po->tx_ring);
2749 len_sum += tp_len;
b0138408
DB
2750 } while (likely((ph != NULL) ||
2751 /* Note: packet_read_pending() might be slow if we have
2752 * to call it as it's per_cpu variable, but in fast-path
2753 * we already short-circuit the loop with the first
2754 * condition, and luckily don't have to go that path
2755 * anyway.
2756 */
2757 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2758
2759 err = len_sum;
2760 goto out_put;
2761
69e3c75f
JB
2762out_status:
2763 __packet_set_status(po, ph, status);
2764 kfree_skb(skb);
2765out_put:
e40526cb 2766 dev_put(dev);
69e3c75f
JB
2767out:
2768 mutex_unlock(&po->pg_vec_lock);
2769 return err;
2770}
69e3c75f 2771
eea49cc9
OJ
2772static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2773 size_t reserve, size_t len,
2774 size_t linear, int noblock,
2775 int *err)
bfd5f4a3
SS
2776{
2777 struct sk_buff *skb;
2778
2779 /* Under a page? Don't bother with paged skb. */
2780 if (prepad + len < PAGE_SIZE || !linear)
2781 linear = len;
2782
2783 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2784 err, 0);
bfd5f4a3
SS
2785 if (!skb)
2786 return NULL;
2787
2788 skb_reserve(skb, reserve);
2789 skb_put(skb, linear);
2790 skb->data_len = len - linear;
2791 skb->len += len - linear;
2792
2793 return skb;
2794}
2795
d346a3fa 2796static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2797{
2798 struct sock *sk = sock->sk;
342dfc30 2799 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2800 struct sk_buff *skb;
2801 struct net_device *dev;
0e11c91e 2802 __be16 proto;
1da177e4 2803 unsigned char *addr;
827d9780 2804 int err, reserve = 0;
c7d39e32 2805 struct sockcm_cookie sockc;
bfd5f4a3
SS
2806 struct virtio_net_hdr vnet_hdr = { 0 };
2807 int offset = 0;
bfd5f4a3 2808 struct packet_sock *po = pkt_sk(sk);
da7c9561 2809 bool has_vnet_hdr = false;
57031eb7 2810 int hlen, tlen, linear;
3bdc0eba 2811 int extra_len = 0;
1da177e4
LT
2812
2813 /*
1ce4f28b 2814 * Get and verify the address.
1da177e4 2815 */
1ce4f28b 2816
66e56cd4 2817 if (likely(saddr == NULL)) {
e40526cb 2818 dev = packet_cached_dev_get(po);
1da177e4
LT
2819 proto = po->num;
2820 addr = NULL;
2821 } else {
2822 err = -EINVAL;
2823 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2824 goto out;
0fb375fb
EB
2825 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2826 goto out;
1da177e4 2827 proto = saddr->sll_protocol;
6b8d95f1 2828 addr = saddr->sll_halen ? saddr->sll_addr : NULL;
827d9780 2829 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
99137b78
WB
2830 if (addr && dev && saddr->sll_halen < dev->addr_len)
2831 goto out;
1da177e4
LT
2832 }
2833
1da177e4 2834 err = -ENXIO;
e40526cb 2835 if (unlikely(dev == NULL))
1da177e4 2836 goto out_unlock;
d5e76b0a 2837 err = -ENETDOWN;
e40526cb 2838 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2839 goto out_unlock;
2840
657a0667 2841 sockcm_init(&sockc, sk);
c7d39e32
EJ
2842 sockc.mark = sk->sk_mark;
2843 if (msg->msg_controllen) {
2844 err = sock_cmsg_send(sk, msg, &sockc);
2845 if (unlikely(err))
2846 goto out_unlock;
2847 }
2848
e40526cb
DB
2849 if (sock->type == SOCK_RAW)
2850 reserve = dev->hard_header_len;
bfd5f4a3 2851 if (po->has_vnet_hdr) {
16cc1400
WB
2852 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2853 if (err)
bfd5f4a3 2854 goto out_unlock;
da7c9561 2855 has_vnet_hdr = true;
bfd5f4a3
SS
2856 }
2857
3bdc0eba
BG
2858 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2859 if (!netif_supports_nofcs(dev)) {
2860 err = -EPROTONOSUPPORT;
2861 goto out_unlock;
2862 }
2863 extra_len = 4; /* We're doing our own CRC */
2864 }
2865
1da177e4 2866 err = -EMSGSIZE;
16cc1400
WB
2867 if (!vnet_hdr.gso_type &&
2868 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2869 goto out_unlock;
2870
bfd5f4a3 2871 err = -ENOBUFS;
ae641949
HX
2872 hlen = LL_RESERVED_SPACE(dev);
2873 tlen = dev->needed_tailroom;
57031eb7
WB
2874 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2875 linear = max(linear, min_t(int, len, dev->hard_header_len));
2876 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
bfd5f4a3 2877 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2878 if (skb == NULL)
1da177e4
LT
2879 goto out_unlock;
2880
b84bbaf7 2881 skb_reset_network_header(skb);
1da177e4 2882
0c4e8581 2883 err = -EINVAL;
9c707762
WB
2884 if (sock->type == SOCK_DGRAM) {
2885 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2886 if (unlikely(offset < 0))
9c707762 2887 goto out_free;
b84bbaf7 2888 } else if (reserve) {
9aad13b0 2889 skb_reserve(skb, -reserve);
993675a3
WB
2890 if (len < reserve)
2891 skb_reset_network_header(skb);
9c707762 2892 }
1da177e4
LT
2893
2894 /* Returns -EFAULT on error */
c0371da6 2895 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2896 if (err)
2897 goto out_free;
bf84a010 2898
9ed988cd
WB
2899 if (sock->type == SOCK_RAW &&
2900 !dev_validate_header(dev, skb->data, len)) {
2901 err = -EINVAL;
2902 goto out_free;
2903 }
2904
8f932f76 2905 skb_setup_tx_timestamp(skb, sockc.tsflags);
1da177e4 2906
16cc1400 2907 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
2908 !packet_extra_vlan_len_allowed(dev, skb)) {
2909 err = -EMSGSIZE;
2910 goto out_free;
57f89bfa
BG
2911 }
2912
09effa67
DM
2913 skb->protocol = proto;
2914 skb->dev = dev;
1da177e4 2915 skb->priority = sk->sk_priority;
c7d39e32 2916 skb->mark = sockc.mark;
3d0ba8c0 2917 skb->tstamp = sockc.transmit_time;
0fd5d57b 2918
da7c9561 2919 if (has_vnet_hdr) {
db60eb5f 2920 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
16cc1400
WB
2921 if (err)
2922 goto out_free;
2923 len += sizeof(vnet_hdr);
9d2f67e4 2924 virtio_net_hdr_set_proto(skb, &vnet_hdr);
bfd5f4a3
SS
2925 }
2926
8fd6c80d
DB
2927 skb_probe_transport_header(skb, reserve);
2928
3bdc0eba
BG
2929 if (unlikely(extra_len == 4))
2930 skb->no_fcs = 1;
2931
d346a3fa 2932 err = po->xmit(skb);
1da177e4
LT
2933 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2934 goto out_unlock;
2935
e40526cb 2936 dev_put(dev);
1da177e4 2937
40d4e3df 2938 return len;
1da177e4
LT
2939
2940out_free:
2941 kfree_skb(skb);
2942out_unlock:
e40526cb 2943 if (dev)
1da177e4
LT
2944 dev_put(dev);
2945out:
2946 return err;
2947}
2948
1b784140 2949static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2950{
69e3c75f
JB
2951 struct sock *sk = sock->sk;
2952 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2953
69e3c75f
JB
2954 if (po->tx_ring.pg_vec)
2955 return tpacket_snd(po, msg);
2956 else
69e3c75f
JB
2957 return packet_snd(sock, msg, len);
2958}
2959
1da177e4
LT
2960/*
2961 * Close a PACKET socket. This is fairly simple. We immediately go
2962 * to 'closed' state and remove our protocol entry in the device list.
2963 */
2964
2965static int packet_release(struct socket *sock)
2966{
2967 struct sock *sk = sock->sk;
2968 struct packet_sock *po;
2bd624b4 2969 struct packet_fanout *f;
d12d01d6 2970 struct net *net;
f6fb8f10 2971 union tpacket_req_u req_u;
1da177e4
LT
2972
2973 if (!sk)
2974 return 0;
2975
3b1e0a65 2976 net = sock_net(sk);
1da177e4
LT
2977 po = pkt_sk(sk);
2978
0fa7fa98 2979 mutex_lock(&net->packet.sklist_lock);
808f5114 2980 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2981 mutex_unlock(&net->packet.sklist_lock);
2982
2983 preempt_disable();
920de804 2984 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2985 preempt_enable();
1da177e4 2986
808f5114 2987 spin_lock(&po->bind_lock);
ce06b03e 2988 unregister_prot_hook(sk, false);
66e56cd4
DB
2989 packet_cached_dev_reset(po);
2990
160ff18a
BG
2991 if (po->prot_hook.dev) {
2992 dev_put(po->prot_hook.dev);
2993 po->prot_hook.dev = NULL;
2994 }
808f5114 2995 spin_unlock(&po->bind_lock);
1da177e4 2996
1da177e4 2997 packet_flush_mclist(sk);
1da177e4 2998
5171b37d 2999 lock_sock(sk);
9665d5d6
PS
3000 if (po->rx_ring.pg_vec) {
3001 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3002 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 3003 }
69e3c75f 3004
9665d5d6
PS
3005 if (po->tx_ring.pg_vec) {
3006 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3007 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 3008 }
5171b37d 3009 release_sock(sk);
1da177e4 3010
2bd624b4 3011 f = fanout_release(sk);
dc99f600 3012
808f5114 3013 synchronize_net();
2bd624b4
AS
3014
3015 if (f) {
57f015f5 3016 kfree(po->rollover);
2bd624b4
AS
3017 fanout_release_data(f);
3018 kfree(f);
3019 }
1da177e4
LT
3020 /*
3021 * Now the socket is dead. No more input will appear.
3022 */
1da177e4
LT
3023 sock_orphan(sk);
3024 sock->sk = NULL;
3025
3026 /* Purge queues */
3027
3028 skb_queue_purge(&sk->sk_receive_queue);
b0138408 3029 packet_free_pending(po);
17ab56a2 3030 sk_refcnt_debug_release(sk);
1da177e4
LT
3031
3032 sock_put(sk);
3033 return 0;
3034}
3035
3036/*
3037 * Attach a packet hook.
3038 */
3039
30f7ea1c
FR
3040static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3041 __be16 proto)
1da177e4
LT
3042{
3043 struct packet_sock *po = pkt_sk(sk);
158cd4af 3044 struct net_device *dev_curr;
902fefb8
DB
3045 __be16 proto_curr;
3046 bool need_rehook;
30f7ea1c
FR
3047 struct net_device *dev = NULL;
3048 int ret = 0;
3049 bool unlisted = false;
dc99f600 3050
1da177e4 3051 lock_sock(sk);
1da177e4 3052 spin_lock(&po->bind_lock);
30f7ea1c
FR
3053 rcu_read_lock();
3054
4971613c
WB
3055 if (po->fanout) {
3056 ret = -EINVAL;
3057 goto out_unlock;
3058 }
3059
30f7ea1c
FR
3060 if (name) {
3061 dev = dev_get_by_name_rcu(sock_net(sk), name);
3062 if (!dev) {
3063 ret = -ENODEV;
3064 goto out_unlock;
3065 }
3066 } else if (ifindex) {
3067 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3068 if (!dev) {
3069 ret = -ENODEV;
3070 goto out_unlock;
3071 }
3072 }
3073
3074 if (dev)
3075 dev_hold(dev);
66e56cd4 3076
902fefb8
DB
3077 proto_curr = po->prot_hook.type;
3078 dev_curr = po->prot_hook.dev;
3079
3080 need_rehook = proto_curr != proto || dev_curr != dev;
3081
3082 if (need_rehook) {
30f7ea1c
FR
3083 if (po->running) {
3084 rcu_read_unlock();
15fe076e
ED
3085 /* prevents packet_notifier() from calling
3086 * register_prot_hook()
3087 */
3088 po->num = 0;
30f7ea1c
FR
3089 __unregister_prot_hook(sk, true);
3090 rcu_read_lock();
3091 dev_curr = po->prot_hook.dev;
3092 if (dev)
3093 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3094 dev->ifindex);
3095 }
1da177e4 3096
15fe076e 3097 BUG_ON(po->running);
902fefb8
DB
3098 po->num = proto;
3099 po->prot_hook.type = proto;
902fefb8 3100
30f7ea1c
FR
3101 if (unlikely(unlisted)) {
3102 dev_put(dev);
3103 po->prot_hook.dev = NULL;
3104 po->ifindex = -1;
3105 packet_cached_dev_reset(po);
3106 } else {
3107 po->prot_hook.dev = dev;
3108 po->ifindex = dev ? dev->ifindex : 0;
3109 packet_cached_dev_assign(po, dev);
3110 }
902fefb8 3111 }
158cd4af
LW
3112 if (dev_curr)
3113 dev_put(dev_curr);
66e56cd4 3114
902fefb8 3115 if (proto == 0 || !need_rehook)
1da177e4
LT
3116 goto out_unlock;
3117
30f7ea1c 3118 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3119 register_prot_hook(sk);
be85d4ad
UT
3120 } else {
3121 sk->sk_err = ENETDOWN;
3122 if (!sock_flag(sk, SOCK_DEAD))
3123 sk->sk_error_report(sk);
1da177e4
LT
3124 }
3125
3126out_unlock:
30f7ea1c 3127 rcu_read_unlock();
1da177e4
LT
3128 spin_unlock(&po->bind_lock);
3129 release_sock(sk);
30f7ea1c 3130 return ret;
1da177e4
LT
3131}
3132
3133/*
3134 * Bind a packet socket to a device
3135 */
3136
40d4e3df
ED
3137static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3138 int addr_len)
1da177e4 3139{
40d4e3df 3140 struct sock *sk = sock->sk;
540e2894 3141 char name[sizeof(uaddr->sa_data) + 1];
1ce4f28b 3142
1da177e4
LT
3143 /*
3144 * Check legality
3145 */
1ce4f28b 3146
8ae55f04 3147 if (addr_len != sizeof(struct sockaddr))
1da177e4 3148 return -EINVAL;
540e2894
AP
3149 /* uaddr->sa_data comes from the userspace, it's not guaranteed to be
3150 * zero-terminated.
3151 */
3152 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3153 name[sizeof(uaddr->sa_data)] = 0;
1da177e4 3154
30f7ea1c 3155 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3156}
1da177e4
LT
3157
3158static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3159{
40d4e3df
ED
3160 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3161 struct sock *sk = sock->sk;
1da177e4
LT
3162
3163 /*
3164 * Check legality
3165 */
1ce4f28b 3166
1da177e4
LT
3167 if (addr_len < sizeof(struct sockaddr_ll))
3168 return -EINVAL;
3169 if (sll->sll_family != AF_PACKET)
3170 return -EINVAL;
3171
30f7ea1c
FR
3172 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3173 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3174}
3175
3176static struct proto packet_proto = {
3177 .name = "PACKET",
3178 .owner = THIS_MODULE,
3179 .obj_size = sizeof(struct packet_sock),
3180};
3181
3182/*
1ce4f28b 3183 * Create a packet of type SOCK_PACKET.
1da177e4
LT
3184 */
3185
3f378b68
EP
3186static int packet_create(struct net *net, struct socket *sock, int protocol,
3187 int kern)
1da177e4
LT
3188{
3189 struct sock *sk;
3190 struct packet_sock *po;
0e11c91e 3191 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3192 int err;
3193
df008c91 3194 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3195 return -EPERM;
be02097c
DM
3196 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3197 sock->type != SOCK_PACKET)
1da177e4
LT
3198 return -ESOCKTNOSUPPORT;
3199
3200 sock->state = SS_UNCONNECTED;
3201
3202 err = -ENOBUFS;
11aa9c28 3203 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3204 if (sk == NULL)
3205 goto out;
3206
3207 sock->ops = &packet_ops;
1da177e4
LT
3208 if (sock->type == SOCK_PACKET)
3209 sock->ops = &packet_ops_spkt;
be02097c 3210
1da177e4
LT
3211 sock_init_data(sock, sk);
3212
3213 po = pkt_sk(sk);
3214 sk->sk_family = PF_PACKET;
0e11c91e 3215 po->num = proto;
d346a3fa 3216 po->xmit = dev_queue_xmit;
66e56cd4 3217
b0138408
DB
3218 err = packet_alloc_pending(po);
3219 if (err)
3220 goto out2;
3221
66e56cd4 3222 packet_cached_dev_reset(po);
1da177e4
LT
3223
3224 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3225 sk_refcnt_debug_inc(sk);
1da177e4
LT
3226
3227 /*
3228 * Attach a protocol block
3229 */
3230
3231 spin_lock_init(&po->bind_lock);
905db440 3232 mutex_init(&po->pg_vec_lock);
0648ab70 3233 po->rollover = NULL;
1da177e4 3234 po->prot_hook.func = packet_rcv;
be02097c 3235
1da177e4
LT
3236 if (sock->type == SOCK_PACKET)
3237 po->prot_hook.func = packet_rcv_spkt;
be02097c 3238
1da177e4
LT
3239 po->prot_hook.af_packet_priv = sk;
3240
0e11c91e
AV
3241 if (proto) {
3242 po->prot_hook.type = proto;
a6361f0c 3243 __register_prot_hook(sk);
1da177e4
LT
3244 }
3245
0fa7fa98 3246 mutex_lock(&net->packet.sklist_lock);
808f5114 3247 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3248 mutex_unlock(&net->packet.sklist_lock);
3249
3250 preempt_disable();
3680453c 3251 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3252 preempt_enable();
808f5114 3253
40d4e3df 3254 return 0;
b0138408
DB
3255out2:
3256 sk_free(sk);
1da177e4
LT
3257out:
3258 return err;
3259}
3260
3261/*
3262 * Pull a packet from our receive queue and hand it to the user.
3263 * If necessary we block.
3264 */
3265
1b784140
YX
3266static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3267 int flags)
1da177e4
LT
3268{
3269 struct sock *sk = sock->sk;
3270 struct sk_buff *skb;
3271 int copied, err;
bfd5f4a3 3272 int vnet_hdr_len = 0;
2472d761 3273 unsigned int origlen = 0;
1da177e4
LT
3274
3275 err = -EINVAL;
ed85b565 3276 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3277 goto out;
3278
3279#if 0
3280 /* What error should we return now? EUNATTACH? */
3281 if (pkt_sk(sk)->ifindex < 0)
3282 return -ENODEV;
3283#endif
3284
ed85b565 3285 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3286 err = sock_recv_errqueue(sk, msg, len,
3287 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3288 goto out;
3289 }
3290
1da177e4
LT
3291 /*
3292 * Call the generic datagram receiver. This handles all sorts
3293 * of horrible races and re-entrancy so we can forget about it
3294 * in the protocol layers.
3295 *
3296 * Now it will return ENETDOWN, if device have just gone down,
3297 * but then it will block.
3298 */
3299
40d4e3df 3300 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3301
3302 /*
1ce4f28b 3303 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
3304 * handles the blocking we don't see and worry about blocking
3305 * retries.
3306 */
3307
8ae55f04 3308 if (skb == NULL)
1da177e4
LT
3309 goto out;
3310
2ccdbaa6
WB
3311 if (pkt_sk(sk)->pressure)
3312 packet_rcv_has_room(pkt_sk(sk), NULL);
3313
bfd5f4a3 3314 if (pkt_sk(sk)->has_vnet_hdr) {
16cc1400
WB
3315 err = packet_rcv_vnet(msg, skb, &len);
3316 if (err)
bfd5f4a3 3317 goto out_free;
16cc1400 3318 vnet_hdr_len = sizeof(struct virtio_net_hdr);
bfd5f4a3
SS
3319 }
3320
f3d33426
HFS
3321 /* You lose any data beyond the buffer you gave. If it worries
3322 * a user program they can ask the device for its MTU
3323 * anyway.
1da177e4 3324 */
1da177e4 3325 copied = skb->len;
40d4e3df
ED
3326 if (copied > len) {
3327 copied = len;
3328 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3329 }
3330
51f3d02b 3331 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3332 if (err)
3333 goto out_free;
3334
2472d761
EB
3335 if (sock->type != SOCK_PACKET) {
3336 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3337
3338 /* Original length was stored in sockaddr_ll fields */
3339 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3340 sll->sll_family = AF_PACKET;
3341 sll->sll_protocol = skb->protocol;
3342 }
3343
3b885787 3344 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3345
f3d33426
HFS
3346 if (msg->msg_name) {
3347 /* If the address length field is there to be filled
3348 * in, we fill it in now.
3349 */
3350 if (sock->type == SOCK_PACKET) {
342dfc30 3351 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3352 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3353 } else {
3354 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3355
f3d33426
HFS
3356 msg->msg_namelen = sll->sll_halen +
3357 offsetof(struct sockaddr_ll, sll_addr);
3358 }
ffbc6111
HX
3359 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3360 msg->msg_namelen);
f3d33426 3361 }
1da177e4 3362
8dc41944 3363 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3364 struct tpacket_auxdata aux;
3365
3366 aux.tp_status = TP_STATUS_USER;
3367 if (skb->ip_summed == CHECKSUM_PARTIAL)
3368 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3369 else if (skb->pkt_type != PACKET_OUTGOING &&
3370 (skb->ip_summed == CHECKSUM_COMPLETE ||
3371 skb_csum_unnecessary(skb)))
3372 aux.tp_status |= TP_STATUS_CSUM_VALID;
3373
2472d761 3374 aux.tp_len = origlen;
ffbc6111
HX
3375 aux.tp_snaplen = skb->len;
3376 aux.tp_mac = 0;
bbe735e4 3377 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3378 if (skb_vlan_tag_present(skb)) {
3379 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3380 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3381 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3382 } else {
3383 aux.tp_vlan_tci = 0;
a0cdfcf3 3384 aux.tp_vlan_tpid = 0;
a3bcc23e 3385 }
ffbc6111 3386 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3387 }
3388
1da177e4
LT
3389 /*
3390 * Free or return the buffer as appropriate. Again this
3391 * hides all the races and re-entrancy issues from us.
3392 */
bfd5f4a3 3393 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3394
3395out_free:
3396 skb_free_datagram(sk, skb);
3397out:
3398 return err;
3399}
3400
1da177e4 3401static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3402 int peer)
1da177e4
LT
3403{
3404 struct net_device *dev;
3405 struct sock *sk = sock->sk;
3406
3407 if (peer)
3408 return -EOPNOTSUPP;
3409
3410 uaddr->sa_family = AF_PACKET;
2dc85bf3 3411 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3412 rcu_read_lock();
3413 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3414 if (dev)
2dc85bf3 3415 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3416 rcu_read_unlock();
1da177e4 3417
9b2c45d4 3418 return sizeof(*uaddr);
1da177e4 3419}
1da177e4
LT
3420
3421static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3422 int peer)
1da177e4
LT
3423{
3424 struct net_device *dev;
3425 struct sock *sk = sock->sk;
3426 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3427 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3428
3429 if (peer)
3430 return -EOPNOTSUPP;
3431
3432 sll->sll_family = AF_PACKET;
3433 sll->sll_ifindex = po->ifindex;
3434 sll->sll_protocol = po->num;
67286640 3435 sll->sll_pkttype = 0;
654d1f8a
ED
3436 rcu_read_lock();
3437 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3438 if (dev) {
3439 sll->sll_hatype = dev->type;
3440 sll->sll_halen = dev->addr_len;
3441 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3442 } else {
3443 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3444 sll->sll_halen = 0;
3445 }
654d1f8a 3446 rcu_read_unlock();
1da177e4 3447
9b2c45d4 3448 return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3449}
3450
2aeb0b88
WC
3451static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3452 int what)
1da177e4
LT
3453{
3454 switch (i->type) {
3455 case PACKET_MR_MULTICAST:
1162563f
JP
3456 if (i->alen != dev->addr_len)
3457 return -EINVAL;
1da177e4 3458 if (what > 0)
22bedad3 3459 return dev_mc_add(dev, i->addr);
1da177e4 3460 else
22bedad3 3461 return dev_mc_del(dev, i->addr);
1da177e4
LT
3462 break;
3463 case PACKET_MR_PROMISC:
2aeb0b88 3464 return dev_set_promiscuity(dev, what);
1da177e4 3465 case PACKET_MR_ALLMULTI:
2aeb0b88 3466 return dev_set_allmulti(dev, what);
d95ed927 3467 case PACKET_MR_UNICAST:
1162563f
JP
3468 if (i->alen != dev->addr_len)
3469 return -EINVAL;
d95ed927 3470 if (what > 0)
a748ee24 3471 return dev_uc_add(dev, i->addr);
d95ed927 3472 else
a748ee24 3473 return dev_uc_del(dev, i->addr);
d95ed927 3474 break;
40d4e3df
ED
3475 default:
3476 break;
1da177e4 3477 }
2aeb0b88 3478 return 0;
1da177e4
LT
3479}
3480
82f17091
FR
3481static void packet_dev_mclist_delete(struct net_device *dev,
3482 struct packet_mclist **mlp)
1da177e4 3483{
82f17091
FR
3484 struct packet_mclist *ml;
3485
3486 while ((ml = *mlp) != NULL) {
3487 if (ml->ifindex == dev->ifindex) {
3488 packet_dev_mc(dev, ml, -1);
3489 *mlp = ml->next;
3490 kfree(ml);
3491 } else
3492 mlp = &ml->next;
1da177e4
LT
3493 }
3494}
3495
0fb375fb 3496static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3497{
3498 struct packet_sock *po = pkt_sk(sk);
3499 struct packet_mclist *ml, *i;
3500 struct net_device *dev;
3501 int err;
3502
3503 rtnl_lock();
3504
3505 err = -ENODEV;
3b1e0a65 3506 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3507 if (!dev)
3508 goto done;
3509
3510 err = -EINVAL;
1162563f 3511 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3512 goto done;
3513
3514 err = -ENOBUFS;
8b3a7005 3515 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3516 if (i == NULL)
3517 goto done;
3518
3519 err = 0;
3520 for (ml = po->mclist; ml; ml = ml->next) {
3521 if (ml->ifindex == mreq->mr_ifindex &&
3522 ml->type == mreq->mr_type &&
3523 ml->alen == mreq->mr_alen &&
3524 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3525 ml->count++;
3526 /* Free the new element ... */
3527 kfree(i);
3528 goto done;
3529 }
3530 }
3531
3532 i->type = mreq->mr_type;
3533 i->ifindex = mreq->mr_ifindex;
3534 i->alen = mreq->mr_alen;
3535 memcpy(i->addr, mreq->mr_address, i->alen);
309cf37f 3536 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3537 i->count = 1;
3538 i->next = po->mclist;
3539 po->mclist = i;
2aeb0b88
WC
3540 err = packet_dev_mc(dev, i, 1);
3541 if (err) {
3542 po->mclist = i->next;
3543 kfree(i);
3544 }
1da177e4
LT
3545
3546done:
3547 rtnl_unlock();
3548 return err;
3549}
3550
0fb375fb 3551static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3552{
3553 struct packet_mclist *ml, **mlp;
3554
3555 rtnl_lock();
3556
3557 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3558 if (ml->ifindex == mreq->mr_ifindex &&
3559 ml->type == mreq->mr_type &&
3560 ml->alen == mreq->mr_alen &&
3561 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3562 if (--ml->count == 0) {
3563 struct net_device *dev;
3564 *mlp = ml->next;
ad959e76
ED
3565 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3566 if (dev)
1da177e4 3567 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3568 kfree(ml);
3569 }
82f17091 3570 break;
1da177e4
LT
3571 }
3572 }
3573 rtnl_unlock();
82f17091 3574 return 0;
1da177e4
LT
3575}
3576
3577static void packet_flush_mclist(struct sock *sk)
3578{
3579 struct packet_sock *po = pkt_sk(sk);
3580 struct packet_mclist *ml;
3581
3582 if (!po->mclist)
3583 return;
3584
3585 rtnl_lock();
3586 while ((ml = po->mclist) != NULL) {
3587 struct net_device *dev;
3588
3589 po->mclist = ml->next;
ad959e76
ED
3590 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3591 if (dev != NULL)
1da177e4 3592 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3593 kfree(ml);
3594 }
3595 rtnl_unlock();
3596}
1da177e4
LT
3597
3598static int
b7058842 3599packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3600{
3601 struct sock *sk = sock->sk;
8dc41944 3602 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3603 int ret;
3604
3605 if (level != SOL_PACKET)
3606 return -ENOPROTOOPT;
3607
69e3c75f 3608 switch (optname) {
1ce4f28b 3609 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3610 case PACKET_DROP_MEMBERSHIP:
3611 {
0fb375fb
EB
3612 struct packet_mreq_max mreq;
3613 int len = optlen;
3614 memset(&mreq, 0, sizeof(mreq));
3615 if (len < sizeof(struct packet_mreq))
1da177e4 3616 return -EINVAL;
0fb375fb
EB
3617 if (len > sizeof(mreq))
3618 len = sizeof(mreq);
40d4e3df 3619 if (copy_from_user(&mreq, optval, len))
1da177e4 3620 return -EFAULT;
0fb375fb
EB
3621 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3622 return -EINVAL;
1da177e4
LT
3623 if (optname == PACKET_ADD_MEMBERSHIP)
3624 ret = packet_mc_add(sk, &mreq);
3625 else
3626 ret = packet_mc_drop(sk, &mreq);
3627 return ret;
3628 }
a2efcfa0 3629
1da177e4 3630 case PACKET_RX_RING:
69e3c75f 3631 case PACKET_TX_RING:
1da177e4 3632 {
f6fb8f10 3633 union tpacket_req_u req_u;
3634 int len;
1da177e4 3635
5171b37d 3636 lock_sock(sk);
f6fb8f10 3637 switch (po->tp_version) {
3638 case TPACKET_V1:
3639 case TPACKET_V2:
3640 len = sizeof(req_u.req);
3641 break;
3642 case TPACKET_V3:
3643 default:
3644 len = sizeof(req_u.req3);
3645 break;
3646 }
5171b37d
ED
3647 if (optlen < len) {
3648 ret = -EINVAL;
3649 } else {
3650 if (copy_from_user(&req_u.req, optval, len))
3651 ret = -EFAULT;
3652 else
3653 ret = packet_set_ring(sk, &req_u, 0,
3654 optname == PACKET_TX_RING);
3655 }
3656 release_sock(sk);
3657 return ret;
1da177e4
LT
3658 }
3659 case PACKET_COPY_THRESH:
3660 {
3661 int val;
3662
40d4e3df 3663 if (optlen != sizeof(val))
1da177e4 3664 return -EINVAL;
40d4e3df 3665 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3666 return -EFAULT;
3667
3668 pkt_sk(sk)->copy_thresh = val;
3669 return 0;
3670 }
bbd6ef87
PM
3671 case PACKET_VERSION:
3672 {
3673 int val;
3674
3675 if (optlen != sizeof(val))
3676 return -EINVAL;
bbd6ef87
PM
3677 if (copy_from_user(&val, optval, sizeof(val)))
3678 return -EFAULT;
3679 switch (val) {
3680 case TPACKET_V1:
3681 case TPACKET_V2:
f6fb8f10 3682 case TPACKET_V3:
84ac7260 3683 break;
bbd6ef87
PM
3684 default:
3685 return -EINVAL;
3686 }
84ac7260
PP
3687 lock_sock(sk);
3688 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3689 ret = -EBUSY;
3690 } else {
3691 po->tp_version = val;
3692 ret = 0;
3693 }
3694 release_sock(sk);
3695 return ret;
bbd6ef87 3696 }
8913336a
PM
3697 case PACKET_RESERVE:
3698 {
3699 unsigned int val;
3700
3701 if (optlen != sizeof(val))
3702 return -EINVAL;
8913336a
PM
3703 if (copy_from_user(&val, optval, sizeof(val)))
3704 return -EFAULT;
bcc5364b
AK
3705 if (val > INT_MAX)
3706 return -EINVAL;
c27927e3
WB
3707 lock_sock(sk);
3708 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3709 ret = -EBUSY;
3710 } else {
3711 po->tp_reserve = val;
3712 ret = 0;
3713 }
3714 release_sock(sk);
3715 return ret;
8913336a 3716 }
69e3c75f
JB
3717 case PACKET_LOSS:
3718 {
3719 unsigned int val;
3720
3721 if (optlen != sizeof(val))
3722 return -EINVAL;
69e3c75f
JB
3723 if (copy_from_user(&val, optval, sizeof(val)))
3724 return -EFAULT;
a6361f0c
WB
3725
3726 lock_sock(sk);
3727 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3728 ret = -EBUSY;
3729 } else {
3730 po->tp_loss = !!val;
3731 ret = 0;
3732 }
3733 release_sock(sk);
3734 return ret;
69e3c75f 3735 }
8dc41944
HX
3736 case PACKET_AUXDATA:
3737 {
3738 int val;
3739
3740 if (optlen < sizeof(val))
3741 return -EINVAL;
3742 if (copy_from_user(&val, optval, sizeof(val)))
3743 return -EFAULT;
3744
a6361f0c 3745 lock_sock(sk);
8dc41944 3746 po->auxdata = !!val;
a6361f0c 3747 release_sock(sk);
8dc41944
HX
3748 return 0;
3749 }
80feaacb
PWJ
3750 case PACKET_ORIGDEV:
3751 {
3752 int val;
3753
3754 if (optlen < sizeof(val))
3755 return -EINVAL;
3756 if (copy_from_user(&val, optval, sizeof(val)))
3757 return -EFAULT;
3758
a6361f0c 3759 lock_sock(sk);
80feaacb 3760 po->origdev = !!val;
a6361f0c 3761 release_sock(sk);
80feaacb
PWJ
3762 return 0;
3763 }
bfd5f4a3
SS
3764 case PACKET_VNET_HDR:
3765 {
3766 int val;
3767
3768 if (sock->type != SOCK_RAW)
3769 return -EINVAL;
bfd5f4a3
SS
3770 if (optlen < sizeof(val))
3771 return -EINVAL;
3772 if (copy_from_user(&val, optval, sizeof(val)))
3773 return -EFAULT;
3774
a6361f0c
WB
3775 lock_sock(sk);
3776 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3777 ret = -EBUSY;
3778 } else {
3779 po->has_vnet_hdr = !!val;
3780 ret = 0;
3781 }
3782 release_sock(sk);
3783 return ret;
bfd5f4a3 3784 }
614f60fa
SM
3785 case PACKET_TIMESTAMP:
3786 {
3787 int val;
3788
3789 if (optlen != sizeof(val))
3790 return -EINVAL;
3791 if (copy_from_user(&val, optval, sizeof(val)))
3792 return -EFAULT;
3793
3794 po->tp_tstamp = val;
3795 return 0;
3796 }
dc99f600
DM
3797 case PACKET_FANOUT:
3798 {
3799 int val;
3800
3801 if (optlen != sizeof(val))
3802 return -EINVAL;
3803 if (copy_from_user(&val, optval, sizeof(val)))
3804 return -EFAULT;
3805
3806 return fanout_add(sk, val & 0xffff, val >> 16);
3807 }
47dceb8e
WB
3808 case PACKET_FANOUT_DATA:
3809 {
3810 if (!po->fanout)
3811 return -EINVAL;
3812
3813 return fanout_set_data(po, optval, optlen);
3814 }
fa788d98
VW
3815 case PACKET_IGNORE_OUTGOING:
3816 {
3817 int val;
3818
3819 if (optlen != sizeof(val))
3820 return -EINVAL;
3821 if (copy_from_user(&val, optval, sizeof(val)))
3822 return -EFAULT;
3823 if (val < 0 || val > 1)
3824 return -EINVAL;
3825
3826 po->prot_hook.ignore_outgoing = !!val;
3827 return 0;
3828 }
5920cd3a
PC
3829 case PACKET_TX_HAS_OFF:
3830 {
3831 unsigned int val;
3832
3833 if (optlen != sizeof(val))
3834 return -EINVAL;
5920cd3a
PC
3835 if (copy_from_user(&val, optval, sizeof(val)))
3836 return -EFAULT;
a6361f0c
WB
3837
3838 lock_sock(sk);
3839 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3840 ret = -EBUSY;
3841 } else {
3842 po->tp_tx_has_off = !!val;
3843 ret = 0;
3844 }
3845 release_sock(sk);
5920cd3a
PC
3846 return 0;
3847 }
d346a3fa
DB
3848 case PACKET_QDISC_BYPASS:
3849 {
3850 int val;
3851
3852 if (optlen != sizeof(val))
3853 return -EINVAL;
3854 if (copy_from_user(&val, optval, sizeof(val)))
3855 return -EFAULT;
3856
3857 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3858 return 0;
3859 }
1da177e4
LT
3860 default:
3861 return -ENOPROTOOPT;
3862 }
3863}
3864
3865static int packet_getsockopt(struct socket *sock, int level, int optname,
3866 char __user *optval, int __user *optlen)
3867{
3868 int len;
c06fff6e 3869 int val, lv = sizeof(val);
1da177e4
LT
3870 struct sock *sk = sock->sk;
3871 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3872 void *data = &val;
ee80fbf3 3873 union tpacket_stats_u st;
a9b63918 3874 struct tpacket_rollover_stats rstats;
1da177e4
LT
3875
3876 if (level != SOL_PACKET)
3877 return -ENOPROTOOPT;
3878
8ae55f04
KK
3879 if (get_user(len, optlen))
3880 return -EFAULT;
1da177e4
LT
3881
3882 if (len < 0)
3883 return -EINVAL;
1ce4f28b 3884
69e3c75f 3885 switch (optname) {
1da177e4 3886 case PACKET_STATISTICS:
1da177e4 3887 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3888 memcpy(&st, &po->stats, sizeof(st));
3889 memset(&po->stats, 0, sizeof(po->stats));
3890 spin_unlock_bh(&sk->sk_receive_queue.lock);
3891
f6fb8f10 3892 if (po->tp_version == TPACKET_V3) {
c06fff6e 3893 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3894 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3895 data = &st.stats3;
f6fb8f10 3896 } else {
c06fff6e 3897 lv = sizeof(struct tpacket_stats);
8bcdeaff 3898 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3899 data = &st.stats1;
f6fb8f10 3900 }
ee80fbf3 3901
8dc41944
HX
3902 break;
3903 case PACKET_AUXDATA:
8dc41944 3904 val = po->auxdata;
80feaacb
PWJ
3905 break;
3906 case PACKET_ORIGDEV:
80feaacb 3907 val = po->origdev;
bfd5f4a3
SS
3908 break;
3909 case PACKET_VNET_HDR:
bfd5f4a3 3910 val = po->has_vnet_hdr;
1da177e4 3911 break;
bbd6ef87 3912 case PACKET_VERSION:
bbd6ef87 3913 val = po->tp_version;
bbd6ef87
PM
3914 break;
3915 case PACKET_HDRLEN:
3916 if (len > sizeof(int))
3917 len = sizeof(int);
fd2c83b3
AP
3918 if (len < sizeof(int))
3919 return -EINVAL;
bbd6ef87
PM
3920 if (copy_from_user(&val, optval, len))
3921 return -EFAULT;
3922 switch (val) {
3923 case TPACKET_V1:
3924 val = sizeof(struct tpacket_hdr);
3925 break;
3926 case TPACKET_V2:
3927 val = sizeof(struct tpacket2_hdr);
3928 break;
f6fb8f10 3929 case TPACKET_V3:
3930 val = sizeof(struct tpacket3_hdr);
3931 break;
bbd6ef87
PM
3932 default:
3933 return -EINVAL;
3934 }
bbd6ef87 3935 break;
8913336a 3936 case PACKET_RESERVE:
8913336a 3937 val = po->tp_reserve;
8913336a 3938 break;
69e3c75f 3939 case PACKET_LOSS:
69e3c75f 3940 val = po->tp_loss;
69e3c75f 3941 break;
614f60fa 3942 case PACKET_TIMESTAMP:
614f60fa 3943 val = po->tp_tstamp;
614f60fa 3944 break;
dc99f600 3945 case PACKET_FANOUT:
dc99f600
DM
3946 val = (po->fanout ?
3947 ((u32)po->fanout->id |
77f65ebd
WB
3948 ((u32)po->fanout->type << 16) |
3949 ((u32)po->fanout->flags << 24)) :
dc99f600 3950 0);
dc99f600 3951 break;
fa788d98
VW
3952 case PACKET_IGNORE_OUTGOING:
3953 val = po->prot_hook.ignore_outgoing;
3954 break;
a9b63918 3955 case PACKET_ROLLOVER_STATS:
57f015f5 3956 if (!po->rollover)
a9b63918 3957 return -EINVAL;
57f015f5
MM
3958 rstats.tp_all = atomic_long_read(&po->rollover->num);
3959 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3960 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3961 data = &rstats;
3962 lv = sizeof(rstats);
a9b63918 3963 break;
5920cd3a
PC
3964 case PACKET_TX_HAS_OFF:
3965 val = po->tp_tx_has_off;
3966 break;
d346a3fa
DB
3967 case PACKET_QDISC_BYPASS:
3968 val = packet_use_direct_xmit(po);
3969 break;
1da177e4
LT
3970 default:
3971 return -ENOPROTOOPT;
3972 }
3973
c06fff6e
ED
3974 if (len > lv)
3975 len = lv;
8ae55f04
KK
3976 if (put_user(len, optlen))
3977 return -EFAULT;
8dc41944
HX
3978 if (copy_to_user(optval, data, len))
3979 return -EFAULT;
8ae55f04 3980 return 0;
1da177e4
LT
3981}
3982
3983
719c44d3
WB
3984#ifdef CONFIG_COMPAT
3985static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
3986 char __user *optval, unsigned int optlen)
3987{
3988 struct packet_sock *po = pkt_sk(sock->sk);
3989
3990 if (level != SOL_PACKET)
3991 return -ENOPROTOOPT;
3992
3993 if (optname == PACKET_FANOUT_DATA &&
3994 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
3995 optval = (char __user *)get_compat_bpf_fprog(optval);
3996 if (!optval)
3997 return -EFAULT;
3998 optlen = sizeof(struct sock_fprog);
3999 }
4000
4001 return packet_setsockopt(sock, level, optname, optval, optlen);
4002}
4003#endif
4004
351638e7
JP
4005static int packet_notifier(struct notifier_block *this,
4006 unsigned long msg, void *ptr)
1da177e4
LT
4007{
4008 struct sock *sk;
351638e7 4009 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 4010 struct net *net = dev_net(dev);
1da177e4 4011
808f5114 4012 rcu_read_lock();
b67bfe0d 4013 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
4014 struct packet_sock *po = pkt_sk(sk);
4015
4016 switch (msg) {
4017 case NETDEV_UNREGISTER:
1da177e4 4018 if (po->mclist)
82f17091 4019 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
4020 /* fallthrough */
4021
1da177e4
LT
4022 case NETDEV_DOWN:
4023 if (dev->ifindex == po->ifindex) {
4024 spin_lock(&po->bind_lock);
4025 if (po->running) {
ce06b03e 4026 __unregister_prot_hook(sk, false);
1da177e4
LT
4027 sk->sk_err = ENETDOWN;
4028 if (!sock_flag(sk, SOCK_DEAD))
4029 sk->sk_error_report(sk);
4030 }
4031 if (msg == NETDEV_UNREGISTER) {
66e56cd4 4032 packet_cached_dev_reset(po);
1da177e4 4033 po->ifindex = -1;
160ff18a
BG
4034 if (po->prot_hook.dev)
4035 dev_put(po->prot_hook.dev);
1da177e4
LT
4036 po->prot_hook.dev = NULL;
4037 }
4038 spin_unlock(&po->bind_lock);
4039 }
4040 break;
4041 case NETDEV_UP:
808f5114 4042 if (dev->ifindex == po->ifindex) {
4043 spin_lock(&po->bind_lock);
ce06b03e
DM
4044 if (po->num)
4045 register_prot_hook(sk);
808f5114 4046 spin_unlock(&po->bind_lock);
1da177e4 4047 }
1da177e4
LT
4048 break;
4049 }
4050 }
808f5114 4051 rcu_read_unlock();
1da177e4
LT
4052 return NOTIFY_DONE;
4053}
4054
4055
4056static int packet_ioctl(struct socket *sock, unsigned int cmd,
4057 unsigned long arg)
4058{
4059 struct sock *sk = sock->sk;
4060
69e3c75f 4061 switch (cmd) {
40d4e3df
ED
4062 case SIOCOUTQ:
4063 {
4064 int amount = sk_wmem_alloc_get(sk);
31e6d363 4065
40d4e3df
ED
4066 return put_user(amount, (int __user *)arg);
4067 }
4068 case SIOCINQ:
4069 {
4070 struct sk_buff *skb;
4071 int amount = 0;
4072
4073 spin_lock_bh(&sk->sk_receive_queue.lock);
4074 skb = skb_peek(&sk->sk_receive_queue);
4075 if (skb)
4076 amount = skb->len;
4077 spin_unlock_bh(&sk->sk_receive_queue.lock);
4078 return put_user(amount, (int __user *)arg);
4079 }
4080 case SIOCGSTAMP:
4081 return sock_get_timestamp(sk, (struct timeval __user *)arg);
4082 case SIOCGSTAMPNS:
4083 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 4084
1da177e4 4085#ifdef CONFIG_INET
40d4e3df
ED
4086 case SIOCADDRT:
4087 case SIOCDELRT:
4088 case SIOCDARP:
4089 case SIOCGARP:
4090 case SIOCSARP:
4091 case SIOCGIFADDR:
4092 case SIOCSIFADDR:
4093 case SIOCGIFBRDADDR:
4094 case SIOCSIFBRDADDR:
4095 case SIOCGIFNETMASK:
4096 case SIOCSIFNETMASK:
4097 case SIOCGIFDSTADDR:
4098 case SIOCSIFDSTADDR:
4099 case SIOCSIFFLAGS:
40d4e3df 4100 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
4101#endif
4102
40d4e3df
ED
4103 default:
4104 return -ENOIOCTLCMD;
1da177e4
LT
4105 }
4106 return 0;
4107}
4108
a11e1d43
LT
4109static __poll_t packet_poll(struct file *file, struct socket *sock,
4110 poll_table *wait)
1da177e4
LT
4111{
4112 struct sock *sk = sock->sk;
4113 struct packet_sock *po = pkt_sk(sk);
a11e1d43 4114 __poll_t mask = datagram_poll(file, sock, wait);
1da177e4
LT
4115
4116 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 4117 if (po->rx_ring.pg_vec) {
f6fb8f10 4118 if (!packet_previous_rx_frame(po, &po->rx_ring,
4119 TP_STATUS_KERNEL))
a9a08845 4120 mask |= EPOLLIN | EPOLLRDNORM;
1da177e4 4121 }
2ccdbaa6 4122 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
54d7c01d 4123 po->pressure = 0;
1da177e4 4124 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
4125 spin_lock_bh(&sk->sk_write_queue.lock);
4126 if (po->tx_ring.pg_vec) {
4127 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
a9a08845 4128 mask |= EPOLLOUT | EPOLLWRNORM;
69e3c75f
JB
4129 }
4130 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
4131 return mask;
4132}
4133
4134
4135/* Dirty? Well, I still did not learn better way to account
4136 * for user mmaps.
4137 */
4138
4139static void packet_mm_open(struct vm_area_struct *vma)
4140{
4141 struct file *file = vma->vm_file;
40d4e3df 4142 struct socket *sock = file->private_data;
1da177e4 4143 struct sock *sk = sock->sk;
1ce4f28b 4144
1da177e4
LT
4145 if (sk)
4146 atomic_inc(&pkt_sk(sk)->mapped);
4147}
4148
4149static void packet_mm_close(struct vm_area_struct *vma)
4150{
4151 struct file *file = vma->vm_file;
40d4e3df 4152 struct socket *sock = file->private_data;
1da177e4 4153 struct sock *sk = sock->sk;
1ce4f28b 4154
1da177e4
LT
4155 if (sk)
4156 atomic_dec(&pkt_sk(sk)->mapped);
4157}
4158
f0f37e2f 4159static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
4160 .open = packet_mm_open,
4161 .close = packet_mm_close,
1da177e4
LT
4162};
4163
3a7ad063
ED
4164static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4165 unsigned int len)
1da177e4
LT
4166{
4167 int i;
4168
4ebf0ae2 4169 for (i = 0; i < len; i++) {
0e3125c7 4170 if (likely(pg_vec[i].buffer)) {
3a7ad063
ED
4171 if (is_vmalloc_addr(pg_vec[i].buffer))
4172 vfree(pg_vec[i].buffer);
4173 else
4174 free_pages((unsigned long)pg_vec[i].buffer,
4175 order);
0e3125c7
NH
4176 pg_vec[i].buffer = NULL;
4177 }
1da177e4
LT
4178 }
4179 kfree(pg_vec);
4180}
4181
3a7ad063 4182static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 4183{
f0d4eb29 4184 char *buffer;
3a7ad063
ED
4185 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4186 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
0e3125c7 4187
3a7ad063 4188 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
4189 if (buffer)
4190 return buffer;
4191
3a7ad063
ED
4192 /* __get_free_pages failed, fall back to vmalloc */
4193 buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
4194 if (buffer)
4195 return buffer;
0e3125c7 4196
3a7ad063
ED
4197 /* vmalloc failed, lets dig into swap here */
4198 gfp_flags &= ~__GFP_NORETRY;
4199 buffer = (char *) __get_free_pages(gfp_flags, order);
4200 if (buffer)
4201 return buffer;
4202
4203 /* complete and utter failure */
4204 return NULL;
4ebf0ae2
DM
4205}
4206
3a7ad063 4207static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
4208{
4209 unsigned int block_nr = req->tp_block_nr;
0e3125c7 4210 struct pgv *pg_vec;
4ebf0ae2
DM
4211 int i;
4212
0e3125c7 4213 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
4214 if (unlikely(!pg_vec))
4215 goto out;
4216
4217 for (i = 0; i < block_nr; i++) {
3a7ad063 4218 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 4219 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
4220 goto out_free_pgvec;
4221 }
4222
4223out:
4224 return pg_vec;
4225
4226out_free_pgvec:
3a7ad063 4227 free_pg_vec(pg_vec, order, block_nr);
4ebf0ae2
DM
4228 pg_vec = NULL;
4229 goto out;
4230}
1da177e4 4231
f6fb8f10 4232static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 4233 int closing, int tx_ring)
1da177e4 4234{
0e3125c7 4235 struct pgv *pg_vec = NULL;
1da177e4 4236 struct packet_sock *po = pkt_sk(sk);
3a7ad063 4237 int was_running, order = 0;
69e3c75f
JB
4238 struct packet_ring_buffer *rb;
4239 struct sk_buff_head *rb_queue;
0e11c91e 4240 __be16 num;
f6fb8f10 4241 int err = -EINVAL;
4242 /* Added to avoid minimal code churn */
4243 struct tpacket_req *req = &req_u->req;
4244
69e3c75f
JB
4245 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4246 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 4247
69e3c75f
JB
4248 err = -EBUSY;
4249 if (!closing) {
4250 if (atomic_read(&po->mapped))
4251 goto out;
b0138408 4252 if (packet_read_pending(rb))
69e3c75f
JB
4253 goto out;
4254 }
1da177e4 4255
69e3c75f 4256 if (req->tp_block_nr) {
4576cd46
WB
4257 unsigned int min_frame_size;
4258
69e3c75f
JB
4259 /* Sanity tests and some calculations */
4260 err = -EBUSY;
4261 if (unlikely(rb->pg_vec))
4262 goto out;
1da177e4 4263
bbd6ef87
PM
4264 switch (po->tp_version) {
4265 case TPACKET_V1:
4266 po->tp_hdrlen = TPACKET_HDRLEN;
4267 break;
4268 case TPACKET_V2:
4269 po->tp_hdrlen = TPACKET2_HDRLEN;
4270 break;
f6fb8f10 4271 case TPACKET_V3:
4272 po->tp_hdrlen = TPACKET3_HDRLEN;
4273 break;
bbd6ef87
PM
4274 }
4275
69e3c75f 4276 err = -EINVAL;
4ebf0ae2 4277 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 4278 goto out;
90836b67 4279 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
69e3c75f 4280 goto out;
4576cd46 4281 min_frame_size = po->tp_hdrlen + po->tp_reserve;
dc808110 4282 if (po->tp_version >= TPACKET_V3 &&
4576cd46
WB
4283 req->tp_block_size <
4284 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
dc808110 4285 goto out;
4576cd46 4286 if (unlikely(req->tp_frame_size < min_frame_size))
69e3c75f 4287 goto out;
4ebf0ae2 4288 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 4289 goto out;
1da177e4 4290
4194b491
TK
4291 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4292 if (unlikely(rb->frames_per_block == 0))
69e3c75f 4293 goto out;
8f8d28e4
AK
4294 if (unlikely(req->tp_block_size > UINT_MAX / req->tp_block_nr))
4295 goto out;
69e3c75f
JB
4296 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4297 req->tp_frame_nr))
4298 goto out;
1da177e4
LT
4299
4300 err = -ENOMEM;
3a7ad063
ED
4301 order = get_order(req->tp_block_size);
4302 pg_vec = alloc_pg_vec(req, order);
4ebf0ae2 4303 if (unlikely(!pg_vec))
1da177e4 4304 goto out;
f6fb8f10 4305 switch (po->tp_version) {
4306 case TPACKET_V3:
7f953ab2
SV
4307 /* Block transmit is not supported yet */
4308 if (!tx_ring) {
e8e85cc5 4309 init_prb_bdqc(po, rb, pg_vec, req_u);
7f953ab2
SV
4310 } else {
4311 struct tpacket_req3 *req3 = &req_u->req3;
4312
4313 if (req3->tp_retire_blk_tov ||
4314 req3->tp_sizeof_priv ||
4315 req3->tp_feature_req_word) {
4316 err = -EINVAL;
4317 goto out;
4318 }
4319 }
d7cf0c34 4320 break;
f6fb8f10 4321 default:
4322 break;
4323 }
69e3c75f
JB
4324 }
4325 /* Done */
4326 else {
4327 err = -EINVAL;
4ebf0ae2 4328 if (unlikely(req->tp_frame_nr))
69e3c75f 4329 goto out;
1da177e4
LT
4330 }
4331
1da177e4
LT
4332
4333 /* Detach socket from network */
4334 spin_lock(&po->bind_lock);
4335 was_running = po->running;
4336 num = po->num;
4337 if (was_running) {
1da177e4 4338 po->num = 0;
ce06b03e 4339 __unregister_prot_hook(sk, false);
1da177e4
LT
4340 }
4341 spin_unlock(&po->bind_lock);
1ce4f28b 4342
1da177e4
LT
4343 synchronize_net();
4344
4345 err = -EBUSY;
905db440 4346 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
4347 if (closing || atomic_read(&po->mapped) == 0) {
4348 err = 0;
69e3c75f 4349 spin_lock_bh(&rb_queue->lock);
c053fd96 4350 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
4351 rb->frame_max = (req->tp_frame_nr - 1);
4352 rb->head = 0;
4353 rb->frame_size = req->tp_frame_size;
4354 spin_unlock_bh(&rb_queue->lock);
4355
3a7ad063 4356 swap(rb->pg_vec_order, order);
c053fd96 4357 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
4358
4359 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4360 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4361 tpacket_rcv : packet_rcv;
4362 skb_queue_purge(rb_queue);
1da177e4 4363 if (atomic_read(&po->mapped))
40d4e3df
ED
4364 pr_err("packet_mmap: vma is busy: %d\n",
4365 atomic_read(&po->mapped));
1da177e4 4366 }
905db440 4367 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4368
4369 spin_lock(&po->bind_lock);
ce06b03e 4370 if (was_running) {
1da177e4 4371 po->num = num;
ce06b03e 4372 register_prot_hook(sk);
1da177e4
LT
4373 }
4374 spin_unlock(&po->bind_lock);
c800aaf8 4375 if (pg_vec && (po->tp_version > TPACKET_V2)) {
f6fb8f10 4376 /* Because we don't support block-based V3 on tx-ring */
4377 if (!tx_ring)
73d0fcf2 4378 prb_shutdown_retire_blk_timer(po, rb_queue);
f6fb8f10 4379 }
1da177e4 4380
1da177e4 4381 if (pg_vec)
3a7ad063 4382 free_pg_vec(pg_vec, order, req->tp_block_nr);
1da177e4
LT
4383out:
4384 return err;
4385}
4386
69e3c75f
JB
4387static int packet_mmap(struct file *file, struct socket *sock,
4388 struct vm_area_struct *vma)
1da177e4
LT
4389{
4390 struct sock *sk = sock->sk;
4391 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
4392 unsigned long size, expected_size;
4393 struct packet_ring_buffer *rb;
1da177e4
LT
4394 unsigned long start;
4395 int err = -EINVAL;
4396 int i;
4397
4398 if (vma->vm_pgoff)
4399 return -EINVAL;
4400
905db440 4401 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
4402
4403 expected_size = 0;
4404 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4405 if (rb->pg_vec) {
4406 expected_size += rb->pg_vec_len
4407 * rb->pg_vec_pages
4408 * PAGE_SIZE;
4409 }
4410 }
4411
4412 if (expected_size == 0)
1da177e4 4413 goto out;
69e3c75f
JB
4414
4415 size = vma->vm_end - vma->vm_start;
4416 if (size != expected_size)
1da177e4
LT
4417 goto out;
4418
1da177e4 4419 start = vma->vm_start;
69e3c75f
JB
4420 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4421 if (rb->pg_vec == NULL)
4422 continue;
4423
4424 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
4425 struct page *page;
4426 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4427 int pg_num;
4428
c56b4d90
CG
4429 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4430 page = pgv_to_page(kaddr);
69e3c75f
JB
4431 err = vm_insert_page(vma, start, page);
4432 if (unlikely(err))
4433 goto out;
4434 start += PAGE_SIZE;
0e3125c7 4435 kaddr += PAGE_SIZE;
69e3c75f 4436 }
4ebf0ae2 4437 }
1da177e4 4438 }
69e3c75f 4439
4ebf0ae2 4440 atomic_inc(&po->mapped);
1da177e4
LT
4441 vma->vm_ops = &packet_mmap_ops;
4442 err = 0;
4443
4444out:
905db440 4445 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4446 return err;
4447}
1da177e4 4448
90ddc4f0 4449static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4450 .family = PF_PACKET,
4451 .owner = THIS_MODULE,
4452 .release = packet_release,
4453 .bind = packet_bind_spkt,
4454 .connect = sock_no_connect,
4455 .socketpair = sock_no_socketpair,
4456 .accept = sock_no_accept,
4457 .getname = packet_getname_spkt,
a11e1d43 4458 .poll = datagram_poll,
1da177e4
LT
4459 .ioctl = packet_ioctl,
4460 .listen = sock_no_listen,
4461 .shutdown = sock_no_shutdown,
4462 .setsockopt = sock_no_setsockopt,
4463 .getsockopt = sock_no_getsockopt,
4464 .sendmsg = packet_sendmsg_spkt,
4465 .recvmsg = packet_recvmsg,
4466 .mmap = sock_no_mmap,
4467 .sendpage = sock_no_sendpage,
4468};
1da177e4 4469
90ddc4f0 4470static const struct proto_ops packet_ops = {
1da177e4
LT
4471 .family = PF_PACKET,
4472 .owner = THIS_MODULE,
4473 .release = packet_release,
4474 .bind = packet_bind,
4475 .connect = sock_no_connect,
4476 .socketpair = sock_no_socketpair,
4477 .accept = sock_no_accept,
1ce4f28b 4478 .getname = packet_getname,
a11e1d43 4479 .poll = packet_poll,
1da177e4
LT
4480 .ioctl = packet_ioctl,
4481 .listen = sock_no_listen,
4482 .shutdown = sock_no_shutdown,
4483 .setsockopt = packet_setsockopt,
4484 .getsockopt = packet_getsockopt,
719c44d3
WB
4485#ifdef CONFIG_COMPAT
4486 .compat_setsockopt = compat_packet_setsockopt,
4487#endif
1da177e4
LT
4488 .sendmsg = packet_sendmsg,
4489 .recvmsg = packet_recvmsg,
4490 .mmap = packet_mmap,
4491 .sendpage = sock_no_sendpage,
4492};
4493
ec1b4cf7 4494static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4495 .family = PF_PACKET,
4496 .create = packet_create,
4497 .owner = THIS_MODULE,
4498};
4499
4500static struct notifier_block packet_netdev_notifier = {
40d4e3df 4501 .notifier_call = packet_notifier,
1da177e4
LT
4502};
4503
4504#ifdef CONFIG_PROC_FS
1da177e4
LT
4505
4506static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4507 __acquires(RCU)
1da177e4 4508{
e372c414 4509 struct net *net = seq_file_net(seq);
808f5114 4510
4511 rcu_read_lock();
4512 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4513}
4514
4515static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4516{
1bf40954 4517 struct net *net = seq_file_net(seq);
808f5114 4518 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4519}
4520
4521static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4522 __releases(RCU)
1da177e4 4523{
808f5114 4524 rcu_read_unlock();
1da177e4
LT
4525}
4526
1ce4f28b 4527static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4528{
4529 if (v == SEQ_START_TOKEN)
4530 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4531 else {
b7ceabd9 4532 struct sock *s = sk_entry(v);
1da177e4
LT
4533 const struct packet_sock *po = pkt_sk(s);
4534
4535 seq_printf(seq,
71338aa7 4536 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4 4537 s,
41c6d650 4538 refcount_read(&s->sk_refcnt),
1da177e4
LT
4539 s->sk_type,
4540 ntohs(po->num),
4541 po->ifindex,
4542 po->running,
4543 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4544 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4545 sock_i_ino(s));
1da177e4
LT
4546 }
4547
4548 return 0;
4549}
4550
56b3d975 4551static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4552 .start = packet_seq_start,
4553 .next = packet_seq_next,
4554 .stop = packet_seq_stop,
4555 .show = packet_seq_show,
4556};
1da177e4
LT
4557#endif
4558
2c8c1e72 4559static int __net_init packet_net_init(struct net *net)
d12d01d6 4560{
0fa7fa98 4561 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4562 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4563
c3506372
CH
4564 if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
4565 sizeof(struct seq_net_private)))
d12d01d6
DL
4566 return -ENOMEM;
4567
4568 return 0;
4569}
4570
2c8c1e72 4571static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4572{
ece31ffd 4573 remove_proc_entry("packet", net->proc_net);
669f8f1a 4574 WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
d12d01d6
DL
4575}
4576
4577static struct pernet_operations packet_net_ops = {
4578 .init = packet_net_init,
4579 .exit = packet_net_exit,
4580};
4581
4582
1da177e4
LT
4583static void __exit packet_exit(void)
4584{
1da177e4 4585 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4586 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4587 sock_unregister(PF_PACKET);
4588 proto_unregister(&packet_proto);
4589}
4590
4591static int __init packet_init(void)
4592{
4593 int rc = proto_register(&packet_proto, 0);
4594
4595 if (rc != 0)
4596 goto out;
4597
4598 sock_register(&packet_family_ops);
d12d01d6 4599 register_pernet_subsys(&packet_net_ops);
1da177e4 4600 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
4601out:
4602 return rc;
4603}
4604
4605module_init(packet_init);
4606module_exit(packet_exit);
4607MODULE_LICENSE("GPL");
4608MODULE_ALIAS_NETPROTO(PF_PACKET);