/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

#include "internal.h"

/*
   Assumptions:
   - If a device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside of
     the device, but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit into the reserved space (tunnel), others are silly
     (PPP).
   - A packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
                 header. PPP does this, which is wrong, because it introduces
                 asymmetry between rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Resume
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */
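
/* Illustration (not part of the build): a minimal user-space sketch of the
 * two receive flavours described above.  SOCK_RAW hands the frame to the
 * application with the ll header in place, SOCK_DGRAM delivers it with the
 * ll header already pulled; ETH_P_ALL is only an example protocol here.
 *
 *	int raw   = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
 *	int dgram = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 *	// recvfrom(raw, ...)   => buffer starts at the link-layer header
 *	// recvfrom(dgram, ...) => buffer starts at the network header
 */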

/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define PGV_FROM_VMALLOC 1

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);

struct packet_skb_cb {
	union {
		struct sockaddr_pkt pkt;
		union {
			/* Trick: alias skb original length with
			 * ll.sll_family and ll.protocol in order
			 * to save room.
			 */
			unsigned int origlen;
			struct sockaddr_ll ll;
		};
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

static int packet_direct_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	netdev_features_t features;
	struct netdev_queue *txq;
	int ret = NETDEV_TX_BUSY;

	if (unlikely(!netif_running(dev) ||
		     !netif_carrier_ok(dev)))
		goto drop;

	features = netif_skb_features(skb);
	if (skb_needs_linearize(skb, features) &&
	    __skb_linearize(skb))
		goto drop;

	txq = skb_get_tx_queue(dev, skb);

	local_bh_disable();

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_xmit_frozen_or_drv_stopped(txq))
		ret = netdev_start_xmit(skb, dev, txq, false);
	HARD_TX_UNLOCK(dev, txq);

	local_bh_enable();

	if (!dev_xmit_complete(ret))
		kfree_skb(skb);

	return ret;
drop:
	atomic_long_inc(&dev->tx_dropped);
	kfree_skb(skb);
	return NET_XMIT_DROP;
}

static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = rcu_dereference(po->cached_dev);
	if (likely(dev))
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}

static void packet_cached_dev_assign(struct packet_sock *po,
				     struct net_device *dev)
{
	rcu_assign_pointer(po->cached_dev, dev);
}

static void packet_cached_dev_reset(struct packet_sock *po)
{
	RCU_INIT_POINTER(po->cached_dev, NULL);
}

static bool packet_use_direct_xmit(const struct packet_sock *po)
{
	return po->xmit == packet_direct_xmit;
}

static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
}

static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	u16 queue_index;

	if (ops->ndo_select_queue) {
		queue_index = ops->ndo_select_queue(dev, skb, NULL,
						    __packet_pick_tx_queue);
		queue_index = netdev_cap_txqueue(dev, queue_index);
	} else {
		queue_index = __packet_pick_tx_queue(dev, skb);
	}

	skb_set_queue_mapping(skb, queue_index);
}
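
/* Note: packet_direct_xmit()/packet_pick_tx_queue() above implement the
 * PACKET_QDISC_BYPASS path: frames are handed straight to the driver tx
 * queue, skipping the qdisc layer, and the queue is picked per-CPU unless
 * the driver supplies its own ndo_select_queue().
 */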
332
ce06b03e
DM
333/* register_prot_hook must be invoked with the po->bind_lock held,
334 * or from a context in which asynchronous accesses to the packet
335 * socket is not possible (packet_create()).
336 */
337static void register_prot_hook(struct sock *sk)
338{
339 struct packet_sock *po = pkt_sk(sk);
e40526cb 340
ce06b03e 341 if (!po->running) {
66e56cd4 342 if (po->fanout)
dc99f600 343 __fanout_link(sk, po);
66e56cd4 344 else
dc99f600 345 dev_add_pack(&po->prot_hook);
e40526cb 346
ce06b03e
DM
347 sock_hold(sk);
348 po->running = 1;
349 }
350}
351
352/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
353 * held. If the sync parameter is true, we will temporarily drop
354 * the po->bind_lock and do a synchronize_net to make sure no
355 * asynchronous packet processing paths still refer to the elements
356 * of po->prot_hook. If the sync parameter is false, it is the
357 * callers responsibility to take care of this.
358 */
359static void __unregister_prot_hook(struct sock *sk, bool sync)
360{
361 struct packet_sock *po = pkt_sk(sk);
362
363 po->running = 0;
66e56cd4
DB
364
365 if (po->fanout)
dc99f600 366 __fanout_unlink(sk, po);
66e56cd4 367 else
dc99f600 368 __dev_remove_pack(&po->prot_hook);
e40526cb 369
ce06b03e
DM
370 __sock_put(sk);
371
372 if (sync) {
373 spin_unlock(&po->bind_lock);
374 synchronize_net();
375 spin_lock(&po->bind_lock);
376 }
377}
378
379static void unregister_prot_hook(struct sock *sk, bool sync)
380{
381 struct packet_sock *po = pkt_sk(sk);
382
383 if (po->running)
384 __unregister_prot_hook(sk, sync);
385}
386
6e58040b 387static inline struct page * __pure pgv_to_page(void *addr)
0af55bb5
CG
388{
389 if (is_vmalloc_addr(addr))
390 return vmalloc_to_page(addr);
391 return virt_to_page(addr);
392}
393
69e3c75f 394static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 395{
184f489e 396 union tpacket_uhdr h;
1da177e4 397
69e3c75f 398 h.raw = frame;
bbd6ef87
PM
399 switch (po->tp_version) {
400 case TPACKET_V1:
69e3c75f 401 h.h1->tp_status = status;
0af55bb5 402 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
403 break;
404 case TPACKET_V2:
69e3c75f 405 h.h2->tp_status = status;
0af55bb5 406 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 407 break;
f6fb8f10 408 case TPACKET_V3:
69e3c75f 409 default:
f6fb8f10 410 WARN(1, "TPACKET version not supported.\n");
69e3c75f 411 BUG();
bbd6ef87 412 }
69e3c75f
JB
413
414 smp_wmb();
bbd6ef87
PM
415}
416
69e3c75f 417static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87 418{
184f489e 419 union tpacket_uhdr h;
bbd6ef87 420
69e3c75f
JB
421 smp_rmb();
422
bbd6ef87
PM
423 h.raw = frame;
424 switch (po->tp_version) {
425 case TPACKET_V1:
0af55bb5 426 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 427 return h.h1->tp_status;
bbd6ef87 428 case TPACKET_V2:
0af55bb5 429 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 430 return h.h2->tp_status;
f6fb8f10 431 case TPACKET_V3:
69e3c75f 432 default:
f6fb8f10 433 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
434 BUG();
435 return 0;
bbd6ef87 436 }
1da177e4 437}
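
/* The tp_status word above is the kernel/user hand-off for each ring slot:
 * the kernel only writes into frames it sees as TP_STATUS_KERNEL, user space
 * only reads frames flagged TP_STATUS_USER and hands them back by resetting
 * the status.  A rough sketch of the user-space side of that protocol
 * (poll loop and consume() helper are assumed, TPACKET_V1/V2 rx ring):
 *
 *	struct tpacket_hdr *hdr = frame;
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);
 *	consume(frame);
 *	hdr->tp_status = TP_STATUS_KERNEL;
 */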

static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps &&
	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
		return TP_STATUS_TS_RAW_HARDWARE;

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}

static void *packet_lookup_frame(struct packet_sock *po,
				 struct packet_ring_buffer *rb,
				 unsigned int position,
				 int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
		int tx_ring,
		struct sk_buff_head *rb_queue)
{
	struct tpacket_kbdq_core *pkc;

	pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
			GET_PBDQC_FROM_RB(&po->rx_ring);

	spin_lock_bh(&rb_queue->lock);
	pkc->delete_blk_timer = 1;
	spin_unlock_bh(&rb_queue->lock);

	prb_del_retire_blk_timer(pkc);
}

static void prb_init_blk_timer(struct packet_sock *po,
		struct tpacket_kbdq_core *pkc,
		void (*func) (unsigned long))
{
	init_timer(&pkc->retire_blk_timer);
	pkc->retire_blk_timer.data = (long)po;
	pkc->retire_blk_timer.function = func;
	pkc->retire_blk_timer.expires = jiffies;
}

static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
	struct tpacket_kbdq_core *pkc;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
}

static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_cmd ecmd;
	int err;
	u32 speed;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_settings(dev, &ecmd);
	speed = ethtool_cmd_speed(&ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}
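
/* Worked example of the formula above (illustrative numbers): a 1 MiB block
 * is 8 Mbit of data; on a 1 Gbit/s link (div = 1000/1000 = 1, msec = 1) that
 * gives mbits = 8, so tmo = 8 ms plus one, i.e. the block is force-retired
 * after roughly 9 ms if traffic stops before it fills.
 */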

static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
			union tpacket_req_u *req_u)
{
	p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

static void init_prb_bdqc(struct packet_sock *po,
			struct packet_ring_buffer *rb,
			struct pgv *pg_vec,
			union tpacket_req_u *req_u)
{
	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd;

	memset(p1, 0x0, sizeof(*p1));

	p1->knxt_seq_num = 1;
	p1->pkbdq = pg_vec;
	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
	p1->pkblk_start	= pg_vec[0].buffer;
	p1->kblk_size = req_u->req3.tp_block_size;
	p1->knum_blocks	= req_u->req3.tp_block_nr;
	p1->hdrlen = po->tp_hdrlen;
	p1->version = po->tp_version;
	p1->last_kactive_blk_num = 0;
	po->stats.stats3.tp_freeze_q_cnt = 0;
	if (req_u->req3.tp_retire_blk_tov)
		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
	else
		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
						req_u->req3.tp_block_size);
	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
	prb_init_ft_ops(p1, req_u);
	prb_setup_retire_blk_timer(po);
	prb_open_block(p1, pbd);
}

/* Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	mod_timer(&pkc->retire_blk_timer,
			jiffies + pkc->tov_in_jiffies);
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
static void prb_retire_rx_blk_timer_expired(unsigned long data)
{
	struct packet_sock *po = (struct packet_sock *)data;
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	unsigned int frozen;
	struct tpacket_block_desc *pbd;

	spin_lock(&po->sk.sk_receive_queue.lock);

	frozen = prb_queue_frozen(pkc);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	if (unlikely(pkc->delete_blk_timer))
		goto out;

	/* We only need to plug the race when the block is partially filled.
	 * tpacket_rcv:
	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
	 *		copy_bits() is in progress ...
	 *		timer fires on other cpu:
	 *		we can't retire the current block because copy_bits
	 *		is in progress.
	 *
	 */
	if (BLOCK_NUM_PKTS(pbd)) {
		while (atomic_read(&pkc->blk_fill_in_prog)) {
			/* Waiting for skb_copy_bits to finish... */
			cpu_relax();
		}
	}

	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
		if (!frozen) {
			if (!BLOCK_NUM_PKTS(pbd)) {
				/* An empty block. Just refresh the timer. */
				goto refresh_timer;
			}
			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
			if (!prb_dispatch_next_block(pkc, po))
				goto refresh_timer;
			else
				goto out;
		} else {
			/* Case 1. Queue was frozen because user-space was
			 * lagging behind.
			 */
			if (prb_curr_blk_in_use(pkc, pbd)) {
				/*
				 * Ok, user-space is still behind.
				 * So just refresh the timer.
				 */
				goto refresh_timer;
			} else {
				/* Case 2. Queue was frozen, user-space caught up,
				 * now the link went idle && the timer fired.
				 * We don't have a block to close. So we open this
				 * block and restart the timer.
				 * Opening a block thaws the queue and restarts the
				 * timer; thawing/timer-refresh is a side effect.
				 */
				prb_open_block(pkc, pbd);
				goto out;
			}
		}
	}

refresh_timer:
	_prb_refresh_rx_retire_blk_timer(pkc);

out:
	spin_unlock(&po->sk.sk_receive_queue.lock);
}

static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1, __u32 status)
{
	/* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	u8 *start, *end;

	start = (u8 *)pbd1;

	/* Skip the block header (we know the header WILL fit in 4K) */
	start += PAGE_SIZE;

	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
	for (; start < end; start += PAGE_SIZE)
		flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif

	/* Now update the block status. */

	BLOCK_STATUS(pbd1) = status;

	/* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
	start = (u8 *)pbd1;
	flush_dcache_page(pgv_to_page(start));

	smp_wmb();
#endif
}

/*
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note: We DONT refresh the timer on purpose.
 *	 Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
		struct tpacket_block_desc *pbd1,
		struct packet_sock *po, unsigned int stat)
{
	__u32 status = TP_STATUS_USER | stat;

	struct tpacket3_hdr *last_pkt;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
	struct sock *sk = &po->sk;

	if (po->stats.stats3.tp_drops)
		status |= TP_STATUS_LOSING;

	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
	last_pkt->tp_next_offset = 0;

	/* Get the ts of the last pkt */
	if (BLOCK_NUM_PKTS(pbd1)) {
		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
	} else {
		/* Ok, we tmo'd - so get the current time.
		 *
		 * It shouldn't really happen as we don't close empty
		 * blocks. See prb_retire_rx_blk_timer_expired().
		 */
		struct timespec ts;
		getnstimeofday(&ts);
		h1->ts_last_pkt.ts_sec = ts.tv_sec;
		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
	}

	smp_wmb();

	/* Flush the block */
	prb_flush_block(pkc1, pbd1, status);

	sk->sk_data_ready(sk);

	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
	pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
	struct tpacket_block_desc *pbd1)
{
	struct timespec ts;
	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

	smp_rmb();

	/* We could have just memset this but we will lose the
	 * flexibility of making the priv area sticky
	 */

	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
	BLOCK_NUM_PKTS(pbd1) = 0;
	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	getnstimeofday(&ts);

	h1->ts_first_pkt.ts_sec = ts.tv_sec;
	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

	pkc1->pkblk_start = (char *)pbd1;
	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

	pbd1->version = pkc1->version;
	pkc1->prev = pkc1->nxt_offset;
	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

	prb_thaw_queue(pkc1);
	_prb_refresh_rx_retire_blk_timer(pkc1);

	smp_wmb();
}

/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
			  struct packet_sock *po)
{
	pkc->reset_pending_on_curr_blk = 1;
	po->stats.stats3.tp_freeze_q_cnt++;
}

#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * If the next block is free then we will dispatch it
 * and return a good offset.
 * Else, we will freeze the queue.
 * So, the caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po)
{
	struct tpacket_block_desc *pbd;

	smp_rmb();

	/* 1. Get current block num */
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* 2. If this block is currently in_use then freeze the queue */
	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
		prb_freeze_queue(pkc, po);
		return NULL;
	}

	/*
	 * 3.
	 * open this block and return the offset where the first packet
	 * needs to get stored.
	 */
	prb_open_block(pkc, pbd);
	return (void *)pkc->nxt_offset;
}

static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
		struct packet_sock *po, unsigned int status)
{
	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* retire/close the current block */
	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
		/*
		 * Plug the case where copy_bits() is in progress on
		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
		 * have space to copy the pkt in the current block and
		 * called prb_retire_current_block()
		 *
		 * We don't need to worry about the TMO case because
		 * the timer-handler already handled this case.
		 */
		if (!(status & TP_STATUS_BLK_TMO)) {
			while (atomic_read(&pkc->blk_fill_in_prog)) {
				/* Waiting for skb_copy_bits to finish... */
				cpu_relax();
			}
		}
		prb_close_block(pkc, pbd, po, status);
		return;
	}
}

static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
		      struct tpacket_block_desc *pbd)
{
	return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
	return pkc->reset_pending_on_curr_blk;
}

static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	atomic_dec(&pkc->blk_fill_in_prog);
}

static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_rxhash = 0;
}

static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	if (skb_vlan_tag_present(pkc->skb)) {
		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
	} else {
		ppd->hv1.tp_vlan_tci = 0;
		ppd->hv1.tp_vlan_tpid = 0;
		ppd->tp_status = TP_STATUS_AVAILABLE;
	}
}

static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
			struct tpacket3_hdr *ppd)
{
	ppd->hv1.tp_padding = 0;
	prb_fill_vlan_info(pkc, ppd);

	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
		prb_fill_rxhash(pkc, ppd);
	else
		prb_clear_rxhash(pkc, ppd);
}

static void prb_fill_curr_block(char *curr,
				struct tpacket_kbdq_core *pkc,
				struct tpacket_block_desc *pbd,
				unsigned int len)
{
	struct tpacket3_hdr *ppd;

	ppd = (struct tpacket3_hdr *)curr;
	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
	pkc->prev = curr;
	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
	BLOCK_NUM_PKTS(pbd) += 1;
	atomic_inc(&pkc->blk_fill_in_prog);
	prb_run_all_ft_ops(pkc, ppd);
}

/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status,
					    unsigned int len
					    )
{
	struct tpacket_kbdq_core *pkc;
	struct tpacket_block_desc *pbd;
	char *curr, *end;

	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

	/* Queue is frozen when user space is lagging behind */
	if (prb_queue_frozen(pkc)) {
		/*
		 * Check if the last block which caused the queue to freeze
		 * is still in_use by user-space.
		 */
		if (prb_curr_blk_in_use(pkc, pbd)) {
			/* Can't record this packet */
			return NULL;
		} else {
			/*
			 * Ok, the block was released by user-space.
			 * Now let's open that block.
			 * Opening a block also thaws the queue.
			 * Thawing is a side effect.
			 */
			prb_open_block(pkc, pbd);
		}
	}

	smp_mb();
	curr = pkc->nxt_offset;
	pkc->skb = skb;
	end = (char *)pbd + pkc->kblk_size;

	/* first try the current block */
	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/* Ok, close the current block */
	prb_retire_current_block(pkc, po, 0);

	/* Now, try to dispatch the next block */
	curr = (char *)prb_dispatch_next_block(pkc, po);
	if (curr) {
		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
		prb_fill_curr_block(curr, pkc, pbd, len);
		return (void *)curr;
	}

	/*
	 * No free blocks are available. user_space hasn't caught up yet.
	 * Queue was just frozen and now this packet will get dropped.
	 */
	return NULL;
}

static void *packet_current_rx_frame(struct packet_sock *po,
					    struct sk_buff *skb,
					    int status, unsigned int len)
{
	char *curr = NULL;
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		curr = packet_lookup_frame(po, &po->rx_ring,
					po->rx_ring.head, status);
		return curr;
	case TPACKET_V3:
		return __packet_lookup_frame_in_block(po, skb, status, len);
	default:
		WARN(1, "TPACKET version not supported\n");
		BUG();
		return NULL;
	}
}

static void *prb_lookup_block(struct packet_sock *po,
				     struct packet_ring_buffer *rb,
				     unsigned int idx,
				     int status)
{
	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);

	if (status != BLOCK_STATUS(pbd))
		return NULL;
	return pbd;
}

static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
	unsigned int prev;
	if (rb->prb_bdqc.kactive_blk_num)
		prev = rb->prb_bdqc.kactive_blk_num-1;
	else
		prev = rb->prb_bdqc.knum_blocks-1;
	return prev;
}

/* Assumes caller has held the rx_queue.lock */
static void *__prb_previous_block(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	unsigned int previous = prb_previous_blk_num(rb);
	return prb_lookup_block(po, rb, previous, status);
}

static void *packet_previous_rx_frame(struct packet_sock *po,
					     struct packet_ring_buffer *rb,
					     int status)
{
	if (po->tp_version <= TPACKET_V2)
		return packet_previous_frame(po, rb, status);

	return __prb_previous_block(po, rb, status);
}

static void packet_increment_rx_head(struct packet_sock *po,
					    struct packet_ring_buffer *rb)
{
	switch (po->tp_version) {
	case TPACKET_V1:
	case TPACKET_V2:
		return packet_increment_head(rb);
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return;
	}
}

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

static void packet_inc_pending(struct packet_ring_buffer *rb)
{
	this_cpu_inc(*rb->pending_refcnt);
}

static void packet_dec_pending(struct packet_ring_buffer *rb)
{
	this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
	unsigned int refcnt = 0;
	int cpu;

	/* We don't use pending refcount in rx_ring. */
	if (rb->pending_refcnt == NULL)
		return 0;

	for_each_possible_cpu(cpu)
		refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

	return refcnt;
}
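
/* The TX ring tracks "frames handed to the driver but not yet freed" with a
 * per-cpu counter: packet_inc_pending()/packet_dec_pending() stay cheap on
 * the fast path, and only the transmit loop pays for the full per-cpu sum
 * via packet_read_pending() when it has to decide whether to keep waiting
 * for completions.
 */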

static int packet_alloc_pending(struct packet_sock *po)
{
	po->rx_ring.pending_refcnt = NULL;

	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
	if (unlikely(po->tx_ring.pending_refcnt == NULL))
		return -ENOBUFS;

	return 0;
}

static void packet_free_pending(struct packet_sock *po)
{
	free_percpu(po->tx_ring.pending_refcnt);
}

#define ROOM_POW_OFF	2
#define ROOM_NONE	0x0
#define ROOM_LOW	0x1
#define ROOM_NORMAL	0x2

static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.frame_max + 1;
	idx = po->rx_ring.head;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.prb_bdqc.knum_blocks;
	idx = po->rx_ring.prb_bdqc.kactive_blk_num;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	struct sock *sk = &po->sk;
	int ret = ROOM_NONE;

	if (po->prot_hook.func != tpacket_rcv) {
		int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
					  - (skb ? skb->truesize : 0);
		if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
			return ROOM_NORMAL;
		else if (avail > 0)
			return ROOM_LOW;
		else
			return ROOM_NONE;
	}

	if (po->tp_version == TPACKET_V3) {
		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_v3_has_room(po, 0))
			ret = ROOM_LOW;
	} else {
		if (__tpacket_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_has_room(po, 0))
			ret = ROOM_LOW;
	}

	return ret;
}

static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	int ret;
	bool has_room;

	spin_lock_bh(&po->sk.sk_receive_queue.lock);
	ret = __packet_rcv_has_room(po, skb);
	has_room = ret == ROOM_NORMAL;
	if (po->pressure == has_room)
		po->pressure = !has_room;
	spin_unlock_bh(&po->sk.sk_receive_queue.lock);

	return ret;
}

static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}

static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
{
	u32 rxhash;
	int i, count = 0;

	rxhash = skb_get_hash(skb);
	for (i = 0; i < ROLLOVER_HLEN; i++)
		if (po->rollover->history[i] == rxhash)
			count++;

	po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
	return count > (ROLLOVER_HLEN >> 1);
}

static unsigned int fanout_demux_hash(struct packet_fanout *f,
				      struct sk_buff *skb,
				      unsigned int num)
{
	return reciprocal_scale(skb_get_hash(skb), num);
}

static unsigned int fanout_demux_lb(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	unsigned int val = atomic_inc_return(&f->rr_cur);

	return val % num;
}

static unsigned int fanout_demux_cpu(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return smp_processor_id() % num;
}

static unsigned int fanout_demux_rnd(struct packet_fanout *f,
				     struct sk_buff *skb,
				     unsigned int num)
{
	return prandom_u32_max(num);
}

static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, bool try_self,
					  unsigned int num)
{
	struct packet_sock *po, *po_next, *po_skip = NULL;
	unsigned int i, j, room = ROOM_NONE;

	po = pkt_sk(f->arr[idx]);

	if (try_self) {
		room = packet_rcv_has_room(po, skb);
		if (room == ROOM_NORMAL ||
		    (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
			return idx;
		po_skip = po;
	}

	i = j = min_t(int, po->rollover->sock, num - 1);
	do {
		po_next = pkt_sk(f->arr[i]);
		if (po_next != po_skip && !po_next->pressure &&
		    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
			if (i != j)
				po->rollover->sock = i;
			atomic_long_inc(&po->rollover->num);
			if (room == ROOM_LOW)
				atomic_long_inc(&po->rollover->num_huge);
			return i;
		}

		if (++i == num)
			i = 0;
	} while (i != j);

	atomic_long_inc(&po->rollover->num_failed);
	return idx;
}

static unsigned int fanout_demux_qm(struct packet_fanout *f,
				    struct sk_buff *skb,
				    unsigned int num)
{
	return skb_get_queue_mapping(skb) % num;
}

static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
	return f->flags & (flag >> 8);
}

static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	unsigned int num = READ_ONCE(f->num_members);
	struct packet_sock *po;
	unsigned int idx;

	if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
	    !num) {
		kfree_skb(skb);
		return 0;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
		skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
		if (!skb)
			return 0;
	}
	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		idx = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		idx = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		idx = fanout_demux_cpu(f, skb, num);
		break;
	case PACKET_FANOUT_RND:
		idx = fanout_demux_rnd(f, skb, num);
		break;
	case PACKET_FANOUT_QM:
		idx = fanout_demux_qm(f, skb, num);
		break;
	case PACKET_FANOUT_ROLLOVER:
		idx = fanout_demux_rollover(f, skb, 0, false, num);
		break;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
		idx = fanout_demux_rollover(f, skb, idx, true, num);

	po = pkt_sk(f->arr[idx]);
	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}
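
/* User space joins one of the demux modes above through the PACKET_FANOUT
 * socket option; a rough sketch (group id 42 and hash mode are assumptions
 * picked for the example):
 *
 *	int arg = 42 | (PACKET_FANOUT_HASH << 16);
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
 *
 * All sockets that register the same {net, id} pair end up in f->arr[], and
 * packet_rcv_fanout() picks one of them per packet.
 */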

DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
static LIST_HEAD(fanout_list);

static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	smp_wmb();
	f->num_members++;
	spin_unlock(&f->lock);
}

static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	spin_unlock(&f->lock);
}

static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
	if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
		return true;

	return false;
}

static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f, *match;
	u8 type = type_flags & 0xff;
	u8 flags = type_flags >> 8;
	int err;

	switch (type) {
	case PACKET_FANOUT_ROLLOVER:
		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
			return -EINVAL;
	case PACKET_FANOUT_HASH:
	case PACKET_FANOUT_LB:
	case PACKET_FANOUT_CPU:
	case PACKET_FANOUT_RND:
	case PACKET_FANOUT_QM:
		break;
	default:
		return -EINVAL;
	}

	if (!po->running)
		return -EINVAL;

	if (po->fanout)
		return -EALREADY;

	if (type == PACKET_FANOUT_ROLLOVER ||
	    (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
		po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL);
		if (!po->rollover)
			return -ENOMEM;
		atomic_long_set(&po->rollover->num, 0);
		atomic_long_set(&po->rollover->num_huge, 0);
		atomic_long_set(&po->rollover->num_failed, 0);
	}

	mutex_lock(&fanout_mutex);
	match = NULL;
	list_for_each_entry(f, &fanout_list, list) {
		if (f->id == id &&
		    read_pnet(&f->net) == sock_net(sk)) {
			match = f;
			break;
		}
	}
	err = -EINVAL;
	if (match && match->flags != flags)
		goto out;
	if (!match) {
		err = -ENOMEM;
		match = kzalloc(sizeof(*match), GFP_KERNEL);
		if (!match)
			goto out;
		write_pnet(&match->net, sock_net(sk));
		match->id = id;
		match->type = type;
		match->flags = flags;
		atomic_set(&match->rr_cur, 0);
		INIT_LIST_HEAD(&match->list);
		spin_lock_init(&match->lock);
		atomic_set(&match->sk_ref, 0);
		match->prot_hook.type = po->prot_hook.type;
		match->prot_hook.dev = po->prot_hook.dev;
		match->prot_hook.func = packet_rcv_fanout;
		match->prot_hook.af_packet_priv = match;
		match->prot_hook.id_match = match_fanout_group;
		dev_add_pack(&match->prot_hook);
		list_add(&match->list, &fanout_list);
	}
	err = -EINVAL;
	if (match->type == type &&
	    match->prot_hook.type == po->prot_hook.type &&
	    match->prot_hook.dev == po->prot_hook.dev) {
		err = -ENOSPC;
		if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
			__dev_remove_pack(&po->prot_hook);
			po->fanout = match;
			atomic_inc(&match->sk_ref);
			__fanout_link(sk, po);
			err = 0;
		}
	}
out:
	mutex_unlock(&fanout_mutex);
	if (err) {
		kfree(po->rollover);
		po->rollover = NULL;
	}
	return err;
}

static void fanout_release(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_fanout *f;

	f = po->fanout;
	if (!f)
		return;

	mutex_lock(&fanout_mutex);
	po->fanout = NULL;

	if (atomic_dec_and_test(&f->sk_ref)) {
		list_del(&f->list);
		dev_remove_pack(&f->prot_hook);
		kfree(f);
	}
	mutex_unlock(&fanout_mutex);

	if (po->rollover)
		kfree_rcu(po->rollover, rcu);
}

static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so that this procedure is noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}


/*
 *	Output a raw packet to a device layer. This bypasses all the other
 *	protocol layers and you must therefore supply it with a complete frame
 */

static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	__be16 proto = 0;
	int err;
	int extra_len = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */

	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
		if (!netif_supports_nofcs(dev)) {
			err = -EPROTONOSUPPORT;
			goto out_unlock;
		}
		extra_len = 4; /* We're doing our own CRC */
	}

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
		goto out_unlock;

	if (!skb) {
		size_t reserved = LL_RESERVED_SPACE(dev);
		int tlen = dev->needed_tailroom;
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();
		skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
		/* FIXME: Save some space for broken drivers that write a hard
		 * header at transmission time by themselves. PPP is the notable
		 * one here. This should really be fixed at the driver level.
		 */
		skb_reserve(skb, reserved);
		skb_reset_network_header(skb);

		/* Try to align data part correctly */
		if (hhlen) {
			skb->data -= hhlen;
			skb->tail -= hhlen;
			if (len < hhlen)
				skb_reset_network_header(skb);
		}
		err = memcpy_from_msg(skb_put(skb, len), msg, len);
		if (err)
			goto out_free;
		goto retry;
	}

	if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
		/* Earlier code assumed this would be a VLAN pkt,
		 * double-check this now that we have the actual
		 * packet in hand.
		 */
		struct ethhdr *ehdr;
		skb_reset_mac_header(skb);
		ehdr = eth_hdr(skb);
		if (ehdr->h_proto != htons(ETH_P_8021Q)) {
			err = -EMSGSIZE;
			goto out_unlock;
		}
	}

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);

	if (unlikely(extra_len == 4))
		skb->no_fcs = 1;

	skb_probe_transport_header(skb, 0);

	dev_queue_xmit(skb);
	rcu_read_unlock();
	return len;

out_unlock:
	rcu_read_unlock();
out_free:
	kfree_skb(skb);
	return err;
}
1da177e4 1820
eea49cc9 1821static unsigned int run_filter(const struct sk_buff *skb,
62ab0812 1822 const struct sock *sk,
dbcb5855 1823 unsigned int res)
1da177e4
LT
1824{
1825 struct sk_filter *filter;
fda9ef5d 1826
80f8f102
ED
1827 rcu_read_lock();
1828 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1829 if (filter != NULL)
0a14842f 1830 res = SK_RUN_FILTER(filter, skb);
80f8f102 1831 rcu_read_unlock();
1da177e4 1832
dbcb5855 1833 return res;
1da177e4
LT
1834}
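
/*
 * Illustrative userspace sketch (not part of af_packet.c): run_filter()
 * above executes whatever classic BPF program userspace attached with
 * SO_ATTACH_FILTER. A minimal program that keeps only ARP frames might be
 * attached like this; error handling is omitted.
 */
#include <sys/socket.h>
#include <linux/filter.h>
#include <linux/if_ether.h>

static int attach_arp_only_filter(int fd)
{
	/* ldh [12]; jeq ETH_P_ARP ? accept whole packet : drop */
	struct sock_filter code[] = {
		{ BPF_LD  | BPF_H   | BPF_ABS, 0, 0, 12 },
		{ BPF_JMP | BPF_JEQ | BPF_K,   0, 1, ETH_P_ARP },
		{ BPF_RET | BPF_K,             0, 0, 0xffffffff },
		{ BPF_RET | BPF_K,             0, 0, 0 },
	};
	struct sock_fprog prog = {
		.len    = sizeof(code) / sizeof(code[0]),
		.filter = code,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
}
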
1835
1836/*
62ab0812
ED
1837 * This function makes lazy skb cloning in the hope that most packets
1838 * are discarded by BPF.
1839 *
1840 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
1841 * and skb->cb are mangled. It works because (and until) packets
1842 * falling here are owned by the current CPU. Output packets are cloned
1843 * by dev_queue_xmit_nit(), input packets are processed by net_bh
1844 * sequentially, so if we return the skb to its original state on exit,
1845 * we will not harm anyone.
1da177e4
LT
1846 */
1847
40d4e3df
ED
1848static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1849 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1850{
1851 struct sock *sk;
1852 struct sockaddr_ll *sll;
1853 struct packet_sock *po;
40d4e3df 1854 u8 *skb_head = skb->data;
1da177e4 1855 int skb_len = skb->len;
dbcb5855 1856 unsigned int snaplen, res;
1da177e4
LT
1857
1858 if (skb->pkt_type == PACKET_LOOPBACK)
1859 goto drop;
1860
1861 sk = pt->af_packet_priv;
1862 po = pkt_sk(sk);
1863
09ad9bc7 1864 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1865 goto drop;
1866
1da177e4
LT
1867 skb->dev = dev;
1868
3b04ddde 1869 if (dev->header_ops) {
1da177e4 1870 /* The device has an explicit notion of ll header,
62ab0812
ED
1871 * exported to higher levels.
1872 *
1873 * Otherwise, the device hides details of its frame
1874 * structure, so that the corresponding packet head is
1875 * never delivered to the user.
1da177e4
LT
1876 */
1877 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1878 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1879 else if (skb->pkt_type == PACKET_OUTGOING) {
1880 /* Special case: outgoing packets have ll header at head */
bbe735e4 1881 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1882 }
1883 }
1884
1885 snaplen = skb->len;
1886
dbcb5855
DM
1887 res = run_filter(skb, sk, snaplen);
1888 if (!res)
fda9ef5d 1889 goto drop_n_restore;
dbcb5855
DM
1890 if (snaplen > res)
1891 snaplen = res;
1da177e4 1892
0fd7bac6 1893 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
1894 goto drop_n_acct;
1895
1896 if (skb_shared(skb)) {
1897 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1898 if (nskb == NULL)
1899 goto drop_n_acct;
1900
1901 if (skb_head != skb->data) {
1902 skb->data = skb_head;
1903 skb->len = skb_len;
1904 }
abc4e4fa 1905 consume_skb(skb);
1da177e4
LT
1906 skb = nskb;
1907 }
1908
b4772ef8 1909 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
1910
1911 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 1912 sll->sll_hatype = dev->type;
1da177e4 1913 sll->sll_pkttype = skb->pkt_type;
8032b464 1914 if (unlikely(po->origdev))
80feaacb
PWJ
1915 sll->sll_ifindex = orig_dev->ifindex;
1916 else
1917 sll->sll_ifindex = dev->ifindex;
1da177e4 1918
b95cce35 1919 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 1920
2472d761
EB
1921 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
1922 * Use their space for storing the original skb length.
1923 */
1924 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 1925
1da177e4
LT
1926 if (pskb_trim(skb, snaplen))
1927 goto drop_n_acct;
1928
1929 skb_set_owner_r(skb, sk);
1930 skb->dev = NULL;
adf30907 1931 skb_dst_drop(skb);
1da177e4 1932
84531c24
PO
1933 /* drop conntrack reference */
1934 nf_reset(skb);
1935
1da177e4 1936 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1937 po->stats.stats1.tp_packets++;
3bc3b96f 1938 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
1939 __skb_queue_tail(&sk->sk_receive_queue, skb);
1940 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 1941 sk->sk_data_ready(sk);
1da177e4
LT
1942 return 0;
1943
1944drop_n_acct:
7091fbd8 1945 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1946 po->stats.stats1.tp_drops++;
7091fbd8
WB
1947 atomic_inc(&sk->sk_drops);
1948 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
1949
1950drop_n_restore:
1951 if (skb_head != skb->data && skb_shared(skb)) {
1952 skb->data = skb_head;
1953 skb->len = skb_len;
1954 }
1955drop:
ead2ceb0 1956 consume_skb(skb);
1da177e4
LT
1957 return 0;
1958}
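
/*
 * Illustrative userspace sketch (not part of af_packet.c): the sockaddr_ll
 * that packet_rcv() stores in the skb control block is what recvfrom()
 * hands back as the source address. A minimal capture loop (caller needs
 * CAP_NET_RAW; error handling omitted) might look like this.
 */
#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static void dump_frames(void)
{
	unsigned char buf[2048];
	struct sockaddr_ll sll;
	socklen_t slen;
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	for (;;) {
		ssize_t n;

		slen = sizeof(sll);
		n = recvfrom(fd, buf, sizeof(buf), 0,
			     (struct sockaddr *)&sll, &slen);
		if (n < 0)
			break;
		printf("ifindex %d pkttype %u proto 0x%04x len %zd\n",
		       sll.sll_ifindex, (unsigned)sll.sll_pkttype,
		       ntohs(sll.sll_protocol), n);
	}
}
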
1959
40d4e3df
ED
1960static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1961 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1962{
1963 struct sock *sk;
1964 struct packet_sock *po;
1965 struct sockaddr_ll *sll;
184f489e 1966 union tpacket_uhdr h;
40d4e3df 1967 u8 *skb_head = skb->data;
1da177e4 1968 int skb_len = skb->len;
dbcb5855 1969 unsigned int snaplen, res;
f6fb8f10 1970 unsigned long status = TP_STATUS_USER;
bbd6ef87 1971 unsigned short macoff, netoff, hdrlen;
1da177e4 1972 struct sk_buff *copy_skb = NULL;
bbd6ef87 1973 struct timespec ts;
b9c32fb2 1974 __u32 ts_status;
1da177e4 1975
51846355
AW
1976 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
1977 * We may add members to them up to the current aligned size without forcing
1978 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
1979 */
1980 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
1981 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
1982
1da177e4
LT
1983 if (skb->pkt_type == PACKET_LOOPBACK)
1984 goto drop;
1985
1986 sk = pt->af_packet_priv;
1987 po = pkt_sk(sk);
1988
09ad9bc7 1989 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1990 goto drop;
1991
3b04ddde 1992 if (dev->header_ops) {
1da177e4 1993 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1994 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1995 else if (skb->pkt_type == PACKET_OUTGOING) {
1996 /* Special case: outgoing packets have ll header at head */
bbe735e4 1997 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1998 }
1999 }
2000
2001 snaplen = skb->len;
2002
dbcb5855
DM
2003 res = run_filter(skb, sk, snaplen);
2004 if (!res)
fda9ef5d 2005 goto drop_n_restore;
68c2e5de
AD
2006
2007 if (skb->ip_summed == CHECKSUM_PARTIAL)
2008 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2009 else if (skb->pkt_type != PACKET_OUTGOING &&
2010 (skb->ip_summed == CHECKSUM_COMPLETE ||
2011 skb_csum_unnecessary(skb)))
2012 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2013
dbcb5855
DM
2014 if (snaplen > res)
2015 snaplen = res;
1da177e4
LT
2016
2017 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2018 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2019 po->tp_reserve;
1da177e4 2020 } else {
95c96174 2021 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2022 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
2023 (maclen < 16 ? 16 : maclen)) +
2024 po->tp_reserve;
1da177e4
LT
2025 macoff = netoff - maclen;
2026 }
f6fb8f10 2027 if (po->tp_version <= TPACKET_V2) {
2028 if (macoff + snaplen > po->rx_ring.frame_size) {
2029 if (po->copy_thresh &&
0fd7bac6 2030 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2031 if (skb_shared(skb)) {
2032 copy_skb = skb_clone(skb, GFP_ATOMIC);
2033 } else {
2034 copy_skb = skb_get(skb);
2035 skb_head = skb->data;
2036 }
2037 if (copy_skb)
2038 skb_set_owner_r(copy_skb, sk);
1da177e4 2039 }
f6fb8f10 2040 snaplen = po->rx_ring.frame_size - macoff;
2041 if ((int)snaplen < 0)
2042 snaplen = 0;
1da177e4 2043 }
dc808110
ED
2044 } else if (unlikely(macoff + snaplen >
2045 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2046 u32 nval;
2047
2048 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2049 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2050 snaplen, nval, macoff);
2051 snaplen = nval;
2052 if (unlikely((int)snaplen < 0)) {
2053 snaplen = 0;
2054 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2055 }
1da177e4 2056 }
1da177e4 2057 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2058 h.raw = packet_current_rx_frame(po, skb,
2059 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2060 if (!h.raw)
1da177e4 2061 goto ring_is_full;
f6fb8f10 2062 if (po->tp_version <= TPACKET_V2) {
2063 packet_increment_rx_head(po, &po->rx_ring);
2064 /*
2065 * LOSING will be reported until you read the stats,
2066 * because they are COR - Cleared On Read.
2067 * Anyway, this is done for V1/V2 only, as V3 doesn't need it
2068 * at the packet level.
2069 */
ee80fbf3 2070 if (po->stats.stats1.tp_drops)
f6fb8f10 2071 status |= TP_STATUS_LOSING;
2072 }
ee80fbf3 2073 po->stats.stats1.tp_packets++;
1da177e4
LT
2074 if (copy_skb) {
2075 status |= TP_STATUS_COPY;
2076 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2077 }
1da177e4
LT
2078 spin_unlock(&sk->sk_receive_queue.lock);
2079
bbd6ef87 2080 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2081
2082 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2083 getnstimeofday(&ts);
1da177e4 2084
b9c32fb2
DB
2085 status |= ts_status;
2086
bbd6ef87
PM
2087 switch (po->tp_version) {
2088 case TPACKET_V1:
2089 h.h1->tp_len = skb->len;
2090 h.h1->tp_snaplen = snaplen;
2091 h.h1->tp_mac = macoff;
2092 h.h1->tp_net = netoff;
4b457bdf
DB
2093 h.h1->tp_sec = ts.tv_sec;
2094 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2095 hdrlen = sizeof(*h.h1);
2096 break;
2097 case TPACKET_V2:
2098 h.h2->tp_len = skb->len;
2099 h.h2->tp_snaplen = snaplen;
2100 h.h2->tp_mac = macoff;
2101 h.h2->tp_net = netoff;
bbd6ef87
PM
2102 h.h2->tp_sec = ts.tv_sec;
2103 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2104 if (skb_vlan_tag_present(skb)) {
2105 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2106 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2107 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2108 } else {
2109 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2110 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2111 }
e4d26f4b 2112 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2113 hdrlen = sizeof(*h.h2);
2114 break;
f6fb8f10 2115 case TPACKET_V3:
2116 /* tp_nxt_offset and vlan are already populated above,
2117 * so DON'T clear those fields here.
2118 */
2119 h.h3->tp_status |= status;
2120 h.h3->tp_len = skb->len;
2121 h.h3->tp_snaplen = snaplen;
2122 h.h3->tp_mac = macoff;
2123 h.h3->tp_net = netoff;
f6fb8f10 2124 h.h3->tp_sec = ts.tv_sec;
2125 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2126 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2127 hdrlen = sizeof(*h.h3);
2128 break;
bbd6ef87
PM
2129 default:
2130 BUG();
2131 }
1da177e4 2132
bbd6ef87 2133 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2134 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2135 sll->sll_family = AF_PACKET;
2136 sll->sll_hatype = dev->type;
2137 sll->sll_protocol = skb->protocol;
2138 sll->sll_pkttype = skb->pkt_type;
8032b464 2139 if (unlikely(po->origdev))
80feaacb
PWJ
2140 sll->sll_ifindex = orig_dev->ifindex;
2141 else
2142 sll->sll_ifindex = dev->ifindex;
1da177e4 2143
e16aa207 2144 smp_mb();
f0d4eb29 2145
f6dafa95 2146#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2147 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2148 u8 *start, *end;
2149
f0d4eb29
DB
2150 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2151 macoff + snaplen);
2152
2153 for (start = h.raw; start < end; start += PAGE_SIZE)
2154 flush_dcache_page(pgv_to_page(start));
1da177e4 2155 }
f0d4eb29 2156 smp_wmb();
f6dafa95 2157#endif
f0d4eb29 2158
da413eec 2159 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2160 __packet_set_status(po, h.raw, status);
da413eec
DC
2161 sk->sk_data_ready(sk);
2162 } else {
f6fb8f10 2163 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2164 }
1da177e4
LT
2165
2166drop_n_restore:
2167 if (skb_head != skb->data && skb_shared(skb)) {
2168 skb->data = skb_head;
2169 skb->len = skb_len;
2170 }
2171drop:
1ce4f28b 2172 kfree_skb(skb);
1da177e4
LT
2173 return 0;
2174
2175ring_is_full:
ee80fbf3 2176 po->stats.stats1.tp_drops++;
1da177e4
LT
2177 spin_unlock(&sk->sk_receive_queue.lock);
2178
676d2369 2179 sk->sk_data_ready(sk);
acb5d75b 2180 kfree_skb(copy_skb);
1da177e4
LT
2181 goto drop_n_restore;
2182}
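
/*
 * Illustrative userspace sketch (not part of af_packet.c): tpacket_rcv()
 * fills frames in the mmap()ed RX ring and flips tp_status to
 * TP_STATUS_USER. A minimal TPACKET_V2 reader might look like this; the
 * ring geometry is arbitrary, memory barriers and error handling are
 * omitted for brevity.
 */
#include <sys/socket.h>
#include <sys/mman.h>
#include <poll.h>
#include <linux/if_packet.h>

static void rx_ring_loop(int fd)
{
	int ver = TPACKET_V2;
	struct tpacket_req req = {
		.tp_block_size = 4096,
		.tp_block_nr   = 64,
		.tp_frame_size = 2048,
		.tp_frame_nr   = 64 * (4096 / 2048),
	};
	unsigned int i = 0;
	void *ring;

	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	for (;;) {
		struct tpacket2_hdr *hdr =
			(void *)((char *)ring + i * req.tp_frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER)) {
			struct pollfd pfd = { .fd = fd, .events = POLLIN };

			poll(&pfd, 1, -1);
			continue;
		}
		/* frame data starts hdr->tp_mac bytes into the slot;
		 * hdr->tp_snaplen bytes were copied by skb_copy_bits() above
		 */
		hdr->tp_status = TP_STATUS_KERNEL;	/* hand the slot back */
		i = (i + 1) % req.tp_frame_nr;
	}
}
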
2183
69e3c75f
JB
2184static void tpacket_destruct_skb(struct sk_buff *skb)
2185{
2186 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2187
69e3c75f 2188 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2189 void *ph;
b9c32fb2
DB
2190 __u32 ts;
2191
69e3c75f 2192 ph = skb_shinfo(skb)->destructor_arg;
b0138408 2193 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2194
2195 ts = __packet_set_timestamp(po, ph, skb);
2196 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2197 }
2198
2199 sock_wfree(skb);
2200}
2201
9c707762
WB
2202static bool ll_header_truncated(const struct net_device *dev, int len)
2203{
2204 /* net device doesn't like empty head */
2205 if (unlikely(len <= dev->hard_header_len)) {
eee2f04b 2206 net_warn_ratelimited("%s: packet size is too short (%d <= %d)\n",
9c707762
WB
2207 current->comm, len, dev->hard_header_len);
2208 return true;
2209 }
2210
2211 return false;
2212}
2213
40d4e3df
ED
2214static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2215 void *frame, struct net_device *dev, int size_max,
ae641949 2216 __be16 proto, unsigned char *addr, int hlen)
69e3c75f 2217{
184f489e 2218 union tpacket_uhdr ph;
09effa67 2219 int to_write, offset, len, tp_len, nr_frags, len_max;
69e3c75f
JB
2220 struct socket *sock = po->sk.sk_socket;
2221 struct page *page;
2222 void *data;
2223 int err;
2224
2225 ph.raw = frame;
2226
2227 skb->protocol = proto;
2228 skb->dev = dev;
2229 skb->priority = po->sk.sk_priority;
2d37a186 2230 skb->mark = po->sk.sk_mark;
2e31396f 2231 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2232 skb_shinfo(skb)->destructor_arg = ph.raw;
2233
2234 switch (po->tp_version) {
2235 case TPACKET_V2:
2236 tp_len = ph.h2->tp_len;
2237 break;
2238 default:
2239 tp_len = ph.h1->tp_len;
2240 break;
2241 }
09effa67
DM
2242 if (unlikely(tp_len > size_max)) {
2243 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2244 return -EMSGSIZE;
2245 }
69e3c75f 2246
ae641949 2247 skb_reserve(skb, hlen);
69e3c75f 2248 skb_reset_network_header(skb);
c1aad275 2249
d346a3fa
DB
2250 if (!packet_use_direct_xmit(po))
2251 skb_probe_transport_header(skb, 0);
2252 if (unlikely(po->tp_tx_has_off)) {
5920cd3a
PC
2253 int off_min, off_max, off;
2254 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2255 off_max = po->tx_ring.frame_size - tp_len;
2256 if (sock->type == SOCK_DGRAM) {
2257 switch (po->tp_version) {
2258 case TPACKET_V2:
2259 off = ph.h2->tp_net;
2260 break;
2261 default:
2262 off = ph.h1->tp_net;
2263 break;
2264 }
2265 } else {
2266 switch (po->tp_version) {
2267 case TPACKET_V2:
2268 off = ph.h2->tp_mac;
2269 break;
2270 default:
2271 off = ph.h1->tp_mac;
2272 break;
2273 }
2274 }
2275 if (unlikely((off < off_min) || (off_max < off)))
2276 return -EINVAL;
2277 data = ph.raw + off;
2278 } else {
2279 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
2280 }
69e3c75f
JB
2281 to_write = tp_len;
2282
2283 if (sock->type == SOCK_DGRAM) {
2284 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2285 NULL, tp_len);
2286 if (unlikely(err < 0))
2287 return -EINVAL;
40d4e3df 2288 } else if (dev->hard_header_len) {
9c707762 2289 if (ll_header_truncated(dev, tp_len))
69e3c75f 2290 return -EINVAL;
69e3c75f
JB
2291
2292 skb_push(skb, dev->hard_header_len);
2293 err = skb_store_bits(skb, 0, data,
2294 dev->hard_header_len);
2295 if (unlikely(err))
2296 return err;
2297
2298 data += dev->hard_header_len;
2299 to_write -= dev->hard_header_len;
2300 }
2301
69e3c75f
JB
2302 offset = offset_in_page(data);
2303 len_max = PAGE_SIZE - offset;
2304 len = ((to_write > len_max) ? len_max : to_write);
2305
2306 skb->data_len = to_write;
2307 skb->len += to_write;
2308 skb->truesize += to_write;
2309 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2310
2311 while (likely(to_write)) {
2312 nr_frags = skb_shinfo(skb)->nr_frags;
2313
2314 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2315 pr_err("Packet exceed the number of skb frags(%lu)\n",
2316 MAX_SKB_FRAGS);
69e3c75f
JB
2317 return -EFAULT;
2318 }
2319
0af55bb5
CG
2320 page = pgv_to_page(data);
2321 data += len;
69e3c75f
JB
2322 flush_dcache_page(page);
2323 get_page(page);
0af55bb5 2324 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2325 to_write -= len;
2326 offset = 0;
2327 len_max = PAGE_SIZE;
2328 len = ((to_write > len_max) ? len_max : to_write);
2329 }
2330
2331 return tp_len;
2332}
2333
2334static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2335{
69e3c75f
JB
2336 struct sk_buff *skb;
2337 struct net_device *dev;
2338 __be16 proto;
09effa67 2339 int err, reserve = 0;
40d4e3df 2340 void *ph;
342dfc30 2341 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2342 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2343 int tp_len, size_max;
2344 unsigned char *addr;
2345 int len_sum = 0;
9e67030a 2346 int status = TP_STATUS_AVAILABLE;
ae641949 2347 int hlen, tlen;
69e3c75f 2348
69e3c75f
JB
2349 mutex_lock(&po->pg_vec_lock);
2350
66e56cd4 2351 if (likely(saddr == NULL)) {
e40526cb 2352 dev = packet_cached_dev_get(po);
69e3c75f
JB
2353 proto = po->num;
2354 addr = NULL;
2355 } else {
2356 err = -EINVAL;
2357 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2358 goto out;
2359 if (msg->msg_namelen < (saddr->sll_halen
2360 + offsetof(struct sockaddr_ll,
2361 sll_addr)))
2362 goto out;
69e3c75f
JB
2363 proto = saddr->sll_protocol;
2364 addr = saddr->sll_addr;
827d9780 2365 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2366 }
2367
69e3c75f
JB
2368 err = -ENXIO;
2369 if (unlikely(dev == NULL))
2370 goto out;
69e3c75f
JB
2371 err = -ENETDOWN;
2372 if (unlikely(!(dev->flags & IFF_UP)))
2373 goto out_put;
2374
52f1454f 2375 reserve = dev->hard_header_len + VLAN_HLEN;
69e3c75f 2376 size_max = po->tx_ring.frame_size
b5dd884e 2377 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2378
09effa67
DM
2379 if (size_max > dev->mtu + reserve)
2380 size_max = dev->mtu + reserve;
2381
69e3c75f
JB
2382 do {
2383 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2384 TP_STATUS_SEND_REQUEST);
69e3c75f 2385 if (unlikely(ph == NULL)) {
87a2fd28
DB
2386 if (need_wait && need_resched())
2387 schedule();
69e3c75f
JB
2388 continue;
2389 }
2390
2391 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2392 hlen = LL_RESERVED_SPACE(dev);
2393 tlen = dev->needed_tailroom;
69e3c75f 2394 skb = sock_alloc_send_skb(&po->sk,
ae641949 2395 hlen + tlen + sizeof(struct sockaddr_ll),
fbf33a28 2396 !need_wait, &err);
69e3c75f 2397
fbf33a28
KM
2398 if (unlikely(skb == NULL)) {
2399 /* we assume the socket was initially writeable ... */
2400 if (likely(len_sum > 0))
2401 err = len_sum;
69e3c75f 2402 goto out_status;
fbf33a28 2403 }
69e3c75f 2404 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
52f1454f
DB
2405 addr, hlen);
2406 if (tp_len > dev->mtu + dev->hard_header_len) {
2407 struct ethhdr *ehdr;
2408 /* Earlier code assumed this would be a VLAN pkt,
2409 * double-check this now that we have the actual
2410 * packet in hand.
2411 */
69e3c75f 2412
52f1454f
DB
2413 skb_reset_mac_header(skb);
2414 ehdr = eth_hdr(skb);
2415 if (ehdr->h_proto != htons(ETH_P_8021Q))
2416 tp_len = -EMSGSIZE;
2417 }
69e3c75f
JB
2418 if (unlikely(tp_len < 0)) {
2419 if (po->tp_loss) {
2420 __packet_set_status(po, ph,
2421 TP_STATUS_AVAILABLE);
2422 packet_increment_head(&po->tx_ring);
2423 kfree_skb(skb);
2424 continue;
2425 } else {
2426 status = TP_STATUS_WRONG_FORMAT;
2427 err = tp_len;
2428 goto out_status;
2429 }
2430 }
2431
0fd5d57b
DB
2432 packet_pick_tx_queue(dev, skb);
2433
69e3c75f
JB
2434 skb->destructor = tpacket_destruct_skb;
2435 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2436 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2437
2438 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2439 err = po->xmit(skb);
eb70df13
JP
2440 if (unlikely(err > 0)) {
2441 err = net_xmit_errno(err);
2442 if (err && __packet_get_status(po, ph) ==
2443 TP_STATUS_AVAILABLE) {
2444 /* skb was destructed already */
2445 skb = NULL;
2446 goto out_status;
2447 }
2448 /*
2449 * skb was dropped but not destructed yet;
2450 * let's treat it like congestion or err < 0
2451 */
2452 err = 0;
2453 }
69e3c75f
JB
2454 packet_increment_head(&po->tx_ring);
2455 len_sum += tp_len;
b0138408
DB
2456 } while (likely((ph != NULL) ||
2457 /* Note: packet_read_pending() might be slow if we have
2458 * to call it as it's a per-cpu variable, but in the fast path
2459 * we already short-circuit the loop with the first
2460 * condition, and luckily don't have to go down that path
2461 * anyway.
2462 */
2463 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2464
2465 err = len_sum;
2466 goto out_put;
2467
69e3c75f
JB
2468out_status:
2469 __packet_set_status(po, ph, status);
2470 kfree_skb(skb);
2471out_put:
e40526cb 2472 dev_put(dev);
69e3c75f
JB
2473out:
2474 mutex_unlock(&po->pg_vec_lock);
2475 return err;
2476}
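
/*
 * Illustrative userspace sketch (not part of af_packet.c): tpacket_snd()
 * walks the TX ring for frames marked TP_STATUS_SEND_REQUEST, so a
 * TPACKET_V2 sender fills a slot, flips the status and kicks the kernel
 * with a zero-length send(). It assumes the socket is already bound, a
 * PACKET_TX_RING is mapped at "ring", and "frame" is a complete
 * link-layer frame; error handling is omitted.
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void tx_one_frame(int fd, void *ring, unsigned int frame_size,
			 unsigned int slot, const void *frame, unsigned int len)
{
	struct tpacket2_hdr *hdr =
		(void *)((char *)ring + slot * frame_size);
	/* for V1/V2 without PACKET_TX_HAS_OFF the data follows the aligned
	 * header, matching tpacket_fill_skb() above
	 */
	char *data = (char *)hdr + TPACKET_ALIGN(sizeof(*hdr));

	memcpy(data, frame, len);
	hdr->tp_len = len;
	hdr->tp_status = TP_STATUS_SEND_REQUEST;

	send(fd, NULL, 0, 0);		/* ask the kernel to drain the ring */
}
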
69e3c75f 2477
eea49cc9
OJ
2478static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2479 size_t reserve, size_t len,
2480 size_t linear, int noblock,
2481 int *err)
bfd5f4a3
SS
2482{
2483 struct sk_buff *skb;
2484
2485 /* Under a page? Don't bother with paged skb. */
2486 if (prepad + len < PAGE_SIZE || !linear)
2487 linear = len;
2488
2489 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2490 err, 0);
bfd5f4a3
SS
2491 if (!skb)
2492 return NULL;
2493
2494 skb_reserve(skb, reserve);
2495 skb_put(skb, linear);
2496 skb->data_len = len - linear;
2497 skb->len += len - linear;
2498
2499 return skb;
2500}
2501
d346a3fa 2502static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2503{
2504 struct sock *sk = sock->sk;
342dfc30 2505 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2506 struct sk_buff *skb;
2507 struct net_device *dev;
0e11c91e 2508 __be16 proto;
1da177e4 2509 unsigned char *addr;
827d9780 2510 int err, reserve = 0;
bfd5f4a3
SS
2511 struct virtio_net_hdr vnet_hdr = { 0 };
2512 int offset = 0;
2513 int vnet_hdr_len;
2514 struct packet_sock *po = pkt_sk(sk);
2515 unsigned short gso_type = 0;
ae641949 2516 int hlen, tlen;
3bdc0eba 2517 int extra_len = 0;
8feb2fb2 2518 ssize_t n;
1da177e4
LT
2519
2520 /*
1ce4f28b 2521 * Get and verify the address.
1da177e4 2522 */
1ce4f28b 2523
66e56cd4 2524 if (likely(saddr == NULL)) {
e40526cb 2525 dev = packet_cached_dev_get(po);
1da177e4
LT
2526 proto = po->num;
2527 addr = NULL;
2528 } else {
2529 err = -EINVAL;
2530 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2531 goto out;
0fb375fb
EB
2532 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2533 goto out;
1da177e4
LT
2534 proto = saddr->sll_protocol;
2535 addr = saddr->sll_addr;
827d9780 2536 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2537 }
2538
1da177e4 2539 err = -ENXIO;
e40526cb 2540 if (unlikely(dev == NULL))
1da177e4 2541 goto out_unlock;
d5e76b0a 2542 err = -ENETDOWN;
e40526cb 2543 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2544 goto out_unlock;
2545
e40526cb
DB
2546 if (sock->type == SOCK_RAW)
2547 reserve = dev->hard_header_len;
bfd5f4a3
SS
2548 if (po->has_vnet_hdr) {
2549 vnet_hdr_len = sizeof(vnet_hdr);
2550
2551 err = -EINVAL;
2552 if (len < vnet_hdr_len)
2553 goto out_unlock;
2554
2555 len -= vnet_hdr_len;
2556
8feb2fb2 2557 err = -EFAULT;
c0371da6 2558 n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &msg->msg_iter);
8feb2fb2 2559 if (n != vnet_hdr_len)
bfd5f4a3
SS
2560 goto out_unlock;
2561
2562 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
dc9e5153
MT
2563 (__virtio16_to_cpu(false, vnet_hdr.csum_start) +
2564 __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2 >
2565 __virtio16_to_cpu(false, vnet_hdr.hdr_len)))
2566 vnet_hdr.hdr_len = __cpu_to_virtio16(false,
2567 __virtio16_to_cpu(false, vnet_hdr.csum_start) +
2568 __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2);
bfd5f4a3
SS
2569
2570 err = -EINVAL;
dc9e5153 2571 if (__virtio16_to_cpu(false, vnet_hdr.hdr_len) > len)
bfd5f4a3
SS
2572 goto out_unlock;
2573
2574 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2575 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2576 case VIRTIO_NET_HDR_GSO_TCPV4:
2577 gso_type = SKB_GSO_TCPV4;
2578 break;
2579 case VIRTIO_NET_HDR_GSO_TCPV6:
2580 gso_type = SKB_GSO_TCPV6;
2581 break;
2582 case VIRTIO_NET_HDR_GSO_UDP:
2583 gso_type = SKB_GSO_UDP;
2584 break;
2585 default:
2586 goto out_unlock;
2587 }
2588
2589 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2590 gso_type |= SKB_GSO_TCP_ECN;
2591
2592 if (vnet_hdr.gso_size == 0)
2593 goto out_unlock;
2594
2595 }
2596 }
2597
3bdc0eba
BG
2598 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2599 if (!netif_supports_nofcs(dev)) {
2600 err = -EPROTONOSUPPORT;
2601 goto out_unlock;
2602 }
2603 extra_len = 4; /* We're doing our own CRC */
2604 }
2605
1da177e4 2606 err = -EMSGSIZE;
3bdc0eba 2607 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2608 goto out_unlock;
2609
bfd5f4a3 2610 err = -ENOBUFS;
ae641949
HX
2611 hlen = LL_RESERVED_SPACE(dev);
2612 tlen = dev->needed_tailroom;
dc9e5153
MT
2613 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len,
2614 __virtio16_to_cpu(false, vnet_hdr.hdr_len),
bfd5f4a3 2615 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2616 if (skb == NULL)
1da177e4
LT
2617 goto out_unlock;
2618
bfd5f4a3 2619 skb_set_network_header(skb, reserve);
1da177e4 2620
0c4e8581 2621 err = -EINVAL;
9c707762
WB
2622 if (sock->type == SOCK_DGRAM) {
2623 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2624 if (unlikely(offset < 0))
9c707762
WB
2625 goto out_free;
2626 } else {
2627 if (ll_header_truncated(dev, len))
2628 goto out_free;
2629 }
1da177e4
LT
2630
2631 /* Returns -EFAULT on error */
c0371da6 2632 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2633 if (err)
2634 goto out_free;
bf84a010
DB
2635
2636 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 2637
3bdc0eba 2638 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
09effa67
DM
2639 /* Earlier code assumed this would be a VLAN pkt,
2640 * double-check this now that we have the actual
2641 * packet in hand.
2642 */
2643 struct ethhdr *ehdr;
2644 skb_reset_mac_header(skb);
2645 ehdr = eth_hdr(skb);
2646 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2647 err = -EMSGSIZE;
2648 goto out_free;
2649 }
57f89bfa
BG
2650 }
2651
09effa67
DM
2652 skb->protocol = proto;
2653 skb->dev = dev;
1da177e4 2654 skb->priority = sk->sk_priority;
2d37a186 2655 skb->mark = sk->sk_mark;
0fd5d57b
DB
2656
2657 packet_pick_tx_queue(dev, skb);
1da177e4 2658
bfd5f4a3
SS
2659 if (po->has_vnet_hdr) {
2660 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
dc9e5153
MT
2661 u16 s = __virtio16_to_cpu(false, vnet_hdr.csum_start);
2662 u16 o = __virtio16_to_cpu(false, vnet_hdr.csum_offset);
2663 if (!skb_partial_csum_set(skb, s, o)) {
bfd5f4a3
SS
2664 err = -EINVAL;
2665 goto out_free;
2666 }
2667 }
2668
dc9e5153
MT
2669 skb_shinfo(skb)->gso_size =
2670 __virtio16_to_cpu(false, vnet_hdr.gso_size);
bfd5f4a3
SS
2671 skb_shinfo(skb)->gso_type = gso_type;
2672
2673 /* Header must be checked, and gso_segs computed. */
2674 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2675 skb_shinfo(skb)->gso_segs = 0;
2676
2677 len += vnet_hdr_len;
2678 }
2679
d346a3fa
DB
2680 if (!packet_use_direct_xmit(po))
2681 skb_probe_transport_header(skb, reserve);
3bdc0eba
BG
2682 if (unlikely(extra_len == 4))
2683 skb->no_fcs = 1;
2684
d346a3fa 2685 err = po->xmit(skb);
1da177e4
LT
2686 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2687 goto out_unlock;
2688
e40526cb 2689 dev_put(dev);
1da177e4 2690
40d4e3df 2691 return len;
1da177e4
LT
2692
2693out_free:
2694 kfree_skb(skb);
2695out_unlock:
e40526cb 2696 if (dev)
1da177e4
LT
2697 dev_put(dev);
2698out:
2699 return err;
2700}
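
/*
 * Illustrative userspace sketch (not part of af_packet.c): packet_snd()
 * above services a plain sendto() on a packet socket. On a SOCK_DGRAM
 * socket the kernel builds the link-layer header itself from the
 * sockaddr_ll via dev_hard_header(), so sending one IP payload might look
 * like this; "eth0" is a placeholder and error handling is omitted.
 */
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <net/if.h>
#include <arpa/inet.h>

static ssize_t send_dgram_payload(int fd, const void *payload, size_t len,
				  const unsigned char dst_mac[ETH_ALEN])
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_IP);	/* becomes the ethertype */
	sll.sll_ifindex  = if_nametoindex("eth0");
	sll.sll_halen    = ETH_ALEN;
	memcpy(sll.sll_addr, dst_mac, ETH_ALEN);

	return sendto(fd, payload, len, 0,
		      (struct sockaddr *)&sll, sizeof(sll));
}
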
2701
1b784140 2702static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2703{
69e3c75f
JB
2704 struct sock *sk = sock->sk;
2705 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2706
69e3c75f
JB
2707 if (po->tx_ring.pg_vec)
2708 return tpacket_snd(po, msg);
2709 else
69e3c75f
JB
2710 return packet_snd(sock, msg, len);
2711}
2712
1da177e4
LT
2713/*
2714 * Close a PACKET socket. This is fairly simple. We immediately go
2715 * to 'closed' state and remove our protocol entry in the device list.
2716 */
2717
2718static int packet_release(struct socket *sock)
2719{
2720 struct sock *sk = sock->sk;
2721 struct packet_sock *po;
d12d01d6 2722 struct net *net;
f6fb8f10 2723 union tpacket_req_u req_u;
1da177e4
LT
2724
2725 if (!sk)
2726 return 0;
2727
3b1e0a65 2728 net = sock_net(sk);
1da177e4
LT
2729 po = pkt_sk(sk);
2730
0fa7fa98 2731 mutex_lock(&net->packet.sklist_lock);
808f5114 2732 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2733 mutex_unlock(&net->packet.sklist_lock);
2734
2735 preempt_disable();
920de804 2736 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2737 preempt_enable();
1da177e4 2738
808f5114 2739 spin_lock(&po->bind_lock);
ce06b03e 2740 unregister_prot_hook(sk, false);
66e56cd4
DB
2741 packet_cached_dev_reset(po);
2742
160ff18a
BG
2743 if (po->prot_hook.dev) {
2744 dev_put(po->prot_hook.dev);
2745 po->prot_hook.dev = NULL;
2746 }
808f5114 2747 spin_unlock(&po->bind_lock);
1da177e4 2748
1da177e4 2749 packet_flush_mclist(sk);
1da177e4 2750
9665d5d6
PS
2751 if (po->rx_ring.pg_vec) {
2752 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2753 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2754 }
69e3c75f 2755
9665d5d6
PS
2756 if (po->tx_ring.pg_vec) {
2757 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2758 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2759 }
1da177e4 2760
dc99f600
DM
2761 fanout_release(sk);
2762
808f5114 2763 synchronize_net();
1da177e4
LT
2764 /*
2765 * Now the socket is dead. No more input will appear.
2766 */
1da177e4
LT
2767 sock_orphan(sk);
2768 sock->sk = NULL;
2769
2770 /* Purge queues */
2771
2772 skb_queue_purge(&sk->sk_receive_queue);
b0138408 2773 packet_free_pending(po);
17ab56a2 2774 sk_refcnt_debug_release(sk);
1da177e4
LT
2775
2776 sock_put(sk);
2777 return 0;
2778}
2779
2780/*
2781 * Attach a packet hook.
2782 */
2783
902fefb8 2784static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
1da177e4
LT
2785{
2786 struct packet_sock *po = pkt_sk(sk);
158cd4af 2787 struct net_device *dev_curr;
902fefb8
DB
2788 __be16 proto_curr;
2789 bool need_rehook;
dc99f600 2790
aef950b4
WY
2791 if (po->fanout) {
2792 if (dev)
2793 dev_put(dev);
2794
dc99f600 2795 return -EINVAL;
aef950b4 2796 }
1da177e4
LT
2797
2798 lock_sock(sk);
1da177e4 2799 spin_lock(&po->bind_lock);
66e56cd4 2800
902fefb8
DB
2801 proto_curr = po->prot_hook.type;
2802 dev_curr = po->prot_hook.dev;
2803
2804 need_rehook = proto_curr != proto || dev_curr != dev;
2805
2806 if (need_rehook) {
2807 unregister_prot_hook(sk, true);
1da177e4 2808
902fefb8
DB
2809 po->num = proto;
2810 po->prot_hook.type = proto;
902fefb8
DB
2811 po->prot_hook.dev = dev;
2812
2813 po->ifindex = dev ? dev->ifindex : 0;
2814 packet_cached_dev_assign(po, dev);
2815 }
158cd4af
LW
2816 if (dev_curr)
2817 dev_put(dev_curr);
66e56cd4 2818
902fefb8 2819 if (proto == 0 || !need_rehook)
1da177e4
LT
2820 goto out_unlock;
2821
be85d4ad 2822 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2823 register_prot_hook(sk);
be85d4ad
UT
2824 } else {
2825 sk->sk_err = ENETDOWN;
2826 if (!sock_flag(sk, SOCK_DEAD))
2827 sk->sk_error_report(sk);
1da177e4
LT
2828 }
2829
2830out_unlock:
2831 spin_unlock(&po->bind_lock);
2832 release_sock(sk);
2833 return 0;
2834}
2835
2836/*
2837 * Bind a packet socket to a device
2838 */
2839
40d4e3df
ED
2840static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2841 int addr_len)
1da177e4 2842{
40d4e3df 2843 struct sock *sk = sock->sk;
1da177e4
LT
2844 char name[15];
2845 struct net_device *dev;
2846 int err = -ENODEV;
1ce4f28b 2847
1da177e4
LT
2848 /*
2849 * Check legality
2850 */
1ce4f28b 2851
8ae55f04 2852 if (addr_len != sizeof(struct sockaddr))
1da177e4 2853 return -EINVAL;
40d4e3df 2854 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2855
3b1e0a65 2856 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2857 if (dev)
1da177e4 2858 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2859 return err;
2860}
1da177e4
LT
2861
2862static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2863{
40d4e3df
ED
2864 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2865 struct sock *sk = sock->sk;
1da177e4
LT
2866 struct net_device *dev = NULL;
2867 int err;
2868
2869
2870 /*
2871 * Check legality
2872 */
1ce4f28b 2873
1da177e4
LT
2874 if (addr_len < sizeof(struct sockaddr_ll))
2875 return -EINVAL;
2876 if (sll->sll_family != AF_PACKET)
2877 return -EINVAL;
2878
2879 if (sll->sll_ifindex) {
2880 err = -ENODEV;
3b1e0a65 2881 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2882 if (dev == NULL)
2883 goto out;
2884 }
2885 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2886
2887out:
2888 return err;
2889}
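
/*
 * Illustrative userspace sketch (not part of af_packet.c): binding a
 * packet socket to one interface and ethertype, which is what
 * packet_do_bind() above ends up doing. The interface name is a
 * placeholder and error handling is omitted.
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <net/if.h>
#include <arpa/inet.h>

static int bind_to_iface(int fd, const char *ifname)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);	/* all protocols */
	sll.sll_ifindex  = if_nametoindex(ifname);

	return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}
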
2890
2891static struct proto packet_proto = {
2892 .name = "PACKET",
2893 .owner = THIS_MODULE,
2894 .obj_size = sizeof(struct packet_sock),
2895};
2896
2897/*
1ce4f28b 2898 * Create a packet of type SOCK_PACKET.
1da177e4
LT
2899 */
2900
3f378b68
EP
2901static int packet_create(struct net *net, struct socket *sock, int protocol,
2902 int kern)
1da177e4
LT
2903{
2904 struct sock *sk;
2905 struct packet_sock *po;
0e11c91e 2906 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2907 int err;
2908
df008c91 2909 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 2910 return -EPERM;
be02097c
DM
2911 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2912 sock->type != SOCK_PACKET)
1da177e4
LT
2913 return -ESOCKTNOSUPPORT;
2914
2915 sock->state = SS_UNCONNECTED;
2916
2917 err = -ENOBUFS;
11aa9c28 2918 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
2919 if (sk == NULL)
2920 goto out;
2921
2922 sock->ops = &packet_ops;
1da177e4
LT
2923 if (sock->type == SOCK_PACKET)
2924 sock->ops = &packet_ops_spkt;
be02097c 2925
1da177e4
LT
2926 sock_init_data(sock, sk);
2927
2928 po = pkt_sk(sk);
2929 sk->sk_family = PF_PACKET;
0e11c91e 2930 po->num = proto;
d346a3fa 2931 po->xmit = dev_queue_xmit;
66e56cd4 2932
b0138408
DB
2933 err = packet_alloc_pending(po);
2934 if (err)
2935 goto out2;
2936
66e56cd4 2937 packet_cached_dev_reset(po);
1da177e4
LT
2938
2939 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2940 sk_refcnt_debug_inc(sk);
1da177e4
LT
2941
2942 /*
2943 * Attach a protocol block
2944 */
2945
2946 spin_lock_init(&po->bind_lock);
905db440 2947 mutex_init(&po->pg_vec_lock);
0648ab70 2948 po->rollover = NULL;
1da177e4 2949 po->prot_hook.func = packet_rcv;
be02097c 2950
1da177e4
LT
2951 if (sock->type == SOCK_PACKET)
2952 po->prot_hook.func = packet_rcv_spkt;
be02097c 2953
1da177e4
LT
2954 po->prot_hook.af_packet_priv = sk;
2955
0e11c91e
AV
2956 if (proto) {
2957 po->prot_hook.type = proto;
ce06b03e 2958 register_prot_hook(sk);
1da177e4
LT
2959 }
2960
0fa7fa98 2961 mutex_lock(&net->packet.sklist_lock);
808f5114 2962 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
2963 mutex_unlock(&net->packet.sklist_lock);
2964
2965 preempt_disable();
3680453c 2966 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 2967 preempt_enable();
808f5114 2968
40d4e3df 2969 return 0;
b0138408
DB
2970out2:
2971 sk_free(sk);
1da177e4
LT
2972out:
2973 return err;
2974}
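
/*
 * Illustrative userspace sketch (not part of af_packet.c): packet_create()
 * is reached from an ordinary socket() call and requires CAP_NET_RAW.
 * SOCK_RAW delivers/accepts full frames including the link-layer header,
 * SOCK_DGRAM lets the kernel strip/build it; protocol 0 means nothing is
 * delivered until the socket is bound. Error handling is omitted.
 */
#include <sys/socket.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static int open_packet_socket(void)
{
	return socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
}
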
2975
2976/*
2977 * Pull a packet from our receive queue and hand it to the user.
2978 * If necessary we block.
2979 */
2980
1b784140
YX
2981static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
2982 int flags)
1da177e4
LT
2983{
2984 struct sock *sk = sock->sk;
2985 struct sk_buff *skb;
2986 int copied, err;
bfd5f4a3 2987 int vnet_hdr_len = 0;
2472d761 2988 unsigned int origlen = 0;
1da177e4
LT
2989
2990 err = -EINVAL;
ed85b565 2991 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2992 goto out;
2993
2994#if 0
2995 /* What error should we return now? EUNATTACH? */
2996 if (pkt_sk(sk)->ifindex < 0)
2997 return -ENODEV;
2998#endif
2999
ed85b565 3000 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3001 err = sock_recv_errqueue(sk, msg, len,
3002 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3003 goto out;
3004 }
3005
1da177e4
LT
3006 /*
3007 * Call the generic datagram receiver. This handles all sorts
3008 * of horrible races and re-entrancy so we can forget about it
3009 * in the protocol layers.
3010 *
3011 * Now it will return ENETDOWN if the device has just gone down,
3012 * but then it will block.
3013 */
3014
40d4e3df 3015 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3016
3017 /*
1ce4f28b 3018 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
3019 * handles the blocking, we don't need to see or worry about
3020 * blocking retries.
3021 */
3022
8ae55f04 3023 if (skb == NULL)
1da177e4
LT
3024 goto out;
3025
2ccdbaa6
WB
3026 if (pkt_sk(sk)->pressure)
3027 packet_rcv_has_room(pkt_sk(sk), NULL);
3028
bfd5f4a3
SS
3029 if (pkt_sk(sk)->has_vnet_hdr) {
3030 struct virtio_net_hdr vnet_hdr = { 0 };
3031
3032 err = -EINVAL;
3033 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 3034 if (len < vnet_hdr_len)
bfd5f4a3
SS
3035 goto out_free;
3036
1f18b717
MK
3037 len -= vnet_hdr_len;
3038
bfd5f4a3
SS
3039 if (skb_is_gso(skb)) {
3040 struct skb_shared_info *sinfo = skb_shinfo(skb);
3041
3042 /* This is a hint as to how much should be linear. */
dc9e5153
MT
3043 vnet_hdr.hdr_len =
3044 __cpu_to_virtio16(false, skb_headlen(skb));
3045 vnet_hdr.gso_size =
3046 __cpu_to_virtio16(false, sinfo->gso_size);
bfd5f4a3
SS
3047 if (sinfo->gso_type & SKB_GSO_TCPV4)
3048 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
3049 else if (sinfo->gso_type & SKB_GSO_TCPV6)
3050 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
3051 else if (sinfo->gso_type & SKB_GSO_UDP)
3052 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
3053 else if (sinfo->gso_type & SKB_GSO_FCOE)
3054 goto out_free;
3055 else
3056 BUG();
3057 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
3058 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
3059 } else
3060 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
3061
3062 if (skb->ip_summed == CHECKSUM_PARTIAL) {
3063 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
dc9e5153
MT
3064 vnet_hdr.csum_start = __cpu_to_virtio16(false,
3065 skb_checksum_start_offset(skb));
3066 vnet_hdr.csum_offset = __cpu_to_virtio16(false,
3067 skb->csum_offset);
10a8d94a
JW
3068 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
3069 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
3070 } /* else everything is zero */
3071
7eab8d9e 3072 err = memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_len);
bfd5f4a3
SS
3073 if (err < 0)
3074 goto out_free;
3075 }
3076
f3d33426
HFS
3077 /* You lose any data beyond the buffer you gave. If this worries
3078 * a user program, it can ask the device for its MTU
3079 * anyway.
1da177e4 3080 */
1da177e4 3081 copied = skb->len;
40d4e3df
ED
3082 if (copied > len) {
3083 copied = len;
3084 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3085 }
3086
51f3d02b 3087 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3088 if (err)
3089 goto out_free;
3090
2472d761
EB
3091 if (sock->type != SOCK_PACKET) {
3092 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3093
3094 /* Original length was stored in sockaddr_ll fields */
3095 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3096 sll->sll_family = AF_PACKET;
3097 sll->sll_protocol = skb->protocol;
3098 }
3099
3b885787 3100 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3101
f3d33426
HFS
3102 if (msg->msg_name) {
3103 /* If the address length field is there to be filled
3104 * in, we fill it in now.
3105 */
3106 if (sock->type == SOCK_PACKET) {
342dfc30 3107 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426
HFS
3108 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3109 } else {
3110 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3111
f3d33426
HFS
3112 msg->msg_namelen = sll->sll_halen +
3113 offsetof(struct sockaddr_ll, sll_addr);
3114 }
ffbc6111
HX
3115 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3116 msg->msg_namelen);
f3d33426 3117 }
1da177e4 3118
8dc41944 3119 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3120 struct tpacket_auxdata aux;
3121
3122 aux.tp_status = TP_STATUS_USER;
3123 if (skb->ip_summed == CHECKSUM_PARTIAL)
3124 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3125 else if (skb->pkt_type != PACKET_OUTGOING &&
3126 (skb->ip_summed == CHECKSUM_COMPLETE ||
3127 skb_csum_unnecessary(skb)))
3128 aux.tp_status |= TP_STATUS_CSUM_VALID;
3129
2472d761 3130 aux.tp_len = origlen;
ffbc6111
HX
3131 aux.tp_snaplen = skb->len;
3132 aux.tp_mac = 0;
bbe735e4 3133 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3134 if (skb_vlan_tag_present(skb)) {
3135 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3136 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3137 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3138 } else {
3139 aux.tp_vlan_tci = 0;
a0cdfcf3 3140 aux.tp_vlan_tpid = 0;
a3bcc23e 3141 }
ffbc6111 3142 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3143 }
3144
1da177e4
LT
3145 /*
3146 * Free or return the buffer as appropriate. Again this
3147 * hides all the races and re-entrancy issues from us.
3148 */
bfd5f4a3 3149 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3150
3151out_free:
3152 skb_free_datagram(sk, skb);
3153out:
3154 return err;
3155}
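
/*
 * Illustrative userspace sketch (not part of af_packet.c): with
 * PACKET_AUXDATA enabled, packet_recvmsg() above attaches a
 * struct tpacket_auxdata control message carrying the original length,
 * VLAN tag and checksum status. Error handling is omitted.
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void recv_with_auxdata(int fd)
{
	int one = 1;
	unsigned char buf[2048];
	union {
		struct cmsghdr hdr;
		char space[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
	} ctrl;
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg = {
		.msg_iov        = &iov,
		.msg_iovlen     = 1,
		.msg_control    = &ctrl,
		.msg_controllen = sizeof(ctrl),
	};
	struct cmsghdr *cmsg;

	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
	recvmsg(fd, &msg, 0);

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_PACKET &&
		    cmsg->cmsg_type == PACKET_AUXDATA) {
			struct tpacket_auxdata aux;

			memcpy(&aux, CMSG_DATA(cmsg), sizeof(aux));
			/* aux.tp_status, aux.tp_len, aux.tp_vlan_tci, ... */
		}
	}
}
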
3156
1da177e4
LT
3157static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3158 int *uaddr_len, int peer)
3159{
3160 struct net_device *dev;
3161 struct sock *sk = sock->sk;
3162
3163 if (peer)
3164 return -EOPNOTSUPP;
3165
3166 uaddr->sa_family = AF_PACKET;
2dc85bf3 3167 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3168 rcu_read_lock();
3169 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3170 if (dev)
2dc85bf3 3171 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3172 rcu_read_unlock();
1da177e4
LT
3173 *uaddr_len = sizeof(*uaddr);
3174
3175 return 0;
3176}
1da177e4
LT
3177
3178static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3179 int *uaddr_len, int peer)
3180{
3181 struct net_device *dev;
3182 struct sock *sk = sock->sk;
3183 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3184 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3185
3186 if (peer)
3187 return -EOPNOTSUPP;
3188
3189 sll->sll_family = AF_PACKET;
3190 sll->sll_ifindex = po->ifindex;
3191 sll->sll_protocol = po->num;
67286640 3192 sll->sll_pkttype = 0;
654d1f8a
ED
3193 rcu_read_lock();
3194 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3195 if (dev) {
3196 sll->sll_hatype = dev->type;
3197 sll->sll_halen = dev->addr_len;
3198 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3199 } else {
3200 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3201 sll->sll_halen = 0;
3202 }
654d1f8a 3203 rcu_read_unlock();
0fb375fb 3204 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3205
3206 return 0;
3207}
3208
2aeb0b88
WC
3209static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3210 int what)
1da177e4
LT
3211{
3212 switch (i->type) {
3213 case PACKET_MR_MULTICAST:
1162563f
JP
3214 if (i->alen != dev->addr_len)
3215 return -EINVAL;
1da177e4 3216 if (what > 0)
22bedad3 3217 return dev_mc_add(dev, i->addr);
1da177e4 3218 else
22bedad3 3219 return dev_mc_del(dev, i->addr);
1da177e4
LT
3220 break;
3221 case PACKET_MR_PROMISC:
2aeb0b88 3222 return dev_set_promiscuity(dev, what);
1da177e4 3223 case PACKET_MR_ALLMULTI:
2aeb0b88 3224 return dev_set_allmulti(dev, what);
d95ed927 3225 case PACKET_MR_UNICAST:
1162563f
JP
3226 if (i->alen != dev->addr_len)
3227 return -EINVAL;
d95ed927 3228 if (what > 0)
a748ee24 3229 return dev_uc_add(dev, i->addr);
d95ed927 3230 else
a748ee24 3231 return dev_uc_del(dev, i->addr);
d95ed927 3232 break;
40d4e3df
ED
3233 default:
3234 break;
1da177e4 3235 }
2aeb0b88 3236 return 0;
1da177e4
LT
3237}
3238
82f17091
FR
3239static void packet_dev_mclist_delete(struct net_device *dev,
3240 struct packet_mclist **mlp)
1da177e4 3241{
82f17091
FR
3242 struct packet_mclist *ml;
3243
3244 while ((ml = *mlp) != NULL) {
3245 if (ml->ifindex == dev->ifindex) {
3246 packet_dev_mc(dev, ml, -1);
3247 *mlp = ml->next;
3248 kfree(ml);
3249 } else
3250 mlp = &ml->next;
1da177e4
LT
3251 }
3252}
3253
0fb375fb 3254static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3255{
3256 struct packet_sock *po = pkt_sk(sk);
3257 struct packet_mclist *ml, *i;
3258 struct net_device *dev;
3259 int err;
3260
3261 rtnl_lock();
3262
3263 err = -ENODEV;
3b1e0a65 3264 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3265 if (!dev)
3266 goto done;
3267
3268 err = -EINVAL;
1162563f 3269 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3270 goto done;
3271
3272 err = -ENOBUFS;
8b3a7005 3273 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3274 if (i == NULL)
3275 goto done;
3276
3277 err = 0;
3278 for (ml = po->mclist; ml; ml = ml->next) {
3279 if (ml->ifindex == mreq->mr_ifindex &&
3280 ml->type == mreq->mr_type &&
3281 ml->alen == mreq->mr_alen &&
3282 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3283 ml->count++;
3284 /* Free the new element ... */
3285 kfree(i);
3286 goto done;
3287 }
3288 }
3289
3290 i->type = mreq->mr_type;
3291 i->ifindex = mreq->mr_ifindex;
3292 i->alen = mreq->mr_alen;
3293 memcpy(i->addr, mreq->mr_address, i->alen);
3294 i->count = 1;
3295 i->next = po->mclist;
3296 po->mclist = i;
2aeb0b88
WC
3297 err = packet_dev_mc(dev, i, 1);
3298 if (err) {
3299 po->mclist = i->next;
3300 kfree(i);
3301 }
1da177e4
LT
3302
3303done:
3304 rtnl_unlock();
3305 return err;
3306}
3307
0fb375fb 3308static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3309{
3310 struct packet_mclist *ml, **mlp;
3311
3312 rtnl_lock();
3313
3314 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3315 if (ml->ifindex == mreq->mr_ifindex &&
3316 ml->type == mreq->mr_type &&
3317 ml->alen == mreq->mr_alen &&
3318 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3319 if (--ml->count == 0) {
3320 struct net_device *dev;
3321 *mlp = ml->next;
ad959e76
ED
3322 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3323 if (dev)
1da177e4 3324 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3325 kfree(ml);
3326 }
82f17091 3327 break;
1da177e4
LT
3328 }
3329 }
3330 rtnl_unlock();
82f17091 3331 return 0;
1da177e4
LT
3332}
3333
3334static void packet_flush_mclist(struct sock *sk)
3335{
3336 struct packet_sock *po = pkt_sk(sk);
3337 struct packet_mclist *ml;
3338
3339 if (!po->mclist)
3340 return;
3341
3342 rtnl_lock();
3343 while ((ml = po->mclist) != NULL) {
3344 struct net_device *dev;
3345
3346 po->mclist = ml->next;
ad959e76
ED
3347 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3348 if (dev != NULL)
1da177e4 3349 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3350 kfree(ml);
3351 }
3352 rtnl_unlock();
3353}
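
/*
 * Illustrative userspace sketch (not part of af_packet.c): the membership
 * helpers above back PACKET_ADD_MEMBERSHIP / PACKET_DROP_MEMBERSHIP.
 * Turning on (reference-counted) promiscuous mode for one interface might
 * look like this; the interface name is a placeholder and error handling
 * is omitted.
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <net/if.h>

static int set_promisc(int fd, const char *ifname)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = if_nametoindex(ifname);
	mreq.mr_type    = PACKET_MR_PROMISC;

	return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
			  &mreq, sizeof(mreq));
}
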
1da177e4
LT
3354
3355static int
b7058842 3356packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3357{
3358 struct sock *sk = sock->sk;
8dc41944 3359 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3360 int ret;
3361
3362 if (level != SOL_PACKET)
3363 return -ENOPROTOOPT;
3364
69e3c75f 3365 switch (optname) {
1ce4f28b 3366 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3367 case PACKET_DROP_MEMBERSHIP:
3368 {
0fb375fb
EB
3369 struct packet_mreq_max mreq;
3370 int len = optlen;
3371 memset(&mreq, 0, sizeof(mreq));
3372 if (len < sizeof(struct packet_mreq))
1da177e4 3373 return -EINVAL;
0fb375fb
EB
3374 if (len > sizeof(mreq))
3375 len = sizeof(mreq);
40d4e3df 3376 if (copy_from_user(&mreq, optval, len))
1da177e4 3377 return -EFAULT;
0fb375fb
EB
3378 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3379 return -EINVAL;
1da177e4
LT
3380 if (optname == PACKET_ADD_MEMBERSHIP)
3381 ret = packet_mc_add(sk, &mreq);
3382 else
3383 ret = packet_mc_drop(sk, &mreq);
3384 return ret;
3385 }
a2efcfa0 3386
1da177e4 3387 case PACKET_RX_RING:
69e3c75f 3388 case PACKET_TX_RING:
1da177e4 3389 {
f6fb8f10 3390 union tpacket_req_u req_u;
3391 int len;
1da177e4 3392
f6fb8f10 3393 switch (po->tp_version) {
3394 case TPACKET_V1:
3395 case TPACKET_V2:
3396 len = sizeof(req_u.req);
3397 break;
3398 case TPACKET_V3:
3399 default:
3400 len = sizeof(req_u.req3);
3401 break;
3402 }
3403 if (optlen < len)
1da177e4 3404 return -EINVAL;
bfd5f4a3
SS
3405 if (pkt_sk(sk)->has_vnet_hdr)
3406 return -EINVAL;
f6fb8f10 3407 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3408 return -EFAULT;
f6fb8f10 3409 return packet_set_ring(sk, &req_u, 0,
3410 optname == PACKET_TX_RING);
1da177e4
LT
3411 }
3412 case PACKET_COPY_THRESH:
3413 {
3414 int val;
3415
40d4e3df 3416 if (optlen != sizeof(val))
1da177e4 3417 return -EINVAL;
40d4e3df 3418 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3419 return -EFAULT;
3420
3421 pkt_sk(sk)->copy_thresh = val;
3422 return 0;
3423 }
bbd6ef87
PM
3424 case PACKET_VERSION:
3425 {
3426 int val;
3427
3428 if (optlen != sizeof(val))
3429 return -EINVAL;
69e3c75f 3430 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3431 return -EBUSY;
3432 if (copy_from_user(&val, optval, sizeof(val)))
3433 return -EFAULT;
3434 switch (val) {
3435 case TPACKET_V1:
3436 case TPACKET_V2:
f6fb8f10 3437 case TPACKET_V3:
bbd6ef87
PM
3438 po->tp_version = val;
3439 return 0;
3440 default:
3441 return -EINVAL;
3442 }
3443 }
8913336a
PM
3444 case PACKET_RESERVE:
3445 {
3446 unsigned int val;
3447
3448 if (optlen != sizeof(val))
3449 return -EINVAL;
69e3c75f 3450 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3451 return -EBUSY;
3452 if (copy_from_user(&val, optval, sizeof(val)))
3453 return -EFAULT;
3454 po->tp_reserve = val;
3455 return 0;
3456 }
69e3c75f
JB
3457 case PACKET_LOSS:
3458 {
3459 unsigned int val;
3460
3461 if (optlen != sizeof(val))
3462 return -EINVAL;
3463 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3464 return -EBUSY;
3465 if (copy_from_user(&val, optval, sizeof(val)))
3466 return -EFAULT;
3467 po->tp_loss = !!val;
3468 return 0;
3469 }
8dc41944
HX
3470 case PACKET_AUXDATA:
3471 {
3472 int val;
3473
3474 if (optlen < sizeof(val))
3475 return -EINVAL;
3476 if (copy_from_user(&val, optval, sizeof(val)))
3477 return -EFAULT;
3478
3479 po->auxdata = !!val;
3480 return 0;
3481 }
80feaacb
PWJ
3482 case PACKET_ORIGDEV:
3483 {
3484 int val;
3485
3486 if (optlen < sizeof(val))
3487 return -EINVAL;
3488 if (copy_from_user(&val, optval, sizeof(val)))
3489 return -EFAULT;
3490
3491 po->origdev = !!val;
3492 return 0;
3493 }
bfd5f4a3
SS
3494 case PACKET_VNET_HDR:
3495 {
3496 int val;
3497
3498 if (sock->type != SOCK_RAW)
3499 return -EINVAL;
3500 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3501 return -EBUSY;
3502 if (optlen < sizeof(val))
3503 return -EINVAL;
3504 if (copy_from_user(&val, optval, sizeof(val)))
3505 return -EFAULT;
3506
3507 po->has_vnet_hdr = !!val;
3508 return 0;
3509 }
614f60fa
SM
3510 case PACKET_TIMESTAMP:
3511 {
3512 int val;
3513
3514 if (optlen != sizeof(val))
3515 return -EINVAL;
3516 if (copy_from_user(&val, optval, sizeof(val)))
3517 return -EFAULT;
3518
3519 po->tp_tstamp = val;
3520 return 0;
3521 }
dc99f600
DM
3522 case PACKET_FANOUT:
3523 {
3524 int val;
3525
3526 if (optlen != sizeof(val))
3527 return -EINVAL;
3528 if (copy_from_user(&val, optval, sizeof(val)))
3529 return -EFAULT;
3530
3531 return fanout_add(sk, val & 0xffff, val >> 16);
3532 }
5920cd3a
PC
3533 case PACKET_TX_HAS_OFF:
3534 {
3535 unsigned int val;
3536
3537 if (optlen != sizeof(val))
3538 return -EINVAL;
3539 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3540 return -EBUSY;
3541 if (copy_from_user(&val, optval, sizeof(val)))
3542 return -EFAULT;
3543 po->tp_tx_has_off = !!val;
3544 return 0;
3545 }
d346a3fa
DB
3546 case PACKET_QDISC_BYPASS:
3547 {
3548 int val;
3549
3550 if (optlen != sizeof(val))
3551 return -EINVAL;
3552 if (copy_from_user(&val, optval, sizeof(val)))
3553 return -EFAULT;
3554
3555 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3556 return 0;
3557 }
1da177e4
LT
3558 default:
3559 return -ENOPROTOOPT;
3560 }
3561}
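
/*
 * Illustrative userspace sketch (not part of af_packet.c): as the
 * PACKET_FANOUT case above shows, the option value packs a 16-bit group
 * id in the low half and the fanout mode in the high half. Joining a
 * hash-based fanout group might look like this; the group id is arbitrary
 * and error handling is omitted.
 */
#include <sys/socket.h>
#include <linux/if_packet.h>

static int join_fanout_group(int fd, unsigned short group_id)
{
	int val = group_id | (PACKET_FANOUT_HASH << 16);

	return setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
}
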
3562
3563static int packet_getsockopt(struct socket *sock, int level, int optname,
3564 char __user *optval, int __user *optlen)
3565{
3566 int len;
c06fff6e 3567 int val, lv = sizeof(val);
1da177e4
LT
3568 struct sock *sk = sock->sk;
3569 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3570 void *data = &val;
ee80fbf3 3571 union tpacket_stats_u st;
a9b63918 3572 struct tpacket_rollover_stats rstats;
1da177e4
LT
3573
3574 if (level != SOL_PACKET)
3575 return -ENOPROTOOPT;
3576
8ae55f04
KK
3577 if (get_user(len, optlen))
3578 return -EFAULT;
1da177e4
LT
3579
3580 if (len < 0)
3581 return -EINVAL;
1ce4f28b 3582
69e3c75f 3583 switch (optname) {
1da177e4 3584 case PACKET_STATISTICS:
1da177e4 3585 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3586 memcpy(&st, &po->stats, sizeof(st));
3587 memset(&po->stats, 0, sizeof(po->stats));
3588 spin_unlock_bh(&sk->sk_receive_queue.lock);
3589
f6fb8f10 3590 if (po->tp_version == TPACKET_V3) {
c06fff6e 3591 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3592 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3593 data = &st.stats3;
f6fb8f10 3594 } else {
c06fff6e 3595 lv = sizeof(struct tpacket_stats);
8bcdeaff 3596 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3597 data = &st.stats1;
f6fb8f10 3598 }
ee80fbf3 3599
8dc41944
HX
3600 break;
3601 case PACKET_AUXDATA:
8dc41944 3602 val = po->auxdata;
80feaacb
PWJ
3603 break;
3604 case PACKET_ORIGDEV:
80feaacb 3605 val = po->origdev;
bfd5f4a3
SS
3606 break;
3607 case PACKET_VNET_HDR:
bfd5f4a3 3608 val = po->has_vnet_hdr;
1da177e4 3609 break;
bbd6ef87 3610 case PACKET_VERSION:
bbd6ef87 3611 val = po->tp_version;
bbd6ef87
PM
3612 break;
3613 case PACKET_HDRLEN:
3614 if (len > sizeof(int))
3615 len = sizeof(int);
3616 if (copy_from_user(&val, optval, len))
3617 return -EFAULT;
3618 switch (val) {
3619 case TPACKET_V1:
3620 val = sizeof(struct tpacket_hdr);
3621 break;
3622 case TPACKET_V2:
3623 val = sizeof(struct tpacket2_hdr);
3624 break;
f6fb8f10 3625 case TPACKET_V3:
3626 val = sizeof(struct tpacket3_hdr);
3627 break;
bbd6ef87
PM
3628 default:
3629 return -EINVAL;
3630 }
bbd6ef87 3631 break;
8913336a 3632 case PACKET_RESERVE:
8913336a 3633 val = po->tp_reserve;
8913336a 3634 break;
69e3c75f 3635 case PACKET_LOSS:
69e3c75f 3636 val = po->tp_loss;
69e3c75f 3637 break;
614f60fa 3638 case PACKET_TIMESTAMP:
614f60fa 3639 val = po->tp_tstamp;
614f60fa 3640 break;
dc99f600 3641 case PACKET_FANOUT:
dc99f600
DM
3642 val = (po->fanout ?
3643 ((u32)po->fanout->id |
77f65ebd
WB
3644 ((u32)po->fanout->type << 16) |
3645 ((u32)po->fanout->flags << 24)) :
dc99f600 3646 0);
dc99f600 3647 break;
a9b63918
WB
3648 case PACKET_ROLLOVER_STATS:
3649 if (!po->rollover)
3650 return -EINVAL;
3651 rstats.tp_all = atomic_long_read(&po->rollover->num);
3652 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3653 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3654 data = &rstats;
3655 lv = sizeof(rstats);
3656 break;
5920cd3a
PC
3657 case PACKET_TX_HAS_OFF:
3658 val = po->tp_tx_has_off;
3659 break;
d346a3fa
DB
3660 case PACKET_QDISC_BYPASS:
3661 val = packet_use_direct_xmit(po);
3662 break;
1da177e4
LT
3663 default:
3664 return -ENOPROTOOPT;
3665 }
3666
c06fff6e
ED
3667 if (len > lv)
3668 len = lv;
8ae55f04
KK
3669 if (put_user(len, optlen))
3670 return -EFAULT;
8dc41944
HX
3671 if (copy_to_user(optval, data, len))
3672 return -EFAULT;
8ae55f04 3673 return 0;
1da177e4
LT
3674}
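
/*
 * Illustrative userspace sketch (not part of af_packet.c): reading the
 * counters that packet_getsockopt() returns for PACKET_STATISTICS. As the
 * code above shows, the counters are cleared on read and tp_packets
 * includes the drops; TPACKET_V3 sockets get struct tpacket_stats_v3
 * instead. Error handling is omitted.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void print_stats(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
		printf("packets %u, drops %u\n", st.tp_packets, st.tp_drops);
}
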
3675
3676
351638e7
JP
3677static int packet_notifier(struct notifier_block *this,
3678 unsigned long msg, void *ptr)
1da177e4
LT
3679{
3680 struct sock *sk;
351638e7 3681 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 3682 struct net *net = dev_net(dev);
1da177e4 3683
808f5114 3684 rcu_read_lock();
b67bfe0d 3685 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3686 struct packet_sock *po = pkt_sk(sk);
3687
3688 switch (msg) {
3689 case NETDEV_UNREGISTER:
1da177e4 3690 if (po->mclist)
82f17091 3691 packet_dev_mclist_delete(dev, &po->mclist);
a2efcfa0
DM
3692 /* fallthrough */
3693
1da177e4
LT
3694 case NETDEV_DOWN:
3695 if (dev->ifindex == po->ifindex) {
3696 spin_lock(&po->bind_lock);
3697 if (po->running) {
ce06b03e 3698 __unregister_prot_hook(sk, false);
1da177e4
LT
3699 sk->sk_err = ENETDOWN;
3700 if (!sock_flag(sk, SOCK_DEAD))
3701 sk->sk_error_report(sk);
3702 }
3703 if (msg == NETDEV_UNREGISTER) {
66e56cd4 3704 packet_cached_dev_reset(po);
1da177e4 3705 po->ifindex = -1;
160ff18a
BG
3706 if (po->prot_hook.dev)
3707 dev_put(po->prot_hook.dev);
1da177e4
LT
3708 po->prot_hook.dev = NULL;
3709 }
3710 spin_unlock(&po->bind_lock);
3711 }
3712 break;
3713 case NETDEV_UP:
808f5114 3714 if (dev->ifindex == po->ifindex) {
3715 spin_lock(&po->bind_lock);
ce06b03e
DM
3716 if (po->num)
3717 register_prot_hook(sk);
808f5114 3718 spin_unlock(&po->bind_lock);
1da177e4 3719 }
1da177e4
LT
3720 break;
3721 }
3722 }
808f5114 3723 rcu_read_unlock();
1da177e4
LT
3724 return NOTIFY_DONE;
3725}
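
When the bound interface goes down, the notifier above unhooks the socket and latches ENETDOWN into sk_err, so the pending error is reported once on the next socket call. A hedged sketch of how that typically surfaces in user space ("fd" and the buffer are illustrative):

/* Sketch: a receive on a socket whose interface just went down is expected
 * to fail once with errno == ENETDOWN, because packet_notifier() stored the
 * error in sk_err above.
 */
#include <errno.h>
#include <stdio.h>
#include <sys/socket.h>

static void rx_once(int fd)
{
	char buf[2048];
	ssize_t n = recvfrom(fd, buf, sizeof(buf), 0, NULL, NULL);

	if (n < 0 && errno == ENETDOWN)
		fprintf(stderr, "bound interface went down\n");
}
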
3726
3727
3728static int packet_ioctl(struct socket *sock, unsigned int cmd,
3729 unsigned long arg)
3730{
3731 struct sock *sk = sock->sk;
3732
69e3c75f 3733 switch (cmd) {
40d4e3df
ED
3734 case SIOCOUTQ:
3735 {
3736 int amount = sk_wmem_alloc_get(sk);
31e6d363 3737
40d4e3df
ED
3738 return put_user(amount, (int __user *)arg);
3739 }
3740 case SIOCINQ:
3741 {
3742 struct sk_buff *skb;
3743 int amount = 0;
3744
3745 spin_lock_bh(&sk->sk_receive_queue.lock);
3746 skb = skb_peek(&sk->sk_receive_queue);
3747 if (skb)
3748 amount = skb->len;
3749 spin_unlock_bh(&sk->sk_receive_queue.lock);
3750 return put_user(amount, (int __user *)arg);
3751 }
3752 case SIOCGSTAMP:
3753 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3754 case SIOCGSTAMPNS:
3755 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3756
1da177e4 3757#ifdef CONFIG_INET
40d4e3df
ED
3758 case SIOCADDRT:
3759 case SIOCDELRT:
3760 case SIOCDARP:
3761 case SIOCGARP:
3762 case SIOCSARP:
3763 case SIOCGIFADDR:
3764 case SIOCSIFADDR:
3765 case SIOCGIFBRDADDR:
3766 case SIOCSIFBRDADDR:
3767 case SIOCGIFNETMASK:
3768 case SIOCSIFNETMASK:
3769 case SIOCGIFDSTADDR:
3770 case SIOCSIFDSTADDR:
3771 case SIOCSIFFLAGS:
40d4e3df 3772 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3773#endif
3774
40d4e3df
ED
3775 default:
3776 return -ENOIOCTLCMD;
1da177e4
LT
3777 }
3778 return 0;
3779}
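
As packet_ioctl() above shows, SIOCINQ reports the length of the frame at the head of the receive queue (0 when it is empty), not the total backlog, and SIOCOUTQ reports the bytes still accounted in sk_wmem_alloc. A small sketch against a hypothetical, already-open packet socket "fd":

/* Sketch: query queue occupancy via the ioctl cases handled above. */
#include <linux/sockios.h>      /* SIOCINQ, SIOCOUTQ */
#include <stdio.h>
#include <sys/ioctl.h>

static void report_queues(int fd)
{
	int inq = 0, outq = 0;

	if (ioctl(fd, SIOCINQ, &inq) == 0 && ioctl(fd, SIOCOUTQ, &outq) == 0)
		printf("next rx frame: %d bytes, unsent tx: %d bytes\n",
		       inq, outq);
}
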
3780
40d4e3df 3781static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3782 poll_table *wait)
3783{
3784 struct sock *sk = sock->sk;
3785 struct packet_sock *po = pkt_sk(sk);
3786 unsigned int mask = datagram_poll(file, sock, wait);
3787
3788 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3789 if (po->rx_ring.pg_vec) {
f6fb8f10 3790 if (!packet_previous_rx_frame(po, &po->rx_ring,
3791 TP_STATUS_KERNEL))
1da177e4
LT
3792 mask |= POLLIN | POLLRDNORM;
3793 }
2ccdbaa6 3794 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
54d7c01d 3795 po->pressure = 0;
1da177e4 3796 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3797 spin_lock_bh(&sk->sk_write_queue.lock);
3798 if (po->tx_ring.pg_vec) {
3799 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3800 mask |= POLLOUT | POLLWRNORM;
3801 }
3802 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3803 return mask;
3804}
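
packet_poll() above flags POLLIN/POLLRDNORM once an RX-ring frame belongs to user space and POLLOUT/POLLWRNORM once a TX-ring slot is TP_STATUS_AVAILABLE, so a ring consumer can block in an ordinary poll() instead of spinning on tp_status. A minimal sketch, assuming "fd" is a packet socket with rings configured:

/* Sketch: wait for RX or TX ring progress as signalled by packet_poll(). */
#include <poll.h>

static int wait_for_ring(int fd, int timeout_ms)
{
	struct pollfd pfd = {
		.fd     = fd,
		.events = POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM,
	};

	return poll(&pfd, 1, timeout_ms);	/* <0 error, 0 timeout, >0 ready */
}
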
3805
3806
 3807/* Dirty? Well, I still haven't found a better way to account
 3808 * for user mmaps.
 3809 */
3810
3811static void packet_mm_open(struct vm_area_struct *vma)
3812{
3813 struct file *file = vma->vm_file;
40d4e3df 3814 struct socket *sock = file->private_data;
1da177e4 3815 struct sock *sk = sock->sk;
1ce4f28b 3816
1da177e4
LT
3817 if (sk)
3818 atomic_inc(&pkt_sk(sk)->mapped);
3819}
3820
3821static void packet_mm_close(struct vm_area_struct *vma)
3822{
3823 struct file *file = vma->vm_file;
40d4e3df 3824 struct socket *sock = file->private_data;
1da177e4 3825 struct sock *sk = sock->sk;
1ce4f28b 3826
1da177e4
LT
3827 if (sk)
3828 atomic_dec(&pkt_sk(sk)->mapped);
3829}
3830
f0f37e2f 3831static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3832 .open = packet_mm_open,
3833 .close = packet_mm_close,
1da177e4
LT
3834};
3835
0e3125c7
NH
3836static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3837 unsigned int len)
1da177e4
LT
3838{
3839 int i;
3840
4ebf0ae2 3841 for (i = 0; i < len; i++) {
0e3125c7 3842 if (likely(pg_vec[i].buffer)) {
c56b4d90 3843 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3844 vfree(pg_vec[i].buffer);
3845 else
3846 free_pages((unsigned long)pg_vec[i].buffer,
3847 order);
3848 pg_vec[i].buffer = NULL;
3849 }
1da177e4
LT
3850 }
3851 kfree(pg_vec);
3852}
3853
eea49cc9 3854static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3855{
f0d4eb29 3856 char *buffer;
0e3125c7
NH
3857 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3858 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3859
3860 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
3861 if (buffer)
3862 return buffer;
3863
f0d4eb29 3864 /* __get_free_pages failed, fall back to vmalloc */
bbce5a59 3865 buffer = vzalloc((1 << order) * PAGE_SIZE);
0e3125c7
NH
3866 if (buffer)
3867 return buffer;
3868
f0d4eb29 3869 /* vmalloc failed, let's dig into swap here */
0e3125c7 3870 gfp_flags &= ~__GFP_NORETRY;
f0d4eb29 3871 buffer = (char *) __get_free_pages(gfp_flags, order);
0e3125c7
NH
3872 if (buffer)
3873 return buffer;
3874
f0d4eb29 3875 /* complete and utter failure */
0e3125c7 3876 return NULL;
4ebf0ae2
DM
3877}
3878
0e3125c7 3879static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3880{
3881 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3882 struct pgv *pg_vec;
4ebf0ae2
DM
3883 int i;
3884
0e3125c7 3885 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3886 if (unlikely(!pg_vec))
3887 goto out;
3888
3889 for (i = 0; i < block_nr; i++) {
c56b4d90 3890 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3891 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3892 goto out_free_pgvec;
3893 }
3894
3895out:
3896 return pg_vec;
3897
3898out_free_pgvec:
3899 free_pg_vec(pg_vec, order, block_nr);
3900 pg_vec = NULL;
3901 goto out;
3902}
1da177e4 3903
f6fb8f10 3904static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 3905 int closing, int tx_ring)
1da177e4 3906{
0e3125c7 3907 struct pgv *pg_vec = NULL;
1da177e4 3908 struct packet_sock *po = pkt_sk(sk);
0e11c91e 3909 int was_running, order = 0;
69e3c75f
JB
3910 struct packet_ring_buffer *rb;
3911 struct sk_buff_head *rb_queue;
0e11c91e 3912 __be16 num;
f6fb8f10 3913 int err = -EINVAL;
 3914 /* Local alias added to keep code churn minimal */
3915 struct tpacket_req *req = &req_u->req;
3916
3917 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3918 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3919 WARN(1, "Tx-ring is not supported.\n");
3920 goto out;
3921 }
1ce4f28b 3922
69e3c75f
JB
3923 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3924 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 3925
69e3c75f
JB
3926 err = -EBUSY;
3927 if (!closing) {
3928 if (atomic_read(&po->mapped))
3929 goto out;
b0138408 3930 if (packet_read_pending(rb))
69e3c75f
JB
3931 goto out;
3932 }
1da177e4 3933
69e3c75f
JB
3934 if (req->tp_block_nr) {
3935 /* Sanity tests and some calculations */
3936 err = -EBUSY;
3937 if (unlikely(rb->pg_vec))
3938 goto out;
1da177e4 3939
bbd6ef87
PM
3940 switch (po->tp_version) {
3941 case TPACKET_V1:
3942 po->tp_hdrlen = TPACKET_HDRLEN;
3943 break;
3944 case TPACKET_V2:
3945 po->tp_hdrlen = TPACKET2_HDRLEN;
3946 break;
f6fb8f10 3947 case TPACKET_V3:
3948 po->tp_hdrlen = TPACKET3_HDRLEN;
3949 break;
bbd6ef87
PM
3950 }
3951
69e3c75f 3952 err = -EINVAL;
4ebf0ae2 3953 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 3954 goto out;
4ebf0ae2 3955 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 3956 goto out;
dc808110
ED
3957 if (po->tp_version >= TPACKET_V3 &&
3958 (int)(req->tp_block_size -
3959 BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
3960 goto out;
8913336a 3961 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
3962 po->tp_reserve))
3963 goto out;
4ebf0ae2 3964 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 3965 goto out;
1da177e4 3966
69e3c75f
JB
3967 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3968 if (unlikely(rb->frames_per_block <= 0))
3969 goto out;
3970 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3971 req->tp_frame_nr))
3972 goto out;
1da177e4
LT
3973
3974 err = -ENOMEM;
4ebf0ae2
DM
3975 order = get_order(req->tp_block_size);
3976 pg_vec = alloc_pg_vec(req, order);
3977 if (unlikely(!pg_vec))
1da177e4 3978 goto out;
f6fb8f10 3979 switch (po->tp_version) {
3980 case TPACKET_V3:
3981 /* Transmit path is not supported. We checked
3982 * it above but just being paranoid
3983 */
3984 if (!tx_ring)
e8e85cc5 3985 init_prb_bdqc(po, rb, pg_vec, req_u);
d7cf0c34 3986 break;
f6fb8f10 3987 default:
3988 break;
3989 }
69e3c75f
JB
3990 }
3991 /* Done */
3992 else {
3993 err = -EINVAL;
4ebf0ae2 3994 if (unlikely(req->tp_frame_nr))
69e3c75f 3995 goto out;
1da177e4
LT
3996 }
3997
3998 lock_sock(sk);
3999
4000 /* Detach socket from network */
4001 spin_lock(&po->bind_lock);
4002 was_running = po->running;
4003 num = po->num;
4004 if (was_running) {
1da177e4 4005 po->num = 0;
ce06b03e 4006 __unregister_prot_hook(sk, false);
1da177e4
LT
4007 }
4008 spin_unlock(&po->bind_lock);
1ce4f28b 4009
1da177e4
LT
4010 synchronize_net();
4011
4012 err = -EBUSY;
905db440 4013 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
4014 if (closing || atomic_read(&po->mapped) == 0) {
4015 err = 0;
69e3c75f 4016 spin_lock_bh(&rb_queue->lock);
c053fd96 4017 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
4018 rb->frame_max = (req->tp_frame_nr - 1);
4019 rb->head = 0;
4020 rb->frame_size = req->tp_frame_size;
4021 spin_unlock_bh(&rb_queue->lock);
4022
c053fd96
CG
4023 swap(rb->pg_vec_order, order);
4024 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
4025
4026 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4027 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4028 tpacket_rcv : packet_rcv;
4029 skb_queue_purge(rb_queue);
1da177e4 4030 if (atomic_read(&po->mapped))
40d4e3df
ED
4031 pr_err("packet_mmap: vma is busy: %d\n",
4032 atomic_read(&po->mapped));
1da177e4 4033 }
905db440 4034 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4035
4036 spin_lock(&po->bind_lock);
ce06b03e 4037 if (was_running) {
1da177e4 4038 po->num = num;
ce06b03e 4039 register_prot_hook(sk);
1da177e4
LT
4040 }
4041 spin_unlock(&po->bind_lock);
f6fb8f10 4042 if (closing && (po->tp_version > TPACKET_V2)) {
4043 /* Because we don't support block-based V3 on tx-ring */
4044 if (!tx_ring)
4045 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
4046 }
1da177e4
LT
4047 release_sock(sk);
4048
1da177e4
LT
4049 if (pg_vec)
4050 free_pg_vec(pg_vec, order, req->tp_block_nr);
4051out:
4052 return err;
4053}
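
The sanity checks above pin down the ring geometry user space must request: tp_block_size a positive multiple of PAGE_SIZE, tp_frame_size aligned to TPACKET_ALIGNMENT and at least tp_hdrlen + tp_reserve, and tp_frame_nr exactly frames-per-block times tp_block_nr. A hedged sketch that satisfies those constraints (the sizes and the helper name setup_rx_ring are arbitrary examples):

/* Sketch: request a TPACKET_V2 RX ring whose geometry passes the checks
 * in packet_set_ring() above.
 */
#include <linux/if_packet.h>
#include <sys/socket.h>

static int setup_rx_ring(int fd, struct tpacket_req *req)
{
	int version = TPACKET_V2;

	/* The version must be chosen before any ring exists. */
	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
		       &version, sizeof(version)) < 0)
		return -1;

	req->tp_block_size = 1 << 16;	/* multiple of PAGE_SIZE */
	req->tp_frame_size = 1 << 11;	/* multiple of TPACKET_ALIGNMENT */
	req->tp_block_nr   = 64;
	req->tp_frame_nr   = (req->tp_block_size / req->tp_frame_size) *
			      req->tp_block_nr;

	return setsockopt(fd, SOL_PACKET, PACKET_RX_RING, req, sizeof(*req));
}
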
4054
69e3c75f
JB
4055static int packet_mmap(struct file *file, struct socket *sock,
4056 struct vm_area_struct *vma)
1da177e4
LT
4057{
4058 struct sock *sk = sock->sk;
4059 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
4060 unsigned long size, expected_size;
4061 struct packet_ring_buffer *rb;
1da177e4
LT
4062 unsigned long start;
4063 int err = -EINVAL;
4064 int i;
4065
4066 if (vma->vm_pgoff)
4067 return -EINVAL;
4068
905db440 4069 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
4070
4071 expected_size = 0;
4072 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4073 if (rb->pg_vec) {
4074 expected_size += rb->pg_vec_len
4075 * rb->pg_vec_pages
4076 * PAGE_SIZE;
4077 }
4078 }
4079
4080 if (expected_size == 0)
1da177e4 4081 goto out;
69e3c75f
JB
4082
4083 size = vma->vm_end - vma->vm_start;
4084 if (size != expected_size)
1da177e4
LT
4085 goto out;
4086
1da177e4 4087 start = vma->vm_start;
69e3c75f
JB
4088 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4089 if (rb->pg_vec == NULL)
4090 continue;
4091
4092 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
4093 struct page *page;
4094 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
4095 int pg_num;
4096
c56b4d90
CG
4097 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4098 page = pgv_to_page(kaddr);
69e3c75f
JB
4099 err = vm_insert_page(vma, start, page);
4100 if (unlikely(err))
4101 goto out;
4102 start += PAGE_SIZE;
0e3125c7 4103 kaddr += PAGE_SIZE;
69e3c75f 4104 }
4ebf0ae2 4105 }
1da177e4 4106 }
69e3c75f 4107
4ebf0ae2 4108 atomic_inc(&po->mapped);
1da177e4
LT
4109 vma->vm_ops = &packet_mmap_ops;
4110 err = 0;
4111
4112out:
905db440 4113 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
4114 return err;
4115}
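
packet_mmap() above insists on a zero page offset and a mapping length equal to the combined size of all configured rings, then inserts every ring page in order (RX ring first, then TX). A sketch continuing from the hypothetical setup_rx_ring() helper above; because that geometry makes tp_frame_size divide tp_block_size evenly, frames can be addressed back-to-back, and a production consumer would additionally insert memory barriers around the tp_status handshake:

/* Sketch: map the RX ring requested earlier and hand each frame back to
 * the kernel after consuming it.
 */
#include <linux/if_packet.h>
#include <poll.h>
#include <stdint.h>
#include <sys/mman.h>

static void rx_loop(int fd, const struct tpacket_req *req)
{
	size_t len = (size_t)req->tp_block_size * req->tp_block_nr;
	uint8_t *ring = mmap(NULL, len, PROT_READ | PROT_WRITE,
			     MAP_SHARED, fd, 0);	/* offset must be 0 */
	unsigned int i, nframes = req->tp_frame_nr;

	if (ring == MAP_FAILED)
		return;

	for (i = 0; ; i = (i + 1) % nframes) {	/* runs until killed */
		struct tpacket2_hdr *hdr =
			(void *)(ring + (size_t)i * req->tp_frame_size);

		while (!(hdr->tp_status & TP_STATUS_USER)) {
			struct pollfd pfd = { .fd = fd, .events = POLLIN };

			poll(&pfd, 1, -1);
		}
		/* Packet data starts tp_mac bytes into the frame; consume it
		 * here, then return the slot to the kernel.
		 */
		hdr->tp_status = TP_STATUS_KERNEL;
	}
}
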
1da177e4 4116
90ddc4f0 4117static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
4118 .family = PF_PACKET,
4119 .owner = THIS_MODULE,
4120 .release = packet_release,
4121 .bind = packet_bind_spkt,
4122 .connect = sock_no_connect,
4123 .socketpair = sock_no_socketpair,
4124 .accept = sock_no_accept,
4125 .getname = packet_getname_spkt,
4126 .poll = datagram_poll,
4127 .ioctl = packet_ioctl,
4128 .listen = sock_no_listen,
4129 .shutdown = sock_no_shutdown,
4130 .setsockopt = sock_no_setsockopt,
4131 .getsockopt = sock_no_getsockopt,
4132 .sendmsg = packet_sendmsg_spkt,
4133 .recvmsg = packet_recvmsg,
4134 .mmap = sock_no_mmap,
4135 .sendpage = sock_no_sendpage,
4136};
1da177e4 4137
90ddc4f0 4138static const struct proto_ops packet_ops = {
1da177e4
LT
4139 .family = PF_PACKET,
4140 .owner = THIS_MODULE,
4141 .release = packet_release,
4142 .bind = packet_bind,
4143 .connect = sock_no_connect,
4144 .socketpair = sock_no_socketpair,
4145 .accept = sock_no_accept,
1ce4f28b 4146 .getname = packet_getname,
1da177e4
LT
4147 .poll = packet_poll,
4148 .ioctl = packet_ioctl,
4149 .listen = sock_no_listen,
4150 .shutdown = sock_no_shutdown,
4151 .setsockopt = packet_setsockopt,
4152 .getsockopt = packet_getsockopt,
4153 .sendmsg = packet_sendmsg,
4154 .recvmsg = packet_recvmsg,
4155 .mmap = packet_mmap,
4156 .sendpage = sock_no_sendpage,
4157};
4158
ec1b4cf7 4159static const struct net_proto_family packet_family_ops = {
1da177e4
LT
4160 .family = PF_PACKET,
4161 .create = packet_create,
4162 .owner = THIS_MODULE,
4163};
4164
4165static struct notifier_block packet_netdev_notifier = {
40d4e3df 4166 .notifier_call = packet_notifier,
1da177e4
LT
4167};
4168
4169#ifdef CONFIG_PROC_FS
1da177e4
LT
4170
4171static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4172 __acquires(RCU)
1da177e4 4173{
e372c414 4174 struct net *net = seq_file_net(seq);
808f5114 4175
4176 rcu_read_lock();
4177 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
4178}
4179
4180static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4181{
1bf40954 4182 struct net *net = seq_file_net(seq);
808f5114 4183 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
4184}
4185
4186static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4187 __releases(RCU)
1da177e4 4188{
808f5114 4189 rcu_read_unlock();
1da177e4
LT
4190}
4191
1ce4f28b 4192static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
4193{
4194 if (v == SEQ_START_TOKEN)
4195 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4196 else {
b7ceabd9 4197 struct sock *s = sk_entry(v);
1da177e4
LT
4198 const struct packet_sock *po = pkt_sk(s);
4199
4200 seq_printf(seq,
71338aa7 4201 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4
LT
4202 s,
4203 atomic_read(&s->sk_refcnt),
4204 s->sk_type,
4205 ntohs(po->num),
4206 po->ifindex,
4207 po->running,
4208 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4209 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4210 sock_i_ino(s));
1da177e4
LT
4211 }
4212
4213 return 0;
4214}
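
The seq handler above backs /proc/net/packet, one line per packet socket under the header "sk RefCnt Type Proto Iface R Rmem User Inode". A trivial sketch that dumps it (the helper name is illustrative):

/* Sketch: print the table emitted by packet_seq_show() above. */
#include <stdio.h>

static void dump_packet_sockets(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/packet", "r");

	if (!f)
		return;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
}
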
4215
56b3d975 4216static const struct seq_operations packet_seq_ops = {
1da177e4
LT
4217 .start = packet_seq_start,
4218 .next = packet_seq_next,
4219 .stop = packet_seq_stop,
4220 .show = packet_seq_show,
4221};
4222
4223static int packet_seq_open(struct inode *inode, struct file *file)
4224{
e372c414
DL
4225 return seq_open_net(inode, file, &packet_seq_ops,
4226 sizeof(struct seq_net_private));
1da177e4
LT
4227}
4228
da7071d7 4229static const struct file_operations packet_seq_fops = {
1da177e4
LT
4230 .owner = THIS_MODULE,
4231 .open = packet_seq_open,
4232 .read = seq_read,
4233 .llseek = seq_lseek,
e372c414 4234 .release = seq_release_net,
1da177e4
LT
4235};
4236
4237#endif
4238
2c8c1e72 4239static int __net_init packet_net_init(struct net *net)
d12d01d6 4240{
0fa7fa98 4241 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4242 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4243
d4beaa66 4244 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
d12d01d6
DL
4245 return -ENOMEM;
4246
4247 return 0;
4248}
4249
2c8c1e72 4250static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4251{
ece31ffd 4252 remove_proc_entry("packet", net->proc_net);
d12d01d6
DL
4253}
4254
4255static struct pernet_operations packet_net_ops = {
4256 .init = packet_net_init,
4257 .exit = packet_net_exit,
4258};
4259
4260
1da177e4
LT
4261static void __exit packet_exit(void)
4262{
1da177e4 4263 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4264 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
4265 sock_unregister(PF_PACKET);
4266 proto_unregister(&packet_proto);
4267}
4268
4269static int __init packet_init(void)
4270{
4271 int rc = proto_register(&packet_proto, 0);
4272
4273 if (rc != 0)
4274 goto out;
4275
4276 sock_register(&packet_family_ops);
d12d01d6 4277 register_pernet_subsys(&packet_net_ops);
1da177e4 4278 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
4279out:
4280 return rc;
4281}
4282
4283module_init(packet_init);
4284module_exit(packet_exit);
4285MODULE_LICENSE("GPL");
4286MODULE_ALIAS_NETPROTO(PF_PACKET);