net/packet/af_packet.c
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PACKET - implements raw packet sockets.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
1ce4f28b 12 * Fixes:
1da177e4
LT
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
1ce4f28b 35 * Ulises Alonso : Frame number limit removal and
1da177e4 36 * packet_set_ring memory leak.
0fb375fb
EB
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
1ce4f28b 40 * byte arrays at the end of sockaddr_ll
0fb375fb 41 * and packet_mreq.
69e3c75f 42 * Johann Baudy : Added TX RING.
f6fb8f10 43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
44 * layer.
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46 *
1da177e4
LT
47 *
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
52 *
53 */
1ce4f28b 54
1da177e4 55#include <linux/types.h>
1da177e4 56#include <linux/mm.h>
4fc268d2 57#include <linux/capability.h>
1da177e4
LT
58#include <linux/fcntl.h>
59#include <linux/socket.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_packet.h>
64#include <linux/wireless.h>
ffbc6111 65#include <linux/kernel.h>
1da177e4 66#include <linux/kmod.h>
5a0e3ad6 67#include <linux/slab.h>
0e3125c7 68#include <linux/vmalloc.h>
457c4cbc 69#include <net/net_namespace.h>
1da177e4
LT
70#include <net/ip.h>
71#include <net/protocol.h>
72#include <linux/skbuff.h>
73#include <net/sock.h>
74#include <linux/errno.h>
75#include <linux/timer.h>
1da177e4
LT
76#include <asm/uaccess.h>
77#include <asm/ioctls.h>
78#include <asm/page.h>
a1f8e7f7 79#include <asm/cacheflush.h>
1da177e4
LT
80#include <asm/io.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
83#include <linux/poll.h>
84#include <linux/module.h>
85#include <linux/init.h>
905db440 86#include <linux/mutex.h>
05423b24 87#include <linux/if_vlan.h>
bfd5f4a3 88#include <linux/virtio_net.h>
ed85b565 89#include <linux/errqueue.h>
614f60fa 90#include <linux/net_tstamp.h>
5df0ddfb 91#include <linux/reciprocal_div.h>
1da177e4
LT
92#ifdef CONFIG_INET
93#include <net/inet_common.h>
94#endif
95
2787b04b
PE
96#include "internal.h"
97
1da177e4
LT
98/*
99 Assumptions:
 100 - if a device has no dev->hard_header routine, it adds and removes the ll header
 101 itself. In this case the ll header is invisible outside of the device,
 102 but higher levels should still reserve dev->hard_header_len.
 103 Some devices are clever enough to reallocate the skb when the header
 104 does not fit into the reserved space (tunnels); others are not
 105 (PPP).
 106 - a packet socket receives packets with the ll header pulled,
 107 so SOCK_RAW should push it back.
108
109On receive:
110-----------
111
112Incoming, dev->hard_header!=NULL
b0e380b1
ACM
113 mac_header -> ll header
114 data -> data
1da177e4
LT
115
116Outgoing, dev->hard_header!=NULL
b0e380b1
ACM
117 mac_header -> ll header
118 data -> ll header
1da177e4
LT
119
120Incoming, dev->hard_header==NULL
b0e380b1
ACM
 121 mac_header -> UNKNOWN position. It is very likely that it points to the ll
 122 header. PPP does this, which is wrong, because it introduces
db0c58f9 123 asymmetry between the rx and tx paths.
b0e380b1 124 data -> data
1da177e4
LT
125
126Outgoing, dev->hard_header==NULL
b0e380b1
ACM
127 mac_header -> data. ll header is still not built!
128 data -> data
1da177e4
LT
129
130Resume
 131 If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
132
133
134On transmit:
135------------
136
137dev->hard_header != NULL
b0e380b1
ACM
138 mac_header -> ll header
139 data -> ll header
1da177e4
LT
140
141dev->hard_header == NULL (ll header is added by device, we cannot control it)
b0e380b1
ACM
142 mac_header -> data
143 data -> data
1da177e4
LT
144
 145 We should set nh.raw on output to the correct position;
 146 the packet classifier depends on it.
147 */
148
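/*
 * Editor's illustrative sketch (not part of af_packet.c): how the header
 * handling described above looks from userspace.  With SOCK_RAW the bytes
 * returned by recv() start at the link-layer header; with SOCK_DGRAM the
 * ll header is removed and its fields are reported via sockaddr_ll instead.
 * Error handling omitted; ETH_P_ALL is just one possible protocol.
 */
#if 0	/* userspace example only */
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static void example_open_packet_sockets(void)
{
	/* frames include the link-layer header */
	int raw_fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	/* frames start at the network header; ll info comes via sockaddr_ll */
	int dgram_fd = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));

	(void)raw_fd;
	(void)dgram_fd;
}
#endif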
1da177e4
LT
149/* Private packet socket structures. */
150
0fb375fb
EB
151/* identical to struct packet_mreq except it has
152 * a longer address field.
153 */
40d4e3df 154struct packet_mreq_max {
0fb375fb
EB
155 int mr_ifindex;
156 unsigned short mr_type;
157 unsigned short mr_alen;
158 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 159};
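/*
 * Editor's illustrative sketch (not part of af_packet.c): packet_mreq_max
 * above mirrors the uapi struct packet_mreq used with PACKET_ADD_MEMBERSHIP /
 * PACKET_DROP_MEMBERSHIP, only with room for longer hardware addresses.
 * A typical promiscuous-mode request looks roughly like this ("eth0" is a
 * placeholder, error handling omitted):
 */
#if 0	/* userspace example only */
	struct packet_mreq mreq = {
		.mr_ifindex = if_nametoindex("eth0"),
		.mr_type    = PACKET_MR_PROMISC,
	};

	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
#endif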
a2efcfa0 160
184f489e
DB
161union tpacket_uhdr {
162 struct tpacket_hdr *h1;
163 struct tpacket2_hdr *h2;
164 struct tpacket3_hdr *h3;
165 void *raw;
166};
167
f6fb8f10 168static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f
JB
169 int closing, int tx_ring);
170
f6fb8f10 171#define V3_ALIGNMENT (8)
172
bc59ba39 173#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
f6fb8f10 174
175#define BLK_PLUS_PRIV(sz_of_priv) \
176 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
177
f6fb8f10 178#define PGV_FROM_VMALLOC 1
69e3c75f 179
f6fb8f10 180#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
181#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
182#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
183#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
184#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
185#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
186#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
187
69e3c75f
JB
188struct packet_sock;
189static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
77f65ebd
WB
190static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
191 struct packet_type *pt, struct net_device *orig_dev);
1da177e4 192
f6fb8f10 193static void *packet_previous_frame(struct packet_sock *po,
194 struct packet_ring_buffer *rb,
195 int status);
196static void packet_increment_head(struct packet_ring_buffer *buff);
bc59ba39 197static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
198 struct tpacket_block_desc *);
199static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 200 struct packet_sock *);
bc59ba39 201static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 202 struct packet_sock *, unsigned int status);
bc59ba39 203static int prb_queue_frozen(struct tpacket_kbdq_core *);
204static void prb_open_block(struct tpacket_kbdq_core *,
205 struct tpacket_block_desc *);
f6fb8f10 206static void prb_retire_rx_blk_timer_expired(unsigned long);
bc59ba39 207static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
208static void prb_init_blk_timer(struct packet_sock *,
209 struct tpacket_kbdq_core *,
210 void (*func) (unsigned long));
211static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
212static void prb_clear_rxhash(struct tpacket_kbdq_core *,
213 struct tpacket3_hdr *);
214static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
215 struct tpacket3_hdr *);
1da177e4
LT
216static void packet_flush_mclist(struct sock *sk);
217
ffbc6111
HX
218struct packet_skb_cb {
219 unsigned int origlen;
220 union {
221 struct sockaddr_pkt pkt;
222 struct sockaddr_ll ll;
223 } sa;
224};
225
226#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 227
bc59ba39 228#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 229#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 230 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 231#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 232 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 233#define GET_NEXT_PRB_BLK_NUM(x) \
234 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
235 ((x)->kactive_blk_num+1) : 0)
236
dc99f600
DM
237static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
238static void __fanout_link(struct sock *sk, struct packet_sock *po);
239
d346a3fa
DB
240static int packet_direct_xmit(struct sk_buff *skb)
241{
242 struct net_device *dev = skb->dev;
243 const struct net_device_ops *ops = dev->netdev_ops;
244 netdev_features_t features;
245 struct netdev_queue *txq;
246 u16 queue_map;
247 int ret;
248
249 if (unlikely(!netif_running(dev) ||
250 !netif_carrier_ok(dev))) {
251 kfree_skb(skb);
252 return NET_XMIT_DROP;
253 }
254
255 features = netif_skb_features(skb);
256 if (skb_needs_linearize(skb, features) &&
257 __skb_linearize(skb)) {
258 kfree_skb(skb);
259 return NET_XMIT_DROP;
260 }
261
262 queue_map = skb_get_queue_mapping(skb);
263 txq = netdev_get_tx_queue(dev, queue_map);
264
265 __netif_tx_lock_bh(txq);
266 if (unlikely(netif_xmit_frozen_or_stopped(txq))) {
267 ret = NETDEV_TX_BUSY;
268 kfree_skb(skb);
269 goto out;
270 }
271
272 ret = ops->ndo_start_xmit(skb, dev);
273 if (likely(dev_xmit_complete(ret)))
274 txq_trans_update(txq);
275 else
276 kfree_skb(skb);
277out:
278 __netif_tx_unlock_bh(txq);
279 return ret;
280}
281
66e56cd4
DB
282static struct net_device *packet_cached_dev_get(struct packet_sock *po)
283{
284 struct net_device *dev;
285
286 rcu_read_lock();
287 dev = rcu_dereference(po->cached_dev);
288 if (likely(dev))
289 dev_hold(dev);
290 rcu_read_unlock();
291
292 return dev;
293}
294
295static void packet_cached_dev_assign(struct packet_sock *po,
296 struct net_device *dev)
297{
298 rcu_assign_pointer(po->cached_dev, dev);
299}
300
301static void packet_cached_dev_reset(struct packet_sock *po)
302{
303 RCU_INIT_POINTER(po->cached_dev, NULL);
304}
305
d346a3fa
DB
306static bool packet_use_direct_xmit(const struct packet_sock *po)
307{
308 return po->xmit == packet_direct_xmit;
309}
310
311static u16 packet_pick_tx_queue(struct net_device *dev)
312{
1cbac010 313 return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
d346a3fa
DB
314}
315
ce06b03e
DM
316/* register_prot_hook must be invoked with the po->bind_lock held,
 317 * or from a context in which asynchronous accesses to the packet
 318 * socket are not possible (packet_create()).
319 */
320static void register_prot_hook(struct sock *sk)
321{
322 struct packet_sock *po = pkt_sk(sk);
e40526cb 323
ce06b03e 324 if (!po->running) {
66e56cd4 325 if (po->fanout)
dc99f600 326 __fanout_link(sk, po);
66e56cd4 327 else
dc99f600 328 dev_add_pack(&po->prot_hook);
e40526cb 329
ce06b03e
DM
330 sock_hold(sk);
331 po->running = 1;
332 }
333}
334
335/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
336 * held. If the sync parameter is true, we will temporarily drop
337 * the po->bind_lock and do a synchronize_net to make sure no
338 * asynchronous packet processing paths still refer to the elements
339 * of po->prot_hook. If the sync parameter is false, it is the
 340 * caller's responsibility to take care of this.
341 */
342static void __unregister_prot_hook(struct sock *sk, bool sync)
343{
344 struct packet_sock *po = pkt_sk(sk);
345
346 po->running = 0;
66e56cd4
DB
347
348 if (po->fanout)
dc99f600 349 __fanout_unlink(sk, po);
66e56cd4 350 else
dc99f600 351 __dev_remove_pack(&po->prot_hook);
e40526cb 352
ce06b03e
DM
353 __sock_put(sk);
354
355 if (sync) {
356 spin_unlock(&po->bind_lock);
357 synchronize_net();
358 spin_lock(&po->bind_lock);
359 }
360}
361
362static void unregister_prot_hook(struct sock *sk, bool sync)
363{
364 struct packet_sock *po = pkt_sk(sk);
365
366 if (po->running)
367 __unregister_prot_hook(sk, sync);
368}
369
f6dafa95 370static inline __pure struct page *pgv_to_page(void *addr)
0af55bb5
CG
371{
372 if (is_vmalloc_addr(addr))
373 return vmalloc_to_page(addr);
374 return virt_to_page(addr);
375}
376
69e3c75f 377static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 378{
184f489e 379 union tpacket_uhdr h;
1da177e4 380
69e3c75f 381 h.raw = frame;
bbd6ef87
PM
382 switch (po->tp_version) {
383 case TPACKET_V1:
69e3c75f 384 h.h1->tp_status = status;
0af55bb5 385 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
386 break;
387 case TPACKET_V2:
69e3c75f 388 h.h2->tp_status = status;
0af55bb5 389 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 390 break;
f6fb8f10 391 case TPACKET_V3:
69e3c75f 392 default:
f6fb8f10 393 WARN(1, "TPACKET version not supported.\n");
69e3c75f 394 BUG();
bbd6ef87 395 }
69e3c75f
JB
396
397 smp_wmb();
bbd6ef87
PM
398}
399
69e3c75f 400static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87 401{
184f489e 402 union tpacket_uhdr h;
bbd6ef87 403
69e3c75f
JB
404 smp_rmb();
405
bbd6ef87
PM
406 h.raw = frame;
407 switch (po->tp_version) {
408 case TPACKET_V1:
0af55bb5 409 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 410 return h.h1->tp_status;
bbd6ef87 411 case TPACKET_V2:
0af55bb5 412 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 413 return h.h2->tp_status;
f6fb8f10 414 case TPACKET_V3:
69e3c75f 415 default:
f6fb8f10 416 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
417 BUG();
418 return 0;
bbd6ef87 419 }
1da177e4 420}
69e3c75f 421
b9c32fb2
DB
422static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
423 unsigned int flags)
7a51384c
DB
424{
425 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
426
427 if (shhwtstamps) {
428 if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) &&
429 ktime_to_timespec_cond(shhwtstamps->syststamp, ts))
b9c32fb2 430 return TP_STATUS_TS_SYS_HARDWARE;
7a51384c
DB
431 if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
432 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
b9c32fb2 433 return TP_STATUS_TS_RAW_HARDWARE;
7a51384c
DB
434 }
435
436 if (ktime_to_timespec_cond(skb->tstamp, ts))
b9c32fb2 437 return TP_STATUS_TS_SOFTWARE;
7a51384c 438
b9c32fb2 439 return 0;
7a51384c
DB
440}
441
b9c32fb2
DB
442static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
443 struct sk_buff *skb)
2e31396f
WB
444{
445 union tpacket_uhdr h;
446 struct timespec ts;
b9c32fb2 447 __u32 ts_status;
2e31396f 448
b9c32fb2
DB
449 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
450 return 0;
2e31396f
WB
451
452 h.raw = frame;
453 switch (po->tp_version) {
454 case TPACKET_V1:
455 h.h1->tp_sec = ts.tv_sec;
456 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
457 break;
458 case TPACKET_V2:
459 h.h2->tp_sec = ts.tv_sec;
460 h.h2->tp_nsec = ts.tv_nsec;
461 break;
462 case TPACKET_V3:
463 default:
464 WARN(1, "TPACKET version not supported.\n");
465 BUG();
466 }
467
468 /* one flush is safe, as both fields always lie on the same cacheline */
469 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
470 smp_wmb();
b9c32fb2
DB
471
472 return ts_status;
2e31396f
WB
473}
474
69e3c75f
JB
475static void *packet_lookup_frame(struct packet_sock *po,
476 struct packet_ring_buffer *rb,
477 unsigned int position,
478 int status)
479{
480 unsigned int pg_vec_pos, frame_offset;
184f489e 481 union tpacket_uhdr h;
69e3c75f
JB
482
483 pg_vec_pos = position / rb->frames_per_block;
484 frame_offset = position % rb->frames_per_block;
485
0e3125c7
NH
486 h.raw = rb->pg_vec[pg_vec_pos].buffer +
487 (frame_offset * rb->frame_size);
69e3c75f
JB
488
489 if (status != __packet_get_status(po, h.raw))
490 return NULL;
491
492 return h.raw;
493}
494
eea49cc9 495static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
496 struct packet_ring_buffer *rb,
497 int status)
498{
499 return packet_lookup_frame(po, rb, rb->head, status);
500}
501
bc59ba39 502static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 503{
504 del_timer_sync(&pkc->retire_blk_timer);
505}
506
507static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
508 int tx_ring,
509 struct sk_buff_head *rb_queue)
510{
bc59ba39 511 struct tpacket_kbdq_core *pkc;
f6fb8f10 512
22781a5b
DJ
513 pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
514 GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 515
ec6f809f 516 spin_lock_bh(&rb_queue->lock);
f6fb8f10 517 pkc->delete_blk_timer = 1;
ec6f809f 518 spin_unlock_bh(&rb_queue->lock);
f6fb8f10 519
520 prb_del_retire_blk_timer(pkc);
521}
522
523static void prb_init_blk_timer(struct packet_sock *po,
bc59ba39 524 struct tpacket_kbdq_core *pkc,
f6fb8f10 525 void (*func) (unsigned long))
526{
527 init_timer(&pkc->retire_blk_timer);
528 pkc->retire_blk_timer.data = (long)po;
529 pkc->retire_blk_timer.function = func;
530 pkc->retire_blk_timer.expires = jiffies;
531}
532
533static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
534{
bc59ba39 535 struct tpacket_kbdq_core *pkc;
f6fb8f10 536
537 if (tx_ring)
538 BUG();
539
22781a5b
DJ
540 pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
541 GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 542 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
543}
544
545static int prb_calc_retire_blk_tmo(struct packet_sock *po,
546 int blk_size_in_bytes)
547{
548 struct net_device *dev;
549 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
4bc71cb9
JP
550 struct ethtool_cmd ecmd;
551 int err;
e440cf2c 552 u32 speed;
f6fb8f10 553
4bc71cb9
JP
554 rtnl_lock();
555 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
556 if (unlikely(!dev)) {
557 rtnl_unlock();
f6fb8f10 558 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9
JP
559 }
560 err = __ethtool_get_settings(dev, &ecmd);
e440cf2c 561 speed = ethtool_cmd_speed(&ecmd);
4bc71cb9
JP
562 rtnl_unlock();
563 if (!err) {
4bc71cb9
JP
564 /*
 565 * If the link speed is that slow, we don't really
 566 * need to worry about performance anyway.
567 */
e440cf2c 568 if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
4bc71cb9 569 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 570 } else {
571 msec = 1;
572 div = speed / 1000;
f6fb8f10 573 }
574 }
575
576 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
577
578 if (div)
579 mbits /= div;
580
581 tmo = mbits * msec;
582
583 if (div)
584 return tmo+1;
585 return tmo;
586}
587
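/*
 * Editor's note (not part of af_packet.c): worked example of the arithmetic
 * in prb_calc_retire_blk_tmo() above.  For a 1MB block on a 1Gbps link,
 * msec = 1 and div = 1000 / 1000 = 1, so
 *	mbits = (1048576 * 8) / (1024 * 1024) = 8
 *	tmo   = (mbits / div) * msec + 1     = 9 ms
 * i.e. the retire timer is set just past the ~8 ms it takes to fill a block,
 * matching the "Timer logic" comment further below.
 */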
bc59ba39 588static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 589 union tpacket_req_u *req_u)
590{
591 p1->feature_req_word = req_u->req3.tp_feature_req_word;
592}
593
594static void init_prb_bdqc(struct packet_sock *po,
595 struct packet_ring_buffer *rb,
596 struct pgv *pg_vec,
597 union tpacket_req_u *req_u, int tx_ring)
598{
22781a5b 599 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
bc59ba39 600 struct tpacket_block_desc *pbd;
f6fb8f10 601
602 memset(p1, 0x0, sizeof(*p1));
603
604 p1->knxt_seq_num = 1;
605 p1->pkbdq = pg_vec;
bc59ba39 606 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 607 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 608 p1->kblk_size = req_u->req3.tp_block_size;
609 p1->knum_blocks = req_u->req3.tp_block_nr;
610 p1->hdrlen = po->tp_hdrlen;
611 p1->version = po->tp_version;
612 p1->last_kactive_blk_num = 0;
ee80fbf3 613 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 614 if (req_u->req3.tp_retire_blk_tov)
615 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
616 else
617 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
618 req_u->req3.tp_block_size);
619 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
620 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
621
622 prb_init_ft_ops(p1, req_u);
623 prb_setup_retire_blk_timer(po, tx_ring);
624 prb_open_block(p1, pbd);
625}
626
627/* Do NOT update the last_blk_num first.
628 * Assumes sk_buff_head lock is held.
629 */
bc59ba39 630static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 631{
632 mod_timer(&pkc->retire_blk_timer,
633 jiffies + pkc->tov_in_jiffies);
634 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
635}
636
637/*
638 * Timer logic:
639 * 1) We refresh the timer only when we open a block.
640 * By doing this we don't waste cycles refreshing the timer
 641 * on a packet-by-packet basis.
642 *
643 * With a 1MB block-size, on a 1Gbps line, it will take
644 * i) ~8 ms to fill a block + ii) memcpy etc.
645 * In this cut we are not accounting for the memcpy time.
646 *
647 * So, if the user sets the 'tmo' to 10ms then the timer
648 * will never fire while the block is still getting filled
649 * (which is what we want). However, the user could choose
650 * to close a block early and that's fine.
651 *
652 * But when the timer does fire, we check whether or not to refresh it.
653 * Since the tmo granularity is in msecs, it is not too expensive
 654 * to refresh the timer, let's say every 8 msecs.
655 * Either the user can set the 'tmo' or we can derive it based on
656 * a) line-speed and b) block-size.
657 * prb_calc_retire_blk_tmo() calculates the tmo.
658 *
659 */
660static void prb_retire_rx_blk_timer_expired(unsigned long data)
661{
662 struct packet_sock *po = (struct packet_sock *)data;
22781a5b 663 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 664 unsigned int frozen;
bc59ba39 665 struct tpacket_block_desc *pbd;
f6fb8f10 666
667 spin_lock(&po->sk.sk_receive_queue.lock);
668
669 frozen = prb_queue_frozen(pkc);
670 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
671
672 if (unlikely(pkc->delete_blk_timer))
673 goto out;
674
675 /* We only need to plug the race when the block is partially filled.
676 * tpacket_rcv:
677 * lock(); increment BLOCK_NUM_PKTS; unlock()
678 * copy_bits() is in progress ...
679 * timer fires on other cpu:
680 * we can't retire the current block because copy_bits
681 * is in progress.
682 *
683 */
684 if (BLOCK_NUM_PKTS(pbd)) {
685 while (atomic_read(&pkc->blk_fill_in_prog)) {
686 /* Waiting for skb_copy_bits to finish... */
687 cpu_relax();
688 }
689 }
690
691 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
692 if (!frozen) {
693 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
694 if (!prb_dispatch_next_block(pkc, po))
695 goto refresh_timer;
696 else
697 goto out;
698 } else {
699 /* Case 1. Queue was frozen because user-space was
700 * lagging behind.
701 */
702 if (prb_curr_blk_in_use(pkc, pbd)) {
703 /*
704 * Ok, user-space is still behind.
705 * So just refresh the timer.
706 */
707 goto refresh_timer;
708 } else {
 709 /* Case 2. The queue was frozen, user-space caught up,
 710 * now the link went idle && the timer fired.
 711 * We don't have a block to close. So we open this
 712 * block and restart the timer.
 713 * Opening a block thaws the queue and restarts the timer.
714 * Thawing/timer-refresh is a side effect.
715 */
716 prb_open_block(pkc, pbd);
717 goto out;
718 }
719 }
720 }
721
722refresh_timer:
723 _prb_refresh_rx_retire_blk_timer(pkc);
724
725out:
726 spin_unlock(&po->sk.sk_receive_queue.lock);
727}
728
eea49cc9 729static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 730 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 731{
732 /* Flush everything minus the block header */
733
734#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
735 u8 *start, *end;
736
737 start = (u8 *)pbd1;
738
 739 /* Skip the block header (we know the header WILL fit in 4K) */
740 start += PAGE_SIZE;
741
742 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
743 for (; start < end; start += PAGE_SIZE)
744 flush_dcache_page(pgv_to_page(start));
745
746 smp_wmb();
747#endif
748
749 /* Now update the block status. */
750
751 BLOCK_STATUS(pbd1) = status;
752
753 /* Flush the block header */
754
755#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
756 start = (u8 *)pbd1;
757 flush_dcache_page(pgv_to_page(start));
758
759 smp_wmb();
760#endif
761}
762
763/*
764 * Side effect:
765 *
766 * 1) flush the block
767 * 2) Increment active_blk_num
768 *
 769 * Note: We DON'T refresh the timer on purpose,
 770 * because almost always the next block will be opened.
771 */
bc59ba39 772static void prb_close_block(struct tpacket_kbdq_core *pkc1,
773 struct tpacket_block_desc *pbd1,
f6fb8f10 774 struct packet_sock *po, unsigned int stat)
775{
776 __u32 status = TP_STATUS_USER | stat;
777
778 struct tpacket3_hdr *last_pkt;
bc59ba39 779 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 780
ee80fbf3 781 if (po->stats.stats3.tp_drops)
f6fb8f10 782 status |= TP_STATUS_LOSING;
783
784 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
785 last_pkt->tp_next_offset = 0;
786
787 /* Get the ts of the last pkt */
788 if (BLOCK_NUM_PKTS(pbd1)) {
789 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
790 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
791 } else {
792 /* Ok, we tmo'd - so get the current time */
793 struct timespec ts;
794 getnstimeofday(&ts);
795 h1->ts_last_pkt.ts_sec = ts.tv_sec;
796 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
797 }
798
799 smp_wmb();
800
801 /* Flush the block */
802 prb_flush_block(pkc1, pbd1, status);
803
804 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
805}
806
eea49cc9 807static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 808{
809 pkc->reset_pending_on_curr_blk = 0;
810}
811
812/*
813 * Side effect of opening a block:
814 *
815 * 1) prb_queue is thawed.
816 * 2) retire_blk_timer is refreshed.
817 *
818 */
bc59ba39 819static void prb_open_block(struct tpacket_kbdq_core *pkc1,
820 struct tpacket_block_desc *pbd1)
f6fb8f10 821{
822 struct timespec ts;
bc59ba39 823 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 824
825 smp_rmb();
826
8da3056c
DB
 827 /* We could have just memset this, but we would lose the
 828 * flexibility of making the priv area sticky.
829 */
f6fb8f10 830
8da3056c
DB
831 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
832 BLOCK_NUM_PKTS(pbd1) = 0;
833 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 834
8da3056c
DB
835 getnstimeofday(&ts);
836
837 h1->ts_first_pkt.ts_sec = ts.tv_sec;
838 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
f6fb8f10 839
8da3056c
DB
840 pkc1->pkblk_start = (char *)pbd1;
841 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
842
843 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
844 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
845
846 pbd1->version = pkc1->version;
847 pkc1->prev = pkc1->nxt_offset;
848 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
849
850 prb_thaw_queue(pkc1);
851 _prb_refresh_rx_retire_blk_timer(pkc1);
852
853 smp_wmb();
f6fb8f10 854}
855
856/*
857 * Queue freeze logic:
858 * 1) Assume tp_block_nr = 8 blocks.
859 * 2) At time 't0', user opens Rx ring.
860 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
861 * 4) user-space is either sleeping or processing block '0'.
862 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 863 * it will close block-7, loop around, and try to fill block '0'.
864 * call-flow:
865 * __packet_lookup_frame_in_block
866 * prb_retire_current_block()
867 * prb_dispatch_next_block()
868 * |->(BLOCK_STATUS == USER) evaluates to true
869 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
870 * 6) Now there are two cases:
871 * 6.1) Link goes idle right after the queue is frozen.
872 * But remember, the last open_block() refreshed the timer.
 873 * When this timer expires, it will refresh itself so that we can
 874 * re-open block-0 in the near future.
875 * 6.2) Link is busy and keeps on receiving packets. This is a simple
876 * case and __packet_lookup_frame_in_block will check if block-0
877 * is free and can now be re-used.
878 */
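/*
 * Editor's illustrative sketch (not part of af_packet.c): the userspace side
 * of the freeze/thaw dance above.  With TPACKET_V3 the kernel hands a whole
 * block to userspace by setting TP_STATUS_USER in its descriptor; writing
 * TP_STATUS_KERNEL back is what lets a frozen queue re-use that block.
 * Ring setup and the pollfd ('pfd') are omitted.
 */
#if 0	/* userspace example only */
	struct tpacket_block_desc *pbd = block;	/* mmap'ed block address */

	while (!(pbd->hdr.bh1.block_status & TP_STATUS_USER))
		poll(&pfd, 1, -1);	/* wait for the kernel to close the block */

	/* ... walk pbd->hdr.bh1.num_pkts packets via offset_to_first_pkt ... */

	pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;	/* hand the block back */
#endif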
eea49cc9 879static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 880 struct packet_sock *po)
881{
882 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 883 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 884}
885
886#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
887
888/*
889 * If the next block is free then we will dispatch it
890 * and return a good offset.
891 * Else, we will freeze the queue.
892 * So, caller must check the return value.
893 */
bc59ba39 894static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 895 struct packet_sock *po)
896{
bc59ba39 897 struct tpacket_block_desc *pbd;
f6fb8f10 898
899 smp_rmb();
900
901 /* 1. Get current block num */
902 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
903
904 /* 2. If this block is currently in_use then freeze the queue */
905 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
906 prb_freeze_queue(pkc, po);
907 return NULL;
908 }
909
910 /*
911 * 3.
912 * open this block and return the offset where the first packet
913 * needs to get stored.
914 */
915 prb_open_block(pkc, pbd);
916 return (void *)pkc->nxt_offset;
917}
918
bc59ba39 919static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 920 struct packet_sock *po, unsigned int status)
921{
bc59ba39 922 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 923
924 /* retire/close the current block */
925 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
926 /*
927 * Plug the case where copy_bits() is in progress on
928 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
929 * have space to copy the pkt in the current block and
930 * called prb_retire_current_block()
931 *
932 * We don't need to worry about the TMO case because
933 * the timer-handler already handled this case.
934 */
935 if (!(status & TP_STATUS_BLK_TMO)) {
936 while (atomic_read(&pkc->blk_fill_in_prog)) {
937 /* Waiting for skb_copy_bits to finish... */
938 cpu_relax();
939 }
940 }
941 prb_close_block(pkc, pbd, po, status);
942 return;
943 }
f6fb8f10 944}
945
eea49cc9 946static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
bc59ba39 947 struct tpacket_block_desc *pbd)
f6fb8f10 948{
949 return TP_STATUS_USER & BLOCK_STATUS(pbd);
950}
951
eea49cc9 952static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 953{
954 return pkc->reset_pending_on_curr_blk;
955}
956
eea49cc9 957static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 958{
bc59ba39 959 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 960 atomic_dec(&pkc->blk_fill_in_prog);
961}
962
eea49cc9 963static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 964 struct tpacket3_hdr *ppd)
965{
3958afa1 966 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
f6fb8f10 967}
968
eea49cc9 969static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 970 struct tpacket3_hdr *ppd)
971{
972 ppd->hv1.tp_rxhash = 0;
973}
974
eea49cc9 975static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 976 struct tpacket3_hdr *ppd)
977{
978 if (vlan_tx_tag_present(pkc->skb)) {
979 ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
a0cdfcf3
AW
980 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
981 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
f6fb8f10 982 } else {
9e67030a 983 ppd->hv1.tp_vlan_tci = 0;
a0cdfcf3 984 ppd->hv1.tp_vlan_tpid = 0;
9e67030a 985 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 986 }
987}
988
bc59ba39 989static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 990 struct tpacket3_hdr *ppd)
991{
a0cdfcf3 992 ppd->hv1.tp_padding = 0;
f6fb8f10 993 prb_fill_vlan_info(pkc, ppd);
994
995 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
996 prb_fill_rxhash(pkc, ppd);
997 else
998 prb_clear_rxhash(pkc, ppd);
999}
1000
eea49cc9 1001static void prb_fill_curr_block(char *curr,
bc59ba39 1002 struct tpacket_kbdq_core *pkc,
1003 struct tpacket_block_desc *pbd,
f6fb8f10 1004 unsigned int len)
1005{
1006 struct tpacket3_hdr *ppd;
1007
1008 ppd = (struct tpacket3_hdr *)curr;
1009 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1010 pkc->prev = curr;
1011 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1012 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1013 BLOCK_NUM_PKTS(pbd) += 1;
1014 atomic_inc(&pkc->blk_fill_in_prog);
1015 prb_run_all_ft_ops(pkc, ppd);
1016}
1017
1018/* Assumes caller has the sk->rx_queue.lock */
1019static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1020 struct sk_buff *skb,
1021 int status,
1022 unsigned int len
1023 )
1024{
bc59ba39 1025 struct tpacket_kbdq_core *pkc;
1026 struct tpacket_block_desc *pbd;
f6fb8f10 1027 char *curr, *end;
1028
e3192690 1029 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 1030 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1031
1032 /* Queue is frozen when user space is lagging behind */
1033 if (prb_queue_frozen(pkc)) {
1034 /*
 1035 * Check whether the last block, which caused the queue to freeze,
 1036 * is still in use by user-space.
1037 */
1038 if (prb_curr_blk_in_use(pkc, pbd)) {
1039 /* Can't record this packet */
1040 return NULL;
1041 } else {
1042 /*
1043 * Ok, the block was released by user-space.
1044 * Now let's open that block.
 1045 * Opening a block also thaws the queue.
1046 * Thawing is a side effect.
1047 */
1048 prb_open_block(pkc, pbd);
1049 }
1050 }
1051
1052 smp_mb();
1053 curr = pkc->nxt_offset;
1054 pkc->skb = skb;
e3192690 1055 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 1056
1057 /* first try the current block */
1058 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1059 prb_fill_curr_block(curr, pkc, pbd, len);
1060 return (void *)curr;
1061 }
1062
1063 /* Ok, close the current block */
1064 prb_retire_current_block(pkc, po, 0);
1065
1066 /* Now, try to dispatch the next block */
1067 curr = (char *)prb_dispatch_next_block(pkc, po);
1068 if (curr) {
1069 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1070 prb_fill_curr_block(curr, pkc, pbd, len);
1071 return (void *)curr;
1072 }
1073
1074 /*
 1075 * No free blocks are available. User-space hasn't caught up yet.
 1076 * The queue was just frozen and now this packet will get dropped.
1077 */
1078 return NULL;
1079}
1080
eea49cc9 1081static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1082 struct sk_buff *skb,
1083 int status, unsigned int len)
1084{
1085 char *curr = NULL;
1086 switch (po->tp_version) {
1087 case TPACKET_V1:
1088 case TPACKET_V2:
1089 curr = packet_lookup_frame(po, &po->rx_ring,
1090 po->rx_ring.head, status);
1091 return curr;
1092 case TPACKET_V3:
1093 return __packet_lookup_frame_in_block(po, skb, status, len);
1094 default:
1095 WARN(1, "TPACKET version not supported\n");
1096 BUG();
99aa3473 1097 return NULL;
f6fb8f10 1098 }
1099}
1100
eea49cc9 1101static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1102 struct packet_ring_buffer *rb,
77f65ebd 1103 unsigned int idx,
f6fb8f10 1104 int status)
1105{
bc59ba39 1106 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1107 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1108
1109 if (status != BLOCK_STATUS(pbd))
1110 return NULL;
1111 return pbd;
1112}
1113
eea49cc9 1114static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1115{
1116 unsigned int prev;
1117 if (rb->prb_bdqc.kactive_blk_num)
1118 prev = rb->prb_bdqc.kactive_blk_num-1;
1119 else
1120 prev = rb->prb_bdqc.knum_blocks-1;
1121 return prev;
1122}
1123
1124/* Assumes caller has held the rx_queue.lock */
eea49cc9 1125static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1126 struct packet_ring_buffer *rb,
1127 int status)
1128{
1129 unsigned int previous = prb_previous_blk_num(rb);
1130 return prb_lookup_block(po, rb, previous, status);
1131}
1132
eea49cc9 1133static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1134 struct packet_ring_buffer *rb,
1135 int status)
1136{
1137 if (po->tp_version <= TPACKET_V2)
1138 return packet_previous_frame(po, rb, status);
1139
1140 return __prb_previous_block(po, rb, status);
1141}
1142
eea49cc9 1143static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1144 struct packet_ring_buffer *rb)
1145{
1146 switch (po->tp_version) {
1147 case TPACKET_V1:
1148 case TPACKET_V2:
1149 return packet_increment_head(rb);
1150 case TPACKET_V3:
1151 default:
1152 WARN(1, "TPACKET version not supported.\n");
1153 BUG();
1154 return;
1155 }
1156}
1157
eea49cc9 1158static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1159 struct packet_ring_buffer *rb,
1160 int status)
1161{
1162 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1163 return packet_lookup_frame(po, rb, previous, status);
1164}
1165
eea49cc9 1166static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1167{
1168 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1169}
1170
77f65ebd
WB
1171static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1172{
1173 struct sock *sk = &po->sk;
1174 bool has_room;
1175
1176 if (po->prot_hook.func != tpacket_rcv)
1177 return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
1178 <= sk->sk_rcvbuf;
1179
1180 spin_lock(&sk->sk_receive_queue.lock);
1181 if (po->tp_version == TPACKET_V3)
1182 has_room = prb_lookup_block(po, &po->rx_ring,
1183 po->rx_ring.prb_bdqc.kactive_blk_num,
1184 TP_STATUS_KERNEL);
1185 else
1186 has_room = packet_lookup_frame(po, &po->rx_ring,
1187 po->rx_ring.head,
1188 TP_STATUS_KERNEL);
1189 spin_unlock(&sk->sk_receive_queue.lock);
1190
1191 return has_room;
1192}
1193
1da177e4
LT
1194static void packet_sock_destruct(struct sock *sk)
1195{
ed85b565
RC
1196 skb_queue_purge(&sk->sk_error_queue);
1197
547b792c
IJ
1198 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1199 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1da177e4
LT
1200
1201 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1202 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1203 return;
1204 }
1205
17ab56a2 1206 sk_refcnt_debug_dec(sk);
1da177e4
LT
1207}
1208
dc99f600
DM
1209static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
1210{
1211 int x = atomic_read(&f->rr_cur) + 1;
1212
1213 if (x >= num)
1214 x = 0;
1215
1216 return x;
1217}
1218
77f65ebd
WB
1219static unsigned int fanout_demux_hash(struct packet_fanout *f,
1220 struct sk_buff *skb,
1221 unsigned int num)
dc99f600 1222{
f55d112e 1223 return reciprocal_divide(skb->rxhash, num);
dc99f600
DM
1224}
1225
77f65ebd
WB
1226static unsigned int fanout_demux_lb(struct packet_fanout *f,
1227 struct sk_buff *skb,
1228 unsigned int num)
dc99f600
DM
1229{
1230 int cur, old;
1231
1232 cur = atomic_read(&f->rr_cur);
1233 while ((old = atomic_cmpxchg(&f->rr_cur, cur,
1234 fanout_rr_next(f, num))) != cur)
1235 cur = old;
77f65ebd
WB
1236 return cur;
1237}
1238
1239static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1240 struct sk_buff *skb,
1241 unsigned int num)
1242{
1243 return smp_processor_id() % num;
dc99f600
DM
1244}
1245
5df0ddfb
DB
1246static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1247 struct sk_buff *skb,
1248 unsigned int num)
1249{
1250 return reciprocal_divide(prandom_u32(), num);
1251}
1252
77f65ebd
WB
1253static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1254 struct sk_buff *skb,
1255 unsigned int idx, unsigned int skip,
1256 unsigned int num)
95ec3eb4 1257{
77f65ebd 1258 unsigned int i, j;
95ec3eb4 1259
77f65ebd
WB
1260 i = j = min_t(int, f->next[idx], num - 1);
1261 do {
1262 if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
1263 if (i != j)
1264 f->next[idx] = i;
1265 return i;
1266 }
1267 if (++i == num)
1268 i = 0;
1269 } while (i != j);
1270
1271 return idx;
1272}
1273
1274static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1275{
1276 return f->flags & (flag >> 8);
95ec3eb4
DM
1277}
1278
95ec3eb4
DM
1279static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1280 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1281{
1282 struct packet_fanout *f = pt->af_packet_priv;
1283 unsigned int num = f->num_members;
1284 struct packet_sock *po;
77f65ebd 1285 unsigned int idx;
dc99f600
DM
1286
1287 if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
1288 !num) {
1289 kfree_skb(skb);
1290 return 0;
1291 }
1292
95ec3eb4
DM
1293 switch (f->type) {
1294 case PACKET_FANOUT_HASH:
1295 default:
77f65ebd 1296 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
bc416d97 1297 skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
95ec3eb4
DM
1298 if (!skb)
1299 return 0;
1300 }
3958afa1 1301 skb_get_hash(skb);
77f65ebd 1302 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1303 break;
1304 case PACKET_FANOUT_LB:
77f65ebd 1305 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1306 break;
1307 case PACKET_FANOUT_CPU:
77f65ebd
WB
1308 idx = fanout_demux_cpu(f, skb, num);
1309 break;
5df0ddfb
DB
1310 case PACKET_FANOUT_RND:
1311 idx = fanout_demux_rnd(f, skb, num);
1312 break;
77f65ebd
WB
1313 case PACKET_FANOUT_ROLLOVER:
1314 idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
95ec3eb4 1315 break;
dc99f600
DM
1316 }
1317
77f65ebd
WB
1318 po = pkt_sk(f->arr[idx]);
1319 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
1320 unlikely(!packet_rcv_has_room(po, skb))) {
1321 idx = fanout_demux_rollover(f, skb, idx, idx, num);
1322 po = pkt_sk(f->arr[idx]);
1323 }
dc99f600
DM
1324
1325 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1326}
1327
fff3321d
PE
1328DEFINE_MUTEX(fanout_mutex);
1329EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600
DM
1330static LIST_HEAD(fanout_list);
1331
1332static void __fanout_link(struct sock *sk, struct packet_sock *po)
1333{
1334 struct packet_fanout *f = po->fanout;
1335
1336 spin_lock(&f->lock);
1337 f->arr[f->num_members] = sk;
1338 smp_wmb();
1339 f->num_members++;
1340 spin_unlock(&f->lock);
1341}
1342
1343static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1344{
1345 struct packet_fanout *f = po->fanout;
1346 int i;
1347
1348 spin_lock(&f->lock);
1349 for (i = 0; i < f->num_members; i++) {
1350 if (f->arr[i] == sk)
1351 break;
1352 }
1353 BUG_ON(i >= f->num_members);
1354 f->arr[i] = f->arr[f->num_members - 1];
1355 f->num_members--;
1356 spin_unlock(&f->lock);
1357}
1358
d4dd8aee 1359static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
c0de08d0 1360{
d4dd8aee 1361 if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
c0de08d0
EL
1362 return true;
1363
1364 return false;
1365}
1366
7736d33f 1367static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600
DM
1368{
1369 struct packet_sock *po = pkt_sk(sk);
1370 struct packet_fanout *f, *match;
7736d33f 1371 u8 type = type_flags & 0xff;
77f65ebd 1372 u8 flags = type_flags >> 8;
dc99f600
DM
1373 int err;
1374
1375 switch (type) {
77f65ebd
WB
1376 case PACKET_FANOUT_ROLLOVER:
1377 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1378 return -EINVAL;
dc99f600
DM
1379 case PACKET_FANOUT_HASH:
1380 case PACKET_FANOUT_LB:
95ec3eb4 1381 case PACKET_FANOUT_CPU:
5df0ddfb 1382 case PACKET_FANOUT_RND:
dc99f600
DM
1383 break;
1384 default:
1385 return -EINVAL;
1386 }
1387
1388 if (!po->running)
1389 return -EINVAL;
1390
1391 if (po->fanout)
1392 return -EALREADY;
1393
1394 mutex_lock(&fanout_mutex);
1395 match = NULL;
1396 list_for_each_entry(f, &fanout_list, list) {
1397 if (f->id == id &&
1398 read_pnet(&f->net) == sock_net(sk)) {
1399 match = f;
1400 break;
1401 }
1402 }
afe62c68 1403 err = -EINVAL;
77f65ebd 1404 if (match && match->flags != flags)
afe62c68 1405 goto out;
dc99f600 1406 if (!match) {
afe62c68 1407 err = -ENOMEM;
dc99f600 1408 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1409 if (!match)
1410 goto out;
1411 write_pnet(&match->net, sock_net(sk));
1412 match->id = id;
1413 match->type = type;
77f65ebd 1414 match->flags = flags;
afe62c68
ED
1415 atomic_set(&match->rr_cur, 0);
1416 INIT_LIST_HEAD(&match->list);
1417 spin_lock_init(&match->lock);
1418 atomic_set(&match->sk_ref, 0);
1419 match->prot_hook.type = po->prot_hook.type;
1420 match->prot_hook.dev = po->prot_hook.dev;
1421 match->prot_hook.func = packet_rcv_fanout;
1422 match->prot_hook.af_packet_priv = match;
c0de08d0 1423 match->prot_hook.id_match = match_fanout_group;
afe62c68
ED
1424 dev_add_pack(&match->prot_hook);
1425 list_add(&match->list, &fanout_list);
dc99f600 1426 }
afe62c68
ED
1427 err = -EINVAL;
1428 if (match->type == type &&
1429 match->prot_hook.type == po->prot_hook.type &&
1430 match->prot_hook.dev == po->prot_hook.dev) {
1431 err = -ENOSPC;
1432 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1433 __dev_remove_pack(&po->prot_hook);
1434 po->fanout = match;
1435 atomic_inc(&match->sk_ref);
1436 __fanout_link(sk, po);
1437 err = 0;
dc99f600
DM
1438 }
1439 }
afe62c68 1440out:
dc99f600
DM
1441 mutex_unlock(&fanout_mutex);
1442 return err;
1443}
1444
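/*
 * Editor's illustrative sketch (not part of af_packet.c): how a socket joins
 * a fanout group from userspace.  The group id goes in the low 16 bits of the
 * argument and the mode/flags (the 'type_flags' parsed by fanout_add() above)
 * in the high 16 bits.  The id 42 is arbitrary; error handling omitted.
 */
#if 0	/* userspace example only */
	int fanout_arg = 42 | (PACKET_FANOUT_HASH << 16);

	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &fanout_arg, sizeof(fanout_arg));
#endif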
1445static void fanout_release(struct sock *sk)
1446{
1447 struct packet_sock *po = pkt_sk(sk);
1448 struct packet_fanout *f;
1449
1450 f = po->fanout;
1451 if (!f)
1452 return;
1453
fff3321d 1454 mutex_lock(&fanout_mutex);
dc99f600
DM
1455 po->fanout = NULL;
1456
dc99f600
DM
1457 if (atomic_dec_and_test(&f->sk_ref)) {
1458 list_del(&f->list);
1459 dev_remove_pack(&f->prot_hook);
1460 kfree(f);
1461 }
1462 mutex_unlock(&fanout_mutex);
1463}
1da177e4 1464
90ddc4f0 1465static const struct proto_ops packet_ops;
1da177e4 1466
90ddc4f0 1467static const struct proto_ops packet_ops_spkt;
1da177e4 1468
40d4e3df
ED
1469static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1470 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1471{
1472 struct sock *sk;
1473 struct sockaddr_pkt *spkt;
1474
1475 /*
 1476 * When we registered the protocol, we saved the socket in the data
1477 * field for just this event.
1478 */
1479
1480 sk = pt->af_packet_priv;
1ce4f28b 1481
1da177e4
LT
1482 /*
1483 * Yank back the headers [hope the device set this
1484 * right or kerboom...]
1485 *
1486 * Incoming packets have ll header pulled,
1487 * push it back.
1488 *
98e399f8 1489 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
 1490 * so that this procedure is a noop.
1491 */
1492
1493 if (skb->pkt_type == PACKET_LOOPBACK)
1494 goto out;
1495
09ad9bc7 1496 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1497 goto out;
1498
40d4e3df
ED
1499 skb = skb_share_check(skb, GFP_ATOMIC);
1500 if (skb == NULL)
1da177e4
LT
1501 goto oom;
1502
1503 /* drop any routing info */
adf30907 1504 skb_dst_drop(skb);
1da177e4 1505
84531c24
PO
1506 /* drop conntrack reference */
1507 nf_reset(skb);
1508
ffbc6111 1509 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1510
98e399f8 1511 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1512
1513 /*
1514 * The SOCK_PACKET socket receives _all_ frames.
1515 */
1516
1517 spkt->spkt_family = dev->type;
1518 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1519 spkt->spkt_protocol = skb->protocol;
1520
1521 /*
1522 * Charge the memory to the socket. This is done specifically
 1523 * to prevent sockets from using up all the memory.
1524 */
1525
40d4e3df 1526 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1527 return 0;
1528
1529out:
1530 kfree_skb(skb);
1531oom:
1532 return 0;
1533}
1534
1535
1536/*
1537 * Output a raw packet to a device layer. This bypasses all the other
1538 * protocol layers and you must therefore supply it with a complete frame
1539 */
1ce4f28b 1540
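/*
 * Editor's illustrative sketch (not part of af_packet.c): the "complete
 * frame" contract described above, shown with the modern SOCK_RAW/sockaddr_ll
 * API rather than this legacy SOCK_PACKET path.  'frame' must already start
 * with the link-layer header; 'ifindex' selects the egress device.
 */
#if 0	/* userspace example only */
	struct sockaddr_ll sll = {
		.sll_family  = AF_PACKET,
		.sll_ifindex = ifindex,
		.sll_halen   = ETH_ALEN,
	};

	sendto(fd, frame, frame_len, 0, (struct sockaddr *)&sll, sizeof(sll));
#endif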
1da177e4
LT
1541static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
1542 struct msghdr *msg, size_t len)
1543{
1544 struct sock *sk = sock->sk;
40d4e3df 1545 struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
1a35ca80 1546 struct sk_buff *skb = NULL;
1da177e4 1547 struct net_device *dev;
40d4e3df 1548 __be16 proto = 0;
1da177e4 1549 int err;
3bdc0eba 1550 int extra_len = 0;
1ce4f28b 1551
1da177e4 1552 /*
1ce4f28b 1553 * Get and verify the address.
1da177e4
LT
1554 */
1555
40d4e3df 1556 if (saddr) {
1da177e4 1557 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1558 return -EINVAL;
1559 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1560 proto = saddr->spkt_protocol;
1561 } else
1562 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1563
1564 /*
1ce4f28b 1565 * Find the device first to size check it
1da177e4
LT
1566 */
1567
de74e92a 1568 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1569retry:
654d1f8a
ED
1570 rcu_read_lock();
1571 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1572 err = -ENODEV;
1573 if (dev == NULL)
1574 goto out_unlock;
1ce4f28b 1575
d5e76b0a
DM
1576 err = -ENETDOWN;
1577 if (!(dev->flags & IFF_UP))
1578 goto out_unlock;
1579
1da177e4 1580 /*
40d4e3df
ED
1581 * You may not queue a frame bigger than the mtu. This is the lowest level
1582 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1583 */
1ce4f28b 1584
3bdc0eba
BG
1585 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1586 if (!netif_supports_nofcs(dev)) {
1587 err = -EPROTONOSUPPORT;
1588 goto out_unlock;
1589 }
1590 extra_len = 4; /* We're doing our own CRC */
1591 }
1592
1da177e4 1593 err = -EMSGSIZE;
3bdc0eba 1594 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1595 goto out_unlock;
1596
1a35ca80
ED
1597 if (!skb) {
1598 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1599 int tlen = dev->needed_tailroom;
1a35ca80
ED
1600 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1601
1602 rcu_read_unlock();
4ce40912 1603 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1604 if (skb == NULL)
1605 return -ENOBUFS;
1606 /* FIXME: Save some space for broken drivers that write a hard
1607 * header at transmission time by themselves. PPP is the notable
1608 * one here. This should really be fixed at the driver level.
1609 */
1610 skb_reserve(skb, reserved);
1611 skb_reset_network_header(skb);
1612
1613 /* Try to align data part correctly */
1614 if (hhlen) {
1615 skb->data -= hhlen;
1616 skb->tail -= hhlen;
1617 if (len < hhlen)
1618 skb_reset_network_header(skb);
1619 }
1620 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1621 if (err)
1622 goto out_free;
1623 goto retry;
1da177e4
LT
1624 }
1625
3bdc0eba 1626 if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
57f89bfa
BG
1627 /* Earlier code assumed this would be a VLAN pkt,
1628 * double-check this now that we have the actual
1629 * packet in hand.
1630 */
1631 struct ethhdr *ehdr;
1632 skb_reset_mac_header(skb);
1633 ehdr = eth_hdr(skb);
1634 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1635 err = -EMSGSIZE;
1636 goto out_unlock;
1637 }
1638 }
1a35ca80 1639
1da177e4
LT
1640 skb->protocol = proto;
1641 skb->dev = dev;
1642 skb->priority = sk->sk_priority;
2d37a186 1643 skb->mark = sk->sk_mark;
bf84a010
DB
1644
1645 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 1646
3bdc0eba
BG
1647 if (unlikely(extra_len == 4))
1648 skb->no_fcs = 1;
1649
40893fd0 1650 skb_probe_transport_header(skb, 0);
c1aad275 1651
1da177e4 1652 dev_queue_xmit(skb);
654d1f8a 1653 rcu_read_unlock();
40d4e3df 1654 return len;
1da177e4 1655
1da177e4 1656out_unlock:
654d1f8a 1657 rcu_read_unlock();
1a35ca80
ED
1658out_free:
1659 kfree_skb(skb);
1da177e4
LT
1660 return err;
1661}
1da177e4 1662
eea49cc9 1663static unsigned int run_filter(const struct sk_buff *skb,
62ab0812 1664 const struct sock *sk,
dbcb5855 1665 unsigned int res)
1da177e4
LT
1666{
1667 struct sk_filter *filter;
fda9ef5d 1668
80f8f102
ED
1669 rcu_read_lock();
1670 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1671 if (filter != NULL)
0a14842f 1672 res = SK_RUN_FILTER(filter, skb);
80f8f102 1673 rcu_read_unlock();
1da177e4 1674
dbcb5855 1675 return res;
1da177e4
LT
1676}
1677
1678/*
62ab0812
ED
 1679 * This function does lazy skb cloning in the hope that most packets
 1680 * are discarded by BPF.
 1681 *
 1682 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
 1683 * and skb->cb are mangled. It works because (and until) packets
 1684 * falling here are owned by the current CPU. Output packets are cloned
 1685 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 1686 * sequentially, so if we return the skb to its original state on exit,
 1687 * we will not harm anyone.
1da177e4
LT
1688 */
1689
40d4e3df
ED
1690static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1691 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1692{
1693 struct sock *sk;
1694 struct sockaddr_ll *sll;
1695 struct packet_sock *po;
40d4e3df 1696 u8 *skb_head = skb->data;
1da177e4 1697 int skb_len = skb->len;
dbcb5855 1698 unsigned int snaplen, res;
1da177e4
LT
1699
1700 if (skb->pkt_type == PACKET_LOOPBACK)
1701 goto drop;
1702
1703 sk = pt->af_packet_priv;
1704 po = pkt_sk(sk);
1705
09ad9bc7 1706 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1707 goto drop;
1708
1da177e4
LT
1709 skb->dev = dev;
1710
3b04ddde 1711 if (dev->header_ops) {
1da177e4 1712 /* The device has an explicit notion of ll header,
62ab0812
ED
1713 * exported to higher levels.
1714 *
1715 * Otherwise, the device hides details of its frame
 1716 * structure, so that the corresponding packet head is
 1717 * never delivered to the user.
1da177e4
LT
1718 */
1719 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1720 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1721 else if (skb->pkt_type == PACKET_OUTGOING) {
1722 /* Special case: outgoing packets have ll header at head */
bbe735e4 1723 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1724 }
1725 }
1726
1727 snaplen = skb->len;
1728
dbcb5855
DM
1729 res = run_filter(skb, sk, snaplen);
1730 if (!res)
fda9ef5d 1731 goto drop_n_restore;
dbcb5855
DM
1732 if (snaplen > res)
1733 snaplen = res;
1da177e4 1734
0fd7bac6 1735 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
1736 goto drop_n_acct;
1737
1738 if (skb_shared(skb)) {
1739 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1740 if (nskb == NULL)
1741 goto drop_n_acct;
1742
1743 if (skb_head != skb->data) {
1744 skb->data = skb_head;
1745 skb->len = skb_len;
1746 }
abc4e4fa 1747 consume_skb(skb);
1da177e4
LT
1748 skb = nskb;
1749 }
1750
ffbc6111
HX
1751 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
1752 sizeof(skb->cb));
1753
1754 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4
LT
1755 sll->sll_family = AF_PACKET;
1756 sll->sll_hatype = dev->type;
1757 sll->sll_protocol = skb->protocol;
1758 sll->sll_pkttype = skb->pkt_type;
8032b464 1759 if (unlikely(po->origdev))
80feaacb
PWJ
1760 sll->sll_ifindex = orig_dev->ifindex;
1761 else
1762 sll->sll_ifindex = dev->ifindex;
1da177e4 1763
b95cce35 1764 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 1765
ffbc6111 1766 PACKET_SKB_CB(skb)->origlen = skb->len;
8dc41944 1767
1da177e4
LT
1768 if (pskb_trim(skb, snaplen))
1769 goto drop_n_acct;
1770
1771 skb_set_owner_r(skb, sk);
1772 skb->dev = NULL;
adf30907 1773 skb_dst_drop(skb);
1da177e4 1774
84531c24
PO
1775 /* drop conntrack reference */
1776 nf_reset(skb);
1777
1da177e4 1778 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1779 po->stats.stats1.tp_packets++;
3b885787 1780 skb->dropcount = atomic_read(&sk->sk_drops);
1da177e4
LT
1781 __skb_queue_tail(&sk->sk_receive_queue, skb);
1782 spin_unlock(&sk->sk_receive_queue.lock);
1783 sk->sk_data_ready(sk, skb->len);
1784 return 0;
1785
1786drop_n_acct:
7091fbd8 1787 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1788 po->stats.stats1.tp_drops++;
7091fbd8
WB
1789 atomic_inc(&sk->sk_drops);
1790 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
1791
1792drop_n_restore:
1793 if (skb_head != skb->data && skb_shared(skb)) {
1794 skb->data = skb_head;
1795 skb->len = skb_len;
1796 }
1797drop:
ead2ceb0 1798 consume_skb(skb);
1da177e4
LT
1799 return 0;
1800}
1801
40d4e3df
ED
1802static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1803 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1804{
1805 struct sock *sk;
1806 struct packet_sock *po;
1807 struct sockaddr_ll *sll;
184f489e 1808 union tpacket_uhdr h;
40d4e3df 1809 u8 *skb_head = skb->data;
1da177e4 1810 int skb_len = skb->len;
dbcb5855 1811 unsigned int snaplen, res;
f6fb8f10 1812 unsigned long status = TP_STATUS_USER;
bbd6ef87 1813 unsigned short macoff, netoff, hdrlen;
1da177e4 1814 struct sk_buff *copy_skb = NULL;
bbd6ef87 1815 struct timespec ts;
b9c32fb2 1816 __u32 ts_status;
1da177e4 1817
51846355
AW
1818 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
 1819 * We may add members to them up to the current aligned size without forcing
1820 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
1821 */
1822 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
1823 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
1824
1da177e4
LT
1825 if (skb->pkt_type == PACKET_LOOPBACK)
1826 goto drop;
1827
1828 sk = pt->af_packet_priv;
1829 po = pkt_sk(sk);
1830
09ad9bc7 1831 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1832 goto drop;
1833
3b04ddde 1834 if (dev->header_ops) {
1da177e4 1835 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1836 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1837 else if (skb->pkt_type == PACKET_OUTGOING) {
1838 /* Special case: outgoing packets have ll header at head */
bbe735e4 1839 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1840 }
1841 }
1842
8dc41944
HX
1843 if (skb->ip_summed == CHECKSUM_PARTIAL)
1844 status |= TP_STATUS_CSUMNOTREADY;
1845
1da177e4
LT
1846 snaplen = skb->len;
1847
dbcb5855
DM
1848 res = run_filter(skb, sk, snaplen);
1849 if (!res)
fda9ef5d 1850 goto drop_n_restore;
dbcb5855
DM
1851 if (snaplen > res)
1852 snaplen = res;
1da177e4
LT
1853
1854 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
1855 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1856 po->tp_reserve;
1da177e4 1857 } else {
95c96174 1858 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 1859 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
1860 (maclen < 16 ? 16 : maclen)) +
1861 po->tp_reserve;
1da177e4
LT
1862 macoff = netoff - maclen;
1863 }
f6fb8f10 1864 if (po->tp_version <= TPACKET_V2) {
1865 if (macoff + snaplen > po->rx_ring.frame_size) {
1866 if (po->copy_thresh &&
0fd7bac6 1867 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 1868 if (skb_shared(skb)) {
1869 copy_skb = skb_clone(skb, GFP_ATOMIC);
1870 } else {
1871 copy_skb = skb_get(skb);
1872 skb_head = skb->data;
1873 }
1874 if (copy_skb)
1875 skb_set_owner_r(copy_skb, sk);
1da177e4 1876 }
f6fb8f10 1877 snaplen = po->rx_ring.frame_size - macoff;
1878 if ((int)snaplen < 0)
1879 snaplen = 0;
1da177e4 1880 }
1da177e4 1881 }
1da177e4 1882 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 1883 h.raw = packet_current_rx_frame(po, skb,
1884 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 1885 if (!h.raw)
1da177e4 1886 goto ring_is_full;
f6fb8f10 1887 if (po->tp_version <= TPACKET_V2) {
1888 packet_increment_rx_head(po, &po->rx_ring);
1889 /*
1890 * LOSING will be reported until you read the stats,
1891 * because the counter is COR - Clear On Read.
1892 * This is done for V1/V2 only, as V3 doesn't need it
1893 * at the packet level.
1894 */
ee80fbf3 1895 if (po->stats.stats1.tp_drops)
f6fb8f10 1896 status |= TP_STATUS_LOSING;
1897 }
ee80fbf3 1898 po->stats.stats1.tp_packets++;
1da177e4
LT
1899 if (copy_skb) {
1900 status |= TP_STATUS_COPY;
1901 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1902 }
1da177e4
LT
1903 spin_unlock(&sk->sk_receive_queue.lock);
1904
bbd6ef87 1905 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
1906
1907 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 1908 getnstimeofday(&ts);
1da177e4 1909
b9c32fb2
DB
1910 status |= ts_status;
1911
bbd6ef87
PM
1912 switch (po->tp_version) {
1913 case TPACKET_V1:
1914 h.h1->tp_len = skb->len;
1915 h.h1->tp_snaplen = snaplen;
1916 h.h1->tp_mac = macoff;
1917 h.h1->tp_net = netoff;
4b457bdf
DB
1918 h.h1->tp_sec = ts.tv_sec;
1919 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
1920 hdrlen = sizeof(*h.h1);
1921 break;
1922 case TPACKET_V2:
1923 h.h2->tp_len = skb->len;
1924 h.h2->tp_snaplen = snaplen;
1925 h.h2->tp_mac = macoff;
1926 h.h2->tp_net = netoff;
bbd6ef87
PM
1927 h.h2->tp_sec = ts.tv_sec;
1928 h.h2->tp_nsec = ts.tv_nsec;
a3bcc23e
BG
1929 if (vlan_tx_tag_present(skb)) {
1930 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
a0cdfcf3
AW
1931 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
1932 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
1933 } else {
1934 h.h2->tp_vlan_tci = 0;
a0cdfcf3 1935 h.h2->tp_vlan_tpid = 0;
a3bcc23e 1936 }
e4d26f4b 1937 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
1938 hdrlen = sizeof(*h.h2);
1939 break;
f6fb8f10 1940 case TPACKET_V3:
1941 /* tp_nxt_offset and vlan are already populated above,
1942 * so don't clear those fields here.
1943 */
1944 h.h3->tp_status |= status;
1945 h.h3->tp_len = skb->len;
1946 h.h3->tp_snaplen = snaplen;
1947 h.h3->tp_mac = macoff;
1948 h.h3->tp_net = netoff;
f6fb8f10 1949 h.h3->tp_sec = ts.tv_sec;
1950 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 1951 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 1952 hdrlen = sizeof(*h.h3);
1953 break;
bbd6ef87
PM
1954 default:
1955 BUG();
1956 }
1da177e4 1957
bbd6ef87 1958 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 1959 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
1960 sll->sll_family = AF_PACKET;
1961 sll->sll_hatype = dev->type;
1962 sll->sll_protocol = skb->protocol;
1963 sll->sll_pkttype = skb->pkt_type;
8032b464 1964 if (unlikely(po->origdev))
80feaacb
PWJ
1965 sll->sll_ifindex = orig_dev->ifindex;
1966 else
1967 sll->sll_ifindex = dev->ifindex;
1da177e4 1968
e16aa207 1969 smp_mb();
f6dafa95 1970#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
1da177e4 1971 {
0af55bb5
CG
1972 u8 *start, *end;
1973
f6fb8f10 1974 if (po->tp_version <= TPACKET_V2) {
1975 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
1976 + macoff + snaplen);
1977 for (start = h.raw; start < end; start += PAGE_SIZE)
1978 flush_dcache_page(pgv_to_page(start));
1979 }
cc9f01b2 1980 smp_wmb();
1da177e4 1981 }
f6dafa95 1982#endif
f6fb8f10 1983 if (po->tp_version <= TPACKET_V2)
1984 __packet_set_status(po, h.raw, status);
1985 else
1986 prb_clear_blk_fill_status(&po->rx_ring);
1da177e4
LT
1987
1988 sk->sk_data_ready(sk, 0);
1989
1990drop_n_restore:
1991 if (skb_head != skb->data && skb_shared(skb)) {
1992 skb->data = skb_head;
1993 skb->len = skb_len;
1994 }
1995drop:
1ce4f28b 1996 kfree_skb(skb);
1da177e4
LT
1997 return 0;
1998
1999ring_is_full:
ee80fbf3 2000 po->stats.stats1.tp_drops++;
1da177e4
LT
2001 spin_unlock(&sk->sk_receive_queue.lock);
2002
2003 sk->sk_data_ready(sk, 0);
acb5d75b 2004 kfree_skb(copy_skb);
1da177e4
LT
2005 goto drop_n_restore;
2006}
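
tpacket_rcv() above publishes each frame to user space by copying the packet at tp_mac and flipping tp_status to TP_STATUS_USER. Below is a minimal user-space sketch of the matching consumer loop for a TPACKET_V2 ring; the ring is assumed to be already configured and mmap()ed, and ring, frame_nr and frame_size are illustrative parameters.

#include <poll.h>
#include <linux/if_packet.h>

/* Consume frames that tpacket_rcv() marked TP_STATUS_USER and hand
 * each slot back to the kernel by resetting it to TP_STATUS_KERNEL. */
static void drain_rx_ring(int fd, char *ring,
			  unsigned int frame_nr, unsigned int frame_size)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	unsigned int head = 0;

	for (;;) {
		struct tpacket2_hdr *hdr =
			(struct tpacket2_hdr *)(ring + head * frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER)) {
			/* Nothing ready; block until the kernel fills a frame. */
			poll(&pfd, 1, -1);
			continue;
		}

		/* The packet data starts tp_mac bytes into the frame:
		 * process((char *)hdr + hdr->tp_mac, hdr->tp_snaplen); */

		hdr->tp_status = TP_STATUS_KERNEL;	/* return the slot */
		head = (head + 1) % frame_nr;
	}
}
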
2007
69e3c75f
JB
2008static void tpacket_destruct_skb(struct sk_buff *skb)
2009{
2010 struct packet_sock *po = pkt_sk(skb->sk);
40d4e3df 2011 void *ph;
1da177e4 2012
69e3c75f 2013 if (likely(po->tx_ring.pg_vec)) {
b9c32fb2
DB
2014 __u32 ts;
2015
69e3c75f 2016 ph = skb_shinfo(skb)->destructor_arg;
69e3c75f
JB
2017 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
2018 atomic_dec(&po->tx_ring.pending);
b9c32fb2
DB
2019
2020 ts = __packet_set_timestamp(po, ph, skb);
2021 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2022 }
2023
2024 sock_wfree(skb);
2025}
2026
40d4e3df
ED
2027static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2028 void *frame, struct net_device *dev, int size_max,
ae641949 2029 __be16 proto, unsigned char *addr, int hlen)
69e3c75f 2030{
184f489e 2031 union tpacket_uhdr ph;
09effa67 2032 int to_write, offset, len, tp_len, nr_frags, len_max;
69e3c75f
JB
2033 struct socket *sock = po->sk.sk_socket;
2034 struct page *page;
2035 void *data;
2036 int err;
2037
2038 ph.raw = frame;
2039
2040 skb->protocol = proto;
2041 skb->dev = dev;
2042 skb->priority = po->sk.sk_priority;
2d37a186 2043 skb->mark = po->sk.sk_mark;
2e31396f 2044 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
2045 skb_shinfo(skb)->destructor_arg = ph.raw;
2046
2047 switch (po->tp_version) {
2048 case TPACKET_V2:
2049 tp_len = ph.h2->tp_len;
2050 break;
2051 default:
2052 tp_len = ph.h1->tp_len;
2053 break;
2054 }
09effa67
DM
2055 if (unlikely(tp_len > size_max)) {
2056 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2057 return -EMSGSIZE;
2058 }
69e3c75f 2059
ae641949 2060 skb_reserve(skb, hlen);
69e3c75f 2061 skb_reset_network_header(skb);
c1aad275 2062
d346a3fa
DB
2063 if (!packet_use_direct_xmit(po))
2064 skb_probe_transport_header(skb, 0);
2065 if (unlikely(po->tp_tx_has_off)) {
5920cd3a
PC
2066 int off_min, off_max, off;
2067 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2068 off_max = po->tx_ring.frame_size - tp_len;
2069 if (sock->type == SOCK_DGRAM) {
2070 switch (po->tp_version) {
2071 case TPACKET_V2:
2072 off = ph.h2->tp_net;
2073 break;
2074 default:
2075 off = ph.h1->tp_net;
2076 break;
2077 }
2078 } else {
2079 switch (po->tp_version) {
2080 case TPACKET_V2:
2081 off = ph.h2->tp_mac;
2082 break;
2083 default:
2084 off = ph.h1->tp_mac;
2085 break;
2086 }
2087 }
2088 if (unlikely((off < off_min) || (off_max < off)))
2089 return -EINVAL;
2090 data = ph.raw + off;
2091 } else {
2092 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
2093 }
69e3c75f
JB
2094 to_write = tp_len;
2095
2096 if (sock->type == SOCK_DGRAM) {
2097 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2098 NULL, tp_len);
2099 if (unlikely(err < 0))
2100 return -EINVAL;
40d4e3df 2101 } else if (dev->hard_header_len) {
69e3c75f
JB
2102 /* net device doesn't like empty head */
2103 if (unlikely(tp_len <= dev->hard_header_len)) {
40d4e3df
ED
2104 pr_err("packet size is too short (%d < %d)\n",
2105 tp_len, dev->hard_header_len);
69e3c75f
JB
2106 return -EINVAL;
2107 }
2108
2109 skb_push(skb, dev->hard_header_len);
2110 err = skb_store_bits(skb, 0, data,
2111 dev->hard_header_len);
2112 if (unlikely(err))
2113 return err;
2114
2115 data += dev->hard_header_len;
2116 to_write -= dev->hard_header_len;
2117 }
2118
69e3c75f
JB
2119 offset = offset_in_page(data);
2120 len_max = PAGE_SIZE - offset;
2121 len = ((to_write > len_max) ? len_max : to_write);
2122
2123 skb->data_len = to_write;
2124 skb->len += to_write;
2125 skb->truesize += to_write;
2126 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2127
2128 while (likely(to_write)) {
2129 nr_frags = skb_shinfo(skb)->nr_frags;
2130
2131 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2132 pr_err("Packet exceeds the number of skb frags (%lu)\n",
2133 MAX_SKB_FRAGS);
69e3c75f
JB
2134 return -EFAULT;
2135 }
2136
0af55bb5
CG
2137 page = pgv_to_page(data);
2138 data += len;
69e3c75f
JB
2139 flush_dcache_page(page);
2140 get_page(page);
0af55bb5 2141 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2142 to_write -= len;
2143 offset = 0;
2144 len_max = PAGE_SIZE;
2145 len = ((to_write > len_max) ? len_max : to_write);
2146 }
2147
2148 return tp_len;
2149}
2150
2151static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2152{
69e3c75f
JB
2153 struct sk_buff *skb;
2154 struct net_device *dev;
2155 __be16 proto;
09effa67 2156 int err, reserve = 0;
40d4e3df
ED
2157 void *ph;
2158 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
87a2fd28 2159 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
69e3c75f
JB
2160 int tp_len, size_max;
2161 unsigned char *addr;
2162 int len_sum = 0;
9e67030a 2163 int status = TP_STATUS_AVAILABLE;
ae641949 2164 int hlen, tlen;
69e3c75f 2165
69e3c75f
JB
2166 mutex_lock(&po->pg_vec_lock);
2167
66e56cd4 2168 if (likely(saddr == NULL)) {
e40526cb 2169 dev = packet_cached_dev_get(po);
69e3c75f
JB
2170 proto = po->num;
2171 addr = NULL;
2172 } else {
2173 err = -EINVAL;
2174 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2175 goto out;
2176 if (msg->msg_namelen < (saddr->sll_halen
2177 + offsetof(struct sockaddr_ll,
2178 sll_addr)))
2179 goto out;
69e3c75f
JB
2180 proto = saddr->sll_protocol;
2181 addr = saddr->sll_addr;
827d9780 2182 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
69e3c75f
JB
2183 }
2184
69e3c75f
JB
2185 err = -ENXIO;
2186 if (unlikely(dev == NULL))
2187 goto out;
69e3c75f
JB
2188 err = -ENETDOWN;
2189 if (unlikely(!(dev->flags & IFF_UP)))
2190 goto out_put;
2191
e40526cb
DB
2192 reserve = dev->hard_header_len;
2193
69e3c75f 2194 size_max = po->tx_ring.frame_size
b5dd884e 2195 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2196
09effa67
DM
2197 if (size_max > dev->mtu + reserve)
2198 size_max = dev->mtu + reserve;
2199
69e3c75f
JB
2200 do {
2201 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2202 TP_STATUS_SEND_REQUEST);
69e3c75f 2203 if (unlikely(ph == NULL)) {
87a2fd28
DB
2204 if (need_wait && need_resched())
2205 schedule();
69e3c75f
JB
2206 continue;
2207 }
2208
2209 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2210 hlen = LL_RESERVED_SPACE(dev);
2211 tlen = dev->needed_tailroom;
69e3c75f 2212 skb = sock_alloc_send_skb(&po->sk,
ae641949 2213 hlen + tlen + sizeof(struct sockaddr_ll),
69e3c75f
JB
2214 0, &err);
2215
2216 if (unlikely(skb == NULL))
2217 goto out_status;
2218
2219 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
ae641949 2220 addr, hlen);
69e3c75f
JB
2221
2222 if (unlikely(tp_len < 0)) {
2223 if (po->tp_loss) {
2224 __packet_set_status(po, ph,
2225 TP_STATUS_AVAILABLE);
2226 packet_increment_head(&po->tx_ring);
2227 kfree_skb(skb);
2228 continue;
2229 } else {
2230 status = TP_STATUS_WRONG_FORMAT;
2231 err = tp_len;
2232 goto out_status;
2233 }
2234 }
2235
d346a3fa 2236 skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
69e3c75f
JB
2237 skb->destructor = tpacket_destruct_skb;
2238 __packet_set_status(po, ph, TP_STATUS_SENDING);
2239 atomic_inc(&po->tx_ring.pending);
2240
2241 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2242 err = po->xmit(skb);
eb70df13
JP
2243 if (unlikely(err > 0)) {
2244 err = net_xmit_errno(err);
2245 if (err && __packet_get_status(po, ph) ==
2246 TP_STATUS_AVAILABLE) {
2247 /* skb was destructed already */
2248 skb = NULL;
2249 goto out_status;
2250 }
2251 /*
2252 * skb was dropped but not destructed yet;
2253 * let's treat it like congestion or err < 0
2254 */
2255 err = 0;
2256 }
69e3c75f
JB
2257 packet_increment_head(&po->tx_ring);
2258 len_sum += tp_len;
87a2fd28
DB
2259 } while (likely((ph != NULL) || (need_wait &&
2260 atomic_read(&po->tx_ring.pending))));
69e3c75f
JB
2261
2262 err = len_sum;
2263 goto out_put;
2264
69e3c75f
JB
2265out_status:
2266 __packet_set_status(po, ph, status);
2267 kfree_skb(skb);
2268out_put:
e40526cb 2269 dev_put(dev);
69e3c75f
JB
2270out:
2271 mutex_unlock(&po->pg_vec_lock);
2272 return err;
2273}
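
The TX-ring side of this handshake is symmetric: user space fills a frame, marks it TP_STATUS_SEND_REQUEST and calls send(); tpacket_destruct_skb() later returns the slot as TP_STATUS_AVAILABLE. A hedged sketch for a TPACKET_V2 TX ring follows; the ring is assumed to be already configured and mmap()ed, and the parameter names are illustrative.

#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

/* Queue one pre-built frame into TX-ring slot 'slot' and kick the
 * kernel; the data offset mirrors tpacket_fill_skb() above
 * (tp_hdrlen minus the trailing sockaddr_ll). */
static int tx_one_frame(int fd, char *ring, unsigned int slot,
			unsigned int frame_size,
			const void *pkt, unsigned int len)
{
	struct tpacket2_hdr *hdr =
		(struct tpacket2_hdr *)(ring + slot * frame_size);

	if (hdr->tp_status != TP_STATUS_AVAILABLE)
		return -1;		/* slot still owned by the kernel */

	memcpy((char *)hdr + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll),
	       pkt, len);
	hdr->tp_len = len;
	hdr->tp_status = TP_STATUS_SEND_REQUEST;

	/* A plain send() drains every TP_STATUS_SEND_REQUEST frame. */
	return send(fd, NULL, 0, 0);
}
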
69e3c75f 2274
eea49cc9
OJ
2275static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2276 size_t reserve, size_t len,
2277 size_t linear, int noblock,
2278 int *err)
bfd5f4a3
SS
2279{
2280 struct sk_buff *skb;
2281
2282 /* Under a page? Don't bother with paged skb. */
2283 if (prepad + len < PAGE_SIZE || !linear)
2284 linear = len;
2285
2286 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2287 err, 0);
bfd5f4a3
SS
2288 if (!skb)
2289 return NULL;
2290
2291 skb_reserve(skb, reserve);
2292 skb_put(skb, linear);
2293 skb->data_len = len - linear;
2294 skb->len += len - linear;
2295
2296 return skb;
2297}
2298
d346a3fa 2299static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2300{
2301 struct sock *sk = sock->sk;
40d4e3df 2302 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1da177e4
LT
2303 struct sk_buff *skb;
2304 struct net_device *dev;
0e11c91e 2305 __be16 proto;
1da177e4 2306 unsigned char *addr;
827d9780 2307 int err, reserve = 0;
bfd5f4a3
SS
2308 struct virtio_net_hdr vnet_hdr = { 0 };
2309 int offset = 0;
2310 int vnet_hdr_len;
2311 struct packet_sock *po = pkt_sk(sk);
2312 unsigned short gso_type = 0;
ae641949 2313 int hlen, tlen;
3bdc0eba 2314 int extra_len = 0;
1da177e4
LT
2315
2316 /*
1ce4f28b 2317 * Get and verify the address.
1da177e4 2318 */
1ce4f28b 2319
66e56cd4 2320 if (likely(saddr == NULL)) {
e40526cb 2321 dev = packet_cached_dev_get(po);
1da177e4
LT
2322 proto = po->num;
2323 addr = NULL;
2324 } else {
2325 err = -EINVAL;
2326 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2327 goto out;
0fb375fb
EB
2328 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2329 goto out;
1da177e4
LT
2330 proto = saddr->sll_protocol;
2331 addr = saddr->sll_addr;
827d9780 2332 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
1da177e4
LT
2333 }
2334
1da177e4 2335 err = -ENXIO;
e40526cb 2336 if (unlikely(dev == NULL))
1da177e4 2337 goto out_unlock;
d5e76b0a 2338 err = -ENETDOWN;
e40526cb 2339 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2340 goto out_unlock;
2341
e40526cb
DB
2342 if (sock->type == SOCK_RAW)
2343 reserve = dev->hard_header_len;
bfd5f4a3
SS
2344 if (po->has_vnet_hdr) {
2345 vnet_hdr_len = sizeof(vnet_hdr);
2346
2347 err = -EINVAL;
2348 if (len < vnet_hdr_len)
2349 goto out_unlock;
2350
2351 len -= vnet_hdr_len;
2352
2353 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
2354 vnet_hdr_len);
2355 if (err < 0)
2356 goto out_unlock;
2357
2358 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2359 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
2360 vnet_hdr.hdr_len))
2361 vnet_hdr.hdr_len = vnet_hdr.csum_start +
2362 vnet_hdr.csum_offset + 2;
2363
2364 err = -EINVAL;
2365 if (vnet_hdr.hdr_len > len)
2366 goto out_unlock;
2367
2368 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2369 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2370 case VIRTIO_NET_HDR_GSO_TCPV4:
2371 gso_type = SKB_GSO_TCPV4;
2372 break;
2373 case VIRTIO_NET_HDR_GSO_TCPV6:
2374 gso_type = SKB_GSO_TCPV6;
2375 break;
2376 case VIRTIO_NET_HDR_GSO_UDP:
2377 gso_type = SKB_GSO_UDP;
2378 break;
2379 default:
2380 goto out_unlock;
2381 }
2382
2383 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2384 gso_type |= SKB_GSO_TCP_ECN;
2385
2386 if (vnet_hdr.gso_size == 0)
2387 goto out_unlock;
2388
2389 }
2390 }
2391
3bdc0eba
BG
2392 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2393 if (!netif_supports_nofcs(dev)) {
2394 err = -EPROTONOSUPPORT;
2395 goto out_unlock;
2396 }
2397 extra_len = 4; /* We're doing our own CRC */
2398 }
2399
1da177e4 2400 err = -EMSGSIZE;
3bdc0eba 2401 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2402 goto out_unlock;
2403
bfd5f4a3 2404 err = -ENOBUFS;
ae641949
HX
2405 hlen = LL_RESERVED_SPACE(dev);
2406 tlen = dev->needed_tailroom;
2407 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,
bfd5f4a3 2408 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2409 if (skb == NULL)
1da177e4
LT
2410 goto out_unlock;
2411
bfd5f4a3 2412 skb_set_network_header(skb, reserve);
1da177e4 2413
0c4e8581
SH
2414 err = -EINVAL;
2415 if (sock->type == SOCK_DGRAM &&
bfd5f4a3 2416 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
0c4e8581 2417 goto out_free;
1da177e4
LT
2418
2419 /* Returns -EFAULT on error */
bfd5f4a3 2420 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1da177e4
LT
2421 if (err)
2422 goto out_free;
bf84a010
DB
2423
2424 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 2425
3bdc0eba 2426 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
09effa67
DM
2427 /* Earlier code assumed this would be a VLAN pkt,
2428 * double-check this now that we have the actual
2429 * packet in hand.
2430 */
2431 struct ethhdr *ehdr;
2432 skb_reset_mac_header(skb);
2433 ehdr = eth_hdr(skb);
2434 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2435 err = -EMSGSIZE;
2436 goto out_free;
2437 }
57f89bfa
BG
2438 }
2439
09effa67
DM
2440 skb->protocol = proto;
2441 skb->dev = dev;
1da177e4 2442 skb->priority = sk->sk_priority;
2d37a186 2443 skb->mark = sk->sk_mark;
d346a3fa 2444 skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
1da177e4 2445
bfd5f4a3
SS
2446 if (po->has_vnet_hdr) {
2447 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2448 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
2449 vnet_hdr.csum_offset)) {
2450 err = -EINVAL;
2451 goto out_free;
2452 }
2453 }
2454
2455 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
2456 skb_shinfo(skb)->gso_type = gso_type;
2457
2458 /* Header must be checked, and gso_segs computed. */
2459 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2460 skb_shinfo(skb)->gso_segs = 0;
2461
2462 len += vnet_hdr_len;
2463 }
2464
d346a3fa
DB
2465 if (!packet_use_direct_xmit(po))
2466 skb_probe_transport_header(skb, reserve);
3bdc0eba
BG
2467 if (unlikely(extra_len == 4))
2468 skb->no_fcs = 1;
2469
d346a3fa 2470 err = po->xmit(skb);
1da177e4
LT
2471 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2472 goto out_unlock;
2473
e40526cb 2474 dev_put(dev);
1da177e4 2475
40d4e3df 2476 return len;
1da177e4
LT
2477
2478out_free:
2479 kfree_skb(skb);
2480out_unlock:
e40526cb 2481 if (dev)
1da177e4
LT
2482 dev_put(dev);
2483out:
2484 return err;
2485}
2486
69e3c75f
JB
2487static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2488 struct msghdr *msg, size_t len)
2489{
69e3c75f
JB
2490 struct sock *sk = sock->sk;
2491 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2492
69e3c75f
JB
2493 if (po->tx_ring.pg_vec)
2494 return tpacket_snd(po, msg);
2495 else
69e3c75f
JB
2496 return packet_snd(sock, msg, len);
2497}
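
Without a TX ring the call falls through to packet_snd(), which builds one skb per sendmsg(). A minimal sketch of that path from user space, transmitting an already assembled Ethernet frame on a SOCK_RAW socket; the ifindex and the frame contents are assumptions of the caller.

#include <stdint.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

/* Send one complete frame (link-layer header included) out of the
 * device identified by 'ifindex'; 'proto' is the ethertype in
 * network byte order, e.g. htons(ETH_P_IP). */
static ssize_t send_raw_frame(int fd, int ifindex, uint16_t proto,
			      const void *frame, size_t len)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_ifindex  = ifindex;
	sll.sll_protocol = proto;

	return sendto(fd, frame, len, 0,
		      (struct sockaddr *)&sll, sizeof(sll));
}
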
2498
1da177e4
LT
2499/*
2500 * Close a PACKET socket. This is fairly simple. We immediately go
2501 * to 'closed' state and remove our protocol entry in the device list.
2502 */
2503
2504static int packet_release(struct socket *sock)
2505{
2506 struct sock *sk = sock->sk;
2507 struct packet_sock *po;
d12d01d6 2508 struct net *net;
f6fb8f10 2509 union tpacket_req_u req_u;
1da177e4
LT
2510
2511 if (!sk)
2512 return 0;
2513
3b1e0a65 2514 net = sock_net(sk);
1da177e4
LT
2515 po = pkt_sk(sk);
2516
0fa7fa98 2517 mutex_lock(&net->packet.sklist_lock);
808f5114 2518 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2519 mutex_unlock(&net->packet.sklist_lock);
2520
2521 preempt_disable();
920de804 2522 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2523 preempt_enable();
1da177e4 2524
808f5114 2525 spin_lock(&po->bind_lock);
ce06b03e 2526 unregister_prot_hook(sk, false);
66e56cd4
DB
2527 packet_cached_dev_reset(po);
2528
160ff18a
BG
2529 if (po->prot_hook.dev) {
2530 dev_put(po->prot_hook.dev);
2531 po->prot_hook.dev = NULL;
2532 }
808f5114 2533 spin_unlock(&po->bind_lock);
1da177e4 2534
1da177e4 2535 packet_flush_mclist(sk);
1da177e4 2536
9665d5d6
PS
2537 if (po->rx_ring.pg_vec) {
2538 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2539 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2540 }
69e3c75f 2541
9665d5d6
PS
2542 if (po->tx_ring.pg_vec) {
2543 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2544 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2545 }
1da177e4 2546
dc99f600
DM
2547 fanout_release(sk);
2548
808f5114 2549 synchronize_net();
1da177e4
LT
2550 /*
2551 * Now the socket is dead. No more input will appear.
2552 */
1da177e4
LT
2553 sock_orphan(sk);
2554 sock->sk = NULL;
2555
2556 /* Purge queues */
2557
2558 skb_queue_purge(&sk->sk_receive_queue);
17ab56a2 2559 sk_refcnt_debug_release(sk);
1da177e4
LT
2560
2561 sock_put(sk);
2562 return 0;
2563}
2564
2565/*
2566 * Attach a packet hook.
2567 */
2568
902fefb8 2569static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
1da177e4
LT
2570{
2571 struct packet_sock *po = pkt_sk(sk);
902fefb8
DB
2572 const struct net_device *dev_curr;
2573 __be16 proto_curr;
2574 bool need_rehook;
dc99f600 2575
aef950b4
WY
2576 if (po->fanout) {
2577 if (dev)
2578 dev_put(dev);
2579
dc99f600 2580 return -EINVAL;
aef950b4 2581 }
1da177e4
LT
2582
2583 lock_sock(sk);
1da177e4 2584 spin_lock(&po->bind_lock);
66e56cd4 2585
902fefb8
DB
2586 proto_curr = po->prot_hook.type;
2587 dev_curr = po->prot_hook.dev;
2588
2589 need_rehook = proto_curr != proto || dev_curr != dev;
2590
2591 if (need_rehook) {
2592 unregister_prot_hook(sk, true);
1da177e4 2593
902fefb8
DB
2594 po->num = proto;
2595 po->prot_hook.type = proto;
1da177e4 2596
902fefb8
DB
2597 if (po->prot_hook.dev)
2598 dev_put(po->prot_hook.dev);
2599
2600 po->prot_hook.dev = dev;
2601
2602 po->ifindex = dev ? dev->ifindex : 0;
2603 packet_cached_dev_assign(po, dev);
2604 }
66e56cd4 2605
902fefb8 2606 if (proto == 0 || !need_rehook)
1da177e4
LT
2607 goto out_unlock;
2608
be85d4ad 2609 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2610 register_prot_hook(sk);
be85d4ad
UT
2611 } else {
2612 sk->sk_err = ENETDOWN;
2613 if (!sock_flag(sk, SOCK_DEAD))
2614 sk->sk_error_report(sk);
1da177e4
LT
2615 }
2616
2617out_unlock:
2618 spin_unlock(&po->bind_lock);
2619 release_sock(sk);
2620 return 0;
2621}
2622
2623/*
2624 * Bind a packet socket to a device
2625 */
2626
40d4e3df
ED
2627static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2628 int addr_len)
1da177e4 2629{
40d4e3df 2630 struct sock *sk = sock->sk;
1da177e4
LT
2631 char name[15];
2632 struct net_device *dev;
2633 int err = -ENODEV;
1ce4f28b 2634
1da177e4
LT
2635 /*
2636 * Check legality
2637 */
1ce4f28b 2638
8ae55f04 2639 if (addr_len != sizeof(struct sockaddr))
1da177e4 2640 return -EINVAL;
40d4e3df 2641 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2642
3b1e0a65 2643 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2644 if (dev)
1da177e4 2645 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2646 return err;
2647}
1da177e4
LT
2648
2649static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2650{
40d4e3df
ED
2651 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2652 struct sock *sk = sock->sk;
1da177e4
LT
2653 struct net_device *dev = NULL;
2654 int err;
2655
2656
2657 /*
2658 * Check legality
2659 */
1ce4f28b 2660
1da177e4
LT
2661 if (addr_len < sizeof(struct sockaddr_ll))
2662 return -EINVAL;
2663 if (sll->sll_family != AF_PACKET)
2664 return -EINVAL;
2665
2666 if (sll->sll_ifindex) {
2667 err = -ENODEV;
3b1e0a65 2668 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2669 if (dev == NULL)
2670 goto out;
2671 }
2672 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2673
2674out:
2675 return err;
2676}
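
From user space the bind shown above takes a struct sockaddr_ll naming the interface and protocol. A hedged sketch; if_nametoindex() resolves the device name, and an "eth0"-style name is an assumption of the caller.

#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <net/if.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>

/* Bind an AF_PACKET socket to one device and ethertype so that
 * packet_do_bind() re-registers the prot_hook for that ifindex. */
static int bind_to_device(int fd, const char *ifname, uint16_t ethertype)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ethertype);	/* 0 keeps the current protocol */
	sll.sll_ifindex  = (int)if_nametoindex(ifname);

	return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}
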
2677
2678static struct proto packet_proto = {
2679 .name = "PACKET",
2680 .owner = THIS_MODULE,
2681 .obj_size = sizeof(struct packet_sock),
2682};
2683
2684/*
1ce4f28b 2685 * Create a packet of type SOCK_PACKET.
1da177e4
LT
2686 */
2687
3f378b68
EP
2688static int packet_create(struct net *net, struct socket *sock, int protocol,
2689 int kern)
1da177e4
LT
2690{
2691 struct sock *sk;
2692 struct packet_sock *po;
0e11c91e 2693 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2694 int err;
2695
df008c91 2696 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 2697 return -EPERM;
be02097c
DM
2698 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2699 sock->type != SOCK_PACKET)
1da177e4
LT
2700 return -ESOCKTNOSUPPORT;
2701
2702 sock->state = SS_UNCONNECTED;
2703
2704 err = -ENOBUFS;
6257ff21 2705 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1da177e4
LT
2706 if (sk == NULL)
2707 goto out;
2708
2709 sock->ops = &packet_ops;
1da177e4
LT
2710 if (sock->type == SOCK_PACKET)
2711 sock->ops = &packet_ops_spkt;
be02097c 2712
1da177e4
LT
2713 sock_init_data(sock, sk);
2714
2715 po = pkt_sk(sk);
2716 sk->sk_family = PF_PACKET;
0e11c91e 2717 po->num = proto;
d346a3fa 2718 po->xmit = dev_queue_xmit;
66e56cd4
DB
2719
2720 packet_cached_dev_reset(po);
1da177e4
LT
2721
2722 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2723 sk_refcnt_debug_inc(sk);
1da177e4
LT
2724
2725 /*
2726 * Attach a protocol block
2727 */
2728
2729 spin_lock_init(&po->bind_lock);
905db440 2730 mutex_init(&po->pg_vec_lock);
1da177e4 2731 po->prot_hook.func = packet_rcv;
be02097c 2732
1da177e4
LT
2733 if (sock->type == SOCK_PACKET)
2734 po->prot_hook.func = packet_rcv_spkt;
be02097c 2735
1da177e4
LT
2736 po->prot_hook.af_packet_priv = sk;
2737
0e11c91e
AV
2738 if (proto) {
2739 po->prot_hook.type = proto;
ce06b03e 2740 register_prot_hook(sk);
1da177e4
LT
2741 }
2742
0fa7fa98 2743 mutex_lock(&net->packet.sklist_lock);
808f5114 2744 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
2745 mutex_unlock(&net->packet.sklist_lock);
2746
2747 preempt_disable();
3680453c 2748 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 2749 preempt_enable();
808f5114 2750
40d4e3df 2751 return 0;
1da177e4
LT
2752out:
2753 return err;
2754}
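
Creating such a socket from user space requires CAP_NET_RAW, as checked at the top of packet_create(). A minimal sketch:

#include <stdio.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>

int main(void)
{
	/* SOCK_RAW keeps the link-layer header in each frame;
	 * ETH_P_ALL asks for every protocol seen on the machine. */
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0) {
		perror("socket(AF_PACKET)");	/* EPERM without CAP_NET_RAW */
		return 1;
	}

	/* ... bind(), setsockopt(), recv() as sketched elsewhere ... */
	return 0;
}
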
2755
2756/*
2757 * Pull a packet from our receive queue and hand it to the user.
2758 * If necessary we block.
2759 */
2760
2761static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2762 struct msghdr *msg, size_t len, int flags)
2763{
2764 struct sock *sk = sock->sk;
2765 struct sk_buff *skb;
2766 int copied, err;
bfd5f4a3 2767 int vnet_hdr_len = 0;
1da177e4
LT
2768
2769 err = -EINVAL;
ed85b565 2770 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2771 goto out;
2772
2773#if 0
2774 /* What error should we return now? EUNATTACH? */
2775 if (pkt_sk(sk)->ifindex < 0)
2776 return -ENODEV;
2777#endif
2778
ed85b565 2779 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
2780 err = sock_recv_errqueue(sk, msg, len,
2781 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
2782 goto out;
2783 }
2784
1da177e4
LT
2785 /*
2786 * Call the generic datagram receiver. This handles all sorts
2787 * of horrible races and re-entrancy so we can forget about it
2788 * in the protocol layers.
2789 *
2790 * Now it will return ENETDOWN if the device has just gone down,
2791 * but then it will block.
2792 */
2793
40d4e3df 2794 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
2795
2796 /*
1ce4f28b 2797 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
2798 * handles the blocking for us, we don't need to see or worry about
2799 * blocking retries.
2800 */
2801
8ae55f04 2802 if (skb == NULL)
1da177e4
LT
2803 goto out;
2804
bfd5f4a3
SS
2805 if (pkt_sk(sk)->has_vnet_hdr) {
2806 struct virtio_net_hdr vnet_hdr = { 0 };
2807
2808 err = -EINVAL;
2809 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 2810 if (len < vnet_hdr_len)
bfd5f4a3
SS
2811 goto out_free;
2812
1f18b717
MK
2813 len -= vnet_hdr_len;
2814
bfd5f4a3
SS
2815 if (skb_is_gso(skb)) {
2816 struct skb_shared_info *sinfo = skb_shinfo(skb);
2817
2818 /* This is a hint as to how much should be linear. */
2819 vnet_hdr.hdr_len = skb_headlen(skb);
2820 vnet_hdr.gso_size = sinfo->gso_size;
2821 if (sinfo->gso_type & SKB_GSO_TCPV4)
2822 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2823 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2824 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2825 else if (sinfo->gso_type & SKB_GSO_UDP)
2826 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2827 else if (sinfo->gso_type & SKB_GSO_FCOE)
2828 goto out_free;
2829 else
2830 BUG();
2831 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2832 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2833 } else
2834 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2835
2836 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2837 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
55508d60 2838 vnet_hdr.csum_start = skb_checksum_start_offset(skb);
bfd5f4a3 2839 vnet_hdr.csum_offset = skb->csum_offset;
10a8d94a
JW
2840 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2841 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
2842 } /* else everything is zero */
2843
2844 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
2845 vnet_hdr_len);
2846 if (err < 0)
2847 goto out_free;
2848 }
2849
f3d33426
HFS
2850 /* You lose any data beyond the buffer you gave. If it worries
2851 * a user program, it can ask the device for its MTU
2852 * anyway.
1da177e4 2853 */
1da177e4 2854 copied = skb->len;
40d4e3df
ED
2855 if (copied > len) {
2856 copied = len;
2857 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
2858 }
2859
2860 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2861 if (err)
2862 goto out_free;
2863
3b885787 2864 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 2865
f3d33426
HFS
2866 if (msg->msg_name) {
2867 /* If the address length field is there to be filled
2868 * in, we fill it in now.
2869 */
2870 if (sock->type == SOCK_PACKET) {
2871 msg->msg_namelen = sizeof(struct sockaddr_pkt);
2872 } else {
2873 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2874 msg->msg_namelen = sll->sll_halen +
2875 offsetof(struct sockaddr_ll, sll_addr);
2876 }
ffbc6111
HX
2877 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
2878 msg->msg_namelen);
f3d33426 2879 }
1da177e4 2880
8dc41944 2881 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
2882 struct tpacket_auxdata aux;
2883
2884 aux.tp_status = TP_STATUS_USER;
2885 if (skb->ip_summed == CHECKSUM_PARTIAL)
2886 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
2887 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
2888 aux.tp_snaplen = skb->len;
2889 aux.tp_mac = 0;
bbe735e4 2890 aux.tp_net = skb_network_offset(skb);
a3bcc23e
BG
2891 if (vlan_tx_tag_present(skb)) {
2892 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
a0cdfcf3
AW
2893 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
2894 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2895 } else {
2896 aux.tp_vlan_tci = 0;
a0cdfcf3 2897 aux.tp_vlan_tpid = 0;
a3bcc23e 2898 }
ffbc6111 2899 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
2900 }
2901
1da177e4
LT
2902 /*
2903 * Free or return the buffer as appropriate. Again this
2904 * hides all the races and re-entrancy issues from us.
2905 */
bfd5f4a3 2906 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
2907
2908out_free:
2909 skb_free_datagram(sk, skb);
2910out:
2911 return err;
2912}
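
When PACKET_AUXDATA is enabled, the receive path above attaches a struct tpacket_auxdata control message to every packet. A hedged sketch of reading it; the socket is assumed to have had the option enabled with setsockopt() beforehand.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/if_packet.h>

/* Receive one packet and print the SOL_PACKET/PACKET_AUXDATA cmsg
 * that packet_recvmsg() filled in (original length, snaplen, VLAN). */
static ssize_t recv_with_auxdata(int fd, void *buf, size_t len)
{
	char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg;
	ssize_t n = recvmsg(fd, &msg, 0);

	if (n < 0)
		return n;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_PACKET &&
		    cmsg->cmsg_type == PACKET_AUXDATA) {
			struct tpacket_auxdata aux;

			memcpy(&aux, CMSG_DATA(cmsg), sizeof(aux));
			printf("len=%u snaplen=%u vlan_tci=%u\n",
			       aux.tp_len, aux.tp_snaplen, aux.tp_vlan_tci);
		}
	}
	return n;
}
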
2913
1da177e4
LT
2914static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
2915 int *uaddr_len, int peer)
2916{
2917 struct net_device *dev;
2918 struct sock *sk = sock->sk;
2919
2920 if (peer)
2921 return -EOPNOTSUPP;
2922
2923 uaddr->sa_family = AF_PACKET;
2dc85bf3 2924 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
2925 rcu_read_lock();
2926 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
2927 if (dev)
2dc85bf3 2928 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 2929 rcu_read_unlock();
1da177e4
LT
2930 *uaddr_len = sizeof(*uaddr);
2931
2932 return 0;
2933}
1da177e4
LT
2934
2935static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
2936 int *uaddr_len, int peer)
2937{
2938 struct net_device *dev;
2939 struct sock *sk = sock->sk;
2940 struct packet_sock *po = pkt_sk(sk);
13cfa97b 2941 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
2942
2943 if (peer)
2944 return -EOPNOTSUPP;
2945
2946 sll->sll_family = AF_PACKET;
2947 sll->sll_ifindex = po->ifindex;
2948 sll->sll_protocol = po->num;
67286640 2949 sll->sll_pkttype = 0;
654d1f8a
ED
2950 rcu_read_lock();
2951 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
2952 if (dev) {
2953 sll->sll_hatype = dev->type;
2954 sll->sll_halen = dev->addr_len;
2955 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
2956 } else {
2957 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
2958 sll->sll_halen = 0;
2959 }
654d1f8a 2960 rcu_read_unlock();
0fb375fb 2961 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
2962
2963 return 0;
2964}
2965
2aeb0b88
WC
2966static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
2967 int what)
1da177e4
LT
2968{
2969 switch (i->type) {
2970 case PACKET_MR_MULTICAST:
1162563f
JP
2971 if (i->alen != dev->addr_len)
2972 return -EINVAL;
1da177e4 2973 if (what > 0)
22bedad3 2974 return dev_mc_add(dev, i->addr);
1da177e4 2975 else
22bedad3 2976 return dev_mc_del(dev, i->addr);
1da177e4
LT
2977 break;
2978 case PACKET_MR_PROMISC:
2aeb0b88 2979 return dev_set_promiscuity(dev, what);
1da177e4
LT
2980 break;
2981 case PACKET_MR_ALLMULTI:
2aeb0b88 2982 return dev_set_allmulti(dev, what);
1da177e4 2983 break;
d95ed927 2984 case PACKET_MR_UNICAST:
1162563f
JP
2985 if (i->alen != dev->addr_len)
2986 return -EINVAL;
d95ed927 2987 if (what > 0)
a748ee24 2988 return dev_uc_add(dev, i->addr);
d95ed927 2989 else
a748ee24 2990 return dev_uc_del(dev, i->addr);
d95ed927 2991 break;
40d4e3df
ED
2992 default:
2993 break;
1da177e4 2994 }
2aeb0b88 2995 return 0;
1da177e4
LT
2996}
2997
2998static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
2999{
40d4e3df 3000 for ( ; i; i = i->next) {
1da177e4
LT
3001 if (i->ifindex == dev->ifindex)
3002 packet_dev_mc(dev, i, what);
3003 }
3004}
3005
0fb375fb 3006static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3007{
3008 struct packet_sock *po = pkt_sk(sk);
3009 struct packet_mclist *ml, *i;
3010 struct net_device *dev;
3011 int err;
3012
3013 rtnl_lock();
3014
3015 err = -ENODEV;
3b1e0a65 3016 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3017 if (!dev)
3018 goto done;
3019
3020 err = -EINVAL;
1162563f 3021 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3022 goto done;
3023
3024 err = -ENOBUFS;
8b3a7005 3025 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3026 if (i == NULL)
3027 goto done;
3028
3029 err = 0;
3030 for (ml = po->mclist; ml; ml = ml->next) {
3031 if (ml->ifindex == mreq->mr_ifindex &&
3032 ml->type == mreq->mr_type &&
3033 ml->alen == mreq->mr_alen &&
3034 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3035 ml->count++;
3036 /* Free the new element ... */
3037 kfree(i);
3038 goto done;
3039 }
3040 }
3041
3042 i->type = mreq->mr_type;
3043 i->ifindex = mreq->mr_ifindex;
3044 i->alen = mreq->mr_alen;
3045 memcpy(i->addr, mreq->mr_address, i->alen);
3046 i->count = 1;
3047 i->next = po->mclist;
3048 po->mclist = i;
2aeb0b88
WC
3049 err = packet_dev_mc(dev, i, 1);
3050 if (err) {
3051 po->mclist = i->next;
3052 kfree(i);
3053 }
1da177e4
LT
3054
3055done:
3056 rtnl_unlock();
3057 return err;
3058}
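
packet_mc_add() is reached through the PACKET_ADD_MEMBERSHIP option handled further down. A minimal sketch of using it to put a device into promiscuous mode; the ifindex is an assumption, and the reference is dropped again on close via packet_flush_mclist().

#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

/* Bump the device's promiscuity count through packet_dev_mc();
 * the membership is removed automatically when the socket closes. */
static int enable_promisc(int fd, int ifindex)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = ifindex;
	mreq.mr_type    = PACKET_MR_PROMISC;

	return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
			  &mreq, sizeof(mreq));
}
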
3059
0fb375fb 3060static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3061{
3062 struct packet_mclist *ml, **mlp;
3063
3064 rtnl_lock();
3065
3066 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3067 if (ml->ifindex == mreq->mr_ifindex &&
3068 ml->type == mreq->mr_type &&
3069 ml->alen == mreq->mr_alen &&
3070 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3071 if (--ml->count == 0) {
3072 struct net_device *dev;
3073 *mlp = ml->next;
ad959e76
ED
3074 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3075 if (dev)
1da177e4 3076 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3077 kfree(ml);
3078 }
3079 rtnl_unlock();
3080 return 0;
3081 }
3082 }
3083 rtnl_unlock();
3084 return -EADDRNOTAVAIL;
3085}
3086
3087static void packet_flush_mclist(struct sock *sk)
3088{
3089 struct packet_sock *po = pkt_sk(sk);
3090 struct packet_mclist *ml;
3091
3092 if (!po->mclist)
3093 return;
3094
3095 rtnl_lock();
3096 while ((ml = po->mclist) != NULL) {
3097 struct net_device *dev;
3098
3099 po->mclist = ml->next;
ad959e76
ED
3100 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3101 if (dev != NULL)
1da177e4 3102 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3103 kfree(ml);
3104 }
3105 rtnl_unlock();
3106}
1da177e4
LT
3107
3108static int
b7058842 3109packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3110{
3111 struct sock *sk = sock->sk;
8dc41944 3112 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3113 int ret;
3114
3115 if (level != SOL_PACKET)
3116 return -ENOPROTOOPT;
3117
69e3c75f 3118 switch (optname) {
1ce4f28b 3119 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3120 case PACKET_DROP_MEMBERSHIP:
3121 {
0fb375fb
EB
3122 struct packet_mreq_max mreq;
3123 int len = optlen;
3124 memset(&mreq, 0, sizeof(mreq));
3125 if (len < sizeof(struct packet_mreq))
1da177e4 3126 return -EINVAL;
0fb375fb
EB
3127 if (len > sizeof(mreq))
3128 len = sizeof(mreq);
40d4e3df 3129 if (copy_from_user(&mreq, optval, len))
1da177e4 3130 return -EFAULT;
0fb375fb
EB
3131 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3132 return -EINVAL;
1da177e4
LT
3133 if (optname == PACKET_ADD_MEMBERSHIP)
3134 ret = packet_mc_add(sk, &mreq);
3135 else
3136 ret = packet_mc_drop(sk, &mreq);
3137 return ret;
3138 }
a2efcfa0 3139
1da177e4 3140 case PACKET_RX_RING:
69e3c75f 3141 case PACKET_TX_RING:
1da177e4 3142 {
f6fb8f10 3143 union tpacket_req_u req_u;
3144 int len;
1da177e4 3145
f6fb8f10 3146 switch (po->tp_version) {
3147 case TPACKET_V1:
3148 case TPACKET_V2:
3149 len = sizeof(req_u.req);
3150 break;
3151 case TPACKET_V3:
3152 default:
3153 len = sizeof(req_u.req3);
3154 break;
3155 }
3156 if (optlen < len)
1da177e4 3157 return -EINVAL;
bfd5f4a3
SS
3158 if (pkt_sk(sk)->has_vnet_hdr)
3159 return -EINVAL;
f6fb8f10 3160 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3161 return -EFAULT;
f6fb8f10 3162 return packet_set_ring(sk, &req_u, 0,
3163 optname == PACKET_TX_RING);
1da177e4
LT
3164 }
3165 case PACKET_COPY_THRESH:
3166 {
3167 int val;
3168
40d4e3df 3169 if (optlen != sizeof(val))
1da177e4 3170 return -EINVAL;
40d4e3df 3171 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3172 return -EFAULT;
3173
3174 pkt_sk(sk)->copy_thresh = val;
3175 return 0;
3176 }
bbd6ef87
PM
3177 case PACKET_VERSION:
3178 {
3179 int val;
3180
3181 if (optlen != sizeof(val))
3182 return -EINVAL;
69e3c75f 3183 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3184 return -EBUSY;
3185 if (copy_from_user(&val, optval, sizeof(val)))
3186 return -EFAULT;
3187 switch (val) {
3188 case TPACKET_V1:
3189 case TPACKET_V2:
f6fb8f10 3190 case TPACKET_V3:
bbd6ef87
PM
3191 po->tp_version = val;
3192 return 0;
3193 default:
3194 return -EINVAL;
3195 }
3196 }
8913336a
PM
3197 case PACKET_RESERVE:
3198 {
3199 unsigned int val;
3200
3201 if (optlen != sizeof(val))
3202 return -EINVAL;
69e3c75f 3203 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3204 return -EBUSY;
3205 if (copy_from_user(&val, optval, sizeof(val)))
3206 return -EFAULT;
3207 po->tp_reserve = val;
3208 return 0;
3209 }
69e3c75f
JB
3210 case PACKET_LOSS:
3211 {
3212 unsigned int val;
3213
3214 if (optlen != sizeof(val))
3215 return -EINVAL;
3216 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3217 return -EBUSY;
3218 if (copy_from_user(&val, optval, sizeof(val)))
3219 return -EFAULT;
3220 po->tp_loss = !!val;
3221 return 0;
3222 }
8dc41944
HX
3223 case PACKET_AUXDATA:
3224 {
3225 int val;
3226
3227 if (optlen < sizeof(val))
3228 return -EINVAL;
3229 if (copy_from_user(&val, optval, sizeof(val)))
3230 return -EFAULT;
3231
3232 po->auxdata = !!val;
3233 return 0;
3234 }
80feaacb
PWJ
3235 case PACKET_ORIGDEV:
3236 {
3237 int val;
3238
3239 if (optlen < sizeof(val))
3240 return -EINVAL;
3241 if (copy_from_user(&val, optval, sizeof(val)))
3242 return -EFAULT;
3243
3244 po->origdev = !!val;
3245 return 0;
3246 }
bfd5f4a3
SS
3247 case PACKET_VNET_HDR:
3248 {
3249 int val;
3250
3251 if (sock->type != SOCK_RAW)
3252 return -EINVAL;
3253 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3254 return -EBUSY;
3255 if (optlen < sizeof(val))
3256 return -EINVAL;
3257 if (copy_from_user(&val, optval, sizeof(val)))
3258 return -EFAULT;
3259
3260 po->has_vnet_hdr = !!val;
3261 return 0;
3262 }
614f60fa
SM
3263 case PACKET_TIMESTAMP:
3264 {
3265 int val;
3266
3267 if (optlen != sizeof(val))
3268 return -EINVAL;
3269 if (copy_from_user(&val, optval, sizeof(val)))
3270 return -EFAULT;
3271
3272 po->tp_tstamp = val;
3273 return 0;
3274 }
dc99f600
DM
3275 case PACKET_FANOUT:
3276 {
3277 int val;
3278
3279 if (optlen != sizeof(val))
3280 return -EINVAL;
3281 if (copy_from_user(&val, optval, sizeof(val)))
3282 return -EFAULT;
3283
3284 return fanout_add(sk, val & 0xffff, val >> 16);
3285 }
5920cd3a
PC
3286 case PACKET_TX_HAS_OFF:
3287 {
3288 unsigned int val;
3289
3290 if (optlen != sizeof(val))
3291 return -EINVAL;
3292 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3293 return -EBUSY;
3294 if (copy_from_user(&val, optval, sizeof(val)))
3295 return -EFAULT;
3296 po->tp_tx_has_off = !!val;
3297 return 0;
3298 }
d346a3fa
DB
3299 case PACKET_QDISC_BYPASS:
3300 {
3301 int val;
3302
3303 if (optlen != sizeof(val))
3304 return -EINVAL;
3305 if (copy_from_user(&val, optval, sizeof(val)))
3306 return -EFAULT;
3307
3308 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3309 return 0;
3310 }
1da177e4
LT
3311 default:
3312 return -ENOPROTOOPT;
3313 }
3314}
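
The ring options above depend on the TPACKET version being chosen first, since PACKET_VERSION is refused once a ring exists and the ring geometry is validated against that version's header length. A hedged sketch of the usual ordering for a TPACKET_V2 RX ring; the block and frame sizes are illustrative and assume 4 KiB pages.

#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

/* Select TPACKET_V2, then describe the RX ring; packet_set_ring()
 * insists that frames tile the blocks exactly. */
static int setup_rx_ring_v2(int fd)
{
	int version = TPACKET_V2;
	struct tpacket_req req;

	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
		       &version, sizeof(version)) < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.tp_block_size = 4096;	/* multiple of the page size */
	req.tp_frame_size = 2048;	/* multiple of TPACKET_ALIGNMENT */
	req.tp_block_nr   = 64;
	req.tp_frame_nr   = req.tp_block_nr *
			    (req.tp_block_size / req.tp_frame_size);

	return setsockopt(fd, SOL_PACKET, PACKET_RX_RING,
			  &req, sizeof(req));
}
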
3315
3316static int packet_getsockopt(struct socket *sock, int level, int optname,
3317 char __user *optval, int __user *optlen)
3318{
3319 int len;
c06fff6e 3320 int val, lv = sizeof(val);
1da177e4
LT
3321 struct sock *sk = sock->sk;
3322 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3323 void *data = &val;
ee80fbf3 3324 union tpacket_stats_u st;
1da177e4
LT
3325
3326 if (level != SOL_PACKET)
3327 return -ENOPROTOOPT;
3328
8ae55f04
KK
3329 if (get_user(len, optlen))
3330 return -EFAULT;
1da177e4
LT
3331
3332 if (len < 0)
3333 return -EINVAL;
1ce4f28b 3334
69e3c75f 3335 switch (optname) {
1da177e4 3336 case PACKET_STATISTICS:
1da177e4 3337 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3338 memcpy(&st, &po->stats, sizeof(st));
3339 memset(&po->stats, 0, sizeof(po->stats));
3340 spin_unlock_bh(&sk->sk_receive_queue.lock);
3341
f6fb8f10 3342 if (po->tp_version == TPACKET_V3) {
c06fff6e 3343 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3344 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3345 data = &st.stats3;
f6fb8f10 3346 } else {
c06fff6e 3347 lv = sizeof(struct tpacket_stats);
8bcdeaff 3348 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3349 data = &st.stats1;
f6fb8f10 3350 }
ee80fbf3 3351
8dc41944
HX
3352 break;
3353 case PACKET_AUXDATA:
8dc41944 3354 val = po->auxdata;
80feaacb
PWJ
3355 break;
3356 case PACKET_ORIGDEV:
80feaacb 3357 val = po->origdev;
bfd5f4a3
SS
3358 break;
3359 case PACKET_VNET_HDR:
bfd5f4a3 3360 val = po->has_vnet_hdr;
1da177e4 3361 break;
bbd6ef87 3362 case PACKET_VERSION:
bbd6ef87 3363 val = po->tp_version;
bbd6ef87
PM
3364 break;
3365 case PACKET_HDRLEN:
3366 if (len > sizeof(int))
3367 len = sizeof(int);
3368 if (copy_from_user(&val, optval, len))
3369 return -EFAULT;
3370 switch (val) {
3371 case TPACKET_V1:
3372 val = sizeof(struct tpacket_hdr);
3373 break;
3374 case TPACKET_V2:
3375 val = sizeof(struct tpacket2_hdr);
3376 break;
f6fb8f10 3377 case TPACKET_V3:
3378 val = sizeof(struct tpacket3_hdr);
3379 break;
bbd6ef87
PM
3380 default:
3381 return -EINVAL;
3382 }
bbd6ef87 3383 break;
8913336a 3384 case PACKET_RESERVE:
8913336a 3385 val = po->tp_reserve;
8913336a 3386 break;
69e3c75f 3387 case PACKET_LOSS:
69e3c75f 3388 val = po->tp_loss;
69e3c75f 3389 break;
614f60fa 3390 case PACKET_TIMESTAMP:
614f60fa 3391 val = po->tp_tstamp;
614f60fa 3392 break;
dc99f600 3393 case PACKET_FANOUT:
dc99f600
DM
3394 val = (po->fanout ?
3395 ((u32)po->fanout->id |
77f65ebd
WB
3396 ((u32)po->fanout->type << 16) |
3397 ((u32)po->fanout->flags << 24)) :
dc99f600 3398 0);
dc99f600 3399 break;
5920cd3a
PC
3400 case PACKET_TX_HAS_OFF:
3401 val = po->tp_tx_has_off;
3402 break;
d346a3fa
DB
3403 case PACKET_QDISC_BYPASS:
3404 val = packet_use_direct_xmit(po);
3405 break;
1da177e4
LT
3406 default:
3407 return -ENOPROTOOPT;
3408 }
3409
c06fff6e
ED
3410 if (len > lv)
3411 len = lv;
8ae55f04
KK
3412 if (put_user(len, optlen))
3413 return -EFAULT;
8dc41944
HX
3414 if (copy_to_user(optval, data, len))
3415 return -EFAULT;
8ae55f04 3416 return 0;
1da177e4
LT
3417}
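
The PACKET_STATISTICS branch above copies the counters out and zeroes them, folding drops into tp_packets on the way. A minimal sketch for a TPACKET_V1/V2 socket; a V3 socket returns the larger struct tpacket_stats_v3 instead.

#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

/* Read and implicitly reset the receive counters of an AF_PACKET socket. */
static int print_rx_stats(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) < 0)
		return -1;

	printf("received %u, dropped %u\n", st.tp_packets, st.tp_drops);
	return 0;
}
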
3418
3419
351638e7
JP
3420static int packet_notifier(struct notifier_block *this,
3421 unsigned long msg, void *ptr)
1da177e4
LT
3422{
3423 struct sock *sk;
351638e7 3424 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 3425 struct net *net = dev_net(dev);
1da177e4 3426
808f5114 3427 rcu_read_lock();
b67bfe0d 3428 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3429 struct packet_sock *po = pkt_sk(sk);
3430
3431 switch (msg) {
3432 case NETDEV_UNREGISTER:
1da177e4
LT
3433 if (po->mclist)
3434 packet_dev_mclist(dev, po->mclist, -1);
a2efcfa0
DM
3435 /* fallthrough */
3436
1da177e4
LT
3437 case NETDEV_DOWN:
3438 if (dev->ifindex == po->ifindex) {
3439 spin_lock(&po->bind_lock);
3440 if (po->running) {
ce06b03e 3441 __unregister_prot_hook(sk, false);
1da177e4
LT
3442 sk->sk_err = ENETDOWN;
3443 if (!sock_flag(sk, SOCK_DEAD))
3444 sk->sk_error_report(sk);
3445 }
3446 if (msg == NETDEV_UNREGISTER) {
66e56cd4 3447 packet_cached_dev_reset(po);
1da177e4 3448 po->ifindex = -1;
160ff18a
BG
3449 if (po->prot_hook.dev)
3450 dev_put(po->prot_hook.dev);
1da177e4
LT
3451 po->prot_hook.dev = NULL;
3452 }
3453 spin_unlock(&po->bind_lock);
3454 }
3455 break;
3456 case NETDEV_UP:
808f5114 3457 if (dev->ifindex == po->ifindex) {
3458 spin_lock(&po->bind_lock);
ce06b03e
DM
3459 if (po->num)
3460 register_prot_hook(sk);
808f5114 3461 spin_unlock(&po->bind_lock);
1da177e4 3462 }
1da177e4
LT
3463 break;
3464 }
3465 }
808f5114 3466 rcu_read_unlock();
1da177e4
LT
3467 return NOTIFY_DONE;
3468}
3469
3470
3471static int packet_ioctl(struct socket *sock, unsigned int cmd,
3472 unsigned long arg)
3473{
3474 struct sock *sk = sock->sk;
3475
69e3c75f 3476 switch (cmd) {
40d4e3df
ED
3477 case SIOCOUTQ:
3478 {
3479 int amount = sk_wmem_alloc_get(sk);
31e6d363 3480
40d4e3df
ED
3481 return put_user(amount, (int __user *)arg);
3482 }
3483 case SIOCINQ:
3484 {
3485 struct sk_buff *skb;
3486 int amount = 0;
3487
3488 spin_lock_bh(&sk->sk_receive_queue.lock);
3489 skb = skb_peek(&sk->sk_receive_queue);
3490 if (skb)
3491 amount = skb->len;
3492 spin_unlock_bh(&sk->sk_receive_queue.lock);
3493 return put_user(amount, (int __user *)arg);
3494 }
3495 case SIOCGSTAMP:
3496 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3497 case SIOCGSTAMPNS:
3498 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3499
1da177e4 3500#ifdef CONFIG_INET
40d4e3df
ED
3501 case SIOCADDRT:
3502 case SIOCDELRT:
3503 case SIOCDARP:
3504 case SIOCGARP:
3505 case SIOCSARP:
3506 case SIOCGIFADDR:
3507 case SIOCSIFADDR:
3508 case SIOCGIFBRDADDR:
3509 case SIOCSIFBRDADDR:
3510 case SIOCGIFNETMASK:
3511 case SIOCSIFNETMASK:
3512 case SIOCGIFDSTADDR:
3513 case SIOCSIFDSTADDR:
3514 case SIOCSIFFLAGS:
40d4e3df 3515 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3516#endif
3517
40d4e3df
ED
3518 default:
3519 return -ENOIOCTLCMD;
1da177e4
LT
3520 }
3521 return 0;
3522}
3523
40d4e3df 3524static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3525 poll_table *wait)
3526{
3527 struct sock *sk = sock->sk;
3528 struct packet_sock *po = pkt_sk(sk);
3529 unsigned int mask = datagram_poll(file, sock, wait);
3530
3531 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3532 if (po->rx_ring.pg_vec) {
f6fb8f10 3533 if (!packet_previous_rx_frame(po, &po->rx_ring,
3534 TP_STATUS_KERNEL))
1da177e4
LT
3535 mask |= POLLIN | POLLRDNORM;
3536 }
3537 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3538 spin_lock_bh(&sk->sk_write_queue.lock);
3539 if (po->tx_ring.pg_vec) {
3540 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3541 mask |= POLLOUT | POLLWRNORM;
3542 }
3543 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3544 return mask;
3545}
3546
3547
3548/* Dirty? Well, I still haven't learned a better way to account
3549 * for user mmaps.
3550 */
3551
3552static void packet_mm_open(struct vm_area_struct *vma)
3553{
3554 struct file *file = vma->vm_file;
40d4e3df 3555 struct socket *sock = file->private_data;
1da177e4 3556 struct sock *sk = sock->sk;
1ce4f28b 3557
1da177e4
LT
3558 if (sk)
3559 atomic_inc(&pkt_sk(sk)->mapped);
3560}
3561
3562static void packet_mm_close(struct vm_area_struct *vma)
3563{
3564 struct file *file = vma->vm_file;
40d4e3df 3565 struct socket *sock = file->private_data;
1da177e4 3566 struct sock *sk = sock->sk;
1ce4f28b 3567
1da177e4
LT
3568 if (sk)
3569 atomic_dec(&pkt_sk(sk)->mapped);
3570}
3571
f0f37e2f 3572static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3573 .open = packet_mm_open,
3574 .close = packet_mm_close,
1da177e4
LT
3575};
3576
0e3125c7
NH
3577static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3578 unsigned int len)
1da177e4
LT
3579{
3580 int i;
3581
4ebf0ae2 3582 for (i = 0; i < len; i++) {
0e3125c7 3583 if (likely(pg_vec[i].buffer)) {
c56b4d90 3584 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3585 vfree(pg_vec[i].buffer);
3586 else
3587 free_pages((unsigned long)pg_vec[i].buffer,
3588 order);
3589 pg_vec[i].buffer = NULL;
3590 }
1da177e4
LT
3591 }
3592 kfree(pg_vec);
3593}
3594
eea49cc9 3595static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3596{
0e3125c7
NH
3597 char *buffer = NULL;
3598 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3599 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3600
3601 buffer = (char *) __get_free_pages(gfp_flags, order);
3602
3603 if (buffer)
3604 return buffer;
3605
3606 /*
3607 * __get_free_pages failed, fall back to vmalloc
3608 */
bbce5a59 3609 buffer = vzalloc((1 << order) * PAGE_SIZE);
719bfeaa 3610
0e3125c7
NH
3611 if (buffer)
3612 return buffer;
3613
3614 /*
3615 * vmalloc failed, let's dig into swap here
3616 */
0e3125c7
NH
3617 gfp_flags &= ~__GFP_NORETRY;
3618 buffer = (char *)__get_free_pages(gfp_flags, order);
3619 if (buffer)
3620 return buffer;
3621
3622 /*
3623 * complete and utter failure
3624 */
3625 return NULL;
4ebf0ae2
DM
3626}
3627
0e3125c7 3628static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3629{
3630 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3631 struct pgv *pg_vec;
4ebf0ae2
DM
3632 int i;
3633
0e3125c7 3634 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3635 if (unlikely(!pg_vec))
3636 goto out;
3637
3638 for (i = 0; i < block_nr; i++) {
c56b4d90 3639 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3640 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3641 goto out_free_pgvec;
3642 }
3643
3644out:
3645 return pg_vec;
3646
3647out_free_pgvec:
3648 free_pg_vec(pg_vec, order, block_nr);
3649 pg_vec = NULL;
3650 goto out;
3651}
1da177e4 3652
f6fb8f10 3653static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 3654 int closing, int tx_ring)
1da177e4 3655{
0e3125c7 3656 struct pgv *pg_vec = NULL;
1da177e4 3657 struct packet_sock *po = pkt_sk(sk);
0e11c91e 3658 int was_running, order = 0;
69e3c75f
JB
3659 struct packet_ring_buffer *rb;
3660 struct sk_buff_head *rb_queue;
0e11c91e 3661 __be16 num;
f6fb8f10 3662 int err = -EINVAL;
3663 /* Added to avoid minimal code churn */
3664 struct tpacket_req *req = &req_u->req;
3665
3666 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3667 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3668 WARN(1, "Tx-ring is not supported.\n");
3669 goto out;
3670 }
1ce4f28b 3671
69e3c75f
JB
3672 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3673 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 3674
69e3c75f
JB
3675 err = -EBUSY;
3676 if (!closing) {
3677 if (atomic_read(&po->mapped))
3678 goto out;
3679 if (atomic_read(&rb->pending))
3680 goto out;
3681 }
1da177e4 3682
69e3c75f
JB
3683 if (req->tp_block_nr) {
3684 /* Sanity tests and some calculations */
3685 err = -EBUSY;
3686 if (unlikely(rb->pg_vec))
3687 goto out;
1da177e4 3688
bbd6ef87
PM
3689 switch (po->tp_version) {
3690 case TPACKET_V1:
3691 po->tp_hdrlen = TPACKET_HDRLEN;
3692 break;
3693 case TPACKET_V2:
3694 po->tp_hdrlen = TPACKET2_HDRLEN;
3695 break;
f6fb8f10 3696 case TPACKET_V3:
3697 po->tp_hdrlen = TPACKET3_HDRLEN;
3698 break;
bbd6ef87
PM
3699 }
3700
69e3c75f 3701 err = -EINVAL;
4ebf0ae2 3702 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 3703 goto out;
4ebf0ae2 3704 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 3705 goto out;
8913336a 3706 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
3707 po->tp_reserve))
3708 goto out;
4ebf0ae2 3709 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 3710 goto out;
1da177e4 3711
69e3c75f
JB
3712 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3713 if (unlikely(rb->frames_per_block <= 0))
3714 goto out;
3715 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3716 req->tp_frame_nr))
3717 goto out;
1da177e4
LT
3718
3719 err = -ENOMEM;
4ebf0ae2
DM
3720 order = get_order(req->tp_block_size);
3721 pg_vec = alloc_pg_vec(req, order);
3722 if (unlikely(!pg_vec))
1da177e4 3723 goto out;
f6fb8f10 3724 switch (po->tp_version) {
3725 case TPACKET_V3:
3726 /* Transmit path is not supported. We checked
3727 * it above but just being paranoid
3728 */
3729 if (!tx_ring)
3730 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
3731 break;
3732 default:
3733 break;
3734 }
69e3c75f
JB
3735 }
3736 /* Done */
3737 else {
3738 err = -EINVAL;
4ebf0ae2 3739 if (unlikely(req->tp_frame_nr))
69e3c75f 3740 goto out;
1da177e4
LT
3741 }
3742
3743 lock_sock(sk);
3744
3745 /* Detach socket from network */
3746 spin_lock(&po->bind_lock);
3747 was_running = po->running;
3748 num = po->num;
3749 if (was_running) {
1da177e4 3750 po->num = 0;
ce06b03e 3751 __unregister_prot_hook(sk, false);
1da177e4
LT
3752 }
3753 spin_unlock(&po->bind_lock);
1ce4f28b 3754
1da177e4
LT
3755 synchronize_net();
3756
3757 err = -EBUSY;
905db440 3758 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
3759 if (closing || atomic_read(&po->mapped) == 0) {
3760 err = 0;
69e3c75f 3761 spin_lock_bh(&rb_queue->lock);
c053fd96 3762 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
3763 rb->frame_max = (req->tp_frame_nr - 1);
3764 rb->head = 0;
3765 rb->frame_size = req->tp_frame_size;
3766 spin_unlock_bh(&rb_queue->lock);
3767
c053fd96 CG 3768		swap(rb->pg_vec_order, order);
3769 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f JB 3770
3771 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
3772 po->prot_hook.func = (po->rx_ring.pg_vec) ?
3773 tpacket_rcv : packet_rcv;
3774 skb_queue_purge(rb_queue);
1da177e4 3775 if (atomic_read(&po->mapped))
40d4e3df ED 3776			pr_err("packet_mmap: vma is busy: %d\n",
3777 atomic_read(&po->mapped));
1da177e4 3778 }
905db440 3779 mutex_unlock(&po->pg_vec_lock);
1da177e4 LT 3780
3781 spin_lock(&po->bind_lock);
ce06b03e 3782 if (was_running) {
1da177e4 3783 po->num = num;
ce06b03e 3784 register_prot_hook(sk);
1da177e4 LT 3785	}
3786 spin_unlock(&po->bind_lock);
f6fb8f10 3787 if (closing && (po->tp_version > TPACKET_V2)) {
3788 /* Because we don't support block-based V3 on tx-ring */
3789 if (!tx_ring)
3790 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3791 }
1da177e4 LT 3792	release_sock(sk);
3793
1da177e4 LT 3794	if (pg_vec)
3795 free_pg_vec(pg_vec, order, req->tp_block_nr);
3796out:
3797 return err;
3798}
3799
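The function above is reached from packet_setsockopt() for PACKET_RX_RING and PACKET_TX_RING and enforces the ring geometry tested in the sanity block: tp_block_size must be a positive multiple of PAGE_SIZE, tp_frame_size must be at least tp_hdrlen + tp_reserve and TPACKET_ALIGNMENT-aligned, and tp_frame_nr must equal frames_per_block * tp_block_nr. The following is a minimal userspace sketch that satisfies those checks for a TPACKET_V2 receive ring; it is not part of this file, it needs CAP_NET_RAW, and the block/frame sizes are arbitrary assumptions (a 4 KiB PAGE_SIZE is assumed).

/* --- userspace example, not part of af_packet.c --- */
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	int ver = TPACKET_V2;
	struct tpacket_req req = {
		.tp_block_size = 4096,               /* page-aligned */
		.tp_block_nr   = 64,
		.tp_frame_size = 2048,               /* aligned, >= tp_hdrlen + tp_reserve */
		.tp_frame_nr   = 64 * (4096 / 2048), /* frames_per_block * tp_block_nr */
	};

	if (fd < 0)
		return 1;
	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)) < 0)
		perror("PACKET_VERSION");
	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)) < 0)
		perror("PACKET_RX_RING");            /* handled by packet_set_ring() */
	close(fd);
	return 0;
}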
69e3c75f JB 3800static int packet_mmap(struct file *file, struct socket *sock,
3801 struct vm_area_struct *vma)
1da177e4 LT 3802{
3803 struct sock *sk = sock->sk;
3804 struct packet_sock *po = pkt_sk(sk);
69e3c75f JB 3805	unsigned long size, expected_size;
3806 struct packet_ring_buffer *rb;
1da177e4 LT 3807	unsigned long start;
3808 int err = -EINVAL;
3809 int i;
3810
3811 if (vma->vm_pgoff)
3812 return -EINVAL;
3813
905db440 3814 mutex_lock(&po->pg_vec_lock);
69e3c75f JB 3815
3816 expected_size = 0;
3817 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3818 if (rb->pg_vec) {
3819 expected_size += rb->pg_vec_len
3820 * rb->pg_vec_pages
3821 * PAGE_SIZE;
3822 }
3823 }
3824
3825 if (expected_size == 0)
1da177e4 3826 goto out;
69e3c75f JB 3827
3828 size = vma->vm_end - vma->vm_start;
3829 if (size != expected_size)
1da177e4 LT 3830		goto out;
3831
1da177e4 3832 start = vma->vm_start;
69e3c75f JB 3833	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3834 if (rb->pg_vec == NULL)
3835 continue;
3836
3837 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7 NH 3838			struct page *page;
3839 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f JB 3840			int pg_num;
3841
c56b4d90 CG 3842			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
3843 page = pgv_to_page(kaddr);
69e3c75f JB 3844				err = vm_insert_page(vma, start, page);
3845 if (unlikely(err))
3846 goto out;
3847 start += PAGE_SIZE;
0e3125c7 3848 kaddr += PAGE_SIZE;
69e3c75f 3849 }
4ebf0ae2 3850 }
1da177e4 3851 }
69e3c75f 3852
4ebf0ae2 3853 atomic_inc(&po->mapped);
1da177e4 LT 3854	vma->vm_ops = &packet_mmap_ops;
3855 err = 0;
3856
3857out:
905db440 3858 mutex_unlock(&po->pg_vec_lock);
1da177e4 LT 3859	return err;
3860}
1da177e4 3861
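packet_mmap() above maps the receive ring followed by the transmit ring as one contiguous area: vm_pgoff must be zero and the mapping length must equal the combined size of all configured blocks. Below is a hedged userspace continuation of the sketch after packet_set_ring(); fd and req are the names assumed there, and with TPACKET_V2 each frame slot begins with a struct tpacket2_hdr whose tp_status field passes ownership between the kernel and the application.

/* --- userspace example, not part of af_packet.c; pairs with the ring set up above --- */
#include <linux/if_packet.h>
#include <sys/mman.h>
#include <poll.h>
#include <stddef.h>

/* Map the RX ring configured via PACKET_RX_RING and consume one frame (TPACKET_V2). */
static int read_one_frame(int fd, const struct tpacket_req *req)
{
	size_t len = (size_t)req->tp_block_nr * req->tp_block_size;
	void *ring = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	struct tpacket2_hdr *hdr;
	struct pollfd pfd = { .fd = fd, .events = POLLIN };

	if (ring == MAP_FAILED)
		return -1;                   /* packet_mmap() rejected the size or offset */

	hdr = ring;                          /* first frame of the first block */
	while (!(hdr->tp_status & TP_STATUS_USER))
		poll(&pfd, 1, -1);           /* block until the kernel fills the slot */
	/* packet data starts at (char *)hdr + hdr->tp_mac, length hdr->tp_snaplen */
	hdr->tp_status = TP_STATUS_KERNEL;   /* hand the slot back to tpacket_rcv() */
	return 0;
}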
90ddc4f0 3862static const struct proto_ops packet_ops_spkt = {
1da177e4 LT 3863	.family =	PF_PACKET,
3864 .owner = THIS_MODULE,
3865 .release = packet_release,
3866 .bind = packet_bind_spkt,
3867 .connect = sock_no_connect,
3868 .socketpair = sock_no_socketpair,
3869 .accept = sock_no_accept,
3870 .getname = packet_getname_spkt,
3871 .poll = datagram_poll,
3872 .ioctl = packet_ioctl,
3873 .listen = sock_no_listen,
3874 .shutdown = sock_no_shutdown,
3875 .setsockopt = sock_no_setsockopt,
3876 .getsockopt = sock_no_getsockopt,
3877 .sendmsg = packet_sendmsg_spkt,
3878 .recvmsg = packet_recvmsg,
3879 .mmap = sock_no_mmap,
3880 .sendpage = sock_no_sendpage,
3881};
1da177e4 3882
90ddc4f0 3883static const struct proto_ops packet_ops = {
1da177e4 LT 3884	.family =	PF_PACKET,
3885 .owner = THIS_MODULE,
3886 .release = packet_release,
3887 .bind = packet_bind,
3888 .connect = sock_no_connect,
3889 .socketpair = sock_no_socketpair,
3890 .accept = sock_no_accept,
1ce4f28b 3891 .getname = packet_getname,
1da177e4 LT 3892	.poll =		packet_poll,
3893 .ioctl = packet_ioctl,
3894 .listen = sock_no_listen,
3895 .shutdown = sock_no_shutdown,
3896 .setsockopt = packet_setsockopt,
3897 .getsockopt = packet_getsockopt,
3898 .sendmsg = packet_sendmsg,
3899 .recvmsg = packet_recvmsg,
3900 .mmap = packet_mmap,
3901 .sendpage = sock_no_sendpage,
3902};
3903
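Two operations tables are defined above because packet_create(), earlier in this file, installs packet_ops_spkt only for the legacy SOCK_PACKET socket type, which is why that table falls back to sock_no_mmap and sock_no_setsockopt, while SOCK_RAW and SOCK_DGRAM packet sockets get packet_ops with packet_mmap, packet_setsockopt and the rest. A small userspace sketch of binding such a socket to one interface through packet_bind() follows; the interface name "eth0" is only an assumption.

/* --- userspace example, not part of af_packet.c --- */
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>

/* Bind an AF_PACKET/SOCK_RAW socket (served by packet_ops) to a single interface. */
static int bind_to_iface(int fd, const char *ifname)
{
	struct sockaddr_ll ll;

	memset(&ll, 0, sizeof(ll));
	ll.sll_family   = AF_PACKET;
	ll.sll_protocol = htons(ETH_P_ALL);
	ll.sll_ifindex  = if_nametoindex(ifname);             /* 0 would mean "all interfaces" */
	return bind(fd, (struct sockaddr *)&ll, sizeof(ll));  /* -> packet_bind() */
}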
ec1b4cf7 3904static const struct net_proto_family packet_family_ops = {
1da177e4 LT 3905	.family =	PF_PACKET,
3906 .create = packet_create,
3907 .owner = THIS_MODULE,
3908};
3909
3910static struct notifier_block packet_netdev_notifier = {
40d4e3df 3911 .notifier_call = packet_notifier,
1da177e4 LT 3912};
3913
3914#ifdef CONFIG_PROC_FS
1da177e4 LT 3915
3916static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 3917 __acquires(RCU)
1da177e4 3918{
e372c414 3919 struct net *net = seq_file_net(seq);
808f5114 3920
3921 rcu_read_lock();
3922 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4 LT 3923}
3924
3925static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3926{
1bf40954 3927 struct net *net = seq_file_net(seq);
808f5114 3928 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4 LT 3929}
3930
3931static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 3932 __releases(RCU)
1da177e4 3933{
808f5114 3934 rcu_read_unlock();
1da177e4 LT 3935}
3936
1ce4f28b 3937static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4 LT 3938{
3939 if (v == SEQ_START_TOKEN)
3940 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
3941 else {
b7ceabd9 3942 struct sock *s = sk_entry(v);
1da177e4 LT 3943		const struct packet_sock *po = pkt_sk(s);
3944
3945 seq_printf(seq,
71338aa7 3946 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4 LT 3947			   s,
3948 atomic_read(&s->sk_refcnt),
3949 s->sk_type,
3950 ntohs(po->num),
3951 po->ifindex,
3952 po->running,
3953 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 3954 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 3955 sock_i_ino(s));
1da177e4 LT 3956	}
3957
3958 return 0;
3959}
3960
56b3d975 3961static const struct seq_operations packet_seq_ops = {
1da177e4 LT 3962	.start	= packet_seq_start,
3963 .next = packet_seq_next,
3964 .stop = packet_seq_stop,
3965 .show = packet_seq_show,
3966};
3967
3968static int packet_seq_open(struct inode *inode, struct file *file)
3969{
e372c414 DL 3970	return seq_open_net(inode, file, &packet_seq_ops,
3971 sizeof(struct seq_net_private));
1da177e4 LT 3972}
3973
da7071d7 3974static const struct file_operations packet_seq_fops = {
1da177e4 LT 3975	.owner		= THIS_MODULE,
3976 .open = packet_seq_open,
3977 .read = seq_read,
3978 .llseek = seq_lseek,
e372c414 3979 .release = seq_release_net,
1da177e4 LT 3980};
3981
3982#endif
3983
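The seq_file hooks above back /proc/net/packet: packet_seq_show() prints the header line followed by one row per packet socket with its address, refcount, type, protocol, interface index, running flag, receive-queue memory, owning UID and inode. A trivial userspace reader, included only to illustrate where that output surfaces:

/* --- userspace example, not part of af_packet.c --- */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/packet", "r");   /* entry created by packet_net_init() below */

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))        /* each row comes from packet_seq_show() */
		fputs(line, stdout);
	fclose(f);
	return 0;
}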
2c8c1e72 3984static int __net_init packet_net_init(struct net *net)
d12d01d6 3985{
0fa7fa98 3986 mutex_init(&net->packet.sklist_lock);
2aaef4e4 3987 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 3988
d4beaa66 3989 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
d12d01d6 DL 3990		return -ENOMEM;
3991
3992 return 0;
3993}
3994
2c8c1e72 3995static void __net_exit packet_net_exit(struct net *net)
d12d01d6 3996{
ece31ffd 3997 remove_proc_entry("packet", net->proc_net);
d12d01d6 DL 3998}
3999
4000static struct pernet_operations packet_net_ops = {
4001 .init = packet_net_init,
4002 .exit = packet_net_exit,
4003};
4004
4005
1da177e4 LT 4006static void __exit packet_exit(void)
4007{
1da177e4 4008 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4009 unregister_pernet_subsys(&packet_net_ops);
1da177e4 LT 4010	sock_unregister(PF_PACKET);
4011 proto_unregister(&packet_proto);
4012}
4013
4014static int __init packet_init(void)
4015{
4016 int rc = proto_register(&packet_proto, 0);
4017
4018 if (rc != 0)
4019 goto out;
4020
4021 sock_register(&packet_family_ops);
d12d01d6 4022 register_pernet_subsys(&packet_net_ops);
1da177e4 4023 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4 LT 4024out:
4025 return rc;
4026}
4027
4028module_init(packet_init);
4029module_exit(packet_exit);
4030MODULE_LICENSE("GPL");
4031MODULE_ALIAS_NETPROTO(PF_PACKET);