/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
 *					layer.
 *					Copyright (C) 2011, <lokec@ccs.neu.edu>
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

#include "internal.h"

/*
   Assumptions:
   - If the device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside of
     the device, but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit into the reserved space (tunnels); others are not (PPP).
   - A packet socket receives packets with the ll header already pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header != NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header == NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
		 header. PPP does this, which is wrong, because it introduces
		 asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header == NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Resume
  If dev->hard_header == NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by the device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position;
   the packet classifier depends on it.
 */
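
/* Illustrative user-space sketch of the receive-side layout described above
 * (an assumption for documentation purposes only, not part of this file):
 * with SOCK_RAW the link-layer header is present at the start of the
 * received buffer, with SOCK_DGRAM it has already been removed.
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	unsigned char buf[2048];
 *	struct sockaddr_ll sll;
 *	socklen_t slen = sizeof(sll);
 *	ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
 *			     (struct sockaddr *)&sll, &slen);
 *	// On Ethernet, buf[0..13] is the ll (Ethernet) header here;
 *	// with SOCK_DGRAM the payload would start at buf[0] instead.
 */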

/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

union tpacket_uhdr {
	struct tpacket_hdr  *h1;
	struct tpacket2_hdr *h2;
	struct tpacket3_hdr *h3;
	void *raw;
};

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring);

#define V3_ALIGNMENT	(8)

#define BLK_HDR_LEN	(ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

#define BLK_PLUS_PRIV(sz_of_priv) \
	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

#define PGV_FROM_VMALLOC 1

#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
			struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
			struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
		struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
		struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(unsigned long);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_init_blk_timer(struct packet_sock *,
		struct tpacket_kbdq_core *,
		void (*func) (unsigned long));
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
		struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);

struct packet_skb_cb {
	unsigned int origlen;
	union {
		struct sockaddr_pkt pkt;
		struct sockaddr_ll ll;
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
#define GET_PBLOCK_DESC(x, bid)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
#define GET_NEXT_PRB_BLK_NUM(x) \
	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
	((x)->kactive_blk_num+1) : 0)

static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

/* register_prot_hook must be invoked with the po->bind_lock held,
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 */
static void register_prot_hook(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	if (!po->running) {
		if (po->fanout)
			__fanout_link(sk, po);
		else
			dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	}
}

/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
 * held.  If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;
	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);
	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();
		spin_lock(&po->bind_lock);
	}
}

static void unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	if (po->running)
		__unregister_prot_hook(sk, sync);
}

static inline __pure struct page *pgv_to_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union tpacket_uhdr h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union tpacket_uhdr h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
		return 0;
	}
}
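
/* Ring-frame status handoff as seen from user space (a hedged, illustrative
 * sketch, not part of this file): for TPACKET_V1/V2 each mmap()ed frame
 * starts with a tpacket header whose tp_status field is flipped between
 * kernel and user ownership by the two helpers above.
 *
 *	struct tpacket2_hdr *hdr = frame;	// frame from the mmap()ed ring
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);
 *	// ... consume hdr->tp_mac / hdr->tp_snaplen ...
 *	hdr->tp_status = TP_STATUS_KERNEL;	// hand the frame back
 */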

static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
				   unsigned int flags)
{
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (shhwtstamps) {
		if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) &&
		    ktime_to_timespec_cond(shhwtstamps->syststamp, ts))
			return TP_STATUS_TS_SYS_HARDWARE;
		if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
		    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
			return TP_STATUS_TS_RAW_HARDWARE;
	}

	if (ktime_to_timespec_cond(skb->tstamp, ts))
		return TP_STATUS_TS_SOFTWARE;

	return 0;
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
				    struct sk_buff *skb)
{
	union tpacket_uhdr h;
	struct timespec ts;
	__u32 ts_status;

	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
		return 0;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_sec = ts.tv_sec;
		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
		break;
	case TPACKET_V2:
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		break;
	case TPACKET_V3:
	default:
		WARN(1, "TPACKET version not supported.\n");
		BUG();
	}

	/* one flush is safe, as both fields always lie on the same cacheline */
	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
	smp_wmb();

	return ts_status;
}
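
/* Example of requesting ring timestamps from user space (a sketch under the
 * assumption that the NIC supports hardware timestamping; not part of this
 * file).  The TP_STATUS_TS_* bit returned above and OR'ed into tp_status
 * reports which clock actually supplied tp_sec/tp_nsec:
 *
 *	int req = SOF_TIMESTAMPING_RAW_HARDWARE | SOF_TIMESTAMPING_SOFTWARE;
 *	setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, &req, sizeof(req));
 *	...
 *	if (hdr->tp_status & TP_STATUS_TS_RAW_HARDWARE)
 *		;	// tp_sec/tp_nsec came from the NIC clock
 *	else if (hdr->tp_status & TP_STATUS_TS_SOFTWARE)
 *		;	// software timestamp taken by the stack
 */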

static void *packet_lookup_frame(struct packet_sock *po,
				 struct packet_ring_buffer *rb,
				 unsigned int position,
				 int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union tpacket_uhdr h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	del_timer_sync(&pkc->retire_blk_timer);
}

427static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
428 int tx_ring,
429 struct sk_buff_head *rb_queue)
430{
bc59ba39 431 struct tpacket_kbdq_core *pkc;
f6fb8f10 432
433 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
434
435 spin_lock(&rb_queue->lock);
436 pkc->delete_blk_timer = 1;
437 spin_unlock(&rb_queue->lock);
438
439 prb_del_retire_blk_timer(pkc);
440}
441
442static void prb_init_blk_timer(struct packet_sock *po,
bc59ba39 443 struct tpacket_kbdq_core *pkc,
f6fb8f10 444 void (*func) (unsigned long))
445{
446 init_timer(&pkc->retire_blk_timer);
447 pkc->retire_blk_timer.data = (long)po;
448 pkc->retire_blk_timer.function = func;
449 pkc->retire_blk_timer.expires = jiffies;
450}
451
452static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
453{
bc59ba39 454 struct tpacket_kbdq_core *pkc;
f6fb8f10 455
456 if (tx_ring)
457 BUG();
458
459 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
460 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
461}
462
static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				int blk_size_in_bytes)
{
	struct net_device *dev;
	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
	struct ethtool_cmd ecmd;
	int err;
	u32 speed;

	rtnl_lock();
	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
	if (unlikely(!dev)) {
		rtnl_unlock();
		return DEFAULT_PRB_RETIRE_TOV;
	}
	err = __ethtool_get_settings(dev, &ecmd);
	speed = ethtool_cmd_speed(&ecmd);
	rtnl_unlock();
	if (!err) {
		/*
		 * If the link speed is so slow you don't really
		 * need to worry about perf anyways
		 */
		if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
			return DEFAULT_PRB_RETIRE_TOV;
		} else {
			msec = 1;
			div = speed / 1000;
		}
	}

	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;

	tmo = mbits * msec;

	if (div)
		return tmo+1;
	return tmo;
}
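
/* Worked example of the heuristic above (approximate figures): with
 * tp_block_size = 1 MiB on a 1 Gb/s link, speed = 1000 so div = 1 and
 * msec = 1; mbits = (1048576 * 8) / (1024 * 1024) = 8, giving tmo = 8 and a
 * returned retire timeout of 9 ms - roughly the time needed to fill one
 * block, plus slack.  On a 10 Gb/s link div = 10, so mbits /= 10 truncates
 * to 0 and the function returns 1 ms.
 */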
505
bc59ba39 506static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 507 union tpacket_req_u *req_u)
508{
509 p1->feature_req_word = req_u->req3.tp_feature_req_word;
510}
511
512static void init_prb_bdqc(struct packet_sock *po,
513 struct packet_ring_buffer *rb,
514 struct pgv *pg_vec,
515 union tpacket_req_u *req_u, int tx_ring)
516{
bc59ba39 517 struct tpacket_kbdq_core *p1 = &rb->prb_bdqc;
518 struct tpacket_block_desc *pbd;
f6fb8f10 519
520 memset(p1, 0x0, sizeof(*p1));
521
522 p1->knxt_seq_num = 1;
523 p1->pkbdq = pg_vec;
bc59ba39 524 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 525 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 526 p1->kblk_size = req_u->req3.tp_block_size;
527 p1->knum_blocks = req_u->req3.tp_block_nr;
528 p1->hdrlen = po->tp_hdrlen;
529 p1->version = po->tp_version;
530 p1->last_kactive_blk_num = 0;
ee80fbf3 531 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 532 if (req_u->req3.tp_retire_blk_tov)
533 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
534 else
535 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
536 req_u->req3.tp_block_size);
537 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
538 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
539
540 prb_init_ft_ops(p1, req_u);
541 prb_setup_retire_blk_timer(po, tx_ring);
542 prb_open_block(p1, pbd);
543}
544
545/* Do NOT update the last_blk_num first.
546 * Assumes sk_buff_head lock is held.
547 */
bc59ba39 548static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 549{
550 mod_timer(&pkc->retire_blk_timer,
551 jiffies + pkc->tov_in_jiffies);
552 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
553}
554
/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
578static void prb_retire_rx_blk_timer_expired(unsigned long data)
579{
580 struct packet_sock *po = (struct packet_sock *)data;
bc59ba39 581 struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc;
f6fb8f10 582 unsigned int frozen;
bc59ba39 583 struct tpacket_block_desc *pbd;
f6fb8f10 584
585 spin_lock(&po->sk.sk_receive_queue.lock);
586
587 frozen = prb_queue_frozen(pkc);
588 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
589
590 if (unlikely(pkc->delete_blk_timer))
591 goto out;
592
593 /* We only need to plug the race when the block is partially filled.
594 * tpacket_rcv:
595 * lock(); increment BLOCK_NUM_PKTS; unlock()
596 * copy_bits() is in progress ...
597 * timer fires on other cpu:
598 * we can't retire the current block because copy_bits
599 * is in progress.
600 *
601 */
602 if (BLOCK_NUM_PKTS(pbd)) {
603 while (atomic_read(&pkc->blk_fill_in_prog)) {
604 /* Waiting for skb_copy_bits to finish... */
605 cpu_relax();
606 }
607 }
608
609 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
610 if (!frozen) {
611 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
612 if (!prb_dispatch_next_block(pkc, po))
613 goto refresh_timer;
614 else
615 goto out;
616 } else {
617 /* Case 1. Queue was frozen because user-space was
618 * lagging behind.
619 */
620 if (prb_curr_blk_in_use(pkc, pbd)) {
621 /*
622 * Ok, user-space is still behind.
623 * So just refresh the timer.
624 */
625 goto refresh_timer;
626 } else {
627 /* Case 2. queue was frozen,user-space caught up,
628 * now the link went idle && the timer fired.
629 * We don't have a block to close.So we open this
630 * block and restart the timer.
631 * opening a block thaws the queue,restarts timer
632 * Thawing/timer-refresh is a side effect.
633 */
634 prb_open_block(pkc, pbd);
635 goto out;
636 }
637 }
638 }
639
640refresh_timer:
641 _prb_refresh_rx_retire_blk_timer(pkc);
642
643out:
644 spin_unlock(&po->sk.sk_receive_queue.lock);
645}
646
eea49cc9 647static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 648 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 649{
650 /* Flush everything minus the block header */
651
652#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
653 u8 *start, *end;
654
655 start = (u8 *)pbd1;
656
657 /* Skip the block header(we know header WILL fit in 4K) */
658 start += PAGE_SIZE;
659
660 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
661 for (; start < end; start += PAGE_SIZE)
662 flush_dcache_page(pgv_to_page(start));
663
664 smp_wmb();
665#endif
666
667 /* Now update the block status. */
668
669 BLOCK_STATUS(pbd1) = status;
670
671 /* Flush the block header */
672
673#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
674 start = (u8 *)pbd1;
675 flush_dcache_page(pgv_to_page(start));
676
677 smp_wmb();
678#endif
679}
680
681/*
682 * Side effect:
683 *
684 * 1) flush the block
685 * 2) Increment active_blk_num
686 *
687 * Note:We DONT refresh the timer on purpose.
688 * Because almost always the next block will be opened.
689 */
bc59ba39 690static void prb_close_block(struct tpacket_kbdq_core *pkc1,
691 struct tpacket_block_desc *pbd1,
f6fb8f10 692 struct packet_sock *po, unsigned int stat)
693{
694 __u32 status = TP_STATUS_USER | stat;
695
696 struct tpacket3_hdr *last_pkt;
bc59ba39 697 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 698
ee80fbf3 699 if (po->stats.stats3.tp_drops)
f6fb8f10 700 status |= TP_STATUS_LOSING;
701
702 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
703 last_pkt->tp_next_offset = 0;
704
705 /* Get the ts of the last pkt */
706 if (BLOCK_NUM_PKTS(pbd1)) {
707 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
708 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
709 } else {
710 /* Ok, we tmo'd - so get the current time */
711 struct timespec ts;
712 getnstimeofday(&ts);
713 h1->ts_last_pkt.ts_sec = ts.tv_sec;
714 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
715 }
716
717 smp_wmb();
718
719 /* Flush the block */
720 prb_flush_block(pkc1, pbd1, status);
721
722 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
723}
724
eea49cc9 725static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 726{
727 pkc->reset_pending_on_curr_blk = 0;
728}
729
730/*
731 * Side effect of opening a block:
732 *
733 * 1) prb_queue is thawed.
734 * 2) retire_blk_timer is refreshed.
735 *
736 */
bc59ba39 737static void prb_open_block(struct tpacket_kbdq_core *pkc1,
738 struct tpacket_block_desc *pbd1)
f6fb8f10 739{
740 struct timespec ts;
bc59ba39 741 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 742
743 smp_rmb();
744
8da3056c
DB
745 /* We could have just memset this but we will lose the
746 * flexibility of making the priv area sticky
747 */
f6fb8f10 748
8da3056c
DB
749 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
750 BLOCK_NUM_PKTS(pbd1) = 0;
751 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 752
8da3056c
DB
753 getnstimeofday(&ts);
754
755 h1->ts_first_pkt.ts_sec = ts.tv_sec;
756 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
f6fb8f10 757
8da3056c
DB
758 pkc1->pkblk_start = (char *)pbd1;
759 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
760
761 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
762 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
763
764 pbd1->version = pkc1->version;
765 pkc1->prev = pkc1->nxt_offset;
766 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
767
768 prb_thaw_queue(pkc1);
769 _prb_refresh_rx_retire_blk_timer(pkc1);
770
771 smp_wmb();
f6fb8f10 772}
773
/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens the Rx ring.
 * 3) Some time past 't0', the kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7'; since there is no space left,
 *    it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in the near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
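
/* How the consuming side of this protocol typically looks in user space (an
 * illustrative, untested sketch; the field names come from the TPACKET_V3
 * uapi headers, the loop variables are hypothetical):
 *
 *	struct tpacket_block_desc *bd = ring + bnum * req.tp_block_size;
 *	while (!(bd->hdr.bh1.block_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);
 *	struct tpacket3_hdr *p = (void *)bd + bd->hdr.bh1.offset_to_first_pkt;
 *	for (unsigned int i = 0; i < bd->hdr.bh1.num_pkts; i++) {
 *		// ... process p->tp_mac / p->tp_snaplen ...
 *		p = (void *)p + p->tp_next_offset;
 *	}
 *	bd->hdr.bh1.block_status = TP_STATUS_KERNEL;	// thaw/reuse the block
 *	bnum = (bnum + 1) % req.tp_block_nr;
 */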
eea49cc9 797static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 798 struct packet_sock *po)
799{
800 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 801 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 802}
803
804#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
805
806/*
807 * If the next block is free then we will dispatch it
808 * and return a good offset.
809 * Else, we will freeze the queue.
810 * So, caller must check the return value.
811 */
bc59ba39 812static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 813 struct packet_sock *po)
814{
bc59ba39 815 struct tpacket_block_desc *pbd;
f6fb8f10 816
817 smp_rmb();
818
819 /* 1. Get current block num */
820 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
821
822 /* 2. If this block is currently in_use then freeze the queue */
823 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
824 prb_freeze_queue(pkc, po);
825 return NULL;
826 }
827
828 /*
829 * 3.
830 * open this block and return the offset where the first packet
831 * needs to get stored.
832 */
833 prb_open_block(pkc, pbd);
834 return (void *)pkc->nxt_offset;
835}
836
bc59ba39 837static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 838 struct packet_sock *po, unsigned int status)
839{
bc59ba39 840 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 841
842 /* retire/close the current block */
843 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
844 /*
845 * Plug the case where copy_bits() is in progress on
846 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
847 * have space to copy the pkt in the current block and
848 * called prb_retire_current_block()
849 *
850 * We don't need to worry about the TMO case because
851 * the timer-handler already handled this case.
852 */
853 if (!(status & TP_STATUS_BLK_TMO)) {
854 while (atomic_read(&pkc->blk_fill_in_prog)) {
855 /* Waiting for skb_copy_bits to finish... */
856 cpu_relax();
857 }
858 }
859 prb_close_block(pkc, pbd, po, status);
860 return;
861 }
f6fb8f10 862}
863
eea49cc9 864static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
bc59ba39 865 struct tpacket_block_desc *pbd)
f6fb8f10 866{
867 return TP_STATUS_USER & BLOCK_STATUS(pbd);
868}
869
eea49cc9 870static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 871{
872 return pkc->reset_pending_on_curr_blk;
873}
874
eea49cc9 875static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 876{
bc59ba39 877 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 878 atomic_dec(&pkc->blk_fill_in_prog);
879}
880
eea49cc9 881static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 882 struct tpacket3_hdr *ppd)
883{
884 ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
885}
886
eea49cc9 887static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 888 struct tpacket3_hdr *ppd)
889{
890 ppd->hv1.tp_rxhash = 0;
891}
892
eea49cc9 893static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 894 struct tpacket3_hdr *ppd)
895{
896 if (vlan_tx_tag_present(pkc->skb)) {
897 ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
898 ppd->tp_status = TP_STATUS_VLAN_VALID;
899 } else {
9e67030a 900 ppd->hv1.tp_vlan_tci = 0;
901 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 902 }
903}
904
bc59ba39 905static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 906 struct tpacket3_hdr *ppd)
907{
908 prb_fill_vlan_info(pkc, ppd);
909
910 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
911 prb_fill_rxhash(pkc, ppd);
912 else
913 prb_clear_rxhash(pkc, ppd);
914}
915
eea49cc9 916static void prb_fill_curr_block(char *curr,
bc59ba39 917 struct tpacket_kbdq_core *pkc,
918 struct tpacket_block_desc *pbd,
f6fb8f10 919 unsigned int len)
920{
921 struct tpacket3_hdr *ppd;
922
923 ppd = (struct tpacket3_hdr *)curr;
924 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
925 pkc->prev = curr;
926 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
927 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
928 BLOCK_NUM_PKTS(pbd) += 1;
929 atomic_inc(&pkc->blk_fill_in_prog);
930 prb_run_all_ft_ops(pkc, ppd);
931}
932
933/* Assumes caller has the sk->rx_queue.lock */
934static void *__packet_lookup_frame_in_block(struct packet_sock *po,
935 struct sk_buff *skb,
936 int status,
937 unsigned int len
938 )
939{
bc59ba39 940 struct tpacket_kbdq_core *pkc;
941 struct tpacket_block_desc *pbd;
f6fb8f10 942 char *curr, *end;
943
e3192690 944 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 945 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
946
947 /* Queue is frozen when user space is lagging behind */
948 if (prb_queue_frozen(pkc)) {
949 /*
950 * Check if that last block which caused the queue to freeze,
951 * is still in_use by user-space.
952 */
953 if (prb_curr_blk_in_use(pkc, pbd)) {
954 /* Can't record this packet */
955 return NULL;
956 } else {
957 /*
958 * Ok, the block was released by user-space.
959 * Now let's open that block.
960 * opening a block also thaws the queue.
961 * Thawing is a side effect.
962 */
963 prb_open_block(pkc, pbd);
964 }
965 }
966
967 smp_mb();
968 curr = pkc->nxt_offset;
969 pkc->skb = skb;
e3192690 970 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 971
972 /* first try the current block */
973 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
974 prb_fill_curr_block(curr, pkc, pbd, len);
975 return (void *)curr;
976 }
977
978 /* Ok, close the current block */
979 prb_retire_current_block(pkc, po, 0);
980
981 /* Now, try to dispatch the next block */
982 curr = (char *)prb_dispatch_next_block(pkc, po);
983 if (curr) {
984 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
985 prb_fill_curr_block(curr, pkc, pbd, len);
986 return (void *)curr;
987 }
988
989 /*
990 * No free blocks are available.user_space hasn't caught up yet.
991 * Queue was just frozen and now this packet will get dropped.
992 */
993 return NULL;
994}
995
eea49cc9 996static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 997 struct sk_buff *skb,
998 int status, unsigned int len)
999{
1000 char *curr = NULL;
1001 switch (po->tp_version) {
1002 case TPACKET_V1:
1003 case TPACKET_V2:
1004 curr = packet_lookup_frame(po, &po->rx_ring,
1005 po->rx_ring.head, status);
1006 return curr;
1007 case TPACKET_V3:
1008 return __packet_lookup_frame_in_block(po, skb, status, len);
1009 default:
1010 WARN(1, "TPACKET version not supported\n");
1011 BUG();
99aa3473 1012 return NULL;
f6fb8f10 1013 }
1014}
1015
eea49cc9 1016static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1017 struct packet_ring_buffer *rb,
77f65ebd 1018 unsigned int idx,
f6fb8f10 1019 int status)
1020{
bc59ba39 1021 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1022 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1023
1024 if (status != BLOCK_STATUS(pbd))
1025 return NULL;
1026 return pbd;
1027}
1028
eea49cc9 1029static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1030{
1031 unsigned int prev;
1032 if (rb->prb_bdqc.kactive_blk_num)
1033 prev = rb->prb_bdqc.kactive_blk_num-1;
1034 else
1035 prev = rb->prb_bdqc.knum_blocks-1;
1036 return prev;
1037}
1038
1039/* Assumes caller has held the rx_queue.lock */
eea49cc9 1040static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1041 struct packet_ring_buffer *rb,
1042 int status)
1043{
1044 unsigned int previous = prb_previous_blk_num(rb);
1045 return prb_lookup_block(po, rb, previous, status);
1046}
1047
eea49cc9 1048static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1049 struct packet_ring_buffer *rb,
1050 int status)
1051{
1052 if (po->tp_version <= TPACKET_V2)
1053 return packet_previous_frame(po, rb, status);
1054
1055 return __prb_previous_block(po, rb, status);
1056}
1057
eea49cc9 1058static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1059 struct packet_ring_buffer *rb)
1060{
1061 switch (po->tp_version) {
1062 case TPACKET_V1:
1063 case TPACKET_V2:
1064 return packet_increment_head(rb);
1065 case TPACKET_V3:
1066 default:
1067 WARN(1, "TPACKET version not supported.\n");
1068 BUG();
1069 return;
1070 }
1071}
1072
eea49cc9 1073static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1074 struct packet_ring_buffer *rb,
1075 int status)
1076{
1077 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1078 return packet_lookup_frame(po, rb, previous, status);
1079}
1080
eea49cc9 1081static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1082{
1083 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1084}
1085
77f65ebd
WB
1086static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1087{
1088 struct sock *sk = &po->sk;
1089 bool has_room;
1090
1091 if (po->prot_hook.func != tpacket_rcv)
1092 return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
1093 <= sk->sk_rcvbuf;
1094
1095 spin_lock(&sk->sk_receive_queue.lock);
1096 if (po->tp_version == TPACKET_V3)
1097 has_room = prb_lookup_block(po, &po->rx_ring,
1098 po->rx_ring.prb_bdqc.kactive_blk_num,
1099 TP_STATUS_KERNEL);
1100 else
1101 has_room = packet_lookup_frame(po, &po->rx_ring,
1102 po->rx_ring.head,
1103 TP_STATUS_KERNEL);
1104 spin_unlock(&sk->sk_receive_queue.lock);
1105
1106 return has_room;
1107}
1108
1da177e4
LT
1109static void packet_sock_destruct(struct sock *sk)
1110{
ed85b565
RC
1111 skb_queue_purge(&sk->sk_error_queue);
1112
547b792c
IJ
1113 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1114 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1da177e4
LT
1115
1116 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1117 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1118 return;
1119 }
1120
17ab56a2 1121 sk_refcnt_debug_dec(sk);
1da177e4
LT
1122}
1123
static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
{
	int x = atomic_read(&f->rr_cur) + 1;

	if (x >= num)
		x = 0;

	return x;
}

static unsigned int fanout_demux_hash(struct packet_fanout *f,
				      struct sk_buff *skb,
				      unsigned int num)
{
	return (((u64)skb->rxhash) * num) >> 32;
}
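
/* The multiply-and-shift above maps the 32-bit rxhash uniformly onto
 * [0, num) without a modulo: e.g. rxhash = 0xC0000000 (three quarters of the
 * 32-bit range) with num = 8 members gives (0xC0000000ULL * 8) >> 32 = 6,
 * i.e. the socket three quarters of the way through the array.
 */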
1140
77f65ebd
WB
1141static unsigned int fanout_demux_lb(struct packet_fanout *f,
1142 struct sk_buff *skb,
1143 unsigned int num)
dc99f600
DM
1144{
1145 int cur, old;
1146
1147 cur = atomic_read(&f->rr_cur);
1148 while ((old = atomic_cmpxchg(&f->rr_cur, cur,
1149 fanout_rr_next(f, num))) != cur)
1150 cur = old;
77f65ebd
WB
1151 return cur;
1152}
1153
1154static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1155 struct sk_buff *skb,
1156 unsigned int num)
1157{
1158 return smp_processor_id() % num;
dc99f600
DM
1159}
1160
77f65ebd
WB
1161static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1162 struct sk_buff *skb,
1163 unsigned int idx, unsigned int skip,
1164 unsigned int num)
95ec3eb4 1165{
77f65ebd 1166 unsigned int i, j;
95ec3eb4 1167
77f65ebd
WB
1168 i = j = min_t(int, f->next[idx], num - 1);
1169 do {
1170 if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
1171 if (i != j)
1172 f->next[idx] = i;
1173 return i;
1174 }
1175 if (++i == num)
1176 i = 0;
1177 } while (i != j);
1178
1179 return idx;
1180}
1181
1182static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1183{
1184 return f->flags & (flag >> 8);
95ec3eb4
DM
1185}
1186
95ec3eb4
DM
1187static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1188 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1189{
1190 struct packet_fanout *f = pt->af_packet_priv;
1191 unsigned int num = f->num_members;
1192 struct packet_sock *po;
77f65ebd 1193 unsigned int idx;
dc99f600
DM
1194
1195 if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
1196 !num) {
1197 kfree_skb(skb);
1198 return 0;
1199 }
1200
95ec3eb4
DM
1201 switch (f->type) {
1202 case PACKET_FANOUT_HASH:
1203 default:
77f65ebd 1204 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
bc416d97 1205 skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
95ec3eb4
DM
1206 if (!skb)
1207 return 0;
1208 }
1209 skb_get_rxhash(skb);
77f65ebd 1210 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1211 break;
1212 case PACKET_FANOUT_LB:
77f65ebd 1213 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1214 break;
1215 case PACKET_FANOUT_CPU:
77f65ebd
WB
1216 idx = fanout_demux_cpu(f, skb, num);
1217 break;
1218 case PACKET_FANOUT_ROLLOVER:
1219 idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
95ec3eb4 1220 break;
dc99f600
DM
1221 }
1222
77f65ebd
WB
1223 po = pkt_sk(f->arr[idx]);
1224 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
1225 unlikely(!packet_rcv_has_room(po, skb))) {
1226 idx = fanout_demux_rollover(f, skb, idx, idx, num);
1227 po = pkt_sk(f->arr[idx]);
1228 }
dc99f600
DM
1229
1230 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1231}
1232
fff3321d
PE
1233DEFINE_MUTEX(fanout_mutex);
1234EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600
DM
1235static LIST_HEAD(fanout_list);
1236
1237static void __fanout_link(struct sock *sk, struct packet_sock *po)
1238{
1239 struct packet_fanout *f = po->fanout;
1240
1241 spin_lock(&f->lock);
1242 f->arr[f->num_members] = sk;
1243 smp_wmb();
1244 f->num_members++;
1245 spin_unlock(&f->lock);
1246}
1247
1248static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1249{
1250 struct packet_fanout *f = po->fanout;
1251 int i;
1252
1253 spin_lock(&f->lock);
1254 for (i = 0; i < f->num_members; i++) {
1255 if (f->arr[i] == sk)
1256 break;
1257 }
1258 BUG_ON(i >= f->num_members);
1259 f->arr[i] = f->arr[f->num_members - 1];
1260 f->num_members--;
1261 spin_unlock(&f->lock);
1262}
1263
a0dfb263 1264static bool match_fanout_group(struct packet_type *ptype, struct sock * sk)
c0de08d0
EL
1265{
1266 if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout)
1267 return true;
1268
1269 return false;
1270}
1271
7736d33f 1272static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600
DM
1273{
1274 struct packet_sock *po = pkt_sk(sk);
1275 struct packet_fanout *f, *match;
7736d33f 1276 u8 type = type_flags & 0xff;
77f65ebd 1277 u8 flags = type_flags >> 8;
dc99f600
DM
1278 int err;
1279
1280 switch (type) {
77f65ebd
WB
1281 case PACKET_FANOUT_ROLLOVER:
1282 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1283 return -EINVAL;
dc99f600
DM
1284 case PACKET_FANOUT_HASH:
1285 case PACKET_FANOUT_LB:
95ec3eb4 1286 case PACKET_FANOUT_CPU:
dc99f600
DM
1287 break;
1288 default:
1289 return -EINVAL;
1290 }
1291
1292 if (!po->running)
1293 return -EINVAL;
1294
1295 if (po->fanout)
1296 return -EALREADY;
1297
1298 mutex_lock(&fanout_mutex);
1299 match = NULL;
1300 list_for_each_entry(f, &fanout_list, list) {
1301 if (f->id == id &&
1302 read_pnet(&f->net) == sock_net(sk)) {
1303 match = f;
1304 break;
1305 }
1306 }
afe62c68 1307 err = -EINVAL;
77f65ebd 1308 if (match && match->flags != flags)
afe62c68 1309 goto out;
dc99f600 1310 if (!match) {
afe62c68 1311 err = -ENOMEM;
dc99f600 1312 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1313 if (!match)
1314 goto out;
1315 write_pnet(&match->net, sock_net(sk));
1316 match->id = id;
1317 match->type = type;
77f65ebd 1318 match->flags = flags;
afe62c68
ED
1319 atomic_set(&match->rr_cur, 0);
1320 INIT_LIST_HEAD(&match->list);
1321 spin_lock_init(&match->lock);
1322 atomic_set(&match->sk_ref, 0);
1323 match->prot_hook.type = po->prot_hook.type;
1324 match->prot_hook.dev = po->prot_hook.dev;
1325 match->prot_hook.func = packet_rcv_fanout;
1326 match->prot_hook.af_packet_priv = match;
c0de08d0 1327 match->prot_hook.id_match = match_fanout_group;
afe62c68
ED
1328 dev_add_pack(&match->prot_hook);
1329 list_add(&match->list, &fanout_list);
dc99f600 1330 }
afe62c68
ED
1331 err = -EINVAL;
1332 if (match->type == type &&
1333 match->prot_hook.type == po->prot_hook.type &&
1334 match->prot_hook.dev == po->prot_hook.dev) {
1335 err = -ENOSPC;
1336 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1337 __dev_remove_pack(&po->prot_hook);
1338 po->fanout = match;
1339 atomic_inc(&match->sk_ref);
1340 __fanout_link(sk, po);
1341 err = 0;
dc99f600
DM
1342 }
1343 }
afe62c68 1344out:
dc99f600
DM
1345 mutex_unlock(&fanout_mutex);
1346 return err;
1347}
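
/* User-space side of fanout setup (a hedged sketch, not part of this file):
 * each worker creates its own packet socket, binds it identically, and joins
 * the same fanout group; the kernel then spreads packets across the members.
 * "eth0" and group_id are illustrative values.
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 *	int group_id = 42;				// arbitrary 16-bit id
 *	int arg = group_id | (PACKET_FANOUT_HASH << 16);
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
 */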
1348
1349static void fanout_release(struct sock *sk)
1350{
1351 struct packet_sock *po = pkt_sk(sk);
1352 struct packet_fanout *f;
1353
1354 f = po->fanout;
1355 if (!f)
1356 return;
1357
fff3321d 1358 mutex_lock(&fanout_mutex);
dc99f600
DM
1359 po->fanout = NULL;
1360
dc99f600
DM
1361 if (atomic_dec_and_test(&f->sk_ref)) {
1362 list_del(&f->list);
1363 dev_remove_pack(&f->prot_hook);
1364 kfree(f);
1365 }
1366 mutex_unlock(&fanout_mutex);
1367}
1da177e4 1368
90ddc4f0 1369static const struct proto_ops packet_ops;
1da177e4 1370
90ddc4f0 1371static const struct proto_ops packet_ops_spkt;
1da177e4 1372
40d4e3df
ED
1373static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1374 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1375{
1376 struct sock *sk;
1377 struct sockaddr_pkt *spkt;
1378
1379 /*
1380 * When we registered the protocol we saved the socket in the data
1381 * field for just this event.
1382 */
1383
1384 sk = pt->af_packet_priv;
1ce4f28b 1385
1da177e4
LT
1386 /*
1387 * Yank back the headers [hope the device set this
1388 * right or kerboom...]
1389 *
1390 * Incoming packets have ll header pulled,
1391 * push it back.
1392 *
98e399f8 1393 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1394 * so that this procedure is noop.
1395 */
1396
1397 if (skb->pkt_type == PACKET_LOOPBACK)
1398 goto out;
1399
09ad9bc7 1400 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1401 goto out;
1402
40d4e3df
ED
1403 skb = skb_share_check(skb, GFP_ATOMIC);
1404 if (skb == NULL)
1da177e4
LT
1405 goto oom;
1406
1407 /* drop any routing info */
adf30907 1408 skb_dst_drop(skb);
1da177e4 1409
84531c24
PO
1410 /* drop conntrack reference */
1411 nf_reset(skb);
1412
ffbc6111 1413 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1414
98e399f8 1415 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1416
1417 /*
1418 * The SOCK_PACKET socket receives _all_ frames.
1419 */
1420
1421 spkt->spkt_family = dev->type;
1422 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1423 spkt->spkt_protocol = skb->protocol;
1424
1425 /*
1426 * Charge the memory to the socket. This is done specifically
1427 * to prevent sockets using all the memory up.
1428 */
1429
40d4e3df 1430 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1431 return 0;
1432
1433out:
1434 kfree_skb(skb);
1435oom:
1436 return 0;
1437}
1438
1439
1440/*
1441 * Output a raw packet to a device layer. This bypasses all the other
1442 * protocol layers and you must therefore supply it with a complete frame
1443 */
1ce4f28b 1444
1da177e4
LT
1445static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
1446 struct msghdr *msg, size_t len)
1447{
1448 struct sock *sk = sock->sk;
40d4e3df 1449 struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
1a35ca80 1450 struct sk_buff *skb = NULL;
1da177e4 1451 struct net_device *dev;
40d4e3df 1452 __be16 proto = 0;
1da177e4 1453 int err;
3bdc0eba 1454 int extra_len = 0;
1ce4f28b 1455
1da177e4 1456 /*
1ce4f28b 1457 * Get and verify the address.
1da177e4
LT
1458 */
1459
40d4e3df 1460 if (saddr) {
1da177e4 1461 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1462 return -EINVAL;
1463 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1464 proto = saddr->spkt_protocol;
1465 } else
1466 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1467
1468 /*
1ce4f28b 1469 * Find the device first to size check it
1da177e4
LT
1470 */
1471
de74e92a 1472 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1473retry:
654d1f8a
ED
1474 rcu_read_lock();
1475 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1476 err = -ENODEV;
1477 if (dev == NULL)
1478 goto out_unlock;
1ce4f28b 1479
d5e76b0a
DM
1480 err = -ENETDOWN;
1481 if (!(dev->flags & IFF_UP))
1482 goto out_unlock;
1483
1da177e4 1484 /*
40d4e3df
ED
1485 * You may not queue a frame bigger than the mtu. This is the lowest level
1486 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1487 */
1ce4f28b 1488
3bdc0eba
BG
1489 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1490 if (!netif_supports_nofcs(dev)) {
1491 err = -EPROTONOSUPPORT;
1492 goto out_unlock;
1493 }
1494 extra_len = 4; /* We're doing our own CRC */
1495 }
1496
1da177e4 1497 err = -EMSGSIZE;
3bdc0eba 1498 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1499 goto out_unlock;
1500
1a35ca80
ED
1501 if (!skb) {
1502 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1503 int tlen = dev->needed_tailroom;
1a35ca80
ED
1504 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1505
1506 rcu_read_unlock();
4ce40912 1507 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1508 if (skb == NULL)
1509 return -ENOBUFS;
1510 /* FIXME: Save some space for broken drivers that write a hard
1511 * header at transmission time by themselves. PPP is the notable
1512 * one here. This should really be fixed at the driver level.
1513 */
1514 skb_reserve(skb, reserved);
1515 skb_reset_network_header(skb);
1516
1517 /* Try to align data part correctly */
1518 if (hhlen) {
1519 skb->data -= hhlen;
1520 skb->tail -= hhlen;
1521 if (len < hhlen)
1522 skb_reset_network_header(skb);
1523 }
1524 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1525 if (err)
1526 goto out_free;
1527 goto retry;
1da177e4
LT
1528 }
1529
3bdc0eba 1530 if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
57f89bfa
BG
1531 /* Earlier code assumed this would be a VLAN pkt,
1532 * double-check this now that we have the actual
1533 * packet in hand.
1534 */
1535 struct ethhdr *ehdr;
1536 skb_reset_mac_header(skb);
1537 ehdr = eth_hdr(skb);
1538 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1539 err = -EMSGSIZE;
1540 goto out_unlock;
1541 }
1542 }
1a35ca80 1543
1da177e4
LT
1544 skb->protocol = proto;
1545 skb->dev = dev;
1546 skb->priority = sk->sk_priority;
2d37a186 1547 skb->mark = sk->sk_mark;
bf84a010
DB
1548
1549 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 1550
3bdc0eba
BG
1551 if (unlikely(extra_len == 4))
1552 skb->no_fcs = 1;
1553
40893fd0 1554 skb_probe_transport_header(skb, 0);
c1aad275 1555
1da177e4 1556 dev_queue_xmit(skb);
654d1f8a 1557 rcu_read_unlock();
40d4e3df 1558 return len;
1da177e4 1559
1da177e4 1560out_unlock:
654d1f8a 1561 rcu_read_unlock();
1a35ca80
ED
1562out_free:
1563 kfree_skb(skb);
1da177e4
LT
1564 return err;
1565}
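
/* Legacy SOCK_PACKET transmit as seen from user space (an illustrative
 * sketch of the obsolete API this function serves; new code should use
 * AF_PACKET/SOCK_RAW with sockaddr_ll instead).  frame[] is assumed to hold
 * a complete link-layer frame.
 *
 *	int fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 *	struct sockaddr_pkt spkt = { .spkt_family = AF_PACKET };
 *	strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *	spkt.spkt_protocol = htons(ETH_P_IP);
 *	sendto(fd, frame, frame_len, 0,
 *	       (struct sockaddr *)&spkt, sizeof(spkt));
 */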
1da177e4 1566
static unsigned int run_filter(const struct sk_buff *skb,
				      const struct sock *sk,
				      unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		res = SK_RUN_FILTER(filter, skb);
	rcu_read_unlock();

	return res;
}
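
/* The filter consulted here is installed from user space with
 * SO_ATTACH_FILTER; a minimal classic-BPF sketch (illustrative only) that
 * accepts every packet but truncates it to 96 bytes, matching the
 * snaplen-capping use of the return value above:
 *
 *	struct sock_filter code[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 96),	// accept, snap to 96 bytes
 *	};
 *	struct sock_fprog prog = { .len = 1, .filter = code };
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */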

/*
 * This function does lazy skb cloning in the hope that most packets
 * are discarded by BPF.
 *
 * Note the tricky part: we DO mangle the shared skb!  skb->data, skb->len
 * and skb->cb are mangled.  It works because (and until) packets
 * falling here are owned by the current CPU.  Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so if we return the skb to its original state on exit,
 * we will not harm anyone.
 */
1593
40d4e3df
ED
1594static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1595 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1596{
1597 struct sock *sk;
1598 struct sockaddr_ll *sll;
1599 struct packet_sock *po;
40d4e3df 1600 u8 *skb_head = skb->data;
1da177e4 1601 int skb_len = skb->len;
dbcb5855 1602 unsigned int snaplen, res;
1da177e4
LT
1603
1604 if (skb->pkt_type == PACKET_LOOPBACK)
1605 goto drop;
1606
1607 sk = pt->af_packet_priv;
1608 po = pkt_sk(sk);
1609
09ad9bc7 1610 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1611 goto drop;
1612
1da177e4
LT
1613 skb->dev = dev;
1614
3b04ddde 1615 if (dev->header_ops) {
1da177e4 1616 /* The device has an explicit notion of ll header,
62ab0812
ED
1617 * exported to higher levels.
1618 *
1619 * Otherwise, the device hides details of its frame
1620 * structure, so that corresponding packet head is
1621 * never delivered to user.
1da177e4
LT
1622 */
1623 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1624 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1625 else if (skb->pkt_type == PACKET_OUTGOING) {
1626 /* Special case: outgoing packets have ll header at head */
bbe735e4 1627 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1628 }
1629 }
1630
1631 snaplen = skb->len;
1632
dbcb5855
DM
1633 res = run_filter(skb, sk, snaplen);
1634 if (!res)
fda9ef5d 1635 goto drop_n_restore;
dbcb5855
DM
1636 if (snaplen > res)
1637 snaplen = res;
1da177e4 1638
0fd7bac6 1639 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
1640 goto drop_n_acct;
1641
1642 if (skb_shared(skb)) {
1643 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1644 if (nskb == NULL)
1645 goto drop_n_acct;
1646
1647 if (skb_head != skb->data) {
1648 skb->data = skb_head;
1649 skb->len = skb_len;
1650 }
abc4e4fa 1651 consume_skb(skb);
1da177e4
LT
1652 skb = nskb;
1653 }
1654
ffbc6111
HX
1655 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
1656 sizeof(skb->cb));
1657
1658 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4
LT
1659 sll->sll_family = AF_PACKET;
1660 sll->sll_hatype = dev->type;
1661 sll->sll_protocol = skb->protocol;
1662 sll->sll_pkttype = skb->pkt_type;
8032b464 1663 if (unlikely(po->origdev))
80feaacb
PWJ
1664 sll->sll_ifindex = orig_dev->ifindex;
1665 else
1666 sll->sll_ifindex = dev->ifindex;
1da177e4 1667
b95cce35 1668 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 1669
ffbc6111 1670 PACKET_SKB_CB(skb)->origlen = skb->len;
8dc41944 1671
1da177e4
LT
1672 if (pskb_trim(skb, snaplen))
1673 goto drop_n_acct;
1674
1675 skb_set_owner_r(skb, sk);
1676 skb->dev = NULL;
adf30907 1677 skb_dst_drop(skb);
1da177e4 1678
84531c24
PO
1679 /* drop conntrack reference */
1680 nf_reset(skb);
1681
1da177e4 1682 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1683 po->stats.stats1.tp_packets++;
3b885787 1684 skb->dropcount = atomic_read(&sk->sk_drops);
1da177e4
LT
1685 __skb_queue_tail(&sk->sk_receive_queue, skb);
1686 spin_unlock(&sk->sk_receive_queue.lock);
1687 sk->sk_data_ready(sk, skb->len);
1688 return 0;
1689
1690drop_n_acct:
7091fbd8 1691 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 1692 po->stats.stats1.tp_drops++;
7091fbd8
WB
1693 atomic_inc(&sk->sk_drops);
1694 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
1695
1696drop_n_restore:
1697 if (skb_head != skb->data && skb_shared(skb)) {
1698 skb->data = skb_head;
1699 skb->len = skb_len;
1700 }
1701drop:
ead2ceb0 1702 consume_skb(skb);
1da177e4
LT
1703 return 0;
1704}
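
/* The tp_packets/tp_drops counters updated above are exported to user space
 * via getsockopt(PACKET_STATISTICS); a sketch (not part of this file), noting
 * that the counters are clear-on-read:
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *	getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len);
 *	printf("received %u, dropped %u\n", st.tp_packets, st.tp_drops);
 */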
1705
40d4e3df
ED
1706static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1707 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1708{
1709 struct sock *sk;
1710 struct packet_sock *po;
1711 struct sockaddr_ll *sll;
184f489e 1712 union tpacket_uhdr h;
40d4e3df 1713 u8 *skb_head = skb->data;
1da177e4 1714 int skb_len = skb->len;
dbcb5855 1715 unsigned int snaplen, res;
f6fb8f10 1716 unsigned long status = TP_STATUS_USER;
bbd6ef87 1717 unsigned short macoff, netoff, hdrlen;
1da177e4 1718 struct sk_buff *copy_skb = NULL;
bbd6ef87 1719 struct timespec ts;
b9c32fb2 1720 __u32 ts_status;
1da177e4
LT
1721
1722 if (skb->pkt_type == PACKET_LOOPBACK)
1723 goto drop;
1724
1725 sk = pt->af_packet_priv;
1726 po = pkt_sk(sk);
1727
09ad9bc7 1728 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1729 goto drop;
1730
3b04ddde 1731 if (dev->header_ops) {
1da177e4 1732 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1733 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1734 else if (skb->pkt_type == PACKET_OUTGOING) {
1735 /* Special case: outgoing packets have ll header at head */
bbe735e4 1736 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1737 }
1738 }
1739
8dc41944
HX
1740 if (skb->ip_summed == CHECKSUM_PARTIAL)
1741 status |= TP_STATUS_CSUMNOTREADY;
1742
1da177e4
LT
1743 snaplen = skb->len;
1744
dbcb5855
DM
1745 res = run_filter(skb, sk, snaplen);
1746 if (!res)
fda9ef5d 1747 goto drop_n_restore;
dbcb5855
DM
1748 if (snaplen > res)
1749 snaplen = res;
1da177e4
LT
1750
1751 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
1752 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1753 po->tp_reserve;
1da177e4 1754 } else {
95c96174 1755 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 1756 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
1757 (maclen < 16 ? 16 : maclen)) +
1758 po->tp_reserve;
1da177e4
LT
1759 macoff = netoff - maclen;
1760 }
f6fb8f10 1761 if (po->tp_version <= TPACKET_V2) {
1762 if (macoff + snaplen > po->rx_ring.frame_size) {
1763 if (po->copy_thresh &&
0fd7bac6 1764 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 1765 if (skb_shared(skb)) {
1766 copy_skb = skb_clone(skb, GFP_ATOMIC);
1767 } else {
1768 copy_skb = skb_get(skb);
1769 skb_head = skb->data;
1770 }
1771 if (copy_skb)
1772 skb_set_owner_r(copy_skb, sk);
1da177e4 1773 }
f6fb8f10 1774 snaplen = po->rx_ring.frame_size - macoff;
1775 if ((int)snaplen < 0)
1776 snaplen = 0;
1da177e4 1777 }
1da177e4 1778 }
1da177e4 1779 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 1780 h.raw = packet_current_rx_frame(po, skb,
1781 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 1782 if (!h.raw)
1da177e4 1783 goto ring_is_full;
f6fb8f10 1784 if (po->tp_version <= TPACKET_V2) {
1785 packet_increment_rx_head(po, &po->rx_ring);
1786 /*
1787 * LOSING will be reported till you read the stats,
1788 * because it's COR - Clear On Read.
1789 * Anyways, moving it for V1/V2 only as V3 doesn't need this
1790 * at packet level.
1791 */
ee80fbf3 1792 if (po->stats.stats1.tp_drops)
f6fb8f10 1793 status |= TP_STATUS_LOSING;
1794 }
ee80fbf3 1795 po->stats.stats1.tp_packets++;
1da177e4
LT
1796 if (copy_skb) {
1797 status |= TP_STATUS_COPY;
1798 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1799 }
1da177e4
LT
1800 spin_unlock(&sk->sk_receive_queue.lock);
1801
bbd6ef87 1802 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
1803
1804 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 1805 getnstimeofday(&ts);
1da177e4 1806
b9c32fb2
DB
1807 status |= ts_status;
1808
bbd6ef87
PM
1809 switch (po->tp_version) {
1810 case TPACKET_V1:
1811 h.h1->tp_len = skb->len;
1812 h.h1->tp_snaplen = snaplen;
1813 h.h1->tp_mac = macoff;
1814 h.h1->tp_net = netoff;
4b457bdf
DB
1815 h.h1->tp_sec = ts.tv_sec;
1816 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
1817 hdrlen = sizeof(*h.h1);
1818 break;
1819 case TPACKET_V2:
1820 h.h2->tp_len = skb->len;
1821 h.h2->tp_snaplen = snaplen;
1822 h.h2->tp_mac = macoff;
1823 h.h2->tp_net = netoff;
bbd6ef87
PM
1824 h.h2->tp_sec = ts.tv_sec;
1825 h.h2->tp_nsec = ts.tv_nsec;
a3bcc23e
BG
1826 if (vlan_tx_tag_present(skb)) {
1827 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
1828 status |= TP_STATUS_VLAN_VALID;
1829 } else {
1830 h.h2->tp_vlan_tci = 0;
1831 }
13fcb7bd 1832 h.h2->tp_padding = 0;
bbd6ef87
PM
1833 hdrlen = sizeof(*h.h2);
1834 break;
f6fb8f10 1835 case TPACKET_V3:
1836 /* tp_nxt_offset and vlan are already populated above,
1837 * so don't clear those fields here.
1838 */
1839 h.h3->tp_status |= status;
1840 h.h3->tp_len = skb->len;
1841 h.h3->tp_snaplen = snaplen;
1842 h.h3->tp_mac = macoff;
1843 h.h3->tp_net = netoff;
f6fb8f10 1844 h.h3->tp_sec = ts.tv_sec;
1845 h.h3->tp_nsec = ts.tv_nsec;
1846 hdrlen = sizeof(*h.h3);
1847 break;
bbd6ef87
PM
1848 default:
1849 BUG();
1850 }
1da177e4 1851
bbd6ef87 1852 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 1853 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
1854 sll->sll_family = AF_PACKET;
1855 sll->sll_hatype = dev->type;
1856 sll->sll_protocol = skb->protocol;
1857 sll->sll_pkttype = skb->pkt_type;
8032b464 1858 if (unlikely(po->origdev))
80feaacb
PWJ
1859 sll->sll_ifindex = orig_dev->ifindex;
1860 else
1861 sll->sll_ifindex = dev->ifindex;
1da177e4 1862
e16aa207 1863 smp_mb();
f6dafa95 1864#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
1da177e4 1865 {
0af55bb5
CG
1866 u8 *start, *end;
1867
f6fb8f10 1868 if (po->tp_version <= TPACKET_V2) {
1869 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
1870 + macoff + snaplen);
1871 for (start = h.raw; start < end; start += PAGE_SIZE)
1872 flush_dcache_page(pgv_to_page(start));
1873 }
cc9f01b2 1874 smp_wmb();
1da177e4 1875 }
f6dafa95 1876#endif
f6fb8f10 1877 if (po->tp_version <= TPACKET_V2)
1878 __packet_set_status(po, h.raw, status);
1879 else
1880 prb_clear_blk_fill_status(&po->rx_ring);
1da177e4
LT
1881
1882 sk->sk_data_ready(sk, 0);
1883
1884drop_n_restore:
1885 if (skb_head != skb->data && skb_shared(skb)) {
1886 skb->data = skb_head;
1887 skb->len = skb_len;
1888 }
1889drop:
1ce4f28b 1890 kfree_skb(skb);
1da177e4
LT
1891 return 0;
1892
1893ring_is_full:
ee80fbf3 1894 po->stats.stats1.tp_drops++;
1da177e4
LT
1895 spin_unlock(&sk->sk_receive_queue.lock);
1896
1897 sk->sk_data_ready(sk, 0);
acb5d75b 1898 kfree_skb(copy_skb);
1da177e4
LT
1899 goto drop_n_restore;
1900}
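/*
 * Editorial note: tpacket_rcv() above fills frames that user space maps with
 * PACKET_RX_RING.  The fragment below is a minimal, illustrative user-space
 * sketch of that usage for TPACKET_V2; it is not part of this file and not
 * built with the kernel.  Ring sizes are arbitrary example values and error
 * handling is omitted for brevity.
 */
#if 0	/* user-space example only */
#include <stdint.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <poll.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static void rx_ring_example(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	int ver = TPACKET_V2;
	struct tpacket_req req = {
		.tp_block_size = 1 << 16,		/* multiple of PAGE_SIZE */
		.tp_block_nr   = 64,
		.tp_frame_size = 1 << 11,		/* TPACKET_ALIGNMENT aligned */
		.tp_frame_nr   = ((1 << 16) / (1 << 11)) * 64,
	};
	uint8_t *ring;
	unsigned int i = 0;

	/* The version must be chosen before the ring is created. */
	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	for (;;) {
		struct tpacket2_hdr *hdr =
			(void *)(ring + (size_t)i * req.tp_frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER)) {
			struct pollfd pfd = { .fd = fd, .events = POLLIN };

			poll(&pfd, 1, -1);
			continue;
		}
		/* Frame data starts tp_mac bytes into the frame:
		 * (uint8_t *)hdr + hdr->tp_mac, length hdr->tp_snaplen. */
		hdr->tp_status = TP_STATUS_KERNEL;	/* hand frame back */
		i = (i + 1) % req.tp_frame_nr;
	}
}
#endif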
1901
69e3c75f
JB
1902static void tpacket_destruct_skb(struct sk_buff *skb)
1903{
1904 struct packet_sock *po = pkt_sk(skb->sk);
40d4e3df 1905 void *ph;
1da177e4 1906
69e3c75f 1907 if (likely(po->tx_ring.pg_vec)) {
b9c32fb2
DB
1908 __u32 ts;
1909
69e3c75f 1910 ph = skb_shinfo(skb)->destructor_arg;
69e3c75f
JB
1911 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
1912 atomic_dec(&po->tx_ring.pending);
b9c32fb2
DB
1913
1914 ts = __packet_set_timestamp(po, ph, skb);
1915 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
1916 }
1917
1918 sock_wfree(skb);
1919}
1920
40d4e3df
ED
1921static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
1922 void *frame, struct net_device *dev, int size_max,
ae641949 1923 __be16 proto, unsigned char *addr, int hlen)
69e3c75f 1924{
184f489e 1925 union tpacket_uhdr ph;
69e3c75f
JB
1926 int to_write, offset, len, tp_len, nr_frags, len_max;
1927 struct socket *sock = po->sk.sk_socket;
1928 struct page *page;
1929 void *data;
1930 int err;
1931
1932 ph.raw = frame;
1933
1934 skb->protocol = proto;
1935 skb->dev = dev;
1936 skb->priority = po->sk.sk_priority;
2d37a186 1937 skb->mark = po->sk.sk_mark;
2e31396f 1938 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
69e3c75f
JB
1939 skb_shinfo(skb)->destructor_arg = ph.raw;
1940
1941 switch (po->tp_version) {
1942 case TPACKET_V2:
1943 tp_len = ph.h2->tp_len;
1944 break;
1945 default:
1946 tp_len = ph.h1->tp_len;
1947 break;
1948 }
1949 if (unlikely(tp_len > size_max)) {
40d4e3df 1950 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
69e3c75f
JB
1951 return -EMSGSIZE;
1952 }
1953
ae641949 1954 skb_reserve(skb, hlen);
69e3c75f 1955 skb_reset_network_header(skb);
40893fd0 1956 skb_probe_transport_header(skb, 0);
c1aad275 1957
5920cd3a
PC
1958 if (po->tp_tx_has_off) {
1959 int off_min, off_max, off;
1960 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
1961 off_max = po->tx_ring.frame_size - tp_len;
1962 if (sock->type == SOCK_DGRAM) {
1963 switch (po->tp_version) {
1964 case TPACKET_V2:
1965 off = ph.h2->tp_net;
1966 break;
1967 default:
1968 off = ph.h1->tp_net;
1969 break;
1970 }
1971 } else {
1972 switch (po->tp_version) {
1973 case TPACKET_V2:
1974 off = ph.h2->tp_mac;
1975 break;
1976 default:
1977 off = ph.h1->tp_mac;
1978 break;
1979 }
1980 }
1981 if (unlikely((off < off_min) || (off_max < off)))
1982 return -EINVAL;
1983 data = ph.raw + off;
1984 } else {
1985 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
1986 }
69e3c75f
JB
1987 to_write = tp_len;
1988
1989 if (sock->type == SOCK_DGRAM) {
1990 err = dev_hard_header(skb, dev, ntohs(proto), addr,
1991 NULL, tp_len);
1992 if (unlikely(err < 0))
1993 return -EINVAL;
40d4e3df 1994 } else if (dev->hard_header_len) {
69e3c75f
JB
1995 /* net device doesn't like empty head */
1996 if (unlikely(tp_len <= dev->hard_header_len)) {
40d4e3df
ED
1997 pr_err("packet size is too short (%d < %d)\n",
1998 tp_len, dev->hard_header_len);
69e3c75f
JB
1999 return -EINVAL;
2000 }
2001
2002 skb_push(skb, dev->hard_header_len);
2003 err = skb_store_bits(skb, 0, data,
2004 dev->hard_header_len);
2005 if (unlikely(err))
2006 return err;
2007
2008 data += dev->hard_header_len;
2009 to_write -= dev->hard_header_len;
2010 }
2011
69e3c75f
JB
2012 offset = offset_in_page(data);
2013 len_max = PAGE_SIZE - offset;
2014 len = ((to_write > len_max) ? len_max : to_write);
2015
2016 skb->data_len = to_write;
2017 skb->len += to_write;
2018 skb->truesize += to_write;
2019 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2020
2021 while (likely(to_write)) {
2022 nr_frags = skb_shinfo(skb)->nr_frags;
2023
2024 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2025 pr_err("Packet exceed the number of skb frags(%lu)\n",
2026 MAX_SKB_FRAGS);
69e3c75f
JB
2027 return -EFAULT;
2028 }
2029
0af55bb5
CG
2030 page = pgv_to_page(data);
2031 data += len;
69e3c75f
JB
2032 flush_dcache_page(page);
2033 get_page(page);
0af55bb5 2034 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2035 to_write -= len;
2036 offset = 0;
2037 len_max = PAGE_SIZE;
2038 len = ((to_write > len_max) ? len_max : to_write);
2039 }
2040
2041 return tp_len;
2042}
2043
2044static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2045{
69e3c75f
JB
2046 struct sk_buff *skb;
2047 struct net_device *dev;
2048 __be16 proto;
827d9780
BG
2049 bool need_rls_dev = false;
2050 int err, reserve = 0;
40d4e3df
ED
2051 void *ph;
2052 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
69e3c75f
JB
2053 int tp_len, size_max;
2054 unsigned char *addr;
2055 int len_sum = 0;
9e67030a 2056 int status = TP_STATUS_AVAILABLE;
ae641949 2057 int hlen, tlen;
69e3c75f 2058
69e3c75f
JB
2059 mutex_lock(&po->pg_vec_lock);
2060
69e3c75f 2061 if (saddr == NULL) {
827d9780 2062 dev = po->prot_hook.dev;
69e3c75f
JB
2063 proto = po->num;
2064 addr = NULL;
2065 } else {
2066 err = -EINVAL;
2067 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2068 goto out;
2069 if (msg->msg_namelen < (saddr->sll_halen
2070 + offsetof(struct sockaddr_ll,
2071 sll_addr)))
2072 goto out;
69e3c75f
JB
2073 proto = saddr->sll_protocol;
2074 addr = saddr->sll_addr;
827d9780
BG
2075 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2076 need_rls_dev = true;
69e3c75f
JB
2077 }
2078
69e3c75f
JB
2079 err = -ENXIO;
2080 if (unlikely(dev == NULL))
2081 goto out;
2082
2083 reserve = dev->hard_header_len;
2084
2085 err = -ENETDOWN;
2086 if (unlikely(!(dev->flags & IFF_UP)))
2087 goto out_put;
2088
2089 size_max = po->tx_ring.frame_size
b5dd884e 2090 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f
JB
2091
2092 if (size_max > dev->mtu + reserve)
2093 size_max = dev->mtu + reserve;
2094
2095 do {
2096 ph = packet_current_frame(po, &po->tx_ring,
2097 TP_STATUS_SEND_REQUEST);
2098
2099 if (unlikely(ph == NULL)) {
2100 schedule();
2101 continue;
2102 }
2103
2104 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2105 hlen = LL_RESERVED_SPACE(dev);
2106 tlen = dev->needed_tailroom;
69e3c75f 2107 skb = sock_alloc_send_skb(&po->sk,
ae641949 2108 hlen + tlen + sizeof(struct sockaddr_ll),
69e3c75f
JB
2109 0, &err);
2110
2111 if (unlikely(skb == NULL))
2112 goto out_status;
2113
2114 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
ae641949 2115 addr, hlen);
69e3c75f
JB
2116
2117 if (unlikely(tp_len < 0)) {
2118 if (po->tp_loss) {
2119 __packet_set_status(po, ph,
2120 TP_STATUS_AVAILABLE);
2121 packet_increment_head(&po->tx_ring);
2122 kfree_skb(skb);
2123 continue;
2124 } else {
2125 status = TP_STATUS_WRONG_FORMAT;
2126 err = tp_len;
2127 goto out_status;
2128 }
2129 }
2130
2131 skb->destructor = tpacket_destruct_skb;
2132 __packet_set_status(po, ph, TP_STATUS_SENDING);
2133 atomic_inc(&po->tx_ring.pending);
2134
2135 status = TP_STATUS_SEND_REQUEST;
2136 err = dev_queue_xmit(skb);
eb70df13
JP
2137 if (unlikely(err > 0)) {
2138 err = net_xmit_errno(err);
2139 if (err && __packet_get_status(po, ph) ==
2140 TP_STATUS_AVAILABLE) {
2141 /* skb was destructed already */
2142 skb = NULL;
2143 goto out_status;
2144 }
2145 /*
2146 * skb was dropped but not destructed yet;
2147 * let's treat it like congestion or err < 0
2148 */
2149 err = 0;
2150 }
69e3c75f
JB
2151 packet_increment_head(&po->tx_ring);
2152 len_sum += tp_len;
f64f9e71
JP
2153 } while (likely((ph != NULL) ||
2154 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
2155 (atomic_read(&po->tx_ring.pending))))
2156 );
69e3c75f
JB
2157
2158 err = len_sum;
2159 goto out_put;
2160
69e3c75f
JB
2161out_status:
2162 __packet_set_status(po, ph, status);
2163 kfree_skb(skb);
2164out_put:
827d9780
BG
2165 if (need_rls_dev)
2166 dev_put(dev);
69e3c75f
JB
2167out:
2168 mutex_unlock(&po->pg_vec_lock);
2169 return err;
2170}
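/*
 * Editorial note: tpacket_snd() above consumes frames that user space has
 * marked TP_STATUS_SEND_REQUEST in a PACKET_TX_RING.  A minimal user-space
 * sketch of that flow follows (illustrative only, not part of this file);
 * it assumes a TPACKET_V2 TX ring set up and mmap()ed analogously to the RX
 * example earlier, and omits error handling.
 */
#if 0	/* user-space example only */
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void tx_ring_example(int fd, uint8_t *ring, struct tpacket_req *req,
			    const void *pkt, unsigned int len, unsigned int idx)
{
	struct tpacket2_hdr *hdr =
		(void *)(ring + (size_t)idx * req->tp_frame_size);
	/* Without PACKET_TX_HAS_OFF the data area starts right after the
	 * frame header, i.e. tp_hdrlen - sizeof(struct sockaddr_ll). */
	uint8_t *data = (uint8_t *)hdr + TPACKET2_HDRLEN -
			sizeof(struct sockaddr_ll);

	if (hdr->tp_status != TP_STATUS_AVAILABLE)
		return;				/* frame still owned by kernel */

	memcpy(data, pkt, len);			/* full L2 frame for SOCK_RAW */
	hdr->tp_len = len;
	hdr->tp_status = TP_STATUS_SEND_REQUEST;

	/* Kick the kernel; with MSG_DONTWAIT tpacket_snd() drains what it can. */
	send(fd, NULL, 0, MSG_DONTWAIT);
}
#endif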
69e3c75f 2171
eea49cc9
OJ
2172static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2173 size_t reserve, size_t len,
2174 size_t linear, int noblock,
2175 int *err)
bfd5f4a3
SS
2176{
2177 struct sk_buff *skb;
2178
2179 /* Under a page? Don't bother with paged skb. */
2180 if (prepad + len < PAGE_SIZE || !linear)
2181 linear = len;
2182
2183 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2184 err);
2185 if (!skb)
2186 return NULL;
2187
2188 skb_reserve(skb, reserve);
2189 skb_put(skb, linear);
2190 skb->data_len = len - linear;
2191 skb->len += len - linear;
2192
2193 return skb;
2194}
2195
69e3c75f 2196static int packet_snd(struct socket *sock,
1da177e4
LT
2197 struct msghdr *msg, size_t len)
2198{
2199 struct sock *sk = sock->sk;
40d4e3df 2200 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1da177e4
LT
2201 struct sk_buff *skb;
2202 struct net_device *dev;
0e11c91e 2203 __be16 proto;
827d9780 2204 bool need_rls_dev = false;
1da177e4 2205 unsigned char *addr;
827d9780 2206 int err, reserve = 0;
bfd5f4a3
SS
2207 struct virtio_net_hdr vnet_hdr = { 0 };
2208 int offset = 0;
2209 int vnet_hdr_len;
2210 struct packet_sock *po = pkt_sk(sk);
2211 unsigned short gso_type = 0;
ae641949 2212 int hlen, tlen;
3bdc0eba 2213 int extra_len = 0;
1da177e4
LT
2214
2215 /*
1ce4f28b 2216 * Get and verify the address.
1da177e4 2217 */
1ce4f28b 2218
1da177e4 2219 if (saddr == NULL) {
827d9780 2220 dev = po->prot_hook.dev;
1da177e4
LT
2221 proto = po->num;
2222 addr = NULL;
2223 } else {
2224 err = -EINVAL;
2225 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2226 goto out;
0fb375fb
EB
2227 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2228 goto out;
1da177e4
LT
2229 proto = saddr->sll_protocol;
2230 addr = saddr->sll_addr;
827d9780
BG
2231 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2232 need_rls_dev = true;
1da177e4
LT
2233 }
2234
1da177e4
LT
2235 err = -ENXIO;
2236 if (dev == NULL)
2237 goto out_unlock;
2238 if (sock->type == SOCK_RAW)
2239 reserve = dev->hard_header_len;
2240
d5e76b0a
DM
2241 err = -ENETDOWN;
2242 if (!(dev->flags & IFF_UP))
2243 goto out_unlock;
2244
bfd5f4a3
SS
2245 if (po->has_vnet_hdr) {
2246 vnet_hdr_len = sizeof(vnet_hdr);
2247
2248 err = -EINVAL;
2249 if (len < vnet_hdr_len)
2250 goto out_unlock;
2251
2252 len -= vnet_hdr_len;
2253
2254 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
2255 vnet_hdr_len);
2256 if (err < 0)
2257 goto out_unlock;
2258
2259 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2260 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
2261 vnet_hdr.hdr_len))
2262 vnet_hdr.hdr_len = vnet_hdr.csum_start +
2263 vnet_hdr.csum_offset + 2;
2264
2265 err = -EINVAL;
2266 if (vnet_hdr.hdr_len > len)
2267 goto out_unlock;
2268
2269 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2270 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2271 case VIRTIO_NET_HDR_GSO_TCPV4:
2272 gso_type = SKB_GSO_TCPV4;
2273 break;
2274 case VIRTIO_NET_HDR_GSO_TCPV6:
2275 gso_type = SKB_GSO_TCPV6;
2276 break;
2277 case VIRTIO_NET_HDR_GSO_UDP:
2278 gso_type = SKB_GSO_UDP;
2279 break;
2280 default:
2281 goto out_unlock;
2282 }
2283
2284 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2285 gso_type |= SKB_GSO_TCP_ECN;
2286
2287 if (vnet_hdr.gso_size == 0)
2288 goto out_unlock;
2289
2290 }
2291 }
2292
3bdc0eba
BG
2293 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2294 if (!netif_supports_nofcs(dev)) {
2295 err = -EPROTONOSUPPORT;
2296 goto out_unlock;
2297 }
2298 extra_len = 4; /* We're doing our own CRC */
2299 }
2300
1da177e4 2301 err = -EMSGSIZE;
3bdc0eba 2302 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2303 goto out_unlock;
2304
bfd5f4a3 2305 err = -ENOBUFS;
ae641949
HX
2306 hlen = LL_RESERVED_SPACE(dev);
2307 tlen = dev->needed_tailroom;
2308 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,
bfd5f4a3 2309 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2310 if (skb == NULL)
1da177e4
LT
2311 goto out_unlock;
2312
bfd5f4a3 2313 skb_set_network_header(skb, reserve);
1da177e4 2314
0c4e8581
SH
2315 err = -EINVAL;
2316 if (sock->type == SOCK_DGRAM &&
bfd5f4a3 2317 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
0c4e8581 2318 goto out_free;
1da177e4
LT
2319
2320 /* Returns -EFAULT on error */
bfd5f4a3 2321 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1da177e4
LT
2322 if (err)
2323 goto out_free;
bf84a010
DB
2324
2325 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1da177e4 2326
3bdc0eba 2327 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
57f89bfa
BG
2328 /* Earlier code assumed this would be a VLAN pkt,
2329 * double-check this now that we have the actual
2330 * packet in hand.
2331 */
2332 struct ethhdr *ehdr;
2333 skb_reset_mac_header(skb);
2334 ehdr = eth_hdr(skb);
2335 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2336 err = -EMSGSIZE;
2337 goto out_free;
2338 }
2339 }
2340
1da177e4
LT
2341 skb->protocol = proto;
2342 skb->dev = dev;
2343 skb->priority = sk->sk_priority;
2d37a186 2344 skb->mark = sk->sk_mark;
1da177e4 2345
bfd5f4a3
SS
2346 if (po->has_vnet_hdr) {
2347 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2348 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
2349 vnet_hdr.csum_offset)) {
2350 err = -EINVAL;
2351 goto out_free;
2352 }
2353 }
2354
2355 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
2356 skb_shinfo(skb)->gso_type = gso_type;
2357
2358 /* Header must be checked, and gso_segs computed. */
2359 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2360 skb_shinfo(skb)->gso_segs = 0;
2361
2362 len += vnet_hdr_len;
2363 }
2364
40893fd0 2365 skb_probe_transport_header(skb, reserve);
c1aad275 2366
3bdc0eba
BG
2367 if (unlikely(extra_len == 4))
2368 skb->no_fcs = 1;
2369
1da177e4
LT
2370 /*
2371 * Now send it
2372 */
2373
2374 err = dev_queue_xmit(skb);
2375 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2376 goto out_unlock;
2377
827d9780
BG
2378 if (need_rls_dev)
2379 dev_put(dev);
1da177e4 2380
40d4e3df 2381 return len;
1da177e4
LT
2382
2383out_free:
2384 kfree_skb(skb);
2385out_unlock:
827d9780 2386 if (dev && need_rls_dev)
1da177e4
LT
2387 dev_put(dev);
2388out:
2389 return err;
2390}
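/*
 * Editorial note: packet_snd() above is the non-mmap transmit path used by
 * ordinary sendto()/sendmsg() on a packet socket.  Illustrative user-space
 * sketch (not part of this file; "eth0" and the destination MAC are
 * placeholder values, error handling omitted):
 */
#if 0	/* user-space example only */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <net/if.h>
#include <arpa/inet.h>

static void dgram_send_example(const void *payload, size_t len)
{
	/* SOCK_DGRAM: the kernel builds the link-layer header via
	 * dev_hard_header(), so only the payload is supplied here. */
	int fd = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));
	unsigned char dst[ETH_ALEN] = { 0x02, 0, 0, 0, 0, 0x01 };
	struct sockaddr_ll sll = {
		.sll_family   = AF_PACKET,
		.sll_protocol = htons(ETH_P_IP),
		.sll_ifindex  = if_nametoindex("eth0"),
		.sll_halen    = ETH_ALEN,
	};

	memcpy(sll.sll_addr, dst, ETH_ALEN);
	sendto(fd, payload, len, 0, (struct sockaddr *)&sll, sizeof(sll));
}
#endif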
2391
69e3c75f
JB
2392static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2393 struct msghdr *msg, size_t len)
2394{
69e3c75f
JB
2395 struct sock *sk = sock->sk;
2396 struct packet_sock *po = pkt_sk(sk);
2397 if (po->tx_ring.pg_vec)
2398 return tpacket_snd(po, msg);
2399 else
69e3c75f
JB
2400 return packet_snd(sock, msg, len);
2401}
2402
1da177e4
LT
2403/*
2404 * Close a PACKET socket. This is fairly simple. We immediately go
2405 * to 'closed' state and remove our protocol entry in the device list.
2406 */
2407
2408static int packet_release(struct socket *sock)
2409{
2410 struct sock *sk = sock->sk;
2411 struct packet_sock *po;
d12d01d6 2412 struct net *net;
f6fb8f10 2413 union tpacket_req_u req_u;
1da177e4
LT
2414
2415 if (!sk)
2416 return 0;
2417
3b1e0a65 2418 net = sock_net(sk);
1da177e4
LT
2419 po = pkt_sk(sk);
2420
0fa7fa98 2421 mutex_lock(&net->packet.sklist_lock);
808f5114 2422 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2423 mutex_unlock(&net->packet.sklist_lock);
2424
2425 preempt_disable();
920de804 2426 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2427 preempt_enable();
1da177e4 2428
808f5114 2429 spin_lock(&po->bind_lock);
ce06b03e 2430 unregister_prot_hook(sk, false);
160ff18a
BG
2431 if (po->prot_hook.dev) {
2432 dev_put(po->prot_hook.dev);
2433 po->prot_hook.dev = NULL;
2434 }
808f5114 2435 spin_unlock(&po->bind_lock);
1da177e4 2436
1da177e4 2437 packet_flush_mclist(sk);
1da177e4 2438
9665d5d6
PS
2439 if (po->rx_ring.pg_vec) {
2440 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2441 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2442 }
69e3c75f 2443
9665d5d6
PS
2444 if (po->tx_ring.pg_vec) {
2445 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2446 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 2447 }
1da177e4 2448
dc99f600
DM
2449 fanout_release(sk);
2450
808f5114 2451 synchronize_net();
1da177e4
LT
2452 /*
2453 * Now the socket is dead. No more input will appear.
2454 */
1da177e4
LT
2455 sock_orphan(sk);
2456 sock->sk = NULL;
2457
2458 /* Purge queues */
2459
2460 skb_queue_purge(&sk->sk_receive_queue);
17ab56a2 2461 sk_refcnt_debug_release(sk);
1da177e4
LT
2462
2463 sock_put(sk);
2464 return 0;
2465}
2466
2467/*
2468 * Attach a packet hook.
2469 */
2470
0e11c91e 2471static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1da177e4
LT
2472{
2473 struct packet_sock *po = pkt_sk(sk);
dc99f600 2474
aef950b4
WY
2475 if (po->fanout) {
2476 if (dev)
2477 dev_put(dev);
2478
dc99f600 2479 return -EINVAL;
aef950b4 2480 }
1da177e4
LT
2481
2482 lock_sock(sk);
2483
2484 spin_lock(&po->bind_lock);
ce06b03e 2485 unregister_prot_hook(sk, true);
1da177e4
LT
2486 po->num = protocol;
2487 po->prot_hook.type = protocol;
160ff18a
BG
2488 if (po->prot_hook.dev)
2489 dev_put(po->prot_hook.dev);
1da177e4
LT
2490 po->prot_hook.dev = dev;
2491
2492 po->ifindex = dev ? dev->ifindex : 0;
2493
2494 if (protocol == 0)
2495 goto out_unlock;
2496
be85d4ad 2497 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2498 register_prot_hook(sk);
be85d4ad
UT
2499 } else {
2500 sk->sk_err = ENETDOWN;
2501 if (!sock_flag(sk, SOCK_DEAD))
2502 sk->sk_error_report(sk);
1da177e4
LT
2503 }
2504
2505out_unlock:
2506 spin_unlock(&po->bind_lock);
2507 release_sock(sk);
2508 return 0;
2509}
2510
2511/*
2512 * Bind a packet socket to a device
2513 */
2514
40d4e3df
ED
2515static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2516 int addr_len)
1da177e4 2517{
40d4e3df 2518 struct sock *sk = sock->sk;
1da177e4
LT
2519 char name[15];
2520 struct net_device *dev;
2521 int err = -ENODEV;
1ce4f28b 2522
1da177e4
LT
2523 /*
2524 * Check legality
2525 */
1ce4f28b 2526
8ae55f04 2527 if (addr_len != sizeof(struct sockaddr))
1da177e4 2528 return -EINVAL;
40d4e3df 2529 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2530
3b1e0a65 2531 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2532 if (dev)
1da177e4 2533 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2534 return err;
2535}
1da177e4
LT
2536
2537static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2538{
40d4e3df
ED
2539 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2540 struct sock *sk = sock->sk;
1da177e4
LT
2541 struct net_device *dev = NULL;
2542 int err;
2543
2544
2545 /*
2546 * Check legality
2547 */
1ce4f28b 2548
1da177e4
LT
2549 if (addr_len < sizeof(struct sockaddr_ll))
2550 return -EINVAL;
2551 if (sll->sll_family != AF_PACKET)
2552 return -EINVAL;
2553
2554 if (sll->sll_ifindex) {
2555 err = -ENODEV;
3b1e0a65 2556 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2557 if (dev == NULL)
2558 goto out;
2559 }
2560 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2561
2562out:
2563 return err;
2564}
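/*
 * Editorial note: packet_bind() above attaches the socket to one interface
 * and protocol.  A minimal user-space sketch (illustrative only; "eth0" is a
 * placeholder, error handling omitted):
 */
#if 0	/* user-space example only */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <net/if.h>
#include <arpa/inet.h>

static int bind_example(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);	/* 0 would keep the current one */
	sll.sll_ifindex  = if_nametoindex("eth0");

	return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}
#endif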
2565
2566static struct proto packet_proto = {
2567 .name = "PACKET",
2568 .owner = THIS_MODULE,
2569 .obj_size = sizeof(struct packet_sock),
2570};
2571
2572/*
1ce4f28b 2573 * Create a packet socket (SOCK_DGRAM, SOCK_RAW or the legacy SOCK_PACKET).
1da177e4
LT
2574 */
2575
3f378b68
EP
2576static int packet_create(struct net *net, struct socket *sock, int protocol,
2577 int kern)
1da177e4
LT
2578{
2579 struct sock *sk;
2580 struct packet_sock *po;
0e11c91e 2581 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2582 int err;
2583
df008c91 2584 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 2585 return -EPERM;
be02097c
DM
2586 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2587 sock->type != SOCK_PACKET)
1da177e4
LT
2588 return -ESOCKTNOSUPPORT;
2589
2590 sock->state = SS_UNCONNECTED;
2591
2592 err = -ENOBUFS;
6257ff21 2593 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1da177e4
LT
2594 if (sk == NULL)
2595 goto out;
2596
2597 sock->ops = &packet_ops;
1da177e4
LT
2598 if (sock->type == SOCK_PACKET)
2599 sock->ops = &packet_ops_spkt;
be02097c 2600
1da177e4
LT
2601 sock_init_data(sock, sk);
2602
2603 po = pkt_sk(sk);
2604 sk->sk_family = PF_PACKET;
0e11c91e 2605 po->num = proto;
1da177e4
LT
2606
2607 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2608 sk_refcnt_debug_inc(sk);
1da177e4
LT
2609
2610 /*
2611 * Attach a protocol block
2612 */
2613
2614 spin_lock_init(&po->bind_lock);
905db440 2615 mutex_init(&po->pg_vec_lock);
1da177e4 2616 po->prot_hook.func = packet_rcv;
be02097c 2617
1da177e4
LT
2618 if (sock->type == SOCK_PACKET)
2619 po->prot_hook.func = packet_rcv_spkt;
be02097c 2620
1da177e4
LT
2621 po->prot_hook.af_packet_priv = sk;
2622
0e11c91e
AV
2623 if (proto) {
2624 po->prot_hook.type = proto;
ce06b03e 2625 register_prot_hook(sk);
1da177e4
LT
2626 }
2627
0fa7fa98 2628 mutex_lock(&net->packet.sklist_lock);
808f5114 2629 sk_add_node_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
2630 mutex_unlock(&net->packet.sklist_lock);
2631
2632 preempt_disable();
3680453c 2633 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 2634 preempt_enable();
808f5114 2635
40d4e3df 2636 return 0;
1da177e4
LT
2637out:
2638 return err;
2639}
2640
ed85b565
RC
2641static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
2642{
2643 struct sock_exterr_skb *serr;
2644 struct sk_buff *skb, *skb2;
2645 int copied, err;
2646
2647 err = -EAGAIN;
2648 skb = skb_dequeue(&sk->sk_error_queue);
2649 if (skb == NULL)
2650 goto out;
2651
2652 copied = skb->len;
2653 if (copied > len) {
2654 msg->msg_flags |= MSG_TRUNC;
2655 copied = len;
2656 }
2657 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2658 if (err)
2659 goto out_free_skb;
2660
2661 sock_recv_timestamp(msg, sk, skb);
2662
2663 serr = SKB_EXT_ERR(skb);
2664 put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
2665 sizeof(serr->ee), &serr->ee);
2666
2667 msg->msg_flags |= MSG_ERRQUEUE;
2668 err = copied;
2669
2670 /* Reset and regenerate socket error */
2671 spin_lock_bh(&sk->sk_error_queue.lock);
2672 sk->sk_err = 0;
2673 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2674 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2675 spin_unlock_bh(&sk->sk_error_queue.lock);
2676 sk->sk_error_report(sk);
2677 } else
2678 spin_unlock_bh(&sk->sk_error_queue.lock);
2679
2680out_free_skb:
2681 kfree_skb(skb);
2682out:
2683 return err;
2684}
2685
1da177e4
LT
2686/*
2687 * Pull a packet from our receive queue and hand it to the user.
2688 * If necessary we block.
2689 */
2690
2691static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2692 struct msghdr *msg, size_t len, int flags)
2693{
2694 struct sock *sk = sock->sk;
2695 struct sk_buff *skb;
2696 int copied, err;
0fb375fb 2697 struct sockaddr_ll *sll;
bfd5f4a3 2698 int vnet_hdr_len = 0;
1da177e4
LT
2699
2700 err = -EINVAL;
ed85b565 2701 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2702 goto out;
2703
2704#if 0
2705 /* What error should we return now? EUNATTACH? */
2706 if (pkt_sk(sk)->ifindex < 0)
2707 return -ENODEV;
2708#endif
2709
ed85b565
RC
2710 if (flags & MSG_ERRQUEUE) {
2711 err = packet_recv_error(sk, msg, len);
2712 goto out;
2713 }
2714
1da177e4
LT
2715 /*
2716 * Call the generic datagram receiver. This handles all sorts
2717 * of horrible races and re-entrancy so we can forget about it
2718 * in the protocol layers.
2719 *
2720 * Now it will return ENETDOWN if the device has just gone down,
2721 * but then it will block.
2722 */
2723
40d4e3df 2724 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
2725
2726 /*
1ce4f28b 2727 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
2728 * handles the blocking, we don't need to see or worry about
2729 * blocking retries.
2730 */
2731
8ae55f04 2732 if (skb == NULL)
1da177e4
LT
2733 goto out;
2734
bfd5f4a3
SS
2735 if (pkt_sk(sk)->has_vnet_hdr) {
2736 struct virtio_net_hdr vnet_hdr = { 0 };
2737
2738 err = -EINVAL;
2739 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 2740 if (len < vnet_hdr_len)
bfd5f4a3
SS
2741 goto out_free;
2742
1f18b717
MK
2743 len -= vnet_hdr_len;
2744
bfd5f4a3
SS
2745 if (skb_is_gso(skb)) {
2746 struct skb_shared_info *sinfo = skb_shinfo(skb);
2747
2748 /* This is a hint as to how much should be linear. */
2749 vnet_hdr.hdr_len = skb_headlen(skb);
2750 vnet_hdr.gso_size = sinfo->gso_size;
2751 if (sinfo->gso_type & SKB_GSO_TCPV4)
2752 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2753 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2754 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2755 else if (sinfo->gso_type & SKB_GSO_UDP)
2756 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2757 else if (sinfo->gso_type & SKB_GSO_FCOE)
2758 goto out_free;
2759 else
2760 BUG();
2761 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2762 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2763 } else
2764 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2765
2766 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2767 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
55508d60 2768 vnet_hdr.csum_start = skb_checksum_start_offset(skb);
bfd5f4a3 2769 vnet_hdr.csum_offset = skb->csum_offset;
10a8d94a
JW
2770 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2771 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
2772 } /* else everything is zero */
2773
2774 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
2775 vnet_hdr_len);
2776 if (err < 0)
2777 goto out_free;
2778 }
2779
0fb375fb
EB
2780 /*
2781 * If the address length field is there to be filled in, we fill
2782 * it in now.
2783 */
2784
ffbc6111 2785 sll = &PACKET_SKB_CB(skb)->sa.ll;
0fb375fb
EB
2786 if (sock->type == SOCK_PACKET)
2787 msg->msg_namelen = sizeof(struct sockaddr_pkt);
2788 else
2789 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
2790
1da177e4
LT
2791 /*
2792 * You lose any data beyond the buffer you gave. If it worries a
2793 * user program they can ask the device for its MTU anyway.
2794 */
2795
2796 copied = skb->len;
40d4e3df
ED
2797 if (copied > len) {
2798 copied = len;
2799 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
2800 }
2801
2802 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2803 if (err)
2804 goto out_free;
2805
3b885787 2806 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4
LT
2807
2808 if (msg->msg_name)
ffbc6111
HX
2809 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
2810 msg->msg_namelen);
1da177e4 2811
8dc41944 2812 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
2813 struct tpacket_auxdata aux;
2814
2815 aux.tp_status = TP_STATUS_USER;
2816 if (skb->ip_summed == CHECKSUM_PARTIAL)
2817 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
2818 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
2819 aux.tp_snaplen = skb->len;
2820 aux.tp_mac = 0;
bbe735e4 2821 aux.tp_net = skb_network_offset(skb);
a3bcc23e
BG
2822 if (vlan_tx_tag_present(skb)) {
2823 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
2824 aux.tp_status |= TP_STATUS_VLAN_VALID;
2825 } else {
2826 aux.tp_vlan_tci = 0;
2827 }
13fcb7bd 2828 aux.tp_padding = 0;
ffbc6111 2829 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
2830 }
2831
1da177e4
LT
2832 /*
2833 * Free or return the buffer as appropriate. Again this
2834 * hides all the races and re-entrancy issues from us.
2835 */
bfd5f4a3 2836 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
2837
2838out_free:
2839 skb_free_datagram(sk, skb);
2840out:
2841 return err;
2842}
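/*
 * Editorial note: when PACKET_AUXDATA is enabled, packet_recvmsg() above
 * attaches a struct tpacket_auxdata control message to every packet.  An
 * illustrative user-space sketch of reading it (not part of this file,
 * error handling omitted):
 */
#if 0	/* user-space example only */
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/if_packet.h>

static void auxdata_example(int fd, void *buf, size_t buflen)
{
	int one = 1;
	char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
	struct iovec iov = { .iov_base = buf, .iov_len = buflen };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg;

	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
	recvmsg(fd, &msg, 0);

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_PACKET &&
		    cmsg->cmsg_type == PACKET_AUXDATA) {
			struct tpacket_auxdata aux;

			memcpy(&aux, CMSG_DATA(cmsg), sizeof(aux));
			/* aux.tp_vlan_tci is valid when TP_STATUS_VLAN_VALID
			 * is set in aux.tp_status (see the code above). */
		}
	}
}
#endif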
2843
1da177e4
LT
2844static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
2845 int *uaddr_len, int peer)
2846{
2847 struct net_device *dev;
2848 struct sock *sk = sock->sk;
2849
2850 if (peer)
2851 return -EOPNOTSUPP;
2852
2853 uaddr->sa_family = AF_PACKET;
654d1f8a
ED
2854 rcu_read_lock();
2855 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
2856 if (dev)
67286640 2857 strncpy(uaddr->sa_data, dev->name, 14);
654d1f8a 2858 else
1da177e4 2859 memset(uaddr->sa_data, 0, 14);
654d1f8a 2860 rcu_read_unlock();
1da177e4
LT
2861 *uaddr_len = sizeof(*uaddr);
2862
2863 return 0;
2864}
1da177e4
LT
2865
2866static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
2867 int *uaddr_len, int peer)
2868{
2869 struct net_device *dev;
2870 struct sock *sk = sock->sk;
2871 struct packet_sock *po = pkt_sk(sk);
13cfa97b 2872 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
2873
2874 if (peer)
2875 return -EOPNOTSUPP;
2876
2877 sll->sll_family = AF_PACKET;
2878 sll->sll_ifindex = po->ifindex;
2879 sll->sll_protocol = po->num;
67286640 2880 sll->sll_pkttype = 0;
654d1f8a
ED
2881 rcu_read_lock();
2882 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
2883 if (dev) {
2884 sll->sll_hatype = dev->type;
2885 sll->sll_halen = dev->addr_len;
2886 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
2887 } else {
2888 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
2889 sll->sll_halen = 0;
2890 }
654d1f8a 2891 rcu_read_unlock();
0fb375fb 2892 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
2893
2894 return 0;
2895}
2896
2aeb0b88
WC
2897static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
2898 int what)
1da177e4
LT
2899{
2900 switch (i->type) {
2901 case PACKET_MR_MULTICAST:
1162563f
JP
2902 if (i->alen != dev->addr_len)
2903 return -EINVAL;
1da177e4 2904 if (what > 0)
22bedad3 2905 return dev_mc_add(dev, i->addr);
1da177e4 2906 else
22bedad3 2907 return dev_mc_del(dev, i->addr);
1da177e4
LT
2908 break;
2909 case PACKET_MR_PROMISC:
2aeb0b88 2910 return dev_set_promiscuity(dev, what);
1da177e4
LT
2911 break;
2912 case PACKET_MR_ALLMULTI:
2aeb0b88 2913 return dev_set_allmulti(dev, what);
1da177e4 2914 break;
d95ed927 2915 case PACKET_MR_UNICAST:
1162563f
JP
2916 if (i->alen != dev->addr_len)
2917 return -EINVAL;
d95ed927 2918 if (what > 0)
a748ee24 2919 return dev_uc_add(dev, i->addr);
d95ed927 2920 else
a748ee24 2921 return dev_uc_del(dev, i->addr);
d95ed927 2922 break;
40d4e3df
ED
2923 default:
2924 break;
1da177e4 2925 }
2aeb0b88 2926 return 0;
1da177e4
LT
2927}
2928
2929static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
2930{
40d4e3df 2931 for ( ; i; i = i->next) {
1da177e4
LT
2932 if (i->ifindex == dev->ifindex)
2933 packet_dev_mc(dev, i, what);
2934 }
2935}
2936
0fb375fb 2937static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2938{
2939 struct packet_sock *po = pkt_sk(sk);
2940 struct packet_mclist *ml, *i;
2941 struct net_device *dev;
2942 int err;
2943
2944 rtnl_lock();
2945
2946 err = -ENODEV;
3b1e0a65 2947 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
2948 if (!dev)
2949 goto done;
2950
2951 err = -EINVAL;
1162563f 2952 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
2953 goto done;
2954
2955 err = -ENOBUFS;
8b3a7005 2956 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
2957 if (i == NULL)
2958 goto done;
2959
2960 err = 0;
2961 for (ml = po->mclist; ml; ml = ml->next) {
2962 if (ml->ifindex == mreq->mr_ifindex &&
2963 ml->type == mreq->mr_type &&
2964 ml->alen == mreq->mr_alen &&
2965 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
2966 ml->count++;
2967 /* Free the new element ... */
2968 kfree(i);
2969 goto done;
2970 }
2971 }
2972
2973 i->type = mreq->mr_type;
2974 i->ifindex = mreq->mr_ifindex;
2975 i->alen = mreq->mr_alen;
2976 memcpy(i->addr, mreq->mr_address, i->alen);
2977 i->count = 1;
2978 i->next = po->mclist;
2979 po->mclist = i;
2aeb0b88
WC
2980 err = packet_dev_mc(dev, i, 1);
2981 if (err) {
2982 po->mclist = i->next;
2983 kfree(i);
2984 }
1da177e4
LT
2985
2986done:
2987 rtnl_unlock();
2988 return err;
2989}
2990
0fb375fb 2991static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2992{
2993 struct packet_mclist *ml, **mlp;
2994
2995 rtnl_lock();
2996
2997 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
2998 if (ml->ifindex == mreq->mr_ifindex &&
2999 ml->type == mreq->mr_type &&
3000 ml->alen == mreq->mr_alen &&
3001 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3002 if (--ml->count == 0) {
3003 struct net_device *dev;
3004 *mlp = ml->next;
ad959e76
ED
3005 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3006 if (dev)
1da177e4 3007 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3008 kfree(ml);
3009 }
3010 rtnl_unlock();
3011 return 0;
3012 }
3013 }
3014 rtnl_unlock();
3015 return -EADDRNOTAVAIL;
3016}
3017
3018static void packet_flush_mclist(struct sock *sk)
3019{
3020 struct packet_sock *po = pkt_sk(sk);
3021 struct packet_mclist *ml;
3022
3023 if (!po->mclist)
3024 return;
3025
3026 rtnl_lock();
3027 while ((ml = po->mclist) != NULL) {
3028 struct net_device *dev;
3029
3030 po->mclist = ml->next;
ad959e76
ED
3031 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3032 if (dev != NULL)
1da177e4 3033 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3034 kfree(ml);
3035 }
3036 rtnl_unlock();
3037}
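/*
 * Editorial note: the membership helpers above back the
 * PACKET_ADD_MEMBERSHIP / PACKET_DROP_MEMBERSHIP socket options.  A short
 * user-space sketch enabling promiscuous mode on one interface
 * (illustrative only; "eth0" is a placeholder):
 */
#if 0	/* user-space example only */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <net/if.h>

static int promisc_example(int fd)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = if_nametoindex("eth0");
	mreq.mr_type    = PACKET_MR_PROMISC;	/* no hardware address needed */

	/* Reference-counted per socket; dropped automatically on close(). */
	return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
			  &mreq, sizeof(mreq));
}
#endif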
1da177e4
LT
3038
3039static int
b7058842 3040packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3041{
3042 struct sock *sk = sock->sk;
8dc41944 3043 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3044 int ret;
3045
3046 if (level != SOL_PACKET)
3047 return -ENOPROTOOPT;
3048
69e3c75f 3049 switch (optname) {
1ce4f28b 3050 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3051 case PACKET_DROP_MEMBERSHIP:
3052 {
0fb375fb
EB
3053 struct packet_mreq_max mreq;
3054 int len = optlen;
3055 memset(&mreq, 0, sizeof(mreq));
3056 if (len < sizeof(struct packet_mreq))
1da177e4 3057 return -EINVAL;
0fb375fb
EB
3058 if (len > sizeof(mreq))
3059 len = sizeof(mreq);
40d4e3df 3060 if (copy_from_user(&mreq, optval, len))
1da177e4 3061 return -EFAULT;
0fb375fb
EB
3062 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3063 return -EINVAL;
1da177e4
LT
3064 if (optname == PACKET_ADD_MEMBERSHIP)
3065 ret = packet_mc_add(sk, &mreq);
3066 else
3067 ret = packet_mc_drop(sk, &mreq);
3068 return ret;
3069 }
a2efcfa0 3070
1da177e4 3071 case PACKET_RX_RING:
69e3c75f 3072 case PACKET_TX_RING:
1da177e4 3073 {
f6fb8f10 3074 union tpacket_req_u req_u;
3075 int len;
1da177e4 3076
f6fb8f10 3077 switch (po->tp_version) {
3078 case TPACKET_V1:
3079 case TPACKET_V2:
3080 len = sizeof(req_u.req);
3081 break;
3082 case TPACKET_V3:
3083 default:
3084 len = sizeof(req_u.req3);
3085 break;
3086 }
3087 if (optlen < len)
1da177e4 3088 return -EINVAL;
bfd5f4a3
SS
3089 if (pkt_sk(sk)->has_vnet_hdr)
3090 return -EINVAL;
f6fb8f10 3091 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3092 return -EFAULT;
f6fb8f10 3093 return packet_set_ring(sk, &req_u, 0,
3094 optname == PACKET_TX_RING);
1da177e4
LT
3095 }
3096 case PACKET_COPY_THRESH:
3097 {
3098 int val;
3099
40d4e3df 3100 if (optlen != sizeof(val))
1da177e4 3101 return -EINVAL;
40d4e3df 3102 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3103 return -EFAULT;
3104
3105 pkt_sk(sk)->copy_thresh = val;
3106 return 0;
3107 }
bbd6ef87
PM
3108 case PACKET_VERSION:
3109 {
3110 int val;
3111
3112 if (optlen != sizeof(val))
3113 return -EINVAL;
69e3c75f 3114 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3115 return -EBUSY;
3116 if (copy_from_user(&val, optval, sizeof(val)))
3117 return -EFAULT;
3118 switch (val) {
3119 case TPACKET_V1:
3120 case TPACKET_V2:
f6fb8f10 3121 case TPACKET_V3:
bbd6ef87
PM
3122 po->tp_version = val;
3123 return 0;
3124 default:
3125 return -EINVAL;
3126 }
3127 }
8913336a
PM
3128 case PACKET_RESERVE:
3129 {
3130 unsigned int val;
3131
3132 if (optlen != sizeof(val))
3133 return -EINVAL;
69e3c75f 3134 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3135 return -EBUSY;
3136 if (copy_from_user(&val, optval, sizeof(val)))
3137 return -EFAULT;
3138 po->tp_reserve = val;
3139 return 0;
3140 }
69e3c75f
JB
3141 case PACKET_LOSS:
3142 {
3143 unsigned int val;
3144
3145 if (optlen != sizeof(val))
3146 return -EINVAL;
3147 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3148 return -EBUSY;
3149 if (copy_from_user(&val, optval, sizeof(val)))
3150 return -EFAULT;
3151 po->tp_loss = !!val;
3152 return 0;
3153 }
8dc41944
HX
3154 case PACKET_AUXDATA:
3155 {
3156 int val;
3157
3158 if (optlen < sizeof(val))
3159 return -EINVAL;
3160 if (copy_from_user(&val, optval, sizeof(val)))
3161 return -EFAULT;
3162
3163 po->auxdata = !!val;
3164 return 0;
3165 }
80feaacb
PWJ
3166 case PACKET_ORIGDEV:
3167 {
3168 int val;
3169
3170 if (optlen < sizeof(val))
3171 return -EINVAL;
3172 if (copy_from_user(&val, optval, sizeof(val)))
3173 return -EFAULT;
3174
3175 po->origdev = !!val;
3176 return 0;
3177 }
bfd5f4a3
SS
3178 case PACKET_VNET_HDR:
3179 {
3180 int val;
3181
3182 if (sock->type != SOCK_RAW)
3183 return -EINVAL;
3184 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3185 return -EBUSY;
3186 if (optlen < sizeof(val))
3187 return -EINVAL;
3188 if (copy_from_user(&val, optval, sizeof(val)))
3189 return -EFAULT;
3190
3191 po->has_vnet_hdr = !!val;
3192 return 0;
3193 }
614f60fa
SM
3194 case PACKET_TIMESTAMP:
3195 {
3196 int val;
3197
3198 if (optlen != sizeof(val))
3199 return -EINVAL;
3200 if (copy_from_user(&val, optval, sizeof(val)))
3201 return -EFAULT;
3202
3203 po->tp_tstamp = val;
3204 return 0;
3205 }
dc99f600
DM
3206 case PACKET_FANOUT:
3207 {
3208 int val;
3209
3210 if (optlen != sizeof(val))
3211 return -EINVAL;
3212 if (copy_from_user(&val, optval, sizeof(val)))
3213 return -EFAULT;
3214
3215 return fanout_add(sk, val & 0xffff, val >> 16);
3216 }
5920cd3a
PC
3217 case PACKET_TX_HAS_OFF:
3218 {
3219 unsigned int val;
3220
3221 if (optlen != sizeof(val))
3222 return -EINVAL;
3223 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3224 return -EBUSY;
3225 if (copy_from_user(&val, optval, sizeof(val)))
3226 return -EFAULT;
3227 po->tp_tx_has_off = !!val;
3228 return 0;
3229 }
1da177e4
LT
3230 default:
3231 return -ENOPROTOOPT;
3232 }
3233}
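/*
 * Editorial note: PACKET_FANOUT above packs a 16-bit group id and the fanout
 * mode into one int (id | (type << 16)).  Illustrative user-space sketch
 * (not part of this file; group id 42 is an arbitrary example):
 */
#if 0	/* user-space example only */
#include <sys/socket.h>
#include <linux/if_packet.h>

static int fanout_example(int fd)
{
	int fanout_arg = 42 | (PACKET_FANOUT_HASH << 16);

	/* Every bound socket that joins group 42 with the same mode gets a
	 * share of the traffic, load-balanced by flow hash. */
	return setsockopt(fd, SOL_PACKET, PACKET_FANOUT,
			  &fanout_arg, sizeof(fanout_arg));
}
#endif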
3234
3235static int packet_getsockopt(struct socket *sock, int level, int optname,
3236 char __user *optval, int __user *optlen)
3237{
3238 int len;
c06fff6e 3239 int val, lv = sizeof(val);
1da177e4
LT
3240 struct sock *sk = sock->sk;
3241 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3242 void *data = &val;
ee80fbf3 3243 union tpacket_stats_u st;
1da177e4
LT
3244
3245 if (level != SOL_PACKET)
3246 return -ENOPROTOOPT;
3247
8ae55f04
KK
3248 if (get_user(len, optlen))
3249 return -EFAULT;
1da177e4
LT
3250
3251 if (len < 0)
3252 return -EINVAL;
1ce4f28b 3253
69e3c75f 3254 switch (optname) {
1da177e4 3255 case PACKET_STATISTICS:
1da177e4 3256 spin_lock_bh(&sk->sk_receive_queue.lock);
ee80fbf3
DB
3257 memcpy(&st, &po->stats, sizeof(st));
3258 memset(&po->stats, 0, sizeof(po->stats));
3259 spin_unlock_bh(&sk->sk_receive_queue.lock);
3260
f6fb8f10 3261 if (po->tp_version == TPACKET_V3) {
c06fff6e 3262 lv = sizeof(struct tpacket_stats_v3);
ee80fbf3 3263 data = &st.stats3;
f6fb8f10 3264 } else {
c06fff6e 3265 lv = sizeof(struct tpacket_stats);
ee80fbf3 3266 data = &st.stats1;
f6fb8f10 3267 }
ee80fbf3 3268
8dc41944
HX
3269 break;
3270 case PACKET_AUXDATA:
8dc41944 3271 val = po->auxdata;
80feaacb
PWJ
3272 break;
3273 case PACKET_ORIGDEV:
80feaacb 3274 val = po->origdev;
bfd5f4a3
SS
3275 break;
3276 case PACKET_VNET_HDR:
bfd5f4a3 3277 val = po->has_vnet_hdr;
1da177e4 3278 break;
bbd6ef87 3279 case PACKET_VERSION:
bbd6ef87 3280 val = po->tp_version;
bbd6ef87
PM
3281 break;
3282 case PACKET_HDRLEN:
3283 if (len > sizeof(int))
3284 len = sizeof(int);
3285 if (copy_from_user(&val, optval, len))
3286 return -EFAULT;
3287 switch (val) {
3288 case TPACKET_V1:
3289 val = sizeof(struct tpacket_hdr);
3290 break;
3291 case TPACKET_V2:
3292 val = sizeof(struct tpacket2_hdr);
3293 break;
f6fb8f10 3294 case TPACKET_V3:
3295 val = sizeof(struct tpacket3_hdr);
3296 break;
bbd6ef87
PM
3297 default:
3298 return -EINVAL;
3299 }
bbd6ef87 3300 break;
8913336a 3301 case PACKET_RESERVE:
8913336a 3302 val = po->tp_reserve;
8913336a 3303 break;
69e3c75f 3304 case PACKET_LOSS:
69e3c75f 3305 val = po->tp_loss;
69e3c75f 3306 break;
614f60fa 3307 case PACKET_TIMESTAMP:
614f60fa 3308 val = po->tp_tstamp;
614f60fa 3309 break;
dc99f600 3310 case PACKET_FANOUT:
dc99f600
DM
3311 val = (po->fanout ?
3312 ((u32)po->fanout->id |
77f65ebd
WB
3313 ((u32)po->fanout->type << 16) |
3314 ((u32)po->fanout->flags << 24)) :
dc99f600 3315 0);
dc99f600 3316 break;
5920cd3a
PC
3317 case PACKET_TX_HAS_OFF:
3318 val = po->tp_tx_has_off;
3319 break;
1da177e4
LT
3320 default:
3321 return -ENOPROTOOPT;
3322 }
3323
c06fff6e
ED
3324 if (len > lv)
3325 len = lv;
8ae55f04
KK
3326 if (put_user(len, optlen))
3327 return -EFAULT;
8dc41944
HX
3328 if (copy_to_user(optval, data, len))
3329 return -EFAULT;
8ae55f04 3330 return 0;
1da177e4
LT
3331}
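/*
 * Editorial note: PACKET_STATISTICS above is clear-on-read; the counters are
 * zeroed under the receive-queue lock each time they are fetched.
 * Illustrative user-space sketch for a TPACKET_V1/V2 socket (not part of
 * this file):
 */
#if 0	/* user-space example only */
#include <sys/socket.h>
#include <linux/if_packet.h>

static void stats_example(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0) {
		/* st.tp_packets counts frames delivered to this socket,
		 * st.tp_drops those lost because the ring/rcvbuf was full. */
	}
}
#endif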
3332
3333
3334static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
3335{
3336 struct sock *sk;
ad930650 3337 struct net_device *dev = data;
c346dca1 3338 struct net *net = dev_net(dev);
1da177e4 3339
808f5114 3340 rcu_read_lock();
b67bfe0d 3341 sk_for_each_rcu(sk, &net->packet.sklist) {
1da177e4
LT
3342 struct packet_sock *po = pkt_sk(sk);
3343
3344 switch (msg) {
3345 case NETDEV_UNREGISTER:
1da177e4
LT
3346 if (po->mclist)
3347 packet_dev_mclist(dev, po->mclist, -1);
a2efcfa0
DM
3348 /* fallthrough */
3349
1da177e4
LT
3350 case NETDEV_DOWN:
3351 if (dev->ifindex == po->ifindex) {
3352 spin_lock(&po->bind_lock);
3353 if (po->running) {
ce06b03e 3354 __unregister_prot_hook(sk, false);
1da177e4
LT
3355 sk->sk_err = ENETDOWN;
3356 if (!sock_flag(sk, SOCK_DEAD))
3357 sk->sk_error_report(sk);
3358 }
3359 if (msg == NETDEV_UNREGISTER) {
3360 po->ifindex = -1;
160ff18a
BG
3361 if (po->prot_hook.dev)
3362 dev_put(po->prot_hook.dev);
1da177e4
LT
3363 po->prot_hook.dev = NULL;
3364 }
3365 spin_unlock(&po->bind_lock);
3366 }
3367 break;
3368 case NETDEV_UP:
808f5114 3369 if (dev->ifindex == po->ifindex) {
3370 spin_lock(&po->bind_lock);
ce06b03e
DM
3371 if (po->num)
3372 register_prot_hook(sk);
808f5114 3373 spin_unlock(&po->bind_lock);
1da177e4 3374 }
1da177e4
LT
3375 break;
3376 }
3377 }
808f5114 3378 rcu_read_unlock();
1da177e4
LT
3379 return NOTIFY_DONE;
3380}
3381
3382
3383static int packet_ioctl(struct socket *sock, unsigned int cmd,
3384 unsigned long arg)
3385{
3386 struct sock *sk = sock->sk;
3387
69e3c75f 3388 switch (cmd) {
40d4e3df
ED
3389 case SIOCOUTQ:
3390 {
3391 int amount = sk_wmem_alloc_get(sk);
31e6d363 3392
40d4e3df
ED
3393 return put_user(amount, (int __user *)arg);
3394 }
3395 case SIOCINQ:
3396 {
3397 struct sk_buff *skb;
3398 int amount = 0;
3399
3400 spin_lock_bh(&sk->sk_receive_queue.lock);
3401 skb = skb_peek(&sk->sk_receive_queue);
3402 if (skb)
3403 amount = skb->len;
3404 spin_unlock_bh(&sk->sk_receive_queue.lock);
3405 return put_user(amount, (int __user *)arg);
3406 }
3407 case SIOCGSTAMP:
3408 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3409 case SIOCGSTAMPNS:
3410 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3411
1da177e4 3412#ifdef CONFIG_INET
40d4e3df
ED
3413 case SIOCADDRT:
3414 case SIOCDELRT:
3415 case SIOCDARP:
3416 case SIOCGARP:
3417 case SIOCSARP:
3418 case SIOCGIFADDR:
3419 case SIOCSIFADDR:
3420 case SIOCGIFBRDADDR:
3421 case SIOCSIFBRDADDR:
3422 case SIOCGIFNETMASK:
3423 case SIOCSIFNETMASK:
3424 case SIOCGIFDSTADDR:
3425 case SIOCSIFDSTADDR:
3426 case SIOCSIFFLAGS:
40d4e3df 3427 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3428#endif
3429
40d4e3df
ED
3430 default:
3431 return -ENOIOCTLCMD;
1da177e4
LT
3432 }
3433 return 0;
3434}
3435
40d4e3df 3436static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3437 poll_table *wait)
3438{
3439 struct sock *sk = sock->sk;
3440 struct packet_sock *po = pkt_sk(sk);
3441 unsigned int mask = datagram_poll(file, sock, wait);
3442
3443 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3444 if (po->rx_ring.pg_vec) {
f6fb8f10 3445 if (!packet_previous_rx_frame(po, &po->rx_ring,
3446 TP_STATUS_KERNEL))
1da177e4
LT
3447 mask |= POLLIN | POLLRDNORM;
3448 }
3449 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3450 spin_lock_bh(&sk->sk_write_queue.lock);
3451 if (po->tx_ring.pg_vec) {
3452 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3453 mask |= POLLOUT | POLLWRNORM;
3454 }
3455 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3456 return mask;
3457}
3458
3459
3460/* Dirty? Well, I still have not found a better way to account
3461 * for user mmaps.
3462 */
3463
3464static void packet_mm_open(struct vm_area_struct *vma)
3465{
3466 struct file *file = vma->vm_file;
40d4e3df 3467 struct socket *sock = file->private_data;
1da177e4 3468 struct sock *sk = sock->sk;
1ce4f28b 3469
1da177e4
LT
3470 if (sk)
3471 atomic_inc(&pkt_sk(sk)->mapped);
3472}
3473
3474static void packet_mm_close(struct vm_area_struct *vma)
3475{
3476 struct file *file = vma->vm_file;
40d4e3df 3477 struct socket *sock = file->private_data;
1da177e4 3478 struct sock *sk = sock->sk;
1ce4f28b 3479
1da177e4
LT
3480 if (sk)
3481 atomic_dec(&pkt_sk(sk)->mapped);
3482}
3483
f0f37e2f 3484static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3485 .open = packet_mm_open,
3486 .close = packet_mm_close,
1da177e4
LT
3487};
3488
0e3125c7
NH
3489static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3490 unsigned int len)
1da177e4
LT
3491{
3492 int i;
3493
4ebf0ae2 3494 for (i = 0; i < len; i++) {
0e3125c7 3495 if (likely(pg_vec[i].buffer)) {
c56b4d90 3496 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3497 vfree(pg_vec[i].buffer);
3498 else
3499 free_pages((unsigned long)pg_vec[i].buffer,
3500 order);
3501 pg_vec[i].buffer = NULL;
3502 }
1da177e4
LT
3503 }
3504 kfree(pg_vec);
3505}
3506
eea49cc9 3507static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3508{
0e3125c7
NH
3509 char *buffer = NULL;
3510 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3511 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3512
3513 buffer = (char *) __get_free_pages(gfp_flags, order);
3514
3515 if (buffer)
3516 return buffer;
3517
3518 /*
3519 * __get_free_pages failed, fall back to vmalloc
3520 */
bbce5a59 3521 buffer = vzalloc((1 << order) * PAGE_SIZE);
719bfeaa 3522
0e3125c7
NH
3523 if (buffer)
3524 return buffer;
3525
3526 /*
3527 * vmalloc failed, let's dig into swap here
3528 */
0e3125c7
NH
3529 gfp_flags &= ~__GFP_NORETRY;
3530 buffer = (char *)__get_free_pages(gfp_flags, order);
3531 if (buffer)
3532 return buffer;
3533
3534 /*
3535 * complete and utter failure
3536 */
3537 return NULL;
4ebf0ae2
DM
3538}
3539
0e3125c7 3540static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3541{
3542 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3543 struct pgv *pg_vec;
4ebf0ae2
DM
3544 int i;
3545
0e3125c7 3546 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3547 if (unlikely(!pg_vec))
3548 goto out;
3549
3550 for (i = 0; i < block_nr; i++) {
c56b4d90 3551 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3552 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3553 goto out_free_pgvec;
3554 }
3555
3556out:
3557 return pg_vec;
3558
3559out_free_pgvec:
3560 free_pg_vec(pg_vec, order, block_nr);
3561 pg_vec = NULL;
3562 goto out;
3563}
1da177e4 3564
f6fb8f10 3565static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 3566 int closing, int tx_ring)
1da177e4 3567{
0e3125c7 3568 struct pgv *pg_vec = NULL;
1da177e4 3569 struct packet_sock *po = pkt_sk(sk);
0e11c91e 3570 int was_running, order = 0;
69e3c75f
JB
3571 struct packet_ring_buffer *rb;
3572 struct sk_buff_head *rb_queue;
0e11c91e 3573 __be16 num;
f6fb8f10 3574 int err = -EINVAL;
3575 /* Added to avoid minimal code churn */
3576 struct tpacket_req *req = &req_u->req;
3577
3578 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3579 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3580 WARN(1, "Tx-ring is not supported.\n");
3581 goto out;
3582 }
1ce4f28b 3583
69e3c75f
JB
3584 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3585 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 3586
69e3c75f
JB
3587 err = -EBUSY;
3588 if (!closing) {
3589 if (atomic_read(&po->mapped))
3590 goto out;
3591 if (atomic_read(&rb->pending))
3592 goto out;
3593 }
1da177e4 3594
69e3c75f
JB
3595 if (req->tp_block_nr) {
3596 /* Sanity tests and some calculations */
3597 err = -EBUSY;
3598 if (unlikely(rb->pg_vec))
3599 goto out;
1da177e4 3600
bbd6ef87
PM
3601 switch (po->tp_version) {
3602 case TPACKET_V1:
3603 po->tp_hdrlen = TPACKET_HDRLEN;
3604 break;
3605 case TPACKET_V2:
3606 po->tp_hdrlen = TPACKET2_HDRLEN;
3607 break;
f6fb8f10 3608 case TPACKET_V3:
3609 po->tp_hdrlen = TPACKET3_HDRLEN;
3610 break;
bbd6ef87
PM
3611 }
3612
69e3c75f 3613 err = -EINVAL;
4ebf0ae2 3614 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 3615 goto out;
4ebf0ae2 3616 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 3617 goto out;
8913336a 3618 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
3619 po->tp_reserve))
3620 goto out;
4ebf0ae2 3621 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 3622 goto out;
1da177e4 3623
69e3c75f
JB
3624 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3625 if (unlikely(rb->frames_per_block <= 0))
3626 goto out;
3627 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3628 req->tp_frame_nr))
3629 goto out;
1da177e4
LT
3630
3631 err = -ENOMEM;
4ebf0ae2
DM
3632 order = get_order(req->tp_block_size);
3633 pg_vec = alloc_pg_vec(req, order);
3634 if (unlikely(!pg_vec))
1da177e4 3635 goto out;
f6fb8f10 3636 switch (po->tp_version) {
3637 case TPACKET_V3:
3638 /* Transmit path is not supported. We checked
3639 * it above, but just being paranoid.
3640 */
3641 if (!tx_ring)
3642 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
3643 break;
3644 default:
3645 break;
3646 }
69e3c75f
JB
3647 }
3648 /* Done */
3649 else {
3650 err = -EINVAL;
4ebf0ae2 3651 if (unlikely(req->tp_frame_nr))
69e3c75f 3652 goto out;
1da177e4
LT
3653 }
3654
3655 lock_sock(sk);
3656
3657 /* Detach socket from network */
3658 spin_lock(&po->bind_lock);
3659 was_running = po->running;
3660 num = po->num;
3661 if (was_running) {
1da177e4 3662 po->num = 0;
ce06b03e 3663 __unregister_prot_hook(sk, false);
1da177e4
LT
3664 }
3665 spin_unlock(&po->bind_lock);
1ce4f28b 3666
1da177e4
LT
3667 synchronize_net();
3668
3669 err = -EBUSY;
905db440 3670 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
3671 if (closing || atomic_read(&po->mapped) == 0) {
3672 err = 0;
69e3c75f 3673 spin_lock_bh(&rb_queue->lock);
c053fd96 3674 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
3675 rb->frame_max = (req->tp_frame_nr - 1);
3676 rb->head = 0;
3677 rb->frame_size = req->tp_frame_size;
3678 spin_unlock_bh(&rb_queue->lock);
3679
c053fd96
CG
3680 swap(rb->pg_vec_order, order);
3681 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
3682
3683 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
3684 po->prot_hook.func = (po->rx_ring.pg_vec) ?
3685 tpacket_rcv : packet_rcv;
3686 skb_queue_purge(rb_queue);
1da177e4 3687 if (atomic_read(&po->mapped))
40d4e3df
ED
3688 pr_err("packet_mmap: vma is busy: %d\n",
3689 atomic_read(&po->mapped));
1da177e4 3690 }
905db440 3691 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3692
3693 spin_lock(&po->bind_lock);
ce06b03e 3694 if (was_running) {
1da177e4 3695 po->num = num;
ce06b03e 3696 register_prot_hook(sk);
1da177e4
LT
3697 }
3698 spin_unlock(&po->bind_lock);
f6fb8f10 3699 if (closing && (po->tp_version > TPACKET_V2)) {
3700 /* Because we don't support block-based V3 on tx-ring */
3701 if (!tx_ring)
3702 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3703 }
1da177e4
LT
3704 release_sock(sk);
3705
1da177e4
LT
3706 if (pg_vec)
3707 free_pg_vec(pg_vec, order, req->tp_block_nr);
3708out:
3709 return err;
3710}
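/*
 * Editorial note: packet_set_ring() above requires that tp_block_size is a
 * page multiple, tp_frame_size is TPACKET_ALIGNMENT-aligned and at least
 * tp_hdrlen + tp_reserve, and that tp_frame_nr equals
 * (tp_block_size / tp_frame_size) * tp_block_nr.  A small user-space sketch
 * of a request that satisfies those checks (example sizes only, not part of
 * this file):
 */
#if 0	/* user-space example only */
#include <unistd.h>
#include <linux/if_packet.h>

static struct tpacket_req sized_req_example(unsigned int blocks)
{
	struct tpacket_req req;

	req.tp_block_size = 4 * getpagesize();		/* page multiple */
	req.tp_frame_size = TPACKET_ALIGN(2048);	/* aligned, >= hdr + reserve */
	req.tp_block_nr   = blocks;
	req.tp_frame_nr   = req.tp_block_size / req.tp_frame_size * blocks;

	return req;
}
#endif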
3711
69e3c75f
JB
3712static int packet_mmap(struct file *file, struct socket *sock,
3713 struct vm_area_struct *vma)
1da177e4
LT
3714{
3715 struct sock *sk = sock->sk;
3716 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
3717 unsigned long size, expected_size;
3718 struct packet_ring_buffer *rb;
1da177e4
LT
3719 unsigned long start;
3720 int err = -EINVAL;
3721 int i;
3722
3723 if (vma->vm_pgoff)
3724 return -EINVAL;
3725
905db440 3726 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
3727
3728 expected_size = 0;
3729 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3730 if (rb->pg_vec) {
3731 expected_size += rb->pg_vec_len
3732 * rb->pg_vec_pages
3733 * PAGE_SIZE;
3734 }
3735 }
3736
3737 if (expected_size == 0)
1da177e4 3738 goto out;
69e3c75f
JB
3739
3740 size = vma->vm_end - vma->vm_start;
3741 if (size != expected_size)
1da177e4
LT
3742 goto out;
3743
1da177e4 3744 start = vma->vm_start;
69e3c75f
JB
3745 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3746 if (rb->pg_vec == NULL)
3747 continue;
3748
3749 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
3750 struct page *page;
3751 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
3752 int pg_num;
3753
c56b4d90
CG
3754 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
3755 page = pgv_to_page(kaddr);
69e3c75f
JB
3756 err = vm_insert_page(vma, start, page);
3757 if (unlikely(err))
3758 goto out;
3759 start += PAGE_SIZE;
0e3125c7 3760 kaddr += PAGE_SIZE;
69e3c75f 3761 }
4ebf0ae2 3762 }
1da177e4 3763 }
69e3c75f 3764
4ebf0ae2 3765 atomic_inc(&po->mapped);
1da177e4
LT
3766 vma->vm_ops = &packet_mmap_ops;
3767 err = 0;
3768
3769out:
905db440 3770 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3771 return err;
3772}

static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};
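packet_ops serves SOCK_RAW and SOCK_DGRAM packet sockets, while packet_ops_spkt backs the legacy SOCK_PACKET type; packet_create() (registered through packet_family_ops below) picks between them. A minimal userspace sketch of opening a socket served by packet_ops follows; the open_packet_socket() helper and the interface-name lookup are illustrative assumptions, and CAP_NET_RAW is required:

/* Illustrative userspace code, not kernel code. */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <net/if.h>
#include <arpa/inet.h>

static int open_packet_socket(const char *ifname)
{
	struct sockaddr_ll sll;
	int fd;

	fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	if (fd == -1)
		return -1;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	/* A zero ifindex would bind to every device; here we pin it to one. */
	sll.sll_ifindex  = if_nametoindex(ifname);
	if (bind(fd, (struct sockaddr *)&sll, sizeof(sll)) == -1) {
		close(fd);
		return -1;
	}

	return fd;
}
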

static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner =	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call =	packet_notifier,
};

#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net = seq_file_net(seq);

	rcu_read_lock();
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
	else {
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif
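packet_seq_show() above emits one line per packet socket in the current network namespace, exposed as /proc/net/packet through the pernet init code below. A minimal sketch of reading it from userspace follows; the dump_packet_sockets() helper name is an assumption:

/* Illustrative userspace code, not kernel code. */
#include <stdio.h>

static void dump_packet_sockets(void)
{
	char line[256];
	FILE *fp = fopen("/proc/net/packet", "r");

	if (!fp)
		return;
	/* The first line is the header written for SEQ_START_TOKEN; each
	 * following line is "sk refcnt type proto ifindex running rmem
	 * uid inode" for one packet socket. */
	while (fgets(line, sizeof(line), fp))
		fputs(line, stdout);
	fclose(fp);
}
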

static int __net_init packet_net_init(struct net *net)
{
	mutex_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
		return -ENOMEM;

	return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
	remove_proc_entry("packet", net->proc_net);
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};

static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	sock_register(&packet_family_ops);
	register_pernet_subsys(&packet_net_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
out:
	return rc;
}
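/*
 * Note: packet_exit() above unwinds these registrations in the reverse
 * order of packet_init(), so the netdevice notifier and the per-net
 * /proc/net/packet entries are gone before the PF_PACKET family and the
 * protocol itself are unregistered.
 */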

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);
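/*
 * MODULE_ALIAS_NETPROTO(PF_PACKET) lets the socket core demand-load this
 * code when it is built as a module: the first socket(AF_PACKET, ...) call
 * for an unregistered family triggers, roughly, a request_module("net-pf-17")
 * lookup, which resolves to this alias.
 */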