[mirror_ubuntu-zesty-kernel.git] / net / packet / af_packet.c
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PACKET - implements raw packet sockets.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
1ce4f28b 12 * Fixes:
1da177e4
LT
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
1ce4f28b 35 * Ulises Alonso : Frame number limit removal and
1da177e4 36 * packet_set_ring memory leak.
0fb375fb
EB
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
1ce4f28b 40 * byte arrays at the end of sockaddr_ll
0fb375fb 41 * and packet_mreq.
69e3c75f 42 * Johann Baudy : Added TX RING.
f6fb8f10 43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
44 * layer.
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46 *
1da177e4
LT
47 *
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
52 *
53 */
1ce4f28b 54
1da177e4 55#include <linux/types.h>
1da177e4 56#include <linux/mm.h>
4fc268d2 57#include <linux/capability.h>
1da177e4
LT
58#include <linux/fcntl.h>
59#include <linux/socket.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_packet.h>
64#include <linux/wireless.h>
ffbc6111 65#include <linux/kernel.h>
1da177e4 66#include <linux/kmod.h>
5a0e3ad6 67#include <linux/slab.h>
0e3125c7 68#include <linux/vmalloc.h>
457c4cbc 69#include <net/net_namespace.h>
1da177e4
LT
70#include <net/ip.h>
71#include <net/protocol.h>
72#include <linux/skbuff.h>
73#include <net/sock.h>
74#include <linux/errno.h>
75#include <linux/timer.h>
1da177e4
LT
76#include <asm/uaccess.h>
77#include <asm/ioctls.h>
78#include <asm/page.h>
a1f8e7f7 79#include <asm/cacheflush.h>
1da177e4
LT
80#include <asm/io.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
83#include <linux/poll.h>
84#include <linux/module.h>
85#include <linux/init.h>
905db440 86#include <linux/mutex.h>
05423b24 87#include <linux/if_vlan.h>
bfd5f4a3 88#include <linux/virtio_net.h>
ed85b565 89#include <linux/errqueue.h>
614f60fa 90#include <linux/net_tstamp.h>
1da177e4
LT
91
92#ifdef CONFIG_INET
93#include <net/inet_common.h>
94#endif
95
1da177e4
LT
96/*
97 Assumptions:
 98 - if a device has no dev->hard_header routine, it adds and removes the ll
 99 header itself. In this case the ll header is invisible outside the device,
 100 but higher levels should still reserve dev->hard_header_len.
 101 Some devices are clever enough to reallocate the skb when the header
 102 will not fit in the reserved space (tunnels); others are not
 103 (PPP).
 104 - a packet socket receives packets with the ll header pulled,
 105 so SOCK_RAW should push it back.
106
107On receive:
108-----------
109
110Incoming, dev->hard_header!=NULL
b0e380b1
ACM
111 mac_header -> ll header
112 data -> data
1da177e4
LT
113
114Outgoing, dev->hard_header!=NULL
b0e380b1
ACM
115 mac_header -> ll header
116 data -> ll header
1da177e4
LT
117
118Incoming, dev->hard_header==NULL
b0e380b1
ACM
 119 mac_header -> UNKNOWN position. It very likely points to the ll
 120 header. PPP does this, which is wrong, because it introduces
db0c58f9 121 asymmetry between rx and tx paths.
b0e380b1 122 data -> data
1da177e4
LT
123
124Outgoing, dev->hard_header==NULL
b0e380b1
ACM
125 mac_header -> data. ll header is still not built!
126 data -> data
1da177e4
LT
127
 128 In summary:
 129 If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
130
131
132On transmit:
133------------
134
135dev->hard_header != NULL
b0e380b1
ACM
136 mac_header -> ll header
137 data -> ll header
1da177e4
LT
138
139dev->hard_header == NULL (ll header is added by device, we cannot control it)
b0e380b1
ACM
140 mac_header -> data
141 data -> data
1da177e4
LT
142
 143 We should set nh.raw on output to the correct position,
 144 the packet classifier depends on it.
145 */
146
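/* A minimal userspace sketch of the distinction described above: with
 * SOCK_RAW the buffer handed to recvfrom() starts with the link-layer
 * header, while with SOCK_DGRAM the kernel strips it and reports it through
 * sockaddr_ll instead. Assumes an Ethernet device; error handling omitted.
 */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <stdio.h>
#include <sys/socket.h>

static void rx_one(int sock_type)
{
	unsigned char buf[2048];
	struct sockaddr_ll from;
	socklen_t fromlen = sizeof(from);
	int fd = socket(AF_PACKET, sock_type, htons(ETH_P_ALL));
	ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
			     (struct sockaddr *)&from, &fromlen);

	/* SOCK_RAW: buf starts with the 14-byte Ethernet header.
	 * SOCK_DGRAM: buf starts at the network-layer payload; the hardware
	 * address is found in from.sll_addr instead.
	 */
	printf("received %zd bytes, hatype %u\n", n, from.sll_hatype);
}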
1da177e4
LT
147/* Private packet socket structures. */
148
40d4e3df 149struct packet_mclist {
1da177e4
LT
150 struct packet_mclist *next;
151 int ifindex;
152 int count;
153 unsigned short type;
154 unsigned short alen;
0fb375fb
EB
155 unsigned char addr[MAX_ADDR_LEN];
156};
157/* identical to struct packet_mreq except it has
158 * a longer address field.
159 */
40d4e3df 160struct packet_mreq_max {
0fb375fb
EB
161 int mr_ifindex;
162 unsigned short mr_type;
163 unsigned short mr_alen;
164 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 165};
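/* A minimal userspace sketch of the packet_mreq usage that the extended
 * packet_mreq_max above mirrors on the kernel side: putting one interface
 * into promiscuous mode with PACKET_ADD_MEMBERSHIP. Assumes ifname exists;
 * error handling omitted.
 */
#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>

static int enable_promisc(int fd, const char *ifname)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = if_nametoindex(ifname);
	mreq.mr_type = PACKET_MR_PROMISC;	/* no hardware address needed */

	return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
			  &mreq, sizeof(mreq));
}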
a2efcfa0 166
f6fb8f10 167static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f
JB
168 int closing, int tx_ring);
169
f6fb8f10 170
171#define V3_ALIGNMENT (8)
172
bc59ba39 173#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
f6fb8f10 174
175#define BLK_PLUS_PRIV(sz_of_priv) \
176 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
177
178/* kbdq - kernel block descriptor queue */
bc59ba39 179struct tpacket_kbdq_core {
f6fb8f10 180 struct pgv *pkbdq;
181 unsigned int feature_req_word;
182 unsigned int hdrlen;
183 unsigned char reset_pending_on_curr_blk;
184 unsigned char delete_blk_timer;
185 unsigned short kactive_blk_num;
186 unsigned short blk_sizeof_priv;
187
188 /* last_kactive_blk_num:
189 * trick to see if user-space has caught up
190 * in order to avoid refreshing timer when every single pkt arrives.
191 */
192 unsigned short last_kactive_blk_num;
193
194 char *pkblk_start;
195 char *pkblk_end;
196 int kblk_size;
197 unsigned int knum_blocks;
198 uint64_t knxt_seq_num;
199 char *prev;
200 char *nxt_offset;
201 struct sk_buff *skb;
202
203 atomic_t blk_fill_in_prog;
204
205 /* Default is set to 8ms */
206#define DEFAULT_PRB_RETIRE_TOV (8)
207
208 unsigned short retire_blk_tov;
209 unsigned short version;
210 unsigned long tov_in_jiffies;
211
212 /* timer to retire an outstanding block */
213 struct timer_list retire_blk_timer;
214};
215
216#define PGV_FROM_VMALLOC 1
0e3125c7
NH
217struct pgv {
218 char *buffer;
0e3125c7
NH
219};
220
69e3c75f 221struct packet_ring_buffer {
0e3125c7 222 struct pgv *pg_vec;
69e3c75f
JB
223 unsigned int head;
224 unsigned int frames_per_block;
225 unsigned int frame_size;
226 unsigned int frame_max;
227
228 unsigned int pg_vec_order;
229 unsigned int pg_vec_pages;
230 unsigned int pg_vec_len;
231
bc59ba39 232 struct tpacket_kbdq_core prb_bdqc;
69e3c75f
JB
233 atomic_t pending;
234};
235
f6fb8f10 236#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
237#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
238#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
239#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
240#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
241#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
242#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
243
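/* The macros above read the block descriptor the kernel shares with user
 * space. A minimal userspace sketch of the matching consumer side for
 * TPACKET_V3: once block_status carries TP_STATUS_USER, walk the packets
 * via tp_next_offset, then return the block with TP_STATUS_KERNEL.
 * Assumes "pbd" points into an mmap()ed V3 ring; polling is omitted.
 */
#include <linux/if_packet.h>

static void walk_block(struct tpacket_block_desc *pbd)
{
	unsigned int i, num_pkts = pbd->hdr.bh1.num_pkts;
	struct tpacket3_hdr *ppd;

	ppd = (struct tpacket3_hdr *)((char *)pbd +
				      pbd->hdr.bh1.offset_to_first_pkt);
	for (i = 0; i < num_pkts; i++) {
		/* packet data: tp_snaplen bytes at (char *)ppd + ppd->tp_mac */
		ppd = (struct tpacket3_hdr *)((char *)ppd + ppd->tp_next_offset);
	}

	__sync_synchronize();			/* finish reads before release */
	pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;
}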
69e3c75f
JB
244struct packet_sock;
245static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
1da177e4 246
f6fb8f10 247static void *packet_previous_frame(struct packet_sock *po,
248 struct packet_ring_buffer *rb,
249 int status);
250static void packet_increment_head(struct packet_ring_buffer *buff);
bc59ba39 251static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
252 struct tpacket_block_desc *);
253static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 254 struct packet_sock *);
bc59ba39 255static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 256 struct packet_sock *, unsigned int status);
bc59ba39 257static int prb_queue_frozen(struct tpacket_kbdq_core *);
258static void prb_open_block(struct tpacket_kbdq_core *,
259 struct tpacket_block_desc *);
f6fb8f10 260static void prb_retire_rx_blk_timer_expired(unsigned long);
bc59ba39 261static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
262static void prb_init_blk_timer(struct packet_sock *,
263 struct tpacket_kbdq_core *,
264 void (*func) (unsigned long));
265static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
266static void prb_clear_rxhash(struct tpacket_kbdq_core *,
267 struct tpacket3_hdr *);
268static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
269 struct tpacket3_hdr *);
1da177e4
LT
270static void packet_flush_mclist(struct sock *sk);
271
dc99f600 272struct packet_fanout;
1da177e4
LT
273struct packet_sock {
274 /* struct sock has to be the first member of packet_sock */
275 struct sock sk;
dc99f600 276 struct packet_fanout *fanout;
1da177e4 277 struct tpacket_stats stats;
f6fb8f10 278 union tpacket_stats_u stats_u;
69e3c75f
JB
279 struct packet_ring_buffer rx_ring;
280 struct packet_ring_buffer tx_ring;
1da177e4 281 int copy_thresh;
1da177e4 282 spinlock_t bind_lock;
905db440 283 struct mutex pg_vec_lock;
8dc41944 284 unsigned int running:1, /* prot_hook is attached*/
80feaacb 285 auxdata:1,
bfd5f4a3
SS
286 origdev:1,
287 has_vnet_hdr:1;
1da177e4 288 int ifindex; /* bound device */
0e11c91e 289 __be16 num;
1da177e4 290 struct packet_mclist *mclist;
1da177e4 291 atomic_t mapped;
bbd6ef87
PM
292 enum tpacket_versions tp_version;
293 unsigned int tp_hdrlen;
8913336a 294 unsigned int tp_reserve;
69e3c75f 295 unsigned int tp_loss:1;
614f60fa 296 unsigned int tp_tstamp;
94b05952 297 struct packet_type prot_hook ____cacheline_aligned_in_smp;
1da177e4
LT
298};
299
dc99f600
DM
300#define PACKET_FANOUT_MAX 256
301
302struct packet_fanout {
303#ifdef CONFIG_NET_NS
304 struct net *net;
305#endif
306 unsigned int num_members;
307 u16 id;
308 u8 type;
7736d33f 309 u8 defrag;
dc99f600
DM
310 atomic_t rr_cur;
311 struct list_head list;
312 struct sock *arr[PACKET_FANOUT_MAX];
313 spinlock_t lock;
314 atomic_t sk_ref;
315 struct packet_type prot_hook ____cacheline_aligned_in_smp;
316};
317
ffbc6111
HX
318struct packet_skb_cb {
319 unsigned int origlen;
320 union {
321 struct sockaddr_pkt pkt;
322 struct sockaddr_ll ll;
323 } sa;
324};
325
326#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 327
bc59ba39 328#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 329#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 330 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 331#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 332 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 333#define GET_NEXT_PRB_BLK_NUM(x) \
334 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
335 ((x)->kactive_blk_num+1) : 0)
336
eea49cc9 337static struct packet_sock *pkt_sk(struct sock *sk)
ce06b03e
DM
338{
339 return (struct packet_sock *)sk;
340}
341
dc99f600
DM
342static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
343static void __fanout_link(struct sock *sk, struct packet_sock *po);
344
ce06b03e
DM
345/* register_prot_hook must be invoked with the po->bind_lock held,
346 * or from a context in which asynchronous accesses to the packet
347 * socket is not possible (packet_create()).
348 */
349static void register_prot_hook(struct sock *sk)
350{
351 struct packet_sock *po = pkt_sk(sk);
352 if (!po->running) {
dc99f600
DM
353 if (po->fanout)
354 __fanout_link(sk, po);
355 else
356 dev_add_pack(&po->prot_hook);
ce06b03e
DM
357 sock_hold(sk);
358 po->running = 1;
359 }
360}
361
362/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
363 * held. If the sync parameter is true, we will temporarily drop
364 * the po->bind_lock and do a synchronize_net to make sure no
365 * asynchronous packet processing paths still refer to the elements
366 * of po->prot_hook. If the sync parameter is false, it is the
367 * callers responsibility to take care of this.
368 */
369static void __unregister_prot_hook(struct sock *sk, bool sync)
370{
371 struct packet_sock *po = pkt_sk(sk);
372
373 po->running = 0;
dc99f600
DM
374 if (po->fanout)
375 __fanout_unlink(sk, po);
376 else
377 __dev_remove_pack(&po->prot_hook);
ce06b03e
DM
378 __sock_put(sk);
379
380 if (sync) {
381 spin_unlock(&po->bind_lock);
382 synchronize_net();
383 spin_lock(&po->bind_lock);
384 }
385}
386
387static void unregister_prot_hook(struct sock *sk, bool sync)
388{
389 struct packet_sock *po = pkt_sk(sk);
390
391 if (po->running)
392 __unregister_prot_hook(sk, sync);
393}
394
f6dafa95 395static inline __pure struct page *pgv_to_page(void *addr)
0af55bb5
CG
396{
397 if (is_vmalloc_addr(addr))
398 return vmalloc_to_page(addr);
399 return virt_to_page(addr);
400}
401
69e3c75f 402static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 403{
bbd6ef87
PM
404 union {
405 struct tpacket_hdr *h1;
406 struct tpacket2_hdr *h2;
407 void *raw;
408 } h;
1da177e4 409
69e3c75f 410 h.raw = frame;
bbd6ef87
PM
411 switch (po->tp_version) {
412 case TPACKET_V1:
69e3c75f 413 h.h1->tp_status = status;
0af55bb5 414 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
415 break;
416 case TPACKET_V2:
69e3c75f 417 h.h2->tp_status = status;
0af55bb5 418 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 419 break;
f6fb8f10 420 case TPACKET_V3:
69e3c75f 421 default:
f6fb8f10 422 WARN(1, "TPACKET version not supported.\n");
69e3c75f 423 BUG();
bbd6ef87 424 }
69e3c75f
JB
425
426 smp_wmb();
bbd6ef87
PM
427}
428
69e3c75f 429static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87
PM
430{
431 union {
432 struct tpacket_hdr *h1;
433 struct tpacket2_hdr *h2;
434 void *raw;
435 } h;
436
69e3c75f
JB
437 smp_rmb();
438
bbd6ef87
PM
439 h.raw = frame;
440 switch (po->tp_version) {
441 case TPACKET_V1:
0af55bb5 442 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 443 return h.h1->tp_status;
bbd6ef87 444 case TPACKET_V2:
0af55bb5 445 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 446 return h.h2->tp_status;
f6fb8f10 447 case TPACKET_V3:
69e3c75f 448 default:
f6fb8f10 449 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
450 BUG();
451 return 0;
bbd6ef87 452 }
1da177e4 453}
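/* __packet_set_status()/__packet_get_status() implement the kernel half of
 * the tp_status handshake. A minimal userspace sketch of the other half for
 * TPACKET_V2: wait for TP_STATUS_USER, consume the frame, then store
 * TP_STATUS_KERNEL to hand the slot back. "frame" is assumed to point into
 * the mmap()ed ring; error handling omitted.
 */
#include <linux/if_packet.h>
#include <poll.h>

static void consume_frame(int fd, volatile struct tpacket2_hdr *frame)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };

	while (!(frame->tp_status & TP_STATUS_USER))
		poll(&pfd, 1, -1);		/* block until the kernel fills it */

	/* frame data: tp_snaplen bytes at (char *)frame + frame->tp_mac */

	__sync_synchronize();			/* finish reads before release */
	frame->tp_status = TP_STATUS_KERNEL;
}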
69e3c75f
JB
454
455static void *packet_lookup_frame(struct packet_sock *po,
456 struct packet_ring_buffer *rb,
457 unsigned int position,
458 int status)
459{
460 unsigned int pg_vec_pos, frame_offset;
461 union {
462 struct tpacket_hdr *h1;
463 struct tpacket2_hdr *h2;
464 void *raw;
465 } h;
466
467 pg_vec_pos = position / rb->frames_per_block;
468 frame_offset = position % rb->frames_per_block;
469
0e3125c7
NH
470 h.raw = rb->pg_vec[pg_vec_pos].buffer +
471 (frame_offset * rb->frame_size);
69e3c75f
JB
472
473 if (status != __packet_get_status(po, h.raw))
474 return NULL;
475
476 return h.raw;
477}
478
eea49cc9 479static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
480 struct packet_ring_buffer *rb,
481 int status)
482{
483 return packet_lookup_frame(po, rb, rb->head, status);
484}
485
bc59ba39 486static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 487{
488 del_timer_sync(&pkc->retire_blk_timer);
489}
490
491static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
492 int tx_ring,
493 struct sk_buff_head *rb_queue)
494{
bc59ba39 495 struct tpacket_kbdq_core *pkc;
f6fb8f10 496
497 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
498
499 spin_lock(&rb_queue->lock);
500 pkc->delete_blk_timer = 1;
501 spin_unlock(&rb_queue->lock);
502
503 prb_del_retire_blk_timer(pkc);
504}
505
506static void prb_init_blk_timer(struct packet_sock *po,
bc59ba39 507 struct tpacket_kbdq_core *pkc,
f6fb8f10 508 void (*func) (unsigned long))
509{
510 init_timer(&pkc->retire_blk_timer);
511 pkc->retire_blk_timer.data = (long)po;
512 pkc->retire_blk_timer.function = func;
513 pkc->retire_blk_timer.expires = jiffies;
514}
515
516static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
517{
bc59ba39 518 struct tpacket_kbdq_core *pkc;
f6fb8f10 519
520 if (tx_ring)
521 BUG();
522
523 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
524 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
525}
526
527static int prb_calc_retire_blk_tmo(struct packet_sock *po,
528 int blk_size_in_bytes)
529{
530 struct net_device *dev;
531 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
4bc71cb9
JP
532 struct ethtool_cmd ecmd;
533 int err;
f6fb8f10 534
4bc71cb9
JP
535 rtnl_lock();
536 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
537 if (unlikely(!dev)) {
538 rtnl_unlock();
f6fb8f10 539 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9
JP
540 }
541 err = __ethtool_get_settings(dev, &ecmd);
542 rtnl_unlock();
543 if (!err) {
544 switch (ecmd.speed) {
545 case SPEED_10000:
546 msec = 1;
547 div = 10000/1000;
548 break;
549 case SPEED_1000:
550 msec = 1;
551 div = 1000/1000;
552 break;
553 /*
554 * If the link speed is so slow you don't really
555 * need to worry about perf anyways
556 */
557 case SPEED_100:
558 case SPEED_10:
559 default:
560 return DEFAULT_PRB_RETIRE_TOV;
f6fb8f10 561 }
562 }
563
564 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
565
566 if (div)
567 mbits /= div;
568
569 tmo = mbits * msec;
570
571 if (div)
572 return tmo+1;
573 return tmo;
574}
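/* Worked example of the calculation above: on a 1 Gbit/s link (msec = 1,
 * div = 1) a 1 MiB block gives mbits = (1048576 * 8) / (1024 * 1024) = 8,
 * so tmo = 8 and the function returns 9 ms - close to
 * DEFAULT_PRB_RETIRE_TOV. On a 10 Gbit/s link (div = 10) the same block
 * yields mbits = 0 and a 1 ms timeout.
 */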
575
bc59ba39 576static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 577 union tpacket_req_u *req_u)
578{
579 p1->feature_req_word = req_u->req3.tp_feature_req_word;
580}
581
582static void init_prb_bdqc(struct packet_sock *po,
583 struct packet_ring_buffer *rb,
584 struct pgv *pg_vec,
585 union tpacket_req_u *req_u, int tx_ring)
586{
bc59ba39 587 struct tpacket_kbdq_core *p1 = &rb->prb_bdqc;
588 struct tpacket_block_desc *pbd;
f6fb8f10 589
590 memset(p1, 0x0, sizeof(*p1));
591
592 p1->knxt_seq_num = 1;
593 p1->pkbdq = pg_vec;
bc59ba39 594 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
f6fb8f10 595 p1->pkblk_start = (char *)pg_vec[0].buffer;
596 p1->kblk_size = req_u->req3.tp_block_size;
597 p1->knum_blocks = req_u->req3.tp_block_nr;
598 p1->hdrlen = po->tp_hdrlen;
599 p1->version = po->tp_version;
600 p1->last_kactive_blk_num = 0;
601 po->stats_u.stats3.tp_freeze_q_cnt = 0;
602 if (req_u->req3.tp_retire_blk_tov)
603 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
604 else
605 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
606 req_u->req3.tp_block_size);
607 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
608 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
609
610 prb_init_ft_ops(p1, req_u);
611 prb_setup_retire_blk_timer(po, tx_ring);
612 prb_open_block(p1, pbd);
613}
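/* A minimal userspace sketch of the request that init_prb_bdqc() consumes:
 * select TPACKET_V3, size the ring with tpacket_req3 and mmap() it. The
 * block/frame sizes are arbitrary illustrative values; error handling is
 * omitted.
 */
#include <linux/if_packet.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/socket.h>

static void *setup_v3_rx_ring(int fd)
{
	int version = TPACKET_V3;
	struct tpacket_req3 req;

	memset(&req, 0, sizeof(req));
	req.tp_block_size = 1 << 20;		/* 1 MiB per block */
	req.tp_block_nr = 64;
	req.tp_frame_size = 2048;		/* V3 packs packets contiguously,
						 * but these must stay consistent */
	req.tp_frame_nr = (req.tp_block_size / req.tp_frame_size) * req.tp_block_nr;
	req.tp_retire_blk_tov = 60;		/* ms; 0 means "use the computed default" */
	req.tp_sizeof_priv = 0;

	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version));
	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));

	return mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}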
614
615/* Do NOT update the last_blk_num first.
616 * Assumes sk_buff_head lock is held.
617 */
bc59ba39 618static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 619{
620 mod_timer(&pkc->retire_blk_timer,
621 jiffies + pkc->tov_in_jiffies);
622 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
623}
624
625/*
626 * Timer logic:
627 * 1) We refresh the timer only when we open a block.
628 * By doing this we don't waste cycles refreshing the timer
 629 * on a packet-by-packet basis.
630 *
631 * With a 1MB block-size, on a 1Gbps line, it will take
632 * i) ~8 ms to fill a block + ii) memcpy etc.
633 * In this cut we are not accounting for the memcpy time.
634 *
635 * So, if the user sets the 'tmo' to 10ms then the timer
636 * will never fire while the block is still getting filled
637 * (which is what we want). However, the user could choose
638 * to close a block early and that's fine.
639 *
640 * But when the timer does fire, we check whether or not to refresh it.
641 * Since the tmo granularity is in msecs, it is not too expensive
 642 * to refresh the timer, let's say every '8' msecs.
643 * Either the user can set the 'tmo' or we can derive it based on
644 * a) line-speed and b) block-size.
645 * prb_calc_retire_blk_tmo() calculates the tmo.
646 *
647 */
648static void prb_retire_rx_blk_timer_expired(unsigned long data)
649{
650 struct packet_sock *po = (struct packet_sock *)data;
bc59ba39 651 struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc;
f6fb8f10 652 unsigned int frozen;
bc59ba39 653 struct tpacket_block_desc *pbd;
f6fb8f10 654
655 spin_lock(&po->sk.sk_receive_queue.lock);
656
657 frozen = prb_queue_frozen(pkc);
658 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
659
660 if (unlikely(pkc->delete_blk_timer))
661 goto out;
662
663 /* We only need to plug the race when the block is partially filled.
664 * tpacket_rcv:
665 * lock(); increment BLOCK_NUM_PKTS; unlock()
666 * copy_bits() is in progress ...
667 * timer fires on other cpu:
668 * we can't retire the current block because copy_bits
669 * is in progress.
670 *
671 */
672 if (BLOCK_NUM_PKTS(pbd)) {
673 while (atomic_read(&pkc->blk_fill_in_prog)) {
674 /* Waiting for skb_copy_bits to finish... */
675 cpu_relax();
676 }
677 }
678
679 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
680 if (!frozen) {
681 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
682 if (!prb_dispatch_next_block(pkc, po))
683 goto refresh_timer;
684 else
685 goto out;
686 } else {
687 /* Case 1. Queue was frozen because user-space was
688 * lagging behind.
689 */
690 if (prb_curr_blk_in_use(pkc, pbd)) {
691 /*
692 * Ok, user-space is still behind.
693 * So just refresh the timer.
694 */
695 goto refresh_timer;
696 } else {
 697 /* Case 2. Queue was frozen, user-space caught up,
 698 * now the link went idle && the timer fired.
 699 * We don't have a block to close. So we open this
 700 * block and restart the timer.
 701 * Opening a block thaws the queue and restarts the timer.
702 * Thawing/timer-refresh is a side effect.
703 */
704 prb_open_block(pkc, pbd);
705 goto out;
706 }
707 }
708 }
709
710refresh_timer:
711 _prb_refresh_rx_retire_blk_timer(pkc);
712
713out:
714 spin_unlock(&po->sk.sk_receive_queue.lock);
715}
716
eea49cc9 717static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 718 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 719{
720 /* Flush everything minus the block header */
721
722#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
723 u8 *start, *end;
724
725 start = (u8 *)pbd1;
726
 727 /* Skip the block header (we know the header WILL fit in 4K) */
728 start += PAGE_SIZE;
729
730 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
731 for (; start < end; start += PAGE_SIZE)
732 flush_dcache_page(pgv_to_page(start));
733
734 smp_wmb();
735#endif
736
737 /* Now update the block status. */
738
739 BLOCK_STATUS(pbd1) = status;
740
741 /* Flush the block header */
742
743#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
744 start = (u8 *)pbd1;
745 flush_dcache_page(pgv_to_page(start));
746
747 smp_wmb();
748#endif
749}
750
751/*
752 * Side effect:
753 *
754 * 1) flush the block
755 * 2) Increment active_blk_num
756 *
 757 * Note: We DONT refresh the timer on purpose,
 758 * because almost always the next block will be opened.
759 */
bc59ba39 760static void prb_close_block(struct tpacket_kbdq_core *pkc1,
761 struct tpacket_block_desc *pbd1,
f6fb8f10 762 struct packet_sock *po, unsigned int stat)
763{
764 __u32 status = TP_STATUS_USER | stat;
765
766 struct tpacket3_hdr *last_pkt;
bc59ba39 767 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 768
769 if (po->stats.tp_drops)
770 status |= TP_STATUS_LOSING;
771
772 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
773 last_pkt->tp_next_offset = 0;
774
775 /* Get the ts of the last pkt */
776 if (BLOCK_NUM_PKTS(pbd1)) {
777 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
778 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
779 } else {
780 /* Ok, we tmo'd - so get the current time */
781 struct timespec ts;
782 getnstimeofday(&ts);
783 h1->ts_last_pkt.ts_sec = ts.tv_sec;
784 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
785 }
786
787 smp_wmb();
788
789 /* Flush the block */
790 prb_flush_block(pkc1, pbd1, status);
791
792 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
793}
794
eea49cc9 795static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 796{
797 pkc->reset_pending_on_curr_blk = 0;
798}
799
800/*
801 * Side effect of opening a block:
802 *
803 * 1) prb_queue is thawed.
804 * 2) retire_blk_timer is refreshed.
805 *
806 */
bc59ba39 807static void prb_open_block(struct tpacket_kbdq_core *pkc1,
808 struct tpacket_block_desc *pbd1)
f6fb8f10 809{
810 struct timespec ts;
bc59ba39 811 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 812
813 smp_rmb();
814
815 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd1))) {
816
817 /* We could have just memset this but we will lose the
818 * flexibility of making the priv area sticky
819 */
820 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
821 BLOCK_NUM_PKTS(pbd1) = 0;
822 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
823 getnstimeofday(&ts);
824 h1->ts_first_pkt.ts_sec = ts.tv_sec;
825 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
826 pkc1->pkblk_start = (char *)pbd1;
827 pkc1->nxt_offset = (char *)(pkc1->pkblk_start +
828 BLK_PLUS_PRIV(pkc1->blk_sizeof_priv));
829 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
830 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
831 pbd1->version = pkc1->version;
832 pkc1->prev = pkc1->nxt_offset;
833 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
834 prb_thaw_queue(pkc1);
835 _prb_refresh_rx_retire_blk_timer(pkc1);
836
837 smp_wmb();
838
839 return;
840 }
841
842 WARN(1, "ERROR block:%p is NOT FREE status:%d kactive_blk_num:%d\n",
843 pbd1, BLOCK_STATUS(pbd1), pkc1->kactive_blk_num);
844 dump_stack();
845 BUG();
846}
847
848/*
849 * Queue freeze logic:
850 * 1) Assume tp_block_nr = 8 blocks.
851 * 2) At time 't0', user opens Rx ring.
852 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
853 * 4) user-space is either sleeping or processing block '0'.
854 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 855 * it will close block-7, loop around and try to fill block '0'.
856 * call-flow:
857 * __packet_lookup_frame_in_block
858 * prb_retire_current_block()
859 * prb_dispatch_next_block()
860 * |->(BLOCK_STATUS == USER) evaluates to true
861 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
862 * 6) Now there are two cases:
863 * 6.1) Link goes idle right after the queue is frozen.
864 * But remember, the last open_block() refreshed the timer.
 865 * When this timer expires, it will refresh itself so that we can
866 * re-open block-0 in near future.
867 * 6.2) Link is busy and keeps on receiving packets. This is a simple
868 * case and __packet_lookup_frame_in_block will check if block-0
869 * is free and can now be re-used.
870 */
eea49cc9 871static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 872 struct packet_sock *po)
873{
874 pkc->reset_pending_on_curr_blk = 1;
875 po->stats_u.stats3.tp_freeze_q_cnt++;
876}
877
878#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
879
880/*
881 * If the next block is free then we will dispatch it
882 * and return a good offset.
883 * Else, we will freeze the queue.
884 * So, caller must check the return value.
885 */
bc59ba39 886static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 887 struct packet_sock *po)
888{
bc59ba39 889 struct tpacket_block_desc *pbd;
f6fb8f10 890
891 smp_rmb();
892
893 /* 1. Get current block num */
894 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
895
896 /* 2. If this block is currently in_use then freeze the queue */
897 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
898 prb_freeze_queue(pkc, po);
899 return NULL;
900 }
901
902 /*
903 * 3.
904 * open this block and return the offset where the first packet
905 * needs to get stored.
906 */
907 prb_open_block(pkc, pbd);
908 return (void *)pkc->nxt_offset;
909}
910
bc59ba39 911static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 912 struct packet_sock *po, unsigned int status)
913{
bc59ba39 914 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 915
916 /* retire/close the current block */
917 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
918 /*
919 * Plug the case where copy_bits() is in progress on
920 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
921 * have space to copy the pkt in the current block and
922 * called prb_retire_current_block()
923 *
924 * We don't need to worry about the TMO case because
925 * the timer-handler already handled this case.
926 */
927 if (!(status & TP_STATUS_BLK_TMO)) {
928 while (atomic_read(&pkc->blk_fill_in_prog)) {
929 /* Waiting for skb_copy_bits to finish... */
930 cpu_relax();
931 }
932 }
933 prb_close_block(pkc, pbd, po, status);
934 return;
935 }
936
937 WARN(1, "ERROR-pbd[%d]:%p\n", pkc->kactive_blk_num, pbd);
938 dump_stack();
939 BUG();
940}
941
eea49cc9 942static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
bc59ba39 943 struct tpacket_block_desc *pbd)
f6fb8f10 944{
945 return TP_STATUS_USER & BLOCK_STATUS(pbd);
946}
947
eea49cc9 948static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 949{
950 return pkc->reset_pending_on_curr_blk;
951}
952
eea49cc9 953static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 954{
bc59ba39 955 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 956 atomic_dec(&pkc->blk_fill_in_prog);
957}
958
eea49cc9 959static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 960 struct tpacket3_hdr *ppd)
961{
962 ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
963}
964
eea49cc9 965static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 966 struct tpacket3_hdr *ppd)
967{
968 ppd->hv1.tp_rxhash = 0;
969}
970
eea49cc9 971static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 972 struct tpacket3_hdr *ppd)
973{
974 if (vlan_tx_tag_present(pkc->skb)) {
975 ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
976 ppd->tp_status = TP_STATUS_VLAN_VALID;
977 } else {
978 ppd->hv1.tp_vlan_tci = ppd->tp_status = 0;
979 }
980}
981
bc59ba39 982static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 983 struct tpacket3_hdr *ppd)
984{
985 prb_fill_vlan_info(pkc, ppd);
986
987 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
988 prb_fill_rxhash(pkc, ppd);
989 else
990 prb_clear_rxhash(pkc, ppd);
991}
992
eea49cc9 993static void prb_fill_curr_block(char *curr,
bc59ba39 994 struct tpacket_kbdq_core *pkc,
995 struct tpacket_block_desc *pbd,
f6fb8f10 996 unsigned int len)
997{
998 struct tpacket3_hdr *ppd;
999
1000 ppd = (struct tpacket3_hdr *)curr;
1001 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1002 pkc->prev = curr;
1003 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1004 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1005 BLOCK_NUM_PKTS(pbd) += 1;
1006 atomic_inc(&pkc->blk_fill_in_prog);
1007 prb_run_all_ft_ops(pkc, ppd);
1008}
1009
1010/* Assumes caller has the sk->rx_queue.lock */
1011static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1012 struct sk_buff *skb,
1013 int status,
1014 unsigned int len
1015 )
1016{
bc59ba39 1017 struct tpacket_kbdq_core *pkc;
1018 struct tpacket_block_desc *pbd;
f6fb8f10 1019 char *curr, *end;
1020
1021 pkc = GET_PBDQC_FROM_RB(((struct packet_ring_buffer *)&po->rx_ring));
1022 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1023
1024 /* Queue is frozen when user space is lagging behind */
1025 if (prb_queue_frozen(pkc)) {
1026 /*
 1027 * Check if the last block, which caused the queue to freeze,
 1028 * is still in use by user-space.
1029 */
1030 if (prb_curr_blk_in_use(pkc, pbd)) {
1031 /* Can't record this packet */
1032 return NULL;
1033 } else {
1034 /*
1035 * Ok, the block was released by user-space.
1036 * Now let's open that block.
1037 * opening a block also thaws the queue.
1038 * Thawing is a side effect.
1039 */
1040 prb_open_block(pkc, pbd);
1041 }
1042 }
1043
1044 smp_mb();
1045 curr = pkc->nxt_offset;
1046 pkc->skb = skb;
1047 end = (char *) ((char *)pbd + pkc->kblk_size);
1048
1049 /* first try the current block */
1050 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1051 prb_fill_curr_block(curr, pkc, pbd, len);
1052 return (void *)curr;
1053 }
1054
1055 /* Ok, close the current block */
1056 prb_retire_current_block(pkc, po, 0);
1057
1058 /* Now, try to dispatch the next block */
1059 curr = (char *)prb_dispatch_next_block(pkc, po);
1060 if (curr) {
1061 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1062 prb_fill_curr_block(curr, pkc, pbd, len);
1063 return (void *)curr;
1064 }
1065
1066 /*
 1067 * No free blocks are available. User-space hasn't caught up yet.
1068 * Queue was just frozen and now this packet will get dropped.
1069 */
1070 return NULL;
1071}
1072
eea49cc9 1073static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1074 struct sk_buff *skb,
1075 int status, unsigned int len)
1076{
1077 char *curr = NULL;
1078 switch (po->tp_version) {
1079 case TPACKET_V1:
1080 case TPACKET_V2:
1081 curr = packet_lookup_frame(po, &po->rx_ring,
1082 po->rx_ring.head, status);
1083 return curr;
1084 case TPACKET_V3:
1085 return __packet_lookup_frame_in_block(po, skb, status, len);
1086 default:
1087 WARN(1, "TPACKET version not supported\n");
1088 BUG();
1089 return 0;
1090 }
1091}
1092
eea49cc9 1093static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1094 struct packet_ring_buffer *rb,
1095 unsigned int previous,
1096 int status)
1097{
bc59ba39 1098 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
1099 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, previous);
f6fb8f10 1100
1101 if (status != BLOCK_STATUS(pbd))
1102 return NULL;
1103 return pbd;
1104}
1105
eea49cc9 1106static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1107{
1108 unsigned int prev;
1109 if (rb->prb_bdqc.kactive_blk_num)
1110 prev = rb->prb_bdqc.kactive_blk_num-1;
1111 else
1112 prev = rb->prb_bdqc.knum_blocks-1;
1113 return prev;
1114}
1115
1116/* Assumes caller has held the rx_queue.lock */
eea49cc9 1117static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1118 struct packet_ring_buffer *rb,
1119 int status)
1120{
1121 unsigned int previous = prb_previous_blk_num(rb);
1122 return prb_lookup_block(po, rb, previous, status);
1123}
1124
eea49cc9 1125static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1126 struct packet_ring_buffer *rb,
1127 int status)
1128{
1129 if (po->tp_version <= TPACKET_V2)
1130 return packet_previous_frame(po, rb, status);
1131
1132 return __prb_previous_block(po, rb, status);
1133}
1134
eea49cc9 1135static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1136 struct packet_ring_buffer *rb)
1137{
1138 switch (po->tp_version) {
1139 case TPACKET_V1:
1140 case TPACKET_V2:
1141 return packet_increment_head(rb);
1142 case TPACKET_V3:
1143 default:
1144 WARN(1, "TPACKET version not supported.\n");
1145 BUG();
1146 return;
1147 }
1148}
1149
eea49cc9 1150static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1151 struct packet_ring_buffer *rb,
1152 int status)
1153{
1154 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1155 return packet_lookup_frame(po, rb, previous, status);
1156}
1157
eea49cc9 1158static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1159{
1160 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1161}
1162
1da177e4
LT
1163static void packet_sock_destruct(struct sock *sk)
1164{
ed85b565
RC
1165 skb_queue_purge(&sk->sk_error_queue);
1166
547b792c
IJ
1167 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1168 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1da177e4
LT
1169
1170 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1171 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1172 return;
1173 }
1174
17ab56a2 1175 sk_refcnt_debug_dec(sk);
1da177e4
LT
1176}
1177
dc99f600
DM
1178static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
1179{
1180 int x = atomic_read(&f->rr_cur) + 1;
1181
1182 if (x >= num)
1183 x = 0;
1184
1185 return x;
1186}
1187
1188static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
1189{
1190 u32 idx, hash = skb->rxhash;
1191
1192 idx = ((u64)hash * num) >> 32;
1193
1194 return f->arr[idx];
1195}
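/* The "((u64)hash * num) >> 32" above maps a 32-bit rxhash uniformly onto
 * [0, num) without a modulo: it effectively computes floor(hash / 2^32 * num).
 * A small standalone illustration of the same trick:
 */
#include <stdint.h>

static unsigned int scale_to_range(uint32_t hash, unsigned int num)
{
	/* e.g. hash = 0x80000000, num = 4  ->  (0x80000000ULL * 4) >> 32 = 2 */
	return ((uint64_t)hash * num) >> 32;
}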
1196
1197static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
1198{
1199 int cur, old;
1200
1201 cur = atomic_read(&f->rr_cur);
1202 while ((old = atomic_cmpxchg(&f->rr_cur, cur,
1203 fanout_rr_next(f, num))) != cur)
1204 cur = old;
1205 return f->arr[cur];
1206}
1207
95ec3eb4
DM
1208static struct sock *fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
1209{
1210 unsigned int cpu = smp_processor_id();
1211
1212 return f->arr[cpu % num];
1213}
1214
95ec3eb4
DM
1215static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1216 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1217{
1218 struct packet_fanout *f = pt->af_packet_priv;
1219 unsigned int num = f->num_members;
1220 struct packet_sock *po;
1221 struct sock *sk;
1222
1223 if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
1224 !num) {
1225 kfree_skb(skb);
1226 return 0;
1227 }
1228
95ec3eb4
DM
1229 switch (f->type) {
1230 case PACKET_FANOUT_HASH:
1231 default:
1232 if (f->defrag) {
bc416d97 1233 skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
95ec3eb4
DM
1234 if (!skb)
1235 return 0;
1236 }
1237 skb_get_rxhash(skb);
1238 sk = fanout_demux_hash(f, skb, num);
1239 break;
1240 case PACKET_FANOUT_LB:
1241 sk = fanout_demux_lb(f, skb, num);
1242 break;
1243 case PACKET_FANOUT_CPU:
1244 sk = fanout_demux_cpu(f, skb, num);
1245 break;
dc99f600
DM
1246 }
1247
dc99f600
DM
1248 po = pkt_sk(sk);
1249
1250 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1251}
1252
1253static DEFINE_MUTEX(fanout_mutex);
1254static LIST_HEAD(fanout_list);
1255
1256static void __fanout_link(struct sock *sk, struct packet_sock *po)
1257{
1258 struct packet_fanout *f = po->fanout;
1259
1260 spin_lock(&f->lock);
1261 f->arr[f->num_members] = sk;
1262 smp_wmb();
1263 f->num_members++;
1264 spin_unlock(&f->lock);
1265}
1266
1267static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1268{
1269 struct packet_fanout *f = po->fanout;
1270 int i;
1271
1272 spin_lock(&f->lock);
1273 for (i = 0; i < f->num_members; i++) {
1274 if (f->arr[i] == sk)
1275 break;
1276 }
1277 BUG_ON(i >= f->num_members);
1278 f->arr[i] = f->arr[f->num_members - 1];
1279 f->num_members--;
1280 spin_unlock(&f->lock);
1281}
1282
7736d33f 1283static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600
DM
1284{
1285 struct packet_sock *po = pkt_sk(sk);
1286 struct packet_fanout *f, *match;
7736d33f
DM
1287 u8 type = type_flags & 0xff;
1288 u8 defrag = (type_flags & PACKET_FANOUT_FLAG_DEFRAG) ? 1 : 0;
dc99f600
DM
1289 int err;
1290
1291 switch (type) {
1292 case PACKET_FANOUT_HASH:
1293 case PACKET_FANOUT_LB:
95ec3eb4 1294 case PACKET_FANOUT_CPU:
dc99f600
DM
1295 break;
1296 default:
1297 return -EINVAL;
1298 }
1299
1300 if (!po->running)
1301 return -EINVAL;
1302
1303 if (po->fanout)
1304 return -EALREADY;
1305
1306 mutex_lock(&fanout_mutex);
1307 match = NULL;
1308 list_for_each_entry(f, &fanout_list, list) {
1309 if (f->id == id &&
1310 read_pnet(&f->net) == sock_net(sk)) {
1311 match = f;
1312 break;
1313 }
1314 }
afe62c68 1315 err = -EINVAL;
7736d33f 1316 if (match && match->defrag != defrag)
afe62c68 1317 goto out;
dc99f600 1318 if (!match) {
afe62c68 1319 err = -ENOMEM;
dc99f600 1320 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1321 if (!match)
1322 goto out;
1323 write_pnet(&match->net, sock_net(sk));
1324 match->id = id;
1325 match->type = type;
1326 match->defrag = defrag;
1327 atomic_set(&match->rr_cur, 0);
1328 INIT_LIST_HEAD(&match->list);
1329 spin_lock_init(&match->lock);
1330 atomic_set(&match->sk_ref, 0);
1331 match->prot_hook.type = po->prot_hook.type;
1332 match->prot_hook.dev = po->prot_hook.dev;
1333 match->prot_hook.func = packet_rcv_fanout;
1334 match->prot_hook.af_packet_priv = match;
1335 dev_add_pack(&match->prot_hook);
1336 list_add(&match->list, &fanout_list);
dc99f600 1337 }
afe62c68
ED
1338 err = -EINVAL;
1339 if (match->type == type &&
1340 match->prot_hook.type == po->prot_hook.type &&
1341 match->prot_hook.dev == po->prot_hook.dev) {
1342 err = -ENOSPC;
1343 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1344 __dev_remove_pack(&po->prot_hook);
1345 po->fanout = match;
1346 atomic_inc(&match->sk_ref);
1347 __fanout_link(sk, po);
1348 err = 0;
dc99f600
DM
1349 }
1350 }
afe62c68 1351out:
dc99f600
DM
1352 mutex_unlock(&fanout_mutex);
1353 return err;
1354}
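/* A minimal userspace sketch of joining the fanout group that fanout_add()
 * manages: the PACKET_FANOUT option packs the group id in the low 16 bits
 * and the type (plus flags such as PACKET_FANOUT_FLAG_DEFRAG) in the high
 * 16 bits. The socket must already be bound and running, matching the
 * -EINVAL check above. Error handling omitted.
 */
#include <linux/if_packet.h>
#include <sys/socket.h>

static int join_fanout_group(int fd, unsigned short group_id)
{
	int fanout_arg = group_id | (PACKET_FANOUT_HASH << 16);

	return setsockopt(fd, SOL_PACKET, PACKET_FANOUT,
			  &fanout_arg, sizeof(fanout_arg));
}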
1355
1356static void fanout_release(struct sock *sk)
1357{
1358 struct packet_sock *po = pkt_sk(sk);
1359 struct packet_fanout *f;
1360
1361 f = po->fanout;
1362 if (!f)
1363 return;
1364
1365 po->fanout = NULL;
1366
1367 mutex_lock(&fanout_mutex);
1368 if (atomic_dec_and_test(&f->sk_ref)) {
1369 list_del(&f->list);
1370 dev_remove_pack(&f->prot_hook);
1371 kfree(f);
1372 }
1373 mutex_unlock(&fanout_mutex);
1374}
1da177e4 1375
90ddc4f0 1376static const struct proto_ops packet_ops;
1da177e4 1377
90ddc4f0 1378static const struct proto_ops packet_ops_spkt;
1da177e4 1379
40d4e3df
ED
1380static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1381 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1382{
1383 struct sock *sk;
1384 struct sockaddr_pkt *spkt;
1385
1386 /*
1387 * When we registered the protocol we saved the socket in the data
1388 * field for just this event.
1389 */
1390
1391 sk = pt->af_packet_priv;
1ce4f28b 1392
1da177e4
LT
1393 /*
1394 * Yank back the headers [hope the device set this
1395 * right or kerboom...]
1396 *
1397 * Incoming packets have ll header pulled,
1398 * push it back.
1399 *
98e399f8 1400 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
 1401 * so that this procedure is a noop.
1402 */
1403
1404 if (skb->pkt_type == PACKET_LOOPBACK)
1405 goto out;
1406
09ad9bc7 1407 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1408 goto out;
1409
40d4e3df
ED
1410 skb = skb_share_check(skb, GFP_ATOMIC);
1411 if (skb == NULL)
1da177e4
LT
1412 goto oom;
1413
1414 /* drop any routing info */
adf30907 1415 skb_dst_drop(skb);
1da177e4 1416
84531c24
PO
1417 /* drop conntrack reference */
1418 nf_reset(skb);
1419
ffbc6111 1420 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1421
98e399f8 1422 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1423
1424 /*
1425 * The SOCK_PACKET socket receives _all_ frames.
1426 */
1427
1428 spkt->spkt_family = dev->type;
1429 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1430 spkt->spkt_protocol = skb->protocol;
1431
1432 /*
1433 * Charge the memory to the socket. This is done specifically
 1434 * to prevent sockets from using up all the memory.
1435 */
1436
40d4e3df 1437 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1438 return 0;
1439
1440out:
1441 kfree_skb(skb);
1442oom:
1443 return 0;
1444}
1445
1446
1447/*
1448 * Output a raw packet to a device layer. This bypasses all the other
1449 * protocol layers and you must therefore supply it with a complete frame
1450 */
1ce4f28b 1451
1da177e4
LT
1452static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
1453 struct msghdr *msg, size_t len)
1454{
1455 struct sock *sk = sock->sk;
40d4e3df 1456 struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
1a35ca80 1457 struct sk_buff *skb = NULL;
1da177e4 1458 struct net_device *dev;
40d4e3df 1459 __be16 proto = 0;
1da177e4 1460 int err;
3bdc0eba 1461 int extra_len = 0;
1ce4f28b 1462
1da177e4 1463 /*
1ce4f28b 1464 * Get and verify the address.
1da177e4
LT
1465 */
1466
40d4e3df 1467 if (saddr) {
1da177e4 1468 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1469 return -EINVAL;
1470 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1471 proto = saddr->spkt_protocol;
1472 } else
1473 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1474
1475 /*
1ce4f28b 1476 * Find the device first to size check it
1da177e4
LT
1477 */
1478
1479 saddr->spkt_device[13] = 0;
1a35ca80 1480retry:
654d1f8a
ED
1481 rcu_read_lock();
1482 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1483 err = -ENODEV;
1484 if (dev == NULL)
1485 goto out_unlock;
1ce4f28b 1486
d5e76b0a
DM
1487 err = -ENETDOWN;
1488 if (!(dev->flags & IFF_UP))
1489 goto out_unlock;
1490
1da177e4 1491 /*
40d4e3df
ED
1492 * You may not queue a frame bigger than the mtu. This is the lowest level
1493 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1494 */
1ce4f28b 1495
3bdc0eba
BG
1496 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1497 if (!netif_supports_nofcs(dev)) {
1498 err = -EPROTONOSUPPORT;
1499 goto out_unlock;
1500 }
1501 extra_len = 4; /* We're doing our own CRC */
1502 }
1503
1da177e4 1504 err = -EMSGSIZE;
3bdc0eba 1505 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1506 goto out_unlock;
1507
1a35ca80
ED
1508 if (!skb) {
1509 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1510 int tlen = dev->needed_tailroom;
1a35ca80
ED
1511 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1512
1513 rcu_read_unlock();
4ce40912 1514 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1515 if (skb == NULL)
1516 return -ENOBUFS;
1517 /* FIXME: Save some space for broken drivers that write a hard
1518 * header at transmission time by themselves. PPP is the notable
1519 * one here. This should really be fixed at the driver level.
1520 */
1521 skb_reserve(skb, reserved);
1522 skb_reset_network_header(skb);
1523
1524 /* Try to align data part correctly */
1525 if (hhlen) {
1526 skb->data -= hhlen;
1527 skb->tail -= hhlen;
1528 if (len < hhlen)
1529 skb_reset_network_header(skb);
1530 }
1531 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1532 if (err)
1533 goto out_free;
1534 goto retry;
1da177e4
LT
1535 }
1536
3bdc0eba 1537 if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
57f89bfa
BG
1538 /* Earlier code assumed this would be a VLAN pkt,
1539 * double-check this now that we have the actual
1540 * packet in hand.
1541 */
1542 struct ethhdr *ehdr;
1543 skb_reset_mac_header(skb);
1544 ehdr = eth_hdr(skb);
1545 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1546 err = -EMSGSIZE;
1547 goto out_unlock;
1548 }
1549 }
1a35ca80 1550
1da177e4
LT
1551 skb->protocol = proto;
1552 skb->dev = dev;
1553 skb->priority = sk->sk_priority;
2d37a186 1554 skb->mark = sk->sk_mark;
2244d07b 1555 err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
ed85b565
RC
1556 if (err < 0)
1557 goto out_unlock;
1da177e4 1558
3bdc0eba
BG
1559 if (unlikely(extra_len == 4))
1560 skb->no_fcs = 1;
1561
1da177e4 1562 dev_queue_xmit(skb);
654d1f8a 1563 rcu_read_unlock();
40d4e3df 1564 return len;
1da177e4 1565
1da177e4 1566out_unlock:
654d1f8a 1567 rcu_read_unlock();
1a35ca80
ED
1568out_free:
1569 kfree_skb(skb);
1da177e4
LT
1570 return err;
1571}
1da177e4 1572
eea49cc9 1573static unsigned int run_filter(const struct sk_buff *skb,
62ab0812 1574 const struct sock *sk,
dbcb5855 1575 unsigned int res)
1da177e4
LT
1576{
1577 struct sk_filter *filter;
fda9ef5d 1578
80f8f102
ED
1579 rcu_read_lock();
1580 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1581 if (filter != NULL)
0a14842f 1582 res = SK_RUN_FILTER(filter, skb);
80f8f102 1583 rcu_read_unlock();
1da177e4 1584
dbcb5855 1585 return res;
1da177e4
LT
1586}
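/* A minimal userspace sketch of how the filter consulted by run_filter()
 * gets attached: a one-instruction classic BPF program that accepts up to
 * 0xffff bytes of every packet, installed with SO_ATTACH_FILTER. Real
 * filters are normally generated, e.g. by libpcap. Error handling omitted.
 */
#include <linux/filter.h>
#include <sys/socket.h>

static int attach_accept_all_filter(int fd)
{
	static struct sock_filter code[] = {
		{ 0x06, 0, 0, 0x0000ffff },	/* BPF_RET | BPF_K: accept 0xffff bytes */
	};
	struct sock_fprog prog = {
		.len = sizeof(code) / sizeof(code[0]),
		.filter = code,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
}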
1587
1588/*
62ab0812
ED
 1589 * This function performs lazy skb cloning in the hope that most packets
 1590 * are discarded by BPF.
1591 *
1592 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
1593 * and skb->cb are mangled. It works because (and until) packets
1594 * falling here are owned by current CPU. Output packets are cloned
1595 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 1596 * sequentially, so that if we return the skb to its original state on exit,
1597 * we will not harm anyone.
1da177e4
LT
1598 */
1599
40d4e3df
ED
1600static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1601 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1602{
1603 struct sock *sk;
1604 struct sockaddr_ll *sll;
1605 struct packet_sock *po;
40d4e3df 1606 u8 *skb_head = skb->data;
1da177e4 1607 int skb_len = skb->len;
dbcb5855 1608 unsigned int snaplen, res;
1da177e4
LT
1609
1610 if (skb->pkt_type == PACKET_LOOPBACK)
1611 goto drop;
1612
1613 sk = pt->af_packet_priv;
1614 po = pkt_sk(sk);
1615
09ad9bc7 1616 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1617 goto drop;
1618
1da177e4
LT
1619 skb->dev = dev;
1620
3b04ddde 1621 if (dev->header_ops) {
1da177e4 1622 /* The device has an explicit notion of ll header,
62ab0812
ED
1623 * exported to higher levels.
1624 *
1625 * Otherwise, the device hides details of its frame
1626 * structure, so that corresponding packet head is
1627 * never delivered to user.
1da177e4
LT
1628 */
1629 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1630 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1631 else if (skb->pkt_type == PACKET_OUTGOING) {
1632 /* Special case: outgoing packets have ll header at head */
bbe735e4 1633 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1634 }
1635 }
1636
1637 snaplen = skb->len;
1638
dbcb5855
DM
1639 res = run_filter(skb, sk, snaplen);
1640 if (!res)
fda9ef5d 1641 goto drop_n_restore;
dbcb5855
DM
1642 if (snaplen > res)
1643 snaplen = res;
1da177e4 1644
0fd7bac6 1645 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
1646 goto drop_n_acct;
1647
1648 if (skb_shared(skb)) {
1649 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1650 if (nskb == NULL)
1651 goto drop_n_acct;
1652
1653 if (skb_head != skb->data) {
1654 skb->data = skb_head;
1655 skb->len = skb_len;
1656 }
1657 kfree_skb(skb);
1658 skb = nskb;
1659 }
1660
ffbc6111
HX
1661 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
1662 sizeof(skb->cb));
1663
1664 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4
LT
1665 sll->sll_family = AF_PACKET;
1666 sll->sll_hatype = dev->type;
1667 sll->sll_protocol = skb->protocol;
1668 sll->sll_pkttype = skb->pkt_type;
8032b464 1669 if (unlikely(po->origdev))
80feaacb
PWJ
1670 sll->sll_ifindex = orig_dev->ifindex;
1671 else
1672 sll->sll_ifindex = dev->ifindex;
1da177e4 1673
b95cce35 1674 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 1675
ffbc6111 1676 PACKET_SKB_CB(skb)->origlen = skb->len;
8dc41944 1677
1da177e4
LT
1678 if (pskb_trim(skb, snaplen))
1679 goto drop_n_acct;
1680
1681 skb_set_owner_r(skb, sk);
1682 skb->dev = NULL;
adf30907 1683 skb_dst_drop(skb);
1da177e4 1684
84531c24
PO
1685 /* drop conntrack reference */
1686 nf_reset(skb);
1687
1da177e4
LT
1688 spin_lock(&sk->sk_receive_queue.lock);
1689 po->stats.tp_packets++;
3b885787 1690 skb->dropcount = atomic_read(&sk->sk_drops);
1da177e4
LT
1691 __skb_queue_tail(&sk->sk_receive_queue, skb);
1692 spin_unlock(&sk->sk_receive_queue.lock);
1693 sk->sk_data_ready(sk, skb->len);
1694 return 0;
1695
1696drop_n_acct:
7091fbd8
WB
1697 spin_lock(&sk->sk_receive_queue.lock);
1698 po->stats.tp_drops++;
1699 atomic_inc(&sk->sk_drops);
1700 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
1701
1702drop_n_restore:
1703 if (skb_head != skb->data && skb_shared(skb)) {
1704 skb->data = skb_head;
1705 skb->len = skb_len;
1706 }
1707drop:
ead2ceb0 1708 consume_skb(skb);
1da177e4
LT
1709 return 0;
1710}
1711
40d4e3df
ED
1712static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1713 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1714{
1715 struct sock *sk;
1716 struct packet_sock *po;
1717 struct sockaddr_ll *sll;
bbd6ef87
PM
1718 union {
1719 struct tpacket_hdr *h1;
1720 struct tpacket2_hdr *h2;
f6fb8f10 1721 struct tpacket3_hdr *h3;
bbd6ef87
PM
1722 void *raw;
1723 } h;
40d4e3df 1724 u8 *skb_head = skb->data;
1da177e4 1725 int skb_len = skb->len;
dbcb5855 1726 unsigned int snaplen, res;
f6fb8f10 1727 unsigned long status = TP_STATUS_USER;
bbd6ef87 1728 unsigned short macoff, netoff, hdrlen;
1da177e4 1729 struct sk_buff *copy_skb = NULL;
b7aa0bf7 1730 struct timeval tv;
bbd6ef87 1731 struct timespec ts;
614f60fa 1732 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
1da177e4
LT
1733
1734 if (skb->pkt_type == PACKET_LOOPBACK)
1735 goto drop;
1736
1737 sk = pt->af_packet_priv;
1738 po = pkt_sk(sk);
1739
09ad9bc7 1740 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1741 goto drop;
1742
3b04ddde 1743 if (dev->header_ops) {
1da177e4 1744 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1745 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1746 else if (skb->pkt_type == PACKET_OUTGOING) {
1747 /* Special case: outgoing packets have ll header at head */
bbe735e4 1748 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1749 }
1750 }
1751
8dc41944
HX
1752 if (skb->ip_summed == CHECKSUM_PARTIAL)
1753 status |= TP_STATUS_CSUMNOTREADY;
1754
1da177e4
LT
1755 snaplen = skb->len;
1756
dbcb5855
DM
1757 res = run_filter(skb, sk, snaplen);
1758 if (!res)
fda9ef5d 1759 goto drop_n_restore;
dbcb5855
DM
1760 if (snaplen > res)
1761 snaplen = res;
1da177e4
LT
1762
1763 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
1764 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1765 po->tp_reserve;
1da177e4 1766 } else {
95c96174 1767 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 1768 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
1769 (maclen < 16 ? 16 : maclen)) +
1770 po->tp_reserve;
1da177e4
LT
1771 macoff = netoff - maclen;
1772 }
f6fb8f10 1773 if (po->tp_version <= TPACKET_V2) {
1774 if (macoff + snaplen > po->rx_ring.frame_size) {
1775 if (po->copy_thresh &&
0fd7bac6 1776 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 1777 if (skb_shared(skb)) {
1778 copy_skb = skb_clone(skb, GFP_ATOMIC);
1779 } else {
1780 copy_skb = skb_get(skb);
1781 skb_head = skb->data;
1782 }
1783 if (copy_skb)
1784 skb_set_owner_r(copy_skb, sk);
1da177e4 1785 }
f6fb8f10 1786 snaplen = po->rx_ring.frame_size - macoff;
1787 if ((int)snaplen < 0)
1788 snaplen = 0;
1da177e4 1789 }
1da177e4 1790 }
1da177e4 1791 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 1792 h.raw = packet_current_rx_frame(po, skb,
1793 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 1794 if (!h.raw)
1da177e4 1795 goto ring_is_full;
f6fb8f10 1796 if (po->tp_version <= TPACKET_V2) {
1797 packet_increment_rx_head(po, &po->rx_ring);
1798 /*
1799 * LOSING will be reported till you read the stats,
1800 * because it's COR - Clear On Read.
 1801 * Anyway, moving it for V1/V2 only as V3 doesn't need this
1802 * at packet level.
1803 */
1804 if (po->stats.tp_drops)
1805 status |= TP_STATUS_LOSING;
1806 }
1da177e4
LT
1807 po->stats.tp_packets++;
1808 if (copy_skb) {
1809 status |= TP_STATUS_COPY;
1810 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1811 }
1da177e4
LT
1812 spin_unlock(&sk->sk_receive_queue.lock);
1813
bbd6ef87 1814 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
1da177e4 1815
bbd6ef87
PM
1816 switch (po->tp_version) {
1817 case TPACKET_V1:
1818 h.h1->tp_len = skb->len;
1819 h.h1->tp_snaplen = snaplen;
1820 h.h1->tp_mac = macoff;
1821 h.h1->tp_net = netoff;
614f60fa
SM
1822 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1823 && shhwtstamps->syststamp.tv64)
1824 tv = ktime_to_timeval(shhwtstamps->syststamp);
1825 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1826 && shhwtstamps->hwtstamp.tv64)
1827 tv = ktime_to_timeval(shhwtstamps->hwtstamp);
1828 else if (skb->tstamp.tv64)
bbd6ef87
PM
1829 tv = ktime_to_timeval(skb->tstamp);
1830 else
1831 do_gettimeofday(&tv);
1832 h.h1->tp_sec = tv.tv_sec;
1833 h.h1->tp_usec = tv.tv_usec;
1834 hdrlen = sizeof(*h.h1);
1835 break;
1836 case TPACKET_V2:
1837 h.h2->tp_len = skb->len;
1838 h.h2->tp_snaplen = snaplen;
1839 h.h2->tp_mac = macoff;
1840 h.h2->tp_net = netoff;
614f60fa
SM
1841 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1842 && shhwtstamps->syststamp.tv64)
1843 ts = ktime_to_timespec(shhwtstamps->syststamp);
1844 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1845 && shhwtstamps->hwtstamp.tv64)
1846 ts = ktime_to_timespec(shhwtstamps->hwtstamp);
1847 else if (skb->tstamp.tv64)
bbd6ef87
PM
1848 ts = ktime_to_timespec(skb->tstamp);
1849 else
1850 getnstimeofday(&ts);
1851 h.h2->tp_sec = ts.tv_sec;
1852 h.h2->tp_nsec = ts.tv_nsec;
a3bcc23e
BG
1853 if (vlan_tx_tag_present(skb)) {
1854 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
1855 status |= TP_STATUS_VLAN_VALID;
1856 } else {
1857 h.h2->tp_vlan_tci = 0;
1858 }
13fcb7bd 1859 h.h2->tp_padding = 0;
bbd6ef87
PM
1860 hdrlen = sizeof(*h.h2);
1861 break;
f6fb8f10 1862 case TPACKET_V3:
 1863 /* tp_nxt_offset and vlan are already populated above,
 1864 * so don't clear those fields here.
1865 */
1866 h.h3->tp_status |= status;
1867 h.h3->tp_len = skb->len;
1868 h.h3->tp_snaplen = snaplen;
1869 h.h3->tp_mac = macoff;
1870 h.h3->tp_net = netoff;
1871 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1872 && shhwtstamps->syststamp.tv64)
1873 ts = ktime_to_timespec(shhwtstamps->syststamp);
1874 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1875 && shhwtstamps->hwtstamp.tv64)
1876 ts = ktime_to_timespec(shhwtstamps->hwtstamp);
1877 else if (skb->tstamp.tv64)
1878 ts = ktime_to_timespec(skb->tstamp);
1879 else
1880 getnstimeofday(&ts);
1881 h.h3->tp_sec = ts.tv_sec;
1882 h.h3->tp_nsec = ts.tv_nsec;
1883 hdrlen = sizeof(*h.h3);
1884 break;
bbd6ef87
PM
1885 default:
1886 BUG();
1887 }
1da177e4 1888
bbd6ef87 1889 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 1890 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
1891 sll->sll_family = AF_PACKET;
1892 sll->sll_hatype = dev->type;
1893 sll->sll_protocol = skb->protocol;
1894 sll->sll_pkttype = skb->pkt_type;
8032b464 1895 if (unlikely(po->origdev))
80feaacb
PWJ
1896 sll->sll_ifindex = orig_dev->ifindex;
1897 else
1898 sll->sll_ifindex = dev->ifindex;
1da177e4 1899
e16aa207 1900 smp_mb();
f6dafa95 1901#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
1da177e4 1902 {
0af55bb5
CG
1903 u8 *start, *end;
1904
f6fb8f10 1905 if (po->tp_version <= TPACKET_V2) {
1906 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
1907 + macoff + snaplen);
1908 for (start = h.raw; start < end; start += PAGE_SIZE)
1909 flush_dcache_page(pgv_to_page(start));
1910 }
cc9f01b2 1911 smp_wmb();
1da177e4 1912 }
f6dafa95 1913#endif
f6fb8f10 1914 if (po->tp_version <= TPACKET_V2)
1915 __packet_set_status(po, h.raw, status);
1916 else
1917 prb_clear_blk_fill_status(&po->rx_ring);
1da177e4
LT
1918
1919 sk->sk_data_ready(sk, 0);
1920
1921drop_n_restore:
1922 if (skb_head != skb->data && skb_shared(skb)) {
1923 skb->data = skb_head;
1924 skb->len = skb_len;
1925 }
1926drop:
1ce4f28b 1927 kfree_skb(skb);
1da177e4
LT
1928 return 0;
1929
1930ring_is_full:
1931 po->stats.tp_drops++;
1932 spin_unlock(&sk->sk_receive_queue.lock);
1933
1934 sk->sk_data_ready(sk, 0);
acb5d75b 1935 kfree_skb(copy_skb);
1da177e4
LT
1936 goto drop_n_restore;
1937}
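/*
 * A minimal userspace sketch (not part of this file) of the consumer side of
 * the handshake tpacket_rcv() implements above for TPACKET_V2: the kernel
 * publishes a frame by setting TP_STATUS_USER in tp_status, and userspace
 * returns the slot by writing TP_STATUS_KERNEL.  Ring setup, mmap() and the
 * frame geometry are assumed to have been done already (see the
 * PACKET_RX_RING and packet_mmap() examples further down).
 */
#include <linux/if_packet.h>
#include <poll.h>
#include <stdio.h>

static void rx_ring_consume(int fd, char *ring, unsigned int frame_nr,
			    unsigned int frame_size)
{
	unsigned int idx = 0;

	for (;;) {
		struct tpacket2_hdr *hdr = (void *)(ring + idx * frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER)) {
			/* Slot still owned by the kernel: wait for data. */
			struct pollfd pfd = { .fd = fd, .events = POLLIN };
			poll(&pfd, 1, -1);
			continue;
		}
		/* Pair with the kernel's barrier before __packet_set_status(). */
		__sync_synchronize();

		/* Packet data starts tp_mac bytes into the frame. */
		printf("frame %u: %u bytes captured of %u on the wire\n",
		       idx, hdr->tp_snaplen, hdr->tp_len);

		/* Hand the slot back to tpacket_rcv(). */
		hdr->tp_status = TP_STATUS_KERNEL;
		idx = (idx + 1) % frame_nr;
	}
}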
1938
69e3c75f
JB
1939static void tpacket_destruct_skb(struct sk_buff *skb)
1940{
1941 struct packet_sock *po = pkt_sk(skb->sk);
40d4e3df 1942 void *ph;
1da177e4 1943
69e3c75f
JB
1944 if (likely(po->tx_ring.pg_vec)) {
1945 ph = skb_shinfo(skb)->destructor_arg;
1946 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
1947 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
1948 atomic_dec(&po->tx_ring.pending);
1949 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
1950 }
1951
1952 sock_wfree(skb);
1953}
1954
40d4e3df
ED
1955static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
1956 void *frame, struct net_device *dev, int size_max,
ae641949 1957 __be16 proto, unsigned char *addr, int hlen)
69e3c75f
JB
1958{
1959 union {
1960 struct tpacket_hdr *h1;
1961 struct tpacket2_hdr *h2;
1962 void *raw;
1963 } ph;
1964 int to_write, offset, len, tp_len, nr_frags, len_max;
1965 struct socket *sock = po->sk.sk_socket;
1966 struct page *page;
1967 void *data;
1968 int err;
1969
1970 ph.raw = frame;
1971
1972 skb->protocol = proto;
1973 skb->dev = dev;
1974 skb->priority = po->sk.sk_priority;
2d37a186 1975 skb->mark = po->sk.sk_mark;
69e3c75f
JB
1976 skb_shinfo(skb)->destructor_arg = ph.raw;
1977
1978 switch (po->tp_version) {
1979 case TPACKET_V2:
1980 tp_len = ph.h2->tp_len;
1981 break;
1982 default:
1983 tp_len = ph.h1->tp_len;
1984 break;
1985 }
1986 if (unlikely(tp_len > size_max)) {
40d4e3df 1987 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
69e3c75f
JB
1988 return -EMSGSIZE;
1989 }
1990
ae641949 1991 skb_reserve(skb, hlen);
69e3c75f
JB
1992 skb_reset_network_header(skb);
1993
1994 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
1995 to_write = tp_len;
1996
1997 if (sock->type == SOCK_DGRAM) {
1998 err = dev_hard_header(skb, dev, ntohs(proto), addr,
1999 NULL, tp_len);
2000 if (unlikely(err < 0))
2001 return -EINVAL;
40d4e3df 2002 } else if (dev->hard_header_len) {
69e3c75f
JB
2003 /* net device doesn't like empty head */
2004 if (unlikely(tp_len <= dev->hard_header_len)) {
40d4e3df
ED
2005 pr_err("packet size is too short (%d < %d)\n",
2006 tp_len, dev->hard_header_len);
69e3c75f
JB
2007 return -EINVAL;
2008 }
2009
2010 skb_push(skb, dev->hard_header_len);
2011 err = skb_store_bits(skb, 0, data,
2012 dev->hard_header_len);
2013 if (unlikely(err))
2014 return err;
2015
2016 data += dev->hard_header_len;
2017 to_write -= dev->hard_header_len;
2018 }
2019
2020 err = -EFAULT;
69e3c75f
JB
2021 offset = offset_in_page(data);
2022 len_max = PAGE_SIZE - offset;
2023 len = ((to_write > len_max) ? len_max : to_write);
2024
2025 skb->data_len = to_write;
2026 skb->len += to_write;
2027 skb->truesize += to_write;
2028 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2029
2030 while (likely(to_write)) {
2031 nr_frags = skb_shinfo(skb)->nr_frags;
2032
2033 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
 2034 pr_err("packet exceeds the maximum number of skb frags (%lu)\n",
 2035 MAX_SKB_FRAGS);
69e3c75f
JB
2036 return -EFAULT;
2037 }
2038
0af55bb5
CG
2039 page = pgv_to_page(data);
2040 data += len;
69e3c75f
JB
2041 flush_dcache_page(page);
2042 get_page(page);
0af55bb5 2043 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2044 to_write -= len;
2045 offset = 0;
2046 len_max = PAGE_SIZE;
2047 len = ((to_write > len_max) ? len_max : to_write);
2048 }
2049
2050 return tp_len;
2051}
2052
2053static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2054{
69e3c75f
JB
2055 struct sk_buff *skb;
2056 struct net_device *dev;
2057 __be16 proto;
827d9780
BG
2058 bool need_rls_dev = false;
2059 int err, reserve = 0;
40d4e3df
ED
2060 void *ph;
2061 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
69e3c75f
JB
2062 int tp_len, size_max;
2063 unsigned char *addr;
2064 int len_sum = 0;
2065 int status = 0;
ae641949 2066 int hlen, tlen;
69e3c75f 2067
69e3c75f
JB
2068 mutex_lock(&po->pg_vec_lock);
2069
2070 err = -EBUSY;
2071 if (saddr == NULL) {
827d9780 2072 dev = po->prot_hook.dev;
69e3c75f
JB
2073 proto = po->num;
2074 addr = NULL;
2075 } else {
2076 err = -EINVAL;
2077 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2078 goto out;
2079 if (msg->msg_namelen < (saddr->sll_halen
2080 + offsetof(struct sockaddr_ll,
2081 sll_addr)))
2082 goto out;
69e3c75f
JB
2083 proto = saddr->sll_protocol;
2084 addr = saddr->sll_addr;
827d9780
BG
2085 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2086 need_rls_dev = true;
69e3c75f
JB
2087 }
2088
69e3c75f
JB
2089 err = -ENXIO;
2090 if (unlikely(dev == NULL))
2091 goto out;
2092
2093 reserve = dev->hard_header_len;
2094
2095 err = -ENETDOWN;
2096 if (unlikely(!(dev->flags & IFF_UP)))
2097 goto out_put;
2098
2099 size_max = po->tx_ring.frame_size
b5dd884e 2100 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f
JB
2101
2102 if (size_max > dev->mtu + reserve)
2103 size_max = dev->mtu + reserve;
2104
2105 do {
2106 ph = packet_current_frame(po, &po->tx_ring,
2107 TP_STATUS_SEND_REQUEST);
2108
2109 if (unlikely(ph == NULL)) {
2110 schedule();
2111 continue;
2112 }
2113
2114 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2115 hlen = LL_RESERVED_SPACE(dev);
2116 tlen = dev->needed_tailroom;
69e3c75f 2117 skb = sock_alloc_send_skb(&po->sk,
ae641949 2118 hlen + tlen + sizeof(struct sockaddr_ll),
69e3c75f
JB
2119 0, &err);
2120
2121 if (unlikely(skb == NULL))
2122 goto out_status;
2123
2124 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
ae641949 2125 addr, hlen);
69e3c75f
JB
2126
2127 if (unlikely(tp_len < 0)) {
2128 if (po->tp_loss) {
2129 __packet_set_status(po, ph,
2130 TP_STATUS_AVAILABLE);
2131 packet_increment_head(&po->tx_ring);
2132 kfree_skb(skb);
2133 continue;
2134 } else {
2135 status = TP_STATUS_WRONG_FORMAT;
2136 err = tp_len;
2137 goto out_status;
2138 }
2139 }
2140
2141 skb->destructor = tpacket_destruct_skb;
2142 __packet_set_status(po, ph, TP_STATUS_SENDING);
2143 atomic_inc(&po->tx_ring.pending);
2144
2145 status = TP_STATUS_SEND_REQUEST;
2146 err = dev_queue_xmit(skb);
eb70df13
JP
2147 if (unlikely(err > 0)) {
2148 err = net_xmit_errno(err);
2149 if (err && __packet_get_status(po, ph) ==
2150 TP_STATUS_AVAILABLE) {
2151 /* skb was destructed already */
2152 skb = NULL;
2153 goto out_status;
2154 }
2155 /*
2156 * skb was dropped but not destructed yet;
2157 * let's treat it like congestion or err < 0
2158 */
2159 err = 0;
2160 }
69e3c75f
JB
2161 packet_increment_head(&po->tx_ring);
2162 len_sum += tp_len;
f64f9e71
JP
2163 } while (likely((ph != NULL) ||
2164 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
2165 (atomic_read(&po->tx_ring.pending))))
2166 );
69e3c75f
JB
2167
2168 err = len_sum;
2169 goto out_put;
2170
69e3c75f
JB
2171out_status:
2172 __packet_set_status(po, ph, status);
2173 kfree_skb(skb);
2174out_put:
827d9780
BG
2175 if (need_rls_dev)
2176 dev_put(dev);
69e3c75f
JB
2177out:
2178 mutex_unlock(&po->pg_vec_lock);
2179 return err;
2180}
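/*
 * A minimal userspace sketch (not part of this file) of the producer side of
 * tpacket_snd() above, for a TPACKET_V2 TX ring: fill one frame, mark it
 * TP_STATUS_SEND_REQUEST and kick the kernel with send().  The ring is
 * assumed to have been configured with PACKET_TX_RING and mmap()ed already;
 * "frame" points at one slot and pkt/pkt_len hold a complete Ethernet frame.
 */
#include <linux/if_packet.h>
#include <string.h>
#include <sys/socket.h>

static int tx_ring_send_one(int fd, void *frame, const void *pkt,
			    unsigned int pkt_len)
{
	struct tpacket2_hdr *hdr = frame;

	if (hdr->tp_status != TP_STATUS_AVAILABLE)
		return -1;		/* slot still owned by the kernel */

	/* Data sits right after the header, where tpacket_fill_skb() reads it. */
	memcpy((char *)frame + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll),
	       pkt, pkt_len);
	hdr->tp_len = pkt_len;

	__sync_synchronize();	/* publish the data before the status word */
	hdr->tp_status = TP_STATUS_SEND_REQUEST;

	/* Wake tpacket_snd(); MSG_DONTWAIT returns instead of looping. */
	return send(fd, NULL, 0, MSG_DONTWAIT) < 0 ? -1 : 0;
}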
69e3c75f 2181
eea49cc9
OJ
2182static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2183 size_t reserve, size_t len,
2184 size_t linear, int noblock,
2185 int *err)
bfd5f4a3
SS
2186{
2187 struct sk_buff *skb;
2188
2189 /* Under a page? Don't bother with paged skb. */
2190 if (prepad + len < PAGE_SIZE || !linear)
2191 linear = len;
2192
2193 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2194 err);
2195 if (!skb)
2196 return NULL;
2197
2198 skb_reserve(skb, reserve);
2199 skb_put(skb, linear);
2200 skb->data_len = len - linear;
2201 skb->len += len - linear;
2202
2203 return skb;
2204}
2205
69e3c75f 2206static int packet_snd(struct socket *sock,
1da177e4
LT
2207 struct msghdr *msg, size_t len)
2208{
2209 struct sock *sk = sock->sk;
40d4e3df 2210 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1da177e4
LT
2211 struct sk_buff *skb;
2212 struct net_device *dev;
0e11c91e 2213 __be16 proto;
827d9780 2214 bool need_rls_dev = false;
1da177e4 2215 unsigned char *addr;
827d9780 2216 int err, reserve = 0;
bfd5f4a3
SS
2217 struct virtio_net_hdr vnet_hdr = { 0 };
2218 int offset = 0;
2219 int vnet_hdr_len;
2220 struct packet_sock *po = pkt_sk(sk);
2221 unsigned short gso_type = 0;
ae641949 2222 int hlen, tlen;
3bdc0eba 2223 int extra_len = 0;
1da177e4
LT
2224
2225 /*
1ce4f28b 2226 * Get and verify the address.
1da177e4 2227 */
1ce4f28b 2228
1da177e4 2229 if (saddr == NULL) {
827d9780 2230 dev = po->prot_hook.dev;
1da177e4
LT
2231 proto = po->num;
2232 addr = NULL;
2233 } else {
2234 err = -EINVAL;
2235 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2236 goto out;
0fb375fb
EB
2237 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2238 goto out;
1da177e4
LT
2239 proto = saddr->sll_protocol;
2240 addr = saddr->sll_addr;
827d9780
BG
2241 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2242 need_rls_dev = true;
1da177e4
LT
2243 }
2244
1da177e4
LT
2245 err = -ENXIO;
2246 if (dev == NULL)
2247 goto out_unlock;
2248 if (sock->type == SOCK_RAW)
2249 reserve = dev->hard_header_len;
2250
d5e76b0a
DM
2251 err = -ENETDOWN;
2252 if (!(dev->flags & IFF_UP))
2253 goto out_unlock;
2254
bfd5f4a3
SS
2255 if (po->has_vnet_hdr) {
2256 vnet_hdr_len = sizeof(vnet_hdr);
2257
2258 err = -EINVAL;
2259 if (len < vnet_hdr_len)
2260 goto out_unlock;
2261
2262 len -= vnet_hdr_len;
2263
2264 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
2265 vnet_hdr_len);
2266 if (err < 0)
2267 goto out_unlock;
2268
2269 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2270 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
2271 vnet_hdr.hdr_len))
2272 vnet_hdr.hdr_len = vnet_hdr.csum_start +
2273 vnet_hdr.csum_offset + 2;
2274
2275 err = -EINVAL;
2276 if (vnet_hdr.hdr_len > len)
2277 goto out_unlock;
2278
2279 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2280 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2281 case VIRTIO_NET_HDR_GSO_TCPV4:
2282 gso_type = SKB_GSO_TCPV4;
2283 break;
2284 case VIRTIO_NET_HDR_GSO_TCPV6:
2285 gso_type = SKB_GSO_TCPV6;
2286 break;
2287 case VIRTIO_NET_HDR_GSO_UDP:
2288 gso_type = SKB_GSO_UDP;
2289 break;
2290 default:
2291 goto out_unlock;
2292 }
2293
2294 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2295 gso_type |= SKB_GSO_TCP_ECN;
2296
2297 if (vnet_hdr.gso_size == 0)
2298 goto out_unlock;
2299
2300 }
2301 }
2302
3bdc0eba
BG
2303 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2304 if (!netif_supports_nofcs(dev)) {
2305 err = -EPROTONOSUPPORT;
2306 goto out_unlock;
2307 }
2308 extra_len = 4; /* We're doing our own CRC */
2309 }
2310
1da177e4 2311 err = -EMSGSIZE;
3bdc0eba 2312 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2313 goto out_unlock;
2314
bfd5f4a3 2315 err = -ENOBUFS;
ae641949
HX
2316 hlen = LL_RESERVED_SPACE(dev);
2317 tlen = dev->needed_tailroom;
2318 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,
bfd5f4a3 2319 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2320 if (skb == NULL)
1da177e4
LT
2321 goto out_unlock;
2322
bfd5f4a3 2323 skb_set_network_header(skb, reserve);
1da177e4 2324
0c4e8581
SH
2325 err = -EINVAL;
2326 if (sock->type == SOCK_DGRAM &&
bfd5f4a3 2327 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
0c4e8581 2328 goto out_free;
1da177e4
LT
2329
2330 /* Returns -EFAULT on error */
bfd5f4a3 2331 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1da177e4
LT
2332 if (err)
2333 goto out_free;
2244d07b 2334 err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
ed85b565
RC
2335 if (err < 0)
2336 goto out_free;
1da177e4 2337
3bdc0eba 2338 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
57f89bfa
BG
2339 /* Earlier code assumed this would be a VLAN pkt,
2340 * double-check this now that we have the actual
2341 * packet in hand.
2342 */
2343 struct ethhdr *ehdr;
2344 skb_reset_mac_header(skb);
2345 ehdr = eth_hdr(skb);
2346 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2347 err = -EMSGSIZE;
2348 goto out_free;
2349 }
2350 }
2351
1da177e4
LT
2352 skb->protocol = proto;
2353 skb->dev = dev;
2354 skb->priority = sk->sk_priority;
2d37a186 2355 skb->mark = sk->sk_mark;
1da177e4 2356
bfd5f4a3
SS
2357 if (po->has_vnet_hdr) {
2358 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2359 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
2360 vnet_hdr.csum_offset)) {
2361 err = -EINVAL;
2362 goto out_free;
2363 }
2364 }
2365
2366 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
2367 skb_shinfo(skb)->gso_type = gso_type;
2368
2369 /* Header must be checked, and gso_segs computed. */
2370 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2371 skb_shinfo(skb)->gso_segs = 0;
2372
2373 len += vnet_hdr_len;
2374 }
2375
3bdc0eba
BG
2376 if (unlikely(extra_len == 4))
2377 skb->no_fcs = 1;
2378
1da177e4
LT
2379 /*
2380 * Now send it
2381 */
2382
2383 err = dev_queue_xmit(skb);
2384 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2385 goto out_unlock;
2386
827d9780
BG
2387 if (need_rls_dev)
2388 dev_put(dev);
1da177e4 2389
40d4e3df 2390 return len;
1da177e4
LT
2391
2392out_free:
2393 kfree_skb(skb);
2394out_unlock:
827d9780 2395 if (dev && need_rls_dev)
1da177e4
LT
2396 dev_put(dev);
2397out:
2398 return err;
2399}
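/*
 * A minimal userspace sketch (not part of this file) of the non-ring
 * transmit path served by packet_snd() above: one raw Ethernet frame sent
 * with sendto(), addressed by interface index in struct sockaddr_ll.  The
 * "eth0" interface name is an illustrative assumption; for SOCK_RAW the
 * frame already carries its link-layer header, so sll_addr is only consulted
 * on SOCK_DGRAM sockets.
 */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>

static int send_raw_frame(int fd, const unsigned char *frame, size_t len)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex  = if_nametoindex("eth0");	/* assumed interface */
	sll.sll_halen    = ETH_ALEN;
	memcpy(sll.sll_addr, frame, ETH_ALEN);		/* destination MAC */

	return sendto(fd, frame, len, 0,
		      (struct sockaddr *)&sll, sizeof(sll)) < 0 ? -1 : 0;
}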
2400
69e3c75f
JB
2401static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2402 struct msghdr *msg, size_t len)
2403{
69e3c75f
JB
2404 struct sock *sk = sock->sk;
2405 struct packet_sock *po = pkt_sk(sk);
2406 if (po->tx_ring.pg_vec)
2407 return tpacket_snd(po, msg);
2408 else
69e3c75f
JB
2409 return packet_snd(sock, msg, len);
2410}
2411
1da177e4
LT
2412/*
2413 * Close a PACKET socket. This is fairly simple. We immediately go
2414 * to 'closed' state and remove our protocol entry in the device list.
2415 */
2416
2417static int packet_release(struct socket *sock)
2418{
2419 struct sock *sk = sock->sk;
2420 struct packet_sock *po;
d12d01d6 2421 struct net *net;
f6fb8f10 2422 union tpacket_req_u req_u;
1da177e4
LT
2423
2424 if (!sk)
2425 return 0;
2426
3b1e0a65 2427 net = sock_net(sk);
1da177e4
LT
2428 po = pkt_sk(sk);
2429
808f5114 2430 spin_lock_bh(&net->packet.sklist_lock);
2431 sk_del_node_init_rcu(sk);
920de804 2432 sock_prot_inuse_add(net, sk->sk_prot, -1);
808f5114 2433 spin_unlock_bh(&net->packet.sklist_lock);
1da177e4 2434
808f5114 2435 spin_lock(&po->bind_lock);
ce06b03e 2436 unregister_prot_hook(sk, false);
160ff18a
BG
2437 if (po->prot_hook.dev) {
2438 dev_put(po->prot_hook.dev);
2439 po->prot_hook.dev = NULL;
2440 }
808f5114 2441 spin_unlock(&po->bind_lock);
1da177e4 2442
1da177e4 2443 packet_flush_mclist(sk);
1da177e4 2444
f6fb8f10 2445 memset(&req_u, 0, sizeof(req_u));
69e3c75f
JB
2446
2447 if (po->rx_ring.pg_vec)
f6fb8f10 2448 packet_set_ring(sk, &req_u, 1, 0);
69e3c75f
JB
2449
2450 if (po->tx_ring.pg_vec)
f6fb8f10 2451 packet_set_ring(sk, &req_u, 1, 1);
1da177e4 2452
dc99f600
DM
2453 fanout_release(sk);
2454
808f5114 2455 synchronize_net();
1da177e4
LT
2456 /*
2457 * Now the socket is dead. No more input will appear.
2458 */
1da177e4
LT
2459 sock_orphan(sk);
2460 sock->sk = NULL;
2461
2462 /* Purge queues */
2463
2464 skb_queue_purge(&sk->sk_receive_queue);
17ab56a2 2465 sk_refcnt_debug_release(sk);
1da177e4
LT
2466
2467 sock_put(sk);
2468 return 0;
2469}
2470
2471/*
2472 * Attach a packet hook.
2473 */
2474
0e11c91e 2475static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1da177e4
LT
2476{
2477 struct packet_sock *po = pkt_sk(sk);
dc99f600 2478
aef950b4
WY
2479 if (po->fanout) {
2480 if (dev)
2481 dev_put(dev);
2482
dc99f600 2483 return -EINVAL;
aef950b4 2484 }
1da177e4
LT
2485
2486 lock_sock(sk);
2487
2488 spin_lock(&po->bind_lock);
ce06b03e 2489 unregister_prot_hook(sk, true);
1da177e4
LT
2490 po->num = protocol;
2491 po->prot_hook.type = protocol;
160ff18a
BG
2492 if (po->prot_hook.dev)
2493 dev_put(po->prot_hook.dev);
1da177e4
LT
2494 po->prot_hook.dev = dev;
2495
2496 po->ifindex = dev ? dev->ifindex : 0;
2497
2498 if (protocol == 0)
2499 goto out_unlock;
2500
be85d4ad 2501 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2502 register_prot_hook(sk);
be85d4ad
UT
2503 } else {
2504 sk->sk_err = ENETDOWN;
2505 if (!sock_flag(sk, SOCK_DEAD))
2506 sk->sk_error_report(sk);
1da177e4
LT
2507 }
2508
2509out_unlock:
2510 spin_unlock(&po->bind_lock);
2511 release_sock(sk);
2512 return 0;
2513}
2514
2515/*
2516 * Bind a packet socket to a device
2517 */
2518
40d4e3df
ED
2519static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2520 int addr_len)
1da177e4 2521{
40d4e3df 2522 struct sock *sk = sock->sk;
1da177e4
LT
2523 char name[15];
2524 struct net_device *dev;
2525 int err = -ENODEV;
1ce4f28b 2526
1da177e4
LT
2527 /*
2528 * Check legality
2529 */
1ce4f28b 2530
8ae55f04 2531 if (addr_len != sizeof(struct sockaddr))
1da177e4 2532 return -EINVAL;
40d4e3df 2533 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2534
3b1e0a65 2535 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2536 if (dev)
1da177e4 2537 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2538 return err;
2539}
1da177e4
LT
2540
2541static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2542{
40d4e3df
ED
2543 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2544 struct sock *sk = sock->sk;
1da177e4
LT
2545 struct net_device *dev = NULL;
2546 int err;
2547
2548
2549 /*
2550 * Check legality
2551 */
1ce4f28b 2552
1da177e4
LT
2553 if (addr_len < sizeof(struct sockaddr_ll))
2554 return -EINVAL;
2555 if (sll->sll_family != AF_PACKET)
2556 return -EINVAL;
2557
2558 if (sll->sll_ifindex) {
2559 err = -ENODEV;
3b1e0a65 2560 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2561 if (dev == NULL)
2562 goto out;
2563 }
2564 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2565
2566out:
2567 return err;
2568}
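/*
 * A minimal userspace sketch (not part of this file) of packet_bind() usage
 * above: bind the socket to one interface so only its traffic is delivered.
 * The interface name is the caller's; a protocol of htons(ETH_P_ALL) keeps
 * capturing every ethertype, while 0 would retain the protocol the socket
 * was created with, as packet_do_bind() implements.
 */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>

static int bind_to_iface(int fd, const char *ifname)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex  = if_nametoindex(ifname);

	return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}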
2569
2570static struct proto packet_proto = {
2571 .name = "PACKET",
2572 .owner = THIS_MODULE,
2573 .obj_size = sizeof(struct packet_sock),
2574};
2575
2576/*
1ce4f28b 2577 * Create a packet socket of type SOCK_RAW, SOCK_DGRAM or SOCK_PACKET.
1da177e4
LT
2578 */
2579
3f378b68
EP
2580static int packet_create(struct net *net, struct socket *sock, int protocol,
2581 int kern)
1da177e4
LT
2582{
2583 struct sock *sk;
2584 struct packet_sock *po;
0e11c91e 2585 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2586 int err;
2587
2588 if (!capable(CAP_NET_RAW))
2589 return -EPERM;
be02097c
DM
2590 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2591 sock->type != SOCK_PACKET)
1da177e4
LT
2592 return -ESOCKTNOSUPPORT;
2593
2594 sock->state = SS_UNCONNECTED;
2595
2596 err = -ENOBUFS;
6257ff21 2597 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1da177e4
LT
2598 if (sk == NULL)
2599 goto out;
2600
2601 sock->ops = &packet_ops;
1da177e4
LT
2602 if (sock->type == SOCK_PACKET)
2603 sock->ops = &packet_ops_spkt;
be02097c 2604
1da177e4
LT
2605 sock_init_data(sock, sk);
2606
2607 po = pkt_sk(sk);
2608 sk->sk_family = PF_PACKET;
0e11c91e 2609 po->num = proto;
1da177e4
LT
2610
2611 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2612 sk_refcnt_debug_inc(sk);
1da177e4
LT
2613
2614 /*
2615 * Attach a protocol block
2616 */
2617
2618 spin_lock_init(&po->bind_lock);
905db440 2619 mutex_init(&po->pg_vec_lock);
1da177e4 2620 po->prot_hook.func = packet_rcv;
be02097c 2621
1da177e4
LT
2622 if (sock->type == SOCK_PACKET)
2623 po->prot_hook.func = packet_rcv_spkt;
be02097c 2624
1da177e4
LT
2625 po->prot_hook.af_packet_priv = sk;
2626
0e11c91e
AV
2627 if (proto) {
2628 po->prot_hook.type = proto;
ce06b03e 2629 register_prot_hook(sk);
1da177e4
LT
2630 }
2631
808f5114 2632 spin_lock_bh(&net->packet.sklist_lock);
2633 sk_add_node_rcu(sk, &net->packet.sklist);
3680453c 2634 sock_prot_inuse_add(net, &packet_proto, 1);
808f5114 2635 spin_unlock_bh(&net->packet.sklist_lock);
2636
40d4e3df 2637 return 0;
1da177e4
LT
2638out:
2639 return err;
2640}
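/*
 * A minimal userspace sketch (not part of this file) of what packet_create()
 * above services: opening a raw packet socket.  CAP_NET_RAW is required, as
 * the capable() check at the top of packet_create() enforces.
 */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <sys/socket.h>

static int open_packet_socket(void)
{
	/* SOCK_RAW delivers frames with the link-layer header included;
	 * SOCK_DGRAM would strip it.  ETH_P_ALL captures every ethertype. */
	return socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
}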
2641
ed85b565
RC
2642static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
2643{
2644 struct sock_exterr_skb *serr;
2645 struct sk_buff *skb, *skb2;
2646 int copied, err;
2647
2648 err = -EAGAIN;
2649 skb = skb_dequeue(&sk->sk_error_queue);
2650 if (skb == NULL)
2651 goto out;
2652
2653 copied = skb->len;
2654 if (copied > len) {
2655 msg->msg_flags |= MSG_TRUNC;
2656 copied = len;
2657 }
2658 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2659 if (err)
2660 goto out_free_skb;
2661
2662 sock_recv_timestamp(msg, sk, skb);
2663
2664 serr = SKB_EXT_ERR(skb);
2665 put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
2666 sizeof(serr->ee), &serr->ee);
2667
2668 msg->msg_flags |= MSG_ERRQUEUE;
2669 err = copied;
2670
2671 /* Reset and regenerate socket error */
2672 spin_lock_bh(&sk->sk_error_queue.lock);
2673 sk->sk_err = 0;
2674 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2675 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2676 spin_unlock_bh(&sk->sk_error_queue.lock);
2677 sk->sk_error_report(sk);
2678 } else
2679 spin_unlock_bh(&sk->sk_error_queue.lock);
2680
2681out_free_skb:
2682 kfree_skb(skb);
2683out:
2684 return err;
2685}
2686
1da177e4
LT
2687/*
2688 * Pull a packet from our receive queue and hand it to the user.
2689 * If necessary we block.
2690 */
2691
2692static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2693 struct msghdr *msg, size_t len, int flags)
2694{
2695 struct sock *sk = sock->sk;
2696 struct sk_buff *skb;
2697 int copied, err;
0fb375fb 2698 struct sockaddr_ll *sll;
bfd5f4a3 2699 int vnet_hdr_len = 0;
1da177e4
LT
2700
2701 err = -EINVAL;
ed85b565 2702 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2703 goto out;
2704
2705#if 0
2706 /* What error should we return now? EUNATTACH? */
2707 if (pkt_sk(sk)->ifindex < 0)
2708 return -ENODEV;
2709#endif
2710
ed85b565
RC
2711 if (flags & MSG_ERRQUEUE) {
2712 err = packet_recv_error(sk, msg, len);
2713 goto out;
2714 }
2715
1da177e4
LT
2716 /*
2717 * Call the generic datagram receiver. This handles all sorts
2718 * of horrible races and re-entrancy so we can forget about it
2719 * in the protocol layers.
2720 *
 2721 * Now it will return ENETDOWN if the device has just gone down,
2722 * but then it will block.
2723 */
2724
40d4e3df 2725 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
2726
2727 /*
1ce4f28b 2728 * An error occurred, so return it. Because skb_recv_datagram()
1da177e4
LT
 2729 * handles the blocking for us, we don't need to see or worry about
 2730 * blocking retries.
2731 */
2732
8ae55f04 2733 if (skb == NULL)
1da177e4
LT
2734 goto out;
2735
bfd5f4a3
SS
2736 if (pkt_sk(sk)->has_vnet_hdr) {
2737 struct virtio_net_hdr vnet_hdr = { 0 };
2738
2739 err = -EINVAL;
2740 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 2741 if (len < vnet_hdr_len)
bfd5f4a3
SS
2742 goto out_free;
2743
1f18b717
MK
2744 len -= vnet_hdr_len;
2745
bfd5f4a3
SS
2746 if (skb_is_gso(skb)) {
2747 struct skb_shared_info *sinfo = skb_shinfo(skb);
2748
2749 /* This is a hint as to how much should be linear. */
2750 vnet_hdr.hdr_len = skb_headlen(skb);
2751 vnet_hdr.gso_size = sinfo->gso_size;
2752 if (sinfo->gso_type & SKB_GSO_TCPV4)
2753 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2754 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2755 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2756 else if (sinfo->gso_type & SKB_GSO_UDP)
2757 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2758 else if (sinfo->gso_type & SKB_GSO_FCOE)
2759 goto out_free;
2760 else
2761 BUG();
2762 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2763 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2764 } else
2765 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2766
2767 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2768 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
55508d60 2769 vnet_hdr.csum_start = skb_checksum_start_offset(skb);
bfd5f4a3 2770 vnet_hdr.csum_offset = skb->csum_offset;
10a8d94a
JW
2771 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2772 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
2773 } /* else everything is zero */
2774
2775 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
2776 vnet_hdr_len);
2777 if (err < 0)
2778 goto out_free;
2779 }
2780
0fb375fb
EB
2781 /*
2782 * If the address length field is there to be filled in, we fill
2783 * it in now.
2784 */
2785
ffbc6111 2786 sll = &PACKET_SKB_CB(skb)->sa.ll;
0fb375fb
EB
2787 if (sock->type == SOCK_PACKET)
2788 msg->msg_namelen = sizeof(struct sockaddr_pkt);
2789 else
2790 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
2791
1da177e4
LT
2792 /*
 2793 * You lose any data beyond the buffer you gave. If this worries a
 2794 * user program, it can ask the device for its MTU anyway.
2795 */
2796
2797 copied = skb->len;
40d4e3df
ED
2798 if (copied > len) {
2799 copied = len;
2800 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
2801 }
2802
2803 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2804 if (err)
2805 goto out_free;
2806
3b885787 2807 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4
LT
2808
2809 if (msg->msg_name)
ffbc6111
HX
2810 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
2811 msg->msg_namelen);
1da177e4 2812
8dc41944 2813 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
2814 struct tpacket_auxdata aux;
2815
2816 aux.tp_status = TP_STATUS_USER;
2817 if (skb->ip_summed == CHECKSUM_PARTIAL)
2818 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
2819 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
2820 aux.tp_snaplen = skb->len;
2821 aux.tp_mac = 0;
bbe735e4 2822 aux.tp_net = skb_network_offset(skb);
a3bcc23e
BG
2823 if (vlan_tx_tag_present(skb)) {
2824 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
2825 aux.tp_status |= TP_STATUS_VLAN_VALID;
2826 } else {
2827 aux.tp_vlan_tci = 0;
2828 }
13fcb7bd 2829 aux.tp_padding = 0;
ffbc6111 2830 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
2831 }
2832
1da177e4
LT
2833 /*
2834 * Free or return the buffer as appropriate. Again this
2835 * hides all the races and re-entrancy issues from us.
2836 */
bfd5f4a3 2837 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
2838
2839out_free:
2840 skb_free_datagram(sk, skb);
2841out:
2842 return err;
2843}
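/*
 * A minimal userspace sketch (not part of this file) of the PACKET_AUXDATA
 * control message that packet_recvmsg() above attaches once the option is
 * enabled: each recvmsg() then carries a struct tpacket_auxdata with the
 * original length, snaplen and VLAN tag.  The buffer sizes are illustrative
 * assumptions.
 */
#include <linux/if_packet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static void recv_with_auxdata(int fd)
{
	int one = 1;
	unsigned char buf[2048];
	char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg;

	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));

	if (recvmsg(fd, &msg, 0) < 0)
		return;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level != SOL_PACKET ||
		    cmsg->cmsg_type != PACKET_AUXDATA)
			continue;

		struct tpacket_auxdata aux;

		memcpy(&aux, CMSG_DATA(cmsg), sizeof(aux));
		printf("len %u snaplen %u vlan tci %u\n",
		       aux.tp_len, aux.tp_snaplen, aux.tp_vlan_tci);
	}
}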
2844
1da177e4
LT
2845static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
2846 int *uaddr_len, int peer)
2847{
2848 struct net_device *dev;
2849 struct sock *sk = sock->sk;
2850
2851 if (peer)
2852 return -EOPNOTSUPP;
2853
2854 uaddr->sa_family = AF_PACKET;
654d1f8a
ED
2855 rcu_read_lock();
2856 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
2857 if (dev)
67286640 2858 strncpy(uaddr->sa_data, dev->name, 14);
654d1f8a 2859 else
1da177e4 2860 memset(uaddr->sa_data, 0, 14);
654d1f8a 2861 rcu_read_unlock();
1da177e4
LT
2862 *uaddr_len = sizeof(*uaddr);
2863
2864 return 0;
2865}
1da177e4
LT
2866
2867static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
2868 int *uaddr_len, int peer)
2869{
2870 struct net_device *dev;
2871 struct sock *sk = sock->sk;
2872 struct packet_sock *po = pkt_sk(sk);
13cfa97b 2873 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
2874
2875 if (peer)
2876 return -EOPNOTSUPP;
2877
2878 sll->sll_family = AF_PACKET;
2879 sll->sll_ifindex = po->ifindex;
2880 sll->sll_protocol = po->num;
67286640 2881 sll->sll_pkttype = 0;
654d1f8a
ED
2882 rcu_read_lock();
2883 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
2884 if (dev) {
2885 sll->sll_hatype = dev->type;
2886 sll->sll_halen = dev->addr_len;
2887 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
2888 } else {
2889 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
2890 sll->sll_halen = 0;
2891 }
654d1f8a 2892 rcu_read_unlock();
0fb375fb 2893 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
2894
2895 return 0;
2896}
2897
2aeb0b88
WC
2898static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
2899 int what)
1da177e4
LT
2900{
2901 switch (i->type) {
2902 case PACKET_MR_MULTICAST:
1162563f
JP
2903 if (i->alen != dev->addr_len)
2904 return -EINVAL;
1da177e4 2905 if (what > 0)
22bedad3 2906 return dev_mc_add(dev, i->addr);
1da177e4 2907 else
22bedad3 2908 return dev_mc_del(dev, i->addr);
1da177e4
LT
2909 break;
2910 case PACKET_MR_PROMISC:
2aeb0b88 2911 return dev_set_promiscuity(dev, what);
1da177e4
LT
2912 break;
2913 case PACKET_MR_ALLMULTI:
2aeb0b88 2914 return dev_set_allmulti(dev, what);
1da177e4 2915 break;
d95ed927 2916 case PACKET_MR_UNICAST:
1162563f
JP
2917 if (i->alen != dev->addr_len)
2918 return -EINVAL;
d95ed927 2919 if (what > 0)
a748ee24 2920 return dev_uc_add(dev, i->addr);
d95ed927 2921 else
a748ee24 2922 return dev_uc_del(dev, i->addr);
d95ed927 2923 break;
40d4e3df
ED
2924 default:
2925 break;
1da177e4 2926 }
2aeb0b88 2927 return 0;
1da177e4
LT
2928}
2929
2930static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
2931{
40d4e3df 2932 for ( ; i; i = i->next) {
1da177e4
LT
2933 if (i->ifindex == dev->ifindex)
2934 packet_dev_mc(dev, i, what);
2935 }
2936}
2937
0fb375fb 2938static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2939{
2940 struct packet_sock *po = pkt_sk(sk);
2941 struct packet_mclist *ml, *i;
2942 struct net_device *dev;
2943 int err;
2944
2945 rtnl_lock();
2946
2947 err = -ENODEV;
3b1e0a65 2948 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
2949 if (!dev)
2950 goto done;
2951
2952 err = -EINVAL;
1162563f 2953 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
2954 goto done;
2955
2956 err = -ENOBUFS;
8b3a7005 2957 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
2958 if (i == NULL)
2959 goto done;
2960
2961 err = 0;
2962 for (ml = po->mclist; ml; ml = ml->next) {
2963 if (ml->ifindex == mreq->mr_ifindex &&
2964 ml->type == mreq->mr_type &&
2965 ml->alen == mreq->mr_alen &&
2966 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
2967 ml->count++;
2968 /* Free the new element ... */
2969 kfree(i);
2970 goto done;
2971 }
2972 }
2973
2974 i->type = mreq->mr_type;
2975 i->ifindex = mreq->mr_ifindex;
2976 i->alen = mreq->mr_alen;
2977 memcpy(i->addr, mreq->mr_address, i->alen);
2978 i->count = 1;
2979 i->next = po->mclist;
2980 po->mclist = i;
2aeb0b88
WC
2981 err = packet_dev_mc(dev, i, 1);
2982 if (err) {
2983 po->mclist = i->next;
2984 kfree(i);
2985 }
1da177e4
LT
2986
2987done:
2988 rtnl_unlock();
2989 return err;
2990}
2991
0fb375fb 2992static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2993{
2994 struct packet_mclist *ml, **mlp;
2995
2996 rtnl_lock();
2997
2998 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
2999 if (ml->ifindex == mreq->mr_ifindex &&
3000 ml->type == mreq->mr_type &&
3001 ml->alen == mreq->mr_alen &&
3002 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3003 if (--ml->count == 0) {
3004 struct net_device *dev;
3005 *mlp = ml->next;
ad959e76
ED
3006 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3007 if (dev)
1da177e4 3008 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3009 kfree(ml);
3010 }
3011 rtnl_unlock();
3012 return 0;
3013 }
3014 }
3015 rtnl_unlock();
3016 return -EADDRNOTAVAIL;
3017}
3018
3019static void packet_flush_mclist(struct sock *sk)
3020{
3021 struct packet_sock *po = pkt_sk(sk);
3022 struct packet_mclist *ml;
3023
3024 if (!po->mclist)
3025 return;
3026
3027 rtnl_lock();
3028 while ((ml = po->mclist) != NULL) {
3029 struct net_device *dev;
3030
3031 po->mclist = ml->next;
ad959e76
ED
3032 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3033 if (dev != NULL)
1da177e4 3034 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3035 kfree(ml);
3036 }
3037 rtnl_unlock();
3038}
1da177e4
LT
3039
3040static int
b7058842 3041packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3042{
3043 struct sock *sk = sock->sk;
8dc41944 3044 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3045 int ret;
3046
3047 if (level != SOL_PACKET)
3048 return -ENOPROTOOPT;
3049
69e3c75f 3050 switch (optname) {
1ce4f28b 3051 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3052 case PACKET_DROP_MEMBERSHIP:
3053 {
0fb375fb
EB
3054 struct packet_mreq_max mreq;
3055 int len = optlen;
3056 memset(&mreq, 0, sizeof(mreq));
3057 if (len < sizeof(struct packet_mreq))
1da177e4 3058 return -EINVAL;
0fb375fb
EB
3059 if (len > sizeof(mreq))
3060 len = sizeof(mreq);
40d4e3df 3061 if (copy_from_user(&mreq, optval, len))
1da177e4 3062 return -EFAULT;
0fb375fb
EB
3063 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3064 return -EINVAL;
1da177e4
LT
3065 if (optname == PACKET_ADD_MEMBERSHIP)
3066 ret = packet_mc_add(sk, &mreq);
3067 else
3068 ret = packet_mc_drop(sk, &mreq);
3069 return ret;
3070 }
a2efcfa0 3071
1da177e4 3072 case PACKET_RX_RING:
69e3c75f 3073 case PACKET_TX_RING:
1da177e4 3074 {
f6fb8f10 3075 union tpacket_req_u req_u;
3076 int len;
1da177e4 3077
f6fb8f10 3078 switch (po->tp_version) {
3079 case TPACKET_V1:
3080 case TPACKET_V2:
3081 len = sizeof(req_u.req);
3082 break;
3083 case TPACKET_V3:
3084 default:
3085 len = sizeof(req_u.req3);
3086 break;
3087 }
3088 if (optlen < len)
1da177e4 3089 return -EINVAL;
bfd5f4a3
SS
3090 if (pkt_sk(sk)->has_vnet_hdr)
3091 return -EINVAL;
f6fb8f10 3092 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3093 return -EFAULT;
f6fb8f10 3094 return packet_set_ring(sk, &req_u, 0,
3095 optname == PACKET_TX_RING);
1da177e4
LT
3096 }
3097 case PACKET_COPY_THRESH:
3098 {
3099 int val;
3100
40d4e3df 3101 if (optlen != sizeof(val))
1da177e4 3102 return -EINVAL;
40d4e3df 3103 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3104 return -EFAULT;
3105
3106 pkt_sk(sk)->copy_thresh = val;
3107 return 0;
3108 }
bbd6ef87
PM
3109 case PACKET_VERSION:
3110 {
3111 int val;
3112
3113 if (optlen != sizeof(val))
3114 return -EINVAL;
69e3c75f 3115 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3116 return -EBUSY;
3117 if (copy_from_user(&val, optval, sizeof(val)))
3118 return -EFAULT;
3119 switch (val) {
3120 case TPACKET_V1:
3121 case TPACKET_V2:
f6fb8f10 3122 case TPACKET_V3:
bbd6ef87
PM
3123 po->tp_version = val;
3124 return 0;
3125 default:
3126 return -EINVAL;
3127 }
3128 }
8913336a
PM
3129 case PACKET_RESERVE:
3130 {
3131 unsigned int val;
3132
3133 if (optlen != sizeof(val))
3134 return -EINVAL;
69e3c75f 3135 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3136 return -EBUSY;
3137 if (copy_from_user(&val, optval, sizeof(val)))
3138 return -EFAULT;
3139 po->tp_reserve = val;
3140 return 0;
3141 }
69e3c75f
JB
3142 case PACKET_LOSS:
3143 {
3144 unsigned int val;
3145
3146 if (optlen != sizeof(val))
3147 return -EINVAL;
3148 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3149 return -EBUSY;
3150 if (copy_from_user(&val, optval, sizeof(val)))
3151 return -EFAULT;
3152 po->tp_loss = !!val;
3153 return 0;
3154 }
8dc41944
HX
3155 case PACKET_AUXDATA:
3156 {
3157 int val;
3158
3159 if (optlen < sizeof(val))
3160 return -EINVAL;
3161 if (copy_from_user(&val, optval, sizeof(val)))
3162 return -EFAULT;
3163
3164 po->auxdata = !!val;
3165 return 0;
3166 }
80feaacb
PWJ
3167 case PACKET_ORIGDEV:
3168 {
3169 int val;
3170
3171 if (optlen < sizeof(val))
3172 return -EINVAL;
3173 if (copy_from_user(&val, optval, sizeof(val)))
3174 return -EFAULT;
3175
3176 po->origdev = !!val;
3177 return 0;
3178 }
bfd5f4a3
SS
3179 case PACKET_VNET_HDR:
3180 {
3181 int val;
3182
3183 if (sock->type != SOCK_RAW)
3184 return -EINVAL;
3185 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3186 return -EBUSY;
3187 if (optlen < sizeof(val))
3188 return -EINVAL;
3189 if (copy_from_user(&val, optval, sizeof(val)))
3190 return -EFAULT;
3191
3192 po->has_vnet_hdr = !!val;
3193 return 0;
3194 }
614f60fa
SM
3195 case PACKET_TIMESTAMP:
3196 {
3197 int val;
3198
3199 if (optlen != sizeof(val))
3200 return -EINVAL;
3201 if (copy_from_user(&val, optval, sizeof(val)))
3202 return -EFAULT;
3203
3204 po->tp_tstamp = val;
3205 return 0;
3206 }
dc99f600
DM
3207 case PACKET_FANOUT:
3208 {
3209 int val;
3210
3211 if (optlen != sizeof(val))
3212 return -EINVAL;
3213 if (copy_from_user(&val, optval, sizeof(val)))
3214 return -EFAULT;
3215
3216 return fanout_add(sk, val & 0xffff, val >> 16);
3217 }
1da177e4
LT
3218 default:
3219 return -ENOPROTOOPT;
3220 }
3221}
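/*
 * A minimal userspace sketch (not part of this file) of the PACKET_VERSION
 * and PACKET_RX_RING options handled by packet_setsockopt() above.  The ring
 * geometry is an illustrative assumption (4 KiB pages); packet_set_ring()
 * further down requires tp_block_size to be a multiple of PAGE_SIZE,
 * tp_frame_size to be a multiple of TPACKET_ALIGNMENT, and tp_frame_nr to
 * equal frames-per-block times tp_block_nr.
 */
#include <linux/if_packet.h>
#include <sys/socket.h>

static int setup_rx_ring_v2(int fd)
{
	int version = TPACKET_V2;
	struct tpacket_req req = {
		.tp_block_size = 4096,			/* one page, assumed */
		.tp_block_nr   = 64,
		.tp_frame_size = 2048,			/* 2 frames per block */
		.tp_frame_nr   = 64 * (4096 / 2048),	/* 128 frames total */
	};

	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
		       &version, sizeof(version)) < 0)
		return -1;
	return setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
}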
3222
3223static int packet_getsockopt(struct socket *sock, int level, int optname,
3224 char __user *optval, int __user *optlen)
3225{
3226 int len;
8dc41944 3227 int val;
1da177e4
LT
3228 struct sock *sk = sock->sk;
3229 struct packet_sock *po = pkt_sk(sk);
8dc41944
HX
3230 void *data;
3231 struct tpacket_stats st;
f6fb8f10 3232 union tpacket_stats_u st_u;
1da177e4
LT
3233
3234 if (level != SOL_PACKET)
3235 return -ENOPROTOOPT;
3236
8ae55f04
KK
3237 if (get_user(len, optlen))
3238 return -EFAULT;
1da177e4
LT
3239
3240 if (len < 0)
3241 return -EINVAL;
1ce4f28b 3242
69e3c75f 3243 switch (optname) {
1da177e4 3244 case PACKET_STATISTICS:
f6fb8f10 3245 if (po->tp_version == TPACKET_V3) {
3246 len = sizeof(struct tpacket_stats_v3);
3247 } else {
3248 if (len > sizeof(struct tpacket_stats))
3249 len = sizeof(struct tpacket_stats);
3250 }
1da177e4 3251 spin_lock_bh(&sk->sk_receive_queue.lock);
f6fb8f10 3252 if (po->tp_version == TPACKET_V3) {
3253 memcpy(&st_u.stats3, &po->stats,
3254 sizeof(struct tpacket_stats));
3255 st_u.stats3.tp_freeze_q_cnt =
3256 po->stats_u.stats3.tp_freeze_q_cnt;
3257 st_u.stats3.tp_packets += po->stats.tp_drops;
3258 data = &st_u.stats3;
3259 } else {
3260 st = po->stats;
3261 st.tp_packets += st.tp_drops;
3262 data = &st;
3263 }
1da177e4
LT
3264 memset(&po->stats, 0, sizeof(st));
3265 spin_unlock_bh(&sk->sk_receive_queue.lock);
8dc41944
HX
3266 break;
3267 case PACKET_AUXDATA:
3268 if (len > sizeof(int))
3269 len = sizeof(int);
3270 val = po->auxdata;
3271
80feaacb
PWJ
3272 data = &val;
3273 break;
3274 case PACKET_ORIGDEV:
3275 if (len > sizeof(int))
3276 len = sizeof(int);
3277 val = po->origdev;
3278
bfd5f4a3
SS
3279 data = &val;
3280 break;
3281 case PACKET_VNET_HDR:
3282 if (len > sizeof(int))
3283 len = sizeof(int);
3284 val = po->has_vnet_hdr;
3285
8dc41944 3286 data = &val;
1da177e4 3287 break;
bbd6ef87
PM
3288 case PACKET_VERSION:
3289 if (len > sizeof(int))
3290 len = sizeof(int);
3291 val = po->tp_version;
3292 data = &val;
3293 break;
3294 case PACKET_HDRLEN:
3295 if (len > sizeof(int))
3296 len = sizeof(int);
3297 if (copy_from_user(&val, optval, len))
3298 return -EFAULT;
3299 switch (val) {
3300 case TPACKET_V1:
3301 val = sizeof(struct tpacket_hdr);
3302 break;
3303 case TPACKET_V2:
3304 val = sizeof(struct tpacket2_hdr);
3305 break;
f6fb8f10 3306 case TPACKET_V3:
3307 val = sizeof(struct tpacket3_hdr);
3308 break;
bbd6ef87
PM
3309 default:
3310 return -EINVAL;
3311 }
3312 data = &val;
3313 break;
8913336a
PM
3314 case PACKET_RESERVE:
3315 if (len > sizeof(unsigned int))
3316 len = sizeof(unsigned int);
3317 val = po->tp_reserve;
3318 data = &val;
3319 break;
69e3c75f
JB
3320 case PACKET_LOSS:
3321 if (len > sizeof(unsigned int))
3322 len = sizeof(unsigned int);
3323 val = po->tp_loss;
3324 data = &val;
3325 break;
614f60fa
SM
3326 case PACKET_TIMESTAMP:
3327 if (len > sizeof(int))
3328 len = sizeof(int);
3329 val = po->tp_tstamp;
3330 data = &val;
3331 break;
dc99f600
DM
3332 case PACKET_FANOUT:
3333 if (len > sizeof(int))
3334 len = sizeof(int);
3335 val = (po->fanout ?
3336 ((u32)po->fanout->id |
3337 ((u32)po->fanout->type << 16)) :
3338 0);
3339 data = &val;
3340 break;
1da177e4
LT
3341 default:
3342 return -ENOPROTOOPT;
3343 }
3344
8ae55f04
KK
3345 if (put_user(len, optlen))
3346 return -EFAULT;
8dc41944
HX
3347 if (copy_to_user(optval, data, len))
3348 return -EFAULT;
8ae55f04 3349 return 0;
1da177e4
LT
3350}
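/*
 * A minimal userspace sketch (not part of this file) of reading
 * PACKET_STATISTICS as served by packet_getsockopt() above.  The counters
 * are clear-on-read, so each call reports the deltas since the previous one,
 * and tp_packets already includes the dropped frames.
 */
#include <linux/if_packet.h>
#include <stdio.h>
#include <sys/socket.h>

static void print_packet_stats(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
		printf("received %u, dropped %u\n", st.tp_packets, st.tp_drops);
}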
3351
3352
3353static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
3354{
3355 struct sock *sk;
3356 struct hlist_node *node;
ad930650 3357 struct net_device *dev = data;
c346dca1 3358 struct net *net = dev_net(dev);
1da177e4 3359
808f5114 3360 rcu_read_lock();
3361 sk_for_each_rcu(sk, node, &net->packet.sklist) {
1da177e4
LT
3362 struct packet_sock *po = pkt_sk(sk);
3363
3364 switch (msg) {
3365 case NETDEV_UNREGISTER:
1da177e4
LT
3366 if (po->mclist)
3367 packet_dev_mclist(dev, po->mclist, -1);
a2efcfa0
DM
3368 /* fallthrough */
3369
1da177e4
LT
3370 case NETDEV_DOWN:
3371 if (dev->ifindex == po->ifindex) {
3372 spin_lock(&po->bind_lock);
3373 if (po->running) {
ce06b03e 3374 __unregister_prot_hook(sk, false);
1da177e4
LT
3375 sk->sk_err = ENETDOWN;
3376 if (!sock_flag(sk, SOCK_DEAD))
3377 sk->sk_error_report(sk);
3378 }
3379 if (msg == NETDEV_UNREGISTER) {
3380 po->ifindex = -1;
160ff18a
BG
3381 if (po->prot_hook.dev)
3382 dev_put(po->prot_hook.dev);
1da177e4
LT
3383 po->prot_hook.dev = NULL;
3384 }
3385 spin_unlock(&po->bind_lock);
3386 }
3387 break;
3388 case NETDEV_UP:
808f5114 3389 if (dev->ifindex == po->ifindex) {
3390 spin_lock(&po->bind_lock);
ce06b03e
DM
3391 if (po->num)
3392 register_prot_hook(sk);
808f5114 3393 spin_unlock(&po->bind_lock);
1da177e4 3394 }
1da177e4
LT
3395 break;
3396 }
3397 }
808f5114 3398 rcu_read_unlock();
1da177e4
LT
3399 return NOTIFY_DONE;
3400}
3401
3402
3403static int packet_ioctl(struct socket *sock, unsigned int cmd,
3404 unsigned long arg)
3405{
3406 struct sock *sk = sock->sk;
3407
69e3c75f 3408 switch (cmd) {
40d4e3df
ED
3409 case SIOCOUTQ:
3410 {
3411 int amount = sk_wmem_alloc_get(sk);
31e6d363 3412
40d4e3df
ED
3413 return put_user(amount, (int __user *)arg);
3414 }
3415 case SIOCINQ:
3416 {
3417 struct sk_buff *skb;
3418 int amount = 0;
3419
3420 spin_lock_bh(&sk->sk_receive_queue.lock);
3421 skb = skb_peek(&sk->sk_receive_queue);
3422 if (skb)
3423 amount = skb->len;
3424 spin_unlock_bh(&sk->sk_receive_queue.lock);
3425 return put_user(amount, (int __user *)arg);
3426 }
3427 case SIOCGSTAMP:
3428 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3429 case SIOCGSTAMPNS:
3430 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3431
1da177e4 3432#ifdef CONFIG_INET
40d4e3df
ED
3433 case SIOCADDRT:
3434 case SIOCDELRT:
3435 case SIOCDARP:
3436 case SIOCGARP:
3437 case SIOCSARP:
3438 case SIOCGIFADDR:
3439 case SIOCSIFADDR:
3440 case SIOCGIFBRDADDR:
3441 case SIOCSIFBRDADDR:
3442 case SIOCGIFNETMASK:
3443 case SIOCSIFNETMASK:
3444 case SIOCGIFDSTADDR:
3445 case SIOCSIFDSTADDR:
3446 case SIOCSIFFLAGS:
40d4e3df 3447 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3448#endif
3449
40d4e3df
ED
3450 default:
3451 return -ENOIOCTLCMD;
1da177e4
LT
3452 }
3453 return 0;
3454}
3455
40d4e3df 3456static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3457 poll_table *wait)
3458{
3459 struct sock *sk = sock->sk;
3460 struct packet_sock *po = pkt_sk(sk);
3461 unsigned int mask = datagram_poll(file, sock, wait);
3462
3463 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3464 if (po->rx_ring.pg_vec) {
f6fb8f10 3465 if (!packet_previous_rx_frame(po, &po->rx_ring,
3466 TP_STATUS_KERNEL))
1da177e4
LT
3467 mask |= POLLIN | POLLRDNORM;
3468 }
3469 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3470 spin_lock_bh(&sk->sk_write_queue.lock);
3471 if (po->tx_ring.pg_vec) {
3472 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3473 mask |= POLLOUT | POLLWRNORM;
3474 }
3475 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3476 return mask;
3477}
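/*
 * A minimal userspace sketch (not part of this file) matching packet_poll()
 * above: POLLIN reports a frame ready for userspace in the RX ring, POLLOUT
 * an available slot in the TX ring.
 */
#include <poll.h>

static int wait_for_ring(int fd, int want_tx)
{
	struct pollfd pfd = {
		.fd = fd,
		.events = (short)(want_tx ? POLLOUT : POLLIN),
	};

	return poll(&pfd, 1, -1);	/* block until the ring has work or space */
}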
3478
3479
3480/* Dirty? Well, I still have not learned a better way to account
3481 * for user mmaps.
3482 */
3483
3484static void packet_mm_open(struct vm_area_struct *vma)
3485{
3486 struct file *file = vma->vm_file;
40d4e3df 3487 struct socket *sock = file->private_data;
1da177e4 3488 struct sock *sk = sock->sk;
1ce4f28b 3489
1da177e4
LT
3490 if (sk)
3491 atomic_inc(&pkt_sk(sk)->mapped);
3492}
3493
3494static void packet_mm_close(struct vm_area_struct *vma)
3495{
3496 struct file *file = vma->vm_file;
40d4e3df 3497 struct socket *sock = file->private_data;
1da177e4 3498 struct sock *sk = sock->sk;
1ce4f28b 3499
1da177e4
LT
3500 if (sk)
3501 atomic_dec(&pkt_sk(sk)->mapped);
3502}
3503
f0f37e2f 3504static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3505 .open = packet_mm_open,
3506 .close = packet_mm_close,
1da177e4
LT
3507};
3508
0e3125c7
NH
3509static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3510 unsigned int len)
1da177e4
LT
3511{
3512 int i;
3513
4ebf0ae2 3514 for (i = 0; i < len; i++) {
0e3125c7 3515 if (likely(pg_vec[i].buffer)) {
c56b4d90 3516 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3517 vfree(pg_vec[i].buffer);
3518 else
3519 free_pages((unsigned long)pg_vec[i].buffer,
3520 order);
3521 pg_vec[i].buffer = NULL;
3522 }
1da177e4
LT
3523 }
3524 kfree(pg_vec);
3525}
3526
eea49cc9 3527static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3528{
0e3125c7
NH
3529 char *buffer = NULL;
3530 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3531 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3532
3533 buffer = (char *) __get_free_pages(gfp_flags, order);
3534
3535 if (buffer)
3536 return buffer;
3537
3538 /*
3539 * __get_free_pages failed, fall back to vmalloc
3540 */
bbce5a59 3541 buffer = vzalloc((1 << order) * PAGE_SIZE);
719bfeaa 3542
0e3125c7
NH
3543 if (buffer)
3544 return buffer;
3545
3546 /*
 3547 * vmalloc failed, let's dig into swap here
3548 */
0e3125c7
NH
3549 gfp_flags &= ~__GFP_NORETRY;
3550 buffer = (char *)__get_free_pages(gfp_flags, order);
3551 if (buffer)
3552 return buffer;
3553
3554 /*
3555 * complete and utter failure
3556 */
3557 return NULL;
4ebf0ae2
DM
3558}
3559
0e3125c7 3560static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3561{
3562 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3563 struct pgv *pg_vec;
4ebf0ae2
DM
3564 int i;
3565
0e3125c7 3566 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3567 if (unlikely(!pg_vec))
3568 goto out;
3569
3570 for (i = 0; i < block_nr; i++) {
c56b4d90 3571 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3572 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3573 goto out_free_pgvec;
3574 }
3575
3576out:
3577 return pg_vec;
3578
3579out_free_pgvec:
3580 free_pg_vec(pg_vec, order, block_nr);
3581 pg_vec = NULL;
3582 goto out;
3583}
1da177e4 3584
f6fb8f10 3585static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 3586 int closing, int tx_ring)
1da177e4 3587{
0e3125c7 3588 struct pgv *pg_vec = NULL;
1da177e4 3589 struct packet_sock *po = pkt_sk(sk);
0e11c91e 3590 int was_running, order = 0;
69e3c75f
JB
3591 struct packet_ring_buffer *rb;
3592 struct sk_buff_head *rb_queue;
0e11c91e 3593 __be16 num;
f6fb8f10 3594 int err = -EINVAL;
 3595 /* Added to keep code churn minimal */
3596 struct tpacket_req *req = &req_u->req;
3597
3598 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3599 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3600 WARN(1, "Tx-ring is not supported.\n");
3601 goto out;
3602 }
1ce4f28b 3603
69e3c75f
JB
3604 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3605 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 3606
69e3c75f
JB
3607 err = -EBUSY;
3608 if (!closing) {
3609 if (atomic_read(&po->mapped))
3610 goto out;
3611 if (atomic_read(&rb->pending))
3612 goto out;
3613 }
1da177e4 3614
69e3c75f
JB
3615 if (req->tp_block_nr) {
3616 /* Sanity tests and some calculations */
3617 err = -EBUSY;
3618 if (unlikely(rb->pg_vec))
3619 goto out;
1da177e4 3620
bbd6ef87
PM
3621 switch (po->tp_version) {
3622 case TPACKET_V1:
3623 po->tp_hdrlen = TPACKET_HDRLEN;
3624 break;
3625 case TPACKET_V2:
3626 po->tp_hdrlen = TPACKET2_HDRLEN;
3627 break;
f6fb8f10 3628 case TPACKET_V3:
3629 po->tp_hdrlen = TPACKET3_HDRLEN;
3630 break;
bbd6ef87
PM
3631 }
3632
69e3c75f 3633 err = -EINVAL;
4ebf0ae2 3634 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 3635 goto out;
4ebf0ae2 3636 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 3637 goto out;
8913336a 3638 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
3639 po->tp_reserve))
3640 goto out;
4ebf0ae2 3641 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 3642 goto out;
1da177e4 3643
69e3c75f
JB
3644 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3645 if (unlikely(rb->frames_per_block <= 0))
3646 goto out;
3647 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3648 req->tp_frame_nr))
3649 goto out;
1da177e4
LT
3650
3651 err = -ENOMEM;
4ebf0ae2
DM
3652 order = get_order(req->tp_block_size);
3653 pg_vec = alloc_pg_vec(req, order);
3654 if (unlikely(!pg_vec))
1da177e4 3655 goto out;
f6fb8f10 3656 switch (po->tp_version) {
3657 case TPACKET_V3:
3658 /* Transmit path is not supported. We checked
3659 * it above but just being paranoid
3660 */
3661 if (!tx_ring)
3662 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
3663 break;
3664 default:
3665 break;
3666 }
69e3c75f
JB
3667 }
3668 /* Done */
3669 else {
3670 err = -EINVAL;
4ebf0ae2 3671 if (unlikely(req->tp_frame_nr))
69e3c75f 3672 goto out;
1da177e4
LT
3673 }
3674
3675 lock_sock(sk);
3676
3677 /* Detach socket from network */
3678 spin_lock(&po->bind_lock);
3679 was_running = po->running;
3680 num = po->num;
3681 if (was_running) {
1da177e4 3682 po->num = 0;
ce06b03e 3683 __unregister_prot_hook(sk, false);
1da177e4
LT
3684 }
3685 spin_unlock(&po->bind_lock);
1ce4f28b 3686
1da177e4
LT
3687 synchronize_net();
3688
3689 err = -EBUSY;
905db440 3690 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
3691 if (closing || atomic_read(&po->mapped) == 0) {
3692 err = 0;
69e3c75f 3693 spin_lock_bh(&rb_queue->lock);
c053fd96 3694 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
3695 rb->frame_max = (req->tp_frame_nr - 1);
3696 rb->head = 0;
3697 rb->frame_size = req->tp_frame_size;
3698 spin_unlock_bh(&rb_queue->lock);
3699
c053fd96
CG
3700 swap(rb->pg_vec_order, order);
3701 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
3702
3703 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
3704 po->prot_hook.func = (po->rx_ring.pg_vec) ?
3705 tpacket_rcv : packet_rcv;
3706 skb_queue_purge(rb_queue);
1da177e4 3707 if (atomic_read(&po->mapped))
40d4e3df
ED
3708 pr_err("packet_mmap: vma is busy: %d\n",
3709 atomic_read(&po->mapped));
1da177e4 3710 }
905db440 3711 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3712
3713 spin_lock(&po->bind_lock);
ce06b03e 3714 if (was_running) {
1da177e4 3715 po->num = num;
ce06b03e 3716 register_prot_hook(sk);
1da177e4
LT
3717 }
3718 spin_unlock(&po->bind_lock);
f6fb8f10 3719 if (closing && (po->tp_version > TPACKET_V2)) {
3720 /* Because we don't support block-based V3 on tx-ring */
3721 if (!tx_ring)
3722 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3723 }
1da177e4
LT
3724 release_sock(sk);
3725
1da177e4
LT
3726 if (pg_vec)
3727 free_pg_vec(pg_vec, order, req->tp_block_nr);
3728out:
3729 return err;
3730}
3731
69e3c75f
JB
3732static int packet_mmap(struct file *file, struct socket *sock,
3733 struct vm_area_struct *vma)
1da177e4
LT
3734{
3735 struct sock *sk = sock->sk;
3736 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
3737 unsigned long size, expected_size;
3738 struct packet_ring_buffer *rb;
1da177e4
LT
3739 unsigned long start;
3740 int err = -EINVAL;
3741 int i;
3742
3743 if (vma->vm_pgoff)
3744 return -EINVAL;
3745
905db440 3746 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
3747
3748 expected_size = 0;
3749 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3750 if (rb->pg_vec) {
3751 expected_size += rb->pg_vec_len
3752 * rb->pg_vec_pages
3753 * PAGE_SIZE;
3754 }
3755 }
3756
3757 if (expected_size == 0)
1da177e4 3758 goto out;
69e3c75f
JB
3759
3760 size = vma->vm_end - vma->vm_start;
3761 if (size != expected_size)
1da177e4
LT
3762 goto out;
3763
1da177e4 3764 start = vma->vm_start;
69e3c75f
JB
3765 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3766 if (rb->pg_vec == NULL)
3767 continue;
3768
3769 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
3770 struct page *page;
3771 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
3772 int pg_num;
3773
c56b4d90
CG
3774 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
3775 page = pgv_to_page(kaddr);
69e3c75f
JB
3776 err = vm_insert_page(vma, start, page);
3777 if (unlikely(err))
3778 goto out;
3779 start += PAGE_SIZE;
0e3125c7 3780 kaddr += PAGE_SIZE;
69e3c75f 3781 }
4ebf0ae2 3782 }
1da177e4 3783 }
69e3c75f 3784
4ebf0ae2 3785 atomic_inc(&po->mapped);
1da177e4
LT
3786 vma->vm_ops = &packet_mmap_ops;
3787 err = 0;
3788
3789out:
905db440 3790 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3791 return err;
3792}
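/*
 * A minimal userspace sketch (not part of this file) of packet_mmap() usage
 * above: whichever of the RX and TX rings are configured get mapped
 * back-to-back by a single mmap() whose length must equal their combined
 * size and whose offset must be zero.  rx_req/tx_req are the tpacket_req
 * structures previously passed to PACKET_RX_RING/PACKET_TX_RING, or NULL.
 */
#include <linux/if_packet.h>
#include <stddef.h>
#include <sys/mman.h>

static void *map_rings(int fd, const struct tpacket_req *rx_req,
		       const struct tpacket_req *tx_req)
{
	size_t len = 0;

	if (rx_req)
		len += (size_t)rx_req->tp_block_size * rx_req->tp_block_nr;
	if (tx_req)
		len += (size_t)tx_req->tp_block_size * tx_req->tp_block_nr;

	/* Caller must compare the result against MAP_FAILED, not NULL. */
	return mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}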
1da177e4 3793
90ddc4f0 3794static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
3795 .family = PF_PACKET,
3796 .owner = THIS_MODULE,
3797 .release = packet_release,
3798 .bind = packet_bind_spkt,
3799 .connect = sock_no_connect,
3800 .socketpair = sock_no_socketpair,
3801 .accept = sock_no_accept,
3802 .getname = packet_getname_spkt,
3803 .poll = datagram_poll,
3804 .ioctl = packet_ioctl,
3805 .listen = sock_no_listen,
3806 .shutdown = sock_no_shutdown,
3807 .setsockopt = sock_no_setsockopt,
3808 .getsockopt = sock_no_getsockopt,
3809 .sendmsg = packet_sendmsg_spkt,
3810 .recvmsg = packet_recvmsg,
3811 .mmap = sock_no_mmap,
3812 .sendpage = sock_no_sendpage,
3813};
1da177e4 3814
90ddc4f0 3815static const struct proto_ops packet_ops = {
1da177e4
LT
3816 .family = PF_PACKET,
3817 .owner = THIS_MODULE,
3818 .release = packet_release,
3819 .bind = packet_bind,
3820 .connect = sock_no_connect,
3821 .socketpair = sock_no_socketpair,
3822 .accept = sock_no_accept,
1ce4f28b 3823 .getname = packet_getname,
1da177e4
LT
3824 .poll = packet_poll,
3825 .ioctl = packet_ioctl,
3826 .listen = sock_no_listen,
3827 .shutdown = sock_no_shutdown,
3828 .setsockopt = packet_setsockopt,
3829 .getsockopt = packet_getsockopt,
3830 .sendmsg = packet_sendmsg,
3831 .recvmsg = packet_recvmsg,
3832 .mmap = packet_mmap,
3833 .sendpage = sock_no_sendpage,
3834};
3835
ec1b4cf7 3836static const struct net_proto_family packet_family_ops = {
1da177e4
LT
3837 .family = PF_PACKET,
3838 .create = packet_create,
3839 .owner = THIS_MODULE,
3840};
3841
3842static struct notifier_block packet_netdev_notifier = {
40d4e3df 3843 .notifier_call = packet_notifier,
1da177e4
LT
3844};
3845
3846#ifdef CONFIG_PROC_FS
1da177e4
LT
3847
3848static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 3849 __acquires(RCU)
1da177e4 3850{
e372c414 3851 struct net *net = seq_file_net(seq);
808f5114 3852
3853 rcu_read_lock();
3854 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
3855}
3856
3857static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3858{
1bf40954 3859 struct net *net = seq_file_net(seq);
808f5114 3860 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
3861}
3862
3863static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 3864 __releases(RCU)
1da177e4 3865{
808f5114 3866 rcu_read_unlock();
1da177e4
LT
3867}
3868
1ce4f28b 3869static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
3870{
3871 if (v == SEQ_START_TOKEN)
3872 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
3873 else {
b7ceabd9 3874 struct sock *s = sk_entry(v);
1da177e4
LT
3875 const struct packet_sock *po = pkt_sk(s);
3876
3877 seq_printf(seq,
71338aa7 3878 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4
LT
3879 s,
3880 atomic_read(&s->sk_refcnt),
3881 s->sk_type,
3882 ntohs(po->num),
3883 po->ifindex,
3884 po->running,
3885 atomic_read(&s->sk_rmem_alloc),
3886 sock_i_uid(s),
40d4e3df 3887 sock_i_ino(s));
1da177e4
LT
3888 }
3889
3890 return 0;
3891}
3892
56b3d975 3893static const struct seq_operations packet_seq_ops = {
1da177e4
LT
3894 .start = packet_seq_start,
3895 .next = packet_seq_next,
3896 .stop = packet_seq_stop,
3897 .show = packet_seq_show,
3898};
3899
3900static int packet_seq_open(struct inode *inode, struct file *file)
3901{
e372c414
DL
3902 return seq_open_net(inode, file, &packet_seq_ops,
3903 sizeof(struct seq_net_private));
1da177e4
LT
3904}
3905
da7071d7 3906static const struct file_operations packet_seq_fops = {
1da177e4
LT
3907 .owner = THIS_MODULE,
3908 .open = packet_seq_open,
3909 .read = seq_read,
3910 .llseek = seq_lseek,
e372c414 3911 .release = seq_release_net,
1da177e4
LT
3912};
3913
3914#endif
3915
2c8c1e72 3916static int __net_init packet_net_init(struct net *net)
d12d01d6 3917{
808f5114 3918 spin_lock_init(&net->packet.sklist_lock);
2aaef4e4 3919 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6
DL
3920
3921 if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
3922 return -ENOMEM;
3923
3924 return 0;
3925}
3926
2c8c1e72 3927static void __net_exit packet_net_exit(struct net *net)
d12d01d6
DL
3928{
3929 proc_net_remove(net, "packet");
3930}
3931
3932static struct pernet_operations packet_net_ops = {
3933 .init = packet_net_init,
3934 .exit = packet_net_exit,
3935};
3936
3937
1da177e4
LT
3938static void __exit packet_exit(void)
3939{
1da177e4 3940 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 3941 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
3942 sock_unregister(PF_PACKET);
3943 proto_unregister(&packet_proto);
3944}
3945
3946static int __init packet_init(void)
3947{
3948 int rc = proto_register(&packet_proto, 0);
3949
3950 if (rc != 0)
3951 goto out;
3952
3953 sock_register(&packet_family_ops);
d12d01d6 3954 register_pernet_subsys(&packet_net_ops);
1da177e4 3955 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
3956out:
3957 return rc;
3958}
3959
3960module_init(packet_init);
3961module_exit(packet_exit);
3962MODULE_LICENSE("GPL");
3963MODULE_ALIAS_NETPROTO(PF_PACKET);