net/packet/af_packet.c
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PACKET - implements raw packet sockets.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
1ce4f28b 12 * Fixes:
1da177e4
LT
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
1ce4f28b 35 * Ulises Alonso : Frame number limit removal and
1da177e4 36 * packet_set_ring memory leak.
0fb375fb
EB
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
1ce4f28b 40 * byte arrays at the end of sockaddr_ll
0fb375fb 41 * and packet_mreq.
69e3c75f 42 * Johann Baudy : Added TX RING.
f6fb8f10 43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
44 * layer.
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46 *
1da177e4
LT
47 *
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
52 *
53 */
1ce4f28b 54
1da177e4 55#include <linux/types.h>
1da177e4 56#include <linux/mm.h>
4fc268d2 57#include <linux/capability.h>
1da177e4
LT
58#include <linux/fcntl.h>
59#include <linux/socket.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_packet.h>
64#include <linux/wireless.h>
ffbc6111 65#include <linux/kernel.h>
1da177e4 66#include <linux/kmod.h>
5a0e3ad6 67#include <linux/slab.h>
0e3125c7 68#include <linux/vmalloc.h>
457c4cbc 69#include <net/net_namespace.h>
1da177e4
LT
70#include <net/ip.h>
71#include <net/protocol.h>
72#include <linux/skbuff.h>
73#include <net/sock.h>
74#include <linux/errno.h>
75#include <linux/timer.h>
1da177e4
LT
76#include <asm/uaccess.h>
77#include <asm/ioctls.h>
78#include <asm/page.h>
a1f8e7f7 79#include <asm/cacheflush.h>
1da177e4
LT
80#include <asm/io.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
83#include <linux/poll.h>
84#include <linux/module.h>
85#include <linux/init.h>
905db440 86#include <linux/mutex.h>
05423b24 87#include <linux/if_vlan.h>
bfd5f4a3 88#include <linux/virtio_net.h>
ed85b565 89#include <linux/errqueue.h>
614f60fa 90#include <linux/net_tstamp.h>
1da177e4
LT
91
92#ifdef CONFIG_INET
93#include <net/inet_common.h>
94#endif
95
1da177e4
LT
96/*
97 Assumptions:
 98 - if a device has no dev->hard_header routine, it adds and removes the ll header
 99 itself. In this case the ll header is invisible outside of the device,
 100 but higher levels should still reserve dev->hard_header_len.
 101 Some devices are clever enough to reallocate the skb when the header
 102 will not fit in the reserved space (tunnels); others are not
 103 (PPP).
 104 - the packet socket receives packets with the ll header already pulled,
 105 so SOCK_RAW should push it back.
106
107On receive:
108-----------
109
110Incoming, dev->hard_header!=NULL
b0e380b1
ACM
111 mac_header -> ll header
112 data -> data
1da177e4
LT
113
114Outgoing, dev->hard_header!=NULL
b0e380b1
ACM
115 mac_header -> ll header
116 data -> ll header
1da177e4
LT
117
118Incoming, dev->hard_header==NULL
b0e380b1
ACM
 119 mac_header -> UNKNOWN position. It is very likely that it points to the ll
 120 header. PPP does this, which is wrong, because it introduces
db0c58f9 121 asymmetry between rx and tx paths.
b0e380b1 122 data -> data
1da177e4
LT
123
124Outgoing, dev->hard_header==NULL
b0e380b1
ACM
125 mac_header -> data. ll header is still not built!
126 data -> data
1da177e4
LT
127
 128Summary
 129 If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
130
131
132On transmit:
133------------
134
135dev->hard_header != NULL
b0e380b1
ACM
136 mac_header -> ll header
137 data -> ll header
1da177e4
LT
138
139dev->hard_header == NULL (ll header is added by device, we cannot control it)
b0e380b1
ACM
140 mac_header -> data
141 data -> data
1da177e4
LT
142
 143 We should set nh.raw on output to the correct position;
 144 the packet classifier depends on it.
145 */
146
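/* Illustrative sketch (added, not part of the original file): a minimal
 * user-space SOCK_RAW packet socket, shown only to make the header-offset
 * rules above concrete. Assumes <sys/socket.h> and <linux/if_ether.h>;
 * error handling omitted.
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	unsigned char buf[2048];
 *	// For SOCK_RAW the kernel pushes the ll header back before queueing,
 *	// so buf[0] is the start of the link-layer (e.g. Ethernet) header.
 *	ssize_t n = recv(fd, buf, sizeof(buf), 0);
 */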
1da177e4
LT
147/* Private packet socket structures. */
148
40d4e3df 149struct packet_mclist {
1da177e4
LT
150 struct packet_mclist *next;
151 int ifindex;
152 int count;
153 unsigned short type;
154 unsigned short alen;
0fb375fb
EB
155 unsigned char addr[MAX_ADDR_LEN];
156};
157/* identical to struct packet_mreq except it has
158 * a longer address field.
159 */
40d4e3df 160struct packet_mreq_max {
0fb375fb
EB
161 int mr_ifindex;
162 unsigned short mr_type;
163 unsigned short mr_alen;
164 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 165};
a2efcfa0 166
f6fb8f10 167static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f
JB
168 int closing, int tx_ring);
169
f6fb8f10 170
171#define V3_ALIGNMENT (8)
172
bc59ba39 173#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
f6fb8f10 174
175#define BLK_PLUS_PRIV(sz_of_priv) \
176 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
177
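/* Added note (worked example): with V3_ALIGNMENT == 8, a requested
 * tp_sizeof_priv of, say, 13 bytes is rounded up to 16, so
 * BLK_PLUS_PRIV(13) == BLK_HDR_LEN + 16 and the first packet always starts
 * on an 8-byte boundary after the private area.
 */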
178/* kbdq - kernel block descriptor queue */
bc59ba39 179struct tpacket_kbdq_core {
f6fb8f10 180 struct pgv *pkbdq;
181 unsigned int feature_req_word;
182 unsigned int hdrlen;
183 unsigned char reset_pending_on_curr_blk;
184 unsigned char delete_blk_timer;
185 unsigned short kactive_blk_num;
186 unsigned short blk_sizeof_priv;
187
188 /* last_kactive_blk_num:
189 * trick to see if user-space has caught up
 190 * in order to avoid refreshing the timer when every single pkt arrives.
191 */
192 unsigned short last_kactive_blk_num;
193
194 char *pkblk_start;
195 char *pkblk_end;
196 int kblk_size;
197 unsigned int knum_blocks;
198 uint64_t knxt_seq_num;
199 char *prev;
200 char *nxt_offset;
201 struct sk_buff *skb;
202
203 atomic_t blk_fill_in_prog;
204
205 /* Default is set to 8ms */
206#define DEFAULT_PRB_RETIRE_TOV (8)
207
208 unsigned short retire_blk_tov;
209 unsigned short version;
210 unsigned long tov_in_jiffies;
211
212 /* timer to retire an outstanding block */
213 struct timer_list retire_blk_timer;
214};
215
216#define PGV_FROM_VMALLOC 1
0e3125c7
NH
217struct pgv {
218 char *buffer;
0e3125c7
NH
219};
220
69e3c75f 221struct packet_ring_buffer {
0e3125c7 222 struct pgv *pg_vec;
69e3c75f
JB
223 unsigned int head;
224 unsigned int frames_per_block;
225 unsigned int frame_size;
226 unsigned int frame_max;
227
228 unsigned int pg_vec_order;
229 unsigned int pg_vec_pages;
230 unsigned int pg_vec_len;
231
bc59ba39 232 struct tpacket_kbdq_core prb_bdqc;
69e3c75f
JB
233 atomic_t pending;
234};
235
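/* Illustrative sketch (added, not kernel code): how user space would
 * typically request a TPACKET_V3 ring that ends up backed by the structures
 * above. Sizes are arbitrary example values; error handling omitted.
 *
 *	int ver = TPACKET_V3;
 *	struct tpacket_req3 req = {
 *		.tp_block_size	   = 1 << 20,	// 1 MiB per block
 *		.tp_block_nr	   = 8,
 *		.tp_frame_size	   = 2048,	// frames are variable-sized in V3
 *		.tp_frame_nr	   = ((1 << 20) / 2048) * 8,
 *		.tp_retire_blk_tov = 60,	// ms; 0 = derive from link speed
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */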
f6fb8f10 236#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
237#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
238#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
239#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
240#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
241#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
242#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
243
69e3c75f
JB
244struct packet_sock;
245static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
1da177e4 246
f6fb8f10 247static void *packet_previous_frame(struct packet_sock *po,
248 struct packet_ring_buffer *rb,
249 int status);
250static void packet_increment_head(struct packet_ring_buffer *buff);
bc59ba39 251static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
252 struct tpacket_block_desc *);
253static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 254 struct packet_sock *);
bc59ba39 255static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 256 struct packet_sock *, unsigned int status);
bc59ba39 257static int prb_queue_frozen(struct tpacket_kbdq_core *);
258static void prb_open_block(struct tpacket_kbdq_core *,
259 struct tpacket_block_desc *);
f6fb8f10 260static void prb_retire_rx_blk_timer_expired(unsigned long);
bc59ba39 261static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
262static void prb_init_blk_timer(struct packet_sock *,
263 struct tpacket_kbdq_core *,
264 void (*func) (unsigned long));
265static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
266static void prb_clear_rxhash(struct tpacket_kbdq_core *,
267 struct tpacket3_hdr *);
268static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
269 struct tpacket3_hdr *);
1da177e4
LT
270static void packet_flush_mclist(struct sock *sk);
271
dc99f600 272struct packet_fanout;
1da177e4
LT
273struct packet_sock {
274 /* struct sock has to be the first member of packet_sock */
275 struct sock sk;
dc99f600 276 struct packet_fanout *fanout;
1da177e4 277 struct tpacket_stats stats;
f6fb8f10 278 union tpacket_stats_u stats_u;
69e3c75f
JB
279 struct packet_ring_buffer rx_ring;
280 struct packet_ring_buffer tx_ring;
1da177e4 281 int copy_thresh;
1da177e4 282 spinlock_t bind_lock;
905db440 283 struct mutex pg_vec_lock;
8dc41944 284 unsigned int running:1, /* prot_hook is attached*/
80feaacb 285 auxdata:1,
bfd5f4a3
SS
286 origdev:1,
287 has_vnet_hdr:1;
1da177e4 288 int ifindex; /* bound device */
0e11c91e 289 __be16 num;
1da177e4 290 struct packet_mclist *mclist;
1da177e4 291 atomic_t mapped;
bbd6ef87
PM
292 enum tpacket_versions tp_version;
293 unsigned int tp_hdrlen;
8913336a 294 unsigned int tp_reserve;
69e3c75f 295 unsigned int tp_loss:1;
614f60fa 296 unsigned int tp_tstamp;
94b05952 297 struct packet_type prot_hook ____cacheline_aligned_in_smp;
1da177e4
LT
298};
299
dc99f600
DM
300#define PACKET_FANOUT_MAX 256
301
302struct packet_fanout {
303#ifdef CONFIG_NET_NS
304 struct net *net;
305#endif
306 unsigned int num_members;
307 u16 id;
308 u8 type;
7736d33f 309 u8 defrag;
dc99f600
DM
310 atomic_t rr_cur;
311 struct list_head list;
312 struct sock *arr[PACKET_FANOUT_MAX];
313 spinlock_t lock;
314 atomic_t sk_ref;
315 struct packet_type prot_hook ____cacheline_aligned_in_smp;
316};
317
ffbc6111
HX
318struct packet_skb_cb {
319 unsigned int origlen;
320 union {
321 struct sockaddr_pkt pkt;
322 struct sockaddr_ll ll;
323 } sa;
324};
325
326#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 327
bc59ba39 328#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 329#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 330 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 331#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 332 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 333#define GET_NEXT_PRB_BLK_NUM(x) \
334 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
335 ((x)->kactive_blk_num+1) : 0)
336
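/* Added note (worked example): GET_NEXT_PRB_BLK_NUM simply wraps around,
 * e.g. with knum_blocks == 8, active block 6 -> 7 and active block 7 -> 0.
 */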
eea49cc9 337static struct packet_sock *pkt_sk(struct sock *sk)
ce06b03e
DM
338{
339 return (struct packet_sock *)sk;
340}
341
dc99f600
DM
342static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
343static void __fanout_link(struct sock *sk, struct packet_sock *po);
344
ce06b03e
DM
345/* register_prot_hook must be invoked with the po->bind_lock held,
346 * or from a context in which asynchronous accesses to the packet
347 * socket is not possible (packet_create()).
348 */
349static void register_prot_hook(struct sock *sk)
350{
351 struct packet_sock *po = pkt_sk(sk);
352 if (!po->running) {
dc99f600
DM
353 if (po->fanout)
354 __fanout_link(sk, po);
355 else
356 dev_add_pack(&po->prot_hook);
ce06b03e
DM
357 sock_hold(sk);
358 po->running = 1;
359 }
360}
361
362/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
363 * held. If the sync parameter is true, we will temporarily drop
364 * the po->bind_lock and do a synchronize_net to make sure no
365 * asynchronous packet processing paths still refer to the elements
366 * of po->prot_hook. If the sync parameter is false, it is the
367 * callers responsibility to take care of this.
368 */
369static void __unregister_prot_hook(struct sock *sk, bool sync)
370{
371 struct packet_sock *po = pkt_sk(sk);
372
373 po->running = 0;
dc99f600
DM
374 if (po->fanout)
375 __fanout_unlink(sk, po);
376 else
377 __dev_remove_pack(&po->prot_hook);
ce06b03e
DM
378 __sock_put(sk);
379
380 if (sync) {
381 spin_unlock(&po->bind_lock);
382 synchronize_net();
383 spin_lock(&po->bind_lock);
384 }
385}
386
387static void unregister_prot_hook(struct sock *sk, bool sync)
388{
389 struct packet_sock *po = pkt_sk(sk);
390
391 if (po->running)
392 __unregister_prot_hook(sk, sync);
393}
394
f6dafa95 395static inline __pure struct page *pgv_to_page(void *addr)
0af55bb5
CG
396{
397 if (is_vmalloc_addr(addr))
398 return vmalloc_to_page(addr);
399 return virt_to_page(addr);
400}
401
69e3c75f 402static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 403{
bbd6ef87
PM
404 union {
405 struct tpacket_hdr *h1;
406 struct tpacket2_hdr *h2;
407 void *raw;
408 } h;
1da177e4 409
69e3c75f 410 h.raw = frame;
bbd6ef87
PM
411 switch (po->tp_version) {
412 case TPACKET_V1:
69e3c75f 413 h.h1->tp_status = status;
0af55bb5 414 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
415 break;
416 case TPACKET_V2:
69e3c75f 417 h.h2->tp_status = status;
0af55bb5 418 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 419 break;
f6fb8f10 420 case TPACKET_V3:
69e3c75f 421 default:
f6fb8f10 422 WARN(1, "TPACKET version not supported.\n");
69e3c75f 423 BUG();
bbd6ef87 424 }
69e3c75f
JB
425
426 smp_wmb();
bbd6ef87
PM
427}
428
69e3c75f 429static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87
PM
430{
431 union {
432 struct tpacket_hdr *h1;
433 struct tpacket2_hdr *h2;
434 void *raw;
435 } h;
436
69e3c75f
JB
437 smp_rmb();
438
bbd6ef87
PM
439 h.raw = frame;
440 switch (po->tp_version) {
441 case TPACKET_V1:
0af55bb5 442 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 443 return h.h1->tp_status;
bbd6ef87 444 case TPACKET_V2:
0af55bb5 445 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 446 return h.h2->tp_status;
f6fb8f10 447 case TPACKET_V3:
69e3c75f 448 default:
f6fb8f10 449 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
450 BUG();
451 return 0;
bbd6ef87 452 }
1da177e4 453}
69e3c75f
JB
454
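/* Added note (hedged sketch, not in the original): tp_status is the
 * kernel/user handshake word for V1/V2 rings. A typical user-space reader
 * loop looks roughly like:
 *
 *	struct tpacket2_hdr *hdr = frame;
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);		// wait for the kernel to fill it
 *	// ... consume the frame ...
 *	hdr->tp_status = TP_STATUS_KERNEL;	// hand it back to the kernel
 *
 * which is why __packet_set_status()/__packet_get_status() bracket the
 * access with flush_dcache_page() and smp_wmb()/smp_rmb().
 */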
455static void *packet_lookup_frame(struct packet_sock *po,
456 struct packet_ring_buffer *rb,
457 unsigned int position,
458 int status)
459{
460 unsigned int pg_vec_pos, frame_offset;
461 union {
462 struct tpacket_hdr *h1;
463 struct tpacket2_hdr *h2;
464 void *raw;
465 } h;
466
467 pg_vec_pos = position / rb->frames_per_block;
468 frame_offset = position % rb->frames_per_block;
469
0e3125c7
NH
470 h.raw = rb->pg_vec[pg_vec_pos].buffer +
471 (frame_offset * rb->frame_size);
69e3c75f
JB
472
473 if (status != __packet_get_status(po, h.raw))
474 return NULL;
475
476 return h.raw;
477}
478
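/* Added note (worked example): packet_lookup_frame() maps a linear frame
 * position onto (block, offset). With frames_per_block == 4 and
 * position == 9, pg_vec_pos == 2 and frame_offset == 1, i.e. the second
 * frame of the third pg_vec buffer.
 */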
eea49cc9 479static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
480 struct packet_ring_buffer *rb,
481 int status)
482{
483 return packet_lookup_frame(po, rb, rb->head, status);
484}
485
bc59ba39 486static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 487{
488 del_timer_sync(&pkc->retire_blk_timer);
489}
490
491static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
492 int tx_ring,
493 struct sk_buff_head *rb_queue)
494{
bc59ba39 495 struct tpacket_kbdq_core *pkc;
f6fb8f10 496
497 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
498
499 spin_lock(&rb_queue->lock);
500 pkc->delete_blk_timer = 1;
501 spin_unlock(&rb_queue->lock);
502
503 prb_del_retire_blk_timer(pkc);
504}
505
506static void prb_init_blk_timer(struct packet_sock *po,
bc59ba39 507 struct tpacket_kbdq_core *pkc,
f6fb8f10 508 void (*func) (unsigned long))
509{
510 init_timer(&pkc->retire_blk_timer);
511 pkc->retire_blk_timer.data = (long)po;
512 pkc->retire_blk_timer.function = func;
513 pkc->retire_blk_timer.expires = jiffies;
514}
515
516static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
517{
bc59ba39 518 struct tpacket_kbdq_core *pkc;
f6fb8f10 519
520 if (tx_ring)
521 BUG();
522
523 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
524 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
525}
526
527static int prb_calc_retire_blk_tmo(struct packet_sock *po,
528 int blk_size_in_bytes)
529{
530 struct net_device *dev;
531 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
4bc71cb9
JP
532 struct ethtool_cmd ecmd;
533 int err;
e440cf2c 534 u32 speed;
f6fb8f10 535
4bc71cb9
JP
536 rtnl_lock();
537 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
538 if (unlikely(!dev)) {
539 rtnl_unlock();
f6fb8f10 540 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9
JP
541 }
542 err = __ethtool_get_settings(dev, &ecmd);
e440cf2c 543 speed = ethtool_cmd_speed(&ecmd);
4bc71cb9
JP
544 rtnl_unlock();
545 if (!err) {
4bc71cb9
JP
546 /*
547 * If the link speed is so slow you don't really
548 * need to worry about perf anyways
549 */
e440cf2c 550 if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
4bc71cb9 551 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 552 } else {
553 msec = 1;
554 div = speed / 1000;
f6fb8f10 555 }
556 }
557
558 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
559
560 if (div)
561 mbits /= div;
562
563 tmo = mbits * msec;
564
565 if (div)
566 return tmo+1;
567 return tmo;
568}
569
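/* Added note (worked example): for a 1 MiB block on a 1 Gbit/s link,
 * mbits == 8, div == 1 and msec == 1, so tmo == 8 and the function returns
 * 9 ms, i.e. roughly the time to fill one block plus a little slack. Links
 * slower than 1 Gbit/s (or of unknown speed) fall back to
 * DEFAULT_PRB_RETIRE_TOV.
 */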
bc59ba39 570static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 571 union tpacket_req_u *req_u)
572{
573 p1->feature_req_word = req_u->req3.tp_feature_req_word;
574}
575
576static void init_prb_bdqc(struct packet_sock *po,
577 struct packet_ring_buffer *rb,
578 struct pgv *pg_vec,
579 union tpacket_req_u *req_u, int tx_ring)
580{
bc59ba39 581 struct tpacket_kbdq_core *p1 = &rb->prb_bdqc;
582 struct tpacket_block_desc *pbd;
f6fb8f10 583
584 memset(p1, 0x0, sizeof(*p1));
585
586 p1->knxt_seq_num = 1;
587 p1->pkbdq = pg_vec;
bc59ba39 588 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 589 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 590 p1->kblk_size = req_u->req3.tp_block_size;
591 p1->knum_blocks = req_u->req3.tp_block_nr;
592 p1->hdrlen = po->tp_hdrlen;
593 p1->version = po->tp_version;
594 p1->last_kactive_blk_num = 0;
595 po->stats_u.stats3.tp_freeze_q_cnt = 0;
596 if (req_u->req3.tp_retire_blk_tov)
597 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
598 else
599 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
600 req_u->req3.tp_block_size);
601 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
602 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
603
604 prb_init_ft_ops(p1, req_u);
605 prb_setup_retire_blk_timer(po, tx_ring);
606 prb_open_block(p1, pbd);
607}
608
609/* Do NOT update the last_blk_num first.
610 * Assumes sk_buff_head lock is held.
611 */
bc59ba39 612static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 613{
614 mod_timer(&pkc->retire_blk_timer,
615 jiffies + pkc->tov_in_jiffies);
616 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
617}
618
619/*
620 * Timer logic:
621 * 1) We refresh the timer only when we open a block.
622 * By doing this we don't waste cycles refreshing the timer
 623 * on a packet-by-packet basis.
624 *
625 * With a 1MB block-size, on a 1Gbps line, it will take
626 * i) ~8 ms to fill a block + ii) memcpy etc.
627 * In this cut we are not accounting for the memcpy time.
628 *
629 * So, if the user sets the 'tmo' to 10ms then the timer
630 * will never fire while the block is still getting filled
631 * (which is what we want). However, the user could choose
632 * to close a block early and that's fine.
633 *
634 * But when the timer does fire, we check whether or not to refresh it.
635 * Since the tmo granularity is in msecs, it is not too expensive
 636 * to refresh the timer, let's say every '8' msecs.
637 * Either the user can set the 'tmo' or we can derive it based on
638 * a) line-speed and b) block-size.
639 * prb_calc_retire_blk_tmo() calculates the tmo.
640 *
641 */
642static void prb_retire_rx_blk_timer_expired(unsigned long data)
643{
644 struct packet_sock *po = (struct packet_sock *)data;
bc59ba39 645 struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc;
f6fb8f10 646 unsigned int frozen;
bc59ba39 647 struct tpacket_block_desc *pbd;
f6fb8f10 648
649 spin_lock(&po->sk.sk_receive_queue.lock);
650
651 frozen = prb_queue_frozen(pkc);
652 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
653
654 if (unlikely(pkc->delete_blk_timer))
655 goto out;
656
657 /* We only need to plug the race when the block is partially filled.
658 * tpacket_rcv:
659 * lock(); increment BLOCK_NUM_PKTS; unlock()
660 * copy_bits() is in progress ...
661 * timer fires on other cpu:
662 * we can't retire the current block because copy_bits
663 * is in progress.
664 *
665 */
666 if (BLOCK_NUM_PKTS(pbd)) {
667 while (atomic_read(&pkc->blk_fill_in_prog)) {
668 /* Waiting for skb_copy_bits to finish... */
669 cpu_relax();
670 }
671 }
672
673 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
674 if (!frozen) {
675 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
676 if (!prb_dispatch_next_block(pkc, po))
677 goto refresh_timer;
678 else
679 goto out;
680 } else {
681 /* Case 1. Queue was frozen because user-space was
682 * lagging behind.
683 */
684 if (prb_curr_blk_in_use(pkc, pbd)) {
685 /*
686 * Ok, user-space is still behind.
687 * So just refresh the timer.
688 */
689 goto refresh_timer;
690 } else {
 691 /* Case 2. The queue was frozen, user-space caught up,
 692 * now the link went idle && the timer fired.
 693 * We don't have a block to close, so we open this
 694 * block and restart the timer.
 695 * Opening a block thaws the queue and restarts the timer;
 696 * thawing/timer-refresh is a side effect.
697 */
698 prb_open_block(pkc, pbd);
699 goto out;
700 }
701 }
702 }
703
704refresh_timer:
705 _prb_refresh_rx_retire_blk_timer(pkc);
706
707out:
708 spin_unlock(&po->sk.sk_receive_queue.lock);
709}
710
eea49cc9 711static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 712 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 713{
714 /* Flush everything minus the block header */
715
716#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
717 u8 *start, *end;
718
719 start = (u8 *)pbd1;
720
 721 /* Skip the block header (we know the header WILL fit in 4K) */
722 start += PAGE_SIZE;
723
724 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
725 for (; start < end; start += PAGE_SIZE)
726 flush_dcache_page(pgv_to_page(start));
727
728 smp_wmb();
729#endif
730
731 /* Now update the block status. */
732
733 BLOCK_STATUS(pbd1) = status;
734
735 /* Flush the block header */
736
737#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
738 start = (u8 *)pbd1;
739 flush_dcache_page(pgv_to_page(start));
740
741 smp_wmb();
742#endif
743}
744
745/*
746 * Side effect:
747 *
748 * 1) flush the block
749 * 2) Increment active_blk_num
750 *
 751 * Note: We deliberately do NOT refresh the timer here,
 752 * because the next block will almost always be opened.
753 */
bc59ba39 754static void prb_close_block(struct tpacket_kbdq_core *pkc1,
755 struct tpacket_block_desc *pbd1,
f6fb8f10 756 struct packet_sock *po, unsigned int stat)
757{
758 __u32 status = TP_STATUS_USER | stat;
759
760 struct tpacket3_hdr *last_pkt;
bc59ba39 761 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 762
763 if (po->stats.tp_drops)
764 status |= TP_STATUS_LOSING;
765
766 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
767 last_pkt->tp_next_offset = 0;
768
769 /* Get the ts of the last pkt */
770 if (BLOCK_NUM_PKTS(pbd1)) {
771 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
772 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
773 } else {
774 /* Ok, we tmo'd - so get the current time */
775 struct timespec ts;
776 getnstimeofday(&ts);
777 h1->ts_last_pkt.ts_sec = ts.tv_sec;
778 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
779 }
780
781 smp_wmb();
782
783 /* Flush the block */
784 prb_flush_block(pkc1, pbd1, status);
785
786 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
787}
788
eea49cc9 789static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 790{
791 pkc->reset_pending_on_curr_blk = 0;
792}
793
794/*
795 * Side effect of opening a block:
796 *
797 * 1) prb_queue is thawed.
798 * 2) retire_blk_timer is refreshed.
799 *
800 */
bc59ba39 801static void prb_open_block(struct tpacket_kbdq_core *pkc1,
802 struct tpacket_block_desc *pbd1)
f6fb8f10 803{
804 struct timespec ts;
bc59ba39 805 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 806
807 smp_rmb();
808
809 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd1))) {
810
 811 /* We could have just memset this, but we would lose the
 812 * flexibility of making the priv area sticky
813 */
814 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
815 BLOCK_NUM_PKTS(pbd1) = 0;
816 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
817 getnstimeofday(&ts);
818 h1->ts_first_pkt.ts_sec = ts.tv_sec;
819 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
820 pkc1->pkblk_start = (char *)pbd1;
e3192690 821 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 822 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
823 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
824 pbd1->version = pkc1->version;
825 pkc1->prev = pkc1->nxt_offset;
826 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
827 prb_thaw_queue(pkc1);
828 _prb_refresh_rx_retire_blk_timer(pkc1);
829
830 smp_wmb();
831
832 return;
833 }
834
835 WARN(1, "ERROR block:%p is NOT FREE status:%d kactive_blk_num:%d\n",
836 pbd1, BLOCK_STATUS(pbd1), pkc1->kactive_blk_num);
837 dump_stack();
838 BUG();
839}
840
841/*
842 * Queue freeze logic:
843 * 1) Assume tp_block_nr = 8 blocks.
844 * 2) At time 't0', user opens Rx ring.
845 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
846 * 4) user-space is either sleeping or processing block '0'.
 847 * 5) tpacket_rcv is currently filling block '7'; since there is no space left,
 848 * it will close block-7, loop around and try to fill block '0'.
849 * call-flow:
850 * __packet_lookup_frame_in_block
851 * prb_retire_current_block()
852 * prb_dispatch_next_block()
853 * |->(BLOCK_STATUS == USER) evaluates to true
854 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
855 * 6) Now there are two cases:
856 * 6.1) Link goes idle right after the queue is frozen.
857 * But remember, the last open_block() refreshed the timer.
 858 * When this timer expires, it will refresh itself so that we can
859 * re-open block-0 in near future.
860 * 6.2) Link is busy and keeps on receiving packets. This is a simple
861 * case and __packet_lookup_frame_in_block will check if block-0
862 * is free and can now be re-used.
863 */
eea49cc9 864static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 865 struct packet_sock *po)
866{
867 pkc->reset_pending_on_curr_blk = 1;
868 po->stats_u.stats3.tp_freeze_q_cnt++;
869}
870
871#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
872
873/*
874 * If the next block is free then we will dispatch it
875 * and return a good offset.
876 * Else, we will freeze the queue.
877 * So, caller must check the return value.
878 */
bc59ba39 879static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 880 struct packet_sock *po)
881{
bc59ba39 882 struct tpacket_block_desc *pbd;
f6fb8f10 883
884 smp_rmb();
885
886 /* 1. Get current block num */
887 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
888
889 /* 2. If this block is currently in_use then freeze the queue */
890 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
891 prb_freeze_queue(pkc, po);
892 return NULL;
893 }
894
895 /*
896 * 3.
897 * open this block and return the offset where the first packet
898 * needs to get stored.
899 */
900 prb_open_block(pkc, pbd);
901 return (void *)pkc->nxt_offset;
902}
903
bc59ba39 904static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 905 struct packet_sock *po, unsigned int status)
906{
bc59ba39 907 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 908
909 /* retire/close the current block */
910 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
911 /*
912 * Plug the case where copy_bits() is in progress on
913 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
914 * have space to copy the pkt in the current block and
915 * called prb_retire_current_block()
916 *
917 * We don't need to worry about the TMO case because
918 * the timer-handler already handled this case.
919 */
920 if (!(status & TP_STATUS_BLK_TMO)) {
921 while (atomic_read(&pkc->blk_fill_in_prog)) {
922 /* Waiting for skb_copy_bits to finish... */
923 cpu_relax();
924 }
925 }
926 prb_close_block(pkc, pbd, po, status);
927 return;
928 }
929
930 WARN(1, "ERROR-pbd[%d]:%p\n", pkc->kactive_blk_num, pbd);
931 dump_stack();
932 BUG();
933}
934
eea49cc9 935static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
bc59ba39 936 struct tpacket_block_desc *pbd)
f6fb8f10 937{
938 return TP_STATUS_USER & BLOCK_STATUS(pbd);
939}
940
eea49cc9 941static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 942{
943 return pkc->reset_pending_on_curr_blk;
944}
945
eea49cc9 946static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 947{
bc59ba39 948 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 949 atomic_dec(&pkc->blk_fill_in_prog);
950}
951
eea49cc9 952static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 953 struct tpacket3_hdr *ppd)
954{
955 ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
956}
957
eea49cc9 958static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 959 struct tpacket3_hdr *ppd)
960{
961 ppd->hv1.tp_rxhash = 0;
962}
963
eea49cc9 964static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 965 struct tpacket3_hdr *ppd)
966{
967 if (vlan_tx_tag_present(pkc->skb)) {
968 ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
969 ppd->tp_status = TP_STATUS_VLAN_VALID;
970 } else {
971 ppd->hv1.tp_vlan_tci = ppd->tp_status = 0;
972 }
973}
974
bc59ba39 975static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 976 struct tpacket3_hdr *ppd)
977{
978 prb_fill_vlan_info(pkc, ppd);
979
980 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
981 prb_fill_rxhash(pkc, ppd);
982 else
983 prb_clear_rxhash(pkc, ppd);
984}
985
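/* Added note (hedged): the only feature bit handled by prb_run_all_ft_ops()
 * is TP_FT_REQ_FILL_RXHASH; user space requests it by setting
 * tpacket_req3.tp_feature_req_word = TP_FT_REQ_FILL_RXHASH before
 * PACKET_RX_RING and then reads the hash from tpacket3_hdr.hv1.tp_rxhash.
 */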
eea49cc9 986static void prb_fill_curr_block(char *curr,
bc59ba39 987 struct tpacket_kbdq_core *pkc,
988 struct tpacket_block_desc *pbd,
f6fb8f10 989 unsigned int len)
990{
991 struct tpacket3_hdr *ppd;
992
993 ppd = (struct tpacket3_hdr *)curr;
994 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
995 pkc->prev = curr;
996 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
997 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
998 BLOCK_NUM_PKTS(pbd) += 1;
999 atomic_inc(&pkc->blk_fill_in_prog);
1000 prb_run_all_ft_ops(pkc, ppd);
1001}
1002
1003/* Assumes caller has the sk->rx_queue.lock */
1004static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1005 struct sk_buff *skb,
1006 int status,
1007 unsigned int len
1008 )
1009{
bc59ba39 1010 struct tpacket_kbdq_core *pkc;
1011 struct tpacket_block_desc *pbd;
f6fb8f10 1012 char *curr, *end;
1013
e3192690 1014 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 1015 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1016
1017 /* Queue is frozen when user space is lagging behind */
1018 if (prb_queue_frozen(pkc)) {
1019 /*
 1020 * Check if the last block, which caused the queue to freeze,
 1021 * is still in use by user-space.
1022 */
1023 if (prb_curr_blk_in_use(pkc, pbd)) {
1024 /* Can't record this packet */
1025 return NULL;
1026 } else {
1027 /*
1028 * Ok, the block was released by user-space.
1029 * Now let's open that block.
 1030 * Opening a block also thaws the queue;
 1031 * thawing is a side effect.
1032 */
1033 prb_open_block(pkc, pbd);
1034 }
1035 }
1036
1037 smp_mb();
1038 curr = pkc->nxt_offset;
1039 pkc->skb = skb;
e3192690 1040 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 1041
1042 /* first try the current block */
1043 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1044 prb_fill_curr_block(curr, pkc, pbd, len);
1045 return (void *)curr;
1046 }
1047
1048 /* Ok, close the current block */
1049 prb_retire_current_block(pkc, po, 0);
1050
1051 /* Now, try to dispatch the next block */
1052 curr = (char *)prb_dispatch_next_block(pkc, po);
1053 if (curr) {
1054 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1055 prb_fill_curr_block(curr, pkc, pbd, len);
1056 return (void *)curr;
1057 }
1058
1059 /*
 1060 * No free blocks are available. User-space hasn't caught up yet.
 1061 * The queue was just frozen and now this packet will get dropped.
1062 */
1063 return NULL;
1064}
1065
eea49cc9 1066static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1067 struct sk_buff *skb,
1068 int status, unsigned int len)
1069{
1070 char *curr = NULL;
1071 switch (po->tp_version) {
1072 case TPACKET_V1:
1073 case TPACKET_V2:
1074 curr = packet_lookup_frame(po, &po->rx_ring,
1075 po->rx_ring.head, status);
1076 return curr;
1077 case TPACKET_V3:
1078 return __packet_lookup_frame_in_block(po, skb, status, len);
1079 default:
1080 WARN(1, "TPACKET version not supported\n");
1081 BUG();
99aa3473 1082 return NULL;
f6fb8f10 1083 }
1084}
1085
eea49cc9 1086static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1087 struct packet_ring_buffer *rb,
1088 unsigned int previous,
1089 int status)
1090{
bc59ba39 1091 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
1092 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, previous);
f6fb8f10 1093
1094 if (status != BLOCK_STATUS(pbd))
1095 return NULL;
1096 return pbd;
1097}
1098
eea49cc9 1099static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1100{
1101 unsigned int prev;
1102 if (rb->prb_bdqc.kactive_blk_num)
1103 prev = rb->prb_bdqc.kactive_blk_num-1;
1104 else
1105 prev = rb->prb_bdqc.knum_blocks-1;
1106 return prev;
1107}
1108
1109/* Assumes caller has held the rx_queue.lock */
eea49cc9 1110static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1111 struct packet_ring_buffer *rb,
1112 int status)
1113{
1114 unsigned int previous = prb_previous_blk_num(rb);
1115 return prb_lookup_block(po, rb, previous, status);
1116}
1117
eea49cc9 1118static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1119 struct packet_ring_buffer *rb,
1120 int status)
1121{
1122 if (po->tp_version <= TPACKET_V2)
1123 return packet_previous_frame(po, rb, status);
1124
1125 return __prb_previous_block(po, rb, status);
1126}
1127
eea49cc9 1128static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1129 struct packet_ring_buffer *rb)
1130{
1131 switch (po->tp_version) {
1132 case TPACKET_V1:
1133 case TPACKET_V2:
1134 return packet_increment_head(rb);
1135 case TPACKET_V3:
1136 default:
1137 WARN(1, "TPACKET version not supported.\n");
1138 BUG();
1139 return;
1140 }
1141}
1142
eea49cc9 1143static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1144 struct packet_ring_buffer *rb,
1145 int status)
1146{
1147 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1148 return packet_lookup_frame(po, rb, previous, status);
1149}
1150
eea49cc9 1151static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1152{
1153 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1154}
1155
1da177e4
LT
1156static void packet_sock_destruct(struct sock *sk)
1157{
ed85b565
RC
1158 skb_queue_purge(&sk->sk_error_queue);
1159
547b792c
IJ
1160 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1161 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1da177e4
LT
1162
1163 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1164 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1165 return;
1166 }
1167
17ab56a2 1168 sk_refcnt_debug_dec(sk);
1da177e4
LT
1169}
1170
dc99f600
DM
1171static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
1172{
1173 int x = atomic_read(&f->rr_cur) + 1;
1174
1175 if (x >= num)
1176 x = 0;
1177
1178 return x;
1179}
1180
1181static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
1182{
1183 u32 idx, hash = skb->rxhash;
1184
1185 idx = ((u64)hash * num) >> 32;
1186
1187 return f->arr[idx];
1188}
1189
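/* Added note (worked example): fanout_demux_hash() scales the 32-bit rxhash
 * into [0, num) without a modulo: idx = ((u64)hash * num) >> 32. E.g. with
 * num == 4 and hash == 0x80000000, idx == 2, so the upper half of the hash
 * space maps onto the upper half of the socket array.
 */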
1190static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
1191{
1192 int cur, old;
1193
1194 cur = atomic_read(&f->rr_cur);
1195 while ((old = atomic_cmpxchg(&f->rr_cur, cur,
1196 fanout_rr_next(f, num))) != cur)
1197 cur = old;
1198 return f->arr[cur];
1199}
1200
95ec3eb4
DM
1201static struct sock *fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
1202{
1203 unsigned int cpu = smp_processor_id();
1204
1205 return f->arr[cpu % num];
1206}
1207
95ec3eb4
DM
1208static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1209 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1210{
1211 struct packet_fanout *f = pt->af_packet_priv;
1212 unsigned int num = f->num_members;
1213 struct packet_sock *po;
1214 struct sock *sk;
1215
1216 if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
1217 !num) {
1218 kfree_skb(skb);
1219 return 0;
1220 }
1221
95ec3eb4
DM
1222 switch (f->type) {
1223 case PACKET_FANOUT_HASH:
1224 default:
1225 if (f->defrag) {
bc416d97 1226 skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
95ec3eb4
DM
1227 if (!skb)
1228 return 0;
1229 }
1230 skb_get_rxhash(skb);
1231 sk = fanout_demux_hash(f, skb, num);
1232 break;
1233 case PACKET_FANOUT_LB:
1234 sk = fanout_demux_lb(f, skb, num);
1235 break;
1236 case PACKET_FANOUT_CPU:
1237 sk = fanout_demux_cpu(f, skb, num);
1238 break;
dc99f600
DM
1239 }
1240
dc99f600
DM
1241 po = pkt_sk(sk);
1242
1243 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1244}
1245
1246static DEFINE_MUTEX(fanout_mutex);
1247static LIST_HEAD(fanout_list);
1248
1249static void __fanout_link(struct sock *sk, struct packet_sock *po)
1250{
1251 struct packet_fanout *f = po->fanout;
1252
1253 spin_lock(&f->lock);
1254 f->arr[f->num_members] = sk;
1255 smp_wmb();
1256 f->num_members++;
1257 spin_unlock(&f->lock);
1258}
1259
1260static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1261{
1262 struct packet_fanout *f = po->fanout;
1263 int i;
1264
1265 spin_lock(&f->lock);
1266 for (i = 0; i < f->num_members; i++) {
1267 if (f->arr[i] == sk)
1268 break;
1269 }
1270 BUG_ON(i >= f->num_members);
1271 f->arr[i] = f->arr[f->num_members - 1];
1272 f->num_members--;
1273 spin_unlock(&f->lock);
1274}
1275
7736d33f 1276static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600
DM
1277{
1278 struct packet_sock *po = pkt_sk(sk);
1279 struct packet_fanout *f, *match;
7736d33f
DM
1280 u8 type = type_flags & 0xff;
1281 u8 defrag = (type_flags & PACKET_FANOUT_FLAG_DEFRAG) ? 1 : 0;
dc99f600
DM
1282 int err;
1283
1284 switch (type) {
1285 case PACKET_FANOUT_HASH:
1286 case PACKET_FANOUT_LB:
95ec3eb4 1287 case PACKET_FANOUT_CPU:
dc99f600
DM
1288 break;
1289 default:
1290 return -EINVAL;
1291 }
1292
1293 if (!po->running)
1294 return -EINVAL;
1295
1296 if (po->fanout)
1297 return -EALREADY;
1298
1299 mutex_lock(&fanout_mutex);
1300 match = NULL;
1301 list_for_each_entry(f, &fanout_list, list) {
1302 if (f->id == id &&
1303 read_pnet(&f->net) == sock_net(sk)) {
1304 match = f;
1305 break;
1306 }
1307 }
afe62c68 1308 err = -EINVAL;
7736d33f 1309 if (match && match->defrag != defrag)
afe62c68 1310 goto out;
dc99f600 1311 if (!match) {
afe62c68 1312 err = -ENOMEM;
dc99f600 1313 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1314 if (!match)
1315 goto out;
1316 write_pnet(&match->net, sock_net(sk));
1317 match->id = id;
1318 match->type = type;
1319 match->defrag = defrag;
1320 atomic_set(&match->rr_cur, 0);
1321 INIT_LIST_HEAD(&match->list);
1322 spin_lock_init(&match->lock);
1323 atomic_set(&match->sk_ref, 0);
1324 match->prot_hook.type = po->prot_hook.type;
1325 match->prot_hook.dev = po->prot_hook.dev;
1326 match->prot_hook.func = packet_rcv_fanout;
1327 match->prot_hook.af_packet_priv = match;
1328 dev_add_pack(&match->prot_hook);
1329 list_add(&match->list, &fanout_list);
dc99f600 1330 }
afe62c68
ED
1331 err = -EINVAL;
1332 if (match->type == type &&
1333 match->prot_hook.type == po->prot_hook.type &&
1334 match->prot_hook.dev == po->prot_hook.dev) {
1335 err = -ENOSPC;
1336 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1337 __dev_remove_pack(&po->prot_hook);
1338 po->fanout = match;
1339 atomic_inc(&match->sk_ref);
1340 __fanout_link(sk, po);
1341 err = 0;
dc99f600
DM
1342 }
1343 }
afe62c68 1344out:
dc99f600
DM
1345 mutex_unlock(&fanout_mutex);
1346 return err;
1347}
1348
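/* Illustrative sketch (added, not kernel code): user space joins a fanout
 * group by packing the group id and mode into one integer, as expected by
 * fanout_add() above:
 *
 *	int arg = group_id | (PACKET_FANOUT_HASH << 16);
 *	// optionally: arg |= PACKET_FANOUT_FLAG_DEFRAG << 16;
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
 *
 * Every socket joining the same (net, id) pair must use a compatible
 * mode/defrag setting, which is what fanout_add() enforces.
 */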
1349static void fanout_release(struct sock *sk)
1350{
1351 struct packet_sock *po = pkt_sk(sk);
1352 struct packet_fanout *f;
1353
1354 f = po->fanout;
1355 if (!f)
1356 return;
1357
1358 po->fanout = NULL;
1359
1360 mutex_lock(&fanout_mutex);
1361 if (atomic_dec_and_test(&f->sk_ref)) {
1362 list_del(&f->list);
1363 dev_remove_pack(&f->prot_hook);
1364 kfree(f);
1365 }
1366 mutex_unlock(&fanout_mutex);
1367}
1da177e4 1368
90ddc4f0 1369static const struct proto_ops packet_ops;
1da177e4 1370
90ddc4f0 1371static const struct proto_ops packet_ops_spkt;
1da177e4 1372
40d4e3df
ED
1373static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1374 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1375{
1376 struct sock *sk;
1377 struct sockaddr_pkt *spkt;
1378
1379 /*
1380 * When we registered the protocol we saved the socket in the data
1381 * field for just this event.
1382 */
1383
1384 sk = pt->af_packet_priv;
1ce4f28b 1385
1da177e4
LT
1386 /*
1387 * Yank back the headers [hope the device set this
1388 * right or kerboom...]
1389 *
1390 * Incoming packets have ll header pulled,
1391 * push it back.
1392 *
98e399f8 1393 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
 1394 * so that this procedure is a noop.
1395 */
1396
1397 if (skb->pkt_type == PACKET_LOOPBACK)
1398 goto out;
1399
09ad9bc7 1400 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1401 goto out;
1402
40d4e3df
ED
1403 skb = skb_share_check(skb, GFP_ATOMIC);
1404 if (skb == NULL)
1da177e4
LT
1405 goto oom;
1406
1407 /* drop any routing info */
adf30907 1408 skb_dst_drop(skb);
1da177e4 1409
84531c24
PO
1410 /* drop conntrack reference */
1411 nf_reset(skb);
1412
ffbc6111 1413 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1414
98e399f8 1415 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1416
1417 /*
1418 * The SOCK_PACKET socket receives _all_ frames.
1419 */
1420
1421 spkt->spkt_family = dev->type;
1422 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1423 spkt->spkt_protocol = skb->protocol;
1424
1425 /*
1426 * Charge the memory to the socket. This is done specifically
 1427 * to prevent sockets from using up all the memory.
1428 */
1429
40d4e3df 1430 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1431 return 0;
1432
1433out:
1434 kfree_skb(skb);
1435oom:
1436 return 0;
1437}
1438
1439
1440/*
1441 * Output a raw packet to a device layer. This bypasses all the other
1442 * protocol layers and you must therefore supply it with a complete frame
1443 */
1ce4f28b 1444
1da177e4
LT
1445static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
1446 struct msghdr *msg, size_t len)
1447{
1448 struct sock *sk = sock->sk;
40d4e3df 1449 struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
1a35ca80 1450 struct sk_buff *skb = NULL;
1da177e4 1451 struct net_device *dev;
40d4e3df 1452 __be16 proto = 0;
1da177e4 1453 int err;
3bdc0eba 1454 int extra_len = 0;
1ce4f28b 1455
1da177e4 1456 /*
1ce4f28b 1457 * Get and verify the address.
1da177e4
LT
1458 */
1459
40d4e3df 1460 if (saddr) {
1da177e4 1461 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1462 return -EINVAL;
1463 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1464 proto = saddr->spkt_protocol;
1465 } else
1466 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1467
1468 /*
1ce4f28b 1469 * Find the device first to size check it
1da177e4
LT
1470 */
1471
de74e92a 1472 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1473retry:
654d1f8a
ED
1474 rcu_read_lock();
1475 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1476 err = -ENODEV;
1477 if (dev == NULL)
1478 goto out_unlock;
1ce4f28b 1479
d5e76b0a
DM
1480 err = -ENETDOWN;
1481 if (!(dev->flags & IFF_UP))
1482 goto out_unlock;
1483
1da177e4 1484 /*
40d4e3df
ED
1485 * You may not queue a frame bigger than the mtu. This is the lowest level
1486 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1487 */
1ce4f28b 1488
3bdc0eba
BG
1489 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1490 if (!netif_supports_nofcs(dev)) {
1491 err = -EPROTONOSUPPORT;
1492 goto out_unlock;
1493 }
1494 extra_len = 4; /* We're doing our own CRC */
1495 }
1496
1da177e4 1497 err = -EMSGSIZE;
3bdc0eba 1498 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1499 goto out_unlock;
1500
1a35ca80
ED
1501 if (!skb) {
1502 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1503 int tlen = dev->needed_tailroom;
1a35ca80
ED
1504 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1505
1506 rcu_read_unlock();
4ce40912 1507 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1508 if (skb == NULL)
1509 return -ENOBUFS;
1510 /* FIXME: Save some space for broken drivers that write a hard
1511 * header at transmission time by themselves. PPP is the notable
1512 * one here. This should really be fixed at the driver level.
1513 */
1514 skb_reserve(skb, reserved);
1515 skb_reset_network_header(skb);
1516
1517 /* Try to align data part correctly */
1518 if (hhlen) {
1519 skb->data -= hhlen;
1520 skb->tail -= hhlen;
1521 if (len < hhlen)
1522 skb_reset_network_header(skb);
1523 }
1524 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1525 if (err)
1526 goto out_free;
1527 goto retry;
1da177e4
LT
1528 }
1529
3bdc0eba 1530 if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
57f89bfa
BG
1531 /* Earlier code assumed this would be a VLAN pkt,
1532 * double-check this now that we have the actual
1533 * packet in hand.
1534 */
1535 struct ethhdr *ehdr;
1536 skb_reset_mac_header(skb);
1537 ehdr = eth_hdr(skb);
1538 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1539 err = -EMSGSIZE;
1540 goto out_unlock;
1541 }
1542 }
1a35ca80 1543
1da177e4
LT
1544 skb->protocol = proto;
1545 skb->dev = dev;
1546 skb->priority = sk->sk_priority;
2d37a186 1547 skb->mark = sk->sk_mark;
2244d07b 1548 err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
ed85b565
RC
1549 if (err < 0)
1550 goto out_unlock;
1da177e4 1551
3bdc0eba
BG
1552 if (unlikely(extra_len == 4))
1553 skb->no_fcs = 1;
1554
1da177e4 1555 dev_queue_xmit(skb);
654d1f8a 1556 rcu_read_unlock();
40d4e3df 1557 return len;
1da177e4 1558
1da177e4 1559out_unlock:
654d1f8a 1560 rcu_read_unlock();
1a35ca80
ED
1561out_free:
1562 kfree_skb(skb);
1da177e4
LT
1563 return err;
1564}
1da177e4 1565
eea49cc9 1566static unsigned int run_filter(const struct sk_buff *skb,
62ab0812 1567 const struct sock *sk,
dbcb5855 1568 unsigned int res)
1da177e4
LT
1569{
1570 struct sk_filter *filter;
fda9ef5d 1571
80f8f102
ED
1572 rcu_read_lock();
1573 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1574 if (filter != NULL)
0a14842f 1575 res = SK_RUN_FILTER(filter, skb);
80f8f102 1576 rcu_read_unlock();
1da177e4 1577
dbcb5855 1578 return res;
1da177e4
LT
1579}
1580
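/* Added note (hedged sketch): the filter consulted by run_filter() above is
 * the classic BPF program a user attaches with SO_ATTACH_FILTER, e.g.:
 *
 *	struct sock_filter code[] = { ... };	// e.g. generated by tcpdump -dd
 *	struct sock_fprog prog = {
 *		.len	= sizeof(code) / sizeof(code[0]),
 *		.filter	= code,
 *	};
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 *
 * run_filter() returns the filter's snap length (0 means drop), which the
 * callers use to trim the packet before queueing it.
 */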
1581/*
62ab0812
ED
 1582 * This function does lazy skb cloning in the hope that most packets
 1583 * are discarded by BPF.
 1584 *
 1585 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
 1586 * and skb->cb are mangled. It works because (and until) packets
 1587 * falling here are owned by the current CPU. Output packets are cloned
 1588 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 1589 * sequentially, so if we return the skb to its original state on exit,
 1590 * we will not harm anyone.
1da177e4
LT
1591 */
1592
40d4e3df
ED
1593static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1594 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1595{
1596 struct sock *sk;
1597 struct sockaddr_ll *sll;
1598 struct packet_sock *po;
40d4e3df 1599 u8 *skb_head = skb->data;
1da177e4 1600 int skb_len = skb->len;
dbcb5855 1601 unsigned int snaplen, res;
1da177e4
LT
1602
1603 if (skb->pkt_type == PACKET_LOOPBACK)
1604 goto drop;
1605
1606 sk = pt->af_packet_priv;
1607 po = pkt_sk(sk);
1608
09ad9bc7 1609 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1610 goto drop;
1611
1da177e4
LT
1612 skb->dev = dev;
1613
3b04ddde 1614 if (dev->header_ops) {
1da177e4 1615 /* The device has an explicit notion of ll header,
62ab0812
ED
1616 * exported to higher levels.
1617 *
1618 * Otherwise, the device hides details of its frame
 1619 * structure, so that the corresponding packet header is
 1620 * never delivered to the user.
1da177e4
LT
1621 */
1622 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1623 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1624 else if (skb->pkt_type == PACKET_OUTGOING) {
1625 /* Special case: outgoing packets have ll header at head */
bbe735e4 1626 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1627 }
1628 }
1629
1630 snaplen = skb->len;
1631
dbcb5855
DM
1632 res = run_filter(skb, sk, snaplen);
1633 if (!res)
fda9ef5d 1634 goto drop_n_restore;
dbcb5855
DM
1635 if (snaplen > res)
1636 snaplen = res;
1da177e4 1637
0fd7bac6 1638 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
1639 goto drop_n_acct;
1640
1641 if (skb_shared(skb)) {
1642 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1643 if (nskb == NULL)
1644 goto drop_n_acct;
1645
1646 if (skb_head != skb->data) {
1647 skb->data = skb_head;
1648 skb->len = skb_len;
1649 }
abc4e4fa 1650 consume_skb(skb);
1da177e4
LT
1651 skb = nskb;
1652 }
1653
ffbc6111
HX
1654 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
1655 sizeof(skb->cb));
1656
1657 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4
LT
1658 sll->sll_family = AF_PACKET;
1659 sll->sll_hatype = dev->type;
1660 sll->sll_protocol = skb->protocol;
1661 sll->sll_pkttype = skb->pkt_type;
8032b464 1662 if (unlikely(po->origdev))
80feaacb
PWJ
1663 sll->sll_ifindex = orig_dev->ifindex;
1664 else
1665 sll->sll_ifindex = dev->ifindex;
1da177e4 1666
b95cce35 1667 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 1668
ffbc6111 1669 PACKET_SKB_CB(skb)->origlen = skb->len;
8dc41944 1670
1da177e4
LT
1671 if (pskb_trim(skb, snaplen))
1672 goto drop_n_acct;
1673
1674 skb_set_owner_r(skb, sk);
1675 skb->dev = NULL;
adf30907 1676 skb_dst_drop(skb);
1da177e4 1677
84531c24
PO
1678 /* drop conntrack reference */
1679 nf_reset(skb);
1680
1da177e4
LT
1681 spin_lock(&sk->sk_receive_queue.lock);
1682 po->stats.tp_packets++;
3b885787 1683 skb->dropcount = atomic_read(&sk->sk_drops);
1da177e4
LT
1684 __skb_queue_tail(&sk->sk_receive_queue, skb);
1685 spin_unlock(&sk->sk_receive_queue.lock);
1686 sk->sk_data_ready(sk, skb->len);
1687 return 0;
1688
1689drop_n_acct:
7091fbd8
WB
1690 spin_lock(&sk->sk_receive_queue.lock);
1691 po->stats.tp_drops++;
1692 atomic_inc(&sk->sk_drops);
1693 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
1694
1695drop_n_restore:
1696 if (skb_head != skb->data && skb_shared(skb)) {
1697 skb->data = skb_head;
1698 skb->len = skb_len;
1699 }
1700drop:
ead2ceb0 1701 consume_skb(skb);
1da177e4
LT
1702 return 0;
1703}
1704
40d4e3df
ED
1705static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1706 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1707{
1708 struct sock *sk;
1709 struct packet_sock *po;
1710 struct sockaddr_ll *sll;
bbd6ef87
PM
1711 union {
1712 struct tpacket_hdr *h1;
1713 struct tpacket2_hdr *h2;
f6fb8f10 1714 struct tpacket3_hdr *h3;
bbd6ef87
PM
1715 void *raw;
1716 } h;
40d4e3df 1717 u8 *skb_head = skb->data;
1da177e4 1718 int skb_len = skb->len;
dbcb5855 1719 unsigned int snaplen, res;
f6fb8f10 1720 unsigned long status = TP_STATUS_USER;
bbd6ef87 1721 unsigned short macoff, netoff, hdrlen;
1da177e4 1722 struct sk_buff *copy_skb = NULL;
b7aa0bf7 1723 struct timeval tv;
bbd6ef87 1724 struct timespec ts;
614f60fa 1725 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
1da177e4
LT
1726
1727 if (skb->pkt_type == PACKET_LOOPBACK)
1728 goto drop;
1729
1730 sk = pt->af_packet_priv;
1731 po = pkt_sk(sk);
1732
09ad9bc7 1733 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1734 goto drop;
1735
3b04ddde 1736 if (dev->header_ops) {
1da177e4 1737 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1738 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1739 else if (skb->pkt_type == PACKET_OUTGOING) {
1740 /* Special case: outgoing packets have ll header at head */
bbe735e4 1741 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1742 }
1743 }
1744
8dc41944
HX
1745 if (skb->ip_summed == CHECKSUM_PARTIAL)
1746 status |= TP_STATUS_CSUMNOTREADY;
1747
1da177e4
LT
1748 snaplen = skb->len;
1749
dbcb5855
DM
1750 res = run_filter(skb, sk, snaplen);
1751 if (!res)
fda9ef5d 1752 goto drop_n_restore;
dbcb5855
DM
1753 if (snaplen > res)
1754 snaplen = res;
1da177e4
LT
1755
1756 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
1757 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1758 po->tp_reserve;
1da177e4 1759 } else {
95c96174 1760 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 1761 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
1762 (maclen < 16 ? 16 : maclen)) +
1763 po->tp_reserve;
1da177e4
LT
1764 macoff = netoff - maclen;
1765 }
f6fb8f10 1766 if (po->tp_version <= TPACKET_V2) {
1767 if (macoff + snaplen > po->rx_ring.frame_size) {
1768 if (po->copy_thresh &&
0fd7bac6 1769 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 1770 if (skb_shared(skb)) {
1771 copy_skb = skb_clone(skb, GFP_ATOMIC);
1772 } else {
1773 copy_skb = skb_get(skb);
1774 skb_head = skb->data;
1775 }
1776 if (copy_skb)
1777 skb_set_owner_r(copy_skb, sk);
1da177e4 1778 }
f6fb8f10 1779 snaplen = po->rx_ring.frame_size - macoff;
1780 if ((int)snaplen < 0)
1781 snaplen = 0;
1da177e4 1782 }
1da177e4 1783 }
1da177e4 1784 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 1785 h.raw = packet_current_rx_frame(po, skb,
1786 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 1787 if (!h.raw)
1da177e4 1788 goto ring_is_full;
f6fb8f10 1789 if (po->tp_version <= TPACKET_V2) {
1790 packet_increment_rx_head(po, &po->rx_ring);
1791 /*
1792 * LOSING will be reported till you read the stats,
1793 * because it's COR - Clear On Read.
 1794 * Anyway, moving it for V1/V2 only as V3 doesn't need this
1795 * at packet level.
1796 */
1797 if (po->stats.tp_drops)
1798 status |= TP_STATUS_LOSING;
1799 }
1da177e4
LT
1800 po->stats.tp_packets++;
1801 if (copy_skb) {
1802 status |= TP_STATUS_COPY;
1803 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1804 }
1da177e4
LT
1805 spin_unlock(&sk->sk_receive_queue.lock);
1806
bbd6ef87 1807 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
1da177e4 1808
bbd6ef87
PM
1809 switch (po->tp_version) {
1810 case TPACKET_V1:
1811 h.h1->tp_len = skb->len;
1812 h.h1->tp_snaplen = snaplen;
1813 h.h1->tp_mac = macoff;
1814 h.h1->tp_net = netoff;
614f60fa
SM
1815 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1816 && shhwtstamps->syststamp.tv64)
1817 tv = ktime_to_timeval(shhwtstamps->syststamp);
1818 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1819 && shhwtstamps->hwtstamp.tv64)
1820 tv = ktime_to_timeval(shhwtstamps->hwtstamp);
1821 else if (skb->tstamp.tv64)
bbd6ef87
PM
1822 tv = ktime_to_timeval(skb->tstamp);
1823 else
1824 do_gettimeofday(&tv);
1825 h.h1->tp_sec = tv.tv_sec;
1826 h.h1->tp_usec = tv.tv_usec;
1827 hdrlen = sizeof(*h.h1);
1828 break;
1829 case TPACKET_V2:
1830 h.h2->tp_len = skb->len;
1831 h.h2->tp_snaplen = snaplen;
1832 h.h2->tp_mac = macoff;
1833 h.h2->tp_net = netoff;
614f60fa
SM
1834 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1835 && shhwtstamps->syststamp.tv64)
1836 ts = ktime_to_timespec(shhwtstamps->syststamp);
1837 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1838 && shhwtstamps->hwtstamp.tv64)
1839 ts = ktime_to_timespec(shhwtstamps->hwtstamp);
1840 else if (skb->tstamp.tv64)
bbd6ef87
PM
1841 ts = ktime_to_timespec(skb->tstamp);
1842 else
1843 getnstimeofday(&ts);
1844 h.h2->tp_sec = ts.tv_sec;
1845 h.h2->tp_nsec = ts.tv_nsec;
a3bcc23e
BG
1846 if (vlan_tx_tag_present(skb)) {
1847 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
1848 status |= TP_STATUS_VLAN_VALID;
1849 } else {
1850 h.h2->tp_vlan_tci = 0;
1851 }
13fcb7bd 1852 h.h2->tp_padding = 0;
bbd6ef87
PM
1853 hdrlen = sizeof(*h.h2);
1854 break;
f6fb8f10 1855 case TPACKET_V3:
 1856		/* tp_nxt_offset and vlan are already populated above,
 1857		 * so don't clear those fields here.
1858 */
1859 h.h3->tp_status |= status;
1860 h.h3->tp_len = skb->len;
1861 h.h3->tp_snaplen = snaplen;
1862 h.h3->tp_mac = macoff;
1863 h.h3->tp_net = netoff;
1864 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1865 && shhwtstamps->syststamp.tv64)
1866 ts = ktime_to_timespec(shhwtstamps->syststamp);
1867 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1868 && shhwtstamps->hwtstamp.tv64)
1869 ts = ktime_to_timespec(shhwtstamps->hwtstamp);
1870 else if (skb->tstamp.tv64)
1871 ts = ktime_to_timespec(skb->tstamp);
1872 else
1873 getnstimeofday(&ts);
1874 h.h3->tp_sec = ts.tv_sec;
1875 h.h3->tp_nsec = ts.tv_nsec;
1876 hdrlen = sizeof(*h.h3);
1877 break;
bbd6ef87
PM
1878 default:
1879 BUG();
1880 }
1da177e4 1881
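	/*
	 * A sockaddr_ll describing where the packet came from is stored
	 * right after the (TPACKET_ALIGNed) ring header, so userspace can
	 * read the link-level address info straight out of the mapped frame.
	 */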
bbd6ef87 1882 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 1883 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
1884 sll->sll_family = AF_PACKET;
1885 sll->sll_hatype = dev->type;
1886 sll->sll_protocol = skb->protocol;
1887 sll->sll_pkttype = skb->pkt_type;
8032b464 1888 if (unlikely(po->origdev))
80feaacb
PWJ
1889 sll->sll_ifindex = orig_dev->ifindex;
1890 else
1891 sll->sll_ifindex = dev->ifindex;
1da177e4 1892
e16aa207 1893 smp_mb();
f6dafa95 1894#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
1da177e4 1895 {
0af55bb5
CG
1896 u8 *start, *end;
1897
f6fb8f10 1898 if (po->tp_version <= TPACKET_V2) {
1899 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
1900 + macoff + snaplen);
1901 for (start = h.raw; start < end; start += PAGE_SIZE)
1902 flush_dcache_page(pgv_to_page(start));
1903 }
cc9f01b2 1904 smp_wmb();
1da177e4 1905 }
f6dafa95 1906#endif
f6fb8f10 1907 if (po->tp_version <= TPACKET_V2)
1908 __packet_set_status(po, h.raw, status);
1909 else
1910 prb_clear_blk_fill_status(&po->rx_ring);
1da177e4
LT
1911
1912 sk->sk_data_ready(sk, 0);
1913
1914drop_n_restore:
1915 if (skb_head != skb->data && skb_shared(skb)) {
1916 skb->data = skb_head;
1917 skb->len = skb_len;
1918 }
1919drop:
1ce4f28b 1920 kfree_skb(skb);
1da177e4
LT
1921 return 0;
1922
1923ring_is_full:
1924 po->stats.tp_drops++;
1925 spin_unlock(&sk->sk_receive_queue.lock);
1926
1927 sk->sk_data_ready(sk, 0);
acb5d75b 1928 kfree_skb(copy_skb);
1da177e4
LT
1929 goto drop_n_restore;
1930}
1931
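/*
 * Destructor for skbs built from TX ring frames: once the device is done
 * with the skb, drop the pending count and hand the frame back to
 * userspace by marking it TP_STATUS_AVAILABLE.
 */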
69e3c75f
JB
1932static void tpacket_destruct_skb(struct sk_buff *skb)
1933{
1934 struct packet_sock *po = pkt_sk(skb->sk);
40d4e3df 1935 void *ph;
1da177e4 1936
69e3c75f
JB
1937 if (likely(po->tx_ring.pg_vec)) {
1938 ph = skb_shinfo(skb)->destructor_arg;
69e3c75f
JB
1939 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
1940 atomic_dec(&po->tx_ring.pending);
1941 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
1942 }
1943
1944 sock_wfree(skb);
1945}
1946
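/*
 * Build an skb from one TX ring frame.  For SOCK_DGRAM the device header
 * is generated with dev_hard_header(); for raw sockets it is copied from
 * the frame itself.  The payload is not copied: the ring pages are
 * attached to the skb as page fragments.
 */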
40d4e3df
ED
1947static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
1948 void *frame, struct net_device *dev, int size_max,
ae641949 1949 __be16 proto, unsigned char *addr, int hlen)
69e3c75f
JB
1950{
1951 union {
1952 struct tpacket_hdr *h1;
1953 struct tpacket2_hdr *h2;
1954 void *raw;
1955 } ph;
1956 int to_write, offset, len, tp_len, nr_frags, len_max;
1957 struct socket *sock = po->sk.sk_socket;
1958 struct page *page;
1959 void *data;
1960 int err;
1961
1962 ph.raw = frame;
1963
1964 skb->protocol = proto;
1965 skb->dev = dev;
1966 skb->priority = po->sk.sk_priority;
2d37a186 1967 skb->mark = po->sk.sk_mark;
69e3c75f
JB
1968 skb_shinfo(skb)->destructor_arg = ph.raw;
1969
1970 switch (po->tp_version) {
1971 case TPACKET_V2:
1972 tp_len = ph.h2->tp_len;
1973 break;
1974 default:
1975 tp_len = ph.h1->tp_len;
1976 break;
1977 }
1978 if (unlikely(tp_len > size_max)) {
40d4e3df 1979 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
69e3c75f
JB
1980 return -EMSGSIZE;
1981 }
1982
ae641949 1983 skb_reserve(skb, hlen);
69e3c75f
JB
1984 skb_reset_network_header(skb);
1985
1986 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
1987 to_write = tp_len;
1988
1989 if (sock->type == SOCK_DGRAM) {
1990 err = dev_hard_header(skb, dev, ntohs(proto), addr,
1991 NULL, tp_len);
1992 if (unlikely(err < 0))
1993 return -EINVAL;
40d4e3df 1994 } else if (dev->hard_header_len) {
69e3c75f
JB
1995 /* net device doesn't like empty head */
1996 if (unlikely(tp_len <= dev->hard_header_len)) {
40d4e3df
ED
1997 pr_err("packet size is too short (%d < %d)\n",
1998 tp_len, dev->hard_header_len);
69e3c75f
JB
1999 return -EINVAL;
2000 }
2001
2002 skb_push(skb, dev->hard_header_len);
2003 err = skb_store_bits(skb, 0, data,
2004 dev->hard_header_len);
2005 if (unlikely(err))
2006 return err;
2007
2008 data += dev->hard_header_len;
2009 to_write -= dev->hard_header_len;
2010 }
2011
2012 err = -EFAULT;
69e3c75f
JB
2013 offset = offset_in_page(data);
2014 len_max = PAGE_SIZE - offset;
2015 len = ((to_write > len_max) ? len_max : to_write);
2016
2017 skb->data_len = to_write;
2018 skb->len += to_write;
2019 skb->truesize += to_write;
2020 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2021
2022 while (likely(to_write)) {
2023 nr_frags = skb_shinfo(skb)->nr_frags;
2024
2025 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
 2026			pr_err("Packet exceeds the number of skb frags (%lu)\n",
2027 MAX_SKB_FRAGS);
69e3c75f
JB
2028 return -EFAULT;
2029 }
2030
0af55bb5
CG
2031 page = pgv_to_page(data);
2032 data += len;
69e3c75f
JB
2033 flush_dcache_page(page);
2034 get_page(page);
0af55bb5 2035 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2036 to_write -= len;
2037 offset = 0;
2038 len_max = PAGE_SIZE;
2039 len = ((to_write > len_max) ? len_max : to_write);
2040 }
2041
2042 return tp_len;
2043}
2044
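/*
 * Transmit path for the TX ring: walk the ring, turn every frame in
 * TP_STATUS_SEND_REQUEST state into an skb and pass it to
 * dev_queue_xmit().  With po->tp_loss set, frames that fail to fill are
 * skipped and the ring keeps moving; otherwise the error is reported in
 * the frame status and the send aborts.
 */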
2045static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2046{
69e3c75f
JB
2047 struct sk_buff *skb;
2048 struct net_device *dev;
2049 __be16 proto;
827d9780
BG
2050 bool need_rls_dev = false;
2051 int err, reserve = 0;
40d4e3df
ED
2052 void *ph;
2053 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
69e3c75f
JB
2054 int tp_len, size_max;
2055 unsigned char *addr;
2056 int len_sum = 0;
2057 int status = 0;
ae641949 2058 int hlen, tlen;
69e3c75f 2059
69e3c75f
JB
2060 mutex_lock(&po->pg_vec_lock);
2061
2062 err = -EBUSY;
2063 if (saddr == NULL) {
827d9780 2064 dev = po->prot_hook.dev;
69e3c75f
JB
2065 proto = po->num;
2066 addr = NULL;
2067 } else {
2068 err = -EINVAL;
2069 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2070 goto out;
2071 if (msg->msg_namelen < (saddr->sll_halen
2072 + offsetof(struct sockaddr_ll,
2073 sll_addr)))
2074 goto out;
69e3c75f
JB
2075 proto = saddr->sll_protocol;
2076 addr = saddr->sll_addr;
827d9780
BG
2077 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2078 need_rls_dev = true;
69e3c75f
JB
2079 }
2080
69e3c75f
JB
2081 err = -ENXIO;
2082 if (unlikely(dev == NULL))
2083 goto out;
2084
2085 reserve = dev->hard_header_len;
2086
2087 err = -ENETDOWN;
2088 if (unlikely(!(dev->flags & IFF_UP)))
2089 goto out_put;
2090
2091 size_max = po->tx_ring.frame_size
b5dd884e 2092 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f
JB
2093
2094 if (size_max > dev->mtu + reserve)
2095 size_max = dev->mtu + reserve;
2096
2097 do {
2098 ph = packet_current_frame(po, &po->tx_ring,
2099 TP_STATUS_SEND_REQUEST);
2100
2101 if (unlikely(ph == NULL)) {
2102 schedule();
2103 continue;
2104 }
2105
2106 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2107 hlen = LL_RESERVED_SPACE(dev);
2108 tlen = dev->needed_tailroom;
69e3c75f 2109 skb = sock_alloc_send_skb(&po->sk,
ae641949 2110 hlen + tlen + sizeof(struct sockaddr_ll),
69e3c75f
JB
2111 0, &err);
2112
2113 if (unlikely(skb == NULL))
2114 goto out_status;
2115
2116 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
ae641949 2117 addr, hlen);
69e3c75f
JB
2118
2119 if (unlikely(tp_len < 0)) {
2120 if (po->tp_loss) {
2121 __packet_set_status(po, ph,
2122 TP_STATUS_AVAILABLE);
2123 packet_increment_head(&po->tx_ring);
2124 kfree_skb(skb);
2125 continue;
2126 } else {
2127 status = TP_STATUS_WRONG_FORMAT;
2128 err = tp_len;
2129 goto out_status;
2130 }
2131 }
2132
2133 skb->destructor = tpacket_destruct_skb;
2134 __packet_set_status(po, ph, TP_STATUS_SENDING);
2135 atomic_inc(&po->tx_ring.pending);
2136
2137 status = TP_STATUS_SEND_REQUEST;
2138 err = dev_queue_xmit(skb);
eb70df13
JP
2139 if (unlikely(err > 0)) {
2140 err = net_xmit_errno(err);
2141 if (err && __packet_get_status(po, ph) ==
2142 TP_STATUS_AVAILABLE) {
2143 /* skb was destructed already */
2144 skb = NULL;
2145 goto out_status;
2146 }
2147 /*
2148 * skb was dropped but not destructed yet;
2149 * let's treat it like congestion or err < 0
2150 */
2151 err = 0;
2152 }
69e3c75f
JB
2153 packet_increment_head(&po->tx_ring);
2154 len_sum += tp_len;
f64f9e71
JP
2155 } while (likely((ph != NULL) ||
2156 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
2157 (atomic_read(&po->tx_ring.pending))))
2158 );
69e3c75f
JB
2159
2160 err = len_sum;
2161 goto out_put;
2162
69e3c75f
JB
2163out_status:
2164 __packet_set_status(po, ph, status);
2165 kfree_skb(skb);
2166out_put:
827d9780
BG
2167 if (need_rls_dev)
2168 dev_put(dev);
69e3c75f
JB
2169out:
2170 mutex_unlock(&po->pg_vec_lock);
2171 return err;
2172}
69e3c75f 2173
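/*
 * Allocate an skb for the non-ring send path: the first 'linear' bytes
 * are linear, the rest of 'len' is left as paged data, so a large write
 * does not require one large contiguous allocation.
 */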
eea49cc9
OJ
2174static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2175 size_t reserve, size_t len,
2176 size_t linear, int noblock,
2177 int *err)
bfd5f4a3
SS
2178{
2179 struct sk_buff *skb;
2180
2181 /* Under a page? Don't bother with paged skb. */
2182 if (prepad + len < PAGE_SIZE || !linear)
2183 linear = len;
2184
2185 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2186 err);
2187 if (!skb)
2188 return NULL;
2189
2190 skb_reserve(skb, reserve);
2191 skb_put(skb, linear);
2192 skb->data_len = len - linear;
2193 skb->len += len - linear;
2194
2195 return skb;
2196}
2197
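/*
 * The regular (non mmap-ring) sendmsg() path: resolve the target device,
 * optionally parse a virtio_net_hdr for checksum/GSO offload when
 * PACKET_VNET_HDR is enabled, copy the payload from the iovec and hand
 * the skb to dev_queue_xmit().
 */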
69e3c75f 2198static int packet_snd(struct socket *sock,
1da177e4
LT
2199 struct msghdr *msg, size_t len)
2200{
2201 struct sock *sk = sock->sk;
40d4e3df 2202 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1da177e4
LT
2203 struct sk_buff *skb;
2204 struct net_device *dev;
0e11c91e 2205 __be16 proto;
827d9780 2206 bool need_rls_dev = false;
1da177e4 2207 unsigned char *addr;
827d9780 2208 int err, reserve = 0;
bfd5f4a3
SS
2209 struct virtio_net_hdr vnet_hdr = { 0 };
2210 int offset = 0;
2211 int vnet_hdr_len;
2212 struct packet_sock *po = pkt_sk(sk);
2213 unsigned short gso_type = 0;
ae641949 2214 int hlen, tlen;
3bdc0eba 2215 int extra_len = 0;
1da177e4
LT
2216
2217 /*
1ce4f28b 2218 * Get and verify the address.
1da177e4 2219 */
1ce4f28b 2220
1da177e4 2221 if (saddr == NULL) {
827d9780 2222 dev = po->prot_hook.dev;
1da177e4
LT
2223 proto = po->num;
2224 addr = NULL;
2225 } else {
2226 err = -EINVAL;
2227 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2228 goto out;
0fb375fb
EB
2229 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2230 goto out;
1da177e4
LT
2231 proto = saddr->sll_protocol;
2232 addr = saddr->sll_addr;
827d9780
BG
2233 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2234 need_rls_dev = true;
1da177e4
LT
2235 }
2236
1da177e4
LT
2237 err = -ENXIO;
2238 if (dev == NULL)
2239 goto out_unlock;
2240 if (sock->type == SOCK_RAW)
2241 reserve = dev->hard_header_len;
2242
d5e76b0a
DM
2243 err = -ENETDOWN;
2244 if (!(dev->flags & IFF_UP))
2245 goto out_unlock;
2246
bfd5f4a3
SS
2247 if (po->has_vnet_hdr) {
2248 vnet_hdr_len = sizeof(vnet_hdr);
2249
2250 err = -EINVAL;
2251 if (len < vnet_hdr_len)
2252 goto out_unlock;
2253
2254 len -= vnet_hdr_len;
2255
2256 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
2257 vnet_hdr_len);
2258 if (err < 0)
2259 goto out_unlock;
2260
2261 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2262 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
2263 vnet_hdr.hdr_len))
2264 vnet_hdr.hdr_len = vnet_hdr.csum_start +
2265 vnet_hdr.csum_offset + 2;
2266
2267 err = -EINVAL;
2268 if (vnet_hdr.hdr_len > len)
2269 goto out_unlock;
2270
2271 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2272 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2273 case VIRTIO_NET_HDR_GSO_TCPV4:
2274 gso_type = SKB_GSO_TCPV4;
2275 break;
2276 case VIRTIO_NET_HDR_GSO_TCPV6:
2277 gso_type = SKB_GSO_TCPV6;
2278 break;
2279 case VIRTIO_NET_HDR_GSO_UDP:
2280 gso_type = SKB_GSO_UDP;
2281 break;
2282 default:
2283 goto out_unlock;
2284 }
2285
2286 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2287 gso_type |= SKB_GSO_TCP_ECN;
2288
2289 if (vnet_hdr.gso_size == 0)
2290 goto out_unlock;
2291
2292 }
2293 }
2294
3bdc0eba
BG
2295 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2296 if (!netif_supports_nofcs(dev)) {
2297 err = -EPROTONOSUPPORT;
2298 goto out_unlock;
2299 }
2300 extra_len = 4; /* We're doing our own CRC */
2301 }
2302
1da177e4 2303 err = -EMSGSIZE;
3bdc0eba 2304 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2305 goto out_unlock;
2306
bfd5f4a3 2307 err = -ENOBUFS;
ae641949
HX
2308 hlen = LL_RESERVED_SPACE(dev);
2309 tlen = dev->needed_tailroom;
2310 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,
bfd5f4a3 2311 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2312 if (skb == NULL)
1da177e4
LT
2313 goto out_unlock;
2314
bfd5f4a3 2315 skb_set_network_header(skb, reserve);
1da177e4 2316
0c4e8581
SH
2317 err = -EINVAL;
2318 if (sock->type == SOCK_DGRAM &&
bfd5f4a3 2319 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
0c4e8581 2320 goto out_free;
1da177e4
LT
2321
2322 /* Returns -EFAULT on error */
bfd5f4a3 2323 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1da177e4
LT
2324 if (err)
2325 goto out_free;
2244d07b 2326 err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
ed85b565
RC
2327 if (err < 0)
2328 goto out_free;
1da177e4 2329
3bdc0eba 2330 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
57f89bfa
BG
2331 /* Earlier code assumed this would be a VLAN pkt,
2332 * double-check this now that we have the actual
2333 * packet in hand.
2334 */
2335 struct ethhdr *ehdr;
2336 skb_reset_mac_header(skb);
2337 ehdr = eth_hdr(skb);
2338 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2339 err = -EMSGSIZE;
2340 goto out_free;
2341 }
2342 }
2343
1da177e4
LT
2344 skb->protocol = proto;
2345 skb->dev = dev;
2346 skb->priority = sk->sk_priority;
2d37a186 2347 skb->mark = sk->sk_mark;
1da177e4 2348
bfd5f4a3
SS
2349 if (po->has_vnet_hdr) {
2350 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2351 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
2352 vnet_hdr.csum_offset)) {
2353 err = -EINVAL;
2354 goto out_free;
2355 }
2356 }
2357
2358 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
2359 skb_shinfo(skb)->gso_type = gso_type;
2360
2361 /* Header must be checked, and gso_segs computed. */
2362 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2363 skb_shinfo(skb)->gso_segs = 0;
2364
2365 len += vnet_hdr_len;
2366 }
2367
3bdc0eba
BG
2368 if (unlikely(extra_len == 4))
2369 skb->no_fcs = 1;
2370
1da177e4
LT
2371 /*
2372 * Now send it
2373 */
2374
2375 err = dev_queue_xmit(skb);
2376 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2377 goto out_unlock;
2378
827d9780
BG
2379 if (need_rls_dev)
2380 dev_put(dev);
1da177e4 2381
40d4e3df 2382 return len;
1da177e4
LT
2383
2384out_free:
2385 kfree_skb(skb);
2386out_unlock:
827d9780 2387 if (dev && need_rls_dev)
1da177e4
LT
2388 dev_put(dev);
2389out:
2390 return err;
2391}
2392
69e3c75f
JB
2393static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2394 struct msghdr *msg, size_t len)
2395{
69e3c75f
JB
2396 struct sock *sk = sock->sk;
2397 struct packet_sock *po = pkt_sk(sk);
2398 if (po->tx_ring.pg_vec)
2399 return tpacket_snd(po, msg);
2400 else
69e3c75f
JB
2401 return packet_snd(sock, msg, len);
2402}
2403
1da177e4
LT
2404/*
2405 * Close a PACKET socket. This is fairly simple. We immediately go
2406 * to 'closed' state and remove our protocol entry in the device list.
2407 */
2408
2409static int packet_release(struct socket *sock)
2410{
2411 struct sock *sk = sock->sk;
2412 struct packet_sock *po;
d12d01d6 2413 struct net *net;
f6fb8f10 2414 union tpacket_req_u req_u;
1da177e4
LT
2415
2416 if (!sk)
2417 return 0;
2418
3b1e0a65 2419 net = sock_net(sk);
1da177e4
LT
2420 po = pkt_sk(sk);
2421
808f5114 2422 spin_lock_bh(&net->packet.sklist_lock);
2423 sk_del_node_init_rcu(sk);
920de804 2424 sock_prot_inuse_add(net, sk->sk_prot, -1);
808f5114 2425 spin_unlock_bh(&net->packet.sklist_lock);
1da177e4 2426
808f5114 2427 spin_lock(&po->bind_lock);
ce06b03e 2428 unregister_prot_hook(sk, false);
160ff18a
BG
2429 if (po->prot_hook.dev) {
2430 dev_put(po->prot_hook.dev);
2431 po->prot_hook.dev = NULL;
2432 }
808f5114 2433 spin_unlock(&po->bind_lock);
1da177e4 2434
1da177e4 2435 packet_flush_mclist(sk);
1da177e4 2436
f6fb8f10 2437 memset(&req_u, 0, sizeof(req_u));
69e3c75f
JB
2438
2439 if (po->rx_ring.pg_vec)
f6fb8f10 2440 packet_set_ring(sk, &req_u, 1, 0);
69e3c75f
JB
2441
2442 if (po->tx_ring.pg_vec)
f6fb8f10 2443 packet_set_ring(sk, &req_u, 1, 1);
1da177e4 2444
dc99f600
DM
2445 fanout_release(sk);
2446
808f5114 2447 synchronize_net();
1da177e4
LT
2448 /*
2449 * Now the socket is dead. No more input will appear.
2450 */
1da177e4
LT
2451 sock_orphan(sk);
2452 sock->sk = NULL;
2453
2454 /* Purge queues */
2455
2456 skb_queue_purge(&sk->sk_receive_queue);
17ab56a2 2457 sk_refcnt_debug_release(sk);
1da177e4
LT
2458
2459 sock_put(sk);
2460 return 0;
2461}
2462
2463/*
2464 * Attach a packet hook.
2465 */
2466
0e11c91e 2467static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1da177e4
LT
2468{
2469 struct packet_sock *po = pkt_sk(sk);
dc99f600 2470
aef950b4
WY
2471 if (po->fanout) {
2472 if (dev)
2473 dev_put(dev);
2474
dc99f600 2475 return -EINVAL;
aef950b4 2476 }
1da177e4
LT
2477
2478 lock_sock(sk);
2479
2480 spin_lock(&po->bind_lock);
ce06b03e 2481 unregister_prot_hook(sk, true);
1da177e4
LT
2482 po->num = protocol;
2483 po->prot_hook.type = protocol;
160ff18a
BG
2484 if (po->prot_hook.dev)
2485 dev_put(po->prot_hook.dev);
1da177e4
LT
2486 po->prot_hook.dev = dev;
2487
2488 po->ifindex = dev ? dev->ifindex : 0;
2489
2490 if (protocol == 0)
2491 goto out_unlock;
2492
be85d4ad 2493 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2494 register_prot_hook(sk);
be85d4ad
UT
2495 } else {
2496 sk->sk_err = ENETDOWN;
2497 if (!sock_flag(sk, SOCK_DEAD))
2498 sk->sk_error_report(sk);
1da177e4
LT
2499 }
2500
2501out_unlock:
2502 spin_unlock(&po->bind_lock);
2503 release_sock(sk);
2504 return 0;
2505}
2506
2507/*
2508 * Bind a packet socket to a device
2509 */
2510
40d4e3df
ED
2511static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2512 int addr_len)
1da177e4 2513{
40d4e3df 2514 struct sock *sk = sock->sk;
1da177e4
LT
2515 char name[15];
2516 struct net_device *dev;
2517 int err = -ENODEV;
1ce4f28b 2518
1da177e4
LT
2519 /*
2520 * Check legality
2521 */
1ce4f28b 2522
8ae55f04 2523 if (addr_len != sizeof(struct sockaddr))
1da177e4 2524 return -EINVAL;
40d4e3df 2525 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2526
3b1e0a65 2527 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2528 if (dev)
1da177e4 2529 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2530 return err;
2531}
1da177e4
LT
2532
2533static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2534{
40d4e3df
ED
2535 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2536 struct sock *sk = sock->sk;
1da177e4
LT
2537 struct net_device *dev = NULL;
2538 int err;
2539
2540
2541 /*
2542 * Check legality
2543 */
1ce4f28b 2544
1da177e4
LT
2545 if (addr_len < sizeof(struct sockaddr_ll))
2546 return -EINVAL;
2547 if (sll->sll_family != AF_PACKET)
2548 return -EINVAL;
2549
2550 if (sll->sll_ifindex) {
2551 err = -ENODEV;
3b1e0a65 2552 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2553 if (dev == NULL)
2554 goto out;
2555 }
2556 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2557
2558out:
2559 return err;
2560}
2561
2562static struct proto packet_proto = {
2563 .name = "PACKET",
2564 .owner = THIS_MODULE,
2565 .obj_size = sizeof(struct packet_sock),
2566};
2567
2568/*
1ce4f28b 2569 *	Create a packet socket (SOCK_RAW, SOCK_DGRAM or SOCK_PACKET).
1da177e4
LT
2570 */
2571
3f378b68
EP
2572static int packet_create(struct net *net, struct socket *sock, int protocol,
2573 int kern)
1da177e4
LT
2574{
2575 struct sock *sk;
2576 struct packet_sock *po;
0e11c91e 2577 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2578 int err;
2579
2580 if (!capable(CAP_NET_RAW))
2581 return -EPERM;
be02097c
DM
2582 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2583 sock->type != SOCK_PACKET)
1da177e4
LT
2584 return -ESOCKTNOSUPPORT;
2585
2586 sock->state = SS_UNCONNECTED;
2587
2588 err = -ENOBUFS;
6257ff21 2589 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1da177e4
LT
2590 if (sk == NULL)
2591 goto out;
2592
2593 sock->ops = &packet_ops;
1da177e4
LT
2594 if (sock->type == SOCK_PACKET)
2595 sock->ops = &packet_ops_spkt;
be02097c 2596
1da177e4
LT
2597 sock_init_data(sock, sk);
2598
2599 po = pkt_sk(sk);
2600 sk->sk_family = PF_PACKET;
0e11c91e 2601 po->num = proto;
1da177e4
LT
2602
2603 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2604 sk_refcnt_debug_inc(sk);
1da177e4
LT
2605
2606 /*
2607 * Attach a protocol block
2608 */
2609
2610 spin_lock_init(&po->bind_lock);
905db440 2611 mutex_init(&po->pg_vec_lock);
1da177e4 2612 po->prot_hook.func = packet_rcv;
be02097c 2613
1da177e4
LT
2614 if (sock->type == SOCK_PACKET)
2615 po->prot_hook.func = packet_rcv_spkt;
be02097c 2616
1da177e4
LT
2617 po->prot_hook.af_packet_priv = sk;
2618
0e11c91e
AV
2619 if (proto) {
2620 po->prot_hook.type = proto;
ce06b03e 2621 register_prot_hook(sk);
1da177e4
LT
2622 }
2623
808f5114 2624 spin_lock_bh(&net->packet.sklist_lock);
2625 sk_add_node_rcu(sk, &net->packet.sklist);
3680453c 2626 sock_prot_inuse_add(net, &packet_proto, 1);
808f5114 2627 spin_unlock_bh(&net->packet.sklist_lock);
2628
40d4e3df 2629 return 0;
1da177e4
LT
2630out:
2631 return err;
2632}
2633
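/*
 * MSG_ERRQUEUE path: dequeue a queued TX timestamp skb from
 * sk_error_queue and return its data plus a PACKET_TX_TIMESTAMP control
 * message to the caller.
 */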
ed85b565
RC
2634static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
2635{
2636 struct sock_exterr_skb *serr;
2637 struct sk_buff *skb, *skb2;
2638 int copied, err;
2639
2640 err = -EAGAIN;
2641 skb = skb_dequeue(&sk->sk_error_queue);
2642 if (skb == NULL)
2643 goto out;
2644
2645 copied = skb->len;
2646 if (copied > len) {
2647 msg->msg_flags |= MSG_TRUNC;
2648 copied = len;
2649 }
2650 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2651 if (err)
2652 goto out_free_skb;
2653
2654 sock_recv_timestamp(msg, sk, skb);
2655
2656 serr = SKB_EXT_ERR(skb);
2657 put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
2658 sizeof(serr->ee), &serr->ee);
2659
2660 msg->msg_flags |= MSG_ERRQUEUE;
2661 err = copied;
2662
2663 /* Reset and regenerate socket error */
2664 spin_lock_bh(&sk->sk_error_queue.lock);
2665 sk->sk_err = 0;
2666 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2667 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2668 spin_unlock_bh(&sk->sk_error_queue.lock);
2669 sk->sk_error_report(sk);
2670 } else
2671 spin_unlock_bh(&sk->sk_error_queue.lock);
2672
2673out_free_skb:
2674 kfree_skb(skb);
2675out:
2676 return err;
2677}
2678
1da177e4
LT
2679/*
2680 * Pull a packet from our receive queue and hand it to the user.
2681 * If necessary we block.
2682 */
2683
2684static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2685 struct msghdr *msg, size_t len, int flags)
2686{
2687 struct sock *sk = sock->sk;
2688 struct sk_buff *skb;
2689 int copied, err;
0fb375fb 2690 struct sockaddr_ll *sll;
bfd5f4a3 2691 int vnet_hdr_len = 0;
1da177e4
LT
2692
2693 err = -EINVAL;
ed85b565 2694 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2695 goto out;
2696
2697#if 0
2698 /* What error should we return now? EUNATTACH? */
2699 if (pkt_sk(sk)->ifindex < 0)
2700 return -ENODEV;
2701#endif
2702
ed85b565
RC
2703 if (flags & MSG_ERRQUEUE) {
2704 err = packet_recv_error(sk, msg, len);
2705 goto out;
2706 }
2707
1da177e4
LT
2708 /*
2709 * Call the generic datagram receiver. This handles all sorts
2710 * of horrible races and re-entrancy so we can forget about it
2711 * in the protocol layers.
2712 *
 2713	 *	Now it will return ENETDOWN if the device has just gone down,
2714 * but then it will block.
2715 */
2716
40d4e3df 2717 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
2718
2719 /*
1ce4f28b 2720 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
 2721	 *	handles the blocking for us, we don't need to see or worry
 2722	 *	about blocking retries.
2723 */
2724
8ae55f04 2725 if (skb == NULL)
1da177e4
LT
2726 goto out;
2727
bfd5f4a3
SS
2728 if (pkt_sk(sk)->has_vnet_hdr) {
2729 struct virtio_net_hdr vnet_hdr = { 0 };
2730
2731 err = -EINVAL;
2732 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 2733 if (len < vnet_hdr_len)
bfd5f4a3
SS
2734 goto out_free;
2735
1f18b717
MK
2736 len -= vnet_hdr_len;
2737
bfd5f4a3
SS
2738 if (skb_is_gso(skb)) {
2739 struct skb_shared_info *sinfo = skb_shinfo(skb);
2740
2741 /* This is a hint as to how much should be linear. */
2742 vnet_hdr.hdr_len = skb_headlen(skb);
2743 vnet_hdr.gso_size = sinfo->gso_size;
2744 if (sinfo->gso_type & SKB_GSO_TCPV4)
2745 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2746 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2747 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2748 else if (sinfo->gso_type & SKB_GSO_UDP)
2749 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2750 else if (sinfo->gso_type & SKB_GSO_FCOE)
2751 goto out_free;
2752 else
2753 BUG();
2754 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2755 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2756 } else
2757 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2758
2759 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2760 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
55508d60 2761 vnet_hdr.csum_start = skb_checksum_start_offset(skb);
bfd5f4a3 2762 vnet_hdr.csum_offset = skb->csum_offset;
10a8d94a
JW
2763 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2764 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
2765 } /* else everything is zero */
2766
2767 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
2768 vnet_hdr_len);
2769 if (err < 0)
2770 goto out_free;
2771 }
2772
0fb375fb
EB
2773 /*
2774 * If the address length field is there to be filled in, we fill
2775 * it in now.
2776 */
2777
ffbc6111 2778 sll = &PACKET_SKB_CB(skb)->sa.ll;
0fb375fb
EB
2779 if (sock->type == SOCK_PACKET)
2780 msg->msg_namelen = sizeof(struct sockaddr_pkt);
2781 else
2782 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
2783
1da177e4
LT
2784 /*
2785 * You lose any data beyond the buffer you gave. If it worries a
2786 * user program they can ask the device for its MTU anyway.
2787 */
2788
2789 copied = skb->len;
40d4e3df
ED
2790 if (copied > len) {
2791 copied = len;
2792 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
2793 }
2794
2795 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2796 if (err)
2797 goto out_free;
2798
3b885787 2799 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4
LT
2800
2801 if (msg->msg_name)
ffbc6111
HX
2802 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
2803 msg->msg_namelen);
1da177e4 2804
8dc41944 2805 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
2806 struct tpacket_auxdata aux;
2807
2808 aux.tp_status = TP_STATUS_USER;
2809 if (skb->ip_summed == CHECKSUM_PARTIAL)
2810 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
2811 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
2812 aux.tp_snaplen = skb->len;
2813 aux.tp_mac = 0;
bbe735e4 2814 aux.tp_net = skb_network_offset(skb);
a3bcc23e
BG
2815 if (vlan_tx_tag_present(skb)) {
2816 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
2817 aux.tp_status |= TP_STATUS_VLAN_VALID;
2818 } else {
2819 aux.tp_vlan_tci = 0;
2820 }
13fcb7bd 2821 aux.tp_padding = 0;
ffbc6111 2822 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
2823 }
2824
1da177e4
LT
2825 /*
2826 * Free or return the buffer as appropriate. Again this
2827 * hides all the races and re-entrancy issues from us.
2828 */
bfd5f4a3 2829 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
2830
2831out_free:
2832 skb_free_datagram(sk, skb);
2833out:
2834 return err;
2835}
2836
1da177e4
LT
2837static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
2838 int *uaddr_len, int peer)
2839{
2840 struct net_device *dev;
2841 struct sock *sk = sock->sk;
2842
2843 if (peer)
2844 return -EOPNOTSUPP;
2845
2846 uaddr->sa_family = AF_PACKET;
654d1f8a
ED
2847 rcu_read_lock();
2848 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
2849 if (dev)
67286640 2850 strncpy(uaddr->sa_data, dev->name, 14);
654d1f8a 2851 else
1da177e4 2852 memset(uaddr->sa_data, 0, 14);
654d1f8a 2853 rcu_read_unlock();
1da177e4
LT
2854 *uaddr_len = sizeof(*uaddr);
2855
2856 return 0;
2857}
1da177e4
LT
2858
2859static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
2860 int *uaddr_len, int peer)
2861{
2862 struct net_device *dev;
2863 struct sock *sk = sock->sk;
2864 struct packet_sock *po = pkt_sk(sk);
13cfa97b 2865 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
2866
2867 if (peer)
2868 return -EOPNOTSUPP;
2869
2870 sll->sll_family = AF_PACKET;
2871 sll->sll_ifindex = po->ifindex;
2872 sll->sll_protocol = po->num;
67286640 2873 sll->sll_pkttype = 0;
654d1f8a
ED
2874 rcu_read_lock();
2875 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
2876 if (dev) {
2877 sll->sll_hatype = dev->type;
2878 sll->sll_halen = dev->addr_len;
2879 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
2880 } else {
2881 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
2882 sll->sll_halen = 0;
2883 }
654d1f8a 2884 rcu_read_unlock();
0fb375fb 2885 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
2886
2887 return 0;
2888}
2889
2aeb0b88
WC
2890static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
2891 int what)
1da177e4
LT
2892{
2893 switch (i->type) {
2894 case PACKET_MR_MULTICAST:
1162563f
JP
2895 if (i->alen != dev->addr_len)
2896 return -EINVAL;
1da177e4 2897 if (what > 0)
22bedad3 2898 return dev_mc_add(dev, i->addr);
1da177e4 2899 else
22bedad3 2900 return dev_mc_del(dev, i->addr);
1da177e4
LT
2901 break;
2902 case PACKET_MR_PROMISC:
2aeb0b88 2903 return dev_set_promiscuity(dev, what);
1da177e4
LT
2904 break;
2905 case PACKET_MR_ALLMULTI:
2aeb0b88 2906 return dev_set_allmulti(dev, what);
1da177e4 2907 break;
d95ed927 2908 case PACKET_MR_UNICAST:
1162563f
JP
2909 if (i->alen != dev->addr_len)
2910 return -EINVAL;
d95ed927 2911 if (what > 0)
a748ee24 2912 return dev_uc_add(dev, i->addr);
d95ed927 2913 else
a748ee24 2914 return dev_uc_del(dev, i->addr);
d95ed927 2915 break;
40d4e3df
ED
2916 default:
2917 break;
1da177e4 2918 }
2aeb0b88 2919 return 0;
1da177e4
LT
2920}
2921
2922static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
2923{
40d4e3df 2924 for ( ; i; i = i->next) {
1da177e4
LT
2925 if (i->ifindex == dev->ifindex)
2926 packet_dev_mc(dev, i, what);
2927 }
2928}
2929
0fb375fb 2930static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2931{
2932 struct packet_sock *po = pkt_sk(sk);
2933 struct packet_mclist *ml, *i;
2934 struct net_device *dev;
2935 int err;
2936
2937 rtnl_lock();
2938
2939 err = -ENODEV;
3b1e0a65 2940 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
2941 if (!dev)
2942 goto done;
2943
2944 err = -EINVAL;
1162563f 2945 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
2946 goto done;
2947
2948 err = -ENOBUFS;
8b3a7005 2949 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
2950 if (i == NULL)
2951 goto done;
2952
2953 err = 0;
2954 for (ml = po->mclist; ml; ml = ml->next) {
2955 if (ml->ifindex == mreq->mr_ifindex &&
2956 ml->type == mreq->mr_type &&
2957 ml->alen == mreq->mr_alen &&
2958 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
2959 ml->count++;
2960 /* Free the new element ... */
2961 kfree(i);
2962 goto done;
2963 }
2964 }
2965
2966 i->type = mreq->mr_type;
2967 i->ifindex = mreq->mr_ifindex;
2968 i->alen = mreq->mr_alen;
2969 memcpy(i->addr, mreq->mr_address, i->alen);
2970 i->count = 1;
2971 i->next = po->mclist;
2972 po->mclist = i;
2aeb0b88
WC
2973 err = packet_dev_mc(dev, i, 1);
2974 if (err) {
2975 po->mclist = i->next;
2976 kfree(i);
2977 }
1da177e4
LT
2978
2979done:
2980 rtnl_unlock();
2981 return err;
2982}
2983
0fb375fb 2984static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2985{
2986 struct packet_mclist *ml, **mlp;
2987
2988 rtnl_lock();
2989
2990 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
2991 if (ml->ifindex == mreq->mr_ifindex &&
2992 ml->type == mreq->mr_type &&
2993 ml->alen == mreq->mr_alen &&
2994 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
2995 if (--ml->count == 0) {
2996 struct net_device *dev;
2997 *mlp = ml->next;
ad959e76
ED
2998 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
2999 if (dev)
1da177e4 3000 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3001 kfree(ml);
3002 }
3003 rtnl_unlock();
3004 return 0;
3005 }
3006 }
3007 rtnl_unlock();
3008 return -EADDRNOTAVAIL;
3009}
3010
3011static void packet_flush_mclist(struct sock *sk)
3012{
3013 struct packet_sock *po = pkt_sk(sk);
3014 struct packet_mclist *ml;
3015
3016 if (!po->mclist)
3017 return;
3018
3019 rtnl_lock();
3020 while ((ml = po->mclist) != NULL) {
3021 struct net_device *dev;
3022
3023 po->mclist = ml->next;
ad959e76
ED
3024 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3025 if (dev != NULL)
1da177e4 3026 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3027 kfree(ml);
3028 }
3029 rtnl_unlock();
3030}
1da177e4
LT
3031
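/*
 * SOL_PACKET socket options.  Layout-affecting options such as
 * PACKET_VERSION, PACKET_RESERVE, PACKET_LOSS and PACKET_VNET_HDR return
 * -EBUSY once an RX or TX ring has been set up; the remaining options can
 * be changed at any time.
 */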
3032static int
b7058842 3033packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3034{
3035 struct sock *sk = sock->sk;
8dc41944 3036 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3037 int ret;
3038
3039 if (level != SOL_PACKET)
3040 return -ENOPROTOOPT;
3041
69e3c75f 3042 switch (optname) {
1ce4f28b 3043 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3044 case PACKET_DROP_MEMBERSHIP:
3045 {
0fb375fb
EB
3046 struct packet_mreq_max mreq;
3047 int len = optlen;
3048 memset(&mreq, 0, sizeof(mreq));
3049 if (len < sizeof(struct packet_mreq))
1da177e4 3050 return -EINVAL;
0fb375fb
EB
3051 if (len > sizeof(mreq))
3052 len = sizeof(mreq);
40d4e3df 3053 if (copy_from_user(&mreq, optval, len))
1da177e4 3054 return -EFAULT;
0fb375fb
EB
3055 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3056 return -EINVAL;
1da177e4
LT
3057 if (optname == PACKET_ADD_MEMBERSHIP)
3058 ret = packet_mc_add(sk, &mreq);
3059 else
3060 ret = packet_mc_drop(sk, &mreq);
3061 return ret;
3062 }
a2efcfa0 3063
1da177e4 3064 case PACKET_RX_RING:
69e3c75f 3065 case PACKET_TX_RING:
1da177e4 3066 {
f6fb8f10 3067 union tpacket_req_u req_u;
3068 int len;
1da177e4 3069
f6fb8f10 3070 switch (po->tp_version) {
3071 case TPACKET_V1:
3072 case TPACKET_V2:
3073 len = sizeof(req_u.req);
3074 break;
3075 case TPACKET_V3:
3076 default:
3077 len = sizeof(req_u.req3);
3078 break;
3079 }
3080 if (optlen < len)
1da177e4 3081 return -EINVAL;
bfd5f4a3
SS
3082 if (pkt_sk(sk)->has_vnet_hdr)
3083 return -EINVAL;
f6fb8f10 3084 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3085 return -EFAULT;
f6fb8f10 3086 return packet_set_ring(sk, &req_u, 0,
3087 optname == PACKET_TX_RING);
1da177e4
LT
3088 }
3089 case PACKET_COPY_THRESH:
3090 {
3091 int val;
3092
40d4e3df 3093 if (optlen != sizeof(val))
1da177e4 3094 return -EINVAL;
40d4e3df 3095 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3096 return -EFAULT;
3097
3098 pkt_sk(sk)->copy_thresh = val;
3099 return 0;
3100 }
bbd6ef87
PM
3101 case PACKET_VERSION:
3102 {
3103 int val;
3104
3105 if (optlen != sizeof(val))
3106 return -EINVAL;
69e3c75f 3107 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3108 return -EBUSY;
3109 if (copy_from_user(&val, optval, sizeof(val)))
3110 return -EFAULT;
3111 switch (val) {
3112 case TPACKET_V1:
3113 case TPACKET_V2:
f6fb8f10 3114 case TPACKET_V3:
bbd6ef87
PM
3115 po->tp_version = val;
3116 return 0;
3117 default:
3118 return -EINVAL;
3119 }
3120 }
8913336a
PM
3121 case PACKET_RESERVE:
3122 {
3123 unsigned int val;
3124
3125 if (optlen != sizeof(val))
3126 return -EINVAL;
69e3c75f 3127 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3128 return -EBUSY;
3129 if (copy_from_user(&val, optval, sizeof(val)))
3130 return -EFAULT;
3131 po->tp_reserve = val;
3132 return 0;
3133 }
69e3c75f
JB
3134 case PACKET_LOSS:
3135 {
3136 unsigned int val;
3137
3138 if (optlen != sizeof(val))
3139 return -EINVAL;
3140 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3141 return -EBUSY;
3142 if (copy_from_user(&val, optval, sizeof(val)))
3143 return -EFAULT;
3144 po->tp_loss = !!val;
3145 return 0;
3146 }
8dc41944
HX
3147 case PACKET_AUXDATA:
3148 {
3149 int val;
3150
3151 if (optlen < sizeof(val))
3152 return -EINVAL;
3153 if (copy_from_user(&val, optval, sizeof(val)))
3154 return -EFAULT;
3155
3156 po->auxdata = !!val;
3157 return 0;
3158 }
80feaacb
PWJ
3159 case PACKET_ORIGDEV:
3160 {
3161 int val;
3162
3163 if (optlen < sizeof(val))
3164 return -EINVAL;
3165 if (copy_from_user(&val, optval, sizeof(val)))
3166 return -EFAULT;
3167
3168 po->origdev = !!val;
3169 return 0;
3170 }
bfd5f4a3
SS
3171 case PACKET_VNET_HDR:
3172 {
3173 int val;
3174
3175 if (sock->type != SOCK_RAW)
3176 return -EINVAL;
3177 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3178 return -EBUSY;
3179 if (optlen < sizeof(val))
3180 return -EINVAL;
3181 if (copy_from_user(&val, optval, sizeof(val)))
3182 return -EFAULT;
3183
3184 po->has_vnet_hdr = !!val;
3185 return 0;
3186 }
614f60fa
SM
3187 case PACKET_TIMESTAMP:
3188 {
3189 int val;
3190
3191 if (optlen != sizeof(val))
3192 return -EINVAL;
3193 if (copy_from_user(&val, optval, sizeof(val)))
3194 return -EFAULT;
3195
3196 po->tp_tstamp = val;
3197 return 0;
3198 }
dc99f600
DM
3199 case PACKET_FANOUT:
3200 {
3201 int val;
3202
3203 if (optlen != sizeof(val))
3204 return -EINVAL;
3205 if (copy_from_user(&val, optval, sizeof(val)))
3206 return -EFAULT;
3207
3208 return fanout_add(sk, val & 0xffff, val >> 16);
3209 }
1da177e4
LT
3210 default:
3211 return -ENOPROTOOPT;
3212 }
3213}
3214
3215static int packet_getsockopt(struct socket *sock, int level, int optname,
3216 char __user *optval, int __user *optlen)
3217{
3218 int len;
c06fff6e 3219 int val, lv = sizeof(val);
1da177e4
LT
3220 struct sock *sk = sock->sk;
3221 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3222 void *data = &val;
8dc41944 3223 struct tpacket_stats st;
f6fb8f10 3224 union tpacket_stats_u st_u;
1da177e4
LT
3225
3226 if (level != SOL_PACKET)
3227 return -ENOPROTOOPT;
3228
8ae55f04
KK
3229 if (get_user(len, optlen))
3230 return -EFAULT;
1da177e4
LT
3231
3232 if (len < 0)
3233 return -EINVAL;
1ce4f28b 3234
69e3c75f 3235 switch (optname) {
1da177e4 3236 case PACKET_STATISTICS:
1da177e4 3237 spin_lock_bh(&sk->sk_receive_queue.lock);
f6fb8f10 3238 if (po->tp_version == TPACKET_V3) {
c06fff6e 3239 lv = sizeof(struct tpacket_stats_v3);
f6fb8f10 3240 memcpy(&st_u.stats3, &po->stats,
c06fff6e 3241 sizeof(struct tpacket_stats));
f6fb8f10 3242 st_u.stats3.tp_freeze_q_cnt =
c06fff6e 3243 po->stats_u.stats3.tp_freeze_q_cnt;
f6fb8f10 3244 st_u.stats3.tp_packets += po->stats.tp_drops;
3245 data = &st_u.stats3;
3246 } else {
c06fff6e 3247 lv = sizeof(struct tpacket_stats);
f6fb8f10 3248 st = po->stats;
3249 st.tp_packets += st.tp_drops;
3250 data = &st;
3251 }
1da177e4
LT
3252 memset(&po->stats, 0, sizeof(st));
3253 spin_unlock_bh(&sk->sk_receive_queue.lock);
8dc41944
HX
3254 break;
3255 case PACKET_AUXDATA:
8dc41944 3256 val = po->auxdata;
80feaacb
PWJ
3257 break;
3258 case PACKET_ORIGDEV:
80feaacb 3259 val = po->origdev;
bfd5f4a3
SS
3260 break;
3261 case PACKET_VNET_HDR:
bfd5f4a3 3262 val = po->has_vnet_hdr;
1da177e4 3263 break;
bbd6ef87 3264 case PACKET_VERSION:
bbd6ef87 3265 val = po->tp_version;
bbd6ef87
PM
3266 break;
3267 case PACKET_HDRLEN:
3268 if (len > sizeof(int))
3269 len = sizeof(int);
3270 if (copy_from_user(&val, optval, len))
3271 return -EFAULT;
3272 switch (val) {
3273 case TPACKET_V1:
3274 val = sizeof(struct tpacket_hdr);
3275 break;
3276 case TPACKET_V2:
3277 val = sizeof(struct tpacket2_hdr);
3278 break;
f6fb8f10 3279 case TPACKET_V3:
3280 val = sizeof(struct tpacket3_hdr);
3281 break;
bbd6ef87
PM
3282 default:
3283 return -EINVAL;
3284 }
bbd6ef87 3285 break;
8913336a 3286 case PACKET_RESERVE:
8913336a 3287 val = po->tp_reserve;
8913336a 3288 break;
69e3c75f 3289 case PACKET_LOSS:
69e3c75f 3290 val = po->tp_loss;
69e3c75f 3291 break;
614f60fa 3292 case PACKET_TIMESTAMP:
614f60fa 3293 val = po->tp_tstamp;
614f60fa 3294 break;
dc99f600 3295 case PACKET_FANOUT:
dc99f600
DM
3296 val = (po->fanout ?
3297 ((u32)po->fanout->id |
3298 ((u32)po->fanout->type << 16)) :
3299 0);
dc99f600 3300 break;
1da177e4
LT
3301 default:
3302 return -ENOPROTOOPT;
3303 }
3304
c06fff6e
ED
3305 if (len > lv)
3306 len = lv;
8ae55f04
KK
3307 if (put_user(len, optlen))
3308 return -EFAULT;
8dc41944
HX
3309 if (copy_to_user(optval, data, len))
3310 return -EFAULT;
8ae55f04 3311 return 0;
1da177e4
LT
3312}
3313
3314
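/*
 * Netdevice notifier: when a device goes down or is unregistered, any
 * packet socket bound to it is unhooked and gets ENETDOWN; when the
 * device comes back up, the protocol hook is re-registered.
 */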
3315static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
3316{
3317 struct sock *sk;
3318 struct hlist_node *node;
ad930650 3319 struct net_device *dev = data;
c346dca1 3320 struct net *net = dev_net(dev);
1da177e4 3321
808f5114 3322 rcu_read_lock();
3323 sk_for_each_rcu(sk, node, &net->packet.sklist) {
1da177e4
LT
3324 struct packet_sock *po = pkt_sk(sk);
3325
3326 switch (msg) {
3327 case NETDEV_UNREGISTER:
1da177e4
LT
3328 if (po->mclist)
3329 packet_dev_mclist(dev, po->mclist, -1);
a2efcfa0
DM
3330 /* fallthrough */
3331
1da177e4
LT
3332 case NETDEV_DOWN:
3333 if (dev->ifindex == po->ifindex) {
3334 spin_lock(&po->bind_lock);
3335 if (po->running) {
ce06b03e 3336 __unregister_prot_hook(sk, false);
1da177e4
LT
3337 sk->sk_err = ENETDOWN;
3338 if (!sock_flag(sk, SOCK_DEAD))
3339 sk->sk_error_report(sk);
3340 }
3341 if (msg == NETDEV_UNREGISTER) {
3342 po->ifindex = -1;
160ff18a
BG
3343 if (po->prot_hook.dev)
3344 dev_put(po->prot_hook.dev);
1da177e4
LT
3345 po->prot_hook.dev = NULL;
3346 }
3347 spin_unlock(&po->bind_lock);
3348 }
3349 break;
3350 case NETDEV_UP:
808f5114 3351 if (dev->ifindex == po->ifindex) {
3352 spin_lock(&po->bind_lock);
ce06b03e
DM
3353 if (po->num)
3354 register_prot_hook(sk);
808f5114 3355 spin_unlock(&po->bind_lock);
1da177e4 3356 }
1da177e4
LT
3357 break;
3358 }
3359 }
808f5114 3360 rcu_read_unlock();
1da177e4
LT
3361 return NOTIFY_DONE;
3362}
3363
3364
3365static int packet_ioctl(struct socket *sock, unsigned int cmd,
3366 unsigned long arg)
3367{
3368 struct sock *sk = sock->sk;
3369
69e3c75f 3370 switch (cmd) {
40d4e3df
ED
3371 case SIOCOUTQ:
3372 {
3373 int amount = sk_wmem_alloc_get(sk);
31e6d363 3374
40d4e3df
ED
3375 return put_user(amount, (int __user *)arg);
3376 }
3377 case SIOCINQ:
3378 {
3379 struct sk_buff *skb;
3380 int amount = 0;
3381
3382 spin_lock_bh(&sk->sk_receive_queue.lock);
3383 skb = skb_peek(&sk->sk_receive_queue);
3384 if (skb)
3385 amount = skb->len;
3386 spin_unlock_bh(&sk->sk_receive_queue.lock);
3387 return put_user(amount, (int __user *)arg);
3388 }
3389 case SIOCGSTAMP:
3390 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3391 case SIOCGSTAMPNS:
3392 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3393
1da177e4 3394#ifdef CONFIG_INET
40d4e3df
ED
3395 case SIOCADDRT:
3396 case SIOCDELRT:
3397 case SIOCDARP:
3398 case SIOCGARP:
3399 case SIOCSARP:
3400 case SIOCGIFADDR:
3401 case SIOCSIFADDR:
3402 case SIOCGIFBRDADDR:
3403 case SIOCSIFBRDADDR:
3404 case SIOCGIFNETMASK:
3405 case SIOCSIFNETMASK:
3406 case SIOCGIFDSTADDR:
3407 case SIOCSIFDSTADDR:
3408 case SIOCSIFFLAGS:
40d4e3df 3409 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3410#endif
3411
40d4e3df
ED
3412 default:
3413 return -ENOIOCTLCMD;
1da177e4
LT
3414 }
3415 return 0;
3416}
3417
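/*
 * poll() for packet sockets: in addition to datagram_poll(), report
 * POLLIN when the RX ring holds a frame no longer owned by the kernel,
 * and POLLOUT when the current TX ring frame is available to be filled
 * by userspace.
 */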
40d4e3df 3418static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3419 poll_table *wait)
3420{
3421 struct sock *sk = sock->sk;
3422 struct packet_sock *po = pkt_sk(sk);
3423 unsigned int mask = datagram_poll(file, sock, wait);
3424
3425 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3426 if (po->rx_ring.pg_vec) {
f6fb8f10 3427 if (!packet_previous_rx_frame(po, &po->rx_ring,
3428 TP_STATUS_KERNEL))
1da177e4
LT
3429 mask |= POLLIN | POLLRDNORM;
3430 }
3431 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3432 spin_lock_bh(&sk->sk_write_queue.lock);
3433 if (po->tx_ring.pg_vec) {
3434 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3435 mask |= POLLOUT | POLLWRNORM;
3436 }
3437 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3438 return mask;
3439}
3440
3441
 3442/* Dirty? Well, I still haven't found a better way to account
 3443 * for user mmaps.
3444 */
3445
3446static void packet_mm_open(struct vm_area_struct *vma)
3447{
3448 struct file *file = vma->vm_file;
40d4e3df 3449 struct socket *sock = file->private_data;
1da177e4 3450 struct sock *sk = sock->sk;
1ce4f28b 3451
1da177e4
LT
3452 if (sk)
3453 atomic_inc(&pkt_sk(sk)->mapped);
3454}
3455
3456static void packet_mm_close(struct vm_area_struct *vma)
3457{
3458 struct file *file = vma->vm_file;
40d4e3df 3459 struct socket *sock = file->private_data;
1da177e4 3460 struct sock *sk = sock->sk;
1ce4f28b 3461
1da177e4
LT
3462 if (sk)
3463 atomic_dec(&pkt_sk(sk)->mapped);
3464}
3465
f0f37e2f 3466static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3467 .open = packet_mm_open,
3468 .close = packet_mm_close,
1da177e4
LT
3469};
3470
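/*
 * Ring memory is a vector of blocks.  Each block is allocated with
 * __get_free_pages() and, failing that, with vzalloc(), so the free path
 * below must use is_vmalloc_addr() to pick the matching release routine.
 */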
0e3125c7
NH
3471static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3472 unsigned int len)
1da177e4
LT
3473{
3474 int i;
3475
4ebf0ae2 3476 for (i = 0; i < len; i++) {
0e3125c7 3477 if (likely(pg_vec[i].buffer)) {
c56b4d90 3478 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3479 vfree(pg_vec[i].buffer);
3480 else
3481 free_pages((unsigned long)pg_vec[i].buffer,
3482 order);
3483 pg_vec[i].buffer = NULL;
3484 }
1da177e4
LT
3485 }
3486 kfree(pg_vec);
3487}
3488
eea49cc9 3489static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3490{
0e3125c7
NH
3491 char *buffer = NULL;
3492 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3493 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3494
3495 buffer = (char *) __get_free_pages(gfp_flags, order);
3496
3497 if (buffer)
3498 return buffer;
3499
3500 /*
3501 * __get_free_pages failed, fall back to vmalloc
3502 */
bbce5a59 3503 buffer = vzalloc((1 << order) * PAGE_SIZE);
719bfeaa 3504
0e3125c7
NH
3505 if (buffer)
3506 return buffer;
3507
3508 /*
 3509	 * vmalloc failed, let's dig into swap here
3510 */
0e3125c7
NH
3511 gfp_flags &= ~__GFP_NORETRY;
3512 buffer = (char *)__get_free_pages(gfp_flags, order);
3513 if (buffer)
3514 return buffer;
3515
3516 /*
3517 * complete and utter failure
3518 */
3519 return NULL;
4ebf0ae2
DM
3520}
3521
0e3125c7 3522static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3523{
3524 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3525 struct pgv *pg_vec;
4ebf0ae2
DM
3526 int i;
3527
0e3125c7 3528 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3529 if (unlikely(!pg_vec))
3530 goto out;
3531
3532 for (i = 0; i < block_nr; i++) {
c56b4d90 3533 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3534 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3535 goto out_free_pgvec;
3536 }
3537
3538out:
3539 return pg_vec;
3540
3541out_free_pgvec:
3542 free_pg_vec(pg_vec, order, block_nr);
3543 pg_vec = NULL;
3544 goto out;
3545}
1da177e4 3546
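/*
 * Set up (or, with tp_block_nr == 0, tear down) an RX or TX ring:
 * validate the block/frame geometry, allocate the page vector, then swap
 * it in under pg_vec_lock with the protocol hook temporarily
 * unregistered.
 *
 * Illustrative userspace sequence (not part of this file) for a V2
 * receive ring, where 'req' is a struct tpacket_req filled in by the
 * caller:
 *
 *	int v = TPACKET_V2;
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &v, sizeof(v));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */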
f6fb8f10 3547static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 3548 int closing, int tx_ring)
1da177e4 3549{
0e3125c7 3550 struct pgv *pg_vec = NULL;
1da177e4 3551 struct packet_sock *po = pkt_sk(sk);
0e11c91e 3552 int was_running, order = 0;
69e3c75f
JB
3553 struct packet_ring_buffer *rb;
3554 struct sk_buff_head *rb_queue;
0e11c91e 3555 __be16 num;
f6fb8f10 3556 int err = -EINVAL;
 3557	/* Added to keep code churn minimal */
3558 struct tpacket_req *req = &req_u->req;
3559
3560 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3561 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3562 WARN(1, "Tx-ring is not supported.\n");
3563 goto out;
3564 }
1ce4f28b 3565
69e3c75f
JB
3566 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3567 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 3568
69e3c75f
JB
3569 err = -EBUSY;
3570 if (!closing) {
3571 if (atomic_read(&po->mapped))
3572 goto out;
3573 if (atomic_read(&rb->pending))
3574 goto out;
3575 }
1da177e4 3576
69e3c75f
JB
3577 if (req->tp_block_nr) {
3578 /* Sanity tests and some calculations */
3579 err = -EBUSY;
3580 if (unlikely(rb->pg_vec))
3581 goto out;
1da177e4 3582
bbd6ef87
PM
3583 switch (po->tp_version) {
3584 case TPACKET_V1:
3585 po->tp_hdrlen = TPACKET_HDRLEN;
3586 break;
3587 case TPACKET_V2:
3588 po->tp_hdrlen = TPACKET2_HDRLEN;
3589 break;
f6fb8f10 3590 case TPACKET_V3:
3591 po->tp_hdrlen = TPACKET3_HDRLEN;
3592 break;
bbd6ef87
PM
3593 }
3594
69e3c75f 3595 err = -EINVAL;
4ebf0ae2 3596 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 3597 goto out;
4ebf0ae2 3598 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 3599 goto out;
8913336a 3600 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
3601 po->tp_reserve))
3602 goto out;
4ebf0ae2 3603 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 3604 goto out;
1da177e4 3605
69e3c75f
JB
3606 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3607 if (unlikely(rb->frames_per_block <= 0))
3608 goto out;
3609 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3610 req->tp_frame_nr))
3611 goto out;
1da177e4
LT
3612
3613 err = -ENOMEM;
4ebf0ae2
DM
3614 order = get_order(req->tp_block_size);
3615 pg_vec = alloc_pg_vec(req, order);
3616 if (unlikely(!pg_vec))
1da177e4 3617 goto out;
f6fb8f10 3618 switch (po->tp_version) {
3619 case TPACKET_V3:
3620 /* Transmit path is not supported. We checked
3621 * it above but just being paranoid
3622 */
3623 if (!tx_ring)
3624 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
3625 break;
3626 default:
3627 break;
3628 }
69e3c75f
JB
3629 }
3630 /* Done */
3631 else {
3632 err = -EINVAL;
4ebf0ae2 3633 if (unlikely(req->tp_frame_nr))
69e3c75f 3634 goto out;
1da177e4
LT
3635 }
3636
3637 lock_sock(sk);
3638
3639 /* Detach socket from network */
3640 spin_lock(&po->bind_lock);
3641 was_running = po->running;
3642 num = po->num;
3643 if (was_running) {
1da177e4 3644 po->num = 0;
ce06b03e 3645 __unregister_prot_hook(sk, false);
1da177e4
LT
3646 }
3647 spin_unlock(&po->bind_lock);
1ce4f28b 3648
1da177e4
LT
3649 synchronize_net();
3650
3651 err = -EBUSY;
905db440 3652 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
3653 if (closing || atomic_read(&po->mapped) == 0) {
3654 err = 0;
69e3c75f 3655 spin_lock_bh(&rb_queue->lock);
c053fd96 3656 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
3657 rb->frame_max = (req->tp_frame_nr - 1);
3658 rb->head = 0;
3659 rb->frame_size = req->tp_frame_size;
3660 spin_unlock_bh(&rb_queue->lock);
3661
c053fd96
CG
3662 swap(rb->pg_vec_order, order);
3663 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
3664
3665 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
3666 po->prot_hook.func = (po->rx_ring.pg_vec) ?
3667 tpacket_rcv : packet_rcv;
3668 skb_queue_purge(rb_queue);
1da177e4 3669 if (atomic_read(&po->mapped))
40d4e3df
ED
3670 pr_err("packet_mmap: vma is busy: %d\n",
3671 atomic_read(&po->mapped));
1da177e4 3672 }
905db440 3673 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3674
3675 spin_lock(&po->bind_lock);
ce06b03e 3676 if (was_running) {
1da177e4 3677 po->num = num;
ce06b03e 3678 register_prot_hook(sk);
1da177e4
LT
3679 }
3680 spin_unlock(&po->bind_lock);
f6fb8f10 3681 if (closing && (po->tp_version > TPACKET_V2)) {
3682 /* Because we don't support block-based V3 on tx-ring */
3683 if (!tx_ring)
3684 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3685 }
1da177e4
LT
3686 release_sock(sk);
3687
1da177e4
LT
3688 if (pg_vec)
3689 free_pg_vec(pg_vec, order, req->tp_block_nr);
3690out:
3691 return err;
3692}
3693
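/*
 * mmap() handler: the RX ring followed by the TX ring are mapped as one
 * contiguous area, so the requested size must exactly match the sum of
 * both rings.  po->mapped is incremented so the rings cannot be resized
 * while a mapping exists.
 */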
69e3c75f
JB
3694static int packet_mmap(struct file *file, struct socket *sock,
3695 struct vm_area_struct *vma)
1da177e4
LT
3696{
3697 struct sock *sk = sock->sk;
3698 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
3699 unsigned long size, expected_size;
3700 struct packet_ring_buffer *rb;
1da177e4
LT
3701 unsigned long start;
3702 int err = -EINVAL;
3703 int i;
3704
3705 if (vma->vm_pgoff)
3706 return -EINVAL;
3707
905db440 3708 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
3709
3710 expected_size = 0;
3711 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3712 if (rb->pg_vec) {
3713 expected_size += rb->pg_vec_len
3714 * rb->pg_vec_pages
3715 * PAGE_SIZE;
3716 }
3717 }
3718
3719 if (expected_size == 0)
1da177e4 3720 goto out;
69e3c75f
JB
3721
3722 size = vma->vm_end - vma->vm_start;
3723 if (size != expected_size)
1da177e4
LT
3724 goto out;
3725
1da177e4 3726 start = vma->vm_start;
69e3c75f
JB
3727 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3728 if (rb->pg_vec == NULL)
3729 continue;
3730
3731 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
3732 struct page *page;
3733 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
3734 int pg_num;
3735
c56b4d90
CG
3736 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
3737 page = pgv_to_page(kaddr);
69e3c75f
JB
3738 err = vm_insert_page(vma, start, page);
3739 if (unlikely(err))
3740 goto out;
3741 start += PAGE_SIZE;
0e3125c7 3742 kaddr += PAGE_SIZE;
69e3c75f 3743 }
4ebf0ae2 3744 }
1da177e4 3745 }
69e3c75f 3746
4ebf0ae2 3747 atomic_inc(&po->mapped);
1da177e4
LT
3748 vma->vm_ops = &packet_mmap_ops;
3749 err = 0;
3750
3751out:
905db440 3752 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3753 return err;
3754}
1da177e4 3755
90ddc4f0 3756static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
3757 .family = PF_PACKET,
3758 .owner = THIS_MODULE,
3759 .release = packet_release,
3760 .bind = packet_bind_spkt,
3761 .connect = sock_no_connect,
3762 .socketpair = sock_no_socketpair,
3763 .accept = sock_no_accept,
3764 .getname = packet_getname_spkt,
3765 .poll = datagram_poll,
3766 .ioctl = packet_ioctl,
3767 .listen = sock_no_listen,
3768 .shutdown = sock_no_shutdown,
3769 .setsockopt = sock_no_setsockopt,
3770 .getsockopt = sock_no_getsockopt,
3771 .sendmsg = packet_sendmsg_spkt,
3772 .recvmsg = packet_recvmsg,
3773 .mmap = sock_no_mmap,
3774 .sendpage = sock_no_sendpage,
3775};
1da177e4 3776
90ddc4f0 3777static const struct proto_ops packet_ops = {
1da177e4
LT
3778 .family = PF_PACKET,
3779 .owner = THIS_MODULE,
3780 .release = packet_release,
3781 .bind = packet_bind,
3782 .connect = sock_no_connect,
3783 .socketpair = sock_no_socketpair,
3784 .accept = sock_no_accept,
1ce4f28b 3785 .getname = packet_getname,
1da177e4
LT
3786 .poll = packet_poll,
3787 .ioctl = packet_ioctl,
3788 .listen = sock_no_listen,
3789 .shutdown = sock_no_shutdown,
3790 .setsockopt = packet_setsockopt,
3791 .getsockopt = packet_getsockopt,
3792 .sendmsg = packet_sendmsg,
3793 .recvmsg = packet_recvmsg,
3794 .mmap = packet_mmap,
3795 .sendpage = sock_no_sendpage,
3796};
3797
ec1b4cf7 3798static const struct net_proto_family packet_family_ops = {
1da177e4
LT
3799 .family = PF_PACKET,
3800 .create = packet_create,
3801 .owner = THIS_MODULE,
3802};
3803
3804static struct notifier_block packet_netdev_notifier = {
40d4e3df 3805 .notifier_call = packet_notifier,
1da177e4
LT
3806};
3807
3808#ifdef CONFIG_PROC_FS
1da177e4
LT
3809
3810static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 3811 __acquires(RCU)
1da177e4 3812{
e372c414 3813 struct net *net = seq_file_net(seq);
808f5114 3814
3815 rcu_read_lock();
3816 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
3817}
3818
3819static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3820{
1bf40954 3821 struct net *net = seq_file_net(seq);
808f5114 3822 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
3823}
3824
3825static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 3826 __releases(RCU)
1da177e4 3827{
808f5114 3828 rcu_read_unlock();
1da177e4
LT
3829}
3830
1ce4f28b 3831static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
3832{
3833 if (v == SEQ_START_TOKEN)
3834 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
3835 else {
b7ceabd9 3836 struct sock *s = sk_entry(v);
1da177e4
LT
3837 const struct packet_sock *po = pkt_sk(s);
3838
3839 seq_printf(seq,
71338aa7 3840 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4
LT
3841 s,
3842 atomic_read(&s->sk_refcnt),
3843 s->sk_type,
3844 ntohs(po->num),
3845 po->ifindex,
3846 po->running,
3847 atomic_read(&s->sk_rmem_alloc),
3848 sock_i_uid(s),
40d4e3df 3849 sock_i_ino(s));
1da177e4
LT
3850 }
3851
3852 return 0;
3853}
3854
56b3d975 3855static const struct seq_operations packet_seq_ops = {
1da177e4
LT
3856 .start = packet_seq_start,
3857 .next = packet_seq_next,
3858 .stop = packet_seq_stop,
3859 .show = packet_seq_show,
3860};
3861
3862static int packet_seq_open(struct inode *inode, struct file *file)
3863{
e372c414
DL
3864 return seq_open_net(inode, file, &packet_seq_ops,
3865 sizeof(struct seq_net_private));
1da177e4
LT
3866}
3867
da7071d7 3868static const struct file_operations packet_seq_fops = {
1da177e4
LT
3869 .owner = THIS_MODULE,
3870 .open = packet_seq_open,
3871 .read = seq_read,
3872 .llseek = seq_lseek,
e372c414 3873 .release = seq_release_net,
1da177e4
LT
3874};
3875
3876#endif
3877
2c8c1e72 3878static int __net_init packet_net_init(struct net *net)
d12d01d6 3879{
808f5114 3880 spin_lock_init(&net->packet.sklist_lock);
2aaef4e4 3881 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6
DL
3882
3883 if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
3884 return -ENOMEM;
3885
3886 return 0;
3887}
3888
2c8c1e72 3889static void __net_exit packet_net_exit(struct net *net)
d12d01d6
DL
3890{
3891 proc_net_remove(net, "packet");
3892}
3893
3894static struct pernet_operations packet_net_ops = {
3895 .init = packet_net_init,
3896 .exit = packet_net_exit,
3897};
3898
3899
1da177e4
LT
3900static void __exit packet_exit(void)
3901{
1da177e4 3902 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 3903 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
3904 sock_unregister(PF_PACKET);
3905 proto_unregister(&packet_proto);
3906}
3907
3908static int __init packet_init(void)
3909{
3910 int rc = proto_register(&packet_proto, 0);
3911
3912 if (rc != 0)
3913 goto out;
3914
3915 sock_register(&packet_family_ops);
d12d01d6 3916 register_pernet_subsys(&packet_net_ops);
1da177e4 3917 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
3918out:
3919 return rc;
3920}
3921
3922module_init(packet_init);
3923module_exit(packet_exit);
3924MODULE_LICENSE("GPL");
3925MODULE_ALIAS_NETPROTO(PF_PACKET);