net/packet/af_packet.c
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PACKET - implements raw packet sockets.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
1ce4f28b 12 * Fixes:
1da177e4
LT
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
1ce4f28b 35 * Ulises Alonso : Frame number limit removal and
1da177e4 36 * packet_set_ring memory leak.
0fb375fb
EB
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
1ce4f28b 40 * byte arrays at the end of sockaddr_ll
0fb375fb 41 * and packet_mreq.
69e3c75f 42 * Johann Baudy : Added TX RING.
f6fb8f10 43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
44 * layer.
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46 *
1da177e4
LT
47 *
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
52 *
53 */
1ce4f28b 54
1da177e4 55#include <linux/types.h>
1da177e4 56#include <linux/mm.h>
4fc268d2 57#include <linux/capability.h>
1da177e4
LT
58#include <linux/fcntl.h>
59#include <linux/socket.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_packet.h>
64#include <linux/wireless.h>
ffbc6111 65#include <linux/kernel.h>
1da177e4 66#include <linux/kmod.h>
5a0e3ad6 67#include <linux/slab.h>
0e3125c7 68#include <linux/vmalloc.h>
457c4cbc 69#include <net/net_namespace.h>
1da177e4
LT
70#include <net/ip.h>
71#include <net/protocol.h>
72#include <linux/skbuff.h>
73#include <net/sock.h>
74#include <linux/errno.h>
75#include <linux/timer.h>
76#include <asm/system.h>
77#include <asm/uaccess.h>
78#include <asm/ioctls.h>
79#include <asm/page.h>
a1f8e7f7 80#include <asm/cacheflush.h>
1da177e4
LT
81#include <asm/io.h>
82#include <linux/proc_fs.h>
83#include <linux/seq_file.h>
84#include <linux/poll.h>
85#include <linux/module.h>
86#include <linux/init.h>
905db440 87#include <linux/mutex.h>
05423b24 88#include <linux/if_vlan.h>
bfd5f4a3 89#include <linux/virtio_net.h>
ed85b565 90#include <linux/errqueue.h>
614f60fa 91#include <linux/net_tstamp.h>
1da177e4
LT
92
93#ifdef CONFIG_INET
94#include <net/inet_common.h>
95#endif
96
1da177e4
LT
97/*
98 Assumptions:
99 - if device has no dev->hard_header routine, it adds and removes the ll header
100 inside itself. In this case the ll header is invisible outside of the device,
101 but higher levels should still reserve dev->hard_header_len.
102 Some devices are clever enough to reallocate the skb when the header
103 will not fit in the reserved space (tunnels); others are not so
104 smart (PPP).
105 - packet sockets receive packets with the ll header already pulled,
106 so SOCK_RAW should push it back.
107
108On receive:
109-----------
110
111Incoming, dev->hard_header!=NULL
b0e380b1
ACM
112 mac_header -> ll header
113 data -> data
1da177e4
LT
114
115Outgoing, dev->hard_header!=NULL
b0e380b1
ACM
116 mac_header -> ll header
117 data -> ll header
1da177e4
LT
118
119Incoming, dev->hard_header==NULL
b0e380b1
ACM
120 mac_header -> UNKNOWN position. It is very likely that it points to the ll
121 header. PPP does this, which is wrong, because it introduces
db0c58f9 122 asymmetry between the rx and tx paths.
b0e380b1 123 data -> data
1da177e4
LT
124
125Outgoing, dev->hard_header==NULL
b0e380b1
ACM
126 mac_header -> data. ll header is still not built!
127 data -> data
1da177e4
LT
128
129Summary
130 If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
131
132
133On transmit:
134------------
135
136dev->hard_header != NULL
b0e380b1
ACM
137 mac_header -> ll header
138 data -> ll header
1da177e4
LT
139
140dev->hard_header == NULL (ll header is added by device, we cannot control it)
b0e380b1
ACM
141 mac_header -> data
142 data -> data
1da177e4
LT
143
144 We should set nh.raw on output to the correct position;
145 the packet classifier depends on it.
146 */
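/*
 * Editor's note: an illustrative user-space sketch, not part of af_packet.c,
 * of the receive behaviour described above.  With SOCK_RAW the frame handed
 * to recvfrom() still starts at the link-layer header; with SOCK_DGRAM that
 * header is stripped and only described by the sockaddr_ll.  Assumes an
 * Ethernet device and CAP_NET_RAW; error handling is omitted.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

int main(void)
{
	unsigned char buf[2048];
	struct sockaddr_ll sll;
	socklen_t slen = sizeof(sll);

	/* SOCK_RAW: data begins with the ll (Ethernet) header */
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
			     (struct sockaddr *)&sll, &slen);

	printf("got %zd bytes on ifindex %d, hatype %u\n",
	       n, sll.sll_ifindex, sll.sll_hatype);
	return 0;
}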
147
1da177e4
LT
148/* Private packet socket structures. */
149
40d4e3df 150struct packet_mclist {
1da177e4
LT
151 struct packet_mclist *next;
152 int ifindex;
153 int count;
154 unsigned short type;
155 unsigned short alen;
0fb375fb
EB
156 unsigned char addr[MAX_ADDR_LEN];
157};
158/* identical to struct packet_mreq except it has
159 * a longer address field.
160 */
40d4e3df 161struct packet_mreq_max {
0fb375fb
EB
162 int mr_ifindex;
163 unsigned short mr_type;
164 unsigned short mr_alen;
165 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 166};
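/*
 * Editor's note: packet_mreq_max is consumed by PACKET_ADD_MEMBERSHIP /
 * PACKET_DROP_MEMBERSHIP.  A user-space sketch, not part of af_packet.c,
 * putting one interface into promiscuous mode for a packet socket (the
 * address field is unused for PACKET_MR_PROMISC); error handling omitted.
 */
#include <string.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/if_packet.h>

static int enable_promisc(int fd, const char *ifname)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = if_nametoindex(ifname);
	mreq.mr_type = PACKET_MR_PROMISC;

	return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
			  &mreq, sizeof(mreq));
}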
a2efcfa0 167
f6fb8f10 168static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f
JB
169 int closing, int tx_ring);
170
f6fb8f10 171
172#define V3_ALIGNMENT (8)
173
174#define BLK_HDR_LEN (ALIGN(sizeof(struct block_desc), V3_ALIGNMENT))
175
176#define BLK_PLUS_PRIV(sz_of_priv) \
177 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
178
179/* kbdq - kernel block descriptor queue */
180struct kbdq_core {
181 struct pgv *pkbdq;
182 unsigned int feature_req_word;
183 unsigned int hdrlen;
184 unsigned char reset_pending_on_curr_blk;
185 unsigned char delete_blk_timer;
186 unsigned short kactive_blk_num;
187 unsigned short blk_sizeof_priv;
188
189 /* last_kactive_blk_num:
190 * trick to see if user-space has caught up
191 * in order to avoid refreshing timer when every single pkt arrives.
192 */
193 unsigned short last_kactive_blk_num;
194
195 char *pkblk_start;
196 char *pkblk_end;
197 int kblk_size;
198 unsigned int knum_blocks;
199 uint64_t knxt_seq_num;
200 char *prev;
201 char *nxt_offset;
202 struct sk_buff *skb;
203
204 atomic_t blk_fill_in_prog;
205
206 /* Default is set to 8ms */
207#define DEFAULT_PRB_RETIRE_TOV (8)
208
209 unsigned short retire_blk_tov;
210 unsigned short version;
211 unsigned long tov_in_jiffies;
212
213 /* timer to retire an outstanding block */
214 struct timer_list retire_blk_timer;
215};
216
217#define PGV_FROM_VMALLOC 1
0e3125c7
NH
218struct pgv {
219 char *buffer;
0e3125c7
NH
220};
221
69e3c75f 222struct packet_ring_buffer {
0e3125c7 223 struct pgv *pg_vec;
69e3c75f
JB
224 unsigned int head;
225 unsigned int frames_per_block;
226 unsigned int frame_size;
227 unsigned int frame_max;
228
229 unsigned int pg_vec_order;
230 unsigned int pg_vec_pages;
231 unsigned int pg_vec_len;
232
f6fb8f10 233 struct kbdq_core prb_bdqc;
69e3c75f
JB
234 atomic_t pending;
235};
236
f6fb8f10 237#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
238#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
239#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
240#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
241#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
242#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
243#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
244
69e3c75f
JB
245struct packet_sock;
246static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
1da177e4 247
f6fb8f10 248static void *packet_previous_frame(struct packet_sock *po,
249 struct packet_ring_buffer *rb,
250 int status);
251static void packet_increment_head(struct packet_ring_buffer *buff);
252static int prb_curr_blk_in_use(struct kbdq_core *,
253 struct block_desc *);
254static void *prb_dispatch_next_block(struct kbdq_core *,
255 struct packet_sock *);
256static void prb_retire_current_block(struct kbdq_core *,
257 struct packet_sock *, unsigned int status);
258static int prb_queue_frozen(struct kbdq_core *);
259static void prb_open_block(struct kbdq_core *, struct block_desc *);
260static void prb_retire_rx_blk_timer_expired(unsigned long);
261static void _prb_refresh_rx_retire_blk_timer(struct kbdq_core *);
262static void prb_init_blk_timer(struct packet_sock *, struct kbdq_core *,
263 void (*func) (unsigned long));
264static void prb_fill_rxhash(struct kbdq_core *, struct tpacket3_hdr *);
265static void prb_clear_rxhash(struct kbdq_core *, struct tpacket3_hdr *);
266static void prb_fill_vlan_info(struct kbdq_core *, struct tpacket3_hdr *);
1da177e4
LT
267static void packet_flush_mclist(struct sock *sk);
268
dc99f600 269struct packet_fanout;
1da177e4
LT
270struct packet_sock {
271 /* struct sock has to be the first member of packet_sock */
272 struct sock sk;
dc99f600 273 struct packet_fanout *fanout;
1da177e4 274 struct tpacket_stats stats;
f6fb8f10 275 union tpacket_stats_u stats_u;
69e3c75f
JB
276 struct packet_ring_buffer rx_ring;
277 struct packet_ring_buffer tx_ring;
1da177e4 278 int copy_thresh;
1da177e4 279 spinlock_t bind_lock;
905db440 280 struct mutex pg_vec_lock;
8dc41944 281 unsigned int running:1, /* prot_hook is attached*/
80feaacb 282 auxdata:1,
bfd5f4a3
SS
283 origdev:1,
284 has_vnet_hdr:1;
1da177e4 285 int ifindex; /* bound device */
0e11c91e 286 __be16 num;
1da177e4 287 struct packet_mclist *mclist;
1da177e4 288 atomic_t mapped;
bbd6ef87
PM
289 enum tpacket_versions tp_version;
290 unsigned int tp_hdrlen;
8913336a 291 unsigned int tp_reserve;
69e3c75f 292 unsigned int tp_loss:1;
614f60fa 293 unsigned int tp_tstamp;
94b05952 294 struct packet_type prot_hook ____cacheline_aligned_in_smp;
1da177e4
LT
295};
296
dc99f600
DM
297#define PACKET_FANOUT_MAX 256
298
299struct packet_fanout {
300#ifdef CONFIG_NET_NS
301 struct net *net;
302#endif
303 unsigned int num_members;
304 u16 id;
305 u8 type;
7736d33f 306 u8 defrag;
dc99f600
DM
307 atomic_t rr_cur;
308 struct list_head list;
309 struct sock *arr[PACKET_FANOUT_MAX];
310 spinlock_t lock;
311 atomic_t sk_ref;
312 struct packet_type prot_hook ____cacheline_aligned_in_smp;
313};
314
ffbc6111
HX
315struct packet_skb_cb {
316 unsigned int origlen;
317 union {
318 struct sockaddr_pkt pkt;
319 struct sockaddr_ll ll;
320 } sa;
321};
322
323#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 324
f6fb8f10 325#define GET_PBDQC_FROM_RB(x) ((struct kbdq_core *)(&(x)->prb_bdqc))
326#define GET_PBLOCK_DESC(x, bid) \
327 ((struct block_desc *)((x)->pkbdq[(bid)].buffer))
328#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
329 ((struct block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
330#define GET_NEXT_PRB_BLK_NUM(x) \
331 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
332 ((x)->kactive_blk_num+1) : 0)
333
ce06b03e
DM
334static inline struct packet_sock *pkt_sk(struct sock *sk)
335{
336 return (struct packet_sock *)sk;
337}
338
dc99f600
DM
339static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
340static void __fanout_link(struct sock *sk, struct packet_sock *po);
341
ce06b03e
DM
342/* register_prot_hook must be invoked with the po->bind_lock held,
343 * or from a context in which asynchronous accesses to the packet
344 * socket are not possible (packet_create()).
345 */
346static void register_prot_hook(struct sock *sk)
347{
348 struct packet_sock *po = pkt_sk(sk);
349 if (!po->running) {
dc99f600
DM
350 if (po->fanout)
351 __fanout_link(sk, po);
352 else
353 dev_add_pack(&po->prot_hook);
ce06b03e
DM
354 sock_hold(sk);
355 po->running = 1;
356 }
357}
358
359/* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
360 * held. If the sync parameter is true, we will temporarily drop
361 * the po->bind_lock and do a synchronize_net to make sure no
362 * asynchronous packet processing paths still refer to the elements
363 * of po->prot_hook. If the sync parameter is false, it is the
364 * caller's responsibility to take care of this.
365 */
366static void __unregister_prot_hook(struct sock *sk, bool sync)
367{
368 struct packet_sock *po = pkt_sk(sk);
369
370 po->running = 0;
dc99f600
DM
371 if (po->fanout)
372 __fanout_unlink(sk, po);
373 else
374 __dev_remove_pack(&po->prot_hook);
ce06b03e
DM
375 __sock_put(sk);
376
377 if (sync) {
378 spin_unlock(&po->bind_lock);
379 synchronize_net();
380 spin_lock(&po->bind_lock);
381 }
382}
383
384static void unregister_prot_hook(struct sock *sk, bool sync)
385{
386 struct packet_sock *po = pkt_sk(sk);
387
388 if (po->running)
389 __unregister_prot_hook(sk, sync);
390}
391
f6dafa95 392static inline __pure struct page *pgv_to_page(void *addr)
0af55bb5
CG
393{
394 if (is_vmalloc_addr(addr))
395 return vmalloc_to_page(addr);
396 return virt_to_page(addr);
397}
398
69e3c75f 399static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 400{
bbd6ef87
PM
401 union {
402 struct tpacket_hdr *h1;
403 struct tpacket2_hdr *h2;
404 void *raw;
405 } h;
1da177e4 406
69e3c75f 407 h.raw = frame;
bbd6ef87
PM
408 switch (po->tp_version) {
409 case TPACKET_V1:
69e3c75f 410 h.h1->tp_status = status;
0af55bb5 411 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
412 break;
413 case TPACKET_V2:
69e3c75f 414 h.h2->tp_status = status;
0af55bb5 415 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 416 break;
f6fb8f10 417 case TPACKET_V3:
69e3c75f 418 default:
f6fb8f10 419 WARN(1, "TPACKET version not supported.\n");
69e3c75f 420 BUG();
bbd6ef87 421 }
69e3c75f
JB
422
423 smp_wmb();
bbd6ef87
PM
424}
425
69e3c75f 426static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87
PM
427{
428 union {
429 struct tpacket_hdr *h1;
430 struct tpacket2_hdr *h2;
431 void *raw;
432 } h;
433
69e3c75f
JB
434 smp_rmb();
435
bbd6ef87
PM
436 h.raw = frame;
437 switch (po->tp_version) {
438 case TPACKET_V1:
0af55bb5 439 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 440 return h.h1->tp_status;
bbd6ef87 441 case TPACKET_V2:
0af55bb5 442 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 443 return h.h2->tp_status;
f6fb8f10 444 case TPACKET_V3:
69e3c75f 445 default:
f6fb8f10 446 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
447 BUG();
448 return 0;
bbd6ef87 449 }
1da177e4 450}
69e3c75f
JB
451
452static void *packet_lookup_frame(struct packet_sock *po,
453 struct packet_ring_buffer *rb,
454 unsigned int position,
455 int status)
456{
457 unsigned int pg_vec_pos, frame_offset;
458 union {
459 struct tpacket_hdr *h1;
460 struct tpacket2_hdr *h2;
461 void *raw;
462 } h;
463
464 pg_vec_pos = position / rb->frames_per_block;
465 frame_offset = position % rb->frames_per_block;
466
0e3125c7
NH
467 h.raw = rb->pg_vec[pg_vec_pos].buffer +
468 (frame_offset * rb->frame_size);
69e3c75f
JB
469
470 if (status != __packet_get_status(po, h.raw))
471 return NULL;
472
473 return h.raw;
474}
475
476static inline void *packet_current_frame(struct packet_sock *po,
477 struct packet_ring_buffer *rb,
478 int status)
479{
480 return packet_lookup_frame(po, rb, rb->head, status);
481}
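/*
 * Editor's note: a user-space sketch, not part of af_packet.c, of the
 * consumer half of the tp_status handshake that __packet_get_status() and
 * __packet_set_status() implement for TPACKET_V1/V2.  Assumes 'ring' is the
 * mmap()ed PACKET_RX_RING and that tp_block_size is a multiple of
 * tp_frame_size, so frame 'idx' sits at idx * frame_size in the mapping.
 * Error handling omitted.
 */
#include <poll.h>
#include <linux/if_packet.h>

static void consume_v2_frame(int fd, void *ring, unsigned int idx,
			     unsigned int frame_size)
{
	struct tpacket2_hdr *hdr =
		(struct tpacket2_hdr *)((char *)ring + idx * frame_size);

	if (!(hdr->tp_status & TP_STATUS_USER)) {
		struct pollfd pfd = { .fd = fd, .events = POLLIN };
		poll(&pfd, 1, -1);		/* wait for the kernel */
		return;
	}

	/* the packet data starts tp_mac bytes into the frame */
	unsigned char *pkt = (unsigned char *)hdr + hdr->tp_mac;
	(void)pkt;				/* process hdr->tp_snaplen bytes */

	__sync_synchronize();			/* finish reading first */
	hdr->tp_status = TP_STATUS_KERNEL;	/* hand the frame back */
}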
482
f6fb8f10 483static void prb_del_retire_blk_timer(struct kbdq_core *pkc)
484{
485 del_timer_sync(&pkc->retire_blk_timer);
486}
487
488static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
489 int tx_ring,
490 struct sk_buff_head *rb_queue)
491{
492 struct kbdq_core *pkc;
493
494 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
495
496 spin_lock(&rb_queue->lock);
497 pkc->delete_blk_timer = 1;
498 spin_unlock(&rb_queue->lock);
499
500 prb_del_retire_blk_timer(pkc);
501}
502
503static void prb_init_blk_timer(struct packet_sock *po,
504 struct kbdq_core *pkc,
505 void (*func) (unsigned long))
506{
507 init_timer(&pkc->retire_blk_timer);
508 pkc->retire_blk_timer.data = (long)po;
509 pkc->retire_blk_timer.function = func;
510 pkc->retire_blk_timer.expires = jiffies;
511}
512
513static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
514{
515 struct kbdq_core *pkc;
516
517 if (tx_ring)
518 BUG();
519
520 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
521 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
522}
523
524static int prb_calc_retire_blk_tmo(struct packet_sock *po,
525 int blk_size_in_bytes)
526{
527 struct net_device *dev;
528 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
529
530 dev = dev_get_by_index(sock_net(&po->sk), po->ifindex);
531 if (unlikely(dev == NULL))
532 return DEFAULT_PRB_RETIRE_TOV;
533
534 if (dev->ethtool_ops && dev->ethtool_ops->get_settings) {
535 struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET, };
536
537 if (!dev->ethtool_ops->get_settings(dev, &ecmd)) {
538 switch (ecmd.speed) {
539 case SPEED_10000:
540 msec = 1;
541 div = 10000/1000;
542 break;
543 case SPEED_1000:
544 msec = 1;
545 div = 1000/1000;
546 break;
547 /*
548 * If the link speed is this slow, you don't really
549 * need to worry about performance anyway.
550 */
551 case SPEED_100:
552 case SPEED_10:
553 default:
554 return DEFAULT_PRB_RETIRE_TOV;
555 }
556 }
557 }
558
559 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
560
561 if (div)
562 mbits /= div;
563
564 tmo = mbits * msec;
565
566 if (div)
567 return tmo+1;
568 return tmo;
569}
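/*
 * Editor's note, a worked example of the math above: a 1 MB block on a
 * 1 Gbit link gives msec = 1, div = 1, mbits = (1048576 * 8) / (1024 * 1024)
 * = 8, tmo = 8, and the function returns tmo + 1 = 9 ms, i.e. just over the
 * ~8 ms it takes to fill the block.  On a 10 Gbit link div = 10, so the same
 * block yields mbits = 0 and a 1 ms timeout.
 */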
570
571static void prb_init_ft_ops(struct kbdq_core *p1,
572 union tpacket_req_u *req_u)
573{
574 p1->feature_req_word = req_u->req3.tp_feature_req_word;
575}
576
577static void init_prb_bdqc(struct packet_sock *po,
578 struct packet_ring_buffer *rb,
579 struct pgv *pg_vec,
580 union tpacket_req_u *req_u, int tx_ring)
581{
582 struct kbdq_core *p1 = &rb->prb_bdqc;
583 struct block_desc *pbd;
584
585 memset(p1, 0x0, sizeof(*p1));
586
587 p1->knxt_seq_num = 1;
588 p1->pkbdq = pg_vec;
589 pbd = (struct block_desc *)pg_vec[0].buffer;
590 p1->pkblk_start = (char *)pg_vec[0].buffer;
591 p1->kblk_size = req_u->req3.tp_block_size;
592 p1->knum_blocks = req_u->req3.tp_block_nr;
593 p1->hdrlen = po->tp_hdrlen;
594 p1->version = po->tp_version;
595 p1->last_kactive_blk_num = 0;
596 po->stats_u.stats3.tp_freeze_q_cnt = 0;
597 if (req_u->req3.tp_retire_blk_tov)
598 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
599 else
600 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
601 req_u->req3.tp_block_size);
602 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
603 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
604
605 prb_init_ft_ops(p1, req_u);
606 prb_setup_retire_blk_timer(po, tx_ring);
607 prb_open_block(p1, pbd);
608}
609
610/* Do NOT update the last_blk_num first.
611 * Assumes sk_buff_head lock is held.
612 */
613static void _prb_refresh_rx_retire_blk_timer(struct kbdq_core *pkc)
614{
615 mod_timer(&pkc->retire_blk_timer,
616 jiffies + pkc->tov_in_jiffies);
617 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
618}
619
620/*
621 * Timer logic:
622 * 1) We refresh the timer only when we open a block.
623 * By doing this we don't waste cycles refreshing the timer
624 * on a packet-by-packet basis.
625 *
626 * With a 1MB block-size, on a 1Gbps line, it will take
627 * i) ~8 ms to fill a block + ii) memcpy etc.
628 * In this cut we are not accounting for the memcpy time.
629 *
630 * So, if the user sets the 'tmo' to 10ms then the timer
631 * will never fire while the block is still getting filled
632 * (which is what we want). However, the user could choose
633 * to close a block early and that's fine.
634 *
635 * But when the timer does fire, we check whether or not to refresh it.
636 * Since the tmo granularity is in msecs, it is not too expensive
637 * to refresh the timer, let's say every '8' msecs.
638 * Either the user can set the 'tmo' or we can derive it based on
639 * a) line-speed and b) block-size.
640 * prb_calc_retire_blk_tmo() calculates the tmo.
641 *
642 */
643static void prb_retire_rx_blk_timer_expired(unsigned long data)
644{
645 struct packet_sock *po = (struct packet_sock *)data;
646 struct kbdq_core *pkc = &po->rx_ring.prb_bdqc;
647 unsigned int frozen;
648 struct block_desc *pbd;
649
650 spin_lock(&po->sk.sk_receive_queue.lock);
651
652 frozen = prb_queue_frozen(pkc);
653 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
654
655 if (unlikely(pkc->delete_blk_timer))
656 goto out;
657
658 /* We only need to plug the race when the block is partially filled.
659 * tpacket_rcv:
660 * lock(); increment BLOCK_NUM_PKTS; unlock()
661 * copy_bits() is in progress ...
662 * timer fires on other cpu:
663 * we can't retire the current block because copy_bits
664 * is in progress.
665 *
666 */
667 if (BLOCK_NUM_PKTS(pbd)) {
668 while (atomic_read(&pkc->blk_fill_in_prog)) {
669 /* Waiting for skb_copy_bits to finish... */
670 cpu_relax();
671 }
672 }
673
674 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
675 if (!frozen) {
676 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
677 if (!prb_dispatch_next_block(pkc, po))
678 goto refresh_timer;
679 else
680 goto out;
681 } else {
682 /* Case 1. Queue was frozen because user-space was
683 * lagging behind.
684 */
685 if (prb_curr_blk_in_use(pkc, pbd)) {
686 /*
687 * Ok, user-space is still behind.
688 * So just refresh the timer.
689 */
690 goto refresh_timer;
691 } else {
692 /* Case 2. The queue was frozen, user-space caught up,
693 * now the link went idle and the timer fired.
694 * We don't have a block to close, so we open this
695 * block and restart the timer.
696 * Opening a block thaws the queue and restarts the timer;
697 * thawing/timer-refresh is a side effect.
698 */
699 prb_open_block(pkc, pbd);
700 goto out;
701 }
702 }
703 }
704
705refresh_timer:
706 _prb_refresh_rx_retire_blk_timer(pkc);
707
708out:
709 spin_unlock(&po->sk.sk_receive_queue.lock);
710}
711
712static inline void prb_flush_block(struct kbdq_core *pkc1,
713 struct block_desc *pbd1, __u32 status)
714{
715 /* Flush everything minus the block header */
716
717#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
718 u8 *start, *end;
719
720 start = (u8 *)pbd1;
721
722 /* Skip the block header (we know the header WILL fit in 4K) */
723 start += PAGE_SIZE;
724
725 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
726 for (; start < end; start += PAGE_SIZE)
727 flush_dcache_page(pgv_to_page(start));
728
729 smp_wmb();
730#endif
731
732 /* Now update the block status. */
733
734 BLOCK_STATUS(pbd1) = status;
735
736 /* Flush the block header */
737
738#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
739 start = (u8 *)pbd1;
740 flush_dcache_page(pgv_to_page(start));
741
742 smp_wmb();
743#endif
744}
745
746/*
747 * Side effect:
748 *
749 * 1) flush the block
750 * 2) Increment active_blk_num
751 *
752 * Note: we deliberately do NOT refresh the timer here,
753 * because almost always the next block will be opened.
754 */
755static void prb_close_block(struct kbdq_core *pkc1, struct block_desc *pbd1,
756 struct packet_sock *po, unsigned int stat)
757{
758 __u32 status = TP_STATUS_USER | stat;
759
760 struct tpacket3_hdr *last_pkt;
761 struct hdr_v1 *h1 = &pbd1->hdr.bh1;
762
763 if (po->stats.tp_drops)
764 status |= TP_STATUS_LOSING;
765
766 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
767 last_pkt->tp_next_offset = 0;
768
769 /* Get the ts of the last pkt */
770 if (BLOCK_NUM_PKTS(pbd1)) {
771 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
772 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
773 } else {
774 /* Ok, we tmo'd - so get the current time */
775 struct timespec ts;
776 getnstimeofday(&ts);
777 h1->ts_last_pkt.ts_sec = ts.tv_sec;
778 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
779 }
780
781 smp_wmb();
782
783 /* Flush the block */
784 prb_flush_block(pkc1, pbd1, status);
785
786 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
787}
788
789static inline void prb_thaw_queue(struct kbdq_core *pkc)
790{
791 pkc->reset_pending_on_curr_blk = 0;
792}
793
794/*
795 * Side effect of opening a block:
796 *
797 * 1) prb_queue is thawed.
798 * 2) retire_blk_timer is refreshed.
799 *
800 */
801static void prb_open_block(struct kbdq_core *pkc1, struct block_desc *pbd1)
802{
803 struct timespec ts;
804 struct hdr_v1 *h1 = &pbd1->hdr.bh1;
805
806 smp_rmb();
807
808 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd1))) {
809
810 /* We could have just memset this but we will lose the
811 * flexibility of making the priv area sticky
812 */
813 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
814 BLOCK_NUM_PKTS(pbd1) = 0;
815 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
816 getnstimeofday(&ts);
817 h1->ts_first_pkt.ts_sec = ts.tv_sec;
818 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
819 pkc1->pkblk_start = (char *)pbd1;
820 pkc1->nxt_offset = (char *)(pkc1->pkblk_start +
821 BLK_PLUS_PRIV(pkc1->blk_sizeof_priv));
822 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
823 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
824 pbd1->version = pkc1->version;
825 pkc1->prev = pkc1->nxt_offset;
826 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
827 prb_thaw_queue(pkc1);
828 _prb_refresh_rx_retire_blk_timer(pkc1);
829
830 smp_wmb();
831
832 return;
833 }
834
835 WARN(1, "ERROR block:%p is NOT FREE status:%d kactive_blk_num:%d\n",
836 pbd1, BLOCK_STATUS(pbd1), pkc1->kactive_blk_num);
837 dump_stack();
838 BUG();
839}
840
841/*
842 * Queue freeze logic:
843 * 1) Assume tp_block_nr = 8 blocks.
844 * 2) At time 't0', user opens Rx ring.
845 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
846 * 4) user-space is either sleeping or processing block '0'.
847 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
848 * it will close block-7, loop around and try to fill block '0'.
849 * call-flow:
850 * __packet_lookup_frame_in_block
851 * prb_retire_current_block()
852 * prb_dispatch_next_block()
853 * |->(BLOCK_STATUS == USER) evaluates to true
854 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
855 * 6) Now there are two cases:
856 * 6.1) Link goes idle right after the queue is frozen.
857 * But remember, the last open_block() refreshed the timer.
858 * When this timer expires, it will refresh itself so that we can
859 * re-open block-0 in the near future.
860 * 6.2) Link is busy and keeps on receiving packets. This is a simple
861 * case and __packet_lookup_frame_in_block will check if block-0
862 * is free and can now be re-used.
863 */
864static inline void prb_freeze_queue(struct kbdq_core *pkc,
865 struct packet_sock *po)
866{
867 pkc->reset_pending_on_curr_blk = 1;
868 po->stats_u.stats3.tp_freeze_q_cnt++;
869}
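/*
 * Editor's note: a user-space sketch, not part of af_packet.c, of the
 * consumer behaviour the freeze/thaw logic above relies on.  A TPACKET_V3
 * block is handed back by rewriting its block_status, which is what lets a
 * frozen queue thaw on a later __packet_lookup_frame_in_block().  Uses the
 * struct tpacket_block_desc layout exported by mainline <linux/if_packet.h>;
 * error handling omitted.
 */
#include <linux/if_packet.h>

static void walk_and_release_block(struct tpacket_block_desc *pbd)
{
	struct tpacket3_hdr *ppd;
	unsigned int i;

	ppd = (struct tpacket3_hdr *)((char *)pbd +
				      pbd->hdr.bh1.offset_to_first_pkt);

	for (i = 0; i < pbd->hdr.bh1.num_pkts; i++) {
		/* packet data is at (char *)ppd + ppd->tp_mac, tp_snaplen bytes */
		ppd = (struct tpacket3_hdr *)((char *)ppd + ppd->tp_next_offset);
	}

	__sync_synchronize();
	pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;	/* hand the block back */
}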
870
871#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
872
873/*
874 * If the next block is free then we will dispatch it
875 * and return a good offset.
876 * Else, we will freeze the queue.
877 * So, caller must check the return value.
878 */
879static void *prb_dispatch_next_block(struct kbdq_core *pkc,
880 struct packet_sock *po)
881{
882 struct block_desc *pbd;
883
884 smp_rmb();
885
886 /* 1. Get current block num */
887 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
888
889 /* 2. If this block is currently in_use then freeze the queue */
890 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
891 prb_freeze_queue(pkc, po);
892 return NULL;
893 }
894
895 /*
896 * 3.
897 * open this block and return the offset where the first packet
898 * needs to get stored.
899 */
900 prb_open_block(pkc, pbd);
901 return (void *)pkc->nxt_offset;
902}
903
904static void prb_retire_current_block(struct kbdq_core *pkc,
905 struct packet_sock *po, unsigned int status)
906{
907 struct block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
908
909 /* retire/close the current block */
910 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
911 /*
912 * Plug the case where copy_bits() is in progress on
913 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
914 * have space to copy the pkt in the current block and
915 * called prb_retire_current_block()
916 *
917 * We don't need to worry about the TMO case because
918 * the timer-handler already handled this case.
919 */
920 if (!(status & TP_STATUS_BLK_TMO)) {
921 while (atomic_read(&pkc->blk_fill_in_prog)) {
922 /* Waiting for skb_copy_bits to finish... */
923 cpu_relax();
924 }
925 }
926 prb_close_block(pkc, pbd, po, status);
927 return;
928 }
929
930 WARN(1, "ERROR-pbd[%d]:%p\n", pkc->kactive_blk_num, pbd);
931 dump_stack();
932 BUG();
933}
934
935static inline int prb_curr_blk_in_use(struct kbdq_core *pkc,
936 struct block_desc *pbd)
937{
938 return TP_STATUS_USER & BLOCK_STATUS(pbd);
939}
940
941static inline int prb_queue_frozen(struct kbdq_core *pkc)
942{
943 return pkc->reset_pending_on_curr_blk;
944}
945
946static inline void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
947{
948 struct kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
949 atomic_dec(&pkc->blk_fill_in_prog);
950}
951
952static inline void prb_fill_rxhash(struct kbdq_core *pkc,
953 struct tpacket3_hdr *ppd)
954{
955 ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
956}
957
958static inline void prb_clear_rxhash(struct kbdq_core *pkc,
959 struct tpacket3_hdr *ppd)
960{
961 ppd->hv1.tp_rxhash = 0;
962}
963
964static inline void prb_fill_vlan_info(struct kbdq_core *pkc,
965 struct tpacket3_hdr *ppd)
966{
967 if (vlan_tx_tag_present(pkc->skb)) {
968 ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
969 ppd->tp_status = TP_STATUS_VLAN_VALID;
970 } else {
971 ppd->hv1.tp_vlan_tci = ppd->tp_status = 0;
972 }
973}
974
975static void prb_run_all_ft_ops(struct kbdq_core *pkc,
976 struct tpacket3_hdr *ppd)
977{
978 prb_fill_vlan_info(pkc, ppd);
979
980 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
981 prb_fill_rxhash(pkc, ppd);
982 else
983 prb_clear_rxhash(pkc, ppd);
984}
985
986static inline void prb_fill_curr_block(char *curr, struct kbdq_core *pkc,
987 struct block_desc *pbd,
988 unsigned int len)
989{
990 struct tpacket3_hdr *ppd;
991
992 ppd = (struct tpacket3_hdr *)curr;
993 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
994 pkc->prev = curr;
995 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
996 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
997 BLOCK_NUM_PKTS(pbd) += 1;
998 atomic_inc(&pkc->blk_fill_in_prog);
999 prb_run_all_ft_ops(pkc, ppd);
1000}
1001
1002/* Assumes caller has the sk->rx_queue.lock */
1003static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1004 struct sk_buff *skb,
1005 int status,
1006 unsigned int len
1007 )
1008{
1009 struct kbdq_core *pkc;
1010 struct block_desc *pbd;
1011 char *curr, *end;
1012
1013 pkc = GET_PBDQC_FROM_RB(((struct packet_ring_buffer *)&po->rx_ring));
1014 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1015
1016 /* Queue is frozen when user space is lagging behind */
1017 if (prb_queue_frozen(pkc)) {
1018 /*
1019 * Check if the last block, the one which caused the queue to freeze,
1020 * is still in use by user-space.
1021 */
1022 if (prb_curr_blk_in_use(pkc, pbd)) {
1023 /* Can't record this packet */
1024 return NULL;
1025 } else {
1026 /*
1027 * Ok, the block was released by user-space.
1028 * Now let's open that block.
1029 * opening a block also thaws the queue.
1030 * Thawing is a side effect.
1031 */
1032 prb_open_block(pkc, pbd);
1033 }
1034 }
1035
1036 smp_mb();
1037 curr = pkc->nxt_offset;
1038 pkc->skb = skb;
1039 end = (char *) ((char *)pbd + pkc->kblk_size);
1040
1041 /* first try the current block */
1042 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1043 prb_fill_curr_block(curr, pkc, pbd, len);
1044 return (void *)curr;
1045 }
1046
1047 /* Ok, close the current block */
1048 prb_retire_current_block(pkc, po, 0);
1049
1050 /* Now, try to dispatch the next block */
1051 curr = (char *)prb_dispatch_next_block(pkc, po);
1052 if (curr) {
1053 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1054 prb_fill_curr_block(curr, pkc, pbd, len);
1055 return (void *)curr;
1056 }
1057
1058 /*
1059 * No free blocks are available. user_space hasn't caught up yet.
1060 * Queue was just frozen and now this packet will get dropped.
1061 */
1062 return NULL;
1063}
1064
1065static inline void *packet_current_rx_frame(struct packet_sock *po,
1066 struct sk_buff *skb,
1067 int status, unsigned int len)
1068{
1069 char *curr = NULL;
1070 switch (po->tp_version) {
1071 case TPACKET_V1:
1072 case TPACKET_V2:
1073 curr = packet_lookup_frame(po, &po->rx_ring,
1074 po->rx_ring.head, status);
1075 return curr;
1076 case TPACKET_V3:
1077 return __packet_lookup_frame_in_block(po, skb, status, len);
1078 default:
1079 WARN(1, "TPACKET version not supported\n");
1080 BUG();
1081 return 0;
1082 }
1083}
1084
1085static inline void *prb_lookup_block(struct packet_sock *po,
1086 struct packet_ring_buffer *rb,
1087 unsigned int previous,
1088 int status)
1089{
1090 struct kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
1091 struct block_desc *pbd = GET_PBLOCK_DESC(pkc, previous);
1092
1093 if (status != BLOCK_STATUS(pbd))
1094 return NULL;
1095 return pbd;
1096}
1097
1098static inline int prb_previous_blk_num(struct packet_ring_buffer *rb)
1099{
1100 unsigned int prev;
1101 if (rb->prb_bdqc.kactive_blk_num)
1102 prev = rb->prb_bdqc.kactive_blk_num-1;
1103 else
1104 prev = rb->prb_bdqc.knum_blocks-1;
1105 return prev;
1106}
1107
1108/* Assumes caller has held the rx_queue.lock */
1109static inline void *__prb_previous_block(struct packet_sock *po,
1110 struct packet_ring_buffer *rb,
1111 int status)
1112{
1113 unsigned int previous = prb_previous_blk_num(rb);
1114 return prb_lookup_block(po, rb, previous, status);
1115}
1116
1117static inline void *packet_previous_rx_frame(struct packet_sock *po,
1118 struct packet_ring_buffer *rb,
1119 int status)
1120{
1121 if (po->tp_version <= TPACKET_V2)
1122 return packet_previous_frame(po, rb, status);
1123
1124 return __prb_previous_block(po, rb, status);
1125}
1126
1127static inline void packet_increment_rx_head(struct packet_sock *po,
1128 struct packet_ring_buffer *rb)
1129{
1130 switch (po->tp_version) {
1131 case TPACKET_V1:
1132 case TPACKET_V2:
1133 return packet_increment_head(rb);
1134 case TPACKET_V3:
1135 default:
1136 WARN(1, "TPACKET version not supported.\n");
1137 BUG();
1138 return;
1139 }
1140}
1141
69e3c75f
JB
1142static inline void *packet_previous_frame(struct packet_sock *po,
1143 struct packet_ring_buffer *rb,
1144 int status)
1145{
1146 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1147 return packet_lookup_frame(po, rb, previous, status);
1148}
1149
1150static inline void packet_increment_head(struct packet_ring_buffer *buff)
1151{
1152 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1153}
1154
1da177e4
LT
1155static void packet_sock_destruct(struct sock *sk)
1156{
ed85b565
RC
1157 skb_queue_purge(&sk->sk_error_queue);
1158
547b792c
IJ
1159 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1160 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1da177e4
LT
1161
1162 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1163 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1164 return;
1165 }
1166
17ab56a2 1167 sk_refcnt_debug_dec(sk);
1da177e4
LT
1168}
1169
dc99f600
DM
1170static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
1171{
1172 int x = atomic_read(&f->rr_cur) + 1;
1173
1174 if (x >= num)
1175 x = 0;
1176
1177 return x;
1178}
1179
1180static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
1181{
1182 u32 idx, hash = skb->rxhash;
1183
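	/* Editor's note: the multiply-shift below maps the 32-bit rxhash
	 * uniformly onto [0, num) without a modulo; e.g. hash 0x80000000
	 * with num = 4 selects index 2.
	 */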
1184 idx = ((u64)hash * num) >> 32;
1185
1186 return f->arr[idx];
1187}
1188
1189static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
1190{
1191 int cur, old;
1192
1193 cur = atomic_read(&f->rr_cur);
1194 while ((old = atomic_cmpxchg(&f->rr_cur, cur,
1195 fanout_rr_next(f, num))) != cur)
1196 cur = old;
1197 return f->arr[cur];
1198}
1199
95ec3eb4
DM
1200static struct sock *fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
1201{
1202 unsigned int cpu = smp_processor_id();
1203
1204 return f->arr[cpu % num];
1205}
1206
7736d33f
DM
1207static struct sk_buff *fanout_check_defrag(struct sk_buff *skb)
1208{
31817df0 1209#ifdef CONFIG_INET
7736d33f
DM
1210 const struct iphdr *iph;
1211 u32 len;
1212
1213 if (skb->protocol != htons(ETH_P_IP))
1214 return skb;
1215
1216 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
1217 return skb;
1218
1219 iph = ip_hdr(skb);
1220 if (iph->ihl < 5 || iph->version != 4)
1221 return skb;
1222 if (!pskb_may_pull(skb, iph->ihl*4))
1223 return skb;
1224 iph = ip_hdr(skb);
1225 len = ntohs(iph->tot_len);
1226 if (skb->len < len || len < (iph->ihl * 4))
1227 return skb;
1228
1229 if (ip_is_fragment(ip_hdr(skb))) {
aec27311 1230 skb = skb_share_check(skb, GFP_ATOMIC);
7736d33f
DM
1231 if (skb) {
1232 if (pskb_trim_rcsum(skb, len))
1233 return skb;
1234 memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
1235 if (ip_defrag(skb, IP_DEFRAG_AF_PACKET))
1236 return NULL;
1237 skb->rxhash = 0;
1238 }
1239 }
31817df0 1240#endif
7736d33f
DM
1241 return skb;
1242}
1243
95ec3eb4
DM
1244static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1245 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1246{
1247 struct packet_fanout *f = pt->af_packet_priv;
1248 unsigned int num = f->num_members;
1249 struct packet_sock *po;
1250 struct sock *sk;
1251
1252 if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
1253 !num) {
1254 kfree_skb(skb);
1255 return 0;
1256 }
1257
95ec3eb4
DM
1258 switch (f->type) {
1259 case PACKET_FANOUT_HASH:
1260 default:
1261 if (f->defrag) {
1262 skb = fanout_check_defrag(skb);
1263 if (!skb)
1264 return 0;
1265 }
1266 skb_get_rxhash(skb);
1267 sk = fanout_demux_hash(f, skb, num);
1268 break;
1269 case PACKET_FANOUT_LB:
1270 sk = fanout_demux_lb(f, skb, num);
1271 break;
1272 case PACKET_FANOUT_CPU:
1273 sk = fanout_demux_cpu(f, skb, num);
1274 break;
dc99f600
DM
1275 }
1276
dc99f600
DM
1277 po = pkt_sk(sk);
1278
1279 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1280}
1281
1282static DEFINE_MUTEX(fanout_mutex);
1283static LIST_HEAD(fanout_list);
1284
1285static void __fanout_link(struct sock *sk, struct packet_sock *po)
1286{
1287 struct packet_fanout *f = po->fanout;
1288
1289 spin_lock(&f->lock);
1290 f->arr[f->num_members] = sk;
1291 smp_wmb();
1292 f->num_members++;
1293 spin_unlock(&f->lock);
1294}
1295
1296static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1297{
1298 struct packet_fanout *f = po->fanout;
1299 int i;
1300
1301 spin_lock(&f->lock);
1302 for (i = 0; i < f->num_members; i++) {
1303 if (f->arr[i] == sk)
1304 break;
1305 }
1306 BUG_ON(i >= f->num_members);
1307 f->arr[i] = f->arr[f->num_members - 1];
1308 f->num_members--;
1309 spin_unlock(&f->lock);
1310}
1311
7736d33f 1312static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600
DM
1313{
1314 struct packet_sock *po = pkt_sk(sk);
1315 struct packet_fanout *f, *match;
7736d33f
DM
1316 u8 type = type_flags & 0xff;
1317 u8 defrag = (type_flags & PACKET_FANOUT_FLAG_DEFRAG) ? 1 : 0;
dc99f600
DM
1318 int err;
1319
1320 switch (type) {
1321 case PACKET_FANOUT_HASH:
1322 case PACKET_FANOUT_LB:
95ec3eb4 1323 case PACKET_FANOUT_CPU:
dc99f600
DM
1324 break;
1325 default:
1326 return -EINVAL;
1327 }
1328
1329 if (!po->running)
1330 return -EINVAL;
1331
1332 if (po->fanout)
1333 return -EALREADY;
1334
1335 mutex_lock(&fanout_mutex);
1336 match = NULL;
1337 list_for_each_entry(f, &fanout_list, list) {
1338 if (f->id == id &&
1339 read_pnet(&f->net) == sock_net(sk)) {
1340 match = f;
1341 break;
1342 }
1343 }
afe62c68 1344 err = -EINVAL;
7736d33f 1345 if (match && match->defrag != defrag)
afe62c68 1346 goto out;
dc99f600 1347 if (!match) {
afe62c68 1348 err = -ENOMEM;
dc99f600 1349 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1350 if (!match)
1351 goto out;
1352 write_pnet(&match->net, sock_net(sk));
1353 match->id = id;
1354 match->type = type;
1355 match->defrag = defrag;
1356 atomic_set(&match->rr_cur, 0);
1357 INIT_LIST_HEAD(&match->list);
1358 spin_lock_init(&match->lock);
1359 atomic_set(&match->sk_ref, 0);
1360 match->prot_hook.type = po->prot_hook.type;
1361 match->prot_hook.dev = po->prot_hook.dev;
1362 match->prot_hook.func = packet_rcv_fanout;
1363 match->prot_hook.af_packet_priv = match;
1364 dev_add_pack(&match->prot_hook);
1365 list_add(&match->list, &fanout_list);
dc99f600 1366 }
afe62c68
ED
1367 err = -EINVAL;
1368 if (match->type == type &&
1369 match->prot_hook.type == po->prot_hook.type &&
1370 match->prot_hook.dev == po->prot_hook.dev) {
1371 err = -ENOSPC;
1372 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1373 __dev_remove_pack(&po->prot_hook);
1374 po->fanout = match;
1375 atomic_inc(&match->sk_ref);
1376 __fanout_link(sk, po);
1377 err = 0;
dc99f600
DM
1378 }
1379 }
afe62c68 1380out:
dc99f600
DM
1381 mutex_unlock(&fanout_mutex);
1382 return err;
1383}
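/*
 * Editor's note: a user-space sketch, not part of af_packet.c, of joining
 * the fanout group that fanout_add() creates.  The group id goes in the low
 * 16 bits of the option value and the type (plus flags such as
 * PACKET_FANOUT_FLAG_DEFRAG) in the high 16 bits.  Error handling omitted.
 */
#include <sys/socket.h>
#include <linux/if_packet.h>

static int join_fanout_group(int fd, unsigned short id)
{
	int val = id | (PACKET_FANOUT_HASH << 16);

	return setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
}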
1384
1385static void fanout_release(struct sock *sk)
1386{
1387 struct packet_sock *po = pkt_sk(sk);
1388 struct packet_fanout *f;
1389
1390 f = po->fanout;
1391 if (!f)
1392 return;
1393
1394 po->fanout = NULL;
1395
1396 mutex_lock(&fanout_mutex);
1397 if (atomic_dec_and_test(&f->sk_ref)) {
1398 list_del(&f->list);
1399 dev_remove_pack(&f->prot_hook);
1400 kfree(f);
1401 }
1402 mutex_unlock(&fanout_mutex);
1403}
1da177e4 1404
90ddc4f0 1405static const struct proto_ops packet_ops;
1da177e4 1406
90ddc4f0 1407static const struct proto_ops packet_ops_spkt;
1da177e4 1408
40d4e3df
ED
1409static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1410 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1411{
1412 struct sock *sk;
1413 struct sockaddr_pkt *spkt;
1414
1415 /*
1416 * When we registered the protocol we saved the socket in the data
1417 * field for just this event.
1418 */
1419
1420 sk = pt->af_packet_priv;
1ce4f28b 1421
1da177e4
LT
1422 /*
1423 * Yank back the headers [hope the device set this
1424 * right or kerboom...]
1425 *
1426 * Incoming packets have ll header pulled,
1427 * push it back.
1428 *
98e399f8 1429 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1430 * so that this procedure is a no-op.
1431 */
1432
1433 if (skb->pkt_type == PACKET_LOOPBACK)
1434 goto out;
1435
09ad9bc7 1436 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1437 goto out;
1438
40d4e3df
ED
1439 skb = skb_share_check(skb, GFP_ATOMIC);
1440 if (skb == NULL)
1da177e4
LT
1441 goto oom;
1442
1443 /* drop any routing info */
adf30907 1444 skb_dst_drop(skb);
1da177e4 1445
84531c24
PO
1446 /* drop conntrack reference */
1447 nf_reset(skb);
1448
ffbc6111 1449 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1450
98e399f8 1451 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1452
1453 /*
1454 * The SOCK_PACKET socket receives _all_ frames.
1455 */
1456
1457 spkt->spkt_family = dev->type;
1458 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1459 spkt->spkt_protocol = skb->protocol;
1460
1461 /*
1462 * Charge the memory to the socket. This is done specifically
1463 * to prevent sockets using all the memory up.
1464 */
1465
40d4e3df 1466 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1467 return 0;
1468
1469out:
1470 kfree_skb(skb);
1471oom:
1472 return 0;
1473}
1474
1475
1476/*
1477 * Output a raw packet to a device layer. This bypasses all the other
1478 * protocol layers and you must therefore supply it with a complete frame
1479 */
1ce4f28b 1480
1da177e4
LT
1481static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
1482 struct msghdr *msg, size_t len)
1483{
1484 struct sock *sk = sock->sk;
40d4e3df 1485 struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
1a35ca80 1486 struct sk_buff *skb = NULL;
1da177e4 1487 struct net_device *dev;
40d4e3df 1488 __be16 proto = 0;
1da177e4 1489 int err;
1ce4f28b 1490
1da177e4 1491 /*
1ce4f28b 1492 * Get and verify the address.
1da177e4
LT
1493 */
1494
40d4e3df 1495 if (saddr) {
1da177e4 1496 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1497 return -EINVAL;
1498 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1499 proto = saddr->spkt_protocol;
1500 } else
1501 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1502
1503 /*
1ce4f28b 1504 * Find the device first to size check it
1da177e4
LT
1505 */
1506
1507 saddr->spkt_device[13] = 0;
1a35ca80 1508retry:
654d1f8a
ED
1509 rcu_read_lock();
1510 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1511 err = -ENODEV;
1512 if (dev == NULL)
1513 goto out_unlock;
1ce4f28b 1514
d5e76b0a
DM
1515 err = -ENETDOWN;
1516 if (!(dev->flags & IFF_UP))
1517 goto out_unlock;
1518
1da177e4 1519 /*
40d4e3df
ED
1520 * You may not queue a frame bigger than the mtu. This is the lowest level
1521 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1522 */
1ce4f28b 1523
1da177e4 1524 err = -EMSGSIZE;
57f89bfa 1525 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN)
1da177e4
LT
1526 goto out_unlock;
1527
1a35ca80
ED
1528 if (!skb) {
1529 size_t reserved = LL_RESERVED_SPACE(dev);
1530 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1531
1532 rcu_read_unlock();
1533 skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
1534 if (skb == NULL)
1535 return -ENOBUFS;
1536 /* FIXME: Save some space for broken drivers that write a hard
1537 * header at transmission time by themselves. PPP is the notable
1538 * one here. This should really be fixed at the driver level.
1539 */
1540 skb_reserve(skb, reserved);
1541 skb_reset_network_header(skb);
1542
1543 /* Try to align data part correctly */
1544 if (hhlen) {
1545 skb->data -= hhlen;
1546 skb->tail -= hhlen;
1547 if (len < hhlen)
1548 skb_reset_network_header(skb);
1549 }
1550 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1551 if (err)
1552 goto out_free;
1553 goto retry;
1da177e4
LT
1554 }
1555
57f89bfa
BG
1556 if (len > (dev->mtu + dev->hard_header_len)) {
1557 /* Earlier code assumed this would be a VLAN pkt,
1558 * double-check this now that we have the actual
1559 * packet in hand.
1560 */
1561 struct ethhdr *ehdr;
1562 skb_reset_mac_header(skb);
1563 ehdr = eth_hdr(skb);
1564 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1565 err = -EMSGSIZE;
1566 goto out_unlock;
1567 }
1568 }
1a35ca80 1569
1da177e4
LT
1570 skb->protocol = proto;
1571 skb->dev = dev;
1572 skb->priority = sk->sk_priority;
2d37a186 1573 skb->mark = sk->sk_mark;
2244d07b 1574 err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
ed85b565
RC
1575 if (err < 0)
1576 goto out_unlock;
1da177e4
LT
1577
1578 dev_queue_xmit(skb);
654d1f8a 1579 rcu_read_unlock();
40d4e3df 1580 return len;
1da177e4 1581
1da177e4 1582out_unlock:
654d1f8a 1583 rcu_read_unlock();
1a35ca80
ED
1584out_free:
1585 kfree_skb(skb);
1da177e4
LT
1586 return err;
1587}
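/*
 * Editor's note: a user-space sketch, not part of af_packet.c, matching the
 * legacy SOCK_PACKET path above: the sockaddr_pkt names the device and the
 * buffer must already hold the complete link-layer frame.  Assumes "eth0"
 * and a valid Ethernet frame in 'frame'; error handling omitted.
 */
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

static ssize_t send_spkt(int fd, const void *frame, size_t len)
{
	struct sockaddr_pkt spkt;

	memset(&spkt, 0, sizeof(spkt));
	spkt.spkt_family = AF_PACKET;
	strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device) - 1);
	spkt.spkt_protocol = htons(ETH_P_IP);

	/* fd was created with socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL)) */
	return sendto(fd, frame, len, 0,
		      (struct sockaddr *)&spkt, sizeof(spkt));
}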
1da177e4 1588
62ab0812
ED
1589static inline unsigned int run_filter(const struct sk_buff *skb,
1590 const struct sock *sk,
dbcb5855 1591 unsigned int res)
1da177e4
LT
1592{
1593 struct sk_filter *filter;
fda9ef5d 1594
80f8f102
ED
1595 rcu_read_lock();
1596 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1597 if (filter != NULL)
0a14842f 1598 res = SK_RUN_FILTER(filter, skb);
80f8f102 1599 rcu_read_unlock();
1da177e4 1600
dbcb5855 1601 return res;
1da177e4
LT
1602}
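/*
 * Editor's note: a user-space sketch, not part of af_packet.c, installing the
 * kind of filter that run_filter() executes via SK_RUN_FILTER.  This classic
 * BPF program keeps at most 96 bytes of ARP frames and drops everything else.
 * Error handling omitted.
 */
#include <sys/socket.h>
#include <linux/filter.h>
#include <linux/if_ether.h>

static int attach_arp_filter(int fd)
{
	static struct sock_filter code[] = {
		BPF_STMT(BPF_LD  | BPF_H   | BPF_ABS, 12),	/* ethertype  */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ETH_P_ARP, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, 96),			/* accept     */
		BPF_STMT(BPF_RET | BPF_K, 0),			/* drop       */
	};
	struct sock_fprog prog = {
		.len = sizeof(code) / sizeof(code[0]),
		.filter = code,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
			  &prog, sizeof(prog));
}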
1603
1604/*
62ab0812
ED
1605 * This function does lazy skb cloning, in the hope that most packets
1606 * are discarded by BPF.
1607 *
1608 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
1609 * and skb->cb are mangled. It works because (and until) packets
1610 * falling here are owned by current CPU. Output packets are cloned
1611 * by dev_queue_xmit_nit(), input packets are processed by net_bh
1612 * sequentially, so that if we return skb to original state on exit,
1613 * we will not harm anyone.
1da177e4
LT
1614 */
1615
40d4e3df
ED
1616static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1617 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1618{
1619 struct sock *sk;
1620 struct sockaddr_ll *sll;
1621 struct packet_sock *po;
40d4e3df 1622 u8 *skb_head = skb->data;
1da177e4 1623 int skb_len = skb->len;
dbcb5855 1624 unsigned int snaplen, res;
1da177e4
LT
1625
1626 if (skb->pkt_type == PACKET_LOOPBACK)
1627 goto drop;
1628
1629 sk = pt->af_packet_priv;
1630 po = pkt_sk(sk);
1631
09ad9bc7 1632 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1633 goto drop;
1634
1da177e4
LT
1635 skb->dev = dev;
1636
3b04ddde 1637 if (dev->header_ops) {
1da177e4 1638 /* The device has an explicit notion of ll header,
62ab0812
ED
1639 * exported to higher levels.
1640 *
1641 * Otherwise, the device hides details of its frame
1642 * structure, so that the corresponding packet header is
1643 * never delivered to the user.
1da177e4
LT
1644 */
1645 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1646 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1647 else if (skb->pkt_type == PACKET_OUTGOING) {
1648 /* Special case: outgoing packets have ll header at head */
bbe735e4 1649 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1650 }
1651 }
1652
1653 snaplen = skb->len;
1654
dbcb5855
DM
1655 res = run_filter(skb, sk, snaplen);
1656 if (!res)
fda9ef5d 1657 goto drop_n_restore;
dbcb5855
DM
1658 if (snaplen > res)
1659 snaplen = res;
1da177e4
LT
1660
1661 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
1662 (unsigned)sk->sk_rcvbuf)
1663 goto drop_n_acct;
1664
1665 if (skb_shared(skb)) {
1666 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1667 if (nskb == NULL)
1668 goto drop_n_acct;
1669
1670 if (skb_head != skb->data) {
1671 skb->data = skb_head;
1672 skb->len = skb_len;
1673 }
1674 kfree_skb(skb);
1675 skb = nskb;
1676 }
1677
ffbc6111
HX
1678 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
1679 sizeof(skb->cb));
1680
1681 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4
LT
1682 sll->sll_family = AF_PACKET;
1683 sll->sll_hatype = dev->type;
1684 sll->sll_protocol = skb->protocol;
1685 sll->sll_pkttype = skb->pkt_type;
8032b464 1686 if (unlikely(po->origdev))
80feaacb
PWJ
1687 sll->sll_ifindex = orig_dev->ifindex;
1688 else
1689 sll->sll_ifindex = dev->ifindex;
1da177e4 1690
b95cce35 1691 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 1692
ffbc6111 1693 PACKET_SKB_CB(skb)->origlen = skb->len;
8dc41944 1694
1da177e4
LT
1695 if (pskb_trim(skb, snaplen))
1696 goto drop_n_acct;
1697
1698 skb_set_owner_r(skb, sk);
1699 skb->dev = NULL;
adf30907 1700 skb_dst_drop(skb);
1da177e4 1701
84531c24
PO
1702 /* drop conntrack reference */
1703 nf_reset(skb);
1704
1da177e4
LT
1705 spin_lock(&sk->sk_receive_queue.lock);
1706 po->stats.tp_packets++;
3b885787 1707 skb->dropcount = atomic_read(&sk->sk_drops);
1da177e4
LT
1708 __skb_queue_tail(&sk->sk_receive_queue, skb);
1709 spin_unlock(&sk->sk_receive_queue.lock);
1710 sk->sk_data_ready(sk, skb->len);
1711 return 0;
1712
1713drop_n_acct:
3b885787 1714 po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
1da177e4
LT
1715
1716drop_n_restore:
1717 if (skb_head != skb->data && skb_shared(skb)) {
1718 skb->data = skb_head;
1719 skb->len = skb_len;
1720 }
1721drop:
ead2ceb0 1722 consume_skb(skb);
1da177e4
LT
1723 return 0;
1724}
1725
40d4e3df
ED
1726static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1727 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1728{
1729 struct sock *sk;
1730 struct packet_sock *po;
1731 struct sockaddr_ll *sll;
bbd6ef87
PM
1732 union {
1733 struct tpacket_hdr *h1;
1734 struct tpacket2_hdr *h2;
f6fb8f10 1735 struct tpacket3_hdr *h3;
bbd6ef87
PM
1736 void *raw;
1737 } h;
40d4e3df 1738 u8 *skb_head = skb->data;
1da177e4 1739 int skb_len = skb->len;
dbcb5855 1740 unsigned int snaplen, res;
f6fb8f10 1741 unsigned long status = TP_STATUS_USER;
bbd6ef87 1742 unsigned short macoff, netoff, hdrlen;
1da177e4 1743 struct sk_buff *copy_skb = NULL;
b7aa0bf7 1744 struct timeval tv;
bbd6ef87 1745 struct timespec ts;
614f60fa 1746 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
1da177e4
LT
1747
1748 if (skb->pkt_type == PACKET_LOOPBACK)
1749 goto drop;
1750
1751 sk = pt->af_packet_priv;
1752 po = pkt_sk(sk);
1753
09ad9bc7 1754 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1755 goto drop;
1756
3b04ddde 1757 if (dev->header_ops) {
1da177e4 1758 if (sk->sk_type != SOCK_DGRAM)
98e399f8 1759 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1760 else if (skb->pkt_type == PACKET_OUTGOING) {
1761 /* Special case: outgoing packets have ll header at head */
bbe735e4 1762 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
1763 }
1764 }
1765
8dc41944
HX
1766 if (skb->ip_summed == CHECKSUM_PARTIAL)
1767 status |= TP_STATUS_CSUMNOTREADY;
1768
1da177e4
LT
1769 snaplen = skb->len;
1770
dbcb5855
DM
1771 res = run_filter(skb, sk, snaplen);
1772 if (!res)
fda9ef5d 1773 goto drop_n_restore;
dbcb5855
DM
1774 if (snaplen > res)
1775 snaplen = res;
1da177e4
LT
1776
1777 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
1778 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1779 po->tp_reserve;
1da177e4 1780 } else {
bbe735e4 1781 unsigned maclen = skb_network_offset(skb);
bbd6ef87 1782 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a
PM
1783 (maclen < 16 ? 16 : maclen)) +
1784 po->tp_reserve;
1da177e4
LT
1785 macoff = netoff - maclen;
1786 }
f6fb8f10 1787 if (po->tp_version <= TPACKET_V2) {
1788 if (macoff + snaplen > po->rx_ring.frame_size) {
1789 if (po->copy_thresh &&
1790 atomic_read(&sk->sk_rmem_alloc) + skb->truesize
1791 < (unsigned)sk->sk_rcvbuf) {
1792 if (skb_shared(skb)) {
1793 copy_skb = skb_clone(skb, GFP_ATOMIC);
1794 } else {
1795 copy_skb = skb_get(skb);
1796 skb_head = skb->data;
1797 }
1798 if (copy_skb)
1799 skb_set_owner_r(copy_skb, sk);
1da177e4 1800 }
f6fb8f10 1801 snaplen = po->rx_ring.frame_size - macoff;
1802 if ((int)snaplen < 0)
1803 snaplen = 0;
1da177e4 1804 }
1da177e4 1805 }
1da177e4 1806 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 1807 h.raw = packet_current_rx_frame(po, skb,
1808 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 1809 if (!h.raw)
1da177e4 1810 goto ring_is_full;
f6fb8f10 1811 if (po->tp_version <= TPACKET_V2) {
1812 packet_increment_rx_head(po, &po->rx_ring);
1813 /*
1814 * LOSING will be reported till you read the stats,
1815 * because it's COR - Clear On Read.
1816 * Anyways, moving it for V1/V2 only as V3 doesn't need this
1817 * at packet level.
1818 */
1819 if (po->stats.tp_drops)
1820 status |= TP_STATUS_LOSING;
1821 }
1da177e4
LT
1822 po->stats.tp_packets++;
1823 if (copy_skb) {
1824 status |= TP_STATUS_COPY;
1825 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1826 }
1da177e4
LT
1827 spin_unlock(&sk->sk_receive_queue.lock);
1828
bbd6ef87 1829 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
1da177e4 1830
bbd6ef87
PM
1831 switch (po->tp_version) {
1832 case TPACKET_V1:
1833 h.h1->tp_len = skb->len;
1834 h.h1->tp_snaplen = snaplen;
1835 h.h1->tp_mac = macoff;
1836 h.h1->tp_net = netoff;
614f60fa
SM
1837 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1838 && shhwtstamps->syststamp.tv64)
1839 tv = ktime_to_timeval(shhwtstamps->syststamp);
1840 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1841 && shhwtstamps->hwtstamp.tv64)
1842 tv = ktime_to_timeval(shhwtstamps->hwtstamp);
1843 else if (skb->tstamp.tv64)
bbd6ef87
PM
1844 tv = ktime_to_timeval(skb->tstamp);
1845 else
1846 do_gettimeofday(&tv);
1847 h.h1->tp_sec = tv.tv_sec;
1848 h.h1->tp_usec = tv.tv_usec;
1849 hdrlen = sizeof(*h.h1);
1850 break;
1851 case TPACKET_V2:
1852 h.h2->tp_len = skb->len;
1853 h.h2->tp_snaplen = snaplen;
1854 h.h2->tp_mac = macoff;
1855 h.h2->tp_net = netoff;
614f60fa
SM
1856 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1857 && shhwtstamps->syststamp.tv64)
1858 ts = ktime_to_timespec(shhwtstamps->syststamp);
1859 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1860 && shhwtstamps->hwtstamp.tv64)
1861 ts = ktime_to_timespec(shhwtstamps->hwtstamp);
1862 else if (skb->tstamp.tv64)
bbd6ef87
PM
1863 ts = ktime_to_timespec(skb->tstamp);
1864 else
1865 getnstimeofday(&ts);
1866 h.h2->tp_sec = ts.tv_sec;
1867 h.h2->tp_nsec = ts.tv_nsec;
a3bcc23e
BG
1868 if (vlan_tx_tag_present(skb)) {
1869 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
1870 status |= TP_STATUS_VLAN_VALID;
1871 } else {
1872 h.h2->tp_vlan_tci = 0;
1873 }
13fcb7bd 1874 h.h2->tp_padding = 0;
bbd6ef87
PM
1875 hdrlen = sizeof(*h.h2);
1876 break;
f6fb8f10 1877 case TPACKET_V3:
1878 /* tp_nxt_offset and vlan are already populated above,
1879 * so DON'T clear those fields here.
1880 */
1881 h.h3->tp_status |= status;
1882 h.h3->tp_len = skb->len;
1883 h.h3->tp_snaplen = snaplen;
1884 h.h3->tp_mac = macoff;
1885 h.h3->tp_net = netoff;
1886 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1887 && shhwtstamps->syststamp.tv64)
1888 ts = ktime_to_timespec(shhwtstamps->syststamp);
1889 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1890 && shhwtstamps->hwtstamp.tv64)
1891 ts = ktime_to_timespec(shhwtstamps->hwtstamp);
1892 else if (skb->tstamp.tv64)
1893 ts = ktime_to_timespec(skb->tstamp);
1894 else
1895 getnstimeofday(&ts);
1896 h.h3->tp_sec = ts.tv_sec;
1897 h.h3->tp_nsec = ts.tv_nsec;
1898 hdrlen = sizeof(*h.h3);
1899 break;
bbd6ef87
PM
1900 default:
1901 BUG();
1902 }
1da177e4 1903
bbd6ef87 1904 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 1905 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
1906 sll->sll_family = AF_PACKET;
1907 sll->sll_hatype = dev->type;
1908 sll->sll_protocol = skb->protocol;
1909 sll->sll_pkttype = skb->pkt_type;
8032b464 1910 if (unlikely(po->origdev))
80feaacb
PWJ
1911 sll->sll_ifindex = orig_dev->ifindex;
1912 else
1913 sll->sll_ifindex = dev->ifindex;
1da177e4 1914
e16aa207 1915 smp_mb();
f6dafa95 1916#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
1da177e4 1917 {
0af55bb5
CG
1918 u8 *start, *end;
1919
f6fb8f10 1920 if (po->tp_version <= TPACKET_V2) {
1921 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
1922 + macoff + snaplen);
1923 for (start = h.raw; start < end; start += PAGE_SIZE)
1924 flush_dcache_page(pgv_to_page(start));
1925 }
cc9f01b2 1926 smp_wmb();
1da177e4 1927 }
f6dafa95 1928#endif
f6fb8f10 1929 if (po->tp_version <= TPACKET_V2)
1930 __packet_set_status(po, h.raw, status);
1931 else
1932 prb_clear_blk_fill_status(&po->rx_ring);
1da177e4
LT
1933
1934 sk->sk_data_ready(sk, 0);
1935
1936drop_n_restore:
1937 if (skb_head != skb->data && skb_shared(skb)) {
1938 skb->data = skb_head;
1939 skb->len = skb_len;
1940 }
1941drop:
1ce4f28b 1942 kfree_skb(skb);
1da177e4
LT
1943 return 0;
1944
1945ring_is_full:
1946 po->stats.tp_drops++;
1947 spin_unlock(&sk->sk_receive_queue.lock);
1948
1949 sk->sk_data_ready(sk, 0);
acb5d75b 1950 kfree_skb(copy_skb);
1da177e4
LT
1951 goto drop_n_restore;
1952}
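
/*
 * Illustrative user-space sketch (not part of af_packet.c): how a reader
 * consumes the frames that tpacket_rcv() above just filled into a mapped
 * TPACKET_V2 RX ring. "ring", "frame_nr", "frame_size" and process_packet()
 * are placeholders standing in for whatever the application chose when it
 * configured PACKET_RX_RING; frames are assumed to be laid out back to back,
 * i.e. tp_block_size is a multiple of tp_frame_size.
 */
#include <linux/if_packet.h>
#include <stdint.h>

extern void process_packet(const uint8_t *data, unsigned int len);

static void drain_rx_ring(uint8_t *ring, unsigned int frame_nr,
			  unsigned int frame_size)
{
	static unsigned int idx;

	for (;;) {
		struct tpacket2_hdr *hdr = (void *)(ring + idx * frame_size);

		if (!(hdr->tp_status & TP_STATUS_USER))
			break;				/* kernel still owns it */

		/* the packet starts tp_mac bytes into the frame */
		process_packet((const uint8_t *)hdr + hdr->tp_mac,
			       hdr->tp_snaplen);

		hdr->tp_status = TP_STATUS_KERNEL;	/* hand the frame back */
		idx = (idx + 1) % frame_nr;
	}
}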
1953
69e3c75f
JB
1954static void tpacket_destruct_skb(struct sk_buff *skb)
1955{
1956 struct packet_sock *po = pkt_sk(skb->sk);
40d4e3df 1957 void *ph;
1da177e4 1958
69e3c75f 1959 BUG_ON(skb == NULL);
1da177e4 1960
69e3c75f
JB
1961 if (likely(po->tx_ring.pg_vec)) {
1962 ph = skb_shinfo(skb)->destructor_arg;
1963 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
1964 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
1965 atomic_dec(&po->tx_ring.pending);
1966 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
1967 }
1968
1969 sock_wfree(skb);
1970}
1971
40d4e3df
ED
1972static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
1973 void *frame, struct net_device *dev, int size_max,
1974 __be16 proto, unsigned char *addr)
69e3c75f
JB
1975{
1976 union {
1977 struct tpacket_hdr *h1;
1978 struct tpacket2_hdr *h2;
1979 void *raw;
1980 } ph;
1981 int to_write, offset, len, tp_len, nr_frags, len_max;
1982 struct socket *sock = po->sk.sk_socket;
1983 struct page *page;
1984 void *data;
1985 int err;
1986
1987 ph.raw = frame;
1988
1989 skb->protocol = proto;
1990 skb->dev = dev;
1991 skb->priority = po->sk.sk_priority;
2d37a186 1992 skb->mark = po->sk.sk_mark;
69e3c75f
JB
1993 skb_shinfo(skb)->destructor_arg = ph.raw;
1994
1995 switch (po->tp_version) {
1996 case TPACKET_V2:
1997 tp_len = ph.h2->tp_len;
1998 break;
1999 default:
2000 tp_len = ph.h1->tp_len;
2001 break;
2002 }
2003 if (unlikely(tp_len > size_max)) {
40d4e3df 2004 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
69e3c75f
JB
2005 return -EMSGSIZE;
2006 }
2007
2008 skb_reserve(skb, LL_RESERVED_SPACE(dev));
2009 skb_reset_network_header(skb);
2010
2011 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
2012 to_write = tp_len;
2013
2014 if (sock->type == SOCK_DGRAM) {
2015 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2016 NULL, tp_len);
2017 if (unlikely(err < 0))
2018 return -EINVAL;
40d4e3df 2019 } else if (dev->hard_header_len) {
69e3c75f
JB
2020 /* net device doesn't like empty head */
2021 if (unlikely(tp_len <= dev->hard_header_len)) {
40d4e3df
ED
2022 pr_err("packet size is too short (%d < %d)\n",
2023 tp_len, dev->hard_header_len);
69e3c75f
JB
2024 return -EINVAL;
2025 }
2026
2027 skb_push(skb, dev->hard_header_len);
2028 err = skb_store_bits(skb, 0, data,
2029 dev->hard_header_len);
2030 if (unlikely(err))
2031 return err;
2032
2033 data += dev->hard_header_len;
2034 to_write -= dev->hard_header_len;
2035 }
2036
2037 err = -EFAULT;
69e3c75f
JB
2038 offset = offset_in_page(data);
2039 len_max = PAGE_SIZE - offset;
2040 len = ((to_write > len_max) ? len_max : to_write);
2041
2042 skb->data_len = to_write;
2043 skb->len += to_write;
2044 skb->truesize += to_write;
2045 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2046
2047 while (likely(to_write)) {
2048 nr_frags = skb_shinfo(skb)->nr_frags;
2049
2050 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
 2051 pr_err("Packet exceeds the number of skb frags (%lu)\n",
2052 MAX_SKB_FRAGS);
69e3c75f
JB
2053 return -EFAULT;
2054 }
2055
0af55bb5
CG
2056 page = pgv_to_page(data);
2057 data += len;
69e3c75f
JB
2058 flush_dcache_page(page);
2059 get_page(page);
0af55bb5 2060 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2061 to_write -= len;
2062 offset = 0;
2063 len_max = PAGE_SIZE;
2064 len = ((to_write > len_max) ? len_max : to_write);
2065 }
2066
2067 return tp_len;
2068}
2069
2070static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2071{
69e3c75f
JB
2072 struct sk_buff *skb;
2073 struct net_device *dev;
2074 __be16 proto;
827d9780
BG
2075 bool need_rls_dev = false;
2076 int err, reserve = 0;
40d4e3df
ED
2077 void *ph;
2078 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
69e3c75f
JB
2079 int tp_len, size_max;
2080 unsigned char *addr;
2081 int len_sum = 0;
2082 int status = 0;
2083
69e3c75f
JB
2084 mutex_lock(&po->pg_vec_lock);
2085
2086 err = -EBUSY;
2087 if (saddr == NULL) {
827d9780 2088 dev = po->prot_hook.dev;
69e3c75f
JB
2089 proto = po->num;
2090 addr = NULL;
2091 } else {
2092 err = -EINVAL;
2093 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2094 goto out;
2095 if (msg->msg_namelen < (saddr->sll_halen
2096 + offsetof(struct sockaddr_ll,
2097 sll_addr)))
2098 goto out;
69e3c75f
JB
2099 proto = saddr->sll_protocol;
2100 addr = saddr->sll_addr;
827d9780
BG
2101 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2102 need_rls_dev = true;
69e3c75f
JB
2103 }
2104
69e3c75f
JB
2105 err = -ENXIO;
2106 if (unlikely(dev == NULL))
2107 goto out;
2108
2109 reserve = dev->hard_header_len;
2110
2111 err = -ENETDOWN;
2112 if (unlikely(!(dev->flags & IFF_UP)))
2113 goto out_put;
2114
2115 size_max = po->tx_ring.frame_size
b5dd884e 2116 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f
JB
2117
2118 if (size_max > dev->mtu + reserve)
2119 size_max = dev->mtu + reserve;
2120
2121 do {
2122 ph = packet_current_frame(po, &po->tx_ring,
2123 TP_STATUS_SEND_REQUEST);
2124
2125 if (unlikely(ph == NULL)) {
2126 schedule();
2127 continue;
2128 }
2129
2130 status = TP_STATUS_SEND_REQUEST;
2131 skb = sock_alloc_send_skb(&po->sk,
2132 LL_ALLOCATED_SPACE(dev)
2133 + sizeof(struct sockaddr_ll),
2134 0, &err);
2135
2136 if (unlikely(skb == NULL))
2137 goto out_status;
2138
2139 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
2140 addr);
2141
2142 if (unlikely(tp_len < 0)) {
2143 if (po->tp_loss) {
2144 __packet_set_status(po, ph,
2145 TP_STATUS_AVAILABLE);
2146 packet_increment_head(&po->tx_ring);
2147 kfree_skb(skb);
2148 continue;
2149 } else {
2150 status = TP_STATUS_WRONG_FORMAT;
2151 err = tp_len;
2152 goto out_status;
2153 }
2154 }
2155
2156 skb->destructor = tpacket_destruct_skb;
2157 __packet_set_status(po, ph, TP_STATUS_SENDING);
2158 atomic_inc(&po->tx_ring.pending);
2159
2160 status = TP_STATUS_SEND_REQUEST;
2161 err = dev_queue_xmit(skb);
eb70df13
JP
2162 if (unlikely(err > 0)) {
2163 err = net_xmit_errno(err);
2164 if (err && __packet_get_status(po, ph) ==
2165 TP_STATUS_AVAILABLE) {
2166 /* skb was destructed already */
2167 skb = NULL;
2168 goto out_status;
2169 }
2170 /*
2171 * skb was dropped but not destructed yet;
2172 * let's treat it like congestion or err < 0
2173 */
2174 err = 0;
2175 }
69e3c75f
JB
2176 packet_increment_head(&po->tx_ring);
2177 len_sum += tp_len;
f64f9e71
JP
2178 } while (likely((ph != NULL) ||
2179 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
2180 (atomic_read(&po->tx_ring.pending))))
2181 );
69e3c75f
JB
2182
2183 err = len_sum;
2184 goto out_put;
2185
69e3c75f
JB
2186out_status:
2187 __packet_set_status(po, ph, status);
2188 kfree_skb(skb);
2189out_put:
827d9780
BG
2190 if (need_rls_dev)
2191 dev_put(dev);
69e3c75f
JB
2192out:
2193 mutex_unlock(&po->pg_vec_lock);
2194 return err;
2195}
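
/*
 * Illustrative user-space counterpart (not part of af_packet.c): what a
 * sender does so that tpacket_snd() above finds work in the TX ring.
 * "ring", "idx", "frame_size" and build_frame() are application-side
 * placeholders; only the tp_status handshake is the real protocol, and
 * frames are again assumed to sit back to back in the mapping.
 */
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <stdint.h>

extern int build_frame(uint8_t *buf, unsigned int room);	/* assumed helper */

static int push_tx_frame(int fd, uint8_t *ring, unsigned int idx,
			 unsigned int frame_size)
{
	struct tpacket2_hdr *hdr = (void *)(ring + idx * frame_size);
	uint8_t *data = (uint8_t *)hdr + TPACKET2_HDRLEN -
			sizeof(struct sockaddr_ll);

	if (hdr->tp_status != TP_STATUS_AVAILABLE)
		return -1;			/* kernel has not released it yet */

	hdr->tp_len = build_frame(data, frame_size -
				  (unsigned int)(data - (uint8_t *)hdr));
	hdr->tp_status = TP_STATUS_SEND_REQUEST;	/* hand it to the kernel */

	/* one send() flushes every frame currently marked SEND_REQUEST */
	return send(fd, NULL, 0, 0);
}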
69e3c75f 2196
bfd5f4a3
SS
2197static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2198 size_t reserve, size_t len,
2199 size_t linear, int noblock,
2200 int *err)
2201{
2202 struct sk_buff *skb;
2203
2204 /* Under a page? Don't bother with paged skb. */
2205 if (prepad + len < PAGE_SIZE || !linear)
2206 linear = len;
2207
2208 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2209 err);
2210 if (!skb)
2211 return NULL;
2212
2213 skb_reserve(skb, reserve);
2214 skb_put(skb, linear);
2215 skb->data_len = len - linear;
2216 skb->len += len - linear;
2217
2218 return skb;
2219}
2220
69e3c75f 2221static int packet_snd(struct socket *sock,
1da177e4
LT
2222 struct msghdr *msg, size_t len)
2223{
2224 struct sock *sk = sock->sk;
40d4e3df 2225 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1da177e4
LT
2226 struct sk_buff *skb;
2227 struct net_device *dev;
0e11c91e 2228 __be16 proto;
827d9780 2229 bool need_rls_dev = false;
1da177e4 2230 unsigned char *addr;
827d9780 2231 int err, reserve = 0;
bfd5f4a3
SS
2232 struct virtio_net_hdr vnet_hdr = { 0 };
2233 int offset = 0;
2234 int vnet_hdr_len;
2235 struct packet_sock *po = pkt_sk(sk);
2236 unsigned short gso_type = 0;
1da177e4
LT
2237
2238 /*
1ce4f28b 2239 * Get and verify the address.
1da177e4 2240 */
1ce4f28b 2241
1da177e4 2242 if (saddr == NULL) {
827d9780 2243 dev = po->prot_hook.dev;
1da177e4
LT
2244 proto = po->num;
2245 addr = NULL;
2246 } else {
2247 err = -EINVAL;
2248 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2249 goto out;
0fb375fb
EB
2250 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2251 goto out;
1da177e4
LT
2252 proto = saddr->sll_protocol;
2253 addr = saddr->sll_addr;
827d9780
BG
2254 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2255 need_rls_dev = true;
1da177e4
LT
2256 }
2257
1da177e4
LT
2258 err = -ENXIO;
2259 if (dev == NULL)
2260 goto out_unlock;
2261 if (sock->type == SOCK_RAW)
2262 reserve = dev->hard_header_len;
2263
d5e76b0a
DM
2264 err = -ENETDOWN;
2265 if (!(dev->flags & IFF_UP))
2266 goto out_unlock;
2267
bfd5f4a3
SS
2268 if (po->has_vnet_hdr) {
2269 vnet_hdr_len = sizeof(vnet_hdr);
2270
2271 err = -EINVAL;
2272 if (len < vnet_hdr_len)
2273 goto out_unlock;
2274
2275 len -= vnet_hdr_len;
2276
2277 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
2278 vnet_hdr_len);
2279 if (err < 0)
2280 goto out_unlock;
2281
2282 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2283 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
2284 vnet_hdr.hdr_len))
2285 vnet_hdr.hdr_len = vnet_hdr.csum_start +
2286 vnet_hdr.csum_offset + 2;
2287
2288 err = -EINVAL;
2289 if (vnet_hdr.hdr_len > len)
2290 goto out_unlock;
2291
2292 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2293 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2294 case VIRTIO_NET_HDR_GSO_TCPV4:
2295 gso_type = SKB_GSO_TCPV4;
2296 break;
2297 case VIRTIO_NET_HDR_GSO_TCPV6:
2298 gso_type = SKB_GSO_TCPV6;
2299 break;
2300 case VIRTIO_NET_HDR_GSO_UDP:
2301 gso_type = SKB_GSO_UDP;
2302 break;
2303 default:
2304 goto out_unlock;
2305 }
2306
2307 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2308 gso_type |= SKB_GSO_TCP_ECN;
2309
2310 if (vnet_hdr.gso_size == 0)
2311 goto out_unlock;
2312
2313 }
2314 }
2315
1da177e4 2316 err = -EMSGSIZE;
57f89bfa 2317 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN))
1da177e4
LT
2318 goto out_unlock;
2319
bfd5f4a3
SS
2320 err = -ENOBUFS;
2321 skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
2322 LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
2323 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2324 if (skb == NULL)
1da177e4
LT
2325 goto out_unlock;
2326
bfd5f4a3 2327 skb_set_network_header(skb, reserve);
1da177e4 2328
0c4e8581
SH
2329 err = -EINVAL;
2330 if (sock->type == SOCK_DGRAM &&
bfd5f4a3 2331 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
0c4e8581 2332 goto out_free;
1da177e4
LT
2333
2334 /* Returns -EFAULT on error */
bfd5f4a3 2335 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1da177e4
LT
2336 if (err)
2337 goto out_free;
2244d07b 2338 err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
ed85b565
RC
2339 if (err < 0)
2340 goto out_free;
1da177e4 2341
57f89bfa
BG
2342 if (!gso_type && (len > dev->mtu + reserve)) {
2343 /* Earlier code assumed this would be a VLAN pkt,
2344 * double-check this now that we have the actual
2345 * packet in hand.
2346 */
2347 struct ethhdr *ehdr;
2348 skb_reset_mac_header(skb);
2349 ehdr = eth_hdr(skb);
2350 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2351 err = -EMSGSIZE;
2352 goto out_free;
2353 }
2354 }
2355
1da177e4
LT
2356 skb->protocol = proto;
2357 skb->dev = dev;
2358 skb->priority = sk->sk_priority;
2d37a186 2359 skb->mark = sk->sk_mark;
1da177e4 2360
bfd5f4a3
SS
2361 if (po->has_vnet_hdr) {
2362 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2363 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
2364 vnet_hdr.csum_offset)) {
2365 err = -EINVAL;
2366 goto out_free;
2367 }
2368 }
2369
2370 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
2371 skb_shinfo(skb)->gso_type = gso_type;
2372
2373 /* Header must be checked, and gso_segs computed. */
2374 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2375 skb_shinfo(skb)->gso_segs = 0;
2376
2377 len += vnet_hdr_len;
2378 }
2379
1da177e4
LT
2380 /*
2381 * Now send it
2382 */
2383
2384 err = dev_queue_xmit(skb);
2385 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2386 goto out_unlock;
2387
827d9780
BG
2388 if (need_rls_dev)
2389 dev_put(dev);
1da177e4 2390
40d4e3df 2391 return len;
1da177e4
LT
2392
2393out_free:
2394 kfree_skb(skb);
2395out_unlock:
827d9780 2396 if (dev && need_rls_dev)
1da177e4
LT
2397 dev_put(dev);
2398out:
2399 return err;
2400}
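
/*
 * Illustrative user-space sketch (not part of af_packet.c): the simplest
 * way into packet_snd() above - a plain sendto() with a sockaddr_ll that
 * names the egress interface. "eth0" is an assumed interface name, and on
 * SOCK_RAW the caller must already have built a complete link-layer frame.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <net/if.h>
#include <arpa/inet.h>
#include <string.h>

static ssize_t send_raw_frame(int fd, const void *frame, size_t len)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex = if_nametoindex("eth0");	/* assumed interface */
	sll.sll_halen = ETH_ALEN;
	/* sll_addr matters for SOCK_DGRAM sockets, where the kernel builds
	 * the link-layer header; a SOCK_RAW frame already carries its own. */

	return sendto(fd, frame, len, 0,
		      (struct sockaddr *)&sll, sizeof(sll));
}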
2401
69e3c75f
JB
2402static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
2403 struct msghdr *msg, size_t len)
2404{
69e3c75f
JB
2405 struct sock *sk = sock->sk;
2406 struct packet_sock *po = pkt_sk(sk);
2407 if (po->tx_ring.pg_vec)
2408 return tpacket_snd(po, msg);
2409 else
69e3c75f
JB
2410 return packet_snd(sock, msg, len);
2411}
2412
1da177e4
LT
2413/*
2414 * Close a PACKET socket. This is fairly simple. We immediately go
2415 * to 'closed' state and remove our protocol entry in the device list.
2416 */
2417
2418static int packet_release(struct socket *sock)
2419{
2420 struct sock *sk = sock->sk;
2421 struct packet_sock *po;
d12d01d6 2422 struct net *net;
f6fb8f10 2423 union tpacket_req_u req_u;
1da177e4
LT
2424
2425 if (!sk)
2426 return 0;
2427
3b1e0a65 2428 net = sock_net(sk);
1da177e4
LT
2429 po = pkt_sk(sk);
2430
808f5114 2431 spin_lock_bh(&net->packet.sklist_lock);
2432 sk_del_node_init_rcu(sk);
920de804 2433 sock_prot_inuse_add(net, sk->sk_prot, -1);
808f5114 2434 spin_unlock_bh(&net->packet.sklist_lock);
1da177e4 2435
808f5114 2436 spin_lock(&po->bind_lock);
ce06b03e 2437 unregister_prot_hook(sk, false);
160ff18a
BG
2438 if (po->prot_hook.dev) {
2439 dev_put(po->prot_hook.dev);
2440 po->prot_hook.dev = NULL;
2441 }
808f5114 2442 spin_unlock(&po->bind_lock);
1da177e4 2443
1da177e4 2444 packet_flush_mclist(sk);
1da177e4 2445
f6fb8f10 2446 memset(&req_u, 0, sizeof(req_u));
69e3c75f
JB
2447
2448 if (po->rx_ring.pg_vec)
f6fb8f10 2449 packet_set_ring(sk, &req_u, 1, 0);
69e3c75f
JB
2450
2451 if (po->tx_ring.pg_vec)
f6fb8f10 2452 packet_set_ring(sk, &req_u, 1, 1);
1da177e4 2453
dc99f600
DM
2454 fanout_release(sk);
2455
808f5114 2456 synchronize_net();
1da177e4
LT
2457 /*
2458 * Now the socket is dead. No more input will appear.
2459 */
1da177e4
LT
2460 sock_orphan(sk);
2461 sock->sk = NULL;
2462
2463 /* Purge queues */
2464
2465 skb_queue_purge(&sk->sk_receive_queue);
17ab56a2 2466 sk_refcnt_debug_release(sk);
1da177e4
LT
2467
2468 sock_put(sk);
2469 return 0;
2470}
2471
2472/*
2473 * Attach a packet hook.
2474 */
2475
0e11c91e 2476static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1da177e4
LT
2477{
2478 struct packet_sock *po = pkt_sk(sk);
dc99f600
DM
2479
2480 if (po->fanout)
2481 return -EINVAL;
1da177e4
LT
2482
2483 lock_sock(sk);
2484
2485 spin_lock(&po->bind_lock);
ce06b03e 2486 unregister_prot_hook(sk, true);
1da177e4
LT
2487 po->num = protocol;
2488 po->prot_hook.type = protocol;
160ff18a
BG
2489 if (po->prot_hook.dev)
2490 dev_put(po->prot_hook.dev);
1da177e4
LT
2491 po->prot_hook.dev = dev;
2492
2493 po->ifindex = dev ? dev->ifindex : 0;
2494
2495 if (protocol == 0)
2496 goto out_unlock;
2497
be85d4ad 2498 if (!dev || (dev->flags & IFF_UP)) {
ce06b03e 2499 register_prot_hook(sk);
be85d4ad
UT
2500 } else {
2501 sk->sk_err = ENETDOWN;
2502 if (!sock_flag(sk, SOCK_DEAD))
2503 sk->sk_error_report(sk);
1da177e4
LT
2504 }
2505
2506out_unlock:
2507 spin_unlock(&po->bind_lock);
2508 release_sock(sk);
2509 return 0;
2510}
2511
2512/*
2513 * Bind a packet socket to a device
2514 */
2515
40d4e3df
ED
2516static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2517 int addr_len)
1da177e4 2518{
40d4e3df 2519 struct sock *sk = sock->sk;
1da177e4
LT
2520 char name[15];
2521 struct net_device *dev;
2522 int err = -ENODEV;
1ce4f28b 2523
1da177e4
LT
2524 /*
2525 * Check legality
2526 */
1ce4f28b 2527
8ae55f04 2528 if (addr_len != sizeof(struct sockaddr))
1da177e4 2529 return -EINVAL;
40d4e3df 2530 strlcpy(name, uaddr->sa_data, sizeof(name));
1da177e4 2531
3b1e0a65 2532 dev = dev_get_by_name(sock_net(sk), name);
160ff18a 2533 if (dev)
1da177e4 2534 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1da177e4
LT
2535 return err;
2536}
1da177e4
LT
2537
2538static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2539{
40d4e3df
ED
2540 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2541 struct sock *sk = sock->sk;
1da177e4
LT
2542 struct net_device *dev = NULL;
2543 int err;
2544
2545
2546 /*
2547 * Check legality
2548 */
1ce4f28b 2549
1da177e4
LT
2550 if (addr_len < sizeof(struct sockaddr_ll))
2551 return -EINVAL;
2552 if (sll->sll_family != AF_PACKET)
2553 return -EINVAL;
2554
2555 if (sll->sll_ifindex) {
2556 err = -ENODEV;
3b1e0a65 2557 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1da177e4
LT
2558 if (dev == NULL)
2559 goto out;
2560 }
2561 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
2562
2563out:
2564 return err;
2565}
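
/*
 * Illustrative user-space sketch (not part of af_packet.c): creating a
 * packet socket and binding it the way packet_bind() above expects, i.e.
 * with a sockaddr_ll carrying the protocol and (optionally) an ifindex.
 * The caller needs CAP_NET_RAW; the interface name is supplied by the user.
 */
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <net/if.h>
#include <arpa/inet.h>
#include <string.h>
#include <unistd.h>

static int open_bound_packet_socket(const char *ifname)
{
	struct sockaddr_ll sll;
	int fd;

	fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	if (fd < 0)
		return -1;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex = if_nametoindex(ifname);	/* 0 means "all devices" */

	if (bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}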
2566
2567static struct proto packet_proto = {
2568 .name = "PACKET",
2569 .owner = THIS_MODULE,
2570 .obj_size = sizeof(struct packet_sock),
2571};
2572
2573/*
1ce4f28b 2574 * Create a packet of type SOCK_PACKET.
1da177e4
LT
2575 */
2576
3f378b68
EP
2577static int packet_create(struct net *net, struct socket *sock, int protocol,
2578 int kern)
1da177e4
LT
2579{
2580 struct sock *sk;
2581 struct packet_sock *po;
0e11c91e 2582 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
2583 int err;
2584
2585 if (!capable(CAP_NET_RAW))
2586 return -EPERM;
be02097c
DM
2587 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2588 sock->type != SOCK_PACKET)
1da177e4
LT
2589 return -ESOCKTNOSUPPORT;
2590
2591 sock->state = SS_UNCONNECTED;
2592
2593 err = -ENOBUFS;
6257ff21 2594 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1da177e4
LT
2595 if (sk == NULL)
2596 goto out;
2597
2598 sock->ops = &packet_ops;
1da177e4
LT
2599 if (sock->type == SOCK_PACKET)
2600 sock->ops = &packet_ops_spkt;
be02097c 2601
1da177e4
LT
2602 sock_init_data(sock, sk);
2603
2604 po = pkt_sk(sk);
2605 sk->sk_family = PF_PACKET;
0e11c91e 2606 po->num = proto;
1da177e4
LT
2607
2608 sk->sk_destruct = packet_sock_destruct;
17ab56a2 2609 sk_refcnt_debug_inc(sk);
1da177e4
LT
2610
2611 /*
2612 * Attach a protocol block
2613 */
2614
2615 spin_lock_init(&po->bind_lock);
905db440 2616 mutex_init(&po->pg_vec_lock);
1da177e4 2617 po->prot_hook.func = packet_rcv;
be02097c 2618
1da177e4
LT
2619 if (sock->type == SOCK_PACKET)
2620 po->prot_hook.func = packet_rcv_spkt;
be02097c 2621
1da177e4
LT
2622 po->prot_hook.af_packet_priv = sk;
2623
0e11c91e
AV
2624 if (proto) {
2625 po->prot_hook.type = proto;
ce06b03e 2626 register_prot_hook(sk);
1da177e4
LT
2627 }
2628
808f5114 2629 spin_lock_bh(&net->packet.sklist_lock);
2630 sk_add_node_rcu(sk, &net->packet.sklist);
3680453c 2631 sock_prot_inuse_add(net, &packet_proto, 1);
808f5114 2632 spin_unlock_bh(&net->packet.sklist_lock);
2633
40d4e3df 2634 return 0;
1da177e4
LT
2635out:
2636 return err;
2637}
2638
ed85b565
RC
2639static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
2640{
2641 struct sock_exterr_skb *serr;
2642 struct sk_buff *skb, *skb2;
2643 int copied, err;
2644
2645 err = -EAGAIN;
2646 skb = skb_dequeue(&sk->sk_error_queue);
2647 if (skb == NULL)
2648 goto out;
2649
2650 copied = skb->len;
2651 if (copied > len) {
2652 msg->msg_flags |= MSG_TRUNC;
2653 copied = len;
2654 }
2655 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2656 if (err)
2657 goto out_free_skb;
2658
2659 sock_recv_timestamp(msg, sk, skb);
2660
2661 serr = SKB_EXT_ERR(skb);
2662 put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
2663 sizeof(serr->ee), &serr->ee);
2664
2665 msg->msg_flags |= MSG_ERRQUEUE;
2666 err = copied;
2667
2668 /* Reset and regenerate socket error */
2669 spin_lock_bh(&sk->sk_error_queue.lock);
2670 sk->sk_err = 0;
2671 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2672 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2673 spin_unlock_bh(&sk->sk_error_queue.lock);
2674 sk->sk_error_report(sk);
2675 } else
2676 spin_unlock_bh(&sk->sk_error_queue.lock);
2677
2678out_free_skb:
2679 kfree_skb(skb);
2680out:
2681 return err;
2682}
2683
1da177e4
LT
2684/*
2685 * Pull a packet from our receive queue and hand it to the user.
2686 * If necessary we block.
2687 */
2688
2689static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
2690 struct msghdr *msg, size_t len, int flags)
2691{
2692 struct sock *sk = sock->sk;
2693 struct sk_buff *skb;
2694 int copied, err;
0fb375fb 2695 struct sockaddr_ll *sll;
bfd5f4a3 2696 int vnet_hdr_len = 0;
1da177e4
LT
2697
2698 err = -EINVAL;
ed85b565 2699 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
2700 goto out;
2701
2702#if 0
2703 /* What error should we return now? EUNATTACH? */
2704 if (pkt_sk(sk)->ifindex < 0)
2705 return -ENODEV;
2706#endif
2707
ed85b565
RC
2708 if (flags & MSG_ERRQUEUE) {
2709 err = packet_recv_error(sk, msg, len);
2710 goto out;
2711 }
2712
1da177e4
LT
2713 /*
2714 * Call the generic datagram receiver. This handles all sorts
2715 * of horrible races and re-entrancy so we can forget about it
2716 * in the protocol layers.
2717 *
 2718 * Now it will return ENETDOWN if the device has just gone down,
 2719 * but then it will block.
2720 */
2721
40d4e3df 2722 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
2723
2724 /*
1ce4f28b 2725 * If an error occurred, return it. skb_recv_datagram()
1da177e4
LT
 2726 * handles the blocking for us, so we don't have to worry
 2727 * about blocking retries here.
2728 */
2729
8ae55f04 2730 if (skb == NULL)
1da177e4
LT
2731 goto out;
2732
bfd5f4a3
SS
2733 if (pkt_sk(sk)->has_vnet_hdr) {
2734 struct virtio_net_hdr vnet_hdr = { 0 };
2735
2736 err = -EINVAL;
2737 vnet_hdr_len = sizeof(vnet_hdr);
1f18b717 2738 if (len < vnet_hdr_len)
bfd5f4a3
SS
2739 goto out_free;
2740
1f18b717
MK
2741 len -= vnet_hdr_len;
2742
bfd5f4a3
SS
2743 if (skb_is_gso(skb)) {
2744 struct skb_shared_info *sinfo = skb_shinfo(skb);
2745
2746 /* This is a hint as to how much should be linear. */
2747 vnet_hdr.hdr_len = skb_headlen(skb);
2748 vnet_hdr.gso_size = sinfo->gso_size;
2749 if (sinfo->gso_type & SKB_GSO_TCPV4)
2750 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2751 else if (sinfo->gso_type & SKB_GSO_TCPV6)
2752 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2753 else if (sinfo->gso_type & SKB_GSO_UDP)
2754 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
2755 else if (sinfo->gso_type & SKB_GSO_FCOE)
2756 goto out_free;
2757 else
2758 BUG();
2759 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
2760 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2761 } else
2762 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
2763
2764 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2765 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
55508d60 2766 vnet_hdr.csum_start = skb_checksum_start_offset(skb);
bfd5f4a3 2767 vnet_hdr.csum_offset = skb->csum_offset;
10a8d94a
JW
2768 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
2769 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
bfd5f4a3
SS
2770 } /* else everything is zero */
2771
2772 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
2773 vnet_hdr_len);
2774 if (err < 0)
2775 goto out_free;
2776 }
2777
0fb375fb
EB
2778 /*
2779 * If the address length field is there to be filled in, we fill
2780 * it in now.
2781 */
2782
ffbc6111 2783 sll = &PACKET_SKB_CB(skb)->sa.ll;
0fb375fb
EB
2784 if (sock->type == SOCK_PACKET)
2785 msg->msg_namelen = sizeof(struct sockaddr_pkt);
2786 else
2787 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
2788
1da177e4
LT
2789 /*
 2790 * You lose any data beyond the buffer you gave. If that worries a
 2791 * user program, it can ask the device for its MTU anyway.
2792 */
2793
2794 copied = skb->len;
40d4e3df
ED
2795 if (copied > len) {
2796 copied = len;
2797 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
2798 }
2799
2800 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2801 if (err)
2802 goto out_free;
2803
3b885787 2804 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4
LT
2805
2806 if (msg->msg_name)
ffbc6111
HX
2807 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
2808 msg->msg_namelen);
1da177e4 2809
8dc41944 2810 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
2811 struct tpacket_auxdata aux;
2812
2813 aux.tp_status = TP_STATUS_USER;
2814 if (skb->ip_summed == CHECKSUM_PARTIAL)
2815 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
2816 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
2817 aux.tp_snaplen = skb->len;
2818 aux.tp_mac = 0;
bbe735e4 2819 aux.tp_net = skb_network_offset(skb);
a3bcc23e
BG
2820 if (vlan_tx_tag_present(skb)) {
2821 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
2822 aux.tp_status |= TP_STATUS_VLAN_VALID;
2823 } else {
2824 aux.tp_vlan_tci = 0;
2825 }
13fcb7bd 2826 aux.tp_padding = 0;
ffbc6111 2827 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
2828 }
2829
1da177e4
LT
2830 /*
2831 * Free or return the buffer as appropriate. Again this
2832 * hides all the races and re-entrancy issues from us.
2833 */
bfd5f4a3 2834 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
2835
2836out_free:
2837 skb_free_datagram(sk, skb);
2838out:
2839 return err;
2840}
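
/*
 * Illustrative user-space sketch (not part of af_packet.c): receiving one
 * packet and collecting the tpacket_auxdata control message that
 * packet_recvmsg() above attaches once PACKET_AUXDATA has been enabled
 * with setsockopt(). Buffer sizes are arbitrary example choices.
 */
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/if_packet.h>
#include <stdint.h>
#include <string.h>

static ssize_t recv_with_auxdata(int fd, uint8_t *buf, size_t buflen,
				 struct tpacket_auxdata *aux_out)
{
	union {
		struct cmsghdr cm;
		char buf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
	} ctl;
	struct iovec iov = { .iov_base = buf, .iov_len = buflen };
	struct msghdr msg = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= &ctl,
		.msg_controllen	= sizeof(ctl),
	};
	struct cmsghdr *cmsg;
	ssize_t len = recvmsg(fd, &msg, 0);

	if (len < 0)
		return len;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_PACKET &&
		    cmsg->cmsg_type == PACKET_AUXDATA)
			memcpy(aux_out, CMSG_DATA(cmsg), sizeof(*aux_out));
	}
	return len;
}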
2841
1da177e4
LT
2842static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
2843 int *uaddr_len, int peer)
2844{
2845 struct net_device *dev;
2846 struct sock *sk = sock->sk;
2847
2848 if (peer)
2849 return -EOPNOTSUPP;
2850
2851 uaddr->sa_family = AF_PACKET;
654d1f8a
ED
2852 rcu_read_lock();
2853 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
2854 if (dev)
67286640 2855 strncpy(uaddr->sa_data, dev->name, 14);
654d1f8a 2856 else
1da177e4 2857 memset(uaddr->sa_data, 0, 14);
654d1f8a 2858 rcu_read_unlock();
1da177e4
LT
2859 *uaddr_len = sizeof(*uaddr);
2860
2861 return 0;
2862}
1da177e4
LT
2863
2864static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
2865 int *uaddr_len, int peer)
2866{
2867 struct net_device *dev;
2868 struct sock *sk = sock->sk;
2869 struct packet_sock *po = pkt_sk(sk);
13cfa97b 2870 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
2871
2872 if (peer)
2873 return -EOPNOTSUPP;
2874
2875 sll->sll_family = AF_PACKET;
2876 sll->sll_ifindex = po->ifindex;
2877 sll->sll_protocol = po->num;
67286640 2878 sll->sll_pkttype = 0;
654d1f8a
ED
2879 rcu_read_lock();
2880 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
2881 if (dev) {
2882 sll->sll_hatype = dev->type;
2883 sll->sll_halen = dev->addr_len;
2884 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
2885 } else {
2886 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
2887 sll->sll_halen = 0;
2888 }
654d1f8a 2889 rcu_read_unlock();
0fb375fb 2890 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
2891
2892 return 0;
2893}
2894
2aeb0b88
WC
2895static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
2896 int what)
1da177e4
LT
2897{
2898 switch (i->type) {
2899 case PACKET_MR_MULTICAST:
1162563f
JP
2900 if (i->alen != dev->addr_len)
2901 return -EINVAL;
1da177e4 2902 if (what > 0)
22bedad3 2903 return dev_mc_add(dev, i->addr);
1da177e4 2904 else
22bedad3 2905 return dev_mc_del(dev, i->addr);
1da177e4
LT
2906 break;
2907 case PACKET_MR_PROMISC:
2aeb0b88 2908 return dev_set_promiscuity(dev, what);
1da177e4
LT
2909 break;
2910 case PACKET_MR_ALLMULTI:
2aeb0b88 2911 return dev_set_allmulti(dev, what);
1da177e4 2912 break;
d95ed927 2913 case PACKET_MR_UNICAST:
1162563f
JP
2914 if (i->alen != dev->addr_len)
2915 return -EINVAL;
d95ed927 2916 if (what > 0)
a748ee24 2917 return dev_uc_add(dev, i->addr);
d95ed927 2918 else
a748ee24 2919 return dev_uc_del(dev, i->addr);
d95ed927 2920 break;
40d4e3df
ED
2921 default:
2922 break;
1da177e4 2923 }
2aeb0b88 2924 return 0;
1da177e4
LT
2925}
2926
2927static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
2928{
40d4e3df 2929 for ( ; i; i = i->next) {
1da177e4
LT
2930 if (i->ifindex == dev->ifindex)
2931 packet_dev_mc(dev, i, what);
2932 }
2933}
2934
0fb375fb 2935static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2936{
2937 struct packet_sock *po = pkt_sk(sk);
2938 struct packet_mclist *ml, *i;
2939 struct net_device *dev;
2940 int err;
2941
2942 rtnl_lock();
2943
2944 err = -ENODEV;
3b1e0a65 2945 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
2946 if (!dev)
2947 goto done;
2948
2949 err = -EINVAL;
1162563f 2950 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
2951 goto done;
2952
2953 err = -ENOBUFS;
8b3a7005 2954 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
2955 if (i == NULL)
2956 goto done;
2957
2958 err = 0;
2959 for (ml = po->mclist; ml; ml = ml->next) {
2960 if (ml->ifindex == mreq->mr_ifindex &&
2961 ml->type == mreq->mr_type &&
2962 ml->alen == mreq->mr_alen &&
2963 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
2964 ml->count++;
2965 /* Free the new element ... */
2966 kfree(i);
2967 goto done;
2968 }
2969 }
2970
2971 i->type = mreq->mr_type;
2972 i->ifindex = mreq->mr_ifindex;
2973 i->alen = mreq->mr_alen;
2974 memcpy(i->addr, mreq->mr_address, i->alen);
2975 i->count = 1;
2976 i->next = po->mclist;
2977 po->mclist = i;
2aeb0b88
WC
2978 err = packet_dev_mc(dev, i, 1);
2979 if (err) {
2980 po->mclist = i->next;
2981 kfree(i);
2982 }
1da177e4
LT
2983
2984done:
2985 rtnl_unlock();
2986 return err;
2987}
2988
0fb375fb 2989static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
2990{
2991 struct packet_mclist *ml, **mlp;
2992
2993 rtnl_lock();
2994
2995 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
2996 if (ml->ifindex == mreq->mr_ifindex &&
2997 ml->type == mreq->mr_type &&
2998 ml->alen == mreq->mr_alen &&
2999 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3000 if (--ml->count == 0) {
3001 struct net_device *dev;
3002 *mlp = ml->next;
ad959e76
ED
3003 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3004 if (dev)
1da177e4 3005 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3006 kfree(ml);
3007 }
3008 rtnl_unlock();
3009 return 0;
3010 }
3011 }
3012 rtnl_unlock();
3013 return -EADDRNOTAVAIL;
3014}
3015
3016static void packet_flush_mclist(struct sock *sk)
3017{
3018 struct packet_sock *po = pkt_sk(sk);
3019 struct packet_mclist *ml;
3020
3021 if (!po->mclist)
3022 return;
3023
3024 rtnl_lock();
3025 while ((ml = po->mclist) != NULL) {
3026 struct net_device *dev;
3027
3028 po->mclist = ml->next;
ad959e76
ED
3029 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3030 if (dev != NULL)
1da177e4 3031 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3032 kfree(ml);
3033 }
3034 rtnl_unlock();
3035}
1da177e4
LT
3036
3037static int
b7058842 3038packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3039{
3040 struct sock *sk = sock->sk;
8dc41944 3041 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3042 int ret;
3043
3044 if (level != SOL_PACKET)
3045 return -ENOPROTOOPT;
3046
69e3c75f 3047 switch (optname) {
1ce4f28b 3048 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3049 case PACKET_DROP_MEMBERSHIP:
3050 {
0fb375fb
EB
3051 struct packet_mreq_max mreq;
3052 int len = optlen;
3053 memset(&mreq, 0, sizeof(mreq));
3054 if (len < sizeof(struct packet_mreq))
1da177e4 3055 return -EINVAL;
0fb375fb
EB
3056 if (len > sizeof(mreq))
3057 len = sizeof(mreq);
40d4e3df 3058 if (copy_from_user(&mreq, optval, len))
1da177e4 3059 return -EFAULT;
0fb375fb
EB
3060 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3061 return -EINVAL;
1da177e4
LT
3062 if (optname == PACKET_ADD_MEMBERSHIP)
3063 ret = packet_mc_add(sk, &mreq);
3064 else
3065 ret = packet_mc_drop(sk, &mreq);
3066 return ret;
3067 }
a2efcfa0 3068
1da177e4 3069 case PACKET_RX_RING:
69e3c75f 3070 case PACKET_TX_RING:
1da177e4 3071 {
f6fb8f10 3072 union tpacket_req_u req_u;
3073 int len;
1da177e4 3074
f6fb8f10 3075 switch (po->tp_version) {
3076 case TPACKET_V1:
3077 case TPACKET_V2:
3078 len = sizeof(req_u.req);
3079 break;
3080 case TPACKET_V3:
3081 default:
3082 len = sizeof(req_u.req3);
3083 break;
3084 }
3085 if (optlen < len)
1da177e4 3086 return -EINVAL;
bfd5f4a3
SS
3087 if (pkt_sk(sk)->has_vnet_hdr)
3088 return -EINVAL;
f6fb8f10 3089 if (copy_from_user(&req_u.req, optval, len))
1da177e4 3090 return -EFAULT;
f6fb8f10 3091 return packet_set_ring(sk, &req_u, 0,
3092 optname == PACKET_TX_RING);
1da177e4
LT
3093 }
3094 case PACKET_COPY_THRESH:
3095 {
3096 int val;
3097
40d4e3df 3098 if (optlen != sizeof(val))
1da177e4 3099 return -EINVAL;
40d4e3df 3100 if (copy_from_user(&val, optval, sizeof(val)))
1da177e4
LT
3101 return -EFAULT;
3102
3103 pkt_sk(sk)->copy_thresh = val;
3104 return 0;
3105 }
bbd6ef87
PM
3106 case PACKET_VERSION:
3107 {
3108 int val;
3109
3110 if (optlen != sizeof(val))
3111 return -EINVAL;
69e3c75f 3112 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
bbd6ef87
PM
3113 return -EBUSY;
3114 if (copy_from_user(&val, optval, sizeof(val)))
3115 return -EFAULT;
3116 switch (val) {
3117 case TPACKET_V1:
3118 case TPACKET_V2:
f6fb8f10 3119 case TPACKET_V3:
bbd6ef87
PM
3120 po->tp_version = val;
3121 return 0;
3122 default:
3123 return -EINVAL;
3124 }
3125 }
8913336a
PM
3126 case PACKET_RESERVE:
3127 {
3128 unsigned int val;
3129
3130 if (optlen != sizeof(val))
3131 return -EINVAL;
69e3c75f 3132 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
8913336a
PM
3133 return -EBUSY;
3134 if (copy_from_user(&val, optval, sizeof(val)))
3135 return -EFAULT;
3136 po->tp_reserve = val;
3137 return 0;
3138 }
69e3c75f
JB
3139 case PACKET_LOSS:
3140 {
3141 unsigned int val;
3142
3143 if (optlen != sizeof(val))
3144 return -EINVAL;
3145 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3146 return -EBUSY;
3147 if (copy_from_user(&val, optval, sizeof(val)))
3148 return -EFAULT;
3149 po->tp_loss = !!val;
3150 return 0;
3151 }
8dc41944
HX
3152 case PACKET_AUXDATA:
3153 {
3154 int val;
3155
3156 if (optlen < sizeof(val))
3157 return -EINVAL;
3158 if (copy_from_user(&val, optval, sizeof(val)))
3159 return -EFAULT;
3160
3161 po->auxdata = !!val;
3162 return 0;
3163 }
80feaacb
PWJ
3164 case PACKET_ORIGDEV:
3165 {
3166 int val;
3167
3168 if (optlen < sizeof(val))
3169 return -EINVAL;
3170 if (copy_from_user(&val, optval, sizeof(val)))
3171 return -EFAULT;
3172
3173 po->origdev = !!val;
3174 return 0;
3175 }
bfd5f4a3
SS
3176 case PACKET_VNET_HDR:
3177 {
3178 int val;
3179
3180 if (sock->type != SOCK_RAW)
3181 return -EINVAL;
3182 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3183 return -EBUSY;
3184 if (optlen < sizeof(val))
3185 return -EINVAL;
3186 if (copy_from_user(&val, optval, sizeof(val)))
3187 return -EFAULT;
3188
3189 po->has_vnet_hdr = !!val;
3190 return 0;
3191 }
614f60fa
SM
3192 case PACKET_TIMESTAMP:
3193 {
3194 int val;
3195
3196 if (optlen != sizeof(val))
3197 return -EINVAL;
3198 if (copy_from_user(&val, optval, sizeof(val)))
3199 return -EFAULT;
3200
3201 po->tp_tstamp = val;
3202 return 0;
3203 }
dc99f600
DM
3204 case PACKET_FANOUT:
3205 {
3206 int val;
3207
3208 if (optlen != sizeof(val))
3209 return -EINVAL;
3210 if (copy_from_user(&val, optval, sizeof(val)))
3211 return -EFAULT;
3212
3213 return fanout_add(sk, val & 0xffff, val >> 16);
3214 }
1da177e4
LT
3215 default:
3216 return -ENOPROTOOPT;
3217 }
3218}
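
/*
 * Illustrative user-space sketch (not part of af_packet.c): the
 * PACKET_ADD_MEMBERSHIP option handled above, used here to put one
 * interface into promiscuous mode via PACKET_MR_PROMISC. The ifindex is
 * whatever the caller resolved, e.g. with if_nametoindex().
 */
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <string.h>

static int enable_promisc(int fd, int ifindex)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = ifindex;
	mreq.mr_type = PACKET_MR_PROMISC;	/* no hardware address needed */

	return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
			  &mreq, sizeof(mreq));
}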
3219
3220static int packet_getsockopt(struct socket *sock, int level, int optname,
3221 char __user *optval, int __user *optlen)
3222{
3223 int len;
8dc41944 3224 int val;
1da177e4
LT
3225 struct sock *sk = sock->sk;
3226 struct packet_sock *po = pkt_sk(sk);
8dc41944
HX
3227 void *data;
3228 struct tpacket_stats st;
f6fb8f10 3229 union tpacket_stats_u st_u;
1da177e4
LT
3230
3231 if (level != SOL_PACKET)
3232 return -ENOPROTOOPT;
3233
8ae55f04
KK
3234 if (get_user(len, optlen))
3235 return -EFAULT;
1da177e4
LT
3236
3237 if (len < 0)
3238 return -EINVAL;
1ce4f28b 3239
69e3c75f 3240 switch (optname) {
1da177e4 3241 case PACKET_STATISTICS:
f6fb8f10 3242 if (po->tp_version == TPACKET_V3) {
3243 len = sizeof(struct tpacket_stats_v3);
3244 } else {
3245 if (len > sizeof(struct tpacket_stats))
3246 len = sizeof(struct tpacket_stats);
3247 }
1da177e4 3248 spin_lock_bh(&sk->sk_receive_queue.lock);
f6fb8f10 3249 if (po->tp_version == TPACKET_V3) {
3250 memcpy(&st_u.stats3, &po->stats,
3251 sizeof(struct tpacket_stats));
3252 st_u.stats3.tp_freeze_q_cnt =
3253 po->stats_u.stats3.tp_freeze_q_cnt;
3254 st_u.stats3.tp_packets += po->stats.tp_drops;
3255 data = &st_u.stats3;
3256 } else {
3257 st = po->stats;
3258 st.tp_packets += st.tp_drops;
3259 data = &st;
3260 }
1da177e4
LT
3261 memset(&po->stats, 0, sizeof(st));
3262 spin_unlock_bh(&sk->sk_receive_queue.lock);
8dc41944
HX
3263 break;
3264 case PACKET_AUXDATA:
3265 if (len > sizeof(int))
3266 len = sizeof(int);
3267 val = po->auxdata;
3268
80feaacb
PWJ
3269 data = &val;
3270 break;
3271 case PACKET_ORIGDEV:
3272 if (len > sizeof(int))
3273 len = sizeof(int);
3274 val = po->origdev;
3275
bfd5f4a3
SS
3276 data = &val;
3277 break;
3278 case PACKET_VNET_HDR:
3279 if (len > sizeof(int))
3280 len = sizeof(int);
3281 val = po->has_vnet_hdr;
3282
8dc41944 3283 data = &val;
1da177e4 3284 break;
bbd6ef87
PM
3285 case PACKET_VERSION:
3286 if (len > sizeof(int))
3287 len = sizeof(int);
3288 val = po->tp_version;
3289 data = &val;
3290 break;
3291 case PACKET_HDRLEN:
3292 if (len > sizeof(int))
3293 len = sizeof(int);
3294 if (copy_from_user(&val, optval, len))
3295 return -EFAULT;
3296 switch (val) {
3297 case TPACKET_V1:
3298 val = sizeof(struct tpacket_hdr);
3299 break;
3300 case TPACKET_V2:
3301 val = sizeof(struct tpacket2_hdr);
3302 break;
f6fb8f10 3303 case TPACKET_V3:
3304 val = sizeof(struct tpacket3_hdr);
3305 break;
bbd6ef87
PM
3306 default:
3307 return -EINVAL;
3308 }
3309 data = &val;
3310 break;
8913336a
PM
3311 case PACKET_RESERVE:
3312 if (len > sizeof(unsigned int))
3313 len = sizeof(unsigned int);
3314 val = po->tp_reserve;
3315 data = &val;
3316 break;
69e3c75f
JB
3317 case PACKET_LOSS:
3318 if (len > sizeof(unsigned int))
3319 len = sizeof(unsigned int);
3320 val = po->tp_loss;
3321 data = &val;
3322 break;
614f60fa
SM
3323 case PACKET_TIMESTAMP:
3324 if (len > sizeof(int))
3325 len = sizeof(int);
3326 val = po->tp_tstamp;
3327 data = &val;
3328 break;
dc99f600
DM
3329 case PACKET_FANOUT:
3330 if (len > sizeof(int))
3331 len = sizeof(int);
3332 val = (po->fanout ?
3333 ((u32)po->fanout->id |
3334 ((u32)po->fanout->type << 16)) :
3335 0);
3336 data = &val;
3337 break;
1da177e4
LT
3338 default:
3339 return -ENOPROTOOPT;
3340 }
3341
8ae55f04
KK
3342 if (put_user(len, optlen))
3343 return -EFAULT;
8dc41944
HX
3344 if (copy_to_user(optval, data, len))
3345 return -EFAULT;
8ae55f04 3346 return 0;
1da177e4
LT
3347}
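
/*
 * Illustrative user-space sketch (not part of af_packet.c): reading the
 * counters that packet_getsockopt() above returns for PACKET_STATISTICS.
 * As the code notes, the counters are clear-on-read, so each call reports
 * what was seen since the previous call.
 */
#include <sys/socket.h>
#include <linux/if_packet.h>

static int read_packet_stats(int fd, struct tpacket_stats *st)
{
	socklen_t len = sizeof(*st);

	return getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, st, &len);
}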
3348
3349
3350static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
3351{
3352 struct sock *sk;
3353 struct hlist_node *node;
ad930650 3354 struct net_device *dev = data;
c346dca1 3355 struct net *net = dev_net(dev);
1da177e4 3356
808f5114 3357 rcu_read_lock();
3358 sk_for_each_rcu(sk, node, &net->packet.sklist) {
1da177e4
LT
3359 struct packet_sock *po = pkt_sk(sk);
3360
3361 switch (msg) {
3362 case NETDEV_UNREGISTER:
1da177e4
LT
3363 if (po->mclist)
3364 packet_dev_mclist(dev, po->mclist, -1);
a2efcfa0
DM
3365 /* fallthrough */
3366
1da177e4
LT
3367 case NETDEV_DOWN:
3368 if (dev->ifindex == po->ifindex) {
3369 spin_lock(&po->bind_lock);
3370 if (po->running) {
ce06b03e 3371 __unregister_prot_hook(sk, false);
1da177e4
LT
3372 sk->sk_err = ENETDOWN;
3373 if (!sock_flag(sk, SOCK_DEAD))
3374 sk->sk_error_report(sk);
3375 }
3376 if (msg == NETDEV_UNREGISTER) {
3377 po->ifindex = -1;
160ff18a
BG
3378 if (po->prot_hook.dev)
3379 dev_put(po->prot_hook.dev);
1da177e4
LT
3380 po->prot_hook.dev = NULL;
3381 }
3382 spin_unlock(&po->bind_lock);
3383 }
3384 break;
3385 case NETDEV_UP:
808f5114 3386 if (dev->ifindex == po->ifindex) {
3387 spin_lock(&po->bind_lock);
ce06b03e
DM
3388 if (po->num)
3389 register_prot_hook(sk);
808f5114 3390 spin_unlock(&po->bind_lock);
1da177e4 3391 }
1da177e4
LT
3392 break;
3393 }
3394 }
808f5114 3395 rcu_read_unlock();
1da177e4
LT
3396 return NOTIFY_DONE;
3397}
3398
3399
3400static int packet_ioctl(struct socket *sock, unsigned int cmd,
3401 unsigned long arg)
3402{
3403 struct sock *sk = sock->sk;
3404
69e3c75f 3405 switch (cmd) {
40d4e3df
ED
3406 case SIOCOUTQ:
3407 {
3408 int amount = sk_wmem_alloc_get(sk);
31e6d363 3409
40d4e3df
ED
3410 return put_user(amount, (int __user *)arg);
3411 }
3412 case SIOCINQ:
3413 {
3414 struct sk_buff *skb;
3415 int amount = 0;
3416
3417 spin_lock_bh(&sk->sk_receive_queue.lock);
3418 skb = skb_peek(&sk->sk_receive_queue);
3419 if (skb)
3420 amount = skb->len;
3421 spin_unlock_bh(&sk->sk_receive_queue.lock);
3422 return put_user(amount, (int __user *)arg);
3423 }
3424 case SIOCGSTAMP:
3425 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3426 case SIOCGSTAMPNS:
3427 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1ce4f28b 3428
1da177e4 3429#ifdef CONFIG_INET
40d4e3df
ED
3430 case SIOCADDRT:
3431 case SIOCDELRT:
3432 case SIOCDARP:
3433 case SIOCGARP:
3434 case SIOCSARP:
3435 case SIOCGIFADDR:
3436 case SIOCSIFADDR:
3437 case SIOCGIFBRDADDR:
3438 case SIOCSIFBRDADDR:
3439 case SIOCGIFNETMASK:
3440 case SIOCSIFNETMASK:
3441 case SIOCGIFDSTADDR:
3442 case SIOCSIFDSTADDR:
3443 case SIOCSIFFLAGS:
40d4e3df 3444 return inet_dgram_ops.ioctl(sock, cmd, arg);
1da177e4
LT
3445#endif
3446
40d4e3df
ED
3447 default:
3448 return -ENOIOCTLCMD;
1da177e4
LT
3449 }
3450 return 0;
3451}
3452
40d4e3df 3453static unsigned int packet_poll(struct file *file, struct socket *sock,
1da177e4
LT
3454 poll_table *wait)
3455{
3456 struct sock *sk = sock->sk;
3457 struct packet_sock *po = pkt_sk(sk);
3458 unsigned int mask = datagram_poll(file, sock, wait);
3459
3460 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 3461 if (po->rx_ring.pg_vec) {
f6fb8f10 3462 if (!packet_previous_rx_frame(po, &po->rx_ring,
3463 TP_STATUS_KERNEL))
1da177e4
LT
3464 mask |= POLLIN | POLLRDNORM;
3465 }
3466 spin_unlock_bh(&sk->sk_receive_queue.lock);
69e3c75f
JB
3467 spin_lock_bh(&sk->sk_write_queue.lock);
3468 if (po->tx_ring.pg_vec) {
3469 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3470 mask |= POLLOUT | POLLWRNORM;
3471 }
3472 spin_unlock_bh(&sk->sk_write_queue.lock);
1da177e4
LT
3473 return mask;
3474}
3475
3476
 3477/* Dirty? Well, I still have not found a better way to account
3478 * for user mmaps.
3479 */
3480
3481static void packet_mm_open(struct vm_area_struct *vma)
3482{
3483 struct file *file = vma->vm_file;
40d4e3df 3484 struct socket *sock = file->private_data;
1da177e4 3485 struct sock *sk = sock->sk;
1ce4f28b 3486
1da177e4
LT
3487 if (sk)
3488 atomic_inc(&pkt_sk(sk)->mapped);
3489}
3490
3491static void packet_mm_close(struct vm_area_struct *vma)
3492{
3493 struct file *file = vma->vm_file;
40d4e3df 3494 struct socket *sock = file->private_data;
1da177e4 3495 struct sock *sk = sock->sk;
1ce4f28b 3496
1da177e4
LT
3497 if (sk)
3498 atomic_dec(&pkt_sk(sk)->mapped);
3499}
3500
f0f37e2f 3501static const struct vm_operations_struct packet_mmap_ops = {
40d4e3df
ED
3502 .open = packet_mm_open,
3503 .close = packet_mm_close,
1da177e4
LT
3504};
3505
0e3125c7
NH
3506static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3507 unsigned int len)
1da177e4
LT
3508{
3509 int i;
3510
4ebf0ae2 3511 for (i = 0; i < len; i++) {
0e3125c7 3512 if (likely(pg_vec[i].buffer)) {
c56b4d90 3513 if (is_vmalloc_addr(pg_vec[i].buffer))
0e3125c7
NH
3514 vfree(pg_vec[i].buffer);
3515 else
3516 free_pages((unsigned long)pg_vec[i].buffer,
3517 order);
3518 pg_vec[i].buffer = NULL;
3519 }
1da177e4
LT
3520 }
3521 kfree(pg_vec);
3522}
3523
c56b4d90 3524static inline char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 3525{
0e3125c7
NH
3526 char *buffer = NULL;
3527 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3528 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3529
3530 buffer = (char *) __get_free_pages(gfp_flags, order);
3531
3532 if (buffer)
3533 return buffer;
3534
3535 /*
3536 * __get_free_pages failed, fall back to vmalloc
3537 */
bbce5a59 3538 buffer = vzalloc((1 << order) * PAGE_SIZE);
719bfeaa 3539
0e3125c7
NH
3540 if (buffer)
3541 return buffer;
3542
3543 /*
 3544 * vmalloc failed, let's dig into swap here
3545 */
0e3125c7
NH
3546 gfp_flags &= ~__GFP_NORETRY;
3547 buffer = (char *)__get_free_pages(gfp_flags, order);
3548 if (buffer)
3549 return buffer;
3550
3551 /*
3552 * complete and utter failure
3553 */
3554 return NULL;
4ebf0ae2
DM
3555}
3556
0e3125c7 3557static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4ebf0ae2
DM
3558{
3559 unsigned int block_nr = req->tp_block_nr;
0e3125c7 3560 struct pgv *pg_vec;
4ebf0ae2
DM
3561 int i;
3562
0e3125c7 3563 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
4ebf0ae2
DM
3564 if (unlikely(!pg_vec))
3565 goto out;
3566
3567 for (i = 0; i < block_nr; i++) {
c56b4d90 3568 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 3569 if (unlikely(!pg_vec[i].buffer))
4ebf0ae2
DM
3570 goto out_free_pgvec;
3571 }
3572
3573out:
3574 return pg_vec;
3575
3576out_free_pgvec:
3577 free_pg_vec(pg_vec, order, block_nr);
3578 pg_vec = NULL;
3579 goto out;
3580}
1da177e4 3581
f6fb8f10 3582static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 3583 int closing, int tx_ring)
1da177e4 3584{
0e3125c7 3585 struct pgv *pg_vec = NULL;
1da177e4 3586 struct packet_sock *po = pkt_sk(sk);
0e11c91e 3587 int was_running, order = 0;
69e3c75f
JB
3588 struct packet_ring_buffer *rb;
3589 struct sk_buff_head *rb_queue;
0e11c91e 3590 __be16 num;
f6fb8f10 3591 int err = -EINVAL;
3592 /* Added to avoid minimal code churn */
3593 struct tpacket_req *req = &req_u->req;
3594
3595 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3596 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3597 WARN(1, "Tx-ring is not supported.\n");
3598 goto out;
3599 }
1ce4f28b 3600
69e3c75f
JB
3601 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3602 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 3603
69e3c75f
JB
3604 err = -EBUSY;
3605 if (!closing) {
3606 if (atomic_read(&po->mapped))
3607 goto out;
3608 if (atomic_read(&rb->pending))
3609 goto out;
3610 }
1da177e4 3611
69e3c75f
JB
3612 if (req->tp_block_nr) {
3613 /* Sanity tests and some calculations */
3614 err = -EBUSY;
3615 if (unlikely(rb->pg_vec))
3616 goto out;
1da177e4 3617
bbd6ef87
PM
3618 switch (po->tp_version) {
3619 case TPACKET_V1:
3620 po->tp_hdrlen = TPACKET_HDRLEN;
3621 break;
3622 case TPACKET_V2:
3623 po->tp_hdrlen = TPACKET2_HDRLEN;
3624 break;
f6fb8f10 3625 case TPACKET_V3:
3626 po->tp_hdrlen = TPACKET3_HDRLEN;
3627 break;
bbd6ef87
PM
3628 }
3629
69e3c75f 3630 err = -EINVAL;
4ebf0ae2 3631 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 3632 goto out;
4ebf0ae2 3633 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
69e3c75f 3634 goto out;
8913336a 3635 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
69e3c75f
JB
3636 po->tp_reserve))
3637 goto out;
4ebf0ae2 3638 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 3639 goto out;
1da177e4 3640
69e3c75f
JB
3641 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3642 if (unlikely(rb->frames_per_block <= 0))
3643 goto out;
3644 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3645 req->tp_frame_nr))
3646 goto out;
1da177e4
LT
3647
3648 err = -ENOMEM;
4ebf0ae2
DM
3649 order = get_order(req->tp_block_size);
3650 pg_vec = alloc_pg_vec(req, order);
3651 if (unlikely(!pg_vec))
1da177e4 3652 goto out;
f6fb8f10 3653 switch (po->tp_version) {
3654 case TPACKET_V3:
3655 /* Transmit path is not supported. We checked
3656 * it above but just being paranoid
3657 */
3658 if (!tx_ring)
3659 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
3660 break;
3661 default:
3662 break;
3663 }
69e3c75f
JB
3664 }
3665 /* Done */
3666 else {
3667 err = -EINVAL;
4ebf0ae2 3668 if (unlikely(req->tp_frame_nr))
69e3c75f 3669 goto out;
1da177e4
LT
3670 }
3671
3672 lock_sock(sk);
3673
3674 /* Detach socket from network */
3675 spin_lock(&po->bind_lock);
3676 was_running = po->running;
3677 num = po->num;
3678 if (was_running) {
1da177e4 3679 po->num = 0;
ce06b03e 3680 __unregister_prot_hook(sk, false);
1da177e4
LT
3681 }
3682 spin_unlock(&po->bind_lock);
1ce4f28b 3683
1da177e4
LT
3684 synchronize_net();
3685
3686 err = -EBUSY;
905db440 3687 mutex_lock(&po->pg_vec_lock);
1da177e4
LT
3688 if (closing || atomic_read(&po->mapped) == 0) {
3689 err = 0;
69e3c75f 3690 spin_lock_bh(&rb_queue->lock);
c053fd96 3691 swap(rb->pg_vec, pg_vec);
69e3c75f
JB
3692 rb->frame_max = (req->tp_frame_nr - 1);
3693 rb->head = 0;
3694 rb->frame_size = req->tp_frame_size;
3695 spin_unlock_bh(&rb_queue->lock);
3696
c053fd96
CG
3697 swap(rb->pg_vec_order, order);
3698 swap(rb->pg_vec_len, req->tp_block_nr);
69e3c75f
JB
3699
3700 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
3701 po->prot_hook.func = (po->rx_ring.pg_vec) ?
3702 tpacket_rcv : packet_rcv;
3703 skb_queue_purge(rb_queue);
1da177e4 3704 if (atomic_read(&po->mapped))
40d4e3df
ED
3705 pr_err("packet_mmap: vma is busy: %d\n",
3706 atomic_read(&po->mapped));
1da177e4 3707 }
905db440 3708 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3709
3710 spin_lock(&po->bind_lock);
ce06b03e 3711 if (was_running) {
1da177e4 3712 po->num = num;
ce06b03e 3713 register_prot_hook(sk);
1da177e4
LT
3714 }
3715 spin_unlock(&po->bind_lock);
f6fb8f10 3716 if (closing && (po->tp_version > TPACKET_V2)) {
3717 /* Because we don't support block-based V3 on tx-ring */
3718 if (!tx_ring)
3719 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3720 }
1da177e4
LT
3721 release_sock(sk);
3722
1da177e4
LT
3723 if (pg_vec)
3724 free_pg_vec(pg_vec, order, req->tp_block_nr);
3725out:
3726 return err;
3727}
3728
69e3c75f
JB
3729static int packet_mmap(struct file *file, struct socket *sock,
3730 struct vm_area_struct *vma)
1da177e4
LT
3731{
3732 struct sock *sk = sock->sk;
3733 struct packet_sock *po = pkt_sk(sk);
69e3c75f
JB
3734 unsigned long size, expected_size;
3735 struct packet_ring_buffer *rb;
1da177e4
LT
3736 unsigned long start;
3737 int err = -EINVAL;
3738 int i;
3739
3740 if (vma->vm_pgoff)
3741 return -EINVAL;
3742
905db440 3743 mutex_lock(&po->pg_vec_lock);
69e3c75f
JB
3744
3745 expected_size = 0;
3746 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3747 if (rb->pg_vec) {
3748 expected_size += rb->pg_vec_len
3749 * rb->pg_vec_pages
3750 * PAGE_SIZE;
3751 }
3752 }
3753
3754 if (expected_size == 0)
1da177e4 3755 goto out;
69e3c75f
JB
3756
3757 size = vma->vm_end - vma->vm_start;
3758 if (size != expected_size)
1da177e4
LT
3759 goto out;
3760
1da177e4 3761 start = vma->vm_start;
69e3c75f
JB
3762 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
3763 if (rb->pg_vec == NULL)
3764 continue;
3765
3766 for (i = 0; i < rb->pg_vec_len; i++) {
0e3125c7
NH
3767 struct page *page;
3768 void *kaddr = rb->pg_vec[i].buffer;
69e3c75f
JB
3769 int pg_num;
3770
c56b4d90
CG
3771 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
3772 page = pgv_to_page(kaddr);
69e3c75f
JB
3773 err = vm_insert_page(vma, start, page);
3774 if (unlikely(err))
3775 goto out;
3776 start += PAGE_SIZE;
0e3125c7 3777 kaddr += PAGE_SIZE;
69e3c75f 3778 }
4ebf0ae2 3779 }
1da177e4 3780 }
69e3c75f 3781
4ebf0ae2 3782 atomic_inc(&po->mapped);
1da177e4
LT
3783 vma->vm_ops = &packet_mmap_ops;
3784 err = 0;
3785
3786out:
905db440 3787 mutex_unlock(&po->pg_vec_lock);
1da177e4
LT
3788 return err;
3789}
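
/*
 * Illustrative user-space sketch (not part of af_packet.c): setting up and
 * mapping a TPACKET_V2 RX ring so that it satisfies the checks in
 * packet_set_ring() above (page-aligned blocks, TPACKET_ALIGNMENT-sized
 * frames, tp_frame_nr == frames-per-block * tp_block_nr) and the single
 * contiguous mapping packet_mmap() expects. The sizes are example values.
 */
#include <sys/socket.h>
#include <sys/mman.h>
#include <linux/if_packet.h>
#include <unistd.h>
#include <string.h>

static void *map_rx_ring(int fd, struct tpacket_req *req)
{
	int version = TPACKET_V2;

	/* the ring layout depends on the header version, so set it first */
	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
		       &version, sizeof(version)) < 0)
		return MAP_FAILED;

	memset(req, 0, sizeof(*req));
	req->tp_block_size = getpagesize();		/* one page per block */
	req->tp_frame_size = TPACKET_ALIGN(2048);	/* 2048-byte frames */
	req->tp_block_nr   = 64;
	req->tp_frame_nr   = (req->tp_block_size / req->tp_frame_size) *
			     req->tp_block_nr;

	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING,
		       req, sizeof(*req)) < 0)
		return MAP_FAILED;

	/* packet_mmap() expects one mapping covering the whole ring */
	return mmap(NULL, (size_t)req->tp_block_size * req->tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
}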
1da177e4 3790
90ddc4f0 3791static const struct proto_ops packet_ops_spkt = {
1da177e4
LT
3792 .family = PF_PACKET,
3793 .owner = THIS_MODULE,
3794 .release = packet_release,
3795 .bind = packet_bind_spkt,
3796 .connect = sock_no_connect,
3797 .socketpair = sock_no_socketpair,
3798 .accept = sock_no_accept,
3799 .getname = packet_getname_spkt,
3800 .poll = datagram_poll,
3801 .ioctl = packet_ioctl,
3802 .listen = sock_no_listen,
3803 .shutdown = sock_no_shutdown,
3804 .setsockopt = sock_no_setsockopt,
3805 .getsockopt = sock_no_getsockopt,
3806 .sendmsg = packet_sendmsg_spkt,
3807 .recvmsg = packet_recvmsg,
3808 .mmap = sock_no_mmap,
3809 .sendpage = sock_no_sendpage,
3810};
1da177e4 3811
90ddc4f0 3812static const struct proto_ops packet_ops = {
1da177e4
LT
3813 .family = PF_PACKET,
3814 .owner = THIS_MODULE,
3815 .release = packet_release,
3816 .bind = packet_bind,
3817 .connect = sock_no_connect,
3818 .socketpair = sock_no_socketpair,
3819 .accept = sock_no_accept,
1ce4f28b 3820 .getname = packet_getname,
1da177e4
LT
3821 .poll = packet_poll,
3822 .ioctl = packet_ioctl,
3823 .listen = sock_no_listen,
3824 .shutdown = sock_no_shutdown,
3825 .setsockopt = packet_setsockopt,
3826 .getsockopt = packet_getsockopt,
3827 .sendmsg = packet_sendmsg,
3828 .recvmsg = packet_recvmsg,
3829 .mmap = packet_mmap,
3830 .sendpage = sock_no_sendpage,
3831};
3832
ec1b4cf7 3833static const struct net_proto_family packet_family_ops = {
1da177e4
LT
3834 .family = PF_PACKET,
3835 .create = packet_create,
3836 .owner = THIS_MODULE,
3837};
3838
3839static struct notifier_block packet_netdev_notifier = {
40d4e3df 3840 .notifier_call = packet_notifier,
1da177e4
LT
3841};
3842
3843#ifdef CONFIG_PROC_FS
1da177e4
LT
3844
3845static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 3846 __acquires(RCU)
1da177e4 3847{
e372c414 3848 struct net *net = seq_file_net(seq);
808f5114 3849
3850 rcu_read_lock();
3851 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
1da177e4
LT
3852}
3853
3854static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3855{
1bf40954 3856 struct net *net = seq_file_net(seq);
808f5114 3857 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
1da177e4
LT
3858}
3859
3860static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 3861 __releases(RCU)
1da177e4 3862{
808f5114 3863 rcu_read_unlock();
1da177e4
LT
3864}
3865
1ce4f28b 3866static int packet_seq_show(struct seq_file *seq, void *v)
1da177e4
LT
3867{
3868 if (v == SEQ_START_TOKEN)
3869 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
3870 else {
b7ceabd9 3871 struct sock *s = sk_entry(v);
1da177e4
LT
3872 const struct packet_sock *po = pkt_sk(s);
3873
3874 seq_printf(seq,
71338aa7 3875 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4
LT
3876 s,
3877 atomic_read(&s->sk_refcnt),
3878 s->sk_type,
3879 ntohs(po->num),
3880 po->ifindex,
3881 po->running,
3882 atomic_read(&s->sk_rmem_alloc),
3883 sock_i_uid(s),
40d4e3df 3884 sock_i_ino(s));
1da177e4
LT
3885 }
3886
3887 return 0;
3888}
3889
56b3d975 3890static const struct seq_operations packet_seq_ops = {
1da177e4
LT
3891 .start = packet_seq_start,
3892 .next = packet_seq_next,
3893 .stop = packet_seq_stop,
3894 .show = packet_seq_show,
3895};
3896
3897static int packet_seq_open(struct inode *inode, struct file *file)
3898{
e372c414
DL
3899 return seq_open_net(inode, file, &packet_seq_ops,
3900 sizeof(struct seq_net_private));
1da177e4
LT
3901}
3902
da7071d7 3903static const struct file_operations packet_seq_fops = {
1da177e4
LT
3904 .owner = THIS_MODULE,
3905 .open = packet_seq_open,
3906 .read = seq_read,
3907 .llseek = seq_lseek,
e372c414 3908 .release = seq_release_net,
1da177e4
LT
3909};
3910
3911#endif
3912
2c8c1e72 3913static int __net_init packet_net_init(struct net *net)
d12d01d6 3914{
808f5114 3915 spin_lock_init(&net->packet.sklist_lock);
2aaef4e4 3916 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6
DL
3917
3918 if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
3919 return -ENOMEM;
3920
3921 return 0;
3922}
3923
2c8c1e72 3924static void __net_exit packet_net_exit(struct net *net)
d12d01d6
DL
3925{
3926 proc_net_remove(net, "packet");
3927}
3928
3929static struct pernet_operations packet_net_ops = {
3930 .init = packet_net_init,
3931 .exit = packet_net_exit,
3932};
3933
3934
1da177e4
LT
3935static void __exit packet_exit(void)
3936{
1da177e4 3937 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 3938 unregister_pernet_subsys(&packet_net_ops);
1da177e4
LT
3939 sock_unregister(PF_PACKET);
3940 proto_unregister(&packet_proto);
3941}
3942
3943static int __init packet_init(void)
3944{
3945 int rc = proto_register(&packet_proto, 0);
3946
3947 if (rc != 0)
3948 goto out;
3949
3950 sock_register(&packet_family_ops);
d12d01d6 3951 register_pernet_subsys(&packet_net_ops);
1da177e4 3952 register_netdevice_notifier(&packet_netdev_notifier);
1da177e4
LT
3953out:
3954 return rc;
3955}
3956
3957module_init(packet_init);
3958module_exit(packet_exit);
3959MODULE_LICENSE("GPL");
3960MODULE_ALIAS_NETPROTO(PF_PACKET);