]> git.proxmox.com Git - mirror_ubuntu-kernels.git/blame - net/packet/af_packet.c
treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 152
[mirror_ubuntu-kernels.git] / net / packet / af_packet.c
CommitLineData
2874c5fd 1// SPDX-License-Identifier: GPL-2.0-or-later
1da177e4
LT
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * PACKET - implements raw packet sockets.
8 *
02c30a84 9 * Authors: Ross Biro
1da177e4
LT
10 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 * Alan Cox, <gw4pts@gw4pts.ampr.org>
12 *
1ce4f28b 13 * Fixes:
1da177e4
LT
14 * Alan Cox : verify_area() now used correctly
15 * Alan Cox : new skbuff lists, look ma no backlogs!
16 * Alan Cox : tidied skbuff lists.
17 * Alan Cox : Now uses generic datagram routines I
18 * added. Also fixed the peek/read crash
19 * from all old Linux datagram code.
20 * Alan Cox : Uses the improved datagram code.
21 * Alan Cox : Added NULL's for socket options.
22 * Alan Cox : Re-commented the code.
23 * Alan Cox : Use new kernel side addressing
24 * Rob Janssen : Correct MTU usage.
25 * Dave Platt : Counter leaks caused by incorrect
26 * interrupt locking and some slightly
27 * dubious gcc output. Can you read
28 * compiler: it said _VOLATILE_
29 * Richard Kooijman : Timestamp fixes.
30 * Alan Cox : New buffers. Use sk->mac.raw.
31 * Alan Cox : sendmsg/recvmsg support.
32 * Alan Cox : Protocol setting support
33 * Alexey Kuznetsov : Untied from IPv4 stack.
34 * Cyrus Durgin : Fixed kerneld for kmod.
35 * Michal Ostrowski : Module initialization cleanup.
1ce4f28b 36 * Ulises Alonso : Frame number limit removal and
1da177e4 37 * packet_set_ring memory leak.
0fb375fb
EB
38 * Eric Biederman : Allow for > 8 byte hardware addresses.
39 * The convention is that longer addresses
40 * will simply extend the hardware address
1ce4f28b 41 * byte arrays at the end of sockaddr_ll
0fb375fb 42 * and packet_mreq.
69e3c75f 43 * Johann Baudy : Added TX RING.
f6fb8f10 44 * Chetan Loke : Implemented TPACKET_V3 block abstraction
45 * layer.
46 * Copyright (C) 2011, <lokec@ccs.neu.edu>
1da177e4 47 */
1ce4f28b 48
1da177e4 49#include <linux/types.h>
1da177e4 50#include <linux/mm.h>
4fc268d2 51#include <linux/capability.h>
1da177e4
LT
52#include <linux/fcntl.h>
53#include <linux/socket.h>
54#include <linux/in.h>
55#include <linux/inet.h>
56#include <linux/netdevice.h>
57#include <linux/if_packet.h>
58#include <linux/wireless.h>
ffbc6111 59#include <linux/kernel.h>
1da177e4 60#include <linux/kmod.h>
5a0e3ad6 61#include <linux/slab.h>
0e3125c7 62#include <linux/vmalloc.h>
457c4cbc 63#include <net/net_namespace.h>
1da177e4
LT
64#include <net/ip.h>
65#include <net/protocol.h>
66#include <linux/skbuff.h>
67#include <net/sock.h>
68#include <linux/errno.h>
69#include <linux/timer.h>
7c0f6ba6 70#include <linux/uaccess.h>
1da177e4
LT
71#include <asm/ioctls.h>
72#include <asm/page.h>
a1f8e7f7 73#include <asm/cacheflush.h>
1da177e4
LT
74#include <asm/io.h>
75#include <linux/proc_fs.h>
76#include <linux/seq_file.h>
77#include <linux/poll.h>
78#include <linux/module.h>
79#include <linux/init.h>
905db440 80#include <linux/mutex.h>
05423b24 81#include <linux/if_vlan.h>
bfd5f4a3 82#include <linux/virtio_net.h>
ed85b565 83#include <linux/errqueue.h>
614f60fa 84#include <linux/net_tstamp.h>
b0138408 85#include <linux/percpu.h>
1da177e4
LT
86#ifdef CONFIG_INET
87#include <net/inet_common.h>
88#endif
47dceb8e 89#include <linux/bpf.h>
719c44d3 90#include <net/compat.h>
1da177e4 91
2787b04b
PE
92#include "internal.h"
93
1da177e4
LT
94/*
95 Assumptions:
96 - if device has no dev->hard_header routine, it adds and removes ll header
97 inside itself. In this case ll header is invisible outside of device,
98 but higher levels still should reserve dev->hard_header_len.
99 Some devices are enough clever to reallocate skb, when header
100 will not fit to reserved space (tunnel), another ones are silly
101 (PPP).
102 - packet socket receives packets with pulled ll header,
103 so that SOCK_RAW should push it back.
104
105On receive:
106-----------
107
108Incoming, dev->hard_header!=NULL
b0e380b1
ACM
109 mac_header -> ll header
110 data -> data
1da177e4
LT
111
112Outgoing, dev->hard_header!=NULL
b0e380b1
ACM
113 mac_header -> ll header
114 data -> ll header
1da177e4
LT
115
116Incoming, dev->hard_header==NULL
b0e380b1
ACM
117 mac_header -> UNKNOWN position. It is very likely, that it points to ll
118 header. PPP makes it, that is wrong, because introduce
db0c58f9 119 assymetry between rx and tx paths.
b0e380b1 120 data -> data
1da177e4
LT
121
122Outgoing, dev->hard_header==NULL
b0e380b1
ACM
123 mac_header -> data. ll header is still not built!
124 data -> data
1da177e4
LT
125
126Resume
127 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
128
129
130On transmit:
131------------
132
133dev->hard_header != NULL
b0e380b1
ACM
134 mac_header -> ll header
135 data -> ll header
1da177e4
LT
136
137dev->hard_header == NULL (ll header is added by device, we cannot control it)
b0e380b1
ACM
138 mac_header -> data
139 data -> data
1da177e4
LT
140
141 We should set nh.raw on output to correct posistion,
142 packet classifier depends on it.
143 */
144
1da177e4
LT
145/* Private packet socket structures. */
146
0fb375fb
EB
147/* identical to struct packet_mreq except it has
148 * a longer address field.
149 */
40d4e3df 150struct packet_mreq_max {
0fb375fb
EB
151 int mr_ifindex;
152 unsigned short mr_type;
153 unsigned short mr_alen;
154 unsigned char mr_address[MAX_ADDR_LEN];
1da177e4 155};
a2efcfa0 156
184f489e
DB
157union tpacket_uhdr {
158 struct tpacket_hdr *h1;
159 struct tpacket2_hdr *h2;
160 struct tpacket3_hdr *h3;
161 void *raw;
162};
163
f6fb8f10 164static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f
JB
165 int closing, int tx_ring);
166
f6fb8f10 167#define V3_ALIGNMENT (8)
168
bc59ba39 169#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
f6fb8f10 170
171#define BLK_PLUS_PRIV(sz_of_priv) \
172 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
173
f6fb8f10 174#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
175#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
176#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
177#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
178#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
179#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
180#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
181
69e3c75f 182struct packet_sock;
77f65ebd
WB
183static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
184 struct packet_type *pt, struct net_device *orig_dev);
1da177e4 185
f6fb8f10 186static void *packet_previous_frame(struct packet_sock *po,
187 struct packet_ring_buffer *rb,
188 int status);
189static void packet_increment_head(struct packet_ring_buffer *buff);
878cd3ba 190static int prb_curr_blk_in_use(struct tpacket_block_desc *);
bc59ba39 191static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
f6fb8f10 192 struct packet_sock *);
bc59ba39 193static void prb_retire_current_block(struct tpacket_kbdq_core *,
f6fb8f10 194 struct packet_sock *, unsigned int status);
bc59ba39 195static int prb_queue_frozen(struct tpacket_kbdq_core *);
196static void prb_open_block(struct tpacket_kbdq_core *,
197 struct tpacket_block_desc *);
17bfd8c8 198static void prb_retire_rx_blk_timer_expired(struct timer_list *);
bc59ba39 199static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
bc59ba39 200static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
201static void prb_clear_rxhash(struct tpacket_kbdq_core *,
202 struct tpacket3_hdr *);
203static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
204 struct tpacket3_hdr *);
1da177e4 205static void packet_flush_mclist(struct sock *sk);
865b03f2 206static u16 packet_pick_tx_queue(struct sk_buff *skb);
1da177e4 207
ffbc6111 208struct packet_skb_cb {
ffbc6111
HX
209 union {
210 struct sockaddr_pkt pkt;
2472d761
EB
211 union {
212 /* Trick: alias skb original length with
213 * ll.sll_family and ll.protocol in order
214 * to save room.
215 */
216 unsigned int origlen;
217 struct sockaddr_ll ll;
218 };
ffbc6111
HX
219 } sa;
220};
221
d3869efe
DW
222#define vio_le() virtio_legacy_is_little_endian()
223
ffbc6111 224#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
8dc41944 225
bc59ba39 226#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
f6fb8f10 227#define GET_PBLOCK_DESC(x, bid) \
bc59ba39 228 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
f6fb8f10 229#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
bc59ba39 230 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
f6fb8f10 231#define GET_NEXT_PRB_BLK_NUM(x) \
232 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
233 ((x)->kactive_blk_num+1) : 0)
234
dc99f600
DM
235static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
236static void __fanout_link(struct sock *sk, struct packet_sock *po);
237
d346a3fa
DB
238static int packet_direct_xmit(struct sk_buff *skb)
239{
865b03f2 240 return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
d346a3fa
DB
241}
242
66e56cd4
DB
243static struct net_device *packet_cached_dev_get(struct packet_sock *po)
244{
245 struct net_device *dev;
246
247 rcu_read_lock();
248 dev = rcu_dereference(po->cached_dev);
249 if (likely(dev))
250 dev_hold(dev);
251 rcu_read_unlock();
252
253 return dev;
254}
255
256static void packet_cached_dev_assign(struct packet_sock *po,
257 struct net_device *dev)
258{
259 rcu_assign_pointer(po->cached_dev, dev);
260}
261
262static void packet_cached_dev_reset(struct packet_sock *po)
263{
264 RCU_INIT_POINTER(po->cached_dev, NULL);
265}
266
d346a3fa
DB
267static bool packet_use_direct_xmit(const struct packet_sock *po)
268{
269 return po->xmit == packet_direct_xmit;
270}
271
865b03f2 272static u16 packet_pick_tx_queue(struct sk_buff *skb)
0fd5d57b 273{
865b03f2 274 struct net_device *dev = skb->dev;
0fd5d57b 275 const struct net_device_ops *ops = dev->netdev_ops;
b71b5837 276 int cpu = raw_smp_processor_id();
0fd5d57b
DB
277 u16 queue_index;
278
b71b5837
PA
279#ifdef CONFIG_XPS
280 skb->sender_cpu = cpu + 1;
281#endif
282 skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
0fd5d57b 283 if (ops->ndo_select_queue) {
a350ecce 284 queue_index = ops->ndo_select_queue(dev, skb, NULL);
0fd5d57b
DB
285 queue_index = netdev_cap_txqueue(dev, queue_index);
286 } else {
b71b5837 287 queue_index = netdev_pick_tx(dev, skb, NULL);
0fd5d57b
DB
288 }
289
865b03f2 290 return queue_index;
0fd5d57b
DB
291}
292
a6361f0c 293/* __register_prot_hook must be invoked through register_prot_hook
ce06b03e
DM
294 * or from a context in which asynchronous accesses to the packet
295 * socket is not possible (packet_create()).
296 */
a6361f0c 297static void __register_prot_hook(struct sock *sk)
ce06b03e
DM
298{
299 struct packet_sock *po = pkt_sk(sk);
e40526cb 300
ce06b03e 301 if (!po->running) {
66e56cd4 302 if (po->fanout)
dc99f600 303 __fanout_link(sk, po);
66e56cd4 304 else
dc99f600 305 dev_add_pack(&po->prot_hook);
e40526cb 306
ce06b03e
DM
307 sock_hold(sk);
308 po->running = 1;
309 }
310}
311
a6361f0c
WB
312static void register_prot_hook(struct sock *sk)
313{
314 lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
315 __register_prot_hook(sk);
316}
317
318/* If the sync parameter is true, we will temporarily drop
ce06b03e
DM
319 * the po->bind_lock and do a synchronize_net to make sure no
320 * asynchronous packet processing paths still refer to the elements
321 * of po->prot_hook. If the sync parameter is false, it is the
322 * callers responsibility to take care of this.
323 */
324static void __unregister_prot_hook(struct sock *sk, bool sync)
325{
326 struct packet_sock *po = pkt_sk(sk);
327
a6361f0c
WB
328 lockdep_assert_held_once(&po->bind_lock);
329
ce06b03e 330 po->running = 0;
66e56cd4
DB
331
332 if (po->fanout)
dc99f600 333 __fanout_unlink(sk, po);
66e56cd4 334 else
dc99f600 335 __dev_remove_pack(&po->prot_hook);
e40526cb 336
ce06b03e
DM
337 __sock_put(sk);
338
339 if (sync) {
340 spin_unlock(&po->bind_lock);
341 synchronize_net();
342 spin_lock(&po->bind_lock);
343 }
344}
345
346static void unregister_prot_hook(struct sock *sk, bool sync)
347{
348 struct packet_sock *po = pkt_sk(sk);
349
350 if (po->running)
351 __unregister_prot_hook(sk, sync);
352}
353
6e58040b 354static inline struct page * __pure pgv_to_page(void *addr)
0af55bb5
CG
355{
356 if (is_vmalloc_addr(addr))
357 return vmalloc_to_page(addr);
358 return virt_to_page(addr);
359}
360
69e3c75f 361static void __packet_set_status(struct packet_sock *po, void *frame, int status)
1da177e4 362{
184f489e 363 union tpacket_uhdr h;
1da177e4 364
69e3c75f 365 h.raw = frame;
bbd6ef87
PM
366 switch (po->tp_version) {
367 case TPACKET_V1:
69e3c75f 368 h.h1->tp_status = status;
0af55bb5 369 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
bbd6ef87
PM
370 break;
371 case TPACKET_V2:
69e3c75f 372 h.h2->tp_status = status;
0af55bb5 373 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
bbd6ef87 374 break;
f6fb8f10 375 case TPACKET_V3:
7f953ab2
SV
376 h.h3->tp_status = status;
377 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
378 break;
69e3c75f 379 default:
f6fb8f10 380 WARN(1, "TPACKET version not supported.\n");
69e3c75f 381 BUG();
bbd6ef87 382 }
69e3c75f
JB
383
384 smp_wmb();
bbd6ef87
PM
385}
386
69e3c75f 387static int __packet_get_status(struct packet_sock *po, void *frame)
bbd6ef87 388{
184f489e 389 union tpacket_uhdr h;
bbd6ef87 390
69e3c75f
JB
391 smp_rmb();
392
bbd6ef87
PM
393 h.raw = frame;
394 switch (po->tp_version) {
395 case TPACKET_V1:
0af55bb5 396 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
69e3c75f 397 return h.h1->tp_status;
bbd6ef87 398 case TPACKET_V2:
0af55bb5 399 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
69e3c75f 400 return h.h2->tp_status;
f6fb8f10 401 case TPACKET_V3:
7f953ab2
SV
402 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
403 return h.h3->tp_status;
69e3c75f 404 default:
f6fb8f10 405 WARN(1, "TPACKET version not supported.\n");
69e3c75f
JB
406 BUG();
407 return 0;
bbd6ef87 408 }
1da177e4 409}
69e3c75f 410
b9c32fb2
DB
411static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
412 unsigned int flags)
7a51384c
DB
413{
414 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
415
68a360e8
WB
416 if (shhwtstamps &&
417 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
418 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
419 return TP_STATUS_TS_RAW_HARDWARE;
7a51384c
DB
420
421 if (ktime_to_timespec_cond(skb->tstamp, ts))
b9c32fb2 422 return TP_STATUS_TS_SOFTWARE;
7a51384c 423
b9c32fb2 424 return 0;
7a51384c
DB
425}
426
b9c32fb2
DB
427static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
428 struct sk_buff *skb)
2e31396f
WB
429{
430 union tpacket_uhdr h;
431 struct timespec ts;
b9c32fb2 432 __u32 ts_status;
2e31396f 433
b9c32fb2
DB
434 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
435 return 0;
2e31396f
WB
436
437 h.raw = frame;
438 switch (po->tp_version) {
439 case TPACKET_V1:
440 h.h1->tp_sec = ts.tv_sec;
441 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
442 break;
443 case TPACKET_V2:
444 h.h2->tp_sec = ts.tv_sec;
445 h.h2->tp_nsec = ts.tv_nsec;
446 break;
447 case TPACKET_V3:
57ea884b
DB
448 h.h3->tp_sec = ts.tv_sec;
449 h.h3->tp_nsec = ts.tv_nsec;
450 break;
2e31396f
WB
451 default:
452 WARN(1, "TPACKET version not supported.\n");
453 BUG();
454 }
455
456 /* one flush is safe, as both fields always lie on the same cacheline */
457 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
458 smp_wmb();
b9c32fb2
DB
459
460 return ts_status;
2e31396f
WB
461}
462
69e3c75f
JB
463static void *packet_lookup_frame(struct packet_sock *po,
464 struct packet_ring_buffer *rb,
465 unsigned int position,
466 int status)
467{
468 unsigned int pg_vec_pos, frame_offset;
184f489e 469 union tpacket_uhdr h;
69e3c75f
JB
470
471 pg_vec_pos = position / rb->frames_per_block;
472 frame_offset = position % rb->frames_per_block;
473
0e3125c7
NH
474 h.raw = rb->pg_vec[pg_vec_pos].buffer +
475 (frame_offset * rb->frame_size);
69e3c75f
JB
476
477 if (status != __packet_get_status(po, h.raw))
478 return NULL;
479
480 return h.raw;
481}
482
eea49cc9 483static void *packet_current_frame(struct packet_sock *po,
69e3c75f
JB
484 struct packet_ring_buffer *rb,
485 int status)
486{
487 return packet_lookup_frame(po, rb, rb->head, status);
488}
489
bc59ba39 490static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 491{
492 del_timer_sync(&pkc->retire_blk_timer);
493}
494
495static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
f6fb8f10 496 struct sk_buff_head *rb_queue)
497{
bc59ba39 498 struct tpacket_kbdq_core *pkc;
f6fb8f10 499
73d0fcf2 500 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 501
ec6f809f 502 spin_lock_bh(&rb_queue->lock);
f6fb8f10 503 pkc->delete_blk_timer = 1;
ec6f809f 504 spin_unlock_bh(&rb_queue->lock);
f6fb8f10 505
506 prb_del_retire_blk_timer(pkc);
507}
508
e8e85cc5 509static void prb_setup_retire_blk_timer(struct packet_sock *po)
f6fb8f10 510{
bc59ba39 511 struct tpacket_kbdq_core *pkc;
f6fb8f10 512
e8e85cc5 513 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
17bfd8c8
KC
514 timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
515 0);
516 pkc->retire_blk_timer.expires = jiffies;
f6fb8f10 517}
518
519static int prb_calc_retire_blk_tmo(struct packet_sock *po,
520 int blk_size_in_bytes)
521{
522 struct net_device *dev;
523 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
7cad1bac 524 struct ethtool_link_ksettings ecmd;
4bc71cb9 525 int err;
f6fb8f10 526
4bc71cb9
JP
527 rtnl_lock();
528 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
529 if (unlikely(!dev)) {
530 rtnl_unlock();
f6fb8f10 531 return DEFAULT_PRB_RETIRE_TOV;
4bc71cb9 532 }
7cad1bac 533 err = __ethtool_get_link_ksettings(dev, &ecmd);
4bc71cb9
JP
534 rtnl_unlock();
535 if (!err) {
4bc71cb9
JP
536 /*
537 * If the link speed is so slow you don't really
538 * need to worry about perf anyways
539 */
7cad1bac
DD
540 if (ecmd.base.speed < SPEED_1000 ||
541 ecmd.base.speed == SPEED_UNKNOWN) {
4bc71cb9 542 return DEFAULT_PRB_RETIRE_TOV;
e440cf2c 543 } else {
544 msec = 1;
7cad1bac 545 div = ecmd.base.speed / 1000;
f6fb8f10 546 }
547 }
548
549 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
550
551 if (div)
552 mbits /= div;
553
554 tmo = mbits * msec;
555
556 if (div)
557 return tmo+1;
558 return tmo;
559}
560
bc59ba39 561static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
f6fb8f10 562 union tpacket_req_u *req_u)
563{
564 p1->feature_req_word = req_u->req3.tp_feature_req_word;
565}
566
567static void init_prb_bdqc(struct packet_sock *po,
568 struct packet_ring_buffer *rb,
569 struct pgv *pg_vec,
e8e85cc5 570 union tpacket_req_u *req_u)
f6fb8f10 571{
22781a5b 572 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
bc59ba39 573 struct tpacket_block_desc *pbd;
f6fb8f10 574
575 memset(p1, 0x0, sizeof(*p1));
576
577 p1->knxt_seq_num = 1;
578 p1->pkbdq = pg_vec;
bc59ba39 579 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
e3192690 580 p1->pkblk_start = pg_vec[0].buffer;
f6fb8f10 581 p1->kblk_size = req_u->req3.tp_block_size;
582 p1->knum_blocks = req_u->req3.tp_block_nr;
583 p1->hdrlen = po->tp_hdrlen;
584 p1->version = po->tp_version;
585 p1->last_kactive_blk_num = 0;
ee80fbf3 586 po->stats.stats3.tp_freeze_q_cnt = 0;
f6fb8f10 587 if (req_u->req3.tp_retire_blk_tov)
588 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
589 else
590 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
591 req_u->req3.tp_block_size);
592 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
593 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
594
dc808110 595 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
f6fb8f10 596 prb_init_ft_ops(p1, req_u);
e8e85cc5 597 prb_setup_retire_blk_timer(po);
f6fb8f10 598 prb_open_block(p1, pbd);
599}
600
601/* Do NOT update the last_blk_num first.
602 * Assumes sk_buff_head lock is held.
603 */
bc59ba39 604static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
f6fb8f10 605{
606 mod_timer(&pkc->retire_blk_timer,
607 jiffies + pkc->tov_in_jiffies);
608 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
609}
610
611/*
612 * Timer logic:
613 * 1) We refresh the timer only when we open a block.
614 * By doing this we don't waste cycles refreshing the timer
615 * on packet-by-packet basis.
616 *
617 * With a 1MB block-size, on a 1Gbps line, it will take
618 * i) ~8 ms to fill a block + ii) memcpy etc.
619 * In this cut we are not accounting for the memcpy time.
620 *
621 * So, if the user sets the 'tmo' to 10ms then the timer
622 * will never fire while the block is still getting filled
623 * (which is what we want). However, the user could choose
624 * to close a block early and that's fine.
625 *
626 * But when the timer does fire, we check whether or not to refresh it.
627 * Since the tmo granularity is in msecs, it is not too expensive
628 * to refresh the timer, lets say every '8' msecs.
629 * Either the user can set the 'tmo' or we can derive it based on
630 * a) line-speed and b) block-size.
631 * prb_calc_retire_blk_tmo() calculates the tmo.
632 *
633 */
17bfd8c8 634static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
f6fb8f10 635{
17bfd8c8
KC
636 struct packet_sock *po =
637 from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
22781a5b 638 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 639 unsigned int frozen;
bc59ba39 640 struct tpacket_block_desc *pbd;
f6fb8f10 641
642 spin_lock(&po->sk.sk_receive_queue.lock);
643
644 frozen = prb_queue_frozen(pkc);
645 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
646
647 if (unlikely(pkc->delete_blk_timer))
648 goto out;
649
650 /* We only need to plug the race when the block is partially filled.
651 * tpacket_rcv:
652 * lock(); increment BLOCK_NUM_PKTS; unlock()
653 * copy_bits() is in progress ...
654 * timer fires on other cpu:
655 * we can't retire the current block because copy_bits
656 * is in progress.
657 *
658 */
659 if (BLOCK_NUM_PKTS(pbd)) {
660 while (atomic_read(&pkc->blk_fill_in_prog)) {
661 /* Waiting for skb_copy_bits to finish... */
662 cpu_relax();
663 }
664 }
665
666 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
667 if (!frozen) {
41a50d62
AD
668 if (!BLOCK_NUM_PKTS(pbd)) {
669 /* An empty block. Just refresh the timer. */
670 goto refresh_timer;
671 }
f6fb8f10 672 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
673 if (!prb_dispatch_next_block(pkc, po))
674 goto refresh_timer;
675 else
676 goto out;
677 } else {
678 /* Case 1. Queue was frozen because user-space was
679 * lagging behind.
680 */
878cd3ba 681 if (prb_curr_blk_in_use(pbd)) {
f6fb8f10 682 /*
683 * Ok, user-space is still behind.
684 * So just refresh the timer.
685 */
686 goto refresh_timer;
687 } else {
688 /* Case 2. queue was frozen,user-space caught up,
689 * now the link went idle && the timer fired.
690 * We don't have a block to close.So we open this
691 * block and restart the timer.
692 * opening a block thaws the queue,restarts timer
693 * Thawing/timer-refresh is a side effect.
694 */
695 prb_open_block(pkc, pbd);
696 goto out;
697 }
698 }
699 }
700
701refresh_timer:
702 _prb_refresh_rx_retire_blk_timer(pkc);
703
704out:
705 spin_unlock(&po->sk.sk_receive_queue.lock);
706}
707
eea49cc9 708static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
bc59ba39 709 struct tpacket_block_desc *pbd1, __u32 status)
f6fb8f10 710{
711 /* Flush everything minus the block header */
712
713#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
714 u8 *start, *end;
715
716 start = (u8 *)pbd1;
717
718 /* Skip the block header(we know header WILL fit in 4K) */
719 start += PAGE_SIZE;
720
721 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
722 for (; start < end; start += PAGE_SIZE)
723 flush_dcache_page(pgv_to_page(start));
724
725 smp_wmb();
726#endif
727
728 /* Now update the block status. */
729
730 BLOCK_STATUS(pbd1) = status;
731
732 /* Flush the block header */
733
734#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
735 start = (u8 *)pbd1;
736 flush_dcache_page(pgv_to_page(start));
737
738 smp_wmb();
739#endif
740}
741
742/*
743 * Side effect:
744 *
745 * 1) flush the block
746 * 2) Increment active_blk_num
747 *
748 * Note:We DONT refresh the timer on purpose.
749 * Because almost always the next block will be opened.
750 */
bc59ba39 751static void prb_close_block(struct tpacket_kbdq_core *pkc1,
752 struct tpacket_block_desc *pbd1,
f6fb8f10 753 struct packet_sock *po, unsigned int stat)
754{
755 __u32 status = TP_STATUS_USER | stat;
756
757 struct tpacket3_hdr *last_pkt;
bc59ba39 758 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
da413eec 759 struct sock *sk = &po->sk;
f6fb8f10 760
ee80fbf3 761 if (po->stats.stats3.tp_drops)
f6fb8f10 762 status |= TP_STATUS_LOSING;
763
764 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
765 last_pkt->tp_next_offset = 0;
766
767 /* Get the ts of the last pkt */
768 if (BLOCK_NUM_PKTS(pbd1)) {
769 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
770 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
771 } else {
41a50d62
AD
772 /* Ok, we tmo'd - so get the current time.
773 *
774 * It shouldn't really happen as we don't close empty
775 * blocks. See prb_retire_rx_blk_timer_expired().
776 */
f6fb8f10 777 struct timespec ts;
778 getnstimeofday(&ts);
779 h1->ts_last_pkt.ts_sec = ts.tv_sec;
780 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
781 }
782
783 smp_wmb();
784
785 /* Flush the block */
786 prb_flush_block(pkc1, pbd1, status);
787
da413eec
DC
788 sk->sk_data_ready(sk);
789
f6fb8f10 790 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
791}
792
eea49cc9 793static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
f6fb8f10 794{
795 pkc->reset_pending_on_curr_blk = 0;
796}
797
798/*
799 * Side effect of opening a block:
800 *
801 * 1) prb_queue is thawed.
802 * 2) retire_blk_timer is refreshed.
803 *
804 */
bc59ba39 805static void prb_open_block(struct tpacket_kbdq_core *pkc1,
806 struct tpacket_block_desc *pbd1)
f6fb8f10 807{
808 struct timespec ts;
bc59ba39 809 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
f6fb8f10 810
811 smp_rmb();
812
8da3056c
DB
813 /* We could have just memset this but we will lose the
814 * flexibility of making the priv area sticky
815 */
f6fb8f10 816
8da3056c
DB
817 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
818 BLOCK_NUM_PKTS(pbd1) = 0;
819 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
f6fb8f10 820
8da3056c
DB
821 getnstimeofday(&ts);
822
823 h1->ts_first_pkt.ts_sec = ts.tv_sec;
824 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
f6fb8f10 825
8da3056c
DB
826 pkc1->pkblk_start = (char *)pbd1;
827 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
828
829 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
830 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
831
832 pbd1->version = pkc1->version;
833 pkc1->prev = pkc1->nxt_offset;
834 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
835
836 prb_thaw_queue(pkc1);
837 _prb_refresh_rx_retire_blk_timer(pkc1);
838
839 smp_wmb();
f6fb8f10 840}
841
842/*
843 * Queue freeze logic:
844 * 1) Assume tp_block_nr = 8 blocks.
845 * 2) At time 't0', user opens Rx ring.
846 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
847 * 4) user-space is either sleeping or processing block '0'.
848 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
849 * it will close block-7,loop around and try to fill block '0'.
850 * call-flow:
851 * __packet_lookup_frame_in_block
852 * prb_retire_current_block()
853 * prb_dispatch_next_block()
854 * |->(BLOCK_STATUS == USER) evaluates to true
855 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
856 * 6) Now there are two cases:
857 * 6.1) Link goes idle right after the queue is frozen.
858 * But remember, the last open_block() refreshed the timer.
859 * When this timer expires,it will refresh itself so that we can
860 * re-open block-0 in near future.
861 * 6.2) Link is busy and keeps on receiving packets. This is a simple
862 * case and __packet_lookup_frame_in_block will check if block-0
863 * is free and can now be re-used.
864 */
eea49cc9 865static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
f6fb8f10 866 struct packet_sock *po)
867{
868 pkc->reset_pending_on_curr_blk = 1;
ee80fbf3 869 po->stats.stats3.tp_freeze_q_cnt++;
f6fb8f10 870}
871
872#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
873
874/*
875 * If the next block is free then we will dispatch it
876 * and return a good offset.
877 * Else, we will freeze the queue.
878 * So, caller must check the return value.
879 */
bc59ba39 880static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 881 struct packet_sock *po)
882{
bc59ba39 883 struct tpacket_block_desc *pbd;
f6fb8f10 884
885 smp_rmb();
886
887 /* 1. Get current block num */
888 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
889
890 /* 2. If this block is currently in_use then freeze the queue */
891 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
892 prb_freeze_queue(pkc, po);
893 return NULL;
894 }
895
896 /*
897 * 3.
898 * open this block and return the offset where the first packet
899 * needs to get stored.
900 */
901 prb_open_block(pkc, pbd);
902 return (void *)pkc->nxt_offset;
903}
904
bc59ba39 905static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
f6fb8f10 906 struct packet_sock *po, unsigned int status)
907{
bc59ba39 908 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
f6fb8f10 909
910 /* retire/close the current block */
911 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
912 /*
913 * Plug the case where copy_bits() is in progress on
914 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
915 * have space to copy the pkt in the current block and
916 * called prb_retire_current_block()
917 *
918 * We don't need to worry about the TMO case because
919 * the timer-handler already handled this case.
920 */
921 if (!(status & TP_STATUS_BLK_TMO)) {
922 while (atomic_read(&pkc->blk_fill_in_prog)) {
923 /* Waiting for skb_copy_bits to finish... */
924 cpu_relax();
925 }
926 }
927 prb_close_block(pkc, pbd, po, status);
928 return;
929 }
f6fb8f10 930}
931
878cd3ba 932static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
f6fb8f10 933{
934 return TP_STATUS_USER & BLOCK_STATUS(pbd);
935}
936
eea49cc9 937static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
f6fb8f10 938{
939 return pkc->reset_pending_on_curr_blk;
940}
941
eea49cc9 942static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
f6fb8f10 943{
bc59ba39 944 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
f6fb8f10 945 atomic_dec(&pkc->blk_fill_in_prog);
946}
947
eea49cc9 948static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 949 struct tpacket3_hdr *ppd)
950{
3958afa1 951 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
f6fb8f10 952}
953
eea49cc9 954static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
f6fb8f10 955 struct tpacket3_hdr *ppd)
956{
957 ppd->hv1.tp_rxhash = 0;
958}
959
eea49cc9 960static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
f6fb8f10 961 struct tpacket3_hdr *ppd)
962{
df8a39de
JP
963 if (skb_vlan_tag_present(pkc->skb)) {
964 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
a0cdfcf3
AW
965 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
966 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
f6fb8f10 967 } else {
9e67030a 968 ppd->hv1.tp_vlan_tci = 0;
a0cdfcf3 969 ppd->hv1.tp_vlan_tpid = 0;
9e67030a 970 ppd->tp_status = TP_STATUS_AVAILABLE;
f6fb8f10 971 }
972}
973
bc59ba39 974static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
f6fb8f10 975 struct tpacket3_hdr *ppd)
976{
a0cdfcf3 977 ppd->hv1.tp_padding = 0;
f6fb8f10 978 prb_fill_vlan_info(pkc, ppd);
979
980 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
981 prb_fill_rxhash(pkc, ppd);
982 else
983 prb_clear_rxhash(pkc, ppd);
984}
985
eea49cc9 986static void prb_fill_curr_block(char *curr,
bc59ba39 987 struct tpacket_kbdq_core *pkc,
988 struct tpacket_block_desc *pbd,
f6fb8f10 989 unsigned int len)
990{
991 struct tpacket3_hdr *ppd;
992
993 ppd = (struct tpacket3_hdr *)curr;
994 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
995 pkc->prev = curr;
996 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
997 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
998 BLOCK_NUM_PKTS(pbd) += 1;
999 atomic_inc(&pkc->blk_fill_in_prog);
1000 prb_run_all_ft_ops(pkc, ppd);
1001}
1002
1003/* Assumes caller has the sk->rx_queue.lock */
1004static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1005 struct sk_buff *skb,
1006 int status,
1007 unsigned int len
1008 )
1009{
bc59ba39 1010 struct tpacket_kbdq_core *pkc;
1011 struct tpacket_block_desc *pbd;
f6fb8f10 1012 char *curr, *end;
1013
e3192690 1014 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
f6fb8f10 1015 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1016
1017 /* Queue is frozen when user space is lagging behind */
1018 if (prb_queue_frozen(pkc)) {
1019 /*
1020 * Check if that last block which caused the queue to freeze,
1021 * is still in_use by user-space.
1022 */
878cd3ba 1023 if (prb_curr_blk_in_use(pbd)) {
f6fb8f10 1024 /* Can't record this packet */
1025 return NULL;
1026 } else {
1027 /*
1028 * Ok, the block was released by user-space.
1029 * Now let's open that block.
1030 * opening a block also thaws the queue.
1031 * Thawing is a side effect.
1032 */
1033 prb_open_block(pkc, pbd);
1034 }
1035 }
1036
1037 smp_mb();
1038 curr = pkc->nxt_offset;
1039 pkc->skb = skb;
e3192690 1040 end = (char *)pbd + pkc->kblk_size;
f6fb8f10 1041
1042 /* first try the current block */
1043 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1044 prb_fill_curr_block(curr, pkc, pbd, len);
1045 return (void *)curr;
1046 }
1047
1048 /* Ok, close the current block */
1049 prb_retire_current_block(pkc, po, 0);
1050
1051 /* Now, try to dispatch the next block */
1052 curr = (char *)prb_dispatch_next_block(pkc, po);
1053 if (curr) {
1054 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1055 prb_fill_curr_block(curr, pkc, pbd, len);
1056 return (void *)curr;
1057 }
1058
1059 /*
1060 * No free blocks are available.user_space hasn't caught up yet.
1061 * Queue was just frozen and now this packet will get dropped.
1062 */
1063 return NULL;
1064}
1065
eea49cc9 1066static void *packet_current_rx_frame(struct packet_sock *po,
f6fb8f10 1067 struct sk_buff *skb,
1068 int status, unsigned int len)
1069{
1070 char *curr = NULL;
1071 switch (po->tp_version) {
1072 case TPACKET_V1:
1073 case TPACKET_V2:
1074 curr = packet_lookup_frame(po, &po->rx_ring,
1075 po->rx_ring.head, status);
1076 return curr;
1077 case TPACKET_V3:
1078 return __packet_lookup_frame_in_block(po, skb, status, len);
1079 default:
1080 WARN(1, "TPACKET version not supported\n");
1081 BUG();
99aa3473 1082 return NULL;
f6fb8f10 1083 }
1084}
1085
eea49cc9 1086static void *prb_lookup_block(struct packet_sock *po,
f6fb8f10 1087 struct packet_ring_buffer *rb,
77f65ebd 1088 unsigned int idx,
f6fb8f10 1089 int status)
1090{
bc59ba39 1091 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
77f65ebd 1092 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
f6fb8f10 1093
1094 if (status != BLOCK_STATUS(pbd))
1095 return NULL;
1096 return pbd;
1097}
1098
eea49cc9 1099static int prb_previous_blk_num(struct packet_ring_buffer *rb)
f6fb8f10 1100{
1101 unsigned int prev;
1102 if (rb->prb_bdqc.kactive_blk_num)
1103 prev = rb->prb_bdqc.kactive_blk_num-1;
1104 else
1105 prev = rb->prb_bdqc.knum_blocks-1;
1106 return prev;
1107}
1108
1109/* Assumes caller has held the rx_queue.lock */
eea49cc9 1110static void *__prb_previous_block(struct packet_sock *po,
f6fb8f10 1111 struct packet_ring_buffer *rb,
1112 int status)
1113{
1114 unsigned int previous = prb_previous_blk_num(rb);
1115 return prb_lookup_block(po, rb, previous, status);
1116}
1117
eea49cc9 1118static void *packet_previous_rx_frame(struct packet_sock *po,
f6fb8f10 1119 struct packet_ring_buffer *rb,
1120 int status)
1121{
1122 if (po->tp_version <= TPACKET_V2)
1123 return packet_previous_frame(po, rb, status);
1124
1125 return __prb_previous_block(po, rb, status);
1126}
1127
eea49cc9 1128static void packet_increment_rx_head(struct packet_sock *po,
f6fb8f10 1129 struct packet_ring_buffer *rb)
1130{
1131 switch (po->tp_version) {
1132 case TPACKET_V1:
1133 case TPACKET_V2:
1134 return packet_increment_head(rb);
1135 case TPACKET_V3:
1136 default:
1137 WARN(1, "TPACKET version not supported.\n");
1138 BUG();
1139 return;
1140 }
1141}
1142
eea49cc9 1143static void *packet_previous_frame(struct packet_sock *po,
69e3c75f
JB
1144 struct packet_ring_buffer *rb,
1145 int status)
1146{
1147 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1148 return packet_lookup_frame(po, rb, previous, status);
1149}
1150
eea49cc9 1151static void packet_increment_head(struct packet_ring_buffer *buff)
69e3c75f
JB
1152{
1153 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1154}
1155
b0138408
DB
1156static void packet_inc_pending(struct packet_ring_buffer *rb)
1157{
1158 this_cpu_inc(*rb->pending_refcnt);
1159}
1160
1161static void packet_dec_pending(struct packet_ring_buffer *rb)
1162{
1163 this_cpu_dec(*rb->pending_refcnt);
1164}
1165
1166static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1167{
1168 unsigned int refcnt = 0;
1169 int cpu;
1170
1171 /* We don't use pending refcount in rx_ring. */
1172 if (rb->pending_refcnt == NULL)
1173 return 0;
1174
1175 for_each_possible_cpu(cpu)
1176 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1177
1178 return refcnt;
1179}
1180
1181static int packet_alloc_pending(struct packet_sock *po)
1182{
1183 po->rx_ring.pending_refcnt = NULL;
1184
1185 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1186 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1187 return -ENOBUFS;
1188
1189 return 0;
1190}
1191
1192static void packet_free_pending(struct packet_sock *po)
1193{
1194 free_percpu(po->tx_ring.pending_refcnt);
1195}
1196
9954729b
WB
1197#define ROOM_POW_OFF 2
1198#define ROOM_NONE 0x0
1199#define ROOM_LOW 0x1
1200#define ROOM_NORMAL 0x2
1201
1202static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
77f65ebd 1203{
9954729b
WB
1204 int idx, len;
1205
1206 len = po->rx_ring.frame_max + 1;
1207 idx = po->rx_ring.head;
1208 if (pow_off)
1209 idx += len >> pow_off;
1210 if (idx >= len)
1211 idx -= len;
1212 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1213}
1214
1215static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
1216{
1217 int idx, len;
1218
1219 len = po->rx_ring.prb_bdqc.knum_blocks;
1220 idx = po->rx_ring.prb_bdqc.kactive_blk_num;
1221 if (pow_off)
1222 idx += len >> pow_off;
1223 if (idx >= len)
1224 idx -= len;
1225 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1226}
77f65ebd 1227
2ccdbaa6 1228static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
9954729b
WB
1229{
1230 struct sock *sk = &po->sk;
1231 int ret = ROOM_NONE;
1232
1233 if (po->prot_hook.func != tpacket_rcv) {
1234 int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
2ccdbaa6 1235 - (skb ? skb->truesize : 0);
9954729b
WB
1236 if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
1237 return ROOM_NORMAL;
1238 else if (avail > 0)
1239 return ROOM_LOW;
1240 else
1241 return ROOM_NONE;
1242 }
77f65ebd 1243
9954729b
WB
1244 if (po->tp_version == TPACKET_V3) {
1245 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1246 ret = ROOM_NORMAL;
1247 else if (__tpacket_v3_has_room(po, 0))
1248 ret = ROOM_LOW;
1249 } else {
1250 if (__tpacket_has_room(po, ROOM_POW_OFF))
1251 ret = ROOM_NORMAL;
1252 else if (__tpacket_has_room(po, 0))
1253 ret = ROOM_LOW;
1254 }
2ccdbaa6
WB
1255
1256 return ret;
1257}
1258
1259static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1260{
1261 int ret;
1262 bool has_room;
1263
54d7c01d
WB
1264 spin_lock_bh(&po->sk.sk_receive_queue.lock);
1265 ret = __packet_rcv_has_room(po, skb);
2ccdbaa6
WB
1266 has_room = ret == ROOM_NORMAL;
1267 if (po->pressure == has_room)
54d7c01d
WB
1268 po->pressure = !has_room;
1269 spin_unlock_bh(&po->sk.sk_receive_queue.lock);
77f65ebd 1270
9954729b 1271 return ret;
77f65ebd
WB
1272}
1273
1da177e4
LT
1274static void packet_sock_destruct(struct sock *sk)
1275{
ed85b565
RC
1276 skb_queue_purge(&sk->sk_error_queue);
1277
547b792c 1278 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
14afee4b 1279 WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1da177e4
LT
1280
1281 if (!sock_flag(sk, SOCK_DEAD)) {
40d4e3df 1282 pr_err("Attempt to release alive packet socket: %p\n", sk);
1da177e4
LT
1283 return;
1284 }
1285
17ab56a2 1286 sk_refcnt_debug_dec(sk);
1da177e4
LT
1287}
1288
3b3a5b0a
WB
1289static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1290{
1291 u32 rxhash;
1292 int i, count = 0;
1293
1294 rxhash = skb_get_hash(skb);
1295 for (i = 0; i < ROLLOVER_HLEN; i++)
1296 if (po->rollover->history[i] == rxhash)
1297 count++;
1298
1299 po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
1300 return count > (ROLLOVER_HLEN >> 1);
1301}
1302
77f65ebd
WB
1303static unsigned int fanout_demux_hash(struct packet_fanout *f,
1304 struct sk_buff *skb,
1305 unsigned int num)
dc99f600 1306{
eb70db87 1307 return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
dc99f600
DM
1308}
1309
77f65ebd
WB
1310static unsigned int fanout_demux_lb(struct packet_fanout *f,
1311 struct sk_buff *skb,
1312 unsigned int num)
dc99f600 1313{
468479e6 1314 unsigned int val = atomic_inc_return(&f->rr_cur);
dc99f600 1315
468479e6 1316 return val % num;
77f65ebd
WB
1317}
1318
1319static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1320 struct sk_buff *skb,
1321 unsigned int num)
1322{
1323 return smp_processor_id() % num;
dc99f600
DM
1324}
1325
5df0ddfb
DB
1326static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1327 struct sk_buff *skb,
1328 unsigned int num)
1329{
f337db64 1330 return prandom_u32_max(num);
5df0ddfb
DB
1331}
1332
77f65ebd
WB
1333static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1334 struct sk_buff *skb,
ad377cab 1335 unsigned int idx, bool try_self,
77f65ebd 1336 unsigned int num)
95ec3eb4 1337{
4633c9e0 1338 struct packet_sock *po, *po_next, *po_skip = NULL;
a9b63918 1339 unsigned int i, j, room = ROOM_NONE;
95ec3eb4 1340
0648ab70 1341 po = pkt_sk(f->arr[idx]);
3b3a5b0a
WB
1342
1343 if (try_self) {
1344 room = packet_rcv_has_room(po, skb);
1345 if (room == ROOM_NORMAL ||
1346 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1347 return idx;
4633c9e0 1348 po_skip = po;
3b3a5b0a 1349 }
ad377cab 1350
0648ab70 1351 i = j = min_t(int, po->rollover->sock, num - 1);
77f65ebd 1352 do {
2ccdbaa6 1353 po_next = pkt_sk(f->arr[i]);
4633c9e0 1354 if (po_next != po_skip && !po_next->pressure &&
2ccdbaa6 1355 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
77f65ebd 1356 if (i != j)
0648ab70 1357 po->rollover->sock = i;
a9b63918
WB
1358 atomic_long_inc(&po->rollover->num);
1359 if (room == ROOM_LOW)
1360 atomic_long_inc(&po->rollover->num_huge);
77f65ebd
WB
1361 return i;
1362 }
ad377cab 1363
77f65ebd
WB
1364 if (++i == num)
1365 i = 0;
1366 } while (i != j);
1367
a9b63918 1368 atomic_long_inc(&po->rollover->num_failed);
77f65ebd
WB
1369 return idx;
1370}
1371
2d36097d
NH
1372static unsigned int fanout_demux_qm(struct packet_fanout *f,
1373 struct sk_buff *skb,
1374 unsigned int num)
1375{
1376 return skb_get_queue_mapping(skb) % num;
1377}
1378
47dceb8e
WB
1379static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1380 struct sk_buff *skb,
1381 unsigned int num)
1382{
1383 struct bpf_prog *prog;
1384 unsigned int ret = 0;
1385
1386 rcu_read_lock();
1387 prog = rcu_dereference(f->bpf_prog);
1388 if (prog)
ff936a04 1389 ret = bpf_prog_run_clear_cb(prog, skb) % num;
47dceb8e
WB
1390 rcu_read_unlock();
1391
1392 return ret;
1393}
1394
77f65ebd
WB
1395static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1396{
1397 return f->flags & (flag >> 8);
95ec3eb4
DM
1398}
1399
95ec3eb4
DM
1400static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1401 struct packet_type *pt, struct net_device *orig_dev)
dc99f600
DM
1402{
1403 struct packet_fanout *f = pt->af_packet_priv;
f98f4514 1404 unsigned int num = READ_ONCE(f->num_members);
19bcf9f2 1405 struct net *net = read_pnet(&f->net);
dc99f600 1406 struct packet_sock *po;
77f65ebd 1407 unsigned int idx;
dc99f600 1408
19bcf9f2 1409 if (!net_eq(dev_net(dev), net) || !num) {
dc99f600
DM
1410 kfree_skb(skb);
1411 return 0;
1412 }
1413
3f34b24a 1414 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
19bcf9f2 1415 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
3f34b24a
AD
1416 if (!skb)
1417 return 0;
1418 }
95ec3eb4
DM
1419 switch (f->type) {
1420 case PACKET_FANOUT_HASH:
1421 default:
77f65ebd 1422 idx = fanout_demux_hash(f, skb, num);
95ec3eb4
DM
1423 break;
1424 case PACKET_FANOUT_LB:
77f65ebd 1425 idx = fanout_demux_lb(f, skb, num);
95ec3eb4
DM
1426 break;
1427 case PACKET_FANOUT_CPU:
77f65ebd
WB
1428 idx = fanout_demux_cpu(f, skb, num);
1429 break;
5df0ddfb
DB
1430 case PACKET_FANOUT_RND:
1431 idx = fanout_demux_rnd(f, skb, num);
1432 break;
2d36097d
NH
1433 case PACKET_FANOUT_QM:
1434 idx = fanout_demux_qm(f, skb, num);
1435 break;
77f65ebd 1436 case PACKET_FANOUT_ROLLOVER:
ad377cab 1437 idx = fanout_demux_rollover(f, skb, 0, false, num);
95ec3eb4 1438 break;
47dceb8e 1439 case PACKET_FANOUT_CBPF:
f2e52095 1440 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1441 idx = fanout_demux_bpf(f, skb, num);
1442 break;
dc99f600
DM
1443 }
1444
ad377cab
WB
1445 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1446 idx = fanout_demux_rollover(f, skb, idx, true, num);
dc99f600 1447
ad377cab 1448 po = pkt_sk(f->arr[idx]);
dc99f600
DM
1449 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1450}
1451
fff3321d
PE
1452DEFINE_MUTEX(fanout_mutex);
1453EXPORT_SYMBOL_GPL(fanout_mutex);
dc99f600 1454static LIST_HEAD(fanout_list);
4a69a864 1455static u16 fanout_next_id;
dc99f600
DM
1456
1457static void __fanout_link(struct sock *sk, struct packet_sock *po)
1458{
1459 struct packet_fanout *f = po->fanout;
1460
1461 spin_lock(&f->lock);
1462 f->arr[f->num_members] = sk;
1463 smp_wmb();
1464 f->num_members++;
2bd624b4
AS
1465 if (f->num_members == 1)
1466 dev_add_pack(&f->prot_hook);
dc99f600
DM
1467 spin_unlock(&f->lock);
1468}
1469
1470static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1471{
1472 struct packet_fanout *f = po->fanout;
1473 int i;
1474
1475 spin_lock(&f->lock);
1476 for (i = 0; i < f->num_members; i++) {
1477 if (f->arr[i] == sk)
1478 break;
1479 }
1480 BUG_ON(i >= f->num_members);
1481 f->arr[i] = f->arr[f->num_members - 1];
1482 f->num_members--;
2bd624b4
AS
1483 if (f->num_members == 0)
1484 __dev_remove_pack(&f->prot_hook);
dc99f600
DM
1485 spin_unlock(&f->lock);
1486}
1487
d4dd8aee 1488static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
c0de08d0 1489{
161642e2
ED
1490 if (sk->sk_family != PF_PACKET)
1491 return false;
c0de08d0 1492
161642e2 1493 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
c0de08d0
EL
1494}
1495
47dceb8e
WB
1496static void fanout_init_data(struct packet_fanout *f)
1497{
1498 switch (f->type) {
1499 case PACKET_FANOUT_LB:
1500 atomic_set(&f->rr_cur, 0);
1501 break;
1502 case PACKET_FANOUT_CBPF:
f2e52095 1503 case PACKET_FANOUT_EBPF:
47dceb8e
WB
1504 RCU_INIT_POINTER(f->bpf_prog, NULL);
1505 break;
1506 }
1507}
1508
1509static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1510{
1511 struct bpf_prog *old;
1512
1513 spin_lock(&f->lock);
1514 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1515 rcu_assign_pointer(f->bpf_prog, new);
1516 spin_unlock(&f->lock);
1517
1518 if (old) {
1519 synchronize_net();
1520 bpf_prog_destroy(old);
1521 }
1522}
1523
1524static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1525 unsigned int len)
1526{
1527 struct bpf_prog *new;
1528 struct sock_fprog fprog;
1529 int ret;
1530
1531 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1532 return -EPERM;
1533 if (len != sizeof(fprog))
1534 return -EINVAL;
1535 if (copy_from_user(&fprog, data, len))
1536 return -EFAULT;
1537
bab18991 1538 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
47dceb8e
WB
1539 if (ret)
1540 return ret;
1541
1542 __fanout_set_data_bpf(po->fanout, new);
1543 return 0;
1544}
1545
f2e52095
WB
1546static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1547 unsigned int len)
1548{
1549 struct bpf_prog *new;
1550 u32 fd;
1551
1552 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1553 return -EPERM;
1554 if (len != sizeof(fd))
1555 return -EINVAL;
1556 if (copy_from_user(&fd, data, len))
1557 return -EFAULT;
1558
113214be 1559 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
f2e52095
WB
1560 if (IS_ERR(new))
1561 return PTR_ERR(new);
f2e52095
WB
1562
1563 __fanout_set_data_bpf(po->fanout, new);
1564 return 0;
1565}
1566
47dceb8e
WB
1567static int fanout_set_data(struct packet_sock *po, char __user *data,
1568 unsigned int len)
1569{
1570 switch (po->fanout->type) {
1571 case PACKET_FANOUT_CBPF:
1572 return fanout_set_data_cbpf(po, data, len);
f2e52095
WB
1573 case PACKET_FANOUT_EBPF:
1574 return fanout_set_data_ebpf(po, data, len);
47dceb8e
WB
1575 default:
1576 return -EINVAL;
07d53ae4 1577 }
47dceb8e
WB
1578}
1579
1580static void fanout_release_data(struct packet_fanout *f)
1581{
1582 switch (f->type) {
1583 case PACKET_FANOUT_CBPF:
f2e52095 1584 case PACKET_FANOUT_EBPF:
47dceb8e 1585 __fanout_set_data_bpf(f, NULL);
07d53ae4 1586 }
47dceb8e
WB
1587}
1588
4a69a864
MM
1589static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1590{
1591 struct packet_fanout *f;
1592
1593 list_for_each_entry(f, &fanout_list, list) {
1594 if (f->id == candidate_id &&
1595 read_pnet(&f->net) == sock_net(sk)) {
1596 return false;
1597 }
1598 }
1599 return true;
1600}
1601
1602static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1603{
1604 u16 id = fanout_next_id;
1605
1606 do {
1607 if (__fanout_id_is_free(sk, id)) {
1608 *new_id = id;
1609 fanout_next_id = id + 1;
1610 return true;
1611 }
1612
1613 id++;
1614 } while (id != fanout_next_id);
1615
1616 return false;
1617}
1618
7736d33f 1619static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
dc99f600 1620{
d199fab6 1621 struct packet_rollover *rollover = NULL;
dc99f600
DM
1622 struct packet_sock *po = pkt_sk(sk);
1623 struct packet_fanout *f, *match;
7736d33f 1624 u8 type = type_flags & 0xff;
77f65ebd 1625 u8 flags = type_flags >> 8;
dc99f600
DM
1626 int err;
1627
1628 switch (type) {
77f65ebd
WB
1629 case PACKET_FANOUT_ROLLOVER:
1630 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1631 return -EINVAL;
dc99f600
DM
1632 case PACKET_FANOUT_HASH:
1633 case PACKET_FANOUT_LB:
95ec3eb4 1634 case PACKET_FANOUT_CPU:
5df0ddfb 1635 case PACKET_FANOUT_RND:
2d36097d 1636 case PACKET_FANOUT_QM:
47dceb8e 1637 case PACKET_FANOUT_CBPF:
f2e52095 1638 case PACKET_FANOUT_EBPF:
dc99f600
DM
1639 break;
1640 default:
1641 return -EINVAL;
1642 }
1643
d199fab6
ED
1644 mutex_lock(&fanout_mutex);
1645
d199fab6 1646 err = -EALREADY;
dc99f600 1647 if (po->fanout)
d199fab6 1648 goto out;
dc99f600 1649
4633c9e0
WB
1650 if (type == PACKET_FANOUT_ROLLOVER ||
1651 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
d199fab6
ED
1652 err = -ENOMEM;
1653 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1654 if (!rollover)
1655 goto out;
1656 atomic_long_set(&rollover->num, 0);
1657 atomic_long_set(&rollover->num_huge, 0);
1658 atomic_long_set(&rollover->num_failed, 0);
0648ab70
WB
1659 }
1660
4a69a864
MM
1661 if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1662 if (id != 0) {
1663 err = -EINVAL;
1664 goto out;
1665 }
1666 if (!fanout_find_new_id(sk, &id)) {
1667 err = -ENOMEM;
1668 goto out;
1669 }
1670 /* ephemeral flag for the first socket in the group: drop it */
1671 flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
1672 }
1673
dc99f600
DM
1674 match = NULL;
1675 list_for_each_entry(f, &fanout_list, list) {
1676 if (f->id == id &&
1677 read_pnet(&f->net) == sock_net(sk)) {
1678 match = f;
1679 break;
1680 }
1681 }
afe62c68 1682 err = -EINVAL;
77f65ebd 1683 if (match && match->flags != flags)
afe62c68 1684 goto out;
dc99f600 1685 if (!match) {
afe62c68 1686 err = -ENOMEM;
dc99f600 1687 match = kzalloc(sizeof(*match), GFP_KERNEL);
afe62c68
ED
1688 if (!match)
1689 goto out;
1690 write_pnet(&match->net, sock_net(sk));
1691 match->id = id;
1692 match->type = type;
77f65ebd 1693 match->flags = flags;
afe62c68
ED
1694 INIT_LIST_HEAD(&match->list);
1695 spin_lock_init(&match->lock);
fb5c2c17 1696 refcount_set(&match->sk_ref, 0);
47dceb8e 1697 fanout_init_data(match);
afe62c68
ED
1698 match->prot_hook.type = po->prot_hook.type;
1699 match->prot_hook.dev = po->prot_hook.dev;
1700 match->prot_hook.func = packet_rcv_fanout;
1701 match->prot_hook.af_packet_priv = match;
c0de08d0 1702 match->prot_hook.id_match = match_fanout_group;
afe62c68 1703 list_add(&match->list, &fanout_list);
dc99f600 1704 }
afe62c68 1705 err = -EINVAL;
008ba2a1
WB
1706
1707 spin_lock(&po->bind_lock);
1708 if (po->running &&
1709 match->type == type &&
afe62c68
ED
1710 match->prot_hook.type == po->prot_hook.type &&
1711 match->prot_hook.dev == po->prot_hook.dev) {
1712 err = -ENOSPC;
fb5c2c17 1713 if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
afe62c68
ED
1714 __dev_remove_pack(&po->prot_hook);
1715 po->fanout = match;
57f015f5
MM
1716 po->rollover = rollover;
1717 rollover = NULL;
fb5c2c17 1718 refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
afe62c68
ED
1719 __fanout_link(sk, po);
1720 err = 0;
dc99f600
DM
1721 }
1722 }
008ba2a1
WB
1723 spin_unlock(&po->bind_lock);
1724
1725 if (err && !refcount_read(&match->sk_ref)) {
1726 list_del(&match->list);
1727 kfree(match);
1728 }
1729
afe62c68 1730out:
57f015f5 1731 kfree(rollover);
d199fab6 1732 mutex_unlock(&fanout_mutex);
dc99f600
DM
1733 return err;
1734}
1735
2bd624b4
AS
1736/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1737 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1738 * It is the responsibility of the caller to call fanout_release_data() and
1739 * free the returned packet_fanout (after synchronize_net())
1740 */
1741static struct packet_fanout *fanout_release(struct sock *sk)
dc99f600
DM
1742{
1743 struct packet_sock *po = pkt_sk(sk);
1744 struct packet_fanout *f;
1745
fff3321d 1746 mutex_lock(&fanout_mutex);
d199fab6
ED
1747 f = po->fanout;
1748 if (f) {
1749 po->fanout = NULL;
1750
fb5c2c17 1751 if (refcount_dec_and_test(&f->sk_ref))
d199fab6 1752 list_del(&f->list);
2bd624b4
AS
1753 else
1754 f = NULL;
dc99f600
DM
1755 }
1756 mutex_unlock(&fanout_mutex);
2bd624b4
AS
1757
1758 return f;
dc99f600 1759}
1da177e4 1760
3c70c132
DB
1761static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1762 struct sk_buff *skb)
1763{
1764 /* Earlier code assumed this would be a VLAN pkt, double-check
1765 * this now that we have the actual packet in hand. We can only
1766 * do this check on Ethernet devices.
1767 */
1768 if (unlikely(dev->type != ARPHRD_ETHER))
1769 return false;
1770
1771 skb_reset_mac_header(skb);
1772 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1773}
1774
90ddc4f0 1775static const struct proto_ops packet_ops;
1da177e4 1776
90ddc4f0 1777static const struct proto_ops packet_ops_spkt;
1da177e4 1778
40d4e3df
ED
1779static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1780 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
1781{
1782 struct sock *sk;
1783 struct sockaddr_pkt *spkt;
1784
1785 /*
1786 * When we registered the protocol we saved the socket in the data
1787 * field for just this event.
1788 */
1789
1790 sk = pt->af_packet_priv;
1ce4f28b 1791
1da177e4
LT
1792 /*
1793 * Yank back the headers [hope the device set this
1794 * right or kerboom...]
1795 *
1796 * Incoming packets have ll header pulled,
1797 * push it back.
1798 *
98e399f8 1799 * For outgoing ones skb->data == skb_mac_header(skb)
1da177e4
LT
1800 * so that this procedure is noop.
1801 */
1802
1803 if (skb->pkt_type == PACKET_LOOPBACK)
1804 goto out;
1805
09ad9bc7 1806 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
1807 goto out;
1808
40d4e3df
ED
1809 skb = skb_share_check(skb, GFP_ATOMIC);
1810 if (skb == NULL)
1da177e4
LT
1811 goto oom;
1812
1813 /* drop any routing info */
adf30907 1814 skb_dst_drop(skb);
1da177e4 1815
84531c24
PO
1816 /* drop conntrack reference */
1817 nf_reset(skb);
1818
ffbc6111 1819 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1da177e4 1820
98e399f8 1821 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
1822
1823 /*
1824 * The SOCK_PACKET socket receives _all_ frames.
1825 */
1826
1827 spkt->spkt_family = dev->type;
1828 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1829 spkt->spkt_protocol = skb->protocol;
1830
1831 /*
1832 * Charge the memory to the socket. This is done specifically
1833 * to prevent sockets from using up all the memory.
1834 */
1835
40d4e3df 1836 if (sock_queue_rcv_skb(sk, skb) == 0)
1da177e4
LT
1837 return 0;
1838
1839out:
1840 kfree_skb(skb);
1841oom:
1842 return 0;
1843}
1844
75c65772
MM
1845static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
1846{
18bed891
YK
1847 if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
1848 sock->type == SOCK_RAW) {
75c65772
MM
1849 skb_reset_mac_header(skb);
1850 skb->protocol = dev_parse_header_protocol(skb);
1851 }
1852
1853 skb_probe_transport_header(skb);
1854}
1da177e4
LT
1855
1856/*
1857 * Output a raw packet to a device layer. This bypasses all the other
1858 * protocol layers and you must therefore supply it with a complete frame
1859 */
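/*
 * Userspace illustration (not part of this file): a minimal sketch of how
 * such a complete frame might be handed to this path through a SOCK_PACKET
 * socket. The device name and frame buffer are placeholders.
 *
 *	int fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 *	struct sockaddr_pkt spkt = { .spkt_family = AF_PACKET };
 *	strncpy(spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *	spkt.spkt_protocol = htons(ETH_P_IP);
 *	sendto(fd, frame, frame_len, 0, (struct sockaddr *)&spkt, sizeof(spkt));
 */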
1ce4f28b 1860
1b784140
YX
1861static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1862 size_t len)
1da177e4
LT
1863{
1864 struct sock *sk = sock->sk;
342dfc30 1865 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1a35ca80 1866 struct sk_buff *skb = NULL;
1da177e4 1867 struct net_device *dev;
c14ac945 1868 struct sockcm_cookie sockc;
40d4e3df 1869 __be16 proto = 0;
1da177e4 1870 int err;
3bdc0eba 1871 int extra_len = 0;
1ce4f28b 1872
1da177e4 1873 /*
1ce4f28b 1874 * Get and verify the address.
1da177e4
LT
1875 */
1876
40d4e3df 1877 if (saddr) {
1da177e4 1878 if (msg->msg_namelen < sizeof(struct sockaddr))
40d4e3df
ED
1879 return -EINVAL;
1880 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1881 proto = saddr->spkt_protocol;
1882 } else
1883 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1da177e4
LT
1884
1885 /*
1ce4f28b 1886 * Find the device first to size check it
1da177e4
LT
1887 */
1888
de74e92a 1889 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1a35ca80 1890retry:
654d1f8a
ED
1891 rcu_read_lock();
1892 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1da177e4
LT
1893 err = -ENODEV;
1894 if (dev == NULL)
1895 goto out_unlock;
1ce4f28b 1896
d5e76b0a
DM
1897 err = -ENETDOWN;
1898 if (!(dev->flags & IFF_UP))
1899 goto out_unlock;
1900
1da177e4 1901 /*
40d4e3df
ED
1902 * You may not queue a frame bigger than the mtu. This is the lowest level
1903 * raw protocol and you must do your own fragmentation at this level.
1da177e4 1904 */
1ce4f28b 1905
3bdc0eba
BG
1906 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1907 if (!netif_supports_nofcs(dev)) {
1908 err = -EPROTONOSUPPORT;
1909 goto out_unlock;
1910 }
1911 extra_len = 4; /* We're doing our own CRC */
1912 }
1913
1da177e4 1914 err = -EMSGSIZE;
3bdc0eba 1915 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1da177e4
LT
1916 goto out_unlock;
1917
1a35ca80
ED
1918 if (!skb) {
1919 size_t reserved = LL_RESERVED_SPACE(dev);
4ce40912 1920 int tlen = dev->needed_tailroom;
1a35ca80
ED
1921 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1922
1923 rcu_read_unlock();
4ce40912 1924 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1a35ca80
ED
1925 if (skb == NULL)
1926 return -ENOBUFS;
1927 /* FIXME: Save some space for broken drivers that write a hard
1928 * header at transmission time by themselves. PPP is the notable
1929 * one here. This should really be fixed at the driver level.
1930 */
1931 skb_reserve(skb, reserved);
1932 skb_reset_network_header(skb);
1933
1934 /* Try to align data part correctly */
1935 if (hhlen) {
1936 skb->data -= hhlen;
1937 skb->tail -= hhlen;
1938 if (len < hhlen)
1939 skb_reset_network_header(skb);
1940 }
6ce8e9ce 1941 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1a35ca80
ED
1942 if (err)
1943 goto out_free;
1944 goto retry;
1da177e4
LT
1945 }
1946
9ed988cd
WB
1947 if (!dev_validate_header(dev, skb->data, len)) {
1948 err = -EINVAL;
1949 goto out_unlock;
1950 }
3c70c132
DB
1951 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1952 !packet_extra_vlan_len_allowed(dev, skb)) {
1953 err = -EMSGSIZE;
1954 goto out_unlock;
57f89bfa 1955 }
1a35ca80 1956
657a0667 1957 sockcm_init(&sockc, sk);
c14ac945
SHY
1958 if (msg->msg_controllen) {
1959 err = sock_cmsg_send(sk, msg, &sockc);
f8e7718c 1960 if (unlikely(err))
c14ac945 1961 goto out_unlock;
c14ac945
SHY
1962 }
1963
1da177e4
LT
1964 skb->protocol = proto;
1965 skb->dev = dev;
1966 skb->priority = sk->sk_priority;
2d37a186 1967 skb->mark = sk->sk_mark;
3d0ba8c0 1968 skb->tstamp = sockc.transmit_time;
bf84a010 1969
8f932f76 1970 skb_setup_tx_timestamp(skb, sockc.tsflags);
1da177e4 1971
3bdc0eba
BG
1972 if (unlikely(extra_len == 4))
1973 skb->no_fcs = 1;
1974
75c65772 1975 packet_parse_headers(skb, sock);
c1aad275 1976
1da177e4 1977 dev_queue_xmit(skb);
654d1f8a 1978 rcu_read_unlock();
40d4e3df 1979 return len;
1da177e4 1980
1da177e4 1981out_unlock:
654d1f8a 1982 rcu_read_unlock();
1a35ca80
ED
1983out_free:
1984 kfree_skb(skb);
1da177e4
LT
1985 return err;
1986}
1da177e4 1987
ff936a04
AS
1988static unsigned int run_filter(struct sk_buff *skb,
1989 const struct sock *sk,
1990 unsigned int res)
1da177e4
LT
1991{
1992 struct sk_filter *filter;
fda9ef5d 1993
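	/* Run the socket's attached BPF filter, if any.  The value returned
	 * caps how many bytes the caller keeps; 0 means drop the packet.
	 */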
80f8f102
ED
1994 rcu_read_lock();
1995 filter = rcu_dereference(sk->sk_filter);
dbcb5855 1996 if (filter != NULL)
ff936a04 1997 res = bpf_prog_run_clear_cb(filter->prog, skb);
80f8f102 1998 rcu_read_unlock();
1da177e4 1999
dbcb5855 2000 return res;
1da177e4
LT
2001}
2002
16cc1400
WB
2003static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2004 size_t *len)
2005{
2006 struct virtio_net_hdr vnet_hdr;
2007
2008 if (*len < sizeof(vnet_hdr))
2009 return -EINVAL;
2010 *len -= sizeof(vnet_hdr);
2011
fd3a8862 2012 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
16cc1400
WB
2013 return -EINVAL;
2014
2015 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2016}
2017
1da177e4 2018/*
62ab0812
ED
2019 * This function does lazy skb cloning in the hope that most packets
2020 * are discarded by BPF.
2021 *
2022 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
2023 * and skb->cb are mangled. It works because (and until) packets
2024 * falling here are owned by the current CPU. Output packets are cloned
2025 * by dev_queue_xmit_nit(), input packets are processed by net_bh
2026 * sequentially, so if we return the skb to its original state on exit,
2027 * we will not harm anyone.
1da177e4
LT
2028 */
2029
40d4e3df
ED
2030static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2031 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2032{
2033 struct sock *sk;
2034 struct sockaddr_ll *sll;
2035 struct packet_sock *po;
40d4e3df 2036 u8 *skb_head = skb->data;
1da177e4 2037 int skb_len = skb->len;
dbcb5855 2038 unsigned int snaplen, res;
da37845f 2039 bool is_drop_n_account = false;
1da177e4
LT
2040
2041 if (skb->pkt_type == PACKET_LOOPBACK)
2042 goto drop;
2043
2044 sk = pt->af_packet_priv;
2045 po = pkt_sk(sk);
2046
09ad9bc7 2047 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2048 goto drop;
2049
1da177e4
LT
2050 skb->dev = dev;
2051
3b04ddde 2052 if (dev->header_ops) {
1da177e4 2053 /* The device has an explicit notion of ll header,
62ab0812
ED
2054 * exported to higher levels.
2055 *
2056 * Otherwise, the device hides details of its frame
2057 * structure, so that the corresponding packet header is
2058 * never delivered to the user.
1da177e4
LT
2059 */
2060 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2061 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2062 else if (skb->pkt_type == PACKET_OUTGOING) {
2063 /* Special case: outgoing packets have ll header at head */
bbe735e4 2064 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2065 }
2066 }
2067
2068 snaplen = skb->len;
2069
dbcb5855
DM
2070 res = run_filter(skb, sk, snaplen);
2071 if (!res)
fda9ef5d 2072 goto drop_n_restore;
dbcb5855
DM
2073 if (snaplen > res)
2074 snaplen = res;
1da177e4 2075
0fd7bac6 2076 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1da177e4
LT
2077 goto drop_n_acct;
2078
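	/* We are about to trim this skb and queue it to the socket; if it is
	 * shared, work on a private clone instead.
	 */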
2079 if (skb_shared(skb)) {
2080 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2081 if (nskb == NULL)
2082 goto drop_n_acct;
2083
2084 if (skb_head != skb->data) {
2085 skb->data = skb_head;
2086 skb->len = skb_len;
2087 }
abc4e4fa 2088 consume_skb(skb);
1da177e4
LT
2089 skb = nskb;
2090 }
2091
b4772ef8 2092 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
ffbc6111
HX
2093
2094 sll = &PACKET_SKB_CB(skb)->sa.ll;
1da177e4 2095 sll->sll_hatype = dev->type;
1da177e4 2096 sll->sll_pkttype = skb->pkt_type;
8032b464 2097 if (unlikely(po->origdev))
80feaacb
PWJ
2098 sll->sll_ifindex = orig_dev->ifindex;
2099 else
2100 sll->sll_ifindex = dev->ifindex;
1da177e4 2101
b95cce35 2102 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4 2103
2472d761
EB
2104 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2105 * Use their space for storing the original skb length.
2106 */
2107 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
8dc41944 2108
1da177e4
LT
2109 if (pskb_trim(skb, snaplen))
2110 goto drop_n_acct;
2111
2112 skb_set_owner_r(skb, sk);
2113 skb->dev = NULL;
adf30907 2114 skb_dst_drop(skb);
1da177e4 2115
84531c24
PO
2116 /* drop conntrack reference */
2117 nf_reset(skb);
2118
1da177e4 2119 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2120 po->stats.stats1.tp_packets++;
3bc3b96f 2121 sock_skb_set_dropcount(sk, skb);
1da177e4
LT
2122 __skb_queue_tail(&sk->sk_receive_queue, skb);
2123 spin_unlock(&sk->sk_receive_queue.lock);
676d2369 2124 sk->sk_data_ready(sk);
1da177e4
LT
2125 return 0;
2126
2127drop_n_acct:
da37845f 2128 is_drop_n_account = true;
7091fbd8 2129 spin_lock(&sk->sk_receive_queue.lock);
ee80fbf3 2130 po->stats.stats1.tp_drops++;
7091fbd8
WB
2131 atomic_inc(&sk->sk_drops);
2132 spin_unlock(&sk->sk_receive_queue.lock);
1da177e4
LT
2133
2134drop_n_restore:
2135 if (skb_head != skb->data && skb_shared(skb)) {
2136 skb->data = skb_head;
2137 skb->len = skb_len;
2138 }
2139drop:
da37845f
WJ
2140 if (!is_drop_n_account)
2141 consume_skb(skb);
2142 else
2143 kfree_skb(skb);
1da177e4
LT
2144 return 0;
2145}
2146
40d4e3df
ED
2147static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2148 struct packet_type *pt, struct net_device *orig_dev)
1da177e4
LT
2149{
2150 struct sock *sk;
2151 struct packet_sock *po;
2152 struct sockaddr_ll *sll;
184f489e 2153 union tpacket_uhdr h;
40d4e3df 2154 u8 *skb_head = skb->data;
1da177e4 2155 int skb_len = skb->len;
dbcb5855 2156 unsigned int snaplen, res;
f6fb8f10 2157 unsigned long status = TP_STATUS_USER;
bbd6ef87 2158 unsigned short macoff, netoff, hdrlen;
1da177e4 2159 struct sk_buff *copy_skb = NULL;
bbd6ef87 2160 struct timespec ts;
b9c32fb2 2161 __u32 ts_status;
da37845f 2162 bool is_drop_n_account = false;
edbd58be 2163 bool do_vnet = false;
1da177e4 2164
51846355
AW
2165 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2166 * We may add members to them up to the current aligned size without forcing
2167 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2168 */
2169 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2170 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2171
1da177e4
LT
2172 if (skb->pkt_type == PACKET_LOOPBACK)
2173 goto drop;
2174
2175 sk = pt->af_packet_priv;
2176 po = pkt_sk(sk);
2177
09ad9bc7 2178 if (!net_eq(dev_net(dev), sock_net(sk)))
d12d01d6
DL
2179 goto drop;
2180
3b04ddde 2181 if (dev->header_ops) {
1da177e4 2182 if (sk->sk_type != SOCK_DGRAM)
98e399f8 2183 skb_push(skb, skb->data - skb_mac_header(skb));
1da177e4
LT
2184 else if (skb->pkt_type == PACKET_OUTGOING) {
2185 /* Special case: outgoing packets have ll header at head */
bbe735e4 2186 skb_pull(skb, skb_network_offset(skb));
1da177e4
LT
2187 }
2188 }
2189
2190 snaplen = skb->len;
2191
dbcb5855
DM
2192 res = run_filter(skb, sk, snaplen);
2193 if (!res)
fda9ef5d 2194 goto drop_n_restore;
68c2e5de
AD
2195
2196 if (skb->ip_summed == CHECKSUM_PARTIAL)
2197 status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
2198 else if (skb->pkt_type != PACKET_OUTGOING &&
2199 (skb->ip_summed == CHECKSUM_COMPLETE ||
2200 skb_csum_unnecessary(skb)))
2201 status |= TP_STATUS_CSUM_VALID;
68c2e5de 2202
dbcb5855
DM
2203 if (snaplen > res)
2204 snaplen = res;
1da177e4
LT
2205
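	/* Work out where the link-layer (macoff) and network (netoff) headers
	 * will start inside the ring frame, leaving room for the tpacket
	 * header, tp_reserve and, when enabled, a virtio_net header.
	 */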
2206 if (sk->sk_type == SOCK_DGRAM) {
8913336a
PM
2207 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2208 po->tp_reserve;
1da177e4 2209 } else {
95c96174 2210 unsigned int maclen = skb_network_offset(skb);
bbd6ef87 2211 netoff = TPACKET_ALIGN(po->tp_hdrlen +
8913336a 2212 (maclen < 16 ? 16 : maclen)) +
58d19b19 2213 po->tp_reserve;
edbd58be 2214 if (po->has_vnet_hdr) {
58d19b19 2215 netoff += sizeof(struct virtio_net_hdr);
edbd58be
BP
2216 do_vnet = true;
2217 }
1da177e4
LT
2218 macoff = netoff - maclen;
2219 }
f6fb8f10 2220 if (po->tp_version <= TPACKET_V2) {
2221 if (macoff + snaplen > po->rx_ring.frame_size) {
2222 if (po->copy_thresh &&
0fd7bac6 2223 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
f6fb8f10 2224 if (skb_shared(skb)) {
2225 copy_skb = skb_clone(skb, GFP_ATOMIC);
2226 } else {
2227 copy_skb = skb_get(skb);
2228 skb_head = skb->data;
2229 }
2230 if (copy_skb)
2231 skb_set_owner_r(copy_skb, sk);
1da177e4 2232 }
f6fb8f10 2233 snaplen = po->rx_ring.frame_size - macoff;
edbd58be 2234 if ((int)snaplen < 0) {
f6fb8f10 2235 snaplen = 0;
edbd58be
BP
2236 do_vnet = false;
2237 }
1da177e4 2238 }
dc808110
ED
2239 } else if (unlikely(macoff + snaplen >
2240 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2241 u32 nval;
2242
2243 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2244 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2245 snaplen, nval, macoff);
2246 snaplen = nval;
2247 if (unlikely((int)snaplen < 0)) {
2248 snaplen = 0;
2249 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
edbd58be 2250 do_vnet = false;
dc808110 2251 }
1da177e4 2252 }
1da177e4 2253 spin_lock(&sk->sk_receive_queue.lock);
f6fb8f10 2254 h.raw = packet_current_rx_frame(po, skb,
2255 TP_STATUS_KERNEL, (macoff+snaplen));
bbd6ef87 2256 if (!h.raw)
58d19b19 2257 goto drop_n_account;
f6fb8f10 2258 if (po->tp_version <= TPACKET_V2) {
2259 packet_increment_rx_head(po, &po->rx_ring);
2260 /*
2261 * LOSING will be reported until you read the stats,
2262 * because it's COR - Clear On Read.
2263 * Anyway, this applies to V1/V2 only, as V3 doesn't need it
2264 * at the packet level.
2265 */
ee80fbf3 2266 if (po->stats.stats1.tp_drops)
f6fb8f10 2267 status |= TP_STATUS_LOSING;
2268 }
945d015e
ED
2269
2270 if (do_vnet &&
2271 virtio_net_hdr_from_skb(skb, h.raw + macoff -
2272 sizeof(struct virtio_net_hdr),
2273 vio_le(), true, 0))
2274 goto drop_n_account;
2275
ee80fbf3 2276 po->stats.stats1.tp_packets++;
1da177e4
LT
2277 if (copy_skb) {
2278 status |= TP_STATUS_COPY;
2279 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2280 }
1da177e4
LT
2281 spin_unlock(&sk->sk_receive_queue.lock);
2282
bbd6ef87 2283 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
b9c32fb2
DB
2284
2285 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
7a51384c 2286 getnstimeofday(&ts);
1da177e4 2287
b9c32fb2
DB
2288 status |= ts_status;
2289
bbd6ef87
PM
2290 switch (po->tp_version) {
2291 case TPACKET_V1:
2292 h.h1->tp_len = skb->len;
2293 h.h1->tp_snaplen = snaplen;
2294 h.h1->tp_mac = macoff;
2295 h.h1->tp_net = netoff;
4b457bdf
DB
2296 h.h1->tp_sec = ts.tv_sec;
2297 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
bbd6ef87
PM
2298 hdrlen = sizeof(*h.h1);
2299 break;
2300 case TPACKET_V2:
2301 h.h2->tp_len = skb->len;
2302 h.h2->tp_snaplen = snaplen;
2303 h.h2->tp_mac = macoff;
2304 h.h2->tp_net = netoff;
bbd6ef87
PM
2305 h.h2->tp_sec = ts.tv_sec;
2306 h.h2->tp_nsec = ts.tv_nsec;
df8a39de
JP
2307 if (skb_vlan_tag_present(skb)) {
2308 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
2309 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2310 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
2311 } else {
2312 h.h2->tp_vlan_tci = 0;
a0cdfcf3 2313 h.h2->tp_vlan_tpid = 0;
a3bcc23e 2314 }
e4d26f4b 2315 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
bbd6ef87
PM
2316 hdrlen = sizeof(*h.h2);
2317 break;
f6fb8f10 2318 case TPACKET_V3:
2319 /* tp_next_offset and the vlan fields are already populated above,
2320 * so DON'T clear those fields here.
2321 */
2322 h.h3->tp_status |= status;
2323 h.h3->tp_len = skb->len;
2324 h.h3->tp_snaplen = snaplen;
2325 h.h3->tp_mac = macoff;
2326 h.h3->tp_net = netoff;
f6fb8f10 2327 h.h3->tp_sec = ts.tv_sec;
2328 h.h3->tp_nsec = ts.tv_nsec;
e4d26f4b 2329 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
f6fb8f10 2330 hdrlen = sizeof(*h.h3);
2331 break;
bbd6ef87
PM
2332 default:
2333 BUG();
2334 }
1da177e4 2335
bbd6ef87 2336 sll = h.raw + TPACKET_ALIGN(hdrlen);
b95cce35 2337 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1da177e4
LT
2338 sll->sll_family = AF_PACKET;
2339 sll->sll_hatype = dev->type;
2340 sll->sll_protocol = skb->protocol;
2341 sll->sll_pkttype = skb->pkt_type;
8032b464 2342 if (unlikely(po->origdev))
80feaacb
PWJ
2343 sll->sll_ifindex = orig_dev->ifindex;
2344 else
2345 sll->sll_ifindex = dev->ifindex;
1da177e4 2346
e16aa207 2347 smp_mb();
f0d4eb29 2348
f6dafa95 2349#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
f0d4eb29 2350 if (po->tp_version <= TPACKET_V2) {
0af55bb5
CG
2351 u8 *start, *end;
2352
f0d4eb29
DB
2353 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2354 macoff + snaplen);
2355
2356 for (start = h.raw; start < end; start += PAGE_SIZE)
2357 flush_dcache_page(pgv_to_page(start));
1da177e4 2358 }
f0d4eb29 2359 smp_wmb();
f6dafa95 2360#endif
f0d4eb29 2361
da413eec 2362 if (po->tp_version <= TPACKET_V2) {
f6fb8f10 2363 __packet_set_status(po, h.raw, status);
da413eec
DC
2364 sk->sk_data_ready(sk);
2365 } else {
f6fb8f10 2366 prb_clear_blk_fill_status(&po->rx_ring);
da413eec 2367 }
1da177e4
LT
2368
2369drop_n_restore:
2370 if (skb_head != skb->data && skb_shared(skb)) {
2371 skb->data = skb_head;
2372 skb->len = skb_len;
2373 }
2374drop:
da37845f
WJ
2375 if (!is_drop_n_account)
2376 consume_skb(skb);
2377 else
2378 kfree_skb(skb);
1da177e4
LT
2379 return 0;
2380
58d19b19 2381drop_n_account:
da37845f 2382 is_drop_n_account = true;
ee80fbf3 2383 po->stats.stats1.tp_drops++;
1da177e4
LT
2384 spin_unlock(&sk->sk_receive_queue.lock);
2385
676d2369 2386 sk->sk_data_ready(sk);
acb5d75b 2387 kfree_skb(copy_skb);
1da177e4
LT
2388 goto drop_n_restore;
2389}
2390
69e3c75f
JB
2391static void tpacket_destruct_skb(struct sk_buff *skb)
2392{
2393 struct packet_sock *po = pkt_sk(skb->sk);
1da177e4 2394
69e3c75f 2395 if (likely(po->tx_ring.pg_vec)) {
f0d4eb29 2396 void *ph;
b9c32fb2
DB
2397 __u32 ts;
2398
5cd8d46e 2399 ph = skb_zcopy_get_nouarg(skb);
b0138408 2400 packet_dec_pending(&po->tx_ring);
b9c32fb2
DB
2401
2402 ts = __packet_set_timestamp(po, ph, skb);
2403 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
69e3c75f
JB
2404 }
2405
2406 sock_wfree(skb);
2407}
2408
16cc1400
WB
2409static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2410{
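	/* If checksum offload is requested, hdr_len must cover at least the
	 * bytes up to and including the checksum field; bump it if userspace
	 * under-reported it, and reject a hdr_len larger than the packet.
	 */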
16cc1400
WB
2411 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2412 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2413 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2414 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2415 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2416 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2417 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2418
2419 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2420 return -EINVAL;
2421
16cc1400
WB
2422 return 0;
2423}
2424
2425static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2426 struct virtio_net_hdr *vnet_hdr)
2427{
16cc1400
WB
2428 if (*len < sizeof(*vnet_hdr))
2429 return -EINVAL;
2430 *len -= sizeof(*vnet_hdr);
2431
cbbd26b8 2432 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
16cc1400
WB
2433 return -EFAULT;
2434
2435 return __packet_snd_vnet_parse(vnet_hdr, *len);
2436}
2437
40d4e3df 2438static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
8d39b4a6 2439 void *frame, struct net_device *dev, void *data, int tp_len,
c14ac945
SHY
2440 __be16 proto, unsigned char *addr, int hlen, int copylen,
2441 const struct sockcm_cookie *sockc)
69e3c75f 2442{
184f489e 2443 union tpacket_uhdr ph;
8d39b4a6 2444 int to_write, offset, len, nr_frags, len_max;
69e3c75f
JB
2445 struct socket *sock = po->sk.sk_socket;
2446 struct page *page;
69e3c75f
JB
2447 int err;
2448
2449 ph.raw = frame;
2450
2451 skb->protocol = proto;
2452 skb->dev = dev;
2453 skb->priority = po->sk.sk_priority;
2d37a186 2454 skb->mark = po->sk.sk_mark;
3d0ba8c0 2455 skb->tstamp = sockc->transmit_time;
8f932f76 2456 skb_setup_tx_timestamp(skb, sockc->tsflags);
5cd8d46e 2457 skb_zcopy_set_nouarg(skb, ph.raw);
69e3c75f 2458
ae641949 2459 skb_reserve(skb, hlen);
69e3c75f 2460 skb_reset_network_header(skb);
c1aad275 2461
69e3c75f
JB
2462 to_write = tp_len;
2463
2464 if (sock->type == SOCK_DGRAM) {
2465 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2466 NULL, tp_len);
2467 if (unlikely(err < 0))
2468 return -EINVAL;
1d036d25 2469 } else if (copylen) {
9ed988cd
WB
2470 int hdrlen = min_t(int, copylen, tp_len);
2471
69e3c75f 2472 skb_push(skb, dev->hard_header_len);
1d036d25 2473 skb_put(skb, copylen - dev->hard_header_len);
9ed988cd 2474 err = skb_store_bits(skb, 0, data, hdrlen);
69e3c75f
JB
2475 if (unlikely(err))
2476 return err;
9ed988cd
WB
2477 if (!dev_validate_header(dev, skb->data, hdrlen))
2478 return -EINVAL;
69e3c75f 2479
9ed988cd
WB
2480 data += hdrlen;
2481 to_write -= hdrlen;
69e3c75f
JB
2482 }
2483
69e3c75f
JB
2484 offset = offset_in_page(data);
2485 len_max = PAGE_SIZE - offset;
2486 len = ((to_write > len_max) ? len_max : to_write);
2487
2488 skb->data_len = to_write;
2489 skb->len += to_write;
2490 skb->truesize += to_write;
14afee4b 2491 refcount_add(to_write, &po->sk.sk_wmem_alloc);
69e3c75f
JB
2492
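	/* Attach the rest of the ring frame to the skb as page fragments,
	 * one page of the ring's pg_vec at a time.
	 */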
2493 while (likely(to_write)) {
2494 nr_frags = skb_shinfo(skb)->nr_frags;
2495
2496 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
40d4e3df
ED
2497 pr_err("Packet exceed the number of skb frags(%lu)\n",
2498 MAX_SKB_FRAGS);
69e3c75f
JB
2499 return -EFAULT;
2500 }
2501
0af55bb5
CG
2502 page = pgv_to_page(data);
2503 data += len;
69e3c75f
JB
2504 flush_dcache_page(page);
2505 get_page(page);
0af55bb5 2506 skb_fill_page_desc(skb, nr_frags, page, offset, len);
69e3c75f
JB
2507 to_write -= len;
2508 offset = 0;
2509 len_max = PAGE_SIZE;
2510 len = ((to_write > len_max) ? len_max : to_write);
2511 }
2512
75c65772 2513 packet_parse_headers(skb, sock);
efdfa2f7 2514
69e3c75f
JB
2515 return tp_len;
2516}
2517
8d39b4a6
WB
2518static int tpacket_parse_header(struct packet_sock *po, void *frame,
2519 int size_max, void **data)
2520{
2521 union tpacket_uhdr ph;
2522 int tp_len, off;
2523
2524 ph.raw = frame;
2525
2526 switch (po->tp_version) {
7f953ab2
SV
2527 case TPACKET_V3:
2528 if (ph.h3->tp_next_offset != 0) {
2529 pr_warn_once("variable sized slot not supported");
2530 return -EINVAL;
2531 }
2532 tp_len = ph.h3->tp_len;
2533 break;
8d39b4a6
WB
2534 case TPACKET_V2:
2535 tp_len = ph.h2->tp_len;
2536 break;
2537 default:
2538 tp_len = ph.h1->tp_len;
2539 break;
2540 }
2541 if (unlikely(tp_len > size_max)) {
2542 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2543 return -EMSGSIZE;
2544 }
2545
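	/* With PACKET_TX_HAS_OFF, userspace supplies the data offset via
	 * tp_net/tp_mac and it must stay within the frame; otherwise the data
	 * simply follows the tpacket header.
	 */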
2546 if (unlikely(po->tp_tx_has_off)) {
2547 int off_min, off_max;
2548
2549 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2550 off_max = po->tx_ring.frame_size - tp_len;
2551 if (po->sk.sk_type == SOCK_DGRAM) {
2552 switch (po->tp_version) {
7f953ab2
SV
2553 case TPACKET_V3:
2554 off = ph.h3->tp_net;
2555 break;
8d39b4a6
WB
2556 case TPACKET_V2:
2557 off = ph.h2->tp_net;
2558 break;
2559 default:
2560 off = ph.h1->tp_net;
2561 break;
2562 }
2563 } else {
2564 switch (po->tp_version) {
7f953ab2
SV
2565 case TPACKET_V3:
2566 off = ph.h3->tp_mac;
2567 break;
8d39b4a6
WB
2568 case TPACKET_V2:
2569 off = ph.h2->tp_mac;
2570 break;
2571 default:
2572 off = ph.h1->tp_mac;
2573 break;
2574 }
2575 }
2576 if (unlikely((off < off_min) || (off_max < off)))
2577 return -EINVAL;
2578 } else {
2579 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2580 }
2581
2582 *data = frame + off;
2583 return tp_len;
2584}
2585
69e3c75f
JB
2586static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2587{
69e3c75f
JB
2588 struct sk_buff *skb;
2589 struct net_device *dev;
1d036d25 2590 struct virtio_net_hdr *vnet_hdr = NULL;
c14ac945 2591 struct sockcm_cookie sockc;
69e3c75f 2592 __be16 proto;
09effa67 2593 int err, reserve = 0;
40d4e3df 2594 void *ph;
342dfc30 2595 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
87a2fd28 2596 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
486efdc8 2597 unsigned char *addr = NULL;
69e3c75f 2598 int tp_len, size_max;
8d39b4a6 2599 void *data;
69e3c75f 2600 int len_sum = 0;
9e67030a 2601 int status = TP_STATUS_AVAILABLE;
1d036d25 2602 int hlen, tlen, copylen = 0;
69e3c75f 2603
69e3c75f
JB
2604 mutex_lock(&po->pg_vec_lock);
2605
66e56cd4 2606 if (likely(saddr == NULL)) {
e40526cb 2607 dev = packet_cached_dev_get(po);
69e3c75f 2608 proto = po->num;
69e3c75f
JB
2609 } else {
2610 err = -EINVAL;
2611 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2612 goto out;
2613 if (msg->msg_namelen < (saddr->sll_halen
2614 + offsetof(struct sockaddr_ll,
2615 sll_addr)))
2616 goto out;
69e3c75f 2617 proto = saddr->sll_protocol;
827d9780 2618 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
486efdc8
WB
2619 if (po->sk.sk_socket->type == SOCK_DGRAM) {
2620 if (dev && msg->msg_namelen < dev->addr_len +
2621 offsetof(struct sockaddr_ll, sll_addr))
2622 goto out_put;
2623 addr = saddr->sll_addr;
2624 }
69e3c75f
JB
2625 }
2626
69e3c75f
JB
2627 err = -ENXIO;
2628 if (unlikely(dev == NULL))
2629 goto out;
69e3c75f
JB
2630 err = -ENETDOWN;
2631 if (unlikely(!(dev->flags & IFF_UP)))
2632 goto out_put;
2633
657a0667 2634 sockcm_init(&sockc, &po->sk);
d19b183c
DCS
2635 if (msg->msg_controllen) {
2636 err = sock_cmsg_send(&po->sk, msg, &sockc);
2637 if (unlikely(err))
2638 goto out_put;
2639 }
2640
5cfb4c8d
DB
2641 if (po->sk.sk_socket->type == SOCK_RAW)
2642 reserve = dev->hard_header_len;
69e3c75f 2643 size_max = po->tx_ring.frame_size
b5dd884e 2644 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
69e3c75f 2645
1d036d25 2646 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
5cfb4c8d 2647 size_max = dev->mtu + reserve + VLAN_HLEN;
09effa67 2648
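	/* Walk the TX ring: claim each frame marked TP_STATUS_SEND_REQUEST,
	 * build an skb around its data and hand it to the transmit path.
	 */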
69e3c75f
JB
2649 do {
2650 ph = packet_current_frame(po, &po->tx_ring,
87a2fd28 2651 TP_STATUS_SEND_REQUEST);
69e3c75f 2652 if (unlikely(ph == NULL)) {
87a2fd28
DB
2653 if (need_wait && need_resched())
2654 schedule();
69e3c75f
JB
2655 continue;
2656 }
2657
8d39b4a6
WB
2658 skb = NULL;
2659 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2660 if (tp_len < 0)
2661 goto tpacket_error;
2662
69e3c75f 2663 status = TP_STATUS_SEND_REQUEST;
ae641949
HX
2664 hlen = LL_RESERVED_SPACE(dev);
2665 tlen = dev->needed_tailroom;
1d036d25
WB
2666 if (po->has_vnet_hdr) {
2667 vnet_hdr = data;
2668 data += sizeof(*vnet_hdr);
2669 tp_len -= sizeof(*vnet_hdr);
2670 if (tp_len < 0 ||
2671 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2672 tp_len = -EINVAL;
2673 goto tpacket_error;
2674 }
2675 copylen = __virtio16_to_cpu(vio_le(),
2676 vnet_hdr->hdr_len);
2677 }
9ed988cd 2678 copylen = max_t(int, copylen, dev->hard_header_len);
69e3c75f 2679 skb = sock_alloc_send_skb(&po->sk,
1d036d25
WB
2680 hlen + tlen + sizeof(struct sockaddr_ll) +
2681 (copylen - dev->hard_header_len),
fbf33a28 2682 !need_wait, &err);
69e3c75f 2683
fbf33a28
KM
2684 if (unlikely(skb == NULL)) {
2685 /* we assume the socket was initially writeable ... */
2686 if (likely(len_sum > 0))
2687 err = len_sum;
69e3c75f 2688 goto out_status;
fbf33a28 2689 }
8d39b4a6 2690 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
c14ac945 2691 addr, hlen, copylen, &sockc);
dbd46ab4 2692 if (likely(tp_len >= 0) &&
5cfb4c8d 2693 tp_len > dev->mtu + reserve &&
1d036d25 2694 !po->has_vnet_hdr &&
3c70c132
DB
2695 !packet_extra_vlan_len_allowed(dev, skb))
2696 tp_len = -EMSGSIZE;
69e3c75f
JB
2697
2698 if (unlikely(tp_len < 0)) {
8d39b4a6 2699tpacket_error:
69e3c75f
JB
2700 if (po->tp_loss) {
2701 __packet_set_status(po, ph,
2702 TP_STATUS_AVAILABLE);
2703 packet_increment_head(&po->tx_ring);
2704 kfree_skb(skb);
2705 continue;
2706 } else {
2707 status = TP_STATUS_WRONG_FORMAT;
2708 err = tp_len;
2709 goto out_status;
2710 }
2711 }
2712
9d2f67e4
JT
2713 if (po->has_vnet_hdr) {
2714 if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
2715 tp_len = -EINVAL;
2716 goto tpacket_error;
2717 }
2718 virtio_net_hdr_set_proto(skb, vnet_hdr);
1d036d25
WB
2719 }
2720
69e3c75f
JB
2721 skb->destructor = tpacket_destruct_skb;
2722 __packet_set_status(po, ph, TP_STATUS_SENDING);
b0138408 2723 packet_inc_pending(&po->tx_ring);
69e3c75f
JB
2724
2725 status = TP_STATUS_SEND_REQUEST;
d346a3fa 2726 err = po->xmit(skb);
eb70df13
JP
2727 if (unlikely(err > 0)) {
2728 err = net_xmit_errno(err);
2729 if (err && __packet_get_status(po, ph) ==
2730 TP_STATUS_AVAILABLE) {
2731 /* skb was destructed already */
2732 skb = NULL;
2733 goto out_status;
2734 }
2735 /*
2736 * skb was dropped but not destructed yet;
2737 * let's treat it like congestion or err < 0
2738 */
2739 err = 0;
2740 }
69e3c75f
JB
2741 packet_increment_head(&po->tx_ring);
2742 len_sum += tp_len;
b0138408
DB
2743 } while (likely((ph != NULL) ||
2744 /* Note: packet_read_pending() might be slow if we have
2745 * to call it, as it's a per-cpu variable, but in the fast path
2746 * we already short-circuit the loop with the first
2747 * condition, and luckily don't have to take that path
2748 * anyway.
2749 */
2750 (need_wait && packet_read_pending(&po->tx_ring))));
69e3c75f
JB
2751
2752 err = len_sum;
2753 goto out_put;
2754
69e3c75f
JB
2755out_status:
2756 __packet_set_status(po, ph, status);
2757 kfree_skb(skb);
2758out_put:
e40526cb 2759 dev_put(dev);
69e3c75f
JB
2760out:
2761 mutex_unlock(&po->pg_vec_lock);
2762 return err;
2763}
69e3c75f 2764
eea49cc9
OJ
2765static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2766 size_t reserve, size_t len,
2767 size_t linear, int noblock,
2768 int *err)
bfd5f4a3
SS
2769{
2770 struct sk_buff *skb;
2771
2772 /* Under a page? Don't bother with paged skb. */
2773 if (prepad + len < PAGE_SIZE || !linear)
2774 linear = len;
2775
2776 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
28d64271 2777 err, 0);
bfd5f4a3
SS
2778 if (!skb)
2779 return NULL;
2780
2781 skb_reserve(skb, reserve);
2782 skb_put(skb, linear);
2783 skb->data_len = len - linear;
2784 skb->len += len - linear;
2785
2786 return skb;
2787}
2788
d346a3fa 2789static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
1da177e4
LT
2790{
2791 struct sock *sk = sock->sk;
342dfc30 2792 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
1da177e4
LT
2793 struct sk_buff *skb;
2794 struct net_device *dev;
0e11c91e 2795 __be16 proto;
486efdc8 2796 unsigned char *addr = NULL;
827d9780 2797 int err, reserve = 0;
c7d39e32 2798 struct sockcm_cookie sockc;
bfd5f4a3
SS
2799 struct virtio_net_hdr vnet_hdr = { 0 };
2800 int offset = 0;
bfd5f4a3 2801 struct packet_sock *po = pkt_sk(sk);
da7c9561 2802 bool has_vnet_hdr = false;
57031eb7 2803 int hlen, tlen, linear;
3bdc0eba 2804 int extra_len = 0;
1da177e4
LT
2805
2806 /*
1ce4f28b 2807 * Get and verify the address.
1da177e4 2808 */
1ce4f28b 2809
66e56cd4 2810 if (likely(saddr == NULL)) {
e40526cb 2811 dev = packet_cached_dev_get(po);
1da177e4 2812 proto = po->num;
1da177e4
LT
2813 } else {
2814 err = -EINVAL;
2815 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2816 goto out;
0fb375fb
EB
2817 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2818 goto out;
1da177e4 2819 proto = saddr->sll_protocol;
827d9780 2820 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
486efdc8
WB
2821 if (sock->type == SOCK_DGRAM) {
2822 if (dev && msg->msg_namelen < dev->addr_len +
2823 offsetof(struct sockaddr_ll, sll_addr))
2824 goto out_unlock;
2825 addr = saddr->sll_addr;
2826 }
1da177e4
LT
2827 }
2828
1da177e4 2829 err = -ENXIO;
e40526cb 2830 if (unlikely(dev == NULL))
1da177e4 2831 goto out_unlock;
d5e76b0a 2832 err = -ENETDOWN;
e40526cb 2833 if (unlikely(!(dev->flags & IFF_UP)))
d5e76b0a
DM
2834 goto out_unlock;
2835
657a0667 2836 sockcm_init(&sockc, sk);
c7d39e32
EJ
2837 sockc.mark = sk->sk_mark;
2838 if (msg->msg_controllen) {
2839 err = sock_cmsg_send(sk, msg, &sockc);
2840 if (unlikely(err))
2841 goto out_unlock;
2842 }
2843
e40526cb
DB
2844 if (sock->type == SOCK_RAW)
2845 reserve = dev->hard_header_len;
bfd5f4a3 2846 if (po->has_vnet_hdr) {
16cc1400
WB
2847 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2848 if (err)
bfd5f4a3 2849 goto out_unlock;
da7c9561 2850 has_vnet_hdr = true;
bfd5f4a3
SS
2851 }
2852
3bdc0eba
BG
2853 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2854 if (!netif_supports_nofcs(dev)) {
2855 err = -EPROTONOSUPPORT;
2856 goto out_unlock;
2857 }
2858 extra_len = 4; /* We're doing our own CRC */
2859 }
2860
1da177e4 2861 err = -EMSGSIZE;
16cc1400
WB
2862 if (!vnet_hdr.gso_type &&
2863 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
1da177e4
LT
2864 goto out_unlock;
2865
bfd5f4a3 2866 err = -ENOBUFS;
ae641949
HX
2867 hlen = LL_RESERVED_SPACE(dev);
2868 tlen = dev->needed_tailroom;
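	/* Size the linear part of the skb: honour the vnet header's hdr_len
	 * hint, but copy at least the link-layer header linearly.
	 */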
57031eb7
WB
2869 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2870 linear = max(linear, min_t(int, len, dev->hard_header_len));
2871 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
bfd5f4a3 2872 msg->msg_flags & MSG_DONTWAIT, &err);
40d4e3df 2873 if (skb == NULL)
1da177e4
LT
2874 goto out_unlock;
2875
b84bbaf7 2876 skb_reset_network_header(skb);
1da177e4 2877
0c4e8581 2878 err = -EINVAL;
9c707762
WB
2879 if (sock->type == SOCK_DGRAM) {
2880 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
46d2cfb1 2881 if (unlikely(offset < 0))
9c707762 2882 goto out_free;
b84bbaf7 2883 } else if (reserve) {
9aad13b0 2884 skb_reserve(skb, -reserve);
88a8121d
ND
2885 if (len < reserve + sizeof(struct ipv6hdr) &&
2886 dev->min_header_len != dev->hard_header_len)
993675a3 2887 skb_reset_network_header(skb);
9c707762 2888 }
1da177e4
LT
2889
2890 /* Returns -EFAULT on error */
c0371da6 2891 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
1da177e4
LT
2892 if (err)
2893 goto out_free;
bf84a010 2894
9ed988cd
WB
2895 if (sock->type == SOCK_RAW &&
2896 !dev_validate_header(dev, skb->data, len)) {
2897 err = -EINVAL;
2898 goto out_free;
2899 }
2900
8f932f76 2901 skb_setup_tx_timestamp(skb, sockc.tsflags);
1da177e4 2902
16cc1400 2903 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
3c70c132
DB
2904 !packet_extra_vlan_len_allowed(dev, skb)) {
2905 err = -EMSGSIZE;
2906 goto out_free;
57f89bfa
BG
2907 }
2908
09effa67
DM
2909 skb->protocol = proto;
2910 skb->dev = dev;
1da177e4 2911 skb->priority = sk->sk_priority;
c7d39e32 2912 skb->mark = sockc.mark;
3d0ba8c0 2913 skb->tstamp = sockc.transmit_time;
0fd5d57b 2914
da7c9561 2915 if (has_vnet_hdr) {
db60eb5f 2916 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
16cc1400
WB
2917 if (err)
2918 goto out_free;
2919 len += sizeof(vnet_hdr);
9d2f67e4 2920 virtio_net_hdr_set_proto(skb, &vnet_hdr);
bfd5f4a3
SS
2921 }
2922
75c65772 2923 packet_parse_headers(skb, sock);
8fd6c80d 2924
3bdc0eba
BG
2925 if (unlikely(extra_len == 4))
2926 skb->no_fcs = 1;
2927
d346a3fa 2928 err = po->xmit(skb);
1da177e4
LT
2929 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2930 goto out_unlock;
2931
e40526cb 2932 dev_put(dev);
1da177e4 2933
40d4e3df 2934 return len;
1da177e4
LT
2935
2936out_free:
2937 kfree_skb(skb);
2938out_unlock:
e40526cb 2939 if (dev)
1da177e4
LT
2940 dev_put(dev);
2941out:
2942 return err;
2943}
2944
1b784140 2945static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
69e3c75f 2946{
69e3c75f
JB
2947 struct sock *sk = sock->sk;
2948 struct packet_sock *po = pkt_sk(sk);
d346a3fa 2949
69e3c75f
JB
2950 if (po->tx_ring.pg_vec)
2951 return tpacket_snd(po, msg);
2952 else
69e3c75f
JB
2953 return packet_snd(sock, msg, len);
2954}
2955
1da177e4
LT
2956/*
2957 * Close a PACKET socket. This is fairly simple. We immediately go
2958 * to 'closed' state and remove our protocol entry in the device list.
2959 */
2960
2961static int packet_release(struct socket *sock)
2962{
2963 struct sock *sk = sock->sk;
2964 struct packet_sock *po;
2bd624b4 2965 struct packet_fanout *f;
d12d01d6 2966 struct net *net;
f6fb8f10 2967 union tpacket_req_u req_u;
1da177e4
LT
2968
2969 if (!sk)
2970 return 0;
2971
3b1e0a65 2972 net = sock_net(sk);
1da177e4
LT
2973 po = pkt_sk(sk);
2974
0fa7fa98 2975 mutex_lock(&net->packet.sklist_lock);
808f5114 2976 sk_del_node_init_rcu(sk);
0fa7fa98
PE
2977 mutex_unlock(&net->packet.sklist_lock);
2978
2979 preempt_disable();
920de804 2980 sock_prot_inuse_add(net, sk->sk_prot, -1);
0fa7fa98 2981 preempt_enable();
1da177e4 2982
808f5114 2983 spin_lock(&po->bind_lock);
ce06b03e 2984 unregister_prot_hook(sk, false);
66e56cd4
DB
2985 packet_cached_dev_reset(po);
2986
160ff18a
BG
2987 if (po->prot_hook.dev) {
2988 dev_put(po->prot_hook.dev);
2989 po->prot_hook.dev = NULL;
2990 }
808f5114 2991 spin_unlock(&po->bind_lock);
1da177e4 2992
1da177e4 2993 packet_flush_mclist(sk);
1da177e4 2994
5171b37d 2995 lock_sock(sk);
9665d5d6
PS
2996 if (po->rx_ring.pg_vec) {
2997 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 2998 packet_set_ring(sk, &req_u, 1, 0);
9665d5d6 2999 }
69e3c75f 3000
9665d5d6
PS
3001 if (po->tx_ring.pg_vec) {
3002 memset(&req_u, 0, sizeof(req_u));
f6fb8f10 3003 packet_set_ring(sk, &req_u, 1, 1);
9665d5d6 3004 }
5171b37d 3005 release_sock(sk);
1da177e4 3006
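	/* Leave any fanout group; the group itself is only freed after an RCU
	 * grace period, once no receiver can still see it.
	 */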
2bd624b4 3007 f = fanout_release(sk);
dc99f600 3008
808f5114 3009 synchronize_net();
2bd624b4
AS
3010
3011 if (f) {
57f015f5 3012 kfree(po->rollover);
2bd624b4
AS
3013 fanout_release_data(f);
3014 kfree(f);
3015 }
1da177e4
LT
3016 /*
3017 * Now the socket is dead. No more input will appear.
3018 */
1da177e4
LT
3019 sock_orphan(sk);
3020 sock->sk = NULL;
3021
3022 /* Purge queues */
3023
3024 skb_queue_purge(&sk->sk_receive_queue);
b0138408 3025 packet_free_pending(po);
17ab56a2 3026 sk_refcnt_debug_release(sk);
1da177e4
LT
3027
3028 sock_put(sk);
3029 return 0;
3030}
3031
3032/*
3033 * Attach a packet hook.
3034 */
3035
30f7ea1c
FR
3036static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3037 __be16 proto)
1da177e4
LT
3038{
3039 struct packet_sock *po = pkt_sk(sk);
158cd4af 3040 struct net_device *dev_curr;
902fefb8
DB
3041 __be16 proto_curr;
3042 bool need_rehook;
30f7ea1c
FR
3043 struct net_device *dev = NULL;
3044 int ret = 0;
3045 bool unlisted = false;
dc99f600 3046
1da177e4 3047 lock_sock(sk);
1da177e4 3048 spin_lock(&po->bind_lock);
30f7ea1c
FR
3049 rcu_read_lock();
3050
4971613c
WB
3051 if (po->fanout) {
3052 ret = -EINVAL;
3053 goto out_unlock;
3054 }
3055
30f7ea1c
FR
3056 if (name) {
3057 dev = dev_get_by_name_rcu(sock_net(sk), name);
3058 if (!dev) {
3059 ret = -ENODEV;
3060 goto out_unlock;
3061 }
3062 } else if (ifindex) {
3063 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3064 if (!dev) {
3065 ret = -ENODEV;
3066 goto out_unlock;
3067 }
3068 }
3069
3070 if (dev)
3071 dev_hold(dev);
66e56cd4 3072
902fefb8
DB
3073 proto_curr = po->prot_hook.type;
3074 dev_curr = po->prot_hook.dev;
3075
3076 need_rehook = proto_curr != proto || dev_curr != dev;
3077
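	/* Only tear down and re-register the protocol hook if the protocol or
	 * the bound device actually changes.
	 */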
3078 if (need_rehook) {
30f7ea1c
FR
3079 if (po->running) {
3080 rcu_read_unlock();
15fe076e
ED
3081 /* prevents packet_notifier() from calling
3082 * register_prot_hook()
3083 */
3084 po->num = 0;
30f7ea1c
FR
3085 __unregister_prot_hook(sk, true);
3086 rcu_read_lock();
3087 dev_curr = po->prot_hook.dev;
3088 if (dev)
3089 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3090 dev->ifindex);
3091 }
1da177e4 3092
15fe076e 3093 BUG_ON(po->running);
902fefb8
DB
3094 po->num = proto;
3095 po->prot_hook.type = proto;
902fefb8 3096
30f7ea1c
FR
3097 if (unlikely(unlisted)) {
3098 dev_put(dev);
3099 po->prot_hook.dev = NULL;
3100 po->ifindex = -1;
3101 packet_cached_dev_reset(po);
3102 } else {
3103 po->prot_hook.dev = dev;
3104 po->ifindex = dev ? dev->ifindex : 0;
3105 packet_cached_dev_assign(po, dev);
3106 }
902fefb8 3107 }
158cd4af
LW
3108 if (dev_curr)
3109 dev_put(dev_curr);
66e56cd4 3110
902fefb8 3111 if (proto == 0 || !need_rehook)
1da177e4
LT
3112 goto out_unlock;
3113
30f7ea1c 3114 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
ce06b03e 3115 register_prot_hook(sk);
be85d4ad
UT
3116 } else {
3117 sk->sk_err = ENETDOWN;
3118 if (!sock_flag(sk, SOCK_DEAD))
3119 sk->sk_error_report(sk);
1da177e4
LT
3120 }
3121
3122out_unlock:
30f7ea1c 3123 rcu_read_unlock();
1da177e4
LT
3124 spin_unlock(&po->bind_lock);
3125 release_sock(sk);
30f7ea1c 3126 return ret;
1da177e4
LT
3127}
3128
3129/*
3130 * Bind a packet socket to a device
3131 */
3132
40d4e3df
ED
3133static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3134 int addr_len)
1da177e4 3135{
40d4e3df 3136 struct sock *sk = sock->sk;
540e2894 3137 char name[sizeof(uaddr->sa_data) + 1];
1ce4f28b 3138
1da177e4
LT
3139 /*
3140 * Check legality
3141 */
1ce4f28b 3142
8ae55f04 3143 if (addr_len != sizeof(struct sockaddr))
1da177e4 3144 return -EINVAL;
540e2894
AP
3145 /* uaddr->sa_data comes from userspace; it's not guaranteed to be
3146 * zero-terminated.
3147 */
3148 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3149 name[sizeof(uaddr->sa_data)] = 0;
1da177e4 3150
30f7ea1c 3151 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
1da177e4 3152}
1da177e4
LT
3153
3154static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3155{
40d4e3df
ED
3156 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3157 struct sock *sk = sock->sk;
1da177e4
LT
3158
3159 /*
3160 * Check legality
3161 */
1ce4f28b 3162
1da177e4
LT
3163 if (addr_len < sizeof(struct sockaddr_ll))
3164 return -EINVAL;
3165 if (sll->sll_family != AF_PACKET)
3166 return -EINVAL;
3167
30f7ea1c
FR
3168 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3169 sll->sll_protocol ? : pkt_sk(sk)->num);
1da177e4
LT
3170}
3171
3172static struct proto packet_proto = {
3173 .name = "PACKET",
3174 .owner = THIS_MODULE,
3175 .obj_size = sizeof(struct packet_sock),
3176};
3177
3178/*
1ce4f28b 3179 * Create a packet of type SOCK_PACKET.
1da177e4
LT
3180 */
3181
3f378b68
EP
3182static int packet_create(struct net *net, struct socket *sock, int protocol,
3183 int kern)
1da177e4
LT
3184{
3185 struct sock *sk;
3186 struct packet_sock *po;
0e11c91e 3187 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1da177e4
LT
3188 int err;
3189
df008c91 3190 if (!ns_capable(net->user_ns, CAP_NET_RAW))
1da177e4 3191 return -EPERM;
be02097c
DM
3192 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3193 sock->type != SOCK_PACKET)
1da177e4
LT
3194 return -ESOCKTNOSUPPORT;
3195
3196 sock->state = SS_UNCONNECTED;
3197
3198 err = -ENOBUFS;
11aa9c28 3199 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
1da177e4
LT
3200 if (sk == NULL)
3201 goto out;
3202
3203 sock->ops = &packet_ops;
1da177e4
LT
3204 if (sock->type == SOCK_PACKET)
3205 sock->ops = &packet_ops_spkt;
be02097c 3206
1da177e4
LT
3207 sock_init_data(sock, sk);
3208
3209 po = pkt_sk(sk);
3210 sk->sk_family = PF_PACKET;
0e11c91e 3211 po->num = proto;
d346a3fa 3212 po->xmit = dev_queue_xmit;
66e56cd4 3213
b0138408
DB
3214 err = packet_alloc_pending(po);
3215 if (err)
3216 goto out2;
3217
66e56cd4 3218 packet_cached_dev_reset(po);
1da177e4
LT
3219
3220 sk->sk_destruct = packet_sock_destruct;
17ab56a2 3221 sk_refcnt_debug_inc(sk);
1da177e4
LT
3222
3223 /*
3224 * Attach a protocol block
3225 */
3226
3227 spin_lock_init(&po->bind_lock);
905db440 3228 mutex_init(&po->pg_vec_lock);
0648ab70 3229 po->rollover = NULL;
1da177e4 3230 po->prot_hook.func = packet_rcv;
be02097c 3231
1da177e4
LT
3232 if (sock->type == SOCK_PACKET)
3233 po->prot_hook.func = packet_rcv_spkt;
be02097c 3234
1da177e4
LT
3235 po->prot_hook.af_packet_priv = sk;
3236
0e11c91e
AV
3237 if (proto) {
3238 po->prot_hook.type = proto;
a6361f0c 3239 __register_prot_hook(sk);
1da177e4
LT
3240 }
3241
0fa7fa98 3242 mutex_lock(&net->packet.sklist_lock);
a4dc6a49 3243 sk_add_node_tail_rcu(sk, &net->packet.sklist);
0fa7fa98
PE
3244 mutex_unlock(&net->packet.sklist_lock);
3245
3246 preempt_disable();
3680453c 3247 sock_prot_inuse_add(net, &packet_proto, 1);
0fa7fa98 3248 preempt_enable();
808f5114 3249
40d4e3df 3250 return 0;
b0138408
DB
3251out2:
3252 sk_free(sk);
1da177e4
LT
3253out:
3254 return err;
3255}
3256
3257/*
3258 * Pull a packet from our receive queue and hand it to the user.
3259 * If necessary we block.
3260 */
3261
1b784140
YX
3262static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3263 int flags)
1da177e4
LT
3264{
3265 struct sock *sk = sock->sk;
3266 struct sk_buff *skb;
3267 int copied, err;
bfd5f4a3 3268 int vnet_hdr_len = 0;
2472d761 3269 unsigned int origlen = 0;
1da177e4
LT
3270
3271 err = -EINVAL;
ed85b565 3272 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1da177e4
LT
3273 goto out;
3274
3275#if 0
3276 /* What error should we return now? EUNATTACH? */
3277 if (pkt_sk(sk)->ifindex < 0)
3278 return -ENODEV;
3279#endif
3280
ed85b565 3281 if (flags & MSG_ERRQUEUE) {
cb820f8e
RC
3282 err = sock_recv_errqueue(sk, msg, len,
3283 SOL_PACKET, PACKET_TX_TIMESTAMP);
ed85b565
RC
3284 goto out;
3285 }
3286
1da177e4
LT
3287 /*
3288 * Call the generic datagram receiver. This handles all sorts
3289 * of horrible races and re-entrancy so we can forget about it
3290 * in the protocol layers.
3291 *
3292 * Now it will return ENETDOWN, if the device has just gone down,
3293 * but then it will block.
3294 */
3295
40d4e3df 3296 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1da177e4
LT
3297
3298 /*
1ce4f28b 3299 * An error occurred so return it. Because skb_recv_datagram()
1da177e4
LT
3300 * handles the blocking, we don't need to see or worry about blocking
3301 * retries.
3302 */
3303
8ae55f04 3304 if (skb == NULL)
1da177e4
LT
3305 goto out;
3306
2ccdbaa6
WB
3307 if (pkt_sk(sk)->pressure)
3308 packet_rcv_has_room(pkt_sk(sk), NULL);
3309
bfd5f4a3 3310 if (pkt_sk(sk)->has_vnet_hdr) {
16cc1400
WB
3311 err = packet_rcv_vnet(msg, skb, &len);
3312 if (err)
bfd5f4a3 3313 goto out_free;
16cc1400 3314 vnet_hdr_len = sizeof(struct virtio_net_hdr);
bfd5f4a3
SS
3315 }
3316
f3d33426
HFS
3317 /* You lose any data beyond the buffer you gave. If it worries
3318 * a user program, it can ask the device for its MTU
3319 * anyway.
1da177e4 3320 */
1da177e4 3321 copied = skb->len;
40d4e3df
ED
3322 if (copied > len) {
3323 copied = len;
3324 msg->msg_flags |= MSG_TRUNC;
1da177e4
LT
3325 }
3326
51f3d02b 3327 err = skb_copy_datagram_msg(skb, 0, msg, copied);
1da177e4
LT
3328 if (err)
3329 goto out_free;
3330
2472d761
EB
3331 if (sock->type != SOCK_PACKET) {
3332 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3333
3334 /* Original length was stored in sockaddr_ll fields */
3335 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3336 sll->sll_family = AF_PACKET;
3337 sll->sll_protocol = skb->protocol;
3338 }
3339
3b885787 3340 sock_recv_ts_and_drops(msg, sk, skb);
1da177e4 3341
f3d33426 3342 if (msg->msg_name) {
b2cf86e1
WB
3343 int copy_len;
3344
f3d33426
HFS
3345 /* If the address length field is there to be filled
3346 * in, we fill it in now.
3347 */
3348 if (sock->type == SOCK_PACKET) {
342dfc30 3349 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
f3d33426 3350 msg->msg_namelen = sizeof(struct sockaddr_pkt);
b2cf86e1 3351 copy_len = msg->msg_namelen;
f3d33426
HFS
3352 } else {
3353 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
2472d761 3354
f3d33426
HFS
3355 msg->msg_namelen = sll->sll_halen +
3356 offsetof(struct sockaddr_ll, sll_addr);
b2cf86e1
WB
3357 copy_len = msg->msg_namelen;
3358 if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
3359 memset(msg->msg_name +
3360 offsetof(struct sockaddr_ll, sll_addr),
3361 0, sizeof(sll->sll_addr));
3362 msg->msg_namelen = sizeof(struct sockaddr_ll);
3363 }
f3d33426 3364 }
b2cf86e1 3365 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
f3d33426 3366 }
1da177e4 3367
8dc41944 3368 if (pkt_sk(sk)->auxdata) {
ffbc6111
HX
3369 struct tpacket_auxdata aux;
3370
3371 aux.tp_status = TP_STATUS_USER;
3372 if (skb->ip_summed == CHECKSUM_PARTIAL)
3373 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
682f048b
AD
3374 else if (skb->pkt_type != PACKET_OUTGOING &&
3375 (skb->ip_summed == CHECKSUM_COMPLETE ||
3376 skb_csum_unnecessary(skb)))
3377 aux.tp_status |= TP_STATUS_CSUM_VALID;
3378
2472d761 3379 aux.tp_len = origlen;
ffbc6111
HX
3380 aux.tp_snaplen = skb->len;
3381 aux.tp_mac = 0;
bbe735e4 3382 aux.tp_net = skb_network_offset(skb);
df8a39de
JP
3383 if (skb_vlan_tag_present(skb)) {
3384 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
a0cdfcf3
AW
3385 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3386 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
a3bcc23e
BG
3387 } else {
3388 aux.tp_vlan_tci = 0;
a0cdfcf3 3389 aux.tp_vlan_tpid = 0;
a3bcc23e 3390 }
ffbc6111 3391 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
8dc41944
HX
3392 }
3393
1da177e4
LT
3394 /*
3395 * Free or return the buffer as appropriate. Again this
3396 * hides all the races and re-entrancy issues from us.
3397 */
bfd5f4a3 3398 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1da177e4
LT
3399
3400out_free:
3401 skb_free_datagram(sk, skb);
3402out:
3403 return err;
3404}
3405
1da177e4 3406static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3407 int peer)
1da177e4
LT
3408{
3409 struct net_device *dev;
3410 struct sock *sk = sock->sk;
3411
3412 if (peer)
3413 return -EOPNOTSUPP;
3414
3415 uaddr->sa_family = AF_PACKET;
2dc85bf3 3416 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
654d1f8a
ED
3417 rcu_read_lock();
3418 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3419 if (dev)
2dc85bf3 3420 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
654d1f8a 3421 rcu_read_unlock();
1da177e4 3422
9b2c45d4 3423 return sizeof(*uaddr);
1da177e4 3424}
1da177e4
LT
3425
3426static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
9b2c45d4 3427 int peer)
1da177e4
LT
3428{
3429 struct net_device *dev;
3430 struct sock *sk = sock->sk;
3431 struct packet_sock *po = pkt_sk(sk);
13cfa97b 3432 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1da177e4
LT
3433
3434 if (peer)
3435 return -EOPNOTSUPP;
3436
3437 sll->sll_family = AF_PACKET;
3438 sll->sll_ifindex = po->ifindex;
3439 sll->sll_protocol = po->num;
67286640 3440 sll->sll_pkttype = 0;
654d1f8a
ED
3441 rcu_read_lock();
3442 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1da177e4
LT
3443 if (dev) {
3444 sll->sll_hatype = dev->type;
3445 sll->sll_halen = dev->addr_len;
3446 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1da177e4
LT
3447 } else {
3448 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3449 sll->sll_halen = 0;
3450 }
654d1f8a 3451 rcu_read_unlock();
1da177e4 3452
9b2c45d4 3453 return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1da177e4
LT
3454}
3455
2aeb0b88
WC
3456static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3457 int what)
1da177e4
LT
3458{
3459 switch (i->type) {
3460 case PACKET_MR_MULTICAST:
1162563f
JP
3461 if (i->alen != dev->addr_len)
3462 return -EINVAL;
1da177e4 3463 if (what > 0)
22bedad3 3464 return dev_mc_add(dev, i->addr);
1da177e4 3465 else
22bedad3 3466 return dev_mc_del(dev, i->addr);
1da177e4
LT
3467 break;
3468 case PACKET_MR_PROMISC:
2aeb0b88 3469 return dev_set_promiscuity(dev, what);
1da177e4 3470 case PACKET_MR_ALLMULTI:
2aeb0b88 3471 return dev_set_allmulti(dev, what);
d95ed927 3472 case PACKET_MR_UNICAST:
1162563f
JP
3473 if (i->alen != dev->addr_len)
3474 return -EINVAL;
d95ed927 3475 if (what > 0)
a748ee24 3476 return dev_uc_add(dev, i->addr);
d95ed927 3477 else
a748ee24 3478 return dev_uc_del(dev, i->addr);
d95ed927 3479 break;
40d4e3df
ED
3480 default:
3481 break;
1da177e4 3482 }
2aeb0b88 3483 return 0;
1da177e4
LT
3484}
3485
82f17091
FR
3486static void packet_dev_mclist_delete(struct net_device *dev,
3487 struct packet_mclist **mlp)
1da177e4 3488{
82f17091
FR
3489 struct packet_mclist *ml;
3490
3491 while ((ml = *mlp) != NULL) {
3492 if (ml->ifindex == dev->ifindex) {
3493 packet_dev_mc(dev, ml, -1);
3494 *mlp = ml->next;
3495 kfree(ml);
3496 } else
3497 mlp = &ml->next;
1da177e4
LT
3498 }
3499}
3500
0fb375fb 3501static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3502{
3503 struct packet_sock *po = pkt_sk(sk);
3504 struct packet_mclist *ml, *i;
3505 struct net_device *dev;
3506 int err;
3507
3508 rtnl_lock();
3509
3510 err = -ENODEV;
3b1e0a65 3511 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1da177e4
LT
3512 if (!dev)
3513 goto done;
3514
3515 err = -EINVAL;
1162563f 3516 if (mreq->mr_alen > dev->addr_len)
1da177e4
LT
3517 goto done;
3518
3519 err = -ENOBUFS;
8b3a7005 3520 i = kmalloc(sizeof(*i), GFP_KERNEL);
1da177e4
LT
3521 if (i == NULL)
3522 goto done;
3523
3524 err = 0;
3525 for (ml = po->mclist; ml; ml = ml->next) {
3526 if (ml->ifindex == mreq->mr_ifindex &&
3527 ml->type == mreq->mr_type &&
3528 ml->alen == mreq->mr_alen &&
3529 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3530 ml->count++;
3531 /* Free the new element ... */
3532 kfree(i);
3533 goto done;
3534 }
3535 }
3536
3537 i->type = mreq->mr_type;
3538 i->ifindex = mreq->mr_ifindex;
3539 i->alen = mreq->mr_alen;
3540 memcpy(i->addr, mreq->mr_address, i->alen);
309cf37f 3541 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
1da177e4
LT
3542 i->count = 1;
3543 i->next = po->mclist;
3544 po->mclist = i;
2aeb0b88
WC
3545 err = packet_dev_mc(dev, i, 1);
3546 if (err) {
3547 po->mclist = i->next;
3548 kfree(i);
3549 }
1da177e4
LT
3550
3551done:
3552 rtnl_unlock();
3553 return err;
3554}
3555
0fb375fb 3556static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1da177e4
LT
3557{
3558 struct packet_mclist *ml, **mlp;
3559
3560 rtnl_lock();
3561
3562 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3563 if (ml->ifindex == mreq->mr_ifindex &&
3564 ml->type == mreq->mr_type &&
3565 ml->alen == mreq->mr_alen &&
3566 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3567 if (--ml->count == 0) {
3568 struct net_device *dev;
3569 *mlp = ml->next;
ad959e76
ED
3570 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3571 if (dev)
1da177e4 3572 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3573 kfree(ml);
3574 }
82f17091 3575 break;
1da177e4
LT
3576 }
3577 }
3578 rtnl_unlock();
82f17091 3579 return 0;
1da177e4
LT
3580}
3581
3582static void packet_flush_mclist(struct sock *sk)
3583{
3584 struct packet_sock *po = pkt_sk(sk);
3585 struct packet_mclist *ml;
3586
3587 if (!po->mclist)
3588 return;
3589
3590 rtnl_lock();
3591 while ((ml = po->mclist) != NULL) {
3592 struct net_device *dev;
3593
3594 po->mclist = ml->next;
ad959e76
ED
3595 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3596 if (dev != NULL)
1da177e4 3597 packet_dev_mc(dev, ml, -1);
1da177e4
LT
3598 kfree(ml);
3599 }
3600 rtnl_unlock();
3601}
1da177e4
LT
3602
3603static int
b7058842 3604packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1da177e4
LT
3605{
3606 struct sock *sk = sock->sk;
8dc41944 3607 struct packet_sock *po = pkt_sk(sk);
1da177e4
LT
3608 int ret;
3609
3610 if (level != SOL_PACKET)
3611 return -ENOPROTOOPT;
3612
69e3c75f 3613 switch (optname) {
1ce4f28b 3614 case PACKET_ADD_MEMBERSHIP:
1da177e4
LT
3615 case PACKET_DROP_MEMBERSHIP:
3616 {
0fb375fb
EB
3617 struct packet_mreq_max mreq;
3618 int len = optlen;
3619 memset(&mreq, 0, sizeof(mreq));
3620 if (len < sizeof(struct packet_mreq))
1da177e4 3621 return -EINVAL;
0fb375fb
EB
3622 if (len > sizeof(mreq))
3623 len = sizeof(mreq);
40d4e3df 3624 if (copy_from_user(&mreq, optval, len))
1da177e4 3625 return -EFAULT;
0fb375fb
EB
3626 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3627 return -EINVAL;
1da177e4
LT
3628 if (optname == PACKET_ADD_MEMBERSHIP)
3629 ret = packet_mc_add(sk, &mreq);
3630 else
3631 ret = packet_mc_drop(sk, &mreq);
3632 return ret;
3633 }
a2efcfa0 3634
1da177e4 3635 case PACKET_RX_RING:
69e3c75f 3636 case PACKET_TX_RING:
1da177e4 3637 {
f6fb8f10 3638 union tpacket_req_u req_u;
3639 int len;
1da177e4 3640
5171b37d 3641 lock_sock(sk);
f6fb8f10 3642 switch (po->tp_version) {
3643 case TPACKET_V1:
3644 case TPACKET_V2:
3645 len = sizeof(req_u.req);
3646 break;
3647 case TPACKET_V3:
3648 default:
3649 len = sizeof(req_u.req3);
3650 break;
3651 }
3652 if (optlen < len) {
3653 ret = -EINVAL;
3654 } else {
3655 if (copy_from_user(&req_u.req, optval, len))
3656 ret = -EFAULT;
3657 else
3658 ret = packet_set_ring(sk, &req_u, 0,
3659 optname == PACKET_TX_RING);
3660 }
3661 release_sock(sk);
3662 return ret;
3663 }
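/*
 * Illustrative sketch (not part of this file): configuring a TPACKET_V3
 * receive ring from userspace.  PACKET_VERSION has to be set while no ring
 * exists (see the -EBUSY check in the PACKET_VERSION case below), and the
 * geometry must satisfy the checks in packet_set_ring(): a page-aligned
 * block size, a frame size that is a multiple of TPACKET_ALIGNMENT, and
 * tp_frame_nr equal to frames-per-block times tp_block_nr.  The sizes shown
 * are assumptions made for the example.
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *
 *	int ver = TPACKET_V3;
 *	struct tpacket_req3 req = {
 *		.tp_block_size     = 1 << 22,                    // 4 MiB, page aligned
 *		.tp_block_nr       = 64,
 *		.tp_frame_size     = 1 << 11,                     // 2 KiB
 *		.tp_frame_nr       = ((1 << 22) / (1 << 11)) * 64,
 *		.tp_retire_blk_tov = 60,                          // block timeout, ms
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */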
3664 case PACKET_COPY_THRESH:
3665 {
3666 int val;
3667
40d4e3df 3668 if (optlen != sizeof(val))
1da177e4 3669 return -EINVAL;
40d4e3df 3670 if (copy_from_user(&val, optval, sizeof(val)))
3671 return -EFAULT;
3672
3673 pkt_sk(sk)->copy_thresh = val;
3674 return 0;
3675 }
3676 case PACKET_VERSION:
3677 {
3678 int val;
3679
3680 if (optlen != sizeof(val))
3681 return -EINVAL;
3682 if (copy_from_user(&val, optval, sizeof(val)))
3683 return -EFAULT;
3684 switch (val) {
3685 case TPACKET_V1:
3686 case TPACKET_V2:
f6fb8f10 3687 case TPACKET_V3:
84ac7260 3688 break;
3689 default:
3690 return -EINVAL;
3691 }
3692 lock_sock(sk);
3693 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3694 ret = -EBUSY;
3695 } else {
3696 po->tp_version = val;
3697 ret = 0;
3698 }
3699 release_sock(sk);
3700 return ret;
bbd6ef87 3701 }
3702 case PACKET_RESERVE:
3703 {
3704 unsigned int val;
3705
3706 if (optlen != sizeof(val))
3707 return -EINVAL;
3708 if (copy_from_user(&val, optval, sizeof(val)))
3709 return -EFAULT;
3710 if (val > INT_MAX)
3711 return -EINVAL;
3712 lock_sock(sk);
3713 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3714 ret = -EBUSY;
3715 } else {
3716 po->tp_reserve = val;
3717 ret = 0;
3718 }
3719 release_sock(sk);
3720 return ret;
8913336a 3721 }
3722 case PACKET_LOSS:
3723 {
3724 unsigned int val;
3725
3726 if (optlen != sizeof(val))
3727 return -EINVAL;
3728 if (copy_from_user(&val, optval, sizeof(val)))
3729 return -EFAULT;
3730
3731 lock_sock(sk);
3732 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3733 ret = -EBUSY;
3734 } else {
3735 po->tp_loss = !!val;
3736 ret = 0;
3737 }
3738 release_sock(sk);
3739 return ret;
69e3c75f 3740 }
3741 case PACKET_AUXDATA:
3742 {
3743 int val;
3744
3745 if (optlen < sizeof(val))
3746 return -EINVAL;
3747 if (copy_from_user(&val, optval, sizeof(val)))
3748 return -EFAULT;
3749
a6361f0c 3750 lock_sock(sk);
8dc41944 3751 po->auxdata = !!val;
a6361f0c 3752 release_sock(sk);
3753 return 0;
3754 }
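/*
 * Illustrative sketch (not part of this file): with PACKET_AUXDATA enabled,
 * each recvmsg() carries a SOL_PACKET/PACKET_AUXDATA control message holding
 * a struct tpacket_auxdata (snap length, VLAN TCI, ...).  Buffer sizes and
 * error handling are assumptions made for the example.
 *
 *	int one = 1;
 *	char frame[2048];
 *	union {
 *		struct cmsghdr align;
 *		char buf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *	} ctrl;
 *	struct iovec iov = { .iov_base = frame, .iov_len = sizeof(frame) };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = ctrl.buf, .msg_controllen = sizeof(ctrl.buf),
 *	};
 *	struct cmsghdr *cmsg;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
 *	recvmsg(fd, &msg, 0);
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
 *			// aux->tp_snaplen, aux->tp_vlan_tci, ...
 *		}
 */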
3755 case PACKET_ORIGDEV:
3756 {
3757 int val;
3758
3759 if (optlen < sizeof(val))
3760 return -EINVAL;
3761 if (copy_from_user(&val, optval, sizeof(val)))
3762 return -EFAULT;
3763
a6361f0c 3764 lock_sock(sk);
80feaacb 3765 po->origdev = !!val;
a6361f0c 3766 release_sock(sk);
3767 return 0;
3768 }
3769 case PACKET_VNET_HDR:
3770 {
3771 int val;
3772
3773 if (sock->type != SOCK_RAW)
3774 return -EINVAL;
3775 if (optlen < sizeof(val))
3776 return -EINVAL;
3777 if (copy_from_user(&val, optval, sizeof(val)))
3778 return -EFAULT;
3779
3780 lock_sock(sk);
3781 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3782 ret = -EBUSY;
3783 } else {
3784 po->has_vnet_hdr = !!val;
3785 ret = 0;
3786 }
3787 release_sock(sk);
3788 return ret;
bfd5f4a3 3789 }
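/*
 * Illustrative sketch (not part of this file): once PACKET_VNET_HDR is
 * enabled on a SOCK_RAW packet socket (and before any ring is created, per
 * the -EBUSY check above), every packet exchanged over read()/write() or
 * recvmsg()/sendmsg() is prefixed with a struct virtio_net_hdr describing
 * offload state.  A receive path might gather it like this:
 *
 *	#include <sys/uio.h>
 *	#include <linux/virtio_net.h>
 *
 *	int one = 1;
 *	struct virtio_net_hdr vnet;
 *	char frame[65536];
 *	struct iovec iov[2] = {
 *		{ .iov_base = &vnet, .iov_len = sizeof(vnet) },
 *		{ .iov_base = frame, .iov_len = sizeof(frame) },
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &one, sizeof(one));
 *	readv(fd, iov, 2);
 *	// vnet.flags and vnet.gso_type describe checksum and GSO state
 */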
3790 case PACKET_TIMESTAMP:
3791 {
3792 int val;
3793
3794 if (optlen != sizeof(val))
3795 return -EINVAL;
3796 if (copy_from_user(&val, optval, sizeof(val)))
3797 return -EFAULT;
3798
3799 po->tp_tstamp = val;
3800 return 0;
3801 }
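/*
 * Illustrative sketch (not part of this file): PACKET_TIMESTAMP takes
 * SOF_TIMESTAMPING_* flags from <linux/net_tstamp.h> and selects which clock
 * fills the tp_sec/tp_nsec fields of ring frames.  Requesting raw hardware
 * stamps usually also needs the NIC configured via SIOCSHWTSTAMP, which is
 * outside this example.
 *
 *	#include <linux/net_tstamp.h>
 *
 *	int req = SOF_TIMESTAMPING_RAW_HARDWARE;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, &req, sizeof(req));
 */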
3802 case PACKET_FANOUT:
3803 {
3804 int val;
3805
3806 if (optlen != sizeof(val))
3807 return -EINVAL;
3808 if (copy_from_user(&val, optval, sizeof(val)))
3809 return -EFAULT;
3810
3811 return fanout_add(sk, val & 0xffff, val >> 16);
3812 }
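/*
 * Illustrative sketch (not part of this file): since fanout_add() above is
 * called with (val & 0xffff) as the group id and (val >> 16) as the mode and
 * flags, userspace packs the argument accordingly.  The group id 42 is an
 * arbitrary assumption; every socket joining the same group passes the same
 * id.
 *
 *	int id  = 42;
 *	int arg = id | (PACKET_FANOUT_HASH << 16);
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &arg, sizeof(arg));
 */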
3813 case PACKET_FANOUT_DATA:
3814 {
3815 if (!po->fanout)
3816 return -EINVAL;
3817
3818 return fanout_set_data(po, optval, optlen);
3819 }
3820 case PACKET_IGNORE_OUTGOING:
3821 {
3822 int val;
3823
3824 if (optlen != sizeof(val))
3825 return -EINVAL;
3826 if (copy_from_user(&val, optval, sizeof(val)))
3827 return -EFAULT;
3828 if (val < 0 || val > 1)
3829 return -EINVAL;
3830
3831 po->prot_hook.ignore_outgoing = !!val;
3832 return 0;
3833 }
3834 case PACKET_TX_HAS_OFF:
3835 {
3836 unsigned int val;
3837
3838 if (optlen != sizeof(val))
3839 return -EINVAL;
3840 if (copy_from_user(&val, optval, sizeof(val)))
3841 return -EFAULT;
3842
3843 lock_sock(sk);
3844 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3845 ret = -EBUSY;
3846 } else {
3847 po->tp_tx_has_off = !!val;
3848 ret = 0;
3849 }
3850 release_sock(sk);
3851 return 0;
3852 }
3853 case PACKET_QDISC_BYPASS:
3854 {
3855 int val;
3856
3857 if (optlen != sizeof(val))
3858 return -EINVAL;
3859 if (copy_from_user(&val, optval, sizeof(val)))
3860 return -EFAULT;
3861
3862 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3863 return 0;
3864 }
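/*
 * Illustrative sketch (not part of this file): PACKET_QDISC_BYPASS makes
 * transmitted frames use packet_direct_xmit() and skip the qdisc layer, so
 * drops at a busy device are no longer buffered or accounted by queueing
 * statistics.
 *
 *	int one = 1;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));
 */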
3865 default:
3866 return -ENOPROTOOPT;
3867 }
3868}
3869
3870static int packet_getsockopt(struct socket *sock, int level, int optname,
3871 char __user *optval, int __user *optlen)
3872{
3873 int len;
c06fff6e 3874 int val, lv = sizeof(val);
3875 struct sock *sk = sock->sk;
3876 struct packet_sock *po = pkt_sk(sk);
c06fff6e 3877 void *data = &val;
ee80fbf3 3878 union tpacket_stats_u st;
a9b63918 3879 struct tpacket_rollover_stats rstats;
3880
3881 if (level != SOL_PACKET)
3882 return -ENOPROTOOPT;
3883
3884 if (get_user(len, optlen))
3885 return -EFAULT;
3886
3887 if (len < 0)
3888 return -EINVAL;
1ce4f28b 3889
69e3c75f 3890 switch (optname) {
1da177e4 3891 case PACKET_STATISTICS:
1da177e4 3892 spin_lock_bh(&sk->sk_receive_queue.lock);
3893 memcpy(&st, &po->stats, sizeof(st));
3894 memset(&po->stats, 0, sizeof(po->stats));
3895 spin_unlock_bh(&sk->sk_receive_queue.lock);
3896
f6fb8f10 3897 if (po->tp_version == TPACKET_V3) {
c06fff6e 3898 lv = sizeof(struct tpacket_stats_v3);
8bcdeaff 3899 st.stats3.tp_packets += st.stats3.tp_drops;
ee80fbf3 3900 data = &st.stats3;
f6fb8f10 3901 } else {
c06fff6e 3902 lv = sizeof(struct tpacket_stats);
8bcdeaff 3903 st.stats1.tp_packets += st.stats1.tp_drops;
ee80fbf3 3904 data = &st.stats1;
f6fb8f10 3905 }
ee80fbf3 3906
3907 break;
3908 case PACKET_AUXDATA:
8dc41944 3909 val = po->auxdata;
3910 break;
3911 case PACKET_ORIGDEV:
80feaacb 3912 val = po->origdev;
3913 break;
3914 case PACKET_VNET_HDR:
bfd5f4a3 3915 val = po->has_vnet_hdr;
1da177e4 3916 break;
bbd6ef87 3917 case PACKET_VERSION:
bbd6ef87 3918 val = po->tp_version;
3919 break;
3920 case PACKET_HDRLEN:
3921 if (len > sizeof(int))
3922 len = sizeof(int);
3923 if (len < sizeof(int))
3924 return -EINVAL;
3925 if (copy_from_user(&val, optval, len))
3926 return -EFAULT;
3927 switch (val) {
3928 case TPACKET_V1:
3929 val = sizeof(struct tpacket_hdr);
3930 break;
3931 case TPACKET_V2:
3932 val = sizeof(struct tpacket2_hdr);
3933 break;
f6fb8f10 3934 case TPACKET_V3:
3935 val = sizeof(struct tpacket3_hdr);
3936 break;
3937 default:
3938 return -EINVAL;
3939 }
bbd6ef87 3940 break;
8913336a 3941 case PACKET_RESERVE:
8913336a 3942 val = po->tp_reserve;
8913336a 3943 break;
69e3c75f 3944 case PACKET_LOSS:
69e3c75f 3945 val = po->tp_loss;
69e3c75f 3946 break;
614f60fa 3947 case PACKET_TIMESTAMP:
614f60fa 3948 val = po->tp_tstamp;
614f60fa 3949 break;
dc99f600 3950 case PACKET_FANOUT:
3951 val = (po->fanout ?
3952 ((u32)po->fanout->id |
3953 ((u32)po->fanout->type << 16) |
3954 ((u32)po->fanout->flags << 24)) :
dc99f600 3955 0);
dc99f600 3956 break;
3957 case PACKET_IGNORE_OUTGOING:
3958 val = po->prot_hook.ignore_outgoing;
3959 break;
a9b63918 3960 case PACKET_ROLLOVER_STATS:
57f015f5 3961 if (!po->rollover)
a9b63918 3962 return -EINVAL;
3963 rstats.tp_all = atomic_long_read(&po->rollover->num);
3964 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
3965 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
3966 data = &rstats;
3967 lv = sizeof(rstats);
a9b63918 3968 break;
3969 case PACKET_TX_HAS_OFF:
3970 val = po->tp_tx_has_off;
3971 break;
3972 case PACKET_QDISC_BYPASS:
3973 val = packet_use_direct_xmit(po);
3974 break;
3975 default:
3976 return -ENOPROTOOPT;
3977 }
3978
3979 if (len > lv)
3980 len = lv;
3981 if (put_user(len, optlen))
3982 return -EFAULT;
3983 if (copy_to_user(optval, data, len))
3984 return -EFAULT;
8ae55f04 3985 return 0;
3986}
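/*
 * Illustrative sketch (not part of this file): reading the receive
 * statistics maintained above.  The counters are reset on every read, and
 * tp_packets already includes tp_drops (see the additions in the
 * PACKET_STATISTICS case).  For TPACKET_V3 sockets, pass a
 * struct tpacket_stats_v3 instead.
 *
 *	struct tpacket_stats stats;
 *	socklen_t len = sizeof(stats);
 *
 *	getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &stats, &len);
 *	// stats.tp_packets received, stats.tp_drops of them dropped
 */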
3987
3988
3989#ifdef CONFIG_COMPAT
3990static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
3991 char __user *optval, unsigned int optlen)
3992{
3993 struct packet_sock *po = pkt_sk(sock->sk);
3994
3995 if (level != SOL_PACKET)
3996 return -ENOPROTOOPT;
3997
3998 if (optname == PACKET_FANOUT_DATA &&
3999 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
4000 optval = (char __user *)get_compat_bpf_fprog(optval);
4001 if (!optval)
4002 return -EFAULT;
4003 optlen = sizeof(struct sock_fprog);
4004 }
4005
4006 return packet_setsockopt(sock, level, optname, optval, optlen);
4007}
4008#endif
4009
4010static int packet_notifier(struct notifier_block *this,
4011 unsigned long msg, void *ptr)
4012{
4013 struct sock *sk;
351638e7 4014 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 4015 struct net *net = dev_net(dev);
1da177e4 4016
808f5114 4017 rcu_read_lock();
b67bfe0d 4018 sk_for_each_rcu(sk, &net->packet.sklist) {
4019 struct packet_sock *po = pkt_sk(sk);
4020
4021 switch (msg) {
4022 case NETDEV_UNREGISTER:
1da177e4 4023 if (po->mclist)
82f17091 4024 packet_dev_mclist_delete(dev, &po->mclist);
4025 /* fallthrough */
4026
4027 case NETDEV_DOWN:
4028 if (dev->ifindex == po->ifindex) {
4029 spin_lock(&po->bind_lock);
4030 if (po->running) {
ce06b03e 4031 __unregister_prot_hook(sk, false);
4032 sk->sk_err = ENETDOWN;
4033 if (!sock_flag(sk, SOCK_DEAD))
4034 sk->sk_error_report(sk);
4035 }
4036 if (msg == NETDEV_UNREGISTER) {
66e56cd4 4037 packet_cached_dev_reset(po);
1da177e4 4038 po->ifindex = -1;
4039 if (po->prot_hook.dev)
4040 dev_put(po->prot_hook.dev);
4041 po->prot_hook.dev = NULL;
4042 }
4043 spin_unlock(&po->bind_lock);
4044 }
4045 break;
4046 case NETDEV_UP:
808f5114 4047 if (dev->ifindex == po->ifindex) {
4048 spin_lock(&po->bind_lock);
4049 if (po->num)
4050 register_prot_hook(sk);
808f5114 4051 spin_unlock(&po->bind_lock);
1da177e4 4052 }
4053 break;
4054 }
4055 }
808f5114 4056 rcu_read_unlock();
4057 return NOTIFY_DONE;
4058}
4059
4060
4061static int packet_ioctl(struct socket *sock, unsigned int cmd,
4062 unsigned long arg)
4063{
4064 struct sock *sk = sock->sk;
4065
69e3c75f 4066 switch (cmd) {
4067 case SIOCOUTQ:
4068 {
4069 int amount = sk_wmem_alloc_get(sk);
31e6d363 4070
4071 return put_user(amount, (int __user *)arg);
4072 }
4073 case SIOCINQ:
4074 {
4075 struct sk_buff *skb;
4076 int amount = 0;
4077
4078 spin_lock_bh(&sk->sk_receive_queue.lock);
4079 skb = skb_peek(&sk->sk_receive_queue);
4080 if (skb)
4081 amount = skb->len;
4082 spin_unlock_bh(&sk->sk_receive_queue.lock);
4083 return put_user(amount, (int __user *)arg);
4084 }
1da177e4 4085#ifdef CONFIG_INET
4086 case SIOCADDRT:
4087 case SIOCDELRT:
4088 case SIOCDARP:
4089 case SIOCGARP:
4090 case SIOCSARP:
4091 case SIOCGIFADDR:
4092 case SIOCSIFADDR:
4093 case SIOCGIFBRDADDR:
4094 case SIOCSIFBRDADDR:
4095 case SIOCGIFNETMASK:
4096 case SIOCSIFNETMASK:
4097 case SIOCGIFDSTADDR:
4098 case SIOCSIFDSTADDR:
4099 case SIOCSIFFLAGS:
40d4e3df 4100 return inet_dgram_ops.ioctl(sock, cmd, arg);
4101#endif
4102
4103 default:
4104 return -ENOIOCTLCMD;
4105 }
4106 return 0;
4107}
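/*
 * Illustrative sketch (not part of this file): the two queue ioctls handled
 * above.  SIOCINQ reports the length of the next queued packet (0 when the
 * receive queue is empty), SIOCOUTQ the bytes still held in the send queue.
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/sockios.h>
 *
 *	int next_len, unsent;
 *
 *	ioctl(fd, SIOCINQ, &next_len);
 *	ioctl(fd, SIOCOUTQ, &unsent);
 */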
4108
4109static __poll_t packet_poll(struct file *file, struct socket *sock,
4110 poll_table *wait)
4111{
4112 struct sock *sk = sock->sk;
4113 struct packet_sock *po = pkt_sk(sk);
a11e1d43 4114 __poll_t mask = datagram_poll(file, sock, wait);
4115
4116 spin_lock_bh(&sk->sk_receive_queue.lock);
69e3c75f 4117 if (po->rx_ring.pg_vec) {
f6fb8f10 4118 if (!packet_previous_rx_frame(po, &po->rx_ring,
4119 TP_STATUS_KERNEL))
a9a08845 4120 mask |= EPOLLIN | EPOLLRDNORM;
1da177e4 4121 }
2ccdbaa6 4122 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
54d7c01d 4123 po->pressure = 0;
1da177e4 4124 spin_unlock_bh(&sk->sk_receive_queue.lock);
4125 spin_lock_bh(&sk->sk_write_queue.lock);
4126 if (po->tx_ring.pg_vec) {
4127 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
a9a08845 4128 mask |= EPOLLOUT | EPOLLWRNORM;
4129 }
4130 spin_unlock_bh(&sk->sk_write_queue.lock);
4131 return mask;
4132}
4133
4134
4135/* Dirty? Well, I still did not learn a better way to account
4136 * for user mmaps.
4137 */
4138
4139static void packet_mm_open(struct vm_area_struct *vma)
4140{
4141 struct file *file = vma->vm_file;
40d4e3df 4142 struct socket *sock = file->private_data;
1da177e4 4143 struct sock *sk = sock->sk;
1ce4f28b 4144
4145 if (sk)
4146 atomic_inc(&pkt_sk(sk)->mapped);
4147}
4148
4149static void packet_mm_close(struct vm_area_struct *vma)
4150{
4151 struct file *file = vma->vm_file;
40d4e3df 4152 struct socket *sock = file->private_data;
1da177e4 4153 struct sock *sk = sock->sk;
1ce4f28b 4154
4155 if (sk)
4156 atomic_dec(&pkt_sk(sk)->mapped);
4157}
4158
f0f37e2f 4159static const struct vm_operations_struct packet_mmap_ops = {
4160 .open = packet_mm_open,
4161 .close = packet_mm_close,
4162};
4163
4164static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
4165 unsigned int len)
4166{
4167 int i;
4168
4ebf0ae2 4169 for (i = 0; i < len; i++) {
0e3125c7 4170 if (likely(pg_vec[i].buffer)) {
4171 if (is_vmalloc_addr(pg_vec[i].buffer))
4172 vfree(pg_vec[i].buffer);
4173 else
4174 free_pages((unsigned long)pg_vec[i].buffer,
4175 order);
4176 pg_vec[i].buffer = NULL;
4177 }
4178 }
4179 kfree(pg_vec);
4180}
4181
3a7ad063 4182static char *alloc_one_pg_vec_page(unsigned long order)
4ebf0ae2 4183{
f0d4eb29 4184 char *buffer;
4185 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
4186 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
0e3125c7 4187
3a7ad063 4188 buffer = (char *) __get_free_pages(gfp_flags, order);
4189 if (buffer)
4190 return buffer;
4191
4192 /* __get_free_pages failed, fall back to vmalloc */
4193 buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
4194 if (buffer)
4195 return buffer;
0e3125c7 4196
4197	/* vmalloc failed, let's dig into swap here */
4198 gfp_flags &= ~__GFP_NORETRY;
4199 buffer = (char *) __get_free_pages(gfp_flags, order);
4200 if (buffer)
4201 return buffer;
4202
4203 /* complete and utter failure */
4204 return NULL;
4205}
4206
3a7ad063 4207static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
4208{
4209 unsigned int block_nr = req->tp_block_nr;
0e3125c7 4210 struct pgv *pg_vec;
4211 int i;
4212
398f0132 4213 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
4214 if (unlikely(!pg_vec))
4215 goto out;
4216
4217 for (i = 0; i < block_nr; i++) {
3a7ad063 4218 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
0e3125c7 4219 if (unlikely(!pg_vec[i].buffer))
4220 goto out_free_pgvec;
4221 }
4222
4223out:
4224 return pg_vec;
4225
4226out_free_pgvec:
3a7ad063 4227 free_pg_vec(pg_vec, order, block_nr);
4228 pg_vec = NULL;
4229 goto out;
4230}
1da177e4 4231
f6fb8f10 4232static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
69e3c75f 4233 int closing, int tx_ring)
1da177e4 4234{
0e3125c7 4235 struct pgv *pg_vec = NULL;
1da177e4 4236 struct packet_sock *po = pkt_sk(sk);
3a7ad063 4237 int was_running, order = 0;
4238 struct packet_ring_buffer *rb;
4239 struct sk_buff_head *rb_queue;
0e11c91e 4240 __be16 num;
f6fb8f10 4241 int err = -EINVAL;
4242	/* Alias to keep code churn minimal */
4243 struct tpacket_req *req = &req_u->req;
4244
4245 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
4246 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
1da177e4 4247
4248 err = -EBUSY;
4249 if (!closing) {
4250 if (atomic_read(&po->mapped))
4251 goto out;
b0138408 4252 if (packet_read_pending(rb))
4253 goto out;
4254 }
1da177e4 4255
69e3c75f 4256 if (req->tp_block_nr) {
4257 unsigned int min_frame_size;
4258
4259 /* Sanity tests and some calculations */
4260 err = -EBUSY;
4261 if (unlikely(rb->pg_vec))
4262 goto out;
1da177e4 4263
4264 switch (po->tp_version) {
4265 case TPACKET_V1:
4266 po->tp_hdrlen = TPACKET_HDRLEN;
4267 break;
4268 case TPACKET_V2:
4269 po->tp_hdrlen = TPACKET2_HDRLEN;
4270 break;
f6fb8f10 4271 case TPACKET_V3:
4272 po->tp_hdrlen = TPACKET3_HDRLEN;
4273 break;
4274 }
4275
69e3c75f 4276 err = -EINVAL;
4ebf0ae2 4277 if (unlikely((int)req->tp_block_size <= 0))
69e3c75f 4278 goto out;
90836b67 4279 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
69e3c75f 4280 goto out;
4576cd46 4281 min_frame_size = po->tp_hdrlen + po->tp_reserve;
dc808110 4282 if (po->tp_version >= TPACKET_V3 &&
4283 req->tp_block_size <
4284 BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
dc808110 4285 goto out;
4576cd46 4286 if (unlikely(req->tp_frame_size < min_frame_size))
69e3c75f 4287 goto out;
4ebf0ae2 4288 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
69e3c75f 4289 goto out;
1da177e4 4290
4291 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
4292 if (unlikely(rb->frames_per_block == 0))
69e3c75f 4293 goto out;
fc62814d 4294 if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
8f8d28e4 4295 goto out;
4296 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
4297 req->tp_frame_nr))
4298 goto out;
4299
4300 err = -ENOMEM;
4301 order = get_order(req->tp_block_size);
4302 pg_vec = alloc_pg_vec(req, order);
4ebf0ae2 4303 if (unlikely(!pg_vec))
1da177e4 4304 goto out;
f6fb8f10 4305 switch (po->tp_version) {
4306 case TPACKET_V3:
4307 /* Block transmit is not supported yet */
4308 if (!tx_ring) {
e8e85cc5 4309 init_prb_bdqc(po, rb, pg_vec, req_u);
4310 } else {
4311 struct tpacket_req3 *req3 = &req_u->req3;
4312
4313 if (req3->tp_retire_blk_tov ||
4314 req3->tp_sizeof_priv ||
4315 req3->tp_feature_req_word) {
4316 err = -EINVAL;
4317 goto out;
4318 }
4319 }
d7cf0c34 4320 break;
f6fb8f10 4321 default:
4322 break;
4323 }
4324 }
4325 /* Done */
4326 else {
4327 err = -EINVAL;
4ebf0ae2 4328 if (unlikely(req->tp_frame_nr))
69e3c75f 4329 goto out;
4330 }
4331
4332
4333 /* Detach socket from network */
4334 spin_lock(&po->bind_lock);
4335 was_running = po->running;
4336 num = po->num;
4337 if (was_running) {
1da177e4 4338 po->num = 0;
ce06b03e 4339 __unregister_prot_hook(sk, false);
4340 }
4341 spin_unlock(&po->bind_lock);
1ce4f28b 4342
4343 synchronize_net();
4344
4345 err = -EBUSY;
905db440 4346 mutex_lock(&po->pg_vec_lock);
4347 if (closing || atomic_read(&po->mapped) == 0) {
4348 err = 0;
69e3c75f 4349 spin_lock_bh(&rb_queue->lock);
c053fd96 4350 swap(rb->pg_vec, pg_vec);
4351 rb->frame_max = (req->tp_frame_nr - 1);
4352 rb->head = 0;
4353 rb->frame_size = req->tp_frame_size;
4354 spin_unlock_bh(&rb_queue->lock);
4355
3a7ad063 4356 swap(rb->pg_vec_order, order);
c053fd96 4357 swap(rb->pg_vec_len, req->tp_block_nr);
4358
4359 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4360 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4361 tpacket_rcv : packet_rcv;
4362 skb_queue_purge(rb_queue);
1da177e4 4363 if (atomic_read(&po->mapped))
4364 pr_err("packet_mmap: vma is busy: %d\n",
4365 atomic_read(&po->mapped));
1da177e4 4366 }
905db440 4367 mutex_unlock(&po->pg_vec_lock);
4368
4369 spin_lock(&po->bind_lock);
ce06b03e 4370 if (was_running) {
1da177e4 4371 po->num = num;
ce06b03e 4372 register_prot_hook(sk);
4373 }
4374 spin_unlock(&po->bind_lock);
c800aaf8 4375 if (pg_vec && (po->tp_version > TPACKET_V2)) {
f6fb8f10 4376 /* Because we don't support block-based V3 on tx-ring */
4377 if (!tx_ring)
73d0fcf2 4378 prb_shutdown_retire_blk_timer(po, rb_queue);
f6fb8f10 4379 }
1da177e4 4380
1da177e4 4381 if (pg_vec)
3a7ad063 4382 free_pg_vec(pg_vec, order, req->tp_block_nr);
4383out:
4384 return err;
4385}
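/*
 * Illustrative sketch (not part of this file): consuming a mapped TPACKET_V2
 * receive ring set up by packet_set_ring().  Each frame begins with a
 * struct tpacket2_hdr; the kernel hands a frame to userspace by setting
 * TP_STATUS_USER, and userspace returns it by writing TP_STATUS_KERNEL.
 * "char *ring", "req" (a struct tpacket_req) and fd are assumed to come from
 * the mmap()/setsockopt() steps shown elsewhere, and real code would add
 * memory barriers around the status accesses.
 *
 *	#include <poll.h>
 *
 *	unsigned int fpb = req.tp_block_size / req.tp_frame_size;
 *	unsigned int i;
 *
 *	for (i = 0; ; i = (i + 1) % req.tp_frame_nr) {
 *		struct tpacket2_hdr *hdr = (void *)(ring +
 *			(i / fpb) * req.tp_block_size +
 *			(i % fpb) * req.tp_frame_size);
 *
 *		while (!(hdr->tp_status & TP_STATUS_USER)) {
 *			struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *			poll(&pfd, 1, -1);
 *		}
 *		// packet data starts at (char *)hdr + hdr->tp_mac,
 *		// hdr->tp_snaplen bytes of it were captured
 *		hdr->tp_status = TP_STATUS_KERNEL;
 *	}
 */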
4386
4387static int packet_mmap(struct file *file, struct socket *sock,
4388 struct vm_area_struct *vma)
4389{
4390 struct sock *sk = sock->sk;
4391 struct packet_sock *po = pkt_sk(sk);
4392 unsigned long size, expected_size;
4393 struct packet_ring_buffer *rb;
4394 unsigned long start;
4395 int err = -EINVAL;
4396 int i;
4397
4398 if (vma->vm_pgoff)
4399 return -EINVAL;
4400
905db440 4401 mutex_lock(&po->pg_vec_lock);
4402
4403 expected_size = 0;
4404 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4405 if (rb->pg_vec) {
4406 expected_size += rb->pg_vec_len
4407 * rb->pg_vec_pages
4408 * PAGE_SIZE;
4409 }
4410 }
4411
4412 if (expected_size == 0)
1da177e4 4413 goto out;
4414
4415 size = vma->vm_end - vma->vm_start;
4416 if (size != expected_size)
4417 goto out;
4418
1da177e4 4419 start = vma->vm_start;
4420 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4421 if (rb->pg_vec == NULL)
4422 continue;
4423
4424 for (i = 0; i < rb->pg_vec_len; i++) {
4425 struct page *page;
4426 void *kaddr = rb->pg_vec[i].buffer;
4427 int pg_num;
4428
4429 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4430 page = pgv_to_page(kaddr);
4431 err = vm_insert_page(vma, start, page);
4432 if (unlikely(err))
4433 goto out;
4434 start += PAGE_SIZE;
0e3125c7 4435 kaddr += PAGE_SIZE;
69e3c75f 4436 }
4ebf0ae2 4437 }
1da177e4 4438 }
69e3c75f 4439
4ebf0ae2 4440 atomic_inc(&po->mapped);
4441 vma->vm_ops = &packet_mmap_ops;
4442 err = 0;
4443
4444out:
905db440 4445 mutex_unlock(&po->pg_vec_lock);
4446 return err;
4447}
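/*
 * Illustrative sketch (not part of this file): packet_mmap() above requires
 * a single mapping at offset 0 whose length is exactly the sum of the RX and
 * TX rings, in that order.  Assuming req_rx and req_tx were the arguments to
 * PACKET_RX_RING and PACKET_TX_RING:
 *
 *	#include <sys/mman.h>
 *
 *	size_t rx_len = (size_t)req_rx.tp_block_size * req_rx.tp_block_nr;
 *	size_t tx_len = (size_t)req_tx.tp_block_size * req_tx.tp_block_nr;
 *	char *ring = mmap(NULL, rx_len + tx_len, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 *
 *	// RX frames start at ring, TX frames at ring + rx_len
 */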
1da177e4 4448
90ddc4f0 4449static const struct proto_ops packet_ops_spkt = {
4450 .family = PF_PACKET,
4451 .owner = THIS_MODULE,
4452 .release = packet_release,
4453 .bind = packet_bind_spkt,
4454 .connect = sock_no_connect,
4455 .socketpair = sock_no_socketpair,
4456 .accept = sock_no_accept,
4457 .getname = packet_getname_spkt,
a11e1d43 4458 .poll = datagram_poll,
1da177e4 4459 .ioctl = packet_ioctl,
c7cbdbf2 4460 .gettstamp = sock_gettstamp,
4461 .listen = sock_no_listen,
4462 .shutdown = sock_no_shutdown,
4463 .setsockopt = sock_no_setsockopt,
4464 .getsockopt = sock_no_getsockopt,
4465 .sendmsg = packet_sendmsg_spkt,
4466 .recvmsg = packet_recvmsg,
4467 .mmap = sock_no_mmap,
4468 .sendpage = sock_no_sendpage,
4469};
1da177e4 4470
90ddc4f0 4471static const struct proto_ops packet_ops = {
4472 .family = PF_PACKET,
4473 .owner = THIS_MODULE,
4474 .release = packet_release,
4475 .bind = packet_bind,
4476 .connect = sock_no_connect,
4477 .socketpair = sock_no_socketpair,
4478 .accept = sock_no_accept,
1ce4f28b 4479 .getname = packet_getname,
a11e1d43 4480 .poll = packet_poll,
1da177e4 4481 .ioctl = packet_ioctl,
c7cbdbf2 4482 .gettstamp = sock_gettstamp,
4483 .listen = sock_no_listen,
4484 .shutdown = sock_no_shutdown,
4485 .setsockopt = packet_setsockopt,
4486 .getsockopt = packet_getsockopt,
4487#ifdef CONFIG_COMPAT
4488 .compat_setsockopt = compat_packet_setsockopt,
4489#endif
4490 .sendmsg = packet_sendmsg,
4491 .recvmsg = packet_recvmsg,
4492 .mmap = packet_mmap,
4493 .sendpage = sock_no_sendpage,
4494};
4495
ec1b4cf7 4496static const struct net_proto_family packet_family_ops = {
4497 .family = PF_PACKET,
4498 .create = packet_create,
4499 .owner = THIS_MODULE,
4500};
4501
4502static struct notifier_block packet_netdev_notifier = {
40d4e3df 4503 .notifier_call = packet_notifier,
4504};
4505
4506#ifdef CONFIG_PROC_FS
4507
4508static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
808f5114 4509 __acquires(RCU)
1da177e4 4510{
e372c414 4511 struct net *net = seq_file_net(seq);
808f5114 4512
4513 rcu_read_lock();
4514 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
4515}
4516
4517static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4518{
1bf40954 4519 struct net *net = seq_file_net(seq);
808f5114 4520 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
4521}
4522
4523static void packet_seq_stop(struct seq_file *seq, void *v)
808f5114 4524 __releases(RCU)
1da177e4 4525{
808f5114 4526 rcu_read_unlock();
4527}
4528
1ce4f28b 4529static int packet_seq_show(struct seq_file *seq, void *v)
4530{
4531 if (v == SEQ_START_TOKEN)
4532 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4533 else {
b7ceabd9 4534 struct sock *s = sk_entry(v);
4535 const struct packet_sock *po = pkt_sk(s);
4536
4537 seq_printf(seq,
71338aa7 4538 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1da177e4 4539 s,
41c6d650 4540 refcount_read(&s->sk_refcnt),
4541 s->sk_type,
4542 ntohs(po->num),
4543 po->ifindex,
4544 po->running,
4545 atomic_read(&s->sk_rmem_alloc),
a7cb5a49 4546 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
40d4e3df 4547 sock_i_ino(s));
4548 }
4549
4550 return 0;
4551}
4552
56b3d975 4553static const struct seq_operations packet_seq_ops = {
4554 .start = packet_seq_start,
4555 .next = packet_seq_next,
4556 .stop = packet_seq_stop,
4557 .show = packet_seq_show,
4558};
4559#endif
4560
2c8c1e72 4561static int __net_init packet_net_init(struct net *net)
d12d01d6 4562{
0fa7fa98 4563 mutex_init(&net->packet.sklist_lock);
2aaef4e4 4564 INIT_HLIST_HEAD(&net->packet.sklist);
d12d01d6 4565
4566 if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
4567 sizeof(struct seq_net_private)))
4568 return -ENOMEM;
4569
4570 return 0;
4571}
4572
2c8c1e72 4573static void __net_exit packet_net_exit(struct net *net)
d12d01d6 4574{
ece31ffd 4575 remove_proc_entry("packet", net->proc_net);
669f8f1a 4576 WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
4577}
4578
4579static struct pernet_operations packet_net_ops = {
4580 .init = packet_net_init,
4581 .exit = packet_net_exit,
4582};
4583
4584
4585static void __exit packet_exit(void)
4586{
1da177e4 4587 unregister_netdevice_notifier(&packet_netdev_notifier);
d12d01d6 4588 unregister_pernet_subsys(&packet_net_ops);
4589 sock_unregister(PF_PACKET);
4590 proto_unregister(&packet_proto);
4591}
4592
4593static int __init packet_init(void)
4594{
36096f2f 4595 int rc;
1da177e4 4596
4597 rc = proto_register(&packet_proto, 0);
4598 if (rc)
1da177e4 4599 goto out;
4600 rc = sock_register(&packet_family_ops);
4601 if (rc)
4602 goto out_proto;
4603 rc = register_pernet_subsys(&packet_net_ops);
4604 if (rc)
4605 goto out_sock;
4606 rc = register_netdevice_notifier(&packet_netdev_notifier);
4607 if (rc)
4608 goto out_pernet;
1da177e4 4609
4610 return 0;
4611
4612out_pernet:
4613 unregister_pernet_subsys(&packet_net_ops);
4614out_sock:
4615 sock_unregister(PF_PACKET);
4616out_proto:
4617 proto_unregister(&packet_proto);
4618out:
4619 return rc;
4620}
4621
4622module_init(packet_init);
4623module_exit(packet_exit);
4624MODULE_LICENSE("GPL");
4625MODULE_ALIAS_NETPROTO(PF_PACKET);