mirror_ubuntu-zesty-kernel.git: net/core/sock.c (blob at commit "sock: add sk_dst_pending_confirm flag")
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
8 *
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 * This program is free software; you can redistribute it and/or
87 * modify it under the terms of the GNU General Public License
88 * as published by the Free Software Foundation; either version
89 * 2 of the License, or (at your option) any later version.
90 */
91
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/timer.h>
106 #include <linux/string.h>
107 #include <linux/sockios.h>
108 #include <linux/net.h>
109 #include <linux/mm.h>
110 #include <linux/slab.h>
111 #include <linux/interrupt.h>
112 #include <linux/poll.h>
113 #include <linux/tcp.h>
114 #include <linux/init.h>
115 #include <linux/highmem.h>
116 #include <linux/user_namespace.h>
117 #include <linux/static_key.h>
118 #include <linux/memcontrol.h>
119 #include <linux/prefetch.h>
120
121 #include <linux/uaccess.h>
122
123 #include <linux/netdevice.h>
124 #include <net/protocol.h>
125 #include <linux/skbuff.h>
126 #include <net/net_namespace.h>
127 #include <net/request_sock.h>
128 #include <net/sock.h>
129 #include <linux/net_tstamp.h>
130 #include <net/xfrm.h>
131 #include <linux/ipsec.h>
132 #include <net/cls_cgroup.h>
133 #include <net/netprio_cgroup.h>
134 #include <linux/sock_diag.h>
135
136 #include <linux/filter.h>
137 #include <net/sock_reuseport.h>
138
139 #include <trace/events/sock.h>
140
141 #ifdef CONFIG_INET
142 #include <net/tcp.h>
143 #endif
144
145 #include <net/busy_poll.h>
146
147 static DEFINE_MUTEX(proto_list_mutex);
148 static LIST_HEAD(proto_list);
149
150 /**
151 * sk_ns_capable - General socket capability test
152 * @sk: Socket to use a capability on or through
153 * @user_ns: The user namespace of the capability to use
154 * @cap: The capability to use
155 *
156  * Test to see if the opener of the socket had the capability @cap when
157  * the socket was created and if the current process has it in the user
158  * namespace @user_ns.
159 */
160 bool sk_ns_capable(const struct sock *sk,
161 struct user_namespace *user_ns, int cap)
162 {
163 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
164 ns_capable(user_ns, cap);
165 }
166 EXPORT_SYMBOL(sk_ns_capable);
167
168 /**
169 * sk_capable - Socket global capability test
170 * @sk: Socket to use a capability on or through
171 * @cap: The global capability to use
172 *
173  * Test to see if the opener of the socket had the capability @cap when
174  * the socket was created and if the current process has it in all user
175  * namespaces.
176 */
177 bool sk_capable(const struct sock *sk, int cap)
178 {
179 return sk_ns_capable(sk, &init_user_ns, cap);
180 }
181 EXPORT_SYMBOL(sk_capable);
182
183 /**
184 * sk_net_capable - Network namespace socket capability test
185 * @sk: Socket to use a capability on or through
186 * @cap: The capability to use
187 *
188  * Test to see if the opener of the socket had the capability @cap when the
189  * socket was created and if the current process has it over the network
190  * namespace the socket is a member of.
191 */
192 bool sk_net_capable(const struct sock *sk, int cap)
193 {
194 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
195 }
196 EXPORT_SYMBOL(sk_net_capable);
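/* Illustrative sketch (not taken from this file; the option name below is
 * hypothetical): a protocol's setsockopt handler would typically gate a
 * privileged option on one of the helpers above, much like the SO_MARK
 * case in sock_setsockopt() further down:
 *
 *	case SO_EXAMPLE_PRIV_OPT:
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		...
 */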
197
198 /*
199 * Each address family might have different locking rules, so we have
200 * one slock key per address family:
201 */
202 static struct lock_class_key af_family_keys[AF_MAX];
203 static struct lock_class_key af_family_slock_keys[AF_MAX];
204
205 /*
206 * Make lock validator output more readable. (we pre-construct these
207 * strings build-time, so that runtime initialization of socket
208 * locks is fast):
209 */
210 static const char *const af_family_key_strings[AF_MAX+1] = {
211 "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" ,
212 "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK",
213 "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" ,
214 "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" ,
215 "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
216 "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
217 "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
218 "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
219 "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
220 "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
221 "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
222 "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
223 "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" ,
224 "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" ,
225 "sk_lock-AF_QIPCRTR", "sk_lock-AF_MAX"
226 };
227 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
228 "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
229 "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK",
230 "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" ,
231 "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" ,
232 "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
233 "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
234 "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
235 "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" ,
236 "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
237 "slock-27" , "slock-28" , "slock-AF_CAN" ,
238 "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
239 "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
240 "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
241 "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" ,
242 "slock-AF_QIPCRTR", "slock-AF_MAX"
243 };
244 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
245 "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
246 "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
247 "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
248 "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" ,
249 "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
250 "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" ,
251 "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" ,
252 "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" ,
253 "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" ,
254 "clock-27" , "clock-28" , "clock-AF_CAN" ,
255 "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
256 "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
257 "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" ,
258 "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" ,
259 "clock-AF_QIPCRTR", "clock-AF_MAX"
260 };
261
262 /*
263 * sk_callback_lock locking rules are per-address-family,
264 * so split the lock classes by using a per-AF key:
265 */
266 static struct lock_class_key af_callback_keys[AF_MAX];
267
268 /* Take into consideration the size of the struct sk_buff overhead in the
269 * determination of these values, since that is non-constant across
270 * platforms. This makes socket queueing behavior and performance
271 * not depend upon such differences.
272 */
273 #define _SK_MEM_PACKETS 256
274 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256)
275 #define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
276 #define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
277
278 /* Run time adjustable parameters. */
279 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
280 EXPORT_SYMBOL(sysctl_wmem_max);
281 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
282 EXPORT_SYMBOL(sysctl_rmem_max);
283 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
284 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
285
286 /* Maximal space eaten by iovec or ancillary data plus some space */
287 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
288 EXPORT_SYMBOL(sysctl_optmem_max);
289
290 int sysctl_tstamp_allow_data __read_mostly = 1;
291
292 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
293 EXPORT_SYMBOL_GPL(memalloc_socks);
294
295 /**
296 * sk_set_memalloc - sets %SOCK_MEMALLOC
297 * @sk: socket to set it on
298 *
299 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
300 * It's the responsibility of the admin to adjust min_free_kbytes
301  * to meet the requirements.
302 */
303 void sk_set_memalloc(struct sock *sk)
304 {
305 sock_set_flag(sk, SOCK_MEMALLOC);
306 sk->sk_allocation |= __GFP_MEMALLOC;
307 static_key_slow_inc(&memalloc_socks);
308 }
309 EXPORT_SYMBOL_GPL(sk_set_memalloc);
310
311 void sk_clear_memalloc(struct sock *sk)
312 {
313 sock_reset_flag(sk, SOCK_MEMALLOC);
314 sk->sk_allocation &= ~__GFP_MEMALLOC;
315 static_key_slow_dec(&memalloc_socks);
316
317 /*
318 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
319 * progress of swapping. SOCK_MEMALLOC may be cleared while
320 * it has rmem allocations due to the last swapfile being deactivated
321 * but there is a risk that the socket is unusable due to exceeding
322 * the rmem limits. Reclaim the reserves and obey rmem limits again.
323 */
324 sk_mem_reclaim(sk);
325 }
326 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
327
328 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
329 {
330 int ret;
331 unsigned long pflags = current->flags;
332
333 /* these should have been dropped before queueing */
334 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
335
336 current->flags |= PF_MEMALLOC;
337 ret = sk->sk_backlog_rcv(sk, skb);
338 tsk_restore_flags(current, pflags, PF_MEMALLOC);
339
340 return ret;
341 }
342 EXPORT_SYMBOL(__sk_backlog_rcv);
343
344 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
345 {
346 struct timeval tv;
347
348 if (optlen < sizeof(tv))
349 return -EINVAL;
350 if (copy_from_user(&tv, optval, sizeof(tv)))
351 return -EFAULT;
352 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
353 return -EDOM;
354
355 if (tv.tv_sec < 0) {
356 static int warned __read_mostly;
357
358 *timeo_p = 0;
359 if (warned < 10 && net_ratelimit()) {
360 warned++;
361 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
362 __func__, current->comm, task_pid_nr(current));
363 }
364 return 0;
365 }
366 *timeo_p = MAX_SCHEDULE_TIMEOUT;
367 if (tv.tv_sec == 0 && tv.tv_usec == 0)
368 return 0;
369 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
370 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
371 return 0;
372 }
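/* Worked example (assuming HZ == 250, i.e. 4000 us per jiffy): a timeout of
 * { .tv_sec = 1, .tv_usec = 500000 } passed via SO_RCVTIMEO is converted
 * above to 1 * 250 + (500000 + 3999) / 4000 = 375 jiffies; microseconds are
 * rounded up to whole jiffies.  A timeout of { 0, 0 } means "no timeout"
 * and maps to MAX_SCHEDULE_TIMEOUT.
 */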
373
374 static void sock_warn_obsolete_bsdism(const char *name)
375 {
376 static int warned;
377 static char warncomm[TASK_COMM_LEN];
378 if (strcmp(warncomm, current->comm) && warned < 5) {
379 strcpy(warncomm, current->comm);
380 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
381 warncomm, name);
382 warned++;
383 }
384 }
385
386 static bool sock_needs_netstamp(const struct sock *sk)
387 {
388 switch (sk->sk_family) {
389 case AF_UNSPEC:
390 case AF_UNIX:
391 return false;
392 default:
393 return true;
394 }
395 }
396
397 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
398 {
399 if (sk->sk_flags & flags) {
400 sk->sk_flags &= ~flags;
401 if (sock_needs_netstamp(sk) &&
402 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
403 net_disable_timestamp();
404 }
405 }
406
407
408 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
409 {
410 unsigned long flags;
411 struct sk_buff_head *list = &sk->sk_receive_queue;
412
413 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
414 atomic_inc(&sk->sk_drops);
415 trace_sock_rcvqueue_full(sk, skb);
416 return -ENOMEM;
417 }
418
419 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
420 atomic_inc(&sk->sk_drops);
421 return -ENOBUFS;
422 }
423
424 skb->dev = NULL;
425 skb_set_owner_r(skb, sk);
426
427 	/* We escape from the RCU-protected region, so make sure we don't leak
428 	 * a non-refcounted dst.
429 */
430 skb_dst_force(skb);
431
432 spin_lock_irqsave(&list->lock, flags);
433 sock_skb_set_dropcount(sk, skb);
434 __skb_queue_tail(list, skb);
435 spin_unlock_irqrestore(&list->lock, flags);
436
437 if (!sock_flag(sk, SOCK_DEAD))
438 sk->sk_data_ready(sk);
439 return 0;
440 }
441 EXPORT_SYMBOL(__sock_queue_rcv_skb);
442
443 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
444 {
445 int err;
446
447 err = sk_filter(sk, skb);
448 if (err)
449 return err;
450
451 return __sock_queue_rcv_skb(sk, skb);
452 }
453 EXPORT_SYMBOL(sock_queue_rcv_skb);
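/* Illustrative sketch (an assumed, simplified caller): a datagram receive
 * path usually hands packets to the helper above and frees the skb itself
 * on failure, since sock_queue_rcv_skb() does not consume the skb when it
 * returns an error:
 *
 *	rc = sock_queue_rcv_skb(sk, skb);
 *	if (rc < 0) {
 *		kfree_skb(skb);
 *		return rc;
 *	}
 */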
454
455 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
456 const int nested, unsigned int trim_cap, bool refcounted)
457 {
458 int rc = NET_RX_SUCCESS;
459
460 if (sk_filter_trim_cap(sk, skb, trim_cap))
461 goto discard_and_relse;
462
463 skb->dev = NULL;
464
465 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
466 atomic_inc(&sk->sk_drops);
467 goto discard_and_relse;
468 }
469 if (nested)
470 bh_lock_sock_nested(sk);
471 else
472 bh_lock_sock(sk);
473 if (!sock_owned_by_user(sk)) {
474 /*
475 * trylock + unlock semantics:
476 */
477 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
478
479 rc = sk_backlog_rcv(sk, skb);
480
481 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
482 } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
483 bh_unlock_sock(sk);
484 atomic_inc(&sk->sk_drops);
485 goto discard_and_relse;
486 }
487
488 bh_unlock_sock(sk);
489 out:
490 if (refcounted)
491 sock_put(sk);
492 return rc;
493 discard_and_relse:
494 kfree_skb(skb);
495 goto out;
496 }
497 EXPORT_SYMBOL(__sk_receive_skb);
498
499 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
500 {
501 struct dst_entry *dst = __sk_dst_get(sk);
502
503 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
504 sk_tx_queue_clear(sk);
505 sk->sk_dst_pending_confirm = 0;
506 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
507 dst_release(dst);
508 return NULL;
509 }
510
511 return dst;
512 }
513 EXPORT_SYMBOL(__sk_dst_check);
514
515 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
516 {
517 struct dst_entry *dst = sk_dst_get(sk);
518
519 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
520 sk_dst_reset(sk);
521 dst_release(dst);
522 return NULL;
523 }
524
525 return dst;
526 }
527 EXPORT_SYMBOL(sk_dst_check);
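/* Illustrative sketch (a simplified pattern, not a verbatim caller): an
 * output path typically revalidates the cached route with sk_dst_check()
 * and falls back to a fresh lookup when the cache has gone stale:
 *
 *	dst = sk_dst_check(sk, 0);
 *	if (!dst) {
 *		dst = <protocol-specific route lookup>;
 *		if (!IS_ERR(dst))
 *			sk_dst_set(sk, dst);
 *	}
 */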
528
529 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
530 int optlen)
531 {
532 int ret = -ENOPROTOOPT;
533 #ifdef CONFIG_NETDEVICES
534 struct net *net = sock_net(sk);
535 char devname[IFNAMSIZ];
536 int index;
537
538 /* Sorry... */
539 ret = -EPERM;
540 if (!ns_capable(net->user_ns, CAP_NET_RAW))
541 goto out;
542
543 ret = -EINVAL;
544 if (optlen < 0)
545 goto out;
546
547 /* Bind this socket to a particular device like "eth0",
548 * as specified in the passed interface name. If the
549 * name is "" or the option length is zero the socket
550 * is not bound.
551 */
552 if (optlen > IFNAMSIZ - 1)
553 optlen = IFNAMSIZ - 1;
554 memset(devname, 0, sizeof(devname));
555
556 ret = -EFAULT;
557 if (copy_from_user(devname, optval, optlen))
558 goto out;
559
560 index = 0;
561 if (devname[0] != '\0') {
562 struct net_device *dev;
563
564 rcu_read_lock();
565 dev = dev_get_by_name_rcu(net, devname);
566 if (dev)
567 index = dev->ifindex;
568 rcu_read_unlock();
569 ret = -ENODEV;
570 if (!dev)
571 goto out;
572 }
573
574 lock_sock(sk);
575 sk->sk_bound_dev_if = index;
576 sk_dst_reset(sk);
577 release_sock(sk);
578
579 ret = 0;
580
581 out:
582 #endif
583
584 return ret;
585 }
586
587 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
588 int __user *optlen, int len)
589 {
590 int ret = -ENOPROTOOPT;
591 #ifdef CONFIG_NETDEVICES
592 struct net *net = sock_net(sk);
593 char devname[IFNAMSIZ];
594
595 if (sk->sk_bound_dev_if == 0) {
596 len = 0;
597 goto zero;
598 }
599
600 ret = -EINVAL;
601 if (len < IFNAMSIZ)
602 goto out;
603
604 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
605 if (ret)
606 goto out;
607
608 len = strlen(devname) + 1;
609
610 ret = -EFAULT;
611 if (copy_to_user(optval, devname, len))
612 goto out;
613
614 zero:
615 ret = -EFAULT;
616 if (put_user(len, optlen))
617 goto out;
618
619 ret = 0;
620
621 out:
622 #endif
623
624 return ret;
625 }
626
627 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
628 {
629 if (valbool)
630 sock_set_flag(sk, bit);
631 else
632 sock_reset_flag(sk, bit);
633 }
634
635 bool sk_mc_loop(struct sock *sk)
636 {
637 if (dev_recursion_level())
638 return false;
639 if (!sk)
640 return true;
641 switch (sk->sk_family) {
642 case AF_INET:
643 return inet_sk(sk)->mc_loop;
644 #if IS_ENABLED(CONFIG_IPV6)
645 case AF_INET6:
646 return inet6_sk(sk)->mc_loop;
647 #endif
648 }
649 WARN_ON(1);
650 return true;
651 }
652 EXPORT_SYMBOL(sk_mc_loop);
653
654 /*
655 * This is meant for all protocols to use and covers goings on
656 * at the socket level. Everything here is generic.
657 */
658
659 int sock_setsockopt(struct socket *sock, int level, int optname,
660 char __user *optval, unsigned int optlen)
661 {
662 struct sock *sk = sock->sk;
663 int val;
664 int valbool;
665 struct linger ling;
666 int ret = 0;
667
668 /*
669 * Options without arguments
670 */
671
672 if (optname == SO_BINDTODEVICE)
673 return sock_setbindtodevice(sk, optval, optlen);
674
675 if (optlen < sizeof(int))
676 return -EINVAL;
677
678 if (get_user(val, (int __user *)optval))
679 return -EFAULT;
680
681 valbool = val ? 1 : 0;
682
683 lock_sock(sk);
684
685 switch (optname) {
686 case SO_DEBUG:
687 if (val && !capable(CAP_NET_ADMIN))
688 ret = -EACCES;
689 else
690 sock_valbool_flag(sk, SOCK_DBG, valbool);
691 break;
692 case SO_REUSEADDR:
693 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
694 break;
695 case SO_REUSEPORT:
696 sk->sk_reuseport = valbool;
697 break;
698 case SO_TYPE:
699 case SO_PROTOCOL:
700 case SO_DOMAIN:
701 case SO_ERROR:
702 ret = -ENOPROTOOPT;
703 break;
704 case SO_DONTROUTE:
705 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
706 break;
707 case SO_BROADCAST:
708 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
709 break;
710 case SO_SNDBUF:
711 		/* Don't return an error on this; BSD doesn't, and if you think
712 		 * about it, this is right. Otherwise apps have to
713 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
714 		 * are treated in BSD as hints.
715 */
716 val = min_t(u32, val, sysctl_wmem_max);
717 set_sndbuf:
718 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
719 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
720 /* Wake up sending tasks if we upped the value. */
721 sk->sk_write_space(sk);
722 break;
723
724 case SO_SNDBUFFORCE:
725 if (!capable(CAP_NET_ADMIN)) {
726 ret = -EPERM;
727 break;
728 }
729 goto set_sndbuf;
730
731 case SO_RCVBUF:
732 		/* Don't return an error on this; BSD doesn't, and if you think
733 		 * about it, this is right. Otherwise apps have to
734 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
735 		 * are treated in BSD as hints.
736 */
737 val = min_t(u32, val, sysctl_rmem_max);
738 set_rcvbuf:
739 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
740 /*
741 * We double it on the way in to account for
742 * "struct sk_buff" etc. overhead. Applications
743 * assume that the SO_RCVBUF setting they make will
744 * allow that much actual data to be received on that
745 * socket.
746 *
747 * Applications are unaware that "struct sk_buff" and
748 * other overheads allocate from the receive buffer
749 * during socket buffer allocation.
750 *
751 * And after considering the possible alternatives,
752 * returning the value we actually used in getsockopt
753 * is the most desirable behavior.
754 */
755 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
756 break;
757
758 case SO_RCVBUFFORCE:
759 if (!capable(CAP_NET_ADMIN)) {
760 ret = -EPERM;
761 break;
762 }
763 goto set_rcvbuf;
764
765 case SO_KEEPALIVE:
766 #ifdef CONFIG_INET
767 if (sk->sk_protocol == IPPROTO_TCP &&
768 sk->sk_type == SOCK_STREAM)
769 tcp_set_keepalive(sk, valbool);
770 #endif
771 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
772 break;
773
774 case SO_OOBINLINE:
775 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
776 break;
777
778 case SO_NO_CHECK:
779 sk->sk_no_check_tx = valbool;
780 break;
781
782 case SO_PRIORITY:
783 if ((val >= 0 && val <= 6) ||
784 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
785 sk->sk_priority = val;
786 else
787 ret = -EPERM;
788 break;
789
790 case SO_LINGER:
791 if (optlen < sizeof(ling)) {
792 ret = -EINVAL; /* 1003.1g */
793 break;
794 }
795 if (copy_from_user(&ling, optval, sizeof(ling))) {
796 ret = -EFAULT;
797 break;
798 }
799 if (!ling.l_onoff)
800 sock_reset_flag(sk, SOCK_LINGER);
801 else {
802 #if (BITS_PER_LONG == 32)
803 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
804 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
805 else
806 #endif
807 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
808 sock_set_flag(sk, SOCK_LINGER);
809 }
810 break;
811
812 case SO_BSDCOMPAT:
813 sock_warn_obsolete_bsdism("setsockopt");
814 break;
815
816 case SO_PASSCRED:
817 if (valbool)
818 set_bit(SOCK_PASSCRED, &sock->flags);
819 else
820 clear_bit(SOCK_PASSCRED, &sock->flags);
821 break;
822
823 case SO_TIMESTAMP:
824 case SO_TIMESTAMPNS:
825 if (valbool) {
826 if (optname == SO_TIMESTAMP)
827 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
828 else
829 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
830 sock_set_flag(sk, SOCK_RCVTSTAMP);
831 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
832 } else {
833 sock_reset_flag(sk, SOCK_RCVTSTAMP);
834 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
835 }
836 break;
837
838 case SO_TIMESTAMPING:
839 if (val & ~SOF_TIMESTAMPING_MASK) {
840 ret = -EINVAL;
841 break;
842 }
843
844 if (val & SOF_TIMESTAMPING_OPT_ID &&
845 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
846 if (sk->sk_protocol == IPPROTO_TCP &&
847 sk->sk_type == SOCK_STREAM) {
848 if ((1 << sk->sk_state) &
849 (TCPF_CLOSE | TCPF_LISTEN)) {
850 ret = -EINVAL;
851 break;
852 }
853 sk->sk_tskey = tcp_sk(sk)->snd_una;
854 } else {
855 sk->sk_tskey = 0;
856 }
857 }
858
859 if (val & SOF_TIMESTAMPING_OPT_STATS &&
860 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
861 ret = -EINVAL;
862 break;
863 }
864
865 sk->sk_tsflags = val;
866 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
867 sock_enable_timestamp(sk,
868 SOCK_TIMESTAMPING_RX_SOFTWARE);
869 else
870 sock_disable_timestamp(sk,
871 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
872 break;
873
874 case SO_RCVLOWAT:
875 if (val < 0)
876 val = INT_MAX;
877 sk->sk_rcvlowat = val ? : 1;
878 break;
879
880 case SO_RCVTIMEO:
881 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
882 break;
883
884 case SO_SNDTIMEO:
885 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
886 break;
887
888 case SO_ATTACH_FILTER:
889 ret = -EINVAL;
890 if (optlen == sizeof(struct sock_fprog)) {
891 struct sock_fprog fprog;
892
893 ret = -EFAULT;
894 if (copy_from_user(&fprog, optval, sizeof(fprog)))
895 break;
896
897 ret = sk_attach_filter(&fprog, sk);
898 }
899 break;
900
901 case SO_ATTACH_BPF:
902 ret = -EINVAL;
903 if (optlen == sizeof(u32)) {
904 u32 ufd;
905
906 ret = -EFAULT;
907 if (copy_from_user(&ufd, optval, sizeof(ufd)))
908 break;
909
910 ret = sk_attach_bpf(ufd, sk);
911 }
912 break;
913
914 case SO_ATTACH_REUSEPORT_CBPF:
915 ret = -EINVAL;
916 if (optlen == sizeof(struct sock_fprog)) {
917 struct sock_fprog fprog;
918
919 ret = -EFAULT;
920 if (copy_from_user(&fprog, optval, sizeof(fprog)))
921 break;
922
923 ret = sk_reuseport_attach_filter(&fprog, sk);
924 }
925 break;
926
927 case SO_ATTACH_REUSEPORT_EBPF:
928 ret = -EINVAL;
929 if (optlen == sizeof(u32)) {
930 u32 ufd;
931
932 ret = -EFAULT;
933 if (copy_from_user(&ufd, optval, sizeof(ufd)))
934 break;
935
936 ret = sk_reuseport_attach_bpf(ufd, sk);
937 }
938 break;
939
940 case SO_DETACH_FILTER:
941 ret = sk_detach_filter(sk);
942 break;
943
944 case SO_LOCK_FILTER:
945 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
946 ret = -EPERM;
947 else
948 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
949 break;
950
951 case SO_PASSSEC:
952 if (valbool)
953 set_bit(SOCK_PASSSEC, &sock->flags);
954 else
955 clear_bit(SOCK_PASSSEC, &sock->flags);
956 break;
957 case SO_MARK:
958 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
959 ret = -EPERM;
960 else
961 sk->sk_mark = val;
962 break;
963
964 case SO_RXQ_OVFL:
965 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
966 break;
967
968 case SO_WIFI_STATUS:
969 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
970 break;
971
972 case SO_PEEK_OFF:
973 if (sock->ops->set_peek_off)
974 ret = sock->ops->set_peek_off(sk, val);
975 else
976 ret = -EOPNOTSUPP;
977 break;
978
979 case SO_NOFCS:
980 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
981 break;
982
983 case SO_SELECT_ERR_QUEUE:
984 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
985 break;
986
987 #ifdef CONFIG_NET_RX_BUSY_POLL
988 case SO_BUSY_POLL:
989 /* allow unprivileged users to decrease the value */
990 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
991 ret = -EPERM;
992 else {
993 if (val < 0)
994 ret = -EINVAL;
995 else
996 sk->sk_ll_usec = val;
997 }
998 break;
999 #endif
1000
1001 case SO_MAX_PACING_RATE:
1002 sk->sk_max_pacing_rate = val;
1003 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1004 sk->sk_max_pacing_rate);
1005 break;
1006
1007 case SO_INCOMING_CPU:
1008 sk->sk_incoming_cpu = val;
1009 break;
1010
1011 case SO_CNX_ADVICE:
1012 if (val == 1)
1013 dst_negative_advice(sk);
1014 break;
1015 default:
1016 ret = -ENOPROTOOPT;
1017 break;
1018 }
1019 release_sock(sk);
1020 return ret;
1021 }
1022 EXPORT_SYMBOL(sock_setsockopt);
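/* Illustrative user-space sketch (assumption: standard Linux socket API;
 * the resulting value also depends on net.core.rmem_max): because the
 * SO_RCVBUF case above doubles the requested value to account for
 * struct sk_buff overhead, getsockopt() reports twice what was asked for:
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
 *	// val is now 131072, provided 65536 <= net.core.rmem_max
 */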
1023
1024
1025 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1026 struct ucred *ucred)
1027 {
1028 ucred->pid = pid_vnr(pid);
1029 ucred->uid = ucred->gid = -1;
1030 if (cred) {
1031 struct user_namespace *current_ns = current_user_ns();
1032
1033 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1034 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1035 }
1036 }
1037
1038 int sock_getsockopt(struct socket *sock, int level, int optname,
1039 char __user *optval, int __user *optlen)
1040 {
1041 struct sock *sk = sock->sk;
1042
1043 union {
1044 int val;
1045 struct linger ling;
1046 struct timeval tm;
1047 } v;
1048
1049 int lv = sizeof(int);
1050 int len;
1051
1052 if (get_user(len, optlen))
1053 return -EFAULT;
1054 if (len < 0)
1055 return -EINVAL;
1056
1057 memset(&v, 0, sizeof(v));
1058
1059 switch (optname) {
1060 case SO_DEBUG:
1061 v.val = sock_flag(sk, SOCK_DBG);
1062 break;
1063
1064 case SO_DONTROUTE:
1065 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1066 break;
1067
1068 case SO_BROADCAST:
1069 v.val = sock_flag(sk, SOCK_BROADCAST);
1070 break;
1071
1072 case SO_SNDBUF:
1073 v.val = sk->sk_sndbuf;
1074 break;
1075
1076 case SO_RCVBUF:
1077 v.val = sk->sk_rcvbuf;
1078 break;
1079
1080 case SO_REUSEADDR:
1081 v.val = sk->sk_reuse;
1082 break;
1083
1084 case SO_REUSEPORT:
1085 v.val = sk->sk_reuseport;
1086 break;
1087
1088 case SO_KEEPALIVE:
1089 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1090 break;
1091
1092 case SO_TYPE:
1093 v.val = sk->sk_type;
1094 break;
1095
1096 case SO_PROTOCOL:
1097 v.val = sk->sk_protocol;
1098 break;
1099
1100 case SO_DOMAIN:
1101 v.val = sk->sk_family;
1102 break;
1103
1104 case SO_ERROR:
1105 v.val = -sock_error(sk);
1106 if (v.val == 0)
1107 v.val = xchg(&sk->sk_err_soft, 0);
1108 break;
1109
1110 case SO_OOBINLINE:
1111 v.val = sock_flag(sk, SOCK_URGINLINE);
1112 break;
1113
1114 case SO_NO_CHECK:
1115 v.val = sk->sk_no_check_tx;
1116 break;
1117
1118 case SO_PRIORITY:
1119 v.val = sk->sk_priority;
1120 break;
1121
1122 case SO_LINGER:
1123 lv = sizeof(v.ling);
1124 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1125 v.ling.l_linger = sk->sk_lingertime / HZ;
1126 break;
1127
1128 case SO_BSDCOMPAT:
1129 sock_warn_obsolete_bsdism("getsockopt");
1130 break;
1131
1132 case SO_TIMESTAMP:
1133 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1134 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1135 break;
1136
1137 case SO_TIMESTAMPNS:
1138 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1139 break;
1140
1141 case SO_TIMESTAMPING:
1142 v.val = sk->sk_tsflags;
1143 break;
1144
1145 case SO_RCVTIMEO:
1146 lv = sizeof(struct timeval);
1147 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1148 v.tm.tv_sec = 0;
1149 v.tm.tv_usec = 0;
1150 } else {
1151 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1152 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1153 }
1154 break;
1155
1156 case SO_SNDTIMEO:
1157 lv = sizeof(struct timeval);
1158 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1159 v.tm.tv_sec = 0;
1160 v.tm.tv_usec = 0;
1161 } else {
1162 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1163 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1164 }
1165 break;
1166
1167 case SO_RCVLOWAT:
1168 v.val = sk->sk_rcvlowat;
1169 break;
1170
1171 case SO_SNDLOWAT:
1172 v.val = 1;
1173 break;
1174
1175 case SO_PASSCRED:
1176 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1177 break;
1178
1179 case SO_PEERCRED:
1180 {
1181 struct ucred peercred;
1182 if (len > sizeof(peercred))
1183 len = sizeof(peercred);
1184 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1185 if (copy_to_user(optval, &peercred, len))
1186 return -EFAULT;
1187 goto lenout;
1188 }
1189
1190 case SO_PEERNAME:
1191 {
1192 char address[128];
1193
1194 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1195 return -ENOTCONN;
1196 if (lv < len)
1197 return -EINVAL;
1198 if (copy_to_user(optval, address, len))
1199 return -EFAULT;
1200 goto lenout;
1201 }
1202
1203 /* Dubious BSD thing... Probably nobody even uses it, but
1204 * the UNIX standard wants it for whatever reason... -DaveM
1205 */
1206 case SO_ACCEPTCONN:
1207 v.val = sk->sk_state == TCP_LISTEN;
1208 break;
1209
1210 case SO_PASSSEC:
1211 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1212 break;
1213
1214 case SO_PEERSEC:
1215 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1216
1217 case SO_MARK:
1218 v.val = sk->sk_mark;
1219 break;
1220
1221 case SO_RXQ_OVFL:
1222 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1223 break;
1224
1225 case SO_WIFI_STATUS:
1226 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1227 break;
1228
1229 case SO_PEEK_OFF:
1230 if (!sock->ops->set_peek_off)
1231 return -EOPNOTSUPP;
1232
1233 v.val = sk->sk_peek_off;
1234 break;
1235 case SO_NOFCS:
1236 v.val = sock_flag(sk, SOCK_NOFCS);
1237 break;
1238
1239 case SO_BINDTODEVICE:
1240 return sock_getbindtodevice(sk, optval, optlen, len);
1241
1242 case SO_GET_FILTER:
1243 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1244 if (len < 0)
1245 return len;
1246
1247 goto lenout;
1248
1249 case SO_LOCK_FILTER:
1250 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1251 break;
1252
1253 case SO_BPF_EXTENSIONS:
1254 v.val = bpf_tell_extensions();
1255 break;
1256
1257 case SO_SELECT_ERR_QUEUE:
1258 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1259 break;
1260
1261 #ifdef CONFIG_NET_RX_BUSY_POLL
1262 case SO_BUSY_POLL:
1263 v.val = sk->sk_ll_usec;
1264 break;
1265 #endif
1266
1267 case SO_MAX_PACING_RATE:
1268 v.val = sk->sk_max_pacing_rate;
1269 break;
1270
1271 case SO_INCOMING_CPU:
1272 v.val = sk->sk_incoming_cpu;
1273 break;
1274
1275 default:
1276 /* We implement the SO_SNDLOWAT etc to not be settable
1277 * (1003.1g 7).
1278 */
1279 return -ENOPROTOOPT;
1280 }
1281
1282 if (len > lv)
1283 len = lv;
1284 if (copy_to_user(optval, &v, len))
1285 return -EFAULT;
1286 lenout:
1287 if (put_user(len, optlen))
1288 return -EFAULT;
1289 return 0;
1290 }
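/* Illustrative user-space sketch (assumption: the usual non-blocking
 * connect() idiom): SO_ERROR is how the deferred connection result is
 * collected once poll()/select() reports writability; note that the
 * SO_ERROR case above clears the pending error via sock_error()'s xchg():
 *
 *	int err = 0;
 *	socklen_t len = sizeof(err);
 *	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 *	// err == 0 on success, otherwise the pending errno value
 */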
1291
1292 /*
1293 * Initialize an sk_lock.
1294 *
1295 * (We also register the sk_lock with the lock validator.)
1296 */
1297 static inline void sock_lock_init(struct sock *sk)
1298 {
1299 sock_lock_init_class_and_name(sk,
1300 af_family_slock_key_strings[sk->sk_family],
1301 af_family_slock_keys + sk->sk_family,
1302 af_family_key_strings[sk->sk_family],
1303 af_family_keys + sk->sk_family);
1304 }
1305
1306 /*
1307 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1308  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1309  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1310 */
1311 static void sock_copy(struct sock *nsk, const struct sock *osk)
1312 {
1313 #ifdef CONFIG_SECURITY_NETWORK
1314 void *sptr = nsk->sk_security;
1315 #endif
1316 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1317
1318 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1319 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1320
1321 #ifdef CONFIG_SECURITY_NETWORK
1322 nsk->sk_security = sptr;
1323 security_sk_clone(osk, nsk);
1324 #endif
1325 }
1326
1327 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1328 int family)
1329 {
1330 struct sock *sk;
1331 struct kmem_cache *slab;
1332
1333 slab = prot->slab;
1334 if (slab != NULL) {
1335 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1336 if (!sk)
1337 return sk;
1338 if (priority & __GFP_ZERO)
1339 sk_prot_clear_nulls(sk, prot->obj_size);
1340 } else
1341 sk = kmalloc(prot->obj_size, priority);
1342
1343 if (sk != NULL) {
1344 kmemcheck_annotate_bitfield(sk, flags);
1345
1346 if (security_sk_alloc(sk, family, priority))
1347 goto out_free;
1348
1349 if (!try_module_get(prot->owner))
1350 goto out_free_sec;
1351 sk_tx_queue_clear(sk);
1352 }
1353
1354 return sk;
1355
1356 out_free_sec:
1357 security_sk_free(sk);
1358 out_free:
1359 if (slab != NULL)
1360 kmem_cache_free(slab, sk);
1361 else
1362 kfree(sk);
1363 return NULL;
1364 }
1365
1366 static void sk_prot_free(struct proto *prot, struct sock *sk)
1367 {
1368 struct kmem_cache *slab;
1369 struct module *owner;
1370
1371 owner = prot->owner;
1372 slab = prot->slab;
1373
1374 cgroup_sk_free(&sk->sk_cgrp_data);
1375 mem_cgroup_sk_free(sk);
1376 security_sk_free(sk);
1377 if (slab != NULL)
1378 kmem_cache_free(slab, sk);
1379 else
1380 kfree(sk);
1381 module_put(owner);
1382 }
1383
1384 /**
1385 * sk_alloc - All socket objects are allocated here
1386 * @net: the applicable net namespace
1387 * @family: protocol family
1388 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1389 * @prot: struct proto associated with this new sock instance
1390 * @kern: is this to be a kernel socket?
1391 */
1392 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1393 struct proto *prot, int kern)
1394 {
1395 struct sock *sk;
1396
1397 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1398 if (sk) {
1399 sk->sk_family = family;
1400 /*
1401 * See comment in struct sock definition to understand
1402 * why we need sk_prot_creator -acme
1403 */
1404 sk->sk_prot = sk->sk_prot_creator = prot;
1405 sock_lock_init(sk);
1406 sk->sk_net_refcnt = kern ? 0 : 1;
1407 if (likely(sk->sk_net_refcnt))
1408 get_net(net);
1409 sock_net_set(sk, net);
1410 atomic_set(&sk->sk_wmem_alloc, 1);
1411
1412 mem_cgroup_sk_alloc(sk);
1413 cgroup_sk_alloc(&sk->sk_cgrp_data);
1414 sock_update_classid(&sk->sk_cgrp_data);
1415 sock_update_netprioidx(&sk->sk_cgrp_data);
1416 }
1417
1418 return sk;
1419 }
1420 EXPORT_SYMBOL(sk_alloc);
1421
1422 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1423 * grace period. This is the case for UDP sockets and TCP listeners.
1424 */
1425 static void __sk_destruct(struct rcu_head *head)
1426 {
1427 struct sock *sk = container_of(head, struct sock, sk_rcu);
1428 struct sk_filter *filter;
1429
1430 if (sk->sk_destruct)
1431 sk->sk_destruct(sk);
1432
1433 filter = rcu_dereference_check(sk->sk_filter,
1434 atomic_read(&sk->sk_wmem_alloc) == 0);
1435 if (filter) {
1436 sk_filter_uncharge(sk, filter);
1437 RCU_INIT_POINTER(sk->sk_filter, NULL);
1438 }
1439 if (rcu_access_pointer(sk->sk_reuseport_cb))
1440 reuseport_detach_sock(sk);
1441
1442 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1443
1444 if (atomic_read(&sk->sk_omem_alloc))
1445 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1446 __func__, atomic_read(&sk->sk_omem_alloc));
1447
1448 if (sk->sk_frag.page) {
1449 put_page(sk->sk_frag.page);
1450 sk->sk_frag.page = NULL;
1451 }
1452
1453 if (sk->sk_peer_cred)
1454 put_cred(sk->sk_peer_cred);
1455 put_pid(sk->sk_peer_pid);
1456 if (likely(sk->sk_net_refcnt))
1457 put_net(sock_net(sk));
1458 sk_prot_free(sk->sk_prot_creator, sk);
1459 }
1460
1461 void sk_destruct(struct sock *sk)
1462 {
1463 if (sock_flag(sk, SOCK_RCU_FREE))
1464 call_rcu(&sk->sk_rcu, __sk_destruct);
1465 else
1466 __sk_destruct(&sk->sk_rcu);
1467 }
1468
1469 static void __sk_free(struct sock *sk)
1470 {
1471 if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1472 sock_diag_broadcast_destroy(sk);
1473 else
1474 sk_destruct(sk);
1475 }
1476
1477 void sk_free(struct sock *sk)
1478 {
1479 /*
1480 	 * We subtract one from sk_wmem_alloc so we can tell whether
1481 	 * some packets are still in some tx queue.
1482 	 * If the result is not zero, sock_wfree() will call __sk_free(sk) later.
1483 */
1484 if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1485 __sk_free(sk);
1486 }
1487 EXPORT_SYMBOL(sk_free);
1488
1489 /**
1490 * sk_clone_lock - clone a socket, and lock its clone
1491 * @sk: the socket to clone
1492 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1493 *
1494 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1495 */
1496 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1497 {
1498 struct sock *newsk;
1499 bool is_charged = true;
1500
1501 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1502 if (newsk != NULL) {
1503 struct sk_filter *filter;
1504
1505 sock_copy(newsk, sk);
1506
1507 /* SANITY */
1508 if (likely(newsk->sk_net_refcnt))
1509 get_net(sock_net(newsk));
1510 sk_node_init(&newsk->sk_node);
1511 sock_lock_init(newsk);
1512 bh_lock_sock(newsk);
1513 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1514 newsk->sk_backlog.len = 0;
1515
1516 atomic_set(&newsk->sk_rmem_alloc, 0);
1517 /*
1518 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1519 */
1520 atomic_set(&newsk->sk_wmem_alloc, 1);
1521 atomic_set(&newsk->sk_omem_alloc, 0);
1522 skb_queue_head_init(&newsk->sk_receive_queue);
1523 skb_queue_head_init(&newsk->sk_write_queue);
1524
1525 rwlock_init(&newsk->sk_callback_lock);
1526 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1527 af_callback_keys + newsk->sk_family,
1528 af_family_clock_key_strings[newsk->sk_family]);
1529
1530 newsk->sk_dst_cache = NULL;
1531 newsk->sk_dst_pending_confirm = 0;
1532 newsk->sk_wmem_queued = 0;
1533 newsk->sk_forward_alloc = 0;
1534 atomic_set(&newsk->sk_drops, 0);
1535 newsk->sk_send_head = NULL;
1536 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1537
1538 sock_reset_flag(newsk, SOCK_DONE);
1539 skb_queue_head_init(&newsk->sk_error_queue);
1540
1541 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1542 if (filter != NULL)
1543 			/* Though it's an empty new sock, the charging may fail
1544 			 * if sysctl_optmem_max was changed between the creation of
1545 			 * the original socket and the cloning.
1546 */
1547 is_charged = sk_filter_charge(newsk, filter);
1548
1549 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1550 /* We need to make sure that we don't uncharge the new
1551 * socket if we couldn't charge it in the first place
1552 * as otherwise we uncharge the parent's filter.
1553 */
1554 if (!is_charged)
1555 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1556 			/* It is still a raw copy of the parent, so invalidate
1557 			 * the destructor and do a plain sk_free(). */
1558 newsk->sk_destruct = NULL;
1559 bh_unlock_sock(newsk);
1560 sk_free(newsk);
1561 newsk = NULL;
1562 goto out;
1563 }
1564 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1565
1566 newsk->sk_err = 0;
1567 newsk->sk_err_soft = 0;
1568 newsk->sk_priority = 0;
1569 newsk->sk_incoming_cpu = raw_smp_processor_id();
1570 atomic64_set(&newsk->sk_cookie, 0);
1571
1572 mem_cgroup_sk_alloc(newsk);
1573 cgroup_sk_alloc(&newsk->sk_cgrp_data);
1574
1575 /*
1576 * Before updating sk_refcnt, we must commit prior changes to memory
1577 * (Documentation/RCU/rculist_nulls.txt for details)
1578 */
1579 smp_wmb();
1580 atomic_set(&newsk->sk_refcnt, 2);
1581
1582 /*
1583 * Increment the counter in the same struct proto as the master
1584 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1585 * is the same as sk->sk_prot->socks, as this field was copied
1586 * with memcpy).
1587 *
1588 * This _changes_ the previous behaviour, where
1589 * tcp_create_openreq_child always was incrementing the
1590 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1591 * to be taken into account in all callers. -acme
1592 */
1593 sk_refcnt_debug_inc(newsk);
1594 sk_set_socket(newsk, NULL);
1595 newsk->sk_wq = NULL;
1596
1597 if (newsk->sk_prot->sockets_allocated)
1598 sk_sockets_allocated_inc(newsk);
1599
1600 if (sock_needs_netstamp(sk) &&
1601 newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1602 net_enable_timestamp();
1603 }
1604 out:
1605 return newsk;
1606 }
1607 EXPORT_SYMBOL_GPL(sk_clone_lock);
1608
1609 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1610 {
1611 u32 max_segs = 1;
1612
1613 sk_dst_set(sk, dst);
1614 sk->sk_route_caps = dst->dev->features;
1615 if (sk->sk_route_caps & NETIF_F_GSO)
1616 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1617 sk->sk_route_caps &= ~sk->sk_route_nocaps;
1618 if (sk_can_gso(sk)) {
1619 if (dst->header_len) {
1620 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1621 } else {
1622 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1623 sk->sk_gso_max_size = dst->dev->gso_max_size;
1624 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1625 }
1626 }
1627 sk->sk_gso_max_segs = max_segs;
1628 }
1629 EXPORT_SYMBOL_GPL(sk_setup_caps);
1630
1631 /*
1632 * Simple resource managers for sockets.
1633 */
1634
1635
1636 /*
1637 * Write buffer destructor automatically called from kfree_skb.
1638 */
1639 void sock_wfree(struct sk_buff *skb)
1640 {
1641 struct sock *sk = skb->sk;
1642 unsigned int len = skb->truesize;
1643
1644 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1645 /*
1646 		 * Keep a reference on sk_wmem_alloc; this will be released
1647 		 * after the sk_write_space() call.
1648 */
1649 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1650 sk->sk_write_space(sk);
1651 len = 1;
1652 }
1653 /*
1654 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1655 * could not do because of in-flight packets
1656 */
1657 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1658 __sk_free(sk);
1659 }
1660 EXPORT_SYMBOL(sock_wfree);
1661
1662 /* This variant of sock_wfree() is used by TCP,
1663 * since it sets SOCK_USE_WRITE_QUEUE.
1664 */
1665 void __sock_wfree(struct sk_buff *skb)
1666 {
1667 struct sock *sk = skb->sk;
1668
1669 if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1670 __sk_free(sk);
1671 }
1672
1673 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1674 {
1675 skb_orphan(skb);
1676 skb->sk = sk;
1677 #ifdef CONFIG_INET
1678 if (unlikely(!sk_fullsock(sk))) {
1679 skb->destructor = sock_edemux;
1680 sock_hold(sk);
1681 return;
1682 }
1683 #endif
1684 skb->destructor = sock_wfree;
1685 skb_set_hash_from_sk(skb, sk);
1686 /*
1687 	 * We used to take a refcount on sk, but the following operation
1688 	 * is enough to guarantee sk_free() won't free this sock until
1689 	 * all in-flight packets are completed.
1690 */
1691 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1692 }
1693 EXPORT_SYMBOL(skb_set_owner_w);
1694
1695 /* This helper is used by netem, as it can hold packets in its
1696 * delay queue. We want to allow the owner socket to send more
1697 * packets, as if they were already TX completed by a typical driver.
1698 * But we also want to keep skb->sk set because some packet schedulers
1699 * rely on it (sch_fq for example). So we set skb->truesize to a small
1700 * amount (1) and decrease sk_wmem_alloc accordingly.
1701 */
1702 void skb_orphan_partial(struct sk_buff *skb)
1703 {
1704 /* If this skb is a TCP pure ACK or already went here,
1705 * we have nothing to do. 2 is already a very small truesize.
1706 */
1707 if (skb->truesize <= 2)
1708 return;
1709
1710 	/* The TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1711 	 * so we do not completely orphan the skb, but transfer all
1712 	 * accounted bytes but one, to avoid unexpected reorders.
1713 */
1714 if (skb->destructor == sock_wfree
1715 #ifdef CONFIG_INET
1716 || skb->destructor == tcp_wfree
1717 #endif
1718 ) {
1719 atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1720 skb->truesize = 1;
1721 } else {
1722 skb_orphan(skb);
1723 }
1724 }
1725 EXPORT_SYMBOL(skb_orphan_partial);
1726
1727 /*
1728 * Read buffer destructor automatically called from kfree_skb.
1729 */
1730 void sock_rfree(struct sk_buff *skb)
1731 {
1732 struct sock *sk = skb->sk;
1733 unsigned int len = skb->truesize;
1734
1735 atomic_sub(len, &sk->sk_rmem_alloc);
1736 sk_mem_uncharge(sk, len);
1737 }
1738 EXPORT_SYMBOL(sock_rfree);
1739
1740 /*
1741 * Buffer destructor for skbs that are not used directly in read or write
1742 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1743 */
1744 void sock_efree(struct sk_buff *skb)
1745 {
1746 sock_put(skb->sk);
1747 }
1748 EXPORT_SYMBOL(sock_efree);
1749
1750 kuid_t sock_i_uid(struct sock *sk)
1751 {
1752 kuid_t uid;
1753
1754 read_lock_bh(&sk->sk_callback_lock);
1755 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1756 read_unlock_bh(&sk->sk_callback_lock);
1757 return uid;
1758 }
1759 EXPORT_SYMBOL(sock_i_uid);
1760
1761 unsigned long sock_i_ino(struct sock *sk)
1762 {
1763 unsigned long ino;
1764
1765 read_lock_bh(&sk->sk_callback_lock);
1766 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1767 read_unlock_bh(&sk->sk_callback_lock);
1768 return ino;
1769 }
1770 EXPORT_SYMBOL(sock_i_ino);
1771
1772 /*
1773 * Allocate a skb from the socket's send buffer.
1774 */
1775 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1776 gfp_t priority)
1777 {
1778 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1779 struct sk_buff *skb = alloc_skb(size, priority);
1780 if (skb) {
1781 skb_set_owner_w(skb, sk);
1782 return skb;
1783 }
1784 }
1785 return NULL;
1786 }
1787 EXPORT_SYMBOL(sock_wmalloc);
1788
1789 /*
1790 * Allocate a memory block from the socket's option memory buffer.
1791 */
1792 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1793 {
1794 if ((unsigned int)size <= sysctl_optmem_max &&
1795 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1796 void *mem;
1797 /* First do the add, to avoid the race if kmalloc
1798 * might sleep.
1799 */
1800 atomic_add(size, &sk->sk_omem_alloc);
1801 mem = kmalloc(size, priority);
1802 if (mem)
1803 return mem;
1804 atomic_sub(size, &sk->sk_omem_alloc);
1805 }
1806 return NULL;
1807 }
1808 EXPORT_SYMBOL(sock_kmalloc);
1809
1810 /* Free an option memory block. Note, we actually want the inline
1811 * here as this allows gcc to detect the nullify and fold away the
1812 * condition entirely.
1813 */
1814 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1815 const bool nullify)
1816 {
1817 if (WARN_ON_ONCE(!mem))
1818 return;
1819 if (nullify)
1820 kzfree(mem);
1821 else
1822 kfree(mem);
1823 atomic_sub(size, &sk->sk_omem_alloc);
1824 }
1825
1826 void sock_kfree_s(struct sock *sk, void *mem, int size)
1827 {
1828 __sock_kfree_s(sk, mem, size, false);
1829 }
1830 EXPORT_SYMBOL(sock_kfree_s);
1831
1832 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1833 {
1834 __sock_kfree_s(sk, mem, size, true);
1835 }
1836 EXPORT_SYMBOL(sock_kzfree_s);
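/* Illustrative sketch (hypothetical caller, simplified): option handlers
 * charge transient allocations to the socket's option memory and must
 * release them with the matching size:
 *
 *	buf = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOMEM;
 *	...
 *	sock_kfree_s(sk, buf, optlen);	// or sock_kzfree_s() for secrets
 */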
1837
1838 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1839    I think these locks should be removed for datagram sockets.
1840 */
1841 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1842 {
1843 DEFINE_WAIT(wait);
1844
1845 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1846 for (;;) {
1847 if (!timeo)
1848 break;
1849 if (signal_pending(current))
1850 break;
1851 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1852 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1853 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1854 break;
1855 if (sk->sk_shutdown & SEND_SHUTDOWN)
1856 break;
1857 if (sk->sk_err)
1858 break;
1859 timeo = schedule_timeout(timeo);
1860 }
1861 finish_wait(sk_sleep(sk), &wait);
1862 return timeo;
1863 }
1864
1865
1866 /*
1867 * Generic send/receive buffer handlers
1868 */
1869
1870 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1871 unsigned long data_len, int noblock,
1872 int *errcode, int max_page_order)
1873 {
1874 struct sk_buff *skb;
1875 long timeo;
1876 int err;
1877
1878 timeo = sock_sndtimeo(sk, noblock);
1879 for (;;) {
1880 err = sock_error(sk);
1881 if (err != 0)
1882 goto failure;
1883
1884 err = -EPIPE;
1885 if (sk->sk_shutdown & SEND_SHUTDOWN)
1886 goto failure;
1887
1888 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1889 break;
1890
1891 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1892 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1893 err = -EAGAIN;
1894 if (!timeo)
1895 goto failure;
1896 if (signal_pending(current))
1897 goto interrupted;
1898 timeo = sock_wait_for_wmem(sk, timeo);
1899 }
1900 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1901 errcode, sk->sk_allocation);
1902 if (skb)
1903 skb_set_owner_w(skb, sk);
1904 return skb;
1905
1906 interrupted:
1907 err = sock_intr_errno(timeo);
1908 failure:
1909 *errcode = err;
1910 return NULL;
1911 }
1912 EXPORT_SYMBOL(sock_alloc_send_pskb);
1913
1914 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1915 int noblock, int *errcode)
1916 {
1917 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1918 }
1919 EXPORT_SYMBOL(sock_alloc_send_skb);
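/* Illustrative sketch (simplified, error handling mostly omitted): a typical
 * datagram sendmsg() implementation allocates its buffer through the helper
 * above so the allocation is charged to, and may block on, the socket's
 * send buffer:
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;
 *	skb_reserve(skb, hlen);
 *	err = memcpy_from_msg(skb_put(skb, len), msg, len);
 */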
1920
1921 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
1922 struct sockcm_cookie *sockc)
1923 {
1924 u32 tsflags;
1925
1926 switch (cmsg->cmsg_type) {
1927 case SO_MARK:
1928 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1929 return -EPERM;
1930 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1931 return -EINVAL;
1932 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1933 break;
1934 case SO_TIMESTAMPING:
1935 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1936 return -EINVAL;
1937
1938 tsflags = *(u32 *)CMSG_DATA(cmsg);
1939 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
1940 return -EINVAL;
1941
1942 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
1943 sockc->tsflags |= tsflags;
1944 break;
1945 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
1946 case SCM_RIGHTS:
1947 case SCM_CREDENTIALS:
1948 break;
1949 default:
1950 return -EINVAL;
1951 }
1952 return 0;
1953 }
1954 EXPORT_SYMBOL(__sock_cmsg_send);
1955
1956 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1957 struct sockcm_cookie *sockc)
1958 {
1959 struct cmsghdr *cmsg;
1960 int ret;
1961
1962 for_each_cmsghdr(cmsg, msg) {
1963 if (!CMSG_OK(msg, cmsg))
1964 return -EINVAL;
1965 if (cmsg->cmsg_level != SOL_SOCKET)
1966 continue;
1967 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
1968 if (ret)
1969 return ret;
1970 }
1971 return 0;
1972 }
1973 EXPORT_SYMBOL(sock_cmsg_send);
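/* Illustrative user-space sketch (assumption: the caller holds CAP_NET_ADMIN,
 * as required by the SO_MARK case above): a per-packet mark can be passed as
 * SOL_SOCKET/SO_MARK ancillary data on sendmsg():
 *
 *	char cbuf[CMSG_SPACE(sizeof(uint32_t))];
 *	struct cmsghdr *cmsg;
 *	uint32_t mark = 42;
 *
 *	msg.msg_control = cbuf;
 *	msg.msg_controllen = sizeof(cbuf);
 *	cmsg = CMSG_FIRSTHDR(&msg);
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type = SO_MARK;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t));
 *	memcpy(CMSG_DATA(cmsg), &mark, sizeof(mark));
 */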
1974
1975 /* On 32bit arches, an skb frag is limited to 2^15 */
1976 #define SKB_FRAG_PAGE_ORDER get_order(32768)
1977
1978 /**
1979 * skb_page_frag_refill - check that a page_frag contains enough room
1980 * @sz: minimum size of the fragment we want to get
1981 * @pfrag: pointer to page_frag
1982 * @gfp: priority for memory allocation
1983 *
1984 * Note: While this allocator tries to use high order pages, there is
1985 * no guarantee that allocations succeed. Therefore, @sz MUST be
1986  * less than or equal to PAGE_SIZE.
1987 */
1988 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1989 {
1990 if (pfrag->page) {
1991 if (page_ref_count(pfrag->page) == 1) {
1992 pfrag->offset = 0;
1993 return true;
1994 }
1995 if (pfrag->offset + sz <= pfrag->size)
1996 return true;
1997 put_page(pfrag->page);
1998 }
1999
2000 pfrag->offset = 0;
2001 if (SKB_FRAG_PAGE_ORDER) {
2002 /* Avoid direct reclaim but allow kswapd to wake */
2003 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2004 __GFP_COMP | __GFP_NOWARN |
2005 __GFP_NORETRY,
2006 SKB_FRAG_PAGE_ORDER);
2007 if (likely(pfrag->page)) {
2008 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2009 return true;
2010 }
2011 }
2012 pfrag->page = alloc_page(gfp);
2013 if (likely(pfrag->page)) {
2014 pfrag->size = PAGE_SIZE;
2015 return true;
2016 }
2017 return false;
2018 }
2019 EXPORT_SYMBOL(skb_page_frag_refill);
2020
2021 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2022 {
2023 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2024 return true;
2025
2026 sk_enter_memory_pressure(sk);
2027 sk_stream_moderate_sndbuf(sk);
2028 return false;
2029 }
2030 EXPORT_SYMBOL(sk_page_frag_refill);
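/* Illustrative sketch (simplified from how stream protocols use the
 * per-socket page_frag): data is appended at the current offset and the
 * offset is advanced only after a successful copy:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *	... copy "copy" bytes to page_address(pfrag->page) + pfrag->offset ...
 *	pfrag->offset += copy;
 */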
2031
2032 static void __lock_sock(struct sock *sk)
2033 __releases(&sk->sk_lock.slock)
2034 __acquires(&sk->sk_lock.slock)
2035 {
2036 DEFINE_WAIT(wait);
2037
2038 for (;;) {
2039 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2040 TASK_UNINTERRUPTIBLE);
2041 spin_unlock_bh(&sk->sk_lock.slock);
2042 schedule();
2043 spin_lock_bh(&sk->sk_lock.slock);
2044 if (!sock_owned_by_user(sk))
2045 break;
2046 }
2047 finish_wait(&sk->sk_lock.wq, &wait);
2048 }
2049
2050 static void __release_sock(struct sock *sk)
2051 __releases(&sk->sk_lock.slock)
2052 __acquires(&sk->sk_lock.slock)
2053 {
2054 struct sk_buff *skb, *next;
2055
2056 while ((skb = sk->sk_backlog.head) != NULL) {
2057 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2058
2059 spin_unlock_bh(&sk->sk_lock.slock);
2060
2061 do {
2062 next = skb->next;
2063 prefetch(next);
2064 WARN_ON_ONCE(skb_dst_is_noref(skb));
2065 skb->next = NULL;
2066 sk_backlog_rcv(sk, skb);
2067
2068 cond_resched();
2069
2070 skb = next;
2071 } while (skb != NULL);
2072
2073 spin_lock_bh(&sk->sk_lock.slock);
2074 }
2075
2076 /*
2077 * Zeroing the backlog length here guarantees we cannot loop forever
2078 * while a wild producer attempts to flood us.
2079 */
2080 sk->sk_backlog.len = 0;
2081 }
2082
2083 void __sk_flush_backlog(struct sock *sk)
2084 {
2085 spin_lock_bh(&sk->sk_lock.slock);
2086 __release_sock(sk);
2087 spin_unlock_bh(&sk->sk_lock.slock);
2088 }
2089
2090 /**
2091 * sk_wait_data - wait for data to arrive at sk_receive_queue
2092 * @sk: sock to wait on
2093 * @timeo: for how long
2094 * @skb: last skb seen on sk_receive_queue
2095 *
2096 * Socket state, including sk->sk_err, is now changed only under the socket
2097 * lock, hence we may omit checks after joining the wait queue.
2098 * We check the receive queue before schedule() only as an optimization;
2099 * it is very likely that release_sock() added new data.
2100 */
2101 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2102 {
2103 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2104 int rc;
2105
2106 add_wait_queue(sk_sleep(sk), &wait);
2107 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2108 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2109 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2110 remove_wait_queue(sk_sleep(sk), &wait);
2111 return rc;
2112 }
2113 EXPORT_SYMBOL(sk_wait_data);
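/*
 * Illustrative sketch (not part of sock.c): a minimal receive-side wait
 * loop built on sk_wait_data(), in the style of the stream recvmsg()
 * implementations. The function name is hypothetical.
 */
static int example_wait_for_data(struct sock *sk, int noblock)
{
	long timeo = sock_rcvtimeo(sk, noblock);
	int err = 0;

	lock_sock(sk);
	while (skb_queue_empty(&sk->sk_receive_queue)) {
		if (sk->sk_err) {
			err = sock_error(sk);
			break;
		}
		if (!timeo) {
			err = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			err = sock_intr_errno(timeo);
			break;
		}
		/* Drops and retakes the socket lock while sleeping. */
		sk_wait_data(sk, &timeo, NULL);
	}
	release_sock(sk);
	return err;
}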
2114
2115 /**
2116 * __sk_mem_raise_allocated - increase memory_allocated
2117 * @sk: socket
2118 * @size: memory size to allocate
2119 * @amt: pages to allocate
2120 * @kind: allocation type
2121 *
2122 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2123 */
2124 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2125 {
2126 struct proto *prot = sk->sk_prot;
2127 long allocated = sk_memory_allocated_add(sk, amt);
2128
2129 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2130 !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2131 goto suppress_allocation;
2132
2133 /* Under limit. */
2134 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2135 sk_leave_memory_pressure(sk);
2136 return 1;
2137 }
2138
2139 /* Under pressure. */
2140 if (allocated > sk_prot_mem_limits(sk, 1))
2141 sk_enter_memory_pressure(sk);
2142
2143 /* Over hard limit. */
2144 if (allocated > sk_prot_mem_limits(sk, 2))
2145 goto suppress_allocation;
2146
2147 /* guarantee minimum buffer size under pressure */
2148 if (kind == SK_MEM_RECV) {
2149 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2150 return 1;
2151
2152 } else { /* SK_MEM_SEND */
2153 if (sk->sk_type == SOCK_STREAM) {
2154 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2155 return 1;
2156 } else if (atomic_read(&sk->sk_wmem_alloc) <
2157 prot->sysctl_wmem[0])
2158 return 1;
2159 }
2160
2161 if (sk_has_memory_pressure(sk)) {
2162 int alloc;
2163
2164 if (!sk_under_memory_pressure(sk))
2165 return 1;
2166 alloc = sk_sockets_allocated_read_positive(sk);
2167 if (sk_prot_mem_limits(sk, 2) > alloc *
2168 sk_mem_pages(sk->sk_wmem_queued +
2169 atomic_read(&sk->sk_rmem_alloc) +
2170 sk->sk_forward_alloc))
2171 return 1;
2172 }
2173
2174 suppress_allocation:
2175
2176 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2177 sk_stream_moderate_sndbuf(sk);
2178
2179 /* Fail only if socket is _under_ its sndbuf.
2180 * In this case we cannot block, so we have to fail.
2181 */
2182 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2183 return 1;
2184 }
2185
2186 trace_sock_exceed_buf_limit(sk, prot, allocated);
2187
2188 sk_memory_allocated_sub(sk, amt);
2189
2190 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2191 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2192
2193 return 0;
2194 }
2195 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2196
2197 /**
2198 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2199 * @sk: socket
2200 * @size: memory size to allocate
2201 * @kind: allocation type
2202 *
2203 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2204 * rmem allocation. This function assumes that protocols which have
2205 * memory_pressure use sk_wmem_queued as write buffer accounting.
2206 */
2207 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2208 {
2209 int ret, amt = sk_mem_pages(size);
2210
2211 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2212 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2213 if (!ret)
2214 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2215 return ret;
2216 }
2217 EXPORT_SYMBOL(__sk_mem_schedule);
2218
2219 /**
2220 * __sk_mem_reduce_allocated - reclaim memory_allocated
2221 * @sk: socket
2222 * @amount: number of quanta
2223 *
2224 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2225 */
2226 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2227 {
2228 sk_memory_allocated_sub(sk, amount);
2229
2230 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2231 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2232
2233 if (sk_under_memory_pressure(sk) &&
2234 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2235 sk_leave_memory_pressure(sk);
2236 }
2237 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2238
2239 /**
2240 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2241 * @sk: socket
2242 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2243 */
2244 void __sk_mem_reclaim(struct sock *sk, int amount)
2245 {
2246 amount >>= SK_MEM_QUANTUM_SHIFT;
2247 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2248 __sk_mem_reduce_allocated(sk, amount);
2249 }
2250 EXPORT_SYMBOL(__sk_mem_reclaim);
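/*
 * Illustrative sketch (not part of sock.c): charging receive memory before
 * queueing an skb, using the sk_rmem_schedule() wrapper from
 * include/net/sock.h, which is built on __sk_mem_schedule(); the matching
 * uncharge happens when the skb's destructor runs and, eventually, via
 * __sk_mem_reclaim(). The function name is hypothetical.
 */
static bool example_charge_rmem(struct sock *sk, struct sk_buff *skb)
{
	/* Reserve forward-alloc quanta for the skb's true size; if the
	 * protocol is over its memory limits the skb should be dropped.
	 */
	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return false;

	/* Account the skb against sk_rmem_alloc/sk_forward_alloc. */
	skb_set_owner_r(skb, sk);
	return true;
}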
2251
2252 int sk_set_peek_off(struct sock *sk, int val)
2253 {
2254 if (val < 0)
2255 return -EINVAL;
2256
2257 sk->sk_peek_off = val;
2258 return 0;
2259 }
2260 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2261
2262 /*
2263 * Set of default routines for initialising struct proto_ops when
2264 * the protocol does not support a particular function. In certain
2265 * cases where it makes no sense for a protocol to have a "do nothing"
2266 * function, some default processing is provided.
2267 */
2268
2269 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2270 {
2271 return -EOPNOTSUPP;
2272 }
2273 EXPORT_SYMBOL(sock_no_bind);
2274
2275 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2276 int len, int flags)
2277 {
2278 return -EOPNOTSUPP;
2279 }
2280 EXPORT_SYMBOL(sock_no_connect);
2281
2282 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2283 {
2284 return -EOPNOTSUPP;
2285 }
2286 EXPORT_SYMBOL(sock_no_socketpair);
2287
2288 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2289 {
2290 return -EOPNOTSUPP;
2291 }
2292 EXPORT_SYMBOL(sock_no_accept);
2293
2294 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2295 int *len, int peer)
2296 {
2297 return -EOPNOTSUPP;
2298 }
2299 EXPORT_SYMBOL(sock_no_getname);
2300
2301 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2302 {
2303 return 0;
2304 }
2305 EXPORT_SYMBOL(sock_no_poll);
2306
2307 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2308 {
2309 return -EOPNOTSUPP;
2310 }
2311 EXPORT_SYMBOL(sock_no_ioctl);
2312
2313 int sock_no_listen(struct socket *sock, int backlog)
2314 {
2315 return -EOPNOTSUPP;
2316 }
2317 EXPORT_SYMBOL(sock_no_listen);
2318
2319 int sock_no_shutdown(struct socket *sock, int how)
2320 {
2321 return -EOPNOTSUPP;
2322 }
2323 EXPORT_SYMBOL(sock_no_shutdown);
2324
2325 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2326 char __user *optval, unsigned int optlen)
2327 {
2328 return -EOPNOTSUPP;
2329 }
2330 EXPORT_SYMBOL(sock_no_setsockopt);
2331
2332 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2333 char __user *optval, int __user *optlen)
2334 {
2335 return -EOPNOTSUPP;
2336 }
2337 EXPORT_SYMBOL(sock_no_getsockopt);
2338
2339 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2340 {
2341 return -EOPNOTSUPP;
2342 }
2343 EXPORT_SYMBOL(sock_no_sendmsg);
2344
2345 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2346 int flags)
2347 {
2348 return -EOPNOTSUPP;
2349 }
2350 EXPORT_SYMBOL(sock_no_recvmsg);
2351
2352 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2353 {
2354 /* Mirror missing mmap method error code */
2355 return -ENODEV;
2356 }
2357 EXPORT_SYMBOL(sock_no_mmap);
2358
2359 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2360 {
2361 ssize_t res;
2362 struct msghdr msg = {.msg_flags = flags};
2363 struct kvec iov;
2364 char *kaddr = kmap(page);
2365 iov.iov_base = kaddr + offset;
2366 iov.iov_len = size;
2367 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2368 kunmap(page);
2369 return res;
2370 }
2371 EXPORT_SYMBOL(sock_no_sendpage);
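/*
 * Illustrative sketch (not part of sock.c): wiring the sock_no_* stubs
 * into a proto_ops table for a protocol family that supports almost
 * nothing. The family value, ops name and release handler are
 * hypothetical; the field names and the stubs themselves are real.
 */
static int example_release(struct socket *sock)
{
	if (sock->sk) {
		sk_common_release(sock->sk);
		sock->sk = NULL;
	}
	return 0;
}

static const struct proto_ops example_minimal_ops = {
	.family		= PF_UNSPEC,
	.owner		= THIS_MODULE,
	.release	= example_release,
	.bind		= sock_no_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= sock_no_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= sock_no_setsockopt,
	.getsockopt	= sock_no_getsockopt,
	.sendmsg	= sock_no_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};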
2372
2373 /*
2374 * Default Socket Callbacks
2375 */
2376
2377 static void sock_def_wakeup(struct sock *sk)
2378 {
2379 struct socket_wq *wq;
2380
2381 rcu_read_lock();
2382 wq = rcu_dereference(sk->sk_wq);
2383 if (skwq_has_sleeper(wq))
2384 wake_up_interruptible_all(&wq->wait);
2385 rcu_read_unlock();
2386 }
2387
2388 static void sock_def_error_report(struct sock *sk)
2389 {
2390 struct socket_wq *wq;
2391
2392 rcu_read_lock();
2393 wq = rcu_dereference(sk->sk_wq);
2394 if (skwq_has_sleeper(wq))
2395 wake_up_interruptible_poll(&wq->wait, POLLERR);
2396 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2397 rcu_read_unlock();
2398 }
2399
2400 static void sock_def_readable(struct sock *sk)
2401 {
2402 struct socket_wq *wq;
2403
2404 rcu_read_lock();
2405 wq = rcu_dereference(sk->sk_wq);
2406 if (skwq_has_sleeper(wq))
2407 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2408 POLLRDNORM | POLLRDBAND);
2409 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2410 rcu_read_unlock();
2411 }
2412
2413 static void sock_def_write_space(struct sock *sk)
2414 {
2415 struct socket_wq *wq;
2416
2417 rcu_read_lock();
2418
2419 /* Do not wake up a writer until he can make "significant"
2420 * progress. --DaveM
2421 */
2422 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2423 wq = rcu_dereference(sk->sk_wq);
2424 if (skwq_has_sleeper(wq))
2425 wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2426 POLLWRNORM | POLLWRBAND);
2427
2428 /* Should agree with poll, otherwise some programs break */
2429 if (sock_writeable(sk))
2430 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2431 }
2432
2433 rcu_read_unlock();
2434 }
2435
2436 static void sock_def_destruct(struct sock *sk)
2437 {
2438 }
2439
2440 void sk_send_sigurg(struct sock *sk)
2441 {
2442 if (sk->sk_socket && sk->sk_socket->file)
2443 if (send_sigurg(&sk->sk_socket->file->f_owner))
2444 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2445 }
2446 EXPORT_SYMBOL(sk_send_sigurg);
2447
2448 void sk_reset_timer(struct sock *sk, struct timer_list *timer,
2449 unsigned long expires)
2450 {
2451 if (!mod_timer(timer, expires))
2452 sock_hold(sk);
2453 }
2454 EXPORT_SYMBOL(sk_reset_timer);
2455
2456 void sk_stop_timer(struct sock *sk, struct timer_list *timer)
2457 {
2458 if (del_timer(timer))
2459 __sock_put(sk);
2460 }
2461 EXPORT_SYMBOL(sk_stop_timer);
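/*
 * Illustrative sketch (not part of sock.c): the reference-counting
 * convention behind sk_reset_timer()/sk_stop_timer(). The names are
 * hypothetical; the pattern follows the TCP timers: a pending timer holds
 * one socket reference, dropped either by the handler or by sk_stop_timer().
 */
static void example_timer_handler(unsigned long data)
{
	struct sock *sk = (struct sock *)data;

	/* ... periodic protocol work would go here ... */

	sock_put(sk);	/* drop the reference held by the pending timer */
}

static void example_arm_timer(struct sock *sk)
{
	setup_timer(&sk->sk_timer, example_timer_handler, (unsigned long)sk);

	/* Takes a reference with sock_hold() unless the timer was already
	 * pending; sk_stop_timer() drops it if the timer is cancelled.
	 */
	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
}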
2462
2463 void sock_init_data(struct socket *sock, struct sock *sk)
2464 {
2465 skb_queue_head_init(&sk->sk_receive_queue);
2466 skb_queue_head_init(&sk->sk_write_queue);
2467 skb_queue_head_init(&sk->sk_error_queue);
2468
2469 sk->sk_send_head = NULL;
2470
2471 init_timer(&sk->sk_timer);
2472
2473 sk->sk_allocation = GFP_KERNEL;
2474 sk->sk_rcvbuf = sysctl_rmem_default;
2475 sk->sk_sndbuf = sysctl_wmem_default;
2476 sk->sk_state = TCP_CLOSE;
2477 sk_set_socket(sk, sock);
2478
2479 sock_set_flag(sk, SOCK_ZAPPED);
2480
2481 if (sock) {
2482 sk->sk_type = sock->type;
2483 sk->sk_wq = sock->wq;
2484 sock->sk = sk;
2485 sk->sk_uid = SOCK_INODE(sock)->i_uid;
2486 } else {
2487 sk->sk_wq = NULL;
2488 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
2489 }
2490
2491 rwlock_init(&sk->sk_callback_lock);
2492 lockdep_set_class_and_name(&sk->sk_callback_lock,
2493 af_callback_keys + sk->sk_family,
2494 af_family_clock_key_strings[sk->sk_family]);
2495
2496 sk->sk_state_change = sock_def_wakeup;
2497 sk->sk_data_ready = sock_def_readable;
2498 sk->sk_write_space = sock_def_write_space;
2499 sk->sk_error_report = sock_def_error_report;
2500 sk->sk_destruct = sock_def_destruct;
2501
2502 sk->sk_frag.page = NULL;
2503 sk->sk_frag.offset = 0;
2504 sk->sk_peek_off = -1;
2505
2506 sk->sk_peer_pid = NULL;
2507 sk->sk_peer_cred = NULL;
2508 sk->sk_write_pending = 0;
2509 sk->sk_rcvlowat = 1;
2510 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2511 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2512
2513 sk->sk_stamp = ktime_set(-1L, 0);
2514
2515 #ifdef CONFIG_NET_RX_BUSY_POLL
2516 sk->sk_napi_id = 0;
2517 sk->sk_ll_usec = sysctl_net_busy_read;
2518 #endif
2519
2520 sk->sk_max_pacing_rate = ~0U;
2521 sk->sk_pacing_rate = ~0U;
2522 sk->sk_incoming_cpu = -1;
2523 /*
2524 * Before updating sk_refcnt, we must commit prior changes to memory
2525 * (Documentation/RCU/rculist_nulls.txt for details)
2526 */
2527 smp_wmb();
2528 atomic_set(&sk->sk_refcnt, 1);
2529 atomic_set(&sk->sk_drops, 0);
2530 }
2531 EXPORT_SYMBOL(sock_init_data);
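/*
 * Illustrative sketch (not part of sock.c): how a protocol family's
 * ->create() handler typically allocates a sock and initialises it with
 * sock_init_data(). The minimal proto definition, family value and
 * function name are hypothetical; sk_alloc() and sock_init_data() are real.
 */
static struct proto example_proto_stub = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),
};

static int example_create(struct net *net, struct socket *sock, int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_UNSPEC, GFP_KERNEL, &example_proto_stub, kern);
	if (!sk)
		return -ENOBUFS;

	/* Sets up queues, default callbacks, buffer sizes and ties
	 * @sk to @sock.
	 */
	sock_init_data(sock, sk);
	return 0;
}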
2532
2533 void lock_sock_nested(struct sock *sk, int subclass)
2534 {
2535 might_sleep();
2536 spin_lock_bh(&sk->sk_lock.slock);
2537 if (sk->sk_lock.owned)
2538 __lock_sock(sk);
2539 sk->sk_lock.owned = 1;
2540 spin_unlock(&sk->sk_lock.slock);
2541 /*
2542 * The sk_lock has mutex_lock() semantics here:
2543 */
2544 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2545 local_bh_enable();
2546 }
2547 EXPORT_SYMBOL(lock_sock_nested);
2548
2549 void release_sock(struct sock *sk)
2550 {
2551 spin_lock_bh(&sk->sk_lock.slock);
2552 if (sk->sk_backlog.tail)
2553 __release_sock(sk);
2554
2555 /* Warning: release_cb() might need to release sk ownership,
2556 * i.e., call sock_release_ownership(sk) before we do.
2557 */
2558 if (sk->sk_prot->release_cb)
2559 sk->sk_prot->release_cb(sk);
2560
2561 sock_release_ownership(sk);
2562 if (waitqueue_active(&sk->sk_lock.wq))
2563 wake_up(&sk->sk_lock.wq);
2564 spin_unlock_bh(&sk->sk_lock.slock);
2565 }
2566 EXPORT_SYMBOL(release_sock);
2567
2568 /**
2569 * lock_sock_fast - fast version of lock_sock
2570 * @sk: socket
2571 *
2572 * This version should be used for very small sections, where the process won't block.
2573 * Returns false if the fast path is taken:
2574 * sk_lock.slock locked, owned = 0, BH disabled.
2575 * Returns true if the slow path is taken:
2576 * sk_lock.slock unlocked, owned = 1, BH enabled.
2577 */
2578 bool lock_sock_fast(struct sock *sk)
2579 {
2580 might_sleep();
2581 spin_lock_bh(&sk->sk_lock.slock);
2582
2583 if (!sk->sk_lock.owned)
2584 /*
2585 * Note: we return with BHs disabled; release with unlock_sock_fast()
2586 */
2587 return false;
2588
2589 __lock_sock(sk);
2590 sk->sk_lock.owned = 1;
2591 spin_unlock(&sk->sk_lock.slock);
2592 /*
2593 * The sk_lock has mutex_lock() semantics here:
2594 */
2595 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2596 local_bh_enable();
2597 return true;
2598 }
2599 EXPORT_SYMBOL(lock_sock_fast);
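/*
 * Illustrative sketch (not part of sock.c): the intended pairing of
 * lock_sock_fast() with unlock_sock_fast() (from include/net/sock.h) for
 * a short, non-sleeping critical section. The function name and the
 * update performed are hypothetical.
 */
static void example_quick_update(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);

	/* Short, non-sleeping work only: on the fast path we still hold
	 * sk_lock.slock with BHs disabled.
	 */
	sk->sk_err_soft = 0;

	unlock_sock_fast(sk, slow);
}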
2600
2601 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2602 {
2603 struct timeval tv;
2604 if (!sock_flag(sk, SOCK_TIMESTAMP))
2605 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2606 tv = ktime_to_timeval(sk->sk_stamp);
2607 if (tv.tv_sec == -1)
2608 return -ENOENT;
2609 if (tv.tv_sec == 0) {
2610 sk->sk_stamp = ktime_get_real();
2611 tv = ktime_to_timeval(sk->sk_stamp);
2612 }
2613 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2614 }
2615 EXPORT_SYMBOL(sock_get_timestamp);
2616
2617 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2618 {
2619 struct timespec ts;
2620 if (!sock_flag(sk, SOCK_TIMESTAMP))
2621 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2622 ts = ktime_to_timespec(sk->sk_stamp);
2623 if (ts.tv_sec == -1)
2624 return -ENOENT;
2625 if (ts.tv_sec == 0) {
2626 sk->sk_stamp = ktime_get_real();
2627 ts = ktime_to_timespec(sk->sk_stamp);
2628 }
2629 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2630 }
2631 EXPORT_SYMBOL(sock_get_timestampns);
2632
2633 void sock_enable_timestamp(struct sock *sk, int flag)
2634 {
2635 if (!sock_flag(sk, flag)) {
2636 unsigned long previous_flags = sk->sk_flags;
2637
2638 sock_set_flag(sk, flag);
2639 /*
2640 * We just set one of the two flags that require net
2641 * time stamping, but time stamping might already have been
2642 * enabled because of the other one.
2643 */
2644 if (sock_needs_netstamp(sk) &&
2645 !(previous_flags & SK_FLAGS_TIMESTAMP))
2646 net_enable_timestamp();
2647 }
2648 }
2649
2650 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2651 int level, int type)
2652 {
2653 struct sock_exterr_skb *serr;
2654 struct sk_buff *skb;
2655 int copied, err;
2656
2657 err = -EAGAIN;
2658 skb = sock_dequeue_err_skb(sk);
2659 if (skb == NULL)
2660 goto out;
2661
2662 copied = skb->len;
2663 if (copied > len) {
2664 msg->msg_flags |= MSG_TRUNC;
2665 copied = len;
2666 }
2667 err = skb_copy_datagram_msg(skb, 0, msg, copied);
2668 if (err)
2669 goto out_free_skb;
2670
2671 sock_recv_timestamp(msg, sk, skb);
2672
2673 serr = SKB_EXT_ERR(skb);
2674 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2675
2676 msg->msg_flags |= MSG_ERRQUEUE;
2677 err = copied;
2678
2679 out_free_skb:
2680 kfree_skb(skb);
2681 out:
2682 return err;
2683 }
2684 EXPORT_SYMBOL(sock_recv_errqueue);
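/*
 * Illustrative sketch (not part of sock.c): how a protocol's recvmsg()
 * can route MSG_ERRQUEUE reads through sock_recv_errqueue(), in the style
 * of af_packet. The function name and the SOL_SOCKET/SCM_TIMESTAMPING
 * cmsg level/type chosen here are illustrative only.
 */
static int example_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
			   int flags)
{
	if (flags & MSG_ERRQUEUE)
		return sock_recv_errqueue(sk, msg, len, SOL_SOCKET,
					  SCM_TIMESTAMPING);

	/* ... normal receive path would go here ... */
	return -EAGAIN;
}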
2685
2686 /*
2687 * Get a socket option on a socket.
2688 *
2689 * FIX: POSIX 1003.1g is very ambiguous here. It states that
2690 * asynchronous errors should be reported by getsockopt. We assume
2691 * this means if you specify SO_ERROR (otherwise what's the point of it).
2692 */
2693 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2694 char __user *optval, int __user *optlen)
2695 {
2696 struct sock *sk = sock->sk;
2697
2698 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2699 }
2700 EXPORT_SYMBOL(sock_common_getsockopt);
2701
2702 #ifdef CONFIG_COMPAT
2703 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2704 char __user *optval, int __user *optlen)
2705 {
2706 struct sock *sk = sock->sk;
2707
2708 if (sk->sk_prot->compat_getsockopt != NULL)
2709 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2710 optval, optlen);
2711 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2712 }
2713 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2714 #endif
2715
2716 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2717 int flags)
2718 {
2719 struct sock *sk = sock->sk;
2720 int addr_len = 0;
2721 int err;
2722
2723 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2724 flags & ~MSG_DONTWAIT, &addr_len);
2725 if (err >= 0)
2726 msg->msg_namelen = addr_len;
2727 return err;
2728 }
2729 EXPORT_SYMBOL(sock_common_recvmsg);
2730
2731 /*
2732 * Set socket options on an inet socket.
2733 */
2734 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2735 char __user *optval, unsigned int optlen)
2736 {
2737 struct sock *sk = sock->sk;
2738
2739 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2740 }
2741 EXPORT_SYMBOL(sock_common_setsockopt);
2742
2743 #ifdef CONFIG_COMPAT
2744 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2745 char __user *optval, unsigned int optlen)
2746 {
2747 struct sock *sk = sock->sk;
2748
2749 if (sk->sk_prot->compat_setsockopt != NULL)
2750 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2751 optval, optlen);
2752 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2753 }
2754 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2755 #endif
2756
2757 void sk_common_release(struct sock *sk)
2758 {
2759 if (sk->sk_prot->destroy)
2760 sk->sk_prot->destroy(sk);
2761
2762 /*
2763 * Observation: when sk_common_release() is called, processes no longer
2764 * have access to the socket, but the network stack still does.
2765 * Step one, detach it from networking:
2766 *
2767 * A. Remove from hash tables.
2768 */
2769
2770 sk->sk_prot->unhash(sk);
2771
2772 /*
2773 * At this point the socket cannot receive new packets, but it is possible
2774 * that some packets are still in flight because a CPU running the receiver
2775 * did its hash table lookup before we unhashed the socket. They will reach
2776 * the receive queue and be purged by the socket destructor.
2777 *
2778 * We also still have packets pending on the receive queue and, probably,
2779 * our own packets waiting in device queues. The socket destructor will drain
2780 * the receive queue, but transmitted packets will delay socket destruction
2781 * until the last reference is released.
2782 */
2783
2784 sock_orphan(sk);
2785
2786 xfrm_sk_free_policy(sk);
2787
2788 sk_refcnt_debug_release(sk);
2789
2790 sock_put(sk);
2791 }
2792 EXPORT_SYMBOL(sk_common_release);
2793
2794 #ifdef CONFIG_PROC_FS
2795 #define PROTO_INUSE_NR 64 /* should be enough for the first time */
2796 struct prot_inuse {
2797 int val[PROTO_INUSE_NR];
2798 };
2799
2800 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2801
2802 #ifdef CONFIG_NET_NS
2803 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2804 {
2805 __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2806 }
2807 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2808
2809 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2810 {
2811 int cpu, idx = prot->inuse_idx;
2812 int res = 0;
2813
2814 for_each_possible_cpu(cpu)
2815 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2816
2817 return res >= 0 ? res : 0;
2818 }
2819 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2820
2821 static int __net_init sock_inuse_init_net(struct net *net)
2822 {
2823 net->core.inuse = alloc_percpu(struct prot_inuse);
2824 return net->core.inuse ? 0 : -ENOMEM;
2825 }
2826
2827 static void __net_exit sock_inuse_exit_net(struct net *net)
2828 {
2829 free_percpu(net->core.inuse);
2830 }
2831
2832 static struct pernet_operations net_inuse_ops = {
2833 .init = sock_inuse_init_net,
2834 .exit = sock_inuse_exit_net,
2835 };
2836
2837 static __init int net_inuse_init(void)
2838 {
2839 if (register_pernet_subsys(&net_inuse_ops))
2840 panic("Cannot initialize net inuse counters");
2841
2842 return 0;
2843 }
2844
2845 core_initcall(net_inuse_init);
2846 #else
2847 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2848
2849 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2850 {
2851 __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2852 }
2853 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2854
2855 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2856 {
2857 int cpu, idx = prot->inuse_idx;
2858 int res = 0;
2859
2860 for_each_possible_cpu(cpu)
2861 res += per_cpu(prot_inuse, cpu).val[idx];
2862
2863 return res >= 0 ? res : 0;
2864 }
2865 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2866 #endif
2867
2868 static void assign_proto_idx(struct proto *prot)
2869 {
2870 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2871
2872 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2873 pr_err("PROTO_INUSE_NR exhausted\n");
2874 return;
2875 }
2876
2877 set_bit(prot->inuse_idx, proto_inuse_idx);
2878 }
2879
2880 static void release_proto_idx(struct proto *prot)
2881 {
2882 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2883 clear_bit(prot->inuse_idx, proto_inuse_idx);
2884 }
2885 #else
2886 static inline void assign_proto_idx(struct proto *prot)
2887 {
2888 }
2889
2890 static inline void release_proto_idx(struct proto *prot)
2891 {
2892 }
2893 #endif
2894
2895 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2896 {
2897 if (!rsk_prot)
2898 return;
2899 kfree(rsk_prot->slab_name);
2900 rsk_prot->slab_name = NULL;
2901 kmem_cache_destroy(rsk_prot->slab);
2902 rsk_prot->slab = NULL;
2903 }
2904
2905 static int req_prot_init(const struct proto *prot)
2906 {
2907 struct request_sock_ops *rsk_prot = prot->rsk_prot;
2908
2909 if (!rsk_prot)
2910 return 0;
2911
2912 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2913 prot->name);
2914 if (!rsk_prot->slab_name)
2915 return -ENOMEM;
2916
2917 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2918 rsk_prot->obj_size, 0,
2919 prot->slab_flags, NULL);
2920
2921 if (!rsk_prot->slab) {
2922 pr_crit("%s: Can't create request sock SLAB cache!\n",
2923 prot->name);
2924 return -ENOMEM;
2925 }
2926 return 0;
2927 }
2928
2929 int proto_register(struct proto *prot, int alloc_slab)
2930 {
2931 if (alloc_slab) {
2932 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2933 SLAB_HWCACHE_ALIGN | prot->slab_flags,
2934 NULL);
2935
2936 if (prot->slab == NULL) {
2937 pr_crit("%s: Can't create sock SLAB cache!\n",
2938 prot->name);
2939 goto out;
2940 }
2941
2942 if (req_prot_init(prot))
2943 goto out_free_request_sock_slab;
2944
2945 if (prot->twsk_prot != NULL) {
2946 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2947
2948 if (prot->twsk_prot->twsk_slab_name == NULL)
2949 goto out_free_request_sock_slab;
2950
2951 prot->twsk_prot->twsk_slab =
2952 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2953 prot->twsk_prot->twsk_obj_size,
2954 0,
2955 prot->slab_flags,
2956 NULL);
2957 if (prot->twsk_prot->twsk_slab == NULL)
2958 goto out_free_timewait_sock_slab_name;
2959 }
2960 }
2961
2962 mutex_lock(&proto_list_mutex);
2963 list_add(&prot->node, &proto_list);
2964 assign_proto_idx(prot);
2965 mutex_unlock(&proto_list_mutex);
2966 return 0;
2967
2968 out_free_timewait_sock_slab_name:
2969 kfree(prot->twsk_prot->twsk_slab_name);
2970 out_free_request_sock_slab:
2971 req_prot_cleanup(prot->rsk_prot);
2972
2973 kmem_cache_destroy(prot->slab);
2974 prot->slab = NULL;
2975 out:
2976 return -ENOBUFS;
2977 }
2978 EXPORT_SYMBOL(proto_register);
2979
2980 void proto_unregister(struct proto *prot)
2981 {
2982 mutex_lock(&proto_list_mutex);
2983 release_proto_idx(prot);
2984 list_del(&prot->node);
2985 mutex_unlock(&proto_list_mutex);
2986
2987 kmem_cache_destroy(prot->slab);
2988 prot->slab = NULL;
2989
2990 req_prot_cleanup(prot->rsk_prot);
2991
2992 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2993 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2994 kfree(prot->twsk_prot->twsk_slab_name);
2995 prot->twsk_prot->twsk_slab = NULL;
2996 }
2997 }
2998 EXPORT_SYMBOL(proto_unregister);
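/*
 * Illustrative sketch (not part of sock.c): the usual module init/exit
 * pairing around proto_register()/proto_unregister(). The proto
 * definition and function names are hypothetical; passing alloc_slab=1
 * asks proto_register() to create the per-protocol sock slab cache.
 */
static struct proto example_registered_proto = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),
};

static int __init example_proto_init(void)
{
	return proto_register(&example_registered_proto, 1);
}

static void __exit example_proto_exit(void)
{
	proto_unregister(&example_registered_proto);
}

/* A loadable module would then use:
 *	module_init(example_proto_init);
 *	module_exit(example_proto_exit);
 */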
2999
3000 #ifdef CONFIG_PROC_FS
3001 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3002 __acquires(proto_list_mutex)
3003 {
3004 mutex_lock(&proto_list_mutex);
3005 return seq_list_start_head(&proto_list, *pos);
3006 }
3007
3008 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3009 {
3010 return seq_list_next(v, &proto_list, pos);
3011 }
3012
3013 static void proto_seq_stop(struct seq_file *seq, void *v)
3014 __releases(proto_list_mutex)
3015 {
3016 mutex_unlock(&proto_list_mutex);
3017 }
3018
3019 static char proto_method_implemented(const void *method)
3020 {
3021 return method == NULL ? 'n' : 'y';
3022 }
3023 static long sock_prot_memory_allocated(struct proto *proto)
3024 {
3025 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3026 }
3027
3028 static char *sock_prot_memory_pressure(struct proto *proto)
3029 {
3030 return proto->memory_pressure != NULL ?
3031 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3032 }
3033
3034 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3035 {
3036
3037 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
3038 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3039 proto->name,
3040 proto->obj_size,
3041 sock_prot_inuse_get(seq_file_net(seq), proto),
3042 sock_prot_memory_allocated(proto),
3043 sock_prot_memory_pressure(proto),
3044 proto->max_header,
3045 proto->slab == NULL ? "no" : "yes",
3046 module_name(proto->owner),
3047 proto_method_implemented(proto->close),
3048 proto_method_implemented(proto->connect),
3049 proto_method_implemented(proto->disconnect),
3050 proto_method_implemented(proto->accept),
3051 proto_method_implemented(proto->ioctl),
3052 proto_method_implemented(proto->init),
3053 proto_method_implemented(proto->destroy),
3054 proto_method_implemented(proto->shutdown),
3055 proto_method_implemented(proto->setsockopt),
3056 proto_method_implemented(proto->getsockopt),
3057 proto_method_implemented(proto->sendmsg),
3058 proto_method_implemented(proto->recvmsg),
3059 proto_method_implemented(proto->sendpage),
3060 proto_method_implemented(proto->bind),
3061 proto_method_implemented(proto->backlog_rcv),
3062 proto_method_implemented(proto->hash),
3063 proto_method_implemented(proto->unhash),
3064 proto_method_implemented(proto->get_port),
3065 proto_method_implemented(proto->enter_memory_pressure));
3066 }
3067
3068 static int proto_seq_show(struct seq_file *seq, void *v)
3069 {
3070 if (v == &proto_list)
3071 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3072 "protocol",
3073 "size",
3074 "sockets",
3075 "memory",
3076 "press",
3077 "maxhdr",
3078 "slab",
3079 "module",
3080 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3081 else
3082 proto_seq_printf(seq, list_entry(v, struct proto, node));
3083 return 0;
3084 }
3085
3086 static const struct seq_operations proto_seq_ops = {
3087 .start = proto_seq_start,
3088 .next = proto_seq_next,
3089 .stop = proto_seq_stop,
3090 .show = proto_seq_show,
3091 };
3092
3093 static int proto_seq_open(struct inode *inode, struct file *file)
3094 {
3095 return seq_open_net(inode, file, &proto_seq_ops,
3096 sizeof(struct seq_net_private));
3097 }
3098
3099 static const struct file_operations proto_seq_fops = {
3100 .owner = THIS_MODULE,
3101 .open = proto_seq_open,
3102 .read = seq_read,
3103 .llseek = seq_lseek,
3104 .release = seq_release_net,
3105 };
3106
3107 static __net_init int proto_init_net(struct net *net)
3108 {
3109 if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3110 return -ENOMEM;
3111
3112 return 0;
3113 }
3114
3115 static __net_exit void proto_exit_net(struct net *net)
3116 {
3117 remove_proc_entry("protocols", net->proc_net);
3118 }
3119
3120
3121 static __net_initdata struct pernet_operations proto_net_ops = {
3122 .init = proto_init_net,
3123 .exit = proto_exit_net,
3124 };
3125
3126 static int __init proto_init(void)
3127 {
3128 return register_pernet_subsys(&proto_net_ops);
3129 }
3130
3131 subsys_initcall(proto_init);
3132
3133 #endif /* PROC_FS */