net/core/sock.c
2874c5fd 1// SPDX-License-Identifier: GPL-2.0-or-later
1da177e4
LT
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Generic socket support routines. Memory allocators, socket lock/release
8 * handler for protocols to use and generic option handler.
9 *
02c30a84 10 * Authors: Ross Biro
1da177e4
LT
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
4ec93edb 35 * code. The ACK stuff can wait and needs major
1da177e4
LT
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
1da177e4
LT
84 */
85
e005d193
JP
86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87
80b14dee 88#include <asm/unaligned.h>
4fc268d2 89#include <linux/capability.h>
1da177e4 90#include <linux/errno.h>
cb820f8e 91#include <linux/errqueue.h>
1da177e4
LT
92#include <linux/types.h>
93#include <linux/socket.h>
94#include <linux/in.h>
95#include <linux/kernel.h>
1da177e4
LT
96#include <linux/module.h>
97#include <linux/proc_fs.h>
98#include <linux/seq_file.h>
99#include <linux/sched.h>
f1083048 100#include <linux/sched/mm.h>
1da177e4
LT
101#include <linux/timer.h>
102#include <linux/string.h>
103#include <linux/sockios.h>
104#include <linux/net.h>
105#include <linux/mm.h>
106#include <linux/slab.h>
107#include <linux/interrupt.h>
108#include <linux/poll.h>
109#include <linux/tcp.h>
110#include <linux/init.h>
a1f8e7f7 111#include <linux/highmem.h>
3f551f94 112#include <linux/user_namespace.h>
c5905afb 113#include <linux/static_key.h>
3969eb38 114#include <linux/memcontrol.h>
8c1ae10d 115#include <linux/prefetch.h>
1da177e4 116
7c0f6ba6 117#include <linux/uaccess.h>
1da177e4
LT
118
119#include <linux/netdevice.h>
120#include <net/protocol.h>
121#include <linux/skbuff.h>
457c4cbc 122#include <net/net_namespace.h>
2e6599cb 123#include <net/request_sock.h>
1da177e4 124#include <net/sock.h>
20d49473 125#include <linux/net_tstamp.h>
1da177e4
LT
126#include <net/xfrm.h>
127#include <linux/ipsec.h>
f8451725 128#include <net/cls_cgroup.h>
5bc1421e 129#include <net/netprio_cgroup.h>
eb4cb008 130#include <linux/sock_diag.h>
1da177e4
LT
131
132#include <linux/filter.h>
538950a1 133#include <net/sock_reuseport.h>
6ac99e8f 134#include <net/bpf_sk_storage.h>
1da177e4 135
3847ce32
SM
136#include <trace/events/sock.h>
137
1da177e4 138#include <net/tcp.h>
076bb0c8 139#include <net/busy_poll.h>
06021292 140
36b77a52 141static DEFINE_MUTEX(proto_list_mutex);
d1a4c0b3
GC
142static LIST_HEAD(proto_list);
143
648845ab
TZ
144static void sock_inuse_add(struct net *net, int val);
145
a3b299da
EB
146/**
147 * sk_ns_capable - General socket capability test
148 * @sk: Socket to use a capability on or through
149 * @user_ns: The user namespace of the capability to use
150 * @cap: The capability to use
151 *
 152 * Test to see if the opener of the socket had the capability @cap in the
 153 * user namespace @user_ns when the socket was created, and that the
 154 * current process has it as well.
155 */
156bool sk_ns_capable(const struct sock *sk,
157 struct user_namespace *user_ns, int cap)
158{
159 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
160 ns_capable(user_ns, cap);
161}
162EXPORT_SYMBOL(sk_ns_capable);
163
164/**
165 * sk_capable - Socket global capability test
166 * @sk: Socket to use a capability on or through
e793c0f7 167 * @cap: The global capability to use
a3b299da
EB
168 *
 169 * Test to see if the opener of the socket had the capability @cap in all
 170 * user namespaces when the socket was created, and that the current
 171 * process has it as well.
172 */
173bool sk_capable(const struct sock *sk, int cap)
174{
175 return sk_ns_capable(sk, &init_user_ns, cap);
176}
177EXPORT_SYMBOL(sk_capable);
178
179/**
180 * sk_net_capable - Network namespace socket capability test
181 * @sk: Socket to use a capability on or through
182 * @cap: The capability to use
183 *
e793c0f7 184 * Test to see if the opener of the socket had the capability @cap over
a3b299da
EB
 185 * the network namespace the socket is a member of when the socket was
 186 * created, and that the current process has it as well.
187 */
188bool sk_net_capable(const struct sock *sk, int cap)
189{
190 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
191}
192EXPORT_SYMBOL(sk_net_capable);
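/*
 * Illustrative sketch, not part of sock.c: a protocol-private setsockopt
 * handler might gate a privileged option with sk_net_capable(), which checks
 * both the socket opener's and the current task's capability over the
 * socket's network namespace.  The handler name and the "apply the setting"
 * step below are hypothetical.
 */
static int example_set_privileged_option(struct sock *sk, int val)
{
	if (!sk_net_capable(sk, CAP_NET_ADMIN))
		return -EPERM;

	/* ... apply the privileged per-socket setting derived from val ... */
	return 0;
}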
193
da21f24d
IM
194/*
195 * Each address family might have different locking rules, so we have
cdfbabfb
DH
196 * one slock key per address family and separate keys for internal and
197 * userspace sockets.
da21f24d 198 */
a5b5bb9a 199static struct lock_class_key af_family_keys[AF_MAX];
cdfbabfb 200static struct lock_class_key af_family_kern_keys[AF_MAX];
a5b5bb9a 201static struct lock_class_key af_family_slock_keys[AF_MAX];
cdfbabfb 202static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
a5b5bb9a 203
a5b5bb9a
IM
204/*
205 * Make lock validator output more readable. (we pre-construct these
206 * strings build-time, so that runtime initialization of socket
207 * locks is fast):
208 */
cdfbabfb
DH
209
210#define _sock_locks(x) \
211 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
212 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
213 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
214 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
215 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
216 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
217 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
218 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
219 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
220 x "27" , x "28" , x "AF_CAN" , \
221 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
222 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
223 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
224 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
68e8b849
BT
225 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
226 x "AF_MAX"
cdfbabfb 227
36cbd3dc 228static const char *const af_family_key_strings[AF_MAX+1] = {
cdfbabfb 229 _sock_locks("sk_lock-")
a5b5bb9a 230};
36cbd3dc 231static const char *const af_family_slock_key_strings[AF_MAX+1] = {
cdfbabfb 232 _sock_locks("slock-")
a5b5bb9a 233};
36cbd3dc 234static const char *const af_family_clock_key_strings[AF_MAX+1] = {
cdfbabfb
DH
235 _sock_locks("clock-")
236};
237
238static const char *const af_family_kern_key_strings[AF_MAX+1] = {
239 _sock_locks("k-sk_lock-")
240};
241static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
242 _sock_locks("k-slock-")
243};
244static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
245 _sock_locks("k-clock-")
443aef0e 246};
581319c5 247static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
6b431d50 248 _sock_locks("rlock-")
581319c5
PA
249};
250static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
6b431d50 251 _sock_locks("wlock-")
581319c5
PA
252};
253static const char *const af_family_elock_key_strings[AF_MAX+1] = {
6b431d50 254 _sock_locks("elock-")
581319c5 255};
da21f24d
IM
256
257/*
581319c5 258 * sk_callback_lock and sk queues locking rules are per-address-family,
da21f24d
IM
259 * so split the lock classes by using a per-AF key:
260 */
261static struct lock_class_key af_callback_keys[AF_MAX];
581319c5
PA
262static struct lock_class_key af_rlock_keys[AF_MAX];
263static struct lock_class_key af_wlock_keys[AF_MAX];
264static struct lock_class_key af_elock_keys[AF_MAX];
cdfbabfb 265static struct lock_class_key af_kern_callback_keys[AF_MAX];
da21f24d 266
1da177e4 267/* Run time adjustable parameters. */
ab32ea5d 268__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
6d8ebc8a 269EXPORT_SYMBOL(sysctl_wmem_max);
ab32ea5d 270__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
6d8ebc8a 271EXPORT_SYMBOL(sysctl_rmem_max);
ab32ea5d
BH
272__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
273__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
1da177e4 274
25985edc 275/* Maximal space eaten by iovec or ancillary data plus some space */
ab32ea5d 276int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
2a91525c 277EXPORT_SYMBOL(sysctl_optmem_max);
1da177e4 278
b245be1f
WB
279int sysctl_tstamp_allow_data __read_mostly = 1;
280
a7950ae8
DB
281DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
282EXPORT_SYMBOL_GPL(memalloc_socks_key);
c93bdd0e 283
7cb02404
MG
284/**
285 * sk_set_memalloc - sets %SOCK_MEMALLOC
286 * @sk: socket to set it on
287 *
288 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
289 * It's the responsibility of the admin to adjust min_free_kbytes
 290 * to meet the requirements.
291 */
292void sk_set_memalloc(struct sock *sk)
293{
294 sock_set_flag(sk, SOCK_MEMALLOC);
295 sk->sk_allocation |= __GFP_MEMALLOC;
a7950ae8 296 static_branch_inc(&memalloc_socks_key);
7cb02404
MG
297}
298EXPORT_SYMBOL_GPL(sk_set_memalloc);
299
300void sk_clear_memalloc(struct sock *sk)
301{
302 sock_reset_flag(sk, SOCK_MEMALLOC);
303 sk->sk_allocation &= ~__GFP_MEMALLOC;
a7950ae8 304 static_branch_dec(&memalloc_socks_key);
c76562b6
MG
305
306 /*
307 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
5d753610
MG
308 * progress of swapping. SOCK_MEMALLOC may be cleared while
309 * it has rmem allocations due to the last swapfile being deactivated
310 * but there is a risk that the socket is unusable due to exceeding
311 * the rmem limits. Reclaim the reserves and obey rmem limits again.
c76562b6 312 */
5d753610 313 sk_mem_reclaim(sk);
7cb02404
MG
314}
315EXPORT_SYMBOL_GPL(sk_clear_memalloc);
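/*
 * Illustrative sketch, not part of sock.c: a kernel-side user such as a
 * swap-over-network transport would mark its transport socket with
 * sk_set_memalloc() so allocations for it may dip into the emergency
 * reserves, and would clear the flag again when the swap device goes away.
 * The two wrapper names below are hypothetical.
 */
static void example_transport_enable_swap(struct sock *sk)
{
	lock_sock(sk);
	sk_set_memalloc(sk);	/* sets SOCK_MEMALLOC and __GFP_MEMALLOC */
	release_sock(sk);
}

static void example_transport_disable_swap(struct sock *sk)
{
	lock_sock(sk);
	sk_clear_memalloc(sk);	/* reclaims reserves, obeys rmem limits again */
	release_sock(sk);
}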
316
b4b9e355
MG
317int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
318{
319 int ret;
f1083048 320 unsigned int noreclaim_flag;
b4b9e355
MG
321
322 /* these should have been dropped before queueing */
323 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
324
f1083048 325 noreclaim_flag = memalloc_noreclaim_save();
b4b9e355 326 ret = sk->sk_backlog_rcv(sk, skb);
f1083048 327 memalloc_noreclaim_restore(noreclaim_flag);
b4b9e355
MG
328
329 return ret;
330}
331EXPORT_SYMBOL(__sk_backlog_rcv);
332
a9beb86a 333static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
fe0c72f3 334{
a9beb86a
DD
335 struct __kernel_sock_timeval tv;
336 int size;
fe0c72f3
AB
337
338 if (timeo == MAX_SCHEDULE_TIMEOUT) {
339 tv.tv_sec = 0;
340 tv.tv_usec = 0;
341 } else {
342 tv.tv_sec = timeo / HZ;
343 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
344 }
345
e6986423 346 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
fe0c72f3
AB
347 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
348 *(struct old_timeval32 *)optval = tv32;
349 return sizeof(tv32);
350 }
351
a9beb86a
DD
352 if (old_timeval) {
353 struct __kernel_old_timeval old_tv;
354 old_tv.tv_sec = tv.tv_sec;
355 old_tv.tv_usec = tv.tv_usec;
356 *(struct __kernel_old_timeval *)optval = old_tv;
357 size = sizeof(old_tv);
358 } else {
359 *(struct __kernel_sock_timeval *)optval = tv;
360 size = sizeof(tv);
361 }
362
363 return size;
fe0c72f3
AB
364}
365
a9beb86a 366static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval)
1da177e4 367{
a9beb86a 368 struct __kernel_sock_timeval tv;
1da177e4 369
e6986423 370 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
fe0c72f3
AB
371 struct old_timeval32 tv32;
372
373 if (optlen < sizeof(tv32))
374 return -EINVAL;
375
376 if (copy_from_user(&tv32, optval, sizeof(tv32)))
377 return -EFAULT;
378 tv.tv_sec = tv32.tv_sec;
379 tv.tv_usec = tv32.tv_usec;
a9beb86a
DD
380 } else if (old_timeval) {
381 struct __kernel_old_timeval old_tv;
382
383 if (optlen < sizeof(old_tv))
384 return -EINVAL;
385 if (copy_from_user(&old_tv, optval, sizeof(old_tv)))
386 return -EFAULT;
387 tv.tv_sec = old_tv.tv_sec;
388 tv.tv_usec = old_tv.tv_usec;
fe0c72f3
AB
389 } else {
390 if (optlen < sizeof(tv))
391 return -EINVAL;
392 if (copy_from_user(&tv, optval, sizeof(tv)))
393 return -EFAULT;
394 }
ba78073e
VA
395 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
396 return -EDOM;
1da177e4 397
ba78073e 398 if (tv.tv_sec < 0) {
6f11df83
AM
399 static int warned __read_mostly;
400
ba78073e 401 *timeo_p = 0;
50aab54f 402 if (warned < 10 && net_ratelimit()) {
ba78073e 403 warned++;
e005d193
JP
404 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
405 __func__, current->comm, task_pid_nr(current));
50aab54f 406 }
ba78073e
VA
407 return 0;
408 }
1da177e4
LT
409 *timeo_p = MAX_SCHEDULE_TIMEOUT;
410 if (tv.tv_sec == 0 && tv.tv_usec == 0)
411 return 0;
a9beb86a
DD
412 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
413 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
1da177e4
LT
414 return 0;
415}
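/*
 * Illustrative sketch (standalone userspace code, not part of sock.c): the
 * timeout parsed by sock_set_timeout() above arrives via setsockopt() with
 * SO_RCVTIMEO/SO_SNDTIMEO as a struct timeval.  A zero timeval means "wait
 * forever", a tv_usec outside [0, 1000000) is rejected with EDOM, and a
 * negative tv_sec is treated as a zero timeout.
 */
#include <sys/socket.h>
#include <sys/time.h>

static int example_set_recv_timeout(int fd, long seconds)
{
	struct timeval tv = { .tv_sec = seconds, .tv_usec = 0 };

	/* { 0, 0 } disables the timeout entirely */
	return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
}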
416
417static void sock_warn_obsolete_bsdism(const char *name)
418{
419 static int warned;
420 static char warncomm[TASK_COMM_LEN];
4ec93edb
YH
421 if (strcmp(warncomm, current->comm) && warned < 5) {
422 strcpy(warncomm, current->comm);
e005d193
JP
423 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
424 warncomm, name);
1da177e4
LT
425 warned++;
426 }
427}
428
080a270f
HFS
429static bool sock_needs_netstamp(const struct sock *sk)
430{
431 switch (sk->sk_family) {
432 case AF_UNSPEC:
433 case AF_UNIX:
434 return false;
435 default:
436 return true;
437 }
438}
439
08e29af3 440static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
4ec93edb 441{
08e29af3
ED
442 if (sk->sk_flags & flags) {
443 sk->sk_flags &= ~flags;
080a270f
HFS
444 if (sock_needs_netstamp(sk) &&
445 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
20d49473 446 net_disable_timestamp();
1da177e4
LT
447 }
448}
449
450
e6afc8ac 451int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
f0088a50 452{
3b885787
NH
453 unsigned long flags;
454 struct sk_buff_head *list = &sk->sk_receive_queue;
f0088a50 455
0fd7bac6 456 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
766e9037 457 atomic_inc(&sk->sk_drops);
3847ce32 458 trace_sock_rcvqueue_full(sk, skb);
766e9037 459 return -ENOMEM;
f0088a50
DV
460 }
461
c76562b6 462 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
766e9037
ED
463 atomic_inc(&sk->sk_drops);
464 return -ENOBUFS;
3ab224be
HA
465 }
466
f0088a50
DV
467 skb->dev = NULL;
468 skb_set_owner_r(skb, sk);
49ad9599 469
7fee226a
ED
 470 /* we escape from the RCU protected region, make sure we don't leak
 471 * a non-refcounted dst
472 */
473 skb_dst_force(skb);
474
3b885787 475 spin_lock_irqsave(&list->lock, flags);
3bc3b96f 476 sock_skb_set_dropcount(sk, skb);
3b885787
NH
477 __skb_queue_tail(list, skb);
478 spin_unlock_irqrestore(&list->lock, flags);
f0088a50
DV
479
480 if (!sock_flag(sk, SOCK_DEAD))
676d2369 481 sk->sk_data_ready(sk);
766e9037 482 return 0;
f0088a50 483}
e6afc8ac 484EXPORT_SYMBOL(__sock_queue_rcv_skb);
485
486int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
487{
488 int err;
489
490 err = sk_filter(sk, skb);
491 if (err)
492 return err;
493
494 return __sock_queue_rcv_skb(sk, skb);
495}
f0088a50
DV
496EXPORT_SYMBOL(sock_queue_rcv_skb);
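/*
 * Illustrative sketch, not part of sock.c: a simple datagram protocol's
 * receive handler would pass each matched skb to sock_queue_rcv_skb() and
 * drop it on failure (the socket filter rejected it, the receive buffer is
 * full, or memory accounting failed).  The handler name is hypothetical.
 */
static int example_proto_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (sock_queue_rcv_skb(sk, skb) < 0) {
		/* filtered out, rcvbuf full, or no memory: drop the packet */
		kfree_skb(skb);
		return NET_RX_DROP;
	}
	return NET_RX_SUCCESS;
}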
497
4f0c40d9 498int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
c3f24cfb 499 const int nested, unsigned int trim_cap, bool refcounted)
f0088a50
DV
500{
501 int rc = NET_RX_SUCCESS;
502
4f0c40d9 503 if (sk_filter_trim_cap(sk, skb, trim_cap))
f0088a50
DV
504 goto discard_and_relse;
505
506 skb->dev = NULL;
507
274f482d 508 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
c377411f
ED
509 atomic_inc(&sk->sk_drops);
510 goto discard_and_relse;
511 }
58a5a7b9
ACM
512 if (nested)
513 bh_lock_sock_nested(sk);
514 else
515 bh_lock_sock(sk);
a5b5bb9a
IM
516 if (!sock_owned_by_user(sk)) {
517 /*
518 * trylock + unlock semantics:
519 */
520 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
521
c57943a1 522 rc = sk_backlog_rcv(sk, skb);
a5b5bb9a
IM
523
524 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
f545a38f 525 } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
8eae939f
ZY
526 bh_unlock_sock(sk);
527 atomic_inc(&sk->sk_drops);
528 goto discard_and_relse;
529 }
530
f0088a50
DV
531 bh_unlock_sock(sk);
532out:
c3f24cfb
ED
533 if (refcounted)
534 sock_put(sk);
f0088a50
DV
535 return rc;
536discard_and_relse:
537 kfree_skb(skb);
538 goto out;
539}
4f0c40d9 540EXPORT_SYMBOL(__sk_receive_skb);
f0088a50
DV
541
542struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
543{
b6c6712a 544 struct dst_entry *dst = __sk_dst_get(sk);
f0088a50
DV
545
546 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
e022f0b4 547 sk_tx_queue_clear(sk);
9b8805a3 548 sk->sk_dst_pending_confirm = 0;
a9b3cd7f 549 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
f0088a50
DV
550 dst_release(dst);
551 return NULL;
552 }
553
554 return dst;
555}
556EXPORT_SYMBOL(__sk_dst_check);
557
558struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
559{
560 struct dst_entry *dst = sk_dst_get(sk);
561
562 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
563 sk_dst_reset(sk);
564 dst_release(dst);
565 return NULL;
566 }
567
568 return dst;
569}
570EXPORT_SYMBOL(sk_dst_check);
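/*
 * Illustrative sketch, not part of sock.c: before transmitting, a connected
 * protocol revalidates its cached route with sk_dst_check(); a NULL return
 * means the cached dst was obsolete and has already been released, so a
 * fresh route lookup is needed.  The helper name is hypothetical.
 */
static void example_revalidate_route(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_check(sk, cookie);

	if (!dst) {
		/* ... do a protocol-specific route lookup here and install
		 * the result with sk_dst_set(sk, new_dst) ...
		 */
		return;
	}

	/* dst is valid and referenced at this point */
	dst_release(dst);
}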
571
f5dd3d0c 572static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
4878809f
DM
573{
574 int ret = -ENOPROTOOPT;
575#ifdef CONFIG_NETDEVICES
3b1e0a65 576 struct net *net = sock_net(sk);
4878809f
DM
577
578 /* Sorry... */
579 ret = -EPERM;
5e1fccc0 580 if (!ns_capable(net->user_ns, CAP_NET_RAW))
4878809f
DM
581 goto out;
582
f5dd3d0c
DH
583 ret = -EINVAL;
584 if (ifindex < 0)
585 goto out;
586
587 sk->sk_bound_dev_if = ifindex;
588 if (sk->sk_prot->rehash)
589 sk->sk_prot->rehash(sk);
590 sk_dst_reset(sk);
591
592 ret = 0;
593
594out:
595#endif
596
597 return ret;
598}
599
600static int sock_setbindtodevice(struct sock *sk, char __user *optval,
601 int optlen)
602{
603 int ret = -ENOPROTOOPT;
604#ifdef CONFIG_NETDEVICES
605 struct net *net = sock_net(sk);
606 char devname[IFNAMSIZ];
607 int index;
608
4878809f
DM
609 ret = -EINVAL;
610 if (optlen < 0)
611 goto out;
612
613 /* Bind this socket to a particular device like "eth0",
614 * as specified in the passed interface name. If the
615 * name is "" or the option length is zero the socket
616 * is not bound.
617 */
618 if (optlen > IFNAMSIZ - 1)
619 optlen = IFNAMSIZ - 1;
620 memset(devname, 0, sizeof(devname));
621
622 ret = -EFAULT;
623 if (copy_from_user(devname, optval, optlen))
624 goto out;
625
000ba2e4
DM
626 index = 0;
627 if (devname[0] != '\0') {
bf8e56bf 628 struct net_device *dev;
4878809f 629
bf8e56bf
ED
630 rcu_read_lock();
631 dev = dev_get_by_name_rcu(net, devname);
632 if (dev)
633 index = dev->ifindex;
634 rcu_read_unlock();
4878809f
DM
635 ret = -ENODEV;
636 if (!dev)
637 goto out;
4878809f
DM
638 }
639
640 lock_sock(sk);
f5dd3d0c 641 ret = sock_setbindtodevice_locked(sk, index);
4878809f
DM
642 release_sock(sk);
643
4878809f
DM
644out:
645#endif
646
647 return ret;
648}
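/*
 * Illustrative sketch (standalone userspace code, not part of sock.c): the
 * kernel side above receives the interface name from a plain
 * setsockopt(SO_BINDTODEVICE) call; an empty name removes the binding, and
 * CAP_NET_RAW is required.
 */
#include <string.h>
#include <sys/socket.h>

static int example_bind_to_device(int fd, const char *ifname)
{
	/* passing "" (length 0) unbinds the socket from any device */
	return setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
			  ifname, strlen(ifname));
}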
649
c91f6df2
BH
650static int sock_getbindtodevice(struct sock *sk, char __user *optval,
651 int __user *optlen, int len)
652{
653 int ret = -ENOPROTOOPT;
654#ifdef CONFIG_NETDEVICES
655 struct net *net = sock_net(sk);
c91f6df2 656 char devname[IFNAMSIZ];
c91f6df2
BH
657
658 if (sk->sk_bound_dev_if == 0) {
659 len = 0;
660 goto zero;
661 }
662
663 ret = -EINVAL;
664 if (len < IFNAMSIZ)
665 goto out;
666
5dbe7c17
NS
667 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
668 if (ret)
c91f6df2 669 goto out;
c91f6df2
BH
670
671 len = strlen(devname) + 1;
672
673 ret = -EFAULT;
674 if (copy_to_user(optval, devname, len))
675 goto out;
676
677zero:
678 ret = -EFAULT;
679 if (put_user(len, optlen))
680 goto out;
681
682 ret = 0;
683
684out:
685#endif
686
687 return ret;
688}
689
c0ef877b
PE
690static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
691{
692 if (valbool)
693 sock_set_flag(sk, bit);
694 else
695 sock_reset_flag(sk, bit);
696}
697
f60e5990 698bool sk_mc_loop(struct sock *sk)
699{
700 if (dev_recursion_level())
701 return false;
702 if (!sk)
703 return true;
704 switch (sk->sk_family) {
705 case AF_INET:
706 return inet_sk(sk)->mc_loop;
707#if IS_ENABLED(CONFIG_IPV6)
708 case AF_INET6:
709 return inet6_sk(sk)->mc_loop;
710#endif
711 }
712 WARN_ON(1);
713 return true;
714}
715EXPORT_SYMBOL(sk_mc_loop);
716
1da177e4
LT
717/*
718 * This is meant for all protocols to use and covers goings on
719 * at the socket level. Everything here is generic.
720 */
721
722int sock_setsockopt(struct socket *sock, int level, int optname,
b7058842 723 char __user *optval, unsigned int optlen)
1da177e4 724{
80b14dee 725 struct sock_txtime sk_txtime;
2a91525c 726 struct sock *sk = sock->sk;
1da177e4
LT
727 int val;
728 int valbool;
729 struct linger ling;
730 int ret = 0;
4ec93edb 731
1da177e4
LT
732 /*
733 * Options without arguments
734 */
735
4878809f 736 if (optname == SO_BINDTODEVICE)
c91f6df2 737 return sock_setbindtodevice(sk, optval, optlen);
4878809f 738
e71a4783
SH
739 if (optlen < sizeof(int))
740 return -EINVAL;
4ec93edb 741
1da177e4
LT
742 if (get_user(val, (int __user *)optval))
743 return -EFAULT;
4ec93edb 744
2a91525c 745 valbool = val ? 1 : 0;
1da177e4
LT
746
747 lock_sock(sk);
748
2a91525c 749 switch (optname) {
e71a4783 750 case SO_DEBUG:
2a91525c 751 if (val && !capable(CAP_NET_ADMIN))
e71a4783 752 ret = -EACCES;
2a91525c 753 else
c0ef877b 754 sock_valbool_flag(sk, SOCK_DBG, valbool);
e71a4783
SH
755 break;
756 case SO_REUSEADDR:
cdb8744d 757 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
e71a4783 758 break;
055dc21a
TH
759 case SO_REUSEPORT:
760 sk->sk_reuseport = valbool;
761 break;
e71a4783 762 case SO_TYPE:
49c794e9 763 case SO_PROTOCOL:
0d6038ee 764 case SO_DOMAIN:
e71a4783
SH
765 case SO_ERROR:
766 ret = -ENOPROTOOPT;
767 break;
768 case SO_DONTROUTE:
c0ef877b 769 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
0fbe82e6 770 sk_dst_reset(sk);
e71a4783
SH
771 break;
772 case SO_BROADCAST:
773 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
774 break;
775 case SO_SNDBUF:
 776 /* Don't return an error on this; BSD doesn't, and if you think
82981930
ED
 777 * about it, this is right. Otherwise apps have to
 778 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 779 * are treated in BSD as hints.
 780 */
781 val = min_t(u32, val, sysctl_wmem_max);
b0573dea 782set_sndbuf:
4057765f
GN
783 /* Ensure val * 2 fits into an int, to prevent max_t()
784 * from treating it as a negative value.
785 */
786 val = min_t(int, val, INT_MAX / 2);
e71a4783 787 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
b98b0bc8 788 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
82981930 789 /* Wake up sending tasks if we upped the value. */
e71a4783
SH
790 sk->sk_write_space(sk);
791 break;
1da177e4 792
e71a4783
SH
793 case SO_SNDBUFFORCE:
794 if (!capable(CAP_NET_ADMIN)) {
795 ret = -EPERM;
796 break;
797 }
4057765f
GN
798
799 /* No negative values (to prevent underflow, as val will be
800 * multiplied by 2).
801 */
802 if (val < 0)
803 val = 0;
e71a4783 804 goto set_sndbuf;
b0573dea 805
e71a4783
SH
806 case SO_RCVBUF:
 807 /* Don't return an error on this; BSD doesn't, and if you think
82981930
ED
 808 * about it, this is right. Otherwise apps have to
 809 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 810 * are treated in BSD as hints.
 811 */
812 val = min_t(u32, val, sysctl_rmem_max);
b0573dea 813set_rcvbuf:
4057765f
GN
814 /* Ensure val * 2 fits into an int, to prevent max_t()
815 * from treating it as a negative value.
816 */
817 val = min_t(int, val, INT_MAX / 2);
e71a4783
SH
818 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
819 /*
820 * We double it on the way in to account for
821 * "struct sk_buff" etc. overhead. Applications
822 * assume that the SO_RCVBUF setting they make will
823 * allow that much actual data to be received on that
824 * socket.
825 *
826 * Applications are unaware that "struct sk_buff" and
827 * other overheads allocate from the receive buffer
828 * during socket buffer allocation.
829 *
830 * And after considering the possible alternatives,
831 * returning the value we actually used in getsockopt
832 * is the most desirable behavior.
833 */
b98b0bc8 834 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
e71a4783
SH
835 break;
836
837 case SO_RCVBUFFORCE:
838 if (!capable(CAP_NET_ADMIN)) {
839 ret = -EPERM;
1da177e4 840 break;
e71a4783 841 }
4057765f
GN
842
843 /* No negative values (to prevent underflow, as val will be
844 * multiplied by 2).
845 */
846 if (val < 0)
847 val = 0;
e71a4783 848 goto set_rcvbuf;
1da177e4 849
e71a4783 850 case SO_KEEPALIVE:
4b9d07a4
UB
851 if (sk->sk_prot->keepalive)
852 sk->sk_prot->keepalive(sk, valbool);
e71a4783
SH
853 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
854 break;
855
856 case SO_OOBINLINE:
857 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
858 break;
859
860 case SO_NO_CHECK:
28448b80 861 sk->sk_no_check_tx = valbool;
e71a4783
SH
862 break;
863
864 case SO_PRIORITY:
5e1fccc0
EB
865 if ((val >= 0 && val <= 6) ||
866 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
e71a4783
SH
867 sk->sk_priority = val;
868 else
869 ret = -EPERM;
870 break;
871
872 case SO_LINGER:
873 if (optlen < sizeof(ling)) {
874 ret = -EINVAL; /* 1003.1g */
1da177e4 875 break;
e71a4783 876 }
2a91525c 877 if (copy_from_user(&ling, optval, sizeof(ling))) {
e71a4783 878 ret = -EFAULT;
1da177e4 879 break;
e71a4783
SH
880 }
881 if (!ling.l_onoff)
882 sock_reset_flag(sk, SOCK_LINGER);
883 else {
1da177e4 884#if (BITS_PER_LONG == 32)
e71a4783
SH
885 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
886 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1da177e4 887 else
e71a4783
SH
888#endif
889 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
890 sock_set_flag(sk, SOCK_LINGER);
891 }
892 break;
893
894 case SO_BSDCOMPAT:
895 sock_warn_obsolete_bsdism("setsockopt");
896 break;
897
898 case SO_PASSCRED:
899 if (valbool)
900 set_bit(SOCK_PASSCRED, &sock->flags);
901 else
902 clear_bit(SOCK_PASSCRED, &sock->flags);
903 break;
904
7f1bc6e9 905 case SO_TIMESTAMP_OLD:
887feae3 906 case SO_TIMESTAMP_NEW:
7f1bc6e9 907 case SO_TIMESTAMPNS_OLD:
887feae3 908 case SO_TIMESTAMPNS_NEW:
e71a4783 909 if (valbool) {
887feae3
DD
910 if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW)
911 sock_set_flag(sk, SOCK_TSTAMP_NEW);
912 else
913 sock_reset_flag(sk, SOCK_TSTAMP_NEW);
914
915 if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW)
92f37fd2
ED
916 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
917 else
918 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
e71a4783 919 sock_set_flag(sk, SOCK_RCVTSTAMP);
20d49473 920 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
92f37fd2 921 } else {
e71a4783 922 sock_reset_flag(sk, SOCK_RCVTSTAMP);
92f37fd2 923 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
887feae3 924 sock_reset_flag(sk, SOCK_TSTAMP_NEW);
92f37fd2 925 }
e71a4783
SH
926 break;
927
9718475e
DD
928 case SO_TIMESTAMPING_NEW:
929 sock_set_flag(sk, SOCK_TSTAMP_NEW);
ff7653f9 930 /* fall through */
7f1bc6e9 931 case SO_TIMESTAMPING_OLD:
20d49473 932 if (val & ~SOF_TIMESTAMPING_MASK) {
f249fb78 933 ret = -EINVAL;
20d49473
PO
934 break;
935 }
b245be1f 936
09c2d251 937 if (val & SOF_TIMESTAMPING_OPT_ID &&
4ed2d765 938 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
ac5cc977
WC
939 if (sk->sk_protocol == IPPROTO_TCP &&
940 sk->sk_type == SOCK_STREAM) {
6db8b963
SHY
941 if ((1 << sk->sk_state) &
942 (TCPF_CLOSE | TCPF_LISTEN)) {
4ed2d765
WB
943 ret = -EINVAL;
944 break;
945 }
946 sk->sk_tskey = tcp_sk(sk)->snd_una;
947 } else {
948 sk->sk_tskey = 0;
949 }
950 }
1c885808
FY
951
952 if (val & SOF_TIMESTAMPING_OPT_STATS &&
953 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
954 ret = -EINVAL;
955 break;
956 }
957
b9f40e21 958 sk->sk_tsflags = val;
20d49473
PO
959 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
960 sock_enable_timestamp(sk,
961 SOCK_TIMESTAMPING_RX_SOFTWARE);
9718475e
DD
962 else {
963 if (optname == SO_TIMESTAMPING_NEW)
964 sock_reset_flag(sk, SOCK_TSTAMP_NEW);
965
20d49473 966 sock_disable_timestamp(sk,
08e29af3 967 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
9718475e 968 }
20d49473
PO
969 break;
970
e71a4783
SH
971 case SO_RCVLOWAT:
972 if (val < 0)
973 val = INT_MAX;
d1361840
ED
974 if (sock->ops->set_rcvlowat)
975 ret = sock->ops->set_rcvlowat(sk, val);
976 else
977 sk->sk_rcvlowat = val ? : 1;
e71a4783
SH
978 break;
979
45bdc661 980 case SO_RCVTIMEO_OLD:
a9beb86a
DD
981 case SO_RCVTIMEO_NEW:
982 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD);
e71a4783
SH
983 break;
984
45bdc661 985 case SO_SNDTIMEO_OLD:
a9beb86a
DD
986 case SO_SNDTIMEO_NEW:
987 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD);
e71a4783 988 break;
1da177e4 989
e71a4783
SH
990 case SO_ATTACH_FILTER:
991 ret = -EINVAL;
992 if (optlen == sizeof(struct sock_fprog)) {
993 struct sock_fprog fprog;
1da177e4 994
e71a4783
SH
995 ret = -EFAULT;
996 if (copy_from_user(&fprog, optval, sizeof(fprog)))
1da177e4 997 break;
e71a4783
SH
998
999 ret = sk_attach_filter(&fprog, sk);
1000 }
1001 break;
1002
89aa0758
AS
1003 case SO_ATTACH_BPF:
1004 ret = -EINVAL;
1005 if (optlen == sizeof(u32)) {
1006 u32 ufd;
1007
1008 ret = -EFAULT;
1009 if (copy_from_user(&ufd, optval, sizeof(ufd)))
1010 break;
1011
1012 ret = sk_attach_bpf(ufd, sk);
1013 }
1014 break;
1015
538950a1
CG
1016 case SO_ATTACH_REUSEPORT_CBPF:
1017 ret = -EINVAL;
1018 if (optlen == sizeof(struct sock_fprog)) {
1019 struct sock_fprog fprog;
1020
1021 ret = -EFAULT;
1022 if (copy_from_user(&fprog, optval, sizeof(fprog)))
1023 break;
1024
1025 ret = sk_reuseport_attach_filter(&fprog, sk);
1026 }
1027 break;
1028
1029 case SO_ATTACH_REUSEPORT_EBPF:
1030 ret = -EINVAL;
1031 if (optlen == sizeof(u32)) {
1032 u32 ufd;
1033
1034 ret = -EFAULT;
1035 if (copy_from_user(&ufd, optval, sizeof(ufd)))
1036 break;
1037
1038 ret = sk_reuseport_attach_bpf(ufd, sk);
1039 }
1040 break;
1041
e71a4783 1042 case SO_DETACH_FILTER:
55b33325 1043 ret = sk_detach_filter(sk);
e71a4783 1044 break;
1da177e4 1045
d59577b6
VB
1046 case SO_LOCK_FILTER:
1047 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1048 ret = -EPERM;
1049 else
1050 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1051 break;
1052
e71a4783
SH
1053 case SO_PASSSEC:
1054 if (valbool)
1055 set_bit(SOCK_PASSSEC, &sock->flags);
1056 else
1057 clear_bit(SOCK_PASSSEC, &sock->flags);
1058 break;
4a19ec58 1059 case SO_MARK:
50254256 1060 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
4a19ec58 1061 ret = -EPERM;
50254256 1062 } else if (val != sk->sk_mark) {
4a19ec58 1063 sk->sk_mark = val;
50254256
DB
1064 sk_dst_reset(sk);
1065 }
4a19ec58 1066 break;
877ce7c1 1067
3b885787 1068 case SO_RXQ_OVFL:
8083f0fc 1069 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
3b885787 1070 break;
6e3e939f
JB
1071
1072 case SO_WIFI_STATUS:
1073 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1074 break;
1075
ef64a54f
PE
1076 case SO_PEEK_OFF:
1077 if (sock->ops->set_peek_off)
12663bfc 1078 ret = sock->ops->set_peek_off(sk, val);
ef64a54f
PE
1079 else
1080 ret = -EOPNOTSUPP;
1081 break;
3bdc0eba
BG
1082
1083 case SO_NOFCS:
1084 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1085 break;
1086
7d4c04fc
KJ
1087 case SO_SELECT_ERR_QUEUE:
1088 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1089 break;
1090
e0d1095a 1091#ifdef CONFIG_NET_RX_BUSY_POLL
64b0dc51 1092 case SO_BUSY_POLL:
dafcc438
ET
1093 /* allow unprivileged users to decrease the value */
1094 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1095 ret = -EPERM;
1096 else {
1097 if (val < 0)
1098 ret = -EINVAL;
1099 else
1100 sk->sk_ll_usec = val;
1101 }
1102 break;
1103#endif
62748f32
ED
1104
1105 case SO_MAX_PACING_RATE:
6bdef102
ED
1106 {
1107 unsigned long ulval = (val == ~0U) ? ~0UL : val;
1108
1109 if (sizeof(ulval) != sizeof(val) &&
1110 optlen >= sizeof(ulval) &&
1111 get_user(ulval, (unsigned long __user *)optval)) {
1112 ret = -EFAULT;
1113 break;
1114 }
1115 if (ulval != ~0UL)
218af599
ED
1116 cmpxchg(&sk->sk_pacing_status,
1117 SK_PACING_NONE,
1118 SK_PACING_NEEDED);
6bdef102
ED
1119 sk->sk_max_pacing_rate = ulval;
1120 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
62748f32 1121 break;
6bdef102 1122 }
70da268b
ED
1123 case SO_INCOMING_CPU:
1124 sk->sk_incoming_cpu = val;
1125 break;
1126
a87cb3e4
TH
1127 case SO_CNX_ADVICE:
1128 if (val == 1)
1129 dst_negative_advice(sk);
1130 break;
76851d12
WB
1131
1132 case SO_ZEROCOPY:
28190752 1133 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
b5947e5d
WB
1134 if (!((sk->sk_type == SOCK_STREAM &&
1135 sk->sk_protocol == IPPROTO_TCP) ||
1136 (sk->sk_type == SOCK_DGRAM &&
1137 sk->sk_protocol == IPPROTO_UDP)))
28190752 1138 ret = -ENOTSUPP;
28190752 1139 } else if (sk->sk_family != PF_RDS) {
76851d12 1140 ret = -ENOTSUPP;
28190752
SV
1141 }
1142 if (!ret) {
1143 if (val < 0 || val > 1)
1144 ret = -EINVAL;
1145 else
1146 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
28190752 1147 }
334e6413
JSP
1148 break;
1149
80b14dee
RC
1150 case SO_TXTIME:
1151 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1152 ret = -EPERM;
1153 } else if (optlen != sizeof(struct sock_txtime)) {
1154 ret = -EINVAL;
1155 } else if (copy_from_user(&sk_txtime, optval,
1156 sizeof(struct sock_txtime))) {
1157 ret = -EFAULT;
1158 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1159 ret = -EINVAL;
1160 } else {
1161 sock_valbool_flag(sk, SOCK_TXTIME, true);
1162 sk->sk_clockid = sk_txtime.clockid;
1163 sk->sk_txtime_deadline_mode =
1164 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
4b15c707
JSP
1165 sk->sk_txtime_report_errors =
1166 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
80b14dee
RC
1167 }
1168 break;
1169
f5dd3d0c
DH
1170 case SO_BINDTOIFINDEX:
1171 ret = sock_setbindtodevice_locked(sk, val);
1172 break;
1173
e71a4783
SH
1174 default:
1175 ret = -ENOPROTOOPT;
1176 break;
4ec93edb 1177 }
1da177e4
LT
1178 release_sock(sk);
1179 return ret;
1180}
2a91525c 1181EXPORT_SYMBOL(sock_setsockopt);
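/*
 * Illustrative sketch (standalone userspace code, not part of sock.c): as
 * the SO_RCVBUF comment in sock_setsockopt() above explains, the kernel
 * doubles the requested size to cover struct sk_buff and other overhead, so
 * reading the option back returns roughly twice the value that was set
 * (subject to sysctl_rmem_max and SOCK_MIN_RCVBUF).
 */
#include <stdio.h>
#include <sys/socket.h>

static void example_show_rcvbuf_doubling(int fd)
{
	int requested = 65536, effective = 0;
	socklen_t len = sizeof(effective);

	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &requested, sizeof(requested));
	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &effective, &len);

	/* effective is roughly 2 * requested */
	printf("requested %d bytes, kernel uses %d\n", requested, effective);
}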
1da177e4
LT
1182
1183
8f09898b 1184static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1185 struct ucred *ucred)
3f551f94
EB
1186{
1187 ucred->pid = pid_vnr(pid);
1188 ucred->uid = ucred->gid = -1;
1189 if (cred) {
1190 struct user_namespace *current_ns = current_user_ns();
1191
b2e4f544
EB
1192 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1193 ucred->gid = from_kgid_munged(current_ns, cred->egid);
3f551f94
EB
1194 }
1195}
1196
28b5ba2a
DH
1197static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1198{
1199 struct user_namespace *user_ns = current_user_ns();
1200 int i;
1201
1202 for (i = 0; i < src->ngroups; i++)
1203 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1204 return -EFAULT;
1205
1206 return 0;
1207}
1208
1da177e4
LT
1209int sock_getsockopt(struct socket *sock, int level, int optname,
1210 char __user *optval, int __user *optlen)
1211{
1212 struct sock *sk = sock->sk;
4ec93edb 1213
e71a4783 1214 union {
4ec93edb 1215 int val;
5daab9db 1216 u64 val64;
677f136c 1217 unsigned long ulval;
4ec93edb 1218 struct linger ling;
fe0c72f3
AB
1219 struct old_timeval32 tm32;
1220 struct __kernel_old_timeval tm;
a9beb86a 1221 struct __kernel_sock_timeval stm;
80b14dee 1222 struct sock_txtime txtime;
1da177e4 1223 } v;
4ec93edb 1224
4d0392be 1225 int lv = sizeof(int);
1da177e4 1226 int len;
4ec93edb 1227
e71a4783 1228 if (get_user(len, optlen))
4ec93edb 1229 return -EFAULT;
e71a4783 1230 if (len < 0)
1da177e4 1231 return -EINVAL;
4ec93edb 1232
50fee1de 1233 memset(&v, 0, sizeof(v));
df0bca04 1234
2a91525c 1235 switch (optname) {
e71a4783
SH
1236 case SO_DEBUG:
1237 v.val = sock_flag(sk, SOCK_DBG);
1238 break;
1239
1240 case SO_DONTROUTE:
1241 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1242 break;
1243
1244 case SO_BROADCAST:
1b23a5df 1245 v.val = sock_flag(sk, SOCK_BROADCAST);
e71a4783
SH
1246 break;
1247
1248 case SO_SNDBUF:
1249 v.val = sk->sk_sndbuf;
1250 break;
1251
1252 case SO_RCVBUF:
1253 v.val = sk->sk_rcvbuf;
1254 break;
1255
1256 case SO_REUSEADDR:
1257 v.val = sk->sk_reuse;
1258 break;
1259
055dc21a
TH
1260 case SO_REUSEPORT:
1261 v.val = sk->sk_reuseport;
1262 break;
1263
e71a4783 1264 case SO_KEEPALIVE:
1b23a5df 1265 v.val = sock_flag(sk, SOCK_KEEPOPEN);
e71a4783
SH
1266 break;
1267
1268 case SO_TYPE:
1269 v.val = sk->sk_type;
1270 break;
1271
49c794e9
JE
1272 case SO_PROTOCOL:
1273 v.val = sk->sk_protocol;
1274 break;
1275
0d6038ee
JE
1276 case SO_DOMAIN:
1277 v.val = sk->sk_family;
1278 break;
1279
e71a4783
SH
1280 case SO_ERROR:
1281 v.val = -sock_error(sk);
2a91525c 1282 if (v.val == 0)
e71a4783
SH
1283 v.val = xchg(&sk->sk_err_soft, 0);
1284 break;
1285
1286 case SO_OOBINLINE:
1b23a5df 1287 v.val = sock_flag(sk, SOCK_URGINLINE);
e71a4783
SH
1288 break;
1289
1290 case SO_NO_CHECK:
28448b80 1291 v.val = sk->sk_no_check_tx;
e71a4783
SH
1292 break;
1293
1294 case SO_PRIORITY:
1295 v.val = sk->sk_priority;
1296 break;
1297
1298 case SO_LINGER:
1299 lv = sizeof(v.ling);
1b23a5df 1300 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
e71a4783
SH
1301 v.ling.l_linger = sk->sk_lingertime / HZ;
1302 break;
1303
1304 case SO_BSDCOMPAT:
1305 sock_warn_obsolete_bsdism("getsockopt");
1306 break;
1307
7f1bc6e9 1308 case SO_TIMESTAMP_OLD:
92f37fd2 1309 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
887feae3 1310 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
92f37fd2
ED
1311 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1312 break;
1313
7f1bc6e9 1314 case SO_TIMESTAMPNS_OLD:
887feae3
DD
1315 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1316 break;
1317
1318 case SO_TIMESTAMP_NEW:
1319 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1320 break;
1321
1322 case SO_TIMESTAMPNS_NEW:
1323 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
e71a4783
SH
1324 break;
1325
7f1bc6e9 1326 case SO_TIMESTAMPING_OLD:
b9f40e21 1327 v.val = sk->sk_tsflags;
20d49473
PO
1328 break;
1329
a9beb86a
DD
1330 case SO_RCVTIMEO_OLD:
1331 case SO_RCVTIMEO_NEW:
1332 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
e71a4783
SH
1333 break;
1334
a9beb86a
DD
1335 case SO_SNDTIMEO_OLD:
1336 case SO_SNDTIMEO_NEW:
1337 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
e71a4783 1338 break;
1da177e4 1339
e71a4783
SH
1340 case SO_RCVLOWAT:
1341 v.val = sk->sk_rcvlowat;
1342 break;
1da177e4 1343
e71a4783 1344 case SO_SNDLOWAT:
2a91525c 1345 v.val = 1;
e71a4783 1346 break;
1da177e4 1347
e71a4783 1348 case SO_PASSCRED:
82981930 1349 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
e71a4783 1350 break;
1da177e4 1351
e71a4783 1352 case SO_PEERCRED:
109f6e39
EB
1353 {
1354 struct ucred peercred;
1355 if (len > sizeof(peercred))
1356 len = sizeof(peercred);
1357 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1358 if (copy_to_user(optval, &peercred, len))
e71a4783
SH
1359 return -EFAULT;
1360 goto lenout;
109f6e39 1361 }
1da177e4 1362
28b5ba2a
DH
1363 case SO_PEERGROUPS:
1364 {
1365 int ret, n;
1366
1367 if (!sk->sk_peer_cred)
1368 return -ENODATA;
1369
1370 n = sk->sk_peer_cred->group_info->ngroups;
1371 if (len < n * sizeof(gid_t)) {
1372 len = n * sizeof(gid_t);
1373 return put_user(len, optlen) ? -EFAULT : -ERANGE;
1374 }
1375 len = n * sizeof(gid_t);
1376
1377 ret = groups_to_user((gid_t __user *)optval,
1378 sk->sk_peer_cred->group_info);
1379 if (ret)
1380 return ret;
1381 goto lenout;
1382 }
1383
e71a4783
SH
1384 case SO_PEERNAME:
1385 {
1386 char address[128];
1387
9b2c45d4
DV
1388 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1389 if (lv < 0)
e71a4783
SH
1390 return -ENOTCONN;
1391 if (lv < len)
1392 return -EINVAL;
1393 if (copy_to_user(optval, address, len))
1394 return -EFAULT;
1395 goto lenout;
1396 }
1da177e4 1397
e71a4783
SH
1398 /* Dubious BSD thing... Probably nobody even uses it, but
1399 * the UNIX standard wants it for whatever reason... -DaveM
1400 */
1401 case SO_ACCEPTCONN:
1402 v.val = sk->sk_state == TCP_LISTEN;
1403 break;
1da177e4 1404
e71a4783 1405 case SO_PASSSEC:
82981930 1406 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
e71a4783 1407 break;
877ce7c1 1408
e71a4783
SH
1409 case SO_PEERSEC:
1410 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1da177e4 1411
4a19ec58
LAT
1412 case SO_MARK:
1413 v.val = sk->sk_mark;
1414 break;
1415
3b885787 1416 case SO_RXQ_OVFL:
1b23a5df 1417 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
3b885787
NH
1418 break;
1419
6e3e939f 1420 case SO_WIFI_STATUS:
1b23a5df 1421 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
6e3e939f
JB
1422 break;
1423
ef64a54f
PE
1424 case SO_PEEK_OFF:
1425 if (!sock->ops->set_peek_off)
1426 return -EOPNOTSUPP;
1427
1428 v.val = sk->sk_peek_off;
1429 break;
bc2f7996 1430 case SO_NOFCS:
1b23a5df 1431 v.val = sock_flag(sk, SOCK_NOFCS);
bc2f7996 1432 break;
c91f6df2 1433
f7b86bfe 1434 case SO_BINDTODEVICE:
c91f6df2
BH
1435 return sock_getbindtodevice(sk, optval, optlen, len);
1436
a8fc9277
PE
1437 case SO_GET_FILTER:
1438 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1439 if (len < 0)
1440 return len;
1441
1442 goto lenout;
c91f6df2 1443
d59577b6
VB
1444 case SO_LOCK_FILTER:
1445 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1446 break;
1447
ea02f941
MS
1448 case SO_BPF_EXTENSIONS:
1449 v.val = bpf_tell_extensions();
1450 break;
1451
7d4c04fc
KJ
1452 case SO_SELECT_ERR_QUEUE:
1453 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1454 break;
1455
e0d1095a 1456#ifdef CONFIG_NET_RX_BUSY_POLL
64b0dc51 1457 case SO_BUSY_POLL:
dafcc438
ET
1458 v.val = sk->sk_ll_usec;
1459 break;
1460#endif
1461
62748f32 1462 case SO_MAX_PACING_RATE:
677f136c
ED
1463 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1464 lv = sizeof(v.ulval);
1465 v.ulval = sk->sk_max_pacing_rate;
1466 } else {
1467 /* 32bit version */
1468 v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1469 }
62748f32
ED
1470 break;
1471
2c8c56e1
ED
1472 case SO_INCOMING_CPU:
1473 v.val = sk->sk_incoming_cpu;
1474 break;
1475
a2d133b1
JH
1476 case SO_MEMINFO:
1477 {
1478 u32 meminfo[SK_MEMINFO_VARS];
1479
a2d133b1
JH
1480 sk_get_meminfo(sk, meminfo);
1481
1482 len = min_t(unsigned int, len, sizeof(meminfo));
1483 if (copy_to_user(optval, &meminfo, len))
1484 return -EFAULT;
1485
1486 goto lenout;
1487 }
6d433902
SS
1488
1489#ifdef CONFIG_NET_RX_BUSY_POLL
1490 case SO_INCOMING_NAPI_ID:
1491 v.val = READ_ONCE(sk->sk_napi_id);
1492
1493 /* aggregate non-NAPI IDs down to 0 */
1494 if (v.val < MIN_NAPI_ID)
1495 v.val = 0;
1496
1497 break;
1498#endif
1499
5daab9db
CF
1500 case SO_COOKIE:
1501 lv = sizeof(u64);
1502 if (len < lv)
1503 return -EINVAL;
1504 v.val64 = sock_gen_cookie(sk);
1505 break;
1506
76851d12
WB
1507 case SO_ZEROCOPY:
1508 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1509 break;
1510
80b14dee
RC
1511 case SO_TXTIME:
1512 lv = sizeof(v.txtime);
1513 v.txtime.clockid = sk->sk_clockid;
1514 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1515 SOF_TXTIME_DEADLINE_MODE : 0;
4b15c707
JSP
1516 v.txtime.flags |= sk->sk_txtime_report_errors ?
1517 SOF_TXTIME_REPORT_ERRORS : 0;
80b14dee
RC
1518 break;
1519
f5dd3d0c
DH
1520 case SO_BINDTOIFINDEX:
1521 v.val = sk->sk_bound_dev_if;
1522 break;
1523
e71a4783 1524 default:
443b5991
YH
1525 /* We implement the SO_SNDLOWAT etc to not be settable
1526 * (1003.1g 7).
1527 */
e71a4783 1528 return -ENOPROTOOPT;
1da177e4 1529 }
e71a4783 1530
1da177e4
LT
1531 if (len > lv)
1532 len = lv;
1533 if (copy_to_user(optval, &v, len))
1534 return -EFAULT;
1535lenout:
4ec93edb
YH
1536 if (put_user(len, optlen))
1537 return -EFAULT;
1538 return 0;
1da177e4
LT
1539}
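/*
 * Illustrative sketch (standalone userspace code, not part of sock.c): the
 * SO_PEERCRED branch of sock_getsockopt() above is what a userspace server
 * on an AF_UNIX connection uses to learn the connecting peer's pid/uid/gid.
 */
#define _GNU_SOURCE
#include <sys/socket.h>

static int example_get_peer_credentials(int fd, struct ucred *uc)
{
	socklen_t len = sizeof(*uc);

	return getsockopt(fd, SOL_SOCKET, SO_PEERCRED, uc, &len);
}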
1540
a5b5bb9a
IM
1541/*
1542 * Initialize an sk_lock.
1543 *
1544 * (We also register the sk_lock with the lock validator.)
1545 */
b6f99a21 1546static inline void sock_lock_init(struct sock *sk)
a5b5bb9a 1547{
cdfbabfb
DH
1548 if (sk->sk_kern_sock)
1549 sock_lock_init_class_and_name(
1550 sk,
1551 af_family_kern_slock_key_strings[sk->sk_family],
1552 af_family_kern_slock_keys + sk->sk_family,
1553 af_family_kern_key_strings[sk->sk_family],
1554 af_family_kern_keys + sk->sk_family);
1555 else
1556 sock_lock_init_class_and_name(
1557 sk,
ed07536e
PZ
1558 af_family_slock_key_strings[sk->sk_family],
1559 af_family_slock_keys + sk->sk_family,
1560 af_family_key_strings[sk->sk_family],
1561 af_family_keys + sk->sk_family);
a5b5bb9a
IM
1562}
1563
4dc6dc71
ED
1564/*
1565 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 1566 * even temporarily, because of RCU lookups. sk_node should also be left as is.
68835aba 1567 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
4dc6dc71 1568 */
f1a6c4da
PE
1569static void sock_copy(struct sock *nsk, const struct sock *osk)
1570{
1571#ifdef CONFIG_SECURITY_NETWORK
1572 void *sptr = nsk->sk_security;
1573#endif
68835aba
ED
1574 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1575
1576 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1577 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1578
f1a6c4da
PE
1579#ifdef CONFIG_SECURITY_NETWORK
1580 nsk->sk_security = sptr;
1581 security_sk_clone(osk, nsk);
1582#endif
1583}
1584
2e4afe7b
PE
1585static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1586 int family)
c308c1b2
PE
1587{
1588 struct sock *sk;
1589 struct kmem_cache *slab;
1590
1591 slab = prot->slab;
e912b114
ED
1592 if (slab != NULL) {
1593 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1594 if (!sk)
1595 return sk;
ba2489b0
ED
1596 if (priority & __GFP_ZERO)
1597 sk_prot_clear_nulls(sk, prot->obj_size);
fcbdf09d 1598 } else
c308c1b2
PE
1599 sk = kmalloc(prot->obj_size, priority);
1600
2e4afe7b
PE
1601 if (sk != NULL) {
1602 if (security_sk_alloc(sk, family, priority))
1603 goto out_free;
1604
1605 if (!try_module_get(prot->owner))
1606 goto out_free_sec;
e022f0b4 1607 sk_tx_queue_clear(sk);
2e4afe7b
PE
1608 }
1609
c308c1b2 1610 return sk;
2e4afe7b
PE
1611
1612out_free_sec:
1613 security_sk_free(sk);
1614out_free:
1615 if (slab != NULL)
1616 kmem_cache_free(slab, sk);
1617 else
1618 kfree(sk);
1619 return NULL;
c308c1b2
PE
1620}
1621
1622static void sk_prot_free(struct proto *prot, struct sock *sk)
1623{
1624 struct kmem_cache *slab;
2e4afe7b 1625 struct module *owner;
c308c1b2 1626
2e4afe7b 1627 owner = prot->owner;
c308c1b2 1628 slab = prot->slab;
2e4afe7b 1629
bd1060a1 1630 cgroup_sk_free(&sk->sk_cgrp_data);
2d758073 1631 mem_cgroup_sk_free(sk);
2e4afe7b 1632 security_sk_free(sk);
c308c1b2
PE
1633 if (slab != NULL)
1634 kmem_cache_free(slab, sk);
1635 else
1636 kfree(sk);
2e4afe7b 1637 module_put(owner);
c308c1b2
PE
1638}
1639
1da177e4
LT
1640/**
1641 * sk_alloc - All socket objects are allocated here
c4ea43c5 1642 * @net: the applicable net namespace
4dc3b16b
PP
1643 * @family: protocol family
1644 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1645 * @prot: struct proto associated with this new sock instance
11aa9c28 1646 * @kern: is this to be a kernel socket?
1da177e4 1647 */
1b8d7ae4 1648struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
11aa9c28 1649 struct proto *prot, int kern)
1da177e4 1650{
c308c1b2 1651 struct sock *sk;
1da177e4 1652
154adbc8 1653 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1da177e4 1654 if (sk) {
154adbc8
PE
1655 sk->sk_family = family;
1656 /*
1657 * See comment in struct sock definition to understand
1658 * why we need sk_prot_creator -acme
1659 */
1660 sk->sk_prot = sk->sk_prot_creator = prot;
cdfbabfb 1661 sk->sk_kern_sock = kern;
154adbc8 1662 sock_lock_init(sk);
26abe143 1663 sk->sk_net_refcnt = kern ? 0 : 1;
648845ab 1664 if (likely(sk->sk_net_refcnt)) {
26abe143 1665 get_net(net);
648845ab
TZ
1666 sock_inuse_add(net, 1);
1667 }
1668
26abe143 1669 sock_net_set(sk, net);
14afee4b 1670 refcount_set(&sk->sk_wmem_alloc, 1);
f8451725 1671
2d758073 1672 mem_cgroup_sk_alloc(sk);
d979a39d 1673 cgroup_sk_alloc(&sk->sk_cgrp_data);
2a56a1fe
TH
1674 sock_update_classid(&sk->sk_cgrp_data);
1675 sock_update_netprioidx(&sk->sk_cgrp_data);
1da177e4 1676 }
a79af59e 1677
2e4afe7b 1678 return sk;
1da177e4 1679}
2a91525c 1680EXPORT_SYMBOL(sk_alloc);
1da177e4 1681
a4298e45
ED
1682/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1683 * grace period. This is the case for UDP sockets and TCP listeners.
1684 */
1685static void __sk_destruct(struct rcu_head *head)
1da177e4 1686{
a4298e45 1687 struct sock *sk = container_of(head, struct sock, sk_rcu);
1da177e4 1688 struct sk_filter *filter;
1da177e4
LT
1689
1690 if (sk->sk_destruct)
1691 sk->sk_destruct(sk);
1692
a898def2 1693 filter = rcu_dereference_check(sk->sk_filter,
14afee4b 1694 refcount_read(&sk->sk_wmem_alloc) == 0);
1da177e4 1695 if (filter) {
309dd5fc 1696 sk_filter_uncharge(sk, filter);
a9b3cd7f 1697 RCU_INIT_POINTER(sk->sk_filter, NULL);
1da177e4 1698 }
538950a1
CG
1699 if (rcu_access_pointer(sk->sk_reuseport_cb))
1700 reuseport_detach_sock(sk);
1da177e4 1701
08e29af3 1702 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1da177e4 1703
6ac99e8f
MKL
1704#ifdef CONFIG_BPF_SYSCALL
1705 bpf_sk_storage_free(sk);
1706#endif
1707
1da177e4 1708 if (atomic_read(&sk->sk_omem_alloc))
e005d193
JP
1709 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1710 __func__, atomic_read(&sk->sk_omem_alloc));
1da177e4 1711
22a0e18e
ED
1712 if (sk->sk_frag.page) {
1713 put_page(sk->sk_frag.page);
1714 sk->sk_frag.page = NULL;
1715 }
1716
109f6e39
EB
1717 if (sk->sk_peer_cred)
1718 put_cred(sk->sk_peer_cred);
1719 put_pid(sk->sk_peer_pid);
26abe143
EB
1720 if (likely(sk->sk_net_refcnt))
1721 put_net(sock_net(sk));
c308c1b2 1722 sk_prot_free(sk->sk_prot_creator, sk);
1da177e4 1723}
2b85a34e 1724
a4298e45
ED
1725void sk_destruct(struct sock *sk)
1726{
1727 if (sock_flag(sk, SOCK_RCU_FREE))
1728 call_rcu(&sk->sk_rcu, __sk_destruct);
1729 else
1730 __sk_destruct(&sk->sk_rcu);
1731}
1732
eb4cb008
CG
1733static void __sk_free(struct sock *sk)
1734{
648845ab
TZ
1735 if (likely(sk->sk_net_refcnt))
1736 sock_inuse_add(sock_net(sk), -1);
1737
9709020c 1738 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
eb4cb008
CG
1739 sock_diag_broadcast_destroy(sk);
1740 else
1741 sk_destruct(sk);
1742}
1743
2b85a34e
ED
1744void sk_free(struct sock *sk)
1745{
1746 /*
25985edc 1747 * We subtract one from sk_wmem_alloc so we can tell whether
2b85a34e
ED
 1748 * some packets are still in some tx queue.
 1749 * If the count does not drop to zero, sock_wfree() will call __sk_free(sk) later.
 1750 */
14afee4b 1751 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2b85a34e
ED
1752 __sk_free(sk);
1753}
2a91525c 1754EXPORT_SYMBOL(sk_free);
1da177e4 1755
581319c5
PA
1756static void sk_init_common(struct sock *sk)
1757{
1758 skb_queue_head_init(&sk->sk_receive_queue);
1759 skb_queue_head_init(&sk->sk_write_queue);
1760 skb_queue_head_init(&sk->sk_error_queue);
1761
1762 rwlock_init(&sk->sk_callback_lock);
1763 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1764 af_rlock_keys + sk->sk_family,
1765 af_family_rlock_key_strings[sk->sk_family]);
1766 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1767 af_wlock_keys + sk->sk_family,
1768 af_family_wlock_key_strings[sk->sk_family]);
1769 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1770 af_elock_keys + sk->sk_family,
1771 af_family_elock_key_strings[sk->sk_family]);
1772 lockdep_set_class_and_name(&sk->sk_callback_lock,
1773 af_callback_keys + sk->sk_family,
1774 af_family_clock_key_strings[sk->sk_family]);
1775}
1776
e56c57d0
ED
1777/**
1778 * sk_clone_lock - clone a socket, and lock its clone
1779 * @sk: the socket to clone
1780 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1781 *
1782 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1783 */
1784struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
87d11ceb 1785{
8fd1d178 1786 struct sock *newsk;
278571ba 1787 bool is_charged = true;
87d11ceb 1788
8fd1d178 1789 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
87d11ceb
ACM
1790 if (newsk != NULL) {
1791 struct sk_filter *filter;
1792
892c141e 1793 sock_copy(newsk, sk);
87d11ceb 1794
9d538fa6
CP
1795 newsk->sk_prot_creator = sk->sk_prot;
1796
87d11ceb 1797 /* SANITY */
8a681736
SV
1798 if (likely(newsk->sk_net_refcnt))
1799 get_net(sock_net(newsk));
87d11ceb
ACM
1800 sk_node_init(&newsk->sk_node);
1801 sock_lock_init(newsk);
1802 bh_lock_sock(newsk);
fa438ccf 1803 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
8eae939f 1804 newsk->sk_backlog.len = 0;
87d11ceb
ACM
1805
1806 atomic_set(&newsk->sk_rmem_alloc, 0);
2b85a34e
ED
1807 /*
1808 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1809 */
14afee4b 1810 refcount_set(&newsk->sk_wmem_alloc, 1);
87d11ceb 1811 atomic_set(&newsk->sk_omem_alloc, 0);
581319c5 1812 sk_init_common(newsk);
87d11ceb
ACM
1813
1814 newsk->sk_dst_cache = NULL;
9b8805a3 1815 newsk->sk_dst_pending_confirm = 0;
87d11ceb
ACM
1816 newsk->sk_wmem_queued = 0;
1817 newsk->sk_forward_alloc = 0;
9caad864 1818 atomic_set(&newsk->sk_drops, 0);
87d11ceb 1819 newsk->sk_send_head = NULL;
87d11ceb 1820 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
52267790 1821 atomic_set(&newsk->sk_zckey, 0);
87d11ceb
ACM
1822
1823 sock_reset_flag(newsk, SOCK_DONE);
edbe69ef 1824 mem_cgroup_sk_alloc(newsk);
c0576e39 1825 cgroup_sk_alloc(&newsk->sk_cgrp_data);
87d11ceb 1826
eefca20e
ED
1827 rcu_read_lock();
1828 filter = rcu_dereference(sk->sk_filter);
87d11ceb 1829 if (filter != NULL)
278571ba
AS
1830 /* though it's an empty new sock, the charging may fail
1831 * if sysctl_optmem_max was changed between creation of
1832 * original socket and cloning
1833 */
1834 is_charged = sk_filter_charge(newsk, filter);
eefca20e
ED
1835 RCU_INIT_POINTER(newsk->sk_filter, filter);
1836 rcu_read_unlock();
87d11ceb 1837
d188ba86 1838 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
a97e50cc
DB
1839 /* We need to make sure that we don't uncharge the new
1840 * socket if we couldn't charge it in the first place
1841 * as otherwise we uncharge the parent's filter.
1842 */
1843 if (!is_charged)
1844 RCU_INIT_POINTER(newsk->sk_filter, NULL);
94352d45 1845 sk_free_unlock_clone(newsk);
87d11ceb
ACM
1846 newsk = NULL;
1847 goto out;
1848 }
fa463497 1849 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
f12dd759
MKL
1850#ifdef CONFIG_BPF_SYSCALL
1851 RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
1852#endif
87d11ceb
ACM
1853
1854 newsk->sk_err = 0;
e551c32d 1855 newsk->sk_err_soft = 0;
87d11ceb 1856 newsk->sk_priority = 0;
2c8c56e1 1857 newsk->sk_incoming_cpu = raw_smp_processor_id();
648845ab
TZ
1858 if (likely(newsk->sk_net_refcnt))
1859 sock_inuse_add(sock_net(newsk), 1);
d979a39d 1860
4dc6dc71
ED
1861 /*
1862 * Before updating sk_refcnt, we must commit prior changes to memory
1863 * (Documentation/RCU/rculist_nulls.txt for details)
1864 */
1865 smp_wmb();
41c6d650 1866 refcount_set(&newsk->sk_refcnt, 2);
87d11ceb
ACM
1867
1868 /*
1869 * Increment the counter in the same struct proto as the master
1870 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1871 * is the same as sk->sk_prot->socks, as this field was copied
1872 * with memcpy).
1873 *
1874 * This _changes_ the previous behaviour, where
1875 * tcp_create_openreq_child always was incrementing the
1876 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1877 * to be taken into account in all callers. -acme
1878 */
1879 sk_refcnt_debug_inc(newsk);
972692e0 1880 sk_set_socket(newsk, NULL);
c2f26e8f 1881 RCU_INIT_POINTER(newsk->sk_wq, NULL);
87d11ceb
ACM
1882
1883 if (newsk->sk_prot->sockets_allocated)
180d8cd9 1884 sk_sockets_allocated_inc(newsk);
704da560 1885
080a270f
HFS
1886 if (sock_needs_netstamp(sk) &&
1887 newsk->sk_flags & SK_FLAGS_TIMESTAMP)
704da560 1888 net_enable_timestamp();
87d11ceb
ACM
1889 }
1890out:
1891 return newsk;
1892}
e56c57d0 1893EXPORT_SYMBOL_GPL(sk_clone_lock);
87d11ceb 1894
94352d45
ACM
1895void sk_free_unlock_clone(struct sock *sk)
1896{
1897 /* It is still a raw copy of the parent, so invalidate
1898 * the destructor and do a plain sk_free() */
1899 sk->sk_destruct = NULL;
1900 bh_unlock_sock(sk);
1901 sk_free(sk);
1902}
1903EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1904
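/*
 * Illustrative sketch, not part of sock.c: a typical caller pattern for
 * sk_clone_lock().  The clone comes back with bh_lock_sock() held, so the
 * caller must drop that lock on every path; on a protocol-specific init
 * failure the still-raw clone is released with sk_free_unlock_clone().
 * my_proto_init_clone() is a hypothetical helper used only in this example.
 */
static struct sock *my_clone_example(const struct sock *sk)
{
	struct sock *newsk = sk_clone_lock(sk, GFP_ATOMIC);

	if (!newsk)
		return NULL;

	if (my_proto_init_clone(newsk) < 0) {
		sk_free_unlock_clone(newsk);
		return NULL;
	}

	bh_unlock_sock(newsk);
	return newsk;
}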
9958089a
AK
1905void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1906{
d6a4e26a
ED
1907 u32 max_segs = 1;
1908
6bd4f355 1909 sk_dst_set(sk, dst);
0a6b2a1d 1910 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
9958089a 1911 if (sk->sk_route_caps & NETIF_F_GSO)
4fcd6b99 1912 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
a465419b 1913 sk->sk_route_caps &= ~sk->sk_route_nocaps;
9958089a 1914 if (sk_can_gso(sk)) {
f70f250a 1915 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
9958089a 1916 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
82cc1a7a 1917 } else {
9958089a 1918 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
82cc1a7a 1919 sk->sk_gso_max_size = dst->dev->gso_max_size;
d6a4e26a 1920 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
82cc1a7a 1921 }
9958089a 1922 }
d6a4e26a 1923 sk->sk_gso_max_segs = max_segs;
9958089a
AK
1924}
1925EXPORT_SYMBOL_GPL(sk_setup_caps);
1926
1da177e4
LT
1927/*
1928 * Simple resource managers for sockets.
1929 */
1930
1931
4ec93edb
YH
1932/*
1933 * Write buffer destructor automatically called from kfree_skb.
1da177e4
LT
1934 */
1935void sock_wfree(struct sk_buff *skb)
1936{
1937 struct sock *sk = skb->sk;
d99927f4 1938 unsigned int len = skb->truesize;
1da177e4 1939
d99927f4
ED
1940 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1941 /*
1942 * Keep a reference on sk_wmem_alloc; it will be released
1943 * after the sk_write_space() call
1944 */
14afee4b 1945 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1da177e4 1946 sk->sk_write_space(sk);
d99927f4
ED
1947 len = 1;
1948 }
2b85a34e 1949 /*
d99927f4
ED
1950 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1951 * could not do because of in-flight packets
2b85a34e 1952 */
14afee4b 1953 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2b85a34e 1954 __sk_free(sk);
1da177e4 1955}
2a91525c 1956EXPORT_SYMBOL(sock_wfree);
1da177e4 1957
1d2077ac
ED
1958/* This variant of sock_wfree() is used by TCP,
1959 * since it sets SOCK_USE_WRITE_QUEUE.
1960 */
1961void __sock_wfree(struct sk_buff *skb)
1962{
1963 struct sock *sk = skb->sk;
1964
14afee4b 1965 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1d2077ac
ED
1966 __sk_free(sk);
1967}
1968
9e17f8a4
ED
1969void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1970{
1971 skb_orphan(skb);
1972 skb->sk = sk;
1973#ifdef CONFIG_INET
1974 if (unlikely(!sk_fullsock(sk))) {
1975 skb->destructor = sock_edemux;
1976 sock_hold(sk);
1977 return;
1978 }
1979#endif
1980 skb->destructor = sock_wfree;
1981 skb_set_hash_from_sk(skb, sk);
1982 /*
1983 * We used to take a refcount on sk, but the following operation
1984 * is enough to guarantee sk_free() won't free this sock until
1985 * all in-flight packets are completed
1986 */
14afee4b 1987 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
9e17f8a4
ED
1988}
1989EXPORT_SYMBOL(skb_set_owner_w);
1990
1d2077ac
ED
1991/* This helper is used by netem, as it can hold packets in its
1992 * delay queue. We want to allow the owner socket to send more
1993 * packets, as if they were already TX completed by a typical driver.
1994 * But we also want to keep skb->sk set because some packet schedulers
f6ba8d33 1995 * rely on it (sch_fq for example).
1d2077ac 1996 */
f2f872f9
ED
1997void skb_orphan_partial(struct sk_buff *skb)
1998{
f6ba8d33 1999 if (skb_is_tcp_pure_ack(skb))
1d2077ac
ED
2000 return;
2001
f2f872f9
ED
2002 if (skb->destructor == sock_wfree
2003#ifdef CONFIG_INET
2004 || skb->destructor == tcp_wfree
2005#endif
2006 ) {
f6ba8d33
ED
2007 struct sock *sk = skb->sk;
2008
41c6d650 2009 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
14afee4b 2010 WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
f6ba8d33
ED
2011 skb->destructor = sock_efree;
2012 }
f2f872f9
ED
2013 } else {
2014 skb_orphan(skb);
2015 }
2016}
2017EXPORT_SYMBOL(skb_orphan_partial);
2018
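/*
 * Illustrative sketch, not part of sock.c: how a delaying queue (netem-like)
 * would use skb_orphan_partial() on enqueue.  The sending socket is released
 * from most of its wmem charge so it can keep transmitting, while skb->sk
 * stays set for schedulers that key on it.  "delayed" is a hypothetical
 * per-qdisc queue.
 */
static void my_delaying_enqueue(struct sk_buff *skb, struct sk_buff_head *delayed)
{
	skb_orphan_partial(skb);
	__skb_queue_tail(delayed, skb);
}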
4ec93edb
YH
2019/*
2020 * Read buffer destructor automatically called from kfree_skb.
1da177e4
LT
2021 */
2022void sock_rfree(struct sk_buff *skb)
2023{
2024 struct sock *sk = skb->sk;
d361fd59 2025 unsigned int len = skb->truesize;
1da177e4 2026
d361fd59
ED
2027 atomic_sub(len, &sk->sk_rmem_alloc);
2028 sk_mem_uncharge(sk, len);
1da177e4 2029}
2a91525c 2030EXPORT_SYMBOL(sock_rfree);
1da177e4 2031
7768eed8
OH
2032/*
2033 * Buffer destructor for skbs that are not used directly in read or write
2034 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2035 */
62bccb8c
AD
2036void sock_efree(struct sk_buff *skb)
2037{
2038 sock_put(skb->sk);
2039}
2040EXPORT_SYMBOL(sock_efree);
2041
976d0201 2042kuid_t sock_i_uid(struct sock *sk)
1da177e4 2043{
976d0201 2044 kuid_t uid;
1da177e4 2045
f064af1e 2046 read_lock_bh(&sk->sk_callback_lock);
976d0201 2047 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
f064af1e 2048 read_unlock_bh(&sk->sk_callback_lock);
1da177e4
LT
2049 return uid;
2050}
2a91525c 2051EXPORT_SYMBOL(sock_i_uid);
1da177e4
LT
2052
2053unsigned long sock_i_ino(struct sock *sk)
2054{
2055 unsigned long ino;
2056
f064af1e 2057 read_lock_bh(&sk->sk_callback_lock);
1da177e4 2058 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
f064af1e 2059 read_unlock_bh(&sk->sk_callback_lock);
1da177e4
LT
2060 return ino;
2061}
2a91525c 2062EXPORT_SYMBOL(sock_i_ino);
1da177e4
LT
2063
2064/*
2065 * Allocate a skb from the socket's send buffer.
2066 */
86a76caf 2067struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
dd0fc66f 2068 gfp_t priority)
1da177e4 2069{
14afee4b 2070 if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
2a91525c 2071 struct sk_buff *skb = alloc_skb(size, priority);
1da177e4
LT
2072 if (skb) {
2073 skb_set_owner_w(skb, sk);
2074 return skb;
2075 }
2076 }
2077 return NULL;
2078}
2a91525c 2079EXPORT_SYMBOL(sock_wmalloc);
1da177e4 2080
98ba0bd5
WB
2081static void sock_ofree(struct sk_buff *skb)
2082{
2083 struct sock *sk = skb->sk;
2084
2085 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2086}
2087
2088struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2089 gfp_t priority)
2090{
2091 struct sk_buff *skb;
2092
2093 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2094 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2095 sysctl_optmem_max)
2096 return NULL;
2097
2098 skb = alloc_skb(size, priority);
2099 if (!skb)
2100 return NULL;
2101
2102 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2103 skb->sk = sk;
2104 skb->destructor = sock_ofree;
2105 return skb;
2106}
2107
4ec93edb 2108/*
1da177e4 2109 * Allocate a memory block from the socket's option memory buffer.
4ec93edb 2110 */
dd0fc66f 2111void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1da177e4 2112{
95c96174 2113 if ((unsigned int)size <= sysctl_optmem_max &&
1da177e4
LT
2114 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2115 void *mem;
2116 /* First do the add, to avoid the race if kmalloc
4ec93edb 2117 * might sleep.
1da177e4
LT
2118 */
2119 atomic_add(size, &sk->sk_omem_alloc);
2120 mem = kmalloc(size, priority);
2121 if (mem)
2122 return mem;
2123 atomic_sub(size, &sk->sk_omem_alloc);
2124 }
2125 return NULL;
2126}
2a91525c 2127EXPORT_SYMBOL(sock_kmalloc);
1da177e4 2128
79e88659
DB
2129/* Free an option memory block. Note, we actually want the inline
2130 * here as this allows gcc to detect the nullify and fold away the
2131 * condition entirely.
1da177e4 2132 */
79e88659
DB
2133static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2134 const bool nullify)
1da177e4 2135{
e53da5fb
DM
2136 if (WARN_ON_ONCE(!mem))
2137 return;
79e88659
DB
2138 if (nullify)
2139 kzfree(mem);
2140 else
2141 kfree(mem);
1da177e4
LT
2142 atomic_sub(size, &sk->sk_omem_alloc);
2143}
79e88659
DB
2144
2145void sock_kfree_s(struct sock *sk, void *mem, int size)
2146{
2147 __sock_kfree_s(sk, mem, size, false);
2148}
2a91525c 2149EXPORT_SYMBOL(sock_kfree_s);
1da177e4 2150
79e88659
DB
2151void sock_kzfree_s(struct sock *sk, void *mem, int size)
2152{
2153 __sock_kfree_s(sk, mem, size, true);
2154}
2155EXPORT_SYMBOL(sock_kzfree_s);
2156
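/*
 * Illustrative sketch, not part of sock.c: option memory helpers are paired.
 * Every sock_kmalloc() must be undone by sock_kfree_s() (or sock_kzfree_s()
 * for sensitive data) with the same size so sk_omem_alloc stays balanced.
 * struct my_opt and the setsockopt-style handler are hypothetical.
 */
struct my_opt {
	u32 flags;
};

static int my_set_option(struct sock *sk, const void __user *optval, int optlen)
{
	struct my_opt *opt;
	int err = 0;

	if (optlen != sizeof(*opt))
		return -EINVAL;

	opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
	if (!opt)
		return -ENOBUFS;

	if (copy_from_user(opt, optval, sizeof(*opt)))
		err = -EFAULT;

	/* ... a real protocol would install opt on the socket here ... */

	sock_kfree_s(sk, opt, sizeof(*opt));
	return err;
}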
1da177e4
LT
2157/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2158 I think these locks should be removed for datagram sockets.
2159 */
2a91525c 2160static long sock_wait_for_wmem(struct sock *sk, long timeo)
1da177e4
LT
2161{
2162 DEFINE_WAIT(wait);
2163
9cd3e072 2164 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1da177e4
LT
2165 for (;;) {
2166 if (!timeo)
2167 break;
2168 if (signal_pending(current))
2169 break;
2170 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
aa395145 2171 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
14afee4b 2172 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1da177e4
LT
2173 break;
2174 if (sk->sk_shutdown & SEND_SHUTDOWN)
2175 break;
2176 if (sk->sk_err)
2177 break;
2178 timeo = schedule_timeout(timeo);
2179 }
aa395145 2180 finish_wait(sk_sleep(sk), &wait);
1da177e4
LT
2181 return timeo;
2182}
2183
2184
2185/*
2186 * Generic send/receive buffer handlers
2187 */
2188
4cc7f68d
HX
2189struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2190 unsigned long data_len, int noblock,
28d64271 2191 int *errcode, int max_page_order)
1da177e4 2192{
2e4e4410 2193 struct sk_buff *skb;
1da177e4
LT
2194 long timeo;
2195 int err;
2196
1da177e4 2197 timeo = sock_sndtimeo(sk, noblock);
2e4e4410 2198 for (;;) {
1da177e4
LT
2199 err = sock_error(sk);
2200 if (err != 0)
2201 goto failure;
2202
2203 err = -EPIPE;
2204 if (sk->sk_shutdown & SEND_SHUTDOWN)
2205 goto failure;
2206
2e4e4410
ED
2207 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2208 break;
28d64271 2209
9cd3e072 2210 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2e4e4410
ED
2211 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2212 err = -EAGAIN;
2213 if (!timeo)
1da177e4 2214 goto failure;
2e4e4410
ED
2215 if (signal_pending(current))
2216 goto interrupted;
2217 timeo = sock_wait_for_wmem(sk, timeo);
1da177e4 2218 }
2e4e4410
ED
2219 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2220 errcode, sk->sk_allocation);
2221 if (skb)
2222 skb_set_owner_w(skb, sk);
1da177e4
LT
2223 return skb;
2224
2225interrupted:
2226 err = sock_intr_errno(timeo);
2227failure:
2228 *errcode = err;
2229 return NULL;
2230}
4cc7f68d 2231EXPORT_SYMBOL(sock_alloc_send_pskb);
1da177e4 2232
4ec93edb 2233struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1da177e4
LT
2234 int noblock, int *errcode)
2235{
28d64271 2236 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1da177e4 2237}
2a91525c 2238EXPORT_SYMBOL(sock_alloc_send_skb);
1da177e4 2239
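/*
 * Illustrative sketch, not part of sock.c: a typical datagram transmit path.
 * sock_alloc_send_skb() blocks (subject to the send timeout) until the skb
 * fits under sk_sndbuf and charges it to the socket via skb_set_owner_w().
 * my_xmit() is a hypothetical transmit hook.
 */
static int my_send_datagram(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct sk_buff *skb;
	int err;

	skb = sock_alloc_send_skb(sk, len + MAX_HEADER,
				  msg->msg_flags & MSG_DONTWAIT, &err);
	if (!skb)
		return err;

	skb_reserve(skb, MAX_HEADER);
	err = memcpy_from_msg(skb_put(skb, len), msg, len);
	if (err) {
		kfree_skb(skb);
		return err;
	}

	return my_xmit(skb);
}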
39771b12
WB
2240int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2241 struct sockcm_cookie *sockc)
2242{
3dd17e63
SHY
2243 u32 tsflags;
2244
39771b12
WB
2245 switch (cmsg->cmsg_type) {
2246 case SO_MARK:
2247 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2248 return -EPERM;
2249 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2250 return -EINVAL;
2251 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2252 break;
7f1bc6e9 2253 case SO_TIMESTAMPING_OLD:
3dd17e63
SHY
2254 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2255 return -EINVAL;
2256
2257 tsflags = *(u32 *)CMSG_DATA(cmsg);
2258 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2259 return -EINVAL;
2260
2261 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2262 sockc->tsflags |= tsflags;
2263 break;
80b14dee
RC
2264 case SCM_TXTIME:
2265 if (!sock_flag(sk, SOCK_TXTIME))
2266 return -EINVAL;
2267 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2268 return -EINVAL;
2269 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2270 break;
779f1ede
SHY
2271 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2272 case SCM_RIGHTS:
2273 case SCM_CREDENTIALS:
2274 break;
39771b12
WB
2275 default:
2276 return -EINVAL;
2277 }
2278 return 0;
2279}
2280EXPORT_SYMBOL(__sock_cmsg_send);
2281
f28ea365
EJ
2282int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2283 struct sockcm_cookie *sockc)
2284{
2285 struct cmsghdr *cmsg;
39771b12 2286 int ret;
f28ea365
EJ
2287
2288 for_each_cmsghdr(cmsg, msg) {
2289 if (!CMSG_OK(msg, cmsg))
2290 return -EINVAL;
2291 if (cmsg->cmsg_level != SOL_SOCKET)
2292 continue;
39771b12
WB
2293 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2294 if (ret)
2295 return ret;
f28ea365
EJ
2296 }
2297 return 0;
2298}
2299EXPORT_SYMBOL(sock_cmsg_send);
2300
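/*
 * Illustrative sketch, not part of sock.c: how a protocol's sendmsg()
 * typically consumes SOL_SOCKET control messages.  The cookie starts from
 * the socket defaults and sock_cmsg_send() overrides it per call, filling
 * sockc->mark, sockc->tsflags and sockc->transmit_time.
 */
static int my_parse_cmsgs(struct sock *sk, struct msghdr *msg,
			  struct sockcm_cookie *sockc)
{
	int err;

	memset(sockc, 0, sizeof(*sockc));
	sockc->tsflags = sk->sk_tsflags;	/* per-call cmsgs override this */

	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, sockc);
		if (unlikely(err))
			return err;
	}
	return 0;
}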
06044751
ED
2301static void sk_enter_memory_pressure(struct sock *sk)
2302{
2303 if (!sk->sk_prot->enter_memory_pressure)
2304 return;
2305
2306 sk->sk_prot->enter_memory_pressure(sk);
2307}
2308
2309static void sk_leave_memory_pressure(struct sock *sk)
2310{
2311 if (sk->sk_prot->leave_memory_pressure) {
2312 sk->sk_prot->leave_memory_pressure(sk);
2313 } else {
2314 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2315
2316 if (memory_pressure && *memory_pressure)
2317 *memory_pressure = 0;
2318 }
2319}
2320
5640f768
ED
2321/* On 32bit arches, an skb frag is limited to 2^15 */
2322#define SKB_FRAG_PAGE_ORDER get_order(32768)
ce27ec60 2323DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
5640f768 2324
400dfd3a
ED
2325/**
2326 * skb_page_frag_refill - check that a page_frag contains enough room
2327 * @sz: minimum size of the fragment we want to get
2328 * @pfrag: pointer to page_frag
82d5e2b8 2329 * @gfp: priority for memory allocation
400dfd3a
ED
2330 *
2331 * Note: While this allocator tries to use high order pages, there is
2332 * no guarantee that allocations succeed. Therefore, @sz MUST be
2333 * less than or equal to PAGE_SIZE.
2334 */
d9b2938a 2335bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
5640f768 2336{
5640f768 2337 if (pfrag->page) {
fe896d18 2338 if (page_ref_count(pfrag->page) == 1) {
5640f768
ED
2339 pfrag->offset = 0;
2340 return true;
2341 }
400dfd3a 2342 if (pfrag->offset + sz <= pfrag->size)
5640f768
ED
2343 return true;
2344 put_page(pfrag->page);
2345 }
2346
d9b2938a 2347 pfrag->offset = 0;
ce27ec60
ED
2348 if (SKB_FRAG_PAGE_ORDER &&
2349 !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
d0164adc
MG
2350 /* Avoid direct reclaim but allow kswapd to wake */
2351 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2352 __GFP_COMP | __GFP_NOWARN |
2353 __GFP_NORETRY,
d9b2938a 2354 SKB_FRAG_PAGE_ORDER);
5640f768 2355 if (likely(pfrag->page)) {
d9b2938a 2356 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
5640f768
ED
2357 return true;
2358 }
d9b2938a
ED
2359 }
2360 pfrag->page = alloc_page(gfp);
2361 if (likely(pfrag->page)) {
2362 pfrag->size = PAGE_SIZE;
2363 return true;
2364 }
400dfd3a
ED
2365 return false;
2366}
2367EXPORT_SYMBOL(skb_page_frag_refill);
2368
2369bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2370{
2371 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2372 return true;
2373
5640f768
ED
2374 sk_enter_memory_pressure(sk);
2375 sk_stream_moderate_sndbuf(sk);
2376 return false;
2377}
2378EXPORT_SYMBOL(sk_page_frag_refill);
2379
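/*
 * Illustrative sketch, not part of sock.c: the usual consumer pattern for
 * the per-socket page_frag, modelled on what stream protocols do — refill,
 * copy at most the space left in the current page, then advance the offset.
 * The refill helper already signals memory pressure on failure.
 */
static int my_copy_to_page_frag(struct sock *sk, struct iov_iter *from, int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;

	copy = min_t(int, copy, pfrag->size - pfrag->offset);
	if (copy_page_from_iter(pfrag->page, pfrag->offset, copy, from) != copy)
		return -EFAULT;

	pfrag->offset += copy;
	return copy;
}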
1da177e4 2380static void __lock_sock(struct sock *sk)
f39234d6
NK
2381 __releases(&sk->sk_lock.slock)
2382 __acquires(&sk->sk_lock.slock)
1da177e4
LT
2383{
2384 DEFINE_WAIT(wait);
2385
e71a4783 2386 for (;;) {
1da177e4
LT
2387 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2388 TASK_UNINTERRUPTIBLE);
2389 spin_unlock_bh(&sk->sk_lock.slock);
2390 schedule();
2391 spin_lock_bh(&sk->sk_lock.slock);
e71a4783 2392 if (!sock_owned_by_user(sk))
1da177e4
LT
2393 break;
2394 }
2395 finish_wait(&sk->sk_lock.wq, &wait);
2396}
2397
8873c064 2398void __release_sock(struct sock *sk)
f39234d6
NK
2399 __releases(&sk->sk_lock.slock)
2400 __acquires(&sk->sk_lock.slock)
1da177e4 2401{
5413d1ba 2402 struct sk_buff *skb, *next;
1da177e4 2403
5413d1ba 2404 while ((skb = sk->sk_backlog.head) != NULL) {
1da177e4 2405 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1da177e4 2406
5413d1ba 2407 spin_unlock_bh(&sk->sk_lock.slock);
1da177e4 2408
5413d1ba
ED
2409 do {
2410 next = skb->next;
e4cbb02a 2411 prefetch(next);
7fee226a 2412 WARN_ON_ONCE(skb_dst_is_noref(skb));
a8305bff 2413 skb_mark_not_on_list(skb);
c57943a1 2414 sk_backlog_rcv(sk, skb);
1da177e4 2415
5413d1ba 2416 cond_resched();
1da177e4
LT
2417
2418 skb = next;
2419 } while (skb != NULL);
2420
5413d1ba
ED
2421 spin_lock_bh(&sk->sk_lock.slock);
2422 }
8eae939f
ZY
2423
2424 /*
2425 * Doing the zeroing here guarantees we cannot loop forever
2426 * while a wild producer attempts to flood us.
2427 */
2428 sk->sk_backlog.len = 0;
1da177e4
LT
2429}
2430
d41a69f1
ED
2431void __sk_flush_backlog(struct sock *sk)
2432{
2433 spin_lock_bh(&sk->sk_lock.slock);
2434 __release_sock(sk);
2435 spin_unlock_bh(&sk->sk_lock.slock);
2436}
2437
1da177e4
LT
2438/**
2439 * sk_wait_data - wait for data to arrive at sk_receive_queue
4dc3b16b
PP
2440 * @sk: sock to wait on
2441 * @timeo: for how long
dfbafc99 2442 * @skb: last skb seen on sk_receive_queue
1da177e4
LT
2443 *
2444 * Now the socket state, including sk->sk_err, is changed only under the lock;
2445 * hence we may omit checks after joining the wait queue.
2446 * We check the receive queue before schedule() only as an optimization;
2447 * it is very likely that release_sock() added new data.
2448 */
dfbafc99 2449int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
1da177e4 2450{
d9dc8b0f 2451 DEFINE_WAIT_FUNC(wait, woken_wake_function);
1da177e4 2452 int rc;
1da177e4 2453
d9dc8b0f 2454 add_wait_queue(sk_sleep(sk), &wait);
9cd3e072 2455 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
d9dc8b0f 2456 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
9cd3e072 2457 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
d9dc8b0f 2458 remove_wait_queue(sk_sleep(sk), &wait);
1da177e4
LT
2459 return rc;
2460}
1da177e4
LT
2461EXPORT_SYMBOL(sk_wait_data);
2462
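/*
 * Illustrative sketch, not part of sock.c: a blocking receive loop built on
 * sk_wait_data(), modelled on simple datagram protocols.  The caller holds
 * the socket lock; sk_wait_data() sleeps until the tail of sk_receive_queue
 * changes, the timeout expires or a signal arrives.
 */
static struct sk_buff *my_wait_for_skb(struct sock *sk, int flags, int *err)
{
	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		*err = sock_error(sk);
		if (*err)
			return NULL;

		*err = 0;
		if (sk->sk_shutdown & RCV_SHUTDOWN)
			return NULL;		/* treated as EOF */

		*err = -EAGAIN;
		if (!timeo)
			return NULL;

		*err = sock_intr_errno(timeo);
		if (signal_pending(current))
			return NULL;

		sk_wait_data(sk, &timeo, NULL);
	}

	*err = 0;
	return skb;
}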
3ab224be 2463/**
f8c3bf00 2464 * __sk_mem_raise_allocated - increase memory_allocated
3ab224be
HA
2465 * @sk: socket
2466 * @size: memory size to allocate
f8c3bf00 2467 * @amt: pages to allocate
3ab224be
HA
2468 * @kind: allocation type
2469 *
f8c3bf00 2470 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
3ab224be 2471 */
f8c3bf00 2472int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3ab224be
HA
2473{
2474 struct proto *prot = sk->sk_prot;
f8c3bf00 2475 long allocated = sk_memory_allocated_add(sk, amt);
d6f19938 2476 bool charged = true;
e805605c 2477
baac50bb 2478 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
d6f19938 2479 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
e805605c 2480 goto suppress_allocation;
3ab224be
HA
2481
2482 /* Under limit. */
e805605c 2483 if (allocated <= sk_prot_mem_limits(sk, 0)) {
180d8cd9 2484 sk_leave_memory_pressure(sk);
3ab224be
HA
2485 return 1;
2486 }
2487
e805605c
JW
2488 /* Under pressure. */
2489 if (allocated > sk_prot_mem_limits(sk, 1))
180d8cd9 2490 sk_enter_memory_pressure(sk);
3ab224be 2491
e805605c
JW
2492 /* Over hard limit. */
2493 if (allocated > sk_prot_mem_limits(sk, 2))
3ab224be
HA
2494 goto suppress_allocation;
2495
2496 /* guarantee minimum buffer size under pressure */
2497 if (kind == SK_MEM_RECV) {
a3dcaf17 2498 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3ab224be 2499 return 1;
180d8cd9 2500
3ab224be 2501 } else { /* SK_MEM_SEND */
a3dcaf17
ED
2502 int wmem0 = sk_get_wmem0(sk, prot);
2503
3ab224be 2504 if (sk->sk_type == SOCK_STREAM) {
a3dcaf17 2505 if (sk->sk_wmem_queued < wmem0)
3ab224be 2506 return 1;
a3dcaf17 2507 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3ab224be 2508 return 1;
a3dcaf17 2509 }
3ab224be
HA
2510 }
2511
180d8cd9 2512 if (sk_has_memory_pressure(sk)) {
5bf325a5 2513 u64 alloc;
1748376b 2514
180d8cd9 2515 if (!sk_under_memory_pressure(sk))
1748376b 2516 return 1;
180d8cd9
GC
2517 alloc = sk_sockets_allocated_read_positive(sk);
2518 if (sk_prot_mem_limits(sk, 2) > alloc *
3ab224be
HA
2519 sk_mem_pages(sk->sk_wmem_queued +
2520 atomic_read(&sk->sk_rmem_alloc) +
2521 sk->sk_forward_alloc))
2522 return 1;
2523 }
2524
2525suppress_allocation:
2526
2527 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2528 sk_stream_moderate_sndbuf(sk);
2529
2530 /* Fail only if socket is _under_ its sndbuf.
2531 * In this case we cannot block, so we have to fail.
2532 */
2533 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2534 return 1;
2535 }
2536
d6f19938
YS
2537 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2538 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3847ce32 2539
0e90b31f 2540 sk_memory_allocated_sub(sk, amt);
180d8cd9 2541
baac50bb
JW
2542 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2543 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
e805605c 2544
3ab224be
HA
2545 return 0;
2546}
f8c3bf00
PA
2547EXPORT_SYMBOL(__sk_mem_raise_allocated);
2548
2549/**
2550 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2551 * @sk: socket
2552 * @size: memory size to allocate
2553 * @kind: allocation type
2554 *
2555 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2556 * rmem allocation. This function assumes that protocols which have
2557 * memory_pressure use sk_wmem_queued as write buffer accounting.
2558 */
2559int __sk_mem_schedule(struct sock *sk, int size, int kind)
2560{
2561 int ret, amt = sk_mem_pages(size);
2562
2563 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2564 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2565 if (!ret)
2566 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2567 return ret;
2568}
3ab224be
HA
2569EXPORT_SYMBOL(__sk_mem_schedule);
2570
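/*
 * Illustrative sketch, not part of sock.c: how a protocol typically accounts
 * an incoming skb before queueing it.  sk_rmem_schedule() ends up in
 * __sk_mem_schedule(..., SK_MEM_RECV); skb_set_owner_r() then charges the
 * accepted truesize to sk_rmem_alloc and sk_forward_alloc.
 */
static int my_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >
	    (unsigned int)sk->sk_rcvbuf)
		return -ENOBUFS;

	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;

	skb_set_owner_r(skb, sk);
	skb_queue_tail(&sk->sk_receive_queue, skb);
	return 0;
}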
2571/**
f8c3bf00 2572 * __sk_mem_reduce_allocated - reclaim memory_allocated
3ab224be 2573 * @sk: socket
f8c3bf00
PA
2574 * @amount: number of quanta
2575 *
2576 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3ab224be 2577 */
f8c3bf00 2578void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3ab224be 2579{
1a24e04e 2580 sk_memory_allocated_sub(sk, amount);
3ab224be 2581
baac50bb
JW
2582 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2583 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
e805605c 2584
180d8cd9
GC
2585 if (sk_under_memory_pressure(sk) &&
2586 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2587 sk_leave_memory_pressure(sk);
3ab224be 2588}
f8c3bf00
PA
2589EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2590
2591/**
2592 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2593 * @sk: socket
2594 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2595 */
2596void __sk_mem_reclaim(struct sock *sk, int amount)
2597{
2598 amount >>= SK_MEM_QUANTUM_SHIFT;
2599 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2600 __sk_mem_reduce_allocated(sk, amount);
2601}
3ab224be
HA
2602EXPORT_SYMBOL(__sk_mem_reclaim);
2603
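/*
 * Illustrative sketch, not part of sock.c: the usual teardown pairing.
 * Freeing skbs owned via skb_set_owner_r() runs sock_rfree(), which
 * uncharges their truesize; sk_mem_reclaim() (a wrapper around
 * __sk_mem_reclaim()) then returns the now-unused forward_alloc quanta.
 */
static void my_destroy_sock_queues(struct sock *sk)
{
	__skb_queue_purge(&sk->sk_receive_queue);
	sk_mem_reclaim(sk);
}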
627d2d6b 2604int sk_set_peek_off(struct sock *sk, int val)
2605{
627d2d6b 2606 sk->sk_peek_off = val;
2607 return 0;
2608}
2609EXPORT_SYMBOL_GPL(sk_set_peek_off);
3ab224be 2610
1da177e4
LT
2611/*
2612 * Set of default routines for initialising struct proto_ops when
2613 * the protocol does not support a particular function. In certain
2614 * cases where it makes no sense for a protocol to have a "do nothing"
2615 * function, some default processing is provided.
2616 */
2617
2618int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2619{
2620 return -EOPNOTSUPP;
2621}
2a91525c 2622EXPORT_SYMBOL(sock_no_bind);
1da177e4 2623
4ec93edb 2624int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1da177e4
LT
2625 int len, int flags)
2626{
2627 return -EOPNOTSUPP;
2628}
2a91525c 2629EXPORT_SYMBOL(sock_no_connect);
1da177e4
LT
2630
2631int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2632{
2633 return -EOPNOTSUPP;
2634}
2a91525c 2635EXPORT_SYMBOL(sock_no_socketpair);
1da177e4 2636
cdfbabfb
DH
2637int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2638 bool kern)
1da177e4
LT
2639{
2640 return -EOPNOTSUPP;
2641}
2a91525c 2642EXPORT_SYMBOL(sock_no_accept);
1da177e4 2643
4ec93edb 2644int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
9b2c45d4 2645 int peer)
1da177e4
LT
2646{
2647 return -EOPNOTSUPP;
2648}
2a91525c 2649EXPORT_SYMBOL(sock_no_getname);
1da177e4 2650
1da177e4
LT
2651int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2652{
2653 return -EOPNOTSUPP;
2654}
2a91525c 2655EXPORT_SYMBOL(sock_no_ioctl);
1da177e4
LT
2656
2657int sock_no_listen(struct socket *sock, int backlog)
2658{
2659 return -EOPNOTSUPP;
2660}
2a91525c 2661EXPORT_SYMBOL(sock_no_listen);
1da177e4
LT
2662
2663int sock_no_shutdown(struct socket *sock, int how)
2664{
2665 return -EOPNOTSUPP;
2666}
2a91525c 2667EXPORT_SYMBOL(sock_no_shutdown);
1da177e4
LT
2668
2669int sock_no_setsockopt(struct socket *sock, int level, int optname,
b7058842 2670 char __user *optval, unsigned int optlen)
1da177e4
LT
2671{
2672 return -EOPNOTSUPP;
2673}
2a91525c 2674EXPORT_SYMBOL(sock_no_setsockopt);
1da177e4
LT
2675
2676int sock_no_getsockopt(struct socket *sock, int level, int optname,
2677 char __user *optval, int __user *optlen)
2678{
2679 return -EOPNOTSUPP;
2680}
2a91525c 2681EXPORT_SYMBOL(sock_no_getsockopt);
1da177e4 2682
1b784140 2683int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
1da177e4
LT
2684{
2685 return -EOPNOTSUPP;
2686}
2a91525c 2687EXPORT_SYMBOL(sock_no_sendmsg);
1da177e4 2688
306b13eb
TH
2689int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2690{
2691 return -EOPNOTSUPP;
2692}
2693EXPORT_SYMBOL(sock_no_sendmsg_locked);
2694
1b784140
YX
2695int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2696 int flags)
1da177e4
LT
2697{
2698 return -EOPNOTSUPP;
2699}
2a91525c 2700EXPORT_SYMBOL(sock_no_recvmsg);
1da177e4
LT
2701
2702int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2703{
2704 /* Mirror missing mmap method error code */
2705 return -ENODEV;
2706}
2a91525c 2707EXPORT_SYMBOL(sock_no_mmap);
1da177e4
LT
2708
2709ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2710{
2711 ssize_t res;
2712 struct msghdr msg = {.msg_flags = flags};
2713 struct kvec iov;
2714 char *kaddr = kmap(page);
2715 iov.iov_base = kaddr + offset;
2716 iov.iov_len = size;
2717 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2718 kunmap(page);
2719 return res;
2720}
2a91525c 2721EXPORT_SYMBOL(sock_no_sendpage);
1da177e4 2722
306b13eb
TH
2723ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2724 int offset, size_t size, int flags)
2725{
2726 ssize_t res;
2727 struct msghdr msg = {.msg_flags = flags};
2728 struct kvec iov;
2729 char *kaddr = kmap(page);
2730
2731 iov.iov_base = kaddr + offset;
2732 iov.iov_len = size;
2733 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2734 kunmap(page);
2735 return res;
2736}
2737EXPORT_SYMBOL(sock_no_sendpage_locked);
2738
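/*
 * Illustrative sketch, not part of sock.c: a proto_ops table for a family
 * that implements only a few operations and uses the sock_no_*() stubs
 * above for everything else.  my_family_ops and the my_*() handlers are
 * hypothetical.
 */
static int my_release(struct socket *sock);
static int my_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
static int my_sendmsg(struct socket *sock, struct msghdr *m, size_t len);
static int my_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags);

static const struct proto_ops my_family_ops = {
	.family		= PF_UNSPEC,
	.owner		= THIS_MODULE,
	.release	= my_release,
	.bind		= my_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= datagram_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= sock_no_setsockopt,
	.getsockopt	= sock_no_getsockopt,
	.sendmsg	= my_sendmsg,
	.recvmsg	= my_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};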
1da177e4
LT
2739/*
2740 * Default Socket Callbacks
2741 */
2742
2743static void sock_def_wakeup(struct sock *sk)
2744{
43815482
ED
2745 struct socket_wq *wq;
2746
2747 rcu_read_lock();
2748 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2749 if (skwq_has_sleeper(wq))
43815482
ED
2750 wake_up_interruptible_all(&wq->wait);
2751 rcu_read_unlock();
1da177e4
LT
2752}
2753
2754static void sock_def_error_report(struct sock *sk)
2755{
43815482
ED
2756 struct socket_wq *wq;
2757
2758 rcu_read_lock();
2759 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2760 if (skwq_has_sleeper(wq))
a9a08845 2761 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
8d8ad9d7 2762 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
43815482 2763 rcu_read_unlock();
1da177e4
LT
2764}
2765
676d2369 2766static void sock_def_readable(struct sock *sk)
1da177e4 2767{
43815482
ED
2768 struct socket_wq *wq;
2769
2770 rcu_read_lock();
2771 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2772 if (skwq_has_sleeper(wq))
a9a08845
LT
2773 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2774 EPOLLRDNORM | EPOLLRDBAND);
8d8ad9d7 2775 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
43815482 2776 rcu_read_unlock();
1da177e4
LT
2777}
2778
2779static void sock_def_write_space(struct sock *sk)
2780{
43815482
ED
2781 struct socket_wq *wq;
2782
2783 rcu_read_lock();
1da177e4
LT
2784
2785 /* Do not wake up a writer until he can make "significant"
2786 * progress. --DaveM
2787 */
14afee4b 2788 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
43815482 2789 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2790 if (skwq_has_sleeper(wq))
a9a08845
LT
2791 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2792 EPOLLWRNORM | EPOLLWRBAND);
1da177e4
LT
2793
2794 /* Should agree with poll, otherwise some programs break */
2795 if (sock_writeable(sk))
8d8ad9d7 2796 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1da177e4
LT
2797 }
2798
43815482 2799 rcu_read_unlock();
1da177e4
LT
2800}
2801
2802static void sock_def_destruct(struct sock *sk)
2803{
1da177e4
LT
2804}
2805
2806void sk_send_sigurg(struct sock *sk)
2807{
2808 if (sk->sk_socket && sk->sk_socket->file)
2809 if (send_sigurg(&sk->sk_socket->file->f_owner))
8d8ad9d7 2810 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1da177e4 2811}
2a91525c 2812EXPORT_SYMBOL(sk_send_sigurg);
1da177e4
LT
2813
2814void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2815 unsigned long expires)
2816{
2817 if (!mod_timer(timer, expires))
2818 sock_hold(sk);
2819}
1da177e4
LT
2820EXPORT_SYMBOL(sk_reset_timer);
2821
2822void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2823{
25cc4ae9 2824 if (del_timer(timer))
1da177e4
LT
2825 __sock_put(sk);
2826}
1da177e4
LT
2827EXPORT_SYMBOL(sk_stop_timer);
2828
2829void sock_init_data(struct socket *sock, struct sock *sk)
2830{
581319c5 2831 sk_init_common(sk);
1da177e4
LT
2832 sk->sk_send_head = NULL;
2833
99767f27 2834 timer_setup(&sk->sk_timer, NULL, 0);
4ec93edb 2835
1da177e4
LT
2836 sk->sk_allocation = GFP_KERNEL;
2837 sk->sk_rcvbuf = sysctl_rmem_default;
2838 sk->sk_sndbuf = sysctl_wmem_default;
2839 sk->sk_state = TCP_CLOSE;
972692e0 2840 sk_set_socket(sk, sock);
1da177e4
LT
2841
2842 sock_set_flag(sk, SOCK_ZAPPED);
2843
e71a4783 2844 if (sock) {
1da177e4 2845 sk->sk_type = sock->type;
c2f26e8f 2846 RCU_INIT_POINTER(sk->sk_wq, sock->wq);
1da177e4 2847 sock->sk = sk;
86741ec2
LC
2848 sk->sk_uid = SOCK_INODE(sock)->i_uid;
2849 } else {
c2f26e8f 2850 RCU_INIT_POINTER(sk->sk_wq, NULL);
86741ec2
LC
2851 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
2852 }
1da177e4 2853
1da177e4 2854 rwlock_init(&sk->sk_callback_lock);
cdfbabfb
DH
2855 if (sk->sk_kern_sock)
2856 lockdep_set_class_and_name(
2857 &sk->sk_callback_lock,
2858 af_kern_callback_keys + sk->sk_family,
2859 af_family_kern_clock_key_strings[sk->sk_family]);
2860 else
2861 lockdep_set_class_and_name(
2862 &sk->sk_callback_lock,
443aef0e
PZ
2863 af_callback_keys + sk->sk_family,
2864 af_family_clock_key_strings[sk->sk_family]);
1da177e4
LT
2865
2866 sk->sk_state_change = sock_def_wakeup;
2867 sk->sk_data_ready = sock_def_readable;
2868 sk->sk_write_space = sock_def_write_space;
2869 sk->sk_error_report = sock_def_error_report;
2870 sk->sk_destruct = sock_def_destruct;
2871
5640f768
ED
2872 sk->sk_frag.page = NULL;
2873 sk->sk_frag.offset = 0;
ef64a54f 2874 sk->sk_peek_off = -1;
1da177e4 2875
109f6e39
EB
2876 sk->sk_peer_pid = NULL;
2877 sk->sk_peer_cred = NULL;
1da177e4
LT
2878 sk->sk_write_pending = 0;
2879 sk->sk_rcvlowat = 1;
2880 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2881 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2882
6c7c98ba 2883 sk->sk_stamp = SK_DEFAULT_STAMP;
3a0ed3e9
DD
2884#if BITS_PER_LONG==32
2885 seqlock_init(&sk->sk_stamp_seq);
2886#endif
52267790 2887 atomic_set(&sk->sk_zckey, 0);
1da177e4 2888
e0d1095a 2889#ifdef CONFIG_NET_RX_BUSY_POLL
06021292 2890 sk->sk_napi_id = 0;
64b0dc51 2891 sk->sk_ll_usec = sysctl_net_busy_read;
06021292
ET
2892#endif
2893
76a9ebe8
ED
2894 sk->sk_max_pacing_rate = ~0UL;
2895 sk->sk_pacing_rate = ~0UL;
3a9b76fd 2896 sk->sk_pacing_shift = 10;
70da268b 2897 sk->sk_incoming_cpu = -1;
c6345ce7
AN
2898
2899 sk_rx_queue_clear(sk);
4dc6dc71
ED
2900 /*
2901 * Before updating sk_refcnt, we must commit prior changes to memory
2902 * (Documentation/RCU/rculist_nulls.txt for details)
2903 */
2904 smp_wmb();
41c6d650 2905 refcount_set(&sk->sk_refcnt, 1);
33c732c3 2906 atomic_set(&sk->sk_drops, 0);
1da177e4 2907}
2a91525c 2908EXPORT_SYMBOL(sock_init_data);
1da177e4 2909
b5606c2d 2910void lock_sock_nested(struct sock *sk, int subclass)
1da177e4
LT
2911{
2912 might_sleep();
a5b5bb9a 2913 spin_lock_bh(&sk->sk_lock.slock);
d2e9117c 2914 if (sk->sk_lock.owned)
1da177e4 2915 __lock_sock(sk);
d2e9117c 2916 sk->sk_lock.owned = 1;
a5b5bb9a
IM
2917 spin_unlock(&sk->sk_lock.slock);
2918 /*
2919 * The sk_lock has mutex_lock() semantics here:
2920 */
fcc70d5f 2921 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
a5b5bb9a 2922 local_bh_enable();
1da177e4 2923}
fcc70d5f 2924EXPORT_SYMBOL(lock_sock_nested);
1da177e4 2925
b5606c2d 2926void release_sock(struct sock *sk)
1da177e4 2927{
a5b5bb9a 2928 spin_lock_bh(&sk->sk_lock.slock);
1da177e4
LT
2929 if (sk->sk_backlog.tail)
2930 __release_sock(sk);
46d3ceab 2931
c3f9b018
ED
2932 /* Warning : release_cb() might need to release sk ownership,
2933 * ie call sock_release_ownership(sk) before us.
2934 */
46d3ceab
ED
2935 if (sk->sk_prot->release_cb)
2936 sk->sk_prot->release_cb(sk);
2937
c3f9b018 2938 sock_release_ownership(sk);
a5b5bb9a
IM
2939 if (waitqueue_active(&sk->sk_lock.wq))
2940 wake_up(&sk->sk_lock.wq);
2941 spin_unlock_bh(&sk->sk_lock.slock);
1da177e4
LT
2942}
2943EXPORT_SYMBOL(release_sock);
2944
8a74ad60
ED
2945/**
2946 * lock_sock_fast - fast version of lock_sock
2947 * @sk: socket
2948 *
2949 * This version should be used for very small sections, where the process won't block.
d651983d
MCC
2950 * return false if fast path is taken:
2951 *
8a74ad60 2952 * sk_lock.slock locked, owned = 0, BH disabled
d651983d
MCC
2953 *
2954 * return true if slow path is taken:
2955 *
8a74ad60
ED
2956 * sk_lock.slock unlocked, owned = 1, BH enabled
2957 */
2958bool lock_sock_fast(struct sock *sk)
2959{
2960 might_sleep();
2961 spin_lock_bh(&sk->sk_lock.slock);
2962
2963 if (!sk->sk_lock.owned)
2964 /*
2965 * Note : We must disable BH
2966 */
2967 return false;
2968
2969 __lock_sock(sk);
2970 sk->sk_lock.owned = 1;
2971 spin_unlock(&sk->sk_lock.slock);
2972 /*
2973 * The sk_lock has mutex_lock() semantics here:
2974 */
2975 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2976 local_bh_enable();
2977 return true;
2978}
2979EXPORT_SYMBOL(lock_sock_fast);
2980
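/*
 * Illustrative sketch, not part of sock.c: the canonical caller pattern.
 * unlock_sock_fast() needs the return value so it knows whether to
 * release_sock() (slow path was taken) or only re-enable BHs (fast path).
 */
static int my_read_sndbuf(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);
	int val = sk->sk_sndbuf;	/* short, non-blocking critical section */

	unlock_sock_fast(sk, slow);
	return val;
}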
c7cbdbf2
AB
2981int sock_gettstamp(struct socket *sock, void __user *userstamp,
2982 bool timeval, bool time32)
4ec93edb 2983{
c7cbdbf2
AB
2984 struct sock *sk = sock->sk;
2985 struct timespec64 ts;
9dae3497
YS
2986
2987 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
c7cbdbf2
AB
2988 ts = ktime_to_timespec64(sock_read_timestamp(sk));
2989 if (ts.tv_sec == -1)
1da177e4 2990 return -ENOENT;
c7cbdbf2 2991 if (ts.tv_sec == 0) {
3a0ed3e9 2992 ktime_t kt = ktime_get_real();
c7cbdbf2
AB
2993 sock_write_timestamp(sk, kt);
2994 ts = ktime_to_timespec64(kt);
b7aa0bf7 2995 }
1da177e4 2996
c7cbdbf2
AB
2997 if (timeval)
2998 ts.tv_nsec /= 1000;
9dae3497 2999
c7cbdbf2
AB
3000#ifdef CONFIG_COMPAT_32BIT_TIME
3001 if (time32)
3002 return put_old_timespec32(&ts, userstamp);
3003#endif
3004#ifdef CONFIG_SPARC64
3005 /* beware of padding in sparc64 timeval */
3006 if (timeval && !in_compat_syscall()) {
3007 struct __kernel_old_timeval __user tv = {
c98f4822
SR
3008 .tv_sec = ts.tv_sec,
3009 .tv_usec = ts.tv_nsec,
c7cbdbf2 3010 };
c98f4822 3011 if (copy_to_user(userstamp, &tv, sizeof(tv)))
c7cbdbf2
AB
3012 return -EFAULT;
3013 return 0;
ae40eb1e 3014 }
c7cbdbf2
AB
3015#endif
3016 return put_timespec64(&ts, userstamp);
ae40eb1e 3017}
c7cbdbf2 3018EXPORT_SYMBOL(sock_gettstamp);
ae40eb1e 3019
20d49473 3020void sock_enable_timestamp(struct sock *sk, int flag)
4ec93edb 3021{
20d49473 3022 if (!sock_flag(sk, flag)) {
08e29af3
ED
3023 unsigned long previous_flags = sk->sk_flags;
3024
20d49473
PO
3025 sock_set_flag(sk, flag);
3026 /*
3027 * we just set one of the two flags which require net
3028 * time stamping, but time stamping might have been on
3029 * already because of the other one
3030 */
080a270f
HFS
3031 if (sock_needs_netstamp(sk) &&
3032 !(previous_flags & SK_FLAGS_TIMESTAMP))
20d49473 3033 net_enable_timestamp();
1da177e4
LT
3034 }
3035}
1da177e4 3036
cb820f8e
RC
3037int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3038 int level, int type)
3039{
3040 struct sock_exterr_skb *serr;
364a9e93 3041 struct sk_buff *skb;
cb820f8e
RC
3042 int copied, err;
3043
3044 err = -EAGAIN;
364a9e93 3045 skb = sock_dequeue_err_skb(sk);
cb820f8e
RC
3046 if (skb == NULL)
3047 goto out;
3048
3049 copied = skb->len;
3050 if (copied > len) {
3051 msg->msg_flags |= MSG_TRUNC;
3052 copied = len;
3053 }
51f3d02b 3054 err = skb_copy_datagram_msg(skb, 0, msg, copied);
cb820f8e
RC
3055 if (err)
3056 goto out_free_skb;
3057
3058 sock_recv_timestamp(msg, sk, skb);
3059
3060 serr = SKB_EXT_ERR(skb);
3061 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3062
3063 msg->msg_flags |= MSG_ERRQUEUE;
3064 err = copied;
3065
cb820f8e
RC
3066out_free_skb:
3067 kfree_skb(skb);
3068out:
3069 return err;
3070}
3071EXPORT_SYMBOL(sock_recv_errqueue);
3072
1da177e4
LT
3073/*
3074 * Get a socket option on a socket.
3075 *
3076 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3077 * asynchronous errors should be reported by getsockopt. We assume
3078 * this means if you specify SO_ERROR (otherwise what's the point of it).
3079 */
3080int sock_common_getsockopt(struct socket *sock, int level, int optname,
3081 char __user *optval, int __user *optlen)
3082{
3083 struct sock *sk = sock->sk;
3084
3085 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3086}
1da177e4
LT
3087EXPORT_SYMBOL(sock_common_getsockopt);
3088
3fdadf7d 3089#ifdef CONFIG_COMPAT
543d9cfe
ACM
3090int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3091 char __user *optval, int __user *optlen)
3fdadf7d
DM
3092{
3093 struct sock *sk = sock->sk;
3094
1e51f951 3095 if (sk->sk_prot->compat_getsockopt != NULL)
543d9cfe
ACM
3096 return sk->sk_prot->compat_getsockopt(sk, level, optname,
3097 optval, optlen);
3fdadf7d
DM
3098 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3099}
3100EXPORT_SYMBOL(compat_sock_common_getsockopt);
3101#endif
3102
1b784140
YX
3103int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3104 int flags)
1da177e4
LT
3105{
3106 struct sock *sk = sock->sk;
3107 int addr_len = 0;
3108 int err;
3109
1b784140 3110 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
1da177e4
LT
3111 flags & ~MSG_DONTWAIT, &addr_len);
3112 if (err >= 0)
3113 msg->msg_namelen = addr_len;
3114 return err;
3115}
1da177e4
LT
3116EXPORT_SYMBOL(sock_common_recvmsg);
3117
3118/*
3119 * Set socket options on an inet socket.
3120 */
3121int sock_common_setsockopt(struct socket *sock, int level, int optname,
b7058842 3122 char __user *optval, unsigned int optlen)
1da177e4
LT
3123{
3124 struct sock *sk = sock->sk;
3125
3126 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3127}
1da177e4
LT
3128EXPORT_SYMBOL(sock_common_setsockopt);
3129
3fdadf7d 3130#ifdef CONFIG_COMPAT
543d9cfe 3131int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
b7058842 3132 char __user *optval, unsigned int optlen)
3fdadf7d
DM
3133{
3134 struct sock *sk = sock->sk;
3135
543d9cfe
ACM
3136 if (sk->sk_prot->compat_setsockopt != NULL)
3137 return sk->sk_prot->compat_setsockopt(sk, level, optname,
3138 optval, optlen);
3fdadf7d
DM
3139 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3140}
3141EXPORT_SYMBOL(compat_sock_common_setsockopt);
3142#endif
3143
1da177e4
LT
3144void sk_common_release(struct sock *sk)
3145{
3146 if (sk->sk_prot->destroy)
3147 sk->sk_prot->destroy(sk);
3148
3149 /*
3150 * Observation: when sk_common_release is called, processes have
3151 * no access to the socket. But the network still does.
3152 * Step one, detach it from networking:
3153 *
3154 * A. Remove from hash tables.
3155 */
3156
3157 sk->sk_prot->unhash(sk);
3158
3159 /*
3160 * At this point the socket cannot receive new packets, but it is possible
3161 * that some packets are in flight because some CPU runs the receiver and
3162 * did a hash table lookup before we unhashed the socket. They will reach
3163 * the receive queue and will be purged by the socket destructor.
3164 *
3165 * Also we still have packets pending on the receive queue and probably
3166 * our own packets waiting in device queues. sock_destroy will drain the
3167 * receive queue, but transmitted packets will delay socket destruction
3168 * until the last reference is released.
3169 */
3170
3171 sock_orphan(sk);
3172
3173 xfrm_sk_free_policy(sk);
3174
e6848976 3175 sk_refcnt_debug_release(sk);
5640f768 3176
1da177e4
LT
3177 sock_put(sk);
3178}
1da177e4
LT
3179EXPORT_SYMBOL(sk_common_release);
3180
a2d133b1
JH
3181void sk_get_meminfo(const struct sock *sk, u32 *mem)
3182{
3183 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3184
3185 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3186 mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3187 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3188 mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3189 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3190 mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3191 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3192 mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3193 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3194}
3195
13ff3d6f
PE
3196#ifdef CONFIG_PROC_FS
3197#define PROTO_INUSE_NR 64 /* should be enough for the first time */
1338d466
PE
3198struct prot_inuse {
3199 int val[PROTO_INUSE_NR];
3200};
13ff3d6f
PE
3201
3202static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
70ee1159 3203
70ee1159
PE
3204void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3205{
08fc7f81 3206 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
70ee1159
PE
3207}
3208EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3209
3210int sock_prot_inuse_get(struct net *net, struct proto *prot)
3211{
3212 int cpu, idx = prot->inuse_idx;
3213 int res = 0;
3214
3215 for_each_possible_cpu(cpu)
08fc7f81 3216 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
70ee1159
PE
3217
3218 return res >= 0 ? res : 0;
3219}
3220EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3221
648845ab
TZ
3222static void sock_inuse_add(struct net *net, int val)
3223{
3224 this_cpu_add(*net->core.sock_inuse, val);
3225}
3226
3227int sock_inuse_get(struct net *net)
3228{
3229 int cpu, res = 0;
3230
3231 for_each_possible_cpu(cpu)
3232 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3233
3234 return res;
3235}
3236
3237EXPORT_SYMBOL_GPL(sock_inuse_get);
3238
2c8c1e72 3239static int __net_init sock_inuse_init_net(struct net *net)
70ee1159 3240{
08fc7f81 3241 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
648845ab
TZ
3242 if (net->core.prot_inuse == NULL)
3243 return -ENOMEM;
3244
3245 net->core.sock_inuse = alloc_percpu(int);
3246 if (net->core.sock_inuse == NULL)
3247 goto out;
3248
3249 return 0;
3250
3251out:
3252 free_percpu(net->core.prot_inuse);
3253 return -ENOMEM;
70ee1159
PE
3254}
3255
2c8c1e72 3256static void __net_exit sock_inuse_exit_net(struct net *net)
70ee1159 3257{
08fc7f81 3258 free_percpu(net->core.prot_inuse);
648845ab 3259 free_percpu(net->core.sock_inuse);
70ee1159
PE
3260}
3261
3262static struct pernet_operations net_inuse_ops = {
3263 .init = sock_inuse_init_net,
3264 .exit = sock_inuse_exit_net,
3265};
3266
3267static __init int net_inuse_init(void)
3268{
3269 if (register_pernet_subsys(&net_inuse_ops))
3270 panic("Cannot initialize net inuse counters");
3271
3272 return 0;
3273}
3274
3275core_initcall(net_inuse_init);
13ff3d6f
PE
3276
3277static void assign_proto_idx(struct proto *prot)
3278{
3279 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3280
3281 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
e005d193 3282 pr_err("PROTO_INUSE_NR exhausted\n");
13ff3d6f
PE
3283 return;
3284 }
3285
3286 set_bit(prot->inuse_idx, proto_inuse_idx);
3287}
3288
3289static void release_proto_idx(struct proto *prot)
3290{
3291 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3292 clear_bit(prot->inuse_idx, proto_inuse_idx);
3293}
3294#else
3295static inline void assign_proto_idx(struct proto *prot)
3296{
3297}
3298
3299static inline void release_proto_idx(struct proto *prot)
3300{
3301}
648845ab
TZ
3302
3303static void sock_inuse_add(struct net *net, int val)
3304{
3305}
13ff3d6f
PE
3306#endif
3307
0159dfd3
ED
3308static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3309{
3310 if (!rsk_prot)
3311 return;
3312 kfree(rsk_prot->slab_name);
3313 rsk_prot->slab_name = NULL;
adf78eda
JL
3314 kmem_cache_destroy(rsk_prot->slab);
3315 rsk_prot->slab = NULL;
0159dfd3
ED
3316}
3317
3318static int req_prot_init(const struct proto *prot)
3319{
3320 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3321
3322 if (!rsk_prot)
3323 return 0;
3324
3325 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3326 prot->name);
3327 if (!rsk_prot->slab_name)
3328 return -ENOMEM;
3329
3330 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3331 rsk_prot->obj_size, 0,
e699e2c6
SB
3332 SLAB_ACCOUNT | prot->slab_flags,
3333 NULL);
0159dfd3
ED
3334
3335 if (!rsk_prot->slab) {
3336 pr_crit("%s: Can't create request sock SLAB cache!\n",
3337 prot->name);
3338 return -ENOMEM;
3339 }
3340 return 0;
3341}
3342
b733c007
PE
3343int proto_register(struct proto *prot, int alloc_slab)
3344{
1da177e4 3345 if (alloc_slab) {
30c2c9f1
DW
3346 prot->slab = kmem_cache_create_usercopy(prot->name,
3347 prot->obj_size, 0,
e699e2c6
SB
3348 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3349 prot->slab_flags,
289a4860 3350 prot->useroffset, prot->usersize,
271b72c7 3351 NULL);
1da177e4
LT
3352
3353 if (prot->slab == NULL) {
e005d193
JP
3354 pr_crit("%s: Can't create sock SLAB cache!\n",
3355 prot->name);
60e7663d 3356 goto out;
1da177e4 3357 }
2e6599cb 3358
0159dfd3
ED
3359 if (req_prot_init(prot))
3360 goto out_free_request_sock_slab;
8feaf0c0 3361
6d6ee43e 3362 if (prot->twsk_prot != NULL) {
faf23422 3363 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
8feaf0c0 3364
7e56b5d6 3365 if (prot->twsk_prot->twsk_slab_name == NULL)
8feaf0c0
ACM
3366 goto out_free_request_sock_slab;
3367
6d6ee43e 3368 prot->twsk_prot->twsk_slab =
7e56b5d6 3369 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
6d6ee43e 3370 prot->twsk_prot->twsk_obj_size,
3ab5aee7 3371 0,
e699e2c6 3372 SLAB_ACCOUNT |
52db70dc 3373 prot->slab_flags,
20c2df83 3374 NULL);
6d6ee43e 3375 if (prot->twsk_prot->twsk_slab == NULL)
8feaf0c0
ACM
3376 goto out_free_timewait_sock_slab_name;
3377 }
1da177e4
LT
3378 }
3379
36b77a52 3380 mutex_lock(&proto_list_mutex);
1da177e4 3381 list_add(&prot->node, &proto_list);
13ff3d6f 3382 assign_proto_idx(prot);
36b77a52 3383 mutex_unlock(&proto_list_mutex);
b733c007
PE
3384 return 0;
3385
8feaf0c0 3386out_free_timewait_sock_slab_name:
7e56b5d6 3387 kfree(prot->twsk_prot->twsk_slab_name);
8feaf0c0 3388out_free_request_sock_slab:
0159dfd3
ED
3389 req_prot_cleanup(prot->rsk_prot);
3390
2e6599cb
ACM
3391 kmem_cache_destroy(prot->slab);
3392 prot->slab = NULL;
b733c007
PE
3393out:
3394 return -ENOBUFS;
1da177e4 3395}
1da177e4
LT
3396EXPORT_SYMBOL(proto_register);
3397
3398void proto_unregister(struct proto *prot)
3399{
36b77a52 3400 mutex_lock(&proto_list_mutex);
13ff3d6f 3401 release_proto_idx(prot);
0a3f4358 3402 list_del(&prot->node);
36b77a52 3403 mutex_unlock(&proto_list_mutex);
1da177e4 3404
adf78eda
JL
3405 kmem_cache_destroy(prot->slab);
3406 prot->slab = NULL;
1da177e4 3407
0159dfd3 3408 req_prot_cleanup(prot->rsk_prot);
2e6599cb 3409
6d6ee43e 3410 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
6d6ee43e 3411 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
7e56b5d6 3412 kfree(prot->twsk_prot->twsk_slab_name);
6d6ee43e 3413 prot->twsk_prot->twsk_slab = NULL;
8feaf0c0 3414 }
1da177e4 3415}
1da177e4
LT
3416EXPORT_SYMBOL(proto_unregister);
3417
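/*
 * Illustrative sketch, not part of sock.c: the minimal registration pattern
 * used by protocol modules.  A real protocol embeds struct sock at the start
 * of its own sock structure and points obj_size at that; my_proto and the
 * init/exit hooks are hypothetical.
 */
static struct proto my_proto = {
	.name		= "MYPROTO",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),
};

static int __init my_proto_module_init(void)
{
	/* second argument: allocate a dedicated slab cache for these sockets */
	return proto_register(&my_proto, 1);
}

static void __exit my_proto_module_exit(void)
{
	proto_unregister(&my_proto);
}

module_init(my_proto_module_init);
module_exit(my_proto_module_exit);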
bf2ae2e4
XL
3418int sock_load_diag_module(int family, int protocol)
3419{
3420 if (!protocol) {
3421 if (!sock_is_registered(family))
3422 return -ENOENT;
3423
3424 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3425 NETLINK_SOCK_DIAG, family);
3426 }
3427
3428#ifdef CONFIG_INET
3429 if (family == AF_INET &&
c34c1287 3430 protocol != IPPROTO_RAW &&
bf2ae2e4
XL
3431 !rcu_access_pointer(inet_protos[protocol]))
3432 return -ENOENT;
3433#endif
3434
3435 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3436 NETLINK_SOCK_DIAG, family, protocol);
3437}
3438EXPORT_SYMBOL(sock_load_diag_module);
3439
1da177e4 3440#ifdef CONFIG_PROC_FS
1da177e4 3441static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
36b77a52 3442 __acquires(proto_list_mutex)
1da177e4 3443{
36b77a52 3444 mutex_lock(&proto_list_mutex);
60f0438a 3445 return seq_list_start_head(&proto_list, *pos);
1da177e4
LT
3446}
3447
3448static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3449{
60f0438a 3450 return seq_list_next(v, &proto_list, pos);
1da177e4
LT
3451}
3452
3453static void proto_seq_stop(struct seq_file *seq, void *v)
36b77a52 3454 __releases(proto_list_mutex)
1da177e4 3455{
36b77a52 3456 mutex_unlock(&proto_list_mutex);
1da177e4
LT
3457}
3458
3459static char proto_method_implemented(const void *method)
3460{
3461 return method == NULL ? 'n' : 'y';
3462}
180d8cd9
GC
3463static long sock_prot_memory_allocated(struct proto *proto)
3464{
cb75a36c 3465 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
180d8cd9
GC
3466}
3467
3468static char *sock_prot_memory_pressure(struct proto *proto)
3469{
3470 return proto->memory_pressure != NULL ?
3471 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3472}
1da177e4
LT
3473
3474static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3475{
180d8cd9 3476
8d987e5c 3477 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
1da177e4
LT
3478 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3479 proto->name,
3480 proto->obj_size,
14e943db 3481 sock_prot_inuse_get(seq_file_net(seq), proto),
180d8cd9
GC
3482 sock_prot_memory_allocated(proto),
3483 sock_prot_memory_pressure(proto),
1da177e4
LT
3484 proto->max_header,
3485 proto->slab == NULL ? "no" : "yes",
3486 module_name(proto->owner),
3487 proto_method_implemented(proto->close),
3488 proto_method_implemented(proto->connect),
3489 proto_method_implemented(proto->disconnect),
3490 proto_method_implemented(proto->accept),
3491 proto_method_implemented(proto->ioctl),
3492 proto_method_implemented(proto->init),
3493 proto_method_implemented(proto->destroy),
3494 proto_method_implemented(proto->shutdown),
3495 proto_method_implemented(proto->setsockopt),
3496 proto_method_implemented(proto->getsockopt),
3497 proto_method_implemented(proto->sendmsg),
3498 proto_method_implemented(proto->recvmsg),
3499 proto_method_implemented(proto->sendpage),
3500 proto_method_implemented(proto->bind),
3501 proto_method_implemented(proto->backlog_rcv),
3502 proto_method_implemented(proto->hash),
3503 proto_method_implemented(proto->unhash),
3504 proto_method_implemented(proto->get_port),
3505 proto_method_implemented(proto->enter_memory_pressure));
3506}
3507
3508static int proto_seq_show(struct seq_file *seq, void *v)
3509{
60f0438a 3510 if (v == &proto_list)
1da177e4
LT
3511 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3512 "protocol",
3513 "size",
3514 "sockets",
3515 "memory",
3516 "press",
3517 "maxhdr",
3518 "slab",
3519 "module",
3520 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3521 else
60f0438a 3522 proto_seq_printf(seq, list_entry(v, struct proto, node));
1da177e4
LT
3523 return 0;
3524}
3525
f690808e 3526static const struct seq_operations proto_seq_ops = {
1da177e4
LT
3527 .start = proto_seq_start,
3528 .next = proto_seq_next,
3529 .stop = proto_seq_stop,
3530 .show = proto_seq_show,
3531};
3532
14e943db
ED
3533static __net_init int proto_init_net(struct net *net)
3534{
c3506372
CH
3535 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3536 sizeof(struct seq_net_private)))
14e943db
ED
3537 return -ENOMEM;
3538
3539 return 0;
3540}
3541
3542static __net_exit void proto_exit_net(struct net *net)
3543{
ece31ffd 3544 remove_proc_entry("protocols", net->proc_net);
14e943db
ED
3545}
3546
3547
3548static __net_initdata struct pernet_operations proto_net_ops = {
3549 .init = proto_init_net,
3550 .exit = proto_exit_net,
1da177e4
LT
3551};
3552
3553static int __init proto_init(void)
3554{
14e943db 3555 return register_pernet_subsys(&proto_net_ops);
1da177e4
LT
3556}
3557
3558subsys_initcall(proto_init);
3559
3560#endif /* PROC_FS */
7db6b048
SS
3561
3562#ifdef CONFIG_NET_RX_BUSY_POLL
3563bool sk_busy_loop_end(void *p, unsigned long start_time)
3564{
3565 struct sock *sk = p;
3566
3567 return !skb_queue_empty(&sk->sk_receive_queue) ||
3568 sk_busy_loop_timeout(sk, start_time);
3569}
3570EXPORT_SYMBOL(sk_busy_loop_end);
3571#endif /* CONFIG_NET_RX_BUSY_POLL */