// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink :	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_inuse_add(struct net *net, int val);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test whether the opener of the socket had the capability @cap in the
 * user namespace @user_ns when the socket was created, and whether the
 * current process has it as well.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test whether the opener of the socket had the global capability @cap
 * when the socket was created, and whether the current process has it
 * in the initial user namespace.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test whether the opener of the socket had the capability @cap over the
 * network namespace the socket is a member of when the socket was
 * created, and whether the current process has it as well.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"     ,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"      ,	x "AF_XDP"      , \
  x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements.
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

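/*
 * Usage sketch (hypothetical caller, e.g. a driver backing swap I/O
 * over the network): the two helpers above are meant to be paired
 * around the socket's swap-backing lifetime:
 *
 *	sk_set_memalloc(sk);	// socket may now draw on emergency reserves
 *	...
 *	sk_clear_memalloc(sk);	// last swapfile gone; obey rmem limits again
 */
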
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	noreclaim_flag = memalloc_noreclaim_save();
	ret = sk->sk_backlog_rcv(sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

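/*
 * Explanatory sketch for the noreclaim section above: the protocol's
 * sk_backlog_rcv() may allocate memory, and an allocation that entered
 * direct reclaim could recurse into the very socket that is supposed
 * to make reclaim progress. memalloc_noreclaim_save() (PF_MEMALLOC)
 * forbids reclaim for the section, at the cost of dipping into the
 * emergency reserves.
 */
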
static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
	struct __kernel_sock_timeval tv;
	int size;

	if (timeo == MAX_SCHEDULE_TIMEOUT) {
		tv.tv_sec = 0;
		tv.tv_usec = 0;
	} else {
		tv.tv_sec = timeo / HZ;
		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
	}

	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
		*(struct old_timeval32 *)optval = tv32;
		return sizeof(tv32);
	}

	if (old_timeval) {
		struct __kernel_old_timeval old_tv;
		old_tv.tv_sec = tv.tv_sec;
		old_tv.tv_usec = tv.tv_usec;
		*(struct __kernel_old_timeval *)optval = old_tv;
		size = sizeof(old_tv);
	} else {
		*(struct __kernel_sock_timeval *)optval = tv;
		size = sizeof(tv);
	}

	return size;
}

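/*
 * Worked example for the conversion above (assuming HZ == 250):
 * timeo = 1375 jiffies gives tv_sec = 1375 / 250 = 5 and
 * tv_usec = ((1375 % 250) * USEC_PER_SEC) / 250 = 500000, i.e. 5.5 s.
 * MAX_SCHEDULE_TIMEOUT is reported as {0, 0}, meaning "no timeout".
 */
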
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32;

		if (optlen < sizeof(tv32))
			return -EINVAL;

		if (copy_from_user(&tv32, optval, sizeof(tv32)))
			return -EFAULT;
		tv.tv_sec = tv32.tv_sec;
		tv.tv_usec = tv32.tv_usec;
	} else if (old_timeval) {
		struct __kernel_old_timeval old_tv;

		if (optlen < sizeof(old_tv))
			return -EINVAL;
		if (copy_from_user(&old_tv, optval, sizeof(old_tv)))
			return -EFAULT;
		tv.tv_sec = old_tv.tv_sec;
		tv.tv_usec = old_tv.tv_usec;
	} else {
		if (optlen < sizeof(tv))
			return -EINVAL;
		if (copy_from_user(&tv, optval, sizeof(tv)))
			return -EFAULT;
	}
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
	return 0;
}

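/*
 * Userspace sketch of the path above: installing a 5 second receive
 * timeout through the classic (old_timeval) option.
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * A tv_usec outside [0, USEC_PER_SEC) fails with EDOM; a negative
 * tv_sec is clamped to a zero timeout with a ratelimited warning.
 */
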
static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}

static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}

int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from the rcu protected region, make sure we don't leak
	 * a non-refcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sk_filter(sk, skb);
	if (err)
		return err;

	return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

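/*
 * Caller-side sketch (hypothetical protocol receive path): the queueing
 * helpers above do not consume the skb on failure, so callers typically
 * do something like:
 *
 *	err = sock_queue_rcv_skb(sk, skb);
 *	if (err < 0)
 *		kfree_skb(skb);
 *
 * -ENOMEM means the receive buffer is full; -ENOBUFS means the memory
 * accounting (sk_rmem_schedule) refused the charge. Both bump sk_drops.
 */
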
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		sk->sk_dst_pending_confirm = 0;
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

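/*
 * Illustrative contrast between the two helpers above: __sk_dst_check()
 * expects the socket to be locked and returns the cached dst without
 * taking an extra reference, while sk_dst_check() (via sk_dst_get())
 * grabs its own reference and is safe from lockless contexts. IPv6
 * callers typically pass their dst cookie so a routing change
 * invalidates the cache; IPv4 callers usually pass 0.
 */
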
static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);

	/* Sorry... */
	ret = -EPERM;
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (ifindex < 0)
		goto out;

	sk->sk_bound_dev_if = ifindex;
	if (sk->sk_prot->rehash)
		sk->sk_prot->rehash(sk);
	sk_dst_reset(sk);

	ret = 0;

out:
#endif

	return ret;
}

static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	ret = sock_setbindtodevice_locked(sk, index);
	release_sock(sk);

out:
#endif

	return ret;
}

static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}

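/*
 * Userspace sketch of SO_BINDTODEVICE (needs CAP_NET_RAW):
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", 4);
 *
 * An empty name (or zero length) unbinds the socket, and reading the
 * option back resolves sk_bound_dev_if to a name via netdev_get_name().
 */
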
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

bool sk_mc_loop(struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	switch (sk->sk_family) {
	case AF_INET:
		return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_sk(sk)->mc_loop;
#endif
	}
	WARN_ON(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);

/*
 * This is meant for all protocols to use and covers goings on
 * at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock_txtime sk_txtime;
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 * Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		sk->sk_reuseport = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		sk_dst_reset(sk);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't report an error on this one; BSD doesn't, and if
		 * you think about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		/* Ensure val * 2 fits into an int, to prevent max_t()
		 * from treating it as a negative value.
		 */
		val = min_t(int, val, INT_MAX / 2);
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		/* No negative values (to prevent underflow, as val will be
		 * multiplied by 2).
		 */
		if (val < 0)
			val = 0;
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't report an error on this one; BSD doesn't, and if
		 * you think about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		/* Ensure val * 2 fits into an int, to prevent max_t()
		 * from treating it as a negative value.
		 */
		val = min_t(int, val, INT_MAX / 2);
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead. Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
		break;
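
	/*
	 * Worked example of the doubling above: setsockopt(SO_RCVBUF, 65536)
	 * stores sk_rcvbuf = 131072, and getsockopt(SO_RCVBUF) later reports
	 * 131072; the headroom pays for struct sk_buff and other overhead so
	 * roughly 64 KiB of payload still fits as the application expects.
	 */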

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		/* No negative values (to prevent underflow, as val will be
		 * multiplied by 2).
		 */
		if (val < 0)
			val = 0;
		goto set_rcvbuf;

	case SO_KEEPALIVE:
		if (sk->sk_prot->keepalive)
			sk->sk_prot->keepalive(sk, valbool);
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP_OLD:
	case SO_TIMESTAMP_NEW:
	case SO_TIMESTAMPNS_OLD:
	case SO_TIMESTAMPNS_NEW:
		if (valbool) {
			if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW)
				sock_set_flag(sk, SOCK_TSTAMP_NEW);
			else
				sock_reset_flag(sk, SOCK_TSTAMP_NEW);

			if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			sock_reset_flag(sk, SOCK_TSTAMP_NEW);
		}
		break;

	case SO_TIMESTAMPING_NEW:
		sock_set_flag(sk, SOCK_TSTAMP_NEW);
		/* fall through */
	case SO_TIMESTAMPING_OLD:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}

		if (val & SOF_TIMESTAMPING_OPT_ID &&
		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
			if (sk->sk_protocol == IPPROTO_TCP &&
			    sk->sk_type == SOCK_STREAM) {
				if ((1 << sk->sk_state) &
				    (TCPF_CLOSE | TCPF_LISTEN)) {
					ret = -EINVAL;
					break;
				}
				sk->sk_tskey = tcp_sk(sk)->snd_una;
			} else {
				sk->sk_tskey = 0;
			}
		}

		if (val & SOF_TIMESTAMPING_OPT_STATS &&
		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
			ret = -EINVAL;
			break;
		}

		sk->sk_tsflags = val;
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else {
			if (optname == SO_TIMESTAMPING_NEW)
				sock_reset_flag(sk, SOCK_TSTAMP_NEW);

			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		}
		break;

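	/*
	 * Sketch of the OPT_ID rule above: for an established TCP socket,
	 * sk_tskey starts at snd_una so the identifier reported with each
	 * tx timestamp counts bytes from "now", letting applications match
	 * timestamps to their own writes; for datagram sockets the key
	 * simply counts sendmsg() calls from zero.
	 */
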
	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		if (sock->ops->set_rcvlowat)
			ret = sock->ops->set_rcvlowat(sk, val);
		else
			sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD);
		break;

	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_BPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_attach_bpf(ufd, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_CBPF:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_reuseport_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_EBPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_reuseport_attach_bpf(ufd, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

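	/*
	 * Userspace sketch for the filter cases above: attaching a classic
	 * BPF program that accepts every packet (illustrative only).
	 *
	 *	struct sock_filter code[] = {
	 *		{ BPF_RET | BPF_K, 0, 0, 0xffffffff },
	 *	};
	 *	struct sock_fprog prog = { .len = 1, .filter = code };
	 *
	 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
	 *
	 * SO_ATTACH_BPF instead takes a file descriptor obtained from
	 * bpf(BPF_PROG_LOAD, ...).
	 */
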
	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
			ret = -EPERM;
		} else if (val != sk->sk_mark) {
			sk->sk_mark = val;
			sk_dst_reset(sk);
		}
		break;

	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			ret = sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		/* allow unprivileged users to decrease the value */
		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else {
			if (val < 0)
				ret = -EINVAL;
			else
				sk->sk_ll_usec = val;
		}
		break;
#endif

	case SO_MAX_PACING_RATE:
	{
		unsigned long ulval = (val == ~0U) ? ~0UL : val;

		if (sizeof(ulval) != sizeof(val) &&
		    optlen >= sizeof(ulval) &&
		    get_user(ulval, (unsigned long __user *)optval)) {
			ret = -EFAULT;
			break;
		}
		if (ulval != ~0UL)
			cmpxchg(&sk->sk_pacing_status,
				SK_PACING_NONE,
				SK_PACING_NEEDED);
		sk->sk_max_pacing_rate = ulval;
		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
		break;
	}
	case SO_INCOMING_CPU:
		sk->sk_incoming_cpu = val;
		break;

	case SO_CNX_ADVICE:
		if (val == 1)
			dst_negative_advice(sk);
		break;

	case SO_ZEROCOPY:
		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
			if (!((sk->sk_type == SOCK_STREAM &&
			       sk->sk_protocol == IPPROTO_TCP) ||
			      (sk->sk_type == SOCK_DGRAM &&
			       sk->sk_protocol == IPPROTO_UDP)))
				ret = -ENOTSUPP;
		} else if (sk->sk_family != PF_RDS) {
			ret = -ENOTSUPP;
		}
		if (!ret) {
			if (val < 0 || val > 1)
				ret = -EINVAL;
			else
				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
		}
		break;

	case SO_TXTIME:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
			ret = -EPERM;
		} else if (optlen != sizeof(struct sock_txtime)) {
			ret = -EINVAL;
		} else if (copy_from_user(&sk_txtime, optval,
			   sizeof(struct sock_txtime))) {
			ret = -EFAULT;
		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
			ret = -EINVAL;
		} else {
			sock_valbool_flag(sk, SOCK_TXTIME, true);
			sk->sk_clockid = sk_txtime.clockid;
			sk->sk_txtime_deadline_mode =
				!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
			sk->sk_txtime_report_errors =
				!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
		}
		break;

	case SO_BINDTOIFINDEX:
		ret = sock_setbindtodevice_locked(sk, val);
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);

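/*
 * Userspace view of the handler above (illustrative): most SOL_SOCKET
 * options take an int, e.g.
 *
 *	int one = 1;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
 *
 * Option buffers shorter than sizeof(int) are rejected with EINVAL
 * before the switch is ever reached.
 */
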
static void cred_to_ucred(struct pid *pid, const struct cred *cred,
			  struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}

static int groups_to_user(gid_t __user *dst, const struct group_info *src)
{
	struct user_namespace *user_ns = current_user_ns();
	int i;

	for (i = 0; i < src->ngroups; i++)
		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
			return -EFAULT;

	return 0;
}

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		u64 val64;
		unsigned long ulval;
		struct linger ling;
		struct old_timeval32 tm32;
		struct __kernel_old_timeval tm;
		struct __kernel_sock_timeval stm;
		struct sock_txtime txtime;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv = sizeof(v.ling);
		v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger = sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMP_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPNS_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPING_OLD:
		v.val = sk->sk_tsflags;
		break;

	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
		break;

	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERGROUPS:
	{
		int ret, n;

		if (!sk->sk_peer_cred)
			return -ENODATA;

		n = sk->sk_peer_cred->group_info->ngroups;
		if (len < n * sizeof(gid_t)) {
			len = n * sizeof(gid_t);
			return put_user(len, optlen) ? -EFAULT : -ERANGE;
		}
		len = n * sizeof(gid_t);

		ret = groups_to_user((gid_t __user *)optval,
				     sk->sk_peer_cred->group_info);
		if (ret)
			return ret;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
		if (lv < 0)
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
#endif

	case SO_MAX_PACING_RATE:
		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
			lv = sizeof(v.ulval);
			v.ulval = sk->sk_max_pacing_rate;
		} else {
			/* 32bit version */
			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
		}
		break;

	case SO_INCOMING_CPU:
		v.val = sk->sk_incoming_cpu;
		break;

	case SO_MEMINFO:
	{
		u32 meminfo[SK_MEMINFO_VARS];

		if (get_user(len, optlen))
			return -EFAULT;

		sk_get_meminfo(sk, meminfo);

		len = min_t(unsigned int, len, sizeof(meminfo));
		if (copy_to_user(optval, &meminfo, len))
			return -EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (v.val < MIN_NAPI_ID)
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		lv = sizeof(u64);
		if (len < lv)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	case SO_ZEROCOPY:
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
		break;

	case SO_TXTIME:
		lv = sizeof(v.txtime);
		v.txtime.clockid = sk->sk_clockid;
		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
				  SOF_TXTIME_DEADLINE_MODE : 0;
		v.txtime.flags |= sk->sk_txtime_report_errors ?
				  SOF_TXTIME_REPORT_ERRORS : 0;
		break;

	case SO_BINDTOIFINDEX:
		v.val = sk->sk_bound_dev_if;
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}

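/*
 * Userspace sketch of the getsockopt() side (illustrative):
 *
 *	int type;
 *	socklen_t len = sizeof(type);
 *
 *	getsockopt(fd, SOL_SOCKET, SO_TYPE, &type, &len);
 *
 * The kernel truncates its answer to the caller's len and then writes
 * back the length actually used, which is why optlen is value-result.
 */
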
/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	if (sk->sk_kern_sock)
		sock_lock_init_class_and_name(
			sk,
			af_family_kern_slock_key_strings[sk->sk_family],
			af_family_kern_slock_keys + sk->sk_family,
			af_family_kern_key_strings[sk->sk_family],
			af_family_kern_keys + sk->sk_family);
	else
		sock_lock_init_class_and_name(
			sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as-is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

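/*
 * Layout assumed by the two memcpy() calls above: the window between
 * sk_dontcopy_begin and sk_dontcopy_end (sk_node, sk_refcnt, ...) is
 * deliberately skipped because concurrent RCU lookups may be touching
 * those fields; everything before and after that window is copied.
 */
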
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
				  int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO)
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	cgroup_sk_free(&sk->sk_cgrp_data);
	mem_cgroup_sk_free(sk);
	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 *	@kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sk->sk_kern_sock = kern;
		sock_lock_init(sk);
		sk->sk_net_refcnt = kern ? 0 : 1;
		if (likely(sk->sk_net_refcnt)) {
			get_net(net);
			sock_inuse_add(net, 1);
		}

		sock_net_set(sk, net);
		refcount_set(&sk->sk_wmem_alloc, 1);

		mem_cgroup_sk_alloc(sk);
		cgroup_sk_alloc(&sk->sk_cgrp_data);
		sock_update_classid(&sk->sk_cgrp_data);
		sock_update_netprioidx(&sk->sk_cgrp_data);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);

/* Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 */
static void __sk_destruct(struct rcu_head *head)
{
	struct sock *sk = container_of(head, struct sock, sk_rcu);
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       refcount_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}
	if (rcu_access_pointer(sk->sk_reuseport_cb))
		reuseport_detach_sock(sk);

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

#ifdef CONFIG_BPF_SYSCALL
	bpf_sk_storage_free(sk);
#endif

	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	if (likely(sk->sk_net_refcnt))
		put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_destruct(struct sock *sk)
{
	if (sock_flag(sk, SOCK_RCU_FREE))
		call_rcu(&sk->sk_rcu, __sk_destruct);
	else
		__sk_destruct(&sk->sk_rcu);
}

static void __sk_free(struct sock *sk)
{
	if (likely(sk->sk_net_refcnt))
		sock_inuse_add(sock_net(sk), -1);

	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
		sock_diag_broadcast_destroy(sk);
	else
		sk_destruct(sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can tell whether some
	 * packets are still in some tx queue.
	 * If the count does not drop to zero, sock_wfree() will call
	 * __sk_free(sk) later, once the last in-flight packet is freed.
	 */
	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

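/*
 * Lifecycle sketch: sk_alloc() seeds sk_wmem_alloc with 1, each queued
 * tx skb adds its truesize and sock_wfree() subtracts it again, and
 * sk_free() drops the initial unit. The socket is therefore destroyed
 * by whichever of sk_free() or the final sock_wfree() runs last.
 */
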
static void sk_init_common(struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
				   af_rlock_keys + sk->sk_family,
				   af_family_rlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
				   af_wlock_keys + sk->sk_family,
				   af_family_wlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
				   af_elock_keys + sk->sk_family,
				   af_family_elock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
				   af_callback_keys + sk->sk_family,
				   af_family_clock_key_strings[sk->sk_family]);
}

e56c57d0
ED
1780/**
1781 * sk_clone_lock - clone a socket, and lock its clone
1782 * @sk: the socket to clone
1783 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1784 *
1785 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1786 */
1787struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
87d11ceb 1788{
8fd1d178 1789 struct sock *newsk;
278571ba 1790 bool is_charged = true;
87d11ceb 1791
8fd1d178 1792 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
87d11ceb
ACM
1793 if (newsk != NULL) {
1794 struct sk_filter *filter;
1795
892c141e 1796 sock_copy(newsk, sk);
87d11ceb 1797
9d538fa6
CP
1798 newsk->sk_prot_creator = sk->sk_prot;
1799
87d11ceb 1800 /* SANITY */
8a681736
SV
1801 if (likely(newsk->sk_net_refcnt))
1802 get_net(sock_net(newsk));
87d11ceb
ACM
1803 sk_node_init(&newsk->sk_node);
1804 sock_lock_init(newsk);
1805 bh_lock_sock(newsk);
fa438ccf 1806 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
8eae939f 1807 newsk->sk_backlog.len = 0;
87d11ceb
ACM
1808
1809 atomic_set(&newsk->sk_rmem_alloc, 0);
2b85a34e
ED
1810 /*
1811 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1812 */
14afee4b 1813 refcount_set(&newsk->sk_wmem_alloc, 1);
87d11ceb 1814 atomic_set(&newsk->sk_omem_alloc, 0);
581319c5 1815 sk_init_common(newsk);
87d11ceb
ACM
1816
1817 newsk->sk_dst_cache = NULL;
9b8805a3 1818 newsk->sk_dst_pending_confirm = 0;
87d11ceb
ACM
1819 newsk->sk_wmem_queued = 0;
1820 newsk->sk_forward_alloc = 0;
9caad864 1821 atomic_set(&newsk->sk_drops, 0);
87d11ceb 1822 newsk->sk_send_head = NULL;
87d11ceb 1823 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
52267790 1824 atomic_set(&newsk->sk_zckey, 0);
87d11ceb
ACM
1825
1826 sock_reset_flag(newsk, SOCK_DONE);
edbe69ef 1827 mem_cgroup_sk_alloc(newsk);
c0576e39 1828 cgroup_sk_alloc(&newsk->sk_cgrp_data);
87d11ceb 1829
eefca20e
ED
1830 rcu_read_lock();
1831 filter = rcu_dereference(sk->sk_filter);
87d11ceb 1832 if (filter != NULL)
278571ba
AS
1833 /* though it's an empty new sock, the charging may fail
1834 * if sysctl_optmem_max was changed between creation of
1835 * original socket and cloning
1836 */
1837 is_charged = sk_filter_charge(newsk, filter);
eefca20e
ED
1838 RCU_INIT_POINTER(newsk->sk_filter, filter);
1839 rcu_read_unlock();
87d11ceb 1840
d188ba86 1841 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
a97e50cc
DB
1842 /* We need to make sure that we don't uncharge the new
1843 * socket if we couldn't charge it in the first place
1844 * as otherwise we uncharge the parent's filter.
1845 */
1846 if (!is_charged)
1847 RCU_INIT_POINTER(newsk->sk_filter, NULL);
94352d45 1848 sk_free_unlock_clone(newsk);
87d11ceb
ACM
1849 newsk = NULL;
1850 goto out;
1851 }
fa463497 1852 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
87d11ceb
ACM
1853
1854 newsk->sk_err = 0;
e551c32d 1855 newsk->sk_err_soft = 0;
87d11ceb 1856 newsk->sk_priority = 0;
2c8c56e1 1857 newsk->sk_incoming_cpu = raw_smp_processor_id();
648845ab
TZ
1858 if (likely(newsk->sk_net_refcnt))
1859 sock_inuse_add(sock_net(newsk), 1);
d979a39d 1860
4dc6dc71
ED
1861 /*
1862 * Before updating sk_refcnt, we must commit prior changes to memory
1863 * (Documentation/RCU/rculist_nulls.txt for details)
1864 */
1865 smp_wmb();
41c6d650 1866 refcount_set(&newsk->sk_refcnt, 2);
87d11ceb
ACM
1867
1868 /*
1869 * Increment the counter in the same struct proto as the master
1870 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1871 * is the same as sk->sk_prot->socks, as this field was copied
1872 * with memcpy).
1873 *
1874 * This _changes_ the previous behaviour, where
1875 * tcp_create_openreq_child always incremented the
1876 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1877 * to be taken into account in all callers. -acme
1878 */
1879 sk_refcnt_debug_inc(newsk);
972692e0 1880 sk_set_socket(newsk, NULL);
c2f26e8f 1881 RCU_INIT_POINTER(newsk->sk_wq, NULL);
87d11ceb
ACM
1882
1883 if (newsk->sk_prot->sockets_allocated)
180d8cd9 1884 sk_sockets_allocated_inc(newsk);
704da560 1885
080a270f
HFS
1886 if (sock_needs_netstamp(sk) &&
1887 newsk->sk_flags & SK_FLAGS_TIMESTAMP)
704da560 1888 net_enable_timestamp();
87d11ceb
ACM
1889 }
1890out:
1891 return newsk;
1892}
e56c57d0 1893EXPORT_SYMBOL_GPL(sk_clone_lock);
87d11ceb 1894
94352d45
ACM
1895void sk_free_unlock_clone(struct sock *sk)
1896{
1897 /* It is still a raw copy of the parent, so invalidate
1898 * the destructor and do a plain sk_free() */
1899 sk->sk_destruct = NULL;
1900 bh_unlock_sock(sk);
1901 sk_free(sk);
1902}
1903EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
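/* Illustrative sketch, not part of this file: a caller of
 * sk_clone_lock() must bh_unlock_sock() the clone on every path,
 * per the kerneldoc above. my_proto_setup() is a hypothetical
 * protocol-specific hook.
 */
static struct sock *example_clone(const struct sock *sk)
{
	struct sock *newsk = sk_clone_lock(sk, GFP_ATOMIC);

	if (!newsk)
		return NULL;
	if (my_proto_setup(newsk) < 0) {
		/* still a locked raw copy: use the helper above */
		sk_free_unlock_clone(newsk);
		return NULL;
	}
	bh_unlock_sock(newsk);
	return newsk;
}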
1904
9958089a
AK
1905void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1906{
d6a4e26a
ED
1907 u32 max_segs = 1;
1908
6bd4f355 1909 sk_dst_set(sk, dst);
0a6b2a1d 1910 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
9958089a 1911 if (sk->sk_route_caps & NETIF_F_GSO)
4fcd6b99 1912 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
a465419b 1913 sk->sk_route_caps &= ~sk->sk_route_nocaps;
9958089a 1914 if (sk_can_gso(sk)) {
f70f250a 1915 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
9958089a 1916 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
82cc1a7a 1917 } else {
9958089a 1918 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
82cc1a7a 1919 sk->sk_gso_max_size = dst->dev->gso_max_size;
d6a4e26a 1920 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
82cc1a7a 1921 }
9958089a 1922 }
d6a4e26a 1923 sk->sk_gso_max_segs = max_segs;
9958089a
AK
1924}
1925EXPORT_SYMBOL_GPL(sk_setup_caps);
1926
1da177e4
LT
1927/*
1928 * Simple resource managers for sockets.
1929 */
1930
1931
4ec93edb
YH
1932/*
1933 * Write buffer destructor automatically called from kfree_skb.
1da177e4
LT
1934 */
1935void sock_wfree(struct sk_buff *skb)
1936{
1937 struct sock *sk = skb->sk;
d99927f4 1938 unsigned int len = skb->truesize;
1da177e4 1939
d99927f4
ED
1940 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1941 /*
1942 * Keep a reference on sk_wmem_alloc, this will be released
1943 * after sk_write_space() call
1944 */
14afee4b 1945 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1da177e4 1946 sk->sk_write_space(sk);
d99927f4
ED
1947 len = 1;
1948 }
2b85a34e 1949 /*
d99927f4
ED
1950 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1951 * could not do because of in-flight packets
2b85a34e 1952 */
14afee4b 1953 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2b85a34e 1954 __sk_free(sk);
1da177e4 1955}
2a91525c 1956EXPORT_SYMBOL(sock_wfree);
1da177e4 1957
1d2077ac
ED
1958/* This variant of sock_wfree() is used by TCP,
1959 * since it sets SOCK_USE_WRITE_QUEUE.
1960 */
1961void __sock_wfree(struct sk_buff *skb)
1962{
1963 struct sock *sk = skb->sk;
1964
14afee4b 1965 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1d2077ac
ED
1966 __sk_free(sk);
1967}
1968
9e17f8a4
ED
1969void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1970{
1971 skb_orphan(skb);
1972 skb->sk = sk;
1973#ifdef CONFIG_INET
1974 if (unlikely(!sk_fullsock(sk))) {
1975 skb->destructor = sock_edemux;
1976 sock_hold(sk);
1977 return;
1978 }
1979#endif
1980 skb->destructor = sock_wfree;
1981 skb_set_hash_from_sk(skb, sk);
1982 /*
1983 * We used to take a refcount on sk, but the following operation
1984 * is enough to guarantee sk_free() won't free this sock until
1985 * all in-flight packets have completed
1986 */
14afee4b 1987 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
9e17f8a4
ED
1988}
1989EXPORT_SYMBOL(skb_set_owner_w);
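/* Illustrative sketch, not part of this file: how the write-side
 * accounting fits together. sk_wmem_alloc starts biased at one,
 * skb_set_owner_w() adds each skb's truesize, and sock_wfree()
 * subtracts it again on kfree_skb(); whichever of sk_free() and
 * the last in-flight skb drops the count to zero runs __sk_free().
 */
static void example_charge_and_free(struct sock *sk, unsigned int size)
{
	struct sk_buff *skb = alloc_skb(size, GFP_KERNEL);

	if (!skb)
		return;
	skb_set_owner_w(skb, sk);	/* sk_wmem_alloc += skb->truesize */
	/* ... hand skb to the transmit path ... */
	kfree_skb(skb);			/* sock_wfree() uncharges and may
					 * call __sk_free() if sk_free()
					 * already dropped the initial one */
}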
1990
1d2077ac
ED
1991/* This helper is used by netem, as it can hold packets in its
1992 * delay queue. We want to allow the owner socket to send more
1993 * packets, as if they were already TX completed by a typical driver.
1994 * But we also want to keep skb->sk set because some packet schedulers
f6ba8d33 1995 * rely on it (sch_fq for example).
1d2077ac 1996 */
f2f872f9
ED
1997void skb_orphan_partial(struct sk_buff *skb)
1998{
f6ba8d33 1999 if (skb_is_tcp_pure_ack(skb))
1d2077ac
ED
2000 return;
2001
f2f872f9
ED
2002 if (skb->destructor == sock_wfree
2003#ifdef CONFIG_INET
2004 || skb->destructor == tcp_wfree
2005#endif
2006 ) {
f6ba8d33
ED
2007 struct sock *sk = skb->sk;
2008
41c6d650 2009 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
14afee4b 2010 WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
f6ba8d33
ED
2011 skb->destructor = sock_efree;
2012 }
f2f872f9
ED
2013 } else {
2014 skb_orphan(skb);
2015 }
2016}
2017EXPORT_SYMBOL(skb_orphan_partial);
2018
4ec93edb
YH
2019/*
2020 * Read buffer destructor automatically called from kfree_skb.
1da177e4
LT
2021 */
2022void sock_rfree(struct sk_buff *skb)
2023{
2024 struct sock *sk = skb->sk;
d361fd59 2025 unsigned int len = skb->truesize;
1da177e4 2026
d361fd59
ED
2027 atomic_sub(len, &sk->sk_rmem_alloc);
2028 sk_mem_uncharge(sk, len);
1da177e4 2029}
2a91525c 2030EXPORT_SYMBOL(sock_rfree);
1da177e4 2031
7768eed8
OH
2032/*
2033 * Buffer destructor for skbs that are not used directly in read or write
2034 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2035 */
62bccb8c
AD
2036void sock_efree(struct sk_buff *skb)
2037{
2038 sock_put(skb->sk);
2039}
2040EXPORT_SYMBOL(sock_efree);
2041
976d0201 2042kuid_t sock_i_uid(struct sock *sk)
1da177e4 2043{
976d0201 2044 kuid_t uid;
1da177e4 2045
f064af1e 2046 read_lock_bh(&sk->sk_callback_lock);
976d0201 2047 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
f064af1e 2048 read_unlock_bh(&sk->sk_callback_lock);
1da177e4
LT
2049 return uid;
2050}
2a91525c 2051EXPORT_SYMBOL(sock_i_uid);
1da177e4
LT
2052
2053unsigned long sock_i_ino(struct sock *sk)
2054{
2055 unsigned long ino;
2056
f064af1e 2057 read_lock_bh(&sk->sk_callback_lock);
1da177e4 2058 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
f064af1e 2059 read_unlock_bh(&sk->sk_callback_lock);
1da177e4
LT
2060 return ino;
2061}
2a91525c 2062EXPORT_SYMBOL(sock_i_ino);
1da177e4
LT
2063
2064/*
2065 * Allocate a skb from the socket's send buffer.
2066 */
86a76caf 2067struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
dd0fc66f 2068 gfp_t priority)
1da177e4 2069{
14afee4b 2070 if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
2a91525c 2071 struct sk_buff *skb = alloc_skb(size, priority);
1da177e4
LT
2072 if (skb) {
2073 skb_set_owner_w(skb, sk);
2074 return skb;
2075 }
2076 }
2077 return NULL;
2078}
2a91525c 2079EXPORT_SYMBOL(sock_wmalloc);
1da177e4 2080
98ba0bd5
WB
2081static void sock_ofree(struct sk_buff *skb)
2082{
2083 struct sock *sk = skb->sk;
2084
2085 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2086}
2087
2088struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2089 gfp_t priority)
2090{
2091 struct sk_buff *skb;
2092
2093 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2094 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2095 sysctl_optmem_max)
2096 return NULL;
2097
2098 skb = alloc_skb(size, priority);
2099 if (!skb)
2100 return NULL;
2101
2102 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2103 skb->sk = sk;
2104 skb->destructor = sock_ofree;
2105 return skb;
2106}
2107
4ec93edb 2108/*
1da177e4 2109 * Allocate a memory block from the socket's option memory buffer.
4ec93edb 2110 */
dd0fc66f 2111void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1da177e4 2112{
95c96174 2113 if ((unsigned int)size <= sysctl_optmem_max &&
1da177e4
LT
2114 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2115 void *mem;
2116 /* First do the add, to avoid the race if kmalloc
4ec93edb 2117 * might sleep.
1da177e4
LT
2118 */
2119 atomic_add(size, &sk->sk_omem_alloc);
2120 mem = kmalloc(size, priority);
2121 if (mem)
2122 return mem;
2123 atomic_sub(size, &sk->sk_omem_alloc);
2124 }
2125 return NULL;
2126}
2a91525c 2127EXPORT_SYMBOL(sock_kmalloc);
1da177e4 2128
79e88659
DB
2129/* Free an option memory block. Note, we actually want the inline
2130 * here as this allows gcc to detect the nullify and fold away the
2131 * condition entirely.
1da177e4 2132 */
79e88659
DB
2133static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2134 const bool nullify)
1da177e4 2135{
e53da5fb
DM
2136 if (WARN_ON_ONCE(!mem))
2137 return;
79e88659
DB
2138 if (nullify)
2139 kzfree(mem);
2140 else
2141 kfree(mem);
1da177e4
LT
2142 atomic_sub(size, &sk->sk_omem_alloc);
2143}
79e88659
DB
2144
2145void sock_kfree_s(struct sock *sk, void *mem, int size)
2146{
2147 __sock_kfree_s(sk, mem, size, false);
2148}
2a91525c 2149EXPORT_SYMBOL(sock_kfree_s);
1da177e4 2150
79e88659
DB
2151void sock_kzfree_s(struct sock *sk, void *mem, int size)
2152{
2153 __sock_kfree_s(sk, mem, size, true);
2154}
2155EXPORT_SYMBOL(sock_kzfree_s);
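/* Illustrative sketch, not part of this file: every sock_kmalloc()
 * must be matched by sock_kfree_s() or sock_kzfree_s() with the
 * same size so that sk_omem_alloc stays balanced.
 */
static int example_option(struct sock *sk, int len)
{
	void *opt = sock_kmalloc(sk, len, GFP_KERNEL);

	if (!opt)
		return -ENOBUFS;
	/* ... fill and use the option block ... */
	sock_kzfree_s(sk, opt, len);	/* zeroes, frees and uncharges */
	return 0;
}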
2156
1da177e4
LT
2157/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2158 I think these locks should be removed for datagram sockets.
2159 */
2a91525c 2160static long sock_wait_for_wmem(struct sock *sk, long timeo)
1da177e4
LT
2161{
2162 DEFINE_WAIT(wait);
2163
9cd3e072 2164 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1da177e4
LT
2165 for (;;) {
2166 if (!timeo)
2167 break;
2168 if (signal_pending(current))
2169 break;
2170 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
aa395145 2171 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
14afee4b 2172 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1da177e4
LT
2173 break;
2174 if (sk->sk_shutdown & SEND_SHUTDOWN)
2175 break;
2176 if (sk->sk_err)
2177 break;
2178 timeo = schedule_timeout(timeo);
2179 }
aa395145 2180 finish_wait(sk_sleep(sk), &wait);
1da177e4
LT
2181 return timeo;
2182}
2183
2184
2185/*
2186 * Generic send/receive buffer handlers
2187 */
2188
4cc7f68d
HX
2189struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2190 unsigned long data_len, int noblock,
28d64271 2191 int *errcode, int max_page_order)
1da177e4 2192{
2e4e4410 2193 struct sk_buff *skb;
1da177e4
LT
2194 long timeo;
2195 int err;
2196
1da177e4 2197 timeo = sock_sndtimeo(sk, noblock);
2e4e4410 2198 for (;;) {
1da177e4
LT
2199 err = sock_error(sk);
2200 if (err != 0)
2201 goto failure;
2202
2203 err = -EPIPE;
2204 if (sk->sk_shutdown & SEND_SHUTDOWN)
2205 goto failure;
2206
2e4e4410
ED
2207 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2208 break;
28d64271 2209
9cd3e072 2210 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2e4e4410
ED
2211 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2212 err = -EAGAIN;
2213 if (!timeo)
1da177e4 2214 goto failure;
2e4e4410
ED
2215 if (signal_pending(current))
2216 goto interrupted;
2217 timeo = sock_wait_for_wmem(sk, timeo);
1da177e4 2218 }
2e4e4410
ED
2219 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2220 errcode, sk->sk_allocation);
2221 if (skb)
2222 skb_set_owner_w(skb, sk);
1da177e4
LT
2223 return skb;
2224
2225interrupted:
2226 err = sock_intr_errno(timeo);
2227failure:
2228 *errcode = err;
2229 return NULL;
2230}
4cc7f68d 2231EXPORT_SYMBOL(sock_alloc_send_pskb);
1da177e4 2232
4ec93edb 2233struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1da177e4
LT
2234 int noblock, int *errcode)
2235{
28d64271 2236 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1da177e4 2237}
2a91525c 2238EXPORT_SYMBOL(sock_alloc_send_skb);
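/* Illustrative sketch, not part of this file: a typical datagram
 * sendmsg() built on sock_alloc_send_skb(), which blocks up to the
 * send timeout and returns an skb already charged to the socket.
 */
static int example_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	int err;
	struct sk_buff *skb = sock_alloc_send_skb(sk, len,
						  msg->msg_flags & MSG_DONTWAIT,
						  &err);

	if (!skb)
		return err;	/* -EAGAIN, -EPIPE, pending sk_err or signal */
	err = memcpy_from_msg(skb_put(skb, len), msg, len);
	if (err) {
		kfree_skb(skb);
		return err;
	}
	/* ... hand skb to the transmit path ... */
	return len;
}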
1da177e4 2239
39771b12
WB
2240int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2241 struct sockcm_cookie *sockc)
2242{
3dd17e63
SHY
2243 u32 tsflags;
2244
39771b12
WB
2245 switch (cmsg->cmsg_type) {
2246 case SO_MARK:
2247 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2248 return -EPERM;
2249 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2250 return -EINVAL;
2251 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2252 break;
7f1bc6e9 2253 case SO_TIMESTAMPING_OLD:
3dd17e63
SHY
2254 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2255 return -EINVAL;
2256
2257 tsflags = *(u32 *)CMSG_DATA(cmsg);
2258 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2259 return -EINVAL;
2260
2261 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2262 sockc->tsflags |= tsflags;
2263 break;
80b14dee
RC
2264 case SCM_TXTIME:
2265 if (!sock_flag(sk, SOCK_TXTIME))
2266 return -EINVAL;
2267 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2268 return -EINVAL;
2269 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2270 break;
779f1ede
SHY
2271 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2272 case SCM_RIGHTS:
2273 case SCM_CREDENTIALS:
2274 break;
39771b12
WB
2275 default:
2276 return -EINVAL;
2277 }
2278 return 0;
2279}
2280EXPORT_SYMBOL(__sock_cmsg_send);
2281
f28ea365
EJ
2282int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2283 struct sockcm_cookie *sockc)
2284{
2285 struct cmsghdr *cmsg;
39771b12 2286 int ret;
f28ea365
EJ
2287
2288 for_each_cmsghdr(cmsg, msg) {
2289 if (!CMSG_OK(msg, cmsg))
2290 return -EINVAL;
2291 if (cmsg->cmsg_level != SOL_SOCKET)
2292 continue;
39771b12
WB
2293 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2294 if (ret)
2295 return ret;
f28ea365
EJ
2296 }
2297 return 0;
2298}
2299EXPORT_SYMBOL(sock_cmsg_send);
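/* Illustrative sketch, not part of this file: a sendmsg()
 * implementation harvesting SOL_SOCKET ancillary data into a
 * sockcm_cookie before building packets. The initialisation shown
 * is an assumption about the caller.
 */
static int example_cmsg(struct sock *sk, struct msghdr *msg)
{
	struct sockcm_cookie sockc = { .tsflags = sk->sk_tsflags };
	int err;

	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (err)
			return err;
	}
	/* sockc.mark, sockc.tsflags and sockc.transmit_time now apply
	 * to the packets built for this call.
	 */
	return 0;
}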
2300
06044751
ED
2301static void sk_enter_memory_pressure(struct sock *sk)
2302{
2303 if (!sk->sk_prot->enter_memory_pressure)
2304 return;
2305
2306 sk->sk_prot->enter_memory_pressure(sk);
2307}
2308
2309static void sk_leave_memory_pressure(struct sock *sk)
2310{
2311 if (sk->sk_prot->leave_memory_pressure) {
2312 sk->sk_prot->leave_memory_pressure(sk);
2313 } else {
2314 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2315
2316 if (memory_pressure && *memory_pressure)
2317 *memory_pressure = 0;
2318 }
2319}
2320
5640f768
ED
2321/* On 32bit arches, an skb frag is limited to 2^15 */
2322#define SKB_FRAG_PAGE_ORDER get_order(32768)
2323
400dfd3a
ED
2324/**
2325 * skb_page_frag_refill - check that a page_frag contains enough room
2326 * @sz: minimum size of the fragment we want to get
2327 * @pfrag: pointer to page_frag
82d5e2b8 2328 * @gfp: priority for memory allocation
400dfd3a
ED
2329 *
2330 * Note: While this allocator tries to use high order pages, there is
2331 * no guarantee that allocations succeed. Therefore, @sz MUST be
2332 * less than or equal to PAGE_SIZE.
2333 */
d9b2938a 2334bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
5640f768 2335{
5640f768 2336 if (pfrag->page) {
fe896d18 2337 if (page_ref_count(pfrag->page) == 1) {
5640f768
ED
2338 pfrag->offset = 0;
2339 return true;
2340 }
400dfd3a 2341 if (pfrag->offset + sz <= pfrag->size)
5640f768
ED
2342 return true;
2343 put_page(pfrag->page);
2344 }
2345
d9b2938a
ED
2346 pfrag->offset = 0;
2347 if (SKB_FRAG_PAGE_ORDER) {
d0164adc
MG
2348 /* Avoid direct reclaim but allow kswapd to wake */
2349 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2350 __GFP_COMP | __GFP_NOWARN |
2351 __GFP_NORETRY,
d9b2938a 2352 SKB_FRAG_PAGE_ORDER);
5640f768 2353 if (likely(pfrag->page)) {
d9b2938a 2354 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
5640f768
ED
2355 return true;
2356 }
d9b2938a
ED
2357 }
2358 pfrag->page = alloc_page(gfp);
2359 if (likely(pfrag->page)) {
2360 pfrag->size = PAGE_SIZE;
2361 return true;
2362 }
400dfd3a
ED
2363 return false;
2364}
2365EXPORT_SYMBOL(skb_page_frag_refill);
2366
2367bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2368{
2369 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2370 return true;
2371
5640f768
ED
2372 sk_enter_memory_pressure(sk);
2373 sk_stream_moderate_sndbuf(sk);
2374 return false;
2375}
2376EXPORT_SYMBOL(sk_page_frag_refill);
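/* Illustrative sketch, not part of this file: appending user data
 * to the per-socket page fragment, the pattern sk_page_frag_refill()
 * exists to serve.
 */
static int example_frag_copy(struct sock *sk, struct iov_iter *from, int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;	/* caller should wait for memory */

	copy = min_t(int, copy, pfrag->size - pfrag->offset);
	if (copy_page_from_iter(pfrag->page, pfrag->offset, copy, from) != copy)
		return -EFAULT;
	pfrag->offset += copy;
	return copy;
}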
2377
1da177e4 2378static void __lock_sock(struct sock *sk)
f39234d6
NK
2379 __releases(&sk->sk_lock.slock)
2380 __acquires(&sk->sk_lock.slock)
1da177e4
LT
2381{
2382 DEFINE_WAIT(wait);
2383
e71a4783 2384 for (;;) {
1da177e4
LT
2385 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2386 TASK_UNINTERRUPTIBLE);
2387 spin_unlock_bh(&sk->sk_lock.slock);
2388 schedule();
2389 spin_lock_bh(&sk->sk_lock.slock);
e71a4783 2390 if (!sock_owned_by_user(sk))
1da177e4
LT
2391 break;
2392 }
2393 finish_wait(&sk->sk_lock.wq, &wait);
2394}
2395
8873c064 2396void __release_sock(struct sock *sk)
f39234d6
NK
2397 __releases(&sk->sk_lock.slock)
2398 __acquires(&sk->sk_lock.slock)
1da177e4 2399{
5413d1ba 2400 struct sk_buff *skb, *next;
1da177e4 2401
5413d1ba 2402 while ((skb = sk->sk_backlog.head) != NULL) {
1da177e4 2403 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1da177e4 2404
5413d1ba 2405 spin_unlock_bh(&sk->sk_lock.slock);
1da177e4 2406
5413d1ba
ED
2407 do {
2408 next = skb->next;
e4cbb02a 2409 prefetch(next);
7fee226a 2410 WARN_ON_ONCE(skb_dst_is_noref(skb));
a8305bff 2411 skb_mark_not_on_list(skb);
c57943a1 2412 sk_backlog_rcv(sk, skb);
1da177e4 2413
5413d1ba 2414 cond_resched();
1da177e4
LT
2415
2416 skb = next;
2417 } while (skb != NULL);
2418
5413d1ba
ED
2419 spin_lock_bh(&sk->sk_lock.slock);
2420 }
8eae939f
ZY
2421
2422 /*
2423 * Doing the zeroing here guarantees we cannot loop forever
2424 * while a wild producer attempts to flood us.
2425 */
2426 sk->sk_backlog.len = 0;
1da177e4
LT
2427}
2428
d41a69f1
ED
2429void __sk_flush_backlog(struct sock *sk)
2430{
2431 spin_lock_bh(&sk->sk_lock.slock);
2432 __release_sock(sk);
2433 spin_unlock_bh(&sk->sk_lock.slock);
2434}
2435
1da177e4
LT
2436/**
2437 * sk_wait_data - wait for data to arrive at sk_receive_queue
4dc3b16b
PP
2438 * @sk: sock to wait on
2439 * @timeo: for how long
dfbafc99 2440 * @skb: last skb seen on sk_receive_queue
1da177e4
LT
2441 *
2442 * Now socket state including sk->sk_err is changed only under lock,
2443 * hence we may omit checks after joining the wait queue.
2444 * We check the receive queue before schedule() only as an optimization;
2445 * it is very likely that release_sock() added new data.
2446 */
dfbafc99 2447int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
1da177e4 2448{
d9dc8b0f 2449 DEFINE_WAIT_FUNC(wait, woken_wake_function);
1da177e4 2450 int rc;
1da177e4 2451
d9dc8b0f 2452 add_wait_queue(sk_sleep(sk), &wait);
9cd3e072 2453 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
d9dc8b0f 2454 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
9cd3e072 2455 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
d9dc8b0f 2456 remove_wait_queue(sk_sleep(sk), &wait);
1da177e4
LT
2457 return rc;
2458}
1da177e4
LT
2459EXPORT_SYMBOL(sk_wait_data);
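/* Illustrative sketch, not part of this file: a blocking receive
 * loop built on sk_wait_data(). The caller holds the socket lock;
 * sk_wait_data() releases and retakes it around the sleep.
 */
static struct sk_buff *example_wait(struct sock *sk, int noblock, int *err)
{
	long timeo = sock_rcvtimeo(sk, noblock);
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (!timeo) {
			*err = -EAGAIN;
			return NULL;
		}
		if (signal_pending(current)) {
			*err = sock_intr_errno(timeo);
			return NULL;
		}
		sk_wait_data(sk, &timeo, NULL);
	}
	return skb;
}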
2460
3ab224be 2461/**
f8c3bf00 2462 * __sk_mem_raise_allocated - increase memory_allocated
3ab224be
HA
2463 * @sk: socket
2464 * @size: memory size to allocate
f8c3bf00 2465 * @amt: pages to allocate
3ab224be
HA
2466 * @kind: allocation type
2467 *
f8c3bf00 2468 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
3ab224be 2469 */
f8c3bf00 2470int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3ab224be
HA
2471{
2472 struct proto *prot = sk->sk_prot;
f8c3bf00 2473 long allocated = sk_memory_allocated_add(sk, amt);
d6f19938 2474 bool charged = true;
e805605c 2475
baac50bb 2476 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
d6f19938 2477 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
e805605c 2478 goto suppress_allocation;
3ab224be
HA
2479
2480 /* Under limit. */
e805605c 2481 if (allocated <= sk_prot_mem_limits(sk, 0)) {
180d8cd9 2482 sk_leave_memory_pressure(sk);
3ab224be
HA
2483 return 1;
2484 }
2485
e805605c
JW
2486 /* Under pressure. */
2487 if (allocated > sk_prot_mem_limits(sk, 1))
180d8cd9 2488 sk_enter_memory_pressure(sk);
3ab224be 2489
e805605c
JW
2490 /* Over hard limit. */
2491 if (allocated > sk_prot_mem_limits(sk, 2))
3ab224be
HA
2492 goto suppress_allocation;
2493
2494 /* guarantee minimum buffer size under pressure */
2495 if (kind == SK_MEM_RECV) {
a3dcaf17 2496 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3ab224be 2497 return 1;
180d8cd9 2498
3ab224be 2499 } else { /* SK_MEM_SEND */
a3dcaf17
ED
2500 int wmem0 = sk_get_wmem0(sk, prot);
2501
3ab224be 2502 if (sk->sk_type == SOCK_STREAM) {
a3dcaf17 2503 if (sk->sk_wmem_queued < wmem0)
3ab224be 2504 return 1;
a3dcaf17 2505 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3ab224be 2506 return 1;
a3dcaf17 2507 }
3ab224be
HA
2508 }
2509
180d8cd9 2510 if (sk_has_memory_pressure(sk)) {
5bf325a5 2511 u64 alloc;
1748376b 2512
180d8cd9 2513 if (!sk_under_memory_pressure(sk))
1748376b 2514 return 1;
180d8cd9
GC
2515 alloc = sk_sockets_allocated_read_positive(sk);
2516 if (sk_prot_mem_limits(sk, 2) > alloc *
3ab224be
HA
2517 sk_mem_pages(sk->sk_wmem_queued +
2518 atomic_read(&sk->sk_rmem_alloc) +
2519 sk->sk_forward_alloc))
2520 return 1;
2521 }
2522
2523suppress_allocation:
2524
2525 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2526 sk_stream_moderate_sndbuf(sk);
2527
2528 /* Fail only if socket is _under_ its sndbuf.
2529 * In this case we cannot block, so we have to fail.
2530 */
2531 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2532 return 1;
2533 }
2534
d6f19938
YS
2535 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2536 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3847ce32 2537
0e90b31f 2538 sk_memory_allocated_sub(sk, amt);
180d8cd9 2539
baac50bb
JW
2540 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2541 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
e805605c 2542
3ab224be
HA
2543 return 0;
2544}
f8c3bf00
PA
2545EXPORT_SYMBOL(__sk_mem_raise_allocated);
2546
2547/**
2548 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2549 * @sk: socket
2550 * @size: memory size to allocate
2551 * @kind: allocation type
2552 *
2553 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2554 * rmem allocation. This function assumes that protocols which have
2555 * memory_pressure use sk_wmem_queued as write buffer accounting.
2556 */
2557int __sk_mem_schedule(struct sock *sk, int size, int kind)
2558{
2559 int ret, amt = sk_mem_pages(size);
2560
2561 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2562 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2563 if (!ret)
2564 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2565 return ret;
2566}
3ab224be
HA
2567EXPORT_SYMBOL(__sk_mem_schedule);
2568
2569/**
f8c3bf00 2570 * __sk_mem_reduce_allocated - reclaim memory_allocated
3ab224be 2571 * @sk: socket
f8c3bf00
PA
2572 * @amount: number of quanta
2573 *
2574 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3ab224be 2575 */
f8c3bf00 2576void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3ab224be 2577{
1a24e04e 2578 sk_memory_allocated_sub(sk, amount);
3ab224be 2579
baac50bb
JW
2580 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2581 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
e805605c 2582
180d8cd9
GC
2583 if (sk_under_memory_pressure(sk) &&
2584 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2585 sk_leave_memory_pressure(sk);
3ab224be 2586}
f8c3bf00
PA
2587EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2588
2589/**
2590 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2591 * @sk: socket
2592 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2593 */
2594void __sk_mem_reclaim(struct sock *sk, int amount)
2595{
2596 amount >>= SK_MEM_QUANTUM_SHIFT;
2597 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2598 __sk_mem_reduce_allocated(sk, amount);
2599}
3ab224be
HA
2600EXPORT_SYMBOL(__sk_mem_reclaim);
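/* Illustrative sketch, not part of this file: SK_MEM_QUANTUM is one
 * page, so charging 3000 bytes reserves sk_mem_pages(3000) == 1
 * quantum (4096 bytes on most arches) in sk_forward_alloc, and the
 * reclaim path hands whole quanta back to memory_allocated.
 */
static int example_charge(struct sock *sk, int bytes)
{
	if (!__sk_mem_schedule(sk, bytes, SK_MEM_RECV))
		return -ENOBUFS;	/* over limits, allocation suppressed */
	/* ... consume up to "bytes" against sk->sk_forward_alloc ... */
	__sk_mem_reclaim(sk, sk->sk_forward_alloc);	/* give it all back */
	return 0;
}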
2601
627d2d6b 2602int sk_set_peek_off(struct sock *sk, int val)
2603{
627d2d6b 2604 sk->sk_peek_off = val;
2605 return 0;
2606}
2607EXPORT_SYMBOL_GPL(sk_set_peek_off);
3ab224be 2608
1da177e4
LT
2609/*
2610 * Set of default routines for initialising struct proto_ops when
2611 * the protocol does not support a particular function. In certain
2612 * cases where it makes no sense for a protocol to have a "do nothing"
2613 * function, some default processing is provided.
2614 */
2615
2616int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2617{
2618 return -EOPNOTSUPP;
2619}
2a91525c 2620EXPORT_SYMBOL(sock_no_bind);
1da177e4 2621
4ec93edb 2622int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1da177e4
LT
2623 int len, int flags)
2624{
2625 return -EOPNOTSUPP;
2626}
2a91525c 2627EXPORT_SYMBOL(sock_no_connect);
1da177e4
LT
2628
2629int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2630{
2631 return -EOPNOTSUPP;
2632}
2a91525c 2633EXPORT_SYMBOL(sock_no_socketpair);
1da177e4 2634
cdfbabfb
DH
2635int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2636 bool kern)
1da177e4
LT
2637{
2638 return -EOPNOTSUPP;
2639}
2a91525c 2640EXPORT_SYMBOL(sock_no_accept);
1da177e4 2641
4ec93edb 2642int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
9b2c45d4 2643 int peer)
1da177e4
LT
2644{
2645 return -EOPNOTSUPP;
2646}
2a91525c 2647EXPORT_SYMBOL(sock_no_getname);
1da177e4 2648
1da177e4
LT
2649int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2650{
2651 return -EOPNOTSUPP;
2652}
2a91525c 2653EXPORT_SYMBOL(sock_no_ioctl);
1da177e4
LT
2654
2655int sock_no_listen(struct socket *sock, int backlog)
2656{
2657 return -EOPNOTSUPP;
2658}
2a91525c 2659EXPORT_SYMBOL(sock_no_listen);
1da177e4
LT
2660
2661int sock_no_shutdown(struct socket *sock, int how)
2662{
2663 return -EOPNOTSUPP;
2664}
2a91525c 2665EXPORT_SYMBOL(sock_no_shutdown);
1da177e4
LT
2666
2667int sock_no_setsockopt(struct socket *sock, int level, int optname,
b7058842 2668 char __user *optval, unsigned int optlen)
1da177e4
LT
2669{
2670 return -EOPNOTSUPP;
2671}
2a91525c 2672EXPORT_SYMBOL(sock_no_setsockopt);
1da177e4
LT
2673
2674int sock_no_getsockopt(struct socket *sock, int level, int optname,
2675 char __user *optval, int __user *optlen)
2676{
2677 return -EOPNOTSUPP;
2678}
2a91525c 2679EXPORT_SYMBOL(sock_no_getsockopt);
1da177e4 2680
1b784140 2681int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
1da177e4
LT
2682{
2683 return -EOPNOTSUPP;
2684}
2a91525c 2685EXPORT_SYMBOL(sock_no_sendmsg);
1da177e4 2686
306b13eb
TH
2687int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2688{
2689 return -EOPNOTSUPP;
2690}
2691EXPORT_SYMBOL(sock_no_sendmsg_locked);
2692
1b784140
YX
2693int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2694 int flags)
1da177e4
LT
2695{
2696 return -EOPNOTSUPP;
2697}
2a91525c 2698EXPORT_SYMBOL(sock_no_recvmsg);
1da177e4
LT
2699
2700int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2701{
2702 /* Mirror missing mmap method error code */
2703 return -ENODEV;
2704}
2a91525c 2705EXPORT_SYMBOL(sock_no_mmap);
1da177e4
LT
2706
2707ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2708{
2709 ssize_t res;
2710 struct msghdr msg = {.msg_flags = flags};
2711 struct kvec iov;
2712 char *kaddr = kmap(page);
2713 iov.iov_base = kaddr + offset;
2714 iov.iov_len = size;
2715 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2716 kunmap(page);
2717 return res;
2718}
2a91525c 2719EXPORT_SYMBOL(sock_no_sendpage);
1da177e4 2720
306b13eb
TH
2721ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2722 int offset, size_t size, int flags)
2723{
2724 ssize_t res;
2725 struct msghdr msg = {.msg_flags = flags};
2726 struct kvec iov;
2727 char *kaddr = kmap(page);
2728
2729 iov.iov_base = kaddr + offset;
2730 iov.iov_len = size;
2731 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2732 kunmap(page);
2733 return res;
2734}
2735EXPORT_SYMBOL(sock_no_sendpage_locked);
2736
1da177e4
LT
2737/*
2738 * Default Socket Callbacks
2739 */
2740
2741static void sock_def_wakeup(struct sock *sk)
2742{
43815482
ED
2743 struct socket_wq *wq;
2744
2745 rcu_read_lock();
2746 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2747 if (skwq_has_sleeper(wq))
43815482
ED
2748 wake_up_interruptible_all(&wq->wait);
2749 rcu_read_unlock();
1da177e4
LT
2750}
2751
2752static void sock_def_error_report(struct sock *sk)
2753{
43815482
ED
2754 struct socket_wq *wq;
2755
2756 rcu_read_lock();
2757 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2758 if (skwq_has_sleeper(wq))
a9a08845 2759 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
8d8ad9d7 2760 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
43815482 2761 rcu_read_unlock();
1da177e4
LT
2762}
2763
676d2369 2764static void sock_def_readable(struct sock *sk)
1da177e4 2765{
43815482
ED
2766 struct socket_wq *wq;
2767
2768 rcu_read_lock();
2769 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2770 if (skwq_has_sleeper(wq))
a9a08845
LT
2771 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2772 EPOLLRDNORM | EPOLLRDBAND);
8d8ad9d7 2773 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
43815482 2774 rcu_read_unlock();
1da177e4
LT
2775}
2776
2777static void sock_def_write_space(struct sock *sk)
2778{
43815482
ED
2779 struct socket_wq *wq;
2780
2781 rcu_read_lock();
1da177e4
LT
2782
2783 /* Do not wake up a writer until he can make "significant"
2784 * progress. --DaveM
2785 */
14afee4b 2786 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
43815482 2787 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2788 if (skwq_has_sleeper(wq))
a9a08845
LT
2789 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2790 EPOLLWRNORM | EPOLLWRBAND);
1da177e4
LT
2791
2792 /* Should agree with poll, otherwise some programs break */
2793 if (sock_writeable(sk))
8d8ad9d7 2794 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1da177e4
LT
2795 }
2796
43815482 2797 rcu_read_unlock();
1da177e4
LT
2798}
2799
2800static void sock_def_destruct(struct sock *sk)
2801{
1da177e4
LT
2802}
2803
2804void sk_send_sigurg(struct sock *sk)
2805{
2806 if (sk->sk_socket && sk->sk_socket->file)
2807 if (send_sigurg(&sk->sk_socket->file->f_owner))
8d8ad9d7 2808 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1da177e4 2809}
2a91525c 2810EXPORT_SYMBOL(sk_send_sigurg);
1da177e4
LT
2811
2812void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2813 unsigned long expires)
2814{
2815 if (!mod_timer(timer, expires))
2816 sock_hold(sk);
2817}
1da177e4
LT
2818EXPORT_SYMBOL(sk_reset_timer);
2819
2820void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2821{
25cc4ae9 2822 if (del_timer(timer))
1da177e4
LT
2823 __sock_put(sk);
2824}
1da177e4
LT
2825EXPORT_SYMBOL(sk_stop_timer);
2826
2827void sock_init_data(struct socket *sock, struct sock *sk)
2828{
581319c5 2829 sk_init_common(sk);
1da177e4
LT
2830 sk->sk_send_head = NULL;
2831
99767f27 2832 timer_setup(&sk->sk_timer, NULL, 0);
4ec93edb 2833
1da177e4
LT
2834 sk->sk_allocation = GFP_KERNEL;
2835 sk->sk_rcvbuf = sysctl_rmem_default;
2836 sk->sk_sndbuf = sysctl_wmem_default;
2837 sk->sk_state = TCP_CLOSE;
972692e0 2838 sk_set_socket(sk, sock);
1da177e4
LT
2839
2840 sock_set_flag(sk, SOCK_ZAPPED);
2841
e71a4783 2842 if (sock) {
1da177e4 2843 sk->sk_type = sock->type;
c2f26e8f 2844 RCU_INIT_POINTER(sk->sk_wq, sock->wq);
1da177e4 2845 sock->sk = sk;
86741ec2
LC
2846 sk->sk_uid = SOCK_INODE(sock)->i_uid;
2847 } else {
c2f26e8f 2848 RCU_INIT_POINTER(sk->sk_wq, NULL);
86741ec2
LC
2849 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
2850 }
1da177e4 2851
1da177e4 2852 rwlock_init(&sk->sk_callback_lock);
cdfbabfb
DH
2853 if (sk->sk_kern_sock)
2854 lockdep_set_class_and_name(
2855 &sk->sk_callback_lock,
2856 af_kern_callback_keys + sk->sk_family,
2857 af_family_kern_clock_key_strings[sk->sk_family]);
2858 else
2859 lockdep_set_class_and_name(
2860 &sk->sk_callback_lock,
443aef0e
PZ
2861 af_callback_keys + sk->sk_family,
2862 af_family_clock_key_strings[sk->sk_family]);
1da177e4
LT
2863
2864 sk->sk_state_change = sock_def_wakeup;
2865 sk->sk_data_ready = sock_def_readable;
2866 sk->sk_write_space = sock_def_write_space;
2867 sk->sk_error_report = sock_def_error_report;
2868 sk->sk_destruct = sock_def_destruct;
2869
5640f768
ED
2870 sk->sk_frag.page = NULL;
2871 sk->sk_frag.offset = 0;
ef64a54f 2872 sk->sk_peek_off = -1;
1da177e4 2873
109f6e39
EB
2874 sk->sk_peer_pid = NULL;
2875 sk->sk_peer_cred = NULL;
1da177e4
LT
2876 sk->sk_write_pending = 0;
2877 sk->sk_rcvlowat = 1;
2878 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2879 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2880
6c7c98ba 2881 sk->sk_stamp = SK_DEFAULT_STAMP;
3a0ed3e9
DD
2882#if BITS_PER_LONG==32
2883 seqlock_init(&sk->sk_stamp_seq);
2884#endif
52267790 2885 atomic_set(&sk->sk_zckey, 0);
1da177e4 2886
e0d1095a 2887#ifdef CONFIG_NET_RX_BUSY_POLL
06021292 2888 sk->sk_napi_id = 0;
64b0dc51 2889 sk->sk_ll_usec = sysctl_net_busy_read;
06021292
ET
2890#endif
2891
76a9ebe8
ED
2892 sk->sk_max_pacing_rate = ~0UL;
2893 sk->sk_pacing_rate = ~0UL;
3a9b76fd 2894 sk->sk_pacing_shift = 10;
70da268b 2895 sk->sk_incoming_cpu = -1;
c6345ce7
AN
2896
2897 sk_rx_queue_clear(sk);
4dc6dc71
ED
2898 /*
2899 * Before updating sk_refcnt, we must commit prior changes to memory
2900 * (Documentation/RCU/rculist_nulls.txt for details)
2901 */
2902 smp_wmb();
41c6d650 2903 refcount_set(&sk->sk_refcnt, 1);
33c732c3 2904 atomic_set(&sk->sk_drops, 0);
1da177e4 2905}
2a91525c 2906EXPORT_SYMBOL(sock_init_data);
1da177e4 2907
b5606c2d 2908void lock_sock_nested(struct sock *sk, int subclass)
1da177e4
LT
2909{
2910 might_sleep();
a5b5bb9a 2911 spin_lock_bh(&sk->sk_lock.slock);
d2e9117c 2912 if (sk->sk_lock.owned)
1da177e4 2913 __lock_sock(sk);
d2e9117c 2914 sk->sk_lock.owned = 1;
a5b5bb9a
IM
2915 spin_unlock(&sk->sk_lock.slock);
2916 /*
2917 * The sk_lock has mutex_lock() semantics here:
2918 */
fcc70d5f 2919 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
a5b5bb9a 2920 local_bh_enable();
1da177e4 2921}
fcc70d5f 2922EXPORT_SYMBOL(lock_sock_nested);
1da177e4 2923
b5606c2d 2924void release_sock(struct sock *sk)
1da177e4 2925{
a5b5bb9a 2926 spin_lock_bh(&sk->sk_lock.slock);
1da177e4
LT
2927 if (sk->sk_backlog.tail)
2928 __release_sock(sk);
46d3ceab 2929
c3f9b018
ED
2930 /* Warning : release_cb() might need to release sk ownership,
2931 * ie call sock_release_ownership(sk) before us.
2932 */
46d3ceab
ED
2933 if (sk->sk_prot->release_cb)
2934 sk->sk_prot->release_cb(sk);
2935
c3f9b018 2936 sock_release_ownership(sk);
a5b5bb9a
IM
2937 if (waitqueue_active(&sk->sk_lock.wq))
2938 wake_up(&sk->sk_lock.wq);
2939 spin_unlock_bh(&sk->sk_lock.slock);
1da177e4
LT
2940}
2941EXPORT_SYMBOL(release_sock);
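/* Illustrative sketch, not part of this file: the canonical
 * process-context critical section. lock_sock() takes ownership so
 * softirqs queue packets to the backlog; release_sock() replays the
 * backlog and wakes other lockers.
 */
static void example_locked_op(struct sock *sk)
{
	lock_sock(sk);		/* may sleep */
	/* ... modify socket state safely ... */
	release_sock(sk);
}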
2942
8a74ad60
ED
2943/**
2944 * lock_sock_fast - fast version of lock_sock
2945 * @sk: socket
2946 *
2947 * This version should be used for very small sections, where the process won't block
d651983d
MCC
2948 * return false if fast path is taken:
2949 *
8a74ad60 2950 * sk_lock.slock locked, owned = 0, BH disabled
d651983d
MCC
2951 *
2952 * return true if slow path is taken:
2953 *
8a74ad60
ED
2954 * sk_lock.slock unlocked, owned = 1, BH enabled
2955 */
2956bool lock_sock_fast(struct sock *sk)
2957{
2958 might_sleep();
2959 spin_lock_bh(&sk->sk_lock.slock);
2960
2961 if (!sk->sk_lock.owned)
2962 /*
2963 * Note : We must disable BH
2964 */
2965 return false;
2966
2967 __lock_sock(sk);
2968 sk->sk_lock.owned = 1;
2969 spin_unlock(&sk->sk_lock.slock);
2970 /*
2971 * The sk_lock has mutex_lock() semantics here:
2972 */
2973 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2974 local_bh_enable();
2975 return true;
2976}
2977EXPORT_SYMBOL(lock_sock_fast);
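/* Illustrative sketch, not part of this file: the counterpart
 * unlock_sock_fast() (include/net/sock.h) needs the return value to
 * pick the matching unlock path.
 */
static void example_fast(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);

	/* ... very short, non-blocking work ... */
	unlock_sock_fast(sk, slow);	/* spin_unlock_bh() or release_sock() */
}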
2978
c7cbdbf2
AB
2979int sock_gettstamp(struct socket *sock, void __user *userstamp,
2980 bool timeval, bool time32)
4ec93edb 2981{
c7cbdbf2
AB
2982 struct sock *sk = sock->sk;
2983 struct timespec64 ts;
9dae3497
YS
2984
2985 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
c7cbdbf2
AB
2986 ts = ktime_to_timespec64(sock_read_timestamp(sk));
2987 if (ts.tv_sec == -1)
1da177e4 2988 return -ENOENT;
c7cbdbf2 2989 if (ts.tv_sec == 0) {
3a0ed3e9 2990 ktime_t kt = ktime_get_real();
c7cbdbf2
AB
2991 sock_write_timestamp(sk, kt);
2992 ts = ktime_to_timespec64(kt);
b7aa0bf7 2993 }
1da177e4 2994
c7cbdbf2
AB
2995 if (timeval)
2996 ts.tv_nsec /= 1000;
9dae3497 2997
c7cbdbf2
AB
2998#ifdef CONFIG_COMPAT_32BIT_TIME
2999 if (time32)
3000 return put_old_timespec32(&ts, userstamp);
3001#endif
3002#ifdef CONFIG_SPARC64
3003 /* beware of padding in sparc64 timeval */
3004 if (timeval && !in_compat_syscall()) {
3005 struct __kernel_old_timeval __user tv = {
c98f4822
SR
3006 .tv_sec = ts.tv_sec,
3007 .tv_usec = ts.tv_nsec,
c7cbdbf2 3008 };
c98f4822 3009 if (copy_to_user(userstamp, &tv, sizeof(tv)))
c7cbdbf2
AB
3010 return -EFAULT;
3011 return 0;
ae40eb1e 3012 }
c7cbdbf2
AB
3013#endif
3014 return put_timespec64(&ts, userstamp);
ae40eb1e 3015}
c7cbdbf2 3016EXPORT_SYMBOL(sock_gettstamp);
ae40eb1e 3017
20d49473 3018void sock_enable_timestamp(struct sock *sk, int flag)
4ec93edb 3019{
20d49473 3020 if (!sock_flag(sk, flag)) {
08e29af3
ED
3021 unsigned long previous_flags = sk->sk_flags;
3022
20d49473
PO
3023 sock_set_flag(sk, flag);
3024 /*
3025 * we just set one of the two flags which require net
3026 * time stamping, but time stamping might have been on
3027 * already because of the other one
3028 */
080a270f
HFS
3029 if (sock_needs_netstamp(sk) &&
3030 !(previous_flags & SK_FLAGS_TIMESTAMP))
20d49473 3031 net_enable_timestamp();
1da177e4
LT
3032 }
3033}
1da177e4 3034
cb820f8e
RC
3035int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3036 int level, int type)
3037{
3038 struct sock_exterr_skb *serr;
364a9e93 3039 struct sk_buff *skb;
cb820f8e
RC
3040 int copied, err;
3041
3042 err = -EAGAIN;
364a9e93 3043 skb = sock_dequeue_err_skb(sk);
cb820f8e
RC
3044 if (skb == NULL)
3045 goto out;
3046
3047 copied = skb->len;
3048 if (copied > len) {
3049 msg->msg_flags |= MSG_TRUNC;
3050 copied = len;
3051 }
51f3d02b 3052 err = skb_copy_datagram_msg(skb, 0, msg, copied);
cb820f8e
RC
3053 if (err)
3054 goto out_free_skb;
3055
3056 sock_recv_timestamp(msg, sk, skb);
3057
3058 serr = SKB_EXT_ERR(skb);
3059 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3060
3061 msg->msg_flags |= MSG_ERRQUEUE;
3062 err = copied;
3063
cb820f8e
RC
3064out_free_skb:
3065 kfree_skb(skb);
3066out:
3067 return err;
3068}
3069EXPORT_SYMBOL(sock_recv_errqueue);
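/* Illustrative sketch, not part of this file: dispatching
 * MSG_ERRQUEUE reads to the generic helper, mirroring the packet
 * socket usage.
 */
static int example_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
			   int flags)
{
	if (flags & MSG_ERRQUEUE)
		return sock_recv_errqueue(sk, msg, len,
					  SOL_PACKET, PACKET_TX_TIMESTAMP);
	/* ... normal receive path ... */
	return -EAGAIN;
}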
3070
1da177e4
LT
3071/*
3072 * Get a socket option on a socket.
3073 *
3074 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3075 * asynchronous errors should be reported by getsockopt. We assume
3076 * this means if you specify SO_ERROR (otherwise what's the point of it).
3077 */
3078int sock_common_getsockopt(struct socket *sock, int level, int optname,
3079 char __user *optval, int __user *optlen)
3080{
3081 struct sock *sk = sock->sk;
3082
3083 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3084}
1da177e4
LT
3085EXPORT_SYMBOL(sock_common_getsockopt);
3086
3fdadf7d 3087#ifdef CONFIG_COMPAT
543d9cfe
ACM
3088int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3089 char __user *optval, int __user *optlen)
3fdadf7d
DM
3090{
3091 struct sock *sk = sock->sk;
3092
1e51f951 3093 if (sk->sk_prot->compat_getsockopt != NULL)
543d9cfe
ACM
3094 return sk->sk_prot->compat_getsockopt(sk, level, optname,
3095 optval, optlen);
3fdadf7d
DM
3096 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3097}
3098EXPORT_SYMBOL(compat_sock_common_getsockopt);
3099#endif
3100
1b784140
YX
3101int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3102 int flags)
1da177e4
LT
3103{
3104 struct sock *sk = sock->sk;
3105 int addr_len = 0;
3106 int err;
3107
1b784140 3108 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
1da177e4
LT
3109 flags & ~MSG_DONTWAIT, &addr_len);
3110 if (err >= 0)
3111 msg->msg_namelen = addr_len;
3112 return err;
3113}
1da177e4
LT
3114EXPORT_SYMBOL(sock_common_recvmsg);
3115
3116/*
3117 * Set socket options on an inet socket.
3118 */
3119int sock_common_setsockopt(struct socket *sock, int level, int optname,
b7058842 3120 char __user *optval, unsigned int optlen)
1da177e4
LT
3121{
3122 struct sock *sk = sock->sk;
3123
3124 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3125}
1da177e4
LT
3126EXPORT_SYMBOL(sock_common_setsockopt);
3127
3fdadf7d 3128#ifdef CONFIG_COMPAT
543d9cfe 3129int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
b7058842 3130 char __user *optval, unsigned int optlen)
3fdadf7d
DM
3131{
3132 struct sock *sk = sock->sk;
3133
543d9cfe
ACM
3134 if (sk->sk_prot->compat_setsockopt != NULL)
3135 return sk->sk_prot->compat_setsockopt(sk, level, optname,
3136 optval, optlen);
3fdadf7d
DM
3137 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3138}
3139EXPORT_SYMBOL(compat_sock_common_setsockopt);
3140#endif
3141
1da177e4
LT
3142void sk_common_release(struct sock *sk)
3143{
3144 if (sk->sk_prot->destroy)
3145 sk->sk_prot->destroy(sk);
3146
3147 /*
3148 * Observation: when sock_common_release is called, processes have
3149 * no access to the socket, but the network stack still does.
3150 * Step one, detach it from networking:
3151 *
3152 * A. Remove from hash tables.
3153 */
3154
3155 sk->sk_prot->unhash(sk);
3156
3157 /*
3158 * At this point the socket cannot receive new packets, but it is possible
3159 * that some packets are in flight because some CPU is running the receiver
3160 * and did the hash table lookup before we unhashed the socket. They will
3161 * reach the receive queue and will be purged by the socket destructor.
3162 *
3163 * Also we still have packets pending on the receive queue and probably
3164 * our own packets waiting in device queues. sock_destroy will drain the
3165 * receive queue, but transmitted packets will delay socket destruction
3166 * until the last reference is released.
3167 */
3168
3169 sock_orphan(sk);
3170
3171 xfrm_sk_free_policy(sk);
3172
e6848976 3173 sk_refcnt_debug_release(sk);
5640f768 3174
1da177e4
LT
3175 sock_put(sk);
3176}
1da177e4
LT
3177EXPORT_SYMBOL(sk_common_release);
3178
a2d133b1
JH
3179void sk_get_meminfo(const struct sock *sk, u32 *mem)
3180{
3181 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3182
3183 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3184 mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3185 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3186 mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3187 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3188 mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3189 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3190 mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3191 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3192}
3193
13ff3d6f
PE
3194#ifdef CONFIG_PROC_FS
3195#define PROTO_INUSE_NR 64 /* should be enough for the first time */
1338d466
PE
3196struct prot_inuse {
3197 int val[PROTO_INUSE_NR];
3198};
13ff3d6f
PE
3199
3200static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
70ee1159 3201
70ee1159
PE
3202void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3203{
08fc7f81 3204 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
70ee1159
PE
3205}
3206EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3207
3208int sock_prot_inuse_get(struct net *net, struct proto *prot)
3209{
3210 int cpu, idx = prot->inuse_idx;
3211 int res = 0;
3212
3213 for_each_possible_cpu(cpu)
08fc7f81 3214 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
70ee1159
PE
3215
3216 return res >= 0 ? res : 0;
3217}
3218EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3219
648845ab
TZ
3220static void sock_inuse_add(struct net *net, int val)
3221{
3222 this_cpu_add(*net->core.sock_inuse, val);
3223}
3224
3225int sock_inuse_get(struct net *net)
3226{
3227 int cpu, res = 0;
3228
3229 for_each_possible_cpu(cpu)
3230 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3231
3232 return res;
3233}
3234
3235EXPORT_SYMBOL_GPL(sock_inuse_get);
3236
2c8c1e72 3237static int __net_init sock_inuse_init_net(struct net *net)
70ee1159 3238{
08fc7f81 3239 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
648845ab
TZ
3240 if (net->core.prot_inuse == NULL)
3241 return -ENOMEM;
3242
3243 net->core.sock_inuse = alloc_percpu(int);
3244 if (net->core.sock_inuse == NULL)
3245 goto out;
3246
3247 return 0;
3248
3249out:
3250 free_percpu(net->core.prot_inuse);
3251 return -ENOMEM;
70ee1159
PE
3252}
3253
2c8c1e72 3254static void __net_exit sock_inuse_exit_net(struct net *net)
70ee1159 3255{
08fc7f81 3256 free_percpu(net->core.prot_inuse);
648845ab 3257 free_percpu(net->core.sock_inuse);
70ee1159
PE
3258}
3259
3260static struct pernet_operations net_inuse_ops = {
3261 .init = sock_inuse_init_net,
3262 .exit = sock_inuse_exit_net,
3263};
3264
3265static __init int net_inuse_init(void)
3266{
3267 if (register_pernet_subsys(&net_inuse_ops))
3268 panic("Cannot initialize net inuse counters");
3269
3270 return 0;
3271}
3272
3273core_initcall(net_inuse_init);
13ff3d6f
PE
3274
3275static void assign_proto_idx(struct proto *prot)
3276{
3277 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3278
3279 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
e005d193 3280 pr_err("PROTO_INUSE_NR exhausted\n");
13ff3d6f
PE
3281 return;
3282 }
3283
3284 set_bit(prot->inuse_idx, proto_inuse_idx);
3285}
3286
3287static void release_proto_idx(struct proto *prot)
3288{
3289 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3290 clear_bit(prot->inuse_idx, proto_inuse_idx);
3291}
3292#else
3293static inline void assign_proto_idx(struct proto *prot)
3294{
3295}
3296
3297static inline void release_proto_idx(struct proto *prot)
3298{
3299}
648845ab
TZ
3300
3301static void sock_inuse_add(struct net *net, int val)
3302{
3303}
13ff3d6f
PE
3304#endif
3305
0159dfd3
ED
3306static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3307{
3308 if (!rsk_prot)
3309 return;
3310 kfree(rsk_prot->slab_name);
3311 rsk_prot->slab_name = NULL;
adf78eda
JL
3312 kmem_cache_destroy(rsk_prot->slab);
3313 rsk_prot->slab = NULL;
0159dfd3
ED
3314}
3315
3316static int req_prot_init(const struct proto *prot)
3317{
3318 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3319
3320 if (!rsk_prot)
3321 return 0;
3322
3323 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3324 prot->name);
3325 if (!rsk_prot->slab_name)
3326 return -ENOMEM;
3327
3328 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3329 rsk_prot->obj_size, 0,
e699e2c6
SB
3330 SLAB_ACCOUNT | prot->slab_flags,
3331 NULL);
0159dfd3
ED
3332
3333 if (!rsk_prot->slab) {
3334 pr_crit("%s: Can't create request sock SLAB cache!\n",
3335 prot->name);
3336 return -ENOMEM;
3337 }
3338 return 0;
3339}
3340
b733c007
PE
3341int proto_register(struct proto *prot, int alloc_slab)
3342{
1da177e4 3343 if (alloc_slab) {
30c2c9f1
DW
3344 prot->slab = kmem_cache_create_usercopy(prot->name,
3345 prot->obj_size, 0,
e699e2c6
SB
3346 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3347 prot->slab_flags,
289a4860 3348 prot->useroffset, prot->usersize,
271b72c7 3349 NULL);
1da177e4
LT
3350
3351 if (prot->slab == NULL) {
e005d193
JP
3352 pr_crit("%s: Can't create sock SLAB cache!\n",
3353 prot->name);
60e7663d 3354 goto out;
1da177e4 3355 }
2e6599cb 3356
0159dfd3
ED
3357 if (req_prot_init(prot))
3358 goto out_free_request_sock_slab;
8feaf0c0 3359
6d6ee43e 3360 if (prot->twsk_prot != NULL) {
faf23422 3361 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
8feaf0c0 3362
7e56b5d6 3363 if (prot->twsk_prot->twsk_slab_name == NULL)
8feaf0c0
ACM
3364 goto out_free_request_sock_slab;
3365
6d6ee43e 3366 prot->twsk_prot->twsk_slab =
7e56b5d6 3367 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
6d6ee43e 3368 prot->twsk_prot->twsk_obj_size,
3ab5aee7 3369 0,
e699e2c6 3370 SLAB_ACCOUNT |
52db70dc 3371 prot->slab_flags,
20c2df83 3372 NULL);
6d6ee43e 3373 if (prot->twsk_prot->twsk_slab == NULL)
8feaf0c0
ACM
3374 goto out_free_timewait_sock_slab_name;
3375 }
1da177e4
LT
3376 }
3377
36b77a52 3378 mutex_lock(&proto_list_mutex);
1da177e4 3379 list_add(&prot->node, &proto_list);
13ff3d6f 3380 assign_proto_idx(prot);
36b77a52 3381 mutex_unlock(&proto_list_mutex);
b733c007
PE
3382 return 0;
3383
8feaf0c0 3384out_free_timewait_sock_slab_name:
7e56b5d6 3385 kfree(prot->twsk_prot->twsk_slab_name);
8feaf0c0 3386out_free_request_sock_slab:
0159dfd3
ED
3387 req_prot_cleanup(prot->rsk_prot);
3388
2e6599cb
ACM
3389 kmem_cache_destroy(prot->slab);
3390 prot->slab = NULL;
b733c007
PE
3391out:
3392 return -ENOBUFS;
1da177e4 3393}
1da177e4
LT
3394EXPORT_SYMBOL(proto_register);
3395
3396void proto_unregister(struct proto *prot)
3397{
36b77a52 3398 mutex_lock(&proto_list_mutex);
13ff3d6f 3399 release_proto_idx(prot);
0a3f4358 3400 list_del(&prot->node);
36b77a52 3401 mutex_unlock(&proto_list_mutex);
1da177e4 3402
adf78eda
JL
3403 kmem_cache_destroy(prot->slab);
3404 prot->slab = NULL;
1da177e4 3405
0159dfd3 3406 req_prot_cleanup(prot->rsk_prot);
2e6599cb 3407
6d6ee43e 3408 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
6d6ee43e 3409 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
7e56b5d6 3410 kfree(prot->twsk_prot->twsk_slab_name);
6d6ee43e 3411 prot->twsk_prot->twsk_slab = NULL;
8feaf0c0 3412 }
1da177e4 3413}
1da177e4
LT
3414EXPORT_SYMBOL(proto_unregister);
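/* Illustrative sketch, not part of this file: minimal protocol
 * registration. struct my_sock and the module boilerplate are
 * hypothetical; alloc_slab=1 asks proto_register() to create the
 * slab caches that proto_unregister() tears down again.
 */
struct my_sock {
	struct sock sk;			/* must be first */
	/* protocol-private fields ... */
};

static struct proto my_proto = {
	.name	  = "EXAMPLE",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct my_sock),
};

static int __init example_init(void)
{
	return proto_register(&my_proto, 1);
}

static void __exit example_exit(void)
{
	proto_unregister(&my_proto);
}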
3415
bf2ae2e4
XL
3416int sock_load_diag_module(int family, int protocol)
3417{
3418 if (!protocol) {
3419 if (!sock_is_registered(family))
3420 return -ENOENT;
3421
3422 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3423 NETLINK_SOCK_DIAG, family);
3424 }
3425
3426#ifdef CONFIG_INET
3427 if (family == AF_INET &&
c34c1287 3428 protocol != IPPROTO_RAW &&
bf2ae2e4
XL
3429 !rcu_access_pointer(inet_protos[protocol]))
3430 return -ENOENT;
3431#endif
3432
3433 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3434 NETLINK_SOCK_DIAG, family, protocol);
3435}
3436EXPORT_SYMBOL(sock_load_diag_module);
3437
1da177e4 3438#ifdef CONFIG_PROC_FS
1da177e4 3439static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
36b77a52 3440 __acquires(proto_list_mutex)
1da177e4 3441{
36b77a52 3442 mutex_lock(&proto_list_mutex);
60f0438a 3443 return seq_list_start_head(&proto_list, *pos);
1da177e4
LT
3444}
3445
3446static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3447{
60f0438a 3448 return seq_list_next(v, &proto_list, pos);
1da177e4
LT
3449}
3450
3451static void proto_seq_stop(struct seq_file *seq, void *v)
36b77a52 3452 __releases(proto_list_mutex)
1da177e4 3453{
36b77a52 3454 mutex_unlock(&proto_list_mutex);
1da177e4
LT
3455}
3456
3457static char proto_method_implemented(const void *method)
3458{
3459 return method == NULL ? 'n' : 'y';
3460}
180d8cd9
GC
3461static long sock_prot_memory_allocated(struct proto *proto)
3462{
cb75a36c 3463 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
180d8cd9
GC
3464}
3465
3466static char *sock_prot_memory_pressure(struct proto *proto)
3467{
3468 return proto->memory_pressure != NULL ?
3469 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3470}
1da177e4
LT
3471
3472static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3473{
180d8cd9 3474
8d987e5c 3475 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
1da177e4
LT
3476 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3477 proto->name,
3478 proto->obj_size,
14e943db 3479 sock_prot_inuse_get(seq_file_net(seq), proto),
180d8cd9
GC
3480 sock_prot_memory_allocated(proto),
3481 sock_prot_memory_pressure(proto),
1da177e4
LT
3482 proto->max_header,
3483 proto->slab == NULL ? "no" : "yes",
3484 module_name(proto->owner),
3485 proto_method_implemented(proto->close),
3486 proto_method_implemented(proto->connect),
3487 proto_method_implemented(proto->disconnect),
3488 proto_method_implemented(proto->accept),
3489 proto_method_implemented(proto->ioctl),
3490 proto_method_implemented(proto->init),
3491 proto_method_implemented(proto->destroy),
3492 proto_method_implemented(proto->shutdown),
3493 proto_method_implemented(proto->setsockopt),
3494 proto_method_implemented(proto->getsockopt),
3495 proto_method_implemented(proto->sendmsg),
3496 proto_method_implemented(proto->recvmsg),
3497 proto_method_implemented(proto->sendpage),
3498 proto_method_implemented(proto->bind),
3499 proto_method_implemented(proto->backlog_rcv),
3500 proto_method_implemented(proto->hash),
3501 proto_method_implemented(proto->unhash),
3502 proto_method_implemented(proto->get_port),
3503 proto_method_implemented(proto->enter_memory_pressure));
3504}
3505
3506static int proto_seq_show(struct seq_file *seq, void *v)
3507{
60f0438a 3508 if (v == &proto_list)
1da177e4
LT
3509 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3510 "protocol",
3511 "size",
3512 "sockets",
3513 "memory",
3514 "press",
3515 "maxhdr",
3516 "slab",
3517 "module",
3518 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3519 else
60f0438a 3520 proto_seq_printf(seq, list_entry(v, struct proto, node));
1da177e4
LT
3521 return 0;
3522}
3523
f690808e 3524static const struct seq_operations proto_seq_ops = {
1da177e4
LT
3525 .start = proto_seq_start,
3526 .next = proto_seq_next,
3527 .stop = proto_seq_stop,
3528 .show = proto_seq_show,
3529};
3530
14e943db
ED
3531static __net_init int proto_init_net(struct net *net)
3532{
c3506372
CH
3533 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3534 sizeof(struct seq_net_private)))
14e943db
ED
3535 return -ENOMEM;
3536
3537 return 0;
3538}
3539
3540static __net_exit void proto_exit_net(struct net *net)
3541{
ece31ffd 3542 remove_proc_entry("protocols", net->proc_net);
14e943db
ED
3543}
3544
3545
3546static __net_initdata struct pernet_operations proto_net_ops = {
3547 .init = proto_init_net,
3548 .exit = proto_exit_net,
1da177e4
LT
3549};
3550
3551static int __init proto_init(void)
3552{
14e943db 3553 return register_pernet_subsys(&proto_net_ops);
1da177e4
LT
3554}
3555
3556subsys_initcall(proto_init);
3557
3558#endif /* PROC_FS */
7db6b048
SS
3559
3560#ifdef CONFIG_NET_RX_BUSY_POLL
3561bool sk_busy_loop_end(void *p, unsigned long start_time)
3562{
3563 struct sock *sk = p;
3564
3565 return !skb_queue_empty(&sk->sk_receive_queue) ||
3566 sk_busy_loop_timeout(sk, start_time);
3567}
3568EXPORT_SYMBOL(sk_busy_loop_end);
3569#endif /* CONFIG_NET_RX_BUSY_POLL */