/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <asm/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>

#include <trace/events/sock.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap in the
 * user namespace @user_ns when the socket was created, and that the
 * current process has it as well.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap in all
 * user namespaces when the socket was created, and that the current
 * process has it as well.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap over
 * the network namespace the socket is a member of when the socket was
 * created, and that the current process has it as well.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);
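/*
 * Illustrative sketch (not part of the original file): the three helpers
 * above form a narrowing series of capability scopes. A protocol gating a
 * privileged operation on the socket's owning netns would typically pick
 * the narrowest one, along these lines:
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 */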

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
	"sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
	"sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
	"sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
	"sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
	"sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
	"sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
	"sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
	"sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
	"sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
	"sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
	"sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
	"sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
	"sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
	"sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	"slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
	"slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
	"slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
	"slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
	"slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
	"slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
	"slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
	"slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
	"slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
	"slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
	"slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
	"slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
	"slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
	"slock-AF_NFC"   , "slock-AF_VSOCK"    , "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	"clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
	"clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
	"clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
	"clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
	"clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
	"clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
	"clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
	"clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
	"clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
	"clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
	"clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
	"clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
	"clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
	"clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms. This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(memalloc_socks);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements.
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_key_slow_inc(&memalloc_socks);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_key_slow_dec(&memalloc_socks);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
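/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * swap-over-network style user would bracket the lifetime of its
 * transport socket with the two helpers above, roughly:
 *
 *	sk_set_memalloc(sock->sk);	// may now dip into reserves
 *	... service memory-reclaim I/O over the socket ...
 *	sk_clear_memalloc(sock->sk);	// reclaim, obey rmem limits again
 */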

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned long pflags = current->flags;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	current->flags |= PF_MEMALLOC;
	ret = sk->sk_backlog_rcv(sk, skb);
	tsk_restore_flags(current, pflags, PF_MEMALLOC);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}
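/*
 * Worked example (illustrative, not part of the original file): with
 * HZ == 1000, a request of { .tv_sec = 2, .tv_usec = 500000 } becomes
 * 2 * 1000 + (500000 + 999) / 1000 = 2000 + 500 = 2500 jiffies. The
 * rounding term (1000000/HZ - 1) makes any nonzero microsecond
 * remainder round up, so a timeout is never silently shortened.
 */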

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}

static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	err = sk_filter(sk, skb);
	if (err)
		return err;

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from rcu protected region, make sure we don't leak
	 * a norefcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
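/*
 * Illustrative sketch (hypothetical protocol, not part of the original
 * file): a datagram protocol's input path typically hands a fully built
 * skb straight to this helper and maps the result onto its drop path;
 * note the helper does not free the skb on failure:
 *
 *	static int myproto_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		if (sock_queue_rcv_skb(sk, skb) < 0) {
 *			kfree_skb(skb);		// queue full or filtered
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;	// reader woken via sk_data_ready()
 *	}
 */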

int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);
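/*
 * Illustrative sketch (not part of the original file): transmit paths
 * revalidate a cached route before each use; the cookie is
 * protocol-specific (IPv6, for instance, keeps one per socket):
 *
 *	dst = sk_dst_check(sk, cookie);
 *	if (!dst)
 *		... route was obsoleted: perform a fresh lookup
 *		    and cache it again with sk_dst_set() ...
 */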

static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}
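/*
 * Illustrative sketch (not part of the original file): from userspace the
 * path above is reached with a plain setsockopt() by a process holding
 * CAP_NET_RAW, e.g.:
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *
 * Passing an empty name (or a zero option length) unbinds the socket.
 */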

static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

bool sk_mc_loop(struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	switch (sk->sk_family) {
	case AF_INET:
		return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_sk(sk)->mc_loop;
#endif
	}
	WARN_ON(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		sk->sk_reuseport = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead. Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
#ifdef CONFIG_INET
		if (sk->sk_protocol == IPPROTO_TCP &&
		    sk->sk_type == SOCK_STREAM)
			tcp_set_keepalive(sk, valbool);
#endif
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool) {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}

		if (val & SOF_TIMESTAMPING_OPT_ID &&
		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
			if (sk->sk_protocol == IPPROTO_TCP &&
			    sk->sk_type == SOCK_STREAM) {
				if (sk->sk_state != TCP_ESTABLISHED) {
					ret = -EINVAL;
					break;
				}
				sk->sk_tskey = tcp_sk(sk)->snd_una;
			} else {
				sk->sk_tskey = 0;
			}
		}
		sk->sk_tsflags = val;
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_BPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_attach_bpf(ufd, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_CBPF:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_reuseport_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_EBPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_reuseport_attach_bpf(ufd, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			ret = sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		/* allow unprivileged users to decrease the value */
		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else {
			if (val < 0)
				ret = -EINVAL;
			else
				sk->sk_ll_usec = val;
		}
		break;
#endif

	case SO_MAX_PACING_RATE:
		sk->sk_max_pacing_rate = val;
		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
					 sk->sk_max_pacing_rate);
		break;

	case SO_INCOMING_CPU:
		sk->sk_incoming_cpu = val;
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
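/*
 * Illustrative sketch (not part of the original file): the doubling
 * applied under SO_RCVBUF above is visible to userspace. Assuming the
 * requested value is within sysctl_rmem_max:
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
 *	// val now reads back as 131072: the kernel stored val * 2 to
 *	// cover struct sk_buff and related overhead.
 */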


static void cred_to_ucred(struct pid *pid, const struct cred *cred,
			  struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv = sizeof(v.ling);
		v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger = sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
			!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = sk->sk_tsflags;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
#endif

	case SO_MAX_PACING_RATE:
		v.val = sk->sk_max_pacing_rate;
		break;

	case SO_INCOMING_CPU:
		v.val = sk->sk_incoming_cpu;
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
{
	unsigned long nulls1, nulls2;

	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
	if (nulls1 > nulls2)
		swap(nulls1, nulls2);

	if (nulls1 != 0)
		memset((char *)sk, 0, nulls1);
	memset((char *)sk + nulls1 + sizeof(void *), 0,
	       nulls2 - nulls1 - sizeof(void *));
	memset((char *)sk + nulls2 + sizeof(void *), 0,
	       size - nulls2 - sizeof(void *));
}
EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
				  int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO) {
			if (prot->clear_sk)
				prot->clear_sk(sk, prot->obj_size);
			else
				sk_prot_clear_nulls(sk, prot->obj_size);
		}
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
		cgroup_sk_alloc(&sk->sk_cgrp_data);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	cgroup_sk_free(&sk->sk_cgrp_data);
	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

/**
 * sk_alloc - All socket objects are allocated here
 * @net: the applicable net namespace
 * @family: protocol family
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 * @prot: struct proto associated with this new sock instance
 * @kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sock_lock_init(sk);
		sk->sk_net_refcnt = kern ? 0 : 1;
		if (likely(sk->sk_net_refcnt))
			get_net(net);
		sock_net_set(sk, net);
		atomic_set(&sk->sk_wmem_alloc, 1);

		sock_update_classid(&sk->sk_cgrp_data);
		sock_update_netprioidx(&sk->sk_cgrp_data);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);

void sk_destruct(struct sock *sk)
{
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       atomic_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}
	if (rcu_access_pointer(sk->sk_reuseport_cb))
		reuseport_detach_sock(sk);

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	if (likely(sk->sk_net_refcnt))
		put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

static void __sk_free(struct sock *sk)
{
	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
		sock_diag_broadcast_destroy(sk);
	else
		sk_destruct(sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

/**
 * sk_clone_lock - clone a socket, and lock its clone
 * @sk: the socket to clone
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;
	bool is_charged = true;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		/* SANITY */
		if (likely(newsk->sk_net_refcnt))
			get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
		newsk->sk_backlog.len = 0;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		atomic_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);

		rwlock_init(&newsk->sk_callback_lock);
		lockdep_set_class_and_name(&newsk->sk_callback_lock,
				af_callback_keys + newsk->sk_family,
				af_family_clock_key_strings[newsk->sk_family]);

		newsk->sk_dst_cache = NULL;
		newsk->sk_wmem_queued = 0;
		newsk->sk_forward_alloc = 0;
		newsk->sk_send_head = NULL;
		newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		filter = rcu_dereference_protected(newsk->sk_filter, 1);
		if (filter != NULL)
			/* though it's an empty new sock, the charging may fail
			 * if sysctl_optmem_max was changed between creation of
			 * original socket and cloning
			 */
			is_charged = sk_filter_charge(newsk, filter);

		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
			/* It is still raw copy of parent, so invalidate
			 * destructor and make plain sk_free() */
			newsk->sk_destruct = NULL;
			bh_unlock_sock(newsk);
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}

		newsk->sk_err = 0;
		newsk->sk_priority = 0;
		newsk->sk_incoming_cpu = raw_smp_processor_id();
		atomic64_set(&newsk->sk_cookie, 0);
		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		newsk->sk_wq = NULL;

		if (mem_cgroup_sockets_enabled && sk->sk_memcg)
			sock_update_memcg(newsk);

		if (newsk->sk_prot->sockets_allocated)
			sk_sockets_allocated_inc(newsk);

		if (sock_needs_netstamp(sk) &&
		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
			net_enable_timestamp();
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
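/*
 * Illustrative sketch (hypothetical caller, not part of the original
 * file): the clone is returned bh-locked, so a caller's post-clone setup
 * must drop the lock on every path, including its own failures:
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		if (my_proto_init(newsk) < 0) {	// hypothetical helper
 *			bh_unlock_sock(newsk);
 *			sk_free(newsk);
 *			return NULL;
 *		}
 *		bh_unlock_sock(newsk);
 *	}
 */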

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	u32 max_segs = 1;

	sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		if (dst->header_len) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
		}
	}
	sk->sk_gso_max_segs = max_segs;
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

/*
 *	Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		atomic_sub(len - 1, &sk->sk_wmem_alloc);
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);

void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);
	skb->sk = sk;
#ifdef CONFIG_INET
	if (unlikely(!sk_fullsock(sk))) {
		skb->destructor = sock_edemux;
		sock_hold(sk);
		return;
	}
#endif
	skb->destructor = sock_wfree;
	skb_set_hash_from_sk(skb, sk);
	/*
	 * We used to take a refcount on sk, but following operation
	 * is enough to guarantee sk_free() won't free this sock until
	 * all in-flight packets are completed
	 */
	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);

void skb_orphan_partial(struct sk_buff *skb)
{
	/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
	 * so we do not completely orphan skb, but transfer all
	 * accounted bytes but one, to avoid unexpected reorders.
	 */
	if (skb->destructor == sock_wfree
#ifdef CONFIG_INET
	    || skb->destructor == tcp_wfree
#endif
	    ) {
		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
		skb->truesize = 1;
	} else {
		skb_orphan(skb);
	}
}
EXPORT_SYMBOL(skb_orphan_partial);

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);

/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 */
void sock_efree(struct sk_buff *skb)
{
	sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);

kuid_t sock_i_uid(struct sock *sk)
{
	kuid_t uid;

	read_lock_bh(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
	read_unlock_bh(&sk->sk_callback_lock);
	return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock_bh(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock_bh(&sk->sk_callback_lock);
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	if ((unsigned int)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}
EXPORT_SYMBOL(sock_kmalloc);

/* Free an option memory block. Note, we actually want the inline
 * here as this allows gcc to detect the nullify and fold away the
 * condition entirely.
 */
static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
				  const bool nullify)
{
	if (WARN_ON_ONCE(!mem))
		return;
	if (nullify)
		kzfree(mem);
	else
		kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}

void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);

void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);
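/*
 * Illustrative sketch (not part of the original file): option memory is
 * charged per socket, so a free must quote the same size as the matching
 * allocation. struct my_opt below is a hypothetical per-socket structure:
 *
 *	struct my_opt *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS; // sk_omem_alloc hit sysctl_optmem_max
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));	// or sock_kzfree_s()
 *						// for key material
 */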
1780
1da177e4
LT
1781/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1782 I think, these locks should be removed for datagram sockets.
1783 */
2a91525c 1784static long sock_wait_for_wmem(struct sock *sk, long timeo)
1da177e4
LT
1785{
1786 DEFINE_WAIT(wait);
1787
9cd3e072 1788 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1da177e4
LT
1789 for (;;) {
1790 if (!timeo)
1791 break;
1792 if (signal_pending(current))
1793 break;
1794 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
aa395145 1795 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1da177e4
LT
1796 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1797 break;
1798 if (sk->sk_shutdown & SEND_SHUTDOWN)
1799 break;
1800 if (sk->sk_err)
1801 break;
1802 timeo = schedule_timeout(timeo);
1803 }
aa395145 1804 finish_wait(sk_sleep(sk), &wait);
1da177e4
LT
1805 return timeo;
1806}
1807
1808
1809/*
1810 * Generic send/receive buffer handlers
1811 */
1812
4cc7f68d
HX
1813struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1814 unsigned long data_len, int noblock,
28d64271 1815 int *errcode, int max_page_order)
1da177e4 1816{
2e4e4410 1817 struct sk_buff *skb;
1da177e4
LT
1818 long timeo;
1819 int err;
1820
1da177e4 1821 timeo = sock_sndtimeo(sk, noblock);
2e4e4410 1822 for (;;) {
1da177e4
LT
1823 err = sock_error(sk);
1824 if (err != 0)
1825 goto failure;
1826
1827 err = -EPIPE;
1828 if (sk->sk_shutdown & SEND_SHUTDOWN)
1829 goto failure;
1830
2e4e4410
ED
1831 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1832 break;
28d64271 1833
9cd3e072 1834 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2e4e4410
ED
1835 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1836 err = -EAGAIN;
1837 if (!timeo)
1da177e4 1838 goto failure;
2e4e4410
ED
1839 if (signal_pending(current))
1840 goto interrupted;
1841 timeo = sock_wait_for_wmem(sk, timeo);
1da177e4 1842 }
2e4e4410
ED
1843 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1844 errcode, sk->sk_allocation);
1845 if (skb)
1846 skb_set_owner_w(skb, sk);
1da177e4
LT
1847 return skb;
1848
1849interrupted:
1850 err = sock_intr_errno(timeo);
1851failure:
1852 *errcode = err;
1853 return NULL;
1854}
4cc7f68d 1855EXPORT_SYMBOL(sock_alloc_send_pskb);
1da177e4 1856
4ec93edb 1857struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1da177e4
LT
1858 int noblock, int *errcode)
1859{
28d64271 1860 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1da177e4 1861}
2a91525c 1862EXPORT_SYMBOL(sock_alloc_send_skb);
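/* Illustrative sketch: a datagram sendmsg() path typically blocks for
 * send-buffer space via sock_alloc_send_skb(). The MAX_HEADER reserve and
 * the copy step are assumptions for the example.
 */
static int example_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
    int err;
    struct sk_buff *skb = sock_alloc_send_skb(sk, len + MAX_HEADER,
                                              msg->msg_flags & MSG_DONTWAIT,
                                              &err);

    if (!skb)
        return err;     /* -EAGAIN, -EPIPE or sock_intr_errno() */

    skb_reserve(skb, MAX_HEADER);
    err = memcpy_from_msg(skb_put(skb, len), msg, len);
    if (err) {
        kfree_skb(skb);
        return err;
    }
    /* ... hand the skb to the transmit path ... */
    return len;
}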
1da177e4 1863
f28ea365
EJ
1864int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1865 struct sockcm_cookie *sockc)
1866{
1867 struct cmsghdr *cmsg;
1868
1869 for_each_cmsghdr(cmsg, msg) {
1870 if (!CMSG_OK(msg, cmsg))
1871 return -EINVAL;
1872 if (cmsg->cmsg_level != SOL_SOCKET)
1873 continue;
1874 switch (cmsg->cmsg_type) {
1875 case SO_MARK:
1876 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1877 return -EPERM;
1878 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1879 return -EINVAL;
1880 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1881 break;
1882 default:
1883 return -EINVAL;
1884 }
1885 }
1886 return 0;
1887}
1888EXPORT_SYMBOL(sock_cmsg_send);
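/* Illustrative sketch: a protocol's sendmsg() would run control messages
 * through sock_cmsg_send() before building packets, letting SO_MARK in
 * ancillary data override sk->sk_mark for this one call.
 */
static int example_parse_cmsgs(struct sock *sk, struct msghdr *msg,
                               struct sockcm_cookie *sockc)
{
    sockc->mark = sk->sk_mark;  /* default: the socket-wide mark */
    if (msg->msg_controllen)
        return sock_cmsg_send(sk, msg, sockc);
    return 0;
}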
1889
5640f768
ED
1890/* On 32bit arches, an skb frag is limited to 2^15 bytes */
1891#define SKB_FRAG_PAGE_ORDER get_order(32768)
1892
400dfd3a
ED
1893/**
1894 * skb_page_frag_refill - check that a page_frag contains enough room
1895 * @sz: minimum size of the fragment we want to get
1896 * @pfrag: pointer to page_frag
82d5e2b8 1897 * @gfp: priority for memory allocation
400dfd3a
ED
1898 *
1899 * Note: While this allocator tries to use high order pages, there is
1900 * no guarantee that allocations succeed. Therefore, @sz MUST be
1901 * less than or equal to PAGE_SIZE.
1902 */
d9b2938a 1903bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
5640f768 1904{
5640f768 1905 if (pfrag->page) {
fe896d18 1906 if (page_ref_count(pfrag->page) == 1) {
5640f768
ED
1907 pfrag->offset = 0;
1908 return true;
1909 }
400dfd3a 1910 if (pfrag->offset + sz <= pfrag->size)
5640f768
ED
1911 return true;
1912 put_page(pfrag->page);
1913 }
1914
d9b2938a
ED
1915 pfrag->offset = 0;
1916 if (SKB_FRAG_PAGE_ORDER) {
d0164adc
MG
1917 /* Avoid direct reclaim but allow kswapd to wake */
1918 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
1919 __GFP_COMP | __GFP_NOWARN |
1920 __GFP_NORETRY,
d9b2938a 1921 SKB_FRAG_PAGE_ORDER);
5640f768 1922 if (likely(pfrag->page)) {
d9b2938a 1923 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
5640f768
ED
1924 return true;
1925 }
d9b2938a
ED
1926 }
1927 pfrag->page = alloc_page(gfp);
1928 if (likely(pfrag->page)) {
1929 pfrag->size = PAGE_SIZE;
1930 return true;
1931 }
400dfd3a
ED
1932 return false;
1933}
1934EXPORT_SYMBOL(skb_page_frag_refill);
1935
1936bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1937{
1938 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
1939 return true;
1940
5640f768
ED
1941 sk_enter_memory_pressure(sk);
1942 sk_stream_moderate_sndbuf(sk);
1943 return false;
1944}
1945EXPORT_SYMBOL(sk_page_frag_refill);
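/* Illustrative sketch: a stream protocol appending user data to page frags
 * refills via sk_page_frag_refill() and advances pfrag->offset by what it
 * consumed. Attaching the bytes to an skb frag is elided.
 */
static int example_append(struct sock *sk, struct msghdr *msg, int copy)
{
    struct page_frag *pfrag = sk_page_frag(sk);

    if (!sk_page_frag_refill(sk, pfrag))
        return -EAGAIN;     /* under memory pressure */

    copy = min_t(int, copy, pfrag->size - pfrag->offset);
    if (copy_page_from_iter(pfrag->page, pfrag->offset, copy,
                            &msg->msg_iter) != copy)
        return -EFAULT;

    /* ... reference [pfrag->page + pfrag->offset, copy] from an skb ... */
    pfrag->offset += copy;
    return copy;
}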
1946
1da177e4 1947static void __lock_sock(struct sock *sk)
f39234d6
NK
1948 __releases(&sk->sk_lock.slock)
1949 __acquires(&sk->sk_lock.slock)
1da177e4
LT
1950{
1951 DEFINE_WAIT(wait);
1952
e71a4783 1953 for (;;) {
1da177e4
LT
1954 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1955 TASK_UNINTERRUPTIBLE);
1956 spin_unlock_bh(&sk->sk_lock.slock);
1957 schedule();
1958 spin_lock_bh(&sk->sk_lock.slock);
e71a4783 1959 if (!sock_owned_by_user(sk))
1da177e4
LT
1960 break;
1961 }
1962 finish_wait(&sk->sk_lock.wq, &wait);
1963}
1964
1965static void __release_sock(struct sock *sk)
f39234d6
NK
1966 __releases(&sk->sk_lock.slock)
1967 __acquires(&sk->sk_lock.slock)
1da177e4
LT
1968{
1969 struct sk_buff *skb = sk->sk_backlog.head;
1970
1971 do {
1972 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1973 bh_unlock_sock(sk);
1974
1975 do {
1976 struct sk_buff *next = skb->next;
1977
e4cbb02a 1978 prefetch(next);
7fee226a 1979 WARN_ON_ONCE(skb_dst_is_noref(skb));
1da177e4 1980 skb->next = NULL;
c57943a1 1981 sk_backlog_rcv(sk, skb);
1da177e4
LT
1982
1983 /*
1984 * We are in process context here with softirqs
1985 * disabled, use cond_resched_softirq() to preempt.
1986 * This is safe to do because we've taken the backlog
1987 * queue private:
1988 */
1989 cond_resched_softirq();
1990
1991 skb = next;
1992 } while (skb != NULL);
1993
1994 bh_lock_sock(sk);
e71a4783 1995 } while ((skb = sk->sk_backlog.head) != NULL);
8eae939f
ZY
1996
1997 /*
1998 * Doing the zeroing here guarantees we cannot loop forever
1999 * while a wild producer attempts to flood us.
2000 */
2001 sk->sk_backlog.len = 0;
1da177e4
LT
2002}
2003
2004/**
2005 * sk_wait_data - wait for data to arrive at sk_receive_queue
4dc3b16b
PP
2006 * @sk: sock to wait on
2007 * @timeo: for how long
dfbafc99 2008 * @skb: last skb seen on sk_receive_queue
1da177e4
LT
2009 *
2010 * Now, socket state (including sk->sk_err) is changed only under the lock,
2011 * hence we may omit checks after joining the wait queue.
2012 * We check the receive queue before schedule() only as an optimization;
2013 * it is very likely that release_sock() added new data.
2014 */
dfbafc99 2015int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
1da177e4
LT
2016{
2017 int rc;
2018 DEFINE_WAIT(wait);
2019
aa395145 2020 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
9cd3e072 2021 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
dfbafc99 2022 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
9cd3e072 2023 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
aa395145 2024 finish_wait(sk_sleep(sk), &wait);
1da177e4
LT
2025 return rc;
2026}
1da177e4
LT
2027EXPORT_SYMBOL(sk_wait_data);
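/* Illustrative sketch: a blocking receive loop built on sk_wait_data().
 * The socket is assumed locked by the caller; the actual dequeue and the
 * release_sock()/lock_sock() cycling inside sk_wait_event() are hidden.
 */
static struct sk_buff *example_wait_for_skb(struct sock *sk, int noblock,
                                            int *err)
{
    long timeo = sock_rcvtimeo(sk, noblock);
    struct sk_buff *skb;

    while (!(skb = skb_peek(&sk->sk_receive_queue))) {
        if (!timeo) {
            *err = -EAGAIN;
            return NULL;
        }
        if (signal_pending(current)) {
            *err = sock_intr_errno(timeo);
            return NULL;
        }
        sk_wait_data(sk, &timeo, NULL); /* sleep until the queue changes */
    }
    return skb;
}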
2028
3ab224be
HA
2029/**
2030 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2031 * @sk: socket
2032 * @size: memory size to allocate
2033 * @kind: allocation type
2034 *
2035 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2036 * rmem allocation. This function assumes that protocols which have
2037 * memory_pressure use sk_wmem_queued for write buffer accounting.
2038 */
2039int __sk_mem_schedule(struct sock *sk, int size, int kind)
2040{
2041 struct proto *prot = sk->sk_prot;
2042 int amt = sk_mem_pages(size);
8d987e5c 2043 long allocated;
3ab224be
HA
2044
2045 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
180d8cd9 2046
e805605c
JW
2047 allocated = sk_memory_allocated_add(sk, amt);
2048
baac50bb
JW
2049 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2050 !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
e805605c 2051 goto suppress_allocation;
3ab224be
HA
2052
2053 /* Under limit. */
e805605c 2054 if (allocated <= sk_prot_mem_limits(sk, 0)) {
180d8cd9 2055 sk_leave_memory_pressure(sk);
3ab224be
HA
2056 return 1;
2057 }
2058
e805605c
JW
2059 /* Under pressure. */
2060 if (allocated > sk_prot_mem_limits(sk, 1))
180d8cd9 2061 sk_enter_memory_pressure(sk);
3ab224be 2062
e805605c
JW
2063 /* Over hard limit. */
2064 if (allocated > sk_prot_mem_limits(sk, 2))
3ab224be
HA
2065 goto suppress_allocation;
2066
2067 /* guarantee minimum buffer size under pressure */
2068 if (kind == SK_MEM_RECV) {
2069 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2070 return 1;
180d8cd9 2071
3ab224be
HA
2072 } else { /* SK_MEM_SEND */
2073 if (sk->sk_type == SOCK_STREAM) {
2074 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2075 return 1;
2076 } else if (atomic_read(&sk->sk_wmem_alloc) <
2077 prot->sysctl_wmem[0])
2078 return 1;
2079 }
2080
180d8cd9 2081 if (sk_has_memory_pressure(sk)) {
1748376b
ED
2082 int alloc;
2083
180d8cd9 2084 if (!sk_under_memory_pressure(sk))
1748376b 2085 return 1;
180d8cd9
GC
2086 alloc = sk_sockets_allocated_read_positive(sk);
2087 if (sk_prot_mem_limits(sk, 2) > alloc *
3ab224be
HA
2088 sk_mem_pages(sk->sk_wmem_queued +
2089 atomic_read(&sk->sk_rmem_alloc) +
2090 sk->sk_forward_alloc))
2091 return 1;
2092 }
2093
2094suppress_allocation:
2095
2096 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2097 sk_stream_moderate_sndbuf(sk);
2098
2099 /* Fail only if socket is _under_ its sndbuf.
2100 * In this case we cannot block, so we have to fail.
2101 */
2102 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2103 return 1;
2104 }
2105
3847ce32
SM
2106 trace_sock_exceed_buf_limit(sk, prot, allocated);
2107
3ab224be
HA
2108 /* Alas. Undo changes. */
2109 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
180d8cd9 2110
0e90b31f 2111 sk_memory_allocated_sub(sk, amt);
180d8cd9 2112
baac50bb
JW
2113 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2114 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
e805605c 2115
3ab224be
HA
2116 return 0;
2117}
3ab224be
HA
2118EXPORT_SYMBOL(__sk_mem_schedule);
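/* Illustrative sketch: protocols normally use the inline wrappers
 * (sk_wmem_schedule()/sk_rmem_schedule()), which fall back to
 * __sk_mem_schedule() only when sk_forward_alloc cannot cover the request;
 * charging happens in SK_MEM_QUANTUM (page-sized) units.
 */
static bool example_charge_send(struct sock *sk, struct sk_buff *skb)
{
    if (!sk_wmem_schedule(sk, skb->truesize))
        return false;                   /* over limits: drop or block */
    sk_mem_charge(sk, skb->truesize);   /* consume forward_alloc */
    sk->sk_wmem_queued += skb->truesize;
    return true;
}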
2119
2120/**
69dba9bb 2121 * __sk_mem_reclaim - reclaim memory_allocated
3ab224be 2122 * @sk: socket
1a24e04e 2123 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
3ab224be 2124 */
1a24e04e 2125void __sk_mem_reclaim(struct sock *sk, int amount)
3ab224be 2126{
1a24e04e
ED
2127 amount >>= SK_MEM_QUANTUM_SHIFT;
2128 sk_memory_allocated_sub(sk, amount);
2129 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
3ab224be 2130
baac50bb
JW
2131 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2132 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
e805605c 2133
180d8cd9
GC
2134 if (sk_under_memory_pressure(sk) &&
2135 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2136 sk_leave_memory_pressure(sk);
3ab224be 2137}
3ab224be
HA
2138EXPORT_SYMBOL(__sk_mem_reclaim);
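/* Illustrative sketch: freeing queued data returns bytes to
 * sk_forward_alloc; the inline sk_mem_reclaim() then hands whole quanta
 * back to the protocol's memory_allocated counter via __sk_mem_reclaim()
 * once at least one SK_MEM_QUANTUM is spare.
 */
static void example_uncharge(struct sock *sk, struct sk_buff *skb)
{
    sk->sk_wmem_queued -= skb->truesize;
    sk_mem_uncharge(sk, skb->truesize); /* grows sk_forward_alloc */
    sk_mem_reclaim(sk);                 /* may call __sk_mem_reclaim() */
    __kfree_skb(skb);
}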
2139
2140
1da177e4
LT
2141/*
2142 * Set of default routines for initialising struct proto_ops when
2143 * the protocol does not support a particular function. In certain
2144 * cases where it makes no sense for a protocol to have a "do nothing"
2145 * function, some default processing is provided.
2146 */
2147
2148int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2149{
2150 return -EOPNOTSUPP;
2151}
2a91525c 2152EXPORT_SYMBOL(sock_no_bind);
1da177e4 2153
4ec93edb 2154int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1da177e4
LT
2155 int len, int flags)
2156{
2157 return -EOPNOTSUPP;
2158}
2a91525c 2159EXPORT_SYMBOL(sock_no_connect);
1da177e4
LT
2160
2161int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2162{
2163 return -EOPNOTSUPP;
2164}
2a91525c 2165EXPORT_SYMBOL(sock_no_socketpair);
1da177e4
LT
2166
2167int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2168{
2169 return -EOPNOTSUPP;
2170}
2a91525c 2171EXPORT_SYMBOL(sock_no_accept);
1da177e4 2172
4ec93edb 2173int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1da177e4
LT
2174 int *len, int peer)
2175{
2176 return -EOPNOTSUPP;
2177}
2a91525c 2178EXPORT_SYMBOL(sock_no_getname);
1da177e4 2179
2a91525c 2180unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1da177e4
LT
2181{
2182 return 0;
2183}
2a91525c 2184EXPORT_SYMBOL(sock_no_poll);
1da177e4
LT
2185
2186int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2187{
2188 return -EOPNOTSUPP;
2189}
2a91525c 2190EXPORT_SYMBOL(sock_no_ioctl);
1da177e4
LT
2191
2192int sock_no_listen(struct socket *sock, int backlog)
2193{
2194 return -EOPNOTSUPP;
2195}
2a91525c 2196EXPORT_SYMBOL(sock_no_listen);
1da177e4
LT
2197
2198int sock_no_shutdown(struct socket *sock, int how)
2199{
2200 return -EOPNOTSUPP;
2201}
2a91525c 2202EXPORT_SYMBOL(sock_no_shutdown);
1da177e4
LT
2203
2204int sock_no_setsockopt(struct socket *sock, int level, int optname,
b7058842 2205 char __user *optval, unsigned int optlen)
1da177e4
LT
2206{
2207 return -EOPNOTSUPP;
2208}
2a91525c 2209EXPORT_SYMBOL(sock_no_setsockopt);
1da177e4
LT
2210
2211int sock_no_getsockopt(struct socket *sock, int level, int optname,
2212 char __user *optval, int __user *optlen)
2213{
2214 return -EOPNOTSUPP;
2215}
2a91525c 2216EXPORT_SYMBOL(sock_no_getsockopt);
1da177e4 2217
1b784140 2218int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
1da177e4
LT
2219{
2220 return -EOPNOTSUPP;
2221}
2a91525c 2222EXPORT_SYMBOL(sock_no_sendmsg);
1da177e4 2223
1b784140
YX
2224int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2225 int flags)
1da177e4
LT
2226{
2227 return -EOPNOTSUPP;
2228}
2a91525c 2229EXPORT_SYMBOL(sock_no_recvmsg);
1da177e4
LT
2230
2231int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2232{
2233 /* Mirror missing mmap method error code */
2234 return -ENODEV;
2235}
2a91525c 2236EXPORT_SYMBOL(sock_no_mmap);
1da177e4
LT
2237
2238ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2239{
2240 ssize_t res;
2241 struct msghdr msg = {.msg_flags = flags};
2242 struct kvec iov;
2243 char *kaddr = kmap(page);
2244 iov.iov_base = kaddr + offset;
2245 iov.iov_len = size;
2246 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2247 kunmap(page);
2248 return res;
2249}
2a91525c 2250EXPORT_SYMBOL(sock_no_sendpage);
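/* Illustrative sketch: an address family plugs these stubs into its
 * proto_ops for whatever it does not support. AF_UNSPEC is a placeholder;
 * a real family would also fill in bind/sendmsg/recvmsg etc.
 */
static const struct proto_ops example_dgram_ops = {
    .family     = AF_UNSPEC,
    .owner      = THIS_MODULE,
    .listen     = sock_no_listen,
    .accept     = sock_no_accept,
    .socketpair = sock_no_socketpair,
    .mmap       = sock_no_mmap,
    .sendpage   = sock_no_sendpage,
};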
1da177e4
LT
2251
2252/*
2253 * Default Socket Callbacks
2254 */
2255
2256static void sock_def_wakeup(struct sock *sk)
2257{
43815482
ED
2258 struct socket_wq *wq;
2259
2260 rcu_read_lock();
2261 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2262 if (skwq_has_sleeper(wq))
43815482
ED
2263 wake_up_interruptible_all(&wq->wait);
2264 rcu_read_unlock();
1da177e4
LT
2265}
2266
2267static void sock_def_error_report(struct sock *sk)
2268{
43815482
ED
2269 struct socket_wq *wq;
2270
2271 rcu_read_lock();
2272 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2273 if (skwq_has_sleeper(wq))
43815482 2274 wake_up_interruptible_poll(&wq->wait, POLLERR);
8d8ad9d7 2275 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
43815482 2276 rcu_read_unlock();
1da177e4
LT
2277}
2278
676d2369 2279static void sock_def_readable(struct sock *sk)
1da177e4 2280{
43815482
ED
2281 struct socket_wq *wq;
2282
2283 rcu_read_lock();
2284 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2285 if (skwq_has_sleeper(wq))
2c6607c6 2286 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
37e5540b 2287 POLLRDNORM | POLLRDBAND);
8d8ad9d7 2288 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
43815482 2289 rcu_read_unlock();
1da177e4
LT
2290}
2291
2292static void sock_def_write_space(struct sock *sk)
2293{
43815482
ED
2294 struct socket_wq *wq;
2295
2296 rcu_read_lock();
1da177e4
LT
2297
2298 /* Do not wake up a writer until he can make "significant"
2299 * progress. --DaveM
2300 */
e71a4783 2301 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
43815482 2302 wq = rcu_dereference(sk->sk_wq);
1ce0bf50 2303 if (skwq_has_sleeper(wq))
43815482 2304 wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
37e5540b 2305 POLLWRNORM | POLLWRBAND);
1da177e4
LT
2306
2307 /* Should agree with poll, otherwise some programs break */
2308 if (sock_writeable(sk))
8d8ad9d7 2309 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1da177e4
LT
2310 }
2311
43815482 2312 rcu_read_unlock();
1da177e4
LT
2313}
2314
2315static void sock_def_destruct(struct sock *sk)
2316{
1da177e4
LT
2317}
2318
2319void sk_send_sigurg(struct sock *sk)
2320{
2321 if (sk->sk_socket && sk->sk_socket->file)
2322 if (send_sigurg(&sk->sk_socket->file->f_owner))
8d8ad9d7 2323 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1da177e4 2324}
2a91525c 2325EXPORT_SYMBOL(sk_send_sigurg);
1da177e4
LT
2326
2327void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2328 unsigned long expires)
2329{
2330 if (!mod_timer(timer, expires))
2331 sock_hold(sk);
2332}
1da177e4
LT
2333EXPORT_SYMBOL(sk_reset_timer);
2334
2335void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2336{
25cc4ae9 2337 if (del_timer(timer))
1da177e4
LT
2338 __sock_put(sk);
2339}
1da177e4
LT
2340EXPORT_SYMBOL(sk_stop_timer);
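/* Illustrative sketch: sk_reset_timer() takes a reference on the sock only
 * when the timer was not already pending, so the handler drops exactly one
 * reference when it fires (old setup_timer()-era callback signature
 * assumed).
 */
static void example_timer_handler(unsigned long data)
{
    struct sock *sk = (struct sock *)data;

    bh_lock_sock(sk);
    /* ... protocol timeout work ... */
    bh_unlock_sock(sk);
    sock_put(sk);   /* pairs with sock_hold() in sk_reset_timer() */
}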
2341
2342void sock_init_data(struct socket *sock, struct sock *sk)
2343{
2344 skb_queue_head_init(&sk->sk_receive_queue);
2345 skb_queue_head_init(&sk->sk_write_queue);
2346 skb_queue_head_init(&sk->sk_error_queue);
2347
2348 sk->sk_send_head = NULL;
2349
2350 init_timer(&sk->sk_timer);
4ec93edb 2351
1da177e4
LT
2352 sk->sk_allocation = GFP_KERNEL;
2353 sk->sk_rcvbuf = sysctl_rmem_default;
2354 sk->sk_sndbuf = sysctl_wmem_default;
2355 sk->sk_state = TCP_CLOSE;
972692e0 2356 sk_set_socket(sk, sock);
1da177e4
LT
2357
2358 sock_set_flag(sk, SOCK_ZAPPED);
2359
e71a4783 2360 if (sock) {
1da177e4 2361 sk->sk_type = sock->type;
43815482 2362 sk->sk_wq = sock->wq;
1da177e4
LT
2363 sock->sk = sk;
2364 } else
43815482 2365 sk->sk_wq = NULL;
1da177e4 2366
1da177e4 2367 rwlock_init(&sk->sk_callback_lock);
443aef0e
PZ
2368 lockdep_set_class_and_name(&sk->sk_callback_lock,
2369 af_callback_keys + sk->sk_family,
2370 af_family_clock_key_strings[sk->sk_family]);
1da177e4
LT
2371
2372 sk->sk_state_change = sock_def_wakeup;
2373 sk->sk_data_ready = sock_def_readable;
2374 sk->sk_write_space = sock_def_write_space;
2375 sk->sk_error_report = sock_def_error_report;
2376 sk->sk_destruct = sock_def_destruct;
2377
5640f768
ED
2378 sk->sk_frag.page = NULL;
2379 sk->sk_frag.offset = 0;
ef64a54f 2380 sk->sk_peek_off = -1;
1da177e4 2381
109f6e39
EB
2382 sk->sk_peer_pid = NULL;
2383 sk->sk_peer_cred = NULL;
1da177e4
LT
2384 sk->sk_write_pending = 0;
2385 sk->sk_rcvlowat = 1;
2386 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2387 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2388
f37f0afb 2389 sk->sk_stamp = ktime_set(-1L, 0);
1da177e4 2390
e0d1095a 2391#ifdef CONFIG_NET_RX_BUSY_POLL
06021292 2392 sk->sk_napi_id = 0;
64b0dc51 2393 sk->sk_ll_usec = sysctl_net_busy_read;
06021292
ET
2394#endif
2395
62748f32 2396 sk->sk_max_pacing_rate = ~0U;
7eec4174 2397 sk->sk_pacing_rate = ~0U;
70da268b 2398 sk->sk_incoming_cpu = -1;
4dc6dc71
ED
2399 /*
2400 * Before updating sk_refcnt, we must commit prior changes to memory
2401 * (Documentation/RCU/rculist_nulls.txt for details)
2402 */
2403 smp_wmb();
1da177e4 2404 atomic_set(&sk->sk_refcnt, 1);
33c732c3 2405 atomic_set(&sk->sk_drops, 0);
1da177e4 2406}
2a91525c 2407EXPORT_SYMBOL(sock_init_data);
1da177e4 2408
b5606c2d 2409void lock_sock_nested(struct sock *sk, int subclass)
1da177e4
LT
2410{
2411 might_sleep();
a5b5bb9a 2412 spin_lock_bh(&sk->sk_lock.slock);
d2e9117c 2413 if (sk->sk_lock.owned)
1da177e4 2414 __lock_sock(sk);
d2e9117c 2415 sk->sk_lock.owned = 1;
a5b5bb9a
IM
2416 spin_unlock(&sk->sk_lock.slock);
2417 /*
2418 * The sk_lock has mutex_lock() semantics here:
2419 */
fcc70d5f 2420 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
a5b5bb9a 2421 local_bh_enable();
1da177e4 2422}
fcc70d5f 2423EXPORT_SYMBOL(lock_sock_nested);
1da177e4 2424
b5606c2d 2425void release_sock(struct sock *sk)
1da177e4 2426{
a5b5bb9a
IM
2427 /*
2428 * The sk_lock has mutex_unlock() semantics:
2429 */
2430 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2431
2432 spin_lock_bh(&sk->sk_lock.slock);
1da177e4
LT
2433 if (sk->sk_backlog.tail)
2434 __release_sock(sk);
46d3ceab 2435
c3f9b018
ED
2436 /* Warning : release_cb() might need to release sk ownership,
2437 * i.e. call sock_release_ownership(sk) before us.
2438 */
46d3ceab
ED
2439 if (sk->sk_prot->release_cb)
2440 sk->sk_prot->release_cb(sk);
2441
c3f9b018 2442 sock_release_ownership(sk);
a5b5bb9a
IM
2443 if (waitqueue_active(&sk->sk_lock.wq))
2444 wake_up(&sk->sk_lock.wq);
2445 spin_unlock_bh(&sk->sk_lock.slock);
1da177e4
LT
2446}
2447EXPORT_SYMBOL(release_sock);
2448
8a74ad60
ED
2449/**
2450 * lock_sock_fast - fast version of lock_sock
2451 * @sk: socket
2452 *
2453 * This version should be used for very small sections, where the process won't block.
2454 * Returns false if the fast path is taken:
2455 * sk_lock.slock locked, owned = 0, BH disabled
2456 * Returns true if the slow path is taken:
2457 * sk_lock.slock unlocked, owned = 1, BH enabled
2458 */
2459bool lock_sock_fast(struct sock *sk)
2460{
2461 might_sleep();
2462 spin_lock_bh(&sk->sk_lock.slock);
2463
2464 if (!sk->sk_lock.owned)
2465 /*
2466 * Note: we return with BH disabled; unlock_sock_fast() re-enables them
2467 */
2468 return false;
2469
2470 __lock_sock(sk);
2471 sk->sk_lock.owned = 1;
2472 spin_unlock(&sk->sk_lock.slock);
2473 /*
2474 * The sk_lock has mutex_lock() semantics here:
2475 */
2476 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2477 local_bh_enable();
2478 return true;
2479}
2480EXPORT_SYMBOL(lock_sock_fast);
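/* Illustrative sketch: the boolean from lock_sock_fast() is handed straight
 * to unlock_sock_fast(), which either re-enables BHs (fast path) or does a
 * full release_sock() (slow path).
 */
static int example_peek_len(struct sock *sk)
{
    bool slow = lock_sock_fast(sk);
    struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
    int len = skb ? skb->len : 0;

    unlock_sock_fast(sk, slow);
    return len;
}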
2481
1da177e4 2482int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
4ec93edb 2483{
b7aa0bf7 2484 struct timeval tv;
1da177e4 2485 if (!sock_flag(sk, SOCK_TIMESTAMP))
20d49473 2486 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
b7aa0bf7
ED
2487 tv = ktime_to_timeval(sk->sk_stamp);
2488 if (tv.tv_sec == -1)
1da177e4 2489 return -ENOENT;
b7aa0bf7
ED
2490 if (tv.tv_sec == 0) {
2491 sk->sk_stamp = ktime_get_real();
2492 tv = ktime_to_timeval(sk->sk_stamp);
2493 }
2494 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
4ec93edb 2495}
1da177e4
LT
2496EXPORT_SYMBOL(sock_get_timestamp);
2497
ae40eb1e
ED
2498int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2499{
2500 struct timespec ts;
2501 if (!sock_flag(sk, SOCK_TIMESTAMP))
20d49473 2502 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
ae40eb1e
ED
2503 ts = ktime_to_timespec(sk->sk_stamp);
2504 if (ts.tv_sec == -1)
2505 return -ENOENT;
2506 if (ts.tv_sec == 0) {
2507 sk->sk_stamp = ktime_get_real();
2508 ts = ktime_to_timespec(sk->sk_stamp);
2509 }
2510 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2511}
2512EXPORT_SYMBOL(sock_get_timestampns);
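/* Illustrative sketch (userspace side): these two helpers back the
 * SIOCGSTAMP/SIOCGSTAMPNS ioctls, which report the receive time of the
 * last packet read from the socket; errno is ENOENT if no packet was ever
 * stamped.
 */
#include <sys/ioctl.h>
#include <sys/time.h>
#include <linux/sockios.h>

static int example_last_rx_time(int fd, struct timeval *tv)
{
    return ioctl(fd, SIOCGSTAMP, tv);
}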
2513
20d49473 2514void sock_enable_timestamp(struct sock *sk, int flag)
4ec93edb 2515{
20d49473 2516 if (!sock_flag(sk, flag)) {
08e29af3
ED
2517 unsigned long previous_flags = sk->sk_flags;
2518
20d49473
PO
2519 sock_set_flag(sk, flag);
2520 /*
2521 * We just set one of the two flags that require net
2522 * time stamping, but time stamping might already have
2523 * been on because of the other one.
2524 */
080a270f
HFS
2525 if (sock_needs_netstamp(sk) &&
2526 !(previous_flags & SK_FLAGS_TIMESTAMP))
20d49473 2527 net_enable_timestamp();
1da177e4
LT
2528 }
2529}
1da177e4 2530
cb820f8e
RC
2531int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2532 int level, int type)
2533{
2534 struct sock_exterr_skb *serr;
364a9e93 2535 struct sk_buff *skb;
cb820f8e
RC
2536 int copied, err;
2537
2538 err = -EAGAIN;
364a9e93 2539 skb = sock_dequeue_err_skb(sk);
cb820f8e
RC
2540 if (skb == NULL)
2541 goto out;
2542
2543 copied = skb->len;
2544 if (copied > len) {
2545 msg->msg_flags |= MSG_TRUNC;
2546 copied = len;
2547 }
51f3d02b 2548 err = skb_copy_datagram_msg(skb, 0, msg, copied);
cb820f8e
RC
2549 if (err)
2550 goto out_free_skb;
2551
2552 sock_recv_timestamp(msg, sk, skb);
2553
2554 serr = SKB_EXT_ERR(skb);
2555 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2556
2557 msg->msg_flags |= MSG_ERRQUEUE;
2558 err = copied;
2559
cb820f8e
RC
2560out_free_skb:
2561 kfree_skb(skb);
2562out:
2563 return err;
2564}
2565EXPORT_SYMBOL(sock_recv_errqueue);
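/* Illustrative sketch: a protocol without its own error-queue handling can
 * branch to sock_recv_errqueue() when MSG_ERRQUEUE is requested; the
 * level/type pair put into the cmsg is protocol-specific (SOL_IP/IP_RECVERR
 * here is only an example).
 */
static int example_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
                           int flags)
{
    if (flags & MSG_ERRQUEUE)
        return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR);
    /* ... normal receive path ... */
    return -EAGAIN;
}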
2566
1da177e4
LT
2567/*
2568 * Get a socket option on a socket.
2569 *
2570 * FIX: POSIX 1003.1g is very ambiguous here. It states that
2571 * asynchronous errors should be reported by getsockopt. We assume
2572 * this means when you specify SO_ERROR (otherwise what's the point of it).
2573 */
2574int sock_common_getsockopt(struct socket *sock, int level, int optname,
2575 char __user *optval, int __user *optlen)
2576{
2577 struct sock *sk = sock->sk;
2578
2579 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2580}
1da177e4
LT
2581EXPORT_SYMBOL(sock_common_getsockopt);
2582
3fdadf7d 2583#ifdef CONFIG_COMPAT
543d9cfe
ACM
2584int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2585 char __user *optval, int __user *optlen)
3fdadf7d
DM
2586{
2587 struct sock *sk = sock->sk;
2588
1e51f951 2589 if (sk->sk_prot->compat_getsockopt != NULL)
543d9cfe
ACM
2590 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2591 optval, optlen);
3fdadf7d
DM
2592 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2593}
2594EXPORT_SYMBOL(compat_sock_common_getsockopt);
2595#endif
2596
1b784140
YX
2597int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2598 int flags)
1da177e4
LT
2599{
2600 struct sock *sk = sock->sk;
2601 int addr_len = 0;
2602 int err;
2603
1b784140 2604 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
1da177e4
LT
2605 flags & ~MSG_DONTWAIT, &addr_len);
2606 if (err >= 0)
2607 msg->msg_namelen = addr_len;
2608 return err;
2609}
1da177e4
LT
2610EXPORT_SYMBOL(sock_common_recvmsg);
2611
2612/*
2613 * Set socket options on an inet socket.
2614 */
2615int sock_common_setsockopt(struct socket *sock, int level, int optname,
b7058842 2616 char __user *optval, unsigned int optlen)
1da177e4
LT
2617{
2618 struct sock *sk = sock->sk;
2619
2620 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2621}
1da177e4
LT
2622EXPORT_SYMBOL(sock_common_setsockopt);
2623
3fdadf7d 2624#ifdef CONFIG_COMPAT
543d9cfe 2625int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
b7058842 2626 char __user *optval, unsigned int optlen)
3fdadf7d
DM
2627{
2628 struct sock *sk = sock->sk;
2629
543d9cfe
ACM
2630 if (sk->sk_prot->compat_setsockopt != NULL)
2631 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2632 optval, optlen);
3fdadf7d
DM
2633 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2634}
2635EXPORT_SYMBOL(compat_sock_common_setsockopt);
2636#endif
2637
1da177e4
LT
2638void sk_common_release(struct sock *sk)
2639{
2640 if (sk->sk_prot->destroy)
2641 sk->sk_prot->destroy(sk);
2642
2643 /*
2644 * Observation: when sk_common_release is called, processes have
2645 * no access to the socket, but the net still has.
2646 * Step one, detach it from networking:
2647 *
2648 * A. Remove from hash tables.
2649 */
2650
2651 sk->sk_prot->unhash(sk);
2652
2653 /*
2654 * At this point the socket cannot receive new packets, but it is possible
2655 * that some packets are in flight, because some CPU runs the receiver and
2656 * did the hash table lookup before we unhashed the socket. They will reach
2657 * the receive queue and be purged by the socket destructor.
2658 *
2659 * Also, we still have packets pending on the receive queue and probably
2660 * our own packets waiting in device queues. sock_destroy will drain the
2661 * receive queue, but transmitted packets will delay socket destruction
2662 * until the last reference is released.
2663 */
2664
2665 sock_orphan(sk);
2666
2667 xfrm_sk_free_policy(sk);
2668
e6848976 2669 sk_refcnt_debug_release(sk);
5640f768
ED
2670
2671 if (sk->sk_frag.page) {
2672 put_page(sk->sk_frag.page);
2673 sk->sk_frag.page = NULL;
2674 }
2675
1da177e4
LT
2676 sock_put(sk);
2677}
1da177e4
LT
2678EXPORT_SYMBOL(sk_common_release);
2679
13ff3d6f
PE
2680#ifdef CONFIG_PROC_FS
2681#define PROTO_INUSE_NR 64 /* should be enough for the first time */
1338d466
PE
2682struct prot_inuse {
2683 int val[PROTO_INUSE_NR];
2684};
13ff3d6f
PE
2685
2686static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
70ee1159
PE
2687
2688#ifdef CONFIG_NET_NS
2689void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2690{
d6d9ca0f 2691 __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
70ee1159
PE
2692}
2693EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2694
2695int sock_prot_inuse_get(struct net *net, struct proto *prot)
2696{
2697 int cpu, idx = prot->inuse_idx;
2698 int res = 0;
2699
2700 for_each_possible_cpu(cpu)
2701 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2702
2703 return res >= 0 ? res : 0;
2704}
2705EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2706
2c8c1e72 2707static int __net_init sock_inuse_init_net(struct net *net)
70ee1159
PE
2708{
2709 net->core.inuse = alloc_percpu(struct prot_inuse);
2710 return net->core.inuse ? 0 : -ENOMEM;
2711}
2712
2c8c1e72 2713static void __net_exit sock_inuse_exit_net(struct net *net)
70ee1159
PE
2714{
2715 free_percpu(net->core.inuse);
2716}
2717
2718static struct pernet_operations net_inuse_ops = {
2719 .init = sock_inuse_init_net,
2720 .exit = sock_inuse_exit_net,
2721};
2722
2723static __init int net_inuse_init(void)
2724{
2725 if (register_pernet_subsys(&net_inuse_ops))
2726 panic("Cannot initialize net inuse counters");
2727
2728 return 0;
2729}
2730
2731core_initcall(net_inuse_init);
2732#else
1338d466
PE
2733static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2734
c29a0bc4 2735void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
1338d466 2736{
d6d9ca0f 2737 __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
1338d466
PE
2738}
2739EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2740
c29a0bc4 2741int sock_prot_inuse_get(struct net *net, struct proto *prot)
1338d466
PE
2742{
2743 int cpu, idx = prot->inuse_idx;
2744 int res = 0;
2745
2746 for_each_possible_cpu(cpu)
2747 res += per_cpu(prot_inuse, cpu).val[idx];
2748
2749 return res >= 0 ? res : 0;
2750}
2751EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
70ee1159 2752#endif
13ff3d6f
PE
2753
2754static void assign_proto_idx(struct proto *prot)
2755{
2756 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2757
2758 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
e005d193 2759 pr_err("PROTO_INUSE_NR exhausted\n");
13ff3d6f
PE
2760 return;
2761 }
2762
2763 set_bit(prot->inuse_idx, proto_inuse_idx);
2764}
2765
2766static void release_proto_idx(struct proto *prot)
2767{
2768 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2769 clear_bit(prot->inuse_idx, proto_inuse_idx);
2770}
2771#else
2772static inline void assign_proto_idx(struct proto *prot)
2773{
2774}
2775
2776static inline void release_proto_idx(struct proto *prot)
2777{
2778}
2779#endif
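/* Illustrative sketch: protocols feed these counters from their
 * hash()/unhash() callbacks; /proc/net/protocols then sums the per-cpu
 * values through sock_prot_inuse_get(). Only meaningful under
 * CONFIG_PROC_FS.
 */
static void example_hash(struct sock *sk)
{
    sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}

static void example_unhash(struct sock *sk)
{
    sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}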
2780
0159dfd3
ED
2781static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2782{
2783 if (!rsk_prot)
2784 return;
2785 kfree(rsk_prot->slab_name);
2786 rsk_prot->slab_name = NULL;
adf78eda
JL
2787 kmem_cache_destroy(rsk_prot->slab);
2788 rsk_prot->slab = NULL;
0159dfd3
ED
2789}
2790
2791static int req_prot_init(const struct proto *prot)
2792{
2793 struct request_sock_ops *rsk_prot = prot->rsk_prot;
2794
2795 if (!rsk_prot)
2796 return 0;
2797
2798 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2799 prot->name);
2800 if (!rsk_prot->slab_name)
2801 return -ENOMEM;
2802
2803 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2804 rsk_prot->obj_size, 0,
e96f78ab 2805 prot->slab_flags, NULL);
0159dfd3
ED
2806
2807 if (!rsk_prot->slab) {
2808 pr_crit("%s: Can't create request sock SLAB cache!\n",
2809 prot->name);
2810 return -ENOMEM;
2811 }
2812 return 0;
2813}
2814
b733c007
PE
2815int proto_register(struct proto *prot, int alloc_slab)
2816{
1da177e4
LT
2817 if (alloc_slab) {
2818 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
271b72c7
ED
2819 SLAB_HWCACHE_ALIGN | prot->slab_flags,
2820 NULL);
1da177e4
LT
2821
2822 if (prot->slab == NULL) {
e005d193
JP
2823 pr_crit("%s: Can't create sock SLAB cache!\n",
2824 prot->name);
60e7663d 2825 goto out;
1da177e4 2826 }
2e6599cb 2827
0159dfd3
ED
2828 if (req_prot_init(prot))
2829 goto out_free_request_sock_slab;
8feaf0c0 2830
6d6ee43e 2831 if (prot->twsk_prot != NULL) {
faf23422 2832 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
8feaf0c0 2833
7e56b5d6 2834 if (prot->twsk_prot->twsk_slab_name == NULL)
8feaf0c0
ACM
2835 goto out_free_request_sock_slab;
2836
6d6ee43e 2837 prot->twsk_prot->twsk_slab =
7e56b5d6 2838 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
6d6ee43e 2839 prot->twsk_prot->twsk_obj_size,
3ab5aee7 2840 0,
52db70dc 2841 prot->slab_flags,
20c2df83 2842 NULL);
6d6ee43e 2843 if (prot->twsk_prot->twsk_slab == NULL)
8feaf0c0
ACM
2844 goto out_free_timewait_sock_slab_name;
2845 }
1da177e4
LT
2846 }
2847
36b77a52 2848 mutex_lock(&proto_list_mutex);
1da177e4 2849 list_add(&prot->node, &proto_list);
13ff3d6f 2850 assign_proto_idx(prot);
36b77a52 2851 mutex_unlock(&proto_list_mutex);
b733c007
PE
2852 return 0;
2853
8feaf0c0 2854out_free_timewait_sock_slab_name:
7e56b5d6 2855 kfree(prot->twsk_prot->twsk_slab_name);
8feaf0c0 2856out_free_request_sock_slab:
0159dfd3
ED
2857 req_prot_cleanup(prot->rsk_prot);
2858
2e6599cb
ACM
2859 kmem_cache_destroy(prot->slab);
2860 prot->slab = NULL;
b733c007
PE
2861out:
2862 return -ENOBUFS;
1da177e4 2863}
1da177e4
LT
2864EXPORT_SYMBOL(proto_register);
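/* Illustrative sketch: a minimal proto registration from module init. The
 * field values are assumptions; a real protocol also registers a struct
 * net_proto_family so userspace can create sockets of its family.
 */
static struct proto example_proto = {
    .name     = "EXAMPLE",
    .owner    = THIS_MODULE,
    .obj_size = sizeof(struct sock),
};

static int __init example_init(void)
{
    /* alloc_slab=1: back sk_alloc() with a dedicated kmem_cache */
    return proto_register(&example_proto, 1);
}

static void __exit example_exit(void)
{
    proto_unregister(&example_proto);
}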
2865
2866void proto_unregister(struct proto *prot)
2867{
36b77a52 2868 mutex_lock(&proto_list_mutex);
13ff3d6f 2869 release_proto_idx(prot);
0a3f4358 2870 list_del(&prot->node);
36b77a52 2871 mutex_unlock(&proto_list_mutex);
1da177e4 2872
adf78eda
JL
2873 kmem_cache_destroy(prot->slab);
2874 prot->slab = NULL;
1da177e4 2875
0159dfd3 2876 req_prot_cleanup(prot->rsk_prot);
2e6599cb 2877
6d6ee43e 2878 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
6d6ee43e 2879 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
7e56b5d6 2880 kfree(prot->twsk_prot->twsk_slab_name);
6d6ee43e 2881 prot->twsk_prot->twsk_slab = NULL;
8feaf0c0 2882 }
1da177e4 2883}
1da177e4
LT
2884EXPORT_SYMBOL(proto_unregister);
2885
2886#ifdef CONFIG_PROC_FS
1da177e4 2887static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
36b77a52 2888 __acquires(proto_list_mutex)
1da177e4 2889{
36b77a52 2890 mutex_lock(&proto_list_mutex);
60f0438a 2891 return seq_list_start_head(&proto_list, *pos);
1da177e4
LT
2892}
2893
2894static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2895{
60f0438a 2896 return seq_list_next(v, &proto_list, pos);
1da177e4
LT
2897}
2898
2899static void proto_seq_stop(struct seq_file *seq, void *v)
36b77a52 2900 __releases(proto_list_mutex)
1da177e4 2901{
36b77a52 2902 mutex_unlock(&proto_list_mutex);
1da177e4
LT
2903}
2904
2905static char proto_method_implemented(const void *method)
2906{
2907 return method == NULL ? 'n' : 'y';
2908}
180d8cd9
GC
2909static long sock_prot_memory_allocated(struct proto *proto)
2910{
cb75a36c 2911 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
180d8cd9
GC
2912}
2913
2914static char *sock_prot_memory_pressure(struct proto *proto)
2915{
2916 return proto->memory_pressure != NULL ?
2917 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2918}
1da177e4
LT
2919
2920static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2921{
180d8cd9 2922
8d987e5c 2923 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
1da177e4
LT
2924 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2925 proto->name,
2926 proto->obj_size,
14e943db 2927 sock_prot_inuse_get(seq_file_net(seq), proto),
180d8cd9
GC
2928 sock_prot_memory_allocated(proto),
2929 sock_prot_memory_pressure(proto),
1da177e4
LT
2930 proto->max_header,
2931 proto->slab == NULL ? "no" : "yes",
2932 module_name(proto->owner),
2933 proto_method_implemented(proto->close),
2934 proto_method_implemented(proto->connect),
2935 proto_method_implemented(proto->disconnect),
2936 proto_method_implemented(proto->accept),
2937 proto_method_implemented(proto->ioctl),
2938 proto_method_implemented(proto->init),
2939 proto_method_implemented(proto->destroy),
2940 proto_method_implemented(proto->shutdown),
2941 proto_method_implemented(proto->setsockopt),
2942 proto_method_implemented(proto->getsockopt),
2943 proto_method_implemented(proto->sendmsg),
2944 proto_method_implemented(proto->recvmsg),
2945 proto_method_implemented(proto->sendpage),
2946 proto_method_implemented(proto->bind),
2947 proto_method_implemented(proto->backlog_rcv),
2948 proto_method_implemented(proto->hash),
2949 proto_method_implemented(proto->unhash),
2950 proto_method_implemented(proto->get_port),
2951 proto_method_implemented(proto->enter_memory_pressure));
2952}
2953
2954static int proto_seq_show(struct seq_file *seq, void *v)
2955{
60f0438a 2956 if (v == &proto_list)
1da177e4
LT
2957 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2958 "protocol",
2959 "size",
2960 "sockets",
2961 "memory",
2962 "press",
2963 "maxhdr",
2964 "slab",
2965 "module",
2966 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2967 else
60f0438a 2968 proto_seq_printf(seq, list_entry(v, struct proto, node));
1da177e4
LT
2969 return 0;
2970}
2971
f690808e 2972static const struct seq_operations proto_seq_ops = {
1da177e4
LT
2973 .start = proto_seq_start,
2974 .next = proto_seq_next,
2975 .stop = proto_seq_stop,
2976 .show = proto_seq_show,
2977};
2978
2979static int proto_seq_open(struct inode *inode, struct file *file)
2980{
14e943db
ED
2981 return seq_open_net(inode, file, &proto_seq_ops,
2982 sizeof(struct seq_net_private));
1da177e4
LT
2983}
2984
9a32144e 2985static const struct file_operations proto_seq_fops = {
1da177e4
LT
2986 .owner = THIS_MODULE,
2987 .open = proto_seq_open,
2988 .read = seq_read,
2989 .llseek = seq_lseek,
14e943db
ED
2990 .release = seq_release_net,
2991};
2992
2993static __net_init int proto_init_net(struct net *net)
2994{
d4beaa66 2995 if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
14e943db
ED
2996 return -ENOMEM;
2997
2998 return 0;
2999}
3000
3001static __net_exit void proto_exit_net(struct net *net)
3002{
ece31ffd 3003 remove_proc_entry("protocols", net->proc_net);
14e943db
ED
3004}
3005
3006
3007static __net_initdata struct pernet_operations proto_net_ops = {
3008 .init = proto_init_net,
3009 .exit = proto_exit_net,
1da177e4
LT
3010};
3011
3012static int __init proto_init(void)
3013{
14e943db 3014 return register_pernet_subsys(&proto_net_ops);
1da177e4
LT
3015}
3016
3017subsys_initcall(proto_init);
3018
3019#endif /* PROC_FS */