net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <linux/capability.h>
  95 #include <linux/errno.h>
  96 #include <linux/errqueue.h>
  97 #include <linux/types.h>
  98 #include <linux/socket.h>
  99 #include <linux/in.h>
 100 #include <linux/kernel.h>
 101 #include <linux/module.h>
 102 #include <linux/proc_fs.h>
 103 #include <linux/seq_file.h>
 104 #include <linux/sched.h>
 105 #include <linux/timer.h>
 106 #include <linux/string.h>
 107 #include <linux/sockios.h>
 108 #include <linux/net.h>
 109 #include <linux/mm.h>
 110 #include <linux/slab.h>
 111 #include <linux/interrupt.h>
 112 #include <linux/poll.h>
 113 #include <linux/tcp.h>
 114 #include <linux/init.h>
 115 #include <linux/highmem.h>
 116 #include <linux/user_namespace.h>
 117 #include <linux/static_key.h>
 118 #include <linux/memcontrol.h>
 119 #include <linux/prefetch.h>
 120
 121 #include <linux/uaccess.h>
 122
 123 #include <linux/netdevice.h>
 124 #include <net/protocol.h>
 125 #include <linux/skbuff.h>
 126 #include <net/net_namespace.h>
 127 #include <net/request_sock.h>
 128 #include <net/sock.h>
 129 #include <linux/net_tstamp.h>
 130 #include <net/xfrm.h>
 131 #include <linux/ipsec.h>
 132 #include <net/cls_cgroup.h>
 133 #include <net/netprio_cgroup.h>
 134 #include <linux/sock_diag.h>
 135
 136 #include <linux/filter.h>
 137 #include <net/sock_reuseport.h>
 138
 139 #include <trace/events/sock.h>
 140
 141 #ifdef CONFIG_INET
 142 #include <net/tcp.h>
 143 #endif
 144
 145 #include <net/busy_poll.h>
 146
/*
 * File-local list head plus a mutex declared alongside it.  The users are
 * outside this part of the file; presumably proto_list_mutex serializes
 * access to proto_list -- TODO confirm against the register/unregister
 * paths.
 */
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);
 149
 150 /**
 151  * sk_ns_capable - General socket capability test
 152  * @sk: Socket to use a capability on or through
 153  * @user_ns: The user namespace of the capability to use
 154  * @cap: The capability to use
 155  *
 156  * Test to see if the opener of the socket had when the socket was
 157  * created and the current process has the capability @cap in the user
 158  * namespace @user_ns.
 159  */
 160 bool sk_ns_capable(const struct sock *sk,
 161                    struct user_namespace *user_ns, int cap)
 162 {
 163         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 164                 ns_capable(user_ns, cap);
 165 }
 166 EXPORT_SYMBOL(sk_ns_capable);
 167
 168 /**
 169  * sk_capable - Socket global capability test
 170  * @sk: Socket to use a capability on or through
 171  * @cap: The global capability to use
 172  *
 173  * Test to see if the opener of the socket had when the socket was
 174  * created and the current process has the capability @cap in all user
 175  * namespaces.
 176  */
 177 bool sk_capable(const struct sock *sk, int cap)
 178 {
 179         return sk_ns_capable(sk, &init_user_ns, cap);
 180 }
 181 EXPORT_SYMBOL(sk_capable);
 182
 183 /**
 184  * sk_net_capable - Network namespace socket capability test
 185  * @sk: Socket to use a capability on or through
 186  * @cap: The capability to use
 187  *
 188  * Test to see if the opener of the socket had when the socket was created
 189  * and the current process has the capability @cap over the network namespace
 190  * the socket is a member of.
 191  */
 192 bool sk_net_capable(const struct sock *sk, int cap)
 193 {
 194         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 195 }
 196 EXPORT_SYMBOL(sk_net_capable);
 197
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 *
 * All four arrays hold AF_MAX entries, i.e. one lock class key per
 * address family.  The "_kern_" arrays are the separate set for internal
 * sockets mentioned above; the "_slock_" arrays are the slock keys.  The
 * code that hands these keys to lockdep is not in this part of the file.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 207
 208 /*
 209  * Make lock validator output more readable. (we pre-construct these
 210  * strings build-time, so that runtime initialization of socket
 211  * locks is fast):
 212  */
 213
 214 #define _sock_locks(x)                                            \
 215   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 216   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 217   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 218   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 219   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 220   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 221   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 222   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 223   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 224   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 225   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 226   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 227   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 228   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 229   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_MAX"
 230
 231 static const char *const af_family_key_strings[AF_MAX+1] = {
 232         _sock_locks("sk_lock-")
 233 };
 234 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 235         _sock_locks("slock-")
 236 };
 237 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 238         _sock_locks("clock-")
 239 };
 240
 241 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 242         _sock_locks("k-sk_lock-")
 243 };
 244 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 245         _sock_locks("k-slock-")
 246 };
 247 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 248         _sock_locks("k-clock-")
 249 };
 250 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 251   "rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
 252   "rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
 253   "rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
 254   "rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
 255   "rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
 256   "rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
 257   "rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
 258   "rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
 259   "rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
 260   "rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
 261   "rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
 262   "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
 263   "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
 264   "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
 265   "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_MAX"
 266 };
 267 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 268   "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
 269   "wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
 270   "wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
 271   "wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
 272   "wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
 273   "wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
 274   "wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
 275   "wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
 276   "wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
 277   "wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
 278   "wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
 279   "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
 280   "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
 281   "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
 282   "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_MAX"
 283 };
 284 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 285   "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
 286   "elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
 287   "elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
 288   "elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
 289   "elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
 290   "elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
 291   "elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
 292   "elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
 293   "elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
 294   "elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
 295   "elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
 296   "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
 297   "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
 298   "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
 299   "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_MAX"
 300 };
 301
/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 *
 * Each array holds AF_MAX entries, one lock class key per address
 * family.  Judging by the names, the r/w/e keys pair with the
 * rlock-/wlock-/elock- name tables and af_kern_callback_keys is the
 * callback-lock key set for internal sockets -- the code that wires
 * them up is not in this part of the file, TODO confirm.
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
 311
/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 *
 * Both limits come out as SKB_TRUESIZE(256) * 256: _SK_MEM_OVERHEAD is
 * SKB_TRUESIZE() of a 256-byte payload and _SK_MEM_PACKETS is the number
 * of such packets to budget for.
 */
#define _SK_MEM_PACKETS         256
#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
#define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 321
/* Run time adjustable parameters. */
/* Upper bounds; sock_setsockopt() clamps SO_SNDBUF/SO_RCVBUF requests
 * to these values.
 */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
/* Defaults start out equal to the maxima above. */
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

/* Enabled (1) by default; its readers are outside this part of the file. */
int sysctl_tstamp_allow_data __read_mostly = 1;

/* Static key, initially false; sk_set_memalloc() increments it and
 * sk_clear_memalloc() decrements it.
 */
struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(memalloc_socks);
 338
 339 /**
 340  * sk_set_memalloc - sets %SOCK_MEMALLOC
 341  * @sk: socket to set it on
 342  *
 343  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 344  * It's the responsibility of the admin to adjust min_free_kbytes
 345  * to meet the requirements
 346  */
 347 void sk_set_memalloc(struct sock *sk)
 348 {
 349         sock_set_flag(sk, SOCK_MEMALLOC);
 350         sk->sk_allocation |= __GFP_MEMALLOC;
 351         static_key_slow_inc(&memalloc_socks);
 352 }
 353 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 354
 355 void sk_clear_memalloc(struct sock *sk)
 356 {
 357         sock_reset_flag(sk, SOCK_MEMALLOC);
 358         sk->sk_allocation &= ~__GFP_MEMALLOC;
 359         static_key_slow_dec(&memalloc_socks);
 360
 361         /*
 362          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 363          * progress of swapping. SOCK_MEMALLOC may be cleared while
 364          * it has rmem allocations due to the last swapfile being deactivated
 365          * but there is a risk that the socket is unusable due to exceeding
 366          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 367          */
 368         sk_mem_reclaim(sk);
 369 }
 370 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 371
 372 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 373 {
 374         int ret;
 375         unsigned long pflags = current->flags;
 376
 377         /* these should have been dropped before queueing */
 378         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 379
 380         current->flags |= PF_MEMALLOC;
 381         ret = sk->sk_backlog_rcv(sk, skb);
 382         tsk_restore_flags(current, pflags, PF_MEMALLOC);
 383
 384         return ret;
 385 }
 386 EXPORT_SYMBOL(__sk_backlog_rcv);
 387
 388 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 389 {
 390         struct timeval tv;
 391
 392         if (optlen < sizeof(tv))
 393                 return -EINVAL;
 394         if (copy_from_user(&tv, optval, sizeof(tv)))
 395                 return -EFAULT;
 396         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 397                 return -EDOM;
 398
 399         if (tv.tv_sec < 0) {
 400                 static int warned __read_mostly;
 401
 402                 *timeo_p = 0;
 403                 if (warned < 10 && net_ratelimit()) {
 404                         warned++;
 405                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 406                                 __func__, current->comm, task_pid_nr(current));
 407                 }
 408                 return 0;
 409         }
 410         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 411         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 412                 return 0;
 413         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 414                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
 415         return 0;
 416 }
 417
 418 static void sock_warn_obsolete_bsdism(const char *name)
 419 {
 420         static int warned;
 421         static char warncomm[TASK_COMM_LEN];
 422         if (strcmp(warncomm, current->comm) && warned < 5) {
 423                 strcpy(warncomm,  current->comm);
 424                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 425                         warncomm, name);
 426                 warned++;
 427         }
 428 }
 429
 430 static bool sock_needs_netstamp(const struct sock *sk)
 431 {
 432         switch (sk->sk_family) {
 433         case AF_UNSPEC:
 434         case AF_UNIX:
 435                 return false;
 436         default:
 437                 return true;
 438         }
 439 }
 440
 441 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 442 {
 443         if (sk->sk_flags & flags) {
 444                 sk->sk_flags &= ~flags;
 445                 if (sock_needs_netstamp(sk) &&
 446                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 447                         net_disable_timestamp();
 448         }
 449 }
 450
 451
 452 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 453 {
 454         unsigned long flags;
 455         struct sk_buff_head *list = &sk->sk_receive_queue;
 456
 457         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 458                 atomic_inc(&sk->sk_drops);
 459                 trace_sock_rcvqueue_full(sk, skb);
 460                 return -ENOMEM;
 461         }
 462
 463         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 464                 atomic_inc(&sk->sk_drops);
 465                 return -ENOBUFS;
 466         }
 467
 468         skb->dev = NULL;
 469         skb_set_owner_r(skb, sk);
 470
 471         /* we escape from rcu protected region, make sure we dont leak
 472          * a norefcounted dst
 473          */
 474         skb_dst_force(skb);
 475
 476         spin_lock_irqsave(&list->lock, flags);
 477         sock_skb_set_dropcount(sk, skb);
 478         __skb_queue_tail(list, skb);
 479         spin_unlock_irqrestore(&list->lock, flags);
 480
 481         if (!sock_flag(sk, SOCK_DEAD))
 482                 sk->sk_data_ready(sk);
 483         return 0;
 484 }
 485 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 486
 487 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 488 {
 489         int err;
 490
 491         err = sk_filter(sk, skb);
 492         if (err)
 493                 return err;
 494
 495         return __sock_queue_rcv_skb(sk, skb);
 496 }
 497 EXPORT_SYMBOL(sock_queue_rcv_skb);
 498
/*
 * Deliver @skb to @sk: either process it right away through
 * sk_backlog_rcv() or, when the socket is owned by a user context, park
 * it on the backlog.
 *
 * @nested:     non-zero selects bh_lock_sock_nested() over bh_lock_sock()
 * @trim_cap:   passed through to sk_filter_trim_cap()
 * @refcounted: when true, a reference on @sk is dropped before returning
 *
 * Returns the result of sk_backlog_rcv() when the skb was processed
 * directly.  On every other path, including the drop paths, the return
 * value stays NET_RX_SUCCESS; dropped skbs are freed here.
 */
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
                     const int nested, unsigned int trim_cap, bool refcounted)
{
        int rc = NET_RX_SUCCESS;

        /* rejected by the socket filter: free the skb */
        if (sk_filter_trim_cap(sk, skb, trim_cap))
                goto discard_and_relse;

        skb->dev = NULL;

        /* receive queues already full: count the drop and free the skb */
        if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
                atomic_inc(&sk->sk_drops);
                goto discard_and_relse;
        }
        if (nested)
                bh_lock_sock_nested(sk);
        else
                bh_lock_sock(sk);
        if (!sock_owned_by_user(sk)) {
                /*
                 * trylock + unlock semantics:
                 */
                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

                rc = sk_backlog_rcv(sk, skb);

                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
        } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
                /* backlog refused the skb: unlock before dropping it */
                bh_unlock_sock(sk);
                atomic_inc(&sk->sk_drops);
                goto discard_and_relse;
        }

        bh_unlock_sock(sk);
out:
        if (refcounted)
                sock_put(sk);
        return rc;
discard_and_relse:
        kfree_skb(skb);
        goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);
 542
 543 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 544 {
 545         struct dst_entry *dst = __sk_dst_get(sk);
 546
 547         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 548                 sk_tx_queue_clear(sk);
 549                 sk->sk_dst_pending_confirm = 0;
 550                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 551                 dst_release(dst);
 552                 return NULL;
 553         }
 554
 555         return dst;
 556 }
 557 EXPORT_SYMBOL(__sk_dst_check);
 558
 559 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 560 {
 561         struct dst_entry *dst = sk_dst_get(sk);
 562
 563         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 564                 sk_dst_reset(sk);
 565                 dst_release(dst);
 566                 return NULL;
 567         }
 568
 569         return dst;
 570 }
 571 EXPORT_SYMBOL(sk_dst_check);
 572
 573 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 574                                 int optlen)
 575 {
 576         int ret = -ENOPROTOOPT;
 577 #ifdef CONFIG_NETDEVICES
 578         struct net *net = sock_net(sk);
 579         char devname[IFNAMSIZ];
 580         int index;
 581
 582         /* Sorry... */
 583         ret = -EPERM;
 584         if (!ns_capable(net->user_ns, CAP_NET_RAW))
 585                 goto out;
 586
 587         ret = -EINVAL;
 588         if (optlen < 0)
 589                 goto out;
 590
 591         /* Bind this socket to a particular device like "eth0",
 592          * as specified in the passed interface name. If the
 593          * name is "" or the option length is zero the socket
 594          * is not bound.
 595          */
 596         if (optlen > IFNAMSIZ - 1)
 597                 optlen = IFNAMSIZ - 1;
 598         memset(devname, 0, sizeof(devname));
 599
 600         ret = -EFAULT;
 601         if (copy_from_user(devname, optval, optlen))
 602                 goto out;
 603
 604         index = 0;
 605         if (devname[0] != '\0') {
 606                 struct net_device *dev;
 607
 608                 rcu_read_lock();
 609                 dev = dev_get_by_name_rcu(net, devname);
 610                 if (dev)
 611                         index = dev->ifindex;
 612                 rcu_read_unlock();
 613                 ret = -ENODEV;
 614                 if (!dev)
 615                         goto out;
 616         }
 617
 618         lock_sock(sk);
 619         sk->sk_bound_dev_if = index;
 620         sk_dst_reset(sk);
 621         release_sock(sk);
 622
 623         ret = 0;
 624
 625 out:
 626 #endif
 627
 628         return ret;
 629 }
 630
 631 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 632                                 int __user *optlen, int len)
 633 {
 634         int ret = -ENOPROTOOPT;
 635 #ifdef CONFIG_NETDEVICES
 636         struct net *net = sock_net(sk);
 637         char devname[IFNAMSIZ];
 638
 639         if (sk->sk_bound_dev_if == 0) {
 640                 len = 0;
 641                 goto zero;
 642         }
 643
 644         ret = -EINVAL;
 645         if (len < IFNAMSIZ)
 646                 goto out;
 647
 648         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 649         if (ret)
 650                 goto out;
 651
 652         len = strlen(devname) + 1;
 653
 654         ret = -EFAULT;
 655         if (copy_to_user(optval, devname, len))
 656                 goto out;
 657
 658 zero:
 659         ret = -EFAULT;
 660         if (put_user(len, optlen))
 661                 goto out;
 662
 663         ret = 0;
 664
 665 out:
 666 #endif
 667
 668         return ret;
 669 }
 670
 671 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 672 {
 673         if (valbool)
 674                 sock_set_flag(sk, bit);
 675         else
 676                 sock_reset_flag(sk, bit);
 677 }
 678
 679 bool sk_mc_loop(struct sock *sk)
 680 {
 681         if (dev_recursion_level())
 682                 return false;
 683         if (!sk)
 684                 return true;
 685         switch (sk->sk_family) {
 686         case AF_INET:
 687                 return inet_sk(sk)->mc_loop;
 688 #if IS_ENABLED(CONFIG_IPV6)
 689         case AF_INET6:
 690                 return inet6_sk(sk)->mc_loop;
 691 #endif
 692         }
 693         WARN_ON(1);
 694         return true;
 695 }
 696 EXPORT_SYMBOL(sk_mc_loop);
 697
 698 /*
 699  *      This is meant for all protocols to use and covers goings on
 700  *      at the socket level. Everything here is generic.
 701  */
 702
 703 int sock_setsockopt(struct socket *sock, int level, int optname,
 704                     char __user *optval, unsigned int optlen)
 705 {
 706         struct sock *sk = sock->sk;
 707         int val;
 708         int valbool;
 709         struct linger ling;
 710         int ret = 0;
 711
 712         /*
 713          *      Options without arguments
 714          */
 715
 716         if (optname == SO_BINDTODEVICE)
 717                 return sock_setbindtodevice(sk, optval, optlen);
 718
 719         if (optlen < sizeof(int))
 720                 return -EINVAL;
 721
 722         if (get_user(val, (int __user *)optval))
 723                 return -EFAULT;
 724
 725         valbool = val ? 1 : 0;
 726
 727         lock_sock(sk);
 728
 729         switch (optname) {
 730         case SO_DEBUG:
 731                 if (val && !capable(CAP_NET_ADMIN))
 732                         ret = -EACCES;
 733                 else
 734                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 735                 break;
 736         case SO_REUSEADDR:
 737                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 738                 break;
 739         case SO_REUSEPORT:
 740                 sk->sk_reuseport = valbool;
 741                 break;
 742         case SO_TYPE:
 743         case SO_PROTOCOL:
 744         case SO_DOMAIN:
 745         case SO_ERROR:
 746                 ret = -ENOPROTOOPT;
 747                 break;
 748         case SO_DONTROUTE:
 749                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 750                 break;
 751         case SO_BROADCAST:
 752                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 753                 break;
 754         case SO_SNDBUF:
 755                 /* Don't error on this BSD doesn't and if you think
 756                  * about it this is right. Otherwise apps have to
 757                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 758                  * are treated in BSD as hints
 759                  */
 760                 val = min_t(u32, val, sysctl_wmem_max);
 761 set_sndbuf:
 762                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 763                 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 764                 /* Wake up sending tasks if we upped the value. */
 765                 sk->sk_write_space(sk);
 766                 break;
 767
 768         case SO_SNDBUFFORCE:
 769                 if (!capable(CAP_NET_ADMIN)) {
 770                         ret = -EPERM;
 771                         break;
 772                 }
 773                 goto set_sndbuf;
 774
 775         case SO_RCVBUF:
 776                 /* Don't error on this BSD doesn't and if you think
 777                  * about it this is right. Otherwise apps have to
 778                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 779                  * are treated in BSD as hints
 780                  */
 781                 val = min_t(u32, val, sysctl_rmem_max);
 782 set_rcvbuf:
 783                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 784                 /*
 785                  * We double it on the way in to account for
 786                  * "struct sk_buff" etc. overhead.   Applications
 787                  * assume that the SO_RCVBUF setting they make will
 788                  * allow that much actual data to be received on that
 789                  * socket.
 790                  *
 791                  * Applications are unaware that "struct sk_buff" and
 792                  * other overheads allocate from the receive buffer
 793                  * during socket buffer allocation.
 794                  *
 795                  * And after considering the possible alternatives,
 796                  * returning the value we actually used in getsockopt
 797                  * is the most desirable behavior.
 798                  */
 799                 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 800                 break;
 801
 802         case SO_RCVBUFFORCE:
 803                 if (!capable(CAP_NET_ADMIN)) {
 804                         ret = -EPERM;
 805                         break;
 806                 }
 807                 goto set_rcvbuf;
 808
 809         case SO_KEEPALIVE:
 810                 if (sk->sk_prot->keepalive)
 811                         sk->sk_prot->keepalive(sk, valbool);
 812                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 813                 break;
 814
 815         case SO_OOBINLINE:
 816                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 817                 break;
 818
 819         case SO_NO_CHECK:
 820                 sk->sk_no_check_tx = valbool;
 821                 break;
 822
 823         case SO_PRIORITY:
 824                 if ((val >= 0 && val <= 6) ||
 825                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 826                         sk->sk_priority = val;
 827                 else
 828                         ret = -EPERM;
 829                 break;
 830
 831         case SO_LINGER:
 832                 if (optlen < sizeof(ling)) {
 833                         ret = -EINVAL;  /* 1003.1g */
 834                         break;
 835                 }
 836                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 837                         ret = -EFAULT;
 838                         break;
 839                 }
 840                 if (!ling.l_onoff)
 841                         sock_reset_flag(sk, SOCK_LINGER);
 842                 else {
 843 #if (BITS_PER_LONG == 32)
 844                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 845                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 846                         else
 847 #endif
 848                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 849                         sock_set_flag(sk, SOCK_LINGER);
 850                 }
 851                 break;
 852
 853         case SO_BSDCOMPAT:
 854                 sock_warn_obsolete_bsdism("setsockopt");
 855                 break;
 856
 857         case SO_PASSCRED:
 858                 if (valbool)
 859                         set_bit(SOCK_PASSCRED, &sock->flags);
 860                 else
 861                         clear_bit(SOCK_PASSCRED, &sock->flags);
 862                 break;
 863
 864         case SO_TIMESTAMP:
 865         case SO_TIMESTAMPNS:
 866                 if (valbool)  {
 867                         if (optname == SO_TIMESTAMP)
 868                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 869                         else
 870                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 871                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 872                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 873                 } else {
 874                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 875                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 876                 }
 877                 break;
 878
 879         case SO_TIMESTAMPING:
 880                 if (val & ~SOF_TIMESTAMPING_MASK) {
 881                         ret = -EINVAL;
 882                         break;
 883                 }
 884
 885                 if (val & SOF_TIMESTAMPING_OPT_ID &&
 886                     !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 887                         if (sk->sk_protocol == IPPROTO_TCP &&
 888                             sk->sk_type == SOCK_STREAM) {
 889                                 if ((1 << sk->sk_state) &
 890                                     (TCPF_CLOSE | TCPF_LISTEN)) {
 891                                         ret = -EINVAL;
 892                                         break;
 893                                 }
 894                                 sk->sk_tskey = tcp_sk(sk)->snd_una;
 895                         } else {
 896                                 sk->sk_tskey = 0;
 897                         }
 898                 }
 899
 900                 if (val & SOF_TIMESTAMPING_OPT_STATS &&
 901                     !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 902                         ret = -EINVAL;
 903                         break;
 904                 }
 905
 906                 sk->sk_tsflags = val;
 907                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 908                         sock_enable_timestamp(sk,
 909                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 910                 else
 911                         sock_disable_timestamp(sk,
 912                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 913                 break;
 914
 915         case SO_RCVLOWAT:
 916                 if (val < 0)
 917                         val = INT_MAX;
 918                 sk->sk_rcvlowat = val ? : 1;
 919                 break;
 920
 921         case SO_RCVTIMEO:
 922                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 923                 break;
 924
 925         case SO_SNDTIMEO:
 926                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 927                 break;
 928
 929         case SO_ATTACH_FILTER:
 930                 ret = -EINVAL;
 931                 if (optlen == sizeof(struct sock_fprog)) {
 932                         struct sock_fprog fprog;
 933
 934                         ret = -EFAULT;
 935                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 936                                 break;
 937
 938                         ret = sk_attach_filter(&fprog, sk);
 939                 }
 940                 break;
 941
 942         case SO_ATTACH_BPF:
 943                 ret = -EINVAL;
 944                 if (optlen == sizeof(u32)) {
 945                         u32 ufd;
 946
 947                         ret = -EFAULT;
 948                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 949                                 break;
 950
 951                         ret = sk_attach_bpf(ufd, sk);
 952                 }
 953                 break;
 954
 955         case SO_ATTACH_REUSEPORT_CBPF:
 956                 ret = -EINVAL;
 957                 if (optlen == sizeof(struct sock_fprog)) {
 958                         struct sock_fprog fprog;
 959
 960                         ret = -EFAULT;
 961                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 962                                 break;
 963
 964                         ret = sk_reuseport_attach_filter(&fprog, sk);
 965                 }
 966                 break;
 967
 968         case SO_ATTACH_REUSEPORT_EBPF:
 969                 ret = -EINVAL;
 970                 if (optlen == sizeof(u32)) {
 971                         u32 ufd;
 972
 973                         ret = -EFAULT;
 974                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 975                                 break;
 976
 977                         ret = sk_reuseport_attach_bpf(ufd, sk);
 978                 }
 979                 break;
 980
 981         case SO_DETACH_FILTER:
 982                 ret = sk_detach_filter(sk);
 983                 break;
 984
 985         case SO_LOCK_FILTER:
 986                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 987                         ret = -EPERM;
 988                 else
 989                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 990                 break;
 991
 992         case SO_PASSSEC:
 993                 if (valbool)
 994                         set_bit(SOCK_PASSSEC, &sock->flags);
 995                 else
 996                         clear_bit(SOCK_PASSSEC, &sock->flags);
 997                 break;
 998         case SO_MARK:
 999                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1000                         ret = -EPERM;
1001                 else
1002                         sk->sk_mark = val;
1003                 break;
1004
1005         case SO_RXQ_OVFL:
1006                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1007                 break;
1008
1009         case SO_WIFI_STATUS:
1010                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1011                 break;
1012
1013         case SO_PEEK_OFF:
1014                 if (sock->ops->set_peek_off)
1015                         ret = sock->ops->set_peek_off(sk, val);
1016                 else
1017                         ret = -EOPNOTSUPP;
1018                 break;
1019
1020         case SO_NOFCS:
1021                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1022                 break;
1023
1024         case SO_SELECT_ERR_QUEUE:
1025                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1026                 break;
1027
1028 #ifdef CONFIG_NET_RX_BUSY_POLL
1029         case SO_BUSY_POLL:
1030                 /* allow unprivileged users to decrease the value */
1031                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1032                         ret = -EPERM;
1033                 else {
1034                         if (val < 0)
1035                                 ret = -EINVAL;
1036                         else
1037                                 sk->sk_ll_usec = val;
1038                 }
1039                 break;
1040 #endif
1041
1042         case SO_MAX_PACING_RATE:
1043                 sk->sk_max_pacing_rate = val;
1044                 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1045                                          sk->sk_max_pacing_rate);
1046                 break;
1047
1048         case SO_INCOMING_CPU:
1049                 sk->sk_incoming_cpu = val;
1050                 break;
1051
1052         case SO_CNX_ADVICE:
1053                 if (val == 1)
1054                         dst_negative_advice(sk);
1055                 break;
1056         default:
1057                 ret = -ENOPROTOOPT;
1058                 break;
1059         }
1060         release_sock(sk);
1061         return ret;
1062 }
1063 EXPORT_SYMBOL(sock_setsockopt);
1064
1065
/*
 * cred_to_ucred - fill a struct ucred from a pid and a set of credentials
 * @pid:   pid to report; converted to a number with pid_vnr()
 * @cred:  credentials to report, may be NULL
 * @ucred: output structure; all three fields are always written
 *
 * When @cred is NULL the uid and gid are left at -1.  Otherwise the
 * effective uid/gid are translated through the current task's user
 * namespace with the *_munged helpers.
 */
1066 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1067                           struct ucred *ucred)
1068 {
1069         ucred->pid = pid_vnr(pid);
             /* Default reported when no credentials are supplied. */
1070         ucred->uid = ucred->gid = -1;
1071         if (cred) {
1072                 struct user_namespace *current_ns = current_user_ns();
1073
1074                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1075                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1076         }
1077 }
1078
/*
 * sock_getsockopt - read one generic socket option and copy it to userspace
 * @sock:    socket being queried; its struct sock is taken from sock->sk
 * @level:   not examined by this function
 * @optname: SO_* option selector
 * @optval:  user buffer that receives the option value
 * @optlen:  user pointer; on entry the size of @optval, on success the
 *           number of bytes reported for the option
 *
 * Most options store an int in v.val and fall through to the common tail,
 * which copies min(*optlen, lv) bytes of v to @optval and writes that
 * length back to @optlen.  Options with their own payload (SO_PEERCRED,
 * SO_PEERNAME, SO_GET_FILTER, SO_MEMINFO) bypass that tail and jump to
 * lenout; SO_PEERSEC and SO_BINDTODEVICE return straight from their
 * helpers.
 *
 * Returns 0 on success, -EFAULT when a user access fails, -EINVAL for a
 * negative *optlen (or a SO_PEERNAME length larger than the address),
 * -ENOTCONN when getname() fails for SO_PEERNAME, -EOPNOTSUPP for
 * SO_PEEK_OFF without a set_peek_off op, -ENOPROTOOPT for an unhandled
 * @optname, or whatever a delegated helper returns.
 */
1079 int sock_getsockopt(struct socket *sock, int level, int optname,
1080                     char __user *optval, int __user *optlen)
1081 {
1082         struct sock *sk = sock->sk;
1083
             /* Scratch value shared by all fixed-size options. */
1084         union {
1085                 int val;
1086                 struct linger ling;
1087                 struct timeval tm;
1088         } v;
1089
             /* lv: number of meaningful bytes in v; cases override it. */
1090         int lv = sizeof(int);
1091         int len;
1092
1093         if (get_user(len, optlen))
1094                 return -EFAULT;
1095         if (len < 0)
1096                 return -EINVAL;
1097
             /*
              * Zeroed up front, so any bytes of v that a case leaves
              * untouched are copied out as zero.
              */
1098         memset(&v, 0, sizeof(v));
1099
1100         switch (optname) {
1101         case SO_DEBUG:
1102                 v.val = sock_flag(sk, SOCK_DBG);
1103                 break;
1104
1105         case SO_DONTROUTE:
1106                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1107                 break;
1108
1109         case SO_BROADCAST:
1110                 v.val = sock_flag(sk, SOCK_BROADCAST);
1111                 break;
1112
1113         case SO_SNDBUF:
1114                 v.val = sk->sk_sndbuf;
1115                 break;
1116
1117         case SO_RCVBUF:
1118                 v.val = sk->sk_rcvbuf;
1119                 break;
1120
1121         case SO_REUSEADDR:
1122                 v.val = sk->sk_reuse;
1123                 break;
1124
1125         case SO_REUSEPORT:
1126                 v.val = sk->sk_reuseport;
1127                 break;
1128
1129         case SO_KEEPALIVE:
1130                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1131                 break;
1132
1133         case SO_TYPE:
1134                 v.val = sk->sk_type;
1135                 break;
1136
1137         case SO_PROTOCOL:
1138                 v.val = sk->sk_protocol;
1139                 break;
1140
1141         case SO_DOMAIN:
1142                 v.val = sk->sk_family;
1143                 break;
1144
1145         case SO_ERROR:
                     /*
                      * Report the negated result of sock_error(); when that
                      * is zero, fetch sk_err_soft instead and reset it to 0
                      * in the same xchg().
                      */
1146                 v.val = -sock_error(sk);
1147                 if (v.val == 0)
1148                         v.val = xchg(&sk->sk_err_soft, 0);
1149                 break;
1150
1151         case SO_OOBINLINE:
1152                 v.val = sock_flag(sk, SOCK_URGINLINE);
1153                 break;
1154
1155         case SO_NO_CHECK:
1156                 v.val = sk->sk_no_check_tx;
1157                 break;
1158
1159         case SO_PRIORITY:
1160                 v.val = sk->sk_priority;
1161                 break;
1162
1163         case SO_LINGER:
                     /* sk_lingertime is kept scaled by HZ; undo that here. */
1164                 lv              = sizeof(v.ling);
1165                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1166                 v.ling.l_linger = sk->sk_lingertime / HZ;
1167                 break;
1168
1169         case SO_BSDCOMPAT:
                     /* Only warns; v.val keeps the 0 from the memset above. */
1170                 sock_warn_obsolete_bsdism("getsockopt");
1171                 break;
1172
1173         case SO_TIMESTAMP:
1174                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1175                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1176                 break;
1177
1178         case SO_TIMESTAMPNS:
1179                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1180                 break;
1181
1182         case SO_TIMESTAMPING:
1183                 v.val = sk->sk_tsflags;
1184                 break;
1185
1186         case SO_RCVTIMEO:
                     /*
                      * MAX_SCHEDULE_TIMEOUT is reported as a zero timeval;
                      * any other value is split into whole seconds and
                      * microseconds using HZ.
                      */
1187                 lv = sizeof(struct timeval);
1188                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1189                         v.tm.tv_sec = 0;
1190                         v.tm.tv_usec = 0;
1191                 } else {
1192                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1193                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1194                 }
1195                 break;
1196
1197         case SO_SNDTIMEO:
                     /* Same conversion as SO_RCVTIMEO, for sk_sndtimeo. */
1198                 lv = sizeof(struct timeval);
1199                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1200                         v.tm.tv_sec = 0;
1201                         v.tm.tv_usec = 0;
1202                 } else {
1203                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1204                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1205                 }
1206                 break;
1207
1208         case SO_RCVLOWAT:
1209                 v.val = sk->sk_rcvlowat;
1210                 break;
1211
1212         case SO_SNDLOWAT:
                     /* Always reported as 1. */
1213                 v.val = 1;
1214                 break;
1215
1216         case SO_PASSCRED:
1217                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1218                 break;
1219
1220         case SO_PEERCRED:
1221         {
                     /*
                      * Copies at most sizeof(peercred) bytes directly to
                      * userspace and skips the common copy-out tail.
                      */
1222                 struct ucred peercred;
1223                 if (len > sizeof(peercred))
1224                         len = sizeof(peercred);
1225                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1226                 if (copy_to_user(optval, &peercred, len))
1227                         return -EFAULT;
1228                 goto lenout;
1229         }
1230
1231         case SO_PEERNAME:
1232         {
                     /*
                      * getname() is called with 2 as its last argument and
                      * is handed &lv, presumably to store the address
                      * length there.  A user length larger than lv is
                      * rejected with -EINVAL rather than truncated.
                      */
1233                 char address[128];
1234
1235                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1236                         return -ENOTCONN;
1237                 if (lv < len)
1238                         return -EINVAL;
1239                 if (copy_to_user(optval, address, len))
1240                         return -EFAULT;
1241                 goto lenout;
1242         }
1243
1244         /* Dubious BSD thing... Probably nobody even uses it, but
1245          * the UNIX standard wants it for whatever reason... -DaveM
1246          */
1247         case SO_ACCEPTCONN:
1248                 v.val = sk->sk_state == TCP_LISTEN;
1249                 break;
1250
1251         case SO_PASSSEC:
1252                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1253                 break;
1254
1255         case SO_PEERSEC:
1256                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1257
1258         case SO_MARK:
1259                 v.val = sk->sk_mark;
1260                 break;
1261
1262         case SO_RXQ_OVFL:
1263                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1264                 break;
1265
1266         case SO_WIFI_STATUS:
1267                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1268                 break;
1269
1270         case SO_PEEK_OFF:
                     /* Only supported when the ops provide set_peek_off. */
1271                 if (!sock->ops->set_peek_off)
1272                         return -EOPNOTSUPP;
1273
1274                 v.val = sk->sk_peek_off;
1275                 break;
1276         case SO_NOFCS:
1277                 v.val = sock_flag(sk, SOCK_NOFCS);
1278                 break;
1279
1280         case SO_BINDTODEVICE:
1281                 return sock_getbindtodevice(sk, optval, optlen, len);
1282
1283         case SO_GET_FILTER:
                     /*
                      * sk_get_filter()'s return value becomes the length
                      * written back to optlen; a negative value is
                      * returned as the error.
                      */
1284                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1285                 if (len < 0)
1286                         return len;
1287
1288                 goto lenout;
1289
1290         case SO_LOCK_FILTER:
1291                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1292                 break;
1293
1294         case SO_BPF_EXTENSIONS:
1295                 v.val = bpf_tell_extensions();
1296                 break;
1297
1298         case SO_SELECT_ERR_QUEUE:
1299                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1300                 break;
1301
1302 #ifdef CONFIG_NET_RX_BUSY_POLL
1303         case SO_BUSY_POLL:
1304                 v.val = sk->sk_ll_usec;
1305                 break;
1306 #endif
1307
1308         case SO_MAX_PACING_RATE:
1309                 v.val = sk->sk_max_pacing_rate;
1310                 break;
1311
1312         case SO_INCOMING_CPU:
1313                 v.val = sk->sk_incoming_cpu;
1314                 break;
1315
1316         case SO_MEMINFO:
1317         {
1318                 u32 meminfo[SK_MEMINFO_VARS];
1319
                     /*
                      * NOTE(review): len is fetched from userspace a second
                      * time here, so the len < 0 check at the top of the
                      * function does not apply to this value.  The
                      * min_t(unsigned int, ...) below still bounds the copy
                      * to sizeof(meminfo).
                      */
1320                 if (get_user(len, optlen))
1321                         return -EFAULT;
1322
1323                 sk_get_meminfo(sk, meminfo);
1324
1325                 len = min_t(unsigned int, len, sizeof(meminfo));
1326                 if (copy_to_user(optval, &meminfo, len))
1327                         return -EFAULT;
1328
1329                 goto lenout;
1330         }
1331         default:
1332                 /* We implement the SO_SNDLOWAT etc to not be settable
1333                  * (1003.1g 7).
1334                  */
1335                 return -ENOPROTOOPT;
1336         }
1337
             /* Common copy-out: never hand back more than lv bytes of v. */
1338         if (len > lv)
1339                 len = lv;
1340         if (copy_to_user(optval, &v, len))
1341                 return -EFAULT;
1342 lenout:
             /* Tell the caller how many bytes were reported. */
1343         if (put_user(len, optlen))
1344                 return -EFAULT;
1345         return 0;
1346 }
1347
1348 /*
1349  * Initialize an sk_lock.
1350  *
1351  * (We also register the sk_lock with the lock validator.)
      *
      * The key and name tables handed to sock_lock_init_class_and_name() are
      * all indexed by sk->sk_family; the "kern" variants are used when
      * sk->sk_kern_sock is set and the plain variants otherwise.  Both
      * sk_family and sk_kern_sock must therefore be set before this is called.
1352  */
1353 static inline void sock_lock_init(struct sock *sk)
1354 {
1355         if (sk->sk_kern_sock)
1356                 sock_lock_init_class_and_name(
1357                         sk,
1358                         af_family_kern_slock_key_strings[sk->sk_family],
1359                         af_family_kern_slock_keys + sk->sk_family,
1360                         af_family_kern_key_strings[sk->sk_family],
1361                         af_family_kern_keys + sk->sk_family);
1362         else
1363                 sock_lock_init_class_and_name(
1364                         sk,
1365                         af_family_slock_key_strings[sk->sk_family],
1366                         af_family_slock_keys + sk->sk_family,
1367                         af_family_key_strings[sk->sk_family],
1368                         af_family_keys + sk->sk_family);
1369 }
1370
1371 /*
1372  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1373  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1374  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
      *
      * The copy is done as two memcpy() calls: everything before
      * sk_dontcopy_begin, then everything from sk_dontcopy_end up to
      * osk->sk_prot->obj_size bytes from the start of the object.
1375  */
1376 static void sock_copy(struct sock *nsk, const struct sock *osk)
1377 {
1378 #ifdef CONFIG_SECURITY_NETWORK
             /*
              * Saved before the copy and restored after it, presumably
              * because the memcpy() would otherwise replace nsk's own
              * security pointer with osk's.
              */
1379         void *sptr = nsk->sk_security;
1380 #endif
1381         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1382
1383         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1384                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1385
1386 #ifdef CONFIG_SECURITY_NETWORK
1387         nsk->sk_security = sptr;
1388         security_sk_clone(osk, nsk);
1389 #endif
1390 }
1391
/*
 * sk_prot_alloc - allocate the memory for one sock of protocol @prot
 * @prot:     protocol descriptor; supplies the slab (if any), the object
 *            size and the owning module
 * @priority: allocation flags, may include __GFP_ZERO
 * @family:   protocol family, passed on to security_sk_alloc()
 *
 * On success the sock has had security_sk_alloc() run on it, holds a
 * reference on prot->owner and has had sk_tx_queue_clear() applied.
 * Returns NULL if the allocation, the security hook or try_module_get()
 * fails; nothing is left allocated in that case.
 */
1392 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1393                 int family)
1394 {
1395         struct sock *sk;
1396         struct kmem_cache *slab;
1397
1398         slab = prot->slab;
1399         if (slab != NULL) {
                     /*
                      * With a dedicated slab, __GFP_ZERO is masked off the
                      * allocation itself; if the caller asked for zeroing it
                      * is done by sk_prot_clear_nulls() instead.
                      */
1400                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1401                 if (!sk)
1402                         return sk;
1403                 if (priority & __GFP_ZERO)
1404                         sk_prot_clear_nulls(sk, prot->obj_size);
1405         } else
1406                 sk = kmalloc(prot->obj_size, priority);
1407
1408         if (sk != NULL) {
1409                 kmemcheck_annotate_bitfield(sk, flags);
1410
1411                 if (security_sk_alloc(sk, family, priority))
1412                         goto out_free;
1413
                     /* Dropped again by module_put() in sk_prot_free(). */
1414                 if (!try_module_get(prot->owner))
1415                         goto out_free_sec;
1416                 sk_tx_queue_clear(sk);
1417         }
1418
1419         return sk;
1420
             /* Error unwinding: undo the security hook, then free the memory. */
1421 out_free_sec:
1422         security_sk_free(sk);
1423 out_free:
             /* Give the memory back to whichever allocator it came from. */
1424         if (slab != NULL)
1425                 kmem_cache_free(slab, sk);
1426         else
1427                 kfree(sk);
1428         return NULL;
1429 }
1430
/*
 * sk_prot_free - release the memory of a sock allocated for @prot
 * @prot: protocol descriptor the sock was allocated from
 * @sk:   sock to free; must not be used afterwards
 *
 * Releases the cgroup, memory-cgroup and security state attached to @sk,
 * frees the object to prot->slab (or with kfree() when there is no slab)
 * and finally drops the module reference on prot->owner.
 */
1431 static void sk_prot_free(struct proto *prot, struct sock *sk)
1432 {
1433         struct kmem_cache *slab;
1434         struct module *owner;
1435
             /* Both are read from prot before anything is torn down. */
1436         owner = prot->owner;
1437         slab = prot->slab;
1438
1439         cgroup_sk_free(&sk->sk_cgrp_data);
1440         mem_cgroup_sk_free(sk);
1441         security_sk_free(sk);
1442         if (slab != NULL)
1443                 kmem_cache_free(slab, sk);
1444         else
1445                 kfree(sk);
1446         module_put(owner);
1447 }
1448
1449 /**
1450  *      sk_alloc - All socket objects are allocated here
1451  *      @net: the applicable net namespace
1452  *      @family: protocol family
1453  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1454  *      @prot: struct proto associated with this new sock instance
1455  *      @kern: is this to be a kernel socket?
      *
      *      The object is requested zeroed (__GFP_ZERO is added to
      *      @priority).  Returns the new sock, or NULL if sk_prot_alloc()
      *      fails.
1456  */
1457 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1458                       struct proto *prot, int kern)
1459 {
1460         struct sock *sk;
1461
1462         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1463         if (sk) {
1464                 sk->sk_family = family;
1465                 /*
1466                  * See comment in struct sock definition to understand
1467                  * why we need sk_prot_creator -acme
1468                  */
1469                 sk->sk_prot = sk->sk_prot_creator = prot;
1470                 sk->sk_kern_sock = kern;
                     /* Needs sk_family and sk_kern_sock, both set just above. */
1471                 sock_lock_init(sk);
                     /*
                      * Only non-kernel sockets take a reference on @net;
                      * sk_net_refcnt records whether one is held.
                      */
1472                 sk->sk_net_refcnt = kern ? 0 : 1;
1473                 if (likely(sk->sk_net_refcnt))
1474                         get_net(net);
1475                 sock_net_set(sk, net);
                     /* Starts at 1; sk_free() subtracts that one again. */
1476                 atomic_set(&sk->sk_wmem_alloc, 1);
1477
1478                 mem_cgroup_sk_alloc(sk);
1479                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1480                 sock_update_classid(&sk->sk_cgrp_data);
1481                 sock_update_netprioidx(&sk->sk_cgrp_data);
1482         }
1483
1484         return sk;
1485 }
1486 EXPORT_SYMBOL(sk_alloc);
1487
1488 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1489  * grace period. This is the case for UDP sockets and TCP listeners.
      *
      * For all other sockets sk_destruct() calls it directly.  @head is the
      * sk_rcu member embedded in the sock being destroyed.  The function
      * runs the sk_destruct callback, drops the resources the sock still
      * holds and finally frees the object itself.
1490  */
1491 static void __sk_destruct(struct rcu_head *head)
1492 {
1493         struct sock *sk = container_of(head, struct sock, sk_rcu);
1494         struct sk_filter *filter;
1495
             /* Optional destructor callback stored in the sock. */
1496         if (sk->sk_destruct)
1497                 sk->sk_destruct(sk);
1498
             /*
              * The condition handed to rcu_dereference_check() is that
              * sk_wmem_alloc has already dropped to zero.
              */
1499         filter = rcu_dereference_check(sk->sk_filter,
1500                                        atomic_read(&sk->sk_wmem_alloc) == 0);
1501         if (filter) {
1502                 sk_filter_uncharge(sk, filter);
1503                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1504         }
1505         if (rcu_access_pointer(sk->sk_reuseport_cb))
1506                 reuseport_detach_sock(sk);
1507
1508         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1509
             /* A non-zero sk_omem_alloc at this point is only reported. */
1510         if (atomic_read(&sk->sk_omem_alloc))
1511                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1512                          __func__, atomic_read(&sk->sk_omem_alloc));
1513
1514         if (sk->sk_frag.page) {
1515                 put_page(sk->sk_frag.page);
1516                 sk->sk_frag.page = NULL;
1517         }
1518
1519         if (sk->sk_peer_cred)
1520                 put_cred(sk->sk_peer_cred);
1521         put_pid(sk->sk_peer_pid);
             /* Drop the netns reference only if this sock took one. */
1522         if (likely(sk->sk_net_refcnt))
1523                 put_net(sock_net(sk));
             /* Freed via sk_prot_creator rather than sk_prot. */
1524         sk_prot_free(sk->sk_prot_creator, sk);
1525 }
1526
/*
 * sk_destruct - destroy a sock, deferring through RCU when required
 * @sk: sock to destroy
 *
 * Sockets flagged SOCK_RCU_FREE are queued with call_rcu() so that
 * __sk_destruct() runs later as the RCU callback; all others are
 * destroyed immediately.
 */
1527 void sk_destruct(struct sock *sk)
1528 {
1529         if (sock_flag(sk, SOCK_RCU_FREE))
1530                 call_rcu(&sk->sk_rcu, __sk_destruct);
1531         else
1532                 __sk_destruct(&sk->sk_rcu);
1533 }
1534
/*
 * __sk_free - final step of sk_free() once sk_wmem_alloc has reached zero
 * @sk: sock to release
 *
 * If sock_diag has destroy listeners and the sock holds a netns reference
 * (sk_net_refcnt), the sock is handed to sock_diag_broadcast_destroy();
 * sk_destruct() is not called from here on that path, so presumably the
 * broadcast helper takes over the destruction (confirm in sock_diag).
 * Otherwise the sock is destroyed right away with sk_destruct().
 */
1535 static void __sk_free(struct sock *sk)
1536 {
1537         if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1538                 sock_diag_broadcast_destroy(sk);
1539         else
1540                 sk_destruct(sk);
1541 }
1542
/*
 * sk_free - drop the initial sk_wmem_alloc count and free the sock if unused
 * @sk: sock to release
 *
 * sk_wmem_alloc is decremented by one; __sk_free() is called only when
 * that decrement brings the counter to zero.
 */
1543 void sk_free(struct sock *sk)
1544 {
1545         /*
1546          * We subtract one from sk_wmem_alloc and can know if
1547          * some packets are still in some tx queue.
1548          * If not null, sock_wfree() will call __sk_free(sk) later
1549          */
1550         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1551                 __sk_free(sk);
1552 }
1553 EXPORT_SYMBOL(sk_free);
1554
/*
 * sk_init_common - initialise the queues and callback lock of a sock
 * @sk: sock to initialise; sk->sk_family must already be set because it
 *      indexes the lockdep key and name tables used below
 *
 * Initialises the receive, write and error skb queues and
 * sk_callback_lock, then assigns each of those four locks a
 * per-address-family lockdep class and name.
 */
1555 static void sk_init_common(struct sock *sk)
1556 {
1557         skb_queue_head_init(&sk->sk_receive_queue);
1558         skb_queue_head_init(&sk->sk_write_queue);
1559         skb_queue_head_init(&sk->sk_error_queue);
1560
1561         rwlock_init(&sk->sk_callback_lock);
             /* Lockdep classes: one per family for each of the four locks. */
1562         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1563                         af_rlock_keys + sk->sk_family,
1564                         af_family_rlock_key_strings[sk->sk_family]);
1565         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1566                         af_wlock_keys + sk->sk_family,
1567                         af_family_wlock_key_strings[sk->sk_family]);
1568         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1569                         af_elock_keys + sk->sk_family,
1570                         af_family_elock_key_strings[sk->sk_family]);
1571         lockdep_set_class_and_name(&sk->sk_callback_lock,
1572                         af_callback_keys + sk->sk_family,
1573                         af_family_clock_key_strings[sk->sk_family]);
1574 }
1575
1576 /**
1577  *      sk_clone_lock - clone a socket, and lock its clone
1578  *      @sk: the socket to clone
1579  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1580  *
1581  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1582  */
1583 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1584 {
1585         struct sock *newsk;
1586         bool is_charged = true;
1587
1588         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1589         if (newsk != NULL) {
1590                 struct sk_filter *filter;
1591
1592                 sock_copy(newsk, sk);
1593
1594                 /* SANITY */
1595                 if (likely(newsk->sk_net_refcnt))
1596                         get_net(sock_net(newsk));
1597                 sk_node_init(&newsk->sk_node);
1598                 sock_lock_init(newsk);
1599                 bh_lock_sock(newsk);
1600                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1601                 newsk->sk_backlog.len = 0;
1602
1603                 atomic_set(&newsk->sk_rmem_alloc, 0);
1604                 /*
1605                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1606                  */
1607                 atomic_set(&newsk->sk_wmem_alloc, 1);
1608                 atomic_set(&newsk->sk_omem_alloc, 0);
1609                 sk_init_common(newsk);
1610
1611                 newsk->sk_dst_cache     = NULL;
1612                 newsk->sk_dst_pending_confirm = 0;
1613                 newsk->sk_wmem_queued   = 0;
1614                 newsk->sk_forward_alloc = 0;
1615                 atomic_set(&newsk->sk_drops, 0);
1616                 newsk->sk_send_head     = NULL;
1617                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1618
1619                 sock_reset_flag(newsk, SOCK_DONE);
1620
1621                 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1622                 if (filter != NULL)
1623                         /* though it's an empty new sock, the charging may fail
1624                          * if sysctl_optmem_max was changed between creation of
1625                          * original socket and cloning
1626                          */
1627                         is_charged = sk_filter_charge(newsk, filter);
1628
1629                 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1630                         /* We need to make sure that we don't uncharge the new
1631                          * socket if we couldn't charge it in the first place
1632                          * as otherwise we uncharge the parent's filter.
1633                          */
1634                         if (!is_charged)
1635                                 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1636                         sk_free_unlock_clone(newsk);
1637                         newsk = NULL;
1638                         goto out;
1639                 }
1640                 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1641
1642                 newsk->sk_err      = 0;
1643                 newsk->sk_err_soft = 0;
1644                 newsk->sk_priority = 0;
1645                 newsk->sk_incoming_cpu = raw_smp_processor_id();
1646                 atomic64_set(&newsk->sk_cookie, 0);
1647
1648                 mem_cgroup_sk_alloc(newsk);
1649                 cgroup_sk_alloc(&newsk->sk_cgrp_data);
1650
1651                 /*
1652                  * Before updating sk_refcnt, we must commit prior changes to memory
1653                  * (Documentation/RCU/rculist_nulls.txt for details)
1654                  */
1655                 smp_wmb();
1656                 atomic_set(&newsk->sk_refcnt, 2);
1657
1658                 /*
1659                  * Increment the counter in the same struct proto as the master
1660                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1661                  * is the same as sk->sk_prot->socks, as this field was copied
1662                  * with memcpy).
1663                  *
1664                  * This _changes_ the previous behaviour, where
1665                  * tcp_create_openreq_child always was incrementing the
1666                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1667                  * to be taken into account in all callers. -acme
1668                  */
1669                 sk_refcnt_debug_inc(newsk);
1670                 sk_set_socket(newsk, NULL);
1671                 newsk->sk_wq = NULL;
1672
1673                 if (newsk->sk_prot->sockets_allocated)
1674                         sk_sockets_allocated_inc(newsk);
1675
1676                 if (sock_needs_netstamp(sk) &&
1677                     newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1678                         net_enable_timestamp();
1679         }
1680 out:
1681         return newsk;
1682 }
1683 EXPORT_SYMBOL_GPL(sk_clone_lock);
1684
1685 void sk_free_unlock_clone(struct sock *sk)
1686 {
1687         /* It is still raw copy of parent, so invalidate
1688          * destructor and make plain sk_free() */
1689         sk->sk_destruct = NULL;
1690         bh_unlock_sock(sk);
1691         sk_free(sk);
1692 }
1693 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1694
1695 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1696 {
1697         u32 max_segs = 1;
1698
1699         sk_dst_set(sk, dst);
1700         sk->sk_route_caps = dst->dev->features;
1701         if (sk->sk_route_caps & NETIF_F_GSO)
1702                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1703         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1704         if (sk_can_gso(sk)) {
1705                 if (dst->header_len) {
1706                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1707                 } else {
1708                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1709                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1710                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1711                 }
1712         }
1713         sk->sk_gso_max_segs = max_segs;
1714 }
1715 EXPORT_SYMBOL_GPL(sk_setup_caps);
1716
1717 /*
1718  *      Simple resource managers for sockets.
1719  */
1720
1721
1722 /*
1723  * Write buffer destructor automatically called from kfree_skb.
1724  */
1725 void sock_wfree(struct sk_buff *skb)
1726 {
1727         struct sock *sk = skb->sk;
1728         unsigned int len = skb->truesize;
1729
1730         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1731                 /*
1732                  * Keep a reference on sk_wmem_alloc, this will be released
1733                  * after sk_write_space() call
1734                  */
1735                 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1736                 sk->sk_write_space(sk);
1737                 len = 1;
1738         }
1739         /*
1740          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1741          * could not do because of in-flight packets
1742          */
1743         if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1744                 __sk_free(sk);
1745 }
1746 EXPORT_SYMBOL(sock_wfree);
1747
1748 /* This variant of sock_wfree() is used by TCP,
1749  * since it sets SOCK_USE_WRITE_QUEUE.
1750  */
1751 void __sock_wfree(struct sk_buff *skb)
1752 {
1753         struct sock *sk = skb->sk;
1754
1755         if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1756                 __sk_free(sk);
1757 }
1758
1759 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1760 {
1761         skb_orphan(skb);
1762         skb->sk = sk;
1763 #ifdef CONFIG_INET
1764         if (unlikely(!sk_fullsock(sk))) {
1765                 skb->destructor = sock_edemux;
1766                 sock_hold(sk);
1767                 return;
1768         }
1769 #endif
1770         skb->destructor = sock_wfree;
1771         skb_set_hash_from_sk(skb, sk);
1772         /*
1773          * We used to take a refcount on sk, but following operation
1774          * is enough to guarantee sk_free() wont free this sock until
1775          * all in-flight packets are completed
1776          */
1777         atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1778 }
1779 EXPORT_SYMBOL(skb_set_owner_w);
1780
1781 /* This helper is used by netem, as it can hold packets in its
1782  * delay queue. We want to allow the owner socket to send more
1783  * packets, as if they were already TX completed by a typical driver.
1784  * But we also want to keep skb->sk set because some packet schedulers
1785  * rely on it (sch_fq for example). So we set skb->truesize to a small
1786  * amount (1) and decrease sk_wmem_alloc accordingly.
1787  */
1788 void skb_orphan_partial(struct sk_buff *skb)
1789 {
1790         /* If this skb is a TCP pure ACK or already went here,
1791          * we have nothing to do. 2 is already a very small truesize.
1792          */
1793         if (skb->truesize <= 2)
1794                 return;
1795
1796         /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1797          * so we do not completely orphan skb, but transfert all
1798          * accounted bytes but one, to avoid unexpected reorders.
1799          */
1800         if (skb->destructor == sock_wfree
1801 #ifdef CONFIG_INET
1802             || skb->destructor == tcp_wfree
1803 #endif
1804                 ) {
1805                 atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1806                 skb->truesize = 1;
1807         } else {
1808                 skb_orphan(skb);
1809         }
1810 }
1811 EXPORT_SYMBOL(skb_orphan_partial);
1812
1813 /*
1814  * Read buffer destructor automatically called from kfree_skb.
1815  */
1816 void sock_rfree(struct sk_buff *skb)
1817 {
1818         struct sock *sk = skb->sk;
1819         unsigned int len = skb->truesize;
1820
1821         atomic_sub(len, &sk->sk_rmem_alloc);
1822         sk_mem_uncharge(sk, len);
1823 }
1824 EXPORT_SYMBOL(sock_rfree);
1825
1826 /*
1827  * Buffer destructor for skbs that are not used directly in read or write
1828  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1829  */
1830 void sock_efree(struct sk_buff *skb)
1831 {
1832         sock_put(skb->sk);
1833 }
1834 EXPORT_SYMBOL(sock_efree);
1835
1836 kuid_t sock_i_uid(struct sock *sk)
1837 {
1838         kuid_t uid;
1839
1840         read_lock_bh(&sk->sk_callback_lock);
1841         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1842         read_unlock_bh(&sk->sk_callback_lock);
1843         return uid;
1844 }
1845 EXPORT_SYMBOL(sock_i_uid);
1846
1847 unsigned long sock_i_ino(struct sock *sk)
1848 {
1849         unsigned long ino;
1850
1851         read_lock_bh(&sk->sk_callback_lock);
1852         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1853         read_unlock_bh(&sk->sk_callback_lock);
1854         return ino;
1855 }
1856 EXPORT_SYMBOL(sock_i_ino);
1857
1858 /*
1859  * Allocate a skb from the socket's send buffer.
1860  */
1861 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1862                              gfp_t priority)
1863 {
1864         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1865                 struct sk_buff *skb = alloc_skb(size, priority);
1866                 if (skb) {
1867                         skb_set_owner_w(skb, sk);
1868                         return skb;
1869                 }
1870         }
1871         return NULL;
1872 }
1873 EXPORT_SYMBOL(sock_wmalloc);
1874
1875 /*
1876  * Allocate a memory block from the socket's option memory buffer.
1877  */
1878 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1879 {
1880         if ((unsigned int)size <= sysctl_optmem_max &&
1881             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1882                 void *mem;
1883                 /* First do the add, to avoid the race if kmalloc
1884                  * might sleep.
1885                  */
1886                 atomic_add(size, &sk->sk_omem_alloc);
1887                 mem = kmalloc(size, priority);
1888                 if (mem)
1889                         return mem;
1890                 atomic_sub(size, &sk->sk_omem_alloc);
1891         }
1892         return NULL;
1893 }
1894 EXPORT_SYMBOL(sock_kmalloc);
1895
1896 /* Free an option memory block. Note, we actually want the inline
1897  * here as this allows gcc to detect the nullify and fold away the
1898  * condition entirely.
1899  */
1900 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1901                                   const bool nullify)
1902 {
1903         if (WARN_ON_ONCE(!mem))
1904                 return;
1905         if (nullify)
1906                 kzfree(mem);
1907         else
1908                 kfree(mem);
1909         atomic_sub(size, &sk->sk_omem_alloc);
1910 }
1911
1912 void sock_kfree_s(struct sock *sk, void *mem, int size)
1913 {
1914         __sock_kfree_s(sk, mem, size, false);
1915 }
1916 EXPORT_SYMBOL(sock_kfree_s);
1917
1918 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1919 {
1920         __sock_kfree_s(sk, mem, size, true);
1921 }
1922 EXPORT_SYMBOL(sock_kzfree_s);
1923
1924 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1925    I think, these locks should be removed for datagram sockets.
1926  */
1927 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1928 {
1929         DEFINE_WAIT(wait);
1930
1931         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1932         for (;;) {
1933                 if (!timeo)
1934                         break;
1935                 if (signal_pending(current))
1936                         break;
1937                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1938                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1939                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1940                         break;
1941                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1942                         break;
1943                 if (sk->sk_err)
1944                         break;
1945                 timeo = schedule_timeout(timeo);
1946         }
1947         finish_wait(sk_sleep(sk), &wait);
1948         return timeo;
1949 }
1950
1951
1952 /*
1953  *      Generic send/receive buffer handlers
1954  */
1955
1956 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1957                                      unsigned long data_len, int noblock,
1958                                      int *errcode, int max_page_order)
1959 {
1960         struct sk_buff *skb;
1961         long timeo;
1962         int err;
1963
1964         timeo = sock_sndtimeo(sk, noblock);
1965         for (;;) {
1966                 err = sock_error(sk);
1967                 if (err != 0)
1968                         goto failure;
1969
1970                 err = -EPIPE;
1971                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1972                         goto failure;
1973
1974                 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1975                         break;
1976
1977                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1978                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1979                 err = -EAGAIN;
1980                 if (!timeo)
1981                         goto failure;
1982                 if (signal_pending(current))
1983                         goto interrupted;
1984                 timeo = sock_wait_for_wmem(sk, timeo);
1985         }
1986         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1987                                    errcode, sk->sk_allocation);
1988         if (skb)
1989                 skb_set_owner_w(skb, sk);
1990         return skb;
1991
1992 interrupted:
1993         err = sock_intr_errno(timeo);
1994 failure:
1995         *errcode = err;
1996         return NULL;
1997 }
1998 EXPORT_SYMBOL(sock_alloc_send_pskb);
1999
/**
 * sock_alloc_send_skb - allocate a purely linear send skb
 * @sk: socket the skb is charged to
 * @size: linear size
 * @noblock: passed on to sock_alloc_send_pskb()
 * @errcode: receives the error when NULL is returned
 *
 * Shorthand for sock_alloc_send_pskb() with no paged data and a
 * max_page_order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
{
        const unsigned long data_len = 0;
        const int max_page_order = 0;

        return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
                                    max_page_order);
}
2005 EXPORT_SYMBOL(sock_alloc_send_skb);
2006
2007 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2008                      struct sockcm_cookie *sockc)
2009 {
2010         u32 tsflags;
2011
2012         switch (cmsg->cmsg_type) {
2013         case SO_MARK:
2014                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2015                         return -EPERM;
2016                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2017                         return -EINVAL;
2018                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2019                 break;
2020         case SO_TIMESTAMPING:
2021                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2022                         return -EINVAL;
2023
2024                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2025                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2026                         return -EINVAL;
2027
2028                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2029                 sockc->tsflags |= tsflags;
2030                 break;
2031         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2032         case SCM_RIGHTS:
2033         case SCM_CREDENTIALS:
2034                 break;
2035         default:
2036                 return -EINVAL;
2037         }
2038         return 0;
2039 }
2040 EXPORT_SYMBOL(__sock_cmsg_send);
2041
2042 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2043                    struct sockcm_cookie *sockc)
2044 {
2045         struct cmsghdr *cmsg;
2046         int ret;
2047
2048         for_each_cmsghdr(cmsg, msg) {
2049                 if (!CMSG_OK(msg, cmsg))
2050                         return -EINVAL;
2051                 if (cmsg->cmsg_level != SOL_SOCKET)
2052                         continue;
2053                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2054                 if (ret)
2055                         return ret;
2056         }
2057         return 0;
2058 }
2059 EXPORT_SYMBOL(sock_cmsg_send);
2060
2061 /* On 32bit arches, an skb frag is limited to 2^15 */
2062 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
2063
2064 /**
2065  * skb_page_frag_refill - check that a page_frag contains enough room
2066  * @sz: minimum size of the fragment we want to get
2067  * @pfrag: pointer to page_frag
2068  * @gfp: priority for memory allocation
2069  *
2070  * Note: While this allocator tries to use high order pages, there is
2071  * no guarantee that allocations succeed. Therefore, @sz MUST be
2072  * less or equal than PAGE_SIZE.
2073  */
2074 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2075 {
2076         if (pfrag->page) {
2077                 if (page_ref_count(pfrag->page) == 1) {
2078                         pfrag->offset = 0;
2079                         return true;
2080                 }
2081                 if (pfrag->offset + sz <= pfrag->size)
2082                         return true;
2083                 put_page(pfrag->page);
2084         }
2085
2086         pfrag->offset = 0;
2087         if (SKB_FRAG_PAGE_ORDER) {
2088                 /* Avoid direct reclaim but allow kswapd to wake */
2089                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2090                                           __GFP_COMP | __GFP_NOWARN |
2091                                           __GFP_NORETRY,
2092                                           SKB_FRAG_PAGE_ORDER);
2093                 if (likely(pfrag->page)) {
2094                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2095                         return true;
2096                 }
2097         }
2098         pfrag->page = alloc_page(gfp);
2099         if (likely(pfrag->page)) {
2100                 pfrag->size = PAGE_SIZE;
2101                 return true;
2102         }
2103         return false;
2104 }
2105 EXPORT_SYMBOL(skb_page_frag_refill);
2106
2107 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2108 {
2109         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2110                 return true;
2111
2112         sk_enter_memory_pressure(sk);
2113         sk_stream_moderate_sndbuf(sk);
2114         return false;
2115 }
2116 EXPORT_SYMBOL(sk_page_frag_refill);
2117
2118 static void __lock_sock(struct sock *sk)
2119         __releases(&sk->sk_lock.slock)
2120         __acquires(&sk->sk_lock.slock)
2121 {
2122         DEFINE_WAIT(wait);
2123
2124         for (;;) {
2125                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2126                                         TASK_UNINTERRUPTIBLE);
2127                 spin_unlock_bh(&sk->sk_lock.slock);
2128                 schedule();
2129                 spin_lock_bh(&sk->sk_lock.slock);
2130                 if (!sock_owned_by_user(sk))
2131                         break;
2132         }
2133         finish_wait(&sk->sk_lock.wq, &wait);
2134 }
2135
2136 static void __release_sock(struct sock *sk)
2137         __releases(&sk->sk_lock.slock)
2138         __acquires(&sk->sk_lock.slock)
2139 {
2140         struct sk_buff *skb, *next;
2141
2142         while ((skb = sk->sk_backlog.head) != NULL) {
2143                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2144
2145                 spin_unlock_bh(&sk->sk_lock.slock);
2146
2147                 do {
2148                         next = skb->next;
2149                         prefetch(next);
2150                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2151                         skb->next = NULL;
2152                         sk_backlog_rcv(sk, skb);
2153
2154                         cond_resched();
2155
2156                         skb = next;
2157                 } while (skb != NULL);
2158
2159                 spin_lock_bh(&sk->sk_lock.slock);
2160         }
2161
2162         /*
2163          * Doing the zeroing here guarantee we can not loop forever
2164          * while a wild producer attempts to flood us.
2165          */
2166         sk->sk_backlog.len = 0;
2167 }
2168
2169 void __sk_flush_backlog(struct sock *sk)
2170 {
2171         spin_lock_bh(&sk->sk_lock.slock);
2172         __release_sock(sk);
2173         spin_unlock_bh(&sk->sk_lock.slock);
2174 }
2175
2176 /**
2177  * sk_wait_data - wait for data to arrive at sk_receive_queue
2178  * @sk:    sock to wait on
2179  * @timeo: for how long
2180  * @skb:   last skb seen on sk_receive_queue
2181  *
2182  * Now socket state including sk->sk_err is changed only under lock,
2183  * hence we may omit checks after joining wait queue.
2184  * We check receive queue before schedule() only as optimization;
2185  * it is very likely that release_sock() added new data.
2186  */
2187 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2188 {
2189         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2190         int rc;
2191
2192         add_wait_queue(sk_sleep(sk), &wait);
2193         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2194         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2195         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2196         remove_wait_queue(sk_sleep(sk), &wait);
2197         return rc;
2198 }
2199 EXPORT_SYMBOL(sk_wait_data);
2200
2201 /**
2202  *      __sk_mem_raise_allocated - increase memory_allocated
2203  *      @sk: socket
2204  *      @size: memory size to allocate
2205  *      @amt: pages to allocate
2206  *      @kind: allocation type
2207  *
2208  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2209  */
2210 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2211 {
2212         struct proto *prot = sk->sk_prot;
2213         long allocated = sk_memory_allocated_add(sk, amt);
2214
2215         if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2216             !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2217                 goto suppress_allocation;
2218
2219         /* Under limit. */
2220         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2221                 sk_leave_memory_pressure(sk);
2222                 return 1;
2223         }
2224
2225         /* Under pressure. */
2226         if (allocated > sk_prot_mem_limits(sk, 1))
2227                 sk_enter_memory_pressure(sk);
2228
2229         /* Over hard limit. */
2230         if (allocated > sk_prot_mem_limits(sk, 2))
2231                 goto suppress_allocation;
2232
2233         /* guarantee minimum buffer size under pressure */
2234         if (kind == SK_MEM_RECV) {
2235                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2236                         return 1;
2237
2238         } else { /* SK_MEM_SEND */
2239                 if (sk->sk_type == SOCK_STREAM) {
2240                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2241                                 return 1;
2242                 } else if (atomic_read(&sk->sk_wmem_alloc) <
2243                            prot->sysctl_wmem[0])
2244                                 return 1;
2245         }
2246
2247         if (sk_has_memory_pressure(sk)) {
2248                 int alloc;
2249
2250                 if (!sk_under_memory_pressure(sk))
2251                         return 1;
2252                 alloc = sk_sockets_allocated_read_positive(sk);
2253                 if (sk_prot_mem_limits(sk, 2) > alloc *
2254                     sk_mem_pages(sk->sk_wmem_queued +
2255                                  atomic_read(&sk->sk_rmem_alloc) +
2256                                  sk->sk_forward_alloc))
2257                         return 1;
2258         }
2259
2260 suppress_allocation:
2261
2262         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2263                 sk_stream_moderate_sndbuf(sk);
2264
2265                 /* Fail only if socket is _under_ its sndbuf.
2266                  * In this case we cannot block, so that we have to fail.
2267                  */
2268                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2269                         return 1;
2270         }
2271
2272         trace_sock_exceed_buf_limit(sk, prot, allocated);
2273
2274         sk_memory_allocated_sub(sk, amt);
2275
2276         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2277                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2278
2279         return 0;
2280 }
2281 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2282
2283 /**
2284  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2285  *      @sk: socket
2286  *      @size: memory size to allocate
2287  *      @kind: allocation type
2288  *
2289  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2290  *      rmem allocation. This function assumes that protocols which have
2291  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2292  */
2293 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2294 {
2295         int ret, amt = sk_mem_pages(size);
2296
2297         sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2298         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2299         if (!ret)
2300                 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2301         return ret;
2302 }
2303 EXPORT_SYMBOL(__sk_mem_schedule);
2304
2305 /**
2306  *      __sk_mem_reduce_allocated - reclaim memory_allocated
2307  *      @sk: socket
2308  *      @amount: number of quanta
2309  *
2310  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2311  */
2312 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2313 {
2314         sk_memory_allocated_sub(sk, amount);
2315
2316         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2317                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2318
2319         if (sk_under_memory_pressure(sk) &&
2320             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2321                 sk_leave_memory_pressure(sk);
2322 }
2323 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2324
2325 /**
2326  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2327  *      @sk: socket
2328  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2329  */
2330 void __sk_mem_reclaim(struct sock *sk, int amount)
2331 {
2332         amount >>= SK_MEM_QUANTUM_SHIFT;
2333         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2334         __sk_mem_reduce_allocated(sk, amount);
2335 }
2336 EXPORT_SYMBOL(__sk_mem_reclaim);
2337
2338 int sk_set_peek_off(struct sock *sk, int val)
2339 {
2340         if (val < 0)
2341                 return -EINVAL;
2342
2343         sk->sk_peek_off = val;
2344         return 0;
2345 }
2346 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2347
2348 /*
2349  * Set of default routines for initialising struct proto_ops when
2350  * the protocol does not support a particular function. In certain
2351  * cases where it makes no sense for a protocol to have a "do nothing"
2352  * function, some default processing is provided.
2353  */
2354
/**
 * sock_no_bind - bind stub for protocols without bind support
 * @sock: ignored
 * @saddr: ignored
 * @len: ignored
 *
 * Always returns -EOPNOTSUPP.
 */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}
2359 EXPORT_SYMBOL(sock_no_bind);
2360
/**
 * sock_no_connect - connect stub for protocols without connect support
 * @sock: ignored
 * @saddr: ignored
 * @len: ignored
 * @flags: ignored
 *
 * Always returns -EOPNOTSUPP.
 */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
{
        return -EOPNOTSUPP;
}
2366 EXPORT_SYMBOL(sock_no_connect);
2367
/**
 * sock_no_socketpair - socketpair stub
 * @sock1: ignored
 * @sock2: ignored
 *
 * Always returns -EOPNOTSUPP.
 */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}
2372 EXPORT_SYMBOL(sock_no_socketpair);
2373
/**
 * sock_no_accept - accept stub for protocols without accept support
 * @sock: ignored
 * @newsock: ignored
 * @flags: ignored
 * @kern: ignored
 *
 * Always returns -EOPNOTSUPP.
 */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
                   bool kern)
{
        return -EOPNOTSUPP;
}
2379 EXPORT_SYMBOL(sock_no_accept);
2380
/**
 * sock_no_getname - getname stub
 * @sock: ignored
 * @saddr: ignored, nothing is written
 * @len: ignored, nothing is written
 * @peer: ignored
 *
 * Always returns -EOPNOTSUPP.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int *len, int peer)
{
        return -EOPNOTSUPP;
}
2386 EXPORT_SYMBOL(sock_no_getname);
2387
2388 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2389 {
2390         return 0;
2391 }
2392 EXPORT_SYMBOL(sock_no_poll);
2393
/**
 * sock_no_ioctl - ioctl stub
 * @sock: ignored
 * @cmd: ignored
 * @arg: ignored
 *
 * Always returns -EOPNOTSUPP.
 */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}
2398 EXPORT_SYMBOL(sock_no_ioctl);
2399
/**
 * sock_no_listen - listen stub
 * @sock: ignored
 * @backlog: ignored
 *
 * Always returns -EOPNOTSUPP.
 */
int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}
2404 EXPORT_SYMBOL(sock_no_listen);
2405
/**
 * sock_no_shutdown - shutdown stub
 * @sock: ignored
 * @how: ignored
 *
 * Always returns -EOPNOTSUPP.
 */
int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}
2410 EXPORT_SYMBOL(sock_no_shutdown);
2411
2412 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2413                     char __user *optval, unsigned int optlen)
2414 {
2415         return -EOPNOTSUPP;
2416 }
2417 EXPORT_SYMBOL(sock_no_setsockopt);
2418
2419 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2420                     char __user *optval, int __user *optlen)
2421 {
2422         return -EOPNOTSUPP;
2423 }
2424 EXPORT_SYMBOL(sock_no_getsockopt);
2425
/**
 * sock_no_sendmsg - sendmsg stub
 * @sock: ignored
 * @m: ignored
 * @len: ignored
 *
 * Always returns -EOPNOTSUPP.
 */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
2430 EXPORT_SYMBOL(sock_no_sendmsg);
2431
/**
 * sock_no_recvmsg - recvmsg stub
 * @sock: ignored
 * @m: ignored
 * @len: ignored
 * @flags: ignored
 *
 * Always returns -EOPNOTSUPP.
 */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
                    int flags)
{
        return -EOPNOTSUPP;
}
2437 EXPORT_SYMBOL(sock_no_recvmsg);
2438
/**
 * sock_no_mmap - mmap stub
 * @file: ignored
 * @sock: ignored
 * @vma: ignored
 *
 * Always returns -ENODEV, mirroring the error code of a missing mmap
 * method.
 */
int sock_no_mmap(struct file *file, struct socket *sock,
                 struct vm_area_struct *vma)
{
        return -ENODEV;
}
2444 EXPORT_SYMBOL(sock_no_mmap);
2445
2446 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2447 {
2448         ssize_t res;
2449         struct msghdr msg = {.msg_flags = flags};
2450         struct kvec iov;
2451         char *kaddr = kmap(page);
2452         iov.iov_base = kaddr + offset;
2453         iov.iov_len = size;
2454         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2455         kunmap(page);
2456         return res;
2457 }
2458 EXPORT_SYMBOL(sock_no_sendpage);
2459
2460 /*
2461  *      Default Socket Callbacks
2462  */
2463
2464 static void sock_def_wakeup(struct sock *sk)
2465 {
2466         struct socket_wq *wq;
2467
2468         rcu_read_lock();
2469         wq = rcu_dereference(sk->sk_wq);
2470         if (skwq_has_sleeper(wq))
2471                 wake_up_interruptible_all(&wq->wait);
2472         rcu_read_unlock();
2473 }
2474
2475 static void sock_def_error_report(struct sock *sk)
2476 {
2477         struct socket_wq *wq;
2478
2479         rcu_read_lock();
2480         wq = rcu_dereference(sk->sk_wq);
2481         if (skwq_has_sleeper(wq))
2482                 wake_up_interruptible_poll(&wq->wait, POLLERR);
2483         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2484         rcu_read_unlock();
2485 }
2486
2487 static void sock_def_readable(struct sock *sk)
2488 {
2489         struct socket_wq *wq;
2490
2491         rcu_read_lock();
2492         wq = rcu_dereference(sk->sk_wq);
2493         if (skwq_has_sleeper(wq))
2494                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2495                                                 POLLRDNORM | POLLRDBAND);
2496         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2497         rcu_read_unlock();
2498 }
2499
2500 static void sock_def_write_space(struct sock *sk)
2501 {
2502         struct socket_wq *wq;
2503
2504         rcu_read_lock();
2505
2506         /* Do not wake up a writer until he can make "significant"
2507          * progress.  --DaveM
2508          */
2509         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2510                 wq = rcu_dereference(sk->sk_wq);
2511                 if (skwq_has_sleeper(wq))
2512                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2513                                                 POLLWRNORM | POLLWRBAND);
2514
2515                 /* Should agree with poll, otherwise some programs break */
2516                 if (sock_writeable(sk))
2517                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2518         }
2519
2520         rcu_read_unlock();
2521 }
2522
/* Default destructor callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
        /* nothing to tear down */
}
2526
2527 void sk_send_sigurg(struct sock *sk)
2528 {
2529         if (sk->sk_socket && sk->sk_socket->file)
2530                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2531                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2532 }
2533 EXPORT_SYMBOL(sk_send_sigurg);
2534
/*
 * (Re)arm @timer to fire at @expires.  A reference on @sk is taken when
 * mod_timer() returns zero (presumably: the timer was not already pending,
 * so it did not yet own a reference - see the mod_timer() documentation).
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	int was_pending = mod_timer(timer, expires);

	if (was_pending == 0)
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
2542
/*
 * Cancel @timer.  When del_timer() returns non-zero the reference on @sk
 * is dropped with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	int was_pending = del_timer(timer);

	if (was_pending != 0)
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
2549
/*
 * sock_init_data - fill in the generic part of a freshly allocated sock
 * @sock: struct socket to attach to, or NULL
 * @sk: sock being initialised
 *
 * Sets default buffer sizes, state, timeouts and the default callbacks,
 * links @sk with @sock when one is given, and finally publishes the sock
 * by setting its reference count to 1.
 */
2550 void sock_init_data(struct socket *sock, struct sock *sk)
2551 {
2552         sk_init_common(sk);
2553         sk->sk_send_head        =       NULL;
2554
2555         init_timer(&sk->sk_timer);
2556
2557         sk->sk_allocation       =       GFP_KERNEL;
2558         sk->sk_rcvbuf           =       sysctl_rmem_default;
2559         sk->sk_sndbuf           =       sysctl_wmem_default;
2560         sk->sk_state            =       TCP_CLOSE;
2561         sk_set_socket(sk, sock);
2562
2563         sock_set_flag(sk, SOCK_ZAPPED);
2564
             /*
              * With a struct socket: inherit its type and wait queue, point
              * it back at this sock and take the uid from the socket inode.
              * Without one: no wait queue, and uid 0 mapped through the
              * user namespace of the sock's network namespace.
              */
2565         if (sock) {
2566                 sk->sk_type     =       sock->type;
2567                 sk->sk_wq       =       sock->wq;
2568                 sock->sk        =       sk;
2569                 sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
2570         } else {
2571                 sk->sk_wq       =       NULL;
2572                 sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
2573         }
2574
             /*
              * sk_callback_lock gets a per-address-family lockdep class;
              * kernel sockets use a separate set of classes and names.
              */
2575         rwlock_init(&sk->sk_callback_lock);
2576         if (sk->sk_kern_sock)
2577                 lockdep_set_class_and_name(
2578                         &sk->sk_callback_lock,
2579                         af_kern_callback_keys + sk->sk_family,
2580                         af_family_kern_clock_key_strings[sk->sk_family]);
2581         else
2582                 lockdep_set_class_and_name(
2583                         &sk->sk_callback_lock,
2584                         af_callback_keys + sk->sk_family,
2585                         af_family_clock_key_strings[sk->sk_family]);
2586
             /* Default callbacks; protocols may replace them afterwards. */
2587         sk->sk_state_change     =       sock_def_wakeup;
2588         sk->sk_data_ready       =       sock_def_readable;
2589         sk->sk_write_space      =       sock_def_write_space;
2590         sk->sk_error_report     =       sock_def_error_report;
2591         sk->sk_destruct         =       sock_def_destruct;
2592
2593         sk->sk_frag.page        =       NULL;
2594         sk->sk_frag.offset      =       0;
2595         sk->sk_peek_off         =       -1;
2596
2597         sk->sk_peer_pid         =       NULL;
2598         sk->sk_peer_cred        =       NULL;
2599         sk->sk_write_pending    =       0;
2600         sk->sk_rcvlowat         =       1;
2601         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2602         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2603
             /*
              * A seconds value of -1 is what sock_get_timestamp() and
              * sock_get_timestampns() report as -ENOENT.
              */
2604         sk->sk_stamp = ktime_set(-1L, 0);
2605
2606 #ifdef CONFIG_NET_RX_BUSY_POLL
2607         sk->sk_napi_id          =       0;
2608         sk->sk_ll_usec          =       sysctl_net_busy_read;
2609 #endif
2610
2611         sk->sk_max_pacing_rate = ~0U;
2612         sk->sk_pacing_rate = ~0U;
2613         sk->sk_incoming_cpu = -1;
2614         /*
2615          * Before updating sk_refcnt, we must commit prior changes to memory
2616          * (Documentation/RCU/rculist_nulls.txt for details)
2617          */
2618         smp_wmb();
2619         atomic_set(&sk->sk_refcnt, 1);
2620         atomic_set(&sk->sk_drops, 0);
2621 }
2622 EXPORT_SYMBOL(sock_init_data);
2623
/*
 * lock_sock_nested - take ownership of the socket lock (may sleep)
 * @sk: socket to lock
 * @subclass: lockdep subclass passed to mutex_acquire()
 *
 * Ownership is recorded in sk_lock.owned under the sk_lock.slock spinlock.
 * If the socket is already owned, __lock_sock() is called first
 * (presumably it waits for the current owner to release - confirm there).
 */
2624 void lock_sock_nested(struct sock *sk, int subclass)
2625 {
2626         might_sleep();
2627         spin_lock_bh(&sk->sk_lock.slock);
2628         if (sk->sk_lock.owned)
2629                 __lock_sock(sk);
2630         sk->sk_lock.owned = 1;
             /*
              * Plain spin_unlock(): bottom halves stay disabled until the
              * local_bh_enable() below, after the lockdep annotation.
              */
2631         spin_unlock(&sk->sk_lock.slock);
2632         /*
2633          * The sk_lock has mutex_lock() semantics here:
2634          */
2635         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2636         local_bh_enable();
2637 }
2638 EXPORT_SYMBOL(lock_sock_nested);
2639
/*
 * release_sock - give up ownership of the socket lock
 * @sk: socket to unlock
 *
 * All steps run under sk_lock.slock with bottom halves disabled: pending
 * backlog is handed to __release_sock(), the protocol's optional
 * release_cb() hook is run, ownership is dropped and any waiter on
 * sk_lock.wq is woken.
 */
2640 void release_sock(struct sock *sk)
2641 {
2642         spin_lock_bh(&sk->sk_lock.slock);
             /* A non-NULL backlog tail means there is queued backlog. */
2643         if (sk->sk_backlog.tail)
2644                 __release_sock(sk);
2645
2646         /* Warning : release_cb() might need to release sk ownership,
2647          * ie call sock_release_ownership(sk) before us.
2648          */
2649         if (sk->sk_prot->release_cb)
2650                 sk->sk_prot->release_cb(sk);
2651
2652         sock_release_ownership(sk);
             /* Only call wake_up() when somebody is actually queued. */
2653         if (waitqueue_active(&sk->sk_lock.wq))
2654                 wake_up(&sk->sk_lock.wq);
2655         spin_unlock_bh(&sk->sk_lock.slock);
2656 }
2657 EXPORT_SYMBOL(release_sock);
2658
2659 /**
2660  * lock_sock_fast - fast version of lock_sock
2661  * @sk: socket
2662  *
2663  * This version should be used for very small sections, where the process won't block.
2664  * return false if fast path is taken
2665  *   sk_lock.slock locked, owned = 0, BH disabled
2666  * return true if slow path is taken
2667  *   sk_lock.slock unlocked, owned = 1, BH enabled
2668  */
2669 bool lock_sock_fast(struct sock *sk)
2670 {
2671         might_sleep();
2672         spin_lock_bh(&sk->sk_lock.slock);
2673
             /*
              * Fast path: nobody owns the socket, so return while still
              * holding the spinlock (with BH disabled).
              */
2674         if (!sk->sk_lock.owned)
2675                 /*
2676                  * Note : We must disable BH
2677                  */
2678                 return false;
2679
             /* Slow path: same sequence as lock_sock_nested(), subclass 0. */
2680         __lock_sock(sk);
2681         sk->sk_lock.owned = 1;
2682         spin_unlock(&sk->sk_lock.slock);
2683         /*
2684          * The sk_lock has mutex_lock() semantics here:
2685          */
2686         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2687         local_bh_enable();
2688         return true;
2689 }
2690 EXPORT_SYMBOL(lock_sock_fast);
2691
2692 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2693 {
2694         struct timeval tv;
2695         if (!sock_flag(sk, SOCK_TIMESTAMP))
2696                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2697         tv = ktime_to_timeval(sk->sk_stamp);
2698         if (tv.tv_sec == -1)
2699                 return -ENOENT;
2700         if (tv.tv_sec == 0) {
2701                 sk->sk_stamp = ktime_get_real();
2702                 tv = ktime_to_timeval(sk->sk_stamp);
2703         }
2704         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2705 }
2706 EXPORT_SYMBOL(sock_get_timestamp);
2707
2708 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2709 {
2710         struct timespec ts;
2711         if (!sock_flag(sk, SOCK_TIMESTAMP))
2712                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2713         ts = ktime_to_timespec(sk->sk_stamp);
2714         if (ts.tv_sec == -1)
2715                 return -ENOENT;
2716         if (ts.tv_sec == 0) {
2717                 sk->sk_stamp = ktime_get_real();
2718                 ts = ktime_to_timespec(sk->sk_stamp);
2719         }
2720         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2721 }
2722 EXPORT_SYMBOL(sock_get_timestampns);
2723
2724 void sock_enable_timestamp(struct sock *sk, int flag)
2725 {
2726         if (!sock_flag(sk, flag)) {
2727                 unsigned long previous_flags = sk->sk_flags;
2728
2729                 sock_set_flag(sk, flag);
2730                 /*
2731                  * we just set one of the two flags which require net
2732                  * time stamping, but time stamping might have been on
2733                  * already because of the other one
2734                  */
2735                 if (sock_needs_netstamp(sk) &&
2736                     !(previous_flags & SK_FLAGS_TIMESTAMP))
2737                         net_enable_timestamp();
2738         }
2739 }
2740
2741 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2742                        int level, int type)
2743 {
2744         struct sock_exterr_skb *serr;
2745         struct sk_buff *skb;
2746         int copied, err;
2747
2748         err = -EAGAIN;
2749         skb = sock_dequeue_err_skb(sk);
2750         if (skb == NULL)
2751                 goto out;
2752
2753         copied = skb->len;
2754         if (copied > len) {
2755                 msg->msg_flags |= MSG_TRUNC;
2756                 copied = len;
2757         }
2758         err = skb_copy_datagram_msg(skb, 0, msg, copied);
2759         if (err)
2760                 goto out_free_skb;
2761
2762         sock_recv_timestamp(msg, sk, skb);
2763
2764         serr = SKB_EXT_ERR(skb);
2765         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2766
2767         msg->msg_flags |= MSG_ERRQUEUE;
2768         err = copied;
2769
2770 out_free_skb:
2771         kfree_skb(skb);
2772 out:
2773         return err;
2774 }
2775 EXPORT_SYMBOL(sock_recv_errqueue);
2776
2777 /*
2778  *      Get a socket option on an socket.
2779  *
2780  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2781  *      asynchronous errors should be reported by getsockopt. We assume
2782  *      this means if you specify SO_ERROR (otherwise whats the point of it).
2783  */
2784 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2785                            char __user *optval, int __user *optlen)
2786 {
2787         struct sock *sk = sock->sk;
2788
2789         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2790 }
2791 EXPORT_SYMBOL(sock_common_getsockopt);
2792
2793 #ifdef CONFIG_COMPAT
2794 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2795                                   char __user *optval, int __user *optlen)
2796 {
2797         struct sock *sk = sock->sk;
2798
2799         if (sk->sk_prot->compat_getsockopt != NULL)
2800                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2801                                                       optval, optlen);
2802         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2803 }
2804 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2805 #endif
2806
2807 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2808                         int flags)
2809 {
2810         struct sock *sk = sock->sk;
2811         int addr_len = 0;
2812         int err;
2813
2814         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2815                                    flags & ~MSG_DONTWAIT, &addr_len);
2816         if (err >= 0)
2817                 msg->msg_namelen = addr_len;
2818         return err;
2819 }
2820 EXPORT_SYMBOL(sock_common_recvmsg);
2821
2822 /*
2823  *      Set socket options on an inet socket.
2824  */
2825 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2826                            char __user *optval, unsigned int optlen)
2827 {
2828         struct sock *sk = sock->sk;
2829
2830         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2831 }
2832 EXPORT_SYMBOL(sock_common_setsockopt);
2833
2834 #ifdef CONFIG_COMPAT
2835 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2836                                   char __user *optval, unsigned int optlen)
2837 {
2838         struct sock *sk = sock->sk;
2839
2840         if (sk->sk_prot->compat_setsockopt != NULL)
2841                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2842                                                       optval, optlen);
2843         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2844 }
2845 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2846 #endif
2847
2848 void sk_common_release(struct sock *sk)
2849 {
2850         if (sk->sk_prot->destroy)
2851                 sk->sk_prot->destroy(sk);
2852
2853         /*
2854          * Observation: when sock_common_release is called, processes have
2855          * no access to socket. But net still has.
2856          * Step one, detach it from networking:
2857          *
2858          * A. Remove from hash tables.
2859          */
2860
2861         sk->sk_prot->unhash(sk);
2862
2863         /*
2864          * In this point socket cannot receive new packets, but it is possible
2865          * that some packets are in flight because some CPU runs receiver and
2866          * did hash table lookup before we unhashed socket. They will achieve
2867          * receive queue and will be purged by socket destructor.
2868          *
2869          * Also we still have packets pending on receive queue and probably,
2870          * our own packets waiting in device queues. sock_destroy will drain
2871          * receive queue, but transmitted packets will delay socket destruction
2872          * until the last reference will be released.
2873          */
2874
2875         sock_orphan(sk);
2876
2877         xfrm_sk_free_policy(sk);
2878
2879         sk_refcnt_debug_release(sk);
2880
2881         sock_put(sk);
2882 }
2883 EXPORT_SYMBOL(sk_common_release);
2884
2885 void sk_get_meminfo(const struct sock *sk, u32 *mem)
2886 {
2887         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
2888
2889         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
2890         mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
2891         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
2892         mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
2893         mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
2894         mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
2895         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
2896         mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
2897         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
2898 }
2899
2900 #ifdef CONFIG_PROC_FS
/*
 * Number of per-protocol "in use" counter slots.  assign_proto_idx()
 * treats the last slot (PROTO_INUSE_NR - 1) as the "exhausted" marker.
 */
2901 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
/* One counter per slot, indexed by proto->inuse_idx; instantiated per CPU. */
2902 struct prot_inuse {
2903         int val[PROTO_INUSE_NR];
2904 };
2905
/* Bitmap of the slots that assign_proto_idx() has handed out. */
2906 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2907
2908 #ifdef CONFIG_NET_NS
2909 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2910 {
2911         __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2912 }
2913 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2914
2915 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2916 {
2917         int cpu, idx = prot->inuse_idx;
2918         int res = 0;
2919
2920         for_each_possible_cpu(cpu)
2921                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2922
2923         return res >= 0 ? res : 0;
2924 }
2925 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2926
2927 static int __net_init sock_inuse_init_net(struct net *net)
2928 {
2929         net->core.inuse = alloc_percpu(struct prot_inuse);
2930         return net->core.inuse ? 0 : -ENOMEM;
2931 }
2932
2933 static void __net_exit sock_inuse_exit_net(struct net *net)
2934 {
2935         free_percpu(net->core.inuse);
2936 }
2937
2938 static struct pernet_operations net_inuse_ops = {
2939         .init = sock_inuse_init_net,
2940         .exit = sock_inuse_exit_net,
2941 };
2942
2943 static __init int net_inuse_init(void)
2944 {
2945         if (register_pernet_subsys(&net_inuse_ops))
2946                 panic("Cannot initialize net inuse counters");
2947
2948         return 0;
2949 }
2950
2951 core_initcall(net_inuse_init);
2952 #else
2953 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2954
2955 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2956 {
2957         __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2958 }
2959 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2960
2961 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2962 {
2963         int cpu, idx = prot->inuse_idx;
2964         int res = 0;
2965
2966         for_each_possible_cpu(cpu)
2967                 res += per_cpu(prot_inuse, cpu).val[idx];
2968
2969         return res >= 0 ? res : 0;
2970 }
2971 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2972 #endif
2973
2974 static void assign_proto_idx(struct proto *prot)
2975 {
2976         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2977
2978         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2979                 pr_err("PROTO_INUSE_NR exhausted\n");
2980                 return;
2981         }
2982
2983         set_bit(prot->inuse_idx, proto_inuse_idx);
2984 }
2985
2986 static void release_proto_idx(struct proto *prot)
2987 {
2988         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2989                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2990 }
2991 #else
/* Without CONFIG_PROC_FS there are no in-use counters: nothing to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
	/* Intentionally empty. */
}
2995
/* Without CONFIG_PROC_FS there are no in-use counters: nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
	/* Intentionally empty. */
}
2999 #endif
3000
3001 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3002 {
3003         if (!rsk_prot)
3004                 return;
3005         kfree(rsk_prot->slab_name);
3006         rsk_prot->slab_name = NULL;
3007         kmem_cache_destroy(rsk_prot->slab);
3008         rsk_prot->slab = NULL;
3009 }
3010
3011 static int req_prot_init(const struct proto *prot)
3012 {
3013         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3014
3015         if (!rsk_prot)
3016                 return 0;
3017
3018         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3019                                         prot->name);
3020         if (!rsk_prot->slab_name)
3021                 return -ENOMEM;
3022
3023         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3024                                            rsk_prot->obj_size, 0,
3025                                            prot->slab_flags, NULL);
3026
3027         if (!rsk_prot->slab) {
3028                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3029                         prot->name);
3030                 return -ENOMEM;
3031         }
3032         return 0;
3033 }
3034
3035 int proto_register(struct proto *prot, int alloc_slab)
3036 {
3037         if (alloc_slab) {
3038                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
3039                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
3040                                         NULL);
3041
3042                 if (prot->slab == NULL) {
3043                         pr_crit("%s: Can't create sock SLAB cache!\n",
3044                                 prot->name);
3045                         goto out;
3046                 }
3047
3048                 if (req_prot_init(prot))
3049                         goto out_free_request_sock_slab;
3050
3051                 if (prot->twsk_prot != NULL) {
3052                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3053
3054                         if (prot->twsk_prot->twsk_slab_name == NULL)
3055                                 goto out_free_request_sock_slab;
3056
3057                         prot->twsk_prot->twsk_slab =
3058                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3059                                                   prot->twsk_prot->twsk_obj_size,
3060                                                   0,
3061                                                   prot->slab_flags,
3062                                                   NULL);
3063                         if (prot->twsk_prot->twsk_slab == NULL)
3064                                 goto out_free_timewait_sock_slab_name;
3065                 }
3066         }
3067
3068         mutex_lock(&proto_list_mutex);
3069         list_add(&prot->node, &proto_list);
3070         assign_proto_idx(prot);
3071         mutex_unlock(&proto_list_mutex);
3072         return 0;
3073
3074 out_free_timewait_sock_slab_name:
3075         kfree(prot->twsk_prot->twsk_slab_name);
3076 out_free_request_sock_slab:
3077         req_prot_cleanup(prot->rsk_prot);
3078
3079         kmem_cache_destroy(prot->slab);
3080         prot->slab = NULL;
3081 out:
3082         return -ENOBUFS;
3083 }
3084 EXPORT_SYMBOL(proto_register);
3085
3086 void proto_unregister(struct proto *prot)
3087 {
3088         mutex_lock(&proto_list_mutex);
3089         release_proto_idx(prot);
3090         list_del(&prot->node);
3091         mutex_unlock(&proto_list_mutex);
3092
3093         kmem_cache_destroy(prot->slab);
3094         prot->slab = NULL;
3095
3096         req_prot_cleanup(prot->rsk_prot);
3097
3098         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3099                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3100                 kfree(prot->twsk_prot->twsk_slab_name);
3101                 prot->twsk_prot->twsk_slab = NULL;
3102         }
3103 }
3104 EXPORT_SYMBOL(proto_unregister);
3105
3106 #ifdef CONFIG_PROC_FS
3107 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3108         __acquires(proto_list_mutex)
3109 {
3110         mutex_lock(&proto_list_mutex);
3111         return seq_list_start_head(&proto_list, *pos);
3112 }
3113
3114 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3115 {
3116         return seq_list_next(v, &proto_list, pos);
3117 }
3118
3119 static void proto_seq_stop(struct seq_file *seq, void *v)
3120         __releases(proto_list_mutex)
3121 {
3122         mutex_unlock(&proto_list_mutex);
3123 }
3124
/* Map a method pointer to 'y' (set) or 'n' (NULL) for /proc output. */
static char proto_method_implemented(const void *method)
{
	return method ? 'y' : 'n';
}
3129 static long sock_prot_memory_allocated(struct proto *proto)
3130 {
3131         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3132 }
3133
3134 static char *sock_prot_memory_pressure(struct proto *proto)
3135 {
3136         return proto->memory_pressure != NULL ?
3137         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3138 }
3139
/*
 * Emit one line of /proc/net/protocols for @proto: name, object size,
 * sockets in use, memory allocated, memory pressure, max header, slab
 * yes/no, owning module, followed by one y/n column per protocol method.
 * The method columns are in the same order as the two-letter header
 * printed by proto_seq_show().
 */
3140 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3141 {
3142
3143         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3144                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3145                    proto->name,
3146                    proto->obj_size,
3147                    sock_prot_inuse_get(seq_file_net(seq), proto),
3148                    sock_prot_memory_allocated(proto),
3149                    sock_prot_memory_pressure(proto),
3150                    proto->max_header,
3151                    proto->slab == NULL ? "no" : "yes",
3152                    module_name(proto->owner),
                        /* 19 method flags, matching the 19 "%2c" above. */
3153                    proto_method_implemented(proto->close),
3154                    proto_method_implemented(proto->connect),
3155                    proto_method_implemented(proto->disconnect),
3156                    proto_method_implemented(proto->accept),
3157                    proto_method_implemented(proto->ioctl),
3158                    proto_method_implemented(proto->init),
3159                    proto_method_implemented(proto->destroy),
3160                    proto_method_implemented(proto->shutdown),
3161                    proto_method_implemented(proto->setsockopt),
3162                    proto_method_implemented(proto->getsockopt),
3163                    proto_method_implemented(proto->sendmsg),
3164                    proto_method_implemented(proto->recvmsg),
3165                    proto_method_implemented(proto->sendpage),
3166                    proto_method_implemented(proto->bind),
3167                    proto_method_implemented(proto->backlog_rcv),
3168                    proto_method_implemented(proto->hash),
3169                    proto_method_implemented(proto->unhash),
3170                    proto_method_implemented(proto->get_port),
3171                    proto_method_implemented(proto->enter_memory_pressure));
3172 }
3173
3174 static int proto_seq_show(struct seq_file *seq, void *v)
3175 {
3176         if (v == &proto_list)
3177                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3178                            "protocol",
3179                            "size",
3180                            "sockets",
3181                            "memory",
3182                            "press",
3183                            "maxhdr",
3184                            "slab",
3185                            "module",
3186                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3187         else
3188                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3189         return 0;
3190 }
3191
/* seq_file iterator over proto_list, used for the "protocols" proc file. */
3192 static const struct seq_operations proto_seq_ops = {
3193         .start  = proto_seq_start,
3194         .next   = proto_seq_next,
3195         .stop   = proto_seq_stop,
3196         .show   = proto_seq_show,
3197 };
3198
3199 static int proto_seq_open(struct inode *inode, struct file *file)
3200 {
3201         return seq_open_net(inode, file, &proto_seq_ops,
3202                             sizeof(struct seq_net_private));
3203 }
3204
/*
 * File operations of the "protocols" proc file: opened through
 * proto_seq_open(), read and seek via the generic seq_file helpers,
 * released with the netns-aware seq_release_net().
 */
3205 static const struct file_operations proto_seq_fops = {
3206         .owner          = THIS_MODULE,
3207         .open           = proto_seq_open,
3208         .read           = seq_read,
3209         .llseek         = seq_lseek,
3210         .release        = seq_release_net,
3211 };
3212
3213 static __net_init int proto_init_net(struct net *net)
3214 {
3215         if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3216                 return -ENOMEM;
3217
3218         return 0;
3219 }
3220
3221 static __net_exit void proto_exit_net(struct net *net)
3222 {
3223         remove_proc_entry("protocols", net->proc_net);
3224 }
3225
3226
3227 static __net_initdata struct pernet_operations proto_net_ops = {
3228         .init = proto_init_net,
3229         .exit = proto_exit_net,
3230 };
3231
3232 static int __init proto_init(void)
3233 {
3234         return register_pernet_subsys(&proto_net_ops);
3235 }
3236
3237 subsys_initcall(proto_init);
3238
3239 #endif /* PROC_FS */
3240
3241 #ifdef CONFIG_NET_RX_BUSY_POLL
3242 bool sk_busy_loop_end(void *p, unsigned long start_time)
3243 {
3244         struct sock *sk = p;
3245
3246         return !skb_queue_empty(&sk->sk_receive_queue) ||
3247                sk_busy_loop_timeout(sk, start_time);
3248 }
3249 EXPORT_SYMBOL(sk_busy_loop_end);
3250 #endif /* CONFIG_NET_RX_BUSY_POLL */