/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink :	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>

#include <trace/events/sock.h>

#include <net/busy_poll.h>
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);
/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and that the current process has it in the
 * user namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);
/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and that the current process has it in all
 * user namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);
/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and that the current process has it over the
 * network namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);
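
/*
 * Illustrative sketch (not part of the original file): a protocol that
 * wants "the opener was privileged, and the current task is privileged
 * in the socket's network namespace" semantics can wrap its permission
 * check in sk_net_capable().  The function name example_set_mark is
 * hypothetical.
 */
static int example_set_mark(struct sock *sk, u32 mark)
{
	if (!sk_net_capable(sk, CAP_NET_ADMIN))
		return -EPERM;	/* opener or current task lacks CAP_NET_ADMIN */
	sk->sk_mark = mark;
	return 0;
}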
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "AF_IUCV"     , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	"rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
	"rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
	"rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
	"rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
	"rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
	"rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
	"rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
	"rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
	"rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
	"rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
	"rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
	"rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
	"rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
	"rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
	"rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_MAX"
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	"wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
	"wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
	"wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
	"wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
	"wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
	"wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
	"wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
	"wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
	"wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
	"wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
	"wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
	"wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
	"wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
	"wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
	"wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_MAX"
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	"elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
	"elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
	"elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
	"elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
	"elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
	"elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
	"elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
	"elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
	"elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
	"elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
	"elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
	"elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
	"elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
	"elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
	"elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_MAX"
};
/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;
struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(memalloc_socks);
/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements.
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_key_slow_inc(&memalloc_socks);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);
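
/*
 * Illustrative sketch (not part of the original file): how a transport
 * used for swap I/O (e.g. swap-over-NFS) would flag its socket so that
 * receive-path allocations may dip into the emergency reserves.  The
 * function name example_enable_swap_socket is hypothetical.
 */
static void example_enable_swap_socket(struct sock *sk)
{
	lock_sock(sk);
	sk_set_memalloc(sk);	/* sets SOCK_MEMALLOC and __GFP_MEMALLOC */
	release_sock(sk);
}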
void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_key_slow_dec(&memalloc_socks);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	noreclaim_flag = memalloc_noreclaim_save();
	ret = sk->sk_backlog_rcv(sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
	return 0;
}
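
/*
 * Worked example (illustrative, not part of the original file): with
 * HZ == 1000, a user timeval of { .tv_sec = 1, .tv_usec = 500000 } is
 * stored by sock_set_timeout() as
 * 1 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ) = 1500 jiffies,
 * while { 0, 0 } leaves *timeo_p at MAX_SCHEDULE_TIMEOUT, i.e. block
 * forever.
 */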
static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}
static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}
static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from rcu protected region, make sure we don't leak
	 * a norefcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sk_filter(sk, skb);
	if (err)
		return err;

	return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
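
/*
 * Illustrative sketch (not part of the original file): the typical call
 * site in a datagram protocol's receive path, mapping a queueing failure
 * to a drop.  example_proto_rcv is a hypothetical function.
 */
static int example_proto_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (sock_queue_rcv_skb(sk, skb) < 0) {
		kfree_skb(skb);
		return NET_RX_DROP;
	}
	return NET_RX_SUCCESS;
}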
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		sk->sk_dst_pending_confirm = 0;
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);
struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);
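
/*
 * Illustrative sketch (not part of the original file): a transmit path
 * revalidating its cached route before use; a NULL return means the dst
 * was obsolete and a fresh route lookup is needed.  example_xmit_get_dst
 * is a hypothetical function and the cookie value 0 is protocol specific.
 */
static struct dst_entry *example_xmit_get_dst(struct sock *sk)
{
	struct dst_entry *dst = sk_dst_check(sk, 0);

	if (!dst) {
		/* cached dst was dropped: perform a full route lookup here */
	}
	return dst;
}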
static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	ret = -EPERM;
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}
static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}
bool sk_mc_loop(struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	switch (sk->sk_family) {
	case AF_INET:
		return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_sk(sk)->mc_loop;
#endif
	}
	WARN_ON(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);
/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		sk->sk_reuseport = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead.  Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
		if (sk->sk_prot->keepalive)
			sk->sk_prot->keepalive(sk, valbool);
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool)  {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}

		if (val & SOF_TIMESTAMPING_OPT_ID &&
		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
			if (sk->sk_protocol == IPPROTO_TCP &&
			    sk->sk_type == SOCK_STREAM) {
				if ((1 << sk->sk_state) &
				    (TCPF_CLOSE | TCPF_LISTEN)) {
					ret = -EINVAL;
					break;
				}
				sk->sk_tskey = tcp_sk(sk)->snd_una;
			} else {
				sk->sk_tskey = 0;
			}
		}

		if (val & SOF_TIMESTAMPING_OPT_STATS &&
		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
			ret = -EINVAL;
			break;
		}

		sk->sk_tsflags = val;
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_BPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_attach_bpf(ufd, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_CBPF:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_reuseport_attach_filter(&fprog, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_EBPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_user(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_reuseport_attach_bpf(ufd, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			ret = sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		/* allow unprivileged users to decrease the value */
		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else {
			if (val < 0)
				ret = -EINVAL;
			else
				sk->sk_ll_usec = val;
		}
		break;
#endif

	case SO_MAX_PACING_RATE:
		sk->sk_max_pacing_rate = val;
		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
					 sk->sk_max_pacing_rate);
		break;

	case SO_INCOMING_CPU:
		sk->sk_incoming_cpu = val;
		break;

	case SO_CNX_ADVICE:
		if (val == 1)
			dst_negative_advice(sk);
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
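
/*
 * Illustrative user-space counterpart (not part of the original file,
 * hence guarded out): the val * 2 stored for SO_SNDBUF above is why
 * getsockopt() reports back twice the requested value.
 */
#if 0
	int val = 65536, out;
	socklen_t len = sizeof(out);

	setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val));
	getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &out, &len);
	/* out is now typically 131072: the kernel stored val * 2 */
#endif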
static void cred_to_ucred(struct pid *pid, const struct cred *cred,
			  struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}
int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		u64 val64;
		struct linger ling;
		struct timeval tm;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = sk->sk_tsflags;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;

	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
#endif

	case SO_MAX_PACING_RATE:
		v.val = sk->sk_max_pacing_rate;
		break;

	case SO_INCOMING_CPU:
		v.val = sk->sk_incoming_cpu;
		break;

	case SO_MEMINFO:
	{
		u32 meminfo[SK_MEMINFO_VARS];

		if (get_user(len, optlen))
			return -EFAULT;

		sk_get_meminfo(sk, meminfo);

		len = min_t(unsigned int, len, sizeof(meminfo));
		if (copy_to_user(optval, &meminfo, len))
			return -EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (v.val < MIN_NAPI_ID)
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		lv = sizeof(u64);
		if (len < lv)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	if (sk->sk_kern_sock)
		sock_lock_init_class_and_name(
			sk,
			af_family_kern_slock_key_strings[sk->sk_family],
			af_family_kern_slock_keys + sk->sk_family,
			af_family_kern_key_strings[sk->sk_family],
			af_family_kern_keys + sk->sk_family);
	else
		sock_lock_init_class_and_name(
			sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}
/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
				  int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO)
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}
static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	cgroup_sk_free(&sk->sk_cgrp_data);
	mem_cgroup_sk_free(sk);
	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}
/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 *	@kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sk->sk_kern_sock = kern;
		sock_lock_init(sk);
		sk->sk_net_refcnt = kern ? 0 : 1;
		if (likely(sk->sk_net_refcnt))
			get_net(net);
		sock_net_set(sk, net);
		atomic_set(&sk->sk_wmem_alloc, 1);

		mem_cgroup_sk_alloc(sk);
		cgroup_sk_alloc(&sk->sk_cgrp_data);
		sock_update_classid(&sk->sk_cgrp_data);
		sock_update_netprioidx(&sk->sk_cgrp_data);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);
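
/*
 * Illustrative sketch (not part of the original file): the usual shape
 * of a protocol family's create() hook.  example_proto and
 * example_create are hypothetical; a real implementation would also set
 * sock->ops and fill in protocol-private state.
 */
static struct proto example_proto = {
	.name	  = "EXAMPLE",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct sock),
};

static int example_create(struct net *net, struct socket *sock, int protocol,
			  int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &example_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock_init_data(sock, sk);	/* queues, default callbacks, buffers */
	return 0;
}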
/* Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 */
static void __sk_destruct(struct rcu_head *head)
{
	struct sock *sk = container_of(head, struct sock, sk_rcu);
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       atomic_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}
	if (rcu_access_pointer(sk->sk_reuseport_cb))
		reuseport_detach_sock(sk);

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	if (likely(sk->sk_net_refcnt))
		put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}
void sk_destruct(struct sock *sk)
{
	if (sock_flag(sk, SOCK_RCU_FREE))
		call_rcu(&sk->sk_rcu, __sk_destruct);
	else
		__sk_destruct(&sk->sk_rcu);
}
static void __sk_free(struct sock *sk)
{
	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
		sock_diag_broadcast_destroy(sk);
	else
		sk_destruct(sk);
}
void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);
static void sk_init_common(struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
			af_rlock_keys + sk->sk_family,
			af_family_rlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
			af_wlock_keys + sk->sk_family,
			af_family_wlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
			af_elock_keys + sk->sk_family,
			af_family_elock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);
}
/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;
	bool is_charged = true;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		/* SANITY */
		if (likely(newsk->sk_net_refcnt))
			get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
		newsk->sk_backlog.len = 0;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		atomic_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		sk_init_common(newsk);

		newsk->sk_dst_cache	= NULL;
		newsk->sk_dst_pending_confirm = 0;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		atomic_set(&newsk->sk_drops, 0);
		newsk->sk_send_head	= NULL;
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);

		filter = rcu_dereference_protected(newsk->sk_filter, 1);
		if (filter != NULL)
			/* though it's an empty new sock, the charging may fail
			 * if sysctl_optmem_max was changed between creation of
			 * original socket and cloning
			 */
			is_charged = sk_filter_charge(newsk, filter);

		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
			/* We need to make sure that we don't uncharge the new
			 * socket if we couldn't charge it in the first place
			 * as otherwise we uncharge the parent's filter.
			 */
			if (!is_charged)
				RCU_INIT_POINTER(newsk->sk_filter, NULL);
			sk_free_unlock_clone(newsk);
			newsk = NULL;
			goto out;
		}
		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

		newsk->sk_err	   = 0;
		newsk->sk_err_soft = 0;
		newsk->sk_priority = 0;
		newsk->sk_incoming_cpu = raw_smp_processor_id();
		atomic64_set(&newsk->sk_cookie, 0);

		mem_cgroup_sk_alloc(newsk);
		cgroup_sk_alloc(&newsk->sk_cgrp_data);

		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		newsk->sk_wq = NULL;

		if (newsk->sk_prot->sockets_allocated)
			sk_sockets_allocated_inc(newsk);

		if (sock_needs_netstamp(sk) &&
		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
			net_enable_timestamp();
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
void sk_free_unlock_clone(struct sock *sk)
{
	/* It is still raw copy of parent, so invalidate
	 * destructor and make plain sk_free() */
	sk->sk_destruct = NULL;
	bh_unlock_sock(sk);
	sk_free(sk);
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	u32 max_segs = 1;

	sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		if (dst->header_len) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
		}
	}
	sk->sk_gso_max_segs = max_segs;
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
/*
 *	Simple resource managers for sockets.
 */

/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		atomic_sub(len - 1, &sk->sk_wmem_alloc);
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);
/* This variant of sock_wfree() is used by TCP,
 * since it sets SOCK_USE_WRITE_QUEUE.
 */
void __sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);
	skb->sk = sk;
#ifdef CONFIG_INET
	if (unlikely(!sk_fullsock(sk))) {
		skb->destructor = sock_edemux;
		sock_hold(sk);
		return;
	}
#endif
	skb->destructor = sock_wfree;
	skb_set_hash_from_sk(skb, sk);
	/*
	 * We used to take a refcount on sk, but following operation
	 * is enough to guarantee sk_free() won't free this sock until
	 * all in-flight packets are completed
	 */
	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);
/* This helper is used by netem, as it can hold packets in its
 * delay queue. We want to allow the owner socket to send more
 * packets, as if they were already TX completed by a typical driver.
 * But we also want to keep skb->sk set because some packet schedulers
 * rely on it (sch_fq for example). So we set skb->truesize to a small
 * amount (1) and decrease sk_wmem_alloc accordingly.
 */
void skb_orphan_partial(struct sk_buff *skb)
{
	/* If this skb is a TCP pure ACK or already went here,
	 * we have nothing to do. 2 is already a very small truesize.
	 */
	if (skb->truesize <= 2)
		return;

	/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
	 * so we do not completely orphan skb, but transfer all
	 * accounted bytes but one, to avoid unexpected reorders.
	 */
	if (skb->destructor == sock_wfree
#ifdef CONFIG_INET
	    || skb->destructor == tcp_wfree
#endif
		) {
		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
		skb->truesize = 1;
	} else {
		skb_orphan(skb);
	}
}
EXPORT_SYMBOL(skb_orphan_partial);
/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);
/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 */
void sock_efree(struct sk_buff *skb)
{
	sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);
kuid_t sock_i_uid(struct sock *sk)
{
	kuid_t uid;

	read_lock_bh(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
	read_unlock_bh(&sk->sk_callback_lock);
	return uid;
}
EXPORT_SYMBOL(sock_i_uid);
unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock_bh(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock_bh(&sk->sk_callback_lock);
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);
/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);
/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	if ((unsigned int)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}
EXPORT_SYMBOL(sock_kmalloc);
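
/*
 * Illustrative sketch (not part of the original file): option memory is
 * charged to sk_omem_alloc, so each sock_kmalloc() must be paired with
 * a sock_kfree_s() (or sock_kzfree_s() for key material) of the same
 * size.  example_with_opt_buffer is a hypothetical function.
 */
static int example_with_opt_buffer(struct sock *sk, const void *data, int len)
{
	void *buf = sock_kmalloc(sk, len, GFP_KERNEL);

	if (!buf)
		return -ENOBUFS;
	memcpy(buf, data, len);
	/* ... use buf, typically hung off protocol-private state ... */
	sock_kfree_s(sk, buf, len);	/* uncharges sk_omem_alloc */
	return 0;
}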
/* Free an option memory block. Note, we actually want the inline
 * here as this allows gcc to detect the nullify and fold away the
 * condition entirely.
 */
static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
				  const bool nullify)
{
	if (WARN_ON_ONCE(!mem))
		return;
	if (nullify)
		kzfree(mem);
	else
		kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}

void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);

void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
 * I think, these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}
/*
 *	Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode, int max_page_order)
{
	struct sk_buff *skb;
	long timeo;
	int err;

	timeo = sock_sndtimeo(sk, noblock);
	for (;;) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
			break;

		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}
	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
				   errcode, sk->sk_allocation);
	if (skb)
		skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
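
/*
 * Illustrative sketch (not part of the original file): a datagram
 * sendmsg() path reserving headroom and blocking, subject to
 * SO_SNDTIMEO, until write memory is available.  example_sendmsg is
 * hypothetical and its error handling is abbreviated.
 */
static int example_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct sk_buff *skb;
	int err;

	skb = sock_alloc_send_skb(sk, len + MAX_HEADER,
				  msg->msg_flags & MSG_DONTWAIT, &err);
	if (!skb)
		return err;

	skb_reserve(skb, MAX_HEADER);
	err = memcpy_from_msg(skb_put(skb, len), msg, len);
	if (err) {
		kfree_skb(skb);
		return err;
	}
	/* ... hand the skb to the transmit path ... */
	return len;
}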
int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
		     struct sockcm_cookie *sockc)
{
	u32 tsflags;

	switch (cmsg->cmsg_type) {
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;
		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
		break;
	case SO_TIMESTAMPING:
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;

		tsflags = *(u32 *)CMSG_DATA(cmsg);
		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
			return -EINVAL;

		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
		sockc->tsflags |= tsflags;
		break;
	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
	case SCM_RIGHTS:
	case SCM_CREDENTIALS:
		break;
	default:
		return -EINVAL;
	}
	return 0;
}
EXPORT_SYMBOL(__sock_cmsg_send);
int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
		   struct sockcm_cookie *sockc)
{
	struct cmsghdr *cmsg;
	int ret;

	for_each_cmsghdr(cmsg, msg) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;
		if (cmsg->cmsg_level != SOL_SOCKET)
			continue;
		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
		if (ret)
			return ret;
	}
	return 0;
}
EXPORT_SYMBOL(sock_cmsg_send);
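
/*
 * Illustrative sketch (not part of the original file): a sendmsg()
 * implementation seeds its sockcm_cookie from the socket defaults and
 * then lets SOL_SOCKET control messages override it.
 * example_init_cookie is a hypothetical function.
 */
static int example_init_cookie(struct sock *sk, struct msghdr *msg,
			       struct sockcm_cookie *sockc)
{
	sockc->tsflags = sk->sk_tsflags;
	if (msg->msg_controllen)
		return sock_cmsg_send(sk, msg, sockc);
	return 0;
}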
/* On 32bit arches, an skb frag is limited to 2^15 */
#define SKB_FRAG_PAGE_ORDER	get_order(32768)

/**
 * skb_page_frag_refill - check that a page_frag contains enough room
 * @sz: minimum size of the fragment we want to get
 * @pfrag: pointer to page_frag
 * @gfp: priority for memory allocation
 *
 * Note: While this allocator tries to use high order pages, there is
 * no guarantee that allocations succeed. Therefore, @sz MUST be
 * less than or equal to PAGE_SIZE.
 */
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
	if (pfrag->page) {
		if (page_ref_count(pfrag->page) == 1) {
			pfrag->offset = 0;
			return true;
		}
		if (pfrag->offset + sz <= pfrag->size)
			return true;
		put_page(pfrag->page);
	}

	pfrag->offset = 0;
	if (SKB_FRAG_PAGE_ORDER) {
		/* Avoid direct reclaim but allow kswapd to wake */
		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
					  __GFP_COMP | __GFP_NOWARN |
					  __GFP_NORETRY,
					  SKB_FRAG_PAGE_ORDER);
		if (likely(pfrag->page)) {
			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
			return true;
		}
	}
	pfrag->page = alloc_page(gfp);
	if (likely(pfrag->page)) {
		pfrag->size = PAGE_SIZE;
		return true;
	}
	return false;
}
EXPORT_SYMBOL(skb_page_frag_refill);
bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
		return true;

	sk_enter_memory_pressure(sk);
	sk_stream_moderate_sndbuf(sk);
	return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);
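
/*
 * Illustrative sketch (not part of the original file): the canonical
 * pattern for filling the socket's page_frag from an iov iterator, as
 * stream protocols do when building skb page frags.  example_append is
 * hypothetical and omits the skb bookkeeping.
 */
static int example_append(struct sock *sk, struct iov_iter *from)
{
	struct page_frag *pfrag = sk_page_frag(sk);
	size_t copy;

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;		/* memory pressure was entered */

	copy = min_t(size_t, iov_iter_count(from),
		     pfrag->size - pfrag->offset);
	if (copy_page_from_iter(pfrag->page, pfrag->offset, copy, from) != copy)
		return -EFAULT;
	/* ... attach the bytes to an skb frag, then pfrag->offset += copy ... */
	return 0;
}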
static void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}
static void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb, *next;

	while ((skb = sk->sk_backlog.head) != NULL) {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

		spin_unlock_bh(&sk->sk_lock.slock);

		do {
			next = skb->next;
			prefetch(next);
			WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb->next = NULL;
			sk_backlog_rcv(sk, skb);

			cond_resched();

			skb = next;
		} while (skb != NULL);

		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Doing the zeroing here guarantees we cannot loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}
void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);
	spin_unlock_bh(&sk->sk_lock.slock);
}
/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int rc;

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);
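
/*
 * Illustrative sketch (not part of the original file): a blocking
 * receive loop built on sk_wait_data().  Called with the socket locked,
 * it sleeps until the tail of sk_receive_queue changes or the timeout
 * elapses.  example_wait_for_data is a hypothetical function.
 */
static struct sk_buff *example_wait_for_data(struct sock *sk, long *timeo)
{
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (!*timeo)
			return NULL;	/* caller turns this into -EAGAIN */
		if (signal_pending(current))
			return NULL;	/* caller uses sock_intr_errno(*timeo) */
		sk_wait_data(sk, timeo, NULL);
	}
	return skb;
}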
/**
 *	__sk_mem_raise_allocated - increase memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@amt: pages to allocate
 *	@kind: allocation type
 *
 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	struct proto *prot = sk->sk_prot;
	long allocated = sk_memory_allocated_add(sk, amt);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
		goto suppress_allocation;

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
			return 1;

	} else { /* SK_MEM_SEND */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
				return 1;
		} else if (atomic_read(&sk->sk_wmem_alloc) <
			   prot->sysctl_wmem[0])
				return 1;
	}

	if (sk_has_memory_pressure(sk)) {
		int alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	trace_sock_exceed_buf_limit(sk, prot, allocated);

	sk_memory_allocated_sub(sk, amt);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

	return 0;
}
EXPORT_SYMBOL(__sk_mem_raise_allocated);
/**
 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@kind: allocation type
 *
 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *	rmem allocation. This function assumes that protocols which have
 *	memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	int ret, amt = sk_mem_pages(size);

	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
	if (!ret)
		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
	return ret;
}
EXPORT_SYMBOL(__sk_mem_schedule);
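
/*
 * Illustrative sketch (not part of the original file): receive-side
 * charging as protocols typically perform it.  sk_rmem_schedule() ends
 * up in __sk_mem_schedule(..., SK_MEM_RECV) whenever sk_forward_alloc
 * cannot cover skb->truesize.  example_charge_rx is hypothetical.
 */
static int example_charge_rx(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;
	skb_set_owner_r(skb, sk);	/* charges sk_rmem_alloc/sk_forward_alloc */
	return 0;
}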
/**
 *	__sk_mem_reduce_allocated - reclaim memory_allocated
 *	@sk: socket
 *	@amount: number of quanta
 *
 *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
 */
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
	sk_memory_allocated_sub(sk, amount);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);

	if (sk_under_memory_pressure(sk) &&
	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
		sk_leave_memory_pressure(sk);
}
EXPORT_SYMBOL(__sk_mem_reduce_allocated);
/**
 *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
 */
void __sk_mem_reclaim(struct sock *sk, int amount)
{
	amount >>= SK_MEM_QUANTUM_SHIFT;
	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
	__sk_mem_reduce_allocated(sk, amount);
}
EXPORT_SYMBOL(__sk_mem_reclaim);
int sk_set_peek_off(struct sock *sk, int val)
{
	if (val < 0)
		return -EINVAL;

	sk->sk_peek_off = val;
	return 0;
}
EXPORT_SYMBOL_GPL(sk_set_peek_off);
2370 * Set of default routines for initialising struct proto_ops when
2371 * the protocol does not support a particular function. In certain
2372 * cases where it makes no sense for a protocol to have a "do nothing"
2373 * function, some default processing is provided.
2376 int sock_no_bind(struct socket
*sock
, struct sockaddr
*saddr
, int len
)
2380 EXPORT_SYMBOL(sock_no_bind
);
2382 int sock_no_connect(struct socket
*sock
, struct sockaddr
*saddr
,
2387 EXPORT_SYMBOL(sock_no_connect
);
2389 int sock_no_socketpair(struct socket
*sock1
, struct socket
*sock2
)
2393 EXPORT_SYMBOL(sock_no_socketpair
);
2395 int sock_no_accept(struct socket
*sock
, struct socket
*newsock
, int flags
,
2400 EXPORT_SYMBOL(sock_no_accept
);
2402 int sock_no_getname(struct socket
*sock
, struct sockaddr
*saddr
,
2407 EXPORT_SYMBOL(sock_no_getname
);
2409 unsigned int sock_no_poll(struct file
*file
, struct socket
*sock
, poll_table
*pt
)
2413 EXPORT_SYMBOL(sock_no_poll
);
2415 int sock_no_ioctl(struct socket
*sock
, unsigned int cmd
, unsigned long arg
)
2419 EXPORT_SYMBOL(sock_no_ioctl
);
2421 int sock_no_listen(struct socket
*sock
, int backlog
)
2425 EXPORT_SYMBOL(sock_no_listen
);
2427 int sock_no_shutdown(struct socket
*sock
, int how
)
2431 EXPORT_SYMBOL(sock_no_shutdown
);
2433 int sock_no_setsockopt(struct socket
*sock
, int level
, int optname
,
2434 char __user
*optval
, unsigned int optlen
)
2438 EXPORT_SYMBOL(sock_no_setsockopt
);
2440 int sock_no_getsockopt(struct socket
*sock
, int level
, int optname
,
2441 char __user
*optval
, int __user
*optlen
)
2445 EXPORT_SYMBOL(sock_no_getsockopt
);
2447 int sock_no_sendmsg(struct socket
*sock
, struct msghdr
*m
, size_t len
)
2451 EXPORT_SYMBOL(sock_no_sendmsg
);
2453 int sock_no_recvmsg(struct socket
*sock
, struct msghdr
*m
, size_t len
,
2458 EXPORT_SYMBOL(sock_no_recvmsg
);
2460 int sock_no_mmap(struct file
*file
, struct socket
*sock
, struct vm_area_struct
*vma
)
2462 /* Mirror missing mmap method error code */
2465 EXPORT_SYMBOL(sock_no_mmap
);
ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset,
			 size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);

	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage);
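
/*
 * Example (illustrative sketch): a family that supports only a handful
 * of operations can fill the remaining proto_ops slots with the defaults
 * above. The example_* names and PF_EXAMPLE are hypothetical; the
 * sock_no_* entries and datagram_poll() are real.
 *
 *	static const struct proto_ops example_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.release	= example_release,
 *		.bind		= example_bind,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.getname	= example_getname,
 *		.poll		= datagram_poll,
 *		.ioctl		= example_ioctl,
 *		.listen		= sock_no_listen,
 *		.shutdown	= sock_no_shutdown,
 *		.setsockopt	= sock_no_setsockopt,
 *		.getsockopt	= sock_no_getsockopt,
 *		.sendmsg	= example_sendmsg,
 *		.recvmsg	= example_recvmsg,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */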
/*
 *	Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}
static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, POLLERR);
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}
static void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
						POLLRDNORM | POLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}
static void sock_def_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
						POLLWRNORM | POLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	rcu_read_unlock();
}
static void sock_def_destruct(struct sock *sk)
{
}
void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
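
/*
 * Example (illustrative): the two helpers above keep the sock refcount
 * balanced around a pending timer. A protocol arming a timer does
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + timeout);
 *
 * which takes a reference only when the timer was not already pending
 * (mod_timer() returned 0), and later calls
 *
 *	sk_stop_timer(sk, &sk->sk_timer);
 *
 * which drops that reference only if del_timer() actually removed a
 * pending timer.
 */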
void sock_init_data(struct socket *sock, struct sock *sk)
{
	sk_init_common(sk);
	sk->sk_send_head	=	NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	sysctl_rmem_default;
	sk->sk_sndbuf		=	sysctl_wmem_default;
	sk->sk_state		=	TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type	=	sock->type;
		sk->sk_wq	=	sock->wq;
		sock->sk	=	sk;
		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
	} else {
		sk->sk_wq	=	NULL;
		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
	}

	rwlock_init(&sk->sk_callback_lock);
	if (sk->sk_kern_sock)
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_kern_callback_keys + sk->sk_family,
			af_family_kern_clock_key_strings[sk->sk_family]);
	else
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_frag.page	=	NULL;
	sk->sk_frag.offset	=	0;
	sk->sk_peek_off		=	-1;

	sk->sk_peer_pid		=	NULL;
	sk->sk_peer_cred	=	NULL;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id		=	0;
	sk->sk_ll_usec		=	sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0U;
	sk->sk_pacing_rate = ~0U;
	sk->sk_incoming_cpu = -1;
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
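
/*
 * Example (illustrative sketch): a family's create() hook typically pairs
 * sk_alloc() with sock_init_data() and then overrides whichever defaults
 * it needs (here sk_destruct). PF_EXAMPLE, example_proto, example_destruct
 * and example_ops (see the sketch after sock_no_sendpage() above) are
 * hypothetical placeholders.
 *
 *	static int example_create(struct net *net, struct socket *sock,
 *				  int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto,
 *			      kern);
 *		if (!sk)
 *			return -ENOBUFS;
 *
 *		sock->ops = &example_ops;
 *		sock_init_data(sock, sk);
 *		sk->sk_destruct = example_destruct;
 *		sock->state = SS_UNCONNECTED;
 *		return 0;
 *	}
 */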
void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);
void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning : release_cb() might need to release sk ownership,
	 * ie call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 * It returns false if the fast path was taken:
 *   sk_lock.slock locked, owned = 0, BH disabled
 * It returns true if the slow path was taken:
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sk->sk_lock.owned)
		/*
		 * Note : We must disable BH
		 */
		return false;

	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
	local_bh_enable();
	return true;
}
EXPORT_SYMBOL(lock_sock_fast);
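
/*
 * Example (illustrative): the intended calling pattern pairs
 * lock_sock_fast() with unlock_sock_fast(), forwarding the return value
 * so that the matching unlock path runs:
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	(short, non-blocking critical section)
 *
 *	unlock_sock_fast(sk, slow);
 *
 * unlock_sock_fast() (include/net/sock.h) calls release_sock() when the
 * slow path was taken, and otherwise just unlocks the spinlock and
 * re-enables BH.
 */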
int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	struct timeval tv;

	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	tv = ktime_to_timeval(sk->sk_stamp);
	if (tv.tv_sec == -1)
		return -ENOENT;
	if (tv.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		tv = ktime_to_timeval(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);
int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
	struct timespec ts;

	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec(sk->sk_stamp);
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		ts = ktime_to_timespec(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestampns);
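
/*
 * Example (illustrative, userspace side): these two helpers back the
 * SIOCGSTAMP and SIOCGSTAMPNS ioctls, e.g.
 *
 *	struct timeval tv;
 *	ioctl(fd, SIOCGSTAMP, &tv);
 *
 * which reports when the last packet passed to the user was received.
 */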
void sock_enable_timestamp(struct sock *sk, int flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * we just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one
		 */
		if (sock_needs_netstamp(sk) &&
		    !(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
		       int level, int type)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb;
	int copied, err;

	err = -EAGAIN;
	skb = sock_dequeue_err_skb(sk);
	if (!skb)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);
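
/*
 * Example (illustrative, userspace side): the error queue is drained with
 * recvmsg() and the MSG_ERRQUEUE flag; the extended error is delivered as
 * a control message:
 *
 *	char ctrl[512];
 *	struct msghdr msg = { .msg_control = ctrl,
 *			      .msg_controllen = sizeof(ctrl) };
 *
 *	if (recvmsg(fd, &msg, MSG_ERRQUEUE) >= 0) {
 *		struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
 *		struct sock_extended_err *ee;
 *
 *		if (cm)
 *			ee = (struct sock_extended_err *)CMSG_DATA(cm);
 *	}
 */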
/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);
#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);
/*
 *	Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);
#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif
void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the net still does.
	 *
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets anymore, but
	 * it is possible that some packets are in flight because some CPU
	 * runs the receiver and did a hash table lookup before we unhashed
	 * the socket. They will reach the receive queue and will be purged
	 * by the socket destructor.
	 *
	 * Also we still have packets pending on the receive queue, and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);

	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
	mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}
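
/*
 * Example (illustrative): this array is what sock_diag exposes as
 * SK_MEMINFO_* (e.g. via the INET_DIAG_SKMEMINFO attribute), and what
 * "ss -m" renders as something like
 *
 *	skmem:(r0,rb87380,t0,tb16384,f0,w0,o0,bl0,d0)
 *
 * where r/rb, t/tb, f, w, o, bl and d map to the RMEM_ALLOC/RCVBUF,
 * WMEM_ALLOC/SNDBUF, FWD_ALLOC, WMEM_QUEUED, OPTMEM, BACKLOG and DROPS
 * slots filled in above. The concrete numbers are just an example.
 */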
#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
#ifdef CONFIG_NET_NS
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
static int __net_init sock_inuse_init_net(struct net *net)
{
	net->core.inuse = alloc_percpu(struct prot_inuse);
	return net->core.inuse ? 0 : -ENOMEM;
}
static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.inuse);
}
static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};
static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
#else
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu(prot_inuse, cpu).val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
#endif
static void assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		pr_err("PROTO_INUSE_NR exhausted\n");
		return;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
}
static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline void assign_proto_idx(struct proto *prot)
{
}

static inline void release_proto_idx(struct proto *prot)
{
}
#endif
static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
	if (!rsk_prot)
		return;
	kfree(rsk_prot->slab_name);
	rsk_prot->slab_name = NULL;
	kmem_cache_destroy(rsk_prot->slab);
	rsk_prot->slab = NULL;
}
static int req_prot_init(const struct proto *prot)
{
	struct request_sock_ops *rsk_prot = prot->rsk_prot;

	if (!rsk_prot)
		return 0;

	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
					prot->name);
	if (!rsk_prot->slab_name)
		return -ENOMEM;

	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
					   rsk_prot->obj_size, 0,
					   prot->slab_flags, NULL);

	if (!rsk_prot->slab) {
		pr_crit("%s: Can't create request sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}
	return 0;
}
int proto_register(struct proto *prot, int alloc_slab)
{
	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | prot->slab_flags,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (prot->twsk_prot != NULL) {
			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);

			if (prot->twsk_prot->twsk_slab_name == NULL)
				goto out_free_request_sock_slab;

			prot->twsk_prot->twsk_slab =
				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0,
						  prot->slab_flags,
						  NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	mutex_lock(&proto_list_mutex);
	list_add(&prot->node, &proto_list);
	assign_proto_idx(prot);
	mutex_unlock(&proto_list_mutex);
	return 0;

out_free_timewait_sock_slab_name:
	kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
	req_prot_cleanup(prot->rsk_prot);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
out:
	return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);
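
/*
 * Example (illustrative sketch): a protocol registers itself once at
 * module init and unregisters on exit. struct example_sock and the
 * example_* names are hypothetical; proto_register()/proto_unregister()
 * are the real API.
 *
 *	static struct proto example_proto = {
 *		.name	  = "EXAMPLE",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct example_sock),
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return proto_register(&example_proto, 1);
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		proto_unregister(&example_proto);
 *	}
 */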
void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	req_prot_cleanup(prot->rsk_prot);

	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
		kfree(prot->twsk_prot->twsk_slab_name);
		prot->twsk_prot->twsk_slab = NULL;
	}
}
EXPORT_SYMBOL(proto_unregister);
#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}
static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}
static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}
static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}
static char *sock_prot_memory_pressure(struct proto *proto)
{
	return proto->memory_pressure != NULL ?
	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}
static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}
static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};
static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &proto_seq_ops,
			    sizeof(struct seq_net_private));
}
static const struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
static __net_init int proto_init_net(struct net *net)
{
	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
		return -ENOMEM;

	return 0;
}
static __net_exit void proto_exit_net(struct net *net)
{
	remove_proc_entry("protocols", net->proc_net);
}
static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};
static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */
#ifdef CONFIG_NET_RX_BUSY_POLL
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
	struct sock *sk = p;

	return !skb_queue_empty(&sk->sk_receive_queue) ||
	       sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */