net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <linux/capability.h>
  95 #include <linux/errno.h>
  96 #include <linux/errqueue.h>
  97 #include <linux/types.h>
  98 #include <linux/socket.h>
  99 #include <linux/in.h>
 100 #include <linux/kernel.h>
 101 #include <linux/module.h>
 102 #include <linux/proc_fs.h>
 103 #include <linux/seq_file.h>
 104 #include <linux/sched.h>
 105 #include <linux/timer.h>
 106 #include <linux/string.h>
 107 #include <linux/sockios.h>
 108 #include <linux/net.h>
 109 #include <linux/mm.h>
 110 #include <linux/slab.h>
 111 #include <linux/interrupt.h>
 112 #include <linux/poll.h>
 113 #include <linux/tcp.h>
 114 #include <linux/init.h>
 115 #include <linux/highmem.h>
 116 #include <linux/user_namespace.h>
 117 #include <linux/static_key.h>
 118 #include <linux/memcontrol.h>
 119 #include <linux/prefetch.h>
 120
 121 #include <linux/uaccess.h>
 122
 123 #include <linux/netdevice.h>
 124 #include <net/protocol.h>
 125 #include <linux/skbuff.h>
 126 #include <net/net_namespace.h>
 127 #include <net/request_sock.h>
 128 #include <net/sock.h>
 129 #include <linux/net_tstamp.h>
 130 #include <net/xfrm.h>
 131 #include <linux/ipsec.h>
 132 #include <net/cls_cgroup.h>
 133 #include <net/netprio_cgroup.h>
 134 #include <linux/sock_diag.h>
 135
 136 #include <linux/filter.h>
 137 #include <net/sock_reuseport.h>
 138
 139 #include <trace/events/sock.h>
 140
 141 #ifdef CONFIG_INET
 142 #include <net/tcp.h>
 143 #endif
 144
 145 #include <net/busy_poll.h>
 146
/*
 * File-local list head and the mutex declared alongside it.
 * NOTE(review): presumably proto_list_mutex serialises access to
 * proto_list; the users are outside this excerpt - confirm.
 */
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);
 149
 150 /**
 151  * sk_ns_capable - General socket capability test
 152  * @sk: Socket to use a capability on or through
 153  * @user_ns: The user namespace of the capability to use
 154  * @cap: The capability to use
 155  *
 156  * Test to see if the opener of the socket had when the socket was
 157  * created and the current process has the capability @cap in the user
 158  * namespace @user_ns.
 159  */
 160 bool sk_ns_capable(const struct sock *sk,
 161                    struct user_namespace *user_ns, int cap)
 162 {
 163         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 164                 ns_capable(user_ns, cap);
 165 }
 166 EXPORT_SYMBOL(sk_ns_capable);
 167
 168 /**
 169  * sk_capable - Socket global capability test
 170  * @sk: Socket to use a capability on or through
 171  * @cap: The global capability to use
 172  *
 173  * Test to see if the opener of the socket had when the socket was
 174  * created and the current process has the capability @cap in all user
 175  * namespaces.
 176  */
 177 bool sk_capable(const struct sock *sk, int cap)
 178 {
 179         return sk_ns_capable(sk, &init_user_ns, cap);
 180 }
 181 EXPORT_SYMBOL(sk_capable);
 182
 183 /**
 184  * sk_net_capable - Network namespace socket capability test
 185  * @sk: Socket to use a capability on or through
 186  * @cap: The capability to use
 187  *
 188  * Test to see if the opener of the socket had when the socket was created
 189  * and the current process has the capability @cap over the network namespace
 190  * the socket is a member of.
 191  */
 192 bool sk_net_capable(const struct sock *sk, int cap)
 193 {
 194         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 195 }
 196 EXPORT_SYMBOL(sk_net_capable);
 197
/*
 * Each address family might have different locking rules, so the lock
 * class keys are kept per address family (AF_MAX entries per array).
 * NOTE(review): af_family_keys presumably pairs with the "sk_lock-*"
 * names and af_family_slock_keys with the "slock-*" names declared
 * below; the code that registers them is outside this excerpt.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
 204
 205 /*
 206  * Make lock validator output more readable. (we pre-construct these
 207  * strings build-time, so that runtime initialization of socket
 208  * locks is fast):
 209  */
 210 static const char *const af_family_key_strings[AF_MAX+1] = {
 211   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 212   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 213   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 214   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 215   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 216   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 217   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 218   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 219   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 220   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 221   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 222   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 223   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 224   "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_KCM"      ,
 225   "sk_lock-AF_QIPCRTR", "sk_lock-AF_MAX"
 226 };
 227 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 228   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 229   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 230   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 231   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 232   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 233   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 234   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 235   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 236   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 237   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 238   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 239   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 240   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
 241   "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_KCM"       ,
 242   "slock-AF_QIPCRTR", "slock-AF_MAX"
 243 };
 244 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 245   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 246   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 247   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 248   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 249   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 250   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 251   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 252   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 253   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 254   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 255   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 256   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 257   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 258   "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_KCM"      ,
 259   "clock-AF_QIPCRTR", "clock-AF_MAX"
 260 };
 261
/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key
 * (one array slot for each of the AF_MAX families):
 */
static struct lock_class_key af_callback_keys[AF_MAX];
 267
/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 *
 * Both limits work out to _SK_MEM_PACKETS buffers, each accounted at
 * SKB_TRUESIZE(256).
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters.  The maximum and the default for each
 * direction start out at the same compile-time value.
 */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 285
/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

/* Enabled (1) by default.
 * NOTE(review): the consumers of this knob are outside this excerpt;
 * confirm its exact meaning against them.
 */
int sysctl_tstamp_allow_data __read_mostly = 1;

/* Static key that starts out false; sk_set_memalloc() increments it and
 * sk_clear_memalloc() decrements it.
 */
struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(memalloc_socks);
 294
 295 /**
 296  * sk_set_memalloc - sets %SOCK_MEMALLOC
 297  * @sk: socket to set it on
 298  *
 299  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 300  * It's the responsibility of the admin to adjust min_free_kbytes
 301  * to meet the requirements
 302  */
 303 void sk_set_memalloc(struct sock *sk)
 304 {
 305         sock_set_flag(sk, SOCK_MEMALLOC);
 306         sk->sk_allocation |= __GFP_MEMALLOC;
 307         static_key_slow_inc(&memalloc_socks);
 308 }
 309 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 310
 311 void sk_clear_memalloc(struct sock *sk)
 312 {
 313         sock_reset_flag(sk, SOCK_MEMALLOC);
 314         sk->sk_allocation &= ~__GFP_MEMALLOC;
 315         static_key_slow_dec(&memalloc_socks);
 316
 317         /*
 318          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 319          * progress of swapping. SOCK_MEMALLOC may be cleared while
 320          * it has rmem allocations due to the last swapfile being deactivated
 321          * but there is a risk that the socket is unusable due to exceeding
 322          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 323          */
 324         sk_mem_reclaim(sk);
 325 }
 326 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 327
 328 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 329 {
 330         int ret;
 331         unsigned long pflags = current->flags;
 332
 333         /* these should have been dropped before queueing */
 334         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 335
 336         current->flags |= PF_MEMALLOC;
 337         ret = sk->sk_backlog_rcv(sk, skb);
 338         tsk_restore_flags(current, pflags, PF_MEMALLOC);
 339
 340         return ret;
 341 }
 342 EXPORT_SYMBOL(__sk_backlog_rcv);
 343
 344 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 345 {
 346         struct timeval tv;
 347
 348         if (optlen < sizeof(tv))
 349                 return -EINVAL;
 350         if (copy_from_user(&tv, optval, sizeof(tv)))
 351                 return -EFAULT;
 352         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 353                 return -EDOM;
 354
 355         if (tv.tv_sec < 0) {
 356                 static int warned __read_mostly;
 357
 358                 *timeo_p = 0;
 359                 if (warned < 10 && net_ratelimit()) {
 360                         warned++;
 361                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 362                                 __func__, current->comm, task_pid_nr(current));
 363                 }
 364                 return 0;
 365         }
 366         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 367         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 368                 return 0;
 369         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 370                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 371         return 0;
 372 }
 373
 374 static void sock_warn_obsolete_bsdism(const char *name)
 375 {
 376         static int warned;
 377         static char warncomm[TASK_COMM_LEN];
 378         if (strcmp(warncomm, current->comm) && warned < 5) {
 379                 strcpy(warncomm,  current->comm);
 380                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 381                         warncomm, name);
 382                 warned++;
 383         }
 384 }
 385
 386 static bool sock_needs_netstamp(const struct sock *sk)
 387 {
 388         switch (sk->sk_family) {
 389         case AF_UNSPEC:
 390         case AF_UNIX:
 391                 return false;
 392         default:
 393                 return true;
 394         }
 395 }
 396
 397 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 398 {
 399         if (sk->sk_flags & flags) {
 400                 sk->sk_flags &= ~flags;
 401                 if (sock_needs_netstamp(sk) &&
 402                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 403                         net_disable_timestamp();
 404         }
 405 }
 406
 407
 408 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 409 {
 410         unsigned long flags;
 411         struct sk_buff_head *list = &sk->sk_receive_queue;
 412
 413         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 414                 atomic_inc(&sk->sk_drops);
 415                 trace_sock_rcvqueue_full(sk, skb);
 416                 return -ENOMEM;
 417         }
 418
 419         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 420                 atomic_inc(&sk->sk_drops);
 421                 return -ENOBUFS;
 422         }
 423
 424         skb->dev = NULL;
 425         skb_set_owner_r(skb, sk);
 426
 427         /* we escape from rcu protected region, make sure we dont leak
 428          * a norefcounted dst
 429          */
 430         skb_dst_force(skb);
 431
 432         spin_lock_irqsave(&list->lock, flags);
 433         sock_skb_set_dropcount(sk, skb);
 434         __skb_queue_tail(list, skb);
 435         spin_unlock_irqrestore(&list->lock, flags);
 436
 437         if (!sock_flag(sk, SOCK_DEAD))
 438                 sk->sk_data_ready(sk);
 439         return 0;
 440 }
 441 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 442
 443 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 444 {
 445         int err;
 446
 447         err = sk_filter(sk, skb);
 448         if (err)
 449                 return err;
 450
 451         return __sock_queue_rcv_skb(sk, skb);
 452 }
 453 EXPORT_SYMBOL(sock_queue_rcv_skb);
 454
 455 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 456                      const int nested, unsigned int trim_cap, bool refcounted)
 457 {
 458         int rc = NET_RX_SUCCESS;
 459
 460         if (sk_filter_trim_cap(sk, skb, trim_cap))
 461                 goto discard_and_relse;
 462
 463         skb->dev = NULL;
 464
 465         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 466                 atomic_inc(&sk->sk_drops);
 467                 goto discard_and_relse;
 468         }
 469         if (nested)
 470                 bh_lock_sock_nested(sk);
 471         else
 472                 bh_lock_sock(sk);
 473         if (!sock_owned_by_user(sk)) {
 474                 /*
 475                  * trylock + unlock semantics:
 476                  */
 477                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 478
 479                 rc = sk_backlog_rcv(sk, skb);
 480
 481                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 482         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 483                 bh_unlock_sock(sk);
 484                 atomic_inc(&sk->sk_drops);
 485                 goto discard_and_relse;
 486         }
 487
 488         bh_unlock_sock(sk);
 489 out:
 490         if (refcounted)
 491                 sock_put(sk);
 492         return rc;
 493 discard_and_relse:
 494         kfree_skb(skb);
 495         goto out;
 496 }
 497 EXPORT_SYMBOL(__sk_receive_skb);
 498
 499 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 500 {
 501         struct dst_entry *dst = __sk_dst_get(sk);
 502
 503         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 504                 sk_tx_queue_clear(sk);
 505                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 506                 dst_release(dst);
 507                 return NULL;
 508         }
 509
 510         return dst;
 511 }
 512 EXPORT_SYMBOL(__sk_dst_check);
 513
 514 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 515 {
 516         struct dst_entry *dst = sk_dst_get(sk);
 517
 518         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 519                 sk_dst_reset(sk);
 520                 dst_release(dst);
 521                 return NULL;
 522         }
 523
 524         return dst;
 525 }
 526 EXPORT_SYMBOL(sk_dst_check);
 527
 528 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 529                                 int optlen)
 530 {
 531         int ret = -ENOPROTOOPT;
 532 #ifdef CONFIG_NETDEVICES
 533         struct net *net = sock_net(sk);
 534         char devname[IFNAMSIZ];
 535         int index;
 536
 537         /* Sorry... */
 538         ret = -EPERM;
 539         if (!ns_capable(net->user_ns, CAP_NET_RAW))
 540                 goto out;
 541
 542         ret = -EINVAL;
 543         if (optlen < 0)
 544                 goto out;
 545
 546         /* Bind this socket to a particular device like "eth0",
 547          * as specified in the passed interface name. If the
 548          * name is "" or the option length is zero the socket
 549          * is not bound.
 550          */
 551         if (optlen > IFNAMSIZ - 1)
 552                 optlen = IFNAMSIZ - 1;
 553         memset(devname, 0, sizeof(devname));
 554
 555         ret = -EFAULT;
 556         if (copy_from_user(devname, optval, optlen))
 557                 goto out;
 558
 559         index = 0;
 560         if (devname[0] != '\0') {
 561                 struct net_device *dev;
 562
 563                 rcu_read_lock();
 564                 dev = dev_get_by_name_rcu(net, devname);
 565                 if (dev)
 566                         index = dev->ifindex;
 567                 rcu_read_unlock();
 568                 ret = -ENODEV;
 569                 if (!dev)
 570                         goto out;
 571         }
 572
 573         lock_sock(sk);
 574         sk->sk_bound_dev_if = index;
 575         sk_dst_reset(sk);
 576         release_sock(sk);
 577
 578         ret = 0;
 579
 580 out:
 581 #endif
 582
 583         return ret;
 584 }
 585
 586 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 587                                 int __user *optlen, int len)
 588 {
 589         int ret = -ENOPROTOOPT;
 590 #ifdef CONFIG_NETDEVICES
 591         struct net *net = sock_net(sk);
 592         char devname[IFNAMSIZ];
 593
 594         if (sk->sk_bound_dev_if == 0) {
 595                 len = 0;
 596                 goto zero;
 597         }
 598
 599         ret = -EINVAL;
 600         if (len < IFNAMSIZ)
 601                 goto out;
 602
 603         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 604         if (ret)
 605                 goto out;
 606
 607         len = strlen(devname) + 1;
 608
 609         ret = -EFAULT;
 610         if (copy_to_user(optval, devname, len))
 611                 goto out;
 612
 613 zero:
 614         ret = -EFAULT;
 615         if (put_user(len, optlen))
 616                 goto out;
 617
 618         ret = 0;
 619
 620 out:
 621 #endif
 622
 623         return ret;
 624 }
 625
 626 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 627 {
 628         if (valbool)
 629                 sock_set_flag(sk, bit);
 630         else
 631                 sock_reset_flag(sk, bit);
 632 }
 633
 634 bool sk_mc_loop(struct sock *sk)
 635 {
 636         if (dev_recursion_level())
 637                 return false;
 638         if (!sk)
 639                 return true;
 640         switch (sk->sk_family) {
 641         case AF_INET:
 642                 return inet_sk(sk)->mc_loop;
 643 #if IS_ENABLED(CONFIG_IPV6)
 644         case AF_INET6:
 645                 return inet6_sk(sk)->mc_loop;
 646 #endif
 647         }
 648         WARN_ON(1);
 649         return true;
 650 }
 651 EXPORT_SYMBOL(sk_mc_loop);
 652
 653 /*
 654  *      This is meant for all protocols to use and covers goings on
 655  *      at the socket level. Everything here is generic.
 656  */
 657
 658 int sock_setsockopt(struct socket *sock, int level, int optname,
 659                     char __user *optval, unsigned int optlen)
 660 {
 661         struct sock *sk = sock->sk;
 662         int val;
 663         int valbool;
 664         struct linger ling;
 665         int ret = 0;
 666
 667         /*
 668          *      Options without arguments
 669          */
 670
 671         if (optname == SO_BINDTODEVICE)
 672                 return sock_setbindtodevice(sk, optval, optlen);
 673
 674         if (optlen < sizeof(int))
 675                 return -EINVAL;
 676
 677         if (get_user(val, (int __user *)optval))
 678                 return -EFAULT;
 679
 680         valbool = val ? 1 : 0;
 681
 682         lock_sock(sk);
 683
 684         switch (optname) {
 685         case SO_DEBUG:
 686                 if (val && !capable(CAP_NET_ADMIN))
 687                         ret = -EACCES;
 688                 else
 689                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 690                 break;
 691         case SO_REUSEADDR:
 692                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 693                 break;
 694         case SO_REUSEPORT:
 695                 sk->sk_reuseport = valbool;
 696                 break;
 697         case SO_TYPE:
 698         case SO_PROTOCOL:
 699         case SO_DOMAIN:
 700         case SO_ERROR:
 701                 ret = -ENOPROTOOPT;
 702                 break;
 703         case SO_DONTROUTE:
 704                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 705                 break;
 706         case SO_BROADCAST:
 707                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 708                 break;
 709         case SO_SNDBUF:
 710                 /* Don't error on this BSD doesn't and if you think
 711                  * about it this is right. Otherwise apps have to
 712                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 713                  * are treated in BSD as hints
 714                  */
 715                 val = min_t(u32, val, sysctl_wmem_max);
 716 set_sndbuf:
 717                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 718                 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 719                 /* Wake up sending tasks if we upped the value. */
 720                 sk->sk_write_space(sk);
 721                 break;
 722
 723         case SO_SNDBUFFORCE:
 724                 if (!capable(CAP_NET_ADMIN)) {
 725                         ret = -EPERM;
 726                         break;
 727                 }
 728                 goto set_sndbuf;
 729
 730         case SO_RCVBUF:
 731                 /* Don't error on this BSD doesn't and if you think
 732                  * about it this is right. Otherwise apps have to
 733                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 734                  * are treated in BSD as hints
 735                  */
 736                 val = min_t(u32, val, sysctl_rmem_max);
 737 set_rcvbuf:
 738                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 739                 /*
 740                  * We double it on the way in to account for
 741                  * "struct sk_buff" etc. overhead.   Applications
 742                  * assume that the SO_RCVBUF setting they make will
 743                  * allow that much actual data to be received on that
 744                  * socket.
 745                  *
 746                  * Applications are unaware that "struct sk_buff" and
 747                  * other overheads allocate from the receive buffer
 748                  * during socket buffer allocation.
 749                  *
 750                  * And after considering the possible alternatives,
 751                  * returning the value we actually used in getsockopt
 752                  * is the most desirable behavior.
 753                  */
 754                 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 755                 break;
 756
 757         case SO_RCVBUFFORCE:
 758                 if (!capable(CAP_NET_ADMIN)) {
 759                         ret = -EPERM;
 760                         break;
 761                 }
 762                 goto set_rcvbuf;
 763
 764         case SO_KEEPALIVE:
 765 #ifdef CONFIG_INET
 766                 if (sk->sk_protocol == IPPROTO_TCP &&
 767                     sk->sk_type == SOCK_STREAM)
 768                         tcp_set_keepalive(sk, valbool);
 769 #endif
 770                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 771                 break;
 772
 773         case SO_OOBINLINE:
 774                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 775                 break;
 776
 777         case SO_NO_CHECK:
 778                 sk->sk_no_check_tx = valbool;
 779                 break;
 780
 781         case SO_PRIORITY:
 782                 if ((val >= 0 && val <= 6) ||
 783                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 784                         sk->sk_priority = val;
 785                 else
 786                         ret = -EPERM;
 787                 break;
 788
 789         case SO_LINGER:
 790                 if (optlen < sizeof(ling)) {
 791                         ret = -EINVAL;  /* 1003.1g */
 792                         break;
 793                 }
 794                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 795                         ret = -EFAULT;
 796                         break;
 797                 }
 798                 if (!ling.l_onoff)
 799                         sock_reset_flag(sk, SOCK_LINGER);
 800                 else {
 801 #if (BITS_PER_LONG == 32)
 802                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 803                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 804                         else
 805 #endif
 806                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 807                         sock_set_flag(sk, SOCK_LINGER);
 808                 }
 809                 break;
 810
 811         case SO_BSDCOMPAT:
 812                 sock_warn_obsolete_bsdism("setsockopt");
 813                 break;
 814
 815         case SO_PASSCRED:
 816                 if (valbool)
 817                         set_bit(SOCK_PASSCRED, &sock->flags);
 818                 else
 819                         clear_bit(SOCK_PASSCRED, &sock->flags);
 820                 break;
 821
 822         case SO_TIMESTAMP:
 823         case SO_TIMESTAMPNS:
 824                 if (valbool)  {
 825                         if (optname == SO_TIMESTAMP)
 826                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 827                         else
 828                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 829                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 830                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 831                 } else {
 832                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 833                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 834                 }
 835                 break;
 836
 837         case SO_TIMESTAMPING:
 838                 if (val & ~SOF_TIMESTAMPING_MASK) {
 839                         ret = -EINVAL;
 840                         break;
 841                 }
 842
 843                 if (val & SOF_TIMESTAMPING_OPT_ID &&
 844                     !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 845                         if (sk->sk_protocol == IPPROTO_TCP &&
 846                             sk->sk_type == SOCK_STREAM) {
 847                                 if ((1 << sk->sk_state) &
 848                                     (TCPF_CLOSE | TCPF_LISTEN)) {
 849                                         ret = -EINVAL;
 850                                         break;
 851                                 }
 852                                 sk->sk_tskey = tcp_sk(sk)->snd_una;
 853                         } else {
 854                                 sk->sk_tskey = 0;
 855                         }
 856                 }
 857
 858                 if (val & SOF_TIMESTAMPING_OPT_STATS &&
 859                     !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 860                         ret = -EINVAL;
 861                         break;
 862                 }
 863
 864                 sk->sk_tsflags = val;
 865                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 866                         sock_enable_timestamp(sk,
 867                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 868                 else
 869                         sock_disable_timestamp(sk,
 870                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 871                 break;
 872
 873         case SO_RCVLOWAT:
 874                 if (val < 0)
 875                         val = INT_MAX;
 876                 sk->sk_rcvlowat = val ? : 1;
 877                 break;
 878
 879         case SO_RCVTIMEO:
 880                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 881                 break;
 882
 883         case SO_SNDTIMEO:
 884                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 885                 break;
 886
 887         case SO_ATTACH_FILTER:
 888                 ret = -EINVAL;
 889                 if (optlen == sizeof(struct sock_fprog)) {
 890                         struct sock_fprog fprog;
 891
 892                         ret = -EFAULT;
 893                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 894                                 break;
 895
 896                         ret = sk_attach_filter(&fprog, sk);
 897                 }
 898                 break;
 899
 900         case SO_ATTACH_BPF:
 901                 ret = -EINVAL;
 902                 if (optlen == sizeof(u32)) {
 903                         u32 ufd;
 904
 905                         ret = -EFAULT;
 906                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 907                                 break;
 908
 909                         ret = sk_attach_bpf(ufd, sk);
 910                 }
 911                 break;
 912
 913         case SO_ATTACH_REUSEPORT_CBPF:
 914                 ret = -EINVAL;
 915                 if (optlen == sizeof(struct sock_fprog)) {
 916                         struct sock_fprog fprog;
 917
 918                         ret = -EFAULT;
 919                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 920                                 break;
 921
 922                         ret = sk_reuseport_attach_filter(&fprog, sk);
 923                 }
 924                 break;
 925
 926         case SO_ATTACH_REUSEPORT_EBPF:
 927                 ret = -EINVAL;
 928                 if (optlen == sizeof(u32)) {
 929                         u32 ufd;
 930
 931                         ret = -EFAULT;
 932                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 933                                 break;
 934
 935                         ret = sk_reuseport_attach_bpf(ufd, sk);
 936                 }
 937                 break;
 938
 939         case SO_DETACH_FILTER:
 940                 ret = sk_detach_filter(sk);
 941                 break;
 942
 943         case SO_LOCK_FILTER:
 944                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 945                         ret = -EPERM;
 946                 else
 947                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 948                 break;
 949
 950         case SO_PASSSEC:
 951                 if (valbool)
 952                         set_bit(SOCK_PASSSEC, &sock->flags);
 953                 else
 954                         clear_bit(SOCK_PASSSEC, &sock->flags);
 955                 break;
 956         case SO_MARK:
 957                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 958                         ret = -EPERM;
 959                 else
 960                         sk->sk_mark = val;
 961                 break;
 962
 963         case SO_RXQ_OVFL:
 964                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 965                 break;
 966
 967         case SO_WIFI_STATUS:
 968                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 969                 break;
 970
 971         case SO_PEEK_OFF:
 972                 if (sock->ops->set_peek_off)
 973                         ret = sock->ops->set_peek_off(sk, val);
 974                 else
 975                         ret = -EOPNOTSUPP;
 976                 break;
 977
 978         case SO_NOFCS:
 979                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 980                 break;
 981
 982         case SO_SELECT_ERR_QUEUE:
 983                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
 984                 break;
 985
 986 #ifdef CONFIG_NET_RX_BUSY_POLL
 987         case SO_BUSY_POLL:
 988                 /* allow unprivileged users to decrease the value */
 989                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
 990                         ret = -EPERM;
 991                 else {
 992                         if (val < 0)
 993                                 ret = -EINVAL;
 994                         else
 995                                 sk->sk_ll_usec = val;
 996                 }
 997                 break;
 998 #endif
 999
1000         case SO_MAX_PACING_RATE:
1001                 sk->sk_max_pacing_rate = val;
1002                 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1003                                          sk->sk_max_pacing_rate);
1004                 break;
1005
1006         case SO_INCOMING_CPU:
1007                 sk->sk_incoming_cpu = val;
1008                 break;
1009
1010         case SO_CNX_ADVICE:
1011                 if (val == 1)
1012                         dst_negative_advice(sk);
1013                 break;
1014         default:
1015                 ret = -ENOPROTOOPT;
1016                 break;
1017         }
1018         release_sock(sk);
1019         return ret;
1020 }
1021 EXPORT_SYMBOL(sock_setsockopt);
1022
1023
1024 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1025                           struct ucred *ucred)
1026 {
1027         ucred->pid = pid_vnr(pid);
1028         ucred->uid = ucred->gid = -1;
1029         if (cred) {
1030                 struct user_namespace *current_ns = current_user_ns();
1031
1032                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1033                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1034         }
1035 }
1036
1037 int sock_getsockopt(struct socket *sock, int level, int optname,
1038                     char __user *optval, int __user *optlen)
1039 {
1040         struct sock *sk = sock->sk;
1041
1042         union {
1043                 int val;
1044                 struct linger ling;
1045                 struct timeval tm;
1046         } v;
1047
1048         int lv = sizeof(int);
1049         int len;
1050
1051         if (get_user(len, optlen))
1052                 return -EFAULT;
1053         if (len < 0)
1054                 return -EINVAL;
1055
1056         memset(&v, 0, sizeof(v));
1057
1058         switch (optname) {
1059         case SO_DEBUG:
1060                 v.val = sock_flag(sk, SOCK_DBG);
1061                 break;
1062
1063         case SO_DONTROUTE:
1064                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1065                 break;
1066
1067         case SO_BROADCAST:
1068                 v.val = sock_flag(sk, SOCK_BROADCAST);
1069                 break;
1070
1071         case SO_SNDBUF:
1072                 v.val = sk->sk_sndbuf;
1073                 break;
1074
1075         case SO_RCVBUF:
1076                 v.val = sk->sk_rcvbuf;
1077                 break;
1078
1079         case SO_REUSEADDR:
1080                 v.val = sk->sk_reuse;
1081                 break;
1082
1083         case SO_REUSEPORT:
1084                 v.val = sk->sk_reuseport;
1085                 break;
1086
1087         case SO_KEEPALIVE:
1088                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1089                 break;
1090
1091         case SO_TYPE:
1092                 v.val = sk->sk_type;
1093                 break;
1094
1095         case SO_PROTOCOL:
1096                 v.val = sk->sk_protocol;
1097                 break;
1098
1099         case SO_DOMAIN:
1100                 v.val = sk->sk_family;
1101                 break;
1102
1103         case SO_ERROR:
1104                 v.val = -sock_error(sk);
1105                 if (v.val == 0)
1106                         v.val = xchg(&sk->sk_err_soft, 0);
1107                 break;
1108
1109         case SO_OOBINLINE:
1110                 v.val = sock_flag(sk, SOCK_URGINLINE);
1111                 break;
1112
1113         case SO_NO_CHECK:
1114                 v.val = sk->sk_no_check_tx;
1115                 break;
1116
1117         case SO_PRIORITY:
1118                 v.val = sk->sk_priority;
1119                 break;
1120
1121         case SO_LINGER:
1122                 lv              = sizeof(v.ling);
1123                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1124                 v.ling.l_linger = sk->sk_lingertime / HZ;
1125                 break;
1126
1127         case SO_BSDCOMPAT:
1128                 sock_warn_obsolete_bsdism("getsockopt");
1129                 break;
1130
1131         case SO_TIMESTAMP:
1132                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1133                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1134                 break;
1135
1136         case SO_TIMESTAMPNS:
1137                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1138                 break;
1139
1140         case SO_TIMESTAMPING:
1141                 v.val = sk->sk_tsflags;
1142                 break;
1143
1144         case SO_RCVTIMEO:
1145                 lv = sizeof(struct timeval);
1146                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1147                         v.tm.tv_sec = 0;
1148                         v.tm.tv_usec = 0;
1149                 } else {
1150                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1151                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1152                 }
1153                 break;
1154
1155         case SO_SNDTIMEO:
1156                 lv = sizeof(struct timeval);
1157                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1158                         v.tm.tv_sec = 0;
1159                         v.tm.tv_usec = 0;
1160                 } else {
1161                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1162                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1163                 }
1164                 break;
1165
1166         case SO_RCVLOWAT:
1167                 v.val = sk->sk_rcvlowat;
1168                 break;
1169
1170         case SO_SNDLOWAT:
1171                 v.val = 1;
1172                 break;
1173
1174         case SO_PASSCRED:
1175                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1176                 break;
1177
1178         case SO_PEERCRED:
1179         {
1180                 struct ucred peercred;
1181                 if (len > sizeof(peercred))
1182                         len = sizeof(peercred);
1183                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1184                 if (copy_to_user(optval, &peercred, len))
1185                         return -EFAULT;
1186                 goto lenout;
1187         }
1188
1189         case SO_PEERNAME:
1190         {
1191                 char address[128];
1192
1193                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1194                         return -ENOTCONN;
1195                 if (lv < len)
1196                         return -EINVAL;
1197                 if (copy_to_user(optval, address, len))
1198                         return -EFAULT;
1199                 goto lenout;
1200         }
1201
1202         /* Dubious BSD thing... Probably nobody even uses it, but
1203          * the UNIX standard wants it for whatever reason... -DaveM
1204          */
1205         case SO_ACCEPTCONN:
1206                 v.val = sk->sk_state == TCP_LISTEN;
1207                 break;
1208
1209         case SO_PASSSEC:
1210                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1211                 break;
1212
1213         case SO_PEERSEC:
1214                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1215
1216         case SO_MARK:
1217                 v.val = sk->sk_mark;
1218                 break;
1219
1220         case SO_RXQ_OVFL:
1221                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1222                 break;
1223
1224         case SO_WIFI_STATUS:
1225                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1226                 break;
1227
1228         case SO_PEEK_OFF:
1229                 if (!sock->ops->set_peek_off)
1230                         return -EOPNOTSUPP;
1231
1232                 v.val = sk->sk_peek_off;
1233                 break;
1234         case SO_NOFCS:
1235                 v.val = sock_flag(sk, SOCK_NOFCS);
1236                 break;
1237
1238         case SO_BINDTODEVICE:
1239                 return sock_getbindtodevice(sk, optval, optlen, len);
1240
1241         case SO_GET_FILTER:
1242                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1243                 if (len < 0)
1244                         return len;
1245
1246                 goto lenout;
1247
1248         case SO_LOCK_FILTER:
1249                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1250                 break;
1251
1252         case SO_BPF_EXTENSIONS:
1253                 v.val = bpf_tell_extensions();
1254                 break;
1255
1256         case SO_SELECT_ERR_QUEUE:
1257                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1258                 break;
1259
1260 #ifdef CONFIG_NET_RX_BUSY_POLL
1261         case SO_BUSY_POLL:
1262                 v.val = sk->sk_ll_usec;
1263                 break;
1264 #endif
1265
1266         case SO_MAX_PACING_RATE:
1267                 v.val = sk->sk_max_pacing_rate;
1268                 break;
1269
1270         case SO_INCOMING_CPU:
1271                 v.val = sk->sk_incoming_cpu;
1272                 break;
1273
1274         default:
1275                 /* We implement the SO_SNDLOWAT etc to not be settable
1276                  * (1003.1g 7).
1277                  */
1278                 return -ENOPROTOOPT;
1279         }
1280
1281         if (len > lv)
1282                 len = lv;
1283         if (copy_to_user(optval, &v, len))
1284                 return -EFAULT;
1285 lenout:
1286         if (put_user(len, optlen))
1287                 return -EFAULT;
1288         return 0;
1289 }
1290
1291 /*
1292  * Initialize an sk_lock.
1293  *
1294  * (We also register the sk_lock with the lock validator.)
1295  */
1296 static inline void sock_lock_init(struct sock *sk)
1297 {
1298         sock_lock_init_class_and_name(sk,
1299                         af_family_slock_key_strings[sk->sk_family],
1300                         af_family_slock_keys + sk->sk_family,
1301                         af_family_key_strings[sk->sk_family],
1302                         af_family_keys + sk->sk_family);
1303 }
1304
1305 /*
1306  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1307  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1308  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1309  */
1310 static void sock_copy(struct sock *nsk, const struct sock *osk)
1311 {
1312 #ifdef CONFIG_SECURITY_NETWORK
1313         void *sptr = nsk->sk_security;
1314 #endif
1315         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1316
1317         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1318                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1319
1320 #ifdef CONFIG_SECURITY_NETWORK
1321         nsk->sk_security = sptr;
1322         security_sk_clone(osk, nsk);
1323 #endif
1324 }
1325
1326 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1327                 int family)
1328 {
1329         struct sock *sk;
1330         struct kmem_cache *slab;
1331
1332         slab = prot->slab;
1333         if (slab != NULL) {
1334                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1335                 if (!sk)
1336                         return sk;
1337                 if (priority & __GFP_ZERO)
1338                         sk_prot_clear_nulls(sk, prot->obj_size);
1339         } else
1340                 sk = kmalloc(prot->obj_size, priority);
1341
1342         if (sk != NULL) {
1343                 kmemcheck_annotate_bitfield(sk, flags);
1344
1345                 if (security_sk_alloc(sk, family, priority))
1346                         goto out_free;
1347
1348                 if (!try_module_get(prot->owner))
1349                         goto out_free_sec;
1350                 sk_tx_queue_clear(sk);
1351         }
1352
1353         return sk;
1354
1355 out_free_sec:
1356         security_sk_free(sk);
1357 out_free:
1358         if (slab != NULL)
1359                 kmem_cache_free(slab, sk);
1360         else
1361                 kfree(sk);
1362         return NULL;
1363 }
1364
1365 static void sk_prot_free(struct proto *prot, struct sock *sk)
1366 {
1367         struct kmem_cache *slab;
1368         struct module *owner;
1369
1370         owner = prot->owner;
1371         slab = prot->slab;
1372
1373         cgroup_sk_free(&sk->sk_cgrp_data);
1374         mem_cgroup_sk_free(sk);
1375         security_sk_free(sk);
1376         if (slab != NULL)
1377                 kmem_cache_free(slab, sk);
1378         else
1379                 kfree(sk);
1380         module_put(owner);
1381 }
1382
1383 /**
1384  *      sk_alloc - All socket objects are allocated here
1385  *      @net: the applicable net namespace
1386  *      @family: protocol family
1387  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1388  *      @prot: struct proto associated with this new sock instance
1389  *      @kern: is this to be a kernel socket?
1390  */
1391 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1392                       struct proto *prot, int kern)
1393 {
1394         struct sock *sk;
1395
1396         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1397         if (sk) {
1398                 sk->sk_family = family;
1399                 /*
1400                  * See comment in struct sock definition to understand
1401                  * why we need sk_prot_creator -acme
1402                  */
1403                 sk->sk_prot = sk->sk_prot_creator = prot;
1404                 sock_lock_init(sk);
1405                 sk->sk_net_refcnt = kern ? 0 : 1;
1406                 if (likely(sk->sk_net_refcnt))
1407                         get_net(net);
1408                 sock_net_set(sk, net);
1409                 atomic_set(&sk->sk_wmem_alloc, 1);
1410
1411                 mem_cgroup_sk_alloc(sk);
1412                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1413                 sock_update_classid(&sk->sk_cgrp_data);
1414                 sock_update_netprioidx(&sk->sk_cgrp_data);
1415         }
1416
1417         return sk;
1418 }
1419 EXPORT_SYMBOL(sk_alloc);
1420
1421 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1422  * grace period. This is the case for UDP sockets and TCP listeners.
1423  */
1424 static void __sk_destruct(struct rcu_head *head)
1425 {
1426         struct sock *sk = container_of(head, struct sock, sk_rcu);
1427         struct sk_filter *filter;
1428
1429         if (sk->sk_destruct)
1430                 sk->sk_destruct(sk);
1431
1432         filter = rcu_dereference_check(sk->sk_filter,
1433                                        atomic_read(&sk->sk_wmem_alloc) == 0);
1434         if (filter) {
1435                 sk_filter_uncharge(sk, filter);
1436                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1437         }
1438         if (rcu_access_pointer(sk->sk_reuseport_cb))
1439                 reuseport_detach_sock(sk);
1440
1441         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1442
1443         if (atomic_read(&sk->sk_omem_alloc))
1444                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1445                          __func__, atomic_read(&sk->sk_omem_alloc));
1446
1447         if (sk->sk_frag.page) {
1448                 put_page(sk->sk_frag.page);
1449                 sk->sk_frag.page = NULL;
1450         }
1451
1452         if (sk->sk_peer_cred)
1453                 put_cred(sk->sk_peer_cred);
1454         put_pid(sk->sk_peer_pid);
1455         if (likely(sk->sk_net_refcnt))
1456                 put_net(sock_net(sk));
1457         sk_prot_free(sk->sk_prot_creator, sk);
1458 }
1459
1460 void sk_destruct(struct sock *sk)
1461 {
1462         if (sock_flag(sk, SOCK_RCU_FREE))
1463                 call_rcu(&sk->sk_rcu, __sk_destruct);
1464         else
1465                 __sk_destruct(&sk->sk_rcu);
1466 }
1467
1468 static void __sk_free(struct sock *sk)
1469 {
1470         if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1471                 sock_diag_broadcast_destroy(sk);
1472         else
1473                 sk_destruct(sk);
1474 }
1475
1476 void sk_free(struct sock *sk)
1477 {
1478         /*
1479          * We subtract one from sk_wmem_alloc and can know if
1480          * some packets are still in some tx queue.
1481          * If not null, sock_wfree() will call __sk_free(sk) later
1482          */
1483         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1484                 __sk_free(sk);
1485 }
1486 EXPORT_SYMBOL(sk_free);
1487
1488 /**
1489  *      sk_clone_lock - clone a socket, and lock its clone
1490  *      @sk: the socket to clone
1491  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1492  *
1493  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1494  */
1495 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1496 {
1497         struct sock *newsk;
1498         bool is_charged = true;
1499
1500         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1501         if (newsk != NULL) {
1502                 struct sk_filter *filter;
1503
1504                 sock_copy(newsk, sk);
1505
1506                 /* SANITY */
1507                 if (likely(newsk->sk_net_refcnt))
1508                         get_net(sock_net(newsk));
1509                 sk_node_init(&newsk->sk_node);
1510                 sock_lock_init(newsk);
1511                 bh_lock_sock(newsk);
1512                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1513                 newsk->sk_backlog.len = 0;
1514
1515                 atomic_set(&newsk->sk_rmem_alloc, 0);
1516                 /*
1517                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1518                  */
1519                 atomic_set(&newsk->sk_wmem_alloc, 1);
1520                 atomic_set(&newsk->sk_omem_alloc, 0);
1521                 skb_queue_head_init(&newsk->sk_receive_queue);
1522                 skb_queue_head_init(&newsk->sk_write_queue);
1523
1524                 rwlock_init(&newsk->sk_callback_lock);
1525                 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1526                                 af_callback_keys + newsk->sk_family,
1527                                 af_family_clock_key_strings[newsk->sk_family]);
1528
1529                 newsk->sk_dst_cache     = NULL;
1530                 newsk->sk_wmem_queued   = 0;
1531                 newsk->sk_forward_alloc = 0;
1532                 atomic_set(&newsk->sk_drops, 0);
1533                 newsk->sk_send_head     = NULL;
1534                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1535
1536                 sock_reset_flag(newsk, SOCK_DONE);
1537                 skb_queue_head_init(&newsk->sk_error_queue);
1538
1539                 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1540                 if (filter != NULL)
1541                         /* though it's an empty new sock, the charging may fail
1542                          * if sysctl_optmem_max was changed between creation of
1543                          * original socket and cloning
1544                          */
1545                         is_charged = sk_filter_charge(newsk, filter);
1546
1547                 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1548                         /* We need to make sure that we don't uncharge the new
1549                          * socket if we couldn't charge it in the first place
1550                          * as otherwise we uncharge the parent's filter.
1551                          */
1552                         if (!is_charged)
1553                                 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1554                         /* It is still raw copy of parent, so invalidate
1555                          * destructor and make plain sk_free() */
1556                         newsk->sk_destruct = NULL;
1557                         bh_unlock_sock(newsk);
1558                         sk_free(newsk);
1559                         newsk = NULL;
1560                         goto out;
1561                 }
1562                 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1563
1564                 newsk->sk_err      = 0;
1565                 newsk->sk_err_soft = 0;
1566                 newsk->sk_priority = 0;
1567                 newsk->sk_incoming_cpu = raw_smp_processor_id();
1568                 atomic64_set(&newsk->sk_cookie, 0);
1569
1570                 mem_cgroup_sk_alloc(newsk);
1571                 cgroup_sk_alloc(&newsk->sk_cgrp_data);
1572
1573                 /*
1574                  * Before updating sk_refcnt, we must commit prior changes to memory
1575                  * (Documentation/RCU/rculist_nulls.txt for details)
1576                  */
1577                 smp_wmb();
1578                 atomic_set(&newsk->sk_refcnt, 2);
1579
1580                 /*
1581                  * Increment the counter in the same struct proto as the master
1582                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1583                  * is the same as sk->sk_prot->socks, as this field was copied
1584                  * with memcpy).
1585                  *
1586                  * This _changes_ the previous behaviour, where
1587                  * tcp_create_openreq_child always was incrementing the
1588                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1589                  * to be taken into account in all callers. -acme
1590                  */
1591                 sk_refcnt_debug_inc(newsk);
1592                 sk_set_socket(newsk, NULL);
1593                 newsk->sk_wq = NULL;
1594
1595                 if (newsk->sk_prot->sockets_allocated)
1596                         sk_sockets_allocated_inc(newsk);
1597
1598                 if (sock_needs_netstamp(sk) &&
1599                     newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1600                         net_enable_timestamp();
1601         }
1602 out:
1603         return newsk;
1604 }
1605 EXPORT_SYMBOL_GPL(sk_clone_lock);
1606
1607 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1608 {
1609         u32 max_segs = 1;
1610
1611         sk_dst_set(sk, dst);
1612         sk->sk_route_caps = dst->dev->features;
1613         if (sk->sk_route_caps & NETIF_F_GSO)
1614                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1615         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1616         if (sk_can_gso(sk)) {
1617                 if (dst->header_len) {
1618                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1619                 } else {
1620                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1621                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1622                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1623                 }
1624         }
1625         sk->sk_gso_max_segs = max_segs;
1626 }
1627 EXPORT_SYMBOL_GPL(sk_setup_caps);
1628
1629 /*
1630  *      Simple resource managers for sockets.
1631  */
1632
1633
1634 /*
1635  * Write buffer destructor automatically called from kfree_skb.
1636  */
1637 void sock_wfree(struct sk_buff *skb)
1638 {
1639         struct sock *sk = skb->sk;
1640         unsigned int len = skb->truesize;
1641
1642         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1643                 /*
1644                  * Keep a reference on sk_wmem_alloc, this will be released
1645                  * after sk_write_space() call
1646                  */
1647                 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1648                 sk->sk_write_space(sk);
1649                 len = 1;
1650         }
1651         /*
1652          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1653          * could not do because of in-flight packets
1654          */
1655         if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1656                 __sk_free(sk);
1657 }
1658 EXPORT_SYMBOL(sock_wfree);
1659
1660 /* This variant of sock_wfree() is used by TCP,
1661  * since it sets SOCK_USE_WRITE_QUEUE.
1662  */
1663 void __sock_wfree(struct sk_buff *skb)
1664 {
1665         struct sock *sk = skb->sk;
1666
1667         if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1668                 __sk_free(sk);
1669 }
1670
1671 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1672 {
1673         skb_orphan(skb);
1674         skb->sk = sk;
1675 #ifdef CONFIG_INET
1676         if (unlikely(!sk_fullsock(sk))) {
1677                 skb->destructor = sock_edemux;
1678                 sock_hold(sk);
1679                 return;
1680         }
1681 #endif
1682         skb->destructor = sock_wfree;
1683         skb_set_hash_from_sk(skb, sk);
1684         /*
1685          * We used to take a refcount on sk, but following operation
1686          * is enough to guarantee sk_free() wont free this sock until
1687          * all in-flight packets are completed
1688          */
1689         atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1690 }
1691 EXPORT_SYMBOL(skb_set_owner_w);
1692
1693 /* This helper is used by netem, as it can hold packets in its
1694  * delay queue. We want to allow the owner socket to send more
1695  * packets, as if they were already TX completed by a typical driver.
1696  * But we also want to keep skb->sk set because some packet schedulers
1697  * rely on it (sch_fq for example). So we set skb->truesize to a small
1698  * amount (1) and decrease sk_wmem_alloc accordingly.
1699  */
1700 void skb_orphan_partial(struct sk_buff *skb)
1701 {
1702         /* If this skb is a TCP pure ACK or already went here,
1703          * we have nothing to do. 2 is already a very small truesize.
1704          */
1705         if (skb->truesize <= 2)
1706                 return;
1707
1708         /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1709          * so we do not completely orphan skb, but transfert all
1710          * accounted bytes but one, to avoid unexpected reorders.
1711          */
1712         if (skb->destructor == sock_wfree
1713 #ifdef CONFIG_INET
1714             || skb->destructor == tcp_wfree
1715 #endif
1716                 ) {
1717                 atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1718                 skb->truesize = 1;
1719         } else {
1720                 skb_orphan(skb);
1721         }
1722 }
1723 EXPORT_SYMBOL(skb_orphan_partial);
1724
1725 /*
1726  * Read buffer destructor automatically called from kfree_skb.
1727  */
1728 void sock_rfree(struct sk_buff *skb)
1729 {
1730         struct sock *sk = skb->sk;
1731         unsigned int len = skb->truesize;
1732
1733         atomic_sub(len, &sk->sk_rmem_alloc);
1734         sk_mem_uncharge(sk, len);
1735 }
1736 EXPORT_SYMBOL(sock_rfree);
1737
1738 /*
1739  * Buffer destructor for skbs that are not used directly in read or write
1740  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1741  */
1742 void sock_efree(struct sk_buff *skb)
1743 {
1744         sock_put(skb->sk);
1745 }
1746 EXPORT_SYMBOL(sock_efree);
1747
1748 kuid_t sock_i_uid(struct sock *sk)
1749 {
1750         kuid_t uid;
1751
1752         read_lock_bh(&sk->sk_callback_lock);
1753         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1754         read_unlock_bh(&sk->sk_callback_lock);
1755         return uid;
1756 }
1757 EXPORT_SYMBOL(sock_i_uid);
1758
1759 unsigned long sock_i_ino(struct sock *sk)
1760 {
1761         unsigned long ino;
1762
1763         read_lock_bh(&sk->sk_callback_lock);
1764         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1765         read_unlock_bh(&sk->sk_callback_lock);
1766         return ino;
1767 }
1768 EXPORT_SYMBOL(sock_i_ino);
1769
1770 /*
1771  * Allocate a skb from the socket's send buffer.
1772  */
1773 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1774                              gfp_t priority)
1775 {
1776         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1777                 struct sk_buff *skb = alloc_skb(size, priority);
1778                 if (skb) {
1779                         skb_set_owner_w(skb, sk);
1780                         return skb;
1781                 }
1782         }
1783         return NULL;
1784 }
1785 EXPORT_SYMBOL(sock_wmalloc);
1786
1787 /*
1788  * Allocate a memory block from the socket's option memory buffer.
1789  */
1790 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1791 {
1792         if ((unsigned int)size <= sysctl_optmem_max &&
1793             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1794                 void *mem;
1795                 /* First do the add, to avoid the race if kmalloc
1796                  * might sleep.
1797                  */
1798                 atomic_add(size, &sk->sk_omem_alloc);
1799                 mem = kmalloc(size, priority);
1800                 if (mem)
1801                         return mem;
1802                 atomic_sub(size, &sk->sk_omem_alloc);
1803         }
1804         return NULL;
1805 }
1806 EXPORT_SYMBOL(sock_kmalloc);
1807
1808 /* Free an option memory block. Note, we actually want the inline
1809  * here as this allows gcc to detect the nullify and fold away the
1810  * condition entirely.
1811  */
1812 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1813                                   const bool nullify)
1814 {
1815         if (WARN_ON_ONCE(!mem))
1816                 return;
1817         if (nullify)
1818                 kzfree(mem);
1819         else
1820                 kfree(mem);
1821         atomic_sub(size, &sk->sk_omem_alloc);
1822 }
1823
1824 void sock_kfree_s(struct sock *sk, void *mem, int size)
1825 {
1826         __sock_kfree_s(sk, mem, size, false);
1827 }
1828 EXPORT_SYMBOL(sock_kfree_s);
1829
1830 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1831 {
1832         __sock_kfree_s(sk, mem, size, true);
1833 }
1834 EXPORT_SYMBOL(sock_kzfree_s);
1835
1836 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1837    I think, these locks should be removed for datagram sockets.
1838  */
1839 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1840 {
1841         DEFINE_WAIT(wait);
1842
1843         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1844         for (;;) {
1845                 if (!timeo)
1846                         break;
1847                 if (signal_pending(current))
1848                         break;
1849                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1850                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1851                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1852                         break;
1853                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1854                         break;
1855                 if (sk->sk_err)
1856                         break;
1857                 timeo = schedule_timeout(timeo);
1858         }
1859         finish_wait(sk_sleep(sk), &wait);
1860         return timeo;
1861 }
1862
1863
1864 /*
1865  *      Generic send/receive buffer handlers
1866  */
1867
1868 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1869                                      unsigned long data_len, int noblock,
1870                                      int *errcode, int max_page_order)
1871 {
1872         struct sk_buff *skb;
1873         long timeo;
1874         int err;
1875
1876         timeo = sock_sndtimeo(sk, noblock);
1877         for (;;) {
1878                 err = sock_error(sk);
1879                 if (err != 0)
1880                         goto failure;
1881
1882                 err = -EPIPE;
1883                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1884                         goto failure;
1885
1886                 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1887                         break;
1888
1889                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1890                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1891                 err = -EAGAIN;
1892                 if (!timeo)
1893                         goto failure;
1894                 if (signal_pending(current))
1895                         goto interrupted;
1896                 timeo = sock_wait_for_wmem(sk, timeo);
1897         }
1898         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1899                                    errcode, sk->sk_allocation);
1900         if (skb)
1901                 skb_set_owner_w(skb, sk);
1902         return skb;
1903
1904 interrupted:
1905         err = sock_intr_errno(timeo);
1906 failure:
1907         *errcode = err;
1908         return NULL;
1909 }
1910 EXPORT_SYMBOL(sock_alloc_send_pskb);
1911
/*
 * Linear-only front end to sock_alloc_send_pskb(): @size bytes of
 * header, no paged data, page order 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long paged_len = 0;
	const int page_order = 0;

	return sock_alloc_send_pskb(sk, size, paged_len, noblock, errcode,
				    page_order);
}
1917 EXPORT_SYMBOL(sock_alloc_send_skb);
1918
1919 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
1920                      struct sockcm_cookie *sockc)
1921 {
1922         u32 tsflags;
1923
1924         switch (cmsg->cmsg_type) {
1925         case SO_MARK:
1926                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1927                         return -EPERM;
1928                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1929                         return -EINVAL;
1930                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1931                 break;
1932         case SO_TIMESTAMPING:
1933                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1934                         return -EINVAL;
1935
1936                 tsflags = *(u32 *)CMSG_DATA(cmsg);
1937                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
1938                         return -EINVAL;
1939
1940                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
1941                 sockc->tsflags |= tsflags;
1942                 break;
1943         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
1944         case SCM_RIGHTS:
1945         case SCM_CREDENTIALS:
1946                 break;
1947         default:
1948                 return -EINVAL;
1949         }
1950         return 0;
1951 }
1952 EXPORT_SYMBOL(__sock_cmsg_send);
1953
1954 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1955                    struct sockcm_cookie *sockc)
1956 {
1957         struct cmsghdr *cmsg;
1958         int ret;
1959
1960         for_each_cmsghdr(cmsg, msg) {
1961                 if (!CMSG_OK(msg, cmsg))
1962                         return -EINVAL;
1963                 if (cmsg->cmsg_level != SOL_SOCKET)
1964                         continue;
1965                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
1966                 if (ret)
1967                         return ret;
1968         }
1969         return 0;
1970 }
1971 EXPORT_SYMBOL(sock_cmsg_send);
1972
1973 /* On 32bit arches, an skb frag is limited to 2^15 */
1974 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
1975
1976 /**
1977  * skb_page_frag_refill - check that a page_frag contains enough room
1978  * @sz: minimum size of the fragment we want to get
1979  * @pfrag: pointer to page_frag
1980  * @gfp: priority for memory allocation
1981  *
1982  * Note: While this allocator tries to use high order pages, there is
1983  * no guarantee that allocations succeed. Therefore, @sz MUST be
1984  * less or equal than PAGE_SIZE.
1985  */
1986 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1987 {
1988         if (pfrag->page) {
1989                 if (page_ref_count(pfrag->page) == 1) {
1990                         pfrag->offset = 0;
1991                         return true;
1992                 }
1993                 if (pfrag->offset + sz <= pfrag->size)
1994                         return true;
1995                 put_page(pfrag->page);
1996         }
1997
1998         pfrag->offset = 0;
1999         if (SKB_FRAG_PAGE_ORDER) {
2000                 /* Avoid direct reclaim but allow kswapd to wake */
2001                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2002                                           __GFP_COMP | __GFP_NOWARN |
2003                                           __GFP_NORETRY,
2004                                           SKB_FRAG_PAGE_ORDER);
2005                 if (likely(pfrag->page)) {
2006                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2007                         return true;
2008                 }
2009         }
2010         pfrag->page = alloc_page(gfp);
2011         if (likely(pfrag->page)) {
2012                 pfrag->size = PAGE_SIZE;
2013                 return true;
2014         }
2015         return false;
2016 }
2017 EXPORT_SYMBOL(skb_page_frag_refill);
2018
2019 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2020 {
2021         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2022                 return true;
2023
2024         sk_enter_memory_pressure(sk);
2025         sk_stream_moderate_sndbuf(sk);
2026         return false;
2027 }
2028 EXPORT_SYMBOL(sk_page_frag_refill);
2029
2030 static void __lock_sock(struct sock *sk)
2031         __releases(&sk->sk_lock.slock)
2032         __acquires(&sk->sk_lock.slock)
2033 {
2034         DEFINE_WAIT(wait);
2035
2036         for (;;) {
2037                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2038                                         TASK_UNINTERRUPTIBLE);
2039                 spin_unlock_bh(&sk->sk_lock.slock);
2040                 schedule();
2041                 spin_lock_bh(&sk->sk_lock.slock);
2042                 if (!sock_owned_by_user(sk))
2043                         break;
2044         }
2045         finish_wait(&sk->sk_lock.wq, &wait);
2046 }
2047
2048 static void __release_sock(struct sock *sk)
2049         __releases(&sk->sk_lock.slock)
2050         __acquires(&sk->sk_lock.slock)
2051 {
2052         struct sk_buff *skb, *next;
2053
2054         while ((skb = sk->sk_backlog.head) != NULL) {
2055                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2056
2057                 spin_unlock_bh(&sk->sk_lock.slock);
2058
2059                 do {
2060                         next = skb->next;
2061                         prefetch(next);
2062                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2063                         skb->next = NULL;
2064                         sk_backlog_rcv(sk, skb);
2065
2066                         cond_resched();
2067
2068                         skb = next;
2069                 } while (skb != NULL);
2070
2071                 spin_lock_bh(&sk->sk_lock.slock);
2072         }
2073
2074         /*
2075          * Doing the zeroing here guarantee we can not loop forever
2076          * while a wild producer attempts to flood us.
2077          */
2078         sk->sk_backlog.len = 0;
2079 }
2080
2081 void __sk_flush_backlog(struct sock *sk)
2082 {
2083         spin_lock_bh(&sk->sk_lock.slock);
2084         __release_sock(sk);
2085         spin_unlock_bh(&sk->sk_lock.slock);
2086 }
2087
2088 /**
2089  * sk_wait_data - wait for data to arrive at sk_receive_queue
2090  * @sk:    sock to wait on
2091  * @timeo: for how long
2092  * @skb:   last skb seen on sk_receive_queue
2093  *
2094  * Now socket state including sk->sk_err is changed only under lock,
2095  * hence we may omit checks after joining wait queue.
2096  * We check receive queue before schedule() only as optimization;
2097  * it is very likely that release_sock() added new data.
2098  */
2099 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2100 {
2101         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2102         int rc;
2103
2104         add_wait_queue(sk_sleep(sk), &wait);
2105         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2106         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2107         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2108         remove_wait_queue(sk_sleep(sk), &wait);
2109         return rc;
2110 }
2111 EXPORT_SYMBOL(sk_wait_data);
2112
2113 /**
2114  *      __sk_mem_raise_allocated - increase memory_allocated
2115  *      @sk: socket
2116  *      @size: memory size to allocate
2117  *      @amt: pages to allocate
2118  *      @kind: allocation type
2119  *
2120  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2121  */
2122 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2123 {
2124         struct proto *prot = sk->sk_prot;
2125         long allocated = sk_memory_allocated_add(sk, amt);
2126
2127         if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2128             !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2129                 goto suppress_allocation;
2130
2131         /* Under limit. */
2132         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2133                 sk_leave_memory_pressure(sk);
2134                 return 1;
2135         }
2136
2137         /* Under pressure. */
2138         if (allocated > sk_prot_mem_limits(sk, 1))
2139                 sk_enter_memory_pressure(sk);
2140
2141         /* Over hard limit. */
2142         if (allocated > sk_prot_mem_limits(sk, 2))
2143                 goto suppress_allocation;
2144
2145         /* guarantee minimum buffer size under pressure */
2146         if (kind == SK_MEM_RECV) {
2147                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2148                         return 1;
2149
2150         } else { /* SK_MEM_SEND */
2151                 if (sk->sk_type == SOCK_STREAM) {
2152                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2153                                 return 1;
2154                 } else if (atomic_read(&sk->sk_wmem_alloc) <
2155                            prot->sysctl_wmem[0])
2156                                 return 1;
2157         }
2158
2159         if (sk_has_memory_pressure(sk)) {
2160                 int alloc;
2161
2162                 if (!sk_under_memory_pressure(sk))
2163                         return 1;
2164                 alloc = sk_sockets_allocated_read_positive(sk);
2165                 if (sk_prot_mem_limits(sk, 2) > alloc *
2166                     sk_mem_pages(sk->sk_wmem_queued +
2167                                  atomic_read(&sk->sk_rmem_alloc) +
2168                                  sk->sk_forward_alloc))
2169                         return 1;
2170         }
2171
2172 suppress_allocation:
2173
2174         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2175                 sk_stream_moderate_sndbuf(sk);
2176
2177                 /* Fail only if socket is _under_ its sndbuf.
2178                  * In this case we cannot block, so that we have to fail.
2179                  */
2180                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2181                         return 1;
2182         }
2183
2184         trace_sock_exceed_buf_limit(sk, prot, allocated);
2185
2186         sk_memory_allocated_sub(sk, amt);
2187
2188         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2189                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2190
2191         return 0;
2192 }
2193 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2194
2195 /**
2196  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2197  *      @sk: socket
2198  *      @size: memory size to allocate
2199  *      @kind: allocation type
2200  *
2201  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2202  *      rmem allocation. This function assumes that protocols which have
2203  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2204  */
2205 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2206 {
2207         int ret, amt = sk_mem_pages(size);
2208
2209         sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2210         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2211         if (!ret)
2212                 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2213         return ret;
2214 }
2215 EXPORT_SYMBOL(__sk_mem_schedule);
2216
2217 /**
2218  *      __sk_mem_reduce_allocated - reclaim memory_allocated
2219  *      @sk: socket
2220  *      @amount: number of quanta
2221  *
2222  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2223  */
2224 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2225 {
2226         sk_memory_allocated_sub(sk, amount);
2227
2228         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2229                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2230
2231         if (sk_under_memory_pressure(sk) &&
2232             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2233                 sk_leave_memory_pressure(sk);
2234 }
2235 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2236
2237 /**
2238  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2239  *      @sk: socket
2240  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2241  */
2242 void __sk_mem_reclaim(struct sock *sk, int amount)
2243 {
2244         amount >>= SK_MEM_QUANTUM_SHIFT;
2245         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2246         __sk_mem_reduce_allocated(sk, amount);
2247 }
2248 EXPORT_SYMBOL(__sk_mem_reclaim);
2249
2250 int sk_set_peek_off(struct sock *sk, int val)
2251 {
2252         if (val < 0)
2253                 return -EINVAL;
2254
2255         sk->sk_peek_off = val;
2256         return 0;
2257 }
2258 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2259
2260 /*
2261  * Set of default routines for initialising struct proto_ops when
2262  * the protocol does not support a particular function. In certain
2263  * cases where it makes no sense for a protocol to have a "do nothing"
2264  * function, some default processing is provided.
2265  */
2266
/* bind() stub for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
2271 EXPORT_SYMBOL(sock_no_bind);
2272
/* connect() stub for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
2278 EXPORT_SYMBOL(sock_no_connect);
2279
/* socketpair() stub: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
2284 EXPORT_SYMBOL(sock_no_socketpair);
2285
/* accept() stub: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
2290 EXPORT_SYMBOL(sock_no_accept);
2291
/* getname() stub: always -EOPNOTSUPP; @saddr and @len are not touched. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
2297 EXPORT_SYMBOL(sock_no_getname);
2298
2299 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2300 {
2301         return 0;
2302 }
2303 EXPORT_SYMBOL(sock_no_poll);
2304
/* ioctl() stub: always -EOPNOTSUPP, whatever @cmd is. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
2309 EXPORT_SYMBOL(sock_no_ioctl);
2310
/* listen() stub: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
2315 EXPORT_SYMBOL(sock_no_listen);
2316
/* shutdown() stub: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
2321 EXPORT_SYMBOL(sock_no_shutdown);
2322
2323 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2324                     char __user *optval, unsigned int optlen)
2325 {
2326         return -EOPNOTSUPP;
2327 }
2328 EXPORT_SYMBOL(sock_no_setsockopt);
2329
2330 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2331                     char __user *optval, int __user *optlen)
2332 {
2333         return -EOPNOTSUPP;
2334 }
2335 EXPORT_SYMBOL(sock_no_getsockopt);
2336
/* sendmsg() stub: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
2341 EXPORT_SYMBOL(sock_no_sendmsg);
2342
/* recvmsg() stub: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
2348 EXPORT_SYMBOL(sock_no_recvmsg);
2349
/* mmap() stub: always -ENODEV. */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Same error code as a missing mmap method. */
	return -ENODEV;
}
2355 EXPORT_SYMBOL(sock_no_mmap);
2356
2357 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2358 {
2359         ssize_t res;
2360         struct msghdr msg = {.msg_flags = flags};
2361         struct kvec iov;
2362         char *kaddr = kmap(page);
2363         iov.iov_base = kaddr + offset;
2364         iov.iov_len = size;
2365         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2366         kunmap(page);
2367         return res;
2368 }
2369 EXPORT_SYMBOL(sock_no_sendpage);
2370
2371 /*
2372  *      Default Socket Callbacks
2373  */
2374
2375 static void sock_def_wakeup(struct sock *sk)
2376 {
2377         struct socket_wq *wq;
2378
2379         rcu_read_lock();
2380         wq = rcu_dereference(sk->sk_wq);
2381         if (skwq_has_sleeper(wq))
2382                 wake_up_interruptible_all(&wq->wait);
2383         rcu_read_unlock();
2384 }
2385
2386 static void sock_def_error_report(struct sock *sk)
2387 {
2388         struct socket_wq *wq;
2389
2390         rcu_read_lock();
2391         wq = rcu_dereference(sk->sk_wq);
2392         if (skwq_has_sleeper(wq))
2393                 wake_up_interruptible_poll(&wq->wait, POLLERR);
2394         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2395         rcu_read_unlock();
2396 }
2397
2398 static void sock_def_readable(struct sock *sk)
2399 {
2400         struct socket_wq *wq;
2401
2402         rcu_read_lock();
2403         wq = rcu_dereference(sk->sk_wq);
2404         if (skwq_has_sleeper(wq))
2405                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2406                                                 POLLRDNORM | POLLRDBAND);
2407         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2408         rcu_read_unlock();
2409 }
2410
2411 static void sock_def_write_space(struct sock *sk)
2412 {
2413         struct socket_wq *wq;
2414
2415         rcu_read_lock();
2416
2417         /* Do not wake up a writer until he can make "significant"
2418          * progress.  --DaveM
2419          */
2420         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2421                 wq = rcu_dereference(sk->sk_wq);
2422                 if (skwq_has_sleeper(wq))
2423                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2424                                                 POLLWRNORM | POLLWRBAND);
2425
2426                 /* Should agree with poll, otherwise some programs break */
2427                 if (sock_writeable(sk))
2428                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2429         }
2430
2431         rcu_read_unlock();
2432 }
2433
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
2437
2438 void sk_send_sigurg(struct sock *sk)
2439 {
2440         if (sk->sk_socket && sk->sk_socket->file)
2441                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2442                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2443 }
2444 EXPORT_SYMBOL(sk_send_sigurg);
2445
/*
 * (Re)arm @timer to fire at @expires. A reference on @sk is taken when
 * mod_timer() returns zero.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	const int was_pending = mod_timer(timer, expires);

	if (!was_pending)
		sock_hold(sk);
}
2452 EXPORT_SYMBOL(sk_reset_timer);
2453
/*
 * Cancel @timer. When del_timer() returns non-zero, the socket
 * reference is dropped with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	const int was_pending = del_timer(timer);

	if (was_pending)
		__sock_put(sk);
}
2459 EXPORT_SYMBOL(sk_stop_timer);
2460
2461 void sock_init_data(struct socket *sock, struct sock *sk)
2462 {
2463         skb_queue_head_init(&sk->sk_receive_queue);
2464         skb_queue_head_init(&sk->sk_write_queue);
2465         skb_queue_head_init(&sk->sk_error_queue);
2466
2467         sk->sk_send_head        =       NULL;
2468
2469         init_timer(&sk->sk_timer);
2470
2471         sk->sk_allocation       =       GFP_KERNEL;
2472         sk->sk_rcvbuf           =       sysctl_rmem_default;
2473         sk->sk_sndbuf           =       sysctl_wmem_default;
2474         sk->sk_state            =       TCP_CLOSE;
2475         sk_set_socket(sk, sock);
2476
2477         sock_set_flag(sk, SOCK_ZAPPED);
2478
2479         if (sock) {
2480                 sk->sk_type     =       sock->type;
2481                 sk->sk_wq       =       sock->wq;
2482                 sock->sk        =       sk;
2483                 sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
2484         } else {
2485                 sk->sk_wq       =       NULL;
2486                 sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
2487         }
2488
2489         rwlock_init(&sk->sk_callback_lock);
2490         lockdep_set_class_and_name(&sk->sk_callback_lock,
2491                         af_callback_keys + sk->sk_family,
2492                         af_family_clock_key_strings[sk->sk_family]);
2493
2494         sk->sk_state_change     =       sock_def_wakeup;
2495         sk->sk_data_ready       =       sock_def_readable;
2496         sk->sk_write_space      =       sock_def_write_space;
2497         sk->sk_error_report     =       sock_def_error_report;
2498         sk->sk_destruct         =       sock_def_destruct;
2499
2500         sk->sk_frag.page        =       NULL;
2501         sk->sk_frag.offset      =       0;
2502         sk->sk_peek_off         =       -1;
2503
2504         sk->sk_peer_pid         =       NULL;
2505         sk->sk_peer_cred        =       NULL;
2506         sk->sk_write_pending    =       0;
2507         sk->sk_rcvlowat         =       1;
2508         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2509         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2510
2511         sk->sk_stamp = ktime_set(-1L, 0);
2512
2513 #ifdef CONFIG_NET_RX_BUSY_POLL
2514         sk->sk_napi_id          =       0;
2515         sk->sk_ll_usec          =       sysctl_net_busy_read;
2516 #endif
2517
2518         sk->sk_max_pacing_rate = ~0U;
2519         sk->sk_pacing_rate = ~0U;
2520         sk->sk_incoming_cpu = -1;
2521         /*
2522          * Before updating sk_refcnt, we must commit prior changes to memory
2523          * (Documentation/RCU/rculist_nulls.txt for details)
2524          */
2525         smp_wmb();
2526         atomic_set(&sk->sk_refcnt, 1);
2527         atomic_set(&sk->sk_drops, 0);
2528 }
2529 EXPORT_SYMBOL(sock_init_data);
2530
2531 void lock_sock_nested(struct sock *sk, int subclass)
2532 {
2533         might_sleep();
2534         spin_lock_bh(&sk->sk_lock.slock);
2535         if (sk->sk_lock.owned)
2536                 __lock_sock(sk);
2537         sk->sk_lock.owned = 1;
2538         spin_unlock(&sk->sk_lock.slock);
2539         /*
2540          * The sk_lock has mutex_lock() semantics here:
2541          */
2542         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2543         local_bh_enable();
2544 }
2545 EXPORT_SYMBOL(lock_sock_nested);
2546
2547 void release_sock(struct sock *sk)
2548 {
2549         spin_lock_bh(&sk->sk_lock.slock);
2550         if (sk->sk_backlog.tail)
2551                 __release_sock(sk);
2552
2553         /* Warning : release_cb() might need to release sk ownership,
2554          * ie call sock_release_ownership(sk) before us.
2555          */
2556         if (sk->sk_prot->release_cb)
2557                 sk->sk_prot->release_cb(sk);
2558
2559         sock_release_ownership(sk);
2560         if (waitqueue_active(&sk->sk_lock.wq))
2561                 wake_up(&sk->sk_lock.wq);
2562         spin_unlock_bh(&sk->sk_lock.slock);
2563 }
2564 EXPORT_SYMBOL(release_sock);
2565
2566 /**
2567  * lock_sock_fast - fast version of lock_sock
2568  * @sk: socket
2569  *
2570  * This version should be used for very small section, where process wont block
2571  * return false if fast path is taken
2572  *   sk_lock.slock locked, owned = 0, BH disabled
2573  * return true if slow path is taken
2574  *   sk_lock.slock unlocked, owned = 1, BH enabled
2575  */
2576 bool lock_sock_fast(struct sock *sk)
2577 {
2578         might_sleep();
2579         spin_lock_bh(&sk->sk_lock.slock);
2580
2581         if (!sk->sk_lock.owned)
2582                 /*
2583                  * Note : We must disable BH
2584                  */
2585                 return false;
2586
2587         __lock_sock(sk);
2588         sk->sk_lock.owned = 1;
2589         spin_unlock(&sk->sk_lock.slock);
2590         /*
2591          * The sk_lock has mutex_lock() semantics here:
2592          */
2593         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2594         local_bh_enable();
2595         return true;
2596 }
2597 EXPORT_SYMBOL(lock_sock_fast);
2598
2599 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2600 {
2601         struct timeval tv;
2602         if (!sock_flag(sk, SOCK_TIMESTAMP))
2603                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2604         tv = ktime_to_timeval(sk->sk_stamp);
2605         if (tv.tv_sec == -1)
2606                 return -ENOENT;
2607         if (tv.tv_sec == 0) {
2608                 sk->sk_stamp = ktime_get_real();
2609                 tv = ktime_to_timeval(sk->sk_stamp);
2610         }
2611         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2612 }
2613 EXPORT_SYMBOL(sock_get_timestamp);
2614
2615 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2616 {
2617         struct timespec ts;
2618         if (!sock_flag(sk, SOCK_TIMESTAMP))
2619                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2620         ts = ktime_to_timespec(sk->sk_stamp);
2621         if (ts.tv_sec == -1)
2622                 return -ENOENT;
2623         if (ts.tv_sec == 0) {
2624                 sk->sk_stamp = ktime_get_real();
2625                 ts = ktime_to_timespec(sk->sk_stamp);
2626         }
2627         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2628 }
2629 EXPORT_SYMBOL(sock_get_timestampns);
2630
2631 void sock_enable_timestamp(struct sock *sk, int flag)
2632 {
2633         if (!sock_flag(sk, flag)) {
2634                 unsigned long previous_flags = sk->sk_flags;
2635
2636                 sock_set_flag(sk, flag);
2637                 /*
2638                  * we just set one of the two flags which require net
2639                  * time stamping, but time stamping might have been on
2640                  * already because of the other one
2641                  */
2642                 if (sock_needs_netstamp(sk) &&
2643                     !(previous_flags & SK_FLAGS_TIMESTAMP))
2644                         net_enable_timestamp();
2645         }
2646 }
2647
2648 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2649                        int level, int type)
2650 {
2651         struct sock_exterr_skb *serr;
2652         struct sk_buff *skb;
2653         int copied, err;
2654
2655         err = -EAGAIN;
2656         skb = sock_dequeue_err_skb(sk);
2657         if (skb == NULL)
2658                 goto out;
2659
2660         copied = skb->len;
2661         if (copied > len) {
2662                 msg->msg_flags |= MSG_TRUNC;
2663                 copied = len;
2664         }
2665         err = skb_copy_datagram_msg(skb, 0, msg, copied);
2666         if (err)
2667                 goto out_free_skb;
2668
2669         sock_recv_timestamp(msg, sk, skb);
2670
2671         serr = SKB_EXT_ERR(skb);
2672         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2673
2674         msg->msg_flags |= MSG_ERRQUEUE;
2675         err = copied;
2676
2677 out_free_skb:
2678         kfree_skb(skb);
2679 out:
2680         return err;
2681 }
2682 EXPORT_SYMBOL(sock_recv_errqueue);
2683
2684 /*
2685  *      Get a socket option on an socket.
2686  *
2687  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2688  *      asynchronous errors should be reported by getsockopt. We assume
2689  *      this means if you specify SO_ERROR (otherwise whats the point of it).
2690  */
2691 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2692                            char __user *optval, int __user *optlen)
2693 {
2694         struct sock *sk = sock->sk;
2695
2696         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2697 }
2698 EXPORT_SYMBOL(sock_common_getsockopt);
2699
2700 #ifdef CONFIG_COMPAT
2701 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2702                                   char __user *optval, int __user *optlen)
2703 {
2704         struct sock *sk = sock->sk;
2705
2706         if (sk->sk_prot->compat_getsockopt != NULL)
2707                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2708                                                       optval, optlen);
2709         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2710 }
2711 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2712 #endif
2713
2714 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2715                         int flags)
2716 {
2717         struct sock *sk = sock->sk;
2718         int addr_len = 0;
2719         int err;
2720
2721         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2722                                    flags & ~MSG_DONTWAIT, &addr_len);
2723         if (err >= 0)
2724                 msg->msg_namelen = addr_len;
2725         return err;
2726 }
2727 EXPORT_SYMBOL(sock_common_recvmsg);
2728
2729 /*
2730  *      Set socket options on an inet socket.
2731  */
2732 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2733                            char __user *optval, unsigned int optlen)
2734 {
2735         struct sock *sk = sock->sk;
2736
2737         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2738 }
2739 EXPORT_SYMBOL(sock_common_setsockopt);
2740
2741 #ifdef CONFIG_COMPAT
2742 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2743                                   char __user *optval, unsigned int optlen)
2744 {
2745         struct sock *sk = sock->sk;
2746
2747         if (sk->sk_prot->compat_setsockopt != NULL)
2748                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2749                                                       optval, optlen);
2750         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2751 }
2752 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2753 #endif
2754
2755 void sk_common_release(struct sock *sk)
2756 {
2757         if (sk->sk_prot->destroy)
2758                 sk->sk_prot->destroy(sk);
2759
2760         /*
2761          * Observation: when sock_common_release is called, processes have
2762          * no access to socket. But net still has.
2763          * Step one, detach it from networking:
2764          *
2765          * A. Remove from hash tables.
2766          */
2767
2768         sk->sk_prot->unhash(sk);
2769
2770         /*
2771          * In this point socket cannot receive new packets, but it is possible
2772          * that some packets are in flight because some CPU runs receiver and
2773          * did hash table lookup before we unhashed socket. They will achieve
2774          * receive queue and will be purged by socket destructor.
2775          *
2776          * Also we still have packets pending on receive queue and probably,
2777          * our own packets waiting in device queues. sock_destroy will drain
2778          * receive queue, but transmitted packets will delay socket destruction
2779          * until the last reference will be released.
2780          */
2781
2782         sock_orphan(sk);
2783
2784         xfrm_sk_free_policy(sk);
2785
2786         sk_refcnt_debug_release(sk);
2787
2788         sock_put(sk);
2789 }
2790 EXPORT_SYMBOL(sk_common_release);
2791
2792 #ifdef CONFIG_PROC_FS
2793 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
2794 struct prot_inuse {
2795         int val[PROTO_INUSE_NR];
2796 };
2797
2798 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2799
2800 #ifdef CONFIG_NET_NS
2801 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2802 {
2803         __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2804 }
2805 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2806
2807 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2808 {
2809         int cpu, idx = prot->inuse_idx;
2810         int res = 0;
2811
2812         for_each_possible_cpu(cpu)
2813                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2814
2815         return res >= 0 ? res : 0;
2816 }
2817 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2818
2819 static int __net_init sock_inuse_init_net(struct net *net)
2820 {
2821         net->core.inuse = alloc_percpu(struct prot_inuse);
2822         return net->core.inuse ? 0 : -ENOMEM;
2823 }
2824
2825 static void __net_exit sock_inuse_exit_net(struct net *net)
2826 {
2827         free_percpu(net->core.inuse);
2828 }
2829
2830 static struct pernet_operations net_inuse_ops = {
2831         .init = sock_inuse_init_net,
2832         .exit = sock_inuse_exit_net,
2833 };
2834
2835 static __init int net_inuse_init(void)
2836 {
2837         if (register_pernet_subsys(&net_inuse_ops))
2838                 panic("Cannot initialize net inuse counters");
2839
2840         return 0;
2841 }
2842
2843 core_initcall(net_inuse_init);
2844 #else
2845 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2846
2847 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2848 {
2849         __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2850 }
2851 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2852
2853 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2854 {
2855         int cpu, idx = prot->inuse_idx;
2856         int res = 0;
2857
2858         for_each_possible_cpu(cpu)
2859                 res += per_cpu(prot_inuse, cpu).val[idx];
2860
2861         return res >= 0 ? res : 0;
2862 }
2863 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2864 #endif
2865
2866 static void assign_proto_idx(struct proto *prot)
2867 {
2868         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2869
2870         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2871                 pr_err("PROTO_INUSE_NR exhausted\n");
2872                 return;
2873         }
2874
2875         set_bit(prot->inuse_idx, proto_inuse_idx);
2876 }
2877
2878 static void release_proto_idx(struct proto *prot)
2879 {
2880         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2881                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2882 }
2883 #else
/* Without CONFIG_PROC_FS there are no inuse slots to manage. */
static inline void assign_proto_idx(struct proto *prot)
{
}

/* Nothing to release either. */
static inline void release_proto_idx(struct proto *prot)
{
}
2891 #endif
2892
2893 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2894 {
2895         if (!rsk_prot)
2896                 return;
2897         kfree(rsk_prot->slab_name);
2898         rsk_prot->slab_name = NULL;
2899         kmem_cache_destroy(rsk_prot->slab);
2900         rsk_prot->slab = NULL;
2901 }
2902
2903 static int req_prot_init(const struct proto *prot)
2904 {
2905         struct request_sock_ops *rsk_prot = prot->rsk_prot;
2906
2907         if (!rsk_prot)
2908                 return 0;
2909
2910         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2911                                         prot->name);
2912         if (!rsk_prot->slab_name)
2913                 return -ENOMEM;
2914
2915         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2916                                            rsk_prot->obj_size, 0,
2917                                            prot->slab_flags, NULL);
2918
2919         if (!rsk_prot->slab) {
2920                 pr_crit("%s: Can't create request sock SLAB cache!\n",
2921                         prot->name);
2922                 return -ENOMEM;
2923         }
2924         return 0;
2925 }
2926
2927 int proto_register(struct proto *prot, int alloc_slab)
2928 {
2929         if (alloc_slab) {
2930                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2931                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2932                                         NULL);
2933
2934                 if (prot->slab == NULL) {
2935                         pr_crit("%s: Can't create sock SLAB cache!\n",
2936                                 prot->name);
2937                         goto out;
2938                 }
2939
2940                 if (req_prot_init(prot))
2941                         goto out_free_request_sock_slab;
2942
2943                 if (prot->twsk_prot != NULL) {
2944                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2945
2946                         if (prot->twsk_prot->twsk_slab_name == NULL)
2947                                 goto out_free_request_sock_slab;
2948
2949                         prot->twsk_prot->twsk_slab =
2950                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2951                                                   prot->twsk_prot->twsk_obj_size,
2952                                                   0,
2953                                                   prot->slab_flags,
2954                                                   NULL);
2955                         if (prot->twsk_prot->twsk_slab == NULL)
2956                                 goto out_free_timewait_sock_slab_name;
2957                 }
2958         }
2959
2960         mutex_lock(&proto_list_mutex);
2961         list_add(&prot->node, &proto_list);
2962         assign_proto_idx(prot);
2963         mutex_unlock(&proto_list_mutex);
2964         return 0;
2965
2966 out_free_timewait_sock_slab_name:
2967         kfree(prot->twsk_prot->twsk_slab_name);
2968 out_free_request_sock_slab:
2969         req_prot_cleanup(prot->rsk_prot);
2970
2971         kmem_cache_destroy(prot->slab);
2972         prot->slab = NULL;
2973 out:
2974         return -ENOBUFS;
2975 }
2976 EXPORT_SYMBOL(proto_register);
2977
2978 void proto_unregister(struct proto *prot)
2979 {
2980         mutex_lock(&proto_list_mutex);
2981         release_proto_idx(prot);
2982         list_del(&prot->node);
2983         mutex_unlock(&proto_list_mutex);
2984
2985         kmem_cache_destroy(prot->slab);
2986         prot->slab = NULL;
2987
2988         req_prot_cleanup(prot->rsk_prot);
2989
2990         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2991                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2992                 kfree(prot->twsk_prot->twsk_slab_name);
2993                 prot->twsk_prot->twsk_slab = NULL;
2994         }
2995 }
2996 EXPORT_SYMBOL(proto_unregister);
2997
2998 #ifdef CONFIG_PROC_FS
2999 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3000         __acquires(proto_list_mutex)
3001 {
3002         mutex_lock(&proto_list_mutex);
3003         return seq_list_start_head(&proto_list, *pos);
3004 }
3005
3006 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3007 {
3008         return seq_list_next(v, &proto_list, pos);
3009 }
3010
3011 static void proto_seq_stop(struct seq_file *seq, void *v)
3012         __releases(proto_list_mutex)
3013 {
3014         mutex_unlock(&proto_list_mutex);
3015 }
3016
/* Map a method pointer to 'y' (set) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
	return method ? 'y' : 'n';
}
3021 static long sock_prot_memory_allocated(struct proto *proto)
3022 {
3023         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3024 }
3025
3026 static char *sock_prot_memory_pressure(struct proto *proto)
3027 {
3028         return proto->memory_pressure != NULL ?
3029         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3030 }
3031
3032 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3033 {
3034
3035         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3036                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3037                    proto->name,
3038                    proto->obj_size,
3039                    sock_prot_inuse_get(seq_file_net(seq), proto),
3040                    sock_prot_memory_allocated(proto),
3041                    sock_prot_memory_pressure(proto),
3042                    proto->max_header,
3043                    proto->slab == NULL ? "no" : "yes",
3044                    module_name(proto->owner),
3045                    proto_method_implemented(proto->close),
3046                    proto_method_implemented(proto->connect),
3047                    proto_method_implemented(proto->disconnect),
3048                    proto_method_implemented(proto->accept),
3049                    proto_method_implemented(proto->ioctl),
3050                    proto_method_implemented(proto->init),
3051                    proto_method_implemented(proto->destroy),
3052                    proto_method_implemented(proto->shutdown),
3053                    proto_method_implemented(proto->setsockopt),
3054                    proto_method_implemented(proto->getsockopt),
3055                    proto_method_implemented(proto->sendmsg),
3056                    proto_method_implemented(proto->recvmsg),
3057                    proto_method_implemented(proto->sendpage),
3058                    proto_method_implemented(proto->bind),
3059                    proto_method_implemented(proto->backlog_rcv),
3060                    proto_method_implemented(proto->hash),
3061                    proto_method_implemented(proto->unhash),
3062                    proto_method_implemented(proto->get_port),
3063                    proto_method_implemented(proto->enter_memory_pressure));
3064 }
3065
3066 static int proto_seq_show(struct seq_file *seq, void *v)
3067 {
3068         if (v == &proto_list)
3069                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3070                            "protocol",
3071                            "size",
3072                            "sockets",
3073                            "memory",
3074                            "press",
3075                            "maxhdr",
3076                            "slab",
3077                            "module",
3078                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3079         else
3080                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3081         return 0;
3082 }
3083
3084 static const struct seq_operations proto_seq_ops = {
3085         .start  = proto_seq_start,
3086         .next   = proto_seq_next,
3087         .stop   = proto_seq_stop,
3088         .show   = proto_seq_show,
3089 };
3090
3091 static int proto_seq_open(struct inode *inode, struct file *file)
3092 {
3093         return seq_open_net(inode, file, &proto_seq_ops,
3094                             sizeof(struct seq_net_private));
3095 }
3096
3097 static const struct file_operations proto_seq_fops = {
3098         .owner          = THIS_MODULE,
3099         .open           = proto_seq_open,
3100         .read           = seq_read,
3101         .llseek         = seq_lseek,
3102         .release        = seq_release_net,
3103 };
3104
3105 static __net_init int proto_init_net(struct net *net)
3106 {
3107         if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3108                 return -ENOMEM;
3109
3110         return 0;
3111 }
3112
3113 static __net_exit void proto_exit_net(struct net *net)
3114 {
3115         remove_proc_entry("protocols", net->proc_net);
3116 }
3117
3118
3119 static __net_initdata struct pernet_operations proto_net_ops = {
3120         .init = proto_init_net,
3121         .exit = proto_exit_net,
3122 };
3123
3124 static int __init proto_init(void)
3125 {
3126         return register_pernet_subsys(&proto_net_ops);
3127 }
3128
3129 subsys_initcall(proto_init);
3130
3131 #endif /* PROC_FS */