net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <linux/capability.h>
  95 #include <linux/errno.h>
  96 #include <linux/types.h>
  97 #include <linux/socket.h>
  98 #include <linux/in.h>
  99 #include <linux/kernel.h>
 100 #include <linux/module.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/sched.h>
 104 #include <linux/timer.h>
 105 #include <linux/string.h>
 106 #include <linux/sockios.h>
 107 #include <linux/net.h>
 108 #include <linux/mm.h>
 109 #include <linux/slab.h>
 110 #include <linux/interrupt.h>
 111 #include <linux/poll.h>
 112 #include <linux/tcp.h>
 113 #include <linux/init.h>
 114 #include <linux/highmem.h>
 115 #include <linux/user_namespace.h>
 116 #include <linux/static_key.h>
 117 #include <linux/memcontrol.h>
 118 #include <linux/prefetch.h>
 119
 120 #include <asm/uaccess.h>
 121
 122 #include <linux/netdevice.h>
 123 #include <net/protocol.h>
 124 #include <linux/skbuff.h>
 125 #include <net/net_namespace.h>
 126 #include <net/request_sock.h>
 127 #include <net/sock.h>
 128 #include <linux/net_tstamp.h>
 129 #include <net/xfrm.h>
 130 #include <linux/ipsec.h>
 131 #include <net/cls_cgroup.h>
 132 #include <net/netprio_cgroup.h>
 133
 134 #include <linux/filter.h>
 135
 136 #include <trace/events/sock.h>
 137
 138 #ifdef CONFIG_INET
 139 #include <net/tcp.h>
 140 #endif
 141
 142 static DEFINE_MUTEX(proto_list_mutex);
 143 static LIST_HEAD(proto_list);
 144
 145 #ifdef CONFIG_MEMCG_KMEM
 146 int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 147 {
 148         struct proto *proto;
 149         int ret = 0;
 150
 151         mutex_lock(&proto_list_mutex);
 152         list_for_each_entry(proto, &proto_list, node) {
 153                 if (proto->init_cgroup) {
 154                         ret = proto->init_cgroup(memcg, ss);
 155                         if (ret)
 156                                 goto out;
 157                 }
 158         }
 159
 160         mutex_unlock(&proto_list_mutex);
 161         return ret;
 162 out:
 163         list_for_each_entry_continue_reverse(proto, &proto_list, node)
 164                 if (proto->destroy_cgroup)
 165                         proto->destroy_cgroup(memcg);
 166         mutex_unlock(&proto_list_mutex);
 167         return ret;
 168 }
 169
 170 void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
 171 {
 172         struct proto *proto;
 173
 174         mutex_lock(&proto_list_mutex);
 175         list_for_each_entry_reverse(proto, &proto_list, node)
 176                 if (proto->destroy_cgroup)
 177                         proto->destroy_cgroup(memcg);
 178         mutex_unlock(&proto_list_mutex);
 179 }
 180 #endif
 181
/*
 * Each address family might have different locking rules, so the lock
 * class keys are kept per address family.  Two arrays of AF_MAX keys are
 * declared here: af_family_keys and af_family_slock_keys.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
 188
/*
 * Static key exported to modules.
 * NOTE(review): presumably switched on while a memcg socket memory limit
 * is in effect; the code that toggles it is not in this part of the file.
 */
struct static_key memcg_socket_limit_enabled;
EXPORT_SYMBOL(memcg_socket_limit_enabled);
 191
 192 /*
 193  * Make lock validator output more readable. (we pre-construct these
 194  * strings build-time, so that runtime initialization of socket
 195  * locks is fast):
 196  */
 197 static const char *const af_family_key_strings[AF_MAX+1] = {
 198   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 199   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 200   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 201   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 202   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 203   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 204   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 205   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 206   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 207   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 208   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 209   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 210   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 211   "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
 212 };
 213 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 214   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 215   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 216   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 217   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 218   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 219   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 220   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 221   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 222   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 223   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 224   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 225   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 226   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
 227   "slock-AF_NFC"   , "slock-AF_MAX"
 228 };
 229 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 230   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 231   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 232   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 233   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 234   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 235   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 236   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 237   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 238   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 239   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 240   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 241   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 242   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 243   "clock-AF_NFC"   , "clock-AF_MAX"
 244 };
 245
/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key
 * (one lock_class_key for each of the AF_MAX families):
 */
static struct lock_class_key af_callback_keys[AF_MAX];
 251
/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 *
 * Both limits are computed as _SK_MEM_PACKETS (256) buffers, each costing
 * SKB_TRUESIZE(256), so the send and receive maxima are identical.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 261
/*
 * Run time adjustable parameters.
 *
 * The *_max values are exported and are used by sock_setsockopt() to clamp
 * SO_SNDBUF / SO_RCVBUF requests.  The *_default values start out equal to
 * the corresponding maximum and are not exported here.
 */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 269
/*
 * Maximal space eaten by iovec or ancillary data plus some space.
 * Initial value: room for 2 * UIO_MAXIOV + 512 unsigned longs.
 */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);
 273
/*
 * Static key, initially false.  sk_set_memalloc() increments it and
 * sk_clear_memalloc() decrements it, so it tracks how many times
 * SOCK_MEMALLOC has been set without a matching clear.
 */
struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(memalloc_socks);
 276
/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements.
 *
 * Besides setting the flag, this ORs %__GFP_MEMALLOC into
 * @sk->sk_allocation and increments the memalloc_socks static key.
 * Calls are not checked for balance here; pair each one with
 * sk_clear_memalloc().
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_key_slow_inc(&memalloc_socks);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);
 292
/**
 * sk_clear_memalloc - clears %SOCK_MEMALLOC
 * @sk: socket to clear it on
 *
 * Undoes sk_set_memalloc(): resets the flag, removes %__GFP_MEMALLOC from
 * @sk->sk_allocation and decrements the memalloc_socks static key.
 * Warns and reclaims socket memory if @sk->sk_forward_alloc is non-zero.
 */
void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_key_slow_dec(&memalloc_socks);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
	 * it has rmem allocations there is a risk that the user of the
	 * socket cannot make forward progress due to exceeding the rmem
	 * limits. By rights, sk_clear_memalloc() should only be called
	 * on sockets being torn down but warn and reset the accounting if
	 * that assumption breaks.
	 */
	if (WARN_ON(sk->sk_forward_alloc))
		sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 312
/*
 * Run the socket's sk_backlog_rcv handler on @skb with PF_MEMALLOC set in
 * the current task's flags.
 *
 * The task flags are sampled first so that only the PF_MEMALLOC bit is put
 * back to its previous state afterwards.  The socket must have
 * SOCK_MEMALLOC set; anything else is treated as a bug.
 *
 * Returns whatever sk->sk_backlog_rcv() returned.
 */
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned long pflags = current->flags;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	current->flags |= PF_MEMALLOC;
	ret = sk->sk_backlog_rcv(sk, skb);
	/* restore only the PF_MEMALLOC bit from the saved flags */
	tsk_restore_flags(current, pflags, PF_MEMALLOC);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);
 328
/*
 * With cgroups enabled, provide the net_cls / net_prio subsystem ids here
 * whenever the matching CONFIG_NET_CLS_CGROUP / CONFIG_NETPRIO_CGROUP
 * symbol is not defined.  Both start out as -1.
 * NOTE(review): presumably a modular build of the controller fills in the
 * real id later; confirm against the cgroup code.
 */
#if defined(CONFIG_CGROUPS)
#if !defined(CONFIG_NET_CLS_CGROUP)
int net_cls_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_cls_subsys_id);
#endif
#if !defined(CONFIG_NETPRIO_CGROUP)
int net_prio_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_prio_subsys_id);
#endif
#endif
 339
 340 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 341 {
 342         struct timeval tv;
 343
 344         if (optlen < sizeof(tv))
 345                 return -EINVAL;
 346         if (copy_from_user(&tv, optval, sizeof(tv)))
 347                 return -EFAULT;
 348         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 349                 return -EDOM;
 350
 351         if (tv.tv_sec < 0) {
 352                 static int warned __read_mostly;
 353
 354                 *timeo_p = 0;
 355                 if (warned < 10 && net_ratelimit()) {
 356                         warned++;
 357                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 358                                 __func__, current->comm, task_pid_nr(current));
 359                 }
 360                 return 0;
 361         }
 362         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 363         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 364                 return 0;
 365         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 366                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 367         return 0;
 368 }
 369
 370 static void sock_warn_obsolete_bsdism(const char *name)
 371 {
 372         static int warned;
 373         static char warncomm[TASK_COMM_LEN];
 374         if (strcmp(warncomm, current->comm) && warned < 5) {
 375                 strcpy(warncomm,  current->comm);
 376                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 377                         warncomm, name);
 378                 warned++;
 379         }
 380 }
 381
 382 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
 383
 384 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 385 {
 386         if (sk->sk_flags & flags) {
 387                 sk->sk_flags &= ~flags;
 388                 if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 389                         net_disable_timestamp();
 390         }
 391 }
 392
 393
 394 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 395 {
 396         int err;
 397         int skb_len;
 398         unsigned long flags;
 399         struct sk_buff_head *list = &sk->sk_receive_queue;
 400
 401         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 402                 atomic_inc(&sk->sk_drops);
 403                 trace_sock_rcvqueue_full(sk, skb);
 404                 return -ENOMEM;
 405         }
 406
 407         err = sk_filter(sk, skb);
 408         if (err)
 409                 return err;
 410
 411         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 412                 atomic_inc(&sk->sk_drops);
 413                 return -ENOBUFS;
 414         }
 415
 416         skb->dev = NULL;
 417         skb_set_owner_r(skb, sk);
 418
 419         /* Cache the SKB length before we tack it onto the receive
 420          * queue.  Once it is added it no longer belongs to us and
 421          * may be freed by other threads of control pulling packets
 422          * from the queue.
 423          */
 424         skb_len = skb->len;
 425
 426         /* we escape from rcu protected region, make sure we dont leak
 427          * a norefcounted dst
 428          */
 429         skb_dst_force(skb);
 430
 431         spin_lock_irqsave(&list->lock, flags);
 432         skb->dropcount = atomic_read(&sk->sk_drops);
 433         __skb_queue_tail(list, skb);
 434         spin_unlock_irqrestore(&list->lock, flags);
 435
 436         if (!sock_flag(sk, SOCK_DEAD))
 437                 sk->sk_data_ready(sk, skb_len);
 438         return 0;
 439 }
 440 EXPORT_SYMBOL(sock_queue_rcv_skb);
 441
 442 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 443 {
 444         int rc = NET_RX_SUCCESS;
 445
 446         if (sk_filter(sk, skb))
 447                 goto discard_and_relse;
 448
 449         skb->dev = NULL;
 450
 451         if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
 452                 atomic_inc(&sk->sk_drops);
 453                 goto discard_and_relse;
 454         }
 455         if (nested)
 456                 bh_lock_sock_nested(sk);
 457         else
 458                 bh_lock_sock(sk);
 459         if (!sock_owned_by_user(sk)) {
 460                 /*
 461                  * trylock + unlock semantics:
 462                  */
 463                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 464
 465                 rc = sk_backlog_rcv(sk, skb);
 466
 467                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 468         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 469                 bh_unlock_sock(sk);
 470                 atomic_inc(&sk->sk_drops);
 471                 goto discard_and_relse;
 472         }
 473
 474         bh_unlock_sock(sk);
 475 out:
 476         sock_put(sk);
 477         return rc;
 478 discard_and_relse:
 479         kfree_skb(skb);
 480         goto out;
 481 }
 482 EXPORT_SYMBOL(sk_receive_skb);
 483
/*
 * Exported wrapper around sk_tx_queue_clear() for @sk.
 */
void sk_reset_txq(struct sock *sk)
{
	sk_tx_queue_clear(sk);
}
EXPORT_SYMBOL(sk_reset_txq);
 489
 490 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 491 {
 492         struct dst_entry *dst = __sk_dst_get(sk);
 493
 494         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 495                 sk_tx_queue_clear(sk);
 496                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 497                 dst_release(dst);
 498                 return NULL;
 499         }
 500
 501         return dst;
 502 }
 503 EXPORT_SYMBOL(__sk_dst_check);
 504
 505 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 506 {
 507         struct dst_entry *dst = sk_dst_get(sk);
 508
 509         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 510                 sk_dst_reset(sk);
 511                 dst_release(dst);
 512                 return NULL;
 513         }
 514
 515         return dst;
 516 }
 517 EXPORT_SYMBOL(sk_dst_check);
 518
 519 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
 520 {
 521         int ret = -ENOPROTOOPT;
 522 #ifdef CONFIG_NETDEVICES
 523         struct net *net = sock_net(sk);
 524         char devname[IFNAMSIZ];
 525         int index;
 526
 527         /* Sorry... */
 528         ret = -EPERM;
 529         if (!capable(CAP_NET_RAW))
 530                 goto out;
 531
 532         ret = -EINVAL;
 533         if (optlen < 0)
 534                 goto out;
 535
 536         /* Bind this socket to a particular device like "eth0",
 537          * as specified in the passed interface name. If the
 538          * name is "" or the option length is zero the socket
 539          * is not bound.
 540          */
 541         if (optlen > IFNAMSIZ - 1)
 542                 optlen = IFNAMSIZ - 1;
 543         memset(devname, 0, sizeof(devname));
 544
 545         ret = -EFAULT;
 546         if (copy_from_user(devname, optval, optlen))
 547                 goto out;
 548
 549         index = 0;
 550         if (devname[0] != '\0') {
 551                 struct net_device *dev;
 552
 553                 rcu_read_lock();
 554                 dev = dev_get_by_name_rcu(net, devname);
 555                 if (dev)
 556                         index = dev->ifindex;
 557                 rcu_read_unlock();
 558                 ret = -ENODEV;
 559                 if (!dev)
 560                         goto out;
 561         }
 562
 563         lock_sock(sk);
 564         sk->sk_bound_dev_if = index;
 565         sk_dst_reset(sk);
 566         release_sock(sk);
 567
 568         ret = 0;
 569
 570 out:
 571 #endif
 572
 573         return ret;
 574 }
 575
 576 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 577 {
 578         if (valbool)
 579                 sock_set_flag(sk, bit);
 580         else
 581                 sock_reset_flag(sk, bit);
 582 }
 583
 584 /*
 585  *      This is meant for all protocols to use and covers goings on
 586  *      at the socket level. Everything here is generic.
 587  */
 588
 589 int sock_setsockopt(struct socket *sock, int level, int optname,
 590                     char __user *optval, unsigned int optlen)
 591 {
 592         struct sock *sk = sock->sk;
 593         int val;
 594         int valbool;
 595         struct linger ling;
 596         int ret = 0;
 597
 598         /*
 599          *      Options without arguments
 600          */
 601
 602         if (optname == SO_BINDTODEVICE)
 603                 return sock_bindtodevice(sk, optval, optlen);
 604
 605         if (optlen < sizeof(int))
 606                 return -EINVAL;
 607
 608         if (get_user(val, (int __user *)optval))
 609                 return -EFAULT;
 610
 611         valbool = val ? 1 : 0;
 612
 613         lock_sock(sk);
 614
 615         switch (optname) {
 616         case SO_DEBUG:
 617                 if (val && !capable(CAP_NET_ADMIN))
 618                         ret = -EACCES;
 619                 else
 620                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 621                 break;
 622         case SO_REUSEADDR:
 623                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 624                 break;
 625         case SO_TYPE:
 626         case SO_PROTOCOL:
 627         case SO_DOMAIN:
 628         case SO_ERROR:
 629                 ret = -ENOPROTOOPT;
 630                 break;
 631         case SO_DONTROUTE:
 632                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 633                 break;
 634         case SO_BROADCAST:
 635                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 636                 break;
 637         case SO_SNDBUF:
 638                 /* Don't error on this BSD doesn't and if you think
 639                  * about it this is right. Otherwise apps have to
 640                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 641                  * are treated in BSD as hints
 642                  */
 643                 val = min_t(u32, val, sysctl_wmem_max);
 644 set_sndbuf:
 645                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 646                 sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
 647                 /* Wake up sending tasks if we upped the value. */
 648                 sk->sk_write_space(sk);
 649                 break;
 650
 651         case SO_SNDBUFFORCE:
 652                 if (!capable(CAP_NET_ADMIN)) {
 653                         ret = -EPERM;
 654                         break;
 655                 }
 656                 goto set_sndbuf;
 657
 658         case SO_RCVBUF:
 659                 /* Don't error on this BSD doesn't and if you think
 660                  * about it this is right. Otherwise apps have to
 661                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 662                  * are treated in BSD as hints
 663                  */
 664                 val = min_t(u32, val, sysctl_rmem_max);
 665 set_rcvbuf:
 666                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 667                 /*
 668                  * We double it on the way in to account for
 669                  * "struct sk_buff" etc. overhead.   Applications
 670                  * assume that the SO_RCVBUF setting they make will
 671                  * allow that much actual data to be received on that
 672                  * socket.
 673                  *
 674                  * Applications are unaware that "struct sk_buff" and
 675                  * other overheads allocate from the receive buffer
 676                  * during socket buffer allocation.
 677                  *
 678                  * And after considering the possible alternatives,
 679                  * returning the value we actually used in getsockopt
 680                  * is the most desirable behavior.
 681                  */
 682                 sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
 683                 break;
 684
 685         case SO_RCVBUFFORCE:
 686                 if (!capable(CAP_NET_ADMIN)) {
 687                         ret = -EPERM;
 688                         break;
 689                 }
 690                 goto set_rcvbuf;
 691
 692         case SO_KEEPALIVE:
 693 #ifdef CONFIG_INET
 694                 if (sk->sk_protocol == IPPROTO_TCP &&
 695                     sk->sk_type == SOCK_STREAM)
 696                         tcp_set_keepalive(sk, valbool);
 697 #endif
 698                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 699                 break;
 700
 701         case SO_OOBINLINE:
 702                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 703                 break;
 704
 705         case SO_NO_CHECK:
 706                 sk->sk_no_check = valbool;
 707                 break;
 708
 709         case SO_PRIORITY:
 710                 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
 711                         sk->sk_priority = val;
 712                 else
 713                         ret = -EPERM;
 714                 break;
 715
 716         case SO_LINGER:
 717                 if (optlen < sizeof(ling)) {
 718                         ret = -EINVAL;  /* 1003.1g */
 719                         break;
 720                 }
 721                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 722                         ret = -EFAULT;
 723                         break;
 724                 }
 725                 if (!ling.l_onoff)
 726                         sock_reset_flag(sk, SOCK_LINGER);
 727                 else {
 728 #if (BITS_PER_LONG == 32)
 729                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 730                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 731                         else
 732 #endif
 733                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 734                         sock_set_flag(sk, SOCK_LINGER);
 735                 }
 736                 break;
 737
 738         case SO_BSDCOMPAT:
 739                 sock_warn_obsolete_bsdism("setsockopt");
 740                 break;
 741
 742         case SO_PASSCRED:
 743                 if (valbool)
 744                         set_bit(SOCK_PASSCRED, &sock->flags);
 745                 else
 746                         clear_bit(SOCK_PASSCRED, &sock->flags);
 747                 break;
 748
 749         case SO_TIMESTAMP:
 750         case SO_TIMESTAMPNS:
 751                 if (valbool)  {
 752                         if (optname == SO_TIMESTAMP)
 753                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 754                         else
 755                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 756                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 757                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 758                 } else {
 759                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 760                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 761                 }
 762                 break;
 763
 764         case SO_TIMESTAMPING:
 765                 if (val & ~SOF_TIMESTAMPING_MASK) {
 766                         ret = -EINVAL;
 767                         break;
 768                 }
 769                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
 770                                   val & SOF_TIMESTAMPING_TX_HARDWARE);
 771                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
 772                                   val & SOF_TIMESTAMPING_TX_SOFTWARE);
 773                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
 774                                   val & SOF_TIMESTAMPING_RX_HARDWARE);
 775                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 776                         sock_enable_timestamp(sk,
 777                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 778                 else
 779                         sock_disable_timestamp(sk,
 780                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 781                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
 782                                   val & SOF_TIMESTAMPING_SOFTWARE);
 783                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
 784                                   val & SOF_TIMESTAMPING_SYS_HARDWARE);
 785                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
 786                                   val & SOF_TIMESTAMPING_RAW_HARDWARE);
 787                 break;
 788
 789         case SO_RCVLOWAT:
 790                 if (val < 0)
 791                         val = INT_MAX;
 792                 sk->sk_rcvlowat = val ? : 1;
 793                 break;
 794
 795         case SO_RCVTIMEO:
 796                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 797                 break;
 798
 799         case SO_SNDTIMEO:
 800                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 801                 break;
 802
 803         case SO_ATTACH_FILTER:
 804                 ret = -EINVAL;
 805                 if (optlen == sizeof(struct sock_fprog)) {
 806                         struct sock_fprog fprog;
 807
 808                         ret = -EFAULT;
 809                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 810                                 break;
 811
 812                         ret = sk_attach_filter(&fprog, sk);
 813                 }
 814                 break;
 815
 816         case SO_DETACH_FILTER:
 817                 ret = sk_detach_filter(sk);
 818                 break;
 819
 820         case SO_PASSSEC:
 821                 if (valbool)
 822                         set_bit(SOCK_PASSSEC, &sock->flags);
 823                 else
 824                         clear_bit(SOCK_PASSSEC, &sock->flags);
 825                 break;
 826         case SO_MARK:
 827                 if (!capable(CAP_NET_ADMIN))
 828                         ret = -EPERM;
 829                 else
 830                         sk->sk_mark = val;
 831                 break;
 832
 833                 /* We implement the SO_SNDLOWAT etc to
 834                    not be settable (1003.1g 5.3) */
 835         case SO_RXQ_OVFL:
 836                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 837                 break;
 838
 839         case SO_WIFI_STATUS:
 840                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 841                 break;
 842
 843         case SO_PEEK_OFF:
 844                 if (sock->ops->set_peek_off)
 845                         sock->ops->set_peek_off(sk, val);
 846                 else
 847                         ret = -EOPNOTSUPP;
 848                 break;
 849
 850         case SO_NOFCS:
 851                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 852                 break;
 853
 854         default:
 855                 ret = -ENOPROTOOPT;
 856                 break;
 857         }
 858         release_sock(sk);
 859         return ret;
 860 }
 861 EXPORT_SYMBOL(sock_setsockopt);
 862
 863
 864 void cred_to_ucred(struct pid *pid, const struct cred *cred,
 865                    struct ucred *ucred)
 866 {
 867         ucred->pid = pid_vnr(pid);
 868         ucred->uid = ucred->gid = -1;
 869         if (cred) {
 870                 struct user_namespace *current_ns = current_user_ns();
 871
 872                 ucred->uid = from_kuid(current_ns, cred->euid);
 873                 ucred->gid = from_kgid(current_ns, cred->egid);
 874         }
 875 }
 876 EXPORT_SYMBOL_GPL(cred_to_ucred);
 877
 878 int sock_getsockopt(struct socket *sock, int level, int optname,
 879                     char __user *optval, int __user *optlen)
 880 {
 881         struct sock *sk = sock->sk;
 882
 883         union {
 884                 int val;
 885                 struct linger ling;
 886                 struct timeval tm;
 887         } v;
 888
 889         int lv = sizeof(int);
 890         int len;
 891
 892         if (get_user(len, optlen))
 893                 return -EFAULT;
 894         if (len < 0)
 895                 return -EINVAL;
 896
 897         memset(&v, 0, sizeof(v));
 898
 899         switch (optname) {
 900         case SO_DEBUG:
 901                 v.val = sock_flag(sk, SOCK_DBG);
 902                 break;
 903
 904         case SO_DONTROUTE:
 905                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
 906                 break;
 907
 908         case SO_BROADCAST:
 909                 v.val = sock_flag(sk, SOCK_BROADCAST);
 910                 break;
 911
 912         case SO_SNDBUF:
 913                 v.val = sk->sk_sndbuf;
 914                 break;
 915
 916         case SO_RCVBUF:
 917                 v.val = sk->sk_rcvbuf;
 918                 break;
 919
 920         case SO_REUSEADDR:
 921                 v.val = sk->sk_reuse;
 922                 break;
 923
 924         case SO_KEEPALIVE:
 925                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
 926                 break;
 927
 928         case SO_TYPE:
 929                 v.val = sk->sk_type;
 930                 break;
 931
 932         case SO_PROTOCOL:
 933                 v.val = sk->sk_protocol;
 934                 break;
 935
 936         case SO_DOMAIN:
 937                 v.val = sk->sk_family;
 938                 break;
 939
 940         case SO_ERROR:
 941                 v.val = -sock_error(sk);
 942                 if (v.val == 0)
 943                         v.val = xchg(&sk->sk_err_soft, 0);
 944                 break;
 945
 946         case SO_OOBINLINE:
 947                 v.val = sock_flag(sk, SOCK_URGINLINE);
 948                 break;
 949
 950         case SO_NO_CHECK:
 951                 v.val = sk->sk_no_check;
 952                 break;
 953
 954         case SO_PRIORITY:
 955                 v.val = sk->sk_priority;
 956                 break;
 957
 958         case SO_LINGER:
 959                 lv              = sizeof(v.ling);
 960                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
 961                 v.ling.l_linger = sk->sk_lingertime / HZ;
 962                 break;
 963
 964         case SO_BSDCOMPAT:
 965                 sock_warn_obsolete_bsdism("getsockopt");
 966                 break;
 967
 968         case SO_TIMESTAMP:
 969                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
 970                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
 971                 break;
 972
 973         case SO_TIMESTAMPNS:
 974                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
 975                 break;
 976
 977         case SO_TIMESTAMPING:
 978                 v.val = 0;
 979                 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
 980                         v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
 981                 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
 982                         v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
 983                 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
 984                         v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
 985                 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
 986                         v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
 987                 if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
 988                         v.val |= SOF_TIMESTAMPING_SOFTWARE;
 989                 if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
 990                         v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
 991                 if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
 992                         v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
 993                 break;
 994
 995         case SO_RCVTIMEO:
 996                 lv = sizeof(struct timeval);
 997                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
 998                         v.tm.tv_sec = 0;
 999                         v.tm.tv_usec = 0;
1000                 } else {
1001                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1002                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1003                 }
1004                 break;
1005
1006         case SO_SNDTIMEO:
1007                 lv = sizeof(struct timeval);
1008                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1009                         v.tm.tv_sec = 0;
1010                         v.tm.tv_usec = 0;
1011                 } else {
1012                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1013                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1014                 }
1015                 break;
1016
1017         case SO_RCVLOWAT:
1018                 v.val = sk->sk_rcvlowat;
1019                 break;
1020
1021         case SO_SNDLOWAT:
1022                 v.val = 1;
1023                 break;
1024
1025         case SO_PASSCRED:
1026                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1027                 break;
1028
1029         case SO_PEERCRED:
1030         {
1031                 struct ucred peercred;
1032                 if (len > sizeof(peercred))
1033                         len = sizeof(peercred);
1034                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1035                 if (copy_to_user(optval, &peercred, len))
1036                         return -EFAULT;
1037                 goto lenout;
1038         }
1039
1040         case SO_PEERNAME:
1041         {
1042                 char address[128];
1043
1044                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1045                         return -ENOTCONN;
1046                 if (lv < len)
1047                         return -EINVAL;
1048                 if (copy_to_user(optval, address, len))
1049                         return -EFAULT;
1050                 goto lenout;
1051         }
1052
1053         /* Dubious BSD thing... Probably nobody even uses it, but
1054          * the UNIX standard wants it for whatever reason... -DaveM
1055          */
1056         case SO_ACCEPTCONN:
1057                 v.val = sk->sk_state == TCP_LISTEN;
1058                 break;
1059
1060         case SO_PASSSEC:
1061                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1062                 break;
1063
1064         case SO_PEERSEC:
1065                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1066
1067         case SO_MARK:
1068                 v.val = sk->sk_mark;
1069                 break;
1070
1071         case SO_RXQ_OVFL:
1072                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1073                 break;
1074
1075         case SO_WIFI_STATUS:
1076                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1077                 break;
1078
1079         case SO_PEEK_OFF:
1080                 if (!sock->ops->set_peek_off)
1081                         return -EOPNOTSUPP;
1082
1083                 v.val = sk->sk_peek_off;
1084                 break;
1085         case SO_NOFCS:
1086                 v.val = sock_flag(sk, SOCK_NOFCS);
1087                 break;
1088         default:
1089                 return -ENOPROTOOPT;
1090         }
1091
1092         if (len > lv)
1093                 len = lv;
1094         if (copy_to_user(optval, &v, len))
1095                 return -EFAULT;
1096 lenout:
1097         if (put_user(len, optlen))
1098                 return -EFAULT;
1099         return 0;
1100 }
1101
1102 /*
1103  * Initialize an sk_lock.
1104  *
1105  * (We also register the sk_lock with the lock validator.)
1106  */
1107 static inline void sock_lock_init(struct sock *sk)
1108 {
1109         sock_lock_init_class_and_name(sk,
1110                         af_family_slock_key_strings[sk->sk_family],
1111                         af_family_slock_keys + sk->sk_family,
1112                         af_family_key_strings[sk->sk_family],
1113                         af_family_keys + sk->sk_family);
1114 }
1115
1116 /*
1117  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1118  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1119  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1120  */
1121 static void sock_copy(struct sock *nsk, const struct sock *osk)
1122 {
1123 #ifdef CONFIG_SECURITY_NETWORK
1124         void *sptr = nsk->sk_security;
1125 #endif
1126         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1127
1128         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1129                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1130
1131 #ifdef CONFIG_SECURITY_NETWORK
1132         nsk->sk_security = sptr;
1133         security_sk_clone(osk, nsk);
1134 #endif
1135 }
1136
1137 /*
1138  * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes
1139  * un-modified. Special care is taken when initializing object to zero.
1140  */
1141 static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1142 {
1143         if (offsetof(struct sock, sk_node.next) != 0)
1144                 memset(sk, 0, offsetof(struct sock, sk_node.next));
1145         memset(&sk->sk_node.pprev, 0,
1146                size - offsetof(struct sock, sk_node.pprev));
1147 }
1148
1149 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1150 {
1151         unsigned long nulls1, nulls2;
1152
1153         nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1154         nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1155         if (nulls1 > nulls2)
1156                 swap(nulls1, nulls2);
1157
1158         if (nulls1 != 0)
1159                 memset((char *)sk, 0, nulls1);
1160         memset((char *)sk + nulls1 + sizeof(void *), 0,
1161                nulls2 - nulls1 - sizeof(void *));
1162         memset((char *)sk + nulls2 + sizeof(void *), 0,
1163                size - nulls2 - sizeof(void *));
1164 }
1165 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1166
1167 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1168                 int family)
1169 {
1170         struct sock *sk;
1171         struct kmem_cache *slab;
1172
1173         slab = prot->slab;
1174         if (slab != NULL) {
1175                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1176                 if (!sk)
1177                         return sk;
1178                 if (priority & __GFP_ZERO) {
1179                         if (prot->clear_sk)
1180                                 prot->clear_sk(sk, prot->obj_size);
1181                         else
1182                                 sk_prot_clear_nulls(sk, prot->obj_size);
1183                 }
1184         } else
1185                 sk = kmalloc(prot->obj_size, priority);
1186
1187         if (sk != NULL) {
1188                 kmemcheck_annotate_bitfield(sk, flags);
1189
1190                 if (security_sk_alloc(sk, family, priority))
1191                         goto out_free;
1192
1193                 if (!try_module_get(prot->owner))
1194                         goto out_free_sec;
1195                 sk_tx_queue_clear(sk);
1196         }
1197
1198         return sk;
1199
1200 out_free_sec:
1201         security_sk_free(sk);
1202 out_free:
1203         if (slab != NULL)
1204                 kmem_cache_free(slab, sk);
1205         else
1206                 kfree(sk);
1207         return NULL;
1208 }
1209
1210 static void sk_prot_free(struct proto *prot, struct sock *sk)
1211 {
1212         struct kmem_cache *slab;
1213         struct module *owner;
1214
1215         owner = prot->owner;
1216         slab = prot->slab;
1217
1218         security_sk_free(sk);
1219         if (slab != NULL)
1220                 kmem_cache_free(slab, sk);
1221         else
1222                 kfree(sk);
1223         module_put(owner);
1224 }
1225
1226 #ifdef CONFIG_CGROUPS
1227 void sock_update_classid(struct sock *sk)
1228 {
1229         u32 classid;
1230
1231         rcu_read_lock();  /* doing current task, which cannot vanish. */
1232         classid = task_cls_classid(current);
1233         rcu_read_unlock();
1234         if (classid && classid != sk->sk_classid)
1235                 sk->sk_classid = classid;
1236 }
1237 EXPORT_SYMBOL(sock_update_classid);
1238
1239 void sock_update_netprioidx(struct sock *sk, struct task_struct *task)
1240 {
1241         if (in_interrupt())
1242                 return;
1243
1244         sk->sk_cgrp_prioidx = task_netprioidx(task);
1245 }
1246 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1247 #endif
1248
1249 /**
1250  *      sk_alloc - All socket objects are allocated here
1251  *      @net: the applicable net namespace
1252  *      @family: protocol family
1253  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1254  *      @prot: struct proto associated with this new sock instance
1255  */
1256 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1257                       struct proto *prot)
1258 {
1259         struct sock *sk;
1260
1261         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1262         if (sk) {
1263                 sk->sk_family = family;
1264                 /*
1265                  * See comment in struct sock definition to understand
1266                  * why we need sk_prot_creator -acme
1267                  */
1268                 sk->sk_prot = sk->sk_prot_creator = prot;
1269                 sock_lock_init(sk);
1270                 sock_net_set(sk, get_net(net));
1271                 atomic_set(&sk->sk_wmem_alloc, 1);
1272
1273                 sock_update_classid(sk);
1274                 sock_update_netprioidx(sk, current);
1275         }
1276
1277         return sk;
1278 }
1279 EXPORT_SYMBOL(sk_alloc);
1280
1281 static void __sk_free(struct sock *sk)
1282 {
1283         struct sk_filter *filter;
1284
1285         if (sk->sk_destruct)
1286                 sk->sk_destruct(sk);
1287
1288         filter = rcu_dereference_check(sk->sk_filter,
1289                                        atomic_read(&sk->sk_wmem_alloc) == 0);
1290         if (filter) {
1291                 sk_filter_uncharge(sk, filter);
1292                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1293         }
1294
1295         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1296
1297         if (atomic_read(&sk->sk_omem_alloc))
1298                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1299                          __func__, atomic_read(&sk->sk_omem_alloc));
1300
1301         if (sk->sk_peer_cred)
1302                 put_cred(sk->sk_peer_cred);
1303         put_pid(sk->sk_peer_pid);
1304         put_net(sock_net(sk));
1305         sk_prot_free(sk->sk_prot_creator, sk);
1306 }
1307
1308 void sk_free(struct sock *sk)
1309 {
1310         /*
1311          * We subtract one from sk_wmem_alloc and can know if
1312          * some packets are still in some tx queue.
1313          * If not null, sock_wfree() will call __sk_free(sk) later
1314          */
1315         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1316                 __sk_free(sk);
1317 }
1318 EXPORT_SYMBOL(sk_free);
1319
1320 /*
1321  * Last sock_put should drop reference to sk->sk_net. It has already
1322  * been dropped in sk_change_net. Taking reference to stopping namespace
1323  * is not an option.
1324  * Take reference to a socket to remove it from hash _alive_ and after that
1325  * destroy it in the context of init_net.
1326  */
1327 void sk_release_kernel(struct sock *sk)
1328 {
1329         if (sk == NULL || sk->sk_socket == NULL)
1330                 return;
1331
1332         sock_hold(sk);
1333         sock_release(sk->sk_socket);
1334         release_net(sock_net(sk));
1335         sock_net_set(sk, get_net(&init_net));
1336         sock_put(sk);
1337 }
1338 EXPORT_SYMBOL(sk_release_kernel);
1339
1340 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1341 {
1342         if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1343                 sock_update_memcg(newsk);
1344 }
1345
1346 /**
1347  *      sk_clone_lock - clone a socket, and lock its clone
1348  *      @sk: the socket to clone
1349  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1350  *
1351  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1352  */
1353 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1354 {
1355         struct sock *newsk;
1356
1357         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1358         if (newsk != NULL) {
1359                 struct sk_filter *filter;
1360
1361                 sock_copy(newsk, sk);
1362
1363                 /* SANITY */
1364                 get_net(sock_net(newsk));
1365                 sk_node_init(&newsk->sk_node);
1366                 sock_lock_init(newsk);
1367                 bh_lock_sock(newsk);
1368                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1369                 newsk->sk_backlog.len = 0;
1370
1371                 atomic_set(&newsk->sk_rmem_alloc, 0);
1372                 /*
1373                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1374                  */
1375                 atomic_set(&newsk->sk_wmem_alloc, 1);
1376                 atomic_set(&newsk->sk_omem_alloc, 0);
1377                 skb_queue_head_init(&newsk->sk_receive_queue);
1378                 skb_queue_head_init(&newsk->sk_write_queue);
1379 #ifdef CONFIG_NET_DMA
1380                 skb_queue_head_init(&newsk->sk_async_wait_queue);
1381 #endif
1382
1383                 spin_lock_init(&newsk->sk_dst_lock);
1384                 rwlock_init(&newsk->sk_callback_lock);
1385                 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1386                                 af_callback_keys + newsk->sk_family,
1387                                 af_family_clock_key_strings[newsk->sk_family]);
1388
1389                 newsk->sk_dst_cache     = NULL;
1390                 newsk->sk_wmem_queued   = 0;
1391                 newsk->sk_forward_alloc = 0;
1392                 newsk->sk_send_head     = NULL;
1393                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1394
1395                 sock_reset_flag(newsk, SOCK_DONE);
1396                 skb_queue_head_init(&newsk->sk_error_queue);
1397
1398                 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1399                 if (filter != NULL)
1400                         sk_filter_charge(newsk, filter);
1401
1402                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1403                         /* It is still raw copy of parent, so invalidate
1404                          * destructor and make plain sk_free() */
1405                         newsk->sk_destruct = NULL;
1406                         bh_unlock_sock(newsk);
1407                         sk_free(newsk);
1408                         newsk = NULL;
1409                         goto out;
1410                 }
1411
1412                 newsk->sk_err      = 0;
1413                 newsk->sk_priority = 0;
1414                 /*
1415                  * Before updating sk_refcnt, we must commit prior changes to memory
1416                  * (Documentation/RCU/rculist_nulls.txt for details)
1417                  */
1418                 smp_wmb();
1419                 atomic_set(&newsk->sk_refcnt, 2);
1420
1421                 /*
1422                  * Increment the counter in the same struct proto as the master
1423                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1424                  * is the same as sk->sk_prot->socks, as this field was copied
1425                  * with memcpy).
1426                  *
1427                  * This _changes_ the previous behaviour, where
1428                  * tcp_create_openreq_child always was incrementing the
1429                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1430                  * to be taken into account in all callers. -acme
1431                  */
1432                 sk_refcnt_debug_inc(newsk);
1433                 sk_set_socket(newsk, NULL);
1434                 newsk->sk_wq = NULL;
1435
1436                 sk_update_clone(sk, newsk);
1437
1438                 if (newsk->sk_prot->sockets_allocated)
1439                         sk_sockets_allocated_inc(newsk);
1440
1441                 if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1442                         net_enable_timestamp();
1443         }
1444 out:
1445         return newsk;
1446 }
1447 EXPORT_SYMBOL_GPL(sk_clone_lock);
1448
1449 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1450 {
1451         __sk_dst_set(sk, dst);
1452         sk->sk_route_caps = dst->dev->features;
1453         if (sk->sk_route_caps & NETIF_F_GSO)
1454                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1455         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1456         if (sk_can_gso(sk)) {
1457                 if (dst->header_len) {
1458                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1459                 } else {
1460                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1461                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1462                         sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1463                 }
1464         }
1465 }
1466 EXPORT_SYMBOL_GPL(sk_setup_caps);
1467
1468 void __init sk_init(void)
1469 {
1470         if (totalram_pages <= 4096) {
1471                 sysctl_wmem_max = 32767;
1472                 sysctl_rmem_max = 32767;
1473                 sysctl_wmem_default = 32767;
1474                 sysctl_rmem_default = 32767;
1475         } else if (totalram_pages >= 131072) {
1476                 sysctl_wmem_max = 131071;
1477                 sysctl_rmem_max = 131071;
1478         }
1479 }
1480
1481 /*
1482  *      Simple resource managers for sockets.
1483  */
1484
1485
1486 /*
1487  * Write buffer destructor automatically called from kfree_skb.
1488  */
1489 void sock_wfree(struct sk_buff *skb)
1490 {
1491         struct sock *sk = skb->sk;
1492         unsigned int len = skb->truesize;
1493
1494         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1495                 /*
1496                  * Keep a reference on sk_wmem_alloc, this will be released
1497                  * after sk_write_space() call
1498                  */
1499                 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1500                 sk->sk_write_space(sk);
1501                 len = 1;
1502         }
1503         /*
1504          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1505          * could not do because of in-flight packets
1506          */
1507         if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1508                 __sk_free(sk);
1509 }
1510 EXPORT_SYMBOL(sock_wfree);
1511
1512 /*
1513  * Read buffer destructor automatically called from kfree_skb.
1514  */
1515 void sock_rfree(struct sk_buff *skb)
1516 {
1517         struct sock *sk = skb->sk;
1518         unsigned int len = skb->truesize;
1519
1520         atomic_sub(len, &sk->sk_rmem_alloc);
1521         sk_mem_uncharge(sk, len);
1522 }
1523 EXPORT_SYMBOL(sock_rfree);
1524
1525 void sock_edemux(struct sk_buff *skb)
1526 {
1527         struct sock *sk = skb->sk;
1528
1529 #ifdef CONFIG_INET
1530         if (sk->sk_state == TCP_TIME_WAIT)
1531                 inet_twsk_put(inet_twsk(sk));
1532         else
1533 #endif
1534                 sock_put(sk);
1535 }
1536 EXPORT_SYMBOL(sock_edemux);
1537
1538 int sock_i_uid(struct sock *sk)
1539 {
1540         int uid;
1541
1542         read_lock_bh(&sk->sk_callback_lock);
1543         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1544         read_unlock_bh(&sk->sk_callback_lock);
1545         return uid;
1546 }
1547 EXPORT_SYMBOL(sock_i_uid);
1548
1549 unsigned long sock_i_ino(struct sock *sk)
1550 {
1551         unsigned long ino;
1552
1553         read_lock_bh(&sk->sk_callback_lock);
1554         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1555         read_unlock_bh(&sk->sk_callback_lock);
1556         return ino;
1557 }
1558 EXPORT_SYMBOL(sock_i_ino);
1559
1560 /*
1561  * Allocate a skb from the socket's send buffer.
1562  */
1563 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1564                              gfp_t priority)
1565 {
1566         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1567                 struct sk_buff *skb = alloc_skb(size, priority);
1568                 if (skb) {
1569                         skb_set_owner_w(skb, sk);
1570                         return skb;
1571                 }
1572         }
1573         return NULL;
1574 }
1575 EXPORT_SYMBOL(sock_wmalloc);
1576
1577 /*
1578  * Allocate a skb from the socket's receive buffer.
1579  */
1580 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1581                              gfp_t priority)
1582 {
1583         if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1584                 struct sk_buff *skb = alloc_skb(size, priority);
1585                 if (skb) {
1586                         skb_set_owner_r(skb, sk);
1587                         return skb;
1588                 }
1589         }
1590         return NULL;
1591 }
1592
1593 /*
1594  * Allocate a memory block from the socket's option memory buffer.
1595  */
1596 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1597 {
1598         if ((unsigned int)size <= sysctl_optmem_max &&
1599             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1600                 void *mem;
1601                 /* First do the add, to avoid the race if kmalloc
1602                  * might sleep.
1603                  */
1604                 atomic_add(size, &sk->sk_omem_alloc);
1605                 mem = kmalloc(size, priority);
1606                 if (mem)
1607                         return mem;
1608                 atomic_sub(size, &sk->sk_omem_alloc);
1609         }
1610         return NULL;
1611 }
1612 EXPORT_SYMBOL(sock_kmalloc);
1613
1614 /*
1615  * Free an option memory block.
1616  */
1617 void sock_kfree_s(struct sock *sk, void *mem, int size)
1618 {
1619         kfree(mem);
1620         atomic_sub(size, &sk->sk_omem_alloc);
1621 }
1622 EXPORT_SYMBOL(sock_kfree_s);
1623
1624 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1625    I think, these locks should be removed for datagram sockets.
1626  */
1627 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1628 {
1629         DEFINE_WAIT(wait);
1630
1631         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1632         for (;;) {
1633                 if (!timeo)
1634                         break;
1635                 if (signal_pending(current))
1636                         break;
1637                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1638                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1639                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1640                         break;
1641                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1642                         break;
1643                 if (sk->sk_err)
1644                         break;
1645                 timeo = schedule_timeout(timeo);
1646         }
1647         finish_wait(sk_sleep(sk), &wait);
1648         return timeo;
1649 }
1650
1651
1652 /*
1653  *      Generic send/receive buffer handlers
1654  */
1655
1656 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1657                                      unsigned long data_len, int noblock,
1658                                      int *errcode)
1659 {
1660         struct sk_buff *skb;
1661         gfp_t gfp_mask;
1662         long timeo;
1663         int err;
1664         int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1665
1666         err = -EMSGSIZE;
1667         if (npages > MAX_SKB_FRAGS)
1668                 goto failure;
1669
1670         gfp_mask = sk->sk_allocation;
1671         if (gfp_mask & __GFP_WAIT)
1672                 gfp_mask |= __GFP_REPEAT;
1673
1674         timeo = sock_sndtimeo(sk, noblock);
1675         while (1) {
1676                 err = sock_error(sk);
1677                 if (err != 0)
1678                         goto failure;
1679
1680                 err = -EPIPE;
1681                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1682                         goto failure;
1683
1684                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1685                         skb = alloc_skb(header_len, gfp_mask);
1686                         if (skb) {
1687                                 int i;
1688
1689                                 /* No pages, we're done... */
1690                                 if (!data_len)
1691                                         break;
1692
1693                                 skb->truesize += data_len;
1694                                 skb_shinfo(skb)->nr_frags = npages;
1695                                 for (i = 0; i < npages; i++) {
1696                                         struct page *page;
1697
1698                                         page = alloc_pages(sk->sk_allocation, 0);
1699                                         if (!page) {
1700                                                 err = -ENOBUFS;
1701                                                 skb_shinfo(skb)->nr_frags = i;
1702                                                 kfree_skb(skb);
1703                                                 goto failure;
1704                                         }
1705
1706                                         __skb_fill_page_desc(skb, i,
1707                                                         page, 0,
1708                                                         (data_len >= PAGE_SIZE ?
1709                                                          PAGE_SIZE :
1710                                                          data_len));
1711                                         data_len -= PAGE_SIZE;
1712                                 }
1713
1714                                 /* Full success... */
1715                                 break;
1716                         }
1717                         err = -ENOBUFS;
1718                         goto failure;
1719                 }
1720                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1721                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1722                 err = -EAGAIN;
1723                 if (!timeo)
1724                         goto failure;
1725                 if (signal_pending(current))
1726                         goto interrupted;
1727                 timeo = sock_wait_for_wmem(sk, timeo);
1728         }
1729
1730         skb_set_owner_w(skb, sk);
1731         return skb;
1732
1733 interrupted:
1734         err = sock_intr_errno(timeo);
1735 failure:
1736         *errcode = err;
1737         return NULL;
1738 }
1739 EXPORT_SYMBOL(sock_alloc_send_pskb);
1740
/*
 * sock_alloc_send_skb - sock_alloc_send_pskb() with no page fragments
 *
 * Forwards to sock_alloc_send_pskb() with @size as the header length and
 * a data length of zero.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	struct sk_buff *skb;

	skb = sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
	return skb;
}
1746 EXPORT_SYMBOL(sock_alloc_send_skb);
1747
1748 static void __lock_sock(struct sock *sk)
1749         __releases(&sk->sk_lock.slock)
1750         __acquires(&sk->sk_lock.slock)
1751 {
1752         DEFINE_WAIT(wait);
1753
1754         for (;;) {
1755                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1756                                         TASK_UNINTERRUPTIBLE);
1757                 spin_unlock_bh(&sk->sk_lock.slock);
1758                 schedule();
1759                 spin_lock_bh(&sk->sk_lock.slock);
1760                 if (!sock_owned_by_user(sk))
1761                         break;
1762         }
1763         finish_wait(&sk->sk_lock.wq, &wait);
1764 }
1765
1766 static void __release_sock(struct sock *sk)
1767         __releases(&sk->sk_lock.slock)
1768         __acquires(&sk->sk_lock.slock)
1769 {
1770         struct sk_buff *skb = sk->sk_backlog.head;
1771
1772         do {
1773                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1774                 bh_unlock_sock(sk);
1775
1776                 do {
1777                         struct sk_buff *next = skb->next;
1778
1779                         prefetch(next);
1780                         WARN_ON_ONCE(skb_dst_is_noref(skb));
1781                         skb->next = NULL;
1782                         sk_backlog_rcv(sk, skb);
1783
1784                         /*
1785                          * We are in process context here with softirqs
1786                          * disabled, use cond_resched_softirq() to preempt.
1787                          * This is safe to do because we've taken the backlog
1788                          * queue private:
1789                          */
1790                         cond_resched_softirq();
1791
1792                         skb = next;
1793                 } while (skb != NULL);
1794
1795                 bh_lock_sock(sk);
1796         } while ((skb = sk->sk_backlog.head) != NULL);
1797
1798         /*
1799          * Doing the zeroing here guarantee we can not loop forever
1800          * while a wild producer attempts to flood us.
1801          */
1802         sk->sk_backlog.len = 0;
1803 }
1804
1805 /**
1806  * sk_wait_data - wait for data to arrive at sk_receive_queue
1807  * @sk:    sock to wait on
1808  * @timeo: for how long
1809  *
1810  * Now socket state including sk->sk_err is changed only under lock,
1811  * hence we may omit checks after joining wait queue.
1812  * We check receive queue before schedule() only as optimization;
1813  * it is very likely that release_sock() added new data.
1814  */
1815 int sk_wait_data(struct sock *sk, long *timeo)
1816 {
1817         int rc;
1818         DEFINE_WAIT(wait);
1819
1820         prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1821         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1822         rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1823         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1824         finish_wait(sk_sleep(sk), &wait);
1825         return rc;
1826 }
1827 EXPORT_SYMBOL(sk_wait_data);
1828
1829 /**
1830  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1831  *      @sk: socket
1832  *      @size: memory size to allocate
1833  *      @kind: allocation type
1834  *
1835  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1836  *      rmem allocation. This function assumes that protocols which have
1837  *      memory_pressure use sk_wmem_queued as write buffer accounting.
1838  */
1839 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1840 {
1841         struct proto *prot = sk->sk_prot;
1842         int amt = sk_mem_pages(size);
1843         long allocated;
1844         int parent_status = UNDER_LIMIT;
1845
1846         sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1847
1848         allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1849
1850         /* Under limit. */
1851         if (parent_status == UNDER_LIMIT &&
1852                         allocated <= sk_prot_mem_limits(sk, 0)) {
1853                 sk_leave_memory_pressure(sk);
1854                 return 1;
1855         }
1856
1857         /* Under pressure. (we or our parents) */
1858         if ((parent_status > SOFT_LIMIT) ||
1859                         allocated > sk_prot_mem_limits(sk, 1))
1860                 sk_enter_memory_pressure(sk);
1861
1862         /* Over hard limit (we or our parents) */
1863         if ((parent_status == OVER_LIMIT) ||
1864                         (allocated > sk_prot_mem_limits(sk, 2)))
1865                 goto suppress_allocation;
1866
1867         /* guarantee minimum buffer size under pressure */
1868         if (kind == SK_MEM_RECV) {
1869                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1870                         return 1;
1871
1872         } else { /* SK_MEM_SEND */
1873                 if (sk->sk_type == SOCK_STREAM) {
1874                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1875                                 return 1;
1876                 } else if (atomic_read(&sk->sk_wmem_alloc) <
1877                            prot->sysctl_wmem[0])
1878                                 return 1;
1879         }
1880
1881         if (sk_has_memory_pressure(sk)) {
1882                 int alloc;
1883
1884                 if (!sk_under_memory_pressure(sk))
1885                         return 1;
1886                 alloc = sk_sockets_allocated_read_positive(sk);
1887                 if (sk_prot_mem_limits(sk, 2) > alloc *
1888                     sk_mem_pages(sk->sk_wmem_queued +
1889                                  atomic_read(&sk->sk_rmem_alloc) +
1890                                  sk->sk_forward_alloc))
1891                         return 1;
1892         }
1893
1894 suppress_allocation:
1895
1896         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1897                 sk_stream_moderate_sndbuf(sk);
1898
1899                 /* Fail only if socket is _under_ its sndbuf.
1900                  * In this case we cannot block, so that we have to fail.
1901                  */
1902                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1903                         return 1;
1904         }
1905
1906         trace_sock_exceed_buf_limit(sk, prot, allocated);
1907
1908         /* Alas. Undo changes. */
1909         sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1910
1911         sk_memory_allocated_sub(sk, amt);
1912
1913         return 0;
1914 }
1915 EXPORT_SYMBOL(__sk_mem_schedule);
1916
1917 /**
1918  *      __sk_reclaim - reclaim memory_allocated
1919  *      @sk: socket
1920  */
1921 void __sk_mem_reclaim(struct sock *sk)
1922 {
1923         sk_memory_allocated_sub(sk,
1924                                 sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
1925         sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1926
1927         if (sk_under_memory_pressure(sk) &&
1928             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
1929                 sk_leave_memory_pressure(sk);
1930 }
1931 EXPORT_SYMBOL(__sk_mem_reclaim);
1932
1933
1934 /*
1935  * Set of default routines for initialising struct proto_ops when
1936  * the protocol does not support a particular function. In certain
1937  * cases where it makes no sense for a protocol to have a "do nothing"
1938  * function, some default processing is provided.
1939  */
1940
/* Default bind handler: always fails with -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
1945 EXPORT_SYMBOL(sock_no_bind);
1946
/* Default connect handler: always fails with -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
1952 EXPORT_SYMBOL(sock_no_connect);
1953
/* Default socketpair handler: always fails with -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
1958 EXPORT_SYMBOL(sock_no_socketpair);
1959
/* Default accept handler: always fails with -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
1964 EXPORT_SYMBOL(sock_no_accept);
1965
/* Default getname handler: always fails with -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
1971 EXPORT_SYMBOL(sock_no_getname);
1972
1973 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1974 {
1975         return 0;
1976 }
1977 EXPORT_SYMBOL(sock_no_poll);
1978
/* Default ioctl handler: always fails with -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
1983 EXPORT_SYMBOL(sock_no_ioctl);
1984
/* Default listen handler: always fails with -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
1989 EXPORT_SYMBOL(sock_no_listen);
1990
/* Default shutdown handler: always fails with -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
1995 EXPORT_SYMBOL(sock_no_shutdown);
1996
1997 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1998                     char __user *optval, unsigned int optlen)
1999 {
2000         return -EOPNOTSUPP;
2001 }
2002 EXPORT_SYMBOL(sock_no_setsockopt);
2003
2004 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2005                     char __user *optval, int __user *optlen)
2006 {
2007         return -EOPNOTSUPP;
2008 }
2009 EXPORT_SYMBOL(sock_no_getsockopt);
2010
/* Default sendmsg handler: always fails with -EOPNOTSUPP. */
int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}
2016 EXPORT_SYMBOL(sock_no_sendmsg);
2017
/* Default recvmsg handler: always fails with -EOPNOTSUPP. */
int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
2023 EXPORT_SYMBOL(sock_no_recvmsg);
2024
/*
 * Default mmap handler: always fails with -ENODEV, mirroring the error
 * code used for a missing mmap method.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	return -ENODEV;
}
2030 EXPORT_SYMBOL(sock_no_mmap);
2031
2032 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2033 {
2034         ssize_t res;
2035         struct msghdr msg = {.msg_flags = flags};
2036         struct kvec iov;
2037         char *kaddr = kmap(page);
2038         iov.iov_base = kaddr + offset;
2039         iov.iov_len = size;
2040         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2041         kunmap(page);
2042         return res;
2043 }
2044 EXPORT_SYMBOL(sock_no_sendpage);
2045
2046 /*
2047  *      Default Socket Callbacks
2048  */
2049
2050 static void sock_def_wakeup(struct sock *sk)
2051 {
2052         struct socket_wq *wq;
2053
2054         rcu_read_lock();
2055         wq = rcu_dereference(sk->sk_wq);
2056         if (wq_has_sleeper(wq))
2057                 wake_up_interruptible_all(&wq->wait);
2058         rcu_read_unlock();
2059 }
2060
2061 static void sock_def_error_report(struct sock *sk)
2062 {
2063         struct socket_wq *wq;
2064
2065         rcu_read_lock();
2066         wq = rcu_dereference(sk->sk_wq);
2067         if (wq_has_sleeper(wq))
2068                 wake_up_interruptible_poll(&wq->wait, POLLERR);
2069         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2070         rcu_read_unlock();
2071 }
2072
2073 static void sock_def_readable(struct sock *sk, int len)
2074 {
2075         struct socket_wq *wq;
2076
2077         rcu_read_lock();
2078         wq = rcu_dereference(sk->sk_wq);
2079         if (wq_has_sleeper(wq))
2080                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2081                                                 POLLRDNORM | POLLRDBAND);
2082         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2083         rcu_read_unlock();
2084 }
2085
2086 static void sock_def_write_space(struct sock *sk)
2087 {
2088         struct socket_wq *wq;
2089
2090         rcu_read_lock();
2091
2092         /* Do not wake up a writer until he can make "significant"
2093          * progress.  --DaveM
2094          */
2095         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2096                 wq = rcu_dereference(sk->sk_wq);
2097                 if (wq_has_sleeper(wq))
2098                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2099                                                 POLLWRNORM | POLLWRBAND);
2100
2101                 /* Should agree with poll, otherwise some programs break */
2102                 if (sock_writeable(sk))
2103                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2104         }
2105
2106         rcu_read_unlock();
2107 }
2108
2109 static void sock_def_destruct(struct sock *sk)
2110 {
2111         kfree(sk->sk_protinfo);
2112 }
2113
2114 void sk_send_sigurg(struct sock *sk)
2115 {
2116         if (sk->sk_socket && sk->sk_socket->file)
2117                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2118                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2119 }
2120 EXPORT_SYMBOL(sk_send_sigurg);
2121
/*
 * Re-arm @timer to fire at @expires.  A reference on @sk is taken when
 * mod_timer() returns zero (presumably: the timer was not already
 * pending, so no reference was held for it -- confirm against mod_timer()).
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	int ret = mod_timer(timer, expires);

	if (!ret)
		sock_hold(sk);
}
2128 EXPORT_SYMBOL(sk_reset_timer);
2129
/*
 * Cancel @timer if it is pending, dropping a reference on @sk with
 * __sock_put() when del_timer() returns non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!timer_pending(timer))
		return;

	if (del_timer(timer))
		__sock_put(sk);
}
2135 EXPORT_SYMBOL(sk_stop_timer);
2136
2137 void sock_init_data(struct socket *sock, struct sock *sk)
2138 {
2139         skb_queue_head_init(&sk->sk_receive_queue);
2140         skb_queue_head_init(&sk->sk_write_queue);
2141         skb_queue_head_init(&sk->sk_error_queue);
2142 #ifdef CONFIG_NET_DMA
2143         skb_queue_head_init(&sk->sk_async_wait_queue);
2144 #endif
2145
2146         sk->sk_send_head        =       NULL;
2147
2148         init_timer(&sk->sk_timer);
2149
2150         sk->sk_allocation       =       GFP_KERNEL;
2151         sk->sk_rcvbuf           =       sysctl_rmem_default;
2152         sk->sk_sndbuf           =       sysctl_wmem_default;
2153         sk->sk_state            =       TCP_CLOSE;
2154         sk_set_socket(sk, sock);
2155
2156         sock_set_flag(sk, SOCK_ZAPPED);
2157
2158         if (sock) {
2159                 sk->sk_type     =       sock->type;
2160                 sk->sk_wq       =       sock->wq;
2161                 sock->sk        =       sk;
2162         } else
2163                 sk->sk_wq       =       NULL;
2164
2165         spin_lock_init(&sk->sk_dst_lock);
2166         rwlock_init(&sk->sk_callback_lock);
2167         lockdep_set_class_and_name(&sk->sk_callback_lock,
2168                         af_callback_keys + sk->sk_family,
2169                         af_family_clock_key_strings[sk->sk_family]);
2170
2171         sk->sk_state_change     =       sock_def_wakeup;
2172         sk->sk_data_ready       =       sock_def_readable;
2173         sk->sk_write_space      =       sock_def_write_space;
2174         sk->sk_error_report     =       sock_def_error_report;
2175         sk->sk_destruct         =       sock_def_destruct;
2176
2177         sk->sk_sndmsg_page      =       NULL;
2178         sk->sk_sndmsg_off       =       0;
2179         sk->sk_peek_off         =       -1;
2180
2181         sk->sk_peer_pid         =       NULL;
2182         sk->sk_peer_cred        =       NULL;
2183         sk->sk_write_pending    =       0;
2184         sk->sk_rcvlowat         =       1;
2185         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2186         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2187
2188         sk->sk_stamp = ktime_set(-1L, 0);
2189
2190         /*
2191          * Before updating sk_refcnt, we must commit prior changes to memory
2192          * (Documentation/RCU/rculist_nulls.txt for details)
2193          */
2194         smp_wmb();
2195         atomic_set(&sk->sk_refcnt, 1);
2196         atomic_set(&sk->sk_drops, 0);
2197 }
2198 EXPORT_SYMBOL(sock_init_data);
2199
2200 void lock_sock_nested(struct sock *sk, int subclass)
2201 {
2202         might_sleep();
2203         spin_lock_bh(&sk->sk_lock.slock);
2204         if (sk->sk_lock.owned)
2205                 __lock_sock(sk);
2206         sk->sk_lock.owned = 1;
2207         spin_unlock(&sk->sk_lock.slock);
2208         /*
2209          * The sk_lock has mutex_lock() semantics here:
2210          */
2211         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2212         local_bh_enable();
2213 }
2214 EXPORT_SYMBOL(lock_sock_nested);
2215
2216 void release_sock(struct sock *sk)
2217 {
2218         /*
2219          * The sk_lock has mutex_unlock() semantics:
2220          */
2221         mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2222
2223         spin_lock_bh(&sk->sk_lock.slock);
2224         if (sk->sk_backlog.tail)
2225                 __release_sock(sk);
2226
2227         if (sk->sk_prot->release_cb)
2228                 sk->sk_prot->release_cb(sk);
2229
2230         sk->sk_lock.owned = 0;
2231         if (waitqueue_active(&sk->sk_lock.wq))
2232                 wake_up(&sk->sk_lock.wq);
2233         spin_unlock_bh(&sk->sk_lock.slock);
2234 }
2235 EXPORT_SYMBOL(release_sock);
2236
2237 /**
2238  * lock_sock_fast - fast version of lock_sock
2239  * @sk: socket
2240  *
2241  * This version should be used for very small section, where process wont block
2242  * return false if fast path is taken
2243  *   sk_lock.slock locked, owned = 0, BH disabled
2244  * return true if slow path is taken
2245  *   sk_lock.slock unlocked, owned = 1, BH enabled
2246  */
2247 bool lock_sock_fast(struct sock *sk)
2248 {
2249         might_sleep();
2250         spin_lock_bh(&sk->sk_lock.slock);
2251
2252         if (!sk->sk_lock.owned)
2253                 /*
2254                  * Note : We must disable BH
2255                  */
2256                 return false;
2257
2258         __lock_sock(sk);
2259         sk->sk_lock.owned = 1;
2260         spin_unlock(&sk->sk_lock.slock);
2261         /*
2262          * The sk_lock has mutex_lock() semantics here:
2263          */
2264         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2265         local_bh_enable();
2266         return true;
2267 }
2268 EXPORT_SYMBOL(lock_sock_fast);
2269
2270 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2271 {
2272         struct timeval tv;
2273         if (!sock_flag(sk, SOCK_TIMESTAMP))
2274                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2275         tv = ktime_to_timeval(sk->sk_stamp);
2276         if (tv.tv_sec == -1)
2277                 return -ENOENT;
2278         if (tv.tv_sec == 0) {
2279                 sk->sk_stamp = ktime_get_real();
2280                 tv = ktime_to_timeval(sk->sk_stamp);
2281         }
2282         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2283 }
2284 EXPORT_SYMBOL(sock_get_timestamp);
2285
2286 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2287 {
2288         struct timespec ts;
2289         if (!sock_flag(sk, SOCK_TIMESTAMP))
2290                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2291         ts = ktime_to_timespec(sk->sk_stamp);
2292         if (ts.tv_sec == -1)
2293                 return -ENOENT;
2294         if (ts.tv_sec == 0) {
2295                 sk->sk_stamp = ktime_get_real();
2296                 ts = ktime_to_timespec(sk->sk_stamp);
2297         }
2298         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2299 }
2300 EXPORT_SYMBOL(sock_get_timestampns);
2301
2302 void sock_enable_timestamp(struct sock *sk, int flag)
2303 {
2304         if (!sock_flag(sk, flag)) {
2305                 unsigned long previous_flags = sk->sk_flags;
2306
2307                 sock_set_flag(sk, flag);
2308                 /*
2309                  * we just set one of the two flags which require net
2310                  * time stamping, but time stamping might have been on
2311                  * already because of the other one
2312                  */
2313                 if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2314                         net_enable_timestamp();
2315         }
2316 }
2317
2318 /*
2319  *      Get a socket option on an socket.
2320  *
2321  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2322  *      asynchronous errors should be reported by getsockopt. We assume
2323  *      this means if you specify SO_ERROR (otherwise whats the point of it).
2324  */
2325 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2326                            char __user *optval, int __user *optlen)
2327 {
2328         struct sock *sk = sock->sk;
2329
2330         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2331 }
2332 EXPORT_SYMBOL(sock_common_getsockopt);
2333
2334 #ifdef CONFIG_COMPAT
2335 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2336                                   char __user *optval, int __user *optlen)
2337 {
2338         struct sock *sk = sock->sk;
2339
2340         if (sk->sk_prot->compat_getsockopt != NULL)
2341                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2342                                                       optval, optlen);
2343         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2344 }
2345 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2346 #endif
2347
2348 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2349                         struct msghdr *msg, size_t size, int flags)
2350 {
2351         struct sock *sk = sock->sk;
2352         int addr_len = 0;
2353         int err;
2354
2355         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2356                                    flags & ~MSG_DONTWAIT, &addr_len);
2357         if (err >= 0)
2358                 msg->msg_namelen = addr_len;
2359         return err;
2360 }
2361 EXPORT_SYMBOL(sock_common_recvmsg);
2362
2363 /*
2364  *      Set socket options on an inet socket.
2365  */
2366 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2367                            char __user *optval, unsigned int optlen)
2368 {
2369         struct sock *sk = sock->sk;
2370
2371         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2372 }
2373 EXPORT_SYMBOL(sock_common_setsockopt);
2374
2375 #ifdef CONFIG_COMPAT
2376 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2377                                   char __user *optval, unsigned int optlen)
2378 {
2379         struct sock *sk = sock->sk;
2380
2381         if (sk->sk_prot->compat_setsockopt != NULL)
2382                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2383                                                       optval, optlen);
2384         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2385 }
2386 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2387 #endif
2388
2389 void sk_common_release(struct sock *sk)
2390 {
2391         if (sk->sk_prot->destroy)
2392                 sk->sk_prot->destroy(sk);
2393
2394         /*
2395          * Observation: when sock_common_release is called, processes have
2396          * no access to socket. But net still has.
2397          * Step one, detach it from networking:
2398          *
2399          * A. Remove from hash tables.
2400          */
2401
2402         sk->sk_prot->unhash(sk);
2403
2404         /*
2405          * In this point socket cannot receive new packets, but it is possible
2406          * that some packets are in flight because some CPU runs receiver and
2407          * did hash table lookup before we unhashed socket. They will achieve
2408          * receive queue and will be purged by socket destructor.
2409          *
2410          * Also we still have packets pending on receive queue and probably,
2411          * our own packets waiting in device queues. sock_destroy will drain
2412          * receive queue, but transmitted packets will delay socket destruction
2413          * until the last reference will be released.
2414          */
2415
2416         sock_orphan(sk);
2417
2418         xfrm_sk_free_policy(sk);
2419
2420         sk_refcnt_debug_release(sk);
2421         sock_put(sk);
2422 }
2423 EXPORT_SYMBOL(sk_common_release);
2424
2425 #ifdef CONFIG_PROC_FS
2426 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
2427 struct prot_inuse {
2428         int val[PROTO_INUSE_NR];
2429 };
2430
2431 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2432
2433 #ifdef CONFIG_NET_NS
2434 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2435 {
2436         __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2437 }
2438 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2439
2440 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2441 {
2442         int cpu, idx = prot->inuse_idx;
2443         int res = 0;
2444
2445         for_each_possible_cpu(cpu)
2446                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2447
2448         return res >= 0 ? res : 0;
2449 }
2450 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2451
2452 static int __net_init sock_inuse_init_net(struct net *net)
2453 {
2454         net->core.inuse = alloc_percpu(struct prot_inuse);
2455         return net->core.inuse ? 0 : -ENOMEM;
2456 }
2457
2458 static void __net_exit sock_inuse_exit_net(struct net *net)
2459 {
2460         free_percpu(net->core.inuse);
2461 }
2462
2463 static struct pernet_operations net_inuse_ops = {
2464         .init = sock_inuse_init_net,
2465         .exit = sock_inuse_exit_net,
2466 };
2467
2468 static __init int net_inuse_init(void)
2469 {
2470         if (register_pernet_subsys(&net_inuse_ops))
2471                 panic("Cannot initialize net inuse counters");
2472
2473         return 0;
2474 }
2475
2476 core_initcall(net_inuse_init);
2477 #else
2478 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2479
2480 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2481 {
2482         __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2483 }
2484 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2485
2486 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2487 {
2488         int cpu, idx = prot->inuse_idx;
2489         int res = 0;
2490
2491         for_each_possible_cpu(cpu)
2492                 res += per_cpu(prot_inuse, cpu).val[idx];
2493
2494         return res >= 0 ? res : 0;
2495 }
2496 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2497 #endif
2498
2499 static void assign_proto_idx(struct proto *prot)
2500 {
2501         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2502
2503         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2504                 pr_err("PROTO_INUSE_NR exhausted\n");
2505                 return;
2506         }
2507
2508         set_bit(prot->inuse_idx, proto_inuse_idx);
2509 }
2510
2511 static void release_proto_idx(struct proto *prot)
2512 {
2513         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2514                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2515 }
2516 #else
/* !CONFIG_PROC_FS: no in-use counters, nothing to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
}
2520
/* !CONFIG_PROC_FS: no in-use counters, nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
2524 #endif
2525
2526 int proto_register(struct proto *prot, int alloc_slab)
2527 {
2528         if (alloc_slab) {
2529                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2530                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2531                                         NULL);
2532
2533                 if (prot->slab == NULL) {
2534                         pr_crit("%s: Can't create sock SLAB cache!\n",
2535                                 prot->name);
2536                         goto out;
2537                 }
2538
2539                 if (prot->rsk_prot != NULL) {
2540                         prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2541                         if (prot->rsk_prot->slab_name == NULL)
2542                                 goto out_free_sock_slab;
2543
2544                         prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2545                                                                  prot->rsk_prot->obj_size, 0,
2546                                                                  SLAB_HWCACHE_ALIGN, NULL);
2547
2548                         if (prot->rsk_prot->slab == NULL) {
2549                                 pr_crit("%s: Can't create request sock SLAB cache!\n",
2550                                         prot->name);
2551                                 goto out_free_request_sock_slab_name;
2552                         }
2553                 }
2554
2555                 if (prot->twsk_prot != NULL) {
2556                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2557
2558                         if (prot->twsk_prot->twsk_slab_name == NULL)
2559                                 goto out_free_request_sock_slab;
2560
2561                         prot->twsk_prot->twsk_slab =
2562                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2563                                                   prot->twsk_prot->twsk_obj_size,
2564                                                   0,
2565                                                   SLAB_HWCACHE_ALIGN |
2566                                                         prot->slab_flags,
2567                                                   NULL);
2568                         if (prot->twsk_prot->twsk_slab == NULL)
2569                                 goto out_free_timewait_sock_slab_name;
2570                 }
2571         }
2572
2573         mutex_lock(&proto_list_mutex);
2574         list_add(&prot->node, &proto_list);
2575         assign_proto_idx(prot);
2576         mutex_unlock(&proto_list_mutex);
2577         return 0;
2578
2579 out_free_timewait_sock_slab_name:
2580         kfree(prot->twsk_prot->twsk_slab_name);
2581 out_free_request_sock_slab:
2582         if (prot->rsk_prot && prot->rsk_prot->slab) {
2583                 kmem_cache_destroy(prot->rsk_prot->slab);
2584                 prot->rsk_prot->slab = NULL;
2585         }
2586 out_free_request_sock_slab_name:
2587         if (prot->rsk_prot)
2588                 kfree(prot->rsk_prot->slab_name);
2589 out_free_sock_slab:
2590         kmem_cache_destroy(prot->slab);
2591         prot->slab = NULL;
2592 out:
2593         return -ENOBUFS;
2594 }
2595 EXPORT_SYMBOL(proto_register);
2596
2597 void proto_unregister(struct proto *prot)
2598 {
2599         mutex_lock(&proto_list_mutex);
2600         release_proto_idx(prot);
2601         list_del(&prot->node);
2602         mutex_unlock(&proto_list_mutex);
2603
2604         if (prot->slab != NULL) {
2605                 kmem_cache_destroy(prot->slab);
2606                 prot->slab = NULL;
2607         }
2608
2609         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2610                 kmem_cache_destroy(prot->rsk_prot->slab);
2611                 kfree(prot->rsk_prot->slab_name);
2612                 prot->rsk_prot->slab = NULL;
2613         }
2614
2615         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2616                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2617                 kfree(prot->twsk_prot->twsk_slab_name);
2618                 prot->twsk_prot->twsk_slab = NULL;
2619         }
2620 }
2621 EXPORT_SYMBOL(proto_unregister);
2622
2623 #ifdef CONFIG_PROC_FS
2624 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2625         __acquires(proto_list_mutex)
2626 {
2627         mutex_lock(&proto_list_mutex);
2628         return seq_list_start_head(&proto_list, *pos);
2629 }
2630
2631 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2632 {
2633         return seq_list_next(v, &proto_list, pos);
2634 }
2635
2636 static void proto_seq_stop(struct seq_file *seq, void *v)
2637         __releases(proto_list_mutex)
2638 {
2639         mutex_unlock(&proto_list_mutex);
2640 }
2641
/* Map a method pointer to 'y' (set) or 'n' (NULL) for the protocols table. */
static char proto_method_implemented(const void *method)
{
	if (method != NULL)
		return 'y';

	return 'n';
}
2646 static long sock_prot_memory_allocated(struct proto *proto)
2647 {
2648         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2649 }
2650
2651 static char *sock_prot_memory_pressure(struct proto *proto)
2652 {
2653         return proto->memory_pressure != NULL ?
2654         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2655 }
2656
2657 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2658 {
2659
2660         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2661                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2662                    proto->name,
2663                    proto->obj_size,
2664                    sock_prot_inuse_get(seq_file_net(seq), proto),
2665                    sock_prot_memory_allocated(proto),
2666                    sock_prot_memory_pressure(proto),
2667                    proto->max_header,
2668                    proto->slab == NULL ? "no" : "yes",
2669                    module_name(proto->owner),
2670                    proto_method_implemented(proto->close),
2671                    proto_method_implemented(proto->connect),
2672                    proto_method_implemented(proto->disconnect),
2673                    proto_method_implemented(proto->accept),
2674                    proto_method_implemented(proto->ioctl),
2675                    proto_method_implemented(proto->init),
2676                    proto_method_implemented(proto->destroy),
2677                    proto_method_implemented(proto->shutdown),
2678                    proto_method_implemented(proto->setsockopt),
2679                    proto_method_implemented(proto->getsockopt),
2680                    proto_method_implemented(proto->sendmsg),
2681                    proto_method_implemented(proto->recvmsg),
2682                    proto_method_implemented(proto->sendpage),
2683                    proto_method_implemented(proto->bind),
2684                    proto_method_implemented(proto->backlog_rcv),
2685                    proto_method_implemented(proto->hash),
2686                    proto_method_implemented(proto->unhash),
2687                    proto_method_implemented(proto->get_port),
2688                    proto_method_implemented(proto->enter_memory_pressure));
2689 }
2690
2691 static int proto_seq_show(struct seq_file *seq, void *v)
2692 {
2693         if (v == &proto_list)
2694                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2695                            "protocol",
2696                            "size",
2697                            "sockets",
2698                            "memory",
2699                            "press",
2700                            "maxhdr",
2701                            "slab",
2702                            "module",
2703                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2704         else
2705                 proto_seq_printf(seq, list_entry(v, struct proto, node));
2706         return 0;
2707 }
2708
2709 static const struct seq_operations proto_seq_ops = {
2710         .start  = proto_seq_start,
2711         .next   = proto_seq_next,
2712         .stop   = proto_seq_stop,
2713         .show   = proto_seq_show,
2714 };
2715
2716 static int proto_seq_open(struct inode *inode, struct file *file)
2717 {
2718         return seq_open_net(inode, file, &proto_seq_ops,
2719                             sizeof(struct seq_net_private));
2720 }
2721
2722 static const struct file_operations proto_seq_fops = {
2723         .owner          = THIS_MODULE,
2724         .open           = proto_seq_open,
2725         .read           = seq_read,
2726         .llseek         = seq_lseek,
2727         .release        = seq_release_net,
2728 };
2729
2730 static __net_init int proto_init_net(struct net *net)
2731 {
2732         if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2733                 return -ENOMEM;
2734
2735         return 0;
2736 }
2737
2738 static __net_exit void proto_exit_net(struct net *net)
2739 {
2740         proc_net_remove(net, "protocols");
2741 }
2742
2743
2744 static __net_initdata struct pernet_operations proto_net_ops = {
2745         .init = proto_init_net,
2746         .exit = proto_exit_net,
2747 };
2748
2749 static int __init proto_init(void)
2750 {
2751         return register_pernet_subsys(&proto_net_ops);
2752 }
2753
2754 subsys_initcall(proto_init);
2755
2756 #endif /* PROC_FS */