net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <linux/capability.h>
  95 #include <linux/errno.h>
  96 #include <linux/types.h>
  97 #include <linux/socket.h>
  98 #include <linux/in.h>
  99 #include <linux/kernel.h>
 100 #include <linux/module.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/sched.h>
 104 #include <linux/timer.h>
 105 #include <linux/string.h>
 106 #include <linux/sockios.h>
 107 #include <linux/net.h>
 108 #include <linux/mm.h>
 109 #include <linux/slab.h>
 110 #include <linux/interrupt.h>
 111 #include <linux/poll.h>
 112 #include <linux/tcp.h>
 113 #include <linux/init.h>
 114 #include <linux/highmem.h>
 115 #include <linux/user_namespace.h>
 116 #include <linux/static_key.h>
 117 #include <linux/memcontrol.h>
 118 #include <linux/prefetch.h>
 119
 120 #include <asm/uaccess.h>
 121
 122 #include <linux/netdevice.h>
 123 #include <net/protocol.h>
 124 #include <linux/skbuff.h>
 125 #include <net/net_namespace.h>
 126 #include <net/request_sock.h>
 127 #include <net/sock.h>
 128 #include <linux/net_tstamp.h>
 129 #include <net/xfrm.h>
 130 #include <linux/ipsec.h>
 131 #include <net/cls_cgroup.h>
 132 #include <net/netprio_cgroup.h>
 133
 134 #include <linux/filter.h>
 135
 136 #include <trace/events/sock.h>
 137
 138 #ifdef CONFIG_INET
 139 #include <net/tcp.h>
 140 #endif
 141
 142 static DEFINE_MUTEX(proto_list_mutex);
 143 static LIST_HEAD(proto_list);
 144
 145 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
 146 int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 147 {
 148         struct proto *proto;
 149         int ret = 0;
 150
 151         mutex_lock(&proto_list_mutex);
 152         list_for_each_entry(proto, &proto_list, node) {
 153                 if (proto->init_cgroup) {
 154                         ret = proto->init_cgroup(memcg, ss);
 155                         if (ret)
 156                                 goto out;
 157                 }
 158         }
 159
 160         mutex_unlock(&proto_list_mutex);
 161         return ret;
 162 out:
 163         list_for_each_entry_continue_reverse(proto, &proto_list, node)
 164                 if (proto->destroy_cgroup)
 165                         proto->destroy_cgroup(memcg);
 166         mutex_unlock(&proto_list_mutex);
 167         return ret;
 168 }
 169
 170 void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
 171 {
 172         struct proto *proto;
 173
 174         mutex_lock(&proto_list_mutex);
 175         list_for_each_entry_reverse(proto, &proto_list, node)
 176                 if (proto->destroy_cgroup)
 177                         proto->destroy_cgroup(memcg);
 178         mutex_unlock(&proto_list_mutex);
 179 }
 180 #endif
 181
 182 /*
 183  * Each address family might have different locking rules, so we have
 184  * one slock key per address family:
 185  */
 186 static struct lock_class_key af_family_keys[AF_MAX];
 187 static struct lock_class_key af_family_slock_keys[AF_MAX];
 188
 189 struct static_key memcg_socket_limit_enabled;
 190 EXPORT_SYMBOL(memcg_socket_limit_enabled);
 191
 192 /*
 193  * Make lock validator output more readable. (we pre-construct these
 194  * strings build-time, so that runtime initialization of socket
 195  * locks is fast):
 196  */
 197 static const char *const af_family_key_strings[AF_MAX+1] = {
 198   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 199   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 200   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 201   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 202   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 203   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 204   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 205   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 206   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 207   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 208   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 209   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 210   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 211   "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
 212 };
 213 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 214   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 215   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 216   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 217   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 218   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 219   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 220   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 221   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 222   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 223   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 224   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 225   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 226   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
 227   "slock-AF_NFC"   , "slock-AF_MAX"
 228 };
 229 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 230   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 231   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 232   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 233   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 234   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 235   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 236   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 237   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 238   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 239   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 240   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 241   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 242   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 243   "clock-AF_NFC"   , "clock-AF_MAX"
 244 };
 245
 246 /*
 247  * sk_callback_lock locking rules are per-address-family,
 248  * so split the lock classes by using a per-AF key:
 249  */
 250 static struct lock_class_key af_callback_keys[AF_MAX];
 251
 252 /* Take into consideration the size of the struct sk_buff overhead in the
 253  * determination of these values, since that is non-constant across
 254  * platforms.  This makes socket queueing behavior and performance
 255  * not depend upon such differences.
 256  */
 257 #define _SK_MEM_PACKETS         256
 258 #define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
 259 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 260 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 261
 262 /* Run time adjustable parameters. */
 263 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 264 EXPORT_SYMBOL(sysctl_wmem_max);
 265 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 266 EXPORT_SYMBOL(sysctl_rmem_max);
 267 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 268 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 269
 270 /* Maximal space eaten by iovec or ancillary data plus some space */
 271 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 272 EXPORT_SYMBOL(sysctl_optmem_max);
 273
 274 #if defined(CONFIG_CGROUPS)
 275 #if !defined(CONFIG_NET_CLS_CGROUP)
 276 int net_cls_subsys_id = -1;
 277 EXPORT_SYMBOL_GPL(net_cls_subsys_id);
 278 #endif
 279 #if !defined(CONFIG_NETPRIO_CGROUP)
 280 int net_prio_subsys_id = -1;
 281 EXPORT_SYMBOL_GPL(net_prio_subsys_id);
 282 #endif
 283 #endif
 284
 285 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 286 {
 287         struct timeval tv;
 288
 289         if (optlen < sizeof(tv))
 290                 return -EINVAL;
 291         if (copy_from_user(&tv, optval, sizeof(tv)))
 292                 return -EFAULT;
 293         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 294                 return -EDOM;
 295
 296         if (tv.tv_sec < 0) {
 297                 static int warned __read_mostly;
 298
 299                 *timeo_p = 0;
 300                 if (warned < 10 && net_ratelimit()) {
 301                         warned++;
 302                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 303                                 __func__, current->comm, task_pid_nr(current));
 304                 }
 305                 return 0;
 306         }
 307         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 308         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 309                 return 0;
 310         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 311                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 312         return 0;
 313 }
 314
 315 static void sock_warn_obsolete_bsdism(const char *name)
 316 {
 317         static int warned;
 318         static char warncomm[TASK_COMM_LEN];
 319         if (strcmp(warncomm, current->comm) && warned < 5) {
 320                 strcpy(warncomm,  current->comm);
 321                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 322                         warncomm, name);
 323                 warned++;
 324         }
 325 }
 326
 327 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
 328
 329 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 330 {
 331         if (sk->sk_flags & flags) {
 332                 sk->sk_flags &= ~flags;
 333                 if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 334                         net_disable_timestamp();
 335         }
 336 }
 337
 338
 339 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 340 {
 341         int err;
 342         int skb_len;
 343         unsigned long flags;
 344         struct sk_buff_head *list = &sk->sk_receive_queue;
 345
 346         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 347                 atomic_inc(&sk->sk_drops);
 348                 trace_sock_rcvqueue_full(sk, skb);
 349                 return -ENOMEM;
 350         }
 351
 352         err = sk_filter(sk, skb);
 353         if (err)
 354                 return err;
 355
 356         if (!sk_rmem_schedule(sk, skb->truesize)) {
 357                 atomic_inc(&sk->sk_drops);
 358                 return -ENOBUFS;
 359         }
 360
 361         skb->dev = NULL;
 362         skb_set_owner_r(skb, sk);
 363
 364         /* Cache the SKB length before we tack it onto the receive
 365          * queue.  Once it is added it no longer belongs to us and
 366          * may be freed by other threads of control pulling packets
 367          * from the queue.
 368          */
 369         skb_len = skb->len;
 370
 371         /* we escape from rcu protected region, make sure we dont leak
 372          * a norefcounted dst
 373          */
 374         skb_dst_force(skb);
 375
 376         spin_lock_irqsave(&list->lock, flags);
 377         skb->dropcount = atomic_read(&sk->sk_drops);
 378         __skb_queue_tail(list, skb);
 379         spin_unlock_irqrestore(&list->lock, flags);
 380
 381         if (!sock_flag(sk, SOCK_DEAD))
 382                 sk->sk_data_ready(sk, skb_len);
 383         return 0;
 384 }
 385 EXPORT_SYMBOL(sock_queue_rcv_skb);
 386
 387 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 388 {
 389         int rc = NET_RX_SUCCESS;
 390
 391         if (sk_filter(sk, skb))
 392                 goto discard_and_relse;
 393
 394         skb->dev = NULL;
 395
 396         if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
 397                 atomic_inc(&sk->sk_drops);
 398                 goto discard_and_relse;
 399         }
 400         if (nested)
 401                 bh_lock_sock_nested(sk);
 402         else
 403                 bh_lock_sock(sk);
 404         if (!sock_owned_by_user(sk)) {
 405                 /*
 406                  * trylock + unlock semantics:
 407                  */
 408                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 409
 410                 rc = sk_backlog_rcv(sk, skb);
 411
 412                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 413         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 414                 bh_unlock_sock(sk);
 415                 atomic_inc(&sk->sk_drops);
 416                 goto discard_and_relse;
 417         }
 418
 419         bh_unlock_sock(sk);
 420 out:
 421         sock_put(sk);
 422         return rc;
 423 discard_and_relse:
 424         kfree_skb(skb);
 425         goto out;
 426 }
 427 EXPORT_SYMBOL(sk_receive_skb);
 428
 429 void sk_reset_txq(struct sock *sk)
 430 {
 431         sk_tx_queue_clear(sk);
 432 }
 433 EXPORT_SYMBOL(sk_reset_txq);
 434
 435 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 436 {
 437         struct dst_entry *dst = __sk_dst_get(sk);
 438
 439         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 440                 sk_tx_queue_clear(sk);
 441                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 442                 dst_release(dst);
 443                 return NULL;
 444         }
 445
 446         return dst;
 447 }
 448 EXPORT_SYMBOL(__sk_dst_check);
 449
 450 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 451 {
 452         struct dst_entry *dst = sk_dst_get(sk);
 453
 454         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 455                 sk_dst_reset(sk);
 456                 dst_release(dst);
 457                 return NULL;
 458         }
 459
 460         return dst;
 461 }
 462 EXPORT_SYMBOL(sk_dst_check);
 463
 464 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
 465 {
 466         int ret = -ENOPROTOOPT;
 467 #ifdef CONFIG_NETDEVICES
 468         struct net *net = sock_net(sk);
 469         char devname[IFNAMSIZ];
 470         int index;
 471
 472         /* Sorry... */
 473         ret = -EPERM;
 474         if (!capable(CAP_NET_RAW))
 475                 goto out;
 476
 477         ret = -EINVAL;
 478         if (optlen < 0)
 479                 goto out;
 480
 481         /* Bind this socket to a particular device like "eth0",
 482          * as specified in the passed interface name. If the
 483          * name is "" or the option length is zero the socket
 484          * is not bound.
 485          */
 486         if (optlen > IFNAMSIZ - 1)
 487                 optlen = IFNAMSIZ - 1;
 488         memset(devname, 0, sizeof(devname));
 489
 490         ret = -EFAULT;
 491         if (copy_from_user(devname, optval, optlen))
 492                 goto out;
 493
 494         index = 0;
 495         if (devname[0] != '\0') {
 496                 struct net_device *dev;
 497
 498                 rcu_read_lock();
 499                 dev = dev_get_by_name_rcu(net, devname);
 500                 if (dev)
 501                         index = dev->ifindex;
 502                 rcu_read_unlock();
 503                 ret = -ENODEV;
 504                 if (!dev)
 505                         goto out;
 506         }
 507
 508         lock_sock(sk);
 509         sk->sk_bound_dev_if = index;
 510         sk_dst_reset(sk);
 511         release_sock(sk);
 512
 513         ret = 0;
 514
 515 out:
 516 #endif
 517
 518         return ret;
 519 }
 520
 521 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 522 {
 523         if (valbool)
 524                 sock_set_flag(sk, bit);
 525         else
 526                 sock_reset_flag(sk, bit);
 527 }
 528
 529 /*
 530  *      This is meant for all protocols to use and covers goings on
 531  *      at the socket level. Everything here is generic.
 532  */
 533
 534 int sock_setsockopt(struct socket *sock, int level, int optname,
 535                     char __user *optval, unsigned int optlen)
 536 {
 537         struct sock *sk = sock->sk;
 538         int val;
 539         int valbool;
 540         struct linger ling;
 541         int ret = 0;
 542
 543         /*
 544          *      Options without arguments
 545          */
 546
 547         if (optname == SO_BINDTODEVICE)
 548                 return sock_bindtodevice(sk, optval, optlen);
 549
 550         if (optlen < sizeof(int))
 551                 return -EINVAL;
 552
 553         if (get_user(val, (int __user *)optval))
 554                 return -EFAULT;
 555
 556         valbool = val ? 1 : 0;
 557
 558         lock_sock(sk);
 559
 560         switch (optname) {
 561         case SO_DEBUG:
 562                 if (val && !capable(CAP_NET_ADMIN))
 563                         ret = -EACCES;
 564                 else
 565                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 566                 break;
 567         case SO_REUSEADDR:
 568                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 569                 break;
 570         case SO_TYPE:
 571         case SO_PROTOCOL:
 572         case SO_DOMAIN:
 573         case SO_ERROR:
 574                 ret = -ENOPROTOOPT;
 575                 break;
 576         case SO_DONTROUTE:
 577                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 578                 break;
 579         case SO_BROADCAST:
 580                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 581                 break;
 582         case SO_SNDBUF:
 583                 /* Don't error on this BSD doesn't and if you think
 584                  * about it this is right. Otherwise apps have to
 585                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 586                  * are treated in BSD as hints
 587                  */
 588                 val = min_t(u32, val, sysctl_wmem_max);
 589 set_sndbuf:
 590                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 591                 sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
 592                 /* Wake up sending tasks if we upped the value. */
 593                 sk->sk_write_space(sk);
 594                 break;
 595
 596         case SO_SNDBUFFORCE:
 597                 if (!capable(CAP_NET_ADMIN)) {
 598                         ret = -EPERM;
 599                         break;
 600                 }
 601                 goto set_sndbuf;
 602
 603         case SO_RCVBUF:
 604                 /* Don't error on this BSD doesn't and if you think
 605                  * about it this is right. Otherwise apps have to
 606                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 607                  * are treated in BSD as hints
 608                  */
 609                 val = min_t(u32, val, sysctl_rmem_max);
 610 set_rcvbuf:
 611                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 612                 /*
 613                  * We double it on the way in to account for
 614                  * "struct sk_buff" etc. overhead.   Applications
 615                  * assume that the SO_RCVBUF setting they make will
 616                  * allow that much actual data to be received on that
 617                  * socket.
 618                  *
 619                  * Applications are unaware that "struct sk_buff" and
 620                  * other overheads allocate from the receive buffer
 621                  * during socket buffer allocation.
 622                  *
 623                  * And after considering the possible alternatives,
 624                  * returning the value we actually used in getsockopt
 625                  * is the most desirable behavior.
 626                  */
 627                 sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
 628                 break;
 629
 630         case SO_RCVBUFFORCE:
 631                 if (!capable(CAP_NET_ADMIN)) {
 632                         ret = -EPERM;
 633                         break;
 634                 }
 635                 goto set_rcvbuf;
 636
 637         case SO_KEEPALIVE:
 638 #ifdef CONFIG_INET
 639                 if (sk->sk_protocol == IPPROTO_TCP)
 640                         tcp_set_keepalive(sk, valbool);
 641 #endif
 642                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 643                 break;
 644
 645         case SO_OOBINLINE:
 646                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 647                 break;
 648
 649         case SO_NO_CHECK:
 650                 sk->sk_no_check = valbool;
 651                 break;
 652
 653         case SO_PRIORITY:
 654                 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
 655                         sk->sk_priority = val;
 656                 else
 657                         ret = -EPERM;
 658                 break;
 659
 660         case SO_LINGER:
 661                 if (optlen < sizeof(ling)) {
 662                         ret = -EINVAL;  /* 1003.1g */
 663                         break;
 664                 }
 665                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 666                         ret = -EFAULT;
 667                         break;
 668                 }
 669                 if (!ling.l_onoff)
 670                         sock_reset_flag(sk, SOCK_LINGER);
 671                 else {
 672 #if (BITS_PER_LONG == 32)
 673                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 674                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 675                         else
 676 #endif
 677                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 678                         sock_set_flag(sk, SOCK_LINGER);
 679                 }
 680                 break;
 681
 682         case SO_BSDCOMPAT:
 683                 sock_warn_obsolete_bsdism("setsockopt");
 684                 break;
 685
 686         case SO_PASSCRED:
 687                 if (valbool)
 688                         set_bit(SOCK_PASSCRED, &sock->flags);
 689                 else
 690                         clear_bit(SOCK_PASSCRED, &sock->flags);
 691                 break;
 692
 693         case SO_TIMESTAMP:
 694         case SO_TIMESTAMPNS:
 695                 if (valbool)  {
 696                         if (optname == SO_TIMESTAMP)
 697                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 698                         else
 699                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 700                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 701                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 702                 } else {
 703                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 704                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 705                 }
 706                 break;
 707
 708         case SO_TIMESTAMPING:
 709                 if (val & ~SOF_TIMESTAMPING_MASK) {
 710                         ret = -EINVAL;
 711                         break;
 712                 }
 713                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
 714                                   val & SOF_TIMESTAMPING_TX_HARDWARE);
 715                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
 716                                   val & SOF_TIMESTAMPING_TX_SOFTWARE);
 717                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
 718                                   val & SOF_TIMESTAMPING_RX_HARDWARE);
 719                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 720                         sock_enable_timestamp(sk,
 721                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 722                 else
 723                         sock_disable_timestamp(sk,
 724                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 725                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
 726                                   val & SOF_TIMESTAMPING_SOFTWARE);
 727                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
 728                                   val & SOF_TIMESTAMPING_SYS_HARDWARE);
 729                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
 730                                   val & SOF_TIMESTAMPING_RAW_HARDWARE);
 731                 break;
 732
 733         case SO_RCVLOWAT:
 734                 if (val < 0)
 735                         val = INT_MAX;
 736                 sk->sk_rcvlowat = val ? : 1;
 737                 break;
 738
 739         case SO_RCVTIMEO:
 740                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 741                 break;
 742
 743         case SO_SNDTIMEO:
 744                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 745                 break;
 746
 747         case SO_ATTACH_FILTER:
 748                 ret = -EINVAL;
 749                 if (optlen == sizeof(struct sock_fprog)) {
 750                         struct sock_fprog fprog;
 751
 752                         ret = -EFAULT;
 753                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 754                                 break;
 755
 756                         ret = sk_attach_filter(&fprog, sk);
 757                 }
 758                 break;
 759
 760         case SO_DETACH_FILTER:
 761                 ret = sk_detach_filter(sk);
 762                 break;
 763
 764         case SO_PASSSEC:
 765                 if (valbool)
 766                         set_bit(SOCK_PASSSEC, &sock->flags);
 767                 else
 768                         clear_bit(SOCK_PASSSEC, &sock->flags);
 769                 break;
 770         case SO_MARK:
 771                 if (!capable(CAP_NET_ADMIN))
 772                         ret = -EPERM;
 773                 else
 774                         sk->sk_mark = val;
 775                 break;
 776
 777                 /* We implement the SO_SNDLOWAT etc to
 778                    not be settable (1003.1g 5.3) */
 779         case SO_RXQ_OVFL:
 780                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 781                 break;
 782
 783         case SO_WIFI_STATUS:
 784                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 785                 break;
 786
 787         case SO_PEEK_OFF:
 788                 if (sock->ops->set_peek_off)
 789                         sock->ops->set_peek_off(sk, val);
 790                 else
 791                         ret = -EOPNOTSUPP;
 792                 break;
 793
 794         case SO_NOFCS:
 795                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 796                 break;
 797
 798         default:
 799                 ret = -ENOPROTOOPT;
 800                 break;
 801         }
 802         release_sock(sk);
 803         return ret;
 804 }
 805 EXPORT_SYMBOL(sock_setsockopt);
 806
 807
 808 void cred_to_ucred(struct pid *pid, const struct cred *cred,
 809                    struct ucred *ucred)
 810 {
 811         ucred->pid = pid_vnr(pid);
 812         ucred->uid = ucred->gid = -1;
 813         if (cred) {
 814                 struct user_namespace *current_ns = current_user_ns();
 815
 816                 ucred->uid = from_kuid(current_ns, cred->euid);
 817                 ucred->gid = from_kgid(current_ns, cred->egid);
 818         }
 819 }
 820 EXPORT_SYMBOL_GPL(cred_to_ucred);
 821
 822 int sock_getsockopt(struct socket *sock, int level, int optname,
 823                     char __user *optval, int __user *optlen)
 824 {
             /*
              * Generic getsockopt handler for the options listed in the
              * switch below.
              *
              * The caller-supplied buffer length is read from @optlen; a
              * negative length yields -EINVAL. Most options place their value
              * in the zeroed union 'v', with 'lv' holding the value's natural
              * size; at most min(len, lv) bytes are then copied to @optval and
              * the length actually used is written back to @optlen.
              * SO_PEERCRED and SO_PEERNAME copy their own data and jump to
              * 'lenout'; SO_PEERSEC is handed to the security layer.
              *
              * Returns 0 on success, -EFAULT on a failed user access,
              * -ENOPROTOOPT for an option not handled here, or the
              * option-specific error seen in the switch. @level is not
              * consulted in this function, and no socket lock is taken here.
              */
 825         struct sock *sk = sock->sk;
 826
 827         union {
 828                 int val;
 829                 struct linger ling;
 830                 struct timeval tm;
 831         } v;
 832
 833         int lv = sizeof(int);
 834         int len;
 835
 836         if (get_user(len, optlen))
 837                 return -EFAULT;
 838         if (len < 0)
 839                 return -EINVAL;
 840
             /* Zero the whole union so bytes an option does not set read as 0. */
 841         memset(&v, 0, sizeof(v));
 842
 843         switch (optname) {
 844         case SO_DEBUG:
 845                 v.val = sock_flag(sk, SOCK_DBG);
 846                 break;
 847
 848         case SO_DONTROUTE:
 849                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
 850                 break;
 851
 852         case SO_BROADCAST:
 853                 v.val = sock_flag(sk, SOCK_BROADCAST);
 854                 break;
 855
 856         case SO_SNDBUF:
 857                 v.val = sk->sk_sndbuf;
 858                 break;
 859
 860         case SO_RCVBUF:
 861                 v.val = sk->sk_rcvbuf;
 862                 break;
 863
 864         case SO_REUSEADDR:
 865                 v.val = sk->sk_reuse;
 866                 break;
 867
 868         case SO_KEEPALIVE:
 869                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
 870                 break;
 871
 872         case SO_TYPE:
 873                 v.val = sk->sk_type;
 874                 break;
 875
 876         case SO_PROTOCOL:
 877                 v.val = sk->sk_protocol;
 878                 break;
 879
 880         case SO_DOMAIN:
 881                 v.val = sk->sk_family;
 882                 break;
 883
 884         case SO_ERROR:
                     /*
                      * Report the negated sock_error() result; when that is 0,
                      * fetch sk_err_soft instead and reset it to 0 via xchg().
                      */
 885                 v.val = -sock_error(sk);
 886                 if (v.val == 0)
 887                         v.val = xchg(&sk->sk_err_soft, 0);
 888                 break;
 889
 890         case SO_OOBINLINE:
 891                 v.val = sock_flag(sk, SOCK_URGINLINE);
 892                 break;
 893
 894         case SO_NO_CHECK:
 895                 v.val = sk->sk_no_check;
 896                 break;
 897
 898         case SO_PRIORITY:
 899                 v.val = sk->sk_priority;
 900                 break;
 901
 902         case SO_LINGER:
 903                 lv              = sizeof(v.ling);
 904                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
 905                 v.ling.l_linger = sk->sk_lingertime / HZ;
 906                 break;
 907
 908         case SO_BSDCOMPAT:
                     /* Only warns; v.val stays 0 from the memset above. */
 909                 sock_warn_obsolete_bsdism("getsockopt");
 910                 break;
 911
 912         case SO_TIMESTAMP:
 913                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
 914                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
 915                 break;
 916
 917         case SO_TIMESTAMPNS:
 918                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
 919                 break;
 920
 921         case SO_TIMESTAMPING:
                     /* Rebuild the SOF_TIMESTAMPING_* mask from the socket flags. */
 922                 v.val = 0;
 923                 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
 924                         v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
 925                 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
 926                         v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
 927                 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
 928                         v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
 929                 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
 930                         v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
 931                 if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
 932                         v.val |= SOF_TIMESTAMPING_SOFTWARE;
 933                 if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
 934                         v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
 935                 if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
 936                         v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
 937                 break;
 938
 939         case SO_RCVTIMEO:
                     /*
                      * MAX_SCHEDULE_TIMEOUT is reported as an all-zero timeval;
                      * any other value is converted from jiffies using HZ.
                      */
 940                 lv = sizeof(struct timeval);
 941                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
 942                         v.tm.tv_sec = 0;
 943                         v.tm.tv_usec = 0;
 944                 } else {
 945                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
 946                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
 947                 }
 948                 break;
 949
 950         case SO_SNDTIMEO:
                     /* Same conversion as SO_RCVTIMEO, applied to sk_sndtimeo. */
 951                 lv = sizeof(struct timeval);
 952                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
 953                         v.tm.tv_sec = 0;
 954                         v.tm.tv_usec = 0;
 955                 } else {
 956                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
 957                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
 958                 }
 959                 break;
 960
 961         case SO_RCVLOWAT:
 962                 v.val = sk->sk_rcvlowat;
 963                 break;
 964
 965         case SO_SNDLOWAT:
                     /* Always reported as the constant 1. */
 966                 v.val = 1;
 967                 break;
 968
 969         case SO_PASSCRED:
 970                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
 971                 break;
 972
 973         case SO_PEERCRED:
 974         {
                     /*
                      * Copied straight from a local ucred, truncated to the
                      * caller's length; bypasses the common v/lv copy below.
                      */
 975                 struct ucred peercred;
 976                 if (len > sizeof(peercred))
 977                         len = sizeof(peercred);
 978                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
 979                 if (copy_to_user(optval, &peercred, len))
 980                         return -EFAULT;
 981                 goto lenout;
 982         }
 983
 984         case SO_PEERNAME:
 985         {
 986                 char address[128];
 987
                     /*
                      * getname() is handed &lv and may update it. A non-zero
                      * return is reported as -ENOTCONN, and a caller length
                      * larger than lv as -EINVAL.
                      * NOTE(review): the literal 2 presumably selects the peer
                      * address - confirm against the getname() implementations.
                      */
 988                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
 989                         return -ENOTCONN;
 990                 if (lv < len)
 991                         return -EINVAL;
 992                 if (copy_to_user(optval, address, len))
 993                         return -EFAULT;
 994                 goto lenout;
 995         }
 996
 997         /* Dubious BSD thing... Probably nobody even uses it, but
 998          * the UNIX standard wants it for whatever reason... -DaveM
 999          */
1000         case SO_ACCEPTCONN:
1001                 v.val = sk->sk_state == TCP_LISTEN;
1002                 break;
1003
1004         case SO_PASSSEC:
1005                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1006                 break;
1007
1008         case SO_PEERSEC:
                     /* Fully delegated, including any update of *optlen. */
1009                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1010
1011         case SO_MARK:
1012                 v.val = sk->sk_mark;
1013                 break;
1014
1015         case SO_RXQ_OVFL:
1016                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1017                 break;
1018
1019         case SO_WIFI_STATUS:
1020                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1021                 break;
1022
1023         case SO_PEEK_OFF:
                     /* Readable only when the protocol provides set_peek_off. */
1024                 if (!sock->ops->set_peek_off)
1025                         return -EOPNOTSUPP;
1026
1027                 v.val = sk->sk_peek_off;
1028                 break;
1029         case SO_NOFCS:
1030                 v.val = sock_flag(sk, SOCK_NOFCS);
1031                 break;
1032         default:
1033                 return -ENOPROTOOPT;
1034         }
1035
             /*
              * Never copy more than the option's natural size; a shorter user
              * buffer receives a truncated value.
              */
1036         if (len > lv)
1037                 len = lv;
1038         if (copy_to_user(optval, &v, len))
1039                 return -EFAULT;
1040 lenout:
             /* Tell the caller how many bytes were actually written. */
1041         if (put_user(len, optlen))
1042                 return -EFAULT;
1043         return 0;
1044 }
1045
1046 /*
1047  * Initialize an sk_lock.
1048  *
1049  * (We also register the sk_lock with the lock validator.)
      *
      * The class keys and their name strings are taken from the af_family_*
      * tables, indexed by sk->sk_family, so sk_family has to be set before
      * this helper is called.
1050  */
1051 static inline void sock_lock_init(struct sock *sk)
1052 {
1053         sock_lock_init_class_and_name(sk,
1054                         af_family_slock_key_strings[sk->sk_family],
1055                         af_family_slock_keys + sk->sk_family,
1056                         af_family_key_strings[sk->sk_family],
1057                         af_family_keys + sk->sk_family);
1058 }
1059
1060 /*
1061  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1062  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1063  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
      *
      * The copy is done in two pieces: everything before sk_dontcopy_begin,
      * then everything from sk_dontcopy_end up to osk->sk_prot->obj_size.
      * With CONFIG_SECURITY_NETWORK, nsk keeps its own sk_security pointer
      * and security_sk_clone() is called afterwards.
1064  */
1065 static void sock_copy(struct sock *nsk, const struct sock *osk)
1066 {
1067 #ifdef CONFIG_SECURITY_NETWORK
             /* Saved because the second memcpy() below overwrites it. */
1068         void *sptr = nsk->sk_security;
1069 #endif
             /* Leading part: up to, but excluding, sk_dontcopy_begin. */
1070         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1071
             /* Trailing part, sized by the protocol's obj_size. */
1072         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1073                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1074
1075 #ifdef CONFIG_SECURITY_NETWORK
1076         nsk->sk_security = sptr;
1077         security_sk_clone(osk, nsk);
1078 #endif
1079 }
1080
1081 /*
1082  * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes
1083  * un-modified. Special care is taken when initializing object to zero.
1084  */
1085 static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1086 {
1087         if (offsetof(struct sock, sk_node.next) != 0)
1088                 memset(sk, 0, offsetof(struct sock, sk_node.next));
1089         memset(&sk->sk_node.pprev, 0,
1090                size - offsetof(struct sock, sk_node.pprev));
1091 }
1092
1093 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1094 {
1095         unsigned long nulls1, nulls2;
1096
1097         nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1098         nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1099         if (nulls1 > nulls2)
1100                 swap(nulls1, nulls2);
1101
1102         if (nulls1 != 0)
1103                 memset((char *)sk, 0, nulls1);
1104         memset((char *)sk + nulls1 + sizeof(void *), 0,
1105                nulls2 - nulls1 - sizeof(void *));
1106         memset((char *)sk + nulls2 + sizeof(void *), 0,
1107                size - nulls2 - sizeof(void *));
1108 }
1109 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1110
1111 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1112                 int family)
1113 {
1114         struct sock *sk;
1115         struct kmem_cache *slab;
1116
1117         slab = prot->slab;
1118         if (slab != NULL) {
1119                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1120                 if (!sk)
1121                         return sk;
1122                 if (priority & __GFP_ZERO) {
1123                         if (prot->clear_sk)
1124                                 prot->clear_sk(sk, prot->obj_size);
1125                         else
1126                                 sk_prot_clear_nulls(sk, prot->obj_size);
1127                 }
1128         } else
1129                 sk = kmalloc(prot->obj_size, priority);
1130
1131         if (sk != NULL) {
1132                 kmemcheck_annotate_bitfield(sk, flags);
1133
1134                 if (security_sk_alloc(sk, family, priority))
1135                         goto out_free;
1136
1137                 if (!try_module_get(prot->owner))
1138                         goto out_free_sec;
1139                 sk_tx_queue_clear(sk);
1140         }
1141
1142         return sk;
1143
1144 out_free_sec:
1145         security_sk_free(sk);
1146 out_free:
1147         if (slab != NULL)
1148                 kmem_cache_free(slab, sk);
1149         else
1150                 kfree(sk);
1151         return NULL;
1152 }
1153
1154 static void sk_prot_free(struct proto *prot, struct sock *sk)
1155 {
1156         struct kmem_cache *slab;
1157         struct module *owner;
1158
1159         owner = prot->owner;
1160         slab = prot->slab;
1161
1162         security_sk_free(sk);
1163         if (slab != NULL)
1164                 kmem_cache_free(slab, sk);
1165         else
1166                 kfree(sk);
1167         module_put(owner);
1168 }
1169
1170 #ifdef CONFIG_CGROUPS
1171 void sock_update_classid(struct sock *sk)
1172 {
1173         u32 classid;
1174
1175         rcu_read_lock();  /* doing current task, which cannot vanish. */
1176         classid = task_cls_classid(current);
1177         rcu_read_unlock();
1178         if (classid && classid != sk->sk_classid)
1179                 sk->sk_classid = classid;
1180 }
1181 EXPORT_SYMBOL(sock_update_classid);
1182
1183 void sock_update_netprioidx(struct sock *sk, struct task_struct *task)
1184 {
1185         if (in_interrupt())
1186                 return;
1187
1188         sk->sk_cgrp_prioidx = task_netprioidx(task);
1189 }
1190 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1191 #endif
1192
1193 /**
1194  *      sk_alloc - All socket objects are allocated here
1195  *      @net: the applicable net namespace
1196  *      @family: protocol family
1197  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1198  *      @prot: struct proto associated with this new sock instance
1199  */
1200 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1201                       struct proto *prot)
1202 {
1203         struct sock *sk;
1204
1205         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1206         if (sk) {
1207                 sk->sk_family = family;
1208                 /*
1209                  * See comment in struct sock definition to understand
1210                  * why we need sk_prot_creator -acme
1211                  */
1212                 sk->sk_prot = sk->sk_prot_creator = prot;
1213                 sock_lock_init(sk);
1214                 sock_net_set(sk, get_net(net));
1215                 atomic_set(&sk->sk_wmem_alloc, 1);
1216
1217                 sock_update_classid(sk);
1218                 sock_update_netprioidx(sk, current);
1219         }
1220
1221         return sk;
1222 }
1223 EXPORT_SYMBOL(sk_alloc);
1224
1225 static void __sk_free(struct sock *sk)
1226 {
             /*
              * Final teardown of a sock. Runs the sk_destruct callback if one
              * is set, uncharges and detaches any attached filter, disables
              * timestamping, then drops the peer cred, peer pid and net
              * namespace references before handing the memory to
              * sk_prot_free() for the protocol that created the sock.
              * Called from sk_free() and sock_wfree() once sk_wmem_alloc has
              * dropped to zero.
              */
1227         struct sk_filter *filter;
1228
1229         if (sk->sk_destruct)
1230                 sk->sk_destruct(sk);
1231
             /* The check condition documents that sk_wmem_alloc is 0 here. */
1232         filter = rcu_dereference_check(sk->sk_filter,
1233                                        atomic_read(&sk->sk_wmem_alloc) == 0);
1234         if (filter) {
1235                 sk_filter_uncharge(sk, filter);
1236                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1237         }
1238
1239         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1240
             /* Leftover option memory is only reported, not reclaimed here. */
1241         if (atomic_read(&sk->sk_omem_alloc))
1242                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1243                          __func__, atomic_read(&sk->sk_omem_alloc));
1244
1245         if (sk->sk_peer_cred)
1246                 put_cred(sk->sk_peer_cred);
1247         put_pid(sk->sk_peer_pid);
1248         put_net(sock_net(sk));
             /* sk_prot_creator, not sk_prot, selects the allocator to free to. */
1249         sk_prot_free(sk->sk_prot_creator, sk);
1250 }
1251
1252 void sk_free(struct sock *sk)
1253 {
1254         /*
1255          * We subtract one from sk_wmem_alloc and can know if
1256          * some packets are still in some tx queue.
1257          * If not null, sock_wfree() will call __sk_free(sk) later
1258          */
1259         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1260                 __sk_free(sk);
1261 }
1262 EXPORT_SYMBOL(sk_free);
1263
1264 /*
1265  * Last sock_put should drop reference to sk->sk_net. It has already
1266  * been dropped in sk_change_net. Taking reference to stopping namespace
1267  * is not an option.
1268  * Take reference to a socket to remove it from hash _alive_ and after that
1269  * destroy it in the context of init_net.
1270  */
1271 void sk_release_kernel(struct sock *sk)
1272 {
1273         if (sk == NULL || sk->sk_socket == NULL)
1274                 return;
1275
1276         sock_hold(sk);
1277         sock_release(sk->sk_socket);
1278         release_net(sock_net(sk));
1279         sock_net_set(sk, get_net(&init_net));
1280         sock_put(sk);
1281 }
1282 EXPORT_SYMBOL(sk_release_kernel);
1283
1284 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1285 {
1286         if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1287                 sock_update_memcg(newsk);
1288 }
1289
1290 /**
1291  *      sk_clone_lock - clone a socket, and lock its clone
1292  *      @sk: the socket to clone
1293  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1294  *
1295  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
      *
      *      Returns the clone with bh_lock_sock() held and sk_refcnt set to 2,
      *      or NULL when the allocation or xfrm_sk_clone_policy() failed.
1296  */
1297 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1298 {
1299         struct sock *newsk;
1300
1301         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1302         if (newsk != NULL) {
1303                 struct sk_filter *filter;
1304
                     /* Start from a copy of the parent, then reset per-sock state. */
1305                 sock_copy(newsk, sk);
1306
1307                 /* SANITY */
1308                 get_net(sock_net(newsk));
1309                 sk_node_init(&newsk->sk_node);
1310                 sock_lock_init(newsk);
1311                 bh_lock_sock(newsk);
1312                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1313                 newsk->sk_backlog.len = 0;
1314
1315                 atomic_set(&newsk->sk_rmem_alloc, 0);
1316                 /*
1317                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1318                  */
1319                 atomic_set(&newsk->sk_wmem_alloc, 1);
1320                 atomic_set(&newsk->sk_omem_alloc, 0);
1321                 skb_queue_head_init(&newsk->sk_receive_queue);
1322                 skb_queue_head_init(&newsk->sk_write_queue);
1323 #ifdef CONFIG_NET_DMA
1324                 skb_queue_head_init(&newsk->sk_async_wait_queue);
1325 #endif
1326
1327                 spin_lock_init(&newsk->sk_dst_lock);
1328                 rwlock_init(&newsk->sk_callback_lock);
1329                 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1330                                 af_callback_keys + newsk->sk_family,
1331                                 af_family_clock_key_strings[newsk->sk_family]);
1332
1333                 newsk->sk_dst_cache     = NULL;
1334                 newsk->sk_wmem_queued   = 0;
1335                 newsk->sk_forward_alloc = 0;
1336                 newsk->sk_send_head     = NULL;
                     /* The clone inherits the user locks except SOCK_BINDPORT_LOCK. */
1337                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1338
1339                 sock_reset_flag(newsk, SOCK_DONE);
1340                 skb_queue_head_init(&newsk->sk_error_queue);
1341
                     /* The filter pointer was copied from the parent; charge it to the clone. */
1342                 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1343                 if (filter != NULL)
1344                         sk_filter_charge(newsk, filter);
1345
1346                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1347                         /* It is still raw copy of parent, so invalidate
1348                          * destructor and make plain sk_free() */
1349                         newsk->sk_destruct = NULL;
1350                         bh_unlock_sock(newsk);
1351                         sk_free(newsk);
1352                         newsk = NULL;
1353                         goto out;
1354                 }
1355
1356                 newsk->sk_err      = 0;
1357                 newsk->sk_priority = 0;
1358                 /*
1359                  * Before updating sk_refcnt, we must commit prior changes to memory
1360                  * (Documentation/RCU/rculist_nulls.txt for details)
1361                  */
1362                 smp_wmb();
1363                 atomic_set(&newsk->sk_refcnt, 2);
1364
1365                 /*
1366                  * Increment the counter in the same struct proto as the master
1367                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1368                  * is the same as sk->sk_prot->socks, as this field was copied
1369                  * with memcpy).
1370                  *
1371                  * This _changes_ the previous behaviour, where
1372                  * tcp_create_openreq_child always was incrementing the
1373                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1374                  * to be taken into account in all callers. -acme
1375                  */
1376                 sk_refcnt_debug_inc(newsk);
                     /* The clone has no struct socket and no wait queue yet. */
1377                 sk_set_socket(newsk, NULL);
1378                 newsk->sk_wq = NULL;
1379
1380                 sk_update_clone(sk, newsk);
1381
1382                 if (newsk->sk_prot->sockets_allocated)
1383                         sk_sockets_allocated_inc(newsk);
1384
                     /* Timestamp flags were copied; register the clone as a user too. */
1385                 if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1386                         net_enable_timestamp();
1387         }
1388 out:
1389         return newsk;
1390 }
1391 EXPORT_SYMBOL_GPL(sk_clone_lock);
1392
1393 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1394 {
1395         __sk_dst_set(sk, dst);
1396         sk->sk_route_caps = dst->dev->features;
1397         if (sk->sk_route_caps & NETIF_F_GSO)
1398                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1399         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1400         if (sk_can_gso(sk)) {
1401                 if (dst->header_len) {
1402                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1403                 } else {
1404                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1405                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1406                 }
1407         }
1408 }
1409 EXPORT_SYMBOL_GPL(sk_setup_caps);
1410
1411 void __init sk_init(void)
1412 {
1413         if (totalram_pages <= 4096) {
1414                 sysctl_wmem_max = 32767;
1415                 sysctl_rmem_max = 32767;
1416                 sysctl_wmem_default = 32767;
1417                 sysctl_rmem_default = 32767;
1418         } else if (totalram_pages >= 131072) {
1419                 sysctl_wmem_max = 131071;
1420                 sysctl_rmem_max = 131071;
1421         }
1422 }
1423
1424 /*
1425  *      Simple resource managers for sockets.
1426  */
1427
1428
1429 /*
1430  * Write buffer destructor automatically called from kfree_skb.
      *
      * Gives skb->truesize back to sk->sk_wmem_alloc and, unless the socket
      * has SOCK_USE_WRITE_QUEUE set, calls sk->sk_write_space(). When the
      * counter reaches zero the sock itself is freed through __sk_free().
1431  */
1432 void sock_wfree(struct sk_buff *skb)
1433 {
1434         struct sock *sk = skb->sk;
1435         unsigned int len = skb->truesize;
1436
1437         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1438                 /*
1439                  * Keep a reference on sk_wmem_alloc, this will be released
1440                  * after sk_write_space() call
1441                  */
1442                 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1443                 sk->sk_write_space(sk);
                     /* Only the single unit kept back above is left to release. */
1444                 len = 1;
1445         }
1446         /*
1447          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1448          * could not do because of in-flight packets
1449          */
1450         if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1451                 __sk_free(sk);
1452 }
1453 EXPORT_SYMBOL(sock_wfree);
1454
1455 /*
1456  * Read buffer destructor automatically called from kfree_skb.
1457  */
1458 void sock_rfree(struct sk_buff *skb)
1459 {
1460         struct sock *sk = skb->sk;
1461         unsigned int len = skb->truesize;
1462
1463         atomic_sub(len, &sk->sk_rmem_alloc);
1464         sk_mem_uncharge(sk, len);
1465 }
1466 EXPORT_SYMBOL(sock_rfree);
1467
1468 void sock_edemux(struct sk_buff *skb)
1469 {
1470         sock_put(skb->sk);
1471 }
1472 EXPORT_SYMBOL(sock_edemux);
1473
1474 int sock_i_uid(struct sock *sk)
1475 {
1476         int uid;
1477
1478         read_lock_bh(&sk->sk_callback_lock);
1479         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1480         read_unlock_bh(&sk->sk_callback_lock);
1481         return uid;
1482 }
1483 EXPORT_SYMBOL(sock_i_uid);
1484
1485 unsigned long sock_i_ino(struct sock *sk)
1486 {
1487         unsigned long ino;
1488
1489         read_lock_bh(&sk->sk_callback_lock);
1490         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1491         read_unlock_bh(&sk->sk_callback_lock);
1492         return ino;
1493 }
1494 EXPORT_SYMBOL(sock_i_ino);
1495
1496 /*
1497  * Allocate a skb from the socket's send buffer.
1498  */
1499 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1500                              gfp_t priority)
1501 {
1502         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1503                 struct sk_buff *skb = alloc_skb(size, priority);
1504                 if (skb) {
1505                         skb_set_owner_w(skb, sk);
1506                         return skb;
1507                 }
1508         }
1509         return NULL;
1510 }
1511 EXPORT_SYMBOL(sock_wmalloc);
1512
1513 /*
1514  * Allocate a skb from the socket's receive buffer.
1515  */
1516 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1517                              gfp_t priority)
1518 {
1519         if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1520                 struct sk_buff *skb = alloc_skb(size, priority);
1521                 if (skb) {
1522                         skb_set_owner_r(skb, sk);
1523                         return skb;
1524                 }
1525         }
1526         return NULL;
1527 }
1528
1529 /*
1530  * Allocate a memory block from the socket's option memory buffer.
1531  */
1532 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1533 {
1534         if ((unsigned int)size <= sysctl_optmem_max &&
1535             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1536                 void *mem;
1537                 /* First do the add, to avoid the race if kmalloc
1538                  * might sleep.
1539                  */
1540                 atomic_add(size, &sk->sk_omem_alloc);
1541                 mem = kmalloc(size, priority);
1542                 if (mem)
1543                         return mem;
1544                 atomic_sub(size, &sk->sk_omem_alloc);
1545         }
1546         return NULL;
1547 }
1548 EXPORT_SYMBOL(sock_kmalloc);
1549
1550 /*
1551  * Free an option memory block.
      *
      * @mem is released with kfree() and @size is subtracted from
      * sk->sk_omem_alloc.
      * NOTE(review): @size presumably has to equal the size passed to
      * sock_kmalloc() for the accounting to balance - confirm with callers.
1552  */
1553 void sock_kfree_s(struct sock *sk, void *mem, int size)
1554 {
1555         kfree(mem);
1556         atomic_sub(size, &sk->sk_omem_alloc);
1557 }
1558 EXPORT_SYMBOL(sock_kfree_s);
1559
1560 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1561    I think, these locks should be removed for datagram sockets.
     
        Sleeps interruptibly on the socket's wait queue until sk_wmem_alloc
        drops below sk_sndbuf, the send side is shut down, sk_err is set, a
        signal is pending or @timeo runs out. Returns the remaining timeout.
1562  */
1563 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1564 {
1565         DEFINE_WAIT(wait);
1566
1567         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1568         for (;;) {
1569                 if (!timeo)
1570                         break;
1571                 if (signal_pending(current))
1572                         break;
1573                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                     /* Queue ourselves before testing the wake-up conditions. */
1574                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1575                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1576                         break;
1577                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1578                         break;
1579                 if (sk->sk_err)
1580                         break;
1581                 timeo = schedule_timeout(timeo);
1582         }
1583         finish_wait(sk_sleep(sk), &wait);
1584         return timeo;
1585 }
1586
1587
1588 /*
1589  *      Generic send/receive buffer handlers
1590  */
1591
1592 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1593                                      unsigned long data_len, int noblock,
1594                                      int *errcode)
1595 {
             /*
              * Allocate a send skb with @header_len bytes requested from
              * alloc_skb() and @data_len bytes spread over freshly allocated
              * pages attached as fragments.
              *
              * While sk_wmem_alloc is not below sk_sndbuf the caller waits in
              * sock_wait_for_wmem(), bounded by the timeout returned by
              * sock_sndtimeo(sk, noblock). On success the skb is handed to
              * skb_set_owner_w() and returned. On failure NULL is returned and
              * *errcode is set: -EMSGSIZE when more than MAX_SKB_FRAGS pages
              * would be needed, the pending sock_error(), -EPIPE after a send
              * shutdown, -ENOBUFS on allocation failure, -EAGAIN when the
              * timeout is exhausted, or sock_intr_errno() when a signal is
              * pending.
              */
1596         struct sk_buff *skb;
1597         gfp_t gfp_mask;
1598         long timeo;
1599         int err;
             /* Number of pages needed for data_len, rounded up. */
1600         int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1601
1602         err = -EMSGSIZE;
1603         if (npages > MAX_SKB_FRAGS)
1604                 goto failure;
1605
1606         gfp_mask = sk->sk_allocation;
1607         if (gfp_mask & __GFP_WAIT)
1608                 gfp_mask |= __GFP_REPEAT;
1609
1610         timeo = sock_sndtimeo(sk, noblock);
1611         while (1) {
1612                 err = sock_error(sk);
1613                 if (err != 0)
1614                         goto failure;
1615
1616                 err = -EPIPE;
1617                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1618                         goto failure;
1619
1620                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1621                         skb = alloc_skb(header_len, gfp_mask);
1622                         if (skb) {
1623                                 int i;
1624
1625                                 /* No pages, we're done... */
1626                                 if (!data_len)
1627                                         break;
1628
1629                                 skb->truesize += data_len;
1630                                 skb_shinfo(skb)->nr_frags = npages;
1631                                 for (i = 0; i < npages; i++) {
1632                                         struct page *page;
1633
                                             /* Pages use sk_allocation, not gfp_mask. */
1634                                         page = alloc_pages(sk->sk_allocation, 0);
1635                                         if (!page) {
1636                                                 err = -ENOBUFS;
                                                     /*
                                                      * Only i fragments were filled in.
                                                      * NOTE(review): presumably so that
                                                      * kfree_skb() releases just those
                                                      * pages - confirm.
                                                      */
1637                                                 skb_shinfo(skb)->nr_frags = i;
1638                                                 kfree_skb(skb);
1639                                                 goto failure;
1640                                         }
1641
1642                                         __skb_fill_page_desc(skb, i,
1643                                                         page, 0,
1644                                                         (data_len >= PAGE_SIZE ?
1645                                                          PAGE_SIZE :
1646                                                          data_len));
                                             /*
                                              * May wrap on a partial last page;
                                              * data_len is not read again after
                                              * the loop ends.
                                              */
1647                                         data_len -= PAGE_SIZE;
1648                                 }
1649
1650                                 /* Full success... */
1651                                 break;
1652                         }
1653                         err = -ENOBUFS;
1654                         goto failure;
1655                 }
                     /* Send buffer is full: flag it, then fail or wait. */
1656                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1657                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1658                 err = -EAGAIN;
1659                 if (!timeo)
1660                         goto failure;
1661                 if (signal_pending(current))
1662                         goto interrupted;
1663                 timeo = sock_wait_for_wmem(sk, timeo);
1664         }
1665
1666         skb_set_owner_w(skb, sk);
1667         return skb;
1668
1669 interrupted:
1670         err = sock_intr_errno(timeo);
1671 failure:
1672         *errcode = err;
1673         return NULL;
1674 }
1675 EXPORT_SYMBOL(sock_alloc_send_pskb);
1676
/*
 * Linear-only variant of sock_alloc_send_pskb(): the whole @size goes
 * into the header part and no page fragments are requested.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long no_paged_data = 0;

	return sock_alloc_send_pskb(sk, size, no_paged_data, noblock, errcode);
}
1682 EXPORT_SYMBOL(sock_alloc_send_skb);
1683
1684 static void __lock_sock(struct sock *sk)
1685         __releases(&sk->sk_lock.slock)
1686         __acquires(&sk->sk_lock.slock)
1687 {
1688         DEFINE_WAIT(wait);
1689
1690         for (;;) {
1691                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1692                                         TASK_UNINTERRUPTIBLE);
1693                 spin_unlock_bh(&sk->sk_lock.slock);
1694                 schedule();
1695                 spin_lock_bh(&sk->sk_lock.slock);
1696                 if (!sock_owned_by_user(sk))
1697                         break;
1698         }
1699         finish_wait(&sk->sk_lock.wq, &wait);
1700 }
1701
1702 static void __release_sock(struct sock *sk)
1703         __releases(&sk->sk_lock.slock)
1704         __acquires(&sk->sk_lock.slock)
1705 {
1706         struct sk_buff *skb = sk->sk_backlog.head;
1707
1708         do {
1709                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1710                 bh_unlock_sock(sk);
1711
1712                 do {
1713                         struct sk_buff *next = skb->next;
1714
1715                         prefetch(next);
1716                         WARN_ON_ONCE(skb_dst_is_noref(skb));
1717                         skb->next = NULL;
1718                         sk_backlog_rcv(sk, skb);
1719
1720                         /*
1721                          * We are in process context here with softirqs
1722                          * disabled, use cond_resched_softirq() to preempt.
1723                          * This is safe to do because we've taken the backlog
1724                          * queue private:
1725                          */
1726                         cond_resched_softirq();
1727
1728                         skb = next;
1729                 } while (skb != NULL);
1730
1731                 bh_lock_sock(sk);
1732         } while ((skb = sk->sk_backlog.head) != NULL);
1733
1734         /*
1735          * Doing the zeroing here guarantee we can not loop forever
1736          * while a wild producer attempts to flood us.
1737          */
1738         sk->sk_backlog.len = 0;
1739 }
1740
1741 /**
1742  * sk_wait_data - wait for data to arrive at sk_receive_queue
1743  * @sk:    sock to wait on
1744  * @timeo: for how long
1745  *
1746  * Now socket state including sk->sk_err is changed only under lock,
1747  * hence we may omit checks after joining wait queue.
1748  * We check receive queue before schedule() only as optimization;
1749  * it is very likely that release_sock() added new data.
1750  */
1751 int sk_wait_data(struct sock *sk, long *timeo)
1752 {
1753         int rc;
1754         DEFINE_WAIT(wait);
1755
1756         prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1757         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1758         rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1759         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1760         finish_wait(sk_sleep(sk), &wait);
1761         return rc;
1762 }
1763 EXPORT_SYMBOL(sk_wait_data);
1764
1765 /**
1766  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1767  *      @sk: socket
1768  *      @size: memory size to allocate
1769  *      @kind: allocation type
1770  *
1771  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1772  *      rmem allocation. This function assumes that protocols which have
1773  *      memory_pressure use sk_wmem_queued as write buffer accounting.
1774  */
1775 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1776 {
1777         struct proto *prot = sk->sk_prot;
1778         int amt = sk_mem_pages(size);
1779         long allocated;
1780         int parent_status = UNDER_LIMIT;
1781
1782         sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1783
1784         allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1785
1786         /* Under limit. */
1787         if (parent_status == UNDER_LIMIT &&
1788                         allocated <= sk_prot_mem_limits(sk, 0)) {
1789                 sk_leave_memory_pressure(sk);
1790                 return 1;
1791         }
1792
1793         /* Under pressure. (we or our parents) */
1794         if ((parent_status > SOFT_LIMIT) ||
1795                         allocated > sk_prot_mem_limits(sk, 1))
1796                 sk_enter_memory_pressure(sk);
1797
1798         /* Over hard limit (we or our parents) */
1799         if ((parent_status == OVER_LIMIT) ||
1800                         (allocated > sk_prot_mem_limits(sk, 2)))
1801                 goto suppress_allocation;
1802
1803         /* guarantee minimum buffer size under pressure */
1804         if (kind == SK_MEM_RECV) {
1805                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1806                         return 1;
1807
1808         } else { /* SK_MEM_SEND */
1809                 if (sk->sk_type == SOCK_STREAM) {
1810                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1811                                 return 1;
1812                 } else if (atomic_read(&sk->sk_wmem_alloc) <
1813                            prot->sysctl_wmem[0])
1814                                 return 1;
1815         }
1816
1817         if (sk_has_memory_pressure(sk)) {
1818                 int alloc;
1819
1820                 if (!sk_under_memory_pressure(sk))
1821                         return 1;
1822                 alloc = sk_sockets_allocated_read_positive(sk);
1823                 if (sk_prot_mem_limits(sk, 2) > alloc *
1824                     sk_mem_pages(sk->sk_wmem_queued +
1825                                  atomic_read(&sk->sk_rmem_alloc) +
1826                                  sk->sk_forward_alloc))
1827                         return 1;
1828         }
1829
1830 suppress_allocation:
1831
1832         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1833                 sk_stream_moderate_sndbuf(sk);
1834
1835                 /* Fail only if socket is _under_ its sndbuf.
1836                  * In this case we cannot block, so that we have to fail.
1837                  */
1838                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1839                         return 1;
1840         }
1841
1842         trace_sock_exceed_buf_limit(sk, prot, allocated);
1843
1844         /* Alas. Undo changes. */
1845         sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1846
1847         sk_memory_allocated_sub(sk, amt);
1848
1849         return 0;
1850 }
1851 EXPORT_SYMBOL(__sk_mem_schedule);
1852
1853 /**
1854  *      __sk_reclaim - reclaim memory_allocated
1855  *      @sk: socket
1856  */
1857 void __sk_mem_reclaim(struct sock *sk)
1858 {
1859         sk_memory_allocated_sub(sk,
1860                                 sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
1861         sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1862
1863         if (sk_under_memory_pressure(sk) &&
1864             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
1865                 sk_leave_memory_pressure(sk);
1866 }
1867 EXPORT_SYMBOL(__sk_mem_reclaim);
1868
1869
1870 /*
1871  * Set of default routines for initialising struct proto_ops when
1872  * the protocol does not support a particular function. In certain
1873  * cases where it makes no sense for a protocol to have a "do nothing"
1874  * function, some default processing is provided.
1875  */
1876
/* Default ->bind for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
1881 EXPORT_SYMBOL(sock_no_bind);
1882
/* Default ->connect for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
1888 EXPORT_SYMBOL(sock_no_connect);
1889
/* Default ->socketpair for protocols without socketpair support: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
1894 EXPORT_SYMBOL(sock_no_socketpair);
1895
/* Default ->accept for protocols without accept support: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
1900 EXPORT_SYMBOL(sock_no_accept);
1901
/*
 * Default ->getname for protocols without getname support: always
 * -EOPNOTSUPP; neither @saddr nor @len is written.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
1907 EXPORT_SYMBOL(sock_no_getname);
1908
1909 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1910 {
1911         return 0;
1912 }
1913 EXPORT_SYMBOL(sock_no_poll);
1914
/* Default ->ioctl for protocols without ioctl support: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
1919 EXPORT_SYMBOL(sock_no_ioctl);
1920
/* Default ->listen for protocols without listen support: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
1925 EXPORT_SYMBOL(sock_no_listen);
1926
/* Default ->shutdown for protocols without shutdown support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
1931 EXPORT_SYMBOL(sock_no_shutdown);
1932
1933 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1934                     char __user *optval, unsigned int optlen)
1935 {
1936         return -EOPNOTSUPP;
1937 }
1938 EXPORT_SYMBOL(sock_no_setsockopt);
1939
1940 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1941                     char __user *optval, int __user *optlen)
1942 {
1943         return -EOPNOTSUPP;
1944 }
1945 EXPORT_SYMBOL(sock_no_getsockopt);
1946
/* Default ->sendmsg for protocols without sendmsg support: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}
1952 EXPORT_SYMBOL(sock_no_sendmsg);
1953
/* Default ->recvmsg for protocols without recvmsg support: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
1959 EXPORT_SYMBOL(sock_no_recvmsg);
1960
/*
 * Default ->mmap for protocols without mmap support.  Returns -ENODEV
 * rather than -EOPNOTSUPP so the result mirrors the error code used for
 * a missing mmap method.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	return -ENODEV;
}
1966 EXPORT_SYMBOL(sock_no_mmap);
1967
1968 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1969 {
1970         ssize_t res;
1971         struct msghdr msg = {.msg_flags = flags};
1972         struct kvec iov;
1973         char *kaddr = kmap(page);
1974         iov.iov_base = kaddr + offset;
1975         iov.iov_len = size;
1976         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1977         kunmap(page);
1978         return res;
1979 }
1980 EXPORT_SYMBOL(sock_no_sendpage);
1981
1982 /*
1983  *      Default Socket Callbacks
1984  */
1985
1986 static void sock_def_wakeup(struct sock *sk)
1987 {
1988         struct socket_wq *wq;
1989
1990         rcu_read_lock();
1991         wq = rcu_dereference(sk->sk_wq);
1992         if (wq_has_sleeper(wq))
1993                 wake_up_interruptible_all(&wq->wait);
1994         rcu_read_unlock();
1995 }
1996
1997 static void sock_def_error_report(struct sock *sk)
1998 {
1999         struct socket_wq *wq;
2000
2001         rcu_read_lock();
2002         wq = rcu_dereference(sk->sk_wq);
2003         if (wq_has_sleeper(wq))
2004                 wake_up_interruptible_poll(&wq->wait, POLLERR);
2005         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2006         rcu_read_unlock();
2007 }
2008
2009 static void sock_def_readable(struct sock *sk, int len)
2010 {
2011         struct socket_wq *wq;
2012
2013         rcu_read_lock();
2014         wq = rcu_dereference(sk->sk_wq);
2015         if (wq_has_sleeper(wq))
2016                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2017                                                 POLLRDNORM | POLLRDBAND);
2018         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2019         rcu_read_unlock();
2020 }
2021
2022 static void sock_def_write_space(struct sock *sk)
2023 {
2024         struct socket_wq *wq;
2025
2026         rcu_read_lock();
2027
2028         /* Do not wake up a writer until he can make "significant"
2029          * progress.  --DaveM
2030          */
2031         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2032                 wq = rcu_dereference(sk->sk_wq);
2033                 if (wq_has_sleeper(wq))
2034                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2035                                                 POLLWRNORM | POLLWRBAND);
2036
2037                 /* Should agree with poll, otherwise some programs break */
2038                 if (sock_writeable(sk))
2039                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2040         }
2041
2042         rcu_read_unlock();
2043 }
2044
2045 static void sock_def_destruct(struct sock *sk)
2046 {
2047         kfree(sk->sk_protinfo);
2048 }
2049
2050 void sk_send_sigurg(struct sock *sk)
2051 {
2052         if (sk->sk_socket && sk->sk_socket->file)
2053                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2054                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2055 }
2056 EXPORT_SYMBOL(sk_send_sigurg);
2057
/*
 * (Re)arm @timer to fire at @expires.  A reference on @sk is taken only
 * when mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
2064 EXPORT_SYMBOL(sk_reset_timer);
2065
/*
 * Cancel @timer.  The socket reference is dropped with __sock_put() only
 * when the timer was pending and del_timer() reports non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (!timer_pending(timer))
		return;

	if (del_timer(timer))
		__sock_put(sk);
}
2071 EXPORT_SYMBOL(sk_stop_timer);
2072
2073 void sock_init_data(struct socket *sock, struct sock *sk)
2074 {
2075         skb_queue_head_init(&sk->sk_receive_queue);
2076         skb_queue_head_init(&sk->sk_write_queue);
2077         skb_queue_head_init(&sk->sk_error_queue);
2078 #ifdef CONFIG_NET_DMA
2079         skb_queue_head_init(&sk->sk_async_wait_queue);
2080 #endif
2081
2082         sk->sk_send_head        =       NULL;
2083
2084         init_timer(&sk->sk_timer);
2085
2086         sk->sk_allocation       =       GFP_KERNEL;
2087         sk->sk_rcvbuf           =       sysctl_rmem_default;
2088         sk->sk_sndbuf           =       sysctl_wmem_default;
2089         sk->sk_state            =       TCP_CLOSE;
2090         sk_set_socket(sk, sock);
2091
2092         sock_set_flag(sk, SOCK_ZAPPED);
2093
2094         if (sock) {
2095                 sk->sk_type     =       sock->type;
2096                 sk->sk_wq       =       sock->wq;
2097                 sock->sk        =       sk;
2098         } else
2099                 sk->sk_wq       =       NULL;
2100
2101         spin_lock_init(&sk->sk_dst_lock);
2102         rwlock_init(&sk->sk_callback_lock);
2103         lockdep_set_class_and_name(&sk->sk_callback_lock,
2104                         af_callback_keys + sk->sk_family,
2105                         af_family_clock_key_strings[sk->sk_family]);
2106
2107         sk->sk_state_change     =       sock_def_wakeup;
2108         sk->sk_data_ready       =       sock_def_readable;
2109         sk->sk_write_space      =       sock_def_write_space;
2110         sk->sk_error_report     =       sock_def_error_report;
2111         sk->sk_destruct         =       sock_def_destruct;
2112
2113         sk->sk_sndmsg_page      =       NULL;
2114         sk->sk_sndmsg_off       =       0;
2115         sk->sk_peek_off         =       -1;
2116
2117         sk->sk_peer_pid         =       NULL;
2118         sk->sk_peer_cred        =       NULL;
2119         sk->sk_write_pending    =       0;
2120         sk->sk_rcvlowat         =       1;
2121         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2122         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2123
2124         sk->sk_stamp = ktime_set(-1L, 0);
2125
2126         /*
2127          * Before updating sk_refcnt, we must commit prior changes to memory
2128          * (Documentation/RCU/rculist_nulls.txt for details)
2129          */
2130         smp_wmb();
2131         atomic_set(&sk->sk_refcnt, 1);
2132         atomic_set(&sk->sk_drops, 0);
2133 }
2134 EXPORT_SYMBOL(sock_init_data);
2135
2136 void lock_sock_nested(struct sock *sk, int subclass)
2137 {
2138         might_sleep();
2139         spin_lock_bh(&sk->sk_lock.slock);
2140         if (sk->sk_lock.owned)
2141                 __lock_sock(sk);
2142         sk->sk_lock.owned = 1;
2143         spin_unlock(&sk->sk_lock.slock);
2144         /*
2145          * The sk_lock has mutex_lock() semantics here:
2146          */
2147         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2148         local_bh_enable();
2149 }
2150 EXPORT_SYMBOL(lock_sock_nested);
2151
2152 void release_sock(struct sock *sk)
2153 {
2154         /*
2155          * The sk_lock has mutex_unlock() semantics:
2156          */
2157         mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2158
2159         spin_lock_bh(&sk->sk_lock.slock);
2160         if (sk->sk_backlog.tail)
2161                 __release_sock(sk);
2162
2163         if (sk->sk_prot->release_cb)
2164                 sk->sk_prot->release_cb(sk);
2165
2166         sk->sk_lock.owned = 0;
2167         if (waitqueue_active(&sk->sk_lock.wq))
2168                 wake_up(&sk->sk_lock.wq);
2169         spin_unlock_bh(&sk->sk_lock.slock);
2170 }
2171 EXPORT_SYMBOL(release_sock);
2172
2173 /**
2174  * lock_sock_fast - fast version of lock_sock
2175  * @sk: socket
2176  *
2177  * This version should be used for very small section, where process wont block
2178  * return false if fast path is taken
2179  *   sk_lock.slock locked, owned = 0, BH disabled
2180  * return true if slow path is taken
2181  *   sk_lock.slock unlocked, owned = 1, BH enabled
2182  */
2183 bool lock_sock_fast(struct sock *sk)
2184 {
2185         might_sleep();
2186         spin_lock_bh(&sk->sk_lock.slock);
2187
2188         if (!sk->sk_lock.owned)
2189                 /*
2190                  * Note : We must disable BH
2191                  */
2192                 return false;
2193
2194         __lock_sock(sk);
2195         sk->sk_lock.owned = 1;
2196         spin_unlock(&sk->sk_lock.slock);
2197         /*
2198          * The sk_lock has mutex_lock() semantics here:
2199          */
2200         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2201         local_bh_enable();
2202         return true;
2203 }
2204 EXPORT_SYMBOL(lock_sock_fast);
2205
2206 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2207 {
2208         struct timeval tv;
2209         if (!sock_flag(sk, SOCK_TIMESTAMP))
2210                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2211         tv = ktime_to_timeval(sk->sk_stamp);
2212         if (tv.tv_sec == -1)
2213                 return -ENOENT;
2214         if (tv.tv_sec == 0) {
2215                 sk->sk_stamp = ktime_get_real();
2216                 tv = ktime_to_timeval(sk->sk_stamp);
2217         }
2218         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2219 }
2220 EXPORT_SYMBOL(sock_get_timestamp);
2221
2222 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2223 {
2224         struct timespec ts;
2225         if (!sock_flag(sk, SOCK_TIMESTAMP))
2226                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2227         ts = ktime_to_timespec(sk->sk_stamp);
2228         if (ts.tv_sec == -1)
2229                 return -ENOENT;
2230         if (ts.tv_sec == 0) {
2231                 sk->sk_stamp = ktime_get_real();
2232                 ts = ktime_to_timespec(sk->sk_stamp);
2233         }
2234         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2235 }
2236 EXPORT_SYMBOL(sock_get_timestampns);
2237
2238 void sock_enable_timestamp(struct sock *sk, int flag)
2239 {
2240         if (!sock_flag(sk, flag)) {
2241                 unsigned long previous_flags = sk->sk_flags;
2242
2243                 sock_set_flag(sk, flag);
2244                 /*
2245                  * we just set one of the two flags which require net
2246                  * time stamping, but time stamping might have been on
2247                  * already because of the other one
2248                  */
2249                 if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2250                         net_enable_timestamp();
2251         }
2252 }
2253
2254 /*
2255  *      Get a socket option on an socket.
2256  *
2257  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2258  *      asynchronous errors should be reported by getsockopt. We assume
2259  *      this means if you specify SO_ERROR (otherwise whats the point of it).
2260  */
2261 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2262                            char __user *optval, int __user *optlen)
2263 {
2264         struct sock *sk = sock->sk;
2265
2266         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2267 }
2268 EXPORT_SYMBOL(sock_common_getsockopt);
2269
2270 #ifdef CONFIG_COMPAT
2271 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2272                                   char __user *optval, int __user *optlen)
2273 {
2274         struct sock *sk = sock->sk;
2275
2276         if (sk->sk_prot->compat_getsockopt != NULL)
2277                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2278                                                       optval, optlen);
2279         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2280 }
2281 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2282 #endif
2283
2284 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2285                         struct msghdr *msg, size_t size, int flags)
2286 {
2287         struct sock *sk = sock->sk;
2288         int addr_len = 0;
2289         int err;
2290
2291         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2292                                    flags & ~MSG_DONTWAIT, &addr_len);
2293         if (err >= 0)
2294                 msg->msg_namelen = addr_len;
2295         return err;
2296 }
2297 EXPORT_SYMBOL(sock_common_recvmsg);
2298
2299 /*
2300  *      Set socket options on an inet socket.
2301  */
2302 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2303                            char __user *optval, unsigned int optlen)
2304 {
2305         struct sock *sk = sock->sk;
2306
2307         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2308 }
2309 EXPORT_SYMBOL(sock_common_setsockopt);
2310
2311 #ifdef CONFIG_COMPAT
2312 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2313                                   char __user *optval, unsigned int optlen)
2314 {
2315         struct sock *sk = sock->sk;
2316
2317         if (sk->sk_prot->compat_setsockopt != NULL)
2318                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2319                                                       optval, optlen);
2320         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2321 }
2322 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2323 #endif
2324
2325 void sk_common_release(struct sock *sk)
2326 {
2327         if (sk->sk_prot->destroy)
2328                 sk->sk_prot->destroy(sk);
2329
2330         /*
2331          * Observation: when sock_common_release is called, processes have
2332          * no access to socket. But net still has.
2333          * Step one, detach it from networking:
2334          *
2335          * A. Remove from hash tables.
2336          */
2337
2338         sk->sk_prot->unhash(sk);
2339
2340         /*
2341          * In this point socket cannot receive new packets, but it is possible
2342          * that some packets are in flight because some CPU runs receiver and
2343          * did hash table lookup before we unhashed socket. They will achieve
2344          * receive queue and will be purged by socket destructor.
2345          *
2346          * Also we still have packets pending on receive queue and probably,
2347          * our own packets waiting in device queues. sock_destroy will drain
2348          * receive queue, but transmitted packets will delay socket destruction
2349          * until the last reference will be released.
2350          */
2351
2352         sock_orphan(sk);
2353
2354         xfrm_sk_free_policy(sk);
2355
2356         sk_refcnt_debug_release(sk);
2357         sock_put(sk);
2358 }
2359 EXPORT_SYMBOL(sk_common_release);
2360
2361 #ifdef CONFIG_PROC_FS
/* Number of per-protocol "in use" counter slots. */
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
/* One counter per protocol slot, indexed by proto->inuse_idx. */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

/* Bitmap of the slot indexes that have been handed out to protocols. */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2368
2369 #ifdef CONFIG_NET_NS
2370 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2371 {
2372         __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2373 }
2374 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2375
2376 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2377 {
2378         int cpu, idx = prot->inuse_idx;
2379         int res = 0;
2380
2381         for_each_possible_cpu(cpu)
2382                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2383
2384         return res >= 0 ? res : 0;
2385 }
2386 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2387
2388 static int __net_init sock_inuse_init_net(struct net *net)
2389 {
2390         net->core.inuse = alloc_percpu(struct prot_inuse);
2391         return net->core.inuse ? 0 : -ENOMEM;
2392 }
2393
2394 static void __net_exit sock_inuse_exit_net(struct net *net)
2395 {
2396         free_percpu(net->core.inuse);
2397 }
2398
/* Per-namespace hooks that allocate and free the in-use counters. */
static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};
2403
2404 static __init int net_inuse_init(void)
2405 {
2406         if (register_pernet_subsys(&net_inuse_ops))
2407                 panic("Cannot initialize net inuse counters");
2408
2409         return 0;
2410 }
2411
2412 core_initcall(net_inuse_init);
2413 #else
2414 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2415
2416 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2417 {
2418         __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2419 }
2420 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2421
2422 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2423 {
2424         int cpu, idx = prot->inuse_idx;
2425         int res = 0;
2426
2427         for_each_possible_cpu(cpu)
2428                 res += per_cpu(prot_inuse, cpu).val[idx];
2429
2430         return res >= 0 ? res : 0;
2431 }
2432 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2433 #endif
2434
2435 static void assign_proto_idx(struct proto *prot)
2436 {
2437         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2438
2439         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2440                 pr_err("PROTO_INUSE_NR exhausted\n");
2441                 return;
2442         }
2443
2444         set_bit(prot->inuse_idx, proto_inuse_idx);
2445 }
2446
2447 static void release_proto_idx(struct proto *prot)
2448 {
2449         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2450                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2451 }
2452 #else
/* No-op stub used when CONFIG_PROC_FS is not set. */
static inline void assign_proto_idx(struct proto *prot)
{
}
2456
/* No-op stub used when CONFIG_PROC_FS is not set. */
static inline void release_proto_idx(struct proto *prot)
{
}
2460 #endif
2461
2462 int proto_register(struct proto *prot, int alloc_slab)
2463 {
2464         if (alloc_slab) {
2465                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2466                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2467                                         NULL);
2468
2469                 if (prot->slab == NULL) {
2470                         pr_crit("%s: Can't create sock SLAB cache!\n",
2471                                 prot->name);
2472                         goto out;
2473                 }
2474
2475                 if (prot->rsk_prot != NULL) {
2476                         prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2477                         if (prot->rsk_prot->slab_name == NULL)
2478                                 goto out_free_sock_slab;
2479
2480                         prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2481                                                                  prot->rsk_prot->obj_size, 0,
2482                                                                  SLAB_HWCACHE_ALIGN, NULL);
2483
2484                         if (prot->rsk_prot->slab == NULL) {
2485                                 pr_crit("%s: Can't create request sock SLAB cache!\n",
2486                                         prot->name);
2487                                 goto out_free_request_sock_slab_name;
2488                         }
2489                 }
2490
2491                 if (prot->twsk_prot != NULL) {
2492                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2493
2494                         if (prot->twsk_prot->twsk_slab_name == NULL)
2495                                 goto out_free_request_sock_slab;
2496
2497                         prot->twsk_prot->twsk_slab =
2498                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2499                                                   prot->twsk_prot->twsk_obj_size,
2500                                                   0,
2501                                                   SLAB_HWCACHE_ALIGN |
2502                                                         prot->slab_flags,
2503                                                   NULL);
2504                         if (prot->twsk_prot->twsk_slab == NULL)
2505                                 goto out_free_timewait_sock_slab_name;
2506                 }
2507         }
2508
2509         mutex_lock(&proto_list_mutex);
2510         list_add(&prot->node, &proto_list);
2511         assign_proto_idx(prot);
2512         mutex_unlock(&proto_list_mutex);
2513         return 0;
2514
2515 out_free_timewait_sock_slab_name:
2516         kfree(prot->twsk_prot->twsk_slab_name);
2517 out_free_request_sock_slab:
2518         if (prot->rsk_prot && prot->rsk_prot->slab) {
2519                 kmem_cache_destroy(prot->rsk_prot->slab);
2520                 prot->rsk_prot->slab = NULL;
2521         }
2522 out_free_request_sock_slab_name:
2523         if (prot->rsk_prot)
2524                 kfree(prot->rsk_prot->slab_name);
2525 out_free_sock_slab:
2526         kmem_cache_destroy(prot->slab);
2527         prot->slab = NULL;
2528 out:
2529         return -ENOBUFS;
2530 }
2531 EXPORT_SYMBOL(proto_register);
2532
2533 void proto_unregister(struct proto *prot)
2534 {
2535         mutex_lock(&proto_list_mutex);
2536         release_proto_idx(prot);
2537         list_del(&prot->node);
2538         mutex_unlock(&proto_list_mutex);
2539
2540         if (prot->slab != NULL) {
2541                 kmem_cache_destroy(prot->slab);
2542                 prot->slab = NULL;
2543         }
2544
2545         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2546                 kmem_cache_destroy(prot->rsk_prot->slab);
2547                 kfree(prot->rsk_prot->slab_name);
2548                 prot->rsk_prot->slab = NULL;
2549         }
2550
2551         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2552                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2553                 kfree(prot->twsk_prot->twsk_slab_name);
2554                 prot->twsk_prot->twsk_slab = NULL;
2555         }
2556 }
2557 EXPORT_SYMBOL(proto_unregister);
2558
2559 #ifdef CONFIG_PROC_FS
2560 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2561         __acquires(proto_list_mutex)
2562 {
2563         mutex_lock(&proto_list_mutex);
2564         return seq_list_start_head(&proto_list, *pos);
2565 }
2566
2567 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2568 {
2569         return seq_list_next(v, &proto_list, pos);
2570 }
2571
2572 static void proto_seq_stop(struct seq_file *seq, void *v)
2573         __releases(proto_list_mutex)
2574 {
2575         mutex_unlock(&proto_list_mutex);
2576 }
2577
/*
 * Map a method pointer to the flag character shown in the protocols
 * table: 'y' when the pointer is set, 'n' when it is NULL.
 */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';

	return 'n';
}
2582 static long sock_prot_memory_allocated(struct proto *proto)
2583 {
2584         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2585 }
2586
2587 static char *sock_prot_memory_pressure(struct proto *proto)
2588 {
2589         return proto->memory_pressure != NULL ?
2590         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2591 }
2592
2593 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2594 {
2595
2596         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2597                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2598                    proto->name,
2599                    proto->obj_size,
2600                    sock_prot_inuse_get(seq_file_net(seq), proto),
2601                    sock_prot_memory_allocated(proto),
2602                    sock_prot_memory_pressure(proto),
2603                    proto->max_header,
2604                    proto->slab == NULL ? "no" : "yes",
2605                    module_name(proto->owner),
2606                    proto_method_implemented(proto->close),
2607                    proto_method_implemented(proto->connect),
2608                    proto_method_implemented(proto->disconnect),
2609                    proto_method_implemented(proto->accept),
2610                    proto_method_implemented(proto->ioctl),
2611                    proto_method_implemented(proto->init),
2612                    proto_method_implemented(proto->destroy),
2613                    proto_method_implemented(proto->shutdown),
2614                    proto_method_implemented(proto->setsockopt),
2615                    proto_method_implemented(proto->getsockopt),
2616                    proto_method_implemented(proto->sendmsg),
2617                    proto_method_implemented(proto->recvmsg),
2618                    proto_method_implemented(proto->sendpage),
2619                    proto_method_implemented(proto->bind),
2620                    proto_method_implemented(proto->backlog_rcv),
2621                    proto_method_implemented(proto->hash),
2622                    proto_method_implemented(proto->unhash),
2623                    proto_method_implemented(proto->get_port),
2624                    proto_method_implemented(proto->enter_memory_pressure));
2625 }
2626
2627 static int proto_seq_show(struct seq_file *seq, void *v)
2628 {
2629         if (v == &proto_list)
2630                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2631                            "protocol",
2632                            "size",
2633                            "sockets",
2634                            "memory",
2635                            "press",
2636                            "maxhdr",
2637                            "slab",
2638                            "module",
2639                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2640         else
2641                 proto_seq_printf(seq, list_entry(v, struct proto, node));
2642         return 0;
2643 }
2644
2645 static const struct seq_operations proto_seq_ops = {
2646         .start  = proto_seq_start,
2647         .next   = proto_seq_next,
2648         .stop   = proto_seq_stop,
2649         .show   = proto_seq_show,
2650 };
2651
2652 static int proto_seq_open(struct inode *inode, struct file *file)
2653 {
2654         return seq_open_net(inode, file, &proto_seq_ops,
2655                             sizeof(struct seq_net_private));
2656 }
2657
2658 static const struct file_operations proto_seq_fops = {
2659         .owner          = THIS_MODULE,
2660         .open           = proto_seq_open,
2661         .read           = seq_read,
2662         .llseek         = seq_lseek,
2663         .release        = seq_release_net,
2664 };
2665
2666 static __net_init int proto_init_net(struct net *net)
2667 {
2668         if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2669                 return -ENOMEM;
2670
2671         return 0;
2672 }
2673
2674 static __net_exit void proto_exit_net(struct net *net)
2675 {
2676         proc_net_remove(net, "protocols");
2677 }
2678
2679
2680 static __net_initdata struct pernet_operations proto_net_ops = {
2681         .init = proto_init_net,
2682         .exit = proto_exit_net,
2683 };
2684
2685 static int __init proto_init(void)
2686 {
2687         return register_pernet_subsys(&proto_net_ops);
2688 }
2689
2690 subsys_initcall(proto_init);
2691
2692 #endif /* PROC_FS */