net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #include <linux/capability.h>
  93 #include <linux/errno.h>
  94 #include <linux/types.h>
  95 #include <linux/socket.h>
  96 #include <linux/in.h>
  97 #include <linux/kernel.h>
  98 #include <linux/module.h>
  99 #include <linux/proc_fs.h>
 100 #include <linux/seq_file.h>
 101 #include <linux/sched.h>
 102 #include <linux/timer.h>
 103 #include <linux/string.h>
 104 #include <linux/sockios.h>
 105 #include <linux/net.h>
 106 #include <linux/mm.h>
 107 #include <linux/slab.h>
 108 #include <linux/interrupt.h>
 109 #include <linux/poll.h>
 110 #include <linux/tcp.h>
 111 #include <linux/init.h>
 112 #include <linux/highmem.h>
 113
 114 #include <asm/uaccess.h>
 115 #include <asm/system.h>
 116
 117 #include <linux/netdevice.h>
 118 #include <net/protocol.h>
 119 #include <linux/skbuff.h>
 120 #include <net/net_namespace.h>
 121 #include <net/request_sock.h>
 122 #include <net/sock.h>
 123 #include <net/xfrm.h>
 124 #include <linux/ipsec.h>
 125
 126 #include <linux/filter.h>
 127
 128 #ifdef CONFIG_INET
 129 #include <net/tcp.h>
 130 #endif
 131
 132 /*
 133  * Each address family might have different locking rules, so we have
 134  * one slock key per address family:
 135  */
 136 static struct lock_class_key af_family_keys[AF_MAX];
 137 static struct lock_class_key af_family_slock_keys[AF_MAX];
 138
 139 /*
 140  * Make lock validator output more readable. (we pre-construct these
 141  * strings build-time, so that runtime initialization of socket
 142  * locks is fast):
 143  */
 144 static const char *af_family_key_strings[AF_MAX+1] = {
 145   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 146   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 147   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 148   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 149   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 150   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 151   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 152   "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 153   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 154   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 155   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 156   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 157   "sk_lock-AF_MAX"
 158 };
 159 static const char *af_family_slock_key_strings[AF_MAX+1] = {
 160   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 161   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 162   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 163   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 164   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 165   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 166   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 167   "slock-21"       , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 168   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 169   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 170   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 171   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 172   "slock-AF_MAX"
 173 };
 174 static const char *af_family_clock_key_strings[AF_MAX+1] = {
 175   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 176   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 177   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 178   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 179   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 180   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 181   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 182   "clock-21"       , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 183   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 184   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 185   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 186   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 187   "clock-AF_MAX"
 188 };
 189
 190 /*
 191  * sk_callback_lock locking rules are per-address-family,
 192  * so split the lock classes by using a per-AF key:
 193  */
 194 static struct lock_class_key af_callback_keys[AF_MAX];
 195
 196 /* Take into consideration the size of the struct sk_buff overhead in the
 197  * determination of these values, since that is non-constant across
 198  * platforms.  This makes socket queueing behavior and performance
 199  * not depend upon such differences.
 200  */
 201 #define _SK_MEM_PACKETS         256
 202 #define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
 203 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 204 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 205
 206 /* Run time adjustable parameters. */
 207 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 208 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 209 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 210 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 211
 212 /* Maximal space eaten by iovec or ancilliary data plus some space */
 213 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 214
 215 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 216 {
 217         struct timeval tv;
 218
 219         if (optlen < sizeof(tv))
 220                 return -EINVAL;
 221         if (copy_from_user(&tv, optval, sizeof(tv)))
 222                 return -EFAULT;
 223         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 224                 return -EDOM;
 225
 226         if (tv.tv_sec < 0) {
 227                 static int warned __read_mostly;
 228
 229                 *timeo_p = 0;
 230                 if (warned < 10 && net_ratelimit()) {
 231                         warned++;
 232                         printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
 233                                "tries to set negative timeout\n",
 234                                 current->comm, task_pid_nr(current));
 235                 }
 236                 return 0;
 237         }
 238         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 239         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 240                 return 0;
 241         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 242                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 243         return 0;
 244 }
 245
 246 static void sock_warn_obsolete_bsdism(const char *name)
 247 {
 248         static int warned;
 249         static char warncomm[TASK_COMM_LEN];
 250         if (strcmp(warncomm, current->comm) && warned < 5) {
 251                 strcpy(warncomm,  current->comm);
 252                 printk(KERN_WARNING "process `%s' is using obsolete "
 253                        "%s SO_BSDCOMPAT\n", warncomm, name);
 254                 warned++;
 255         }
 256 }
 257
 258 static void sock_disable_timestamp(struct sock *sk)
 259 {
 260         if (sock_flag(sk, SOCK_TIMESTAMP)) {
 261                 sock_reset_flag(sk, SOCK_TIMESTAMP);
 262                 net_disable_timestamp();
 263         }
 264 }
 265
 266
 267 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 268 {
 269         int err = 0;
 270         int skb_len;
 271
 272         /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
 273            number of warnings when compiling with -W --ANK
 274          */
 275         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
 276             (unsigned)sk->sk_rcvbuf) {
 277                 err = -ENOMEM;
 278                 goto out;
 279         }
 280
 281         err = sk_filter(sk, skb);
 282         if (err)
 283                 goto out;
 284
 285         if (!sk_rmem_schedule(sk, skb->truesize)) {
 286                 err = -ENOBUFS;
 287                 goto out;
 288         }
 289
 290         skb->dev = NULL;
 291         skb_set_owner_r(skb, sk);
 292
 293         /* Cache the SKB length before we tack it onto the receive
 294          * queue.  Once it is added it no longer belongs to us and
 295          * may be freed by other threads of control pulling packets
 296          * from the queue.
 297          */
 298         skb_len = skb->len;
 299
 300         skb_queue_tail(&sk->sk_receive_queue, skb);
 301
 302         if (!sock_flag(sk, SOCK_DEAD))
 303                 sk->sk_data_ready(sk, skb_len);
 304 out:
 305         return err;
 306 }
 307 EXPORT_SYMBOL(sock_queue_rcv_skb);
 308
 309 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 310 {
 311         int rc = NET_RX_SUCCESS;
 312
 313         if (sk_filter(sk, skb))
 314                 goto discard_and_relse;
 315
 316         skb->dev = NULL;
 317
 318         if (nested)
 319                 bh_lock_sock_nested(sk);
 320         else
 321                 bh_lock_sock(sk);
 322         if (!sock_owned_by_user(sk)) {
 323                 /*
 324                  * trylock + unlock semantics:
 325                  */
 326                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 327
 328                 rc = sk_backlog_rcv(sk, skb);
 329
 330                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 331         } else
 332                 sk_add_backlog(sk, skb);
 333         bh_unlock_sock(sk);
 334 out:
 335         sock_put(sk);
 336         return rc;
 337 discard_and_relse:
 338         kfree_skb(skb);
 339         goto out;
 340 }
 341 EXPORT_SYMBOL(sk_receive_skb);
 342
 343 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 344 {
 345         struct dst_entry *dst = sk->sk_dst_cache;
 346
 347         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 348                 sk->sk_dst_cache = NULL;
 349                 dst_release(dst);
 350                 return NULL;
 351         }
 352
 353         return dst;
 354 }
 355 EXPORT_SYMBOL(__sk_dst_check);
 356
 357 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 358 {
 359         struct dst_entry *dst = sk_dst_get(sk);
 360
 361         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 362                 sk_dst_reset(sk);
 363                 dst_release(dst);
 364                 return NULL;
 365         }
 366
 367         return dst;
 368 }
 369 EXPORT_SYMBOL(sk_dst_check);
 370
 371 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
 372 {
 373         int ret = -ENOPROTOOPT;
 374 #ifdef CONFIG_NETDEVICES
 375         struct net *net = sock_net(sk);
 376         char devname[IFNAMSIZ];
 377         int index;
 378
 379         /* Sorry... */
 380         ret = -EPERM;
 381         if (!capable(CAP_NET_RAW))
 382                 goto out;
 383
 384         ret = -EINVAL;
 385         if (optlen < 0)
 386                 goto out;
 387
 388         /* Bind this socket to a particular device like "eth0",
 389          * as specified in the passed interface name. If the
 390          * name is "" or the option length is zero the socket
 391          * is not bound.
 392          */
 393         if (optlen > IFNAMSIZ - 1)
 394                 optlen = IFNAMSIZ - 1;
 395         memset(devname, 0, sizeof(devname));
 396
 397         ret = -EFAULT;
 398         if (copy_from_user(devname, optval, optlen))
 399                 goto out;
 400
 401         if (devname[0] == '\0') {
 402                 index = 0;
 403         } else {
 404                 struct net_device *dev = dev_get_by_name(net, devname);
 405
 406                 ret = -ENODEV;
 407                 if (!dev)
 408                         goto out;
 409
 410                 index = dev->ifindex;
 411                 dev_put(dev);
 412         }
 413
 414         lock_sock(sk);
 415         sk->sk_bound_dev_if = index;
 416         sk_dst_reset(sk);
 417         release_sock(sk);
 418
 419         ret = 0;
 420
 421 out:
 422 #endif
 423
 424         return ret;
 425 }
 426
 427 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 428 {
 429         if (valbool)
 430                 sock_set_flag(sk, bit);
 431         else
 432                 sock_reset_flag(sk, bit);
 433 }
 434
 435 /*
 436  *      This is meant for all protocols to use and covers goings on
 437  *      at the socket level. Everything here is generic.
 438  */
 439
 440 int sock_setsockopt(struct socket *sock, int level, int optname,
 441                     char __user *optval, int optlen)
 442 {
 443         struct sock *sk=sock->sk;
 444         int val;
 445         int valbool;
 446         struct linger ling;
 447         int ret = 0;
 448
 449         /*
 450          *      Options without arguments
 451          */
 452
 453         if (optname == SO_BINDTODEVICE)
 454                 return sock_bindtodevice(sk, optval, optlen);
 455
 456         if (optlen < sizeof(int))
 457                 return -EINVAL;
 458
 459         if (get_user(val, (int __user *)optval))
 460                 return -EFAULT;
 461
 462         valbool = val?1:0;
 463
 464         lock_sock(sk);
 465
 466         switch(optname) {
 467         case SO_DEBUG:
 468                 if (val && !capable(CAP_NET_ADMIN)) {
 469                         ret = -EACCES;
 470                 } else
 471                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 472                 break;
 473         case SO_REUSEADDR:
 474                 sk->sk_reuse = valbool;
 475                 break;
 476         case SO_TYPE:
 477         case SO_ERROR:
 478                 ret = -ENOPROTOOPT;
 479                 break;
 480         case SO_DONTROUTE:
 481                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 482                 break;
 483         case SO_BROADCAST:
 484                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 485                 break;
 486         case SO_SNDBUF:
 487                 /* Don't error on this BSD doesn't and if you think
 488                    about it this is right. Otherwise apps have to
 489                    play 'guess the biggest size' games. RCVBUF/SNDBUF
 490                    are treated in BSD as hints */
 491
 492                 if (val > sysctl_wmem_max)
 493                         val = sysctl_wmem_max;
 494 set_sndbuf:
 495                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 496                 if ((val * 2) < SOCK_MIN_SNDBUF)
 497                         sk->sk_sndbuf = SOCK_MIN_SNDBUF;
 498                 else
 499                         sk->sk_sndbuf = val * 2;
 500
 501                 /*
 502                  *      Wake up sending tasks if we
 503                  *      upped the value.
 504                  */
 505                 sk->sk_write_space(sk);
 506                 break;
 507
 508         case SO_SNDBUFFORCE:
 509                 if (!capable(CAP_NET_ADMIN)) {
 510                         ret = -EPERM;
 511                         break;
 512                 }
 513                 goto set_sndbuf;
 514
 515         case SO_RCVBUF:
 516                 /* Don't error on this BSD doesn't and if you think
 517                    about it this is right. Otherwise apps have to
 518                    play 'guess the biggest size' games. RCVBUF/SNDBUF
 519                    are treated in BSD as hints */
 520
 521                 if (val > sysctl_rmem_max)
 522                         val = sysctl_rmem_max;
 523 set_rcvbuf:
 524                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 525                 /*
 526                  * We double it on the way in to account for
 527                  * "struct sk_buff" etc. overhead.   Applications
 528                  * assume that the SO_RCVBUF setting they make will
 529                  * allow that much actual data to be received on that
 530                  * socket.
 531                  *
 532                  * Applications are unaware that "struct sk_buff" and
 533                  * other overheads allocate from the receive buffer
 534                  * during socket buffer allocation.
 535                  *
 536                  * And after considering the possible alternatives,
 537                  * returning the value we actually used in getsockopt
 538                  * is the most desirable behavior.
 539                  */
 540                 if ((val * 2) < SOCK_MIN_RCVBUF)
 541                         sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
 542                 else
 543                         sk->sk_rcvbuf = val * 2;
 544                 break;
 545
 546         case SO_RCVBUFFORCE:
 547                 if (!capable(CAP_NET_ADMIN)) {
 548                         ret = -EPERM;
 549                         break;
 550                 }
 551                 goto set_rcvbuf;
 552
 553         case SO_KEEPALIVE:
 554 #ifdef CONFIG_INET
 555                 if (sk->sk_protocol == IPPROTO_TCP)
 556                         tcp_set_keepalive(sk, valbool);
 557 #endif
 558                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 559                 break;
 560
 561         case SO_OOBINLINE:
 562                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 563                 break;
 564
 565         case SO_NO_CHECK:
 566                 sk->sk_no_check = valbool;
 567                 break;
 568
 569         case SO_PRIORITY:
 570                 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
 571                         sk->sk_priority = val;
 572                 else
 573                         ret = -EPERM;
 574                 break;
 575
 576         case SO_LINGER:
 577                 if (optlen < sizeof(ling)) {
 578                         ret = -EINVAL;  /* 1003.1g */
 579                         break;
 580                 }
 581                 if (copy_from_user(&ling,optval,sizeof(ling))) {
 582                         ret = -EFAULT;
 583                         break;
 584                 }
 585                 if (!ling.l_onoff)
 586                         sock_reset_flag(sk, SOCK_LINGER);
 587                 else {
 588 #if (BITS_PER_LONG == 32)
 589                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 590                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 591                         else
 592 #endif
 593                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 594                         sock_set_flag(sk, SOCK_LINGER);
 595                 }
 596                 break;
 597
 598         case SO_BSDCOMPAT:
 599                 sock_warn_obsolete_bsdism("setsockopt");
 600                 break;
 601
 602         case SO_PASSCRED:
 603                 if (valbool)
 604                         set_bit(SOCK_PASSCRED, &sock->flags);
 605                 else
 606                         clear_bit(SOCK_PASSCRED, &sock->flags);
 607                 break;
 608
 609         case SO_TIMESTAMP:
 610         case SO_TIMESTAMPNS:
 611                 if (valbool)  {
 612                         if (optname == SO_TIMESTAMP)
 613                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 614                         else
 615                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 616                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 617                         sock_enable_timestamp(sk);
 618                 } else {
 619                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 620                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 621                 }
 622                 break;
 623
 624         case SO_RCVLOWAT:
 625                 if (val < 0)
 626                         val = INT_MAX;
 627                 sk->sk_rcvlowat = val ? : 1;
 628                 break;
 629
 630         case SO_RCVTIMEO:
 631                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 632                 break;
 633
 634         case SO_SNDTIMEO:
 635                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 636                 break;
 637
 638         case SO_ATTACH_FILTER:
 639                 ret = -EINVAL;
 640                 if (optlen == sizeof(struct sock_fprog)) {
 641                         struct sock_fprog fprog;
 642
 643                         ret = -EFAULT;
 644                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 645                                 break;
 646
 647                         ret = sk_attach_filter(&fprog, sk);
 648                 }
 649                 break;
 650
 651         case SO_DETACH_FILTER:
 652                 ret = sk_detach_filter(sk);
 653                 break;
 654
 655         case SO_PASSSEC:
 656                 if (valbool)
 657                         set_bit(SOCK_PASSSEC, &sock->flags);
 658                 else
 659                         clear_bit(SOCK_PASSSEC, &sock->flags);
 660                 break;
 661         case SO_MARK:
 662                 if (!capable(CAP_NET_ADMIN))
 663                         ret = -EPERM;
 664                 else {
 665                         sk->sk_mark = val;
 666                 }
 667                 break;
 668
 669                 /* We implement the SO_SNDLOWAT etc to
 670                    not be settable (1003.1g 5.3) */
 671         default:
 672                 ret = -ENOPROTOOPT;
 673                 break;
 674         }
 675         release_sock(sk);
 676         return ret;
 677 }
 678
 679
 680 int sock_getsockopt(struct socket *sock, int level, int optname,
 681                     char __user *optval, int __user *optlen)
 682 {
 683         struct sock *sk = sock->sk;
 684
 685         union {
 686                 int val;
 687                 struct linger ling;
 688                 struct timeval tm;
 689         } v;
 690
 691         unsigned int lv = sizeof(int);
 692         int len;
 693
 694         if (get_user(len, optlen))
 695                 return -EFAULT;
 696         if (len < 0)
 697                 return -EINVAL;
 698
 699         switch(optname) {
 700         case SO_DEBUG:
 701                 v.val = sock_flag(sk, SOCK_DBG);
 702                 break;
 703
 704         case SO_DONTROUTE:
 705                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
 706                 break;
 707
 708         case SO_BROADCAST:
 709                 v.val = !!sock_flag(sk, SOCK_BROADCAST);
 710                 break;
 711
 712         case SO_SNDBUF:
 713                 v.val = sk->sk_sndbuf;
 714                 break;
 715
 716         case SO_RCVBUF:
 717                 v.val = sk->sk_rcvbuf;
 718                 break;
 719
 720         case SO_REUSEADDR:
 721                 v.val = sk->sk_reuse;
 722                 break;
 723
 724         case SO_KEEPALIVE:
 725                 v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
 726                 break;
 727
 728         case SO_TYPE:
 729                 v.val = sk->sk_type;
 730                 break;
 731
 732         case SO_ERROR:
 733                 v.val = -sock_error(sk);
 734                 if (v.val==0)
 735                         v.val = xchg(&sk->sk_err_soft, 0);
 736                 break;
 737
 738         case SO_OOBINLINE:
 739                 v.val = !!sock_flag(sk, SOCK_URGINLINE);
 740                 break;
 741
 742         case SO_NO_CHECK:
 743                 v.val = sk->sk_no_check;
 744                 break;
 745
 746         case SO_PRIORITY:
 747                 v.val = sk->sk_priority;
 748                 break;
 749
 750         case SO_LINGER:
 751                 lv              = sizeof(v.ling);
 752                 v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
 753                 v.ling.l_linger = sk->sk_lingertime / HZ;
 754                 break;
 755
 756         case SO_BSDCOMPAT:
 757                 sock_warn_obsolete_bsdism("getsockopt");
 758                 break;
 759
 760         case SO_TIMESTAMP:
 761                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
 762                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
 763                 break;
 764
 765         case SO_TIMESTAMPNS:
 766                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
 767                 break;
 768
 769         case SO_RCVTIMEO:
 770                 lv=sizeof(struct timeval);
 771                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
 772                         v.tm.tv_sec = 0;
 773                         v.tm.tv_usec = 0;
 774                 } else {
 775                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
 776                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
 777                 }
 778                 break;
 779
 780         case SO_SNDTIMEO:
 781                 lv=sizeof(struct timeval);
 782                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
 783                         v.tm.tv_sec = 0;
 784                         v.tm.tv_usec = 0;
 785                 } else {
 786                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
 787                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
 788                 }
 789                 break;
 790
 791         case SO_RCVLOWAT:
 792                 v.val = sk->sk_rcvlowat;
 793                 break;
 794
 795         case SO_SNDLOWAT:
 796                 v.val=1;
 797                 break;
 798
 799         case SO_PASSCRED:
 800                 v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
 801                 break;
 802
 803         case SO_PEERCRED:
 804                 if (len > sizeof(sk->sk_peercred))
 805                         len = sizeof(sk->sk_peercred);
 806                 if (copy_to_user(optval, &sk->sk_peercred, len))
 807                         return -EFAULT;
 808                 goto lenout;
 809
 810         case SO_PEERNAME:
 811         {
 812                 char address[128];
 813
 814                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
 815                         return -ENOTCONN;
 816                 if (lv < len)
 817                         return -EINVAL;
 818                 if (copy_to_user(optval, address, len))
 819                         return -EFAULT;
 820                 goto lenout;
 821         }
 822
 823         /* Dubious BSD thing... Probably nobody even uses it, but
 824          * the UNIX standard wants it for whatever reason... -DaveM
 825          */
 826         case SO_ACCEPTCONN:
 827                 v.val = sk->sk_state == TCP_LISTEN;
 828                 break;
 829
 830         case SO_PASSSEC:
 831                 v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
 832                 break;
 833
 834         case SO_PEERSEC:
 835                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
 836
 837         case SO_MARK:
 838                 v.val = sk->sk_mark;
 839                 break;
 840
 841         default:
 842                 return -ENOPROTOOPT;
 843         }
 844
 845         if (len > lv)
 846                 len = lv;
 847         if (copy_to_user(optval, &v, len))
 848                 return -EFAULT;
 849 lenout:
 850         if (put_user(len, optlen))
 851                 return -EFAULT;
 852         return 0;
 853 }
 854
 855 /*
 856  * Initialize an sk_lock.
 857  *
 858  * (We also register the sk_lock with the lock validator.)
 859  */
 860 static inline void sock_lock_init(struct sock *sk)
 861 {
 862         sock_lock_init_class_and_name(sk,
 863                         af_family_slock_key_strings[sk->sk_family],
 864                         af_family_slock_keys + sk->sk_family,
 865                         af_family_key_strings[sk->sk_family],
 866                         af_family_keys + sk->sk_family);
 867 }
 868
 869 static void sock_copy(struct sock *nsk, const struct sock *osk)
 870 {
 871 #ifdef CONFIG_SECURITY_NETWORK
 872         void *sptr = nsk->sk_security;
 873 #endif
 874
 875         memcpy(nsk, osk, osk->sk_prot->obj_size);
 876 #ifdef CONFIG_SECURITY_NETWORK
 877         nsk->sk_security = sptr;
 878         security_sk_clone(osk, nsk);
 879 #endif
 880 }
 881
 882 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 883                 int family)
 884 {
 885         struct sock *sk;
 886         struct kmem_cache *slab;
 887
 888         slab = prot->slab;
 889         if (slab != NULL)
 890                 sk = kmem_cache_alloc(slab, priority);
 891         else
 892                 sk = kmalloc(prot->obj_size, priority);
 893
 894         if (sk != NULL) {
 895                 if (security_sk_alloc(sk, family, priority))
 896                         goto out_free;
 897
 898                 if (!try_module_get(prot->owner))
 899                         goto out_free_sec;
 900         }
 901
 902         return sk;
 903
 904 out_free_sec:
 905         security_sk_free(sk);
 906 out_free:
 907         if (slab != NULL)
 908                 kmem_cache_free(slab, sk);
 909         else
 910                 kfree(sk);
 911         return NULL;
 912 }
 913
 914 static void sk_prot_free(struct proto *prot, struct sock *sk)
 915 {
 916         struct kmem_cache *slab;
 917         struct module *owner;
 918
 919         owner = prot->owner;
 920         slab = prot->slab;
 921
 922         security_sk_free(sk);
 923         if (slab != NULL)
 924                 kmem_cache_free(slab, sk);
 925         else
 926                 kfree(sk);
 927         module_put(owner);
 928 }
 929
 930 /**
 931  *      sk_alloc - All socket objects are allocated here
 932  *      @net: the applicable net namespace
 933  *      @family: protocol family
 934  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 935  *      @prot: struct proto associated with this new sock instance
 936  */
 937 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 938                       struct proto *prot)
 939 {
 940         struct sock *sk;
 941
 942         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
 943         if (sk) {
 944                 sk->sk_family = family;
 945                 /*
 946                  * See comment in struct sock definition to understand
 947                  * why we need sk_prot_creator -acme
 948                  */
 949                 sk->sk_prot = sk->sk_prot_creator = prot;
 950                 sock_lock_init(sk);
 951                 sock_net_set(sk, get_net(net));
 952         }
 953
 954         return sk;
 955 }
 956
 957 void sk_free(struct sock *sk)
 958 {
 959         struct sk_filter *filter;
 960
 961         if (sk->sk_destruct)
 962                 sk->sk_destruct(sk);
 963
 964         filter = rcu_dereference(sk->sk_filter);
 965         if (filter) {
 966                 sk_filter_uncharge(sk, filter);
 967                 rcu_assign_pointer(sk->sk_filter, NULL);
 968         }
 969
 970         sock_disable_timestamp(sk);
 971
 972         if (atomic_read(&sk->sk_omem_alloc))
 973                 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
 974                        __func__, atomic_read(&sk->sk_omem_alloc));
 975
 976         put_net(sock_net(sk));
 977         sk_prot_free(sk->sk_prot_creator, sk);
 978 }
 979
 980 /*
 981  * Last sock_put should drop referrence to sk->sk_net. It has already
 982  * been dropped in sk_change_net. Taking referrence to stopping namespace
 983  * is not an option.
 984  * Take referrence to a socket to remove it from hash _alive_ and after that
 985  * destroy it in the context of init_net.
 986  */
 987 void sk_release_kernel(struct sock *sk)
 988 {
 989         if (sk == NULL || sk->sk_socket == NULL)
 990                 return;
 991
 992         sock_hold(sk);
 993         sock_release(sk->sk_socket);
 994         release_net(sock_net(sk));
 995         sock_net_set(sk, get_net(&init_net));
 996         sock_put(sk);
 997 }
 998 EXPORT_SYMBOL(sk_release_kernel);
 999
1000 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1001 {
1002         struct sock *newsk;
1003
1004         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1005         if (newsk != NULL) {
1006                 struct sk_filter *filter;
1007
1008                 sock_copy(newsk, sk);
1009
1010                 /* SANITY */
1011                 get_net(sock_net(newsk));
1012                 sk_node_init(&newsk->sk_node);
1013                 sock_lock_init(newsk);
1014                 bh_lock_sock(newsk);
1015                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1016
1017                 atomic_set(&newsk->sk_rmem_alloc, 0);
1018                 atomic_set(&newsk->sk_wmem_alloc, 0);
1019                 atomic_set(&newsk->sk_omem_alloc, 0);
1020                 skb_queue_head_init(&newsk->sk_receive_queue);
1021                 skb_queue_head_init(&newsk->sk_write_queue);
1022 #ifdef CONFIG_NET_DMA
1023                 skb_queue_head_init(&newsk->sk_async_wait_queue);
1024 #endif
1025
1026                 rwlock_init(&newsk->sk_dst_lock);
1027                 rwlock_init(&newsk->sk_callback_lock);
1028                 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1029                                 af_callback_keys + newsk->sk_family,
1030                                 af_family_clock_key_strings[newsk->sk_family]);
1031
1032                 newsk->sk_dst_cache     = NULL;
1033                 newsk->sk_wmem_queued   = 0;
1034                 newsk->sk_forward_alloc = 0;
1035                 newsk->sk_send_head     = NULL;
1036                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1037
1038                 sock_reset_flag(newsk, SOCK_DONE);
1039                 skb_queue_head_init(&newsk->sk_error_queue);
1040
1041                 filter = newsk->sk_filter;
1042                 if (filter != NULL)
1043                         sk_filter_charge(newsk, filter);
1044
1045                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1046                         /* It is still raw copy of parent, so invalidate
1047                          * destructor and make plain sk_free() */
1048                         newsk->sk_destruct = NULL;
1049                         sk_free(newsk);
1050                         newsk = NULL;
1051                         goto out;
1052                 }
1053
1054                 newsk->sk_err      = 0;
1055                 newsk->sk_priority = 0;
1056                 atomic_set(&newsk->sk_refcnt, 2);
1057
1058                 /*
1059                  * Increment the counter in the same struct proto as the master
1060                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1061                  * is the same as sk->sk_prot->socks, as this field was copied
1062                  * with memcpy).
1063                  *
1064                  * This _changes_ the previous behaviour, where
1065                  * tcp_create_openreq_child always was incrementing the
1066                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1067                  * to be taken into account in all callers. -acme
1068                  */
1069                 sk_refcnt_debug_inc(newsk);
1070                 sk_set_socket(newsk, NULL);
1071                 newsk->sk_sleep  = NULL;
1072
1073                 if (newsk->sk_prot->sockets_allocated)
1074                         atomic_inc(newsk->sk_prot->sockets_allocated);
1075         }
1076 out:
1077         return newsk;
1078 }
1079
1080 EXPORT_SYMBOL_GPL(sk_clone);
1081
1082 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1083 {
1084         __sk_dst_set(sk, dst);
1085         sk->sk_route_caps = dst->dev->features;
1086         if (sk->sk_route_caps & NETIF_F_GSO)
1087                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1088         if (sk_can_gso(sk)) {
1089                 if (dst->header_len) {
1090                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1091                 } else {
1092                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1093                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1094                 }
1095         }
1096 }
1097 EXPORT_SYMBOL_GPL(sk_setup_caps);
1098
1099 void __init sk_init(void)
1100 {
1101         if (num_physpages <= 4096) {
1102                 sysctl_wmem_max = 32767;
1103                 sysctl_rmem_max = 32767;
1104                 sysctl_wmem_default = 32767;
1105                 sysctl_rmem_default = 32767;
1106         } else if (num_physpages >= 131072) {
1107                 sysctl_wmem_max = 131071;
1108                 sysctl_rmem_max = 131071;
1109         }
1110 }
1111
1112 /*
1113  *      Simple resource managers for sockets.
1114  */
1115
1116
1117 /*
1118  * Write buffer destructor automatically called from kfree_skb.
1119  */
1120 void sock_wfree(struct sk_buff *skb)
1121 {
1122         struct sock *sk = skb->sk;
1123
1124         /* In case it might be waiting for more memory. */
1125         atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1126         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1127                 sk->sk_write_space(sk);
1128         sock_put(sk);
1129 }
1130
1131 /*
1132  * Read buffer destructor automatically called from kfree_skb.
1133  */
1134 void sock_rfree(struct sk_buff *skb)
1135 {
1136         struct sock *sk = skb->sk;
1137
1138         skb_truesize_check(skb);
1139         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1140         sk_mem_uncharge(skb->sk, skb->truesize);
1141 }
1142
1143
1144 int sock_i_uid(struct sock *sk)
1145 {
1146         int uid;
1147
1148         read_lock(&sk->sk_callback_lock);
1149         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1150         read_unlock(&sk->sk_callback_lock);
1151         return uid;
1152 }
1153
1154 unsigned long sock_i_ino(struct sock *sk)
1155 {
1156         unsigned long ino;
1157
1158         read_lock(&sk->sk_callback_lock);
1159         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1160         read_unlock(&sk->sk_callback_lock);
1161         return ino;
1162 }
1163
1164 /*
1165  * Allocate a skb from the socket's send buffer.
1166  */
1167 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1168                              gfp_t priority)
1169 {
1170         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1171                 struct sk_buff * skb = alloc_skb(size, priority);
1172                 if (skb) {
1173                         skb_set_owner_w(skb, sk);
1174                         return skb;
1175                 }
1176         }
1177         return NULL;
1178 }
1179
1180 /*
1181  * Allocate a skb from the socket's receive buffer.
1182  */
1183 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1184                              gfp_t priority)
1185 {
1186         if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1187                 struct sk_buff *skb = alloc_skb(size, priority);
1188                 if (skb) {
1189                         skb_set_owner_r(skb, sk);
1190                         return skb;
1191                 }
1192         }
1193         return NULL;
1194 }
1195
1196 /*
1197  * Allocate a memory block from the socket's option memory buffer.
1198  */
1199 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1200 {
1201         if ((unsigned)size <= sysctl_optmem_max &&
1202             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1203                 void *mem;
1204                 /* First do the add, to avoid the race if kmalloc
1205                  * might sleep.
1206                  */
1207                 atomic_add(size, &sk->sk_omem_alloc);
1208                 mem = kmalloc(size, priority);
1209                 if (mem)
1210                         return mem;
1211                 atomic_sub(size, &sk->sk_omem_alloc);
1212         }
1213         return NULL;
1214 }
1215
1216 /*
1217  * Free an option memory block.
1218  */
1219 void sock_kfree_s(struct sock *sk, void *mem, int size)
1220 {
1221         kfree(mem);
1222         atomic_sub(size, &sk->sk_omem_alloc);
1223 }
1224
1225 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1226    I think, these locks should be removed for datagram sockets.
1227  */
1228 static long sock_wait_for_wmem(struct sock * sk, long timeo)
1229 {
1230         DEFINE_WAIT(wait);
1231
1232         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1233         for (;;) {
1234                 if (!timeo)
1235                         break;
1236                 if (signal_pending(current))
1237                         break;
1238                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1239                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1240                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1241                         break;
1242                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1243                         break;
1244                 if (sk->sk_err)
1245                         break;
1246                 timeo = schedule_timeout(timeo);
1247         }
1248         finish_wait(sk->sk_sleep, &wait);
1249         return timeo;
1250 }
1251
1252
1253 /*
1254  *      Generic send/receive buffer handlers
1255  */
1256
1257 static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1258                                             unsigned long header_len,
1259                                             unsigned long data_len,
1260                                             int noblock, int *errcode)
1261 {
1262         struct sk_buff *skb;
1263         gfp_t gfp_mask;
1264         long timeo;
1265         int err;
1266
1267         gfp_mask = sk->sk_allocation;
1268         if (gfp_mask & __GFP_WAIT)
1269                 gfp_mask |= __GFP_REPEAT;
1270
1271         timeo = sock_sndtimeo(sk, noblock);
1272         while (1) {
1273                 err = sock_error(sk);
1274                 if (err != 0)
1275                         goto failure;
1276
1277                 err = -EPIPE;
1278                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1279                         goto failure;
1280
1281                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1282                         skb = alloc_skb(header_len, gfp_mask);
1283                         if (skb) {
1284                                 int npages;
1285                                 int i;
1286
1287                                 /* No pages, we're done... */
1288                                 if (!data_len)
1289                                         break;
1290
1291                                 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1292                                 skb->truesize += data_len;
1293                                 skb_shinfo(skb)->nr_frags = npages;
1294                                 for (i = 0; i < npages; i++) {
1295                                         struct page *page;
1296                                         skb_frag_t *frag;
1297
1298                                         page = alloc_pages(sk->sk_allocation, 0);
1299                                         if (!page) {
1300                                                 err = -ENOBUFS;
1301                                                 skb_shinfo(skb)->nr_frags = i;
1302                                                 kfree_skb(skb);
1303                                                 goto failure;
1304                                         }
1305
1306                                         frag = &skb_shinfo(skb)->frags[i];
1307                                         frag->page = page;
1308                                         frag->page_offset = 0;
1309                                         frag->size = (data_len >= PAGE_SIZE ?
1310                                                       PAGE_SIZE :
1311                                                       data_len);
1312                                         data_len -= PAGE_SIZE;
1313                                 }
1314
1315                                 /* Full success... */
1316                                 break;
1317                         }
1318                         err = -ENOBUFS;
1319                         goto failure;
1320                 }
1321                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1322                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1323                 err = -EAGAIN;
1324                 if (!timeo)
1325                         goto failure;
1326                 if (signal_pending(current))
1327                         goto interrupted;
1328                 timeo = sock_wait_for_wmem(sk, timeo);
1329         }
1330
1331         skb_set_owner_w(skb, sk);
1332         return skb;
1333
1334 interrupted:
1335         err = sock_intr_errno(timeo);
1336 failure:
1337         *errcode = err;
1338         return NULL;
1339 }
1340
/*
 * Allocate a send skb with a linear area of @size bytes and no page
 * fragments.  This is sock_alloc_send_pskb() with a data length of 0; on
 * failure NULL is returned and the errno is stored in *@errcode.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long no_paged_data = 0;

	return sock_alloc_send_pskb(sk, size, no_paged_data, noblock, errcode);
}
1346
1347 static void __lock_sock(struct sock *sk)
1348 {
1349         DEFINE_WAIT(wait);
1350
1351         for (;;) {
1352                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1353                                         TASK_UNINTERRUPTIBLE);
1354                 spin_unlock_bh(&sk->sk_lock.slock);
1355                 schedule();
1356                 spin_lock_bh(&sk->sk_lock.slock);
1357                 if (!sock_owned_by_user(sk))
1358                         break;
1359         }
1360         finish_wait(&sk->sk_lock.wq, &wait);
1361 }
1362
1363 static void __release_sock(struct sock *sk)
1364 {
1365         struct sk_buff *skb = sk->sk_backlog.head;
1366
1367         do {
1368                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1369                 bh_unlock_sock(sk);
1370
1371                 do {
1372                         struct sk_buff *next = skb->next;
1373
1374                         skb->next = NULL;
1375                         sk_backlog_rcv(sk, skb);
1376
1377                         /*
1378                          * We are in process context here with softirqs
1379                          * disabled, use cond_resched_softirq() to preempt.
1380                          * This is safe to do because we've taken the backlog
1381                          * queue private:
1382                          */
1383                         cond_resched_softirq();
1384
1385                         skb = next;
1386                 } while (skb != NULL);
1387
1388                 bh_lock_sock(sk);
1389         } while ((skb = sk->sk_backlog.head) != NULL);
1390 }
1391
1392 /**
1393  * sk_wait_data - wait for data to arrive at sk_receive_queue
1394  * @sk:    sock to wait on
1395  * @timeo: for how long
1396  *
1397  * Now socket state including sk->sk_err is changed only under lock,
1398  * hence we may omit checks after joining wait queue.
1399  * We check receive queue before schedule() only as optimization;
1400  * it is very likely that release_sock() added new data.
1401  */
1402 int sk_wait_data(struct sock *sk, long *timeo)
1403 {
1404         int rc;
1405         DEFINE_WAIT(wait);
1406
1407         prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1408         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1409         rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1410         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1411         finish_wait(sk->sk_sleep, &wait);
1412         return rc;
1413 }
1414
1415 EXPORT_SYMBOL(sk_wait_data);
1416
1417 /**
1418  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1419  *      @sk: socket
1420  *      @size: memory size to allocate
1421  *      @kind: allocation type
1422  *
1423  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1424  *      rmem allocation. This function assumes that protocols which have
1425  *      memory_pressure use sk_wmem_queued as write buffer accounting.
1426  */
1427 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1428 {
1429         struct proto *prot = sk->sk_prot;
1430         int amt = sk_mem_pages(size);
1431         int allocated;
1432
1433         sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1434         allocated = atomic_add_return(amt, prot->memory_allocated);
1435
1436         /* Under limit. */
1437         if (allocated <= prot->sysctl_mem[0]) {
1438                 if (prot->memory_pressure && *prot->memory_pressure)
1439                         *prot->memory_pressure = 0;
1440                 return 1;
1441         }
1442
1443         /* Under pressure. */
1444         if (allocated > prot->sysctl_mem[1])
1445                 if (prot->enter_memory_pressure)
1446                         prot->enter_memory_pressure(sk);
1447
1448         /* Over hard limit. */
1449         if (allocated > prot->sysctl_mem[2])
1450                 goto suppress_allocation;
1451
1452         /* guarantee minimum buffer size under pressure */
1453         if (kind == SK_MEM_RECV) {
1454                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1455                         return 1;
1456         } else { /* SK_MEM_SEND */
1457                 if (sk->sk_type == SOCK_STREAM) {
1458                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1459                                 return 1;
1460                 } else if (atomic_read(&sk->sk_wmem_alloc) <
1461                            prot->sysctl_wmem[0])
1462                                 return 1;
1463         }
1464
1465         if (prot->memory_pressure) {
1466                 if (!*prot->memory_pressure ||
1467                     prot->sysctl_mem[2] > atomic_read(prot->sockets_allocated) *
1468                     sk_mem_pages(sk->sk_wmem_queued +
1469                                  atomic_read(&sk->sk_rmem_alloc) +
1470                                  sk->sk_forward_alloc))
1471                         return 1;
1472         }
1473
1474 suppress_allocation:
1475
1476         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1477                 sk_stream_moderate_sndbuf(sk);
1478
1479                 /* Fail only if socket is _under_ its sndbuf.
1480                  * In this case we cannot block, so that we have to fail.
1481                  */
1482                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1483                         return 1;
1484         }
1485
1486         /* Alas. Undo changes. */
1487         sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1488         atomic_sub(amt, prot->memory_allocated);
1489         return 0;
1490 }
1491
1492 EXPORT_SYMBOL(__sk_mem_schedule);
1493
1494 /**
1495  *      __sk_reclaim - reclaim memory_allocated
1496  *      @sk: socket
1497  */
1498 void __sk_mem_reclaim(struct sock *sk)
1499 {
1500         struct proto *prot = sk->sk_prot;
1501
1502         atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1503                    prot->memory_allocated);
1504         sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1505
1506         if (prot->memory_pressure && *prot->memory_pressure &&
1507             (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1508                 *prot->memory_pressure = 0;
1509 }
1510
1511 EXPORT_SYMBOL(__sk_mem_reclaim);
1512
1513
1514 /*
1515  * Set of default routines for initialising struct proto_ops when
1516  * the protocol does not support a particular function. In certain
1517  * cases where it makes no sense for a protocol to have a "do nothing"
1518  * function, some default processing is provided.
1519  */
1520
/* bind() placeholder for protocols without bind support: -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
1525
/* connect() placeholder for protocols without connect support: -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
1531
/* socketpair() placeholder for protocols without support: -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
1536
/* accept() placeholder for protocols without accept support: -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
1541
/* getname() placeholder for protocols without support: -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
1547
1548 unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1549 {
1550         return 0;
1551 }
1552
/* ioctl() placeholder for protocols without ioctl support: -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
1557
/* listen() placeholder for protocols without listen support: -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
1562
/* shutdown() placeholder for protocols without shutdown support: -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
1567
1568 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1569                     char __user *optval, int optlen)
1570 {
1571         return -EOPNOTSUPP;
1572 }
1573
1574 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1575                     char __user *optval, int __user *optlen)
1576 {
1577         return -EOPNOTSUPP;
1578 }
1579
/* sendmsg() placeholder for protocols without sendmsg support: -EOPNOTSUPP. */
int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}
1585
/* recvmsg() placeholder for protocols without recvmsg support: -EOPNOTSUPP. */
int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
1591
/*
 * mmap() placeholder for protocols without mmap support.  Returns -ENODEV
 * rather than -EOPNOTSUPP, mirroring the error code of a missing mmap
 * method.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	return -ENODEV;
}
1597
1598 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1599 {
1600         ssize_t res;
1601         struct msghdr msg = {.msg_flags = flags};
1602         struct kvec iov;
1603         char *kaddr = kmap(page);
1604         iov.iov_base = kaddr + offset;
1605         iov.iov_len = size;
1606         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1607         kunmap(page);
1608         return res;
1609 }
1610
1611 /*
1612  *      Default Socket Callbacks
1613  */
1614
1615 static void sock_def_wakeup(struct sock *sk)
1616 {
1617         read_lock(&sk->sk_callback_lock);
1618         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1619                 wake_up_interruptible_all(sk->sk_sleep);
1620         read_unlock(&sk->sk_callback_lock);
1621 }
1622
1623 static void sock_def_error_report(struct sock *sk)
1624 {
1625         read_lock(&sk->sk_callback_lock);
1626         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1627                 wake_up_interruptible(sk->sk_sleep);
1628         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1629         read_unlock(&sk->sk_callback_lock);
1630 }
1631
1632 static void sock_def_readable(struct sock *sk, int len)
1633 {
1634         read_lock(&sk->sk_callback_lock);
1635         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1636                 wake_up_interruptible_sync(sk->sk_sleep);
1637         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1638         read_unlock(&sk->sk_callback_lock);
1639 }
1640
1641 static void sock_def_write_space(struct sock *sk)
1642 {
1643         read_lock(&sk->sk_callback_lock);
1644
1645         /* Do not wake up a writer until he can make "significant"
1646          * progress.  --DaveM
1647          */
1648         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1649                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1650                         wake_up_interruptible_sync(sk->sk_sleep);
1651
1652                 /* Should agree with poll, otherwise some programs break */
1653                 if (sock_writeable(sk))
1654                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1655         }
1656
1657         read_unlock(&sk->sk_callback_lock);
1658 }
1659
1660 static void sock_def_destruct(struct sock *sk)
1661 {
1662         kfree(sk->sk_protinfo);
1663 }
1664
1665 void sk_send_sigurg(struct sock *sk)
1666 {
1667         if (sk->sk_socket && sk->sk_socket->file)
1668                 if (send_sigurg(&sk->sk_socket->file->f_owner))
1669                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1670 }
1671
/*
 * (Re)arm @timer to fire at @expires on behalf of @sk.
 *
 * A reference on @sk is taken only when mod_timer() returns 0.
 * NOTE(review): presumably 0 means the timer was not already pending, so
 * a pending timer keeps the reference it already holds - confirm against
 * the timer API.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	int ret = mod_timer(timer, expires);

	if (ret == 0)
		sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);
1680
/*
 * Cancel @timer on behalf of @sk.
 *
 * When the timer is pending and del_timer() returns non-zero, one
 * reference on @sk is dropped with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (!timer_pending(timer))
		return;

	if (del_timer(timer))
		__sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);
1688
1689 void sock_init_data(struct socket *sock, struct sock *sk)
1690 {
1691         skb_queue_head_init(&sk->sk_receive_queue);
1692         skb_queue_head_init(&sk->sk_write_queue);
1693         skb_queue_head_init(&sk->sk_error_queue);
1694 #ifdef CONFIG_NET_DMA
1695         skb_queue_head_init(&sk->sk_async_wait_queue);
1696 #endif
1697
1698         sk->sk_send_head        =       NULL;
1699
1700         init_timer(&sk->sk_timer);
1701
1702         sk->sk_allocation       =       GFP_KERNEL;
1703         sk->sk_rcvbuf           =       sysctl_rmem_default;
1704         sk->sk_sndbuf           =       sysctl_wmem_default;
1705         sk->sk_state            =       TCP_CLOSE;
1706         sk_set_socket(sk, sock);
1707
1708         sock_set_flag(sk, SOCK_ZAPPED);
1709
1710         if (sock) {
1711                 sk->sk_type     =       sock->type;
1712                 sk->sk_sleep    =       &sock->wait;
1713                 sock->sk        =       sk;
1714         } else
1715                 sk->sk_sleep    =       NULL;
1716
1717         rwlock_init(&sk->sk_dst_lock);
1718         rwlock_init(&sk->sk_callback_lock);
1719         lockdep_set_class_and_name(&sk->sk_callback_lock,
1720                         af_callback_keys + sk->sk_family,
1721                         af_family_clock_key_strings[sk->sk_family]);
1722
1723         sk->sk_state_change     =       sock_def_wakeup;
1724         sk->sk_data_ready       =       sock_def_readable;
1725         sk->sk_write_space      =       sock_def_write_space;
1726         sk->sk_error_report     =       sock_def_error_report;
1727         sk->sk_destruct         =       sock_def_destruct;
1728
1729         sk->sk_sndmsg_page      =       NULL;
1730         sk->sk_sndmsg_off       =       0;
1731
1732         sk->sk_peercred.pid     =       0;
1733         sk->sk_peercred.uid     =       -1;
1734         sk->sk_peercred.gid     =       -1;
1735         sk->sk_write_pending    =       0;
1736         sk->sk_rcvlowat         =       1;
1737         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
1738         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
1739
1740         sk->sk_stamp = ktime_set(-1L, 0);
1741
1742         atomic_set(&sk->sk_refcnt, 1);
1743         atomic_set(&sk->sk_drops, 0);
1744 }
1745
/*
 * lock_sock_nested - mark the socket lock as owned by the caller
 * @sk:       socket to lock
 * @subclass: passed straight to mutex_acquire() for the lockdep annotation
 *
 * Starts with might_sleep(), so callers must be in a context that is
 * allowed to sleep.
 */
1746 void lock_sock_nested(struct sock *sk, int subclass)
1747 {
1748         might_sleep();
1749         spin_lock_bh(&sk->sk_lock.slock);
             /*
              * Lock already owned: hand over to __lock_sock() first.
              * NOTE(review): presumably it waits until the owner lets go;
              * the helper is defined outside this chunk -- confirm.
              */
1750         if (sk->sk_lock.owned)
1751                 __lock_sock(sk);
1752         sk->sk_lock.owned = 1;
             /*
              * Plain spin_unlock(), not spin_unlock_bh(): bottom halves are
              * only re-enabled by local_bh_enable() after the annotation.
              */
1753         spin_unlock(&sk->sk_lock.slock);
1754         /*
1755          * The sk_lock has mutex_lock() semantics here:
1756          */
1757         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1758         local_bh_enable();
1759 }
1760
1761 EXPORT_SYMBOL(lock_sock_nested);
1762
/*
 * release_sock - give up ownership of the socket lock
 * @sk: socket whose lock is being released
 *
 * Under sk_lock.slock: hands a non-empty backlog to __release_sock(),
 * clears sk_lock.owned and wakes any task sleeping on sk_lock.wq.
 */
1763 void release_sock(struct sock *sk)
1764 {
1765         /*
1766          * The sk_lock has mutex_unlock() semantics:
1767          */
1768         mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1769
1770         spin_lock_bh(&sk->sk_lock.slock);
             /*
              * A non-NULL backlog tail means something is queued.
              * NOTE(review): presumably packets that arrived while the lock
              * was owned; __release_sock() is defined outside this chunk.
              */
1771         if (sk->sk_backlog.tail)
1772                 __release_sock(sk);
1773         sk->sk_lock.owned = 0;
             /* Only call wake_up() when somebody is actually waiting. */
1774         if (waitqueue_active(&sk->sk_lock.wq))
1775                 wake_up(&sk->sk_lock.wq);
1776         spin_unlock_bh(&sk->sk_lock.slock);
1777 }
1778 EXPORT_SYMBOL(release_sock);
1779
1780 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1781 {
1782         struct timeval tv;
1783         if (!sock_flag(sk, SOCK_TIMESTAMP))
1784                 sock_enable_timestamp(sk);
1785         tv = ktime_to_timeval(sk->sk_stamp);
1786         if (tv.tv_sec == -1)
1787                 return -ENOENT;
1788         if (tv.tv_sec == 0) {
1789                 sk->sk_stamp = ktime_get_real();
1790                 tv = ktime_to_timeval(sk->sk_stamp);
1791         }
1792         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1793 }
1794 EXPORT_SYMBOL(sock_get_timestamp);
1795
1796 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1797 {
1798         struct timespec ts;
1799         if (!sock_flag(sk, SOCK_TIMESTAMP))
1800                 sock_enable_timestamp(sk);
1801         ts = ktime_to_timespec(sk->sk_stamp);
1802         if (ts.tv_sec == -1)
1803                 return -ENOENT;
1804         if (ts.tv_sec == 0) {
1805                 sk->sk_stamp = ktime_get_real();
1806                 ts = ktime_to_timespec(sk->sk_stamp);
1807         }
1808         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1809 }
1810 EXPORT_SYMBOL(sock_get_timestampns);
1811
1812 void sock_enable_timestamp(struct sock *sk)
1813 {
1814         if (!sock_flag(sk, SOCK_TIMESTAMP)) {
1815                 sock_set_flag(sk, SOCK_TIMESTAMP);
1816                 net_enable_timestamp();
1817         }
1818 }
1819
1820 /*
1821  *      Get a socket option on a socket.
1822  *
1823  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
1824  *      asynchronous errors should be reported by getsockopt. We assume
1825  *      this means if you specify SO_ERROR (otherwise what's the point of it).
1826  */
1827 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1828                            char __user *optval, int __user *optlen)
1829 {
1830         struct sock *sk = sock->sk;
1831
1832         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1833 }
1834
1835 EXPORT_SYMBOL(sock_common_getsockopt);
1836
1837 #ifdef CONFIG_COMPAT
1838 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1839                                   char __user *optval, int __user *optlen)
1840 {
1841         struct sock *sk = sock->sk;
1842
1843         if (sk->sk_prot->compat_getsockopt != NULL)
1844                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
1845                                                       optval, optlen);
1846         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1847 }
1848 EXPORT_SYMBOL(compat_sock_common_getsockopt);
1849 #endif
1850
1851 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1852                         struct msghdr *msg, size_t size, int flags)
1853 {
1854         struct sock *sk = sock->sk;
1855         int addr_len = 0;
1856         int err;
1857
1858         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1859                                    flags & ~MSG_DONTWAIT, &addr_len);
1860         if (err >= 0)
1861                 msg->msg_namelen = addr_len;
1862         return err;
1863 }
1864
1865 EXPORT_SYMBOL(sock_common_recvmsg);
1866
1867 /*
1868  *      Set socket options on an inet socket.
1869  */
1870 int sock_common_setsockopt(struct socket *sock, int level, int optname,
1871                            char __user *optval, int optlen)
1872 {
1873         struct sock *sk = sock->sk;
1874
1875         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1876 }
1877
1878 EXPORT_SYMBOL(sock_common_setsockopt);
1879
1880 #ifdef CONFIG_COMPAT
1881 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1882                                   char __user *optval, int optlen)
1883 {
1884         struct sock *sk = sock->sk;
1885
1886         if (sk->sk_prot->compat_setsockopt != NULL)
1887                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
1888                                                       optval, optlen);
1889         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1890 }
1891 EXPORT_SYMBOL(compat_sock_common_setsockopt);
1892 #endif
1893
1894 void sk_common_release(struct sock *sk)
1895 {
1896         if (sk->sk_prot->destroy)
1897                 sk->sk_prot->destroy(sk);
1898
1899         /*
1900          * Observation: when sock_common_release is called, processes have
1901          * no access to socket. But net still has.
1902          * Step one, detach it from networking:
1903          *
1904          * A. Remove from hash tables.
1905          */
1906
1907         sk->sk_prot->unhash(sk);
1908
1909         /*
1910          * In this point socket cannot receive new packets, but it is possible
1911          * that some packets are in flight because some CPU runs receiver and
1912          * did hash table lookup before we unhashed socket. They will achieve
1913          * receive queue and will be purged by socket destructor.
1914          *
1915          * Also we still have packets pending on receive queue and probably,
1916          * our own packets waiting in device queues. sock_destroy will drain
1917          * receive queue, but transmitted packets will delay socket destruction
1918          * until the last reference will be released.
1919          */
1920
1921         sock_orphan(sk);
1922
1923         xfrm_sk_free_policy(sk);
1924
1925         sk_refcnt_debug_release(sk);
1926         sock_put(sk);
1927 }
1928
1929 EXPORT_SYMBOL(sk_common_release);
1930
/*
 * proto_list collects the registered protocols: proto_register() adds
 * entries and proto_unregister() removes them.  proto_list_lock is the
 * rwlock the code below takes around those updates (write side) and around
 * the /proc seq_file walk (read side).
 */
1931 static DEFINE_RWLOCK(proto_list_lock);
1932 static LIST_HEAD(proto_list);
1933
1934 #ifdef CONFIG_PROC_FS
/*
 * Per-protocol "in use" counters.  Each registered protocol is given a slot
 * index (prot->inuse_idx) into val[]; proto_inuse_idx is the bitmap that
 * records which slot indices have been handed out.
 */
1935 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
1936 struct prot_inuse {
1937         int val[PROTO_INUSE_NR];
1938 };
1939
1940 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
1941
1942 #ifdef CONFIG_NET_NS
1943 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
1944 {
1945         int cpu = smp_processor_id();
1946         per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
1947 }
1948 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
1949
1950 int sock_prot_inuse_get(struct net *net, struct proto *prot)
1951 {
1952         int cpu, idx = prot->inuse_idx;
1953         int res = 0;
1954
1955         for_each_possible_cpu(cpu)
1956                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
1957
1958         return res >= 0 ? res : 0;
1959 }
1960 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
1961
1962 static int sock_inuse_init_net(struct net *net)
1963 {
1964         net->core.inuse = alloc_percpu(struct prot_inuse);
1965         return net->core.inuse ? 0 : -ENOMEM;
1966 }
1967
/*
 * Per-namespace exit hook: release the per-cpu in-use counters that
 * sock_inuse_init_net() allocated for @net.
 */
1968 static void sock_inuse_exit_net(struct net *net)
1969 {
1970         free_percpu(net->core.inuse);
1971 }
1972
/*
 * Per-namespace operations for the in-use counters; registered by
 * net_inuse_init() via register_pernet_subsys().
 */
1973 static struct pernet_operations net_inuse_ops = {
1974         .init = sock_inuse_init_net,
1975         .exit = sock_inuse_exit_net,
1976 };
1977
1978 static __init int net_inuse_init(void)
1979 {
1980         if (register_pernet_subsys(&net_inuse_ops))
1981                 panic("Cannot initialize net inuse counters");
1982
1983         return 0;
1984 }
1985
1986 core_initcall(net_inuse_init);
1987 #else
/* Without CONFIG_NET_NS a single global set of per-cpu counters is used. */
1988 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
1989
1990 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
1991 {
1992         __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
1993 }
1994 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
1995
1996 int sock_prot_inuse_get(struct net *net, struct proto *prot)
1997 {
1998         int cpu, idx = prot->inuse_idx;
1999         int res = 0;
2000
2001         for_each_possible_cpu(cpu)
2002                 res += per_cpu(prot_inuse, cpu).val[idx];
2003
2004         return res >= 0 ? res : 0;
2005 }
2006 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2007 #endif
2008
2009 static void assign_proto_idx(struct proto *prot)
2010 {
2011         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2012
2013         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2014                 printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2015                 return;
2016         }
2017
2018         set_bit(prot->inuse_idx, proto_inuse_idx);
2019 }
2020
2021 static void release_proto_idx(struct proto *prot)
2022 {
2023         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2024                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2025 }
2026 #else
/* No-op stub: without CONFIG_PROC_FS no in-use counter slots are tracked. */
2027 static inline void assign_proto_idx(struct proto *prot)
2028 {
2029 }
2030
/* No-op stub: without CONFIG_PROC_FS there is no counter slot to release. */
2031 static inline void release_proto_idx(struct proto *prot)
2032 {
2033 }
2034 #endif
2035
2036 int proto_register(struct proto *prot, int alloc_slab)
2037 {
2038         if (alloc_slab) {
2039                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2040                                                SLAB_HWCACHE_ALIGN, NULL);
2041
2042                 if (prot->slab == NULL) {
2043                         printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2044                                prot->name);
2045                         goto out;
2046                 }
2047
2048                 if (prot->rsk_prot != NULL) {
2049                         static const char mask[] = "request_sock_%s";
2050
2051                         prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2052                         if (prot->rsk_prot->slab_name == NULL)
2053                                 goto out_free_sock_slab;
2054
2055                         sprintf(prot->rsk_prot->slab_name, mask, prot->name);
2056                         prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2057                                                                  prot->rsk_prot->obj_size, 0,
2058                                                                  SLAB_HWCACHE_ALIGN, NULL);
2059
2060                         if (prot->rsk_prot->slab == NULL) {
2061                                 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2062                                        prot->name);
2063                                 goto out_free_request_sock_slab_name;
2064                         }
2065                 }
2066
2067                 if (prot->twsk_prot != NULL) {
2068                         static const char mask[] = "tw_sock_%s";
2069
2070                         prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2071
2072                         if (prot->twsk_prot->twsk_slab_name == NULL)
2073                                 goto out_free_request_sock_slab;
2074
2075                         sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
2076                         prot->twsk_prot->twsk_slab =
2077                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2078                                                   prot->twsk_prot->twsk_obj_size,
2079                                                   0, SLAB_HWCACHE_ALIGN,
2080                                                   NULL);
2081                         if (prot->twsk_prot->twsk_slab == NULL)
2082                                 goto out_free_timewait_sock_slab_name;
2083                 }
2084         }
2085
2086         write_lock(&proto_list_lock);
2087         list_add(&prot->node, &proto_list);
2088         assign_proto_idx(prot);
2089         write_unlock(&proto_list_lock);
2090         return 0;
2091
2092 out_free_timewait_sock_slab_name:
2093         kfree(prot->twsk_prot->twsk_slab_name);
2094 out_free_request_sock_slab:
2095         if (prot->rsk_prot && prot->rsk_prot->slab) {
2096                 kmem_cache_destroy(prot->rsk_prot->slab);
2097                 prot->rsk_prot->slab = NULL;
2098         }
2099 out_free_request_sock_slab_name:
2100         kfree(prot->rsk_prot->slab_name);
2101 out_free_sock_slab:
2102         kmem_cache_destroy(prot->slab);
2103         prot->slab = NULL;
2104 out:
2105         return -ENOBUFS;
2106 }
2107
2108 EXPORT_SYMBOL(proto_register);
2109
2110 void proto_unregister(struct proto *prot)
2111 {
2112         write_lock(&proto_list_lock);
2113         release_proto_idx(prot);
2114         list_del(&prot->node);
2115         write_unlock(&proto_list_lock);
2116
2117         if (prot->slab != NULL) {
2118                 kmem_cache_destroy(prot->slab);
2119                 prot->slab = NULL;
2120         }
2121
2122         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2123                 kmem_cache_destroy(prot->rsk_prot->slab);
2124                 kfree(prot->rsk_prot->slab_name);
2125                 prot->rsk_prot->slab = NULL;
2126         }
2127
2128         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2129                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2130                 kfree(prot->twsk_prot->twsk_slab_name);
2131                 prot->twsk_prot->twsk_slab = NULL;
2132         }
2133 }
2134
2135 EXPORT_SYMBOL(proto_unregister);
2136
2137 #ifdef CONFIG_PROC_FS
2138 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2139         __acquires(proto_list_lock)
2140 {
2141         read_lock(&proto_list_lock);
2142         return seq_list_start_head(&proto_list, *pos);
2143 }
2144
/* seq_file ->next: step from @v to the following entry of proto_list. */
2145 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2146 {
2147         return seq_list_next(v, &proto_list, pos);
2148 }
2149
/* seq_file ->stop: drop the read lock taken in proto_seq_start(). */
2150 static void proto_seq_stop(struct seq_file *seq, void *v)
2151         __releases(proto_list_lock)
2152 {
2153         read_unlock(&proto_list_lock);
2154 }
2155
/*
 * Map a method pointer to the flag character shown in /proc/net/protocols:
 * 'y' when the pointer is set, 'n' when it is NULL.
 */
static char proto_method_implemented(const void *method)
{
        if (method != NULL)
                return 'y';

        return 'n';
}
2160
2161 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2162 {
2163         seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
2164                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2165                    proto->name,
2166                    proto->obj_size,
2167                    proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
2168                    proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2169                    proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2170                    proto->max_header,
2171                    proto->slab == NULL ? "no" : "yes",
2172                    module_name(proto->owner),
2173                    proto_method_implemented(proto->close),
2174                    proto_method_implemented(proto->connect),
2175                    proto_method_implemented(proto->disconnect),
2176                    proto_method_implemented(proto->accept),
2177                    proto_method_implemented(proto->ioctl),
2178                    proto_method_implemented(proto->init),
2179                    proto_method_implemented(proto->destroy),
2180                    proto_method_implemented(proto->shutdown),
2181                    proto_method_implemented(proto->setsockopt),
2182                    proto_method_implemented(proto->getsockopt),
2183                    proto_method_implemented(proto->sendmsg),
2184                    proto_method_implemented(proto->recvmsg),
2185                    proto_method_implemented(proto->sendpage),
2186                    proto_method_implemented(proto->bind),
2187                    proto_method_implemented(proto->backlog_rcv),
2188                    proto_method_implemented(proto->hash),
2189                    proto_method_implemented(proto->unhash),
2190                    proto_method_implemented(proto->get_port),
2191                    proto_method_implemented(proto->enter_memory_pressure));
2192 }
2193
2194 static int proto_seq_show(struct seq_file *seq, void *v)
2195 {
2196         if (v == &proto_list)
2197                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2198                            "protocol",
2199                            "size",
2200                            "sockets",
2201                            "memory",
2202                            "press",
2203                            "maxhdr",
2204                            "slab",
2205                            "module",
2206                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2207         else
2208                 proto_seq_printf(seq, list_entry(v, struct proto, node));
2209         return 0;
2210 }
2211
/* seq_file iterator over proto_list, used by proto_seq_open(). */
2212 static const struct seq_operations proto_seq_ops = {
2213         .start  = proto_seq_start,
2214         .next   = proto_seq_next,
2215         .stop   = proto_seq_stop,
2216         .show   = proto_seq_show,
2217 };
2218
/* ->open for the "protocols" proc file: attach the proto_seq_ops iterator. */
2219 static int proto_seq_open(struct inode *inode, struct file *file)
2220 {
2221         return seq_open(file, &proto_seq_ops);
2222 }
2223
/*
 * File operations of the "protocols" proc file: a custom open, everything
 * else is handled by the generic seq_file helpers.
 */
2224 static const struct file_operations proto_seq_fops = {
2225         .owner          = THIS_MODULE,
2226         .open           = proto_seq_open,
2227         .read           = seq_read,
2228         .llseek         = seq_lseek,
2229         .release        = seq_release,
2230 };
2231
2232 static int __init proto_init(void)
2233 {
2234         /* register /proc/net/protocols */
2235         return proc_net_fops_create(&init_net, "protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
2236 }
2237
2238 subsys_initcall(proto_init);
2239
2240 #endif /* PROC_FS */
2241
/*
 * Exports for symbols that do not carry their own EXPORT_SYMBOL() line
 * next to their definition.
 */
2242 EXPORT_SYMBOL(sk_alloc);
2243 EXPORT_SYMBOL(sk_free);
2244 EXPORT_SYMBOL(sk_send_sigurg);
2245 EXPORT_SYMBOL(sock_alloc_send_skb);
2246 EXPORT_SYMBOL(sock_init_data);
2247 EXPORT_SYMBOL(sock_kfree_s);
2248 EXPORT_SYMBOL(sock_kmalloc);
2249 EXPORT_SYMBOL(sock_no_accept);
2250 EXPORT_SYMBOL(sock_no_bind);
2251 EXPORT_SYMBOL(sock_no_connect);
2252 EXPORT_SYMBOL(sock_no_getname);
2253 EXPORT_SYMBOL(sock_no_getsockopt);
2254 EXPORT_SYMBOL(sock_no_ioctl);
2255 EXPORT_SYMBOL(sock_no_listen);
2256 EXPORT_SYMBOL(sock_no_mmap);
2257 EXPORT_SYMBOL(sock_no_poll);
2258 EXPORT_SYMBOL(sock_no_recvmsg);
2259 EXPORT_SYMBOL(sock_no_sendmsg);
2260 EXPORT_SYMBOL(sock_no_sendpage);
2261 EXPORT_SYMBOL(sock_no_setsockopt);
2262 EXPORT_SYMBOL(sock_no_shutdown);
2263 EXPORT_SYMBOL(sock_no_socketpair);
2264 EXPORT_SYMBOL(sock_rfree);
2265 EXPORT_SYMBOL(sock_setsockopt);
2266 EXPORT_SYMBOL(sock_wfree);
2267 EXPORT_SYMBOL(sock_wmalloc);
2268 EXPORT_SYMBOL(sock_i_uid);
2269 EXPORT_SYMBOL(sock_i_ino);
2270 EXPORT_SYMBOL(sysctl_optmem_max);