net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Version:     $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
  11  *
  12  * Authors:     Ross Biro
  13  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Alan Cox, <A.Cox@swansea.ac.uk>
  16  *
  17  * Fixes:
  18  *              Alan Cox        :       Numerous verify_area() problems
  19  *              Alan Cox        :       Connecting on a connecting socket
  20  *                                      now returns an error for tcp.
  21  *              Alan Cox        :       sock->protocol is set correctly.
  22  *                                      and is not sometimes left as 0.
  23  *              Alan Cox        :       connect handles icmp errors on a
  24  *                                      connect properly. Unfortunately there
  25  *                                      is a restart syscall nasty there. I
  26  *                                      can't match BSD without hacking the C
  27  *                                      library. Ideas urgently sought!
  28  *              Alan Cox        :       Disallow bind() to addresses that are
  29  *                                      not ours - especially broadcast ones!!
  30  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  31  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  32  *                                      instead they leave that for the DESTROY timer.
  33  *              Alan Cox        :       Clean up error flag in accept
  34  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  35  *                                      was buggy. Put a remove_sock() in the handler
  36  *                                      for memory when we hit 0. Also altered the timer
  37  *                                      code. The ACK stuff can wait and needs major
  38  *                                      TCP layer surgery.
  39  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  40  *                                      and fixed timer/inet_bh race.
  41  *              Alan Cox        :       Added zapped flag for TCP
  42  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  43  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  44  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  45  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  46  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  47  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  48  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  49  *      Pauline Middelink       :       identd support
  50  *              Alan Cox        :       Fixed connect() taking signals I think.
  51  *              Alan Cox        :       SO_LINGER supported
  52  *              Alan Cox        :       Error reporting fixes
  53  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  54  *              Alan Cox        :       inet sockets don't set sk->type!
  55  *              Alan Cox        :       Split socket option code
  56  *              Alan Cox        :       Callbacks
  57  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  58  *              Alex            :       Removed restriction on inet fioctl
  59  *              Alan Cox        :       Splitting INET from NET core
  60  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  61  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  62  *              Alan Cox        :       Split IP from generic code
  63  *              Alan Cox        :       New kfree_skbmem()
  64  *              Alan Cox        :       Make SO_DEBUG superuser only.
  65  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  66  *                                      (compatibility fix)
  67  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  68  *              Alan Cox        :       Allocator for a socket is settable.
  69  *              Alan Cox        :       SO_ERROR includes soft errors.
  70  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  71  *              Alan Cox        :       Generic socket allocation to make hooks
  72  *                                      easier (suggested by Craig Metz).
  73  *              Michael Pall    :       SO_ERROR returns positive errno again
  74  *              Steve Whitehouse:       Added default destructor to free
  75  *                                      protocol private data.
  76  *              Steve Whitehouse:       Added various other default routines
  77  *                                      common to several socket families.
  78  *              Chris Evans     :       Call suser() check last on F_SETOWN
  79  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  80  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  81  *              Andi Kleen      :       Fix write_space callback
  82  *              Chris Evans     :       Security fixes - signedness again
  83  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  84  *
  85  * To Fix:
  86  *
  87  *
  88  *              This program is free software; you can redistribute it and/or
  89  *              modify it under the terms of the GNU General Public License
  90  *              as published by the Free Software Foundation; either version
  91  *              2 of the License, or (at your option) any later version.
  92  */
  93
  94 #include <linux/capability.h>
  95 #include <linux/config.h>
  96 #include <linux/errno.h>
  97 #include <linux/types.h>
  98 #include <linux/socket.h>
  99 #include <linux/in.h>
 100 #include <linux/kernel.h>
 101 #include <linux/module.h>
 102 #include <linux/proc_fs.h>
 103 #include <linux/seq_file.h>
 104 #include <linux/sched.h>
 105 #include <linux/timer.h>
 106 #include <linux/string.h>
 107 #include <linux/sockios.h>
 108 #include <linux/net.h>
 109 #include <linux/mm.h>
 110 #include <linux/slab.h>
 111 #include <linux/interrupt.h>
 112 #include <linux/poll.h>
 113 #include <linux/tcp.h>
 114 #include <linux/init.h>
 115
 116 #include <asm/uaccess.h>
 117 #include <asm/system.h>
 118
 119 #include <linux/netdevice.h>
 120 #include <net/protocol.h>
 121 #include <linux/skbuff.h>
 122 #include <net/request_sock.h>
 123 #include <net/sock.h>
 124 #include <net/xfrm.h>
 125 #include <linux/ipsec.h>
 126
 127 #include <linux/filter.h>
 128
 129 #ifdef CONFIG_INET
 130 #include <net/tcp.h>
 131 #endif
 132
  133 /* The limits below are built from sizeof(struct sk_buff), which differs
  134  * between platforms, plus a fixed 256 bytes per packet.  Folding that
  135  * overhead in keeps socket queueing behavior and performance from
  136  * depending on the platform's sk_buff size.
  137  */
  138 #define _SK_MEM_PACKETS         256
  139 #define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
  140 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
  141 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
  142
  143 /* Run time adjustable parameters; sock_setsockopt() caps SO_SNDBUF/SO_RCVBUF requests at the *_max values. */
  144 __u32 sysctl_wmem_max = SK_WMEM_MAX;
  145 __u32 sysctl_rmem_max = SK_RMEM_MAX;
  146 __u32 sysctl_wmem_default = SK_WMEM_MAX;
  147 __u32 sysctl_rmem_default = SK_RMEM_MAX;
  148
  149 /* Maximal space eaten by iovec or ancillary data plus some space */
  150 int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);
 151
 152 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 153 {
 154         struct timeval tv;
 155
 156         if (optlen < sizeof(tv))
 157                 return -EINVAL;
 158         if (copy_from_user(&tv, optval, sizeof(tv)))
 159                 return -EFAULT;
 160
 161         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 162         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 163                 return 0;
 164         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 165                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
 166         return 0;
 167 }
 168
 169 static void sock_warn_obsolete_bsdism(const char *name)
 170 {
 171         static int warned;
 172         static char warncomm[TASK_COMM_LEN];
 173         if (strcmp(warncomm, current->comm) && warned < 5) {
 174                 strcpy(warncomm,  current->comm);
 175                 printk(KERN_WARNING "process `%s' is using obsolete "
 176                        "%s SO_BSDCOMPAT\n", warncomm, name);
 177                 warned++;
 178         }
 179 }
 180
 181 static void sock_disable_timestamp(struct sock *sk)
 182 {
 183         if (sock_flag(sk, SOCK_TIMESTAMP)) {
 184                 sock_reset_flag(sk, SOCK_TIMESTAMP);
 185                 net_disable_timestamp();
 186         }
 187 }
 188
 189
 190 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 191 {
 192         int err = 0;
 193         int skb_len;
 194
 195         /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
 196            number of warnings when compiling with -W --ANK
 197          */
 198         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
 199             (unsigned)sk->sk_rcvbuf) {
 200                 err = -ENOMEM;
 201                 goto out;
 202         }
 203
 204         /* It would be deadlock, if sock_queue_rcv_skb is used
 205            with socket lock! We assume that users of this
 206            function are lock free.
 207         */
 208         err = sk_filter(sk, skb, 1);
 209         if (err)
 210                 goto out;
 211
 212         skb->dev = NULL;
 213         skb_set_owner_r(skb, sk);
 214
 215         /* Cache the SKB length before we tack it onto the receive
 216          * queue.  Once it is added it no longer belongs to us and
 217          * may be freed by other threads of control pulling packets
 218          * from the queue.
 219          */
 220         skb_len = skb->len;
 221
 222         skb_queue_tail(&sk->sk_receive_queue, skb);
 223
 224         if (!sock_flag(sk, SOCK_DEAD))
 225                 sk->sk_data_ready(sk, skb_len);
 226 out:
 227         return err;
 228 }
 229 EXPORT_SYMBOL(sock_queue_rcv_skb);
 230
 231 int sk_receive_skb(struct sock *sk, struct sk_buff *skb)
 232 {
 233         int rc = NET_RX_SUCCESS;
 234
 235         if (sk_filter(sk, skb, 0))
 236                 goto discard_and_relse;
 237
 238         skb->dev = NULL;
 239
 240         bh_lock_sock(sk);
 241         if (!sock_owned_by_user(sk))
 242                 rc = sk->sk_backlog_rcv(sk, skb);
 243         else
 244                 sk_add_backlog(sk, skb);
 245         bh_unlock_sock(sk);
 246 out:
 247         sock_put(sk);
 248         return rc;
 249 discard_and_relse:
 250         kfree_skb(skb);
 251         goto out;
 252 }
 253 EXPORT_SYMBOL(sk_receive_skb);
 254
 255 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 256 {
 257         struct dst_entry *dst = sk->sk_dst_cache;
 258
 259         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 260                 sk->sk_dst_cache = NULL;
 261                 dst_release(dst);
 262                 return NULL;
 263         }
 264
 265         return dst;
 266 }
 267 EXPORT_SYMBOL(__sk_dst_check);
 268
 269 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 270 {
 271         struct dst_entry *dst = sk_dst_get(sk);
 272
 273         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 274                 sk_dst_reset(sk);
 275                 dst_release(dst);
 276                 return NULL;
 277         }
 278
 279         return dst;
 280 }
 281 EXPORT_SYMBOL(sk_dst_check);
 282
 283 /*
 284  *      This is meant for all protocols to use and covers goings on
 285  *      at the socket level. Everything here is generic.
 286  */
 287
 288 int sock_setsockopt(struct socket *sock, int level, int optname,
 289                     char __user *optval, int optlen)
 290 {
 291         struct sock *sk=sock->sk;
 292         struct sk_filter *filter;
 293         int val;
 294         int valbool;
 295         struct linger ling;
 296         int ret = 0;
 297
 298         /*
 299          *      Options without arguments
 300          */
 301
 302 #ifdef SO_DONTLINGER            /* Compatibility item... */
 303         if (optname == SO_DONTLINGER) {
 304                 lock_sock(sk);
 305                 sock_reset_flag(sk, SOCK_LINGER);
 306                 release_sock(sk);
 307                 return 0;
 308         }
 309 #endif
 310
 311         if(optlen<sizeof(int))
 312                 return(-EINVAL);
 313
 314         if (get_user(val, (int __user *)optval))
 315                 return -EFAULT;
 316
 317         valbool = val?1:0;
 318
 319         lock_sock(sk);
 320
 321         switch(optname)
 322         {
 323                 case SO_DEBUG:
 324                         if(val && !capable(CAP_NET_ADMIN))
 325                         {
 326                                 ret = -EACCES;
 327                         }
 328                         else if (valbool)
 329                                 sock_set_flag(sk, SOCK_DBG);
 330                         else
 331                                 sock_reset_flag(sk, SOCK_DBG);
 332                         break;
 333                 case SO_REUSEADDR:
 334                         sk->sk_reuse = valbool;
 335                         break;
 336                 case SO_TYPE:
 337                 case SO_ERROR:
 338                         ret = -ENOPROTOOPT;
 339                         break;
 340                 case SO_DONTROUTE:
 341                         if (valbool)
 342                                 sock_set_flag(sk, SOCK_LOCALROUTE);
 343                         else
 344                                 sock_reset_flag(sk, SOCK_LOCALROUTE);
 345                         break;
 346                 case SO_BROADCAST:
 347                         sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 348                         break;
 349                 case SO_SNDBUF:
 350                         /* Don't error on this BSD doesn't and if you think
 351                            about it this is right. Otherwise apps have to
 352                            play 'guess the biggest size' games. RCVBUF/SNDBUF
 353                            are treated in BSD as hints */
 354
 355                         if (val > sysctl_wmem_max)
 356                                 val = sysctl_wmem_max;
 357 set_sndbuf:
 358                         sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 359                         if ((val * 2) < SOCK_MIN_SNDBUF)
 360                                 sk->sk_sndbuf = SOCK_MIN_SNDBUF;
 361                         else
 362                                 sk->sk_sndbuf = val * 2;
 363
 364                         /*
 365                          *      Wake up sending tasks if we
 366                          *      upped the value.
 367                          */
 368                         sk->sk_write_space(sk);
 369                         break;
 370
 371                 case SO_SNDBUFFORCE:
 372                         if (!capable(CAP_NET_ADMIN)) {
 373                                 ret = -EPERM;
 374                                 break;
 375                         }
 376                         goto set_sndbuf;
 377
 378                 case SO_RCVBUF:
 379                         /* Don't error on this BSD doesn't and if you think
 380                            about it this is right. Otherwise apps have to
 381                            play 'guess the biggest size' games. RCVBUF/SNDBUF
 382                            are treated in BSD as hints */
 383
 384                         if (val > sysctl_rmem_max)
 385                                 val = sysctl_rmem_max;
 386 set_rcvbuf:
 387                         sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 388                         /*
 389                          * We double it on the way in to account for
 390                          * "struct sk_buff" etc. overhead.   Applications
 391                          * assume that the SO_RCVBUF setting they make will
 392                          * allow that much actual data to be received on that
 393                          * socket.
 394                          *
 395                          * Applications are unaware that "struct sk_buff" and
 396                          * other overheads allocate from the receive buffer
 397                          * during socket buffer allocation.
 398                          *
 399                          * And after considering the possible alternatives,
 400                          * returning the value we actually used in getsockopt
 401                          * is the most desirable behavior.
 402                          */
 403                         if ((val * 2) < SOCK_MIN_RCVBUF)
 404                                 sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
 405                         else
 406                                 sk->sk_rcvbuf = val * 2;
 407                         break;
 408
 409                 case SO_RCVBUFFORCE:
 410                         if (!capable(CAP_NET_ADMIN)) {
 411                                 ret = -EPERM;
 412                                 break;
 413                         }
 414                         goto set_rcvbuf;
 415
 416                 case SO_KEEPALIVE:
 417 #ifdef CONFIG_INET
 418                         if (sk->sk_protocol == IPPROTO_TCP)
 419                                 tcp_set_keepalive(sk, valbool);
 420 #endif
 421                         sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 422                         break;
 423
 424                 case SO_OOBINLINE:
 425                         sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 426                         break;
 427
 428                 case SO_NO_CHECK:
 429                         sk->sk_no_check = valbool;
 430                         break;
 431
 432                 case SO_PRIORITY:
 433                         if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
 434                                 sk->sk_priority = val;
 435                         else
 436                                 ret = -EPERM;
 437                         break;
 438
 439                 case SO_LINGER:
 440                         if(optlen<sizeof(ling)) {
 441                                 ret = -EINVAL;  /* 1003.1g */
 442                                 break;
 443                         }
 444                         if (copy_from_user(&ling,optval,sizeof(ling))) {
 445                                 ret = -EFAULT;
 446                                 break;
 447                         }
 448                         if (!ling.l_onoff)
 449                                 sock_reset_flag(sk, SOCK_LINGER);
 450                         else {
 451 #if (BITS_PER_LONG == 32)
 452                                 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 453                                         sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 454                                 else
 455 #endif
 456                                         sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 457                                 sock_set_flag(sk, SOCK_LINGER);
 458                         }
 459                         break;
 460
 461                 case SO_BSDCOMPAT:
 462                         sock_warn_obsolete_bsdism("setsockopt");
 463                         break;
 464
 465                 case SO_PASSCRED:
 466                         if (valbool)
 467                                 set_bit(SOCK_PASSCRED, &sock->flags);
 468                         else
 469                                 clear_bit(SOCK_PASSCRED, &sock->flags);
 470                         break;
 471
 472                 case SO_TIMESTAMP:
 473                         if (valbool)  {
 474                                 sock_set_flag(sk, SOCK_RCVTSTAMP);
 475                                 sock_enable_timestamp(sk);
 476                         } else
 477                                 sock_reset_flag(sk, SOCK_RCVTSTAMP);
 478                         break;
 479
 480                 case SO_RCVLOWAT:
 481                         if (val < 0)
 482                                 val = INT_MAX;
 483                         sk->sk_rcvlowat = val ? : 1;
 484                         break;
 485
 486                 case SO_RCVTIMEO:
 487                         ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 488                         break;
 489
 490                 case SO_SNDTIMEO:
 491                         ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 492                         break;
 493
 494 #ifdef CONFIG_NETDEVICES
 495                 case SO_BINDTODEVICE:
 496                 {
 497                         char devname[IFNAMSIZ];
 498
 499                         /* Sorry... */
 500                         if (!capable(CAP_NET_RAW)) {
 501                                 ret = -EPERM;
 502                                 break;
 503                         }
 504
 505                         /* Bind this socket to a particular device like "eth0",
 506                          * as specified in the passed interface name. If the
 507                          * name is "" or the option length is zero the socket
 508                          * is not bound.
 509                          */
 510
 511                         if (!valbool) {
 512                                 sk->sk_bound_dev_if = 0;
 513                         } else {
 514                                 if (optlen > IFNAMSIZ - 1)
 515                                         optlen = IFNAMSIZ - 1;
 516                                 memset(devname, 0, sizeof(devname));
 517                                 if (copy_from_user(devname, optval, optlen)) {
 518                                         ret = -EFAULT;
 519                                         break;
 520                                 }
 521
 522                                 /* Remove any cached route for this socket. */
 523                                 sk_dst_reset(sk);
 524
 525                                 if (devname[0] == '\0') {
 526                                         sk->sk_bound_dev_if = 0;
 527                                 } else {
 528                                         struct net_device *dev = dev_get_by_name(devname);
 529                                         if (!dev) {
 530                                                 ret = -ENODEV;
 531                                                 break;
 532                                         }
 533                                         sk->sk_bound_dev_if = dev->ifindex;
 534                                         dev_put(dev);
 535                                 }
 536                         }
 537                         break;
 538                 }
 539 #endif
 540
 541
 542                 case SO_ATTACH_FILTER:
 543                         ret = -EINVAL;
 544                         if (optlen == sizeof(struct sock_fprog)) {
 545                                 struct sock_fprog fprog;
 546
 547                                 ret = -EFAULT;
 548                                 if (copy_from_user(&fprog, optval, sizeof(fprog)))
 549                                         break;
 550
 551                                 ret = sk_attach_filter(&fprog, sk);
 552                         }
 553                         break;
 554
 555                 case SO_DETACH_FILTER:
 556                         spin_lock_bh(&sk->sk_lock.slock);
 557                         filter = sk->sk_filter;
 558                         if (filter) {
 559                                 sk->sk_filter = NULL;
 560                                 spin_unlock_bh(&sk->sk_lock.slock);
 561                                 sk_filter_release(sk, filter);
 562                                 break;
 563                         }
 564                         spin_unlock_bh(&sk->sk_lock.slock);
 565                         ret = -ENONET;
 566                         break;
 567
 568                 /* We implement the SO_SNDLOWAT etc to
 569                    not be settable (1003.1g 5.3) */
 570                 default:
 571                         ret = -ENOPROTOOPT;
 572                         break;
 573         }
 574         release_sock(sk);
 575         return ret;
 576 }
 577
 578
 579 int sock_getsockopt(struct socket *sock, int level, int optname,
 580                     char __user *optval, int __user *optlen)
 581 {
 582         struct sock *sk = sock->sk;
 583
 584         union
 585         {
 586                 int val;
 587                 struct linger ling;
 588                 struct timeval tm;
 589         } v;
 590
 591         unsigned int lv = sizeof(int);
 592         int len;
 593
 594         if(get_user(len,optlen))
 595                 return -EFAULT;
 596         if(len < 0)
 597                 return -EINVAL;
 598
 599         switch(optname)
 600         {
 601                 case SO_DEBUG:
 602                         v.val = sock_flag(sk, SOCK_DBG);
 603                         break;
 604
 605                 case SO_DONTROUTE:
 606                         v.val = sock_flag(sk, SOCK_LOCALROUTE);
 607                         break;
 608
 609                 case SO_BROADCAST:
 610                         v.val = !!sock_flag(sk, SOCK_BROADCAST);
 611                         break;
 612
 613                 case SO_SNDBUF:
 614                         v.val = sk->sk_sndbuf;
 615                         break;
 616
 617                 case SO_RCVBUF:
 618                         v.val = sk->sk_rcvbuf;
 619                         break;
 620
 621                 case SO_REUSEADDR:
 622                         v.val = sk->sk_reuse;
 623                         break;
 624
 625                 case SO_KEEPALIVE:
 626                         v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
 627                         break;
 628
 629                 case SO_TYPE:
 630                         v.val = sk->sk_type;
 631                         break;
 632
 633                 case SO_ERROR:
 634                         v.val = -sock_error(sk);
 635                         if(v.val==0)
 636                                 v.val = xchg(&sk->sk_err_soft, 0);
 637                         break;
 638
 639                 case SO_OOBINLINE:
 640                         v.val = !!sock_flag(sk, SOCK_URGINLINE);
 641                         break;
 642
 643                 case SO_NO_CHECK:
 644                         v.val = sk->sk_no_check;
 645                         break;
 646
 647                 case SO_PRIORITY:
 648                         v.val = sk->sk_priority;
 649                         break;
 650
 651                 case SO_LINGER:
 652                         lv              = sizeof(v.ling);
 653                         v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
 654                         v.ling.l_linger = sk->sk_lingertime / HZ;
 655                         break;
 656
 657                 case SO_BSDCOMPAT:
 658                         sock_warn_obsolete_bsdism("getsockopt");
 659                         break;
 660
 661                 case SO_TIMESTAMP:
 662                         v.val = sock_flag(sk, SOCK_RCVTSTAMP);
 663                         break;
 664
 665                 case SO_RCVTIMEO:
 666                         lv=sizeof(struct timeval);
 667                         if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
 668                                 v.tm.tv_sec = 0;
 669                                 v.tm.tv_usec = 0;
 670                         } else {
 671                                 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
 672                                 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
 673                         }
 674                         break;
 675
 676                 case SO_SNDTIMEO:
 677                         lv=sizeof(struct timeval);
 678                         if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
 679                                 v.tm.tv_sec = 0;
 680                                 v.tm.tv_usec = 0;
 681                         } else {
 682                                 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
 683                                 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
 684                         }
 685                         break;
 686
 687                 case SO_RCVLOWAT:
 688                         v.val = sk->sk_rcvlowat;
 689                         break;
 690
 691                 case SO_SNDLOWAT:
 692                         v.val=1;
 693                         break;
 694
 695                 case SO_PASSCRED:
 696                         v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
 697                         break;
 698
 699                 case SO_PEERCRED:
 700                         if (len > sizeof(sk->sk_peercred))
 701                                 len = sizeof(sk->sk_peercred);
 702                         if (copy_to_user(optval, &sk->sk_peercred, len))
 703                                 return -EFAULT;
 704                         goto lenout;
 705
 706                 case SO_PEERNAME:
 707                 {
 708                         char address[128];
 709
 710                         if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
 711                                 return -ENOTCONN;
 712                         if (lv < len)
 713                                 return -EINVAL;
 714                         if (copy_to_user(optval, address, len))
 715                                 return -EFAULT;
 716                         goto lenout;
 717                 }
 718
 719                 /* Dubious BSD thing... Probably nobody even uses it, but
 720                  * the UNIX standard wants it for whatever reason... -DaveM
 721                  */
 722                 case SO_ACCEPTCONN:
 723                         v.val = sk->sk_state == TCP_LISTEN;
 724                         break;
 725
 726                 case SO_PEERSEC:
 727                         return security_socket_getpeersec_stream(sock, optval, optlen, len);
 728
 729                 default:
 730                         return(-ENOPROTOOPT);
 731         }
 732         if (len > lv)
 733                 len = lv;
 734         if (copy_to_user(optval, &v, len))
 735                 return -EFAULT;
 736 lenout:
 737         if (put_user(len, optlen))
 738                 return -EFAULT;
 739         return 0;
 740 }
 741
 742 /**
 743  *      sk_alloc - All socket objects are allocated here
 744  *      @family: protocol family
 745  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 746  *      @prot: struct proto associated with this new sock instance
 747  *      @zero_it: if we should zero the newly allocated sock
 748  */
 749 struct sock *sk_alloc(int family, gfp_t priority,
 750                       struct proto *prot, int zero_it)
 751 {
 752         struct sock *sk = NULL;
 753         kmem_cache_t *slab = prot->slab;
 754
 755         if (slab != NULL)
 756                 sk = kmem_cache_alloc(slab, priority);
 757         else
 758                 sk = kmalloc(prot->obj_size, priority);
 759
 760         if (sk) {
 761                 if (zero_it) {
 762                         memset(sk, 0, prot->obj_size);
 763                         sk->sk_family = family;
 764                         /*
 765                          * See comment in struct sock definition to understand
 766                          * why we need sk_prot_creator -acme
 767                          */
 768                         sk->sk_prot = sk->sk_prot_creator = prot;
 769                         sock_lock_init(sk);
 770                 }
 771
 772                 if (security_sk_alloc(sk, family, priority))
 773                         goto out_free;
 774
 775                 if (!try_module_get(prot->owner))
 776                         goto out_free;
 777         }
 778         return sk;
 779
 780 out_free:
 781         if (slab != NULL)
 782                 kmem_cache_free(slab, sk);
 783         else
 784                 kfree(sk);
 785         return NULL;
 786 }
 787
 788 void sk_free(struct sock *sk)
 789 {
 790         struct sk_filter *filter;
 791         struct module *owner = sk->sk_prot_creator->owner;
 792
 793         if (sk->sk_destruct)
 794                 sk->sk_destruct(sk);
 795
 796         filter = sk->sk_filter;
 797         if (filter) {
 798                 sk_filter_release(sk, filter);
 799                 sk->sk_filter = NULL;
 800         }
 801
 802         sock_disable_timestamp(sk);
 803
 804         if (atomic_read(&sk->sk_omem_alloc))
 805                 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
 806                        __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
 807
 808         security_sk_free(sk);
 809         if (sk->sk_prot_creator->slab != NULL)
 810                 kmem_cache_free(sk->sk_prot_creator->slab, sk);
 811         else
 812                 kfree(sk);
 813         module_put(owner);
 814 }
 815
/*
 * sk_clone - create a new sock as a copy of an existing one
 * @sk: sock to copy from
 * @priority: allocation flags passed to sk_alloc()
 *
 * Allocates an un-zeroed sock of the same family and proto, copies
 * sk->sk_prot->obj_size bytes of @sk into it and then resets the state
 * that must not be shared with the parent (hash node, locks, memory
 * accounting, queues, dst cache, backlog, socket and sleep pointers).
 *
 * Returns the clone with bh_lock_sock() taken and sk_refcnt set to 2, or
 * NULL if the allocation or xfrm_sk_clone_policy() failed.
 *
 * NOTE(review): the memcpy() overwrites whatever security_sk_alloc() set
 * up in the new object inside sk_alloc() -- confirm how LSM per-sock state
 * is expected to be handled for clones.
 */
struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
{
        /* zero_it == 0: the object is fully overwritten by the memcpy below */
        struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);

        if (newsk != NULL) {
                struct sk_filter *filter;

                memcpy(newsk, sk, sk->sk_prot->obj_size);

                /* SANITY: give the copy its own node and lock state */
                sk_node_init(&newsk->sk_node);
                sock_lock_init(newsk);
                bh_lock_sock(newsk);

                /* The clone starts with no memory charged and empty queues */
                atomic_set(&newsk->sk_rmem_alloc, 0);
                atomic_set(&newsk->sk_wmem_alloc, 0);
                atomic_set(&newsk->sk_omem_alloc, 0);
                skb_queue_head_init(&newsk->sk_receive_queue);
                skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
                skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

                rwlock_init(&newsk->sk_dst_lock);
                rwlock_init(&newsk->sk_callback_lock);

                newsk->sk_dst_cache     = NULL;
                newsk->sk_wmem_queued   = 0;
                newsk->sk_forward_alloc = 0;
                newsk->sk_send_head     = NULL;
                newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
                /* Every user lock is inherited except SOCK_BINDPORT_LOCK */
                newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

                sock_reset_flag(newsk, SOCK_DONE);
                skb_queue_head_init(&newsk->sk_error_queue);

                /* The filter pointer was copied; charge it to the clone too */
                filter = newsk->sk_filter;
                if (filter != NULL)
                        sk_filter_charge(newsk, filter);

                if (unlikely(xfrm_sk_clone_policy(newsk))) {
                        /* It is still raw copy of parent, so invalidate
                         * destructor and make plain sk_free() */
                        newsk->sk_destruct = NULL;
                        sk_free(newsk);
                        newsk = NULL;
                        goto out;
                }

                newsk->sk_err      = 0;
                newsk->sk_priority = 0;
                atomic_set(&newsk->sk_refcnt, 2);

                /*
                 * Increment the counter in the same struct proto as the master
                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
                 * is the same as sk->sk_prot->socks, as this field was copied
                 * with memcpy).
                 *
                 * This _changes_ the previous behaviour, where
                 * tcp_create_openreq_child always was incrementing the
                 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
                 * to be taken into account in all callers. -acme
                 */
                sk_refcnt_debug_inc(newsk);
                /* The clone is not attached to any struct socket yet */
                newsk->sk_socket = NULL;
                newsk->sk_sleep  = NULL;

                if (newsk->sk_prot->sockets_allocated)
                        atomic_inc(newsk->sk_prot->sockets_allocated);
        }
out:
        return newsk;
}

EXPORT_SYMBOL_GPL(sk_clone);
 892
 893 void __init sk_init(void)
 894 {
 895         if (num_physpages <= 4096) {
 896                 sysctl_wmem_max = 32767;
 897                 sysctl_rmem_max = 32767;
 898                 sysctl_wmem_default = 32767;
 899                 sysctl_rmem_default = 32767;
 900         } else if (num_physpages >= 131072) {
 901                 sysctl_wmem_max = 131071;
 902                 sysctl_rmem_max = 131071;
 903         }
 904 }
 905
 906 /*
 907  *      Simple resource managers for sockets.
 908  */
 909
 910
 911 /*
 912  * Write buffer destructor automatically called from kfree_skb.
 913  */
 914 void sock_wfree(struct sk_buff *skb)
 915 {
 916         struct sock *sk = skb->sk;
 917
 918         /* In case it might be waiting for more memory. */
 919         atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
 920         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
 921                 sk->sk_write_space(sk);
 922         sock_put(sk);
 923 }
 924
 925 /*
 926  * Read buffer destructor automatically called from kfree_skb.
 927  */
 928 void sock_rfree(struct sk_buff *skb)
 929 {
 930         struct sock *sk = skb->sk;
 931
 932         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
 933 }
 934
 935
 936 int sock_i_uid(struct sock *sk)
 937 {
 938         int uid;
 939
 940         read_lock(&sk->sk_callback_lock);
 941         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
 942         read_unlock(&sk->sk_callback_lock);
 943         return uid;
 944 }
 945
 946 unsigned long sock_i_ino(struct sock *sk)
 947 {
 948         unsigned long ino;
 949
 950         read_lock(&sk->sk_callback_lock);
 951         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
 952         read_unlock(&sk->sk_callback_lock);
 953         return ino;
 954 }
 955
 956 /*
 957  * Allocate a skb from the socket's send buffer.
 958  */
 959 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
 960                              gfp_t priority)
 961 {
 962         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
 963                 struct sk_buff * skb = alloc_skb(size, priority);
 964                 if (skb) {
 965                         skb_set_owner_w(skb, sk);
 966                         return skb;
 967                 }
 968         }
 969         return NULL;
 970 }
 971
 972 /*
 973  * Allocate a skb from the socket's receive buffer.
 974  */
 975 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
 976                              gfp_t priority)
 977 {
 978         if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
 979                 struct sk_buff *skb = alloc_skb(size, priority);
 980                 if (skb) {
 981                         skb_set_owner_r(skb, sk);
 982                         return skb;
 983                 }
 984         }
 985         return NULL;
 986 }
 987
 988 /*
 989  * Allocate a memory block from the socket's option memory buffer.
 990  */
 991 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
 992 {
 993         if ((unsigned)size <= sysctl_optmem_max &&
 994             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
 995                 void *mem;
 996                 /* First do the add, to avoid the race if kmalloc
 997                  * might sleep.
 998                  */
 999                 atomic_add(size, &sk->sk_omem_alloc);
1000                 mem = kmalloc(size, priority);
1001                 if (mem)
1002                         return mem;
1003                 atomic_sub(size, &sk->sk_omem_alloc);
1004         }
1005         return NULL;
1006 }
1007
1008 /*
1009  * Free an option memory block.
1010  */
1011 void sock_kfree_s(struct sock *sk, void *mem, int size)
1012 {
1013         kfree(mem);
1014         atomic_sub(size, &sk->sk_omem_alloc);
1015 }
1016
1017 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1018    I think, these locks should be removed for datagram sockets.
1019  */
1020 static long sock_wait_for_wmem(struct sock * sk, long timeo)
1021 {
1022         DEFINE_WAIT(wait);
1023
1024         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1025         for (;;) {
1026                 if (!timeo)
1027                         break;
1028                 if (signal_pending(current))
1029                         break;
1030                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1031                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1032                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1033                         break;
1034                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1035                         break;
1036                 if (sk->sk_err)
1037                         break;
1038                 timeo = schedule_timeout(timeo);
1039         }
1040         finish_wait(sk->sk_sleep, &wait);
1041         return timeo;
1042 }
1043
1044
1045 /*
1046  *      Generic send/receive buffer handlers
1047  */
1048
1049 static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1050                                             unsigned long header_len,
1051                                             unsigned long data_len,
1052                                             int noblock, int *errcode)
1053 {
1054         struct sk_buff *skb;
1055         gfp_t gfp_mask;
1056         long timeo;
1057         int err;
1058
1059         gfp_mask = sk->sk_allocation;
1060         if (gfp_mask & __GFP_WAIT)
1061                 gfp_mask |= __GFP_REPEAT;
1062
1063         timeo = sock_sndtimeo(sk, noblock);
1064         while (1) {
1065                 err = sock_error(sk);
1066                 if (err != 0)
1067                         goto failure;
1068
1069                 err = -EPIPE;
1070                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1071                         goto failure;
1072
1073                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1074                         skb = alloc_skb(header_len, sk->sk_allocation);
1075                         if (skb) {
1076                                 int npages;
1077                                 int i;
1078
1079                                 /* No pages, we're done... */
1080                                 if (!data_len)
1081                                         break;
1082
1083                                 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1084                                 skb->truesize += data_len;
1085                                 skb_shinfo(skb)->nr_frags = npages;
1086                                 for (i = 0; i < npages; i++) {
1087                                         struct page *page;
1088                                         skb_frag_t *frag;
1089
1090                                         page = alloc_pages(sk->sk_allocation, 0);
1091                                         if (!page) {
1092                                                 err = -ENOBUFS;
1093                                                 skb_shinfo(skb)->nr_frags = i;
1094                                                 kfree_skb(skb);
1095                                                 goto failure;
1096                                         }
1097
1098                                         frag = &skb_shinfo(skb)->frags[i];
1099                                         frag->page = page;
1100                                         frag->page_offset = 0;
1101                                         frag->size = (data_len >= PAGE_SIZE ?
1102                                                       PAGE_SIZE :
1103                                                       data_len);
1104                                         data_len -= PAGE_SIZE;
1105                                 }
1106
1107                                 /* Full success... */
1108                                 break;
1109                         }
1110                         err = -ENOBUFS;
1111                         goto failure;
1112                 }
1113                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1114                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1115                 err = -EAGAIN;
1116                 if (!timeo)
1117                         goto failure;
1118                 if (signal_pending(current))
1119                         goto interrupted;
1120                 timeo = sock_wait_for_wmem(sk, timeo);
1121         }
1122
1123         skb_set_owner_w(skb, sk);
1124         return skb;
1125
1126 interrupted:
1127         err = sock_intr_errno(timeo);
1128 failure:
1129         *errcode = err;
1130         return NULL;
1131 }
1132
/*
 * sock_alloc_send_skb - allocate a purely linear send skb
 *
 * Same as sock_alloc_send_pskb() with @size as the header length and no
 * page-fragment data.  Returns the skb, or NULL with *@errcode set.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
{
        struct sk_buff *skb;

        skb = sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
        return skb;
}
1138
1139 static void __lock_sock(struct sock *sk)
1140 {
1141         DEFINE_WAIT(wait);
1142
1143         for(;;) {
1144                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1145                                         TASK_UNINTERRUPTIBLE);
1146                 spin_unlock_bh(&sk->sk_lock.slock);
1147                 schedule();
1148                 spin_lock_bh(&sk->sk_lock.slock);
1149                 if(!sock_owned_by_user(sk))
1150                         break;
1151         }
1152         finish_wait(&sk->sk_lock.wq, &wait);
1153 }
1154
/*
 * Process the socket backlog by feeding every queued skb to
 * sk->sk_backlog_rcv().
 *
 * release_sock() calls this with sk_lock.slock held and only when
 * sk_backlog.tail is set.  The queue is detached as a whole, the sock is
 * unlocked with bh_unlock_sock() while the detached skbs are processed,
 * and the lock is retaken afterwards; the outer loop repeats while the
 * backlog head is found non-NULL again.
 */
static void __release_sock(struct sock *sk)
{
        struct sk_buff *skb = sk->sk_backlog.head;

        do {
                /* Take the current chain private before unlocking */
                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
                bh_unlock_sock(sk);

                do {
                        struct sk_buff *next = skb->next;

                        /* Unlink the skb before handing it over */
                        skb->next = NULL;
                        sk->sk_backlog_rcv(sk, skb);

                        /*
                         * We are in process context here with softirqs
                         * disabled, use cond_resched_softirq() to preempt.
                         * This is safe to do because we've taken the backlog
                         * queue private:
                         */
                        cond_resched_softirq();

                        skb = next;
                } while (skb != NULL);

                bh_lock_sock(sk);
        } while((skb = sk->sk_backlog.head) != NULL);
}
1183
/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: pointer to the timeout, passed on to sk_wait_event()
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 *
 * SOCK_ASYNC_WAITDATA is set on the socket for the duration of the wait.
 * Returns the result of sk_wait_event() for the condition "receive queue
 * is not empty".
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
        int rc;
        DEFINE_WAIT(wait);

        /* Join the wait queue before testing the condition */
        prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
        rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
        finish_wait(sk->sk_sleep, &wait);
        return rc;
}

EXPORT_SYMBOL(sk_wait_data);
1208
1209 /*
1210  * Set of default routines for initialising struct proto_ops when
1211  * the protocol does not support a particular function. In certain
1212  * cases where it makes no sense for a protocol to have a "do nothing"
1213  * function, some default processing is provided.
1214  */
1215
/* bind() stub for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}
1220
/* connect() stub for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock,
                    struct sockaddr *saddr,
                    int len,
                    int flags)
{
        return -EOPNOTSUPP;
}
1226
/* socketpair() stub for protocols that cannot pair sockets: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}
1231
/* accept() stub for protocols without accept support: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
        return -EOPNOTSUPP;
}
1236
/*
 * getname() stub for protocols that cannot report an address:
 * always -EOPNOTSUPP, @saddr and @len are left untouched.
 */
int sock_no_getname(struct socket *sock,
                    struct sockaddr *saddr,
                    int *len,
                    int peer)
{
        return -EOPNOTSUPP;
}
1242
1243 unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1244 {
1245         return 0;
1246 }
1247
/* ioctl() stub for protocols without ioctl support: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock,
                  unsigned int cmd,
                  unsigned long arg)
{
        return -EOPNOTSUPP;
}
1252
/* listen() stub for protocols that cannot listen: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}
1257
/* shutdown() stub for protocols without shutdown support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}
1262
1263 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1264                     char __user *optval, int optlen)
1265 {
1266         return -EOPNOTSUPP;
1267 }
1268
1269 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1270                     char __user *optval, int __user *optlen)
1271 {
1272         return -EOPNOTSUPP;
1273 }
1274
/* sendmsg() stub for protocols that cannot send: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct kiocb *iocb,
                    struct socket *sock,
                    struct msghdr *m,
                    size_t len)
{
        return -EOPNOTSUPP;
}
1280
/* recvmsg() stub for protocols that cannot receive: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct kiocb *iocb,
                    struct socket *sock,
                    struct msghdr *m,
                    size_t len,
                    int flags)
{
        return -EOPNOTSUPP;
}
1286
/*
 * mmap() stub for protocols that cannot be mapped.  Returns -ENODEV
 * rather than -EOPNOTSUPP, mirroring the error for a missing mmap method.
 */
int sock_no_mmap(struct file *file,
                 struct socket *sock,
                 struct vm_area_struct *vma)
{
        return -ENODEV;
}
1292
1293 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1294 {
1295         ssize_t res;
1296         struct msghdr msg = {.msg_flags = flags};
1297         struct kvec iov;
1298         char *kaddr = kmap(page);
1299         iov.iov_base = kaddr + offset;
1300         iov.iov_len = size;
1301         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1302         kunmap(page);
1303         return res;
1304 }
1305
1306 /*
1307  *      Default Socket Callbacks
1308  */
1309
1310 static void sock_def_wakeup(struct sock *sk)
1311 {
1312         read_lock(&sk->sk_callback_lock);
1313         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1314                 wake_up_interruptible_all(sk->sk_sleep);
1315         read_unlock(&sk->sk_callback_lock);
1316 }
1317
1318 static void sock_def_error_report(struct sock *sk)
1319 {
1320         read_lock(&sk->sk_callback_lock);
1321         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1322                 wake_up_interruptible(sk->sk_sleep);
1323         sk_wake_async(sk,0,POLL_ERR);
1324         read_unlock(&sk->sk_callback_lock);
1325 }
1326
1327 static void sock_def_readable(struct sock *sk, int len)
1328 {
1329         read_lock(&sk->sk_callback_lock);
1330         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1331                 wake_up_interruptible(sk->sk_sleep);
1332         sk_wake_async(sk,1,POLL_IN);
1333         read_unlock(&sk->sk_callback_lock);
1334 }
1335
1336 static void sock_def_write_space(struct sock *sk)
1337 {
1338         read_lock(&sk->sk_callback_lock);
1339
1340         /* Do not wake up a writer until he can make "significant"
1341          * progress.  --DaveM
1342          */
1343         if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1344                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1345                         wake_up_interruptible(sk->sk_sleep);
1346
1347                 /* Should agree with poll, otherwise some programs break */
1348                 if (sock_writeable(sk))
1349                         sk_wake_async(sk, 2, POLL_OUT);
1350         }
1351
1352         read_unlock(&sk->sk_callback_lock);
1353 }
1354
1355 static void sock_def_destruct(struct sock *sk)
1356 {
1357         kfree(sk->sk_protinfo);
1358 }
1359
1360 void sk_send_sigurg(struct sock *sk)
1361 {
1362         if (sk->sk_socket && sk->sk_socket->file)
1363                 if (send_sigurg(&sk->sk_socket->file->f_owner))
1364                         sk_wake_async(sk, 3, POLL_PRI);
1365 }
1366
/*
 * sk_reset_timer - (re)arm a timer belonging to a sock
 * @sk: sock the timer belongs to
 * @timer: timer to modify
 * @expires: new expiry time
 *
 * Takes a sock reference when mod_timer() returns zero (presumably
 * meaning the timer was not pending before -- confirm against mod_timer).
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
                    unsigned long expires)
{
        if (mod_timer(timer, expires))
                return;

        sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);
1375
/*
 * sk_stop_timer - cancel a timer belonging to a sock
 * @sk: sock the timer belongs to
 * @timer: timer to cancel
 *
 * When the timer was pending and del_timer() reports success, one sock
 * reference is dropped with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
        if (!timer_pending(timer))
                return;

        if (del_timer(timer))
                __sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);
1383
1384 void sock_init_data(struct socket *sock, struct sock *sk)
1385 {
1386         skb_queue_head_init(&sk->sk_receive_queue);
1387         skb_queue_head_init(&sk->sk_write_queue);
1388         skb_queue_head_init(&sk->sk_error_queue);
1389 #ifdef CONFIG_NET_DMA
1390         skb_queue_head_init(&sk->sk_async_wait_queue);
1391 #endif
1392
1393         sk->sk_send_head        =       NULL;
1394
1395         init_timer(&sk->sk_timer);
1396
1397         sk->sk_allocation       =       GFP_KERNEL;
1398         sk->sk_rcvbuf           =       sysctl_rmem_default;
1399         sk->sk_sndbuf           =       sysctl_wmem_default;
1400         sk->sk_state            =       TCP_CLOSE;
1401         sk->sk_socket           =       sock;
1402
1403         sock_set_flag(sk, SOCK_ZAPPED);
1404
1405         if(sock)
1406         {
1407                 sk->sk_type     =       sock->type;
1408                 sk->sk_sleep    =       &sock->wait;
1409                 sock->sk        =       sk;
1410         } else
1411                 sk->sk_sleep    =       NULL;
1412
1413         rwlock_init(&sk->sk_dst_lock);
1414         rwlock_init(&sk->sk_callback_lock);
1415
1416         sk->sk_state_change     =       sock_def_wakeup;
1417         sk->sk_data_ready       =       sock_def_readable;
1418         sk->sk_write_space      =       sock_def_write_space;
1419         sk->sk_error_report     =       sock_def_error_report;
1420         sk->sk_destruct         =       sock_def_destruct;
1421
1422         sk->sk_sndmsg_page      =       NULL;
1423         sk->sk_sndmsg_off       =       0;
1424
1425         sk->sk_peercred.pid     =       0;
1426         sk->sk_peercred.uid     =       -1;
1427         sk->sk_peercred.gid     =       -1;
1428         sk->sk_write_pending    =       0;
1429         sk->sk_rcvlowat         =       1;
1430         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
1431         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
1432
1433         sk->sk_stamp.tv_sec     = -1L;
1434         sk->sk_stamp.tv_usec    = -1L;
1435
1436         atomic_set(&sk->sk_refcnt, 1);
1437 }
1438
1439 void fastcall lock_sock(struct sock *sk)
1440 {
1441         might_sleep();
1442         spin_lock_bh(&(sk->sk_lock.slock));
1443         if (sk->sk_lock.owner)
1444                 __lock_sock(sk);
1445         sk->sk_lock.owner = (void *)1;
1446         spin_unlock_bh(&(sk->sk_lock.slock));
1447 }
1448
1449 EXPORT_SYMBOL(lock_sock);
1450
1451 void fastcall release_sock(struct sock *sk)
1452 {
1453         spin_lock_bh(&(sk->sk_lock.slock));
1454         if (sk->sk_backlog.tail)
1455                 __release_sock(sk);
1456         sk->sk_lock.owner = NULL;
1457         if (waitqueue_active(&(sk->sk_lock.wq)))
1458                 wake_up(&(sk->sk_lock.wq));
1459         spin_unlock_bh(&(sk->sk_lock.slock));
1460 }
1461 EXPORT_SYMBOL(release_sock);
1462
1463 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1464 {
1465         if (!sock_flag(sk, SOCK_TIMESTAMP))
1466                 sock_enable_timestamp(sk);
1467         if (sk->sk_stamp.tv_sec == -1)
1468                 return -ENOENT;
1469         if (sk->sk_stamp.tv_sec == 0)
1470                 do_gettimeofday(&sk->sk_stamp);
1471         return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
1472                 -EFAULT : 0;
1473 }
1474 EXPORT_SYMBOL(sock_get_timestamp);
1475
1476 void sock_enable_timestamp(struct sock *sk)
1477 {
1478         if (!sock_flag(sk, SOCK_TIMESTAMP)) {
1479                 sock_set_flag(sk, SOCK_TIMESTAMP);
1480                 net_enable_timestamp();
1481         }
1482 }
1483 EXPORT_SYMBOL(sock_enable_timestamp);
1484
1485 /*
1486  *      Get a socket option on an socket.
1487  *
1488  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
1489  *      asynchronous errors should be reported by getsockopt. We assume
1490  *      this means if you specify SO_ERROR (otherwise whats the point of it).
1491  */
1492 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1493                            char __user *optval, int __user *optlen)
1494 {
1495         struct sock *sk = sock->sk;
1496
1497         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1498 }
1499
1500 EXPORT_SYMBOL(sock_common_getsockopt);
1501
1502 #ifdef CONFIG_COMPAT
1503 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1504                                   char __user *optval, int __user *optlen)
1505 {
1506         struct sock *sk = sock->sk;
1507
1508         if (sk->sk_prot->compat_setsockopt != NULL)
1509                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
1510                                                       optval, optlen);
1511         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1512 }
1513 EXPORT_SYMBOL(compat_sock_common_getsockopt);
1514 #endif
1515
1516 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1517                         struct msghdr *msg, size_t size, int flags)
1518 {
1519         struct sock *sk = sock->sk;
1520         int addr_len = 0;
1521         int err;
1522
1523         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1524                                    flags & ~MSG_DONTWAIT, &addr_len);
1525         if (err >= 0)
1526                 msg->msg_namelen = addr_len;
1527         return err;
1528 }
1529
1530 EXPORT_SYMBOL(sock_common_recvmsg);
1531
1532 /*
1533  *      Set socket options on an inet socket.
1534  */
1535 int sock_common_setsockopt(struct socket *sock, int level, int optname,
1536                            char __user *optval, int optlen)
1537 {
1538         struct sock *sk = sock->sk;
1539
1540         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1541 }
1542
1543 EXPORT_SYMBOL(sock_common_setsockopt);
1544
1545 #ifdef CONFIG_COMPAT
1546 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1547                                   char __user *optval, int optlen)
1548 {
1549         struct sock *sk = sock->sk;
1550
1551         if (sk->sk_prot->compat_setsockopt != NULL)
1552                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
1553                                                       optval, optlen);
1554         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1555 }
1556 EXPORT_SYMBOL(compat_sock_common_setsockopt);
1557 #endif
1558
1559 void sk_common_release(struct sock *sk)
1560 {
1561         if (sk->sk_prot->destroy)
1562                 sk->sk_prot->destroy(sk);
1563
1564         /*
1565          * Observation: when sock_common_release is called, processes have
1566          * no access to socket. But net still has.
1567          * Step one, detach it from networking:
1568          *
1569          * A. Remove from hash tables.
1570          */
1571
1572         sk->sk_prot->unhash(sk);
1573
1574         /*
1575          * In this point socket cannot receive new packets, but it is possible
1576          * that some packets are in flight because some CPU runs receiver and
1577          * did hash table lookup before we unhashed socket. They will achieve
1578          * receive queue and will be purged by socket destructor.
1579          *
1580          * Also we still have packets pending on receive queue and probably,
1581          * our own packets waiting in device queues. sock_destroy will drain
1582          * receive queue, but transmitted packets will delay socket destruction
1583          * until the last reference will be released.
1584          */
1585
1586         sock_orphan(sk);
1587
1588         xfrm_sk_free_policy(sk);
1589
1590         sk_refcnt_debug_release(sk);
1591         sock_put(sk);
1592 }
1593
1594 EXPORT_SYMBOL(sk_common_release);
1595
/*
 * List of registered protocols.  proto_register() adds entries to
 * proto_list while holding proto_list_lock for writing.
 */
static DEFINE_RWLOCK(proto_list_lock);
static LIST_HEAD(proto_list);
1598
1599 int proto_register(struct proto *prot, int alloc_slab)
1600 {
1601         char *request_sock_slab_name = NULL;
1602         char *timewait_sock_slab_name;
1603         int rc = -ENOBUFS;
1604
1605         if (alloc_slab) {
1606                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
1607                                                SLAB_HWCACHE_ALIGN, NULL, NULL);
1608
1609                 if (prot->slab == NULL) {
1610                         printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1611                                prot->name);
1612                         goto out;
1613                 }
1614
1615                 if (prot->rsk_prot != NULL) {
1616                         static const char mask[] = "request_sock_%s";
1617
1618                         request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1619                         if (request_sock_slab_name == NULL)
1620                                 goto out_free_sock_slab;
1621
1622                         sprintf(request_sock_slab_name, mask, prot->name);
1623                         prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1624                                                                  prot->rsk_prot->obj_size, 0,
1625                                                                  SLAB_HWCACHE_ALIGN, NULL, NULL);
1626
1627                         if (prot->rsk_prot->slab == NULL) {
1628                                 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1629                                        prot->name);
1630                                 goto out_free_request_sock_slab_name;
1631                         }
1632                 }
1633
1634                 if (prot->twsk_prot != NULL) {
1635                         static const char mask[] = "tw_sock_%s";
1636
1637                         timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1638
1639                         if (timewait_sock_slab_name == NULL)
1640                                 goto out_free_request_sock_slab;
1641
1642                         sprintf(timewait_sock_slab_name, mask, prot->name);
1643                         prot->twsk_prot->twsk_slab =
1644                                 kmem_cache_create(timewait_sock_slab_name,
1645                                                   prot->twsk_prot->twsk_obj_size,
1646                                                   0, SLAB_HWCACHE_ALIGN,
1647                                                   NULL, NULL);
1648                         if (prot->twsk_prot->twsk_slab == NULL)
1649                                 goto out_free_timewait_sock_slab_name;
1650                 }
1651         }
1652
1653         write_lock(&proto_list_lock);
1654         list_add(&prot->node, &proto_list);
1655         write_unlock(&proto_list_lock);
1656         rc = 0;
1657 out:
1658         return rc;
1659 out_free_timewait_sock_slab_name:
1660         kfree(timewait_sock_slab_name);
1661 out_free_request_sock_slab:
1662         if (prot->rsk_prot && prot->rsk_prot->slab) {
1663                 kmem_cache_destroy(prot->rsk_prot->slab);
1664                 prot->rsk_prot->slab = NULL;
1665         }
1666 out_free_request_sock_slab_name:
1667         kfree(request_sock_slab_name);
1668 out_free_sock_slab:
1669         kmem_cache_destroy(prot->slab);
1670         prot->slab = NULL;
1671         goto out;
1672 }
1673
1674 EXPORT_SYMBOL(proto_register);
1675
1676 void proto_unregister(struct proto *prot)
1677 {
1678         write_lock(&proto_list_lock);
1679         list_del(&prot->node);
1680         write_unlock(&proto_list_lock);
1681
1682         if (prot->slab != NULL) {
1683                 kmem_cache_destroy(prot->slab);
1684                 prot->slab = NULL;
1685         }
1686
1687         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
1688                 const char *name = kmem_cache_name(prot->rsk_prot->slab);
1689
1690                 kmem_cache_destroy(prot->rsk_prot->slab);
1691                 kfree(name);
1692                 prot->rsk_prot->slab = NULL;
1693         }
1694
1695         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
1696                 const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
1697
1698                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
1699                 kfree(name);
1700                 prot->twsk_prot->twsk_slab = NULL;
1701         }
1702 }
1703
1704 EXPORT_SYMBOL(proto_unregister);
1705
1706 #ifdef CONFIG_PROC_FS
1707 static inline struct proto *__proto_head(void)
1708 {
1709         return list_entry(proto_list.next, struct proto, node);
1710 }
1711
1712 static inline struct proto *proto_head(void)
1713 {
1714         return list_empty(&proto_list) ? NULL : __proto_head();
1715 }
1716
1717 static inline struct proto *proto_next(struct proto *proto)
1718 {
1719         return proto->node.next == &proto_list ? NULL :
1720                 list_entry(proto->node.next, struct proto, node);
1721 }
1722
1723 static inline struct proto *proto_get_idx(loff_t pos)
1724 {
1725         struct proto *proto;
1726         loff_t i = 0;
1727
1728         list_for_each_entry(proto, &proto_list, node)
1729                 if (i++ == pos)
1730                         goto out;
1731
1732         proto = NULL;
1733 out:
1734         return proto;
1735 }
1736
1737 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
1738 {
1739         read_lock(&proto_list_lock);
1740         return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
1741 }
1742
1743 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1744 {
1745         ++*pos;
1746         return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
1747 }
1748
1749 static void proto_seq_stop(struct seq_file *seq, void *v)
1750 {
1751         read_unlock(&proto_list_lock);
1752 }
1753
/* Map a method pointer to a flag character: 'y' if set, 'n' if NULL. */
static char proto_method_implemented(const void *method)
{
        if (method == NULL)
                return 'n';

        return 'y';
}
1758
1759 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
1760 {
1761         seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
1762                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
1763                    proto->name,
1764                    proto->obj_size,
1765                    proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
1766                    proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
1767                    proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
1768                    proto->max_header,
1769                    proto->slab == NULL ? "no" : "yes",
1770                    module_name(proto->owner),
1771                    proto_method_implemented(proto->close),
1772                    proto_method_implemented(proto->connect),
1773                    proto_method_implemented(proto->disconnect),
1774                    proto_method_implemented(proto->accept),
1775                    proto_method_implemented(proto->ioctl),
1776                    proto_method_implemented(proto->init),
1777                    proto_method_implemented(proto->destroy),
1778                    proto_method_implemented(proto->shutdown),
1779                    proto_method_implemented(proto->setsockopt),
1780                    proto_method_implemented(proto->getsockopt),
1781                    proto_method_implemented(proto->sendmsg),
1782                    proto_method_implemented(proto->recvmsg),
1783                    proto_method_implemented(proto->sendpage),
1784                    proto_method_implemented(proto->bind),
1785                    proto_method_implemented(proto->backlog_rcv),
1786                    proto_method_implemented(proto->hash),
1787                    proto_method_implemented(proto->unhash),
1788                    proto_method_implemented(proto->get_port),
1789                    proto_method_implemented(proto->enter_memory_pressure));
1790 }
1791
1792 static int proto_seq_show(struct seq_file *seq, void *v)
1793 {
1794         if (v == SEQ_START_TOKEN)
1795                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
1796                            "protocol",
1797                            "size",
1798                            "sockets",
1799                            "memory",
1800                            "press",
1801                            "maxhdr",
1802                            "slab",
1803                            "module",
1804                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
1805         else
1806                 proto_seq_printf(seq, v);
1807         return 0;
1808 }
1809
1810 static struct seq_operations proto_seq_ops = {
1811         .start  = proto_seq_start,
1812         .next   = proto_seq_next,
1813         .stop   = proto_seq_stop,
1814         .show   = proto_seq_show,
1815 };
1816
1817 static int proto_seq_open(struct inode *inode, struct file *file)
1818 {
1819         return seq_open(file, &proto_seq_ops);
1820 }
1821
1822 static struct file_operations proto_seq_fops = {
1823         .owner          = THIS_MODULE,
1824         .open           = proto_seq_open,
1825         .read           = seq_read,
1826         .llseek         = seq_lseek,
1827         .release        = seq_release,
1828 };
1829
1830 static int __init proto_init(void)
1831 {
1832         /* register /proc/net/protocols */
1833         return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
1834 }
1835
1836 subsys_initcall(proto_init);
1837
1838 #endif /* PROC_FS */
1839
/* Symbols exported from this file, grouped by name prefix. */

/* sk_* */
EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
EXPORT_SYMBOL(sk_send_sigurg);

/* sock_no_* */
EXPORT_SYMBOL(sock_no_accept);
EXPORT_SYMBOL(sock_no_bind);
EXPORT_SYMBOL(sock_no_connect);
EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
EXPORT_SYMBOL(sock_no_sendmsg);
EXPORT_SYMBOL(sock_no_sendpage);
EXPORT_SYMBOL(sock_no_setsockopt);
EXPORT_SYMBOL(sock_no_shutdown);
EXPORT_SYMBOL(sock_no_socketpair);

/* remaining sock_* */
EXPORT_SYMBOL(sock_alloc_send_skb);
EXPORT_SYMBOL(sock_i_ino);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_init_data);
EXPORT_SYMBOL(sock_kfree_s);
EXPORT_SYMBOL(sock_kmalloc);
EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);

/* sysctl_*; the rmem/wmem limits are only exported with CONFIG_SYSCTL */
EXPORT_SYMBOL(sysctl_optmem_max);
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_rmem_max);
EXPORT_SYMBOL(sysctl_wmem_max);
#endif