net/unix/af_unix.c

   1 /*
   2  * NET4:        Implementation of BSD Unix domain sockets.
   3  *
   4  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   5  *
   6  *              This program is free software; you can redistribute it and/or
   7  *              modify it under the terms of the GNU General Public License
   8  *              as published by the Free Software Foundation; either version
   9  *              2 of the License, or (at your option) any later version.
  10  *
  11  * Fixes:
  12  *              Linus Torvalds  :       Assorted bug cures.
  13  *              Niibe Yutaka    :       async I/O support.
  14  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  15  *              Alan Cox        :       Limit size of allocated blocks.
  16  *              Alan Cox        :       Fixed the stupid socketpair bug.
  17  *              Alan Cox        :       BSD compatibility fine tuning.
  18  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  19  *              Alan Cox        :       Sorted out a proper draft version of
  20  *                                      file descriptor passing hacked up from
  21  *                                      Mike Shaver's work.
  22  *              Marty Leisner   :       Fixes to fd passing
  23  *              Nick Nevin      :       recvmsg bugfix.
  24  *              Alan Cox        :       Started proper garbage collector
  25  *              Heiko EiBfeldt  :       Missing verify_area check
  26  *              Alan Cox        :       Started POSIXisms
  27  *              Andreas Schwab  :       Replace inode by dentry for proper
  28  *                                      reference counting
  29  *              Kirk Petersen   :       Made this a module
  30  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  31  *                                      Lots of bug fixes.
   32  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
   33  *                                      by above two patches.
   34  *           Andrea Arcangeli   :       If possible we block in connect(2)
   35  *                                      if the max backlog of the listen socket
   36  *                                      has been reached. This won't break
   37  *                                      old apps and it will avoid a huge number
   38  *                                      of socks being hashed (this is for
   39  *                                      unix_gc() performance reasons).
  40  *                                      Security fix that limits the max
  41  *                                      number of socks to 2*max_files and
  42  *                                      the number of skb queueable in the
  43  *                                      dgram receiver.
  44  *              Artur Skawina   :       Hash function optimizations
  45  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  46  *            Malcolm Beattie   :       Set peercred for socketpair
  47  *           Michal Ostrowski   :       Module initialization cleanup.
  48  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  49  *                                      the core infrastructure is doing that
  50  *                                      for all net proto families now (2.5.69+)
  51  *
  52  *
  53  * Known differences from reference BSD that was tested:
  54  *
  55  *      [TO FIX]
  56  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  57  *              other the moment one end closes.
  58  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
  59  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  60  *      [NOT TO FIX]
  61  *      accept() returns a path name even if the connecting socket has closed
  62  *              in the meantime (BSD loses the path and gives up).
  63  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  64  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  65  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  66  *      BSD af_unix apparently has connect forgetting to block properly.
  67  *              (need to check this with the POSIX spec in detail)
  68  *
  69  * Differences from 2.0.0-11-... (ANK)
  70  *      Bug fixes and improvements.
  71  *              - client shutdown killed server socket.
  72  *              - removed all useless cli/sti pairs.
  73  *
  74  *      Semantic changes/extensions.
  75  *              - generic control message passing.
  76  *              - SCM_CREDENTIALS control message.
  77  *              - "Abstract" (not FS based) socket bindings.
  78  *                Abstract names are sequences of bytes (not zero terminated)
  79  *                started by 0, so that this name space does not intersect
  80  *                with BSD names.
  81  */
  82
  83 #include <linux/module.h>
  84 #include <linux/kernel.h>
  85 #include <linux/signal.h>
  86 #include <linux/sched.h>
  87 #include <linux/errno.h>
  88 #include <linux/string.h>
  89 #include <linux/stat.h>
  90 #include <linux/dcache.h>
  91 #include <linux/namei.h>
  92 #include <linux/socket.h>
  93 #include <linux/un.h>
  94 #include <linux/fcntl.h>
  95 #include <linux/termios.h>
  96 #include <linux/sockios.h>
  97 #include <linux/net.h>
  98 #include <linux/in.h>
  99 #include <linux/fs.h>
 100 #include <linux/slab.h>
 101 #include <asm/uaccess.h>
 102 #include <linux/skbuff.h>
 103 #include <linux/netdevice.h>
 104 #include <net/net_namespace.h>
 105 #include <net/sock.h>
 106 #include <net/tcp_states.h>
 107 #include <net/af_unix.h>
 108 #include <linux/proc_fs.h>
 109 #include <linux/seq_file.h>
 110 #include <net/scm.h>
 111 #include <linux/init.h>
 112 #include <linux/poll.h>
 113 #include <linux/rtnetlink.h>
 114 #include <linux/mount.h>
 115 #include <net/checksum.h>
 116 #include <linux/security.h>
 117 #include <linux/freezer.h>
 118
 119 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
 120 EXPORT_SYMBOL_GPL(unix_socket_table);
 121 DEFINE_SPINLOCK(unix_table_lock);
 122 EXPORT_SYMBOL_GPL(unix_table_lock);
 123 static atomic_long_t unix_nr_socks;
 124
 125
 126 static struct hlist_head *unix_sockets_unbound(void *addr)
 127 {
 128         unsigned long hash = (unsigned long)addr;
 129
 130         hash ^= hash >> 16;
 131         hash ^= hash >> 8;
 132         hash %= UNIX_HASH_SIZE;
 133         return &unix_socket_table[UNIX_HASH_SIZE + hash];
 134 }
 135
 136 #define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
 137
 138 #ifdef CONFIG_SECURITY_NETWORK
 139 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 140 {
 141         memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
 142 }
 143
 144 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 145 {
 146         scm->secid = *UNIXSID(skb);
 147 }
 148 #else
 149 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 150 { }
 151
 152 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 153 { }
 154 #endif /* CONFIG_SECURITY_NETWORK */
 155
 156 /*
 157  *  SMP locking strategy:
 158  *    hash table is protected with spinlock unix_table_lock
 159  *    each socket state is protected by separate spin lock.
 160  */
 161
 162 static inline unsigned int unix_hash_fold(__wsum n)
 163 {
 164         unsigned int hash = (__force unsigned int)n;
 165
 166         hash ^= hash>>16;
 167         hash ^= hash>>8;
 168         return hash&(UNIX_HASH_SIZE-1);
 169 }
 170
 171 #define unix_peer(sk) (unix_sk(sk)->peer)
 172
 173 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 174 {
 175         return unix_peer(osk) == sk;
 176 }
 177
 178 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 179 {
 180         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 181 }
 182
 183 static inline int unix_recvq_full(struct sock const *sk)
 184 {
 185         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 186 }
 187
 188 struct sock *unix_peer_get(struct sock *s)
 189 {
 190         struct sock *peer;
 191
 192         unix_state_lock(s);
 193         peer = unix_peer(s);
 194         if (peer)
 195                 sock_hold(peer);
 196         unix_state_unlock(s);
 197         return peer;
 198 }
 199 EXPORT_SYMBOL_GPL(unix_peer_get);
 200
 201 static inline void unix_release_addr(struct unix_address *addr)
 202 {
 203         if (atomic_dec_and_test(&addr->refcnt))
 204                 kfree(addr);
 205 }
 206
 207 /*
 208  *      Check unix socket name:
 209  *              - should be not zero length.
 210  *              - if started by not zero, should be NULL terminated (FS object)
 211  *              - if started by zero, it is abstract name.
 212  */
 213
 214 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
 215 {
 216         if (len <= sizeof(short) || len > sizeof(*sunaddr))
 217                 return -EINVAL;
 218         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
 219                 return -EINVAL;
 220         if (sunaddr->sun_path[0]) {
 221                 /*
 222                  * This may look like an off by one error but it is a bit more
 223                  * subtle. 108 is the longest valid AF_UNIX path for a binding.
 224                  * sun_path[108] doesn't as such exist.  However in kernel space
 225                  * we are guaranteed that it is a valid memory location in our
 226                  * kernel address buffer.
 227                  */
 228                 ((char *)sunaddr)[len] = 0;
 229                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
 230                 return len;
 231         }
 232
 233         *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
 234         return len;
 235 }
 236
 237 static void __unix_remove_socket(struct sock *sk)
 238 {
 239         sk_del_node_init(sk);
 240 }
 241
 242 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
 243 {
 244         WARN_ON(!sk_unhashed(sk));
 245         sk_add_node(sk, list);
 246 }
 247
 248 static inline void unix_remove_socket(struct sock *sk)
 249 {
 250         spin_lock(&unix_table_lock);
 251         __unix_remove_socket(sk);
 252         spin_unlock(&unix_table_lock);
 253 }
 254
 255 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
 256 {
 257         spin_lock(&unix_table_lock);
 258         __unix_insert_socket(list, sk);
 259         spin_unlock(&unix_table_lock);
 260 }
 261
 262 static struct sock *__unix_find_socket_byname(struct net *net,
 263                                               struct sockaddr_un *sunname,
 264                                               int len, int type, unsigned int hash)
 265 {
 266         struct sock *s;
 267
 268         sk_for_each(s, &unix_socket_table[hash ^ type]) {
 269                 struct unix_sock *u = unix_sk(s);
 270
 271                 if (!net_eq(sock_net(s), net))
 272                         continue;
 273
 274                 if (u->addr->len == len &&
 275                     !memcmp(u->addr->name, sunname, len))
 276                         goto found;
 277         }
 278         s = NULL;
 279 found:
 280         return s;
 281 }
 282
 283 static inline struct sock *unix_find_socket_byname(struct net *net,
 284                                                    struct sockaddr_un *sunname,
 285                                                    int len, int type,
 286                                                    unsigned int hash)
 287 {
 288         struct sock *s;
 289
 290         spin_lock(&unix_table_lock);
 291         s = __unix_find_socket_byname(net, sunname, len, type, hash);
 292         if (s)
 293                 sock_hold(s);
 294         spin_unlock(&unix_table_lock);
 295         return s;
 296 }
 297
 298 static struct sock *unix_find_socket_byinode(struct inode *i)
 299 {
 300         struct sock *s;
 301
 302         spin_lock(&unix_table_lock);
 303         sk_for_each(s,
 304                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 305                 struct dentry *dentry = unix_sk(s)->path.dentry;
 306
 307                 if (dentry && dentry->d_inode == i) {
 308                         sock_hold(s);
 309                         goto found;
 310                 }
 311         }
 312         s = NULL;
 313 found:
 314         spin_unlock(&unix_table_lock);
 315         return s;
 316 }
 317
 318 static inline int unix_writable(struct sock *sk)
 319 {
 320         return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 321 }
 322
 323 static void unix_write_space(struct sock *sk)
 324 {
 325         struct socket_wq *wq;
 326
 327         rcu_read_lock();
 328         if (unix_writable(sk)) {
 329                 wq = rcu_dereference(sk->sk_wq);
 330                 if (wq_has_sleeper(wq))
 331                         wake_up_interruptible_sync_poll(&wq->wait,
 332                                 POLLOUT | POLLWRNORM | POLLWRBAND);
 333                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 334         }
 335         rcu_read_unlock();
 336 }
 337
 338 /* When dgram socket disconnects (or changes its peer), we clear its receive
 339  * queue of packets arrived from previous peer. First, it allows to do
 340  * flow control based only on wmem_alloc; second, sk connected to peer
 341  * may receive messages only from that peer. */
 342 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 343 {
 344         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 345                 skb_queue_purge(&sk->sk_receive_queue);
 346                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 347
 348                 /* If one link of bidirectional dgram pipe is disconnected,
 349                  * we signal error. Messages are lost. Do not make this,
 350                  * when peer was not connected to us.
 351                  */
 352                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 353                         other->sk_err = ECONNRESET;
 354                         other->sk_error_report(other);
 355                 }
 356         }
 357 }
 358
 359 static void unix_sock_destructor(struct sock *sk)
 360 {
 361         struct unix_sock *u = unix_sk(sk);
 362
 363         skb_queue_purge(&sk->sk_receive_queue);
 364
 365         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
 366         WARN_ON(!sk_unhashed(sk));
 367         WARN_ON(sk->sk_socket);
 368         if (!sock_flag(sk, SOCK_DEAD)) {
 369                 printk(KERN_INFO "Attempt to release alive unix socket: %p\n", sk);
 370                 return;
 371         }
 372
 373         if (u->addr)
 374                 unix_release_addr(u->addr);
 375
 376         atomic_long_dec(&unix_nr_socks);
 377         local_bh_disable();
 378         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 379         local_bh_enable();
 380 #ifdef UNIX_REFCNT_DEBUG
 381         printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
 382                 atomic_long_read(&unix_nr_socks));
 383 #endif
 384 }
 385
 386 static void unix_release_sock(struct sock *sk, int embrion)
 387 {
 388         struct unix_sock *u = unix_sk(sk);
 389         struct path path;
 390         struct sock *skpair;
 391         struct sk_buff *skb;
 392         int state;
 393
 394         unix_remove_socket(sk);
 395
 396         /* Clear state */
 397         unix_state_lock(sk);
 398         sock_orphan(sk);
 399         sk->sk_shutdown = SHUTDOWN_MASK;
 400         path         = u->path;
 401         u->path.dentry = NULL;
 402         u->path.mnt = NULL;
 403         state = sk->sk_state;
 404         sk->sk_state = TCP_CLOSE;
 405         unix_state_unlock(sk);
 406
 407         wake_up_interruptible_all(&u->peer_wait);
 408
 409         skpair = unix_peer(sk);
 410
 411         if (skpair != NULL) {
 412                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
 413                         unix_state_lock(skpair);
 414                         /* No more writes */
 415                         skpair->sk_shutdown = SHUTDOWN_MASK;
 416                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
 417                                 skpair->sk_err = ECONNRESET;
 418                         unix_state_unlock(skpair);
 419                         skpair->sk_state_change(skpair);
 420                         sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
 421                 }
 422                 sock_put(skpair); /* It may now die */
 423                 unix_peer(sk) = NULL;
 424         }
 425
 426         /* Try to flush out this socket. Throw out buffers at least */
 427
 428         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
 429                 if (state == TCP_LISTEN)
 430                         unix_release_sock(skb->sk, 1);
 431                 /* passed fds are erased in the kfree_skb hook        */
 432                 kfree_skb(skb);
 433         }
 434
 435         if (path.dentry)
 436                 path_put(&path);
 437
 438         sock_put(sk);
 439
 440         /* ---- Socket is dead now and most probably destroyed ---- */
 441
 442         /*
 443          * Fixme: BSD difference: In BSD all sockets connected to us get
 444          *        ECONNRESET and we die on the spot. In Linux we behave
 445          *        like files and pipes do and wait for the last
 446          *        dereference.
 447          *
 448          * Can't we simply set sock->err?
 449          *
 450          *        What the above comment does talk about? --ANK(980817)
 451          */
 452
 453         if (unix_tot_inflight)
 454                 unix_gc();              /* Garbage collect fds */
 455 }
 456
 457 static void init_peercred(struct sock *sk)
 458 {
 459         put_pid(sk->sk_peer_pid);
 460         if (sk->sk_peer_cred)
 461                 put_cred(sk->sk_peer_cred);
 462         sk->sk_peer_pid  = get_pid(task_tgid(current));
 463         sk->sk_peer_cred = get_current_cred();
 464 }
 465
 466 static void copy_peercred(struct sock *sk, struct sock *peersk)
 467 {
 468         put_pid(sk->sk_peer_pid);
 469         if (sk->sk_peer_cred)
 470                 put_cred(sk->sk_peer_cred);
 471         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 472         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 473 }
 474
 475 static int unix_listen(struct socket *sock, int backlog)
 476 {
 477         int err;
 478         struct sock *sk = sock->sk;
 479         struct unix_sock *u = unix_sk(sk);
 480         struct pid *old_pid = NULL;
 481
 482         err = -EOPNOTSUPP;
 483         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 484                 goto out;       /* Only stream/seqpacket sockets accept */
 485         err = -EINVAL;
 486         if (!u->addr)
 487                 goto out;       /* No listens on an unbound socket */
 488         unix_state_lock(sk);
 489         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 490                 goto out_unlock;
 491         if (backlog > sk->sk_max_ack_backlog)
 492                 wake_up_interruptible_all(&u->peer_wait);
 493         sk->sk_max_ack_backlog  = backlog;
 494         sk->sk_state            = TCP_LISTEN;
 495         /* set credentials so connect can copy them */
 496         init_peercred(sk);
 497         err = 0;
 498
 499 out_unlock:
 500         unix_state_unlock(sk);
 501         put_pid(old_pid);
 502 out:
 503         return err;
 504 }
 505
 506 static int unix_release(struct socket *);
 507 static int unix_bind(struct socket *, struct sockaddr *, int);
 508 static int unix_stream_connect(struct socket *, struct sockaddr *,
 509                                int addr_len, int flags);
 510 static int unix_socketpair(struct socket *, struct socket *);
 511 static int unix_accept(struct socket *, struct socket *, int);
 512 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
 513 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
 514 static unsigned int unix_dgram_poll(struct file *, struct socket *,
 515                                     poll_table *);
 516 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 517 static int unix_shutdown(struct socket *, int);
 518 static int unix_stream_sendmsg(struct kiocb *, struct socket *,
 519                                struct msghdr *, size_t);
 520 static int unix_stream_recvmsg(struct kiocb *, struct socket *,
 521                                struct msghdr *, size_t, int);
 522 static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
 523                               struct msghdr *, size_t);
 524 static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
 525                               struct msghdr *, size_t, int);
 526 static int unix_dgram_connect(struct socket *, struct sockaddr *,
 527                               int, int);
 528 static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
 529                                   struct msghdr *, size_t);
 530 static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
 531                                   struct msghdr *, size_t, int);
 532
 533 static void unix_set_peek_off(struct sock *sk, int val)
 534 {
 535         struct unix_sock *u = unix_sk(sk);
 536
 537         mutex_lock(&u->readlock);
 538         sk->sk_peek_off = val;
 539         mutex_unlock(&u->readlock);
 540 }
 541
 542
 543 static const struct proto_ops unix_stream_ops = {
 544         .family =       PF_UNIX,
 545         .owner =        THIS_MODULE,
 546         .release =      unix_release,
 547         .bind =         unix_bind,
 548         .connect =      unix_stream_connect,
 549         .socketpair =   unix_socketpair,
 550         .accept =       unix_accept,
 551         .getname =      unix_getname,
 552         .poll =         unix_poll,
 553         .ioctl =        unix_ioctl,
 554         .listen =       unix_listen,
 555         .shutdown =     unix_shutdown,
 556         .setsockopt =   sock_no_setsockopt,
 557         .getsockopt =   sock_no_getsockopt,
 558         .sendmsg =      unix_stream_sendmsg,
 559         .recvmsg =      unix_stream_recvmsg,
 560         .mmap =         sock_no_mmap,
 561         .sendpage =     sock_no_sendpage,
 562         .set_peek_off = unix_set_peek_off,
 563 };
 564
 565 static const struct proto_ops unix_dgram_ops = {
 566         .family =       PF_UNIX,
 567         .owner =        THIS_MODULE,
 568         .release =      unix_release,
 569         .bind =         unix_bind,
 570         .connect =      unix_dgram_connect,
 571         .socketpair =   unix_socketpair,
 572         .accept =       sock_no_accept,
 573         .getname =      unix_getname,
 574         .poll =         unix_dgram_poll,
 575         .ioctl =        unix_ioctl,
 576         .listen =       sock_no_listen,
 577         .shutdown =     unix_shutdown,
 578         .setsockopt =   sock_no_setsockopt,
 579         .getsockopt =   sock_no_getsockopt,
 580         .sendmsg =      unix_dgram_sendmsg,
 581         .recvmsg =      unix_dgram_recvmsg,
 582         .mmap =         sock_no_mmap,
 583         .sendpage =     sock_no_sendpage,
 584         .set_peek_off = unix_set_peek_off,
 585 };
 586
 587 static const struct proto_ops unix_seqpacket_ops = {
 588         .family =       PF_UNIX,
 589         .owner =        THIS_MODULE,
 590         .release =      unix_release,
 591         .bind =         unix_bind,
 592         .connect =      unix_stream_connect,
 593         .socketpair =   unix_socketpair,
 594         .accept =       unix_accept,
 595         .getname =      unix_getname,
 596         .poll =         unix_dgram_poll,
 597         .ioctl =        unix_ioctl,
 598         .listen =       unix_listen,
 599         .shutdown =     unix_shutdown,
 600         .setsockopt =   sock_no_setsockopt,
 601         .getsockopt =   sock_no_getsockopt,
 602         .sendmsg =      unix_seqpacket_sendmsg,
 603         .recvmsg =      unix_seqpacket_recvmsg,
 604         .mmap =         sock_no_mmap,
 605         .sendpage =     sock_no_sendpage,
 606         .set_peek_off = unix_set_peek_off,
 607 };
 608
 609 static struct proto unix_proto = {
 610         .name                   = "UNIX",
 611         .owner                  = THIS_MODULE,
 612         .obj_size               = sizeof(struct unix_sock),
 613 };
 614
 615 /*
 616  * AF_UNIX sockets do not interact with hardware, hence they
 617  * dont trigger interrupts - so it's safe for them to have
 618  * bh-unsafe locking for their sk_receive_queue.lock. Split off
 619  * this special lock-class by reinitializing the spinlock key:
 620  */
 621 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
 622
 623 static struct sock *unix_create1(struct net *net, struct socket *sock)
 624 {
 625         struct sock *sk = NULL;
 626         struct unix_sock *u;
 627
 628         atomic_long_inc(&unix_nr_socks);
 629         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 630                 goto out;
 631
 632         sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
 633         if (!sk)
 634                 goto out;
 635
 636         sock_init_data(sock, sk);
 637         lockdep_set_class(&sk->sk_receive_queue.lock,
 638                                 &af_unix_sk_receive_queue_lock_key);
 639
 640         sk->sk_write_space      = unix_write_space;
 641         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 642         sk->sk_destruct         = unix_sock_destructor;
 643         u         = unix_sk(sk);
 644         u->path.dentry = NULL;
 645         u->path.mnt = NULL;
 646         spin_lock_init(&u->lock);
 647         atomic_long_set(&u->inflight, 0);
 648         INIT_LIST_HEAD(&u->link);
 649         mutex_init(&u->readlock); /* single task reading lock */
 650         init_waitqueue_head(&u->peer_wait);
 651         unix_insert_socket(unix_sockets_unbound(sk), sk);
 652 out:
 653         if (sk == NULL)
 654                 atomic_long_dec(&unix_nr_socks);
 655         else {
 656                 local_bh_disable();
 657                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 658                 local_bh_enable();
 659         }
 660         return sk;
 661 }
 662
 663 static int unix_create(struct net *net, struct socket *sock, int protocol,
 664                        int kern)
 665 {
 666         if (protocol && protocol != PF_UNIX)
 667                 return -EPROTONOSUPPORT;
 668
 669         sock->state = SS_UNCONNECTED;
 670
 671         switch (sock->type) {
 672         case SOCK_STREAM:
 673                 sock->ops = &unix_stream_ops;
 674                 break;
 675                 /*
 676                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
 677                  *      nothing uses it.
 678                  */
 679         case SOCK_RAW:
 680                 sock->type = SOCK_DGRAM;
 681         case SOCK_DGRAM:
 682                 sock->ops = &unix_dgram_ops;
 683                 break;
 684         case SOCK_SEQPACKET:
 685                 sock->ops = &unix_seqpacket_ops;
 686                 break;
 687         default:
 688                 return -ESOCKTNOSUPPORT;
 689         }
 690
 691         return unix_create1(net, sock) ? 0 : -ENOMEM;
 692 }
 693
 694 static int unix_release(struct socket *sock)
 695 {
 696         struct sock *sk = sock->sk;
 697
 698         if (!sk)
 699                 return 0;
 700
 701         unix_release_sock(sk, 0);
 702         sock->sk = NULL;
 703
 704         return 0;
 705 }
 706
 707 static int unix_autobind(struct socket *sock)
 708 {
 709         struct sock *sk = sock->sk;
 710         struct net *net = sock_net(sk);
 711         struct unix_sock *u = unix_sk(sk);
 712         static u32 ordernum = 1;
 713         struct unix_address *addr;
 714         int err;
 715         unsigned int retries = 0;
 716
 717         mutex_lock(&u->readlock);
 718
 719         err = 0;
 720         if (u->addr)
 721                 goto out;
 722
 723         err = -ENOMEM;
 724         addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
 725         if (!addr)
 726                 goto out;
 727
 728         addr->name->sun_family = AF_UNIX;
 729         atomic_set(&addr->refcnt, 1);
 730
 731 retry:
 732         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
 733         addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
 734
 735         spin_lock(&unix_table_lock);
 736         ordernum = (ordernum+1)&0xFFFFF;
 737
 738         if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
 739                                       addr->hash)) {
 740                 spin_unlock(&unix_table_lock);
 741                 /*
 742                  * __unix_find_socket_byname() may take long time if many names
 743                  * are already in use.
 744                  */
 745                 cond_resched();
 746                 /* Give up if all names seems to be in use. */
 747                 if (retries++ == 0xFFFFF) {
 748                         err = -ENOSPC;
 749                         kfree(addr);
 750                         goto out;
 751                 }
 752                 goto retry;
 753         }
 754         addr->hash ^= sk->sk_type;
 755
 756         __unix_remove_socket(sk);
 757         u->addr = addr;
 758         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
 759         spin_unlock(&unix_table_lock);
 760         err = 0;
 761
 762 out:    mutex_unlock(&u->readlock);
 763         return err;
 764 }
 765
 766 static struct sock *unix_find_other(struct net *net,
 767                                     struct sockaddr_un *sunname, int len,
 768                                     int type, unsigned int hash, int *error)
 769 {
 770         struct sock *u;
 771         struct path path;
 772         int err = 0;
 773
 774         if (sunname->sun_path[0]) {
 775                 struct inode *inode;
 776                 err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
 777                 if (err)
 778                         goto fail;
 779                 inode = path.dentry->d_inode;
 780                 err = inode_permission(inode, MAY_WRITE);
 781                 if (err)
 782                         goto put_fail;
 783
 784                 err = -ECONNREFUSED;
 785                 if (!S_ISSOCK(inode->i_mode))
 786                         goto put_fail;
 787                 u = unix_find_socket_byinode(inode);
 788                 if (!u)
 789                         goto put_fail;
 790
 791                 if (u->sk_type == type)
 792                         touch_atime(&path);
 793
 794                 path_put(&path);
 795
 796                 err = -EPROTOTYPE;
 797                 if (u->sk_type != type) {
 798                         sock_put(u);
 799                         goto fail;
 800                 }
 801         } else {
 802                 err = -ECONNREFUSED;
 803                 u = unix_find_socket_byname(net, sunname, len, type, hash);
 804                 if (u) {
 805                         struct dentry *dentry;
 806                         dentry = unix_sk(u)->path.dentry;
 807                         if (dentry)
 808                                 touch_atime(&unix_sk(u)->path);
 809                 } else
 810                         goto fail;
 811         }
 812         return u;
 813
 814 put_fail:
 815         path_put(&path);
 816 fail:
 817         *error = err;
 818         return NULL;
 819 }
 820
 821 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
 822 {
 823         struct dentry *dentry;
 824         struct path path;
 825         int err = 0;
 826         /*
 827          * Get the parent directory, calculate the hash for last
 828          * component.
 829          */
 830         dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
 831         err = PTR_ERR(dentry);
 832         if (IS_ERR(dentry))
 833                 return err;
 834
 835         /*
 836          * All right, let's create it.
 837          */
 838         err = security_path_mknod(&path, dentry, mode, 0);
 839         if (!err) {
 840                 err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
 841                 if (!err) {
 842                         res->mnt = mntget(path.mnt);
 843                         res->dentry = dget(dentry);
 844                 }
 845         }
 846         done_path_create(&path, dentry);
 847         return err;
 848 }
 849
 850 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 851 {
 852         struct sock *sk = sock->sk;
 853         struct net *net = sock_net(sk);
 854         struct unix_sock *u = unix_sk(sk);
 855         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
 856         char *sun_path = sunaddr->sun_path;
 857         int err;
 858         unsigned int hash;
 859         struct unix_address *addr;
 860         struct hlist_head *list;
 861
 862         err = -EINVAL;
 863         if (sunaddr->sun_family != AF_UNIX)
 864                 goto out;
 865
 866         if (addr_len == sizeof(short)) {
 867                 err = unix_autobind(sock);
 868                 goto out;
 869         }
 870
 871         err = unix_mkname(sunaddr, addr_len, &hash);
 872         if (err < 0)
 873                 goto out;
 874         addr_len = err;
 875
 876         mutex_lock(&u->readlock);
 877
 878         err = -EINVAL;
 879         if (u->addr)
 880                 goto out_up;
 881
 882         err = -ENOMEM;
 883         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
 884         if (!addr)
 885                 goto out_up;
 886
 887         memcpy(addr->name, sunaddr, addr_len);
 888         addr->len = addr_len;
 889         addr->hash = hash ^ sk->sk_type;
 890         atomic_set(&addr->refcnt, 1);
 891
 892         if (sun_path[0]) {
 893                 struct path path;
 894                 umode_t mode = S_IFSOCK |
 895                        (SOCK_INODE(sock)->i_mode & ~current_umask());
 896                 err = unix_mknod(sun_path, mode, &path);
 897                 if (err) {
 898                         if (err == -EEXIST)
 899                                 err = -EADDRINUSE;
 900                         unix_release_addr(addr);
 901                         goto out_up;
 902                 }
 903                 addr->hash = UNIX_HASH_SIZE;
 904                 hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
 905                 spin_lock(&unix_table_lock);
 906                 u->path = path;
 907                 list = &unix_socket_table[hash];
 908         } else {
 909                 spin_lock(&unix_table_lock);
 910                 err = -EADDRINUSE;
 911                 if (__unix_find_socket_byname(net, sunaddr, addr_len,
 912                                               sk->sk_type, hash)) {
 913                         unix_release_addr(addr);
 914                         goto out_unlock;
 915                 }
 916
 917                 list = &unix_socket_table[addr->hash];
 918         }
 919
 920         err = 0;
 921         __unix_remove_socket(sk);
 922         u->addr = addr;
 923         __unix_insert_socket(list, sk);
 924
 925 out_unlock:
 926         spin_unlock(&unix_table_lock);
 927 out_up:
 928         mutex_unlock(&u->readlock);
 929 out:
 930         return err;
 931 }
 932
 933 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
 934 {
 935         if (unlikely(sk1 == sk2) || !sk2) {
 936                 unix_state_lock(sk1);
 937                 return;
 938         }
 939         if (sk1 < sk2) {
 940                 unix_state_lock(sk1);
 941                 unix_state_lock_nested(sk2);
 942         } else {
 943                 unix_state_lock(sk2);
 944                 unix_state_lock_nested(sk1);
 945         }
 946 }
 947
 948 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
 949 {
 950         if (unlikely(sk1 == sk2) || !sk2) {
 951                 unix_state_unlock(sk1);
 952                 return;
 953         }
 954         unix_state_unlock(sk1);
 955         unix_state_unlock(sk2);
 956 }
 957
 958 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 959                               int alen, int flags)
 960 {
 961         struct sock *sk = sock->sk;
 962         struct net *net = sock_net(sk);
 963         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
 964         struct sock *other;
 965         unsigned int hash;
 966         int err;
 967
 968         if (addr->sa_family != AF_UNSPEC) {
 969                 err = unix_mkname(sunaddr, alen, &hash);
 970                 if (err < 0)
 971                         goto out;
 972                 alen = err;
 973
 974                 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
 975                     !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
 976                         goto out;
 977
 978 restart:
 979                 other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
 980                 if (!other)
 981                         goto out;
 982
 983                 unix_state_double_lock(sk, other);
 984
 985                 /* Apparently VFS overslept socket death. Retry. */
 986                 if (sock_flag(other, SOCK_DEAD)) {
 987                         unix_state_double_unlock(sk, other);
 988                         sock_put(other);
 989                         goto restart;
 990                 }
 991
 992                 err = -EPERM;
 993                 if (!unix_may_send(sk, other))
 994                         goto out_unlock;
 995
 996                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
 997                 if (err)
 998                         goto out_unlock;
 999
1000         } else {
1001                 /*
1002                  *      1003.1g breaking connected state with AF_UNSPEC
1003                  */
1004                 other = NULL;
1005                 unix_state_double_lock(sk, other);
1006         }
1007
1008         /*
1009          * If it was connected, reconnect.
1010          */
1011         if (unix_peer(sk)) {
1012                 struct sock *old_peer = unix_peer(sk);
1013                 unix_peer(sk) = other;
1014                 unix_state_double_unlock(sk, other);
1015
1016                 if (other != old_peer)
1017                         unix_dgram_disconnected(sk, old_peer);
1018                 sock_put(old_peer);
1019         } else {
1020                 unix_peer(sk) = other;
1021                 unix_state_double_unlock(sk, other);
1022         }
1023         return 0;
1024
1025 out_unlock:
1026         unix_state_double_unlock(sk, other);
1027         sock_put(other);
1028 out:
1029         return err;
1030 }
1031
1032 static long unix_wait_for_peer(struct sock *other, long timeo)
1033 {
1034         struct unix_sock *u = unix_sk(other);
1035         int sched;
1036         DEFINE_WAIT(wait);
1037
1038         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1039
1040         sched = !sock_flag(other, SOCK_DEAD) &&
1041                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1042                 unix_recvq_full(other);
1043
1044         unix_state_unlock(other);
1045
1046         if (sched)
1047                 timeo = schedule_timeout(timeo);
1048
1049         finish_wait(&u->peer_wait, &wait);
1050         return timeo;
1051 }
1052
/*
 * Connect a connection-oriented socket to a listening socket.
 *
 * A new sock (newsk) that will become the server-side endpoint and a
 * one-byte skb owned by it are allocated up front.  The listener is then
 * looked up and locked, both states are validated, newsk is wired up as
 * the peer of sk, and the skb is queued on the listener's receive queue
 * for unix_accept() to pick up.
 *
 * Returns 0 on success or a negative errno; on failure the preallocated
 * skb and newsk are released again.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                               int addr_len, int flags)
{
        struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        struct unix_sock *u = unix_sk(sk), *newu, *otheru;
        struct sock *newsk = NULL;
        struct sock *other = NULL;
        struct sk_buff *skb = NULL;
        unsigned int hash;
        int st;
        int err;
        long timeo;

        /* Validates the address; returns its effective length and hash. */
        err = unix_mkname(sunaddr, addr_len, &hash);
        if (err < 0)
                goto out;
        addr_len = err;

        /* With SOCK_PASSCRED set, an unbound socket is autobound first. */
        if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
            (err = unix_autobind(sock)) != 0)
                goto out;

        timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

        /* First of all allocate resources.
           If we will make it after state is locked,
           we will have to recheck all again in any case.
         */

        err = -ENOMEM;

        /* create new sock for complete connection */
        newsk = unix_create1(sock_net(sk), NULL);
        if (newsk == NULL)
                goto out;

        /* Allocate skb for sending to listening sock */
        skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
        if (skb == NULL)
                goto out;

restart:
        /*  Find listening sock. */
        other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
        if (!other)
                goto out;

        /* Latch state of peer */
        unix_state_lock(other);

        /* Apparently VFS overslept socket death. Retry. */
        if (sock_flag(other, SOCK_DEAD)) {
                unix_state_unlock(other);
                sock_put(other);
                goto restart;
        }

        err = -ECONNREFUSED;
        if (other->sk_state != TCP_LISTEN)
                goto out_unlock;
        if (other->sk_shutdown & RCV_SHUTDOWN)
                goto out_unlock;

        if (unix_recvq_full(other)) {
                err = -EAGAIN;
                if (!timeo)
                        goto out_unlock;

                /* unix_wait_for_peer() drops the state lock on other. */
                timeo = unix_wait_for_peer(other, timeo);

                err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        goto out;
                sock_put(other);
                goto restart;
        }

        /* Latch our state.

           It is tricky place. We need to grab our state lock and cannot
           drop lock on peer. It is dangerous because deadlock is
           possible. Connect to self case and simultaneous
           attempt to connect are eliminated by checking socket
           state. other is TCP_LISTEN, if sk is TCP_LISTEN we
           check this before attempt to grab lock.

           Well, and we have to recheck the state after socket locked.
         */
        st = sk->sk_state;

        switch (st) {
        case TCP_CLOSE:
                /* This is ok... continue with connect */
                break;
        case TCP_ESTABLISHED:
                /* Socket is already connected */
                err = -EISCONN;
                goto out_unlock;
        default:
                err = -EINVAL;
                goto out_unlock;
        }

        unix_state_lock_nested(sk);

        /* Our state changed between the unlocked read and the lock: retry. */
        if (sk->sk_state != st) {
                unix_state_unlock(sk);
                unix_state_unlock(other);
                sock_put(other);
                goto restart;
        }

        err = security_unix_stream_connect(sk, other, newsk);
        if (err) {
                unix_state_unlock(sk);
                goto out_unlock;
        }

        /* The way is open! Fastly set all the necessary fields... */

        /* newsk holds a reference on sk as its peer. */
        sock_hold(sk);
        unix_peer(newsk)        = sk;
        newsk->sk_state         = TCP_ESTABLISHED;
        newsk->sk_type          = sk->sk_type;
        init_peercred(newsk);
        newu = unix_sk(newsk);
        RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
        otheru = unix_sk(other);

        /* copy address information from listening to new sock*/
        if (otheru->addr) {
                atomic_inc(&otheru->addr->refcnt);
                newu->addr = otheru->addr;
        }
        if (otheru->path.dentry) {
                path_get(&otheru->path);
                newu->path = otheru->path;
        }

        /* Set credentials */
        copy_peercred(sk, other);

        sock->state     = SS_CONNECTED;
        sk->sk_state    = TCP_ESTABLISHED;
        /* sk holds a reference on newsk as its peer. */
        sock_hold(newsk);

        smp_mb__after_atomic_inc();     /* sock_hold() does an atomic_inc() */
        unix_peer(sk)   = newsk;

        unix_state_unlock(sk);

        /* take ten and send info to listening sock */
        spin_lock(&other->sk_receive_queue.lock);
        __skb_queue_tail(&other->sk_receive_queue, skb);
        spin_unlock(&other->sk_receive_queue.lock);
        unix_state_unlock(other);
        other->sk_data_ready(other, 0);
        sock_put(other);
        return 0;

out_unlock:
        if (other)
                unix_state_unlock(other);

out:
        /* Undo the up-front allocations and drop the lookup reference. */
        kfree_skb(skb);
        if (newsk)
                unix_release_sock(newsk, 0);
        if (other)
                sock_put(other);
        return err;
}
1227
1228 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1229 {
1230         struct sock *ska = socka->sk, *skb = sockb->sk;
1231
1232         /* Join our sockets back to back */
1233         sock_hold(ska);
1234         sock_hold(skb);
1235         unix_peer(ska) = skb;
1236         unix_peer(skb) = ska;
1237         init_peercred(ska);
1238         init_peercred(skb);
1239
1240         if (ska->sk_type != SOCK_DGRAM) {
1241                 ska->sk_state = TCP_ESTABLISHED;
1242                 skb->sk_state = TCP_ESTABLISHED;
1243                 socka->state  = SS_CONNECTED;
1244                 sockb->state  = SS_CONNECTED;
1245         }
1246         return 0;
1247 }
1248
1249 static void unix_sock_inherit_flags(const struct socket *old,
1250                                     struct socket *new)
1251 {
1252         if (test_bit(SOCK_PASSCRED, &old->flags))
1253                 set_bit(SOCK_PASSCRED, &new->flags);
1254         if (test_bit(SOCK_PASSSEC, &old->flags))
1255                 set_bit(SOCK_PASSSEC, &new->flags);
1256 }
1257
1258 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1259 {
1260         struct sock *sk = sock->sk;
1261         struct sock *tsk;
1262         struct sk_buff *skb;
1263         int err;
1264
1265         err = -EOPNOTSUPP;
1266         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1267                 goto out;
1268
1269         err = -EINVAL;
1270         if (sk->sk_state != TCP_LISTEN)
1271                 goto out;
1272
1273         /* If socket state is TCP_LISTEN it cannot change (for now...),
1274          * so that no locks are necessary.
1275          */
1276
1277         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1278         if (!skb) {
1279                 /* This means receive shutdown. */
1280                 if (err == 0)
1281                         err = -EINVAL;
1282                 goto out;
1283         }
1284
1285         tsk = skb->sk;
1286         skb_free_datagram(sk, skb);
1287         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1288
1289         /* attach accepted sock to socket */
1290         unix_state_lock(tsk);
1291         newsock->state = SS_CONNECTED;
1292         unix_sock_inherit_flags(sock, newsock);
1293         sock_graft(tsk, newsock);
1294         unix_state_unlock(tsk);
1295         return 0;
1296
1297 out:
1298         return err;
1299 }
1300
1301
1302 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1303 {
1304         struct sock *sk = sock->sk;
1305         struct unix_sock *u;
1306         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1307         int err = 0;
1308
1309         if (peer) {
1310                 sk = unix_peer_get(sk);
1311
1312                 err = -ENOTCONN;
1313                 if (!sk)
1314                         goto out;
1315                 err = 0;
1316         } else {
1317                 sock_hold(sk);
1318         }
1319
1320         u = unix_sk(sk);
1321         unix_state_lock(sk);
1322         if (!u->addr) {
1323                 sunaddr->sun_family = AF_UNIX;
1324                 sunaddr->sun_path[0] = 0;
1325                 *uaddr_len = sizeof(short);
1326         } else {
1327                 struct unix_address *addr = u->addr;
1328
1329                 *uaddr_len = addr->len;
1330                 memcpy(sunaddr, addr->name, *uaddr_len);
1331         }
1332         unix_state_unlock(sk);
1333         sock_put(sk);
1334 out:
1335         return err;
1336 }
1337
1338 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1339 {
1340         int i;
1341
1342         scm->fp = UNIXCB(skb).fp;
1343         UNIXCB(skb).fp = NULL;
1344
1345         for (i = scm->fp->count-1; i >= 0; i--)
1346                 unix_notinflight(scm->fp->fp[i]);
1347 }
1348
1349 static void unix_destruct_scm(struct sk_buff *skb)
1350 {
1351         struct scm_cookie scm;
1352         memset(&scm, 0, sizeof(scm));
1353         scm.pid  = UNIXCB(skb).pid;
1354         if (UNIXCB(skb).fp)
1355                 unix_detach_fds(&scm, skb);
1356
1357         /* Alas, it calls VFS */
1358         /* So fscking what? fput() had been SMP-safe since the last Summer */
1359         scm_destroy(&scm);
1360         sock_wfree(skb);
1361 }
1362
1363 #define MAX_RECURSION_LEVEL 4
1364
1365 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1366 {
1367         int i;
1368         unsigned char max_level = 0;
1369         int unix_sock_count = 0;
1370
1371         for (i = scm->fp->count - 1; i >= 0; i--) {
1372                 struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1373
1374                 if (sk) {
1375                         unix_sock_count++;
1376                         max_level = max(max_level,
1377                                         unix_sk(sk)->recursion_level);
1378                 }
1379         }
1380         if (unlikely(max_level > MAX_RECURSION_LEVEL))
1381                 return -ETOOMANYREFS;
1382
1383         /*
1384          * Need to duplicate file references for the sake of garbage
1385          * collection.  Otherwise a socket in the fps might become a
1386          * candidate for GC while the skb is not yet queued.
1387          */
1388         UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1389         if (!UNIXCB(skb).fp)
1390                 return -ENOMEM;
1391
1392         if (unix_sock_count) {
1393                 for (i = scm->fp->count - 1; i >= 0; i--)
1394                         unix_inflight(scm->fp->fp[i]);
1395         }
1396         return max_level;
1397 }
1398
1399 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1400 {
1401         int err = 0;
1402
1403         UNIXCB(skb).pid  = get_pid(scm->pid);
1404         UNIXCB(skb).uid = scm->creds.uid;
1405         UNIXCB(skb).gid = scm->creds.gid;
1406         UNIXCB(skb).fp = NULL;
1407         if (scm->fp && send_fds)
1408                 err = unix_attach_fds(scm, skb);
1409
1410         skb->destructor = unix_destruct_scm;
1411         return err;
1412 }
1413
1414 /*
1415  * Some apps rely on write() giving SCM_CREDENTIALS
1416  * We include credentials if source or destination socket
1417  * asserted SOCK_PASSCRED.
1418  */
1419 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1420                             const struct sock *other)
1421 {
1422         if (UNIXCB(skb).pid)
1423                 return;
1424         if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1425             !other->sk_socket ||
1426             test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
1427                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1428                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1429         }
1430 }
1431
/*
 *      Send AF_UNIX data.
 *
 *      Sends one message of @len bytes, either to the address in
 *      msg->msg_name or, when no address is given, to the connected peer.
 *      Returns @len on success (also when a socket filter dropped the
 *      packet) or a negative errno.  The scm cookie is destroyed on every
 *      path past scm_send().
 */

static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
                              struct msghdr *msg, size_t len)
{
        struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        struct unix_sock *u = unix_sk(sk);
        struct sockaddr_un *sunaddr = msg->msg_name;
        struct sock *other = NULL;
        int namelen = 0; /* fake GCC */
        int err;
        unsigned int hash;
        struct sk_buff *skb;
        long timeo;
        struct scm_cookie tmp_scm;
        int max_level;
        int data_len = 0;

        /* Fall back to an on-stack cookie if the iocb has none. */
        if (NULL == siocb->scm)
                siocb->scm = &tmp_scm;
        wait_for_unix_gc();
        err = scm_send(sock, msg, siocb->scm, false);
        if (err < 0)
                return err;

        err = -EOPNOTSUPP;
        if (msg->msg_flags&MSG_OOB)
                goto out;

        if (msg->msg_namelen) {
                /* Explicit destination: validate it, look it up later. */
                err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
                if (err < 0)
                        goto out;
                namelen = err;
        } else {
                /* No destination given: use the connected peer. */
                sunaddr = NULL;
                err = -ENOTCONN;
                other = unix_peer_get(sk);
                if (!other)
                        goto out;
        }

        /* With SOCK_PASSCRED set, an unbound socket is autobound first. */
        if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
            && (err = unix_autobind(sock)) != 0)
                goto out;

        err = -EMSGSIZE;
        if (len > sk->sk_sndbuf - 32)
                goto out;

        /* Part of a large message goes into page fragments. */
        if (len > SKB_MAX_ALLOC)
                data_len = min_t(size_t,
                                 len - SKB_MAX_ALLOC,
                                 MAX_SKB_FRAGS * PAGE_SIZE);

        skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
                                   msg->msg_flags & MSG_DONTWAIT, &err,
                                   PAGE_ALLOC_COSTLY_ORDER);
        if (skb == NULL)
                goto out;

        err = unix_scm_to_skb(siocb->scm, skb, true);
        if (err < 0)
                goto out_free;
        /* One level deeper than anything carried inside this message. */
        max_level = err + 1;
        unix_get_secdata(siocb->scm, skb);

        skb_put(skb, len - data_len);
        skb->data_len = data_len;
        skb->len = len;
        err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len);
        if (err)
                goto out_free;

        timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
        if (!other) {
                /* No peer and no address to look one up with. */
                err = -ECONNRESET;
                if (sunaddr == NULL)
                        goto out_free;

                other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
                                        hash, &err);
                if (other == NULL)
                        goto out_free;
        }

        if (sk_filter(other, skb) < 0) {
                /* Toss the packet but do not return any error to the sender */
                err = len;
                goto out_free;
        }

        unix_state_lock(other);
        err = -EPERM;
        if (!unix_may_send(sk, other))
                goto out_unlock;

        if (sock_flag(other, SOCK_DEAD)) {
                /*
                 *      Check with 1003.1g - what should
                 *      datagram error
                 */
                unix_state_unlock(other);
                sock_put(other);

                /*
                 * If the dead socket was our connected peer, disconnect and
                 * report -ECONNREFUSED; otherwise retry the lookup.
                 */
                err = 0;
                unix_state_lock(sk);
                if (unix_peer(sk) == other) {
                        unix_peer(sk) = NULL;
                        unix_state_unlock(sk);

                        unix_dgram_disconnected(sk, other);
                        /* Drops the reference the peer pointer was holding. */
                        sock_put(other);
                        err = -ECONNREFUSED;
                } else {
                        unix_state_unlock(sk);
                }

                other = NULL;
                if (err)
                        goto out_free;
                goto restart;
        }

        err = -EPIPE;
        if (other->sk_shutdown & RCV_SHUTDOWN)
                goto out_unlock;

        if (sk->sk_type != SOCK_SEQPACKET) {
                err = security_unix_may_send(sk->sk_socket, other->sk_socket);
                if (err)
                        goto out_unlock;
        }

        /* A full receive queue blocks us unless the receiver is connected to us. */
        if (unix_peer(other) != sk && unix_recvq_full(other)) {
                if (!timeo) {
                        err = -EAGAIN;
                        goto out_unlock;
                }

                /* unix_wait_for_peer() drops the state lock on other. */
                timeo = unix_wait_for_peer(other, timeo);

                err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        goto out_free;

                goto restart;
        }

        if (sock_flag(other, SOCK_RCVTSTAMP))
                __net_timestamp(skb);
        maybe_add_creds(skb, sock, other);
        skb_queue_tail(&other->sk_receive_queue, skb);
        if (max_level > unix_sk(other)->recursion_level)
                unix_sk(other)->recursion_level = max_level;
        unix_state_unlock(other);
        other->sk_data_ready(other, len);
        sock_put(other);
        scm_destroy(siocb->scm);
        return len;

out_unlock:
        unix_state_unlock(other);
out_free:
        kfree_skb(skb);
out:
        if (other)
                sock_put(other);
        scm_destroy(siocb->scm);
        return err;
}
1609
/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))

/*
 * Send @len bytes on a connected stream socket.
 *
 * The data is cut into skbs that are queued one by one on the peer's
 * receive queue; file descriptors from the control message travel with
 * the first skb only.  Returns the number of bytes queued if any were
 * sent, otherwise a negative errno.  SIGPIPE is raised when nothing was
 * sent because of a shut-down or dead connection, unless MSG_NOSIGNAL is
 * set.
 */
static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
                               struct msghdr *msg, size_t len)
{
        struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
        struct sock *sk = sock->sk;
        struct sock *other = NULL;
        int err, size;
        struct sk_buff *skb;
        int sent = 0;
        struct scm_cookie tmp_scm;
        bool fds_sent = false;
        int max_level;
        int data_len;

        /* Fall back to an on-stack cookie if the iocb has none. */
        if (NULL == siocb->scm)
                siocb->scm = &tmp_scm;
        wait_for_unix_gc();
        err = scm_send(sock, msg, siocb->scm, false);
        if (err < 0)
                return err;

        err = -EOPNOTSUPP;
        if (msg->msg_flags&MSG_OOB)
                goto out_err;

        if (msg->msg_namelen) {
                /* A destination address is never accepted here. */
                err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
                goto out_err;
        } else {
                err = -ENOTCONN;
                /* NOTE(review): the peer pointer is used without taking a
                 * reference in this function - confirm what keeps it alive.
                 */
                other = unix_peer(sk);
                if (!other)
                        goto out_err;
        }

        if (sk->sk_shutdown & SEND_SHUTDOWN)
                goto pipe_err;

        while (sent < len) {
                size = len - sent;

                /* Keep two messages in the pipe so it schedules better */
                size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

                /* allow fallback to order-0 allocations */
                size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

                /* Whatever does not fit the linear head goes into fragments. */
                data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

                skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
                                           msg->msg_flags & MSG_DONTWAIT, &err,
                                           get_order(UNIX_SKB_FRAGS_SZ));
                if (!skb)
                        goto out_err;

                /* Only send the fds in the first buffer */
                err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
                if (err < 0) {
                        kfree_skb(skb);
                        goto out_err;
                }
                /* One level deeper than anything carried inside this skb. */
                max_level = err + 1;
                fds_sent = true;

                skb_put(skb, size - data_len);
                skb->data_len = data_len;
                skb->len = size;
                err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov,
                                                   sent, size);
                if (err) {
                        kfree_skb(skb);
                        goto out_err;
                }

                unix_state_lock(other);

                if (sock_flag(other, SOCK_DEAD) ||
                    (other->sk_shutdown & RCV_SHUTDOWN))
                        goto pipe_err_free;

                maybe_add_creds(skb, sock, other);
                skb_queue_tail(&other->sk_receive_queue, skb);
                if (max_level > unix_sk(other)->recursion_level)
                        unix_sk(other)->recursion_level = max_level;
                unix_state_unlock(other);
                other->sk_data_ready(other, size);
                sent += size;
        }

        scm_destroy(siocb->scm);
        siocb->scm = NULL;

        return sent;

pipe_err_free:
        unix_state_unlock(other);
        kfree_skb(skb);
pipe_err:
        /* Signal only if nothing at all was delivered. */
        if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
                send_sig(SIGPIPE, current, 0);
        err = -EPIPE;
out_err:
        scm_destroy(siocb->scm);
        siocb->scm = NULL;
        /* A partial send reports the byte count rather than the error. */
        return sent ? : err;
}
1721
1722 static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1723                                   struct msghdr *msg, size_t len)
1724 {
1725         int err;
1726         struct sock *sk = sock->sk;
1727
1728         err = sock_error(sk);
1729         if (err)
1730                 return err;
1731
1732         if (sk->sk_state != TCP_ESTABLISHED)
1733                 return -ENOTCONN;
1734
1735         if (msg->msg_namelen)
1736                 msg->msg_namelen = 0;
1737
1738         return unix_dgram_sendmsg(kiocb, sock, msg, len);
1739 }
1740
1741 static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
1742                               struct msghdr *msg, size_t size,
1743                               int flags)
1744 {
1745         struct sock *sk = sock->sk;
1746
1747         if (sk->sk_state != TCP_ESTABLISHED)
1748                 return -ENOTCONN;
1749
1750         return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
1751 }
1752
1753 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1754 {
1755         struct unix_sock *u = unix_sk(sk);
1756
1757         if (u->addr) {
1758                 msg->msg_namelen = u->addr->len;
1759                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
1760         }
1761 }
1762
1763 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1764                               struct msghdr *msg, size_t size,
1765                               int flags)
1766 {
1767         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1768         struct scm_cookie tmp_scm;
1769         struct sock *sk = sock->sk;
1770         struct unix_sock *u = unix_sk(sk);
1771         int noblock = flags & MSG_DONTWAIT;
1772         struct sk_buff *skb;
1773         int err;
1774         int peeked, skip;
1775
1776         err = -EOPNOTSUPP;
1777         if (flags&MSG_OOB)
1778                 goto out;
1779
1780         err = mutex_lock_interruptible(&u->readlock);
1781         if (err) {
1782                 err = sock_intr_errno(sock_rcvtimeo(sk, noblock));
1783                 goto out;
1784         }
1785
1786         skip = sk_peek_offset(sk, flags);
1787
1788         skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
1789         if (!skb) {
1790                 unix_state_lock(sk);
1791                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
1792                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
1793                     (sk->sk_shutdown & RCV_SHUTDOWN))
1794                         err = 0;
1795                 unix_state_unlock(sk);
1796                 goto out_unlock;
1797         }
1798
1799         wake_up_interruptible_sync_poll(&u->peer_wait,
1800                                         POLLOUT | POLLWRNORM | POLLWRBAND);
1801
1802         if (msg->msg_name)
1803                 unix_copy_addr(msg, skb->sk);
1804
1805         if (size > skb->len - skip)
1806                 size = skb->len - skip;
1807         else if (size < skb->len - skip)
1808                 msg->msg_flags |= MSG_TRUNC;
1809
1810         err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
1811         if (err)
1812                 goto out_free;
1813
1814         if (sock_flag(sk, SOCK_RCVTSTAMP))
1815                 __sock_recv_timestamp(msg, sk, skb);
1816
1817         if (!siocb->scm) {
1818                 siocb->scm = &tmp_scm;
1819                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1820         }
1821         scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1822         unix_set_secdata(siocb->scm, skb);
1823
1824         if (!(flags & MSG_PEEK)) {
1825                 if (UNIXCB(skb).fp)
1826                         unix_detach_fds(siocb->scm, skb);
1827
1828                 sk_peek_offset_bwd(sk, skb->len);
1829         } else {
1830                 /* It is questionable: on PEEK we could:
1831                    - do not return fds - good, but too simple 8)
1832                    - return fds, and do not return them on read (old strategy,
1833                      apparently wrong)
1834                    - clone fds (I chose it for now, it is the most universal
1835                      solution)
1836
1837                    POSIX 1003.1g does not actually define this clearly
1838                    at all. POSIX 1003.1g doesn't define a lot of things
1839                    clearly however!
1840
1841                 */
1842
1843                 sk_peek_offset_fwd(sk, size);
1844
1845                 if (UNIXCB(skb).fp)
1846                         siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1847         }
1848         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
1849
1850         scm_recv(sock, msg, siocb->scm, flags);
1851
1852 out_free:
1853         skb_free_datagram(sk, skb);
1854 out_unlock:
1855         mutex_unlock(&u->readlock);
1856 out:
1857         return err;
1858 }
1859
1860 /*
1861  *      Sleep until more data has arrived. But check for races..
1862  */
1863 static long unix_stream_data_wait(struct sock *sk, long timeo,
1864                                   struct sk_buff *last)
1865 {
1866         DEFINE_WAIT(wait);
1867
1868         unix_state_lock(sk);
1869
1870         for (;;) {
1871                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1872
1873                 if (skb_peek_tail(&sk->sk_receive_queue) != last ||
1874                     sk->sk_err ||
1875                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
1876                     signal_pending(current) ||
1877                     !timeo)
1878                         break;
1879
1880                 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1881                 unix_state_unlock(sk);
1882                 timeo = freezable_schedule_timeout(timeo);
1883                 unix_state_lock(sk);
1884                 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1885         }
1886
1887         finish_wait(sk_sleep(sk), &wait);
1888         unix_state_unlock(sk);
1889         return timeo;
1890 }
1891
1892 static unsigned int unix_skb_len(const struct sk_buff *skb)
1893 {
1894         return skb->len - UNIXCB(skb).consumed;
1895 }
1896
1897 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1898                                struct msghdr *msg, size_t size,
1899                                int flags)
1900 {
1901         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1902         struct scm_cookie tmp_scm;
1903         struct sock *sk = sock->sk;
1904         struct unix_sock *u = unix_sk(sk);
1905         struct sockaddr_un *sunaddr = msg->msg_name;
1906         int copied = 0;
1907         int check_creds = 0;
1908         int target;
1909         int err = 0;
1910         long timeo;
1911         int skip;
1912
1913         err = -EINVAL;
1914         if (sk->sk_state != TCP_ESTABLISHED)
1915                 goto out;
1916
1917         err = -EOPNOTSUPP;
1918         if (flags&MSG_OOB)
1919                 goto out;
1920
1921         target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1922         timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1923
1924         /* Lock the socket to prevent queue disordering
1925          * while sleeps in memcpy_tomsg
1926          */
1927
1928         if (!siocb->scm) {
1929                 siocb->scm = &tmp_scm;
1930                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1931         }
1932
1933         err = mutex_lock_interruptible(&u->readlock);
1934         if (err) {
1935                 err = sock_intr_errno(timeo);
1936                 goto out;
1937         }
1938
1939         do {
1940                 int chunk;
1941                 struct sk_buff *skb, *last;
1942
1943                 unix_state_lock(sk);
1944                 last = skb = skb_peek(&sk->sk_receive_queue);
1945 again:
1946                 if (skb == NULL) {
1947                         unix_sk(sk)->recursion_level = 0;
1948                         if (copied >= target)
1949                                 goto unlock;
1950
1951                         /*
1952                          *      POSIX 1003.1g mandates this order.
1953                          */
1954
1955                         err = sock_error(sk);
1956                         if (err)
1957                                 goto unlock;
1958                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1959                                 goto unlock;
1960
1961                         unix_state_unlock(sk);
1962                         err = -EAGAIN;
1963                         if (!timeo)
1964                                 break;
1965                         mutex_unlock(&u->readlock);
1966
1967                         timeo = unix_stream_data_wait(sk, timeo, last);
1968
1969                         if (signal_pending(current)
1970                             ||  mutex_lock_interruptible(&u->readlock)) {
1971                                 err = sock_intr_errno(timeo);
1972                                 goto out;
1973                         }
1974
1975                         continue;
1976  unlock:
1977                         unix_state_unlock(sk);
1978                         break;
1979                 }
1980
1981                 skip = sk_peek_offset(sk, flags);
1982                 while (skip >= unix_skb_len(skb)) {
1983                         skip -= unix_skb_len(skb);
1984                         last = skb;
1985                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
1986                         if (!skb)
1987                                 goto again;
1988                 }
1989
1990                 unix_state_unlock(sk);
1991
1992                 if (check_creds) {
1993                         /* Never glue messages from different writers */
1994                         if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
1995                             !uid_eq(UNIXCB(skb).uid, siocb->scm->creds.uid) ||
1996                             !gid_eq(UNIXCB(skb).gid, siocb->scm->creds.gid))
1997                                 break;
1998                 } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
1999                         /* Copy credentials */
2000                         scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2001                         check_creds = 1;
2002                 }
2003
2004                 /* Copy address just once */
2005                 if (sunaddr) {
2006                         unix_copy_addr(msg, skb->sk);
2007                         sunaddr = NULL;
2008                 }
2009
2010                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2011                 if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip,
2012                                             msg->msg_iov, chunk)) {
2013                         if (copied == 0)
2014                                 copied = -EFAULT;
2015                         break;
2016                 }
2017                 copied += chunk;
2018                 size -= chunk;
2019
2020                 /* Mark read part of skb as used */
2021                 if (!(flags & MSG_PEEK)) {
2022                         UNIXCB(skb).consumed += chunk;
2023
2024                         sk_peek_offset_bwd(sk, chunk);
2025
2026                         if (UNIXCB(skb).fp)
2027                                 unix_detach_fds(siocb->scm, skb);
2028
2029                         if (unix_skb_len(skb))
2030                                 break;
2031
2032                         skb_unlink(skb, &sk->sk_receive_queue);
2033                         consume_skb(skb);
2034
2035                         if (siocb->scm->fp)
2036                                 break;
2037                 } else {
2038                         /* It is questionable, see note in unix_dgram_recvmsg.
2039                          */
2040                         if (UNIXCB(skb).fp)
2041                                 siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
2042
2043                         sk_peek_offset_fwd(sk, chunk);
2044
2045                         break;
2046                 }
2047         } while (size);
2048
2049         mutex_unlock(&u->readlock);
2050         scm_recv(sock, msg, siocb->scm, flags);
2051 out:
2052         return copied ? : err;
2053 }
2054
2055 static int unix_shutdown(struct socket *sock, int mode)
2056 {
2057         struct sock *sk = sock->sk;
2058         struct sock *other;
2059
2060         if (mode < SHUT_RD || mode > SHUT_RDWR)
2061                 return -EINVAL;
2062         /* This maps:
2063          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2064          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2065          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2066          */
2067         ++mode;
2068
2069         unix_state_lock(sk);
2070         sk->sk_shutdown |= mode;
2071         other = unix_peer(sk);
2072         if (other)
2073                 sock_hold(other);
2074         unix_state_unlock(sk);
2075         sk->sk_state_change(sk);
2076
2077         if (other &&
2078                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2079
2080                 int peer_mode = 0;
2081
2082                 if (mode&RCV_SHUTDOWN)
2083                         peer_mode |= SEND_SHUTDOWN;
2084                 if (mode&SEND_SHUTDOWN)
2085                         peer_mode |= RCV_SHUTDOWN;
2086                 unix_state_lock(other);
2087                 other->sk_shutdown |= peer_mode;
2088                 unix_state_unlock(other);
2089                 other->sk_state_change(other);
2090                 if (peer_mode == SHUTDOWN_MASK)
2091                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2092                 else if (peer_mode & RCV_SHUTDOWN)
2093                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2094         }
2095         if (other)
2096                 sock_put(other);
2097
2098         return 0;
2099 }
2100
2101 long unix_inq_len(struct sock *sk)
2102 {
2103         struct sk_buff *skb;
2104         long amount = 0;
2105
2106         if (sk->sk_state == TCP_LISTEN)
2107                 return -EINVAL;
2108
2109         spin_lock(&sk->sk_receive_queue.lock);
2110         if (sk->sk_type == SOCK_STREAM ||
2111             sk->sk_type == SOCK_SEQPACKET) {
2112                 skb_queue_walk(&sk->sk_receive_queue, skb)
2113                         amount += unix_skb_len(skb);
2114         } else {
2115                 skb = skb_peek(&sk->sk_receive_queue);
2116                 if (skb)
2117                         amount = skb->len;
2118         }
2119         spin_unlock(&sk->sk_receive_queue.lock);
2120
2121         return amount;
2122 }
2123 EXPORT_SYMBOL_GPL(unix_inq_len);
2124
/* Outgoing queue size of @sk, as reported by sk_wmem_alloc_get(). */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2130
2131 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2132 {
2133         struct sock *sk = sock->sk;
2134         long amount = 0;
2135         int err;
2136
2137         switch (cmd) {
2138         case SIOCOUTQ:
2139                 amount = unix_outq_len(sk);
2140                 err = put_user(amount, (int __user *)arg);
2141                 break;
2142         case SIOCINQ:
2143                 amount = unix_inq_len(sk);
2144                 if (amount < 0)
2145                         err = amount;
2146                 else
2147                         err = put_user(amount, (int __user *)arg);
2148                 break;
2149         default:
2150                 err = -ENOIOCTLCMD;
2151                 break;
2152         }
2153         return err;
2154 }
2155
2156 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2157 {
2158         struct sock *sk = sock->sk;
2159         unsigned int mask;
2160
2161         sock_poll_wait(file, sk_sleep(sk), wait);
2162         mask = 0;
2163
2164         /* exceptional events? */
2165         if (sk->sk_err)
2166                 mask |= POLLERR;
2167         if (sk->sk_shutdown == SHUTDOWN_MASK)
2168                 mask |= POLLHUP;
2169         if (sk->sk_shutdown & RCV_SHUTDOWN)
2170                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2171
2172         /* readable? */
2173         if (!skb_queue_empty(&sk->sk_receive_queue))
2174                 mask |= POLLIN | POLLRDNORM;
2175
2176         /* Connection-based need to check for termination and startup */
2177         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2178             sk->sk_state == TCP_CLOSE)
2179                 mask |= POLLHUP;
2180
2181         /*
2182          * we set writable also when the other side has shut down the
2183          * connection. This prevents stuck sockets.
2184          */
2185         if (unix_writable(sk))
2186                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2187
2188         return mask;
2189 }
2190
2191 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2192                                     poll_table *wait)
2193 {
2194         struct sock *sk = sock->sk, *other;
2195         unsigned int mask, writable;
2196
2197         sock_poll_wait(file, sk_sleep(sk), wait);
2198         mask = 0;
2199
2200         /* exceptional events? */
2201         if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2202                 mask |= POLLERR |
2203                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2204
2205         if (sk->sk_shutdown & RCV_SHUTDOWN)
2206                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2207         if (sk->sk_shutdown == SHUTDOWN_MASK)
2208                 mask |= POLLHUP;
2209
2210         /* readable? */
2211         if (!skb_queue_empty(&sk->sk_receive_queue))
2212                 mask |= POLLIN | POLLRDNORM;
2213
2214         /* Connection-based need to check for termination and startup */
2215         if (sk->sk_type == SOCK_SEQPACKET) {
2216                 if (sk->sk_state == TCP_CLOSE)
2217                         mask |= POLLHUP;
2218                 /* connection hasn't started yet? */
2219                 if (sk->sk_state == TCP_SYN_SENT)
2220                         return mask;
2221         }
2222
2223         /* No write status requested, avoid expensive OUT tests. */
2224         if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2225                 return mask;
2226
2227         writable = unix_writable(sk);
2228         other = unix_peer_get(sk);
2229         if (other) {
2230                 if (unix_peer(other) != sk) {
2231                         sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2232                         if (unix_recvq_full(other))
2233                                 writable = 0;
2234                 }
2235                 sock_put(other);
2236         }
2237
2238         if (writable)
2239                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2240         else
2241                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2242
2243         return mask;
2244 }
2245
2246 #ifdef CONFIG_PROC_FS
2247
/*
 * A seq_file position for the "unix" proc entry packs two values into
 * one number: the hash-bucket index in the bits above BUCKET_SPACE and
 * a 1-based offset within that bucket in the low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE	(BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x)		((x) >> BUCKET_SPACE)
#define get_offset(x)		((x) & ((1L << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o)	(((b) << BUCKET_SPACE) | (o))
2253
2254 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2255 {
2256         unsigned long offset = get_offset(*pos);
2257         unsigned long bucket = get_bucket(*pos);
2258         struct sock *sk;
2259         unsigned long count = 0;
2260
2261         for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2262                 if (sock_net(sk) != seq_file_net(seq))
2263                         continue;
2264                 if (++count == offset)
2265                         break;
2266         }
2267
2268         return sk;
2269 }
2270
2271 static struct sock *unix_next_socket(struct seq_file *seq,
2272                                      struct sock *sk,
2273                                      loff_t *pos)
2274 {
2275         unsigned long bucket;
2276
2277         while (sk > (struct sock *)SEQ_START_TOKEN) {
2278                 sk = sk_next(sk);
2279                 if (!sk)
2280                         goto next_bucket;
2281                 if (sock_net(sk) == seq_file_net(seq))
2282                         return sk;
2283         }
2284
2285         do {
2286                 sk = unix_from_bucket(seq, pos);
2287                 if (sk)
2288                         return sk;
2289
2290 next_bucket:
2291                 bucket = get_bucket(*pos) + 1;
2292                 *pos = set_bucket_offset(bucket, 1);
2293         } while (bucket < ARRAY_SIZE(unix_socket_table));
2294
2295         return NULL;
2296 }
2297
2298 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2299         __acquires(unix_table_lock)
2300 {
2301         spin_lock(&unix_table_lock);
2302
2303         if (!*pos)
2304                 return SEQ_START_TOKEN;
2305
2306         if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2307                 return NULL;
2308
2309         return unix_next_socket(seq, NULL, pos);
2310 }
2311
2312 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2313 {
2314         ++*pos;
2315         return unix_next_socket(seq, v, pos);
2316 }
2317
2318 static void unix_seq_stop(struct seq_file *seq, void *v)
2319         __releases(unix_table_lock)
2320 {
2321         spin_unlock(&unix_table_lock);
2322 }
2323
2324 static int unix_seq_show(struct seq_file *seq, void *v)
2325 {
2326
2327         if (v == SEQ_START_TOKEN)
2328                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2329                          "Inode Path\n");
2330         else {
2331                 struct sock *s = v;
2332                 struct unix_sock *u = unix_sk(s);
2333                 unix_state_lock(s);
2334
2335                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2336                         s,
2337                         atomic_read(&s->sk_refcnt),
2338                         0,
2339                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2340                         s->sk_type,
2341                         s->sk_socket ?
2342                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2343                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2344                         sock_i_ino(s));
2345
2346                 if (u->addr) {
2347                         int i, len;
2348                         seq_putc(seq, ' ');
2349
2350                         i = 0;
2351                         len = u->addr->len - sizeof(short);
2352                         if (!UNIX_ABSTRACT(s))
2353                                 len--;
2354                         else {
2355                                 seq_putc(seq, '@');
2356                                 i++;
2357                         }
2358                         for ( ; i < len; i++)
2359                                 seq_putc(seq, u->addr->name->sun_path[i]);
2360                 }
2361                 unix_state_unlock(s);
2362                 seq_putc(seq, '\n');
2363         }
2364
2365         return 0;
2366 }
2367
2368 static const struct seq_operations unix_seq_ops = {
2369         .start  = unix_seq_start,
2370         .next   = unix_seq_next,
2371         .stop   = unix_seq_stop,
2372         .show   = unix_seq_show,
2373 };
2374
2375 static int unix_seq_open(struct inode *inode, struct file *file)
2376 {
2377         return seq_open_net(inode, file, &unix_seq_ops,
2378                             sizeof(struct seq_net_private));
2379 }
2380
2381 static const struct file_operations unix_seq_fops = {
2382         .owner          = THIS_MODULE,
2383         .open           = unix_seq_open,
2384         .read           = seq_read,
2385         .llseek         = seq_lseek,
2386         .release        = seq_release_net,
2387 };
2388
2389 #endif
2390
2391 static const struct net_proto_family unix_family_ops = {
2392         .family = PF_UNIX,
2393         .create = unix_create,
2394         .owner  = THIS_MODULE,
2395 };
2396
2397
2398 static int __net_init unix_net_init(struct net *net)
2399 {
2400         int error = -ENOMEM;
2401
2402         net->unx.sysctl_max_dgram_qlen = 10;
2403         if (unix_sysctl_register(net))
2404                 goto out;
2405
2406 #ifdef CONFIG_PROC_FS
2407         if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2408                 unix_sysctl_unregister(net);
2409                 goto out;
2410         }
2411 #endif
2412         error = 0;
2413 out:
2414         return error;
2415 }
2416
2417 static void __net_exit unix_net_exit(struct net *net)
2418 {
2419         unix_sysctl_unregister(net);
2420         remove_proc_entry("unix", net->proc_net);
2421 }
2422
2423 static struct pernet_operations unix_net_ops = {
2424         .init = unix_net_init,
2425         .exit = unix_net_exit,
2426 };
2427
2428 static int __init af_unix_init(void)
2429 {
2430         int rc = -1;
2431
2432         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2433
2434         rc = proto_register(&unix_proto, 1);
2435         if (rc != 0) {
2436                 printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2437                        __func__);
2438                 goto out;
2439         }
2440
2441         sock_register(&unix_family_ops);
2442         register_pernet_subsys(&unix_net_ops);
2443 out:
2444         return rc;
2445 }
2446
2447 static void __exit af_unix_exit(void)
2448 {
2449         sock_unregister(PF_UNIX);
2450         proto_unregister(&unix_proto);
2451         unregister_pernet_subsys(&unix_net_ops);
2452 }
2453
2454 /* Earlier than device_initcall() so that other drivers invoking
2455    request_module() don't end up in a loop when modprobe tries
2456    to use a UNIX socket. But later than subsys_initcall() because
2457    we depend on stuff initialised there */
2458 fs_initcall(af_unix_init);
2459 module_exit(af_unix_exit);
2460
2461 MODULE_LICENSE("GPL");
2462 MODULE_ALIAS_NETPROTO(PF_UNIX);