/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector
 *		Heiko Eißfeldt	:	Missing verify_area check
 *		Alan Cox	:	Started POSIXisms
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting
 *		Kirk Petersen	:	Made this a module
 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
 *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid huge amounts
 *					of socks hashed (this is for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations
 *	     Alexey Kuznetsov	:	Full scale SMP. Lots of bugs are introduced 8)
 *	      Malcolm Beattie	:	Set peercred for socketpair
 *	     Michal Ostrowski	:	Module initialization cleanup.
 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+)
 *
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  started by 0, so that this name space does not intersect
 *		  with BSD names.
 */
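
/*
 * Illustrative userspace sketch (an addition for exposition, not part
 * of the original file): how the two AF_UNIX name forms described
 * above are laid out in a sockaddr_un.
 *
 *	struct sockaddr_un a;
 *
 *	// Filesystem name: NUL-terminated path in sun_path.
 *	a.sun_family = AF_UNIX;
 *	strcpy(a.sun_path, "/tmp/mysock");	// hypothetical path
 *
 *	// Abstract name: sun_path[0] == 0, followed by arbitrary bytes;
 *	// the address length passed to bind(), not a NUL, delimits it.
 *	a.sun_family = AF_UNIX;
 *	a.sun_path[0] = '\0';
 *	memcpy(a.sun_path + 1, "myname", 6);
 *	// addrlen = offsetof(struct sockaddr_un, sun_path) + 1 + 6
 */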

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/freezer.h>

struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
DEFINE_SPINLOCK(unix_table_lock);
EXPORT_SYMBOL_GPL(unix_table_lock);
static atomic_long_t unix_nr_socks;


static struct hlist_head *unix_sockets_unbound(void *addr)
{
	unsigned long hash = (unsigned long)addr;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash %= UNIX_HASH_SIZE;
	return &unix_socket_table[UNIX_HASH_SIZE + hash];
}

#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

/*
 * SMP locking strategy:
 *    the hash table is protected with the spinlock unix_table_lock
 *    each socket state is protected by a separate spin lock.
 */

static inline unsigned int unix_hash_fold(__wsum n)
{
	unsigned int hash = (__force unsigned int)csum_fold(n);

	hash ^= hash>>8;
	return hash&(UNIX_HASH_SIZE-1);
}

#define unix_peer(sk) (unix_sk(sk)->peer)

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full(struct sock const *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static inline void unix_release_addr(struct unix_address *addr)
{
	if (atomic_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 *	Check unix socket name:
 *		- it should not be zero length.
 *		- if it starts with a non-zero byte, it must be NUL terminated
 *		  (FS object)
 *		- if it starts with zero, it is an abstract name.
 */

static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
{
	if (len <= sizeof(short) || len > sizeof(*sunaddr))
		return -EINVAL;
	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
		return -EINVAL;
	if (sunaddr->sun_path[0]) {
		/*
		 * This may look like an off by one error but it is a bit more
		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
		 * sun_path[108] doesn't as such exist.  However in kernel space
		 * we are guaranteed that it is a valid memory location in our
		 * kernel address buffer.
		 */
		((char *)sunaddr)[len] = 0;
		len = strlen(sunaddr->sun_path)+1+sizeof(short);
		return len;
	}

	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
	return len;
}
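
/*
 * A hedged sketch of what unix_mkname() above computes for the two
 * name forms (assuming sizeof(short) == sizeof(sa_family_t) == 2):
 *
 *	// Filesystem name "/tmp/x": the returned length is
 *	// strlen("/tmp/x") + 1 + sizeof(short) == 9, i.e. family
 *	// field + path + trailing NUL; *hashp is left untouched.
 *
 *	// Abstract name "\0foo" passed with len == 2 + 4 == 6: len is
 *	// returned unchanged and *hashp is a fold (unix_hash_fold())
 *	// of a checksum over the whole family + name byte string.
 */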

static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	WARN_ON(!sk_unhashed(sk));
	sk_add_node(sk, list);
}

static inline void unix_remove_socket(struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_remove_socket(sk);
	spin_unlock(&unix_table_lock);
}

static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_insert_socket(list, sk);
	spin_unlock(&unix_table_lock);
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, int type, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &unix_socket_table[hash ^ type]) {
		struct unix_sock *u = unix_sk(s);

		if (!net_eq(sock_net(s), net))
			continue;

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			goto found;
	}
	s = NULL;
found:
	return s;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, int type,
						   unsigned int hash)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	s = __unix_find_socket_byname(net, sunname, len, type, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&unix_table_lock);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	sk_for_each(s,
		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			goto found;
		}
	}
	s = NULL;
found:
	spin_unlock(&unix_table_lock);
	return s;
}

static int unix_writable(const struct sock *sk)
{
	return sk->sk_state != TCP_LISTEN &&
	       (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}

static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (wq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				POLLOUT | POLLWRNORM | POLLWRBAND);
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}

/* When a dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets that arrived from the previous peer. First, this allows us
 * to do flow control based only on wmem_alloc; second, an sk connected to a
 * peer may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of a bidirectional dgram pipe is disconnected,
		 * we signal an error. Messages are lost. Do not do this when
		 * the peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			other->sk_err = ECONNRESET;
			other->sk_error_report(other);
		}
	}
}

static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
	WARN_ON(!sk_unhashed(sk));
	WARN_ON(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	local_bh_disable();
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	local_bh_enable();
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}

static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct path path;
	struct sock *skpair;
	struct sk_buff *skb;
	int state;

	unix_remove_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	path = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;
	unix_state_unlock(sk);

	wake_up_interruptible_all(&u->peer_wait);

	skpair = unix_peer(sk);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}
		sock_put(skpair); /* It may now die */
		unix_peer(sk) = NULL;
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook */
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 * What is the above comment talking about? --ANK(980817)
	 */

	if (unix_tot_inflight)
		unix_gc();		/* Garbage collect fds */
}

static void init_peercred(struct sock *sk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
}

static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct pid *old_pid = NULL;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!u->addr)
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	sk->sk_state		= TCP_LISTEN;
	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
	put_pid(old_pid);
out:
	return err;
}
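
/*
 * Illustrative userspace counterpart of unix_listen() (a sketch under
 * assumptions, not kernel code); the path is hypothetical:
 *
 *	int s = socket(AF_UNIX, SOCK_STREAM, 0);
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *	strcpy(a.sun_path, "/tmp/srv");
 *	bind(s, (struct sockaddr *)&a, sizeof(a));	// listen requires
 *							// a bound socket
 *	listen(s, 16);		// 16 becomes sk_max_ack_backlog
 *	int c = accept(s, NULL, NULL);
 */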

static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
static unsigned int unix_dgram_poll(struct file *, struct socket *,
				    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
				    size_t size, int flags);
static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);

static int unix_set_peek_off(struct sock *sk, int val)
{
	struct unix_sock *u = unix_sk(sk);

	if (mutex_lock_interruptible(&u->readlock))
		return -EINTR;

	sk->sk_peek_off = val;
	mutex_unlock(&u->readlock);

	return 0;
}


static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	unix_stream_sendpage,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_dgram_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static struct proto unix_proto = {
	.name	  = "UNIX",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct unix_sock),
};

/*
 * AF_UNIX sockets do not interact with hardware, hence they
 * don't trigger interrupts - so it's safe for them to have
 * bh-unsafe locking for their sk_receive_queue.lock. Split off
 * this special lock-class by reinitializing the spinlock key:
 */
static struct lock_class_key af_unix_sk_receive_queue_lock_key;

static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
{
	struct sock *sk = NULL;
	struct unix_sock *u;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
		goto out;

	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
	if (!sk)
		goto out;

	sock_init_data(sock, sk);
	lockdep_set_class(&sk->sk_receive_queue.lock,
			  &af_unix_sk_receive_queue_lock_key);

	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	u = unix_sk(sk);
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	atomic_long_set(&u->inflight, 0);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->readlock); /* single task reading lock */
	init_waitqueue_head(&u->peer_wait);
	unix_insert_socket(unix_sockets_unbound(sk), sk);
out:
	if (sk == NULL)
		atomic_long_dec(&unix_nr_socks);
	else {
		local_bh_disable();
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
		local_bh_enable();
	}
	return sk;
}

static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW
		 *	though nothing uses it.
		 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		/* fall through */
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
}

static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}

static int unix_autobind(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	static u32 ordernum = 1;
	struct unix_address *addr;
	int err;
	unsigned int retries = 0;

	err = mutex_lock_interruptible(&u->readlock);
	if (err)
		return err;

	err = 0;
	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->name->sun_family = AF_UNIX;
	atomic_set(&addr->refcnt, 1);

retry:
	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));

	spin_lock(&unix_table_lock);
	ordernum = (ordernum+1)&0xFFFFF;

	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
				      addr->hash)) {
		spin_unlock(&unix_table_lock);
		/*
		 * __unix_find_socket_byname() may take a long time if many
		 * names are already in use.
		 */
		cond_resched();
		/* Give up if all names seem to be in use. */
		if (retries++ == 0xFFFFF) {
			err = -ENOSPC;
			kfree(addr);
			goto out;
		}
		goto retry;
	}
	addr->hash ^= sk->sk_type;

	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
	spin_unlock(&unix_table_lock);
	err = 0;

out:	mutex_unlock(&u->readlock);
	return err;
}

static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunname, int len,
				    int type, unsigned int hash, int *error)
{
	struct sock *u;
	struct path path;
	int err = 0;

	if (sunname->sun_path[0]) {
		struct inode *inode;
		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
		if (err)
			goto fail;
		inode = d_backing_inode(path.dentry);
		err = inode_permission(inode, MAY_WRITE);
		if (err)
			goto put_fail;

		err = -ECONNREFUSED;
		if (!S_ISSOCK(inode->i_mode))
			goto put_fail;
		u = unix_find_socket_byinode(inode);
		if (!u)
			goto put_fail;

		if (u->sk_type == type)
			touch_atime(&path);

		path_put(&path);

		err = -EPROTOTYPE;
		if (u->sk_type != type) {
			sock_put(u);
			goto fail;
		}
	} else {
		err = -ECONNREFUSED;
		u = unix_find_socket_byname(net, sunname, len, type, hash);
		if (u) {
			struct dentry *dentry;
			dentry = unix_sk(u)->path.dentry;
			if (dentry)
				touch_atime(&unix_sk(u)->path);
		} else
			goto fail;
	}
	return u;

put_fail:
	path_put(&path);
fail:
	*error = err;
	return NULL;
}

static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
{
	struct dentry *dentry;
	struct path path;
	int err = 0;
	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
	err = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		return err;

	/*
	 * All right, let's create it.
	 */
	err = security_path_mknod(&path, dentry, mode, 0);
	if (!err) {
		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
		if (!err) {
			res->mnt = mntget(path.mnt);
			res->dentry = dget(dentry);
		}
	}
	done_path_create(&path, dentry);
	return err;
}

static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	char *sun_path = sunaddr->sun_path;
	int err;
	unsigned int hash;
	struct unix_address *addr;
	struct hlist_head *list;

	err = -EINVAL;
	if (sunaddr->sun_family != AF_UNIX)
		goto out;

	if (addr_len == sizeof(short)) {
		err = unix_autobind(sock);
		goto out;
	}

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	err = mutex_lock_interruptible(&u->readlock);
	if (err)
		goto out;

	err = -EINVAL;
	if (u->addr)
		goto out_up;

	err = -ENOMEM;
	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
	if (!addr)
		goto out_up;

	memcpy(addr->name, sunaddr, addr_len);
	addr->len = addr_len;
	addr->hash = hash ^ sk->sk_type;
	atomic_set(&addr->refcnt, 1);

	if (sun_path[0]) {
		struct path path;
		umode_t mode = S_IFSOCK |
		       (SOCK_INODE(sock)->i_mode & ~current_umask());
		err = unix_mknod(sun_path, mode, &path);
		if (err) {
			if (err == -EEXIST)
				err = -EADDRINUSE;
			unix_release_addr(addr);
			goto out_up;
		}
		addr->hash = UNIX_HASH_SIZE;
		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1);
		spin_lock(&unix_table_lock);
		u->path = path;
		list = &unix_socket_table[hash];
	} else {
		spin_lock(&unix_table_lock);
		err = -EADDRINUSE;
		if (__unix_find_socket_byname(net, sunaddr, addr_len,
					      sk->sk_type, hash)) {
			unix_release_addr(addr);
			goto out_unlock;
		}

		list = &unix_socket_table[addr->hash];
	}

	err = 0;
	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(list, sk);

out_unlock:
	spin_unlock(&unix_table_lock);
out_up:
	mutex_unlock(&u->readlock);
out:
	return err;
}

static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 < sk2) {
		unix_state_lock(sk1);
		unix_state_lock_nested(sk2);
	} else {
		unix_state_lock(sk2);
		unix_state_lock_nested(sk1);
	}
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}

static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *other;
	unsigned int hash;
	int err;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_mkname(sunaddr, alen, &hash);
		if (err < 0)
			goto out;
		alen = err;

		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
			goto out;

restart:
		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
		if (!other)
			goto out;

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}
	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}
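
/*
 * Illustrative userspace sketch of the AF_UNSPEC branch above (the
 * 1003.1g disconnect); dgram_fd is hypothetical, error handling elided:
 *
 *	struct sockaddr sa = { .sa_family = AF_UNSPEC };
 *	connect(dgram_fd, &sa, sizeof(sa));	// drops the current peer
 */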

static long unix_wait_for_peer(struct sock *other, long timeo)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}

static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned int hash;
	int st;
	int err;
	long timeo;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
	    (err = unix_autobind(sock)) != 0)
		goto out;

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we do this after the state is locked,
	   we will have to recheck everything again in any case.
	 */

	err = -ENOMEM;

	/* create new sock for complete connection */
	newsk = unix_create1(sock_net(sk), NULL, 0);
	if (newsk == NULL)
		goto out;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/* Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   This is a tricky place. We need to grab our state lock and cannot
	   drop the lock on the peer. It is dangerous because deadlock is
	   possible. Connect-to-self and simultaneous connect attempts are
	   eliminated by checking socket state. other is TCP_LISTEN; if sk
	   is TCP_LISTEN we check this before attempting to grab the lock.

	   Well, and we have to recheck the state after the socket is locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Quickly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock */
	if (otheru->addr) {
		atomic_inc(&otheru->addr->refcnt);
		newu->addr = otheru->addr;
	}
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take the new sock and send info to the listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}

static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	if (ska->sk_type != SOCK_DGRAM) {
		ska->sk_state = TCP_ESTABLISHED;
		skb->sk_state = TCP_ESTABLISHED;
		socka->state  = SS_CONNECTED;
		sockb->state  = SS_CONNECTED;
	}
	return 0;
}
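
/*
 * Userspace view of unix_socketpair(), as a minimal sketch:
 *
 *	int sv[2];
 *	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == 0) {
 *		// sv[0] and sv[1] are now mutual peers, both in
 *		// TCP_ESTABLISHED, each holding a ref on the other.
 *	}
 */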

static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}

static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}


static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_sock *u;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	u = unix_sk(sk);
	unix_state_lock(sk);
	if (!u->addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		*uaddr_len = sizeof(short);
	} else {
		struct unix_address *addr = u->addr;

		*uaddr_len = addr->len;
		memcpy(sunaddr, addr->name, *uaddr_len);
	}
	unix_state_unlock(sk);
	sock_put(sk);
out:
	return err;
}

static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	for (i = scm->fp->count-1; i >= 0; i--)
		unix_notinflight(scm->fp->fp[i]);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;
	memset(&scm, 0, sizeof(scm));
	scm.pid = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}

#define MAX_RECURSION_LEVEL 4

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;
	unsigned char max_level = 0;
	int unix_sock_count = 0;

	for (i = scm->fp->count - 1; i >= 0; i--) {
		struct sock *sk = unix_get_socket(scm->fp->fp[i]);

		if (sk) {
			unix_sock_count++;
			max_level = max(max_level,
					unix_sk(sk)->recursion_level);
		}
	}
	if (unlikely(max_level > MAX_RECURSION_LEVEL))
		return -ETOOMANYREFS;

	/*
	 * Need to duplicate file references for the sake of garbage
	 * collection. Otherwise a socket in the fps might become a
	 * candidate for GC while the skb is not yet queued.
	 */
	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
	if (!UNIXCB(skb).fp)
		return -ENOMEM;

	if (unix_sock_count) {
		for (i = scm->fp->count - 1; i >= 0; i--)
			unix_inflight(scm->fp->fp[i]);
	}
	return max_level;
}
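
/*
 * For reference, a hedged userspace sketch of the SCM_RIGHTS fd
 * passing that unix_attach_fds() accounts for on the send side;
 * sock_fd and fd_to_pass are hypothetical:
 *
 *	char data = 'x';
 *	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
 *	char cbuf[CMSG_SPACE(sizeof(int))];
 *	struct msghdr mh = { .msg_iov = &iov, .msg_iovlen = 1,
 *			     .msg_control = cbuf,
 *			     .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&mh);
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type  = SCM_RIGHTS;
 *	cm->cmsg_len   = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cm), &fd_to_pass, sizeof(int));
 *	sendmsg(sock_fd, &mh, 0);
 */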

static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	unix_get_secdata(scm, skb);
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS
 * We include credentials if source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;
	if (test_bit(SOCK_PASSCRED, &sock->flags) ||
	    !other->sk_socket ||
	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
		UNIXCB(skb).pid = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}
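
/*
 * Hedged userspace sketch of the receiving side that maybe_add_creds()
 * serves: asserting SOCK_PASSCRED so credentials arrive as
 * SCM_CREDENTIALS control messages (fd is hypothetical):
 *
 *	int on = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *	// subsequent recvmsg() calls yield SCM_CREDENTIALS cmsgs
 */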

/*
 *	Send AF_UNIX data.
 */

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *other = NULL;
	int namelen = 0; /* fake GCC */
	int err;
	unsigned int hash;
	struct sk_buff *skb;
	long timeo;
	struct scm_cookie scm;
	int max_level;
	int data_len = 0;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
		if (err < 0)
			goto out;
		namelen = err;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
	    && (err = unix_autobind(sock)) != 0)
		goto out;

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;
	max_level = err + 1;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
					hash, &err);
		if (other == NULL)
			goto out_free;
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	unix_state_lock(other);
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (sock_flag(other, SOCK_DEAD)) {
		/*
		 *	Check with 1003.1g - what should a
		 *	datagram error return?
		 */
		unix_state_unlock(other);
		sock_put(other);

		err = 0;
		unix_state_lock(sk);
		if (unix_peer(sk) == other) {
			unix_peer(sk) = NULL;
			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	if (unix_peer(other) != sk && unix_recvq_full(other)) {
		if (!timeo) {
			err = -EAGAIN;
			goto out_unlock;
		}

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out_free;

		goto restart;
	}

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	skb_queue_tail(&other->sk_receive_queue, skb);
	if (max_level > unix_sk(other)->recursion_level)
		unix_sk(other)->recursion_level = max_level;
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(&scm);
	return err;
}

/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))

static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int max_level;
	int data_len;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out_err;

	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		/* Keep two messages in the pipe so it schedules better */
		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

		/* allow fallback to order-0 allocations */
		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
					   msg->msg_flags & MSG_DONTWAIT, &err,
					   get_order(UNIX_SKB_FRAGS_SZ));
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		max_level = err + 1;
		fds_sent = true;

		skb_put(skb, size - data_len);
		skb->data_len = data_len;
		skb->len = size;
		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
		if (err) {
			kfree_skb(skb);
			goto out_err;
		}

		unix_state_lock(other);

		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		skb_queue_tail(&other->sk_receive_queue, skb);
		if (max_level > unix_sk(other)->recursion_level)
			unix_sk(other)->recursion_level = max_level;
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	return sent ? : err;
}

static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
				    int offset, size_t size, int flags)
{
	int err = 0;
	bool send_sigpipe = true;
	struct sock *other, *sk = socket->sk;
	struct sk_buff *skb, *newskb = NULL, *tail = NULL;

	if (flags & MSG_OOB)
		return -EOPNOTSUPP;

	other = unix_peer(sk);
	if (!other || sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (false) {
alloc_skb:
		unix_state_unlock(other);
		mutex_unlock(&unix_sk(other)->readlock);
		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
					      &err, 0);
		if (!newskb)
			return err;
	}

	/* we must acquire readlock as we modify already present
	 * skbs in the sk_receive_queue and mess with skb->len
	 */
	err = mutex_lock_interruptible(&unix_sk(other)->readlock);
	if (err) {
		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
		send_sigpipe = false;
		goto err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN) {
		err = -EPIPE;
		goto err_unlock;
	}

	unix_state_lock(other);

	if (sock_flag(other, SOCK_DEAD) ||
	    other->sk_shutdown & RCV_SHUTDOWN) {
		err = -EPIPE;
		goto err_state_unlock;
	}

	skb = skb_peek_tail(&other->sk_receive_queue);
	if (tail && tail == skb) {
		skb = newskb;
	} else if (!skb) {
		if (newskb)
			skb = newskb;
		else
			goto alloc_skb;
	} else if (newskb) {
		/* this is the fast path: we do not necessarily need to
		 * call kfree_skb() here; even with newskb == NULL,
		 * consume_skb() does no harm
		 */
		consume_skb(newskb);
	}

	if (skb_append_pagefrags(skb, page, offset, size)) {
		tail = skb;
		goto alloc_skb;
	}

	skb->len += size;
	skb->data_len += size;
	skb->truesize += size;
	atomic_add(size, &sk->sk_wmem_alloc);

	if (newskb)
		__skb_queue_tail(&other->sk_receive_queue, newskb);

	unix_state_unlock(other);
	mutex_unlock(&unix_sk(other)->readlock);

	other->sk_data_ready(other);

	return size;

err_state_unlock:
	unix_state_unlock(other);
err_unlock:
	mutex_unlock(&unix_sk(other)->readlock);
err:
	kfree_skb(newskb);
	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	return err;
}

static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
				  size_t len)
{
	int err;
	struct sock *sk = sock->sk;

	err = sock_error(sk);
	if (err)
		return err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (msg->msg_namelen)
		msg->msg_namelen = 0;

	return unix_dgram_sendmsg(sock, msg, len);
}

static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
				  size_t size, int flags)
{
	struct sock *sk = sock->sk;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	return unix_dgram_recvmsg(sock, msg, size, flags);
}

static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	if (u->addr) {
		msg->msg_namelen = u->addr->len;
		memcpy(msg->msg_name, u->addr->name, u->addr->len);
	}
}

static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
			      size_t size, int flags)
{
	struct scm_cookie scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int noblock = flags & MSG_DONTWAIT;
	struct sk_buff *skb;
	int err;
	int peeked, skip;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	err = mutex_lock_interruptible(&u->readlock);
	if (unlikely(err)) {
		/* recvmsg() in non blocking mode is supposed to return -EAGAIN
		 * sk_rcvtimeo is not honored by mutex_lock_interruptible()
		 */
		err = noblock ? -EAGAIN : -ERESTARTSYS;
		goto out;
	}

	skip = sk_peek_offset(sk, flags);

	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
	if (!skb) {
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out_unlock;
	}

	wake_up_interruptible_sync_poll(&u->peer_wait,
					POLLOUT | POLLWRNORM | POLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			scm.fp = scm_fp_dup(UNIXCB(skb).fp);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
out_unlock:
	mutex_unlock(&u->readlock);
out:
	return err;
}

/*
 *	Sleep until more data has arrived. But check for races..
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len)
{
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    (tail && tail->len != last_len) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
		unix_state_unlock(sk);
		timeo = freezable_schedule_timeout(timeo);
		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD))
			break;

		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}

static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}

struct unix_stream_read_state {
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;
	struct msghdr *msg;
	struct pipe_inode_info *pipe;
	size_t size;
	int flags;
	unsigned int splice_flags;
};

static int unix_stream_read_generic(struct unix_stream_read_state *state)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	err = -EINVAL;
	if (sk->sk_state != TCP_ESTABLISHED)
		goto out;

	err = -EOPNOTSUPP;
	if (flags & MSG_OOB)
		goto out;

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while we sleep in memcpy_to_msg()
	 */
2059 err = mutex_lock_interruptible(&u->readlock);
2060 if (unlikely(err)) {
2061 /* recvmsg() in non blocking mode is supposed to return -EAGAIN
2062 * sk_rcvtimeo is not honored by mutex_lock_interruptible()
2063 */
2064 err = noblock ? -EAGAIN : -ERESTARTSYS;
2065 goto out;
2066 }
2067
2068 if (flags & MSG_PEEK)
2069 skip = sk_peek_offset(sk, flags);
2070 else
2071 skip = 0;
2072
2073 do {
2074 int chunk;
2075 struct sk_buff *skb, *last;
2076
2077 unix_state_lock(sk);
2078 if (sock_flag(sk, SOCK_DEAD)) {
2079 err = -ECONNRESET;
2080 goto unlock;
2081 }
2082 last = skb = skb_peek(&sk->sk_receive_queue);
2083 last_len = last ? last->len : 0;
2084 again:
2085 if (skb == NULL) {
2086 unix_sk(sk)->recursion_level = 0;
2087 if (copied >= target)
2088 goto unlock;
2089
2090 /*
2091 * POSIX 1003.1g mandates this order.
2092 */
2093
2094 err = sock_error(sk);
2095 if (err)
2096 goto unlock;
2097 if (sk->sk_shutdown & RCV_SHUTDOWN)
2098 goto unlock;
2099
2100 unix_state_unlock(sk);
2101 err = -EAGAIN;
2102 if (!timeo)
2103 break;
2104 mutex_unlock(&u->readlock);
2105
2106 timeo = unix_stream_data_wait(sk, timeo, last,
2107 last_len);
2108
2109 if (signal_pending(current) ||
2110 mutex_lock_interruptible(&u->readlock)) {
2111 err = sock_intr_errno(timeo);
2112 goto out;
2113 }
2114
2115 continue;
2116 unlock:
2117 unix_state_unlock(sk);
2118 break;
2119 }
2120
2121 while (skip >= unix_skb_len(skb)) {
2122 skip -= unix_skb_len(skb);
2123 last = skb;
2124 last_len = skb->len;
2125 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2126 if (!skb)
2127 goto again;
2128 }
2129
2130 unix_state_unlock(sk);
2131
2132 if (check_creds) {
2133 /* Never glue messages from different writers */
2134 if ((UNIXCB(skb).pid != scm.pid) ||
2135 !uid_eq(UNIXCB(skb).uid, scm.creds.uid) ||
2136 !gid_eq(UNIXCB(skb).gid, scm.creds.gid) ||
2137 !unix_secdata_eq(&scm, skb))
2138 break;
2139 } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2140 /* Copy credentials */
2141 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2142 unix_set_secdata(&scm, skb);
2143 check_creds = true;
2144 }
2145
2146 /* Copy address just once */
2147 if (state->msg && state->msg->msg_name) {
2148 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2149 state->msg->msg_name);
2150 unix_copy_addr(state->msg, skb->sk);
2151 sunaddr = NULL;
2152 }
2153
2154 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2155 chunk = state->recv_actor(skb, skip, chunk, state);
2156 if (chunk < 0) {
2157 if (copied == 0)
2158 copied = -EFAULT;
2159 break;
2160 }
2161 copied += chunk;
2162 size -= chunk;
2163
2164 /* Mark read part of skb as used */
2165 if (!(flags & MSG_PEEK)) {
2166 UNIXCB(skb).consumed += chunk;
2167
2168 sk_peek_offset_bwd(sk, chunk);
2169
2170 if (UNIXCB(skb).fp)
2171 unix_detach_fds(&scm, skb);
2172
2173 if (unix_skb_len(skb))
2174 break;
2175
2176 skb_unlink(skb, &sk->sk_receive_queue);
2177 consume_skb(skb);
2178
2179 if (scm.fp)
2180 break;
2181 } else {
2182 /* This is questionable; see the note in unix_dgram_recvmsg().
2183 */
2184 if (UNIXCB(skb).fp)
2185 scm.fp = scm_fp_dup(UNIXCB(skb).fp);
2186
2187 sk_peek_offset_fwd(sk, chunk);
2188
2189 if (UNIXCB(skb).fp)
2190 break;
2191
2192 skip = 0;
2193 last = skb;
2194 last_len = skb->len;
2195 unix_state_lock(sk);
2196 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2197 if (skb)
2198 goto again;
2199 unix_state_unlock(sk);
2200 break;
2201 }
2202 } while (size);
2203
2204 mutex_unlock(&u->readlock);
2205 if (state->msg)
2206 scm_recv(sock, state->msg, &scm, flags);
2207 else
2208 scm_destroy(&scm);
2209 out:
2210 return copied ? : err;
2211 }
2212
2213 static int unix_stream_read_actor(struct sk_buff *skb,
2214 int skip, int chunk,
2215 struct unix_stream_read_state *state)
2216 {
2217 int ret;
2218
2219 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2220 state->msg, chunk);
2221 return ret ?: chunk;
2222 }
2223
2224 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2225 size_t size, int flags)
2226 {
2227 struct unix_stream_read_state state = {
2228 .recv_actor = unix_stream_read_actor,
2229 .socket = sock,
2230 .msg = msg,
2231 .size = size,
2232 .flags = flags
2233 };
2234
2235 return unix_stream_read_generic(&state);
2236 }
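/*
 * Userspace view (hypothetical sketch, not kernel code): this function
 * services recv(2)/recvmsg(2) on a connected SOCK_STREAM AF_UNIX socket;
 * fd below is assumed to be such a descriptor:
 *
 *	char buf[256];
 *	ssize_t n = recv(fd, buf, sizeof(buf), MSG_DONTWAIT);
 *	// with nothing queued, n == -1 and errno == EAGAIN, matching
 *	// the -EAGAIN paths in unix_stream_read_generic()
 */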
2237
2238 static ssize_t skb_unix_socket_splice(struct sock *sk,
2239 struct pipe_inode_info *pipe,
2240 struct splice_pipe_desc *spd)
2241 {
2242 int ret;
2243 struct unix_sock *u = unix_sk(sk);
2244
2245 mutex_unlock(&u->readlock);
2246 ret = splice_to_pipe(pipe, spd);
2247 mutex_lock(&u->readlock);
2248
2249 return ret;
2250 }
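/*
 * Note: u->readlock is dropped around splice_to_pipe() because that call
 * may block waiting for pipe space; sleeping there while holding the
 * receive-side mutex would stall every other reader of this socket.
 */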
2251
2252 static int unix_stream_splice_actor(struct sk_buff *skb,
2253 int skip, int chunk,
2254 struct unix_stream_read_state *state)
2255 {
2256 return skb_splice_bits(skb, state->socket->sk,
2257 UNIXCB(skb).consumed + skip,
2258 state->pipe, chunk, state->splice_flags,
2259 skb_unix_socket_splice);
2260 }
2261
2262 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
2263 struct pipe_inode_info *pipe,
2264 size_t size, unsigned int flags)
2265 {
2266 struct unix_stream_read_state state = {
2267 .recv_actor = unix_stream_splice_actor,
2268 .socket = sock,
2269 .pipe = pipe,
2270 .size = size,
2271 .splice_flags = flags,
2272 };
2273
2274 if (unlikely(*ppos))
2275 return -ESPIPE;
2276
2277 if (sock->file->f_flags & O_NONBLOCK ||
2278 flags & SPLICE_F_NONBLOCK)
2279 state.flags = MSG_DONTWAIT;
2280
2281 return unix_stream_read_generic(&state);
2282 }
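/*
 * Illustrative userspace sketch (not part of this file): move queued
 * stream data into a pipe without a round trip through user memory;
 * sock_fd is assumed to be a connected AF_UNIX stream socket:
 *
 *	int pfd[2];
 *	pipe(pfd);
 *	ssize_t n = splice(sock_fd, NULL, pfd[1], NULL, 4096,
 *			   SPLICE_F_NONBLOCK);
 *	// SPLICE_F_NONBLOCK (or O_NONBLOCK on the socket) becomes
 *	// MSG_DONTWAIT in state.flags above
 */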
2283
2284 static int unix_shutdown(struct socket *sock, int mode)
2285 {
2286 struct sock *sk = sock->sk;
2287 struct sock *other;
2288
2289 if (mode < SHUT_RD || mode > SHUT_RDWR)
2290 return -EINVAL;
2291 /* This maps:
2292 * SHUT_RD (0) -> RCV_SHUTDOWN (1)
2293 * SHUT_WR (1) -> SEND_SHUTDOWN (2)
2294 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2295 */
2296 ++mode;
2297
2298 unix_state_lock(sk);
2299 sk->sk_shutdown |= mode;
2300 other = unix_peer(sk);
2301 if (other)
2302 sock_hold(other);
2303 unix_state_unlock(sk);
2304 sk->sk_state_change(sk);
2305
2306 if (other &&
2307 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2308
2309 int peer_mode = 0;
2310
2311 if (mode&RCV_SHUTDOWN)
2312 peer_mode |= SEND_SHUTDOWN;
2313 if (mode&SEND_SHUTDOWN)
2314 peer_mode |= RCV_SHUTDOWN;
2315 unix_state_lock(other);
2316 other->sk_shutdown |= peer_mode;
2317 unix_state_unlock(other);
2318 other->sk_state_change(other);
2319 if (peer_mode == SHUTDOWN_MASK)
2320 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2321 else if (peer_mode & RCV_SHUTDOWN)
2322 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2323 }
2324 if (other)
2325 sock_put(other);
2326
2327 return 0;
2328 }
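/*
 * Hypothetical userspace sketch: a half-close on one end is mirrored onto
 * the peer, so the peer sees end-of-stream rather than an error:
 *
 *	shutdown(fd, SHUT_WR);	// SEND_SHUTDOWN here, RCV_SHUTDOWN on peer
 *	// peer: read() returns 0 (EOF) once its queue drains, and
 *	// poll() reports POLLIN | POLLRDHUP
 */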
2329
2330 long unix_inq_len(struct sock *sk)
2331 {
2332 struct sk_buff *skb;
2333 long amount = 0;
2334
2335 if (sk->sk_state == TCP_LISTEN)
2336 return -EINVAL;
2337
2338 spin_lock(&sk->sk_receive_queue.lock);
2339 if (sk->sk_type == SOCK_STREAM ||
2340 sk->sk_type == SOCK_SEQPACKET) {
2341 skb_queue_walk(&sk->sk_receive_queue, skb)
2342 amount += unix_skb_len(skb);
2343 } else {
2344 skb = skb_peek(&sk->sk_receive_queue);
2345 if (skb)
2346 amount = skb->len;
2347 }
2348 spin_unlock(&sk->sk_receive_queue.lock);
2349
2350 return amount;
2351 }
2352 EXPORT_SYMBOL_GPL(unix_inq_len);
2353
2354 long unix_outq_len(struct sock *sk)
2355 {
2356 return sk_wmem_alloc_get(sk);
2357 }
2358 EXPORT_SYMBOL_GPL(unix_outq_len);
2359
2360 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2361 {
2362 struct sock *sk = sock->sk;
2363 long amount = 0;
2364 int err;
2365
2366 switch (cmd) {
2367 case SIOCOUTQ:
2368 amount = unix_outq_len(sk);
2369 err = put_user(amount, (int __user *)arg);
2370 break;
2371 case SIOCINQ:
2372 amount = unix_inq_len(sk);
2373 if (amount < 0)
2374 err = amount;
2375 else
2376 err = put_user(amount, (int __user *)arg);
2377 break;
2378 default:
2379 err = -ENOIOCTLCMD;
2380 break;
2381 }
2382 return err;
2383 }
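/*
 * Illustrative use of the ioctls handled above (userspace sketch; needs
 * <sys/ioctl.h> and <linux/sockios.h>):
 *
 *	int pending;
 *	ioctl(fd, SIOCINQ, &pending);	// unread bytes queued on fd
 *	ioctl(fd, SIOCOUTQ, &pending);	// bytes written but not yet read
 */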
2384
2385 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2386 {
2387 struct sock *sk = sock->sk;
2388 unsigned int mask;
2389
2390 sock_poll_wait(file, sk_sleep(sk), wait);
2391 mask = 0;
2392
2393 /* exceptional events? */
2394 if (sk->sk_err)
2395 mask |= POLLERR;
2396 if (sk->sk_shutdown == SHUTDOWN_MASK)
2397 mask |= POLLHUP;
2398 if (sk->sk_shutdown & RCV_SHUTDOWN)
2399 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2400
2401 /* readable? */
2402 if (!skb_queue_empty(&sk->sk_receive_queue))
2403 mask |= POLLIN | POLLRDNORM;
2404
2405 /* Connection-based sockets need to check for termination and startup */
2406 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2407 sk->sk_state == TCP_CLOSE)
2408 mask |= POLLHUP;
2409
2410 /*
2411  * We also set writable when the other side has shut down the
2412  * connection. This prevents stuck sockets.
2413 */
2414 if (unix_writable(sk))
2415 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2416
2417 return mask;
2418 }
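/*
 * Userspace sketch (illustrative): the mask assembled above is what
 * poll(2) reports for a connected stream socket:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };
 *	poll(&pfd, 1, -1);
 *	// pfd.revents gains POLLHUP once both directions are shut
 *	// down (sk_shutdown == SHUTDOWN_MASK) or the connection closed
 */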
2419
2420 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2421 poll_table *wait)
2422 {
2423 struct sock *sk = sock->sk, *other;
2424 unsigned int mask, writable;
2425
2426 sock_poll_wait(file, sk_sleep(sk), wait);
2427 mask = 0;
2428
2429 /* exceptional events? */
2430 if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2431 mask |= POLLERR |
2432 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2433
2434 if (sk->sk_shutdown & RCV_SHUTDOWN)
2435 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2436 if (sk->sk_shutdown == SHUTDOWN_MASK)
2437 mask |= POLLHUP;
2438
2439 /* readable? */
2440 if (!skb_queue_empty(&sk->sk_receive_queue))
2441 mask |= POLLIN | POLLRDNORM;
2442
2443 /* Connection-based sockets need to check for termination and startup */
2444 if (sk->sk_type == SOCK_SEQPACKET) {
2445 if (sk->sk_state == TCP_CLOSE)
2446 mask |= POLLHUP;
2447 /* connection hasn't started yet? */
2448 if (sk->sk_state == TCP_SYN_SENT)
2449 return mask;
2450 }
2451
2452 /* No write status requested, avoid expensive OUT tests. */
2453 if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2454 return mask;
2455
2456 writable = unix_writable(sk);
2457 other = unix_peer_get(sk);
2458 if (other) {
2459 if (unix_peer(other) != sk) {
2460 sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2461 if (unix_recvq_full(other))
2462 writable = 0;
2463 }
2464 sock_put(other);
2465 }
2466
2467 if (writable)
2468 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2469 else
2470 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2471
2472 return mask;
2473 }
2474
2475 #ifdef CONFIG_PROC_FS
2476
2477 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
2478
2479 #define get_bucket(x) ((x) >> BUCKET_SPACE)
2480 #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
2481 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
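/*
 * Example encoding (assuming a 64-bit long and UNIX_HASH_BITS == 8):
 * BUCKET_SPACE == 64 - 9 - 1 == 54, so set_bucket_offset(3, 7) packs
 * bucket 3 into the top bits of *pos and offset 7 into the low 54 bits;
 * get_bucket() and get_offset() invert the packing.
 */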
2482
2483 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2484 {
2485 unsigned long offset = get_offset(*pos);
2486 unsigned long bucket = get_bucket(*pos);
2487 struct sock *sk;
2488 unsigned long count = 0;
2489
2490 for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2491 if (sock_net(sk) != seq_file_net(seq))
2492 continue;
2493 if (++count == offset)
2494 break;
2495 }
2496
2497 return sk;
2498 }
2499
2500 static struct sock *unix_next_socket(struct seq_file *seq,
2501 struct sock *sk,
2502 loff_t *pos)
2503 {
2504 unsigned long bucket;
2505
2506 while (sk > (struct sock *)SEQ_START_TOKEN) {
2507 sk = sk_next(sk);
2508 if (!sk)
2509 goto next_bucket;
2510 if (sock_net(sk) == seq_file_net(seq))
2511 return sk;
2512 }
2513
2514 do {
2515 sk = unix_from_bucket(seq, pos);
2516 if (sk)
2517 return sk;
2518
2519 next_bucket:
2520 bucket = get_bucket(*pos) + 1;
2521 *pos = set_bucket_offset(bucket, 1);
2522 } while (bucket < ARRAY_SIZE(unix_socket_table));
2523
2524 return NULL;
2525 }
2526
2527 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2528 __acquires(unix_table_lock)
2529 {
2530 spin_lock(&unix_table_lock);
2531
2532 if (!*pos)
2533 return SEQ_START_TOKEN;
2534
2535 if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2536 return NULL;
2537
2538 return unix_next_socket(seq, NULL, pos);
2539 }
2540
2541 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2542 {
2543 ++*pos;
2544 return unix_next_socket(seq, v, pos);
2545 }
2546
2547 static void unix_seq_stop(struct seq_file *seq, void *v)
2548 __releases(unix_table_lock)
2549 {
2550 spin_unlock(&unix_table_lock);
2551 }
2552
2553 static int unix_seq_show(struct seq_file *seq, void *v)
2554 {
2555
2556 if (v == SEQ_START_TOKEN)
2557 seq_puts(seq, "Num RefCount Protocol Flags Type St "
2558 "Inode Path\n");
2559 else {
2560 struct sock *s = v;
2561 struct unix_sock *u = unix_sk(s);
2562 unix_state_lock(s);
2563
2564 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2565 s,
2566 atomic_read(&s->sk_refcnt),
2567 0,
2568 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2569 s->sk_type,
2570 s->sk_socket ?
2571 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2572 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2573 sock_i_ino(s));
2574
2575 if (u->addr) {
2576 int i, len;
2577 seq_putc(seq, ' ');
2578
2579 i = 0;
2580 len = u->addr->len - sizeof(short);
2581 if (!UNIX_ABSTRACT(s))
2582 len--;
2583 else {
2584 seq_putc(seq, '@');
2585 i++;
2586 }
2587 for ( ; i < len; i++)
2588 seq_putc(seq, u->addr->name->sun_path[i]);
2589 }
2590 unix_state_unlock(s);
2591 seq_putc(seq, '\n');
2592 }
2593
2594 return 0;
2595 }
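/*
 * Example of a resulting /proc/net/unix line (illustrative values for a
 * listening pathname socket; abstract names would show a leading '@'):
 *
 *	Num       RefCount Protocol Flags    Type St Inode Path
 *	ffff8800b8d7a000: 00000002 00000000 00010000 0001 01 12345 /run/demo.sock
 */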
2596
2597 static const struct seq_operations unix_seq_ops = {
2598 .start = unix_seq_start,
2599 .next = unix_seq_next,
2600 .stop = unix_seq_stop,
2601 .show = unix_seq_show,
2602 };
2603
2604 static int unix_seq_open(struct inode *inode, struct file *file)
2605 {
2606 return seq_open_net(inode, file, &unix_seq_ops,
2607 sizeof(struct seq_net_private));
2608 }
2609
2610 static const struct file_operations unix_seq_fops = {
2611 .owner = THIS_MODULE,
2612 .open = unix_seq_open,
2613 .read = seq_read,
2614 .llseek = seq_lseek,
2615 .release = seq_release_net,
2616 };
2617
2618 #endif
2619
2620 static const struct net_proto_family unix_family_ops = {
2621 .family = PF_UNIX,
2622 .create = unix_create,
2623 .owner = THIS_MODULE,
2624 };
2625
2626
2627 static int __net_init unix_net_init(struct net *net)
2628 {
2629 int error = -ENOMEM;
2630
2631 net->unx.sysctl_max_dgram_qlen = 10;
2632 if (unix_sysctl_register(net))
2633 goto out;
2634
2635 #ifdef CONFIG_PROC_FS
2636 if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2637 unix_sysctl_unregister(net);
2638 goto out;
2639 }
2640 #endif
2641 error = 0;
2642 out:
2643 return error;
2644 }
2645
2646 static void __net_exit unix_net_exit(struct net *net)
2647 {
2648 unix_sysctl_unregister(net);
2649 remove_proc_entry("unix", net->proc_net);
2650 }
2651
2652 static struct pernet_operations unix_net_ops = {
2653 .init = unix_net_init,
2654 .exit = unix_net_exit,
2655 };
2656
2657 static int __init af_unix_init(void)
2658 {
2659 int rc = -1;
2660
2661 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2662
2663 rc = proto_register(&unix_proto, 1);
2664 if (rc != 0) {
2665 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
2666 goto out;
2667 }
2668
2669 sock_register(&unix_family_ops);
2670 register_pernet_subsys(&unix_net_ops);
2671 out:
2672 return rc;
2673 }
2674
2675 static void __exit af_unix_exit(void)
2676 {
2677 sock_unregister(PF_UNIX);
2678 proto_unregister(&unix_proto);
2679 unregister_pernet_subsys(&unix_net_ops);
2680 }
2681
2682 /* Earlier than device_initcall() so that other drivers invoking
2683 request_module() don't end up in a loop when modprobe tries
2684 to use a UNIX socket. But later than subsys_initcall() because
2685    we depend on infrastructure initialised there. */
2686 fs_initcall(af_unix_init);
2687 module_exit(af_unix_exit);
2688
2689 MODULE_LICENSE("GPL");
2690 MODULE_ALIAS_NETPROTO(PF_UNIX);