fs/pipe.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  *  linux/fs/pipe.c
   4  *
   5  *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
   6  */
   7
   8 #include <linux/mm.h>
   9 #include <linux/file.h>
  10 #include <linux/poll.h>
  11 #include <linux/slab.h>
  12 #include <linux/module.h>
  13 #include <linux/init.h>
  14 #include <linux/fs.h>
  15 #include <linux/log2.h>
  16 #include <linux/mount.h>
  17 #include <linux/pseudo_fs.h>
  18 #include <linux/magic.h>
  19 #include <linux/pipe_fs_i.h>
  20 #include <linux/uio.h>
  21 #include <linux/highmem.h>
  22 #include <linux/pagemap.h>
  23 #include <linux/audit.h>
  24 #include <linux/syscalls.h>
  25 #include <linux/fcntl.h>
  26 #include <linux/memcontrol.h>
  27 #include <linux/watch_queue.h>
  28
  29 #include <linux/uaccess.h>
  30 #include <asm/ioctls.h>
  31
  32 #include "internal.h"
  33
  34 /*
  35  * The max size that a non-root user is allowed to grow the pipe. Can
  36  * be set by root in /proc/sys/fs/pipe-max-size
  37  */
  38 unsigned int pipe_max_size = 1048576;
  39
  40 /* Maximum allocatable pages per user. Hard limit is unset by default, soft
  41  * matches default values.
  42  */
  43 unsigned long pipe_user_pages_hard;
  44 unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
  45
  46 /*
  47  * We use head and tail indices that aren't masked off, except at the point of
  48  * dereference, but rather they're allowed to wrap naturally.  This means there
  49  * isn't a dead spot in the buffer, but the ring has to be a power of two and
  50  * <= 2^31.
  51  * -- David Howells 2019-09-23.
  52  *
  53  * Reads with count = 0 should always return 0.
  54  * -- Julian Bradfield 1999-06-07.
  55  *
  56  * FIFOs and Pipes now generate SIGIO for both readers and writers.
  57  * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
  58  *
  59  * pipe_read & write cleanup
  60  * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
  61  */
  62
  63 static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
  64 {
  65         if (pipe->files)
  66                 mutex_lock_nested(&pipe->mutex, subclass);
  67 }
  68
  69 void pipe_lock(struct pipe_inode_info *pipe)
  70 {
  71         /*
  72          * pipe_lock() nests non-pipe inode locks (for writing to a file)
  73          */
  74         pipe_lock_nested(pipe, I_MUTEX_PARENT);
  75 }
  76 EXPORT_SYMBOL(pipe_lock);
  77
  78 void pipe_unlock(struct pipe_inode_info *pipe)
  79 {
  80         if (pipe->files)
  81                 mutex_unlock(&pipe->mutex);
  82 }
  83 EXPORT_SYMBOL(pipe_unlock);
  84
  85 static inline void __pipe_lock(struct pipe_inode_info *pipe)
  86 {
  87         mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
  88 }
  89
  90 static inline void __pipe_unlock(struct pipe_inode_info *pipe)
  91 {
  92         mutex_unlock(&pipe->mutex);
  93 }
  94
  95 void pipe_double_lock(struct pipe_inode_info *pipe1,
  96                       struct pipe_inode_info *pipe2)
  97 {
  98         BUG_ON(pipe1 == pipe2);
  99
 100         if (pipe1 < pipe2) {
 101                 pipe_lock_nested(pipe1, I_MUTEX_PARENT);
 102                 pipe_lock_nested(pipe2, I_MUTEX_CHILD);
 103         } else {
 104                 pipe_lock_nested(pipe2, I_MUTEX_PARENT);
 105                 pipe_lock_nested(pipe1, I_MUTEX_CHILD);
 106         }
 107 }
 108
 109 /* Drop the inode semaphore and wait for a pipe event, atomically */
 110 void pipe_wait(struct pipe_inode_info *pipe)
 111 {
 112         DEFINE_WAIT(rdwait);
 113         DEFINE_WAIT(wrwait);
 114
 115         /*
 116          * Pipes are system-local resources, so sleeping on them
 117          * is considered a noninteractive wait:
 118          */
 119         prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
 120         prepare_to_wait(&pipe->wr_wait, &wrwait, TASK_INTERRUPTIBLE);
 121         pipe_unlock(pipe);
 122         schedule();
 123         finish_wait(&pipe->rd_wait, &rdwait);
 124         finish_wait(&pipe->wr_wait, &wrwait);
 125         pipe_lock(pipe);
 126 }
 127
 128 static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
 129                                   struct pipe_buffer *buf)
 130 {
 131         struct page *page = buf->page;
 132
 133         /*
 134          * If nobody else uses this page, and we don't already have a
 135          * temporary page, let's keep track of it as a one-deep
 136          * allocation cache. (Otherwise just release our reference to it)
 137          */
 138         if (page_count(page) == 1 && !pipe->tmp_page)
 139                 pipe->tmp_page = page;
 140         else
 141                 put_page(page);
 142 }
 143
 144 static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
 145                 struct pipe_buffer *buf)
 146 {
 147         struct page *page = buf->page;
 148
 149         if (page_count(page) != 1)
 150                 return false;
 151         memcg_kmem_uncharge_page(page, 0);
 152         __SetPageLocked(page);
 153         return true;
 154 }
 155
 156 /**
 157  * generic_pipe_buf_try_steal - attempt to take ownership of a &pipe_buffer
 158  * @pipe:       the pipe that the buffer belongs to
 159  * @buf:        the buffer to attempt to steal
 160  *
 161  * Description:
 162  *      This function attempts to steal the &struct page attached to
 163  *      @buf. If successful, this function returns 0 and returns with
 164  *      the page locked. The caller may then reuse the page for whatever
 165  *      he wishes; the typical use is insertion into a different file
 166  *      page cache.
 167  */
 168 bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe,
 169                 struct pipe_buffer *buf)
 170 {
 171         struct page *page = buf->page;
 172
 173         /*
 174          * A reference of one is golden, that means that the owner of this
 175          * page is the only one holding a reference to it. lock the page
 176          * and return OK.
 177          */
 178         if (page_count(page) == 1) {
 179                 lock_page(page);
 180                 return true;
 181         }
 182         return false;
 183 }
 184 EXPORT_SYMBOL(generic_pipe_buf_try_steal);
 185
 186 /**
 187  * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 188  * @pipe:       the pipe that the buffer belongs to
 189  * @buf:        the buffer to get a reference to
 190  *
 191  * Description:
 192  *      This function grabs an extra reference to @buf. It's used in
 193  *      in the tee() system call, when we duplicate the buffers in one
 194  *      pipe into another.
 195  */
 196 bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 197 {
 198         return try_get_page(buf->page);
 199 }
 200 EXPORT_SYMBOL(generic_pipe_buf_get);
 201
 202 /**
 203  * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 204  * @pipe:       the pipe that the buffer belongs to
 205  * @buf:        the buffer to put a reference to
 206  *
 207  * Description:
 208  *      This function releases a reference to @buf.
 209  */
 210 void generic_pipe_buf_release(struct pipe_inode_info *pipe,
 211                               struct pipe_buffer *buf)
 212 {
 213         put_page(buf->page);
 214 }
 215 EXPORT_SYMBOL(generic_pipe_buf_release);
 216
 217 static const struct pipe_buf_operations anon_pipe_buf_ops = {
 218         .release        = anon_pipe_buf_release,
 219         .try_steal      = anon_pipe_buf_try_steal,
 220         .get            = generic_pipe_buf_get,
 221 };
 222
 223 /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
 224 static inline bool pipe_readable(const struct pipe_inode_info *pipe)
 225 {
 226         unsigned int head = READ_ONCE(pipe->head);
 227         unsigned int tail = READ_ONCE(pipe->tail);
 228         unsigned int writers = READ_ONCE(pipe->writers);
 229
 230         return !pipe_empty(head, tail) || !writers;
 231 }
 232
 233 static ssize_t
 234 pipe_read(struct kiocb *iocb, struct iov_iter *to)
 235 {
 236         size_t total_len = iov_iter_count(to);
 237         struct file *filp = iocb->ki_filp;
 238         struct pipe_inode_info *pipe = filp->private_data;
 239         bool was_full, wake_next_reader = false;
 240         ssize_t ret;
 241
 242         /* Null read succeeds. */
 243         if (unlikely(total_len == 0))
 244                 return 0;
 245
 246         ret = 0;
 247         __pipe_lock(pipe);
 248
 249         /*
 250          * We only wake up writers if the pipe was full when we started
 251          * reading in order to avoid unnecessary wakeups.
 252          *
 253          * But when we do wake up writers, we do so using a sync wakeup
 254          * (WF_SYNC), because we want them to get going and generate more
 255          * data for us.
 256          */
 257         was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
 258         for (;;) {
 259                 unsigned int head = pipe->head;
 260                 unsigned int tail = pipe->tail;
 261                 unsigned int mask = pipe->ring_size - 1;
 262
 263 #ifdef CONFIG_WATCH_QUEUE
 264                 if (pipe->note_loss) {
 265                         struct watch_notification n;
 266
 267                         if (total_len < 8) {
 268                                 if (ret == 0)
 269                                         ret = -ENOBUFS;
 270                                 break;
 271                         }
 272
 273                         n.type = WATCH_TYPE_META;
 274                         n.subtype = WATCH_META_LOSS_NOTIFICATION;
 275                         n.info = watch_sizeof(n);
 276                         if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
 277                                 if (ret == 0)
 278                                         ret = -EFAULT;
 279                                 break;
 280                         }
 281                         ret += sizeof(n);
 282                         total_len -= sizeof(n);
 283                         pipe->note_loss = false;
 284                 }
 285 #endif
 286
 287                 if (!pipe_empty(head, tail)) {
 288                         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
 289                         size_t chars = buf->len;
 290                         size_t written;
 291                         int error;
 292
 293                         if (chars > total_len) {
 294                                 if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
 295                                         if (ret == 0)
 296                                                 ret = -ENOBUFS;
 297                                         break;
 298                                 }
 299                                 chars = total_len;
 300                         }
 301
 302                         error = pipe_buf_confirm(pipe, buf);
 303                         if (error) {
 304                                 if (!ret)
 305                                         ret = error;
 306                                 break;
 307                         }
 308
 309                         written = copy_page_to_iter(buf->page, buf->offset, chars, to);
 310                         if (unlikely(written < chars)) {
 311                                 if (!ret)
 312                                         ret = -EFAULT;
 313                                 break;
 314                         }
 315                         ret += chars;
 316                         buf->offset += chars;
 317                         buf->len -= chars;
 318
 319                         /* Was it a packet buffer? Clean up and exit */
 320                         if (buf->flags & PIPE_BUF_FLAG_PACKET) {
 321                                 total_len = chars;
 322                                 buf->len = 0;
 323                         }
 324
 325                         if (!buf->len) {
 326                                 pipe_buf_release(pipe, buf);
 327                                 spin_lock_irq(&pipe->rd_wait.lock);
 328 #ifdef CONFIG_WATCH_QUEUE
 329                                 if (buf->flags & PIPE_BUF_FLAG_LOSS)
 330                                         pipe->note_loss = true;
 331 #endif
 332                                 tail++;
 333                                 pipe->tail = tail;
 334                                 spin_unlock_irq(&pipe->rd_wait.lock);
 335                         }
 336                         total_len -= chars;
 337                         if (!total_len)
 338                                 break;  /* common path: read succeeded */
 339                         if (!pipe_empty(head, tail))    /* More to do? */
 340                                 continue;
 341                 }
 342
 343                 if (!pipe->writers)
 344                         break;
 345                 if (ret)
 346                         break;
 347                 if (filp->f_flags & O_NONBLOCK) {
 348                         ret = -EAGAIN;
 349                         break;
 350                 }
 351                 __pipe_unlock(pipe);
 352
 353                 /*
 354                  * We only get here if we didn't actually read anything.
 355                  *
 356                  * However, we could have seen (and removed) a zero-sized
 357                  * pipe buffer, and might have made space in the buffers
 358                  * that way.
 359                  *
 360                  * You can't make zero-sized pipe buffers by doing an empty
 361                  * write (not even in packet mode), but they can happen if
 362                  * the writer gets an EFAULT when trying to fill a buffer
 363                  * that already got allocated and inserted in the buffer
 364                  * array.
 365                  *
 366                  * So we still need to wake up any pending writers in the
 367                  * _very_ unlikely case that the pipe was full, but we got
 368                  * no data.
 369                  */
 370                 if (unlikely(was_full)) {
 371                         wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
 372                         kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 373                 }
 374
 375                 /*
 376                  * But because we didn't read anything, at this point we can
 377                  * just return directly with -ERESTARTSYS if we're interrupted,
 378                  * since we've done any required wakeups and there's no need
 379                  * to mark anything accessed. And we've dropped the lock.
 380                  */
 381                 if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
 382                         return -ERESTARTSYS;
 383
 384                 __pipe_lock(pipe);
 385                 was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
 386                 wake_next_reader = true;
 387         }
 388         if (pipe_empty(pipe->head, pipe->tail))
 389                 wake_next_reader = false;
 390         __pipe_unlock(pipe);
 391
 392         if (was_full) {
 393                 wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
 394                 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 395         }
 396         if (wake_next_reader)
 397                 wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
 398         if (ret > 0)
 399                 file_accessed(filp);
 400         return ret;
 401 }
 402
 403 static inline int is_packetized(struct file *file)
 404 {
 405         return (file->f_flags & O_DIRECT) != 0;
 406 }
 407
 408 /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
 409 static inline bool pipe_writable(const struct pipe_inode_info *pipe)
 410 {
 411         unsigned int head = READ_ONCE(pipe->head);
 412         unsigned int tail = READ_ONCE(pipe->tail);
 413         unsigned int max_usage = READ_ONCE(pipe->max_usage);
 414
 415         return !pipe_full(head, tail, max_usage) ||
 416                 !READ_ONCE(pipe->readers);
 417 }
 418
 419 static ssize_t
 420 pipe_write(struct kiocb *iocb, struct iov_iter *from)
 421 {
 422         struct file *filp = iocb->ki_filp;
 423         struct pipe_inode_info *pipe = filp->private_data;
 424         unsigned int head;
 425         ssize_t ret = 0;
 426         size_t total_len = iov_iter_count(from);
 427         ssize_t chars;
 428         bool was_empty = false;
 429         bool wake_next_writer = false;
 430
 431         /* Null write succeeds. */
 432         if (unlikely(total_len == 0))
 433                 return 0;
 434
 435         __pipe_lock(pipe);
 436
 437         if (!pipe->readers) {
 438                 send_sig(SIGPIPE, current, 0);
 439                 ret = -EPIPE;
 440                 goto out;
 441         }
 442
 443 #ifdef CONFIG_WATCH_QUEUE
 444         if (pipe->watch_queue) {
 445                 ret = -EXDEV;
 446                 goto out;
 447         }
 448 #endif
 449
 450         /*
 451          * Only wake up if the pipe started out empty, since
 452          * otherwise there should be no readers waiting.
 453          *
 454          * If it wasn't empty we try to merge new data into
 455          * the last buffer.
 456          *
 457          * That naturally merges small writes, but it also
 458          * page-aligs the rest of the writes for large writes
 459          * spanning multiple pages.
 460          */
 461         head = pipe->head;
 462         was_empty = pipe_empty(head, pipe->tail);
 463         chars = total_len & (PAGE_SIZE-1);
 464         if (chars && !was_empty) {
 465                 unsigned int mask = pipe->ring_size - 1;
 466                 struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
 467                 int offset = buf->offset + buf->len;
 468
 469                 if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
 470                     offset + chars <= PAGE_SIZE) {
 471                         ret = pipe_buf_confirm(pipe, buf);
 472                         if (ret)
 473                                 goto out;
 474
 475                         ret = copy_page_from_iter(buf->page, offset, chars, from);
 476                         if (unlikely(ret < chars)) {
 477                                 ret = -EFAULT;
 478                                 goto out;
 479                         }
 480
 481                         buf->len += ret;
 482                         if (!iov_iter_count(from))
 483                                 goto out;
 484                 }
 485         }
 486
 487         for (;;) {
 488                 if (!pipe->readers) {
 489                         send_sig(SIGPIPE, current, 0);
 490                         if (!ret)
 491                                 ret = -EPIPE;
 492                         break;
 493                 }
 494
 495                 head = pipe->head;
 496                 if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
 497                         unsigned int mask = pipe->ring_size - 1;
 498                         struct pipe_buffer *buf = &pipe->bufs[head & mask];
 499                         struct page *page = pipe->tmp_page;
 500                         int copied;
 501
 502                         if (!page) {
 503                                 page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
 504                                 if (unlikely(!page)) {
 505                                         ret = ret ? : -ENOMEM;
 506                                         break;
 507                                 }
 508                                 pipe->tmp_page = page;
 509                         }
 510
 511                         /* Allocate a slot in the ring in advance and attach an
 512                          * empty buffer.  If we fault or otherwise fail to use
 513                          * it, either the reader will consume it or it'll still
 514                          * be there for the next write.
 515                          */
 516                         spin_lock_irq(&pipe->rd_wait.lock);
 517
 518                         head = pipe->head;
 519                         if (pipe_full(head, pipe->tail, pipe->max_usage)) {
 520                                 spin_unlock_irq(&pipe->rd_wait.lock);
 521                                 continue;
 522                         }
 523
 524                         pipe->head = head + 1;
 525                         spin_unlock_irq(&pipe->rd_wait.lock);
 526
 527                         /* Insert it into the buffer array */
 528                         buf = &pipe->bufs[head & mask];
 529                         buf->page = page;
 530                         buf->ops = &anon_pipe_buf_ops;
 531                         buf->offset = 0;
 532                         buf->len = 0;
 533                         if (is_packetized(filp))
 534                                 buf->flags = PIPE_BUF_FLAG_PACKET;
 535                         else
 536                                 buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
 537                         pipe->tmp_page = NULL;
 538
 539                         copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
 540                         if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
 541                                 if (!ret)
 542                                         ret = -EFAULT;
 543                                 break;
 544                         }
 545                         ret += copied;
 546                         buf->offset = 0;
 547                         buf->len = copied;
 548
 549                         if (!iov_iter_count(from))
 550                                 break;
 551                 }
 552
 553                 if (!pipe_full(head, pipe->tail, pipe->max_usage))
 554                         continue;
 555
 556                 /* Wait for buffer space to become available. */
 557                 if (filp->f_flags & O_NONBLOCK) {
 558                         if (!ret)
 559                                 ret = -EAGAIN;
 560                         break;
 561                 }
 562                 if (signal_pending(current)) {
 563                         if (!ret)
 564                                 ret = -ERESTARTSYS;
 565                         break;
 566                 }
 567
 568                 /*
 569                  * We're going to release the pipe lock and wait for more
 570                  * space. We wake up any readers if necessary, and then
 571                  * after waiting we need to re-check whether the pipe
 572                  * become empty while we dropped the lock.
 573                  */
 574                 __pipe_unlock(pipe);
 575                 if (was_empty) {
 576                         wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
 577                         kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 578                 }
 579                 wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
 580                 __pipe_lock(pipe);
 581                 was_empty = pipe_empty(pipe->head, pipe->tail);
 582                 wake_next_writer = true;
 583         }
 584 out:
 585         if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
 586                 wake_next_writer = false;
 587         __pipe_unlock(pipe);
 588
 589         /*
 590          * If we do do a wakeup event, we do a 'sync' wakeup, because we
 591          * want the reader to start processing things asap, rather than
 592          * leave the data pending.
 593          *
 594          * This is particularly important for small writes, because of
 595          * how (for example) the GNU make jobserver uses small writes to
 596          * wake up pending jobs
 597          */
 598         if (was_empty) {
 599                 wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
 600                 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 601         }
 602         if (wake_next_writer)
 603                 wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
 604         if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
 605                 int err = file_update_time(filp);
 606                 if (err)
 607                         ret = err;
 608                 sb_end_write(file_inode(filp)->i_sb);
 609         }
 610         return ret;
 611 }
 612
 613 static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 614 {
 615         struct pipe_inode_info *pipe = filp->private_data;
 616         int count, head, tail, mask;
 617
 618         switch (cmd) {
 619         case FIONREAD:
 620                 __pipe_lock(pipe);
 621                 count = 0;
 622                 head = pipe->head;
 623                 tail = pipe->tail;
 624                 mask = pipe->ring_size - 1;
 625
 626                 while (tail != head) {
 627                         count += pipe->bufs[tail & mask].len;
 628                         tail++;
 629                 }
 630                 __pipe_unlock(pipe);
 631
 632                 return put_user(count, (int __user *)arg);
 633
 634 #ifdef CONFIG_WATCH_QUEUE
 635         case IOC_WATCH_QUEUE_SET_SIZE: {
 636                 int ret;
 637                 __pipe_lock(pipe);
 638                 ret = watch_queue_set_size(pipe, arg);
 639                 __pipe_unlock(pipe);
 640                 return ret;
 641         }
 642
 643         case IOC_WATCH_QUEUE_SET_FILTER:
 644                 return watch_queue_set_filter(
 645                         pipe, (struct watch_notification_filter __user *)arg);
 646 #endif
 647
 648         default:
 649                 return -ENOIOCTLCMD;
 650         }
 651 }
 652
 653 /* No kernel lock held - fine */
 654 static __poll_t
 655 pipe_poll(struct file *filp, poll_table *wait)
 656 {
 657         __poll_t mask;
 658         struct pipe_inode_info *pipe = filp->private_data;
 659         unsigned int head, tail;
 660
 661         /*
 662          * Reading pipe state only -- no need for acquiring the semaphore.
 663          *
 664          * But because this is racy, the code has to add the
 665          * entry to the poll table _first_ ..
 666          */
 667         if (filp->f_mode & FMODE_READ)
 668                 poll_wait(filp, &pipe->rd_wait, wait);
 669         if (filp->f_mode & FMODE_WRITE)
 670                 poll_wait(filp, &pipe->wr_wait, wait);
 671
 672         /*
 673          * .. and only then can you do the racy tests. That way,
 674          * if something changes and you got it wrong, the poll
 675          * table entry will wake you up and fix it.
 676          */
 677         head = READ_ONCE(pipe->head);
 678         tail = READ_ONCE(pipe->tail);
 679
 680         mask = 0;
 681         if (filp->f_mode & FMODE_READ) {
 682                 if (!pipe_empty(head, tail))
 683                         mask |= EPOLLIN | EPOLLRDNORM;
 684                 if (!pipe->writers && filp->f_version != pipe->w_counter)
 685                         mask |= EPOLLHUP;
 686         }
 687
 688         if (filp->f_mode & FMODE_WRITE) {
 689                 if (!pipe_full(head, tail, pipe->max_usage))
 690                         mask |= EPOLLOUT | EPOLLWRNORM;
 691                 /*
 692                  * Most Unices do not set EPOLLERR for FIFOs but on Linux they
 693                  * behave exactly like pipes for poll().
 694                  */
 695                 if (!pipe->readers)
 696                         mask |= EPOLLERR;
 697         }
 698
 699         return mask;
 700 }
 701
 702 static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
 703 {
 704         int kill = 0;
 705
 706         spin_lock(&inode->i_lock);
 707         if (!--pipe->files) {
 708                 inode->i_pipe = NULL;
 709                 kill = 1;
 710         }
 711         spin_unlock(&inode->i_lock);
 712
 713         if (kill)
 714                 free_pipe_info(pipe);
 715 }
 716
 717 static int
 718 pipe_release(struct inode *inode, struct file *file)
 719 {
 720         struct pipe_inode_info *pipe = file->private_data;
 721
 722         __pipe_lock(pipe);
 723         if (file->f_mode & FMODE_READ)
 724                 pipe->readers--;
 725         if (file->f_mode & FMODE_WRITE)
 726                 pipe->writers--;
 727
 728         /* Was that the last reader or writer, but not the other side? */
 729         if (!pipe->readers != !pipe->writers) {
 730                 wake_up_interruptible_all(&pipe->rd_wait);
 731                 wake_up_interruptible_all(&pipe->wr_wait);
 732                 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 733                 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 734         }
 735         __pipe_unlock(pipe);
 736
 737         put_pipe_info(inode, pipe);
 738         return 0;
 739 }
 740
 741 static int
 742 pipe_fasync(int fd, struct file *filp, int on)
 743 {
 744         struct pipe_inode_info *pipe = filp->private_data;
 745         int retval = 0;
 746
 747         __pipe_lock(pipe);
 748         if (filp->f_mode & FMODE_READ)
 749                 retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
 750         if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
 751                 retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
 752                 if (retval < 0 && (filp->f_mode & FMODE_READ))
 753                         /* this can happen only if on == T */
 754                         fasync_helper(-1, filp, 0, &pipe->fasync_readers);
 755         }
 756         __pipe_unlock(pipe);
 757         return retval;
 758 }
 759
 760 unsigned long account_pipe_buffers(struct user_struct *user,
 761                                    unsigned long old, unsigned long new)
 762 {
 763         return atomic_long_add_return(new - old, &user->pipe_bufs);
 764 }
 765
 766 bool too_many_pipe_buffers_soft(unsigned long user_bufs)
 767 {
 768         unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);
 769
 770         return soft_limit && user_bufs > soft_limit;
 771 }
 772
 773 bool too_many_pipe_buffers_hard(unsigned long user_bufs)
 774 {
 775         unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);
 776
 777         return hard_limit && user_bufs > hard_limit;
 778 }
 779
 780 bool pipe_is_unprivileged_user(void)
 781 {
 782         return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
 783 }
 784
 785 struct pipe_inode_info *alloc_pipe_info(void)
 786 {
 787         struct pipe_inode_info *pipe;
 788         unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
 789         struct user_struct *user = get_current_user();
 790         unsigned long user_bufs;
 791         unsigned int max_size = READ_ONCE(pipe_max_size);
 792
 793         pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
 794         if (pipe == NULL)
 795                 goto out_free_uid;
 796
 797         if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
 798                 pipe_bufs = max_size >> PAGE_SHIFT;
 799
 800         user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
 801
 802         if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
 803                 user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
 804                 pipe_bufs = 1;
 805         }
 806
 807         if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
 808                 goto out_revert_acct;
 809
 810         pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
 811                              GFP_KERNEL_ACCOUNT);
 812
 813         if (pipe->bufs) {
 814                 init_waitqueue_head(&pipe->rd_wait);
 815                 init_waitqueue_head(&pipe->wr_wait);
 816                 pipe->r_counter = pipe->w_counter = 1;
 817                 pipe->max_usage = pipe_bufs;
 818                 pipe->ring_size = pipe_bufs;
 819                 pipe->nr_accounted = pipe_bufs;
 820                 pipe->user = user;
 821                 mutex_init(&pipe->mutex);
 822                 return pipe;
 823         }
 824
 825 out_revert_acct:
 826         (void) account_pipe_buffers(user, pipe_bufs, 0);
 827         kfree(pipe);
 828 out_free_uid:
 829         free_uid(user);
 830         return NULL;
 831 }
 832
 833 void free_pipe_info(struct pipe_inode_info *pipe)
 834 {
 835         int i;
 836
 837 #ifdef CONFIG_WATCH_QUEUE
 838         if (pipe->watch_queue) {
 839                 watch_queue_clear(pipe->watch_queue);
 840                 put_watch_queue(pipe->watch_queue);
 841         }
 842 #endif
 843
 844         (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
 845         free_uid(pipe->user);
 846         for (i = 0; i < pipe->ring_size; i++) {
 847                 struct pipe_buffer *buf = pipe->bufs + i;
 848                 if (buf->ops)
 849                         pipe_buf_release(pipe, buf);
 850         }
 851         if (pipe->tmp_page)
 852                 __free_page(pipe->tmp_page);
 853         kfree(pipe->bufs);
 854         kfree(pipe);
 855 }
 856
/*
 * Mount of the kernel-internal pipefs instance.  Assigned once from
 * kern_mount() in init_pipe_fs() and used when allocating pipe inodes and
 * files.
 */
static struct vfsmount *pipe_mnt __read_mostly;
 858
 859 /*
 860  * pipefs_dname() is called from d_path().
 861  */
 862 static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
 863 {
 864         return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
 865                                 d_inode(dentry)->i_ino);
 866 }
 867
/*
 * Dentry operations for pipefs (installed in pipefs_init_fs_context()); only
 * the dynamic name generation is provided.
 */
static const struct dentry_operations pipefs_dentry_operations = {
        .d_dname        = pipefs_dname,
};
 871
 872 static struct inode * get_pipe_inode(void)
 873 {
 874         struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
 875         struct pipe_inode_info *pipe;
 876
 877         if (!inode)
 878                 goto fail_inode;
 879
 880         inode->i_ino = get_next_ino();
 881
 882         pipe = alloc_pipe_info();
 883         if (!pipe)
 884                 goto fail_iput;
 885
 886         inode->i_pipe = pipe;
 887         pipe->files = 2;
 888         pipe->readers = pipe->writers = 1;
 889         inode->i_fop = &pipefifo_fops;
 890
 891         /*
 892          * Mark the inode dirty from the very beginning,
 893          * that way it will never be moved to the dirty
 894          * list because "mark_inode_dirty()" will think
 895          * that it already _is_ on the dirty list.
 896          */
 897         inode->i_state = I_DIRTY;
 898         inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
 899         inode->i_uid = current_fsuid();
 900         inode->i_gid = current_fsgid();
 901         inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
 902
 903         return inode;
 904
 905 fail_iput:
 906         iput(inode);
 907
 908 fail_inode:
 909         return NULL;
 910 }
 911
 912 int create_pipe_files(struct file **res, int flags)
 913 {
 914         struct inode *inode = get_pipe_inode();
 915         struct file *f;
 916
 917         if (!inode)
 918                 return -ENFILE;
 919
 920         if (flags & O_NOTIFICATION_PIPE) {
 921 #ifdef CONFIG_WATCH_QUEUE
 922                 if (watch_queue_init(inode->i_pipe) < 0) {
 923                         iput(inode);
 924                         return -ENOMEM;
 925                 }
 926 #else
 927                 return -ENOPKG;
 928 #endif
 929         }
 930
 931         f = alloc_file_pseudo(inode, pipe_mnt, "",
 932                                 O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
 933                                 &pipefifo_fops);
 934         if (IS_ERR(f)) {
 935                 free_pipe_info(inode->i_pipe);
 936                 iput(inode);
 937                 return PTR_ERR(f);
 938         }
 939
 940         f->private_data = inode->i_pipe;
 941
 942         res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
 943                                   &pipefifo_fops);
 944         if (IS_ERR(res[0])) {
 945                 put_pipe_info(inode, inode->i_pipe);
 946                 fput(f);
 947                 return PTR_ERR(res[0]);
 948         }
 949         res[0]->private_data = inode->i_pipe;
 950         res[1] = f;
 951         stream_open(inode, res[0]);
 952         stream_open(inode, res[1]);
 953         return 0;
 954 }
 955
 956 static int __do_pipe_flags(int *fd, struct file **files, int flags)
 957 {
 958         int error;
 959         int fdw, fdr;
 960
 961         if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
 962                 return -EINVAL;
 963
 964         error = create_pipe_files(files, flags);
 965         if (error)
 966                 return error;
 967
 968         error = get_unused_fd_flags(flags);
 969         if (error < 0)
 970                 goto err_read_pipe;
 971         fdr = error;
 972
 973         error = get_unused_fd_flags(flags);
 974         if (error < 0)
 975                 goto err_fdr;
 976         fdw = error;
 977
 978         audit_fd_pair(fdr, fdw);
 979         fd[0] = fdr;
 980         fd[1] = fdw;
 981         return 0;
 982
 983  err_fdr:
 984         put_unused_fd(fdr);
 985  err_read_pipe:
 986         fput(files[0]);
 987         fput(files[1]);
 988         return error;
 989 }
 990
 991 int do_pipe_flags(int *fd, int flags)
 992 {
 993         struct file *files[2];
 994         int error = __do_pipe_flags(fd, files, flags);
 995         if (!error) {
 996                 fd_install(fd[0], files[0]);
 997                 fd_install(fd[1], files[1]);
 998         }
 999         return error;
1000 }
1001
1002 /*
1003  * sys_pipe() is the normal C calling standard for creating
1004  * a pipe. It's not the way Unix traditionally does this, though.
1005  */
1006 static int do_pipe2(int __user *fildes, int flags)
1007 {
1008         struct file *files[2];
1009         int fd[2];
1010         int error;
1011
1012         error = __do_pipe_flags(fd, files, flags);
1013         if (!error) {
1014                 if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
1015                         fput(files[0]);
1016                         fput(files[1]);
1017                         put_unused_fd(fd[0]);
1018                         put_unused_fd(fd[1]);
1019                         error = -EFAULT;
1020                 } else {
1021                         fd_install(fd[0], files[0]);
1022                         fd_install(fd[1], files[1]);
1023                 }
1024         }
1025         return error;
1026 }
1027
/*
 * pipe2() system call: create a pipe with the caller-supplied @flags and
 * store the two descriptors in the user array @fildes.
 */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
        return do_pipe2(fildes, flags);
}
1032
/* pipe() system call: same as pipe2() with no flags set. */
SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
        return do_pipe2(fildes, 0);
}
1037
1038 static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
1039 {
1040         int cur = *cnt;
1041
1042         while (cur == *cnt) {
1043                 pipe_wait(pipe);
1044                 if (signal_pending(current))
1045                         break;
1046         }
1047         return cur == *cnt ? -ERESTARTSYS : 0;
1048 }
1049
/*
 * Wake every interruptible sleeper on both the read and the write wait queue
 * of @pipe.  fifo_open() calls this when the first reader or writer shows up.
 */
static void wake_up_partner(struct pipe_inode_info *pipe)
{
        wake_up_interruptible_all(&pipe->rd_wait);
        wake_up_interruptible_all(&pipe->wr_wait);
}
1055
/*
 * ->open() for pipes and FIFOs.
 *
 * Attaches @filp to the pipe_inode_info of @inode, allocating one if the
 * inode does not have it yet, and updates the reader/writer bookkeeping
 * according to the access mode in filp->f_mode.  For inodes that are not on
 * pipefs (i.e. FIFOs) a read-only or write-only open waits for a partner on
 * the other side, subject to the O_NONBLOCK rules spelled out in the cases
 * below.
 *
 * Returns 0 on success, -ENOMEM if no pipe could be allocated, -ENXIO for a
 * non-blocking write-only open of a FIFO that has no readers, -ERESTARTSYS
 * if waiting for the partner ended without the partner counter changing, or
 * -EINVAL if neither FMODE_READ nor FMODE_WRITE is set.
 */
static int fifo_open(struct inode *inode, struct file *filp)
{
        struct pipe_inode_info *pipe;
        bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
        int ret;

        filp->f_version = 0;

        /*
         * Take a ->files count on the inode's pipe.  If there is none yet,
         * allocate one with i_lock dropped and re-check under the lock
         * before installing it.
         */
        spin_lock(&inode->i_lock);
        if (inode->i_pipe) {
                pipe = inode->i_pipe;
                pipe->files++;
                spin_unlock(&inode->i_lock);
        } else {
                spin_unlock(&inode->i_lock);
                pipe = alloc_pipe_info();
                if (!pipe)
                        return -ENOMEM;
                pipe->files = 1;
                spin_lock(&inode->i_lock);
                if (unlikely(inode->i_pipe)) {
                        /* Somebody else installed a pipe meanwhile: use theirs. */
                        inode->i_pipe->files++;
                        spin_unlock(&inode->i_lock);
                        free_pipe_info(pipe);
                        pipe = inode->i_pipe;
                } else {
                        inode->i_pipe = pipe;
                        spin_unlock(&inode->i_lock);
                }
        }
        filp->private_data = pipe;
        /* OK, we have a pipe and it's pinned down */

        __pipe_lock(pipe);

        /* We can only do regular read/write on fifos */
        stream_open(inode, filp);

        switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
        case FMODE_READ:
        /*
         *  O_RDONLY
         *  POSIX.1 says that O_NONBLOCK means return with the FIFO
         *  opened, even when there is no process writing the FIFO.
         */
                pipe->r_counter++;
                /* First reader: wake anybody waiting for one. */
                if (pipe->readers++ == 0)
                        wake_up_partner(pipe);

                if (!is_pipe && !pipe->writers) {
                        if ((filp->f_flags & O_NONBLOCK)) {
                                /* suppress EPOLLHUP until we have
                                 * seen a writer */
                                filp->f_version = pipe->w_counter;
                        } else {
                                if (wait_for_partner(pipe, &pipe->w_counter))
                                        goto err_rd;
                        }
                }
                break;

        case FMODE_WRITE:
        /*
         *  O_WRONLY
         *  POSIX.1 says that O_NONBLOCK means return -1 with
         *  errno=ENXIO when there is no process reading the FIFO.
         */
                ret = -ENXIO;
                if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
                        goto err;

                pipe->w_counter++;
                /* First writer: wake anybody waiting for one. */
                if (!pipe->writers++)
                        wake_up_partner(pipe);

                if (!is_pipe && !pipe->readers) {
                        if (wait_for_partner(pipe, &pipe->r_counter))
                                goto err_wr;
                }
                break;

        case FMODE_READ | FMODE_WRITE:
        /*
         *  O_RDWR
         *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
         *  This implementation will NEVER block on a O_RDWR open, since
         *  the process can at least talk to itself.
         */

                pipe->readers++;
                pipe->writers++;
                pipe->r_counter++;
                pipe->w_counter++;
                if (pipe->readers == 1 || pipe->writers == 1)
                        wake_up_partner(pipe);
                break;

        default:
                ret = -EINVAL;
                goto err;
        }

        /* Ok! */
        __pipe_unlock(pipe);
        return 0;

        /* Undo the reader count taken above; wake writers if it hit zero. */
err_rd:
        if (!--pipe->readers)
                wake_up_interruptible(&pipe->wr_wait);
        ret = -ERESTARTSYS;
        goto err;

        /* Undo the writer count taken above; wake readers if it hit zero. */
err_wr:
        if (!--pipe->writers)
                wake_up_interruptible_all(&pipe->rd_wait);
        ret = -ERESTARTSYS;
        goto err;

err:
        __pipe_unlock(pipe);

        /*
         * NOTE(review): put_pipe_info() is defined outside this view; it is
         * presumably what gives back the ->files count taken at the top.
         */
        put_pipe_info(inode, pipe);
        return ret;
}
1180
/*
 * File operations used for pipe files (see get_pipe_inode() and
 * create_pipe_files()); get_pipe_info() compares a file's f_op against this
 * table to recognise pipes.  Seeking is not supported (no_llseek).
 */
const struct file_operations pipefifo_fops = {
        .open           = fifo_open,
        .llseek         = no_llseek,
        .read_iter      = pipe_read,
        .write_iter     = pipe_write,
        .poll           = pipe_poll,
        .unlocked_ioctl = pipe_ioctl,
        .release        = pipe_release,
        .fasync         = pipe_fasync,
};
1191
1192 /*
1193  * Currently we rely on the pipe array holding a power-of-2 number
1194  * of pages. Returns 0 on error.
1195  */
1196 unsigned int round_pipe_size(unsigned long size)
1197 {
1198         if (size > (1U << 31))
1199                 return 0;
1200
1201         /* Minimum pipe size, as required by POSIX */
1202         if (size < PAGE_SIZE)
1203                 return PAGE_SIZE;
1204
1205         return roundup_pow_of_two(size);
1206 }
1207
1208 /*
1209  * Resize the pipe ring to a number of slots.
1210  */
1211 int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
1212 {
1213         struct pipe_buffer *bufs;
1214         unsigned int head, tail, mask, n;
1215
1216         /*
1217          * We can shrink the pipe, if arg is greater than the ring occupancy.
1218          * Since we don't expect a lot of shrink+grow operations, just free and
1219          * allocate again like we would do for growing.  If the pipe currently
1220          * contains more buffers than arg, then return busy.
1221          */
1222         mask = pipe->ring_size - 1;
1223         head = pipe->head;
1224         tail = pipe->tail;
1225         n = pipe_occupancy(pipe->head, pipe->tail);
1226         if (nr_slots < n)
1227                 return -EBUSY;
1228
1229         bufs = kcalloc(nr_slots, sizeof(*bufs),
1230                        GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
1231         if (unlikely(!bufs))
1232                 return -ENOMEM;
1233
1234         /*
1235          * The pipe array wraps around, so just start the new one at zero
1236          * and adjust the indices.
1237          */
1238         if (n > 0) {
1239                 unsigned int h = head & mask;
1240                 unsigned int t = tail & mask;
1241                 if (h > t) {
1242                         memcpy(bufs, pipe->bufs + t,
1243                                n * sizeof(struct pipe_buffer));
1244                 } else {
1245                         unsigned int tsize = pipe->ring_size - t;
1246                         if (h > 0)
1247                                 memcpy(bufs + tsize, pipe->bufs,
1248                                        h * sizeof(struct pipe_buffer));
1249                         memcpy(bufs, pipe->bufs + t,
1250                                tsize * sizeof(struct pipe_buffer));
1251                 }
1252         }
1253
1254         head = n;
1255         tail = 0;
1256
1257         kfree(pipe->bufs);
1258         pipe->bufs = bufs;
1259         pipe->ring_size = nr_slots;
1260         if (pipe->max_usage > nr_slots)
1261                 pipe->max_usage = nr_slots;
1262         pipe->tail = tail;
1263         pipe->head = head;
1264
1265         /* This might have made more room for writers */
1266         wake_up_interruptible(&pipe->wr_wait);
1267         return 0;
1268 }
1269
1270 /*
1271  * Allocate a new array of pipe buffers and copy the info over. Returns the
1272  * pipe size if successful, or return -ERROR on error.
1273  */
1274 static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
1275 {
1276         unsigned long user_bufs;
1277         unsigned int nr_slots, size;
1278         long ret = 0;
1279
1280 #ifdef CONFIG_WATCH_QUEUE
1281         if (pipe->watch_queue)
1282                 return -EBUSY;
1283 #endif
1284
1285         size = round_pipe_size(arg);
1286         nr_slots = size >> PAGE_SHIFT;
1287
1288         if (!nr_slots)
1289                 return -EINVAL;
1290
1291         /*
1292          * If trying to increase the pipe capacity, check that an
1293          * unprivileged user is not trying to exceed various limits
1294          * (soft limit check here, hard limit check just below).
1295          * Decreasing the pipe capacity is always permitted, even
1296          * if the user is currently over a limit.
1297          */
1298         if (nr_slots > pipe->max_usage &&
1299                         size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
1300                 return -EPERM;
1301
1302         user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots);
1303
1304         if (nr_slots > pipe->max_usage &&
1305                         (too_many_pipe_buffers_hard(user_bufs) ||
1306                          too_many_pipe_buffers_soft(user_bufs)) &&
1307                         pipe_is_unprivileged_user()) {
1308                 ret = -EPERM;
1309                 goto out_revert_acct;
1310         }
1311
1312         ret = pipe_resize_ring(pipe, nr_slots);
1313         if (ret < 0)
1314                 goto out_revert_acct;
1315
1316         pipe->max_usage = nr_slots;
1317         pipe->nr_accounted = nr_slots;
1318         return pipe->max_usage * PAGE_SIZE;
1319
1320 out_revert_acct:
1321         (void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted);
1322         return ret;
1323 }
1324
1325 /*
1326  * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1327  * location, so checking ->i_pipe is not enough to verify that this is a
1328  * pipe.
1329  */
1330 struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
1331 {
1332         struct pipe_inode_info *pipe = file->private_data;
1333
1334         if (file->f_op != &pipefifo_fops || !pipe)
1335                 return NULL;
1336 #ifdef CONFIG_WATCH_QUEUE
1337         if (for_splice && pipe->watch_queue)
1338                 return NULL;
1339 #endif
1340         return pipe;
1341 }
1342
1343 long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
1344 {
1345         struct pipe_inode_info *pipe;
1346         long ret;
1347
1348         pipe = get_pipe_info(file, false);
1349         if (!pipe)
1350                 return -EBADF;
1351
1352         __pipe_lock(pipe);
1353
1354         switch (cmd) {
1355         case F_SETPIPE_SZ:
1356                 ret = pipe_set_size(pipe, arg);
1357                 break;
1358         case F_GETPIPE_SZ:
1359                 ret = pipe->max_usage * PAGE_SIZE;
1360                 break;
1361         default:
1362                 ret = -EINVAL;
1363                 break;
1364         }
1365
1366         __pipe_unlock(pipe);
1367         return ret;
1368 }
1369
/* Superblock operations for pipefs, installed in pipefs_init_fs_context(). */
static const struct super_operations pipefs_ops = {
        .destroy_inode = free_inode_nonrcu,
        .statfs = simple_statfs,
};
1374
1375 /*
1376  * pipefs should _never_ be mounted by userland - too much of security hassle,
1377  * no real gain from having the whole whorehouse mounted. So we don't need
1378  * any operations on the root directory. However, we need a non-trivial
1379  * d_name - pipe: will go nicely and kill the special-casing in procfs.
1380  */
1381
1382 static int pipefs_init_fs_context(struct fs_context *fc)
1383 {
1384         struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC);
1385         if (!ctx)
1386                 return -ENOMEM;
1387         ctx->ops = &pipefs_ops;
1388         ctx->dops = &pipefs_dentry_operations;
1389         return 0;
1390 }
1391
/*
 * Filesystem type for pipefs; registered and mounted internally by
 * init_pipe_fs().
 */
static struct file_system_type pipe_fs_type = {
        .name           = "pipefs",
        .init_fs_context = pipefs_init_fs_context,
        .kill_sb        = kill_anon_super,
};
1397
1398 static int __init init_pipe_fs(void)
1399 {
1400         int err = register_filesystem(&pipe_fs_type);
1401
1402         if (!err) {
1403                 pipe_mnt = kern_mount(&pipe_fs_type);
1404                 if (IS_ERR(pipe_mnt)) {
1405                         err = PTR_ERR(pipe_mnt);
1406                         unregister_filesystem(&pipe_fs_type);
1407                 }
1408         }
1409         return err;
1410 }
1411
1412 fs_initcall(init_pipe_fs);