2 * Copyright (C) 2005-2017 Junjiro R. Okajima
4 * This program, aufs is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 * file and vm operations
22 #include <linux/aio.h>
23 #include <linux/fs_stack.h>
24 #include <linux/mman.h>
25 #include <linux/security.h>
28 int au_do_open_nondir(struct file
*file
, int flags
, struct file
*h_file
)
32 struct dentry
*dentry
, *h_dentry
;
33 struct au_finfo
*finfo
;
34 struct inode
*h_inode
;
36 FiMustWriteLock(file
);
39 dentry
= file
->f_path
.dentry
;
40 AuDebugOn(IS_ERR_OR_NULL(dentry
));
42 memset(&finfo
->fi_htop
, 0, sizeof(finfo
->fi_htop
));
43 atomic_set(&finfo
->fi_mmapped
, 0);
44 bindex
= au_dbtop(dentry
);
46 h_dentry
= au_h_dptr(dentry
, bindex
);
47 err
= vfsub_test_mntns(file
->f_path
.mnt
, h_dentry
->d_sb
);
50 h_file
= au_h_open(dentry
, bindex
, flags
, file
, /*force_wr*/0);
52 h_dentry
= h_file
->f_path
.dentry
;
53 err
= vfsub_test_mntns(file
->f_path
.mnt
, h_dentry
->d_sb
);
59 err
= PTR_ERR(h_file
);
61 if ((flags
& __O_TMPFILE
)
62 && !(flags
& O_EXCL
)) {
63 h_inode
= file_inode(h_file
);
64 spin_lock(&h_inode
->i_lock
);
65 h_inode
->i_state
|= I_LINKABLE
;
66 spin_unlock(&h_inode
->i_lock
);
68 au_set_fbtop(file
, bindex
);
69 au_set_h_fptr(file
, bindex
, h_file
);
70 au_update_figen(file
);
71 /* todo: necessary? */
72 /* file->f_ra = h_file->f_ra; */
79 static int aufs_open_nondir(struct inode
*inode __maybe_unused
,
83 struct super_block
*sb
;
84 struct au_do_open_args args
= {
85 .open
= au_do_open_nondir
88 AuDbg("%pD, f_flags 0x%x, f_mode 0x%x\n",
89 file
, vfsub_file_flags(file
), file
->f_mode
);
91 sb
= file
->f_path
.dentry
->d_sb
;
92 si_read_lock(sb
, AuLock_FLUSH
);
93 err
= au_do_open(file
, &args
);
98 int aufs_release_nondir(struct inode
*inode __maybe_unused
, struct file
*file
)
100 struct au_finfo
*finfo
;
101 aufs_bindex_t bindex
;
104 au_hbl_del(&finfo
->fi_hlist
,
105 &au_sbi(file
->f_path
.dentry
->d_sb
)->si_files
);
106 bindex
= finfo
->fi_btop
;
108 au_set_h_fptr(file
, bindex
, NULL
);
114 /* ---------------------------------------------------------------------- */
116 static int au_do_flush_nondir(struct file
*file
, fl_owner_t id
)
122 h_file
= au_hf_top(file
);
124 err
= vfsub_flush(h_file
, id
);
128 static int aufs_flush_nondir(struct file
*file
, fl_owner_t id
)
130 return au_do_flush(file
, id
, au_do_flush_nondir
);
133 /* ---------------------------------------------------------------------- */
135 * read and write functions acquire [fdi]_rwsem once, but release before
136 * mmap_sem. This is to prevent a race condition with mmap(2).
137 * Releasing these aufs-rwsem should be safe, no branch-management (by keeping
138 * si_rwsem), no harmful copy-up should happen. Actually copy-up may happen in
139 * read functions after [fdi]_rwsem are released, but it should be harmless.
142 /* Callers should call au_read_post() or fput() in the end */
143 struct file
*au_read_pre(struct file
*file
, int keep_fi
, unsigned int lsc
)
148 err
= au_reval_and_lock_fdi(file
, au_reopen_nondir
, /*wlock*/0, lsc
);
150 di_read_unlock(file
->f_path
.dentry
, AuLock_IR
);
151 h_file
= au_hf_top(file
);
154 fi_read_unlock(file
);
156 h_file
= ERR_PTR(err
);
161 static void au_read_post(struct inode
*inode
, struct file
*h_file
)
163 /* update without lock, I don't think it is a problem */
164 fsstack_copy_attr_atime(inode
, file_inode(h_file
));
168 struct au_write_pre
{
178 * return with iinfo is write-locked
179 * callers should call au_write_post() or iinfo_write_unlock() + fput() in the
182 static struct file
*au_write_pre(struct file
*file
, int do_ready
,
183 struct au_write_pre
*wpre
)
186 struct dentry
*dentry
;
194 err
= au_reval_and_lock_fdi(file
, au_reopen_nondir
, /*wlock*/1, lsc
);
195 h_file
= ERR_PTR(err
);
199 dentry
= file
->f_path
.dentry
;
201 err
= au_ready_to_write(file
, -1, &pin
);
203 h_file
= ERR_PTR(err
);
204 di_write_unlock(dentry
);
209 di_downgrade_lock(dentry
, /*flags*/0);
211 wpre
->btop
= au_fbtop(file
);
212 h_file
= au_hf_top(file
);
215 wpre
->blks
= file_inode(h_file
)->i_blocks
;
218 di_read_unlock(dentry
, /*flags*/0);
221 fi_write_unlock(file
);
226 static void au_write_post(struct inode
*inode
, struct file
*h_file
,
227 struct au_write_pre
*wpre
, ssize_t written
)
229 struct inode
*h_inode
;
231 au_cpup_attr_timesizes(inode
);
232 AuDebugOn(au_ibtop(inode
) != wpre
->btop
);
233 h_inode
= file_inode(h_file
);
234 inode
->i_mode
= h_inode
->i_mode
;
235 ii_write_unlock(inode
);
236 /* AuDbg("blks %llu, %llu\n", (u64)blks, (u64)h_inode->i_blocks); */
238 au_fhsm_wrote(inode
->i_sb
, wpre
->btop
,
239 /*force*/h_inode
->i_blocks
> wpre
->blks
);
243 static ssize_t
aufs_read(struct file
*file
, char __user
*buf
, size_t count
,
249 struct super_block
*sb
;
251 inode
= file_inode(file
);
253 si_read_lock(sb
, AuLock_FLUSH
| AuLock_NOPLMW
);
255 h_file
= au_read_pre(file
, /*keep_fi*/0, /*lsc*/0);
256 err
= PTR_ERR(h_file
);
260 /* filedata may be obsoleted by concurrent copyup, but no problem */
261 err
= vfsub_read_u(h_file
, buf
, count
, ppos
);
262 /* todo: necessary? */
263 /* file->f_ra = h_file->f_ra; */
264 au_read_post(inode
, h_file
);
273 * it safely locks both i_mutex and si_rwsem for read.
274 * if the plink maintenance mode continues forever (that is the problem),
277 static void au_mtx_and_read_lock(struct inode
*inode
)
280 struct super_block
*sb
= inode
->i_sb
;
284 err
= si_read_lock(sb
, AuLock_FLUSH
| AuLock_NOPLM
);
288 si_read_lock(sb
, AuLock_NOPLMW
);
293 static ssize_t
aufs_write(struct file
*file
, const char __user
*ubuf
,
294 size_t count
, loff_t
*ppos
)
297 struct au_write_pre wpre
;
300 char __user
*buf
= (char __user
*)ubuf
;
302 inode
= file_inode(file
);
303 au_mtx_and_read_lock(inode
);
306 h_file
= au_write_pre(file
, /*do_ready*/1, &wpre
);
307 err
= PTR_ERR(h_file
);
311 err
= vfsub_write_u(h_file
, buf
, count
, ppos
);
312 au_write_post(inode
, h_file
, &wpre
, err
);
315 si_read_unlock(inode
->i_sb
);
320 static ssize_t
au_do_iter(struct file
*h_file
, int rw
, struct kiocb
*kio
,
321 struct iov_iter
*iov_iter
)
325 ssize_t (*iter
)(struct kiocb
*, struct iov_iter
*);
327 err
= security_file_permission(h_file
, rw
);
334 iter
= h_file
->f_op
->read_iter
;
335 else if (rw
== MAY_WRITE
)
336 iter
= h_file
->f_op
->write_iter
;
339 kio
->ki_filp
= h_file
;
342 err
= iter(kio
, iov_iter
);
345 /* currently there is no such fs */
353 static ssize_t
aufs_read_iter(struct kiocb
*kio
, struct iov_iter
*iov_iter
)
356 struct file
*file
, *h_file
;
358 struct super_block
*sb
;
361 inode
= file_inode(file
);
363 si_read_lock(sb
, AuLock_FLUSH
| AuLock_NOPLMW
);
365 h_file
= au_read_pre(file
, /*keep_fi*/1, /*lsc*/0);
366 err
= PTR_ERR(h_file
);
370 if (0 && au_test_loopback_kthread()) {
371 au_warn_loopback(h_file
->f_path
.dentry
->d_sb
);
372 if (file
->f_mapping
!= h_file
->f_mapping
) {
373 file
->f_mapping
= h_file
->f_mapping
;
374 smp_mb(); /* unnecessary? */
377 fi_read_unlock(file
);
379 err
= au_do_iter(h_file
, MAY_READ
, kio
, iov_iter
);
380 /* todo: necessary? */
381 /* file->f_ra = h_file->f_ra; */
382 au_read_post(inode
, h_file
);
389 static ssize_t
aufs_write_iter(struct kiocb
*kio
, struct iov_iter
*iov_iter
)
392 struct au_write_pre wpre
;
394 struct file
*file
, *h_file
;
397 inode
= file_inode(file
);
398 au_mtx_and_read_lock(inode
);
401 h_file
= au_write_pre(file
, /*do_ready*/1, &wpre
);
402 err
= PTR_ERR(h_file
);
406 err
= au_do_iter(h_file
, MAY_WRITE
, kio
, iov_iter
);
407 au_write_post(inode
, h_file
, &wpre
, err
);
410 si_read_unlock(inode
->i_sb
);
415 static ssize_t
aufs_splice_read(struct file
*file
, loff_t
*ppos
,
416 struct pipe_inode_info
*pipe
, size_t len
,
422 struct super_block
*sb
;
424 inode
= file_inode(file
);
426 si_read_lock(sb
, AuLock_FLUSH
| AuLock_NOPLMW
);
428 h_file
= au_read_pre(file
, /*keep_fi*/0, /*lsc*/0);
429 err
= PTR_ERR(h_file
);
433 err
= vfsub_splice_to(h_file
, ppos
, pipe
, len
, flags
);
434 /* todo: necessary? */
435 /* file->f_ra = h_file->f_ra; */
436 au_read_post(inode
, h_file
);
444 aufs_splice_write(struct pipe_inode_info
*pipe
, struct file
*file
, loff_t
*ppos
,
445 size_t len
, unsigned int flags
)
448 struct au_write_pre wpre
;
452 inode
= file_inode(file
);
453 au_mtx_and_read_lock(inode
);
456 h_file
= au_write_pre(file
, /*do_ready*/1, &wpre
);
457 err
= PTR_ERR(h_file
);
461 err
= vfsub_splice_from(pipe
, h_file
, ppos
, len
, flags
);
462 au_write_post(inode
, h_file
, &wpre
, err
);
465 si_read_unlock(inode
->i_sb
);
470 static long aufs_fallocate(struct file
*file
, int mode
, loff_t offset
,
474 struct au_write_pre wpre
;
478 inode
= file_inode(file
);
479 au_mtx_and_read_lock(inode
);
482 h_file
= au_write_pre(file
, /*do_ready*/1, &wpre
);
483 err
= PTR_ERR(h_file
);
488 err
= vfs_fallocate(h_file
, mode
, offset
, len
);
490 au_write_post(inode
, h_file
, &wpre
, /*written*/1);
493 si_read_unlock(inode
->i_sb
);
498 static ssize_t
aufs_copy_file_range(struct file
*src
, loff_t src_pos
,
499 struct file
*dst
, loff_t dst_pos
,
500 size_t len
, unsigned int flags
)
503 struct au_write_pre wpre
;
508 struct super_block
*h_sb
;
514 a_src
.inode
= file_inode(src
);
515 if (unlikely(!S_ISREG(a_src
.inode
->i_mode
)))
517 a_dst
.inode
= file_inode(dst
);
518 if (unlikely(!S_ISREG(a_dst
.inode
->i_mode
)))
521 au_mtx_and_read_lock(a_dst
.inode
);
523 * in order to match the order in di_write_lock2_{child,parent}(),
524 * use f_path.dentry for this comparison.
526 if (src
->f_path
.dentry
< dst
->f_path
.dentry
) {
527 a_src
.h_file
= au_read_pre(src
, /*keep_fi*/1, AuLsc_FI_1
);
528 err
= PTR_ERR(a_src
.h_file
);
529 if (IS_ERR(a_src
.h_file
))
532 wpre
.lsc
= AuLsc_FI_2
;
533 a_dst
.h_file
= au_write_pre(dst
, /*do_ready*/1, &wpre
);
534 err
= PTR_ERR(a_dst
.h_file
);
535 if (IS_ERR(a_dst
.h_file
)) {
536 au_read_post(a_src
.inode
, a_src
.h_file
);
540 wpre
.lsc
= AuLsc_FI_1
;
541 a_dst
.h_file
= au_write_pre(dst
, /*do_ready*/1, &wpre
);
542 err
= PTR_ERR(a_dst
.h_file
);
543 if (IS_ERR(a_dst
.h_file
))
546 a_src
.h_file
= au_read_pre(src
, /*keep_fi*/1, AuLsc_FI_2
);
547 err
= PTR_ERR(a_src
.h_file
);
548 if (IS_ERR(a_src
.h_file
)) {
549 au_write_post(a_dst
.inode
, a_dst
.h_file
, &wpre
,
556 a_src
.h_sb
= file_inode(a_src
.h_file
)->i_sb
;
557 a_dst
.h_sb
= file_inode(a_dst
.h_file
)->i_sb
;
558 if (unlikely(a_src
.h_sb
!= a_dst
.h_sb
)) {
564 err
= vfsub_copy_file_range(a_src
.h_file
, src_pos
, a_dst
.h_file
,
565 dst_pos
, len
, flags
);
568 au_write_post(a_dst
.inode
, a_dst
.h_file
, &wpre
, err
);
570 au_read_post(a_src
.inode
, a_src
.h_file
);
572 si_read_unlock(a_dst
.inode
->i_sb
);
573 inode_unlock(a_dst
.inode
);
580 /* ---------------------------------------------------------------------- */
583 * The locking order around current->mmap_sem.
584 * - in most and regular cases
585 * file I/O syscall -- aufs_read() or something
586 * -- si_rwsem for read -- mmap_sem
587 * (Note that [fdi]i_rwsem are released before mmap_sem).
589 * mmap(2) -- mmap_sem -- aufs_mmap() -- si_rwsem for read -- [fdi]i_rwsem
590 * This AB-BA order is definitely bad, but is not a problem since "si_rwsem for
591 * read" allows multiple processes to acquire it and [fdi]i_rwsem are not held in
592 * file I/O. Aufs needs to stop lockdep in aufs_mmap() though.
593 * It means that when aufs acquires si_rwsem for write, the process should never
596 * Actually aufs_iterate() holds [fdi]i_rwsem before mmap_sem, but this is not a
597 * problem either since no directory can be mmap-ed.
598 * A similar scenario applies to aufs_readlink() too.
601 #if 0 /* stop calling security_file_mmap() */
602 /* cf. linux/include/linux/mman.h: calc_vm_prot_bits() */
603 #define AuConv_VM_PROT(f, b) _calc_vm_trans(f, VM_##b, PROT_##b)
605 static unsigned long au_arch_prot_conv(unsigned long flags
)
607 /* currently ppc64 only */
609 /* cf. linux/arch/powerpc/include/asm/mman.h */
610 AuDebugOn(arch_calc_vm_prot_bits(-1) != VM_SAO
);
611 return AuConv_VM_PROT(flags
, SAO
);
613 AuDebugOn(arch_calc_vm_prot_bits(-1));
618 static unsigned long au_prot_conv(unsigned long flags
)
620 return AuConv_VM_PROT(flags
, READ
)
621 | AuConv_VM_PROT(flags
, WRITE
)
622 | AuConv_VM_PROT(flags
, EXEC
)
623 | au_arch_prot_conv(flags
);
626 /* cf. linux/include/linux/mman.h: calc_vm_flag_bits() */
627 #define AuConv_VM_MAP(f, b) _calc_vm_trans(f, VM_##b, MAP_##b)
629 static unsigned long au_flag_conv(unsigned long flags
)
631 return AuConv_VM_MAP(flags
, GROWSDOWN
)
632 | AuConv_VM_MAP(flags
, DENYWRITE
)
633 | AuConv_VM_MAP(flags
, LOCKED
);
637 static int aufs_mmap(struct file
*file
, struct vm_area_struct
*vma
)
640 const unsigned char wlock
641 = (file
->f_mode
& FMODE_WRITE
) && (vma
->vm_flags
& VM_SHARED
);
642 struct super_block
*sb
;
646 AuDbgVmRegion(file
, vma
);
648 inode
= file_inode(file
);
651 si_read_lock(sb
, AuLock_NOPLMW
);
653 h_file
= au_write_pre(file
, wlock
, /*wpre*/NULL
);
655 err
= PTR_ERR(h_file
);
660 au_set_mmapped(file
);
661 au_vm_file_reset(vma
, h_file
);
663 * we cannot call security_mmap_file() here since it may acquire
664 * mmap_sem or i_mutex.
666 * err = security_mmap_file(h_file, au_prot_conv(vma->vm_flags),
667 * au_flag_conv(vma->vm_flags));
670 err
= call_mmap(h_file
, vma
);
672 au_vm_prfile_set(vma
, file
);
673 fsstack_copy_attr_atime(inode
, file_inode(h_file
));
674 goto out_fput
; /* success */
676 au_unset_mmapped(file
);
677 au_vm_file_reset(vma
, file
);
681 ii_write_unlock(inode
);
692 /* ---------------------------------------------------------------------- */
694 static int aufs_fsync_nondir(struct file
*file
, loff_t start
, loff_t end
,
698 struct au_write_pre wpre
;
702 err
= 0; /* -EBADF; */ /* posix? */
703 if (unlikely(!(file
->f_mode
& FMODE_WRITE
)))
706 inode
= file_inode(file
);
707 au_mtx_and_read_lock(inode
);
710 h_file
= au_write_pre(file
, /*do_ready*/1, &wpre
);
711 err
= PTR_ERR(h_file
);
715 err
= vfsub_fsync(h_file
, &h_file
->f_path
, datasync
);
716 au_write_post(inode
, h_file
, &wpre
, /*written*/0);
719 si_read_unlock(inode
->i_sb
);
725 static int aufs_fasync(int fd
, struct file
*file
, int flag
)
729 struct super_block
*sb
;
731 sb
= file
->f_path
.dentry
->d_sb
;
732 si_read_lock(sb
, AuLock_FLUSH
| AuLock_NOPLMW
);
734 h_file
= au_read_pre(file
, /*keep_fi*/0, /*lsc*/0);
735 err
= PTR_ERR(h_file
);
739 if (h_file
->f_op
->fasync
)
740 err
= h_file
->f_op
->fasync(fd
, h_file
, flag
);
741 fput(h_file
); /* instead of au_read_post() */
748 static int aufs_setfl(struct file
*file
, unsigned long arg
)
752 struct super_block
*sb
;
754 sb
= file
->f_path
.dentry
->d_sb
;
755 si_read_lock(sb
, AuLock_FLUSH
| AuLock_NOPLMW
);
757 h_file
= au_read_pre(file
, /*keep_fi*/0, /*lsc*/0);
758 err
= PTR_ERR(h_file
);
762 /* stop calling h_file->fasync */
763 arg
|= vfsub_file_flags(file
) & FASYNC
;
764 err
= setfl(/*unused fd*/-1, h_file
, arg
);
765 fput(h_file
); /* instead of au_read_post() */
772 /* ---------------------------------------------------------------------- */
774 /* no one supports this operation, currently */
776 static ssize_t
aufs_sendpage(struct file
*file
, struct page
*page
, int offset
,
777 size_t len
, loff_t
*pos
, int more
)
782 /* ---------------------------------------------------------------------- */
784 const struct file_operations aufs_file_fop
= {
785 .owner
= THIS_MODULE
,
787 .llseek
= default_llseek
,
791 .read_iter
= aufs_read_iter
,
792 .write_iter
= aufs_write_iter
,
794 #ifdef CONFIG_AUFS_POLL
797 .unlocked_ioctl
= aufs_ioctl_nondir
,
799 .compat_ioctl
= aufs_compat_ioctl_nondir
,
802 .open
= aufs_open_nondir
,
803 .flush
= aufs_flush_nondir
,
804 .release
= aufs_release_nondir
,
805 .fsync
= aufs_fsync_nondir
,
806 .fasync
= aufs_fasync
,
807 /* .sendpage = aufs_sendpage, */
809 .splice_write
= aufs_splice_write
,
810 .splice_read
= aufs_splice_read
,
812 .aio_splice_write
= aufs_aio_splice_write
,
813 .aio_splice_read
= aufs_aio_splice_read
,
815 .fallocate
= aufs_fallocate
,
816 .copy_file_range
= aufs_copy_file_range