2 * Copyright (C) 2005-2016 Junjiro R. Okajima
4 * This program, aufs is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 * file and vm operations
22 #include <linux/aio.h>
23 #include <linux/fs_stack.h>
24 #include <linux/mman.h>
25 #include <linux/security.h>
28 int au_do_open_nondir(struct file
*file
, int flags
, struct file
*h_file
)
32 struct dentry
*dentry
, *h_dentry
;
33 struct au_finfo
*finfo
;
34 struct inode
*h_inode
;
36 FiMustWriteLock(file
);
39 dentry
= file
->f_path
.dentry
;
40 AuDebugOn(IS_ERR_OR_NULL(dentry
));
42 memset(&finfo
->fi_htop
, 0, sizeof(finfo
->fi_htop
));
43 atomic_set(&finfo
->fi_mmapped
, 0);
44 bindex
= au_dbtop(dentry
);
46 h_dentry
= au_h_dptr(dentry
, bindex
);
47 err
= vfsub_test_mntns(file
->f_path
.mnt
, h_dentry
->d_sb
);
50 h_file
= au_h_open(dentry
, bindex
, flags
, file
, /*force_wr*/0);
52 h_dentry
= h_file
->f_path
.dentry
;
53 err
= vfsub_test_mntns(file
->f_path
.mnt
, h_dentry
->d_sb
);
59 err
= PTR_ERR(h_file
);
61 if ((flags
& __O_TMPFILE
)
62 && !(flags
& O_EXCL
)) {
63 h_inode
= file_inode(h_file
);
64 spin_lock(&h_inode
->i_lock
);
65 h_inode
->i_state
|= I_LINKABLE
;
66 spin_unlock(&h_inode
->i_lock
);
68 au_set_fbtop(file
, bindex
);
69 au_set_h_fptr(file
, bindex
, h_file
);
70 au_update_figen(file
);
71 /* todo: necessary? */
72 /* file->f_ra = h_file->f_ra; */
79 static int aufs_open_nondir(struct inode
*inode __maybe_unused
,
83 struct super_block
*sb
;
84 struct au_do_open_args args
= {
85 .open
= au_do_open_nondir
88 AuDbg("%pD, f_flags 0x%x, f_mode 0x%x\n",
89 file
, vfsub_file_flags(file
), file
->f_mode
);
91 sb
= file
->f_path
.dentry
->d_sb
;
92 si_read_lock(sb
, AuLock_FLUSH
);
93 err
= au_do_open(file
, &args
);
98 int aufs_release_nondir(struct inode
*inode __maybe_unused
, struct file
*file
)
100 struct au_finfo
*finfo
;
101 aufs_bindex_t bindex
;
105 au_sphl_del(&finfo
->fi_hlist
,
106 &au_sbi(file
->f_path
.dentry
->d_sb
)->si_files
);
107 bindex
= finfo
->fi_btop
;
109 au_set_h_fptr(file
, bindex
, NULL
);
111 delayed
= (current
->flags
& PF_KTHREAD
) || in_interrupt();
112 au_finfo_fin(file
, delayed
);
116 /* ---------------------------------------------------------------------- */
118 static int au_do_flush_nondir(struct file
*file
, fl_owner_t id
)
124 h_file
= au_hf_top(file
);
126 err
= vfsub_flush(h_file
, id
);
130 static int aufs_flush_nondir(struct file
*file
, fl_owner_t id
)
132 return au_do_flush(file
, id
, au_do_flush_nondir
);
135 /* ---------------------------------------------------------------------- */
137 * read and write functions acquire [fdi]_rwsem once, but release them before
138 * mmap_sem. This is to prevent a race condition with mmap(2).
139 * Releasing these aufs-rwsem should be safe: no branch-management (by keeping
140 * si_rwsem) and no harmful copy-up should happen. Actually copy-up may happen in
141 * read functions after [fdi]_rwsem are released, but it should be harmless.
144 /* Callers should call au_read_post() or fput() in the end */
145 struct file
*au_read_pre(struct file
*file
, int keep_fi
)
150 err
= au_reval_and_lock_fdi(file
, au_reopen_nondir
, /*wlock*/0);
152 di_read_unlock(file
->f_path
.dentry
, AuLock_IR
);
153 h_file
= au_hf_top(file
);
156 fi_read_unlock(file
);
158 h_file
= ERR_PTR(err
);
163 static void au_read_post(struct inode
*inode
, struct file
*h_file
)
165 /* update without lock, I don't think it is a problem */
166 fsstack_copy_attr_atime(inode
, file_inode(h_file
));
170 struct au_write_pre
{
176 * return with iinfo is write-locked
177 * callers should call au_write_post() or iinfo_write_unlock() + fput() in the
180 static struct file
*au_write_pre(struct file
*file
, int do_ready
,
181 struct au_write_pre
*wpre
)
184 struct dentry
*dentry
;
188 err
= au_reval_and_lock_fdi(file
, au_reopen_nondir
, /*wlock*/1);
189 h_file
= ERR_PTR(err
);
193 dentry
= file
->f_path
.dentry
;
195 err
= au_ready_to_write(file
, -1, &pin
);
197 h_file
= ERR_PTR(err
);
198 di_write_unlock(dentry
);
203 di_downgrade_lock(dentry
, /*flags*/0);
205 wpre
->btop
= au_fbtop(file
);
206 h_file
= au_hf_top(file
);
209 wpre
->blks
= file_inode(h_file
)->i_blocks
;
212 di_read_unlock(dentry
, /*flags*/0);
215 fi_write_unlock(file
);
220 static void au_write_post(struct inode
*inode
, struct file
*h_file
,
221 struct au_write_pre
*wpre
, ssize_t written
)
223 struct inode
*h_inode
;
225 au_cpup_attr_timesizes(inode
);
226 AuDebugOn(au_ibtop(inode
) != wpre
->btop
);
227 h_inode
= file_inode(h_file
);
228 inode
->i_mode
= h_inode
->i_mode
;
229 ii_write_unlock(inode
);
232 /* AuDbg("blks %llu, %llu\n", (u64)blks, (u64)h_inode->i_blocks); */
234 au_fhsm_wrote(inode
->i_sb
, wpre
->btop
,
235 /*force*/h_inode
->i_blocks
> wpre
->blks
);
238 static ssize_t
aufs_read(struct file
*file
, char __user
*buf
, size_t count
,
244 struct super_block
*sb
;
246 inode
= file_inode(file
);
248 si_read_lock(sb
, AuLock_FLUSH
| AuLock_NOPLMW
);
250 h_file
= au_read_pre(file
, /*keep_fi*/0);
251 err
= PTR_ERR(h_file
);
255 /* filedata may be obsoleted by concurrent copyup, but no problem */
256 err
= vfsub_read_u(h_file
, buf
, count
, ppos
);
257 /* todo: necessary? */
258 /* file->f_ra = h_file->f_ra; */
259 au_read_post(inode
, h_file
);
268 * it safely locks both i_mutex and si_rwsem (for read).
269 * if the plink maintenance mode continues forever (that is the problem),
272 static void au_mtx_and_read_lock(struct inode
*inode
)
275 struct super_block
*sb
= inode
->i_sb
;
279 err
= si_read_lock(sb
, AuLock_FLUSH
| AuLock_NOPLM
);
283 si_read_lock(sb
, AuLock_NOPLMW
);
288 static ssize_t
aufs_write(struct file
*file
, const char __user
*ubuf
,
289 size_t count
, loff_t
*ppos
)
292 struct au_write_pre wpre
;
295 char __user
*buf
= (char __user
*)ubuf
;
297 inode
= file_inode(file
);
298 au_mtx_and_read_lock(inode
);
300 h_file
= au_write_pre(file
, /*do_ready*/1, &wpre
);
301 err
= PTR_ERR(h_file
);
305 err
= vfsub_write_u(h_file
, buf
, count
, ppos
);
306 au_write_post(inode
, h_file
, &wpre
, err
);
309 si_read_unlock(inode
->i_sb
);
314 static ssize_t
au_do_iter(struct file
*h_file
, int rw
, struct kiocb
*kio
,
315 struct iov_iter
*iov_iter
)
319 ssize_t (*iter
)(struct kiocb
*, struct iov_iter
*);
321 err
= security_file_permission(h_file
, rw
);
328 iter
= h_file
->f_op
->read_iter
;
329 else if (rw
== MAY_WRITE
)
330 iter
= h_file
->f_op
->write_iter
;
333 kio
->ki_filp
= h_file
;
336 err
= iter(kio
, iov_iter
);
339 /* currently there is no such fs */
347 static ssize_t
aufs_read_iter(struct kiocb
*kio
, struct iov_iter
*iov_iter
)
350 struct file
*file
, *h_file
;
352 struct super_block
*sb
;
355 inode
= file_inode(file
);
357 si_read_lock(sb
, AuLock_FLUSH
| AuLock_NOPLMW
);
359 h_file
= au_read_pre(file
, /*keep_fi*/1);
360 err
= PTR_ERR(h_file
);
364 if (au_test_loopback_kthread()) {
365 au_warn_loopback(h_file
->f_path
.dentry
->d_sb
);
366 if (file
->f_mapping
!= h_file
->f_mapping
) {
367 file
->f_mapping
= h_file
->f_mapping
;
368 smp_mb(); /* unnecessary? */
371 fi_read_unlock(file
);
373 err
= au_do_iter(h_file
, MAY_READ
, kio
, iov_iter
);
374 /* todo: necessary? */
375 /* file->f_ra = h_file->f_ra; */
376 au_read_post(inode
, h_file
);
383 static ssize_t
aufs_write_iter(struct kiocb
*kio
, struct iov_iter
*iov_iter
)
386 struct au_write_pre wpre
;
388 struct file
*file
, *h_file
;
391 inode
= file_inode(file
);
392 au_mtx_and_read_lock(inode
);
394 h_file
= au_write_pre(file
, /*do_ready*/1, &wpre
);
395 err
= PTR_ERR(h_file
);
399 err
= au_do_iter(h_file
, MAY_WRITE
, kio
, iov_iter
);
400 au_write_post(inode
, h_file
, &wpre
, err
);
403 si_read_unlock(inode
->i_sb
);
408 static ssize_t
aufs_splice_read(struct file
*file
, loff_t
*ppos
,
409 struct pipe_inode_info
*pipe
, size_t len
,
415 struct super_block
*sb
;
417 inode
= file_inode(file
);
419 si_read_lock(sb
, AuLock_FLUSH
| AuLock_NOPLMW
);
421 h_file
= au_read_pre(file
, /*keep_fi*/0);
422 err
= PTR_ERR(h_file
);
426 err
= vfsub_splice_to(h_file
, ppos
, pipe
, len
, flags
);
427 /* todo: necessary? */
428 /* file->f_ra = h_file->f_ra; */
429 au_read_post(inode
, h_file
);
437 aufs_splice_write(struct pipe_inode_info
*pipe
, struct file
*file
, loff_t
*ppos
,
438 size_t len
, unsigned int flags
)
441 struct au_write_pre wpre
;
445 inode
= file_inode(file
);
446 au_mtx_and_read_lock(inode
);
448 h_file
= au_write_pre(file
, /*do_ready*/1, &wpre
);
449 err
= PTR_ERR(h_file
);
453 err
= vfsub_splice_from(pipe
, h_file
, ppos
, len
, flags
);
454 au_write_post(inode
, h_file
, &wpre
, err
);
457 si_read_unlock(inode
->i_sb
);
462 static long aufs_fallocate(struct file
*file
, int mode
, loff_t offset
,
466 struct au_write_pre wpre
;
470 inode
= file_inode(file
);
471 au_mtx_and_read_lock(inode
);
473 h_file
= au_write_pre(file
, /*do_ready*/1, &wpre
);
474 err
= PTR_ERR(h_file
);
479 err
= vfs_fallocate(h_file
, mode
, offset
, len
);
481 au_write_post(inode
, h_file
, &wpre
, /*written*/1);
484 si_read_unlock(inode
->i_sb
);
489 /* ---------------------------------------------------------------------- */
492 * The locking order around current->mmap_sem.
493 * - in most and regular cases
494 * file I/O syscall -- aufs_read() or something
495 * -- si_rwsem for read -- mmap_sem
496 * (Note that [fdi]i_rwsem are released before mmap_sem).
498 * mmap(2) -- mmap_sem -- aufs_mmap() -- si_rwsem for read -- [fdi]i_rwsem
499 * This AB-BA order is definitely bad, but is not a problem since "si_rwsem for
500 * read" allows multiple processes to acquire it and [fdi]i_rwsem are not held in
501 * file I/O. Aufs needs to stop lockdep in aufs_mmap() though.
502 * It means that when aufs acquires si_rwsem for write, the process should never
505 * Actually aufs_iterate() holds [fdi]i_rwsem before mmap_sem, but this is not a
506 * problem either since no directory can be mmap-ed.
507 * A similar scenario applies to aufs_readlink() too.
510 #if 0 /* stop calling security_file_mmap() */
511 /* cf. linux/include/linux/mman.h: calc_vm_prot_bits() */
512 #define AuConv_VM_PROT(f, b) _calc_vm_trans(f, VM_##b, PROT_##b)
514 static unsigned long au_arch_prot_conv(unsigned long flags
)
516 /* currently ppc64 only */
518 /* cf. linux/arch/powerpc/include/asm/mman.h */
519 AuDebugOn(arch_calc_vm_prot_bits(-1) != VM_SAO
);
520 return AuConv_VM_PROT(flags
, SAO
);
522 AuDebugOn(arch_calc_vm_prot_bits(-1));
527 static unsigned long au_prot_conv(unsigned long flags
)
529 return AuConv_VM_PROT(flags
, READ
)
530 | AuConv_VM_PROT(flags
, WRITE
)
531 | AuConv_VM_PROT(flags
, EXEC
)
532 | au_arch_prot_conv(flags
);
535 /* cf. linux/include/linux/mman.h: calc_vm_flag_bits() */
536 #define AuConv_VM_MAP(f, b) _calc_vm_trans(f, VM_##b, MAP_##b)
538 static unsigned long au_flag_conv(unsigned long flags
)
540 return AuConv_VM_MAP(flags
, GROWSDOWN
)
541 | AuConv_VM_MAP(flags
, DENYWRITE
)
542 | AuConv_VM_MAP(flags
, LOCKED
);
546 static int aufs_mmap(struct file
*file
, struct vm_area_struct
*vma
)
549 const unsigned char wlock
550 = (file
->f_mode
& FMODE_WRITE
) && (vma
->vm_flags
& VM_SHARED
);
551 struct super_block
*sb
;
555 AuDbgVmRegion(file
, vma
);
557 inode
= file_inode(file
);
560 si_read_lock(sb
, AuLock_NOPLMW
);
562 h_file
= au_write_pre(file
, wlock
, /*wpre*/NULL
);
564 err
= PTR_ERR(h_file
);
569 au_set_mmapped(file
);
570 au_vm_file_reset(vma
, h_file
);
572 * we cannot call security_mmap_file() here since it may acquire
573 * mmap_sem or i_mutex.
575 * err = security_mmap_file(h_file, au_prot_conv(vma->vm_flags),
576 * au_flag_conv(vma->vm_flags));
579 err
= h_file
->f_op
->mmap(h_file
, vma
);
581 au_vm_prfile_set(vma
, file
);
582 fsstack_copy_attr_atime(inode
, file_inode(h_file
));
583 goto out_fput
; /* success */
585 au_unset_mmapped(file
);
586 au_vm_file_reset(vma
, file
);
590 ii_write_unlock(inode
);
601 /* ---------------------------------------------------------------------- */
603 static int aufs_fsync_nondir(struct file
*file
, loff_t start
, loff_t end
,
607 struct au_write_pre wpre
;
611 err
= 0; /* -EBADF; */ /* posix? */
612 if (unlikely(!(file
->f_mode
& FMODE_WRITE
)))
615 inode
= file_inode(file
);
616 au_mtx_and_read_lock(inode
);
618 h_file
= au_write_pre(file
, /*do_ready*/1, &wpre
);
619 err
= PTR_ERR(h_file
);
623 err
= vfsub_fsync(h_file
, &h_file
->f_path
, datasync
);
624 au_write_post(inode
, h_file
, &wpre
, /*written*/0);
627 si_read_unlock(inode
->i_sb
);
633 static int aufs_fasync(int fd
, struct file
*file
, int flag
)
637 struct super_block
*sb
;
639 sb
= file
->f_path
.dentry
->d_sb
;
640 si_read_lock(sb
, AuLock_FLUSH
| AuLock_NOPLMW
);
642 h_file
= au_read_pre(file
, /*keep_fi*/0);
643 err
= PTR_ERR(h_file
);
647 if (h_file
->f_op
->fasync
)
648 err
= h_file
->f_op
->fasync(fd
, h_file
, flag
);
649 fput(h_file
); /* instead of au_read_post() */
656 static int aufs_setfl(struct file
*file
, unsigned long arg
)
660 struct super_block
*sb
;
662 sb
= file
->f_path
.dentry
->d_sb
;
663 si_read_lock(sb
, AuLock_FLUSH
| AuLock_NOPLMW
);
665 h_file
= au_read_pre(file
, /*keep_fi*/0);
666 err
= PTR_ERR(h_file
);
670 arg
|= vfsub_file_flags(file
) & FASYNC
; /* stop calling h_file->fasync */
671 err
= setfl(/*unused fd*/-1, h_file
, arg
);
672 fput(h_file
); /* instead of au_read_post() */
679 /* ---------------------------------------------------------------------- */
681 /* no one supports this operation, currently */
683 static ssize_t
aufs_sendpage(struct file
*file
, struct page
*page
, int offset
,
684 size_t len
, loff_t
*pos
, int more
)
689 /* ---------------------------------------------------------------------- */
691 const struct file_operations aufs_file_fop
= {
692 .owner
= THIS_MODULE
,
694 .llseek
= default_llseek
,
698 .read_iter
= aufs_read_iter
,
699 .write_iter
= aufs_write_iter
,
701 #ifdef CONFIG_AUFS_POLL
704 .unlocked_ioctl
= aufs_ioctl_nondir
,
706 .compat_ioctl
= aufs_compat_ioctl_nondir
,
709 .open
= aufs_open_nondir
,
710 .flush
= aufs_flush_nondir
,
711 .release
= aufs_release_nondir
,
712 .fsync
= aufs_fsync_nondir
,
713 .fasync
= aufs_fasync
,
714 /* .sendpage = aufs_sendpage, */
716 .splice_write
= aufs_splice_write
,
717 .splice_read
= aufs_splice_read
,
719 .aio_splice_write
= aufs_aio_splice_write
,
720 .aio_splice_read
= aufs_aio_splice_read
,
722 .fallocate
= aufs_fallocate