]> git.proxmox.com Git - mirror_ubuntu-zesty-kernel.git/blob - fs/aufs/f_op.c
UBUNTU: SAUCE: Import aufs driver
[mirror_ubuntu-zesty-kernel.git] / fs / aufs / f_op.c
1 /*
2 * Copyright (C) 2005-2016 Junjiro R. Okajima
3 *
4 * This program, aufs is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 /*
19 * file and vm operations
20 */
21
22 #include <linux/aio.h>
23 #include <linux/fs_stack.h>
24 #include <linux/mman.h>
25 #include <linux/security.h>
26 #include "aufs.h"
27
28 int au_do_open_nondir(struct file *file, int flags, struct file *h_file)
29 {
30 int err;
31 aufs_bindex_t bindex;
32 struct dentry *dentry, *h_dentry;
33 struct au_finfo *finfo;
34 struct inode *h_inode;
35
36 FiMustWriteLock(file);
37
38 err = 0;
39 dentry = file->f_path.dentry;
40 AuDebugOn(IS_ERR_OR_NULL(dentry));
41 finfo = au_fi(file);
42 memset(&finfo->fi_htop, 0, sizeof(finfo->fi_htop));
43 atomic_set(&finfo->fi_mmapped, 0);
44 bindex = au_dbtop(dentry);
45 if (!h_file) {
46 h_dentry = au_h_dptr(dentry, bindex);
47 err = vfsub_test_mntns(file->f_path.mnt, h_dentry->d_sb);
48 if (unlikely(err))
49 goto out;
50 h_file = au_h_open(dentry, bindex, flags, file, /*force_wr*/0);
51 } else {
52 h_dentry = h_file->f_path.dentry;
53 err = vfsub_test_mntns(file->f_path.mnt, h_dentry->d_sb);
54 if (unlikely(err))
55 goto out;
56 get_file(h_file);
57 }
58 if (IS_ERR(h_file))
59 err = PTR_ERR(h_file);
60 else {
61 if ((flags & __O_TMPFILE)
62 && !(flags & O_EXCL)) {
63 h_inode = file_inode(h_file);
64 spin_lock(&h_inode->i_lock);
65 h_inode->i_state |= I_LINKABLE;
66 spin_unlock(&h_inode->i_lock);
67 }
68 au_set_fbtop(file, bindex);
69 au_set_h_fptr(file, bindex, h_file);
70 au_update_figen(file);
71 /* todo: necessary? */
72 /* file->f_ra = h_file->f_ra; */
73 }
74
75 out:
76 return err;
77 }
78
79 static int aufs_open_nondir(struct inode *inode __maybe_unused,
80 struct file *file)
81 {
82 int err;
83 struct super_block *sb;
84 struct au_do_open_args args = {
85 .open = au_do_open_nondir
86 };
87
88 AuDbg("%pD, f_flags 0x%x, f_mode 0x%x\n",
89 file, vfsub_file_flags(file), file->f_mode);
90
91 sb = file->f_path.dentry->d_sb;
92 si_read_lock(sb, AuLock_FLUSH);
93 err = au_do_open(file, &args);
94 si_read_unlock(sb);
95 return err;
96 }
97
98 int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file)
99 {
100 struct au_finfo *finfo;
101 aufs_bindex_t bindex;
102 int delayed;
103
104 finfo = au_fi(file);
105 au_sphl_del(&finfo->fi_hlist,
106 &au_sbi(file->f_path.dentry->d_sb)->si_files);
107 bindex = finfo->fi_btop;
108 if (bindex >= 0)
109 au_set_h_fptr(file, bindex, NULL);
110
111 delayed = (current->flags & PF_KTHREAD) || in_interrupt();
112 au_finfo_fin(file, delayed);
113 return 0;
114 }
115
116 /* ---------------------------------------------------------------------- */
117
118 static int au_do_flush_nondir(struct file *file, fl_owner_t id)
119 {
120 int err;
121 struct file *h_file;
122
123 err = 0;
124 h_file = au_hf_top(file);
125 if (h_file)
126 err = vfsub_flush(h_file, id);
127 return err;
128 }
129
130 static int aufs_flush_nondir(struct file *file, fl_owner_t id)
131 {
132 return au_do_flush(file, id, au_do_flush_nondir);
133 }
134
135 /* ---------------------------------------------------------------------- */
/*
 * Read and write functions acquire [fdi]_rwsem once, but release them before
 * taking mmap_sem. This is to prevent a race condition with mmap(2).
 * Releasing these aufs rwsems should be safe: no branch management can happen
 * (because si_rwsem is kept) and no harmful copy-up should happen. Actually
 * copy-up may happen in read functions after [fdi]_rwsem are released, but it
 * should be harmless.
 */
143
144 /* Callers should call au_read_post() or fput() in the end */
145 struct file *au_read_pre(struct file *file, int keep_fi)
146 {
147 struct file *h_file;
148 int err;
149
150 err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0);
151 if (!err) {
152 di_read_unlock(file->f_path.dentry, AuLock_IR);
153 h_file = au_hf_top(file);
154 get_file(h_file);
155 if (!keep_fi)
156 fi_read_unlock(file);
157 } else
158 h_file = ERR_PTR(err);
159
160 return h_file;
161 }
162
/*
 * Finish a read-type operation: copy atime from the lower inode to the aufs
 * @inode and drop the reference on @h_file taken by au_read_pre().
 */
static void au_read_post(struct inode *inode, struct file *h_file)
{
	struct inode *h_inode = file_inode(h_file);

	/* update without lock, I don't think it a problem */
	fsstack_copy_attr_atime(inode, h_inode);
	fput(h_file);
}
169
170 struct au_write_pre {
171 blkcnt_t blks;
172 aufs_bindex_t btop;
173 };
174
175 /*
176 * return with iinfo is write-locked
177 * callers should call au_write_post() or iinfo_write_unlock() + fput() in the
178 * end
179 */
180 static struct file *au_write_pre(struct file *file, int do_ready,
181 struct au_write_pre *wpre)
182 {
183 struct file *h_file;
184 struct dentry *dentry;
185 int err;
186 struct au_pin pin;
187
188 err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1);
189 h_file = ERR_PTR(err);
190 if (unlikely(err))
191 goto out;
192
193 dentry = file->f_path.dentry;
194 if (do_ready) {
195 err = au_ready_to_write(file, -1, &pin);
196 if (unlikely(err)) {
197 h_file = ERR_PTR(err);
198 di_write_unlock(dentry);
199 goto out_fi;
200 }
201 }
202
203 di_downgrade_lock(dentry, /*flags*/0);
204 if (wpre)
205 wpre->btop = au_fbtop(file);
206 h_file = au_hf_top(file);
207 get_file(h_file);
208 if (wpre)
209 wpre->blks = file_inode(h_file)->i_blocks;
210 if (do_ready)
211 au_unpin(&pin);
212 di_read_unlock(dentry, /*flags*/0);
213
214 out_fi:
215 fi_write_unlock(file);
216 out:
217 return h_file;
218 }
219
220 static void au_write_post(struct inode *inode, struct file *h_file,
221 struct au_write_pre *wpre, ssize_t written)
222 {
223 struct inode *h_inode;
224
225 au_cpup_attr_timesizes(inode);
226 AuDebugOn(au_ibtop(inode) != wpre->btop);
227 h_inode = file_inode(h_file);
228 inode->i_mode = h_inode->i_mode;
229 ii_write_unlock(inode);
230 fput(h_file);
231
232 /* AuDbg("blks %llu, %llu\n", (u64)blks, (u64)h_inode->i_blocks); */
233 if (written > 0)
234 au_fhsm_wrote(inode->i_sb, wpre->btop,
235 /*force*/h_inode->i_blocks > wpre->blks);
236 }
237
238 static ssize_t aufs_read(struct file *file, char __user *buf, size_t count,
239 loff_t *ppos)
240 {
241 ssize_t err;
242 struct inode *inode;
243 struct file *h_file;
244 struct super_block *sb;
245
246 inode = file_inode(file);
247 sb = inode->i_sb;
248 si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
249
250 h_file = au_read_pre(file, /*keep_fi*/0);
251 err = PTR_ERR(h_file);
252 if (IS_ERR(h_file))
253 goto out;
254
255 /* filedata may be obsoleted by concurrent copyup, but no problem */
256 err = vfsub_read_u(h_file, buf, count, ppos);
257 /* todo: necessary? */
258 /* file->f_ra = h_file->f_ra; */
259 au_read_post(inode, h_file);
260
261 out:
262 si_read_unlock(sb);
263 return err;
264 }
265
266 /*
267 * todo: very ugly
268 * it locks both of i_mutex and si_rwsem for read in safe.
269 * if the plink maintenance mode continues forever (that is the problem),
270 * may loop forever.
271 */
272 static void au_mtx_and_read_lock(struct inode *inode)
273 {
274 int err;
275 struct super_block *sb = inode->i_sb;
276
277 while (1) {
278 inode_lock(inode);
279 err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
280 if (!err)
281 break;
282 inode_unlock(inode);
283 si_read_lock(sb, AuLock_NOPLMW);
284 si_read_unlock(sb);
285 }
286 }
287
288 static ssize_t aufs_write(struct file *file, const char __user *ubuf,
289 size_t count, loff_t *ppos)
290 {
291 ssize_t err;
292 struct au_write_pre wpre;
293 struct inode *inode;
294 struct file *h_file;
295 char __user *buf = (char __user *)ubuf;
296
297 inode = file_inode(file);
298 au_mtx_and_read_lock(inode);
299
300 h_file = au_write_pre(file, /*do_ready*/1, &wpre);
301 err = PTR_ERR(h_file);
302 if (IS_ERR(h_file))
303 goto out;
304
305 err = vfsub_write_u(h_file, buf, count, ppos);
306 au_write_post(inode, h_file, &wpre, err);
307
308 out:
309 si_read_unlock(inode->i_sb);
310 inode_unlock(inode);
311 return err;
312 }
313
314 static ssize_t au_do_iter(struct file *h_file, int rw, struct kiocb *kio,
315 struct iov_iter *iov_iter)
316 {
317 ssize_t err;
318 struct file *file;
319 ssize_t (*iter)(struct kiocb *, struct iov_iter *);
320
321 err = security_file_permission(h_file, rw);
322 if (unlikely(err))
323 goto out;
324
325 err = -ENOSYS;
326 iter = NULL;
327 if (rw == MAY_READ)
328 iter = h_file->f_op->read_iter;
329 else if (rw == MAY_WRITE)
330 iter = h_file->f_op->write_iter;
331
332 file = kio->ki_filp;
333 kio->ki_filp = h_file;
334 if (iter) {
335 lockdep_off();
336 err = iter(kio, iov_iter);
337 lockdep_on();
338 } else
339 /* currently there is no such fs */
340 WARN_ON_ONCE(1);
341 kio->ki_filp = file;
342
343 out:
344 return err;
345 }
346
347 static ssize_t aufs_read_iter(struct kiocb *kio, struct iov_iter *iov_iter)
348 {
349 ssize_t err;
350 struct file *file, *h_file;
351 struct inode *inode;
352 struct super_block *sb;
353
354 file = kio->ki_filp;
355 inode = file_inode(file);
356 sb = inode->i_sb;
357 si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
358
359 h_file = au_read_pre(file, /*keep_fi*/1);
360 err = PTR_ERR(h_file);
361 if (IS_ERR(h_file))
362 goto out;
363
364 if (au_test_loopback_kthread()) {
365 au_warn_loopback(h_file->f_path.dentry->d_sb);
366 if (file->f_mapping != h_file->f_mapping) {
367 file->f_mapping = h_file->f_mapping;
368 smp_mb(); /* unnecessary? */
369 }
370 }
371 fi_read_unlock(file);
372
373 err = au_do_iter(h_file, MAY_READ, kio, iov_iter);
374 /* todo: necessary? */
375 /* file->f_ra = h_file->f_ra; */
376 au_read_post(inode, h_file);
377
378 out:
379 si_read_unlock(sb);
380 return err;
381 }
382
383 static ssize_t aufs_write_iter(struct kiocb *kio, struct iov_iter *iov_iter)
384 {
385 ssize_t err;
386 struct au_write_pre wpre;
387 struct inode *inode;
388 struct file *file, *h_file;
389
390 file = kio->ki_filp;
391 inode = file_inode(file);
392 au_mtx_and_read_lock(inode);
393
394 h_file = au_write_pre(file, /*do_ready*/1, &wpre);
395 err = PTR_ERR(h_file);
396 if (IS_ERR(h_file))
397 goto out;
398
399 err = au_do_iter(h_file, MAY_WRITE, kio, iov_iter);
400 au_write_post(inode, h_file, &wpre, err);
401
402 out:
403 si_read_unlock(inode->i_sb);
404 inode_unlock(inode);
405 return err;
406 }
407
408 static ssize_t aufs_splice_read(struct file *file, loff_t *ppos,
409 struct pipe_inode_info *pipe, size_t len,
410 unsigned int flags)
411 {
412 ssize_t err;
413 struct file *h_file;
414 struct inode *inode;
415 struct super_block *sb;
416
417 inode = file_inode(file);
418 sb = inode->i_sb;
419 si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
420
421 h_file = au_read_pre(file, /*keep_fi*/0);
422 err = PTR_ERR(h_file);
423 if (IS_ERR(h_file))
424 goto out;
425
426 err = vfsub_splice_to(h_file, ppos, pipe, len, flags);
427 /* todo: necessasry? */
428 /* file->f_ra = h_file->f_ra; */
429 au_read_post(inode, h_file);
430
431 out:
432 si_read_unlock(sb);
433 return err;
434 }
435
436 static ssize_t
437 aufs_splice_write(struct pipe_inode_info *pipe, struct file *file, loff_t *ppos,
438 size_t len, unsigned int flags)
439 {
440 ssize_t err;
441 struct au_write_pre wpre;
442 struct inode *inode;
443 struct file *h_file;
444
445 inode = file_inode(file);
446 au_mtx_and_read_lock(inode);
447
448 h_file = au_write_pre(file, /*do_ready*/1, &wpre);
449 err = PTR_ERR(h_file);
450 if (IS_ERR(h_file))
451 goto out;
452
453 err = vfsub_splice_from(pipe, h_file, ppos, len, flags);
454 au_write_post(inode, h_file, &wpre, err);
455
456 out:
457 si_read_unlock(inode->i_sb);
458 inode_unlock(inode);
459 return err;
460 }
461
462 static long aufs_fallocate(struct file *file, int mode, loff_t offset,
463 loff_t len)
464 {
465 long err;
466 struct au_write_pre wpre;
467 struct inode *inode;
468 struct file *h_file;
469
470 inode = file_inode(file);
471 au_mtx_and_read_lock(inode);
472
473 h_file = au_write_pre(file, /*do_ready*/1, &wpre);
474 err = PTR_ERR(h_file);
475 if (IS_ERR(h_file))
476 goto out;
477
478 lockdep_off();
479 err = vfs_fallocate(h_file, mode, offset, len);
480 lockdep_on();
481 au_write_post(inode, h_file, &wpre, /*written*/1);
482
483 out:
484 si_read_unlock(inode->i_sb);
485 inode_unlock(inode);
486 return err;
487 }
488
489 /* ---------------------------------------------------------------------- */
490
/*
 * The locking order around current->mmap_sem.
 * - in most and regular cases
 *   file I/O syscall -- aufs_read() or something
 *	-- si_rwsem for read -- mmap_sem
 *	(Note that [fdi]i_rwsem are released before mmap_sem).
 * - in mmap case
 *   mmap(2) -- mmap_sem -- aufs_mmap() -- si_rwsem for read -- [fdi]i_rwsem
 * This AB-BA order is definitely bad, but it is not a problem since "si_rwsem
 * for read" allows multiple processes to acquire it and [fdi]i_rwsem are not
 * held in file I/O. Aufs needs to stop lockdep in aufs_mmap() though.
 * It means that when aufs acquires si_rwsem for write, the process should never
 * acquire mmap_sem.
 *
 * Actually aufs_iterate() holds [fdi]i_rwsem before mmap_sem, but this is not a
 * problem either since no directory can be mmap-ed.
 * A similar scenario applies to aufs_readlink() too.
 */
509
#if 0 /* stop calling security_file_mmap() */
/*
 * Disabled helpers which convert vma->vm_flags (VM_xxx) back to the
 * PROT_xxx/MAP_xxx bits; only referenced from the commented-out
 * security_mmap_file() call in aufs_mmap().
 */

/* cf. linux/include/linux/mman.h: calc_vm_prot_bits() */
#define AuConv_VM_PROT(f, b)	_calc_vm_trans(f, VM_##b, PROT_##b)

/* architecture specific part of the VM_xxx -> PROT_xxx conversion */
static unsigned long au_arch_prot_conv(unsigned long flags)
{
	/* currently ppc64 only */
#ifdef CONFIG_PPC64
	/* cf. linux/arch/powerpc/include/asm/mman.h */
	AuDebugOn(arch_calc_vm_prot_bits(-1) != VM_SAO);
	return AuConv_VM_PROT(flags, SAO);
#else
	AuDebugOn(arch_calc_vm_prot_bits(-1));
	return 0;
#endif
}

/* VM_{READ,WRITE,EXEC} plus the arch bits -> PROT_xxx */
static unsigned long au_prot_conv(unsigned long flags)
{
	unsigned long prot;

	prot = AuConv_VM_PROT(flags, READ);
	prot |= AuConv_VM_PROT(flags, WRITE);
	prot |= AuConv_VM_PROT(flags, EXEC);
	prot |= au_arch_prot_conv(flags);

	return prot;
}

/* cf. linux/include/linux/mman.h: calc_vm_flag_bits() */
#define AuConv_VM_MAP(f, b)	_calc_vm_trans(f, VM_##b, MAP_##b)

/* VM_{GROWSDOWN,DENYWRITE,LOCKED} -> MAP_xxx */
static unsigned long au_flag_conv(unsigned long flags)
{
	unsigned long map;

	map = AuConv_VM_MAP(flags, GROWSDOWN);
	map |= AuConv_VM_MAP(flags, DENYWRITE);
	map |= AuConv_VM_MAP(flags, LOCKED);

	return map;
}
#endif
545
546 static int aufs_mmap(struct file *file, struct vm_area_struct *vma)
547 {
548 int err;
549 const unsigned char wlock
550 = (file->f_mode & FMODE_WRITE) && (vma->vm_flags & VM_SHARED);
551 struct super_block *sb;
552 struct file *h_file;
553 struct inode *inode;
554
555 AuDbgVmRegion(file, vma);
556
557 inode = file_inode(file);
558 sb = inode->i_sb;
559 lockdep_off();
560 si_read_lock(sb, AuLock_NOPLMW);
561
562 h_file = au_write_pre(file, wlock, /*wpre*/NULL);
563 lockdep_on();
564 err = PTR_ERR(h_file);
565 if (IS_ERR(h_file))
566 goto out;
567
568 err = 0;
569 au_set_mmapped(file);
570 au_vm_file_reset(vma, h_file);
571 /*
572 * we cannot call security_mmap_file() here since it may acquire
573 * mmap_sem or i_mutex.
574 *
575 * err = security_mmap_file(h_file, au_prot_conv(vma->vm_flags),
576 * au_flag_conv(vma->vm_flags));
577 */
578 if (!err)
579 err = h_file->f_op->mmap(h_file, vma);
580 if (!err) {
581 au_vm_prfile_set(vma, file);
582 fsstack_copy_attr_atime(inode, file_inode(h_file));
583 goto out_fput; /* success */
584 }
585 au_unset_mmapped(file);
586 au_vm_file_reset(vma, file);
587
588 out_fput:
589 lockdep_off();
590 ii_write_unlock(inode);
591 lockdep_on();
592 fput(h_file);
593 out:
594 lockdep_off();
595 si_read_unlock(sb);
596 lockdep_on();
597 AuTraceErr(err);
598 return err;
599 }
600
601 /* ---------------------------------------------------------------------- */
602
603 static int aufs_fsync_nondir(struct file *file, loff_t start, loff_t end,
604 int datasync)
605 {
606 int err;
607 struct au_write_pre wpre;
608 struct inode *inode;
609 struct file *h_file;
610
611 err = 0; /* -EBADF; */ /* posix? */
612 if (unlikely(!(file->f_mode & FMODE_WRITE)))
613 goto out;
614
615 inode = file_inode(file);
616 au_mtx_and_read_lock(inode);
617
618 h_file = au_write_pre(file, /*do_ready*/1, &wpre);
619 err = PTR_ERR(h_file);
620 if (IS_ERR(h_file))
621 goto out_unlock;
622
623 err = vfsub_fsync(h_file, &h_file->f_path, datasync);
624 au_write_post(inode, h_file, &wpre, /*written*/0);
625
626 out_unlock:
627 si_read_unlock(inode->i_sb);
628 inode_unlock(inode);
629 out:
630 return err;
631 }
632
633 static int aufs_fasync(int fd, struct file *file, int flag)
634 {
635 int err;
636 struct file *h_file;
637 struct super_block *sb;
638
639 sb = file->f_path.dentry->d_sb;
640 si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
641
642 h_file = au_read_pre(file, /*keep_fi*/0);
643 err = PTR_ERR(h_file);
644 if (IS_ERR(h_file))
645 goto out;
646
647 if (h_file->f_op->fasync)
648 err = h_file->f_op->fasync(fd, h_file, flag);
649 fput(h_file); /* instead of au_read_post() */
650
651 out:
652 si_read_unlock(sb);
653 return err;
654 }
655
656 static int aufs_setfl(struct file *file, unsigned long arg)
657 {
658 int err;
659 struct file *h_file;
660 struct super_block *sb;
661
662 sb = file->f_path.dentry->d_sb;
663 si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
664
665 h_file = au_read_pre(file, /*keep_fi*/0);
666 err = PTR_ERR(h_file);
667 if (IS_ERR(h_file))
668 goto out;
669
670 arg |= vfsub_file_flags(file) & FASYNC; /* stop calling h_file->fasync */
671 err = setfl(/*unused fd*/-1, h_file, arg);
672 fput(h_file); /* instead of au_read_post() */
673
674 out:
675 si_read_unlock(sb);
676 return err;
677 }
678
679 /* ---------------------------------------------------------------------- */
680
/* no one supports this operation, currently */
#if 0
/*
 * Disabled placeholder for ->sendpage().  It returns an explicit error so
 * that the function has a defined return value if the block is ever
 * enabled before a real implementation is written.
 * NOTE(review): -EINVAL is an assumption, chosen to look like a file
 * without ->sendpage() -- confirm when implementing.
 */
static ssize_t aufs_sendpage(struct file *file, struct page *page, int offset,
			     size_t len, loff_t *pos, int more)
{
	return -EINVAL;
}
#endif
688
689 /* ---------------------------------------------------------------------- */
690
691 const struct file_operations aufs_file_fop = {
692 .owner = THIS_MODULE,
693
694 .llseek = default_llseek,
695
696 .read = aufs_read,
697 .write = aufs_write,
698 .read_iter = aufs_read_iter,
699 .write_iter = aufs_write_iter,
700
701 #ifdef CONFIG_AUFS_POLL
702 .poll = aufs_poll,
703 #endif
704 .unlocked_ioctl = aufs_ioctl_nondir,
705 #ifdef CONFIG_COMPAT
706 .compat_ioctl = aufs_compat_ioctl_nondir,
707 #endif
708 .mmap = aufs_mmap,
709 .open = aufs_open_nondir,
710 .flush = aufs_flush_nondir,
711 .release = aufs_release_nondir,
712 .fsync = aufs_fsync_nondir,
713 .fasync = aufs_fasync,
714 /* .sendpage = aufs_sendpage, */
715 .setfl = aufs_setfl,
716 .splice_write = aufs_splice_write,
717 .splice_read = aufs_splice_read,
718 #if 0
719 .aio_splice_write = aufs_aio_splice_write,
720 .aio_splice_read = aufs_aio_splice_read,
721 #endif
722 .fallocate = aufs_fallocate
723 };