diff --git a/fs/super.c b/fs/super.c
index 7ff1349609e4874a35268876065490d77f5e01ab..5c9fa51c133fad7ea040e2e5f4d9f76f6c6d3347 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -120,13 +120,23 @@ static unsigned long super_cache_count(struct shrinker *shrink,
        sb = container_of(shrink, struct super_block, s_shrink);
 
        /*
-        * Don't call trylock_super as it is a potential
-        * scalability bottleneck. The counts could get updated
-        * between super_cache_count and super_cache_scan anyway.
-        * Call to super_cache_count with shrinker_rwsem held
-        * ensures the safety of call to list_lru_shrink_count() and
-        * s_op->nr_cached_objects().
+        * We don't call trylock_super() here as it is a scalability bottleneck,
+        * so we're exposed to partial setup state. The shrinker rwsem does not
+        * protect filesystem operations backing list_lru_shrink_count() or
+        * s_op->nr_cached_objects(). Counts can change between
+        * super_cache_count and super_cache_scan, so we really don't need locks
+        * here.
+        *
+        * However, if we are currently mounting the superblock, the underlying
+        * filesystem might be in a state of partial construction and hence it
+        * is dangerous to access it.  trylock_super() uses a SB_BORN check to
+        * avoid this situation, so do the same here. The memory barrier is
+        * matched with the one in mount_fs() as we don't hold locks here.
         */
+       if (!(sb->s_flags & SB_BORN))
+               return 0;
+       smp_rmb();
+
        if (sb->s_op && sb->s_op->nr_cached_objects)
                total_objects = sb->s_op->nr_cached_objects(sb, sc);
 
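The SB_BORN check and smp_rmb() added above pair with the smp_wmb() added in the mount_fs() hunk at the end of this diff: the writer finishes setting up the superblock, issues a write barrier, and only then sets SB_BORN; the lockless reader tests SB_BORN, issues a read barrier, and only then dereferences the structure. Below is a minimal userspace analogue of that publication pattern, using C11 release/acquire instead of the raw kernel barriers; the struct and field names (sb_like, born, nr_cached) are invented for illustration and are not kernel types.

#include <stdatomic.h>
#include <stdbool.h>

struct sb_like {                        /* illustrative stand-in, not struct super_block */
        long nr_cached;                 /* data set up before publication */
        atomic_bool born;               /* plays the role of SB_BORN */
};

/* Writer side, mirroring mount_fs(): finish setup, then publish. */
static void publish(struct sb_like *sb, long nr)
{
        sb->nr_cached = nr;
        /* release store = "smp_wmb(); set flag" in the kernel idiom */
        atomic_store_explicit(&sb->born, true, memory_order_release);
}

/* Reader side, mirroring super_cache_count(): bail out if not yet born. */
static long count_objects(struct sb_like *sb)
{
        /* acquire load = "test flag; smp_rmb()" in the kernel idiom */
        if (!atomic_load_explicit(&sb->born, memory_order_acquire))
                return 0;               /* partially constructed: don't touch it */
        return sb->nr_cached;           /* ordered after the flag check */
}
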
@@ -166,6 +176,7 @@ static void destroy_unused_super(struct super_block *s)
        security_sb_free(s);
        put_user_ns(s->s_user_ns);
        kfree(s->s_subtype);
+       free_prealloced_shrinker(&s->s_shrink);
        /* no delays needed */
        destroy_super_work(&s->destroy_work);
 }
@@ -251,6 +262,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
        s->s_shrink.count_objects = super_cache_count;
        s->s_shrink.batch = 1024;
        s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
+       if (prealloc_shrinker(&s->s_shrink))
+               goto fail;
        return s;
 
 fail:
@@ -517,7 +530,7 @@ retry:
        hlist_add_head(&s->s_instances, &type->fs_supers);
        spin_unlock(&sb_lock);
        get_filesystem(type);
-       register_shrinker(&s->s_shrink);
+       register_shrinker_prepared(&s->s_shrink);
        return s;
 }
 
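The three hunks above split shrinker setup into two phases: prealloc_shrinker() does the allocation that can fail while the superblock is still private to alloc_super(), register_shrinker_prepared() makes the shrinker live only after the superblock has been hashed, and free_prealloced_shrinker() covers superblocks that are thrown away before ever being registered. The point is that register_shrinker() allocates the per-node counters for a NUMA-aware shrinker and can therefore fail, but by the time it was called here the superblock was already visible and the error was effectively ignored; moving the fallible step earlier leaves nothing on the registration path that can fail. A rough userspace sketch of the same two-phase pattern follows; cache_prealloc/cache_activate/cache_discard are made-up names, not kernel APIs.

#include <stdlib.h>
#include <stdbool.h>

struct cache {                          /* illustrative stand-in for the superblock */
        long *per_node_counts;          /* the allocation that can fail */
        bool live;
};

/* Phase 1: do every fallible step while the object is still private. */
static int cache_prealloc(struct cache *c, int nr_nodes)
{
        c->per_node_counts = calloc(nr_nodes, sizeof(*c->per_node_counts));
        return c->per_node_counts ? 0 : -1;     /* caller can still bail out cleanly */
}

/* Phase 2: make it visible; nothing on this path is allowed to fail. */
static void cache_activate(struct cache *c)
{
        c->live = true;                 /* e.g. link it onto a global list */
}

/* Teardown for an object that was preallocated but never activated. */
static void cache_discard(struct cache *c)
{
        free(c->per_node_counts);
        c->per_node_counts = NULL;
}
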
@@ -1074,6 +1087,23 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
        if (IS_ERR(bdev))
                return ERR_CAST(bdev);
 
+       if (current_user_ns() != &init_user_ns) {
+               /*
+                * For userns mounts, disallow mounting if bdev is open for
+                * writing
+                */
+               if (!atomic_dec_unless_positive(&bdev->bd_inode->i_writecount)) {
+                       error = -EBUSY;
+                       goto error_bdev;
+               }
+               if (bdev->bd_contains != bdev &&
+                   !atomic_dec_unless_positive(&bdev->bd_contains->bd_inode->i_writecount)) {
+                       atomic_inc(&bdev->bd_inode->i_writecount);
+                       error = -EBUSY;
+                       goto error_bdev;
+               }
+       }
+
        /*
         * once the super is inserted into the list by sget, s_umount
         * will protect the lockfs code from trying to start a snapshot
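For mounts from a non-initial user namespace, the block added above takes what amounts to a deny-write reference on the backing device inode: i_writecount is positive while writers hold the inode open and negative while deny-write holders do, so atomic_dec_unless_positive() succeeds only when no writer is present and at the same time blocks future writers. When mounting a partition, the same is done on the whole-disk inode (bd_contains), and the first decrement is undone if the second one fails. The sketch below shows the primitive itself as a standalone C11 compare-exchange loop; it is only illustrative, the kernel's real helper is atomic_dec_unless_positive().

#include <stdatomic.h>
#include <stdbool.h>

/*
 * Decrement *v unless its current value is positive; return true if the
 * decrement happened.  With i_writecount semantics: succeed, and thereby
 * deny future writers, only while nobody has the inode open for writing.
 */
static bool dec_unless_positive(atomic_int *v)
{
        int cur = atomic_load(v);

        do {
                if (cur > 0)
                        return false;   /* a writer is active: refuse */
        } while (!atomic_compare_exchange_weak(v, &cur, cur - 1));

        return true;                    /* one more deny-write holder */
}

/* Dropping the deny-write reference is a plain increment. */
static void allow_writers_again(atomic_int *v)
{
        atomic_fetch_add(v, 1);
}

The error_inc label introduced in the later hunks and the kill_block_super() hunk are the matching increments that drop these deny-write references again on mount failure and on unmount respectively.
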
@@ -1083,7 +1113,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
        if (bdev->bd_fsfreeze_count > 0) {
                mutex_unlock(&bdev->bd_fsfreeze_mutex);
                error = -EBUSY;
-               goto error_bdev;
+               goto error_inc;
        }
        s = sget(fs_type, test_bdev_super, set_bdev_super, flags | SB_NOSEC,
                 bdev);
@@ -1095,7 +1125,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
                if ((flags ^ s->s_flags) & SB_RDONLY) {
                        deactivate_locked_super(s);
                        error = -EBUSY;
-                       goto error_bdev;
+                       goto error_inc;
                }
 
                /*
@@ -1126,6 +1156,12 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 
 error_s:
        error = PTR_ERR(s);
+error_inc:
+       if (current_user_ns() != &init_user_ns) {
+               atomic_inc(&bdev->bd_inode->i_writecount);
+               if (bdev->bd_contains != bdev)
+                       atomic_inc(&bdev->bd_contains->bd_inode->i_writecount);
+       }
 error_bdev:
        blkdev_put(bdev, mode);
 error:
@@ -1142,6 +1178,11 @@ void kill_block_super(struct super_block *sb)
        generic_shutdown_super(sb);
        sync_blockdev(bdev);
        WARN_ON_ONCE(!(mode & FMODE_EXCL));
+       if (sb->s_user_ns != &init_user_ns) {
+               atomic_inc(&bdev->bd_inode->i_writecount);
+               if (bdev->bd_contains != bdev)
+                       atomic_inc(&bdev->bd_contains->bd_inode->i_writecount);
+       }
        blkdev_put(bdev, mode | FMODE_EXCL);
 }
 
@@ -1223,6 +1264,14 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
        sb = root->d_sb;
        BUG_ON(!sb);
        WARN_ON(!sb->s_bdi);
+
+       /*
+        * Write barrier is for super_cache_count(). We place it before setting
+        * SB_BORN as the data dependency between the two functions is the
+        * superblock structure contents that we just set up, not the SB_BORN
+        * flag.
+        */
+       smp_wmb();
        sb->s_flags |= SB_BORN;
 
        error = security_sb_kern_mount(sb, flags, secdata);