ext4: Avoid races caused by on-line resizing and SMP memory reordering

[mirror_ubuntu-bionic-kernel.git] / fs / ext4 / super.c
diff --git a/fs/ext4/super.c b/fs/ext4/super.c

index 39d1993cfa1370c3718dffd95231660f3dc606b4..fcd7b24c6df37a3f0b67a7936cced18a26b652ff 100644 (file)
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -20,6 +20,7 @@
  #include <linux/string.h>
  #include <linux/fs.h>
  #include <linux/time.h>
+#include <linux/vmalloc.h>
  #include <linux/jbd2.h>
  #include <linux/slab.h>
  #include <linux/init.h>
@@ -35,6 +36,7 @@
  #include <linux/quotaops.h>
  #include <linux/seq_file.h>
  #include <linux/proc_fs.h>
+#include <linux/ctype.h>
  #include <linux/marker.h>
  #include <linux/log2.h>
  #include <linux/crc16.h>
@@ -48,11 +50,11 @@
  #include "group.h"
  
  struct proc_dir_entry *ext4_proc_root;
+static struct kset *ext4_kset;
  
  static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
                              unsigned long journal_devnum);
-static int ext4_commit_super(struct super_block *sb,
-                             struct ext4_super_block *es, int sync);
+static int ext4_commit_super(struct super_block *sb, int sync);
  static void ext4_mark_recovery_complete(struct super_block *sb,
                                         struct ext4_super_block *es);
  static void ext4_clear_journal_err(struct super_block *sb,
@@ -303,7 +305,7 @@ static void ext4_handle_error(struct super_block *sb)
                 printk(KERN_CRIT "Remounting filesystem read-only\n");
                 sb->s_flags |= MS_RDONLY;
         }
-       ext4_commit_super(sb, es, 1);
+       ext4_commit_super(sb, 1);
         if (test_opt(sb, ERRORS_PANIC))
                 panic("EXT4-fs (device %s): panic forced after error\n",
                         sb->s_id);
@@ -445,7 +447,7 @@ __acquires(bitlock)
         if (test_opt(sb, ERRORS_CONT)) {
                 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
                 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-               ext4_commit_super(sb, es, 0);
+               ext4_commit_super(sb, 0);
                 return;
         }
         ext4_unlock_group(sb, grp);
@@ -574,17 +576,20 @@ static void ext4_put_super(struct super_block *sb)
         if (!(sb->s_flags & MS_RDONLY)) {
                 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
                 es->s_state = cpu_to_le16(sbi->s_mount_state);
-               ext4_commit_super(sb, es, 1);
+               ext4_commit_super(sb, 1);
         }
         if (sbi->s_proc) {
-               remove_proc_entry("inode_readahead_blks", sbi->s_proc);
                 remove_proc_entry(sb->s_id, ext4_proc_root);
         }
+       kobject_del(&sbi->s_kobj);
  
         for (i = 0; i < sbi->s_gdb_count; i++)
                 brelse(sbi->s_group_desc[i]);
         kfree(sbi->s_group_desc);
-       kfree(sbi->s_flex_groups);
+       if (is_vmalloc_addr(sbi->s_flex_groups))
+               vfree(sbi->s_flex_groups);
+       else
+               kfree(sbi->s_flex_groups);
         percpu_counter_destroy(&sbi->s_freeblocks_counter);
         percpu_counter_destroy(&sbi->s_freeinodes_counter);
         percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -615,6 +620,17 @@ static void ext4_put_super(struct super_block *sb)
                 ext4_blkdev_remove(sbi);
         }
         sb->s_fs_info = NULL;
+       /*
+        * Now that we are completely done shutting down the
+        * superblock, we need to actually destroy the kobject.
+        */
+       unlock_kernel();
+       unlock_super(sb);
+       kobject_put(&sbi->s_kobj);
+       wait_for_completion(&sbi->s_kobj_unregister);
+       lock_super(sb);
+       lock_kernel();
+       kfree(sbi->s_blockgroup_lock);
         kfree(sbi);
         return;
  }
@@ -803,8 +819,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
         if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
                 seq_puts(seq, ",noacl");
  #endif
-       if (!test_opt(sb, RESERVATION))
-               seq_puts(seq, ",noreservation");
         if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
                 seq_printf(seq, ",commit=%u",
                            (unsigned) (sbi->s_commit_interval / HZ));
@@ -855,6 +869,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
         if (test_opt(sb, DATA_ERR_ABORT))
                 seq_puts(seq, ",data_err=abort");
  
+       if (test_opt(sb, NO_AUTO_DA_ALLOC))
+               seq_puts(seq, ",noauto_da_alloc");
+
         ext4_show_quota_options(seq, sb);
         return 0;
  }
@@ -926,8 +943,6 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_
  #define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
  #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
  
-static int ext4_dquot_initialize(struct inode *inode, int type);
-static int ext4_dquot_drop(struct inode *inode);
  static int ext4_write_dquot(struct dquot *dquot);
  static int ext4_acquire_dquot(struct dquot *dquot);
  static int ext4_release_dquot(struct dquot *dquot);
@@ -942,9 +957,13 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
                                 const char *data, size_t len, loff_t off);
  
  static struct dquot_operations ext4_quota_operations = {
-       .initialize     = ext4_dquot_initialize,
-       .drop           = ext4_dquot_drop,
+       .initialize     = dquot_initialize,
+       .drop           = dquot_drop,
         .alloc_space    = dquot_alloc_space,
+       .reserve_space  = dquot_reserve_space,
+       .claim_space    = dquot_claim_space,
+       .release_rsv    = dquot_release_reserved_space,
+       .get_reserved_space = ext4_get_reserved_space,
         .alloc_inode    = dquot_alloc_inode,
         .free_space     = dquot_free_space,
         .free_inode     = dquot_free_inode,
@@ -976,7 +995,6 @@ static const struct super_operations ext4_sops = {
         .dirty_inode    = ext4_dirty_inode,
         .delete_inode   = ext4_delete_inode,
         .put_super      = ext4_put_super,
-       .write_super    = ext4_write_super,
         .sync_fs        = ext4_sync_fs,
         .freeze_fs      = ext4_freeze,
         .unfreeze_fs    = ext4_unfreeze,
@@ -991,6 +1009,25 @@ static const struct super_operations ext4_sops = {
         .bdev_try_to_free_page = bdev_try_to_free_page,
  };
  
+static const struct super_operations ext4_nojournal_sops = {
+       .alloc_inode    = ext4_alloc_inode,
+       .destroy_inode  = ext4_destroy_inode,
+       .write_inode    = ext4_write_inode,
+       .dirty_inode    = ext4_dirty_inode,
+       .delete_inode   = ext4_delete_inode,
+       .write_super    = ext4_write_super,
+       .put_super      = ext4_put_super,
+       .statfs         = ext4_statfs,
+       .remount_fs     = ext4_remount,
+       .clear_inode    = ext4_clear_inode,
+       .show_options   = ext4_show_options,
+#ifdef CONFIG_QUOTA
+       .quota_read     = ext4_quota_read,
+       .quota_write    = ext4_quota_write,
+#endif
+       .bdev_try_to_free_page = bdev_try_to_free_page,
+};
+
  static const struct export_operations ext4_export_ops = {
         .fh_to_dentry = ext4_fh_to_dentry,
         .fh_to_parent = ext4_fh_to_parent,
@@ -1002,7 +1039,7 @@ enum {
         Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
         Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
         Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
-       Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
+       Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh,
         Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
         Opt_journal_update, Opt_journal_dev,
         Opt_journal_checksum, Opt_journal_async_commit,
@@ -1010,8 +1047,8 @@ enum {
         Opt_data_err_abort, Opt_data_err_ignore,
         Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
         Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
-       Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
-       Opt_grpquota, Opt_i_version,
+       Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
+       Opt_usrquota, Opt_grpquota, Opt_i_version,
         Opt_stripe, Opt_delalloc, Opt_nodelalloc,
         Opt_inode_readahead_blks, Opt_journal_ioprio
  };
@@ -1037,8 +1074,6 @@ static const match_table_t tokens = {
         {Opt_nouser_xattr, "nouser_xattr"},
         {Opt_acl, "acl"},
         {Opt_noacl, "noacl"},
-       {Opt_reservation, "reservation"},
-       {Opt_noreservation, "noreservation"},
         {Opt_noload, "noload"},
         {Opt_nobh, "nobh"},
         {Opt_bh, "bh"},
@@ -1066,6 +1101,8 @@ static const match_table_t tokens = {
         {Opt_quota, "quota"},
         {Opt_usrquota, "usrquota"},
         {Opt_barrier, "barrier=%u"},
+       {Opt_barrier, "barrier"},
+       {Opt_nobarrier, "nobarrier"},
         {Opt_i_version, "i_version"},
         {Opt_stripe, "stripe=%u"},
         {Opt_resize, "resize"},
@@ -1073,6 +1110,9 @@ static const match_table_t tokens = {
         {Opt_nodelalloc, "nodelalloc"},
         {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
         {Opt_journal_ioprio, "journal_ioprio=%u"},
+       {Opt_auto_da_alloc, "auto_da_alloc=%u"},
+       {Opt_auto_da_alloc, "auto_da_alloc"},
+       {Opt_noauto_da_alloc, "noauto_da_alloc"},
         {Opt_err, NULL},
  };
  
@@ -1205,12 +1245,6 @@ static int parse_options(char *options, struct super_block *sb,
                                "not supported\n");
                         break;
  #endif
-               case Opt_reservation:
-                       set_opt(sbi->s_mount_opt, RESERVATION);
-                       break;
-               case Opt_noreservation:
-                       clear_opt(sbi->s_mount_opt, RESERVATION);
-                       break;
                 case Opt_journal_update:
                         /* @@@ FIXME */
                         /* Eventually we will want to be able to create
@@ -1413,9 +1447,14 @@ set_qf_format:
                 case Opt_abort:
                         set_opt(sbi->s_mount_opt, ABORT);
                         break;
+               case Opt_nobarrier:
+                       clear_opt(sbi->s_mount_opt, BARRIER);
+                       break;
                 case Opt_barrier:
-                       if (match_int(&args[0], &option))
-                               return 0;
+                       if (match_int(&args[0], &option)) {
+                               set_opt(sbi->s_mount_opt, BARRIER);
+                               break;
+                       }
                         if (option)
                                 set_opt(sbi->s_mount_opt, BARRIER);
                         else
@@ -1461,6 +1500,11 @@ set_qf_format:
                                 return 0;
                         if (option < 0 || option > (1 << 30))
                                 return 0;
+                       if (!is_power_of_2(option)) {
+                               printk(KERN_ERR "EXT4-fs: inode_readahead_blks"
+                                      " must be a power of 2\n");
+                               return 0;
+                       }
                         sbi->s_inode_readahead_blks = option;
                         break;
                 case Opt_journal_ioprio:
@@ -1471,6 +1515,19 @@ set_qf_format:
                         *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
                                                             option);
                         break;
+               case Opt_noauto_da_alloc:
+                       set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
+                       break;
+               case Opt_auto_da_alloc:
+                       if (match_int(&args[0], &option)) {
+                               clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
+                               break;
+                       }
+                       if (option)
+                               clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
+                       else
+                               set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
+                       break;
                 default:
                         printk(KERN_ERR
                                "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1556,7 +1613,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
         if (sbi->s_journal)
                 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
  
-       ext4_commit_super(sb, es, 1);
+       ext4_commit_super(sb, 1);
         if (test_opt(sb, DEBUG))
                 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
                                 "bpg=%lu, ipg=%lu, mo=%04lx]\n",
@@ -1584,6 +1641,7 @@ static int ext4_fill_flex_info(struct super_block *sb)
         ext4_group_t flex_group_count;
         ext4_group_t flex_group;
         int groups_per_flex = 0;
+       size_t size;
         int i;
  
         if (!sbi->s_es->s_log_groups_per_flex) {
@@ -1598,8 +1656,13 @@ static int ext4_fill_flex_info(struct super_block *sb)
         flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
                         ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
                               EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
-       sbi->s_flex_groups = kzalloc(flex_group_count *
-                                    sizeof(struct flex_groups), GFP_KERNEL);
+       size = flex_group_count * sizeof(struct flex_groups);
+       sbi->s_flex_groups = kzalloc(size, GFP_KERNEL);
+       if (sbi->s_flex_groups == NULL) {
+               sbi->s_flex_groups = vmalloc(size);
+               if (sbi->s_flex_groups)
+                       memset(sbi->s_flex_groups, 0, size);
+       }
         if (sbi->s_flex_groups == NULL) {
                 printk(KERN_ERR "EXT4-fs: not enough memory for "
                                 "%u flex groups\n", flex_group_count);
@@ -1610,10 +1673,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
                 gdp = ext4_get_group_desc(sb, i, &bh);
  
                 flex_group = ext4_flex_group(sbi, i);
-               sbi->s_flex_groups[flex_group].free_inodes +=
-                       ext4_free_inodes_count(sb, gdp);
-               sbi->s_flex_groups[flex_group].free_blocks +=
-                       ext4_free_blks_count(sb, gdp);
+               atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
+                          ext4_free_inodes_count(sb, gdp));
+               atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
+                          ext4_free_blks_count(sb, gdp));
+               atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
+                          ext4_used_dirs_count(sb, gdp));
         }
  
         return 1;
@@ -1802,7 +1867,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
                 }
  
                 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
-               DQUOT_INIT(inode);
+               vfs_dq_init(inode);
                 if (inode->i_nlink) {
                         printk(KERN_DEBUG
                                 "%s: truncating inode %lu to %lld bytes\n",
@@ -1989,6 +2054,180 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
         return 0;
  }
  
+/* sysfs supprt */
+
+struct ext4_attr {
+       struct attribute attr;
+       ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
+       ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, 
+                        const char *, size_t);
+       int offset;
+};
+
+static int parse_strtoul(const char *buf,
+               unsigned long max, unsigned long *value)
+{
+       char *endp;
+
+       while (*buf && isspace(*buf))
+               buf++;
+       *value = simple_strtoul(buf, &endp, 0);
+       while (*endp && isspace(*endp))
+               endp++;
+       if (*endp || *value > max)
+               return -EINVAL;
+
+       return 0;
+}
+
+static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
+                                             struct ext4_sb_info *sbi,
+                                             char *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "%llu\n",
+                       (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+}
+
+static ssize_t session_write_kbytes_show(struct ext4_attr *a,
+                                        struct ext4_sb_info *sbi, char *buf)
+{
+       struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+       return snprintf(buf, PAGE_SIZE, "%lu\n",
+                       (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+                        sbi->s_sectors_written_start) >> 1);
+}
+
+static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
+                                         struct ext4_sb_info *sbi, char *buf)
+{
+       struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+       return snprintf(buf, PAGE_SIZE, "%llu\n",
+                       sbi->s_kbytes_written + 
+                       ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+                         EXT4_SB(sb)->s_sectors_written_start) >> 1));
+}
+
+static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
+                                         struct ext4_sb_info *sbi,
+                                         const char *buf, size_t count)
+{
+       unsigned long t;
+
+       if (parse_strtoul(buf, 0x40000000, &t))
+               return -EINVAL;
+
+       if (!is_power_of_2(t))
+               return -EINVAL;
+
+       sbi->s_inode_readahead_blks = t;
+       return count;
+}
+
+static ssize_t sbi_ui_show(struct ext4_attr *a,
+                               struct ext4_sb_info *sbi, char *buf)
+{
+       unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+
+       return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
+}
+
+static ssize_t sbi_ui_store(struct ext4_attr *a,
+                           struct ext4_sb_info *sbi,
+                           const char *buf, size_t count)
+{
+       unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+       unsigned long t;
+
+       if (parse_strtoul(buf, 0xffffffff, &t))
+               return -EINVAL;
+       *ui = t;
+       return count;
+}
+
+#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
+static struct ext4_attr ext4_attr_##_name = {                  \
+       .attr = {.name = __stringify(_name), .mode = _mode },   \
+       .show   = _show,                                        \
+       .store  = _store,                                       \
+       .offset = offsetof(struct ext4_sb_info, _elname),       \
+}
+#define EXT4_ATTR(name, mode, show, store) \
+static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
+
+#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
+#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
+#define EXT4_RW_ATTR_SBI_UI(name, elname)      \
+       EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
+#define ATTR_LIST(name) &ext4_attr_##name.attr
+
+EXT4_RO_ATTR(delayed_allocation_blocks);
+EXT4_RO_ATTR(session_write_kbytes);
+EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
+                inode_readahead_blks_store, s_inode_readahead_blks);
+EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
+EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
+EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
+EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+
+static struct attribute *ext4_attrs[] = {
+       ATTR_LIST(delayed_allocation_blocks),
+       ATTR_LIST(session_write_kbytes),
+       ATTR_LIST(lifetime_write_kbytes),
+       ATTR_LIST(inode_readahead_blks),
+       ATTR_LIST(mb_stats),
+       ATTR_LIST(mb_max_to_scan),
+       ATTR_LIST(mb_min_to_scan),
+       ATTR_LIST(mb_order2_req),
+       ATTR_LIST(mb_stream_req),
+       ATTR_LIST(mb_group_prealloc),
+       NULL,
+};
+
+static ssize_t ext4_attr_show(struct kobject *kobj,
+                             struct attribute *attr, char *buf)
+{
+       struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+                                               s_kobj);
+       struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+
+       return a->show ? a->show(a, sbi, buf) : 0;
+}
+
+static ssize_t ext4_attr_store(struct kobject *kobj,
+                              struct attribute *attr,
+                              const char *buf, size_t len)
+{
+       struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+                                               s_kobj);
+       struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+
+       return a->store ? a->store(a, sbi, buf, len) : 0;
+}
+
+static void ext4_sb_release(struct kobject *kobj)
+{
+       struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+                                               s_kobj);
+       complete(&sbi->s_kobj_unregister);
+}
+
+
+static struct sysfs_ops ext4_attr_ops = {
+       .show   = ext4_attr_show,
+       .store  = ext4_attr_store,
+};
+
+static struct kobj_type ext4_ktype = {
+       .default_attrs  = ext4_attrs,
+       .sysfs_ops      = &ext4_attr_ops,
+       .release        = ext4_sb_release,
+};
+
  static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                                 __releases(kernel_lock)
                                 __acquires(kernel_lock)
@@ -2019,12 +2258,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
         if (!sbi)
                 return -ENOMEM;
+
+       sbi->s_blockgroup_lock =
+               kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+       if (!sbi->s_blockgroup_lock) {
+               kfree(sbi);
+               return -ENOMEM;
+       }
         sb->s_fs_info = sbi;
         sbi->s_mount_opt = 0;
         sbi->s_resuid = EXT4_DEF_RESUID;
         sbi->s_resgid = EXT4_DEF_RESGID;
         sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
         sbi->s_sb_block = sb_block;
+       sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part,
+                                                     sectors[1]);
  
         unlock_kernel();
  
@@ -2062,6 +2310,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         sb->s_magic = le16_to_cpu(es->s_magic);
         if (sb->s_magic != EXT4_SUPER_MAGIC)
                 goto cantfind_ext4;
+       sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
  
         /* Set defaults before we parse the mount options */
         def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -2099,7 +2348,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
         sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
  
-       set_opt(sbi->s_mount_opt, RESERVATION);
         set_opt(sbi->s_mount_opt, BARRIER);
  
         /*
@@ -2286,6 +2534,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
                 goto cantfind_ext4;
  
+       /* check blocks count against device size */
+       blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
+       if (blocks_count && ext4_blocks_count(es) > blocks_count) {
+               printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu "
+                      "exceeds size of device (%llu blocks)\n",
+                      ext4_blocks_count(es), blocks_count);
+               goto failed_mount;
+       }
+
          /*
           * It makes no sense for the first data block to be beyond the end
           * of the filesystem.
@@ -2323,14 +2580,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
  #ifdef CONFIG_PROC_FS
         if (ext4_proc_root)
                 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
-
-       if (sbi->s_proc)
-               proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
-                                &ext4_ui_proc_fops,
-                                &sbi->s_inode_readahead_blks);
  #endif
  
-       bgl_lock_init(&sbi->s_blockgroup_lock);
+       bgl_lock_init(sbi->s_blockgroup_lock);
  
         for (i = 0; i < db_count; i++) {
                 block = descriptor_loc(sb, logical_sb_block, i);
@@ -2381,7 +2633,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         /*
          * set up enough so that it can read an inode
          */
-       sb->s_op = &ext4_sops;
+       if (!test_opt(sb, NOLOAD) &&
+           EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+               sb->s_op = &ext4_sops;
+       else
+               sb->s_op = &ext4_nojournal_sops;
         sb->s_export_op = &ext4_export_ops;
         sb->s_xattr = ext4_xattr_handlers;
  #ifdef CONFIG_QUOTA
@@ -2420,7 +2676,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                         if (test_opt(sb, ERRORS_PANIC)) {
                                 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
                                 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-                               ext4_commit_super(sb, es, 1);
+                               ext4_commit_super(sb, 1);
                                 goto failed_mount4;
                         }
                 }
@@ -2562,6 +2818,16 @@ no_journal:
                 goto failed_mount4;
         }
  
+       sbi->s_kobj.kset = ext4_kset;
+       init_completion(&sbi->s_kobj_unregister);
+       err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
+                                  "%s", sb->s_id);
+       if (err) {
+               ext4_mb_release(sb);
+               ext4_ext_release(sb);
+               goto failed_mount4;
+       };
+
         /*
          * akpm: core read_super() calls in here with the superblock locked.
          * That deadlocks, because orphan cleanup needs to lock the superblock
@@ -2606,6 +2872,12 @@ failed_mount4:
                 sbi->s_journal = NULL;
         }
  failed_mount3:
+       if (sbi->s_flex_groups) {
+               if (is_vmalloc_addr(sbi->s_flex_groups))
+                       vfree(sbi->s_flex_groups);
+               else
+                       kfree(sbi->s_flex_groups);
+       }
         percpu_counter_destroy(&sbi->s_freeblocks_counter);
         percpu_counter_destroy(&sbi->s_freeinodes_counter);
         percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -2616,7 +2888,6 @@ failed_mount2:
         kfree(sbi->s_group_desc);
  failed_mount:
         if (sbi->s_proc) {
-               remove_proc_entry("inode_readahead_blks", sbi->s_proc);
                 remove_proc_entry(sb->s_id, ext4_proc_root);
         }
  #ifdef CONFIG_QUOTA
@@ -2879,18 +3150,17 @@ static int ext4_load_journal(struct super_block *sb,
         if (journal_devnum &&
             journal_devnum != le32_to_cpu(es->s_journal_dev)) {
                 es->s_journal_dev = cpu_to_le32(journal_devnum);
-               sb->s_dirt = 1;
  
                 /* Make sure we flush the recovery flag to disk. */
-               ext4_commit_super(sb, es, 1);
+               ext4_commit_super(sb, 1);
         }
  
         return 0;
  }
  
-static int ext4_commit_super(struct super_block *sb,
-                             struct ext4_super_block *es, int sync)
+static int ext4_commit_super(struct super_block *sb, int sync)
  {
+       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
         struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
         int error = 0;
  
@@ -2911,11 +3181,15 @@ static int ext4_commit_super(struct super_block *sb,
                 set_buffer_uptodate(sbh);
         }
         es->s_wtime = cpu_to_le32(get_seconds());
+       es->s_kbytes_written =
+               cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 
+                           ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+                             EXT4_SB(sb)->s_sectors_written_start) >> 1));
         ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
                                         &EXT4_SB(sb)->s_freeblocks_counter));
         es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
                                         &EXT4_SB(sb)->s_freeinodes_counter));
-
+       sb->s_dirt = 0;
         BUFFER_TRACE(sbh, "marking dirty");
         mark_buffer_dirty(sbh);
         if (sync) {
@@ -2957,8 +3231,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
         if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
             sb->s_flags & MS_RDONLY) {
                 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-               sb->s_dirt = 0;
-               ext4_commit_super(sb, es, 1);
+               ext4_commit_super(sb, 1);
         }
         unlock_super(sb);
  
@@ -2999,7 +3272,7 @@ static void ext4_clear_journal_err(struct super_block *sb,
  
                 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
                 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-               ext4_commit_super(sb, es, 1);
+               ext4_commit_super(sb, 1);
  
                 jbd2_journal_clear_err(journal);
         }
@@ -3018,29 +3291,15 @@ int ext4_force_commit(struct super_block *sb)
                 return 0;
  
         journal = EXT4_SB(sb)->s_journal;
-       if (journal) {
-               sb->s_dirt = 0;
+       if (journal)
                 ret = ext4_journal_force_commit(journal);
-       }
  
         return ret;
  }
  
-/*
- * Ext4 always journals updates to the superblock itself, so we don't
- * have to propagate any other updates to the superblock on disk at this
- * point.  (We can probably nuke this function altogether, and remove
- * any mention to sb->s_dirt in all of fs/ext4; eventual cleanup...)
- */
  static void ext4_write_super(struct super_block *sb)
  {
-       if (EXT4_SB(sb)->s_journal) {
-               if (mutex_trylock(&sb->s_lock) != 0)
-                       BUG();
-               sb->s_dirt = 0;
-       } else {
-               ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
-       }
+       ext4_commit_super(sb, 1);
  }
  
  static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -3049,16 +3308,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
         tid_t target;
  
         trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
-       sb->s_dirt = 0;
-       if (EXT4_SB(sb)->s_journal) {
-               if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal,
-                                             &target)) {
-                       if (wait)
-                               jbd2_log_wait_commit(EXT4_SB(sb)->s_journal,
-                                                    target);
-               }
-       } else {
-               ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
+       if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
+               if (wait)
+                       jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
         }
         return ret;
  }
@@ -3071,34 +3323,32 @@ static int ext4_freeze(struct super_block *sb)
  {
         int error = 0;
         journal_t *journal;
-       sb->s_dirt = 0;
  
-       if (!(sb->s_flags & MS_RDONLY)) {
-               journal = EXT4_SB(sb)->s_journal;
+       if (sb->s_flags & MS_RDONLY)
+               return 0;
  
-               if (journal) {
-                       /* Now we set up the journal barrier. */
-                       jbd2_journal_lock_updates(journal);
+       journal = EXT4_SB(sb)->s_journal;
  
-                       /*
-                        * We don't want to clear needs_recovery flag when we
-                        * failed to flush the journal.
-                        */
-                       error = jbd2_journal_flush(journal);
-                       if (error < 0)
-                               goto out;
-               }
+       /* Now we set up the journal barrier. */
+       jbd2_journal_lock_updates(journal);
  
-               /* Journal blocked and flushed, clear needs_recovery flag. */
-               EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-               error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
-               if (error)
-                       goto out;
+       /*
+        * Don't clear the needs_recovery flag if we failed to flush
+        * the journal.
+        */
+       error = jbd2_journal_flush(journal);
+       if (error < 0) {
+       out:
+               jbd2_journal_unlock_updates(journal);
+               return error;
         }
+
+       /* Journal blocked and flushed, clear needs_recovery flag. */
+       EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+       error = ext4_commit_super(sb, 1);
+       if (error)
+               goto out;
         return 0;
-out:
-       jbd2_journal_unlock_updates(journal);
-       return error;
  }
  
  /*
@@ -3107,14 +3357,15 @@ out:
   */
  static int ext4_unfreeze(struct super_block *sb)
  {
-       if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) {
-               lock_super(sb);
-               /* Reser the needs_recovery flag before the fs is unlocked. */
-               EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-               ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
-               unlock_super(sb);
-               jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
-       }
+       if (sb->s_flags & MS_RDONLY)
+               return 0;
+
+       lock_super(sb);
+       /* Reset the needs_recovery flag before the fs is unlocked. */
+       EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+       ext4_commit_super(sb, 1);
+       unlock_super(sb);
+       jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
         return 0;
  }
  
@@ -3266,7 +3517,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                 }
         }
         if (sbi->s_journal == NULL)
-               ext4_commit_super(sb, es, 1);
+               ext4_commit_super(sb, 1);
  
  #ifdef CONFIG_QUOTA
         /* Release old quota file names */
@@ -3306,9 +3557,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
         if (test_opt(sb, MINIX_DF)) {
                 sbi->s_overhead_last = 0;
         } else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
-               ext4_group_t ngroups = sbi->s_groups_count, i;
+               ext4_group_t i, ngroups = ext4_get_groups_count(sb);
                 ext4_fsblk_t overhead = 0;
-               smp_rmb();
  
                 /*
                  * Compute the overhead (FS structures).  This is constant
@@ -3367,8 +3617,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
   * is locked for write. Otherwise the are possible deadlocks:
   * Process 1                         Process 2
   * ext4_create()                     quota_sync()
- *   jbd2_journal_start()                   write_dquot()
- *   DQUOT_INIT()                        down(dqio_mutex)
+ *   jbd2_journal_start()                  write_dquot()
+ *   vfs_dq_init()                         down(dqio_mutex)
   *     down(dqio_mutex)                    jbd2_journal_start()
   *
   */
@@ -3380,44 +3630,6 @@ static inline struct inode *dquot_to_inode(struct dquot *dquot)
         return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
  }
  
-static int ext4_dquot_initialize(struct inode *inode, int type)
-{
-       handle_t *handle;
-       int ret, err;
-
-       /* We may create quota structure so we need to reserve enough blocks */
-       handle = ext4_journal_start(inode, 2*EXT4_QUOTA_INIT_BLOCKS(inode->i_sb));
-       if (IS_ERR(handle))
-               return PTR_ERR(handle);
-       ret = dquot_initialize(inode, type);
-       err = ext4_journal_stop(handle);
-       if (!ret)
-               ret = err;
-       return ret;
-}
-
-static int ext4_dquot_drop(struct inode *inode)
-{
-       handle_t *handle;
-       int ret, err;
-
-       /* We may delete quota structure so we need to reserve enough blocks */
-       handle = ext4_journal_start(inode, 2*EXT4_QUOTA_DEL_BLOCKS(inode->i_sb));
-       if (IS_ERR(handle)) {
-               /*
-                * We call dquot_drop() anyway to at least release references
-                * to quota structures so that umount does not hang.
-                */
-               dquot_drop(inode);
-               return PTR_ERR(handle);
-       }
-       ret = dquot_drop(inode);
-       err = ext4_journal_stop(handle);
-       if (!ret)
-               ret = err;
-       return ret;
-}
-
  static int ext4_write_dquot(struct dquot *dquot)
  {
         int ret, err;
@@ -3683,45 +3895,6 @@ static int ext4_get_sb(struct file_system_type *fs_type,
         return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
  }
  
-#ifdef CONFIG_PROC_FS
-static int ext4_ui_proc_show(struct seq_file *m, void *v)
-{
-       unsigned int *p = m->private;
-
-       seq_printf(m, "%u\n", *p);
-       return 0;
-}
-
-static int ext4_ui_proc_open(struct inode *inode, struct file *file)
-{
-       return single_open(file, ext4_ui_proc_show, PDE(inode)->data);
-}
-
-static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
-                              size_t cnt, loff_t *ppos)
-{
-       unsigned long *p = PDE(file->f_path.dentry->d_inode)->data;
-       char str[32];
-
-       if (cnt >= sizeof(str))
-               return -EINVAL;
-       if (copy_from_user(str, buf, cnt))
-               return -EFAULT;
-
-       *p = simple_strtoul(str, NULL, 0);
-       return cnt;
-}
-
-const struct file_operations ext4_ui_proc_fops = {
-       .owner          = THIS_MODULE,
-       .open           = ext4_ui_proc_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-       .write          = ext4_ui_proc_write,
-};
-#endif
-
  static struct file_system_type ext4_fs_type = {
         .owner          = THIS_MODULE,
         .name           = "ext4",
@@ -3755,6 +3928,9 @@ static int __init init_ext4_fs(void)
  {
         int err;
  
+       ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
+       if (!ext4_kset)
+               return -ENOMEM;
         ext4_proc_root = proc_mkdir("fs/ext4", NULL);
         err = init_ext4_mballoc();
         if (err)
@@ -3796,6 +3972,7 @@ static void __exit exit_ext4_fs(void)
         exit_ext4_xattr();
         exit_ext4_mballoc();
         remove_proc_entry("fs/ext4", NULL);
+       kset_unregister(ext4_kset);
  }
  
  MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");