jbd2: Remove data=ordered mode support using jbd buffer heads

[mirror_ubuntu-zesty-kernel.git] / fs / jbd2 / commit.c
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c

index 4f302d2792794008351326bf9d6e128cf8b18a33..483183d15ed54212aaa9e81ddba9070d90eef9d5 100644 (file)
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -37,8 +37,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  }
  
  /*
- * When an ext3-ordered file is truncated, it is possible that many pages are
- * not sucessfully freed, because they are attached to a committing transaction.
+ * When an ext4 file is truncated, it is possible that some pages are not
+ * successfully freed, because they are attached to a committing transaction.
   * After the transaction commits, these pages are left on the LRU, with no
   * ->mapping, and with attached buffers.  These pages are trivially reclaimable
   * by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -79,21 +79,6 @@ nope:
         __brelse(bh);
  }
  
-/*
- * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held.  For ranking reasons we must trylock.  If we lose, schedule away and
- * return 0.  j_list_lock is dropped in this case.
- */
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
-{
-       if (!jbd_trylock_bh_state(bh)) {
-               spin_unlock(&journal->j_list_lock);
-               schedule();
-               return 0;
-       }
-       return 1;
-}
-
  /*
   * Done it all: now submit the commit record.  We should have
   * cleaned up our previous buffers by now, so if we are in abort
@@ -112,6 +97,7 @@ static int journal_submit_commit_record(journal_t *journal,
         struct buffer_head *bh;
         int ret;
         int barrier_done = 0;
+       struct timespec now = current_kernel_time();
  
         if (is_journal_aborted(journal))
                 return 0;
@@ -126,6 +112,8 @@ static int journal_submit_commit_record(journal_t *journal,
         tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
         tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
         tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+       tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
+       tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
  
         if (JBD2_HAS_COMPAT_FEATURE(journal,
                                     JBD2_FEATURE_COMPAT_CHECKSUM)) {
@@ -136,18 +124,20 @@ static int journal_submit_commit_record(journal_t *journal,
  
         JBUFFER_TRACE(descriptor, "submit commit block");
         lock_buffer(bh);
-
+       get_bh(bh);
         set_buffer_dirty(bh);
         set_buffer_uptodate(bh);
         bh->b_end_io = journal_end_buffer_io_sync;
  
         if (journal->j_flags & JBD2_BARRIER &&
-               !JBD2_HAS_COMPAT_FEATURE(journal,
+               !JBD2_HAS_INCOMPAT_FEATURE(journal,
                                          JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
                 set_buffer_ordered(bh);
                 barrier_done = 1;
         }
         ret = submit_bh(WRITE, bh);
+       if (barrier_done)
+               clear_buffer_ordered(bh);
  
         /* is it possible for another commit to fail at roughly
          * the same time as this one?  If so, we don't want to
@@ -166,7 +156,7 @@ static int journal_submit_commit_record(journal_t *journal,
                 spin_unlock(&journal->j_state_lock);
  
                 /* And try again, without the barrier */
-               clear_buffer_ordered(bh);
+               lock_buffer(bh);
                 set_buffer_uptodate(bh);
                 set_buffer_dirty(bh);
                 ret = submit_bh(WRITE, bh);
@@ -195,159 +185,78 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
  }
  
  /*
- * Wait for all submitted IO to complete.
+ * Submit the data buffers of every inode associated with the transaction
+ * to disk.
+ *
+ * We are in a committing transaction, so no new inode can be added to our
+ * inode list. The JI_COMMIT_RUNNING flag protects the inode we are currently
+ * operating on from being released while we write out its pages.
   */
-static int journal_wait_on_locked_list(journal_t *journal,
-                                      transaction_t *commit_transaction)
+static int journal_submit_inode_data_buffers(journal_t *journal,
+               transaction_t *commit_transaction)
  {
-       int ret = 0;
-       struct journal_head *jh;
+       struct jbd2_inode *jinode;
+       int err, ret = 0;
+       struct address_space *mapping;
  
-       while (commit_transaction->t_locked_list) {
-               struct buffer_head *bh;
-
-               jh = commit_transaction->t_locked_list->b_tprev;
-               bh = jh2bh(jh);
-               get_bh(bh);
-               if (buffer_locked(bh)) {
-                       spin_unlock(&journal->j_list_lock);
-                       wait_on_buffer(bh);
-                       if (unlikely(!buffer_uptodate(bh)))
-                               ret = -EIO;
-                       spin_lock(&journal->j_list_lock);
-               }
-               if (!inverted_lock(journal, bh)) {
-                       put_bh(bh);
-                       spin_lock(&journal->j_list_lock);
-                       continue;
-               }
-               if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
-                       __jbd2_journal_unfile_buffer(jh);
-                       jbd_unlock_bh_state(bh);
-                       jbd2_journal_remove_journal_head(bh);
-                       put_bh(bh);
-               } else {
-                       jbd_unlock_bh_state(bh);
-               }
-               put_bh(bh);
-               cond_resched_lock(&journal->j_list_lock);
+       spin_lock(&journal->j_list_lock);
+       list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+               mapping = jinode->i_vfs_inode->i_mapping;
+               jinode->i_flags |= JI_COMMIT_RUNNING;
+               spin_unlock(&journal->j_list_lock);
+               err = filemap_fdatawrite_range(mapping, 0,
+                                       i_size_read(jinode->i_vfs_inode));
+               if (!ret)
+                       ret = err;
+               spin_lock(&journal->j_list_lock);
+               J_ASSERT(jinode->i_transaction == commit_transaction);
+               jinode->i_flags &= ~JI_COMMIT_RUNNING;
+               wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
         }
+       spin_unlock(&journal->j_list_lock);
         return ret;
-  }
-
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
-{
-       int i;
-
-       for (i = 0; i < bufs; i++) {
-               wbuf[i]->b_end_io = end_buffer_write_sync;
-               /* We use-up our safety reference in submit_bh() */
-               submit_bh(WRITE, wbuf[i]);
-       }
  }
  
  /*
- *  Submit all the data buffers to disk
+ * Wait for data submitted for writeout, refile inodes to proper
+ * transaction if needed.
+ *
   */
-static void journal_submit_data_buffers(journal_t *journal,
-                               transaction_t *commit_transaction)
+static int journal_finish_inode_data_buffers(journal_t *journal,
+               transaction_t *commit_transaction)
  {
-       struct journal_head *jh;
-       struct buffer_head *bh;
-       int locked;
-       int bufs = 0;
-       struct buffer_head **wbuf = journal->j_wbuf;
+       struct jbd2_inode *jinode, *next_i;
+       int err, ret = 0;
  
-       /*
-        * Whenever we unlock the journal and sleep, things can get added
-        * onto ->t_sync_datalist, so we have to keep looping back to
-        * write_out_data until we *know* that the list is empty.
-        *
-        * Cleanup any flushed data buffers from the data list.  Even in
-        * abort mode, we want to flush this out as soon as possible.
-        */
-write_out_data:
-       cond_resched();
+       /* For locking, see the comment in journal_submit_inode_data_buffers() */
         spin_lock(&journal->j_list_lock);
+       list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+               jinode->i_flags |= JI_COMMIT_RUNNING;
+               spin_unlock(&journal->j_list_lock);
+               err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
+               if (!ret)
+                       ret = err;
+               spin_lock(&journal->j_list_lock);
+               jinode->i_flags &= ~JI_COMMIT_RUNNING;
+               wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+       }
  
-       while (commit_transaction->t_sync_datalist) {
-               jh = commit_transaction->t_sync_datalist;
-               bh = jh2bh(jh);
-               locked = 0;
-
-               /* Get reference just to make sure buffer does not disappear
-                * when we are forced to drop various locks */
-               get_bh(bh);
-               /* If the buffer is dirty, we need to submit IO and hence
-                * we need the buffer lock. We try to lock the buffer without
-                * blocking. If we fail, we need to drop j_list_lock and do
-                * blocking lock_buffer().
-                */
-               if (buffer_dirty(bh)) {
-                       if (test_set_buffer_locked(bh)) {
-                               BUFFER_TRACE(bh, "needs blocking lock");
-                               spin_unlock(&journal->j_list_lock);
-                               /* Write out all data to prevent deadlocks */
-                               journal_do_submit_data(wbuf, bufs);
-                               bufs = 0;
-                               lock_buffer(bh);
-                               spin_lock(&journal->j_list_lock);
-                       }
-                       locked = 1;
-               }
-               /* We have to get bh_state lock. Again out of order, sigh. */
-               if (!inverted_lock(journal, bh)) {
-                       jbd_lock_bh_state(bh);
-                       spin_lock(&journal->j_list_lock);
-               }
-               /* Someone already cleaned up the buffer? */
-               if (!buffer_jbd(bh)
-                       || jh->b_transaction != commit_transaction
-                       || jh->b_jlist != BJ_SyncData) {
-                       jbd_unlock_bh_state(bh);
-                       if (locked)
-                               unlock_buffer(bh);
-                       BUFFER_TRACE(bh, "already cleaned up");
-                       put_bh(bh);
-                       continue;
-               }
-               if (locked && test_clear_buffer_dirty(bh)) {
-                       BUFFER_TRACE(bh, "needs writeout, adding to array");
-                       wbuf[bufs++] = bh;
-                       __jbd2_journal_file_buffer(jh, commit_transaction,
-                                               BJ_Locked);
-                       jbd_unlock_bh_state(bh);
-                       if (bufs == journal->j_wbufsize) {
-                               spin_unlock(&journal->j_list_lock);
-                               journal_do_submit_data(wbuf, bufs);
-                               bufs = 0;
-                               goto write_out_data;
-                       }
-               } else if (!locked && buffer_locked(bh)) {
-                       __jbd2_journal_file_buffer(jh, commit_transaction,
-                                               BJ_Locked);
-                       jbd_unlock_bh_state(bh);
-                       put_bh(bh);
+       /* Now refile inode to proper lists */
+       list_for_each_entry_safe(jinode, next_i,
+                                &commit_transaction->t_inode_list, i_list) {
+               list_del(&jinode->i_list);
+               if (jinode->i_next_transaction) {
+                       jinode->i_transaction = jinode->i_next_transaction;
+                       jinode->i_next_transaction = NULL;
+                       list_add(&jinode->i_list,
+                               &jinode->i_transaction->t_inode_list);
                 } else {
-                       BUFFER_TRACE(bh, "writeout complete: unfile");
-                       __jbd2_journal_unfile_buffer(jh);
-                       jbd_unlock_bh_state(bh);
-                       if (locked)
-                               unlock_buffer(bh);
-                       jbd2_journal_remove_journal_head(bh);
-                       /* Once for our safety reference, once for
-                        * jbd2_journal_remove_journal_head() */
-                       put_bh(bh);
-                       put_bh(bh);
-               }
-
-               if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
-                       spin_unlock(&journal->j_list_lock);
-                       goto write_out_data;
+                       jinode->i_transaction = NULL;
                 }
         }
         spin_unlock(&journal->j_list_lock);
-       journal_do_submit_data(wbuf, bufs);
+
+       return ret;
  }
  
  static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
@@ -518,41 +427,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
  
         jbd_debug (3, "JBD: commit phase 2\n");
  
-       /*
-        * First, drop modified flag: all accesses to the buffers
-        * will be tracked for a new trasaction only -bzzz
-        */
-       spin_lock(&journal->j_list_lock);
-       if (commit_transaction->t_buffers) {
-               new_jh = jh = commit_transaction->t_buffers->b_tnext;
-               do {
-                       J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
-                                       new_jh->b_modified == 0);
-                       new_jh->b_modified = 0;
-                       new_jh = new_jh->b_tnext;
-               } while (new_jh != jh);
-       }
-       spin_unlock(&journal->j_list_lock);
-
         /*
          * Now start flushing things to disk, in the order they appear
          * on the transaction lists.  Data blocks go first.
          */
-       err = 0;
-       journal_submit_data_buffers(journal, commit_transaction);
-
-       /*
-        * Wait for all previously submitted IO to complete if commit
-        * record is to be written synchronously.
-        */
-       spin_lock(&journal->j_list_lock);
-       if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
-               JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
-               err = journal_wait_on_locked_list(journal,
-                                               commit_transaction);
-
-       spin_unlock(&journal->j_list_lock);
-
+       err = journal_submit_inode_data_buffers(journal, commit_transaction);
         if (err)
                 jbd2_journal_abort(journal, err);
  
@@ -560,22 +439,14 @@ void jbd2_journal_commit_transaction(journal_t *journal)
  
         jbd_debug(3, "JBD: commit phase 2\n");
  
-       /*
-        * If we found any dirty or locked buffers, then we should have
-        * looped back up to the write_out_data label.  If there weren't
-        * any then journal_clean_data_list should have wiped the list
-        * clean by now, so check that it is in fact empty.
-        */
-       J_ASSERT (commit_transaction->t_sync_datalist == NULL);
-
-       jbd_debug (3, "JBD: commit phase 3\n");
-
         /*
          * Way to go: we have now written out all of the data for a
          * transaction!  Now comes the tricky part: we need to write out
          * metadata.  Loop over the transaction's entire buffer list:
          */
+       spin_lock(&journal->j_state_lock);
         commit_transaction->t_state = T_COMMIT;
+       spin_unlock(&journal->j_state_lock);
  
         stats.u.run.rs_logging = jiffies;
         stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
@@ -583,6 +454,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
         stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
         stats.u.run.rs_blocks_logged = 0;
  
+       J_ASSERT(commit_transaction->t_nr_buffers <=
+                commit_transaction->t_outstanding_credits);
+
+       err = 0;
         descriptor = NULL;
         bufs = 0;
         while (commit_transaction->t_buffers) {
@@ -757,15 +632,19 @@ start_journal_io:
                                                  &cbh, crc32_sum);
                 if (err)
                         __jbd2_journal_abort_hard(journal);
-
-               spin_lock(&journal->j_list_lock);
-               err = journal_wait_on_locked_list(journal,
-                                               commit_transaction);
-               spin_unlock(&journal->j_list_lock);
-               if (err)
-                       __jbd2_journal_abort_hard(journal);
         }
  
+       /*
+        * This is the right place to wait for data buffers both for ASYNC
+        * and !ASYNC commit. If commit is ASYNC, we need to wait only after
+        * the commit block went to disk (which happens above). If commit is
+        * SYNC, we need to wait for data buffers before we start writing
+        * the commit block, which in that case happens below.
+        */
+       err = journal_finish_inode_data_buffers(journal, commit_transaction);
+       if (err)
+               jbd2_journal_abort(journal, err);
+
         /* Lo and behold: we have just managed to send a transaction to
             the log.  Before we can commit it, wait for the IO so far to
             complete.  Control buffers being written are on the
@@ -777,7 +656,7 @@ start_journal_io:
            so we incur less scheduling load.
         */
  
-       jbd_debug(3, "JBD: commit phase 4\n");
+       jbd_debug(3, "JBD: commit phase 3\n");
  
         /*
          * akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -836,7 +715,7 @@ wait_for_iobuf:
  
         J_ASSERT (commit_transaction->t_shadow_list == NULL);
  
-       jbd_debug(3, "JBD: commit phase 5\n");
+       jbd_debug(3, "JBD: commit phase 4\n");
  
         /* Here we wait for the revoke record and descriptor record buffers */
   wait_for_ctlbuf:
@@ -863,7 +742,7 @@ wait_for_iobuf:
                 /* AKPM: bforget here */
         }
  
-       jbd_debug(3, "JBD: commit phase 6\n");
+       jbd_debug(3, "JBD: commit phase 5\n");
  
         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -872,7 +751,8 @@ wait_for_iobuf:
                 if (err)
                         __jbd2_journal_abort_hard(journal);
         }
-       err = journal_wait_on_commit_record(cbh);
+       if (!err && !is_journal_aborted(journal))
+               err = journal_wait_on_commit_record(cbh);
  
         if (err)
                 jbd2_journal_abort(journal, err);
@@ -882,9 +762,9 @@ wait_for_iobuf:
             transaction can be removed from any checkpoint list it was on
             before. */
  
-       jbd_debug(3, "JBD: commit phase 7\n");
+       jbd_debug(3, "JBD: commit phase 6\n");
  
-       J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+       J_ASSERT(list_empty(&commit_transaction->t_inode_list));
         J_ASSERT(commit_transaction->t_buffers == NULL);
         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
@@ -1005,7 +885,7 @@ restart_loop:
  
         /* Done with this transaction! */
  
-       jbd_debug(3, "JBD: commit phase 8\n");
+       jbd_debug(3, "JBD: commit phase 7\n");
  
         J_ASSERT(commit_transaction->t_state == T_COMMIT);