]> git.proxmox.com Git - mirror_ubuntu-zesty-kernel.git/blobdiff - fs/jbd2/commit.c
jbd2: Remove data=ordered mode support using jbd buffer heads
[mirror_ubuntu-zesty-kernel.git] / fs / jbd2 / commit.c
index 4f302d2792794008351326bf9d6e128cf8b18a33..483183d15ed54212aaa9e81ddba9070d90eef9d5 100644 (file)
@@ -37,8 +37,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 }
 
 /*
- * When an ext3-ordered file is truncated, it is possible that many pages are
- * not sucessfully freed, because they are attached to a committing transaction.
+ * When an ext4 file is truncated, it is possible that some pages are not
+ * successfully freed, because they are attached to a committing transaction.
  * After the transaction commits, these pages are left on the LRU, with no
  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  * by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -79,21 +79,6 @@ nope:
        __brelse(bh);
 }
 
-/*
- * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held.  For ranking reasons we must trylock.  If we lose, schedule away and
- * return 0.  j_list_lock is dropped in this case.
- */
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
-{
-       if (!jbd_trylock_bh_state(bh)) {
-               spin_unlock(&journal->j_list_lock);
-               schedule();
-               return 0;
-       }
-       return 1;
-}
-
 /*
  * Done it all: now submit the commit record.  We should have
  * cleaned up our previous buffers by now, so if we are in abort
@@ -112,6 +97,7 @@ static int journal_submit_commit_record(journal_t *journal,
        struct buffer_head *bh;
        int ret;
        int barrier_done = 0;
+       struct timespec now = current_kernel_time();
 
        if (is_journal_aborted(journal))
                return 0;
@@ -126,6 +112,8 @@ static int journal_submit_commit_record(journal_t *journal,
        tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
        tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
        tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+       tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
+       tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
 
        if (JBD2_HAS_COMPAT_FEATURE(journal,
                                    JBD2_FEATURE_COMPAT_CHECKSUM)) {
@@ -136,18 +124,20 @@ static int journal_submit_commit_record(journal_t *journal,
 
        JBUFFER_TRACE(descriptor, "submit commit block");
        lock_buffer(bh);
-
+       get_bh(bh);
        set_buffer_dirty(bh);
        set_buffer_uptodate(bh);
        bh->b_end_io = journal_end_buffer_io_sync;
 
        if (journal->j_flags & JBD2_BARRIER &&
-               !JBD2_HAS_COMPAT_FEATURE(journal,
+               !JBD2_HAS_INCOMPAT_FEATURE(journal,
                                         JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
                set_buffer_ordered(bh);
                barrier_done = 1;
        }
        ret = submit_bh(WRITE, bh);
+       if (barrier_done)
+               clear_buffer_ordered(bh);
 
        /* is it possible for another commit to fail at roughly
         * the same time as this one?  If so, we don't want to
@@ -166,7 +156,7 @@ static int journal_submit_commit_record(journal_t *journal,
                spin_unlock(&journal->j_state_lock);
 
                /* And try again, without the barrier */
-               clear_buffer_ordered(bh);
+               lock_buffer(bh);
                set_buffer_uptodate(bh);
                set_buffer_dirty(bh);
                ret = submit_bh(WRITE, bh);
@@ -195,159 +185,78 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
 }
 
 /*
- * Wait for all submitted IO to complete.
+ * Submit all the data buffers of inode associated with the transaction to
+ * disk.
+ *
+ * We are in a committing transaction. Therefore no new inode can be added to
+ * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
+ * operate on from being released while we write out pages.
  */
-static int journal_wait_on_locked_list(journal_t *journal,
-                                      transaction_t *commit_transaction)
+static int journal_submit_inode_data_buffers(journal_t *journal,
+               transaction_t *commit_transaction)
 {
-       int ret = 0;
-       struct journal_head *jh;
+       struct jbd2_inode *jinode;
+       int err, ret = 0;
+       struct address_space *mapping;
 
-       while (commit_transaction->t_locked_list) {
-               struct buffer_head *bh;
-
-               jh = commit_transaction->t_locked_list->b_tprev;
-               bh = jh2bh(jh);
-               get_bh(bh);
-               if (buffer_locked(bh)) {
-                       spin_unlock(&journal->j_list_lock);
-                       wait_on_buffer(bh);
-                       if (unlikely(!buffer_uptodate(bh)))
-                               ret = -EIO;
-                       spin_lock(&journal->j_list_lock);
-               }
-               if (!inverted_lock(journal, bh)) {
-                       put_bh(bh);
-                       spin_lock(&journal->j_list_lock);
-                       continue;
-               }
-               if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
-                       __jbd2_journal_unfile_buffer(jh);
-                       jbd_unlock_bh_state(bh);
-                       jbd2_journal_remove_journal_head(bh);
-                       put_bh(bh);
-               } else {
-                       jbd_unlock_bh_state(bh);
-               }
-               put_bh(bh);
-               cond_resched_lock(&journal->j_list_lock);
+       spin_lock(&journal->j_list_lock);
+       list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+               mapping = jinode->i_vfs_inode->i_mapping;
+               jinode->i_flags |= JI_COMMIT_RUNNING;
+               spin_unlock(&journal->j_list_lock);
+               err = filemap_fdatawrite_range(mapping, 0,
+                                       i_size_read(jinode->i_vfs_inode));
+               if (!ret)
+                       ret = err;
+               spin_lock(&journal->j_list_lock);
+               J_ASSERT(jinode->i_transaction == commit_transaction);
+               jinode->i_flags &= ~JI_COMMIT_RUNNING;
+               wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }
+       spin_unlock(&journal->j_list_lock);
        return ret;
-  }
-
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
-{
-       int i;
-
-       for (i = 0; i < bufs; i++) {
-               wbuf[i]->b_end_io = end_buffer_write_sync;
-               /* We use-up our safety reference in submit_bh() */
-               submit_bh(WRITE, wbuf[i]);
-       }
 }
 
 /*
- *  Submit all the data buffers to disk
+ * Wait for data submitted for writeout, refile inodes to proper
+ * transaction if needed.
+ *
  */
-static void journal_submit_data_buffers(journal_t *journal,
-                               transaction_t *commit_transaction)
+static int journal_finish_inode_data_buffers(journal_t *journal,
+               transaction_t *commit_transaction)
 {
-       struct journal_head *jh;
-       struct buffer_head *bh;
-       int locked;
-       int bufs = 0;
-       struct buffer_head **wbuf = journal->j_wbuf;
+       struct jbd2_inode *jinode, *next_i;
+       int err, ret = 0;
 
-       /*
-        * Whenever we unlock the journal and sleep, things can get added
-        * onto ->t_sync_datalist, so we have to keep looping back to
-        * write_out_data until we *know* that the list is empty.
-        *
-        * Cleanup any flushed data buffers from the data list.  Even in
-        * abort mode, we want to flush this out as soon as possible.
-        */
-write_out_data:
-       cond_resched();
+       /* For locking, see the comment in journal_submit_inode_data_buffers() */
        spin_lock(&journal->j_list_lock);
+       list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+               jinode->i_flags |= JI_COMMIT_RUNNING;
+               spin_unlock(&journal->j_list_lock);
+               err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
+               if (!ret)
+                       ret = err;
+               spin_lock(&journal->j_list_lock);
+               jinode->i_flags &= ~JI_COMMIT_RUNNING;
+               wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+       }
 
-       while (commit_transaction->t_sync_datalist) {
-               jh = commit_transaction->t_sync_datalist;
-               bh = jh2bh(jh);
-               locked = 0;
-
-               /* Get reference just to make sure buffer does not disappear
-                * when we are forced to drop various locks */
-               get_bh(bh);
-               /* If the buffer is dirty, we need to submit IO and hence
-                * we need the buffer lock. We try to lock the buffer without
-                * blocking. If we fail, we need to drop j_list_lock and do
-                * blocking lock_buffer().
-                */
-               if (buffer_dirty(bh)) {
-                       if (test_set_buffer_locked(bh)) {
-                               BUFFER_TRACE(bh, "needs blocking lock");
-                               spin_unlock(&journal->j_list_lock);
-                               /* Write out all data to prevent deadlocks */
-                               journal_do_submit_data(wbuf, bufs);
-                               bufs = 0;
-                               lock_buffer(bh);
-                               spin_lock(&journal->j_list_lock);
-                       }
-                       locked = 1;
-               }
-               /* We have to get bh_state lock. Again out of order, sigh. */
-               if (!inverted_lock(journal, bh)) {
-                       jbd_lock_bh_state(bh);
-                       spin_lock(&journal->j_list_lock);
-               }
-               /* Someone already cleaned up the buffer? */
-               if (!buffer_jbd(bh)
-                       || jh->b_transaction != commit_transaction
-                       || jh->b_jlist != BJ_SyncData) {
-                       jbd_unlock_bh_state(bh);
-                       if (locked)
-                               unlock_buffer(bh);
-                       BUFFER_TRACE(bh, "already cleaned up");
-                       put_bh(bh);
-                       continue;
-               }
-               if (locked && test_clear_buffer_dirty(bh)) {
-                       BUFFER_TRACE(bh, "needs writeout, adding to array");
-                       wbuf[bufs++] = bh;
-                       __jbd2_journal_file_buffer(jh, commit_transaction,
-                                               BJ_Locked);
-                       jbd_unlock_bh_state(bh);
-                       if (bufs == journal->j_wbufsize) {
-                               spin_unlock(&journal->j_list_lock);
-                               journal_do_submit_data(wbuf, bufs);
-                               bufs = 0;
-                               goto write_out_data;
-                       }
-               } else if (!locked && buffer_locked(bh)) {
-                       __jbd2_journal_file_buffer(jh, commit_transaction,
-                                               BJ_Locked);
-                       jbd_unlock_bh_state(bh);
-                       put_bh(bh);
+       /* Now refile inode to proper lists */
+       list_for_each_entry_safe(jinode, next_i,
+                                &commit_transaction->t_inode_list, i_list) {
+               list_del(&jinode->i_list);
+               if (jinode->i_next_transaction) {
+                       jinode->i_transaction = jinode->i_next_transaction;
+                       jinode->i_next_transaction = NULL;
+                       list_add(&jinode->i_list,
+                               &jinode->i_transaction->t_inode_list);
                } else {
-                       BUFFER_TRACE(bh, "writeout complete: unfile");
-                       __jbd2_journal_unfile_buffer(jh);
-                       jbd_unlock_bh_state(bh);
-                       if (locked)
-                               unlock_buffer(bh);
-                       jbd2_journal_remove_journal_head(bh);
-                       /* Once for our safety reference, once for
-                        * jbd2_journal_remove_journal_head() */
-                       put_bh(bh);
-                       put_bh(bh);
-               }
-
-               if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
-                       spin_unlock(&journal->j_list_lock);
-                       goto write_out_data;
+                       jinode->i_transaction = NULL;
                }
        }
        spin_unlock(&journal->j_list_lock);
-       journal_do_submit_data(wbuf, bufs);
+
+       return ret;
 }
 
 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
@@ -518,41 +427,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 
        jbd_debug (3, "JBD: commit phase 2\n");
 
-       /*
-        * First, drop modified flag: all accesses to the buffers
-        * will be tracked for a new trasaction only -bzzz
-        */
-       spin_lock(&journal->j_list_lock);
-       if (commit_transaction->t_buffers) {
-               new_jh = jh = commit_transaction->t_buffers->b_tnext;
-               do {
-                       J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
-                                       new_jh->b_modified == 0);
-                       new_jh->b_modified = 0;
-                       new_jh = new_jh->b_tnext;
-               } while (new_jh != jh);
-       }
-       spin_unlock(&journal->j_list_lock);
-
        /*
         * Now start flushing things to disk, in the order they appear
         * on the transaction lists.  Data blocks go first.
         */
-       err = 0;
-       journal_submit_data_buffers(journal, commit_transaction);
-
-       /*
-        * Wait for all previously submitted IO to complete if commit
-        * record is to be written synchronously.
-        */
-       spin_lock(&journal->j_list_lock);
-       if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
-               JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
-               err = journal_wait_on_locked_list(journal,
-                                               commit_transaction);
-
-       spin_unlock(&journal->j_list_lock);
-
+       err = journal_submit_inode_data_buffers(journal, commit_transaction);
        if (err)
                jbd2_journal_abort(journal, err);
 
@@ -560,22 +439,14 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 
        jbd_debug(3, "JBD: commit phase 2\n");
 
-       /*
-        * If we found any dirty or locked buffers, then we should have
-        * looped back up to the write_out_data label.  If there weren't
-        * any then journal_clean_data_list should have wiped the list
-        * clean by now, so check that it is in fact empty.
-        */
-       J_ASSERT (commit_transaction->t_sync_datalist == NULL);
-
-       jbd_debug (3, "JBD: commit phase 3\n");
-
        /*
         * Way to go: we have now written out all of the data for a
         * transaction!  Now comes the tricky part: we need to write out
         * metadata.  Loop over the transaction's entire buffer list:
         */
+       spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_COMMIT;
+       spin_unlock(&journal->j_state_lock);
 
        stats.u.run.rs_logging = jiffies;
        stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
@@ -583,6 +454,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
        stats.u.run.rs_blocks_logged = 0;
 
+       J_ASSERT(commit_transaction->t_nr_buffers <=
+                commit_transaction->t_outstanding_credits);
+
+       err = 0;
        descriptor = NULL;
        bufs = 0;
        while (commit_transaction->t_buffers) {
@@ -757,15 +632,19 @@ start_journal_io:
                                                 &cbh, crc32_sum);
                if (err)
                        __jbd2_journal_abort_hard(journal);
-
-               spin_lock(&journal->j_list_lock);
-               err = journal_wait_on_locked_list(journal,
-                                               commit_transaction);
-               spin_unlock(&journal->j_list_lock);
-               if (err)
-                       __jbd2_journal_abort_hard(journal);
        }
 
+       /*
+        * This is the right place to wait for data buffers both for ASYNC
+        * and !ASYNC commit. If commit is ASYNC, we need to wait only after
+        * the commit block went to disk (which happens above). If commit is
+        * SYNC, we need to wait for data buffers before we start writing
+        * commit block, which happens below in such setting.
+        */
+       err = journal_finish_inode_data_buffers(journal, commit_transaction);
+       if (err)
+               jbd2_journal_abort(journal, err);
+
        /* Lo and behold: we have just managed to send a transaction to
            the log.  Before we can commit it, wait for the IO so far to
            complete.  Control buffers being written are on the
@@ -777,7 +656,7 @@ start_journal_io:
           so we incur less scheduling load.
        */
 
-       jbd_debug(3, "JBD: commit phase 4\n");
+       jbd_debug(3, "JBD: commit phase 3\n");
 
        /*
         * akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -836,7 +715,7 @@ wait_for_iobuf:
 
        J_ASSERT (commit_transaction->t_shadow_list == NULL);
 
-       jbd_debug(3, "JBD: commit phase 5\n");
+       jbd_debug(3, "JBD: commit phase 4\n");
 
        /* Here we wait for the revoke record and descriptor record buffers */
  wait_for_ctlbuf:
@@ -863,7 +742,7 @@ wait_for_iobuf:
                /* AKPM: bforget here */
        }
 
-       jbd_debug(3, "JBD: commit phase 6\n");
+       jbd_debug(3, "JBD: commit phase 5\n");
 
        if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -872,7 +751,8 @@ wait_for_iobuf:
                if (err)
                        __jbd2_journal_abort_hard(journal);
        }
-       err = journal_wait_on_commit_record(cbh);
+       if (!err && !is_journal_aborted(journal))
+               err = journal_wait_on_commit_record(cbh);
 
        if (err)
                jbd2_journal_abort(journal, err);
@@ -882,9 +762,9 @@ wait_for_iobuf:
            transaction can be removed from any checkpoint list it was on
            before. */
 
-       jbd_debug(3, "JBD: commit phase 7\n");
+       jbd_debug(3, "JBD: commit phase 6\n");
 
-       J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+       J_ASSERT(list_empty(&commit_transaction->t_inode_list));
        J_ASSERT(commit_transaction->t_buffers == NULL);
        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
        J_ASSERT(commit_transaction->t_iobuf_list == NULL);
@@ -1005,7 +885,7 @@ restart_loop:
 
        /* Done with this transaction! */
 
-       jbd_debug(3, "JBD: commit phase 8\n");
+       jbd_debug(3, "JBD: commit phase 7\n");
 
        J_ASSERT(commit_transaction->t_state == T_COMMIT);