git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blobdiff - fs/ext4/extents.c
ext4: race-condition protection for ext4_convert_unwritten_extents_endio
[mirror_ubuntu-bionic-kernel.git] / fs / ext4 / extents.c
index cbcc6b3f2ae049f543122183829e53263616032e..7011ac967208e941272f09a07e3292d72ef576f9 100644 (file)
@@ -52,6 +52,9 @@
 #define EXT4_EXT_MARK_UNINIT1  0x2  /* mark first half uninitialized */
 #define EXT4_EXT_MARK_UNINIT2  0x4  /* mark second half uninitialized */
 
+#define EXT4_EXT_DATA_VALID1   0x8  /* first half contains valid data */
+#define EXT4_EXT_DATA_VALID2   0x10 /* second half contains valid data */
+
 static __le32 ext4_extent_block_csum(struct inode *inode,
                                     struct ext4_extent_header *eh)
 {
@@ -1177,7 +1180,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
                  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
                  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
 
-       neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1);
+       le16_add_cpu(&neh->eh_depth, 1);
        ext4_mark_inode_dirty(handle, inode);
 out:
        brelse(bh);
@@ -2589,7 +2592,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
        struct ext4_ext_path *path = NULL;
        ext4_fsblk_t partial_cluster = 0;
        handle_t *handle;
-       int i = 0, err;
+       int i = 0, err = 0;
 
        ext_debug("truncate since %u to %u\n", start, end);
 
@@ -2621,12 +2624,16 @@ again:
                        return PTR_ERR(path);
                }
                depth = ext_depth(inode);
+               /* Leaf node may not exist only if inode has no blocks at all */
                ex = path[depth].p_ext;
                if (!ex) {
-                       ext4_ext_drop_refs(path);
-                       kfree(path);
-                       path = NULL;
-                       goto cont;
+                       if (depth) {
+                               EXT4_ERROR_INODE(inode,
+                                                "path[%d].p_hdr == NULL",
+                                                depth);
+                               err = -EIO;
+                       }
+                       goto out;
                }
 
                ee_block = le32_to_cpu(ex->ee_block);
@@ -2658,8 +2665,6 @@ again:
                                goto out;
                }
        }
-cont:
-
        /*
         * We start scanning from right side, freeing all the blocks
         * after i_size and walking into the tree depth-wise.
@@ -2912,6 +2917,9 @@ static int ext4_split_extent_at(handle_t *handle,
        unsigned int ee_len, depth;
        int err = 0;
 
+       BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
+              (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
+
        ext_debug("ext4_split_extents_at: inode %lu, logical"
                "block %llu\n", inode->i_ino, (unsigned long long)split);
 
@@ -2970,7 +2978,14 @@ static int ext4_split_extent_at(handle_t *handle,
 
        err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
        if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
-               err = ext4_ext_zeroout(inode, &orig_ex);
+               if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
+                       if (split_flag & EXT4_EXT_DATA_VALID1)
+                               err = ext4_ext_zeroout(inode, ex2);
+                       else
+                               err = ext4_ext_zeroout(inode, ex);
+               } else
+                       err = ext4_ext_zeroout(inode, &orig_ex);
+
                if (err)
                        goto fix_extent_len;
                /* update the extent length and mark as initialized */
@@ -3023,12 +3038,13 @@ static int ext4_split_extent(handle_t *handle,
        uninitialized = ext4_ext_is_uninitialized(ex);
 
        if (map->m_lblk + map->m_len < ee_block + ee_len) {
-               split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
-                             EXT4_EXT_MAY_ZEROOUT : 0;
+               split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
                flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
                if (uninitialized)
                        split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
                                       EXT4_EXT_MARK_UNINIT2;
+               if (split_flag & EXT4_EXT_DATA_VALID2)
+                       split_flag1 |= EXT4_EXT_DATA_VALID1;
                err = ext4_split_extent_at(handle, inode, path,
                                map->m_lblk + map->m_len, split_flag1, flags1);
                if (err)
@@ -3041,8 +3057,8 @@ static int ext4_split_extent(handle_t *handle,
                return PTR_ERR(path);
 
        if (map->m_lblk >= ee_block) {
-               split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
-                             EXT4_EXT_MAY_ZEROOUT : 0;
+               split_flag1 = split_flag & (EXT4_EXT_MAY_ZEROOUT |
+                                           EXT4_EXT_DATA_VALID2);
                if (uninitialized)
                        split_flag1 |= EXT4_EXT_MARK_UNINIT1;
                if (split_flag & EXT4_EXT_MARK_UNINIT2)
@@ -3321,26 +3337,47 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 
        split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
        split_flag |= EXT4_EXT_MARK_UNINIT2;
-
+       if (flags & EXT4_GET_BLOCKS_CONVERT)
+               split_flag |= EXT4_EXT_DATA_VALID2;
        flags |= EXT4_GET_BLOCKS_PRE_IO;
        return ext4_split_extent(handle, inode, path, map, split_flag, flags);
 }
 
 static int ext4_convert_unwritten_extents_endio(handle_t *handle,
-                                             struct inode *inode,
-                                             struct ext4_ext_path *path)
+                                               struct inode *inode,
+                                               struct ext4_map_blocks *map,
+                                               struct ext4_ext_path *path)
 {
        struct ext4_extent *ex;
+       ext4_lblk_t ee_block;
+       unsigned int ee_len;
        int depth;
        int err = 0;
 
        depth = ext_depth(inode);
        ex = path[depth].p_ext;
+       ee_block = le32_to_cpu(ex->ee_block);
+       ee_len = ext4_ext_get_actual_len(ex);
 
        ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
                "block %llu, max_blocks %u\n", inode->i_ino,
-               (unsigned long long)le32_to_cpu(ex->ee_block),
-               ext4_ext_get_actual_len(ex));
+                 (unsigned long long)ee_block, ee_len);
+
+       /* If extent is larger than requested then split is required */
+       if (ee_block != map->m_lblk || ee_len > map->m_len) {
+               err = ext4_split_unwritten_extents(handle, inode, map, path,
+                                                  EXT4_GET_BLOCKS_CONVERT);
+               if (err < 0)
+                       goto out;
+               ext4_ext_drop_refs(path);
+               path = ext4_ext_find_extent(inode, map->m_lblk, path);
+               if (IS_ERR(path)) {
+                       err = PTR_ERR(path);
+                       goto out;
+               }
+               depth = ext_depth(inode);
+               ex = path[depth].p_ext;
+       }
 
        err = ext4_ext_get_access(handle, inode, path + depth);
        if (err)
@@ -3618,7 +3655,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 {
        int ret = 0;
        int err = 0;
-       ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+       ext4_io_end_t *io = ext4_inode_aio(inode);
 
        ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
                  "block %llu, max_blocks %u, flags %x, allocated %u\n",
@@ -3633,6 +3670,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
        if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
                ret = ext4_split_unwritten_extents(handle, inode, map,
                                                   path, flags);
+               if (ret <= 0)
+                       goto out;
                /*
                 * Flag the inode(non aio case) or end_io struct (aio case)
                 * that this IO needs to conversion to written when IO is
@@ -3648,7 +3687,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
        }
        /* IO end_io complete, convert the filled extent to written */
        if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
-               ret = ext4_convert_unwritten_extents_endio(handle, inode,
+               ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
                                                        path);
                if (ret >= 0) {
                        ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -3876,8 +3915,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
        unsigned int allocated = 0, offset = 0;
        unsigned int allocated_clusters = 0;
        struct ext4_allocation_request ar;
-       ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+       ext4_io_end_t *io = ext4_inode_aio(inode);
        ext4_lblk_t cluster_offset;
+       int set_unwritten = 0;
 
        ext_debug("blocks %u/%u requested for inode %lu\n",
                  map->m_lblk, map->m_len, inode->i_ino);
@@ -4100,13 +4140,8 @@ got_allocated_blocks:
                 * For non asycn direct IO case, flag the inode state
                 * that we need to perform conversion when IO is done.
                 */
-               if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
-                       if (io)
-                               ext4_set_io_unwritten_flag(inode, io);
-                       else
-                               ext4_set_inode_state(inode,
-                                                    EXT4_STATE_DIO_UNWRITTEN);
-               }
+               if ((flags & EXT4_GET_BLOCKS_PRE_IO))
+                       set_unwritten = 1;
                if (ext4_should_dioread_nolock(inode))
                        map->m_flags |= EXT4_MAP_UNINIT;
        }
@@ -4118,6 +4153,15 @@ got_allocated_blocks:
        if (!err)
                err = ext4_ext_insert_extent(handle, inode, path,
                                             &newex, flags);
+
+       if (!err && set_unwritten) {
+               if (io)
+                       ext4_set_io_unwritten_flag(inode, io);
+               else
+                       ext4_set_inode_state(inode,
+                                            EXT4_STATE_DIO_UNWRITTEN);
+       }
+
        if (err && free_on_err) {
                int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
                        EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
@@ -4259,7 +4303,7 @@ void ext4_ext_truncate(struct inode *inode)
         * finish any pending end_io work so we won't run the risk of
         * converting any truncated blocks to initialized later
         */
-       ext4_flush_completed_IO(inode);
+       ext4_flush_unwritten_io(inode);
 
        /*
         * probably first extent we're gonna free will be last in block
@@ -4419,6 +4463,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
         */
        if (len <= EXT_UNINIT_MAX_LEN << blkbits)
                flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
+
+       /* Prevent race condition with pending unwritten extent conversion */
+       ext4_flush_unwritten_io(inode);
 retry:
        while (ret >= 0 && ret < max_blocks) {
                map.m_lblk = map.m_lblk + ret;
@@ -4787,9 +4834,32 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
        loff_t first_page_offset, last_page_offset;
        int credits, err = 0;
 
+       /*
+        * Write out all dirty pages to avoid race conditions
+        * Then release them.
+        */
+       if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+               err = filemap_write_and_wait_range(mapping,
+                       offset, offset + length - 1);
+
+               if (err)
+                       return err;
+       }
+
+       mutex_lock(&inode->i_mutex);
+       /* It's not possible to punch a hole in an append-only or immutable file */
+       if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+               err = -EPERM;
+               goto out_mutex;
+       }
+       if (IS_SWAPFILE(inode)) {
+               err = -ETXTBSY;
+               goto out_mutex;
+       }
+
        /* No need to punch hole beyond i_size */
        if (offset >= inode->i_size)
-               return 0;
+               goto out_mutex;
 
        /*
         * If the hole extends beyond i_size, set the hole
@@ -4807,31 +4877,25 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
        first_page_offset = first_page << PAGE_CACHE_SHIFT;
        last_page_offset = last_page << PAGE_CACHE_SHIFT;
 
-       /*
-        * Write out all dirty pages to avoid race conditions
-        * Then release them.
-        */
-       if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-               err = filemap_write_and_wait_range(mapping,
-                       offset, offset + length - 1);
-
-               if (err)
-                       return err;
-       }
-
        /* Now release the pages */
        if (last_page_offset > first_page_offset) {
                truncate_pagecache_range(inode, first_page_offset,
                                         last_page_offset - 1);
        }
 
-       /* finish any pending end_io work */
-       ext4_flush_completed_IO(inode);
+       /* Wait for all existing dio workers; newcomers will block on i_mutex */
+       ext4_inode_block_unlocked_dio(inode);
+       err = ext4_flush_unwritten_io(inode);
+       if (err)
+               goto out_dio;
+       inode_dio_wait(inode);
 
        credits = ext4_writepage_trans_blocks(inode);
        handle = ext4_journal_start(inode, credits);
-       if (IS_ERR(handle))
-               return PTR_ERR(handle);
+       if (IS_ERR(handle)) {
+               err = PTR_ERR(handle);
+               goto out_dio;
+       }
 
 
        /*
@@ -4921,6 +4985,10 @@ out:
        inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
        ext4_mark_inode_dirty(handle, inode);
        ext4_journal_stop(handle);
+out_dio:
+       ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+       mutex_unlock(&inode->i_mutex);
        return err;
 }
 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,