ext4: fix inode checksum calculation problem if i_extra_size is small

[mirror_ubuntu-artful-kernel.git] / fs / ext4 / inode.c
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c

index ea433a7f4bca21511ba84fbfbe52f71883680661..0057e353e482adc5fb335946e4e5dfbb02e0009a 100644 (file)
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -51,25 +51,30 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
                               struct ext4_inode_info *ei)
  {
         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-       __u16 csum_lo;
-       __u16 csum_hi = 0;
         __u32 csum;
+       __u16 dummy_csum = 0;
+       int offset = offsetof(struct ext4_inode, i_checksum_lo);
+       unsigned int csum_size = sizeof(dummy_csum);
  
-       csum_lo = le16_to_cpu(raw->i_checksum_lo);
-       raw->i_checksum_lo = 0;
-       if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
-           EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
-               csum_hi = le16_to_cpu(raw->i_checksum_hi);
-               raw->i_checksum_hi = 0;
-       }
+       csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset);
+       csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size);
+       offset += csum_size;
+       csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
+                          EXT4_GOOD_OLD_INODE_SIZE - offset);
  
-       csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw,
-                          EXT4_INODE_SIZE(inode->i_sb));
-
-       raw->i_checksum_lo = cpu_to_le16(csum_lo);
-       if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
-           EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
-               raw->i_checksum_hi = cpu_to_le16(csum_hi);
+       if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+               offset = offsetof(struct ext4_inode, i_checksum_hi);
+               csum = ext4_chksum(sbi, csum, (__u8 *)raw +
+                                  EXT4_GOOD_OLD_INODE_SIZE,
+                                  offset - EXT4_GOOD_OLD_INODE_SIZE);
+               if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
+                       csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
+                                          csum_size);
+                       offset += csum_size;
+               }
+               csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
+                                  EXT4_INODE_SIZE(inode->i_sb) - offset);
+       }
  
         return csum;
  }
@@ -205,9 +210,9 @@ void ext4_evict_inode(struct inode *inode)
                  * Note that directories do not have this problem because they
                  * don't use page cache.
                  */
-               if (ext4_should_journal_data(inode) &&
-                   (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
-                   inode->i_ino != EXT4_JOURNAL_INO) {
+               if (inode->i_ino != EXT4_JOURNAL_INO &&
+                   ext4_should_journal_data(inode) &&
+                   (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
                         journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
                         tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
  
@@ -445,13 +450,13 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
   * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
   * based files
   *
- * On success, it returns the number of blocks being mapped or allocated.
- * if create==0 and the blocks are pre-allocated and unwritten block,
- * the result buffer head is unmapped. If the create ==1, it will make sure
- * the buffer head is mapped.
+ * On success, it returns the number of blocks being mapped or allocated.  If
+ * create == 0 and the blocks are pre-allocated and unwritten, the resulting
+ * @map is marked as unwritten. If create == 1, it will mark @map as mapped.
   *
   * It returns 0 if plain look up failed (blocks have not been allocated), in
- * that case, buffer head is unmapped
+ * that case, @map is returned as unmapped but we still fill map->m_len to
+ * indicate the length of a hole starting at map->m_lblk.
   *
   * It returns the error in case of allocation failure.
   */
@@ -494,6 +499,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
                                 retval = map->m_len;
                         map->m_len = retval;
                 } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
+                       map->m_pblk = 0;
+                       retval = es.es_len - (map->m_lblk - es.es_lblk);
+                       if (retval > map->m_len)
+                               retval = map->m_len;
+                       map->m_len = retval;
                         retval = 0;
                 } else {
                         BUG_ON(1);
@@ -657,6 +667,34 @@ has_zeroout:
         return retval;
  }
  
+/*
+ * Replace the EXT4_MAP_FLAGS bits of bh->b_state with those from @flags and
+ * keep the rest. Page-attached bhs are updated with a cmpxchg() retry loop.
+ */
+static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
+{
+       unsigned long old_state;
+       unsigned long new_state;
+
+       flags &= EXT4_MAP_FLAGS;        /* drop bits outside the map flags */
+
+       /* Dummy buffer_head (no b_page)? Set non-atomically. */
+       if (!bh->b_page) {
+               bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
+               return;
+       }
+       /*
+        * Someone else may be modifying b_state: retry until cmpxchg() sees
+        * the value we read. Ugly, but once bh is no longer a container for
+        * mapping information passed to / from get_block functions, it can go.
+        */
+       do {
+               old_state = READ_ONCE(bh->b_state);
+               new_state = (old_state & ~EXT4_MAP_FLAGS) | flags;
+       } while (unlikely(
+                cmpxchg(&bh->b_state, old_state, new_state) != old_state));
+}
+
  /* Maximum number of blocks we map for direct IO at once. */
  #define DIO_MAX_BLOCKS 4096
  
@@ -693,7 +731,7 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
                 ext4_io_end_t *io_end = ext4_inode_aio(inode);
  
                 map_bh(bh, inode->i_sb, map.m_pblk);
-               bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+               ext4_update_bh_state(bh, map.m_flags);
                 if (IS_DAX(inode) && buffer_unwritten(bh)) {
                         /*
                          * dgc: I suspect unwritten conversion on ext4+DAX is
@@ -1131,8 +1169,11 @@ static int ext4_write_end(struct file *file,
         if (ext4_has_inline_data(inode)) {
                 ret = ext4_write_inline_data_end(inode, pos, len,
                                                  copied, page);
-               if (ret < 0)
+               if (ret < 0) {
+                       unlock_page(page);
+                       put_page(page);
                         goto errout;
+               }
                 copied = ret;
         } else
                 copied = block_write_end(file, mapping, pos,
@@ -1186,7 +1227,9 @@ errout:
   * set the buffer to be dirty, since in data=journalled mode we need
   * to call ext4_handle_dirty_metadata() instead.
   */
-static void zero_new_buffers(struct page *page, unsigned from, unsigned to)
+static void ext4_journalled_zero_new_buffers(handle_t *handle,
+                                           struct page *page,
+                                           unsigned from, unsigned to)
  {
         unsigned int block_start = 0, block_end;
         struct buffer_head *head, *bh;
@@ -1203,7 +1246,7 @@ static void zero_new_buffers(struct page *page, unsigned from, unsigned to)
                                         size = min(to, block_end) - start;
  
                                         zero_user(page, start, size);
-                                       set_buffer_uptodate(bh);
+                                       write_end_fn(handle, bh);
                                 }
                                 clear_buffer_new(bh);
                         }
@@ -1232,18 +1275,25 @@ static int ext4_journalled_write_end(struct file *file,
  
         BUG_ON(!ext4_handle_valid(handle));
  
-       if (ext4_has_inline_data(inode))
-               copied = ext4_write_inline_data_end(inode, pos, len,
-                                                   copied, page);
-       else {
-               if (copied < len) {
-                       if (!PageUptodate(page))
-                               copied = 0;
-                       zero_new_buffers(page, from+copied, to);
+       if (ext4_has_inline_data(inode)) {
+               ret = ext4_write_inline_data_end(inode, pos, len,
+                                                copied, page);
+               if (ret < 0) {
+                       unlock_page(page);
+                       put_page(page);
+                       goto errout;
                 }
-
+               copied = ret;
+       } else if (unlikely(copied < len) && !PageUptodate(page)) {
+               copied = 0;
+               ext4_journalled_zero_new_buffers(handle, page, from, to);
+       } else {
+               if (unlikely(copied < len))
+                       ext4_journalled_zero_new_buffers(handle, page,
+                                                        from + copied, to);
                 ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
-                                            to, &partial, write_end_fn);
+                                            from + copied, &partial,
+                                            write_end_fn);
                 if (!partial)
                         SetPageUptodate(page);
         }
@@ -1269,6 +1319,7 @@ static int ext4_journalled_write_end(struct file *file,
                  */
                 ext4_orphan_add(handle, inode);
  
+errout:
         ret2 = ext4_journal_stop(handle);
         if (!ret)
                 ret = ret2;
@@ -1669,7 +1720,7 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                 return ret;
  
         map_bh(bh, inode->i_sb, map.m_pblk);
-       bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+       ext4_update_bh_state(bh, map.m_flags);
  
         if (buffer_unwritten(bh)) {
                 /* A delayed write to unwritten bh should be marked
@@ -2561,13 +2612,36 @@ retry:
                                 done = true;
                         }
                 }
-               ext4_journal_stop(handle);
+               /*
+                * Caution: If the handle is synchronous,
+                * ext4_journal_stop() can wait for transaction commit
+                * to finish which may depend on writeback of pages to
+                * complete or on page lock to be released.  In that
+                * case, we have to wait until after we have
+                * submitted all the IO, released page locks we hold,
+                * and dropped io_end reference (for extent conversion
+                * to be able to complete) before stopping the handle.
+                */
+               if (!ext4_handle_valid(handle) || handle->h_sync == 0) {
+                       ext4_journal_stop(handle);
+                       handle = NULL;
+               }
                 /* Submit prepared bio */
                 ext4_io_submit(&mpd.io_submit);
                 /* Unlock pages we didn't use */
                 mpage_release_unused_pages(&mpd, give_up_on_write);
-               /* Drop our io_end reference we got from init */
-               ext4_put_io_end(mpd.io_submit.io_end);
+               /*
+                * Drop our io_end reference we got from init. We have
+                * to be careful and use deferred io_end finishing if
+                * we are still holding the transaction as we can
+                * release the last reference to io_end which may end
+                * up doing unwritten extent conversion.
+                */
+               if (handle) {
+                       ext4_put_io_end_defer(mpd.io_submit.io_end);
+                       ext4_journal_stop(handle);
+               } else
+                       ext4_put_io_end(mpd.io_submit.io_end);
  
                 if (ret == -ENOSPC && sbi->s_journal) {
                         /*
@@ -3503,6 +3577,10 @@ static int ext4_block_truncate_page(handle_t *handle,
         unsigned blocksize;
         struct inode *inode = mapping->host;
  
+       /* If we are processing an encrypted inode during orphan list handling */
+       if (ext4_encrypted_inode(inode) && !ext4_has_encryption_key(inode))
+               return 0;
+
         blocksize = inode->i_sb->s_blocksize;
         length = blocksize - (offset & (blocksize - 1));
  
@@ -3559,7 +3637,36 @@ int ext4_can_truncate(struct inode *inode)
  }
  
  /*
- * ext4_punch_hole: punches a hole in a file by releaseing the blocks
+ * We have to make sure i_disksize gets properly updated before we truncate
+ * page cache due to hole punching or zero range. Otherwise i_disksize update
+ * can get lost as it may have been postponed to submission of writeback but
+ * that will never happen after we truncate page cache.
+ */
+int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
+                                     loff_t len)
+{
+       handle_t *handle;
+       loff_t size = i_size_read(inode);
+
+       WARN_ON(!mutex_is_locked(&inode->i_mutex)); /* expects i_mutex held */
+       if (offset > size || offset + len < size)  /* i_size outside range */
+               return 0;
+
+       if (EXT4_I(inode)->i_disksize >= size)  /* nothing to catch up */
+               return 0;
+
+       handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+       ext4_update_i_disksize(inode, size);    /* push i_disksize to i_size */
+       ext4_mark_inode_dirty(handle, inode);   /* NOTE(review): result unchecked */
+       ext4_journal_stop(handle);
+
+       return 0;
+}
+
+/*
+ * ext4_punch_hole: punches a hole in a file by releasing the blocks
   * associated with the given offset and length
   *
   * @inode:  File inode
@@ -3588,7 +3695,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
          * Write out all dirty pages to avoid race conditions
          * Then release them.
          */
-       if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+       if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
                 ret = filemap_write_and_wait_range(mapping, offset,
                                                    offset + length - 1);
                 if (ret)
@@ -3623,17 +3730,26 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
  
         }
  
+       /* Wait for all existing dio workers, newcomers will block on i_mutex */
+       ext4_inode_block_unlocked_dio(inode);
+       inode_dio_wait(inode);
+
+       /*
+        * Prevent page faults from reinstantiating pages we have released from
+        * page cache.
+        */
+       down_write(&EXT4_I(inode)->i_mmap_sem);
         first_block_offset = round_up(offset, sb->s_blocksize);
         last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
  
         /* Now release the pages and zero block aligned part of pages*/
-       if (last_block_offset > first_block_offset)
+       if (last_block_offset > first_block_offset) {
+               ret = ext4_update_disksize_before_punch(inode, offset, length);
+               if (ret)
+                       goto out_dio;
                 truncate_pagecache_range(inode, first_block_offset,
                                          last_block_offset);
-
-       /* Wait all existing dio workers, newcomers will block on i_mutex */
-       ext4_inode_block_unlocked_dio(inode);
-       inode_dio_wait(inode);
+       }
  
         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                 credits = ext4_writepage_trans_blocks(inode);
@@ -3680,16 +3796,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
         if (IS_SYNC(inode))
                 ext4_handle_sync(handle);
  
-       /* Now release the pages again to reduce race window */
-       if (last_block_offset > first_block_offset)
-               truncate_pagecache_range(inode, first_block_offset,
-                                        last_block_offset);
-
         inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
         ext4_mark_inode_dirty(handle, inode);
  out_stop:
         ext4_journal_stop(handle);
  out_dio:
+       up_write(&EXT4_I(inode)->i_mmap_sem);
         ext4_inode_resume_unlocked_dio(inode);
  out_mutex:
         mutex_unlock(&inode->i_mutex);
@@ -4084,6 +4196,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
         struct inode *inode;
         journal_t *journal = EXT4_SB(sb)->s_journal;
         long ret;
+       loff_t size;
         int block;
         uid_t i_uid;
         gid_t i_gid;
@@ -4175,6 +4288,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                 ei->i_file_acl |=
                         ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
         inode->i_size = ext4_isize(raw_inode);
+       if ((size = i_size_read(inode)) < 0) {
+               EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
+               ret = -EFSCORRUPTED;
+               goto bad_inode;
+       }
         ei->i_disksize = inode->i_size;
  #ifdef CONFIG_QUOTA
         ei->i_reserved_quota = 0;
@@ -4458,14 +4576,14 @@ static int ext4_do_update_inode(handle_t *handle,
   * Fix up interoperability with old kernels. Otherwise, old inodes get
   * re-used with the upper 16 bits of the uid/gid intact
   */
-               if (!ei->i_dtime) {
+               if (ei->i_dtime && list_empty(&ei->i_orphan)) {
+                       raw_inode->i_uid_high = 0;
+                       raw_inode->i_gid_high = 0;
+               } else {
                         raw_inode->i_uid_high =
                                 cpu_to_le16(high_16_bits(i_uid));
                         raw_inode->i_gid_high =
                                 cpu_to_le16(high_16_bits(i_gid));
-               } else {
-                       raw_inode->i_uid_high = 0;
-                       raw_inode->i_gid_high = 0;
                 }
         } else {
                 raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
@@ -4823,6 +4941,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                         } else
                                 ext4_wait_for_tail_page_commit(inode);
                 }
+               down_write(&EXT4_I(inode)->i_mmap_sem);
                 /*
                  * Truncate pagecache after we've waited for commit
                  * in data=journal mode to make pages freeable.
@@ -4830,6 +4949,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                 truncate_pagecache(inode, inode->i_size);
                 if (shrink)
                         ext4_truncate(inode);
+               up_write(&EXT4_I(inode)->i_mmap_sem);
         }
  
         if (!rc) {
@@ -5081,6 +5201,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
         might_sleep();
         trace_ext4_mark_inode_dirty(inode, _RET_IP_);
         err = ext4_reserve_inode_write(handle, inode, &iloc);
+       if (err)
+               return err;
         if (ext4_handle_valid(handle) &&
             EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
             !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
@@ -5097,8 +5219,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
                                                       sbi->s_want_extra_isize,
                                                       iloc, handle);
                         if (ret) {
-                               ext4_set_inode_state(inode,
-                                                    EXT4_STATE_NO_EXPAND);
                                 if (mnt_count !=
                                         le16_to_cpu(sbi->s_es->s_mnt_count)) {
                                         ext4_warning(inode->i_sb,
@@ -5111,9 +5231,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
                         }
                 }
         }
-       if (!err)
-               err = ext4_mark_iloc_dirty(handle, inode, &iloc);
-       return err;
+       return ext4_mark_iloc_dirty(handle, inode, &iloc);
  }
  
  /*
@@ -5278,6 +5396,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
  
         sb_start_pagefault(inode->i_sb);
         file_update_time(vma->vm_file);
+
+       down_read(&EXT4_I(inode)->i_mmap_sem);
         /* Delalloc case is easy... */
         if (test_opt(inode->i_sb, DELALLOC) &&
             !ext4_should_journal_data(inode) &&
@@ -5347,6 +5467,87 @@ retry_alloc:
  out_ret:
         ret = block_page_mkwrite_return(ret);
  out:
+       up_read(&EXT4_I(inode)->i_mmap_sem);
         sb_end_pagefault(inode->i_sb);
         return ret;
  }
+
+int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       struct inode *inode = file_inode(vma->vm_file);
+       int err;        /* filemap_fault() result, returned unchanged */
+
+       down_read(&EXT4_I(inode)->i_mmap_sem);  /* held for read across fault */
+       err = filemap_fault(vma, vmf);
+       up_read(&EXT4_I(inode)->i_mmap_sem);
+
+       return err;
+}
+
+/*
+ * Find the first extent at or after @lblk in an inode that is not a hole.
+ * Search for @map_len blocks at most. The extent is returned in @result.
+ *
+ * The function returns 1 if we found an extent. The function returns 0 in
+ * case there is no extent at or after @lblk and in that case also sets
+ * @result->es_len to 0. A negative return from ext4_map_blocks() is passed on.
+ */
+int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
+                        unsigned int map_len, struct extent_status *result)
+{
+       struct ext4_map_blocks map;
+       struct extent_status es = {};
+       int ret;
+
+       map.m_lblk = lblk;
+       map.m_len = map_len;
+
+       /*
+        * For non-extent based files this loop may iterate several times since
+        * we do not determine full hole size.
+        */
+       while (map.m_len > 0) {
+               ret = ext4_map_blocks(NULL, inode, &map, 0);
+               if (ret < 0)
+                       return ret;
+               /* Is there an extent covering m_lblk? Just return it. */
+               if (ret > 0) {
+                       int status;
+
+                       ext4_es_store_pblock(result, map.m_pblk);
+                       result->es_lblk = map.m_lblk;
+                       result->es_len = map.m_len;
+                       if (map.m_flags & EXT4_MAP_UNWRITTEN)
+                               status = EXTENT_STATUS_UNWRITTEN;
+                       else
+                               status = EXTENT_STATUS_WRITTEN;
+                       ext4_es_store_status(result, status);
+                       return 1;
+               }
+               ext4_es_find_delayed_extent_range(inode, map.m_lblk,
+                                                 map.m_lblk + map.m_len - 1,
+                                                 &es);
+               /* Is delalloc data before next block in extent tree? */
+               if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) {
+                       ext4_lblk_t offset = 0;
+
+                       if (es.es_lblk < lblk)  /* trim part before @lblk */
+                               offset = lblk - es.es_lblk;
+                       result->es_lblk = es.es_lblk + offset;
+                       ext4_es_store_pblock(result,
+                                            ext4_es_pblock(&es) + offset);
+                       result->es_len = es.es_len - offset;
+                       ext4_es_store_status(result, ext4_es_status(&es));
+
+                       return 1;
+               }
+               /* There's a hole at m_lblk, advance past it */
+               map.m_lblk += map.m_len;
+               map_len -= map.m_len;   /* blocks still left to search */
+               map.m_len = map_len;
+               cond_resched();
+       }
+       result->es_len = 0;     /* nothing found within the search range */
+       return 0;
+}
+