fs/ext4/inode.c

   1 /*
   2  *  linux/fs/ext4/inode.c
   3  *
   4  * Copyright (C) 1992, 1993, 1994, 1995
   5  * Remy Card (card@masi.ibp.fr)
   6  * Laboratoire MASI - Institut Blaise Pascal
   7  * Universite Pierre et Marie Curie (Paris VI)
   8  *
   9  *  from
  10  *
  11  *  linux/fs/minix/inode.c
  12  *
  13  *  Copyright (C) 1991, 1992  Linus Torvalds
  14  *
  15  *  64-bit file support on 64-bit platforms by Jakub Jelinek
  16  *      (jj@sunsite.ms.mff.cuni.cz)
  17  *
  18  *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
  19  */
  20
  21 #include <linux/module.h>
  22 #include <linux/fs.h>
  23 #include <linux/time.h>
  24 #include <linux/jbd2.h>
  25 #include <linux/highuid.h>
  26 #include <linux/pagemap.h>
  27 #include <linux/quotaops.h>
  28 #include <linux/string.h>
  29 #include <linux/buffer_head.h>
  30 #include <linux/writeback.h>
  31 #include <linux/pagevec.h>
  32 #include <linux/mpage.h>
  33 #include <linux/namei.h>
  34 #include <linux/uio.h>
  35 #include <linux/bio.h>
  36 #include <linux/workqueue.h>
  37 #include <linux/kernel.h>
  38 #include <linux/printk.h>
  39 #include <linux/slab.h>
  40 #include <linux/ratelimit.h>
  41
  42 #include "ext4_jbd2.h"
  43 #include "xattr.h"
  44 #include "acl.h"
  45 #include "ext4_extents.h"
  46 #include "truncate.h"
  47
  48 #include <trace/events/ext4.h>
  49
  50 #define MPAGE_DA_EXTENT_TAIL 0x01
  51
  52 static inline int ext4_begin_ordered_truncate(struct inode *inode,
  53                                               loff_t new_size)
  54 {
  55         trace_ext4_begin_ordered_truncate(inode, new_size);
  56         /*
  57          * If jinode is zero, then we never opened the file for
  58          * writing, so there's no need to call
  59          * jbd2_journal_begin_ordered_truncate() since there's no
  60          * outstanding writes we need to flush.
  61          */
  62         if (!EXT4_I(inode)->jinode)
  63                 return 0;
  64         return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
  65                                                    EXT4_I(inode)->jinode,
  66                                                    new_size);
  67 }
  68
  69 static void ext4_invalidatepage(struct page *page, unsigned long offset);
  70 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
  71                                    struct buffer_head *bh_result, int create);
  72 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
  73 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
  74 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
  75 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
  76
  77 /*
  78  * Test whether an inode is a fast symlink.
  79  */
  80 static int ext4_inode_is_fast_symlink(struct inode *inode)
  81 {
  82         int ea_blocks = EXT4_I(inode)->i_file_acl ?
  83                 (inode->i_sb->s_blocksize >> 9) : 0;
  84
  85         return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
  86 }
  87
  88 /*
  89  * Restart the transaction associated with *handle.  This does a commit,
  90  * so before we call here everything must be consistently dirtied against
  91  * this transaction.
  92  */
  93 int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
  94                                  int nblocks)
  95 {
  96         int ret;
  97
  98         /*
  99          * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
 100          * moment, get_block can be called only for blocks inside i_size since
 101          * page cache has been already dropped and writes are blocked by
 102          * i_mutex. So we can safely drop the i_data_sem here.
 103          */
 104         BUG_ON(EXT4_JOURNAL(inode) == NULL);
 105         jbd_debug(2, "restarting handle %p\n", handle);
 106         up_write(&EXT4_I(inode)->i_data_sem);
 107         ret = ext4_journal_restart(handle, nblocks);
 108         down_write(&EXT4_I(inode)->i_data_sem);
 109         ext4_discard_preallocations(inode);
 110
 111         return ret;
 112 }
 113
 114 /*
 115  * Called at the last iput() if i_nlink is zero.
 116  */
 117 void ext4_evict_inode(struct inode *inode)
 118 {
 119         handle_t *handle;
 120         int err;
 121
 122         trace_ext4_evict_inode(inode);
 123         if (inode->i_nlink) {
 124                 /*
 125                  * When journalling data dirty buffers are tracked only in the
 126                  * journal. So although mm thinks everything is clean and
 127                  * ready for reaping the inode might still have some pages to
 128                  * write in the running transaction or waiting to be
 129                  * checkpointed. Thus calling jbd2_journal_invalidatepage()
 130                  * (via truncate_inode_pages()) to discard these buffers can
 131                  * cause data loss. Also even if we did not discard these
 132                  * buffers, we would have no way to find them after the inode
 133                  * is reaped and thus user could see stale data if he tries to
 134                  * read them before the transaction is checkpointed. So be
 135                  * careful and force everything to disk here... We use
 136                  * ei->i_datasync_tid to store the newest transaction
 137                  * containing inode's data.
 138                  *
 139                  * Note that directories do not have this problem because they
 140                  * don't use page cache.
 141                  */
 142                 if (ext4_should_journal_data(inode) &&
 143                     (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
 144                         journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
 145                         tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
 146
 147                         jbd2_log_start_commit(journal, commit_tid);
 148                         jbd2_log_wait_commit(journal, commit_tid);
 149                         filemap_write_and_wait(&inode->i_data);
 150                 }
 151                 truncate_inode_pages(&inode->i_data, 0);
 152                 goto no_delete;
 153         }
 154
 155         if (!is_bad_inode(inode))
 156                 dquot_initialize(inode);
 157
 158         if (ext4_should_order_data(inode))
 159                 ext4_begin_ordered_truncate(inode, 0);
 160         truncate_inode_pages(&inode->i_data, 0);
 161
 162         if (is_bad_inode(inode))
 163                 goto no_delete;
 164
 165         handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
 166         if (IS_ERR(handle)) {
 167                 ext4_std_error(inode->i_sb, PTR_ERR(handle));
 168                 /*
 169                  * If we're going to skip the normal cleanup, we still need to
 170                  * make sure that the in-core orphan linked list is properly
 171                  * cleaned up.
 172                  */
 173                 ext4_orphan_del(NULL, inode);
 174                 goto no_delete;
 175         }
 176
 177         if (IS_SYNC(inode))
 178                 ext4_handle_sync(handle);
 179         inode->i_size = 0;
 180         err = ext4_mark_inode_dirty(handle, inode);
 181         if (err) {
 182                 ext4_warning(inode->i_sb,
 183                              "couldn't mark inode dirty (err %d)", err);
 184                 goto stop_handle;
 185         }
 186         if (inode->i_blocks)
 187                 ext4_truncate(inode);
 188
 189         /*
 190          * ext4_ext_truncate() doesn't reserve any slop when it
 191          * restarts journal transactions; therefore there may not be
 192          * enough credits left in the handle to remove the inode from
 193          * the orphan list and set the dtime field.
 194          */
 195         if (!ext4_handle_has_enough_credits(handle, 3)) {
 196                 err = ext4_journal_extend(handle, 3);
 197                 if (err > 0)
 198                         err = ext4_journal_restart(handle, 3);
 199                 if (err != 0) {
 200                         ext4_warning(inode->i_sb,
 201                                      "couldn't extend journal (err %d)", err);
 202                 stop_handle:
 203                         ext4_journal_stop(handle);
 204                         ext4_orphan_del(NULL, inode);
 205                         goto no_delete;
 206                 }
 207         }
 208
 209         /*
 210          * Kill off the orphan record which ext4_truncate created.
 211          * AKPM: I think this can be inside the above `if'.
 212          * Note that ext4_orphan_del() has to be able to cope with the
 213          * deletion of a non-existent orphan - this is because we don't
 214          * know if ext4_truncate() actually created an orphan record.
 215          * (Well, we could do this if we need to, but heck - it works)
 216          */
 217         ext4_orphan_del(handle, inode);
 218         EXT4_I(inode)->i_dtime  = get_seconds();
 219
 220         /*
 221          * One subtle ordering requirement: if anything has gone wrong
 222          * (transaction abort, IO errors, whatever), then we can still
 223          * do these next steps (the fs will already have been marked as
 224          * having errors), but we can't free the inode if the mark_dirty
 225          * fails.
 226          */
 227         if (ext4_mark_inode_dirty(handle, inode))
 228                 /* If that failed, just do the required in-core inode clear. */
 229                 ext4_clear_inode(inode);
 230         else
 231                 ext4_free_inode(handle, inode);
 232         ext4_journal_stop(handle);
 233         return;
 234 no_delete:
 235         ext4_clear_inode(inode);        /* We must guarantee clearing of inode... */
 236 }
 237
 238 #ifdef CONFIG_QUOTA
 239 qsize_t *ext4_get_reserved_space(struct inode *inode)
 240 {
 241         return &EXT4_I(inode)->i_reserved_quota;
 242 }
 243 #endif
 244
 245 /*
 246  * Calculate the number of metadata blocks need to reserve
 247  * to allocate a block located at @lblock
 248  */
 249 static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 250 {
 251         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 252                 return ext4_ext_calc_metadata_amount(inode, lblock);
 253
 254         return ext4_ind_calc_metadata_amount(inode, lblock);
 255 }
 256
 257 /*
 258  * Called with i_data_sem down, which is important since we can call
 259  * ext4_discard_preallocations() from here.
 260  */
 261 void ext4_da_update_reserve_space(struct inode *inode,
 262                                         int used, int quota_claim)
 263 {
 264         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 265         struct ext4_inode_info *ei = EXT4_I(inode);
 266
 267         spin_lock(&ei->i_block_reservation_lock);
 268         trace_ext4_da_update_reserve_space(inode, used);
 269         if (unlikely(used > ei->i_reserved_data_blocks)) {
 270                 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
 271                          "with only %d reserved data blocks\n",
 272                          __func__, inode->i_ino, used,
 273                          ei->i_reserved_data_blocks);
 274                 WARN_ON(1);
 275                 used = ei->i_reserved_data_blocks;
 276         }
 277
 278         /* Update per-inode reservations */
 279         ei->i_reserved_data_blocks -= used;
 280         ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
 281         percpu_counter_sub(&sbi->s_dirtyblocks_counter,
 282                            used + ei->i_allocated_meta_blocks);
 283         ei->i_allocated_meta_blocks = 0;
 284
 285         if (ei->i_reserved_data_blocks == 0) {
 286                 /*
 287                  * We can release all of the reserved metadata blocks
 288                  * only when we have written all of the delayed
 289                  * allocation blocks.
 290                  */
 291                 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
 292                                    ei->i_reserved_meta_blocks);
 293                 ei->i_reserved_meta_blocks = 0;
 294                 ei->i_da_metadata_calc_len = 0;
 295         }
 296         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 297
 298         /* Update quota subsystem for data blocks */
 299         if (quota_claim)
 300                 dquot_claim_block(inode, used);
 301         else {
 302                 /*
 303                  * We did fallocate with an offset that is already delayed
 304                  * allocated. So on delayed allocated writeback we should
 305                  * not re-claim the quota for fallocated blocks.
 306                  */
 307                 dquot_release_reservation_block(inode, used);
 308         }
 309
 310         /*
 311          * If we have done all the pending block allocations and if
 312          * there aren't any writers on the inode, we can discard the
 313          * inode's preallocations.
 314          */
 315         if ((ei->i_reserved_data_blocks == 0) &&
 316             (atomic_read(&inode->i_writecount) == 0))
 317                 ext4_discard_preallocations(inode);
 318 }
 319
 320 static int __check_block_validity(struct inode *inode, const char *func,
 321                                 unsigned int line,
 322                                 struct ext4_map_blocks *map)
 323 {
 324         if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
 325                                    map->m_len)) {
 326                 ext4_error_inode(inode, func, line, map->m_pblk,
 327                                  "lblock %lu mapped to illegal pblock "
 328                                  "(length %d)", (unsigned long) map->m_lblk,
 329                                  map->m_len);
 330                 return -EIO;
 331         }
 332         return 0;
 333 }
 334
 335 #define check_block_validity(inode, map)        \
 336         __check_block_validity((inode), __func__, __LINE__, (map))
 337
 338 /*
 339  * Return the number of contiguous dirty pages in a given inode
 340  * starting at page frame idx.
 341  */
 342 static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
 343                                     unsigned int max_pages)
 344 {
 345         struct address_space *mapping = inode->i_mapping;
 346         pgoff_t index;
 347         struct pagevec pvec;
 348         pgoff_t num = 0;
 349         int i, nr_pages, done = 0;
 350
 351         if (max_pages == 0)
 352                 return 0;
 353         pagevec_init(&pvec, 0);
 354         while (!done) {
 355                 index = idx;
 356                 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
 357                                               PAGECACHE_TAG_DIRTY,
 358                                               (pgoff_t)PAGEVEC_SIZE);
 359                 if (nr_pages == 0)
 360                         break;
 361                 for (i = 0; i < nr_pages; i++) {
 362                         struct page *page = pvec.pages[i];
 363                         struct buffer_head *bh, *head;
 364
 365                         lock_page(page);
 366                         if (unlikely(page->mapping != mapping) ||
 367                             !PageDirty(page) ||
 368                             PageWriteback(page) ||
 369                             page->index != idx) {
 370                                 done = 1;
 371                                 unlock_page(page);
 372                                 break;
 373                         }
 374                         if (page_has_buffers(page)) {
 375                                 bh = head = page_buffers(page);
 376                                 do {
 377                                         if (!buffer_delay(bh) &&
 378                                             !buffer_unwritten(bh))
 379                                                 done = 1;
 380                                         bh = bh->b_this_page;
 381                                 } while (!done && (bh != head));
 382                         }
 383                         unlock_page(page);
 384                         if (done)
 385                                 break;
 386                         idx++;
 387                         num++;
 388                         if (num >= max_pages) {
 389                                 done = 1;
 390                                 break;
 391                         }
 392                 }
 393                 pagevec_release(&pvec);
 394         }
 395         return num;
 396 }
 397
 398 /*
 399  * The ext4_map_blocks() function tries to look up the requested blocks,
 400  * and returns if the blocks are already mapped.
 401  *
 402  * Otherwise it takes the write lock of the i_data_sem and allocate blocks
 403  * and store the allocated blocks in the result buffer head and mark it
 404  * mapped.
 405  *
 406  * If file type is extents based, it will call ext4_ext_map_blocks(),
 407  * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
 408  * based files
 409  *
 410  * On success, it returns the number of blocks being mapped or allocate.
 411  * if create==0 and the blocks are pre-allocated and uninitialized block,
 412  * the result buffer head is unmapped. If the create ==1, it will make sure
 413  * the buffer head is mapped.
 414  *
 415  * It returns 0 if plain look up failed (blocks have not been allocated), in
 416  * that casem, buffer head is unmapped
 417  *
 418  * It returns the error in case of allocation failure.
 419  */
 420 int ext4_map_blocks(handle_t *handle, struct inode *inode,
 421                     struct ext4_map_blocks *map, int flags)
 422 {
 423         int retval;
 424
 425         map->m_flags = 0;
 426         ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
 427                   "logical block %lu\n", inode->i_ino, flags, map->m_len,
 428                   (unsigned long) map->m_lblk);
 429         /*
 430          * Try to see if we can get the block without requesting a new
 431          * file system block.
 432          */
 433         down_read((&EXT4_I(inode)->i_data_sem));
 434         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 435                 retval = ext4_ext_map_blocks(handle, inode, map, 0);
 436         } else {
 437                 retval = ext4_ind_map_blocks(handle, inode, map, 0);
 438         }
 439         up_read((&EXT4_I(inode)->i_data_sem));
 440
 441         if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
 442                 int ret = check_block_validity(inode, map);
 443                 if (ret != 0)
 444                         return ret;
 445         }
 446
 447         /* If it is only a block(s) look up */
 448         if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
 449                 return retval;
 450
 451         /*
 452          * Returns if the blocks have already allocated
 453          *
 454          * Note that if blocks have been preallocated
 455          * ext4_ext_get_block() returns th create = 0
 456          * with buffer head unmapped.
 457          */
 458         if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
 459                 return retval;
 460
 461         /*
 462          * When we call get_blocks without the create flag, the
 463          * BH_Unwritten flag could have gotten set if the blocks
 464          * requested were part of a uninitialized extent.  We need to
 465          * clear this flag now that we are committed to convert all or
 466          * part of the uninitialized extent to be an initialized
 467          * extent.  This is because we need to avoid the combination
 468          * of BH_Unwritten and BH_Mapped flags being simultaneously
 469          * set on the buffer_head.
 470          */
 471         map->m_flags &= ~EXT4_MAP_UNWRITTEN;
 472
 473         /*
 474          * New blocks allocate and/or writing to uninitialized extent
 475          * will possibly result in updating i_data, so we take
 476          * the write lock of i_data_sem, and call get_blocks()
 477          * with create == 1 flag.
 478          */
 479         down_write((&EXT4_I(inode)->i_data_sem));
 480
 481         /*
 482          * if the caller is from delayed allocation writeout path
 483          * we have already reserved fs blocks for allocation
 484          * let the underlying get_block() function know to
 485          * avoid double accounting
 486          */
 487         if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
 488                 ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
 489         /*
 490          * We need to check for EXT4 here because migrate
 491          * could have changed the inode type in between
 492          */
 493         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 494                 retval = ext4_ext_map_blocks(handle, inode, map, flags);
 495         } else {
 496                 retval = ext4_ind_map_blocks(handle, inode, map, flags);
 497
 498                 if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
 499                         /*
 500                          * We allocated new blocks which will result in
 501                          * i_data's format changing.  Force the migrate
 502                          * to fail by clearing migrate flags
 503                          */
 504                         ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
 505                 }
 506
 507                 /*
 508                  * Update reserved blocks/metadata blocks after successful
 509                  * block allocation which had been deferred till now. We don't
 510                  * support fallocate for non extent files. So we can update
 511                  * reserve space here.
 512                  */
 513                 if ((retval > 0) &&
 514                         (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
 515                         ext4_da_update_reserve_space(inode, retval, 1);
 516         }
 517         if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
 518                 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
 519
 520         up_write((&EXT4_I(inode)->i_data_sem));
 521         if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
 522                 int ret = check_block_validity(inode, map);
 523                 if (ret != 0)
 524                         return ret;
 525         }
 526         return retval;
 527 }
 528
 529 /* Maximum number of blocks we map for direct IO at once. */
 530 #define DIO_MAX_BLOCKS 4096
 531
 532 static int _ext4_get_block(struct inode *inode, sector_t iblock,
 533                            struct buffer_head *bh, int flags)
 534 {
 535         handle_t *handle = ext4_journal_current_handle();
 536         struct ext4_map_blocks map;
 537         int ret = 0, started = 0;
 538         int dio_credits;
 539
 540         map.m_lblk = iblock;
 541         map.m_len = bh->b_size >> inode->i_blkbits;
 542
 543         if (flags && !handle) {
 544                 /* Direct IO write... */
 545                 if (map.m_len > DIO_MAX_BLOCKS)
 546                         map.m_len = DIO_MAX_BLOCKS;
 547                 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
 548                 handle = ext4_journal_start(inode, dio_credits);
 549                 if (IS_ERR(handle)) {
 550                         ret = PTR_ERR(handle);
 551                         return ret;
 552                 }
 553                 started = 1;
 554         }
 555
 556         ret = ext4_map_blocks(handle, inode, &map, flags);
 557         if (ret > 0) {
 558                 map_bh(bh, inode->i_sb, map.m_pblk);
 559                 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
 560                 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
 561                 ret = 0;
 562         }
 563         if (started)
 564                 ext4_journal_stop(handle);
 565         return ret;
 566 }
 567
 568 int ext4_get_block(struct inode *inode, sector_t iblock,
 569                    struct buffer_head *bh, int create)
 570 {
 571         return _ext4_get_block(inode, iblock, bh,
 572                                create ? EXT4_GET_BLOCKS_CREATE : 0);
 573 }
 574
 575 /*
 576  * `handle' can be NULL if create is zero
 577  */
 578 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 579                                 ext4_lblk_t block, int create, int *errp)
 580 {
 581         struct ext4_map_blocks map;
 582         struct buffer_head *bh;
 583         int fatal = 0, err;
 584
 585         J_ASSERT(handle != NULL || create == 0);
 586
 587         map.m_lblk = block;
 588         map.m_len = 1;
 589         err = ext4_map_blocks(handle, inode, &map,
 590                               create ? EXT4_GET_BLOCKS_CREATE : 0);
 591
 592         if (err < 0)
 593                 *errp = err;
 594         if (err <= 0)
 595                 return NULL;
 596         *errp = 0;
 597
 598         bh = sb_getblk(inode->i_sb, map.m_pblk);
 599         if (!bh) {
 600                 *errp = -EIO;
 601                 return NULL;
 602         }
 603         if (map.m_flags & EXT4_MAP_NEW) {
 604                 J_ASSERT(create != 0);
 605                 J_ASSERT(handle != NULL);
 606
 607                 /*
 608                  * Now that we do not always journal data, we should
 609                  * keep in mind whether this should always journal the
 610                  * new buffer as metadata.  For now, regular file
 611                  * writes use ext4_get_block instead, so it's not a
 612                  * problem.
 613                  */
 614                 lock_buffer(bh);
 615                 BUFFER_TRACE(bh, "call get_create_access");
 616                 fatal = ext4_journal_get_create_access(handle, bh);
 617                 if (!fatal && !buffer_uptodate(bh)) {
 618                         memset(bh->b_data, 0, inode->i_sb->s_blocksize);
 619                         set_buffer_uptodate(bh);
 620                 }
 621                 unlock_buffer(bh);
 622                 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
 623                 err = ext4_handle_dirty_metadata(handle, inode, bh);
 624                 if (!fatal)
 625                         fatal = err;
 626         } else {
 627                 BUFFER_TRACE(bh, "not a new buffer");
 628         }
 629         if (fatal) {
 630                 *errp = fatal;
 631                 brelse(bh);
 632                 bh = NULL;
 633         }
 634         return bh;
 635 }
 636
 637 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
 638                                ext4_lblk_t block, int create, int *err)
 639 {
 640         struct buffer_head *bh;
 641
 642         bh = ext4_getblk(handle, inode, block, create, err);
 643         if (!bh)
 644                 return bh;
 645         if (buffer_uptodate(bh))
 646                 return bh;
 647         ll_rw_block(READ_META, 1, &bh);
 648         wait_on_buffer(bh);
 649         if (buffer_uptodate(bh))
 650                 return bh;
 651         put_bh(bh);
 652         *err = -EIO;
 653         return NULL;
 654 }
 655
 656 static int walk_page_buffers(handle_t *handle,
 657                              struct buffer_head *head,
 658                              unsigned from,
 659                              unsigned to,
 660                              int *partial,
 661                              int (*fn)(handle_t *handle,
 662                                        struct buffer_head *bh))
 663 {
 664         struct buffer_head *bh;
 665         unsigned block_start, block_end;
 666         unsigned blocksize = head->b_size;
 667         int err, ret = 0;
 668         struct buffer_head *next;
 669
 670         for (bh = head, block_start = 0;
 671              ret == 0 && (bh != head || !block_start);
 672              block_start = block_end, bh = next) {
 673                 next = bh->b_this_page;
 674                 block_end = block_start + blocksize;
 675                 if (block_end <= from || block_start >= to) {
 676                         if (partial && !buffer_uptodate(bh))
 677                                 *partial = 1;
 678                         continue;
 679                 }
 680                 err = (*fn)(handle, bh);
 681                 if (!ret)
 682                         ret = err;
 683         }
 684         return ret;
 685 }
 686
 687 /*
 688  * To preserve ordering, it is essential that the hole instantiation and
 689  * the data write be encapsulated in a single transaction.  We cannot
 690  * close off a transaction and start a new one between the ext4_get_block()
 691  * and the commit_write().  So doing the jbd2_journal_start at the start of
 692  * prepare_write() is the right place.
 693  *
 694  * Also, this function can nest inside ext4_writepage() ->
 695  * block_write_full_page(). In that case, we *know* that ext4_writepage()
 696  * has generated enough buffer credits to do the whole page.  So we won't
 697  * block on the journal in that case, which is good, because the caller may
 698  * be PF_MEMALLOC.
 699  *
 700  * By accident, ext4 can be reentered when a transaction is open via
 701  * quota file writes.  If we were to commit the transaction while thus
 702  * reentered, there can be a deadlock - we would be holding a quota
 703  * lock, and the commit would never complete if another thread had a
 704  * transaction open and was blocking on the quota lock - a ranking
 705  * violation.
 706  *
 707  * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
 708  * will _not_ run commit under these circumstances because handle->h_ref
 709  * is elevated.  We'll still have enough credits for the tiny quotafile
 710  * write.
 711  */
 712 static int do_journal_get_write_access(handle_t *handle,
 713                                        struct buffer_head *bh)
 714 {
 715         int dirty = buffer_dirty(bh);
 716         int ret;
 717
 718         if (!buffer_mapped(bh) || buffer_freed(bh))
 719                 return 0;
 720         /*
 721          * __block_write_begin() could have dirtied some buffers. Clean
 722          * the dirty bit as jbd2_journal_get_write_access() could complain
 723          * otherwise about fs integrity issues. Setting of the dirty bit
 724          * by __block_write_begin() isn't a real problem here as we clear
 725          * the bit before releasing a page lock and thus writeback cannot
 726          * ever write the buffer.
 727          */
 728         if (dirty)
 729                 clear_buffer_dirty(bh);
 730         ret = ext4_journal_get_write_access(handle, bh);
 731         if (!ret && dirty)
 732                 ret = ext4_handle_dirty_metadata(handle, NULL, bh);
 733         return ret;
 734 }
 735
 736 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
 737                    struct buffer_head *bh_result, int create);
 738 static int ext4_write_begin(struct file *file, struct address_space *mapping,
 739                             loff_t pos, unsigned len, unsigned flags,
 740                             struct page **pagep, void **fsdata)
 741 {
 742         struct inode *inode = mapping->host;
 743         int ret, needed_blocks;
 744         handle_t *handle;
 745         int retries = 0;
 746         struct page *page;
 747         pgoff_t index;
 748         unsigned from, to;
 749
 750         trace_ext4_write_begin(inode, pos, len, flags);
 751         /*
 752          * Reserve one block more for addition to orphan list in case
 753          * we allocate blocks but write fails for some reason
 754          */
 755         needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
 756         index = pos >> PAGE_CACHE_SHIFT;
 757         from = pos & (PAGE_CACHE_SIZE - 1);
 758         to = from + len;
 759
 760 retry:
 761         handle = ext4_journal_start(inode, needed_blocks);
 762         if (IS_ERR(handle)) {
 763                 ret = PTR_ERR(handle);
 764                 goto out;
 765         }
 766
 767         /* We cannot recurse into the filesystem as the transaction is already
 768          * started */
 769         flags |= AOP_FLAG_NOFS;
 770
 771         page = grab_cache_page_write_begin(mapping, index, flags);
 772         if (!page) {
 773                 ext4_journal_stop(handle);
 774                 ret = -ENOMEM;
 775                 goto out;
 776         }
 777         *pagep = page;
 778
 779         if (ext4_should_dioread_nolock(inode))
 780                 ret = __block_write_begin(page, pos, len, ext4_get_block_write);
 781         else
 782                 ret = __block_write_begin(page, pos, len, ext4_get_block);
 783
 784         if (!ret && ext4_should_journal_data(inode)) {
 785                 ret = walk_page_buffers(handle, page_buffers(page),
 786                                 from, to, NULL, do_journal_get_write_access);
 787         }
 788
 789         if (ret) {
 790                 unlock_page(page);
 791                 page_cache_release(page);
 792                 /*
 793                  * __block_write_begin may have instantiated a few blocks
 794                  * outside i_size.  Trim these off again. Don't need
 795                  * i_size_read because we hold i_mutex.
 796                  *
 797                  * Add inode to orphan list in case we crash before
 798                  * truncate finishes
 799                  */
 800                 if (pos + len > inode->i_size && ext4_can_truncate(inode))
 801                         ext4_orphan_add(handle, inode);
 802
 803                 ext4_journal_stop(handle);
 804                 if (pos + len > inode->i_size) {
 805                         ext4_truncate_failed_write(inode);
 806                         /*
 807                          * If truncate failed early the inode might
 808                          * still be on the orphan list; we need to
 809                          * make sure the inode is removed from the
 810                          * orphan list in that case.
 811                          */
 812                         if (inode->i_nlink)
 813                                 ext4_orphan_del(NULL, inode);
 814                 }
 815         }
 816
 817         if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 818                 goto retry;
 819 out:
 820         return ret;
 821 }
 822
 823 /* For write_end() in data=journal mode */
 824 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 825 {
 826         if (!buffer_mapped(bh) || buffer_freed(bh))
 827                 return 0;
 828         set_buffer_uptodate(bh);
 829         return ext4_handle_dirty_metadata(handle, NULL, bh);
 830 }
 831
 832 static int ext4_generic_write_end(struct file *file,
 833                                   struct address_space *mapping,
 834                                   loff_t pos, unsigned len, unsigned copied,
 835                                   struct page *page, void *fsdata)
 836 {
 837         int i_size_changed = 0;
 838         struct inode *inode = mapping->host;
 839         handle_t *handle = ext4_journal_current_handle();
 840
 841         copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
 842
 843         /*
 844          * No need to use i_size_read() here, the i_size
 845          * cannot change under us because we hold i_mutex.
 846          *
 847          * But it's important to update i_size while still holding page lock:
 848          * page writeout could otherwise come in and zero beyond i_size.
 849          */
 850         if (pos + copied > inode->i_size) {
 851                 i_size_write(inode, pos + copied);
 852                 i_size_changed = 1;
 853         }
 854
 855         if (pos + copied >  EXT4_I(inode)->i_disksize) {
 856                 /* We need to mark inode dirty even if
 857                  * new_i_size is less that inode->i_size
 858                  * bu greater than i_disksize.(hint delalloc)
 859                  */
 860                 ext4_update_i_disksize(inode, (pos + copied));
 861                 i_size_changed = 1;
 862         }
 863         unlock_page(page);
 864         page_cache_release(page);
 865
 866         /*
 867          * Don't mark the inode dirty under page lock. First, it unnecessarily
 868          * makes the holding time of page lock longer. Second, it forces lock
 869          * ordering of page lock and transaction start for journaling
 870          * filesystems.
 871          */
 872         if (i_size_changed)
 873                 ext4_mark_inode_dirty(handle, inode);
 874
 875         return copied;
 876 }
 877
 878 /*
 879  * We need to pick up the new inode size which generic_commit_write gave us
 880  * `file' can be NULL - eg, when called from page_symlink().
 881  *
 882  * ext4 never places buffers on inode->i_mapping->private_list.  metadata
 883  * buffers are managed internally.
 884  */
 885 static int ext4_ordered_write_end(struct file *file,
 886                                   struct address_space *mapping,
 887                                   loff_t pos, unsigned len, unsigned copied,
 888                                   struct page *page, void *fsdata)
 889 {
 890         handle_t *handle = ext4_journal_current_handle();
 891         struct inode *inode = mapping->host;
 892         int ret = 0, ret2;
 893
 894         trace_ext4_ordered_write_end(inode, pos, len, copied);
 895         ret = ext4_jbd2_file_inode(handle, inode);
 896
 897         if (ret == 0) {
 898                 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
 899                                                         page, fsdata);
 900                 copied = ret2;
 901                 if (pos + len > inode->i_size && ext4_can_truncate(inode))
 902                         /* if we have allocated more blocks and copied
 903                          * less. We will have blocks allocated outside
 904                          * inode->i_size. So truncate them
 905                          */
 906                         ext4_orphan_add(handle, inode);
 907                 if (ret2 < 0)
 908                         ret = ret2;
 909         }
 910         ret2 = ext4_journal_stop(handle);
 911         if (!ret)
 912                 ret = ret2;
 913
 914         if (pos + len > inode->i_size) {
 915                 ext4_truncate_failed_write(inode);
 916                 /*
 917                  * If truncate failed early the inode might still be
 918                  * on the orphan list; we need to make sure the inode
 919                  * is removed from the orphan list in that case.
 920                  */
 921                 if (inode->i_nlink)
 922                         ext4_orphan_del(NULL, inode);
 923         }
 924
 925
 926         return ret ? ret : copied;
 927 }
 928
 929 static int ext4_writeback_write_end(struct file *file,
 930                                     struct address_space *mapping,
 931                                     loff_t pos, unsigned len, unsigned copied,
 932                                     struct page *page, void *fsdata)
 933 {
 934         handle_t *handle = ext4_journal_current_handle();
 935         struct inode *inode = mapping->host;
 936         int ret = 0, ret2;
 937
 938         trace_ext4_writeback_write_end(inode, pos, len, copied);
 939         ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
 940                                                         page, fsdata);
 941         copied = ret2;
 942         if (pos + len > inode->i_size && ext4_can_truncate(inode))
 943                 /* if we have allocated more blocks and copied
 944                  * less. We will have blocks allocated outside
 945                  * inode->i_size. So truncate them
 946                  */
 947                 ext4_orphan_add(handle, inode);
 948
 949         if (ret2 < 0)
 950                 ret = ret2;
 951
 952         ret2 = ext4_journal_stop(handle);
 953         if (!ret)
 954                 ret = ret2;
 955
 956         if (pos + len > inode->i_size) {
 957                 ext4_truncate_failed_write(inode);
 958                 /*
 959                  * If truncate failed early the inode might still be
 960                  * on the orphan list; we need to make sure the inode
 961                  * is removed from the orphan list in that case.
 962                  */
 963                 if (inode->i_nlink)
 964                         ext4_orphan_del(NULL, inode);
 965         }
 966
 967         return ret ? ret : copied;
 968 }
 969
 970 static int ext4_journalled_write_end(struct file *file,
 971                                      struct address_space *mapping,
 972                                      loff_t pos, unsigned len, unsigned copied,
 973                                      struct page *page, void *fsdata)
 974 {
 975         handle_t *handle = ext4_journal_current_handle();
 976         struct inode *inode = mapping->host;
 977         int ret = 0, ret2;
 978         int partial = 0;
 979         unsigned from, to;
 980         loff_t new_i_size;
 981
 982         trace_ext4_journalled_write_end(inode, pos, len, copied);
 983         from = pos & (PAGE_CACHE_SIZE - 1);
 984         to = from + len;
 985
 986         if (copied < len) {
 987                 if (!PageUptodate(page))
 988                         copied = 0;
 989                 page_zero_new_buffers(page, from+copied, to);
 990         }
 991
 992         ret = walk_page_buffers(handle, page_buffers(page), from,
 993                                 to, &partial, write_end_fn);
 994         if (!partial)
 995                 SetPageUptodate(page);
 996         new_i_size = pos + copied;
 997         if (new_i_size > inode->i_size)
 998                 i_size_write(inode, pos+copied);
 999         ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1000         EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
1001         if (new_i_size > EXT4_I(inode)->i_disksize) {
1002                 ext4_update_i_disksize(inode, new_i_size);
1003                 ret2 = ext4_mark_inode_dirty(handle, inode);
1004                 if (!ret)
1005                         ret = ret2;
1006         }
1007
1008         unlock_page(page);
1009         page_cache_release(page);
1010         if (pos + len > inode->i_size && ext4_can_truncate(inode))
1011                 /* if we have allocated more blocks and copied
1012                  * less. We will have blocks allocated outside
1013                  * inode->i_size. So truncate them
1014                  */
1015                 ext4_orphan_add(handle, inode);
1016
1017         ret2 = ext4_journal_stop(handle);
1018         if (!ret)
1019                 ret = ret2;
1020         if (pos + len > inode->i_size) {
1021                 ext4_truncate_failed_write(inode);
1022                 /*
1023                  * If truncate failed early the inode might still be
1024                  * on the orphan list; we need to make sure the inode
1025                  * is removed from the orphan list in that case.
1026                  */
1027                 if (inode->i_nlink)
1028                         ext4_orphan_del(NULL, inode);
1029         }
1030
1031         return ret ? ret : copied;
1032 }
1033
1034 /*
1035  * Reserve a single block located at lblock
1036  */
1037 static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1038 {
1039         int retries = 0;
1040         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1041         struct ext4_inode_info *ei = EXT4_I(inode);
1042         unsigned long md_needed;
1043         int ret;
1044
1045         /*
1046          * recalculate the amount of metadata blocks to reserve
1047          * in order to allocate nrblocks
1048          * worse case is one extent per block
1049          */
1050 repeat:
1051         spin_lock(&ei->i_block_reservation_lock);
1052         md_needed = ext4_calc_metadata_amount(inode, lblock);
1053         trace_ext4_da_reserve_space(inode, md_needed);
1054         spin_unlock(&ei->i_block_reservation_lock);
1055
1056         /*
1057          * We will charge metadata quota at writeout time; this saves
1058          * us from metadata over-estimation, though we may go over by
1059          * a small amount in the end.  Here we just reserve for data.
1060          */
1061         ret = dquot_reserve_block(inode, 1);
1062         if (ret)
1063                 return ret;
1064         /*
1065          * We do still charge estimated metadata to the sb though;
1066          * we cannot afford to run out of free blocks.
1067          */
1068         if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
1069                 dquot_release_reservation_block(inode, 1);
1070                 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1071                         yield();
1072                         goto repeat;
1073                 }
1074                 return -ENOSPC;
1075         }
1076         spin_lock(&ei->i_block_reservation_lock);
1077         ei->i_reserved_data_blocks++;
1078         ei->i_reserved_meta_blocks += md_needed;
1079         spin_unlock(&ei->i_block_reservation_lock);
1080
1081         return 0;       /* success */
1082 }
1083
1084 static void ext4_da_release_space(struct inode *inode, int to_free)
1085 {
1086         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1087         struct ext4_inode_info *ei = EXT4_I(inode);
1088
1089         if (!to_free)
1090                 return;         /* Nothing to release, exit */
1091
1092         spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1093
1094         trace_ext4_da_release_space(inode, to_free);
1095         if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1096                 /*
1097                  * if there aren't enough reserved blocks, then the
1098                  * counter is messed up somewhere.  Since this
1099                  * function is called from invalidate page, it's
1100                  * harmless to return without any action.
1101                  */
1102                 ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
1103                          "ino %lu, to_free %d with only %d reserved "
1104                          "data blocks\n", inode->i_ino, to_free,
1105                          ei->i_reserved_data_blocks);
1106                 WARN_ON(1);
1107                 to_free = ei->i_reserved_data_blocks;
1108         }
1109         ei->i_reserved_data_blocks -= to_free;
1110
1111         if (ei->i_reserved_data_blocks == 0) {
1112                 /*
1113                  * We can release all of the reserved metadata blocks
1114                  * only when we have written all of the delayed
1115                  * allocation blocks.
1116                  */
1117                 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1118                                    ei->i_reserved_meta_blocks);
1119                 ei->i_reserved_meta_blocks = 0;
1120                 ei->i_da_metadata_calc_len = 0;
1121         }
1122
1123         /* update fs dirty data blocks counter */
1124         percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
1125
1126         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1127
1128         dquot_release_reservation_block(inode, to_free);
1129 }
1130
1131 static void ext4_da_page_release_reservation(struct page *page,
1132                                              unsigned long offset)
1133 {
1134         int to_release = 0;
1135         struct buffer_head *head, *bh;
1136         unsigned int curr_off = 0;
1137
1138         head = page_buffers(page);
1139         bh = head;
1140         do {
1141                 unsigned int next_off = curr_off + bh->b_size;
1142
1143                 if ((offset <= curr_off) && (buffer_delay(bh))) {
1144                         to_release++;
1145                         clear_buffer_delay(bh);
1146                 }
1147                 curr_off = next_off;
1148         } while ((bh = bh->b_this_page) != head);
1149         ext4_da_release_space(page->mapping->host, to_release);
1150 }
1151
1152 /*
1153  * Delayed allocation stuff
1154  */
1155
1156 /*
1157  * mpage_da_submit_io - walks through extent of pages and try to write
1158  * them with writepage() call back
1159  *
1160  * @mpd->inode: inode
1161  * @mpd->first_page: first page of the extent
1162  * @mpd->next_page: page after the last page of the extent
1163  *
1164  * By the time mpage_da_submit_io() is called we expect all blocks
1165  * to be allocated. this may be wrong if allocation failed.
1166  *
1167  * As pages are already locked by write_cache_pages(), we can't use it
1168  */
1169 static int mpage_da_submit_io(struct mpage_da_data *mpd,
1170                               struct ext4_map_blocks *map)
1171 {
1172         struct pagevec pvec;
1173         unsigned long index, end;
1174         int ret = 0, err, nr_pages, i;
1175         struct inode *inode = mpd->inode;
1176         struct address_space *mapping = inode->i_mapping;
1177         loff_t size = i_size_read(inode);
1178         unsigned int len, block_start;
1179         struct buffer_head *bh, *page_bufs = NULL;
1180         int journal_data = ext4_should_journal_data(inode);
1181         sector_t pblock = 0, cur_logical = 0;
1182         struct ext4_io_submit io_submit;
1183
1184         BUG_ON(mpd->next_page <= mpd->first_page);
1185         memset(&io_submit, 0, sizeof(io_submit));
1186         /*
1187          * We need to start from the first_page to the next_page - 1
1188          * to make sure we also write the mapped dirty buffer_heads.
1189          * If we look at mpd->b_blocknr we would only be looking
1190          * at the currently mapped buffer_heads.
1191          */
1192         index = mpd->first_page;
1193         end = mpd->next_page - 1;
1194
1195         pagevec_init(&pvec, 0);
1196         while (index <= end) {
1197                 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1198                 if (nr_pages == 0)
1199                         break;
1200                 for (i = 0; i < nr_pages; i++) {
1201                         int commit_write = 0, skip_page = 0;
1202                         struct page *page = pvec.pages[i];
1203
1204                         index = page->index;
1205                         if (index > end)
1206                                 break;
1207
1208                         if (index == size >> PAGE_CACHE_SHIFT)
1209                                 len = size & ~PAGE_CACHE_MASK;
1210                         else
1211                                 len = PAGE_CACHE_SIZE;
1212                         if (map) {
1213                                 cur_logical = index << (PAGE_CACHE_SHIFT -
1214                                                         inode->i_blkbits);
1215                                 pblock = map->m_pblk + (cur_logical -
1216                                                         map->m_lblk);
1217                         }
1218                         index++;
1219
1220                         BUG_ON(!PageLocked(page));
1221                         BUG_ON(PageWriteback(page));
1222
1223                         /*
1224                          * If the page does not have buffers (for
1225                          * whatever reason), try to create them using
1226                          * __block_write_begin.  If this fails,
1227                          * skip the page and move on.
1228                          */
1229                         if (!page_has_buffers(page)) {
1230                                 if (__block_write_begin(page, 0, len,
1231                                                 noalloc_get_block_write)) {
1232                                 skip_page:
1233                                         unlock_page(page);
1234                                         continue;
1235                                 }
1236                                 commit_write = 1;
1237                         }
1238
1239                         bh = page_bufs = page_buffers(page);
1240                         block_start = 0;
1241                         do {
1242                                 if (!bh)
1243                                         goto skip_page;
1244                                 if (map && (cur_logical >= map->m_lblk) &&
1245                                     (cur_logical <= (map->m_lblk +
1246                                                      (map->m_len - 1)))) {
1247                                         if (buffer_delay(bh)) {
1248                                                 clear_buffer_delay(bh);
1249                                                 bh->b_blocknr = pblock;
1250                                         }
1251                                         if (buffer_unwritten(bh) ||
1252                                             buffer_mapped(bh))
1253                                                 BUG_ON(bh->b_blocknr != pblock);
1254                                         if (map->m_flags & EXT4_MAP_UNINIT)
1255                                                 set_buffer_uninit(bh);
1256                                         clear_buffer_unwritten(bh);
1257                                 }
1258
1259                                 /* skip page if block allocation undone */
1260                                 if (buffer_delay(bh) || buffer_unwritten(bh))
1261                                         skip_page = 1;
1262                                 bh = bh->b_this_page;
1263                                 block_start += bh->b_size;
1264                                 cur_logical++;
1265                                 pblock++;
1266                         } while (bh != page_bufs);
1267
1268                         if (skip_page)
1269                                 goto skip_page;
1270
1271                         if (commit_write)
1272                                 /* mark the buffer_heads as dirty & uptodate */
1273                                 block_commit_write(page, 0, len);
1274
1275                         clear_page_dirty_for_io(page);
1276                         /*
1277                          * Delalloc doesn't support data journalling,
1278                          * but eventually maybe we'll lift this
1279                          * restriction.
1280                          */
1281                         if (unlikely(journal_data && PageChecked(page)))
1282                                 err = __ext4_journalled_writepage(page, len);
1283                         else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
1284                                 err = ext4_bio_write_page(&io_submit, page,
1285                                                           len, mpd->wbc);
1286                         else
1287                                 err = block_write_full_page(page,
1288                                         noalloc_get_block_write, mpd->wbc);
1289
1290                         if (!err)
1291                                 mpd->pages_written++;
1292                         /*
1293                          * In error case, we have to continue because
1294                          * remaining pages are still locked
1295                          */
1296                         if (ret == 0)
1297                                 ret = err;
1298                 }
1299                 pagevec_release(&pvec);
1300         }
1301         ext4_io_submit(&io_submit);
1302         return ret;
1303 }
1304
1305 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
1306 {
1307         int nr_pages, i;
1308         pgoff_t index, end;
1309         struct pagevec pvec;
1310         struct inode *inode = mpd->inode;
1311         struct address_space *mapping = inode->i_mapping;
1312
1313         index = mpd->first_page;
1314         end   = mpd->next_page - 1;
1315         while (index <= end) {
1316                 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1317                 if (nr_pages == 0)
1318                         break;
1319                 for (i = 0; i < nr_pages; i++) {
1320                         struct page *page = pvec.pages[i];
1321                         if (page->index > end)
1322                                 break;
1323                         BUG_ON(!PageLocked(page));
1324                         BUG_ON(PageWriteback(page));
1325                         block_invalidatepage(page, 0);
1326                         ClearPageUptodate(page);
1327                         unlock_page(page);
1328                 }
1329                 index = pvec.pages[nr_pages - 1]->index + 1;
1330                 pagevec_release(&pvec);
1331         }
1332         return;
1333 }
1334
1335 static void ext4_print_free_blocks(struct inode *inode)
1336 {
1337         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1338         printk(KERN_CRIT "Total free blocks count %lld\n",
1339                ext4_count_free_blocks(inode->i_sb));
1340         printk(KERN_CRIT "Free/Dirty block details\n");
1341         printk(KERN_CRIT "free_blocks=%lld\n",
1342                (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
1343         printk(KERN_CRIT "dirty_blocks=%lld\n",
1344                (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
1345         printk(KERN_CRIT "Block reservation details\n");
1346         printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
1347                EXT4_I(inode)->i_reserved_data_blocks);
1348         printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
1349                EXT4_I(inode)->i_reserved_meta_blocks);
1350         return;
1351 }
1352
1353 /*
1354  * mpage_da_map_and_submit - go through given space, map them
1355  *       if necessary, and then submit them for I/O
1356  *
1357  * @mpd - bh describing space
1358  *
1359  * The function skips space we know is already mapped to disk blocks.
1360  *
1361  */
1362 static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
1363 {
1364         int err, blks, get_blocks_flags;
1365         struct ext4_map_blocks map, *mapp = NULL;
1366         sector_t next = mpd->b_blocknr;
1367         unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
1368         loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
1369         handle_t *handle = NULL;
1370
1371         /*
1372          * If the blocks are mapped already, or we couldn't accumulate
1373          * any blocks, then proceed immediately to the submission stage.
1374          */
1375         if ((mpd->b_size == 0) ||
1376             ((mpd->b_state  & (1 << BH_Mapped)) &&
1377              !(mpd->b_state & (1 << BH_Delay)) &&
1378              !(mpd->b_state & (1 << BH_Unwritten))))
1379                 goto submit_io;
1380
1381         handle = ext4_journal_current_handle();
1382         BUG_ON(!handle);
1383
1384         /*
1385          * Call ext4_map_blocks() to allocate any delayed allocation
1386          * blocks, or to convert an uninitialized extent to be
1387          * initialized (in the case where we have written into
1388          * one or more preallocated blocks).
1389          *
1390          * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
1391          * indicate that we are on the delayed allocation path.  This
1392          * affects functions in many different parts of the allocation
1393          * call path.  This flag exists primarily because we don't
1394          * want to change *many* call functions, so ext4_map_blocks()
1395          * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
1396          * inode's allocation semaphore is taken.
1397          *
1398          * If the blocks in questions were delalloc blocks, set
1399          * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
1400          * variables are updated after the blocks have been allocated.
1401          */
1402         map.m_lblk = next;
1403         map.m_len = max_blocks;
1404         get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
1405         if (ext4_should_dioread_nolock(mpd->inode))
1406                 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
1407         if (mpd->b_state & (1 << BH_Delay))
1408                 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
1409
1410         blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
1411         if (blks < 0) {
1412                 struct super_block *sb = mpd->inode->i_sb;
1413
1414                 err = blks;
1415                 /*
1416                  * If get block returns EAGAIN or ENOSPC and there
1417                  * appears to be free blocks we will just let
1418                  * mpage_da_submit_io() unlock all of the pages.
1419                  */
1420                 if (err == -EAGAIN)
1421                         goto submit_io;
1422
1423                 if (err == -ENOSPC &&
1424                     ext4_count_free_blocks(sb)) {
1425                         mpd->retval = err;
1426                         goto submit_io;
1427                 }
1428
1429                 /*
1430                  * get block failure will cause us to loop in
1431                  * writepages, because a_ops->writepage won't be able
1432                  * to make progress. The page will be redirtied by
1433                  * writepage and writepages will again try to write
1434                  * the same.
1435                  */
1436                 if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
1437                         ext4_msg(sb, KERN_CRIT,
1438                                  "delayed block allocation failed for inode %lu "
1439                                  "at logical offset %llu with max blocks %zd "
1440                                  "with error %d", mpd->inode->i_ino,
1441                                  (unsigned long long) next,
1442                                  mpd->b_size >> mpd->inode->i_blkbits, err);
1443                         ext4_msg(sb, KERN_CRIT,
1444                                 "This should not happen!! Data will be lost\n");
1445                         if (err == -ENOSPC)
1446                                 ext4_print_free_blocks(mpd->inode);
1447                 }
1448                 /* invalidate all the pages */
1449                 ext4_da_block_invalidatepages(mpd);
1450
1451                 /* Mark this page range as having been completed */
1452                 mpd->io_done = 1;
1453                 return;
1454         }
1455         BUG_ON(blks == 0);
1456
1457         mapp = &map;
1458         if (map.m_flags & EXT4_MAP_NEW) {
1459                 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
1460                 int i;
1461
1462                 for (i = 0; i < map.m_len; i++)
1463                         unmap_underlying_metadata(bdev, map.m_pblk + i);
1464         }
1465
1466         if (ext4_should_order_data(mpd->inode)) {
1467                 err = ext4_jbd2_file_inode(handle, mpd->inode);
1468                 if (err)
1469                         /* This only happens if the journal is aborted */
1470                         return;
1471         }
1472
1473         /*
1474          * Update on-disk size along with block allocation.
1475          */
1476         disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
1477         if (disksize > i_size_read(mpd->inode))
1478                 disksize = i_size_read(mpd->inode);
1479         if (disksize > EXT4_I(mpd->inode)->i_disksize) {
1480                 ext4_update_i_disksize(mpd->inode, disksize);
1481                 err = ext4_mark_inode_dirty(handle, mpd->inode);
1482                 if (err)
1483                         ext4_error(mpd->inode->i_sb,
1484                                    "Failed to mark inode %lu dirty",
1485                                    mpd->inode->i_ino);
1486         }
1487
1488 submit_io:
1489         mpage_da_submit_io(mpd, mapp);
1490         mpd->io_done = 1;
1491 }
1492
1493 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
1494                 (1 << BH_Delay) | (1 << BH_Unwritten))
1495
1496 /*
1497  * mpage_add_bh_to_extent - try to add one more block to extent of blocks
1498  *
1499  * @mpd->lbh - extent of blocks
1500  * @logical - logical number of the block in the file
1501  * @bh - bh of the block (used to access block's state)
1502  *
1503  * the function is used to collect contig. blocks in same state
1504  */
1505 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
1506                                    sector_t logical, size_t b_size,
1507                                    unsigned long b_state)
1508 {
1509         sector_t next;
1510         int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
1511
1512         /*
1513          * XXX Don't go larger than mballoc is willing to allocate
1514          * This is a stopgap solution.  We eventually need to fold
1515          * mpage_da_submit_io() into this function and then call
1516          * ext4_map_blocks() multiple times in a loop
1517          */
1518         if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
1519                 goto flush_it;
1520
1521         /* check if thereserved journal credits might overflow */
1522         if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
1523                 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
1524                         /*
1525                          * With non-extent format we are limited by the journal
1526                          * credit available.  Total credit needed to insert
1527                          * nrblocks contiguous blocks is dependent on the
1528                          * nrblocks.  So limit nrblocks.
1529                          */
1530                         goto flush_it;
1531                 } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
1532                                 EXT4_MAX_TRANS_DATA) {
1533                         /*
1534                          * Adding the new buffer_head would make it cross the
1535                          * allowed limit for which we have journal credit
1536                          * reserved. So limit the new bh->b_size
1537                          */
1538                         b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
1539                                                 mpd->inode->i_blkbits;
1540                         /* we will do mpage_da_submit_io in the next loop */
1541                 }
1542         }
1543         /*
1544          * First block in the extent
1545          */
1546         if (mpd->b_size == 0) {
1547                 mpd->b_blocknr = logical;
1548                 mpd->b_size = b_size;
1549                 mpd->b_state = b_state & BH_FLAGS;
1550                 return;
1551         }
1552
1553         next = mpd->b_blocknr + nrblocks;
1554         /*
1555          * Can we merge the block to our big extent?
1556          */
1557         if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
1558                 mpd->b_size += b_size;
1559                 return;
1560         }
1561
1562 flush_it:
1563         /*
1564          * We couldn't merge the block to our extent, so we
1565          * need to flush current  extent and start new one
1566          */
1567         mpage_da_map_and_submit(mpd);
1568         return;
1569 }
1570
1571 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
1572 {
1573         return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
1574 }
1575
1576 /*
1577  * This is a special get_blocks_t callback which is used by
1578  * ext4_da_write_begin().  It will either return mapped block or
1579  * reserve space for a single block.
1580  *
1581  * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
1582  * We also have b_blocknr = -1 and b_bdev initialized properly
1583  *
1584  * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
1585  * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
1586  * initialized properly.
1587  */
1588 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1589                                   struct buffer_head *bh, int create)
1590 {
1591         struct ext4_map_blocks map;
1592         int ret = 0;
1593         sector_t invalid_block = ~((sector_t) 0xffff);
1594
1595         if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
1596                 invalid_block = ~0;
1597
1598         BUG_ON(create == 0);
1599         BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
1600
1601         map.m_lblk = iblock;
1602         map.m_len = 1;
1603
1604         /*
1605          * first, we need to know whether the block is allocated already
1606          * preallocated blocks are unmapped but should treated
1607          * the same as allocated blocks.
1608          */
1609         ret = ext4_map_blocks(NULL, inode, &map, 0);
1610         if (ret < 0)
1611                 return ret;
1612         if (ret == 0) {
1613                 if (buffer_delay(bh))
1614                         return 0; /* Not sure this could or should happen */
1615                 /*
1616                  * XXX: __block_write_begin() unmaps passed block, is it OK?
1617                  */
1618                 ret = ext4_da_reserve_space(inode, iblock);
1619                 if (ret)
1620                         /* not enough space to reserve */
1621                         return ret;
1622
1623                 map_bh(bh, inode->i_sb, invalid_block);
1624                 set_buffer_new(bh);
1625                 set_buffer_delay(bh);
1626                 return 0;
1627         }
1628
1629         map_bh(bh, inode->i_sb, map.m_pblk);
1630         bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
1631
1632         if (buffer_unwritten(bh)) {
1633                 /* A delayed write to unwritten bh should be marked
1634                  * new and mapped.  Mapped ensures that we don't do
1635                  * get_block multiple times when we write to the same
1636                  * offset and new ensures that we do proper zero out
1637                  * for partial write.
1638                  */
1639                 set_buffer_new(bh);
1640                 set_buffer_mapped(bh);
1641         }
1642         return 0;
1643 }
1644
1645 /*
1646  * This function is used as a standard get_block_t calback function
1647  * when there is no desire to allocate any blocks.  It is used as a
1648  * callback function for block_write_begin() and block_write_full_page().
1649  * These functions should only try to map a single block at a time.
1650  *
1651  * Since this function doesn't do block allocations even if the caller
1652  * requests it by passing in create=1, it is critically important that
1653  * any caller checks to make sure that any buffer heads are returned
1654  * by this function are either all already mapped or marked for
1655  * delayed allocation before calling  block_write_full_page().  Otherwise,
1656  * b_blocknr could be left unitialized, and the page write functions will
1657  * be taken by surprise.
1658  */
1659 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
1660                                    struct buffer_head *bh_result, int create)
1661 {
1662         BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
1663         return _ext4_get_block(inode, iblock, bh_result, 0);
1664 }
1665
1666 static int bget_one(handle_t *handle, struct buffer_head *bh)
1667 {
1668         get_bh(bh);
1669         return 0;
1670 }
1671
1672 static int bput_one(handle_t *handle, struct buffer_head *bh)
1673 {
1674         put_bh(bh);
1675         return 0;
1676 }
1677
1678 static int __ext4_journalled_writepage(struct page *page,
1679                                        unsigned int len)
1680 {
1681         struct address_space *mapping = page->mapping;
1682         struct inode *inode = mapping->host;
1683         struct buffer_head *page_bufs;
1684         handle_t *handle = NULL;
1685         int ret = 0;
1686         int err;
1687
1688         ClearPageChecked(page);
1689         page_bufs = page_buffers(page);
1690         BUG_ON(!page_bufs);
1691         walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
1692         /* As soon as we unlock the page, it can go away, but we have
1693          * references to buffers so we are safe */
1694         unlock_page(page);
1695
1696         handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
1697         if (IS_ERR(handle)) {
1698                 ret = PTR_ERR(handle);
1699                 goto out;
1700         }
1701
1702         ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
1703                                 do_journal_get_write_access);
1704
1705         err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
1706                                 write_end_fn);
1707         if (ret == 0)
1708                 ret = err;
1709         EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
1710         err = ext4_journal_stop(handle);
1711         if (!ret)
1712                 ret = err;
1713
1714         walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
1715         ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1716 out:
1717         return ret;
1718 }
1719
1720 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
1721 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
1722
1723 /*
1724  * Note that we don't need to start a transaction unless we're journaling data
1725  * because we should have holes filled from ext4_page_mkwrite(). We even don't
1726  * need to file the inode to the transaction's list in ordered mode because if
1727  * we are writing back data added by write(), the inode is already there and if
1728  * we are writing back data modified via mmap(), no one guarantees in which
1729  * transaction the data will hit the disk. In case we are journaling data, we
1730  * cannot start transaction directly because transaction start ranks above page
1731  * lock so we have to do some magic.
1732  *
1733  * This function can get called via...
1734  *   - ext4_da_writepages after taking page lock (have journal handle)
1735  *   - journal_submit_inode_data_buffers (no journal handle)
1736  *   - shrink_page_list via pdflush (no journal handle)
1737  *   - grab_page_cache when doing write_begin (have journal handle)
1738  *
1739  * We don't do any block allocation in this function. If we have page with
1740  * multiple blocks we need to write those buffer_heads that are mapped. This
1741  * is important for mmaped based write. So if we do with blocksize 1K
1742  * truncate(f, 1024);
1743  * a = mmap(f, 0, 4096);
1744  * a[0] = 'a';
1745  * truncate(f, 4096);
1746  * we have in the page first buffer_head mapped via page_mkwrite call back
1747  * but other bufer_heads would be unmapped but dirty(dirty done via the
1748  * do_wp_page). So writepage should write the first block. If we modify
1749  * the mmap area beyond 1024 we will again get a page_fault and the
1750  * page_mkwrite callback will do the block allocation and mark the
1751  * buffer_heads mapped.
1752  *
1753  * We redirty the page if we have any buffer_heads that is either delay or
1754  * unwritten in the page.
1755  *
1756  * We can get recursively called as show below.
1757  *
1758  *      ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1759  *              ext4_writepage()
1760  *
1761  * But since we don't do any block allocation we should not deadlock.
1762  * Page also have the dirty flag cleared so we don't get recurive page_lock.
1763  */
1764 static int ext4_writepage(struct page *page,
1765                           struct writeback_control *wbc)
1766 {
1767         int ret = 0, commit_write = 0;
1768         loff_t size;
1769         unsigned int len;
1770         struct buffer_head *page_bufs = NULL;
1771         struct inode *inode = page->mapping->host;
1772
1773         trace_ext4_writepage(page);
1774         size = i_size_read(inode);
1775         if (page->index == size >> PAGE_CACHE_SHIFT)
1776                 len = size & ~PAGE_CACHE_MASK;
1777         else
1778                 len = PAGE_CACHE_SIZE;
1779
1780         /*
1781          * If the page does not have buffers (for whatever reason),
1782          * try to create them using __block_write_begin.  If this
1783          * fails, redirty the page and move on.
1784          */
1785         if (!page_has_buffers(page)) {
1786                 if (__block_write_begin(page, 0, len,
1787                                         noalloc_get_block_write)) {
1788                 redirty_page:
1789                         redirty_page_for_writepage(wbc, page);
1790                         unlock_page(page);
1791                         return 0;
1792                 }
1793                 commit_write = 1;
1794         }
1795         page_bufs = page_buffers(page);
1796         if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
1797                               ext4_bh_delay_or_unwritten)) {
1798                 /*
1799                  * We don't want to do block allocation, so redirty
1800                  * the page and return.  We may reach here when we do
1801                  * a journal commit via journal_submit_inode_data_buffers.
1802                  * We can also reach here via shrink_page_list
1803                  */
1804                 goto redirty_page;
1805         }
1806         if (commit_write)
1807                 /* now mark the buffer_heads as dirty and uptodate */
1808                 block_commit_write(page, 0, len);
1809
1810         if (PageChecked(page) && ext4_should_journal_data(inode))
1811                 /*
1812                  * It's mmapped pagecache.  Add buffers and journal it.  There
1813                  * doesn't seem much point in redirtying the page here.
1814                  */
1815                 return __ext4_journalled_writepage(page, len);
1816
1817         if (buffer_uninit(page_bufs)) {
1818                 ext4_set_bh_endio(page_bufs, inode);
1819                 ret = block_write_full_page_endio(page, noalloc_get_block_write,
1820                                             wbc, ext4_end_io_buffer_write);
1821         } else
1822                 ret = block_write_full_page(page, noalloc_get_block_write,
1823                                             wbc);
1824
1825         return ret;
1826 }
1827
1828 /*
1829  * This is called via ext4_da_writepages() to
1830  * calculate the total number of credits to reserve to fit
1831  * a single extent allocation into a single transaction,
1832  * ext4_da_writpeages() will loop calling this before
1833  * the block allocation.
1834  */
1835
1836 static int ext4_da_writepages_trans_blocks(struct inode *inode)
1837 {
1838         int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
1839
1840         /*
1841          * With non-extent format the journal credit needed to
1842          * insert nrblocks contiguous block is dependent on
1843          * number of contiguous block. So we will limit
1844          * number of contiguous block to a sane value
1845          */
1846         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
1847             (max_blocks > EXT4_MAX_TRANS_DATA))
1848                 max_blocks = EXT4_MAX_TRANS_DATA;
1849
1850         return ext4_chunk_trans_blocks(inode, max_blocks);
1851 }
1852
1853 /*
1854  * write_cache_pages_da - walk the list of dirty pages of the given
1855  * address space and accumulate pages that need writing, and call
1856  * mpage_da_map_and_submit to map a single contiguous memory region
1857  * and then write them.
1858  */
1859 static int write_cache_pages_da(struct address_space *mapping,
1860                                 struct writeback_control *wbc,
1861                                 struct mpage_da_data *mpd,
1862                                 pgoff_t *done_index)
1863 {
1864         struct buffer_head      *bh, *head;
1865         struct inode            *inode = mapping->host;
1866         struct pagevec          pvec;
1867         unsigned int            nr_pages;
1868         sector_t                logical;
1869         pgoff_t                 index, end;
1870         long                    nr_to_write = wbc->nr_to_write;
1871         int                     i, tag, ret = 0;
1872
1873         memset(mpd, 0, sizeof(struct mpage_da_data));
1874         mpd->wbc = wbc;
1875         mpd->inode = inode;
1876         pagevec_init(&pvec, 0);
1877         index = wbc->range_start >> PAGE_CACHE_SHIFT;
1878         end = wbc->range_end >> PAGE_CACHE_SHIFT;
1879
1880         if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
1881                 tag = PAGECACHE_TAG_TOWRITE;
1882         else
1883                 tag = PAGECACHE_TAG_DIRTY;
1884
1885         *done_index = index;
1886         while (index <= end) {
1887                 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
1888                               min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1889                 if (nr_pages == 0)
1890                         return 0;
1891
1892                 for (i = 0; i < nr_pages; i++) {
1893                         struct page *page = pvec.pages[i];
1894
1895                         /*
1896                          * At this point, the page may be truncated or
1897                          * invalidated (changing page->mapping to NULL), or
1898                          * even swizzled back from swapper_space to tmpfs file
1899                          * mapping. However, page->index will not change
1900                          * because we have a reference on the page.
1901                          */
1902                         if (page->index > end)
1903                                 goto out;
1904
1905                         *done_index = page->index + 1;
1906
1907                         /*
1908                          * If we can't merge this page, and we have
1909                          * accumulated an contiguous region, write it
1910                          */
1911                         if ((mpd->next_page != page->index) &&
1912                             (mpd->next_page != mpd->first_page)) {
1913                                 mpage_da_map_and_submit(mpd);
1914                                 goto ret_extent_tail;
1915                         }
1916
1917                         lock_page(page);
1918
1919                         /*
1920                          * If the page is no longer dirty, or its
1921                          * mapping no longer corresponds to inode we
1922                          * are writing (which means it has been
1923                          * truncated or invalidated), or the page is
1924                          * already under writeback and we are not
1925                          * doing a data integrity writeback, skip the page
1926                          */
1927                         if (!PageDirty(page) ||
1928                             (PageWriteback(page) &&
1929                              (wbc->sync_mode == WB_SYNC_NONE)) ||
1930                             unlikely(page->mapping != mapping)) {
1931                                 unlock_page(page);
1932                                 continue;
1933                         }
1934
1935                         wait_on_page_writeback(page);
1936                         BUG_ON(PageWriteback(page));
1937
1938                         if (mpd->next_page != page->index)
1939                                 mpd->first_page = page->index;
1940                         mpd->next_page = page->index + 1;
1941                         logical = (sector_t) page->index <<
1942                                 (PAGE_CACHE_SHIFT - inode->i_blkbits);
1943
1944                         if (!page_has_buffers(page)) {
1945                                 mpage_add_bh_to_extent(mpd, logical,
1946                                                        PAGE_CACHE_SIZE,
1947                                                        (1 << BH_Dirty) | (1 << BH_Uptodate));
1948                                 if (mpd->io_done)
1949                                         goto ret_extent_tail;
1950                         } else {
1951                                 /*
1952                                  * Page with regular buffer heads,
1953                                  * just add all dirty ones
1954                                  */
1955                                 head = page_buffers(page);
1956                                 bh = head;
1957                                 do {
1958                                         BUG_ON(buffer_locked(bh));
1959                                         /*
1960                                          * We need to try to allocate
1961                                          * unmapped blocks in the same page.
1962                                          * Otherwise we won't make progress
1963                                          * with the page in ext4_writepage
1964                                          */
1965                                         if (ext4_bh_delay_or_unwritten(NULL, bh)) {
1966                                                 mpage_add_bh_to_extent(mpd, logical,
1967                                                                        bh->b_size,
1968                                                                        bh->b_state);
1969                                                 if (mpd->io_done)
1970                                                         goto ret_extent_tail;
1971                                         } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
1972                                                 /*
1973                                                  * mapped dirty buffer. We need
1974                                                  * to update the b_state
1975                                                  * because we look at b_state
1976                                                  * in mpage_da_map_blocks.  We
1977                                                  * don't update b_size because
1978                                                  * if we find an unmapped
1979                                                  * buffer_head later we need to
1980                                                  * use the b_state flag of that
1981                                                  * buffer_head.
1982                                                  */
1983                                                 if (mpd->b_size == 0)
1984                                                         mpd->b_state = bh->b_state & BH_FLAGS;
1985                                         }
1986                                         logical++;
1987                                 } while ((bh = bh->b_this_page) != head);
1988                         }
1989
1990                         if (nr_to_write > 0) {
1991                                 nr_to_write--;
1992                                 if (nr_to_write == 0 &&
1993                                     wbc->sync_mode == WB_SYNC_NONE)
1994                                         /*
1995                                          * We stop writing back only if we are
1996                                          * not doing integrity sync. In case of
1997                                          * integrity sync we have to keep going
1998                                          * because someone may be concurrently
1999                                          * dirtying pages, and we might have
2000                                          * synced a lot of newly appeared dirty
2001                                          * pages, but have not synced all of the
2002                                          * old dirty pages.
2003                                          */
2004                                         goto out;
2005                         }
2006                 }
2007                 pagevec_release(&pvec);
2008                 cond_resched();
2009         }
2010         return 0;
2011 ret_extent_tail:
2012         ret = MPAGE_DA_EXTENT_TAIL;
2013 out:
2014         pagevec_release(&pvec);
2015         cond_resched();
2016         return ret;
2017 }
2018
2019
2020 static int ext4_da_writepages(struct address_space *mapping,
2021                               struct writeback_control *wbc)
2022 {
2023         pgoff_t index;
2024         int range_whole = 0;
2025         handle_t *handle = NULL;
2026         struct mpage_da_data mpd;
2027         struct inode *inode = mapping->host;
2028         int pages_written = 0;
2029         unsigned int max_pages;
2030         int range_cyclic, cycled = 1, io_done = 0;
2031         int needed_blocks, ret = 0;
2032         long desired_nr_to_write, nr_to_writebump = 0;
2033         loff_t range_start = wbc->range_start;
2034         struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2035         pgoff_t done_index = 0;
2036         pgoff_t end;
2037
2038         trace_ext4_da_writepages(inode, wbc);
2039
2040         /*
2041          * No pages to write? This is mainly a kludge to avoid starting
2042          * a transaction for special inodes like journal inode on last iput()
2043          * because that could violate lock ordering on umount
2044          */
2045         if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2046                 return 0;
2047
2048         /*
2049          * If the filesystem has aborted, it is read-only, so return
2050          * right away instead of dumping stack traces later on that
2051          * will obscure the real source of the problem.  We test
2052          * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
2053          * the latter could be true if the filesystem is mounted
2054          * read-only, and in that case, ext4_da_writepages should
2055          * *never* be called, so if that ever happens, we would want
2056          * the stack trace.
2057          */
2058         if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2059                 return -EROFS;
2060
2061         if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2062                 range_whole = 1;
2063
2064         range_cyclic = wbc->range_cyclic;
2065         if (wbc->range_cyclic) {
2066                 index = mapping->writeback_index;
2067                 if (index)
2068                         cycled = 0;
2069                 wbc->range_start = index << PAGE_CACHE_SHIFT;
2070                 wbc->range_end  = LLONG_MAX;
2071                 wbc->range_cyclic = 0;
2072                 end = -1;
2073         } else {
2074                 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2075                 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2076         }
2077
2078         /*
2079          * This works around two forms of stupidity.  The first is in
2080          * the writeback code, which caps the maximum number of pages
2081          * written to be 1024 pages.  This is wrong on multiple
2082          * levels; different architectues have a different page size,
2083          * which changes the maximum amount of data which gets
2084          * written.  Secondly, 4 megabytes is way too small.  XFS
2085          * forces this value to be 16 megabytes by multiplying
2086          * nr_to_write parameter by four, and then relies on its
2087          * allocator to allocate larger extents to make them
2088          * contiguous.  Unfortunately this brings us to the second
2089          * stupidity, which is that ext4's mballoc code only allocates
2090          * at most 2048 blocks.  So we force contiguous writes up to
2091          * the number of dirty blocks in the inode, or
2092          * sbi->max_writeback_mb_bump whichever is smaller.
2093          */
2094         max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2095         if (!range_cyclic && range_whole) {
2096                 if (wbc->nr_to_write == LONG_MAX)
2097                         desired_nr_to_write = wbc->nr_to_write;
2098                 else
2099                         desired_nr_to_write = wbc->nr_to_write * 8;
2100         } else
2101                 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2102                                                            max_pages);
2103         if (desired_nr_to_write > max_pages)
2104                 desired_nr_to_write = max_pages;
2105
2106         if (wbc->nr_to_write < desired_nr_to_write) {
2107                 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
2108                 wbc->nr_to_write = desired_nr_to_write;
2109         }
2110
2111 retry:
2112         if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2113                 tag_pages_for_writeback(mapping, index, end);
2114
2115         while (!ret && wbc->nr_to_write > 0) {
2116
2117                 /*
2118                  * we  insert one extent at a time. So we need
2119                  * credit needed for single extent allocation.
2120                  * journalled mode is currently not supported
2121                  * by delalloc
2122                  */
2123                 BUG_ON(ext4_should_journal_data(inode));
2124                 needed_blocks = ext4_da_writepages_trans_blocks(inode);
2125
2126                 /* start a new transaction*/
2127                 handle = ext4_journal_start(inode, needed_blocks);
2128                 if (IS_ERR(handle)) {
2129                         ret = PTR_ERR(handle);
2130                         ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2131                                "%ld pages, ino %lu; err %d", __func__,
2132                                 wbc->nr_to_write, inode->i_ino, ret);
2133                         goto out_writepages;
2134                 }
2135
2136                 /*
2137                  * Now call write_cache_pages_da() to find the next
2138                  * contiguous region of logical blocks that need
2139                  * blocks to be allocated by ext4 and submit them.
2140                  */
2141                 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
2142                 /*
2143                  * If we have a contiguous extent of pages and we
2144                  * haven't done the I/O yet, map the blocks and submit
2145                  * them for I/O.
2146                  */
2147                 if (!mpd.io_done && mpd.next_page != mpd.first_page) {
2148                         mpage_da_map_and_submit(&mpd);
2149                         ret = MPAGE_DA_EXTENT_TAIL;
2150                 }
2151                 trace_ext4_da_write_pages(inode, &mpd);
2152                 wbc->nr_to_write -= mpd.pages_written;
2153
2154                 ext4_journal_stop(handle);
2155
2156                 if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
2157                         /* commit the transaction which would
2158                          * free blocks released in the transaction
2159                          * and try again
2160                          */
2161                         jbd2_journal_force_commit_nested(sbi->s_journal);
2162                         ret = 0;
2163                 } else if (ret == MPAGE_DA_EXTENT_TAIL) {
2164                         /*
2165                          * got one extent now try with
2166                          * rest of the pages
2167                          */
2168                         pages_written += mpd.pages_written;
2169                         ret = 0;
2170                         io_done = 1;
2171                 } else if (wbc->nr_to_write)
2172                         /*
2173                          * There is no more writeout needed
2174                          * or we requested for a noblocking writeout
2175                          * and we found the device congested
2176                          */
2177                         break;
2178         }
2179         if (!io_done && !cycled) {
2180                 cycled = 1;
2181                 index = 0;
2182                 wbc->range_start = index << PAGE_CACHE_SHIFT;
2183                 wbc->range_end  = mapping->writeback_index - 1;
2184                 goto retry;
2185         }
2186
2187         /* Update index */
2188         wbc->range_cyclic = range_cyclic;
2189         if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2190                 /*
2191                  * set the writeback_index so that range_cyclic
2192                  * mode will write it back later
2193                  */
2194                 mapping->writeback_index = done_index;
2195
2196 out_writepages:
2197         wbc->nr_to_write -= nr_to_writebump;
2198         wbc->range_start = range_start;
2199         trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2200         return ret;
2201 }
2202
2203 #define FALL_BACK_TO_NONDELALLOC 1
2204 static int ext4_nonda_switch(struct super_block *sb)
2205 {
2206         s64 free_blocks, dirty_blocks;
2207         struct ext4_sb_info *sbi = EXT4_SB(sb);
2208
2209         /*
2210          * switch to non delalloc mode if we are running low
2211          * on free block. The free block accounting via percpu
2212          * counters can get slightly wrong with percpu_counter_batch getting
2213          * accumulated on each CPU without updating global counters
2214          * Delalloc need an accurate free block accounting. So switch
2215          * to non delalloc when we are near to error range.
2216          */
2217         free_blocks  = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
2218         dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
2219         if (2 * free_blocks < 3 * dirty_blocks ||
2220                 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
2221                 /*
2222                  * free block count is less than 150% of dirty blocks
2223                  * or free blocks is less than watermark
2224                  */
2225                 return 1;
2226         }
2227         /*
2228          * Even if we don't switch but are nearing capacity,
2229          * start pushing delalloc when 1/2 of free blocks are dirty.
2230          */
2231         if (free_blocks < 2 * dirty_blocks)
2232                 writeback_inodes_sb_if_idle(sb);
2233
2234         return 0;
2235 }
2236
2237 static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2238                                loff_t pos, unsigned len, unsigned flags,
2239                                struct page **pagep, void **fsdata)
2240 {
2241         int ret, retries = 0;
2242         struct page *page;
2243         pgoff_t index;
2244         struct inode *inode = mapping->host;
2245         handle_t *handle;
2246
2247         index = pos >> PAGE_CACHE_SHIFT;
2248
2249         if (ext4_nonda_switch(inode->i_sb)) {
2250                 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
2251                 return ext4_write_begin(file, mapping, pos,
2252                                         len, flags, pagep, fsdata);
2253         }
2254         *fsdata = (void *)0;
2255         trace_ext4_da_write_begin(inode, pos, len, flags);
2256 retry:
2257         /*
2258          * With delayed allocation, we don't log the i_disksize update
2259          * if there is delayed block allocation. But we still need
2260          * to journalling the i_disksize update if writes to the end
2261          * of file which has an already mapped buffer.
2262          */
2263         handle = ext4_journal_start(inode, 1);
2264         if (IS_ERR(handle)) {
2265                 ret = PTR_ERR(handle);
2266                 goto out;
2267         }
2268         /* We cannot recurse into the filesystem as the transaction is already
2269          * started */
2270         flags |= AOP_FLAG_NOFS;
2271
2272         page = grab_cache_page_write_begin(mapping, index, flags);
2273         if (!page) {
2274                 ext4_journal_stop(handle);
2275                 ret = -ENOMEM;
2276                 goto out;
2277         }
2278         *pagep = page;
2279
2280         ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
2281         if (ret < 0) {
2282                 unlock_page(page);
2283                 ext4_journal_stop(handle);
2284                 page_cache_release(page);
2285                 /*
2286                  * block_write_begin may have instantiated a few blocks
2287                  * outside i_size.  Trim these off again. Don't need
2288                  * i_size_read because we hold i_mutex.
2289                  */
2290                 if (pos + len > inode->i_size)
2291                         ext4_truncate_failed_write(inode);
2292         }
2293
2294         if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
2295                 goto retry;
2296 out:
2297         return ret;
2298 }
2299
2300 /*
2301  * Check if we should update i_disksize
2302  * when write to the end of file but not require block allocation
2303  */
2304 static int ext4_da_should_update_i_disksize(struct page *page,
2305                                             unsigned long offset)
2306 {
2307         struct buffer_head *bh;
2308         struct inode *inode = page->mapping->host;
2309         unsigned int idx;
2310         int i;
2311
2312         bh = page_buffers(page);
2313         idx = offset >> inode->i_blkbits;
2314
2315         for (i = 0; i < idx; i++)
2316                 bh = bh->b_this_page;
2317
2318         if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
2319                 return 0;
2320         return 1;
2321 }
2322
2323 static int ext4_da_write_end(struct file *file,
2324                              struct address_space *mapping,
2325                              loff_t pos, unsigned len, unsigned copied,
2326                              struct page *page, void *fsdata)
2327 {
2328         struct inode *inode = mapping->host;
2329         int ret = 0, ret2;
2330         handle_t *handle = ext4_journal_current_handle();
2331         loff_t new_i_size;
2332         unsigned long start, end;
2333         int write_mode = (int)(unsigned long)fsdata;
2334
2335         if (write_mode == FALL_BACK_TO_NONDELALLOC) {
2336                 if (ext4_should_order_data(inode)) {
2337                         return ext4_ordered_write_end(file, mapping, pos,
2338                                         len, copied, page, fsdata);
2339                 } else if (ext4_should_writeback_data(inode)) {
2340                         return ext4_writeback_write_end(file, mapping, pos,
2341                                         len, copied, page, fsdata);
2342                 } else {
2343                         BUG();
2344                 }
2345         }
2346
2347         trace_ext4_da_write_end(inode, pos, len, copied);
2348         start = pos & (PAGE_CACHE_SIZE - 1);
2349         end = start + copied - 1;
2350
2351         /*
2352          * generic_write_end() will run mark_inode_dirty() if i_size
2353          * changes.  So let's piggyback the i_disksize mark_inode_dirty
2354          * into that.
2355          */
2356
2357         new_i_size = pos + copied;
2358         if (new_i_size > EXT4_I(inode)->i_disksize) {
2359                 if (ext4_da_should_update_i_disksize(page, end)) {
2360                         down_write(&EXT4_I(inode)->i_data_sem);
2361                         if (new_i_size > EXT4_I(inode)->i_disksize) {
2362                                 /*
2363                                  * Updating i_disksize when extending file
2364                                  * without needing block allocation
2365                                  */
2366                                 if (ext4_should_order_data(inode))
2367                                         ret = ext4_jbd2_file_inode(handle,
2368                                                                    inode);
2369
2370                                 EXT4_I(inode)->i_disksize = new_i_size;
2371                         }
2372                         up_write(&EXT4_I(inode)->i_data_sem);
2373                         /* We need to mark inode dirty even if
2374                          * new_i_size is less that inode->i_size
2375                          * bu greater than i_disksize.(hint delalloc)
2376                          */
2377                         ext4_mark_inode_dirty(handle, inode);
2378                 }
2379         }
2380         ret2 = generic_write_end(file, mapping, pos, len, copied,
2381                                                         page, fsdata);
2382         copied = ret2;
2383         if (ret2 < 0)
2384                 ret = ret2;
2385         ret2 = ext4_journal_stop(handle);
2386         if (!ret)
2387                 ret = ret2;
2388
2389         return ret ? ret : copied;
2390 }
2391
/*
 * ->invalidatepage for the delayed-allocation address space: drop the
 * blocks reserved for the page's buffers, then run the regular ext4
 * invalidation.  The page must be locked.
 */
static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
{
        BUG_ON(!PageLocked(page));

        /* Reservations hang off the buffers; no buffers, nothing to drop. */
        if (page_has_buffers(page))
                ext4_da_page_release_reservation(page, offset);

        ext4_invalidatepage(page, offset);
}
2408
2409 /*
2410  * Force all delayed allocation blocks to be allocated for a given inode.
2411  */
2412 int ext4_alloc_da_blocks(struct inode *inode)
2413 {
2414         trace_ext4_alloc_da_blocks(inode);
2415
2416         if (!EXT4_I(inode)->i_reserved_data_blocks &&
2417             !EXT4_I(inode)->i_reserved_meta_blocks)
2418                 return 0;
2419
2420         /*
2421          * We do something simple for now.  The filemap_flush() will
2422          * also start triggering a write of the data blocks, which is
2423          * not strictly speaking necessary (and for users of
2424          * laptop_mode, not even desirable).  However, to do otherwise
2425          * would require replicating code paths in:
2426          *
2427          * ext4_da_writepages() ->
2428          *    write_cache_pages() ---> (via passed in callback function)
2429          *        __mpage_da_writepage() -->
2430          *           mpage_add_bh_to_extent()
2431          *           mpage_da_map_blocks()
2432          *
2433          * The problem is that write_cache_pages(), located in
2434          * mm/page-writeback.c, marks pages clean in preparation for
2435          * doing I/O, which is not desirable if we're not planning on
2436          * doing I/O at all.
2437          *
2438          * We could call write_cache_pages(), and then redirty all of
2439          * the pages by calling redirty_page_for_writepage() but that
2440          * would be ugly in the extreme.  So instead we would need to
2441          * replicate parts of the code in the above functions,
2442          * simplifying them because we wouldn't actually intend to
2443          * write out the pages, but rather only collect contiguous
2444          * logical block extents, call the multi-block allocator, and
2445          * then update the buffer heads with the block allocations.
2446          *
2447          * For now, though, we'll cheat by calling filemap_flush(),
2448          * which will map the blocks, and start the I/O, but not
2449          * actually wait for the I/O to complete.
2450          */
2451         return filemap_flush(inode->i_mapping);
2452 }
2453
2454 /*
2455  * bmap() is special.  It gets used by applications such as lilo and by
2456  * the swapper to find the on-disk block of a specific piece of data.
2457  *
2458  * Naturally, this is dangerous if the block concerned is still in the
2459  * journal.  If somebody makes a swapfile on an ext4 data-journaling
2460  * filesystem and enables swap, then they may get a nasty shock when the
2461  * data getting swapped to that swapfile suddenly gets overwritten by
2462  * the original zero's written out previously to the journal and
2463  * awaiting writeback in the kernel's buffer cache.
2464  *
2465  * So, if we see any bmap calls here on a modified, data-journaled file,
2466  * take extra steps to flush any blocks which might be in the cache.
2467  */
2468 static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2469 {
2470         struct inode *inode = mapping->host;
2471         journal_t *journal;
2472         int err;
2473
2474         if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
2475                         test_opt(inode->i_sb, DELALLOC)) {
2476                 /*
2477                  * With delalloc we want to sync the file
2478                  * so that we can make sure we allocate
2479                  * blocks for file
2480                  */
2481                 filemap_write_and_wait(mapping);
2482         }
2483
2484         if (EXT4_JOURNAL(inode) &&
2485             ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
2486                 /*
2487                  * This is a REALLY heavyweight approach, but the use of
2488                  * bmap on dirty files is expected to be extremely rare:
2489                  * only if we run lilo or swapon on a freshly made file
2490                  * do we expect this to happen.
2491                  *
2492                  * (bmap requires CAP_SYS_RAWIO so this does not
2493                  * represent an unprivileged user DOS attack --- we'd be
2494                  * in trouble if mortal users could trigger this path at
2495                  * will.)
2496                  *
2497                  * NB. EXT4_STATE_JDATA is not set on files other than
2498                  * regular files.  If somebody wants to bmap a directory
2499                  * or symlink and gets confused because the buffer
2500                  * hasn't yet been flushed to disk, they deserve
2501                  * everything they get.
2502                  */
2503
2504                 ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
2505                 journal = EXT4_JOURNAL(inode);
2506                 jbd2_journal_lock_updates(journal);
2507                 err = jbd2_journal_flush(journal);
2508                 jbd2_journal_unlock_updates(journal);
2509
2510                 if (err)
2511                         return 0;
2512         }
2513
2514         return generic_block_bmap(mapping, block, ext4_get_block);
2515 }
2516
2517 static int ext4_readpage(struct file *file, struct page *page)
2518 {
2519         trace_ext4_readpage(page);
2520         return mpage_readpage(page, ext4_get_block);
2521 }
2522
2523 static int
2524 ext4_readpages(struct file *file, struct address_space *mapping,
2525                 struct list_head *pages, unsigned nr_pages)
2526 {
2527         return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
2528 }
2529
2530 static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
2531 {
2532         struct buffer_head *head, *bh;
2533         unsigned int curr_off = 0;
2534
2535         if (!page_has_buffers(page))
2536                 return;
2537         head = bh = page_buffers(page);
2538         do {
2539                 if (offset <= curr_off && test_clear_buffer_uninit(bh)
2540                                         && bh->b_private) {
2541                         ext4_free_io_end(bh->b_private);
2542                         bh->b_private = NULL;
2543                         bh->b_end_io = NULL;
2544                 }
2545                 curr_off = curr_off + bh->b_size;
2546                 bh = bh->b_this_page;
2547         } while (bh != head);
2548 }
2549
2550 static void ext4_invalidatepage(struct page *page, unsigned long offset)
2551 {
2552         journal_t *journal = EXT4_JOURNAL(page->mapping->host);
2553
2554         trace_ext4_invalidatepage(page, offset);
2555
2556         /*
2557          * free any io_end structure allocated for buffers to be discarded
2558          */
2559         if (ext4_should_dioread_nolock(page->mapping->host))
2560                 ext4_invalidatepage_free_endio(page, offset);
2561         /*
2562          * If it's a full truncate we just forget about the pending dirtying
2563          */
2564         if (offset == 0)
2565                 ClearPageChecked(page);
2566
2567         if (journal)
2568                 jbd2_journal_invalidatepage(journal, page, offset);
2569         else
2570                 block_invalidatepage(page, offset);
2571 }
2572
2573 static int ext4_releasepage(struct page *page, gfp_t wait)
2574 {
2575         journal_t *journal = EXT4_JOURNAL(page->mapping->host);
2576
2577         trace_ext4_releasepage(page);
2578
2579         WARN_ON(PageChecked(page));
2580         if (!page_has_buffers(page))
2581                 return 0;
2582         if (journal)
2583                 return jbd2_journal_try_to_free_buffers(journal, page, wait);
2584         else
2585                 return try_to_free_buffers(page);
2586 }
2587
2588 /*
2589  * ext4_get_block used when preparing for a DIO write or buffer write.
2590  * We allocate an uinitialized extent if blocks haven't been allocated.
2591  * The extent will be converted to initialized after the IO is complete.
2592  */
2593 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
2594                    struct buffer_head *bh_result, int create)
2595 {
2596         ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
2597                    inode->i_ino, create);
2598         return _ext4_get_block(inode, iblock, bh_result,
2599                                EXT4_GET_BLOCKS_IO_CREATE_EXT);
2600 }
2601
2602 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
2603                             ssize_t size, void *private, int ret,
2604                             bool is_async)
2605 {
2606         struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
2607         ext4_io_end_t *io_end = iocb->private;
2608         struct workqueue_struct *wq;
2609         unsigned long flags;
2610         struct ext4_inode_info *ei;
2611
2612         /* if not async direct IO or dio with 0 bytes write, just return */
2613         if (!io_end || !size)
2614                 goto out;
2615
2616         ext_debug("ext4_end_io_dio(): io_end 0x%p"
2617                   "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
2618                   iocb->private, io_end->inode->i_ino, iocb, offset,
2619                   size);
2620
2621         /* if not aio dio with unwritten extents, just free io and return */
2622         if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
2623                 ext4_free_io_end(io_end);
2624                 iocb->private = NULL;
2625 out:
2626                 if (is_async)
2627                         aio_complete(iocb, ret, 0);
2628                 inode_dio_done(inode);
2629                 return;
2630         }
2631
2632         io_end->offset = offset;
2633         io_end->size = size;
2634         if (is_async) {
2635                 io_end->iocb = iocb;
2636                 io_end->result = ret;
2637         }
2638         wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
2639
2640         /* Add the io_end to per-inode completed aio dio list*/
2641         ei = EXT4_I(io_end->inode);
2642         spin_lock_irqsave(&ei->i_completed_io_lock, flags);
2643         list_add_tail(&io_end->list, &ei->i_completed_io_list);
2644         spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
2645
2646         /* queue the work to convert unwritten extents to written */
2647         queue_work(wq, &io_end->work);
2648         iocb->private = NULL;
2649
2650         /* XXX: probably should move into the real I/O completion handler */
2651         inode_dio_done(inode);
2652 }
2653
2654 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
2655 {
2656         ext4_io_end_t *io_end = bh->b_private;
2657         struct workqueue_struct *wq;
2658         struct inode *inode;
2659         unsigned long flags;
2660
2661         if (!test_clear_buffer_uninit(bh) || !io_end)
2662                 goto out;
2663
2664         if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
2665                 printk("sb umounted, discard end_io request for inode %lu\n",
2666                         io_end->inode->i_ino);
2667                 ext4_free_io_end(io_end);
2668                 goto out;
2669         }
2670
2671         io_end->flag = EXT4_IO_END_UNWRITTEN;
2672         inode = io_end->inode;
2673
2674         /* Add the io_end to per-inode completed io list*/
2675         spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
2676         list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
2677         spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
2678
2679         wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
2680         /* queue the work to convert unwritten extents to written */
2681         queue_work(wq, &io_end->work);
2682 out:
2683         bh->b_private = NULL;
2684         bh->b_end_io = NULL;
2685         clear_buffer_uninit(bh);
2686         end_buffer_async_write(bh, uptodate);
2687 }
2688
2689 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
2690 {
2691         ext4_io_end_t *io_end;
2692         struct page *page = bh->b_page;
2693         loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
2694         size_t size = bh->b_size;
2695
2696 retry:
2697         io_end = ext4_init_io_end(inode, GFP_ATOMIC);
2698         if (!io_end) {
2699                 pr_warn_ratelimited("%s: allocation fail\n", __func__);
2700                 schedule();
2701                 goto retry;
2702         }
2703         io_end->offset = offset;
2704         io_end->size = size;
2705         /*
2706          * We need to hold a reference to the page to make sure it
2707          * doesn't get evicted before ext4_end_io_work() has a chance
2708          * to convert the extent from written to unwritten.
2709          */
2710         io_end->page = page;
2711         get_page(io_end->page);
2712
2713         bh->b_private = io_end;
2714         bh->b_end_io = ext4_end_io_buffer_write;
2715         return 0;
2716 }
2717
2718 /*
2719  * For ext4 extent files, ext4 will do direct-io write to holes,
2720  * preallocated extents, and those write extend the file, no need to
2721  * fall back to buffered IO.
2722  *
2723  * For holes, we fallocate those blocks, mark them as uninitialized
2724  * If those blocks were preallocated, we mark sure they are splited, but
2725  * still keep the range to write as uninitialized.
2726  *
2727  * The unwrritten extents will be converted to written when DIO is completed.
2728  * For async direct IO, since the IO may still pending when return, we
2729  * set up an end_io call back function, which will do the conversion
2730  * when async direct IO completed.
2731  *
2732  * If the O_DIRECT write will extend the file then add this inode to the
2733  * orphan list.  So recovery will truncate it back to the original size
2734  * if the machine crashes during the write.
2735  *
2736  */
2737 static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
2738                               const struct iovec *iov, loff_t offset,
2739                               unsigned long nr_segs)
2740 {
2741         struct file *file = iocb->ki_filp;
2742         struct inode *inode = file->f_mapping->host;
2743         ssize_t ret;
2744         size_t count = iov_length(iov, nr_segs);
2745
2746         loff_t final_size = offset + count;
2747         if (rw == WRITE && final_size <= inode->i_size) {
2748                 /*
2749                  * We could direct write to holes and fallocate.
2750                  *
2751                  * Allocated blocks to fill the hole are marked as uninitialized
2752                  * to prevent parallel buffered read to expose the stale data
2753                  * before DIO complete the data IO.
2754                  *
2755                  * As to previously fallocated extents, ext4 get_block
2756                  * will just simply mark the buffer mapped but still
2757                  * keep the extents uninitialized.
2758                  *
2759                  * for non AIO case, we will convert those unwritten extents
2760                  * to written after return back from blockdev_direct_IO.
2761                  *
2762                  * for async DIO, the conversion needs to be defered when
2763                  * the IO is completed. The ext4 end_io callback function
2764                  * will be called to take care of the conversion work.
2765                  * Here for async case, we allocate an io_end structure to
2766                  * hook to the iocb.
2767                  */
2768                 iocb->private = NULL;
2769                 EXT4_I(inode)->cur_aio_dio = NULL;
2770                 if (!is_sync_kiocb(iocb)) {
2771                         iocb->private = ext4_init_io_end(inode, GFP_NOFS);
2772                         if (!iocb->private)
2773                                 return -ENOMEM;
2774                         /*
2775                          * we save the io structure for current async
2776                          * direct IO, so that later ext4_map_blocks()
2777                          * could flag the io structure whether there
2778                          * is a unwritten extents needs to be converted
2779                          * when IO is completed.
2780                          */
2781                         EXT4_I(inode)->cur_aio_dio = iocb->private;
2782                 }
2783
2784                 ret = __blockdev_direct_IO(rw, iocb, inode,
2785                                          inode->i_sb->s_bdev, iov,
2786                                          offset, nr_segs,
2787                                          ext4_get_block_write,
2788                                          ext4_end_io_dio,
2789                                          NULL,
2790                                          DIO_LOCKING | DIO_SKIP_HOLES);
2791                 if (iocb->private)
2792                         EXT4_I(inode)->cur_aio_dio = NULL;
2793                 /*
2794                  * The io_end structure takes a reference to the inode,
2795                  * that structure needs to be destroyed and the
2796                  * reference to the inode need to be dropped, when IO is
2797                  * complete, even with 0 byte write, or failed.
2798                  *
2799                  * In the successful AIO DIO case, the io_end structure will be
2800                  * desctroyed and the reference to the inode will be dropped
2801                  * after the end_io call back function is called.
2802                  *
2803                  * In the case there is 0 byte write, or error case, since
2804                  * VFS direct IO won't invoke the end_io call back function,
2805                  * we need to free the end_io structure here.
2806                  */
2807                 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
2808                         ext4_free_io_end(iocb->private);
2809                         iocb->private = NULL;
2810                 } else if (ret > 0 && ext4_test_inode_state(inode,
2811                                                 EXT4_STATE_DIO_UNWRITTEN)) {
2812                         int err;
2813                         /*
2814                          * for non AIO case, since the IO is already
2815                          * completed, we could do the conversion right here
2816                          */
2817                         err = ext4_convert_unwritten_extents(inode,
2818                                                              offset, ret);
2819                         if (err < 0)
2820                                 ret = err;
2821                         ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
2822                 }
2823                 return ret;
2824         }
2825
2826         /* for write the the end of file case, we fall back to old way */
2827         return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
2828 }
2829
2830 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
2831                               const struct iovec *iov, loff_t offset,
2832                               unsigned long nr_segs)
2833 {
2834         struct file *file = iocb->ki_filp;
2835         struct inode *inode = file->f_mapping->host;
2836         ssize_t ret;
2837
2838         trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
2839         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
2840                 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
2841         else
2842                 ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
2843         trace_ext4_direct_IO_exit(inode, offset,
2844                                 iov_length(iov, nr_segs), rw, ret);
2845         return ret;
2846 }
2847
/*
 * ->set_page_dirty for data-journalled files.
 *
 * Pages can be dirtied completely asynchronously from ext4's journalling
 * activity (filemap_sync_pte(), try_to_unmap_one(), etc.).  Little can
 * be done at that point: ->set_page_dirty is called under VFS locks and
 * the page is not necessarily locked.
 *
 * Dirtying the page while leaving its buffers clean is not an option,
 * because the buffers' dirty state is "definitive".  Marking the buffers
 * dirty or jbddirty is not an option either, because all the journalling
 * code would explode.
 *
 * So the page is marked "pending dirty" (PageChecked), and the next
 * writepage call propagates that into the buffers appropriately.
 */
static int ext4_journalled_set_page_dirty(struct page *page)
{
        int ret;

        SetPageChecked(page);
        ret = __set_page_dirty_nobuffers(page);

        return ret;
}
2866
2867 static const struct address_space_operations ext4_ordered_aops = {
2868         .readpage               = ext4_readpage,
2869         .readpages              = ext4_readpages,
2870         .writepage              = ext4_writepage,
2871         .write_begin            = ext4_write_begin,
2872         .write_end              = ext4_ordered_write_end,
2873         .bmap                   = ext4_bmap,
2874         .invalidatepage         = ext4_invalidatepage,
2875         .releasepage            = ext4_releasepage,
2876         .direct_IO              = ext4_direct_IO,
2877         .migratepage            = buffer_migrate_page,
2878         .is_partially_uptodate  = block_is_partially_uptodate,
2879         .error_remove_page      = generic_error_remove_page,
2880 };
2881
2882 static const struct address_space_operations ext4_writeback_aops = {
2883         .readpage               = ext4_readpage,
2884         .readpages              = ext4_readpages,
2885         .writepage              = ext4_writepage,
2886         .write_begin            = ext4_write_begin,
2887         .write_end              = ext4_writeback_write_end,
2888         .bmap                   = ext4_bmap,
2889         .invalidatepage         = ext4_invalidatepage,
2890         .releasepage            = ext4_releasepage,
2891         .direct_IO              = ext4_direct_IO,
2892         .migratepage            = buffer_migrate_page,
2893         .is_partially_uptodate  = block_is_partially_uptodate,
2894         .error_remove_page      = generic_error_remove_page,
2895 };
2896
2897 static const struct address_space_operations ext4_journalled_aops = {
2898         .readpage               = ext4_readpage,
2899         .readpages              = ext4_readpages,
2900         .writepage              = ext4_writepage,
2901         .write_begin            = ext4_write_begin,
2902         .write_end              = ext4_journalled_write_end,
2903         .set_page_dirty         = ext4_journalled_set_page_dirty,
2904         .bmap                   = ext4_bmap,
2905         .invalidatepage         = ext4_invalidatepage,
2906         .releasepage            = ext4_releasepage,
2907         .is_partially_uptodate  = block_is_partially_uptodate,
2908         .error_remove_page      = generic_error_remove_page,
2909 };
2910
2911 static const struct address_space_operations ext4_da_aops = {
2912         .readpage               = ext4_readpage,
2913         .readpages              = ext4_readpages,
2914         .writepage              = ext4_writepage,
2915         .writepages             = ext4_da_writepages,
2916         .write_begin            = ext4_da_write_begin,
2917         .write_end              = ext4_da_write_end,
2918         .bmap                   = ext4_bmap,
2919         .invalidatepage         = ext4_da_invalidatepage,
2920         .releasepage            = ext4_releasepage,
2921         .direct_IO              = ext4_direct_IO,
2922         .migratepage            = buffer_migrate_page,
2923         .is_partially_uptodate  = block_is_partially_uptodate,
2924         .error_remove_page      = generic_error_remove_page,
2925 };
2926
2927 void ext4_set_aops(struct inode *inode)
2928 {
2929         if (ext4_should_order_data(inode) &&
2930                 test_opt(inode->i_sb, DELALLOC))
2931                 inode->i_mapping->a_ops = &ext4_da_aops;
2932         else if (ext4_should_order_data(inode))
2933                 inode->i_mapping->a_ops = &ext4_ordered_aops;
2934         else if (ext4_should_writeback_data(inode) &&
2935                  test_opt(inode->i_sb, DELALLOC))
2936                 inode->i_mapping->a_ops = &ext4_da_aops;
2937         else if (ext4_should_writeback_data(inode))
2938                 inode->i_mapping->a_ops = &ext4_writeback_aops;
2939         else
2940                 inode->i_mapping->a_ops = &ext4_journalled_aops;
2941 }
2942
2943 /*
2944  * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
2945  * up to the end of the block which corresponds to `from'.
2946  * This required during truncate. We need to physically zero the tail end
2947  * of that block so it doesn't yield old data if the file is later grown.
2948  */
2949 int ext4_block_truncate_page(handle_t *handle,
2950                 struct address_space *mapping, loff_t from)
2951 {
2952         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2953         unsigned length;
2954         unsigned blocksize;
2955         struct inode *inode = mapping->host;
2956
2957         blocksize = inode->i_sb->s_blocksize;
2958         length = blocksize - (offset & (blocksize - 1));
2959
2960         return ext4_block_zero_page_range(handle, mapping, from, length);
2961 }
2962
2963 /*
2964  * ext4_block_zero_page_range() zeros out a mapping of length 'length'
2965  * starting from file offset 'from'.  The range to be zero'd must
2966  * be contained with in one block.  If the specified range exceeds
2967  * the end of the block it will be shortened to end of the block
2968  * that cooresponds to 'from'
2969  */
2970 int ext4_block_zero_page_range(handle_t *handle,
2971                 struct address_space *mapping, loff_t from, loff_t length)
2972 {
2973         ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
2974         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2975         unsigned blocksize, max, pos;
2976         ext4_lblk_t iblock;
2977         struct inode *inode = mapping->host;
2978         struct buffer_head *bh;
2979         struct page *page;
2980         int err = 0;
2981
2982         page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
2983                                    mapping_gfp_mask(mapping) & ~__GFP_FS);
2984         if (!page)
2985                 return -EINVAL;
2986
2987         blocksize = inode->i_sb->s_blocksize;
2988         max = blocksize - (offset & (blocksize - 1));
2989
2990         /*
2991          * correct length if it does not fall between
2992          * 'from' and the end of the block
2993          */
2994         if (length > max || length < 0)
2995                 length = max;
2996
2997         iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
2998
2999         if (!page_has_buffers(page))
3000                 create_empty_buffers(page, blocksize, 0);
3001
3002         /* Find the buffer that contains "offset" */
3003         bh = page_buffers(page);
3004         pos = blocksize;
3005         while (offset >= pos) {
3006                 bh = bh->b_this_page;
3007                 iblock++;
3008                 pos += blocksize;
3009         }
3010
3011         err = 0;
3012         if (buffer_freed(bh)) {
3013                 BUFFER_TRACE(bh, "freed: skip");
3014                 goto unlock;
3015         }
3016
3017         if (!buffer_mapped(bh)) {
3018                 BUFFER_TRACE(bh, "unmapped");
3019                 ext4_get_block(inode, iblock, bh, 0);
3020                 /* unmapped? It's a hole - nothing to do */
3021                 if (!buffer_mapped(bh)) {
3022                         BUFFER_TRACE(bh, "still unmapped");
3023                         goto unlock;
3024                 }
3025         }
3026
3027         /* Ok, it's mapped. Make sure it's up-to-date */
3028         if (PageUptodate(page))
3029                 set_buffer_uptodate(bh);
3030
3031         if (!buffer_uptodate(bh)) {
3032                 err = -EIO;
3033                 ll_rw_block(READ, 1, &bh);
3034                 wait_on_buffer(bh);
3035                 /* Uhhuh. Read error. Complain and punt. */
3036                 if (!buffer_uptodate(bh))
3037                         goto unlock;
3038         }
3039
3040         if (ext4_should_journal_data(inode)) {
3041                 BUFFER_TRACE(bh, "get write access");
3042                 err = ext4_journal_get_write_access(handle, bh);
3043                 if (err)
3044                         goto unlock;
3045         }
3046
3047         zero_user(page, offset, length);
3048
3049         BUFFER_TRACE(bh, "zeroed end of block");
3050
3051         err = 0;
3052         if (ext4_should_journal_data(inode)) {
3053                 err = ext4_handle_dirty_metadata(handle, inode, bh);
3054         } else {
3055                 if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode)
3056                         err = ext4_jbd2_file_inode(handle, inode);
3057                 mark_buffer_dirty(bh);
3058         }
3059
3060 unlock:
3061         unlock_page(page);
3062         page_cache_release(page);
3063         return err;
3064 }
3065
3066 int ext4_can_truncate(struct inode *inode)
3067 {
3068         if (S_ISREG(inode->i_mode))
3069                 return 1;
3070         if (S_ISDIR(inode->i_mode))
3071                 return 1;
3072         if (S_ISLNK(inode->i_mode))
3073                 return !ext4_inode_is_fast_symlink(inode);
3074         return 0;
3075 }
3076
3077 /*
3078  * ext4_punch_hole: punches a hole in a file by releaseing the blocks
3079  * associated with the given offset and length
3080  *
3081  * @inode:  File inode
3082  * @offset: The offset where the hole will begin
3083  * @len:    The length of the hole
3084  *
3085  * Returns: 0 on sucess or negative on failure
3086  */
3087
3088 int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3089 {
3090         struct inode *inode = file->f_path.dentry->d_inode;
3091         if (!S_ISREG(inode->i_mode))
3092                 return -ENOTSUPP;
3093
3094         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
3095                 /* TODO: Add support for non extent hole punching */
3096                 return -ENOTSUPP;
3097         }
3098
3099         return ext4_ext_punch_hole(file, offset, length);
3100 }
3101
3102 /*
3103  * ext4_truncate()
3104  *
3105  * We block out ext4_get_block() block instantiations across the entire
3106  * transaction, and VFS/VM ensures that ext4_truncate() cannot run
3107  * simultaneously on behalf of the same inode.
3108  *
3109  * As we work through the truncate and commmit bits of it to the journal there
3110  * is one core, guiding principle: the file's tree must always be consistent on
3111  * disk.  We must be able to restart the truncate after a crash.
3112  *
3113  * The file's tree may be transiently inconsistent in memory (although it
3114  * probably isn't), but whenever we close off and commit a journal transaction,
3115  * the contents of (the filesystem + the journal) must be consistent and
3116  * restartable.  It's pretty simple, really: bottom up, right to left (although
3117  * left-to-right works OK too).
3118  *
3119  * Note that at recovery time, journal replay occurs *before* the restart of
3120  * truncate against the orphan inode list.
3121  *
3122  * The committed inode has the new, desired i_size (which is the same as
3123  * i_disksize in this case).  After a crash, ext4_orphan_cleanup() will see
3124  * that this inode's truncate did not complete and it will again call
3125  * ext4_truncate() to have another go.  So there will be instantiated blocks
3126  * to the right of the truncation point in a crashed ext4 filesystem.  But
3127  * that's fine - as long as they are linked from the inode, the post-crash
3128  * ext4_truncate() run will find them and release them.
3129  */
3130 void ext4_truncate(struct inode *inode)
3131 {
3132         trace_ext4_truncate_enter(inode);
3133
3134         if (!ext4_can_truncate(inode))
3135                 return;
3136
3137         ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3138
3139         if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3140                 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
3141
3142         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3143                 ext4_ext_truncate(inode);
3144         else
3145                 ext4_ind_truncate(inode);
3146
3147         trace_ext4_truncate_exit(inode);
3148 }
3149
3150 /*
3151  * ext4_get_inode_loc returns with an extra refcount against the inode's
3152  * underlying buffer_head on success. If 'in_mem' is true, we have all
3153  * data in memory that is needed to recreate the on-disk version of this
3154  * inode.
3155  */
3156 static int __ext4_get_inode_loc(struct inode *inode,
3157                                 struct ext4_iloc *iloc, int in_mem)
3158 {
3159         struct ext4_group_desc  *gdp;
3160         struct buffer_head      *bh;
3161         struct super_block      *sb = inode->i_sb;
3162         ext4_fsblk_t            block;
3163         int                     inodes_per_block, inode_offset;
3164
3165         iloc->bh = NULL;
3166         if (!ext4_valid_inum(sb, inode->i_ino))
3167                 return -EIO;
3168
3169         iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
3170         gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
3171         if (!gdp)
3172                 return -EIO;
3173
3174         /*
3175          * Figure out the offset within the block group inode table
3176          */
3177         inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
3178         inode_offset = ((inode->i_ino - 1) %
3179                         EXT4_INODES_PER_GROUP(sb));
3180         block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
3181         iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
3182
3183         bh = sb_getblk(sb, block);
3184         if (!bh) {
3185                 EXT4_ERROR_INODE_BLOCK(inode, block,
3186                                        "unable to read itable block");
3187                 return -EIO;
3188         }
3189         if (!buffer_uptodate(bh)) {
3190                 lock_buffer(bh);
3191
3192                 /*
3193                  * If the buffer has the write error flag, we have failed
3194                  * to write out another inode in the same block.  In this
3195                  * case, we don't have to read the block because we may
3196                  * read the old inode data successfully.
3197                  */
3198                 if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
3199                         set_buffer_uptodate(bh);
3200
3201                 if (buffer_uptodate(bh)) {
3202                         /* someone brought it uptodate while we waited */
3203                         unlock_buffer(bh);
3204                         goto has_buffer;
3205                 }
3206
3207                 /*
3208                  * If we have all information of the inode in memory and this
3209                  * is the only valid inode in the block, we need not read the
3210                  * block.
3211                  */
3212                 if (in_mem) {
3213                         struct buffer_head *bitmap_bh;
3214                         int i, start;
3215
3216                         start = inode_offset & ~(inodes_per_block - 1);
3217
3218                         /* Is the inode bitmap in cache? */
3219                         bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
3220                         if (!bitmap_bh)
3221                                 goto make_io;
3222
3223                         /*
3224                          * If the inode bitmap isn't in cache then the
3225                          * optimisation may end up performing two reads instead
3226                          * of one, so skip it.
3227                          */
3228                         if (!buffer_uptodate(bitmap_bh)) {
3229                                 brelse(bitmap_bh);
3230                                 goto make_io;
3231                         }
3232                         for (i = start; i < start + inodes_per_block; i++) {
3233                                 if (i == inode_offset)
3234                                         continue;
3235                                 if (ext4_test_bit(i, bitmap_bh->b_data))
3236                                         break;
3237                         }
3238                         brelse(bitmap_bh);
3239                         if (i == start + inodes_per_block) {
3240                                 /* all other inodes are free, so skip I/O */
3241                                 memset(bh->b_data, 0, bh->b_size);
3242                                 set_buffer_uptodate(bh);
3243                                 unlock_buffer(bh);
3244                                 goto has_buffer;
3245                         }
3246                 }
3247
3248 make_io:
3249                 /*
3250                  * If we need to do any I/O, try to pre-readahead extra
3251                  * blocks from the inode table.
3252                  */
3253                 if (EXT4_SB(sb)->s_inode_readahead_blks) {
3254                         ext4_fsblk_t b, end, table;
3255                         unsigned num;
3256
3257                         table = ext4_inode_table(sb, gdp);
3258                         /* s_inode_readahead_blks is always a power of 2 */
3259                         b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
3260                         if (table > b)
3261                                 b = table;
3262                         end = b + EXT4_SB(sb)->s_inode_readahead_blks;
3263                         num = EXT4_INODES_PER_GROUP(sb);
3264                         if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3265                                        EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
3266                                 num -= ext4_itable_unused_count(sb, gdp);
3267                         table += num / inodes_per_block;
3268                         if (end > table)
3269                                 end = table;
3270                         while (b <= end)
3271                                 sb_breadahead(sb, b++);
3272                 }
3273
3274                 /*
3275                  * There are other valid inodes in the buffer, this inode
3276                  * has in-inode xattrs, or we don't have this inode in memory.
3277                  * Read the block from disk.
3278                  */
3279                 trace_ext4_load_inode(inode);
3280                 get_bh(bh);
3281                 bh->b_end_io = end_buffer_read_sync;
3282                 submit_bh(READ_META, bh);
3283                 wait_on_buffer(bh);
3284                 if (!buffer_uptodate(bh)) {
3285                         EXT4_ERROR_INODE_BLOCK(inode, block,
3286                                                "unable to read itable block");
3287                         brelse(bh);
3288                         return -EIO;
3289                 }
3290         }
3291 has_buffer:
3292         iloc->bh = bh;
3293         return 0;
3294 }
3295
3296 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
3297 {
3298         /* We have all inode data except xattrs in memory here. */
3299         return __ext4_get_inode_loc(inode, iloc,
3300                 !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
3301 }
3302
3303 void ext4_set_inode_flags(struct inode *inode)
3304 {
3305         unsigned int flags = EXT4_I(inode)->i_flags;
3306
3307         inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
3308         if (flags & EXT4_SYNC_FL)
3309                 inode->i_flags |= S_SYNC;
3310         if (flags & EXT4_APPEND_FL)
3311                 inode->i_flags |= S_APPEND;
3312         if (flags & EXT4_IMMUTABLE_FL)
3313                 inode->i_flags |= S_IMMUTABLE;
3314         if (flags & EXT4_NOATIME_FL)
3315                 inode->i_flags |= S_NOATIME;
3316         if (flags & EXT4_DIRSYNC_FL)
3317                 inode->i_flags |= S_DIRSYNC;
3318 }
3319
3320 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
3321 void ext4_get_inode_flags(struct ext4_inode_info *ei)
3322 {
3323         unsigned int vfs_fl;
3324         unsigned long old_fl, new_fl;
3325
3326         do {
3327                 vfs_fl = ei->vfs_inode.i_flags;
3328                 old_fl = ei->i_flags;
3329                 new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
3330                                 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
3331                                 EXT4_DIRSYNC_FL);
3332                 if (vfs_fl & S_SYNC)
3333                         new_fl |= EXT4_SYNC_FL;
3334                 if (vfs_fl & S_APPEND)
3335                         new_fl |= EXT4_APPEND_FL;
3336                 if (vfs_fl & S_IMMUTABLE)
3337                         new_fl |= EXT4_IMMUTABLE_FL;
3338                 if (vfs_fl & S_NOATIME)
3339                         new_fl |= EXT4_NOATIME_FL;
3340                 if (vfs_fl & S_DIRSYNC)
3341                         new_fl |= EXT4_DIRSYNC_FL;
3342         } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
3343 }
3344
3345 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
3346                                   struct ext4_inode_info *ei)
3347 {
3348         blkcnt_t i_blocks ;
3349         struct inode *inode = &(ei->vfs_inode);
3350         struct super_block *sb = inode->i_sb;
3351
3352         if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3353                                 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
3354                 /* we are using combined 48 bit field */
3355                 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
3356                                         le32_to_cpu(raw_inode->i_blocks_lo);
3357                 if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
3358                         /* i_blocks represent file system block size */
3359                         return i_blocks  << (inode->i_blkbits - 9);
3360                 } else {
3361                         return i_blocks;
3362                 }
3363         } else {
3364                 return le32_to_cpu(raw_inode->i_blocks_lo);
3365         }
3366 }
3367
3368 struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3369 {
3370         struct ext4_iloc iloc;
3371         struct ext4_inode *raw_inode;
3372         struct ext4_inode_info *ei;
3373         struct inode *inode;
3374         journal_t *journal = EXT4_SB(sb)->s_journal;
3375         long ret;
3376         int block;
3377
3378         inode = iget_locked(sb, ino);
3379         if (!inode)
3380                 return ERR_PTR(-ENOMEM);
3381         if (!(inode->i_state & I_NEW))
3382                 return inode;
3383
3384         ei = EXT4_I(inode);
3385         iloc.bh = NULL;
3386
3387         ret = __ext4_get_inode_loc(inode, &iloc, 0);
3388         if (ret < 0)
3389                 goto bad_inode;
3390         raw_inode = ext4_raw_inode(&iloc);
3391         inode->i_mode = le16_to_cpu(raw_inode->i_mode);
3392         inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
3393         inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
3394         if (!(test_opt(inode->i_sb, NO_UID32))) {
3395                 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
3396                 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
3397         }
3398         inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
3399
3400         ext4_clear_state_flags(ei);     /* Only relevant on 32-bit archs */
3401         ei->i_dir_start_lookup = 0;
3402         ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
3403         /* We now have enough fields to check if the inode was active or not.
3404          * This is needed because nfsd might try to access dead inodes
3405          * the test is that same one that e2fsck uses
3406          * NeilBrown 1999oct15
3407          */
3408         if (inode->i_nlink == 0) {
3409                 if (inode->i_mode == 0 ||
3410                     !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
3411                         /* this inode is deleted */
3412                         ret = -ESTALE;
3413                         goto bad_inode;
3414                 }
3415                 /* The only unlinked inodes we let through here have
3416                  * valid i_mode and are being read by the orphan
3417                  * recovery code: that's fine, we're about to complete
3418                  * the process of deleting those. */
3419         }
3420         ei->i_flags = le32_to_cpu(raw_inode->i_flags);
3421         inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
3422         ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
3423         if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
3424                 ei->i_file_acl |=
3425                         ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
3426         inode->i_size = ext4_isize(raw_inode);
3427         ei->i_disksize = inode->i_size;
3428 #ifdef CONFIG_QUOTA
3429         ei->i_reserved_quota = 0;
3430 #endif
3431         inode->i_generation = le32_to_cpu(raw_inode->i_generation);
3432         ei->i_block_group = iloc.block_group;
3433         ei->i_last_alloc_group = ~0;
3434         /*
3435          * NOTE! The in-memory inode i_data array is in little-endian order
3436          * even on big-endian machines: we do NOT byteswap the block numbers!
3437          */
3438         for (block = 0; block < EXT4_N_BLOCKS; block++)
3439                 ei->i_data[block] = raw_inode->i_block[block];
3440         INIT_LIST_HEAD(&ei->i_orphan);
3441
3442         /*
3443          * Set transaction id's of transactions that have to be committed
3444          * to finish f[data]sync. We set them to currently running transaction
3445          * as we cannot be sure that the inode or some of its metadata isn't
3446          * part of the transaction - the inode could have been reclaimed and
3447          * now it is reread from disk.
3448          */
3449         if (journal) {
3450                 transaction_t *transaction;
3451                 tid_t tid;
3452
3453                 read_lock(&journal->j_state_lock);
3454                 if (journal->j_running_transaction)
3455                         transaction = journal->j_running_transaction;
3456                 else
3457                         transaction = journal->j_committing_transaction;
3458                 if (transaction)
3459                         tid = transaction->t_tid;
3460                 else
3461                         tid = journal->j_commit_sequence;
3462                 read_unlock(&journal->j_state_lock);
3463                 ei->i_sync_tid = tid;
3464                 ei->i_datasync_tid = tid;
3465         }
3466
3467         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
3468                 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
3469                 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
3470                     EXT4_INODE_SIZE(inode->i_sb)) {
3471                         ret = -EIO;
3472                         goto bad_inode;
3473                 }
3474                 if (ei->i_extra_isize == 0) {
3475                         /* The extra space is currently unused. Use it. */
3476                         ei->i_extra_isize = sizeof(struct ext4_inode) -
3477                                             EXT4_GOOD_OLD_INODE_SIZE;
3478                 } else {
3479                         __le32 *magic = (void *)raw_inode +
3480                                         EXT4_GOOD_OLD_INODE_SIZE +
3481                                         ei->i_extra_isize;
3482                         if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
3483                                 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
3484                 }
3485         } else
3486                 ei->i_extra_isize = 0;
3487
3488         EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
3489         EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
3490         EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
3491         EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
3492
3493         inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
3494         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
3495                 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
3496                         inode->i_version |=
3497                         (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
3498         }
3499
3500         ret = 0;
3501         if (ei->i_file_acl &&
3502             !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
3503                 EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
3504                                  ei->i_file_acl);
3505                 ret = -EIO;
3506                 goto bad_inode;
3507         } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
3508                 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
3509                     (S_ISLNK(inode->i_mode) &&
3510                      !ext4_inode_is_fast_symlink(inode)))
3511                         /* Validate extent which is part of inode */
3512                         ret = ext4_ext_check_inode(inode);
3513         } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
3514                    (S_ISLNK(inode->i_mode) &&
3515                     !ext4_inode_is_fast_symlink(inode))) {
3516                 /* Validate block references which are part of inode */
3517                 ret = ext4_ind_check_inode(inode);
3518         }
3519         if (ret)
3520                 goto bad_inode;
3521
3522         if (S_ISREG(inode->i_mode)) {
3523                 inode->i_op = &ext4_file_inode_operations;
3524                 inode->i_fop = &ext4_file_operations;
3525                 ext4_set_aops(inode);
3526         } else if (S_ISDIR(inode->i_mode)) {
3527                 inode->i_op = &ext4_dir_inode_operations;
3528                 inode->i_fop = &ext4_dir_operations;
3529         } else if (S_ISLNK(inode->i_mode)) {
3530                 if (ext4_inode_is_fast_symlink(inode)) {
3531                         inode->i_op = &ext4_fast_symlink_inode_operations;
3532                         nd_terminate_link(ei->i_data, inode->i_size,
3533                                 sizeof(ei->i_data) - 1);
3534                 } else {
3535                         inode->i_op = &ext4_symlink_inode_operations;
3536                         ext4_set_aops(inode);
3537                 }
3538         } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
3539               S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
3540                 inode->i_op = &ext4_special_inode_operations;
3541                 if (raw_inode->i_block[0])
3542                         init_special_inode(inode, inode->i_mode,
3543                            old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
3544                 else
3545                         init_special_inode(inode, inode->i_mode,
3546                            new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
3547         } else {
3548                 ret = -EIO;
3549                 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
3550                 goto bad_inode;
3551         }
3552         brelse(iloc.bh);
3553         ext4_set_inode_flags(inode);
3554         unlock_new_inode(inode);
3555         return inode;
3556
3557 bad_inode:
3558         brelse(iloc.bh);
3559         iget_failed(inode);
3560         return ERR_PTR(ret);
3561 }
3562
3563 static int ext4_inode_blocks_set(handle_t *handle,
3564                                 struct ext4_inode *raw_inode,
3565                                 struct ext4_inode_info *ei)
3566 {
3567         struct inode *inode = &(ei->vfs_inode);
3568         u64 i_blocks = inode->i_blocks;
3569         struct super_block *sb = inode->i_sb;
3570
3571         if (i_blocks <= ~0U) {
3572                 /*
3573                  * i_blocks can be represnted in a 32 bit variable
3574                  * as multiple of 512 bytes
3575                  */
3576                 raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
3577                 raw_inode->i_blocks_high = 0;
3578                 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
3579                 return 0;
3580         }
3581         if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
3582                 return -EFBIG;
3583
3584         if (i_blocks <= 0xffffffffffffULL) {
3585                 /*
3586                  * i_blocks can be represented in a 48 bit variable
3587                  * as multiple of 512 bytes
3588                  */
3589                 raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
3590                 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
3591                 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
3592         } else {
3593                 ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
3594                 /* i_block is stored in file system block size */
3595                 i_blocks = i_blocks >> (inode->i_blkbits - 9);
3596                 raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
3597                 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
3598         }
3599         return 0;
3600 }
3601
3602 /*
3603  * Post the struct inode info into an on-disk inode location in the
3604  * buffer-cache.  This gobbles the caller's reference to the
3605  * buffer_head in the inode location struct.
3606  *
3607  * The caller must have write access to iloc->bh.
3608  */
3609 static int ext4_do_update_inode(handle_t *handle,
3610                                 struct inode *inode,
3611                                 struct ext4_iloc *iloc)
3612 {
3613         struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
3614         struct ext4_inode_info *ei = EXT4_I(inode);
3615         struct buffer_head *bh = iloc->bh;
3616         int err = 0, rc, block;
3617
3618         /* For fields not not tracking in the in-memory inode,
3619          * initialise them to zero for new inodes. */
3620         if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
3621                 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
3622
3623         ext4_get_inode_flags(ei);
3624         raw_inode->i_mode = cpu_to_le16(inode->i_mode);
3625         if (!(test_opt(inode->i_sb, NO_UID32))) {
3626                 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
3627                 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
3628 /*
3629  * Fix up interoperability with old kernels. Otherwise, old inodes get
3630  * re-used with the upper 16 bits of the uid/gid intact
3631  */
3632                 if (!ei->i_dtime) {
3633                         raw_inode->i_uid_high =
3634                                 cpu_to_le16(high_16_bits(inode->i_uid));
3635                         raw_inode->i_gid_high =
3636                                 cpu_to_le16(high_16_bits(inode->i_gid));
3637                 } else {
3638                         raw_inode->i_uid_high = 0;
3639                         raw_inode->i_gid_high = 0;
3640                 }
3641         } else {
3642                 raw_inode->i_uid_low =
3643                         cpu_to_le16(fs_high2lowuid(inode->i_uid));
3644                 raw_inode->i_gid_low =
3645                         cpu_to_le16(fs_high2lowgid(inode->i_gid));
3646                 raw_inode->i_uid_high = 0;
3647                 raw_inode->i_gid_high = 0;
3648         }
3649         raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
3650
3651         EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
3652         EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
3653         EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
3654         EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
3655
3656         if (ext4_inode_blocks_set(handle, raw_inode, ei))
3657                 goto out_brelse;
3658         raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
3659         raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
3660         if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
3661             cpu_to_le32(EXT4_OS_HURD))
3662                 raw_inode->i_file_acl_high =
3663                         cpu_to_le16(ei->i_file_acl >> 32);
3664         raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
3665         ext4_isize_set(raw_inode, ei->i_disksize);
3666         if (ei->i_disksize > 0x7fffffffULL) {
3667                 struct super_block *sb = inode->i_sb;
3668                 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
3669                                 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
3670                                 EXT4_SB(sb)->s_es->s_rev_level ==
3671                                 cpu_to_le32(EXT4_GOOD_OLD_REV)) {
3672                         /* If this is the first large file
3673                          * created, add a flag to the superblock.
3674                          */
3675                         err = ext4_journal_get_write_access(handle,
3676                                         EXT4_SB(sb)->s_sbh);
3677                         if (err)
3678                                 goto out_brelse;
3679                         ext4_update_dynamic_rev(sb);
3680                         EXT4_SET_RO_COMPAT_FEATURE(sb,
3681                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
3682                         sb->s_dirt = 1;
3683                         ext4_handle_sync(handle);
3684                         err = ext4_handle_dirty_metadata(handle, NULL,
3685                                         EXT4_SB(sb)->s_sbh);
3686                 }
3687         }
3688         raw_inode->i_generation = cpu_to_le32(inode->i_generation);
3689         if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
3690                 if (old_valid_dev(inode->i_rdev)) {
3691                         raw_inode->i_block[0] =
3692                                 cpu_to_le32(old_encode_dev(inode->i_rdev));
3693                         raw_inode->i_block[1] = 0;
3694                 } else {
3695                         raw_inode->i_block[0] = 0;
3696                         raw_inode->i_block[1] =
3697                                 cpu_to_le32(new_encode_dev(inode->i_rdev));
3698                         raw_inode->i_block[2] = 0;
3699                 }
3700         } else
3701                 for (block = 0; block < EXT4_N_BLOCKS; block++)
3702                         raw_inode->i_block[block] = ei->i_data[block];
3703
3704         raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
3705         if (ei->i_extra_isize) {
3706                 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
3707                         raw_inode->i_version_hi =
3708                         cpu_to_le32(inode->i_version >> 32);
3709                 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
3710         }
3711
3712         BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
3713         rc = ext4_handle_dirty_metadata(handle, NULL, bh);
3714         if (!err)
3715                 err = rc;
3716         ext4_clear_inode_state(inode, EXT4_STATE_NEW);
3717
3718         ext4_update_inode_fsync_trans(handle, inode, 0);
3719 out_brelse:
3720         brelse(bh);
3721         ext4_std_error(inode->i_sb, err);
3722         return err;
3723 }
3724
3725 /*
3726  * ext4_write_inode()
3727  *
3728  * We are called from a few places:
3729  *
3730  * - Within generic_file_write() for O_SYNC files.
3731  *   Here, there will be no transaction running. We wait for any running
3732  *   trasnaction to commit.
3733  *
3734  * - Within sys_sync(), kupdate and such.
3735  *   We wait on commit, if tol to.
3736  *
3737  * - Within prune_icache() (PF_MEMALLOC == true)
3738  *   Here we simply return.  We can't afford to block kswapd on the
3739  *   journal commit.
3740  *
3741  * In all cases it is actually safe for us to return without doing anything,
3742  * because the inode has been copied into a raw inode buffer in
3743  * ext4_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
3744  * knfsd.
3745  *
3746  * Note that we are absolutely dependent upon all inode dirtiers doing the
3747  * right thing: they *must* call mark_inode_dirty() after dirtying info in
3748  * which we are interested.
3749  *
3750  * It would be a bug for them to not do this.  The code:
3751  *
3752  *      mark_inode_dirty(inode)
3753  *      stuff();
3754  *      inode->i_size = expr;
3755  *
3756  * is in error because a kswapd-driven write_inode() could occur while
3757  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
3758  * will no longer be on the superblock's dirty inode list.
3759  */
3760 int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
3761 {
3762         int err;
3763
3764         if (current->flags & PF_MEMALLOC)
3765                 return 0;
3766
3767         if (EXT4_SB(inode->i_sb)->s_journal) {
3768                 if (ext4_journal_current_handle()) {
3769                         jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
3770                         dump_stack();
3771                         return -EIO;
3772                 }
3773
3774                 if (wbc->sync_mode != WB_SYNC_ALL)
3775                         return 0;
3776
3777                 err = ext4_force_commit(inode->i_sb);
3778         } else {
3779                 struct ext4_iloc iloc;
3780
3781                 err = __ext4_get_inode_loc(inode, &iloc, 0);
3782                 if (err)
3783                         return err;
3784                 if (wbc->sync_mode == WB_SYNC_ALL)
3785                         sync_dirty_buffer(iloc.bh);
3786                 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
3787                         EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
3788                                          "IO error syncing inode");
3789                         err = -EIO;
3790                 }
3791                 brelse(iloc.bh);
3792         }
3793         return err;
3794 }
3795
3796 /*
3797  * ext4_setattr()
3798  *
3799  * Called from notify_change.
3800  *
3801  * We want to trap VFS attempts to truncate the file as soon as
3802  * possible.  In particular, we want to make sure that when the VFS
3803  * shrinks i_size, we put the inode on the orphan list and modify
3804  * i_disksize immediately, so that during the subsequent flushing of
3805  * dirty pages and freeing of disk blocks, we can guarantee that any
3806  * commit will leave the blocks being flushed in an unused state on
3807  * disk.  (On recovery, the inode will get truncated and the blocks will
3808  * be freed, so we have a strong guarantee that no future commit will
3809  * leave these blocks visible to the user.)
3810  *
3811  * Another thing we have to assure is that if we are in ordered mode
3812  * and inode is still attached to the committing transaction, we must
3813  * we start writeout of all the dirty pages which are being truncated.
3814  * This way we are sure that all the data written in the previous
3815  * transaction are already on disk (truncate waits for pages under
3816  * writeback).
3817  *
3818  * Called with inode->i_mutex down.
3819  */
3820 int ext4_setattr(struct dentry *dentry, struct iattr *attr)
3821 {
3822         struct inode *inode = dentry->d_inode;
3823         int error, rc = 0;
3824         int orphan = 0;
3825         const unsigned int ia_valid = attr->ia_valid;
3826
3827         error = inode_change_ok(inode, attr);
3828         if (error)
3829                 return error;
3830
3831         if (is_quota_modification(inode, attr))
3832                 dquot_initialize(inode);
3833         if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
3834                 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
3835                 handle_t *handle;
3836
3837                 /* (user+group)*(old+new) structure, inode write (sb,
3838                  * inode block, ? - but truncate inode update has it) */
3839                 handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
3840                                         EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
3841                 if (IS_ERR(handle)) {
3842                         error = PTR_ERR(handle);
3843                         goto err_out;
3844                 }
3845                 error = dquot_transfer(inode, attr);
3846                 if (error) {
3847                         ext4_journal_stop(handle);
3848                         return error;
3849                 }
3850                 /* Update corresponding info in inode so that everything is in
3851                  * one transaction */
3852                 if (attr->ia_valid & ATTR_UID)
3853                         inode->i_uid = attr->ia_uid;
3854                 if (attr->ia_valid & ATTR_GID)
3855                         inode->i_gid = attr->ia_gid;
3856                 error = ext4_mark_inode_dirty(handle, inode);
3857                 ext4_journal_stop(handle);
3858         }
3859
3860         if (attr->ia_valid & ATTR_SIZE) {
3861                 inode_dio_wait(inode);
3862
3863                 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
3864                         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3865
3866                         if (attr->ia_size > sbi->s_bitmap_maxbytes)
3867                                 return -EFBIG;
3868                 }
3869         }
3870
3871         if (S_ISREG(inode->i_mode) &&
3872             attr->ia_valid & ATTR_SIZE &&
3873             (attr->ia_size < inode->i_size)) {
3874                 handle_t *handle;
3875
3876                 handle = ext4_journal_start(inode, 3);
3877                 if (IS_ERR(handle)) {
3878                         error = PTR_ERR(handle);
3879                         goto err_out;
3880                 }
3881                 if (ext4_handle_valid(handle)) {
3882                         error = ext4_orphan_add(handle, inode);
3883                         orphan = 1;
3884                 }
3885                 EXT4_I(inode)->i_disksize = attr->ia_size;
3886                 rc = ext4_mark_inode_dirty(handle, inode);
3887                 if (!error)
3888                         error = rc;
3889                 ext4_journal_stop(handle);
3890
3891                 if (ext4_should_order_data(inode)) {
3892                         error = ext4_begin_ordered_truncate(inode,
3893                                                             attr->ia_size);
3894                         if (error) {
3895                                 /* Do as much error cleanup as possible */
3896                                 handle = ext4_journal_start(inode, 3);
3897                                 if (IS_ERR(handle)) {
3898                                         ext4_orphan_del(NULL, inode);
3899                                         goto err_out;
3900                                 }
3901                                 ext4_orphan_del(handle, inode);
3902                                 orphan = 0;
3903                                 ext4_journal_stop(handle);
3904                                 goto err_out;
3905                         }
3906                 }
3907         }
3908
3909         if (attr->ia_valid & ATTR_SIZE) {
3910                 if (attr->ia_size != i_size_read(inode)) {
3911                         truncate_setsize(inode, attr->ia_size);
3912                         ext4_truncate(inode);
3913                 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3914                         ext4_truncate(inode);
3915         }
3916
3917         if (!rc) {
3918                 setattr_copy(inode, attr);
3919                 mark_inode_dirty(inode);
3920         }
3921
3922         /*
3923          * If the call to ext4_truncate failed to get a transaction handle at
3924          * all, we need to clean up the in-core orphan list manually.
3925          */
3926         if (orphan && inode->i_nlink)
3927                 ext4_orphan_del(NULL, inode);
3928
3929         if (!rc && (ia_valid & ATTR_MODE))
3930                 rc = ext4_acl_chmod(inode);
3931
3932 err_out:
3933         ext4_std_error(inode->i_sb, error);
3934         if (!error)
3935                 error = rc;
3936         return error;
3937 }
3938
3939 int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
3940                  struct kstat *stat)
3941 {
3942         struct inode *inode;
3943         unsigned long delalloc_blocks;
3944
3945         inode = dentry->d_inode;
3946         generic_fillattr(inode, stat);
3947
3948         /*
3949          * We can't update i_blocks if the block allocation is delayed
3950          * otherwise in the case of system crash before the real block
3951          * allocation is done, we will have i_blocks inconsistent with
3952          * on-disk file blocks.
3953          * We always keep i_blocks updated together with real
3954          * allocation. But to not confuse with user, stat
3955          * will return the blocks that include the delayed allocation
3956          * blocks for this file.
3957          */
3958         delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
3959
3960         stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
3961         return 0;
3962 }
3963
3964 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
3965 {
3966         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3967                 return ext4_ind_trans_blocks(inode, nrblocks, chunk);
3968         return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
3969 }
3970
3971 /*
3972  * Account for index blocks, block groups bitmaps and block group
3973  * descriptor blocks if modify datablocks and index blocks
3974  * worse case, the indexs blocks spread over different block groups
3975  *
3976  * If datablocks are discontiguous, they are possible to spread over
3977  * different block groups too. If they are contiuguous, with flexbg,
3978  * they could still across block group boundary.
3979  *
3980  * Also account for superblock, inode, quota and xattr blocks
3981  */
3982 static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
3983 {
3984         ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
3985         int gdpblocks;
3986         int idxblocks;
3987         int ret = 0;
3988
3989         /*
3990          * How many index blocks need to touch to modify nrblocks?
3991          * The "Chunk" flag indicating whether the nrblocks is
3992          * physically contiguous on disk
3993          *
3994          * For Direct IO and fallocate, they calls get_block to allocate
3995          * one single extent at a time, so they could set the "Chunk" flag
3996          */
3997         idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
3998
3999         ret = idxblocks;
4000
4001         /*
4002          * Now let's see how many group bitmaps and group descriptors need
4003          * to account
4004          */
4005         groups = idxblocks;
4006         if (chunk)
4007                 groups += 1;
4008         else
4009                 groups += nrblocks;
4010
4011         gdpblocks = groups;
4012         if (groups > ngroups)
4013                 groups = ngroups;
4014         if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
4015                 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
4016
4017         /* bitmaps and block group descriptor blocks */
4018         ret += groups + gdpblocks;
4019
4020         /* Blocks for super block, inode, quota and xattr blocks */
4021         ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
4022
4023         return ret;
4024 }
4025
/*
 * Calculate the total number of credits to reserve so that the
 * modification of a single page fits into a single transaction,
 * which may include multiple chunks of block allocations.
 *
 * This could be called via ext4_write_begin()
 *
 * We need to consider the worst case: one new block per extent, so the
 * blocks of the page are treated as non-contiguous.
 */
int ext4_writepage_trans_blocks(struct inode *inode)
{
        const int blocks_per_page = ext4_journal_blocks_per_page(inode);
        int credits = ext4_meta_trans_blocks(inode, blocks_per_page, 0);

        /* In data journalling mode the data blocks need credits as well */
        if (ext4_should_journal_data(inode))
                credits += blocks_per_page;

        return credits;
}
4048
/*
 * Calculate the journal credits for a chunk of data modification.
 *
 * This is called from DIO, fallocate or whoever is calling
 * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
 *
 * Journal buffers for data blocks are not included here, as DIO
 * and fallocate do not need to journal data buffers.
 */
int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
{
        /* The blocks form one physically contiguous chunk */
        const int contiguous = 1;

        return ext4_meta_trans_blocks(inode, nrblocks, contiguous);
}
4062
4063 /*
4064  * The caller must have previously called ext4_reserve_inode_write().
4065  * Give this, we know that the caller already has write access to iloc->bh.
4066  */
4067 int ext4_mark_iloc_dirty(handle_t *handle,
4068                          struct inode *inode, struct ext4_iloc *iloc)
4069 {
4070         int err = 0;
4071
4072         if (test_opt(inode->i_sb, I_VERSION))
4073                 inode_inc_iversion(inode);
4074
4075         /* the do_update_inode consumes one bh->b_count */
4076         get_bh(iloc->bh);
4077
4078         /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
4079         err = ext4_do_update_inode(handle, inode, iloc);
4080         put_bh(iloc->bh);
4081         return err;
4082 }
4083
4084 /*
4085  * On success, We end up with an outstanding reference count against
4086  * iloc->bh.  This _must_ be cleaned up later.
4087  */
4088
4089 int
4090 ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
4091                          struct ext4_iloc *iloc)
4092 {
4093         int err;
4094
4095         err = ext4_get_inode_loc(inode, iloc);
4096         if (!err) {
4097                 BUFFER_TRACE(iloc->bh, "get_write_access");
4098                 err = ext4_journal_get_write_access(handle, iloc->bh);
4099                 if (err) {
4100                         brelse(iloc->bh);
4101                         iloc->bh = NULL;
4102                 }
4103         }
4104         ext4_std_error(inode->i_sb, err);
4105         return err;
4106 }
4107
4108 /*
4109  * Expand an inode by new_extra_isize bytes.
4110  * Returns 0 on success or negative error number on failure.
4111  */
4112 static int ext4_expand_extra_isize(struct inode *inode,
4113                                    unsigned int new_extra_isize,
4114                                    struct ext4_iloc iloc,
4115                                    handle_t *handle)
4116 {
4117         struct ext4_inode *raw_inode;
4118         struct ext4_xattr_ibody_header *header;
4119
4120         if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
4121                 return 0;
4122
4123         raw_inode = ext4_raw_inode(&iloc);
4124
4125         header = IHDR(inode, raw_inode);
4126
4127         /* No extended attributes present */
4128         if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
4129             header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
4130                 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
4131                         new_extra_isize);
4132                 EXT4_I(inode)->i_extra_isize = new_extra_isize;
4133                 return 0;
4134         }
4135
4136         /* try to expand with EAs present */
4137         return ext4_expand_extra_isize_ea(inode, new_extra_isize,
4138                                           raw_inode, handle);
4139 }
4140
4141 /*
4142  * What we do here is to mark the in-core inode as clean with respect to inode
4143  * dirtiness (it may still be data-dirty).
4144  * This means that the in-core inode may be reaped by prune_icache
4145  * without having to perform any I/O.  This is a very good thing,
4146  * because *any* task may call prune_icache - even ones which
4147  * have a transaction open against a different journal.
4148  *
4149  * Is this cheating?  Not really.  Sure, we haven't written the
4150  * inode out, but prune_icache isn't a user-visible syncing function.
4151  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
4152  * we start and wait on commits.
4153  *
4154  * Is this efficient/effective?  Well, we're being nice to the system
4155  * by cleaning up our inodes proactively so they can be reaped
4156  * without I/O.  But we are potentially leaving up to five seconds'
4157  * worth of inodes floating about which prune_icache wants us to
4158  * write out.  One way to fix that would be to get prune_icache()
4159  * to do a write_super() to free up some memory.  It has the desired
4160  * effect.
4161  */
4162 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
4163 {
4164         struct ext4_iloc iloc;
4165         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4166         static unsigned int mnt_count;
4167         int err, ret;
4168
4169         might_sleep();
4170         trace_ext4_mark_inode_dirty(inode, _RET_IP_);
4171         err = ext4_reserve_inode_write(handle, inode, &iloc);
4172         if (ext4_handle_valid(handle) &&
4173             EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
4174             !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
4175                 /*
4176                  * We need extra buffer credits since we may write into EA block
4177                  * with this same handle. If journal_extend fails, then it will
4178                  * only result in a minor loss of functionality for that inode.
4179                  * If this is felt to be critical, then e2fsck should be run to
4180                  * force a large enough s_min_extra_isize.
4181                  */
4182                 if ((jbd2_journal_extend(handle,
4183                              EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
4184                         ret = ext4_expand_extra_isize(inode,
4185                                                       sbi->s_want_extra_isize,
4186                                                       iloc, handle);
4187                         if (ret) {
4188                                 ext4_set_inode_state(inode,
4189                                                      EXT4_STATE_NO_EXPAND);
4190                                 if (mnt_count !=
4191                                         le16_to_cpu(sbi->s_es->s_mnt_count)) {
4192                                         ext4_warning(inode->i_sb,
4193                                         "Unable to expand inode %lu. Delete"
4194                                         " some EAs or run e2fsck.",
4195                                         inode->i_ino);
4196                                         mnt_count =
4197                                           le16_to_cpu(sbi->s_es->s_mnt_count);
4198                                 }
4199                         }
4200                 }
4201         }
4202         if (!err)
4203                 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
4204         return err;
4205 }
4206
4207 /*
4208  * ext4_dirty_inode() is called from __mark_inode_dirty()
4209  *
4210  * We're really interested in the case where a file is being extended.
4211  * i_size has been changed by generic_commit_write() and we thus need
4212  * to include the updated inode in the current transaction.
4213  *
4214  * Also, dquot_alloc_block() will always dirty the inode when blocks
4215  * are allocated to the file.
4216  *
4217  * If the inode is marked synchronous, we don't honour that here - doing
4218  * so would cause a commit on atime updates, which we don't bother doing.
4219  * We handle synchronous inodes at the highest possible level.
4220  */
4221 void ext4_dirty_inode(struct inode *inode, int flags)
4222 {
4223         handle_t *handle;
4224
4225         handle = ext4_journal_start(inode, 2);
4226         if (IS_ERR(handle))
4227                 goto out;
4228
4229         ext4_mark_inode_dirty(handle, inode);
4230
4231         ext4_journal_stop(handle);
4232 out:
4233         return;
4234 }
4235
#if 0
/*
 * Bind an inode's backing buffer_head into this transaction, to prevent
 * it from being flushed to disk early.  Unlike
 * ext4_reserve_inode_write, this leaves behind no bh reference and
 * returns no iloc structure, so the caller needs to repeat the iloc
 * lookup to mark the inode dirty later.
 *
 * Does nothing (and returns 0) when handle is NULL.
 */
static int ext4_pin_inode(handle_t *handle, struct inode *inode)
{
        struct ext4_iloc iloc;
        int err = 0;

        if (!handle)
                goto out;

        err = ext4_get_inode_loc(inode, &iloc);
        if (err)
                goto out;

        BUFFER_TRACE(iloc.bh, "get_write_access");
        err = jbd2_journal_get_write_access(handle, iloc.bh);
        if (!err)
                err = ext4_handle_dirty_metadata(handle,
                                                 NULL,
                                                 iloc.bh);
        brelse(iloc.bh);
out:
        ext4_std_error(inode->i_sb, err);
        return err;
}
#endif
4265
4266 int ext4_change_inode_journal_flag(struct inode *inode, int val)
4267 {
4268         journal_t *journal;
4269         handle_t *handle;
4270         int err;
4271
4272         /*
4273          * We have to be very careful here: changing a data block's
4274          * journaling status dynamically is dangerous.  If we write a
4275          * data block to the journal, change the status and then delete
4276          * that block, we risk forgetting to revoke the old log record
4277          * from the journal and so a subsequent replay can corrupt data.
4278          * So, first we make sure that the journal is empty and that
4279          * nobody is changing anything.
4280          */
4281
4282         journal = EXT4_JOURNAL(inode);
4283         if (!journal)
4284                 return 0;
4285         if (is_journal_aborted(journal))
4286                 return -EROFS;
4287
4288         jbd2_journal_lock_updates(journal);
4289         jbd2_journal_flush(journal);
4290
4291         /*
4292          * OK, there are no updates running now, and all cached data is
4293          * synced to disk.  We are now in a completely consistent state
4294          * which doesn't have anything in the journal, and we know that
4295          * no filesystem updates are running, so it is safe to modify
4296          * the inode's in-core data-journaling state flag now.
4297          */
4298
4299         if (val)
4300                 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
4301         else
4302                 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
4303         ext4_set_aops(inode);
4304
4305         jbd2_journal_unlock_updates(journal);
4306
4307         /* Finally we can mark the inode as dirty. */
4308
4309         handle = ext4_journal_start(inode, 1);
4310         if (IS_ERR(handle))
4311                 return PTR_ERR(handle);
4312
4313         err = ext4_mark_inode_dirty(handle, inode);
4314         ext4_handle_sync(handle);
4315         ext4_journal_stop(handle);
4316         ext4_std_error(inode->i_sb, err);
4317
4318         return err;
4319 }
4320
4321 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
4322 {
4323         return !buffer_mapped(bh);
4324 }
4325
4326 int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
4327 {
4328         struct page *page = vmf->page;
4329         loff_t size;
4330         unsigned long len;
4331         int ret;
4332         struct file *file = vma->vm_file;
4333         struct inode *inode = file->f_path.dentry->d_inode;
4334         struct address_space *mapping = inode->i_mapping;
4335         handle_t *handle;
4336         get_block_t *get_block;
4337         int retries = 0;
4338
4339         /*
4340          * This check is racy but catches the common case. We rely on
4341          * __block_page_mkwrite() to do a reliable check.
4342          */
4343         vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
4344         /* Delalloc case is easy... */
4345         if (test_opt(inode->i_sb, DELALLOC) &&
4346             !ext4_should_journal_data(inode) &&
4347             !ext4_nonda_switch(inode->i_sb)) {
4348                 do {
4349                         ret = __block_page_mkwrite(vma, vmf,
4350                                                    ext4_da_get_block_prep);
4351                 } while (ret == -ENOSPC &&
4352                        ext4_should_retry_alloc(inode->i_sb, &retries));
4353                 goto out_ret;
4354         }
4355
4356         lock_page(page);
4357         size = i_size_read(inode);
4358         /* Page got truncated from under us? */
4359         if (page->mapping != mapping || page_offset(page) > size) {
4360                 unlock_page(page);
4361                 ret = VM_FAULT_NOPAGE;
4362                 goto out;
4363         }
4364
4365         if (page->index == size >> PAGE_CACHE_SHIFT)
4366                 len = size & ~PAGE_CACHE_MASK;
4367         else
4368                 len = PAGE_CACHE_SIZE;
4369         /*
4370          * Return if we have all the buffers mapped. This avoids the need to do
4371          * journal_start/journal_stop which can block and take a long time
4372          */
4373         if (page_has_buffers(page)) {
4374                 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
4375                                         ext4_bh_unmapped)) {
4376                         /* Wait so that we don't change page under IO */
4377                         wait_on_page_writeback(page);
4378                         ret = VM_FAULT_LOCKED;
4379                         goto out;
4380                 }
4381         }
4382         unlock_page(page);
4383         /* OK, we need to fill the hole... */
4384         if (ext4_should_dioread_nolock(inode))
4385                 get_block = ext4_get_block_write;
4386         else
4387                 get_block = ext4_get_block;
4388 retry_alloc:
4389         handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
4390         if (IS_ERR(handle)) {
4391                 ret = VM_FAULT_SIGBUS;
4392                 goto out;
4393         }
4394         ret = __block_page_mkwrite(vma, vmf, get_block);
4395         if (!ret && ext4_should_journal_data(inode)) {
4396                 if (walk_page_buffers(handle, page_buffers(page), 0,
4397                           PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
4398                         unlock_page(page);
4399                         ret = VM_FAULT_SIGBUS;
4400                         goto out;
4401                 }
4402                 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
4403         }
4404         ext4_journal_stop(handle);
4405         if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
4406                 goto retry_alloc;
4407 out_ret:
4408         ret = block_page_mkwrite_return(ret);
4409 out:
4410         return ret;
4411 }