1 /*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10 #include <linux/sched.h>
11 #include <linux/slab.h>
12 #include <linux/spinlock.h>
13 #include <linux/completion.h>
14 #include <linux/buffer_head.h>
15 #include <linux/mempool.h>
16 #include <linux/gfs2_ondisk.h>
17 #include <linux/bio.h>
18 #include <linux/fs.h>
19 #include <linux/list_sort.h>
20 #include <linux/blkdev.h>
21
22 #include "bmap.h"
23 #include "dir.h"
24 #include "gfs2.h"
25 #include "incore.h"
26 #include "inode.h"
27 #include "glock.h"
28 #include "log.h"
29 #include "lops.h"
30 #include "meta_io.h"
31 #include "recovery.h"
32 #include "rgrp.h"
33 #include "trans.h"
34 #include "util.h"
35 #include "trace_gfs2.h"
36
37 /**
38 * gfs2_pin - Pin a buffer in memory
39 * @sdp: The superblock
40 * @bh: The buffer to be pinned
41 *
42 * The log lock must be held when calling this function
43 */
44 void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
45 {
46 struct gfs2_bufdata *bd;
47
48 BUG_ON(!current->journal_info);
49
50 clear_buffer_dirty(bh);
51 if (test_set_buffer_pinned(bh))
52 gfs2_assert_withdraw(sdp, 0);
53 if (!buffer_uptodate(bh))
54 gfs2_io_error_bh_wd(sdp, bh);
55 bd = bh->b_private;
56 /* If this buffer is in the AIL and it has already been written
57  * to its in-place disk block, remove it from the AIL.
58 */
59 spin_lock(&sdp->sd_ail_lock);
60 if (bd->bd_tr)
61 list_move(&bd->bd_ail_st_list, &bd->bd_tr->tr_ail2_list);
62 spin_unlock(&sdp->sd_ail_lock);
63 get_bh(bh);
64 atomic_inc(&sdp->sd_log_pinned);
65 trace_gfs2_pin(bd, 1);
66 }
67
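/* Does this buffer belong to a resource group glock? */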
68 static bool buffer_is_rgrp(const struct gfs2_bufdata *bd)
69 {
70 return bd->bd_gl->gl_name.ln_type == LM_TYPE_RGRP;
71 }
72
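/**
 * maybe_release_space - Resync a resource group's clone bitmap
 * @bd: The buffer data for the pinned resource group bitmap block
 *
 * Called from gfs2_unpin() for resource group buffers. If the bitmap has a
 * clone, send discards for the freed blocks when the discard mount option is
 * set, copy the updated bitmap over the clone, and make the freed space
 * available for allocation again.
 */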
73 static void maybe_release_space(struct gfs2_bufdata *bd)
74 {
75 struct gfs2_glock *gl = bd->bd_gl;
76 struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
77 struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl);
78 unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number;
79 struct gfs2_bitmap *bi = rgd->rd_bits + index;
80
81 if (bi->bi_clone == NULL)
82 return;
83 if (sdp->sd_args.ar_discard)
84 gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi, 1, NULL);
85 memcpy(bi->bi_clone + bi->bi_offset,
86 bd->bd_bh->b_data + bi->bi_offset, bi->bi_bytes);
87 clear_bit(GBF_FULL, &bi->bi_flags);
88 rgd->rd_free_clone = rgd->rd_free;
89 rgd->rd_extfail_pt = rgd->rd_free;
90 }
91
92 /**
93 * gfs2_unpin - Unpin a buffer
94 * @sdp: the filesystem the buffer belongs to
95 * @bh: The buffer to unpin
96  * @tr: The system transaction being flushed
98 *
99 */
100
101 static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
102 struct gfs2_trans *tr)
103 {
104 struct gfs2_bufdata *bd = bh->b_private;
105
106 BUG_ON(!buffer_uptodate(bh));
107 BUG_ON(!buffer_pinned(bh));
108
109 lock_buffer(bh);
110 mark_buffer_dirty(bh);
111 clear_buffer_pinned(bh);
112
113 if (buffer_is_rgrp(bd))
114 maybe_release_space(bd);
115
116 spin_lock(&sdp->sd_ail_lock);
117 if (bd->bd_tr) {
118 list_del(&bd->bd_ail_st_list);
119 brelse(bh);
120 } else {
121 struct gfs2_glock *gl = bd->bd_gl;
122 list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
123 atomic_inc(&gl->gl_ail_count);
124 }
125 bd->bd_tr = tr;
126 list_add(&bd->bd_ail_st_list, &tr->tr_ail1_list);
127 spin_unlock(&sdp->sd_ail_lock);
128
129 clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
130 trace_gfs2_pin(bd, 0);
131 unlock_buffer(bh);
132 atomic_dec(&sdp->sd_log_pinned);
133 }
134
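/**
 * gfs2_log_incr_head - Advance the log flush head by one block
 * @sdp: The superblock
 *
 * Wraps around to the start of the journal when the last journal block is
 * reached.
 */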
135 static void gfs2_log_incr_head(struct gfs2_sbd *sdp)
136 {
137 BUG_ON((sdp->sd_log_flush_head == sdp->sd_log_tail) &&
138 (sdp->sd_log_flush_head != sdp->sd_log_head));
139
140 if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks)
141 sdp->sd_log_flush_head = 0;
142 }
143
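/**
 * gfs2_log_bmap - Map the current log flush head to a device block
 * @sdp: The superblock
 *
 * Looks up the journal extent containing the flush head, advances the head,
 * and returns the corresponding physical block number.
 *
 * Returns: The device block number, or -1 if the head is not mapped
 */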
144 u64 gfs2_log_bmap(struct gfs2_sbd *sdp)
145 {
146 unsigned int lbn = sdp->sd_log_flush_head;
147 struct gfs2_journal_extent *je;
148 u64 block;
149
150 list_for_each_entry(je, &sdp->sd_jdesc->extent_list, list) {
151 if ((lbn >= je->lblock) && (lbn < (je->lblock + je->blocks))) {
152 block = je->dblock + lbn - je->lblock;
153 gfs2_log_incr_head(sdp);
154 return block;
155 }
156 }
157
158 return -1;
159 }
160
161 /**
162 * gfs2_end_log_write_bh - end log write of pagecache data with buffers
163 * @sdp: The superblock
164 * @bvec: The bio_vec
165 * @error: The i/o status
166 *
167 * This finds the relevant buffers and unlocks them and sets the
168 * error flag according to the status of the i/o request. This is
169 * used when the log is writing data which has an in-place version
170 * that is pinned in the pagecache.
171 */
172
173 static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp,
174 struct bio_vec *bvec,
175 blk_status_t error)
176 {
177 struct buffer_head *bh, *next;
178 struct page *page = bvec->bv_page;
179 unsigned size;
180
181 bh = page_buffers(page);
182 size = bvec->bv_len;
183 while (bh_offset(bh) < bvec->bv_offset)
184 bh = bh->b_this_page;
185 do {
186 if (error)
187 mark_buffer_write_io_error(bh);
188 unlock_buffer(bh);
189 next = bh->b_this_page;
190 size -= bh->b_size;
191 brelse(bh);
192 bh = next;
193 } while(bh && size);
194 }
195
196 /**
197 * gfs2_end_log_write - end of i/o to the log
198 * @bio: The bio
199 *
200 * Each bio_vec contains either data from the pagecache or data
201 * relating to the log itself. Here we iterate over the bio_vec
202 * array, processing both kinds of data.
203 *
204 */
205
206 static void gfs2_end_log_write(struct bio *bio)
207 {
208 struct gfs2_sbd *sdp = bio->bi_private;
209 struct bio_vec *bvec;
210 struct page *page;
211 struct bvec_iter_all iter_all;
212
213 if (bio->bi_status) {
214 fs_err(sdp, "Error %d writing to journal, jid=%u\n",
215 bio->bi_status, sdp->sd_jdesc->jd_jid);
216 wake_up(&sdp->sd_logd_waitq);
217 }
218
219 bio_for_each_segment_all(bvec, bio, iter_all) {
220 page = bvec->bv_page;
221 if (page_has_buffers(page))
222 gfs2_end_log_write_bh(sdp, bvec, bio->bi_status);
223 else
224 mempool_free(page, gfs2_page_pool);
225 }
226
227 bio_put(bio);
228 if (atomic_dec_and_test(&sdp->sd_log_in_flight))
229 wake_up(&sdp->sd_log_flush_wait);
230 }
231
232 /**
233 * gfs2_log_submit_bio - Submit any pending log bio
234 * @biop: Address of the bio pointer
235 * @opf: REQ_OP | op_flags
236 *
237 * Submit any pending part-built or full bio to the block device. If
238 * there is no pending bio, then this is a no-op.
239 */
240
241 void gfs2_log_submit_bio(struct bio **biop, int opf)
242 {
243 struct bio *bio = *biop;
244 if (bio) {
245 struct gfs2_sbd *sdp = bio->bi_private;
246 atomic_inc(&sdp->sd_log_in_flight);
247 bio->bi_opf = opf;
248 submit_bio(bio);
249 *biop = NULL;
250 }
251 }
252
253 /**
254 * gfs2_log_alloc_bio - Allocate a bio
255 * @sdp: The super block
256 * @blkno: The device block number we want to write to
257 * @end_io: The bi_end_io callback
258 *
259 * Allocate a new bio, initialize it with the given parameters and return it.
260 *
261 * Returns: The newly allocated bio
262 */
263
264 static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno,
265 bio_end_io_t *end_io)
266 {
267 struct super_block *sb = sdp->sd_vfs;
268 struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
269
270 bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9);
271 bio_set_dev(bio, sb->s_bdev);
272 bio->bi_end_io = end_io;
273 bio->bi_private = sdp;
274
275 return bio;
276 }
277
278 /**
279 * gfs2_log_get_bio - Get cached log bio, or allocate a new one
280 * @sdp: The super block
281 * @blkno: The device block number we want to write to
282  * @biop: Address of the bio pointer
283 * @op: REQ_OP
284 * @end_io: The bi_end_io callback
285 * @flush: Always flush the current bio and allocate a new one?
286 *
287 * If there is a cached bio, then if the next block number is sequential
288 * with the previous one, return it, otherwise flush the bio to the
289 * device. If there is no cached bio, or we just flushed it, then
290 * allocate a new one.
291 *
292 * Returns: The bio to use for log writes
293 */
294
295 static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno,
296 struct bio **biop, int op,
297 bio_end_io_t *end_io, bool flush)
298 {
299 struct bio *bio = *biop;
300
301 if (bio) {
302 u64 nblk;
303
304 nblk = bio_end_sector(bio);
305 nblk >>= sdp->sd_fsb2bb_shift;
306 if (blkno == nblk && !flush)
307 return bio;
308 gfs2_log_submit_bio(biop, op);
309 }
310
311 *biop = gfs2_log_alloc_bio(sdp, blkno, end_io);
312 return *biop;
313 }
314
315 /**
316 * gfs2_log_write - write to log
317 * @sdp: the filesystem
318 * @page: the page to write
319 * @size: the size of the data to write
320 * @offset: the offset within the page
321 * @blkno: block number of the log entry
322 *
323 * Try and add the page segment to the current bio. If that fails,
324 * submit the current bio to the device and create a new one, and
325 * then add the page segment to that.
326 */
327
328 void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page,
329 unsigned size, unsigned offset, u64 blkno)
330 {
331 struct bio *bio;
332 int ret;
333
334 bio = gfs2_log_get_bio(sdp, blkno, &sdp->sd_log_bio, REQ_OP_WRITE,
335 gfs2_end_log_write, false);
336 ret = bio_add_page(bio, page, size, offset);
337 if (ret == 0) {
338 bio = gfs2_log_get_bio(sdp, blkno, &sdp->sd_log_bio,
339 REQ_OP_WRITE, gfs2_end_log_write, true);
340 ret = bio_add_page(bio, page, size, offset);
341 WARN_ON(ret == 0);
342 }
343 }
344
345 /**
346 * gfs2_log_write_bh - write a buffer's content to the log
347 * @sdp: The super block
348 * @bh: The buffer pointing to the in-place location
349 *
350 * This writes the content of the buffer to the next available location
351 * in the log. The buffer will be unlocked once the i/o to the log has
352 * completed.
353 */
354
355 static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh)
356 {
357 gfs2_log_write(sdp, bh->b_page, bh->b_size, bh_offset(bh),
358 gfs2_log_bmap(sdp));
359 }
360
361 /**
362 * gfs2_log_write_page - write one block stored in a page, into the log
363 * @sdp: The superblock
364 * @page: The struct page
365 *
366 * This writes the first block-sized part of the page into the log. Note
367 * that the page must have been allocated from the gfs2_page_pool mempool
368 * and that after this has been called, ownership has been transferred and
369 * the page may be freed at any time.
370 */
371
372 void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page)
373 {
374 struct super_block *sb = sdp->sd_vfs;
375 gfs2_log_write(sdp, page, sb->s_blocksize, 0,
376 gfs2_log_bmap(sdp));
377 }
378
379 /**
380 * gfs2_end_log_read - end I/O callback for reads from the log
381 * @bio: The bio
382 *
383 * Simply unlock the pages in the bio. The main thread will wait on them and
384 * process them in order as necessary.
385 */
386
387 static void gfs2_end_log_read(struct bio *bio)
388 {
389 struct page *page;
390 struct bio_vec *bvec;
391 struct bvec_iter_all iter_all;
392
393 bio_for_each_segment_all(bvec, bio, iter_all) {
394 page = bvec->bv_page;
395 if (bio->bi_status) {
396 int err = blk_status_to_errno(bio->bi_status);
397
398 SetPageError(page);
399 mapping_set_error(page->mapping, err);
400 }
401 unlock_page(page);
402 }
403
404 bio_put(bio);
405 }
406
407 /**
408 * gfs2_jhead_pg_srch - Look for the journal head in a given page.
409 * @jd: The journal descriptor
 * @head: The journal head to return
410  * @page: The page to look in
411  *
412  * Returns: true if found, false otherwise.
413 */
414
415 static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd,
416 struct gfs2_log_header_host *head,
417 struct page *page)
418 {
419 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
420 struct gfs2_log_header_host uninitialized_var(lh);
421 void *kaddr = kmap_atomic(page);
422 unsigned int offset;
423 bool ret = false;
424
425 for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) {
426 if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) {
427 if (lh.lh_sequence > head->lh_sequence)
428 *head = lh;
429 else {
430 ret = true;
431 break;
432 }
433 }
434 }
435 kunmap_atomic(kaddr);
436 return ret;
437 }
438
439 /**
440 * gfs2_jhead_process_page - Search/cleanup a page
441 * @jd: The journal descriptor
442 * @index: Index of the page to look into
 * @head: The journal head to return
443  * @done: If set, perform only cleanup, else search and set if found.
444 *
445 * Find the page with 'index' in the journal's mapping. Search the page for
446  * the journal head if requested (*done == false). Release refs on the
447 * page so the page cache can reclaim it (put_page() twice). We grabbed a
448 * reference on this page two times, first when we did a find_or_create_page()
449 * to obtain the page to add it to the bio and second when we do a
450 * find_get_page() here to get the page to wait on while I/O on it is being
451 * completed.
452 * This function is also used to free up a page we might've grabbed but not
453 * used. Maybe we added it to a bio, but not submitted it for I/O. Or we
454 * submitted the I/O, but we already found the jhead so we only need to drop
455 * our references to the page.
456 */
457
458 static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index,
459 struct gfs2_log_header_host *head,
460 bool *done)
461 {
462 struct page *page;
463
464 page = find_get_page(jd->jd_inode->i_mapping, index);
465 wait_on_page_locked(page);
466
467 if (PageError(page))
468 *done = true;
469
470 if (!*done)
471 *done = gfs2_jhead_pg_srch(jd, head, page);
472
473 put_page(page); /* Once for find_get_page */
474 put_page(page); /* Once more for find_or_create_page */
475 }
476
477 /**
478 * gfs2_find_jhead - find the head of a log
479 * @jd: The journal descriptor
480 * @head: The log descriptor for the head of the log is returned here
 * @keep_cache: If set, don't truncate the journal's page cache when done
481  *
482 * Do a search of a journal by reading it in large chunks using bios and find
483  * the valid log entry with the highest sequence number (i.e. the log head).
484 *
485 * Returns: 0 on success, errno otherwise
486 */
487 int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head,
488 bool keep_cache)
489 {
490 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
491 struct address_space *mapping = jd->jd_inode->i_mapping;
492 unsigned int block = 0, blocks_submitted = 0, blocks_read = 0;
493 unsigned int bsize = sdp->sd_sb.sb_bsize;
494 unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
495 unsigned int shift = PAGE_SHIFT - bsize_shift;
496         unsigned int readahead_blocks = BIO_MAX_PAGES << shift;
497 struct gfs2_journal_extent *je;
498 int sz, ret = 0;
499 struct bio *bio = NULL;
500 struct page *page = NULL;
501 bool done = false;
502 errseq_t since;
503
504 memset(head, 0, sizeof(*head));
505 if (list_empty(&jd->extent_list))
506 gfs2_map_journal_extents(sdp, jd);
507
508 since = filemap_sample_wb_err(mapping);
509 list_for_each_entry(je, &jd->extent_list, list) {
510 for (; block < je->lblock + je->blocks; block++) {
511 u64 dblock;
512
513 if (!page) {
514 page = find_or_create_page(mapping,
515 block >> shift, GFP_NOFS);
516 if (!page) {
517 ret = -ENOMEM;
518 done = true;
519 goto out;
520 }
521 }
522
523 if (bio) {
524 unsigned int off;
525
526 off = (block << bsize_shift) & ~PAGE_MASK;
527 sz = bio_add_page(bio, page, bsize, off);
528 if (sz == bsize) { /* block added */
529 if (off + bsize == PAGE_SIZE) {
530 page = NULL;
531 goto page_added;
532 }
533 continue;
534 }
535 blocks_submitted = block + 1;
536 submit_bio(bio);
537 bio = NULL;
538 }
539
540 dblock = je->dblock + (block - je->lblock);
541 bio = gfs2_log_alloc_bio(sdp, dblock, gfs2_end_log_read);
542 bio->bi_opf = REQ_OP_READ;
543 sz = bio_add_page(bio, page, bsize, 0);
544 gfs2_assert_warn(sdp, sz == bsize);
545 if (bsize == PAGE_SIZE)
546 page = NULL;
547
548 page_added:
549                         if (blocks_submitted < blocks_read + readahead_blocks) {
550 /* Keep at least one bio in flight */
551 continue;
552 }
553
554 gfs2_jhead_process_page(jd, blocks_read >> shift, head, &done);
555 blocks_read += PAGE_SIZE >> bsize_shift;
556 if (done)
557 goto out; /* found */
558 }
559 }
560
561 out:
562 if (bio)
563 submit_bio(bio);
564 while (blocks_read < block) {
565 gfs2_jhead_process_page(jd, blocks_read >> shift, head, &done);
566 blocks_read += PAGE_SIZE >> bsize_shift;
567 }
568
569 if (!ret)
570 ret = filemap_check_wb_err(mapping, since);
571
572 if (!keep_cache)
573 truncate_inode_pages(mapping, 0);
574
575 return ret;
576 }
577
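/**
 * gfs2_get_log_desc - Allocate and initialize a log descriptor page
 * @sdp: The superblock
 * @ld_type: The descriptor type (metadata, jdata or revoke)
 * @ld_length: The number of log blocks covered by the descriptor
 * @ld_data1: Type specific data (e.g. the number of blocks described)
 *
 * Returns: A page from the gfs2_page_pool with the descriptor header filled in
 */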
578 static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type,
579 u32 ld_length, u32 ld_data1)
580 {
581 struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
582 struct gfs2_log_descriptor *ld = page_address(page);
583 clear_page(ld);
584 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
585 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
586 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
587 ld->ld_type = cpu_to_be32(ld_type);
588 ld->ld_length = cpu_to_be32(ld_length);
589 ld->ld_data1 = cpu_to_be32(ld_data1);
590 ld->ld_data2 = 0;
591 return page;
592 }
593
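/**
 * gfs2_check_magic - Mark a data buffer that needs escaping
 * @bh: The buffer to check
 *
 * Journaled data blocks that happen to begin with the GFS2 magic number must
 * be escaped before being written to the log, so that replay does not mistake
 * them for metadata. Set the escaped flag on such buffers here.
 */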
594 static void gfs2_check_magic(struct buffer_head *bh)
595 {
596 void *kaddr;
597 __be32 *ptr;
598
599 clear_buffer_escaped(bh);
600 kaddr = kmap_atomic(bh->b_page);
601 ptr = kaddr + bh_offset(bh);
602 if (*ptr == cpu_to_be32(GFS2_MAGIC))
603 set_buffer_escaped(bh);
604 kunmap_atomic(kaddr);
605 }
606
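/* list_sort() comparison callback: order bufdata entries by block number */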
607 static int blocknr_cmp(void *priv, struct list_head *a, struct list_head *b)
608 {
609 struct gfs2_bufdata *bda, *bdb;
610
611 bda = list_entry(a, struct gfs2_bufdata, bd_list);
612 bdb = list_entry(b, struct gfs2_bufdata, bd_list);
613
614 if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr)
615 return -1;
616 if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr)
617 return 1;
618 return 0;
619 }
620
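/**
 * gfs2_before_commit - Write pinned buffers to the log
 * @sdp: The superblock
 * @limit: The maximum number of block pointers per log descriptor
 * @total: The number of buffers to write
 * @blist: The list of pinned buffers
 * @is_databuf: True for journaled data buffers, false for metadata
 *
 * Sort the buffers by block number, then write them out in batches of at most
 * @limit, each batch preceded by a log descriptor listing the in-place block
 * numbers. Journaled data blocks that begin with the GFS2 magic number are
 * escaped before they are written.
 */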
621 static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
622 unsigned int total, struct list_head *blist,
623 bool is_databuf)
624 {
625 struct gfs2_log_descriptor *ld;
626 struct gfs2_bufdata *bd1 = NULL, *bd2;
627 struct page *page;
628 unsigned int num;
629 unsigned n;
630 __be64 *ptr;
631
632 gfs2_log_lock(sdp);
633 list_sort(NULL, blist, blocknr_cmp);
634 bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list);
635 while(total) {
636 num = total;
637 if (total > limit)
638 num = limit;
639 gfs2_log_unlock(sdp);
640 page = gfs2_get_log_desc(sdp,
641 is_databuf ? GFS2_LOG_DESC_JDATA :
642 GFS2_LOG_DESC_METADATA, num + 1, num);
643 ld = page_address(page);
644 gfs2_log_lock(sdp);
645 ptr = (__be64 *)(ld + 1);
646
647 n = 0;
648 list_for_each_entry_continue(bd1, blist, bd_list) {
649 *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
650 if (is_databuf) {
651 gfs2_check_magic(bd1->bd_bh);
652 *ptr++ = cpu_to_be64(buffer_escaped(bd1->bd_bh) ? 1 : 0);
653 }
654 if (++n >= num)
655 break;
656 }
657
658 gfs2_log_unlock(sdp);
659 gfs2_log_write_page(sdp, page);
660 gfs2_log_lock(sdp);
661
662 n = 0;
663 list_for_each_entry_continue(bd2, blist, bd_list) {
664 get_bh(bd2->bd_bh);
665 gfs2_log_unlock(sdp);
666 lock_buffer(bd2->bd_bh);
667
668 if (buffer_escaped(bd2->bd_bh)) {
669 void *kaddr;
670 page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
671 ptr = page_address(page);
672 kaddr = kmap_atomic(bd2->bd_bh->b_page);
673 memcpy(ptr, kaddr + bh_offset(bd2->bd_bh),
674 bd2->bd_bh->b_size);
675 kunmap_atomic(kaddr);
676 *(__be32 *)ptr = 0;
677 clear_buffer_escaped(bd2->bd_bh);
678 unlock_buffer(bd2->bd_bh);
679 brelse(bd2->bd_bh);
680 gfs2_log_write_page(sdp, page);
681 } else {
682 gfs2_log_write_bh(sdp, bd2->bd_bh);
683 }
684 gfs2_log_lock(sdp);
685 if (++n >= num)
686 break;
687 }
688
689 BUG_ON(total < num);
690 total -= num;
691 }
692 gfs2_log_unlock(sdp);
693 }
694
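/**
 * buf_lo_before_commit - Write this transaction's pinned metadata to the log
 * @sdp: The superblock
 * @tr: The transaction being committed
 */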
695 static void buf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
696 {
697 unsigned int limit = buf_limit(sdp); /* 503 for 4k blocks */
698 unsigned int nbuf;
699 if (tr == NULL)
700 return;
701 nbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm;
702 gfs2_before_commit(sdp, limit, nbuf, &tr->tr_buf, 0);
703 }
704
705 static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
706 {
707 struct list_head *head;
708 struct gfs2_bufdata *bd;
709
710 if (tr == NULL)
711 return;
712
713 head = &tr->tr_buf;
714 while (!list_empty(head)) {
715 bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
716 list_del_init(&bd->bd_list);
717 gfs2_unpin(sdp, bd->bd_bh, tr);
718 }
719 }
720
721 static void buf_lo_before_scan(struct gfs2_jdesc *jd,
722 struct gfs2_log_header_host *head, int pass)
723 {
724 if (pass != 0)
725 return;
726
727 jd->jd_found_blocks = 0;
728 jd->jd_replayed_blocks = 0;
729 }
730
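/**
 * buf_lo_scan_elements - Replay metadata blocks from a log descriptor
 * @jd: The journal descriptor
 * @start: The first log block of the descriptor
 * @ld: The log descriptor
 * @ptr: The array of in-place block numbers following the descriptor
 * @pass: The recovery pass number
 *
 * For each block that has not been revoked, read the journaled copy, copy it
 * over the in-place buffer and mark it dirty for writeback.
 */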
731 static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start,
732 struct gfs2_log_descriptor *ld, __be64 *ptr,
733 int pass)
734 {
735 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
736 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
737 struct gfs2_glock *gl = ip->i_gl;
738 unsigned int blks = be32_to_cpu(ld->ld_data1);
739 struct buffer_head *bh_log, *bh_ip;
740 u64 blkno;
741 int error = 0;
742
743 if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA)
744 return 0;
745
746 gfs2_replay_incr_blk(jd, &start);
747
748 for (; blks; gfs2_replay_incr_blk(jd, &start), blks--) {
749 blkno = be64_to_cpu(*ptr++);
750
751 jd->jd_found_blocks++;
752
753 if (gfs2_revoke_check(jd, blkno, start))
754 continue;
755
756 error = gfs2_replay_read_block(jd, start, &bh_log);
757 if (error)
758 return error;
759
760 bh_ip = gfs2_meta_new(gl, blkno);
761 memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
762
763 if (gfs2_meta_check(sdp, bh_ip))
764 error = -EIO;
765 else
766 mark_buffer_dirty(bh_ip);
767
768 brelse(bh_log);
769 brelse(bh_ip);
770
771 if (error)
772 break;
773
774 jd->jd_replayed_blocks++;
775 }
776
777 return error;
778 }
779
780 /**
781 * gfs2_meta_sync - Sync all buffers associated with a glock
782 * @gl: The glock
783 *
784 */
785
786 static void gfs2_meta_sync(struct gfs2_glock *gl)
787 {
788 struct address_space *mapping = gfs2_glock2aspace(gl);
789 struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
790 int error;
791
792 if (mapping == NULL)
793 mapping = &sdp->sd_aspace;
794
795 filemap_fdatawrite(mapping);
796 error = filemap_fdatawait(mapping);
797
798 if (error)
799 gfs2_io_error(gl->gl_name.ln_sbd);
800 }
801
802 static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
803 {
804 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
805 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
806
807 if (error) {
808 gfs2_meta_sync(ip->i_gl);
809 return;
810 }
811 if (pass != 1)
812 return;
813
814 gfs2_meta_sync(ip->i_gl);
815
816 fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
817 jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks);
818 }
819
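/**
 * revoke_lo_before_commit - Write the pending revokes to the log
 * @sdp: The superblock
 * @tr: The transaction being committed (unused)
 *
 * Pack the queued revoke block numbers into a revoke log descriptor block,
 * followed by continuation blocks with GFS2_METATYPE_LB headers as needed.
 */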
820 static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
821 {
822 struct gfs2_meta_header *mh;
823 unsigned int offset;
824 struct list_head *head = &sdp->sd_log_revokes;
825 struct gfs2_bufdata *bd;
826 struct page *page;
827 unsigned int length;
828
829 gfs2_write_revokes(sdp);
830 if (!sdp->sd_log_num_revoke)
831 return;
832
833 length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, sizeof(u64));
834 page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE, length, sdp->sd_log_num_revoke);
835 offset = sizeof(struct gfs2_log_descriptor);
836
837 list_for_each_entry(bd, head, bd_list) {
838 sdp->sd_log_num_revoke--;
839
840 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
841
842 gfs2_log_write_page(sdp, page);
843 page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
844 mh = page_address(page);
845 clear_page(mh);
846 mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
847 mh->mh_type = cpu_to_be32(GFS2_METATYPE_LB);
848 mh->mh_format = cpu_to_be32(GFS2_FORMAT_LB);
849 offset = sizeof(struct gfs2_meta_header);
850 }
851
852 *(__be64 *)(page_address(page) + offset) = cpu_to_be64(bd->bd_blkno);
853 offset += sizeof(u64);
854 }
855 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
856
857 gfs2_log_write_page(sdp, page);
858 }
859
860 static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
861 {
862 struct list_head *head = &sdp->sd_log_revokes;
863 struct gfs2_bufdata *bd, *tmp;
864
865 /*
866 * Glocks can be referenced repeatedly on the revoke list, but the list
867 * only holds one reference. All glocks on the list will have the
868 * GLF_REVOKES flag set initially.
869 */
870
871 list_for_each_entry_safe(bd, tmp, head, bd_list) {
872 struct gfs2_glock *gl = bd->bd_gl;
873
874 if (test_bit(GLF_REVOKES, &gl->gl_flags)) {
875 /* Keep each glock on the list exactly once. */
876 clear_bit(GLF_REVOKES, &gl->gl_flags);
877 continue;
878 }
879 list_del(&bd->bd_list);
880 kmem_cache_free(gfs2_bufdata_cachep, bd);
881 }
882 list_for_each_entry_safe(bd, tmp, head, bd_list) {
883 struct gfs2_glock *gl = bd->bd_gl;
884
885 list_del(&bd->bd_list);
886 kmem_cache_free(gfs2_bufdata_cachep, bd);
887 clear_bit(GLF_LFLUSH, &gl->gl_flags);
888 gfs2_glock_queue_put(gl);
889 }
890 /* the list is empty now */
891 }
892
893 static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
894 struct gfs2_log_header_host *head, int pass)
895 {
896 if (pass != 0)
897 return;
898
899 jd->jd_found_revokes = 0;
900 jd->jd_replay_tail = head->lh_tail;
901 }
902
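/**
 * revoke_lo_scan_elements - Collect revokes from a revoke log descriptor
 * @jd: The journal descriptor
 * @start: The first log block of the descriptor
 * @ld: The log descriptor
 * @ptr: Unused for revoke descriptors
 * @pass: The recovery pass number
 *
 * Read the revoked block numbers from the descriptor and any continuation
 * blocks and add them to the journal's revoke table, so that later replay
 * passes can skip those blocks.
 */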
903 static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, u32 start,
904 struct gfs2_log_descriptor *ld, __be64 *ptr,
905 int pass)
906 {
907 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
908 unsigned int blks = be32_to_cpu(ld->ld_length);
909 unsigned int revokes = be32_to_cpu(ld->ld_data1);
910 struct buffer_head *bh;
911 unsigned int offset;
912 u64 blkno;
913 int first = 1;
914 int error;
915
916 if (pass != 0 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_REVOKE)
917 return 0;
918
919 offset = sizeof(struct gfs2_log_descriptor);
920
921 for (; blks; gfs2_replay_incr_blk(jd, &start), blks--) {
922 error = gfs2_replay_read_block(jd, start, &bh);
923 if (error)
924 return error;
925
926 if (!first)
927 gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LB);
928
929 while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) {
930 blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));
931
932 error = gfs2_revoke_add(jd, blkno, start);
933 if (error < 0) {
934 brelse(bh);
935 return error;
936 }
937 else if (error)
938 jd->jd_found_revokes++;
939
940 if (!--revokes)
941 break;
942 offset += sizeof(u64);
943 }
944
945 brelse(bh);
946 offset = sizeof(struct gfs2_meta_header);
947 first = 0;
948 }
949
950 return 0;
951 }
952
953 static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
954 {
955 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
956
957 if (error) {
958 gfs2_revoke_clean(jd);
959 return;
960 }
961 if (pass != 1)
962 return;
963
964 fs_info(sdp, "jid=%u: Found %u revoke tags\n",
965 jd->jd_jid, jd->jd_found_revokes);
966
967 gfs2_revoke_clean(jd);
968 }
969
970 /**
971 * databuf_lo_before_commit - Scan the data buffers, writing as we go
972  * @sdp: The superblock
 * @tr: The transaction being committed
 *
973 */
974
975 static void databuf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
976 {
977 unsigned int limit = databuf_limit(sdp);
978 unsigned int nbuf;
979 if (tr == NULL)
980 return;
981 nbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm;
982 gfs2_before_commit(sdp, limit, nbuf, &tr->tr_databuf, 1);
983 }
984
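/**
 * databuf_lo_scan_elements - Replay journaled data blocks from a log descriptor
 * @jd: The journal descriptor
 * @start: The first log block of the descriptor
 * @ld: The log descriptor
 * @ptr: The array of (block number, escape flag) pairs after the descriptor
 * @pass: The recovery pass number
 *
 * Like buf_lo_scan_elements(), except that each entry carries an escape flag;
 * for escaped blocks, the GFS2 magic number is restored before the block is
 * copied to its in-place buffer.
 */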
985 static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start,
986 struct gfs2_log_descriptor *ld,
987 __be64 *ptr, int pass)
988 {
989 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
990 struct gfs2_glock *gl = ip->i_gl;
991 unsigned int blks = be32_to_cpu(ld->ld_data1);
992 struct buffer_head *bh_log, *bh_ip;
993 u64 blkno;
994 u64 esc;
995 int error = 0;
996
997 if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA)
998 return 0;
999
1000 gfs2_replay_incr_blk(jd, &start);
1001 for (; blks; gfs2_replay_incr_blk(jd, &start), blks--) {
1002 blkno = be64_to_cpu(*ptr++);
1003 esc = be64_to_cpu(*ptr++);
1004
1005 jd->jd_found_blocks++;
1006
1007 if (gfs2_revoke_check(jd, blkno, start))
1008 continue;
1009
1010 error = gfs2_replay_read_block(jd, start, &bh_log);
1011 if (error)
1012 return error;
1013
1014 bh_ip = gfs2_meta_new(gl, blkno);
1015 memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
1016
1017 /* Unescape */
1018 if (esc) {
1019 __be32 *eptr = (__be32 *)bh_ip->b_data;
1020 *eptr = cpu_to_be32(GFS2_MAGIC);
1021 }
1022 mark_buffer_dirty(bh_ip);
1023
1024 brelse(bh_log);
1025 brelse(bh_ip);
1026
1027 jd->jd_replayed_blocks++;
1028 }
1029
1030 return error;
1031 }
1032
1033 /* FIXME: sort out accounting for log blocks etc. */
1034
1035 static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
1036 {
1037 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
1038 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
1039
1040 if (error) {
1041 gfs2_meta_sync(ip->i_gl);
1042 return;
1043 }
1044 if (pass != 1)
1045 return;
1046
1047 /* data sync? */
1048 gfs2_meta_sync(ip->i_gl);
1049
1050 fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
1051 jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks);
1052 }
1053
1054 static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
1055 {
1056 struct list_head *head;
1057 struct gfs2_bufdata *bd;
1058
1059 if (tr == NULL)
1060 return;
1061
1062 head = &tr->tr_databuf;
1063 while (!list_empty(head)) {
1064 bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
1065 list_del_init(&bd->bd_list);
1066 gfs2_unpin(sdp, bd->bd_bh, tr);
1067 }
1068 }
1069
1070
1071 static const struct gfs2_log_operations gfs2_buf_lops = {
1072 .lo_before_commit = buf_lo_before_commit,
1073 .lo_after_commit = buf_lo_after_commit,
1074 .lo_before_scan = buf_lo_before_scan,
1075 .lo_scan_elements = buf_lo_scan_elements,
1076 .lo_after_scan = buf_lo_after_scan,
1077 .lo_name = "buf",
1078 };
1079
1080 static const struct gfs2_log_operations gfs2_revoke_lops = {
1081 .lo_before_commit = revoke_lo_before_commit,
1082 .lo_after_commit = revoke_lo_after_commit,
1083 .lo_before_scan = revoke_lo_before_scan,
1084 .lo_scan_elements = revoke_lo_scan_elements,
1085 .lo_after_scan = revoke_lo_after_scan,
1086 .lo_name = "revoke",
1087 };
1088
1089 static const struct gfs2_log_operations gfs2_databuf_lops = {
1090 .lo_before_commit = databuf_lo_before_commit,
1091 .lo_after_commit = databuf_lo_after_commit,
1092 .lo_scan_elements = databuf_lo_scan_elements,
1093 .lo_after_scan = databuf_lo_after_scan,
1094 .lo_name = "databuf",
1095 };
1096
1097 const struct gfs2_log_operations *gfs2_log_ops[] = {
1098 &gfs2_databuf_lops,
1099 &gfs2_buf_lops,
1100 &gfs2_revoke_lops,
1101 NULL,
1102 };
1103