/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_error.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_dir2_sf.h"
#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_log_priv.h"
#include "xfs_buf_item.h"
#include "xfs_log_recover.h"
#include "xfs_extfree_item.h"
#include "xfs_trans_priv.h"
#include "xfs_quota.h"
#include "xfs_rw.h"
#include "xfs_utils.h"
#include "xfs_trace.h"

STATIC int	xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
STATIC int	xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
#if defined(DEBUG)
STATIC void	xlog_recover_check_summary(xlog_t *);
#else
#define	xlog_recover_check_summary(log)
#endif

/*
 * Sector aligned buffer routines for buffer create/read/write/access
 */

/*
 * Verify the given count of basic blocks is a valid number of blocks
 * to specify for an operation involving the given XFS log buffer.
 * Returns nonzero if the count is valid, 0 otherwise.
 */

static inline int
xlog_buf_bbcount_valid(
	xlog_t		*log,
	int		bbcount)
{
	return bbcount > 0 && bbcount <= log->l_logBBsize;
}

/*
 * Allocate a buffer to hold log data.  The buffer needs to be able
 * to map to a range of nbblks basic blocks at any valid (basic
 * block) offset within the log.
 */
STATIC xfs_buf_t *
xlog_get_bp(
	xlog_t		*log,
	int		nbblks)
{
	if (!xlog_buf_bbcount_valid(log, nbblks)) {
		xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
			nbblks);
		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
		return NULL;
	}

	/*
	 * We do log I/O in units of log sectors (a power-of-2
	 * multiple of the basic block size), so we round up the
	 * requested size to accommodate the basic blocks required
	 * for complete log sectors.
	 *
	 * In addition, the buffer may be used for a non-sector-
	 * aligned block offset, in which case an I/O of the
	 * requested size could extend beyond the end of the
	 * buffer.  If the requested size is only 1 basic block it
	 * will never straddle a sector boundary, so this won't be
	 * an issue.  Nor will this be a problem if the log I/O is
	 * done in basic blocks (sector size 1).  But otherwise we
	 * extend the buffer by one extra log sector to ensure
	 * there's space to accommodate this possibility.
	 */
	if (nbblks > 1 && log->l_sectBBsize > 1)
		nbblks += log->l_sectBBsize;
	nbblks = round_up(nbblks, log->l_sectBBsize);

	return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
}
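
/*
 * Illustrative sketch (not part of the original file): how the sizing
 * above plays out for a log with 4k sectors on a 512-byte basic block
 * device, i.e. l_sectBBsize == 8.  A request for nbblks == 5 first
 * gains one sector of padding to cover a possibly unaligned start
 * (5 + 8 == 13), and round_up(13, 8) then yields 16 basic blocks of
 * buffer space, two full log sectors.
 */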

STATIC void
xlog_put_bp(
	xfs_buf_t	*bp)
{
	xfs_buf_free(bp);
}

STATIC xfs_caddr_t
xlog_align(
	xlog_t		*log,
	xfs_daddr_t	blk_no,
	int		nbblks,
	xfs_buf_t	*bp)
{
	xfs_caddr_t	ptr;

	if (log->l_sectBBsize == 1)
		return XFS_BUF_PTR(bp);

	ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
	ASSERT(XFS_BUF_SIZE(bp) >=
		BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
	return ptr;
}
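
/*
 * Illustrative sketch (hypothetical numbers): reads are rounded down to
 * a sector boundary by xlog_bread_noalign() below, so the caller's data
 * begins (blk_no & l_sectbb_mask) basic blocks into the buffer.  With
 * 8-basic-block sectors (mask 0x7) and blk_no == 21, the read starts at
 * block 16 and the wanted data sits BBTOB(21 & 7) == 2560 bytes in.
 */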

/*
 * nbblks should be uint, but oh well.  Just want to catch that 32-bit length.
 */
STATIC int
xlog_bread_noalign(
	xlog_t		*log,
	xfs_daddr_t	blk_no,
	int		nbblks,
	xfs_buf_t	*bp)
{
	int		error;

	if (!xlog_buf_bbcount_valid(log, nbblks)) {
		xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
			nbblks);
		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
		return EFSCORRUPTED;
	}

	blk_no = round_down(blk_no, log->l_sectBBsize);
	nbblks = round_up(nbblks, log->l_sectBBsize);

	ASSERT(nbblks > 0);
	ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));

	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
	XFS_BUF_READ(bp);
	XFS_BUF_BUSY(bp);
	XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
	XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);

	xfsbdstrat(log->l_mp, bp);
	error = xfs_iowait(bp);
	if (error)
		xfs_ioerror_alert("xlog_bread", log->l_mp,
				  bp, XFS_BUF_ADDR(bp));
	return error;
}

STATIC int
xlog_bread(
	xlog_t		*log,
	xfs_daddr_t	blk_no,
	int		nbblks,
	xfs_buf_t	*bp,
	xfs_caddr_t	*offset)
{
	int		error;

	error = xlog_bread_noalign(log, blk_no, nbblks, bp);
	if (error)
		return error;

	*offset = xlog_align(log, blk_no, nbblks, bp);
	return 0;
}
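
/*
 * Typical usage (sketch, mirroring the callers later in this file):
 * read one basic block and get back a pointer to it within the
 * sector-aligned buffer, e.g. to sample a block's cycle number:
 *
 *	xfs_caddr_t	offset;
 *	uint		cycle;
 *	int		error;
 *
 *	error = xlog_bread(log, blk_no, 1, bp, &offset);
 *	if (error)
 *		return error;
 *	cycle = xlog_get_cycle(offset);
 */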

/*
 * Write out the buffer at the given block for the given number of blocks.
 * The buffer is kept locked across the write and is returned locked.
 * This can only be used for synchronous log writes.
 */
STATIC int
xlog_bwrite(
	xlog_t		*log,
	xfs_daddr_t	blk_no,
	int		nbblks,
	xfs_buf_t	*bp)
{
	int		error;

	if (!xlog_buf_bbcount_valid(log, nbblks)) {
		xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
			nbblks);
		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
		return EFSCORRUPTED;
	}

	blk_no = round_down(blk_no, log->l_sectBBsize);
	nbblks = round_up(nbblks, log->l_sectBBsize);

	ASSERT(nbblks > 0);
	ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));

	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
	XFS_BUF_ZEROFLAGS(bp);
	XFS_BUF_BUSY(bp);
	XFS_BUF_HOLD(bp);
	XFS_BUF_PSEMA(bp, PRIBIO);
	XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
	XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);

	if ((error = xfs_bwrite(log->l_mp, bp)))
		xfs_ioerror_alert("xlog_bwrite", log->l_mp,
				  bp, XFS_BUF_ADDR(bp));
	return error;
}

#ifdef DEBUG
/*
 * dump debug superblock and log record information
 */
STATIC void
xlog_header_check_dump(
	xfs_mount_t		*mp,
	xlog_rec_header_t	*head)
{
	cmn_err(CE_DEBUG, "%s:  SB : uuid = %pU, fmt = %d\n",
		__func__, &mp->m_sb.sb_uuid, XLOG_FMT);
	cmn_err(CE_DEBUG, "    log : uuid = %pU, fmt = %d\n",
		&head->h_fs_uuid, be32_to_cpu(head->h_fmt));
}
#else
#define xlog_header_check_dump(mp, head)
#endif

/*
 * check log record header for recovery
 */
STATIC int
xlog_header_check_recover(
	xfs_mount_t		*mp,
	xlog_rec_header_t	*head)
{
	ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM);

	/*
	 * IRIX doesn't write the h_fmt field and leaves it zeroed
	 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
	 * a dirty log created in IRIX.
	 */
	if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) {
		xlog_warn(
	"XFS: dirty log written in incompatible format - can't recover");
		xlog_header_check_dump(mp, head);
		XFS_ERROR_REPORT("xlog_header_check_recover(1)",
				 XFS_ERRLEVEL_HIGH, mp);
		return XFS_ERROR(EFSCORRUPTED);
	} else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
		xlog_warn(
	"XFS: dirty log entry has mismatched uuid - can't recover");
		xlog_header_check_dump(mp, head);
		XFS_ERROR_REPORT("xlog_header_check_recover(2)",
				 XFS_ERRLEVEL_HIGH, mp);
		return XFS_ERROR(EFSCORRUPTED);
	}
	return 0;
}

/*
 * read the head block of the log and check the header
 */
STATIC int
xlog_header_check_mount(
	xfs_mount_t		*mp,
	xlog_rec_header_t	*head)
{
	ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM);

	if (uuid_is_nil(&head->h_fs_uuid)) {
		/*
		 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
		 * h_fs_uuid is nil, we assume this log was last mounted
		 * by IRIX and continue.
		 */
		xlog_warn("XFS: nil uuid in log - IRIX style log");
	} else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
		xlog_warn("XFS: log has mismatched uuid - can't recover");
		xlog_header_check_dump(mp, head);
		XFS_ERROR_REPORT("xlog_header_check_mount",
				 XFS_ERRLEVEL_HIGH, mp);
		return XFS_ERROR(EFSCORRUPTED);
	}
	return 0;
}

STATIC void
xlog_recover_iodone(
	struct xfs_buf	*bp)
{
	if (XFS_BUF_GETERROR(bp)) {
		/*
		 * We're not going to bother about retrying
		 * this during recovery. One strike!
		 */
		xfs_ioerror_alert("xlog_recover_iodone",
				  bp->b_mount, bp, XFS_BUF_ADDR(bp));
		xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
	}
	bp->b_mount = NULL;
	XFS_BUF_CLR_IODONE_FUNC(bp);
	xfs_biodone(bp);
}

/*
 * This routine finds (to an approximation) the first block in the physical
 * log which contains the given cycle.  It uses a binary search algorithm.
 * Note that the algorithm cannot be perfect because the disk will not
 * necessarily be perfect.
 */
STATIC int
xlog_find_cycle_start(
	xlog_t		*log,
	xfs_buf_t	*bp,
	xfs_daddr_t	first_blk,
	xfs_daddr_t	*last_blk,
	uint		cycle)
{
	xfs_caddr_t	offset;
	xfs_daddr_t	mid_blk;
	xfs_daddr_t	end_blk;
	uint		mid_cycle;
	int		error;

	end_blk = *last_blk;
	mid_blk = BLK_AVG(first_blk, end_blk);
	while (mid_blk != first_blk && mid_blk != end_blk) {
		error = xlog_bread(log, mid_blk, 1, bp, &offset);
		if (error)
			return error;
		mid_cycle = xlog_get_cycle(offset);
		if (mid_cycle == cycle)
			end_blk = mid_blk;   /* last_half_cycle == mid_cycle */
		else
			first_blk = mid_blk; /* first_half_cycle == mid_cycle */
		mid_blk = BLK_AVG(first_blk, end_blk);
	}
	ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
	       (mid_blk == end_blk && mid_blk-1 == first_blk));

	*last_blk = end_blk;

	return 0;
}
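
/*
 * Illustrative sketch (hypothetical, userspace): the same bisection over
 * an in-memory array of cycle numbers.  The invariant is that the cycle
 * at 'first' differs from the one we want while the cycle at 'last'
 * matches it, so the loop converges on the first matching block:
 *
 *	static int
 *	find_cycle_start(unsigned int *cycles, int first, int last,
 *			 unsigned int cycle)
 *	{
 *		int mid = (first + last) / 2;
 *
 *		while (mid != first && mid != last) {
 *			if (cycles[mid] == cycle)
 *				last = mid;
 *			else
 *				first = mid;
 *			mid = (first + last) / 2;
 *		}
 *		return last;
 *	}
 */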

/*
 * Check that a range of blocks does not contain stop_on_cycle_no.
 * Fill in *new_blk with the block offset where such a block is
 * found, or with -1 (an invalid block number) if there is no such
 * block in the range.  The scan needs to occur from front to back
 * and the pointer into the region must be updated since a later
 * routine will need to perform another test.
 */
STATIC int
xlog_find_verify_cycle(
	xlog_t		*log,
	xfs_daddr_t	start_blk,
	int		nbblks,
	uint		stop_on_cycle_no,
	xfs_daddr_t	*new_blk)
{
	xfs_daddr_t	i, j;
	uint		cycle;
	xfs_buf_t	*bp;
	xfs_daddr_t	bufblks;
	xfs_caddr_t	buf = NULL;
	int		error = 0;

	/*
	 * Greedily allocate a buffer big enough to handle the full
	 * range of basic blocks we'll be examining.  If that fails,
	 * try a smaller size.  We need to be able to read at least
	 * a log sector, or we're out of luck.
	 */
	bufblks = 1 << ffs(nbblks);
	while (!(bp = xlog_get_bp(log, bufblks))) {
		bufblks >>= 1;
		if (bufblks < log->l_sectBBsize)
			return ENOMEM;
	}

	for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
		int	bcount;

		bcount = min(bufblks, (start_blk + nbblks - i));

		error = xlog_bread(log, i, bcount, bp, &buf);
		if (error)
			goto out;

		for (j = 0; j < bcount; j++) {
			cycle = xlog_get_cycle(buf);
			if (cycle == stop_on_cycle_no) {
				*new_blk = i+j;
				goto out;
			}

			buf += BBSIZE;
		}
	}

	*new_blk = -1;

out:
	xlog_put_bp(bp);
	return error;
}

/*
 * Potentially backup over partial log record write.
 *
 * In the typical case, last_blk is the number of the block directly after
 * a good log record.  Therefore, we subtract one to get the block number
 * of the last block in the given buffer.  extra_bblks contains the number
 * of blocks we would have read on a previous read.  This happens when the
 * last log record is split over the end of the physical log.
 *
 * extra_bblks is the number of blocks potentially verified on a previous
 * call to this routine.
 */
STATIC int
xlog_find_verify_log_record(
	xlog_t			*log,
	xfs_daddr_t		start_blk,
	xfs_daddr_t		*last_blk,
	int			extra_bblks)
{
	xfs_daddr_t		i;
	xfs_buf_t		*bp;
	xfs_caddr_t		offset = NULL;
	xlog_rec_header_t	*head = NULL;
	int			error = 0;
	int			smallmem = 0;
	int			num_blks = *last_blk - start_blk;
	int			xhdrs;

	ASSERT(start_blk != 0 || *last_blk != start_blk);

	if (!(bp = xlog_get_bp(log, num_blks))) {
		if (!(bp = xlog_get_bp(log, 1)))
			return ENOMEM;
		smallmem = 1;
	} else {
		error = xlog_bread(log, start_blk, num_blks, bp, &offset);
		if (error)
			goto out;
		offset += ((num_blks - 1) << BBSHIFT);
	}

	for (i = (*last_blk) - 1; i >= 0; i--) {
		if (i < start_blk) {
			/* valid log record not found */
			xlog_warn(
		"XFS: Log inconsistent (didn't find previous header)");
			ASSERT(0);
			error = XFS_ERROR(EIO);
			goto out;
		}

		if (smallmem) {
			error = xlog_bread(log, i, 1, bp, &offset);
			if (error)
				goto out;
		}

		head = (xlog_rec_header_t *)offset;

		if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(head->h_magicno))
			break;

		if (!smallmem)
			offset -= BBSIZE;
	}

	/*
	 * We hit the beginning of the physical log & still no header.  Return
	 * to caller.  If caller can handle a return of -1, then this routine
	 * will be called again for the end of the physical log.
	 */
	if (i == -1) {
		error = -1;
		goto out;
	}

	/*
	 * We have the final block of the good log (the first block
	 * of the log record _before_ the head).  So we check the uuid.
	 */
	if ((error = xlog_header_check_mount(log->l_mp, head)))
		goto out;

	/*
	 * We may have found a log record header before we expected one.
	 * last_blk will be the 1st block # with a given cycle #.  We may end
	 * up reading an entire log record.  In this case, we don't want to
	 * reset last_blk.  Only when last_blk points in the middle of a log
	 * record do we update last_blk.
	 */
	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
		uint	h_size = be32_to_cpu(head->h_size);

		xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
		if (h_size % XLOG_HEADER_CYCLE_SIZE)
			xhdrs++;
	} else {
		xhdrs = 1;
	}

	if (*last_blk - i + extra_bblks !=
	    BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
		*last_blk = i;

out:
	xlog_put_bp(bp);
	return error;
}

/*
 * Head is defined to be the point of the log where the next log write
 * could go.  This means that incomplete LR writes at the end are
 * eliminated when calculating the head.  We aren't guaranteed that previous
 * LR have complete transactions.  We only know that a cycle number of
 * current cycle number -1 won't be present in the log if we start writing
 * from our current block number.
 *
 * last_blk contains the block number of the first block with a given
 * cycle number.
 *
 * Return: zero if normal, non-zero if error.
 */
STATIC int
xlog_find_head(
	xlog_t		*log,
	xfs_daddr_t	*return_head_blk)
{
	xfs_buf_t	*bp;
	xfs_caddr_t	offset;
	xfs_daddr_t	new_blk, first_blk, start_blk, last_blk, head_blk;
	int		num_scan_bblks;
	uint		first_half_cycle, last_half_cycle;
	uint		stop_on_cycle;
	int		error, log_bbnum = log->l_logBBsize;

	/* Is the end of the log device zeroed? */
	if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
		*return_head_blk = first_blk;

		/* Is the whole lot zeroed? */
		if (!first_blk) {
			/* Linux XFS shouldn't generate totally zeroed logs -
			 * mkfs etc write a dummy unmount record to a fresh
			 * log so we can store the uuid in there
			 */
			xlog_warn("XFS: totally zeroed log");
		}

		return 0;
	} else if (error) {
		xlog_warn("XFS: empty log check failed");
		return error;
	}

	first_blk = 0;			/* get cycle # of 1st block */
	bp = xlog_get_bp(log, 1);
	if (!bp)
		return ENOMEM;

	error = xlog_bread(log, 0, 1, bp, &offset);
	if (error)
		goto bp_err;

	first_half_cycle = xlog_get_cycle(offset);

	last_blk = head_blk = log_bbnum - 1;	/* get cycle # of last block */
	error = xlog_bread(log, last_blk, 1, bp, &offset);
	if (error)
		goto bp_err;

	last_half_cycle = xlog_get_cycle(offset);
	ASSERT(last_half_cycle != 0);

	/*
	 * If the 1st half cycle number is equal to the last half cycle number,
	 * then the entire log is stamped with the same cycle number.  In this
	 * case, head_blk can't be set to zero (which makes sense).  The below
	 * math doesn't work out properly with head_blk equal to zero.  Instead,
	 * we set it to log_bbnum which is an invalid block number, but this
	 * value makes the math correct.  If head_blk doesn't change through
	 * all the tests below, *head_blk is set to zero at the very end rather
	 * than log_bbnum.  In a sense, log_bbnum and zero are the same block
	 * in a circular file.
	 */
	if (first_half_cycle == last_half_cycle) {
		/*
		 * In this case we believe that the entire log should have
		 * cycle number last_half_cycle.  We need to scan backwards
		 * from the end verifying that there are no holes still
		 * containing last_half_cycle - 1.  If we find such a hole,
		 * then the start of that hole will be the new head.  The
		 * simple case looks like
		 *        x | x ... | x - 1 | x
		 * Another case that fits this picture would be
		 *        x | x + 1 | x ... | x
		 * In this case the head really is somewhere at the end of the
		 * log, as one of the latest writes at the beginning was
		 * incomplete.
		 * One more case is
		 *        x | x + 1 | x ... | x - 1 | x
		 * This is really the combination of the above two cases, and
		 * the head has to end up at the start of the x-1 hole at the
		 * end of the log.
		 *
		 * In the 256k log case, we will read from the beginning to the
		 * end of the log and search for cycle numbers equal to x-1.
		 * We don't worry about the x+1 blocks that we encounter,
		 * because we know that they cannot be the head since the log
		 * started with x.
		 */
		head_blk = log_bbnum;
		stop_on_cycle = last_half_cycle - 1;
	} else {
		/*
		 * In this case we want to find the first block with cycle
		 * number matching last_half_cycle.  We expect the log to be
		 * some variation on
		 *        x + 1 ... | x ... | x
		 * The first block with cycle number x (last_half_cycle) will
		 * be where the new head belongs.  First we do a binary search
		 * for the first occurrence of last_half_cycle.  The binary
		 * search may not be totally accurate, so then we scan back
		 * from there looking for occurrences of last_half_cycle before
		 * us.  If that backwards scan wraps around the beginning of
		 * the log, then we look for occurrences of last_half_cycle - 1
		 * at the end of the log.  The cases we're looking for look
		 * like
		 *                               v binary search stopped here
		 *        x + 1 ... | x | x + 1 | x ... | x
		 *                   ^ but we want to locate this spot
		 * or
		 *        <---------> less than scan distance
		 *        x + 1 ... | x ... | x - 1 | x
		 *                           ^ we want to locate this spot
		 */
		stop_on_cycle = last_half_cycle;
		if ((error = xlog_find_cycle_start(log, bp, first_blk,
						&head_blk, last_half_cycle)))
			goto bp_err;
	}

	/*
	 * Now validate the answer.  Scan back some number of maximum possible
	 * blocks and make sure each one has the expected cycle number.  The
	 * maximum is determined by the total possible amount of buffering
	 * in the in-core log.  The following number can be made tighter if
	 * we actually look at the block size of the filesystem.
	 */
	num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
	if (head_blk >= num_scan_bblks) {
		/*
		 * We are guaranteed that the entire check can be performed
		 * in one buffer.
		 */
		start_blk = head_blk - num_scan_bblks;
		if ((error = xlog_find_verify_cycle(log,
						start_blk, num_scan_bblks,
						stop_on_cycle, &new_blk)))
			goto bp_err;
		if (new_blk != -1)
			head_blk = new_blk;
	} else {		/* need to read 2 parts of log */
		/*
		 * We are going to scan backwards in the log in two parts.
		 * First we scan the physical end of the log.  In this part
		 * of the log, we are looking for blocks with cycle number
		 * last_half_cycle - 1.
		 * If we find one, then we know that the log starts there, as
		 * we've found a hole that didn't get written in going around
		 * the end of the physical log.  The simple case for this is
		 *        x + 1 ... | x ... | x - 1 | x
		 *        <---------> less than scan distance
		 * If all of the blocks at the end of the log have cycle number
		 * last_half_cycle, then we check the blocks at the start of
		 * the log looking for occurrences of last_half_cycle.  If we
		 * find one, then our current estimate for the location of the
		 * first occurrence of last_half_cycle is wrong and we move
		 * back to the hole we've found.  This case looks like
		 *        x + 1 ... | x | x + 1 | x ...
		 *                       ^ binary search stopped here
		 * Another case we need to handle that only occurs in 256k
		 * logs is
		 *        x + 1 ... | x ... | x+1 | x ...
		 *                   ^ binary search stops here
		 * In a 256k log, the scan at the end of the log will see the
		 * x + 1 blocks.  We need to skip past those since that is
		 * certainly not the head of the log.  By searching for
		 * last_half_cycle-1 we accomplish that.
		 */
		ASSERT(head_blk <= INT_MAX &&
			(xfs_daddr_t) num_scan_bblks >= head_blk);
		start_blk = log_bbnum - (num_scan_bblks - head_blk);
		if ((error = xlog_find_verify_cycle(log, start_blk,
					num_scan_bblks - (int)head_blk,
					(stop_on_cycle - 1), &new_blk)))
			goto bp_err;
		if (new_blk != -1) {
			head_blk = new_blk;
			goto validate_head;
		}

		/*
		 * Scan beginning of log now.  The last part of the physical
		 * log is good.  This scan needs to verify that it doesn't find
		 * the last_half_cycle.
		 */
		start_blk = 0;
		ASSERT(head_blk <= INT_MAX);
		if ((error = xlog_find_verify_cycle(log,
					start_blk, (int)head_blk,
					stop_on_cycle, &new_blk)))
			goto bp_err;
		if (new_blk != -1)
			head_blk = new_blk;
	}

validate_head:
	/*
	 * Now we need to make sure head_blk is not pointing to a block in
	 * the middle of a log record.
	 */
	num_scan_bblks = XLOG_REC_SHIFT(log);
	if (head_blk >= num_scan_bblks) {
		start_blk = head_blk - num_scan_bblks; /* don't read head_blk */

		/* start ptr at last block ptr before head_blk */
		if ((error = xlog_find_verify_log_record(log, start_blk,
							&head_blk, 0)) == -1) {
			error = XFS_ERROR(EIO);
			goto bp_err;
		} else if (error)
			goto bp_err;
	} else {
		start_blk = 0;
		ASSERT(head_blk <= INT_MAX);
		if ((error = xlog_find_verify_log_record(log, start_blk,
							&head_blk, 0)) == -1) {
			/* We hit the beginning of the log during our search */
			start_blk = log_bbnum - (num_scan_bblks - head_blk);
			new_blk = log_bbnum;
			ASSERT(start_blk <= INT_MAX &&
				(xfs_daddr_t) log_bbnum - start_blk >= 0);
			ASSERT(head_blk <= INT_MAX);
			if ((error = xlog_find_verify_log_record(log,
							start_blk, &new_blk,
							(int)head_blk)) == -1) {
				error = XFS_ERROR(EIO);
				goto bp_err;
			} else if (error)
				goto bp_err;
			if (new_blk != log_bbnum)
				head_blk = new_blk;
		} else if (error)
			goto bp_err;
	}

	xlog_put_bp(bp);
	if (head_blk == log_bbnum)
		*return_head_blk = 0;
	else
		*return_head_blk = head_blk;
	/*
	 * When returning here, we have a good block number.  Bad block
	 * means that during a previous crash, we didn't have a clean break
	 * from cycle number N to cycle number N-1.  In this case, we need
	 * to find the first block with cycle number N-1.
	 */
	return 0;

 bp_err:
	xlog_put_bp(bp);

	if (error)
		xlog_warn("XFS: failed to find log head");
	return error;
}
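
/*
 * Illustrative sketch (hypothetical, userspace): the core idea above on
 * an in-memory array, ignoring the hole and wrap cases the real code
 * handles.  The head is the first block whose cycle number differs from
 * the cycle at block zero; for cycles = { 9, 9, 9, 8, 8, 8, 8, 8 } that
 * is block 3, since blocks 0-2 were rewritten in cycle 9:
 *
 *	static int
 *	find_head(unsigned int *cycles, int nblocks)
 *	{
 *		int blk;
 *
 *		for (blk = 1; blk < nblocks; blk++)
 *			if (cycles[blk] != cycles[0])
 *				break;
 *		return blk == nblocks ? 0 : blk;
 *	}
 */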

/*
 * Find the sync block number or the tail of the log.
 *
 * This will be the block number of the last record to have its
 * associated buffers synced to disk.  Every log record header has
 * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
 * to get a sync block number.  The only concern is to figure out which
 * log record header to believe.
 *
 * The following algorithm uses the log record header with the largest
 * lsn.  The entire log record does not need to be valid.  We only care
 * that the header is valid.
 *
 * We could speed up search by using current head_blk buffer, but it is not
 * available.
 */
STATIC int
xlog_find_tail(
	xlog_t			*log,
	xfs_daddr_t		*head_blk,
	xfs_daddr_t		*tail_blk)
{
	xlog_rec_header_t	*rhead;
	xlog_op_header_t	*op_head;
	xfs_caddr_t		offset = NULL;
	xfs_buf_t		*bp;
	int			error, i, found;
	xfs_daddr_t		umount_data_blk;
	xfs_daddr_t		after_umount_blk;
	xfs_lsn_t		tail_lsn;
	int			hblks;

	found = 0;

	/*
	 * Find previous log record
	 */
	if ((error = xlog_find_head(log, head_blk)))
		return error;

	bp = xlog_get_bp(log, 1);
	if (!bp)
		return ENOMEM;
	if (*head_blk == 0) {				/* special case */
		error = xlog_bread(log, 0, 1, bp, &offset);
		if (error)
			goto done;

		if (xlog_get_cycle(offset) == 0) {
			*tail_blk = 0;
			/* leave all other log inited values alone */
			goto done;
		}
	}

	/*
	 * Search backwards looking for log record header block
	 */
	ASSERT(*head_blk < INT_MAX);
	for (i = (int)(*head_blk) - 1; i >= 0; i--) {
		error = xlog_bread(log, i, 1, bp, &offset);
		if (error)
			goto done;

		if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
			found = 1;
			break;
		}
	}
	/*
	 * If we haven't found the log record header block, start looking
	 * again from the end of the physical log.  XXXmiken: There should be
	 * a check here to make sure we didn't search more than N blocks in
	 * the previous code.
	 */
	if (!found) {
		for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
			error = xlog_bread(log, i, 1, bp, &offset);
			if (error)
				goto done;

			if (XLOG_HEADER_MAGIC_NUM ==
			    be32_to_cpu(*(__be32 *)offset)) {
				found = 2;
				break;
			}
		}
	}
	if (!found) {
		xlog_warn("XFS: xlog_find_tail: couldn't find sync record");
		ASSERT(0);
		return XFS_ERROR(EIO);
	}

	/* find blk_no of tail of log */
	rhead = (xlog_rec_header_t *)offset;
	*tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));

	/*
	 * Reset log values according to the state of the log when we
	 * crashed.  In the case where head_blk == 0, we bump curr_cycle
	 * one because the next write starts a new cycle rather than
	 * continuing the cycle of the last good log record.  At this
	 * point we have guaranteed that all partial log records have been
	 * accounted for.  Therefore, we know that the last good log record
	 * written was complete and ended exactly on the end boundary
	 * of the physical log.
	 */
	log->l_prev_block = i;
	log->l_curr_block = (int)*head_blk;
	log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
	if (found == 2)
		log->l_curr_cycle++;
	log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn);
	log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn);
	log->l_grant_reserve_cycle = log->l_curr_cycle;
	log->l_grant_reserve_bytes = BBTOB(log->l_curr_block);
	log->l_grant_write_cycle = log->l_curr_cycle;
	log->l_grant_write_bytes = BBTOB(log->l_curr_block);

	/*
	 * Look for unmount record.  If we find it, then we know there
	 * was a clean unmount.  Since 'i' could be the last block in
	 * the physical log, we convert to a log block before comparing
	 * to the head_blk.
	 *
	 * Save the current tail lsn to use to pass to
	 * xlog_clear_stale_blocks() below.  We won't want to clear the
	 * unmount record if there is one, so we pass the lsn of the
	 * unmount record rather than the block after it.
	 */
	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
		int	h_size = be32_to_cpu(rhead->h_size);
		int	h_version = be32_to_cpu(rhead->h_version);

		if ((h_version & XLOG_VERSION_2) &&
		    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
			hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
			if (h_size % XLOG_HEADER_CYCLE_SIZE)
				hblks++;
		} else {
			hblks = 1;
		}
	} else {
		hblks = 1;
	}
	after_umount_blk = (i + hblks + (int)
		BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
	tail_lsn = log->l_tail_lsn;
	if (*head_blk == after_umount_blk &&
	    be32_to_cpu(rhead->h_num_logops) == 1) {
		umount_data_blk = (i + hblks) % log->l_logBBsize;
		error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
		if (error)
			goto done;

		op_head = (xlog_op_header_t *)offset;
		if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
			/*
			 * Set tail and last sync so that newly written
			 * log records will point recovery to after the
			 * current unmount record.
			 */
			log->l_tail_lsn =
				xlog_assign_lsn(log->l_curr_cycle,
						after_umount_blk);
			log->l_last_sync_lsn =
				xlog_assign_lsn(log->l_curr_cycle,
						after_umount_blk);
			*tail_blk = after_umount_blk;

			/*
			 * Note that the unmount was clean. If the unmount
			 * was not clean, we need to know this to rebuild the
			 * superblock counters from the perag headers if we
			 * have a filesystem using non-persistent counters.
			 */
			log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
		}
	}

	/*
	 * Make sure that there are no blocks in front of the head
	 * with the same cycle number as the head.  This can happen
	 * because we allow multiple outstanding log writes concurrently,
	 * and the later writes might make it out before earlier ones.
	 *
	 * We use the lsn from before modifying it so that we'll never
	 * overwrite the unmount record after a clean unmount.
	 *
	 * Do this only if we are going to recover the filesystem
	 *
	 * NOTE: This used to say "if (!readonly)"
	 * However on Linux, we can & do recover a read-only filesystem.
	 * We only skip recovery if NORECOVERY is specified on mount,
	 * in which case we would not be here.
	 *
	 * But... if the -device- itself is readonly, just skip this.
	 * We can't recover this device anyway, so it won't matter.
	 */
	if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
		error = xlog_clear_stale_blocks(log, tail_lsn);

done:
	xlog_put_bp(bp);

	if (error)
		xlog_warn("XFS: failed to locate log tail");
	return error;
}
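
/*
 * Illustrative sketch (hypothetical): an LSN packs the cycle number into
 * the high 32 bits and the basic block number into the low 32 bits,
 * which is why BLOCK_LSN() above can recover a tail block straight from
 * rhead->h_tail_lsn:
 *
 *	typedef long long lsn_t;		// stand-in for xfs_lsn_t
 *
 *	static lsn_t
 *	assign_lsn(unsigned int cycle, unsigned int block)
 *	{
 *		return ((lsn_t)cycle << 32) | block;
 *	}
 *
 *	static unsigned int lsn_cycle(lsn_t lsn)	{ return lsn >> 32; }
 *	static unsigned int lsn_block(lsn_t lsn)	{ return lsn; }
 */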

/*
 * Is the log zeroed at all?
 *
 * The last binary search should be changed to perform an X block read
 * once X becomes small enough.  You can then search linearly through
 * the X blocks.  This will cut down on the number of reads we need to do.
 *
 * If the log is partially zeroed, this routine will pass back the blkno
 * of the first block with cycle number 0.  It won't have a complete LR
 * preceding it.
 *
 * Return:
 *	0  => the log is completely written to
 *	-1 => use *blk_no as the first block of the log
 *	>0 => error has occurred
 */
STATIC int
xlog_find_zeroed(
	xlog_t		*log,
	xfs_daddr_t	*blk_no)
{
	xfs_buf_t	*bp;
	xfs_caddr_t	offset;
	uint		first_cycle, last_cycle;
	xfs_daddr_t	new_blk, last_blk, start_blk;
	xfs_daddr_t	num_scan_bblks;
	int		error, log_bbnum = log->l_logBBsize;

	*blk_no = 0;

	/* check totally zeroed log */
	bp = xlog_get_bp(log, 1);
	if (!bp)
		return ENOMEM;
	error = xlog_bread(log, 0, 1, bp, &offset);
	if (error)
		goto bp_err;

	first_cycle = xlog_get_cycle(offset);
	if (first_cycle == 0) {		/* completely zeroed log */
		*blk_no = 0;
		xlog_put_bp(bp);
		return -1;
	}

	/* check partially zeroed log */
	error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
	if (error)
		goto bp_err;

	last_cycle = xlog_get_cycle(offset);
	if (last_cycle != 0) {		/* log completely written to */
		xlog_put_bp(bp);
		return 0;
	} else if (first_cycle != 1) {
		/*
		 * If the cycle of the last block is zero, the cycle of
		 * the first block must be 1. If it's not, maybe we're
		 * not looking at a log... Bail out.
		 */
		xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)");
		return XFS_ERROR(EINVAL);
	}

	/* we have a partially zeroed log */
	last_blk = log_bbnum-1;
	if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
		goto bp_err;

	/*
	 * Validate the answer.  Because there is no way to guarantee that
	 * the entire log is made up of log records which are the same size,
	 * we scan over the defined maximum blocks.  At this point, the maximum
	 * is not chosen to mean anything special.  XXXmiken
	 */
	num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
	ASSERT(num_scan_bblks <= INT_MAX);

	if (last_blk < num_scan_bblks)
		num_scan_bblks = last_blk;
	start_blk = last_blk - num_scan_bblks;

	/*
	 * We search for any instances of cycle number 0 that occur before
	 * our current estimate of the head.  What we're trying to detect is
	 *        1 ... | 0 | 1 | 0...
	 *                       ^ binary search ends here
	 */
	if ((error = xlog_find_verify_cycle(log, start_blk,
					 (int)num_scan_bblks, 0, &new_blk)))
		goto bp_err;
	if (new_blk != -1)
		last_blk = new_blk;

	/*
	 * Potentially backup over partial log record write.  We don't need
	 * to search the end of the log because we know it is zero.
	 */
	if ((error = xlog_find_verify_log_record(log, start_blk,
				&last_blk, 0)) == -1) {
		error = XFS_ERROR(EIO);
		goto bp_err;
	} else if (error)
		goto bp_err;

	*blk_no = last_blk;
bp_err:
	xlog_put_bp(bp);
	if (error)
		return error;
	return -1;
}

/*
 * These are simple subroutines used by xlog_clear_stale_blocks() below
 * to initialize a buffer full of empty log record headers and write
 * them into the log.
 */
STATIC void
xlog_add_record(
	xlog_t			*log,
	xfs_caddr_t		buf,
	int			cycle,
	int			block,
	int			tail_cycle,
	int			tail_block)
{
	xlog_rec_header_t	*recp = (xlog_rec_header_t *)buf;

	memset(buf, 0, BBSIZE);
	recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
	recp->h_cycle = cpu_to_be32(cycle);
	recp->h_version = cpu_to_be32(
			xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
	recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
	recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
	recp->h_fmt = cpu_to_be32(XLOG_FMT);
	memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
}

STATIC int
xlog_write_log_records(
	xlog_t		*log,
	int		cycle,
	int		start_block,
	int		blocks,
	int		tail_cycle,
	int		tail_block)
{
	xfs_caddr_t	offset;
	xfs_buf_t	*bp;
	int		balign, ealign;
	int		sectbb = log->l_sectBBsize;
	int		end_block = start_block + blocks;
	int		bufblks;
	int		error = 0;
	int		i, j = 0;

	/*
	 * Greedily allocate a buffer big enough to handle the full
	 * range of basic blocks to be written.  If that fails, try
	 * a smaller size.  We need to be able to write at least a
	 * log sector, or we're out of luck.
	 */
	bufblks = 1 << ffs(blocks);
	while (!(bp = xlog_get_bp(log, bufblks))) {
		bufblks >>= 1;
		if (bufblks < sectbb)
			return ENOMEM;
	}

	/* We may need to do a read at the start to fill in part of
	 * the buffer in the starting sector not covered by the first
	 * write below.
	 */
	balign = round_down(start_block, sectbb);
	if (balign != start_block) {
		error = xlog_bread_noalign(log, start_block, 1, bp);
		if (error)
			goto out_put_bp;

		j = start_block - balign;
	}

	for (i = start_block; i < end_block; i += bufblks) {
		int		bcount, endcount;

		bcount = min(bufblks, end_block - start_block);
		endcount = bcount - j;

		/* We may need to do a read at the end to fill in part of
		 * the buffer in the final sector not covered by the write.
		 * If this is the same sector as the above read, skip it.
		 */
		ealign = round_down(end_block, sectbb);
		if (j == 0 && (start_block + endcount > ealign)) {
			offset = XFS_BUF_PTR(bp);
			balign = BBTOB(ealign - start_block);
			error = XFS_BUF_SET_PTR(bp, offset + balign,
						BBTOB(sectbb));
			if (error)
				break;

			error = xlog_bread_noalign(log, ealign, sectbb, bp);
			if (error)
				break;

			error = XFS_BUF_SET_PTR(bp, offset, bufblks);
			if (error)
				break;
		}

		offset = xlog_align(log, start_block, endcount, bp);
		for (; j < endcount; j++) {
			xlog_add_record(log, offset, cycle, i+j,
					tail_cycle, tail_block);
			offset += BBSIZE;
		}
		error = xlog_bwrite(log, start_block, endcount, bp);
		if (error)
			break;
		start_block += endcount;
		j = 0;
	}

 out_put_bp:
	xlog_put_bp(bp);
	return error;
}
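
/*
 * Worked example (hypothetical numbers) for the sector fill-in above:
 * with sectbb == 8, start_block == 21 and blocks == 30, balign becomes
 * 16, so one block is read to preserve the first 5 basic blocks of the
 * starting sector (j == 5).  end_block is 51 and ealign is 48, so the
 * tail of the final sector is likewise read back before the buffer is
 * stamped with dummy records and written out.
 */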

/*
 * This routine is called to blow away any incomplete log writes out
 * in front of the log head.  We do this so that we won't become confused
 * if we come up, write only a little bit more, and then crash again.
 * If we leave the partial log records out there, this situation could
 * cause us to think those partial writes are valid blocks since they
 * have the current cycle number.  We get rid of them by overwriting them
 * with empty log records with the old cycle number rather than the
 * current one.
 *
 * The tail lsn is passed in rather than taken from
 * the log so that we will not write over the unmount record after a
 * clean unmount in a 512 block log.  Doing so would leave the log without
 * any valid log records in it until a new one was written.  If we crashed
 * during that time we would not be able to recover.
 */
STATIC int
xlog_clear_stale_blocks(
	xlog_t		*log,
	xfs_lsn_t	tail_lsn)
{
	int		tail_cycle, head_cycle;
	int		tail_block, head_block;
	int		tail_distance, max_distance;
	int		distance;
	int		error;

	tail_cycle = CYCLE_LSN(tail_lsn);
	tail_block = BLOCK_LSN(tail_lsn);
	head_cycle = log->l_curr_cycle;
	head_block = log->l_curr_block;

	/*
	 * Figure out the distance between the new head of the log
	 * and the tail.  We want to write over any blocks beyond the
	 * head that we may have written just before the crash, but
	 * we don't want to overwrite the tail of the log.
	 */
	if (head_cycle == tail_cycle) {
		/*
		 * The tail is behind the head in the physical log,
		 * so the distance from the head to the tail is the
		 * distance from the head to the end of the log plus
		 * the distance from the beginning of the log to the
		 * tail.
		 */
		if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
			XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
					 XFS_ERRLEVEL_LOW, log->l_mp);
			return XFS_ERROR(EFSCORRUPTED);
		}
		tail_distance = tail_block + (log->l_logBBsize - head_block);
	} else {
		/*
		 * The head is behind the tail in the physical log,
		 * so the distance from the head to the tail is just
		 * the tail block minus the head block.
		 */
		if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
			XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
					 XFS_ERRLEVEL_LOW, log->l_mp);
			return XFS_ERROR(EFSCORRUPTED);
		}
		tail_distance = tail_block - head_block;
	}

	/*
	 * If the head is right up against the tail, we can't clear
	 * anything.
	 */
	if (tail_distance <= 0) {
		ASSERT(tail_distance == 0);
		return 0;
	}

	max_distance = XLOG_TOTAL_REC_SHIFT(log);
	/*
	 * Take the smaller of the maximum amount of outstanding I/O
	 * we could have and the distance to the tail to clear out.
	 * We take the smaller so that we don't overwrite the tail and
	 * we don't waste all day writing from the head to the tail
	 * for no reason.
	 */
	max_distance = MIN(max_distance, tail_distance);

	if ((head_block + max_distance) <= log->l_logBBsize) {
		/*
		 * We can stomp all the blocks we need to without
		 * wrapping around the end of the log.  Just do it
		 * in a single write.  Use the cycle number of the
		 * current cycle minus one so that the log will look like:
		 *     n ... | n - 1 ...
		 */
		error = xlog_write_log_records(log, (head_cycle - 1),
				head_block, max_distance, tail_cycle,
				tail_block);
		if (error)
			return error;
	} else {
		/*
		 * We need to wrap around the end of the physical log in
		 * order to clear all the blocks.  Do it in two separate
		 * I/Os.  The first write should be from the head to the
		 * end of the physical log, and it should use the current
		 * cycle number minus one just like above.
		 */
		distance = log->l_logBBsize - head_block;
		error = xlog_write_log_records(log, (head_cycle - 1),
				head_block, distance, tail_cycle,
				tail_block);

		if (error)
			return error;

		/*
		 * Now write the blocks at the start of the physical log.
		 * This writes the remainder of the blocks we want to clear.
		 * It uses the current cycle number since we're now on the
		 * same cycle as the head so that we get:
		 *    n ... n ... | n - 1 ...
		 *    ^^^^^ blocks we're writing
		 */
		distance = max_distance - (log->l_logBBsize - head_block);
		error = xlog_write_log_records(log, head_cycle, 0, distance,
				tail_cycle, tail_block);
		if (error)
			return error;
	}

	return 0;
}
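
/*
 * Worked example (hypothetical numbers): a 1000-block log with
 * head_cycle == 5 at head_block == 900 and the tail at cycle 5,
 * block 100 gives tail_distance == 100 + (1000 - 900) == 200.  If
 * max_distance stays at 200, head_block + 200 wraps past the end of
 * the log, so two writes are issued: 100 blocks stamped with cycle 4
 * starting at block 900, then 100 blocks stamped with cycle 5 starting
 * at block 0.
 */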

/******************************************************************************
 *
 *		Log recover routines
 *
 ******************************************************************************
 */

STATIC xlog_recover_t *
xlog_recover_find_tid(
	struct hlist_head	*head,
	xlog_tid_t		tid)
{
	xlog_recover_t		*trans;
	struct hlist_node	*n;

	hlist_for_each_entry(trans, n, head, r_list) {
		if (trans->r_log_tid == tid)
			return trans;
	}
	return NULL;
}

STATIC void
xlog_recover_new_tid(
	struct hlist_head	*head,
	xlog_tid_t		tid,
	xfs_lsn_t		lsn)
{
	xlog_recover_t		*trans;

	trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
	trans->r_log_tid = tid;
	trans->r_lsn = lsn;
	INIT_LIST_HEAD(&trans->r_itemq);

	INIT_HLIST_NODE(&trans->r_list);
	hlist_add_head(&trans->r_list, head);
}

STATIC void
xlog_recover_add_item(
	struct list_head	*head)
{
	xlog_recover_item_t	*item;

	item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
	INIT_LIST_HEAD(&item->ri_list);
	list_add_tail(&item->ri_list, head);
}

STATIC int
xlog_recover_add_to_cont_trans(
	struct log		*log,
	xlog_recover_t		*trans,
	xfs_caddr_t		dp,
	int			len)
{
	xlog_recover_item_t	*item;
	xfs_caddr_t		ptr, old_ptr;
	int			old_len;

	if (list_empty(&trans->r_itemq)) {
		/* finish copying rest of trans header */
		xlog_recover_add_item(&trans->r_itemq);
		ptr = (xfs_caddr_t) &trans->r_theader +
				sizeof(xfs_trans_header_t) - len;
		memcpy(ptr, dp, len); /* d, s, l */
		return 0;
	}
	/* take the tail entry */
	item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);

	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
	old_len = item->ri_buf[item->ri_cnt-1].i_len;

	ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u);
	memcpy(&ptr[old_len], dp, len); /* d, s, l */
	item->ri_buf[item->ri_cnt-1].i_len += len;
	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
	trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
	return 0;
}

/*
 * The next region to add is the start of a new region.  It could be
 * a whole region or it could be the first part of a new region.  Because
 * of this, the assumption here is that the type and size fields of all
 * format structures fit into the first 32 bits of the structure.
 *
 * This works because all regions must be 32 bit aligned.  Therefore, we
 * either have both fields or we have neither field.  In the case we have
 * neither field, the data part of the region is zero length.  We only have
 * a log_op_header and can throw away the header since a new one will appear
 * later.  If we have at least 4 bytes, then we can determine how many regions
 * will appear in the current log item.
 */
STATIC int
xlog_recover_add_to_trans(
	struct log		*log,
	xlog_recover_t		*trans,
	xfs_caddr_t		dp,
	int			len)
{
	xfs_inode_log_format_t	*in_f;			/* any will do */
	xlog_recover_item_t	*item;
	xfs_caddr_t		ptr;

	if (!len)
		return 0;
	if (list_empty(&trans->r_itemq)) {
		/* we need to catch log corruptions here */
		if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
			xlog_warn("XFS: xlog_recover_add_to_trans: "
				  "bad header magic number");
			ASSERT(0);
			return XFS_ERROR(EIO);
		}
		if (len == sizeof(xfs_trans_header_t))
			xlog_recover_add_item(&trans->r_itemq);
		memcpy(&trans->r_theader, dp, len); /* d, s, l */
		return 0;
	}

	ptr = kmem_alloc(len, KM_SLEEP);
	memcpy(ptr, dp, len);
	in_f = (xfs_inode_log_format_t *)ptr;

	/* take the tail entry */
	item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
	if (item->ri_total != 0 &&
	     item->ri_total == item->ri_cnt) {
		/* tail item is in use, get a new one */
		xlog_recover_add_item(&trans->r_itemq);
		item = list_entry(trans->r_itemq.prev,
					xlog_recover_item_t, ri_list);
	}

	if (item->ri_total == 0) {		/* first region to be added */
		if (in_f->ilf_size == 0 ||
		    in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
			xlog_warn(
	"XFS: bad number of regions (%d) in inode log format",
				  in_f->ilf_size);
			ASSERT(0);
			return XFS_ERROR(EIO);
		}

		item->ri_total = in_f->ilf_size;
		item->ri_buf =
			kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
				    KM_SLEEP);
	}
	ASSERT(item->ri_total > item->ri_cnt);
	/* Description region is ri_buf[0] */
	item->ri_buf[item->ri_cnt].i_addr = ptr;
	item->ri_buf[item->ri_cnt].i_len  = len;
	item->ri_cnt++;
	trace_xfs_log_recover_item_add(log, trans, item, 0);
	return 0;
}
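
/*
 * Illustrative sketch (hypothetical): regions accumulate into ri_buf[]
 * in the order they appear in the log.  An inode log item written with
 * ilf_size == 2 ends up as ri_total == 2 with ri_buf[0] holding the
 * xfs_inode_log_format_t description and ri_buf[1] holding the logged
 * inode core; a third region, if logged, would carry fork data.
 */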

/*
 * Sort the log items in the transaction. Cancelled buffers need
 * to be put first so they are processed before any items that might
 * modify the buffers. If they are cancelled, then the modifications
 * don't need to be replayed.
 */
STATIC int
xlog_recover_reorder_trans(
	struct log		*log,
	xlog_recover_t		*trans,
	int			pass)
{
	xlog_recover_item_t	*item, *n;
	LIST_HEAD(sort_list);

	list_splice_init(&trans->r_itemq, &sort_list);
	list_for_each_entry_safe(item, n, &sort_list, ri_list) {
		xfs_buf_log_format_t	*buf_f;

		buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;

		switch (ITEM_TYPE(item)) {
		case XFS_LI_BUF:
			if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) {
				trace_xfs_log_recover_item_reorder_head(log,
							trans, item, pass);
				list_move(&item->ri_list, &trans->r_itemq);
				break;
			}
			/* fall through */
		case XFS_LI_INODE:
		case XFS_LI_DQUOT:
		case XFS_LI_QUOTAOFF:
		case XFS_LI_EFD:
		case XFS_LI_EFI:
			trace_xfs_log_recover_item_reorder_tail(log,
							trans, item, pass);
			list_move_tail(&item->ri_list, &trans->r_itemq);
			break;
		default:
			xlog_warn(
	"XFS: xlog_recover_reorder_trans: unrecognized type of log operation");
			ASSERT(0);
			return XFS_ERROR(EIO);
		}
	}
	ASSERT(list_empty(&sort_list));
	return 0;
}

/*
 * Build up the table of buf cancel records so that we don't replay
 * cancelled data in the second pass.  For buffer records that are
 * not cancel records, there is nothing to do here so we just return.
 *
 * If we get a cancel record which is already in the table, this indicates
 * that the buffer was cancelled multiple times.  In order to ensure
 * that during pass 2 we keep the record in the table until we reach its
 * last occurrence in the log, we keep a reference count in the cancel
 * record in the table to tell us how many times we expect to see this
 * record during the second pass.
 */
STATIC void
xlog_recover_do_buffer_pass1(
	xlog_t			*log,
	xfs_buf_log_format_t	*buf_f)
{
	xfs_buf_cancel_t	*bcp;
	xfs_buf_cancel_t	*nextp;
	xfs_buf_cancel_t	*prevp;
	xfs_buf_cancel_t	**bucket;
	xfs_daddr_t		blkno = 0;
	uint			len = 0;
	ushort			flags = 0;

	switch (buf_f->blf_type) {
	case XFS_LI_BUF:
		blkno = buf_f->blf_blkno;
		len = buf_f->blf_len;
		flags = buf_f->blf_flags;
		break;
	}

	/*
	 * If this isn't a cancel buffer item, then just return.
	 */
	if (!(flags & XFS_BLI_CANCEL)) {
		trace_xfs_log_recover_buf_not_cancel(log, buf_f);
		return;
	}

	/*
	 * Insert an xfs_buf_cancel record into the hash table of
	 * them.  If there is already an identical record, bump
	 * its reference count.
	 */
	bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
					  XLOG_BC_TABLE_SIZE];
	/*
	 * If the hash bucket is empty then just insert a new record into
	 * the bucket.
	 */
	if (*bucket == NULL) {
		bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
						     KM_SLEEP);
		bcp->bc_blkno = blkno;
		bcp->bc_len = len;
		bcp->bc_refcount = 1;
		bcp->bc_next = NULL;
		*bucket = bcp;
		return;
	}

	/*
	 * The hash bucket is not empty, so search for duplicates of our
	 * record.  If we find one, just bump its refcount.  If not
	 * then add us at the end of the list.
	 */
	prevp = NULL;
	nextp = *bucket;
	while (nextp != NULL) {
		if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
			nextp->bc_refcount++;
			trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
			return;
		}
		prevp = nextp;
		nextp = nextp->bc_next;
	}
	ASSERT(prevp != NULL);
	bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
					     KM_SLEEP);
	bcp->bc_blkno = blkno;
	bcp->bc_len = len;
	bcp->bc_refcount = 1;
	bcp->bc_next = NULL;
	prevp->bc_next = bcp;
	trace_xfs_log_recover_buf_cancel_add(log, buf_f);
}
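
/*
 * Worked example (hypothetical numbers): a buffer at blkno 137 that is
 * cancelled twice in the log hashes to bucket 137 % XLOG_BC_TABLE_SIZE
 * and sits there with bc_refcount == 2 after pass 1.  Pass 2 (below)
 * decrements the count at each occurrence of the cancel item and frees
 * the record on the last one, so a later reuse of the same blocks is
 * replayed normally.
 */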
1690
1691/*
1692 * Check to see whether the buffer being recovered has a corresponding
1693 * entry in the buffer cancel record table. If it does then return 1
1694 * so that it will be cancelled, otherwise return 0. If the buffer is
1695 * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement
1696 * the refcount on the entry in the table and remove it from the table
1697 * if this is the last reference.
1698 *
1699 * We remove the cancel record from the table when we encounter its
1700 * last occurrence in the log so that if the same buffer is re-used
1701 * again after its last cancellation we actually replay the changes
1702 * made at that point.
1703 */
1704STATIC int
1705xlog_check_buffer_cancelled(
1706 xlog_t *log,
1707 xfs_daddr_t blkno,
1708 uint len,
1709 ushort flags)
1710{
1711 xfs_buf_cancel_t *bcp;
1712 xfs_buf_cancel_t *prevp;
1713 xfs_buf_cancel_t **bucket;
1714
1715 if (log->l_buf_cancel_table == NULL) {
1716 /*
1717 * There is nothing in the table built in pass one,
1718 * so this buffer must not be cancelled.
1719 */
1720 ASSERT(!(flags & XFS_BLI_CANCEL));
1721 return 0;
1722 }
1723
1724 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1725 XLOG_BC_TABLE_SIZE];
1726 bcp = *bucket;
1727 if (bcp == NULL) {
1728 /*
1729 * There is no corresponding entry in the table built
1730 * in pass one, so this buffer has not been cancelled.
1731 */
1732 ASSERT(!(flags & XFS_BLI_CANCEL));
1733 return 0;
1734 }
1735
1736 /*
1737 * Search for an entry in the buffer cancel table that
1738 * matches our buffer.
1739 */
1740 prevp = NULL;
1741 while (bcp != NULL) {
1742 if (bcp->bc_blkno == blkno && bcp->bc_len == len) {
1743 /*
1744 * We've got a match, so return 1 so that the
1745 * recovery of this buffer is cancelled.
1746 * If this buffer is actually a buffer cancel
1747 * log item, then decrement the refcount on the
1748 * one in the table and remove it if this is the
1749 * last reference.
1750 */
1751 if (flags & XFS_BLI_CANCEL) {
1752 bcp->bc_refcount--;
1753 if (bcp->bc_refcount == 0) {
1754 if (prevp == NULL) {
1755 *bucket = bcp->bc_next;
1756 } else {
1757 prevp->bc_next = bcp->bc_next;
1758 }
1759 kmem_free(bcp);
1760 }
1761 }
1762 return 1;
1763 }
1764 prevp = bcp;
1765 bcp = bcp->bc_next;
1766 }
1767 /*
1768 * We didn't find a corresponding entry in the table, so
1769 * return 0 so that the buffer is NOT cancelled.
1770 */
1771 ASSERT(!(flags & XFS_BLI_CANCEL));
1772 return 0;
1773}
1774
1775STATIC int
1776xlog_recover_do_buffer_pass2(
1777 xlog_t *log,
1778 xfs_buf_log_format_t *buf_f)
1779{
1780 xfs_daddr_t blkno = 0;
1781 ushort flags = 0;
1782 uint len = 0;
1783
1784 switch (buf_f->blf_type) {
1785 case XFS_LI_BUF:
1786 blkno = buf_f->blf_blkno;
1787 flags = buf_f->blf_flags;
1788 len = buf_f->blf_len;
1789 break;
1790 }
1791
1792 return xlog_check_buffer_cancelled(log, blkno, len, flags);
1793}
1794
1795/*
1796 * Perform recovery for a buffer full of inodes. In these buffers,
1797 * the only data which should be recovered is that which corresponds
1798 * to the di_next_unlinked pointers in the on disk inode structures.
1799 * The rest of the data for the inodes is always logged through the
1800 * inodes themselves rather than the inode buffer and is recovered
1801 * in xlog_recover_do_inode_trans().
1802 *
1803 * The only time when buffers full of inodes are fully recovered is
1804 * when the buffer is full of newly allocated inodes. In this case
1805 * the buffer will not be marked as an inode buffer and so will be
1806 * sent to xlog_recover_do_reg_buffer() below during recovery.
1807 */
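/*
 * Offset arithmetic, for illustration (added; inode size hypothetical):
 * with 256-byte inodes, the di_next_unlinked field of the i-th inode
 * in the buffer lives at byte offset
 *	i * 256 + offsetof(xfs_dinode_t, di_next_unlinked),
 * which is what the loop below computes from mp->m_sb.sb_inodesize.
 */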
1808STATIC int
1809xlog_recover_do_inode_buffer(
1810 xfs_mount_t *mp,
1811 xlog_recover_item_t *item,
1812 xfs_buf_t *bp,
1813 xfs_buf_log_format_t *buf_f)
1814{
1815 int i;
1816 int item_index;
1817 int bit;
1818 int nbits;
1819 int reg_buf_offset;
1820 int reg_buf_bytes;
1821 int next_unlinked_offset;
1822 int inodes_per_buf;
1823 xfs_agino_t *logged_nextp;
1824 xfs_agino_t *buffer_nextp;
1825 unsigned int *data_map = NULL;
1826 unsigned int map_size = 0;
1827
1828 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1829
1830 switch (buf_f->blf_type) {
1831 case XFS_LI_BUF:
1832 data_map = buf_f->blf_data_map;
1833 map_size = buf_f->blf_map_size;
1834 break;
1835 }
1836 /*
1837 * Set the variables corresponding to the current region to
1838 * 0 so that we'll initialize them on the first pass through
1839 * the loop.
1840 */
1841 reg_buf_offset = 0;
1842 reg_buf_bytes = 0;
1843 bit = 0;
1844 nbits = 0;
1845 item_index = 0;
1846 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
1847 for (i = 0; i < inodes_per_buf; i++) {
1848 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
1849 offsetof(xfs_dinode_t, di_next_unlinked);
1850
1851 while (next_unlinked_offset >=
1852 (reg_buf_offset + reg_buf_bytes)) {
1853 /*
1854 * The next di_next_unlinked field is beyond
1855 * the current logged region. Find the next
1856 * logged region that contains or is beyond
1857 * the current di_next_unlinked field.
1858 */
1859 bit += nbits;
1860 bit = xfs_next_bit(data_map, map_size, bit);
1861
1862 /*
1863 * If there are no more logged regions in the
1864 * buffer, then we're done.
1865 */
1866 if (bit == -1) {
1867 return 0;
1868 }
1869
1870 nbits = xfs_contig_bits(data_map, map_size,
1871 bit);
1872 ASSERT(nbits > 0);
1873 reg_buf_offset = bit << XFS_BLI_SHIFT;
1874 reg_buf_bytes = nbits << XFS_BLI_SHIFT;
1875 item_index++;
1876 }
1877
1878 /*
1879 * If the current logged region starts after the current
1880 * di_next_unlinked field, then move on to the next
1881 * di_next_unlinked field.
1882 */
1883 if (next_unlinked_offset < reg_buf_offset) {
1884 continue;
1885 }
1886
1887 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1888 ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0);
1889 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
1890
1891 /*
1892 * The current logged region contains a copy of the
1893 * current di_next_unlinked field. Extract its value
1894 * and copy it to the buffer copy.
1895 */
1896 logged_nextp = (xfs_agino_t *)
1897 ((char *)(item->ri_buf[item_index].i_addr) +
1898 (next_unlinked_offset - reg_buf_offset));
1899 if (unlikely(*logged_nextp == 0)) {
1900 xfs_fs_cmn_err(CE_ALERT, mp,
1901 "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field",
1902 item, bp);
1903 XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1904 XFS_ERRLEVEL_LOW, mp);
1905 return XFS_ERROR(EFSCORRUPTED);
1906 }
1907
1908 buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
1909 next_unlinked_offset);
1910 *buffer_nextp = *logged_nextp;
1911 }
1912
1913 return 0;
1914}
1915
1916/*
1917 * Perform a 'normal' buffer recovery. Each logged region of the
1918 * buffer should be copied over the corresponding region in the
1919 * given buffer. The bitmap in the buf log format structure indicates
1920 * where to place the logged data.
1921 */
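/*
 * Region arithmetic, for illustration (added): bit and nbits address
 * XFS_BLI_CHUNK-sized pieces of the buffer, so a run starting at bit 2
 * with nbits == 3 covers bytes
 *	[2 << XFS_BLI_SHIFT, 5 << XFS_BLI_SHIFT)
 * of the buffer -- with the usual 128-byte chunks, bytes 256..639.
 */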
1922/*ARGSUSED*/
1923STATIC void
1924xlog_recover_do_reg_buffer(
1925 struct xfs_mount *mp,
1926 xlog_recover_item_t *item,
1927 xfs_buf_t *bp,
1928 xfs_buf_log_format_t *buf_f)
1929{
1930 int i;
1931 int bit;
1932 int nbits;
1933 unsigned int *data_map = NULL;
1934 unsigned int map_size = 0;
1935 int error;
1936
1937 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1938
1939 switch (buf_f->blf_type) {
1940 case XFS_LI_BUF:
1941 data_map = buf_f->blf_data_map;
1942 map_size = buf_f->blf_map_size;
1943 break;
1944 }
1945 bit = 0;
1946 i = 1; /* 0 is the buf format structure */
1947 while (1) {
1948 bit = xfs_next_bit(data_map, map_size, bit);
1949 if (bit == -1)
1950 break;
1951 nbits = xfs_contig_bits(data_map, map_size, bit);
1952 ASSERT(nbits > 0);
1953 ASSERT(item->ri_buf[i].i_addr != NULL);
1954 ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0);
1955 ASSERT(XFS_BUF_COUNT(bp) >=
1956 ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT));
1957
1958 /*
1959 * Do a sanity check if this is a dquot buffer. Just checking
1960 * the first dquot in the buffer should do. XXX: this is
1961 * probably a good thing to do for other buf types also.
1962 */
1963 error = 0;
1964 if (buf_f->blf_flags &
1965 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
1966 if (item->ri_buf[i].i_addr == NULL) {
1967 cmn_err(CE_ALERT,
1968 "XFS: NULL dquot in %s.", __func__);
1969 goto next;
1970 }
1971 if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
1972 cmn_err(CE_ALERT,
1973 "XFS: dquot too small (%d) in %s.",
1974 item->ri_buf[i].i_len, __func__);
1975 goto next;
1976 }
1977 error = xfs_qm_dqcheck((xfs_disk_dquot_t *)
1978 item->ri_buf[i].i_addr,
1979 -1, 0, XFS_QMOPT_DOWARN,
1980 "dquot_buf_recover");
1981 if (error)
1982 goto next;
1983 }
1984
1985 memcpy(xfs_buf_offset(bp,
1986 (uint)bit << XFS_BLI_SHIFT), /* dest */
1987 item->ri_buf[i].i_addr, /* source */
1988 nbits<<XFS_BLI_SHIFT); /* length */
1989 next:
1990 i++;
1991 bit += nbits;
1992 }
1993
1994 /* Shouldn't be any more regions */
1995 ASSERT(i == item->ri_total);
1996}
1997
1998/*
1999 * Do some primitive error checking on ondisk dquot data structures.
2000 */
2001int
2002xfs_qm_dqcheck(
2003 xfs_disk_dquot_t *ddq,
2004 xfs_dqid_t id,
2005 uint type, /* used only when IO_dorepair is true */
2006 uint flags,
2007 char *str)
2008{
2009 xfs_dqblk_t *d = (xfs_dqblk_t *)ddq;
2010 int errs = 0;
2011
2012 /*
2013 * We can encounter an uninitialized dquot buffer for 2 reasons:
2014 * 1. If we crash while deleting the quotainode(s), and those blks got
2015 * used for user data. This is because we take the path of regular
2016 * file deletion; however, the size field of quotainodes is never
2017 * updated, so all the tricks that we play in itruncate_finish
2018 * don't quite matter.
2019 *
2020 * 2. We don't replay the quota buffers when there's a quotaoff logitem.
2021 * But the allocation will be replayed so we'll end up with an
2022 * uninitialized quota block.
2023 *
2024 * This is all fine; things are still consistent, and we haven't lost
2025 * any quota information. Just don't complain about bad dquot blks.
2026 */
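	/*
	 * Summary of the checks below (added): verify the magic number,
	 * version and dquot type flags, match the id when one is
	 * supplied, and make sure each block/inode/rtblock timer has
	 * been started once its soft limit is exceeded.
	 */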
2027 if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) {
2028 if (flags & XFS_QMOPT_DOWARN)
2029 cmn_err(CE_ALERT,
2030 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
2031 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
2032 errs++;
2033 }
2034 if (ddq->d_version != XFS_DQUOT_VERSION) {
2035 if (flags & XFS_QMOPT_DOWARN)
2036 cmn_err(CE_ALERT,
2037 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
2038 str, id, ddq->d_version, XFS_DQUOT_VERSION);
2039 errs++;
2040 }
2041
2042 if (ddq->d_flags != XFS_DQ_USER &&
2043 ddq->d_flags != XFS_DQ_PROJ &&
2044 ddq->d_flags != XFS_DQ_GROUP) {
2045 if (flags & XFS_QMOPT_DOWARN)
2046 cmn_err(CE_ALERT,
2047 "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
2048 str, id, ddq->d_flags);
2049 errs++;
2050 }
2051
2052 if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
2053 if (flags & XFS_QMOPT_DOWARN)
2054 cmn_err(CE_ALERT,
2055 "%s : ondisk-dquot 0x%p, ID mismatch: "
2056 "0x%x expected, found id 0x%x",
2057 str, ddq, id, be32_to_cpu(ddq->d_id));
2058 errs++;
2059 }
2060
2061 if (!errs && ddq->d_id) {
2062 if (ddq->d_blk_softlimit &&
2063 be64_to_cpu(ddq->d_bcount) >=
2064 be64_to_cpu(ddq->d_blk_softlimit)) {
2065 if (!ddq->d_btimer) {
2066 if (flags & XFS_QMOPT_DOWARN)
2067 cmn_err(CE_ALERT,
2068 "%s : Dquot ID 0x%x (0x%p) "
2069 "BLK TIMER NOT STARTED",
2070 str, (int)be32_to_cpu(ddq->d_id), ddq);
2071 errs++;
2072 }
2073 }
2074 if (ddq->d_ino_softlimit &&
2075 be64_to_cpu(ddq->d_icount) >=
2076 be64_to_cpu(ddq->d_ino_softlimit)) {
2077 if (!ddq->d_itimer) {
2078 if (flags & XFS_QMOPT_DOWARN)
2079 cmn_err(CE_ALERT,
2080 "%s : Dquot ID 0x%x (0x%p) "
2081 "INODE TIMER NOT STARTED",
2082 str, (int)be32_to_cpu(ddq->d_id), ddq);
2083 errs++;
2084 }
2085 }
2086 if (ddq->d_rtb_softlimit &&
2087 be64_to_cpu(ddq->d_rtbcount) >=
2088 be64_to_cpu(ddq->d_rtb_softlimit)) {
2089 if (!ddq->d_rtbtimer) {
2090 if (flags & XFS_QMOPT_DOWARN)
2091 cmn_err(CE_ALERT,
2092 "%s : Dquot ID 0x%x (0x%p) "
2093 "RTBLK TIMER NOT STARTED",
2094 str, (int)be32_to_cpu(ddq->d_id), ddq);
2095 errs++;
2096 }
2097 }
2098 }
2099
2100 if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
2101 return errs;
2102
2103 if (flags & XFS_QMOPT_DOWARN)
2104 cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id);
2105
2106 /*
2107 * Typically, a repair is only requested by quotacheck.
2108 */
2109 ASSERT(id != -1);
2110 ASSERT(flags & XFS_QMOPT_DQREPAIR);
2111 memset(d, 0, sizeof(xfs_dqblk_t));
2112
2113 d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
2114 d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
2115 d->dd_diskdq.d_flags = type;
2116 d->dd_diskdq.d_id = cpu_to_be32(id);
2117
2118 return errs;
2119}
2120
2121/*
2122 * Perform a dquot buffer recovery.
2123 * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
2124 * (i.e. USR or GRP), then just toss this buffer away; don't recover it.
2125 * Else, treat it as a regular buffer and do recovery.
2126 */
2127STATIC void
2128xlog_recover_do_dquot_buffer(
2129 xfs_mount_t *mp,
2130 xlog_t *log,
2131 xlog_recover_item_t *item,
2132 xfs_buf_t *bp,
2133 xfs_buf_log_format_t *buf_f)
2134{
2135 uint type;
2136
2137 trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2138
2139 /*
2140 * Filesystems are required to send in quota flags at mount time.
2141 */
2142 if (mp->m_qflags == 0) {
2143 return;
2144 }
2145
2146 type = 0;
2147 if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF)
2148 type |= XFS_DQ_USER;
2149 if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF)
2150 type |= XFS_DQ_PROJ;
2151 if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF)
2152 type |= XFS_DQ_GROUP;
2153 /*
2154 * This type of quota was turned off, so ignore this buffer
2155 */
2156 if (log->l_quotaoffs_flag & type)
2157 return;
2158
2159 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2160}
2161
2162/*
2163 * This routine replays a modification made to a buffer at runtime.
2164 * There are actually two types of buffer, regular and inode, which
2165 * are handled differently. From inode buffers we only recover a
2166 * specific set of data, namely
2167 * the inode di_next_unlinked fields. This is because all other inode
2168 * data is actually logged via inode records and any data we replay
2169 * here which overlaps that may be stale.
2170 *
2171 * When meta-data buffers are freed at run time we log a buffer item
2172 * with the XFS_BLI_CANCEL bit set to indicate that previous copies
2173 * of the buffer in the log should not be replayed at recovery time.
2174 * This is so that if the blocks covered by the buffer are reused for
2175 * file data before we crash we don't end up replaying old, freed
2176 * meta-data into a user's file.
2177 *
2178 * To handle the cancellation of buffer log items, we make two passes
2179 * over the log during recovery. During the first we build a table of
2180 * those buffers which have been cancelled, and during the second we
2181 * only replay those buffers which do not have corresponding cancel
2182 * records in the table. See xlog_recover_do_buffer_pass[1,2] above
2183 * for more details on the implementation of the table of cancel records.
2184 */
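/*
 * The two passes in outline (added sketch; the real logic is in the
 * routines above):
 *
 *	pass 1: XFS_BLI_CANCEL item -> add or bump an entry in the
 *		buffer cancel table
 *	pass 2: any buffer item     -> look the buffer up in the table
 *		and replay it only if no entry matches
 */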
2185STATIC int
2186xlog_recover_do_buffer_trans(
2187 xlog_t *log,
2188 xlog_recover_item_t *item,
2189 int pass)
2190{
2191 xfs_buf_log_format_t *buf_f;
2192 xfs_mount_t *mp;
2193 xfs_buf_t *bp;
2194 int error;
2195 int cancel;
2196 xfs_daddr_t blkno;
2197 int len;
2198 ushort flags;
2199 uint buf_flags;
2200
2201 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
2202
2203 if (pass == XLOG_RECOVER_PASS1) {
2204 /*
2205 * In this pass we're only looking for buf items
2206 * with the XFS_BLI_CANCEL bit set.
2207 */
2208 xlog_recover_do_buffer_pass1(log, buf_f);
2209 return 0;
2210 } else {
2211 /*
2212 * In this pass we want to recover all the buffers
2213 * which have not been cancelled and are not
2214 * cancellation buffers themselves. The routine
2215 * we call here will tell us whether or not to
2216 * continue with the replay of this buffer.
2217 */
2218 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2219 if (cancel) {
2220 trace_xfs_log_recover_buf_cancel(log, buf_f);
2221 return 0;
2222 }
2223 }
2224 trace_xfs_log_recover_buf_recover(log, buf_f);
2225 switch (buf_f->blf_type) {
2226 case XFS_LI_BUF:
2227 blkno = buf_f->blf_blkno;
2228 len = buf_f->blf_len;
2229 flags = buf_f->blf_flags;
2230 break;
2231 default:
2232 xfs_fs_cmn_err(CE_ALERT, log->l_mp,
2233 "xfs_log_recover: unknown buffer type 0x%x, logdev %s",
2234 buf_f->blf_type, log->l_mp->m_logname ?
2235 log->l_mp->m_logname : "internal");
2236 XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
2237 XFS_ERRLEVEL_LOW, log->l_mp);
2238 return XFS_ERROR(EFSCORRUPTED);
2239 }
2240
2241 mp = log->l_mp;
2242 buf_flags = XBF_LOCK;
2243 if (!(flags & XFS_BLI_INODE_BUF))
2244 buf_flags |= XBF_MAPPED;
2245
2246 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
2247 if (XFS_BUF_ISERROR(bp)) {
2248 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp,
2249 bp, blkno);
2250 error = XFS_BUF_GETERROR(bp);
2251 xfs_buf_relse(bp);
2252 return error;
2253 }
2254
2255 error = 0;
2256 if (flags & XFS_BLI_INODE_BUF) {
2257 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2258 } else if (flags &
2259 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
2260 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2261 } else {
2262 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2263 }
2264 if (error)
2265 return XFS_ERROR(error);
2266
2267 /*
2268 * Perform delayed write on the buffer. Asynchronous writes will be
2269 * slower when taking into account all the buffers to be flushed.
2270 *
2271 * Also make sure that only inode buffers with good sizes stay in
2272 * the buffer cache. The kernel moves inodes in buffers of 1 block
2273 * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode
2274 * buffers in the log can be a different size if the log was generated
2275 * by an older kernel using unclustered inode buffers or a newer kernel
2276 * running with a different inode cluster size. Regardless, if
2277 * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
2278 * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
2279 * the buffer out of the buffer cache so that the buffer won't
2280 * overlap with future reads of those inodes.
2281 */
2282 if (XFS_DINODE_MAGIC ==
2283 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
2284 (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
2285 (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
2286 XFS_BUF_STALE(bp);
2287 error = xfs_bwrite(mp, bp);
2288 } else {
2289 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2290 bp->b_mount = mp;
2291 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2292 xfs_bdwrite(mp, bp);
2293 }
2294
2295 return (error);
2296}
2297
2298STATIC int
2299xlog_recover_do_inode_trans(
2300 xlog_t *log,
2301 xlog_recover_item_t *item,
2302 int pass)
2303{
2304 xfs_inode_log_format_t *in_f;
2305 xfs_mount_t *mp;
2306 xfs_buf_t *bp;
2307 xfs_dinode_t *dip;
2308 xfs_ino_t ino;
2309 int len;
2310 xfs_caddr_t src;
2311 xfs_caddr_t dest;
2312 int error;
2313 int attr_index;
2314 uint fields;
2315 xfs_icdinode_t *dicp;
2316 int need_free = 0;
2317
2318 if (pass == XLOG_RECOVER_PASS1) {
2319 return 0;
2320 }
2321
2322 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2323 in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr;
2324 } else {
2325 in_f = (xfs_inode_log_format_t *)kmem_alloc(
2326 sizeof(xfs_inode_log_format_t), KM_SLEEP);
2327 need_free = 1;
2328 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2329 if (error)
2330 goto error;
2331 }
2332 ino = in_f->ilf_ino;
2333 mp = log->l_mp;
2334
2335 /*
2336 * Inode buffers can be freed, look out for it,
2337 * and do not replay the inode.
2338 */
2339 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2340 in_f->ilf_len, 0)) {
2341 error = 0;
2342 trace_xfs_log_recover_inode_cancel(log, in_f);
2343 goto error;
2344 }
2345 trace_xfs_log_recover_inode_recover(log, in_f);
2346
2347 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
2348 XBF_LOCK);
2349 if (XFS_BUF_ISERROR(bp)) {
2350 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
2351 bp, in_f->ilf_blkno);
2352 error = XFS_BUF_GETERROR(bp);
2353 xfs_buf_relse(bp);
2354 goto error;
2355 }
2356 error = 0;
2357 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2358 dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
2359
2360 /*
2361 * Make sure the place we're flushing out to really looks
2362 * like an inode!
2363 */
2364 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) {
2365 xfs_buf_relse(bp);
2366 xfs_fs_cmn_err(CE_ALERT, mp,
2367 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
2368 dip, bp, ino);
2369 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)",
2370 XFS_ERRLEVEL_LOW, mp);
2371 error = EFSCORRUPTED;
2372 goto error;
2373 }
2374 dicp = (xfs_icdinode_t *)(item->ri_buf[1].i_addr);
2375 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2376 xfs_buf_relse(bp);
2377 xfs_fs_cmn_err(CE_ALERT, mp,
2378 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
2379 item, ino);
2380 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)",
2381 XFS_ERRLEVEL_LOW, mp);
2382 error = EFSCORRUPTED;
2383 goto error;
2384 }
2385
2386 /* Skip replay when the on disk inode is newer than the log one */
2387 if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
2388 /*
2389 * Deal with the wrap case: di_flushiter wraps after
2390 * DI_MAX_FLUSH, so a small logged value can still be newer
2391 */
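		/*
		 * Example (added; values illustrative): on-disk
		 * di_flushiter == DI_MAX_FLUSH with a logged value of,
		 * say, 3 means the counter wrapped in between, so the
		 * logged inode is in fact newer and is still replayed.
		 */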
2392 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
2393 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
2394 /* do nothing */
2395 } else {
2396 xfs_buf_relse(bp);
2397 trace_xfs_log_recover_inode_skip(log, in_f);
2398 error = 0;
2399 goto error;
2400 }
2401 }
2402 /* Take the opportunity to reset the flush iteration count */
2403 dicp->di_flushiter = 0;
2404
2405 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
2406 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2407 (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2408 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)",
2409 XFS_ERRLEVEL_LOW, mp, dicp);
2410 xfs_buf_relse(bp);
2411 xfs_fs_cmn_err(CE_ALERT, mp,
2412 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2413 item, dip, bp, ino);
2414 error = EFSCORRUPTED;
2415 goto error;
2416 }
2417 } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) {
2418 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2419 (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2420 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2421 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)",
2422 XFS_ERRLEVEL_LOW, mp, dicp);
2423 xfs_buf_relse(bp);
2424 xfs_fs_cmn_err(CE_ALERT, mp,
2425 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2426 item, dip, bp, ino);
2427 error = EFSCORRUPTED;
2428 goto error;
2429 }
2430 }
2431 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2432 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)",
2433 XFS_ERRLEVEL_LOW, mp, dicp);
2434 xfs_buf_relse(bp);
2435 xfs_fs_cmn_err(CE_ALERT, mp,
2436 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2437 item, dip, bp, ino,
2438 dicp->di_nextents + dicp->di_anextents,
2439 dicp->di_nblocks);
2440 error = EFSCORRUPTED;
2441 goto error;
2442 }
2443 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2444 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)",
2445 XFS_ERRLEVEL_LOW, mp, dicp);
2446 xfs_buf_relse(bp);
2447 xfs_fs_cmn_err(CE_ALERT, mp,
2448 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
2449 item, dip, bp, ino, dicp->di_forkoff);
2450 error = EFSCORRUPTED;
2451 goto error;
2452 }
2453 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
2454 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
2455 XFS_ERRLEVEL_LOW, mp, dicp);
2456 xfs_buf_relse(bp);
2457 xfs_fs_cmn_err(CE_ALERT, mp,
2458 "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p",
2459 item->ri_buf[1].i_len, item);
2460 error = EFSCORRUPTED;
2461 goto error;
2462 }
2463
2464 /* The core is in in-core format */
2465 xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr);
2466
2467 /* the rest is in on-disk format */
2468 if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
2469 memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode),
2470 item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode),
2471 item->ri_buf[1].i_len - sizeof(struct xfs_icdinode));
2472 }
2473
2474 fields = in_f->ilf_fields;
2475 switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
2476 case XFS_ILOG_DEV:
2477 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
2478 break;
2479 case XFS_ILOG_UUID:
2480 memcpy(XFS_DFORK_DPTR(dip),
2481 &in_f->ilf_u.ilfu_uuid,
2482 sizeof(uuid_t));
2483 break;
2484 }
2485
2486 if (in_f->ilf_size == 2)
2487 goto write_inode_buffer;
2488 len = item->ri_buf[2].i_len;
2489 src = item->ri_buf[2].i_addr;
2490 ASSERT(in_f->ilf_size <= 4);
2491 ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
2492 ASSERT(!(fields & XFS_ILOG_DFORK) ||
2493 (len == in_f->ilf_dsize));
2494
2495 switch (fields & XFS_ILOG_DFORK) {
2496 case XFS_ILOG_DDATA:
2497 case XFS_ILOG_DEXT:
2498 memcpy(XFS_DFORK_DPTR(dip), src, len);
2499 break;
2500
2501 case XFS_ILOG_DBROOT:
2502 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
2503 (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
2504 XFS_DFORK_DSIZE(dip, mp));
2505 break;
2506
2507 default:
2508 /*
2509 * There are no data fork flags set.
2510 */
2511 ASSERT((fields & XFS_ILOG_DFORK) == 0);
2512 break;
2513 }
2514
2515 /*
2516 * If we logged any attribute data, recover it. There may or
2517 * may not have been any other non-core data logged in this
2518 * transaction.
2519 */
2520 if (in_f->ilf_fields & XFS_ILOG_AFORK) {
2521 if (in_f->ilf_fields & XFS_ILOG_DFORK) {
2522 attr_index = 3;
2523 } else {
2524 attr_index = 2;
2525 }
2526 len = item->ri_buf[attr_index].i_len;
2527 src = item->ri_buf[attr_index].i_addr;
2528 ASSERT(len == in_f->ilf_asize);
2529
2530 switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
2531 case XFS_ILOG_ADATA:
2532 case XFS_ILOG_AEXT:
2533 dest = XFS_DFORK_APTR(dip);
2534 ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
2535 memcpy(dest, src, len);
2536 break;
2537
2538 case XFS_ILOG_ABROOT:
2539 dest = XFS_DFORK_APTR(dip);
2540 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
2541 len, (xfs_bmdr_block_t*)dest,
2542 XFS_DFORK_ASIZE(dip, mp));
2543 break;
2544
2545 default:
2546 xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag");
2547 ASSERT(0);
2548 xfs_buf_relse(bp);
2549 error = EIO;
2550 goto error;
2551 }
2552 }
2553
2554write_inode_buffer:
2555 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2556 bp->b_mount = mp;
2557 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2558 xfs_bdwrite(mp, bp);
2559error:
2560 if (need_free)
2561 kmem_free(in_f);
2562 return XFS_ERROR(error);
2563}
2564
2565/*
2566 * Recover QUOTAOFF records. We simply make a note of it in the xlog_t
2567 * structure, so that we know not to do any dquot item or dquot buffer recovery,
2568 * of that type.
2569 */
2570STATIC int
2571xlog_recover_do_quotaoff_trans(
2572 xlog_t *log,
2573 xlog_recover_item_t *item,
2574 int pass)
2575{
2576 xfs_qoff_logformat_t *qoff_f;
2577
2578 if (pass == XLOG_RECOVER_PASS2) {
2579 return (0);
2580 }
2581
2582 qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr;
2583 ASSERT(qoff_f);
2584
2585 /*
2586 * The logitem format's flag tells us if this was user quotaoff,
2587 * group/project quotaoff or both.
2588 */
2589 if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
2590 log->l_quotaoffs_flag |= XFS_DQ_USER;
2591 if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
2592 log->l_quotaoffs_flag |= XFS_DQ_PROJ;
2593 if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
2594 log->l_quotaoffs_flag |= XFS_DQ_GROUP;
2595
2596 return (0);
2597}
2598
2599/*
2600 * Recover a dquot record
2601 */
2602STATIC int
2603xlog_recover_do_dquot_trans(
2604 xlog_t *log,
2605 xlog_recover_item_t *item,
2606 int pass)
2607{
2608 xfs_mount_t *mp;
2609 xfs_buf_t *bp;
2610 struct xfs_disk_dquot *ddq, *recddq;
2611 int error;
2612 xfs_dq_logformat_t *dq_f;
2613 uint type;
2614
2615 if (pass == XLOG_RECOVER_PASS1) {
2616 return 0;
2617 }
2618 mp = log->l_mp;
2619
2620 /*
2621 * Filesystems are required to send in quota flags at mount time.
2622 */
2623 if (mp->m_qflags == 0)
2624 return (0);
2625
2626 recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr;
2627
2628 if (item->ri_buf[1].i_addr == NULL) {
2629 cmn_err(CE_ALERT,
2630 "XFS: NULL dquot in %s.", __func__);
2631 return XFS_ERROR(EIO);
2632 }
2633 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
2634 cmn_err(CE_ALERT,
2635 "XFS: dquot too small (%d) in %s.",
2636 item->ri_buf[1].i_len, __func__);
2637 return XFS_ERROR(EIO);
2638 }
2639
2640 /*
2641 * This type of quota was turned off, so ignore this record.
2642 */
2643 type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
2644 ASSERT(type);
2645 if (log->l_quotaoffs_flag & type)
2646 return (0);
2647
2648 /*
2649 * At this point we know that quota was _not_ turned off.
2650 * Since the mount flags are not indicating to us otherwise, this
2651 * must mean that quota is on, and the dquot needs to be replayed.
2652 * Remember that we may not have fully recovered the superblock yet,
2653 * so we can't do the usual trick of looking at the SB quota bits.
2654 *
2655 * The other possibility, of course, is that the quota subsystem was
2656 * removed since the last mount - ENOSYS.
2657 */
2658 dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr;
2659 ASSERT(dq_f);
2660 if ((error = xfs_qm_dqcheck(recddq,
2661 dq_f->qlf_id,
2662 0, XFS_QMOPT_DOWARN,
2663 "xlog_recover_do_dquot_trans (log copy)"))) {
2664 return XFS_ERROR(EIO);
2665 }
2666 ASSERT(dq_f->qlf_len == 1);
2667
2668 error = xfs_read_buf(mp, mp->m_ddev_targp,
2669 dq_f->qlf_blkno,
2670 XFS_FSB_TO_BB(mp, dq_f->qlf_len),
2671 0, &bp);
2672 if (error) {
2673 xfs_ioerror_alert("xlog_recover_do..(read#3)", mp,
2674 bp, dq_f->qlf_blkno);
2675 return error;
2676 }
2677 ASSERT(bp);
2678 ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
2679
2680 /*
2681 * At least the magic num portion should be on disk because this
2682 * was among a chunk of dquots created earlier, and we did some
2683 * minimal initialization then.
2684 */
2685 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2686 "xlog_recover_do_dquot_trans")) {
2687 xfs_buf_relse(bp);
2688 return XFS_ERROR(EIO);
2689 }
2690
2691 memcpy(ddq, recddq, item->ri_buf[1].i_len);
2692
2693 ASSERT(dq_f->qlf_size == 2);
2694 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2695 bp->b_mount = mp;
2696 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2697 xfs_bdwrite(mp, bp);
2698
2699 return (0);
2700}
2701
2702/*
2703 * This routine is called to create an in-core extent free intent
2704 * item from the efi format structure which was logged on disk.
2705 * It allocates an in-core efi, copies the extents from the format
2706 * structure into it, and adds the efi to the AIL with the given
2707 * LSN.
2708 */
2709 STATIC int
2710xlog_recover_do_efi_trans(
2711 xlog_t *log,
2712 xlog_recover_item_t *item,
2713 xfs_lsn_t lsn,
2714 int pass)
2715{
2716 int error;
2717 xfs_mount_t *mp;
2718 xfs_efi_log_item_t *efip;
2719 xfs_efi_log_format_t *efi_formatp;
2720
2721 if (pass == XLOG_RECOVER_PASS1) {
2722 return 0;
2723 }
2724
2725 efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr;
2726
2727 mp = log->l_mp;
2728 efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2729 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2730 &(efip->efi_format)))) {
2731 xfs_efi_item_free(efip);
2732 return error;
2733 }
2734 efip->efi_next_extent = efi_formatp->efi_nextents;
2735 efip->efi_flags |= XFS_EFI_COMMITTED;
2736
2737 spin_lock(&log->l_ailp->xa_lock);
2738 /*
2739 * xfs_trans_ail_update() drops the AIL lock.
2740 */
2741 xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn);
2742 return 0;
2743}
2744
2745
2746/*
2747 * This routine is called when an efd format structure is found in
2748 * a committed transaction in the log. Its purpose is to cancel
2749 * the corresponding efi if it was still in the log. To do this
2750 * it searches the AIL for the efi with an id equal to that in the
2751 * efd format structure. If we find it, we remove the efi from the
2752 * AIL and free it.
2753 */
2754STATIC void
2755xlog_recover_do_efd_trans(
2756 xlog_t *log,
2757 xlog_recover_item_t *item,
2758 int pass)
2759{
2760 xfs_efd_log_format_t *efd_formatp;
2761 xfs_efi_log_item_t *efip = NULL;
2762 xfs_log_item_t *lip;
2763 __uint64_t efi_id;
2764 struct xfs_ail_cursor cur;
2765 struct xfs_ail *ailp = log->l_ailp;
2766
2767 if (pass == XLOG_RECOVER_PASS1) {
2768 return;
2769 }
2770
2771 efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr;
2772 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2773 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
2774 (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
2775 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
2776 efi_id = efd_formatp->efd_efi_id;
2777
2778 /*
2779 * Search for the efi with the id in the efd format structure
2780 * in the AIL.
2781 */
2782 spin_lock(&ailp->xa_lock);
2783 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2784 while (lip != NULL) {
2785 if (lip->li_type == XFS_LI_EFI) {
2786 efip = (xfs_efi_log_item_t *)lip;
2787 if (efip->efi_format.efi_id == efi_id) {
2788 /*
783a2f65 2789 * xfs_trans_ail_delete() drops the
1da177e4
LT
2790 * AIL lock.
2791 */
2792 xfs_trans_ail_delete(ailp, lip);
2793 xfs_efi_item_free(efip);
2794 spin_lock(&ailp->xa_lock);
2795 break;
2796 }
2797 }
2798 lip = xfs_trans_ail_cursor_next(ailp, &cur);
2799 }
2800 xfs_trans_ail_cursor_done(ailp, &cur);
2801 spin_unlock(&ailp->xa_lock);
1da177e4
LT
2802}
2803
2804/*
2805 * Perform the transaction
2806 *
2807 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2808 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2809 */
2810STATIC int
2811xlog_recover_do_trans(
2812 xlog_t *log,
2813 xlog_recover_t *trans,
2814 int pass)
2815{
2816 int error = 0;
2817 xlog_recover_item_t *item;
2818
2819 error = xlog_recover_reorder_trans(log, trans, pass);
2820 if (error)
2821 return error;
2822
2823 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2824 trace_xfs_log_recover_item_recover(log, trans, item, pass);
2825 switch (ITEM_TYPE(item)) {
2826 case XFS_LI_BUF:
2827 error = xlog_recover_do_buffer_trans(log, item, pass);
2828 break;
2829 case XFS_LI_INODE:
2830 error = xlog_recover_do_inode_trans(log, item, pass);
2831 break;
2832 case XFS_LI_EFI:
2833 error = xlog_recover_do_efi_trans(log, item,
2834 trans->r_lsn, pass);
2835 break;
2836 case XFS_LI_EFD:
2837 xlog_recover_do_efd_trans(log, item, pass);
2838 error = 0;
2839 break;
2840 case XFS_LI_DQUOT:
2841 error = xlog_recover_do_dquot_trans(log, item, pass);
2842 break;
2843 case XFS_LI_QUOTAOFF:
2844 error = xlog_recover_do_quotaoff_trans(log, item,
2845 pass);
2846 break;
2847 default:
2848 xlog_warn(
2849 "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
2850 ASSERT(0);
2851 error = XFS_ERROR(EIO);
2852 break;
2853 }
2854
2855 if (error)
2856 return error;
2857 }
2858
2859 return 0;
2860}
2861
2862/*
2863 * Free up any resources allocated by the transaction
2864 *
2865 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
2866 */
2867STATIC void
2868xlog_recover_free_trans(
2869 xlog_recover_t *trans)
2870{
2871 xlog_recover_item_t *item, *n;
2872 int i;
2873
2874 list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
2875 /* Free the regions in the item. */
2876 list_del(&item->ri_list);
2877 for (i = 0; i < item->ri_cnt; i++)
2878 kmem_free(item->ri_buf[i].i_addr);
2879 /* Free the item itself */
2880 kmem_free(item->ri_buf);
2881 kmem_free(item);
2882 }
2883 /* Free the transaction recover structure */
2884 kmem_free(trans);
2885}
2886
2887STATIC int
2888xlog_recover_commit_trans(
2889 xlog_t *log,
2890 xlog_recover_t *trans,
2891 int pass)
2892{
2893 int error;
2894
2895 hlist_del(&trans->r_list);
2896 if ((error = xlog_recover_do_trans(log, trans, pass)))
2897 return error;
2898 xlog_recover_free_trans(trans); /* no error */
2899 return 0;
2900}
2901
2902STATIC int
2903xlog_recover_unmount_trans(
2904 xlog_recover_t *trans)
2905{
2906 /* Do nothing now */
2907 xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR");
2908 return 0;
2909}
2910
2911/*
2912 * There are two valid states of the r_state field. 0 indicates that the
2913 * transaction structure is in a normal state. We have either seen the
2914 * start of the transaction or the last operation we added was not a partial
2915 * operation. If the last operation we added to the transaction was a
2916 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
2917 *
2918 * NOTE: skip LRs with 0 data length.
2919 */
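/*
 * Record walk in outline (added sketch of the loop below): a record
 * body is a sequence of ops, each an xlog_op_header_t followed by
 * oh_len bytes of payload, so the cursor advances as
 *
 *	ohead = (xlog_op_header_t *)dp;
 *	dp += sizeof(xlog_op_header_t);
 *	...handle the op...
 *	dp += be32_to_cpu(ohead->oh_len);
 *
 * until num_logops ops have been consumed or dp reaches lp.
 */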
2920STATIC int
2921xlog_recover_process_data(
2922 xlog_t *log,
2923 struct hlist_head rhash[],
2924 xlog_rec_header_t *rhead,
2925 xfs_caddr_t dp,
2926 int pass)
2927{
2928 xfs_caddr_t lp;
2929 int num_logops;
2930 xlog_op_header_t *ohead;
2931 xlog_recover_t *trans;
2932 xlog_tid_t tid;
2933 int error;
2934 unsigned long hash;
2935 uint flags;
2936
2937 lp = dp + be32_to_cpu(rhead->h_len);
2938 num_logops = be32_to_cpu(rhead->h_num_logops);
2939
2940 /* check the log format matches our own - else we can't recover */
2941 if (xlog_header_check_recover(log->l_mp, rhead))
2942 return (XFS_ERROR(EIO));
2943
2944 while ((dp < lp) && num_logops) {
2945 ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
2946 ohead = (xlog_op_header_t *)dp;
2947 dp += sizeof(xlog_op_header_t);
2948 if (ohead->oh_clientid != XFS_TRANSACTION &&
2949 ohead->oh_clientid != XFS_LOG) {
2950 xlog_warn(
2951 "XFS: xlog_recover_process_data: bad clientid");
2952 ASSERT(0);
2953 return (XFS_ERROR(EIO));
2954 }
2955 tid = be32_to_cpu(ohead->oh_tid);
2956 hash = XLOG_RHASH(tid);
2957 trans = xlog_recover_find_tid(&rhash[hash], tid);
2958 if (trans == NULL) { /* not found; add new tid */
2959 if (ohead->oh_flags & XLOG_START_TRANS)
2960 xlog_recover_new_tid(&rhash[hash], tid,
2961 be64_to_cpu(rhead->h_lsn));
2962 } else {
2963 if (dp + be32_to_cpu(ohead->oh_len) > lp) {
2964 xlog_warn(
2965 "XFS: xlog_recover_process_data: bad length");
2966 WARN_ON(1);
2967 return (XFS_ERROR(EIO));
2968 }
2969 flags = ohead->oh_flags & ~XLOG_END_TRANS;
2970 if (flags & XLOG_WAS_CONT_TRANS)
2971 flags &= ~XLOG_CONTINUE_TRANS;
2972 switch (flags) {
2973 case XLOG_COMMIT_TRANS:
2974 error = xlog_recover_commit_trans(log,
2975 trans, pass);
2976 break;
2977 case XLOG_UNMOUNT_TRANS:
2978 error = xlog_recover_unmount_trans(trans);
2979 break;
2980 case XLOG_WAS_CONT_TRANS:
2981 error = xlog_recover_add_to_cont_trans(log,
2982 trans, dp,
2983 be32_to_cpu(ohead->oh_len));
2984 break;
2985 case XLOG_START_TRANS:
2986 xlog_warn(
2987 "XFS: xlog_recover_process_data: bad transaction");
2988 ASSERT(0);
2989 error = XFS_ERROR(EIO);
2990 break;
2991 case 0:
2992 case XLOG_CONTINUE_TRANS:
2993 error = xlog_recover_add_to_trans(log, trans,
2994 dp, be32_to_cpu(ohead->oh_len));
2995 break;
2996 default:
2997 xlog_warn(
2998 "XFS: xlog_recover_process_data: bad flag");
2999 ASSERT(0);
3000 error = XFS_ERROR(EIO);
3001 break;
3002 }
3003 if (error)
3004 return error;
3005 }
3006 dp += be32_to_cpu(ohead->oh_len);
3007 num_logops--;
3008 }
3009 return 0;
3010}
3011
3012/*
3013 * Process an extent free intent item that was recovered from
3014 * the log. We need to free the extents that it describes.
3015 */
3016 STATIC int
3017xlog_recover_process_efi(
3018 xfs_mount_t *mp,
3019 xfs_efi_log_item_t *efip)
3020{
3021 xfs_efd_log_item_t *efdp;
3022 xfs_trans_t *tp;
3023 int i;
3024 int error = 0;
3025 xfs_extent_t *extp;
3026 xfs_fsblock_t startblock_fsb;
3027
3028 ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED));
3029
3030 /*
3031 * First check the validity of the extents described by the
3032 * EFI. If any are bad, then assume that all are bad and
3033 * just toss the EFI.
3034 */
3035 for (i = 0; i < efip->efi_format.efi_nextents; i++) {
3036 extp = &(efip->efi_format.efi_extents[i]);
3037 startblock_fsb = XFS_BB_TO_FSB(mp,
3038 XFS_FSB_TO_DADDR(mp, extp->ext_start));
3039 if ((startblock_fsb == 0) ||
3040 (extp->ext_len == 0) ||
3041 (startblock_fsb >= mp->m_sb.sb_dblocks) ||
3042 (extp->ext_len >= mp->m_sb.sb_agblocks)) {
3043 /*
3044 * This will pull the EFI from the AIL and
3045 * free the memory associated with it.
3046 */
3047 xfs_efi_release(efip, efip->efi_format.efi_nextents);
3048 return XFS_ERROR(EIO);
3049 }
3050 }
3051
3052 tp = xfs_trans_alloc(mp, 0);
3053 error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
3054 if (error)
3055 goto abort_error;
3056 efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
3057
3058 for (i = 0; i < efip->efi_format.efi_nextents; i++) {
3059 extp = &(efip->efi_format.efi_extents[i]);
3060 error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
3061 if (error)
3062 goto abort_error;
3063 xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
3064 extp->ext_len);
3065 }
3066
3067 efip->efi_flags |= XFS_EFI_RECOVERED;
3068 error = xfs_trans_commit(tp, 0);
3069 return error;
3070
3071abort_error:
3072 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3073 return error;
3074}
3075
3076/*
3077 * When this is called, all of the EFIs which did not have
3078 * corresponding EFDs should be in the AIL. What we do now
3079 * is free the extents associated with each one.
3080 *
3081 * Since we process the EFIs in normal transactions, they
3082 * will be removed at some point after the commit. This prevents
3083 * us from just walking down the list processing each one.
3084 * We'll use a flag in the EFI to skip those that we've already
3085 * processed and use the AIL iteration mechanism's generation
3086 * count to try to speed this up at least a bit.
3087 *
3088 * When we start, we know that the EFIs are the only things in
3089 * the AIL. As we process them, however, other items are added
3090 * to the AIL. Since everything added to the AIL must come after
3091 * everything already in the AIL, we stop processing as soon as
3092 * we see something other than an EFI in the AIL.
3093 */
3094 STATIC int
3095xlog_recover_process_efis(
3096 xlog_t *log)
3097{
3098 xfs_log_item_t *lip;
3099 xfs_efi_log_item_t *efip;
3100 int error = 0;
3101 struct xfs_ail_cursor cur;
3102 struct xfs_ail *ailp;
3103
3104 ailp = log->l_ailp;
3105 spin_lock(&ailp->xa_lock);
3106 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3107 while (lip != NULL) {
3108 /*
3109 * We're done when we see something other than an EFI.
3110 * There should be no EFIs left in the AIL now.
3111 */
3112 if (lip->li_type != XFS_LI_EFI) {
3113#ifdef DEBUG
3114 for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
3115 ASSERT(lip->li_type != XFS_LI_EFI);
3116#endif
3117 break;
3118 }
3119
3120 /*
3121 * Skip EFIs that we've already processed.
3122 */
3123 efip = (xfs_efi_log_item_t *)lip;
3124 if (efip->efi_flags & XFS_EFI_RECOVERED) {
3125 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3126 continue;
3127 }
3128
3129 spin_unlock(&ailp->xa_lock);
3130 error = xlog_recover_process_efi(log->l_mp, efip);
3131 spin_lock(&ailp->xa_lock);
3132 if (error)
3133 goto out;
3134 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3135 }
3136out:
3137 xfs_trans_ail_cursor_done(ailp, &cur);
3138 spin_unlock(&ailp->xa_lock);
3139 return error;
3140}
3141
3142/*
3143 * This routine performs a transaction to null out a bad inode pointer
3144 * in an agi unlinked inode hash bucket.
3145 */
3146STATIC void
3147xlog_recover_clear_agi_bucket(
3148 xfs_mount_t *mp,
3149 xfs_agnumber_t agno,
3150 int bucket)
3151{
3152 xfs_trans_t *tp;
3153 xfs_agi_t *agi;
3154 xfs_buf_t *agibp;
3155 int offset;
3156 int error;
3157
3158 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
3159 error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
3160 0, 0, 0);
3161 if (error)
3162 goto out_abort;
3163
3164 error = xfs_read_agi(mp, tp, agno, &agibp);
3165 if (error)
3166 goto out_abort;
3167
3168 agi = XFS_BUF_TO_AGI(agibp);
3169 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
3170 offset = offsetof(xfs_agi_t, agi_unlinked) +
3171 (sizeof(xfs_agino_t) * bucket);
3172 xfs_trans_log_buf(tp, agibp, offset,
3173 (offset + sizeof(xfs_agino_t) - 1));
3174
3175 error = xfs_trans_commit(tp, 0);
3176 if (error)
3177 goto out_error;
3178 return;
3179
3180out_abort:
3181 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3182out_error:
3183 xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: "
3184 "failed to clear agi %d. Continuing.", agno);
3185 return;
3186}
3187
3188STATIC xfs_agino_t
3189xlog_recover_process_one_iunlink(
3190 struct xfs_mount *mp,
3191 xfs_agnumber_t agno,
3192 xfs_agino_t agino,
3193 int bucket)
3194{
3195 struct xfs_buf *ibp;
3196 struct xfs_dinode *dip;
3197 struct xfs_inode *ip;
3198 xfs_ino_t ino;
3199 int error;
3200
3201 ino = XFS_AGINO_TO_INO(mp, agno, agino);
3202 error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0);
3203 if (error)
3204 goto fail;
3205
3206 /*
3207 * Get the on disk inode to find the next inode in the bucket.
3208 */
3209 error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK);
3210 if (error)
3211 goto fail_iput;
3212
3213 ASSERT(ip->i_d.di_nlink == 0);
3214 ASSERT(ip->i_d.di_mode != 0);
3215
3216 /* setup for the next pass */
3217 agino = be32_to_cpu(dip->di_next_unlinked);
3218 xfs_buf_relse(ibp);
3219
3220 /*
3221 * Prevent any DMAPI event from being sent when the reference on
3222 * the inode is dropped.
3223 */
3224 ip->i_d.di_dmevmask = 0;
3225
3226 IRELE(ip);
3227 return agino;
3228
3229 fail_iput:
3230 IRELE(ip);
3231 fail:
3232 /*
3233 * We can't read in the inode this bucket points to, or this inode
3234 * is messed up. Just ditch this bucket of inodes. We will lose
3235 * some inodes and space, but at least we won't hang.
3236 *
3237 * Call xlog_recover_clear_agi_bucket() to perform a transaction to
3238 * clear the inode pointer in the bucket.
3239 */
3240 xlog_recover_clear_agi_bucket(mp, agno, bucket);
3241 return NULLAGINO;
3242}
3243
3244/*
3245 * xlog_iunlink_recover
3246 *
3247 * This is called during recovery to process any inodes which
3248 * we unlinked but not freed when the system crashed. These
3249 * inodes will be on the lists in the AGI blocks. What we do
3250 * here is scan all the AGIs and fully truncate and free any
3251 * inodes found on the lists. Each inode is removed from the
3252 * lists when it has been fully truncated and is freed. The
3253 * freeing of the inode and its removal from the list must be
3254 * atomic.
3255 */
3256 STATIC void
3257xlog_recover_process_iunlinks(
3258 xlog_t *log)
3259{
3260 xfs_mount_t *mp;
3261 xfs_agnumber_t agno;
3262 xfs_agi_t *agi;
3263 xfs_buf_t *agibp;
3264 xfs_agino_t agino;
3265 int bucket;
3266 int error;
3267 uint mp_dmevmask;
3268
3269 mp = log->l_mp;
3270
3271 /*
3272 * Prevent any DMAPI event from being sent while in this function.
3273 */
3274 mp_dmevmask = mp->m_dmevmask;
3275 mp->m_dmevmask = 0;
3276
3277 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3278 /*
3279 * Find the agi for this ag.
3280 */
3281 error = xfs_read_agi(mp, NULL, agno, &agibp);
3282 if (error) {
3283 /*
3284 * AGI is b0rked. Don't process it.
3285 *
3286 * We should probably mark the filesystem as corrupt
3287 * after we've recovered all the ag's we can....
3288 */
3289 continue;
3290 }
3291 agi = XFS_BUF_TO_AGI(agibp);
3292
3293 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
3294 agino = be32_to_cpu(agi->agi_unlinked[bucket]);
3295 while (agino != NULLAGINO) {
3296 /*
3297 * Release the agi buffer so that it can
3298 * be acquired in the normal course of the
3299 * transaction to truncate and free the inode.
3300 */
3301 xfs_buf_relse(agibp);
3302
3303 agino = xlog_recover_process_one_iunlink(mp,
3304 agno, agino, bucket);
3305
3306 /*
3307 * Reacquire the AGI buffer and continue around
3308 * the loop. This should never fail as we know
3309 * the buffer was good earlier on.
3310 */
3311 error = xfs_read_agi(mp, NULL, agno, &agibp);
3312 ASSERT(error == 0);
3313 agi = XFS_BUF_TO_AGI(agibp);
3314 }
3315 }
3316
3317 /*
3318 * Release the buffer for the current agi so we can
3319 * go on to the next one.
3320 */
3321 xfs_buf_relse(agibp);
3322 }
3323
3324 mp->m_dmevmask = mp_dmevmask;
3325}
3326
3327
3328#ifdef DEBUG
3329STATIC void
3330xlog_pack_data_checksum(
3331 xlog_t *log,
3332 xlog_in_core_t *iclog,
3333 int size)
3334{
3335 int i;
3336 __be32 *up;
3337 uint chksum = 0;
3338
3339 up = (__be32 *)iclog->ic_datap;
3340 /* divide length by 4 to get # words */
3341 for (i = 0; i < (size >> 2); i++) {
3342 chksum ^= be32_to_cpu(*up);
3343 up++;
3344 }
3345 iclog->ic_header.h_chksum = cpu_to_be32(chksum);
3346}
3347#else
3348#define xlog_pack_data_checksum(log, iclog, size)
3349#endif
3350
3351/*
3352 * Stamp cycle number in every block
3353 */
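/*
 * What the stamping buys us (added note): overwriting the first
 * __be32 of every basic block with the record's cycle number lets
 * the head/tail search tell old blocks from newly written ones; the
 * overwritten words are stashed in h_cycle_data (and, for v2 logs,
 * in the extended headers) so xlog_unpack_data() can restore them
 * at recovery time.
 */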
3354void
3355xlog_pack_data(
3356 xlog_t *log,
3357 xlog_in_core_t *iclog,
3358 int roundoff)
3359{
3360 int i, j, k;
3361 int size = iclog->ic_offset + roundoff;
3362 __be32 cycle_lsn;
3363 xfs_caddr_t dp;
3364
3365 xlog_pack_data_checksum(log, iclog, size);
3366
3367 cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
3368
3369 dp = iclog->ic_datap;
3370 for (i = 0; i < BTOBB(size) &&
3371 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3372 iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
3373 *(__be32 *)dp = cycle_lsn;
3374 dp += BBSIZE;
3375 }
3376
3377 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3378 xlog_in_core_2_t *xhdr = iclog->ic_data;
3379
3380 for ( ; i < BTOBB(size); i++) {
3381 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3382 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3383 xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
3384 *(__be32 *)dp = cycle_lsn;
3385 dp += BBSIZE;
3386 }
3387
3388 for (i = 1; i < log->l_iclog_heads; i++) {
3389 xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
3390 }
3391 }
3392}
3393
3394STATIC void
3395xlog_unpack_data(
3396 xlog_rec_header_t *rhead,
3397 xfs_caddr_t dp,
3398 xlog_t *log)
3399{
3400 int i, j, k;
3401
3402 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
3403 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3404 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
3405 dp += BBSIZE;
3406 }
3407
3408 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3409 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
3410 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
3411 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3412 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3413 *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
3414 dp += BBSIZE;
3415 }
3416 }
3417}
3418
3419STATIC int
3420xlog_valid_rec_header(
3421 xlog_t *log,
3422 xlog_rec_header_t *rhead,
3423 xfs_daddr_t blkno)
3424{
3425 int hlen;
3426
3427 if (unlikely(be32_to_cpu(rhead->h_magicno) != XLOG_HEADER_MAGIC_NUM)) {
3428 XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
3429 XFS_ERRLEVEL_LOW, log->l_mp);
3430 return XFS_ERROR(EFSCORRUPTED);
3431 }
3432 if (unlikely(
3433 (!rhead->h_version ||
3434 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
3435 xlog_warn("XFS: %s: unrecognised log version (%d).",
3436 __func__, be32_to_cpu(rhead->h_version));
3437 return XFS_ERROR(EIO);
3438 }
3439
3440 /* LR body must have data or it wouldn't have been written */
3441 hlen = be32_to_cpu(rhead->h_len);
3442 if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
3443 XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
3444 XFS_ERRLEVEL_LOW, log->l_mp);
3445 return XFS_ERROR(EFSCORRUPTED);
3446 }
3447 if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
3448 XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
3449 XFS_ERRLEVEL_LOW, log->l_mp);
3450 return XFS_ERROR(EFSCORRUPTED);
3451 }
3452 return 0;
3453}
3454
3455/*
3456 * Read the log from tail to head and process the log records found.
3457 * Handle the two cases where the tail and head are in the same cycle
3458 * and where the active portion of the log wraps around the end of
3459 * the physical log separately. The pass parameter is passed through
3460 * to the routines called to process the data and is not looked at
3461 * here.
3462 */
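/*
 * Example (added; numbers hypothetical): with l_logBBsize == 1000,
 * tail_blk == 900 and head_blk == 100 the active log wraps, so the
 * second branch below reads blocks 900..999 and then 0..99, taking
 * care of records whose header or body straddles the physical end.
 */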
3463STATIC int
3464xlog_do_recovery_pass(
3465 xlog_t *log,
3466 xfs_daddr_t head_blk,
3467 xfs_daddr_t tail_blk,
3468 int pass)
3469{
3470 xlog_rec_header_t *rhead;
3471 xfs_daddr_t blk_no;
3472 xfs_caddr_t offset;
3473 xfs_buf_t *hbp, *dbp;
3474 int error = 0, h_size;
3475 int bblks, split_bblks;
3476 int hblks, split_hblks, wrapped_hblks;
3477 struct hlist_head rhash[XLOG_RHASH_SIZE];
3478
3479 ASSERT(head_blk != tail_blk);
3480
3481 /*
3482 * Read the header of the tail block and get the iclog buffer size from
3483 * h_size. Use this to tell how many sectors make up the log header.
3484 */
3485 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3486 /*
3487 * When using variable length iclogs, read first sector of
3488 * iclog header and extract the header size from it. Get a
3489 * new hbp that is the correct size.
3490 */
3491 hbp = xlog_get_bp(log, 1);
3492 if (!hbp)
3493 return ENOMEM;
3494
3495 error = xlog_bread(log, tail_blk, 1, hbp, &offset);
3496 if (error)
3497 goto bread_err1;
3498
3499 rhead = (xlog_rec_header_t *)offset;
3500 error = xlog_valid_rec_header(log, rhead, tail_blk);
3501 if (error)
3502 goto bread_err1;
3503 h_size = be32_to_cpu(rhead->h_size);
3504 if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
3505 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
3506 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
3507 if (h_size % XLOG_HEADER_CYCLE_SIZE)
3508 hblks++;
3509 xlog_put_bp(hbp);
3510 hbp = xlog_get_bp(log, hblks);
3511 } else {
3512 hblks = 1;
3513 }
3514 } else {
3515 ASSERT(log->l_sectBBsize == 1);
3516 hblks = 1;
3517 hbp = xlog_get_bp(log, 1);
3518 h_size = XLOG_BIG_RECORD_BSIZE;
3519 }
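	/*
	 * hblks now counts the header blocks per log record: each header
	 * block carries cycle data for XLOG_HEADER_CYCLE_SIZE (32k) bytes
	 * of record payload, so e.g. a 256k iclog needs 256k / 32k == 8
	 * header blocks.
	 */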

	if (!hbp)
		return ENOMEM;
	dbp = xlog_get_bp(log, BTOBB(h_size));
	if (!dbp) {
		xlog_put_bp(hbp);
		return ENOMEM;
	}

	memset(rhash, 0, sizeof(rhash));
	if (tail_blk <= head_blk) {
		for (blk_no = tail_blk; blk_no < head_blk; ) {
			error = xlog_bread(log, blk_no, hblks, hbp, &offset);
			if (error)
				goto bread_err2;

			rhead = (xlog_rec_header_t *)offset;
			error = xlog_valid_rec_header(log, rhead, blk_no);
			if (error)
				goto bread_err2;

			/* blocks in data section */
			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
			error = xlog_bread(log, blk_no + hblks, bblks, dbp,
					   &offset);
			if (error)
				goto bread_err2;

			xlog_unpack_data(rhead, offset, log);
			if ((error = xlog_recover_process_data(log,
						rhash, rhead, offset, pass)))
				goto bread_err2;
			blk_no += bblks + hblks;
		}
	} else {
		/*
		 * Perform recovery around the end of the physical log.
		 * When the head is not on the same cycle number as the tail,
		 * we can't do a sequential recovery as above.
		 */
		blk_no = tail_blk;
		while (blk_no < log->l_logBBsize) {
			/*
			 * Check for header wrapping around physical end-of-log
			 */
			offset = XFS_BUF_PTR(hbp);
			split_hblks = 0;
			wrapped_hblks = 0;
			if (blk_no + hblks <= log->l_logBBsize) {
				/* Read header in one read */
				error = xlog_bread(log, blk_no, hblks, hbp,
						   &offset);
				if (error)
					goto bread_err2;
			} else {
				/* This LR is split across physical log end */
				if (blk_no != log->l_logBBsize) {
					/* some data before physical log end */
					ASSERT(blk_no <= INT_MAX);
					split_hblks =
						log->l_logBBsize - (int)blk_no;
					ASSERT(split_hblks > 0);
					error = xlog_bread(log, blk_no,
							   split_hblks, hbp,
							   &offset);
					if (error)
						goto bread_err2;
				}

				/*
				 * Note: this black magic still works with
				 * large sector sizes (non-512) only because:
				 * - we increased the buffer size originally
				 *   by 1 sector giving us enough extra space
				 *   for the second read;
				 * - the log start is guaranteed to be sector
				 *   aligned;
				 * - we read the log end (LR header start)
				 *   _first_, then the log start (LR header end)
				 *   - order is important.
				 */
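				/*
				 * Pictorially, for a header that wraps:
				 *
				 *  disk: [ hdr end | ......... | hdr start ]
				 *        0      wrapped_hblks     blk_no
				 *
				 *  hbp:  [ hdr start   | hdr end       ]
				 *          split_hblks   wrapped_hblks
				 *
				 * so the read above plus the wrapped read
				 * below leave one contiguous record header
				 * in hbp.
				 */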
				wrapped_hblks = hblks - split_hblks;
				error = XFS_BUF_SET_PTR(hbp,
						offset + BBTOB(split_hblks),
						BBTOB(hblks - split_hblks));
				if (error)
					goto bread_err2;

				error = xlog_bread_noalign(log, 0,
							   wrapped_hblks, hbp);
				if (error)
					goto bread_err2;

				error = XFS_BUF_SET_PTR(hbp, offset,
							BBTOB(hblks));
				if (error)
					goto bread_err2;
			}
			rhead = (xlog_rec_header_t *)offset;
			error = xlog_valid_rec_header(log, rhead,
						split_hblks ? blk_no : 0);
			if (error)
				goto bread_err2;

			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
			blk_no += hblks;

			/* Read in data for log record */
			if (blk_no + bblks <= log->l_logBBsize) {
				error = xlog_bread(log, blk_no, bblks, dbp,
						   &offset);
				if (error)
					goto bread_err2;
			} else {
				/* This log record is split across the
				 * physical end of log */
				offset = XFS_BUF_PTR(dbp);
				split_bblks = 0;
				if (blk_no != log->l_logBBsize) {
					/* some data is before the physical
					 * end of log */
					ASSERT(!wrapped_hblks);
					ASSERT(blk_no <= INT_MAX);
					split_bblks =
						log->l_logBBsize - (int)blk_no;
					ASSERT(split_bblks > 0);
					error = xlog_bread(log, blk_no,
							   split_bblks, dbp,
							   &offset);
					if (error)
						goto bread_err2;
				}

				/*
				 * Note: this black magic still works with
				 * large sector sizes (non-512) only because:
				 * - we increased the buffer size originally
				 *   by 1 sector giving us enough extra space
				 *   for the second read;
				 * - the log start is guaranteed to be sector
				 *   aligned;
				 * - we read the log end (LR header start)
				 *   _first_, then the log start (LR header end)
				 *   - order is important.
				 */
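				/*
				 * Same two-step reassembly as for the record
				 * header above, with split_bblks standing in
				 * for split_hblks.  Note that the wrapped
				 * part of the data starts at block
				 * wrapped_hblks, just past any header blocks
				 * that wrapped to the front of the log.
				 */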
				error = XFS_BUF_SET_PTR(dbp,
						offset + BBTOB(split_bblks),
						BBTOB(bblks - split_bblks));
				if (error)
					goto bread_err2;

				error = xlog_bread_noalign(log, wrapped_hblks,
							   bblks - split_bblks,
							   dbp);
				if (error)
					goto bread_err2;

				error = XFS_BUF_SET_PTR(dbp, offset, h_size);
				if (error)
					goto bread_err2;
			}
			xlog_unpack_data(rhead, offset, log);
			if ((error = xlog_recover_process_data(log, rhash,
							rhead, offset, pass)))
				goto bread_err2;
			blk_no += bblks;
		}

		ASSERT(blk_no >= log->l_logBBsize);
		blk_no -= log->l_logBBsize;

		/* read first part of physical log */
		while (blk_no < head_blk) {
			error = xlog_bread(log, blk_no, hblks, hbp, &offset);
			if (error)
				goto bread_err2;

			rhead = (xlog_rec_header_t *)offset;
			error = xlog_valid_rec_header(log, rhead, blk_no);
			if (error)
				goto bread_err2;

			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
			error = xlog_bread(log, blk_no+hblks, bblks, dbp,
					   &offset);
			if (error)
				goto bread_err2;

			xlog_unpack_data(rhead, offset, log);
			if ((error = xlog_recover_process_data(log, rhash,
							rhead, offset, pass)))
				goto bread_err2;
			blk_no += bblks + hblks;
		}
	}

 bread_err2:
	xlog_put_bp(dbp);
 bread_err1:
	xlog_put_bp(hbp);
	return error;
}

/*
 * Do the recovery of the log.  We actually do this in two phases.
 * The two passes are necessary in order to implement the function
 * of cancelling a record written into the log.  The first pass
 * determines those things which have been cancelled, and the
 * second pass replays log items normally except for those which
 * have been cancelled.  The handling of the replay and cancellations
 * takes place in the log item type specific routines.
 *
 * The table of items which have cancel records in the log is allocated
 * and freed at this level, since only here do we know when all of
 * the log recovery has been completed.
 */
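/*
 * The cancel table is a small hash table of xfs_buf_cancel_t lists,
 * keyed by buffer block number: pass 1 only populates it, and pass 2
 * looks buffers up in it and skips replay of anything that was later
 * cancelled in the log.
 */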
STATIC int
xlog_do_log_recovery(
	xlog_t		*log,
	xfs_daddr_t	head_blk,
	xfs_daddr_t	tail_blk)
{
	int		error;

	ASSERT(head_blk != tail_blk);

	/*
	 * First do a pass to find all of the cancelled buf log items.
	 * Store them in the buf_cancel_table for use in the second pass.
	 */
	log->l_buf_cancel_table =
		(xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE *
						 sizeof(xfs_buf_cancel_t*),
						 KM_SLEEP);
	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
				      XLOG_RECOVER_PASS1);
	if (error != 0) {
		kmem_free(log->l_buf_cancel_table);
		log->l_buf_cancel_table = NULL;
		return error;
	}
	/*
	 * Then do a second pass to actually recover the items in the log.
	 * When it is complete free the table of buf cancel items.
	 */
	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
				      XLOG_RECOVER_PASS2);
#ifdef DEBUG
	if (!error) {
		int	i;

		for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
			ASSERT(log->l_buf_cancel_table[i] == NULL);
	}
#endif	/* DEBUG */

	kmem_free(log->l_buf_cancel_table);
	log->l_buf_cancel_table = NULL;

	return error;
}

/*
 * Do the actual recovery
 */
STATIC int
xlog_do_recover(
	xlog_t		*log,
	xfs_daddr_t	head_blk,
	xfs_daddr_t	tail_blk)
{
	int		error;
	xfs_buf_t	*bp;
	xfs_sb_t	*sbp;

	/*
	 * First replay the images in the log.
	 */
	error = xlog_do_log_recovery(log, head_blk, tail_blk);
	if (error) {
		return error;
	}

	XFS_bflush(log->l_mp->m_ddev_targp);

	/*
	 * If IO errors happened during recovery, bail out.
	 */
	if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
		return (EIO);
	}

	/*
	 * We now update the tail_lsn since much of the recovery has completed
	 * and there may be space available to use.  If there were no extent
	 * frees or iunlinks, we can free up the entire log and set the
	 * tail_lsn to be the last_sync_lsn.  This was set in xlog_find_tail
	 * to be the lsn of the last known good LR on disk.  If there are
	 * extent frees or iunlinks they will have some entries in the AIL;
	 * so we look at the AIL to determine how to set the tail_lsn.
	 */
	xlog_assign_tail_lsn(log->l_mp);

	/*
	 * Now that we've finished replaying all buffer and inode
	 * updates, re-read in the superblock.
	 */
	bp = xfs_getsb(log->l_mp, 0);
	XFS_BUF_UNDONE(bp);
	ASSERT(!(XFS_BUF_ISWRITE(bp)));
	ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
	XFS_BUF_READ(bp);
	XFS_BUF_UNASYNC(bp);
	xfsbdstrat(log->l_mp, bp);
	error = xfs_iowait(bp);
	if (error) {
		xfs_ioerror_alert("xlog_do_recover",
				  log->l_mp, bp, XFS_BUF_ADDR(bp));
		ASSERT(0);
		xfs_buf_relse(bp);
		return error;
	}

	/* Convert superblock from on-disk format */
	sbp = &log->l_mp->m_sb;
	xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
	ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
	ASSERT(xfs_sb_good_version(sbp));
	xfs_buf_relse(bp);

	/* We've re-read the superblock so re-initialize per-cpu counters */
	xfs_icsb_reinit_counters(log->l_mp);

	xlog_recover_check_summary(log);

	/* Normal transactions can now occur */
	log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
	return 0;
}

/*
 * Perform recovery and re-initialize some log variables in xlog_find_tail.
 *
 * Return error or zero.
 */
int
xlog_recover(
	xlog_t		*log)
{
	xfs_daddr_t	head_blk, tail_blk;
	int		error;

	/* find the tail of the log */
	if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
		return error;

	if (tail_blk != head_blk) {
		/* There used to be a comment here:
		 *
		 * disallow recovery on read-only mounts.  note -- mount
		 * checks for ENOSPC and turns it into an intelligent
		 * error message.
		 * ...but this is no longer true.  Now, unless you specify
		 * NORECOVERY (in which case this function would never be
		 * called), we just go ahead and recover.  We do this all
		 * under the vfs layer, so we can get away with it unless
		 * the device itself is read-only, in which case we fail.
		 */
		if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
			return error;
		}

		cmn_err(CE_NOTE,
			"Starting XFS recovery on filesystem: %s (logdev: %s)",
			log->l_mp->m_fsname, log->l_mp->m_logname ?
			log->l_mp->m_logname : "internal");

		error = xlog_do_recover(log, head_blk, tail_blk);
		log->l_flags |= XLOG_RECOVERY_NEEDED;
	}
	return error;
}

/*
 * In the first part of recovery we replay inodes and buffers and build
 * up the list of extent free items which need to be processed.  Here
 * we process the extent free items and clean up the on disk unlinked
 * inode lists.  This is separated from the first part of recovery so
 * that the root and real-time bitmap inodes can be read in from disk in
 * between the two stages.  This is necessary so that we can free space
 * in the real-time portion of the file system.
 */
int
xlog_recover_finish(
	xlog_t	*log)
{
	/*
	 * Now we're ready to do the transactions needed for the
	 * rest of recovery.  Start with completing all the extent
	 * free intent records and then process the unlinked inode
	 * lists.  At this point, we essentially run in normal mode
	 * except that we're still performing recovery actions
	 * rather than accepting new requests.
	 */
	if (log->l_flags & XLOG_RECOVERY_NEEDED) {
		int	error;
		error = xlog_recover_process_efis(log);
		if (error) {
			cmn_err(CE_ALERT,
				"Failed to recover EFIs on filesystem: %s",
				log->l_mp->m_fsname);
			return error;
		}
		/*
		 * Sync the log to get all the EFIs out of the AIL.
		 * This isn't absolutely necessary, but it helps in
		 * case the unlink transactions would have problems
		 * pushing the EFIs out of the way.
		 */
		xfs_log_force(log->l_mp, XFS_LOG_SYNC);

		xlog_recover_process_iunlinks(log);

		xlog_recover_check_summary(log);

		cmn_err(CE_NOTE,
			"Ending XFS recovery on filesystem: %s (logdev: %s)",
			log->l_mp->m_fsname, log->l_mp->m_logname ?
			log->l_mp->m_logname : "internal");
		log->l_flags &= ~XLOG_RECOVERY_NEEDED;
	} else {
		cmn_err(CE_DEBUG,
			"!Ending clean XFS mount for filesystem: %s\n",
			log->l_mp->m_fsname);
	}
	return 0;
}

#if defined(DEBUG)
/*
 * Read all of the agf and agi counters and accumulate the per-AG
 * free block and inode totals.  This is a debug-only sanity walk of
 * the AG headers after recovery; the totals are no longer compared
 * against the superblock counters here.
 */
void
xlog_recover_check_summary(
	xlog_t		*log)
{
	xfs_mount_t	*mp;
	xfs_agf_t	*agfp;
	xfs_buf_t	*agfbp;
	xfs_buf_t	*agibp;
	xfs_agnumber_t	agno;
	__uint64_t	freeblks;
	__uint64_t	itotal;
	__uint64_t	ifree;
	int		error;

	mp = log->l_mp;

	freeblks = 0LL;
	itotal = 0LL;
	ifree = 0LL;
	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
		if (error) {
			xfs_fs_cmn_err(CE_ALERT, mp,
					"xlog_recover_check_summary(agf) "
					"agf read failed agno %d error %d",
					agno, error);
		} else {
			agfp = XFS_BUF_TO_AGF(agfbp);
			freeblks += be32_to_cpu(agfp->agf_freeblks) +
				    be32_to_cpu(agfp->agf_flcount);
			xfs_buf_relse(agfbp);
		}

		error = xfs_read_agi(mp, NULL, agno, &agibp);
		if (!error) {
			struct xfs_agi	*agi = XFS_BUF_TO_AGI(agibp);

			itotal += be32_to_cpu(agi->agi_count);
			ifree += be32_to_cpu(agi->agi_freecount);
			xfs_buf_relse(agibp);
		}
	}
}
#endif	/* DEBUG */