/*
 * Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved.
 *
 * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
 *
 * This file is released under the GPL.
 *
 * Linux 2.6 Device Mapper RAID4 and RAID5 target.
 *
 * Supports:
 *	o RAID4 with dedicated and selectable parity device
 *	o RAID5 with rotating parity (left+right, symmetric+asymmetric)
 *	o recovery of out of sync device for initial
 *	  RAID set creation or after dead drive replacement
 *	o run time optimization of xor algorithm used to calculate parity
 *
 * Thanks to MD for:
 *	o the raid address calculation algorithm
 *	o the base of the biovec <-> page list copier.
 *
 * Uses region hash to keep track of how many writes are in flight to
 * regions in order to use dirty log to keep state of regions to recover:
 *
 *	o clean regions (those which are synchronized
 *	  and don't have write io in flight)
 *	o dirty regions (those with write io in flight)
 *
 * On startup, any dirty regions are migrated to the
 * 'nosync' state and are subject to recovery by the daemon.
 *
 * See raid_ctr() for table definition.
 *
 * FIXME: recovery bandwidth
 */
static const char *version = "v0.2594b";
43 #include "dm-memcache.h"
44 #include "dm-message.h"
45 #include "dm-raid45.h"
47 #include <linux/kernel.h>
48 #include <linux/vmalloc.h>
49 #include <linux/raid/xor.h>
51 #include <linux/bio.h>
52 #include <linux/dm-io.h>
53 #include <linux/dm-dirty-log.h>
54 #include "dm-region-hash.h"
56 #include <linux/slab.h>
57 #include <linux/module.h>
/*
 * Configurable parameters
 */

/* Minimum/maximum and default # of selectable stripes. */
#define	STRIPES_MAX		16384
#define	STRIPES_DEFAULT		80

/* Minimum, maximum and default chunk size in sectors if not set in constructor. */
#define	CHUNK_SIZE_MIN		8
#define	CHUNK_SIZE_MAX		16384
#define	CHUNK_SIZE_DEFAULT	64

/* Default io size in sectors if not set in constructor. */
#define	IO_SIZE_MIN		CHUNK_SIZE_MIN
#define	IO_SIZE_DEFAULT		IO_SIZE_MIN

/* Recover io size default in sectors. */
#define	RECOVER_IO_SIZE_MIN	64
#define	RECOVER_IO_SIZE_DEFAULT	256

/* Default, minimum and maximum percentage of recover io bandwidth. */
#define	BANDWIDTH_DEFAULT	10
#define	BANDWIDTH_MIN		1
#define	BANDWIDTH_MAX		100

/* # of parallel recovered regions */
#define	RECOVERY_STRIPES_MIN	1
#define	RECOVERY_STRIPES_MAX	64
#define	RECOVERY_STRIPES_DEFAULT	RECOVERY_STRIPES_MIN

/*
 * END Configurable parameters
 */
#define	TARGET	"dm-raid45"
#define	DAEMON	"kraid45d"
#define	DM_MSG_PREFIX	TARGET

#define	SECTORS_PER_PAGE	(PAGE_SIZE >> SECTOR_SHIFT)

/* Amount/size for __xor(). */
#define	XOR_SIZE	PAGE_SIZE

/* Check value is in range. */
#define	range_ok(i, min, max)	((i) >= (min) && (i) <= (max))

/* Check argument is a power of 2. */
#define	POWER_OF_2(a)	(!((a) & ((a) - 1)))

/* Structure access macros. */
/* Derive raid_set from stripe_cache pointer. */
#define	RS(x)	container_of(x, struct raid_set, sc)

/* Page reference. */
#define	PAGE(stripe, p)		((stripe)->obj[p].pl->page)

/* Stripe chunk reference. */
#define	CHUNK(stripe, p)	((stripe)->chunk + (p))

/* Bio list reference. */
#define	BL(stripe, p, rw)	((stripe)->chunk[p].bl + (rw))
#define	BL_CHUNK(chunk, rw)	((chunk)->bl + (rw))

/* Page list reference. */
#define	PL(stripe, p)		((stripe)->obj[p].pl)
/* END: structure access macros. */
/* Factor out to dm-bio-list.h */
static inline void bio_list_push(struct bio_list *bl, struct bio *bio)
{
	bio->bi_next = bl->head;
	bl->head = bio;

	if (!bl->tail)
		bl->tail = bio;
}
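/*
 * Illustrative only (not part of the target): bio_list_push() above prepends
 * to the head of a struct bio_list, whereas the stock bio_list_add() appends
 * to the tail.  The caller and bios below are hypothetical.
 */
#if 0
static void requeue_front_example(struct bio_list *delayed, struct bio *bio)
{
	/* LIFO behaviour: the pushed bio is the next one bio_list_pop() returns. */
	bio_list_push(delayed, bio);
}
#endif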
/* Factor out to dm.h */
#define	TI_ERR_RET(str, ret) \
	do { ti->error = str; return ret; } while (0);
#define	TI_ERR(str)	TI_ERR_RET(str, -EINVAL)
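/*
 * Illustrative only: how the constructor-time error macros above are meant to
 * be used from a target method that has "struct dm_target *ti" in scope.  The
 * particular argument check is a hypothetical example.
 */
#if 0
static int ctr_check_example(struct dm_target *ti, unsigned chunk_size)
{
	if (!POWER_OF_2(chunk_size))
		TI_ERR("Invalid chunk size"); /* Sets ti->error and returns -EINVAL. */

	return 0;
}
#endif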
/* Macro to define IO flags access inline functions. */
#define	BITOPS(name, what, var, flag) \
static inline int TestClear ## name ## what(struct var *v) \
{ return test_and_clear_bit(flag, &v->io.flags); } \
static inline int TestSet ## name ## what(struct var *v) \
{ return test_and_set_bit(flag, &v->io.flags); } \
static inline void Clear ## name ## what(struct var *v) \
{ clear_bit(flag, &v->io.flags); } \
static inline void Set ## name ## what(struct var *v) \
{ set_bit(flag, &v->io.flags); } \
static inline int name ## what(struct var *v) \
{ return test_bit(flag, &v->io.flags); }
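/*
 * Illustrative only: what one BITOPS() instantiation expands to.  For example,
 * BITOPS(Chunk, Locked, stripe_chunk, CHUNK_LOCKED) (used below) generates the
 * following accessors operating on stripe_chunk->io.flags:
 */
#if 0
static inline int TestClearChunkLocked(struct stripe_chunk *v)
{ return test_and_clear_bit(CHUNK_LOCKED, &v->io.flags); }
static inline int TestSetChunkLocked(struct stripe_chunk *v)
{ return test_and_set_bit(CHUNK_LOCKED, &v->io.flags); }
static inline void ClearChunkLocked(struct stripe_chunk *v)
{ clear_bit(CHUNK_LOCKED, &v->io.flags); }
static inline void SetChunkLocked(struct stripe_chunk *v)
{ set_bit(CHUNK_LOCKED, &v->io.flags); }
static inline int ChunkLocked(struct stripe_chunk *v)
{ return test_bit(CHUNK_LOCKED, &v->io.flags); }
#endif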
/*-----------------------------------------------------------------
 * Cache for all reads and writes to raid sets (operational or degraded)
 *
 * We need to run all data to and from a RAID set through this cache,
 * because parity chunks need to get calculated from data chunks
 * or, in the degraded/resynchronization case, missing chunks need
 * to be reconstructed using the other chunks of the stripe.
 *---------------------------------------------------------------*/
/* A chunk within a stripe (holds bios hanging off). */
/* IO status flags for chunks of a stripe. */
enum chunk_flags {
	CHUNK_DIRTY,	/* Pages of chunk dirty; need writing. */
	CHUNK_ERROR,	/* IO error on any chunk page. */
	CHUNK_IO,	/* Allow/prohibit IO on chunk pages. */
	CHUNK_LOCKED,	/* Chunk pages locked during IO. */
	CHUNK_MUST_IO,	/* Chunk must do io. */
	CHUNK_UNLOCK,	/* Enforce chunk unlock. */
	CHUNK_UPTODATE,	/* Chunk pages are uptodate. */
};

/*
 * This does not work anymore with __REQ_* values being enums.
 */
#if READ != 0 || WRITE != 1
#error dm-raid45: READ/WRITE != 0/1 used as index!!!
#endif

/* Bio list indexes of a stripe chunk. */
enum bl_type {
	WRITE_QUEUED = WRITE + 1,
	WRITE_MERGED,	/* Writes merged to be written out. */
	NR_BL_TYPES,	/* Must be last one! */
};
struct stripe_chunk {
	atomic_t cnt;		/* Reference count. */
	struct stripe *stripe;	/* Backpointer to stripe for endio(). */
	/* Bio lists for reads, writes, and writes merged. */
	struct bio_list bl[NR_BL_TYPES];
	struct {
		unsigned long flags;	/* IO status flags. */
	} io;
};
/* Define chunk bit operations. */
BITOPS(Chunk, Dirty,	stripe_chunk, CHUNK_DIRTY)
BITOPS(Chunk, Error,	stripe_chunk, CHUNK_ERROR)
BITOPS(Chunk, Io,	stripe_chunk, CHUNK_IO)
BITOPS(Chunk, Locked,	stripe_chunk, CHUNK_LOCKED)
BITOPS(Chunk, MustIo,	stripe_chunk, CHUNK_MUST_IO)
BITOPS(Chunk, Unlock,	stripe_chunk, CHUNK_UNLOCK)
BITOPS(Chunk, Uptodate,	stripe_chunk, CHUNK_UPTODATE)
/*
 * Stripe linked list indexes. Keep order, because the stripe
 * and the stripe cache rely on the first 3!
 */
enum list_types {
	LIST_FLUSH,	/* Stripes to flush for io. */
	LIST_ENDIO,	/* Stripes to endio. */
	LIST_LRU,	/* Least recently used stripes. */
	SC_NR_LISTS,	/* # of lists in stripe cache. */
	LIST_HASH = SC_NR_LISTS,	/* Hashed stripes. */
	LIST_RECOVER = LIST_HASH,	/* For recovery type stripes only. */
	STRIPE_NR_LISTS,		/* To size array in struct stripe. */
};
/* Addressing region recovery. */
struct recover_addr {
	struct dm_region *reg;	/* Actual region to recover. */
	sector_t pos;		/* Position within region to recover. */
	sector_t end;		/* End of region to recover. */
};
/* A stripe: the io object to handle all reads and writes to a RAID set. */
struct stripe {
	atomic_t cnt;			/* Reference count. */
	struct stripe_cache *sc;	/* Backpointer to stripe cache. */

	/*
	 * Linked lists:
	 *
	 *	o io list to flush io
	 *	o endio list
	 *	o LRU list to put stripes w/o reference count on
	 *	o stripe cache hash
	 */
	struct list_head lists[STRIPE_NR_LISTS];

	sector_t key;		/* Hash key. */
	region_t region;	/* Region stripe is mapped to. */

	struct {
		unsigned long flags;	/* Stripe state flags (see below). */

		/*
		 * Pending ios in flight:
		 *
		 * used to control move of stripe to endio list
		 */
		atomic_t pending;

		/* Sectors to read and write for multi page stripe sets. */
		unsigned size;
	} io;

	/* Address region recovery. */
	struct recover_addr *recover;

	/* Lock on stripe (Future: for clustering). */
	void *lock;

	struct {
		unsigned short parity;	/* Parity chunk index. */
		short recover;		/* Recovery chunk index. */
	} idx;

	/*
	 * This stripe's memory cache object (dm-mem-cache);
	 * i.e. the io chunk pages.
	 */
	struct dm_mem_cache_object *obj;

	/* Array of stripe sets (dynamically allocated). */
	struct stripe_chunk chunk[0];
};
/* States stripes can be in (flags field). */
enum stripe_states {
	STRIPE_ERROR,		/* io error on stripe. */
	STRIPE_MERGED,		/* Writes got merged to be written. */
	STRIPE_RBW,		/* Read-before-write stripe. */
	STRIPE_RECONSTRUCT,	/* Reconstruct of a missing chunk required. */
	STRIPE_RECONSTRUCTED,	/* Missing chunk reconstructed. */
	STRIPE_RECOVER,		/* Stripe used for RAID set recovery. */
};
/* Define stripe bit operations. */
BITOPS(Stripe, Error,		stripe, STRIPE_ERROR)
BITOPS(Stripe, Merged,		stripe, STRIPE_MERGED)
BITOPS(Stripe, RBW,		stripe, STRIPE_RBW)
BITOPS(Stripe, Reconstruct,	stripe, STRIPE_RECONSTRUCT)
BITOPS(Stripe, Reconstructed,	stripe, STRIPE_RECONSTRUCTED)
BITOPS(Stripe, Recover,		stripe, STRIPE_RECOVER)
/* A stripe hash. */
struct stripe_hash {
	struct list_head *hash;
	unsigned buckets;
	unsigned mask;
	unsigned shift;
	unsigned prime;
};

/* Stripe cache lock indexes. */
enum sc_lock_types {
	LOCK_ENDIO,	/* Protect endio list. */
	LOCK_LRU,	/* Protect LRU list. */
	NR_LOCKS,	/* To size array in struct stripe_cache. */
};
/* A stripe cache. */
struct stripe_cache {
	/* Stripe hash. */
	struct stripe_hash hash;

	spinlock_t locks[NR_LOCKS];	/* Locks to protect lists. */

	/* Stripes with io to flush, stripes to endio and LRU lists. */
	struct list_head lists[SC_NR_LISTS];

	/* Slab cache to allocate stripes from. */
	struct {
		struct kmem_cache *cache;	/* Cache itself. */
		char name[32];			/* Unique name. */
	} kc;

	struct dm_io_client *dm_io_client; /* dm-io client resource context. */

	/* dm-mem-cache client resource context. */
	struct dm_mem_cache_client *mem_cache_client;

	int stripes_parm;	 /* # stripes parameter from constructor. */
	atomic_t stripes;	 /* actual # of stripes in cache. */
	atomic_t stripes_to_set; /* # of stripes to resize cache to. */
	atomic_t stripes_last;	 /* last # of stripes in cache. */
	atomic_t active_stripes; /* actual # of active stripes in cache. */

	atomic_t active_stripes_max; /* maximum # of active stripes seen. */
};
/* Flag specs for raid_dev. */
enum raid_dev_flags {
	DEV_FAILED,	/* Device failed. */
	DEV_IO_QUEUED,	/* Io got queued to device. */
};

/* The raid device in a set. */
struct raid_dev {
	struct dm_dev *dev;
	sector_t start;		/* Offset to map to. */
	struct {	/* Using struct to be able to BITOPS(). */
		unsigned long flags;	/* raid_dev_flags. */
	} io;
};

BITOPS(Dev, Failed,   raid_dev, DEV_FAILED)
BITOPS(Dev, IoQueued, raid_dev, DEV_IO_QUEUED)
/* Flags spec for raid_set. */
enum raid_set_flags {
	RS_CHECK_OVERWRITE,	/* Check for chunk overwrites. */
	RS_DEAD,		/* RAID set inoperative. */
	RS_DEGRADED,		/* Io errors on RAID device. */
	RS_DEVEL_STATS,		/* REMOVEME: display status information. */
	RS_RECOVER,		/* Do recovery. */
	RS_RECOVERY_BANDWIDTH,	/* Allow recovery bandwidth (delayed bios). */
	RS_SC_BUSY,		/* Stripe cache busy -> send an event. */
	RS_SUSPEND,		/* Suspend RAID set. */
};
/* REMOVEME: devel stats counters. */
	S_NR_STATS,	/* # of stats counters. Must be last! */

/* Status type -> string mappings. */
struct stats_map {
	const enum stats_types type;
	const char *str;
};
static struct stats_map stats_map[] = {
	{ S_BIOS_READ, "r=" },
	{ S_BIOS_ADDED_READ, "/" },
	{ S_BIOS_ENDIO_READ, "/" },
	{ S_BIOS_WRITE, " w=" },
	{ S_BIOS_ADDED_WRITE, "/" },
	{ S_BIOS_ENDIO_WRITE, "/" },
	{ S_DM_IO_READ, " rc=" },
	{ S_DM_IO_WRITE, " wc=" },
	{ S_BANDWIDTH, "\nbw=" },
	{ S_NO_BANDWIDTH, " no_bw=" },
	{ S_BARRIER, "\nbarrier=" },
	{ S_BIO_COPY_PL_NEXT, "\nbio_cp_next=" },
	{ S_CAN_MERGE, "\nmerge=" },
	{ S_CANT_MERGE, "/no_merge=" },
	{ S_CHUNK_LOCKED, "\nchunk_locked=" },
	{ S_CONGESTED, "\ncgst=" },
	{ S_NOT_CONGESTED, "/not_cgst=" },
	{ S_DEGRADED, "\ndegraded=" },
	{ S_DELAYED_BIOS, "\ndel_bios=" },
	{ S_SUM_DELAYED_BIOS, "/sum_del_bios=" },
	{ S_FLUSHS, "\nflushs=" },
	{ S_HITS_1ST, "\nhits_1st=" },
	{ S_IOS_POST, " ios_post=" },
	{ S_INSCACHE, " inscache=" },
	{ S_MAX_LOOKUP, " maxlookup=" },
	{ S_NO_RW, "\nno_rw=" },
	{ S_NOSYNC, " nosync=" },
	{ S_OVERWRITE, " ovr=" },
	{ S_PROHIBITCHUNKIO, " prhbt_io=" },
	{ S_RECONSTRUCT_EI, "\nrec_ei=" },
	{ S_RECONSTRUCT_DEV, " rec_dev=" },
	{ S_RECONSTRUCT_SET, " rec_set=" },
	{ S_RECONSTRUCTED, " rec=" },
	{ S_REQUEUE, " requeue=" },
	{ S_STRIPE_ERROR, " stripe_err=" },
	{ S_XORS, " xors=" },
};
#define	dm_rh_client	dm_region_hash
enum count_type { IO_WORK = 0, IO_RECOVER, IO_NR_COUNT };
typedef void (*xor_function_t)(unsigned count, unsigned long **data);
struct raid_set {
	struct dm_target *ti;	/* Target pointer. */

	struct {
		unsigned long flags;	/* State flags. */
		struct mutex in_lock;	/* Protects central input list below. */
		struct bio_list in;	/* Pending ios (central input list). */
		struct bio_list work;	/* ios work set. */
		wait_queue_head_t suspendq;	/* suspend synchronization. */
		atomic_t in_process;	/* counter of queued bios (suspendq). */
		atomic_t in_process_max;/* counter of queued bios max. */

		/* io work. */
		struct workqueue_struct *wq;
		struct delayed_work dws_do_raid;	/* For main worker. */
		struct work_struct ws_do_table_event;	/* For event worker. */
	} io;

	/* Stripe locking abstraction. */
	struct dm_raid45_locking_type *locking;

	struct stripe_cache sc;	/* Stripe cache for this set. */

	/* Xor optimization. */
	struct {
		struct xor_func *f;
		unsigned chunks;
	} xor;

	/* Recovery parameters. */
	struct recover {
		struct dm_dirty_log *dl;	/* Dirty log. */
		struct dm_rh_client *rh;	/* Region hash. */

		struct dm_io_client *dm_io_client; /* recovery dm-io client. */
		/* dm-mem-cache client resource context for recovery stripes. */
		struct dm_mem_cache_client *mem_cache_client;

		struct list_head stripes;	/* List of recovery stripes. */

		region_t nr_regions_to_recover;
		region_t nr_regions_recovered;
		unsigned long start_jiffies;
		unsigned long end_jiffies;

		unsigned bandwidth;	 /* Recovery bandwidth [%]. */
		unsigned bandwidth_work; /* Recovery bandwidth [factor]. */
		unsigned bandwidth_parm; /*  " constructor parm. */
		unsigned io_size;	 /* recovery io size <= region size. */
		unsigned io_size_parm;	 /* recovery io size ctr parameter. */
		unsigned recovery;	 /* Recovery allowed/prohibited. */
		unsigned recovery_stripes; /* # of parallel recovery stripes. */

		/* recovery io throttling. */
		atomic_t io_count[IO_NR_COUNT];	/* counter recover/regular io. */
		unsigned long last_jiffies;
	} recover;

	/* RAID set parameters. */
	struct {
		struct raid_type *raid_type;	/* RAID type (eg, RAID4). */
		unsigned raid_parms;	/* # variable raid parameters. */

		unsigned chunk_size;	/* Sectors per chunk. */
		unsigned chunk_size_parm;
		unsigned chunk_shift;	/* rsector chunk size shift. */

		unsigned io_size;	/* Sectors per io. */
		unsigned io_size_parm;
		unsigned io_mask;	/* Mask for bio_copy_page_list(). */
		unsigned io_inv_mask;	/* Mask for raid_address(). */

		sector_t sectors_per_dev;	/* Sectors per device. */

		atomic_t failed_devs;	/* Amount of devices failed. */

		/* Index of device to initialize. */
		int dev_to_init;
		int dev_to_init_parm;

		/* Raid devices dynamically allocated. */
		unsigned raid_devs;	/* # of RAID devices below. */
		unsigned data_devs;	/* # of RAID data devices. */

		int ei;		/* index of failed RAID device. */

		/* Index of dedicated parity device (i.e. RAID4). */
		int pi;
		int pi_parm;	/* constructor parm for status output. */
	} set;

	/* REMOVEME: devel stats counters. */
	atomic_t stats[S_NR_STATS];

	/* Dynamically allocated temporary pointers for xor(). */
	unsigned long **data;

	/* Dynamically allocated RAID devices. Alignment? */
	struct raid_dev dev[0];
};
/* Define RAID set bit operations. */
BITOPS(RS, Bandwidth,	   raid_set, RS_RECOVERY_BANDWIDTH)
BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE)
BITOPS(RS, Dead,	   raid_set, RS_DEAD)
BITOPS(RS, Degraded,	   raid_set, RS_DEGRADED)
BITOPS(RS, DevelStats,	   raid_set, RS_DEVEL_STATS)
BITOPS(RS, Recover,	   raid_set, RS_RECOVER)
BITOPS(RS, ScBusy,	   raid_set, RS_SC_BUSY)
BITOPS(RS, Suspend,	   raid_set, RS_SUSPEND)
/*-----------------------------------------------------------------
 * Raid-4/5 set structures.
 *---------------------------------------------------------------*/
/* RAID level definitions. */
enum raid_level {
	raid4,
	raid5,
};

/* Symmetric/Asymmetric, Left/Right parity rotating algorithms. */
enum raid_algorithm {
	none,
	left_asym,
	right_asym,
	left_sym,
	right_sym,
};

struct raid_type {
	const char *name;		/* RAID algorithm. */
	const char *descr;		/* Descriptor text for logging. */
	const unsigned parity_devs;	/* # of parity devices. */
	const unsigned minimal_devs;	/* minimal # of devices in set. */
	const enum raid_level level;	/* RAID level. */
	const enum raid_algorithm algorithm;	/* RAID algorithm. */
};
/* Supported raid types and properties. */
static struct raid_type raid_types[] = {
	{"raid4",    "RAID4 (dedicated parity disk)", 1, 3, raid4, none},
	{"raid5_la", "RAID5 (left asymmetric)",       1, 3, raid5, left_asym},
	{"raid5_ra", "RAID5 (right asymmetric)",      1, 3, raid5, right_asym},
	{"raid5_ls", "RAID5 (left symmetric)",        1, 3, raid5, left_sym},
	{"raid5_rs", "RAID5 (right symmetric)",       1, 3, raid5, right_sym},
};
/* Address as calculated by raid_address(). */
struct raid_address {
	sector_t key;		/* Hash key (address of stripe % chunk_size). */
	unsigned di, pi;	/* Data and parity disk indexes. */
};
/* REMOVEME: reset statistics counters. */
static void stats_reset(struct raid_set *rs)
{
	unsigned s = S_NR_STATS;

	while (s--)
		atomic_set(rs->stats + s, 0);
}
/*----------------------------------------------------------------
 * RAID set management routines.
 *--------------------------------------------------------------*/
/*
 * Begin small helper functions.
 */
/* No need to be called from region hash indirectly at dm_rh_dec(). */
static void wake_dummy(void *context) {}

/* Return # of io references. */
static int io_ref(struct raid_set *rs)
{
	return atomic_read(&rs->io.in_process);
}

/* Get an io reference. */
static void io_get(struct raid_set *rs)
{
	int p = atomic_inc_return(&rs->io.in_process);

	if (p > atomic_read(&rs->io.in_process_max))
		atomic_set(&rs->io.in_process_max, p);	/* REMOVEME: max. */
}

/* Put the io reference and conditionally wake io waiters. */
static void io_put(struct raid_set *rs)
{
	/* Intel: rebuild data corrupter? */
	if (atomic_dec_and_test(&rs->io.in_process))
		wake_up(&rs->io.suspendq);

	BUG_ON(io_ref(rs) < 0);
}

/* Wait until all io has been processed. */
static void wait_ios(struct raid_set *rs)
{
	wait_event(rs->io.suspendq, !io_ref(rs));
}
/* Queue (optionally delayed) io work. */
static void wake_do_raid_delayed(struct raid_set *rs, unsigned long delay)
{
	queue_delayed_work(rs->io.wq, &rs->io.dws_do_raid, delay);
}

/* Queue io work immediately (called from region hash too). */
static void wake_do_raid(void *context)
{
	struct raid_set *rs = context;

	queue_work(rs->io.wq, &rs->io.dws_do_raid.work);
}
/* Calculate device sector offset. */
static sector_t _sector(struct raid_set *rs, struct bio *bio)
{
	sector_t sector = bio->bi_sector;

	sector_div(sector, rs->set.data_devs);
	return sector;
}
/* Return # of active stripes in stripe cache. */
static int sc_active(struct stripe_cache *sc)
{
	return atomic_read(&sc->active_stripes);
}

/* Stripe cache busy indicator. */
static int sc_busy(struct raid_set *rs)
{
	return sc_active(&rs->sc) >
	       atomic_read(&rs->sc.stripes) - (STRIPES_MIN / 2);
}
709 /* Set chunks states. */
710 enum chunk_dirty_type
{ CLEAN
, DIRTY
, ERROR
};
711 static void chunk_set(struct stripe_chunk
*chunk
, enum chunk_dirty_type type
)
715 ClearChunkDirty(chunk
);
718 SetChunkDirty(chunk
);
721 SetChunkError(chunk
);
722 SetStripeError(chunk
->stripe
);
728 SetChunkUptodate(chunk
);
730 ClearChunkError(chunk
);
733 /* Return region state for a sector. */
734 static int region_state(struct raid_set
*rs
, sector_t sector
,
735 enum dm_rh_region_states state
)
737 struct dm_rh_client
*rh
= rs
->recover
.rh
;
738 region_t region
= dm_rh_sector_to_region(rh
, sector
);
740 return !!(dm_rh_get_state(rh
, region
, 1) & state
);
744 * Return true in case a chunk should be read/written
746 * Conditions to read/write:
747 * o chunk not uptodate
750 * Conditios to avoid io:
751 * o io already ongoing on chunk
752 * o io explitely prohibited
754 static int chunk_io(struct stripe_chunk
*chunk
)
756 /* 2nd run optimization (flag set below on first run). */
757 if (TestClearChunkMustIo(chunk
))
760 /* Avoid io if prohibited or a locked chunk. */
761 if (!ChunkIo(chunk
) || ChunkLocked(chunk
))
764 if (!ChunkUptodate(chunk
) || ChunkDirty(chunk
)) {
765 SetChunkMustIo(chunk
); /* 2nd run optimization. */
772 /* Call a function on each chunk needing io unless device failed. */
773 static unsigned for_each_io_dev(struct stripe
*stripe
,
774 void (*f_io
)(struct stripe
*stripe
, unsigned p
))
776 struct raid_set
*rs
= RS(stripe
->sc
);
779 for (p
= 0; p
< rs
->set
.raid_devs
; p
++) {
780 if (chunk_io(CHUNK(stripe
, p
)) && !DevFailed(rs
->dev
+ p
)) {
790 * Index of device to calculate parity on.
792 * Either the parity device index *or* the selected
793 * device to init after a spare replacement.
795 static int dev_for_parity(struct stripe
*stripe
, int *sync
)
797 struct raid_set
*rs
= RS(stripe
->sc
);
798 int r
= region_state(rs
, stripe
->key
, DM_RH_NOSYNC
| DM_RH_RECOVERING
);
802 /* Reconstruct a particular device ?. */
803 if (r
&& rs
->set
.dev_to_init
> -1)
804 return rs
->set
.dev_to_init
;
805 else if (rs
->set
.raid_type
->level
== raid4
)
807 else if (!StripeRecover(stripe
))
808 return stripe
->idx
.parity
;
813 /* RAID set congested function. */
814 static int rs_congested(void *congested_data
, int bdi_bits
)
818 struct raid_set
*rs
= congested_data
;
820 if (sc_busy(rs
) || RSSuspend(rs
))
822 else for (r
= 0, p
= rs
->set
.raid_devs
; !r
&& p
--; ) {
823 /* If any of our component devices are overloaded. */
824 struct request_queue
*q
= bdev_get_queue(rs
->dev
[p
].dev
->bdev
);
826 r
|= bdi_congested(&q
->backing_dev_info
, bdi_bits
);
829 /* REMOVEME: statistics. */
830 atomic_inc(rs
->stats
+ (r
? S_CONGESTED
: S_NOT_CONGESTED
));
834 /* RAID device degrade check. */
835 static void rs_check_degrade_dev(struct raid_set
*rs
,
836 struct stripe
*stripe
, unsigned p
)
838 if (TestSetDevFailed(rs
->dev
+ p
))
841 /* Through an event in case of member device errors. */
842 if (atomic_inc_return(&rs
->set
.failed_devs
) >
843 rs
->set
.raid_type
->parity_devs
&&
844 !TestSetRSDead(rs
)) {
845 /* Display RAID set dead message once. */
847 char buf
[BDEVNAME_SIZE
];
849 DMERR("FATAL: too many devices failed -> RAID set broken");
850 for (p
= 0; p
< rs
->set
.raid_devs
; p
++) {
851 if (DevFailed(rs
->dev
+ p
))
852 DMERR("device /dev/%s failed",
853 bdevname(rs
->dev
[p
].dev
->bdev
, buf
));
857 /* Only log the first member error. */
858 if (!TestSetRSDegraded(rs
)) {
859 char buf
[BDEVNAME_SIZE
];
861 /* Store index for recovery. */
863 DMERR("CRITICAL: %sio error on device /dev/%s "
864 "in region=%llu; DEGRADING RAID set\n",
865 stripe
? "" : "FAKED ",
866 bdevname(rs
->dev
[p
].dev
->bdev
, buf
),
867 (unsigned long long) (stripe
? stripe
->key
: 0));
868 DMERR("further device error messages suppressed");
871 schedule_work(&rs
->io
.ws_do_table_event
);
874 /* RAID set degrade check. */
875 static void rs_check_degrade(struct stripe
*stripe
)
877 struct raid_set
*rs
= RS(stripe
->sc
);
878 unsigned p
= rs
->set
.raid_devs
;
881 if (ChunkError(CHUNK(stripe
, p
)))
882 rs_check_degrade_dev(rs
, stripe
, p
);
886 /* Lookup a RAID device by name or by major:minor number. */
887 static int raid_dev_lookup(struct raid_set
*rs
, struct raid_dev
*dev_lookup
)
890 struct raid_dev
*dev
;
893 * Must be an incremental loop, because the device array
894 * can have empty slots still on calls from raid_ctr()
896 for (dev
= rs
->dev
, p
= 0;
897 dev
->dev
&& p
< rs
->set
.raid_devs
;
899 if (dev_lookup
->dev
->bdev
->bd_dev
== dev
->dev
->bdev
->bd_dev
)
906 * End small helper functions.
910 * Stripe hash functions
912 /* Initialize/destroy stripe hash. */
913 static int hash_init(struct stripe_hash
*hash
, unsigned stripes
)
915 unsigned buckets
= 2, max_buckets
= stripes
>> 1;
916 static unsigned hash_primes
[] = {
917 /* Table of primes for hash_fn/table size optimization. */
918 1, 2, 3, 7, 13, 27, 53, 97, 193, 389, 769,
919 1543, 3079, 6151, 12289, 24593, 49157, 98317,
922 /* Calculate number of buckets (2^^n <= stripes / 2). */
923 while (buckets
< max_buckets
)
926 /* Allocate stripe hash buckets. */
927 hash
->hash
= vmalloc(buckets
* sizeof(*hash
->hash
));
931 hash
->buckets
= buckets
;
932 hash
->mask
= buckets
- 1;
933 hash
->shift
= ffs(buckets
);
934 if (hash
->shift
> ARRAY_SIZE(hash_primes
))
935 hash
->shift
= ARRAY_SIZE(hash_primes
) - 1;
937 BUG_ON(hash
->shift
< 2);
938 hash
->prime
= hash_primes
[hash
->shift
];
940 /* Initialize buckets. */
942 INIT_LIST_HEAD(hash
->hash
+ buckets
);
946 static void hash_exit(struct stripe_hash
*hash
)
static unsigned hash_fn(struct stripe_hash *hash, sector_t key)
{
	return (unsigned) (((key * hash->prime) >> hash->shift) & hash->mask);
}

static struct list_head *hash_bucket(struct stripe_hash *hash, sector_t key)
{
	return hash->hash + hash_fn(hash, key);
}
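/*
 * Illustrative only: how a stripe key maps to a bucket with the multiplicative
 * hash above.  Assuming hash_init() ended up with 1024 buckets, then
 * shift = ffs(1024) = 11, prime = hash_primes[11] = 1543 and mask = 1023, so a
 * key of 8192 lands in bucket ((8192 * 1543) >> 11) & 1023.  The concrete
 * numbers are an assumed example, not fixed by the code.
 */
#if 0
static struct list_head *bucket_example(struct stripe_hash *hash)
{
	sector_t key = 8192;	/* Hypothetical stripe hash key. */

	/* Same as hash->hash + hash_fn(hash, key). */
	return hash_bucket(hash, key);
}
#endif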
964 /* Insert an entry into a hash. */
965 static void stripe_insert(struct stripe_hash
*hash
, struct stripe
*stripe
)
967 list_add(stripe
->lists
+ LIST_HASH
, hash_bucket(hash
, stripe
->key
));
970 /* Lookup an entry in the stripe hash. */
971 static struct stripe
*stripe_lookup(struct stripe_cache
*sc
, sector_t key
)
974 struct stripe
*stripe
;
975 struct list_head
*bucket
= hash_bucket(&sc
->hash
, key
);
977 list_for_each_entry(stripe
, bucket
, lists
[LIST_HASH
]) {
980 if (stripe
->key
== key
) {
981 /* REMOVEME: statisics. */
982 if (look
> atomic_read(RS(sc
)->stats
+ S_MAX_LOOKUP
))
983 atomic_set(RS(sc
)->stats
+ S_MAX_LOOKUP
, look
);
991 /* Resize the stripe cache hash on size changes. */
992 static int sc_hash_resize(struct stripe_cache
*sc
)
994 /* Resize indicated ? */
995 if (atomic_read(&sc
->stripes
) != atomic_read(&sc
->stripes_last
)) {
997 struct stripe_hash hash
;
999 r
= hash_init(&hash
, atomic_read(&sc
->stripes
));
1003 if (sc
->hash
.hash
) {
1004 unsigned b
= sc
->hash
.buckets
;
1005 struct list_head
*pos
, *tmp
;
1007 /* Walk old buckets and insert into new. */
1009 list_for_each_safe(pos
, tmp
, sc
->hash
.hash
+ b
)
1010 stripe_insert(&hash
,
1011 list_entry(pos
, struct stripe
,
1017 hash_exit(&sc
->hash
);
1018 memcpy(&sc
->hash
, &hash
, sizeof(sc
->hash
));
1019 atomic_set(&sc
->stripes_last
, atomic_read(&sc
->stripes
));
1024 /* End hash stripe hash function. */
1026 /* List add, delete, push and pop functions. */
1027 /* Add stripe to flush list. */
#define DEL_LIST(lh) \
	if (!list_empty(lh)) \
		list_del_init(lh);
1032 /* Delete stripe from hash. */
1033 static void stripe_hash_del(struct stripe
*stripe
)
1035 DEL_LIST(stripe
->lists
+ LIST_HASH
);
1038 /* Return stripe reference count. */
1039 static inline int stripe_ref(struct stripe
*stripe
)
1041 return atomic_read(&stripe
->cnt
);
1044 static void stripe_flush_add(struct stripe
*stripe
)
1046 struct stripe_cache
*sc
= stripe
->sc
;
1047 struct list_head
*lh
= stripe
->lists
+ LIST_FLUSH
;
1049 if (!StripeReconstruct(stripe
) && list_empty(lh
))
1050 list_add_tail(lh
, sc
->lists
+ LIST_FLUSH
);
1054 * Add stripe to LRU (inactive) list.
1056 * Need lock, because of concurrent access from message interface.
1058 static void stripe_lru_add(struct stripe
*stripe
)
1060 if (!StripeRecover(stripe
)) {
1061 unsigned long flags
;
1062 struct list_head
*lh
= stripe
->lists
+ LIST_LRU
;
1063 spinlock_t
*lock
= stripe
->sc
->locks
+ LOCK_LRU
;
1065 spin_lock_irqsave(lock
, flags
);
1067 list_add_tail(lh
, stripe
->sc
->lists
+ LIST_LRU
);
1068 spin_unlock_irqrestore(lock
, flags
);
#define POP_LIST(list) \
	do { \
		if (list_empty(sc->lists + (list))) \
			stripe = NULL; \
		else { \
			stripe = list_first_entry(sc->lists + (list), \
						  struct stripe, \
						  lists[(list)]); \
			list_del_init(stripe->lists + (list)); \
		} \
	} while (0)
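/*
 * Illustrative only: the pop helpers below (stripe_lru_pop(), stripe_io_pop(),
 * stripe_endio_pop()) expand POP_LIST() against a local "sc" and "stripe", so
 * POP_LIST(LIST_FLUSH), for example, behaves like this open-coded sketch.
 */
#if 0
static struct stripe *pop_flush_example(struct stripe_cache *sc)
{
	struct stripe *stripe;

	if (list_empty(sc->lists + LIST_FLUSH))
		stripe = NULL;
	else {
		stripe = list_first_entry(sc->lists + LIST_FLUSH,
					  struct stripe, lists[LIST_FLUSH]);
		list_del_init(stripe->lists + LIST_FLUSH);
	}

	return stripe;
}
#endif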
1084 /* Pop an available stripe off the LRU list. */
1085 static struct stripe
*stripe_lru_pop(struct stripe_cache
*sc
)
1087 struct stripe
*stripe
;
1088 spinlock_t
*lock
= sc
->locks
+ LOCK_LRU
;
1090 spin_lock_irq(lock
);
1092 spin_unlock_irq(lock
);
1097 /* Pop an available stripe off the io list. */
1098 static struct stripe
*stripe_io_pop(struct stripe_cache
*sc
)
1100 struct stripe
*stripe
;
1102 POP_LIST(LIST_FLUSH
);
1106 /* Push a stripe safely onto the endio list to be handled by do_endios(). */
1107 static void stripe_endio_push(struct stripe
*stripe
)
1109 unsigned long flags
;
1110 struct stripe_cache
*sc
= stripe
->sc
;
1111 struct list_head
*stripe_list
= stripe
->lists
+ LIST_ENDIO
,
1112 *sc_list
= sc
->lists
+ LIST_ENDIO
;
1113 spinlock_t
*lock
= sc
->locks
+ LOCK_ENDIO
;
1115 /* This runs in parallel with do_endios(). */
1116 spin_lock_irqsave(lock
, flags
);
1117 if (list_empty(stripe_list
))
1118 list_add_tail(stripe_list
, sc_list
);
1119 spin_unlock_irqrestore(lock
, flags
);
1121 wake_do_raid(RS(sc
)); /* Wake myself. */
1124 /* Pop a stripe off safely off the endio list. */
1125 static struct stripe
*stripe_endio_pop(struct stripe_cache
*sc
)
1127 struct stripe
*stripe
;
1128 spinlock_t
*lock
= sc
->locks
+ LOCK_ENDIO
;
1130 /* This runs in parallel with endio(). */
1131 spin_lock_irq(lock
);
1132 POP_LIST(LIST_ENDIO
)
1133 spin_unlock_irq(lock
);
1139 * Stripe cache locking functions
1141 /* Dummy lock function for single host RAID4+5. */
1142 static void *no_lock(sector_t key
, enum dm_lock_type type
)
1147 /* Dummy unlock function for single host RAID4+5. */
1148 static void no_unlock(void *lock_handle
)
1152 /* No locking (for single host RAID 4+5). */
1153 static struct dm_raid45_locking_type locking_none
= {
1155 .unlock
= no_unlock
,
1158 /* Lock a stripe (for clustering). */
1160 stripe_lock(struct stripe
*stripe
, int rw
, sector_t key
)
1162 stripe
->lock
= RS(stripe
->sc
)->locking
->lock(key
, rw
== READ
? DM_RAID45_SHARED
: DM_RAID45_EX
);
1163 return stripe
->lock
? 0 : -EPERM
;
1166 /* Unlock a stripe (for clustering). */
1167 static void stripe_unlock(struct stripe
*stripe
)
1169 RS(stripe
->sc
)->locking
->unlock(stripe
->lock
);
1170 stripe
->lock
= NULL
;
1173 /* Test io pending on stripe. */
1174 static int stripe_io_ref(struct stripe
*stripe
)
1176 return atomic_read(&stripe
->io
.pending
);
1179 static void stripe_io_get(struct stripe
*stripe
)
1181 if (atomic_inc_return(&stripe
->io
.pending
) == 1)
1182 /* REMOVEME: statistics */
1183 atomic_inc(&stripe
->sc
->active_stripes
);
1185 BUG_ON(stripe_io_ref(stripe
) < 0);
1188 static void stripe_io_put(struct stripe
*stripe
)
1190 if (atomic_dec_and_test(&stripe
->io
.pending
)) {
1191 if (unlikely(StripeRecover(stripe
)))
1192 /* Don't put recovery stripe on endio list. */
1193 wake_do_raid(RS(stripe
->sc
));
1195 /* Add regular stripe to endio list and wake daemon. */
1196 stripe_endio_push(stripe
);
1198 /* REMOVEME: statistics */
1199 atomic_dec(&stripe
->sc
->active_stripes
);
1201 BUG_ON(stripe_io_ref(stripe
) < 0);
1204 /* Take stripe reference out. */
1205 static int stripe_get(struct stripe
*stripe
)
1208 struct list_head
*lh
= stripe
->lists
+ LIST_LRU
;
1209 spinlock_t
*lock
= stripe
->sc
->locks
+ LOCK_LRU
;
1211 /* Delete stripe from LRU (inactive) list if on. */
1212 spin_lock_irq(lock
);
1214 spin_unlock_irq(lock
);
1216 BUG_ON(stripe_ref(stripe
) < 0);
1218 /* Lock stripe on first reference */
1219 r
= (atomic_inc_return(&stripe
->cnt
) == 1) ?
1220 stripe_lock(stripe
, WRITE
, stripe
->key
) : 0;
1226 /* Return references on a chunk. */
1227 static int chunk_ref(struct stripe_chunk
*chunk
)
1229 return atomic_read(&chunk
->cnt
);
1232 /* Take out reference on a chunk. */
1233 static int chunk_get(struct stripe_chunk
*chunk
)
1235 return atomic_inc_return(&chunk
->cnt
);
1238 /* Drop reference on a chunk. */
1239 static void chunk_put(struct stripe_chunk
*chunk
)
1241 BUG_ON(atomic_dec_return(&chunk
->cnt
) < 0);
1245 * Drop reference on a stripe.
1247 * Move it to list of LRU stripes if zero.
1249 static void stripe_put(struct stripe
*stripe
)
1251 if (atomic_dec_and_test(&stripe
->cnt
)) {
1252 BUG_ON(stripe_io_ref(stripe
));
1253 stripe_unlock(stripe
);
1255 BUG_ON(stripe_ref(stripe
) < 0);
1258 /* Helper needed by for_each_io_dev(). */
1259 static void stripe_get_references(struct stripe
*stripe
, unsigned p
)
1263 * Another one to reference the stripe in
1264 * order to protect vs. LRU list moves.
1266 io_get(RS(stripe
->sc
)); /* Global io references. */
1268 stripe_io_get(stripe
); /* One for each chunk io. */
1271 /* Helper for endio() to put all take references. */
1272 static void stripe_put_references(struct stripe
*stripe
)
1274 stripe_io_put(stripe
); /* One for each chunk io. */
1276 io_put(RS(stripe
->sc
));
1280 * Stripe cache functions.
1283 * Invalidate all chunks (i.e. their pages) of a stripe.
1285 * I only keep state for the whole chunk.
1287 static inline void stripe_chunk_invalidate(struct stripe_chunk
*chunk
)
1289 chunk
->io
.flags
= 0;
1293 stripe_chunks_invalidate(struct stripe
*stripe
)
1295 unsigned p
= RS(stripe
->sc
)->set
.raid_devs
;
1298 stripe_chunk_invalidate(CHUNK(stripe
, p
));
1301 /* Prepare stripe for (re)use. */
1302 static void stripe_invalidate(struct stripe
*stripe
)
1304 stripe
->io
.flags
= 0;
1305 stripe
->idx
.parity
= stripe
->idx
.recover
= -1;
1306 stripe_chunks_invalidate(stripe
);
1310 * Allow io on all chunks of a stripe.
1311 * If not set, IO will not occur; i.e. it's prohibited.
1313 * Actual IO submission for allowed chunks depends
1314 * on their !uptodate or dirty state.
1316 static void stripe_allow_io(struct stripe
*stripe
)
1318 unsigned p
= RS(stripe
->sc
)->set
.raid_devs
;
1321 SetChunkIo(CHUNK(stripe
, p
));
1324 /* Initialize a stripe. */
1325 static void stripe_init(struct stripe_cache
*sc
, struct stripe
*stripe
)
1327 unsigned i
, p
= RS(sc
)->set
.raid_devs
;
1329 /* Work all io chunks. */
1331 struct stripe_chunk
*chunk
= CHUNK(stripe
, p
);
1333 atomic_set(&chunk
->cnt
, 0);
1334 chunk
->stripe
= stripe
;
1335 i
= ARRAY_SIZE(chunk
->bl
);
1337 bio_list_init(chunk
->bl
+ i
);
1343 i
= ARRAY_SIZE(stripe
->lists
);
1345 INIT_LIST_HEAD(stripe
->lists
+ i
);
1347 stripe
->io
.size
= RS(sc
)->set
.io_size
;
1348 atomic_set(&stripe
->cnt
, 0);
1349 atomic_set(&stripe
->io
.pending
, 0);
1350 stripe_invalidate(stripe
);
/* Number of pages per chunk. */
static inline unsigned chunk_pages(unsigned sectors)
{
	return dm_div_up(sectors, SECTORS_PER_PAGE);
}

/* Number of pages per stripe. */
static inline unsigned stripe_pages(struct raid_set *rs, unsigned io_size)
{
	return chunk_pages(io_size) * rs->set.raid_devs;
}
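/*
 * Illustrative only: with 4 KiB pages (SECTORS_PER_PAGE == 8), an assumed
 * io_size of 64 sectors and an assumed 5-device set, each chunk needs
 * chunk_pages(64) == 8 pages and a whole stripe needs 8 * 5 == 40 pages.
 */
#if 0
static void page_math_example(struct raid_set *rs)
{
	unsigned per_chunk = chunk_pages(64);	    /* 64 / 8 = 8 pages. */
	unsigned per_stripe = stripe_pages(rs, 64); /* 8 * rs->set.raid_devs. */

	(void) per_chunk;
	(void) per_stripe;
}
#endif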
1365 /* Initialize part of page_list (recovery). */
1366 static void stripe_zero_pl_part(struct stripe
*stripe
, int p
,
1367 unsigned start
, unsigned count
)
1369 unsigned o
= start
/ SECTORS_PER_PAGE
, pages
= chunk_pages(count
);
1370 /* Get offset into the page_list. */
1371 struct page_list
*pl
= pl_elem(PL(stripe
, p
), o
);
1374 while (pl
&& pages
--) {
1376 memset(page_address(pl
->page
), 0, PAGE_SIZE
);
1381 /* Initialize parity chunk of stripe. */
1382 static void stripe_zero_chunk(struct stripe
*stripe
, int p
)
1385 stripe_zero_pl_part(stripe
, p
, 0, stripe
->io
.size
);
1388 /* Return dynamic stripe structure size. */
1389 static size_t stripe_size(struct raid_set
*rs
)
1391 return sizeof(struct stripe
) +
1392 rs
->set
.raid_devs
* sizeof(struct stripe_chunk
);
1395 /* Allocate a stripe and its memory object. */
1396 /* XXX adjust to cope with stripe cache and recovery stripe caches. */
1397 enum grow
{ SC_GROW
, SC_KEEP
};
1398 static struct stripe
*stripe_alloc(struct stripe_cache
*sc
,
1399 struct dm_mem_cache_client
*mc
,
1403 struct stripe
*stripe
;
1405 stripe
= kmem_cache_zalloc(sc
->kc
.cache
, GFP_KERNEL
);
1407 /* Grow the dm-mem-cache by one object. */
1408 if (grow
== SC_GROW
) {
1409 r
= dm_mem_cache_grow(mc
, 1);
1414 stripe
->obj
= dm_mem_cache_alloc(mc
);
1418 stripe_init(sc
, stripe
);
1424 if (grow
== SC_GROW
)
1425 dm_mem_cache_shrink(mc
, 1);
1427 kmem_cache_free(sc
->kc
.cache
, stripe
);
1432 * Free a stripes memory object, shrink the
1433 * memory cache and free the stripe itself.
1435 static void stripe_free(struct stripe
*stripe
, struct dm_mem_cache_client
*mc
)
1437 dm_mem_cache_free(mc
, stripe
->obj
);
1438 dm_mem_cache_shrink(mc
, 1);
1439 kmem_cache_free(stripe
->sc
->kc
.cache
, stripe
);
1442 /* Free the recovery stripe. */
1443 static void stripe_recover_free(struct raid_set
*rs
)
1445 struct recover
*rec
= &rs
->recover
;
1446 struct dm_mem_cache_client
*mc
;
1448 mc
= rec
->mem_cache_client
;
1449 rec
->mem_cache_client
= NULL
;
1451 struct stripe
*stripe
;
1453 while (!list_empty(&rec
->stripes
)) {
1454 stripe
= list_first_entry(&rec
->stripes
, struct stripe
,
1455 lists
[LIST_RECOVER
]);
1456 list_del(stripe
->lists
+ LIST_RECOVER
);
1457 kfree(stripe
->recover
);
1458 stripe_free(stripe
, mc
);
1461 dm_mem_cache_client_destroy(mc
);
1462 dm_io_client_destroy(rec
->dm_io_client
);
1463 rec
->dm_io_client
= NULL
;
1467 /* Grow stripe cache. */
1468 static int sc_grow(struct stripe_cache
*sc
, unsigned stripes
, enum grow grow
)
1472 /* Try to allocate this many (additional) stripes. */
1474 struct stripe
*stripe
=
1475 stripe_alloc(sc
, sc
->mem_cache_client
, grow
);
1477 if (likely(stripe
)) {
1478 stripe_lru_add(stripe
);
1479 atomic_inc(&sc
->stripes
);
1486 return r
? r
: sc_hash_resize(sc
);
1489 /* Shrink stripe cache. */
1490 static int sc_shrink(struct stripe_cache
*sc
, unsigned stripes
)
1494 /* Try to get unused stripe from LRU list. */
1496 struct stripe
*stripe
;
1498 stripe
= stripe_lru_pop(sc
);
1500 /* An LRU stripe may never have ios pending! */
1501 BUG_ON(stripe_io_ref(stripe
));
1502 BUG_ON(stripe_ref(stripe
));
1503 atomic_dec(&sc
->stripes
);
1504 /* Remove from hash if on before deletion. */
1505 stripe_hash_del(stripe
);
1506 stripe_free(stripe
, sc
->mem_cache_client
);
1513 /* Check if stats are still sane. */
1514 if (atomic_read(&sc
->active_stripes_max
) >
1515 atomic_read(&sc
->stripes
))
1516 atomic_set(&sc
->active_stripes_max
, 0);
1521 return atomic_read(&sc
->stripes
) ? sc_hash_resize(sc
) : 0;
1524 /* Create stripe cache and recovery. */
1525 static int sc_init(struct raid_set
*rs
, unsigned stripes
)
1527 unsigned i
, r
, rstripes
;
1528 struct stripe_cache
*sc
= &rs
->sc
;
1529 struct stripe
*stripe
;
1530 struct recover
*rec
= &rs
->recover
;
1531 struct mapped_device
*md
;
1532 struct gendisk
*disk
;
1534 /* Initialize lists and locks. */
1535 i
= ARRAY_SIZE(sc
->lists
);
1537 INIT_LIST_HEAD(sc
->lists
+ i
);
1539 INIT_LIST_HEAD(&rec
->stripes
);
1541 /* Initialize endio and LRU list locks. */
1544 spin_lock_init(sc
->locks
+ i
);
1546 /* Initialize atomic variables. */
1547 atomic_set(&sc
->stripes
, 0);
1548 atomic_set(&sc
->stripes_to_set
, 0);
1549 atomic_set(&sc
->active_stripes
, 0);
1550 atomic_set(&sc
->active_stripes_max
, 0); /* REMOVEME: statistics. */
1553 * We need a runtime unique # to suffix the kmem cache name
1554 * because we'll have one for each active RAID set.
1556 md
= dm_table_get_md(rs
->ti
->table
);
1558 sprintf(sc
->kc
.name
, "%s-%d", TARGET
, disk
->first_minor
);
1560 sc
->kc
.cache
= kmem_cache_create(sc
->kc
.name
, stripe_size(rs
),
1565 /* Create memory cache client context for RAID stripe cache. */
1566 sc
->mem_cache_client
=
1567 dm_mem_cache_client_create(stripes
, rs
->set
.raid_devs
,
1568 chunk_pages(rs
->set
.io_size
));
1569 if (IS_ERR(sc
->mem_cache_client
))
1570 return PTR_ERR(sc
->mem_cache_client
);
1572 /* Create memory cache client context for RAID recovery stripe(s). */
1573 rstripes
= rec
->recovery_stripes
;
1574 rec
->mem_cache_client
=
1575 dm_mem_cache_client_create(rstripes
, rs
->set
.raid_devs
,
1576 chunk_pages(rec
->io_size
));
1577 if (IS_ERR(rec
->mem_cache_client
))
1578 return PTR_ERR(rec
->mem_cache_client
);
1580 /* Create dm-io client context for IO stripes. */
1582 dm_io_client_create();
1583 if (IS_ERR(sc
->dm_io_client
))
1584 return PTR_ERR(sc
->dm_io_client
);
1586 /* FIXME: intermingeled with stripe cache initialization. */
1587 /* Create dm-io client context for recovery stripes. */
1589 dm_io_client_create();
1590 if (IS_ERR(rec
->dm_io_client
))
1591 return PTR_ERR(rec
->dm_io_client
);
1593 /* Allocate stripes for set recovery. */
1594 while (rstripes
--) {
1595 stripe
= stripe_alloc(sc
, rec
->mem_cache_client
, SC_KEEP
);
1599 stripe
->recover
= kzalloc(sizeof(*stripe
->recover
), GFP_KERNEL
);
1600 if (!stripe
->recover
) {
1601 stripe_free(stripe
, rec
->mem_cache_client
);
1605 SetStripeRecover(stripe
);
1606 stripe
->io
.size
= rec
->io_size
;
1607 list_add_tail(stripe
->lists
+ LIST_RECOVER
, &rec
->stripes
);
1608 /* Don't add recovery stripes to LRU list! */
1612 * Allocate the stripe objetcs from the
1613 * cache and add them to the LRU list.
1615 r
= sc_grow(sc
, stripes
, SC_KEEP
);
1617 atomic_set(&sc
->stripes_last
, stripes
);
1622 /* Destroy the stripe cache. */
1623 static void sc_exit(struct stripe_cache
*sc
)
1625 struct raid_set
*rs
= RS(sc
);
1628 stripe_recover_free(rs
);
1629 BUG_ON(sc_shrink(sc
, atomic_read(&sc
->stripes
)));
1630 kmem_cache_destroy(sc
->kc
.cache
);
1631 sc
->kc
.cache
= NULL
;
1633 if (sc
->mem_cache_client
&& !IS_ERR(sc
->mem_cache_client
))
1634 dm_mem_cache_client_destroy(sc
->mem_cache_client
);
1636 if (sc
->dm_io_client
&& !IS_ERR(sc
->dm_io_client
))
1637 dm_io_client_destroy(sc
->dm_io_client
);
1639 hash_exit(&sc
->hash
);
1644 * Calculate RAID address
1646 * Delivers tuple with the index of the data disk holding the chunk
1647 * in the set, the parity disks index and the start of the stripe
1648 * within the address space of the set (used as the stripe cache hash key).
1651 static struct raid_address
*raid_address(struct raid_set
*rs
, sector_t sector
,
1652 struct raid_address
*addr
)
1654 sector_t stripe
, tmp
;
1657 * chunk_number = sector / chunk_size
1658 * stripe_number = chunk_number / data_devs
1659 * di = stripe % data_devs;
1661 stripe
= sector
>> rs
->set
.chunk_shift
;
1662 addr
->di
= sector_div(stripe
, rs
->set
.data_devs
);
1664 switch (rs
->set
.raid_type
->level
) {
1666 addr
->pi
= rs
->set
.pi
;
1667 goto check_shift_di
;
1670 addr
->pi
= sector_div(tmp
, rs
->set
.raid_devs
);
1672 switch (rs
->set
.raid_type
->algorithm
) {
1673 case left_asym
: /* Left asymmetric. */
1674 addr
->pi
= rs
->set
.data_devs
- addr
->pi
;
1675 case right_asym
: /* Right asymmetric. */
1677 if (addr
->di
>= addr
->pi
)
1680 case left_sym
: /* Left symmetric. */
1681 addr
->pi
= rs
->set
.data_devs
- addr
->pi
;
1682 case right_sym
: /* Right symmetric. */
1683 addr
->di
= (addr
->pi
+ addr
->di
+ 1) %
1686 case none
: /* Ain't happen: RAID4 algorithm placeholder. */
1692 * Start offset of the stripes chunk on any single device of the RAID
1693 * set, adjusted in case io size differs from chunk size.
1695 addr
->key
= (stripe
<< rs
->set
.chunk_shift
) +
1696 (sector
& rs
->set
.io_inv_mask
);
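/*
 * Illustrative only: a worked example of the address calculation above for an
 * assumed 5-device raid5_ls set (4 data + 1 rotating parity), chunk_size 64
 * sectors (chunk_shift 6), addressing set sector 1000:
 *
 *	chunk number  = 1000 >> 6             = 15
 *	di            = 15 % data_devs(4)     = 3,  stripe number = 15 / 4 = 3
 *	raw pi        = 3 % raid_devs(5)      = 3
 *	left symmetric: pi = data_devs - pi   = 1
 *	                di = (pi + di + 1) % raid_devs = 0
 *	key           = (3 << 6) + (1000 & io_inv_mask)
 *
 * So this chunk lives on device 0, parity for its stripe on device 1, and the
 * stripe cache key is the per-device start sector of the stripe's io chunk
 * (192 plus the io-aligned remainder).
 */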
1701 * Copy data across between stripe pages and bio vectors.
1703 * Pay attention to data alignment in stripe and bio pages.
1705 static void bio_copy_page_list(int rw
, struct stripe
*stripe
,
1706 struct page_list
*pl
, struct bio
*bio
)
1708 unsigned i
, page_offset
;
1710 struct raid_set
*rs
= RS(stripe
->sc
);
1713 /* Get start page in page list for this sector. */
1714 i
= (bio
->bi_sector
& rs
->set
.io_mask
) / SECTORS_PER_PAGE
;
1715 pl
= pl_elem(pl
, i
);
1719 page_addr
= page_address(pl
->page
);
1720 page_offset
= to_bytes(bio
->bi_sector
& (SECTORS_PER_PAGE
- 1));
1722 /* Walk all segments and copy data across between bio_vecs and pages. */
1723 bio_for_each_segment(bv
, bio
, i
) {
1724 int len
= bv
->bv_len
, size
;
1725 unsigned bio_offset
= 0;
1726 void *bio_addr
= __bio_kmap_atomic(bio
, i
, KM_USER0
);
1728 size
= (page_offset
+ len
> PAGE_SIZE
) ?
1729 PAGE_SIZE
- page_offset
: len
;
1732 memcpy(bio_addr
+ bio_offset
,
1733 page_addr
+ page_offset
, size
);
1735 memcpy(page_addr
+ page_offset
,
1736 bio_addr
+ bio_offset
, size
);
1738 page_offset
+= size
;
1739 if (page_offset
== PAGE_SIZE
) {
1741 * We reached the end of the chunk page ->
1742 * need to refer to the next one to copy more data.
1746 /* Get next page. */
1750 page_addr
= page_address(pl
->page
);
1753 /* REMOVEME: statistics. */
1754 atomic_inc(rs
->stats
+ S_BIO_COPY_PL_NEXT
);
1759 __bio_kunmap_atomic(bio_addr
, KM_USER0
);
/*
 * Xor optimization macros.
 */
/* Xor data pointer declaration and initialization macros. */
#define DECLARE_2	unsigned long *d0 = data[0], *d1 = data[1]
#define DECLARE_3	DECLARE_2, *d2 = data[2]
#define DECLARE_4	DECLARE_3, *d3 = data[3]
#define DECLARE_5	DECLARE_4, *d4 = data[4]
#define DECLARE_6	DECLARE_5, *d5 = data[5]
#define DECLARE_7	DECLARE_6, *d6 = data[6]
#define DECLARE_8	DECLARE_7, *d7 = data[7]

/* Xor unroll macros. */
#define D2(n)	d0[n] = d0[n] ^ d1[n]
#define D3(n)	D2(n) ^ d2[n]
#define D4(n)	D3(n) ^ d3[n]
#define D5(n)	D4(n) ^ d4[n]
#define D6(n)	D5(n) ^ d5[n]
#define D7(n)	D6(n) ^ d6[n]
#define D8(n)	D7(n) ^ d7[n]

#define	X_2(macro, offset)	macro(offset); macro(offset + 1);
#define	X_4(macro, offset)	X_2(macro, offset); X_2(macro, offset + 2);
#define	X_8(macro, offset)	X_4(macro, offset); X_4(macro, offset + 4);
#define	X_16(macro, offset)	X_8(macro, offset); X_8(macro, offset + 8);
#define	X_32(macro, offset)	X_16(macro, offset); X_16(macro, offset + 16);
#define	X_64(macro, offset)	X_32(macro, offset); X_32(macro, offset + 32);

/* Define a _xor_#chunks_#xors_per_run() function. */
#define	_XOR(chunks, xors_per_run) \
static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \
{ \
	unsigned end = XOR_SIZE / sizeof(data[0]), i; \
	DECLARE_ ## chunks; \
\
	for (i = 0; i < end; i += xors_per_run) { \
		X_ ## xors_per_run(D ## chunks, i); \
	} \
}

/* Define xor functions for 2 - 8 chunks and xors per run. */
#define	MAKE_XOR_PER_RUN(xors_per_run) \
	_XOR(2, xors_per_run); _XOR(3, xors_per_run); \
	_XOR(4, xors_per_run); _XOR(5, xors_per_run); \
	_XOR(6, xors_per_run); _XOR(7, xors_per_run); \
	_XOR(8, xors_per_run);

MAKE_XOR_PER_RUN(8)	/* Define _xor_*_8() functions. */
MAKE_XOR_PER_RUN(16)	/* Define _xor_*_16() functions. */
MAKE_XOR_PER_RUN(32)	/* Define _xor_*_32() functions. */
MAKE_XOR_PER_RUN(64)	/* Define _xor_*_64() functions. */
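/*
 * Illustrative only: what the generator macros above produce.  For instance
 * _XOR(3, 8) expands (roughly) to the function sketched here, which xors two
 * source chunks into the destination chunk d0, eight unsigned long words per
 * loop iteration, over XOR_SIZE (one page) worth of data.
 */
#if 0
static void _xor3_8_expanded(unsigned long **data)
{
	unsigned end = XOR_SIZE / sizeof(data[0]), i;
	unsigned long *d0 = data[0], *d1 = data[1], *d2 = data[2];

	for (i = 0; i < end; i += 8) {
		d0[i]     = d0[i]     ^ d1[i]     ^ d2[i];
		d0[i + 1] = d0[i + 1] ^ d1[i + 1] ^ d2[i + 1];
		d0[i + 2] = d0[i + 2] ^ d1[i + 2] ^ d2[i + 2];
		d0[i + 3] = d0[i + 3] ^ d1[i + 3] ^ d2[i + 3];
		d0[i + 4] = d0[i + 4] ^ d1[i + 4] ^ d2[i + 4];
		d0[i + 5] = d0[i + 5] ^ d1[i + 5] ^ d2[i + 5];
		d0[i + 6] = d0[i + 6] ^ d1[i + 6] ^ d2[i + 6];
		d0[i + 7] = d0[i + 7] ^ d1[i + 7] ^ d2[i + 7];
	}
}
#endif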
1815 #define MAKE_XOR(xors_per_run) \
1817 void (*f)(unsigned long **); \
1818 } static xor_funcs ## xors_per_run[] = { \
1819 { NULL }, /* NULL pointers to optimize indexing in xor(). */ \
1821 { _xor2_ ## xors_per_run }, \
1822 { _xor3_ ## xors_per_run }, \
1823 { _xor4_ ## xors_per_run }, \
1824 { _xor5_ ## xors_per_run }, \
1825 { _xor6_ ## xors_per_run }, \
1826 { _xor7_ ## xors_per_run }, \
1827 { _xor8_ ## xors_per_run }, \
1830 static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \
1832 /* Call respective function for amount of chunks. */ \
1833 xor_funcs ## xors_per_run[n].f(data); \
1836 /* Define xor_8() - xor_64 functions. */
1842 /* Maximum number of chunks, which can be xor'ed in one go. */
1843 #define XOR_CHUNKS_MAX (ARRAY_SIZE(xor_funcs8) - 1)
1845 static void xor_blocks_wrapper(unsigned n
, unsigned long **data
)
1847 BUG_ON(n
< 2 || n
> MAX_XOR_BLOCKS
+ 1);
1848 xor_blocks(n
- 1, XOR_SIZE
, (void *) data
[0], (void **) data
+ 1);
1854 } static xor_funcs
[] = {
1856 { xor_16
, "xor_16" },
1857 { xor_32
, "xor_32" },
1858 { xor_64
, "xor_64" },
1859 { xor_blocks_wrapper
, "xor_blocks" },
1863 * Check, if chunk has to be xored in/out:
1865 * o if writes are queued
1866 * o if writes are merged
1867 * o if stripe is to be reconstructed
1868 * o if recovery stripe
1870 static inline int chunk_must_xor(struct stripe_chunk
*chunk
)
1872 if (ChunkUptodate(chunk
)) {
1873 BUG_ON(!bio_list_empty(BL_CHUNK(chunk
, WRITE_QUEUED
)) &&
1874 !bio_list_empty(BL_CHUNK(chunk
, WRITE_MERGED
)));
1876 if (!bio_list_empty(BL_CHUNK(chunk
, WRITE_QUEUED
)) ||
1877 !bio_list_empty(BL_CHUNK(chunk
, WRITE_MERGED
)))
1880 if (StripeReconstruct(chunk
->stripe
) ||
1881 StripeRecover(chunk
->stripe
))
1891 * This indexes into the chunks of a stripe and their pages.
1893 * All chunks will be xored into the indexed (@pi)
1894 * chunk in maximum groups of xor.chunks.
1897 static void xor(struct stripe
*stripe
, unsigned pi
, unsigned sector
)
1899 struct raid_set
*rs
= RS(stripe
->sc
);
1900 unsigned max_chunks
= rs
->xor.chunks
, n
= 1,
1901 o
= sector
/ SECTORS_PER_PAGE
, /* Offset into the page_list. */
1902 p
= rs
->set
.raid_devs
;
1903 unsigned long **d
= rs
->data
;
1904 xor_function_t xor_f
= rs
->xor.f
->f
;
1906 BUG_ON(sector
> stripe
->io
.size
);
1908 /* Address of parity page to xor into. */
1909 d
[0] = page_address(pl_elem(PL(stripe
, pi
), o
)->page
);
1912 /* Preset pointers to data pages. */
1913 if (p
!= pi
&& chunk_must_xor(CHUNK(stripe
, p
)))
1914 d
[n
++] = page_address(pl_elem(PL(stripe
, p
), o
)->page
);
1916 /* If max chunks -> xor. */
1917 if (n
== max_chunks
) {
1923 /* If chunks -> xor. */
1928 /* Common xor loop through all stripe page lists. */
1929 static void common_xor(struct stripe
*stripe
, sector_t count
,
1930 unsigned off
, unsigned pi
)
1935 for (sector
= off
; sector
< count
; sector
+= SECTORS_PER_PAGE
)
1936 xor(stripe
, pi
, sector
);
1938 /* Set parity page uptodate and clean. */
1939 chunk_set(CHUNK(stripe
, pi
), CLEAN
);
1940 atomic_inc(RS(stripe
->sc
)->stats
+ S_XORS
); /* REMOVEME: statistics. */
1944 * Calculate parity sectors on intact stripes.
1946 * Need to calculate raid address for recover stripe, because its
1947 * chunk sizes differs and is typically larger than io chunk size.
1949 static void parity_xor(struct stripe
*stripe
)
1951 struct raid_set
*rs
= RS(stripe
->sc
);
1952 unsigned chunk_size
= rs
->set
.chunk_size
, io_size
= stripe
->io
.size
,
1953 xor_size
= chunk_size
> io_size
? io_size
: chunk_size
;
1956 /* This can be the recover stripe with a larger io size. */
1957 for (off
= 0; off
< io_size
; off
+= xor_size
) {
1959 * Recover stripe is likely bigger than regular io
1960 * ones and has no precalculated parity disk index ->
1961 * need to calculate RAID address.
1963 if (unlikely(StripeRecover(stripe
))) {
1964 struct raid_address addr
;
1967 (stripe
->key
+ off
) * rs
->set
.data_devs
,
1969 stripe
->idx
.parity
= addr
.pi
;
1970 stripe_zero_pl_part(stripe
, addr
.pi
, off
, xor_size
);
1973 common_xor(stripe
, xor_size
, off
, stripe
->idx
.parity
);
1974 chunk_set(CHUNK(stripe
, stripe
->idx
.parity
), DIRTY
);
1978 /* Reconstruct missing chunk. */
1979 static void stripe_reconstruct(struct stripe
*stripe
)
1981 struct raid_set
*rs
= RS(stripe
->sc
);
1982 int p
= rs
->set
.raid_devs
, pr
= stripe
->idx
.recover
;
1986 /* Check if all but the chunk to be reconstructed are uptodate. */
1988 BUG_ON(p
!= pr
&& !ChunkUptodate(CHUNK(stripe
, p
)));
1990 /* REMOVEME: statistics. */
1991 atomic_inc(rs
->stats
+ (RSDegraded(rs
) ? S_RECONSTRUCT_EI
:
1992 S_RECONSTRUCT_DEV
));
1993 /* Zero chunk to be reconstructed. */
1994 stripe_zero_chunk(stripe
, pr
);
1995 common_xor(stripe
, stripe
->io
.size
, 0, pr
);
1996 stripe
->idx
.recover
= -1;
2000 * Recovery io throttling
2002 /* Conditionally reset io counters. */
2003 static int recover_io_reset(struct raid_set
*rs
)
2005 unsigned long j
= jiffies
;
2007 /* Pay attention to jiffies overflows. */
2008 if (j
> rs
->recover
.last_jiffies
+ HZ
/ 20 ||
2009 j
< rs
->recover
.last_jiffies
) {
2010 atomic_set(rs
->recover
.io_count
+ IO_WORK
, 0);
2011 atomic_set(rs
->recover
.io_count
+ IO_RECOVER
, 0);
2012 rs
->recover
.last_jiffies
= j
;
2020 static void recover_io_count(struct stripe
*stripe
)
2022 struct raid_set
*rs
= RS(stripe
->sc
);
2024 recover_io_reset(rs
);
2025 atomic_inc(rs
->recover
.io_count
+
2026 (StripeRecover(stripe
) ? IO_RECOVER
: IO_WORK
));
2029 /* Try getting a stripe either from the hash or from the LRU list. */
2030 static struct stripe
*stripe_find(struct raid_set
*rs
,
2031 struct raid_address
*addr
)
2034 struct stripe_cache
*sc
= &rs
->sc
;
2035 struct stripe
*stripe
;
2037 /* Try stripe from hash. */
2038 stripe
= stripe_lookup(sc
, addr
->key
);
2040 r
= stripe_get(stripe
);
2042 goto get_lock_failed
;
2044 atomic_inc(rs
->stats
+ S_HITS_1ST
); /* REMOVEME: statistics. */
2046 /* Not in hash -> try to get an LRU stripe. */
2047 stripe
= stripe_lru_pop(sc
);
2050 * An LRU stripe may not be referenced
2051 * and may never have ios pending!
2053 BUG_ON(stripe_ref(stripe
));
2054 BUG_ON(stripe_io_ref(stripe
));
2056 /* Remove from hash if on before reuse. */
2057 stripe_hash_del(stripe
);
2059 /* Invalidate before reinserting with changed key. */
2060 stripe_invalidate(stripe
);
2062 stripe
->key
= addr
->key
;
2063 stripe
->region
= dm_rh_sector_to_region(rs
->recover
.rh
,
2065 stripe
->idx
.parity
= addr
->pi
;
2066 r
= stripe_get(stripe
);
2068 goto get_lock_failed
;
2070 /* Insert stripe into the stripe hash. */
2071 stripe_insert(&sc
->hash
, stripe
);
2072 /* REMOVEME: statistics. */
2073 atomic_inc(rs
->stats
+ S_INSCACHE
);
2087 * I need to do it here because I can't in interrupt
2089 /* End io all bios on a bio list. */
2090 static void bio_list_endio(struct stripe
*stripe
, struct bio_list
*bl
,
2093 struct raid_set
*rs
= RS(stripe
->sc
);
2095 struct page_list
*pl
= PL(stripe
, p
);
2096 struct stripe_chunk
*chunk
= CHUNK(stripe
, p
);
2098 /* Update region counters. */
2099 while ((bio
= bio_list_pop(bl
))) {
2100 if (bio_data_dir(bio
) == WRITE
)
2101 /* Drop io pending count for any writes. */
2102 dm_rh_dec(rs
->recover
.rh
, stripe
->region
);
2104 /* Copy data accross. */
2105 bio_copy_page_list(READ
, stripe
, pl
, bio
);
2107 bio_endio(bio
, error
);
2109 /* REMOVEME: statistics. */
2110 atomic_inc(rs
->stats
+ (bio_data_dir(bio
) == READ
?
2111 S_BIOS_ENDIO_READ
: S_BIOS_ENDIO_WRITE
));
2115 io_put(rs
); /* Wake any suspend waiters on last bio. */
2120 * End io all reads/writes on a stripe copying
2121 * read data accross from stripe to bios and
2122 * decrementing region counters for writes.
2124 * Processing of ios depeding on state:
2125 * o no chunk error -> endio ok
2127 * - chunk error and read -> ignore to be requeued
2128 * - chunk error and write -> endio ok
2129 * o dead (more than parity_devs failed) and chunk_error-> endio failed
2131 static void stripe_endio(int rw
, struct stripe
*stripe
)
2133 struct raid_set
*rs
= RS(stripe
->sc
);
2134 unsigned p
= rs
->set
.raid_devs
;
2135 int write
= (rw
!= READ
);
2138 struct stripe_chunk
*chunk
= CHUNK(stripe
, p
);
2139 struct bio_list
*bl
;
2141 BUG_ON(ChunkLocked(chunk
));
2143 bl
= BL_CHUNK(chunk
, rw
);
2144 if (bio_list_empty(bl
))
2147 if (unlikely(ChunkError(chunk
) || !ChunkUptodate(chunk
))) {
2148 /* RAID set dead. */
2149 if (unlikely(RSDead(rs
)))
2150 bio_list_endio(stripe
, bl
, p
, -EIO
);
2151 /* RAID set degraded. */
2153 bio_list_endio(stripe
, bl
, p
, 0);
2155 BUG_ON(!RSDegraded(rs
) && ChunkDirty(chunk
));
2156 bio_list_endio(stripe
, bl
, p
, 0);
2161 /* Fail all ios hanging off all bio lists of a stripe. */
2162 static void stripe_fail_io(struct stripe
*stripe
)
2164 struct raid_set
*rs
= RS(stripe
->sc
);
2165 unsigned p
= rs
->set
.raid_devs
;
2168 struct stripe_chunk
*chunk
= CHUNK(stripe
, p
);
2169 int i
= ARRAY_SIZE(chunk
->bl
);
2171 /* Fail all bios on all bio lists of the stripe. */
2173 struct bio_list
*bl
= chunk
->bl
+ i
;
2175 if (!bio_list_empty(bl
))
2176 bio_list_endio(stripe
, bl
, p
, -EIO
);
2180 /* Put stripe on LRU list. */
2181 BUG_ON(stripe_io_ref(stripe
));
2182 BUG_ON(stripe_ref(stripe
));
2185 /* Unlock all required chunks. */
2186 static void stripe_chunks_unlock(struct stripe
*stripe
)
2188 unsigned p
= RS(stripe
->sc
)->set
.raid_devs
;
2189 struct stripe_chunk
*chunk
;
2192 chunk
= CHUNK(stripe
, p
);
2194 if (TestClearChunkUnlock(chunk
))
2195 ClearChunkLocked(chunk
);
/*
 * Queue reads and writes to a stripe by hanging
 * their bios off the stripe set's read/write lists.
 */
static int stripe_queue_bio(struct raid_set *rs, struct bio *bio,
			    struct bio_list *reject)
{
	struct raid_address addr;
	struct stripe *stripe;

	stripe = stripe_find(rs, raid_address(rs, bio->bi_sector, &addr));
	if (stripe) {
		int r = 0, rw = bio_data_dir(bio);

		/* Distinguish reads and writes. */
		bio_list_add(BL(stripe, addr.di, rw), bio);

		if (rw == READ)
			/* REMOVEME: statistics. */
			atomic_inc(rs->stats + S_BIOS_ADDED_READ);
		else {
			/* Increment pending write count on region. */
			dm_rh_inc(rs->recover.rh, stripe->region);

			/* REMOVEME: statistics. */
			atomic_inc(rs->stats + S_BIOS_ADDED_WRITE);
		}

		/*
		 * Put on io (flush) list in case of
		 * initial bio queued to chunk.
		 */
		if (chunk_get(CHUNK(stripe, addr.di)) == 1)
			stripe_flush_add(stripe);

		return r;
	}

	/* Got no stripe from cache or failed to lock it -> reject bio. */
	bio_list_add(reject, bio);
	atomic_inc(rs->stats + S_IOS_POST);	/* REMOVEME: statistics. */
	return 0;
}
/*
 * Handle all stripes by handing them to the daemon, because we can't
 * map their chunk pages to copy the data in interrupt context.
 *
 * We don't want to handle them here either, while interrupts are disabled.
 */

/* Read/write endio function for dm-io (interrupt context). */
static void endio(unsigned long error, void *context)
{
	struct stripe_chunk *chunk = context;

	if (unlikely(error)) {
		chunk_set(chunk, ERROR);
		/* REMOVEME: statistics. */
		atomic_inc(RS(chunk->stripe->sc)->stats + S_STRIPE_ERROR);
	} else
		chunk_set(chunk, CLEAN);

	/*
	 * For recovery stripes, I need to reset the chunk lock
	 * here, because those aren't processed in do_endios().
	 */
	if (unlikely(StripeRecover(chunk->stripe)))
		ClearChunkLocked(chunk);
	else
		SetChunkUnlock(chunk);

	/* Indirectly puts stripe on cache's endio list via stripe_io_put(). */
	stripe_put_references(chunk->stripe);
}
/* Read/write a chunk asynchronously. */
static void stripe_chunk_rw(struct stripe *stripe, unsigned p)
{
	struct stripe_cache *sc = stripe->sc;
	struct raid_set *rs = RS(sc);
	struct dm_mem_cache_object *obj = stripe->obj + p;
	struct page_list *pl = obj->pl;
	struct stripe_chunk *chunk = CHUNK(stripe, p);
	struct raid_dev *dev = rs->dev + p;
	struct dm_io_region io = {
		.bdev = dev->dev->bdev,
		.sector = stripe->key,
		.count = stripe->io.size,
	};
	struct dm_io_request control = {
		.bi_rw = ChunkDirty(chunk) ? WRITE : READ,
		.mem = {
			.type = DM_IO_PAGE_LIST,
			.ptr.pl = pl,
		},
		.notify = {
			.fn = endio,
			.context = chunk,
		},
		.client = StripeRecover(stripe) ? rs->recover.dm_io_client :
						  sc->dm_io_client,
	};

	BUG_ON(ChunkLocked(chunk));
	BUG_ON(!ChunkUptodate(chunk) && ChunkDirty(chunk));
	BUG_ON(ChunkUptodate(chunk) && !ChunkDirty(chunk));

	/*
	 * Don't rw past end of device, which can happen, because
	 * typically sectors_per_dev isn't divisible by io_size.
	 */
	if (unlikely(io.sector + io.count > rs->set.sectors_per_dev))
		io.count = rs->set.sectors_per_dev - io.sector;

	io.sector += dev->start;	/* Add <offset>. */
	recover_io_count(stripe);	/* Recovery io accounting. */

	/* REMOVEME: statistics. */
	atomic_inc(rs->stats + (ChunkDirty(chunk) ? S_DM_IO_WRITE :
						    S_DM_IO_READ));
	SetChunkLocked(chunk);
	SetDevIoQueued(dev);
	BUG_ON(dm_io(&control, 1, &io, NULL));
}
/*
 * Write dirty or read not uptodate page lists of a stripe.
 */
static int stripe_chunks_rw(struct stripe *stripe)
{
	int r;
	struct raid_set *rs = RS(stripe->sc);

	/*
	 * Increment the pending count on the stripe
	 * first, so that we don't race in endio().
	 *
	 * An inc (IO) is needed for any chunk unless !ChunkIo(chunk):
	 *
	 * o dirtied by writes merged
	 * o dirtied by parity calculations
	 */
	r = for_each_io_dev(stripe, stripe_get_references);
	if (r) {
		/* Io needed: chunks are either not uptodate or dirty. */
		int max;	/* REMOVEME: */
		struct stripe_cache *sc = &rs->sc;

		/* Submit actual io. */
		for_each_io_dev(stripe, stripe_chunk_rw);

		/* REMOVEME: statistics */
		max = sc_active(sc);
		if (atomic_read(&sc->active_stripes_max) < max)
			atomic_set(&sc->active_stripes_max, max);

		atomic_inc(rs->stats + S_FLUSHS);
		/* END REMOVEME: statistics */
	}

	return r;
}
/* Merge in all writes hence dirtying respective chunks. */
static void stripe_merge_writes(struct stripe *stripe)
{
	unsigned p = RS(stripe->sc)->set.raid_devs;

	while (p--) {
		struct stripe_chunk *chunk = CHUNK(stripe, p);
		struct bio_list *write = BL_CHUNK(chunk, WRITE_QUEUED);

		if (!bio_list_empty(write)) {
			struct bio *bio;
			struct page_list *pl = stripe->obj[p].pl;

			/*
			 * We can play with the lists without holding a lock,
			 * because it is just us accessing them anyway.
			 */
			bio_list_for_each(bio, write)
				bio_copy_page_list(WRITE, stripe, pl, bio);

			bio_list_merge(BL_CHUNK(chunk, WRITE_MERGED), write);
			bio_list_init(write);
			chunk_set(chunk, DIRTY);
		}
	}
}
/* Queue all writes to get merged. */
static int stripe_queue_writes(struct stripe *stripe)
{
	int r = 0;
	unsigned p = RS(stripe->sc)->set.raid_devs;

	while (p--) {
		struct stripe_chunk *chunk = CHUNK(stripe, p);
		struct bio_list *write = BL_CHUNK(chunk, WRITE);

		if (!bio_list_empty(write)) {
			bio_list_merge(BL_CHUNK(chunk, WRITE_QUEUED), write);
			bio_list_init(write);
			r = 1;
		}
	}

	return r;
}
/* Check, if a chunk gets completely overwritten. */
static int stripe_check_chunk_overwrite(struct stripe *stripe, unsigned p)
{
	unsigned sectors = 0;
	struct bio *bio;
	struct bio_list *bl = BL(stripe, p, WRITE_QUEUED);

	bio_list_for_each(bio, bl)
		sectors += bio_sectors(bio);

	BUG_ON(sectors > RS(stripe->sc)->set.io_size);
	return sectors == RS(stripe->sc)->set.io_size;
}
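
/*
 * Example (illustrative numbers): with set.io_size of 8 sectors, a chunk
 * counts as completely overwritten when its queued writes sum to 8 sectors,
 * e.g. one 8-sector bio or two 4-sector bios; such a chunk need not be
 * read in before the writes are merged.
 */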
/*
 * Avoid io on broken/reconstructed drive in order to
 * reconstruct data on endio.
 *
 * (*1*) We set StripeReconstruct() in here, so that _do_endios()
 *	 will trigger a reconstruct call before resetting it.
 */
static int stripe_chunk_set_io_flags(struct stripe *stripe, int pr)
{
	struct stripe_chunk *chunk = CHUNK(stripe, pr);

	/*
	 * Allow io on all chunks but the indexed one,
	 * because we're either degraded or prohibit it
	 * on the one for later reconstruction.
	 */
	/* Includes ClearChunkIo(), ClearChunkUptodate(). */
	stripe_chunk_invalidate(chunk);
	stripe->idx.recover = pr;
	SetStripeReconstruct(stripe);

	/* REMOVEME: statistics. */
	atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
	return -EPERM;
}
/* Chunk locked/uptodate and device failed tests. */
static struct stripe_chunk *
stripe_chunk_check(struct stripe *stripe, unsigned p, unsigned *chunks_uptodate)
{
	struct raid_set *rs = RS(stripe->sc);
	struct stripe_chunk *chunk = CHUNK(stripe, p);

	/* Can't access active chunks. */
	if (ChunkLocked(chunk)) {
		/* REMOVEME: statistics. */
		atomic_inc(rs->stats + S_CHUNK_LOCKED);
		return NULL;
	}

	/* Can't access broken device. */
	if (ChunkError(chunk) || DevFailed(rs->dev + p))
		return NULL;

	/* Can access uptodate chunks. */
	if (ChunkUptodate(chunk)) {
		(*chunks_uptodate)++;
		return NULL;
	}

	return chunk;
}
/*
 * Degraded/reconstruction mode.
 *
 * Check stripe state to figure which chunks don't need IO.
 *
 * Returns 0 for fully operational, -EPERM for degraded/resynchronizing.
 */
static int stripe_check_reconstruct(struct stripe *stripe)
{
	struct raid_set *rs = RS(stripe->sc);

	if (RSDead(rs)) {
		ClearStripeReconstruct(stripe);
		ClearStripeReconstructed(stripe);
		stripe_allow_io(stripe);
		return 0;
	}

	/* Avoid further reconstruction setting, when already set. */
	if (StripeReconstruct(stripe)) {
		/* REMOVEME: statistics. */
		atomic_inc(rs->stats + S_RECONSTRUCT_SET);
		return -EBUSY;
	}

	/* Initially allow io on all chunks. */
	stripe_allow_io(stripe);

	/* Return if stripe is already reconstructed. */
	if (StripeReconstructed(stripe)) {
		atomic_inc(rs->stats + S_RECONSTRUCTED);
		return 0;
	}

	/*
	 * Degraded/reconstruction mode (device failed) ->
	 * avoid io on the failed device.
	 */
	if (unlikely(RSDegraded(rs))) {
		/* REMOVEME: statistics. */
		atomic_inc(rs->stats + S_DEGRADED);
		/* Allow IO on all devices but the dead one. */
		BUG_ON(rs->set.ei < 0);
		return stripe_chunk_set_io_flags(stripe, rs->set.ei);
	} else {
		int sync, pi = dev_for_parity(stripe, &sync);

		/*
		 * Reconstruction mode (i.e. a particular (replaced) device or
		 * some (rotating) parity chunk is being resynchronized) ->
		 * o make sure all needed chunks are read in
		 * o writes are allowed to go through
		 */
		if (!sync) {
			/* REMOVEME: statistics. */
			atomic_inc(rs->stats + S_NOSYNC);
			/* Allow IO on all devs but the one to reconstruct. */
			return stripe_chunk_set_io_flags(stripe, pi);
		}
	}

	return 0;
}
/*
 * Check, if stripe is ready to merge writes.
 * I.e. if all chunks are present to allow bios to be merged.
 *
 * We prohibit io on:
 *
 * o chunks without bios
 * o chunks which get completely written over
 */
static int stripe_merge_possible(struct stripe *stripe, int nosync)
{
	struct raid_set *rs = RS(stripe->sc);
	unsigned chunks_overwrite = 0, chunks_prohibited = 0,
		 chunks_uptodate = 0, p = rs->set.raid_devs;

	/* Walk all chunks. */
	while (p--) {
		struct stripe_chunk *chunk;

		/* Prohibit io on broken devices. */
		if (DevFailed(rs->dev + p)) {
			chunk = CHUNK(stripe, p);
			goto prohibit_io;
		}

		/* We can't optimize any further if no chunk. */
		chunk = stripe_chunk_check(stripe, p, &chunks_uptodate);
		if (!chunk || nosync)
			continue;

		/*
		 * We have a chunk, which is not uptodate.
		 *
		 * If this is not parity and we don't have
		 * reads queued, we can optimize further.
		 */
		if (p != stripe->idx.parity &&
		    bio_list_empty(BL_CHUNK(chunk, READ)) &&
		    bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED))) {
			if (bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)))
				goto prohibit_io;
			else if (RSCheckOverwrite(rs) &&
				 stripe_check_chunk_overwrite(stripe, p))
				/* Completely overwritten chunk. */
				chunks_overwrite++;
		}

		/* Allow io for chunks with bios and overwritten ones. */
		continue;

prohibit_io:
		/* No io for broken devices or for chunks w/o bios. */
		ClearChunkIo(chunk);
		chunks_prohibited++;
		/* REMOVEME: statistics. */
		atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
	}

	/* All data chunks will get written over. */
	if (chunks_overwrite == rs->set.data_devs)
		atomic_inc(rs->stats + S_OVERWRITE); /* REMOVEME: statistics. */
	else if (chunks_uptodate + chunks_prohibited < rs->set.raid_devs) {
		/* We don't have enough chunks to merge. */
		atomic_inc(rs->stats + S_CANT_MERGE); /* REMOVEME: statistics. */
		return -EPERM;
	}

	/*
	 * If we have all chunks up to date or overwrite them, we
	 * just zero the parity chunk and let stripe_rw() recreate it.
	 */
	if (chunks_uptodate == rs->set.raid_devs ||
	    chunks_overwrite == rs->set.data_devs) {
		stripe_zero_chunk(stripe, stripe->idx.parity);
		BUG_ON(StripeReconstruct(stripe));
		SetStripeReconstruct(stripe);	/* Enforce xor in caller. */
	}

	/*
	 * With less chunks, we xor parity out.
	 *
	 * (*4*) We rely on !StripeReconstruct() in chunk_must_xor(),
	 *	 so that only chunks with queued or merged writes
	 *	 are being xored.
	 */

	/*
	 * We do have enough chunks to merge.
	 * All chunks are uptodate or get written over.
	 */
	atomic_inc(rs->stats + S_CAN_MERGE); /* REMOVEME: statistics. */
	return 0;
}
/*
 * Avoid reading chunks in case we're fully operational.
 *
 * We prohibit io on any chunks without bios but the parity chunk.
 */
static void stripe_avoid_reads(struct stripe *stripe)
{
	struct raid_set *rs = RS(stripe->sc);
	unsigned dummy = 0, p = rs->set.raid_devs;

	/* Walk all chunks. */
	while (p--) {
		struct stripe_chunk *chunk =
			stripe_chunk_check(stripe, p, &dummy);

		if (!chunk)
			continue;

		/* If parity or any bios pending -> allow io. */
		if (chunk_ref(chunk) || p == stripe->idx.parity)
			continue;

		ClearChunkIo(chunk);
		/* REMOVEME: statistics. */
		atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
	}
}
/*
 * Read/write a stripe.
 *
 * All stripe read/write activity goes through this function
 * unless recovery, which has to call stripe_chunk_rw() directly.
 *
 * Make sure we don't try already merged stripes in order
 * to avoid data corruption.
 *
 * Check the state of the RAID set and if degraded (or
 * resynchronizing for reads), read in all other chunks but
 * the one on the dead/resynchronizing device in order to be
 * able to reconstruct the missing one in _do_endios().
 *
 * Can be called on active stripes in order
 * to dispatch new io on inactive chunks.
 *
 * o stripe to read and/or write
 * o stripe with error to reconstruct
 */
static void stripe_rw(struct stripe *stripe)
{
	int nosync, r;
	struct raid_set *rs = RS(stripe->sc);

	/*
	 * Check, if a chunk needs to be reconstructed
	 * because of a degraded set or a region out of sync.
	 */
	nosync = stripe_check_reconstruct(stripe);
	if (nosync == -EBUSY)
		return; /* Wait for stripe reconstruction to finish. */

	/*
	 * If we don't have merged writes pending, we can schedule
	 * queued writes to be merged next without corrupting data.
	 */
	if (!StripeMerged(stripe)) {
		r = stripe_queue_writes(stripe);
		if (r)
			/* Writes got queued -> flag RBW. */
			SetStripeRBW(stripe);
	}

	/*
	 * Merge all writes hanging off uptodate/overwritten
	 * chunks of the stripe.
	 */
	if (StripeRBW(stripe)) {
		r = stripe_merge_possible(stripe, nosync);
		if (!r) { /* Merge possible. */
			struct stripe_chunk *chunk;

			/*
			 * I rely on valid parity in order
			 * to xor a fraction of chunks out
			 * of parity and back in.
			 */
			stripe_merge_writes(stripe);	/* Merge writes in. */
			parity_xor(stripe);		/* Update parity. */
			ClearStripeReconstruct(stripe);	/* Reset xor enforce. */
			SetStripeMerged(stripe);	/* Writes merged. */
			ClearStripeRBW(stripe);		/* Disable RBW. */

			/*
			 * REMOVEME: sanity check on parity chunk
			 *	     states after writes got merged.
			 */
			chunk = CHUNK(stripe, stripe->idx.parity);
			BUG_ON(ChunkLocked(chunk));
			BUG_ON(!ChunkUptodate(chunk));
			BUG_ON(!ChunkDirty(chunk));
			BUG_ON(!ChunkIo(chunk));
		}
	} else if (!nosync && !StripeMerged(stripe))
		/* Read avoidance if not degraded/resynchronizing/merged. */
		stripe_avoid_reads(stripe);

	/* Now submit any reads/writes for non-uptodate or dirty chunks. */
	r = stripe_chunks_rw(stripe);
	if (!r) {
		/*
		 * No io submitted because of chunk io
		 * prohibited or locked chunks/failed devices
		 * -> push to end io list for processing.
		 */
		stripe_endio_push(stripe);
		atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */
	}
}
/*
 * Recovery functions
 */
/* Read a stripe off a raid set for recovery. */
static int stripe_recover_read(struct stripe *stripe, int pi)
{
	BUG_ON(stripe_io_ref(stripe));

	/* Invalidate all chunks so that they get read in. */
	stripe_chunks_invalidate(stripe);
	stripe_allow_io(stripe); /* Allow io on all recovery chunks. */

	/*
	 * If we are reconstructing a particular device, we can avoid
	 * reading the respective chunk in, because we're going to
	 * reconstruct it anyway.
	 *
	 * We can't do that for resynchronization of rotating parity,
	 * because the recovery stripe chunk size is typically larger
	 * than the set's chunk size.
	 */
	if (pi > -1)
		ClearChunkIo(CHUNK(stripe, pi));

	return stripe_chunks_rw(stripe);
}

/* Write a stripe to a raid set for recovery. */
static int stripe_recover_write(struct stripe *stripe, int pi)
{
	BUG_ON(stripe_io_ref(stripe));

	/*
	 * If this is a reconstruct of a particular device, then
	 * reconstruct the respective chunk, else create parity chunk.
	 */
	if (pi > -1) {
		stripe_zero_chunk(stripe, pi);
		common_xor(stripe, stripe->io.size, 0, pi);
		chunk_set(CHUNK(stripe, pi), DIRTY);
	} else
		parity_xor(stripe);

	return stripe_chunks_rw(stripe);
}
/* Read/write a recovery stripe. */
static int stripe_recover_rw(struct stripe *stripe)
{
	int r = 0, sync = 0;

	/* Read/write flip-flop. */
	if (TestClearStripeRBW(stripe)) {
		SetStripeMerged(stripe);
		stripe->key = stripe->recover->pos;
		r = stripe_recover_read(stripe, dev_for_parity(stripe, &sync));
	} else if (TestClearStripeMerged(stripe))
		r = stripe_recover_write(stripe, dev_for_parity(stripe, &sync));

	return r;
}
/* Recover bandwidth available? */
static int recover_bandwidth(struct raid_set *rs)
{
	int r, work;

	/* On reset or when bios delayed -> allow recovery. */
	r = recover_io_reset(rs);
	if (r || RSBandwidth(rs))
		goto out;

	work = atomic_read(rs->recover.io_count + IO_WORK);
	if (work) {
		/* Pay attention to larger recover stripe size. */
		int recover = atomic_read(rs->recover.io_count + IO_RECOVER) *
			      rs->recover.io_size / rs->set.io_size;

		/*
		 * Don't use more than given bandwidth
		 * of the work io for recovery.
		 */
		if (recover > work / rs->recover.bandwidth_work) {
			/* REMOVEME: statistics. */
			atomic_inc(rs->stats + S_NO_BANDWIDTH);
			return 0;
		}
	}

out:
	atomic_inc(rs->stats + S_BANDWIDTH); /* REMOVEME: statistics. */
	return 1;
}
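
/*
 * Worked example of the throttle above (illustrative numbers):
 * recover_set_bandwidth() stores bandwidth_work = 100 / bandwidth, so a
 * bandwidth of 10% gives bandwidth_work = 10. With work = 1000 application
 * ios counted and recovery ios scaled by recover.io_size / set.io_size,
 * further recovery io is allowed only while the scaled recovery count
 * stays at or below 1000 / 10 = 100.
 */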
/* Try to get a region to recover. */
static int stripe_recover_get_region(struct stripe *stripe)
{
	struct raid_set *rs = RS(stripe->sc);
	struct recover *rec = &rs->recover;
	struct recover_addr *addr = stripe->recover;
	struct dm_dirty_log *dl = rec->dl;
	struct dm_rh_client *rh = rec->rh;

	/* Return, that we have a region first to finish it during suspension. */
	if (addr->reg)
		return 1;

	if (dl->type->get_sync_count(dl) >= rec->nr_regions)
		return -ENOENT;

	/* If we don't have enough bandwidth, we don't proceed recovering. */
	if (!recover_bandwidth(rs))
		return -EAGAIN;

	/* Start quiescing a region. */
	dm_rh_recovery_prepare(rh);
	addr->reg = dm_rh_recovery_start(rh);
	if (!addr->reg)
		return -EAGAIN;

	addr->pos = dm_rh_region_to_sector(rh, dm_rh_get_region_key(addr->reg));
	addr->end = addr->pos + dm_rh_get_region_size(rh);

	/*
	 * Take one global io reference out for the
	 * whole region, which is going to be released
	 * when the region is completely done with.
	 */
	io_get(rs);
	return 0;
}
/* Update region hash state. */
enum recover_type { REC_FAILURE = 0, REC_SUCCESS = 1 };
static void recover_rh_update(struct stripe *stripe, enum recover_type success)
{
	struct recover_addr *addr = stripe->recover;
	struct raid_set *rs = RS(stripe->sc);
	struct recover *rec = &rs->recover;

	if (!addr->reg) {
		DMERR("%s- Called w/o region", __func__);
		return;
	}

	dm_rh_recovery_end(addr->reg, success);
	if (success)
		rec->nr_regions_recovered++;

	addr->reg = NULL;

	/*
	 * Completely done with this region ->
	 * release the 1st io reference.
	 */
	io_put(rs);
}
/* Set start of recovery state. */
static void set_start_recovery(struct raid_set *rs)
{
	/* Initialize recovery. */
	rs->recover.start_jiffies = jiffies;
	rs->recover.end_jiffies = 0;
}

/* Set end of recovery state. */
static void set_end_recovery(struct raid_set *rs)
{
	rs->set.dev_to_init = -1;

	/* Check for jiffies overrun. */
	rs->recover.end_jiffies = jiffies;
	if (rs->recover.end_jiffies < rs->recover.start_jiffies)
		rs->recover.end_jiffies = ~0;
}
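
/*
 * Note: end_jiffies < start_jiffies can only happen when the jiffies
 * counter wraps during a long recovery; clamping to ~0 merely flags the
 * recovery duration as unknown instead of yielding a negative interval.
 */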
/* Handle recovery on one recovery stripe. */
static int _do_recovery(struct stripe *stripe)
{
	int r;
	struct raid_set *rs = RS(stripe->sc);
	struct recover_addr *addr = stripe->recover;

	/* If recovery is active -> return. */
	if (stripe_io_ref(stripe))
		return 1;

	/* IO error is fatal for recovery -> stop it. */
	if (unlikely(StripeError(stripe)))
		goto err;

	/* Recovery end required. */
	if (unlikely(RSDegraded(rs)))
		goto err;

	/* Get a region to recover. */
	r = stripe_recover_get_region(stripe);
	switch (r) {
	case 0: /* Got a new region: flag initial read before write. */
		SetStripeRBW(stripe);
	case 1: /* Have a region in the works. */
		break;

	case -EAGAIN:
		/* No bandwidth/quiesced region yet, try later. */
		wake_do_raid_delayed(rs, HZ / 4);
		return 0;

	case -ENOENT: /* No more regions to recover. */
		schedule_work(&rs->io.ws_do_table_event);
		return 0;

	default:
		BUG();
	}

	/* Read/write a recover stripe. */
	r = stripe_recover_rw(stripe);
	if (r)
		return 1; /* IO initiated; wait for endio. */

	/* Read and write finished -> update recovery position within region. */
	addr->pos += stripe->io.size;

	/* If we're at end of region, update region hash. */
	if (addr->pos >= addr->end ||
	    addr->pos >= rs->set.sectors_per_dev)
		recover_rh_update(stripe, REC_SUCCESS);
	else
		/* Prepare to read next region segment. */
		SetStripeRBW(stripe);

	/* Schedule myself for another round... */
	wake_do_raid(rs);
	return 0;

err:
	/* FIXME: rather try recovering other regions on error? */
	rs_check_degrade(stripe);
	recover_rh_update(stripe, REC_FAILURE);

	/* Check state of partially recovered array. */
	if (RSDegraded(rs) && !RSDead(rs) &&
	    rs->set.dev_to_init != -1 &&
	    rs->set.ei != rs->set.dev_to_init)
		/* Broken drive != drive to recover -> FATAL. */
		SetRSDead(rs);

	if (StripeError(stripe)) {
		char buf[BDEVNAME_SIZE];

		DMERR("stopping recovery due to "
		      "ERROR on /dev/%s, stripe at offset %llu",
		      bdevname(rs->dev[rs->set.ei].dev->bdev, buf),
		      (unsigned long long) stripe->key);
	}

	/* Make sure, that all quiesced regions get released. */
	while (addr->reg) {
		dm_rh_recovery_end(addr->reg, -EIO);
		addr->reg = dm_rh_recovery_start(rs->recover.rh);
	}

	return 0;
}
/* Called by main io daemon to recover regions. */
static void do_recovery(struct raid_set *rs)
{
	if (RSRecover(rs)) {
		int r = 0;
		struct stripe *stripe;

		list_for_each_entry(stripe, &rs->recover.stripes,
				    lists[LIST_RECOVER])
			r += _do_recovery(stripe);

		if (!r) {
			set_end_recovery(rs);
			stripe_recover_free(rs);
		}
	}
}

/*
 * END recovery functions
 */
/* End io process all stripes handed in by endio() callback. */
static void _do_endios(struct raid_set *rs, struct stripe *stripe,
		       struct list_head *flush_list)
{
	/* First unlock all required chunks. */
	stripe_chunks_unlock(stripe);

	/*
	 * If an io error on a stripe occurred, degrade the RAID set
	 * and try to endio as many bios as possible. If any bios can't
	 * be endio processed, requeue the stripe (stripe_ref() != 0).
	 */
	if (TestClearStripeError(stripe)) {
		/*
		 * FIXME: if read, rewrite the failed chunk after reconstruction
		 *	  in order to trigger disk bad sector relocation.
		 */
		rs_check_degrade(stripe); /* Resets ChunkError(). */
		ClearStripeReconstruct(stripe);
		ClearStripeReconstructed(stripe);
	}

	/* Got to reconstruct a missing chunk. */
	if (StripeReconstruct(stripe)) {
		/*
		 * (*2*) We use StripeReconstruct() to allow for
		 *	 all chunks to be xored into the reconstructed
		 *	 one (see chunk_must_xor()).
		 */
		stripe_reconstruct(stripe);

		/*
		 * (*3*) Now we reset StripeReconstruct() and flag
		 *	 StripeReconstructed() to show to stripe_rw(),
		 *	 that we have reconstructed a missing chunk.
		 */
		ClearStripeReconstruct(stripe);
		SetStripeReconstructed(stripe);

		/* FIXME: reschedule to be written in case of read. */
		// if (!StripeRBW(stripe)) {
		//	chunk_set(CHUNK(stripe, pr), DIRTY);
		//	stripe_chunks_rw(stripe);
		// }
	}

	/*
	 * Now that we eventually got a complete stripe, we
	 * can process the rest of the end ios on reads.
	 */
	stripe_endio(READ, stripe);

	/* End io all merged writes. */
	if (TestClearStripeMerged(stripe))
		stripe_endio(WRITE_MERGED, stripe);

	/* If RAID set is dead -> fail any ios to dead drives. */
	if (RSDead(rs)) {
		DMERR_LIMIT("RAID set dead: failing ios to dead devices");
		stripe_fail_io(stripe);
	}

	/*
	 * We have stripe references still,
	 * because of read before writes or IO errors ->
	 * got to put on flush list for processing.
	 */
	if (stripe_ref(stripe)) {
		BUG_ON(!list_empty(stripe->lists + LIST_LRU));
		list_add_tail(stripe->lists + LIST_FLUSH, flush_list);
		atomic_inc(rs->stats + S_REQUEUE); /* REMOVEME: statistics. */
	} else
		stripe_lru_add(stripe);
}
/* Pop any endio stripes off of the endio list and belabour them. */
static void do_endios(struct raid_set *rs)
{
	struct stripe_cache *sc = &rs->sc;
	struct stripe *stripe;
	/* IO flush list for sorted requeued stripes. */
	struct list_head flush_list;

	INIT_LIST_HEAD(&flush_list);

	while ((stripe = stripe_endio_pop(sc))) {
		/* Avoid endio on stripes with newly io'ed chunks. */
		if (!stripe_io_ref(stripe))
			_do_endios(rs, stripe, &flush_list);
	}

	/*
	 * Insert any requeued stripes in the proper
	 * order at the beginning of the io (flush) list.
	 */
	list_splice(&flush_list, sc->lists + LIST_FLUSH);
}
/* Flush any stripes on the io list. */
static void do_flush(struct raid_set *rs)
{
	struct stripe *stripe;

	while ((stripe = stripe_io_pop(&rs->sc)))
		stripe_rw(stripe); /* Read/write stripe. */
}
/* Stripe cache resizing. */
static void do_sc_resize(struct raid_set *rs)
{
	unsigned set = atomic_read(&rs->sc.stripes_to_set);

	if (set) {
		unsigned cur = atomic_read(&rs->sc.stripes);
		int r = (set > cur) ? sc_grow(&rs->sc, set - cur, SC_GROW) :
				      sc_shrink(&rs->sc, cur - set);

		/* Flag end of resizing if ok. */
		if (!r)
			atomic_set(&rs->sc.stripes_to_set, 0);
	}
}
/*
 * We do different things with the io depending
 * on the state of the region that it is in:
 *
 * o reads: hang off stripe cache or postpone if full
 *
 * o writes:
 *   CLEAN/DIRTY/NOSYNC: increment pending and hang io off stripe's stripe set.
 *			 In case stripe cache is full or busy, postpone the io.
 *
 *   RECOVERING: delay the io until recovery of the region completes.
 */
static void do_ios(struct raid_set *rs, struct bio_list *ios)
{
	int r;
	unsigned flush = 0, delay = 0;
	sector_t sector;
	struct dm_rh_client *rh = rs->recover.rh;
	struct bio *bio;
	struct bio_list reject;

	bio_list_init(&reject);

	/*
	 * o delay writes to recovering regions (let reads go through)
	 * o queue io to all other regions
	 */
	while ((bio = bio_list_pop(ios))) {
		/*
		 * In case we get a barrier bio, push it back onto
		 * the input queue unless all work queues are empty
		 * and the stripe cache is inactive.
		 */
		if (unlikely(bio->bi_rw & REQ_FLUSH)) {
			/* REMOVEME: statistics. */
			atomic_inc(rs->stats + S_BARRIER);
			if (!list_empty(rs->sc.lists + LIST_FLUSH) ||
			    !bio_list_empty(&reject) ||
			    sc_active(&rs->sc)) {
				bio_list_push(ios, bio);
				break;
			}
		}

		/* Check for recovering regions. */
		sector = _sector(rs, bio);
		r = region_state(rs, sector, DM_RH_RECOVERING);
		if (unlikely(r && bio_data_dir(bio) == WRITE)) {
			delay++;
			/* Wait writing to recovering regions. */
			dm_rh_delay_by_region(rh, bio,
					      dm_rh_sector_to_region(rh,
								     sector));
			/* REMOVEME: statistics. */
			atomic_inc(rs->stats + S_DELAYED_BIOS);
			atomic_inc(rs->stats + S_SUM_DELAYED_BIOS);

			/* Force bandwidth tests in recovery. */
			SetRSBandwidth(rs);
		} else {
			/*
			 * Process ios to non-recovering regions by queueing
			 * them to stripes (does dm_rh_inc() for writes).
			 */
			flush += stripe_queue_bio(rs, bio, &reject);
		}
	}

	if (flush) {
		/* FIXME: better error handling. */
		r = dm_rh_flush(rh); /* Writes got queued -> flush dirty log. */
		if (r)
			DMERR_LIMIT("dirty log flush");
	}

	/* Merge any rejected bios back to the head of the input list. */
	bio_list_merge_head(ios, &reject);
}
/* Send an event in case we're getting too busy. */
static void do_busy_event(struct raid_set *rs)
{
	if (!TestSetRSScBusy(rs))
		schedule_work(&rs->io.ws_do_table_event);
}
/* Throw an event. */
static void do_table_event(struct work_struct *ws)
{
	struct raid_set *rs = container_of(ws, struct raid_set,
					   io.ws_do_table_event);
	dm_table_event(rs->ti->table);
}
/*-----------------------------------------------------------------
 * RAID daemon
 *---------------------------------------------------------------*/
/*
 * o belabour all end ios
 * o update the region hash states
 * o optionally shrink the stripe cache
 * o optionally do recovery
 * o unplug any component raid devices with queued bios
 * o grab the input queue
 * o work on all requeued or new ios and perform stripe cache flushes
 * o unplug any component raid devices with queued bios
 * o check, if the stripe cache gets too busy and throw an event if so
 */
static void do_raid(struct work_struct *ws)
{
	struct raid_set *rs = container_of(ws, struct raid_set,
					   io.dws_do_raid.work);
	struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in;
	struct blk_plug plug;

	/*
	 * We always need to end io, so that ios can get errored in
	 * case the set failed and the region counters get decremented
	 * before we update region hash states and go any further.
	 */
	do_endios(rs);
	dm_rh_update_states(rs->recover.rh, 1);

	/*
	 * Now that we've end io'd, which may have put stripes on the LRU list
	 * to allow for shrinking, we resize the stripe cache if requested.
	 */
	do_sc_resize(rs);

	/* Try to recover regions. */
	blk_start_plug(&plug);
	do_recovery(rs);
	blk_finish_plug(&plug);		/* Unplug the queue */

	/* Quickly grab all new ios queued and add them to the work list. */
	mutex_lock(&rs->io.in_lock);
	bio_list_merge(ios, ios_in);
	bio_list_init(ios_in);
	mutex_unlock(&rs->io.in_lock);

	blk_start_plug(&plug);
	if (!bio_list_empty(ios))
		do_ios(rs, ios);	/* Got ios to work into the cache. */

	do_flush(rs);			/* Flush any stripes on io list. */
	blk_finish_plug(&plug);		/* Unplug the queue */
	do_busy_event(rs);		/* Check if we got too busy. */
}
/*
 * Callback for region hash to dispatch
 * delayed bios queued to recovered regions
 * (gets called via dm_rh_update_states()).
 */
static void dispatch_delayed_bios(void *context, struct bio_list *bl)
{
	struct raid_set *rs = context;
	struct bio *bio;

	/* REMOVEME: statistics; decrement pending delayed bios counter. */
	bio_list_for_each(bio, bl)
		atomic_dec(rs->stats + S_DELAYED_BIOS);

	/* Merge region hash private list to work list. */
	bio_list_merge_head(&rs->io.work, bl);

	ClearRSBandwidth(rs);
}
/*************************************************************
 * Constructor helpers
 *************************************************************/
/* Calculate MB/sec. */
static unsigned mbpers(struct raid_set *rs, unsigned speed)
{
	return to_bytes(speed * rs->set.data_devs *
			rs->recover.io_size * HZ >> 10) >> 10;
}
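
/*
 * mbpers() turns an xor rate measured in calls per tick (see xor_speed()
 * below) into MB/s: each call works on data_devs chunks of recover.io_size
 * sectors, there are HZ ticks per second, to_bytes() scales sectors to
 * bytes, and the two ">> 10" shifts convert bytes to megabytes.
 */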
/*
 * Discover fastest xor algorithm and # of chunks combination.
 */
/* Calculate speed for algorithm and # of chunks. */
static unsigned xor_speed(struct stripe *stripe)
{
	unsigned r = 0;
	unsigned long j;

	/* Wait for next tick. */
	for (j = jiffies; j == jiffies; )
		;

	/* Do xors for a full tick. */
	for (j = jiffies; j == jiffies; ) {
		common_xor(stripe, stripe->io.size, 0, 0);
		r++;
	}

	return r;
}
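
/*
 * The two loops above are a simple calibration: wait for a jiffies edge,
 * then count how many common_xor() calls complete before the next edge.
 * E.g. with HZ = 250 (illustrative) each measurement window is 4 ms.
 */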
/* Optimize xor algorithm for this RAID set. */
static unsigned xor_optimize(struct raid_set *rs)
{
	unsigned chunks_max = 2, p = rs->set.raid_devs, speed_max = 0;
	struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL;
	struct stripe *stripe;

	BUG_ON(list_empty(&rs->recover.stripes));
	stripe = list_first_entry(&rs->recover.stripes, struct stripe,
				  lists[LIST_RECOVER]);

	/* Must set uptodate so that xor() will belabour chunks. */
	while (p--)
		SetChunkUptodate(CHUNK(stripe, p));

	/* Try all xor functions. */
	while (f-- > xor_funcs) {
		unsigned speed;

		/* Set actual xor function for common_xor(). */
		rs->xor.f = f;
		rs->xor.chunks = (f->f == xor_blocks_wrapper ?
				  (MAX_XOR_BLOCKS + 1) : XOR_CHUNKS_MAX) + 1;

		while (rs->xor.chunks-- > 2) {
			speed = xor_speed(stripe);
			if (speed > speed_max) {
				speed_max = speed;
				chunks_max = rs->xor.chunks;
				f_max = f;
			}
		}
	}

	/* Memorize optimum parameters. */
	rs->xor.f = f_max;
	rs->xor.chunks = chunks_max;
	return speed_max;
}
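
/*
 * The winner (fastest xor function and chunk count) is kept in rs->xor and
 * the measured rate is returned to the constructor, which reports it via
 * rs_log()/mbpers() below.
 */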
/*
 * Allocate a RAID context (a RAID set)
 */
/* Structure for variable RAID parameters. */
struct variable_parms {
	int bandwidth;
	int bandwidth_parm;
	int chunk_size;
	int chunk_size_parm;
	int io_size;
	int io_size_parm;
	int stripes;
	int stripes_parm;
	int recover_io_size;
	int recover_io_size_parm;
	int raid_parms;
	int recovery;
	int recovery_stripes;
	int recovery_stripes_parm;
};
static struct raid_set *
context_alloc(struct raid_type *raid_type, struct variable_parms *p,
	      unsigned raid_devs, sector_t sectors_per_dev,
	      struct dm_target *ti, unsigned dl_parms, char **argv)
{
	int r;
	size_t len;
	sector_t region_size, ti_len;
	struct raid_set *rs = NULL;
	struct dm_dirty_log *dl;
	struct recover *rec;

	/*
	 * Create the dirty log
	 *
	 * We need to change length for the dirty log constructor,
	 * because we want an amount of regions for all stripes derived
	 * from the single device size, so that we can keep region
	 * size = 2^^n independent of the number of devices
	 */
	ti_len = ti->len;
	ti->len = sectors_per_dev;
	dl = dm_dirty_log_create(argv[0], ti, NULL, dl_parms, argv + 2);
	ti->len = ti_len;
	if (!dl)
		goto bad_dirty_log;

	/* Chunk size *must* be smaller than region size. */
	region_size = dl->type->get_region_size(dl);
	if (p->chunk_size > region_size)
		goto bad_chunk_size;

	/* Recover io size *must* be smaller than region size as well. */
	if (p->recover_io_size > region_size)
		goto bad_recover_io_size;

	/* Size and allocate the RAID set structure. */
	len = sizeof(*rs->data) + sizeof(*rs->dev);
	if (dm_array_too_big(sizeof(*rs), len, raid_devs))
		goto bad_array;

	len = sizeof(*rs) + raid_devs * len;
	rs = kzalloc(len, GFP_KERNEL);
	if (!rs)
		goto bad_alloc;

	rec = &rs->recover;
	atomic_set(&rs->io.in_process, 0);
	atomic_set(&rs->io.in_process_max, 0);
	rec->io_size = p->recover_io_size;

	/* Pointer to data array. */
	rs->data = (unsigned long **)
		   ((void *) rs->dev + raid_devs * sizeof(*rs->dev));
	rs->set.raid_devs = raid_devs;
	rs->set.data_devs = raid_devs - raid_type->parity_devs;
	rs->set.raid_type = raid_type;

	rs->set.raid_parms = p->raid_parms;
	rs->set.chunk_size_parm = p->chunk_size_parm;
	rs->set.io_size_parm = p->io_size_parm;
	rs->sc.stripes_parm = p->stripes_parm;
	rec->io_size_parm = p->recover_io_size_parm;
	rec->bandwidth_parm = p->bandwidth_parm;
	rec->recovery = p->recovery;
	rec->recovery_stripes = p->recovery_stripes;

	/*
	 * Set chunk and io size and respective shifts
	 * (used to avoid divisions)
	 */
	rs->set.chunk_size = p->chunk_size;
	rs->set.chunk_shift = ffs(p->chunk_size) - 1;

	rs->set.io_size = p->io_size;
	rs->set.io_mask = p->io_size - 1;
	/* Mask to adjust address key in case io_size != chunk_size. */
	rs->set.io_inv_mask = (p->chunk_size - 1) & ~rs->set.io_mask;

	rs->set.sectors_per_dev = sectors_per_dev;

	rs->set.ei = -1;	/* Indicate no failed device. */
	atomic_set(&rs->set.failed_devs, 0);

	atomic_set(rec->io_count + IO_WORK, 0);
	atomic_set(rec->io_count + IO_RECOVER, 0);

	/* Initialize io lock and queues. */
	mutex_init(&rs->io.in_lock);
	bio_list_init(&rs->io.in);
	bio_list_init(&rs->io.work);

	init_waitqueue_head(&rs->io.suspendq);	/* Suspend waiters (dm-io). */

	rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size);
	rec->rh = dm_region_hash_create(rs, dispatch_delayed_bios,
			wake_dummy, wake_do_raid, 0, p->recovery_stripes,
			dl, region_size, rec->nr_regions);
	if (IS_ERR(rec->rh))
		goto bad_rh;

	/* Initialize stripe cache. */
	r = sc_init(rs, p->stripes);
	if (r)
		goto bad_sc;

	/* REMOVEME: statistics. */
	ClearRSDevelStats(rs);	/* Disable development status. */
	return rs;

bad_dirty_log:
	TI_ERR_RET("Error creating dirty log", ERR_PTR(-ENOMEM));

bad_chunk_size:
	dm_dirty_log_destroy(dl);
	TI_ERR_RET("Chunk size larger than region size", ERR_PTR(-EINVAL));

bad_recover_io_size:
	dm_dirty_log_destroy(dl);
	TI_ERR_RET("Recover stripe io size larger than region size",
		   ERR_PTR(-EINVAL));

bad_array:
	dm_dirty_log_destroy(dl);
	TI_ERR_RET("Array too big", ERR_PTR(-EINVAL));

bad_alloc:
	dm_dirty_log_destroy(dl);
	TI_ERR_RET("Cannot allocate raid context", ERR_PTR(-ENOMEM));

bad_rh:
	dm_dirty_log_destroy(dl);
	ti->error = DM_MSG_PREFIX "Error creating dirty region hash";
	goto free_rs;

bad_sc:
	dm_region_hash_destroy(rec->rh); /* Destroys dirty log too. */
	ti->error = DM_MSG_PREFIX "Error creating stripe cache";

free_rs:
	kfree(rs);
	return ERR_PTR(-ENOMEM);
}
/* Free a RAID context (a RAID set). */
static void context_free(struct raid_set *rs, unsigned p)
{
	while (p--)
		dm_put_device(rs->ti, rs->dev[p].dev);

	dm_region_hash_destroy(rs->recover.rh); /* Destroys dirty log too. */
	kfree(rs);
}
/* Create work queue and initialize delayed work. */
static int rs_workqueue_init(struct raid_set *rs)
{
	struct dm_target *ti = rs->ti;

	rs->io.wq = create_singlethread_workqueue(DAEMON);
	if (!rs->io.wq)
		TI_ERR_RET("failed to create " DAEMON, -ENOMEM);

	INIT_DELAYED_WORK(&rs->io.dws_do_raid, do_raid);
	INIT_WORK(&rs->io.ws_do_table_event, do_table_event);
	return 0;
}
/* Return pointer to raid_type structure for raid name. */
static struct raid_type *get_raid_type(char *name)
{
	struct raid_type *r = ARRAY_END(raid_types);

	while (r-- > raid_types) {
		if (!strcmp(r->name, name))
			return r;
	}

	return NULL;
}
/* FIXME: factor out to dm core. */
static int multiple(sector_t a, sector_t b, sector_t *n)
{
	sector_t r = a;

	sector_div(r, b);
	*n = r;
	return a == r * b;
}
/* Log RAID set information to kernel log. */
static void rs_log(struct raid_set *rs, unsigned speed)
{
	unsigned p;
	char buf[BDEVNAME_SIZE];

	for (p = 0; p < rs->set.raid_devs; p++)
		DMINFO("/dev/%s is raid disk %u%s",
		       bdevname(rs->dev[p].dev->bdev, buf), p,
		       (p == rs->set.pi) ? " (parity)" : "");

	DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes\n"
	       "algorithm \"%s\", %u chunks with %uMB/s\n"
	       "%s set with net %u/%u devices",
	       rs->set.chunk_size, rs->set.io_size, rs->recover.io_size,
	       atomic_read(&rs->sc.stripes),
	       rs->xor.f->name, rs->xor.chunks, mbpers(rs, speed),
	       rs->set.raid_type->descr, rs->set.data_devs, rs->set.raid_devs);
}
/* Get all devices and offsets. */
static int dev_parms(struct raid_set *rs, char **argv, int *p)
{
	struct dm_target *ti = rs->ti;

	for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) {
		int r;
		unsigned long long tmp;
		struct raid_dev *dev = rs->dev + *p;

		/* Get offset and device. */
		if (sscanf(argv[1], "%llu", &tmp) != 1 ||
		    tmp > rs->set.sectors_per_dev)
			TI_ERR("Invalid RAID device offset parameter");

		dev->start = tmp;
		r = dm_get_device(ti, *argv, dm_table_get_mode(ti->table), &dev->dev);
		if (r)
			TI_ERR_RET("RAID device lookup failure", r);

		r = raid_dev_lookup(rs, dev);
		if (r != -ENODEV && r < *p) {
			(*p)++;	/* Ensure dm_put_device() on actual device. */
			TI_ERR_RET("Duplicate RAID device", -ENXIO);
		}
	}

	return 0;
}
/* Set recovery bandwidth. */
static void
recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth)
{
	rs->recover.bandwidth = bandwidth;
	rs->recover.bandwidth_work = 100 / bandwidth;
}
/* Handle variable number of RAID parameters. */
static int get_raid_variable_parms(struct dm_target *ti, char **argv,
				   struct variable_parms *vp)
{
	int p, value;
	struct {
		int action; /* -1: skip, 0: no power2 check, 1: power2 check */
		char *errmsg;
		int min, max;
		int *var, *var2, *var3;
	} argctr[] = {
		{ 1,
		  "Invalid chunk size; must be -1 or 2^^n and <= 16384",
		  IO_SIZE_MIN, CHUNK_SIZE_MAX,
		  &vp->chunk_size_parm, &vp->chunk_size, &vp->io_size },
		{ 0,
		  "Invalid number of stripes: must be -1 or >= 8 and <= 16384",
		  STRIPES_MIN, STRIPES_MAX,
		  &vp->stripes_parm, &vp->stripes, NULL },
		{ 1,
		  "Invalid io size; must be -1 or >= 8, 2^^n and less equal "
		  "min(BIO_MAX_SECTORS/2, chunk size)",
		  IO_SIZE_MIN, 0, /* Needs to be updated in loop below. */
		  &vp->io_size_parm, &vp->io_size, NULL },
		{ 1,
		  "Invalid recovery io size; must be -1 or "
		  "2^^n and less equal BIO_MAX_SECTORS/2",
		  RECOVER_IO_SIZE_MIN, BIO_MAX_SECTORS / 2,
		  &vp->recover_io_size_parm, &vp->recover_io_size, NULL },
		{ 0,
		  "Invalid recovery bandwidth percentage; "
		  "must be -1 or > 0 and <= 100",
		  BANDWIDTH_MIN, BANDWIDTH_MAX,
		  &vp->bandwidth_parm, &vp->bandwidth, NULL },
		/* Handle sync argument separately in loop. */
		{ -1,
		  "Invalid recovery switch; must be \"sync\" or \"nosync\"" },
		{ 0,
		  "Invalid number of recovery stripes;"
		  "must be -1, > 0 and <= 16384",
		  RECOVERY_STRIPES_MIN, RECOVERY_STRIPES_MAX,
		  &vp->recovery_stripes_parm, &vp->recovery_stripes, NULL },
	}, *varp;

	/* Fetch # of variable raid parameters. */
	if (sscanf(*(argv++), "%d", &vp->raid_parms) != 1 ||
	    !range_ok(vp->raid_parms, 0, 7))
		TI_ERR("Bad variable raid parameters number");

	/* Preset variable RAID parameters. */
	vp->chunk_size = CHUNK_SIZE_DEFAULT;
	vp->io_size = IO_SIZE_DEFAULT;
	vp->stripes = STRIPES_DEFAULT;
	vp->recover_io_size = RECOVER_IO_SIZE_DEFAULT;
	vp->bandwidth = BANDWIDTH_DEFAULT;
	vp->recovery = 1;
	vp->recovery_stripes = RECOVERY_STRIPES_DEFAULT;

	/* Walk the array of argument constraints for all given ones. */
	for (p = 0, varp = argctr; p < vp->raid_parms; p++, varp++) {
		BUG_ON(varp >= ARRAY_END(argctr));

		/* Special case for "[no]sync" string argument. */
		if (varp->action < 0) {
			if (!strcmp(*argv, "sync"))
				vp->recovery = 1;
			else if (!strcmp(*argv, "nosync"))
				vp->recovery = 0;
			else
				TI_ERR(varp->errmsg);

			argv++;
			continue;
		}

		/*
		 * Special case for io_size depending
		 * on previously set chunk size.
		 */
		if (!varp->max)
			varp->max = min(BIO_MAX_SECTORS / 2, vp->chunk_size);

		if (sscanf(*(argv++), "%d", &value) != 1 ||
		    (value != -1 &&
		     ((varp->action && !POWER_OF_2(value)) ||
		      !range_ok(value, varp->min, varp->max))))
			TI_ERR(varp->errmsg);

		*varp->var = value;
		if (value != -1) {
			if (varp->var2)
				*varp->var2 = value;
			if (varp->var3)
				*varp->var3 = value;
		}
	}

	return 0;
}
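
/*
 * Example (illustrative): a table passing "#raid_variable_params 3 64 -1 8"
 * sets chunk_size = 64 and io_size = 8 while the "-1" leaves the number of
 * stripes at STRIPES_DEFAULT; all parameters after the third keep the
 * defaults preset above.
 */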
/* Parse optional locking parameters. */
static int get_raid_locking_parms(struct dm_target *ti, char **argv,
				  int *locking_parms,
				  struct dm_raid45_locking_type **locking_type)
{
	if (!strnicmp(argv[0], "locking", strlen(argv[0]))) {
		char *lckstr = argv[1];
		size_t lcksz = strlen(lckstr);

		if (!strnicmp(lckstr, "none", lcksz)) {
			*locking_type = &locking_none;
			*locking_parms = 2;
		} else if (!strnicmp(lckstr, "cluster", lcksz)) {
			DMERR("locking type \"%s\" not yet implemented",
			      lckstr);
			return -EINVAL;
		} else {
			DMERR("unknown locking type \"%s\"", lckstr);
			return -EINVAL;
		}
	} else {
		*locking_parms = 0;
		*locking_type = &locking_none;
	}

	return 0;
}
/* Set backing device read ahead properties of RAID set. */
static void rs_set_read_ahead(struct raid_set *rs,
			      unsigned sectors, unsigned stripes)
{
	unsigned ra_pages = dm_div_up(sectors, SECTORS_PER_PAGE);
	struct mapped_device *md = dm_table_get_md(rs->ti->table);
	struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;

	/* Set read-ahead for the RAID set and the component devices. */
	if (ra_pages) {
		unsigned p = rs->set.raid_devs;

		bdi->ra_pages = stripes * ra_pages * rs->set.data_devs;

		while (p--) {
			struct request_queue *q =
				bdev_get_queue(rs->dev[p].dev->bdev);

			q->backing_dev_info.ra_pages = ra_pages;
		}
	}
}
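
/*
 * Worked example (illustrative, 4 KiB pages): the constructor calls
 * rs_set_read_ahead(rs, 2 * chunk_size, 4). With the default chunk size
 * of 64 sectors that is 128 sectors = 16 pages of read-ahead per component
 * device, and 4 * 16 * data_devs pages for the RAID set as a whole.
 */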
/* Set congested function. */
static void rs_set_congested_fn(struct raid_set *rs)
{
	struct mapped_device *md = dm_table_get_md(rs->ti->table);
	struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;

	/* Set congested function and data. */
	bdi->congested_fn = rs_congested;
	bdi->congested_data = rs;
}
/*
 * Construct a RAID4/5 mapping:
 *
 * log_type #log_params <log_params> \
 * raid_type [#parity_dev] #raid_variable_params <raid_params> \
 * [locking "none"/"cluster"]
 * #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,}
 *
 * log_type = "core"/"disk",
 * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only)
 * log_params = [dirty_log_path] region_size [[no]sync])
 *
 * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs"
 *
 * #parity_dev = N if raid_type = "raid4"
 * o N = -1: pick default = last device
 * o N >= 0 and < #raid_devs: parity device index
 *
 * #raid_variable_params = 0-7; raid_params (-1 = default):
 *   [chunk_size [#stripes [io_size [recover_io_size \
 *    [%recovery_bandwidth [recovery_switch [#recovery_stripes]]]]]]]
 * o chunk_size (unit to calculate drive addresses; must be 2^^n, > 8
 *   and <= CHUNK_SIZE_MAX)
 * o #stripes is number of stripes allocated to stripe cache
 *   (must be > 1 and < STRIPES_MAX)
 * o io_size (io unit size per device in sectors; must be 2^^n and > 8)
 * o recover_io_size (io unit size per device for recovery in sectors;
 *   must be 2^^n, > SECTORS_PER_PAGE and <= region_size)
 * o %recovery_bandwidth is the maximum amount spent for recovery during
 *   application io (1-100%)
 * o recovery switch = [sync|nosync]
 * o #recovery_stripes is the number of recovery stripes used for
 *   parallel recovery of the RAID set
 * If raid_variable_params = 0, defaults will be used.
 * Any raid_variable_param can be set to -1 to apply a default
 *
 * #raid_devs = N (N >= 3)
 *
 * #dev_to_initialize = N
 * -1: initialize parity on all devices
 * >= 0 and < #raid_devs: initialize raid_path; used to force reconstruction
 *			  of a failed device's content after replacement
 *
 * <dev_path> = device_path (e.g., /dev/sdd1)
 * <offset> = begin at offset on <dev_path>
 */
#define MIN_PARMS	13
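
/*
 * Example table line (illustrative only; device names, sizes and the
 * mapped-device name are hypothetical, and the target is assumed to
 * register as "raid45"):
 *
 *   echo "0 2097152 raid45 core 2 8192 nosync raid5_la 0 \
 *         3 -1 /dev/sda1 0 /dev/sdb1 0 /dev/sdc1 0" | dmsetup create r5
 *
 * i.e. a core dirty log with an 8192-sector region size, raid5 with left
 * asymmetric parity rotation, no variable parameters (all defaults), three
 * devices, no device forced to initialize, each starting at offset 0.
 */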
static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int dev_to_init, dl_parms, i, locking_parms,
	    parity_parm, pi = -1, r, raid_devs;
	unsigned speed;
	sector_t tmp, sectors_per_dev;
	struct dm_raid45_locking_type *locking;
	struct raid_set *rs;
	struct raid_type *raid_type;
	struct variable_parms parms;

	/* Ensure minimum number of parameters. */
	if (argc < MIN_PARMS)
		TI_ERR("Not enough parameters");

	/* Fetch # of dirty log parameters. */
	if (sscanf(argv[1], "%d", &dl_parms) != 1 ||
	    !range_ok(dl_parms, 1, 4711)) /* ;-) */
		TI_ERR("Bad dirty log parameters number");

	/* Check raid_type. */
	raid_type = get_raid_type(argv[dl_parms + 2]);
	if (!raid_type)
		TI_ERR("Bad raid type");

	/* In case of RAID4, parity drive is selectable. */
	parity_parm = !!(raid_type->level == raid4);

	/* Handle variable number of RAID parameters. */
	r = get_raid_variable_parms(ti, argv + dl_parms + parity_parm + 3,
				    &parms);
	if (r)
		return r;

	/* Handle any locking parameters. */
	r = get_raid_locking_parms(ti,
				   argv + dl_parms + parity_parm +
				   parms.raid_parms + 4,
				   &locking_parms, &locking);
	if (r)
		return r;

	/* # of raid devices. */
	i = dl_parms + parity_parm + parms.raid_parms + locking_parms + 4;
	if (sscanf(argv[i], "%d", &raid_devs) != 1 ||
	    raid_devs < raid_type->minimal_devs)
		TI_ERR("Invalid number of raid devices");

	/* In case of RAID4, check parity drive index is in limits. */
	if (raid_type->level == raid4) {
		/* Fetch index of parity device. */
		if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 ||
		    (pi != -1 && !range_ok(pi, 0, raid_devs - 1)))
			TI_ERR("Invalid RAID4 parity device index");
	}

	/*
	 * Index of device to initialize starts at 0
	 *
	 * o -1 -> don't initialize a selected device;
	 *	   initialize parity conforming to algorithm
	 * o 0..raid_devs-1 -> initialize respective device
	 *   (used for reconstruction of a replaced device)
	 */
	if (sscanf(argv[dl_parms + parity_parm + parms.raid_parms +
		   locking_parms + 5], "%d", &dev_to_init) != 1 ||
	    !range_ok(dev_to_init, -1, raid_devs - 1))
		TI_ERR("Invalid number for raid device to initialize");

	/* Check # of raid device arguments. */
	if (argc - dl_parms - parity_parm - parms.raid_parms - 6 !=
	    2 * raid_devs)
		TI_ERR("Wrong number of raid device/offset arguments");

	/*
	 * Check that the table length is divisible
	 * w/o rest by (raid_devs - parity_devs)
	 */
	if (!multiple(ti->len, raid_devs - raid_type->parity_devs,
		      &sectors_per_dev))
		TI_ERR("Target length not divisible by number of data devices");

	/*
	 * Check that the device size is
	 * divisible w/o rest by chunk size
	 */
	if (!multiple(sectors_per_dev, parms.chunk_size, &tmp))
		TI_ERR("Device length not divisible by chunk_size");

	/****************************************************************
	 * Now that we checked the constructor arguments ->
	 * let's allocate the RAID set
	 ****************************************************************/
	rs = context_alloc(raid_type, &parms, raid_devs, sectors_per_dev,
			   ti, dl_parms, argv);
	if (IS_ERR(rs))
		return PTR_ERR(rs);

	rs->set.dev_to_init = rs->set.dev_to_init_parm = dev_to_init;
	rs->set.pi = rs->set.pi_parm = pi;

	/* Set RAID4 parity drive index. */
	if (raid_type->level == raid4)
		rs->set.pi = (pi == -1) ? rs->set.data_devs : pi;

	recover_set_bandwidth(rs, parms.bandwidth);

	/* Use locking type to lock stripe access. */
	rs->locking = locking;

	/* Get the device/offset tuples. */
	argv += dl_parms + 6 + parity_parm + parms.raid_parms;
	r = dev_parms(rs, argv, &i);
	if (r)
		goto err;

	/* Set backing device information (eg. read ahead). */
	rs_set_read_ahead(rs, 2 * rs->set.chunk_size, 4 /* stripes */);
	rs_set_congested_fn(rs); /* Set congested function. */
	SetRSCheckOverwrite(rs); /* Allow chunk overwrite checks. */
	speed = xor_optimize(rs); /* Select best xor algorithm. */

	/* Set for recovery of any nosync regions. */
	if (parms.recovery)
		SetRSRecover(rs);
	else {
		/*
		 * Need to free recovery stripe(s) here in case
		 * of nosync, because xor_optimize uses one.
		 */
		set_start_recovery(rs);
		set_end_recovery(rs);
		stripe_recover_free(rs);
	}

	ti->private = rs;

	/*
	 * Make sure that dm core only hands maximum io size
	 * length down and pays attention to io boundaries.
	 */
	ti->max_io_len = rs->set.io_size;

	/* Initialize work queue to handle this RAID set's io. */
	r = rs_workqueue_init(rs);
	if (r)
		goto err;

	rs_log(rs, speed); /* Log information about RAID set. */
	return 0;

err:
	context_free(rs, i);
	return r;
}
/*
 * Destruct a raid mapping
 */
static void raid_dtr(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;

	destroy_workqueue(rs->io.wq);
	context_free(rs, rs->set.raid_devs);
}
/* Raid mapping function. */
static int raid_map(struct dm_target *ti, struct bio *bio,
		    union map_info *map_context)
{
	/* I don't want to waste stripe cache capacity. */
	if (bio_rw(bio) == READA)
		return -EIO;
	else {
		struct raid_set *rs = ti->private;

		/*
		 * Get io reference to be waiting for to drop
		 * to zero on device suspension/destruction.
		 */
		io_get(rs);
		bio->bi_sector -= ti->begin;	/* Remap sector. */

		/* Queue io to RAID set. */
		mutex_lock(&rs->io.in_lock);
		bio_list_add(&rs->io.in, bio);
		mutex_unlock(&rs->io.in_lock);

		/* Wake daemon to process input list. */
		wake_do_raid(rs);

		/* REMOVEME: statistics. */
		atomic_inc(rs->stats + (bio_data_dir(bio) == READ ?
					S_BIOS_READ : S_BIOS_WRITE));
		return DM_MAPIO_SUBMITTED; /* Handle later. */
	}
}
/* Device suspend. */
static void raid_presuspend(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;
	struct dm_dirty_log *dl = rs->recover.dl;

	SetRSSuspend(rs);

	if (RSRecover(rs))
		dm_rh_stop_recovery(rs->recover.rh);

	cancel_delayed_work(&rs->io.dws_do_raid);
	flush_workqueue(rs->io.wq);
	wait_ios(rs);	/* Wait for completion of all ios being processed. */

	if (dl->type->presuspend && dl->type->presuspend(dl))
		/* FIXME: need better error handling. */
		DMWARN("log presuspend failed");
}
static void raid_postsuspend(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;
	struct dm_dirty_log *dl = rs->recover.dl;

	if (dl->type->postsuspend && dl->type->postsuspend(dl))
		/* FIXME: need better error handling. */
		DMWARN("log postsuspend failed");
}
/* Device resume. */
static void raid_resume(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;
	struct recover *rec = &rs->recover;
	struct dm_dirty_log *dl = rec->dl;

	/* Resume dirty log. */
	if (dl->type->resume && dl->type->resume(dl))
		/* FIXME: need better error handling. */
		DMWARN("log resume failed");

	rec->nr_regions_to_recover =
		rec->nr_regions - dl->type->get_sync_count(dl);

	/* Restart any unfinished recovery. */
	if (RSRecover(rs)) {
		set_start_recovery(rs);
		dm_rh_start_recovery(rec->rh);
	}

	ClearRSSuspend(rs);
	wake_do_raid(rs);
}
/* Return stripe cache size. */
static unsigned sc_size(struct raid_set *rs)
{
	return to_sector(atomic_read(&rs->sc.stripes) *
			 (sizeof(struct stripe) +
			  (sizeof(struct stripe_chunk) +
			   (sizeof(struct page_list) +
			    to_bytes(rs->set.io_size) *
			    rs->set.raid_devs)) +
			  (rs->recover.end_jiffies ?
			   0 : rs->recover.recovery_stripes *
			   to_bytes(rs->set.raid_devs * rs->recover.io_size))));
}
/* REMOVEME: status output for development. */
static void raid_devel_stats(struct dm_target *ti, char *result,
			     unsigned *size, unsigned maxlen)
{
	unsigned sz = *size;
	unsigned long j;
	char buf[BDEVNAME_SIZE], *p;
	struct stats_map *sm;
	struct raid_set *rs = ti->private;
	struct recover *rec = &rs->recover;
	struct timespec ts;

	DMEMIT("%s %s %u\n", version, rs->xor.f->name, rs->xor.chunks);
	DMEMIT("act_ios=%d ", io_ref(rs));
	DMEMIT("act_ios_max=%d\n", atomic_read(&rs->io.in_process_max));
	DMEMIT("act_stripes=%d ", sc_active(&rs->sc));
	DMEMIT("act_stripes_max=%d\n",
	       atomic_read(&rs->sc.active_stripes_max));

	for (sm = stats_map; sm < ARRAY_END(stats_map); sm++)
		DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type));

	DMEMIT(" checkovr=%s\n", RSCheckOverwrite(rs) ? "on" : "off");
	DMEMIT("sc=%u/%u/%u/%u/%u/%u/%u\n", rs->set.chunk_size,
	       atomic_read(&rs->sc.stripes), rs->set.io_size,
	       rec->recovery_stripes, rec->io_size, rs->sc.hash.buckets,
	       sc_size(rs));

	j = (rec->end_jiffies ? rec->end_jiffies : jiffies) -
	    rec->start_jiffies;
	jiffies_to_timespec(j, &ts);
	sprintf(buf, "%ld.%ld", ts.tv_sec, ts.tv_nsec);
	p = strchr(buf, '.');

	DMEMIT("rg=%llu/%llu/%llu/%u %s\n",
	       (unsigned long long) rec->nr_regions_recovered,
	       (unsigned long long) rec->nr_regions_to_recover,
	       (unsigned long long) rec->nr_regions, rec->bandwidth, buf);

	*size = sz;
}
static int raid_status(struct dm_target *ti, status_type_t type,
		       unsigned status_flags, char *result, unsigned maxlen)
{
	unsigned p, sz = 0;
	char buf[BDEVNAME_SIZE];
	struct raid_set *rs = ti->private;
	int raid_parms[] = {
		rs->set.chunk_size_parm,
		rs->sc.stripes_parm,
		rs->set.io_size_parm,
		rs->recover.io_size_parm,
		rs->recover.bandwidth_parm,
		-2,
		rs->recover.recovery_stripes,
	};

	switch (type) {
	case STATUSTYPE_INFO:
		/* REMOVEME: statistics. */
		if (RSDevelStats(rs))
			raid_devel_stats(ti, result, &sz, maxlen);

		DMEMIT("%u ", rs->set.raid_devs);

		for (p = 0; p < rs->set.raid_devs; p++)
			DMEMIT("%s ",
			       format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev));

		for (p = 0; p < rs->set.raid_devs; p++) {
			DMEMIT("%c", !DevFailed(rs->dev + p) ? 'A' : 'D');

			if (p == rs->set.pi)
				DMEMIT("p");

			if (rs->set.dev_to_init == p)
				DMEMIT("i");
		}

		break;
	case STATUSTYPE_TABLE:
		sz = rs->recover.dl->type->status(rs->recover.dl, type,
						  result, maxlen);
		DMEMIT("%s %u ", rs->set.raid_type->name,
		       rs->set.raid_parms);

		for (p = 0; p < rs->set.raid_parms; p++) {
			if (raid_parms[p] > -2)
				DMEMIT("%d ", raid_parms[p]);
			else
				DMEMIT("%s ", rs->recover.recovery ?
					      "sync" : "nosync");
		}

		DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init);

		for (p = 0; p < rs->set.raid_devs; p++)
			DMEMIT("%s %llu ",
			       format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev),
			       (unsigned long long) rs->dev[p].start);
	}

	return 0;
}
enum raid_msg_actions {
	act_bw,			/* Recovery bandwidth switch. */
	act_dev,		/* Device failure switch. */
	act_overwrite,		/* Stripe overwrite check. */
	act_stats,		/* Development statistics switch. */
	act_sc,			/* Stripe cache switch. */

	act_on,			/* Set entity on. */
	act_off,		/* Set entity off. */
	act_reset,		/* Reset entity. */

	act_set = act_on,	/* Set # absolute. */
	act_grow = act_off,	/* Grow # by an amount. */
	act_shrink = act_reset,	/* Shrink # by an amount. */
};
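
/*
 * Note: the second half of the enum aliases the first: numeric messages
 * (bandwidth, stripecache) interpret the three option slots as
 * set/grow/shrink, while switch messages (overwrite, statistics) interpret
 * the very same bit positions as on/off/reset.
 */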
/* Turn a delta into an absolute value. */
static int _absolute(unsigned long action, int act, int r)
{
	/* Make delta absolute. */
	if (test_bit(act_set, &action))
		;	/* Argument is already the absolute value. */
	else if (test_bit(act_grow, &action))
		r += act;
	else if (test_bit(act_shrink, &action))
		r = act - r;
	else
		r = -EINVAL;

	return r;
}
/* Change recovery io bandwidth. */
static int bandwidth_change(struct dm_msg *msg, void *context)
{
	struct raid_set *rs = context;
	int act = rs->recover.bandwidth;
	int bandwidth = DM_MSG_INT_ARG(msg);

	if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
		/* Make delta bandwidth absolute. */
		bandwidth = _absolute(msg->action, act, bandwidth);

		if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
			recover_set_bandwidth(rs, bandwidth);
			return 0;
		}
	}

	set_bit(dm_msg_ret_arg, &msg->ret);
	set_bit(dm_msg_ret_inval, &msg->ret);
	return -EINVAL;
}
/* Set/reset development feature flags. */
static int devel_flags(struct dm_msg *msg, void *context)
{
	struct raid_set *rs = context;

	if (test_bit(act_on, &msg->action))
		return test_and_set_bit(msg->spec->parm,
					&rs->io.flags) ? -EPERM : 0;
	else if (test_bit(act_off, &msg->action))
		return test_and_clear_bit(msg->spec->parm,
					  &rs->io.flags) ? 0 : -EPERM;
	else if (test_bit(act_reset, &msg->action)) {
		if (test_bit(act_stats, &msg->action)) {
			/* Reset the development statistics counters. */
		} else if (test_bit(act_overwrite, &msg->action)) {
			set_bit(msg->spec->parm, &rs->io.flags);
		}
	}

	return 0;
}
/* Resize the stripe cache. */
static int sc_resize(struct dm_msg *msg, void *context)
{
	int act, stripes;
	struct raid_set *rs = context;

	/* Deny permission in case the daemon is still resizing! */
	if (atomic_read(&rs->sc.stripes_to_set))
		return -EPERM;

	stripes = DM_MSG_INT_ARG(msg);
	act = atomic_read(&rs->sc.stripes);

	/* Make delta stripes absolute. */
	stripes = _absolute(msg->action, act, stripes);

	/*
	 * Check range and that the # of stripes changes.
	 * We leave the resizing to the worker.
	 */
	if (range_ok(stripes, STRIPES_MIN, STRIPES_MAX) &&
	    stripes != atomic_read(&rs->sc.stripes)) {
		atomic_set(&rs->sc.stripes_to_set, stripes);
		...
		return 0;
	}

	set_bit(dm_msg_ret_arg, &msg->ret);
	set_bit(dm_msg_ret_inval, &msg->ret);
	return -EINVAL;
}
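/*
 * Note on the resize flow above (a reading of this function, not verified
 * against the worker code): sc_resize() only publishes the requested stripe
 * count in sc.stripes_to_set; the actual grow/shrink of the cache is done
 * later by the worker, which is also why a new resize request is refused
 * with -EPERM while stripes_to_set is still non-zero.
 */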
/*
 * Parse the RAID message action:
 *
 * 'ba[ndwidth] {se[t],g[row],sh[rink]} #'	# e.g. 'ba se 50'
 * 'o[verwrite] {on,of[f],r[eset]}'		# e.g. 'o of'
 * 'sta[tistics] {on,of[f],r[eset]}'		# e.g. 'stat of'
 * 'str[ipecache] {se[t],g[row],sh[rink]} #'	# e.g. 'stripe set 1024'
 */
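/*
 * For illustration only (the mapped device name "r5" is made up): these
 * messages are delivered through the generic device-mapper message
 * interface, e.g.
 *
 *	dmsetup message r5 0 bandwidth set 50
 *	dmsetup message r5 0 overwrite off
 *	dmsetup message r5 0 statistics reset
 *	dmsetup message r5 0 stripecache grow 512
 *
 * or abbreviated as shown above, e.g. 'dmsetup message r5 0 ba se 50'.
 */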
static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
{
	/* Variables to store the parsed parameters in. */
	static int i[2];
	static unsigned long *i_arg[] = {
		(unsigned long *) i + 0,
		(unsigned long *) i + 1,
	};

	/* Declare all message option strings. */
	static char *str_sgs[] = { "set", "grow", "shrink" };
	static char *str_oor[] = { "on", "off", "reset" };

	/* Declare all actions. */
	static unsigned long act_sgs[] = { act_set, act_grow, act_shrink };
	static unsigned long act_oor[] = { act_on, act_off, act_reset };

	/* Bandwidth option. */
	static struct dm_message_option bw_opt = { 3, str_sgs, act_sgs };
	static struct dm_message_argument bw_args = {
		1, i_arg, { dm_msg_int_t }
	};

	static struct dm_message_argument null_args = {
		0, NULL, { dm_msg_int_t }
	};

	/* Overwrite and statistics option. */
	static struct dm_message_option ovr_stats_opt = { 3, str_oor, act_oor };

	/* Stripecache option. */
	static struct dm_message_option stripe_opt = { 3, str_sgs, act_sgs };

	/* Declare messages. */
	static struct dm_msg_spec specs[] = {
		{ "bandwidth", act_bw, &bw_opt, &bw_args,
		  0, bandwidth_change },
		{ "overwrite", act_overwrite, &ovr_stats_opt, &null_args,
		  RS_CHECK_OVERWRITE, devel_flags },
		{ "statistics", act_stats, &ovr_stats_opt, &null_args,
		  RS_DEVEL_STATS, devel_flags },
		{ "stripecache", act_sc, &stripe_opt, &bw_args,
		  0, sc_resize },
	};

	/* The message for the parser. */
	struct dm_msg msg = {
		.num_specs = ARRAY_SIZE(specs),
		.specs = specs,
	};

	return dm_message_parse(TARGET, &msg, ti->private, argc, argv);
}
/*
 * END message interface
 */
static struct target_type raid_target = {
	.name = "raid45",
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.ctr = raid_ctr,
	.dtr = raid_dtr,
	.map = raid_map,
	.presuspend = raid_presuspend,
	.postsuspend = raid_postsuspend,
	.resume = raid_resume,
	.status = raid_status,
	.message = raid_message,
};

static void init_exit(const char *bad_msg, const char *good_msg, int r)
{
	if (r)
		DMERR("Failed to %sregister target [%d]", bad_msg, r);
	else
		DMINFO("%s %s", good_msg, version);
}

static int __init dm_raid_init(void)
{
	int r = dm_register_target(&raid_target);

	init_exit("", "initialized", r);
	return r;
}

static void __exit dm_raid_exit(void)
{
	dm_unregister_target(&raid_target);
	init_exit("un", "exit", 0);
}

module_init(dm_raid_init);
module_exit(dm_raid_exit);

MODULE_DESCRIPTION(DM_NAME " raid4/5 target");
MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
MODULE_LICENSE("GPL");
MODULE_ALIAS("dm-raid4");
MODULE_ALIAS("dm-raid5");