1 /*
2 * Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved.
3 *
4 * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
5 *
6 * This file is released under the GPL.
7 *
8 *
9 * Linux 2.6 Device Mapper RAID4 and RAID5 target.
10 *
11 * Supports:
12 * o RAID4 with dedicated and selectable parity device
13 * o RAID5 with rotating parity (left+right, symmetric+asymmetric)
14 * o recovery of out of sync device for initial
15 * RAID set creation or after dead drive replacement
16 * o run time optimization of xor algorithm used to calculate parity
17 *
18 *
19 * Thanks to MD for:
20 * o the raid address calculation algorithm
21 * o the base of the biovec <-> page list copier.
22 *
23 *
24 * Uses region hash to keep track of how many writes are in flight to
25 * regions in order to use dirty log to keep state of regions to recover:
26 *
27 * o clean regions (those which are synchronized
28 * and don't have write io in flight)
29 * o dirty regions (those with write io in flight)
30 *
31 *
32 * On startup, any dirty regions are migrated to the
33 * 'nosync' state and are subject to recovery by the daemon.
34 *
35 * See raid_ctr() for table definition.
36 *
37 * FIXME: recovery bandwidth
38 */
39
40 static const char *version = "v0.2594b";
41
42 #include "dm.h"
43 #include "dm-memcache.h"
44 #include "dm-message.h"
45 #include "dm-raid45.h"
46
47 #include <linux/kernel.h>
48 #include <linux/vmalloc.h>
49 #include <linux/raid/xor.h>
50
51 #include <linux/bio.h>
52 #include <linux/dm-io.h>
53 #include <linux/dm-dirty-log.h>
54 #include "dm-region-hash.h"
55
56 #include <linux/slab.h>
57 #include <linux/module.h>
58
59 /*
60 * Configurable parameters
61 */
62
63 /* Minimum/maximum and default # of selectable stripes. */
64 #define STRIPES_MIN 8
65 #define STRIPES_MAX 16384
66 #define STRIPES_DEFAULT 80
67
68 /* Maximum and default chunk size in sectors if not set in constructor. */
69 #define CHUNK_SIZE_MIN 8
70 #define CHUNK_SIZE_MAX 16384
71 #define CHUNK_SIZE_DEFAULT 64
72
73 /* Default io size in sectors if not set in constructor. */
74 #define IO_SIZE_MIN CHUNK_SIZE_MIN
75 #define IO_SIZE_DEFAULT IO_SIZE_MIN
76
77 /* Recover io size default in sectors. */
78 #define RECOVER_IO_SIZE_MIN 64
79 #define RECOVER_IO_SIZE_DEFAULT 256
80
81 /* Default, minimum and maximum percentage of recover io bandwidth. */
82 #define BANDWIDTH_DEFAULT 10
83 #define BANDWIDTH_MIN 1
84 #define BANDWIDTH_MAX 100
85
86 /* # of parallel recovered regions */
87 #define RECOVERY_STRIPES_MIN 1
88 #define RECOVERY_STRIPES_MAX 64
89 #define RECOVERY_STRIPES_DEFAULT RECOVERY_STRIPES_MIN
90 /*
91 * END Configurable parameters
92 */
93
94 #define TARGET "dm-raid45"
95 #define DAEMON "kraid45d"
96 #define DM_MSG_PREFIX TARGET
97
98 #define SECTORS_PER_PAGE (PAGE_SIZE >> SECTOR_SHIFT)
99
100 /* Amount/size for __xor(). */
101 #define XOR_SIZE PAGE_SIZE
102
103 /* Check value in range. */
104 #define range_ok(i, min, max)   ((i) >= (min) && (i) <= (max))
105
106 /* Check argument is power of 2. */
107 #define POWER_OF_2(a)   (!((a) & ((a) - 1)))
108
109 /* Structure access macros. */
110 /* Derive raid_set from stripe_cache pointer. */
111 #define RS(x) container_of(x, struct raid_set, sc)
112
113 /* Page reference. */
114 #define PAGE(stripe, p) ((stripe)->obj[p].pl->page)
115
116 /* Stripe chunk reference. */
117 #define CHUNK(stripe, p) ((stripe)->chunk + p)
118
119 /* Bio list reference. */
120 #define BL(stripe, p, rw) (stripe->chunk[p].bl + rw)
121 #define BL_CHUNK(chunk, rw) (chunk->bl + rw)
122
123 /* Page list reference. */
124 #define PL(stripe, p) (stripe->obj[p].pl)
125 /* END: structure access macros. */
126
127 /* Factor out to dm-bio-list.h */
128 static inline void bio_list_push(struct bio_list *bl, struct bio *bio)
129 {
130 bio->bi_next = bl->head;
131 bl->head = bio;
132
133 if (!bl->tail)
134 bl->tail = bio;
135 }
136
137 /* Factor out to dm.h */
138 #define TI_ERR_RET(str, ret) \
139         do { ti->error = str; return ret; } while (0)
140 #define TI_ERR(str) TI_ERR_RET(str, -EINVAL)
141
142 /* Macro to define IO flag access inline functions. */
143 #define BITOPS(name, what, var, flag) \
144 static inline int TestClear ## name ## what(struct var *v) \
145 { return test_and_clear_bit(flag, &v->io.flags); } \
146 static inline int TestSet ## name ## what(struct var *v) \
147 { return test_and_set_bit(flag, &v->io.flags); } \
148 static inline void Clear ## name ## what(struct var *v) \
149 { clear_bit(flag, &v->io.flags); } \
150 static inline void Set ## name ## what(struct var *v) \
151 { set_bit(flag, &v->io.flags); } \
152 static inline int name ## what(struct var *v) \
153 { return test_bit(flag, &v->io.flags); }
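
/*
 * Illustrative expansion (editor's sketch, not part of the original driver):
 * a line such as BITOPS(Chunk, Dirty, stripe_chunk, CHUNK_DIRTY) below
 * generates one TestClear/TestSet/Clear/Set/test accessor per flag, e.g.
 *
 *	static inline int TestClearChunkDirty(struct stripe_chunk *v)
 *	{ return test_and_clear_bit(CHUNK_DIRTY, &v->io.flags); }
 *	static inline void SetChunkDirty(struct stripe_chunk *v)
 *	{ set_bit(CHUNK_DIRTY, &v->io.flags); }
 *	static inline int ChunkDirty(struct stripe_chunk *v)
 *	{ return test_bit(CHUNK_DIRTY, &v->io.flags); }
 */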
154
155 /*-----------------------------------------------------------------
156 * Stripe cache
157 *
158 * Cache for all reads and writes to raid sets (operational or degraded)
159 *
160 * We need to run all data to and from a RAID set through this cache,
161 * because parity chunks need to get calculated from data chunks
162 * or, in the degraded/resynchronization case, missing chunks need
163 * to be reconstructed using the other chunks of the stripe.
164 *---------------------------------------------------------------*/
165 /* A chunk within a stripe (holds bios hanging off). */
166 /* IO status flags for chunks of a stripe. */
167 enum chunk_flags {
168 CHUNK_DIRTY, /* Pages of chunk dirty; need writing. */
169 CHUNK_ERROR, /* IO error on any chunk page. */
170 CHUNK_IO, /* Allow/prohibit IO on chunk pages. */
171 CHUNK_LOCKED, /* Chunk pages locked during IO. */
172 CHUNK_MUST_IO, /* Chunk must io. */
173 CHUNK_UNLOCK, /* Enforce chunk unlock. */
174 CHUNK_UPTODATE, /* Chunk pages are uptodate. */
175 };
176
177 /*
178 * This does not work anymore with __REQ_* values being enums
179 *
180 #if READ != 0 || WRITE != 1
181 #error dm-raid45: READ/WRITE != 0/1 used as index!!!
182 #endif
183 */
184
185 enum bl_type {
186 WRITE_QUEUED = WRITE + 1,
187 WRITE_MERGED,
188 NR_BL_TYPES, /* Must be last one! */
189 };
190 struct stripe_chunk {
191 atomic_t cnt; /* Reference count. */
192 struct stripe *stripe; /* Backpointer to stripe for endio(). */
193 /* Bio lists for reads, writes, and writes merged. */
194 struct bio_list bl[NR_BL_TYPES];
195 struct {
196 unsigned long flags; /* IO status flags. */
197 } io;
198 };
199
200 /* Define chunk bit operations. */
201 BITOPS(Chunk, Dirty, stripe_chunk, CHUNK_DIRTY)
202 BITOPS(Chunk, Error, stripe_chunk, CHUNK_ERROR)
203 BITOPS(Chunk, Io, stripe_chunk, CHUNK_IO)
204 BITOPS(Chunk, Locked, stripe_chunk, CHUNK_LOCKED)
205 BITOPS(Chunk, MustIo, stripe_chunk, CHUNK_MUST_IO)
206 BITOPS(Chunk, Unlock, stripe_chunk, CHUNK_UNLOCK)
207 BITOPS(Chunk, Uptodate, stripe_chunk, CHUNK_UPTODATE)
208
209 /*
210 * Stripe linked list indexes. Keep order, because the stripe
211 * and the stripe cache rely on the first 3!
212 */
213 enum list_types {
214 LIST_FLUSH, /* Stripes to flush for io. */
215 LIST_ENDIO, /* Stripes to endio. */
216 LIST_LRU, /* Least recently used stripes. */
217 SC_NR_LISTS, /* # of lists in stripe cache. */
218 LIST_HASH = SC_NR_LISTS, /* Hashed stripes. */
219 LIST_RECOVER = LIST_HASH, /* For recovery type stripes only. */
220 STRIPE_NR_LISTS,/* To size array in struct stripe. */
221 };
222
223 /* Addressing region recovery. */
224 struct recover_addr {
225 struct dm_region *reg; /* Actual region to recover. */
226 sector_t pos; /* Position within region to recover. */
227 sector_t end; /* End of region to recover. */
228 };
229
230 /* A stripe: the io object to handle all reads and writes to a RAID set. */
231 struct stripe {
232 atomic_t cnt; /* Reference count. */
233 struct stripe_cache *sc; /* Backpointer to stripe cache. */
234
235 /*
236 * 4 linked lists:
237 * o io list to flush io
238 * o endio list
239 * o LRU list to put stripes w/o reference count on
240 * o stripe cache hash
241 */
242 struct list_head lists[STRIPE_NR_LISTS];
243
244 sector_t key; /* Hash key. */
245 region_t region; /* Region stripe is mapped to. */
246
247 struct {
248 unsigned long flags; /* Stripe state flags (see below). */
249
250 /*
251 * Pending ios in flight:
252 *
253 * used to control move of stripe to endio list
254 */
255 atomic_t pending;
256
257 /* Sectors to read and write for multi page stripe sets. */
258 unsigned size;
259 } io;
260
261 /* Address region recovery. */
262 struct recover_addr *recover;
263
264 /* Lock on stripe (Future: for clustering). */
265 void *lock;
266
267 struct {
268 unsigned short parity; /* Parity chunk index. */
269 short recover; /* Recovery chunk index. */
270 } idx;
271
272 /*
273 * This stripe's memory cache object (dm-mem-cache);
274 * i.e. the io chunk pages.
275 */
276 struct dm_mem_cache_object *obj;
277
278         /* Array of stripe chunks (dynamically allocated). */
279 struct stripe_chunk chunk[0];
280 };
281
282 /* States stripes can be in (flags field). */
283 enum stripe_states {
284 STRIPE_ERROR, /* io error on stripe. */
285 STRIPE_MERGED, /* Writes got merged to be written. */
286 STRIPE_RBW, /* Read-before-write stripe. */
287         STRIPE_RECONSTRUCT,     /* Reconstruction of a missing chunk required. */
288         STRIPE_RECONSTRUCTED,   /* Missing chunk has been reconstructed. */
289 STRIPE_RECOVER, /* Stripe used for RAID set recovery. */
290 };
291
292 /* Define stripe bit operations. */
293 BITOPS(Stripe, Error, stripe, STRIPE_ERROR)
294 BITOPS(Stripe, Merged, stripe, STRIPE_MERGED)
295 BITOPS(Stripe, RBW, stripe, STRIPE_RBW)
296 BITOPS(Stripe, Reconstruct, stripe, STRIPE_RECONSTRUCT)
297 BITOPS(Stripe, Reconstructed, stripe, STRIPE_RECONSTRUCTED)
298 BITOPS(Stripe, Recover, stripe, STRIPE_RECOVER)
299
300 /* A stripe hash. */
301 struct stripe_hash {
302 struct list_head *hash;
303 unsigned buckets;
304 unsigned mask;
305 unsigned prime;
306 unsigned shift;
307 };
308
309 enum sc_lock_types {
310 LOCK_ENDIO, /* Protect endio list. */
311 LOCK_LRU, /* Protect LRU list. */
312 NR_LOCKS, /* To size array in struct stripe_cache. */
313 };
314
315 /* A stripe cache. */
316 struct stripe_cache {
317 /* Stripe hash. */
318 struct stripe_hash hash;
319
320 spinlock_t locks[NR_LOCKS]; /* Locks to protect lists. */
321
322 /* Stripes with io to flush, stripes to endio and LRU lists. */
323 struct list_head lists[SC_NR_LISTS];
324
325 /* Slab cache to allocate stripes from. */
326 struct {
327 struct kmem_cache *cache; /* Cache itself. */
328 char name[32]; /* Unique name. */
329 } kc;
330
331 struct dm_io_client *dm_io_client; /* dm-io client resource context. */
332
333 /* dm-mem-cache client resource context. */
334 struct dm_mem_cache_client *mem_cache_client;
335
336 int stripes_parm; /* # stripes parameter from constructor. */
337 atomic_t stripes; /* actual # of stripes in cache. */
338 atomic_t stripes_to_set; /* # of stripes to resize cache to. */
339 atomic_t stripes_last; /* last # of stripes in cache. */
340 atomic_t active_stripes; /* actual # of active stripes in cache. */
341
342 /* REMOVEME: */
343         atomic_t active_stripes_max; /* maximum # of active stripes in cache. */
344 };
345
346 /* Flag specs for raid_dev. */
347 enum raid_dev_flags {
348 DEV_FAILED, /* Device failed. */
349 DEV_IO_QUEUED, /* Io got queued to device. */
350 };
351
352 /* The raid device in a set. */
353 struct raid_dev {
354 struct dm_dev *dev;
355 sector_t start; /* Offset to map to. */
356 struct { /* Using struct to be able to BITOPS(). */
357 unsigned long flags; /* raid_dev_flags. */
358 } io;
359 };
360
361 BITOPS(Dev, Failed, raid_dev, DEV_FAILED)
362 BITOPS(Dev, IoQueued, raid_dev, DEV_IO_QUEUED)
363
364 /* Flags spec for raid_set. */
365 enum raid_set_flags {
366 RS_CHECK_OVERWRITE, /* Check for chunk overwrites. */
367 RS_DEAD, /* RAID set inoperational. */
368 RS_DEGRADED, /* Io errors on RAID device. */
369 RS_DEVEL_STATS, /* REMOVEME: display status information. */
370 RS_RECOVER, /* Do recovery. */
371 RS_RECOVERY_BANDWIDTH, /* Allow recovery bandwidth (delayed bios). */
372 RS_SC_BUSY, /* Stripe cache busy -> send an event. */
373 RS_SUSPEND, /* Suspend RAID set. */
374 };
375
376 /* REMOVEME: devel stats counters. */
377 enum stats_types {
378 S_BIOS_READ,
379 S_BIOS_ADDED_READ,
380 S_BIOS_ENDIO_READ,
381 S_BIOS_WRITE,
382 S_BIOS_ADDED_WRITE,
383 S_BIOS_ENDIO_WRITE,
384 S_CAN_MERGE,
385 S_CANT_MERGE,
386 S_CONGESTED,
387 S_DM_IO_READ,
388 S_DM_IO_WRITE,
389 S_BANDWIDTH,
390 S_BARRIER,
391 S_BIO_COPY_PL_NEXT,
392 S_DEGRADED,
393 S_DELAYED_BIOS,
394 S_FLUSHS,
395 S_HITS_1ST,
396 S_IOS_POST,
397 S_INSCACHE,
398 S_MAX_LOOKUP,
399 S_CHUNK_LOCKED,
400 S_NO_BANDWIDTH,
401 S_NOT_CONGESTED,
402 S_NO_RW,
403 S_NOSYNC,
404 S_OVERWRITE,
405 S_PROHIBITCHUNKIO,
406 S_RECONSTRUCT_EI,
407 S_RECONSTRUCT_DEV,
408 S_RECONSTRUCT_SET,
409 S_RECONSTRUCTED,
410 S_REQUEUE,
411 S_STRIPE_ERROR,
412 S_SUM_DELAYED_BIOS,
413 S_XORS,
414 S_NR_STATS, /* # of stats counters. Must be last! */
415 };
416
417 /* Status type -> string mappings. */
418 struct stats_map {
419 const enum stats_types type;
420 const char *str;
421 };
422
423 static struct stats_map stats_map[] = {
424 { S_BIOS_READ, "r=" },
425 { S_BIOS_ADDED_READ, "/" },
426 { S_BIOS_ENDIO_READ, "/" },
427 { S_BIOS_WRITE, " w=" },
428 { S_BIOS_ADDED_WRITE, "/" },
429 { S_BIOS_ENDIO_WRITE, "/" },
430 { S_DM_IO_READ, " rc=" },
431 { S_DM_IO_WRITE, " wc=" },
432 { S_BANDWIDTH, "\nbw=" },
433 { S_NO_BANDWIDTH, " no_bw=" },
434 { S_BARRIER, "\nbarrier=" },
435 { S_BIO_COPY_PL_NEXT, "\nbio_cp_next=" },
436 { S_CAN_MERGE, "\nmerge=" },
437 { S_CANT_MERGE, "/no_merge=" },
438 { S_CHUNK_LOCKED, "\nchunk_locked=" },
439 { S_CONGESTED, "\ncgst=" },
440 { S_NOT_CONGESTED, "/not_cgst=" },
441 { S_DEGRADED, "\ndegraded=" },
442 { S_DELAYED_BIOS, "\ndel_bios=" },
443 { S_SUM_DELAYED_BIOS, "/sum_del_bios=" },
444 { S_FLUSHS, "\nflushs=" },
445 { S_HITS_1ST, "\nhits_1st=" },
446 { S_IOS_POST, " ios_post=" },
447 { S_INSCACHE, " inscache=" },
448 { S_MAX_LOOKUP, " maxlookup=" },
449 { S_NO_RW, "\nno_rw=" },
450 { S_NOSYNC, " nosync=" },
451 { S_OVERWRITE, " ovr=" },
452 { S_PROHIBITCHUNKIO, " prhbt_io=" },
453 { S_RECONSTRUCT_EI, "\nrec_ei=" },
454 { S_RECONSTRUCT_DEV, " rec_dev=" },
455 { S_RECONSTRUCT_SET, " rec_set=" },
456 { S_RECONSTRUCTED, " rec=" },
457 { S_REQUEUE, " requeue=" },
458 { S_STRIPE_ERROR, " stripe_err=" },
459 { S_XORS, " xors=" },
460 };
461
462 /*
463 * A RAID set.
464 */
465 #define dm_rh_client dm_region_hash
466 enum count_type { IO_WORK = 0, IO_RECOVER, IO_NR_COUNT };
467 typedef void (*xor_function_t)(unsigned count, unsigned long **data);
468 struct raid_set {
469 struct dm_target *ti; /* Target pointer. */
470
471 struct {
472 unsigned long flags; /* State flags. */
473 struct mutex in_lock; /* Protects central input list below. */
474 struct bio_list in; /* Pending ios (central input list). */
475 struct bio_list work; /* ios work set. */
476 wait_queue_head_t suspendq; /* suspend synchronization. */
477 atomic_t in_process; /* counter of queued bios (suspendq). */
478 atomic_t in_process_max;/* counter of queued bios max. */
479
480 /* io work. */
481 struct workqueue_struct *wq;
482 struct delayed_work dws_do_raid; /* For main worker. */
483 struct work_struct ws_do_table_event; /* For event worker. */
484 } io;
485
486 /* Stripe locking abstraction. */
487 struct dm_raid45_locking_type *locking;
488
489 struct stripe_cache sc; /* Stripe cache for this set. */
490
491 /* Xor optimization. */
492 struct {
493 struct xor_func *f;
494 unsigned chunks;
495 unsigned speed;
496 } xor;
497
498 /* Recovery parameters. */
499 struct recover {
500 struct dm_dirty_log *dl; /* Dirty log. */
501 struct dm_rh_client *rh; /* Region hash. */
502
503 struct dm_io_client *dm_io_client; /* recovery dm-io client. */
504 /* dm-mem-cache client resource context for recovery stripes. */
505 struct dm_mem_cache_client *mem_cache_client;
506
507 struct list_head stripes; /* List of recovery stripes. */
508
509 region_t nr_regions;
510 region_t nr_regions_to_recover;
511 region_t nr_regions_recovered;
512 unsigned long start_jiffies;
513 unsigned long end_jiffies;
514
515 unsigned bandwidth; /* Recovery bandwidth [%]. */
516 unsigned bandwidth_work; /* Recovery bandwidth [factor]. */
517 unsigned bandwidth_parm; /* " constructor parm. */
518 unsigned io_size; /* recovery io size <= region size. */
519 unsigned io_size_parm; /* recovery io size ctr parameter. */
520 unsigned recovery; /* Recovery allowed/prohibited. */
521 unsigned recovery_stripes; /* # of parallel recovery stripes. */
522
523 /* recovery io throttling. */
524 atomic_t io_count[IO_NR_COUNT]; /* counter recover/regular io.*/
525 unsigned long last_jiffies;
526 } recover;
527
528 /* RAID set parameters. */
529 struct {
530 struct raid_type *raid_type; /* RAID type (eg, RAID4). */
531 unsigned raid_parms; /* # variable raid parameters. */
532
533 unsigned chunk_size; /* Sectors per chunk. */
534 unsigned chunk_size_parm;
535 unsigned chunk_shift; /* rsector chunk size shift. */
536
537 unsigned io_size; /* Sectors per io. */
538 unsigned io_size_parm;
539 unsigned io_mask; /* Mask for bio_copy_page_list(). */
540 unsigned io_inv_mask; /* Mask for raid_address(). */
541
542 sector_t sectors_per_dev; /* Sectors per device. */
543
544                 atomic_t failed_devs; /* Number of failed devices. */
545
546 /* Index of device to initialize. */
547 int dev_to_init;
548 int dev_to_init_parm;
549
550 /* Raid devices dynamically allocated. */
551 unsigned raid_devs; /* # of RAID devices below. */
552 unsigned data_devs; /* # of RAID data devices. */
553
554 int ei; /* index of failed RAID device. */
555
556 /* Index of dedicated parity device (i.e. RAID4). */
557 int pi;
558 int pi_parm; /* constructor parm for status output. */
559 } set;
560
561 /* REMOVEME: devel stats counters. */
562 atomic_t stats[S_NR_STATS];
563
564 /* Dynamically allocated temporary pointers for xor(). */
565 unsigned long **data;
566
567 /* Dynamically allocated RAID devices. Alignment? */
568 struct raid_dev dev[0];
569 };
570
571 /* Define RAID set bit operations. */
572 BITOPS(RS, Bandwidth, raid_set, RS_RECOVERY_BANDWIDTH)
573 BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE)
574 BITOPS(RS, Dead, raid_set, RS_DEAD)
575 BITOPS(RS, Degraded, raid_set, RS_DEGRADED)
576 BITOPS(RS, DevelStats, raid_set, RS_DEVEL_STATS)
577 BITOPS(RS, Recover, raid_set, RS_RECOVER)
578 BITOPS(RS, ScBusy, raid_set, RS_SC_BUSY)
579 BITOPS(RS, Suspend, raid_set, RS_SUSPEND)
580 #undef BITOPS
581
582 /*-----------------------------------------------------------------
583 * Raid-4/5 set structures.
584 *---------------------------------------------------------------*/
585 /* RAID level definitions. */
586 enum raid_level {
587 raid4,
588 raid5,
589 };
590
591 /* Symmetric/Asymmetric, Left/Right parity rotating algorithms. */
592 enum raid_algorithm {
593 none,
594 left_asym,
595 right_asym,
596 left_sym,
597 right_sym,
598 };
599
600 struct raid_type {
601 const char *name; /* RAID algorithm. */
602 const char *descr; /* Descriptor text for logging. */
603 const unsigned parity_devs; /* # of parity devices. */
604 const unsigned minimal_devs; /* minimal # of devices in set. */
605 const enum raid_level level; /* RAID level. */
606 const enum raid_algorithm algorithm; /* RAID algorithm. */
607 };
608
609 /* Supported raid types and properties. */
610 static struct raid_type raid_types[] = {
611 {"raid4", "RAID4 (dedicated parity disk)", 1, 3, raid4, none},
612 {"raid5_la", "RAID5 (left asymmetric)", 1, 3, raid5, left_asym},
613 {"raid5_ra", "RAID5 (right asymmetric)", 1, 3, raid5, right_asym},
614 {"raid5_ls", "RAID5 (left symmetric)", 1, 3, raid5, left_sym},
615 {"raid5_rs", "RAID5 (right symmetric)", 1, 3, raid5, right_sym},
616 };
617
618 /* Address as calculated by raid_address(). */
619 struct raid_address {
620 sector_t key; /* Hash key (address of stripe % chunk_size). */
621 unsigned di, pi; /* Data and parity disks index. */
622 };
623
624 /* REMOVEME: reset statistics counters. */
625 static void stats_reset(struct raid_set *rs)
626 {
627 unsigned s = S_NR_STATS;
628
629 while (s--)
630 atomic_set(rs->stats + s, 0);
631 }
632
633 /*----------------------------------------------------------------
634 * RAID set management routines.
635 *--------------------------------------------------------------*/
636 /*
637 * Begin small helper functions.
638 */
639 /* Dummy wake callback; nothing to do when called indirectly from the region hash at dm_rh_dec(). */
640 static void wake_dummy(void *context) {}
641
642 /* Return # of io reference. */
643 static int io_ref(struct raid_set *rs)
644 {
645 return atomic_read(&rs->io.in_process);
646 }
647
648 /* Get an io reference. */
649 static void io_get(struct raid_set *rs)
650 {
651 int p = atomic_inc_return(&rs->io.in_process);
652
653 if (p > atomic_read(&rs->io.in_process_max))
654 atomic_set(&rs->io.in_process_max, p); /* REMOVEME: max. */
655 }
656
657 /* Put the io reference and conditionally wake io waiters. */
658 static void io_put(struct raid_set *rs)
659 {
660 /* Intel: rebuild data corrupter? */
661 if (atomic_dec_and_test(&rs->io.in_process))
662 wake_up(&rs->io.suspendq);
663 else
664 BUG_ON(io_ref(rs) < 0);
665 }
666
667 /* Wait until all io has been processed. */
668 static void wait_ios(struct raid_set *rs)
669 {
670 wait_event(rs->io.suspendq, !io_ref(rs));
671 }
672
673 /* Queue (optionally delayed) io work. */
674 static void wake_do_raid_delayed(struct raid_set *rs, unsigned long delay)
675 {
676 queue_delayed_work(rs->io.wq, &rs->io.dws_do_raid, delay);
677 }
678
679 /* Queue io work immediately (called from region hash too). */
680 static void wake_do_raid(void *context)
681 {
682 struct raid_set *rs = context;
683
684 queue_work(rs->io.wq, &rs->io.dws_do_raid.work);
685 }
686
687 /* Calculate device sector offset. */
688 static sector_t _sector(struct raid_set *rs, struct bio *bio)
689 {
690 sector_t sector = bio->bi_sector;
691
692 sector_div(sector, rs->set.data_devs);
693 return sector;
694 }
695
696 /* Return # of active stripes in stripe cache. */
697 static int sc_active(struct stripe_cache *sc)
698 {
699 return atomic_read(&sc->active_stripes);
700 }
701
702 /* Stripe cache busy indicator. */
703 static int sc_busy(struct raid_set *rs)
704 {
705 return sc_active(&rs->sc) >
706 atomic_read(&rs->sc.stripes) - (STRIPES_MIN / 2);
707 }
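
/*
 * Example (editor's note): with the default cache of STRIPES_DEFAULT = 80
 * stripes, the set reports busy once more than 80 - STRIPES_MIN / 2 = 76
 * stripes are active.
 */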
708
709 /* Set chunk states. */
710 enum chunk_dirty_type { CLEAN, DIRTY, ERROR };
711 static void chunk_set(struct stripe_chunk *chunk, enum chunk_dirty_type type)
712 {
713 switch (type) {
714 case CLEAN:
715 ClearChunkDirty(chunk);
716 break;
717 case DIRTY:
718 SetChunkDirty(chunk);
719 break;
720 case ERROR:
721 SetChunkError(chunk);
722 SetStripeError(chunk->stripe);
723 return;
724 default:
725 BUG();
726 }
727
728 SetChunkUptodate(chunk);
729 SetChunkIo(chunk);
730 ClearChunkError(chunk);
731 }
732
733 /* Return region state for a sector. */
734 static int region_state(struct raid_set *rs, sector_t sector,
735 enum dm_rh_region_states state)
736 {
737 struct dm_rh_client *rh = rs->recover.rh;
738 region_t region = dm_rh_sector_to_region(rh, sector);
739
740 return !!(dm_rh_get_state(rh, region, 1) & state);
741 }
742
743 /*
744 * Return true in case a chunk should be read/written
745 *
746 * Conditions to read/write:
747 * o chunk not uptodate
748 * o chunk dirty
749 *
750  * Conditions to avoid io:
751  * o io already ongoing on chunk
752  * o io explicitly prohibited
753 */
754 static int chunk_io(struct stripe_chunk *chunk)
755 {
756 /* 2nd run optimization (flag set below on first run). */
757 if (TestClearChunkMustIo(chunk))
758 return 1;
759
760 /* Avoid io if prohibited or a locked chunk. */
761 if (!ChunkIo(chunk) || ChunkLocked(chunk))
762 return 0;
763
764 if (!ChunkUptodate(chunk) || ChunkDirty(chunk)) {
765 SetChunkMustIo(chunk); /* 2nd run optimization. */
766 return 1;
767 }
768
769 return 0;
770 }
771
772 /* Call a function on each chunk needing io unless device failed. */
773 static unsigned for_each_io_dev(struct stripe *stripe,
774 void (*f_io)(struct stripe *stripe, unsigned p))
775 {
776 struct raid_set *rs = RS(stripe->sc);
777 unsigned p, r = 0;
778
779 for (p = 0; p < rs->set.raid_devs; p++) {
780 if (chunk_io(CHUNK(stripe, p)) && !DevFailed(rs->dev + p)) {
781 f_io(stripe, p);
782 r++;
783 }
784 }
785
786 return r;
787 }
788
789 /*
790 * Index of device to calculate parity on.
791 *
792 * Either the parity device index *or* the selected
793 * device to init after a spare replacement.
794 */
795 static int dev_for_parity(struct stripe *stripe, int *sync)
796 {
797 struct raid_set *rs = RS(stripe->sc);
798 int r = region_state(rs, stripe->key, DM_RH_NOSYNC | DM_RH_RECOVERING);
799
800 *sync = !r;
801
802         /* Reconstruct a particular device? */
803 if (r && rs->set.dev_to_init > -1)
804 return rs->set.dev_to_init;
805 else if (rs->set.raid_type->level == raid4)
806 return rs->set.pi;
807 else if (!StripeRecover(stripe))
808 return stripe->idx.parity;
809 else
810 return -1;
811 }
812
813 /* RAID set congested function. */
814 static int rs_congested(void *congested_data, int bdi_bits)
815 {
816 int r;
817 unsigned p;
818 struct raid_set *rs = congested_data;
819
820 if (sc_busy(rs) || RSSuspend(rs))
821 r = 1;
822 else for (r = 0, p = rs->set.raid_devs; !r && p--; ) {
823 /* If any of our component devices are overloaded. */
824 struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);
825
826 r |= bdi_congested(&q->backing_dev_info, bdi_bits);
827 }
828
829 /* REMOVEME: statistics. */
830 atomic_inc(rs->stats + (r ? S_CONGESTED : S_NOT_CONGESTED));
831 return r;
832 }
833
834 /* RAID device degrade check. */
835 static void rs_check_degrade_dev(struct raid_set *rs,
836 struct stripe *stripe, unsigned p)
837 {
838 if (TestSetDevFailed(rs->dev + p))
839 return;
840
841         /* Throw an event in case of member device errors. */
842 if (atomic_inc_return(&rs->set.failed_devs) >
843 rs->set.raid_type->parity_devs &&
844 !TestSetRSDead(rs)) {
845 /* Display RAID set dead message once. */
846 unsigned p;
847 char buf[BDEVNAME_SIZE];
848
849 DMERR("FATAL: too many devices failed -> RAID set broken");
850 for (p = 0; p < rs->set.raid_devs; p++) {
851 if (DevFailed(rs->dev + p))
852 DMERR("device /dev/%s failed",
853 bdevname(rs->dev[p].dev->bdev, buf));
854 }
855 }
856
857 /* Only log the first member error. */
858 if (!TestSetRSDegraded(rs)) {
859 char buf[BDEVNAME_SIZE];
860
861 /* Store index for recovery. */
862 rs->set.ei = p;
863 DMERR("CRITICAL: %sio error on device /dev/%s "
864 "in region=%llu; DEGRADING RAID set\n",
865 stripe ? "" : "FAKED ",
866 bdevname(rs->dev[p].dev->bdev, buf),
867 (unsigned long long) (stripe ? stripe->key : 0));
868 DMERR("further device error messages suppressed");
869 }
870
871 schedule_work(&rs->io.ws_do_table_event);
872 }
873
874 /* RAID set degrade check. */
875 static void rs_check_degrade(struct stripe *stripe)
876 {
877 struct raid_set *rs = RS(stripe->sc);
878 unsigned p = rs->set.raid_devs;
879
880 while (p--) {
881 if (ChunkError(CHUNK(stripe, p)))
882 rs_check_degrade_dev(rs, stripe, p);
883 }
884 }
885
886 /* Lookup a RAID device by name or by major:minor number. */
887 static int raid_dev_lookup(struct raid_set *rs, struct raid_dev *dev_lookup)
888 {
889 unsigned p;
890 struct raid_dev *dev;
891
892 /*
893 * Must be an incremental loop, because the device array
894 * can have empty slots still on calls from raid_ctr()
895 */
896 for (dev = rs->dev, p = 0;
897 dev->dev && p < rs->set.raid_devs;
898 dev++, p++) {
899 if (dev_lookup->dev->bdev->bd_dev == dev->dev->bdev->bd_dev)
900 return p;
901 }
902
903 return -ENODEV;
904 }
905 /*
906 * End small helper functions.
907 */
908
909 /*
910 * Stripe hash functions
911 */
912 /* Initialize/destroy stripe hash. */
913 static int hash_init(struct stripe_hash *hash, unsigned stripes)
914 {
915 unsigned buckets = 2, max_buckets = stripes >> 1;
916 static unsigned hash_primes[] = {
917 /* Table of primes for hash_fn/table size optimization. */
918 1, 2, 3, 7, 13, 27, 53, 97, 193, 389, 769,
919 1543, 3079, 6151, 12289, 24593, 49157, 98317,
920 };
921
922         /* Calculate number of buckets (2^n <= stripes / 2). */
923 while (buckets < max_buckets)
924 buckets <<= 1;
925
926 /* Allocate stripe hash buckets. */
927 hash->hash = vmalloc(buckets * sizeof(*hash->hash));
928 if (!hash->hash)
929 return -ENOMEM;
930
931 hash->buckets = buckets;
932 hash->mask = buckets - 1;
933 hash->shift = ffs(buckets);
934         if (hash->shift >= ARRAY_SIZE(hash_primes))
935 hash->shift = ARRAY_SIZE(hash_primes) - 1;
936
937 BUG_ON(hash->shift < 2);
938 hash->prime = hash_primes[hash->shift];
939
940 /* Initialize buckets. */
941 while (buckets--)
942 INIT_LIST_HEAD(hash->hash + buckets);
943 return 0;
944 }
945
946 static void hash_exit(struct stripe_hash *hash)
947 {
948 if (hash->hash) {
949 vfree(hash->hash);
950 hash->hash = NULL;
951 }
952 }
953
954 static unsigned hash_fn(struct stripe_hash *hash, sector_t key)
955 {
956 return (unsigned) (((key * hash->prime) >> hash->shift) & hash->mask);
957 }
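
/*
 * Worked example (editor's sketch): for a cache of 128 stripes, hash_init()
 * grows the bucket count to 64, so mask = 63, shift = ffs(64) = 7 and
 * prime = hash_primes[7] = 97; a stripe key then maps to bucket
 * ((key * 97) >> 7) & 63.
 */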
958
959 static struct list_head *hash_bucket(struct stripe_hash *hash, sector_t key)
960 {
961 return hash->hash + hash_fn(hash, key);
962 }
963
964 /* Insert an entry into a hash. */
965 static void stripe_insert(struct stripe_hash *hash, struct stripe *stripe)
966 {
967 list_add(stripe->lists + LIST_HASH, hash_bucket(hash, stripe->key));
968 }
969
970 /* Lookup an entry in the stripe hash. */
971 static struct stripe *stripe_lookup(struct stripe_cache *sc, sector_t key)
972 {
973 unsigned look = 0;
974 struct stripe *stripe;
975 struct list_head *bucket = hash_bucket(&sc->hash, key);
976
977 list_for_each_entry(stripe, bucket, lists[LIST_HASH]) {
978 look++;
979
980 if (stripe->key == key) {
981                         /* REMOVEME: statistics. */
982 if (look > atomic_read(RS(sc)->stats + S_MAX_LOOKUP))
983 atomic_set(RS(sc)->stats + S_MAX_LOOKUP, look);
984 return stripe;
985 }
986 }
987
988 return NULL;
989 }
990
991 /* Resize the stripe cache hash on size changes. */
992 static int sc_hash_resize(struct stripe_cache *sc)
993 {
994         /* Resize indicated? */
995 if (atomic_read(&sc->stripes) != atomic_read(&sc->stripes_last)) {
996 int r;
997 struct stripe_hash hash;
998
999 r = hash_init(&hash, atomic_read(&sc->stripes));
1000 if (r)
1001 return r;
1002
1003 if (sc->hash.hash) {
1004 unsigned b = sc->hash.buckets;
1005 struct list_head *pos, *tmp;
1006
1007 /* Walk old buckets and insert into new. */
1008 while (b--) {
1009 list_for_each_safe(pos, tmp, sc->hash.hash + b)
1010 stripe_insert(&hash,
1011 list_entry(pos, struct stripe,
1012 lists[LIST_HASH]));
1013 }
1014
1015 }
1016
1017 hash_exit(&sc->hash);
1018 memcpy(&sc->hash, &hash, sizeof(sc->hash));
1019 atomic_set(&sc->stripes_last, atomic_read(&sc->stripes));
1020 }
1021
1022 return 0;
1023 }
1024 /* End stripe hash functions. */
1025
1026 /* List add, delete, push and pop functions. */
1027 /* Delete a list entry if it is on a list. */
1028 #define DEL_LIST(lh) \
1029         do { if (!list_empty(lh)) \
1030                 list_del_init(lh); } while (0)
1031
1032 /* Delete stripe from hash. */
1033 static void stripe_hash_del(struct stripe *stripe)
1034 {
1035 DEL_LIST(stripe->lists + LIST_HASH);
1036 }
1037
1038 /* Return stripe reference count. */
1039 static inline int stripe_ref(struct stripe *stripe)
1040 {
1041 return atomic_read(&stripe->cnt);
1042 }
1043
1044 static void stripe_flush_add(struct stripe *stripe)
1045 {
1046 struct stripe_cache *sc = stripe->sc;
1047 struct list_head *lh = stripe->lists + LIST_FLUSH;
1048
1049 if (!StripeReconstruct(stripe) && list_empty(lh))
1050 list_add_tail(lh, sc->lists + LIST_FLUSH);
1051 }
1052
1053 /*
1054 * Add stripe to LRU (inactive) list.
1055 *
1056 * Need lock, because of concurrent access from message interface.
1057 */
1058 static void stripe_lru_add(struct stripe *stripe)
1059 {
1060 if (!StripeRecover(stripe)) {
1061 unsigned long flags;
1062 struct list_head *lh = stripe->lists + LIST_LRU;
1063 spinlock_t *lock = stripe->sc->locks + LOCK_LRU;
1064
1065 spin_lock_irqsave(lock, flags);
1066 if (list_empty(lh))
1067 list_add_tail(lh, stripe->sc->lists + LIST_LRU);
1068 spin_unlock_irqrestore(lock, flags);
1069 }
1070 }
1071
1072 #define POP_LIST(list) \
1073 do { \
1074 if (list_empty(sc->lists + (list))) \
1075 stripe = NULL; \
1076 else { \
1077 stripe = list_first_entry(sc->lists + (list), \
1078 struct stripe, \
1079 lists[(list)]); \
1080 list_del_init(stripe->lists + (list)); \
1081 } \
1082         } while (0)
1083
1084 /* Pop an available stripe off the LRU list. */
1085 static struct stripe *stripe_lru_pop(struct stripe_cache *sc)
1086 {
1087 struct stripe *stripe;
1088 spinlock_t *lock = sc->locks + LOCK_LRU;
1089
1090 spin_lock_irq(lock);
1091 POP_LIST(LIST_LRU);
1092 spin_unlock_irq(lock);
1093
1094 return stripe;
1095 }
1096
1097 /* Pop an available stripe off the io list. */
1098 static struct stripe *stripe_io_pop(struct stripe_cache *sc)
1099 {
1100 struct stripe *stripe;
1101
1102 POP_LIST(LIST_FLUSH);
1103 return stripe;
1104 }
1105
1106 /* Push a stripe safely onto the endio list to be handled by do_endios(). */
1107 static void stripe_endio_push(struct stripe *stripe)
1108 {
1109 unsigned long flags;
1110 struct stripe_cache *sc = stripe->sc;
1111 struct list_head *stripe_list = stripe->lists + LIST_ENDIO,
1112 *sc_list = sc->lists + LIST_ENDIO;
1113 spinlock_t *lock = sc->locks + LOCK_ENDIO;
1114
1115 /* This runs in parallel with do_endios(). */
1116 spin_lock_irqsave(lock, flags);
1117 if (list_empty(stripe_list))
1118 list_add_tail(stripe_list, sc_list);
1119 spin_unlock_irqrestore(lock, flags);
1120
1121 wake_do_raid(RS(sc)); /* Wake myself. */
1122 }
1123
1124 /* Pop a stripe safely off the endio list. */
1125 static struct stripe *stripe_endio_pop(struct stripe_cache *sc)
1126 {
1127 struct stripe *stripe;
1128 spinlock_t *lock = sc->locks + LOCK_ENDIO;
1129
1130 /* This runs in parallel with endio(). */
1131 spin_lock_irq(lock);
1132         POP_LIST(LIST_ENDIO);
1133 spin_unlock_irq(lock);
1134 return stripe;
1135 }
1136 #undef POP_LIST
1137
1138 /*
1139 * Stripe cache locking functions
1140 */
1141 /* Dummy lock function for single host RAID4+5. */
1142 static void *no_lock(sector_t key, enum dm_lock_type type)
1143 {
1144 return &no_lock;
1145 }
1146
1147 /* Dummy unlock function for single host RAID4+5. */
1148 static void no_unlock(void *lock_handle)
1149 {
1150 }
1151
1152 /* No locking (for single host RAID 4+5). */
1153 static struct dm_raid45_locking_type locking_none = {
1154 .lock = no_lock,
1155 .unlock = no_unlock,
1156 };
1157
1158 /* Lock a stripe (for clustering). */
1159 static int
1160 stripe_lock(struct stripe *stripe, int rw, sector_t key)
1161 {
1162 stripe->lock = RS(stripe->sc)->locking->lock(key, rw == READ ? DM_RAID45_SHARED : DM_RAID45_EX);
1163 return stripe->lock ? 0 : -EPERM;
1164 }
1165
1166 /* Unlock a stripe (for clustering). */
1167 static void stripe_unlock(struct stripe *stripe)
1168 {
1169 RS(stripe->sc)->locking->unlock(stripe->lock);
1170 stripe->lock = NULL;
1171 }
1172
1173 /* Test io pending on stripe. */
1174 static int stripe_io_ref(struct stripe *stripe)
1175 {
1176 return atomic_read(&stripe->io.pending);
1177 }
1178
1179 static void stripe_io_get(struct stripe *stripe)
1180 {
1181 if (atomic_inc_return(&stripe->io.pending) == 1)
1182 /* REMOVEME: statistics */
1183 atomic_inc(&stripe->sc->active_stripes);
1184 else
1185 BUG_ON(stripe_io_ref(stripe) < 0);
1186 }
1187
1188 static void stripe_io_put(struct stripe *stripe)
1189 {
1190 if (atomic_dec_and_test(&stripe->io.pending)) {
1191 if (unlikely(StripeRecover(stripe)))
1192 /* Don't put recovery stripe on endio list. */
1193 wake_do_raid(RS(stripe->sc));
1194 else
1195 /* Add regular stripe to endio list and wake daemon. */
1196 stripe_endio_push(stripe);
1197
1198 /* REMOVEME: statistics */
1199 atomic_dec(&stripe->sc->active_stripes);
1200 } else
1201 BUG_ON(stripe_io_ref(stripe) < 0);
1202 }
1203
1204 /* Take stripe reference out. */
1205 static int stripe_get(struct stripe *stripe)
1206 {
1207 int r;
1208 struct list_head *lh = stripe->lists + LIST_LRU;
1209 spinlock_t *lock = stripe->sc->locks + LOCK_LRU;
1210
1211 /* Delete stripe from LRU (inactive) list if on. */
1212 spin_lock_irq(lock);
1213 DEL_LIST(lh);
1214 spin_unlock_irq(lock);
1215
1216 BUG_ON(stripe_ref(stripe) < 0);
1217
1218 /* Lock stripe on first reference */
1219 r = (atomic_inc_return(&stripe->cnt) == 1) ?
1220 stripe_lock(stripe, WRITE, stripe->key) : 0;
1221
1222 return r;
1223 }
1224 #undef DEL_LIST
1225
1226 /* Return references on a chunk. */
1227 static int chunk_ref(struct stripe_chunk *chunk)
1228 {
1229 return atomic_read(&chunk->cnt);
1230 }
1231
1232 /* Take out reference on a chunk. */
1233 static int chunk_get(struct stripe_chunk *chunk)
1234 {
1235 return atomic_inc_return(&chunk->cnt);
1236 }
1237
1238 /* Drop reference on a chunk. */
1239 static void chunk_put(struct stripe_chunk *chunk)
1240 {
1241 BUG_ON(atomic_dec_return(&chunk->cnt) < 0);
1242 }
1243
1244 /*
1245 * Drop reference on a stripe.
1246 *
1247 * Move it to list of LRU stripes if zero.
1248 */
1249 static void stripe_put(struct stripe *stripe)
1250 {
1251 if (atomic_dec_and_test(&stripe->cnt)) {
1252 BUG_ON(stripe_io_ref(stripe));
1253 stripe_unlock(stripe);
1254 } else
1255 BUG_ON(stripe_ref(stripe) < 0);
1256 }
1257
1258 /* Helper needed by for_each_io_dev(). */
1259 static void stripe_get_references(struct stripe *stripe, unsigned p)
1260 {
1261
1262 /*
1263 * Another one to reference the stripe in
1264 * order to protect vs. LRU list moves.
1265 */
1266 io_get(RS(stripe->sc)); /* Global io references. */
1267 stripe_get(stripe);
1268 stripe_io_get(stripe); /* One for each chunk io. */
1269 }
1270
1271 /* Helper for endio() to put all taken references. */
1272 static void stripe_put_references(struct stripe *stripe)
1273 {
1274 stripe_io_put(stripe); /* One for each chunk io. */
1275 stripe_put(stripe);
1276 io_put(RS(stripe->sc));
1277 }
1278
1279 /*
1280 * Stripe cache functions.
1281 */
1282 /*
1283 * Invalidate all chunks (i.e. their pages) of a stripe.
1284 *
1285 * I only keep state for the whole chunk.
1286 */
1287 static inline void stripe_chunk_invalidate(struct stripe_chunk *chunk)
1288 {
1289 chunk->io.flags = 0;
1290 }
1291
1292 static void
1293 stripe_chunks_invalidate(struct stripe *stripe)
1294 {
1295 unsigned p = RS(stripe->sc)->set.raid_devs;
1296
1297 while (p--)
1298 stripe_chunk_invalidate(CHUNK(stripe, p));
1299 }
1300
1301 /* Prepare stripe for (re)use. */
1302 static void stripe_invalidate(struct stripe *stripe)
1303 {
1304 stripe->io.flags = 0;
1305 stripe->idx.parity = stripe->idx.recover = -1;
1306 stripe_chunks_invalidate(stripe);
1307 }
1308
1309 /*
1310 * Allow io on all chunks of a stripe.
1311 * If not set, IO will not occur; i.e. it's prohibited.
1312 *
1313 * Actual IO submission for allowed chunks depends
1314 * on their !uptodate or dirty state.
1315 */
1316 static void stripe_allow_io(struct stripe *stripe)
1317 {
1318 unsigned p = RS(stripe->sc)->set.raid_devs;
1319
1320 while (p--)
1321 SetChunkIo(CHUNK(stripe, p));
1322 }
1323
1324 /* Initialize a stripe. */
1325 static void stripe_init(struct stripe_cache *sc, struct stripe *stripe)
1326 {
1327 unsigned i, p = RS(sc)->set.raid_devs;
1328
1329 /* Work all io chunks. */
1330 while (p--) {
1331 struct stripe_chunk *chunk = CHUNK(stripe, p);
1332
1333 atomic_set(&chunk->cnt, 0);
1334 chunk->stripe = stripe;
1335 i = ARRAY_SIZE(chunk->bl);
1336 while (i--)
1337 bio_list_init(chunk->bl + i);
1338 }
1339
1340 stripe->sc = sc;
1341
1342
1343 i = ARRAY_SIZE(stripe->lists);
1344 while (i--)
1345 INIT_LIST_HEAD(stripe->lists + i);
1346
1347 stripe->io.size = RS(sc)->set.io_size;
1348 atomic_set(&stripe->cnt, 0);
1349 atomic_set(&stripe->io.pending, 0);
1350 stripe_invalidate(stripe);
1351 }
1352
1353 /* Number of pages per chunk. */
1354 static inline unsigned chunk_pages(unsigned sectors)
1355 {
1356 return dm_div_up(sectors, SECTORS_PER_PAGE);
1357 }
1358
1359 /* Number of pages per stripe. */
1360 static inline unsigned stripe_pages(struct raid_set *rs, unsigned io_size)
1361 {
1362 return chunk_pages(io_size) * rs->set.raid_devs;
1363 }
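
/*
 * Example (editor's note, assuming 4 KiB pages, i.e. SECTORS_PER_PAGE = 8):
 * an io size of 64 sectors needs chunk_pages(64) = 8 pages per chunk, so a
 * 5 device set uses stripe_pages() = 40 pages per stripe.
 */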
1364
1365 /* Initialize part of page_list (recovery). */
1366 static void stripe_zero_pl_part(struct stripe *stripe, int p,
1367 unsigned start, unsigned count)
1368 {
1369 unsigned o = start / SECTORS_PER_PAGE, pages = chunk_pages(count);
1370 /* Get offset into the page_list. */
1371 struct page_list *pl = pl_elem(PL(stripe, p), o);
1372
1373 BUG_ON(!pl);
1374 while (pl && pages--) {
1375 BUG_ON(!pl->page);
1376 memset(page_address(pl->page), 0, PAGE_SIZE);
1377 pl = pl->next;
1378 }
1379 }
1380
1381 /* Initialize parity chunk of stripe. */
1382 static void stripe_zero_chunk(struct stripe *stripe, int p)
1383 {
1384 if (p > -1)
1385 stripe_zero_pl_part(stripe, p, 0, stripe->io.size);
1386 }
1387
1388 /* Return dynamic stripe structure size. */
1389 static size_t stripe_size(struct raid_set *rs)
1390 {
1391 return sizeof(struct stripe) +
1392 rs->set.raid_devs * sizeof(struct stripe_chunk);
1393 }
1394
1395 /* Allocate a stripe and its memory object. */
1396 /* XXX adjust to cope with stripe cache and recovery stripe caches. */
1397 enum grow { SC_GROW, SC_KEEP };
1398 static struct stripe *stripe_alloc(struct stripe_cache *sc,
1399 struct dm_mem_cache_client *mc,
1400 enum grow grow)
1401 {
1402 int r;
1403 struct stripe *stripe;
1404
1405 stripe = kmem_cache_zalloc(sc->kc.cache, GFP_KERNEL);
1406 if (stripe) {
1407 /* Grow the dm-mem-cache by one object. */
1408 if (grow == SC_GROW) {
1409 r = dm_mem_cache_grow(mc, 1);
1410 if (r)
1411 goto err_free;
1412 }
1413
1414 stripe->obj = dm_mem_cache_alloc(mc);
1415 if (!stripe->obj)
1416 goto err_shrink;
1417
1418 stripe_init(sc, stripe);
1419 }
1420
1421 return stripe;
1422
1423 err_shrink:
1424 if (grow == SC_GROW)
1425 dm_mem_cache_shrink(mc, 1);
1426 err_free:
1427 kmem_cache_free(sc->kc.cache, stripe);
1428 return NULL;
1429 }
1430
1431 /*
1432  * Free a stripe's memory object, shrink the
1433 * memory cache and free the stripe itself.
1434 */
1435 static void stripe_free(struct stripe *stripe, struct dm_mem_cache_client *mc)
1436 {
1437 dm_mem_cache_free(mc, stripe->obj);
1438 dm_mem_cache_shrink(mc, 1);
1439 kmem_cache_free(stripe->sc->kc.cache, stripe);
1440 }
1441
1442 /* Free the recovery stripe. */
1443 static void stripe_recover_free(struct raid_set *rs)
1444 {
1445 struct recover *rec = &rs->recover;
1446 struct dm_mem_cache_client *mc;
1447
1448 mc = rec->mem_cache_client;
1449 rec->mem_cache_client = NULL;
1450 if (mc) {
1451 struct stripe *stripe;
1452
1453 while (!list_empty(&rec->stripes)) {
1454 stripe = list_first_entry(&rec->stripes, struct stripe,
1455 lists[LIST_RECOVER]);
1456 list_del(stripe->lists + LIST_RECOVER);
1457 kfree(stripe->recover);
1458 stripe_free(stripe, mc);
1459 }
1460
1461 dm_mem_cache_client_destroy(mc);
1462 dm_io_client_destroy(rec->dm_io_client);
1463 rec->dm_io_client = NULL;
1464 }
1465 }
1466
1467 /* Grow stripe cache. */
1468 static int sc_grow(struct stripe_cache *sc, unsigned stripes, enum grow grow)
1469 {
1470 int r = 0;
1471
1472 /* Try to allocate this many (additional) stripes. */
1473 while (stripes--) {
1474 struct stripe *stripe =
1475 stripe_alloc(sc, sc->mem_cache_client, grow);
1476
1477 if (likely(stripe)) {
1478 stripe_lru_add(stripe);
1479 atomic_inc(&sc->stripes);
1480 } else {
1481 r = -ENOMEM;
1482 break;
1483 }
1484 }
1485
1486 return r ? r : sc_hash_resize(sc);
1487 }
1488
1489 /* Shrink stripe cache. */
1490 static int sc_shrink(struct stripe_cache *sc, unsigned stripes)
1491 {
1492 int r = 0;
1493
1494 /* Try to get unused stripe from LRU list. */
1495 while (stripes--) {
1496 struct stripe *stripe;
1497
1498 stripe = stripe_lru_pop(sc);
1499 if (stripe) {
1500 /* An LRU stripe may never have ios pending! */
1501 BUG_ON(stripe_io_ref(stripe));
1502 BUG_ON(stripe_ref(stripe));
1503 atomic_dec(&sc->stripes);
1504 /* Remove from hash if on before deletion. */
1505 stripe_hash_del(stripe);
1506 stripe_free(stripe, sc->mem_cache_client);
1507 } else {
1508 r = -ENOENT;
1509 break;
1510 }
1511 }
1512
1513 /* Check if stats are still sane. */
1514 if (atomic_read(&sc->active_stripes_max) >
1515 atomic_read(&sc->stripes))
1516 atomic_set(&sc->active_stripes_max, 0);
1517
1518 if (r)
1519 return r;
1520
1521 return atomic_read(&sc->stripes) ? sc_hash_resize(sc) : 0;
1522 }
1523
1524 /* Create stripe cache and recovery. */
1525 static int sc_init(struct raid_set *rs, unsigned stripes)
1526 {
1527 unsigned i, r, rstripes;
1528 struct stripe_cache *sc = &rs->sc;
1529 struct stripe *stripe;
1530 struct recover *rec = &rs->recover;
1531 struct mapped_device *md;
1532 struct gendisk *disk;
1533
1534 /* Initialize lists and locks. */
1535 i = ARRAY_SIZE(sc->lists);
1536 while (i--)
1537 INIT_LIST_HEAD(sc->lists + i);
1538
1539 INIT_LIST_HEAD(&rec->stripes);
1540
1541 /* Initialize endio and LRU list locks. */
1542 i = NR_LOCKS;
1543 while (i--)
1544 spin_lock_init(sc->locks + i);
1545
1546 /* Initialize atomic variables. */
1547 atomic_set(&sc->stripes, 0);
1548 atomic_set(&sc->stripes_to_set, 0);
1549 atomic_set(&sc->active_stripes, 0);
1550 atomic_set(&sc->active_stripes_max, 0); /* REMOVEME: statistics. */
1551
1552 /*
1553 * We need a runtime unique # to suffix the kmem cache name
1554 * because we'll have one for each active RAID set.
1555 */
1556 md = dm_table_get_md(rs->ti->table);
1557 disk = dm_disk(md);
1558 sprintf(sc->kc.name, "%s-%d", TARGET, disk->first_minor);
1559 dm_put(md);
1560 sc->kc.cache = kmem_cache_create(sc->kc.name, stripe_size(rs),
1561 0, 0, NULL);
1562 if (!sc->kc.cache)
1563 return -ENOMEM;
1564
1565 /* Create memory cache client context for RAID stripe cache. */
1566 sc->mem_cache_client =
1567 dm_mem_cache_client_create(stripes, rs->set.raid_devs,
1568 chunk_pages(rs->set.io_size));
1569 if (IS_ERR(sc->mem_cache_client))
1570 return PTR_ERR(sc->mem_cache_client);
1571
1572 /* Create memory cache client context for RAID recovery stripe(s). */
1573 rstripes = rec->recovery_stripes;
1574 rec->mem_cache_client =
1575 dm_mem_cache_client_create(rstripes, rs->set.raid_devs,
1576 chunk_pages(rec->io_size));
1577 if (IS_ERR(rec->mem_cache_client))
1578 return PTR_ERR(rec->mem_cache_client);
1579
1580 /* Create dm-io client context for IO stripes. */
1581 sc->dm_io_client =
1582 dm_io_client_create();
1583 if (IS_ERR(sc->dm_io_client))
1584 return PTR_ERR(sc->dm_io_client);
1585
1586         /* FIXME: intermingled with stripe cache initialization. */
1587 /* Create dm-io client context for recovery stripes. */
1588 rec->dm_io_client =
1589 dm_io_client_create();
1590 if (IS_ERR(rec->dm_io_client))
1591 return PTR_ERR(rec->dm_io_client);
1592
1593 /* Allocate stripes for set recovery. */
1594 while (rstripes--) {
1595 stripe = stripe_alloc(sc, rec->mem_cache_client, SC_KEEP);
1596 if (!stripe)
1597 return -ENOMEM;
1598
1599 stripe->recover = kzalloc(sizeof(*stripe->recover), GFP_KERNEL);
1600 if (!stripe->recover) {
1601 stripe_free(stripe, rec->mem_cache_client);
1602 return -ENOMEM;
1603 }
1604
1605 SetStripeRecover(stripe);
1606 stripe->io.size = rec->io_size;
1607 list_add_tail(stripe->lists + LIST_RECOVER, &rec->stripes);
1608 /* Don't add recovery stripes to LRU list! */
1609 }
1610
1611 /*
1612          * Allocate the stripe objects from the
1613 * cache and add them to the LRU list.
1614 */
1615 r = sc_grow(sc, stripes, SC_KEEP);
1616 if (!r)
1617 atomic_set(&sc->stripes_last, stripes);
1618
1619 return r;
1620 }
1621
1622 /* Destroy the stripe cache. */
1623 static void sc_exit(struct stripe_cache *sc)
1624 {
1625 struct raid_set *rs = RS(sc);
1626
1627 if (sc->kc.cache) {
1628 stripe_recover_free(rs);
1629 BUG_ON(sc_shrink(sc, atomic_read(&sc->stripes)));
1630 kmem_cache_destroy(sc->kc.cache);
1631 sc->kc.cache = NULL;
1632
1633 if (sc->mem_cache_client && !IS_ERR(sc->mem_cache_client))
1634 dm_mem_cache_client_destroy(sc->mem_cache_client);
1635
1636 if (sc->dm_io_client && !IS_ERR(sc->dm_io_client))
1637 dm_io_client_destroy(sc->dm_io_client);
1638
1639 hash_exit(&sc->hash);
1640 }
1641 }
1642
1643 /*
1644 * Calculate RAID address
1645 *
1646 * Delivers tuple with the index of the data disk holding the chunk
1647  * in the set, the parity disk's index and the start of the stripe
1648 * within the address space of the set (used as the stripe cache hash key).
1649 */
1650 /* thx MD. */
1651 static struct raid_address *raid_address(struct raid_set *rs, sector_t sector,
1652 struct raid_address *addr)
1653 {
1654 sector_t stripe, tmp;
1655
1656 /*
1657 * chunk_number = sector / chunk_size
1658 * stripe_number = chunk_number / data_devs
1659 * di = stripe % data_devs;
1660 */
1661 stripe = sector >> rs->set.chunk_shift;
1662 addr->di = sector_div(stripe, rs->set.data_devs);
1663
1664 switch (rs->set.raid_type->level) {
1665 case raid4:
1666 addr->pi = rs->set.pi;
1667 goto check_shift_di;
1668 case raid5:
1669 tmp = stripe;
1670 addr->pi = sector_div(tmp, rs->set.raid_devs);
1671
1672 switch (rs->set.raid_type->algorithm) {
1673 case left_asym: /* Left asymmetric. */
1674 addr->pi = rs->set.data_devs - addr->pi;
1675 case right_asym: /* Right asymmetric. */
1676 check_shift_di:
1677 if (addr->di >= addr->pi)
1678 addr->di++;
1679 break;
1680 case left_sym: /* Left symmetric. */
1681 addr->pi = rs->set.data_devs - addr->pi;
1682 case right_sym: /* Right symmetric. */
1683 addr->di = (addr->pi + addr->di + 1) %
1684 rs->set.raid_devs;
1685 break;
1686 case none: /* Ain't happen: RAID4 algorithm placeholder. */
1687 BUG();
1688 }
1689 }
1690
1691 /*
1692          * Start offset of the stripe's chunk on any single device of the RAID
1693 * set, adjusted in case io size differs from chunk size.
1694 */
1695 addr->key = (stripe << rs->set.chunk_shift) +
1696 (sector & rs->set.io_inv_mask);
1697 return addr;
1698 }
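
/*
 * Worked example (editor's sketch with assumed numbers): on a 3 device
 * raid5_ls set (data_devs = 2) with a chunk size of 8 sectors
 * (chunk_shift = 3), sector 100 gives stripe = 100 >> 3 = 12, hence
 * di = 12 % 2 = 0 and stripe = 6; pi = 6 % 3 = 0 becomes
 * data_devs - 0 = 2 (left symmetric) and di = (2 + 0 + 1) % 3 = 0.
 * Assuming io size == chunk size (so the io_inv_mask term is 0), the
 * stripe cache key is 6 << 3 = 48.
 */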
1699
1700 /*
1701  * Copy data between stripe pages and bio vectors.
1702 *
1703 * Pay attention to data alignment in stripe and bio pages.
1704 */
1705 static void bio_copy_page_list(int rw, struct stripe *stripe,
1706 struct page_list *pl, struct bio *bio)
1707 {
1708 unsigned i, page_offset;
1709 void *page_addr;
1710 struct raid_set *rs = RS(stripe->sc);
1711 struct bio_vec *bv;
1712
1713 /* Get start page in page list for this sector. */
1714 i = (bio->bi_sector & rs->set.io_mask) / SECTORS_PER_PAGE;
1715 pl = pl_elem(pl, i);
1716 BUG_ON(!pl);
1717 BUG_ON(!pl->page);
1718
1719 page_addr = page_address(pl->page);
1720 page_offset = to_bytes(bio->bi_sector & (SECTORS_PER_PAGE - 1));
1721
1722         /* Walk all segments and copy data between bio_vecs and pages. */
1723 bio_for_each_segment(bv, bio, i) {
1724 int len = bv->bv_len, size;
1725 unsigned bio_offset = 0;
1726 void *bio_addr = __bio_kmap_atomic(bio, i, KM_USER0);
1727 redo:
1728 size = (page_offset + len > PAGE_SIZE) ?
1729 PAGE_SIZE - page_offset : len;
1730
1731 if (rw == READ)
1732 memcpy(bio_addr + bio_offset,
1733 page_addr + page_offset, size);
1734 else
1735 memcpy(page_addr + page_offset,
1736 bio_addr + bio_offset, size);
1737
1738 page_offset += size;
1739 if (page_offset == PAGE_SIZE) {
1740 /*
1741 * We reached the end of the chunk page ->
1742 * need to refer to the next one to copy more data.
1743 */
1744 len -= size;
1745 if (len) {
1746 /* Get next page. */
1747 pl = pl->next;
1748 BUG_ON(!pl);
1749 BUG_ON(!pl->page);
1750 page_addr = page_address(pl->page);
1751 page_offset = 0;
1752 bio_offset += size;
1753 /* REMOVEME: statistics. */
1754 atomic_inc(rs->stats + S_BIO_COPY_PL_NEXT);
1755 goto redo;
1756 }
1757 }
1758
1759 __bio_kunmap_atomic(bio_addr, KM_USER0);
1760 }
1761 }
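
/*
 * Example (editor's note, 4 KiB pages assumed): a 4096 byte bio segment
 * starting at sector 4 of a chunk begins at page_offset 2048, so the loop
 * above copies 2048 bytes, advances to the next page of the chunk's page
 * list and copies the remaining 2048 bytes from bio_offset 2048.
 */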
1762
1763 /*
1764 * Xor optimization macros.
1765 */
1766 /* Xor data pointer declaration and initialization macros. */
1767 #define DECLARE_2 unsigned long *d0 = data[0], *d1 = data[1]
1768 #define DECLARE_3 DECLARE_2, *d2 = data[2]
1769 #define DECLARE_4 DECLARE_3, *d3 = data[3]
1770 #define DECLARE_5 DECLARE_4, *d4 = data[4]
1771 #define DECLARE_6 DECLARE_5, *d5 = data[5]
1772 #define DECLARE_7 DECLARE_6, *d6 = data[6]
1773 #define DECLARE_8 DECLARE_7, *d7 = data[7]
1774
1775 /* Xor unroll macros. */
1776 #define D2(n) d0[n] = d0[n] ^ d1[n]
1777 #define D3(n) D2(n) ^ d2[n]
1778 #define D4(n) D3(n) ^ d3[n]
1779 #define D5(n) D4(n) ^ d4[n]
1780 #define D6(n) D5(n) ^ d5[n]
1781 #define D7(n) D6(n) ^ d6[n]
1782 #define D8(n) D7(n) ^ d7[n]
1783
1784 #define X_2(macro, offset) macro(offset); macro(offset + 1);
1785 #define X_4(macro, offset) X_2(macro, offset); X_2(macro, offset + 2);
1786 #define X_8(macro, offset) X_4(macro, offset); X_4(macro, offset + 4);
1787 #define X_16(macro, offset) X_8(macro, offset); X_8(macro, offset + 8);
1788 #define X_32(macro, offset) X_16(macro, offset); X_16(macro, offset + 16);
1789 #define X_64(macro, offset) X_32(macro, offset); X_32(macro, offset + 32);
1790
1791 /* Define a _xor_#chunks_#xors_per_run() function. */
1792 #define _XOR(chunks, xors_per_run) \
1793 static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \
1794 { \
1795 unsigned end = XOR_SIZE / sizeof(data[0]), i; \
1796 DECLARE_ ## chunks; \
1797 \
1798 for (i = 0; i < end; i += xors_per_run) { \
1799 X_ ## xors_per_run(D ## chunks, i); \
1800 } \
1801 }
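
/*
 * Editor's sketch of what e.g. _XOR(2, 8) expands to (xor of two chunks,
 * unrolled eight 'unsigned long' words per loop iteration):
 *
 *	static void _xor2_8(unsigned long **data)
 *	{
 *		unsigned end = XOR_SIZE / sizeof(data[0]), i;
 *		unsigned long *d0 = data[0], *d1 = data[1];
 *
 *		for (i = 0; i < end; i += 8) {
 *			d0[i] = d0[i] ^ d1[i];
 *			d0[i + 1] = d0[i + 1] ^ d1[i + 1];
 *			...
 *			d0[i + 7] = d0[i + 7] ^ d1[i + 7];
 *		}
 *	}
 */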
1802
1803 /* Define xor functions for 2 - 8 chunks and xors per run. */
1804 #define MAKE_XOR_PER_RUN(xors_per_run) \
1805 _XOR(2, xors_per_run); _XOR(3, xors_per_run); \
1806 _XOR(4, xors_per_run); _XOR(5, xors_per_run); \
1807 _XOR(6, xors_per_run); _XOR(7, xors_per_run); \
1808 _XOR(8, xors_per_run);
1809
1810 MAKE_XOR_PER_RUN(8) /* Define _xor_*_8() functions. */
1811 MAKE_XOR_PER_RUN(16) /* Define _xor_*_16() functions. */
1812 MAKE_XOR_PER_RUN(32) /* Define _xor_*_32() functions. */
1813 MAKE_XOR_PER_RUN(64) /* Define _xor_*_64() functions. */
1814
1815 #define MAKE_XOR(xors_per_run) \
1816 struct { \
1817 void (*f)(unsigned long **); \
1818 } static xor_funcs ## xors_per_run[] = { \
1819 { NULL }, /* NULL pointers to optimize indexing in xor(). */ \
1820 { NULL }, \
1821 { _xor2_ ## xors_per_run }, \
1822 { _xor3_ ## xors_per_run }, \
1823 { _xor4_ ## xors_per_run }, \
1824 { _xor5_ ## xors_per_run }, \
1825 { _xor6_ ## xors_per_run }, \
1826 { _xor7_ ## xors_per_run }, \
1827 { _xor8_ ## xors_per_run }, \
1828 }; \
1829 \
1830 static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \
1831 { \
1832 /* Call respective function for amount of chunks. */ \
1833 xor_funcs ## xors_per_run[n].f(data); \
1834 }
1835
1836 /* Define xor_8() - xor_64 functions. */
1837 MAKE_XOR(8)
1838 MAKE_XOR(16)
1839 MAKE_XOR(32)
1840 MAKE_XOR(64)
1841
1842 /* Maximum number of chunks which can be xor'ed in one go. */
1843 #define XOR_CHUNKS_MAX (ARRAY_SIZE(xor_funcs8) - 1)
1844
1845 static void xor_blocks_wrapper(unsigned n, unsigned long **data)
1846 {
1847 BUG_ON(n < 2 || n > MAX_XOR_BLOCKS + 1);
1848 xor_blocks(n - 1, XOR_SIZE, (void *) data[0], (void **) data + 1);
1849 }
1850
1851 struct xor_func {
1852 xor_function_t f;
1853 const char *name;
1854 } static xor_funcs[] = {
1855 { xor_8, "xor_8" },
1856 { xor_16, "xor_16" },
1857 { xor_32, "xor_32" },
1858 { xor_64, "xor_64" },
1859 { xor_blocks_wrapper, "xor_blocks" },
1860 };
1861
1862 /*
1863  * Check if a chunk has to be xored in/out:
1864 *
1865 * o if writes are queued
1866 * o if writes are merged
1867 * o if stripe is to be reconstructed
1868 * o if recovery stripe
1869 */
1870 static inline int chunk_must_xor(struct stripe_chunk *chunk)
1871 {
1872 if (ChunkUptodate(chunk)) {
1873 BUG_ON(!bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)) &&
1874 !bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED)));
1875
1876 if (!bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)) ||
1877 !bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED)))
1878 return 1;
1879
1880 if (StripeReconstruct(chunk->stripe) ||
1881 StripeRecover(chunk->stripe))
1882 return 1;
1883 }
1884
1885 return 0;
1886 }
1887
1888 /*
1889  * Xor chunks of a stripe together (parity calculation / reconstruction).
1890 *
1891 * This indexes into the chunks of a stripe and their pages.
1892 *
1893 * All chunks will be xored into the indexed (@pi)
1894 * chunk in maximum groups of xor.chunks.
1895 *
1896 */
1897 static void xor(struct stripe *stripe, unsigned pi, unsigned sector)
1898 {
1899 struct raid_set *rs = RS(stripe->sc);
1900 unsigned max_chunks = rs->xor.chunks, n = 1,
1901 o = sector / SECTORS_PER_PAGE, /* Offset into the page_list. */
1902 p = rs->set.raid_devs;
1903 unsigned long **d = rs->data;
1904 xor_function_t xor_f = rs->xor.f->f;
1905
1906 BUG_ON(sector > stripe->io.size);
1907
1908 /* Address of parity page to xor into. */
1909 d[0] = page_address(pl_elem(PL(stripe, pi), o)->page);
1910
1911 while (p--) {
1912 /* Preset pointers to data pages. */
1913 if (p != pi && chunk_must_xor(CHUNK(stripe, p)))
1914 d[n++] = page_address(pl_elem(PL(stripe, p), o)->page);
1915
1916 /* If max chunks -> xor. */
1917 if (n == max_chunks) {
1918 xor_f(n, d);
1919 n = 1;
1920 }
1921 }
1922
1923 /* If chunks -> xor. */
1924 if (n > 1)
1925 xor_f(n, d);
1926 }
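/*
 * Worked example for the grouping in xor() above (illustration only):
 * with rs->set.raid_devs == 8, parity index pi, all 7 remaining chunks
 * due for xor and rs->xor.chunks == 4, the loop fills d[1..3] and calls
 * xor_f(4, d) twice (folding 3 source chunks into the parity page each
 * time), then issues a final xor_f(2, d) for the last remaining chunk.
 */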
1927
1928 /* Common xor loop through all stripe page lists. */
1929 static void common_xor(struct stripe *stripe, sector_t count,
1930 unsigned off, unsigned pi)
1931 {
1932 unsigned sector;
1933
1934 BUG_ON(!count);
1935 for (sector = off; sector < count; sector += SECTORS_PER_PAGE)
1936 xor(stripe, pi, sector);
1937
1938 /* Set parity page uptodate and clean. */
1939 chunk_set(CHUNK(stripe, pi), CLEAN);
1940 atomic_inc(RS(stripe->sc)->stats + S_XORS); /* REMOVEME: statistics. */
1941 }
1942
1943 /*
1944 * Calculate parity sectors on intact stripes.
1945 *
1946 * Need to calculate the raid address for a recovery stripe, because its
1947 * chunk size differs and is typically larger than the io chunk size.
1948 */
1949 static void parity_xor(struct stripe *stripe)
1950 {
1951 struct raid_set *rs = RS(stripe->sc);
1952 unsigned chunk_size = rs->set.chunk_size, io_size = stripe->io.size,
1953 xor_size = chunk_size > io_size ? io_size : chunk_size;
1954 sector_t off;
1955
1956 /* This can be the recover stripe with a larger io size. */
1957 for (off = 0; off < io_size; off += xor_size) {
1958 /*
1959 * Recover stripe is likely bigger than regular io
1960 * ones and has no precalculated parity disk index ->
1961 * need to calculate RAID address.
1962 */
1963 if (unlikely(StripeRecover(stripe))) {
1964 struct raid_address addr;
1965
1966 raid_address(rs,
1967 (stripe->key + off) * rs->set.data_devs,
1968 &addr);
1969 stripe->idx.parity = addr.pi;
1970 stripe_zero_pl_part(stripe, addr.pi, off, xor_size);
1971 }
1972
1973 common_xor(stripe, xor_size, off, stripe->idx.parity);
1974 chunk_set(CHUNK(stripe, stripe->idx.parity), DIRTY);
1975 }
1976 }
1977
1978 /* Reconstruct missing chunk. */
1979 static void stripe_reconstruct(struct stripe *stripe)
1980 {
1981 struct raid_set *rs = RS(stripe->sc);
1982 int p = rs->set.raid_devs, pr = stripe->idx.recover;
1983
1984 BUG_ON(pr < 0);
1985
1986 /* Check if all but the chunk to be reconstructed are uptodate. */
1987 while (p--)
1988 BUG_ON(p != pr && !ChunkUptodate(CHUNK(stripe, p)));
1989
1990 /* REMOVEME: statistics. */
1991 atomic_inc(rs->stats + (RSDegraded(rs) ? S_RECONSTRUCT_EI :
1992 S_RECONSTRUCT_DEV));
1993 /* Zero chunk to be reconstructed. */
1994 stripe_zero_chunk(stripe, pr);
1995 common_xor(stripe, stripe->io.size, 0, pr);
1996 stripe->idx.recover = -1;
1997 }
1998
1999 /*
2000 * Recovery io throttling
2001 */
2002 /* Conditionally reset io counters. */
2003 static int recover_io_reset(struct raid_set *rs)
2004 {
2005 unsigned long j = jiffies;
2006
2007 /* Pay attention to jiffies overflows. */
2008 if (j > rs->recover.last_jiffies + HZ / 20 ||
2009 j < rs->recover.last_jiffies) {
2010 atomic_set(rs->recover.io_count + IO_WORK, 0);
2011 atomic_set(rs->recover.io_count + IO_RECOVER, 0);
2012 rs->recover.last_jiffies = j;
2013 return 1;
2014 }
2015
2016 return 0;
2017 }
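/*
 * Example (assuming HZ == 250): the io counters above get zeroed at
 * most every HZ / 20 == 12 jiffies (roughly 50 ms), or on a jiffies
 * wrap, so recover_bandwidth() below always compares counts sampled
 * over a short, recent window.
 */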
2018
2019 /* Count ios. */
2020 static void recover_io_count(struct stripe *stripe)
2021 {
2022 struct raid_set *rs = RS(stripe->sc);
2023
2024 recover_io_reset(rs);
2025 atomic_inc(rs->recover.io_count +
2026 (StripeRecover(stripe) ? IO_RECOVER : IO_WORK));
2027 }
2028
2029 /* Try getting a stripe either from the hash or from the LRU list. */
2030 static struct stripe *stripe_find(struct raid_set *rs,
2031 struct raid_address *addr)
2032 {
2033 int r;
2034 struct stripe_cache *sc = &rs->sc;
2035 struct stripe *stripe;
2036
2037 /* Try stripe from hash. */
2038 stripe = stripe_lookup(sc, addr->key);
2039 if (stripe) {
2040 r = stripe_get(stripe);
2041 if (r)
2042 goto get_lock_failed;
2043
2044 atomic_inc(rs->stats + S_HITS_1ST); /* REMOVEME: statistics. */
2045 } else {
2046 /* Not in hash -> try to get an LRU stripe. */
2047 stripe = stripe_lru_pop(sc);
2048 if (stripe) {
2049 /*
2050 * An LRU stripe may not be referenced
2051 * and may never have ios pending!
2052 */
2053 BUG_ON(stripe_ref(stripe));
2054 BUG_ON(stripe_io_ref(stripe));
2055
2056 /* Remove from hash if on before reuse. */
2057 stripe_hash_del(stripe);
2058
2059 /* Invalidate before reinserting with changed key. */
2060 stripe_invalidate(stripe);
2061
2062 stripe->key = addr->key;
2063 stripe->region = dm_rh_sector_to_region(rs->recover.rh,
2064 addr->key);
2065 stripe->idx.parity = addr->pi;
2066 r = stripe_get(stripe);
2067 if (r)
2068 goto get_lock_failed;
2069
2070 /* Insert stripe into the stripe hash. */
2071 stripe_insert(&sc->hash, stripe);
2072 /* REMOVEME: statistics. */
2073 atomic_inc(rs->stats + S_INSCACHE);
2074 }
2075 }
2076
2077 return stripe;
2078
2079 get_lock_failed:
2080 stripe_put(stripe);
2081 return NULL;
2082 }
2083
2084 /*
2085 * Process end io
2086 *
2087 * I need to do it here because I can't do it in interrupt context.
2088 */
2089 /* End io all bios on a bio list. */
2090 static void bio_list_endio(struct stripe *stripe, struct bio_list *bl,
2091 int p, int error)
2092 {
2093 struct raid_set *rs = RS(stripe->sc);
2094 struct bio *bio;
2095 struct page_list *pl = PL(stripe, p);
2096 struct stripe_chunk *chunk = CHUNK(stripe, p);
2097
2098 /* Update region counters. */
2099 while ((bio = bio_list_pop(bl))) {
2100 if (bio_data_dir(bio) == WRITE)
2101 /* Drop io pending count for any writes. */
2102 dm_rh_dec(rs->recover.rh, stripe->region);
2103 else if (!error)
2104 /* Copy data across. */
2105 bio_copy_page_list(READ, stripe, pl, bio);
2106
2107 bio_endio(bio, error);
2108
2109 /* REMOVEME: statistics. */
2110 atomic_inc(rs->stats + (bio_data_dir(bio) == READ ?
2111 S_BIOS_ENDIO_READ : S_BIOS_ENDIO_WRITE));
2112
2113 chunk_put(chunk);
2114 stripe_put(stripe);
2115 io_put(rs); /* Wake any suspend waiters on last bio. */
2116 }
2117 }
2118
2119 /*
2120 * End io all reads/writes on a stripe, copying
2121 * read data across from stripe to bios and
2122 * decrementing region counters for writes.
2123 *
2124 * Processing of ios depending on state:
2125 * o no chunk error -> endio ok
2126 * o degraded:
2127 * - chunk error and read -> ignore to be requeued
2128 * - chunk error and write -> endio ok
2129 * o dead (more than parity_devs failed) and chunk_error-> endio failed
2130 */
2131 static void stripe_endio(int rw, struct stripe *stripe)
2132 {
2133 struct raid_set *rs = RS(stripe->sc);
2134 unsigned p = rs->set.raid_devs;
2135 int write = (rw != READ);
2136
2137 while (p--) {
2138 struct stripe_chunk *chunk = CHUNK(stripe, p);
2139 struct bio_list *bl;
2140
2141 BUG_ON(ChunkLocked(chunk));
2142
2143 bl = BL_CHUNK(chunk, rw);
2144 if (bio_list_empty(bl))
2145 continue;
2146
2147 if (unlikely(ChunkError(chunk) || !ChunkUptodate(chunk))) {
2148 /* RAID set dead. */
2149 if (unlikely(RSDead(rs)))
2150 bio_list_endio(stripe, bl, p, -EIO);
2151 /* RAID set degraded. */
2152 else if (write)
2153 bio_list_endio(stripe, bl, p, 0);
2154 } else {
2155 BUG_ON(!RSDegraded(rs) && ChunkDirty(chunk));
2156 bio_list_endio(stripe, bl, p, 0);
2157 }
2158 }
2159 }
2160
2161 /* Fail all ios hanging off all bio lists of a stripe. */
2162 static void stripe_fail_io(struct stripe *stripe)
2163 {
2164 struct raid_set *rs = RS(stripe->sc);
2165 unsigned p = rs->set.raid_devs;
2166
2167 while (p--) {
2168 struct stripe_chunk *chunk = CHUNK(stripe, p);
2169 int i = ARRAY_SIZE(chunk->bl);
2170
2171 /* Fail all bios on all bio lists of the stripe. */
2172 while (i--) {
2173 struct bio_list *bl = chunk->bl + i;
2174
2175 if (!bio_list_empty(bl))
2176 bio_list_endio(stripe, bl, p, -EIO);
2177 }
2178 }
2179
2180 /* Put stripe on LRU list. */
2181 BUG_ON(stripe_io_ref(stripe));
2182 BUG_ON(stripe_ref(stripe));
2183 }
2184
2185 /* Unlock all required chunks. */
2186 static void stripe_chunks_unlock(struct stripe *stripe)
2187 {
2188 unsigned p = RS(stripe->sc)->set.raid_devs;
2189 struct stripe_chunk *chunk;
2190
2191 while (p--) {
2192 chunk = CHUNK(stripe, p);
2193
2194 if (TestClearChunkUnlock(chunk))
2195 ClearChunkLocked(chunk);
2196 }
2197 }
2198
2199 /*
2200 * Queue reads and writes to a stripe by hanging
2201 * their bios off the stripe set's read/write lists.
2202 */
2203 static int stripe_queue_bio(struct raid_set *rs, struct bio *bio,
2204 struct bio_list *reject)
2205 {
2206 struct raid_address addr;
2207 struct stripe *stripe;
2208
2209 stripe = stripe_find(rs, raid_address(rs, bio->bi_sector, &addr));
2210 if (stripe) {
2211 int r = 0, rw = bio_data_dir(bio);
2212
2213 /* Distinguish reads and writes. */
2214 bio_list_add(BL(stripe, addr.di, rw), bio);
2215
2216 if (rw == READ)
2217 /* REMOVEME: statistics. */
2218 atomic_inc(rs->stats + S_BIOS_ADDED_READ);
2219 else {
2220 /* Increment pending write count on region. */
2221 dm_rh_inc(rs->recover.rh, stripe->region);
2222 r = 1;
2223
2224 /* REMOVEME: statistics. */
2225 atomic_inc(rs->stats + S_BIOS_ADDED_WRITE);
2226 }
2227
2228 /*
2229 * Put on io (flush) list in case of
2230 * initial bio queued to chunk.
2231 */
2232 if (chunk_get(CHUNK(stripe, addr.di)) == 1)
2233 stripe_flush_add(stripe);
2234
2235 return r;
2236 }
2237
2238 /* Got no stripe from cache or failed to lock it -> reject bio. */
2239 bio_list_add(reject, bio);
2240 atomic_inc(rs->stats + S_IOS_POST); /* REMOVEME: statistics. */
2241 return 0;
2242 }
2243
2244 /*
2245 * Handle all stripes by handing them to the daemon, because we can't
2246 * map their chunk pages to copy the data in interrupt context.
2247 *
2248 * We don't want to handle them here either, while interrupts are disabled.
2249 */
2250
2251 /* Read/write endio function for dm-io (interrupt context). */
2252 static void endio(unsigned long error, void *context)
2253 {
2254 struct stripe_chunk *chunk = context;
2255
2256 if (unlikely(error)) {
2257 chunk_set(chunk, ERROR);
2258 /* REMOVEME: statistics. */
2259 atomic_inc(RS(chunk->stripe->sc)->stats + S_STRIPE_ERROR);
2260 } else
2261 chunk_set(chunk, CLEAN);
2262
2263 /*
2264 * For recovery stripes, I need to reset the locked flag
2265 * here, because those aren't processed in do_endios().
2266 */
2267 if (unlikely(StripeRecover(chunk->stripe)))
2268 ClearChunkLocked(chunk);
2269 else
2270 SetChunkUnlock(chunk);
2271
2272 /* Indirectly puts stripe on cache's endio list via stripe_io_put(). */
2273 stripe_put_references(chunk->stripe);
2274 }
2275
2276 /* Read/Write a chunk asynchronously. */
2277 static void stripe_chunk_rw(struct stripe *stripe, unsigned p)
2278 {
2279 struct stripe_cache *sc = stripe->sc;
2280 struct raid_set *rs = RS(sc);
2281 struct dm_mem_cache_object *obj = stripe->obj + p;
2282 struct page_list *pl = obj->pl;
2283 struct stripe_chunk *chunk = CHUNK(stripe, p);
2284 struct raid_dev *dev = rs->dev + p;
2285 struct dm_io_region io = {
2286 .bdev = dev->dev->bdev,
2287 .sector = stripe->key,
2288 .count = stripe->io.size,
2289 };
2290 struct dm_io_request control = {
2291 .bi_rw = ChunkDirty(chunk) ? WRITE : READ,
2292 .mem = {
2293 .type = DM_IO_PAGE_LIST,
2294 .ptr.pl = pl,
2295 .offset = 0,
2296 },
2297 .notify = {
2298 .fn = endio,
2299 .context = chunk,
2300 },
2301 .client = StripeRecover(stripe) ? rs->recover.dm_io_client :
2302 sc->dm_io_client,
2303 };
2304
2305 BUG_ON(ChunkLocked(chunk));
2306 BUG_ON(!ChunkUptodate(chunk) && ChunkDirty(chunk));
2307 BUG_ON(ChunkUptodate(chunk) && !ChunkDirty(chunk));
2308
2309 /*
2310 * Don't rw past the end of the device, which can happen because
2311 * sectors_per_dev typically isn't divisible by io_size.
2312 */
2313 if (unlikely(io.sector + io.count > rs->set.sectors_per_dev))
2314 io.count = rs->set.sectors_per_dev - io.sector;
2315
2316 BUG_ON(!io.count);
2317 io.sector += dev->start; /* Add <offset>. */
2318 if (RSRecover(rs))
2319 recover_io_count(stripe); /* Recovery io accounting. */
2320
2321 /* REMOVEME: statistics. */
2322 atomic_inc(rs->stats + (ChunkDirty(chunk) ? S_DM_IO_WRITE :
2323 S_DM_IO_READ));
2324 SetChunkLocked(chunk);
2325 SetDevIoQueued(dev);
2326 BUG_ON(dm_io(&control, 1, &io, NULL));
2327 }
2328
2329 /*
2330 * Write dirty or read not uptodate page lists of a stripe.
2331 */
2332 static int stripe_chunks_rw(struct stripe *stripe)
2333 {
2334 int r;
2335 struct raid_set *rs = RS(stripe->sc);
2336
2337 /*
2338 * Increment the pending count on the stripe
2339 * first, so that we don't race in endio().
2340 *
2341 * An inc (IO) is needed for any chunk unless !ChunkIo(chunk):
2342 *
2343 * o not uptodate
2344 * o dirtied by writes merged
2345 * o dirtied by parity calculations
2346 */
2347 r = for_each_io_dev(stripe, stripe_get_references);
2348 if (r) {
2349 /* Io needed: chunks are either not uptodate or dirty. */
2350 int max; /* REMOVEME: */
2351 struct stripe_cache *sc = &rs->sc;
2352
2353 /* Submit actual io. */
2354 for_each_io_dev(stripe, stripe_chunk_rw);
2355
2356 /* REMOVEME: statistics */
2357 max = sc_active(sc);
2358 if (atomic_read(&sc->active_stripes_max) < max)
2359 atomic_set(&sc->active_stripes_max, max);
2360
2361 atomic_inc(rs->stats + S_FLUSHS);
2362 /* END REMOVEME: statistics */
2363 }
2364
2365 return r;
2366 }
2367
2368 /* Merge in all writes hence dirtying respective chunks. */
2369 static void stripe_merge_writes(struct stripe *stripe)
2370 {
2371 unsigned p = RS(stripe->sc)->set.raid_devs;
2372
2373 while (p--) {
2374 struct stripe_chunk *chunk = CHUNK(stripe, p);
2375 struct bio_list *write = BL_CHUNK(chunk, WRITE_QUEUED);
2376
2377 if (!bio_list_empty(write)) {
2378 struct bio *bio;
2379 struct page_list *pl = stripe->obj[p].pl;
2380
2381 /*
2382 * We can play with the lists without holding a lock,
2383 * because it is just us accessing them anyway.
2384 */
2385 bio_list_for_each(bio, write)
2386 bio_copy_page_list(WRITE, stripe, pl, bio);
2387
2388 bio_list_merge(BL_CHUNK(chunk, WRITE_MERGED), write);
2389 bio_list_init(write);
2390 chunk_set(chunk, DIRTY);
2391 }
2392 }
2393 }
2394
2395 /* Queue all writes to get merged. */
2396 static int stripe_queue_writes(struct stripe *stripe)
2397 {
2398 int r = 0;
2399 unsigned p = RS(stripe->sc)->set.raid_devs;
2400
2401 while (p--) {
2402 struct stripe_chunk *chunk = CHUNK(stripe, p);
2403 struct bio_list *write = BL_CHUNK(chunk, WRITE);
2404
2405 if (!bio_list_empty(write)) {
2406 bio_list_merge(BL_CHUNK(chunk, WRITE_QUEUED), write);
2407 bio_list_init(write);
2408 SetChunkIo(chunk);
2409 r = 1;
2410 }
2411 }
2412
2413 return r;
2414 }
2415
2416
2417 /* Check, if a chunk gets completely overwritten. */
2418 static int stripe_check_chunk_overwrite(struct stripe *stripe, unsigned p)
2419 {
2420 unsigned sectors = 0;
2421 struct bio *bio;
2422 struct bio_list *bl = BL(stripe, p, WRITE_QUEUED);
2423
2424 bio_list_for_each(bio, bl)
2425 sectors += bio_sectors(bio);
2426
2427 BUG_ON(sectors > RS(stripe->sc)->set.io_size);
2428 return sectors == RS(stripe->sc)->set.io_size;
2429 }
2430
2431 /*
2432 * Avoid io on a broken/to-be-reconstructed drive in order to
2433 * reconstruct its data on endio.
2434 *
2435 * (*1*) We set StripeReconstruct() in here, so that _do_endios()
2436 * will trigger a reconstruct call before resetting it.
2437 */
2438 static int stripe_chunk_set_io_flags(struct stripe *stripe, int pr)
2439 {
2440 struct stripe_chunk *chunk = CHUNK(stripe, pr);
2441
2442 /*
2443 * Allow io on all chunks but the indexed one,
2444 * because we're either degraded or want to prohibit io
2445 * on that chunk for later reconstruction.
2446 */
2447 /* Includes ClearChunkIo(), ClearChunkUptodate(). */
2448 stripe_chunk_invalidate(chunk);
2449 stripe->idx.recover = pr;
2450 SetStripeReconstruct(stripe);
2451
2452 /* REMOVEME: statistics. */
2453 atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
2454 return -EPERM;
2455 }
2456
2457 /* Chunk locked/uptodate and device failed tests. */
2458 static struct stripe_chunk *
2459 stripe_chunk_check(struct stripe *stripe, unsigned p, unsigned *chunks_uptodate)
2460 {
2461 struct raid_set *rs = RS(stripe->sc);
2462 struct stripe_chunk *chunk = CHUNK(stripe, p);
2463
2464 /* Can't access active chunks. */
2465 if (ChunkLocked(chunk)) {
2466 /* REMOVEME: statistics. */
2467 atomic_inc(rs->stats + S_CHUNK_LOCKED);
2468 return NULL;
2469 }
2470
2471 /* Can't access broken device. */
2472 if (ChunkError(chunk) || DevFailed(rs->dev + p))
2473 return NULL;
2474
2475 /* Can access uptodate chunks. */
2476 if (ChunkUptodate(chunk)) {
2477 (*chunks_uptodate)++;
2478 return NULL;
2479 }
2480
2481 return chunk;
2482 }
2483
2484 /*
2485 * Degraded/reconstruction mode.
2486 *
2487 * Check stripe state to figure which chunks don't need IO.
2488 *
2489 * Returns 0 for fully operational, -EBUSY while reconstructing, -EPERM for degraded/resynchronizing.
2490 */
2491 static int stripe_check_reconstruct(struct stripe *stripe)
2492 {
2493 struct raid_set *rs = RS(stripe->sc);
2494
2495 if (RSDead(rs)) {
2496 ClearStripeReconstruct(stripe);
2497 ClearStripeReconstructed(stripe);
2498 stripe_allow_io(stripe);
2499 return 0;
2500 }
2501
2502 /* Avoid further reconstruction setting, when already set. */
2503 if (StripeReconstruct(stripe)) {
2504 /* REMOVEME: statistics. */
2505 atomic_inc(rs->stats + S_RECONSTRUCT_SET);
2506 return -EBUSY;
2507 }
2508
2509 /* Initially allow io on all chunks. */
2510 stripe_allow_io(stripe);
2511
2512 /* Return if stripe is already reconstructed. */
2513 if (StripeReconstructed(stripe)) {
2514 atomic_inc(rs->stats + S_RECONSTRUCTED);
2515 return 0;
2516 }
2517
2518 /*
2519 * Degraded/reconstruction mode (device failed) ->
2520 * avoid io on the failed device.
2521 */
2522 if (unlikely(RSDegraded(rs))) {
2523 /* REMOVEME: statistics. */
2524 atomic_inc(rs->stats + S_DEGRADED);
2525 /* Allow IO on all devices but the dead one. */
2526 BUG_ON(rs->set.ei < 0);
2527 return stripe_chunk_set_io_flags(stripe, rs->set.ei);
2528 } else {
2529 int sync, pi = dev_for_parity(stripe, &sync);
2530
2531 /*
2532 * Reconstruction mode (ie. a particular (replaced) device or
2533 * some (rotating) parity chunk is being resynchronized) ->
2534 * o make sure all needed chunks are read in
2535 * o writes are allowed to go through
2536 */
2537 if (!sync) {
2538 /* REMOVEME: statistics. */
2539 atomic_inc(rs->stats + S_NOSYNC);
2540 /* Allow IO on all devs but the one to reconstruct. */
2541 return stripe_chunk_set_io_flags(stripe, pi);
2542 }
2543 }
2544
2545 return 0;
2546 }
2547
2548 /*
2549 * Check, if stripe is ready to merge writes.
2550 * I.e. if all chunks are present to allow merging of bios.
2551 *
2552 * We prohibit io on:
2553 *
2554 * o chunks without bios
2555 * o chunks which get completely written over
2556 */
2557 static int stripe_merge_possible(struct stripe *stripe, int nosync)
2558 {
2559 struct raid_set *rs = RS(stripe->sc);
2560 unsigned chunks_overwrite = 0, chunks_prohibited = 0,
2561 chunks_uptodate = 0, p = rs->set.raid_devs;
2562
2563 /* Walk all chunks. */
2564 while (p--) {
2565 struct stripe_chunk *chunk;
2566
2567 /* Prohibit io on broken devices. */
2568 if (DevFailed(rs->dev + p)) {
2569 chunk = CHUNK(stripe, p);
2570 goto prohibit_io;
2571 }
2572
2573 /* We can't optimize any further if no chunk. */
2574 chunk = stripe_chunk_check(stripe, p, &chunks_uptodate);
2575 if (!chunk || nosync)
2576 continue;
2577
2578 /*
2579 * We have a chunk, which is not uptodate.
2580 *
2581 * If this is not parity and we don't have
2582 * reads queued, we can optimize further.
2583 */
2584 if (p != stripe->idx.parity &&
2585 bio_list_empty(BL_CHUNK(chunk, READ)) &&
2586 bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED))) {
2587 if (bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)))
2588 goto prohibit_io;
2589 else if (RSCheckOverwrite(rs) &&
2590 stripe_check_chunk_overwrite(stripe, p))
2591 /* Completely overwritten chunk. */
2592 chunks_overwrite++;
2593 }
2594
2595 /* Allow io for chunks with bios and overwritten ones. */
2596 SetChunkIo(chunk);
2597 continue;
2598
2599 prohibit_io:
2600 /* No io for broken devices or for chunks w/o bios. */
2601 ClearChunkIo(chunk);
2602 chunks_prohibited++;
2603 /* REMOVEME: statistics. */
2604 atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
2605 }
2606
2607 /* All data chunks will get written over. */
2608 if (chunks_overwrite == rs->set.data_devs)
2609 atomic_inc(rs->stats + S_OVERWRITE); /* REMOVEME: statistics.*/
2610 else if (chunks_uptodate + chunks_prohibited < rs->set.raid_devs) {
2611 /* We don't have enough chunks to merge. */
2612 atomic_inc(rs->stats + S_CANT_MERGE); /* REMOVEME: statistics.*/
2613 return -EPERM;
2614 }
2615
2616 /*
2617 * If we have all chunks up to date or overwrite them, we
2618 * just zero the parity chunk and let stripe_rw() recreate it.
2619 */
2620 if (chunks_uptodate == rs->set.raid_devs ||
2621 chunks_overwrite == rs->set.data_devs) {
2622 stripe_zero_chunk(stripe, stripe->idx.parity);
2623 BUG_ON(StripeReconstruct(stripe));
2624 SetStripeReconstruct(stripe); /* Enforce xor in caller. */
2625 } else {
2626 /*
2627 * With fewer chunks, we xor parity out.
2628 *
2629 * (*4*) We rely on !StripeReconstruct() in chunk_must_xor(),
2630 * so that only chunks with queued or merged writes
2631 * are being xored.
2632 */
2633 parity_xor(stripe);
2634 }
2635
2636 /*
2637 * We do have enough chunks to merge.
2638 * All chunks are uptodate or get written over.
2639 */
2640 atomic_inc(rs->stats + S_CAN_MERGE); /* REMOVEME: statistics. */
2641 return 0;
2642 }
2643
2644 /*
2645 * Avoid reading chunks in case we're fully operational.
2646 *
2647 * We prohibit io on any chunks without bios but the parity chunk.
2648 */
2649 static void stripe_avoid_reads(struct stripe *stripe)
2650 {
2651 struct raid_set *rs = RS(stripe->sc);
2652 unsigned dummy = 0, p = rs->set.raid_devs;
2653
2654 /* Walk all chunks. */
2655 while (p--) {
2656 struct stripe_chunk *chunk =
2657 stripe_chunk_check(stripe, p, &dummy);
2658
2659 if (!chunk)
2660 continue;
2661
2662 /* If parity or any bios pending -> allow io. */
2663 if (chunk_ref(chunk) || p == stripe->idx.parity)
2664 SetChunkIo(chunk);
2665 else {
2666 ClearChunkIo(chunk);
2667 /* REMOVEME: statistics. */
2668 atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
2669 }
2670 }
2671 }
2672
2673 /*
2674 * Read/write a stripe.
2675 *
2676 * All stripe read/write activity goes through this function
2677 * unless recovery, which has to call stripe_chunk_rw() directly.
2678 *
2679 * Make sure we don't try already merged stripes in order
2680 * to avoid data corruption.
2681 *
2682 * Check the state of the RAID set and if degraded (or
2683 * resynchronizing for reads), read in all other chunks but
2684 * the one on the dead/resynchronizing device in order to be
2685 * able to reconstruct the missing one in _do_endios().
2686 *
2687 * Can be called on active stripes in order
2688 * to dispatch new io on inactive chunks.
2689 *
2690 * States to cover:
2691 * o stripe to read and/or write
2692 * o stripe with error to reconstruct
2693 */
2694 static void stripe_rw(struct stripe *stripe)
2695 {
2696 int nosync, r;
2697 struct raid_set *rs = RS(stripe->sc);
2698
2699 /*
2700 * Check, if a chunk needs to be reconstructed
2701 * because of a degraded set or a region out of sync.
2702 */
2703 nosync = stripe_check_reconstruct(stripe);
2704 switch (nosync) {
2705 case -EBUSY:
2706 return; /* Wait for stripe reconstruction to finish. */
2707 case -EPERM:
2708 goto io;
2709 }
2710
2711 /*
2712 * If we don't have merged writes pending, we can schedule
2713 * queued writes to be merged next without corrupting data.
2714 */
2715 if (!StripeMerged(stripe)) {
2716 r = stripe_queue_writes(stripe);
2717 if (r)
2718 /* Writes got queued -> flag RBW. */
2719 SetStripeRBW(stripe);
2720 }
2721
2722 /*
2723 * Merge all writes hanging off uptodate/overwritten
2724 * chunks of the stripe.
2725 */
2726 if (StripeRBW(stripe)) {
2727 r = stripe_merge_possible(stripe, nosync);
2728 if (!r) { /* Merge possible. */
2729 struct stripe_chunk *chunk;
2730
2731 /*
2732 * I rely on valid parity in order
2733 * to xor a fraction of chunks out
2734 * of parity and back in.
2735 */
2736 stripe_merge_writes(stripe); /* Merge writes in. */
2737 parity_xor(stripe); /* Update parity. */
2738 ClearStripeReconstruct(stripe); /* Reset xor enforce. */
2739 SetStripeMerged(stripe); /* Writes merged. */
2740 ClearStripeRBW(stripe); /* Disable RBW. */
2741
2742 /*
2743 * REMOVEME: sanity check on parity chunk
2744 * states after writes got merged.
2745 */
2746 chunk = CHUNK(stripe, stripe->idx.parity);
2747 BUG_ON(ChunkLocked(chunk));
2748 BUG_ON(!ChunkUptodate(chunk));
2749 BUG_ON(!ChunkDirty(chunk));
2750 BUG_ON(!ChunkIo(chunk));
2751 }
2752 } else if (!nosync && !StripeMerged(stripe))
2753 /* Read avoidance if not degraded/resynchronizing/merged. */
2754 stripe_avoid_reads(stripe);
2755
2756 io:
2757 /* Now submit any reads/writes for non-uptodate or dirty chunks. */
2758 r = stripe_chunks_rw(stripe);
2759 if (!r) {
2760 /*
2761 * No io submitted, because chunk io is prohibited
2762 * or chunks are locked/devices have failed
2763 * -> push to end io list for processing.
2764 */
2765 stripe_endio_push(stripe);
2766 atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */
2767 }
2768 }
2769
2770 /*
2771 * Recovery functions
2772 */
2773 /* Read a stripe off a raid set for recovery. */
2774 static int stripe_recover_read(struct stripe *stripe, int pi)
2775 {
2776 BUG_ON(stripe_io_ref(stripe));
2777
2778 /* Invalidate all chunks so that they get read in. */
2779 stripe_chunks_invalidate(stripe);
2780 stripe_allow_io(stripe); /* Allow io on all recovery chunks. */
2781
2782 /*
2783 * If we are reconstructing a particular device, we can avoid
2784 * reading the respective chunk in, because we're going to
2785 * reconstruct it anyway.
2786 *
2787 * We can't do that for resynchronization of rotating parity,
2788 * because the recovery stripe chunk size is typically larger
2789 * than the set's chunk size.
2790 */
2791 if (pi > -1)
2792 ClearChunkIo(CHUNK(stripe, pi));
2793
2794 return stripe_chunks_rw(stripe);
2795 }
2796
2797 /* Write a stripe to a raid set for recovery. */
2798 static int stripe_recover_write(struct stripe *stripe, int pi)
2799 {
2800 BUG_ON(stripe_io_ref(stripe));
2801
2802 /*
2803 * If this is a reconstruct of a particular device, then
2804 * reconstruct the respective chunk, else create parity chunk.
2805 */
2806 if (pi > -1) {
2807 stripe_zero_chunk(stripe, pi);
2808 common_xor(stripe, stripe->io.size, 0, pi);
2809 chunk_set(CHUNK(stripe, pi), DIRTY);
2810 } else
2811 parity_xor(stripe);
2812
2813 return stripe_chunks_rw(stripe);
2814 }
2815
2816 /* Read/write a recovery stripe. */
2817 static int stripe_recover_rw(struct stripe *stripe)
2818 {
2819 int r = 0, sync = 0;
2820
2821 /* Read/write flip-flop. */
2822 if (TestClearStripeRBW(stripe)) {
2823 SetStripeMerged(stripe);
2824 stripe->key = stripe->recover->pos;
2825 r = stripe_recover_read(stripe, dev_for_parity(stripe, &sync));
2826 BUG_ON(!r);
2827 } else if (TestClearStripeMerged(stripe)) {
2828 r = stripe_recover_write(stripe, dev_for_parity(stripe, &sync));
2829 BUG_ON(!r);
2830 }
2831
2832 BUG_ON(sync);
2833 return r;
2834 }
2835
2836 /* Recovery bandwidth available? */
2837 static int recover_bandwidth(struct raid_set *rs)
2838 {
2839 int r, work;
2840
2841 /* On reset or when bios delayed -> allow recovery. */
2842 r = recover_io_reset(rs);
2843 if (r || RSBandwidth(rs))
2844 goto out;
2845
2846 work = atomic_read(rs->recover.io_count + IO_WORK);
2847 if (work) {
2848 /* Pay attention to larger recover stripe size. */
2849 int recover = atomic_read(rs->recover.io_count + IO_RECOVER) *
2850 rs->recover.io_size / rs->set.io_size;
2851
2852 /*
2853 * Don't use more than given bandwidth
2854 * of the work io for recovery.
2855 */
2856 if (recover > work / rs->recover.bandwidth_work) {
2857 /* REMOVEME: statistics. */
2858 atomic_inc(rs->stats + S_NO_BANDWIDTH);
2859 return 0;
2860 }
2861 }
2862
2863 out:
2864 atomic_inc(rs->stats + S_BANDWIDTH); /* REMOVEME: statistics. */
2865 return 1;
2866 }
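/*
 * Worked example (illustration only): with bandwidth == 10
 * (bandwidth_work == 100 / 10 == 10), recover.io_size == 256,
 * set.io_size == 64, 100 counted work ios and 3 recovery ios, the
 * normalized recovery count is 3 * 256 / 64 == 12 > 100 / 10 == 10,
 * so further recovery io is deferred until application io catches up.
 */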
2867
2868 /* Try to get a region to recover. */
2869 static int stripe_recover_get_region(struct stripe *stripe)
2870 {
2871 struct raid_set *rs = RS(stripe->sc);
2872 struct recover *rec = &rs->recover;
2873 struct recover_addr *addr = stripe->recover;
2874 struct dm_dirty_log *dl = rec->dl;
2875 struct dm_rh_client *rh = rec->rh;
2876
2877 BUG_ON(!dl);
2878 BUG_ON(!rh);
2879
2880 /* Return that we have a region first, so it gets finished during suspension. */
2881 if (addr->reg)
2882 return 1;
2883
2884 if (RSSuspend(rs))
2885 return -EPERM;
2886
2887 if (dl->type->get_sync_count(dl) >= rec->nr_regions)
2888 return -ENOENT;
2889
2890 /* If we don't have enough bandwidth, we don't proceed recovering. */
2891 if (!recover_bandwidth(rs))
2892 return -EAGAIN;
2893
2894 /* Start quiescing a region. */
2895 dm_rh_recovery_prepare(rh);
2896 addr->reg = dm_rh_recovery_start(rh);
2897 if (!addr->reg)
2898 return -EAGAIN;
2899
2900 addr->pos = dm_rh_region_to_sector(rh, dm_rh_get_region_key(addr->reg));
2901 addr->end = addr->pos + dm_rh_get_region_size(rh);
2902
2903 /*
2904 * Take one global io reference out for the
2905 * whole region, which is going to be released
2906 * when the region is completely done with.
2907 */
2908 io_get(rs);
2909 return 0;
2910 }
2911
2912 /* Update region hash state. */
2913 enum recover_type { REC_FAILURE = 0, REC_SUCCESS = 1 };
2914 static void recover_rh_update(struct stripe *stripe, enum recover_type success)
2915 {
2916 struct recover_addr *addr = stripe->recover;
2917 struct raid_set *rs = RS(stripe->sc);
2918 struct recover *rec = &rs->recover;
2919
2920 if (!addr->reg) {
2921 DMERR("%s- Called w/o region", __func__);
2922 return;
2923 }
2924
2925 dm_rh_recovery_end(addr->reg, success);
2926 if (success)
2927 rec->nr_regions_recovered++;
2928
2929 addr->reg = NULL;
2930
2931 /*
2932 * Completely done with this region ->
2933 * release the 1st io reference.
2934 */
2935 io_put(rs);
2936 }
2937
2938 /* Set start of recovery state. */
2939 static void set_start_recovery(struct raid_set *rs)
2940 {
2941 /* Initialize recovery. */
2942 rs->recover.start_jiffies = jiffies;
2943 rs->recover.end_jiffies = 0;
2944 }
2945
2946 /* Set end of recovery state. */
2947 static void set_end_recovery(struct raid_set *rs)
2948 {
2949 ClearRSRecover(rs);
2950 rs->set.dev_to_init = -1;
2951
2952 /* Check for jiffies overrun. */
2953 rs->recover.end_jiffies = jiffies;
2954 if (rs->recover.end_jiffies < rs->recover.start_jiffies)
2955 rs->recover.end_jiffies = ~0;
2956 }
2957
2958 /* Handle recovery on one recovery stripe. */
2959 static int _do_recovery(struct stripe *stripe)
2960 {
2961 int r;
2962 struct raid_set *rs = RS(stripe->sc);
2963 struct recover_addr *addr = stripe->recover;
2964
2965 /* If recovery is active -> return. */
2966 if (stripe_io_ref(stripe))
2967 return 1;
2968
2969 /* IO error is fatal for recovery -> stop it. */
2970 if (unlikely(StripeError(stripe)))
2971 goto err;
2972
2973 /* Recovery end required. */
2974 if (!RSRecover(rs))
2975 goto err;
2976
2977 /* Get a region to recover. */
2978 r = stripe_recover_get_region(stripe);
2979 switch (r) {
2980 case 0: /* Got a new region: flag initial read before write. */
2981 SetStripeRBW(stripe);
2982 case 1: /* Have a region in the works. */
2983 break;
2984 case -EAGAIN:
2985 /* No bandwidth/quiesced region yet, try later. */
2986 if (!io_ref(rs))
2987 wake_do_raid_delayed(rs, HZ / 4);
2988 case -EPERM:
2989 /* Suspend. */
2990 return 1;
2991 case -ENOENT: /* No more regions to recover. */
2992 schedule_work(&rs->io.ws_do_table_event);
2993 return 0;
2994 default:
2995 BUG();
2996 }
2997
2998 /* Read/write a recover stripe. */
2999 r = stripe_recover_rw(stripe);
3000 if (r)
3001 /* IO initiated. */
3002 return 1;
3003
3004 /* Read and write finished-> update recovery position within region. */
3005 addr->pos += stripe->io.size;
3006
3007 /* If we're at end of region, update region hash. */
3008 if (addr->pos >= addr->end ||
3009 addr->pos >= rs->set.sectors_per_dev)
3010 recover_rh_update(stripe, REC_SUCCESS);
3011 else
3012 /* Prepare to read next region segment. */
3013 SetStripeRBW(stripe);
3014
3015 /* Schedule myself for another round... */
3016 wake_do_raid(rs);
3017 return 1;
3018
3019 err:
3020 /* FIXME: rather try recovering other regions on error? */
3021 rs_check_degrade(stripe);
3022 recover_rh_update(stripe, REC_FAILURE);
3023
3024 /* Check state of partially recovered array. */
3025 if (RSDegraded(rs) && !RSDead(rs) &&
3026 rs->set.dev_to_init != -1 &&
3027 rs->set.ei != rs->set.dev_to_init)
3028 /* Broken drive != drive to recover -> FATAL. */
3029 SetRSDead(rs);
3030
3031 if (StripeError(stripe)) {
3032 char buf[BDEVNAME_SIZE];
3033
3034 DMERR("stopping recovery due to "
3035 "ERROR on /dev/%s, stripe at offset %llu",
3036 bdevname(rs->dev[rs->set.ei].dev->bdev, buf),
3037 (unsigned long long) stripe->key);
3038
3039 }
3040
3041 /* Make sure, that all quiesced regions get released. */
3042 while (addr->reg) {
3043 dm_rh_recovery_end(addr->reg, -EIO);
3044 addr->reg = dm_rh_recovery_start(rs->recover.rh);
3045 }
3046
3047 return 0;
3048 }
3049
3050 /* Called by main io daemon to recover regions. */
3051 static void do_recovery(struct raid_set *rs)
3052 {
3053 if (RSRecover(rs)) {
3054 int r = 0;
3055 struct stripe *stripe;
3056
3057 list_for_each_entry(stripe, &rs->recover.stripes,
3058 lists[LIST_RECOVER])
3059 r += _do_recovery(stripe);
3060
3061 if (!r) {
3062 set_end_recovery(rs);
3063 stripe_recover_free(rs);
3064 }
3065 }
3066 }
3067
3068 /*
3069 * END recovery functions
3070 */
3071
3072 /* End io process all stripes handed in by endio() callback. */
3073 static void _do_endios(struct raid_set *rs, struct stripe *stripe,
3074 struct list_head *flush_list)
3075 {
3076 /* First unlock all required chunks. */
3077 stripe_chunks_unlock(stripe);
3078
3079 /*
3080 * If an io error on a stripe occurred, degrade the RAID set
3081 * and try to endio as many bios as possible. If any bios can't
3082 * be endio processed, requeue the stripe (stripe_ref() != 0).
3083 */
3084 if (TestClearStripeError(stripe)) {
3085 /*
3086 * FIXME: if read, rewrite the failed chunk after reconstruction
3087 * in order to trigger disk bad sector relocation.
3088 */
3089 rs_check_degrade(stripe); /* Resets ChunkError(). */
3090 ClearStripeReconstruct(stripe);
3091 ClearStripeReconstructed(stripe);
3092 }
3093
3094 /* Got to reconstruct a missing chunk. */
3095 if (StripeReconstruct(stripe)) {
3096 /*
3097 * (*2*) We use StripeReconstruct() to allow for
3098 * all chunks to be xored into the reconstructed
3099 * one (see chunk_must_xor()).
3100 */
3101 stripe_reconstruct(stripe);
3102
3103 /*
3104 * (*3*) Now we reset StripeReconstruct() and flag
3105 * StripeReconstructed() to show to stripe_rw(),
3106 * that we have reconstructed a missing chunk.
3107 */
3108 ClearStripeReconstruct(stripe);
3109 SetStripeReconstructed(stripe);
3110
3111 /* FIXME: reschedule to be written in case of read. */
3112 // if (!StripeRBW(stripe)) {
3113 // chunk_set(CHUNK(stripe, pr), DIRTY);
3114 // stripe_chunks_rw(stripe);
3115 // }
3116 }
3117
3118 /*
3119 * Now that we eventually got a complete stripe, we
3120 * can process the rest of the end ios on reads.
3121 */
3122 stripe_endio(READ, stripe);
3123
3124 /* End io all merged writes. */
3125 if (TestClearStripeMerged(stripe))
3126 stripe_endio(WRITE_MERGED, stripe);
3127
3128 /* If RAID set is dead -> fail any ios to dead drives. */
3129 if (RSDead(rs)) {
3130 DMERR_LIMIT("RAID set dead: failing ios to dead devices");
3131 stripe_fail_io(stripe);
3132 }
3133
3134 /*
3135 * We have stripe references still,
3136 * because of reads before writes or IO errors ->
3137 * got to put on flush list for processing.
3138 */
3139 if (stripe_ref(stripe)) {
3140 BUG_ON(!list_empty(stripe->lists + LIST_LRU));
3141 list_add_tail(stripe->lists + LIST_FLUSH, flush_list);
3142 atomic_inc(rs->stats + S_REQUEUE); /* REMOVEME: statistics. */
3143 } else
3144 stripe_lru_add(stripe);
3145 }
3146
3147 /* Pop any endio stripes off of the endio list and belabour them. */
3148 static void do_endios(struct raid_set *rs)
3149 {
3150 struct stripe_cache *sc = &rs->sc;
3151 struct stripe *stripe;
3152 /* IO flush list for sorted requeued stripes. */
3153 struct list_head flush_list;
3154
3155 INIT_LIST_HEAD(&flush_list);
3156
3157 while ((stripe = stripe_endio_pop(sc))) {
3158 /* Avoid endio on stripes with newly io'ed chunks. */
3159 if (!stripe_io_ref(stripe))
3160 _do_endios(rs, stripe, &flush_list);
3161 }
3162
3163 /*
3164 * Insert any requeued stripes in the proper
3165 * order at the beginning of the io (flush) list.
3166 */
3167 list_splice(&flush_list, sc->lists + LIST_FLUSH);
3168 }
3169
3170 /* Flush any stripes on the io list. */
3171 static void do_flush(struct raid_set *rs)
3172 {
3173 struct stripe *stripe;
3174
3175 while ((stripe = stripe_io_pop(&rs->sc)))
3176 stripe_rw(stripe); /* Read/write stripe. */
3177 }
3178
3179 /* Stripe cache resizing. */
3180 static void do_sc_resize(struct raid_set *rs)
3181 {
3182 unsigned set = atomic_read(&rs->sc.stripes_to_set);
3183
3184 if (set) {
3185 unsigned cur = atomic_read(&rs->sc.stripes);
3186 int r = (set > cur) ? sc_grow(&rs->sc, set - cur, SC_GROW) :
3187 sc_shrink(&rs->sc, cur - set);
3188
3189 /* Flag end of resizing if ok. */
3190 if (!r)
3191 atomic_set(&rs->sc.stripes_to_set, 0);
3192 }
3193 }
3194
3195 /*
3196 * Process all ios
3197 *
3198 * We do different things with the io depending
3199 * on the state of the region that it is in:
3200 *
3201 * o reads: hang off stripe cache or postpone if full
3202 *
3203 * o writes:
3204 *
3205 * CLEAN/DIRTY/NOSYNC: increment pending and hang io off stripe's stripe set.
3206 * In case stripe cache is full or busy, postpone the io.
3207 *
3208 * RECOVERING: delay the io until recovery of the region completes.
3209 *
3210 */
3211 static void do_ios(struct raid_set *rs, struct bio_list *ios)
3212 {
3213 int r;
3214 unsigned flush = 0, delay = 0;
3215 sector_t sector;
3216 struct dm_rh_client *rh = rs->recover.rh;
3217 struct bio *bio;
3218 struct bio_list reject;
3219
3220 bio_list_init(&reject);
3221
3222 /*
3223 * Classify each io:
3224 * o delay writes to recovering regions (let reads go through)
3225 * o queue io to all other regions
3226 */
3227 while ((bio = bio_list_pop(ios))) {
3228 /*
3229 * In case we get a barrier bio, push it back onto
3230 * the input queue unless all work queues are empty
3231 * and the stripe cache is inactive.
3232 */
3233 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
3234 /* REMOVEME: statistics. */
3235 atomic_inc(rs->stats + S_BARRIER);
3236 if (delay ||
3237 !list_empty(rs->sc.lists + LIST_FLUSH) ||
3238 !bio_list_empty(&reject) ||
3239 sc_active(&rs->sc)) {
3240 bio_list_push(ios, bio);
3241 break;
3242 }
3243 }
3244
3245 /* Check for recovering regions. */
3246 sector = _sector(rs, bio);
3247 r = region_state(rs, sector, DM_RH_RECOVERING);
3248 if (unlikely(r && bio_data_dir(bio) == WRITE)) {
3249 delay++;
3250 /* Wait writing to recovering regions. */
3251 dm_rh_delay_by_region(rh, bio,
3252 dm_rh_sector_to_region(rh,
3253 sector));
3254 /* REMOVEME: statistics.*/
3255 atomic_inc(rs->stats + S_DELAYED_BIOS);
3256 atomic_inc(rs->stats + S_SUM_DELAYED_BIOS);
3257
3258 /* Force bandwidth tests in recovery. */
3259 SetRSBandwidth(rs);
3260 } else {
3261 /*
3262 * Process ios to non-recovering regions by queueing
3263 * them to stripes (does dm_rh_inc()) for writes).
3264 */
3265 flush += stripe_queue_bio(rs, bio, &reject);
3266 }
3267 }
3268
3269 if (flush) {
3270 /* FIXME: better error handling. */
3271 r = dm_rh_flush(rh); /* Writes got queued -> flush dirty log. */
3272 if (r)
3273 DMERR_LIMIT("dirty log flush");
3274 }
3275
3276 /* Merge any rejected bios back to the head of the input list. */
3277 bio_list_merge_head(ios, &reject);
3278 }
3279
3280 /* Send an event in case we're getting too busy. */
3281 static void do_busy_event(struct raid_set *rs)
3282 {
3283 if (sc_busy(rs)) {
3284 if (!TestSetRSScBusy(rs))
3285 schedule_work(&rs->io.ws_do_table_event);
3286 }
3287
3288 ClearRSScBusy(rs);
3289 }
3290
3291 /* Throw an event. */
3292 static void do_table_event(struct work_struct *ws)
3293 {
3294 struct raid_set *rs = container_of(ws, struct raid_set,
3295 io.ws_do_table_event);
3296 dm_table_event(rs->ti->table);
3297 }
3298
3299
3300 /*-----------------------------------------------------------------
3301 * RAID daemon
3302 *---------------------------------------------------------------*/
3303 /*
3304 * o belabour all end ios
3305 * o update the region hash states
3306 * o optionally shrink the stripe cache
3307 * o optionally do recovery
3308 * o unplug any component raid devices with queued bios
3309 * o grab the input queue
3310 * o work on all requeued or new ios and perform stripe cache flushes
3311 * o unplug any component raid devices with queued bios
3312 * o check, if the stripe cache gets too busy and throw an event if so
3313 */
3314 static void do_raid(struct work_struct *ws)
3315 {
3316 struct raid_set *rs = container_of(ws, struct raid_set,
3317 io.dws_do_raid.work);
3318 struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in;
3319 struct blk_plug plug;
3320
3321 /*
3322 * We always need to end io, so that ios can get errored in
3323 * case the set failed and the region counters get decremented
3324 * before we update region hash states and go any further.
3325 */
3326 do_endios(rs);
3327 dm_rh_update_states(rs->recover.rh, 1);
3328
3329 /*
3330 * Now that we've end io'd, which may have put stripes on the LRU list
3331 * to allow for shrinking, we resize the stripe cache if requested.
3332 */
3333 do_sc_resize(rs);
3334
3335 /* Try to recover regions. */
3336 blk_start_plug(&plug);
3337 do_recovery(rs);
3338 blk_finish_plug(&plug); /* Unplug the queue */
3339
3340 /* Quickly grab all new ios queued and add them to the work list. */
3341 mutex_lock(&rs->io.in_lock);
3342 bio_list_merge(ios, ios_in);
3343 bio_list_init(ios_in);
3344 mutex_unlock(&rs->io.in_lock);
3345
3346 blk_start_plug(&plug);
3347 if (!bio_list_empty(ios))
3348 do_ios(rs, ios); /* Got ios to work into the cache. */
3349
3350 do_flush(rs); /* Flush any stripes on io list. */
3351 blk_finish_plug(&plug); /* Unplug the queue */
3352 do_busy_event(rs); /* Check if we got too busy. */
3353 }
3354
3355 /*
3356 * Callback for region hash to dispatch
3357 * delayed bios queued to recovered regions
3358 * (gets called via dm_rh_update_states()).
3359 */
3360 static void dispatch_delayed_bios(void *context, struct bio_list *bl)
3361 {
3362 struct raid_set *rs = context;
3363 struct bio *bio;
3364
3365 /* REMOVEME: statistics; decrement pending delayed bios counter. */
3366 bio_list_for_each(bio, bl)
3367 atomic_dec(rs->stats + S_DELAYED_BIOS);
3368
3369 /* Merge region hash private list to work list. */
3370 bio_list_merge_head(&rs->io.work, bl);
3371 bio_list_init(bl);
3372 ClearRSBandwidth(rs);
3373 }
3374
3375 /*************************************************************
3376 * Constructor helpers
3377 *************************************************************/
3378 /* Calculate MB/sec. */
3379 static unsigned mbpers(struct raid_set *rs, unsigned speed)
3380 {
3381 return to_bytes(speed * rs->set.data_devs *
3382 rs->recover.io_size * HZ >> 10) >> 10;
3383 }
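/*
 * Worked example (illustration only, assuming HZ == 250): with
 * data_devs == 2, recover.io_size == 256 sectors and speed == 80
 * full-size xors per tick, mbpers() evaluates to roughly
 * 80 * 2 * 256 * 250 * 512 / 2^20 == 5000 MB/s.
 */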
3384
3385 /*
3386 * Discover fastest xor algorithm and # of chunks combination.
3387 */
3388 /* Calculate speed for algorithm and # of chunks. */
3389 static unsigned xor_speed(struct stripe *stripe)
3390 {
3391 unsigned r = 0;
3392 unsigned long j;
3393
3394 /* Wait for next tick. */
3395 for (j = jiffies; j == jiffies; )
3396 ;
3397
3398 /* Do xors for a full tick. */
3399 for (j = jiffies; j == jiffies; ) {
3400 mb();
3401 common_xor(stripe, stripe->io.size, 0, 0);
3402 mb();
3403 r++;
3404 }
3405
3406 return r;
3407 }
3408
3409 /* Optimize xor algorithm for this RAID set. */
3410 static unsigned xor_optimize(struct raid_set *rs)
3411 {
3412 unsigned chunks_max = 2, p = rs->set.raid_devs, speed_max = 0;
3413 struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL;
3414 struct stripe *stripe;
3415
3416 BUG_ON(list_empty(&rs->recover.stripes));
3417 stripe = list_first_entry(&rs->recover.stripes, struct stripe,
3418 lists[LIST_RECOVER]);
3419
3420 /* Must set uptodate so that xor() will belabour chunks. */
3421 while (p--)
3422 SetChunkUptodate(CHUNK(stripe, p));
3423
3424 /* Try all xor functions. */
3425 while (f-- > xor_funcs) {
3426 unsigned speed;
3427
3428 /* Set actual xor function for common_xor(). */
3429 rs->xor.f = f;
3430 rs->xor.chunks = (f->f == xor_blocks_wrapper ?
3431 (MAX_XOR_BLOCKS + 1) : XOR_CHUNKS_MAX) + 1;
3432
3433 while (rs->xor.chunks-- > 2) {
3434 speed = xor_speed(stripe);
3435 if (speed > speed_max) {
3436 speed_max = speed;
3437 chunks_max = rs->xor.chunks;
3438 f_max = f;
3439 }
3440 }
3441 }
3442
3443 /* Memorize optimum parameters. */
3444 rs->xor.f = f_max;
3445 rs->xor.chunks = chunks_max;
3446 return speed_max;
3447 }
3448
3449 /*
3450 * Allocate a RAID context (a RAID set)
3451 */
3452 /* Structure for variable RAID parameters. */
3453 struct variable_parms {
3454 int bandwidth;
3455 int bandwidth_parm;
3456 int chunk_size;
3457 int chunk_size_parm;
3458 int io_size;
3459 int io_size_parm;
3460 int stripes;
3461 int stripes_parm;
3462 int recover_io_size;
3463 int recover_io_size_parm;
3464 int raid_parms;
3465 int recovery;
3466 int recovery_stripes;
3467 int recovery_stripes_parm;
3468 };
3469
3470 static struct raid_set *
3471 context_alloc(struct raid_type *raid_type, struct variable_parms *p,
3472 unsigned raid_devs, sector_t sectors_per_dev,
3473 struct dm_target *ti, unsigned dl_parms, char **argv)
3474 {
3475 int r;
3476 size_t len;
3477 sector_t region_size, ti_len;
3478 struct raid_set *rs = NULL;
3479 struct dm_dirty_log *dl;
3480 struct recover *rec;
3481
3482 /*
3483 * Create the dirty log
3484 *
3485 * We need to change length for the dirty log constructor,
3486 * because we want the number of regions for all stripes derived
3487 * from the single device size, so that we can keep region
3488 * size = 2^^n independent of the number of devices
3489 */
3490 ti_len = ti->len;
3491 ti->len = sectors_per_dev;
3492 dl = dm_dirty_log_create(argv[0], ti, NULL, dl_parms, argv + 2);
3493 ti->len = ti_len;
3494 if (!dl)
3495 goto bad_dirty_log;
3496
3497 /* Chunk size *must* be smaller than region size. */
3498 region_size = dl->type->get_region_size(dl);
3499 if (p->chunk_size > region_size)
3500 goto bad_chunk_size;
3501
3502 /* Recover io size *must* be smaller than region size as well. */
3503 if (p->recover_io_size > region_size)
3504 goto bad_recover_io_size;
3505
3506 /* Size and allocate the RAID set structure. */
3507 len = sizeof(*rs->data) + sizeof(*rs->dev);
3508 if (dm_array_too_big(sizeof(*rs), len, raid_devs))
3509 goto bad_array;
3510
3511 len = sizeof(*rs) + raid_devs * len;
3512 rs = kzalloc(len, GFP_KERNEL);
3513 if (!rs)
3514 goto bad_alloc;
3515
3516 rec = &rs->recover;
3517 atomic_set(&rs->io.in_process, 0);
3518 atomic_set(&rs->io.in_process_max, 0);
3519 rec->io_size = p->recover_io_size;
3520
3521 /* Pointer to data array. */
3522 rs->data = (unsigned long **)
3523 ((void *) rs->dev + raid_devs * sizeof(*rs->dev));
3524 rec->dl = dl;
3525 rs->set.raid_devs = raid_devs;
3526 rs->set.data_devs = raid_devs - raid_type->parity_devs;
3527 rs->set.raid_type = raid_type;
3528
3529 rs->set.raid_parms = p->raid_parms;
3530 rs->set.chunk_size_parm = p->chunk_size_parm;
3531 rs->set.io_size_parm = p->io_size_parm;
3532 rs->sc.stripes_parm = p->stripes_parm;
3533 rec->io_size_parm = p->recover_io_size_parm;
3534 rec->bandwidth_parm = p->bandwidth_parm;
3535 rec->recovery = p->recovery;
3536 rec->recovery_stripes = p->recovery_stripes;
3537
3538 /*
3539 * Set chunk and io size and respective shifts
3540 * (used to avoid divisions)
3541 */
3542 rs->set.chunk_size = p->chunk_size;
3543 rs->set.chunk_shift = ffs(p->chunk_size) - 1;
3544
3545 rs->set.io_size = p->io_size;
3546 rs->set.io_mask = p->io_size - 1;
3547 /* Mask to adjust address key in case io_size != chunk_size. */
3548 rs->set.io_inv_mask = (p->chunk_size - 1) & ~rs->set.io_mask;
3549
3550 rs->set.sectors_per_dev = sectors_per_dev;
3551
3552 rs->set.ei = -1; /* Indicate no failed device. */
3553 atomic_set(&rs->set.failed_devs, 0);
3554
3555 rs->ti = ti;
3556
3557 atomic_set(rec->io_count + IO_WORK, 0);
3558 atomic_set(rec->io_count + IO_RECOVER, 0);
3559
3560 /* Initialize io lock and queues. */
3561 mutex_init(&rs->io.in_lock);
3562 bio_list_init(&rs->io.in);
3563 bio_list_init(&rs->io.work);
3564
3565 init_waitqueue_head(&rs->io.suspendq); /* Suspend waiters (dm-io). */
3566
3567 rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size);
3568 rec->rh = dm_region_hash_create(rs, dispatch_delayed_bios,
3569 wake_dummy, wake_do_raid, 0, p->recovery_stripes,
3570 dl, region_size, rec->nr_regions);
3571 if (IS_ERR(rec->rh))
3572 goto bad_rh;
3573
3574 /* Initialize stripe cache. */
3575 r = sc_init(rs, p->stripes);
3576 if (r)
3577 goto bad_sc;
3578
3579 /* REMOVEME: statistics. */
3580 stats_reset(rs);
3581 ClearRSDevelStats(rs); /* Disable development status. */
3582 return rs;
3583
3584 bad_dirty_log:
3585 TI_ERR_RET("Error creating dirty log", ERR_PTR(-ENOMEM));
3586
3587 bad_chunk_size:
3588 dm_dirty_log_destroy(dl);
3589 TI_ERR_RET("Chunk size larger than region size", ERR_PTR(-EINVAL));
3590
3591 bad_recover_io_size:
3592 dm_dirty_log_destroy(dl);
3593 TI_ERR_RET("Recover stripe io size larger than region size",
3594 ERR_PTR(-EINVAL));
3595
3596 bad_array:
3597 dm_dirty_log_destroy(dl);
3598 TI_ERR_RET("Array too big", ERR_PTR(-EINVAL));
3599
3600 bad_alloc:
3601 dm_dirty_log_destroy(dl);
3602 TI_ERR_RET("Cannot allocate raid context", ERR_PTR(-ENOMEM));
3603
3604 bad_rh:
3605 dm_dirty_log_destroy(dl);
3606 ti->error = DM_MSG_PREFIX "Error creating dirty region hash";
3607 goto free_rs;
3608
3609 bad_sc:
3610 dm_region_hash_destroy(rec->rh); /* Destroys dirty log too. */
3611 sc_exit(&rs->sc);
3612 ti->error = DM_MSG_PREFIX "Error creating stripe cache";
3613 free_rs:
3614 kfree(rs);
3615 return ERR_PTR(-ENOMEM);
3616 }
3617
3618 /* Free a RAID context (a RAID set). */
3619 static void context_free(struct raid_set *rs, unsigned p)
3620 {
3621 while (p--)
3622 dm_put_device(rs->ti, rs->dev[p].dev);
3623
3624 sc_exit(&rs->sc);
3625 dm_region_hash_destroy(rs->recover.rh); /* Destroys dirty log too. */
3626 kfree(rs);
3627 }
3628
3629 /* Create work queue and initialize delayed work. */
3630 static int rs_workqueue_init(struct raid_set *rs)
3631 {
3632 struct dm_target *ti = rs->ti;
3633
3634 rs->io.wq = create_singlethread_workqueue(DAEMON);
3635 if (!rs->io.wq)
3636 TI_ERR_RET("failed to create " DAEMON, -ENOMEM);
3637
3638 INIT_DELAYED_WORK(&rs->io.dws_do_raid, do_raid);
3639 INIT_WORK(&rs->io.ws_do_table_event, do_table_event);
3640 return 0;
3641 }
3642
3643 /* Return pointer to raid_type structure for raid name. */
3644 static struct raid_type *get_raid_type(char *name)
3645 {
3646 struct raid_type *r = ARRAY_END(raid_types);
3647
3648 while (r-- > raid_types) {
3649 if (!strcmp(r->name, name))
3650 return r;
3651 }
3652
3653 return NULL;
3654 }
3655
3656 /* FIXME: factor out to dm core. */
3657 static int multiple(sector_t a, sector_t b, sector_t *n)
3658 {
3659 sector_t r = a;
3660
3661 sector_div(r, b);
3662 *n = r;
3663 return a == r * b;
3664 }
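/*
 * Usage sketch: multiple(1024, 8, &n) returns 1 with n == 128, whereas
 * multiple(1000, 64, &n) returns 0 because 1000 isn't a whole multiple
 * of 64; callers can use the result to reject lengths that don't
 * divide evenly.
 */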
3665
3666 /* Log RAID set information to kernel log. */
3667 static void rs_log(struct raid_set *rs, unsigned speed)
3668 {
3669 unsigned p;
3670 char buf[BDEVNAME_SIZE];
3671
3672 for (p = 0; p < rs->set.raid_devs; p++)
3673 DMINFO("/dev/%s is raid disk %u%s",
3674 bdevname(rs->dev[p].dev->bdev, buf), p,
3675 (p == rs->set.pi) ? " (parity)" : "");
3676
3677 DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes\n"
3678 "algorithm \"%s\", %u chunks with %uMB/s\n"
3679 "%s set with net %u/%u devices",
3680 rs->set.chunk_size, rs->set.io_size, rs->recover.io_size,
3681 atomic_read(&rs->sc.stripes),
3682 rs->xor.f->name, rs->xor.chunks, mbpers(rs, speed),
3683 rs->set.raid_type->descr, rs->set.data_devs, rs->set.raid_devs);
3684 }
3685
3686 /* Get all devices and offsets. */
3687 static int dev_parms(struct raid_set *rs, char **argv, int *p)
3688 {
3689 struct dm_target *ti = rs->ti;
3690
3691 for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) {
3692 int r;
3693 unsigned long long tmp;
3694 struct raid_dev *dev = rs->dev + *p;
3695
3696 /* Get offset and device. */
3697 if (sscanf(argv[1], "%llu", &tmp) != 1 ||
3698 tmp > rs->set.sectors_per_dev)
3699 TI_ERR("Invalid RAID device offset parameter");
3700
3701 dev->start = tmp;
3702 r = dm_get_device(ti, *argv, dm_table_get_mode(ti->table), &dev->dev);
3703 if (r)
3704 TI_ERR_RET("RAID device lookup failure", r);
3705
3706 r = raid_dev_lookup(rs, dev);
3707 if (r != -ENODEV && r < *p) {
3708 (*p)++; /* Ensure dm_put_device() on actual device. */
3709 TI_ERR_RET("Duplicate RAID device", -ENXIO);
3710 }
3711 }
3712
3713 return 0;
3714 }
3715
3716 /* Set recovery bandwidth. */
3717 static void
3718 recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth)
3719 {
3720 rs->recover.bandwidth = bandwidth;
3721 rs->recover.bandwidth_work = 100 / bandwidth;
3722 }
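/*
 * Example: recover_set_bandwidth(rs, 10) gives bandwidth_work == 10,
 * i.e. recover_bandwidth() above permits roughly one io_size-normalized
 * recovery io per 10 application work ios; bandwidth == 100 gives
 * bandwidth_work == 1, letting recovery io run at up to the full
 * application io rate.
 */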
3723
3724 /* Handle variable number of RAID parameters. */
3725 static int get_raid_variable_parms(struct dm_target *ti, char **argv,
3726 struct variable_parms *vp)
3727 {
3728 int p, value;
3729 struct {
3730 int action; /* -1: skip, 0: no power-of-2 check, 1: power-of-2 check */
3731 char *errmsg;
3732 int min, max;
3733 int *var, *var2, *var3;
3734 } argctr[] = {
3735 { 1,
3736 "Invalid chunk size; must be -1 or 2^^n and <= 16384",
3737 IO_SIZE_MIN, CHUNK_SIZE_MAX,
3738 &vp->chunk_size_parm, &vp->chunk_size, &vp->io_size },
3739 { 0,
3740 "Invalid number of stripes: must be -1 or >= 8 and <= 16384",
3741 STRIPES_MIN, STRIPES_MAX,
3742 &vp->stripes_parm, &vp->stripes, NULL },
3743 { 1,
3744 "Invalid io size; must be -1 or >= 8, 2^^n and less equal "
3745 "min(BIO_MAX_SECTORS/2, chunk size)",
3746 IO_SIZE_MIN, 0, /* Needs to be updated in loop below. */
3747 &vp->io_size_parm, &vp->io_size, NULL },
3748 { 1,
3749 "Invalid recovery io size; must be -1 or "
3750 "2^^n and less equal BIO_MAX_SECTORS/2",
3751 RECOVER_IO_SIZE_MIN, BIO_MAX_SECTORS / 2,
3752 &vp->recover_io_size_parm, &vp->recover_io_size, NULL },
3753 { 0,
3754 "Invalid recovery bandwidth percentage; "
3755 "must be -1 or > 0 and <= 100",
3756 BANDWIDTH_MIN, BANDWIDTH_MAX,
3757 &vp->bandwidth_parm, &vp->bandwidth, NULL },
3758 /* Handle sync argument separately in loop. */
3759 { -1,
3760 "Invalid recovery switch; must be \"sync\" or \"nosync\"" },
3761 { 0,
3762 "Invalid number of recovery stripes; "
3763 "must be -1, > 0 and <= 64",
3764 RECOVERY_STRIPES_MIN, RECOVERY_STRIPES_MAX,
3765 &vp->recovery_stripes_parm, &vp->recovery_stripes, NULL },
3766 }, *varp;
3767
3768 /* Fetch # of variable raid parameters. */
3769 if (sscanf(*(argv++), "%d", &vp->raid_parms) != 1 ||
3770 !range_ok(vp->raid_parms, 0, 7))
3771 TI_ERR("Bad variable raid parameters number");
3772
3773 /* Preset variable RAID parameters. */
3774 vp->chunk_size = CHUNK_SIZE_DEFAULT;
3775 vp->io_size = IO_SIZE_DEFAULT;
3776 vp->stripes = STRIPES_DEFAULT;
3777 vp->recover_io_size = RECOVER_IO_SIZE_DEFAULT;
3778 vp->bandwidth = BANDWIDTH_DEFAULT;
3779 vp->recovery = 1;
3780 vp->recovery_stripes = RECOVERY_STRIPES_DEFAULT;
3781
3782 /* Walk the array of argument constraints for all given ones. */
3783 for (p = 0, varp = argctr; p < vp->raid_parms; p++, varp++) {
3784 BUG_ON(varp >= ARRAY_END(argctr));
3785
3786 /* Special case for "[no]sync" string argument. */
3787 if (varp->action < 0) {
3788 if (!strcmp(*argv, "sync"))
3789 ;
3790 else if (!strcmp(*argv, "nosync"))
3791 vp->recovery = 0;
3792 else
3793 TI_ERR(varp->errmsg);
3794
3795 argv++;
3796 continue;
3797 }
3798
3799 /*
3800 * Special case for io_size depending
3801 * on previously set chunk size.
3802 */
3803 if (p == 2)
3804 varp->max = min(BIO_MAX_SECTORS / 2, vp->chunk_size);
3805
3806 if (sscanf(*(argv++), "%d", &value) != 1 ||
3807 (value != -1 &&
3808 ((varp->action && !POWER_OF_2(value)) ||
3809 !range_ok(value, varp->min, varp->max))))
3810 TI_ERR(varp->errmsg);
3811
3812 *varp->var = value;
3813 if (value != -1) {
3814 if (varp->var2)
3815 *varp->var2 = value;
3816 if (varp->var3)
3817 *varp->var3 = value;
3818 }
3819 }
3820
3821 return 0;
3822 }
3823
3824 /* Parse optional locking parameters. */
3825 static int get_raid_locking_parms(struct dm_target *ti, char **argv,
3826 int *locking_parms,
3827 struct dm_raid45_locking_type **locking_type)
3828 {
3829 if (!strnicmp(argv[0], "locking", strlen(argv[0]))) {
3830 char *lckstr = argv[1];
3831 size_t lcksz = strlen(lckstr);
3832
3833 if (!strnicmp(lckstr, "none", lcksz)) {
3834 *locking_type = &locking_none;
3835 *locking_parms = 2;
3836 } else if (!strnicmp(lckstr, "cluster", lcksz)) {
3837 DMERR("locking type \"%s\" not yet implemented",
3838 lckstr);
3839 return -EINVAL;
3840 } else {
3841 DMERR("unknown locking type \"%s\"", lckstr);
3842 return -EINVAL;
3843 }
3844 } else {
3845 *locking_parms = 0;
3846 *locking_type = &locking_none;
3847 }
3848 return 0;
3849 }
3850
3851 /* Set backing device read ahead properties of RAID set. */
3852 static void rs_set_read_ahead(struct raid_set *rs,
3853 unsigned sectors, unsigned stripes)
3854 {
3855 unsigned ra_pages = dm_div_up(sectors, SECTORS_PER_PAGE);
3856 struct mapped_device *md = dm_table_get_md(rs->ti->table);
3857 struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
3858
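/*
 * Worked example (4 KiB pages assumed, i.e. SECTORS_PER_PAGE = 8):
 * the constructor calls this with sectors = 2 * chunk_size and
 * stripes = 4, so a default chunk size of 64 sectors yields
 * ra_pages = 128 / 8 = 16 pages per component device and
 * 4 * 16 * data_devs pages for the RAID set as a whole.
 */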
3859 /* Set read-ahead for the RAID set and the component devices. */
3860 if (ra_pages) {
3861 unsigned p = rs->set.raid_devs;
3862
3863 bdi->ra_pages = stripes * ra_pages * rs->set.data_devs;
3864
3865 while (p--) {
3866 struct request_queue *q =
3867 bdev_get_queue(rs->dev[p].dev->bdev);
3868
3869 q->backing_dev_info.ra_pages = ra_pages;
3870 }
3871 }
3872
3873 dm_put(md);
3874 }
3875
3876 /* Set congested function. */
3877 static void rs_set_congested_fn(struct raid_set *rs)
3878 {
3879 struct mapped_device *md = dm_table_get_md(rs->ti->table);
3880 struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
3881
3882 /* Set congested function and data. */
3883 bdi->congested_fn = rs_congested;
3884 bdi->congested_data = rs;
3885 dm_put(md);
3886 }
3887
3888 /*
3889 * Construct a RAID4/5 mapping:
3890 *
3891 * log_type #log_params <log_params> \
3892 * raid_type [#parity_dev] #raid_variable_params <raid_params> \
3893 * [locking "none"/"cluster"]
3894 * #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,}
3895 *
3896 * log_type = "core"/"disk",
3897 * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only)
3898 * log_params = [dirty_log_path] region_size [[no]sync]
3899 *
3900 * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs"
3901 *
3902 * #parity_dev = N if raid_type = "raid4"
3903 * o N = -1: pick default = last device
3904 * o N >= 0 and < #raid_devs: parity device index
3905 *
3906 * #raid_variable_params = 0-7; raid_params (-1 = default):
3907 * [chunk_size [#stripes [io_size [recover_io_size \
3908 * [%recovery_bandwidth [recovery_switch [#recovery_stripes]]]]]]]
3909 * o chunk_size (unit to calculate drive addresses; must be 2^^n, > 8
3910 * and <= CHUNK_SIZE_MAX)
3911 * o #stripes is the number of stripes allocated to the stripe cache
3912 * (must be >= 8 and <= STRIPES_MAX)
3913 * o io_size (io unit size per device in sectors; must be 2^^n and > 8)
3914 * o recover_io_size (io unit size per device for recovery in sectors;
3915 * must be 2^^n, > SECTORS_PER_PAGE and <= region_size)
3916 * o %recovery_bandwidth is the maximum amount spent on recovery during
3917 * application io (1-100%)
3918 * o recovery switch = [sync|nosync]
3919 * o #recovery_stripes is the number of recovery stripes used for
3920 * parallel recovery of the RAID set
3921 * If raid_variable_params = 0, defaults will be used.
3922 * Any raid_variable_param can be set to -1 to apply a default
3923 *
3924 * #raid_devs = N (N >= 3)
3925 *
3926 * #dev_to_initialize = N
3927 * -1: initialize parity on all devices
3928 * >= 0 and < #raid_devs: initialize raid_path; used to force reconstruction
3929 * of a failed device's content after replacement
3930 *
3931 * <dev_path> = device_path (eg, /dev/sdd1)
3932 * <offset> = begin at offset on <dev_path>
3933 *
3934 */
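/*
 * Illustrative table line (device names, length and offsets are made up):
 *
 *   0 2097152 raid45 core 1 8192 raid5_la 0 3 -1 \
 *     /dev/sdb 0 /dev/sdc 0 /dev/sdd 0
 *
 * i.e. a "core" dirty log with 1 parameter (region_size 8192), raid type
 * raid5_la, 0 variable raid parameters (all defaults), no locking argument,
 * 3 raid devices, parity initialization on all devices (-1) and three
 * <dev_path> <offset> pairs.
 */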
3935 #define MIN_PARMS 13
3936 static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
3937 {
3938 int dev_to_init, dl_parms, i, locking_parms,
3939 parity_parm, pi = -1, r, raid_devs;
3940 unsigned speed;
3941 sector_t tmp, sectors_per_dev;
3942 struct dm_raid45_locking_type *locking;
3943 struct raid_set *rs;
3944 struct raid_type *raid_type;
3945 struct variable_parms parms;
3946
3947 /* Ensure minimum number of parameters. */
3948 if (argc < MIN_PARMS)
3949 TI_ERR("Not enough parameters");
3950
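/*
 * Constructor argument layout as parsed below (counts in brackets):
 * log_type [1], #log_params [1], <log_params> [dl_parms],
 * raid_type [1], #parity_dev [parity_parm, RAID4 only],
 * #raid_variable_params [1], <raid_params> [parms.raid_parms],
 * optional locking arguments [locking_parms],
 * #raid_devs [1], #dev_to_initialize [1],
 * <dev_path> <offset> pairs [2 * raid_devs].
 */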
3951 /* Fetch # of dirty log parameters. */
3952 if (sscanf(argv[1], "%d", &dl_parms) != 1 ||
3953 !range_ok(dl_parms, 1, 4711)) /* ;-) */
3954 TI_ERR("Bad dirty log parameters number");
3955
3956 /* Check raid_type. */
3957 raid_type = get_raid_type(argv[dl_parms + 2]);
3958 if (!raid_type)
3959 TI_ERR("Bad raid type");
3960
3961 /* In case of RAID4, parity drive is selectable. */
3962 parity_parm = !!(raid_type->level == raid4);
3963
3964 /* Handle variable number of RAID parameters. */
3965 r = get_raid_variable_parms(ti, argv + dl_parms + parity_parm + 3,
3966 &parms);
3967 if (r)
3968 return r;
3969
3970 /* Handle any locking parameters. */
3971 r = get_raid_locking_parms(ti,
3972 argv + dl_parms + parity_parm +
3973 parms.raid_parms + 4,
3974 &locking_parms, &locking);
3975 if (r)
3976 return r;
3977
3978 /* # of raid devices. */
3979 i = dl_parms + parity_parm + parms.raid_parms + locking_parms + 4;
3980 if (sscanf(argv[i], "%d", &raid_devs) != 1 ||
3981 raid_devs < raid_type->minimal_devs)
3982 TI_ERR("Invalid number of raid devices");
3983
3984 /* In case of RAID4, check parity drive index is in limits. */
3985 if (raid_type->level == raid4) {
3986 /* Fetch index of parity device. */
3987 if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 ||
3988 (pi != -1 && !range_ok(pi, 0, raid_devs - 1)))
3989 TI_ERR("Invalid RAID4 parity device index");
3990 }
3991
3992 /*
3993 * Index of device to initialize starts at 0
3994 *
3995 * o -1 -> don't initialize a selected device;
3996 * initialize parity conforming to algorithm
3997 * o 0..raid_devs-1 -> initialize respective device
3998 * (used for reconstruction of a replaced device)
3999 */
4000 if (sscanf(argv[dl_parms + parity_parm + parms.raid_parms +
4001 locking_parms + 5], "%d", &dev_to_init) != 1 ||
4002 !range_ok(dev_to_init, -1, raid_devs - 1))
4003 TI_ERR("Invalid number for raid device to initialize");
4004
4005 /* Check # of raid device arguments. */
4006 if (argc - dl_parms - parity_parm - parms.raid_parms -
4007 locking_parms - 6 != 2 * raid_devs)
4008 TI_ERR("Wrong number of raid device/offset arguments");
4009
4010 /*
4011 * Check that the table length is divisible
4012 * without remainder by (raid_devs - parity_devs)
4013 */
4014 if (!multiple(ti->len, raid_devs - raid_type->parity_devs,
4015 &sectors_per_dev))
4016 TI_ERR("Target length not divisible by number of data devices");
4017
4018 /*
4019 * Check that the device size is
4020 * divisible without remainder by the chunk size
4021 */
4022 if (!multiple(sectors_per_dev, parms.chunk_size, &tmp))
4023 TI_ERR("Device length not divisible by chunk_size");
4024
4025 /****************************************************************
4026 * Now that we checked the constructor arguments ->
4027 * let's allocate the RAID set
4028 ****************************************************************/
4029 rs = context_alloc(raid_type, &parms, raid_devs, sectors_per_dev,
4030 ti, dl_parms, argv);
4031 if (IS_ERR(rs))
4032 return PTR_ERR(rs);
4033
4034
4035 rs->set.dev_to_init = rs->set.dev_to_init_parm = dev_to_init;
4036 rs->set.pi = rs->set.pi_parm = pi;
4037
4038 /* Set RAID4 parity drive index. */
4039 if (raid_type->level == raid4)
4040 rs->set.pi = (pi == -1) ? rs->set.data_devs : pi;
4041
4042 recover_set_bandwidth(rs, parms.bandwidth);
4043
4044 /* Use locking type to lock stripe access. */
4045 rs->locking = locking;
4046
4047 /* Get the device/offset tuples. */
4048 argv += dl_parms + parity_parm + parms.raid_parms + locking_parms + 6;
4049 r = dev_parms(rs, argv, &i);
4050 if (r)
4051 goto err;
4052
4053 /* Set backing device information (eg. read ahead). */
4054 rs_set_read_ahead(rs, 2 * rs->set.chunk_size, 4 /* stripes */);
4055 rs_set_congested_fn(rs); /* Set congested function. */
4056 SetRSCheckOverwrite(rs); /* Allow chunk overwrite checks. */
4057 speed = xor_optimize(rs); /* Select best xor algorithm. */
4058
4059 /* Set for recovery of any nosync regions. */
4060 if (parms.recovery)
4061 SetRSRecover(rs);
4062 else {
4063 /*
4064 * Need to free recovery stripe(s) here in case
4065 * of nosync, because xor_optimize uses one.
4066 */
4067 set_start_recovery(rs);
4068 set_end_recovery(rs);
4069 stripe_recover_free(rs);
4070 }
4071
4072 /*
4073 * Make sure that dm core only hands maximum io size
4074 * length down and pays attention to io boundaries.
4075 */
4076 ti->max_io_len = rs->set.io_size;
4077 ti->private = rs;
4078
4079 /* Initialize work queue to handle this RAID set's io. */
4080 r = rs_workqueue_init(rs);
4081 if (r)
4082 goto err;
4083
4084 rs_log(rs, speed); /* Log information about RAID set. */
4085 return 0;
4086
4087 err:
4088 context_free(rs, i);
4089 return r;
4090 }
4091
4092 /*
4093 * Destruct a raid mapping
4094 */
4095 static void raid_dtr(struct dm_target *ti)
4096 {
4097 struct raid_set *rs = ti->private;
4098
4099 destroy_workqueue(rs->io.wq);
4100 context_free(rs, rs->set.raid_devs);
4101 }
4102
4103 /* Raid mapping function. */
4104 static int raid_map(struct dm_target *ti, struct bio *bio,
4105 union map_info *map_context)
4106 {
4107 /* I don't want to waste stripe cache capacity. */
4108 if (bio_rw(bio) == READA)
4109 return -EIO;
4110 else {
4111 struct raid_set *rs = ti->private;
4112
4113 /*
4114 * Take an io reference which device suspension/
4115 * destruction waits on to drop to zero.
4116 */
4117 io_get(rs);
4118 bio->bi_sector -= ti->begin; /* Remap sector. */
4119
4120 /* Queue io to RAID set. */
4121 mutex_lock(&rs->io.in_lock);
4122 bio_list_add(&rs->io.in, bio);
4123 mutex_unlock(&rs->io.in_lock);
4124
4125 /* Wake daemon to process input list. */
4126 wake_do_raid(rs);
4127
4128 /* REMOVEME: statistics. */
4129 atomic_inc(rs->stats + (bio_data_dir(bio) == READ ?
4130 S_BIOS_READ : S_BIOS_WRITE));
4131 return DM_MAPIO_SUBMITTED; /* Handle later. */
4132 }
4133 }
4134
4135 /* Device suspend. */
4136 static void raid_presuspend(struct dm_target *ti)
4137 {
4138 struct raid_set *rs = ti->private;
4139 struct dm_dirty_log *dl = rs->recover.dl;
4140
4141 SetRSSuspend(rs);
4142
4143 if (RSRecover(rs))
4144 dm_rh_stop_recovery(rs->recover.rh);
4145
4146 cancel_delayed_work(&rs->io.dws_do_raid);
4147 flush_workqueue(rs->io.wq);
4148 wait_ios(rs); /* Wait for completion of all ios being processed. */
4149
4150 if (dl->type->presuspend && dl->type->presuspend(dl))
4151 /* FIXME: need better error handling. */
4152 DMWARN("log presuspend failed");
4153 }
4154
4155 static void raid_postsuspend(struct dm_target *ti)
4156 {
4157 struct raid_set *rs = ti->private;
4158 struct dm_dirty_log *dl = rs->recover.dl;
4159
4160 if (dl->type->postsuspend && dl->type->postsuspend(dl))
4161 /* FIXME: need better error handling. */
4162 DMWARN("log postsuspend failed");
4163
4164 }
4165
4166 /* Device resume. */
4167 static void raid_resume(struct dm_target *ti)
4168 {
4169 struct raid_set *rs = ti->private;
4170 struct recover *rec = &rs->recover;
4171 struct dm_dirty_log *dl = rec->dl;
4172
4173 /* Resume dirty log. */
4174 if (dl->type->resume && dl->type->resume(dl))
4175 /* FIXME: need better error handling. */
4176 DMWARN("log resume failed");
4177
4178 rec->nr_regions_to_recover =
4179 rec->nr_regions - dl->type->get_sync_count(dl);
4180
4181 /* Restart any unfinished recovery. */
4182 if (RSRecover(rs)) {
4183 set_start_recovery(rs);
4184 dm_rh_start_recovery(rec->rh);
4185 }
4186
4187 ClearRSSuspend(rs);
4188 wake_do_raid(rs);
4189 }
4190
4191 /* Return stripe cache size. */
4192 static unsigned sc_size(struct raid_set *rs)
4193 {
4194 return to_sector(atomic_read(&rs->sc.stripes) *
4195 (sizeof(struct stripe) +
4196 (sizeof(struct stripe_chunk) +
4197 (sizeof(struct page_list) +
4198 to_bytes(rs->set.io_size) *
4199 rs->set.raid_devs)) +
4200 (rs->recover.end_jiffies ?
4201 0 : rs->recover.recovery_stripes *
4202 to_bytes(rs->set.raid_devs * rs->recover.io_size))));
4203 }
4204
4205 /* REMOVEME: status output for development. */
4206 static void raid_devel_stats(struct dm_target *ti, char *result,
4207 unsigned *size, unsigned maxlen)
4208 {
4209 unsigned sz = *size;
4210 unsigned long j;
4211 char buf[BDEVNAME_SIZE], *p;
4212 struct stats_map *sm;
4213 struct raid_set *rs = ti->private;
4214 struct recover *rec = &rs->recover;
4215 struct timespec ts;
4216
4217 DMEMIT("%s %s %u\n", version, rs->xor.f->name, rs->xor.chunks);
4218 DMEMIT("act_ios=%d ", io_ref(rs));
4219 DMEMIT("act_ios_max=%d\n", atomic_read(&rs->io.in_process_max));
4220 DMEMIT("act_stripes=%d ", sc_active(&rs->sc));
4221 DMEMIT("act_stripes_max=%d\n",
4222 atomic_read(&rs->sc.active_stripes_max));
4223
4224 for (sm = stats_map; sm < ARRAY_END(stats_map); sm++)
4225 DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type));
4226
4227 DMEMIT(" checkovr=%s\n", RSCheckOverwrite(rs) ? "on" : "off");
4228 DMEMIT("sc=%u/%u/%u/%u/%u/%u/%u\n", rs->set.chunk_size,
4229 atomic_read(&rs->sc.stripes), rs->set.io_size,
4230 rec->recovery_stripes, rec->io_size, rs->sc.hash.buckets,
4231 sc_size(rs));
4232
4233 j = (rec->end_jiffies ? rec->end_jiffies : jiffies) -
4234 rec->start_jiffies;
4235 jiffies_to_timespec(j, &ts);
4236 sprintf(buf, "%ld.%ld", ts.tv_sec, ts.tv_nsec);
4237 p = strchr(buf, '.');
4238 p[3] = 0;
4239
4240 DMEMIT("rg=%llu/%llu/%llu/%u %s\n",
4241 (unsigned long long) rec->nr_regions_recovered,
4242 (unsigned long long) rec->nr_regions_to_recover,
4243 (unsigned long long) rec->nr_regions, rec->bandwidth, buf);
4244
4245 *size = sz;
4246 }
4247
4248 static int raid_status(struct dm_target *ti, status_type_t type,
4249 unsigned status_flags, char *result, unsigned maxlen)
4250 {
4251 unsigned p, sz = 0;
4252 char buf[BDEVNAME_SIZE];
4253 struct raid_set *rs = ti->private;
4254 int raid_parms[] = {
4255 rs->set.chunk_size_parm,
4256 rs->sc.stripes_parm,
4257 rs->set.io_size_parm,
4258 rs->recover.io_size_parm,
4259 rs->recover.bandwidth_parm,
4260 -2,
4261 rs->recover.recovery_stripes,
4262 };
4263
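/*
 * Illustrative STATUSTYPE_INFO output for a healthy three-device
 * RAID4 set with the parity device at index 2 (device numbers are
 * made up):
 *
 *   3 8:16 8:32 8:48 1 AAAp
 *
 * 'A'/'D' flag each device alive or dead, 'p' marks the parity
 * device and 'i' the device selected for initialization.
 */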
4264 switch (type) {
4265 case STATUSTYPE_INFO:
4266 /* REMOVEME: statistics. */
4267 if (RSDevelStats(rs))
4268 raid_devel_stats(ti, result, &sz, maxlen);
4269
4270 DMEMIT("%u ", rs->set.raid_devs);
4271
4272 for (p = 0; p < rs->set.raid_devs; p++)
4273 DMEMIT("%s ",
4274 format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev));
4275
4276 DMEMIT("1 ");
4277 for (p = 0; p < rs->set.raid_devs; p++) {
4278 DMEMIT("%c", !DevFailed(rs->dev + p) ? 'A' : 'D');
4279
4280 if (p == rs->set.pi)
4281 DMEMIT("p");
4282
4283 if (rs->set.dev_to_init == p)
4284 DMEMIT("i");
4285 }
4286
4287 break;
4288 case STATUSTYPE_TABLE:
4289 sz = rs->recover.dl->type->status(rs->recover.dl, type,
4290 result, maxlen);
4291 DMEMIT("%s %u ", rs->set.raid_type->name,
4292 rs->set.raid_parms);
4293
4294 for (p = 0; p < rs->set.raid_parms; p++) {
4295 if (raid_parms[p] > -2)
4296 DMEMIT("%d ", raid_parms[p]);
4297 else
4298 DMEMIT("%s ", rs->recover.recovery ?
4299 "sync" : "nosync");
4300 }
4301
4302 DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init);
4303
4304 for (p = 0; p < rs->set.raid_devs; p++)
4305 DMEMIT("%s %llu ",
4306 format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev),
4307 (unsigned long long) rs->dev[p].start);
4308 }
4309
4310 return 0;
4311 }
4312
4313 /*
4314 * Message interface
4315 */
4316 enum raid_msg_actions {
4317 act_bw, /* Recovery bandwidth switch. */
4318 act_dev, /* Device failure switch. */
4319 act_overwrite, /* Stripe overwrite check. */
4320 act_stats, /* Development statistics switch. */
4321 act_sc, /* Stripe cache switch. */
4322
4323 act_on, /* Set entity on. */
4324 act_off, /* Set entity off. */
4325 act_reset, /* Reset entity. */
4326
4327 act_set = act_on, /* Set # absolute. */
4328 act_grow = act_off, /* Grow # by an amount. */
4329 act_shrink = act_reset, /* Shrink # by an amount. */
4330 };
4331
4332 /* Turn a delta into an absolute value. */
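/*
 * Example (values for illustration only): with a current value of
 * act = 40, "set 50" returns 50, "grow 10" returns 50 and
 * "shrink 10" returns 30; any other action yields -EINVAL.
 */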
4333 static int _absolute(unsigned long action, int act, int r)
4334 {
4335 /* Make delta absolute. */
4336 if (test_bit(act_set, &action))
4337 ;
4338 else if (test_bit(act_grow, &action))
4339 r += act;
4340 else if (test_bit(act_shrink, &action))
4341 r = act - r;
4342 else
4343 r = -EINVAL;
4344
4345 return r;
4346 }
4347
4348 /* Change recovery io bandwidth. */
4349 static int bandwidth_change(struct dm_msg *msg, void *context)
4350 {
4351 struct raid_set *rs = context;
4352 int act = rs->recover.bandwidth;
4353 int bandwidth = DM_MSG_INT_ARG(msg);
4354
4355 if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
4356 /* Make delta bandwidth absolute. */
4357 bandwidth = _absolute(msg->action, act, bandwidth);
4358
4359 /* Check range. */
4360 if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
4361 recover_set_bandwidth(rs, bandwidth);
4362 return 0;
4363 }
4364 }
4365
4366 set_bit(dm_msg_ret_arg, &msg->ret);
4367 set_bit(dm_msg_ret_inval, &msg->ret);
4368 return -EINVAL;
4369 }
4370
4371 /* Set/reset development feature flags. */
4372 static int devel_flags(struct dm_msg *msg, void *context)
4373 {
4374 struct raid_set *rs = context;
4375
4376 if (test_bit(act_on, &msg->action))
4377 return test_and_set_bit(msg->spec->parm,
4378 &rs->io.flags) ? -EPERM : 0;
4379 else if (test_bit(act_off, &msg->action))
4380 return test_and_clear_bit(msg->spec->parm,
4381 &rs->io.flags) ? 0 : -EPERM;
4382 else if (test_bit(act_reset, &msg->action)) {
4383 if (test_bit(act_stats, &msg->action)) {
4384 stats_reset(rs);
4385 goto on;
4386 } else if (test_bit(act_overwrite, &msg->action)) {
4387 on:
4388 set_bit(msg->spec->parm, &rs->io.flags);
4389 return 0;
4390 }
4391 }
4392
4393 return -EINVAL;
4394 }
4395
4396 /* Resize the stripe cache. */
4397 static int sc_resize(struct dm_msg *msg, void *context)
4398 {
4399 int act, stripes;
4400 struct raid_set *rs = context;
4401
4402 /* Deny permission while the daemon is still resizing! */
4403 if (atomic_read(&rs->sc.stripes_to_set))
4404 return -EPERM;
4405
4406 stripes = DM_MSG_INT_ARG(msg);
4407 if (stripes > 0) {
4408 act = atomic_read(&rs->sc.stripes);
4409
4410 /* Make delta stripes absolute. */
4411 stripes = _absolute(msg->action, act, stripes);
4412
4413 /*
4414 * Check range and that the # of stripes changes.
4415 * We leave the resizing to the worker.
4416 */
4417 if (range_ok(stripes, STRIPES_MIN, STRIPES_MAX) &&
4418 stripes != atomic_read(&rs->sc.stripes)) {
4419 atomic_set(&rs->sc.stripes_to_set, stripes);
4420 wake_do_raid(rs);
4421 return 0;
4422 }
4423 }
4424
4425 set_bit(dm_msg_ret_arg, &msg->ret);
4426 set_bit(dm_msg_ret_inval, &msg->ret);
4427 return -EINVAL;
4428 }
4429
4430 /* Parse the RAID message action. */
4431 /*
4432 * 'ba[ndwidth] {se[t],g[row],sh[rink]} #'	# e.g. 'ba se 50'
4433 * 'o[verwrite] {on,of[f],r[eset]}'		# e.g. 'o of'
4434 * 'sta[tistics] {on,of[f],r[eset]}' # e.g. 'stat of'
4435 * 'str[ipecache] {se[t],g[row],sh[rink]} #' # e.g. 'stripe set 1024'
4436 *
4437 */
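/*
 * For instance (mapping name and values are made up), the recovery
 * bandwidth of an active "r5" mapping could be raised to 50% with
 *
 *   dmsetup message r5 0 bandwidth set 50
 *
 * and the stripe cache grown by 256 stripes with
 *
 *   dmsetup message r5 0 stripecache grow 256
 */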
4438 static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
4439 {
4440 /* Variables to store the parsed parameters in. */
4441 static int i[2];
4442 static unsigned long *i_arg[] = {
4443 (unsigned long *) i + 0,
4444 (unsigned long *) i + 1,
4445 };
4446
4447 /* Declare all message option strings. */
4448 static char *str_sgs[] = { "set", "grow", "shrink" };
4449 static char *str_oor[] = { "on", "off", "reset" };
4450
4451 /* Declare all actions. */
4452 static unsigned long act_sgs[] = { act_set, act_grow, act_shrink };
4453 static unsigned long act_oor[] = { act_on, act_off, act_reset };
4454
4455 /* Bandwidth option. */
4456 static struct dm_message_option bw_opt = { 3, str_sgs, act_sgs };
4457 static struct dm_message_argument bw_args = {
4458 1, i_arg, { dm_msg_int_t }
4459 };
4460
4461 static struct dm_message_argument null_args = {
4462 0, NULL, { dm_msg_int_t }
4463 };
4464
4465 /* Overwrite and statistics option. */
4466 static struct dm_message_option ovr_stats_opt = { 3, str_oor, act_oor };
4467
4468 /* Stripecache option. */
4469 static struct dm_message_option stripe_opt = { 3, str_sgs, act_sgs };
4470
4471 /* Declare messages. */
4472 static struct dm_msg_spec specs[] = {
4473 { "bandwidth", act_bw, &bw_opt, &bw_args,
4474 0, bandwidth_change },
4475 { "overwrite", act_overwrite, &ovr_stats_opt, &null_args,
4476 RS_CHECK_OVERWRITE, devel_flags },
4477 { "statistics", act_stats, &ovr_stats_opt, &null_args,
4478 RS_DEVEL_STATS, devel_flags },
4479 { "stripecache", act_sc, &stripe_opt, &bw_args,
4480 0, sc_resize },
4481 };
4482
4483 /* The message for the parser. */
4484 struct dm_msg msg = {
4485 .num_specs = ARRAY_SIZE(specs),
4486 .specs = specs,
4487 };
4488
4489 return dm_message_parse(TARGET, &msg, ti->private, argc, argv);
4490 }
4491 /*
4492 * END message interface
4493 */
4494
4495 static struct target_type raid_target = {
4496 .name = "raid45",
4497 .version = {1, 0, 0},
4498 .module = THIS_MODULE,
4499 .ctr = raid_ctr,
4500 .dtr = raid_dtr,
4501 .map = raid_map,
4502 .presuspend = raid_presuspend,
4503 .postsuspend = raid_postsuspend,
4504 .resume = raid_resume,
4505 .status = raid_status,
4506 .message = raid_message,
4507 };
4508
4509 static void init_exit(const char *bad_msg, const char *good_msg, int r)
4510 {
4511 if (r)
4512 DMERR("Failed to %sregister target [%d]", bad_msg, r);
4513 else
4514 DMINFO("%s %s", good_msg, version);
4515 }
4516
4517 static int __init dm_raid_init(void)
4518 {
4519 int r = dm_register_target(&raid_target);
4520
4521 init_exit("", "initialized", r);
4522 return r;
4523 }
4524
4525 static void __exit dm_raid_exit(void)
4526 {
4527 dm_unregister_target(&raid_target);
4528 init_exit("un", "exit", 0);
4529 }
4530
4531 /* Module hooks. */
4532 module_init(dm_raid_init);
4533 module_exit(dm_raid_exit);
4534
4535 MODULE_DESCRIPTION(DM_NAME " raid4/5 target");
4536 MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
4537 MODULE_LICENSE("GPL");
4538 MODULE_ALIAS("dm-raid4");
4539 MODULE_ALIAS("dm-raid5");