Illumos 5027 - zfs large block support
author Matthew Ahrens <mahrens@delphix.com>
Mon, 3 Nov 2014 20:15:08 +0000 (12:15 -0800)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Mon, 11 May 2015 19:23:16 +0000 (12:23 -0700)
5027 zfs large block support
Reviewed by: Alek Pinchuk <pinchuk.alek@gmail.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Reviewed by: Richard Elling <richard.elling@richardelling.com>
Reviewed by: Saso Kiselkov <skiselkov.ml@gmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@omniti.com>

References:
  https://www.illumos.org/issues/5027
  https://github.com/illumos/illumos-gate/commit/b515258

Porting Notes:

* Included in this patch is a tiny ISP2() cleanup in zio_init() from
Illumos 5255.
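
For context, ISP2() is the standard power-of-two test macro, defined
as (((x) & ((x) - 1)) == 0); the zio_init() change simply replaces an
open-coded test with the macro.  A minimal sketch of the pattern (the
helper name is hypothetical, not the actual zio_init() code):

    #include <stdint.h>

    #define ISP2(x) (((x) & ((x) - 1)) == 0)  /* true when x is a power of two */

    /* Hypothetical helper: round size down to the largest power of two <= size. */
    static uint64_t
    round_down_pow2(uint64_t p2)
    {
            /* before the cleanup, the loop condition was written (p2 & (p2 - 1)) */
            while (!ISP2(p2))
                    p2 &= p2 - 1;   /* clear the lowest set bit */
            return (p2);
    }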

* Unlike the upstream Illumos commit, this patch does not impose an
arbitrary 128K block size limit on volumes.  Volumes, like filesystems,
are limited by the zfs_max_recordsize=1M module option.

* By default, the maximum record size is limited to 1M by the module
option zfs_max_recordsize.  This value may be safely increased up to
16M, which is the largest block size supported by the on-disk format.
At the moment, 1M blocks clearly offer a significant performance
improvement, but the benefits of going beyond this for the majority
of workloads are less clear.
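
A minimal sketch of the validation this cap implies (the helper name
is hypothetical; the bounds and the ISP2() test mirror the recordsize
checks in the diff below):

    #include <errno.h>
    #include <stdint.h>

    #define SPA_MINBLOCKSHIFT  9
    #define SPA_MINBLOCKSIZE   (1ULL << SPA_MINBLOCKSHIFT)  /* 512 bytes */
    #define ISP2(x)            (((x) & ((x) - 1)) == 0)

    /* Tunable cap, settable via the zfs_max_recordsize module option. */
    static uint64_t zfs_max_recordsize = 1024 * 1024;        /* 1M default */

    /* Hypothetical helper: a recordsize must be a power of two within bounds. */
    static int
    validate_recordsize(uint64_t size)
    {
            if (size < SPA_MINBLOCKSIZE || size > zfs_max_recordsize ||
                !ISP2(size))
                    return (EINVAL);
            return (0);
    }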

* The Illumos version of this patch increased DMU_MAX_ACCESS to 32M.
This was determined not to be large enough when using 16M blocks
because the zfs_make_xattrdir() function will fail (EFBIG) when
assigning a TX.  This was immediately observed under Linux because
all newly created files must have a security xattr created and
that was failing.  Therefore, we've set DMU_MAX_ACCESS to 64M.
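
The EFBIG comes from the DMU's per-transaction access limit.  A
simplified sketch of that style of check (illustrative only; just the
DMU_MAX_ACCESS value itself comes from this patch):

    #include <errno.h>
    #include <stdint.h>

    #define DMU_MAX_ACCESS (64 * 1024 * 1024)   /* 64MB; Illumos used 32MB */

    /* Illustrative only: reject a TX whose accounted footprint exceeds the cap. */
    static int
    tx_footprint_ok(uint64_t towrite, uint64_t tooverwrite)
    {
            if (towrite + tooverwrite > DMU_MAX_ACCESS)
                    return (EFBIG);  /* the error zfs_make_xattrdir() was hitting */
            return (0);
    }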

* On 32-bit platforms a hard limit of 1M is set for blocks due
to the limited virtual address space.  We should be able to relax
this once the ABD patches are merged.

Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #354

55 files changed:
cmd/zdb/zdb.c
cmd/zfs/zfs_main.c
cmd/zstreamdump/zstreamdump.c
cmd/ztest/ztest.c
include/libzfs.h
include/libzfs_core.h
include/sys/dmu.h
include/sys/dmu_objset.h
include/sys/dmu_send.h
include/sys/dsl_dataset.h
include/sys/fs/zfs.h
include/sys/spa.h
include/sys/zap_impl.h
include/sys/zfs_ioctl.h
include/sys/zfs_sa.h
include/sys/zfs_znode.h
include/sys/zil.h
include/sys/zil_impl.h
include/zfeature_common.h
lib/libzfs/libzfs_dataset.c
lib/libzfs/libzfs_sendrecv.c
lib/libzfs_core/libzfs_core.c
man/man5/zfs-module-parameters.5
man/man5/zpool-features.5
man/man8/zfs.8
module/zcommon/zfs_prop.c
module/zcommon/zpool_prop.c
module/zfs/arc.c
module/zfs/bpobj.c
module/zfs/bptree.c
module/zfs/dbuf.c
module/zfs/dmu_objset.c
module/zfs/dmu_send.c
module/zfs/dmu_tx.c
module/zfs/dnode.c
module/zfs/dsl_dataset.c
module/zfs/dsl_deadlist.c
module/zfs/dsl_destroy.c
module/zfs/dsl_pool.c
module/zfs/sa.c
module/zfs/spa.c
module/zfs/spa_history.c
module/zfs/spa_misc.c
module/zfs/vdev.c
module/zfs/vdev_disk.c
module/zfs/vdev_queue.c
module/zfs/zap_micro.c
module/zfs/zfeature_common.c
module/zfs/zfs_ioctl.c
module/zfs/zfs_log.c
module/zfs/zfs_vfsops.c
module/zfs/zfs_vnops.c
module/zfs/zfs_znode.c
module/zfs/zil.c
module/zfs/zio.c

cmd/zdb/zdb.c
index 25f44212a3ca834f72af39c41625c2da914bd04d..ab6155054fd52733f974199c93ea936420cf1a60 100644 (file)
@@ -2185,6 +2185,8 @@ dump_label(const char *dev)
        (void) close(fd);
 }
 
+static uint64_t num_large_blocks;
+
 /*ARGSUSED*/
 static int
 dump_one_dir(const char *dsname, void *arg)
@@ -2197,6 +2199,8 @@ dump_one_dir(const char *dsname, void *arg)
                (void) printf("Could not open %s, error %d\n", dsname, error);
                return (0);
        }
+       if (dmu_objset_ds(os)->ds_large_blocks)
+               num_large_blocks++;
        dump_dir(os);
        dmu_objset_disown(os, FTAG);
        fuid_table_destroy();
@@ -2207,7 +2211,7 @@ dump_one_dir(const char *dsname, void *arg)
 /*
  * Block statistics.
  */
-#define        PSIZE_HISTO_SIZE (SPA_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1)
+#define        PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
 typedef struct zdb_blkstats {
        uint64_t zb_asize;
        uint64_t zb_lsize;
@@ -2273,7 +2277,15 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
                zb->zb_lsize += BP_GET_LSIZE(bp);
                zb->zb_psize += BP_GET_PSIZE(bp);
                zb->zb_count++;
-               zb->zb_psize_histogram[BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT]++;
+
+               /*
+                * The histogram is only big enough to record blocks up to
+                * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
+                * "other", bucket.
+                */
+               int idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
+               idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
+               zb->zb_psize_histogram[idx]++;
 
                zb->zb_gangs += BP_COUNT_GANG(bp);
 
@@ -2979,6 +2991,7 @@ dump_zpool(spa_t *spa)
                dump_metaslab_groups(spa);
 
        if (dump_opt['d'] || dump_opt['i']) {
+               uint64_t refcount;
                dump_dir(dp->dp_meta_objset);
                if (dump_opt['d'] >= 3) {
                        dump_bpobj(&spa->spa_deferred_bpobj,
@@ -2998,8 +3011,21 @@ dump_zpool(spa_t *spa)
                }
                (void) dmu_objset_find(spa_name(spa), dump_one_dir,
                    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+
+               (void) feature_get_refcount(spa,
+                   &spa_feature_table[SPA_FEATURE_LARGE_BLOCKS], &refcount);
+               if (num_large_blocks != refcount) {
+                       (void) printf("large_blocks feature refcount mismatch: "
+                           "expected %lld != actual %lld\n",
+                           (longlong_t)num_large_blocks,
+                           (longlong_t)refcount);
+                       rc = 2;
+               } else {
+                       (void) printf("Verified large_blocks feature refcount "
+                           "is correct (%llu)\n", (longlong_t)refcount);
+               }
        }
-       if (dump_opt['b'] || dump_opt['c'])
+       if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
                rc = dump_block_stats(spa);
 
        if (rc == 0)
cmd/zfs/zfs_main.c
index 50ac59fba74766034aa5def8ae29c9efafd75ae1..32b9239b7d9f3aa70e6ba924de8297d6e27781c2 100644 (file)
@@ -258,9 +258,9 @@ get_usage(zfs_help_t idx)
        case HELP_ROLLBACK:
                return (gettext("\trollback [-rRf] <snapshot>\n"));
        case HELP_SEND:
-               return (gettext("\tsend [-DnPpRrve] [-[iI] snapshot] "
+               return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] "
                    "<snapshot>\n"
-                   "\tsend [-e] [-i snapshot|bookmark] "
+                   "\tsend [-Le] [-i snapshot|bookmark] "
                    "<filesystem|volume|snapshot>\n"));
        case HELP_SET:
                return (gettext("\tset <property=value> "
@@ -3683,7 +3683,7 @@ zfs_do_send(int argc, char **argv)
        boolean_t extraverbose = B_FALSE;
 
        /* check options */
-       while ((c = getopt(argc, argv, ":i:I:RDpvnPe")) != -1) {
+       while ((c = getopt(argc, argv, ":i:I:RDpvnPLe")) != -1) {
                switch (c) {
                case 'i':
                        if (fromname)
@@ -3718,6 +3718,9 @@ zfs_do_send(int argc, char **argv)
                case 'n':
                        flags.dryrun = B_TRUE;
                        break;
+               case 'L':
+                       flags.largeblock = B_TRUE;
+                       break;
                case 'e':
                        flags.embed_data = B_TRUE;
                        break;
@@ -3774,6 +3777,8 @@ zfs_do_send(int argc, char **argv)
                if (zhp == NULL)
                        return (1);
 
+               if (flags.largeblock)
+                       lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK;
                if (flags.embed_data)
                        lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
 
cmd/zstreamdump/zstreamdump.c
index c51a9c80680ebf16a96fc9282ca96711b5ed9d76..176dd66b268af4c9c8b2becbe2d91bdd0cc7b530 100644 (file)
@@ -56,7 +56,6 @@ uint64_t total_stream_len = 0;
 FILE *send_stream = 0;
 boolean_t do_byteswap = B_FALSE;
 boolean_t do_cksum = B_TRUE;
-#define        INITIAL_BUFLEN (1<<20)
 
 static void
 usage(void)
@@ -69,6 +68,18 @@ usage(void)
        exit(1);
 }
 
+static void *
+safe_malloc(size_t size)
+{
+       void *rv = malloc(size);
+       if (rv == NULL) {
+               (void) fprintf(stderr, "ERROR; failed to allocate %u bytes\n",
+                   (unsigned)size);
+               abort();
+       }
+       return (rv);
+}
+
 /*
  * ssread - send stream read.
  *
@@ -160,7 +171,7 @@ print_block(char *buf, int length)
 int
 main(int argc, char *argv[])
 {
-       char *buf = malloc(INITIAL_BUFLEN);
+       char *buf = safe_malloc(SPA_MAXBLOCKSIZE);
        uint64_t drr_record_count[DRR_NUMTYPES] = { 0 };
        uint64_t total_records = 0;
        dmu_replay_record_t thedrr;
@@ -308,9 +319,9 @@ main(int argc, char *argv[])
                                nvlist_t *nv;
                                int sz = drr->drr_payloadlen;
 
-                               if (sz > INITIAL_BUFLEN) {
+                               if (sz > SPA_MAXBLOCKSIZE) {
                                        free(buf);
-                                       buf = malloc(sz);
+                                       buf = safe_malloc(sz);
                                }
                                (void) ssread(buf, sz, &zc);
                                if (ferror(send_stream))
cmd/ztest/ztest.c
index 0602a7ec54bf4dfb01b138d8122fff475c973933..6b939bdb656b7f0aca0ba1efa7367dfae52ee2ca 100644 (file)
@@ -1040,9 +1040,14 @@ ztest_spa_get_ashift(void) {
 static int
 ztest_random_blocksize(void)
 {
-       // Choose a block size >= the ashift.
-       uint64_t block_shift =
-           ztest_random(SPA_MAXBLOCKSHIFT - ztest_spa_get_ashift() + 1);
+       /*
+        * Choose a block size >= the ashift.
+        * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
+        */
+       int maxbs = SPA_OLD_MAXBLOCKSHIFT;
+       if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
+               maxbs = 20;
+       uint64_t block_shift = ztest_random(maxbs - ztest_spa_get_ashift() + 1);
        return (1 << (SPA_MINBLOCKSHIFT + block_shift));
 }
 
@@ -4972,7 +4977,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
        char *path0;
        char *pathrand;
        size_t fsize;
-       int bshift = SPA_MAXBLOCKSHIFT + 2;     /* don't scrog all labels */
+       int bshift = SPA_OLD_MAXBLOCKSHIFT + 2; /* don't scrog all labels */
        int iters = 1000;
        int maxfaults;
        int mirror_save;
include/libzfs.h
index e6a877214a642fd1e8ac1709e46ea79766cf9ab1..c4e30c5d4dee1d5b7549c494bc20d81953ad1fd4 100644 (file)
@@ -617,6 +617,9 @@ typedef struct sendflags {
        /* show progress (ie. -v) */
        boolean_t progress;
 
+       /* large blocks (>128K) are permitted */
+       boolean_t largeblock;
+
        /* WRITE_EMBEDDED records of type DATA are permitted */
        boolean_t embed_data;
 } sendflags_t;
include/libzfs_core.h
index d7d767055d3358173aa894488803ac4155b55f49..bdd6c951ee496dc1e21a297e7a69b1342aecf79b 100644 (file)
@@ -53,7 +53,8 @@ int lzc_release(nvlist_t *, nvlist_t **);
 int lzc_get_holds(const char *, nvlist_t **);
 
 enum lzc_send_flags {
-       LZC_SEND_FLAG_EMBED_DATA = 1 << 0
+       LZC_SEND_FLAG_EMBED_DATA = 1 << 0,
+       LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1
 };
 
 int lzc_send(const char *, const char *, int, enum lzc_send_flags);
include/sys/dmu.h
index aa3e89d6070cedbb7dec425d3cbb4f2f7440f1af..08871e890a6e9ab6ff15921d20beeca73f42c26f 100644 (file)
@@ -245,7 +245,7 @@ void zfs_znode_byteswap(void *buf, size_t size);
  * The maximum number of bytes that can be accessed as part of one
  * operation, including metadata.
  */
-#define        DMU_MAX_ACCESS (10<<20) /* 10MB */
+#define        DMU_MAX_ACCESS (64 * 1024 * 1024) /* 64MB */
 #define        DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
 
 #define        DMU_USERUSED_OBJECT     (-1ULL)
@@ -732,6 +732,7 @@ void xuio_stat_wbuf_copied(void);
 void xuio_stat_wbuf_nocopy(void);
 
 extern int zfs_prefetch_disable;
+extern int zfs_max_recordsize;
 
 /*
  * Asynchronously try to read in the data.
include/sys/dmu_objset.h
index 65ae850f4d28f4570676369b630b3182164eb542..8cb7bd02f4c2089bf5c2470db67f83016ae0b6a3 100644 (file)
@@ -99,6 +99,7 @@ struct objset {
        zfs_cache_type_t os_secondary_cache;
        zfs_sync_type_t os_sync;
        zfs_redundant_metadata_type_t os_redundant_metadata;
+       int os_recordsize;
 
        /* no lock needed: */
        struct dmu_tx *os_synctx; /* XXX sketchy */
include/sys/dmu_send.h
index dc183c02c350b64c8b0168a8669b573698f3c7d6..3a8dc89abd4ab4ba7aa143f01abb0ffd0098a47a 100644 (file)
@@ -37,12 +37,14 @@ struct dsl_dataset;
 struct drr_begin;
 struct avl_tree;
 
-int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+int dmu_send(const char *tosnap, const char *fromsnap,
+    boolean_t embedok, boolean_t large_block_ok,
     int outfd, struct vnode *vp, offset_t *off);
 int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
     uint64_t *sizep);
 int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
-    boolean_t embedok, int outfd, vnode_t *vp, offset_t *off);
+    boolean_t embedok, boolean_t large_block_ok,
+    int outfd, struct vnode *vp, offset_t *off);
 
 typedef struct dmu_recv_cookie {
        struct dsl_dataset *drc_ds;
include/sys/dsl_dataset.h
index edfc5509efb1975e761efac5fc6722c28fcf6828..1985ce824230d2e484aaf15406e4b35f69be3a7a 100644 (file)
@@ -83,6 +83,13 @@ struct dsl_pool;
  */
 #define        DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks"
 
+/*
+ * This field is present (with value=0) if this dataset may contain large
+ * blocks (>128KB).  If it is present, then this dataset
+ * is counted in the refcount of the SPA_FEATURE_LARGE_BLOCKS feature.
+ */
+#define        DS_FIELD_LARGE_BLOCKS "org.open-zfs:large_blocks"
+
 /*
  * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
  * name lookups should be performed case-insensitively.
@@ -138,6 +145,8 @@ typedef struct dsl_dataset {
        /* only used in syncing context, only valid for non-snapshots: */
        struct dsl_dataset *ds_prev;
        uint64_t ds_bookmarks;  /* DMU_OTN_ZAP_METADATA */
+       boolean_t ds_large_blocks;
+       boolean_t ds_need_large_blocks;
 
        /* has internal locking: */
        dsl_deadlist_t ds_deadlist;
@@ -252,6 +261,8 @@ int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
 int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last,
     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
 boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds);
+int dsl_dataset_activate_large_blocks(const char *dsname);
+void dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx);
 
 int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
 
include/sys/fs/zfs.h
index 87a91fef0e038a7b1db11db476078822607de9c6..4da144c724abcadd6091caf17b50a1184fbe0042 100644 (file)
@@ -200,6 +200,7 @@ typedef enum {
        ZPOOL_PROP_FREEING,
        ZPOOL_PROP_FRAGMENTATION,
        ZPOOL_PROP_LEAKED,
+       ZPOOL_PROP_MAXBLOCKSIZE,
        ZPOOL_PROP_TNAME,
        ZPOOL_NUM_PROPS
 } zpool_prop_t;
include/sys/spa.h
index 834ad005a1a74d035ba7a17c3fecd880db03da9e..5dc9084dad6b5a5479abe399072db0d300e6ce33 100644 (file)
@@ -98,17 +98,26 @@ _NOTE(CONSTCOND) } while (0)
 _NOTE(CONSTCOND) } while (0)
 
 /*
- * We currently support nine block sizes, from 512 bytes to 128K.
- * We could go higher, but the benefits are near-zero and the cost
- * of COWing a giant block to modify one byte would become excessive.
+ * We currently support block sizes from 512 bytes to 16MB.
+ * The benefits of larger blocks, and thus larger IO, need to be weighed
+ * against the cost of COWing a giant block to modify one byte, and the
+ * large latency of reading or writing a large block.
+ *
+ * Note that although blocks up to 16MB are supported, the recordsize
+ * property can not be set larger than zfs_max_recordsize (default 1MB).
+ * See the comment near zfs_max_recordsize in dsl_dataset.c for details.
+ *
+ * Note that although the LSIZE field of the blkptr_t can store sizes up
+ * to 32MB, the dnode's dn_datablkszsec can only store sizes up to
+ * 32MB - 512 bytes.  Therefore, we limit SPA_MAXBLOCKSIZE to 16MB.
  */
 #define        SPA_MINBLOCKSHIFT       9
-#define        SPA_MAXBLOCKSHIFT       17
+#define        SPA_OLD_MAXBLOCKSHIFT   17
+#define        SPA_MAXBLOCKSHIFT       24
 #define        SPA_MINBLOCKSIZE        (1ULL << SPA_MINBLOCKSHIFT)
+#define        SPA_OLD_MAXBLOCKSIZE    (1ULL << SPA_OLD_MAXBLOCKSHIFT)
 #define        SPA_MAXBLOCKSIZE        (1ULL << SPA_MAXBLOCKSHIFT)
 
-#define        SPA_BLOCKSIZES          (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
-
 /*
  * Size of block to hold the configuration data (a packed nvlist)
  */
@@ -830,6 +839,7 @@ extern boolean_t spa_has_slogs(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
 extern boolean_t spa_writeable(spa_t *spa);
 extern boolean_t spa_has_pending_synctask(spa_t *spa);
+extern int spa_maxblocksize(spa_t *spa);
 extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp);
 
 extern int spa_mode(spa_t *spa);
include/sys/zap_impl.h
index 528402f79fdb6d9c0b5e15c479bc2a8b29f99af3..bfd43e31da80043dfb93c0d23dc3da4117f22339 100644 (file)
@@ -42,8 +42,7 @@ extern int fzap_default_block_shift;
 
 #define        MZAP_ENT_LEN            64
 #define        MZAP_NAME_LEN           (MZAP_ENT_LEN - 8 - 4 - 2)
-#define        MZAP_MAX_BLKSHIFT       SPA_MAXBLOCKSHIFT
-#define        MZAP_MAX_BLKSZ          (1 << MZAP_MAX_BLKSHIFT)
+#define        MZAP_MAX_BLKSZ          SPA_OLD_MAXBLOCKSIZE
 
 #define        ZAP_NEED_CD             (-1U)
 
include/sys/zfs_ioctl.h
index 5cfdcc50fda4f840da74c92f997f0af71047a42c..c71ceb9c5a57b74b28ab5f2e4b36a2de9e7d2743 100644 (file)
@@ -96,13 +96,16 @@ typedef enum drr_headertype {
 /* flags #3 - #15 are reserved for incompatible closed-source implementations */
 #define        DMU_BACKUP_FEATURE_EMBED_DATA           (1<<16)
 #define        DMU_BACKUP_FEATURE_EMBED_DATA_LZ4       (1<<17)
+/* flag #18 is reserved for a Delphix feature */
+#define        DMU_BACKUP_FEATURE_LARGE_BLOCKS         (1<<19)
 
 /*
  * Mask of all supported backup features
  */
 #define        DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
     DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
-    DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)
+    DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \
+    DMU_BACKUP_FEATURE_LARGE_BLOCKS)
 
 /* Are all features in the given flag word currently supported? */
 #define        DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
include/sys/zfs_sa.h
index 735d4b32ad48f97e3816d66dcde850ca2d00d9d1..06c4d589aa791d1efef2c930b2e7abf3f44b8c80 100644 (file)
@@ -129,7 +129,7 @@ typedef struct znode_phys {
 #ifdef _KERNEL
 
 #define        DXATTR_MAX_ENTRY_SIZE   (32768)
-#define        DXATTR_MAX_SA_SIZE      (SPA_MAXBLOCKSIZE >> 1)
+#define        DXATTR_MAX_SA_SIZE      (SPA_OLD_MAXBLOCKSIZE >> 1)
 
 int zfs_sa_readlink(struct znode *, uio_t *);
 void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *);
include/sys/zfs_znode.h
index a6b82d574ffa7bb0a68cec73f87887226d430565..79ca4f7e9dae67844d453084dfc76be46e3510a7 100644 (file)
@@ -137,8 +137,6 @@ extern "C" {
 #define        ZFS_SHARES_DIR          "SHARES"
 #define        ZFS_SA_ATTRS            "SA_ATTRS"
 
-#define        ZFS_MAX_BLOCKSIZE       (SPA_MAXBLOCKSIZE)
-
 /*
  * Path component length
  *
include/sys/zil.h
index 9c806964d5dabc0cb4e87b4add0913b7bfe18874..362304135dacffae1b3997a310ce6de864f8beb9 100644 (file)
@@ -90,7 +90,6 @@ typedef struct zil_chain {
 } zil_chain_t;
 
 #define        ZIL_MIN_BLKSZ   4096ULL
-#define        ZIL_MAX_BLKSZ   SPA_MAXBLOCKSIZE
 
 /*
  * The words of a log block checksum.
include/sys/zil_impl.h
index 0db4b525cd2bc4ed7750b34ff15dd5a84fbe8ff3..0c426a15dd06e573746ab4765424160215f4d9a7 100644 (file)
@@ -140,7 +140,7 @@ typedef struct zil_bp_node {
        avl_node_t      zn_node;
 } zil_bp_node_t;
 
-#define        ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
+#define        ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
     sizeof (lr_write_t))
 
 #ifdef __cplusplus
include/zfeature_common.h
index e3215bebd81887daf9452f2cd7b4e79386755238..e383c4ff7887a7bb6b0d585e3fe47174d4379882 100644 (file)
@@ -49,6 +49,7 @@ typedef enum spa_feature {
        SPA_FEATURE_EMBEDDED_DATA,
        SPA_FEATURE_BOOKMARKS,
        SPA_FEATURE_FS_SS_LIMIT,
+       SPA_FEATURE_LARGE_BLOCKS,
        SPA_FEATURES
 } spa_feature_t;
 
lib/libzfs/libzfs_dataset.c
index 4087baf34b18bb75f1fca2707118e9205b4ab1f9..9f23d18015284af0048e95fb5673b1869275e64f 100644 (file)
@@ -1055,21 +1055,28 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,
                        break;
                }
 
-               case ZFS_PROP_RECORDSIZE:
                case ZFS_PROP_VOLBLOCKSIZE:
-                       /* must be power of two within SPA_{MIN,MAX}BLOCKSIZE */
+               case ZFS_PROP_RECORDSIZE:
+               {
+                       int maxbs = SPA_MAXBLOCKSIZE;
+                       if (zhp != NULL) {
+                               maxbs = zpool_get_prop_int(zhp->zpool_hdl,
+                                   ZPOOL_PROP_MAXBLOCKSIZE, NULL);
+                       }
+                       /*
+                        * The value must be a power of two between
+                        * SPA_MINBLOCKSIZE and maxbs.
+                        */
                        if (intval < SPA_MINBLOCKSIZE ||
-                           intval > SPA_MAXBLOCKSIZE || !ISP2(intval)) {
+                           intval > maxbs || !ISP2(intval)) {
                                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-                                   "'%s' must be power of 2 from %u "
-                                   "to %uk"), propname,
-                                   (uint_t)SPA_MINBLOCKSIZE,
-                                   (uint_t)SPA_MAXBLOCKSIZE >> 10);
+                                   "'%s' must be power of 2 from 512B "
+                                   "to %uKB"), propname, maxbs >> 10);
                                (void) zfs_error(hdl, EZFS_BADPROP, errbuf);
                                goto error;
                        }
                        break;
-
+               }
                case ZFS_PROP_MLSLABEL:
                {
 #ifdef HAVE_MLSLABEL
@@ -1446,7 +1453,8 @@ zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err,
                break;
 
        case ERANGE:
-               if (prop == ZFS_PROP_COMPRESSION) {
+               if (prop == ZFS_PROP_COMPRESSION ||
+                   prop == ZFS_PROP_RECORDSIZE) {
                        (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                            "property setting is not allowed on "
                            "bootable datasets"));
@@ -3212,9 +3220,7 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
                case EDOM:
                        zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                            "volume block size must be power of 2 from "
-                           "%u to %uk"),
-                           (uint_t)SPA_MINBLOCKSIZE,
-                           (uint_t)SPA_MAXBLOCKSIZE >> 10);
+                           "512B to %uKB"), zfs_max_recordsize >> 10);
 
                        return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 
lib/libzfs/libzfs_sendrecv.c
index 8911e0ed26cccb74d93807cdb8ac6db4f5a53dcf..cdc872caa2dbb40c9b3880925567c1af356deb05 100644 (file)
@@ -214,7 +214,7 @@ static void *
 cksummer(void *arg)
 {
        dedup_arg_t *dda = arg;
-       char *buf = malloc(1<<20);
+       char *buf = zfs_alloc(dda->dedup_hdl, SPA_MAXBLOCKSIZE);
        dmu_replay_record_t thedrr;
        dmu_replay_record_t *drr = &thedrr;
        struct drr_begin *drrb = &thedrr.drr_u.drr_begin;
@@ -279,9 +279,9 @@ cksummer(void *arg)
                            DMU_COMPOUNDSTREAM && drr->drr_payloadlen != 0) {
                                int sz = drr->drr_payloadlen;
 
-                               if (sz > 1<<20) {
-                                       free(buf);
-                                       buf = malloc(sz);
+                               if (sz > SPA_MAXBLOCKSIZE) {
+                                       buf = zfs_realloc(dda->dedup_hdl, buf,
+                                           SPA_MAXBLOCKSIZE, sz);
                                }
                                (void) ssread(buf, sz, ofp);
                                if (ferror(stdin))
@@ -834,7 +834,7 @@ typedef struct send_dump_data {
        char prevsnap[ZFS_MAXNAMELEN];
        uint64_t prevsnap_obj;
        boolean_t seenfrom, seento, replicate, doall, fromorigin;
-       boolean_t verbose, dryrun, parsable, progress, embed_data;
+       boolean_t verbose, dryrun, parsable, progress, embed_data, large_block;
        int outfd;
        boolean_t err;
        nvlist_t *fss;
@@ -1181,6 +1181,8 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
                }
 
                enum lzc_send_flags flags = 0;
+               if (sdd->large_block)
+                       flags |= LZC_SEND_FLAG_LARGE_BLOCK;
                if (sdd->embed_data)
                        flags |= LZC_SEND_FLAG_EMBED_DATA;
 
@@ -1529,6 +1531,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
        sdd.parsable = flags->parsable;
        sdd.progress = flags->progress;
        sdd.dryrun = flags->dryrun;
+       sdd.large_block = flags->largeblock;
        sdd.embed_data = flags->embed_data;
        sdd.filter_cb = filter_func;
        sdd.filter_cb_arg = cb_arg;
@@ -2564,7 +2567,7 @@ static int
 recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
 {
        dmu_replay_record_t *drr;
-       void *buf = malloc(1<<20);
+       void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE);
        char errbuf[1024];
 
        (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
lib/libzfs_core/libzfs_core.c
index 2198ecd6f24ba12fa7827ba0b6b341649ee20a1a..69df5579bf1bb83c9454659965de1d1cb34d6965 100644 (file)
@@ -455,6 +455,10 @@ lzc_get_holds(const char *snapname, nvlist_t **holdsp)
  *
  * "fd" is the file descriptor to write the send stream to.
  *
+ * If "flags" contains LZC_SEND_FLAG_LARGE_BLOCK, the stream is permitted
+ * to contain DRR_WRITE records with drr_length > 128K, and DRR_OBJECT
+ * records with drr_blksz > 128K.
+ *
  * If "flags" contains LZC_SEND_FLAG_EMBED_DATA, the stream is permitted
  * to contain DRR_WRITE_EMBEDDED records with drr_etype==BP_EMBEDDED_TYPE_DATA,
  * which the receiving system must support (as indicated by support
@@ -471,6 +475,8 @@ lzc_send(const char *snapname, const char *from, int fd,
        fnvlist_add_int32(args, "fd", fd);
        if (from != NULL)
                fnvlist_add_string(args, "fromsnap", from);
+       if (flags & LZC_SEND_FLAG_LARGE_BLOCK)
+               fnvlist_add_boolean(args, "largeblockok");
        if (flags & LZC_SEND_FLAG_EMBED_DATA)
                fnvlist_add_boolean(args, "embedok");
        err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL);
man/man5/zfs-module-parameters.5
index 4bbfbf51286c425d6dd655df572389f316d4a8bc..359e9f72f35ea3f554451007a81933f2ae968579 100644 (file)
@@ -945,6 +945,24 @@ Largest data block to write to zil
 Default value: \fB32,768\fR.
 .RE
 
+.sp
+.ne 2
+.na
+\fBzfs_max_recordsize\fR (int)
+.ad
+.RS 12n
+We currently support block sizes from 512 bytes to 16MB.  The benefits of
+larger blocks, and thus larger IO, need to be weighed against the cost of
+COWing a giant block to modify one byte.  Additionally, very large blocks
+can have an impact on i/o latency, and also potentially on the memory
+allocator.  Therefore, we do not allow the recordsize to be set larger than
+zfs_max_recordsize (default 1MB).  Larger blocks can be created by changing
+this tunable, and pools with larger blocks can always be imported and used,
+regardless of this setting.
+.sp
+Default value: \fB1,048,576\fR.
+.RE
+
 .sp
 .ne 2
 .na
man/man5/zpool-features.5
index 27034264bc150233d31dce136c022802dd1abb86..a1dac4292ad1699d0d8fec8c563cf0d841621159 100644 (file)
@@ -411,5 +411,26 @@ never return to being \fBenabled\fR.
 
 .RE
 
+.sp
+.ne 2
+.na
+\fB\fBlarge_blocks\fR\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID   org.open-zfs:large_block
+READ\-ONLY COMPATIBLE  no
+DEPENDENCIES   extensible_dataset
+.TE
+
+The \fBlarge_block\fR feature allows the record size on a dataset to be
+set larger than 128KB.
+
+This feature becomes \fBactive\fR once a \fBrecordsize\fR property has been
+set larger than 128KB, and will return to being \fBenabled\fR once all
+filesystems that have ever had their recordsize larger than 128KB are destroyed.
+.RE
+
 .SH "SEE ALSO"
 \fBzpool\fR(8)
man/man8/zfs.8
index 81fe726cd6436a86f6dafc5a9b06ec2613f95d5a..f926f174ae5c126cc38e30f1d53087603ad6a34e 100644 (file)
@@ -174,12 +174,12 @@ zfs \- configures ZFS file systems
 
 .LP
 .nf
-\fBzfs\fR \fBsend\fR [\fB-DnPpRve\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR
+\fBzfs\fR \fBsend\fR [\fB-DnPpRveL\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR
 .fi
 
 .LP
 .nf
-\fBzfs\fR \fBsend\fR [\fB-e\fR] [\fB-i \fIsnapshot\fR|\fIbookmark\fR]\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
+\fBzfs\fR \fBsend\fR [\fB-eL\fR] [\fB-i \fIsnapshot\fR|\fIbookmark\fR]\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
 .fi
 
 .LP
@@ -2706,7 +2706,7 @@ See \fBzpool-features\fR(5) for details on ZFS feature flags and the
 .sp
 .ne 2
 .na
-\fBzfs send\fR [\fB-DnPpRve\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR
+\fBzfs send\fR [\fB-DnPpRveL\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR
 .ad
 .sp .6
 .RS 4n
@@ -2759,6 +2759,22 @@ If the \fB-i\fR or \fB-I\fR flags are used in conjunction with the \fB-R\fR flag
 Generate a deduplicated stream. Blocks which would have been sent multiple times in the send stream will only be sent once. The receiving system must also support this feature to receive a deduplicated stream.  This flag can be used regardless of the dataset's dedup  property, but performance will be much better if the filesystem uses a dedup-capable checksum (eg.  sha256).
 .RE
 
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-L\fR\fR
+.ad
+.sp .6
+.RS 4n
+Generate a stream which may contain blocks larger than 128KB.  This flag
+has no effect if the \fBlarge_blocks\fR pool feature is disabled, or if
+the \fRrecordsize\fR property of this filesystem has never been set above
+128KB.  The receiving system must have the \fBlarge_blocks\fR pool feature
+enabled as well.  See \fBzpool-features\fR(5) for details on ZFS feature
+flags and the \fBlarge_blocks\fR feature.
+.RE
+
 .sp
 .ne 2
 .mk
@@ -2828,7 +2844,7 @@ The format of the stream is committed. You will be able to receive your streams
 .sp
 .ne 2
 .na
-\fBzfs send\fR [\fB-e\fR] [\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
+\fBzfs send\fR [\fB-eL\fR] [\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
 .ad
 .sp .6
 .RS 4n
@@ -2856,6 +2872,22 @@ be the origin snapshot, or an earlier snapshot in the origin's filesystem,
 or the origin's origin, etc.
 .RE
 
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-L\fR\fR
+.ad
+.sp .6
+.RS 4n
+Generate a stream which may contain blocks larger than 128KB.  This flag
+has no effect if the \fBlarge_blocks\fR pool feature is disabled, or if
+the \fRrecordsize\fR property of this filesystem has never been set above
+128KB.  The receiving system must have the \fBlarge_blocks\fR pool feature
+enabled as well.  See \fBzpool-features\fR(5) for details on ZFS feature
+flags and the \fBlarge_blocks\fR feature.
+.RE
+
 .sp
 .ne 2
 .mk
@@ -2909,7 +2941,6 @@ The \fB-d\fR and \fB-e\fR options cause the file system name of the target snaps
 Discard the first element of the sent snapshot's file system name, using the remaining elements to determine the name of the target file system for the new snapshot as described in the paragraph above.
 .RE
 
-
 .sp
 .ne 2
 .na
module/zcommon/zfs_prop.c
index 70f1d93dfc9880b0eccb21838d81e0d8b13e7183..aaebab444cfa409432f41117643ddc962415af5e 100644 (file)
@@ -426,8 +426,8 @@ zfs_prop_init(void)
 
        /* inherit number properties */
        zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
-           SPA_MAXBLOCKSIZE, PROP_INHERIT,
-           ZFS_TYPE_FILESYSTEM, "512 to 128k, power of 2", "RECSIZE");
+           SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT,
+           ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE");
 
        /* hidden properties */
        zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,
module/zcommon/zpool_prop.c
index e5f69c8152d46a9d32ad6d5489a1ae6d2a934b93..910c56dcc2a9226851e9f877e6aaba2a3d01a6b6 100644 (file)
@@ -131,6 +131,8 @@ zpool_prop_init(void)
        /* hidden properties */
        zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
            PROP_READONLY, ZFS_TYPE_POOL, "NAME");
+       zprop_register_hidden(ZPOOL_PROP_MAXBLOCKSIZE, "maxblocksize",
+           PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE");
        zprop_register_hidden(ZPOOL_PROP_TNAME, "tname", PROP_TYPE_STRING,
            PROP_ONETIME, ZFS_TYPE_POOL, "TNAME");
 }
module/zfs/arc.c
index 6975ada62a555f8e774b88bba8940060f8132dc3..1699ea7e7e7a29081f5f625103e8be573bfd94a0 100644 (file)
@@ -1329,7 +1329,7 @@ arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, arc_buf_contents_t type)
        arc_buf_hdr_t *hdr;
        arc_buf_t *buf;
 
-       VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+       VERIFY3U(size, <=, spa_maxblocksize(spa));
        hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
        ASSERT(BUF_EMPTY(hdr));
        hdr->b_size = size;
@@ -3289,7 +3289,7 @@ top:
                 * Gracefully handle a damaged logical block size as a
                 * checksum error by passing a dummy zio to the done callback.
                 */
-               if (size > SPA_MAXBLOCKSIZE) {
+               if (size > spa_maxblocksize(spa)) {
                        if (done) {
                                rzio = zio_null(pio, spa, NULL,
                                    NULL, NULL, zio_flags);
module/zfs/bpobj.c
index 25767e83fed092ed36d578491699a755e3e7169a..ebfdc2e7a20e853016db06ee1ad1c1764fa0fba0 100644 (file)
@@ -43,7 +43,7 @@ bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
                if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
                        ASSERT0(dp->dp_empty_bpobj);
                        dp->dp_empty_bpobj =
-                           bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx);
+                           bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
                        VERIFY(zap_add(os,
                            DMU_POOL_DIRECTORY_OBJECT,
                            DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
@@ -399,7 +399,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
        dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
        if (bpo->bpo_phys->bpo_subobjs == 0) {
                bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
-                   DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
+                   DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
+                   DMU_OT_NONE, 0, tx);
        }
 
        ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi));
module/zfs/bptree.c
index d6ea9d7c645143be5425e50ad81a6ce523621c05..9f62d7b911f361ea2397bd9f96c4a756b883d6c5 100644 (file)
@@ -65,7 +65,7 @@ bptree_alloc(objset_t *os, dmu_tx_t *tx)
        bptree_phys_t *bt;
 
        obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
-           SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
+           SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
            sizeof (bptree_phys_t), tx);
 
        /*
module/zfs/dbuf.c
index cd74ce3e86140e42c6c933c032f9e263def425de..7d8adcd7356b760553ec56339661a7754e99e36c 100644 (file)
@@ -2216,10 +2216,8 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
                return (SET_ERROR(ENOTSUP));
        if (blksz == 0)
                blksz = SPA_MINBLOCKSIZE;
-       if (blksz > SPA_MAXBLOCKSIZE)
-               blksz = SPA_MAXBLOCKSIZE;
-       else
-               blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
+       ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
+       blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
 
        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
module/zfs/dmu_objset.c
index 6d42bf8a28b74493ac7f1c9cd9ab298a1b6ddf4f..ae4e1dd21b82f6f5111bb4e6990aea257a42d07f 100644 (file)
@@ -256,6 +256,14 @@ logbias_changed_cb(void *arg, uint64_t newval)
                zil_set_logbias(os->os_zil, newval);
 }
 
+static void
+recordsize_changed_cb(void *arg, uint64_t newval)
+{
+       objset_t *os = arg;
+
+       os->os_recordsize = newval;
+}
+
 void
 dmu_objset_byteswap(void *buf, size_t size)
 {
@@ -385,6 +393,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
                                    ZFS_PROP_REDUNDANT_METADATA),
                                    redundant_metadata_changed_cb, os);
                        }
+                       if (err == 0) {
+                               err = dsl_prop_register(ds,
+                                   zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+                                   recordsize_changed_cb, os);
+                       }
                }
                if (err != 0) {
                        VERIFY(arc_buf_remove_ref(os->os_phys_buf,
@@ -660,6 +673,9 @@ dmu_objset_evict(objset_t *os)
                        VERIFY0(dsl_prop_unregister(ds,
                            zfs_prop_to_name(ZFS_PROP_REDUNDANT_METADATA),
                            redundant_metadata_changed_cb, os));
+                       VERIFY0(dsl_prop_unregister(ds,
+                           zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+                           recordsize_changed_cb, os));
                }
                VERIFY0(dsl_prop_unregister(ds,
                    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
module/zfs/dmu_send.c
index c1cac2e6762be2f8fe0f5fc82b7226e464080c56..e26d344d6d63db1d90bb8110979d4f8ef4c4d85f 100644 (file)
@@ -234,11 +234,12 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
        drrw->drr_offset = offset;
        drrw->drr_length = blksz;
        drrw->drr_toguid = dsp->dsa_toguid;
-       if (BP_IS_EMBEDDED(bp)) {
+       if (bp == NULL || BP_IS_EMBEDDED(bp)) {
                /*
-                * There's no pre-computed checksum of embedded BP's, so
-                * (like fletcher4-checkummed blocks) userland will have
-                * to compute a dedup-capable checksum itself.
+                * There's no pre-computed checksum for partial-block
+                * writes or embedded BP's, so (like
+                * fletcher4-checkummed blocks) userland will have to
+                * compute a dedup-capable checksum itself.
                 */
                drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
        } else {
@@ -400,6 +401,10 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
        drro->drr_compress = dnp->dn_compress;
        drro->drr_toguid = dsp->dsa_toguid;
 
+       if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+           drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
+               drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
+
        if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
                return (SET_ERROR(EINTR));
 
@@ -517,6 +522,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
                    zb->zb_blkid * blksz, blksz, bp);
        } else { /* it's a level-0 block of a regular object */
                uint32_t aflags = ARC_WAIT;
+               uint64_t offset;
                arc_buf_t *abuf;
                int blksz = BP_GET_LSIZE(bp);
 
@@ -539,8 +545,24 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
                        }
                }
 
-               err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
-                   blksz, bp, abuf->b_data);
+               offset = zb->zb_blkid * blksz;
+
+               if (!(dsp->dsa_featureflags &
+                   DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+                   blksz > SPA_OLD_MAXBLOCKSIZE) {
+                       char *buf = abuf->b_data;
+                       while (blksz > 0 && err == 0) {
+                               int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
+                               err = dump_write(dsp, type, zb->zb_object,
+                                   offset, n, NULL, buf);
+                               offset += n;
+                               buf += n;
+                               blksz -= n;
+                       }
+               } else {
+                       err = dump_write(dsp, type, zb->zb_object,
+                           offset, blksz, bp, abuf->b_data);
+               }
                (void) arc_buf_remove_ref(abuf, &abuf);
        }
 
@@ -554,7 +576,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 static int
 dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
     zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
-    int outfd, vnode_t *vp, offset_t *off)
+    boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
 {
        objset_t *os;
        dmu_replay_record_t *drr;
@@ -589,6 +611,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
        }
 #endif
 
+       if (large_block_ok && ds->ds_large_blocks)
+               featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
        if (embedok &&
            spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
                featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
@@ -684,7 +708,8 @@ out:
 
 int
 dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
-    boolean_t embedok, int outfd, vnode_t *vp, offset_t *off)
+    boolean_t embedok, boolean_t large_block_ok,
+    int outfd, vnode_t *vp, offset_t *off)
 {
        dsl_pool_t *dp;
        dsl_dataset_t *ds;
@@ -719,18 +744,19 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
                zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
                is_clone = (fromds->ds_dir != ds->ds_dir);
                dsl_dataset_rele(fromds, FTAG);
-               err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
-                   outfd, vp, off);
+               err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+                   embedok, large_block_ok, outfd, vp, off);
        } else {
-               err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
-                   outfd, vp, off);
+               err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+                   embedok, large_block_ok, outfd, vp, off);
        }
        dsl_dataset_rele(ds, FTAG);
        return (err);
 }
 
 int
-dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+dmu_send(const char *tosnap, const char *fromsnap,
+    boolean_t embedok, boolean_t large_block_ok,
     int outfd, vnode_t *vp, offset_t *off)
 {
        dsl_pool_t *dp;
@@ -797,11 +823,11 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
                        dsl_pool_rele(dp, FTAG);
                        return (err);
                }
-               err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
-                   outfd, vp, off);
+               err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+                   embedok, large_block_ok, outfd, vp, off);
        } else {
-               err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
-                   outfd, vp, off);
+               err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+                   embedok, large_block_ok, outfd, vp, off);
        }
        if (owned)
                dsl_dataset_disown(ds, FTAG);
@@ -1000,6 +1026,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
            !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
                return (SET_ERROR(ENOTSUP));
 
+       /*
+        * The receiving code doesn't know how to translate large blocks
+        * to smaller ones, so the pool must have the LARGE_BLOCKS
+        * feature enabled if the stream has LARGE_BLOCKS.
+        */
+       if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+           !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+               return (SET_ERROR(ENOTSUP));
+
        error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
        if (error == 0) {
                /* target fs already exists; recv into temp clone */
@@ -1125,6 +1160,13 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
        }
        VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
 
+       if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+           DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+           !newds->ds_large_blocks) {
+               dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+               newds->ds_large_blocks = B_TRUE;
+       }
+
        dmu_buf_will_dirty(newds->ds_dbuf, tx);
        dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
 
@@ -1250,6 +1292,7 @@ restore_read(struct restorearg *ra, int len, char *buf)
 
        /* some things will require 8-byte alignment, so everything must */
        ASSERT0(len % 8);
+       ASSERT3U(len, <=, ra->bufsize);
 
        while (done < len) {
                ssize_t resid;
@@ -1391,7 +1434,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
            drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
            P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
            drro->drr_blksz < SPA_MINBLOCKSIZE ||
-           drro->drr_blksz > SPA_MAXBLOCKSIZE ||
+           drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(os)) ||
            drro->drr_bonuslen > DN_MAX_BONUSLEN) {
                return (SET_ERROR(EINVAL));
        }
@@ -1665,7 +1708,7 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
        int err;
 
        if (drrs->drr_length < SPA_MINBLOCKSIZE ||
-           drrs->drr_length > SPA_MAXBLOCKSIZE)
+           drrs->drr_length > spa_maxblocksize(dmu_objset_spa(os)))
                return (SET_ERROR(EINVAL));
 
        data = restore_read(ra, drrs->drr_length, NULL);
@@ -1752,7 +1795,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
        ra.cksum = drc->drc_cksum;
        ra.vp = vp;
        ra.voff = *voffp;
-       ra.bufsize = 1<<20;
+       ra.bufsize = SPA_MAXBLOCKSIZE;
        ra.buf = vmem_alloc(ra.bufsize, KM_SLEEP);
 
        /* these were verified in dmu_recv_begin */
module/zfs/dmu_tx.c
index 89f45a78135618ddd3161aaf71eb2dc51894a7eb..62a8d471ec9e9d30ddf9c66c9bff0c8e410aaee4 100644 (file)
@@ -241,7 +241,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
                return;
 
        min_bs = SPA_MINBLOCKSHIFT;
-       max_bs = SPA_MAXBLOCKSHIFT;
+       max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
        min_ibs = DN_MIN_INDBLKSHIFT;
        max_ibs = DN_MAX_INDBLKSHIFT;
 
@@ -310,6 +310,14 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
                         */
                        ASSERT(dn->dn_datablkshift != 0);
                        min_bs = max_bs = dn->dn_datablkshift;
+               } else {
+                       /*
+                        * The blocksize can increase up to the recordsize,
+                        * or if it is already more than the recordsize,
+                        * up to the next power of 2.
+                        */
+                       min_bs = highbit64(dn->dn_datablksz - 1);
+                       max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1));
                }
 
                /*
@@ -745,11 +753,11 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
                bp = &dn->dn_phys->dn_blkptr[0];
                if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
                    bp, bp->blk_birth))
-                       txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+                       txh->txh_space_tooverwrite += MZAP_MAX_BLKSZ;
                else
-                       txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+                       txh->txh_space_towrite += MZAP_MAX_BLKSZ;
                if (!BP_IS_HOLE(bp))
-                       txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+                       txh->txh_space_tounref += MZAP_MAX_BLKSZ;
                return;
        }
 
@@ -1546,18 +1554,18 @@ dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
 
        /* If blkptr doesn't exist then add space to towrite */
        if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
-               txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+               txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
        } else {
                blkptr_t *bp;
 
                bp = &dn->dn_phys->dn_spill;
                if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
                    bp, bp->blk_birth))
-                       txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+                       txh->txh_space_tooverwrite += SPA_OLD_MAXBLOCKSIZE;
                else
-                       txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+                       txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
                if (!BP_IS_HOLE(bp))
-                       txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+                       txh->txh_space_tounref += SPA_OLD_MAXBLOCKSIZE;
        }
 }
 
module/zfs/dnode.c
index 2358849319ab92abc897e763fe94091ce87388fa..e1ea165aab0db08df83161120be95ce0762182c1 100644 (file)
@@ -540,10 +540,10 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 {
        int i;
 
+       ASSERT3U(blocksize, <=,
+           spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
        if (blocksize == 0)
                blocksize = 1 << zfs_default_bs;
-       else if (blocksize > SPA_MAXBLOCKSIZE)
-               blocksize = SPA_MAXBLOCKSIZE;
        else
                blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
 
@@ -624,7 +624,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
        int nblkptr;
 
        ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
-       ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
+       ASSERT3U(blocksize, <=,
+           spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
        ASSERT0(blocksize % SPA_MINBLOCKSIZE);
        ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
        ASSERT(tx->tx_txg != 0);
@@ -1377,10 +1378,9 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
        dmu_buf_impl_t *db;
        int err;
 
+       ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
        if (size == 0)
                size = SPA_MINBLOCKSIZE;
-       if (size > SPA_MAXBLOCKSIZE)
-               size = SPA_MAXBLOCKSIZE;
        else
                size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
 
module/zfs/dsl_dataset.c
index b444fca64a684e602e4d68fca111af122f750a49..9a66c6552133b831730569566534a54b62b117e2 100644 (file)
 #include <sys/dsl_userhold.h>
 #include <sys/dsl_bookmark.h>
 
+/*
+ * The SPA supports block sizes up to 16MB.  However, very large blocks
+ * can have an impact on i/o latency (e.g. tying up a spinning disk for
+ * ~300ms), and also potentially on the memory allocator.  Therefore,
+ * we do not allow the recordsize to be set larger than zfs_max_recordsize
+ * (default 1MB).  Larger blocks can be created by changing this tunable,
+ * and pools with larger blocks can always be imported and used, regardless
+ * of this setting.
+ */
+int zfs_max_recordsize = 1 * 1024 * 1024;
+
 #define        SWITCH64(x, y) \
        { \
                uint64_t __tmp = (x); \
@@ -60,8 +71,6 @@
 
 #define        DS_REF_MAX      (1ULL << 62)
 
-#define        DSL_DEADLIST_BLOCKSIZE  SPA_MAXBLOCKSIZE
-
 extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
 
 /*
@@ -117,6 +126,8 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
        dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
        dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
        dsl_dataset_phys(ds)->ds_unique_bytes += used;
+       if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE)
+               ds->ds_need_large_blocks = B_TRUE;
        mutex_exit(&ds->ds_lock);
        dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
            compressed, uncompressed, tx);
@@ -414,6 +425,14 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
                list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
                    offsetof(dmu_sendarg_t, dsa_link));
 
+               if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
+                       err = zap_contains(mos, dsobj, DS_FIELD_LARGE_BLOCKS);
+                       if (err == 0)
+                               ds->ds_large_blocks = B_TRUE;
+                       else
+                               ASSERT3U(err, ==, ENOENT);
+               }
+
                if (err == 0) {
                        err = dsl_dir_hold_obj(dp,
                            dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds,
@@ -730,6 +749,9 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
                dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
                    (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
 
+               if (origin->ds_large_blocks)
+                       dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+
                dmu_buf_will_dirty(origin->ds_dbuf, tx);
                dsl_dataset_phys(origin)->ds_num_children++;
 
@@ -1253,6 +1275,9 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
        dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
        dmu_buf_rele(dbuf, FTAG);
 
+       if (ds->ds_large_blocks)
+               dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+
        ASSERT3U(ds->ds_prev != 0, ==,
            dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
        if (ds->ds_prev) {
@@ -1541,6 +1566,11 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
        dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
 
        dmu_objset_sync(ds->ds_objset, zio, tx);
+
+       if (ds->ds_need_large_blocks && !ds->ds_large_blocks) {
+               dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
+               ds->ds_large_blocks = B_TRUE;
+       }
 }
 
 static void
@@ -3222,6 +3252,77 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
        return (err);
 }
 
+static int
+dsl_dataset_activate_large_blocks_check(void *arg, dmu_tx_t *tx)
+{
+       const char *dsname = arg;
+       dsl_dataset_t *ds;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       int error = 0;
+
+       if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+               return (SET_ERROR(ENOTSUP));
+
+       ASSERT(spa_feature_is_enabled(dp->dp_spa,
+           SPA_FEATURE_EXTENSIBLE_DATASET));
+
+       error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+       if (error != 0)
+               return (error);
+
+       if (ds->ds_large_blocks)
+               error = EALREADY;
+       dsl_dataset_rele(ds, FTAG);
+
+       return (error);
+}
+
+void
+dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx)
+{
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+       objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+       uint64_t zero = 0;
+
+       spa_feature_incr(spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+       dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+
+       VERIFY0(zap_add(mos, dsobj, DS_FIELD_LARGE_BLOCKS,
+           sizeof (zero), 1, &zero, tx));
+}
+
+static void
+dsl_dataset_activate_large_blocks_sync(void *arg, dmu_tx_t *tx)
+{
+       const char *dsname = arg;
+       dsl_dataset_t *ds;
+
+       VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), dsname, FTAG, &ds));
+
+       dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx);
+       ASSERT(!ds->ds_large_blocks);
+       ds->ds_large_blocks = B_TRUE;
+       dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_activate_large_blocks(const char *dsname)
+{
+       int error;
+
+       error = dsl_sync_task(dsname,
+           dsl_dataset_activate_large_blocks_check,
+           dsl_dataset_activate_large_blocks_sync, (void *)dsname,
+           1, ZFS_SPACE_CHECK_RESERVED);
+
+       /*
+        * EALREADY indicates that this dataset already supports large blocks.
+        */
+       if (error == EALREADY)
+               error = 0;
+       return (error);
+}
+
 /*
  * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
  * For example, they could both be snapshots of the same filesystem, and
@@ -3275,6 +3376,15 @@ dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
 }
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
+#if defined(_LP64)
+module_param(zfs_max_recordsize, int, 0644);
+MODULE_PARM_DESC(zfs_max_recordsize, "Max allowed record size");
+#else
+/* Limited to 1M on 32-bit platforms due to lack of virtual address space */
+module_param(zfs_max_recordsize, int, 0444);
+MODULE_PARM_DESC(zfs_max_recordsize, "Max allowed record size");
+#endif
+
 EXPORT_SYMBOL(dsl_dataset_hold);
 EXPORT_SYMBOL(dsl_dataset_hold_obj);
 EXPORT_SYMBOL(dsl_dataset_own);
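
The large-block activation above follows the standard two-phase dsl_sync_task()
shape: a check callback that returns EALREADY once the per-dataset flag is set,
and a sync callback that bumps the pool-wide feature refcount and records
DS_FIELD_LARGE_BLOCKS in the dataset's ZAP.  A minimal standalone model of that
idempotence contract (activate_check/activate_sync are invented names for
illustration; the real work runs inside dsl_sync_task() with the pool held):

    #include <stdio.h>
    #include <errno.h>

    static int flag_active = 0;             /* stands in for ds_large_blocks */

    static int
    activate_check(void)
    {
            return (flag_active ? EALREADY : 0);
    }

    static void
    activate_sync(void)
    {
            flag_active = 1;                /* feature refcount bumped here */
    }

    static int
    activate(void)
    {
            int error = activate_check();

            if (error == 0)
                    activate_sync();
            if (error == EALREADY)          /* already active is not an error */
                    error = 0;
            return (error);
    }

    int
    main(void)
    {
            printf("first call:  %d\n", activate());    /* activates, prints 0 */
            printf("second call: %d\n", activate());    /* EALREADY mapped to 0 */
            return (0);
    }

The same dsl_dataset_activate_large_blocks_sync_impl() is reused when cloning or
snapshotting a dataset that already carries the flag, and dsl_destroy (below)
decrements the feature refcount once for every such dataset or snapshot, so the
refcount tracks how many live datasets have large blocks activated.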
index 55810f5c30247f44be9a6c0002df47ab4cb4a7f9..8da77ebd7b6e12208e1db7a3ecbf378315efc298 100644 (file)
@@ -148,7 +148,7 @@ uint64_t
 dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
 {
        if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
-               return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx));
+               return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
        return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
            sizeof (dsl_deadlist_phys_t), tx));
 }
@@ -185,7 +185,7 @@ dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
 {
        if (dle->dle_bpobj.bpo_object ==
            dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
-               uint64_t obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+               uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
                bpobj_close(&dle->dle_bpobj);
                bpobj_decr_empty(dl->dl_os, tx);
                VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
@@ -259,7 +259,7 @@ dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
 
        dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
        dle->dle_mintxg = mintxg;
-       obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+       obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
        VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
        avl_add(&dl->dl_tree, dle);
 
@@ -344,7 +344,7 @@ dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
                if (dle->dle_mintxg >= maxtxg)
                        break;
 
-               obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+               obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
                VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
                    dle->dle_mintxg, obj, tx));
        }
index 4623f5dd5b9dad47b91a92bfccd4e585a2eb481c..0e2238f99e5176862b74e0b5ff5347f7c0cd7c06 100644 (file)
@@ -277,6 +277,10 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
 
        obj = ds->ds_object;
 
+       if (ds->ds_large_blocks) {
+               ASSERT0(zap_contains(mos, obj, DS_FIELD_LARGE_BLOCKS));
+               spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+       }
        if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
                ASSERT3P(ds->ds_prev, ==, NULL);
                VERIFY0(dsl_dataset_hold_obj(dp,
@@ -738,6 +742,9 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
                ASSERT0(ds->ds_reserved);
        }
 
+       if (ds->ds_large_blocks)
+               spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx);
+
        dsl_scan_ds_destroyed(ds, tx);
 
        obj = ds->ds_object;
index fe1a4d8b771aa4d26167561c01dabbccddf277b0..13961918e68d8b577631ddfaf0342c07fa5d40da 100644 (file)
@@ -372,7 +372,7 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
                    FREE_DIR_NAME, &dp->dp_free_dir));
 
                /* create and open the free_bplist */
-               obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
+               obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
                VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
                    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
                VERIFY0(bpobj_open(&dp->dp_free_bpobj,
@@ -804,7 +804,7 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
         * subobj support.  So call dmu_object_alloc() directly.
         */
        obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
-           SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
+           SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
        VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
            DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
        VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
index 8e857e12930a9c04d0c0d575e398f57ba1658e63..a3fcea87b16a9b55b5f2ef683fea022bf784d3c4 100644 (file)
@@ -504,7 +504,7 @@ sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx)
 
        if (size == 0) {
                blocksize = SPA_MINBLOCKSIZE;
-       } else if (size > SPA_MAXBLOCKSIZE) {
+       } else if (size > SPA_OLD_MAXBLOCKSIZE) {
                ASSERT(0);
                return (SET_ERROR(EFBIG));
        } else {
@@ -693,7 +693,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
        hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
            SA_BONUS, &spill_idx, &used, &spilling);
 
-       if (used > SPA_MAXBLOCKSIZE)
+       if (used > SPA_OLD_MAXBLOCKSIZE)
                return (SET_ERROR(EFBIG));
 
        VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
@@ -717,7 +717,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
                    attr_count - spill_idx, hdl->sa_spill, SA_SPILL, &i,
                    &spill_used, &dummy);
 
-               if (spill_used > SPA_MAXBLOCKSIZE)
+               if (spill_used > SPA_OLD_MAXBLOCKSIZE)
                        return (SET_ERROR(EFBIG));
 
                if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) >
index 3312c301cb9a5c5c1282ee3d05e607c1d26a5900..2a9ef9ce53e429845d5f98e9b619c19e1bd4e168 100644 (file)
@@ -266,6 +266,14 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
                spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
                    0, ZPROP_SRC_LOCAL);
 
+       if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
+               spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+                   MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
+       } else {
+               spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+                   SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
+       }
+
        if ((dp = list_head(&spa->spa_config_list)) != NULL) {
                if (dp->scd_path == NULL) {
                        spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
@@ -482,7 +490,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
 
                        if (!error) {
                                objset_t *os;
-                               uint64_t compress;
+                               uint64_t propval;
 
                                if (strval == NULL || strval[0] == '\0') {
                                        objnum = zpool_prop_default_numeric(
@@ -494,15 +502,25 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
                                if (error)
                                        break;
 
-                               /* Must be ZPL and not gzip compressed. */
+                               /*
+                                * Must be ZPL, and its property settings
+                                * must be supported by GRUB (compression
+                                * is not gzip, and large blocks are not used).
+                                */
 
                                if (dmu_objset_type(os) != DMU_OST_ZFS) {
                                        error = SET_ERROR(ENOTSUP);
                                } else if ((error =
                                    dsl_prop_get_int_ds(dmu_objset_ds(os),
                                    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
-                                   &compress)) == 0 &&
-                                   !BOOTFS_COMPRESS_VALID(compress)) {
+                                   &propval)) == 0 &&
+                                   !BOOTFS_COMPRESS_VALID(propval)) {
+                                       error = SET_ERROR(ENOTSUP);
+                               } else if ((error =
+                                   dsl_prop_get_int_ds(dmu_objset_ds(os),
+                                   zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+                                   &propval)) == 0 &&
+                                   propval > SPA_OLD_MAXBLOCKSIZE) {
                                        error = SET_ERROR(ENOTSUP);
                                } else {
                                        objnum = dmu_objset_id(os);
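
Two spa.c changes worth calling out: the pool now reports a read-only
maxblocksize value, computed as MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE) when
the large_blocks feature is enabled and SPA_OLD_MAXBLOCKSIZE otherwise, so the
advertised limit follows the module tunable rather than always claiming the 16M
on-disk maximum; and the bootfs validation now checks recordsize alongside
compression, rejecting any candidate dataset whose recordsize exceeds the old
128K limit because GRUB cannot read larger blocks.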
index 1041c8f572f23f72f9bafeb48a54990718da190f..01aa4641e63fe50483fc949973e5ccbf3d2d785f 100644 (file)
@@ -89,7 +89,7 @@ spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
 
        ASSERT(spa->spa_history == 0);
        spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
-           SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
+           SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
            sizeof (spa_history_phys_t), tx);
 
        VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
index 8e5fa683ef424402249938b4fcf99a9f6b484879..b46b2d0a4ba32f16a1badd2430ba00b6acc45edf 100644 (file)
@@ -1985,6 +1985,15 @@ spa_debug_enabled(spa_t *spa)
        return (spa->spa_debug);
 }
 
+int
+spa_maxblocksize(spa_t *spa)
+{
+       if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
+               return (SPA_MAXBLOCKSIZE);
+       else
+               return (SPA_OLD_MAXBLOCKSIZE);
+}
+
 #if defined(_KERNEL) && defined(HAVE_SPL)
 /* Namespace manipulation */
 EXPORT_SYMBOL(spa_lookup);
@@ -2040,6 +2049,7 @@ EXPORT_SYMBOL(spa_suspended);
 EXPORT_SYMBOL(spa_bootfs);
 EXPORT_SYMBOL(spa_delegation);
 EXPORT_SYMBOL(spa_meta_objset);
+EXPORT_SYMBOL(spa_maxblocksize);
 
 /* Miscellaneous support routines */
 EXPORT_SYMBOL(spa_rename);
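
spa_maxblocksize() gives callers one place to ask how large a block the pool may
use: 16M once the large_blocks feature is enabled, the legacy 128K cap
otherwise.  A hedged standalone illustration of how a validation path is
expected to consult it (spa_maxblocksize_demo and the feature flag are mocked
here; this is not the kernel code):

    #include <stdio.h>

    #define SPA_OLD_MAXBLOCKSIZE    (128 * 1024)            /* legacy 128K cap */
    #define SPA_MAXBLOCKSIZE        (16 * 1024 * 1024)      /* 16M on-disk limit */

    static int large_blocks_enabled = 0;    /* spa_feature_is_enabled() stand-in */

    static int
    spa_maxblocksize_demo(void)
    {
            return (large_blocks_enabled ? SPA_MAXBLOCKSIZE : SPA_OLD_MAXBLOCKSIZE);
    }

    int
    main(void)
    {
            int requested = 1024 * 1024;            /* e.g. a 1M block */

            if (requested > spa_maxblocksize_demo())
                    printf("reject %d: pool limited to %d bytes\n",
                        requested, spa_maxblocksize_demo());
            else
                    printf("accept %d byte blocks\n", requested);
            return (0);
    }

With the feature flag left at 0 this rejects the 1M request; flip it to 1 and
the same request is accepted.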
index 52198261e434ec74510453a3c7b7f32191407ab9..fe66319256627925ed02aaaa08368b7963980968 100644 (file)
@@ -847,9 +847,9 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 
        /*
         * Compute the raidz-deflation ratio.  Note, we hard-code
-        * in 128k (1 << 17) because it is the current "typical" blocksize.
-        * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
-        * or we will inconsistently account for existing bp's.
+        * in 128k (1 << 17) because it is the "typical" blocksize.
+        * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
+        * otherwise it would inconsistently account for existing bp's.
         */
        vd->vdev_deflate_ratio = (1 << 17) /
            (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
index db13b7bc479a79daa8bd859de3653f5b69ebeb29..0196f3945e4acdc82284b8cbb323c4004843c6e9 100644 (file)
@@ -552,9 +552,9 @@ retry:
                        goto retry;
                }
 
-               dr->dr_bio[i] = bio_alloc(GFP_NOIO,
-                   bio_nr_pages(bio_ptr, bio_size));
                /* bio_alloc() with __GFP_WAIT never returns NULL */
+               dr->dr_bio[i] = bio_alloc(GFP_NOIO,
+                   MIN(bio_nr_pages(bio_ptr, bio_size), BIO_MAX_PAGES));
                if (unlikely(dr->dr_bio[i] == NULL)) {
                        vdev_disk_dio_free(dr);
                        return (ENOMEM);
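
The vdev_disk.c change matters because a single 16M buffer can span more pages
than one bio is allowed to carry; clamping the request to BIO_MAX_PAGES keeps
bio_alloc() within its limits, and any remaining pages are presumably picked up
by the subsequent dr_bio[] entries filled in by the surrounding loop.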
index cf03016490642a9032b5578033a86a30114835fe..a0d6fc4e3394cd7a276e275a09cfe002e5adf4dc 100644 (file)
@@ -167,7 +167,7 @@ int zfs_vdev_async_write_active_max_dirty_percent = 60;
  * we include spans of optional I/Os to aid aggregation at the disk even when
  * they aren't able to help us aggregate at this level.
  */
-int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
+int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
 int zfs_vdev_read_gap_limit = 32 << 10;
 int zfs_vdev_write_gap_limit = 4 << 10;
 
index 5af98c2bc93ee5472c888dc800a1934b8aba83bf..8d920c2facc5bc76bb955815cb7e0e0254713fe3 100644 (file)
@@ -34,6 +34,7 @@
 #include <sys/zap_leaf.h>
 #include <sys/avl.h>
 #include <sys/arc.h>
+#include <sys/dmu_objset.h>
 
 #ifdef _KERNEL
 #include <sys/sunddi.h>
@@ -654,9 +655,9 @@ zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
        uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
 
        ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
-           leaf_blockshift <= SPA_MAXBLOCKSHIFT &&
+           leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
            indirect_blockshift >= SPA_MINBLOCKSHIFT &&
-           indirect_blockshift <= SPA_MAXBLOCKSHIFT);
+           indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT);
 
        VERIFY(dmu_object_set_blocksize(os, obj,
            1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
@@ -1347,7 +1348,6 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
        zap_t *zap;
        int err = 0;
 
-
        /*
         * Since, we don't have a name, we cannot figure out which blocks will
         * be affected in this operation. So, account for the worst case :
@@ -1360,7 +1360,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
         * large microzap results in a promotion to fatzap.
         */
        if (name == NULL) {
-               *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
+               *towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
                return (err);
        }
 
@@ -1384,7 +1384,7 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
                        /*
                         * We treat this case as similar to (name == NULL)
                         */
-                       *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
+                       *towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE;
                }
        } else {
                /*
@@ -1403,12 +1403,12 @@ zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
                 *                      ptrtbl blocks
                 */
                if (dmu_buf_freeable(zap->zap_dbuf))
-                       *tooverwrite += SPA_MAXBLOCKSIZE;
+                       *tooverwrite += MZAP_MAX_BLKSZ;
                else
-                       *towrite += SPA_MAXBLOCKSIZE;
+                       *towrite += MZAP_MAX_BLKSZ;
 
                if (add) {
-                       *towrite += 4 * SPA_MAXBLOCKSIZE;
+                       *towrite += 4 * MZAP_MAX_BLKSZ;
                }
        }
 
index 461456275f9f6e48b6427e6082c008f34999d3ea..609a72ab301a220e55aef68f53285b1dde433e0e 100644 (file)
@@ -56,7 +56,8 @@ valid_char(char c, boolean_t after_colon)
 {
        return ((c >= 'a' && c <= 'z') ||
            (c >= '0' && c <= '9') ||
-           c == (after_colon ? '_' : '.'));
+           (after_colon && c == '_') ||
+           (!after_colon && (c == '.' || c == '-')));
 }
 
 /*
@@ -230,4 +231,15 @@ zpool_feature_init(void)
            "com.delphix:embedded_data", "embedded_data",
            "Blocks which compress very well use even less space.",
            B_FALSE, B_TRUE, B_TRUE, NULL);
+
+       {
+       static const spa_feature_t large_blocks_deps[] = {
+               SPA_FEATURE_EXTENSIBLE_DATASET,
+               SPA_FEATURE_NONE
+       };
+       zfeature_register(SPA_FEATURE_LARGE_BLOCKS,
+           "org.open-zfs:large_blocks", "large_blocks",
+           "Support for blocks larger than 128KB.", B_FALSE, B_FALSE, B_FALSE,
+           large_blocks_deps);
+       }
 }
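
The valid_char() relaxation exists because the new feature GUID
"org.open-zfs:large_blocks" carries a hyphen in its reverse-DNS half, which the
old rule (only '.', letters and digits before the colon) would have rejected;
the new rule keeps '_' legal only after the colon and accepts '.' or '-' only
before it.  A quick standalone check of the relaxed rule against that GUID,
ignoring the other constraints zfeature_is_valid_guid() enforces (such as
requiring exactly one colon):

    #include <stdio.h>
    #include <string.h>

    static int
    valid_char(char c, int after_colon)
    {
            return ((c >= 'a' && c <= 'z') ||
                (c >= '0' && c <= '9') ||
                (after_colon && c == '_') ||
                (!after_colon && (c == '.' || c == '-')));
    }

    int
    main(void)
    {
            const char *guid = "org.open-zfs:large_blocks";
            int after_colon = 0, ok = 1;
            size_t i;

            for (i = 0; i < strlen(guid); i++) {
                    if (guid[i] == ':') {
                            after_colon = 1;
                            continue;
                    }
                    if (!valid_char(guid[i], after_colon))
                            ok = 0;
            }
            printf("%s: %s\n", guid, ok ? "valid" : "invalid");
            return (0);
    }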
index caaa3a21c71c8840420cf58146675c17ac5d8b55..f5137d09eb4e430950f72f3f4060fe2f95877a86 100644 (file)
@@ -2392,7 +2392,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
        const char *propname = nvpair_name(pair);
        zfs_prop_t prop = zfs_name_to_prop(propname);
        uint64_t intval;
-       int err;
+       int err = -1;
 
        if (prop == ZPROP_INVAL) {
                if (zfs_prop_userquota(propname))
@@ -3790,8 +3790,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
                 * the SPA supports it. We ignore any errors here since
                 * we'll catch them later.
                 */
-               if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
-                   nvpair_value_uint64(pair, &intval) == 0) {
+               if (nvpair_value_uint64(pair, &intval) == 0) {
                        if (intval >= ZIO_COMPRESS_GZIP_1 &&
                            intval <= ZIO_COMPRESS_GZIP_9 &&
                            zfs_earlier_version(dsname,
@@ -3842,6 +3841,42 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
                        return (SET_ERROR(ENOTSUP));
                break;
 
+       case ZFS_PROP_RECORDSIZE:
+               /* Record sizes above 128k need the feature to be enabled */
+               if (nvpair_value_uint64(pair, &intval) == 0 &&
+                   intval > SPA_OLD_MAXBLOCKSIZE) {
+                       spa_t *spa;
+
+                       /*
+                        * If this is a bootable dataset then
+                        * we don't allow large (>128K) blocks,
+                        * because GRUB doesn't support them.
+                        */
+                       if (zfs_is_bootfs(dsname) &&
+                           intval > SPA_OLD_MAXBLOCKSIZE) {
+                               return (SET_ERROR(EDOM));
+                       }
+
+                       /*
+                        * We don't allow setting the property above 1MB,
+                        * unless the tunable has been changed.
+                        */
+                       if (intval > zfs_max_recordsize ||
+                           intval > SPA_MAXBLOCKSIZE)
+                               return (SET_ERROR(EDOM));
+
+                       if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+                               return (err);
+
+                       if (!spa_feature_is_enabled(spa,
+                           SPA_FEATURE_LARGE_BLOCKS)) {
+                               spa_close(spa, FTAG);
+                               return (SET_ERROR(ENOTSUP));
+                       }
+                       spa_close(spa, FTAG);
+               }
+               break;
+
        case ZFS_PROP_SHARESMB:
                if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
                        return (SET_ERROR(ENOTSUP));
@@ -4221,7 +4256,7 @@ out:
  * zc_fromobj  objsetid of incremental fromsnap (may be zero)
  * zc_guid     if set, estimate size of stream only.  zc_cookie is ignored.
  *             output size in zc_objset_type.
- * zc_flags    if =1, WRITE_EMBEDDED records are permitted
+ * zc_flags    lzc_send_flags
  *
  * outputs:
  * zc_objset_type      estimated size, if zc_guid is set
@@ -4233,6 +4268,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
        offset_t off;
        boolean_t estimate = (zc->zc_guid != 0);
        boolean_t embedok = (zc->zc_flags & 0x1);
+       boolean_t large_block_ok = (zc->zc_flags & 0x2);
 
        if (zc->zc_obj != 0) {
                dsl_pool_t *dp;
@@ -4294,7 +4330,8 @@ zfs_ioc_send(zfs_cmd_t *zc)
 
                off = fp->f_offset;
                error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
-                   zc->zc_fromobj, embedok, zc->zc_cookie, fp->f_vnode, &off);
+                   zc->zc_fromobj, embedok, large_block_ok,
+                   zc->zc_cookie, fp->f_vnode, &off);
 
                if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
                        fp->f_offset = off;
@@ -5160,6 +5197,8 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
  * innvl: {
  *     "fd" -> file descriptor to write stream to (int32)
  *     (optional) "fromsnap" -> full snap name to send an incremental from
+ *     (optional) "largeblockok" -> (value ignored)
+ *         indicates that blocks > 128KB are permitted
  *     (optional) "embedok" -> (value ignored)
  *         presence indicates DRR_WRITE_EMBEDDED records are permitted
  * }
@@ -5175,6 +5214,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
        char *fromname = NULL;
        int fd;
        file_t *fp;
+       boolean_t largeblockok;
        boolean_t embedok;
 
        error = nvlist_lookup_int32(innvl, "fd", &fd);
@@ -5183,13 +5223,15 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
 
        (void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
 
+       largeblockok = nvlist_exists(innvl, "largeblockok");
        embedok = nvlist_exists(innvl, "embedok");
 
        if ((fp = getf(fd)) == NULL)
                return (SET_ERROR(EBADF));
 
        off = fp->f_offset;
-       error = dmu_send(snapname, fromname, embedok, fd, fp->f_vnode, &off);
+       error = dmu_send(snapname, fromname, embedok, largeblockok,
+           fd, fp->f_vnode, &off);
 
        if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
                fp->f_offset = off;
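
On the new-style ioctl the large-block opt-in travels as a presence-only nvlist
key, mirroring "embedok", while the legacy zfs_ioc_send() path uses bit 0x2 of
zc_flags.  A rough user-space sketch of assembling the innvl that
zfs_ioc_send_new() parses above, using libnvpair's fnvlist helpers
(build_send_args is an invented name; the real construction lives in
libzfs_core's lzc_send path):

    #include <libnvpair.h>

    /*
     * "fd", "fromsnap", "largeblockok" and "embedok" are the keys the handler
     * above looks up; the last two are presence-only flags.
     */
    nvlist_t *
    build_send_args(int fd, const char *fromsnap, boolean_t largeblockok,
        boolean_t embedok)
    {
            nvlist_t *args = fnvlist_alloc();

            fnvlist_add_int32(args, "fd", fd);
            if (fromsnap != NULL)
                    fnvlist_add_string(args, "fromsnap", fromsnap);
            if (largeblockok)
                    fnvlist_add_boolean(args, "largeblockok");
            if (embedok)
                    fnvlist_add_boolean(args, "embedok");
            return (args);
    }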
index cfce83138df2422933d86fe307b2d131b8df8efa..38d8de0ebf9764b1381e73882f17039d9aa7c725 100644 (file)
@@ -492,7 +492,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
                 * If the write would overflow the largest block then split it.
                 */
                if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA)
-                       len = SPA_MAXBLOCKSIZE >> 1;
+                       len = SPA_OLD_MAXBLOCKSIZE >> 1;
                else
                        len = resid;
 
index 59f73776cc1d37bb2aab9c28efbe707aa4c3f079..e86b21aeed61a1b3b8d3bcc1758ea6233f6ae9c9 100644 (file)
@@ -188,10 +188,9 @@ static void
 blksz_changed_cb(void *arg, uint64_t newval)
 {
        zfs_sb_t *zsb = arg;
-
-       if (newval < SPA_MINBLOCKSIZE ||
-           newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
-               newval = SPA_MAXBLOCKSIZE;
+       ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zsb->z_os)));
+       ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
+       ASSERT(ISP2(newval));
 
        zsb->z_max_blksz = newval;
 }
@@ -672,7 +671,7 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp)
         */
        zsb->z_sb = NULL;
        zsb->z_parent = zsb;
-       zsb->z_max_blksz = SPA_MAXBLOCKSIZE;
+       zsb->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
        zsb->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
        zsb->z_os = os;
 
index 5ce8a1e98cddb96be98e200e324ffd72e5c80fdf..19a4132e4c078e2753a49b1bb93f9e12b604dace 100644 (file)
@@ -771,8 +771,14 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
                        uint64_t new_blksz;
 
                        if (zp->z_blksz > max_blksz) {
+                               /*
+                                * File's blocksize is already larger than the
+                                * "recordsize" property.  Only let it grow to
+                                * the next power of 2.
+                                */
                                ASSERT(!ISP2(zp->z_blksz));
-                               new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
+                               new_blksz = MIN(end_size,
+                                   1 << highbit64(zp->z_blksz));
                        } else {
                                new_blksz = MIN(end_size, max_blksz);
                        }
index a3d64fe01b592be8b3944513235abc334994e515..f25ad0fc6981e9a794d4d9061d09c518d1c3fe64 100644 (file)
@@ -61,6 +61,7 @@
 #endif /* _KERNEL */
 
 #include <sys/dmu.h>
+#include <sys/dmu_objset.h>
 #include <sys/refcount.h>
 #include <sys/stat.h>
 #include <sys/zap.h>
@@ -1304,8 +1305,13 @@ zfs_extend(znode_t *zp, uint64_t end)
                 * We are growing the file past the current block size.
                 */
                if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
+                       /*
+                        * File's blocksize is already larger than the
+                        * "recordsize" property.  Only let it grow to
+                        * the next power of 2.
+                        */
                        ASSERT(!ISP2(zp->z_blksz));
-                       newblksz = MIN(end, SPA_MAXBLOCKSIZE);
+                       newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
                } else {
                        newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
                }
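
Both the zfs_write() and zfs_extend() hunks above change what happens when a
file's existing block size already exceeds the current "recordsize" property
(typically because the property was lowered after the file grew, hence the
ASSERT(!ISP2())): instead of jumping straight to SPA_MAXBLOCKSIZE, which is now
16M, the block is only rounded up to the next power of 2.  A standalone worked
example of that rule, with a hand-rolled highbit64() matching the kernel
helper's 1-based contract:

    #include <stdio.h>
    #include <stdint.h>

    #define MIN(a, b)       ((a) < (b) ? (a) : (b))

    /* Highest set bit, counting from 1; 0 for an input of 0. */
    static int
    highbit64(uint64_t i)
    {
            int h = 0;

            while (i != 0) {
                    h++;
                    i >>= 1;
            }
            return (h);
    }

    int
    main(void)
    {
            uint64_t z_blksz = 160 * 1024;          /* non-power-of-2 block size */
            uint64_t end_size = 16 * 1024 * 1024;   /* write extends well past it */
            uint64_t new_blksz;

            /* Mirrors the hunks above: grow to the next power of 2 only. */
            new_blksz = MIN(end_size, 1ULL << highbit64(z_blksz));

            printf("new block size: %llu\n", (unsigned long long)new_blksz);
            /* prints 262144 (256K), not 16777216 (16M) */
            return (0);
    }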
index be1a86b24ed73fe584b58aea5b83e8060ebd0606..3ae171e592d2f6364c619a78e963d46d424f6cb5 100644 (file)
@@ -243,6 +243,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
                            sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
                                error = SET_ERROR(ECKSUM);
                        } else {
+                               ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
                                bcopy(lr, dst, len);
                                *end = (char *)dst + len;
                                *nbp = zilc->zc_next_blk;
@@ -257,6 +258,8 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
                            (zilc->zc_nused > (size - sizeof (*zilc)))) {
                                error = SET_ERROR(ECKSUM);
                        } else {
+                               ASSERT3U(zilc->zc_nused, <=,
+                                   SPA_OLD_MAXBLOCKSIZE);
                                bcopy(lr, dst, zilc->zc_nused);
                                *end = (char *)dst + zilc->zc_nused;
                                *nbp = zilc->zc_next_blk;
@@ -342,7 +345,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
         * If the log has been claimed, stop if we encounter a sequence
         * number greater than the highest claimed sequence number.
         */
-       lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
+       lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
        zil_bp_tree_init(zilog);
 
        for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
@@ -389,7 +392,7 @@ done:
            (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
 
        zil_bp_tree_fini(zilog);
-       zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
+       zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
 
        return (error);
 }
@@ -941,7 +944,7 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
  *
  * These must be a multiple of 4KB. Note only the amount used (again
  * aligned to 4KB) actually gets written. However, we can't always just
- * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
+ * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
  */
 uint64_t zil_block_buckets[] = {
     4096,              /* non TX_WRITE */
@@ -1023,7 +1026,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
                continue;
        zil_blksz = zil_block_buckets[i];
        if (zil_blksz == UINT64_MAX)
-               zil_blksz = SPA_MAXBLOCKSIZE;
+               zil_blksz = SPA_OLD_MAXBLOCKSIZE;
        zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
        for (i = 0; i < ZIL_PREV_BLKS; i++)
                zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
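
Note that the intent log does not grow with the new feature: log blocks, the
zil_parse() scratch buffer and the lwb size buckets all stay within
SPA_OLD_MAXBLOCKSIZE, and zfs_log_write() (above) still splits oversized
non-indirect write records at SPA_OLD_MAXBLOCKSIZE >> 1, i.e. 64K.  Large blocks
change how file data is stored, not how it is logged.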
index 9204df2b22fb63ee7064fbd6a1571a68e36f07bd..2b338f2a717961d3addcbb6b0994378b40776e3e 100644 (file)
@@ -24,6 +24,7 @@
  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  */
 
+#include <sys/sysmacros.h>
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa.h>
@@ -107,9 +108,8 @@ zio_init(void)
 
        /*
         * For small buffers, we want a cache for each multiple of
-        * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
-        * for each quarter-power of 2.  For large buffers, we want
-        * a cache for each multiple of PAGESIZE.
+        * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
+        * for each quarter-power of 2.
         */
        for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
                size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
@@ -117,7 +117,16 @@ zio_init(void)
                size_t align = 0;
                size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
 
-               while (p2 & (p2 - 1))
+#ifdef _ILP32
+               /*
+                * Cache size limited to 1M on 32-bit platforms until ARC
+                * buffers no longer require virtual address space.
+                */
+               if (size > zfs_max_recordsize)
+                       break;
+#endif
+
+               while (!ISP2(p2))
                        p2 &= p2 - 1;
 
 #ifndef _KERNEL
@@ -132,10 +141,8 @@ zio_init(void)
 #endif
                if (size <= 4 * SPA_MINBLOCKSIZE) {
                        align = SPA_MINBLOCKSIZE;
-               } else if (IS_P2ALIGNED(size, PAGESIZE)) {
-                       align = PAGESIZE;
                } else if (IS_P2ALIGNED(size, p2 >> 2)) {
-                       align = p2 >> 2;
+                       align = MIN(p2 >> 2, PAGESIZE);
                }
 
                if (align != 0) {
@@ -174,6 +181,14 @@ zio_fini(void)
        kmem_cache_t *last_data_cache = NULL;
 
        for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
+#ifdef _ILP32
+               /*
+                * Cache size limited to 1M on 32-bit platforms until ARC
+                * buffers no longer require virtual address space.
+                */
+               if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize)
+                       break;
+#endif
                if (zio_buf_cache[c] != last_cache) {
                        last_cache = zio_buf_cache[c];
                        kmem_cache_destroy(zio_buf_cache[c]);
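
With the revised selection in zio_init(), the buffer caches follow a single rule
all the way to the 16M maximum: every multiple of SPA_MINBLOCKSIZE up to
4 * SPA_MINBLOCKSIZE gets its own cache, and above that only sizes that are a
multiple of a quarter of the nearest power of 2 below them do, with alignment
now clamped to PAGESIZE.  A standalone sketch that enumerates the cache points
this rule picks in the 1M-2M range (constants copied from the ZFS headers; the
real loop also computes alignment and applies the 32-bit zfs_max_recordsize cap):

    #include <stdio.h>

    #define SPA_MINBLOCKSHIFT       9
    #define SPA_MINBLOCKSIZE        (1 << SPA_MINBLOCKSHIFT)
    #define SPA_MAXBLOCKSIZE        (16 * 1024 * 1024)
    #define ISP2(x)                 (((x) & ((x) - 1)) == 0)

    int
    main(void)
    {
            size_t c;

            for (c = 0; c < (SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); c++) {
                    size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
                    size_t p2 = size;

                    while (!ISP2(p2))
                            p2 &= p2 - 1;   /* largest power of 2 <= size */

                    /* same cache-or-not decision as the patched zio_init() */
                    if (size > 4 * SPA_MINBLOCKSIZE && (size % (p2 >> 2)) != 0)
                            continue;

                    if (size >= 1024 * 1024 && size <= 2 * 1024 * 1024)
                            printf("cache at %zu bytes\n", size);
            }
            return (0);
    }

This prints caches at 1M, 1.25M, 1.5M, 1.75M and 2M; allocations that fall
between two cache points are served from the next larger cache.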