*
* CDDL HEADER END
*/
+
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
* location.
*
* Byteswap implications:
+ *
* Since the SA attributes are not entirely self describing we can't do
* the normal byteswap processing. The special ZAP layout attribute and
* attribute registration attributes define the byteswap function and the
sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
uint16_t buflen, dmu_tx_t *tx);
-arc_byteswap_func_t *sa_bswap_table[] = {
+arc_byteswap_func_t sa_bswap_table[] = {
byteswap_uint64_array,
byteswap_uint32_array,
byteswap_uint16_array,
};
/*
- * ZPL legacy layout
* This is only used for objects of type DMU_OT_ZNODE
*/
sa_attr_type_t sa_legacy_zpl_layout[] = {
/*
* Special dummy layout used for buffers with no attributes.
*/
-
sa_attr_type_t sa_dummy_zpl_layout[] = { 0 };
static int sa_legacy_attr_count = 16;
switch (data_op) {
case SA_LOOKUP:
if (bulk[i].sa_addr == NULL)
- return (ENOENT);
+ return (SET_ERROR(ENOENT));
if (bulk[i].sa_data) {
SA_COPY_DATA(bulk[i].sa_data_func,
bulk[i].sa_addr, bulk[i].sa_data,
char attr_name[8];
if (sa->sa_layout_attr_obj == 0) {
- sa->sa_layout_attr_obj = zap_create(os,
- DMU_OT_SA_ATTR_LAYOUTS, DMU_OT_NONE, 0, tx);
- VERIFY(zap_add(os, sa->sa_master_obj, SA_LAYOUTS, 8, 1,
- &sa->sa_layout_attr_obj, tx) == 0);
+ sa->sa_layout_attr_obj = zap_create_link(os,
+ DMU_OT_SA_ATTR_LAYOUTS,
+ sa->sa_master_obj, SA_LAYOUTS, tx);
}
(void) snprintf(attr_name, sizeof (attr_name),
blocksize = SPA_MINBLOCKSIZE;
} else if (size > SPA_MAXBLOCKSIZE) {
ASSERT(0);
- return (EFBIG);
+ return (SET_ERROR(EFBIG));
} else {
blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t);
}
}
/*
- * Determine several different sizes
- * first the sa header size
- * the number of bytes to be stored
- * if spill would occur the index in the attribute array is returned
+ * Determine several different values pertaining to system attribute
+ * buffers.
+ *
+ * Return the size of the sa_hdr_phys_t header for the buffer. Each
+ * variable length attribute except the first contributes two bytes to
+ * the header size, which is then rounded up to an 8-byte boundary.
+ *
+ * The following output parameters are also computed.
+ *
+ * index - The index of the first attribute in attr_desc that will
+ * spill over. Only valid if will_spill is set.
+ *
+ * total - The total number of bytes of all system attributes described
+ * in attr_desc.
*
- * the boolean will_spill will be set when spilling is necessary. It
- * is only set when the buftype is SA_BONUS
+ * will_spill - Set when spilling is necessary. It is only set when
+ * the buftype is SA_BONUS.
*/
static int
sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count,
dmu_buf_t *db, sa_buf_type_t buftype, int *index, int *total,
boolean_t *will_spill)
{
- int var_size = 0;
+ int var_size_count = 0;
int i;
int full_space;
int hdrsize;
- boolean_t done = B_FALSE;
+ int extra_hdrsize;
if (buftype == SA_BONUS && sa->sa_force_spill) {
*total = 0;
*index = -1;
*total = 0;
+ *will_spill = B_FALSE;
- if (buftype == SA_BONUS)
- *will_spill = B_FALSE;
-
+ extra_hdrsize = 0;
hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 :
sizeof (sa_hdr_phys_t);
full_space = (buftype == SA_BONUS) ? DN_MAX_BONUSLEN : db->db_size;
+ ASSERT(IS_P2ALIGNED(full_space, 8));
for (i = 0; i != attr_count; i++) {
- boolean_t is_var_sz;
+ boolean_t is_var_sz, might_spill_here;
+ int tmp_hdrsize;
+ *total = P2ROUNDUP(*total, 8);
*total += attr_desc[i].sa_length;
- if (done)
- goto next;
+ if (*will_spill)
+ continue;
is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0);
- if (is_var_sz) {
- var_size++;
- }
+ if (is_var_sz)
+ var_size_count++;
- if (is_var_sz && var_size > 1) {
- if (P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) +
- *total < full_space) {
- hdrsize += sizeof (uint16_t);
+ /*
+ * Calculate what the SA header size would be if this
+ * attribute doesn't spill.
+ */
+ tmp_hdrsize = hdrsize + ((is_var_sz && var_size_count > 1) ?
+ sizeof (uint16_t) : 0);
+
+ /*
+ * Check whether this attribute spans into the space
+ * that would be used by the spill block pointer should
+ * a spill block be needed.
+ */
+ might_spill_here =
+ buftype == SA_BONUS && *index == -1 &&
+ (*total + P2ROUNDUP(tmp_hdrsize, 8)) >
+ (full_space - sizeof (blkptr_t));
+
+ if (is_var_sz && var_size_count > 1) {
+ if (buftype == SA_SPILL ||
+ tmp_hdrsize + *total < full_space) {
+ /*
+ * Record the extra header size in case this
+ * increase needs to be reversed due to
+ * spill-over.
+ */
+ hdrsize = tmp_hdrsize;
+ if (*index != -1 || might_spill_here)
+ extra_hdrsize += sizeof (uint16_t);
} else {
- done = B_TRUE;
- *index = i;
- if (buftype == SA_BONUS)
- *will_spill = B_TRUE;
+ ASSERT(buftype == SA_BONUS);
+ if (*index == -1)
+ *index = i;
+ *will_spill = B_TRUE;
continue;
}
}
/*
- * find index of where spill *could* occur.
- * Then continue to count of remainder attribute
- * space. The sum is used later for sizing bonus
- * and spill buffer.
+ * Store index of where spill *could* occur. Then
+ * continue to count the remaining attribute sizes. The
+ * sum is used later for sizing bonus and spill buffer.
*/
- if (buftype == SA_BONUS && *index == -1 &&
- P2ROUNDUP(*total + hdrsize, 8) >
- (full_space - sizeof (blkptr_t))) {
+ if (might_spill_here)
*index = i;
- done = B_TRUE;
- }
-next:
- if (P2ROUNDUP(*total + hdrsize, 8) > full_space &&
+ if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space &&
buftype == SA_BONUS)
*will_spill = B_TRUE;
}
+ if (*will_spill)
+ hdrsize -= extra_hdrsize;
+
hdrsize = P2ROUNDUP(hdrsize, 8);
return (hdrsize);
}
sa_buf_type_t buftype;
sa_hdr_phys_t *sahdr;
void *data_start;
- int buf_space;
sa_attr_type_t *attrs, *attrs_start;
int i, lot_count;
- int hdrsize, spillhdrsize = 0;
+ int spill_idx;
+ int hdrsize;
+ int spillhdrsize = 0;
int used;
dmu_object_type_t bonustype;
sa_lot_t *lot;
/* first determine bonus header size and sum of all attributes */
hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
- SA_BONUS, &i, &used, &spilling);
+ SA_BONUS, &spill_idx, &used, &spilling);
if (used > SPA_MAXBLOCKSIZE)
- return (EFBIG);
+ return (SET_ERROR(EFBIG));
VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
MIN(DN_MAX_BONUSLEN - sizeof (blkptr_t), used + hdrsize) :
}
dmu_buf_will_dirty(hdl->sa_spill, tx);
- spillhdrsize = sa_find_sizes(sa, &attr_desc[i],
- attr_count - i, hdl->sa_spill, SA_SPILL, &i,
+ spillhdrsize = sa_find_sizes(sa, &attr_desc[spill_idx],
+ attr_count - spill_idx, hdl->sa_spill, SA_SPILL, &i,
&spill_used, &dummy);
if (spill_used > SPA_MAXBLOCKSIZE)
- return (EFBIG);
+ return (SET_ERROR(EFBIG));
- buf_space = hdl->sa_spill->db_size - spillhdrsize;
if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) >
hdl->sa_spill->db_size)
VERIFY(0 == sa_resize_spill(hdl,
sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data;
buftype = SA_BONUS;
- if (spilling)
- buf_space = (sa->sa_force_spill) ?
- 0 : SA_BLKPTR_SPACE - hdrsize;
- else
- buf_space = hdl->sa_bonus->db_size - hdrsize;
-
attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
KM_SLEEP);
lot_count = 0;
for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) {
uint16_t length;
+ ASSERT(IS_P2ALIGNED(data_start, 8));
attrs[i] = attr_desc[i].sa_attr;
length = SA_REGISTERED_LEN(sa, attrs[i]);
if (length == 0)
length = attr_desc[i].sa_length;
- if (buf_space < length) { /* switch to spill buffer */
+ if (spilling && i == spill_idx) { /* switch to spill buffer */
VERIFY(bonustype == DMU_OT_SA);
if (buftype == SA_BONUS && !sa->sa_force_spill) {
sa_find_layout(hdl->sa_os, hash, attrs_start,
data_start = (void *)((uintptr_t)sahdr +
spillhdrsize);
attrs_start = &attrs[i];
- buf_space = hdl->sa_spill->db_size - spillhdrsize;
lot_count = 0;
}
hash ^= SA_ATTR_HASH(attrs[i]);
}
data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
length), 8);
- buf_space -= P2ROUNDUP(length, 8);
lot_count++;
}
*/
if (error || (error == 0 && sa_attr_count == 0)) {
if (error == 0)
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
goto bail;
}
sa_reg_count = sa_attr_count;
error = zap_lookup(os, sa->sa_reg_attr_obj,
reg_attrs[i].sa_name, 8, 1, &attr_value);
else
- error = ENOENT;
+ error = SET_ERROR(ENOENT);
switch (error) {
case ENOENT:
sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count;
sa_attr_type_t *tb;
int error;
- mutex_enter(&os->os_lock);
+ mutex_enter(&os->os_user_ptr_lock);
if (os->os_sa) {
mutex_enter(&os->os_sa->sa_lock);
- mutex_exit(&os->os_lock);
+ mutex_exit(&os->os_user_ptr_lock);
tb = os->os_sa->sa_user_table;
mutex_exit(&os->os_sa->sa_lock);
*user_table = tb;
os->os_sa = sa;
mutex_enter(&sa->sa_lock);
- mutex_exit(&os->os_lock);
+ mutex_exit(&os->os_user_ptr_lock);
avl_create(&sa->sa_layout_num_tree, layout_num_compare,
sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node));
avl_create(&sa->sa_layout_hash_tree, layout_hash_compare,
*/
if (error || (error == 0 && layout_count == 0)) {
if (error == 0)
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
goto fail;
}
if (sa->sa_user_table)
kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
mutex_exit(&sa->sa_lock);
+ avl_destroy(&sa->sa_layout_hash_tree);
+ avl_destroy(&sa->sa_layout_num_tree);
+ mutex_destroy(&sa->sa_lock);
kmem_free(sa, sizeof (sa_os_t));
return ((error == ECKSUM) ? EIO : error);
}
sa_free_attr_table(sa);
cookie = NULL;
- while ((layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie))){
+ while ((layout =
+ avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie))) {
sa_idx_tab_t *tab;
while ((tab = list_head(&layout->lot_idx_tab))) {
ASSERT(refcount_count(&tab->sa_refcount));
}
cookie = NULL;
- while ((layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie))){
+ while ((layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie))) {
kmem_free(layout->lot_attrs,
sizeof (sa_attr_type_t) * layout->lot_attr_count);
kmem_free(layout, sizeof (sa_lot_t));
avl_destroy(&sa->sa_layout_hash_tree);
avl_destroy(&sa->sa_layout_num_tree);
+ mutex_destroy(&sa->sa_lock);
kmem_free(sa, sizeof (sa_os_t));
os->os_sa = NULL;
(void) refcount_add(&idx_tab->sa_refcount, NULL);
}
+void
+sa_spill_rele(sa_handle_t *hdl)
+{ /* Release the spill buffer and spill index table held by hdl, if any. */
+ mutex_enter(&hdl->sa_lock); /* hold the handle lock while the spill fields change */
+ if (hdl->sa_spill) { /* nothing to release when no spill buffer is attached */
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); /* drop the spill index table */
+ dmu_buf_rele(hdl->sa_spill, NULL); /* release the spill buffer; tag is NULL */
+ hdl->sa_spill = NULL; /* clear both pointers so the handle no longer refers to them */
+ hdl->sa_spill_tab = NULL;
+ }
+ mutex_exit(&hdl->sa_lock);
+}
+
void
sa_handle_destroy(sa_handle_t *hdl)
{
mutex_enter(&hdl->sa_lock);
(void) dmu_buf_update_user((dmu_buf_t *)hdl->sa_bonus, hdl,
- NULL, NULL, NULL);
+ NULL, NULL);
if (hdl->sa_bonus_tab) {
sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
error = sa_build_index(handle, SA_BONUS);
newhandle = (hdl_type == SA_HDL_SHARED) ?
- dmu_buf_set_user_ie(db, handle,
- NULL, sa_evict) : NULL;
+ dmu_buf_set_user_ie(db, handle, sa_evict) : NULL;
if (newhandle != NULL) {
kmem_cache_free(sa_cache, handle);
}
if (sa->sa_reg_attr_obj == 0) {
- sa->sa_reg_attr_obj = zap_create(hdl->sa_os,
- DMU_OT_SA_ATTR_REGISTRATION, DMU_OT_NONE, 0, tx);
- VERIFY(zap_add(hdl->sa_os, sa->sa_master_obj,
- SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj, tx) == 0);
+ sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os,
+ DMU_OT_SA_ATTR_REGISTRATION,
+ sa->sa_master_obj, SA_REGISTRY, tx);
}
for (i = 0; i != sa->sa_num_attrs; i++) {
if (sa->sa_attr_table[i].sa_registered)
sa_bulk_attr_t *attr_desc;
void *old_data[2];
int bonus_attr_count = 0;
- int bonus_data_size = 0, spill_data_size = 0;
+ int bonus_data_size = 0;
+ int spill_data_size = 0;
int spill_attr_count = 0;
int error;
uint16_t length;
if ((error = sa_get_spill(hdl)) == 0) {
spill_data_size = hdl->sa_spill->db_size;
- old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP);
+ old_data[1] = zio_buf_alloc(spill_data_size);
bcopy(hdl->sa_spill->db_data, old_data[1],
hdl->sa_spill->db_size);
spill_attr_count =
hdr = SA_GET_HDR(hdl, SA_BONUS);
idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS);
for (; k != 2; k++) {
- /* iterate over each attribute in layout */
+ /*
+ * Iterate over each attribute in layout. Fetch the
+ * size of variable-length attributes needing rewrite
+ * from sa_lengths[].
+ */
for (i = 0, length_idx = 0; i != count; i++) {
sa_attr_type_t attr;
attr = idx_tab->sa_layout->lot_attrs[i];
+ length = SA_REGISTERED_LEN(sa, attr);
if (attr == newattr) {
- if (action == SA_REMOVE) {
- j++;
+ if (length == 0)
+ ++length_idx;
+ if (action == SA_REMOVE)
continue;
- }
- ASSERT(SA_REGISTERED_LEN(sa, attr) == 0);
+ ASSERT(length == 0);
ASSERT(action == SA_REPLACE);
SA_ADD_BULK_ATTR(attr_desc, j, attr,
locator, datastart, buflen);
} else {
- length = SA_REGISTERED_LEN(sa, attr);
- if (length == 0) {
+ if (length == 0)
length = hdr->sa_lengths[length_idx++];
- }
SA_ADD_BULK_ATTR(attr_desc, j, attr,
NULL, (void *)
length = buflen;
}
SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator,
- datastart, buflen);
+ datastart, length);
}
error = sa_build_layouts(hdl, attr_desc, attr_count, tx);
if (old_data[0])
kmem_free(old_data[0], bonus_data_size);
if (old_data[1])
- kmem_free(old_data[1], spill_data_size);
+ zio_buf_free(old_data[1], spill_data_size);
kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count);
return (error);
int error;
sa_os_t *sa = hdl->sa_os->os_sa;
dmu_object_type_t bonustype;
-
- bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS));
+ dmu_buf_t *saved_spill;
ASSERT(hdl);
ASSERT(MUTEX_HELD(&hdl->sa_lock));
+ bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS));
+ saved_spill = hdl->sa_spill;
+
/* sync out registration table if necessary */
if (sa->sa_need_attr_registration)
sa_attr_register_sync(hdl, tx);
if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb)
sa->sa_update_cb(hdl, tx);
+ /*
+ * If saved_spill is NULL and current sa_spill is not NULL that
+ * means we increased the refcount of the spill buffer through
+ * sa_get_spill() or dmu_spill_hold_by_dnode(). Therefore we
+ * must release the hold before calling dmu_tx_commit() to avoid
+ * making a copy of this buffer in dbuf_sync_leaf() due to the
+ * reference count now being greater than 1.
+ */
+ if (!saved_spill && hdl->sa_spill) {
+ if (hdl->sa_spill_tab) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
+ hdl->sa_spill_tab = NULL;
+ }
+
+ dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL);
+ hdl->sa_spill = NULL;
+ }
+
return (error);
}
sa_update_user(sa_handle_t *newhdl, sa_handle_t *oldhdl)
{
(void) dmu_buf_update_user((dmu_buf_t *)newhdl->sa_bonus,
- oldhdl, newhdl, NULL, sa_evict);
+ oldhdl, newhdl, sa_evict);
oldhdl->sa_bonus = NULL;
}
EXPORT_SYMBOL(sa_handle_destroy);
EXPORT_SYMBOL(sa_buf_hold);
EXPORT_SYMBOL(sa_buf_rele);
+EXPORT_SYMBOL(sa_spill_rele);
EXPORT_SYMBOL(sa_lookup);
EXPORT_SYMBOL(sa_update);
EXPORT_SYMBOL(sa_remove);