]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | ||
22 | /* | |
428870ff | 23 | * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. |
34dc7c2f BB |
24 | */ |
25 | ||
34dc7c2f BB |
26 | #include <sys/spa.h> |
27 | #include <sys/spa_impl.h> | |
28 | #include <sys/zap.h> | |
29 | #include <sys/dsl_synctask.h> | |
30 | #include <sys/dmu_tx.h> | |
31 | #include <sys/dmu_objset.h> | |
32 | #include <sys/utsname.h> | |
33 | #include <sys/cmn_err.h> | |
34 | #include <sys/sunddi.h> | |
428870ff | 35 | #include "zfs_comutil.h" |
34dc7c2f BB |
36 | #ifdef _KERNEL |
37 | #include <sys/zone.h> | |
38 | #endif | |
39 | ||
40 | /* | |
41 | * Routines to manage the on-disk history log. | |
42 | * | |
43 | * The history log is stored as a dmu object containing | |
44 | * <packed record length, record nvlist> tuples. | |
45 | * | |
46 | * Where "record nvlist" is a nvlist containing uint64_ts and strings, and | |
47 | * "packed record length" is the packed length of the "record nvlist" stored | |
48 | * as a little endian uint64_t. | |
49 | * | |
50 | * The log is implemented as a ring buffer, though the original creation | |
51 | * of the pool ('zpool create') is never overwritten. | |
52 | * | |
53 | * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer | |
54 | * of 'spa_history' stores the offsets for logging/retrieving history as | |
55 | * 'spa_history_phys_t'. 'sh_pool_create_len' is the ending offset in bytes of | |
56 | * where the 'zpool create' record is stored. This allows us to never | |
57 | * overwrite the original creation of the pool. 'sh_phys_max_off' is the | |
58 | * physical ending offset in bytes of the log. This tells you the length of | |
59 | * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record | |
60 | * is added, 'sh_eof' is incremented by the size of the record. | |
61 | * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes). | |
62 | * This is where the consumer should start reading from after reading in | |
63 | * the 'zpool create' portion of the log. | |
64 | * | |
65 | * 'sh_records_lost' keeps track of how many records have been overwritten | |
66 | * and permanently lost. | |
67 | */ | |
68 | ||
69 | /* convert a logical offset to physical */ | |
70 | static uint64_t | |
71 | spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp) | |
72 | { | |
73 | uint64_t phys_len; | |
74 | ||
75 | phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len; | |
76 | return ((log_off - shpp->sh_pool_create_len) % phys_len | |
77 | + shpp->sh_pool_create_len); | |
78 | } | |
79 | ||
80 | void | |
81 | spa_history_create_obj(spa_t *spa, dmu_tx_t *tx) | |
82 | { | |
83 | dmu_buf_t *dbp; | |
84 | spa_history_phys_t *shpp; | |
85 | objset_t *mos = spa->spa_meta_objset; | |
86 | ||
87 | ASSERT(spa->spa_history == 0); | |
88 | spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY, | |
89 | SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS, | |
90 | sizeof (spa_history_phys_t), tx); | |
91 | ||
92 | VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, | |
93 | DMU_POOL_HISTORY, sizeof (uint64_t), 1, | |
94 | &spa->spa_history, tx) == 0); | |
95 | ||
96 | VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); | |
97 | ASSERT(dbp->db_size >= sizeof (spa_history_phys_t)); | |
98 | ||
99 | shpp = dbp->db_data; | |
100 | dmu_buf_will_dirty(dbp, tx); | |
101 | ||
102 | /* | |
103 | * Figure out maximum size of history log. We set it at | |
104 | * 1% of pool size, with a max of 32MB and min of 128KB. | |
105 | */ | |
428870ff BB |
106 | shpp->sh_phys_max_off = |
107 | metaslab_class_get_dspace(spa_normal_class(spa)) / 100; | |
34dc7c2f BB |
108 | shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 32<<20); |
109 | shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10); | |
110 | ||
111 | dmu_buf_rele(dbp, FTAG); | |
112 | } | |
113 | ||
114 | /* | |
115 | * Change 'sh_bof' to the beginning of the next record. | |
116 | */ | |
117 | static int | |
118 | spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp) | |
119 | { | |
120 | objset_t *mos = spa->spa_meta_objset; | |
121 | uint64_t firstread, reclen, phys_bof; | |
122 | char buf[sizeof (reclen)]; | |
123 | int err; | |
124 | ||
125 | phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp); | |
126 | firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof); | |
127 | ||
128 | if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread, | |
9babb374 | 129 | buf, DMU_READ_PREFETCH)) != 0) |
34dc7c2f BB |
130 | return (err); |
131 | if (firstread != sizeof (reclen)) { | |
132 | if ((err = dmu_read(mos, spa->spa_history, | |
133 | shpp->sh_pool_create_len, sizeof (reclen) - firstread, | |
9babb374 | 134 | buf + firstread, DMU_READ_PREFETCH)) != 0) |
34dc7c2f BB |
135 | return (err); |
136 | } | |
137 | ||
138 | reclen = LE_64(*((uint64_t *)buf)); | |
139 | shpp->sh_bof += reclen + sizeof (reclen); | |
140 | shpp->sh_records_lost++; | |
141 | return (0); | |
142 | } | |
143 | ||
144 | static int | |
145 | spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp, | |
146 | dmu_tx_t *tx) | |
147 | { | |
148 | uint64_t firstwrite, phys_eof; | |
149 | objset_t *mos = spa->spa_meta_objset; | |
150 | int err; | |
151 | ||
152 | ASSERT(MUTEX_HELD(&spa->spa_history_lock)); | |
153 | ||
154 | /* see if we need to reset logical BOF */ | |
155 | while (shpp->sh_phys_max_off - shpp->sh_pool_create_len - | |
156 | (shpp->sh_eof - shpp->sh_bof) <= len) { | |
157 | if ((err = spa_history_advance_bof(spa, shpp)) != 0) { | |
158 | return (err); | |
159 | } | |
160 | } | |
161 | ||
162 | phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); | |
163 | firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof); | |
164 | shpp->sh_eof += len; | |
165 | dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx); | |
166 | ||
167 | len -= firstwrite; | |
168 | if (len > 0) { | |
169 | /* write out the rest at the beginning of physical file */ | |
170 | dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len, | |
171 | len, (char *)buf + firstwrite, tx); | |
172 | } | |
173 | ||
174 | return (0); | |
175 | } | |
176 | ||
/*
 * Return the zone name to record with a history entry: the current
 * process's zone name in the kernel, or the fixed string "global" when
 * built without _KERNEL.  The returned string is not owned by the caller.
 */
static char *
spa_history_zone(void)
{
#ifdef _KERNEL
	return (curproc->p_zone->zone_name);
#else
	return ("global");
#endif
}
186 | ||
187 | /* | |
188 | * Write out a history event. | |
189 | */ | |
428870ff | 190 | /*ARGSUSED*/ |
34dc7c2f | 191 | static void |
428870ff | 192 | spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) |
34dc7c2f BB |
193 | { |
194 | spa_t *spa = arg1; | |
195 | history_arg_t *hap = arg2; | |
196 | const char *history_str = hap->ha_history_str; | |
197 | objset_t *mos = spa->spa_meta_objset; | |
198 | dmu_buf_t *dbp; | |
199 | spa_history_phys_t *shpp; | |
200 | size_t reclen; | |
201 | uint64_t le_len; | |
202 | nvlist_t *nvrecord; | |
203 | char *record_packed = NULL; | |
204 | int ret; | |
205 | ||
206 | /* | |
207 | * If we have an older pool that doesn't have a command | |
208 | * history object, create it now. | |
209 | */ | |
210 | mutex_enter(&spa->spa_history_lock); | |
211 | if (!spa->spa_history) | |
212 | spa_history_create_obj(spa, tx); | |
213 | mutex_exit(&spa->spa_history_lock); | |
214 | ||
215 | /* | |
216 | * Get the offset of where we need to write via the bonus buffer. | |
217 | * Update the offset when the write completes. | |
218 | */ | |
219 | VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); | |
220 | shpp = dbp->db_data; | |
221 | ||
222 | dmu_buf_will_dirty(dbp, tx); | |
223 | ||
224 | #ifdef ZFS_DEBUG | |
225 | { | |
226 | dmu_object_info_t doi; | |
227 | dmu_object_info_from_db(dbp, &doi); | |
228 | ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); | |
229 | } | |
230 | #endif | |
231 | ||
232 | VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_SLEEP) == 0); | |
233 | VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME, | |
234 | gethrestime_sec()) == 0); | |
428870ff BB |
235 | VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_WHO, hap->ha_uid) == 0); |
236 | if (hap->ha_zone != NULL) | |
34dc7c2f BB |
237 | VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_ZONE, |
238 | hap->ha_zone) == 0); | |
239 | #ifdef _KERNEL | |
240 | VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_HOST, | |
241 | utsname.nodename) == 0); | |
242 | #endif | |
243 | if (hap->ha_log_type == LOG_CMD_POOL_CREATE || | |
244 | hap->ha_log_type == LOG_CMD_NORMAL) { | |
245 | VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD, | |
246 | history_str) == 0); | |
428870ff BB |
247 | |
248 | zfs_dbgmsg("command: %s", history_str); | |
34dc7c2f BB |
249 | } else { |
250 | VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_INT_EVENT, | |
251 | hap->ha_event) == 0); | |
252 | VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TXG, | |
253 | tx->tx_txg) == 0); | |
254 | VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_INT_STR, | |
255 | history_str) == 0); | |
428870ff BB |
256 | |
257 | zfs_dbgmsg("internal %s pool:%s txg:%llu %s", | |
258 | zfs_history_event_names[hap->ha_event], spa_name(spa), | |
259 | (longlong_t)tx->tx_txg, history_str); | |
260 | ||
34dc7c2f BB |
261 | } |
262 | ||
263 | VERIFY(nvlist_size(nvrecord, &reclen, NV_ENCODE_XDR) == 0); | |
264 | record_packed = kmem_alloc(reclen, KM_SLEEP); | |
265 | ||
266 | VERIFY(nvlist_pack(nvrecord, &record_packed, &reclen, | |
267 | NV_ENCODE_XDR, KM_SLEEP) == 0); | |
268 | ||
269 | mutex_enter(&spa->spa_history_lock); | |
270 | if (hap->ha_log_type == LOG_CMD_POOL_CREATE) | |
271 | VERIFY(shpp->sh_eof == shpp->sh_pool_create_len); | |
272 | ||
273 | /* write out the packed length as little endian */ | |
274 | le_len = LE_64((uint64_t)reclen); | |
275 | ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx); | |
276 | if (!ret) | |
277 | ret = spa_history_write(spa, record_packed, reclen, shpp, tx); | |
278 | ||
279 | if (!ret && hap->ha_log_type == LOG_CMD_POOL_CREATE) { | |
280 | shpp->sh_pool_create_len += sizeof (le_len) + reclen; | |
281 | shpp->sh_bof = shpp->sh_pool_create_len; | |
282 | } | |
283 | ||
284 | mutex_exit(&spa->spa_history_lock); | |
285 | nvlist_free(nvrecord); | |
286 | kmem_free(record_packed, reclen); | |
287 | dmu_buf_rele(dbp, FTAG); | |
288 | ||
428870ff BB |
289 | strfree(hap->ha_history_str); |
290 | if (hap->ha_zone != NULL) | |
291 | strfree(hap->ha_zone); | |
292 | kmem_free(hap, sizeof (history_arg_t)); | |
34dc7c2f BB |
293 | } |
294 | ||
295 | /* | |
296 | * Write out a history event. | |
297 | */ | |
298 | int | |
299 | spa_history_log(spa_t *spa, const char *history_str, history_log_type_t what) | |
300 | { | |
428870ff BB |
301 | history_arg_t *ha; |
302 | int err = 0; | |
303 | dmu_tx_t *tx; | |
34dc7c2f BB |
304 | |
305 | ASSERT(what != LOG_INTERNAL); | |
306 | ||
428870ff BB |
307 | tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); |
308 | err = dmu_tx_assign(tx, TXG_WAIT); | |
309 | if (err) { | |
310 | dmu_tx_abort(tx); | |
311 | return (err); | |
312 | } | |
313 | ||
314 | ha = kmem_alloc(sizeof (history_arg_t), KM_SLEEP); | |
315 | ha->ha_history_str = strdup(history_str); | |
316 | ha->ha_zone = strdup(spa_history_zone()); | |
317 | ha->ha_log_type = what; | |
318 | ha->ha_uid = crgetuid(CRED()); | |
319 | ||
320 | /* Kick this off asynchronously; errors are ignored. */ | |
321 | dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL, | |
322 | spa_history_log_sync, spa, ha, 0, tx); | |
323 | dmu_tx_commit(tx); | |
324 | ||
325 | /* spa_history_log_sync will free ha and strings */ | |
326 | return (err); | |
34dc7c2f BB |
327 | } |
328 | ||
329 | /* | |
330 | * Read out the command history. | |
331 | */ | |
332 | int | |
333 | spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) | |
334 | { | |
335 | objset_t *mos = spa->spa_meta_objset; | |
336 | dmu_buf_t *dbp; | |
337 | uint64_t read_len, phys_read_off, phys_eof; | |
338 | uint64_t leftover = 0; | |
339 | spa_history_phys_t *shpp; | |
340 | int err; | |
341 | ||
342 | /* | |
343 | * If the command history doesn't exist (older pool), | |
344 | * that's ok, just return ENOENT. | |
345 | */ | |
346 | if (!spa->spa_history) | |
347 | return (ENOENT); | |
348 | ||
428870ff BB |
349 | /* |
350 | * The history is logged asynchronously, so when they request | |
351 | * the first chunk of history, make sure everything has been | |
352 | * synced to disk so that we get it. | |
353 | */ | |
354 | if (*offp == 0 && spa_writeable(spa)) | |
355 | txg_wait_synced(spa_get_dsl(spa), 0); | |
356 | ||
34dc7c2f BB |
357 | if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0) |
358 | return (err); | |
359 | shpp = dbp->db_data; | |
360 | ||
361 | #ifdef ZFS_DEBUG | |
362 | { | |
363 | dmu_object_info_t doi; | |
364 | dmu_object_info_from_db(dbp, &doi); | |
365 | ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); | |
366 | } | |
367 | #endif | |
368 | ||
369 | mutex_enter(&spa->spa_history_lock); | |
370 | phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); | |
371 | ||
372 | if (*offp < shpp->sh_pool_create_len) { | |
373 | /* read in just the zpool create history */ | |
374 | phys_read_off = *offp; | |
375 | read_len = MIN(*len, shpp->sh_pool_create_len - | |
376 | phys_read_off); | |
377 | } else { | |
378 | /* | |
379 | * Need to reset passed in offset to BOF if the passed in | |
380 | * offset has since been overwritten. | |
381 | */ | |
382 | *offp = MAX(*offp, shpp->sh_bof); | |
383 | phys_read_off = spa_history_log_to_phys(*offp, shpp); | |
384 | ||
385 | /* | |
386 | * Read up to the minimum of what the user passed down or | |
387 | * the EOF (physical or logical). If we hit physical EOF, | |
388 | * use 'leftover' to read from the physical BOF. | |
389 | */ | |
390 | if (phys_read_off <= phys_eof) { | |
391 | read_len = MIN(*len, phys_eof - phys_read_off); | |
392 | } else { | |
393 | read_len = MIN(*len, | |
394 | shpp->sh_phys_max_off - phys_read_off); | |
395 | if (phys_read_off + *len > shpp->sh_phys_max_off) { | |
396 | leftover = MIN(*len - read_len, | |
397 | phys_eof - shpp->sh_pool_create_len); | |
398 | } | |
399 | } | |
400 | } | |
401 | ||
402 | /* offset for consumer to use next */ | |
403 | *offp += read_len + leftover; | |
404 | ||
405 | /* tell the consumer how much you actually read */ | |
406 | *len = read_len + leftover; | |
407 | ||
408 | if (read_len == 0) { | |
409 | mutex_exit(&spa->spa_history_lock); | |
410 | dmu_buf_rele(dbp, FTAG); | |
411 | return (0); | |
412 | } | |
413 | ||
9babb374 BB |
414 | err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf, |
415 | DMU_READ_PREFETCH); | |
34dc7c2f BB |
416 | if (leftover && err == 0) { |
417 | err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, | |
9babb374 | 418 | leftover, buf + read_len, DMU_READ_PREFETCH); |
34dc7c2f BB |
419 | } |
420 | mutex_exit(&spa->spa_history_lock); | |
421 | ||
422 | dmu_buf_rele(dbp, FTAG); | |
423 | return (err); | |
424 | } | |
425 | ||
45d1cae3 BB |
426 | static void |
427 | log_internal(history_internal_events_t event, spa_t *spa, | |
428870ff | 428 | dmu_tx_t *tx, const char *fmt, va_list adx) |
34dc7c2f | 429 | { |
428870ff | 430 | history_arg_t *ha; |
34dc7c2f | 431 | |
b128c09f BB |
432 | /* |
433 | * If this is part of creating a pool, not everything is | |
434 | * initialized yet, so don't bother logging the internal events. | |
435 | */ | |
436 | if (tx->tx_txg == TXG_INITIAL) | |
437 | return; | |
438 | ||
428870ff BB |
439 | ha = kmem_alloc(sizeof (history_arg_t), KM_SLEEP); |
440 | ha->ha_history_str = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1, | |
441 | KM_SLEEP); | |
34dc7c2f | 442 | |
428870ff | 443 | (void) vsprintf(ha->ha_history_str, fmt, adx); |
34dc7c2f | 444 | |
428870ff BB |
445 | ha->ha_log_type = LOG_INTERNAL; |
446 | ha->ha_event = event; | |
447 | ha->ha_zone = NULL; | |
448 | ha->ha_uid = 0; | |
34dc7c2f BB |
449 | |
450 | if (dmu_tx_is_syncing(tx)) { | |
428870ff | 451 | spa_history_log_sync(spa, ha, tx); |
34dc7c2f BB |
452 | } else { |
453 | dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL, | |
428870ff | 454 | spa_history_log_sync, spa, ha, 0, tx); |
34dc7c2f | 455 | } |
428870ff | 456 | /* spa_history_log_sync() will free ha and strings */ |
34dc7c2f | 457 | } |
45d1cae3 BB |
458 | |
459 | void | |
428870ff BB |
460 | spa_history_log_internal(history_internal_events_t event, spa_t *spa, |
461 | dmu_tx_t *tx, const char *fmt, ...) | |
45d1cae3 BB |
462 | { |
463 | dmu_tx_t *htx = tx; | |
464 | va_list adx; | |
465 | ||
466 | /* create a tx if we didn't get one */ | |
467 | if (tx == NULL) { | |
468 | htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); | |
469 | if (dmu_tx_assign(htx, TXG_WAIT) != 0) { | |
470 | dmu_tx_abort(htx); | |
471 | return; | |
472 | } | |
473 | } | |
474 | ||
475 | va_start(adx, fmt); | |
428870ff | 476 | log_internal(event, spa, htx, fmt, adx); |
45d1cae3 BB |
477 | va_end(adx); |
478 | ||
479 | /* if we didn't get a tx from the caller, commit the one we made */ | |
480 | if (tx == NULL) | |
481 | dmu_tx_commit(htx); | |
482 | } | |
483 | ||
484 | void | |
485 | spa_history_log_version(spa_t *spa, history_internal_events_t event) | |
486 | { | |
487 | #ifdef _KERNEL | |
488 | uint64_t current_vers = spa_version(spa); | |
489 | ||
490 | if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) { | |
428870ff | 491 | spa_history_log_internal(event, spa, NULL, |
45d1cae3 BB |
492 | "pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s", |
493 | (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION, | |
494 | utsname.nodename, utsname.release, utsname.version, | |
495 | utsname.machine); | |
496 | } | |
497 | cmn_err(CE_CONT, "!%s version %llu pool %s using %llu", | |
498 | event == LOG_POOL_IMPORT ? "imported" : | |
499 | event == LOG_POOL_CREATE ? "created" : "accessed", | |
500 | (u_longlong_t)current_vers, spa_name(spa), SPA_VERSION); | |
501 | #endif | |
502 | } |