]> git.proxmox.com Git - mirror_qemu.git/blob - include/block/block-common.h
block: Fix deadlocks in bdrv_graph_wrunlock()
[mirror_qemu.git] / include / block / block-common.h
1 /*
2 * QEMU System Emulator block driver
3 *
4 * Copyright (c) 2003 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24 #ifndef BLOCK_COMMON_H
25 #define BLOCK_COMMON_H
26
27 #include "qapi/qapi-types-block-core.h"
28 #include "qemu/queue.h"
29
/*
 * co_wrapper{*}: marker specifiers for scripts/block-coroutine-wrapper.py
 *
 * These specifiers have no purpose other than tagging the functions that
 * scripts/block-coroutine-wrapper.py has to generate.
 *
 * Usage: see docs/devel/block-coroutine-wrapper.rst
 *
 * Four kinds of specifier exist:
 * - co_wrapper: may only be called from non-coroutine context, because a
 *   new coroutine is always created.
 * - co_wrapper_mixed: may be called from coroutine and non-coroutine
 *   context alike.
 * - co_wrapper_bdrv_rdlock: a co_wrapper that additionally takes and
 *   releases the graph rdlock automatically when the new coroutine is
 *   created.
 * - co_wrapper_mixed_bdrv_rdlock: a co_wrapper_mixed that additionally
 *   takes and releases the graph rdlock automatically when a new coroutine
 *   is created.
 *
 * A coroutine_fn should not call any of these functions; it should call the
 * wrapped function directly instead.
 */

/* Variants that always create a new coroutine */
#define co_wrapper                      no_coroutine_fn
#define co_wrapper_bdrv_rdlock          no_coroutine_fn

/* Variants that are callable from both contexts */
#define co_wrapper_mixed                no_coroutine_fn coroutine_mixed_fn
#define co_wrapper_mixed_bdrv_rdlock    no_coroutine_fn coroutine_mixed_fn
56
/*
 * no_co_wrapper{*}: marker specifiers for scripts/block-coroutine-wrapper.py
 *
 * Like the co_wrapper family, these specifiers do nothing except tag the
 * functions that scripts/block-coroutine-wrapper.py has to generate; each
 * of them expands to nothing.
 *
 * Declaring a function as no_co_wrapper produces a coroutine_fn wrapper
 * around a function that must not run in coroutine context.  The wrapper
 * schedules a bottom half (BH) which runs the non-coroutine function; the
 * coroutine yields once the BH is scheduled and is reentered after the
 * wrapped function has returned.
 *
 * no_co_wrapper_bdrv_rdlock behaves like no_co_wrapper but takes the graph
 * rdlock automatically around the call to the wrapped function.
 * no_co_wrapper_bdrv_wrlock does the same with the graph wrlock.
 *
 * When the first parameter of the function is a BlockDriverState, BdrvChild
 * or BlockBackend pointer, the wrapper takes the AioContext lock for it.
 */
#define no_co_wrapper_bdrv_wrlock
#define no_co_wrapper_bdrv_rdlock
#define no_co_wrapper
80
81 #include "block/blockjob.h"
82
/*
 * block.c
 *
 * Forward declarations only: the structures stay incomplete in this header,
 * so they can be used here solely through pointers.
 */
typedef struct BdrvChild BdrvChild;
typedef struct BdrvChildClass BdrvChildClass;
typedef struct BlockDriver BlockDriver;
87
/* Zone operations: open, close, finish and reset (numbered from zero). */
typedef enum BlockZoneOp {
    BLK_ZO_OPEN   = 0,
    BLK_ZO_CLOSE  = 1,
    BLK_ZO_FINISH = 2,
    BLK_ZO_RESET  = 3,
} BlockZoneOp;
94
/* Zone model of a block device. */
typedef enum BlockZoneModel {
    /* Regular block device */
    BLK_Z_NONE = 0,
    /* Host-managed zoned block device */
    BLK_Z_HM = 1,
    /* Host-aware zoned block device */
    BLK_Z_HA = 2,
} BlockZoneModel;
100
/*
 * State of a single zone.
 *
 * The numbering is not contiguous: values 0x5 through 0xC are not assigned.
 * NOTE(review): the values presumably mirror an external zoned-storage
 * encoding; confirm before renumbering anything.
 */
typedef enum BlockZoneState {
    BLK_ZS_NOT_WP  = 0,
    BLK_ZS_EMPTY   = 1,
    BLK_ZS_IOPEN   = 2,
    BLK_ZS_EOPEN   = 3,
    BLK_ZS_CLOSED  = 4,
    BLK_ZS_RDONLY  = 13,
    BLK_ZS_FULL    = 14,
    BLK_ZS_OFFLINE = 15,
} BlockZoneState;
111
/* Type of a zone; note that the numbering starts at 1, not 0. */
typedef enum BlockZoneType {
    /* Conventional random writes supported */
    BLK_ZT_CONV = 1,
    /* Sequential writes required */
    BLK_ZT_SWR = 2,
    /* Sequential writes preferred */
    BLK_ZT_SWP = 3,
} BlockZoneType;
117
118 /*
119 * Zone descriptor data structure.
120 * Provides information on a zone with all position and size values in bytes.
121 */
122 typedef struct BlockZoneDescriptor {
123 uint64_t start;
124 uint64_t length;
125 uint64_t cap;
126 uint64_t wp;
127 BlockZoneType type;
128 BlockZoneState state;
129 } BlockZoneDescriptor;
130
131 /*
132 * Track write pointers of a zone in bytes.
133 */
134 typedef struct BlockZoneWps {
135 CoMutex colock;
136 uint64_t wp[];
137 } BlockZoneWps;
138
/* Information a block driver reports about an image. */
typedef struct BlockDriverInfo {
    /* Cluster size in bytes; 0 if irrelevant */
    int cluster_size;

    /*
     * Subcluster size: a fraction of cluster_size where supported (currently
     * QCOW2 only).  Set equal to cluster_size when subclusters are disabled
     * or unsupported.
     */
    int subcluster_size;

    /* Offset at which the VM state can be saved; 0 if that is not possible */
    int64_t vm_state_offset;

    bool is_dirty;

    /* True if this block driver only supports compressed writes */
    bool needs_compressed_writes;
} BlockDriverInfo;
155
/* Cluster counters: allocated, total, fragmented and compressed clusters. */
typedef struct BlockFragInfo {
    uint64_t allocated_clusters;
    uint64_t total_clusters;
    uint64_t fragmented_clusters;
    uint64_t compressed_clusters;
} BlockFragInfo;
162
/* Per-request flags; every flag occupies its own bit. */
typedef enum {
    BDRV_REQ_COPY_ON_READ       = 1 << 0,
    BDRV_REQ_ZERO_WRITE         = 1 << 1,

    /*
     * Used in write_zeroes requests to tell the block driver that it should
     * unmap (discard) blocks, provided the result is guaranteed to read back
     * as zeroes.  The driver only ever sees this flag if the block device
     * was opened with BDRV_O_UNMAP.
     */
    BDRV_REQ_MAY_UNMAP          = 1 << 2,

    /*
     * Optimization hint: all QEMUIOVector elements lie within memory ranges
     * previously registered through bdrv_register_buf().
     *
     * Code that substitutes bounce buffers for the user's QEMUIOVector
     * elements must take care to clear this flag.
     */
    BDRV_REQ_REGISTERED_BUF     = 1 << 3,

    BDRV_REQ_FUA                = 1 << 4,
    BDRV_REQ_WRITE_COMPRESSED   = 1 << 5,

    /*
     * This write request will not change the visible disk content.
     */
    BDRV_REQ_WRITE_UNCHANGED    = 1 << 6,

    /*
     * Forces request serialisation.  Use only with write requests.
     */
    BDRV_REQ_SERIALISING        = 1 << 7,

    /*
     * Execute the request only if the operation can be offloaded or can
     * otherwise be executed efficiently; return an error rather than using
     * a slow fallback.
     */
    BDRV_REQ_NO_FALLBACK        = 1 << 8,

    /*
     * Only meaningful in the context of copy-on-read (i.e. together with
     * BDRV_REQ_COPY_ON_READ or when a COR filter is involved).  It signals
     * that the COR operation need not read the data into memory (qiov) and
     * only has to ensure they are copied to the top layer (i.e. that the
     * COR operation is done).
     */
    BDRV_REQ_PREFETCH           = 1 << 9,

    /*
     * Fail immediately instead of waiting for other requests.  Used only
     * together with BDRV_REQ_SERIALISING, and only with requests aligned to
     * request_alignment (the corresponding assertions are in block/io.c).
     */
    BDRV_REQ_NO_WAIT            = 1 << 10,

    /* Mask of valid flags: every bit up to and including BDRV_REQ_NO_WAIT */
    BDRV_REQ_MASK               = (BDRV_REQ_NO_WAIT << 1) - 1,
} BdrvRequestFlags;
224
/*
 * BDRV_O_* open flags.  Each flag is a distinct bit; the value 0x00040 is
 * not assigned in this list.
 */

/* don't share permissions */
#define BDRV_O_NO_SHARE     0x00001
#define BDRV_O_RDWR         0x00002
/* request permission for resizing the node */
#define BDRV_O_RESIZE       0x00004
/* open the file read only and save writes in a snapshot */
#define BDRV_O_SNAPSHOT     0x00008
/* delete the file after use */
#define BDRV_O_TEMPORARY    0x00010
/* do not use the host page cache */
#define BDRV_O_NOCACHE      0x00020
/* use native AIO instead of the thread pool */
#define BDRV_O_NATIVE_AIO   0x00080
/* don't open the backing file */
#define BDRV_O_NO_BACKING   0x00100
/* disable flushing on this disk */
#define BDRV_O_NO_FLUSH     0x00200
/* copy read backing sectors into image */
#define BDRV_O_COPY_ON_READ 0x00400
/* consistency hint for migration handoff */
#define BDRV_O_INACTIVE     0x00800
/* open solely for consistency check */
#define BDRV_O_CHECK        0x01000
/* allow reopen to change from r/o to r/w */
#define BDRV_O_ALLOW_RDWR   0x02000
/* execute guest UNMAP/TRIM operations */
#define BDRV_O_UNMAP        0x04000
/*
 * if no block driver is explicitly given: select an appropriate protocol
 * driver, ignoring the format layer
 */
#define BDRV_O_PROTOCOL     0x08000
/* don't initialize for I/O */
#define BDRV_O_NO_IO        0x10000
/* degrade to read-only if opening read-write fails */
#define BDRV_O_AUTO_RDONLY  0x20000
/* use io_uring instead of the thread pool */
#define BDRV_O_IO_URING     0x40000

/* The two flags that describe the cache mode */
#define BDRV_O_CACHE_MASK   (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH)
250
251
/* Names of the options that the block layer itself parses */

/* Cache-related options */
#define BDRV_OPT_CACHE_WB           "cache.writeback"
#define BDRV_OPT_CACHE_DIRECT       "cache.direct"
#define BDRV_OPT_CACHE_NO_FLUSH     "cache.no-flush"

/* Remaining options */
#define BDRV_OPT_READ_ONLY          "read-only"
#define BDRV_OPT_AUTO_READ_ONLY     "auto-read-only"
#define BDRV_OPT_DISCARD            "discard"
#define BDRV_OPT_FORCE_SHARE        "force-share"
261
262
/*
 * Sector granularity: a sector is 2^BDRV_SECTOR_BITS bytes.  The size is an
 * unsigned long long constant.
 */
#define BDRV_SECTOR_BITS   9
#define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
265
/*
 * Extract the most significant bit (bit 63) of the write pointer value wp.
 * If that bit is zero, the zone type is SWR.
 *
 * The result is the masked value (either 0 or 1ULL << 63), not a normalised
 * 0/1, so use it in boolean context only.
 *
 * The argument is parenthesised so that expressions such as `a | b` are
 * masked as a whole; without the parentheses `&` would bind to the last
 * operand only.
 */
#define BDRV_ZT_IS_CONV(wp)    ((wp) & (1ULL << 63))
271
/*
 * Upper bound for a single request, in sectors and in bytes.
 *
 * The sector limit combines SIZE_MAX and INT_MAX, each shifted right by
 * BDRV_SECTOR_BITS, through MIN_CONST; the byte limit is the sector limit
 * shifted back left by BDRV_SECTOR_BITS.
 */
#define BDRV_REQUEST_MAX_SECTORS \
    MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, INT_MAX >> BDRV_SECTOR_BITS)
#define BDRV_REQUEST_MAX_BYTES \
    (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
275
/*
 * We want to allow aligning requests and the disk length up to any 32-bit
 * alignment without having to be afraid of overflow.
 *
 * To achieve that, and at the same time use a reasonably round number as
 * the maximum disk size, the maximum "length" (a limit for any offset/bytes
 * request and for the disk size) is defined as INT64_MAX aligned down to
 * BDRV_MAX_ALIGNMENT.
 *
 * NOTE(review): this comment used to describe the limit as "the greatest
 * power of 2 less than INT64_MAX", which is not what the expression below
 * says; confirm which one is intended.
 */
#define BDRV_MAX_ALIGNMENT  (1L << 30)
#define BDRV_MAX_LENGTH     (QEMU_ALIGN_DOWN(INT64_MAX, BDRV_MAX_ALIGNMENT))
285
/*
 * Allocation status flags for bdrv_block_status() and friends.
 *
 * Public flags:
 * BDRV_BLOCK_DATA: allocation for data at offset is tied to this layer
 * BDRV_BLOCK_ZERO: offset reads as zero
 * BDRV_BLOCK_OFFSET_VALID: an associated offset exists for accessing raw
 *                          data
 * BDRV_BLOCK_ALLOCATED: the content of the block is determined by this
 *                       layer rather than any backing; set by the block
 *                       layer
 * BDRV_BLOCK_EOF: the returned pnum covers through end of file for this
 *                 layer; set by the block layer
 * BDRV_BLOCK_COMPRESSED: the underlying data is compressed; only valid for
 *                        the formats supporting compression: qcow, qcow2
 *
 * Internal flags:
 * BDRV_BLOCK_RAW: for use by passthrough drivers, such as raw, to request
 *                 that the block layer recompute the answer from the
 *                 returned BDS; must be accompanied by just
 *                 BDRV_BLOCK_OFFSET_VALID.
 * BDRV_BLOCK_RECURSE: request that the block layer recursively search for
 *                     zeroes in the file child of the current block node
 *                     inside the returned region.  Only valid together with
 *                     both BDRV_BLOCK_DATA and BDRV_BLOCK_OFFSET_VALID.
 *                     Should not appear with BDRV_BLOCK_ZERO.
 *
 * If BDRV_BLOCK_OFFSET_VALID is set, the map parameter represents the host
 * offset within the returned BDS that is allocated for the corresponding
 * raw guest data.  Whether that offset actually contains data additionally
 * depends on BDRV_BLOCK_DATA, as follows:
 *
 * DATA ZERO OFFSET_VALID
 *  t    t        t       sectors read as zero, returned file is zero at
 *                        offset
 *  t    f        t       sectors read as valid from file at offset
 *  f    t        t       sectors preallocated, read as zero, returned file
 *                        not necessarily zero at offset
 *  f    f        t       sectors preallocated but read from backing_hd,
 *                        returned file contains garbage at offset
 *  t    t        f       sectors preallocated, read as zero, unknown offset
 *  t    f        f       sectors read from unknown file or offset
 *  f    t        f       not allocated or unknown offset, read as zero
 *  f    f        f       not allocated or unknown offset, read from
 *                        backing_hd
 */
#define BDRV_BLOCK_DATA             0x01
#define BDRV_BLOCK_ZERO             0x02
#define BDRV_BLOCK_OFFSET_VALID     0x04
#define BDRV_BLOCK_RAW              0x08
#define BDRV_BLOCK_ALLOCATED        0x10
#define BDRV_BLOCK_EOF              0x20
#define BDRV_BLOCK_RECURSE          0x40
#define BDRV_BLOCK_COMPRESSED       0x80
335
336 typedef QTAILQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue;
337
338 typedef struct BDRVReopenState {
339 BlockDriverState *bs;
340 int flags;
341 BlockdevDetectZeroesOptions detect_zeroes;
342 bool backing_missing;
343 BlockDriverState *old_backing_bs; /* keep pointer for permissions update */
344 BlockDriverState *old_file_bs; /* keep pointer for permissions update */
345 QDict *options;
346 QDict *explicit_options;
347 void *opaque;
348 } BDRVReopenState;
349
/*
 * Block operation types
 *
 * The enumerators are numbered consecutively from zero, which makes
 * BLOCK_OP_TYPE_MAX equal to the number of operation types listed before
 * it.
 */
typedef enum BlockOpType {
    BLOCK_OP_TYPE_BACKUP_SOURCE = 0,
    BLOCK_OP_TYPE_BACKUP_TARGET,
    BLOCK_OP_TYPE_CHANGE,
    BLOCK_OP_TYPE_COMMIT_SOURCE,
    BLOCK_OP_TYPE_COMMIT_TARGET,
    BLOCK_OP_TYPE_DATAPLANE,
    BLOCK_OP_TYPE_DRIVE_DEL,
    BLOCK_OP_TYPE_EJECT,
    BLOCK_OP_TYPE_EXTERNAL_SNAPSHOT,
    BLOCK_OP_TYPE_INTERNAL_SNAPSHOT,
    BLOCK_OP_TYPE_INTERNAL_SNAPSHOT_DELETE,
    BLOCK_OP_TYPE_MIRROR_SOURCE,
    BLOCK_OP_TYPE_MIRROR_TARGET,
    BLOCK_OP_TYPE_RESIZE,
    BLOCK_OP_TYPE_STREAM,
    BLOCK_OP_TYPE_REPLACE,

    /* Must stay last */
    BLOCK_OP_TYPE_MAX,
} BlockOpType;
372
/* Block node permission constants */
enum {
    /**
     * A user holding the "permission" of consistent reads is guaranteed
     * that their view of the contents of the block device is complete and
     * self-consistent, representing the contents of a disk at a specific
     * point.
     *
     * This holds for most block devices (including their backing files),
     * but the property cannot be maintained in a few situations, for
     * instance for intermediate nodes of a commit block job.
     */
    BLK_PERM_CONSISTENT_READ    = 1 << 0,

    /** This permission is required to change the visible disk contents. */
    BLK_PERM_WRITE              = 1 << 1,

    /**
     * This permission (which is weaker than BLK_PERM_WRITE) is both enough
     * and required for writes to the block node when the caller promises
     * that the visible disk content doesn't change.
     *
     * Because BLK_PERM_WRITE is strictly stronger, either permission is
     * sufficient to perform an unchanging write.
     */
    BLK_PERM_WRITE_UNCHANGED    = 1 << 2,

    /** This permission is required to change the size of a block node. */
    BLK_PERM_RESIZE             = 1 << 3,

    /**
     * There used to be a bit BLK_PERM_GRAPH_MOD, with a value of 0x10,
     * which has since been removed.  QEMU 6.1 and earlier may still lock
     * the corresponding byte in block/file-posix locking, so anyone
     * implementing a new permission must be very careful not to interfere
     * with this old, unused bit.
     */

    BLK_PERM_ALL                = 0x0f,

    DEFAULT_PERM_PASSTHROUGH    = BLK_PERM_CONSISTENT_READ
                                  | BLK_PERM_WRITE
                                  | BLK_PERM_WRITE_UNCHANGED
                                  | BLK_PERM_RESIZE,

    DEFAULT_PERM_UNCHANGED      = BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH,
};
419
/*
 * Flags that parent nodes assign to child nodes to specify what kind of
 * role(s) they take.
 *
 * Every child must have at least one of DATA, METADATA, FILTERED, or COW
 * set.
 *
 *
 * = Connection with bs->children, bs->file and bs->backing fields =
 *
 * 1. Filters
 *
 * Filter drivers have drv->is_filter = true.
 *
 * A filter node has exactly one FILTERED|PRIMARY child.  It may have other
 * children, which must not have these bits (one example is the
 * copy-before-write filter, which also has its target DATA child).
 *
 * Filter nodes never have COW children.
 *
 * For most filters, the filtered child is linked in bs->file and
 * bs->backing is NULL.  For some filters (as an exception) it is the other
 * way around; those drivers have drv->filtered_child_is_backing set to true
 * (see that field's documentation for the drivers this concerns).
 *
 * 2. "raw" driver (block/raw-format.c)
 *
 * Formally it is not a filter (drv->is_filter = false).
 *
 * bs->backing is always NULL.
 *
 * It has only one child, linked in bs->file.  The role of that child is
 * either FILTERED|PRIMARY (like a filter) or DATA|PRIMARY, depending on
 * options.
 *
 * 3. Other drivers
 *
 * They don't have any FILTERED children.
 *
 * They may have at most one COW child, in which case it is linked in
 * bs->backing; otherwise bs->backing is NULL.  A COW child is never
 * PRIMARY.
 *
 * They may have at most one PRIMARY child, in which case it is linked in
 * bs->file; otherwise bs->file is NULL.
 *
 * They may also have other children that have neither the PRIMARY nor the
 * COW bit set.
 */
enum BdrvChildRoleBits {
    /*
     * This child stores data.
     * Any node may have an arbitrary number of such children.
     */
    BDRV_CHILD_DATA         = 0x01,

    /*
     * This child stores metadata.
     * Any node may have an arbitrary number of metadata-storing children.
     */
    BDRV_CHILD_METADATA     = 0x02,

    /*
     * A child that always presents exactly the same visible data as the
     * parent, e.g. by virtue of the parent forwarding all reads and writes.
     * This flag is mutually exclusive with DATA, METADATA, and COW.
     * Any node may have at most one filtered child at a time.
     */
    BDRV_CHILD_FILTERED     = 0x04,

    /*
     * Child from which to read all data that isn't allocated in the parent
     * (i.e., the backing child); such data is copied to the parent through
     * COW (and optionally COR).
     * This flag is mutually exclusive with DATA, METADATA, and FILTERED.
     * Any node may have at most one such backing child at a time.
     */
    BDRV_CHILD_COW          = 0x08,

    /*
     * The primary child.  For most drivers, this is the child whose
     * filename applies best to the parent node.
     * Any node may have at most one primary child at a time.
     */
    BDRV_CHILD_PRIMARY      = 0x10,

    /* Useful combination of flags */
    BDRV_CHILD_IMAGE        = BDRV_CHILD_DATA
                              | BDRV_CHILD_METADATA
                              | BDRV_CHILD_PRIMARY,
};
511
/* Bit mask built by OR-ing together BdrvChildRoleBits values */
typedef unsigned int BdrvChildRole;
514
515 typedef struct BdrvCheckResult {
516 int corruptions;
517 int leaks;
518 int check_errors;
519 int corruptions_fixed;
520 int leaks_fixed;
521 int64_t image_end_offset;
522 BlockFragInfo bfi;
523 } BdrvCheckResult;
524
/* Repair modes for an image check; separate bits, so they can be combined. */
typedef enum {
    BDRV_FIX_LEAKS  = 1 << 0,
    BDRV_FIX_ERRORS = 1 << 1,
} BdrvCheckMode;
529
/*
 * A pair of block sizes.
 * NOTE(review): phys and log presumably stand for the physical and the
 * logical block size; confirm against the users of this structure.
 */
typedef struct BlockSizes {
    uint32_t phys;
    uint32_t log;
} BlockSizes;
534
/* Disk geometry expressed as heads, sectors and cylinders. */
typedef struct HDGeometry {
    uint32_t heads;
    uint32_t sectors;
    uint32_t cylinders;
} HDGeometry;
540
541 /*
542 * Common functions that are neither I/O nor Global State.
543 *
544 * These functions must never call any function from other categories
545 * (I/O, "I/O or GS", Global State) except this one, but can be invoked by
546 * all of them.
547 */
548
549 char *bdrv_perm_names(uint64_t perm);
550 uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm);
551
552 void bdrv_init_with_whitelist(void);
553 bool bdrv_uses_whitelist(void);
554 int bdrv_is_whitelisted(BlockDriver *drv, bool read_only);
555
556 int bdrv_parse_aio(const char *mode, int *flags);
557 int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough);
558 int bdrv_parse_discard_flags(const char *mode, int *flags);
559
560 int path_has_protocol(const char *path);
561 int path_is_absolute(const char *path);
562 char *path_combine(const char *base_path, const char *filename);
563
564 char *bdrv_get_full_backing_filename_from_filename(const char *backed,
565 const char *backing,
566 Error **errp);
567
568 #endif /* BLOCK_COMMON_H */