2 * QEMU System Emulator block driver
4 * Copyright (c) 2003 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 #include "config-host.h"
25 #include "qemu-common.h"
28 #include "block_int.h"
31 #include "qemu-coroutine.h"
32 #include "qmp-commands.h"
33 #include "qemu-timer.h"
36 #include <sys/types.h>
38 #include <sys/ioctl.h>
39 #include <sys/queue.h>
49 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
51 static void bdrv_dev_change_media_cb(BlockDriverState
*bs
, bool load
);
52 static BlockDriverAIOCB
*bdrv_aio_readv_em(BlockDriverState
*bs
,
53 int64_t sector_num
, QEMUIOVector
*qiov
, int nb_sectors
,
54 BlockDriverCompletionFunc
*cb
, void *opaque
);
55 static BlockDriverAIOCB
*bdrv_aio_writev_em(BlockDriverState
*bs
,
56 int64_t sector_num
, QEMUIOVector
*qiov
, int nb_sectors
,
57 BlockDriverCompletionFunc
*cb
, void *opaque
);
58 static int coroutine_fn
bdrv_co_readv_em(BlockDriverState
*bs
,
59 int64_t sector_num
, int nb_sectors
,
61 static int coroutine_fn
bdrv_co_writev_em(BlockDriverState
*bs
,
62 int64_t sector_num
, int nb_sectors
,
64 static int coroutine_fn
bdrv_co_do_readv(BlockDriverState
*bs
,
65 int64_t sector_num
, int nb_sectors
, QEMUIOVector
*qiov
);
66 static int coroutine_fn
bdrv_co_do_writev(BlockDriverState
*bs
,
67 int64_t sector_num
, int nb_sectors
, QEMUIOVector
*qiov
);
68 static BlockDriverAIOCB
*bdrv_co_aio_rw_vector(BlockDriverState
*bs
,
72 BlockDriverCompletionFunc
*cb
,
75 static void coroutine_fn
bdrv_co_do_rw(void *opaque
);
77 static bool bdrv_exceed_bps_limits(BlockDriverState
*bs
, int nb_sectors
,
78 bool is_write
, double elapsed_time
, uint64_t *wait
);
79 static bool bdrv_exceed_iops_limits(BlockDriverState
*bs
, bool is_write
,
80 double elapsed_time
, uint64_t *wait
);
81 static bool bdrv_exceed_io_limits(BlockDriverState
*bs
, int nb_sectors
,
82 bool is_write
, int64_t *wait
);
84 static QTAILQ_HEAD(, BlockDriverState
) bdrv_states
=
85 QTAILQ_HEAD_INITIALIZER(bdrv_states
);
87 static QLIST_HEAD(, BlockDriver
) bdrv_drivers
=
88 QLIST_HEAD_INITIALIZER(bdrv_drivers
);
90 /* The device to use for VM snapshots */
91 static BlockDriverState
*bs_snapshots
;
93 /* If non-zero, use only whitelisted block drivers */
94 static int use_bdrv_whitelist
;
/* Return non-zero when @filename begins with a DOS drive specifier
 * such as "c:" (one ASCII letter followed by a colon).
 */
static int is_windows_drive_prefix(const char *filename)
{
    char c = filename[0];
    int is_letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');

    return is_letter && filename[1] == ':';
}
/* Return non-zero when @filename names a Windows drive or a device
 * namespace path ("d:", "\\.\foo" or "//./foo").
 */
int is_windows_drive(const char *filename)
{
    /* bare drive letter, e.g. "d:" */
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }
    /* device namespace paths */
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL)) {
        return 1;
    }
    return 0;
}
116 /* throttling disk I/O limits */
117 void bdrv_io_limits_disable(BlockDriverState
*bs
)
119 bs
->io_limits_enabled
= false;
121 while (qemu_co_queue_next(&bs
->throttled_reqs
));
123 if (bs
->block_timer
) {
124 qemu_del_timer(bs
->block_timer
);
125 qemu_free_timer(bs
->block_timer
);
126 bs
->block_timer
= NULL
;
132 memset(&bs
->io_base
, 0, sizeof(bs
->io_base
));
135 static void bdrv_block_timer(void *opaque
)
137 BlockDriverState
*bs
= opaque
;
139 qemu_co_queue_next(&bs
->throttled_reqs
);
142 void bdrv_io_limits_enable(BlockDriverState
*bs
)
144 qemu_co_queue_init(&bs
->throttled_reqs
);
145 bs
->block_timer
= qemu_new_timer_ns(vm_clock
, bdrv_block_timer
, bs
);
146 bs
->slice_time
= 5 * BLOCK_IO_SLICE_TIME
;
147 bs
->slice_start
= qemu_get_clock_ns(vm_clock
);
148 bs
->slice_end
= bs
->slice_start
+ bs
->slice_time
;
149 memset(&bs
->io_base
, 0, sizeof(bs
->io_base
));
150 bs
->io_limits_enabled
= true;
153 bool bdrv_io_limits_enabled(BlockDriverState
*bs
)
155 BlockIOLimit
*io_limits
= &bs
->io_limits
;
156 return io_limits
->bps
[BLOCK_IO_LIMIT_READ
]
157 || io_limits
->bps
[BLOCK_IO_LIMIT_WRITE
]
158 || io_limits
->bps
[BLOCK_IO_LIMIT_TOTAL
]
159 || io_limits
->iops
[BLOCK_IO_LIMIT_READ
]
160 || io_limits
->iops
[BLOCK_IO_LIMIT_WRITE
]
161 || io_limits
->iops
[BLOCK_IO_LIMIT_TOTAL
];
164 static void bdrv_io_limits_intercept(BlockDriverState
*bs
,
165 bool is_write
, int nb_sectors
)
167 int64_t wait_time
= -1;
169 if (!qemu_co_queue_empty(&bs
->throttled_reqs
)) {
170 qemu_co_queue_wait(&bs
->throttled_reqs
);
173 /* In fact, we hope to keep each request's timing, in FIFO mode. The next
174 * throttled requests will not be dequeued until the current request is
175 * allowed to be serviced. So if the current request still exceeds the
176 * limits, it will be inserted to the head. All requests followed it will
177 * be still in throttled_reqs queue.
180 while (bdrv_exceed_io_limits(bs
, nb_sectors
, is_write
, &wait_time
)) {
181 qemu_mod_timer(bs
->block_timer
,
182 wait_time
+ qemu_get_clock_ns(vm_clock
));
183 qemu_co_queue_wait_insert_head(&bs
->throttled_reqs
);
186 qemu_co_queue_next(&bs
->throttled_reqs
);
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    /* Drive specifiers like "c:" are not protocols. */
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
#endif

    return strchr(path, ':') != NULL;
}
/* Return non-zero when @path is absolute; an optional "<proto>:" prefix
 * is skipped before the check.
 */
int path_is_absolute(const char *path)
{
    const char *p;

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (*path == '/' || *path == '\\') {
        return 1;
    }
#endif

    p = strchr(path, ':');
    p = p ? p + 1 : path;

#ifdef _WIN32
    return (*p == '/' || *p == '\\');
#else
    return (*p == '/');
#endif
}
/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URL are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* Locate the end of any "<proto>:" prefix in base_path. */
    p = strchr(base_path, ':');
    p = p ? p + 1 : base_path;

    /* Locate the final directory separator. */
    p1 = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *p2 = strrchr(base_path, '\\');
        if (!p1 || p2 > p1) {
            p1 = p2;
        }
    }
#endif
    p1 = p1 ? p1 + 1 : base_path;
    if (p1 > p) {
        p = p1;
    }

    /* Copy the directory part of base_path, then append filename. */
    len = p - base_path;
    if (len > dest_size - 1) {
        len = dest_size - 1;
    }
    memcpy(dest, base_path, len);
    dest[len] = '\0';
    pstrcat(dest, dest_size, filename);
}
266 void bdrv_register(BlockDriver
*bdrv
)
268 /* Block drivers without coroutine functions need emulation */
269 if (!bdrv
->bdrv_co_readv
) {
270 bdrv
->bdrv_co_readv
= bdrv_co_readv_em
;
271 bdrv
->bdrv_co_writev
= bdrv_co_writev_em
;
273 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
274 * the block driver lacks aio we need to emulate that too.
276 if (!bdrv
->bdrv_aio_readv
) {
277 /* add AIO emulation layer */
278 bdrv
->bdrv_aio_readv
= bdrv_aio_readv_em
;
279 bdrv
->bdrv_aio_writev
= bdrv_aio_writev_em
;
283 QLIST_INSERT_HEAD(&bdrv_drivers
, bdrv
, list
);
286 /* create a new block device (by default it is empty) */
287 BlockDriverState
*bdrv_new(const char *device_name
)
289 BlockDriverState
*bs
;
291 bs
= g_malloc0(sizeof(BlockDriverState
));
292 pstrcpy(bs
->device_name
, sizeof(bs
->device_name
), device_name
);
293 if (device_name
[0] != '\0') {
294 QTAILQ_INSERT_TAIL(&bdrv_states
, bs
, list
);
296 bdrv_iostatus_disable(bs
);
300 BlockDriver
*bdrv_find_format(const char *format_name
)
303 QLIST_FOREACH(drv1
, &bdrv_drivers
, list
) {
304 if (!strcmp(drv1
->format_name
, format_name
)) {
311 static int bdrv_is_whitelisted(BlockDriver
*drv
)
313 static const char *whitelist
[] = {
314 CONFIG_BDRV_WHITELIST
319 return 1; /* no whitelist, anything goes */
321 for (p
= whitelist
; *p
; p
++) {
322 if (!strcmp(drv
->format_name
, *p
)) {
329 BlockDriver
*bdrv_find_whitelisted_format(const char *format_name
)
331 BlockDriver
*drv
= bdrv_find_format(format_name
);
332 return drv
&& bdrv_is_whitelisted(drv
) ? drv
: NULL
;
335 int bdrv_create(BlockDriver
*drv
, const char* filename
,
336 QEMUOptionParameter
*options
)
338 if (!drv
->bdrv_create
)
341 return drv
->bdrv_create(filename
, options
);
344 int bdrv_create_file(const char* filename
, QEMUOptionParameter
*options
)
348 drv
= bdrv_find_protocol(filename
);
353 return bdrv_create(drv
, filename
, options
);
#ifdef _WIN32
/* Fill @filename (capacity @size) with a fresh temporary file name. */
void get_tmp_filename(char *filename, int size)
{
    char temp_dir[MAX_PATH];

    GetTempPath(MAX_PATH, temp_dir);
    GetTempFileName(temp_dir, "qem", 0, filename);
}
#else
/* Fill @filename (capacity @size) with the name of a newly created
 * unique temporary file under $TMPDIR (default /tmp).  The file itself
 * is created empty by mkstemp().
 */
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;
    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/tmp";
    }
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    fd = mkstemp(filename);
    /* Fix: mkstemp() can fail (returns -1); don't close(-1). */
    if (fd >= 0) {
        close(fd);
    }
}
#endif
380 * Detect host devices. By convention, /dev/cdrom[N] is always
381 * recognized as a host CDROM.
383 static BlockDriver
*find_hdev_driver(const char *filename
)
385 int score_max
= 0, score
;
386 BlockDriver
*drv
= NULL
, *d
;
388 QLIST_FOREACH(d
, &bdrv_drivers
, list
) {
389 if (d
->bdrv_probe_device
) {
390 score
= d
->bdrv_probe_device(filename
);
391 if (score
> score_max
) {
401 BlockDriver
*bdrv_find_protocol(const char *filename
)
408 /* TODO Drivers without bdrv_file_open must be specified explicitly */
411 * XXX(hch): we really should not let host device detection
412 * override an explicit protocol specification, but moving this
413 * later breaks access to device names with colons in them.
414 * Thanks to the brain-dead persistent naming schemes on udev-
415 * based Linux systems those actually are quite common.
417 drv1
= find_hdev_driver(filename
);
422 if (!path_has_protocol(filename
)) {
423 return bdrv_find_format("file");
425 p
= strchr(filename
, ':');
428 if (len
> sizeof(protocol
) - 1)
429 len
= sizeof(protocol
) - 1;
430 memcpy(protocol
, filename
, len
);
431 protocol
[len
] = '\0';
432 QLIST_FOREACH(drv1
, &bdrv_drivers
, list
) {
433 if (drv1
->protocol_name
&&
434 !strcmp(drv1
->protocol_name
, protocol
)) {
441 static int find_image_format(const char *filename
, BlockDriver
**pdrv
)
443 int ret
, score
, score_max
;
444 BlockDriver
*drv1
, *drv
;
446 BlockDriverState
*bs
;
448 ret
= bdrv_file_open(&bs
, filename
, 0);
454 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
455 if (bs
->sg
|| !bdrv_is_inserted(bs
)) {
457 drv
= bdrv_find_format("raw");
465 ret
= bdrv_pread(bs
, 0, buf
, sizeof(buf
));
474 QLIST_FOREACH(drv1
, &bdrv_drivers
, list
) {
475 if (drv1
->bdrv_probe
) {
476 score
= drv1
->bdrv_probe(buf
, ret
, filename
);
477 if (score
> score_max
) {
491 * Set the current 'total_sectors' value
493 static int refresh_total_sectors(BlockDriverState
*bs
, int64_t hint
)
495 BlockDriver
*drv
= bs
->drv
;
497 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
501 /* query actual device if possible, otherwise just trust the hint */
502 if (drv
->bdrv_getlength
) {
503 int64_t length
= drv
->bdrv_getlength(bs
);
507 hint
= length
>> BDRV_SECTOR_BITS
;
510 bs
->total_sectors
= hint
;
515 * Set open flags for a given cache mode
517 * Return 0 on success, -1 if the cache mode was invalid.
519 int bdrv_parse_cache_flags(const char *mode
, int *flags
)
521 *flags
&= ~BDRV_O_CACHE_MASK
;
523 if (!strcmp(mode
, "off") || !strcmp(mode
, "none")) {
524 *flags
|= BDRV_O_NOCACHE
| BDRV_O_CACHE_WB
;
525 } else if (!strcmp(mode
, "directsync")) {
526 *flags
|= BDRV_O_NOCACHE
;
527 } else if (!strcmp(mode
, "writeback")) {
528 *flags
|= BDRV_O_CACHE_WB
;
529 } else if (!strcmp(mode
, "unsafe")) {
530 *flags
|= BDRV_O_CACHE_WB
;
531 *flags
|= BDRV_O_NO_FLUSH
;
532 } else if (!strcmp(mode
, "writethrough")) {
533 /* this is the default */
542 * The copy-on-read flag is actually a reference count so multiple users may
543 * use the feature without worrying about clobbering its previous state.
544 * Copy-on-read stays enabled until all users have called to disable it.
546 void bdrv_enable_copy_on_read(BlockDriverState
*bs
)
551 void bdrv_disable_copy_on_read(BlockDriverState
*bs
)
553 assert(bs
->copy_on_read
> 0);
558 * Common part for opening disk images and files
560 static int bdrv_open_common(BlockDriverState
*bs
, const char *filename
,
561 int flags
, BlockDriver
*drv
)
567 trace_bdrv_open_common(bs
, filename
, flags
, drv
->format_name
);
570 bs
->total_sectors
= 0;
574 bs
->open_flags
= flags
;
576 bs
->buffer_alignment
= 512;
578 assert(bs
->copy_on_read
== 0); /* bdrv_new() and bdrv_close() make it so */
579 if ((flags
& BDRV_O_RDWR
) && (flags
& BDRV_O_COPY_ON_READ
)) {
580 bdrv_enable_copy_on_read(bs
);
583 pstrcpy(bs
->filename
, sizeof(bs
->filename
), filename
);
584 bs
->backing_file
[0] = '\0';
586 if (use_bdrv_whitelist
&& !bdrv_is_whitelisted(drv
)) {
591 bs
->opaque
= g_malloc0(drv
->instance_size
);
593 bs
->enable_write_cache
= !!(flags
& BDRV_O_CACHE_WB
);
596 * Clear flags that are internal to the block layer before opening the
599 open_flags
= flags
& ~(BDRV_O_SNAPSHOT
| BDRV_O_NO_BACKING
);
602 * Snapshots should be writable.
604 if (bs
->is_temporary
) {
605 open_flags
|= BDRV_O_RDWR
;
608 bs
->keep_read_only
= bs
->read_only
= !(open_flags
& BDRV_O_RDWR
);
610 /* Open the image, either directly or using a protocol */
611 if (drv
->bdrv_file_open
) {
612 ret
= drv
->bdrv_file_open(bs
, filename
, open_flags
);
614 ret
= bdrv_file_open(&bs
->file
, filename
, open_flags
);
616 ret
= drv
->bdrv_open(bs
, open_flags
);
624 ret
= refresh_total_sectors(bs
, bs
->total_sectors
);
630 if (bs
->is_temporary
) {
638 bdrv_delete(bs
->file
);
648 * Opens a file using a protocol (file, host_device, nbd, ...)
650 int bdrv_file_open(BlockDriverState
**pbs
, const char *filename
, int flags
)
652 BlockDriverState
*bs
;
656 drv
= bdrv_find_protocol(filename
);
662 ret
= bdrv_open_common(bs
, filename
, flags
, drv
);
673 * Opens a disk image (raw, qcow2, vmdk, ...)
675 int bdrv_open(BlockDriverState
*bs
, const char *filename
, int flags
,
679 char tmp_filename
[PATH_MAX
];
681 if (flags
& BDRV_O_SNAPSHOT
) {
682 BlockDriverState
*bs1
;
685 BlockDriver
*bdrv_qcow2
;
686 QEMUOptionParameter
*options
;
687 char backing_filename
[PATH_MAX
];
689 /* if snapshot, we create a temporary backing file and open it
690 instead of opening 'filename' directly */
692 /* if there is a backing file, use it */
694 ret
= bdrv_open(bs1
, filename
, 0, drv
);
699 total_size
= bdrv_getlength(bs1
) & BDRV_SECTOR_MASK
;
701 if (bs1
->drv
&& bs1
->drv
->protocol_name
)
706 get_tmp_filename(tmp_filename
, sizeof(tmp_filename
));
708 /* Real path is meaningless for protocols */
710 snprintf(backing_filename
, sizeof(backing_filename
),
712 else if (!realpath(filename
, backing_filename
))
715 bdrv_qcow2
= bdrv_find_format("qcow2");
716 options
= parse_option_parameters("", bdrv_qcow2
->create_options
, NULL
);
718 set_option_parameter_int(options
, BLOCK_OPT_SIZE
, total_size
);
719 set_option_parameter(options
, BLOCK_OPT_BACKING_FILE
, backing_filename
);
721 set_option_parameter(options
, BLOCK_OPT_BACKING_FMT
,
725 ret
= bdrv_create(bdrv_qcow2
, tmp_filename
, options
);
726 free_option_parameters(options
);
731 filename
= tmp_filename
;
733 bs
->is_temporary
= 1;
736 /* Find the right image format driver */
738 ret
= find_image_format(filename
, &drv
);
742 goto unlink_and_fail
;
746 ret
= bdrv_open_common(bs
, filename
, flags
, drv
);
748 goto unlink_and_fail
;
751 /* If there is a backing file, use it */
752 if ((flags
& BDRV_O_NO_BACKING
) == 0 && bs
->backing_file
[0] != '\0') {
753 char backing_filename
[PATH_MAX
];
755 BlockDriver
*back_drv
= NULL
;
757 bs
->backing_hd
= bdrv_new("");
759 if (path_has_protocol(bs
->backing_file
)) {
760 pstrcpy(backing_filename
, sizeof(backing_filename
),
763 path_combine(backing_filename
, sizeof(backing_filename
),
764 filename
, bs
->backing_file
);
767 if (bs
->backing_format
[0] != '\0') {
768 back_drv
= bdrv_find_format(bs
->backing_format
);
771 /* backing files always opened read-only */
773 flags
& ~(BDRV_O_RDWR
| BDRV_O_SNAPSHOT
| BDRV_O_NO_BACKING
);
775 ret
= bdrv_open(bs
->backing_hd
, backing_filename
, back_flags
, back_drv
);
780 if (bs
->is_temporary
) {
781 bs
->backing_hd
->keep_read_only
= !(flags
& BDRV_O_RDWR
);
783 /* base image inherits from "parent" */
784 bs
->backing_hd
->keep_read_only
= bs
->keep_read_only
;
788 if (!bdrv_key_required(bs
)) {
789 bdrv_dev_change_media_cb(bs
, true);
792 /* throttling disk I/O limits */
793 if (bs
->io_limits_enabled
) {
794 bdrv_io_limits_enable(bs
);
800 if (bs
->is_temporary
) {
806 void bdrv_close(BlockDriverState
*bs
)
809 if (bs
== bs_snapshots
) {
812 if (bs
->backing_hd
) {
813 bdrv_delete(bs
->backing_hd
);
814 bs
->backing_hd
= NULL
;
816 bs
->drv
->bdrv_close(bs
);
819 if (bs
->is_temporary
) {
820 unlink(bs
->filename
);
825 bs
->copy_on_read
= 0;
827 if (bs
->file
!= NULL
) {
828 bdrv_close(bs
->file
);
831 bdrv_dev_change_media_cb(bs
, false);
834 /*throttling disk I/O limits*/
835 if (bs
->io_limits_enabled
) {
836 bdrv_io_limits_disable(bs
);
840 void bdrv_close_all(void)
842 BlockDriverState
*bs
;
844 QTAILQ_FOREACH(bs
, &bdrv_states
, list
) {
849 /* make a BlockDriverState anonymous by removing from bdrv_state list.
850 Also, NULL terminate the device_name to prevent double remove */
851 void bdrv_make_anon(BlockDriverState
*bs
)
853 if (bs
->device_name
[0] != '\0') {
854 QTAILQ_REMOVE(&bdrv_states
, bs
, list
);
856 bs
->device_name
[0] = '\0';
859 void bdrv_delete(BlockDriverState
*bs
)
863 /* remove from list, if necessary */
867 if (bs
->file
!= NULL
) {
868 bdrv_delete(bs
->file
);
871 assert(bs
!= bs_snapshots
);
875 int bdrv_attach_dev(BlockDriverState
*bs
, void *dev
)
876 /* TODO change to DeviceState *dev when all users are qdevified */
882 bdrv_iostatus_reset(bs
);
886 /* TODO qdevified devices don't use this, remove when devices are qdevified */
887 void bdrv_attach_dev_nofail(BlockDriverState
*bs
, void *dev
)
889 if (bdrv_attach_dev(bs
, dev
) < 0) {
894 void bdrv_detach_dev(BlockDriverState
*bs
, void *dev
)
895 /* TODO change to DeviceState *dev when all users are qdevified */
897 assert(bs
->dev
== dev
);
900 bs
->dev_opaque
= NULL
;
901 bs
->buffer_alignment
= 512;
904 /* TODO change to return DeviceState * when all users are qdevified */
905 void *bdrv_get_attached_dev(BlockDriverState
*bs
)
910 void bdrv_set_dev_ops(BlockDriverState
*bs
, const BlockDevOps
*ops
,
914 bs
->dev_opaque
= opaque
;
915 if (bdrv_dev_has_removable_media(bs
) && bs
== bs_snapshots
) {
920 static void bdrv_dev_change_media_cb(BlockDriverState
*bs
, bool load
)
922 if (bs
->dev_ops
&& bs
->dev_ops
->change_media_cb
) {
923 bs
->dev_ops
->change_media_cb(bs
->dev_opaque
, load
);
927 bool bdrv_dev_has_removable_media(BlockDriverState
*bs
)
929 return !bs
->dev
|| (bs
->dev_ops
&& bs
->dev_ops
->change_media_cb
);
932 void bdrv_dev_eject_request(BlockDriverState
*bs
, bool force
)
934 if (bs
->dev_ops
&& bs
->dev_ops
->eject_request_cb
) {
935 bs
->dev_ops
->eject_request_cb(bs
->dev_opaque
, force
);
939 bool bdrv_dev_is_tray_open(BlockDriverState
*bs
)
941 if (bs
->dev_ops
&& bs
->dev_ops
->is_tray_open
) {
942 return bs
->dev_ops
->is_tray_open(bs
->dev_opaque
);
947 static void bdrv_dev_resize_cb(BlockDriverState
*bs
)
949 if (bs
->dev_ops
&& bs
->dev_ops
->resize_cb
) {
950 bs
->dev_ops
->resize_cb(bs
->dev_opaque
);
954 bool bdrv_dev_is_medium_locked(BlockDriverState
*bs
)
956 if (bs
->dev_ops
&& bs
->dev_ops
->is_medium_locked
) {
957 return bs
->dev_ops
->is_medium_locked(bs
->dev_opaque
);
963 * Run consistency checks on an image
965 * Returns 0 if the check could be completed (it doesn't mean that the image is
966 * free of errors) or -errno when an internal error occurred. The results of the
967 * check are stored in res.
969 int bdrv_check(BlockDriverState
*bs
, BdrvCheckResult
*res
)
971 if (bs
->drv
->bdrv_check
== NULL
) {
975 memset(res
, 0, sizeof(*res
));
976 return bs
->drv
->bdrv_check(bs
, res
);
979 #define COMMIT_BUF_SECTORS 2048
981 /* commit COW file into the raw image */
982 int bdrv_commit(BlockDriverState
*bs
)
984 BlockDriver
*drv
= bs
->drv
;
985 BlockDriver
*backing_drv
;
986 int64_t sector
, total_sectors
;
987 int n
, ro
, open_flags
;
988 int ret
= 0, rw_ret
= 0;
991 BlockDriverState
*bs_rw
, *bs_ro
;
996 if (!bs
->backing_hd
) {
1000 if (bs
->backing_hd
->keep_read_only
) {
1004 backing_drv
= bs
->backing_hd
->drv
;
1005 ro
= bs
->backing_hd
->read_only
;
1006 strncpy(filename
, bs
->backing_hd
->filename
, sizeof(filename
));
1007 open_flags
= bs
->backing_hd
->open_flags
;
1011 bdrv_delete(bs
->backing_hd
);
1012 bs
->backing_hd
= NULL
;
1013 bs_rw
= bdrv_new("");
1014 rw_ret
= bdrv_open(bs_rw
, filename
, open_flags
| BDRV_O_RDWR
,
1018 /* try to re-open read-only */
1019 bs_ro
= bdrv_new("");
1020 ret
= bdrv_open(bs_ro
, filename
, open_flags
& ~BDRV_O_RDWR
,
1024 /* drive not functional anymore */
1028 bs
->backing_hd
= bs_ro
;
1031 bs
->backing_hd
= bs_rw
;
1034 total_sectors
= bdrv_getlength(bs
) >> BDRV_SECTOR_BITS
;
1035 buf
= g_malloc(COMMIT_BUF_SECTORS
* BDRV_SECTOR_SIZE
);
1037 for (sector
= 0; sector
< total_sectors
; sector
+= n
) {
1038 if (bdrv_is_allocated(bs
, sector
, COMMIT_BUF_SECTORS
, &n
)) {
1040 if (bdrv_read(bs
, sector
, buf
, n
) != 0) {
1045 if (bdrv_write(bs
->backing_hd
, sector
, buf
, n
) != 0) {
1052 if (drv
->bdrv_make_empty
) {
1053 ret
= drv
->bdrv_make_empty(bs
);
1058 * Make sure all data we wrote to the backing device is actually
1062 bdrv_flush(bs
->backing_hd
);
1069 bdrv_delete(bs
->backing_hd
);
1070 bs
->backing_hd
= NULL
;
1071 bs_ro
= bdrv_new("");
1072 ret
= bdrv_open(bs_ro
, filename
, open_flags
& ~BDRV_O_RDWR
,
1076 /* drive not functional anymore */
1080 bs
->backing_hd
= bs_ro
;
1081 bs
->backing_hd
->keep_read_only
= 0;
1087 void bdrv_commit_all(void)
1089 BlockDriverState
*bs
;
1091 QTAILQ_FOREACH(bs
, &bdrv_states
, list
) {
1096 struct BdrvTrackedRequest
{
1097 BlockDriverState
*bs
;
1101 QLIST_ENTRY(BdrvTrackedRequest
) list
;
1102 Coroutine
*co
; /* owner, used for deadlock detection */
1103 CoQueue wait_queue
; /* coroutines blocked on this request */
1107 * Remove an active request from the tracked requests list
1109 * This function should be called when a tracked request is completing.
1111 static void tracked_request_end(BdrvTrackedRequest
*req
)
1113 QLIST_REMOVE(req
, list
);
1114 qemu_co_queue_restart_all(&req
->wait_queue
);
1118 * Add an active request to the tracked requests list
1120 static void tracked_request_begin(BdrvTrackedRequest
*req
,
1121 BlockDriverState
*bs
,
1123 int nb_sectors
, bool is_write
)
1125 *req
= (BdrvTrackedRequest
){
1127 .sector_num
= sector_num
,
1128 .nb_sectors
= nb_sectors
,
1129 .is_write
= is_write
,
1130 .co
= qemu_coroutine_self(),
1133 qemu_co_queue_init(&req
->wait_queue
);
1135 QLIST_INSERT_HEAD(&bs
->tracked_requests
, req
, list
);
1139 * Round a region to cluster boundaries
1141 static void round_to_clusters(BlockDriverState
*bs
,
1142 int64_t sector_num
, int nb_sectors
,
1143 int64_t *cluster_sector_num
,
1144 int *cluster_nb_sectors
)
1146 BlockDriverInfo bdi
;
1148 if (bdrv_get_info(bs
, &bdi
) < 0 || bdi
.cluster_size
== 0) {
1149 *cluster_sector_num
= sector_num
;
1150 *cluster_nb_sectors
= nb_sectors
;
1152 int64_t c
= bdi
.cluster_size
/ BDRV_SECTOR_SIZE
;
1153 *cluster_sector_num
= QEMU_ALIGN_DOWN(sector_num
, c
);
1154 *cluster_nb_sectors
= QEMU_ALIGN_UP(sector_num
- *cluster_sector_num
+
1159 static bool tracked_request_overlaps(BdrvTrackedRequest
*req
,
1160 int64_t sector_num
, int nb_sectors
) {
1162 if (sector_num
>= req
->sector_num
+ req
->nb_sectors
) {
1166 if (req
->sector_num
>= sector_num
+ nb_sectors
) {
1172 static void coroutine_fn
wait_for_overlapping_requests(BlockDriverState
*bs
,
1173 int64_t sector_num
, int nb_sectors
)
1175 BdrvTrackedRequest
*req
;
1176 int64_t cluster_sector_num
;
1177 int cluster_nb_sectors
;
1180 /* If we touch the same cluster it counts as an overlap. This guarantees
1181 * that allocating writes will be serialized and not race with each other
1182 * for the same cluster. For example, in copy-on-read it ensures that the
1183 * CoR read and write operations are atomic and guest writes cannot
1184 * interleave between them.
1186 round_to_clusters(bs
, sector_num
, nb_sectors
,
1187 &cluster_sector_num
, &cluster_nb_sectors
);
1191 QLIST_FOREACH(req
, &bs
->tracked_requests
, list
) {
1192 if (tracked_request_overlaps(req
, cluster_sector_num
,
1193 cluster_nb_sectors
)) {
1194 /* Hitting this means there was a reentrant request, for
1195 * example, a block driver issuing nested requests. This must
1196 * never happen since it means deadlock.
1198 assert(qemu_coroutine_self() != req
->co
);
1200 qemu_co_queue_wait(&req
->wait_queue
);
1211 * -EINVAL - backing format specified, but no file
1212 * -ENOSPC - can't update the backing file because no space is left in the
1214 * -ENOTSUP - format driver doesn't support changing the backing file
1216 int bdrv_change_backing_file(BlockDriverState
*bs
,
1217 const char *backing_file
, const char *backing_fmt
)
1219 BlockDriver
*drv
= bs
->drv
;
1221 if (drv
->bdrv_change_backing_file
!= NULL
) {
1222 return drv
->bdrv_change_backing_file(bs
, backing_file
, backing_fmt
);
1228 static int bdrv_check_byte_request(BlockDriverState
*bs
, int64_t offset
,
1233 if (!bdrv_is_inserted(bs
))
1239 len
= bdrv_getlength(bs
);
1244 if ((offset
> len
) || (len
- offset
< size
))
1250 static int bdrv_check_request(BlockDriverState
*bs
, int64_t sector_num
,
1253 return bdrv_check_byte_request(bs
, sector_num
* BDRV_SECTOR_SIZE
,
1254 nb_sectors
* BDRV_SECTOR_SIZE
);
1257 typedef struct RwCo
{
1258 BlockDriverState
*bs
;
1266 static void coroutine_fn
bdrv_rw_co_entry(void *opaque
)
1268 RwCo
*rwco
= opaque
;
1270 if (!rwco
->is_write
) {
1271 rwco
->ret
= bdrv_co_do_readv(rwco
->bs
, rwco
->sector_num
,
1272 rwco
->nb_sectors
, rwco
->qiov
);
1274 rwco
->ret
= bdrv_co_do_writev(rwco
->bs
, rwco
->sector_num
,
1275 rwco
->nb_sectors
, rwco
->qiov
);
1280 * Process a synchronous request using coroutines
1282 static int bdrv_rw_co(BlockDriverState
*bs
, int64_t sector_num
, uint8_t *buf
,
1283 int nb_sectors
, bool is_write
)
1286 struct iovec iov
= {
1287 .iov_base
= (void *)buf
,
1288 .iov_len
= nb_sectors
* BDRV_SECTOR_SIZE
,
1293 .sector_num
= sector_num
,
1294 .nb_sectors
= nb_sectors
,
1296 .is_write
= is_write
,
1300 qemu_iovec_init_external(&qiov
, &iov
, 1);
1302 if (qemu_in_coroutine()) {
1303 /* Fast-path if already in coroutine context */
1304 bdrv_rw_co_entry(&rwco
);
1306 co
= qemu_coroutine_create(bdrv_rw_co_entry
);
1307 qemu_coroutine_enter(co
, &rwco
);
1308 while (rwco
.ret
== NOT_DONE
) {
1315 /* return < 0 if error. See bdrv_write() for the return codes */
1316 int bdrv_read(BlockDriverState
*bs
, int64_t sector_num
,
1317 uint8_t *buf
, int nb_sectors
)
1319 return bdrv_rw_co(bs
, sector_num
, buf
, nb_sectors
, false);
1322 static void set_dirty_bitmap(BlockDriverState
*bs
, int64_t sector_num
,
1323 int nb_sectors
, int dirty
)
1326 unsigned long val
, idx
, bit
;
1328 start
= sector_num
/ BDRV_SECTORS_PER_DIRTY_CHUNK
;
1329 end
= (sector_num
+ nb_sectors
- 1) / BDRV_SECTORS_PER_DIRTY_CHUNK
;
1331 for (; start
<= end
; start
++) {
1332 idx
= start
/ (sizeof(unsigned long) * 8);
1333 bit
= start
% (sizeof(unsigned long) * 8);
1334 val
= bs
->dirty_bitmap
[idx
];
1336 if (!(val
& (1UL << bit
))) {
1341 if (val
& (1UL << bit
)) {
1343 val
&= ~(1UL << bit
);
1346 bs
->dirty_bitmap
[idx
] = val
;
1350 /* Return < 0 if error. Important errors are:
1351 -EIO generic I/O error (may happen for all errors)
1352 -ENOMEDIUM No media inserted.
1353 -EINVAL Invalid sector number or nb_sectors
1354 -EACCES Trying to write a read-only device
1356 int bdrv_write(BlockDriverState
*bs
, int64_t sector_num
,
1357 const uint8_t *buf
, int nb_sectors
)
1359 return bdrv_rw_co(bs
, sector_num
, (uint8_t *)buf
, nb_sectors
, true);
1362 int bdrv_pread(BlockDriverState
*bs
, int64_t offset
,
1363 void *buf
, int count1
)
1365 uint8_t tmp_buf
[BDRV_SECTOR_SIZE
];
1366 int len
, nb_sectors
, count
;
1371 /* first read to align to sector start */
1372 len
= (BDRV_SECTOR_SIZE
- offset
) & (BDRV_SECTOR_SIZE
- 1);
1375 sector_num
= offset
>> BDRV_SECTOR_BITS
;
1377 if ((ret
= bdrv_read(bs
, sector_num
, tmp_buf
, 1)) < 0)
1379 memcpy(buf
, tmp_buf
+ (offset
& (BDRV_SECTOR_SIZE
- 1)), len
);
1387 /* read the sectors "in place" */
1388 nb_sectors
= count
>> BDRV_SECTOR_BITS
;
1389 if (nb_sectors
> 0) {
1390 if ((ret
= bdrv_read(bs
, sector_num
, buf
, nb_sectors
)) < 0)
1392 sector_num
+= nb_sectors
;
1393 len
= nb_sectors
<< BDRV_SECTOR_BITS
;
1398 /* add data from the last sector */
1400 if ((ret
= bdrv_read(bs
, sector_num
, tmp_buf
, 1)) < 0)
1402 memcpy(buf
, tmp_buf
, count
);
1407 int bdrv_pwrite(BlockDriverState
*bs
, int64_t offset
,
1408 const void *buf
, int count1
)
1410 uint8_t tmp_buf
[BDRV_SECTOR_SIZE
];
1411 int len
, nb_sectors
, count
;
1416 /* first write to align to sector start */
1417 len
= (BDRV_SECTOR_SIZE
- offset
) & (BDRV_SECTOR_SIZE
- 1);
1420 sector_num
= offset
>> BDRV_SECTOR_BITS
;
1422 if ((ret
= bdrv_read(bs
, sector_num
, tmp_buf
, 1)) < 0)
1424 memcpy(tmp_buf
+ (offset
& (BDRV_SECTOR_SIZE
- 1)), buf
, len
);
1425 if ((ret
= bdrv_write(bs
, sector_num
, tmp_buf
, 1)) < 0)
1434 /* write the sectors "in place" */
1435 nb_sectors
= count
>> BDRV_SECTOR_BITS
;
1436 if (nb_sectors
> 0) {
1437 if ((ret
= bdrv_write(bs
, sector_num
, buf
, nb_sectors
)) < 0)
1439 sector_num
+= nb_sectors
;
1440 len
= nb_sectors
<< BDRV_SECTOR_BITS
;
1445 /* add data from the last sector */
1447 if ((ret
= bdrv_read(bs
, sector_num
, tmp_buf
, 1)) < 0)
1449 memcpy(tmp_buf
, buf
, count
);
1450 if ((ret
= bdrv_write(bs
, sector_num
, tmp_buf
, 1)) < 0)
1457 * Writes to the file and ensures that no writes are reordered across this
1458 * request (acts as a barrier)
1460 * Returns 0 on success, -errno in error cases.
1462 int bdrv_pwrite_sync(BlockDriverState
*bs
, int64_t offset
,
1463 const void *buf
, int count
)
1467 ret
= bdrv_pwrite(bs
, offset
, buf
, count
);
1472 /* No flush needed for cache modes that use O_DSYNC */
1473 if ((bs
->open_flags
& BDRV_O_CACHE_WB
) != 0) {
1480 static int coroutine_fn
bdrv_co_copy_on_readv(BlockDriverState
*bs
,
1481 int64_t sector_num
, int nb_sectors
, QEMUIOVector
*qiov
)
1483 /* Perform I/O through a temporary buffer so that users who scribble over
1484 * their read buffer while the operation is in progress do not end up
1485 * modifying the image file. This is critical for zero-copy guest I/O
1486 * where anything might happen inside guest memory.
1488 void *bounce_buffer
;
1491 QEMUIOVector bounce_qiov
;
1492 int64_t cluster_sector_num
;
1493 int cluster_nb_sectors
;
1497 /* Cover entire cluster so no additional backing file I/O is required when
1498 * allocating cluster in the image file.
1500 round_to_clusters(bs
, sector_num
, nb_sectors
,
1501 &cluster_sector_num
, &cluster_nb_sectors
);
1503 trace_bdrv_co_copy_on_readv(bs
, sector_num
, nb_sectors
,
1504 cluster_sector_num
, cluster_nb_sectors
);
1506 iov
.iov_len
= cluster_nb_sectors
* BDRV_SECTOR_SIZE
;
1507 iov
.iov_base
= bounce_buffer
= qemu_blockalign(bs
, iov
.iov_len
);
1508 qemu_iovec_init_external(&bounce_qiov
, &iov
, 1);
1510 ret
= bs
->drv
->bdrv_co_readv(bs
, cluster_sector_num
, cluster_nb_sectors
,
1516 ret
= bs
->drv
->bdrv_co_writev(bs
, cluster_sector_num
, cluster_nb_sectors
,
1519 /* It might be okay to ignore write errors for guest requests. If this
1520 * is a deliberate copy-on-read then we don't want to ignore the error.
1521 * Simply report it in all cases.
1526 skip_bytes
= (sector_num
- cluster_sector_num
) * BDRV_SECTOR_SIZE
;
1527 qemu_iovec_from_buffer(qiov
, bounce_buffer
+ skip_bytes
,
1528 nb_sectors
* BDRV_SECTOR_SIZE
);
1531 qemu_vfree(bounce_buffer
);
1536 * Handle a read request in coroutine context
1538 static int coroutine_fn
bdrv_co_do_readv(BlockDriverState
*bs
,
1539 int64_t sector_num
, int nb_sectors
, QEMUIOVector
*qiov
)
1541 BlockDriver
*drv
= bs
->drv
;
1542 BdrvTrackedRequest req
;
1548 if (bdrv_check_request(bs
, sector_num
, nb_sectors
)) {
1552 /* throttling disk read I/O */
1553 if (bs
->io_limits_enabled
) {
1554 bdrv_io_limits_intercept(bs
, false, nb_sectors
);
1557 if (bs
->copy_on_read
) {
1558 wait_for_overlapping_requests(bs
, sector_num
, nb_sectors
);
1561 tracked_request_begin(&req
, bs
, sector_num
, nb_sectors
, false);
1563 if (bs
->copy_on_read
) {
1566 ret
= bdrv_co_is_allocated(bs
, sector_num
, nb_sectors
, &pnum
);
1571 if (!ret
|| pnum
!= nb_sectors
) {
1572 ret
= bdrv_co_copy_on_readv(bs
, sector_num
, nb_sectors
, qiov
);
1577 ret
= drv
->bdrv_co_readv(bs
, sector_num
, nb_sectors
, qiov
);
1580 tracked_request_end(&req
);
/*
 * Read 'nb_sectors' sectors starting at 'sector_num' from 'bs' into 'qiov'.
 *
 * Public coroutine entry point for reads: emits a trace event and forwards
 * to the common request handler, which applies request validation, I/O
 * throttling, copy-on-read handling and request tracking before calling
 * into the driver.
 *
 * Returns 0 on success, -errno on failure.  Must be called from coroutine
 * context.
 */
int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov);
}
1593 * Handle a write request in coroutine context
1595 static int coroutine_fn
bdrv_co_do_writev(BlockDriverState
*bs
,
1596 int64_t sector_num
, int nb_sectors
, QEMUIOVector
*qiov
)
1598 BlockDriver
*drv
= bs
->drv
;
1599 BdrvTrackedRequest req
;
1605 if (bs
->read_only
) {
1608 if (bdrv_check_request(bs
, sector_num
, nb_sectors
)) {
1612 /* throttling disk write I/O */
1613 if (bs
->io_limits_enabled
) {
1614 bdrv_io_limits_intercept(bs
, true, nb_sectors
);
1617 if (bs
->copy_on_read
) {
1618 wait_for_overlapping_requests(bs
, sector_num
, nb_sectors
);
1621 tracked_request_begin(&req
, bs
, sector_num
, nb_sectors
, true);
1623 ret
= drv
->bdrv_co_writev(bs
, sector_num
, nb_sectors
, qiov
);
1625 if (bs
->dirty_bitmap
) {
1626 set_dirty_bitmap(bs
, sector_num
, nb_sectors
, 1);
1629 if (bs
->wr_highest_sector
< sector_num
+ nb_sectors
- 1) {
1630 bs
->wr_highest_sector
= sector_num
+ nb_sectors
- 1;
1633 tracked_request_end(&req
);
/*
 * Write 'nb_sectors' sectors starting at 'sector_num' to 'bs' from 'qiov'.
 *
 * Public coroutine entry point for writes: emits a trace event and forwards
 * to the common request handler, which applies request validation, I/O
 * throttling, dirty-bitmap updates and request tracking before calling
 * into the driver.
 *
 * Returns 0 on success, -errno on failure.  Must be called from coroutine
 * context.
 */
int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov);
}
1647 * Truncate file to 'offset' bytes (needed only for file protocols)
1649 int bdrv_truncate(BlockDriverState
*bs
, int64_t offset
)
1651 BlockDriver
*drv
= bs
->drv
;
1655 if (!drv
->bdrv_truncate
)
1659 if (bdrv_in_use(bs
))
1661 ret
= drv
->bdrv_truncate(bs
, offset
);
1663 ret
= refresh_total_sectors(bs
, offset
>> BDRV_SECTOR_BITS
);
1664 bdrv_dev_resize_cb(bs
);
1670 * Length of a allocated file in bytes. Sparse files are counted by actual
1671 * allocated space. Return < 0 if error or unknown.
1673 int64_t bdrv_get_allocated_file_size(BlockDriverState
*bs
)
1675 BlockDriver
*drv
= bs
->drv
;
1679 if (drv
->bdrv_get_allocated_file_size
) {
1680 return drv
->bdrv_get_allocated_file_size(bs
);
1683 return bdrv_get_allocated_file_size(bs
->file
);
1689 * Length of a file in bytes. Return < 0 if error or unknown.
1691 int64_t bdrv_getlength(BlockDriverState
*bs
)
1693 BlockDriver
*drv
= bs
->drv
;
1697 if (bs
->growable
|| bdrv_dev_has_removable_media(bs
)) {
1698 if (drv
->bdrv_getlength
) {
1699 return drv
->bdrv_getlength(bs
);
1702 return bs
->total_sectors
* BDRV_SECTOR_SIZE
;
1705 /* return 0 as number of sectors if no device present or error */
1706 void bdrv_get_geometry(BlockDriverState
*bs
, uint64_t *nb_sectors_ptr
)
1709 length
= bdrv_getlength(bs
);
1713 length
= length
>> BDRV_SECTOR_BITS
;
1714 *nb_sectors_ptr
= length
;
1718 uint8_t boot_ind
; /* 0x80 - active */
1719 uint8_t head
; /* starting head */
1720 uint8_t sector
; /* starting sector */
1721 uint8_t cyl
; /* starting cylinder */
1722 uint8_t sys_ind
; /* What partition type */
1723 uint8_t end_head
; /* end head */
1724 uint8_t end_sector
; /* end sector */
1725 uint8_t end_cyl
; /* end cylinder */
1726 uint32_t start_sect
; /* starting sector counting from 0 */
1727 uint32_t nr_sects
; /* nr of sectors in partition */
1730 /* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if could not guess */
1731 static int guess_disk_lchs(BlockDriverState
*bs
,
1732 int *pcylinders
, int *pheads
, int *psectors
)
1734 uint8_t buf
[BDRV_SECTOR_SIZE
];
1735 int ret
, i
, heads
, sectors
, cylinders
;
1736 struct partition
*p
;
1738 uint64_t nb_sectors
;
1740 bdrv_get_geometry(bs
, &nb_sectors
);
1742 ret
= bdrv_read(bs
, 0, buf
, 1);
1745 /* test msdos magic */
1746 if (buf
[510] != 0x55 || buf
[511] != 0xaa)
1748 for(i
= 0; i
< 4; i
++) {
1749 p
= ((struct partition
*)(buf
+ 0x1be)) + i
;
1750 nr_sects
= le32_to_cpu(p
->nr_sects
);
1751 if (nr_sects
&& p
->end_head
) {
1752 /* We make the assumption that the partition terminates on
1753 a cylinder boundary */
1754 heads
= p
->end_head
+ 1;
1755 sectors
= p
->end_sector
& 63;
1758 cylinders
= nb_sectors
/ (heads
* sectors
);
1759 if (cylinders
< 1 || cylinders
> 16383)
1762 *psectors
= sectors
;
1763 *pcylinders
= cylinders
;
1765 printf("guessed geometry: LCHS=%d %d %d\n",
1766 cylinders
, heads
, sectors
);
1774 void bdrv_guess_geometry(BlockDriverState
*bs
, int *pcyls
, int *pheads
, int *psecs
)
1776 int translation
, lba_detected
= 0;
1777 int cylinders
, heads
, secs
;
1778 uint64_t nb_sectors
;
1780 /* if a geometry hint is available, use it */
1781 bdrv_get_geometry(bs
, &nb_sectors
);
1782 bdrv_get_geometry_hint(bs
, &cylinders
, &heads
, &secs
);
1783 translation
= bdrv_get_translation_hint(bs
);
1784 if (cylinders
!= 0) {
1789 if (guess_disk_lchs(bs
, &cylinders
, &heads
, &secs
) == 0) {
1791 /* if heads > 16, it means that a BIOS LBA
1792 translation was active, so the default
1793 hardware geometry is OK */
1795 goto default_geometry
;
1800 /* disable any translation to be in sync with
1801 the logical geometry */
1802 if (translation
== BIOS_ATA_TRANSLATION_AUTO
) {
1803 bdrv_set_translation_hint(bs
,
1804 BIOS_ATA_TRANSLATION_NONE
);
1809 /* if no geometry, use a standard physical disk geometry */
1810 cylinders
= nb_sectors
/ (16 * 63);
1812 if (cylinders
> 16383)
1814 else if (cylinders
< 2)
1819 if ((lba_detected
== 1) && (translation
== BIOS_ATA_TRANSLATION_AUTO
)) {
1820 if ((*pcyls
* *pheads
) <= 131072) {
1821 bdrv_set_translation_hint(bs
,
1822 BIOS_ATA_TRANSLATION_LARGE
);
1824 bdrv_set_translation_hint(bs
,
1825 BIOS_ATA_TRANSLATION_LBA
);
1829 bdrv_set_geometry_hint(bs
, *pcyls
, *pheads
, *psecs
);
1833 void bdrv_set_geometry_hint(BlockDriverState
*bs
,
1834 int cyls
, int heads
, int secs
)
/*
 * Record the BIOS geometry translation hint for this device.
 *
 * The value is simply stored in the BlockDriverState; it is read back via
 * bdrv_get_translation_hint(), e.g. by the geometry guessing code in
 * bdrv_guess_geometry().
 */
void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
{
    bs->translation = translation;
}
1846 void bdrv_get_geometry_hint(BlockDriverState
*bs
,
1847 int *pcyls
, int *pheads
, int *psecs
)
1850 *pheads
= bs
->heads
;
/* throttling disk io limits */

/*
 * Install a new set of I/O limits on 'bs'.
 *
 * Copies the caller's limit structure into the BlockDriverState and
 * recomputes whether throttling is active (io_limits_enabled) from the
 * new values.
 */
void bdrv_set_io_limits(BlockDriverState *bs,
                        BlockIOLimit *io_limits)
{
    bs->io_limits = *io_limits;
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
}
1862 /* Recognize floppy formats */
1863 typedef struct FDFormat
{
/*
 * Table of known floppy disk formats, scanned in order by
 * bdrv_get_floppy_geometry_hint() to match a raw image size against a
 * known geometry.  The list is terminated by the FDRIVE_DRV_NONE
 * sentinel entry.
 */
static const FDFormat fd_formats[] = {
    /* First entry is default format */
    /* 1.44 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_144, 18, 80, 1, },
    { FDRIVE_DRV_144, 20, 80, 1, },
    { FDRIVE_DRV_144, 21, 80, 1, },
    { FDRIVE_DRV_144, 21, 82, 1, },
    { FDRIVE_DRV_144, 21, 83, 1, },
    { FDRIVE_DRV_144, 22, 80, 1, },
    { FDRIVE_DRV_144, 23, 80, 1, },
    { FDRIVE_DRV_144, 24, 80, 1, },
    /* 2.88 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_288, 36, 80, 1, },
    { FDRIVE_DRV_288, 39, 80, 1, },
    { FDRIVE_DRV_288, 40, 80, 1, },
    { FDRIVE_DRV_288, 44, 80, 1, },
    { FDRIVE_DRV_288, 48, 80, 1, },
    /* 720 kB 3"1/2 floppy disks */
    { FDRIVE_DRV_144, 9, 80, 1, },
    { FDRIVE_DRV_144, 10, 80, 1, },
    { FDRIVE_DRV_144, 10, 82, 1, },
    { FDRIVE_DRV_144, 10, 83, 1, },
    { FDRIVE_DRV_144, 13, 80, 1, },
    { FDRIVE_DRV_144, 14, 80, 1, },
    /* 1.2 MB 5"1/4 floppy disks */
    { FDRIVE_DRV_120, 15, 80, 1, },
    { FDRIVE_DRV_120, 18, 80, 1, },
    { FDRIVE_DRV_120, 18, 82, 1, },
    { FDRIVE_DRV_120, 18, 83, 1, },
    { FDRIVE_DRV_120, 20, 80, 1, },
    /* 720 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120, 9, 80, 1, },
    { FDRIVE_DRV_120, 11, 80, 1, },
    /* 360 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120, 9, 40, 1, },
    { FDRIVE_DRV_120, 9, 40, 0, },
    { FDRIVE_DRV_120, 10, 41, 1, },
    { FDRIVE_DRV_120, 10, 42, 1, },
    /* 320 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120, 8, 40, 1, },
    { FDRIVE_DRV_120, 8, 40, 0, },
    /* 360 kB must match 5"1/4 better than 3"1/2... */
    { FDRIVE_DRV_144, 9, 80, 0, },
    /* sentinel terminating the table */
    { FDRIVE_DRV_NONE, -1, -1, 0, },
};
1917 void bdrv_get_floppy_geometry_hint(BlockDriverState
*bs
, int *nb_heads
,
1918 int *max_track
, int *last_sect
,
1919 FDriveType drive_in
, FDriveType
*drive
)
1921 const FDFormat
*parse
;
1922 uint64_t nb_sectors
, size
;
1923 int i
, first_match
, match
;
1925 bdrv_get_geometry_hint(bs
, nb_heads
, max_track
, last_sect
);
1926 if (*nb_heads
!= 0 && *max_track
!= 0 && *last_sect
!= 0) {
1927 /* User defined disk */
1929 bdrv_get_geometry(bs
, &nb_sectors
);
1932 for (i
= 0; ; i
++) {
1933 parse
= &fd_formats
[i
];
1934 if (parse
->drive
== FDRIVE_DRV_NONE
) {
1937 if (drive_in
== parse
->drive
||
1938 drive_in
== FDRIVE_DRV_NONE
) {
1939 size
= (parse
->max_head
+ 1) * parse
->max_track
*
1941 if (nb_sectors
== size
) {
1945 if (first_match
== -1) {
1951 if (first_match
== -1) {
1954 match
= first_match
;
1956 parse
= &fd_formats
[match
];
1958 *nb_heads
= parse
->max_head
+ 1;
1959 *max_track
= parse
->max_track
;
1960 *last_sect
= parse
->last_sect
;
1961 *drive
= parse
->drive
;
/*
 * Return the BIOS geometry translation hint previously stored with
 * bdrv_set_translation_hint().
 */
int bdrv_get_translation_hint(BlockDriverState *bs)
{
    return bs->translation;
}
/*
 * Configure the error policies for this device: what to do when a read
 * or a write error occurs (BlockErrorAction, e.g. report/ignore/stop as
 * used by bdrv_mon_event()).  The values are read back via
 * bdrv_get_on_error().
 */
void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
                       BlockErrorAction on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}
1977 BlockErrorAction
bdrv_get_on_error(BlockDriverState
*bs
, int is_read
)
1979 return is_read
? bs
->on_read_error
: bs
->on_write_error
;
/*
 * Return non-zero if the device is open read-only.
 */
int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}
1987 int bdrv_is_sg(BlockDriverState
*bs
)
/*
 * Return the enable_write_cache flag for this device (non-zero when the
 * write cache is enabled).
 */
int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}
1997 int bdrv_is_encrypted(BlockDriverState
*bs
)
1999 if (bs
->backing_hd
&& bs
->backing_hd
->encrypted
)
2001 return bs
->encrypted
;
2004 int bdrv_key_required(BlockDriverState
*bs
)
2006 BlockDriverState
*backing_hd
= bs
->backing_hd
;
2008 if (backing_hd
&& backing_hd
->encrypted
&& !backing_hd
->valid_key
)
2010 return (bs
->encrypted
&& !bs
->valid_key
);
2013 int bdrv_set_key(BlockDriverState
*bs
, const char *key
)
2016 if (bs
->backing_hd
&& bs
->backing_hd
->encrypted
) {
2017 ret
= bdrv_set_key(bs
->backing_hd
, key
);
2023 if (!bs
->encrypted
) {
2025 } else if (!bs
->drv
|| !bs
->drv
->bdrv_set_key
) {
2028 ret
= bs
->drv
->bdrv_set_key(bs
, key
);
2031 } else if (!bs
->valid_key
) {
2033 /* call the change callback now, we skipped it on open */
2034 bdrv_dev_change_media_cb(bs
, true);
2039 void bdrv_get_format(BlockDriverState
*bs
, char *buf
, int buf_size
)
2044 pstrcpy(buf
, buf_size
, bs
->drv
->format_name
);
2048 void bdrv_iterate_format(void (*it
)(void *opaque
, const char *name
),
2053 QLIST_FOREACH(drv
, &bdrv_drivers
, list
) {
2054 it(opaque
, drv
->format_name
);
2058 BlockDriverState
*bdrv_find(const char *name
)
2060 BlockDriverState
*bs
;
2062 QTAILQ_FOREACH(bs
, &bdrv_states
, list
) {
2063 if (!strcmp(name
, bs
->device_name
)) {
2070 BlockDriverState
*bdrv_next(BlockDriverState
*bs
)
2073 return QTAILQ_FIRST(&bdrv_states
);
2075 return QTAILQ_NEXT(bs
, list
);
2078 void bdrv_iterate(void (*it
)(void *opaque
, BlockDriverState
*bs
), void *opaque
)
2080 BlockDriverState
*bs
;
2082 QTAILQ_FOREACH(bs
, &bdrv_states
, list
) {
/*
 * Return the device name under which this BlockDriverState is known.
 * NOTE(review): elsewhere in this file the name is tested with
 * device_name[0], so it is presumably the empty string for anonymous
 * states rather than NULL.
 */
const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}
2092 void bdrv_flush_all(void)
2094 BlockDriverState
*bs
;
2096 QTAILQ_FOREACH(bs
, &bdrv_states
, list
) {
2097 if (!bdrv_is_read_only(bs
) && bdrv_is_inserted(bs
)) {
2103 int bdrv_has_zero_init(BlockDriverState
*bs
)
2107 if (bs
->drv
->bdrv_has_zero_init
) {
2108 return bs
->drv
->bdrv_has_zero_init(bs
);
2114 typedef struct BdrvCoIsAllocatedData
{
2115 BlockDriverState
*bs
;
2121 } BdrvCoIsAllocatedData
;
2124 * Returns true iff the specified sector is present in the disk image. Drivers
2125 * not implementing the functionality are assumed to not support backing files,
2126 * hence all their sectors are reported as allocated.
2128 * If 'sector_num' is beyond the end of the disk image the return value is 0
2129 * and 'pnum' is set to 0.
2131 * 'pnum' is set to the number of sectors (including and immediately following
2132 * the specified sector) that are known to be in the same
2133 * allocated/unallocated state.
2135 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
2136 * beyond the end of the disk image it will be clamped.
2138 int coroutine_fn
bdrv_co_is_allocated(BlockDriverState
*bs
, int64_t sector_num
,
2139 int nb_sectors
, int *pnum
)
2143 if (sector_num
>= bs
->total_sectors
) {
2148 n
= bs
->total_sectors
- sector_num
;
2149 if (n
< nb_sectors
) {
2153 if (!bs
->drv
->bdrv_co_is_allocated
) {
2158 return bs
->drv
->bdrv_co_is_allocated(bs
, sector_num
, nb_sectors
, pnum
);
2161 /* Coroutine wrapper for bdrv_is_allocated() */
2162 static void coroutine_fn
bdrv_is_allocated_co_entry(void *opaque
)
2164 BdrvCoIsAllocatedData
*data
= opaque
;
2165 BlockDriverState
*bs
= data
->bs
;
2167 data
->ret
= bdrv_co_is_allocated(bs
, data
->sector_num
, data
->nb_sectors
,
2173 * Synchronous wrapper around bdrv_co_is_allocated().
2175 * See bdrv_co_is_allocated() for details.
2177 int bdrv_is_allocated(BlockDriverState
*bs
, int64_t sector_num
, int nb_sectors
,
2181 BdrvCoIsAllocatedData data
= {
2183 .sector_num
= sector_num
,
2184 .nb_sectors
= nb_sectors
,
2189 co
= qemu_coroutine_create(bdrv_is_allocated_co_entry
);
2190 qemu_coroutine_enter(co
, &data
);
2191 while (!data
.done
) {
2197 void bdrv_mon_event(const BlockDriverState
*bdrv
,
2198 BlockMonEventAction action
, int is_read
)
2201 const char *action_str
;
2204 case BDRV_ACTION_REPORT
:
2205 action_str
= "report";
2207 case BDRV_ACTION_IGNORE
:
2208 action_str
= "ignore";
2210 case BDRV_ACTION_STOP
:
2211 action_str
= "stop";
2217 data
= qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
2220 is_read
? "read" : "write");
2221 monitor_protocol_event(QEVENT_BLOCK_IO_ERROR
, data
);
2223 qobject_decref(data
);
2226 BlockInfoList
*qmp_query_block(Error
**errp
)
2228 BlockInfoList
*head
= NULL
, *cur_item
= NULL
;
2229 BlockDriverState
*bs
;
2231 QTAILQ_FOREACH(bs
, &bdrv_states
, list
) {
2232 BlockInfoList
*info
= g_malloc0(sizeof(*info
));
2234 info
->value
= g_malloc0(sizeof(*info
->value
));
2235 info
->value
->device
= g_strdup(bs
->device_name
);
2236 info
->value
->type
= g_strdup("unknown");
2237 info
->value
->locked
= bdrv_dev_is_medium_locked(bs
);
2238 info
->value
->removable
= bdrv_dev_has_removable_media(bs
);
2240 if (bdrv_dev_has_removable_media(bs
)) {
2241 info
->value
->has_tray_open
= true;
2242 info
->value
->tray_open
= bdrv_dev_is_tray_open(bs
);
2245 if (bdrv_iostatus_is_enabled(bs
)) {
2246 info
->value
->has_io_status
= true;
2247 info
->value
->io_status
= bs
->iostatus
;
2251 info
->value
->has_inserted
= true;
2252 info
->value
->inserted
= g_malloc0(sizeof(*info
->value
->inserted
));
2253 info
->value
->inserted
->file
= g_strdup(bs
->filename
);
2254 info
->value
->inserted
->ro
= bs
->read_only
;
2255 info
->value
->inserted
->drv
= g_strdup(bs
->drv
->format_name
);
2256 info
->value
->inserted
->encrypted
= bs
->encrypted
;
2257 if (bs
->backing_file
[0]) {
2258 info
->value
->inserted
->has_backing_file
= true;
2259 info
->value
->inserted
->backing_file
= g_strdup(bs
->backing_file
);
2262 if (bs
->io_limits_enabled
) {
2263 info
->value
->inserted
->bps
=
2264 bs
->io_limits
.bps
[BLOCK_IO_LIMIT_TOTAL
];
2265 info
->value
->inserted
->bps_rd
=
2266 bs
->io_limits
.bps
[BLOCK_IO_LIMIT_READ
];
2267 info
->value
->inserted
->bps_wr
=
2268 bs
->io_limits
.bps
[BLOCK_IO_LIMIT_WRITE
];
2269 info
->value
->inserted
->iops
=
2270 bs
->io_limits
.iops
[BLOCK_IO_LIMIT_TOTAL
];
2271 info
->value
->inserted
->iops_rd
=
2272 bs
->io_limits
.iops
[BLOCK_IO_LIMIT_READ
];
2273 info
->value
->inserted
->iops_wr
=
2274 bs
->io_limits
.iops
[BLOCK_IO_LIMIT_WRITE
];
2278 /* XXX: waiting for the qapi to support GSList */
2280 head
= cur_item
= info
;
2282 cur_item
->next
= info
;
2290 /* Consider exposing this as a full fledged QMP command */
2291 static BlockStats
*qmp_query_blockstat(const BlockDriverState
*bs
, Error
**errp
)
2295 s
= g_malloc0(sizeof(*s
));
2297 if (bs
->device_name
[0]) {
2298 s
->has_device
= true;
2299 s
->device
= g_strdup(bs
->device_name
);
2302 s
->stats
= g_malloc0(sizeof(*s
->stats
));
2303 s
->stats
->rd_bytes
= bs
->nr_bytes
[BDRV_ACCT_READ
];
2304 s
->stats
->wr_bytes
= bs
->nr_bytes
[BDRV_ACCT_WRITE
];
2305 s
->stats
->rd_operations
= bs
->nr_ops
[BDRV_ACCT_READ
];
2306 s
->stats
->wr_operations
= bs
->nr_ops
[BDRV_ACCT_WRITE
];
2307 s
->stats
->wr_highest_offset
= bs
->wr_highest_sector
* BDRV_SECTOR_SIZE
;
2308 s
->stats
->flush_operations
= bs
->nr_ops
[BDRV_ACCT_FLUSH
];
2309 s
->stats
->wr_total_time_ns
= bs
->total_time_ns
[BDRV_ACCT_WRITE
];
2310 s
->stats
->rd_total_time_ns
= bs
->total_time_ns
[BDRV_ACCT_READ
];
2311 s
->stats
->flush_total_time_ns
= bs
->total_time_ns
[BDRV_ACCT_FLUSH
];
2314 s
->has_parent
= true;
2315 s
->parent
= qmp_query_blockstat(bs
->file
, NULL
);
2321 BlockStatsList
*qmp_query_blockstats(Error
**errp
)
2323 BlockStatsList
*head
= NULL
, *cur_item
= NULL
;
2324 BlockDriverState
*bs
;
2326 QTAILQ_FOREACH(bs
, &bdrv_states
, list
) {
2327 BlockStatsList
*info
= g_malloc0(sizeof(*info
));
2328 info
->value
= qmp_query_blockstat(bs
, NULL
);
2330 /* XXX: waiting for the qapi to support GSList */
2332 head
= cur_item
= info
;
2334 cur_item
->next
= info
;
2342 const char *bdrv_get_encrypted_filename(BlockDriverState
*bs
)
2344 if (bs
->backing_hd
&& bs
->backing_hd
->encrypted
)
2345 return bs
->backing_file
;
2346 else if (bs
->encrypted
)
2347 return bs
->filename
;
/*
 * Copy the name of this image's backing file into 'filename', bounded
 * to 'filename_size' bytes via the pstrcpy() helper.
 */
void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}
2358 int bdrv_write_compressed(BlockDriverState
*bs
, int64_t sector_num
,
2359 const uint8_t *buf
, int nb_sectors
)
2361 BlockDriver
*drv
= bs
->drv
;
2364 if (!drv
->bdrv_write_compressed
)
2366 if (bdrv_check_request(bs
, sector_num
, nb_sectors
))
2369 if (bs
->dirty_bitmap
) {
2370 set_dirty_bitmap(bs
, sector_num
, nb_sectors
, 1);
2373 return drv
->bdrv_write_compressed(bs
, sector_num
, buf
, nb_sectors
);
2376 int bdrv_get_info(BlockDriverState
*bs
, BlockDriverInfo
*bdi
)
2378 BlockDriver
*drv
= bs
->drv
;
2381 if (!drv
->bdrv_get_info
)
2383 memset(bdi
, 0, sizeof(*bdi
));
2384 return drv
->bdrv_get_info(bs
, bdi
);
2387 int bdrv_save_vmstate(BlockDriverState
*bs
, const uint8_t *buf
,
2388 int64_t pos
, int size
)
2390 BlockDriver
*drv
= bs
->drv
;
2393 if (drv
->bdrv_save_vmstate
)
2394 return drv
->bdrv_save_vmstate(bs
, buf
, pos
, size
);
2396 return bdrv_save_vmstate(bs
->file
, buf
, pos
, size
);
2400 int bdrv_load_vmstate(BlockDriverState
*bs
, uint8_t *buf
,
2401 int64_t pos
, int size
)
2403 BlockDriver
*drv
= bs
->drv
;
2406 if (drv
->bdrv_load_vmstate
)
2407 return drv
->bdrv_load_vmstate(bs
, buf
, pos
, size
);
2409 return bdrv_load_vmstate(bs
->file
, buf
, pos
, size
);
2413 void bdrv_debug_event(BlockDriverState
*bs
, BlkDebugEvent event
)
2415 BlockDriver
*drv
= bs
->drv
;
2417 if (!drv
|| !drv
->bdrv_debug_event
) {
2421 return drv
->bdrv_debug_event(bs
, event
);
2425 /**************************************************************/
2426 /* handling of snapshots */
2428 int bdrv_can_snapshot(BlockDriverState
*bs
)
2430 BlockDriver
*drv
= bs
->drv
;
2431 if (!drv
|| !bdrv_is_inserted(bs
) || bdrv_is_read_only(bs
)) {
2435 if (!drv
->bdrv_snapshot_create
) {
2436 if (bs
->file
!= NULL
) {
2437 return bdrv_can_snapshot(bs
->file
);
2445 int bdrv_is_snapshot(BlockDriverState
*bs
)
2447 return !!(bs
->open_flags
& BDRV_O_SNAPSHOT
);
2450 BlockDriverState
*bdrv_snapshots(void)
2452 BlockDriverState
*bs
;
2455 return bs_snapshots
;
2459 while ((bs
= bdrv_next(bs
))) {
2460 if (bdrv_can_snapshot(bs
)) {
2468 int bdrv_snapshot_create(BlockDriverState
*bs
,
2469 QEMUSnapshotInfo
*sn_info
)
2471 BlockDriver
*drv
= bs
->drv
;
2474 if (drv
->bdrv_snapshot_create
)
2475 return drv
->bdrv_snapshot_create(bs
, sn_info
);
2477 return bdrv_snapshot_create(bs
->file
, sn_info
);
2481 int bdrv_snapshot_goto(BlockDriverState
*bs
,
2482 const char *snapshot_id
)
2484 BlockDriver
*drv
= bs
->drv
;
2489 if (drv
->bdrv_snapshot_goto
)
2490 return drv
->bdrv_snapshot_goto(bs
, snapshot_id
);
2493 drv
->bdrv_close(bs
);
2494 ret
= bdrv_snapshot_goto(bs
->file
, snapshot_id
);
2495 open_ret
= drv
->bdrv_open(bs
, bs
->open_flags
);
2497 bdrv_delete(bs
->file
);
2507 int bdrv_snapshot_delete(BlockDriverState
*bs
, const char *snapshot_id
)
2509 BlockDriver
*drv
= bs
->drv
;
2512 if (drv
->bdrv_snapshot_delete
)
2513 return drv
->bdrv_snapshot_delete(bs
, snapshot_id
);
2515 return bdrv_snapshot_delete(bs
->file
, snapshot_id
);
2519 int bdrv_snapshot_list(BlockDriverState
*bs
,
2520 QEMUSnapshotInfo
**psn_info
)
2522 BlockDriver
*drv
= bs
->drv
;
2525 if (drv
->bdrv_snapshot_list
)
2526 return drv
->bdrv_snapshot_list(bs
, psn_info
);
2528 return bdrv_snapshot_list(bs
->file
, psn_info
);
2532 int bdrv_snapshot_load_tmp(BlockDriverState
*bs
,
2533 const char *snapshot_name
)
2535 BlockDriver
*drv
= bs
->drv
;
2539 if (!bs
->read_only
) {
2542 if (drv
->bdrv_snapshot_load_tmp
) {
2543 return drv
->bdrv_snapshot_load_tmp(bs
, snapshot_name
);
2548 #define NB_SUFFIXES 4
2550 char *get_human_readable_size(char *buf
, int buf_size
, int64_t size
)
2552 static const char suffixes
[NB_SUFFIXES
] = "KMGT";
2557 snprintf(buf
, buf_size
, "%" PRId64
, size
);
2560 for(i
= 0; i
< NB_SUFFIXES
; i
++) {
2561 if (size
< (10 * base
)) {
2562 snprintf(buf
, buf_size
, "%0.1f%c",
2563 (double)size
/ base
,
2566 } else if (size
< (1000 * base
) || i
== (NB_SUFFIXES
- 1)) {
2567 snprintf(buf
, buf_size
, "%" PRId64
"%c",
2568 ((size
+ (base
>> 1)) / base
),
2578 char *bdrv_snapshot_dump(char *buf
, int buf_size
, QEMUSnapshotInfo
*sn
)
2580 char buf1
[128], date_buf
[128], clock_buf
[128];
2590 snprintf(buf
, buf_size
,
2591 "%-10s%-20s%7s%20s%15s",
2592 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2596 ptm
= localtime(&ti
);
2597 strftime(date_buf
, sizeof(date_buf
),
2598 "%Y-%m-%d %H:%M:%S", ptm
);
2600 localtime_r(&ti
, &tm
);
2601 strftime(date_buf
, sizeof(date_buf
),
2602 "%Y-%m-%d %H:%M:%S", &tm
);
2604 secs
= sn
->vm_clock_nsec
/ 1000000000;
2605 snprintf(clock_buf
, sizeof(clock_buf
),
2606 "%02d:%02d:%02d.%03d",
2608 (int)((secs
/ 60) % 60),
2610 (int)((sn
->vm_clock_nsec
/ 1000000) % 1000));
2611 snprintf(buf
, buf_size
,
2612 "%-10s%-20s%7s%20s%15s",
2613 sn
->id_str
, sn
->name
,
2614 get_human_readable_size(buf1
, sizeof(buf1
), sn
->vm_state_size
),
2621 /**************************************************************/
2624 BlockDriverAIOCB
*bdrv_aio_readv(BlockDriverState
*bs
, int64_t sector_num
,
2625 QEMUIOVector
*qiov
, int nb_sectors
,
2626 BlockDriverCompletionFunc
*cb
, void *opaque
)
2628 trace_bdrv_aio_readv(bs
, sector_num
, nb_sectors
, opaque
);
2630 return bdrv_co_aio_rw_vector(bs
, sector_num
, qiov
, nb_sectors
,
2634 BlockDriverAIOCB
*bdrv_aio_writev(BlockDriverState
*bs
, int64_t sector_num
,
2635 QEMUIOVector
*qiov
, int nb_sectors
,
2636 BlockDriverCompletionFunc
*cb
, void *opaque
)
2638 trace_bdrv_aio_writev(bs
, sector_num
, nb_sectors
, opaque
);
2640 return bdrv_co_aio_rw_vector(bs
, sector_num
, qiov
, nb_sectors
,
2645 typedef struct MultiwriteCB
{
2650 BlockDriverCompletionFunc
*cb
;
2652 QEMUIOVector
*free_qiov
;
2657 static void multiwrite_user_cb(MultiwriteCB
*mcb
)
2661 for (i
= 0; i
< mcb
->num_callbacks
; i
++) {
2662 mcb
->callbacks
[i
].cb(mcb
->callbacks
[i
].opaque
, mcb
->error
);
2663 if (mcb
->callbacks
[i
].free_qiov
) {
2664 qemu_iovec_destroy(mcb
->callbacks
[i
].free_qiov
);
2666 g_free(mcb
->callbacks
[i
].free_qiov
);
2667 qemu_vfree(mcb
->callbacks
[i
].free_buf
);
2671 static void multiwrite_cb(void *opaque
, int ret
)
2673 MultiwriteCB
*mcb
= opaque
;
2675 trace_multiwrite_cb(mcb
, ret
);
2677 if (ret
< 0 && !mcb
->error
) {
2681 mcb
->num_requests
--;
2682 if (mcb
->num_requests
== 0) {
2683 multiwrite_user_cb(mcb
);
2688 static int multiwrite_req_compare(const void *a
, const void *b
)
2690 const BlockRequest
*req1
= a
, *req2
= b
;
2693 * Note that we can't simply subtract req2->sector from req1->sector
2694 * here as that could overflow the return value.
2696 if (req1
->sector
> req2
->sector
) {
2698 } else if (req1
->sector
< req2
->sector
) {
2706 * Takes a bunch of requests and tries to merge them. Returns the number of
2707 * requests that remain after merging.
2709 static int multiwrite_merge(BlockDriverState
*bs
, BlockRequest
*reqs
,
2710 int num_reqs
, MultiwriteCB
*mcb
)
2714 // Sort requests by start sector
2715 qsort(reqs
, num_reqs
, sizeof(*reqs
), &multiwrite_req_compare
);
2717 // Check if adjacent requests touch the same clusters. If so, combine them,
2718 // filling up gaps with zero sectors.
2720 for (i
= 1; i
< num_reqs
; i
++) {
2722 int64_t oldreq_last
= reqs
[outidx
].sector
+ reqs
[outidx
].nb_sectors
;
2724 // This handles the cases that are valid for all block drivers, namely
2725 // exactly sequential writes and overlapping writes.
2726 if (reqs
[i
].sector
<= oldreq_last
) {
2730 // The block driver may decide that it makes sense to combine requests
2731 // even if there is a gap of some sectors between them. In this case,
2732 // the gap is filled with zeros (therefore only applicable for yet
2733 // unused space in format like qcow2).
2734 if (!merge
&& bs
->drv
->bdrv_merge_requests
) {
2735 merge
= bs
->drv
->bdrv_merge_requests(bs
, &reqs
[outidx
], &reqs
[i
]);
2738 if (reqs
[outidx
].qiov
->niov
+ reqs
[i
].qiov
->niov
+ 1 > IOV_MAX
) {
2744 QEMUIOVector
*qiov
= g_malloc0(sizeof(*qiov
));
2745 qemu_iovec_init(qiov
,
2746 reqs
[outidx
].qiov
->niov
+ reqs
[i
].qiov
->niov
+ 1);
2748 // Add the first request to the merged one. If the requests are
2749 // overlapping, drop the last sectors of the first request.
2750 size
= (reqs
[i
].sector
- reqs
[outidx
].sector
) << 9;
2751 qemu_iovec_concat(qiov
, reqs
[outidx
].qiov
, size
);
2753 // We might need to add some zeros between the two requests
2754 if (reqs
[i
].sector
> oldreq_last
) {
2755 size_t zero_bytes
= (reqs
[i
].sector
- oldreq_last
) << 9;
2756 uint8_t *buf
= qemu_blockalign(bs
, zero_bytes
);
2757 memset(buf
, 0, zero_bytes
);
2758 qemu_iovec_add(qiov
, buf
, zero_bytes
);
2759 mcb
->callbacks
[i
].free_buf
= buf
;
2762 // Add the second request
2763 qemu_iovec_concat(qiov
, reqs
[i
].qiov
, reqs
[i
].qiov
->size
);
2765 reqs
[outidx
].nb_sectors
= qiov
->size
>> 9;
2766 reqs
[outidx
].qiov
= qiov
;
2768 mcb
->callbacks
[i
].free_qiov
= reqs
[outidx
].qiov
;
2771 reqs
[outidx
].sector
= reqs
[i
].sector
;
2772 reqs
[outidx
].nb_sectors
= reqs
[i
].nb_sectors
;
2773 reqs
[outidx
].qiov
= reqs
[i
].qiov
;
2781 * Submit multiple AIO write requests at once.
2783 * On success, the function returns 0 and all requests in the reqs array have
2784 * been submitted. In error case this function returns -1, and any of the
2785 * requests may or may not be submitted yet. In particular, this means that the
2786 * callback will be called for some of the requests, for others it won't. The
2787 * caller must check the error field of the BlockRequest to wait for the right
2788 * callbacks (if error != 0, no callback will be called).
2790 * The implementation may modify the contents of the reqs array, e.g. to merge
2791 * requests. However, the fields opaque and error are left unmodified as they
2792 * are used to signal failure for a single request to the caller.
2794 int bdrv_aio_multiwrite(BlockDriverState
*bs
, BlockRequest
*reqs
, int num_reqs
)
2796 BlockDriverAIOCB
*acb
;
2800 /* don't submit writes if we don't have a medium */
2801 if (bs
->drv
== NULL
) {
2802 for (i
= 0; i
< num_reqs
; i
++) {
2803 reqs
[i
].error
= -ENOMEDIUM
;
2808 if (num_reqs
== 0) {
2812 // Create MultiwriteCB structure
2813 mcb
= g_malloc0(sizeof(*mcb
) + num_reqs
* sizeof(*mcb
->callbacks
));
2814 mcb
->num_requests
= 0;
2815 mcb
->num_callbacks
= num_reqs
;
2817 for (i
= 0; i
< num_reqs
; i
++) {
2818 mcb
->callbacks
[i
].cb
= reqs
[i
].cb
;
2819 mcb
->callbacks
[i
].opaque
= reqs
[i
].opaque
;
2822 // Check for mergable requests
2823 num_reqs
= multiwrite_merge(bs
, reqs
, num_reqs
, mcb
);
2825 trace_bdrv_aio_multiwrite(mcb
, mcb
->num_callbacks
, num_reqs
);
2828 * Run the aio requests. As soon as one request can't be submitted
2829 * successfully, fail all requests that are not yet submitted (we must
2830 * return failure for all requests anyway)
2832 * num_requests cannot be set to the right value immediately: If
2833 * bdrv_aio_writev fails for some request, num_requests would be too high
2834 * and therefore multiwrite_cb() would never recognize the multiwrite
2835 * request as completed. We also cannot use the loop variable i to set it
2836 * when the first request fails because the callback may already have been
2837 * called for previously submitted requests. Thus, num_requests must be
2838 * incremented for each request that is submitted.
2840 * The problem that callbacks may be called early also means that we need
2841 * to take care that num_requests doesn't become 0 before all requests are
2842 * submitted - multiwrite_cb() would consider the multiwrite request
2843 * completed. A dummy request that is "completed" by a manual call to
2844 * multiwrite_cb() takes care of this.
2846 mcb
->num_requests
= 1;
2848 // Run the aio requests
2849 for (i
= 0; i
< num_reqs
; i
++) {
2850 mcb
->num_requests
++;
2851 acb
= bdrv_aio_writev(bs
, reqs
[i
].sector
, reqs
[i
].qiov
,
2852 reqs
[i
].nb_sectors
, multiwrite_cb
, mcb
);
2855 // We can only fail the whole thing if no request has been
2856 // submitted yet. Otherwise we'll wait for the submitted AIOs to
2857 // complete and report the error in the callback.
2859 trace_bdrv_aio_multiwrite_earlyfail(mcb
);
2862 trace_bdrv_aio_multiwrite_latefail(mcb
, i
);
2863 multiwrite_cb(mcb
, -EIO
);
2869 /* Complete the dummy request */
2870 multiwrite_cb(mcb
, 0);
2875 for (i
= 0; i
< mcb
->num_callbacks
; i
++) {
2876 reqs
[i
].error
= -EIO
;
/*
 * Cancel an in-flight AIO request by dispatching to the cancel callback
 * of the AIOCB pool the request was allocated from.
 */
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->pool->cancel(acb);
}
2887 /* block I/O throttling */
/* Decide whether dispatching a request of @nb_sectors now would exceed the
 * configured bytes-per-second limit for the current accounting slice.
 *
 * @bs:           device whose io_limits configuration is consulted
 * @nb_sectors:   size of the request being considered
 * @is_write:     direction flag; also indexes the per-direction arrays
 * @elapsed_time: seconds elapsed since the slice started
 * @wait:         out (optional): estimated delay before dispatch is allowed
 *
 * Returns true when the request must be throttled.
 */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
                                   bool is_write, double elapsed_time,
                                   uint64_t *wait)
{
    uint64_t bps_limit = 0;
    double   bytes_limit, bytes_base, bytes_res;
    double   slice_time, wait_time;

    /* A "total" limit takes precedence over the per-direction limit. */
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.bps[is_write]) {
        bps_limit = bs->io_limits.bps[is_write];
    } else {
        /* No bps throttling configured for this direction. */
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Bytes allowed within the current slice at the configured rate. */
    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    bytes_limit = bps_limit * slice_time;
    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        /* Total limit: both directions count against the budget. */
        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
    }

    /* bytes_base: the bytes of data which have been read/written; and
     * it is obtained from the history statistic info.
     * bytes_res: the remaining bytes of data which need to be read/written.
     * (bytes_base + bytes_res) / bps_limit: used to calcuate
     * the total time for completing reading/writting all data.
     */
    bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

    if (bytes_base + bytes_res <= bytes_limit) {
        /* Request fits within this slice's budget: no throttling. */
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;

    /* When the I/O rate at runtime exceeds the limits,
     * bs->slice_end need to be extended in order that the current statistic
     * info can be kept until the timer fire, so it is increased and tuned
     * based on the result of experiment.
     */
    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
/* Decide whether dispatching one more operation now would exceed the
 * configured I/O-operations-per-second limit for the current slice.
 * Mirrors bdrv_exceed_bps_limits() but counts operations, not bytes.
 *
 * Returns true (setting *wait when non-NULL) if throttling is needed.
 */
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
                                    double elapsed_time, uint64_t *wait)
{
    uint64_t iops_limit = 0;
    double   ios_limit, ios_base;
    double   slice_time, wait_time;

    /* A "total" limit takes precedence over the per-direction limit. */
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.iops[is_write]) {
        iops_limit = bs->io_limits.iops[is_write];
    } else {
        /* No iops throttling configured for this direction. */
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Operations allowed within the current slice at the configured rate. */
    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    ios_limit = iops_limit * slice_time;
    ios_base  = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        /* Total limit: both directions count against the budget. */
        ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
    }

    if (ios_base + 1 <= ios_limit) {
        /* One more operation still fits: no throttling. */
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (ios_base + 1) / iops_limit;
    if (wait_time > elapsed_time) {
        wait_time = wait_time - elapsed_time;
    } else {
        wait_time = 0;
    }

    /* Extend the slice so the statistics survive until the timer fires;
     * the scaling factors are empirically tuned (see bps variant). */
    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
/* Combine the bps and iops checks for a single request.  Maintains the
 * accounting slice (starting a new one when the clock has left the old
 * slice) and returns true with the larger of the two wait estimates when
 * either limit is exceeded. */
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
                                  bool is_write, int64_t *wait)
{
    int64_t  now, max_wait;
    uint64_t bps_wait = 0, iops_wait = 0;
    double   elapsed_time;
    int      bps_ret, iops_ret;

    now = qemu_get_clock_ns(vm_clock);
    if ((bs->slice_start < now)
        && (bs->slice_end > now)) {
        /* Still inside the current slice: just push its end out. */
        bs->slice_end = now + bs->slice_time;
    } else {
        /* Start a fresh slice and snapshot the I/O counters as its base. */
        bs->slice_time  = 5 * BLOCK_IO_SLICE_TIME;
        bs->slice_start = now;
        bs->slice_end   = now + bs->slice_time;

        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];

        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
    }

    /* Elapsed time is measured in seconds for the limit helpers. */
    elapsed_time = now - bs->slice_start;
    elapsed_time /= (NANOSECONDS_PER_SECOND);

    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
                                      is_write, elapsed_time, &bps_wait);
    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
                                       elapsed_time, &iops_wait);
    if (bps_ret || iops_ret) {
        /* Throttle by the longer of the two estimated delays. */
        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
        if (wait) {
            *wait = max_wait;
        }

        /* Keep the slice alive at least until the request may run. */
        now = qemu_get_clock_ns(vm_clock);
        if (bs->slice_end < now + max_wait) {
            bs->slice_end = now + max_wait;
        }

        return true;
    }

    if (wait) {
        *wait = 0;
    }

    return false;
}
3052 /**************************************************************/
3053 /* async block device emulation */
/* AIOCB for the synchronous emulation of bdrv_aio_readv/writev: the
 * request is executed synchronously into a bounce buffer and completion
 * is reported from a bottom half.
 * NOTE(review): field list partially reconstructed from usage in
 * bdrv_aio_rw_vector()/bdrv_aio_bh_cb() — confirm against the original. */
typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;                 /* completion bottom half */
    int ret;                    /* result of the synchronous read/write */
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;            /* linear bounce buffer for the qiov */
    int is_write;
} BlockDriverAIOCBSync;
/* Cancel hook for the sync-emulation pool: the completion BH has not run
 * yet, so deleting it suppresses the callback entirely. */
static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}
/* AIOCB pool for the synchronous (bounce-buffer) emulation path. */
static AIOPool bdrv_em_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};
/* Bottom half completing an emulated AIO request: copy the bounce buffer
 * back into the caller's iovec for reads, deliver the result, clean up. */
static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write)
        qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}
/* Emulate asynchronous vectored I/O with the driver's synchronous
 * bdrv_read/bdrv_write: perform the transfer through a linear bounce
 * buffer immediately, then signal completion from a bottom half so the
 * callback runs with AIO semantics. */
static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)

{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    /* Bounce buffer is aligned for the device (see qemu_blockalign). */
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buffer(acb->qiov, acb->bounce);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}
3123 static BlockDriverAIOCB
*bdrv_aio_readv_em(BlockDriverState
*bs
,
3124 int64_t sector_num
, QEMUIOVector
*qiov
, int nb_sectors
,
3125 BlockDriverCompletionFunc
*cb
, void *opaque
)
3127 return bdrv_aio_rw_vector(bs
, sector_num
, qiov
, nb_sectors
, cb
, opaque
, 0);
3130 static BlockDriverAIOCB
*bdrv_aio_writev_em(BlockDriverState
*bs
,
3131 int64_t sector_num
, QEMUIOVector
*qiov
, int nb_sectors
,
3132 BlockDriverCompletionFunc
*cb
, void *opaque
)
3134 return bdrv_aio_rw_vector(bs
, sector_num
, qiov
, nb_sectors
, cb
, opaque
, 1);
/* AIOCB for requests implemented on top of coroutines; completion is
 * delivered through a bottom half (bdrv_co_em_bh).
 * NOTE(review): field list partially reconstructed from usage in
 * bdrv_co_do_rw()/bdrv_co_aio_rw_vector() — confirm against original. */
typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;           /* sector, nb_sectors, qiov, error */
    bool is_write;
    QEMUBH *bh;                 /* completion bottom half */
} BlockDriverAIOCBCoroutine;
/* Cancel hook for coroutine-based AIOCBs.
 * NOTE(review): body elided in this extraction; reconstructed as the
 * historical implementation (drain all AIO to force completion) —
 * confirm against the original file. */
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    qemu_aio_flush();
}
/* AIOCB pool for the coroutine-based emulation path. */
static AIOPool bdrv_em_co_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};
/* Bottom half delivering a coroutine request's result to the caller's
 * completion callback, then releasing the AIOCB. */
static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);
    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov);
    }

    /* Completion goes through a BH so the callback runs outside
     * coroutine context. */
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
/* Build a coroutine-backed AIO request: record the parameters in the
 * AIOCB, then enter a fresh coroutine running bdrv_co_do_rw(). */
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
/* Coroutine entry point backing bdrv_aio_flush(). */
static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
/* Asynchronous flush: run bdrv_co_flush() in a new coroutine and report
 * completion through @cb. */
BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
/* Coroutine entry point backing bdrv_aio_discard(). */
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
/* Asynchronous discard: record the range in the AIOCB and run
 * bdrv_co_discard() in a new coroutine. */
BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
/* Register all built-in block drivers via their module init hooks. */
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}
/* Like bdrv_init(), but restrict format selection to the compiled-in
 * driver whitelist. */
void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}
/* Allocate an AIOCB from @pool, reusing one from the pool's free list
 * when available, and initialise the common completion fields. */
void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    if (pool->free_aiocb) {
        acb = pool->free_aiocb;
        pool->free_aiocb = acb->next;
    } else {
        /* First allocation for this slot; pool backlink set once. */
        acb = g_malloc0(pool->aiocb_size);
        acb->pool = pool;
    }
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}
/* Return an AIOCB to its pool's free list (AIOCBs are cached, never
 * actually freed). */
void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
    AIOPool *pool = acb->pool;
    acb->next = pool->free_aiocb;
    pool->free_aiocb = acb;
}
3295 /**************************************************************/
3296 /* Coroutine block device emulation */
/* Glue for driving an AIO-style driver callback from coroutine context:
 * the callback records the result and re-enters the waiting coroutine. */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;       /* coroutine to re-enter on completion */
    int ret;                    /* completion status */
} CoroutineIOCompletion;
/* AIO completion callback: stash the result and wake the coroutine
 * blocked in bdrv_co_io_em()/bdrv_co_flush()/bdrv_co_discard(). */
static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}
/* Emulate coroutine read/write on top of the driver's AIO interface:
 * issue the request with a completion callback that re-enters us, then
 * yield until it fires.  Returns the driver's status, or -EIO if the
 * request could not even be submitted. */
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}
/* Coroutine read implemented via the driver's AIO callbacks. */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}
/* Coroutine write implemented via the driver's AIO callbacks. */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}
/* Coroutine entry point for the synchronous bdrv_flush() wrapper. */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}
/* Flush @bs from coroutine context, picking the best interface the
 * driver offers (coroutine hook, AIO hook, or nothing).
 * NOTE(review): some early-exit branches were elided in this extraction
 * and are reconstructed — confirm against the original file. */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs->drv) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        return 0;
    }

    if (bs->drv->bdrv_co_flush_to_disk) {
        return bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        /* Drive the driver's AIO flush and yield until it completes. */
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        return 0;
    }
}
3410 void bdrv_invalidate_cache(BlockDriverState
*bs
)
3412 if (bs
->drv
&& bs
->drv
->bdrv_invalidate_cache
) {
3413 bs
->drv
->bdrv_invalidate_cache(bs
);
/* Invalidate cached state of every open BlockDriverState. */
void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}
/* Synchronous flush wrapper: runs bdrv_flush_co_entry() directly when
 * already in coroutine context, otherwise spawns a coroutine and spins
 * on qemu_aio_wait() until it signals completion via rwco.ret. */
int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}
/* Coroutine entry point for the synchronous bdrv_discard() wrapper. */
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}
/* Discard (unmap) a sector range from coroutine context.  Validates the
 * request, then uses the driver's coroutine hook, its AIO hook, or
 * silently succeeds when discard is unsupported. */
int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    } else if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    } else if (bs->drv->bdrv_aio_discard) {
        /* Drive the AIO interface and yield until completion. */
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        /* No discard support: treat as a successful no-op. */
        return 0;
    }
}
/* Synchronous discard wrapper, same shape as bdrv_flush(): fast-path in
 * coroutine context, otherwise spawn a coroutine and wait. */
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}
3509 /**************************************************************/
3510 /* removable device support */
3513 * Return TRUE if the media is present
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv)
        return 0;   /* no driver attached: no medium */
    if (!drv->bdrv_is_inserted)
        return 1;   /* drivers without the hook always have a medium */
    return drv->bdrv_is_inserted(bs);
}
3527 * Return whether the media changed since the last call to this
3528 * function, or -ENOTSUP if we don't know. Most drivers don't know.
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    /* Driver cannot tell: report "unknown" per the contract above. */
    return -ENOTSUP;
}
3541 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
void bdrv_eject(BlockDriverState *bs, int eject_flag)
{
    BlockDriver *drv = bs->drv;

    /* Only drivers with removable-media support implement this hook. */
    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }
}
3553 * Lock or unlock the media (if it is locked, the user won't be able
3554 * to eject it manually).
3556 void bdrv_lock_medium(BlockDriverState
*bs
, bool locked
)
3558 BlockDriver
*drv
= bs
->drv
;
3560 trace_bdrv_lock_medium(bs
, locked
);
3562 if (drv
&& drv
->bdrv_lock_medium
) {
3563 drv
->bdrv_lock_medium(bs
, locked
);
3567 /* needed for generic scsi interface */
/* Forward a (SCSI passthrough) ioctl to the driver; -ENOTSUP when the
 * driver has no ioctl hook. */
int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl)
        return drv->bdrv_ioctl(bs, req, buf);
    return -ENOTSUP;
}
/* Asynchronous variant of bdrv_ioctl(); returns NULL when the driver
 * has no async ioctl hook. */
BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl)
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    return NULL;
}
/* Record the buffer alignment (bytes) required for I/O on @bs; consumed
 * by qemu_blockalign(). */
void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}
3594 void *qemu_blockalign(BlockDriverState
*bs
, size_t size
)
3596 return qemu_memalign((bs
&& bs
->buffer_alignment
) ? bs
->buffer_alignment
: 512, size
);
/* Enable or disable dirty-block tracking.  Enabling lazily allocates a
 * zeroed bitmap with one bit per BDRV_SECTORS_PER_DIRTY_CHUNK sectors;
 * disabling frees it.  Resets the dirty counter either way. */
void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
{
    int64_t bitmap_size;

    bs->dirty_count = 0;
    if (enable) {
        if (!bs->dirty_bitmap) {
            /* Round the device length up to whole bitmap bytes. */
            bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
                    BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
            bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

            bs->dirty_bitmap = g_malloc0(bitmap_size);
        }
    } else {
        if (bs->dirty_bitmap) {
            g_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
    }
}
/* Return 1 if the dirty chunk containing @sector is marked dirty, else 0
 * (including when tracking is off or the sector is past end-of-device). */
int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (bs->dirty_bitmap &&
        (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
        /* One bit per chunk, packed into unsigned longs. */
        return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}
/* Clear the dirty bits covering [cur_sector, cur_sector + nr_sectors). */
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
}
/* Number of chunks currently marked dirty on @bs. */
int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    return bs->dirty_count;
}
/* Toggle the in_use flag on @bs; asserts the value actually changes so
 * double-acquire/double-release bugs trip immediately. */
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}
/* Query the in_use flag set by bdrv_set_in_use(). */
int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}
/* Turn on I/O status reporting for @bs and reset the status to OK. */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}
3661 /* The I/O status is only enabled if the drive explicitly
3662 * enables it _and_ the VM is configured to stop on errors */
3663 bool bdrv_iostatus_is_enabled(const BlockDriverState
*bs
)
3665 return (bs
->iostatus_enabled
&&
3666 (bs
->on_write_error
== BLOCK_ERR_STOP_ENOSPC
||
3667 bs
->on_write_error
== BLOCK_ERR_STOP_ANY
||
3668 bs
->on_read_error
== BLOCK_ERR_STOP_ANY
));
/* Turn off I/O status reporting for @bs. */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
/* Clear a previously recorded error status back to OK (only when
 * status reporting is in effect). */
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    }
}
/* XXX: Today this is set by device models because it makes the implementation
   quite simple. However, the block layer knows about the error, so it's
   possible to implement this without device models being involved */
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    /* Only the first error is recorded; later ones don't overwrite it. */
    if (bdrv_iostatus_is_enabled(bs) &&
        bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        /* ENOSPC gets its own status so management can react to it. */
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}
/* Begin accounting one I/O: remember its size, type and start timestamp
 * in @cookie so bdrv_acct_done() can fold it into the statistics. */
void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}
3708 bdrv_acct_done(BlockDriverState
*bs
, BlockAcctCookie
*cookie
)
3710 assert(cookie
->type
< BDRV_MAX_IOTYPE
);
3712 bs
->nr_bytes
[cookie
->type
] += cookie
->bytes
;
3713 bs
->nr_ops
[cookie
->type
]++;
3714 bs
->total_time_ns
[cookie
->type
] += get_clock() - cookie
->start_time_ns
;
3717 int bdrv_img_create(const char *filename
, const char *fmt
,
3718 const char *base_filename
, const char *base_fmt
,
3719 char *options
, uint64_t img_size
, int flags
)
3721 QEMUOptionParameter
*param
= NULL
, *create_options
= NULL
;
3722 QEMUOptionParameter
*backing_fmt
, *backing_file
, *size
;
3723 BlockDriverState
*bs
= NULL
;
3724 BlockDriver
*drv
, *proto_drv
;
3725 BlockDriver
*backing_drv
= NULL
;
3728 /* Find driver and parse its options */
3729 drv
= bdrv_find_format(fmt
);
3731 error_report("Unknown file format '%s'", fmt
);
3736 proto_drv
= bdrv_find_protocol(filename
);
3738 error_report("Unknown protocol '%s'", filename
);
3743 create_options
= append_option_parameters(create_options
,
3744 drv
->create_options
);
3745 create_options
= append_option_parameters(create_options
,
3746 proto_drv
->create_options
);
3748 /* Create parameter list with default values */
3749 param
= parse_option_parameters("", create_options
, param
);
3751 set_option_parameter_int(param
, BLOCK_OPT_SIZE
, img_size
);
3753 /* Parse -o options */
3755 param
= parse_option_parameters(options
, create_options
, param
);
3756 if (param
== NULL
) {
3757 error_report("Invalid options for file format '%s'.", fmt
);
3763 if (base_filename
) {
3764 if (set_option_parameter(param
, BLOCK_OPT_BACKING_FILE
,
3766 error_report("Backing file not supported for file format '%s'",
3774 if (set_option_parameter(param
, BLOCK_OPT_BACKING_FMT
, base_fmt
)) {
3775 error_report("Backing file format not supported for file "
3776 "format '%s'", fmt
);
3782 backing_file
= get_option_parameter(param
, BLOCK_OPT_BACKING_FILE
);
3783 if (backing_file
&& backing_file
->value
.s
) {
3784 if (!strcmp(filename
, backing_file
->value
.s
)) {
3785 error_report("Error: Trying to create an image with the "
3786 "same filename as the backing file");
3792 backing_fmt
= get_option_parameter(param
, BLOCK_OPT_BACKING_FMT
);
3793 if (backing_fmt
&& backing_fmt
->value
.s
) {
3794 backing_drv
= bdrv_find_format(backing_fmt
->value
.s
);
3796 error_report("Unknown backing file format '%s'",
3797 backing_fmt
->value
.s
);
3803 // The size for the image must always be specified, with one exception:
3804 // If we are using a backing file, we can obtain the size from there
3805 size
= get_option_parameter(param
, BLOCK_OPT_SIZE
);
3806 if (size
&& size
->value
.n
== -1) {
3807 if (backing_file
&& backing_file
->value
.s
) {
3813 ret
= bdrv_open(bs
, backing_file
->value
.s
, flags
, backing_drv
);
3815 error_report("Could not open '%s'", backing_file
->value
.s
);
3818 bdrv_get_geometry(bs
, &size
);
3821 snprintf(buf
, sizeof(buf
), "%" PRId64
, size
);
3822 set_option_parameter(param
, BLOCK_OPT_SIZE
, buf
);
3824 error_report("Image creation needs a size parameter");
3830 printf("Formatting '%s', fmt=%s ", filename
, fmt
);
3831 print_option_parameters(param
);
3834 ret
= bdrv_create(drv
, filename
, param
);
3837 if (ret
== -ENOTSUP
) {
3838 error_report("Formatting or formatting option not supported for "
3839 "file format '%s'", fmt
);
3840 } else if (ret
== -EFBIG
) {
3841 error_report("The image size is too large for file format '%s'",
3844 error_report("%s: error while creating %s: %s", filename
, fmt
,
3850 free_option_parameters(create_options
);
3851 free_option_parameters(param
);