]> git.proxmox.com Git - qemu.git/blame - block.c
configure: Disable (clang) initializer-overrides warnings
[qemu.git] / block.c
CommitLineData
fc01f7e7
FB
1/*
2 * QEMU System Emulator block driver
5fafdf24 3 *
fc01f7e7 4 * Copyright (c) 2003 Fabrice Bellard
5fafdf24 5 *
fc01f7e7
FB
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
3990d09a 24#include "config-host.h"
faf07963 25#include "qemu-common.h"
6d519a5f 26#include "trace.h"
376253ec 27#include "monitor.h"
ea2384d3 28#include "block_int.h"
2f0c9fe6 29#include "blockjob.h"
5efa9d5a 30#include "module.h"
f795e743 31#include "qjson.h"
3e1caa5f 32#include "sysemu.h"
d7d512f6 33#include "notify.h"
68485420 34#include "qemu-coroutine.h"
b2023818 35#include "qmp-commands.h"
0563e191 36#include "qemu-timer.h"
fc01f7e7 37
71e72a19 38#ifdef CONFIG_BSD
7674e7bf
FB
39#include <sys/types.h>
40#include <sys/stat.h>
41#include <sys/ioctl.h>
72cf2d4f 42#include <sys/queue.h>
c5e97233 43#ifndef __DragonFly__
7674e7bf
FB
44#include <sys/disk.h>
45#endif
c5e97233 46#endif
7674e7bf 47
49dc768d
AL
48#ifdef _WIN32
49#include <windows.h>
50#endif
51
1c9805a3
SH
/* Placeholder result stored while an emulated sync operation is in progress */
#define NOT_DONE 0x7fffffff

/*
 * Bit flags accepted by the internal bdrv_co_do_readv()/bdrv_co_do_writev()
 * helpers declared below.
 */
typedef enum {
    BDRV_REQ_COPY_ON_READ = 1 << 0,
    BDRV_REQ_ZERO_WRITE   = 1 << 1,
} BdrvRequestFlags;
58
7d4b4ba5 59static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
f141eafe
AL
60static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
61 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
c87c0672 62 BlockDriverCompletionFunc *cb, void *opaque);
f141eafe
AL
63static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
64 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 65 BlockDriverCompletionFunc *cb, void *opaque);
f9f05dc5
KW
66static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
67 int64_t sector_num, int nb_sectors,
68 QEMUIOVector *iov);
69static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
70 int64_t sector_num, int nb_sectors,
71 QEMUIOVector *iov);
c5fbe571 72static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
470c0504
SH
73 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74 BdrvRequestFlags flags);
1c9805a3 75static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
f08f2dda
SH
76 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
77 BdrvRequestFlags flags);
b2a61371
SH
78static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
79 int64_t sector_num,
80 QEMUIOVector *qiov,
81 int nb_sectors,
82 BlockDriverCompletionFunc *cb,
83 void *opaque,
8c5873d6 84 bool is_write);
b2a61371 85static void coroutine_fn bdrv_co_do_rw(void *opaque);
621f0589
KW
86static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
87 int64_t sector_num, int nb_sectors);
ec530c81 88
98f90dba
ZYW
89static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
90 bool is_write, double elapsed_time, uint64_t *wait);
91static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
92 double elapsed_time, uint64_t *wait);
93static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
94 bool is_write, int64_t *wait);
95
1b7bdbc1
SH
96static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
97 QTAILQ_HEAD_INITIALIZER(bdrv_states);
7ee930d0 98
8a22f02a
SH
99static QLIST_HEAD(, BlockDriver) bdrv_drivers =
100 QLIST_HEAD_INITIALIZER(bdrv_drivers);
ea2384d3 101
f9092b10
MA
102/* The device to use for VM snapshots */
103static BlockDriverState *bs_snapshots;
104
eb852011
MA
105/* If non-zero, use only whitelisted block drivers */
106static int use_bdrv_whitelist;
107
9e0b22f4
SH
#ifdef _WIN32
/* Return non-zero if filename starts with an ASCII letter followed by ':' */
static int is_windows_drive_prefix(const char *filename)
{
    const char letter = filename[0];
    const int is_letter = (letter >= 'a' && letter <= 'z') ||
                          (letter >= 'A' && letter <= 'Z');

    /* filename[1] is only inspected when the first character is a letter */
    return is_letter && filename[1] == ':';
}

/*
 * Return 1 if filename is exactly a drive specification such as "c:", or
 * starts with one of the prefixes "\\.\" or "//./"; 0 otherwise.
 */
int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }
    return strstart(filename, "\\\\.\\", NULL) ||
           strstart(filename, "//./", NULL);
}
#endif
127
0563e191 128/* throttling disk I/O limits */
98f90dba
ZYW
129void bdrv_io_limits_disable(BlockDriverState *bs)
130{
131 bs->io_limits_enabled = false;
132
133 while (qemu_co_queue_next(&bs->throttled_reqs));
134
135 if (bs->block_timer) {
136 qemu_del_timer(bs->block_timer);
137 qemu_free_timer(bs->block_timer);
138 bs->block_timer = NULL;
139 }
140
141 bs->slice_start = 0;
142 bs->slice_end = 0;
143 bs->slice_time = 0;
144 memset(&bs->io_base, 0, sizeof(bs->io_base));
145}
146
0563e191
ZYW
147static void bdrv_block_timer(void *opaque)
148{
149 BlockDriverState *bs = opaque;
150
151 qemu_co_queue_next(&bs->throttled_reqs);
152}
153
154void bdrv_io_limits_enable(BlockDriverState *bs)
155{
156 qemu_co_queue_init(&bs->throttled_reqs);
157 bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
158 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
159 bs->slice_start = qemu_get_clock_ns(vm_clock);
160 bs->slice_end = bs->slice_start + bs->slice_time;
161 memset(&bs->io_base, 0, sizeof(bs->io_base));
162 bs->io_limits_enabled = true;
163}
164
165bool bdrv_io_limits_enabled(BlockDriverState *bs)
166{
167 BlockIOLimit *io_limits = &bs->io_limits;
168 return io_limits->bps[BLOCK_IO_LIMIT_READ]
169 || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
170 || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
171 || io_limits->iops[BLOCK_IO_LIMIT_READ]
172 || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
173 || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
174}
175
98f90dba
ZYW
176static void bdrv_io_limits_intercept(BlockDriverState *bs,
177 bool is_write, int nb_sectors)
178{
179 int64_t wait_time = -1;
180
181 if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
182 qemu_co_queue_wait(&bs->throttled_reqs);
183 }
184
185 /* In fact, we hope to keep each request's timing, in FIFO mode. The next
186 * throttled requests will not be dequeued until the current request is
187 * allowed to be serviced. So if the current request still exceeds the
188 * limits, it will be inserted to the head. All requests followed it will
189 * be still in throttled_reqs queue.
190 */
191
192 while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
193 qemu_mod_timer(bs->block_timer,
194 wait_time + qemu_get_clock_ns(vm_clock));
195 qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
196 }
197
198 qemu_co_queue_next(&bs->throttled_reqs);
199}
200
9e0b22f4
SH
/*
 * Check whether path starts with "<protocol>:".
 *
 * Returns non-zero when the first character out of ':' and '/' (plus '\\'
 * on Windows) found in path is a ':'. On Windows, drive specifications are
 * never treated as protocols.
 */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    static const char stop_chars[] = ":/\\";

    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 0;
    }
#else
    static const char stop_chars[] = ":/";
#endif

    /* strcspn() yields the index of the first stop character (or the NUL) */
    return path[strcspn(path, stop_chars)] == ':';
}
218
/*
 * Return 1 if path is absolute, 0 otherwise.
 *
 * A path is absolute when it starts with '/'. On Windows a leading '\\' and
 * any drive specification (including names like "\\.\d:") also count.
 */
int path_is_absolute(const char *path)
{
    const char first = path[0];

#ifdef _WIN32
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    if (first == '\\') {
        return 1;
    }
#endif
    return first == '/';
}
231
83f64091
FB
/*
 * Build in dest (of dest_size bytes) the path designated by filename.
 *
 * If filename is absolute it is copied as is. Otherwise it is appended to
 * the directory part of base_path, i.e. everything up to and including the
 * later of: the first ':' (so URL-style prefixes are kept) and the last
 * directory separator. Nothing is written when dest_size <= 0.
 */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *prefix_end;
    const char *after_sep;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }

    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* Position just past the first ':', or the start if there is none */
    prefix_end = strchr(base_path, ':');
    prefix_end = prefix_end ? prefix_end + 1 : base_path;

    /* Position just past the last separator, or the start if there is none */
    after_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *last_backslash = strrchr(base_path, '\\');
        if (!after_sep || last_backslash > after_sep) {
            after_sep = last_backslash;
        }
    }
#endif
    after_sep = after_sep ? after_sep + 1 : base_path;

    if (after_sep > prefix_end) {
        prefix_end = after_sep;
    }

    /* Copy the (possibly truncated) prefix, then append filename */
    prefix_len = prefix_end - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
275
dc5a1371
PB
276void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz)
277{
278 if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) {
279 pstrcpy(dest, sz, bs->backing_file);
280 } else {
281 path_combine(dest, sz, bs->filename, bs->backing_file);
282 }
283}
284
5efa9d5a 285void bdrv_register(BlockDriver *bdrv)
ea2384d3 286{
8c5873d6
SH
287 /* Block drivers without coroutine functions need emulation */
288 if (!bdrv->bdrv_co_readv) {
f9f05dc5
KW
289 bdrv->bdrv_co_readv = bdrv_co_readv_em;
290 bdrv->bdrv_co_writev = bdrv_co_writev_em;
291
f8c35c1d
SH
292 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
293 * the block driver lacks aio we need to emulate that too.
294 */
f9f05dc5
KW
295 if (!bdrv->bdrv_aio_readv) {
296 /* add AIO emulation layer */
297 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
298 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
f9f05dc5 299 }
83f64091 300 }
b2e12bc6 301
8a22f02a 302 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
ea2384d3 303}
b338082b
FB
304
305/* create a new block device (by default it is empty) */
306BlockDriverState *bdrv_new(const char *device_name)
307{
1b7bdbc1 308 BlockDriverState *bs;
b338082b 309
7267c094 310 bs = g_malloc0(sizeof(BlockDriverState));
b338082b 311 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
ea2384d3 312 if (device_name[0] != '\0') {
1b7bdbc1 313 QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
ea2384d3 314 }
28a7282a 315 bdrv_iostatus_disable(bs);
d7d512f6
PB
316 notifier_list_init(&bs->close_notifiers);
317
b338082b
FB
318 return bs;
319}
320
d7d512f6
PB
321void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
322{
323 notifier_list_add(&bs->close_notifiers, notify);
324}
325
ea2384d3
FB
326BlockDriver *bdrv_find_format(const char *format_name)
327{
328 BlockDriver *drv1;
8a22f02a
SH
329 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
330 if (!strcmp(drv1->format_name, format_name)) {
ea2384d3 331 return drv1;
8a22f02a 332 }
ea2384d3
FB
333 }
334 return NULL;
335}
336
eb852011
MA
337static int bdrv_is_whitelisted(BlockDriver *drv)
338{
339 static const char *whitelist[] = {
340 CONFIG_BDRV_WHITELIST
341 };
342 const char **p;
343
344 if (!whitelist[0])
345 return 1; /* no whitelist, anything goes */
346
347 for (p = whitelist; *p; p++) {
348 if (!strcmp(drv->format_name, *p)) {
349 return 1;
350 }
351 }
352 return 0;
353}
354
355BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
356{
357 BlockDriver *drv = bdrv_find_format(format_name);
358 return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
359}
360
5b7e1542
ZYW
361typedef struct CreateCo {
362 BlockDriver *drv;
363 char *filename;
364 QEMUOptionParameter *options;
365 int ret;
366} CreateCo;
367
368static void coroutine_fn bdrv_create_co_entry(void *opaque)
369{
370 CreateCo *cco = opaque;
371 assert(cco->drv);
372
373 cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
374}
375
0e7e1989
KW
376int bdrv_create(BlockDriver *drv, const char* filename,
377 QEMUOptionParameter *options)
ea2384d3 378{
5b7e1542
ZYW
379 int ret;
380
381 Coroutine *co;
382 CreateCo cco = {
383 .drv = drv,
384 .filename = g_strdup(filename),
385 .options = options,
386 .ret = NOT_DONE,
387 };
388
389 if (!drv->bdrv_create) {
ea2384d3 390 return -ENOTSUP;
5b7e1542
ZYW
391 }
392
393 if (qemu_in_coroutine()) {
394 /* Fast-path if already in coroutine context */
395 bdrv_create_co_entry(&cco);
396 } else {
397 co = qemu_coroutine_create(bdrv_create_co_entry);
398 qemu_coroutine_enter(co, &cco);
399 while (cco.ret == NOT_DONE) {
400 qemu_aio_wait();
401 }
402 }
403
404 ret = cco.ret;
405 g_free(cco.filename);
0e7e1989 406
5b7e1542 407 return ret;
ea2384d3
FB
408}
409
84a12e66
CH
410int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
411{
412 BlockDriver *drv;
413
b50cbabc 414 drv = bdrv_find_protocol(filename);
84a12e66 415 if (drv == NULL) {
16905d71 416 return -ENOENT;
84a12e66
CH
417 }
418
419 return bdrv_create(drv, filename, options);
420}
421
eba25057
JM
/*
 * Create a uniquely-named empty temporary file.
 *
 * filename receives the name of the file and must provide size bytes.
 * On POSIX hosts the file is created below $TMPDIR, or below /tmp when
 * TMPDIR is not set.
 *
 * Return 0 upon success, otherwise a negative errno value (-EOVERFLOW when
 * the name does not fit into filename). On Windows the failure value is the
 * negated GetLastError() code.
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    int saved_errno;
    const char *tmpdir;

    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }
    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }
    if (close(fd) != 0) {
        /* unlink() may itself modify errno, so capture close()'s error
         * before removing the file. */
        saved_errno = errno;
        unlink(filename);
        return -saved_errno;
    }
    return 0;
#endif
}
fc01f7e7 456
84a12e66
CH
457/*
458 * Detect host devices. By convention, /dev/cdrom[N] is always
459 * recognized as a host CDROM.
460 */
461static BlockDriver *find_hdev_driver(const char *filename)
462{
463 int score_max = 0, score;
464 BlockDriver *drv = NULL, *d;
465
466 QLIST_FOREACH(d, &bdrv_drivers, list) {
467 if (d->bdrv_probe_device) {
468 score = d->bdrv_probe_device(filename);
469 if (score > score_max) {
470 score_max = score;
471 drv = d;
472 }
473 }
474 }
475
476 return drv;
477}
478
b50cbabc 479BlockDriver *bdrv_find_protocol(const char *filename)
83f64091
FB
480{
481 BlockDriver *drv1;
482 char protocol[128];
1cec71e3 483 int len;
83f64091 484 const char *p;
19cb3738 485
66f82cee
KW
486 /* TODO Drivers without bdrv_file_open must be specified explicitly */
487
39508e7a
CH
488 /*
489 * XXX(hch): we really should not let host device detection
490 * override an explicit protocol specification, but moving this
491 * later breaks access to device names with colons in them.
492 * Thanks to the brain-dead persistent naming schemes on udev-
493 * based Linux systems those actually are quite common.
494 */
495 drv1 = find_hdev_driver(filename);
496 if (drv1) {
497 return drv1;
498 }
499
9e0b22f4 500 if (!path_has_protocol(filename)) {
39508e7a 501 return bdrv_find_format("file");
84a12e66 502 }
9e0b22f4
SH
503 p = strchr(filename, ':');
504 assert(p != NULL);
1cec71e3
AL
505 len = p - filename;
506 if (len > sizeof(protocol) - 1)
507 len = sizeof(protocol) - 1;
508 memcpy(protocol, filename, len);
509 protocol[len] = '\0';
8a22f02a 510 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
5fafdf24 511 if (drv1->protocol_name &&
8a22f02a 512 !strcmp(drv1->protocol_name, protocol)) {
83f64091 513 return drv1;
8a22f02a 514 }
83f64091
FB
515 }
516 return NULL;
517}
518
c98ac35d 519static int find_image_format(const char *filename, BlockDriver **pdrv)
f3a5d3f8
CH
520{
521 int ret, score, score_max;
522 BlockDriver *drv1, *drv;
523 uint8_t buf[2048];
524 BlockDriverState *bs;
525
f5edb014 526 ret = bdrv_file_open(&bs, filename, 0);
c98ac35d
SW
527 if (ret < 0) {
528 *pdrv = NULL;
529 return ret;
530 }
f8ea0b00 531
08a00559
KW
532 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
533 if (bs->sg || !bdrv_is_inserted(bs)) {
1a396859 534 bdrv_delete(bs);
c98ac35d
SW
535 drv = bdrv_find_format("raw");
536 if (!drv) {
537 ret = -ENOENT;
538 }
539 *pdrv = drv;
540 return ret;
1a396859 541 }
f8ea0b00 542
83f64091
FB
543 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
544 bdrv_delete(bs);
545 if (ret < 0) {
c98ac35d
SW
546 *pdrv = NULL;
547 return ret;
83f64091
FB
548 }
549
ea2384d3 550 score_max = 0;
84a12e66 551 drv = NULL;
8a22f02a 552 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
83f64091
FB
553 if (drv1->bdrv_probe) {
554 score = drv1->bdrv_probe(buf, ret, filename);
555 if (score > score_max) {
556 score_max = score;
557 drv = drv1;
558 }
0849bf08 559 }
fc01f7e7 560 }
c98ac35d
SW
561 if (!drv) {
562 ret = -ENOENT;
563 }
564 *pdrv = drv;
565 return ret;
ea2384d3
FB
566}
567
51762288
SH
568/**
569 * Set the current 'total_sectors' value
570 */
571static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
572{
573 BlockDriver *drv = bs->drv;
574
396759ad
NB
575 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
576 if (bs->sg)
577 return 0;
578
51762288
SH
579 /* query actual device if possible, otherwise just trust the hint */
580 if (drv->bdrv_getlength) {
581 int64_t length = drv->bdrv_getlength(bs);
582 if (length < 0) {
583 return length;
584 }
585 hint = length >> BDRV_SECTOR_BITS;
586 }
587
588 bs->total_sectors = hint;
589 return 0;
590}
591
c3993cdc
SH
592/**
593 * Set open flags for a given cache mode
594 *
595 * Return 0 on success, -1 if the cache mode was invalid.
596 */
597int bdrv_parse_cache_flags(const char *mode, int *flags)
598{
599 *flags &= ~BDRV_O_CACHE_MASK;
600
601 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
602 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
92196b2f
SH
603 } else if (!strcmp(mode, "directsync")) {
604 *flags |= BDRV_O_NOCACHE;
c3993cdc
SH
605 } else if (!strcmp(mode, "writeback")) {
606 *flags |= BDRV_O_CACHE_WB;
607 } else if (!strcmp(mode, "unsafe")) {
608 *flags |= BDRV_O_CACHE_WB;
609 *flags |= BDRV_O_NO_FLUSH;
610 } else if (!strcmp(mode, "writethrough")) {
611 /* this is the default */
612 } else {
613 return -1;
614 }
615
616 return 0;
617}
618
53fec9d3
SH
619/**
620 * The copy-on-read flag is actually a reference count so multiple users may
621 * use the feature without worrying about clobbering its previous state.
622 * Copy-on-read stays enabled until all users have called to disable it.
623 */
624void bdrv_enable_copy_on_read(BlockDriverState *bs)
625{
626 bs->copy_on_read++;
627}
628
629void bdrv_disable_copy_on_read(BlockDriverState *bs)
630{
631 assert(bs->copy_on_read > 0);
632 bs->copy_on_read--;
633}
634
57915332
KW
635/*
636 * Common part for opening disk images and files
637 */
638static int bdrv_open_common(BlockDriverState *bs, const char *filename,
639 int flags, BlockDriver *drv)
640{
641 int ret, open_flags;
642
643 assert(drv != NULL);
6405875c 644 assert(bs->file == NULL);
57915332 645
28dcee10
SH
646 trace_bdrv_open_common(bs, filename, flags, drv->format_name);
647
57915332 648 bs->open_flags = flags;
57915332
KW
649 bs->buffer_alignment = 512;
650
53fec9d3
SH
651 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
652 if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
653 bdrv_enable_copy_on_read(bs);
654 }
655
57915332
KW
656 pstrcpy(bs->filename, sizeof(bs->filename), filename);
657
658 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
659 return -ENOTSUP;
660 }
661
662 bs->drv = drv;
7267c094 663 bs->opaque = g_malloc0(drv->instance_size);
57915332 664
03f541bd 665 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
e1e9b0ac 666 open_flags = flags | BDRV_O_CACHE_WB;
57915332
KW
667
668 /*
669 * Clear flags that are internal to the block layer before opening the
670 * image.
671 */
e1e9b0ac 672 open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
57915332
KW
673
674 /*
ebabb67a 675 * Snapshots should be writable.
57915332
KW
676 */
677 if (bs->is_temporary) {
678 open_flags |= BDRV_O_RDWR;
679 }
680
be028adc 681 bs->read_only = !(open_flags & BDRV_O_RDWR);
e7c63796 682
66f82cee
KW
683 /* Open the image, either directly or using a protocol */
684 if (drv->bdrv_file_open) {
685 ret = drv->bdrv_file_open(bs, filename, open_flags);
686 } else {
687 ret = bdrv_file_open(&bs->file, filename, open_flags);
688 if (ret >= 0) {
689 ret = drv->bdrv_open(bs, open_flags);
690 }
691 }
692
57915332
KW
693 if (ret < 0) {
694 goto free_and_fail;
695 }
696
51762288
SH
697 ret = refresh_total_sectors(bs, bs->total_sectors);
698 if (ret < 0) {
699 goto free_and_fail;
57915332 700 }
51762288 701
57915332
KW
702#ifndef _WIN32
703 if (bs->is_temporary) {
704 unlink(filename);
705 }
706#endif
707 return 0;
708
709free_and_fail:
66f82cee
KW
710 if (bs->file) {
711 bdrv_delete(bs->file);
712 bs->file = NULL;
713 }
7267c094 714 g_free(bs->opaque);
57915332
KW
715 bs->opaque = NULL;
716 bs->drv = NULL;
717 return ret;
718}
719
b6ce07aa
KW
720/*
721 * Opens a file using a protocol (file, host_device, nbd, ...)
722 */
83f64091 723int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
ea2384d3 724{
83f64091 725 BlockDriverState *bs;
6db95603 726 BlockDriver *drv;
83f64091
FB
727 int ret;
728
b50cbabc 729 drv = bdrv_find_protocol(filename);
6db95603
CH
730 if (!drv) {
731 return -ENOENT;
732 }
733
83f64091 734 bs = bdrv_new("");
b6ce07aa 735 ret = bdrv_open_common(bs, filename, flags, drv);
83f64091
FB
736 if (ret < 0) {
737 bdrv_delete(bs);
738 return ret;
3b0d4f61 739 }
71d0770c 740 bs->growable = 1;
83f64091
FB
741 *pbs = bs;
742 return 0;
743}
744
b6ce07aa
KW
745/*
746 * Opens a disk image (raw, qcow2, vmdk, ...)
747 */
d6e9098e
KW
748int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
749 BlockDriver *drv)
ea2384d3 750{
b6ce07aa 751 int ret;
2b572816 752 char tmp_filename[PATH_MAX];
712e7874 753
83f64091 754 if (flags & BDRV_O_SNAPSHOT) {
ea2384d3
FB
755 BlockDriverState *bs1;
756 int64_t total_size;
7c96d46e 757 int is_protocol = 0;
91a073a9
KW
758 BlockDriver *bdrv_qcow2;
759 QEMUOptionParameter *options;
b6ce07aa 760 char backing_filename[PATH_MAX];
3b46e624 761
ea2384d3
FB
762 /* if snapshot, we create a temporary backing file and open it
763 instead of opening 'filename' directly */
33e3963e 764
ea2384d3
FB
765 /* if there is a backing file, use it */
766 bs1 = bdrv_new("");
d6e9098e 767 ret = bdrv_open(bs1, filename, 0, drv);
51d7c00c 768 if (ret < 0) {
ea2384d3 769 bdrv_delete(bs1);
51d7c00c 770 return ret;
ea2384d3 771 }
3e82990b 772 total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
7c96d46e
AL
773
774 if (bs1->drv && bs1->drv->protocol_name)
775 is_protocol = 1;
776
ea2384d3 777 bdrv_delete(bs1);
3b46e624 778
eba25057
JM
779 ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
780 if (ret < 0) {
781 return ret;
782 }
7c96d46e
AL
783
784 /* Real path is meaningless for protocols */
785 if (is_protocol)
786 snprintf(backing_filename, sizeof(backing_filename),
787 "%s", filename);
114cdfa9
KS
788 else if (!realpath(filename, backing_filename))
789 return -errno;
7c96d46e 790
91a073a9
KW
791 bdrv_qcow2 = bdrv_find_format("qcow2");
792 options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
793
3e82990b 794 set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
91a073a9
KW
795 set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
796 if (drv) {
797 set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
798 drv->format_name);
799 }
800
801 ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
d748768c 802 free_option_parameters(options);
51d7c00c
AL
803 if (ret < 0) {
804 return ret;
ea2384d3 805 }
91a073a9 806
ea2384d3 807 filename = tmp_filename;
91a073a9 808 drv = bdrv_qcow2;
ea2384d3
FB
809 bs->is_temporary = 1;
810 }
712e7874 811
b6ce07aa 812 /* Find the right image format driver */
6db95603 813 if (!drv) {
c98ac35d 814 ret = find_image_format(filename, &drv);
51d7c00c 815 }
6987307c 816
51d7c00c 817 if (!drv) {
51d7c00c 818 goto unlink_and_fail;
ea2384d3 819 }
b6ce07aa 820
be028adc
JC
821 if (flags & BDRV_O_RDWR) {
822 flags |= BDRV_O_ALLOW_RDWR;
823 }
824
b6ce07aa
KW
825 /* Open the image */
826 ret = bdrv_open_common(bs, filename, flags, drv);
827 if (ret < 0) {
6987307c
CH
828 goto unlink_and_fail;
829 }
830
b6ce07aa
KW
831 /* If there is a backing file, use it */
832 if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
833 char backing_filename[PATH_MAX];
834 int back_flags;
835 BlockDriver *back_drv = NULL;
836
837 bs->backing_hd = bdrv_new("");
dc5a1371
PB
838 bdrv_get_full_backing_filename(bs, backing_filename,
839 sizeof(backing_filename));
df2dbb4a
SH
840
841 if (bs->backing_format[0] != '\0') {
b6ce07aa 842 back_drv = bdrv_find_format(bs->backing_format);
df2dbb4a 843 }
b6ce07aa
KW
844
845 /* backing files always opened read-only */
846 back_flags =
847 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
848
849 ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
850 if (ret < 0) {
851 bdrv_close(bs);
852 return ret;
853 }
b6ce07aa
KW
854 }
855
856 if (!bdrv_key_required(bs)) {
7d4b4ba5 857 bdrv_dev_change_media_cb(bs, true);
b6ce07aa
KW
858 }
859
98f90dba
ZYW
860 /* throttling disk I/O limits */
861 if (bs->io_limits_enabled) {
862 bdrv_io_limits_enable(bs);
863 }
864
b6ce07aa
KW
865 return 0;
866
867unlink_and_fail:
868 if (bs->is_temporary) {
869 unlink(filename);
870 }
871 return ret;
872}
873
e971aa12
JC
874typedef struct BlockReopenQueueEntry {
875 bool prepared;
876 BDRVReopenState state;
877 QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry;
878} BlockReopenQueueEntry;
879
880/*
881 * Adds a BlockDriverState to a simple queue for an atomic, transactional
882 * reopen of multiple devices.
883 *
884 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT
885 * already performed, or alternatively may be NULL a new BlockReopenQueue will
886 * be created and initialized. This newly created BlockReopenQueue should be
887 * passed back in for subsequent calls that are intended to be of the same
888 * atomic 'set'.
889 *
890 * bs is the BlockDriverState to add to the reopen queue.
891 *
892 * flags contains the open flags for the associated bs
893 *
894 * returns a pointer to bs_queue, which is either the newly allocated
895 * bs_queue, or the existing bs_queue being used.
896 *
897 */
898BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
899 BlockDriverState *bs, int flags)
900{
901 assert(bs != NULL);
902
903 BlockReopenQueueEntry *bs_entry;
904 if (bs_queue == NULL) {
905 bs_queue = g_new0(BlockReopenQueue, 1);
906 QSIMPLEQ_INIT(bs_queue);
907 }
908
909 if (bs->file) {
910 bdrv_reopen_queue(bs_queue, bs->file, flags);
911 }
912
913 bs_entry = g_new0(BlockReopenQueueEntry, 1);
914 QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry);
915
916 bs_entry->state.bs = bs;
917 bs_entry->state.flags = flags;
918
919 return bs_queue;
920}
921
922/*
923 * Reopen multiple BlockDriverStates atomically & transactionally.
924 *
925 * The queue passed in (bs_queue) must have been built up previous
926 * via bdrv_reopen_queue().
927 *
928 * Reopens all BDS specified in the queue, with the appropriate
929 * flags. All devices are prepared for reopen, and failure of any
930 * device will cause all device changes to be abandonded, and intermediate
931 * data cleaned up.
932 *
933 * If all devices prepare successfully, then the changes are committed
934 * to all devices.
935 *
936 */
937int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
938{
939 int ret = -1;
940 BlockReopenQueueEntry *bs_entry, *next;
941 Error *local_err = NULL;
942
943 assert(bs_queue != NULL);
944
945 bdrv_drain_all();
946
947 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
948 if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
949 error_propagate(errp, local_err);
950 goto cleanup;
951 }
952 bs_entry->prepared = true;
953 }
954
955 /* If we reach this point, we have success and just need to apply the
956 * changes
957 */
958 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
959 bdrv_reopen_commit(&bs_entry->state);
960 }
961
962 ret = 0;
963
964cleanup:
965 QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
966 if (ret && bs_entry->prepared) {
967 bdrv_reopen_abort(&bs_entry->state);
968 }
969 g_free(bs_entry);
970 }
971 g_free(bs_queue);
972 return ret;
973}
974
975
976/* Reopen a single BlockDriverState with the specified flags. */
977int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp)
978{
979 int ret = -1;
980 Error *local_err = NULL;
981 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags);
982
983 ret = bdrv_reopen_multiple(queue, &local_err);
984 if (local_err != NULL) {
985 error_propagate(errp, local_err);
986 }
987 return ret;
988}
989
990
991/*
992 * Prepares a BlockDriverState for reopen. All changes are staged in the
993 * 'opaque' field of the BDRVReopenState, which is used and allocated by
994 * the block driver layer .bdrv_reopen_prepare()
995 *
996 * bs is the BlockDriverState to reopen
997 * flags are the new open flags
998 * queue is the reopen queue
999 *
1000 * Returns 0 on success, non-zero on error. On error errp will be set
1001 * as well.
1002 *
1003 * On failure, bdrv_reopen_abort() will be called to clean up any data.
1004 * It is the responsibility of the caller to then call the abort() or
1005 * commit() for any other BDS that have been left in a prepare() state
1006 *
1007 */
1008int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
1009 Error **errp)
1010{
1011 int ret = -1;
1012 Error *local_err = NULL;
1013 BlockDriver *drv;
1014
1015 assert(reopen_state != NULL);
1016 assert(reopen_state->bs->drv != NULL);
1017 drv = reopen_state->bs->drv;
1018
1019 /* if we are to stay read-only, do not allow permission change
1020 * to r/w */
1021 if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) &&
1022 reopen_state->flags & BDRV_O_RDWR) {
1023 error_set(errp, QERR_DEVICE_IS_READ_ONLY,
1024 reopen_state->bs->device_name);
1025 goto error;
1026 }
1027
1028
1029 ret = bdrv_flush(reopen_state->bs);
1030 if (ret) {
1031 error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
1032 strerror(-ret));
1033 goto error;
1034 }
1035
1036 if (drv->bdrv_reopen_prepare) {
1037 ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err);
1038 if (ret) {
1039 if (local_err != NULL) {
1040 error_propagate(errp, local_err);
1041 } else {
1042 error_set(errp, QERR_OPEN_FILE_FAILED,
1043 reopen_state->bs->filename);
1044 }
1045 goto error;
1046 }
1047 } else {
1048 /* It is currently mandatory to have a bdrv_reopen_prepare()
1049 * handler for each supported drv. */
1050 error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
1051 drv->format_name, reopen_state->bs->device_name,
1052 "reopening of file");
1053 ret = -1;
1054 goto error;
1055 }
1056
1057 ret = 0;
1058
1059error:
1060 return ret;
1061}
1062
1063/*
1064 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and
1065 * makes them final by swapping the staging BlockDriverState contents into
1066 * the active BlockDriverState contents.
1067 */
1068void bdrv_reopen_commit(BDRVReopenState *reopen_state)
1069{
1070 BlockDriver *drv;
1071
1072 assert(reopen_state != NULL);
1073 drv = reopen_state->bs->drv;
1074 assert(drv != NULL);
1075
1076 /* If there are any driver level actions to take */
1077 if (drv->bdrv_reopen_commit) {
1078 drv->bdrv_reopen_commit(reopen_state);
1079 }
1080
1081 /* set BDS specific flags now */
1082 reopen_state->bs->open_flags = reopen_state->flags;
1083 reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
1084 BDRV_O_CACHE_WB);
1085 reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
1086}
1087
1088/*
1089 * Abort the reopen, and delete and free the staged changes in
1090 * reopen_state
1091 */
1092void bdrv_reopen_abort(BDRVReopenState *reopen_state)
1093{
1094 BlockDriver *drv;
1095
1096 assert(reopen_state != NULL);
1097 drv = reopen_state->bs->drv;
1098 assert(drv != NULL);
1099
1100 if (drv->bdrv_reopen_abort) {
1101 drv->bdrv_reopen_abort(reopen_state);
1102 }
1103}
1104
1105
fc01f7e7
FB
1106void bdrv_close(BlockDriverState *bs)
1107{
80ccf93b 1108 bdrv_flush(bs);
3cbc002c
PB
1109 if (bs->job) {
1110 block_job_cancel_sync(bs->job);
1111 }
1112 bdrv_drain_all();
d7d512f6 1113 notifier_list_notify(&bs->close_notifiers, bs);
7094f12f 1114
3cbc002c 1115 if (bs->drv) {
f9092b10
MA
1116 if (bs == bs_snapshots) {
1117 bs_snapshots = NULL;
1118 }
557df6ac 1119 if (bs->backing_hd) {
ea2384d3 1120 bdrv_delete(bs->backing_hd);
557df6ac
SH
1121 bs->backing_hd = NULL;
1122 }
ea2384d3 1123 bs->drv->bdrv_close(bs);
7267c094 1124 g_free(bs->opaque);
ea2384d3
FB
1125#ifdef _WIN32
1126 if (bs->is_temporary) {
1127 unlink(bs->filename);
1128 }
67b915a5 1129#endif
ea2384d3
FB
1130 bs->opaque = NULL;
1131 bs->drv = NULL;
53fec9d3 1132 bs->copy_on_read = 0;
a275fa42
PB
1133 bs->backing_file[0] = '\0';
1134 bs->backing_format[0] = '\0';
6405875c
PB
1135 bs->total_sectors = 0;
1136 bs->encrypted = 0;
1137 bs->valid_key = 0;
1138 bs->sg = 0;
1139 bs->growable = 0;
b338082b 1140
66f82cee 1141 if (bs->file != NULL) {
0ac9377d
PB
1142 bdrv_delete(bs->file);
1143 bs->file = NULL;
66f82cee 1144 }
b338082b 1145 }
98f90dba 1146
9ca11154
PH
1147 bdrv_dev_change_media_cb(bs, false);
1148
98f90dba
ZYW
1149 /*throttling disk I/O limits*/
1150 if (bs->io_limits_enabled) {
1151 bdrv_io_limits_disable(bs);
1152 }
b338082b
FB
1153}
1154
2bc93fed
MK
1155void bdrv_close_all(void)
1156{
1157 BlockDriverState *bs;
1158
1159 QTAILQ_FOREACH(bs, &bdrv_states, list) {
1160 bdrv_close(bs);
1161 }
1162}
1163
922453bc
SH
1164/*
1165 * Wait for pending requests to complete across all BlockDriverStates
1166 *
1167 * This function does not flush data to disk, use bdrv_flush_all() for that
1168 * after calling this function.
4c355d53
ZYW
1169 *
1170 * Note that completion of an asynchronous I/O operation can trigger any
1171 * number of other I/O operations on other devices---for example a coroutine
1172 * can be arbitrarily complex and a constant flow of I/O can come until the
1173 * coroutine is complete. Because of this, it is not possible to have a
1174 * function to drain a single device's I/O queue.
922453bc
SH
1175 */
1176void bdrv_drain_all(void)
1177{
1178 BlockDriverState *bs;
4c355d53
ZYW
1179 bool busy;
1180
1181 do {
1182 busy = qemu_aio_wait();
922453bc 1183
4c355d53
ZYW
1184 /* FIXME: We do not have timer support here, so this is effectively
1185 * a busy wait.
1186 */
1187 QTAILQ_FOREACH(bs, &bdrv_states, list) {
1188 if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
1189 qemu_co_queue_restart_all(&bs->throttled_reqs);
1190 busy = true;
1191 }
1192 }
1193 } while (busy);
922453bc
SH
1194
1195 /* If requests are still pending there is a bug somewhere */
1196 QTAILQ_FOREACH(bs, &bdrv_states, list) {
1197 assert(QLIST_EMPTY(&bs->tracked_requests));
1198 assert(qemu_co_queue_empty(&bs->throttled_reqs));
1199 }
1200}
1201
d22b2f41
RH
1202/* make a BlockDriverState anonymous by removing from bdrv_state list.
1203 Also, NULL terminate the device_name to prevent double remove */
1204void bdrv_make_anon(BlockDriverState *bs)
1205{
1206 if (bs->device_name[0] != '\0') {
1207 QTAILQ_REMOVE(&bdrv_states, bs, list);
1208 }
1209 bs->device_name[0] = '\0';
1210}
1211
e023b2e2
PB
1212static void bdrv_rebind(BlockDriverState *bs)
1213{
1214 if (bs->drv && bs->drv->bdrv_rebind) {
1215 bs->drv->bdrv_rebind(bs);
1216 }
1217}
1218
4ddc07ca
PB
1219static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
1220 BlockDriverState *bs_src)
8802d1fd 1221{
4ddc07ca
PB
1222 /* move some fields that need to stay attached to the device */
1223 bs_dest->open_flags = bs_src->open_flags;
8802d1fd
JC
1224
1225 /* dev info */
4ddc07ca
PB
1226 bs_dest->dev_ops = bs_src->dev_ops;
1227 bs_dest->dev_opaque = bs_src->dev_opaque;
1228 bs_dest->dev = bs_src->dev;
1229 bs_dest->buffer_alignment = bs_src->buffer_alignment;
1230 bs_dest->copy_on_read = bs_src->copy_on_read;
8802d1fd 1231
4ddc07ca 1232 bs_dest->enable_write_cache = bs_src->enable_write_cache;
c4a248a1 1233
8802d1fd 1234 /* i/o timing parameters */
4ddc07ca
PB
1235 bs_dest->slice_time = bs_src->slice_time;
1236 bs_dest->slice_start = bs_src->slice_start;
1237 bs_dest->slice_end = bs_src->slice_end;
1238 bs_dest->io_limits = bs_src->io_limits;
1239 bs_dest->io_base = bs_src->io_base;
1240 bs_dest->throttled_reqs = bs_src->throttled_reqs;
1241 bs_dest->block_timer = bs_src->block_timer;
1242 bs_dest->io_limits_enabled = bs_src->io_limits_enabled;
8802d1fd 1243
8802d1fd 1244 /* r/w error */
4ddc07ca
PB
1245 bs_dest->on_read_error = bs_src->on_read_error;
1246 bs_dest->on_write_error = bs_src->on_write_error;
8802d1fd
JC
1247
1248 /* i/o status */
4ddc07ca
PB
1249 bs_dest->iostatus_enabled = bs_src->iostatus_enabled;
1250 bs_dest->iostatus = bs_src->iostatus;
8802d1fd 1251
a9fc4408 1252 /* dirty bitmap */
4ddc07ca
PB
1253 bs_dest->dirty_count = bs_src->dirty_count;
1254 bs_dest->dirty_bitmap = bs_src->dirty_bitmap;
a9fc4408
PB
1255
1256 /* job */
4ddc07ca
PB
1257 bs_dest->in_use = bs_src->in_use;
1258 bs_dest->job = bs_src->job;
a9fc4408 1259
8802d1fd 1260 /* keep the same entry in bdrv_states */
4ddc07ca
PB
1261 pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
1262 bs_src->device_name);
1263 bs_dest->list = bs_src->list;
1264}
8802d1fd 1265
4ddc07ca
PB
1266/*
1267 * Swap bs contents for two image chains while they are live,
1268 * while keeping required fields on the BlockDriverState that is
1269 * actually attached to a device.
1270 *
1271 * This will modify the BlockDriverState fields, and swap contents
1272 * between bs_new and bs_old. Both bs_new and bs_old are modified.
1273 *
1274 * bs_new is required to be anonymous.
1275 *
1276 * This function does not create any image files.
1277 */
1278void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
1279{
1280 BlockDriverState tmp;
f6801b83 1281
4ddc07ca
PB
1282 /* bs_new must be anonymous and shouldn't have anything fancy enabled */
1283 assert(bs_new->device_name[0] == '\0');
1284 assert(bs_new->dirty_bitmap == NULL);
1285 assert(bs_new->job == NULL);
1286 assert(bs_new->dev == NULL);
1287 assert(bs_new->in_use == 0);
1288 assert(bs_new->io_limits_enabled == false);
1289 assert(bs_new->block_timer == NULL);
8802d1fd 1290
4ddc07ca
PB
1291 tmp = *bs_new;
1292 *bs_new = *bs_old;
1293 *bs_old = tmp;
a9fc4408 1294
4ddc07ca
PB
1295 /* there are some fields that should not be swapped, move them back */
1296 bdrv_move_feature_fields(&tmp, bs_old);
1297 bdrv_move_feature_fields(bs_old, bs_new);
1298 bdrv_move_feature_fields(bs_new, &tmp);
8802d1fd 1299
4ddc07ca
PB
1300 /* bs_new shouldn't be in bdrv_states even after the swap! */
1301 assert(bs_new->device_name[0] == '\0');
1302
1303 /* Check a few fields that should remain attached to the device */
1304 assert(bs_new->dev == NULL);
1305 assert(bs_new->job == NULL);
1306 assert(bs_new->in_use == 0);
1307 assert(bs_new->io_limits_enabled == false);
1308 assert(bs_new->block_timer == NULL);
e023b2e2
PB
1309
1310 bdrv_rebind(bs_new);
4ddc07ca
PB
1311 bdrv_rebind(bs_old);
1312}
1313
1314/*
1315 * Add new bs contents at the top of an image chain while the chain is
1316 * live, while keeping required fields on the top layer.
1317 *
1318 * This will modify the BlockDriverState fields, and swap contents
1319 * between bs_new and bs_top. Both bs_new and bs_top are modified.
1320 *
1321 * bs_new is required to be anonymous.
1322 *
1323 * This function does not create any image files.
1324 */
1325void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
1326{
1327 bdrv_swap(bs_new, bs_top);
1328
1329 /* The contents of 'tmp' will become bs_top, as we are
1330 * swapping bs_new and bs_top contents. */
1331 bs_top->backing_hd = bs_new;
1332 bs_top->open_flags &= ~BDRV_O_NO_BACKING;
1333 pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file),
1334 bs_new->filename);
1335 pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format),
1336 bs_new->drv ? bs_new->drv->format_name : "");
8802d1fd
JC
1337}
1338
b338082b
FB
1339void bdrv_delete(BlockDriverState *bs)
1340{
fa879d62 1341 assert(!bs->dev);
3e914655
PB
1342 assert(!bs->job);
1343 assert(!bs->in_use);
18846dee 1344
1b7bdbc1 1345 /* remove from list, if necessary */
d22b2f41 1346 bdrv_make_anon(bs);
34c6f050 1347
b338082b 1348 bdrv_close(bs);
66f82cee 1349
f9092b10 1350 assert(bs != bs_snapshots);
7267c094 1351 g_free(bs);
fc01f7e7
FB
1352}
1353
fa879d62
MA
1354int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1355/* TODO change to DeviceState *dev when all users are qdevified */
18846dee 1356{
fa879d62 1357 if (bs->dev) {
18846dee
MA
1358 return -EBUSY;
1359 }
fa879d62 1360 bs->dev = dev;
28a7282a 1361 bdrv_iostatus_reset(bs);
18846dee
MA
1362 return 0;
1363}
1364
fa879d62
MA
1365/* TODO qdevified devices don't use this, remove when devices are qdevified */
1366void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
18846dee 1367{
fa879d62
MA
1368 if (bdrv_attach_dev(bs, dev) < 0) {
1369 abort();
1370 }
1371}
1372
1373void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1374/* TODO change to DeviceState *dev when all users are qdevified */
1375{
1376 assert(bs->dev == dev);
1377 bs->dev = NULL;
0e49de52
MA
1378 bs->dev_ops = NULL;
1379 bs->dev_opaque = NULL;
29e05f20 1380 bs->buffer_alignment = 512;
18846dee
MA
1381}
1382
fa879d62
MA
1383/* TODO change to return DeviceState * when all users are qdevified */
1384void *bdrv_get_attached_dev(BlockDriverState *bs)
18846dee 1385{
fa879d62 1386 return bs->dev;
18846dee
MA
1387}
1388
0e49de52
MA
1389void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1390 void *opaque)
1391{
1392 bs->dev_ops = ops;
1393 bs->dev_opaque = opaque;
2c6942fa
MA
1394 if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
1395 bs_snapshots = NULL;
1396 }
0e49de52
MA
1397}
1398
32c81a4a
PB
1399void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1400 enum MonitorEvent ev,
1401 BlockErrorAction action, bool is_read)
329c0a48
LC
1402{
1403 QObject *data;
1404 const char *action_str;
1405
1406 switch (action) {
1407 case BDRV_ACTION_REPORT:
1408 action_str = "report";
1409 break;
1410 case BDRV_ACTION_IGNORE:
1411 action_str = "ignore";
1412 break;
1413 case BDRV_ACTION_STOP:
1414 action_str = "stop";
1415 break;
1416 default:
1417 abort();
1418 }
1419
1420 data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1421 bdrv->device_name,
1422 action_str,
1423 is_read ? "read" : "write");
32c81a4a 1424 monitor_protocol_event(ev, data);
329c0a48
LC
1425
1426 qobject_decref(data);
1427}
1428
6f382ed2
LC
1429static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
1430{
1431 QObject *data;
1432
1433 data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
1434 bdrv_get_device_name(bs), ejected);
1435 monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
1436
1437 qobject_decref(data);
1438}
1439
7d4b4ba5 1440static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
0e49de52 1441{
145feb17 1442 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
6f382ed2 1443 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
7d4b4ba5 1444 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
6f382ed2
LC
1445 if (tray_was_closed) {
1446 /* tray open */
1447 bdrv_emit_qmp_eject_event(bs, true);
1448 }
1449 if (load) {
1450 /* tray close */
1451 bdrv_emit_qmp_eject_event(bs, false);
1452 }
145feb17
MA
1453 }
1454}
1455
2c6942fa
MA
1456bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1457{
1458 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1459}
1460
025ccaa7
PB
1461void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1462{
1463 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
1464 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
1465 }
1466}
1467
e4def80b
MA
1468bool bdrv_dev_is_tray_open(BlockDriverState *bs)
1469{
1470 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
1471 return bs->dev_ops->is_tray_open(bs->dev_opaque);
1472 }
1473 return false;
1474}
1475
145feb17
MA
1476static void bdrv_dev_resize_cb(BlockDriverState *bs)
1477{
1478 if (bs->dev_ops && bs->dev_ops->resize_cb) {
1479 bs->dev_ops->resize_cb(bs->dev_opaque);
0e49de52
MA
1480 }
1481}
1482
f107639a
MA
1483bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1484{
1485 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1486 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1487 }
1488 return false;
1489}
1490
e97fc193
AL
1491/*
1492 * Run consistency checks on an image
1493 *
e076f338 1494 * Returns 0 if the check could be completed (it doesn't mean that the image is
a1c7273b 1495 * free of errors) or -errno when an internal error occurred. The results of the
e076f338 1496 * check are stored in res.
e97fc193 1497 */
4534ff54 1498int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
e97fc193
AL
1499{
1500 if (bs->drv->bdrv_check == NULL) {
1501 return -ENOTSUP;
1502 }
1503
e076f338 1504 memset(res, 0, sizeof(*res));
4534ff54 1505 return bs->drv->bdrv_check(bs, res, fix);
e97fc193
AL
1506}
1507
8a426614
KW
1508#define COMMIT_BUF_SECTORS 2048
1509
33e3963e
FB
1510/* commit COW file into the raw image */
1511int bdrv_commit(BlockDriverState *bs)
1512{
19cb3738 1513 BlockDriver *drv = bs->drv;
8a426614
KW
1514 int64_t sector, total_sectors;
1515 int n, ro, open_flags;
0bce597d 1516 int ret = 0;
8a426614 1517 uint8_t *buf;
c2cba3d9 1518 char filename[PATH_MAX];
33e3963e 1519
19cb3738
FB
1520 if (!drv)
1521 return -ENOMEDIUM;
4dca4b63
NS
1522
1523 if (!bs->backing_hd) {
1524 return -ENOTSUP;
33e3963e
FB
1525 }
1526
2d3735d3
SH
1527 if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1528 return -EBUSY;
1529 }
1530
4dca4b63 1531 ro = bs->backing_hd->read_only;
c2cba3d9
JM
1532 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
1533 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
4dca4b63
NS
1534 open_flags = bs->backing_hd->open_flags;
1535
1536 if (ro) {
0bce597d
JC
1537 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) {
1538 return -EACCES;
4dca4b63 1539 }
ea2384d3 1540 }
33e3963e 1541
6ea44308 1542 total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
7267c094 1543 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
8a426614
KW
1544
1545 for (sector = 0; sector < total_sectors; sector += n) {
05c4af54 1546 if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
8a426614
KW
1547
1548 if (bdrv_read(bs, sector, buf, n) != 0) {
1549 ret = -EIO;
1550 goto ro_cleanup;
1551 }
1552
1553 if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1554 ret = -EIO;
1555 goto ro_cleanup;
1556 }
ea2384d3 1557 }
33e3963e 1558 }
95389c86 1559
1d44952f
CH
1560 if (drv->bdrv_make_empty) {
1561 ret = drv->bdrv_make_empty(bs);
1562 bdrv_flush(bs);
1563 }
95389c86 1564
3f5075ae
CH
1565 /*
1566 * Make sure all data we wrote to the backing device is actually
1567 * stable on disk.
1568 */
1569 if (bs->backing_hd)
1570 bdrv_flush(bs->backing_hd);
4dca4b63
NS
1571
1572ro_cleanup:
7267c094 1573 g_free(buf);
4dca4b63
NS
1574
1575 if (ro) {
0bce597d
JC
1576 /* ignoring error return here */
1577 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL);
4dca4b63
NS
1578 }
1579
1d44952f 1580 return ret;
33e3963e
FB
1581}
1582
e8877497 1583int bdrv_commit_all(void)
6ab4b5ab
MA
1584{
1585 BlockDriverState *bs;
1586
1587 QTAILQ_FOREACH(bs, &bdrv_states, list) {
e8877497
SH
1588 int ret = bdrv_commit(bs);
1589 if (ret < 0) {
1590 return ret;
1591 }
6ab4b5ab 1592 }
e8877497 1593 return 0;
6ab4b5ab
MA
1594}
1595
dbffbdcf
SH
1596struct BdrvTrackedRequest {
1597 BlockDriverState *bs;
1598 int64_t sector_num;
1599 int nb_sectors;
1600 bool is_write;
1601 QLIST_ENTRY(BdrvTrackedRequest) list;
5f8b6491 1602 Coroutine *co; /* owner, used for deadlock detection */
f4658285 1603 CoQueue wait_queue; /* coroutines blocked on this request */
dbffbdcf
SH
1604};
1605
1606/**
1607 * Remove an active request from the tracked requests list
1608 *
1609 * This function should be called when a tracked request is completing.
1610 */
1611static void tracked_request_end(BdrvTrackedRequest *req)
1612{
1613 QLIST_REMOVE(req, list);
f4658285 1614 qemu_co_queue_restart_all(&req->wait_queue);
dbffbdcf
SH
1615}
1616
1617/**
1618 * Add an active request to the tracked requests list
1619 */
1620static void tracked_request_begin(BdrvTrackedRequest *req,
1621 BlockDriverState *bs,
1622 int64_t sector_num,
1623 int nb_sectors, bool is_write)
1624{
1625 *req = (BdrvTrackedRequest){
1626 .bs = bs,
1627 .sector_num = sector_num,
1628 .nb_sectors = nb_sectors,
1629 .is_write = is_write,
5f8b6491 1630 .co = qemu_coroutine_self(),
dbffbdcf
SH
1631 };
1632
f4658285
SH
1633 qemu_co_queue_init(&req->wait_queue);
1634
dbffbdcf
SH
1635 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1636}
1637
d83947ac
SH
1638/**
1639 * Round a region to cluster boundaries
1640 */
1641static void round_to_clusters(BlockDriverState *bs,
1642 int64_t sector_num, int nb_sectors,
1643 int64_t *cluster_sector_num,
1644 int *cluster_nb_sectors)
1645{
1646 BlockDriverInfo bdi;
1647
1648 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1649 *cluster_sector_num = sector_num;
1650 *cluster_nb_sectors = nb_sectors;
1651 } else {
1652 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1653 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1654 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1655 nb_sectors, c);
1656 }
1657}
1658
f4658285
SH
1659static bool tracked_request_overlaps(BdrvTrackedRequest *req,
1660 int64_t sector_num, int nb_sectors) {
d83947ac
SH
1661 /* aaaa bbbb */
1662 if (sector_num >= req->sector_num + req->nb_sectors) {
1663 return false;
1664 }
1665 /* bbbb aaaa */
1666 if (req->sector_num >= sector_num + nb_sectors) {
1667 return false;
1668 }
1669 return true;
f4658285
SH
1670}
1671
1672static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
1673 int64_t sector_num, int nb_sectors)
1674{
1675 BdrvTrackedRequest *req;
d83947ac
SH
1676 int64_t cluster_sector_num;
1677 int cluster_nb_sectors;
f4658285
SH
1678 bool retry;
1679
d83947ac
SH
1680 /* If we touch the same cluster it counts as an overlap. This guarantees
1681 * that allocating writes will be serialized and not race with each other
1682 * for the same cluster. For example, in copy-on-read it ensures that the
1683 * CoR read and write operations are atomic and guest writes cannot
1684 * interleave between them.
1685 */
1686 round_to_clusters(bs, sector_num, nb_sectors,
1687 &cluster_sector_num, &cluster_nb_sectors);
1688
f4658285
SH
1689 do {
1690 retry = false;
1691 QLIST_FOREACH(req, &bs->tracked_requests, list) {
d83947ac
SH
1692 if (tracked_request_overlaps(req, cluster_sector_num,
1693 cluster_nb_sectors)) {
5f8b6491
SH
1694 /* Hitting this means there was a reentrant request, for
1695 * example, a block driver issuing nested requests. This must
1696 * never happen since it means deadlock.
1697 */
1698 assert(qemu_coroutine_self() != req->co);
1699
f4658285
SH
1700 qemu_co_queue_wait(&req->wait_queue);
1701 retry = true;
1702 break;
1703 }
1704 }
1705 } while (retry);
1706}
1707
756e6736
KW
1708/*
1709 * Return values:
1710 * 0 - success
1711 * -EINVAL - backing format specified, but no file
1712 * -ENOSPC - can't update the backing file because no space is left in the
1713 * image file header
1714 * -ENOTSUP - format driver doesn't support changing the backing file
1715 */
1716int bdrv_change_backing_file(BlockDriverState *bs,
1717 const char *backing_file, const char *backing_fmt)
1718{
1719 BlockDriver *drv = bs->drv;
469ef350 1720 int ret;
756e6736 1721
5f377794
PB
1722 /* Backing file format doesn't make sense without a backing file */
1723 if (backing_fmt && !backing_file) {
1724 return -EINVAL;
1725 }
1726
756e6736 1727 if (drv->bdrv_change_backing_file != NULL) {
469ef350 1728 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
756e6736 1729 } else {
469ef350 1730 ret = -ENOTSUP;
756e6736 1731 }
469ef350
PB
1732
1733 if (ret == 0) {
1734 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
1735 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
1736 }
1737 return ret;
756e6736
KW
1738}
1739
6ebdcee2
JC
1740/*
1741 * Finds the image layer in the chain that has 'bs' as its backing file.
1742 *
1743 * active is the current topmost image.
1744 *
1745 * Returns NULL if bs is not found in active's image chain,
1746 * or if active == bs.
1747 */
1748BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
1749 BlockDriverState *bs)
1750{
1751 BlockDriverState *overlay = NULL;
1752 BlockDriverState *intermediate;
1753
1754 assert(active != NULL);
1755 assert(bs != NULL);
1756
1757 /* if bs is the same as active, then by definition it has no overlay
1758 */
1759 if (active == bs) {
1760 return NULL;
1761 }
1762
1763 intermediate = active;
1764 while (intermediate->backing_hd) {
1765 if (intermediate->backing_hd == bs) {
1766 overlay = intermediate;
1767 break;
1768 }
1769 intermediate = intermediate->backing_hd;
1770 }
1771
1772 return overlay;
1773}
1774
1775typedef struct BlkIntermediateStates {
1776 BlockDriverState *bs;
1777 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
1778} BlkIntermediateStates;
1779
1780
1781/*
1782 * Drops images above 'base' up to and including 'top', and sets the image
1783 * above 'top' to have base as its backing file.
1784 *
1785 * Requires that the overlay to 'top' is opened r/w, so that the backing file
1786 * information in 'bs' can be properly updated.
1787 *
1788 * E.g., this will convert the following chain:
1789 * bottom <- base <- intermediate <- top <- active
1790 *
1791 * to
1792 *
1793 * bottom <- base <- active
1794 *
1795 * It is allowed for bottom==base, in which case it converts:
1796 *
1797 * base <- intermediate <- top <- active
1798 *
1799 * to
1800 *
1801 * base <- active
1802 *
1803 * Error conditions:
1804 * if active == top, that is considered an error
1805 *
1806 */
1807int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
1808 BlockDriverState *base)
1809{
1810 BlockDriverState *intermediate;
1811 BlockDriverState *base_bs = NULL;
1812 BlockDriverState *new_top_bs = NULL;
1813 BlkIntermediateStates *intermediate_state, *next;
1814 int ret = -EIO;
1815
1816 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
1817 QSIMPLEQ_INIT(&states_to_delete);
1818
1819 if (!top->drv || !base->drv) {
1820 goto exit;
1821 }
1822
1823 new_top_bs = bdrv_find_overlay(active, top);
1824
1825 if (new_top_bs == NULL) {
1826 /* we could not find the image above 'top', this is an error */
1827 goto exit;
1828 }
1829
1830 /* special case of new_top_bs->backing_hd already pointing to base - nothing
1831 * to do, no intermediate images */
1832 if (new_top_bs->backing_hd == base) {
1833 ret = 0;
1834 goto exit;
1835 }
1836
1837 intermediate = top;
1838
1839 /* now we will go down through the list, and add each BDS we find
1840 * into our deletion queue, until we hit the 'base'
1841 */
1842 while (intermediate) {
1843 intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
1844 intermediate_state->bs = intermediate;
1845 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
1846
1847 if (intermediate->backing_hd == base) {
1848 base_bs = intermediate->backing_hd;
1849 break;
1850 }
1851 intermediate = intermediate->backing_hd;
1852 }
1853 if (base_bs == NULL) {
1854 /* something went wrong, we did not end at the base. safely
1855 * unravel everything, and exit with error */
1856 goto exit;
1857 }
1858
1859 /* success - we can delete the intermediate states, and link top->base */
1860 ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
1861 base_bs->drv ? base_bs->drv->format_name : "");
1862 if (ret) {
1863 goto exit;
1864 }
1865 new_top_bs->backing_hd = base_bs;
1866
1867
1868 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
1869 /* so that bdrv_close() does not recursively close the chain */
1870 intermediate_state->bs->backing_hd = NULL;
1871 bdrv_delete(intermediate_state->bs);
1872 }
1873 ret = 0;
1874
1875exit:
1876 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
1877 g_free(intermediate_state);
1878 }
1879 return ret;
1880}
1881
1882
71d0770c
AL
1883static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1884 size_t size)
1885{
1886 int64_t len;
1887
1888 if (!bdrv_is_inserted(bs))
1889 return -ENOMEDIUM;
1890
1891 if (bs->growable)
1892 return 0;
1893
1894 len = bdrv_getlength(bs);
1895
fbb7b4e0
KW
1896 if (offset < 0)
1897 return -EIO;
1898
1899 if ((offset > len) || (len - offset < size))
71d0770c
AL
1900 return -EIO;
1901
1902 return 0;
1903}
1904
1905static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1906 int nb_sectors)
1907{
eb5a3165
JS
1908 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1909 nb_sectors * BDRV_SECTOR_SIZE);
71d0770c
AL
1910}
1911
1c9805a3
SH
1912typedef struct RwCo {
1913 BlockDriverState *bs;
1914 int64_t sector_num;
1915 int nb_sectors;
1916 QEMUIOVector *qiov;
1917 bool is_write;
1918 int ret;
1919} RwCo;
1920
1921static void coroutine_fn bdrv_rw_co_entry(void *opaque)
fc01f7e7 1922{
1c9805a3 1923 RwCo *rwco = opaque;
ea2384d3 1924
1c9805a3
SH
1925 if (!rwco->is_write) {
1926 rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
470c0504 1927 rwco->nb_sectors, rwco->qiov, 0);
1c9805a3
SH
1928 } else {
1929 rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
f08f2dda 1930 rwco->nb_sectors, rwco->qiov, 0);
1c9805a3
SH
1931 }
1932}
e7a8a783 1933
1c9805a3
SH
1934/*
1935 * Process a synchronous request using coroutines
1936 */
1937static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1938 int nb_sectors, bool is_write)
1939{
1940 QEMUIOVector qiov;
1941 struct iovec iov = {
1942 .iov_base = (void *)buf,
1943 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1944 };
1945 Coroutine *co;
1946 RwCo rwco = {
1947 .bs = bs,
1948 .sector_num = sector_num,
1949 .nb_sectors = nb_sectors,
1950 .qiov = &qiov,
1951 .is_write = is_write,
1952 .ret = NOT_DONE,
1953 };
e7a8a783 1954
1c9805a3 1955 qemu_iovec_init_external(&qiov, &iov, 1);
e7a8a783 1956
498e386c
ZYW
1957 /**
1958 * In sync call context, when the vcpu is blocked, this throttling timer
1959 * will not fire; so the I/O throttling function has to be disabled here
1960 * if it has been enabled.
1961 */
1962 if (bs->io_limits_enabled) {
1963 fprintf(stderr, "Disabling I/O throttling on '%s' due "
1964 "to synchronous I/O.\n", bdrv_get_device_name(bs));
1965 bdrv_io_limits_disable(bs);
1966 }
1967
1c9805a3
SH
1968 if (qemu_in_coroutine()) {
1969 /* Fast-path if already in coroutine context */
1970 bdrv_rw_co_entry(&rwco);
1971 } else {
1972 co = qemu_coroutine_create(bdrv_rw_co_entry);
1973 qemu_coroutine_enter(co, &rwco);
1974 while (rwco.ret == NOT_DONE) {
1975 qemu_aio_wait();
1976 }
1977 }
1978 return rwco.ret;
1979}
b338082b 1980
1c9805a3
SH
1981/* return < 0 if error. See bdrv_write() for the return codes */
1982int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1983 uint8_t *buf, int nb_sectors)
1984{
1985 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
fc01f7e7
FB
1986}
1987
07d27a44
MA
1988/* Just like bdrv_read(), but with I/O throttling temporarily disabled */
1989int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
1990 uint8_t *buf, int nb_sectors)
1991{
1992 bool enabled;
1993 int ret;
1994
1995 enabled = bs->io_limits_enabled;
1996 bs->io_limits_enabled = false;
1997 ret = bdrv_read(bs, 0, buf, 1);
1998 bs->io_limits_enabled = enabled;
1999 return ret;
2000}
2001
71df14fc
PB
2002#define BITS_PER_LONG (sizeof(unsigned long) * 8)
2003
7cd1e32a 2004static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
a55eb92c 2005 int nb_sectors, int dirty)
7cd1e32a 2006{
2007 int64_t start, end;
c6d22830 2008 unsigned long val, idx, bit;
a55eb92c 2009
6ea44308 2010 start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
c6d22830 2011 end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
a55eb92c
JK
2012
2013 for (; start <= end; start++) {
71df14fc
PB
2014 idx = start / BITS_PER_LONG;
2015 bit = start % BITS_PER_LONG;
c6d22830
JK
2016 val = bs->dirty_bitmap[idx];
2017 if (dirty) {
6d59fec1 2018 if (!(val & (1UL << bit))) {
aaa0eb75 2019 bs->dirty_count++;
6d59fec1 2020 val |= 1UL << bit;
aaa0eb75 2021 }
c6d22830 2022 } else {
6d59fec1 2023 if (val & (1UL << bit)) {
aaa0eb75 2024 bs->dirty_count--;
6d59fec1 2025 val &= ~(1UL << bit);
aaa0eb75 2026 }
c6d22830
JK
2027 }
2028 bs->dirty_bitmap[idx] = val;
7cd1e32a 2029 }
2030}
2031
5fafdf24 2032/* Return < 0 if error. Important errors are:
19cb3738
FB
2033 -EIO generic I/O error (may happen for all errors)
2034 -ENOMEDIUM No media inserted.
2035 -EINVAL Invalid sector number or nb_sectors
2036 -EACCES Trying to write a read-only device
2037*/
5fafdf24 2038int bdrv_write(BlockDriverState *bs, int64_t sector_num,
fc01f7e7
FB
2039 const uint8_t *buf, int nb_sectors)
2040{
1c9805a3 2041 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
83f64091
FB
2042}
2043
eda578e5
AL
2044int bdrv_pread(BlockDriverState *bs, int64_t offset,
2045 void *buf, int count1)
83f64091 2046{
6ea44308 2047 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
83f64091
FB
2048 int len, nb_sectors, count;
2049 int64_t sector_num;
9a8c4cce 2050 int ret;
83f64091
FB
2051
2052 count = count1;
2053 /* first read to align to sector start */
6ea44308 2054 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
83f64091
FB
2055 if (len > count)
2056 len = count;
6ea44308 2057 sector_num = offset >> BDRV_SECTOR_BITS;
83f64091 2058 if (len > 0) {
9a8c4cce
KW
2059 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
2060 return ret;
6ea44308 2061 memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
83f64091
FB
2062 count -= len;
2063 if (count == 0)
2064 return count1;
2065 sector_num++;
2066 buf += len;
2067 }
2068
2069 /* read the sectors "in place" */
6ea44308 2070 nb_sectors = count >> BDRV_SECTOR_BITS;
83f64091 2071 if (nb_sectors > 0) {
9a8c4cce
KW
2072 if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
2073 return ret;
83f64091 2074 sector_num += nb_sectors;
6ea44308 2075 len = nb_sectors << BDRV_SECTOR_BITS;
83f64091
FB
2076 buf += len;
2077 count -= len;
2078 }
2079
2080 /* add data from the last sector */
2081 if (count > 0) {
9a8c4cce
KW
2082 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
2083 return ret;
83f64091
FB
2084 memcpy(buf, tmp_buf, count);
2085 }
2086 return count1;
2087}
2088
eda578e5
AL
2089int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
2090 const void *buf, int count1)
83f64091 2091{
6ea44308 2092 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
83f64091
FB
2093 int len, nb_sectors, count;
2094 int64_t sector_num;
9a8c4cce 2095 int ret;
83f64091
FB
2096
2097 count = count1;
2098 /* first write to align to sector start */
6ea44308 2099 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
83f64091
FB
2100 if (len > count)
2101 len = count;
6ea44308 2102 sector_num = offset >> BDRV_SECTOR_BITS;
83f64091 2103 if (len > 0) {
9a8c4cce
KW
2104 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
2105 return ret;
6ea44308 2106 memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
9a8c4cce
KW
2107 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
2108 return ret;
83f64091
FB
2109 count -= len;
2110 if (count == 0)
2111 return count1;
2112 sector_num++;
2113 buf += len;
2114 }
2115
2116 /* write the sectors "in place" */
6ea44308 2117 nb_sectors = count >> BDRV_SECTOR_BITS;
83f64091 2118 if (nb_sectors > 0) {
9a8c4cce
KW
2119 if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
2120 return ret;
83f64091 2121 sector_num += nb_sectors;
6ea44308 2122 len = nb_sectors << BDRV_SECTOR_BITS;
83f64091
FB
2123 buf += len;
2124 count -= len;
2125 }
2126
2127 /* add data from the last sector */
2128 if (count > 0) {
9a8c4cce
KW
2129 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
2130 return ret;
83f64091 2131 memcpy(tmp_buf, buf, count);
9a8c4cce
KW
2132 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
2133 return ret;
83f64091
FB
2134 }
2135 return count1;
2136}
83f64091 2137
f08145fe
KW
2138/*
2139 * Writes to the file and ensures that no writes are reordered across this
2140 * request (acts as a barrier)
2141 *
2142 * Returns 0 on success, -errno in error cases.
2143 */
2144int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
2145 const void *buf, int count)
2146{
2147 int ret;
2148
2149 ret = bdrv_pwrite(bs, offset, buf, count);
2150 if (ret < 0) {
2151 return ret;
2152 }
2153
f05fa4ad
PB
2154 /* No flush needed for cache modes that already do it */
2155 if (bs->enable_write_cache) {
f08145fe
KW
2156 bdrv_flush(bs);
2157 }
2158
2159 return 0;
2160}
2161
470c0504 2162static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
ab185921
SH
2163 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2164{
2165 /* Perform I/O through a temporary buffer so that users who scribble over
2166 * their read buffer while the operation is in progress do not end up
2167 * modifying the image file. This is critical for zero-copy guest I/O
2168 * where anything might happen inside guest memory.
2169 */
2170 void *bounce_buffer;
2171
79c053bd 2172 BlockDriver *drv = bs->drv;
ab185921
SH
2173 struct iovec iov;
2174 QEMUIOVector bounce_qiov;
2175 int64_t cluster_sector_num;
2176 int cluster_nb_sectors;
2177 size_t skip_bytes;
2178 int ret;
2179
2180 /* Cover entire cluster so no additional backing file I/O is required when
2181 * allocating cluster in the image file.
2182 */
2183 round_to_clusters(bs, sector_num, nb_sectors,
2184 &cluster_sector_num, &cluster_nb_sectors);
2185
470c0504
SH
2186 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
2187 cluster_sector_num, cluster_nb_sectors);
ab185921
SH
2188
2189 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
2190 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
2191 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
2192
79c053bd
SH
2193 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
2194 &bounce_qiov);
ab185921
SH
2195 if (ret < 0) {
2196 goto err;
2197 }
2198
79c053bd
SH
2199 if (drv->bdrv_co_write_zeroes &&
2200 buffer_is_zero(bounce_buffer, iov.iov_len)) {
621f0589
KW
2201 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
2202 cluster_nb_sectors);
79c053bd 2203 } else {
f05fa4ad
PB
2204 /* This does not change the data on the disk, it is not necessary
2205 * to flush even in cache=writethrough mode.
2206 */
79c053bd 2207 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
ab185921 2208 &bounce_qiov);
79c053bd
SH
2209 }
2210
ab185921
SH
2211 if (ret < 0) {
2212 /* It might be okay to ignore write errors for guest requests. If this
2213 * is a deliberate copy-on-read then we don't want to ignore the error.
2214 * Simply report it in all cases.
2215 */
2216 goto err;
2217 }
2218
2219 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
03396148
MT
2220 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes,
2221 nb_sectors * BDRV_SECTOR_SIZE);
ab185921
SH
2222
2223err:
2224 qemu_vfree(bounce_buffer);
2225 return ret;
2226}
2227
c5fbe571
SH
2228/*
2229 * Handle a read request in coroutine context
2230 */
2231static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
470c0504
SH
2232 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
2233 BdrvRequestFlags flags)
da1fa91d
KW
2234{
2235 BlockDriver *drv = bs->drv;
dbffbdcf
SH
2236 BdrvTrackedRequest req;
2237 int ret;
da1fa91d 2238
da1fa91d
KW
2239 if (!drv) {
2240 return -ENOMEDIUM;
2241 }
2242 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2243 return -EIO;
2244 }
2245
98f90dba
ZYW
2246 /* throttling disk read I/O */
2247 if (bs->io_limits_enabled) {
2248 bdrv_io_limits_intercept(bs, false, nb_sectors);
2249 }
2250
f4658285 2251 if (bs->copy_on_read) {
470c0504
SH
2252 flags |= BDRV_REQ_COPY_ON_READ;
2253 }
2254 if (flags & BDRV_REQ_COPY_ON_READ) {
2255 bs->copy_on_read_in_flight++;
2256 }
2257
2258 if (bs->copy_on_read_in_flight) {
f4658285
SH
2259 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2260 }
2261
dbffbdcf 2262 tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
ab185921 2263
470c0504 2264 if (flags & BDRV_REQ_COPY_ON_READ) {
ab185921
SH
2265 int pnum;
2266
2267 ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
2268 if (ret < 0) {
2269 goto out;
2270 }
2271
2272 if (!ret || pnum != nb_sectors) {
470c0504 2273 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
ab185921
SH
2274 goto out;
2275 }
2276 }
2277
dbffbdcf 2278 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
ab185921
SH
2279
2280out:
dbffbdcf 2281 tracked_request_end(&req);
470c0504
SH
2282
2283 if (flags & BDRV_REQ_COPY_ON_READ) {
2284 bs->copy_on_read_in_flight--;
2285 }
2286
dbffbdcf 2287 return ret;
da1fa91d
KW
2288}
2289
c5fbe571 2290int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
da1fa91d
KW
2291 int nb_sectors, QEMUIOVector *qiov)
2292{
c5fbe571 2293 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
da1fa91d 2294
470c0504
SH
2295 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
2296}
2297
2298int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
2299 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
2300{
2301 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
2302
2303 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
2304 BDRV_REQ_COPY_ON_READ);
c5fbe571
SH
2305}
2306
f08f2dda
SH
2307static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
2308 int64_t sector_num, int nb_sectors)
2309{
2310 BlockDriver *drv = bs->drv;
2311 QEMUIOVector qiov;
2312 struct iovec iov;
2313 int ret;
2314
621f0589
KW
2315 /* TODO Emulate only part of misaligned requests instead of letting block
2316 * drivers return -ENOTSUP and emulate everything */
2317
f08f2dda
SH
2318 /* First try the efficient write zeroes operation */
2319 if (drv->bdrv_co_write_zeroes) {
621f0589
KW
2320 ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
2321 if (ret != -ENOTSUP) {
2322 return ret;
2323 }
f08f2dda
SH
2324 }
2325
2326 /* Fall back to bounce buffer if write zeroes is unsupported */
2327 iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
2328 iov.iov_base = qemu_blockalign(bs, iov.iov_len);
2329 memset(iov.iov_base, 0, iov.iov_len);
2330 qemu_iovec_init_external(&qiov, &iov, 1);
2331
2332 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
2333
2334 qemu_vfree(iov.iov_base);
2335 return ret;
2336}
2337
c5fbe571
SH
2338/*
2339 * Handle a write request in coroutine context
2340 */
2341static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
f08f2dda
SH
2342 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
2343 BdrvRequestFlags flags)
c5fbe571
SH
2344{
2345 BlockDriver *drv = bs->drv;
dbffbdcf 2346 BdrvTrackedRequest req;
6b7cb247 2347 int ret;
da1fa91d
KW
2348
2349 if (!bs->drv) {
2350 return -ENOMEDIUM;
2351 }
2352 if (bs->read_only) {
2353 return -EACCES;
2354 }
2355 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
2356 return -EIO;
2357 }
2358
98f90dba
ZYW
2359 /* throttling disk write I/O */
2360 if (bs->io_limits_enabled) {
2361 bdrv_io_limits_intercept(bs, true, nb_sectors);
2362 }
2363
470c0504 2364 if (bs->copy_on_read_in_flight) {
f4658285
SH
2365 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
2366 }
2367
dbffbdcf
SH
2368 tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
2369
f08f2dda
SH
2370 if (flags & BDRV_REQ_ZERO_WRITE) {
2371 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
2372 } else {
2373 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
2374 }
6b7cb247 2375
f05fa4ad
PB
2376 if (ret == 0 && !bs->enable_write_cache) {
2377 ret = bdrv_co_flush(bs);
2378 }
2379
da1fa91d
KW
2380 if (bs->dirty_bitmap) {
2381 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2382 }
2383
2384 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
2385 bs->wr_highest_sector = sector_num + nb_sectors - 1;
2386 }
2387
dbffbdcf
SH
2388 tracked_request_end(&req);
2389
6b7cb247 2390 return ret;
da1fa91d
KW
2391}
2392
c5fbe571
SH
2393int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
2394 int nb_sectors, QEMUIOVector *qiov)
2395{
2396 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
2397
f08f2dda
SH
2398 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
2399}
2400
2401int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
2402 int64_t sector_num, int nb_sectors)
2403{
2404 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
2405
2406 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
2407 BDRV_REQ_ZERO_WRITE);
c5fbe571
SH
2408}
2409
83f64091
FB
2410/**
2411 * Truncate file to 'offset' bytes (needed only for file protocols)
2412 */
2413int bdrv_truncate(BlockDriverState *bs, int64_t offset)
2414{
2415 BlockDriver *drv = bs->drv;
51762288 2416 int ret;
83f64091 2417 if (!drv)
19cb3738 2418 return -ENOMEDIUM;
83f64091
FB
2419 if (!drv->bdrv_truncate)
2420 return -ENOTSUP;
59f2689d
NS
2421 if (bs->read_only)
2422 return -EACCES;
8591675f
MT
2423 if (bdrv_in_use(bs))
2424 return -EBUSY;
51762288
SH
2425 ret = drv->bdrv_truncate(bs, offset);
2426 if (ret == 0) {
2427 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
145feb17 2428 bdrv_dev_resize_cb(bs);
51762288
SH
2429 }
2430 return ret;
83f64091
FB
2431}
2432
4a1d5e1f
FZ
2433/**
2434 * Length of a allocated file in bytes. Sparse files are counted by actual
2435 * allocated space. Return < 0 if error or unknown.
2436 */
2437int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2438{
2439 BlockDriver *drv = bs->drv;
2440 if (!drv) {
2441 return -ENOMEDIUM;
2442 }
2443 if (drv->bdrv_get_allocated_file_size) {
2444 return drv->bdrv_get_allocated_file_size(bs);
2445 }
2446 if (bs->file) {
2447 return bdrv_get_allocated_file_size(bs->file);
2448 }
2449 return -ENOTSUP;
2450}
2451
83f64091
FB
2452/**
2453 * Length of a file in bytes. Return < 0 if error or unknown.
2454 */
2455int64_t bdrv_getlength(BlockDriverState *bs)
2456{
2457 BlockDriver *drv = bs->drv;
2458 if (!drv)
19cb3738 2459 return -ENOMEDIUM;
51762288 2460
2c6942fa 2461 if (bs->growable || bdrv_dev_has_removable_media(bs)) {
46a4e4e6
SH
2462 if (drv->bdrv_getlength) {
2463 return drv->bdrv_getlength(bs);
2464 }
83f64091 2465 }
46a4e4e6 2466 return bs->total_sectors * BDRV_SECTOR_SIZE;
fc01f7e7
FB
2467}
2468
19cb3738 2469/* return 0 as number of sectors if no device present or error */
96b8f136 2470void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
fc01f7e7 2471{
19cb3738
FB
2472 int64_t length;
2473 length = bdrv_getlength(bs);
2474 if (length < 0)
2475 length = 0;
2476 else
6ea44308 2477 length = length >> BDRV_SECTOR_BITS;
19cb3738 2478 *nb_sectors_ptr = length;
fc01f7e7 2479}
cf98951b 2480
0563e191
ZYW
2481/* throttling disk io limits */
2482void bdrv_set_io_limits(BlockDriverState *bs,
2483 BlockIOLimit *io_limits)
2484{
2485 bs->io_limits = *io_limits;
2486 bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2487}
2488
ff06f5f3
PB
2489void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
2490 BlockdevOnError on_write_error)
abd7f68d
MA
2491{
2492 bs->on_read_error = on_read_error;
2493 bs->on_write_error = on_write_error;
2494}
2495
1ceee0d5 2496BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
abd7f68d
MA
2497{
2498 return is_read ? bs->on_read_error : bs->on_write_error;
2499}
2500
3e1caa5f
PB
2501BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
2502{
2503 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
2504
2505 switch (on_err) {
2506 case BLOCKDEV_ON_ERROR_ENOSPC:
2507 return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
2508 case BLOCKDEV_ON_ERROR_STOP:
2509 return BDRV_ACTION_STOP;
2510 case BLOCKDEV_ON_ERROR_REPORT:
2511 return BDRV_ACTION_REPORT;
2512 case BLOCKDEV_ON_ERROR_IGNORE:
2513 return BDRV_ACTION_IGNORE;
2514 default:
2515 abort();
2516 }
2517}
2518
2519/* This is done by device models because, while the block layer knows
2520 * about the error, it does not know whether an operation comes from
2521 * the device or the block layer (from a job, for example).
2522 */
2523void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
2524 bool is_read, int error)
2525{
2526 assert(error >= 0);
32c81a4a 2527 bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
3e1caa5f
PB
2528 if (action == BDRV_ACTION_STOP) {
2529 vm_stop(RUN_STATE_IO_ERROR);
2530 bdrv_iostatus_set_err(bs, error);
2531 }
2532}
2533
b338082b
FB
2534int bdrv_is_read_only(BlockDriverState *bs)
2535{
2536 return bs->read_only;
2537}
2538
985a03b0
TS
2539int bdrv_is_sg(BlockDriverState *bs)
2540{
2541 return bs->sg;
2542}
2543
e900a7b7
CH
2544int bdrv_enable_write_cache(BlockDriverState *bs)
2545{
2546 return bs->enable_write_cache;
2547}
2548
425b0148
PB
2549void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce)
2550{
2551 bs->enable_write_cache = wce;
55b110f2
JC
2552
2553 /* so a reopen() will preserve wce */
2554 if (wce) {
2555 bs->open_flags |= BDRV_O_CACHE_WB;
2556 } else {
2557 bs->open_flags &= ~BDRV_O_CACHE_WB;
2558 }
425b0148
PB
2559}
2560
ea2384d3
FB
2561int bdrv_is_encrypted(BlockDriverState *bs)
2562{
2563 if (bs->backing_hd && bs->backing_hd->encrypted)
2564 return 1;
2565 return bs->encrypted;
2566}
2567
c0f4ce77
AL
2568int bdrv_key_required(BlockDriverState *bs)
2569{
2570 BlockDriverState *backing_hd = bs->backing_hd;
2571
2572 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2573 return 1;
2574 return (bs->encrypted && !bs->valid_key);
2575}
2576
ea2384d3
FB
2577int bdrv_set_key(BlockDriverState *bs, const char *key)
2578{
2579 int ret;
2580 if (bs->backing_hd && bs->backing_hd->encrypted) {
2581 ret = bdrv_set_key(bs->backing_hd, key);
2582 if (ret < 0)
2583 return ret;
2584 if (!bs->encrypted)
2585 return 0;
2586 }
fd04a2ae
SH
2587 if (!bs->encrypted) {
2588 return -EINVAL;
2589 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2590 return -ENOMEDIUM;
2591 }
c0f4ce77 2592 ret = bs->drv->bdrv_set_key(bs, key);
bb5fc20f
AL
2593 if (ret < 0) {
2594 bs->valid_key = 0;
2595 } else if (!bs->valid_key) {
2596 bs->valid_key = 1;
2597 /* call the change callback now, we skipped it on open */
7d4b4ba5 2598 bdrv_dev_change_media_cb(bs, true);
bb5fc20f 2599 }
c0f4ce77 2600 return ret;
ea2384d3
FB
2601}
2602
f8d6bba1 2603const char *bdrv_get_format_name(BlockDriverState *bs)
ea2384d3 2604{
f8d6bba1 2605 return bs->drv ? bs->drv->format_name : NULL;
ea2384d3
FB
2606}
2607
5fafdf24 2608void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
ea2384d3
FB
2609 void *opaque)
2610{
2611 BlockDriver *drv;
2612
8a22f02a 2613 QLIST_FOREACH(drv, &bdrv_drivers, list) {
ea2384d3
FB
2614 it(opaque, drv->format_name);
2615 }
2616}
2617
b338082b
FB
2618BlockDriverState *bdrv_find(const char *name)
2619{
2620 BlockDriverState *bs;
2621
1b7bdbc1
SH
2622 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2623 if (!strcmp(name, bs->device_name)) {
b338082b 2624 return bs;
1b7bdbc1 2625 }
b338082b
FB
2626 }
2627 return NULL;
2628}
2629
2f399b0a
MA
2630BlockDriverState *bdrv_next(BlockDriverState *bs)
2631{
2632 if (!bs) {
2633 return QTAILQ_FIRST(&bdrv_states);
2634 }
2635 return QTAILQ_NEXT(bs, list);
2636}
2637
51de9760 2638void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
81d0912d
FB
2639{
2640 BlockDriverState *bs;
2641
1b7bdbc1 2642 QTAILQ_FOREACH(bs, &bdrv_states, list) {
51de9760 2643 it(opaque, bs);
81d0912d
FB
2644 }
2645}
2646
ea2384d3
FB
2647const char *bdrv_get_device_name(BlockDriverState *bs)
2648{
2649 return bs->device_name;
2650}
2651
c8433287
MA
2652int bdrv_get_flags(BlockDriverState *bs)
2653{
2654 return bs->open_flags;
2655}
2656
c6ca28d6
AL
2657void bdrv_flush_all(void)
2658{
2659 BlockDriverState *bs;
2660
1b7bdbc1 2661 QTAILQ_FOREACH(bs, &bdrv_states, list) {
29cdb251 2662 bdrv_flush(bs);
1b7bdbc1 2663 }
c6ca28d6
AL
2664}
2665
f2feebbd
KW
2666int bdrv_has_zero_init(BlockDriverState *bs)
2667{
2668 assert(bs->drv);
2669
336c1c12
KW
2670 if (bs->drv->bdrv_has_zero_init) {
2671 return bs->drv->bdrv_has_zero_init(bs);
f2feebbd
KW
2672 }
2673
2674 return 1;
2675}
2676
376ae3f1
SH
2677typedef struct BdrvCoIsAllocatedData {
2678 BlockDriverState *bs;
2679 int64_t sector_num;
2680 int nb_sectors;
2681 int *pnum;
2682 int ret;
2683 bool done;
2684} BdrvCoIsAllocatedData;
2685
f58c7b35
TS
2686/*
2687 * Returns true iff the specified sector is present in the disk image. Drivers
2688 * not implementing the functionality are assumed to not support backing files,
2689 * hence all their sectors are reported as allocated.
2690 *
bd9533e3
SH
2691 * If 'sector_num' is beyond the end of the disk image the return value is 0
2692 * and 'pnum' is set to 0.
2693 *
f58c7b35
TS
2694 * 'pnum' is set to the number of sectors (including and immediately following
2695 * the specified sector) that are known to be in the same
2696 * allocated/unallocated state.
2697 *
bd9533e3
SH
2698 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
2699 * beyond the end of the disk image it will be clamped.
f58c7b35 2700 */
060f51c9
SH
2701int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2702 int nb_sectors, int *pnum)
f58c7b35 2703{
bd9533e3
SH
2704 int64_t n;
2705
2706 if (sector_num >= bs->total_sectors) {
2707 *pnum = 0;
2708 return 0;
2709 }
2710
2711 n = bs->total_sectors - sector_num;
2712 if (n < nb_sectors) {
2713 nb_sectors = n;
2714 }
2715
6aebab14 2716 if (!bs->drv->bdrv_co_is_allocated) {
bd9533e3 2717 *pnum = nb_sectors;
f58c7b35
TS
2718 return 1;
2719 }
6aebab14 2720
060f51c9
SH
2721 return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2722}
2723
2724/* Coroutine wrapper for bdrv_is_allocated() */
2725static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2726{
2727 BdrvCoIsAllocatedData *data = opaque;
2728 BlockDriverState *bs = data->bs;
2729
2730 data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2731 data->pnum);
2732 data->done = true;
2733}
2734
2735/*
2736 * Synchronous wrapper around bdrv_co_is_allocated().
2737 *
2738 * See bdrv_co_is_allocated() for details.
2739 */
2740int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2741 int *pnum)
2742{
6aebab14
SH
2743 Coroutine *co;
2744 BdrvCoIsAllocatedData data = {
2745 .bs = bs,
2746 .sector_num = sector_num,
2747 .nb_sectors = nb_sectors,
2748 .pnum = pnum,
2749 .done = false,
2750 };
2751
2752 co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2753 qemu_coroutine_enter(co, &data);
2754 while (!data.done) {
2755 qemu_aio_wait();
2756 }
2757 return data.ret;
f58c7b35
TS
2758}
2759
188a7bbf
PB
2760/*
2761 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
2762 *
2763 * Return true if the given sector is allocated in any image between
2764 * BASE and TOP (inclusive). BASE can be NULL to check if the given
2765 * sector is allocated in any image of the chain. Return false otherwise.
2766 *
2767 * 'pnum' is set to the number of sectors (including and immediately following
2768 * the specified sector) that are known to be in the same
2769 * allocated/unallocated state.
2770 *
2771 */
2772int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
2773 BlockDriverState *base,
2774 int64_t sector_num,
2775 int nb_sectors, int *pnum)
2776{
2777 BlockDriverState *intermediate;
2778 int ret, n = nb_sectors;
2779
2780 intermediate = top;
2781 while (intermediate && intermediate != base) {
2782 int pnum_inter;
2783 ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
2784 &pnum_inter);
2785 if (ret < 0) {
2786 return ret;
2787 } else if (ret) {
2788 *pnum = pnum_inter;
2789 return 1;
2790 }
2791
2792 /*
2793 * [sector_num, nb_sectors] is unallocated on top but intermediate
2794 * might have
2795 *
2796 * [sector_num+x, nr_sectors] allocated.
2797 */
2798 if (n > pnum_inter) {
2799 n = pnum_inter;
2800 }
2801
2802 intermediate = intermediate->backing_hd;
2803 }
2804
2805 *pnum = n;
2806 return 0;
2807}
2808
b2023818 2809BlockInfoList *qmp_query_block(Error **errp)
b338082b 2810{
b2023818 2811 BlockInfoList *head = NULL, *cur_item = NULL;
b338082b
FB
2812 BlockDriverState *bs;
2813
1b7bdbc1 2814 QTAILQ_FOREACH(bs, &bdrv_states, list) {
b2023818 2815 BlockInfoList *info = g_malloc0(sizeof(*info));
d15e5465 2816
b2023818
LC
2817 info->value = g_malloc0(sizeof(*info->value));
2818 info->value->device = g_strdup(bs->device_name);
2819 info->value->type = g_strdup("unknown");
2820 info->value->locked = bdrv_dev_is_medium_locked(bs);
2821 info->value->removable = bdrv_dev_has_removable_media(bs);
d15e5465 2822
e4def80b 2823 if (bdrv_dev_has_removable_media(bs)) {
b2023818
LC
2824 info->value->has_tray_open = true;
2825 info->value->tray_open = bdrv_dev_is_tray_open(bs);
e4def80b 2826 }
f04ef601
LC
2827
2828 if (bdrv_iostatus_is_enabled(bs)) {
b2023818
LC
2829 info->value->has_io_status = true;
2830 info->value->io_status = bs->iostatus;
f04ef601
LC
2831 }
2832
19cb3738 2833 if (bs->drv) {
b2023818
LC
2834 info->value->has_inserted = true;
2835 info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2836 info->value->inserted->file = g_strdup(bs->filename);
2837 info->value->inserted->ro = bs->read_only;
2838 info->value->inserted->drv = g_strdup(bs->drv->format_name);
2839 info->value->inserted->encrypted = bs->encrypted;
c75a1a8a 2840 info->value->inserted->encryption_key_missing = bdrv_key_required(bs);
b2023818
LC
2841 if (bs->backing_file[0]) {
2842 info->value->inserted->has_backing_file = true;
2843 info->value->inserted->backing_file = g_strdup(bs->backing_file);
376253ec 2844 }
727f005e 2845
2e3e3317
BC
2846 info->value->inserted->backing_file_depth =
2847 bdrv_get_backing_file_depth(bs);
2848
727f005e
ZYW
2849 if (bs->io_limits_enabled) {
2850 info->value->inserted->bps =
2851 bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2852 info->value->inserted->bps_rd =
2853 bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2854 info->value->inserted->bps_wr =
2855 bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2856 info->value->inserted->iops =
2857 bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2858 info->value->inserted->iops_rd =
2859 bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2860 info->value->inserted->iops_wr =
2861 bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2862 }
b2023818 2863 }
d15e5465 2864
b2023818
LC
2865 /* XXX: waiting for the qapi to support GSList */
2866 if (!cur_item) {
2867 head = cur_item = info;
2868 } else {
2869 cur_item->next = info;
2870 cur_item = info;
b338082b 2871 }
b338082b 2872 }
d15e5465 2873
b2023818 2874 return head;
b338082b 2875}
a36e69dd 2876
f11f57e4
LC
2877/* Consider exposing this as a full fledged QMP command */
2878static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2879{
2880 BlockStats *s;
2881
2882 s = g_malloc0(sizeof(*s));
2883
2884 if (bs->device_name[0]) {
2885 s->has_device = true;
2886 s->device = g_strdup(bs->device_name);
294cc35f
KW
2887 }
2888
f11f57e4
LC
2889 s->stats = g_malloc0(sizeof(*s->stats));
2890 s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2891 s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2892 s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2893 s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2894 s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2895 s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2896 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2897 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2898 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2899
294cc35f 2900 if (bs->file) {
f11f57e4
LC
2901 s->has_parent = true;
2902 s->parent = qmp_query_blockstat(bs->file, NULL);
294cc35f
KW
2903 }
2904
f11f57e4 2905 return s;
294cc35f
KW
2906}
2907
f11f57e4 2908BlockStatsList *qmp_query_blockstats(Error **errp)
218a536a 2909{
f11f57e4 2910 BlockStatsList *head = NULL, *cur_item = NULL;
a36e69dd
TS
2911 BlockDriverState *bs;
2912
1b7bdbc1 2913 QTAILQ_FOREACH(bs, &bdrv_states, list) {
f11f57e4
LC
2914 BlockStatsList *info = g_malloc0(sizeof(*info));
2915 info->value = qmp_query_blockstat(bs, NULL);
2916
2917 /* XXX: waiting for the qapi to support GSList */
2918 if (!cur_item) {
2919 head = cur_item = info;
2920 } else {
2921 cur_item->next = info;
2922 cur_item = info;
2923 }
a36e69dd 2924 }
218a536a 2925
f11f57e4 2926 return head;
a36e69dd 2927}
ea2384d3 2928
045df330
AL
2929const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2930{
2931 if (bs->backing_hd && bs->backing_hd->encrypted)
2932 return bs->backing_file;
2933 else if (bs->encrypted)
2934 return bs->filename;
2935 else
2936 return NULL;
2937}
2938
5fafdf24 2939void bdrv_get_backing_filename(BlockDriverState *bs,
83f64091
FB
2940 char *filename, int filename_size)
2941{
3574c608 2942 pstrcpy(filename, filename_size, bs->backing_file);
83f64091
FB
2943}
2944
5fafdf24 2945int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
faea38e7
FB
2946 const uint8_t *buf, int nb_sectors)
2947{
2948 BlockDriver *drv = bs->drv;
2949 if (!drv)
19cb3738 2950 return -ENOMEDIUM;
faea38e7
FB
2951 if (!drv->bdrv_write_compressed)
2952 return -ENOTSUP;
fbb7b4e0
KW
2953 if (bdrv_check_request(bs, sector_num, nb_sectors))
2954 return -EIO;
a55eb92c 2955
c6d22830 2956 if (bs->dirty_bitmap) {
7cd1e32a 2957 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2958 }
a55eb92c 2959
faea38e7
FB
2960 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2961}
3b46e624 2962
faea38e7
FB
2963int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2964{
2965 BlockDriver *drv = bs->drv;
2966 if (!drv)
19cb3738 2967 return -ENOMEDIUM;
faea38e7
FB
2968 if (!drv->bdrv_get_info)
2969 return -ENOTSUP;
2970 memset(bdi, 0, sizeof(*bdi));
2971 return drv->bdrv_get_info(bs, bdi);
2972}
2973
45566e9c
CH
2974int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2975 int64_t pos, int size)
178e08a5
AL
2976{
2977 BlockDriver *drv = bs->drv;
2978 if (!drv)
2979 return -ENOMEDIUM;
7cdb1f6d
MK
2980 if (drv->bdrv_save_vmstate)
2981 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2982 if (bs->file)
2983 return bdrv_save_vmstate(bs->file, buf, pos, size);
2984 return -ENOTSUP;
178e08a5
AL
2985}
2986
45566e9c
CH
2987int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2988 int64_t pos, int size)
178e08a5
AL
2989{
2990 BlockDriver *drv = bs->drv;
2991 if (!drv)
2992 return -ENOMEDIUM;
7cdb1f6d
MK
2993 if (drv->bdrv_load_vmstate)
2994 return drv->bdrv_load_vmstate(bs, buf, pos, size);
2995 if (bs->file)
2996 return bdrv_load_vmstate(bs->file, buf, pos, size);
2997 return -ENOTSUP;
178e08a5
AL
2998}
2999
8b9b0cc2
KW
3000void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
3001{
3002 BlockDriver *drv = bs->drv;
3003
3004 if (!drv || !drv->bdrv_debug_event) {
3005 return;
3006 }
3007
0ed8b6f6 3008 drv->bdrv_debug_event(bs, event);
8b9b0cc2
KW
3009
3010}
3011
faea38e7
FB
3012/**************************************************************/
3013/* handling of snapshots */
3014
feeee5ac
MDCF
3015int bdrv_can_snapshot(BlockDriverState *bs)
3016{
3017 BlockDriver *drv = bs->drv;
07b70bfb 3018 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
feeee5ac
MDCF
3019 return 0;
3020 }
3021
3022 if (!drv->bdrv_snapshot_create) {
3023 if (bs->file != NULL) {
3024 return bdrv_can_snapshot(bs->file);
3025 }
3026 return 0;
3027 }
3028
3029 return 1;
3030}
3031
199630b6
BS
3032int bdrv_is_snapshot(BlockDriverState *bs)
3033{
3034 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
3035}
3036
f9092b10
MA
3037BlockDriverState *bdrv_snapshots(void)
3038{
3039 BlockDriverState *bs;
3040
3ac906f7 3041 if (bs_snapshots) {
f9092b10 3042 return bs_snapshots;
3ac906f7 3043 }
f9092b10
MA
3044
3045 bs = NULL;
3046 while ((bs = bdrv_next(bs))) {
3047 if (bdrv_can_snapshot(bs)) {
3ac906f7
MA
3048 bs_snapshots = bs;
3049 return bs;
f9092b10
MA
3050 }
3051 }
3052 return NULL;
f9092b10
MA
3053}
3054
5fafdf24 3055int bdrv_snapshot_create(BlockDriverState *bs,
faea38e7
FB
3056 QEMUSnapshotInfo *sn_info)
3057{
3058 BlockDriver *drv = bs->drv;
3059 if (!drv)
19cb3738 3060 return -ENOMEDIUM;
7cdb1f6d
MK
3061 if (drv->bdrv_snapshot_create)
3062 return drv->bdrv_snapshot_create(bs, sn_info);
3063 if (bs->file)
3064 return bdrv_snapshot_create(bs->file, sn_info);
3065 return -ENOTSUP;
faea38e7
FB
3066}
3067
5fafdf24 3068int bdrv_snapshot_goto(BlockDriverState *bs,
faea38e7
FB
3069 const char *snapshot_id)
3070{
3071 BlockDriver *drv = bs->drv;
7cdb1f6d
MK
3072 int ret, open_ret;
3073
faea38e7 3074 if (!drv)
19cb3738 3075 return -ENOMEDIUM;
7cdb1f6d
MK
3076 if (drv->bdrv_snapshot_goto)
3077 return drv->bdrv_snapshot_goto(bs, snapshot_id);
3078
3079 if (bs->file) {
3080 drv->bdrv_close(bs);
3081 ret = bdrv_snapshot_goto(bs->file, snapshot_id);
3082 open_ret = drv->bdrv_open(bs, bs->open_flags);
3083 if (open_ret < 0) {
3084 bdrv_delete(bs->file);
3085 bs->drv = NULL;
3086 return open_ret;
3087 }
3088 return ret;
3089 }
3090
3091 return -ENOTSUP;
faea38e7
FB
3092}
3093
3094int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
3095{
3096 BlockDriver *drv = bs->drv;
3097 if (!drv)
19cb3738 3098 return -ENOMEDIUM;
7cdb1f6d
MK
3099 if (drv->bdrv_snapshot_delete)
3100 return drv->bdrv_snapshot_delete(bs, snapshot_id);
3101 if (bs->file)
3102 return bdrv_snapshot_delete(bs->file, snapshot_id);
3103 return -ENOTSUP;
faea38e7
FB
3104}
3105
5fafdf24 3106int bdrv_snapshot_list(BlockDriverState *bs,
faea38e7
FB
3107 QEMUSnapshotInfo **psn_info)
3108{
3109 BlockDriver *drv = bs->drv;
3110 if (!drv)
19cb3738 3111 return -ENOMEDIUM;
7cdb1f6d
MK
3112 if (drv->bdrv_snapshot_list)
3113 return drv->bdrv_snapshot_list(bs, psn_info);
3114 if (bs->file)
3115 return bdrv_snapshot_list(bs->file, psn_info);
3116 return -ENOTSUP;
faea38e7
FB
3117}
3118
51ef6727 3119int bdrv_snapshot_load_tmp(BlockDriverState *bs,
3120 const char *snapshot_name)
3121{
3122 BlockDriver *drv = bs->drv;
3123 if (!drv) {
3124 return -ENOMEDIUM;
3125 }
3126 if (!bs->read_only) {
3127 return -EINVAL;
3128 }
3129 if (drv->bdrv_snapshot_load_tmp) {
3130 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
3131 }
3132 return -ENOTSUP;
3133}
3134
e8a6bb9c
MT
3135BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
3136 const char *backing_file)
3137{
3138 if (!bs->drv) {
3139 return NULL;
3140 }
3141
3142 if (bs->backing_hd) {
3143 if (strcmp(bs->backing_file, backing_file) == 0) {
3144 return bs->backing_hd;
3145 } else {
3146 return bdrv_find_backing_image(bs->backing_hd, backing_file);
3147 }
3148 }
3149
3150 return NULL;
3151}
3152
f198fd1c
BC
3153int bdrv_get_backing_file_depth(BlockDriverState *bs)
3154{
3155 if (!bs->drv) {
3156 return 0;
3157 }
3158
3159 if (!bs->backing_hd) {
3160 return 0;
3161 }
3162
3163 return 1 + bdrv_get_backing_file_depth(bs->backing_hd);
3164}
3165
79fac568
JC
3166BlockDriverState *bdrv_find_base(BlockDriverState *bs)
3167{
3168 BlockDriverState *curr_bs = NULL;
3169
3170 if (!bs) {
3171 return NULL;
3172 }
3173
3174 curr_bs = bs;
3175
3176 while (curr_bs->backing_hd) {
3177 curr_bs = curr_bs->backing_hd;
3178 }
3179 return curr_bs;
3180}
3181
faea38e7
FB
#define NB_SUFFIXES 4

/*
 * Format @size (a byte count) into @buf using binary K/M/G/T suffixes.
 *
 * Values up to 999 are printed verbatim.  Larger values are printed with
 * one decimal while below 10 units of the chosen suffix, and as a rounded
 * integer otherwise.  Anything too large for the previous suffixes is
 * expressed in T.  At most @buf_size bytes are written (snprintf
 * semantics).  Returns @buf.
 */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    /* Explicit char list: the table is indexed, never used as a string */
    static const char suffixes[NB_SUFFIXES] = { 'K', 'M', 'G', 'T' };
    int64_t base = 1024;
    int i;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    for (i = 0; i < NB_SUFFIXES; i++, base *= 1024) {
        if (size < (10 * base)) {
            snprintf(buf, buf_size, "%0.1f%c",
                     (double)size / base,
                     suffixes[i]);
            break;
        }
        if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
            /*
             * Round to nearest using quotient and remainder.  Computing
             * size + base / 2 would overflow int64_t for sizes close to
             * INT64_MAX, which is undefined behaviour.
             */
            int64_t rounded = size / base + (size % base >= (base >> 1));

            snprintf(buf, buf_size, "%" PRId64 "%c",
                     rounded,
                     suffixes[i]);
            break;
        }
    }
    return buf;
}
3211
3212char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
3213{
3214 char buf1[128], date_buf[128], clock_buf[128];
3b9f94e1
FB
3215#ifdef _WIN32
3216 struct tm *ptm;
3217#else
faea38e7 3218 struct tm tm;
3b9f94e1 3219#endif
faea38e7
FB
3220 time_t ti;
3221 int64_t secs;
3222
3223 if (!sn) {
5fafdf24
TS
3224 snprintf(buf, buf_size,
3225 "%-10s%-20s%7s%20s%15s",
faea38e7
FB
3226 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
3227 } else {
3228 ti = sn->date_sec;
3b9f94e1
FB
3229#ifdef _WIN32
3230 ptm = localtime(&ti);
3231 strftime(date_buf, sizeof(date_buf),
3232 "%Y-%m-%d %H:%M:%S", ptm);
3233#else
faea38e7
FB
3234 localtime_r(&ti, &tm);
3235 strftime(date_buf, sizeof(date_buf),
3236 "%Y-%m-%d %H:%M:%S", &tm);
3b9f94e1 3237#endif
faea38e7
FB
3238 secs = sn->vm_clock_nsec / 1000000000;
3239 snprintf(clock_buf, sizeof(clock_buf),
3240 "%02d:%02d:%02d.%03d",
3241 (int)(secs / 3600),
3242 (int)((secs / 60) % 60),
5fafdf24 3243 (int)(secs % 60),
faea38e7
FB
3244 (int)((sn->vm_clock_nsec / 1000000) % 1000));
3245 snprintf(buf, buf_size,
5fafdf24 3246 "%-10s%-20s%7s%20s%15s",
faea38e7
FB
3247 sn->id_str, sn->name,
3248 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
3249 date_buf,
3250 clock_buf);
3251 }
3252 return buf;
3253}
3254
ea2384d3 3255/**************************************************************/
83f64091 3256/* async I/Os */
ea2384d3 3257
3b69e4b9 3258BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
f141eafe 3259 QEMUIOVector *qiov, int nb_sectors,
3b69e4b9 3260 BlockDriverCompletionFunc *cb, void *opaque)
83f64091 3261{
bbf0a440
SH
3262 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
3263
b2a61371 3264 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
8c5873d6 3265 cb, opaque, false);
ea2384d3
FB
3266}
3267
f141eafe
AL
3268BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
3269 QEMUIOVector *qiov, int nb_sectors,
3270 BlockDriverCompletionFunc *cb, void *opaque)
ea2384d3 3271{
bbf0a440
SH
3272 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
3273
1a6e115b 3274 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
8c5873d6 3275 cb, opaque, true);
83f64091
FB
3276}
3277
40b4f539
KW
3278
3279typedef struct MultiwriteCB {
3280 int error;
3281 int num_requests;
3282 int num_callbacks;
3283 struct {
3284 BlockDriverCompletionFunc *cb;
3285 void *opaque;
3286 QEMUIOVector *free_qiov;
40b4f539
KW
3287 } callbacks[];
3288} MultiwriteCB;
3289
3290static void multiwrite_user_cb(MultiwriteCB *mcb)
3291{
3292 int i;
3293
3294 for (i = 0; i < mcb->num_callbacks; i++) {
3295 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1e1ea48d
SH
3296 if (mcb->callbacks[i].free_qiov) {
3297 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
3298 }
7267c094 3299 g_free(mcb->callbacks[i].free_qiov);
40b4f539
KW
3300 }
3301}
3302
3303static void multiwrite_cb(void *opaque, int ret)
3304{
3305 MultiwriteCB *mcb = opaque;
3306
6d519a5f
SH
3307 trace_multiwrite_cb(mcb, ret);
3308
cb6d3ca0 3309 if (ret < 0 && !mcb->error) {
40b4f539 3310 mcb->error = ret;
40b4f539
KW
3311 }
3312
3313 mcb->num_requests--;
3314 if (mcb->num_requests == 0) {
de189a1b 3315 multiwrite_user_cb(mcb);
7267c094 3316 g_free(mcb);
40b4f539
KW
3317 }
3318}
3319
3320static int multiwrite_req_compare(const void *a, const void *b)
3321{
77be4366
CH
3322 const BlockRequest *req1 = a, *req2 = b;
3323
3324 /*
3325 * Note that we can't simply subtract req2->sector from req1->sector
3326 * here as that could overflow the return value.
3327 */
3328 if (req1->sector > req2->sector) {
3329 return 1;
3330 } else if (req1->sector < req2->sector) {
3331 return -1;
3332 } else {
3333 return 0;
3334 }
40b4f539
KW
3335}
3336
3337/*
3338 * Takes a bunch of requests and tries to merge them. Returns the number of
3339 * requests that remain after merging.
3340 */
3341static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3342 int num_reqs, MultiwriteCB *mcb)
3343{
3344 int i, outidx;
3345
3346 // Sort requests by start sector
3347 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3348
3349 // Check if adjacent requests touch the same clusters. If so, combine them,
3350 // filling up gaps with zero sectors.
3351 outidx = 0;
3352 for (i = 1; i < num_reqs; i++) {
3353 int merge = 0;
3354 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3355
b6a127a1 3356 // Handle exactly sequential writes and overlapping writes.
40b4f539
KW
3357 if (reqs[i].sector <= oldreq_last) {
3358 merge = 1;
3359 }
3360
e2a305fb
CH
3361 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3362 merge = 0;
3363 }
3364
40b4f539
KW
3365 if (merge) {
3366 size_t size;
7267c094 3367 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
40b4f539
KW
3368 qemu_iovec_init(qiov,
3369 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3370
3371 // Add the first request to the merged one. If the requests are
3372 // overlapping, drop the last sectors of the first request.
3373 size = (reqs[i].sector - reqs[outidx].sector) << 9;
1b093c48 3374 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
40b4f539 3375
b6a127a1
PB
3376 // We should need to add any zeros between the two requests
3377 assert (reqs[i].sector <= oldreq_last);
40b4f539
KW
3378
3379 // Add the second request
1b093c48 3380 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
40b4f539 3381
cbf1dff2 3382 reqs[outidx].nb_sectors = qiov->size >> 9;
40b4f539
KW
3383 reqs[outidx].qiov = qiov;
3384
3385 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3386 } else {
3387 outidx++;
3388 reqs[outidx].sector = reqs[i].sector;
3389 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3390 reqs[outidx].qiov = reqs[i].qiov;
3391 }
3392 }
3393
3394 return outidx + 1;
3395}
3396
3397/*
3398 * Submit multiple AIO write requests at once.
3399 *
3400 * On success, the function returns 0 and all requests in the reqs array have
3401 * been submitted. In error case this function returns -1, and any of the
3402 * requests may or may not be submitted yet. In particular, this means that the
3403 * callback will be called for some of the requests, for others it won't. The
3404 * caller must check the error field of the BlockRequest to wait for the right
3405 * callbacks (if error != 0, no callback will be called).
3406 *
3407 * The implementation may modify the contents of the reqs array, e.g. to merge
3408 * requests. However, the fields opaque and error are left unmodified as they
3409 * are used to signal failure for a single request to the caller.
3410 */
3411int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3412{
40b4f539
KW
3413 MultiwriteCB *mcb;
3414 int i;
3415
301db7c2
RH
3416 /* don't submit writes if we don't have a medium */
3417 if (bs->drv == NULL) {
3418 for (i = 0; i < num_reqs; i++) {
3419 reqs[i].error = -ENOMEDIUM;
3420 }
3421 return -1;
3422 }
3423
40b4f539
KW
3424 if (num_reqs == 0) {
3425 return 0;
3426 }
3427
3428 // Create MultiwriteCB structure
7267c094 3429 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
40b4f539
KW
3430 mcb->num_requests = 0;
3431 mcb->num_callbacks = num_reqs;
3432
3433 for (i = 0; i < num_reqs; i++) {
3434 mcb->callbacks[i].cb = reqs[i].cb;
3435 mcb->callbacks[i].opaque = reqs[i].opaque;
3436 }
3437
3438 // Check for mergable requests
3439 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3440
6d519a5f
SH
3441 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3442
df9309fb
PB
3443 /* Run the aio requests. */
3444 mcb->num_requests = num_reqs;
40b4f539 3445 for (i = 0; i < num_reqs; i++) {
ad54ae80 3446 bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
40b4f539 3447 reqs[i].nb_sectors, multiwrite_cb, mcb);
40b4f539
KW
3448 }
3449
3450 return 0;
40b4f539
KW
3451}
3452
83f64091 3453void bdrv_aio_cancel(BlockDriverAIOCB *acb)
83f64091 3454{
6bbff9a0 3455 acb->pool->cancel(acb);
83f64091
FB
3456}
3457
98f90dba
ZYW
3458/* block I/O throttling */
3459static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3460 bool is_write, double elapsed_time, uint64_t *wait)
3461{
3462 uint64_t bps_limit = 0;
3463 double bytes_limit, bytes_base, bytes_res;
3464 double slice_time, wait_time;
3465
3466 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3467 bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3468 } else if (bs->io_limits.bps[is_write]) {
3469 bps_limit = bs->io_limits.bps[is_write];
3470 } else {
3471 if (wait) {
3472 *wait = 0;
3473 }
3474
3475 return false;
3476 }
3477
3478 slice_time = bs->slice_end - bs->slice_start;
3479 slice_time /= (NANOSECONDS_PER_SECOND);
3480 bytes_limit = bps_limit * slice_time;
3481 bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3482 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3483 bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3484 }
3485
3486 /* bytes_base: the bytes of data which have been read/written; and
3487 * it is obtained from the history statistic info.
3488 * bytes_res: the remaining bytes of data which need to be read/written.
3489 * (bytes_base + bytes_res) / bps_limit: used to calcuate
3490 * the total time for completing reading/writting all data.
3491 */
3492 bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
3493
3494 if (bytes_base + bytes_res <= bytes_limit) {
3495 if (wait) {
3496 *wait = 0;
3497 }
3498
3499 return false;
3500 }
3501
3502 /* Calc approx time to dispatch */
3503 wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3504
3505 /* When the I/O rate at runtime exceeds the limits,
3506 * bs->slice_end need to be extended in order that the current statistic
3507 * info can be kept until the timer fire, so it is increased and tuned
3508 * based on the result of experiment.
3509 */
3510 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3511 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3512 if (wait) {
3513 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3514 }
3515
3516 return true;
3517}
3518
3519static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3520 double elapsed_time, uint64_t *wait)
3521{
3522 uint64_t iops_limit = 0;
3523 double ios_limit, ios_base;
3524 double slice_time, wait_time;
3525
3526 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3527 iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3528 } else if (bs->io_limits.iops[is_write]) {
3529 iops_limit = bs->io_limits.iops[is_write];
3530 } else {
3531 if (wait) {
3532 *wait = 0;
3533 }
3534
3535 return false;
3536 }
3537
3538 slice_time = bs->slice_end - bs->slice_start;
3539 slice_time /= (NANOSECONDS_PER_SECOND);
3540 ios_limit = iops_limit * slice_time;
3541 ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3542 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3543 ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
3544 }
3545
3546 if (ios_base + 1 <= ios_limit) {
3547 if (wait) {
3548 *wait = 0;
3549 }
3550
3551 return false;
3552 }
3553
3554 /* Calc approx time to dispatch */
3555 wait_time = (ios_base + 1) / iops_limit;
3556 if (wait_time > elapsed_time) {
3557 wait_time = wait_time - elapsed_time;
3558 } else {
3559 wait_time = 0;
3560 }
3561
3562 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3563 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3564 if (wait) {
3565 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3566 }
3567
3568 return true;
3569}
3570
3571static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3572 bool is_write, int64_t *wait)
3573{
3574 int64_t now, max_wait;
3575 uint64_t bps_wait = 0, iops_wait = 0;
3576 double elapsed_time;
3577 int bps_ret, iops_ret;
3578
3579 now = qemu_get_clock_ns(vm_clock);
3580 if ((bs->slice_start < now)
3581 && (bs->slice_end > now)) {
3582 bs->slice_end = now + bs->slice_time;
3583 } else {
3584 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
3585 bs->slice_start = now;
3586 bs->slice_end = now + bs->slice_time;
3587
3588 bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
3589 bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3590
3591 bs->io_base.ios[is_write] = bs->nr_ops[is_write];
3592 bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
3593 }
3594
3595 elapsed_time = now - bs->slice_start;
3596 elapsed_time /= (NANOSECONDS_PER_SECOND);
3597
3598 bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
3599 is_write, elapsed_time, &bps_wait);
3600 iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3601 elapsed_time, &iops_wait);
3602 if (bps_ret || iops_ret) {
3603 max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3604 if (wait) {
3605 *wait = max_wait;
3606 }
3607
3608 now = qemu_get_clock_ns(vm_clock);
3609 if (bs->slice_end < now + max_wait) {
3610 bs->slice_end = now + max_wait;
3611 }
3612
3613 return true;
3614 }
3615
3616 if (wait) {
3617 *wait = 0;
3618 }
3619
3620 return false;
3621}
ce1a14dc 3622
83f64091
FB
3623/**************************************************************/
3624/* async block device emulation */
3625
c16b5a2c
CH
3626typedef struct BlockDriverAIOCBSync {
3627 BlockDriverAIOCB common;
3628 QEMUBH *bh;
3629 int ret;
3630 /* vector translation state */
3631 QEMUIOVector *qiov;
3632 uint8_t *bounce;
3633 int is_write;
3634} BlockDriverAIOCBSync;
3635
3636static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3637{
b666d239
KW
3638 BlockDriverAIOCBSync *acb =
3639 container_of(blockacb, BlockDriverAIOCBSync, common);
6a7ad299 3640 qemu_bh_delete(acb->bh);
36afc451 3641 acb->bh = NULL;
c16b5a2c
CH
3642 qemu_aio_release(acb);
3643}
3644
3645static AIOPool bdrv_em_aio_pool = {
3646 .aiocb_size = sizeof(BlockDriverAIOCBSync),
3647 .cancel = bdrv_aio_cancel_em,
3648};
3649
ce1a14dc 3650static void bdrv_aio_bh_cb(void *opaque)
83f64091 3651{
ce1a14dc 3652 BlockDriverAIOCBSync *acb = opaque;
f141eafe 3653
f141eafe 3654 if (!acb->is_write)
03396148 3655 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
ceb42de8 3656 qemu_vfree(acb->bounce);
ce1a14dc 3657 acb->common.cb(acb->common.opaque, acb->ret);
6a7ad299 3658 qemu_bh_delete(acb->bh);
36afc451 3659 acb->bh = NULL;
ce1a14dc 3660 qemu_aio_release(acb);
83f64091 3661}
beac80cd 3662
f141eafe
AL
3663static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3664 int64_t sector_num,
3665 QEMUIOVector *qiov,
3666 int nb_sectors,
3667 BlockDriverCompletionFunc *cb,
3668 void *opaque,
3669 int is_write)
3670
83f64091 3671{
ce1a14dc 3672 BlockDriverAIOCBSync *acb;
ce1a14dc 3673
c16b5a2c 3674 acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
f141eafe
AL
3675 acb->is_write = is_write;
3676 acb->qiov = qiov;
e268ca52 3677 acb->bounce = qemu_blockalign(bs, qiov->size);
3f3aace8 3678 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
f141eafe
AL
3679
3680 if (is_write) {
d5e6b161 3681 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
1ed20acf 3682 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
f141eafe 3683 } else {
1ed20acf 3684 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
f141eafe
AL
3685 }
3686
ce1a14dc 3687 qemu_bh_schedule(acb->bh);
f141eafe 3688
ce1a14dc 3689 return &acb->common;
beac80cd
FB
3690}
3691
f141eafe
AL
3692static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3693 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 3694 BlockDriverCompletionFunc *cb, void *opaque)
beac80cd 3695{
f141eafe
AL
3696 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3697}
83f64091 3698
f141eafe
AL
3699static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3700 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3701 BlockDriverCompletionFunc *cb, void *opaque)
3702{
3703 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
beac80cd 3704}
beac80cd 3705
68485420
KW
3706
3707typedef struct BlockDriverAIOCBCoroutine {
3708 BlockDriverAIOCB common;
3709 BlockRequest req;
3710 bool is_write;
3711 QEMUBH* bh;
3712} BlockDriverAIOCBCoroutine;
3713
3714static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3715{
3716 qemu_aio_flush();
3717}
3718
3719static AIOPool bdrv_em_co_aio_pool = {
3720 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
3721 .cancel = bdrv_aio_co_cancel_em,
3722};
3723
35246a68 3724static void bdrv_co_em_bh(void *opaque)
68485420
KW
3725{
3726 BlockDriverAIOCBCoroutine *acb = opaque;
3727
3728 acb->common.cb(acb->common.opaque, acb->req.error);
3729 qemu_bh_delete(acb->bh);
3730 qemu_aio_release(acb);
3731}
3732
b2a61371
SH
3733/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3734static void coroutine_fn bdrv_co_do_rw(void *opaque)
3735{
3736 BlockDriverAIOCBCoroutine *acb = opaque;
3737 BlockDriverState *bs = acb->common.bs;
3738
3739 if (!acb->is_write) {
3740 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
470c0504 3741 acb->req.nb_sectors, acb->req.qiov, 0);
b2a61371
SH
3742 } else {
3743 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
f08f2dda 3744 acb->req.nb_sectors, acb->req.qiov, 0);
b2a61371
SH
3745 }
3746
35246a68 3747 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
b2a61371
SH
3748 qemu_bh_schedule(acb->bh);
3749}
3750
68485420
KW
3751static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3752 int64_t sector_num,
3753 QEMUIOVector *qiov,
3754 int nb_sectors,
3755 BlockDriverCompletionFunc *cb,
3756 void *opaque,
8c5873d6 3757 bool is_write)
68485420
KW
3758{
3759 Coroutine *co;
3760 BlockDriverAIOCBCoroutine *acb;
3761
3762 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3763 acb->req.sector = sector_num;
3764 acb->req.nb_sectors = nb_sectors;
3765 acb->req.qiov = qiov;
3766 acb->is_write = is_write;
3767
8c5873d6 3768 co = qemu_coroutine_create(bdrv_co_do_rw);
68485420
KW
3769 qemu_coroutine_enter(co, acb);
3770
3771 return &acb->common;
3772}
3773
07f07615 3774static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
b2e12bc6 3775{
07f07615
PB
3776 BlockDriverAIOCBCoroutine *acb = opaque;
3777 BlockDriverState *bs = acb->common.bs;
b2e12bc6 3778
07f07615
PB
3779 acb->req.error = bdrv_co_flush(bs);
3780 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
b2e12bc6 3781 qemu_bh_schedule(acb->bh);
b2e12bc6
CH
3782}
3783
07f07615 3784BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
016f5cf6
AG
3785 BlockDriverCompletionFunc *cb, void *opaque)
3786{
07f07615 3787 trace_bdrv_aio_flush(bs, opaque);
016f5cf6 3788
07f07615
PB
3789 Coroutine *co;
3790 BlockDriverAIOCBCoroutine *acb;
016f5cf6 3791
07f07615
PB
3792 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3793 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3794 qemu_coroutine_enter(co, acb);
016f5cf6 3795
016f5cf6
AG
3796 return &acb->common;
3797}
3798
4265d620
PB
3799static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3800{
3801 BlockDriverAIOCBCoroutine *acb = opaque;
3802 BlockDriverState *bs = acb->common.bs;
3803
3804 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3805 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3806 qemu_bh_schedule(acb->bh);
3807}
3808
3809BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3810 int64_t sector_num, int nb_sectors,
3811 BlockDriverCompletionFunc *cb, void *opaque)
3812{
3813 Coroutine *co;
3814 BlockDriverAIOCBCoroutine *acb;
3815
3816 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3817
3818 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3819 acb->req.sector = sector_num;
3820 acb->req.nb_sectors = nb_sectors;
3821 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3822 qemu_coroutine_enter(co, acb);
3823
3824 return &acb->common;
3825}
3826
ea2384d3
FB
3827void bdrv_init(void)
3828{
5efa9d5a 3829 module_call_init(MODULE_INIT_BLOCK);
ea2384d3 3830}
ce1a14dc 3831
eb852011
MA
3832void bdrv_init_with_whitelist(void)
3833{
3834 use_bdrv_whitelist = 1;
3835 bdrv_init();
3836}
3837
c16b5a2c
CH
3838void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3839 BlockDriverCompletionFunc *cb, void *opaque)
ce1a14dc 3840{
ce1a14dc
PB
3841 BlockDriverAIOCB *acb;
3842
6bbff9a0
AL
3843 if (pool->free_aiocb) {
3844 acb = pool->free_aiocb;
3845 pool->free_aiocb = acb->next;
ce1a14dc 3846 } else {
7267c094 3847 acb = g_malloc0(pool->aiocb_size);
6bbff9a0 3848 acb->pool = pool;
ce1a14dc
PB
3849 }
3850 acb->bs = bs;
3851 acb->cb = cb;
3852 acb->opaque = opaque;
3853 return acb;
3854}
3855
3856void qemu_aio_release(void *p)
3857{
6bbff9a0
AL
3858 BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3859 AIOPool *pool = acb->pool;
3860 acb->next = pool->free_aiocb;
3861 pool->free_aiocb = acb;
ce1a14dc 3862}
19cb3738 3863
f9f05dc5
KW
3864/**************************************************************/
3865/* Coroutine block device emulation */
3866
3867typedef struct CoroutineIOCompletion {
3868 Coroutine *coroutine;
3869 int ret;
3870} CoroutineIOCompletion;
3871
3872static void bdrv_co_io_em_complete(void *opaque, int ret)
3873{
3874 CoroutineIOCompletion *co = opaque;
3875
3876 co->ret = ret;
3877 qemu_coroutine_enter(co->coroutine, NULL);
3878}
3879
3880static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3881 int nb_sectors, QEMUIOVector *iov,
3882 bool is_write)
3883{
3884 CoroutineIOCompletion co = {
3885 .coroutine = qemu_coroutine_self(),
3886 };
3887 BlockDriverAIOCB *acb;
3888
3889 if (is_write) {
a652d160
SH
3890 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3891 bdrv_co_io_em_complete, &co);
f9f05dc5 3892 } else {
a652d160
SH
3893 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3894 bdrv_co_io_em_complete, &co);
f9f05dc5
KW
3895 }
3896
59370aaa 3897 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
f9f05dc5
KW
3898 if (!acb) {
3899 return -EIO;
3900 }
3901 qemu_coroutine_yield();
3902
3903 return co.ret;
3904}
3905
3906static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3907 int64_t sector_num, int nb_sectors,
3908 QEMUIOVector *iov)
3909{
3910 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3911}
3912
3913static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3914 int64_t sector_num, int nb_sectors,
3915 QEMUIOVector *iov)
3916{
3917 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3918}
3919
07f07615 3920static void coroutine_fn bdrv_flush_co_entry(void *opaque)
e7a8a783 3921{
07f07615
PB
3922 RwCo *rwco = opaque;
3923
3924 rwco->ret = bdrv_co_flush(rwco->bs);
3925}
3926
3927int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3928{
eb489bb1
KW
3929 int ret;
3930
29cdb251 3931 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
07f07615 3932 return 0;
eb489bb1
KW
3933 }
3934
ca716364 3935 /* Write back cached data to the OS even with cache=unsafe */
eb489bb1
KW
3936 if (bs->drv->bdrv_co_flush_to_os) {
3937 ret = bs->drv->bdrv_co_flush_to_os(bs);
3938 if (ret < 0) {
3939 return ret;
3940 }
3941 }
3942
ca716364
KW
3943 /* But don't actually force it to the disk with cache=unsafe */
3944 if (bs->open_flags & BDRV_O_NO_FLUSH) {
d4c82329 3945 goto flush_parent;
ca716364
KW
3946 }
3947
eb489bb1 3948 if (bs->drv->bdrv_co_flush_to_disk) {
29cdb251 3949 ret = bs->drv->bdrv_co_flush_to_disk(bs);
07f07615
PB
3950 } else if (bs->drv->bdrv_aio_flush) {
3951 BlockDriverAIOCB *acb;
3952 CoroutineIOCompletion co = {
3953 .coroutine = qemu_coroutine_self(),
3954 };
3955
3956 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3957 if (acb == NULL) {
29cdb251 3958 ret = -EIO;
07f07615
PB
3959 } else {
3960 qemu_coroutine_yield();
29cdb251 3961 ret = co.ret;
07f07615 3962 }
07f07615
PB
3963 } else {
3964 /*
3965 * Some block drivers always operate in either writethrough or unsafe
3966 * mode and don't support bdrv_flush therefore. Usually qemu doesn't
3967 * know how the server works (because the behaviour is hardcoded or
3968 * depends on server-side configuration), so we can't ensure that
3969 * everything is safe on disk. Returning an error doesn't work because
3970 * that would break guests even if the server operates in writethrough
3971 * mode.
3972 *
3973 * Let's hope the user knows what he's doing.
3974 */
29cdb251 3975 ret = 0;
07f07615 3976 }
29cdb251
PB
3977 if (ret < 0) {
3978 return ret;
3979 }
3980
3981 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
3982 * in the case of cache=unsafe, so there are no useless flushes.
3983 */
d4c82329 3984flush_parent:
29cdb251 3985 return bdrv_co_flush(bs->file);
07f07615
PB
3986}
3987
0f15423c
AL
3988void bdrv_invalidate_cache(BlockDriverState *bs)
3989{
3990 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3991 bs->drv->bdrv_invalidate_cache(bs);
3992 }
3993}
3994
3995void bdrv_invalidate_cache_all(void)
3996{
3997 BlockDriverState *bs;
3998
3999 QTAILQ_FOREACH(bs, &bdrv_states, list) {
4000 bdrv_invalidate_cache(bs);
4001 }
4002}
4003
07789269
BC
4004void bdrv_clear_incoming_migration_all(void)
4005{
4006 BlockDriverState *bs;
4007
4008 QTAILQ_FOREACH(bs, &bdrv_states, list) {
4009 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
4010 }
4011}
4012
07f07615
PB
4013int bdrv_flush(BlockDriverState *bs)
4014{
4015 Coroutine *co;
4016 RwCo rwco = {
4017 .bs = bs,
4018 .ret = NOT_DONE,
e7a8a783 4019 };
e7a8a783 4020
07f07615
PB
4021 if (qemu_in_coroutine()) {
4022 /* Fast-path if already in coroutine context */
4023 bdrv_flush_co_entry(&rwco);
4024 } else {
4025 co = qemu_coroutine_create(bdrv_flush_co_entry);
4026 qemu_coroutine_enter(co, &rwco);
4027 while (rwco.ret == NOT_DONE) {
4028 qemu_aio_wait();
4029 }
e7a8a783 4030 }
07f07615
PB
4031
4032 return rwco.ret;
e7a8a783
KW
4033}
4034
4265d620
PB
4035static void coroutine_fn bdrv_discard_co_entry(void *opaque)
4036{
4037 RwCo *rwco = opaque;
4038
4039 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
4040}
4041
4042int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
4043 int nb_sectors)
4044{
4045 if (!bs->drv) {
4046 return -ENOMEDIUM;
4047 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
4048 return -EIO;
4049 } else if (bs->read_only) {
4050 return -EROFS;
4051 } else if (bs->drv->bdrv_co_discard) {
4052 return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
4053 } else if (bs->drv->bdrv_aio_discard) {
4054 BlockDriverAIOCB *acb;
4055 CoroutineIOCompletion co = {
4056 .coroutine = qemu_coroutine_self(),
4057 };
4058
4059 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
4060 bdrv_co_io_em_complete, &co);
4061 if (acb == NULL) {
4062 return -EIO;
4063 } else {
4064 qemu_coroutine_yield();
4065 return co.ret;
4066 }
4265d620
PB
4067 } else {
4068 return 0;
4069 }
4070}
4071
4072int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
4073{
4074 Coroutine *co;
4075 RwCo rwco = {
4076 .bs = bs,
4077 .sector_num = sector_num,
4078 .nb_sectors = nb_sectors,
4079 .ret = NOT_DONE,
4080 };
4081
4082 if (qemu_in_coroutine()) {
4083 /* Fast-path if already in coroutine context */
4084 bdrv_discard_co_entry(&rwco);
4085 } else {
4086 co = qemu_coroutine_create(bdrv_discard_co_entry);
4087 qemu_coroutine_enter(co, &rwco);
4088 while (rwco.ret == NOT_DONE) {
4089 qemu_aio_wait();
4090 }
4091 }
4092
4093 return rwco.ret;
4094}
4095
19cb3738
FB
4096/**************************************************************/
4097/* removable device support */
4098
4099/**
4100 * Return TRUE if the media is present
4101 */
4102int bdrv_is_inserted(BlockDriverState *bs)
4103{
4104 BlockDriver *drv = bs->drv;
a1aff5bf 4105
19cb3738
FB
4106 if (!drv)
4107 return 0;
4108 if (!drv->bdrv_is_inserted)
a1aff5bf
MA
4109 return 1;
4110 return drv->bdrv_is_inserted(bs);
19cb3738
FB
4111}
4112
4113/**
8e49ca46
MA
4114 * Return whether the media changed since the last call to this
4115 * function, or -ENOTSUP if we don't know. Most drivers don't know.
19cb3738
FB
4116 */
4117int bdrv_media_changed(BlockDriverState *bs)
4118{
4119 BlockDriver *drv = bs->drv;
19cb3738 4120
8e49ca46
MA
4121 if (drv && drv->bdrv_media_changed) {
4122 return drv->bdrv_media_changed(bs);
4123 }
4124 return -ENOTSUP;
19cb3738
FB
4125}
4126
4127/**
4128 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
4129 */
f36f3949 4130void bdrv_eject(BlockDriverState *bs, bool eject_flag)
19cb3738
FB
4131{
4132 BlockDriver *drv = bs->drv;
19cb3738 4133
822e1cd1
MA
4134 if (drv && drv->bdrv_eject) {
4135 drv->bdrv_eject(bs, eject_flag);
19cb3738 4136 }
6f382ed2
LC
4137
4138 if (bs->device_name[0] != '\0') {
4139 bdrv_emit_qmp_eject_event(bs, eject_flag);
4140 }
19cb3738
FB
4141}
4142
19cb3738
FB
4143/**
4144 * Lock or unlock the media (if it is locked, the user won't be able
4145 * to eject it manually).
4146 */
025e849a 4147void bdrv_lock_medium(BlockDriverState *bs, bool locked)
19cb3738
FB
4148{
4149 BlockDriver *drv = bs->drv;
4150
025e849a 4151 trace_bdrv_lock_medium(bs, locked);
b8c6d095 4152
025e849a
MA
4153 if (drv && drv->bdrv_lock_medium) {
4154 drv->bdrv_lock_medium(bs, locked);
19cb3738
FB
4155 }
4156}
985a03b0
TS
4157
4158/* needed for generic scsi interface */
4159
4160int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
4161{
4162 BlockDriver *drv = bs->drv;
4163
4164 if (drv && drv->bdrv_ioctl)
4165 return drv->bdrv_ioctl(bs, req, buf);
4166 return -ENOTSUP;
4167}
7d780669 4168
221f715d
AL
4169BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
4170 unsigned long int req, void *buf,
4171 BlockDriverCompletionFunc *cb, void *opaque)
7d780669 4172{
221f715d 4173 BlockDriver *drv = bs->drv;
7d780669 4174
221f715d
AL
4175 if (drv && drv->bdrv_aio_ioctl)
4176 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
4177 return NULL;
7d780669 4178}
e268ca52 4179
7b6f9300
MA
4180void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
4181{
4182 bs->buffer_alignment = align;
4183}
7cd1e32a 4184
e268ca52
AL
4185void *qemu_blockalign(BlockDriverState *bs, size_t size)
4186{
4187 return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
4188}
7cd1e32a 4189
4190void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
4191{
4192 int64_t bitmap_size;
a55eb92c 4193
aaa0eb75 4194 bs->dirty_count = 0;
a55eb92c 4195 if (enable) {
c6d22830
JK
4196 if (!bs->dirty_bitmap) {
4197 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
71df14fc
PB
4198 BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1;
4199 bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;
a55eb92c 4200
71df14fc 4201 bs->dirty_bitmap = g_new0(unsigned long, bitmap_size);
a55eb92c 4202 }
7cd1e32a 4203 } else {
c6d22830 4204 if (bs->dirty_bitmap) {
7267c094 4205 g_free(bs->dirty_bitmap);
c6d22830 4206 bs->dirty_bitmap = NULL;
a55eb92c 4207 }
7cd1e32a 4208 }
4209}
4210
4211int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
4212{
6ea44308 4213 int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
a55eb92c 4214
c6d22830
JK
4215 if (bs->dirty_bitmap &&
4216 (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
6d59fec1
MT
4217 return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
4218 (1UL << (chunk % (sizeof(unsigned long) * 8))));
7cd1e32a 4219 } else {
4220 return 0;
4221 }
4222}
4223
a55eb92c
JK
4224void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
4225 int nr_sectors)
7cd1e32a 4226{
4227 set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
4228}
aaa0eb75
LS
4229
4230int64_t bdrv_get_dirty_count(BlockDriverState *bs)
4231{
4232 return bs->dirty_count;
4233}
f88e1a42 4234
db593f25
MT
4235void bdrv_set_in_use(BlockDriverState *bs, int in_use)
4236{
4237 assert(bs->in_use != in_use);
4238 bs->in_use = in_use;
4239}
4240
4241int bdrv_in_use(BlockDriverState *bs)
4242{
4243 return bs->in_use;
4244}
4245
28a7282a
LC
4246void bdrv_iostatus_enable(BlockDriverState *bs)
4247{
d6bf279e 4248 bs->iostatus_enabled = true;
58e21ef5 4249 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
28a7282a
LC
4250}
4251
4252/* The I/O status is only enabled if the drive explicitly
4253 * enables it _and_ the VM is configured to stop on errors */
4254bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
4255{
d6bf279e 4256 return (bs->iostatus_enabled &&
92aa5c6d
PB
4257 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
4258 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP ||
4259 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP));
28a7282a
LC
4260}
4261
4262void bdrv_iostatus_disable(BlockDriverState *bs)
4263{
d6bf279e 4264 bs->iostatus_enabled = false;
28a7282a
LC
4265}
4266
4267void bdrv_iostatus_reset(BlockDriverState *bs)
4268{
4269 if (bdrv_iostatus_is_enabled(bs)) {
58e21ef5 4270 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
28a7282a
LC
4271 }
4272}
4273
28a7282a
LC
4274void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
4275{
3e1caa5f
PB
4276 assert(bdrv_iostatus_is_enabled(bs));
4277 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
58e21ef5
LC
4278 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
4279 BLOCK_DEVICE_IO_STATUS_FAILED;
28a7282a
LC
4280 }
4281}
4282
a597e79c
CH
4283void
4284bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
4285 enum BlockAcctType type)
4286{
4287 assert(type < BDRV_MAX_IOTYPE);
4288
4289 cookie->bytes = bytes;
c488c7f6 4290 cookie->start_time_ns = get_clock();
a597e79c
CH
4291 cookie->type = type;
4292}
4293
4294void
4295bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
4296{
4297 assert(cookie->type < BDRV_MAX_IOTYPE);
4298
4299 bs->nr_bytes[cookie->type] += cookie->bytes;
4300 bs->nr_ops[cookie->type]++;
c488c7f6 4301 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
a597e79c
CH
4302}
4303
f88e1a42
JS
4304int bdrv_img_create(const char *filename, const char *fmt,
4305 const char *base_filename, const char *base_fmt,
4306 char *options, uint64_t img_size, int flags)
4307{
4308 QEMUOptionParameter *param = NULL, *create_options = NULL;
d220894e 4309 QEMUOptionParameter *backing_fmt, *backing_file, *size;
f88e1a42
JS
4310 BlockDriverState *bs = NULL;
4311 BlockDriver *drv, *proto_drv;
96df67d1 4312 BlockDriver *backing_drv = NULL;
f88e1a42
JS
4313 int ret = 0;
4314
4315 /* Find driver and parse its options */
4316 drv = bdrv_find_format(fmt);
4317 if (!drv) {
4318 error_report("Unknown file format '%s'", fmt);
4f70f249 4319 ret = -EINVAL;
f88e1a42
JS
4320 goto out;
4321 }
4322
4323 proto_drv = bdrv_find_protocol(filename);
4324 if (!proto_drv) {
4325 error_report("Unknown protocol '%s'", filename);
4f70f249 4326 ret = -EINVAL;
f88e1a42
JS
4327 goto out;
4328 }
4329
4330 create_options = append_option_parameters(create_options,
4331 drv->create_options);
4332 create_options = append_option_parameters(create_options,
4333 proto_drv->create_options);
4334
4335 /* Create parameter list with default values */
4336 param = parse_option_parameters("", create_options, param);
4337
4338 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4339
4340 /* Parse -o options */
4341 if (options) {
4342 param = parse_option_parameters(options, create_options, param);
4343 if (param == NULL) {
4344 error_report("Invalid options for file format '%s'.", fmt);
4f70f249 4345 ret = -EINVAL;
f88e1a42
JS
4346 goto out;
4347 }
4348 }
4349
4350 if (base_filename) {
4351 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4352 base_filename)) {
4353 error_report("Backing file not supported for file format '%s'",
4354 fmt);
4f70f249 4355 ret = -EINVAL;
f88e1a42
JS
4356 goto out;
4357 }
4358 }
4359
4360 if (base_fmt) {
4361 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4362 error_report("Backing file format not supported for file "
4363 "format '%s'", fmt);
4f70f249 4364 ret = -EINVAL;
f88e1a42
JS
4365 goto out;
4366 }
4367 }
4368
792da93a
JS
4369 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4370 if (backing_file && backing_file->value.s) {
4371 if (!strcmp(filename, backing_file->value.s)) {
4372 error_report("Error: Trying to create an image with the "
4373 "same filename as the backing file");
4f70f249 4374 ret = -EINVAL;
792da93a
JS
4375 goto out;
4376 }
4377 }
4378
f88e1a42
JS
4379 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4380 if (backing_fmt && backing_fmt->value.s) {
96df67d1
SH
4381 backing_drv = bdrv_find_format(backing_fmt->value.s);
4382 if (!backing_drv) {
f88e1a42
JS
4383 error_report("Unknown backing file format '%s'",
4384 backing_fmt->value.s);
4f70f249 4385 ret = -EINVAL;
f88e1a42
JS
4386 goto out;
4387 }
4388 }
4389
4390 // The size for the image must always be specified, with one exception:
4391 // If we are using a backing file, we can obtain the size from there
d220894e
KW
4392 size = get_option_parameter(param, BLOCK_OPT_SIZE);
4393 if (size && size->value.n == -1) {
f88e1a42
JS
4394 if (backing_file && backing_file->value.s) {
4395 uint64_t size;
f88e1a42 4396 char buf[32];
63090dac
PB
4397 int back_flags;
4398
4399 /* backing files always opened read-only */
4400 back_flags =
4401 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
f88e1a42 4402
f88e1a42
JS
4403 bs = bdrv_new("");
4404
63090dac 4405 ret = bdrv_open(bs, backing_file->value.s, back_flags, backing_drv);
f88e1a42 4406 if (ret < 0) {
96df67d1 4407 error_report("Could not open '%s'", backing_file->value.s);
f88e1a42
JS
4408 goto out;
4409 }
4410 bdrv_get_geometry(bs, &size);
4411 size *= 512;
4412
4413 snprintf(buf, sizeof(buf), "%" PRId64, size);
4414 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4415 } else {
4416 error_report("Image creation needs a size parameter");
4f70f249 4417 ret = -EINVAL;
f88e1a42
JS
4418 goto out;
4419 }
4420 }
4421
4422 printf("Formatting '%s', fmt=%s ", filename, fmt);
4423 print_option_parameters(param);
4424 puts("");
4425
4426 ret = bdrv_create(drv, filename, param);
4427
4428 if (ret < 0) {
4429 if (ret == -ENOTSUP) {
4430 error_report("Formatting or formatting option not supported for "
4431 "file format '%s'", fmt);
4432 } else if (ret == -EFBIG) {
4433 error_report("The image size is too large for file format '%s'",
4434 fmt);
4435 } else {
4436 error_report("%s: error while creating %s: %s", filename, fmt,
4437 strerror(-ret));
4438 }
4439 }
4440
4441out:
4442 free_option_parameters(create_options);
4443 free_option_parameters(param);
4444
4445 if (bs) {
4446 bdrv_delete(bs);
4447 }
4f70f249
JS
4448
4449 return ret;
f88e1a42 4450}