]> git.proxmox.com Git - mirror_qemu.git/blame - block.c
block: fix snapshot on QED
[mirror_qemu.git] / block.c
CommitLineData
fc01f7e7
FB
1/*
2 * QEMU System Emulator block driver
5fafdf24 3 *
fc01f7e7 4 * Copyright (c) 2003 Fabrice Bellard
5fafdf24 5 *
fc01f7e7
FB
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
3990d09a 24#include "config-host.h"
faf07963 25#include "qemu-common.h"
6d519a5f 26#include "trace.h"
376253ec 27#include "monitor.h"
ea2384d3 28#include "block_int.h"
5efa9d5a 29#include "module.h"
f795e743 30#include "qjson.h"
68485420 31#include "qemu-coroutine.h"
b2023818 32#include "qmp-commands.h"
0563e191 33#include "qemu-timer.h"
fc01f7e7 34
71e72a19 35#ifdef CONFIG_BSD
7674e7bf
FB
36#include <sys/types.h>
37#include <sys/stat.h>
38#include <sys/ioctl.h>
72cf2d4f 39#include <sys/queue.h>
c5e97233 40#ifndef __DragonFly__
7674e7bf
FB
41#include <sys/disk.h>
42#endif
c5e97233 43#endif
7674e7bf 44
49dc768d
AL
45#ifdef _WIN32
46#include <windows.h>
47#endif
48
1c9805a3
SH
49#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
50
470c0504
SH
51typedef enum {
52 BDRV_REQ_COPY_ON_READ = 0x1,
f08f2dda 53 BDRV_REQ_ZERO_WRITE = 0x2,
470c0504
SH
54} BdrvRequestFlags;
55
7d4b4ba5 56static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
f141eafe
AL
57static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
58 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
c87c0672 59 BlockDriverCompletionFunc *cb, void *opaque);
f141eafe
AL
60static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
61 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 62 BlockDriverCompletionFunc *cb, void *opaque);
f9f05dc5
KW
63static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
64 int64_t sector_num, int nb_sectors,
65 QEMUIOVector *iov);
66static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
67 int64_t sector_num, int nb_sectors,
68 QEMUIOVector *iov);
c5fbe571 69static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
470c0504
SH
70 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
71 BdrvRequestFlags flags);
1c9805a3 72static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
f08f2dda
SH
73 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74 BdrvRequestFlags flags);
b2a61371
SH
75static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
76 int64_t sector_num,
77 QEMUIOVector *qiov,
78 int nb_sectors,
79 BlockDriverCompletionFunc *cb,
80 void *opaque,
8c5873d6 81 bool is_write);
b2a61371 82static void coroutine_fn bdrv_co_do_rw(void *opaque);
621f0589
KW
83static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
84 int64_t sector_num, int nb_sectors);
ec530c81 85
98f90dba
ZYW
86static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
87 bool is_write, double elapsed_time, uint64_t *wait);
88static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
89 double elapsed_time, uint64_t *wait);
90static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
91 bool is_write, int64_t *wait);
92
1b7bdbc1
SH
93static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
94 QTAILQ_HEAD_INITIALIZER(bdrv_states);
7ee930d0 95
8a22f02a
SH
96static QLIST_HEAD(, BlockDriver) bdrv_drivers =
97 QLIST_HEAD_INITIALIZER(bdrv_drivers);
ea2384d3 98
f9092b10
MA
99/* The device to use for VM snapshots */
100static BlockDriverState *bs_snapshots;
101
eb852011
MA
102/* If non-zero, use only whitelisted block drivers */
103static int use_bdrv_whitelist;
104
#ifdef _WIN32
/* Return non-zero if filename begins with a drive letter and ':',
 * e.g. "c:" or "C:foo". */
static int is_windows_drive_prefix(const char *filename)
{
    int is_letter = (filename[0] >= 'a' && filename[0] <= 'z') ||
                    (filename[0] >= 'A' && filename[0] <= 'Z');

    return is_letter && filename[1] == ':';
}

/* Return non-zero if filename names a whole drive ("c:") or a
 * Win32 device path ("\\.\..." or "//./..."). */
int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) && filename[2] == '\0') {
        return 1;
    }
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL)) {
        return 1;
    }
    return 0;
}
#endif
124
/* throttling disk I/O limits */

/* Tear down I/O throttling on @bs: drain the queue of throttled requests,
 * delete the slice timer and reset all slice accounting state. */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    /* wake every coroutine that was queued while throttling was active */
    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end = 0;
    bs->slice_time = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}
143
/* Timer callback for I/O throttling: when the wait period expires,
 * release the next queued (throttled) request. */
static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}
150
/* Initialize I/O throttling state on @bs and start a fresh accounting
 * slice.  Counterpart of bdrv_io_limits_disable(). */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end = bs->slice_start + bs->slice_time;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
    bs->io_limits_enabled = true;
}
161
162bool bdrv_io_limits_enabled(BlockDriverState *bs)
163{
164 BlockIOLimit *io_limits = &bs->io_limits;
165 return io_limits->bps[BLOCK_IO_LIMIT_READ]
166 || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
167 || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
168 || io_limits->iops[BLOCK_IO_LIMIT_READ]
169 || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
170 || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
171}
172
/* Throttle the calling coroutine's request: queue behind earlier throttled
 * requests (FIFO), then sleep until the configured limits allow @nb_sectors
 * of I/O.  Must be called from coroutine context. */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    /* preserve FIFO order: wait behind requests queued before us */
    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* In fact, we hope to keep each request's timing, in FIFO mode. The next
     * throttled requests will not be dequeued until the current request is
     * allowed to be serviced. So if the current request still exceeds the
     * limits, it will be inserted to the head. All requests followed it will
     * be still in throttled_reqs queue.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    /* let the next queued request re-evaluate the limits */
    qemu_co_queue_next(&bs->throttled_reqs);
}
197
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    /* a drive letter ("c:"), or a drive-relative name, is not a protocol */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 0;
    }
#endif
    return strchr(path, ':') != NULL;
}
210
/* Return 1 if @path is absolute, 0 otherwise.  An optional "protocol:"
 * prefix is skipped before looking at the first path character. */
int path_is_absolute(const char *path)
{
    const char *p;
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (*path == '/' || *path == '\\') {
        return 1;
    }
#endif
    p = strchr(path, ':');
    p = p ? p + 1 : path;
#ifdef _WIN32
    return *p == '/' || *p == '\\';
#else
    return *p == '/';
#endif
}
230
/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URL are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *proto_end, *sep, *dir_end;
    int len;

    if (dest_size <= 0) {
        return;
    }

    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* skip an optional "protocol:" prefix in base_path */
    proto_end = strchr(base_path, ':');
    proto_end = proto_end ? proto_end + 1 : base_path;

    /* locate the character just past the last directory separator */
    sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *bslash = strrchr(base_path, '\\');
        if (!sep || bslash > sep) {
            sep = bslash;
        }
    }
#endif
    sep = sep ? sep + 1 : base_path;

    /* the directory part ends at whichever marker lies further right */
    dir_end = (sep > proto_end) ? sep : proto_end;

    len = dir_end - base_path;
    if (len > dest_size - 1) {
        len = dest_size - 1;
    }
    memcpy(dest, base_path, len);
    dest[len] = '\0';
    pstrcat(dest, dest_size, filename);
}
274
/* Register a block driver with the block layer, filling in coroutine/AIO
 * emulation callbacks for drivers that implement only one of the I/O
 * interfaces. */
void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
/* create a new block device (by default it is empty) */
/* Named devices (device_name != "") are additionally linked into the global
 * bdrv_states list; anonymous ones are not. */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    return bs;
}
308
ea2384d3
FB
309BlockDriver *bdrv_find_format(const char *format_name)
310{
311 BlockDriver *drv1;
8a22f02a
SH
312 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
313 if (!strcmp(drv1->format_name, format_name)) {
ea2384d3 314 return drv1;
8a22f02a 315 }
ea2384d3
FB
316 }
317 return NULL;
318}
319
/* Return non-zero if @drv may be used under -drive format whitelisting.
 * An empty whitelist (CONFIG_BDRV_WHITELIST expands to no entries before
 * the terminator — TODO confirm against config-host.h) allows everything. */
static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **p;

    if (!whitelist[0])
        return 1; /* no whitelist, anything goes */

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}
337
338BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
339{
340 BlockDriver *drv = bdrv_find_format(format_name);
341 return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
342}
343
/* Arguments and result slot for running a driver's bdrv_create() inside a
 * coroutine (see bdrv_create()). */
typedef struct CreateCo {
    BlockDriver *drv;            /* driver whose bdrv_create is invoked */
    char *filename;              /* owned copy; freed by bdrv_create() */
    QEMUOptionParameter *options; /* borrowed, not owned */
    int ret;                     /* NOT_DONE until the coroutine finishes */
} CreateCo;

/* Coroutine entry point: run the driver's create callback and store the
 * result so the caller can poll for completion. */
static void coroutine_fn bdrv_create_co_entry(void *opaque)
{
    CreateCo *cco = opaque;
    assert(cco->drv);

    cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
}
358
0e7e1989
KW
359int bdrv_create(BlockDriver *drv, const char* filename,
360 QEMUOptionParameter *options)
ea2384d3 361{
5b7e1542
ZYW
362 int ret;
363
364 Coroutine *co;
365 CreateCo cco = {
366 .drv = drv,
367 .filename = g_strdup(filename),
368 .options = options,
369 .ret = NOT_DONE,
370 };
371
372 if (!drv->bdrv_create) {
ea2384d3 373 return -ENOTSUP;
5b7e1542
ZYW
374 }
375
376 if (qemu_in_coroutine()) {
377 /* Fast-path if already in coroutine context */
378 bdrv_create_co_entry(&cco);
379 } else {
380 co = qemu_coroutine_create(bdrv_create_co_entry);
381 qemu_coroutine_enter(co, &cco);
382 while (cco.ret == NOT_DONE) {
383 qemu_aio_wait();
384 }
385 }
386
387 ret = cco.ret;
388 g_free(cco.filename);
0e7e1989 389
5b7e1542 390 return ret;
ea2384d3
FB
391}
392
84a12e66
CH
393int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
394{
395 BlockDriver *drv;
396
b50cbabc 397 drv = bdrv_find_protocol(filename);
84a12e66 398 if (drv == NULL) {
16905d71 399 return -ENOENT;
84a12e66
CH
400 }
401
402 return bdrv_create(drv, filename, options);
403}
404
#ifdef _WIN32
/* Fill @filename with the name of a freshly created temporary file.
 * Note: @size is unused on Windows; GetTempFileName() assumes MAX_PATH. */
void get_tmp_filename(char *filename, int size)
{
    char temp_dir[MAX_PATH];

    GetTempPath(MAX_PATH, temp_dir);
    GetTempFileName(temp_dir, "qem", 0, filename);
}
#else
/* Fill @filename (buffer of @size bytes) with the name of a freshly
 * created temporary file under $TMPDIR (default /tmp). */
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;
    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    fd = mkstemp(filename);
    /* Fix: mkstemp() can fail (unwritable tmpdir, truncated template);
     * previously close(-1) was called, which just raises EBADF.  The
     * void return type still cannot report the failure to callers. */
    if (fd >= 0) {
        close(fd);
    }
}
#endif
fc01f7e7 427
84a12e66
CH
428/*
429 * Detect host devices. By convention, /dev/cdrom[N] is always
430 * recognized as a host CDROM.
431 */
432static BlockDriver *find_hdev_driver(const char *filename)
433{
434 int score_max = 0, score;
435 BlockDriver *drv = NULL, *d;
436
437 QLIST_FOREACH(d, &bdrv_drivers, list) {
438 if (d->bdrv_probe_device) {
439 score = d->bdrv_probe_device(filename);
440 if (score > score_max) {
441 score_max = score;
442 drv = d;
443 }
444 }
445 }
446
447 return drv;
448}
449
/* Select the protocol driver for @filename: host-device probing first,
 * then the "<protocol>:" prefix, falling back to the "file" driver for
 * plain paths.  Returns NULL if a prefix is present but unknown. */
BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    /* path_has_protocol() guarantees a ':' is present */
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1; /* truncate overly long prefixes */
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
489
/* Probe the image at @filename and store the best-matching format driver
 * in *@pdrv (NULL on failure).  scsi-generic devices and empty drives get
 * the "raw" driver.  Returns 0 or a negative errno. */
static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    /* read the image header and let every driver score it */
    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            /* ret is the number of header bytes actually read */
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}
538
/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            /* propagate the driver's negative errno */
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}
562
c3993cdc
SH
563/**
564 * Set open flags for a given cache mode
565 *
566 * Return 0 on success, -1 if the cache mode was invalid.
567 */
568int bdrv_parse_cache_flags(const char *mode, int *flags)
569{
570 *flags &= ~BDRV_O_CACHE_MASK;
571
572 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
573 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
92196b2f
SH
574 } else if (!strcmp(mode, "directsync")) {
575 *flags |= BDRV_O_NOCACHE;
c3993cdc
SH
576 } else if (!strcmp(mode, "writeback")) {
577 *flags |= BDRV_O_CACHE_WB;
578 } else if (!strcmp(mode, "unsafe")) {
579 *flags |= BDRV_O_CACHE_WB;
580 *flags |= BDRV_O_NO_FLUSH;
581 } else if (!strcmp(mode, "writethrough")) {
582 /* this is the default */
583 } else {
584 return -1;
585 }
586
587 return 0;
588}
589
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

/* Drop one copy-on-read reference; must be balanced with an earlier
 * bdrv_enable_copy_on_read() call. */
void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
605
/*
 * Common part for opening disk images and files
 */
/* Resets the per-open fields of @bs, then opens @filename with @drv —
 * either directly (protocol drivers with bdrv_file_open) or by first
 * opening a protocol layer into bs->file.  On failure all partially
 * initialized state is torn down again.  Returns 0 or negative errno. */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    bs->file = NULL;
    bs->total_sectors = 0;
    bs->encrypted = 0;
    bs->valid_key = 0;
    bs->sg = 0;
    bs->open_flags = flags;
    bs->growable = 0;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);
    bs->backing_file[0] = '\0';

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    /* a temporary snapshot file can be unlinked once it is open */
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}
695
/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 */
/* On success stores a new, growable BlockDriverState in *@pbs and returns
 * 0; on failure the state is deleted and a negative errno returned. */
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    int ret;

    drv = bdrv_find_protocol(filename);
    if (!drv) {
        return -ENOENT;
    }

    bs = bdrv_new("");
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        bdrv_delete(bs);
        return ret;
    }
    bs->growable = 1;
    *pbs = bs;
    return 0;
}
720
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 */
/* With BDRV_O_SNAPSHOT, a temporary qcow2 overlay backed by @filename is
 * created and opened instead of the image itself.  If @drv is NULL the
 * format is probed.  A backing file named by the image is opened
 * read-only.  Returns 0 or a negative errno. */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    char tmp_filename[PATH_MAX];

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* open the image once only to learn its size and driver */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        get_tmp_filename(tmp_filename, sizeof(tmp_filename));

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        /* create the temporary overlay as qcow2 backed by @filename */
        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        /* from here on, open the overlay instead of the original image */
        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
        char backing_filename[PATH_MAX];
        int back_flags;
        BlockDriver *back_drv = NULL;

        bs->backing_hd = bdrv_new("");

        if (path_has_protocol(bs->backing_file)) {
            pstrcpy(backing_filename, sizeof(backing_filename),
                    bs->backing_file);
        } else {
            path_combine(backing_filename, sizeof(backing_filename),
                         filename, bs->backing_file);
        }

        if (bs->backing_format[0] != '\0') {
            back_drv = bdrv_find_format(bs->backing_format);
        }

        /* backing files always opened read-only */
        back_flags =
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
        if (bs->is_temporary) {
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
        } else {
            /* base image inherits from "parent" */
            bs->backing_hd->keep_read_only = bs->keep_read_only;
        }
    }

    /* media-change notification is deferred until the key is set for
     * encrypted images */
    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}
854
/* Flush and close @bs: cancel any block job, drain outstanding I/O,
 * delete the backing file chain, close the driver and the underlying
 * protocol file, and disable I/O throttling.  The state object itself
 * is not freed (see bdrv_delete()). */
void bdrv_close(BlockDriverState *bs)
{
    bdrv_flush(bs);
    if (bs->drv) {
        if (bs->job) {
            block_job_cancel_sync(bs->job);
        }
        bdrv_drain_all();

        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        /* on POSIX the temporary file was already unlinked at open time */
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;

        if (bs->file != NULL) {
            bdrv_close(bs->file);
        }

        bdrv_dev_change_media_cb(bs, false);
    }

    /*throttling disk I/O limits*/
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
894
/* Close every named BlockDriverState on the global bdrv_states list. */
void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}
903
/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete. Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;
    bool busy;

    do {
        busy = qemu_aio_wait();

        /* FIXME: We do not have timer support here, so this is effectively
         * a busy wait.
         */
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
            if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
                qemu_co_queue_restart_all(&bs->throttled_reqs);
                busy = true;
            }
        }
    } while (busy);

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}
941
/* make a BlockDriverState anonymous by removing from bdrv_state list.
   Also, NULL terminate the device_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}
951
/* Give the driver a chance to update its internal back-pointers after the
 * BlockDriverState contents were swapped (see bdrv_append()). */
static void bdrv_rebind(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_rebind) {
        bs->drv->bdrv_rebind(bs);
    }
}
958
/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 *
 * NOTE: the statement order below is deliberate — 'tmp' is built from
 * bs_new, patched with the fields that must remain on the top layer,
 * and only then swapped; do not reorder.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous */
    assert(bs_new->device_name[0] == '\0');

    tmp = *bs_new;

    /* there are some fields that need to stay on the top layer: */

    /* dev info */
    tmp.dev_ops = bs_top->dev_ops;
    tmp.dev_opaque = bs_top->dev_opaque;
    tmp.dev = bs_top->dev;
    tmp.buffer_alignment = bs_top->buffer_alignment;
    tmp.copy_on_read = bs_top->copy_on_read;

    /* i/o timing parameters */
    tmp.slice_time = bs_top->slice_time;
    tmp.slice_start = bs_top->slice_start;
    tmp.slice_end = bs_top->slice_end;
    tmp.io_limits = bs_top->io_limits;
    tmp.io_base = bs_top->io_base;
    tmp.throttled_reqs = bs_top->throttled_reqs;
    tmp.block_timer = bs_top->block_timer;
    tmp.io_limits_enabled = bs_top->io_limits_enabled;

    /* geometry */
    tmp.cyls = bs_top->cyls;
    tmp.heads = bs_top->heads;
    tmp.secs = bs_top->secs;
    tmp.translation = bs_top->translation;

    /* r/w error */
    tmp.on_read_error = bs_top->on_read_error;
    tmp.on_write_error = bs_top->on_write_error;

    /* i/o status */
    tmp.iostatus_enabled = bs_top->iostatus_enabled;
    tmp.iostatus = bs_top->iostatus;

    /* keep the same entry in bdrv_states */
    pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
    tmp.list = bs_top->list;

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    tmp.backing_hd = bs_new;
    pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);
    bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format));

    /* swap contents of the fixed new bs and the current top */
    *bs_new = *bs_top;
    *bs_top = tmp;

    /* device_name[] was carried over from the old bs_top. bs_new
     * shouldn't be in bdrv_states, so we need to make device_name[]
     * reflect the anonymity of bs_new
     */
    bs_new->device_name[0] = '\0';

    /* clear the copied fields in the new backing file */
    bdrv_detach_dev(bs_new, bs_new->dev);

    qemu_co_queue_init(&bs_new->throttled_reqs);
    memset(&bs_new->io_base, 0, sizeof(bs_new->io_base));
    memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
    bdrv_iostatus_disable(bs_new);

    /* we don't use bdrv_io_limits_disable() for this, because we don't want
     * to affect or delete the block_timer, as it has been moved to bs_top */
    bs_new->io_limits_enabled = false;
    bs_new->block_timer = NULL;
    bs_new->slice_time = 0;
    bs_new->slice_start = 0;
    bs_new->slice_end = 0;

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_top);
}
1051
/* Close and free @bs.  Must not still be attached to a device, owned by a
 * block job, or marked in-use. */
void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);
    assert(!bs->job);
    assert(!bs->in_use);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);
    if (bs->file != NULL) {
        bdrv_delete(bs->file);
    }

    assert(bs != bs_snapshots);
    g_free(bs);
}
1069
fa879d62
MA
1070int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1071/* TODO change to DeviceState *dev when all users are qdevified */
18846dee 1072{
fa879d62 1073 if (bs->dev) {
18846dee
MA
1074 return -EBUSY;
1075 }
fa879d62 1076 bs->dev = dev;
28a7282a 1077 bdrv_iostatus_reset(bs);
18846dee
MA
1078 return 0;
1079}
1080
fa879d62
MA
1081/* TODO qdevified devices don't use this, remove when devices are qdevified */
1082void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
18846dee 1083{
fa879d62
MA
1084 if (bdrv_attach_dev(bs, dev) < 0) {
1085 abort();
1086 }
1087}
1088
1089void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1090/* TODO change to DeviceState *dev when all users are qdevified */
1091{
1092 assert(bs->dev == dev);
1093 bs->dev = NULL;
0e49de52
MA
1094 bs->dev_ops = NULL;
1095 bs->dev_opaque = NULL;
29e05f20 1096 bs->buffer_alignment = 512;
18846dee
MA
1097}
1098
fa879d62
MA
1099/* TODO change to return DeviceState * when all users are qdevified */
1100void *bdrv_get_attached_dev(BlockDriverState *bs)
18846dee 1101{
fa879d62 1102 return bs->dev;
18846dee
MA
1103}
1104
0e49de52
MA
1105void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1106 void *opaque)
1107{
1108 bs->dev_ops = ops;
1109 bs->dev_opaque = opaque;
2c6942fa
MA
1110 if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
1111 bs_snapshots = NULL;
1112 }
0e49de52
MA
1113}
1114
329c0a48
LC
1115void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1116 BlockQMPEventAction action, int is_read)
1117{
1118 QObject *data;
1119 const char *action_str;
1120
1121 switch (action) {
1122 case BDRV_ACTION_REPORT:
1123 action_str = "report";
1124 break;
1125 case BDRV_ACTION_IGNORE:
1126 action_str = "ignore";
1127 break;
1128 case BDRV_ACTION_STOP:
1129 action_str = "stop";
1130 break;
1131 default:
1132 abort();
1133 }
1134
1135 data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1136 bdrv->device_name,
1137 action_str,
1138 is_read ? "read" : "write");
1139 monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1140
1141 qobject_decref(data);
1142}
1143
6f382ed2
LC
1144static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
1145{
1146 QObject *data;
1147
1148 data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
1149 bdrv_get_device_name(bs), ejected);
1150 monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
1151
1152 qobject_decref(data);
1153}
1154
7d4b4ba5 1155static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
0e49de52 1156{
145feb17 1157 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
6f382ed2 1158 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
7d4b4ba5 1159 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
6f382ed2
LC
1160 if (tray_was_closed) {
1161 /* tray open */
1162 bdrv_emit_qmp_eject_event(bs, true);
1163 }
1164 if (load) {
1165 /* tray close */
1166 bdrv_emit_qmp_eject_event(bs, false);
1167 }
145feb17
MA
1168 }
1169}
1170
2c6942fa
MA
1171bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1172{
1173 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1174}
1175
025ccaa7
PB
1176void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1177{
1178 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
1179 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
1180 }
1181}
1182
e4def80b
MA
1183bool bdrv_dev_is_tray_open(BlockDriverState *bs)
1184{
1185 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
1186 return bs->dev_ops->is_tray_open(bs->dev_opaque);
1187 }
1188 return false;
1189}
1190
145feb17
MA
1191static void bdrv_dev_resize_cb(BlockDriverState *bs)
1192{
1193 if (bs->dev_ops && bs->dev_ops->resize_cb) {
1194 bs->dev_ops->resize_cb(bs->dev_opaque);
0e49de52
MA
1195 }
1196}
1197
f107639a
MA
1198bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1199{
1200 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1201 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1202 }
1203 return false;
1204}
1205
e97fc193
AL
1206/*
1207 * Run consistency checks on an image
1208 *
e076f338 1209 * Returns 0 if the check could be completed (it doesn't mean that the image is
a1c7273b 1210 * free of errors) or -errno when an internal error occurred. The results of the
e076f338 1211 * check are stored in res.
e97fc193 1212 */
e076f338 1213int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
e97fc193
AL
1214{
1215 if (bs->drv->bdrv_check == NULL) {
1216 return -ENOTSUP;
1217 }
1218
e076f338 1219 memset(res, 0, sizeof(*res));
9ac228e0 1220 return bs->drv->bdrv_check(bs, res);
e97fc193
AL
1221}
1222
8a426614
KW
1223#define COMMIT_BUF_SECTORS 2048
1224
33e3963e
FB
1225/* commit COW file into the raw image */
1226int bdrv_commit(BlockDriverState *bs)
1227{
19cb3738 1228 BlockDriver *drv = bs->drv;
ee181196 1229 BlockDriver *backing_drv;
8a426614
KW
1230 int64_t sector, total_sectors;
1231 int n, ro, open_flags;
4dca4b63 1232 int ret = 0, rw_ret = 0;
8a426614 1233 uint8_t *buf;
4dca4b63
NS
1234 char filename[1024];
1235 BlockDriverState *bs_rw, *bs_ro;
33e3963e 1236
19cb3738
FB
1237 if (!drv)
1238 return -ENOMEDIUM;
4dca4b63
NS
1239
1240 if (!bs->backing_hd) {
1241 return -ENOTSUP;
33e3963e
FB
1242 }
1243
4dca4b63
NS
1244 if (bs->backing_hd->keep_read_only) {
1245 return -EACCES;
1246 }
ee181196 1247
2d3735d3
SH
1248 if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1249 return -EBUSY;
1250 }
1251
ee181196 1252 backing_drv = bs->backing_hd->drv;
4dca4b63
NS
1253 ro = bs->backing_hd->read_only;
1254 strncpy(filename, bs->backing_hd->filename, sizeof(filename));
1255 open_flags = bs->backing_hd->open_flags;
1256
1257 if (ro) {
1258 /* re-open as RW */
1259 bdrv_delete(bs->backing_hd);
1260 bs->backing_hd = NULL;
1261 bs_rw = bdrv_new("");
ee181196
KW
1262 rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
1263 backing_drv);
4dca4b63
NS
1264 if (rw_ret < 0) {
1265 bdrv_delete(bs_rw);
1266 /* try to re-open read-only */
1267 bs_ro = bdrv_new("");
ee181196
KW
1268 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1269 backing_drv);
4dca4b63
NS
1270 if (ret < 0) {
1271 bdrv_delete(bs_ro);
1272 /* drive not functional anymore */
1273 bs->drv = NULL;
1274 return ret;
1275 }
1276 bs->backing_hd = bs_ro;
1277 return rw_ret;
1278 }
1279 bs->backing_hd = bs_rw;
ea2384d3 1280 }
33e3963e 1281
6ea44308 1282 total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
7267c094 1283 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
8a426614
KW
1284
1285 for (sector = 0; sector < total_sectors; sector += n) {
05c4af54 1286 if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
8a426614
KW
1287
1288 if (bdrv_read(bs, sector, buf, n) != 0) {
1289 ret = -EIO;
1290 goto ro_cleanup;
1291 }
1292
1293 if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1294 ret = -EIO;
1295 goto ro_cleanup;
1296 }
ea2384d3 1297 }
33e3963e 1298 }
95389c86 1299
1d44952f
CH
1300 if (drv->bdrv_make_empty) {
1301 ret = drv->bdrv_make_empty(bs);
1302 bdrv_flush(bs);
1303 }
95389c86 1304
3f5075ae
CH
1305 /*
1306 * Make sure all data we wrote to the backing device is actually
1307 * stable on disk.
1308 */
1309 if (bs->backing_hd)
1310 bdrv_flush(bs->backing_hd);
4dca4b63
NS
1311
1312ro_cleanup:
7267c094 1313 g_free(buf);
4dca4b63
NS
1314
1315 if (ro) {
1316 /* re-open as RO */
1317 bdrv_delete(bs->backing_hd);
1318 bs->backing_hd = NULL;
1319 bs_ro = bdrv_new("");
ee181196
KW
1320 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1321 backing_drv);
4dca4b63
NS
1322 if (ret < 0) {
1323 bdrv_delete(bs_ro);
1324 /* drive not functional anymore */
1325 bs->drv = NULL;
1326 return ret;
1327 }
1328 bs->backing_hd = bs_ro;
1329 bs->backing_hd->keep_read_only = 0;
1330 }
1331
1d44952f 1332 return ret;
33e3963e
FB
1333}
1334
e8877497 1335int bdrv_commit_all(void)
6ab4b5ab
MA
1336{
1337 BlockDriverState *bs;
1338
1339 QTAILQ_FOREACH(bs, &bdrv_states, list) {
e8877497
SH
1340 int ret = bdrv_commit(bs);
1341 if (ret < 0) {
1342 return ret;
1343 }
6ab4b5ab 1344 }
e8877497 1345 return 0;
6ab4b5ab
MA
1346}
1347
dbffbdcf
SH
1348struct BdrvTrackedRequest {
1349 BlockDriverState *bs;
1350 int64_t sector_num;
1351 int nb_sectors;
1352 bool is_write;
1353 QLIST_ENTRY(BdrvTrackedRequest) list;
5f8b6491 1354 Coroutine *co; /* owner, used for deadlock detection */
f4658285 1355 CoQueue wait_queue; /* coroutines blocked on this request */
dbffbdcf
SH
1356};
1357
1358/**
1359 * Remove an active request from the tracked requests list
1360 *
1361 * This function should be called when a tracked request is completing.
1362 */
1363static void tracked_request_end(BdrvTrackedRequest *req)
1364{
1365 QLIST_REMOVE(req, list);
f4658285 1366 qemu_co_queue_restart_all(&req->wait_queue);
dbffbdcf
SH
1367}
1368
1369/**
1370 * Add an active request to the tracked requests list
1371 */
1372static void tracked_request_begin(BdrvTrackedRequest *req,
1373 BlockDriverState *bs,
1374 int64_t sector_num,
1375 int nb_sectors, bool is_write)
1376{
1377 *req = (BdrvTrackedRequest){
1378 .bs = bs,
1379 .sector_num = sector_num,
1380 .nb_sectors = nb_sectors,
1381 .is_write = is_write,
5f8b6491 1382 .co = qemu_coroutine_self(),
dbffbdcf
SH
1383 };
1384
f4658285
SH
1385 qemu_co_queue_init(&req->wait_queue);
1386
dbffbdcf
SH
1387 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1388}
1389
d83947ac
SH
1390/**
1391 * Round a region to cluster boundaries
1392 */
1393static void round_to_clusters(BlockDriverState *bs,
1394 int64_t sector_num, int nb_sectors,
1395 int64_t *cluster_sector_num,
1396 int *cluster_nb_sectors)
1397{
1398 BlockDriverInfo bdi;
1399
1400 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1401 *cluster_sector_num = sector_num;
1402 *cluster_nb_sectors = nb_sectors;
1403 } else {
1404 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1405 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1406 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1407 nb_sectors, c);
1408 }
1409}
1410
f4658285
SH
1411static bool tracked_request_overlaps(BdrvTrackedRequest *req,
1412 int64_t sector_num, int nb_sectors) {
d83947ac
SH
1413 /* aaaa bbbb */
1414 if (sector_num >= req->sector_num + req->nb_sectors) {
1415 return false;
1416 }
1417 /* bbbb aaaa */
1418 if (req->sector_num >= sector_num + nb_sectors) {
1419 return false;
1420 }
1421 return true;
f4658285
SH
1422}
1423
1424static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
1425 int64_t sector_num, int nb_sectors)
1426{
1427 BdrvTrackedRequest *req;
d83947ac
SH
1428 int64_t cluster_sector_num;
1429 int cluster_nb_sectors;
f4658285
SH
1430 bool retry;
1431
d83947ac
SH
1432 /* If we touch the same cluster it counts as an overlap. This guarantees
1433 * that allocating writes will be serialized and not race with each other
1434 * for the same cluster. For example, in copy-on-read it ensures that the
1435 * CoR read and write operations are atomic and guest writes cannot
1436 * interleave between them.
1437 */
1438 round_to_clusters(bs, sector_num, nb_sectors,
1439 &cluster_sector_num, &cluster_nb_sectors);
1440
f4658285
SH
1441 do {
1442 retry = false;
1443 QLIST_FOREACH(req, &bs->tracked_requests, list) {
d83947ac
SH
1444 if (tracked_request_overlaps(req, cluster_sector_num,
1445 cluster_nb_sectors)) {
5f8b6491
SH
1446 /* Hitting this means there was a reentrant request, for
1447 * example, a block driver issuing nested requests. This must
1448 * never happen since it means deadlock.
1449 */
1450 assert(qemu_coroutine_self() != req->co);
1451
f4658285
SH
1452 qemu_co_queue_wait(&req->wait_queue);
1453 retry = true;
1454 break;
1455 }
1456 }
1457 } while (retry);
1458}
1459
756e6736
KW
1460/*
1461 * Return values:
1462 * 0 - success
1463 * -EINVAL - backing format specified, but no file
1464 * -ENOSPC - can't update the backing file because no space is left in the
1465 * image file header
1466 * -ENOTSUP - format driver doesn't support changing the backing file
1467 */
1468int bdrv_change_backing_file(BlockDriverState *bs,
1469 const char *backing_file, const char *backing_fmt)
1470{
1471 BlockDriver *drv = bs->drv;
469ef350 1472 int ret;
756e6736 1473
5f377794
PB
1474 /* Backing file format doesn't make sense without a backing file */
1475 if (backing_fmt && !backing_file) {
1476 return -EINVAL;
1477 }
1478
756e6736 1479 if (drv->bdrv_change_backing_file != NULL) {
469ef350 1480 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
756e6736 1481 } else {
469ef350 1482 ret = -ENOTSUP;
756e6736 1483 }
469ef350
PB
1484
1485 if (ret == 0) {
1486 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
1487 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
1488 }
1489 return ret;
756e6736
KW
1490}
1491
71d0770c
AL
1492static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1493 size_t size)
1494{
1495 int64_t len;
1496
1497 if (!bdrv_is_inserted(bs))
1498 return -ENOMEDIUM;
1499
1500 if (bs->growable)
1501 return 0;
1502
1503 len = bdrv_getlength(bs);
1504
fbb7b4e0
KW
1505 if (offset < 0)
1506 return -EIO;
1507
1508 if ((offset > len) || (len - offset < size))
71d0770c
AL
1509 return -EIO;
1510
1511 return 0;
1512}
1513
1514static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1515 int nb_sectors)
1516{
eb5a3165
JS
1517 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1518 nb_sectors * BDRV_SECTOR_SIZE);
71d0770c
AL
1519}
1520
1c9805a3
SH
1521typedef struct RwCo {
1522 BlockDriverState *bs;
1523 int64_t sector_num;
1524 int nb_sectors;
1525 QEMUIOVector *qiov;
1526 bool is_write;
1527 int ret;
1528} RwCo;
1529
1530static void coroutine_fn bdrv_rw_co_entry(void *opaque)
fc01f7e7 1531{
1c9805a3 1532 RwCo *rwco = opaque;
ea2384d3 1533
1c9805a3
SH
1534 if (!rwco->is_write) {
1535 rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
470c0504 1536 rwco->nb_sectors, rwco->qiov, 0);
1c9805a3
SH
1537 } else {
1538 rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
f08f2dda 1539 rwco->nb_sectors, rwco->qiov, 0);
1c9805a3
SH
1540 }
1541}
e7a8a783 1542
1c9805a3
SH
1543/*
1544 * Process a synchronous request using coroutines
1545 */
1546static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1547 int nb_sectors, bool is_write)
1548{
1549 QEMUIOVector qiov;
1550 struct iovec iov = {
1551 .iov_base = (void *)buf,
1552 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1553 };
1554 Coroutine *co;
1555 RwCo rwco = {
1556 .bs = bs,
1557 .sector_num = sector_num,
1558 .nb_sectors = nb_sectors,
1559 .qiov = &qiov,
1560 .is_write = is_write,
1561 .ret = NOT_DONE,
1562 };
e7a8a783 1563
1c9805a3 1564 qemu_iovec_init_external(&qiov, &iov, 1);
e7a8a783 1565
498e386c
ZYW
1566 /**
1567 * In sync call context, when the vcpu is blocked, this throttling timer
1568 * will not fire; so the I/O throttling function has to be disabled here
1569 * if it has been enabled.
1570 */
1571 if (bs->io_limits_enabled) {
1572 fprintf(stderr, "Disabling I/O throttling on '%s' due "
1573 "to synchronous I/O.\n", bdrv_get_device_name(bs));
1574 bdrv_io_limits_disable(bs);
1575 }
1576
1c9805a3
SH
1577 if (qemu_in_coroutine()) {
1578 /* Fast-path if already in coroutine context */
1579 bdrv_rw_co_entry(&rwco);
1580 } else {
1581 co = qemu_coroutine_create(bdrv_rw_co_entry);
1582 qemu_coroutine_enter(co, &rwco);
1583 while (rwco.ret == NOT_DONE) {
1584 qemu_aio_wait();
1585 }
1586 }
1587 return rwco.ret;
1588}
b338082b 1589
1c9805a3
SH
1590/* return < 0 if error. See bdrv_write() for the return codes */
1591int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1592 uint8_t *buf, int nb_sectors)
1593{
1594 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
fc01f7e7
FB
1595}
1596
71df14fc
PB
1597#define BITS_PER_LONG (sizeof(unsigned long) * 8)
1598
7cd1e32a 1599static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
a55eb92c 1600 int nb_sectors, int dirty)
7cd1e32a
LS
1601{
1602 int64_t start, end;
c6d22830 1603 unsigned long val, idx, bit;
a55eb92c 1604
6ea44308 1605 start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
c6d22830 1606 end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
a55eb92c
JK
1607
1608 for (; start <= end; start++) {
71df14fc
PB
1609 idx = start / BITS_PER_LONG;
1610 bit = start % BITS_PER_LONG;
c6d22830
JK
1611 val = bs->dirty_bitmap[idx];
1612 if (dirty) {
6d59fec1 1613 if (!(val & (1UL << bit))) {
aaa0eb75 1614 bs->dirty_count++;
6d59fec1 1615 val |= 1UL << bit;
aaa0eb75 1616 }
c6d22830 1617 } else {
6d59fec1 1618 if (val & (1UL << bit)) {
aaa0eb75 1619 bs->dirty_count--;
6d59fec1 1620 val &= ~(1UL << bit);
aaa0eb75 1621 }
c6d22830
JK
1622 }
1623 bs->dirty_bitmap[idx] = val;
7cd1e32a
LS
1624 }
1625}
1626
5fafdf24 1627/* Return < 0 if error. Important errors are:
19cb3738
FB
1628 -EIO generic I/O error (may happen for all errors)
1629 -ENOMEDIUM No media inserted.
1630 -EINVAL Invalid sector number or nb_sectors
1631 -EACCES Trying to write a read-only device
1632*/
5fafdf24 1633int bdrv_write(BlockDriverState *bs, int64_t sector_num,
fc01f7e7
FB
1634 const uint8_t *buf, int nb_sectors)
1635{
1c9805a3 1636 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
83f64091
FB
1637}
1638
eda578e5
AL
1639int bdrv_pread(BlockDriverState *bs, int64_t offset,
1640 void *buf, int count1)
83f64091 1641{
6ea44308 1642 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
83f64091
FB
1643 int len, nb_sectors, count;
1644 int64_t sector_num;
9a8c4cce 1645 int ret;
83f64091
FB
1646
1647 count = count1;
1648 /* first read to align to sector start */
6ea44308 1649 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
83f64091
FB
1650 if (len > count)
1651 len = count;
6ea44308 1652 sector_num = offset >> BDRV_SECTOR_BITS;
83f64091 1653 if (len > 0) {
9a8c4cce
KW
1654 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1655 return ret;
6ea44308 1656 memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
83f64091
FB
1657 count -= len;
1658 if (count == 0)
1659 return count1;
1660 sector_num++;
1661 buf += len;
1662 }
1663
1664 /* read the sectors "in place" */
6ea44308 1665 nb_sectors = count >> BDRV_SECTOR_BITS;
83f64091 1666 if (nb_sectors > 0) {
9a8c4cce
KW
1667 if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1668 return ret;
83f64091 1669 sector_num += nb_sectors;
6ea44308 1670 len = nb_sectors << BDRV_SECTOR_BITS;
83f64091
FB
1671 buf += len;
1672 count -= len;
1673 }
1674
1675 /* add data from the last sector */
1676 if (count > 0) {
9a8c4cce
KW
1677 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1678 return ret;
83f64091
FB
1679 memcpy(buf, tmp_buf, count);
1680 }
1681 return count1;
1682}
1683
eda578e5
AL
1684int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1685 const void *buf, int count1)
83f64091 1686{
6ea44308 1687 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
83f64091
FB
1688 int len, nb_sectors, count;
1689 int64_t sector_num;
9a8c4cce 1690 int ret;
83f64091
FB
1691
1692 count = count1;
1693 /* first write to align to sector start */
6ea44308 1694 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
83f64091
FB
1695 if (len > count)
1696 len = count;
6ea44308 1697 sector_num = offset >> BDRV_SECTOR_BITS;
83f64091 1698 if (len > 0) {
9a8c4cce
KW
1699 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1700 return ret;
6ea44308 1701 memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
9a8c4cce
KW
1702 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1703 return ret;
83f64091
FB
1704 count -= len;
1705 if (count == 0)
1706 return count1;
1707 sector_num++;
1708 buf += len;
1709 }
1710
1711 /* write the sectors "in place" */
6ea44308 1712 nb_sectors = count >> BDRV_SECTOR_BITS;
83f64091 1713 if (nb_sectors > 0) {
9a8c4cce
KW
1714 if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1715 return ret;
83f64091 1716 sector_num += nb_sectors;
6ea44308 1717 len = nb_sectors << BDRV_SECTOR_BITS;
83f64091
FB
1718 buf += len;
1719 count -= len;
1720 }
1721
1722 /* add data from the last sector */
1723 if (count > 0) {
9a8c4cce
KW
1724 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1725 return ret;
83f64091 1726 memcpy(tmp_buf, buf, count);
9a8c4cce
KW
1727 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1728 return ret;
83f64091
FB
1729 }
1730 return count1;
1731}
83f64091 1732
f08145fe
KW
1733/*
1734 * Writes to the file and ensures that no writes are reordered across this
1735 * request (acts as a barrier)
1736 *
1737 * Returns 0 on success, -errno in error cases.
1738 */
1739int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1740 const void *buf, int count)
1741{
1742 int ret;
1743
1744 ret = bdrv_pwrite(bs, offset, buf, count);
1745 if (ret < 0) {
1746 return ret;
1747 }
1748
92196b2f
SH
1749 /* No flush needed for cache modes that use O_DSYNC */
1750 if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
f08145fe
KW
1751 bdrv_flush(bs);
1752 }
1753
1754 return 0;
1755}
1756
470c0504 1757static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
ab185921
SH
1758 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1759{
1760 /* Perform I/O through a temporary buffer so that users who scribble over
1761 * their read buffer while the operation is in progress do not end up
1762 * modifying the image file. This is critical for zero-copy guest I/O
1763 * where anything might happen inside guest memory.
1764 */
1765 void *bounce_buffer;
1766
79c053bd 1767 BlockDriver *drv = bs->drv;
ab185921
SH
1768 struct iovec iov;
1769 QEMUIOVector bounce_qiov;
1770 int64_t cluster_sector_num;
1771 int cluster_nb_sectors;
1772 size_t skip_bytes;
1773 int ret;
1774
1775 /* Cover entire cluster so no additional backing file I/O is required when
1776 * allocating cluster in the image file.
1777 */
1778 round_to_clusters(bs, sector_num, nb_sectors,
1779 &cluster_sector_num, &cluster_nb_sectors);
1780
470c0504
SH
1781 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
1782 cluster_sector_num, cluster_nb_sectors);
ab185921
SH
1783
1784 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
1785 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
1786 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
1787
79c053bd
SH
1788 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
1789 &bounce_qiov);
ab185921
SH
1790 if (ret < 0) {
1791 goto err;
1792 }
1793
79c053bd
SH
1794 if (drv->bdrv_co_write_zeroes &&
1795 buffer_is_zero(bounce_buffer, iov.iov_len)) {
621f0589
KW
1796 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
1797 cluster_nb_sectors);
79c053bd
SH
1798 } else {
1799 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
ab185921 1800 &bounce_qiov);
79c053bd
SH
1801 }
1802
ab185921
SH
1803 if (ret < 0) {
1804 /* It might be okay to ignore write errors for guest requests. If this
1805 * is a deliberate copy-on-read then we don't want to ignore the error.
1806 * Simply report it in all cases.
1807 */
1808 goto err;
1809 }
1810
1811 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
1812 qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
1813 nb_sectors * BDRV_SECTOR_SIZE);
1814
1815err:
1816 qemu_vfree(bounce_buffer);
1817 return ret;
1818}
1819
c5fbe571
SH
1820/*
1821 * Handle a read request in coroutine context
1822 */
1823static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
470c0504
SH
1824 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1825 BdrvRequestFlags flags)
da1fa91d
KW
1826{
1827 BlockDriver *drv = bs->drv;
dbffbdcf
SH
1828 BdrvTrackedRequest req;
1829 int ret;
da1fa91d 1830
da1fa91d
KW
1831 if (!drv) {
1832 return -ENOMEDIUM;
1833 }
1834 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1835 return -EIO;
1836 }
1837
98f90dba
ZYW
1838 /* throttling disk read I/O */
1839 if (bs->io_limits_enabled) {
1840 bdrv_io_limits_intercept(bs, false, nb_sectors);
1841 }
1842
f4658285 1843 if (bs->copy_on_read) {
470c0504
SH
1844 flags |= BDRV_REQ_COPY_ON_READ;
1845 }
1846 if (flags & BDRV_REQ_COPY_ON_READ) {
1847 bs->copy_on_read_in_flight++;
1848 }
1849
1850 if (bs->copy_on_read_in_flight) {
f4658285
SH
1851 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1852 }
1853
dbffbdcf 1854 tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
ab185921 1855
470c0504 1856 if (flags & BDRV_REQ_COPY_ON_READ) {
ab185921
SH
1857 int pnum;
1858
1859 ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
1860 if (ret < 0) {
1861 goto out;
1862 }
1863
1864 if (!ret || pnum != nb_sectors) {
470c0504 1865 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
ab185921
SH
1866 goto out;
1867 }
1868 }
1869
dbffbdcf 1870 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
ab185921
SH
1871
1872out:
dbffbdcf 1873 tracked_request_end(&req);
470c0504
SH
1874
1875 if (flags & BDRV_REQ_COPY_ON_READ) {
1876 bs->copy_on_read_in_flight--;
1877 }
1878
dbffbdcf 1879 return ret;
da1fa91d
KW
1880}
1881
c5fbe571 1882int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
da1fa91d
KW
1883 int nb_sectors, QEMUIOVector *qiov)
1884{
c5fbe571 1885 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
da1fa91d 1886
470c0504
SH
1887 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1888}
1889
1890int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1891 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1892{
1893 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1894
1895 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1896 BDRV_REQ_COPY_ON_READ);
c5fbe571
SH
1897}
1898
f08f2dda
SH
1899static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1900 int64_t sector_num, int nb_sectors)
1901{
1902 BlockDriver *drv = bs->drv;
1903 QEMUIOVector qiov;
1904 struct iovec iov;
1905 int ret;
1906
621f0589
KW
1907 /* TODO Emulate only part of misaligned requests instead of letting block
1908 * drivers return -ENOTSUP and emulate everything */
1909
f08f2dda
SH
1910 /* First try the efficient write zeroes operation */
1911 if (drv->bdrv_co_write_zeroes) {
621f0589
KW
1912 ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1913 if (ret != -ENOTSUP) {
1914 return ret;
1915 }
f08f2dda
SH
1916 }
1917
1918 /* Fall back to bounce buffer if write zeroes is unsupported */
1919 iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
1920 iov.iov_base = qemu_blockalign(bs, iov.iov_len);
1921 memset(iov.iov_base, 0, iov.iov_len);
1922 qemu_iovec_init_external(&qiov, &iov, 1);
1923
1924 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
1925
1926 qemu_vfree(iov.iov_base);
1927 return ret;
1928}
1929
c5fbe571
SH
1930/*
1931 * Handle a write request in coroutine context
1932 */
1933static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
f08f2dda
SH
1934 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1935 BdrvRequestFlags flags)
c5fbe571
SH
1936{
1937 BlockDriver *drv = bs->drv;
dbffbdcf 1938 BdrvTrackedRequest req;
6b7cb247 1939 int ret;
da1fa91d
KW
1940
1941 if (!bs->drv) {
1942 return -ENOMEDIUM;
1943 }
1944 if (bs->read_only) {
1945 return -EACCES;
1946 }
1947 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1948 return -EIO;
1949 }
1950
98f90dba
ZYW
1951 /* throttling disk write I/O */
1952 if (bs->io_limits_enabled) {
1953 bdrv_io_limits_intercept(bs, true, nb_sectors);
1954 }
1955
470c0504 1956 if (bs->copy_on_read_in_flight) {
f4658285
SH
1957 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1958 }
1959
dbffbdcf
SH
1960 tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
1961
f08f2dda
SH
1962 if (flags & BDRV_REQ_ZERO_WRITE) {
1963 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
1964 } else {
1965 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1966 }
6b7cb247 1967
da1fa91d
KW
1968 if (bs->dirty_bitmap) {
1969 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1970 }
1971
1972 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1973 bs->wr_highest_sector = sector_num + nb_sectors - 1;
1974 }
1975
dbffbdcf
SH
1976 tracked_request_end(&req);
1977
6b7cb247 1978 return ret;
da1fa91d
KW
1979}
1980
c5fbe571
SH
1981int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1982 int nb_sectors, QEMUIOVector *qiov)
1983{
1984 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1985
f08f2dda
SH
1986 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
1987}
1988
1989int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
1990 int64_t sector_num, int nb_sectors)
1991{
1992 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1993
1994 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
1995 BDRV_REQ_ZERO_WRITE);
c5fbe571
SH
1996}
1997
83f64091
FB
1998/**
1999 * Truncate file to 'offset' bytes (needed only for file protocols)
2000 */
2001int bdrv_truncate(BlockDriverState *bs, int64_t offset)
2002{
2003 BlockDriver *drv = bs->drv;
51762288 2004 int ret;
83f64091 2005 if (!drv)
19cb3738 2006 return -ENOMEDIUM;
83f64091
FB
2007 if (!drv->bdrv_truncate)
2008 return -ENOTSUP;
59f2689d
NS
2009 if (bs->read_only)
2010 return -EACCES;
8591675f
MT
2011 if (bdrv_in_use(bs))
2012 return -EBUSY;
51762288
SH
2013 ret = drv->bdrv_truncate(bs, offset);
2014 if (ret == 0) {
2015 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
145feb17 2016 bdrv_dev_resize_cb(bs);
51762288
SH
2017 }
2018 return ret;
83f64091
FB
2019}
2020
4a1d5e1f
FZ
2021/**
2022 * Length of a allocated file in bytes. Sparse files are counted by actual
2023 * allocated space. Return < 0 if error or unknown.
2024 */
2025int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2026{
2027 BlockDriver *drv = bs->drv;
2028 if (!drv) {
2029 return -ENOMEDIUM;
2030 }
2031 if (drv->bdrv_get_allocated_file_size) {
2032 return drv->bdrv_get_allocated_file_size(bs);
2033 }
2034 if (bs->file) {
2035 return bdrv_get_allocated_file_size(bs->file);
2036 }
2037 return -ENOTSUP;
2038}
2039
83f64091
FB
2040/**
2041 * Length of a file in bytes. Return < 0 if error or unknown.
2042 */
2043int64_t bdrv_getlength(BlockDriverState *bs)
2044{
2045 BlockDriver *drv = bs->drv;
2046 if (!drv)
19cb3738 2047 return -ENOMEDIUM;
51762288 2048
2c6942fa 2049 if (bs->growable || bdrv_dev_has_removable_media(bs)) {
46a4e4e6
SH
2050 if (drv->bdrv_getlength) {
2051 return drv->bdrv_getlength(bs);
2052 }
83f64091 2053 }
46a4e4e6 2054 return bs->total_sectors * BDRV_SECTOR_SIZE;
fc01f7e7
FB
2055}
2056
19cb3738 2057/* return 0 as number of sectors if no device present or error */
96b8f136 2058void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
fc01f7e7 2059{
19cb3738
FB
2060 int64_t length;
2061 length = bdrv_getlength(bs);
2062 if (length < 0)
2063 length = 0;
2064 else
6ea44308 2065 length = length >> BDRV_SECTOR_BITS;
19cb3738 2066 *nb_sectors_ptr = length;
fc01f7e7 2067}
cf98951b 2068
f3d54fc4
AL
/* On-disk MSDOS (MBR) partition table entry; all fields little-endian. */
struct partition {
    uint8_t boot_ind;   /* 0x80 - active */
    uint8_t head;       /* starting head */
    uint8_t sector;     /* starting sector */
    uint8_t cyl;        /* starting cylinder */
    uint8_t sys_ind;    /* What partition type */
    uint8_t end_head;   /* end head */
    uint8_t end_sector; /* end sector */
    uint8_t end_cyl;    /* end cylinder */
    uint32_t start_sect; /* starting sector counting from 0 */
    uint32_t nr_sects;   /* nr of sectors in partition */
} QEMU_PACKED;
f3d54fc4
AL
2081
2082/* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if could not guess */
2083static int guess_disk_lchs(BlockDriverState *bs,
2084 int *pcylinders, int *pheads, int *psectors)
2085{
eb5a3165 2086 uint8_t buf[BDRV_SECTOR_SIZE];
f3d54fc4
AL
2087 int ret, i, heads, sectors, cylinders;
2088 struct partition *p;
2089 uint32_t nr_sects;
a38131b6 2090 uint64_t nb_sectors;
498e386c 2091 bool enabled;
f3d54fc4
AL
2092
2093 bdrv_get_geometry(bs, &nb_sectors);
2094
498e386c
ZYW
2095 /**
2096 * The function will be invoked during startup not only in sync I/O mode,
2097 * but also in async I/O mode. So the I/O throttling function has to
2098 * be disabled temporarily here, not permanently.
2099 */
2100 enabled = bs->io_limits_enabled;
2101 bs->io_limits_enabled = false;
f3d54fc4 2102 ret = bdrv_read(bs, 0, buf, 1);
498e386c 2103 bs->io_limits_enabled = enabled;
f3d54fc4
AL
2104 if (ret < 0)
2105 return -1;
2106 /* test msdos magic */
2107 if (buf[510] != 0x55 || buf[511] != 0xaa)
2108 return -1;
2109 for(i = 0; i < 4; i++) {
2110 p = ((struct partition *)(buf + 0x1be)) + i;
2111 nr_sects = le32_to_cpu(p->nr_sects);
2112 if (nr_sects && p->end_head) {
2113 /* We make the assumption that the partition terminates on
2114 a cylinder boundary */
2115 heads = p->end_head + 1;
2116 sectors = p->end_sector & 63;
2117 if (sectors == 0)
2118 continue;
2119 cylinders = nb_sectors / (heads * sectors);
2120 if (cylinders < 1 || cylinders > 16383)
2121 continue;
2122 *pheads = heads;
2123 *psectors = sectors;
2124 *pcylinders = cylinders;
2125#if 0
2126 printf("guessed geometry: LCHS=%d %d %d\n",
2127 cylinders, heads, sectors);
2128#endif
2129 return 0;
2130 }
2131 }
2132 return -1;
2133}
2134
2135void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
2136{
2137 int translation, lba_detected = 0;
2138 int cylinders, heads, secs;
a38131b6 2139 uint64_t nb_sectors;
f3d54fc4
AL
2140
2141 /* if a geometry hint is available, use it */
2142 bdrv_get_geometry(bs, &nb_sectors);
2143 bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
2144 translation = bdrv_get_translation_hint(bs);
2145 if (cylinders != 0) {
2146 *pcyls = cylinders;
2147 *pheads = heads;
2148 *psecs = secs;
2149 } else {
2150 if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
2151 if (heads > 16) {
2152 /* if heads > 16, it means that a BIOS LBA
2153 translation was active, so the default
2154 hardware geometry is OK */
2155 lba_detected = 1;
2156 goto default_geometry;
2157 } else {
2158 *pcyls = cylinders;
2159 *pheads = heads;
2160 *psecs = secs;
2161 /* disable any translation to be in sync with
2162 the logical geometry */
2163 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
2164 bdrv_set_translation_hint(bs,
2165 BIOS_ATA_TRANSLATION_NONE);
2166 }
2167 }
2168 } else {
2169 default_geometry:
2170 /* if no geometry, use a standard physical disk geometry */
2171 cylinders = nb_sectors / (16 * 63);
2172
2173 if (cylinders > 16383)
2174 cylinders = 16383;
2175 else if (cylinders < 2)
2176 cylinders = 2;
2177 *pcyls = cylinders;
2178 *pheads = 16;
2179 *psecs = 63;
2180 if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
2181 if ((*pcyls * *pheads) <= 131072) {
2182 bdrv_set_translation_hint(bs,
2183 BIOS_ATA_TRANSLATION_LARGE);
2184 } else {
2185 bdrv_set_translation_hint(bs,
2186 BIOS_ATA_TRANSLATION_LBA);
2187 }
2188 }
2189 }
2190 bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
2191 }
2192}
2193
5fafdf24 2194void bdrv_set_geometry_hint(BlockDriverState *bs,
b338082b
FB
2195 int cyls, int heads, int secs)
2196{
2197 bs->cyls = cyls;
2198 bs->heads = heads;
2199 bs->secs = secs;
2200}
2201
46d4767d
FB
2202void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
2203{
2204 bs->translation = translation;
2205}
2206
5fafdf24 2207void bdrv_get_geometry_hint(BlockDriverState *bs,
b338082b
FB
2208 int *pcyls, int *pheads, int *psecs)
2209{
2210 *pcyls = bs->cyls;
2211 *pheads = bs->heads;
2212 *psecs = bs->secs;
2213}
2214
0563e191
ZYW
2215/* throttling disk io limits */
2216void bdrv_set_io_limits(BlockDriverState *bs,
2217 BlockIOLimit *io_limits)
2218{
2219 bs->io_limits = *io_limits;
2220 bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2221}
2222
5bbdbb46
BS
2223/* Recognize floppy formats */
2224typedef struct FDFormat {
2225 FDriveType drive;
2226 uint8_t last_sect;
2227 uint8_t max_track;
2228 uint8_t max_head;
f8d3d128 2229 FDriveRate rate;
5bbdbb46
BS
2230} FDFormat;
2231
2232static const FDFormat fd_formats[] = {
2233 /* First entry is default format */
2234 /* 1.44 MB 3"1/2 floppy disks */
f8d3d128
HP
2235 { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
2236 { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
2237 { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
2238 { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
2239 { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
2240 { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
2241 { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
2242 { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
5bbdbb46 2243 /* 2.88 MB 3"1/2 floppy disks */
f8d3d128
HP
2244 { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
2245 { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
2246 { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
2247 { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
2248 { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
5bbdbb46 2249 /* 720 kB 3"1/2 floppy disks */
f8d3d128
HP
2250 { FDRIVE_DRV_144, 9, 80, 1, FDRIVE_RATE_250K, },
2251 { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
2252 { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
2253 { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
2254 { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
2255 { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
5bbdbb46 2256 /* 1.2 MB 5"1/4 floppy disks */
f8d3d128
HP
2257 { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
2258 { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
2259 { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
2260 { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
2261 { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
5bbdbb46 2262 /* 720 kB 5"1/4 floppy disks */
f8d3d128
HP
2263 { FDRIVE_DRV_120, 9, 80, 1, FDRIVE_RATE_250K, },
2264 { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
5bbdbb46 2265 /* 360 kB 5"1/4 floppy disks */
f8d3d128
HP
2266 { FDRIVE_DRV_120, 9, 40, 1, FDRIVE_RATE_300K, },
2267 { FDRIVE_DRV_120, 9, 40, 0, FDRIVE_RATE_300K, },
2268 { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
2269 { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
5bbdbb46 2270 /* 320 kB 5"1/4 floppy disks */
f8d3d128
HP
2271 { FDRIVE_DRV_120, 8, 40, 1, FDRIVE_RATE_250K, },
2272 { FDRIVE_DRV_120, 8, 40, 0, FDRIVE_RATE_250K, },
5bbdbb46 2273 /* 360 kB must match 5"1/4 better than 3"1/2... */
f8d3d128 2274 { FDRIVE_DRV_144, 9, 80, 0, FDRIVE_RATE_250K, },
5bbdbb46 2275 /* end */
f8d3d128 2276 { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
5bbdbb46
BS
2277};
2278
2279void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
2280 int *max_track, int *last_sect,
f8d3d128
HP
2281 FDriveType drive_in, FDriveType *drive,
2282 FDriveRate *rate)
5bbdbb46
BS
2283{
2284 const FDFormat *parse;
2285 uint64_t nb_sectors, size;
2286 int i, first_match, match;
2287
2288 bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
2289 if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
2290 /* User defined disk */
f8d3d128 2291 *rate = FDRIVE_RATE_500K;
5bbdbb46
BS
2292 } else {
2293 bdrv_get_geometry(bs, &nb_sectors);
2294 match = -1;
2295 first_match = -1;
2296 for (i = 0; ; i++) {
2297 parse = &fd_formats[i];
2298 if (parse->drive == FDRIVE_DRV_NONE) {
2299 break;
2300 }
2301 if (drive_in == parse->drive ||
2302 drive_in == FDRIVE_DRV_NONE) {
2303 size = (parse->max_head + 1) * parse->max_track *
2304 parse->last_sect;
2305 if (nb_sectors == size) {
2306 match = i;
2307 break;
2308 }
2309 if (first_match == -1) {
2310 first_match = i;
2311 }
2312 }
2313 }
2314 if (match == -1) {
2315 if (first_match == -1) {
2316 match = 1;
2317 } else {
2318 match = first_match;
2319 }
2320 parse = &fd_formats[match];
2321 }
2322 *nb_heads = parse->max_head + 1;
2323 *max_track = parse->max_track;
2324 *last_sect = parse->last_sect;
2325 *drive = parse->drive;
f8d3d128 2326 *rate = parse->rate;
5bbdbb46
BS
2327 }
2328}
2329
46d4767d
FB
2330int bdrv_get_translation_hint(BlockDriverState *bs)
2331{
2332 return bs->translation;
2333}
2334
abd7f68d
MA
2335void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
2336 BlockErrorAction on_write_error)
2337{
2338 bs->on_read_error = on_read_error;
2339 bs->on_write_error = on_write_error;
2340}
2341
2342BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
2343{
2344 return is_read ? bs->on_read_error : bs->on_write_error;
2345}
2346
b338082b
FB
2347int bdrv_is_read_only(BlockDriverState *bs)
2348{
2349 return bs->read_only;
2350}
2351
985a03b0
TS
2352int bdrv_is_sg(BlockDriverState *bs)
2353{
2354 return bs->sg;
2355}
2356
e900a7b7
CH
2357int bdrv_enable_write_cache(BlockDriverState *bs)
2358{
2359 return bs->enable_write_cache;
2360}
2361
ea2384d3
FB
2362int bdrv_is_encrypted(BlockDriverState *bs)
2363{
2364 if (bs->backing_hd && bs->backing_hd->encrypted)
2365 return 1;
2366 return bs->encrypted;
2367}
2368
c0f4ce77
AL
2369int bdrv_key_required(BlockDriverState *bs)
2370{
2371 BlockDriverState *backing_hd = bs->backing_hd;
2372
2373 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2374 return 1;
2375 return (bs->encrypted && !bs->valid_key);
2376}
2377
ea2384d3
FB
2378int bdrv_set_key(BlockDriverState *bs, const char *key)
2379{
2380 int ret;
2381 if (bs->backing_hd && bs->backing_hd->encrypted) {
2382 ret = bdrv_set_key(bs->backing_hd, key);
2383 if (ret < 0)
2384 return ret;
2385 if (!bs->encrypted)
2386 return 0;
2387 }
fd04a2ae
SH
2388 if (!bs->encrypted) {
2389 return -EINVAL;
2390 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2391 return -ENOMEDIUM;
2392 }
c0f4ce77 2393 ret = bs->drv->bdrv_set_key(bs, key);
bb5fc20f
AL
2394 if (ret < 0) {
2395 bs->valid_key = 0;
2396 } else if (!bs->valid_key) {
2397 bs->valid_key = 1;
2398 /* call the change callback now, we skipped it on open */
7d4b4ba5 2399 bdrv_dev_change_media_cb(bs, true);
bb5fc20f 2400 }
c0f4ce77 2401 return ret;
ea2384d3
FB
2402}
2403
2404void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
2405{
19cb3738 2406 if (!bs->drv) {
ea2384d3
FB
2407 buf[0] = '\0';
2408 } else {
2409 pstrcpy(buf, buf_size, bs->drv->format_name);
2410 }
2411}
2412
5fafdf24 2413void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
ea2384d3
FB
2414 void *opaque)
2415{
2416 BlockDriver *drv;
2417
8a22f02a 2418 QLIST_FOREACH(drv, &bdrv_drivers, list) {
ea2384d3
FB
2419 it(opaque, drv->format_name);
2420 }
2421}
2422
b338082b
FB
2423BlockDriverState *bdrv_find(const char *name)
2424{
2425 BlockDriverState *bs;
2426
1b7bdbc1
SH
2427 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2428 if (!strcmp(name, bs->device_name)) {
b338082b 2429 return bs;
1b7bdbc1 2430 }
b338082b
FB
2431 }
2432 return NULL;
2433}
2434
2f399b0a
MA
2435BlockDriverState *bdrv_next(BlockDriverState *bs)
2436{
2437 if (!bs) {
2438 return QTAILQ_FIRST(&bdrv_states);
2439 }
2440 return QTAILQ_NEXT(bs, list);
2441}
2442
51de9760 2443void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
81d0912d
FB
2444{
2445 BlockDriverState *bs;
2446
1b7bdbc1 2447 QTAILQ_FOREACH(bs, &bdrv_states, list) {
51de9760 2448 it(opaque, bs);
81d0912d
FB
2449 }
2450}
2451
ea2384d3
FB
2452const char *bdrv_get_device_name(BlockDriverState *bs)
2453{
2454 return bs->device_name;
2455}
2456
c6ca28d6
AL
2457void bdrv_flush_all(void)
2458{
2459 BlockDriverState *bs;
2460
1b7bdbc1 2461 QTAILQ_FOREACH(bs, &bdrv_states, list) {
29cdb251 2462 bdrv_flush(bs);
1b7bdbc1 2463 }
c6ca28d6
AL
2464}
2465
f2feebbd
KW
2466int bdrv_has_zero_init(BlockDriverState *bs)
2467{
2468 assert(bs->drv);
2469
336c1c12
KW
2470 if (bs->drv->bdrv_has_zero_init) {
2471 return bs->drv->bdrv_has_zero_init(bs);
f2feebbd
KW
2472 }
2473
2474 return 1;
2475}
2476
376ae3f1
SH
2477typedef struct BdrvCoIsAllocatedData {
2478 BlockDriverState *bs;
2479 int64_t sector_num;
2480 int nb_sectors;
2481 int *pnum;
2482 int ret;
2483 bool done;
2484} BdrvCoIsAllocatedData;
2485
f58c7b35
TS
2486/*
2487 * Returns true iff the specified sector is present in the disk image. Drivers
2488 * not implementing the functionality are assumed to not support backing files,
2489 * hence all their sectors are reported as allocated.
2490 *
bd9533e3
SH
2491 * If 'sector_num' is beyond the end of the disk image the return value is 0
2492 * and 'pnum' is set to 0.
2493 *
f58c7b35
TS
2494 * 'pnum' is set to the number of sectors (including and immediately following
2495 * the specified sector) that are known to be in the same
2496 * allocated/unallocated state.
2497 *
bd9533e3
SH
2498 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
2499 * beyond the end of the disk image it will be clamped.
f58c7b35 2500 */
060f51c9
SH
2501int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2502 int nb_sectors, int *pnum)
f58c7b35 2503{
bd9533e3
SH
2504 int64_t n;
2505
2506 if (sector_num >= bs->total_sectors) {
2507 *pnum = 0;
2508 return 0;
2509 }
2510
2511 n = bs->total_sectors - sector_num;
2512 if (n < nb_sectors) {
2513 nb_sectors = n;
2514 }
2515
6aebab14 2516 if (!bs->drv->bdrv_co_is_allocated) {
bd9533e3 2517 *pnum = nb_sectors;
f58c7b35
TS
2518 return 1;
2519 }
6aebab14 2520
060f51c9
SH
2521 return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2522}
2523
2524/* Coroutine wrapper for bdrv_is_allocated() */
2525static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2526{
2527 BdrvCoIsAllocatedData *data = opaque;
2528 BlockDriverState *bs = data->bs;
2529
2530 data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2531 data->pnum);
2532 data->done = true;
2533}
2534
2535/*
2536 * Synchronous wrapper around bdrv_co_is_allocated().
2537 *
2538 * See bdrv_co_is_allocated() for details.
2539 */
2540int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2541 int *pnum)
2542{
6aebab14
SH
2543 Coroutine *co;
2544 BdrvCoIsAllocatedData data = {
2545 .bs = bs,
2546 .sector_num = sector_num,
2547 .nb_sectors = nb_sectors,
2548 .pnum = pnum,
2549 .done = false,
2550 };
2551
2552 co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2553 qemu_coroutine_enter(co, &data);
2554 while (!data.done) {
2555 qemu_aio_wait();
2556 }
2557 return data.ret;
f58c7b35
TS
2558}
2559
b2023818 2560BlockInfoList *qmp_query_block(Error **errp)
b338082b 2561{
b2023818 2562 BlockInfoList *head = NULL, *cur_item = NULL;
b338082b
FB
2563 BlockDriverState *bs;
2564
1b7bdbc1 2565 QTAILQ_FOREACH(bs, &bdrv_states, list) {
b2023818 2566 BlockInfoList *info = g_malloc0(sizeof(*info));
d15e5465 2567
b2023818
LC
2568 info->value = g_malloc0(sizeof(*info->value));
2569 info->value->device = g_strdup(bs->device_name);
2570 info->value->type = g_strdup("unknown");
2571 info->value->locked = bdrv_dev_is_medium_locked(bs);
2572 info->value->removable = bdrv_dev_has_removable_media(bs);
d15e5465 2573
e4def80b 2574 if (bdrv_dev_has_removable_media(bs)) {
b2023818
LC
2575 info->value->has_tray_open = true;
2576 info->value->tray_open = bdrv_dev_is_tray_open(bs);
e4def80b 2577 }
f04ef601
LC
2578
2579 if (bdrv_iostatus_is_enabled(bs)) {
b2023818
LC
2580 info->value->has_io_status = true;
2581 info->value->io_status = bs->iostatus;
f04ef601
LC
2582 }
2583
19cb3738 2584 if (bs->drv) {
b2023818
LC
2585 info->value->has_inserted = true;
2586 info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2587 info->value->inserted->file = g_strdup(bs->filename);
2588 info->value->inserted->ro = bs->read_only;
2589 info->value->inserted->drv = g_strdup(bs->drv->format_name);
2590 info->value->inserted->encrypted = bs->encrypted;
2591 if (bs->backing_file[0]) {
2592 info->value->inserted->has_backing_file = true;
2593 info->value->inserted->backing_file = g_strdup(bs->backing_file);
376253ec 2594 }
727f005e
ZYW
2595
2596 if (bs->io_limits_enabled) {
2597 info->value->inserted->bps =
2598 bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2599 info->value->inserted->bps_rd =
2600 bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2601 info->value->inserted->bps_wr =
2602 bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2603 info->value->inserted->iops =
2604 bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2605 info->value->inserted->iops_rd =
2606 bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2607 info->value->inserted->iops_wr =
2608 bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2609 }
b2023818 2610 }
d15e5465 2611
b2023818
LC
2612 /* XXX: waiting for the qapi to support GSList */
2613 if (!cur_item) {
2614 head = cur_item = info;
2615 } else {
2616 cur_item->next = info;
2617 cur_item = info;
b338082b 2618 }
b338082b 2619 }
d15e5465 2620
b2023818 2621 return head;
b338082b 2622}
a36e69dd 2623
f11f57e4
LC
2624/* Consider exposing this as a full fledged QMP command */
2625static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2626{
2627 BlockStats *s;
2628
2629 s = g_malloc0(sizeof(*s));
2630
2631 if (bs->device_name[0]) {
2632 s->has_device = true;
2633 s->device = g_strdup(bs->device_name);
294cc35f
KW
2634 }
2635
f11f57e4
LC
2636 s->stats = g_malloc0(sizeof(*s->stats));
2637 s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2638 s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2639 s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2640 s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2641 s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2642 s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2643 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2644 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2645 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2646
294cc35f 2647 if (bs->file) {
f11f57e4
LC
2648 s->has_parent = true;
2649 s->parent = qmp_query_blockstat(bs->file, NULL);
294cc35f
KW
2650 }
2651
f11f57e4 2652 return s;
294cc35f
KW
2653}
2654
f11f57e4 2655BlockStatsList *qmp_query_blockstats(Error **errp)
218a536a 2656{
f11f57e4 2657 BlockStatsList *head = NULL, *cur_item = NULL;
a36e69dd
TS
2658 BlockDriverState *bs;
2659
1b7bdbc1 2660 QTAILQ_FOREACH(bs, &bdrv_states, list) {
f11f57e4
LC
2661 BlockStatsList *info = g_malloc0(sizeof(*info));
2662 info->value = qmp_query_blockstat(bs, NULL);
2663
2664 /* XXX: waiting for the qapi to support GSList */
2665 if (!cur_item) {
2666 head = cur_item = info;
2667 } else {
2668 cur_item->next = info;
2669 cur_item = info;
2670 }
a36e69dd 2671 }
218a536a 2672
f11f57e4 2673 return head;
a36e69dd 2674}
ea2384d3 2675
045df330
AL
2676const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2677{
2678 if (bs->backing_hd && bs->backing_hd->encrypted)
2679 return bs->backing_file;
2680 else if (bs->encrypted)
2681 return bs->filename;
2682 else
2683 return NULL;
2684}
2685
5fafdf24 2686void bdrv_get_backing_filename(BlockDriverState *bs,
83f64091
FB
2687 char *filename, int filename_size)
2688{
3574c608 2689 pstrcpy(filename, filename_size, bs->backing_file);
83f64091
FB
2690}
2691
5fafdf24 2692int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
faea38e7
FB
2693 const uint8_t *buf, int nb_sectors)
2694{
2695 BlockDriver *drv = bs->drv;
2696 if (!drv)
19cb3738 2697 return -ENOMEDIUM;
faea38e7
FB
2698 if (!drv->bdrv_write_compressed)
2699 return -ENOTSUP;
fbb7b4e0
KW
2700 if (bdrv_check_request(bs, sector_num, nb_sectors))
2701 return -EIO;
a55eb92c 2702
c6d22830 2703 if (bs->dirty_bitmap) {
7cd1e32a
LS
2704 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2705 }
a55eb92c 2706
faea38e7
FB
2707 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2708}
3b46e624 2709
faea38e7
FB
2710int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2711{
2712 BlockDriver *drv = bs->drv;
2713 if (!drv)
19cb3738 2714 return -ENOMEDIUM;
faea38e7
FB
2715 if (!drv->bdrv_get_info)
2716 return -ENOTSUP;
2717 memset(bdi, 0, sizeof(*bdi));
2718 return drv->bdrv_get_info(bs, bdi);
2719}
2720
45566e9c
CH
2721int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2722 int64_t pos, int size)
178e08a5
AL
2723{
2724 BlockDriver *drv = bs->drv;
2725 if (!drv)
2726 return -ENOMEDIUM;
7cdb1f6d
MK
2727 if (drv->bdrv_save_vmstate)
2728 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2729 if (bs->file)
2730 return bdrv_save_vmstate(bs->file, buf, pos, size);
2731 return -ENOTSUP;
178e08a5
AL
2732}
2733
45566e9c
CH
2734int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2735 int64_t pos, int size)
178e08a5
AL
2736{
2737 BlockDriver *drv = bs->drv;
2738 if (!drv)
2739 return -ENOMEDIUM;
7cdb1f6d
MK
2740 if (drv->bdrv_load_vmstate)
2741 return drv->bdrv_load_vmstate(bs, buf, pos, size);
2742 if (bs->file)
2743 return bdrv_load_vmstate(bs->file, buf, pos, size);
2744 return -ENOTSUP;
178e08a5
AL
2745}
2746
8b9b0cc2
KW
2747void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2748{
2749 BlockDriver *drv = bs->drv;
2750
2751 if (!drv || !drv->bdrv_debug_event) {
2752 return;
2753 }
2754
2755 return drv->bdrv_debug_event(bs, event);
2756
2757}
2758
faea38e7
FB
2759/**************************************************************/
2760/* handling of snapshots */
2761
feeee5ac
MDCF
2762int bdrv_can_snapshot(BlockDriverState *bs)
2763{
2764 BlockDriver *drv = bs->drv;
07b70bfb 2765 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
feeee5ac
MDCF
2766 return 0;
2767 }
2768
2769 if (!drv->bdrv_snapshot_create) {
2770 if (bs->file != NULL) {
2771 return bdrv_can_snapshot(bs->file);
2772 }
2773 return 0;
2774 }
2775
2776 return 1;
2777}
2778
199630b6
BS
2779int bdrv_is_snapshot(BlockDriverState *bs)
2780{
2781 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2782}
2783
f9092b10
MA
2784BlockDriverState *bdrv_snapshots(void)
2785{
2786 BlockDriverState *bs;
2787
3ac906f7 2788 if (bs_snapshots) {
f9092b10 2789 return bs_snapshots;
3ac906f7 2790 }
f9092b10
MA
2791
2792 bs = NULL;
2793 while ((bs = bdrv_next(bs))) {
2794 if (bdrv_can_snapshot(bs)) {
3ac906f7
MA
2795 bs_snapshots = bs;
2796 return bs;
f9092b10
MA
2797 }
2798 }
2799 return NULL;
f9092b10
MA
2800}
2801
5fafdf24 2802int bdrv_snapshot_create(BlockDriverState *bs,
faea38e7
FB
2803 QEMUSnapshotInfo *sn_info)
2804{
2805 BlockDriver *drv = bs->drv;
2806 if (!drv)
19cb3738 2807 return -ENOMEDIUM;
7cdb1f6d
MK
2808 if (drv->bdrv_snapshot_create)
2809 return drv->bdrv_snapshot_create(bs, sn_info);
2810 if (bs->file)
2811 return bdrv_snapshot_create(bs->file, sn_info);
2812 return -ENOTSUP;
faea38e7
FB
2813}
2814
5fafdf24 2815int bdrv_snapshot_goto(BlockDriverState *bs,
faea38e7
FB
2816 const char *snapshot_id)
2817{
2818 BlockDriver *drv = bs->drv;
7cdb1f6d
MK
2819 int ret, open_ret;
2820
faea38e7 2821 if (!drv)
19cb3738 2822 return -ENOMEDIUM;
7cdb1f6d
MK
2823 if (drv->bdrv_snapshot_goto)
2824 return drv->bdrv_snapshot_goto(bs, snapshot_id);
2825
2826 if (bs->file) {
2827 drv->bdrv_close(bs);
2828 ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2829 open_ret = drv->bdrv_open(bs, bs->open_flags);
2830 if (open_ret < 0) {
2831 bdrv_delete(bs->file);
2832 bs->drv = NULL;
2833 return open_ret;
2834 }
2835 return ret;
2836 }
2837
2838 return -ENOTSUP;
faea38e7
FB
2839}
2840
2841int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2842{
2843 BlockDriver *drv = bs->drv;
2844 if (!drv)
19cb3738 2845 return -ENOMEDIUM;
7cdb1f6d
MK
2846 if (drv->bdrv_snapshot_delete)
2847 return drv->bdrv_snapshot_delete(bs, snapshot_id);
2848 if (bs->file)
2849 return bdrv_snapshot_delete(bs->file, snapshot_id);
2850 return -ENOTSUP;
faea38e7
FB
2851}
2852
5fafdf24 2853int bdrv_snapshot_list(BlockDriverState *bs,
faea38e7
FB
2854 QEMUSnapshotInfo **psn_info)
2855{
2856 BlockDriver *drv = bs->drv;
2857 if (!drv)
19cb3738 2858 return -ENOMEDIUM;
7cdb1f6d
MK
2859 if (drv->bdrv_snapshot_list)
2860 return drv->bdrv_snapshot_list(bs, psn_info);
2861 if (bs->file)
2862 return bdrv_snapshot_list(bs->file, psn_info);
2863 return -ENOTSUP;
faea38e7
FB
2864}
2865
51ef6727 2866int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2867 const char *snapshot_name)
2868{
2869 BlockDriver *drv = bs->drv;
2870 if (!drv) {
2871 return -ENOMEDIUM;
2872 }
2873 if (!bs->read_only) {
2874 return -EINVAL;
2875 }
2876 if (drv->bdrv_snapshot_load_tmp) {
2877 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2878 }
2879 return -ENOTSUP;
2880}
2881
e8a6bb9c
MT
2882BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2883 const char *backing_file)
2884{
2885 if (!bs->drv) {
2886 return NULL;
2887 }
2888
2889 if (bs->backing_hd) {
2890 if (strcmp(bs->backing_file, backing_file) == 0) {
2891 return bs->backing_hd;
2892 } else {
2893 return bdrv_find_backing_image(bs->backing_hd, backing_file);
2894 }
2895 }
2896
2897 return NULL;
2898}
2899
#define NB_SUFFIXES 4

/* Format 'size' into 'buf' using K/M/G/T suffixes; returns 'buf'. */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t base = 1024;
    int i;

    if (size <= 999) {
        /* Small values are printed exactly, without a suffix. */
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    for (i = 0; i < NB_SUFFIXES; i++) {
        if (size < (10 * base)) {
            /* One decimal place for values below ten units. */
            snprintf(buf, buf_size, "%0.1f%c",
                     (double)size / base,
                     suffixes[i]);
            break;
        } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
            /* Round to the nearest whole unit. */
            snprintf(buf, buf_size, "%" PRId64 "%c",
                     ((size + (base >> 1)) / base),
                     suffixes[i]);
            break;
        }
        base = base * 1024;
    }
    return buf;
}
2929
2930char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2931{
2932 char buf1[128], date_buf[128], clock_buf[128];
3b9f94e1
FB
2933#ifdef _WIN32
2934 struct tm *ptm;
2935#else
faea38e7 2936 struct tm tm;
3b9f94e1 2937#endif
faea38e7
FB
2938 time_t ti;
2939 int64_t secs;
2940
2941 if (!sn) {
5fafdf24
TS
2942 snprintf(buf, buf_size,
2943 "%-10s%-20s%7s%20s%15s",
faea38e7
FB
2944 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2945 } else {
2946 ti = sn->date_sec;
3b9f94e1
FB
2947#ifdef _WIN32
2948 ptm = localtime(&ti);
2949 strftime(date_buf, sizeof(date_buf),
2950 "%Y-%m-%d %H:%M:%S", ptm);
2951#else
faea38e7
FB
2952 localtime_r(&ti, &tm);
2953 strftime(date_buf, sizeof(date_buf),
2954 "%Y-%m-%d %H:%M:%S", &tm);
3b9f94e1 2955#endif
faea38e7
FB
2956 secs = sn->vm_clock_nsec / 1000000000;
2957 snprintf(clock_buf, sizeof(clock_buf),
2958 "%02d:%02d:%02d.%03d",
2959 (int)(secs / 3600),
2960 (int)((secs / 60) % 60),
5fafdf24 2961 (int)(secs % 60),
faea38e7
FB
2962 (int)((sn->vm_clock_nsec / 1000000) % 1000));
2963 snprintf(buf, buf_size,
5fafdf24 2964 "%-10s%-20s%7s%20s%15s",
faea38e7
FB
2965 sn->id_str, sn->name,
2966 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2967 date_buf,
2968 clock_buf);
2969 }
2970 return buf;
2971}
2972
ea2384d3 2973/**************************************************************/
83f64091 2974/* async I/Os */
ea2384d3 2975
3b69e4b9 2976BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
f141eafe 2977 QEMUIOVector *qiov, int nb_sectors,
3b69e4b9 2978 BlockDriverCompletionFunc *cb, void *opaque)
83f64091 2979{
bbf0a440
SH
2980 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2981
b2a61371 2982 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
8c5873d6 2983 cb, opaque, false);
ea2384d3
FB
2984}
2985
f141eafe
AL
2986BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2987 QEMUIOVector *qiov, int nb_sectors,
2988 BlockDriverCompletionFunc *cb, void *opaque)
ea2384d3 2989{
bbf0a440
SH
2990 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2991
1a6e115b 2992 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
8c5873d6 2993 cb, opaque, true);
83f64091
FB
2994}
2995
40b4f539
KW
2996
2997typedef struct MultiwriteCB {
2998 int error;
2999 int num_requests;
3000 int num_callbacks;
3001 struct {
3002 BlockDriverCompletionFunc *cb;
3003 void *opaque;
3004 QEMUIOVector *free_qiov;
40b4f539
KW
3005 } callbacks[];
3006} MultiwriteCB;
3007
3008static void multiwrite_user_cb(MultiwriteCB *mcb)
3009{
3010 int i;
3011
3012 for (i = 0; i < mcb->num_callbacks; i++) {
3013 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1e1ea48d
SH
3014 if (mcb->callbacks[i].free_qiov) {
3015 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
3016 }
7267c094 3017 g_free(mcb->callbacks[i].free_qiov);
40b4f539
KW
3018 }
3019}
3020
3021static void multiwrite_cb(void *opaque, int ret)
3022{
3023 MultiwriteCB *mcb = opaque;
3024
6d519a5f
SH
3025 trace_multiwrite_cb(mcb, ret);
3026
cb6d3ca0 3027 if (ret < 0 && !mcb->error) {
40b4f539 3028 mcb->error = ret;
40b4f539
KW
3029 }
3030
3031 mcb->num_requests--;
3032 if (mcb->num_requests == 0) {
de189a1b 3033 multiwrite_user_cb(mcb);
7267c094 3034 g_free(mcb);
40b4f539
KW
3035 }
3036}
3037
3038static int multiwrite_req_compare(const void *a, const void *b)
3039{
77be4366
CH
3040 const BlockRequest *req1 = a, *req2 = b;
3041
3042 /*
3043 * Note that we can't simply subtract req2->sector from req1->sector
3044 * here as that could overflow the return value.
3045 */
3046 if (req1->sector > req2->sector) {
3047 return 1;
3048 } else if (req1->sector < req2->sector) {
3049 return -1;
3050 } else {
3051 return 0;
3052 }
40b4f539
KW
3053}
3054
3055/*
3056 * Takes a bunch of requests and tries to merge them. Returns the number of
3057 * requests that remain after merging.
3058 */
3059static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3060 int num_reqs, MultiwriteCB *mcb)
3061{
3062 int i, outidx;
3063
3064 // Sort requests by start sector
3065 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3066
3067 // Check if adjacent requests touch the same clusters. If so, combine them,
3068 // filling up gaps with zero sectors.
3069 outidx = 0;
3070 for (i = 1; i < num_reqs; i++) {
3071 int merge = 0;
3072 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3073
b6a127a1 3074 // Handle exactly sequential writes and overlapping writes.
40b4f539
KW
3075 if (reqs[i].sector <= oldreq_last) {
3076 merge = 1;
3077 }
3078
e2a305fb
CH
3079 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3080 merge = 0;
3081 }
3082
40b4f539
KW
3083 if (merge) {
3084 size_t size;
7267c094 3085 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
40b4f539
KW
3086 qemu_iovec_init(qiov,
3087 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3088
3089 // Add the first request to the merged one. If the requests are
3090 // overlapping, drop the last sectors of the first request.
3091 size = (reqs[i].sector - reqs[outidx].sector) << 9;
3092 qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
3093
b6a127a1
PB
3094 // We should need to add any zeros between the two requests
3095 assert (reqs[i].sector <= oldreq_last);
40b4f539
KW
3096
3097 // Add the second request
3098 qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
3099
cbf1dff2 3100 reqs[outidx].nb_sectors = qiov->size >> 9;
40b4f539
KW
3101 reqs[outidx].qiov = qiov;
3102
3103 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3104 } else {
3105 outidx++;
3106 reqs[outidx].sector = reqs[i].sector;
3107 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3108 reqs[outidx].qiov = reqs[i].qiov;
3109 }
3110 }
3111
3112 return outidx + 1;
3113}
3114
3115/*
3116 * Submit multiple AIO write requests at once.
3117 *
3118 * On success, the function returns 0 and all requests in the reqs array have
3119 * been submitted. In error case this function returns -1, and any of the
3120 * requests may or may not be submitted yet. In particular, this means that the
3121 * callback will be called for some of the requests, for others it won't. The
3122 * caller must check the error field of the BlockRequest to wait for the right
3123 * callbacks (if error != 0, no callback will be called).
3124 *
3125 * The implementation may modify the contents of the reqs array, e.g. to merge
3126 * requests. However, the fields opaque and error are left unmodified as they
3127 * are used to signal failure for a single request to the caller.
3128 */
3129int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3130{
40b4f539
KW
3131 MultiwriteCB *mcb;
3132 int i;
3133
301db7c2
RH
3134 /* don't submit writes if we don't have a medium */
3135 if (bs->drv == NULL) {
3136 for (i = 0; i < num_reqs; i++) {
3137 reqs[i].error = -ENOMEDIUM;
3138 }
3139 return -1;
3140 }
3141
40b4f539
KW
3142 if (num_reqs == 0) {
3143 return 0;
3144 }
3145
3146 // Create MultiwriteCB structure
7267c094 3147 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
40b4f539
KW
3148 mcb->num_requests = 0;
3149 mcb->num_callbacks = num_reqs;
3150
3151 for (i = 0; i < num_reqs; i++) {
3152 mcb->callbacks[i].cb = reqs[i].cb;
3153 mcb->callbacks[i].opaque = reqs[i].opaque;
3154 }
3155
3156 // Check for mergable requests
3157 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3158
6d519a5f
SH
3159 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3160
df9309fb
PB
3161 /* Run the aio requests. */
3162 mcb->num_requests = num_reqs;
40b4f539 3163 for (i = 0; i < num_reqs; i++) {
ad54ae80 3164 bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
40b4f539 3165 reqs[i].nb_sectors, multiwrite_cb, mcb);
40b4f539
KW
3166 }
3167
3168 return 0;
40b4f539
KW
3169}
3170
83f64091 3171void bdrv_aio_cancel(BlockDriverAIOCB *acb)
83f64091 3172{
6bbff9a0 3173 acb->pool->cancel(acb);
83f64091
FB
3174}
3175
/* block I/O throttling */

/*
 * Decide whether dispatching @nb_sectors of I/O now would exceed the
 * configured bytes-per-second limit.
 *
 * Returns true if the request must be throttled; in that case *wait (if
 * non-NULL) receives the estimated delay before dispatch is allowed.
 * Returns false (and *wait = 0) when no bps limit applies or there is
 * still budget left in the current slice.
 */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
    bool is_write, double elapsed_time, uint64_t *wait)
{
    uint64_t bps_limit = 0;
    double   bytes_limit, bytes_base, bytes_res;
    double   slice_time, wait_time;

    /* A total limit takes precedence over per-direction limits. */
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.bps[is_write]) {
        bps_limit = bs->io_limits.bps[is_write];
    } else {
        /* No bps throttling configured for this direction. */
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Budget for the current accounting slice, in bytes. */
    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    bytes_limit = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
    bytes_limit = bps_limit * slice_time;
    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        /* Total limit: both directions count against the same budget. */
        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
    }

    /* bytes_base: the bytes of data which have been read/written; and
     * it is obtained from the history statistic info.
     * bytes_res: the remaining bytes of data which need to be read/written.
     * (bytes_base + bytes_res) / bps_limit: used to calcuate
     * the total time for completing reading/writting all data.
     */
    bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

    if (bytes_base + bytes_res <= bytes_limit) {
        /* Still within budget: dispatch immediately. */
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;

    /* When the I/O rate at runtime exceeds the limits,
     * bs->slice_end need to be extended in order that the current statistic
     * info can be kept until the timer fire, so it is increased and tuned
     * based on the result of experiment.
     * NOTE(review): BLOCK_IO_SLICE_TIME * 10 appears to be used here as a
     * seconds-to-nanoseconds conversion factor — confirm it equals
     * NANOSECONDS_PER_SECOND.
     */
    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
3236
/*
 * Decide whether dispatching one more I/O operation now would exceed the
 * configured operations-per-second limit.
 *
 * Returns true if the request must be throttled; *wait (if non-NULL)
 * receives the estimated delay. Returns false (and *wait = 0) when no
 * iops limit applies or the slice still has budget.
 */
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
    double elapsed_time, uint64_t *wait)
{
    uint64_t iops_limit = 0;
    double   ios_limit, ios_base;
    double   slice_time, wait_time;

    /* A total limit takes precedence over per-direction limits. */
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.iops[is_write]) {
        iops_limit = bs->io_limits.iops[is_write];
    } else {
        /* No iops throttling configured for this direction. */
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Budget for the current accounting slice, in operations. */
    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    ios_limit  = iops_limit * slice_time;
    ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        /* Total limit: both directions count against the same budget. */
        ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
    }

    if (ios_base + 1 <= ios_limit) {
        /* This request still fits into the slice. */
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (ios_base + 1) / iops_limit;
    if (wait_time > elapsed_time) {
        wait_time = wait_time - elapsed_time;
    } else {
        wait_time = 0;
    }

    /* Extend the slice so the statistics survive until the timer fires;
     * see the matching comment in bdrv_exceed_bps_limits().
     * NOTE(review): BLOCK_IO_SLICE_TIME * 10 appears to act as a
     * seconds-to-nanoseconds factor — confirm. */
    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
3288
/*
 * Combined throttling check: returns true if either the bps or iops limit
 * would be exceeded by this request, storing the larger of the two
 * estimated delays in *wait (nanosecond-scale units; see the helpers).
 * Starts a fresh accounting slice when the current one has expired.
 */
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
    bool is_write, int64_t *wait)
{
    int64_t  now, max_wait;
    uint64_t bps_wait = 0, iops_wait = 0;
    double   elapsed_time;
    int      bps_ret, iops_ret;

    now = qemu_get_clock_ns(vm_clock);
    if ((bs->slice_start < now)
        && (bs->slice_end > now)) {
        /* Still inside the current slice: just push its end out. */
        bs->slice_end = now + bs->slice_time;
    } else {
        /* Slice expired (or clock outside it): start a new one and
         * snapshot the byte/op counters as the new baseline. */
        bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
        bs->slice_start = now;
        bs->slice_end   = now + bs->slice_time;

        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];

        bs->io_base.ios[is_write]  = bs->nr_ops[is_write];
        bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
    }

    /* Time elapsed in the current slice, in seconds. */
    elapsed_time  = now - bs->slice_start;
    elapsed_time  /= (NANOSECONDS_PER_SECOND);

    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
                                      is_write, elapsed_time, &bps_wait);
    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
                                       elapsed_time, &iops_wait);
    if (bps_ret || iops_ret) {
        /* Throttled: report the longer of the two waits. */
        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
        if (wait) {
            *wait = max_wait;
        }

        /* Keep the slice alive at least until the wait has passed. */
        now = qemu_get_clock_ns(vm_clock);
        if (bs->slice_end < now + max_wait) {
            bs->slice_end = now + max_wait;
        }

        return true;
    }

    if (wait) {
        *wait = 0;
    }

    return false;
}
ce1a14dc 3340
83f64091
FB
/**************************************************************/
/* async block device emulation */

/*
 * AIOCB for emulating async I/O on top of a driver's synchronous
 * bdrv_read/bdrv_write: the request runs synchronously into a bounce
 * buffer and completion is deferred to a bottom half.
 */
typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;          /* completion bottom half */
    int ret;             /* result of the synchronous read/write */
    /* vector translation state */
    QEMUIOVector *qiov;  /* caller's vector (filled on read completion) */
    uint8_t *bounce;     /* linear bounce buffer for the whole transfer */
    int is_write;
} BlockDriverAIOCBSync;

/* Cancel an emulated request: drop the pending BH so the callback
 * never fires, then release the AIOCB. */
static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static AIOPool bdrv_em_aio_pool = {
    .aiocb_size = sizeof(BlockDriverAIOCBSync),
    .cancel     = bdrv_aio_cancel_em,
};
3367
ce1a14dc 3368static void bdrv_aio_bh_cb(void *opaque)
83f64091 3369{
ce1a14dc 3370 BlockDriverAIOCBSync *acb = opaque;
f141eafe 3371
f141eafe
AL
3372 if (!acb->is_write)
3373 qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
ceb42de8 3374 qemu_vfree(acb->bounce);
ce1a14dc 3375 acb->common.cb(acb->common.opaque, acb->ret);
6a7ad299 3376 qemu_bh_delete(acb->bh);
36afc451 3377 acb->bh = NULL;
ce1a14dc 3378 qemu_aio_release(acb);
83f64091 3379}
beac80cd 3380
f141eafe
AL
3381static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3382 int64_t sector_num,
3383 QEMUIOVector *qiov,
3384 int nb_sectors,
3385 BlockDriverCompletionFunc *cb,
3386 void *opaque,
3387 int is_write)
3388
83f64091 3389{
ce1a14dc 3390 BlockDriverAIOCBSync *acb;
ce1a14dc 3391
c16b5a2c 3392 acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
f141eafe
AL
3393 acb->is_write = is_write;
3394 acb->qiov = qiov;
e268ca52 3395 acb->bounce = qemu_blockalign(bs, qiov->size);
3f3aace8 3396 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
f141eafe
AL
3397
3398 if (is_write) {
3399 qemu_iovec_to_buffer(acb->qiov, acb->bounce);
1ed20acf 3400 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
f141eafe 3401 } else {
1ed20acf 3402 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
f141eafe
AL
3403 }
3404
ce1a14dc 3405 qemu_bh_schedule(acb->bh);
f141eafe 3406
ce1a14dc 3407 return &acb->common;
beac80cd
FB
3408}
3409
/* Emulated AIO read: thin wrapper around bdrv_aio_rw_vector(). */
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

/* Emulated AIO write: thin wrapper around bdrv_aio_rw_vector(). */
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}
beac80cd 3423
/*
 * AIOCB for emulating the AIO interface on top of the coroutine-based
 * request path (bdrv_co_do_readv/bdrv_co_do_writev).
 */
typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;   /* sector range, qiov and error result */
    bool is_write;
    QEMUBH* bh;         /* completion bottom half */
} BlockDriverAIOCBCoroutine;

/* Cancel by draining all outstanding AIO: the coroutine has no direct
 * cancellation hook, so wait until every request has completed. */
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    qemu_aio_flush();
}

static AIOPool bdrv_em_co_aio_pool = {
    .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
    .cancel     = bdrv_aio_co_cancel_em,
};
3441
/* Bottom half that delivers the completion callback for a
 * coroutine-emulated AIO request and releases the AIOCB. */
static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);
    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}
3450
b2a61371
SH
3451/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3452static void coroutine_fn bdrv_co_do_rw(void *opaque)
3453{
3454 BlockDriverAIOCBCoroutine *acb = opaque;
3455 BlockDriverState *bs = acb->common.bs;
3456
3457 if (!acb->is_write) {
3458 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
470c0504 3459 acb->req.nb_sectors, acb->req.qiov, 0);
b2a61371
SH
3460 } else {
3461 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
f08f2dda 3462 acb->req.nb_sectors, acb->req.qiov, 0);
b2a61371
SH
3463 }
3464
35246a68 3465 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
b2a61371
SH
3466 qemu_bh_schedule(acb->bh);
3467}
3468
68485420
KW
3469static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3470 int64_t sector_num,
3471 QEMUIOVector *qiov,
3472 int nb_sectors,
3473 BlockDriverCompletionFunc *cb,
3474 void *opaque,
8c5873d6 3475 bool is_write)
68485420
KW
3476{
3477 Coroutine *co;
3478 BlockDriverAIOCBCoroutine *acb;
3479
3480 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3481 acb->req.sector = sector_num;
3482 acb->req.nb_sectors = nb_sectors;
3483 acb->req.qiov = qiov;
3484 acb->is_write = is_write;
3485
8c5873d6 3486 co = qemu_coroutine_create(bdrv_co_do_rw);
68485420
KW
3487 qemu_coroutine_enter(co, acb);
3488
3489 return &acb->common;
3490}
3491
/* Coroutine entry point for bdrv_aio_flush(): perform the flush and
 * schedule the completion bottom half. */
static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
3501
07f07615 3502BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
016f5cf6
AG
3503 BlockDriverCompletionFunc *cb, void *opaque)
3504{
07f07615 3505 trace_bdrv_aio_flush(bs, opaque);
016f5cf6 3506
07f07615
PB
3507 Coroutine *co;
3508 BlockDriverAIOCBCoroutine *acb;
016f5cf6 3509
07f07615
PB
3510 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3511 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3512 qemu_coroutine_enter(co, acb);
016f5cf6 3513
016f5cf6
AG
3514 return &acb->common;
3515}
3516
/* Coroutine entry point for bdrv_aio_discard(): perform the discard and
 * schedule the completion bottom half. */
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
3526
3527BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3528 int64_t sector_num, int nb_sectors,
3529 BlockDriverCompletionFunc *cb, void *opaque)
3530{
3531 Coroutine *co;
3532 BlockDriverAIOCBCoroutine *acb;
3533
3534 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3535
3536 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3537 acb->req.sector = sector_num;
3538 acb->req.nb_sectors = nb_sectors;
3539 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3540 qemu_coroutine_enter(co, acb);
3541
3542 return &acb->common;
3543}
3544
/* Register all built-in block drivers. */
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

/* Like bdrv_init(), but restrict format probing/opening to the
 * whitelisted drivers. */
void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}
3555
c16b5a2c
CH
3556void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3557 BlockDriverCompletionFunc *cb, void *opaque)
ce1a14dc 3558{
ce1a14dc
PB
3559 BlockDriverAIOCB *acb;
3560
6bbff9a0
AL
3561 if (pool->free_aiocb) {
3562 acb = pool->free_aiocb;
3563 pool->free_aiocb = acb->next;
ce1a14dc 3564 } else {
7267c094 3565 acb = g_malloc0(pool->aiocb_size);
6bbff9a0 3566 acb->pool = pool;
ce1a14dc
PB
3567 }
3568 acb->bs = bs;
3569 acb->cb = cb;
3570 acb->opaque = opaque;
3571 return acb;
3572}
3573
3574void qemu_aio_release(void *p)
3575{
6bbff9a0
AL
3576 BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3577 AIOPool *pool = acb->pool;
3578 acb->next = pool->free_aiocb;
3579 pool->free_aiocb = acb;
ce1a14dc 3580}
19cb3738 3581
/**************************************************************/
/* Coroutine block device emulation */

/* Bridges an AIO completion back into a yielded coroutine. */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;  /* coroutine to re-enter on completion */
    int ret;               /* request result */
} CoroutineIOCompletion;

/* AIO completion callback: store the result and resume the waiting
 * coroutine. */
static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}
3597
3598static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3599 int nb_sectors, QEMUIOVector *iov,
3600 bool is_write)
3601{
3602 CoroutineIOCompletion co = {
3603 .coroutine = qemu_coroutine_self(),
3604 };
3605 BlockDriverAIOCB *acb;
3606
3607 if (is_write) {
a652d160
SH
3608 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3609 bdrv_co_io_em_complete, &co);
f9f05dc5 3610 } else {
a652d160
SH
3611 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3612 bdrv_co_io_em_complete, &co);
f9f05dc5
KW
3613 }
3614
59370aaa 3615 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
f9f05dc5
KW
3616 if (!acb) {
3617 return -EIO;
3618 }
3619 qemu_coroutine_yield();
3620
3621 return co.ret;
3622}
3623
/* Coroutine read emulated via the driver's AIO interface. */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

/* Coroutine write emulated via the driver's AIO interface. */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}
3637
/* Coroutine entry point used by the synchronous bdrv_flush() wrapper. */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}
3644
/*
 * Flush @bs to stable storage from coroutine context.
 *
 * Order of operations:
 *  1. flush cached data to the OS (always, even with cache=unsafe);
 *  2. unless BDRV_O_NO_FLUSH, force it to disk via the best available
 *     driver hook (coroutine flush, AIO flush, or nothing);
 *  3. recurse into the underlying protocol (bs->file).
 *
 * Returns 0 on success (including no-medium/read-only, which need no
 * flush) or a negative errno.
 */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    /* Nothing to do without a writable, inserted medium. */
    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        return 0;
    }

    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        /* Fall back to the AIO flush hook: submit and yield until the
         * completion callback re-enters this coroutine. */
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
    return bdrv_co_flush(bs->file);
}
3704
/* Drop any metadata/data the driver has cached for @bs (e.g. after
 * incoming migration made the image change under us). */
void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

/* Invalidate the caches of every open block device. */
void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}

/* Clear the BDRV_O_INCOMING flag on all devices once incoming
 * migration has finished. */
void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
    }
}
3729
07f07615
PB
3730int bdrv_flush(BlockDriverState *bs)
3731{
3732 Coroutine *co;
3733 RwCo rwco = {
3734 .bs = bs,
3735 .ret = NOT_DONE,
e7a8a783 3736 };
e7a8a783 3737
07f07615
PB
3738 if (qemu_in_coroutine()) {
3739 /* Fast-path if already in coroutine context */
3740 bdrv_flush_co_entry(&rwco);
3741 } else {
3742 co = qemu_coroutine_create(bdrv_flush_co_entry);
3743 qemu_coroutine_enter(co, &rwco);
3744 while (rwco.ret == NOT_DONE) {
3745 qemu_aio_wait();
3746 }
e7a8a783 3747 }
07f07615
PB
3748
3749 return rwco.ret;
e7a8a783
KW
3750}
3751
/* Coroutine entry point used by the synchronous bdrv_discard() wrapper. */
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}
3758
3759int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3760 int nb_sectors)
3761{
3762 if (!bs->drv) {
3763 return -ENOMEDIUM;
3764 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3765 return -EIO;
3766 } else if (bs->read_only) {
3767 return -EROFS;
3768 } else if (bs->drv->bdrv_co_discard) {
3769 return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3770 } else if (bs->drv->bdrv_aio_discard) {
3771 BlockDriverAIOCB *acb;
3772 CoroutineIOCompletion co = {
3773 .coroutine = qemu_coroutine_self(),
3774 };
3775
3776 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3777 bdrv_co_io_em_complete, &co);
3778 if (acb == NULL) {
3779 return -EIO;
3780 } else {
3781 qemu_coroutine_yield();
3782 return co.ret;
3783 }
4265d620
PB
3784 } else {
3785 return 0;
3786 }
3787}
3788
3789int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3790{
3791 Coroutine *co;
3792 RwCo rwco = {
3793 .bs = bs,
3794 .sector_num = sector_num,
3795 .nb_sectors = nb_sectors,
3796 .ret = NOT_DONE,
3797 };
3798
3799 if (qemu_in_coroutine()) {
3800 /* Fast-path if already in coroutine context */
3801 bdrv_discard_co_entry(&rwco);
3802 } else {
3803 co = qemu_coroutine_create(bdrv_discard_co_entry);
3804 qemu_coroutine_enter(co, &rwco);
3805 while (rwco.ret == NOT_DONE) {
3806 qemu_aio_wait();
3807 }
3808 }
3809
3810 return rwco.ret;
3811}
3812
19cb3738
FB
3813/**************************************************************/
3814/* removable device support */
3815
3816/**
3817 * Return TRUE if the media is present
3818 */
3819int bdrv_is_inserted(BlockDriverState *bs)
3820{
3821 BlockDriver *drv = bs->drv;
a1aff5bf 3822
19cb3738
FB
3823 if (!drv)
3824 return 0;
3825 if (!drv->bdrv_is_inserted)
a1aff5bf
MA
3826 return 1;
3827 return drv->bdrv_is_inserted(bs);
19cb3738
FB
3828}
3829
/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know. Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}
3843
/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    /* Emit a QMP event only for named (guest-visible) devices. */
    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}
3859
/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}
985a03b0
TS
3874
3875/* needed for generic scsi interface */
3876
3877int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3878{
3879 BlockDriver *drv = bs->drv;
3880
3881 if (drv && drv->bdrv_ioctl)
3882 return drv->bdrv_ioctl(bs, req, buf);
3883 return -ENOTSUP;
3884}
7d780669 3885
221f715d
AL
3886BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3887 unsigned long int req, void *buf,
3888 BlockDriverCompletionFunc *cb, void *opaque)
7d780669 3889{
221f715d 3890 BlockDriver *drv = bs->drv;
7d780669 3891
221f715d
AL
3892 if (drv && drv->bdrv_aio_ioctl)
3893 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3894 return NULL;
7d780669 3895}
e268ca52 3896
/* Record the memory alignment required for buffers used with @bs. */
void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}

/* Allocate a buffer honouring @bs's alignment requirement (512 bytes
 * when none is set or @bs is NULL). */
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
}
/* Enable or disable dirty-sector tracking for @bs. Enabling allocates a
 * zeroed bitmap with one bit per BDRV_SECTORS_PER_DIRTY_CHUNK sectors
 * (rounded up); disabling frees it. The dirty count is reset either way. */
void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
{
    int64_t bitmap_size;

    bs->dirty_count = 0;
    if (enable) {
        if (!bs->dirty_bitmap) {
            /* Round up so every sector of the image is covered. */
            bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
                    BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1;
            bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;

            bs->dirty_bitmap = g_new0(unsigned long, bitmap_size);
        }
    } else {
        if (bs->dirty_bitmap) {
            g_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
    }
}
/* Return 1 if the chunk containing @sector is marked dirty, else 0.
 * Returns 0 when tracking is disabled or @sector lies past the end. */
int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (bs->dirty_bitmap &&
        (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
        /* Test the chunk's bit within its unsigned long word. */
        return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}
3940
/* Clear the dirty marking for @nr_sectors starting at @cur_sector. */
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
}

/* Number of chunks currently marked dirty. */
int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    return bs->dirty_count;
}
f88e1a42 3951
/* Mark @bs as claimed (or released) by a background user such as a block
 * job; asserts against redundant transitions. */
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

/* Return non-zero while @bs is claimed via bdrv_set_in_use(). */
int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}
3962
/* Enable I/O status reporting for @bs and reset it to OK. */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
            bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
            bs->on_read_error == BLOCK_ERR_STOP_ANY));
}

/* Disable I/O status reporting for @bs. */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

/* Reset the status back to OK (only meaningful while enabled). */
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    }
}

/* XXX: Today this is set by device models because it makes the implementation
   quite simple. However, the block layer knows about the error, so it's
   possible to implement this without device models being involved */
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    /* Only the first error is recorded; later ones are ignored. */
    if (bdrv_iostatus_is_enabled(bs) &&
        bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        assert(error >= 0);
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}
4003
/* Begin accounting an I/O of @bytes of the given @type: record the byte
 * count and a start timestamp in @cookie. */
void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

/* Finish accounting the I/O described by @cookie: accumulate bytes,
 * operation count and elapsed time into @bs's statistics. */
void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}
4024
f88e1a42
JS
4025int bdrv_img_create(const char *filename, const char *fmt,
4026 const char *base_filename, const char *base_fmt,
4027 char *options, uint64_t img_size, int flags)
4028{
4029 QEMUOptionParameter *param = NULL, *create_options = NULL;
d220894e 4030 QEMUOptionParameter *backing_fmt, *backing_file, *size;
f88e1a42
JS
4031 BlockDriverState *bs = NULL;
4032 BlockDriver *drv, *proto_drv;
96df67d1 4033 BlockDriver *backing_drv = NULL;
f88e1a42
JS
4034 int ret = 0;
4035
4036 /* Find driver and parse its options */
4037 drv = bdrv_find_format(fmt);
4038 if (!drv) {
4039 error_report("Unknown file format '%s'", fmt);
4f70f249 4040 ret = -EINVAL;
f88e1a42
JS
4041 goto out;
4042 }
4043
4044 proto_drv = bdrv_find_protocol(filename);
4045 if (!proto_drv) {
4046 error_report("Unknown protocol '%s'", filename);
4f70f249 4047 ret = -EINVAL;
f88e1a42
JS
4048 goto out;
4049 }
4050
4051 create_options = append_option_parameters(create_options,
4052 drv->create_options);
4053 create_options = append_option_parameters(create_options,
4054 proto_drv->create_options);
4055
4056 /* Create parameter list with default values */
4057 param = parse_option_parameters("", create_options, param);
4058
4059 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4060
4061 /* Parse -o options */
4062 if (options) {
4063 param = parse_option_parameters(options, create_options, param);
4064 if (param == NULL) {
4065 error_report("Invalid options for file format '%s'.", fmt);
4f70f249 4066 ret = -EINVAL;
f88e1a42
JS
4067 goto out;
4068 }
4069 }
4070
4071 if (base_filename) {
4072 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4073 base_filename)) {
4074 error_report("Backing file not supported for file format '%s'",
4075 fmt);
4f70f249 4076 ret = -EINVAL;
f88e1a42
JS
4077 goto out;
4078 }
4079 }
4080
4081 if (base_fmt) {
4082 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4083 error_report("Backing file format not supported for file "
4084 "format '%s'", fmt);
4f70f249 4085 ret = -EINVAL;
f88e1a42
JS
4086 goto out;
4087 }
4088 }
4089
792da93a
JS
4090 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4091 if (backing_file && backing_file->value.s) {
4092 if (!strcmp(filename, backing_file->value.s)) {
4093 error_report("Error: Trying to create an image with the "
4094 "same filename as the backing file");
4f70f249 4095 ret = -EINVAL;
792da93a
JS
4096 goto out;
4097 }
4098 }
4099
f88e1a42
JS
4100 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4101 if (backing_fmt && backing_fmt->value.s) {
96df67d1
SH
4102 backing_drv = bdrv_find_format(backing_fmt->value.s);
4103 if (!backing_drv) {
f88e1a42
JS
4104 error_report("Unknown backing file format '%s'",
4105 backing_fmt->value.s);
4f70f249 4106 ret = -EINVAL;
f88e1a42
JS
4107 goto out;
4108 }
4109 }
4110
4111 // The size for the image must always be specified, with one exception:
4112 // If we are using a backing file, we can obtain the size from there
d220894e
KW
4113 size = get_option_parameter(param, BLOCK_OPT_SIZE);
4114 if (size && size->value.n == -1) {
f88e1a42
JS
4115 if (backing_file && backing_file->value.s) {
4116 uint64_t size;
f88e1a42 4117 char buf[32];
63090dac
PB
4118 int back_flags;
4119
4120 /* backing files always opened read-only */
4121 back_flags =
4122 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
f88e1a42 4123
f88e1a42
JS
4124 bs = bdrv_new("");
4125
63090dac 4126 ret = bdrv_open(bs, backing_file->value.s, back_flags, backing_drv);
f88e1a42 4127 if (ret < 0) {
96df67d1 4128 error_report("Could not open '%s'", backing_file->value.s);
f88e1a42
JS
4129 goto out;
4130 }
4131 bdrv_get_geometry(bs, &size);
4132 size *= 512;
4133
4134 snprintf(buf, sizeof(buf), "%" PRId64, size);
4135 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4136 } else {
4137 error_report("Image creation needs a size parameter");
4f70f249 4138 ret = -EINVAL;
f88e1a42
JS
4139 goto out;
4140 }
4141 }
4142
4143 printf("Formatting '%s', fmt=%s ", filename, fmt);
4144 print_option_parameters(param);
4145 puts("");
4146
4147 ret = bdrv_create(drv, filename, param);
4148
4149 if (ret < 0) {
4150 if (ret == -ENOTSUP) {
4151 error_report("Formatting or formatting option not supported for "
4152 "file format '%s'", fmt);
4153 } else if (ret == -EFBIG) {
4154 error_report("The image size is too large for file format '%s'",
4155 fmt);
4156 } else {
4157 error_report("%s: error while creating %s: %s", filename, fmt,
4158 strerror(-ret));
4159 }
4160 }
4161
4162out:
4163 free_option_parameters(create_options);
4164 free_option_parameters(param);
4165
4166 if (bs) {
4167 bdrv_delete(bs);
4168 }
4f70f249
JS
4169
4170 return ret;
f88e1a42 4171}
/*
 * Allocate and start a block job of @job_type on @bs.
 *
 * Fails with QERR_DEVICE_IN_USE if the device already has a job or is
 * otherwise claimed. On success the device is marked in-use and the job
 * is attached to bs->job. A nonzero @speed is applied immediately; if
 * the job type rejects it, everything is rolled back and NULL returned
 * with the error propagated to @errp.
 */
void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
                       int64_t speed, BlockDriverCompletionFunc *cb,
                       void *opaque, Error **errp)
{
    BlockJob *job;

    if (bs->job || bdrv_in_use(bs)) {
        error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
        return NULL;
    }
    bdrv_set_in_use(bs, 1);

    job = g_malloc0(job_type->instance_size);
    job->job_type = job_type;
    job->bs = bs;
    job->cb = cb;
    job->opaque = opaque;
    bs->job = job;

    /* Only set speed when necessary to avoid NotSupported error */
    if (speed != 0) {
        Error *local_err = NULL;

        block_job_set_speed(job, speed, &local_err);
        if (error_is_set(&local_err)) {
            /* Roll back: detach the job and release the device. */
            bs->job = NULL;
            g_free(job);
            bdrv_set_in_use(bs, 0);
            error_propagate(errp, local_err);
            return NULL;
        }
    }
    return job;
}
4207
/* Finish a block job: deliver @ret to its completion callback, detach it
 * from the device, free it and release the device's in-use claim. */
void block_job_complete(BlockJob *job, int ret)
{
    BlockDriverState *bs = job->bs;

    assert(bs->job == job);
    job->cb(job->opaque, ret);
    bs->job = NULL;
    g_free(job);
    bdrv_set_in_use(bs, 0);
}
4218
/* Set the job's rate limit to @speed. Reports QERR_NOT_SUPPORTED when
 * the job type has no set_speed hook; propagates the hook's error and
 * leaves job->speed untouched on failure. */
void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    Error *local_err = NULL;

    if (!job->job_type->set_speed) {
        error_set(errp, QERR_NOT_SUPPORTED);
        return;
    }
    job->job_type->set_speed(job, speed, &local_err);
    if (error_is_set(&local_err)) {
        error_propagate(errp, local_err);
        return;
    }

    /* Record the new speed only after the job type accepted it. */
    job->speed = speed;
}
4235
/* Request cancellation; the job observes the flag cooperatively. */
void block_job_cancel(BlockJob *job)
{
    job->cancelled = true;
}

/* Return whether cancellation has been requested for @job. */
bool block_job_is_cancelled(BlockJob *job)
{
    return job->cancelled;
}

/* Request cancellation and block until the job is no longer busy. */
void block_job_cancel_sync(BlockJob *job)
{
    BlockDriverState *bs = job->bs;

    assert(bs->job == job);
    block_job_cancel(job);
    while (bs->job != NULL && bs->job->busy) {
        qemu_aio_wait();
    }
}