]> git.proxmox.com Git - qemu.git/blame - block.c
block: simplify path_is_absolute
[qemu.git] / block.c
CommitLineData
fc01f7e7
FB
1/*
2 * QEMU System Emulator block driver
5fafdf24 3 *
fc01f7e7 4 * Copyright (c) 2003 Fabrice Bellard
5fafdf24 5 *
fc01f7e7
FB
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
3990d09a 24#include "config-host.h"
faf07963 25#include "qemu-common.h"
6d519a5f 26#include "trace.h"
376253ec 27#include "monitor.h"
ea2384d3 28#include "block_int.h"
5efa9d5a 29#include "module.h"
f795e743 30#include "qjson.h"
68485420 31#include "qemu-coroutine.h"
b2023818 32#include "qmp-commands.h"
0563e191 33#include "qemu-timer.h"
fc01f7e7 34
71e72a19 35#ifdef CONFIG_BSD
7674e7bf
FB
36#include <sys/types.h>
37#include <sys/stat.h>
38#include <sys/ioctl.h>
72cf2d4f 39#include <sys/queue.h>
c5e97233 40#ifndef __DragonFly__
7674e7bf
FB
41#include <sys/disk.h>
42#endif
c5e97233 43#endif
7674e7bf 44
49dc768d
AL
45#ifdef _WIN32
46#include <windows.h>
47#endif
48
1c9805a3
SH
49#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
50
470c0504
SH
51typedef enum {
52 BDRV_REQ_COPY_ON_READ = 0x1,
f08f2dda 53 BDRV_REQ_ZERO_WRITE = 0x2,
470c0504
SH
54} BdrvRequestFlags;
55
7d4b4ba5 56static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
f141eafe
AL
57static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
58 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
c87c0672 59 BlockDriverCompletionFunc *cb, void *opaque);
f141eafe
AL
60static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
61 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 62 BlockDriverCompletionFunc *cb, void *opaque);
f9f05dc5
KW
63static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
64 int64_t sector_num, int nb_sectors,
65 QEMUIOVector *iov);
66static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
67 int64_t sector_num, int nb_sectors,
68 QEMUIOVector *iov);
c5fbe571 69static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
470c0504
SH
70 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
71 BdrvRequestFlags flags);
1c9805a3 72static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
f08f2dda
SH
73 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74 BdrvRequestFlags flags);
b2a61371
SH
75static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
76 int64_t sector_num,
77 QEMUIOVector *qiov,
78 int nb_sectors,
79 BlockDriverCompletionFunc *cb,
80 void *opaque,
8c5873d6 81 bool is_write);
b2a61371 82static void coroutine_fn bdrv_co_do_rw(void *opaque);
621f0589
KW
83static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
84 int64_t sector_num, int nb_sectors);
ec530c81 85
98f90dba
ZYW
86static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
87 bool is_write, double elapsed_time, uint64_t *wait);
88static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
89 double elapsed_time, uint64_t *wait);
90static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
91 bool is_write, int64_t *wait);
92
1b7bdbc1
SH
93static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
94 QTAILQ_HEAD_INITIALIZER(bdrv_states);
7ee930d0 95
8a22f02a
SH
96static QLIST_HEAD(, BlockDriver) bdrv_drivers =
97 QLIST_HEAD_INITIALIZER(bdrv_drivers);
ea2384d3 98
f9092b10
MA
99/* The device to use for VM snapshots */
100static BlockDriverState *bs_snapshots;
101
eb852011
MA
102/* If non-zero, use only whitelisted block drivers */
103static int use_bdrv_whitelist;
104
9e0b22f4
SH
105#ifdef _WIN32
/* Return non-zero if @filename begins with an ASCII drive letter
 * immediately followed by ':' (e.g. "c:", "D:\\foo"). */
static int is_windows_drive_prefix(const char *filename)
{
    char c = filename[0];

    /* Check the letter first so filename[1] is never read on "" */
    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
        return filename[1] == ':';
    }
    return 0;
}
112
113int is_windows_drive(const char *filename)
114{
115 if (is_windows_drive_prefix(filename) &&
116 filename[2] == '\0')
117 return 1;
118 if (strstart(filename, "\\\\.\\", NULL) ||
119 strstart(filename, "//./", NULL))
120 return 1;
121 return 0;
122}
123#endif
124
0563e191 125/* throttling disk I/O limits */
98f90dba
ZYW
126void bdrv_io_limits_disable(BlockDriverState *bs)
127{
128 bs->io_limits_enabled = false;
129
130 while (qemu_co_queue_next(&bs->throttled_reqs));
131
132 if (bs->block_timer) {
133 qemu_del_timer(bs->block_timer);
134 qemu_free_timer(bs->block_timer);
135 bs->block_timer = NULL;
136 }
137
138 bs->slice_start = 0;
139 bs->slice_end = 0;
140 bs->slice_time = 0;
141 memset(&bs->io_base, 0, sizeof(bs->io_base));
142}
143
0563e191
ZYW
144static void bdrv_block_timer(void *opaque)
145{
146 BlockDriverState *bs = opaque;
147
148 qemu_co_queue_next(&bs->throttled_reqs);
149}
150
151void bdrv_io_limits_enable(BlockDriverState *bs)
152{
153 qemu_co_queue_init(&bs->throttled_reqs);
154 bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
155 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
156 bs->slice_start = qemu_get_clock_ns(vm_clock);
157 bs->slice_end = bs->slice_start + bs->slice_time;
158 memset(&bs->io_base, 0, sizeof(bs->io_base));
159 bs->io_limits_enabled = true;
160}
161
162bool bdrv_io_limits_enabled(BlockDriverState *bs)
163{
164 BlockIOLimit *io_limits = &bs->io_limits;
165 return io_limits->bps[BLOCK_IO_LIMIT_READ]
166 || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
167 || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
168 || io_limits->iops[BLOCK_IO_LIMIT_READ]
169 || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
170 || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
171}
172
98f90dba
ZYW
/* Block the calling coroutine until this request fits within the I/O
 * limits configured on @bs.  FIFO order among throttled requests is
 * preserved: a new request first queues behind any already-throttled
 * ones, then re-inserts itself at the head while it still exceeds the
 * limits. */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    /* Queue behind already-throttled requests to keep FIFO order. */
    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* In fact, we hope to keep each request's timing, in FIFO mode. The next
     * throttled requests will not be dequeued until the current request is
     * allowed to be serviced. So if the current request still exceeds the
     * limits, it will be inserted to the head. All requests followed it will
     * be still in throttled_reqs queue.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        /* Re-arm the throttle timer for when this request may proceed,
         * then park at the head of the queue until it fires. */
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    /* Let the next throttled request have a chance to run. */
    qemu_co_queue_next(&bs->throttled_reqs);
}
197
9e0b22f4
SH
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    /* Drive specifications ("c:", "\\.\d:") contain ':' but are not
     * protocol prefixes. */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 0;
    }
#endif
    return strchr(path, ':') ? 1 : 0;
}
210
/* Return non-zero if @path is absolute (on Windows this also covers
 * drive letters and device names such as "\\.\d:"). */
int path_is_absolute(const char *path)
{
    char first = path[0];
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    return first == '/' || first == '\\';
#else
    return first == '/';
#endif
}
223
83f64091
FB
/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URL are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *dir_end, *sep;
    int len;

    if (dest_size <= 0) {
        return;
    }

    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* Start past the "<protocol>:" prefix if there is one. */
    sep = strchr(base_path, ':');
    dir_end = sep ? sep + 1 : base_path;

    /* Advance to just past the last directory separator, if any. */
    sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *bslash = strrchr(base_path, '\\');
        if (!sep || bslash > sep) {
            sep = bslash;
        }
    }
#endif
    sep = sep ? sep + 1 : base_path;
    if (sep > dir_end) {
        dir_end = sep;
    }

    /* Copy the directory portion of base_path, then append filename. */
    len = dir_end - base_path;
    if (len > dest_size - 1) {
        len = dest_size - 1;
    }
    memcpy(dest, base_path, len);
    dest[len] = '\0';
    pstrcat(dest, dest_size, filename);
}
267
5efa9d5a 268void bdrv_register(BlockDriver *bdrv)
ea2384d3 269{
8c5873d6
SH
270 /* Block drivers without coroutine functions need emulation */
271 if (!bdrv->bdrv_co_readv) {
f9f05dc5
KW
272 bdrv->bdrv_co_readv = bdrv_co_readv_em;
273 bdrv->bdrv_co_writev = bdrv_co_writev_em;
274
f8c35c1d
SH
275 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
276 * the block driver lacks aio we need to emulate that too.
277 */
f9f05dc5
KW
278 if (!bdrv->bdrv_aio_readv) {
279 /* add AIO emulation layer */
280 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
281 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
f9f05dc5 282 }
83f64091 283 }
b2e12bc6 284
8a22f02a 285 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
ea2384d3 286}
b338082b
FB
287
288/* create a new block device (by default it is empty) */
289BlockDriverState *bdrv_new(const char *device_name)
290{
1b7bdbc1 291 BlockDriverState *bs;
b338082b 292
7267c094 293 bs = g_malloc0(sizeof(BlockDriverState));
b338082b 294 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
ea2384d3 295 if (device_name[0] != '\0') {
1b7bdbc1 296 QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
ea2384d3 297 }
28a7282a 298 bdrv_iostatus_disable(bs);
b338082b
FB
299 return bs;
300}
301
ea2384d3
FB
302BlockDriver *bdrv_find_format(const char *format_name)
303{
304 BlockDriver *drv1;
8a22f02a
SH
305 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
306 if (!strcmp(drv1->format_name, format_name)) {
ea2384d3 307 return drv1;
8a22f02a 308 }
ea2384d3
FB
309 }
310 return NULL;
311}
312
eb852011
MA
313static int bdrv_is_whitelisted(BlockDriver *drv)
314{
315 static const char *whitelist[] = {
316 CONFIG_BDRV_WHITELIST
317 };
318 const char **p;
319
320 if (!whitelist[0])
321 return 1; /* no whitelist, anything goes */
322
323 for (p = whitelist; *p; p++) {
324 if (!strcmp(drv->format_name, *p)) {
325 return 1;
326 }
327 }
328 return 0;
329}
330
331BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
332{
333 BlockDriver *drv = bdrv_find_format(format_name);
334 return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
335}
336
5b7e1542
ZYW
337typedef struct CreateCo {
338 BlockDriver *drv;
339 char *filename;
340 QEMUOptionParameter *options;
341 int ret;
342} CreateCo;
343
344static void coroutine_fn bdrv_create_co_entry(void *opaque)
345{
346 CreateCo *cco = opaque;
347 assert(cco->drv);
348
349 cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
350}
351
0e7e1989
KW
352int bdrv_create(BlockDriver *drv, const char* filename,
353 QEMUOptionParameter *options)
ea2384d3 354{
5b7e1542
ZYW
355 int ret;
356
357 Coroutine *co;
358 CreateCo cco = {
359 .drv = drv,
360 .filename = g_strdup(filename),
361 .options = options,
362 .ret = NOT_DONE,
363 };
364
365 if (!drv->bdrv_create) {
ea2384d3 366 return -ENOTSUP;
5b7e1542
ZYW
367 }
368
369 if (qemu_in_coroutine()) {
370 /* Fast-path if already in coroutine context */
371 bdrv_create_co_entry(&cco);
372 } else {
373 co = qemu_coroutine_create(bdrv_create_co_entry);
374 qemu_coroutine_enter(co, &cco);
375 while (cco.ret == NOT_DONE) {
376 qemu_aio_wait();
377 }
378 }
379
380 ret = cco.ret;
381 g_free(cco.filename);
0e7e1989 382
5b7e1542 383 return ret;
ea2384d3
FB
384}
385
84a12e66
CH
386int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
387{
388 BlockDriver *drv;
389
b50cbabc 390 drv = bdrv_find_protocol(filename);
84a12e66 391 if (drv == NULL) {
16905d71 392 return -ENOENT;
84a12e66
CH
393 }
394
395 return bdrv_create(drv, filename, options);
396}
397
#ifdef _WIN32
/* Fill @filename (of @size bytes) with the name of a freshly created
 * temporary file. */
void get_tmp_filename(char *filename, int size)
{
    char temp_dir[MAX_PATH];

    GetTempPath(MAX_PATH, temp_dir);
    GetTempFileName(temp_dir, "qem", 0, filename);
}
#else
/* Fill @filename (of @size bytes) with the name of a freshly created
 * temporary file under $TMPDIR (default /tmp). */
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;
    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/tmp";
    }
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    fd = mkstemp(filename);
    /* Fix: only close a valid descriptor -- the original called
     * close(fd) even when mkstemp() failed and returned -1. */
    if (fd >= 0) {
        close(fd);
    }
}
#endif
fc01f7e7 420
84a12e66
CH
421/*
422 * Detect host devices. By convention, /dev/cdrom[N] is always
423 * recognized as a host CDROM.
424 */
425static BlockDriver *find_hdev_driver(const char *filename)
426{
427 int score_max = 0, score;
428 BlockDriver *drv = NULL, *d;
429
430 QLIST_FOREACH(d, &bdrv_drivers, list) {
431 if (d->bdrv_probe_device) {
432 score = d->bdrv_probe_device(filename);
433 if (score > score_max) {
434 score_max = score;
435 drv = d;
436 }
437 }
438 }
439
440 return drv;
441}
442
b50cbabc 443BlockDriver *bdrv_find_protocol(const char *filename)
83f64091
FB
444{
445 BlockDriver *drv1;
446 char protocol[128];
1cec71e3 447 int len;
83f64091 448 const char *p;
19cb3738 449
66f82cee
KW
450 /* TODO Drivers without bdrv_file_open must be specified explicitly */
451
39508e7a
CH
452 /*
453 * XXX(hch): we really should not let host device detection
454 * override an explicit protocol specification, but moving this
455 * later breaks access to device names with colons in them.
456 * Thanks to the brain-dead persistent naming schemes on udev-
457 * based Linux systems those actually are quite common.
458 */
459 drv1 = find_hdev_driver(filename);
460 if (drv1) {
461 return drv1;
462 }
463
9e0b22f4 464 if (!path_has_protocol(filename)) {
39508e7a 465 return bdrv_find_format("file");
84a12e66 466 }
9e0b22f4
SH
467 p = strchr(filename, ':');
468 assert(p != NULL);
1cec71e3
AL
469 len = p - filename;
470 if (len > sizeof(protocol) - 1)
471 len = sizeof(protocol) - 1;
472 memcpy(protocol, filename, len);
473 protocol[len] = '\0';
8a22f02a 474 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
5fafdf24 475 if (drv1->protocol_name &&
8a22f02a 476 !strcmp(drv1->protocol_name, protocol)) {
83f64091 477 return drv1;
8a22f02a 478 }
83f64091
FB
479 }
480 return NULL;
481}
482
/* Probe the image at @filename and store the best-scoring format driver
 * in *pdrv (NULL on failure).  Returns 0 on success, negative errno on
 * failure (-ENOENT if no driver recognized the image). */
static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    /* Open at protocol level just to read the header bytes. */
    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Let each registered driver score the header; highest score wins.
     * ret is the number of header bytes actually read. */
    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}
531
51762288
SH
532/**
533 * Set the current 'total_sectors' value
534 */
535static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
536{
537 BlockDriver *drv = bs->drv;
538
396759ad
NB
539 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
540 if (bs->sg)
541 return 0;
542
51762288
SH
543 /* query actual device if possible, otherwise just trust the hint */
544 if (drv->bdrv_getlength) {
545 int64_t length = drv->bdrv_getlength(bs);
546 if (length < 0) {
547 return length;
548 }
549 hint = length >> BDRV_SECTOR_BITS;
550 }
551
552 bs->total_sectors = hint;
553 return 0;
554}
555
c3993cdc
SH
556/**
557 * Set open flags for a given cache mode
558 *
559 * Return 0 on success, -1 if the cache mode was invalid.
560 */
561int bdrv_parse_cache_flags(const char *mode, int *flags)
562{
563 *flags &= ~BDRV_O_CACHE_MASK;
564
565 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
566 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
92196b2f
SH
567 } else if (!strcmp(mode, "directsync")) {
568 *flags |= BDRV_O_NOCACHE;
c3993cdc
SH
569 } else if (!strcmp(mode, "writeback")) {
570 *flags |= BDRV_O_CACHE_WB;
571 } else if (!strcmp(mode, "unsafe")) {
572 *flags |= BDRV_O_CACHE_WB;
573 *flags |= BDRV_O_NO_FLUSH;
574 } else if (!strcmp(mode, "writethrough")) {
575 /* this is the default */
576 } else {
577 return -1;
578 }
579
580 return 0;
581}
582
53fec9d3
SH
583/**
584 * The copy-on-read flag is actually a reference count so multiple users may
585 * use the feature without worrying about clobbering its previous state.
586 * Copy-on-read stays enabled until all users have called to disable it.
587 */
588void bdrv_enable_copy_on_read(BlockDriverState *bs)
589{
590 bs->copy_on_read++;
591}
592
593void bdrv_disable_copy_on_read(BlockDriverState *bs)
594{
595 assert(bs->copy_on_read > 0);
596 bs->copy_on_read--;
597}
598
57915332
KW
/*
 * Common part for opening disk images and files
 *
 * Resets the state of @bs, then opens @filename with @drv -- directly
 * for protocol drivers (bdrv_file_open), otherwise by first opening a
 * protocol-level bs->file and then the format driver on top of it.
 * Returns 0 on success, negative errno on failure; on failure bs->file,
 * bs->opaque and bs->drv are cleaned up / reset to NULL.
 */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    /* Reset state possibly left behind by a previous open. */
    bs->file = NULL;
    bs->total_sectors = 0;
    bs->encrypted = 0;
    bs->valid_key = 0;
    bs->sg = 0;
    bs->open_flags = flags;
    bs->growable = 0;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);
    bs->backing_file[0] = '\0';

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    /* Temporary (snapshot) files can be unlinked now that they are open;
     * the data remains accessible through the open descriptor. */
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}
688
b6ce07aa
KW
689/*
690 * Opens a file using a protocol (file, host_device, nbd, ...)
691 */
83f64091 692int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
ea2384d3 693{
83f64091 694 BlockDriverState *bs;
6db95603 695 BlockDriver *drv;
83f64091
FB
696 int ret;
697
b50cbabc 698 drv = bdrv_find_protocol(filename);
6db95603
CH
699 if (!drv) {
700 return -ENOENT;
701 }
702
83f64091 703 bs = bdrv_new("");
b6ce07aa 704 ret = bdrv_open_common(bs, filename, flags, drv);
83f64091
FB
705 if (ret < 0) {
706 bdrv_delete(bs);
707 return ret;
3b0d4f61 708 }
71d0770c 709 bs->growable = 1;
83f64091
FB
710 *pbs = bs;
711 return 0;
712}
713
b6ce07aa
KW
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * With BDRV_O_SNAPSHOT a temporary qcow2 file backed by @filename is
 * created and opened instead, so guest writes never reach the original
 * image.  If @drv is NULL the format is probed.  Returns 0 on success,
 * negative errno on failure.
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    char tmp_filename[PATH_MAX];

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* Open the image once just to learn its size and whether it is
         * protocol-backed. */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        get_tmp_filename(tmp_filename, sizeof(tmp_filename));

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        /* NOTE(review): bdrv_find_format("qcow2") is dereferenced without
         * a NULL check; this relies on the qcow2 driver always being
         * registered -- confirm against the build configuration. */
        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        /* From here on, open the temporary qcow2 instead of @filename. */
        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
        char backing_filename[PATH_MAX];
        int back_flags;
        BlockDriver *back_drv = NULL;

        bs->backing_hd = bdrv_new("");

        /* Backing file names with a protocol prefix are used verbatim;
         * plain paths are resolved relative to the image. */
        if (path_has_protocol(bs->backing_file)) {
            pstrcpy(backing_filename, sizeof(backing_filename),
                    bs->backing_file);
        } else {
            path_combine(backing_filename, sizeof(backing_filename),
                         filename, bs->backing_file);
        }

        if (bs->backing_format[0] != '\0') {
            back_drv = bdrv_find_format(bs->backing_format);
        }

        /* backing files always opened read-only */
        back_flags =
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
        if (bs->is_temporary) {
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
        } else {
            /* base image inherits from "parent" */
            bs->backing_hd->keep_read_only = bs->keep_read_only;
        }
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    /* Remove the temporary snapshot file we may have created above. */
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}
847
fc01f7e7
FB
/* Close @bs: flush it, cancel any running block job, drain in-flight
 * I/O, tear down the backing chain and the driver state, and disable
 * throttling.  The BlockDriverState itself is not freed. */
void bdrv_close(BlockDriverState *bs)
{
    bdrv_flush(bs);
    if (bs->drv) {
        if (bs->job) {
            block_job_cancel_sync(bs->job);
        }
        bdrv_drain_all();

        /* Drop the default snapshot device if it is going away. */
        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        /* On POSIX the temporary file was already unlinked at open time;
         * on Windows it must be removed here. */
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;
        bs->backing_file[0] = '\0';
        bs->backing_format[0] = '\0';

        if (bs->file != NULL) {
            bdrv_delete(bs->file);
            bs->file = NULL;
        }

        bdrv_dev_change_media_cb(bs, false);
    }

    /*throttling disk I/O limits*/
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
890
2bc93fed
MK
891void bdrv_close_all(void)
892{
893 BlockDriverState *bs;
894
895 QTAILQ_FOREACH(bs, &bdrv_states, list) {
896 bdrv_close(bs);
897 }
898}
899
922453bc
SH
/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * Note that completion of an asynchronous I/O operation can trigger any
 * number of other I/O operations on other devices---for example a coroutine
 * can be arbitrarily complex and a constant flow of I/O can come until the
 * coroutine is complete. Because of this, it is not possible to have a
 * function to drain a single device's I/O queue.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;
    bool busy;

    do {
        /* qemu_aio_wait() returns true when it made progress. */
        busy = qemu_aio_wait();

        /* FIXME: We do not have timer support here, so this is effectively
         * a busy wait.
         */
        QTAILQ_FOREACH(bs, &bdrv_states, list) {
            if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
                qemu_co_queue_restart_all(&bs->throttled_reqs);
                busy = true;
            }
        }
    } while (busy);

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}
937
d22b2f41
RH
938/* make a BlockDriverState anonymous by removing from bdrv_state list.
939 Also, NULL terminate the device_name to prevent double remove */
940void bdrv_make_anon(BlockDriverState *bs)
941{
942 if (bs->device_name[0] != '\0') {
943 QTAILQ_REMOVE(&bdrv_states, bs, list);
944 }
945 bs->device_name[0] = '\0';
946}
947
e023b2e2
PB
948static void bdrv_rebind(BlockDriverState *bs)
949{
950 if (bs->drv && bs->drv->bdrv_rebind) {
951 bs->drv->bdrv_rebind(bs);
952 }
953}
954
8802d1fd
JC
/*
 * Add new bs contents at the top of an image chain while the chain is
 * live, while keeping required fields on the top layer.
 *
 * This will modify the BlockDriverState fields, and swap contents
 * between bs_new and bs_top. Both bs_new and bs_top are modified.
 *
 * bs_new is required to be anonymous.
 *
 * This function does not create any image files.
 */
void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
{
    BlockDriverState tmp;

    /* bs_new must be anonymous */
    assert(bs_new->device_name[0] == '\0');

    /* Start from bs_new's contents, then pull the fields that must stay
     * with the top of the chain over from bs_top. */
    tmp = *bs_new;

    /* there are some fields that need to stay on the top layer: */
    tmp.open_flags = bs_top->open_flags;

    /* dev info */
    tmp.dev_ops = bs_top->dev_ops;
    tmp.dev_opaque = bs_top->dev_opaque;
    tmp.dev = bs_top->dev;
    tmp.buffer_alignment = bs_top->buffer_alignment;
    tmp.copy_on_read = bs_top->copy_on_read;

    /* i/o timing parameters */
    tmp.slice_time = bs_top->slice_time;
    tmp.slice_start = bs_top->slice_start;
    tmp.slice_end = bs_top->slice_end;
    tmp.io_limits = bs_top->io_limits;
    tmp.io_base = bs_top->io_base;
    tmp.throttled_reqs = bs_top->throttled_reqs;
    tmp.block_timer = bs_top->block_timer;
    tmp.io_limits_enabled = bs_top->io_limits_enabled;

    /* geometry */
    tmp.cyls = bs_top->cyls;
    tmp.heads = bs_top->heads;
    tmp.secs = bs_top->secs;
    tmp.translation = bs_top->translation;

    /* r/w error */
    tmp.on_read_error = bs_top->on_read_error;
    tmp.on_write_error = bs_top->on_write_error;

    /* i/o status */
    tmp.iostatus_enabled = bs_top->iostatus_enabled;
    tmp.iostatus = bs_top->iostatus;

    /* keep the same entry in bdrv_states */
    pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
    tmp.list = bs_top->list;

    /* The contents of 'tmp' will become bs_top, as we are
     * swapping bs_new and bs_top contents. */
    tmp.backing_hd = bs_new;
    pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);
    bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format));

    /* swap contents of the fixed new bs and the current top */
    *bs_new = *bs_top;
    *bs_top = tmp;

    /* device_name[] was carried over from the old bs_top. bs_new
     * shouldn't be in bdrv_states, so we need to make device_name[]
     * reflect the anonymity of bs_new
     */
    bs_new->device_name[0] = '\0';

    /* clear the copied fields in the new backing file */
    bdrv_detach_dev(bs_new, bs_new->dev);

    qemu_co_queue_init(&bs_new->throttled_reqs);
    memset(&bs_new->io_base, 0, sizeof(bs_new->io_base));
    memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
    bdrv_iostatus_disable(bs_new);

    /* we don't use bdrv_io_limits_disable() for this, because we don't want
     * to affect or delete the block_timer, as it has been moved to bs_top */
    bs_new->io_limits_enabled = false;
    bs_new->block_timer = NULL;
    bs_new->slice_time = 0;
    bs_new->slice_start = 0;
    bs_new->slice_end = 0;

    bdrv_rebind(bs_new);
    bdrv_rebind(bs_top);
}
1048
b338082b
FB
1049void bdrv_delete(BlockDriverState *bs)
1050{
fa879d62 1051 assert(!bs->dev);
3e914655
PB
1052 assert(!bs->job);
1053 assert(!bs->in_use);
18846dee 1054
1b7bdbc1 1055 /* remove from list, if necessary */
d22b2f41 1056 bdrv_make_anon(bs);
34c6f050 1057
b338082b 1058 bdrv_close(bs);
66f82cee 1059
f9092b10 1060 assert(bs != bs_snapshots);
7267c094 1061 g_free(bs);
fc01f7e7
FB
1062}
1063
fa879d62
MA
1064int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1065/* TODO change to DeviceState *dev when all users are qdevified */
18846dee 1066{
fa879d62 1067 if (bs->dev) {
18846dee
MA
1068 return -EBUSY;
1069 }
fa879d62 1070 bs->dev = dev;
28a7282a 1071 bdrv_iostatus_reset(bs);
18846dee
MA
1072 return 0;
1073}
1074
fa879d62
MA
1075/* TODO qdevified devices don't use this, remove when devices are qdevified */
1076void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
18846dee 1077{
fa879d62
MA
1078 if (bdrv_attach_dev(bs, dev) < 0) {
1079 abort();
1080 }
1081}
1082
1083void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1084/* TODO change to DeviceState *dev when all users are qdevified */
1085{
1086 assert(bs->dev == dev);
1087 bs->dev = NULL;
0e49de52
MA
1088 bs->dev_ops = NULL;
1089 bs->dev_opaque = NULL;
29e05f20 1090 bs->buffer_alignment = 512;
18846dee
MA
1091}
1092
fa879d62
MA
1093/* TODO change to return DeviceState * when all users are qdevified */
1094void *bdrv_get_attached_dev(BlockDriverState *bs)
18846dee 1095{
fa879d62 1096 return bs->dev;
18846dee
MA
1097}
1098
0e49de52
MA
1099void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1100 void *opaque)
1101{
1102 bs->dev_ops = ops;
1103 bs->dev_opaque = opaque;
2c6942fa
MA
1104 if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
1105 bs_snapshots = NULL;
1106 }
0e49de52
MA
1107}
1108
329c0a48
LC
1109void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1110 BlockQMPEventAction action, int is_read)
1111{
1112 QObject *data;
1113 const char *action_str;
1114
1115 switch (action) {
1116 case BDRV_ACTION_REPORT:
1117 action_str = "report";
1118 break;
1119 case BDRV_ACTION_IGNORE:
1120 action_str = "ignore";
1121 break;
1122 case BDRV_ACTION_STOP:
1123 action_str = "stop";
1124 break;
1125 default:
1126 abort();
1127 }
1128
1129 data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1130 bdrv->device_name,
1131 action_str,
1132 is_read ? "read" : "write");
1133 monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1134
1135 qobject_decref(data);
1136}
1137
6f382ed2
LC
1138static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
1139{
1140 QObject *data;
1141
1142 data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
1143 bdrv_get_device_name(bs), ejected);
1144 monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
1145
1146 qobject_decref(data);
1147}
1148
7d4b4ba5 1149static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
0e49de52 1150{
145feb17 1151 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
6f382ed2 1152 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
7d4b4ba5 1153 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
6f382ed2
LC
1154 if (tray_was_closed) {
1155 /* tray open */
1156 bdrv_emit_qmp_eject_event(bs, true);
1157 }
1158 if (load) {
1159 /* tray close */
1160 bdrv_emit_qmp_eject_event(bs, false);
1161 }
145feb17
MA
1162 }
1163}
1164
2c6942fa
MA
1165bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1166{
1167 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1168}
1169
025ccaa7
PB
1170void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1171{
1172 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
1173 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
1174 }
1175}
1176
e4def80b
MA
1177bool bdrv_dev_is_tray_open(BlockDriverState *bs)
1178{
1179 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
1180 return bs->dev_ops->is_tray_open(bs->dev_opaque);
1181 }
1182 return false;
1183}
1184
145feb17
MA
1185static void bdrv_dev_resize_cb(BlockDriverState *bs)
1186{
1187 if (bs->dev_ops && bs->dev_ops->resize_cb) {
1188 bs->dev_ops->resize_cb(bs->dev_opaque);
0e49de52
MA
1189 }
1190}
1191
f107639a
MA
1192bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1193{
1194 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1195 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1196 }
1197 return false;
1198}
1199
e97fc193
AL
1200/*
1201 * Run consistency checks on an image
1202 *
e076f338 1203 * Returns 0 if the check could be completed (it doesn't mean that the image is
a1c7273b 1204 * free of errors) or -errno when an internal error occurred. The results of the
e076f338 1205 * check are stored in res.
e97fc193 1206 */
e076f338 1207int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
e97fc193
AL
1208{
1209 if (bs->drv->bdrv_check == NULL) {
1210 return -ENOTSUP;
1211 }
1212
e076f338 1213 memset(res, 0, sizeof(*res));
9ac228e0 1214 return bs->drv->bdrv_check(bs, res);
e97fc193
AL
1215}
1216
8a426614
KW
1217#define COMMIT_BUF_SECTORS 2048
1218
33e3963e
FB
1219/* commit COW file into the raw image */
1220int bdrv_commit(BlockDriverState *bs)
1221{
19cb3738 1222 BlockDriver *drv = bs->drv;
ee181196 1223 BlockDriver *backing_drv;
8a426614
KW
1224 int64_t sector, total_sectors;
1225 int n, ro, open_flags;
4dca4b63 1226 int ret = 0, rw_ret = 0;
8a426614 1227 uint8_t *buf;
4dca4b63
NS
1228 char filename[1024];
1229 BlockDriverState *bs_rw, *bs_ro;
33e3963e 1230
19cb3738
FB
1231 if (!drv)
1232 return -ENOMEDIUM;
4dca4b63
NS
1233
1234 if (!bs->backing_hd) {
1235 return -ENOTSUP;
33e3963e
FB
1236 }
1237
4dca4b63
NS
1238 if (bs->backing_hd->keep_read_only) {
1239 return -EACCES;
1240 }
ee181196 1241
2d3735d3
SH
1242 if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1243 return -EBUSY;
1244 }
1245
ee181196 1246 backing_drv = bs->backing_hd->drv;
4dca4b63
NS
1247 ro = bs->backing_hd->read_only;
1248 strncpy(filename, bs->backing_hd->filename, sizeof(filename));
1249 open_flags = bs->backing_hd->open_flags;
1250
1251 if (ro) {
1252 /* re-open as RW */
1253 bdrv_delete(bs->backing_hd);
1254 bs->backing_hd = NULL;
1255 bs_rw = bdrv_new("");
ee181196
KW
1256 rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
1257 backing_drv);
4dca4b63
NS
1258 if (rw_ret < 0) {
1259 bdrv_delete(bs_rw);
1260 /* try to re-open read-only */
1261 bs_ro = bdrv_new("");
ee181196
KW
1262 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1263 backing_drv);
4dca4b63
NS
1264 if (ret < 0) {
1265 bdrv_delete(bs_ro);
1266 /* drive not functional anymore */
1267 bs->drv = NULL;
1268 return ret;
1269 }
1270 bs->backing_hd = bs_ro;
1271 return rw_ret;
1272 }
1273 bs->backing_hd = bs_rw;
ea2384d3 1274 }
33e3963e 1275
6ea44308 1276 total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
7267c094 1277 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
8a426614
KW
1278
1279 for (sector = 0; sector < total_sectors; sector += n) {
05c4af54 1280 if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
8a426614
KW
1281
1282 if (bdrv_read(bs, sector, buf, n) != 0) {
1283 ret = -EIO;
1284 goto ro_cleanup;
1285 }
1286
1287 if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1288 ret = -EIO;
1289 goto ro_cleanup;
1290 }
ea2384d3 1291 }
33e3963e 1292 }
95389c86 1293
1d44952f
CH
1294 if (drv->bdrv_make_empty) {
1295 ret = drv->bdrv_make_empty(bs);
1296 bdrv_flush(bs);
1297 }
95389c86 1298
3f5075ae
CH
1299 /*
1300 * Make sure all data we wrote to the backing device is actually
1301 * stable on disk.
1302 */
1303 if (bs->backing_hd)
1304 bdrv_flush(bs->backing_hd);
4dca4b63
NS
1305
1306ro_cleanup:
7267c094 1307 g_free(buf);
4dca4b63
NS
1308
1309 if (ro) {
1310 /* re-open as RO */
1311 bdrv_delete(bs->backing_hd);
1312 bs->backing_hd = NULL;
1313 bs_ro = bdrv_new("");
ee181196
KW
1314 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1315 backing_drv);
4dca4b63
NS
1316 if (ret < 0) {
1317 bdrv_delete(bs_ro);
1318 /* drive not functional anymore */
1319 bs->drv = NULL;
1320 return ret;
1321 }
1322 bs->backing_hd = bs_ro;
1323 bs->backing_hd->keep_read_only = 0;
1324 }
1325
1d44952f 1326 return ret;
33e3963e
FB
1327}
1328
e8877497 1329int bdrv_commit_all(void)
6ab4b5ab
MA
1330{
1331 BlockDriverState *bs;
1332
1333 QTAILQ_FOREACH(bs, &bdrv_states, list) {
e8877497
SH
1334 int ret = bdrv_commit(bs);
1335 if (ret < 0) {
1336 return ret;
1337 }
6ab4b5ab 1338 }
e8877497 1339 return 0;
6ab4b5ab
MA
1340}
1341
dbffbdcf
SH
1342struct BdrvTrackedRequest {
1343 BlockDriverState *bs;
1344 int64_t sector_num;
1345 int nb_sectors;
1346 bool is_write;
1347 QLIST_ENTRY(BdrvTrackedRequest) list;
5f8b6491 1348 Coroutine *co; /* owner, used for deadlock detection */
f4658285 1349 CoQueue wait_queue; /* coroutines blocked on this request */
dbffbdcf
SH
1350};
1351
1352/**
1353 * Remove an active request from the tracked requests list
1354 *
1355 * This function should be called when a tracked request is completing.
1356 */
1357static void tracked_request_end(BdrvTrackedRequest *req)
1358{
1359 QLIST_REMOVE(req, list);
f4658285 1360 qemu_co_queue_restart_all(&req->wait_queue);
dbffbdcf
SH
1361}
1362
1363/**
1364 * Add an active request to the tracked requests list
1365 */
1366static void tracked_request_begin(BdrvTrackedRequest *req,
1367 BlockDriverState *bs,
1368 int64_t sector_num,
1369 int nb_sectors, bool is_write)
1370{
1371 *req = (BdrvTrackedRequest){
1372 .bs = bs,
1373 .sector_num = sector_num,
1374 .nb_sectors = nb_sectors,
1375 .is_write = is_write,
5f8b6491 1376 .co = qemu_coroutine_self(),
dbffbdcf
SH
1377 };
1378
f4658285
SH
1379 qemu_co_queue_init(&req->wait_queue);
1380
dbffbdcf
SH
1381 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1382}
1383
d83947ac
SH
1384/**
1385 * Round a region to cluster boundaries
1386 */
1387static void round_to_clusters(BlockDriverState *bs,
1388 int64_t sector_num, int nb_sectors,
1389 int64_t *cluster_sector_num,
1390 int *cluster_nb_sectors)
1391{
1392 BlockDriverInfo bdi;
1393
1394 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1395 *cluster_sector_num = sector_num;
1396 *cluster_nb_sectors = nb_sectors;
1397 } else {
1398 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1399 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1400 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1401 nb_sectors, c);
1402 }
1403}
1404
f4658285
SH
1405static bool tracked_request_overlaps(BdrvTrackedRequest *req,
1406 int64_t sector_num, int nb_sectors) {
d83947ac
SH
1407 /* aaaa bbbb */
1408 if (sector_num >= req->sector_num + req->nb_sectors) {
1409 return false;
1410 }
1411 /* bbbb aaaa */
1412 if (req->sector_num >= sector_num + nb_sectors) {
1413 return false;
1414 }
1415 return true;
f4658285
SH
1416}
1417
1418static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
1419 int64_t sector_num, int nb_sectors)
1420{
1421 BdrvTrackedRequest *req;
d83947ac
SH
1422 int64_t cluster_sector_num;
1423 int cluster_nb_sectors;
f4658285
SH
1424 bool retry;
1425
d83947ac
SH
1426 /* If we touch the same cluster it counts as an overlap. This guarantees
1427 * that allocating writes will be serialized and not race with each other
1428 * for the same cluster. For example, in copy-on-read it ensures that the
1429 * CoR read and write operations are atomic and guest writes cannot
1430 * interleave between them.
1431 */
1432 round_to_clusters(bs, sector_num, nb_sectors,
1433 &cluster_sector_num, &cluster_nb_sectors);
1434
f4658285
SH
1435 do {
1436 retry = false;
1437 QLIST_FOREACH(req, &bs->tracked_requests, list) {
d83947ac
SH
1438 if (tracked_request_overlaps(req, cluster_sector_num,
1439 cluster_nb_sectors)) {
5f8b6491
SH
1440 /* Hitting this means there was a reentrant request, for
1441 * example, a block driver issuing nested requests. This must
1442 * never happen since it means deadlock.
1443 */
1444 assert(qemu_coroutine_self() != req->co);
1445
f4658285
SH
1446 qemu_co_queue_wait(&req->wait_queue);
1447 retry = true;
1448 break;
1449 }
1450 }
1451 } while (retry);
1452}
1453
756e6736
KW
1454/*
1455 * Return values:
1456 * 0 - success
1457 * -EINVAL - backing format specified, but no file
1458 * -ENOSPC - can't update the backing file because no space is left in the
1459 * image file header
1460 * -ENOTSUP - format driver doesn't support changing the backing file
1461 */
1462int bdrv_change_backing_file(BlockDriverState *bs,
1463 const char *backing_file, const char *backing_fmt)
1464{
1465 BlockDriver *drv = bs->drv;
469ef350 1466 int ret;
756e6736 1467
5f377794
PB
1468 /* Backing file format doesn't make sense without a backing file */
1469 if (backing_fmt && !backing_file) {
1470 return -EINVAL;
1471 }
1472
756e6736 1473 if (drv->bdrv_change_backing_file != NULL) {
469ef350 1474 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
756e6736 1475 } else {
469ef350 1476 ret = -ENOTSUP;
756e6736 1477 }
469ef350
PB
1478
1479 if (ret == 0) {
1480 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
1481 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
1482 }
1483 return ret;
756e6736
KW
1484}
1485
71d0770c
AL
1486static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1487 size_t size)
1488{
1489 int64_t len;
1490
1491 if (!bdrv_is_inserted(bs))
1492 return -ENOMEDIUM;
1493
1494 if (bs->growable)
1495 return 0;
1496
1497 len = bdrv_getlength(bs);
1498
fbb7b4e0
KW
1499 if (offset < 0)
1500 return -EIO;
1501
1502 if ((offset > len) || (len - offset < size))
71d0770c
AL
1503 return -EIO;
1504
1505 return 0;
1506}
1507
1508static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1509 int nb_sectors)
1510{
eb5a3165
JS
1511 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1512 nb_sectors * BDRV_SECTOR_SIZE);
71d0770c
AL
1513}
1514
1c9805a3
SH
1515typedef struct RwCo {
1516 BlockDriverState *bs;
1517 int64_t sector_num;
1518 int nb_sectors;
1519 QEMUIOVector *qiov;
1520 bool is_write;
1521 int ret;
1522} RwCo;
1523
1524static void coroutine_fn bdrv_rw_co_entry(void *opaque)
fc01f7e7 1525{
1c9805a3 1526 RwCo *rwco = opaque;
ea2384d3 1527
1c9805a3
SH
1528 if (!rwco->is_write) {
1529 rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
470c0504 1530 rwco->nb_sectors, rwco->qiov, 0);
1c9805a3
SH
1531 } else {
1532 rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
f08f2dda 1533 rwco->nb_sectors, rwco->qiov, 0);
1c9805a3
SH
1534 }
1535}
e7a8a783 1536
1c9805a3
SH
1537/*
1538 * Process a synchronous request using coroutines
1539 */
1540static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1541 int nb_sectors, bool is_write)
1542{
1543 QEMUIOVector qiov;
1544 struct iovec iov = {
1545 .iov_base = (void *)buf,
1546 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1547 };
1548 Coroutine *co;
1549 RwCo rwco = {
1550 .bs = bs,
1551 .sector_num = sector_num,
1552 .nb_sectors = nb_sectors,
1553 .qiov = &qiov,
1554 .is_write = is_write,
1555 .ret = NOT_DONE,
1556 };
e7a8a783 1557
1c9805a3 1558 qemu_iovec_init_external(&qiov, &iov, 1);
e7a8a783 1559
498e386c
ZYW
1560 /**
1561 * In sync call context, when the vcpu is blocked, this throttling timer
1562 * will not fire; so the I/O throttling function has to be disabled here
1563 * if it has been enabled.
1564 */
1565 if (bs->io_limits_enabled) {
1566 fprintf(stderr, "Disabling I/O throttling on '%s' due "
1567 "to synchronous I/O.\n", bdrv_get_device_name(bs));
1568 bdrv_io_limits_disable(bs);
1569 }
1570
1c9805a3
SH
1571 if (qemu_in_coroutine()) {
1572 /* Fast-path if already in coroutine context */
1573 bdrv_rw_co_entry(&rwco);
1574 } else {
1575 co = qemu_coroutine_create(bdrv_rw_co_entry);
1576 qemu_coroutine_enter(co, &rwco);
1577 while (rwco.ret == NOT_DONE) {
1578 qemu_aio_wait();
1579 }
1580 }
1581 return rwco.ret;
1582}
b338082b 1583
1c9805a3
SH
1584/* return < 0 if error. See bdrv_write() for the return codes */
1585int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1586 uint8_t *buf, int nb_sectors)
1587{
1588 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
fc01f7e7
FB
1589}
1590
71df14fc
PB
1591#define BITS_PER_LONG (sizeof(unsigned long) * 8)
1592
7cd1e32a 1593static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
a55eb92c 1594 int nb_sectors, int dirty)
7cd1e32a 1595{
1596 int64_t start, end;
c6d22830 1597 unsigned long val, idx, bit;
a55eb92c 1598
6ea44308 1599 start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
c6d22830 1600 end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
a55eb92c
JK
1601
1602 for (; start <= end; start++) {
71df14fc
PB
1603 idx = start / BITS_PER_LONG;
1604 bit = start % BITS_PER_LONG;
c6d22830
JK
1605 val = bs->dirty_bitmap[idx];
1606 if (dirty) {
6d59fec1 1607 if (!(val & (1UL << bit))) {
aaa0eb75 1608 bs->dirty_count++;
6d59fec1 1609 val |= 1UL << bit;
aaa0eb75 1610 }
c6d22830 1611 } else {
6d59fec1 1612 if (val & (1UL << bit)) {
aaa0eb75 1613 bs->dirty_count--;
6d59fec1 1614 val &= ~(1UL << bit);
aaa0eb75 1615 }
c6d22830
JK
1616 }
1617 bs->dirty_bitmap[idx] = val;
7cd1e32a 1618 }
1619}
1620
5fafdf24 1621/* Return < 0 if error. Important errors are:
19cb3738
FB
1622 -EIO generic I/O error (may happen for all errors)
1623 -ENOMEDIUM No media inserted.
1624 -EINVAL Invalid sector number or nb_sectors
1625 -EACCES Trying to write a read-only device
1626*/
5fafdf24 1627int bdrv_write(BlockDriverState *bs, int64_t sector_num,
fc01f7e7
FB
1628 const uint8_t *buf, int nb_sectors)
1629{
1c9805a3 1630 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
83f64091
FB
1631}
1632
eda578e5
AL
1633int bdrv_pread(BlockDriverState *bs, int64_t offset,
1634 void *buf, int count1)
83f64091 1635{
6ea44308 1636 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
83f64091
FB
1637 int len, nb_sectors, count;
1638 int64_t sector_num;
9a8c4cce 1639 int ret;
83f64091
FB
1640
1641 count = count1;
1642 /* first read to align to sector start */
6ea44308 1643 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
83f64091
FB
1644 if (len > count)
1645 len = count;
6ea44308 1646 sector_num = offset >> BDRV_SECTOR_BITS;
83f64091 1647 if (len > 0) {
9a8c4cce
KW
1648 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1649 return ret;
6ea44308 1650 memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
83f64091
FB
1651 count -= len;
1652 if (count == 0)
1653 return count1;
1654 sector_num++;
1655 buf += len;
1656 }
1657
1658 /* read the sectors "in place" */
6ea44308 1659 nb_sectors = count >> BDRV_SECTOR_BITS;
83f64091 1660 if (nb_sectors > 0) {
9a8c4cce
KW
1661 if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1662 return ret;
83f64091 1663 sector_num += nb_sectors;
6ea44308 1664 len = nb_sectors << BDRV_SECTOR_BITS;
83f64091
FB
1665 buf += len;
1666 count -= len;
1667 }
1668
1669 /* add data from the last sector */
1670 if (count > 0) {
9a8c4cce
KW
1671 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1672 return ret;
83f64091
FB
1673 memcpy(buf, tmp_buf, count);
1674 }
1675 return count1;
1676}
1677
eda578e5
AL
1678int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1679 const void *buf, int count1)
83f64091 1680{
6ea44308 1681 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
83f64091
FB
1682 int len, nb_sectors, count;
1683 int64_t sector_num;
9a8c4cce 1684 int ret;
83f64091
FB
1685
1686 count = count1;
1687 /* first write to align to sector start */
6ea44308 1688 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
83f64091
FB
1689 if (len > count)
1690 len = count;
6ea44308 1691 sector_num = offset >> BDRV_SECTOR_BITS;
83f64091 1692 if (len > 0) {
9a8c4cce
KW
1693 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1694 return ret;
6ea44308 1695 memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
9a8c4cce
KW
1696 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1697 return ret;
83f64091
FB
1698 count -= len;
1699 if (count == 0)
1700 return count1;
1701 sector_num++;
1702 buf += len;
1703 }
1704
1705 /* write the sectors "in place" */
6ea44308 1706 nb_sectors = count >> BDRV_SECTOR_BITS;
83f64091 1707 if (nb_sectors > 0) {
9a8c4cce
KW
1708 if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1709 return ret;
83f64091 1710 sector_num += nb_sectors;
6ea44308 1711 len = nb_sectors << BDRV_SECTOR_BITS;
83f64091
FB
1712 buf += len;
1713 count -= len;
1714 }
1715
1716 /* add data from the last sector */
1717 if (count > 0) {
9a8c4cce
KW
1718 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1719 return ret;
83f64091 1720 memcpy(tmp_buf, buf, count);
9a8c4cce
KW
1721 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1722 return ret;
83f64091
FB
1723 }
1724 return count1;
1725}
83f64091 1726
f08145fe
KW
1727/*
1728 * Writes to the file and ensures that no writes are reordered across this
1729 * request (acts as a barrier)
1730 *
1731 * Returns 0 on success, -errno in error cases.
1732 */
1733int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1734 const void *buf, int count)
1735{
1736 int ret;
1737
1738 ret = bdrv_pwrite(bs, offset, buf, count);
1739 if (ret < 0) {
1740 return ret;
1741 }
1742
92196b2f
SH
1743 /* No flush needed for cache modes that use O_DSYNC */
1744 if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
f08145fe
KW
1745 bdrv_flush(bs);
1746 }
1747
1748 return 0;
1749}
1750
470c0504 1751static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
ab185921
SH
1752 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1753{
1754 /* Perform I/O through a temporary buffer so that users who scribble over
1755 * their read buffer while the operation is in progress do not end up
1756 * modifying the image file. This is critical for zero-copy guest I/O
1757 * where anything might happen inside guest memory.
1758 */
1759 void *bounce_buffer;
1760
79c053bd 1761 BlockDriver *drv = bs->drv;
ab185921
SH
1762 struct iovec iov;
1763 QEMUIOVector bounce_qiov;
1764 int64_t cluster_sector_num;
1765 int cluster_nb_sectors;
1766 size_t skip_bytes;
1767 int ret;
1768
1769 /* Cover entire cluster so no additional backing file I/O is required when
1770 * allocating cluster in the image file.
1771 */
1772 round_to_clusters(bs, sector_num, nb_sectors,
1773 &cluster_sector_num, &cluster_nb_sectors);
1774
470c0504
SH
1775 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
1776 cluster_sector_num, cluster_nb_sectors);
ab185921
SH
1777
1778 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
1779 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
1780 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
1781
79c053bd
SH
1782 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
1783 &bounce_qiov);
ab185921
SH
1784 if (ret < 0) {
1785 goto err;
1786 }
1787
79c053bd
SH
1788 if (drv->bdrv_co_write_zeroes &&
1789 buffer_is_zero(bounce_buffer, iov.iov_len)) {
621f0589
KW
1790 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
1791 cluster_nb_sectors);
79c053bd
SH
1792 } else {
1793 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
ab185921 1794 &bounce_qiov);
79c053bd
SH
1795 }
1796
ab185921
SH
1797 if (ret < 0) {
1798 /* It might be okay to ignore write errors for guest requests. If this
1799 * is a deliberate copy-on-read then we don't want to ignore the error.
1800 * Simply report it in all cases.
1801 */
1802 goto err;
1803 }
1804
1805 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
1806 qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
1807 nb_sectors * BDRV_SECTOR_SIZE);
1808
1809err:
1810 qemu_vfree(bounce_buffer);
1811 return ret;
1812}
1813
c5fbe571
SH
1814/*
1815 * Handle a read request in coroutine context
1816 */
1817static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
470c0504
SH
1818 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1819 BdrvRequestFlags flags)
da1fa91d
KW
1820{
1821 BlockDriver *drv = bs->drv;
dbffbdcf
SH
1822 BdrvTrackedRequest req;
1823 int ret;
da1fa91d 1824
da1fa91d
KW
1825 if (!drv) {
1826 return -ENOMEDIUM;
1827 }
1828 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1829 return -EIO;
1830 }
1831
98f90dba
ZYW
1832 /* throttling disk read I/O */
1833 if (bs->io_limits_enabled) {
1834 bdrv_io_limits_intercept(bs, false, nb_sectors);
1835 }
1836
f4658285 1837 if (bs->copy_on_read) {
470c0504
SH
1838 flags |= BDRV_REQ_COPY_ON_READ;
1839 }
1840 if (flags & BDRV_REQ_COPY_ON_READ) {
1841 bs->copy_on_read_in_flight++;
1842 }
1843
1844 if (bs->copy_on_read_in_flight) {
f4658285
SH
1845 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1846 }
1847
dbffbdcf 1848 tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
ab185921 1849
470c0504 1850 if (flags & BDRV_REQ_COPY_ON_READ) {
ab185921
SH
1851 int pnum;
1852
1853 ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
1854 if (ret < 0) {
1855 goto out;
1856 }
1857
1858 if (!ret || pnum != nb_sectors) {
470c0504 1859 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
ab185921
SH
1860 goto out;
1861 }
1862 }
1863
dbffbdcf 1864 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
ab185921
SH
1865
1866out:
dbffbdcf 1867 tracked_request_end(&req);
470c0504
SH
1868
1869 if (flags & BDRV_REQ_COPY_ON_READ) {
1870 bs->copy_on_read_in_flight--;
1871 }
1872
dbffbdcf 1873 return ret;
da1fa91d
KW
1874}
1875
c5fbe571 1876int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
da1fa91d
KW
1877 int nb_sectors, QEMUIOVector *qiov)
1878{
c5fbe571 1879 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
da1fa91d 1880
470c0504
SH
1881 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1882}
1883
1884int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1885 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1886{
1887 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1888
1889 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1890 BDRV_REQ_COPY_ON_READ);
c5fbe571
SH
1891}
1892
f08f2dda
SH
1893static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1894 int64_t sector_num, int nb_sectors)
1895{
1896 BlockDriver *drv = bs->drv;
1897 QEMUIOVector qiov;
1898 struct iovec iov;
1899 int ret;
1900
621f0589
KW
1901 /* TODO Emulate only part of misaligned requests instead of letting block
1902 * drivers return -ENOTSUP and emulate everything */
1903
f08f2dda
SH
1904 /* First try the efficient write zeroes operation */
1905 if (drv->bdrv_co_write_zeroes) {
621f0589
KW
1906 ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1907 if (ret != -ENOTSUP) {
1908 return ret;
1909 }
f08f2dda
SH
1910 }
1911
1912 /* Fall back to bounce buffer if write zeroes is unsupported */
1913 iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
1914 iov.iov_base = qemu_blockalign(bs, iov.iov_len);
1915 memset(iov.iov_base, 0, iov.iov_len);
1916 qemu_iovec_init_external(&qiov, &iov, 1);
1917
1918 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
1919
1920 qemu_vfree(iov.iov_base);
1921 return ret;
1922}
1923
c5fbe571
SH
1924/*
1925 * Handle a write request in coroutine context
1926 */
1927static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
f08f2dda
SH
1928 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1929 BdrvRequestFlags flags)
c5fbe571
SH
1930{
1931 BlockDriver *drv = bs->drv;
dbffbdcf 1932 BdrvTrackedRequest req;
6b7cb247 1933 int ret;
da1fa91d
KW
1934
1935 if (!bs->drv) {
1936 return -ENOMEDIUM;
1937 }
1938 if (bs->read_only) {
1939 return -EACCES;
1940 }
1941 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1942 return -EIO;
1943 }
1944
98f90dba
ZYW
1945 /* throttling disk write I/O */
1946 if (bs->io_limits_enabled) {
1947 bdrv_io_limits_intercept(bs, true, nb_sectors);
1948 }
1949
470c0504 1950 if (bs->copy_on_read_in_flight) {
f4658285
SH
1951 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1952 }
1953
dbffbdcf
SH
1954 tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
1955
f08f2dda
SH
1956 if (flags & BDRV_REQ_ZERO_WRITE) {
1957 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
1958 } else {
1959 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1960 }
6b7cb247 1961
da1fa91d
KW
1962 if (bs->dirty_bitmap) {
1963 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1964 }
1965
1966 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1967 bs->wr_highest_sector = sector_num + nb_sectors - 1;
1968 }
1969
dbffbdcf
SH
1970 tracked_request_end(&req);
1971
6b7cb247 1972 return ret;
da1fa91d
KW
1973}
1974
c5fbe571
SH
1975int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1976 int nb_sectors, QEMUIOVector *qiov)
1977{
1978 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1979
f08f2dda
SH
1980 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
1981}
1982
1983int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
1984 int64_t sector_num, int nb_sectors)
1985{
1986 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1987
1988 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
1989 BDRV_REQ_ZERO_WRITE);
c5fbe571
SH
1990}
1991
83f64091
FB
1992/**
1993 * Truncate file to 'offset' bytes (needed only for file protocols)
1994 */
1995int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1996{
1997 BlockDriver *drv = bs->drv;
51762288 1998 int ret;
83f64091 1999 if (!drv)
19cb3738 2000 return -ENOMEDIUM;
83f64091
FB
2001 if (!drv->bdrv_truncate)
2002 return -ENOTSUP;
59f2689d
NS
2003 if (bs->read_only)
2004 return -EACCES;
8591675f
MT
2005 if (bdrv_in_use(bs))
2006 return -EBUSY;
51762288
SH
2007 ret = drv->bdrv_truncate(bs, offset);
2008 if (ret == 0) {
2009 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
145feb17 2010 bdrv_dev_resize_cb(bs);
51762288
SH
2011 }
2012 return ret;
83f64091
FB
2013}
2014
4a1d5e1f
FZ
2015/**
2016 * Length of a allocated file in bytes. Sparse files are counted by actual
2017 * allocated space. Return < 0 if error or unknown.
2018 */
2019int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2020{
2021 BlockDriver *drv = bs->drv;
2022 if (!drv) {
2023 return -ENOMEDIUM;
2024 }
2025 if (drv->bdrv_get_allocated_file_size) {
2026 return drv->bdrv_get_allocated_file_size(bs);
2027 }
2028 if (bs->file) {
2029 return bdrv_get_allocated_file_size(bs->file);
2030 }
2031 return -ENOTSUP;
2032}
2033
83f64091
FB
2034/**
2035 * Length of a file in bytes. Return < 0 if error or unknown.
2036 */
2037int64_t bdrv_getlength(BlockDriverState *bs)
2038{
2039 BlockDriver *drv = bs->drv;
2040 if (!drv)
19cb3738 2041 return -ENOMEDIUM;
51762288 2042
2c6942fa 2043 if (bs->growable || bdrv_dev_has_removable_media(bs)) {
46a4e4e6
SH
2044 if (drv->bdrv_getlength) {
2045 return drv->bdrv_getlength(bs);
2046 }
83f64091 2047 }
46a4e4e6 2048 return bs->total_sectors * BDRV_SECTOR_SIZE;
fc01f7e7
FB
2049}
2050
19cb3738 2051/* return 0 as number of sectors if no device present or error */
96b8f136 2052void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
fc01f7e7 2053{
19cb3738
FB
2054 int64_t length;
2055 length = bdrv_getlength(bs);
2056 if (length < 0)
2057 length = 0;
2058 else
6ea44308 2059 length = length >> BDRV_SECTOR_BITS;
19cb3738 2060 *nb_sectors_ptr = length;
fc01f7e7 2061}
cf98951b 2062
f3d54fc4
AL
2063struct partition {
2064 uint8_t boot_ind; /* 0x80 - active */
2065 uint8_t head; /* starting head */
2066 uint8_t sector; /* starting sector */
2067 uint8_t cyl; /* starting cylinder */
2068 uint8_t sys_ind; /* What partition type */
2069 uint8_t end_head; /* end head */
2070 uint8_t end_sector; /* end sector */
2071 uint8_t end_cyl; /* end cylinder */
2072 uint32_t start_sect; /* starting sector counting from 0 */
2073 uint32_t nr_sects; /* nr of sectors in partition */
541dc0d4 2074} QEMU_PACKED;
f3d54fc4
AL
2075
2076/* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if could not guess */
2077static int guess_disk_lchs(BlockDriverState *bs,
2078 int *pcylinders, int *pheads, int *psectors)
2079{
eb5a3165 2080 uint8_t buf[BDRV_SECTOR_SIZE];
f3d54fc4
AL
2081 int ret, i, heads, sectors, cylinders;
2082 struct partition *p;
2083 uint32_t nr_sects;
a38131b6 2084 uint64_t nb_sectors;
498e386c 2085 bool enabled;
f3d54fc4
AL
2086
2087 bdrv_get_geometry(bs, &nb_sectors);
2088
498e386c
ZYW
2089 /**
2090 * The function will be invoked during startup not only in sync I/O mode,
2091 * but also in async I/O mode. So the I/O throttling function has to
2092 * be disabled temporarily here, not permanently.
2093 */
2094 enabled = bs->io_limits_enabled;
2095 bs->io_limits_enabled = false;
f3d54fc4 2096 ret = bdrv_read(bs, 0, buf, 1);
498e386c 2097 bs->io_limits_enabled = enabled;
f3d54fc4
AL
2098 if (ret < 0)
2099 return -1;
2100 /* test msdos magic */
2101 if (buf[510] != 0x55 || buf[511] != 0xaa)
2102 return -1;
2103 for(i = 0; i < 4; i++) {
2104 p = ((struct partition *)(buf + 0x1be)) + i;
2105 nr_sects = le32_to_cpu(p->nr_sects);
2106 if (nr_sects && p->end_head) {
2107 /* We make the assumption that the partition terminates on
2108 a cylinder boundary */
2109 heads = p->end_head + 1;
2110 sectors = p->end_sector & 63;
2111 if (sectors == 0)
2112 continue;
2113 cylinders = nb_sectors / (heads * sectors);
2114 if (cylinders < 1 || cylinders > 16383)
2115 continue;
2116 *pheads = heads;
2117 *psectors = sectors;
2118 *pcylinders = cylinders;
2119#if 0
2120 printf("guessed geometry: LCHS=%d %d %d\n",
2121 cylinders, heads, sectors);
2122#endif
2123 return 0;
2124 }
2125 }
2126 return -1;
2127}
2128
2129void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
2130{
2131 int translation, lba_detected = 0;
2132 int cylinders, heads, secs;
a38131b6 2133 uint64_t nb_sectors;
f3d54fc4
AL
2134
2135 /* if a geometry hint is available, use it */
2136 bdrv_get_geometry(bs, &nb_sectors);
2137 bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
2138 translation = bdrv_get_translation_hint(bs);
2139 if (cylinders != 0) {
2140 *pcyls = cylinders;
2141 *pheads = heads;
2142 *psecs = secs;
2143 } else {
2144 if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
2145 if (heads > 16) {
2146 /* if heads > 16, it means that a BIOS LBA
2147 translation was active, so the default
2148 hardware geometry is OK */
2149 lba_detected = 1;
2150 goto default_geometry;
2151 } else {
2152 *pcyls = cylinders;
2153 *pheads = heads;
2154 *psecs = secs;
2155 /* disable any translation to be in sync with
2156 the logical geometry */
2157 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
2158 bdrv_set_translation_hint(bs,
2159 BIOS_ATA_TRANSLATION_NONE);
2160 }
2161 }
2162 } else {
2163 default_geometry:
2164 /* if no geometry, use a standard physical disk geometry */
2165 cylinders = nb_sectors / (16 * 63);
2166
2167 if (cylinders > 16383)
2168 cylinders = 16383;
2169 else if (cylinders < 2)
2170 cylinders = 2;
2171 *pcyls = cylinders;
2172 *pheads = 16;
2173 *psecs = 63;
2174 if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
2175 if ((*pcyls * *pheads) <= 131072) {
2176 bdrv_set_translation_hint(bs,
2177 BIOS_ATA_TRANSLATION_LARGE);
2178 } else {
2179 bdrv_set_translation_hint(bs,
2180 BIOS_ATA_TRANSLATION_LBA);
2181 }
2182 }
2183 }
2184 bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
2185 }
2186}
2187
5fafdf24 2188void bdrv_set_geometry_hint(BlockDriverState *bs,
b338082b
FB
2189 int cyls, int heads, int secs)
2190{
2191 bs->cyls = cyls;
2192 bs->heads = heads;
2193 bs->secs = secs;
2194}
2195
46d4767d
FB
2196void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
2197{
2198 bs->translation = translation;
2199}
2200
5fafdf24 2201void bdrv_get_geometry_hint(BlockDriverState *bs,
b338082b
FB
2202 int *pcyls, int *pheads, int *psecs)
2203{
2204 *pcyls = bs->cyls;
2205 *pheads = bs->heads;
2206 *psecs = bs->secs;
2207}
2208
0563e191
ZYW
2209/* throttling disk io limits */
2210void bdrv_set_io_limits(BlockDriverState *bs,
2211 BlockIOLimit *io_limits)
2212{
2213 bs->io_limits = *io_limits;
2214 bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2215}
2216
5bbdbb46
BS
2217/* Recognize floppy formats */
2218typedef struct FDFormat {
2219 FDriveType drive;
2220 uint8_t last_sect;
2221 uint8_t max_track;
2222 uint8_t max_head;
f8d3d128 2223 FDriveRate rate;
5bbdbb46
BS
2224} FDFormat;
2225
2226static const FDFormat fd_formats[] = {
2227 /* First entry is default format */
2228 /* 1.44 MB 3"1/2 floppy disks */
f8d3d128
HP
2229 { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
2230 { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
2231 { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
2232 { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
2233 { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
2234 { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
2235 { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
2236 { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
5bbdbb46 2237 /* 2.88 MB 3"1/2 floppy disks */
f8d3d128
HP
2238 { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
2239 { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
2240 { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
2241 { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
2242 { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
5bbdbb46 2243 /* 720 kB 3"1/2 floppy disks */
f8d3d128
HP
2244 { FDRIVE_DRV_144, 9, 80, 1, FDRIVE_RATE_250K, },
2245 { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
2246 { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
2247 { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
2248 { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
2249 { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
5bbdbb46 2250 /* 1.2 MB 5"1/4 floppy disks */
f8d3d128
HP
2251 { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
2252 { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
2253 { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
2254 { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
2255 { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
5bbdbb46 2256 /* 720 kB 5"1/4 floppy disks */
f8d3d128
HP
2257 { FDRIVE_DRV_120, 9, 80, 1, FDRIVE_RATE_250K, },
2258 { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
5bbdbb46 2259 /* 360 kB 5"1/4 floppy disks */
f8d3d128
HP
2260 { FDRIVE_DRV_120, 9, 40, 1, FDRIVE_RATE_300K, },
2261 { FDRIVE_DRV_120, 9, 40, 0, FDRIVE_RATE_300K, },
2262 { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
2263 { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
5bbdbb46 2264 /* 320 kB 5"1/4 floppy disks */
f8d3d128
HP
2265 { FDRIVE_DRV_120, 8, 40, 1, FDRIVE_RATE_250K, },
2266 { FDRIVE_DRV_120, 8, 40, 0, FDRIVE_RATE_250K, },
5bbdbb46 2267 /* 360 kB must match 5"1/4 better than 3"1/2... */
f8d3d128 2268 { FDRIVE_DRV_144, 9, 80, 0, FDRIVE_RATE_250K, },
5bbdbb46 2269 /* end */
f8d3d128 2270 { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
5bbdbb46
BS
2271};
2272
2273void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
2274 int *max_track, int *last_sect,
f8d3d128
HP
2275 FDriveType drive_in, FDriveType *drive,
2276 FDriveRate *rate)
5bbdbb46
BS
2277{
2278 const FDFormat *parse;
2279 uint64_t nb_sectors, size;
2280 int i, first_match, match;
2281
2282 bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
2283 if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
2284 /* User defined disk */
f8d3d128 2285 *rate = FDRIVE_RATE_500K;
5bbdbb46
BS
2286 } else {
2287 bdrv_get_geometry(bs, &nb_sectors);
2288 match = -1;
2289 first_match = -1;
2290 for (i = 0; ; i++) {
2291 parse = &fd_formats[i];
2292 if (parse->drive == FDRIVE_DRV_NONE) {
2293 break;
2294 }
2295 if (drive_in == parse->drive ||
2296 drive_in == FDRIVE_DRV_NONE) {
2297 size = (parse->max_head + 1) * parse->max_track *
2298 parse->last_sect;
2299 if (nb_sectors == size) {
2300 match = i;
2301 break;
2302 }
2303 if (first_match == -1) {
2304 first_match = i;
2305 }
2306 }
2307 }
2308 if (match == -1) {
2309 if (first_match == -1) {
2310 match = 1;
2311 } else {
2312 match = first_match;
2313 }
2314 parse = &fd_formats[match];
2315 }
2316 *nb_heads = parse->max_head + 1;
2317 *max_track = parse->max_track;
2318 *last_sect = parse->last_sect;
2319 *drive = parse->drive;
f8d3d128 2320 *rate = parse->rate;
5bbdbb46
BS
2321 }
2322}
2323
46d4767d
FB
2324int bdrv_get_translation_hint(BlockDriverState *bs)
2325{
2326 return bs->translation;
2327}
2328
abd7f68d
MA
2329void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
2330 BlockErrorAction on_write_error)
2331{
2332 bs->on_read_error = on_read_error;
2333 bs->on_write_error = on_write_error;
2334}
2335
2336BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
2337{
2338 return is_read ? bs->on_read_error : bs->on_write_error;
2339}
2340
b338082b
FB
2341int bdrv_is_read_only(BlockDriverState *bs)
2342{
2343 return bs->read_only;
2344}
2345
985a03b0
TS
2346int bdrv_is_sg(BlockDriverState *bs)
2347{
2348 return bs->sg;
2349}
2350
e900a7b7
CH
2351int bdrv_enable_write_cache(BlockDriverState *bs)
2352{
2353 return bs->enable_write_cache;
2354}
2355
ea2384d3
FB
2356int bdrv_is_encrypted(BlockDriverState *bs)
2357{
2358 if (bs->backing_hd && bs->backing_hd->encrypted)
2359 return 1;
2360 return bs->encrypted;
2361}
2362
c0f4ce77
AL
2363int bdrv_key_required(BlockDriverState *bs)
2364{
2365 BlockDriverState *backing_hd = bs->backing_hd;
2366
2367 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2368 return 1;
2369 return (bs->encrypted && !bs->valid_key);
2370}
2371
ea2384d3
FB
2372int bdrv_set_key(BlockDriverState *bs, const char *key)
2373{
2374 int ret;
2375 if (bs->backing_hd && bs->backing_hd->encrypted) {
2376 ret = bdrv_set_key(bs->backing_hd, key);
2377 if (ret < 0)
2378 return ret;
2379 if (!bs->encrypted)
2380 return 0;
2381 }
fd04a2ae
SH
2382 if (!bs->encrypted) {
2383 return -EINVAL;
2384 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2385 return -ENOMEDIUM;
2386 }
c0f4ce77 2387 ret = bs->drv->bdrv_set_key(bs, key);
bb5fc20f
AL
2388 if (ret < 0) {
2389 bs->valid_key = 0;
2390 } else if (!bs->valid_key) {
2391 bs->valid_key = 1;
2392 /* call the change callback now, we skipped it on open */
7d4b4ba5 2393 bdrv_dev_change_media_cb(bs, true);
bb5fc20f 2394 }
c0f4ce77 2395 return ret;
ea2384d3
FB
2396}
2397
2398void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
2399{
19cb3738 2400 if (!bs->drv) {
ea2384d3
FB
2401 buf[0] = '\0';
2402 } else {
2403 pstrcpy(buf, buf_size, bs->drv->format_name);
2404 }
2405}
2406
5fafdf24 2407void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
ea2384d3
FB
2408 void *opaque)
2409{
2410 BlockDriver *drv;
2411
8a22f02a 2412 QLIST_FOREACH(drv, &bdrv_drivers, list) {
ea2384d3
FB
2413 it(opaque, drv->format_name);
2414 }
2415}
2416
b338082b
FB
2417BlockDriverState *bdrv_find(const char *name)
2418{
2419 BlockDriverState *bs;
2420
1b7bdbc1
SH
2421 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2422 if (!strcmp(name, bs->device_name)) {
b338082b 2423 return bs;
1b7bdbc1 2424 }
b338082b
FB
2425 }
2426 return NULL;
2427}
2428
2f399b0a
MA
2429BlockDriverState *bdrv_next(BlockDriverState *bs)
2430{
2431 if (!bs) {
2432 return QTAILQ_FIRST(&bdrv_states);
2433 }
2434 return QTAILQ_NEXT(bs, list);
2435}
2436
51de9760 2437void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
81d0912d
FB
2438{
2439 BlockDriverState *bs;
2440
1b7bdbc1 2441 QTAILQ_FOREACH(bs, &bdrv_states, list) {
51de9760 2442 it(opaque, bs);
81d0912d
FB
2443 }
2444}
2445
ea2384d3
FB
2446const char *bdrv_get_device_name(BlockDriverState *bs)
2447{
2448 return bs->device_name;
2449}
2450
c6ca28d6
AL
2451void bdrv_flush_all(void)
2452{
2453 BlockDriverState *bs;
2454
1b7bdbc1 2455 QTAILQ_FOREACH(bs, &bdrv_states, list) {
29cdb251 2456 bdrv_flush(bs);
1b7bdbc1 2457 }
c6ca28d6
AL
2458}
2459
f2feebbd
KW
2460int bdrv_has_zero_init(BlockDriverState *bs)
2461{
2462 assert(bs->drv);
2463
336c1c12
KW
2464 if (bs->drv->bdrv_has_zero_init) {
2465 return bs->drv->bdrv_has_zero_init(bs);
f2feebbd
KW
2466 }
2467
2468 return 1;
2469}
2470
376ae3f1
SH
2471typedef struct BdrvCoIsAllocatedData {
2472 BlockDriverState *bs;
2473 int64_t sector_num;
2474 int nb_sectors;
2475 int *pnum;
2476 int ret;
2477 bool done;
2478} BdrvCoIsAllocatedData;
2479
f58c7b35
TS
2480/*
2481 * Returns true iff the specified sector is present in the disk image. Drivers
2482 * not implementing the functionality are assumed to not support backing files,
2483 * hence all their sectors are reported as allocated.
2484 *
bd9533e3
SH
2485 * If 'sector_num' is beyond the end of the disk image the return value is 0
2486 * and 'pnum' is set to 0.
2487 *
f58c7b35
TS
2488 * 'pnum' is set to the number of sectors (including and immediately following
2489 * the specified sector) that are known to be in the same
2490 * allocated/unallocated state.
2491 *
bd9533e3
SH
2492 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
2493 * beyond the end of the disk image it will be clamped.
f58c7b35 2494 */
060f51c9
SH
2495int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2496 int nb_sectors, int *pnum)
f58c7b35 2497{
bd9533e3
SH
2498 int64_t n;
2499
2500 if (sector_num >= bs->total_sectors) {
2501 *pnum = 0;
2502 return 0;
2503 }
2504
2505 n = bs->total_sectors - sector_num;
2506 if (n < nb_sectors) {
2507 nb_sectors = n;
2508 }
2509
6aebab14 2510 if (!bs->drv->bdrv_co_is_allocated) {
bd9533e3 2511 *pnum = nb_sectors;
f58c7b35
TS
2512 return 1;
2513 }
6aebab14 2514
060f51c9
SH
2515 return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2516}
2517
2518/* Coroutine wrapper for bdrv_is_allocated() */
2519static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2520{
2521 BdrvCoIsAllocatedData *data = opaque;
2522 BlockDriverState *bs = data->bs;
2523
2524 data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2525 data->pnum);
2526 data->done = true;
2527}
2528
2529/*
2530 * Synchronous wrapper around bdrv_co_is_allocated().
2531 *
2532 * See bdrv_co_is_allocated() for details.
2533 */
2534int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2535 int *pnum)
2536{
6aebab14
SH
2537 Coroutine *co;
2538 BdrvCoIsAllocatedData data = {
2539 .bs = bs,
2540 .sector_num = sector_num,
2541 .nb_sectors = nb_sectors,
2542 .pnum = pnum,
2543 .done = false,
2544 };
2545
2546 co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2547 qemu_coroutine_enter(co, &data);
2548 while (!data.done) {
2549 qemu_aio_wait();
2550 }
2551 return data.ret;
f58c7b35
TS
2552}
2553
b2023818 2554BlockInfoList *qmp_query_block(Error **errp)
b338082b 2555{
b2023818 2556 BlockInfoList *head = NULL, *cur_item = NULL;
b338082b
FB
2557 BlockDriverState *bs;
2558
1b7bdbc1 2559 QTAILQ_FOREACH(bs, &bdrv_states, list) {
b2023818 2560 BlockInfoList *info = g_malloc0(sizeof(*info));
d15e5465 2561
b2023818
LC
2562 info->value = g_malloc0(sizeof(*info->value));
2563 info->value->device = g_strdup(bs->device_name);
2564 info->value->type = g_strdup("unknown");
2565 info->value->locked = bdrv_dev_is_medium_locked(bs);
2566 info->value->removable = bdrv_dev_has_removable_media(bs);
d15e5465 2567
e4def80b 2568 if (bdrv_dev_has_removable_media(bs)) {
b2023818
LC
2569 info->value->has_tray_open = true;
2570 info->value->tray_open = bdrv_dev_is_tray_open(bs);
e4def80b 2571 }
f04ef601
LC
2572
2573 if (bdrv_iostatus_is_enabled(bs)) {
b2023818
LC
2574 info->value->has_io_status = true;
2575 info->value->io_status = bs->iostatus;
f04ef601
LC
2576 }
2577
19cb3738 2578 if (bs->drv) {
b2023818
LC
2579 info->value->has_inserted = true;
2580 info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2581 info->value->inserted->file = g_strdup(bs->filename);
2582 info->value->inserted->ro = bs->read_only;
2583 info->value->inserted->drv = g_strdup(bs->drv->format_name);
2584 info->value->inserted->encrypted = bs->encrypted;
2585 if (bs->backing_file[0]) {
2586 info->value->inserted->has_backing_file = true;
2587 info->value->inserted->backing_file = g_strdup(bs->backing_file);
376253ec 2588 }
727f005e
ZYW
2589
2590 if (bs->io_limits_enabled) {
2591 info->value->inserted->bps =
2592 bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2593 info->value->inserted->bps_rd =
2594 bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2595 info->value->inserted->bps_wr =
2596 bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2597 info->value->inserted->iops =
2598 bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2599 info->value->inserted->iops_rd =
2600 bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2601 info->value->inserted->iops_wr =
2602 bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2603 }
b2023818 2604 }
d15e5465 2605
b2023818
LC
2606 /* XXX: waiting for the qapi to support GSList */
2607 if (!cur_item) {
2608 head = cur_item = info;
2609 } else {
2610 cur_item->next = info;
2611 cur_item = info;
b338082b 2612 }
b338082b 2613 }
d15e5465 2614
b2023818 2615 return head;
b338082b 2616}
a36e69dd 2617
f11f57e4
LC
2618/* Consider exposing this as a full fledged QMP command */
2619static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2620{
2621 BlockStats *s;
2622
2623 s = g_malloc0(sizeof(*s));
2624
2625 if (bs->device_name[0]) {
2626 s->has_device = true;
2627 s->device = g_strdup(bs->device_name);
294cc35f
KW
2628 }
2629
f11f57e4
LC
2630 s->stats = g_malloc0(sizeof(*s->stats));
2631 s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2632 s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2633 s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2634 s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2635 s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2636 s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2637 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2638 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2639 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2640
294cc35f 2641 if (bs->file) {
f11f57e4
LC
2642 s->has_parent = true;
2643 s->parent = qmp_query_blockstat(bs->file, NULL);
294cc35f
KW
2644 }
2645
f11f57e4 2646 return s;
294cc35f
KW
2647}
2648
f11f57e4 2649BlockStatsList *qmp_query_blockstats(Error **errp)
218a536a 2650{
f11f57e4 2651 BlockStatsList *head = NULL, *cur_item = NULL;
a36e69dd
TS
2652 BlockDriverState *bs;
2653
1b7bdbc1 2654 QTAILQ_FOREACH(bs, &bdrv_states, list) {
f11f57e4
LC
2655 BlockStatsList *info = g_malloc0(sizeof(*info));
2656 info->value = qmp_query_blockstat(bs, NULL);
2657
2658 /* XXX: waiting for the qapi to support GSList */
2659 if (!cur_item) {
2660 head = cur_item = info;
2661 } else {
2662 cur_item->next = info;
2663 cur_item = info;
2664 }
a36e69dd 2665 }
218a536a 2666
f11f57e4 2667 return head;
a36e69dd 2668}
ea2384d3 2669
045df330
AL
2670const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2671{
2672 if (bs->backing_hd && bs->backing_hd->encrypted)
2673 return bs->backing_file;
2674 else if (bs->encrypted)
2675 return bs->filename;
2676 else
2677 return NULL;
2678}
2679
5fafdf24 2680void bdrv_get_backing_filename(BlockDriverState *bs,
83f64091
FB
2681 char *filename, int filename_size)
2682{
3574c608 2683 pstrcpy(filename, filename_size, bs->backing_file);
83f64091
FB
2684}
2685
5fafdf24 2686int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
faea38e7
FB
2687 const uint8_t *buf, int nb_sectors)
2688{
2689 BlockDriver *drv = bs->drv;
2690 if (!drv)
19cb3738 2691 return -ENOMEDIUM;
faea38e7
FB
2692 if (!drv->bdrv_write_compressed)
2693 return -ENOTSUP;
fbb7b4e0
KW
2694 if (bdrv_check_request(bs, sector_num, nb_sectors))
2695 return -EIO;
a55eb92c 2696
c6d22830 2697 if (bs->dirty_bitmap) {
7cd1e32a 2698 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2699 }
a55eb92c 2700
faea38e7
FB
2701 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2702}
3b46e624 2703
faea38e7
FB
2704int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2705{
2706 BlockDriver *drv = bs->drv;
2707 if (!drv)
19cb3738 2708 return -ENOMEDIUM;
faea38e7
FB
2709 if (!drv->bdrv_get_info)
2710 return -ENOTSUP;
2711 memset(bdi, 0, sizeof(*bdi));
2712 return drv->bdrv_get_info(bs, bdi);
2713}
2714
45566e9c
CH
2715int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2716 int64_t pos, int size)
178e08a5
AL
2717{
2718 BlockDriver *drv = bs->drv;
2719 if (!drv)
2720 return -ENOMEDIUM;
7cdb1f6d
MK
2721 if (drv->bdrv_save_vmstate)
2722 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2723 if (bs->file)
2724 return bdrv_save_vmstate(bs->file, buf, pos, size);
2725 return -ENOTSUP;
178e08a5
AL
2726}
2727
45566e9c
CH
2728int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2729 int64_t pos, int size)
178e08a5
AL
2730{
2731 BlockDriver *drv = bs->drv;
2732 if (!drv)
2733 return -ENOMEDIUM;
7cdb1f6d
MK
2734 if (drv->bdrv_load_vmstate)
2735 return drv->bdrv_load_vmstate(bs, buf, pos, size);
2736 if (bs->file)
2737 return bdrv_load_vmstate(bs->file, buf, pos, size);
2738 return -ENOTSUP;
178e08a5
AL
2739}
2740
8b9b0cc2
KW
2741void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2742{
2743 BlockDriver *drv = bs->drv;
2744
2745 if (!drv || !drv->bdrv_debug_event) {
2746 return;
2747 }
2748
2749 return drv->bdrv_debug_event(bs, event);
2750
2751}
2752
faea38e7
FB
2753/**************************************************************/
2754/* handling of snapshots */
2755
feeee5ac
MDCF
2756int bdrv_can_snapshot(BlockDriverState *bs)
2757{
2758 BlockDriver *drv = bs->drv;
07b70bfb 2759 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
feeee5ac
MDCF
2760 return 0;
2761 }
2762
2763 if (!drv->bdrv_snapshot_create) {
2764 if (bs->file != NULL) {
2765 return bdrv_can_snapshot(bs->file);
2766 }
2767 return 0;
2768 }
2769
2770 return 1;
2771}
2772
199630b6
BS
2773int bdrv_is_snapshot(BlockDriverState *bs)
2774{
2775 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2776}
2777
f9092b10
MA
2778BlockDriverState *bdrv_snapshots(void)
2779{
2780 BlockDriverState *bs;
2781
3ac906f7 2782 if (bs_snapshots) {
f9092b10 2783 return bs_snapshots;
3ac906f7 2784 }
f9092b10
MA
2785
2786 bs = NULL;
2787 while ((bs = bdrv_next(bs))) {
2788 if (bdrv_can_snapshot(bs)) {
3ac906f7
MA
2789 bs_snapshots = bs;
2790 return bs;
f9092b10
MA
2791 }
2792 }
2793 return NULL;
f9092b10
MA
2794}
2795
5fafdf24 2796int bdrv_snapshot_create(BlockDriverState *bs,
faea38e7
FB
2797 QEMUSnapshotInfo *sn_info)
2798{
2799 BlockDriver *drv = bs->drv;
2800 if (!drv)
19cb3738 2801 return -ENOMEDIUM;
7cdb1f6d
MK
2802 if (drv->bdrv_snapshot_create)
2803 return drv->bdrv_snapshot_create(bs, sn_info);
2804 if (bs->file)
2805 return bdrv_snapshot_create(bs->file, sn_info);
2806 return -ENOTSUP;
faea38e7
FB
2807}
2808
5fafdf24 2809int bdrv_snapshot_goto(BlockDriverState *bs,
faea38e7
FB
2810 const char *snapshot_id)
2811{
2812 BlockDriver *drv = bs->drv;
7cdb1f6d
MK
2813 int ret, open_ret;
2814
faea38e7 2815 if (!drv)
19cb3738 2816 return -ENOMEDIUM;
7cdb1f6d
MK
2817 if (drv->bdrv_snapshot_goto)
2818 return drv->bdrv_snapshot_goto(bs, snapshot_id);
2819
2820 if (bs->file) {
2821 drv->bdrv_close(bs);
2822 ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2823 open_ret = drv->bdrv_open(bs, bs->open_flags);
2824 if (open_ret < 0) {
2825 bdrv_delete(bs->file);
2826 bs->drv = NULL;
2827 return open_ret;
2828 }
2829 return ret;
2830 }
2831
2832 return -ENOTSUP;
faea38e7
FB
2833}
2834
2835int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2836{
2837 BlockDriver *drv = bs->drv;
2838 if (!drv)
19cb3738 2839 return -ENOMEDIUM;
7cdb1f6d
MK
2840 if (drv->bdrv_snapshot_delete)
2841 return drv->bdrv_snapshot_delete(bs, snapshot_id);
2842 if (bs->file)
2843 return bdrv_snapshot_delete(bs->file, snapshot_id);
2844 return -ENOTSUP;
faea38e7
FB
2845}
2846
5fafdf24 2847int bdrv_snapshot_list(BlockDriverState *bs,
faea38e7
FB
2848 QEMUSnapshotInfo **psn_info)
2849{
2850 BlockDriver *drv = bs->drv;
2851 if (!drv)
19cb3738 2852 return -ENOMEDIUM;
7cdb1f6d
MK
2853 if (drv->bdrv_snapshot_list)
2854 return drv->bdrv_snapshot_list(bs, psn_info);
2855 if (bs->file)
2856 return bdrv_snapshot_list(bs->file, psn_info);
2857 return -ENOTSUP;
faea38e7
FB
2858}
2859
51ef6727 2860int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2861 const char *snapshot_name)
2862{
2863 BlockDriver *drv = bs->drv;
2864 if (!drv) {
2865 return -ENOMEDIUM;
2866 }
2867 if (!bs->read_only) {
2868 return -EINVAL;
2869 }
2870 if (drv->bdrv_snapshot_load_tmp) {
2871 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2872 }
2873 return -ENOTSUP;
2874}
2875
e8a6bb9c
MT
2876BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2877 const char *backing_file)
2878{
2879 if (!bs->drv) {
2880 return NULL;
2881 }
2882
2883 if (bs->backing_hd) {
2884 if (strcmp(bs->backing_file, backing_file) == 0) {
2885 return bs->backing_hd;
2886 } else {
2887 return bdrv_find_backing_image(bs->backing_hd, backing_file);
2888 }
2889 }
2890
2891 return NULL;
2892}
2893
faea38e7
FB
2894#define NB_SUFFIXES 4
2895
2896char *get_human_readable_size(char *buf, int buf_size, int64_t size)
2897{
2898 static const char suffixes[NB_SUFFIXES] = "KMGT";
2899 int64_t base;
2900 int i;
2901
2902 if (size <= 999) {
2903 snprintf(buf, buf_size, "%" PRId64, size);
2904 } else {
2905 base = 1024;
2906 for(i = 0; i < NB_SUFFIXES; i++) {
2907 if (size < (10 * base)) {
5fafdf24 2908 snprintf(buf, buf_size, "%0.1f%c",
faea38e7
FB
2909 (double)size / base,
2910 suffixes[i]);
2911 break;
2912 } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
5fafdf24 2913 snprintf(buf, buf_size, "%" PRId64 "%c",
faea38e7
FB
2914 ((size + (base >> 1)) / base),
2915 suffixes[i]);
2916 break;
2917 }
2918 base = base * 1024;
2919 }
2920 }
2921 return buf;
2922}
2923
2924char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2925{
2926 char buf1[128], date_buf[128], clock_buf[128];
3b9f94e1
FB
2927#ifdef _WIN32
2928 struct tm *ptm;
2929#else
faea38e7 2930 struct tm tm;
3b9f94e1 2931#endif
faea38e7
FB
2932 time_t ti;
2933 int64_t secs;
2934
2935 if (!sn) {
5fafdf24
TS
2936 snprintf(buf, buf_size,
2937 "%-10s%-20s%7s%20s%15s",
faea38e7
FB
2938 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2939 } else {
2940 ti = sn->date_sec;
3b9f94e1
FB
2941#ifdef _WIN32
2942 ptm = localtime(&ti);
2943 strftime(date_buf, sizeof(date_buf),
2944 "%Y-%m-%d %H:%M:%S", ptm);
2945#else
faea38e7
FB
2946 localtime_r(&ti, &tm);
2947 strftime(date_buf, sizeof(date_buf),
2948 "%Y-%m-%d %H:%M:%S", &tm);
3b9f94e1 2949#endif
faea38e7
FB
2950 secs = sn->vm_clock_nsec / 1000000000;
2951 snprintf(clock_buf, sizeof(clock_buf),
2952 "%02d:%02d:%02d.%03d",
2953 (int)(secs / 3600),
2954 (int)((secs / 60) % 60),
5fafdf24 2955 (int)(secs % 60),
faea38e7
FB
2956 (int)((sn->vm_clock_nsec / 1000000) % 1000));
2957 snprintf(buf, buf_size,
5fafdf24 2958 "%-10s%-20s%7s%20s%15s",
faea38e7
FB
2959 sn->id_str, sn->name,
2960 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2961 date_buf,
2962 clock_buf);
2963 }
2964 return buf;
2965}
2966
ea2384d3 2967/**************************************************************/
83f64091 2968/* async I/Os */
ea2384d3 2969
3b69e4b9 2970BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
f141eafe 2971 QEMUIOVector *qiov, int nb_sectors,
3b69e4b9 2972 BlockDriverCompletionFunc *cb, void *opaque)
83f64091 2973{
bbf0a440
SH
2974 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2975
b2a61371 2976 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
8c5873d6 2977 cb, opaque, false);
ea2384d3
FB
2978}
2979
f141eafe
AL
2980BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2981 QEMUIOVector *qiov, int nb_sectors,
2982 BlockDriverCompletionFunc *cb, void *opaque)
ea2384d3 2983{
bbf0a440
SH
2984 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2985
1a6e115b 2986 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
8c5873d6 2987 cb, opaque, true);
83f64091
FB
2988}
2989
40b4f539
KW
2990
2991typedef struct MultiwriteCB {
2992 int error;
2993 int num_requests;
2994 int num_callbacks;
2995 struct {
2996 BlockDriverCompletionFunc *cb;
2997 void *opaque;
2998 QEMUIOVector *free_qiov;
40b4f539
KW
2999 } callbacks[];
3000} MultiwriteCB;
3001
3002static void multiwrite_user_cb(MultiwriteCB *mcb)
3003{
3004 int i;
3005
3006 for (i = 0; i < mcb->num_callbacks; i++) {
3007 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1e1ea48d
SH
3008 if (mcb->callbacks[i].free_qiov) {
3009 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
3010 }
7267c094 3011 g_free(mcb->callbacks[i].free_qiov);
40b4f539
KW
3012 }
3013}
3014
3015static void multiwrite_cb(void *opaque, int ret)
3016{
3017 MultiwriteCB *mcb = opaque;
3018
6d519a5f
SH
3019 trace_multiwrite_cb(mcb, ret);
3020
cb6d3ca0 3021 if (ret < 0 && !mcb->error) {
40b4f539 3022 mcb->error = ret;
40b4f539
KW
3023 }
3024
3025 mcb->num_requests--;
3026 if (mcb->num_requests == 0) {
de189a1b 3027 multiwrite_user_cb(mcb);
7267c094 3028 g_free(mcb);
40b4f539
KW
3029 }
3030}
3031
3032static int multiwrite_req_compare(const void *a, const void *b)
3033{
77be4366
CH
3034 const BlockRequest *req1 = a, *req2 = b;
3035
3036 /*
3037 * Note that we can't simply subtract req2->sector from req1->sector
3038 * here as that could overflow the return value.
3039 */
3040 if (req1->sector > req2->sector) {
3041 return 1;
3042 } else if (req1->sector < req2->sector) {
3043 return -1;
3044 } else {
3045 return 0;
3046 }
40b4f539
KW
3047}
3048
3049/*
3050 * Takes a bunch of requests and tries to merge them. Returns the number of
3051 * requests that remain after merging.
3052 */
3053static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3054 int num_reqs, MultiwriteCB *mcb)
3055{
3056 int i, outidx;
3057
3058 // Sort requests by start sector
3059 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3060
3061 // Check if adjacent requests touch the same clusters. If so, combine them,
3062 // filling up gaps with zero sectors.
3063 outidx = 0;
3064 for (i = 1; i < num_reqs; i++) {
3065 int merge = 0;
3066 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3067
b6a127a1 3068 // Handle exactly sequential writes and overlapping writes.
40b4f539
KW
3069 if (reqs[i].sector <= oldreq_last) {
3070 merge = 1;
3071 }
3072
e2a305fb
CH
3073 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3074 merge = 0;
3075 }
3076
40b4f539
KW
3077 if (merge) {
3078 size_t size;
7267c094 3079 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
40b4f539
KW
3080 qemu_iovec_init(qiov,
3081 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3082
3083 // Add the first request to the merged one. If the requests are
3084 // overlapping, drop the last sectors of the first request.
3085 size = (reqs[i].sector - reqs[outidx].sector) << 9;
3086 qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
3087
b6a127a1
PB
3088 // We should need to add any zeros between the two requests
3089 assert (reqs[i].sector <= oldreq_last);
40b4f539
KW
3090
3091 // Add the second request
3092 qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
3093
cbf1dff2 3094 reqs[outidx].nb_sectors = qiov->size >> 9;
40b4f539
KW
3095 reqs[outidx].qiov = qiov;
3096
3097 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3098 } else {
3099 outidx++;
3100 reqs[outidx].sector = reqs[i].sector;
3101 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3102 reqs[outidx].qiov = reqs[i].qiov;
3103 }
3104 }
3105
3106 return outidx + 1;
3107}
3108
/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In error case this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure; one callback slot per original request,
    // even if requests are merged below (each caller still gets its callback).
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests; may shrink num_reqs.
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /* Run the aio requests. num_requests is set before the loop so the
     * completion callback can tell when the last request finishes. */
    mcb->num_requests = num_reqs;
    for (i = 0; i < num_reqs; i++) {
        bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
                        reqs[i].nb_sectors, multiwrite_cb, mcb);
    }

    return 0;
}
3164
/* Cancel a pending AIO request by delegating to its pool's cancel hook. */
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->pool->cancel(acb);
}
3169
98f90dba
ZYW
3170/* block I/O throttling */
3171static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3172 bool is_write, double elapsed_time, uint64_t *wait)
3173{
3174 uint64_t bps_limit = 0;
3175 double bytes_limit, bytes_base, bytes_res;
3176 double slice_time, wait_time;
3177
3178 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3179 bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3180 } else if (bs->io_limits.bps[is_write]) {
3181 bps_limit = bs->io_limits.bps[is_write];
3182 } else {
3183 if (wait) {
3184 *wait = 0;
3185 }
3186
3187 return false;
3188 }
3189
3190 slice_time = bs->slice_end - bs->slice_start;
3191 slice_time /= (NANOSECONDS_PER_SECOND);
3192 bytes_limit = bps_limit * slice_time;
3193 bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3194 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3195 bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3196 }
3197
3198 /* bytes_base: the bytes of data which have been read/written; and
3199 * it is obtained from the history statistic info.
3200 * bytes_res: the remaining bytes of data which need to be read/written.
3201 * (bytes_base + bytes_res) / bps_limit: used to calcuate
3202 * the total time for completing reading/writting all data.
3203 */
3204 bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
3205
3206 if (bytes_base + bytes_res <= bytes_limit) {
3207 if (wait) {
3208 *wait = 0;
3209 }
3210
3211 return false;
3212 }
3213
3214 /* Calc approx time to dispatch */
3215 wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3216
3217 /* When the I/O rate at runtime exceeds the limits,
3218 * bs->slice_end need to be extended in order that the current statistic
3219 * info can be kept until the timer fire, so it is increased and tuned
3220 * based on the result of experiment.
3221 */
3222 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3223 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3224 if (wait) {
3225 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3226 }
3227
3228 return true;
3229}
3230
3231static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3232 double elapsed_time, uint64_t *wait)
3233{
3234 uint64_t iops_limit = 0;
3235 double ios_limit, ios_base;
3236 double slice_time, wait_time;
3237
3238 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3239 iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3240 } else if (bs->io_limits.iops[is_write]) {
3241 iops_limit = bs->io_limits.iops[is_write];
3242 } else {
3243 if (wait) {
3244 *wait = 0;
3245 }
3246
3247 return false;
3248 }
3249
3250 slice_time = bs->slice_end - bs->slice_start;
3251 slice_time /= (NANOSECONDS_PER_SECOND);
3252 ios_limit = iops_limit * slice_time;
3253 ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3254 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3255 ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
3256 }
3257
3258 if (ios_base + 1 <= ios_limit) {
3259 if (wait) {
3260 *wait = 0;
3261 }
3262
3263 return false;
3264 }
3265
3266 /* Calc approx time to dispatch */
3267 wait_time = (ios_base + 1) / iops_limit;
3268 if (wait_time > elapsed_time) {
3269 wait_time = wait_time - elapsed_time;
3270 } else {
3271 wait_time = 0;
3272 }
3273
3274 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3275 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3276 if (wait) {
3277 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3278 }
3279
3280 return true;
3281}
3282
/* Combined throttling check: returns true and sets *wait to the larger of the
 * bps/iops delays when the request must be postponed; otherwise returns false
 * with *wait = 0.  Also (re)starts the accounting slice as needed. */
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
                                  bool is_write, int64_t *wait)
{
    int64_t  now, max_wait;
    uint64_t bps_wait = 0, iops_wait = 0;
    double   elapsed_time;
    int      bps_ret, iops_ret;

    now = qemu_get_clock_ns(vm_clock);
    if ((bs->slice_start < now)
        && (bs->slice_end > now)) {
        /* Still inside the current slice: just push its end out. */
        bs->slice_end = now + bs->slice_time;
    } else {
        /* Start a new slice and snapshot the byte/op counters as its base. */
        bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
        bs->slice_start = now;
        bs->slice_end   = now + bs->slice_time;

        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];

        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
    }

    elapsed_time  = now - bs->slice_start;
    elapsed_time  /= (NANOSECONDS_PER_SECOND);

    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
                                      is_write, elapsed_time, &bps_wait);
    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
                                      elapsed_time, &iops_wait);
    if (bps_ret || iops_ret) {
        /* Throttled: wait for whichever limit demands the longer delay. */
        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
        if (wait) {
            *wait = max_wait;
        }

        now = qemu_get_clock_ns(vm_clock);
        if (bs->slice_end < now + max_wait) {
            bs->slice_end = now + max_wait;
        }

        return true;
    }

    if (wait) {
        *wait = 0;
    }

    return false;
}
ce1a14dc 3334
83f64091
FB
3335/**************************************************************/
3336/* async block device emulation */
3337
c16b5a2c
CH
/* AIOCB for emulating async I/O on top of a driver's synchronous
 * bdrv_read/bdrv_write, completed via a bottom half. */
typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;             /* bottom half that delivers the completion */
    int ret;                /* result of the synchronous read/write */
    /* vector translation state */
    QEMUIOVector *qiov;     /* caller's scatter/gather list */
    uint8_t *bounce;        /* linear bounce buffer of qiov->size bytes */
    int is_write;           /* non-zero for writes */
} BlockDriverAIOCBSync;
3347
/* Cancel an emulated (sync-backed) AIOCB: the sync I/O already ran, so
 * cancelling just means dropping the pending completion bottom half. */
static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}
3356
/* AIOCB pool for the sync-emulation path above. */
static AIOPool bdrv_em_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};
3361
ce1a14dc 3362static void bdrv_aio_bh_cb(void *opaque)
83f64091 3363{
ce1a14dc 3364 BlockDriverAIOCBSync *acb = opaque;
f141eafe 3365
f141eafe
AL
3366 if (!acb->is_write)
3367 qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
ceb42de8 3368 qemu_vfree(acb->bounce);
ce1a14dc 3369 acb->common.cb(acb->common.opaque, acb->ret);
6a7ad299 3370 qemu_bh_delete(acb->bh);
36afc451 3371 acb->bh = NULL;
ce1a14dc 3372 qemu_aio_release(acb);
83f64091 3373}
beac80cd 3374
f141eafe
AL
/* Emulate an async vectored read/write using the driver's synchronous
 * bdrv_read/bdrv_write plus a bounce buffer; completion is delivered
 * through a bottom half so the callback never runs before we return. */
static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)

{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);
    acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        /* Linearize the iovec into the bounce buffer, then write it. */
        qemu_iovec_to_buffer(acb->qiov, acb->bounce);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        /* Read into the bounce buffer; the BH copies it back to the iovec. */
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}
3403
f141eafe
AL
/* Emulated async read: thin wrapper over bdrv_aio_rw_vector(is_write=0). */
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}
83f64091 3410
f141eafe
AL
/* Emulated async write: thin wrapper over bdrv_aio_rw_vector(is_write=1). */
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}
beac80cd 3417
68485420
KW
3418
/* AIOCB for requests executed inside a coroutine; completion is delivered
 * via a bottom half once the coroutine has filled in req.error. */
typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;       /* sector/qiov/nb_sectors plus the result (error) */
    bool is_write;
    QEMUBH* bh;
} BlockDriverAIOCBCoroutine;
3425
/* Coroutine AIOCBs cannot be cancelled individually; drain all outstanding
 * AIO instead so the request is guaranteed to have completed on return. */
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    qemu_aio_flush();
}
3430
/* AIOCB pool for the coroutine-based emulation path. */
static AIOPool bdrv_em_co_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};
3435
/* Bottom half run after the request coroutine finishes: deliver the result
 * to the caller's callback and release the AIOCB. */
static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);
    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}
3444
b2a61371
SH
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
/* Coroutine entry point: perform the read or write described in acb->req,
 * store the result in req.error, then schedule the completion BH. */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    }

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
3462
68485420
KW
/* Start an async read/write by spawning a coroutine that runs bdrv_co_do_rw;
 * cb is invoked from a bottom half once the coroutine completes. */
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
3485
/* Coroutine entry for bdrv_aio_flush: run the flush and schedule the BH. */
static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
3495
/* Asynchronous flush: wraps bdrv_co_flush in a coroutine and completes
 * through the usual coroutine-AIOCB bottom half. */
BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
3510
4265d620
PB
/* Coroutine entry for bdrv_aio_discard: run the discard, then schedule BH. */
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
3520
/* Asynchronous discard of nb_sectors starting at sector_num, implemented by
 * running bdrv_co_discard in a coroutine. */
BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
3538
ea2384d3
FB
/* Register all built-in block drivers (module init hook). */
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}
ce1a14dc 3543
eb852011
MA
/* Like bdrv_init(), but restrict format probing to the whitelist. */
void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}
3549
c16b5a2c
CH
/* Allocate an AIOCB from the pool's free list (or the heap on a miss) and
 * initialize the common fields.  Returned memory is zeroed only on the
 * first allocation; reused AIOCBs keep their previous pool-specific state. */
void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    if (pool->free_aiocb) {
        acb = pool->free_aiocb;
        pool->free_aiocb = acb->next;
    } else {
        acb = g_malloc0(pool->aiocb_size);
        acb->pool = pool;
    }
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}
3567
/* Return an AIOCB to its pool's free list (never freed back to the heap). */
void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
    AIOPool *pool = acb->pool;
    acb->next = pool->free_aiocb;
    pool->free_aiocb = acb;
}
19cb3738 3575
f9f05dc5
KW
3576/**************************************************************/
3577/* Coroutine block device emulation */
3578
/* Rendezvous between an AIO completion callback and a waiting coroutine. */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;   /* coroutine to re-enter on completion */
    int ret;                /* result passed to the completion callback */
} CoroutineIOCompletion;
3583
/* AIO completion callback: record the result and wake the waiting coroutine. */
static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}
3591
/* Emulate a coroutine read/write on top of the driver's callback-based AIO:
 * submit the request, yield until bdrv_co_io_em_complete wakes us, and
 * return its result.  Returns -EIO if the driver refuses the request. */
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}
3617
/* Coroutine read emulated over callback AIO. */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}
3624
/* Coroutine write emulated over callback AIO. */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}
3631
/* Coroutine trampoline used by the synchronous bdrv_flush wrapper. */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}
3638
/* Flush cached data for bs, honouring the cache mode:
 *  1. always write back format-driver caches to the OS (even cache=unsafe);
 *  2. stop there for cache=unsafe (BDRV_O_NO_FLUSH);
 *  3. otherwise force data to disk via the best available driver hook;
 *  4. finally recurse into the underlying protocol (bs->file).
 * Returns 0 on success or a negative errno. */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    /* Nothing to flush without a medium, and read-only images are clean. */
    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        return 0;
    }

    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        /* Fall back to the callback-AIO flush: submit and yield until done. */
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
    return bdrv_co_flush(bs->file);
}
3698
0f15423c
AL
/* Drop any metadata the driver cached, if it supports doing so. */
void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}
3705
/* Invalidate cached metadata on every open BlockDriverState. */
void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}
3714
07789269
BC
/* Clear the BDRV_O_INCOMING flag on all devices once incoming migration
 * has finished. */
void bdrv_clear_incoming_migration_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
    }
}
3723
07f07615
PB
/* Synchronous flush wrapper: runs bdrv_co_flush, either inline when already
 * in coroutine context or via a new coroutine while pumping AIO events. */
int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}
3745
4265d620
PB
/* Coroutine trampoline used by the synchronous bdrv_discard wrapper. */
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}
3752
/* Discard (unmap) nb_sectors starting at sector_num.  Prefers the driver's
 * coroutine hook, falls back to callback AIO, and silently succeeds when the
 * driver supports neither (discard is only a hint). */
int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    } else if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    } else if (bs->drv->bdrv_aio_discard) {
        /* Submit through callback AIO and yield until it completes. */
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        return 0;
    }
}
3782
/* Synchronous discard wrapper around bdrv_co_discard (same pattern as
 * bdrv_flush: run inline in coroutine context, else spawn and wait). */
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}
3806
19cb3738
FB
3807/**************************************************************/
3808/* removable device support */
3809
3810/**
3811 * Return TRUE if the media is present
3812 */
3813int bdrv_is_inserted(BlockDriverState *bs)
3814{
3815 BlockDriver *drv = bs->drv;
a1aff5bf 3816
19cb3738
FB
3817 if (!drv)
3818 return 0;
3819 if (!drv->bdrv_is_inserted)
a1aff5bf
MA
3820 return 1;
3821 return drv->bdrv_is_inserted(bs);
19cb3738
FB
3822}
3823
/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know. Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}
3837
/**
 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
 */
void bdrv_eject(BlockDriverState *bs, bool eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }

    /* Emit the QMP event only for named (guest-visible) devices. */
    if (bs->device_name[0] != '\0') {
        bdrv_emit_qmp_eject_event(bs, eject_flag);
    }
}
3853
19cb3738
FB
/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}
985a03b0
TS
3868
3869/* needed for generic scsi interface */
3870
3871int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3872{
3873 BlockDriver *drv = bs->drv;
3874
3875 if (drv && drv->bdrv_ioctl)
3876 return drv->bdrv_ioctl(bs, req, buf);
3877 return -ENOTSUP;
3878}
7d780669 3879
221f715d
AL
/* Asynchronous ioctl passthrough; NULL when the driver has no aio hook. */
BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl)
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    return NULL;
}
e268ca52 3890
7b6f9300
MA
/* Record the buffer alignment (bytes) that qemu_blockalign should honour. */
void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}
7cd1e32a 3895
e268ca52
AL
/* Allocate a buffer aligned for I/O on bs (512-byte alignment when bs is
 * NULL or has no alignment requirement recorded). */
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
}
7cd1e32a 3900
/* Enable or disable dirty-sector tracking.  Enabling allocates a bitmap with
 * one bit per BDRV_SECTORS_PER_DIRTY_CHUNK sectors; disabling frees it.
 * The dirty count is reset in both cases. */
void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
{
    int64_t bitmap_size;

    bs->dirty_count = 0;
    if (enable) {
        if (!bs->dirty_bitmap) {
            /* Round up to a whole number of unsigned-long words of bits. */
            bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
                    BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1;
            bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;

            bs->dirty_bitmap = g_new0(unsigned long, bitmap_size);
        }
    } else {
        if (bs->dirty_bitmap) {
            g_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
    }
}
3921
/* Return non-zero if the chunk containing 'sector' is marked dirty.
 * Out-of-range sectors (or disabled tracking) read as clean. */
int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (bs->dirty_bitmap &&
        (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
        return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}
3934
a55eb92c
JK
/* Clear the dirty bits covering [cur_sector, cur_sector + nr_sectors). */
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
}
aaa0eb75
LS
3940
/* Number of dirty chunks currently set in the dirty bitmap. */
int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    return bs->dirty_count;
}
f88e1a42 3945
db593f25
MT
/* Mark/unmark the device as claimed by a background job; asserts against
 * redundant transitions (double-claim or double-release). */
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}
3951
/* Whether the device is currently claimed via bdrv_set_in_use(). */
int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}
3956
28a7282a
LC
/* Enable I/O status reporting and reset it to OK. */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}
3962
/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
            bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
            bs->on_read_error == BLOCK_ERR_STOP_ANY));
}
3972
/* Turn off I/O status reporting for this device. */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
3977
/* Reset the I/O status to OK (only when reporting is enabled). */
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    }
}
3984
/* XXX: Today this is set by device models because it makes the implementation
   quite simple. However, the block layer knows about the error, so it's
   possible to implement this without device models being involved */
/* Record the first error: map ENOSPC to NOSPACE, anything else to FAILED;
 * later errors do not overwrite an already-set status. */
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    if (bdrv_iostatus_is_enabled(bs) &&
        bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        assert(error >= 0);
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}
3997
a597e79c
CH
/* Begin accounting for one I/O: record byte count, start timestamp and type
 * in the caller-provided cookie (paired with bdrv_acct_done). */
void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}
4008
/* Finish accounting for one I/O: fold the cookie's byte count, op count and
 * elapsed time into the per-device statistics. */
void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}
4018
f88e1a42
JS
/* Create a disk image.
 *
 * filename/fmt select the output; base_filename/base_fmt optionally select a
 * backing file; options is a comma-separated "-o" style string; img_size is
 * the requested size in bytes (may be derived from the backing file when not
 * given); flags are BDRV_O_* open flags used when probing the backing file.
 * Returns 0 on success or a negative errno (errors also reported via
 * error_report). */
int bdrv_img_create(const char *filename, const char *fmt,
                    const char *base_filename, const char *base_fmt,
                    char *options, uint64_t img_size, int flags)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriverState *bs = NULL;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_report("Unknown file format '%s'", fmt);
        ret = -EINVAL;
        goto out;
    }

    proto_drv = bdrv_find_protocol(filename);
    if (!proto_drv) {
        error_report("Unknown protocol '%s'", filename);
        ret = -EINVAL;
        goto out;
    }

    /* Creation options come from both the format and the protocol driver. */
    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_report("Invalid options for file format '%s'.", fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_report("Backing file not supported for file format '%s'",
                         fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_report("Backing file format not supported for file "
                         "format '%s'", fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_report("Error: Trying to create an image with the "
                         "same filename as the backing file");
            ret = -EINVAL;
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_report("Unknown backing file format '%s'",
                         backing_fmt->value.s);
            ret = -EINVAL;
            goto out;
        }
    }

    // The size for the image must always be specified, with one exception:
    // If we are using a backing file, we can obtain the size from there
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            /* NOTE(review): this inner 'size' shadows the outer
             * QEMUOptionParameter *size above — consider renaming. */
            uint64_t size;
            char buf[32];
            int back_flags;

            /* backing files always opened read-only */
            back_flags =
                flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, back_flags, backing_drv);
            if (ret < 0) {
                error_report("Could not open '%s'", backing_file->value.s);
                goto out;
            }
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRId64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
        } else {
            error_report("Image creation needs a size parameter");
            ret = -EINVAL;
            goto out;
        }
    }

    printf("Formatting '%s', fmt=%s ", filename, fmt);
    print_option_parameters(param);
    puts("");

    ret = bdrv_create(drv, filename, param);

    if (ret < 0) {
        if (ret == -ENOTSUP) {
            error_report("Formatting or formatting option not supported for "
                         "file format '%s'", fmt);
        } else if (ret == -EFBIG) {
            error_report("The image size is too large for file format '%s'",
                         fmt);
        } else {
            error_report("%s: error while creating %s: %s", filename, fmt,
                         strerror(-ret));
        }
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (bs) {
        bdrv_delete(bs);
    }

    return ret;
}
eeec61f2
SH
4166
/* Allocate and start tracking a background block job on bs.
 * Fails (returns NULL, sets errp) when bs already has a job or is in use.
 * On success the device is marked in-use and bs->job points at the new job;
 * a non-zero speed is applied immediately and may also fail the creation. */
void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
                       int64_t speed, BlockDriverCompletionFunc *cb,
                       void *opaque, Error **errp)
{
    BlockJob *job;

    if (bs->job || bdrv_in_use(bs)) {
        error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
        return NULL;
    }
    bdrv_set_in_use(bs, 1);

    job = g_malloc0(job_type->instance_size);
    job->job_type = job_type;
    job->bs = bs;
    job->cb = cb;
    job->opaque = opaque;
    job->busy = true;
    bs->job = job;

    /* Only set speed when necessary to avoid NotSupported error */
    if (speed != 0) {
        Error *local_err = NULL;

        block_job_set_speed(job, speed, &local_err);
        if (error_is_set(&local_err)) {
            /* Undo everything done above before reporting the failure. */
            bs->job = NULL;
            g_free(job);
            bdrv_set_in_use(bs, 0);
            error_propagate(errp, local_err);
            return NULL;
        }
    }
    return job;
}
4202
4203void block_job_complete(BlockJob *job, int ret)
4204{
4205 BlockDriverState *bs = job->bs;
4206
4207 assert(bs->job == job);
4208 job->cb(job->opaque, ret);
4209 bs->job = NULL;
4210 g_free(job);
4211 bdrv_set_in_use(bs, 0);
4212}
4213
882ec7ce 4214void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
eeec61f2 4215{
9e6636c7 4216 Error *local_err = NULL;
9f25eccc 4217
eeec61f2 4218 if (!job->job_type->set_speed) {
9e6636c7
SH
4219 error_set(errp, QERR_NOT_SUPPORTED);
4220 return;
eeec61f2 4221 }
882ec7ce 4222 job->job_type->set_speed(job, speed, &local_err);
9e6636c7
SH
4223 if (error_is_set(&local_err)) {
4224 error_propagate(errp, local_err);
4225 return;
9f25eccc 4226 }
9e6636c7 4227
882ec7ce 4228 job->speed = speed;
eeec61f2
SH
4229}
4230
4231void block_job_cancel(BlockJob *job)
4232{
4233 job->cancelled = true;
fa4478d5
PB
4234 if (job->co && !job->busy) {
4235 qemu_coroutine_enter(job->co, NULL);
4236 }
eeec61f2
SH
4237}
4238
4239bool block_job_is_cancelled(BlockJob *job)
4240{
4241 return job->cancelled;
4242}
3e914655 4243
fa4478d5
PB
4244struct BlockCancelData {
4245 BlockJob *job;
4246 BlockDriverCompletionFunc *cb;
4247 void *opaque;
4248 bool cancelled;
4249 int ret;
4250};
4251
4252static void block_job_cancel_cb(void *opaque, int ret)
3e914655 4253{
fa4478d5
PB
4254 struct BlockCancelData *data = opaque;
4255
4256 data->cancelled = block_job_is_cancelled(data->job);
4257 data->ret = ret;
4258 data->cb(data->opaque, ret);
4259}
4260
4261int block_job_cancel_sync(BlockJob *job)
4262{
4263 struct BlockCancelData data;
3e914655
PB
4264 BlockDriverState *bs = job->bs;
4265
4266 assert(bs->job == job);
fa4478d5
PB
4267
4268 /* Set up our own callback to store the result and chain to
4269 * the original callback.
4270 */
4271 data.job = job;
4272 data.cb = job->cb;
4273 data.opaque = job->opaque;
4274 data.ret = -EINPROGRESS;
4275 job->cb = block_job_cancel_cb;
4276 job->opaque = &data;
3e914655 4277 block_job_cancel(job);
fa4478d5 4278 while (data.ret == -EINPROGRESS) {
3e914655
PB
4279 qemu_aio_wait();
4280 }
fa4478d5 4281 return (data.cancelled && data.ret == 0) ? -ECANCELED : data.ret;
3e914655 4282}
4513eafe
PB
4283
4284void block_job_sleep_ns(BlockJob *job, QEMUClock *clock, int64_t ns)
4285{
4286 /* Check cancellation *before* setting busy = false, too! */
4287 if (!block_job_is_cancelled(job)) {
4288 job->busy = false;
4289 co_sleep_ns(clock, ns);
4290 job->busy = true;
4291 }
4292}