]> git.proxmox.com Git - qemu.git/blame - block.c
block: Drain requests in bdrv_close
[qemu.git] / block.c
CommitLineData
fc01f7e7
FB
1/*
2 * QEMU System Emulator block driver
5fafdf24 3 *
fc01f7e7 4 * Copyright (c) 2003 Fabrice Bellard
5fafdf24 5 *
fc01f7e7
FB
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
3990d09a 24#include "config-host.h"
faf07963 25#include "qemu-common.h"
6d519a5f 26#include "trace.h"
376253ec 27#include "monitor.h"
ea2384d3 28#include "block_int.h"
5efa9d5a 29#include "module.h"
f795e743 30#include "qjson.h"
68485420 31#include "qemu-coroutine.h"
b2023818 32#include "qmp-commands.h"
0563e191 33#include "qemu-timer.h"
fc01f7e7 34
71e72a19 35#ifdef CONFIG_BSD
7674e7bf
FB
36#include <sys/types.h>
37#include <sys/stat.h>
38#include <sys/ioctl.h>
72cf2d4f 39#include <sys/queue.h>
c5e97233 40#ifndef __DragonFly__
7674e7bf
FB
41#include <sys/disk.h>
42#endif
c5e97233 43#endif
7674e7bf 44
49dc768d
AL
45#ifdef _WIN32
46#include <windows.h>
47#endif
48
1c9805a3
SH
49#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
50
470c0504
SH
51typedef enum {
52 BDRV_REQ_COPY_ON_READ = 0x1,
f08f2dda 53 BDRV_REQ_ZERO_WRITE = 0x2,
470c0504
SH
54} BdrvRequestFlags;
55
7d4b4ba5 56static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
f141eafe
AL
57static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
58 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
c87c0672 59 BlockDriverCompletionFunc *cb, void *opaque);
f141eafe
AL
60static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
61 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 62 BlockDriverCompletionFunc *cb, void *opaque);
f9f05dc5
KW
63static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
64 int64_t sector_num, int nb_sectors,
65 QEMUIOVector *iov);
66static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
67 int64_t sector_num, int nb_sectors,
68 QEMUIOVector *iov);
c5fbe571 69static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
470c0504
SH
70 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
71 BdrvRequestFlags flags);
1c9805a3 72static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
f08f2dda
SH
73 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74 BdrvRequestFlags flags);
b2a61371
SH
75static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
76 int64_t sector_num,
77 QEMUIOVector *qiov,
78 int nb_sectors,
79 BlockDriverCompletionFunc *cb,
80 void *opaque,
8c5873d6 81 bool is_write);
b2a61371 82static void coroutine_fn bdrv_co_do_rw(void *opaque);
ec530c81 83
98f90dba
ZYW
84static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
85 bool is_write, double elapsed_time, uint64_t *wait);
86static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
87 double elapsed_time, uint64_t *wait);
88static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
89 bool is_write, int64_t *wait);
90
1b7bdbc1
SH
91static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92 QTAILQ_HEAD_INITIALIZER(bdrv_states);
7ee930d0 93
8a22f02a
SH
94static QLIST_HEAD(, BlockDriver) bdrv_drivers =
95 QLIST_HEAD_INITIALIZER(bdrv_drivers);
ea2384d3 96
f9092b10
MA
97/* The device to use for VM snapshots */
98static BlockDriverState *bs_snapshots;
99
eb852011
MA
100/* If non-zero, use only whitelisted block drivers */
101static int use_bdrv_whitelist;
102
9e0b22f4
SH
103#ifdef _WIN32
/* Return non-zero when @filename begins with a DOS-style drive letter
 * followed by ':' (e.g. "c:" or "Z:foo"). */
static int is_windows_drive_prefix(const char *filename)
{
    int is_letter = (filename[0] >= 'a' && filename[0] <= 'z') ||
                    (filename[0] >= 'A' && filename[0] <= 'Z');

    return is_letter && filename[1] == ':';
}
110
111int is_windows_drive(const char *filename)
112{
113 if (is_windows_drive_prefix(filename) &&
114 filename[2] == '\0')
115 return 1;
116 if (strstart(filename, "\\\\.\\", NULL) ||
117 strstart(filename, "//./", NULL))
118 return 1;
119 return 0;
120}
121#endif
122
0563e191 123/* throttling disk I/O limits */
98f90dba
ZYW
124void bdrv_io_limits_disable(BlockDriverState *bs)
125{
126 bs->io_limits_enabled = false;
127
128 while (qemu_co_queue_next(&bs->throttled_reqs));
129
130 if (bs->block_timer) {
131 qemu_del_timer(bs->block_timer);
132 qemu_free_timer(bs->block_timer);
133 bs->block_timer = NULL;
134 }
135
136 bs->slice_start = 0;
137 bs->slice_end = 0;
138 bs->slice_time = 0;
139 memset(&bs->io_base, 0, sizeof(bs->io_base));
140}
141
0563e191
ZYW
142static void bdrv_block_timer(void *opaque)
143{
144 BlockDriverState *bs = opaque;
145
146 qemu_co_queue_next(&bs->throttled_reqs);
147}
148
149void bdrv_io_limits_enable(BlockDriverState *bs)
150{
151 qemu_co_queue_init(&bs->throttled_reqs);
152 bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
153 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
154 bs->slice_start = qemu_get_clock_ns(vm_clock);
155 bs->slice_end = bs->slice_start + bs->slice_time;
156 memset(&bs->io_base, 0, sizeof(bs->io_base));
157 bs->io_limits_enabled = true;
158}
159
160bool bdrv_io_limits_enabled(BlockDriverState *bs)
161{
162 BlockIOLimit *io_limits = &bs->io_limits;
163 return io_limits->bps[BLOCK_IO_LIMIT_READ]
164 || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
165 || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
166 || io_limits->iops[BLOCK_IO_LIMIT_READ]
167 || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
168 || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
169}
170
98f90dba
ZYW
171static void bdrv_io_limits_intercept(BlockDriverState *bs,
172 bool is_write, int nb_sectors)
173{
174 int64_t wait_time = -1;
175
176 if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
177 qemu_co_queue_wait(&bs->throttled_reqs);
178 }
179
180 /* In fact, we hope to keep each request's timing, in FIFO mode. The next
181 * throttled requests will not be dequeued until the current request is
182 * allowed to be serviced. So if the current request still exceeds the
183 * limits, it will be inserted to the head. All requests followed it will
184 * be still in throttled_reqs queue.
185 */
186
187 while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
188 qemu_mod_timer(bs->block_timer,
189 wait_time + qemu_get_clock_ns(vm_clock));
190 qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
191 }
192
193 qemu_co_queue_next(&bs->throttled_reqs);
194}
195
9e0b22f4
SH
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
    const char *colon;

#ifdef _WIN32
    /* A Windows drive letter or device path is not a protocol prefix. */
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
#endif

    colon = strchr(path, ':');
    return colon != NULL;
}
208
/* Return non-zero when @path is absolute.  A "<protocol>:" prefix is
 * skipped before testing for a leading separator. */
int path_is_absolute(const char *path)
{
    const char *p;

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (*path == '/' || *path == '\\') {
        return 1;
    }
#endif

    p = strchr(path, ':');
    p = p ? p + 1 : path;

#ifdef _WIN32
    return (*p == '/' || *p == '\\');
#else
    return (*p == '/');
#endif
}
228
83f64091
FB
/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URL are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *proto_end, *last_sep;
    int prefix_len;

    if (dest_size <= 0) {
        return;
    }
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
        return;
    }

    /* Skip a "<protocol>:" prefix of base_path, if present. */
    proto_end = strchr(base_path, ':');
    proto_end = proto_end ? proto_end + 1 : base_path;

    /* Find the last directory separator in base_path. */
    last_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *bslash = strrchr(base_path, '\\');
        if (!last_sep || bslash > last_sep) {
            last_sep = bslash;
        }
    }
#endif
    last_sep = last_sep ? last_sep + 1 : base_path;

    /* Keep whichever cut point is further right. */
    if (last_sep < proto_end) {
        last_sep = proto_end;
    }

    prefix_len = last_sep - base_path;
    if (prefix_len > dest_size - 1) {
        prefix_len = dest_size - 1;
    }
    memcpy(dest, base_path, prefix_len);
    dest[prefix_len] = '\0';
    pstrcat(dest, dest_size, filename);
}
272
5efa9d5a 273void bdrv_register(BlockDriver *bdrv)
ea2384d3 274{
8c5873d6
SH
275 /* Block drivers without coroutine functions need emulation */
276 if (!bdrv->bdrv_co_readv) {
f9f05dc5
KW
277 bdrv->bdrv_co_readv = bdrv_co_readv_em;
278 bdrv->bdrv_co_writev = bdrv_co_writev_em;
279
f8c35c1d
SH
280 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
281 * the block driver lacks aio we need to emulate that too.
282 */
f9f05dc5
KW
283 if (!bdrv->bdrv_aio_readv) {
284 /* add AIO emulation layer */
285 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
286 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
f9f05dc5 287 }
83f64091 288 }
b2e12bc6 289
8a22f02a 290 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
ea2384d3 291}
b338082b
FB
292
293/* create a new block device (by default it is empty) */
294BlockDriverState *bdrv_new(const char *device_name)
295{
1b7bdbc1 296 BlockDriverState *bs;
b338082b 297
7267c094 298 bs = g_malloc0(sizeof(BlockDriverState));
b338082b 299 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
ea2384d3 300 if (device_name[0] != '\0') {
1b7bdbc1 301 QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
ea2384d3 302 }
28a7282a 303 bdrv_iostatus_disable(bs);
b338082b
FB
304 return bs;
305}
306
ea2384d3
FB
307BlockDriver *bdrv_find_format(const char *format_name)
308{
309 BlockDriver *drv1;
8a22f02a
SH
310 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
311 if (!strcmp(drv1->format_name, format_name)) {
ea2384d3 312 return drv1;
8a22f02a 313 }
ea2384d3
FB
314 }
315 return NULL;
316}
317
eb852011
MA
318static int bdrv_is_whitelisted(BlockDriver *drv)
319{
320 static const char *whitelist[] = {
321 CONFIG_BDRV_WHITELIST
322 };
323 const char **p;
324
325 if (!whitelist[0])
326 return 1; /* no whitelist, anything goes */
327
328 for (p = whitelist; *p; p++) {
329 if (!strcmp(drv->format_name, *p)) {
330 return 1;
331 }
332 }
333 return 0;
334}
335
336BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
337{
338 BlockDriver *drv = bdrv_find_format(format_name);
339 return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
340}
341
0e7e1989
KW
342int bdrv_create(BlockDriver *drv, const char* filename,
343 QEMUOptionParameter *options)
ea2384d3
FB
344{
345 if (!drv->bdrv_create)
346 return -ENOTSUP;
0e7e1989
KW
347
348 return drv->bdrv_create(filename, options);
ea2384d3
FB
349}
350
84a12e66
CH
351int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
352{
353 BlockDriver *drv;
354
b50cbabc 355 drv = bdrv_find_protocol(filename);
84a12e66 356 if (drv == NULL) {
16905d71 357 return -ENOENT;
84a12e66
CH
358 }
359
360 return bdrv_create(drv, filename, options);
361}
362
#ifdef _WIN32
/* Fill @filename (at most @size bytes) with the name of a freshly
 * created temporary file. */
void get_tmp_filename(char *filename, int size)
{
    char temp_dir[MAX_PATH];

    GetTempPath(MAX_PATH, temp_dir);
    GetTempFileName(temp_dir, "qem", 0, filename);
}
#else
/* Fill @filename (at most @size bytes) with the name of a freshly
 * created temporary file ("$TMPDIR/vl.XXXXXX", defaulting to /tmp). */
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;
    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/tmp";
    }
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    fd = mkstemp(filename);
    /* Fix: the original unconditionally called close(fd), passing the
     * invalid descriptor -1 to close() when mkstemp() fails. */
    if (fd >= 0) {
        /* Only the name is needed; callers reopen the file themselves. */
        close(fd);
    }
}
#endif
fc01f7e7 385
84a12e66
CH
386/*
387 * Detect host devices. By convention, /dev/cdrom[N] is always
388 * recognized as a host CDROM.
389 */
390static BlockDriver *find_hdev_driver(const char *filename)
391{
392 int score_max = 0, score;
393 BlockDriver *drv = NULL, *d;
394
395 QLIST_FOREACH(d, &bdrv_drivers, list) {
396 if (d->bdrv_probe_device) {
397 score = d->bdrv_probe_device(filename);
398 if (score > score_max) {
399 score_max = score;
400 drv = d;
401 }
402 }
403 }
404
405 return drv;
406}
407
b50cbabc 408BlockDriver *bdrv_find_protocol(const char *filename)
83f64091
FB
409{
410 BlockDriver *drv1;
411 char protocol[128];
1cec71e3 412 int len;
83f64091 413 const char *p;
19cb3738 414
66f82cee
KW
415 /* TODO Drivers without bdrv_file_open must be specified explicitly */
416
39508e7a
CH
417 /*
418 * XXX(hch): we really should not let host device detection
419 * override an explicit protocol specification, but moving this
420 * later breaks access to device names with colons in them.
421 * Thanks to the brain-dead persistent naming schemes on udev-
422 * based Linux systems those actually are quite common.
423 */
424 drv1 = find_hdev_driver(filename);
425 if (drv1) {
426 return drv1;
427 }
428
9e0b22f4 429 if (!path_has_protocol(filename)) {
39508e7a 430 return bdrv_find_format("file");
84a12e66 431 }
9e0b22f4
SH
432 p = strchr(filename, ':');
433 assert(p != NULL);
1cec71e3
AL
434 len = p - filename;
435 if (len > sizeof(protocol) - 1)
436 len = sizeof(protocol) - 1;
437 memcpy(protocol, filename, len);
438 protocol[len] = '\0';
8a22f02a 439 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
5fafdf24 440 if (drv1->protocol_name &&
8a22f02a 441 !strcmp(drv1->protocol_name, protocol)) {
83f64091 442 return drv1;
8a22f02a 443 }
83f64091
FB
444 }
445 return NULL;
446}
447
c98ac35d 448static int find_image_format(const char *filename, BlockDriver **pdrv)
f3a5d3f8
CH
449{
450 int ret, score, score_max;
451 BlockDriver *drv1, *drv;
452 uint8_t buf[2048];
453 BlockDriverState *bs;
454
f5edb014 455 ret = bdrv_file_open(&bs, filename, 0);
c98ac35d
SW
456 if (ret < 0) {
457 *pdrv = NULL;
458 return ret;
459 }
f8ea0b00 460
08a00559
KW
461 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
462 if (bs->sg || !bdrv_is_inserted(bs)) {
1a396859 463 bdrv_delete(bs);
c98ac35d
SW
464 drv = bdrv_find_format("raw");
465 if (!drv) {
466 ret = -ENOENT;
467 }
468 *pdrv = drv;
469 return ret;
1a396859 470 }
f8ea0b00 471
83f64091
FB
472 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
473 bdrv_delete(bs);
474 if (ret < 0) {
c98ac35d
SW
475 *pdrv = NULL;
476 return ret;
83f64091
FB
477 }
478
ea2384d3 479 score_max = 0;
84a12e66 480 drv = NULL;
8a22f02a 481 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
83f64091
FB
482 if (drv1->bdrv_probe) {
483 score = drv1->bdrv_probe(buf, ret, filename);
484 if (score > score_max) {
485 score_max = score;
486 drv = drv1;
487 }
0849bf08 488 }
fc01f7e7 489 }
c98ac35d
SW
490 if (!drv) {
491 ret = -ENOENT;
492 }
493 *pdrv = drv;
494 return ret;
ea2384d3
FB
495}
496
51762288
SH
497/**
498 * Set the current 'total_sectors' value
499 */
500static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
501{
502 BlockDriver *drv = bs->drv;
503
396759ad
NB
504 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
505 if (bs->sg)
506 return 0;
507
51762288
SH
508 /* query actual device if possible, otherwise just trust the hint */
509 if (drv->bdrv_getlength) {
510 int64_t length = drv->bdrv_getlength(bs);
511 if (length < 0) {
512 return length;
513 }
514 hint = length >> BDRV_SECTOR_BITS;
515 }
516
517 bs->total_sectors = hint;
518 return 0;
519}
520
c3993cdc
SH
521/**
522 * Set open flags for a given cache mode
523 *
524 * Return 0 on success, -1 if the cache mode was invalid.
525 */
526int bdrv_parse_cache_flags(const char *mode, int *flags)
527{
528 *flags &= ~BDRV_O_CACHE_MASK;
529
530 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
531 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
92196b2f
SH
532 } else if (!strcmp(mode, "directsync")) {
533 *flags |= BDRV_O_NOCACHE;
c3993cdc
SH
534 } else if (!strcmp(mode, "writeback")) {
535 *flags |= BDRV_O_CACHE_WB;
536 } else if (!strcmp(mode, "unsafe")) {
537 *flags |= BDRV_O_CACHE_WB;
538 *flags |= BDRV_O_NO_FLUSH;
539 } else if (!strcmp(mode, "writethrough")) {
540 /* this is the default */
541 } else {
542 return -1;
543 }
544
545 return 0;
546}
547
53fec9d3
SH
548/**
549 * The copy-on-read flag is actually a reference count so multiple users may
550 * use the feature without worrying about clobbering its previous state.
551 * Copy-on-read stays enabled until all users have called to disable it.
552 */
553void bdrv_enable_copy_on_read(BlockDriverState *bs)
554{
555 bs->copy_on_read++;
556}
557
558void bdrv_disable_copy_on_read(BlockDriverState *bs)
559{
560 assert(bs->copy_on_read > 0);
561 bs->copy_on_read--;
562}
563
57915332
KW
564/*
565 * Common part for opening disk images and files
566 */
567static int bdrv_open_common(BlockDriverState *bs, const char *filename,
568 int flags, BlockDriver *drv)
569{
570 int ret, open_flags;
571
572 assert(drv != NULL);
573
28dcee10
SH
574 trace_bdrv_open_common(bs, filename, flags, drv->format_name);
575
66f82cee 576 bs->file = NULL;
51762288 577 bs->total_sectors = 0;
57915332
KW
578 bs->encrypted = 0;
579 bs->valid_key = 0;
03f541bd 580 bs->sg = 0;
57915332 581 bs->open_flags = flags;
03f541bd 582 bs->growable = 0;
57915332
KW
583 bs->buffer_alignment = 512;
584
53fec9d3
SH
585 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
586 if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
587 bdrv_enable_copy_on_read(bs);
588 }
589
57915332 590 pstrcpy(bs->filename, sizeof(bs->filename), filename);
03f541bd 591 bs->backing_file[0] = '\0';
57915332
KW
592
593 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
594 return -ENOTSUP;
595 }
596
597 bs->drv = drv;
7267c094 598 bs->opaque = g_malloc0(drv->instance_size);
57915332 599
03f541bd 600 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
57915332
KW
601
602 /*
603 * Clear flags that are internal to the block layer before opening the
604 * image.
605 */
606 open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
607
608 /*
ebabb67a 609 * Snapshots should be writable.
57915332
KW
610 */
611 if (bs->is_temporary) {
612 open_flags |= BDRV_O_RDWR;
613 }
614
e7c63796
SH
615 bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
616
66f82cee
KW
617 /* Open the image, either directly or using a protocol */
618 if (drv->bdrv_file_open) {
619 ret = drv->bdrv_file_open(bs, filename, open_flags);
620 } else {
621 ret = bdrv_file_open(&bs->file, filename, open_flags);
622 if (ret >= 0) {
623 ret = drv->bdrv_open(bs, open_flags);
624 }
625 }
626
57915332
KW
627 if (ret < 0) {
628 goto free_and_fail;
629 }
630
51762288
SH
631 ret = refresh_total_sectors(bs, bs->total_sectors);
632 if (ret < 0) {
633 goto free_and_fail;
57915332 634 }
51762288 635
57915332
KW
636#ifndef _WIN32
637 if (bs->is_temporary) {
638 unlink(filename);
639 }
640#endif
641 return 0;
642
643free_and_fail:
66f82cee
KW
644 if (bs->file) {
645 bdrv_delete(bs->file);
646 bs->file = NULL;
647 }
7267c094 648 g_free(bs->opaque);
57915332
KW
649 bs->opaque = NULL;
650 bs->drv = NULL;
651 return ret;
652}
653
b6ce07aa
KW
654/*
655 * Opens a file using a protocol (file, host_device, nbd, ...)
656 */
83f64091 657int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
ea2384d3 658{
83f64091 659 BlockDriverState *bs;
6db95603 660 BlockDriver *drv;
83f64091
FB
661 int ret;
662
b50cbabc 663 drv = bdrv_find_protocol(filename);
6db95603
CH
664 if (!drv) {
665 return -ENOENT;
666 }
667
83f64091 668 bs = bdrv_new("");
b6ce07aa 669 ret = bdrv_open_common(bs, filename, flags, drv);
83f64091
FB
670 if (ret < 0) {
671 bdrv_delete(bs);
672 return ret;
3b0d4f61 673 }
71d0770c 674 bs->growable = 1;
83f64091
FB
675 *pbs = bs;
676 return 0;
677}
678
b6ce07aa
KW
679/*
680 * Opens a disk image (raw, qcow2, vmdk, ...)
681 */
d6e9098e
KW
682int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
683 BlockDriver *drv)
ea2384d3 684{
b6ce07aa 685 int ret;
2b572816 686 char tmp_filename[PATH_MAX];
712e7874 687
83f64091 688 if (flags & BDRV_O_SNAPSHOT) {
ea2384d3
FB
689 BlockDriverState *bs1;
690 int64_t total_size;
7c96d46e 691 int is_protocol = 0;
91a073a9
KW
692 BlockDriver *bdrv_qcow2;
693 QEMUOptionParameter *options;
b6ce07aa 694 char backing_filename[PATH_MAX];
3b46e624 695
ea2384d3
FB
696 /* if snapshot, we create a temporary backing file and open it
697 instead of opening 'filename' directly */
33e3963e 698
ea2384d3
FB
699 /* if there is a backing file, use it */
700 bs1 = bdrv_new("");
d6e9098e 701 ret = bdrv_open(bs1, filename, 0, drv);
51d7c00c 702 if (ret < 0) {
ea2384d3 703 bdrv_delete(bs1);
51d7c00c 704 return ret;
ea2384d3 705 }
3e82990b 706 total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
7c96d46e
AL
707
708 if (bs1->drv && bs1->drv->protocol_name)
709 is_protocol = 1;
710
ea2384d3 711 bdrv_delete(bs1);
3b46e624 712
ea2384d3 713 get_tmp_filename(tmp_filename, sizeof(tmp_filename));
7c96d46e
AL
714
715 /* Real path is meaningless for protocols */
716 if (is_protocol)
717 snprintf(backing_filename, sizeof(backing_filename),
718 "%s", filename);
114cdfa9
KS
719 else if (!realpath(filename, backing_filename))
720 return -errno;
7c96d46e 721
91a073a9
KW
722 bdrv_qcow2 = bdrv_find_format("qcow2");
723 options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
724
3e82990b 725 set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
91a073a9
KW
726 set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
727 if (drv) {
728 set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
729 drv->format_name);
730 }
731
732 ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
d748768c 733 free_option_parameters(options);
51d7c00c
AL
734 if (ret < 0) {
735 return ret;
ea2384d3 736 }
91a073a9 737
ea2384d3 738 filename = tmp_filename;
91a073a9 739 drv = bdrv_qcow2;
ea2384d3
FB
740 bs->is_temporary = 1;
741 }
712e7874 742
b6ce07aa 743 /* Find the right image format driver */
6db95603 744 if (!drv) {
c98ac35d 745 ret = find_image_format(filename, &drv);
51d7c00c 746 }
6987307c 747
51d7c00c 748 if (!drv) {
51d7c00c 749 goto unlink_and_fail;
ea2384d3 750 }
b6ce07aa
KW
751
752 /* Open the image */
753 ret = bdrv_open_common(bs, filename, flags, drv);
754 if (ret < 0) {
6987307c
CH
755 goto unlink_and_fail;
756 }
757
b6ce07aa
KW
758 /* If there is a backing file, use it */
759 if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
760 char backing_filename[PATH_MAX];
761 int back_flags;
762 BlockDriver *back_drv = NULL;
763
764 bs->backing_hd = bdrv_new("");
df2dbb4a
SH
765
766 if (path_has_protocol(bs->backing_file)) {
767 pstrcpy(backing_filename, sizeof(backing_filename),
768 bs->backing_file);
769 } else {
770 path_combine(backing_filename, sizeof(backing_filename),
771 filename, bs->backing_file);
772 }
773
774 if (bs->backing_format[0] != '\0') {
b6ce07aa 775 back_drv = bdrv_find_format(bs->backing_format);
df2dbb4a 776 }
b6ce07aa
KW
777
778 /* backing files always opened read-only */
779 back_flags =
780 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
781
782 ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
783 if (ret < 0) {
784 bdrv_close(bs);
785 return ret;
786 }
787 if (bs->is_temporary) {
788 bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
789 } else {
790 /* base image inherits from "parent" */
791 bs->backing_hd->keep_read_only = bs->keep_read_only;
792 }
793 }
794
795 if (!bdrv_key_required(bs)) {
7d4b4ba5 796 bdrv_dev_change_media_cb(bs, true);
b6ce07aa
KW
797 }
798
98f90dba
ZYW
799 /* throttling disk I/O limits */
800 if (bs->io_limits_enabled) {
801 bdrv_io_limits_enable(bs);
802 }
803
b6ce07aa
KW
804 return 0;
805
806unlink_and_fail:
807 if (bs->is_temporary) {
808 unlink(filename);
809 }
810 return ret;
811}
812
fc01f7e7
FB
813void bdrv_close(BlockDriverState *bs)
814{
19cb3738 815 if (bs->drv) {
3e914655
PB
816 if (bs->job) {
817 block_job_cancel_sync(bs->job);
818 }
7094f12f
KW
819 bdrv_drain_all();
820
f9092b10
MA
821 if (bs == bs_snapshots) {
822 bs_snapshots = NULL;
823 }
557df6ac 824 if (bs->backing_hd) {
ea2384d3 825 bdrv_delete(bs->backing_hd);
557df6ac
SH
826 bs->backing_hd = NULL;
827 }
ea2384d3 828 bs->drv->bdrv_close(bs);
7267c094 829 g_free(bs->opaque);
ea2384d3
FB
830#ifdef _WIN32
831 if (bs->is_temporary) {
832 unlink(bs->filename);
833 }
67b915a5 834#endif
ea2384d3
FB
835 bs->opaque = NULL;
836 bs->drv = NULL;
53fec9d3 837 bs->copy_on_read = 0;
b338082b 838
66f82cee
KW
839 if (bs->file != NULL) {
840 bdrv_close(bs->file);
841 }
842
7d4b4ba5 843 bdrv_dev_change_media_cb(bs, false);
b338082b 844 }
98f90dba
ZYW
845
846 /*throttling disk I/O limits*/
847 if (bs->io_limits_enabled) {
848 bdrv_io_limits_disable(bs);
849 }
b338082b
FB
850}
851
2bc93fed
MK
852void bdrv_close_all(void)
853{
854 BlockDriverState *bs;
855
856 QTAILQ_FOREACH(bs, &bdrv_states, list) {
857 bdrv_close(bs);
858 }
859}
860
922453bc
SH
861/*
862 * Wait for pending requests to complete across all BlockDriverStates
863 *
864 * This function does not flush data to disk, use bdrv_flush_all() for that
865 * after calling this function.
866 */
867void bdrv_drain_all(void)
868{
869 BlockDriverState *bs;
870
871 qemu_aio_flush();
872
873 /* If requests are still pending there is a bug somewhere */
874 QTAILQ_FOREACH(bs, &bdrv_states, list) {
875 assert(QLIST_EMPTY(&bs->tracked_requests));
876 assert(qemu_co_queue_empty(&bs->throttled_reqs));
877 }
878}
879
d22b2f41
RH
880/* make a BlockDriverState anonymous by removing from bdrv_state list.
881 Also, NULL terminate the device_name to prevent double remove */
882void bdrv_make_anon(BlockDriverState *bs)
883{
884 if (bs->device_name[0] != '\0') {
885 QTAILQ_REMOVE(&bdrv_states, bs, list);
886 }
887 bs->device_name[0] = '\0';
888}
889
8802d1fd
JC
890/*
891 * Add new bs contents at the top of an image chain while the chain is
892 * live, while keeping required fields on the top layer.
893 *
894 * This will modify the BlockDriverState fields, and swap contents
895 * between bs_new and bs_top. Both bs_new and bs_top are modified.
896 *
f6801b83
JC
897 * bs_new is required to be anonymous.
898 *
8802d1fd
JC
899 * This function does not create any image files.
900 */
901void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
902{
903 BlockDriverState tmp;
904
f6801b83
JC
905 /* bs_new must be anonymous */
906 assert(bs_new->device_name[0] == '\0');
8802d1fd
JC
907
908 tmp = *bs_new;
909
910 /* there are some fields that need to stay on the top layer: */
911
912 /* dev info */
913 tmp.dev_ops = bs_top->dev_ops;
914 tmp.dev_opaque = bs_top->dev_opaque;
915 tmp.dev = bs_top->dev;
916 tmp.buffer_alignment = bs_top->buffer_alignment;
917 tmp.copy_on_read = bs_top->copy_on_read;
918
919 /* i/o timing parameters */
920 tmp.slice_time = bs_top->slice_time;
921 tmp.slice_start = bs_top->slice_start;
922 tmp.slice_end = bs_top->slice_end;
923 tmp.io_limits = bs_top->io_limits;
924 tmp.io_base = bs_top->io_base;
925 tmp.throttled_reqs = bs_top->throttled_reqs;
926 tmp.block_timer = bs_top->block_timer;
927 tmp.io_limits_enabled = bs_top->io_limits_enabled;
928
929 /* geometry */
930 tmp.cyls = bs_top->cyls;
931 tmp.heads = bs_top->heads;
932 tmp.secs = bs_top->secs;
933 tmp.translation = bs_top->translation;
934
935 /* r/w error */
936 tmp.on_read_error = bs_top->on_read_error;
937 tmp.on_write_error = bs_top->on_write_error;
938
939 /* i/o status */
940 tmp.iostatus_enabled = bs_top->iostatus_enabled;
941 tmp.iostatus = bs_top->iostatus;
942
943 /* keep the same entry in bdrv_states */
944 pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
945 tmp.list = bs_top->list;
946
947 /* The contents of 'tmp' will become bs_top, as we are
948 * swapping bs_new and bs_top contents. */
949 tmp.backing_hd = bs_new;
950 pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);
f6801b83 951 bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format));
8802d1fd
JC
952
953 /* swap contents of the fixed new bs and the current top */
954 *bs_new = *bs_top;
955 *bs_top = tmp;
956
f6801b83
JC
957 /* device_name[] was carried over from the old bs_top. bs_new
958 * shouldn't be in bdrv_states, so we need to make device_name[]
959 * reflect the anonymity of bs_new
960 */
961 bs_new->device_name[0] = '\0';
962
8802d1fd
JC
963 /* clear the copied fields in the new backing file */
964 bdrv_detach_dev(bs_new, bs_new->dev);
965
966 qemu_co_queue_init(&bs_new->throttled_reqs);
967 memset(&bs_new->io_base, 0, sizeof(bs_new->io_base));
968 memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
969 bdrv_iostatus_disable(bs_new);
970
971 /* we don't use bdrv_io_limits_disable() for this, because we don't want
972 * to affect or delete the block_timer, as it has been moved to bs_top */
973 bs_new->io_limits_enabled = false;
974 bs_new->block_timer = NULL;
975 bs_new->slice_time = 0;
976 bs_new->slice_start = 0;
977 bs_new->slice_end = 0;
978}
979
b338082b
FB
980void bdrv_delete(BlockDriverState *bs)
981{
fa879d62 982 assert(!bs->dev);
3e914655
PB
983 assert(!bs->job);
984 assert(!bs->in_use);
18846dee 985
1b7bdbc1 986 /* remove from list, if necessary */
d22b2f41 987 bdrv_make_anon(bs);
34c6f050 988
b338082b 989 bdrv_close(bs);
66f82cee
KW
990 if (bs->file != NULL) {
991 bdrv_delete(bs->file);
992 }
993
f9092b10 994 assert(bs != bs_snapshots);
7267c094 995 g_free(bs);
fc01f7e7
FB
996}
997
fa879d62
MA
998int bdrv_attach_dev(BlockDriverState *bs, void *dev)
999/* TODO change to DeviceState *dev when all users are qdevified */
18846dee 1000{
fa879d62 1001 if (bs->dev) {
18846dee
MA
1002 return -EBUSY;
1003 }
fa879d62 1004 bs->dev = dev;
28a7282a 1005 bdrv_iostatus_reset(bs);
18846dee
MA
1006 return 0;
1007}
1008
fa879d62
MA
1009/* TODO qdevified devices don't use this, remove when devices are qdevified */
1010void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
18846dee 1011{
fa879d62
MA
1012 if (bdrv_attach_dev(bs, dev) < 0) {
1013 abort();
1014 }
1015}
1016
1017void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1018/* TODO change to DeviceState *dev when all users are qdevified */
1019{
1020 assert(bs->dev == dev);
1021 bs->dev = NULL;
0e49de52
MA
1022 bs->dev_ops = NULL;
1023 bs->dev_opaque = NULL;
29e05f20 1024 bs->buffer_alignment = 512;
18846dee
MA
1025}
1026
fa879d62
MA
1027/* TODO change to return DeviceState * when all users are qdevified */
1028void *bdrv_get_attached_dev(BlockDriverState *bs)
18846dee 1029{
fa879d62 1030 return bs->dev;
18846dee
MA
1031}
1032
0e49de52
MA
1033void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1034 void *opaque)
1035{
1036 bs->dev_ops = ops;
1037 bs->dev_opaque = opaque;
2c6942fa
MA
1038 if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
1039 bs_snapshots = NULL;
1040 }
0e49de52
MA
1041}
1042
/* Emit a QMP BLOCK_IO_ERROR event for @bdrv describing the chosen
 * error @action and whether the failing operation was a read or write. */
void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               BlockQMPEventAction action, int is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        /* unknown action is a programming error */
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);

    qobject_decref(data);
}
1071
/* Emit a QMP DEVICE_TRAY_MOVED event reporting the new tray state. */
static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
{
    QObject *data;

    data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
                              bdrv_get_device_name(bs), ejected);
    monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);

    qobject_decref(data);
}
1082
/* Notify the attached device of a medium change (@load = medium inserted)
 * and emit the corresponding tray-moved QMP events. */
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        /* Sample tray state before the callback, which may open it. */
        bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
        if (tray_was_closed) {
            /* tray open */
            bdrv_emit_qmp_eject_event(bs, true);
        }
        if (load) {
            /* tray close */
            bdrv_emit_qmp_eject_event(bs, false);
        }
    }
}
1098
2c6942fa
MA
1099bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1100{
1101 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1102}
1103
025ccaa7
PB
1104void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1105{
1106 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
1107 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
1108 }
1109}
1110
e4def80b
MA
1111bool bdrv_dev_is_tray_open(BlockDriverState *bs)
1112{
1113 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
1114 return bs->dev_ops->is_tray_open(bs->dev_opaque);
1115 }
1116 return false;
1117}
1118
145feb17
MA
1119static void bdrv_dev_resize_cb(BlockDriverState *bs)
1120{
1121 if (bs->dev_ops && bs->dev_ops->resize_cb) {
1122 bs->dev_ops->resize_cb(bs->dev_opaque);
0e49de52
MA
1123 }
1124}
1125
f107639a
MA
1126bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1127{
1128 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1129 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1130 }
1131 return false;
1132}
1133
e97fc193
AL
1134/*
1135 * Run consistency checks on an image
1136 *
e076f338 1137 * Returns 0 if the check could be completed (it doesn't mean that the image is
a1c7273b 1138 * free of errors) or -errno when an internal error occurred. The results of the
e076f338 1139 * check are stored in res.
e97fc193 1140 */
e076f338 1141int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
e97fc193
AL
1142{
1143 if (bs->drv->bdrv_check == NULL) {
1144 return -ENOTSUP;
1145 }
1146
e076f338 1147 memset(res, 0, sizeof(*res));
9ac228e0 1148 return bs->drv->bdrv_check(bs, res);
e97fc193
AL
1149}
1150
8a426614
KW
1151#define COMMIT_BUF_SECTORS 2048
1152
33e3963e
FB
1153/* commit COW file into the raw image */
1154int bdrv_commit(BlockDriverState *bs)
1155{
19cb3738 1156 BlockDriver *drv = bs->drv;
ee181196 1157 BlockDriver *backing_drv;
8a426614
KW
1158 int64_t sector, total_sectors;
1159 int n, ro, open_flags;
4dca4b63 1160 int ret = 0, rw_ret = 0;
8a426614 1161 uint8_t *buf;
4dca4b63
NS
1162 char filename[1024];
1163 BlockDriverState *bs_rw, *bs_ro;
33e3963e 1164
19cb3738
FB
1165 if (!drv)
1166 return -ENOMEDIUM;
4dca4b63
NS
1167
1168 if (!bs->backing_hd) {
1169 return -ENOTSUP;
33e3963e
FB
1170 }
1171
4dca4b63
NS
1172 if (bs->backing_hd->keep_read_only) {
1173 return -EACCES;
1174 }
ee181196 1175
2d3735d3
SH
1176 if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1177 return -EBUSY;
1178 }
1179
ee181196 1180 backing_drv = bs->backing_hd->drv;
4dca4b63
NS
1181 ro = bs->backing_hd->read_only;
1182 strncpy(filename, bs->backing_hd->filename, sizeof(filename));
1183 open_flags = bs->backing_hd->open_flags;
1184
1185 if (ro) {
1186 /* re-open as RW */
1187 bdrv_delete(bs->backing_hd);
1188 bs->backing_hd = NULL;
1189 bs_rw = bdrv_new("");
ee181196
KW
1190 rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
1191 backing_drv);
4dca4b63
NS
1192 if (rw_ret < 0) {
1193 bdrv_delete(bs_rw);
1194 /* try to re-open read-only */
1195 bs_ro = bdrv_new("");
ee181196
KW
1196 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1197 backing_drv);
4dca4b63
NS
1198 if (ret < 0) {
1199 bdrv_delete(bs_ro);
1200 /* drive not functional anymore */
1201 bs->drv = NULL;
1202 return ret;
1203 }
1204 bs->backing_hd = bs_ro;
1205 return rw_ret;
1206 }
1207 bs->backing_hd = bs_rw;
ea2384d3 1208 }
33e3963e 1209
6ea44308 1210 total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
7267c094 1211 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
8a426614
KW
1212
1213 for (sector = 0; sector < total_sectors; sector += n) {
05c4af54 1214 if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
8a426614
KW
1215
1216 if (bdrv_read(bs, sector, buf, n) != 0) {
1217 ret = -EIO;
1218 goto ro_cleanup;
1219 }
1220
1221 if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1222 ret = -EIO;
1223 goto ro_cleanup;
1224 }
ea2384d3 1225 }
33e3963e 1226 }
95389c86 1227
1d44952f
CH
1228 if (drv->bdrv_make_empty) {
1229 ret = drv->bdrv_make_empty(bs);
1230 bdrv_flush(bs);
1231 }
95389c86 1232
3f5075ae
CH
1233 /*
1234 * Make sure all data we wrote to the backing device is actually
1235 * stable on disk.
1236 */
1237 if (bs->backing_hd)
1238 bdrv_flush(bs->backing_hd);
4dca4b63
NS
1239
1240ro_cleanup:
7267c094 1241 g_free(buf);
4dca4b63
NS
1242
1243 if (ro) {
1244 /* re-open as RO */
1245 bdrv_delete(bs->backing_hd);
1246 bs->backing_hd = NULL;
1247 bs_ro = bdrv_new("");
ee181196
KW
1248 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1249 backing_drv);
4dca4b63
NS
1250 if (ret < 0) {
1251 bdrv_delete(bs_ro);
1252 /* drive not functional anymore */
1253 bs->drv = NULL;
1254 return ret;
1255 }
1256 bs->backing_hd = bs_ro;
1257 bs->backing_hd->keep_read_only = 0;
1258 }
1259
1d44952f 1260 return ret;
33e3963e
FB
1261}
1262
e8877497 1263int bdrv_commit_all(void)
6ab4b5ab
MA
1264{
1265 BlockDriverState *bs;
1266
1267 QTAILQ_FOREACH(bs, &bdrv_states, list) {
e8877497
SH
1268 int ret = bdrv_commit(bs);
1269 if (ret < 0) {
1270 return ret;
1271 }
6ab4b5ab 1272 }
e8877497 1273 return 0;
6ab4b5ab
MA
1274}
1275
/*
 * Describes a single in-flight request on a BlockDriverState.  Requests
 * are linked into bs->tracked_requests so that overlapping accesses can
 * be serialized (see wait_for_overlapping_requests()).
 */
struct BdrvTrackedRequest {
    BlockDriverState *bs;
    int64_t sector_num;       /* first sector of the request */
    int nb_sectors;           /* length of the request in sectors */
    bool is_write;
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};
1285
/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    /* Wake every coroutine that was waiting for this request to finish. */
    qemu_co_queue_restart_all(&req->wait_queue);
}
1296
/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}
1317
d83947ac
SH
1318/**
1319 * Round a region to cluster boundaries
1320 */
1321static void round_to_clusters(BlockDriverState *bs,
1322 int64_t sector_num, int nb_sectors,
1323 int64_t *cluster_sector_num,
1324 int *cluster_nb_sectors)
1325{
1326 BlockDriverInfo bdi;
1327
1328 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1329 *cluster_sector_num = sector_num;
1330 *cluster_nb_sectors = nb_sectors;
1331 } else {
1332 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1333 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1334 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1335 nb_sectors, c);
1336 }
1337}
1338
f4658285
SH
1339static bool tracked_request_overlaps(BdrvTrackedRequest *req,
1340 int64_t sector_num, int nb_sectors) {
d83947ac
SH
1341 /* aaaa bbbb */
1342 if (sector_num >= req->sector_num + req->nb_sectors) {
1343 return false;
1344 }
1345 /* bbbb aaaa */
1346 if (req->sector_num >= sector_num + nb_sectors) {
1347 return false;
1348 }
1349 return true;
f4658285
SH
1350}
1351
/* Block the calling coroutine until no tracked request overlaps the
 * cluster-aligned region around [sector_num, sector_num + nb_sectors). */
static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    /* Rescan from the start after every wait: the list may have changed
     * while we were yielded. */
    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}
1387
756e6736
KW
1388/*
1389 * Return values:
1390 * 0 - success
1391 * -EINVAL - backing format specified, but no file
1392 * -ENOSPC - can't update the backing file because no space is left in the
1393 * image file header
1394 * -ENOTSUP - format driver doesn't support changing the backing file
1395 */
1396int bdrv_change_backing_file(BlockDriverState *bs,
1397 const char *backing_file, const char *backing_fmt)
1398{
1399 BlockDriver *drv = bs->drv;
1400
1401 if (drv->bdrv_change_backing_file != NULL) {
1402 return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1403 } else {
1404 return -ENOTSUP;
1405 }
1406}
1407
/* Validate a byte-granularity request against the medium and image size.
 * Returns 0 if the request is acceptable, -ENOMEDIUM or -EIO otherwise. */
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    /* Growable images accept any offset; the driver extends on demand. */
    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    /* Written as (len - offset < size) to avoid overflowing offset + size. */
    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}
1429
/* Sector-granularity wrapper around bdrv_check_byte_request(). */
static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}
1436
/* Parameter block for bdrv_rw_co(): carries one synchronous request into
 * coroutine context and the result (ret) back out. */
typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;              /* NOT_DONE until the coroutine completes */
} RwCo;

/* Coroutine entry point: dispatch the request described by @opaque. */
static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov, 0);
    }
}
e7a8a783 1458
/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .qiov = &qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    /**
     * In sync call context, when the vcpu is blocked, this throttling timer
     * will not fire; so the I/O throttling function has to be disabled here
     * if it has been enabled.
     */
    if (bs->io_limits_enabled) {
        fprintf(stderr, "Disabling I/O throttling on '%s' due "
                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
        bdrv_io_limits_disable(bs);
    }

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        /* Pump the event loop until the coroutine stores its result. */
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}
b338082b 1505
/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    /* Synchronous read, implemented on top of the coroutine path. */
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}
1512
/* Mark (or clear, if !dirty) the dirty-chunk bits covering
 * [sector_num, sector_num + nb_sectors) and keep bs->dirty_count in sync. */
static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int dirty)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    /* Convert the sector range to an inclusive range of chunk indices. */
    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bs->dirty_bitmap[idx];
        if (dirty) {
            /* Only count a transition, not an already-set bit. */
            if (!(val & (1UL << bit))) {
                bs->dirty_count++;
                val |= 1UL << bit;
            }
        } else {
            if (val & (1UL << bit)) {
                bs->dirty_count--;
                val &= ~(1UL << bit);
            }
        }
        bs->dirty_bitmap[idx] = val;
    }
}
1540
/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    /* Synchronous write, implemented on top of the coroutine path. */
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}
1552
/* Byte-granularity read: handles an unaligned head and tail through a
 * one-sector bounce buffer and reads whole sectors in place.
 * Returns @count1 on success or a negative errno from bdrv_read(). */
int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}
1597
/* Byte-granularity write: unaligned head and tail sectors are handled
 * with read-modify-write through a one-sector bounce buffer; aligned
 * sectors are written in place.
 * Returns @count1 on success or a negative errno. */
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        /* read-modify-write of the partial first sector */
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        /* read-modify-write of the partial last sector */
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf, buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return count1;
}
83f64091 1646
f08145fe
KW
1647/*
1648 * Writes to the file and ensures that no writes are reordered across this
1649 * request (acts as a barrier)
1650 *
1651 * Returns 0 on success, -errno in error cases.
1652 */
1653int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1654 const void *buf, int count)
1655{
1656 int ret;
1657
1658 ret = bdrv_pwrite(bs, offset, buf, count);
1659 if (ret < 0) {
1660 return ret;
1661 }
1662
92196b2f
SH
1663 /* No flush needed for cache modes that use O_DSYNC */
1664 if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
f08145fe
KW
1665 bdrv_flush(bs);
1666 }
1667
1668 return 0;
1669}
1670
/* Copy-on-read implementation: read a whole cluster through a bounce
 * buffer and write it back into the image, then copy the requested
 * subrange into the caller's qiov. */
static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                             &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    /* Prefer the driver's efficient write-zeroes path when the cluster
     * turned out to be all zero. */
    if (drv->bdrv_co_write_zeroes &&
        buffer_is_zero(bounce_buffer, iov.iov_len)) {
        ret = drv->bdrv_co_write_zeroes(bs, cluster_sector_num,
                                        cluster_nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    }

    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    /* Hand back only the subrange the caller actually asked for. */
    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
                           nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
1733
/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);
    }

    /* A device-wide copy_on_read setting upgrades every read. */
    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    /* While any CoR request is in flight, reads must be serialized
     * against overlapping tracked requests. */
    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        /* Only take the CoR slow path when part of the range is
         * unallocated in the top image. */
        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}
1795
/* Public coroutine read entry point: plain read, no request flags. */
int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

/* Coroutine read that forces copy-on-read semantics for this request. */
int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}
1812
/* Write zeroes to [sector_num, sector_num + nb_sectors), using the
 * driver's dedicated callback when available. */
static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov;
    int ret;

    /* First try the efficient write zeroes operation */
    if (drv->bdrv_co_write_zeroes) {
        return drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
    }

    /* Fall back to bounce buffer if write zeroes is unsupported */
    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
    memset(iov.iov_base, 0, iov.iov_len);
    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);

    qemu_vfree(iov.iov_base);
    return ret;
}
1837
/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);
    }

    /* Serialize against in-flight copy-on-read, which must not see
     * interleaved guest writes. */
    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
    } else {
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    }

    /* Bookkeeping happens even on failure, matching the original code. */
    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }

    tracked_request_end(&req);

    return ret;
}
1888
/* Public coroutine write entry point: plain write, no request flags. */
int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
}
1896
/* Public coroutine entry point for writing zeroes; qiov is NULL because
 * the zero-write path synthesizes its own buffer if needed. */
int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors)
{
    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
                             BDRV_REQ_ZERO_WRITE);
}
1905
/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        /* Keep the cached size in sync and notify the attached device. */
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}
1928
4a1d5e1f
FZ
1929/**
1930 * Length of a allocated file in bytes. Sparse files are counted by actual
1931 * allocated space. Return < 0 if error or unknown.
1932 */
1933int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
1934{
1935 BlockDriver *drv = bs->drv;
1936 if (!drv) {
1937 return -ENOMEDIUM;
1938 }
1939 if (drv->bdrv_get_allocated_file_size) {
1940 return drv->bdrv_get_allocated_file_size(bs);
1941 }
1942 if (bs->file) {
1943 return bdrv_get_allocated_file_size(bs->file);
1944 }
1945 return -ENOTSUP;
1946}
1947
/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    /* Growable or removable media can change size; ask the driver.
     * Fixed media fall through to the cached total_sectors value. */
    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
        if (drv->bdrv_getlength) {
            return drv->bdrv_getlength(bs);
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}
1964
19cb3738 1965/* return 0 as number of sectors if no device present or error */
96b8f136 1966void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
fc01f7e7 1967{
19cb3738
FB
1968 int64_t length;
1969 length = bdrv_getlength(bs);
1970 if (length < 0)
1971 length = 0;
1972 else
6ea44308 1973 length = length >> BDRV_SECTOR_BITS;
19cb3738 1974 *nb_sectors_ptr = length;
fc01f7e7 1975}
cf98951b 1976
/* On-disk layout of one MBR partition-table entry (at offset 0x1be in
 * sector 0).  Multi-byte fields are little-endian; packed so the struct
 * can be overlaid directly on the raw sector buffer. */
struct partition {
        uint8_t boot_ind;           /* 0x80 - active */
        uint8_t head;               /* starting head */
        uint8_t sector;             /* starting sector */
        uint8_t cyl;                /* starting cylinder */
        uint8_t sys_ind;            /* What partition type */
        uint8_t end_head;           /* end head */
        uint8_t end_sector;         /* end sector */
        uint8_t end_cyl;            /* end cylinder */
        uint32_t start_sect;        /* starting sector counting from 0 */
        uint32_t nr_sects;          /* nr of sectors in partition */
} QEMU_PACKED;
1989
1990/* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if could not guess */
1991static int guess_disk_lchs(BlockDriverState *bs,
1992 int *pcylinders, int *pheads, int *psectors)
1993{
eb5a3165 1994 uint8_t buf[BDRV_SECTOR_SIZE];
f3d54fc4
AL
1995 int ret, i, heads, sectors, cylinders;
1996 struct partition *p;
1997 uint32_t nr_sects;
a38131b6 1998 uint64_t nb_sectors;
498e386c 1999 bool enabled;
f3d54fc4
AL
2000
2001 bdrv_get_geometry(bs, &nb_sectors);
2002
498e386c
ZYW
2003 /**
2004 * The function will be invoked during startup not only in sync I/O mode,
2005 * but also in async I/O mode. So the I/O throttling function has to
2006 * be disabled temporarily here, not permanently.
2007 */
2008 enabled = bs->io_limits_enabled;
2009 bs->io_limits_enabled = false;
f3d54fc4 2010 ret = bdrv_read(bs, 0, buf, 1);
498e386c 2011 bs->io_limits_enabled = enabled;
f3d54fc4
AL
2012 if (ret < 0)
2013 return -1;
2014 /* test msdos magic */
2015 if (buf[510] != 0x55 || buf[511] != 0xaa)
2016 return -1;
2017 for(i = 0; i < 4; i++) {
2018 p = ((struct partition *)(buf + 0x1be)) + i;
2019 nr_sects = le32_to_cpu(p->nr_sects);
2020 if (nr_sects && p->end_head) {
2021 /* We make the assumption that the partition terminates on
2022 a cylinder boundary */
2023 heads = p->end_head + 1;
2024 sectors = p->end_sector & 63;
2025 if (sectors == 0)
2026 continue;
2027 cylinders = nb_sectors / (heads * sectors);
2028 if (cylinders < 1 || cylinders > 16383)
2029 continue;
2030 *pheads = heads;
2031 *psectors = sectors;
2032 *pcylinders = cylinders;
2033#if 0
2034 printf("guessed geometry: LCHS=%d %d %d\n",
2035 cylinders, heads, sectors);
2036#endif
2037 return 0;
2038 }
2039 }
2040 return -1;
2041}
2042
/* Fill *pcyls/*pheads/*psecs for @bs: use an explicit geometry hint if
 * set, else the MBR-derived guess, else a standard 16-head/63-sector
 * physical geometry, updating the BIOS translation hint on the way. */
void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
{
    int translation, lba_detected = 0;
    int cylinders, heads, secs;
    uint64_t nb_sectors;

    /* if a geometry hint is available, use it */
    bdrv_get_geometry(bs, &nb_sectors);
    bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
    translation = bdrv_get_translation_hint(bs);
    if (cylinders != 0) {
        *pcyls = cylinders;
        *pheads = heads;
        *psecs = secs;
    } else {
        if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
            if (heads > 16) {
                /* if heads > 16, it means that a BIOS LBA
                   translation was active, so the default
                   hardware geometry is OK */
                lba_detected = 1;
                goto default_geometry;
            } else {
                *pcyls = cylinders;
                *pheads = heads;
                *psecs = secs;
                /* disable any translation to be in sync with
                   the logical geometry */
                if (translation == BIOS_ATA_TRANSLATION_AUTO) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_NONE);
                }
            }
        } else {
        default_geometry:
            /* if no geometry, use a standard physical disk geometry */
            cylinders = nb_sectors / (16 * 63);

            if (cylinders > 16383)
                cylinders = 16383;
            else if (cylinders < 2)
                cylinders = 2;
            *pcyls = cylinders;
            *pheads = 16;
            *psecs = 63;
            if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
                /* Pick LARGE vs LBA translation based on disk size. */
                if ((*pcyls * *pheads) <= 131072) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LARGE);
                } else {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LBA);
                }
            }
        }
        /* Cache the computed geometry as the new hint. */
        bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
    }
}
2101
5fafdf24 2102void bdrv_set_geometry_hint(BlockDriverState *bs,
b338082b
FB
2103 int cyls, int heads, int secs)
2104{
2105 bs->cyls = cyls;
2106 bs->heads = heads;
2107 bs->secs = secs;
2108}
2109
46d4767d
FB
2110void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
2111{
2112 bs->translation = translation;
2113}
2114
5fafdf24 2115void bdrv_get_geometry_hint(BlockDriverState *bs,
b338082b
FB
2116 int *pcyls, int *pheads, int *psecs)
2117{
2118 *pcyls = bs->cyls;
2119 *pheads = bs->heads;
2120 *psecs = bs->secs;
2121}
2122
0563e191
ZYW
2123/* throttling disk io limits */
2124void bdrv_set_io_limits(BlockDriverState *bs,
2125 BlockIOLimit *io_limits)
2126{
2127 bs->io_limits = *io_limits;
2128 bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2129}
2130
5bbdbb46
BS
2131/* Recognize floppy formats */
2132typedef struct FDFormat {
2133 FDriveType drive;
2134 uint8_t last_sect;
2135 uint8_t max_track;
2136 uint8_t max_head;
f8d3d128 2137 FDriveRate rate;
5bbdbb46
BS
2138} FDFormat;
2139
2140static const FDFormat fd_formats[] = {
2141 /* First entry is default format */
2142 /* 1.44 MB 3"1/2 floppy disks */
f8d3d128
HP
2143 { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
2144 { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
2145 { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
2146 { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
2147 { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
2148 { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
2149 { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
2150 { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
5bbdbb46 2151 /* 2.88 MB 3"1/2 floppy disks */
f8d3d128
HP
2152 { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
2153 { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
2154 { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
2155 { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
2156 { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
5bbdbb46 2157 /* 720 kB 3"1/2 floppy disks */
f8d3d128
HP
2158 { FDRIVE_DRV_144, 9, 80, 1, FDRIVE_RATE_250K, },
2159 { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
2160 { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
2161 { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
2162 { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
2163 { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
5bbdbb46 2164 /* 1.2 MB 5"1/4 floppy disks */
f8d3d128
HP
2165 { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
2166 { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
2167 { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
2168 { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
2169 { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
5bbdbb46 2170 /* 720 kB 5"1/4 floppy disks */
f8d3d128
HP
2171 { FDRIVE_DRV_120, 9, 80, 1, FDRIVE_RATE_250K, },
2172 { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
5bbdbb46 2173 /* 360 kB 5"1/4 floppy disks */
f8d3d128
HP
2174 { FDRIVE_DRV_120, 9, 40, 1, FDRIVE_RATE_300K, },
2175 { FDRIVE_DRV_120, 9, 40, 0, FDRIVE_RATE_300K, },
2176 { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
2177 { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
5bbdbb46 2178 /* 320 kB 5"1/4 floppy disks */
f8d3d128
HP
2179 { FDRIVE_DRV_120, 8, 40, 1, FDRIVE_RATE_250K, },
2180 { FDRIVE_DRV_120, 8, 40, 0, FDRIVE_RATE_250K, },
5bbdbb46 2181 /* 360 kB must match 5"1/4 better than 3"1/2... */
f8d3d128 2182 { FDRIVE_DRV_144, 9, 80, 0, FDRIVE_RATE_250K, },
5bbdbb46 2183 /* end */
f8d3d128 2184 { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
5bbdbb46
BS
2185};
2186
2187void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
2188 int *max_track, int *last_sect,
f8d3d128
HP
2189 FDriveType drive_in, FDriveType *drive,
2190 FDriveRate *rate)
5bbdbb46
BS
2191{
2192 const FDFormat *parse;
2193 uint64_t nb_sectors, size;
2194 int i, first_match, match;
2195
2196 bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
2197 if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
2198 /* User defined disk */
f8d3d128 2199 *rate = FDRIVE_RATE_500K;
5bbdbb46
BS
2200 } else {
2201 bdrv_get_geometry(bs, &nb_sectors);
2202 match = -1;
2203 first_match = -1;
2204 for (i = 0; ; i++) {
2205 parse = &fd_formats[i];
2206 if (parse->drive == FDRIVE_DRV_NONE) {
2207 break;
2208 }
2209 if (drive_in == parse->drive ||
2210 drive_in == FDRIVE_DRV_NONE) {
2211 size = (parse->max_head + 1) * parse->max_track *
2212 parse->last_sect;
2213 if (nb_sectors == size) {
2214 match = i;
2215 break;
2216 }
2217 if (first_match == -1) {
2218 first_match = i;
2219 }
2220 }
2221 }
2222 if (match == -1) {
2223 if (first_match == -1) {
2224 match = 1;
2225 } else {
2226 match = first_match;
2227 }
2228 parse = &fd_formats[match];
2229 }
2230 *nb_heads = parse->max_head + 1;
2231 *max_track = parse->max_track;
2232 *last_sect = parse->last_sect;
2233 *drive = parse->drive;
f8d3d128 2234 *rate = parse->rate;
5bbdbb46
BS
2235 }
2236}
2237
46d4767d
FB
2238int bdrv_get_translation_hint(BlockDriverState *bs)
2239{
2240 return bs->translation;
2241}
2242
abd7f68d
MA
2243void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
2244 BlockErrorAction on_write_error)
2245{
2246 bs->on_read_error = on_read_error;
2247 bs->on_write_error = on_write_error;
2248}
2249
2250BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
2251{
2252 return is_read ? bs->on_read_error : bs->on_write_error;
2253}
2254
b338082b
FB
2255int bdrv_is_read_only(BlockDriverState *bs)
2256{
2257 return bs->read_only;
2258}
2259
985a03b0
TS
2260int bdrv_is_sg(BlockDriverState *bs)
2261{
2262 return bs->sg;
2263}
2264
e900a7b7
CH
2265int bdrv_enable_write_cache(BlockDriverState *bs)
2266{
2267 return bs->enable_write_cache;
2268}
2269
ea2384d3
FB
2270int bdrv_is_encrypted(BlockDriverState *bs)
2271{
2272 if (bs->backing_hd && bs->backing_hd->encrypted)
2273 return 1;
2274 return bs->encrypted;
2275}
2276
c0f4ce77
AL
2277int bdrv_key_required(BlockDriverState *bs)
2278{
2279 BlockDriverState *backing_hd = bs->backing_hd;
2280
2281 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2282 return 1;
2283 return (bs->encrypted && !bs->valid_key);
2284}
2285
ea2384d3
FB
2286int bdrv_set_key(BlockDriverState *bs, const char *key)
2287{
2288 int ret;
2289 if (bs->backing_hd && bs->backing_hd->encrypted) {
2290 ret = bdrv_set_key(bs->backing_hd, key);
2291 if (ret < 0)
2292 return ret;
2293 if (!bs->encrypted)
2294 return 0;
2295 }
fd04a2ae
SH
2296 if (!bs->encrypted) {
2297 return -EINVAL;
2298 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2299 return -ENOMEDIUM;
2300 }
c0f4ce77 2301 ret = bs->drv->bdrv_set_key(bs, key);
bb5fc20f
AL
2302 if (ret < 0) {
2303 bs->valid_key = 0;
2304 } else if (!bs->valid_key) {
2305 bs->valid_key = 1;
2306 /* call the change callback now, we skipped it on open */
7d4b4ba5 2307 bdrv_dev_change_media_cb(bs, true);
bb5fc20f 2308 }
c0f4ce77 2309 return ret;
ea2384d3
FB
2310}
2311
2312void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
2313{
19cb3738 2314 if (!bs->drv) {
ea2384d3
FB
2315 buf[0] = '\0';
2316 } else {
2317 pstrcpy(buf, buf_size, bs->drv->format_name);
2318 }
2319}
2320
5fafdf24 2321void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
ea2384d3
FB
2322 void *opaque)
2323{
2324 BlockDriver *drv;
2325
8a22f02a 2326 QLIST_FOREACH(drv, &bdrv_drivers, list) {
ea2384d3
FB
2327 it(opaque, drv->format_name);
2328 }
2329}
2330
b338082b
FB
2331BlockDriverState *bdrv_find(const char *name)
2332{
2333 BlockDriverState *bs;
2334
1b7bdbc1
SH
2335 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2336 if (!strcmp(name, bs->device_name)) {
b338082b 2337 return bs;
1b7bdbc1 2338 }
b338082b
FB
2339 }
2340 return NULL;
2341}
2342
2f399b0a
MA
2343BlockDriverState *bdrv_next(BlockDriverState *bs)
2344{
2345 if (!bs) {
2346 return QTAILQ_FIRST(&bdrv_states);
2347 }
2348 return QTAILQ_NEXT(bs, list);
2349}
2350
51de9760 2351void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
81d0912d
FB
2352{
2353 BlockDriverState *bs;
2354
1b7bdbc1 2355 QTAILQ_FOREACH(bs, &bdrv_states, list) {
51de9760 2356 it(opaque, bs);
81d0912d
FB
2357 }
2358}
2359
ea2384d3
FB
2360const char *bdrv_get_device_name(BlockDriverState *bs)
2361{
2362 return bs->device_name;
2363}
2364
c6ca28d6
AL
2365void bdrv_flush_all(void)
2366{
2367 BlockDriverState *bs;
2368
1b7bdbc1 2369 QTAILQ_FOREACH(bs, &bdrv_states, list) {
29cdb251 2370 bdrv_flush(bs);
1b7bdbc1 2371 }
c6ca28d6
AL
2372}
2373
f2feebbd
KW
2374int bdrv_has_zero_init(BlockDriverState *bs)
2375{
2376 assert(bs->drv);
2377
336c1c12
KW
2378 if (bs->drv->bdrv_has_zero_init) {
2379 return bs->drv->bdrv_has_zero_init(bs);
f2feebbd
KW
2380 }
2381
2382 return 1;
2383}
2384
376ae3f1
SH
2385typedef struct BdrvCoIsAllocatedData {
2386 BlockDriverState *bs;
2387 int64_t sector_num;
2388 int nb_sectors;
2389 int *pnum;
2390 int ret;
2391 bool done;
2392} BdrvCoIsAllocatedData;
2393
f58c7b35
TS
2394/*
2395 * Returns true iff the specified sector is present in the disk image. Drivers
2396 * not implementing the functionality are assumed to not support backing files,
2397 * hence all their sectors are reported as allocated.
2398 *
bd9533e3
SH
2399 * If 'sector_num' is beyond the end of the disk image the return value is 0
2400 * and 'pnum' is set to 0.
2401 *
f58c7b35
TS
2402 * 'pnum' is set to the number of sectors (including and immediately following
2403 * the specified sector) that are known to be in the same
2404 * allocated/unallocated state.
2405 *
bd9533e3
SH
2406 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
2407 * beyond the end of the disk image it will be clamped.
f58c7b35 2408 */
060f51c9
SH
2409int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2410 int nb_sectors, int *pnum)
f58c7b35 2411{
bd9533e3
SH
2412 int64_t n;
2413
2414 if (sector_num >= bs->total_sectors) {
2415 *pnum = 0;
2416 return 0;
2417 }
2418
2419 n = bs->total_sectors - sector_num;
2420 if (n < nb_sectors) {
2421 nb_sectors = n;
2422 }
2423
6aebab14 2424 if (!bs->drv->bdrv_co_is_allocated) {
bd9533e3 2425 *pnum = nb_sectors;
f58c7b35
TS
2426 return 1;
2427 }
6aebab14 2428
060f51c9
SH
2429 return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2430}
2431
2432/* Coroutine wrapper for bdrv_is_allocated() */
2433static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2434{
2435 BdrvCoIsAllocatedData *data = opaque;
2436 BlockDriverState *bs = data->bs;
2437
2438 data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2439 data->pnum);
2440 data->done = true;
2441}
2442
2443/*
2444 * Synchronous wrapper around bdrv_co_is_allocated().
2445 *
2446 * See bdrv_co_is_allocated() for details.
2447 */
2448int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2449 int *pnum)
2450{
6aebab14
SH
2451 Coroutine *co;
2452 BdrvCoIsAllocatedData data = {
2453 .bs = bs,
2454 .sector_num = sector_num,
2455 .nb_sectors = nb_sectors,
2456 .pnum = pnum,
2457 .done = false,
2458 };
2459
2460 co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2461 qemu_coroutine_enter(co, &data);
2462 while (!data.done) {
2463 qemu_aio_wait();
2464 }
2465 return data.ret;
f58c7b35
TS
2466}
2467
b2023818 2468BlockInfoList *qmp_query_block(Error **errp)
b338082b 2469{
b2023818 2470 BlockInfoList *head = NULL, *cur_item = NULL;
b338082b
FB
2471 BlockDriverState *bs;
2472
1b7bdbc1 2473 QTAILQ_FOREACH(bs, &bdrv_states, list) {
b2023818 2474 BlockInfoList *info = g_malloc0(sizeof(*info));
d15e5465 2475
b2023818
LC
2476 info->value = g_malloc0(sizeof(*info->value));
2477 info->value->device = g_strdup(bs->device_name);
2478 info->value->type = g_strdup("unknown");
2479 info->value->locked = bdrv_dev_is_medium_locked(bs);
2480 info->value->removable = bdrv_dev_has_removable_media(bs);
d15e5465 2481
e4def80b 2482 if (bdrv_dev_has_removable_media(bs)) {
b2023818
LC
2483 info->value->has_tray_open = true;
2484 info->value->tray_open = bdrv_dev_is_tray_open(bs);
e4def80b 2485 }
f04ef601
LC
2486
2487 if (bdrv_iostatus_is_enabled(bs)) {
b2023818
LC
2488 info->value->has_io_status = true;
2489 info->value->io_status = bs->iostatus;
f04ef601
LC
2490 }
2491
19cb3738 2492 if (bs->drv) {
b2023818
LC
2493 info->value->has_inserted = true;
2494 info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2495 info->value->inserted->file = g_strdup(bs->filename);
2496 info->value->inserted->ro = bs->read_only;
2497 info->value->inserted->drv = g_strdup(bs->drv->format_name);
2498 info->value->inserted->encrypted = bs->encrypted;
2499 if (bs->backing_file[0]) {
2500 info->value->inserted->has_backing_file = true;
2501 info->value->inserted->backing_file = g_strdup(bs->backing_file);
376253ec 2502 }
727f005e
ZYW
2503
2504 if (bs->io_limits_enabled) {
2505 info->value->inserted->bps =
2506 bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2507 info->value->inserted->bps_rd =
2508 bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2509 info->value->inserted->bps_wr =
2510 bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2511 info->value->inserted->iops =
2512 bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2513 info->value->inserted->iops_rd =
2514 bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2515 info->value->inserted->iops_wr =
2516 bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2517 }
b2023818 2518 }
d15e5465 2519
b2023818
LC
2520 /* XXX: waiting for the qapi to support GSList */
2521 if (!cur_item) {
2522 head = cur_item = info;
2523 } else {
2524 cur_item->next = info;
2525 cur_item = info;
b338082b 2526 }
b338082b 2527 }
d15e5465 2528
b2023818 2529 return head;
b338082b 2530}
a36e69dd 2531
f11f57e4
LC
2532/* Consider exposing this as a full fledged QMP command */
2533static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2534{
2535 BlockStats *s;
2536
2537 s = g_malloc0(sizeof(*s));
2538
2539 if (bs->device_name[0]) {
2540 s->has_device = true;
2541 s->device = g_strdup(bs->device_name);
294cc35f
KW
2542 }
2543
f11f57e4
LC
2544 s->stats = g_malloc0(sizeof(*s->stats));
2545 s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2546 s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2547 s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2548 s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2549 s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2550 s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2551 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2552 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2553 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2554
294cc35f 2555 if (bs->file) {
f11f57e4
LC
2556 s->has_parent = true;
2557 s->parent = qmp_query_blockstat(bs->file, NULL);
294cc35f
KW
2558 }
2559
f11f57e4 2560 return s;
294cc35f
KW
2561}
2562
f11f57e4 2563BlockStatsList *qmp_query_blockstats(Error **errp)
218a536a 2564{
f11f57e4 2565 BlockStatsList *head = NULL, *cur_item = NULL;
a36e69dd
TS
2566 BlockDriverState *bs;
2567
1b7bdbc1 2568 QTAILQ_FOREACH(bs, &bdrv_states, list) {
f11f57e4
LC
2569 BlockStatsList *info = g_malloc0(sizeof(*info));
2570 info->value = qmp_query_blockstat(bs, NULL);
2571
2572 /* XXX: waiting for the qapi to support GSList */
2573 if (!cur_item) {
2574 head = cur_item = info;
2575 } else {
2576 cur_item->next = info;
2577 cur_item = info;
2578 }
a36e69dd 2579 }
218a536a 2580
f11f57e4 2581 return head;
a36e69dd 2582}
ea2384d3 2583
045df330
AL
2584const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2585{
2586 if (bs->backing_hd && bs->backing_hd->encrypted)
2587 return bs->backing_file;
2588 else if (bs->encrypted)
2589 return bs->filename;
2590 else
2591 return NULL;
2592}
2593
5fafdf24 2594void bdrv_get_backing_filename(BlockDriverState *bs,
83f64091
FB
2595 char *filename, int filename_size)
2596{
3574c608 2597 pstrcpy(filename, filename_size, bs->backing_file);
83f64091
FB
2598}
2599
5fafdf24 2600int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
faea38e7
FB
2601 const uint8_t *buf, int nb_sectors)
2602{
2603 BlockDriver *drv = bs->drv;
2604 if (!drv)
19cb3738 2605 return -ENOMEDIUM;
faea38e7
FB
2606 if (!drv->bdrv_write_compressed)
2607 return -ENOTSUP;
fbb7b4e0
KW
2608 if (bdrv_check_request(bs, sector_num, nb_sectors))
2609 return -EIO;
a55eb92c 2610
c6d22830 2611 if (bs->dirty_bitmap) {
7cd1e32a 2612 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2613 }
a55eb92c 2614
faea38e7
FB
2615 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2616}
3b46e624 2617
faea38e7
FB
2618int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2619{
2620 BlockDriver *drv = bs->drv;
2621 if (!drv)
19cb3738 2622 return -ENOMEDIUM;
faea38e7
FB
2623 if (!drv->bdrv_get_info)
2624 return -ENOTSUP;
2625 memset(bdi, 0, sizeof(*bdi));
2626 return drv->bdrv_get_info(bs, bdi);
2627}
2628
45566e9c
CH
2629int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2630 int64_t pos, int size)
178e08a5
AL
2631{
2632 BlockDriver *drv = bs->drv;
2633 if (!drv)
2634 return -ENOMEDIUM;
7cdb1f6d
MK
2635 if (drv->bdrv_save_vmstate)
2636 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2637 if (bs->file)
2638 return bdrv_save_vmstate(bs->file, buf, pos, size);
2639 return -ENOTSUP;
178e08a5
AL
2640}
2641
45566e9c
CH
2642int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2643 int64_t pos, int size)
178e08a5
AL
2644{
2645 BlockDriver *drv = bs->drv;
2646 if (!drv)
2647 return -ENOMEDIUM;
7cdb1f6d
MK
2648 if (drv->bdrv_load_vmstate)
2649 return drv->bdrv_load_vmstate(bs, buf, pos, size);
2650 if (bs->file)
2651 return bdrv_load_vmstate(bs->file, buf, pos, size);
2652 return -ENOTSUP;
178e08a5
AL
2653}
2654
8b9b0cc2
KW
2655void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2656{
2657 BlockDriver *drv = bs->drv;
2658
2659 if (!drv || !drv->bdrv_debug_event) {
2660 return;
2661 }
2662
2663 return drv->bdrv_debug_event(bs, event);
2664
2665}
2666
faea38e7
FB
2667/**************************************************************/
2668/* handling of snapshots */
2669
feeee5ac
MDCF
2670int bdrv_can_snapshot(BlockDriverState *bs)
2671{
2672 BlockDriver *drv = bs->drv;
07b70bfb 2673 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
feeee5ac
MDCF
2674 return 0;
2675 }
2676
2677 if (!drv->bdrv_snapshot_create) {
2678 if (bs->file != NULL) {
2679 return bdrv_can_snapshot(bs->file);
2680 }
2681 return 0;
2682 }
2683
2684 return 1;
2685}
2686
199630b6
BS
2687int bdrv_is_snapshot(BlockDriverState *bs)
2688{
2689 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2690}
2691
f9092b10
MA
2692BlockDriverState *bdrv_snapshots(void)
2693{
2694 BlockDriverState *bs;
2695
3ac906f7 2696 if (bs_snapshots) {
f9092b10 2697 return bs_snapshots;
3ac906f7 2698 }
f9092b10
MA
2699
2700 bs = NULL;
2701 while ((bs = bdrv_next(bs))) {
2702 if (bdrv_can_snapshot(bs)) {
3ac906f7
MA
2703 bs_snapshots = bs;
2704 return bs;
f9092b10
MA
2705 }
2706 }
2707 return NULL;
f9092b10
MA
2708}
2709
5fafdf24 2710int bdrv_snapshot_create(BlockDriverState *bs,
faea38e7
FB
2711 QEMUSnapshotInfo *sn_info)
2712{
2713 BlockDriver *drv = bs->drv;
2714 if (!drv)
19cb3738 2715 return -ENOMEDIUM;
7cdb1f6d
MK
2716 if (drv->bdrv_snapshot_create)
2717 return drv->bdrv_snapshot_create(bs, sn_info);
2718 if (bs->file)
2719 return bdrv_snapshot_create(bs->file, sn_info);
2720 return -ENOTSUP;
faea38e7
FB
2721}
2722
5fafdf24 2723int bdrv_snapshot_goto(BlockDriverState *bs,
faea38e7
FB
2724 const char *snapshot_id)
2725{
2726 BlockDriver *drv = bs->drv;
7cdb1f6d
MK
2727 int ret, open_ret;
2728
faea38e7 2729 if (!drv)
19cb3738 2730 return -ENOMEDIUM;
7cdb1f6d
MK
2731 if (drv->bdrv_snapshot_goto)
2732 return drv->bdrv_snapshot_goto(bs, snapshot_id);
2733
2734 if (bs->file) {
2735 drv->bdrv_close(bs);
2736 ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2737 open_ret = drv->bdrv_open(bs, bs->open_flags);
2738 if (open_ret < 0) {
2739 bdrv_delete(bs->file);
2740 bs->drv = NULL;
2741 return open_ret;
2742 }
2743 return ret;
2744 }
2745
2746 return -ENOTSUP;
faea38e7
FB
2747}
2748
2749int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2750{
2751 BlockDriver *drv = bs->drv;
2752 if (!drv)
19cb3738 2753 return -ENOMEDIUM;
7cdb1f6d
MK
2754 if (drv->bdrv_snapshot_delete)
2755 return drv->bdrv_snapshot_delete(bs, snapshot_id);
2756 if (bs->file)
2757 return bdrv_snapshot_delete(bs->file, snapshot_id);
2758 return -ENOTSUP;
faea38e7
FB
2759}
2760
5fafdf24 2761int bdrv_snapshot_list(BlockDriverState *bs,
faea38e7
FB
2762 QEMUSnapshotInfo **psn_info)
2763{
2764 BlockDriver *drv = bs->drv;
2765 if (!drv)
19cb3738 2766 return -ENOMEDIUM;
7cdb1f6d
MK
2767 if (drv->bdrv_snapshot_list)
2768 return drv->bdrv_snapshot_list(bs, psn_info);
2769 if (bs->file)
2770 return bdrv_snapshot_list(bs->file, psn_info);
2771 return -ENOTSUP;
faea38e7
FB
2772}
2773
51ef6727 2774int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2775 const char *snapshot_name)
2776{
2777 BlockDriver *drv = bs->drv;
2778 if (!drv) {
2779 return -ENOMEDIUM;
2780 }
2781 if (!bs->read_only) {
2782 return -EINVAL;
2783 }
2784 if (drv->bdrv_snapshot_load_tmp) {
2785 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2786 }
2787 return -ENOTSUP;
2788}
2789
e8a6bb9c
MT
2790BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2791 const char *backing_file)
2792{
2793 if (!bs->drv) {
2794 return NULL;
2795 }
2796
2797 if (bs->backing_hd) {
2798 if (strcmp(bs->backing_file, backing_file) == 0) {
2799 return bs->backing_hd;
2800 } else {
2801 return bdrv_find_backing_image(bs->backing_hd, backing_file);
2802 }
2803 }
2804
2805 return NULL;
2806}
2807
#define NB_SUFFIXES 4

/* Format @size into @buf as a short human-readable string.  Values up to
 * 999 are printed verbatim; larger values use binary (1024-based) multiples
 * with the suffixes K, M, G, T.  Returns @buf. */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t base;
    int i;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    base = 1024;
    for (i = 0; i < NB_SUFFIXES; i++) {
        if (size < (10 * base)) {
            /* Below 10 units: keep one decimal place. */
            snprintf(buf, buf_size, "%0.1f%c",
                     (double)size / base, suffixes[i]);
            break;
        } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
            /* Round to the nearest whole unit. */
            snprintf(buf, buf_size, "%" PRId64 "%c",
                     ((size + (base >> 1)) / base), suffixes[i]);
            break;
        }
        base = base * 1024;
    }
    return buf;
}
2837
2838char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2839{
2840 char buf1[128], date_buf[128], clock_buf[128];
3b9f94e1
FB
2841#ifdef _WIN32
2842 struct tm *ptm;
2843#else
faea38e7 2844 struct tm tm;
3b9f94e1 2845#endif
faea38e7
FB
2846 time_t ti;
2847 int64_t secs;
2848
2849 if (!sn) {
5fafdf24
TS
2850 snprintf(buf, buf_size,
2851 "%-10s%-20s%7s%20s%15s",
faea38e7
FB
2852 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2853 } else {
2854 ti = sn->date_sec;
3b9f94e1
FB
2855#ifdef _WIN32
2856 ptm = localtime(&ti);
2857 strftime(date_buf, sizeof(date_buf),
2858 "%Y-%m-%d %H:%M:%S", ptm);
2859#else
faea38e7
FB
2860 localtime_r(&ti, &tm);
2861 strftime(date_buf, sizeof(date_buf),
2862 "%Y-%m-%d %H:%M:%S", &tm);
3b9f94e1 2863#endif
faea38e7
FB
2864 secs = sn->vm_clock_nsec / 1000000000;
2865 snprintf(clock_buf, sizeof(clock_buf),
2866 "%02d:%02d:%02d.%03d",
2867 (int)(secs / 3600),
2868 (int)((secs / 60) % 60),
5fafdf24 2869 (int)(secs % 60),
faea38e7
FB
2870 (int)((sn->vm_clock_nsec / 1000000) % 1000));
2871 snprintf(buf, buf_size,
5fafdf24 2872 "%-10s%-20s%7s%20s%15s",
faea38e7
FB
2873 sn->id_str, sn->name,
2874 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2875 date_buf,
2876 clock_buf);
2877 }
2878 return buf;
2879}
2880
ea2384d3 2881/**************************************************************/
83f64091 2882/* async I/Os */
ea2384d3 2883
3b69e4b9 2884BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
f141eafe 2885 QEMUIOVector *qiov, int nb_sectors,
3b69e4b9 2886 BlockDriverCompletionFunc *cb, void *opaque)
83f64091 2887{
bbf0a440
SH
2888 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2889
b2a61371 2890 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
8c5873d6 2891 cb, opaque, false);
ea2384d3
FB
2892}
2893
f141eafe
AL
2894BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2895 QEMUIOVector *qiov, int nb_sectors,
2896 BlockDriverCompletionFunc *cb, void *opaque)
ea2384d3 2897{
bbf0a440
SH
2898 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2899
1a6e115b 2900 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
8c5873d6 2901 cb, opaque, true);
83f64091
FB
2902}
2903
40b4f539
KW
2904
2905typedef struct MultiwriteCB {
2906 int error;
2907 int num_requests;
2908 int num_callbacks;
2909 struct {
2910 BlockDriverCompletionFunc *cb;
2911 void *opaque;
2912 QEMUIOVector *free_qiov;
40b4f539
KW
2913 } callbacks[];
2914} MultiwriteCB;
2915
2916static void multiwrite_user_cb(MultiwriteCB *mcb)
2917{
2918 int i;
2919
2920 for (i = 0; i < mcb->num_callbacks; i++) {
2921 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
1e1ea48d
SH
2922 if (mcb->callbacks[i].free_qiov) {
2923 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2924 }
7267c094 2925 g_free(mcb->callbacks[i].free_qiov);
40b4f539
KW
2926 }
2927}
2928
2929static void multiwrite_cb(void *opaque, int ret)
2930{
2931 MultiwriteCB *mcb = opaque;
2932
6d519a5f
SH
2933 trace_multiwrite_cb(mcb, ret);
2934
cb6d3ca0 2935 if (ret < 0 && !mcb->error) {
40b4f539 2936 mcb->error = ret;
40b4f539
KW
2937 }
2938
2939 mcb->num_requests--;
2940 if (mcb->num_requests == 0) {
de189a1b 2941 multiwrite_user_cb(mcb);
7267c094 2942 g_free(mcb);
40b4f539
KW
2943 }
2944}
2945
2946static int multiwrite_req_compare(const void *a, const void *b)
2947{
77be4366
CH
2948 const BlockRequest *req1 = a, *req2 = b;
2949
2950 /*
2951 * Note that we can't simply subtract req2->sector from req1->sector
2952 * here as that could overflow the return value.
2953 */
2954 if (req1->sector > req2->sector) {
2955 return 1;
2956 } else if (req1->sector < req2->sector) {
2957 return -1;
2958 } else {
2959 return 0;
2960 }
40b4f539
KW
2961}
2962
2963/*
2964 * Takes a bunch of requests and tries to merge them. Returns the number of
2965 * requests that remain after merging.
2966 */
2967static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
2968 int num_reqs, MultiwriteCB *mcb)
2969{
2970 int i, outidx;
2971
2972 // Sort requests by start sector
2973 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
2974
2975 // Check if adjacent requests touch the same clusters. If so, combine them,
2976 // filling up gaps with zero sectors.
2977 outidx = 0;
2978 for (i = 1; i < num_reqs; i++) {
2979 int merge = 0;
2980 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
2981
b6a127a1 2982 // Handle exactly sequential writes and overlapping writes.
40b4f539
KW
2983 if (reqs[i].sector <= oldreq_last) {
2984 merge = 1;
2985 }
2986
e2a305fb
CH
2987 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
2988 merge = 0;
2989 }
2990
40b4f539
KW
2991 if (merge) {
2992 size_t size;
7267c094 2993 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
40b4f539
KW
2994 qemu_iovec_init(qiov,
2995 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
2996
2997 // Add the first request to the merged one. If the requests are
2998 // overlapping, drop the last sectors of the first request.
2999 size = (reqs[i].sector - reqs[outidx].sector) << 9;
3000 qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
3001
b6a127a1
PB
3002 // We should need to add any zeros between the two requests
3003 assert (reqs[i].sector <= oldreq_last);
40b4f539
KW
3004
3005 // Add the second request
3006 qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
3007
cbf1dff2 3008 reqs[outidx].nb_sectors = qiov->size >> 9;
40b4f539
KW
3009 reqs[outidx].qiov = qiov;
3010
3011 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3012 } else {
3013 outidx++;
3014 reqs[outidx].sector = reqs[i].sector;
3015 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3016 reqs[outidx].qiov = reqs[i].qiov;
3017 }
3018 }
3019
3020 return outidx + 1;
3021}
3022
3023/*
3024 * Submit multiple AIO write requests at once.
3025 *
3026 * On success, the function returns 0 and all requests in the reqs array have
3027 * been submitted. In error case this function returns -1, and any of the
3028 * requests may or may not be submitted yet. In particular, this means that the
3029 * callback will be called for some of the requests, for others it won't. The
3030 * caller must check the error field of the BlockRequest to wait for the right
3031 * callbacks (if error != 0, no callback will be called).
3032 *
3033 * The implementation may modify the contents of the reqs array, e.g. to merge
3034 * requests. However, the fields opaque and error are left unmodified as they
3035 * are used to signal failure for a single request to the caller.
3036 */
3037int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3038{
40b4f539
KW
3039 MultiwriteCB *mcb;
3040 int i;
3041
301db7c2
RH
3042 /* don't submit writes if we don't have a medium */
3043 if (bs->drv == NULL) {
3044 for (i = 0; i < num_reqs; i++) {
3045 reqs[i].error = -ENOMEDIUM;
3046 }
3047 return -1;
3048 }
3049
40b4f539
KW
3050 if (num_reqs == 0) {
3051 return 0;
3052 }
3053
3054 // Create MultiwriteCB structure
7267c094 3055 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
40b4f539
KW
3056 mcb->num_requests = 0;
3057 mcb->num_callbacks = num_reqs;
3058
3059 for (i = 0; i < num_reqs; i++) {
3060 mcb->callbacks[i].cb = reqs[i].cb;
3061 mcb->callbacks[i].opaque = reqs[i].opaque;
3062 }
3063
3064 // Check for mergable requests
3065 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3066
6d519a5f
SH
3067 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3068
df9309fb
PB
3069 /* Run the aio requests. */
3070 mcb->num_requests = num_reqs;
40b4f539 3071 for (i = 0; i < num_reqs; i++) {
ad54ae80 3072 bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
40b4f539 3073 reqs[i].nb_sectors, multiwrite_cb, mcb);
40b4f539
KW
3074 }
3075
3076 return 0;
40b4f539
KW
3077}
3078
83f64091 3079void bdrv_aio_cancel(BlockDriverAIOCB *acb)
83f64091 3080{
6bbff9a0 3081 acb->pool->cancel(acb);
83f64091
FB
3082}
3083
98f90dba
ZYW
3084/* block I/O throttling */
3085static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3086 bool is_write, double elapsed_time, uint64_t *wait)
3087{
3088 uint64_t bps_limit = 0;
3089 double bytes_limit, bytes_base, bytes_res;
3090 double slice_time, wait_time;
3091
3092 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3093 bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3094 } else if (bs->io_limits.bps[is_write]) {
3095 bps_limit = bs->io_limits.bps[is_write];
3096 } else {
3097 if (wait) {
3098 *wait = 0;
3099 }
3100
3101 return false;
3102 }
3103
3104 slice_time = bs->slice_end - bs->slice_start;
3105 slice_time /= (NANOSECONDS_PER_SECOND);
3106 bytes_limit = bps_limit * slice_time;
3107 bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3108 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3109 bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3110 }
3111
3112 /* bytes_base: the bytes of data which have been read/written; and
3113 * it is obtained from the history statistic info.
3114 * bytes_res: the remaining bytes of data which need to be read/written.
3115 * (bytes_base + bytes_res) / bps_limit: used to calcuate
3116 * the total time for completing reading/writting all data.
3117 */
3118 bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
3119
3120 if (bytes_base + bytes_res <= bytes_limit) {
3121 if (wait) {
3122 *wait = 0;
3123 }
3124
3125 return false;
3126 }
3127
3128 /* Calc approx time to dispatch */
3129 wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3130
3131 /* When the I/O rate at runtime exceeds the limits,
3132 * bs->slice_end need to be extended in order that the current statistic
3133 * info can be kept until the timer fire, so it is increased and tuned
3134 * based on the result of experiment.
3135 */
3136 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3137 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3138 if (wait) {
3139 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3140 }
3141
3142 return true;
3143}
3144
3145static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3146 double elapsed_time, uint64_t *wait)
3147{
3148 uint64_t iops_limit = 0;
3149 double ios_limit, ios_base;
3150 double slice_time, wait_time;
3151
3152 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3153 iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3154 } else if (bs->io_limits.iops[is_write]) {
3155 iops_limit = bs->io_limits.iops[is_write];
3156 } else {
3157 if (wait) {
3158 *wait = 0;
3159 }
3160
3161 return false;
3162 }
3163
3164 slice_time = bs->slice_end - bs->slice_start;
3165 slice_time /= (NANOSECONDS_PER_SECOND);
3166 ios_limit = iops_limit * slice_time;
3167 ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3168 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3169 ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
3170 }
3171
3172 if (ios_base + 1 <= ios_limit) {
3173 if (wait) {
3174 *wait = 0;
3175 }
3176
3177 return false;
3178 }
3179
3180 /* Calc approx time to dispatch */
3181 wait_time = (ios_base + 1) / iops_limit;
3182 if (wait_time > elapsed_time) {
3183 wait_time = wait_time - elapsed_time;
3184 } else {
3185 wait_time = 0;
3186 }
3187
3188 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3189 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3190 if (wait) {
3191 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3192 }
3193
3194 return true;
3195}
3196
3197static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3198 bool is_write, int64_t *wait)
3199{
3200 int64_t now, max_wait;
3201 uint64_t bps_wait = 0, iops_wait = 0;
3202 double elapsed_time;
3203 int bps_ret, iops_ret;
3204
3205 now = qemu_get_clock_ns(vm_clock);
3206 if ((bs->slice_start < now)
3207 && (bs->slice_end > now)) {
3208 bs->slice_end = now + bs->slice_time;
3209 } else {
3210 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
3211 bs->slice_start = now;
3212 bs->slice_end = now + bs->slice_time;
3213
3214 bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
3215 bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3216
3217 bs->io_base.ios[is_write] = bs->nr_ops[is_write];
3218 bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
3219 }
3220
3221 elapsed_time = now - bs->slice_start;
3222 elapsed_time /= (NANOSECONDS_PER_SECOND);
3223
3224 bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
3225 is_write, elapsed_time, &bps_wait);
3226 iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3227 elapsed_time, &iops_wait);
3228 if (bps_ret || iops_ret) {
3229 max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3230 if (wait) {
3231 *wait = max_wait;
3232 }
3233
3234 now = qemu_get_clock_ns(vm_clock);
3235 if (bs->slice_end < now + max_wait) {
3236 bs->slice_end = now + max_wait;
3237 }
3238
3239 return true;
3240 }
3241
3242 if (wait) {
3243 *wait = 0;
3244 }
3245
3246 return false;
3247}
ce1a14dc 3248
83f64091
FB
3249/**************************************************************/
3250/* async block device emulation */
3251
c16b5a2c
CH
/* Per-request state for emulating AIO on top of a driver's synchronous
 * bdrv_read/bdrv_write callbacks. */
typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;   /* generic ACB header (bs, cb, opaque) */
    QEMUBH *bh;                /* bottom half that delivers completion */
    int ret;                   /* result of the synchronous operation */
    /* vector translation state */
    QEMUIOVector *qiov;        /* caller's scatter/gather list */
    uint8_t *bounce;           /* linear bounce buffer for the transfer */
    int is_write;              /* nonzero for writes, zero for reads */
} BlockDriverAIOCBSync;
3261
3262static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3263{
b666d239
KW
3264 BlockDriverAIOCBSync *acb =
3265 container_of(blockacb, BlockDriverAIOCBSync, common);
6a7ad299 3266 qemu_bh_delete(acb->bh);
36afc451 3267 acb->bh = NULL;
c16b5a2c
CH
3268 qemu_aio_release(acb);
3269}
3270
/* ACB pool for synchronous-emulation requests (BlockDriverAIOCBSync). */
static AIOPool bdrv_em_aio_pool = {
    .aiocb_size = sizeof(BlockDriverAIOCBSync),
    .cancel = bdrv_aio_cancel_em,
};
3275
ce1a14dc 3276static void bdrv_aio_bh_cb(void *opaque)
83f64091 3277{
ce1a14dc 3278 BlockDriverAIOCBSync *acb = opaque;
f141eafe 3279
f141eafe
AL
3280 if (!acb->is_write)
3281 qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
ceb42de8 3282 qemu_vfree(acb->bounce);
ce1a14dc 3283 acb->common.cb(acb->common.opaque, acb->ret);
6a7ad299 3284 qemu_bh_delete(acb->bh);
36afc451 3285 acb->bh = NULL;
ce1a14dc 3286 qemu_aio_release(acb);
83f64091 3287}
beac80cd 3288
f141eafe
AL
3289static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3290 int64_t sector_num,
3291 QEMUIOVector *qiov,
3292 int nb_sectors,
3293 BlockDriverCompletionFunc *cb,
3294 void *opaque,
3295 int is_write)
3296
83f64091 3297{
ce1a14dc 3298 BlockDriverAIOCBSync *acb;
ce1a14dc 3299
c16b5a2c 3300 acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
f141eafe
AL
3301 acb->is_write = is_write;
3302 acb->qiov = qiov;
e268ca52 3303 acb->bounce = qemu_blockalign(bs, qiov->size);
3f3aace8 3304 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
f141eafe
AL
3305
3306 if (is_write) {
3307 qemu_iovec_to_buffer(acb->qiov, acb->bounce);
1ed20acf 3308 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
f141eafe 3309 } else {
1ed20acf 3310 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
f141eafe
AL
3311 }
3312
ce1a14dc 3313 qemu_bh_schedule(acb->bh);
f141eafe 3314
ce1a14dc 3315 return &acb->common;
beac80cd
FB
3316}
3317
f141eafe
AL
3318static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3319 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 3320 BlockDriverCompletionFunc *cb, void *opaque)
beac80cd 3321{
f141eafe
AL
3322 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3323}
83f64091 3324
f141eafe
AL
3325static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3326 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3327 BlockDriverCompletionFunc *cb, void *opaque)
3328{
3329 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
beac80cd 3330}
beac80cd 3331
68485420
KW
3332
/* Per-request state for emulating AIO via a coroutine. */
typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;   /* generic ACB header (bs, cb, opaque) */
    BlockRequest req;          /* request parameters and error result */
    bool is_write;             /* true for writes, false for reads */
    QEMUBH* bh;                /* bottom half that delivers completion */
} BlockDriverAIOCBCoroutine;
3339
/* Cancel for coroutine-emulated AIO: there is no way to abort the
 * coroutine, so wait for all outstanding AIO to complete instead. */
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    qemu_aio_flush();
}
3344
/* ACB pool for coroutine-emulation requests (BlockDriverAIOCBCoroutine). */
static AIOPool bdrv_em_co_aio_pool = {
    .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
    .cancel = bdrv_aio_co_cancel_em,
};
3349
35246a68 3350static void bdrv_co_em_bh(void *opaque)
68485420
KW
3351{
3352 BlockDriverAIOCBCoroutine *acb = opaque;
3353
3354 acb->common.cb(acb->common.opaque, acb->req.error);
3355 qemu_bh_delete(acb->bh);
3356 qemu_aio_release(acb);
3357}
3358
b2a61371
SH
3359/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3360static void coroutine_fn bdrv_co_do_rw(void *opaque)
3361{
3362 BlockDriverAIOCBCoroutine *acb = opaque;
3363 BlockDriverState *bs = acb->common.bs;
3364
3365 if (!acb->is_write) {
3366 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
470c0504 3367 acb->req.nb_sectors, acb->req.qiov, 0);
b2a61371
SH
3368 } else {
3369 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
f08f2dda 3370 acb->req.nb_sectors, acb->req.qiov, 0);
b2a61371
SH
3371 }
3372
35246a68 3373 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
b2a61371
SH
3374 qemu_bh_schedule(acb->bh);
3375}
3376
68485420
KW
3377static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3378 int64_t sector_num,
3379 QEMUIOVector *qiov,
3380 int nb_sectors,
3381 BlockDriverCompletionFunc *cb,
3382 void *opaque,
8c5873d6 3383 bool is_write)
68485420
KW
3384{
3385 Coroutine *co;
3386 BlockDriverAIOCBCoroutine *acb;
3387
3388 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3389 acb->req.sector = sector_num;
3390 acb->req.nb_sectors = nb_sectors;
3391 acb->req.qiov = qiov;
3392 acb->is_write = is_write;
3393
8c5873d6 3394 co = qemu_coroutine_create(bdrv_co_do_rw);
68485420
KW
3395 qemu_coroutine_enter(co, acb);
3396
3397 return &acb->common;
3398}
3399
07f07615 3400static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
b2e12bc6 3401{
07f07615
PB
3402 BlockDriverAIOCBCoroutine *acb = opaque;
3403 BlockDriverState *bs = acb->common.bs;
b2e12bc6 3404
07f07615
PB
3405 acb->req.error = bdrv_co_flush(bs);
3406 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
b2e12bc6 3407 qemu_bh_schedule(acb->bh);
b2e12bc6
CH
3408}
3409
07f07615 3410BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
016f5cf6
AG
3411 BlockDriverCompletionFunc *cb, void *opaque)
3412{
07f07615 3413 trace_bdrv_aio_flush(bs, opaque);
016f5cf6 3414
07f07615
PB
3415 Coroutine *co;
3416 BlockDriverAIOCBCoroutine *acb;
016f5cf6 3417
07f07615
PB
3418 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3419 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3420 qemu_coroutine_enter(co, acb);
016f5cf6 3421
016f5cf6
AG
3422 return &acb->common;
3423}
3424
4265d620
PB
3425static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3426{
3427 BlockDriverAIOCBCoroutine *acb = opaque;
3428 BlockDriverState *bs = acb->common.bs;
3429
3430 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3431 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3432 qemu_bh_schedule(acb->bh);
3433}
3434
3435BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3436 int64_t sector_num, int nb_sectors,
3437 BlockDriverCompletionFunc *cb, void *opaque)
3438{
3439 Coroutine *co;
3440 BlockDriverAIOCBCoroutine *acb;
3441
3442 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3443
3444 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3445 acb->req.sector = sector_num;
3446 acb->req.nb_sectors = nb_sectors;
3447 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3448 qemu_coroutine_enter(co, acb);
3449
3450 return &acb->common;
3451}
3452
ea2384d3
FB
/* Register all built-in block drivers (module init hook). */
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}
ce1a14dc 3457
eb852011
MA
/* Like bdrv_init(), but restrict format probing to the configured
 * driver whitelist. */
void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}
3463
c16b5a2c
CH
3464void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3465 BlockDriverCompletionFunc *cb, void *opaque)
ce1a14dc 3466{
ce1a14dc
PB
3467 BlockDriverAIOCB *acb;
3468
6bbff9a0
AL
3469 if (pool->free_aiocb) {
3470 acb = pool->free_aiocb;
3471 pool->free_aiocb = acb->next;
ce1a14dc 3472 } else {
7267c094 3473 acb = g_malloc0(pool->aiocb_size);
6bbff9a0 3474 acb->pool = pool;
ce1a14dc
PB
3475 }
3476 acb->bs = bs;
3477 acb->cb = cb;
3478 acb->opaque = opaque;
3479 return acb;
3480}
3481
3482void qemu_aio_release(void *p)
3483{
6bbff9a0
AL
3484 BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3485 AIOPool *pool = acb->pool;
3486 acb->next = pool->free_aiocb;
3487 pool->free_aiocb = acb;
ce1a14dc 3488}
19cb3738 3489
f9f05dc5
KW
3490/**************************************************************/
3491/* Coroutine block device emulation */
3492
/* Rendezvous between an AIO completion callback and a waiting coroutine. */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;   /* coroutine to re-enter on completion */
    int ret;                /* result handed back by the callback */
} CoroutineIOCompletion;
3497
3498static void bdrv_co_io_em_complete(void *opaque, int ret)
3499{
3500 CoroutineIOCompletion *co = opaque;
3501
3502 co->ret = ret;
3503 qemu_coroutine_enter(co->coroutine, NULL);
3504}
3505
3506static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3507 int nb_sectors, QEMUIOVector *iov,
3508 bool is_write)
3509{
3510 CoroutineIOCompletion co = {
3511 .coroutine = qemu_coroutine_self(),
3512 };
3513 BlockDriverAIOCB *acb;
3514
3515 if (is_write) {
a652d160
SH
3516 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3517 bdrv_co_io_em_complete, &co);
f9f05dc5 3518 } else {
a652d160
SH
3519 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3520 bdrv_co_io_em_complete, &co);
f9f05dc5
KW
3521 }
3522
59370aaa 3523 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
f9f05dc5
KW
3524 if (!acb) {
3525 return -EIO;
3526 }
3527 qemu_coroutine_yield();
3528
3529 return co.ret;
3530}
3531
3532static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3533 int64_t sector_num, int nb_sectors,
3534 QEMUIOVector *iov)
3535{
3536 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3537}
3538
3539static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3540 int64_t sector_num, int nb_sectors,
3541 QEMUIOVector *iov)
3542{
3543 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3544}
3545
07f07615 3546static void coroutine_fn bdrv_flush_co_entry(void *opaque)
e7a8a783 3547{
07f07615
PB
3548 RwCo *rwco = opaque;
3549
3550 rwco->ret = bdrv_co_flush(rwco->bs);
3551}
3552
/*
 * Flush @bs from coroutine context.
 *
 * Order matters: first write cached data back to the OS (always, even with
 * cache=unsafe), then — unless BDRV_O_NO_FLUSH — force it to stable storage
 * via the best available driver hook, and finally recurse into the
 * underlying protocol layer (bs->file).
 *
 * Returns 0 on success (including the no-op cases) or a negative errno.
 */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    /* Nothing to flush for a missing, empty, or read-only device. */
    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        return 0;
    }

    /* Prefer the native coroutine hook; fall back to AIO emulation. */
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            /* Suspend until bdrv_co_io_em_complete() wakes us. */
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }
    if (ret < 0) {
        return ret;
    }

    /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
    return bdrv_co_flush(bs->file);
}
3612
0f15423c
AL
3613void bdrv_invalidate_cache(BlockDriverState *bs)
3614{
3615 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3616 bs->drv->bdrv_invalidate_cache(bs);
3617 }
3618}
3619
3620void bdrv_invalidate_cache_all(void)
3621{
3622 BlockDriverState *bs;
3623
3624 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3625 bdrv_invalidate_cache(bs);
3626 }
3627}
3628
07789269
BC
3629void bdrv_clear_incoming_migration_all(void)
3630{
3631 BlockDriverState *bs;
3632
3633 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3634 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
3635 }
3636}
3637
07f07615
PB
3638int bdrv_flush(BlockDriverState *bs)
3639{
3640 Coroutine *co;
3641 RwCo rwco = {
3642 .bs = bs,
3643 .ret = NOT_DONE,
e7a8a783 3644 };
e7a8a783 3645
07f07615
PB
3646 if (qemu_in_coroutine()) {
3647 /* Fast-path if already in coroutine context */
3648 bdrv_flush_co_entry(&rwco);
3649 } else {
3650 co = qemu_coroutine_create(bdrv_flush_co_entry);
3651 qemu_coroutine_enter(co, &rwco);
3652 while (rwco.ret == NOT_DONE) {
3653 qemu_aio_wait();
3654 }
e7a8a783 3655 }
07f07615
PB
3656
3657 return rwco.ret;
e7a8a783
KW
3658}
3659
4265d620
PB
3660static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3661{
3662 RwCo *rwco = opaque;
3663
3664 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
3665}
3666
3667int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3668 int nb_sectors)
3669{
3670 if (!bs->drv) {
3671 return -ENOMEDIUM;
3672 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3673 return -EIO;
3674 } else if (bs->read_only) {
3675 return -EROFS;
3676 } else if (bs->drv->bdrv_co_discard) {
3677 return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3678 } else if (bs->drv->bdrv_aio_discard) {
3679 BlockDriverAIOCB *acb;
3680 CoroutineIOCompletion co = {
3681 .coroutine = qemu_coroutine_self(),
3682 };
3683
3684 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3685 bdrv_co_io_em_complete, &co);
3686 if (acb == NULL) {
3687 return -EIO;
3688 } else {
3689 qemu_coroutine_yield();
3690 return co.ret;
3691 }
4265d620
PB
3692 } else {
3693 return 0;
3694 }
3695}
3696
3697int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3698{
3699 Coroutine *co;
3700 RwCo rwco = {
3701 .bs = bs,
3702 .sector_num = sector_num,
3703 .nb_sectors = nb_sectors,
3704 .ret = NOT_DONE,
3705 };
3706
3707 if (qemu_in_coroutine()) {
3708 /* Fast-path if already in coroutine context */
3709 bdrv_discard_co_entry(&rwco);
3710 } else {
3711 co = qemu_coroutine_create(bdrv_discard_co_entry);
3712 qemu_coroutine_enter(co, &rwco);
3713 while (rwco.ret == NOT_DONE) {
3714 qemu_aio_wait();
3715 }
3716 }
3717
3718 return rwco.ret;
3719}
3720
19cb3738
FB
3721/**************************************************************/
3722/* removable device support */
3723
3724/**
3725 * Return TRUE if the media is present
3726 */
3727int bdrv_is_inserted(BlockDriverState *bs)
3728{
3729 BlockDriver *drv = bs->drv;
a1aff5bf 3730
19cb3738
FB
3731 if (!drv)
3732 return 0;
3733 if (!drv->bdrv_is_inserted)
a1aff5bf
MA
3734 return 1;
3735 return drv->bdrv_is_inserted(bs);
19cb3738
FB
3736}
3737
3738/**
8e49ca46
MA
3739 * Return whether the media changed since the last call to this
3740 * function, or -ENOTSUP if we don't know. Most drivers don't know.
19cb3738
FB
3741 */
3742int bdrv_media_changed(BlockDriverState *bs)
3743{
3744 BlockDriver *drv = bs->drv;
19cb3738 3745
8e49ca46
MA
3746 if (drv && drv->bdrv_media_changed) {
3747 return drv->bdrv_media_changed(bs);
3748 }
3749 return -ENOTSUP;
19cb3738
FB
3750}
3751
3752/**
3753 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3754 */
f36f3949 3755void bdrv_eject(BlockDriverState *bs, bool eject_flag)
19cb3738
FB
3756{
3757 BlockDriver *drv = bs->drv;
19cb3738 3758
822e1cd1
MA
3759 if (drv && drv->bdrv_eject) {
3760 drv->bdrv_eject(bs, eject_flag);
19cb3738 3761 }
6f382ed2
LC
3762
3763 if (bs->device_name[0] != '\0') {
3764 bdrv_emit_qmp_eject_event(bs, eject_flag);
3765 }
19cb3738
FB
3766}
3767
19cb3738
FB
3768/**
3769 * Lock or unlock the media (if it is locked, the user won't be able
3770 * to eject it manually).
3771 */
025e849a 3772void bdrv_lock_medium(BlockDriverState *bs, bool locked)
19cb3738
FB
3773{
3774 BlockDriver *drv = bs->drv;
3775
025e849a 3776 trace_bdrv_lock_medium(bs, locked);
b8c6d095 3777
025e849a
MA
3778 if (drv && drv->bdrv_lock_medium) {
3779 drv->bdrv_lock_medium(bs, locked);
19cb3738
FB
3780 }
3781}
985a03b0
TS
3782
3783/* needed for generic scsi interface */
3784
3785int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3786{
3787 BlockDriver *drv = bs->drv;
3788
3789 if (drv && drv->bdrv_ioctl)
3790 return drv->bdrv_ioctl(bs, req, buf);
3791 return -ENOTSUP;
3792}
7d780669 3793
221f715d
AL
/* Asynchronous passthrough ioctl (generic SCSI interface); returns NULL
 * when the driver has no AIO ioctl hook. */
BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    /* Braces added per QEMU coding style (unbraced single-statement if). */
    if (drv && drv->bdrv_aio_ioctl) {
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    }
    return NULL;
}
e268ca52 3804
7b6f9300
MA
/* Record the buffer alignment (in bytes) that guest I/O buffers for this
 * device must satisfy; consumed by qemu_blockalign(). */
void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}
7cd1e32a 3809
e268ca52
AL
3810void *qemu_blockalign(BlockDriverState *bs, size_t size)
3811{
3812 return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3813}
7cd1e32a 3814
3815void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3816{
3817 int64_t bitmap_size;
a55eb92c 3818
aaa0eb75 3819 bs->dirty_count = 0;
a55eb92c 3820 if (enable) {
c6d22830
JK
3821 if (!bs->dirty_bitmap) {
3822 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3823 BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
3824 bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
a55eb92c 3825
7267c094 3826 bs->dirty_bitmap = g_malloc0(bitmap_size);
a55eb92c 3827 }
7cd1e32a 3828 } else {
c6d22830 3829 if (bs->dirty_bitmap) {
7267c094 3830 g_free(bs->dirty_bitmap);
c6d22830 3831 bs->dirty_bitmap = NULL;
a55eb92c 3832 }
7cd1e32a 3833 }
3834}
3835
3836int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3837{
6ea44308 3838 int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
a55eb92c 3839
c6d22830
JK
3840 if (bs->dirty_bitmap &&
3841 (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
6d59fec1
MT
3842 return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3843 (1UL << (chunk % (sizeof(unsigned long) * 8))));
7cd1e32a 3844 } else {
3845 return 0;
3846 }
3847}
3848
a55eb92c
JK
/* Clear the dirty bits covering [cur_sector, cur_sector + nr_sectors). */
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
}
aaa0eb75
LS
3854
/* Number of chunks currently marked dirty for @bs. */
int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    return bs->dirty_count;
}
f88e1a42 3859
db593f25
MT
/* Mark or unmark @bs as in use; setting the flag to its current value is
 * a caller bug, hence the assertion. */
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}
3865
/* Return nonzero when @bs is currently marked in use (e.g. by a job). */
int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}
3870
28a7282a
LC
3871void bdrv_iostatus_enable(BlockDriverState *bs)
3872{
d6bf279e 3873 bs->iostatus_enabled = true;
58e21ef5 3874 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
28a7282a
LC
3875}
3876
3877/* The I/O status is only enabled if the drive explicitly
3878 * enables it _and_ the VM is configured to stop on errors */
3879bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3880{
d6bf279e 3881 return (bs->iostatus_enabled &&
28a7282a
LC
3882 (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3883 bs->on_write_error == BLOCK_ERR_STOP_ANY ||
3884 bs->on_read_error == BLOCK_ERR_STOP_ANY));
3885}
3886
/* Turn off I/O status reporting for @bs. */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
3891
3892void bdrv_iostatus_reset(BlockDriverState *bs)
3893{
3894 if (bdrv_iostatus_is_enabled(bs)) {
58e21ef5 3895 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
28a7282a
LC
3896 }
3897}
3898
3899/* XXX: Today this is set by device models because it makes the implementation
3900 quite simple. However, the block layer knows about the error, so it's
3901 possible to implement this without device models being involved */
3902void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3903{
58e21ef5
LC
3904 if (bdrv_iostatus_is_enabled(bs) &&
3905 bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
28a7282a 3906 assert(error >= 0);
58e21ef5
LC
3907 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
3908 BLOCK_DEVICE_IO_STATUS_FAILED;
28a7282a
LC
3909 }
3910}
3911
a597e79c
CH
3912void
3913bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
3914 enum BlockAcctType type)
3915{
3916 assert(type < BDRV_MAX_IOTYPE);
3917
3918 cookie->bytes = bytes;
c488c7f6 3919 cookie->start_time_ns = get_clock();
a597e79c
CH
3920 cookie->type = type;
3921}
3922
3923void
3924bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
3925{
3926 assert(cookie->type < BDRV_MAX_IOTYPE);
3927
3928 bs->nr_bytes[cookie->type] += cookie->bytes;
3929 bs->nr_ops[cookie->type]++;
c488c7f6 3930 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
a597e79c
CH
3931}
3932
f88e1a42
JS
/*
 * Create a disk image.
 *
 * @filename:      image to create
 * @fmt:           format name of the new image
 * @base_filename: optional backing file (overrides any -o backing_file)
 * @base_fmt:      optional backing file format
 * @options:       "-o" style option string for the format driver
 * @img_size:      image size in bytes; overridden by an explicit size option
 * @flags:         BDRV_O_* flags used when probing the backing file
 *
 * Returns 0 on success or a negative errno; all intermediate resources are
 * released through the single "out" cleanup path.
 */
int bdrv_img_create(const char *filename, const char *fmt,
                    const char *base_filename, const char *base_fmt,
                    char *options, uint64_t img_size, int flags)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriverState *bs = NULL;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_report("Unknown file format '%s'", fmt);
        ret = -EINVAL;
        goto out;
    }

    proto_drv = bdrv_find_protocol(filename);
    if (!proto_drv) {
        error_report("Unknown protocol '%s'", filename);
        ret = -EINVAL;
        goto out;
    }

    /* The option list accepted here is the union of the format driver's
     * and the protocol driver's creation options. */
    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_report("Invalid options for file format '%s'.", fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    /* Explicit base file/format arguments override any -o equivalents. */
    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_report("Backing file not supported for file format '%s'",
                         fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_report("Backing file format not supported for file "
                         "format '%s'", fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    /* Refuse an image that would back onto itself. */
    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_report("Error: Trying to create an image with the "
                         "same filename as the backing file");
            ret = -EINVAL;
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_report("Unknown backing file format '%s'",
                         backing_fmt->value.s);
            ret = -EINVAL;
            goto out;
        }
    }

    // The size for the image must always be specified, with one exception:
    // If we are using a backing file, we can obtain the size from there
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            /* NOTE(review): this inner 'size' shadows the outer
             * QEMUOptionParameter *size above — legal but -Wshadow fodder. */
            uint64_t size;
            char buf[32];

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
            if (ret < 0) {
                error_report("Could not open '%s'", backing_file->value.s);
                goto out;
            }
            /* Size is returned in 512-byte sectors; convert to bytes. */
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRId64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
        } else {
            error_report("Image creation needs a size parameter");
            ret = -EINVAL;
            goto out;
        }
    }

    printf("Formatting '%s', fmt=%s ", filename, fmt);
    print_option_parameters(param);
    puts("");

    ret = bdrv_create(drv, filename, param);

    if (ret < 0) {
        if (ret == -ENOTSUP) {
            error_report("Formatting or formatting option not supported for "
                         "file format '%s'", fmt);
        } else if (ret == -EFBIG) {
            error_report("The image size is too large for file format '%s'",
                         fmt);
        } else {
            error_report("%s: error while creating %s: %s", filename, fmt,
                         strerror(-ret));
        }
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (bs) {
        bdrv_delete(bs);
    }

    return ret;
}
eeec61f2
SH
4075
4076void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
4077 BlockDriverCompletionFunc *cb, void *opaque)
4078{
4079 BlockJob *job;
4080
4081 if (bs->job || bdrv_in_use(bs)) {
4082 return NULL;
4083 }
4084 bdrv_set_in_use(bs, 1);
4085
4086 job = g_malloc0(job_type->instance_size);
4087 job->job_type = job_type;
4088 job->bs = bs;
4089 job->cb = cb;
4090 job->opaque = opaque;
4091 bs->job = job;
4092 return job;
4093}
4094
/* Finish a block job: deliver @ret to the job's completion callback, then
 * detach and free the job and release the device's in-use marker.  Note
 * the callback runs while bs->job is still set; the job memory is freed
 * immediately afterwards. */
void block_job_complete(BlockJob *job, int ret)
{
    BlockDriverState *bs = job->bs;

    assert(bs->job == job);
    job->cb(job->opaque, ret);
    bs->job = NULL;
    g_free(job);
    bdrv_set_in_use(bs, 0);
}
4105
4106int block_job_set_speed(BlockJob *job, int64_t value)
4107{
9f25eccc
PB
4108 int rc;
4109
eeec61f2
SH
4110 if (!job->job_type->set_speed) {
4111 return -ENOTSUP;
4112 }
9f25eccc
PB
4113 rc = job->job_type->set_speed(job, value);
4114 if (rc == 0) {
4115 job->speed = value;
4116 }
4117 return rc;
eeec61f2
SH
4118}
4119
/* Request asynchronous cancellation; the job polls this flag via
 * block_job_is_cancelled(). */
void block_job_cancel(BlockJob *job)
{
    job->cancelled = true;
}
4124
/* Return whether cancellation has been requested for @job. */
bool block_job_is_cancelled(BlockJob *job)
{
    return job->cancelled;
}
3e914655
PB
4129
/* Request cancellation of @job and wait until it is no longer busy.
 * bs->job may be cleared by block_job_complete() while we wait, hence the
 * re-check of both conditions each iteration. */
void block_job_cancel_sync(BlockJob *job)
{
    BlockDriverState *bs = job->bs;

    assert(bs->job == job);
    block_job_cancel(job);
    while (bs->job != NULL && bs->job->busy) {
        qemu_aio_wait();
    }
}