]> git.proxmox.com Git - qemu.git/blame - block.c
block: wait_for_overlapping_requests() deadlock detection
[qemu.git] / block.c
CommitLineData
fc01f7e7
FB
1/*
2 * QEMU System Emulator block driver
5fafdf24 3 *
fc01f7e7 4 * Copyright (c) 2003 Fabrice Bellard
5fafdf24 5 *
fc01f7e7
FB
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
3990d09a 24#include "config-host.h"
faf07963 25#include "qemu-common.h"
6d519a5f 26#include "trace.h"
376253ec 27#include "monitor.h"
ea2384d3 28#include "block_int.h"
5efa9d5a 29#include "module.h"
f795e743 30#include "qjson.h"
68485420 31#include "qemu-coroutine.h"
b2023818 32#include "qmp-commands.h"
0563e191 33#include "qemu-timer.h"
fc01f7e7 34
71e72a19 35#ifdef CONFIG_BSD
7674e7bf
FB
36#include <sys/types.h>
37#include <sys/stat.h>
38#include <sys/ioctl.h>
72cf2d4f 39#include <sys/queue.h>
c5e97233 40#ifndef __DragonFly__
7674e7bf
FB
41#include <sys/disk.h>
42#endif
c5e97233 43#endif
7674e7bf 44
49dc768d
AL
45#ifdef _WIN32
46#include <windows.h>
47#endif
48
1c9805a3
SH
49#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
50
7d4b4ba5 51static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
f141eafe
AL
52static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
53 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
c87c0672 54 BlockDriverCompletionFunc *cb, void *opaque);
f141eafe
AL
55static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
56 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
ce1a14dc 57 BlockDriverCompletionFunc *cb, void *opaque);
f9f05dc5
KW
58static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
59 int64_t sector_num, int nb_sectors,
60 QEMUIOVector *iov);
61static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
62 int64_t sector_num, int nb_sectors,
63 QEMUIOVector *iov);
c5fbe571
SH
64static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
65 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
1c9805a3
SH
66static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
67 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
b2a61371
SH
68static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
69 int64_t sector_num,
70 QEMUIOVector *qiov,
71 int nb_sectors,
72 BlockDriverCompletionFunc *cb,
73 void *opaque,
8c5873d6 74 bool is_write);
b2a61371 75static void coroutine_fn bdrv_co_do_rw(void *opaque);
ec530c81 76
98f90dba
ZYW
77static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
78 bool is_write, double elapsed_time, uint64_t *wait);
79static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
80 double elapsed_time, uint64_t *wait);
81static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
82 bool is_write, int64_t *wait);
83
1b7bdbc1
SH
84static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
85 QTAILQ_HEAD_INITIALIZER(bdrv_states);
7ee930d0 86
8a22f02a
SH
87static QLIST_HEAD(, BlockDriver) bdrv_drivers =
88 QLIST_HEAD_INITIALIZER(bdrv_drivers);
ea2384d3 89
f9092b10
MA
90/* The device to use for VM snapshots */
91static BlockDriverState *bs_snapshots;
92
eb852011
MA
93/* If non-zero, use only whitelisted block drivers */
94static int use_bdrv_whitelist;
95
9e0b22f4
SH
96#ifdef _WIN32
97static int is_windows_drive_prefix(const char *filename)
98{
99 return (((filename[0] >= 'a' && filename[0] <= 'z') ||
100 (filename[0] >= 'A' && filename[0] <= 'Z')) &&
101 filename[1] == ':');
102}
103
104int is_windows_drive(const char *filename)
105{
106 if (is_windows_drive_prefix(filename) &&
107 filename[2] == '\0')
108 return 1;
109 if (strstart(filename, "\\\\.\\", NULL) ||
110 strstart(filename, "//./", NULL))
111 return 1;
112 return 0;
113}
114#endif
115
/* throttling disk I/O limits */

/*
 * Disable I/O throttling on a device.
 *
 * Wakes every coroutine parked in throttled_reqs, tears down the slice
 * timer, and resets the accounting slice so a later re-enable starts
 * from a clean state.
 */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    /* clear the flag first so woken requests are not re-throttled */
    bs->io_limits_enabled = false;

    /* drain the wait queue: each call wakes one waiter, stops when empty */
    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    /* reset slice accounting */
    bs->slice_start = 0;
    bs->slice_end = 0;
    bs->slice_time = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}
134
0563e191
ZYW
/*
 * Slice-timer callback for a throttled device: let the next queued
 * request resume now that the throttling window has advanced.
 */
static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}
141
/*
 * Enable I/O throttling on a device: initialize the wait queue, arm the
 * slice timer (fires bdrv_block_timer), and open a fresh accounting slice.
 */
void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end = bs->slice_start + bs->slice_time;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
    /* set last so requests only get intercepted once state is ready */
    bs->io_limits_enabled = true;
}
152
153bool bdrv_io_limits_enabled(BlockDriverState *bs)
154{
155 BlockIOLimit *io_limits = &bs->io_limits;
156 return io_limits->bps[BLOCK_IO_LIMIT_READ]
157 || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
158 || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
159 || io_limits->iops[BLOCK_IO_LIMIT_READ]
160 || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
161 || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
162}
163
98f90dba
ZYW
/*
 * Block the calling coroutine until this request fits within the
 * configured I/O limits.  Must be called from coroutine context.
 */
static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    /* preserve FIFO order: if others are already queued, wait behind them */
    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* In fact, we hope to keep each request's timing, in FIFO mode. The next
     * throttled requests will not be dequeued until the current request is
     * allowed to be serviced. So if the current request still exceeds the
     * limits, it will be inserted to the head. All requests followed it will
     * be still in throttled_reqs queue.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        /* re-arm the timer for when this request should be retried */
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        /* head insertion keeps this request first in line (see above) */
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    /* we are within limits now; let the next queued request have a turn */
    qemu_co_queue_next(&bs->throttled_reqs);
}
188
9e0b22f4
SH
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    /* a Windows drive letter ("c:", "c:\foo") is not a protocol */
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
#endif

    /* any ':' in the name is treated as the protocol separator */
    return strchr(path, ':') ? 1 : 0;
}
201
/*
 * Return 1 if 'path' is absolute.  An optional "<protocol>:" prefix is
 * skipped first, so "file:/x" counts as absolute.
 */
int path_is_absolute(const char *path)
{
    const char *rest;

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (*path == '/' || *path == '\\') {
        return 1;
    }
#endif
    /* skip over the protocol prefix, if any */
    rest = strchr(path, ':');
    rest = rest ? rest + 1 : path;
#ifdef _WIN32
    return (*rest == '/' || *rest == '\\');
#else
    return (*rest == '/');
#endif
}
221
83f64091
FB
/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it is relative to base_path. URL are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        /* p: first char after an optional "<protocol>:" prefix */
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        /* p1: start of the last path component of base_path */
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            /* on Windows, '\\' is also a directory separator */
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        /* keep the longer of the two prefixes (protocol vs. directory) */
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        /* copy the directory part of base_path, then append filename */
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
265
/*
 * Register a block driver with the global driver list, installing
 * coroutine/AIO emulation wrappers for drivers that lack them.
 */
void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
b338082b
FB
285
286/* create a new block device (by default it is empty) */
287BlockDriverState *bdrv_new(const char *device_name)
288{
1b7bdbc1 289 BlockDriverState *bs;
b338082b 290
7267c094 291 bs = g_malloc0(sizeof(BlockDriverState));
b338082b 292 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
ea2384d3 293 if (device_name[0] != '\0') {
1b7bdbc1 294 QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
ea2384d3 295 }
28a7282a 296 bdrv_iostatus_disable(bs);
b338082b
FB
297 return bs;
298}
299
ea2384d3
FB
300BlockDriver *bdrv_find_format(const char *format_name)
301{
302 BlockDriver *drv1;
8a22f02a
SH
303 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
304 if (!strcmp(drv1->format_name, format_name)) {
ea2384d3 305 return drv1;
8a22f02a 306 }
ea2384d3
FB
307 }
308 return NULL;
309}
310
eb852011
MA
/*
 * Return 1 if 'drv' may be used under the build-time driver whitelist.
 * An empty whitelist (first entry NULL) permits every driver.
 */
static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST   /* expands to a list of name strings */
    };
    const char **p;

    if (!whitelist[0])
        return 1; /* no whitelist, anything goes */

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}
328
329BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
330{
331 BlockDriver *drv = bdrv_find_format(format_name);
332 return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
333}
334
0e7e1989
KW
335int bdrv_create(BlockDriver *drv, const char* filename,
336 QEMUOptionParameter *options)
ea2384d3
FB
337{
338 if (!drv->bdrv_create)
339 return -ENOTSUP;
0e7e1989
KW
340
341 return drv->bdrv_create(filename, options);
ea2384d3
FB
342}
343
84a12e66
CH
344int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
345{
346 BlockDriver *drv;
347
b50cbabc 348 drv = bdrv_find_protocol(filename);
84a12e66 349 if (drv == NULL) {
16905d71 350 return -ENOENT;
84a12e66
CH
351 }
352
353 return bdrv_create(drv, filename, options);
354}
355
#ifdef _WIN32
/*
 * Fill 'filename' (a buffer of 'size' bytes) with the name of a freshly
 * created temporary file.  The win32 variant creates the file through
 * GetTempFileName() itself.
 */
void get_tmp_filename(char *filename, int size)
{
    char temp_dir[MAX_PATH];

    GetTempPath(MAX_PATH, temp_dir);
    GetTempFileName(temp_dir, "qem", 0, filename);
}
#else
/*
 * POSIX variant: build "$TMPDIR/vl.XXXXXX" (falling back to /tmp) and
 * let mkstemp() create the file and replace the X's.
 */
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;
    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    fd = mkstemp(filename);
    /* mkstemp() returns -1 on failure (e.g. unwritable tmpdir); only
     * close a valid descriptor, otherwise close(-1) raises EBADF. */
    if (fd >= 0) {
        close(fd);
    }
}
#endif
fc01f7e7 378
84a12e66
CH
379/*
380 * Detect host devices. By convention, /dev/cdrom[N] is always
381 * recognized as a host CDROM.
382 */
383static BlockDriver *find_hdev_driver(const char *filename)
384{
385 int score_max = 0, score;
386 BlockDriver *drv = NULL, *d;
387
388 QLIST_FOREACH(d, &bdrv_drivers, list) {
389 if (d->bdrv_probe_device) {
390 score = d->bdrv_probe_device(filename);
391 if (score > score_max) {
392 score_max = score;
393 drv = d;
394 }
395 }
396 }
397
398 return drv;
399}
400
/*
 * Find the protocol driver for 'filename': host-device probing wins,
 * then an explicit "<protocol>:" prefix, else the plain "file" driver.
 */
BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    p = strchr(filename, ':');
    assert(p != NULL);
    /* copy the "<protocol>" part, truncated to fit the local buffer */
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}
440
/*
 * Probe the image format of 'filename'.
 *
 * Opens the file via the protocol layer, reads the first 2048 bytes and
 * asks every registered driver's bdrv_probe() to score them; the best
 * scorer wins.  scsi-generic devices and empty drives get "raw".
 *
 * Returns 0 and stores the driver in *pdrv on success, or a negative
 * errno (*pdrv set to NULL or the partial result) on failure.
 */
static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* 'ret' is now the number of header bytes actually read */
    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}
489
51762288
SH
/**
 * Set the current 'total_sectors' value
 *
 * Queries the driver for the real device length when it can report one;
 * otherwise trusts 'hint'.  Returns 0 on success or a negative errno
 * propagated from the driver.
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;  /* negative length is an errno from the driver */
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}
513
c3993cdc
SH
514/**
515 * Set open flags for a given cache mode
516 *
517 * Return 0 on success, -1 if the cache mode was invalid.
518 */
519int bdrv_parse_cache_flags(const char *mode, int *flags)
520{
521 *flags &= ~BDRV_O_CACHE_MASK;
522
523 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
524 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
92196b2f
SH
525 } else if (!strcmp(mode, "directsync")) {
526 *flags |= BDRV_O_NOCACHE;
c3993cdc
SH
527 } else if (!strcmp(mode, "writeback")) {
528 *flags |= BDRV_O_CACHE_WB;
529 } else if (!strcmp(mode, "unsafe")) {
530 *flags |= BDRV_O_CACHE_WB;
531 *flags |= BDRV_O_NO_FLUSH;
532 } else if (!strcmp(mode, "writethrough")) {
533 /* this is the default */
534 } else {
535 return -1;
536 }
537
538 return 0;
539}
540
53fec9d3
SH
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}
550
/* Drop one copy-on-read reference (see bdrv_enable_copy_on_read()). */
void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
556
57915332
KW
/*
 * Common part for opening disk images and files
 *
 * Resets the per-open state of 'bs', then opens 'filename' with 'drv' —
 * either directly (protocol drivers with bdrv_file_open) or by first
 * opening the underlying file into bs->file.  On failure all partially
 * acquired resources are released.  Returns 0 or a negative errno.
 */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    /* reset state left over from a previous open */
    bs->file = NULL;
    bs->total_sectors = 0;
    bs->encrypted = 0;
    bs->valid_key = 0;
    bs->sg = 0;
    bs->open_flags = flags;
    bs->growable = 0;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);
    bs->backing_file[0] = '\0';

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    /* temporary images are unlinked immediately; the open fd keeps them */
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}
646
b6ce07aa
KW
/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 *
 * Allocates an anonymous BlockDriverState, opens it with the protocol
 * driver matching 'filename', and returns it through *pbs.  The device
 * is marked growable.  Returns 0 or a negative errno (no *pbs written).
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    int ret;

    drv = bdrv_find_protocol(filename);
    if (!drv) {
        return -ENOENT;
    }

    bs = bdrv_new("");
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        bdrv_delete(bs);
        return ret;
    }
    bs->growable = 1;
    *pbs = bs;
    return 0;
}
671
b6ce07aa
KW
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * With BDRV_O_SNAPSHOT, a temporary qcow2 image backed by 'filename' is
 * created and opened instead, so writes never reach the original.  If
 * 'drv' is NULL the format is probed.  A backing file named by the image
 * header is opened read-only unless BDRV_O_NO_BACKING is set.
 * Returns 0 or a negative errno.
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    char tmp_filename[PATH_MAX];

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* open the original image once, just to learn its size */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        get_tmp_filename(tmp_filename, sizeof(tmp_filename));

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        /* create the temporary overlay as qcow2, backed by 'filename' */
        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        /* from here on, open the overlay instead of the original image */
        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
        char backing_filename[PATH_MAX];
        int back_flags;
        BlockDriver *back_drv = NULL;

        bs->backing_hd = bdrv_new("");

        /* backing file names with a protocol are used verbatim; plain
         * names are resolved relative to the image's own path */
        if (path_has_protocol(bs->backing_file)) {
            pstrcpy(backing_filename, sizeof(backing_filename),
                    bs->backing_file);
        } else {
            path_combine(backing_filename, sizeof(backing_filename),
                         filename, bs->backing_file);
        }

        if (bs->backing_format[0] != '\0') {
            back_drv = bdrv_find_format(bs->backing_format);
        }

        /* backing files always opened read-only */
        back_flags =
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
        if (bs->is_temporary) {
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
        } else {
            /* base image inherits from "parent" */
            bs->backing_hd->keep_read_only = bs->keep_read_only;
        }
    }

    /* encrypted images wait for a key before announcing the medium */
    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}
805
fc01f7e7
FB
/*
 * Close an open image: tear down the backing file, call the driver's
 * close routine, free driver state, recursively close the underlying
 * protocol file, notify the device model, and switch off throttling.
 * Safe to call on an already-closed bs (bs->drv == NULL).
 */
void bdrv_close(BlockDriverState *bs)
{
    if (bs->drv) {
        /* this device can no longer serve as the VM snapshot device */
        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        /* POSIX unlinks temporaries at open time; win32 must do it here */
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;

        if (bs->file != NULL) {
            bdrv_close(bs->file);
        }

        bdrv_dev_change_media_cb(bs, false);
    }

    /*throttling disk I/O limits*/
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}
839
2bc93fed
MK
840void bdrv_close_all(void)
841{
842 BlockDriverState *bs;
843
844 QTAILQ_FOREACH(bs, &bdrv_states, list) {
845 bdrv_close(bs);
846 }
847}
848
d22b2f41
RH
/* make a BlockDriverState anonymous by removing from bdrv_state list.
   Also, NULL terminate the device_name to prevent double remove */
void bdrv_make_anon(BlockDriverState *bs)
{
    /* only named devices are on the list (see bdrv_new()) */
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}
858
b338082b
FB
/*
 * Destroy a BlockDriverState: unlist it, close it, delete its protocol
 * file and free the structure.  Must not still be attached to a device.
 */
void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);
    if (bs->file != NULL) {
        bdrv_delete(bs->file);
    }

    /* the snapshot device must have been retargeted before deletion */
    assert(bs != bs_snapshots);
    g_free(bs);
}
874
fa879d62
MA
/*
 * Attach a guest device to this block device.
 * Returns 0 on success, -EBUSY if another device is already attached.
 */
int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}
885
fa879d62
MA
/* TODO qdevified devices don't use this, remove when devices are qdevified */
/* Attach variant for legacy callers that cannot handle failure: aborts
 * the process if the device slot is already taken. */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}
893
/*
 * Detach the guest device from this block device and drop its callbacks.
 * 'dev' must be the device currently attached.
 */
void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    /* restore the default alignment the device may have overridden */
    bs->buffer_alignment = 512;
}
903
fa879d62
MA
/* TODO change to return DeviceState * when all users are qdevified */
/* Return the attached guest device, or NULL if none. */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}
909
0e49de52
MA
/*
 * Install the device-model callback table for this block device.
 * A device with removable media cannot serve as the VM snapshot target,
 * so bs_snapshots is cleared when that now applies.
 */
void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
}
919
/* Notify the attached device, if it cares, that a medium was loaded
 * (load == true) or removed (load == false). */
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
    }
}
926
2c6942fa
MA
/* True when no device is attached, or the attached device reacts to
 * media changes (i.e. it models a removable-media drive). */
bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}
931
025ccaa7
PB
/* Forward an eject request to the attached device, if it handles one. */
void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}
938
e4def80b
MA
/* Ask the attached device whether its tray is open; false when the
 * device does not model a tray. */
bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}
946
145feb17
MA
/* Notify the attached device, if it cares, that the image was resized. */
static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}
953
f107639a
MA
/* Ask the attached device whether the medium is locked in the drive;
 * false when the device does not model a lock. */
bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}
961
e97fc193
AL
/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    /* start from a zeroed result so drivers only fill in what they found */
    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res);
}
978
8a426614
KW
979#define COMMIT_BUF_SECTORS 2048
980
33e3963e
FB
981/* commit COW file into the raw image */
982int bdrv_commit(BlockDriverState *bs)
983{
19cb3738 984 BlockDriver *drv = bs->drv;
ee181196 985 BlockDriver *backing_drv;
8a426614
KW
986 int64_t sector, total_sectors;
987 int n, ro, open_flags;
4dca4b63 988 int ret = 0, rw_ret = 0;
8a426614 989 uint8_t *buf;
4dca4b63
NS
990 char filename[1024];
991 BlockDriverState *bs_rw, *bs_ro;
33e3963e 992
19cb3738
FB
993 if (!drv)
994 return -ENOMEDIUM;
4dca4b63
NS
995
996 if (!bs->backing_hd) {
997 return -ENOTSUP;
33e3963e
FB
998 }
999
4dca4b63
NS
1000 if (bs->backing_hd->keep_read_only) {
1001 return -EACCES;
1002 }
ee181196
KW
1003
1004 backing_drv = bs->backing_hd->drv;
4dca4b63
NS
1005 ro = bs->backing_hd->read_only;
1006 strncpy(filename, bs->backing_hd->filename, sizeof(filename));
1007 open_flags = bs->backing_hd->open_flags;
1008
1009 if (ro) {
1010 /* re-open as RW */
1011 bdrv_delete(bs->backing_hd);
1012 bs->backing_hd = NULL;
1013 bs_rw = bdrv_new("");
ee181196
KW
1014 rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
1015 backing_drv);
4dca4b63
NS
1016 if (rw_ret < 0) {
1017 bdrv_delete(bs_rw);
1018 /* try to re-open read-only */
1019 bs_ro = bdrv_new("");
ee181196
KW
1020 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1021 backing_drv);
4dca4b63
NS
1022 if (ret < 0) {
1023 bdrv_delete(bs_ro);
1024 /* drive not functional anymore */
1025 bs->drv = NULL;
1026 return ret;
1027 }
1028 bs->backing_hd = bs_ro;
1029 return rw_ret;
1030 }
1031 bs->backing_hd = bs_rw;
ea2384d3 1032 }
33e3963e 1033
6ea44308 1034 total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
7267c094 1035 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
8a426614
KW
1036
1037 for (sector = 0; sector < total_sectors; sector += n) {
05c4af54 1038 if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
8a426614
KW
1039
1040 if (bdrv_read(bs, sector, buf, n) != 0) {
1041 ret = -EIO;
1042 goto ro_cleanup;
1043 }
1044
1045 if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1046 ret = -EIO;
1047 goto ro_cleanup;
1048 }
ea2384d3 1049 }
33e3963e 1050 }
95389c86 1051
1d44952f
CH
1052 if (drv->bdrv_make_empty) {
1053 ret = drv->bdrv_make_empty(bs);
1054 bdrv_flush(bs);
1055 }
95389c86 1056
3f5075ae
CH
1057 /*
1058 * Make sure all data we wrote to the backing device is actually
1059 * stable on disk.
1060 */
1061 if (bs->backing_hd)
1062 bdrv_flush(bs->backing_hd);
4dca4b63
NS
1063
1064ro_cleanup:
7267c094 1065 g_free(buf);
4dca4b63
NS
1066
1067 if (ro) {
1068 /* re-open as RO */
1069 bdrv_delete(bs->backing_hd);
1070 bs->backing_hd = NULL;
1071 bs_ro = bdrv_new("");
ee181196
KW
1072 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1073 backing_drv);
4dca4b63
NS
1074 if (ret < 0) {
1075 bdrv_delete(bs_ro);
1076 /* drive not functional anymore */
1077 bs->drv = NULL;
1078 return ret;
1079 }
1080 bs->backing_hd = bs_ro;
1081 bs->backing_hd->keep_read_only = 0;
1082 }
1083
1d44952f 1084 return ret;
33e3963e
FB
1085}
1086
/* Commit every open block device's changes into its backing file
 * (see bdrv_commit); per-device errors are not reported to the caller. */
void bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_commit(bs);
    }
}
1095
/* An in-flight I/O request, linked into bs->tracked_requests while active
 * so overlapping requests can be detected and serialized. */
struct BdrvTrackedRequest {
    BlockDriverState *bs;       /* device the request is running on */
    int64_t sector_num;         /* first sector of the request */
    int nb_sectors;             /* length of the request in sectors */
    bool is_write;
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};
1105
/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 * It also wakes every coroutine that was blocked waiting on this request.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}
1116
/**
 * Add an active request to the tracked requests list
 *
 * @req is caller-allocated (typically on the coroutine stack) and remains
 * owned by the caller; it records the owning coroutine for deadlock
 * detection in wait_for_overlapping_requests().
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}
1137
/**
 * Round a region to cluster boundaries
 *
 * Expands [sector_num, sector_num + nb_sectors) outward so that both ends
 * fall on cluster boundaries of @bs. If the driver does not report a
 * cluster size, the region is returned unchanged.
 */
static void round_to_clusters(BlockDriverState *bs,
                              int64_t sector_num, int nb_sectors,
                              int64_t *cluster_sector_num,
                              int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
1158
f4658285
SH
1159static bool tracked_request_overlaps(BdrvTrackedRequest *req,
1160 int64_t sector_num, int nb_sectors) {
d83947ac
SH
1161 /* aaaa bbbb */
1162 if (sector_num >= req->sector_num + req->nb_sectors) {
1163 return false;
1164 }
1165 /* bbbb aaaa */
1166 if (req->sector_num >= sector_num + nb_sectors) {
1167 return false;
1168 }
1169 return true;
f4658285
SH
1170}
1171
/* Block the calling coroutine until no tracked request overlaps the given
 * region (rounded out to cluster boundaries). Must be called before
 * tracked_request_begin() for the new request. */
static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap. This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster. For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    /* Re-scan the whole list after every wakeup: new overlapping requests
     * may have been inserted while we were blocked. */
    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests. This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}
1207
756e6736
KW
1208/*
1209 * Return values:
1210 * 0 - success
1211 * -EINVAL - backing format specified, but no file
1212 * -ENOSPC - can't update the backing file because no space is left in the
1213 * image file header
1214 * -ENOTSUP - format driver doesn't support changing the backing file
1215 */
1216int bdrv_change_backing_file(BlockDriverState *bs,
1217 const char *backing_file, const char *backing_fmt)
1218{
1219 BlockDriver *drv = bs->drv;
1220
1221 if (drv->bdrv_change_backing_file != NULL) {
1222 return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1223 } else {
1224 return -ENOTSUP;
1225 }
1226}
1227
71d0770c
AL
1228static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1229 size_t size)
1230{
1231 int64_t len;
1232
1233 if (!bdrv_is_inserted(bs))
1234 return -ENOMEDIUM;
1235
1236 if (bs->growable)
1237 return 0;
1238
1239 len = bdrv_getlength(bs);
1240
fbb7b4e0
KW
1241 if (offset < 0)
1242 return -EIO;
1243
1244 if ((offset > len) || (len - offset < size))
71d0770c
AL
1245 return -EIO;
1246
1247 return 0;
1248}
1249
/* Sector-granularity wrapper around bdrv_check_byte_request(). */
static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}
1256
/* Argument/result bundle passed to bdrv_rw_co_entry() so a synchronous
 * read/write can be executed inside a coroutine. */
typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;      /* true: write request, false: read request */
    int ret;            /* completion status; NOT_DONE until finished */
} RwCo;
1265
1266static void coroutine_fn bdrv_rw_co_entry(void *opaque)
fc01f7e7 1267{
1c9805a3 1268 RwCo *rwco = opaque;
ea2384d3 1269
1c9805a3
SH
1270 if (!rwco->is_write) {
1271 rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
1272 rwco->nb_sectors, rwco->qiov);
1273 } else {
1274 rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
1275 rwco->nb_sectors, rwco->qiov);
1276 }
1277}
e7a8a783 1278
/*
 * Process a synchronous request using coroutines
 *
 * Wraps @buf in a single-element QEMUIOVector and runs the request through
 * the coroutine read/write path. Returns the request's completion status
 * (< 0 on error).
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .qiov = &qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        /* Spawn a coroutine and pump the AIO event loop until it signals
         * completion by overwriting rwco.ret. */
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}
b338082b 1314
1c9805a3
SH
/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}
1321
/* Mark (@dirty != 0) or clear the dirty-bitmap bits covering sectors
 * [sector_num, sector_num + nb_sectors); one bit represents one chunk of
 * BDRV_SECTORS_PER_DIRTY_CHUNK sectors. bs->dirty_count tracks the number
 * of set bits and is updated only on actual transitions. */
static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int dirty)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    /* Chunk indices of the first and last affected dirty-bitmap bits. */
    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        /* Word index and bit position within the unsigned long array. */
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bs->dirty_bitmap[idx];
        if (dirty) {
            if (!(val & (1UL << bit))) {
                bs->dirty_count++;
                val |= 1UL << bit;
            }
        } else {
            if (val & (1UL << bit)) {
                bs->dirty_count--;
                val &= ~(1UL << bit);
            }
        }
        bs->dirty_bitmap[idx] = val;
    }
}
1349
/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}
1361
/* Byte-granularity read built on sector-granularity bdrv_read().
 * Handles an unaligned head and tail through a one-sector bounce buffer
 * and reads whole sectors directly into @buf.
 * Returns @count1 on success, or a negative errno from bdrv_read(). */
int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}
1406
/* Byte-granularity write built on sector-granularity bdrv_read()/bdrv_write().
 * Unaligned head and tail sectors are handled with a read-modify-write
 * through a one-sector bounce buffer; aligned sectors are written directly
 * from @buf. Returns @count1 on success, or a negative errno. */
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        /* read-modify-write of the partial first sector */
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        /* read-modify-write of the partial last sector */
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf, buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return count1;
}
83f64091 1455
/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that use O_DSYNC */
    if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
        bdrv_flush(bs);
    }

    return 0;
}
1479
/* Copy-on-read: read a whole cluster from the backing chain into a bounce
 * buffer, write it into the image so the data becomes locally allocated,
 * then copy the requested slice into @qiov.
 * Returns 0 on success or a negative errno. */
static int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors,
                                cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = bs->drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                                 &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    ret = bs->drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    /* Hand back only the originally requested slice of the cluster. */
    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
                           nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
1534
/*
 * Handle a read request in coroutine context
 *
 * Validates the request, applies I/O throttling, serializes against
 * overlapping requests when copy-on-read is enabled, and dispatches either
 * to the copy-on-read path or directly to the driver.
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);
    }

    /* Must happen before tracked_request_begin() so we do not wait on
     * our own request. */
    if (bs->copy_on_read) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (bs->copy_on_read) {
        int pnum;

        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        /* Any unallocated part triggers the copy-on-read path. */
        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    tracked_request_end(&req);
    return ret;
}
1583
/* Public coroutine read entry point: trace the request and forward to
 * bdrv_co_do_readv(). */
int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov);
}
1591
/*
 * Handle a write request in coroutine context
 *
 * Validates the request, applies I/O throttling, serializes against
 * overlapping requests when copy-on-read is enabled, dispatches to the
 * driver and updates the dirty bitmap / write-high-watermark bookkeeping.
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);
    }

    /* Serialize with in-flight requests so guest writes cannot interleave
     * with a copy-on-read operation on the same cluster. */
    if (bs->copy_on_read) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    /* Track the highest sector ever written (used e.g. for allocation
     * statistics). */
    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }

    tracked_request_end(&req);

    return ret;
}
1637
/* Public coroutine write entry point: trace the request and forward to
 * bdrv_co_do_writev(). */
int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov);
}
1645
83f64091
FB
1646/**
1647 * Truncate file to 'offset' bytes (needed only for file protocols)
1648 */
1649int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1650{
1651 BlockDriver *drv = bs->drv;
51762288 1652 int ret;
83f64091 1653 if (!drv)
19cb3738 1654 return -ENOMEDIUM;
83f64091
FB
1655 if (!drv->bdrv_truncate)
1656 return -ENOTSUP;
59f2689d
NS
1657 if (bs->read_only)
1658 return -EACCES;
8591675f
MT
1659 if (bdrv_in_use(bs))
1660 return -EBUSY;
51762288
SH
1661 ret = drv->bdrv_truncate(bs, offset);
1662 if (ret == 0) {
1663 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
145feb17 1664 bdrv_dev_resize_cb(bs);
51762288
SH
1665 }
1666 return ret;
83f64091
FB
1667}
1668
/**
 * Length of a allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 *
 * Falls back to the protocol layer (bs->file) when the format driver does
 * not implement the query itself.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}
1687
/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 *
 * Devices with removable media or growable files are re-queried from the
 * driver because their size can change; otherwise the cached
 * total_sectors value is used.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
        if (drv->bdrv_getlength) {
            return drv->bdrv_getlength(bs);
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}
1704
19cb3738 1705/* return 0 as number of sectors if no device present or error */
96b8f136 1706void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
fc01f7e7 1707{
19cb3738
FB
1708 int64_t length;
1709 length = bdrv_getlength(bs);
1710 if (length < 0)
1711 length = 0;
1712 else
6ea44308 1713 length = length >> BDRV_SECTOR_BITS;
19cb3738 1714 *nb_sectors_ptr = length;
fc01f7e7 1715}
cf98951b 1716
f3d54fc4
AL
/* On-disk layout of one MSDOS/MBR partition table entry (16 bytes,
 * little-endian, packed). */
struct partition {
        uint8_t boot_ind;           /* 0x80 - active */
        uint8_t head;               /* starting head */
        uint8_t sector;             /* starting sector */
        uint8_t cyl;                /* starting cylinder */
        uint8_t sys_ind;            /* What partition type */
        uint8_t end_head;           /* end head */
        uint8_t end_sector;         /* end sector */
        uint8_t end_cyl;            /* end cylinder */
        uint32_t start_sect;        /* starting sector counting from 0 */
        uint32_t nr_sects;          /* nr of sectors in partition */
} QEMU_PACKED;
f3d54fc4
AL
1729
/* try to guess the disk logical geometry from the MSDOS partition table.
   Return 0 if OK, -1 if could not guess */
static int guess_disk_lchs(BlockDriverState *bs,
                           int *pcylinders, int *pheads, int *psectors)
{
    uint8_t buf[BDRV_SECTOR_SIZE];
    int ret, i, heads, sectors, cylinders;
    struct partition *p;
    uint32_t nr_sects;
    uint64_t nb_sectors;

    bdrv_get_geometry(bs, &nb_sectors);

    /* Read the MBR (sector 0). */
    ret = bdrv_read(bs, 0, buf, 1);
    if (ret < 0)
        return -1;
    /* test msdos magic */
    if (buf[510] != 0x55 || buf[511] != 0xaa)
        return -1;
    for(i = 0; i < 4; i++) {
        /* Partition table starts at offset 0x1be in the MBR. */
        p = ((struct partition *)(buf + 0x1be)) + i;
        nr_sects = le32_to_cpu(p->nr_sects);
        if (nr_sects && p->end_head) {
            /* We make the assumption that the partition terminates on
               a cylinder boundary */
            heads = p->end_head + 1;
            sectors = p->end_sector & 63;
            if (sectors == 0)
                continue;
            cylinders = nb_sectors / (heads * sectors);
            if (cylinders < 1 || cylinders > 16383)
                continue;
            *pheads = heads;
            *psectors = sectors;
            *pcylinders = cylinders;
#if 0
            printf("guessed geometry: LCHS=%d %d %d\n",
                   cylinders, heads, sectors);
#endif
            return 0;
        }
    }
    return -1;
}
1773
/* Determine a cylinders/heads/sectors geometry for @bs: use an explicit
 * geometry hint if one was set, otherwise guess from the MSDOS partition
 * table, and finally fall back to a standard 16-head/63-sector geometry.
 * Also updates the BIOS translation hint to match the chosen geometry. */
void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
{
    int translation, lba_detected = 0;
    int cylinders, heads, secs;
    uint64_t nb_sectors;

    /* if a geometry hint is available, use it */
    bdrv_get_geometry(bs, &nb_sectors);
    bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
    translation = bdrv_get_translation_hint(bs);
    if (cylinders != 0) {
        *pcyls = cylinders;
        *pheads = heads;
        *psecs = secs;
    } else {
        if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
            if (heads > 16) {
                /* if heads > 16, it means that a BIOS LBA
                   translation was active, so the default
                   hardware geometry is OK */
                lba_detected = 1;
                goto default_geometry;
            } else {
                *pcyls = cylinders;
                *pheads = heads;
                *psecs = secs;
                /* disable any translation to be in sync with
                   the logical geometry */
                if (translation == BIOS_ATA_TRANSLATION_AUTO) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_NONE);
                }
            }
        } else {
        default_geometry:
            /* if no geometry, use a standard physical disk geometry */
            cylinders = nb_sectors / (16 * 63);

            /* Clamp cylinders to the 2..16383 range BIOSes expect. */
            if (cylinders > 16383)
                cylinders = 16383;
            else if (cylinders < 2)
                cylinders = 2;
            *pcyls = cylinders;
            *pheads = 16;
            *psecs = 63;
            if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
                if ((*pcyls * *pheads) <= 131072) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LARGE);
                } else {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LBA);
                }
            }
        }
        bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
    }
}
1832
/* Store a user/board-supplied CHS geometry hint on the device. */
void bdrv_set_geometry_hint(BlockDriverState *bs,
                            int cyls, int heads, int secs)
{
    bs->cyls = cyls;
    bs->heads = heads;
    bs->secs = secs;
}
1840
/* Store the BIOS ATA translation hint (BIOS_ATA_TRANSLATION_*). */
void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
{
    bs->translation = translation;
}
1845
/* Retrieve the CHS geometry hint previously set on the device
 * (zeroes if none was set). */
void bdrv_get_geometry_hint(BlockDriverState *bs,
                            int *pcyls, int *pheads, int *psecs)
{
    *pcyls = bs->cyls;
    *pheads = bs->heads;
    *psecs = bs->secs;
}
1853
/* throttling disk io limits */
/* Install new I/O limits and recompute whether throttling is active. */
void bdrv_set_io_limits(BlockDriverState *bs,
                        BlockIOLimit *io_limits)
{
    bs->io_limits = *io_limits;
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
}
1861
/* Recognize floppy formats */
typedef struct FDFormat {
    FDriveType drive;    /* drive type this format belongs to */
    uint8_t last_sect;   /* sectors per track */
    uint8_t max_track;   /* number of tracks */
    uint8_t max_head;    /* highest head number (0 = single-sided) */
} FDFormat;

/* Known floppy disk formats, matched against the image size in
 * bdrv_get_floppy_geometry_hint(). Terminated by FDRIVE_DRV_NONE. */
static const FDFormat fd_formats[] = {
    /* First entry is default format */
    /* 1.44 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_144, 18, 80, 1, },
    { FDRIVE_DRV_144, 20, 80, 1, },
    { FDRIVE_DRV_144, 21, 80, 1, },
    { FDRIVE_DRV_144, 21, 82, 1, },
    { FDRIVE_DRV_144, 21, 83, 1, },
    { FDRIVE_DRV_144, 22, 80, 1, },
    { FDRIVE_DRV_144, 23, 80, 1, },
    { FDRIVE_DRV_144, 24, 80, 1, },
    /* 2.88 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_288, 36, 80, 1, },
    { FDRIVE_DRV_288, 39, 80, 1, },
    { FDRIVE_DRV_288, 40, 80, 1, },
    { FDRIVE_DRV_288, 44, 80, 1, },
    { FDRIVE_DRV_288, 48, 80, 1, },
    /* 720 kB 3"1/2 floppy disks */
    { FDRIVE_DRV_144,  9, 80, 1, },
    { FDRIVE_DRV_144, 10, 80, 1, },
    { FDRIVE_DRV_144, 10, 82, 1, },
    { FDRIVE_DRV_144, 10, 83, 1, },
    { FDRIVE_DRV_144, 13, 80, 1, },
    { FDRIVE_DRV_144, 14, 80, 1, },
    /* 1.2 MB 5"1/4 floppy disks */
    { FDRIVE_DRV_120, 15, 80, 1, },
    { FDRIVE_DRV_120, 18, 80, 1, },
    { FDRIVE_DRV_120, 18, 82, 1, },
    { FDRIVE_DRV_120, 18, 83, 1, },
    { FDRIVE_DRV_120, 20, 80, 1, },
    /* 720 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 80, 1, },
    { FDRIVE_DRV_120, 11, 80, 1, },
    /* 360 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 40, 1, },
    { FDRIVE_DRV_120,  9, 40, 0, },
    { FDRIVE_DRV_120, 10, 41, 1, },
    { FDRIVE_DRV_120, 10, 42, 1, },
    /* 320 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  8, 40, 1, },
    { FDRIVE_DRV_120,  8, 40, 0, },
    /* 360 kB must match 5"1/4 better than 3"1/2... */
    { FDRIVE_DRV_144,  9, 80, 0, },
    /* end */
    { FDRIVE_DRV_NONE, -1, -1, 0, },
};
1916
/* Report floppy geometry for @bs: use an explicit geometry hint if one is
 * set, otherwise match the image's sector count against the fd_formats
 * table (preferring formats compatible with @drive_in), falling back to
 * the first compatible entry, or entry 1, if nothing matches exactly. */
void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
                                   int *max_track, int *last_sect,
                                   FDriveType drive_in, FDriveType *drive)
{
    const FDFormat *parse;
    uint64_t nb_sectors, size;
    int i, first_match, match;

    bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
    if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
        /* User defined disk */
    } else {
        bdrv_get_geometry(bs, &nb_sectors);
        match = -1;
        first_match = -1;
        for (i = 0; ; i++) {
            parse = &fd_formats[i];
            if (parse->drive == FDRIVE_DRV_NONE) {
                break;
            }
            if (drive_in == parse->drive ||
                drive_in == FDRIVE_DRV_NONE) {
                size = (parse->max_head + 1) * parse->max_track *
                    parse->last_sect;
                if (nb_sectors == size) {
                    match = i;
                    break;
                }
                /* Remember the first compatible format as a fallback. */
                if (first_match == -1) {
                    first_match = i;
                }
            }
        }
        if (match == -1) {
            if (first_match == -1) {
                match = 1;
            } else {
                match = first_match;
            }
            parse = &fd_formats[match];
        }
        *nb_heads = parse->max_head + 1;
        *max_track = parse->max_track;
        *last_sect = parse->last_sect;
        *drive = parse->drive;
    }
}
1964
/* Return the BIOS ATA translation hint (BIOS_ATA_TRANSLATION_*). */
int bdrv_get_translation_hint(BlockDriverState *bs)
{
    return bs->translation;
}
1969
/* Configure the actions taken when a read or write error occurs. */
void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
                       BlockErrorAction on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}
1976
/* Return the configured error action for reads or writes. */
BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}
1981
/* Return non-zero if the device was opened read-only. */
int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}
1986
/* Return non-zero if the device is a SCSI generic (pass-through) device. */
int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}
1991
/* Return non-zero if the write cache is enabled for this device. */
int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}
1996
/* Return non-zero if this image or its backing file is encrypted. */
int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}
2003
/* Return non-zero if an encryption key must still be supplied (for this
 * image or its backing file) before I/O can proceed. */
int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}
2012
/* Supply the encryption key for @bs (and, first, for its encrypted backing
 * file). On the first successful key set the deferred media-change
 * callback is fired. Returns 0 on success or a negative errno. */
int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        /* If only the backing file is encrypted we are done. */
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}
2038
/* Copy the format driver's name into @buf (empty string if no driver). */
void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
{
    if (!bs->drv) {
        buf[0] = '\0';
    } else {
        pstrcpy(buf, buf_size, bs->drv->format_name);
    }
}
2047
/* Invoke @it once per registered block driver, passing its format name. */
void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}
2057
/* Look up an open block device by its device name; NULL if not found. */
BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}
2069
2f399b0a
MA
2070BlockDriverState *bdrv_next(BlockDriverState *bs)
2071{
2072 if (!bs) {
2073 return QTAILQ_FIRST(&bdrv_states);
2074 }
2075 return QTAILQ_NEXT(bs, list);
2076}
2077
/* Invoke @it once per open block device. */
void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        it(opaque, bs);
    }
}
2086
/* Return the device name of this block device (may be an empty string). */
const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}
2091
/* Flush every writable block device that has a medium inserted. */
void bdrv_flush_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!bdrv_is_read_only(bs) && bdrv_is_inserted(bs)) {
            bdrv_flush(bs);
        }
    }
}
2102
f2feebbd
KW
2103int bdrv_has_zero_init(BlockDriverState *bs)
2104{
2105 assert(bs->drv);
2106
336c1c12
KW
2107 if (bs->drv->bdrv_has_zero_init) {
2108 return bs->drv->bdrv_has_zero_init(bs);
f2feebbd
KW
2109 }
2110
2111 return 1;
2112}
2113
/* Parameter/result bundle for the coroutine wrapper around
 * bdrv_co_is_allocated() (see bdrv_is_allocated()). */
typedef struct BdrvCoIsAllocatedData {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;      /* out: contiguous sectors in the same state */
    int ret;        /* out: result of bdrv_co_is_allocated() */
    bool done;      /* set when the coroutine has finished */
} BdrvCoIsAllocatedData;
2122
/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, int *pnum)
{
    int64_t n;

    if (sector_num >= bs->total_sectors) {
        *pnum = 0;
        return 0;
    }

    /* Clamp nb_sectors to the end of the image. */
    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_is_allocated) {
        *pnum = nb_sectors;
        return 1;
    }

    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
}
2160
/* Coroutine wrapper for bdrv_is_allocated() */
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
{
    BdrvCoIsAllocatedData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
                                     data->pnum);
    /* Signal the synchronous caller that data->ret is now valid. */
    data->done = true;
}
2171
/*
 * Synchronous wrapper around bdrv_co_is_allocated().
 *
 * See bdrv_co_is_allocated() for details.
 */
int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
                      int *pnum)
{
    Coroutine *co;
    BdrvCoIsAllocatedData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    /* Enter the coroutine, then pump AIO completions until it finishes. */
    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
    qemu_coroutine_enter(co, &data);
    while (!data.done) {
        qemu_aio_wait();
    }
    return data.ret;
}
2196
/* Emit a QEVENT_BLOCK_IO_ERROR monitor event describing how an I/O error
 * on @bdrv was handled (@action) and whether it was a read or a write. */
void bdrv_mon_event(const BlockDriverState *bdrv,
                    BlockMonEventAction action, int is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);

    /* monitor_protocol_event() took its own reference. */
    qobject_decref(data);
}
2225
/* QMP 'query-block': build a BlockInfoList entry for every registered
 * BlockDriverState.  Caller owns the returned list. */
BlockInfoList *qmp_query_block(Error **errp)
{
    BlockInfoList *head = NULL, *cur_item = NULL;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockInfoList *info = g_malloc0(sizeof(*info));

        info->value = g_malloc0(sizeof(*info->value));
        info->value->device = g_strdup(bs->device_name);
        info->value->type = g_strdup("unknown");
        info->value->locked = bdrv_dev_is_medium_locked(bs);
        info->value->removable = bdrv_dev_has_removable_media(bs);

        /* tray state only makes sense for removable media */
        if (bdrv_dev_has_removable_media(bs)) {
            info->value->has_tray_open = true;
            info->value->tray_open = bdrv_dev_is_tray_open(bs);
        }

        if (bdrv_iostatus_is_enabled(bs)) {
            info->value->has_io_status = true;
            info->value->io_status = bs->iostatus;
        }

        /* bs->drv != NULL means a medium is inserted */
        if (bs->drv) {
            info->value->has_inserted = true;
            info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
            info->value->inserted->file = g_strdup(bs->filename);
            info->value->inserted->ro = bs->read_only;
            info->value->inserted->drv = g_strdup(bs->drv->format_name);
            info->value->inserted->encrypted = bs->encrypted;
            if (bs->backing_file[0]) {
                info->value->inserted->has_backing_file = true;
                info->value->inserted->backing_file = g_strdup(bs->backing_file);
            }

            /* report throttling limits when enabled */
            if (bs->io_limits_enabled) {
                info->value->inserted->bps =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->bps_rd =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
                info->value->inserted->bps_wr =
                               bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
                info->value->inserted->iops =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->iops_rd =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
                info->value->inserted->iops_wr =
                               bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
            }
        }

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}
a36e69dd 2289
/* Consider exposing this as a full fledged QMP command */
/* Build the accounting statistics for one BlockDriverState; recurses into
 * bs->file so protocol-level stats appear as 'parent'. */
static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
{
    BlockStats *s;

    s = g_malloc0(sizeof(*s));

    /* anonymous states (e.g. protocol layers) have no device name */
    if (bs->device_name[0]) {
        s->has_device = true;
        s->device = g_strdup(bs->device_name);
    }

    s->stats = g_malloc0(sizeof(*s->stats));
    s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
    s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
    s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
    s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
    s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
    s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
    s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
    s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
    s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];

    if (bs->file) {
        s->has_parent = true;
        s->parent = qmp_query_blockstat(bs->file, NULL);
    }

    return s;
}
2320
/* QMP 'query-blockstats': collect per-device statistics for every
 * registered BlockDriverState.  Caller owns the returned list. */
BlockStatsList *qmp_query_blockstats(Error **errp)
{
    BlockStatsList *head = NULL, *cur_item = NULL;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockStatsList *info = g_malloc0(sizeof(*info));
        info->value = qmp_query_blockstat(bs, NULL);

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}
ea2384d3 2341
045df330
AL
2342const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2343{
2344 if (bs->backing_hd && bs->backing_hd->encrypted)
2345 return bs->backing_file;
2346 else if (bs->encrypted)
2347 return bs->filename;
2348 else
2349 return NULL;
2350}
2351
/* Copy the backing filename of @bs into @filename (truncating to
 * @filename_size, always NUL-terminated by pstrcpy). */
void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}
2357
/* Write @nb_sectors starting at @sector_num with driver-level compression.
 * Returns -ENOMEDIUM without a medium, -ENOTSUP if the driver cannot
 * compress, -EIO on an out-of-range request. */
int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_write_compressed)
        return -ENOTSUP;
    if (bdrv_check_request(bs, sector_num, nb_sectors))
        return -EIO;

    /* keep the dirty bitmap in sync with the compressed write */
    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}
3b46e624 2375
/* Fill @bdi with driver-specific image information.  @bdi is zeroed first
 * so fields the driver does not set read as 0. */
int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_get_info)
        return -ENOTSUP;
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}
2386
45566e9c
CH
2387int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2388 int64_t pos, int size)
178e08a5
AL
2389{
2390 BlockDriver *drv = bs->drv;
2391 if (!drv)
2392 return -ENOMEDIUM;
7cdb1f6d
MK
2393 if (drv->bdrv_save_vmstate)
2394 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2395 if (bs->file)
2396 return bdrv_save_vmstate(bs->file, buf, pos, size);
2397 return -ENOTSUP;
178e08a5
AL
2398}
2399
/* Load @size bytes of VM state from offset @pos; mirror image of
 * bdrv_save_vmstate(): driver hook first, then the protocol layer. */
int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}
2412
8b9b0cc2
KW
2413void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2414{
2415 BlockDriver *drv = bs->drv;
2416
2417 if (!drv || !drv->bdrv_debug_event) {
2418 return;
2419 }
2420
2421 return drv->bdrv_debug_event(bs, event);
2422
2423}
2424
/**************************************************************/
/* handling of snapshots */

/* Return 1 if internal snapshots can be taken on @bs: requires a writable,
 * inserted medium and a driver (or underlying protocol) snapshot hook. */
int bdrv_can_snapshot(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    if (!drv->bdrv_snapshot_create) {
        if (bs->file != NULL) {
            /* format driver can't snapshot; maybe the protocol layer can */
            return bdrv_can_snapshot(bs->file);
        }
        return 0;
    }

    return 1;
}
2444
199630b6
BS
2445int bdrv_is_snapshot(BlockDriverState *bs)
2446{
2447 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2448}
2449
/* Return the first snapshot-capable device, caching the answer in the
 * bs_snapshots global so repeated calls are cheap. */
BlockDriverState *bdrv_snapshots(void)
{
    BlockDriverState *bs;

    /* fast path: reuse the cached device */
    if (bs_snapshots) {
        return bs_snapshots;
    }

    bs = NULL;
    while ((bs = bdrv_next(bs))) {
        if (bdrv_can_snapshot(bs)) {
            bs_snapshots = bs;
            return bs;
        }
    }
    return NULL;
}
2467
/* Create an internal snapshot described by @sn_info; driver hook first,
 * then fall through to the protocol layer. */
int bdrv_snapshot_create(BlockDriverState *bs,
                         QEMUSnapshotInfo *sn_info)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_create)
        return drv->bdrv_snapshot_create(bs, sn_info);
    if (bs->file)
        return bdrv_snapshot_create(bs->file, sn_info);
    return -ENOTSUP;
}
2480
/* Revert @bs to snapshot @snapshot_id.  When only the protocol layer
 * supports snapshots, the format driver must be closed around the revert
 * and reopened afterwards so it re-reads the (changed) image metadata. */
int bdrv_snapshot_goto(BlockDriverState *bs,
                       const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    int ret, open_ret;

    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_goto)
        return drv->bdrv_snapshot_goto(bs, snapshot_id);

    if (bs->file) {
        drv->bdrv_close(bs);
        ret = bdrv_snapshot_goto(bs->file, snapshot_id);
        open_ret = drv->bdrv_open(bs, bs->open_flags);
        if (open_ret < 0) {
            /* reopen failed: the state is unusable, tear it down */
            bdrv_delete(bs->file);
            bs->drv = NULL;
            return open_ret;
        }
        return ret;
    }

    return -ENOTSUP;
}
2506
2507int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2508{
2509 BlockDriver *drv = bs->drv;
2510 if (!drv)
19cb3738 2511 return -ENOMEDIUM;
7cdb1f6d
MK
2512 if (drv->bdrv_snapshot_delete)
2513 return drv->bdrv_snapshot_delete(bs, snapshot_id);
2514 if (bs->file)
2515 return bdrv_snapshot_delete(bs->file, snapshot_id);
2516 return -ENOTSUP;
faea38e7
FB
2517}
2518
/* List internal snapshots into *psn_info (driver allocates the array);
 * returns the count or a negative errno. */
int bdrv_snapshot_list(BlockDriverState *bs,
                       QEMUSnapshotInfo **psn_info)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_list)
        return drv->bdrv_snapshot_list(bs, psn_info);
    if (bs->file)
        return bdrv_snapshot_list(bs->file, psn_info);
    return -ENOTSUP;
}
2531
/* Load snapshot @snapshot_name as a temporary (read-only) view.
 * Only valid on read-only devices; no protocol-layer fallback here. */
int bdrv_snapshot_load_tmp(BlockDriverState *bs,
        const char *snapshot_name)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!bs->read_only) {
        return -EINVAL;
    }
    if (drv->bdrv_snapshot_load_tmp) {
        return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
    }
    return -ENOTSUP;
}
2547
#define NB_SUFFIXES 4

/* Format @size into @buf using binary units (K/M/G/T).  Values up to 999
 * are printed as-is; otherwise one decimal place is shown below 10 units,
 * a rounded integer below 1000 units.  Returns @buf. */
char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t unit = 1024;
    int idx;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
        return buf;
    }

    for (idx = 0; idx < NB_SUFFIXES; idx++) {
        if (size < 10 * unit) {
            /* below 10 units: keep one decimal place */
            snprintf(buf, buf_size, "%0.1f%c",
                     (double)size / unit, suffixes[idx]);
            break;
        }
        if (size < 1000 * unit || idx == NB_SUFFIXES - 1) {
            /* round to the nearest whole unit */
            snprintf(buf, buf_size, "%" PRId64 "%c",
                     (size + (unit >> 1)) / unit, suffixes[idx]);
            break;
        }
        unit *= 1024;
    }
    return buf;
}
2577
/* Format one snapshot row into @buf; with sn == NULL, format the column
 * header instead.  Returns @buf. */
char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
{
    char buf1[128], date_buf[128], clock_buf[128];
#ifdef _WIN32
    struct tm *ptm;
#else
    struct tm tm;
#endif
    time_t ti;
    int64_t secs;

    if (!sn) {
        /* header row */
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
    } else {
        ti = sn->date_sec;
        /* Windows lacks localtime_r; localtime() uses static storage there */
#ifdef _WIN32
        ptm = localtime(&ti);
        strftime(date_buf, sizeof(date_buf),
                 "%Y-%m-%d %H:%M:%S", ptm);
#else
        localtime_r(&ti, &tm);
        strftime(date_buf, sizeof(date_buf),
                 "%Y-%m-%d %H:%M:%S", &tm);
#endif
        /* split the guest clock (ns) into hh:mm:ss.mmm */
        secs = sn->vm_clock_nsec / 1000000000;
        snprintf(clock_buf, sizeof(clock_buf),
                 "%02d:%02d:%02d.%03d",
                 (int)(secs / 3600),
                 (int)((secs / 60) % 60),
                 (int)(secs % 60),
                 (int)((sn->vm_clock_nsec / 1000000) % 1000));
        snprintf(buf, buf_size,
                 "%-10s%-20s%7s%20s%15s",
                 sn->id_str, sn->name,
                 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
                 date_buf,
                 clock_buf);
    }
    return buf;
}
2620
/**************************************************************/
/* async I/Os */

/* Submit an asynchronous vectored read; @cb fires with the result. */
BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                 QEMUIOVector *qiov, int nb_sectors,
                                 BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, false);
}
2633
/* Submit an asynchronous vectored write; @cb fires with the result. */
BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);

    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
                                 cb, opaque, true);
}
2643
/* Tracks a batch of merged write requests and the per-request callbacks
 * plus any buffers/qiovs allocated during merging that must be freed. */
typedef struct MultiwriteCB {
    int error;          /* first error seen; reported to every callback */
    int num_requests;   /* submitted-but-not-completed AIOs (+1 dummy) */
    int num_callbacks;  /* original request count before merging */
    struct {
        BlockDriverCompletionFunc *cb;
        void *opaque;
        QEMUIOVector *free_qiov;  /* merged qiov to destroy, or NULL */
        void *free_buf;           /* zero-fill buffer to free, or NULL */
    } callbacks[];
} MultiwriteCB;
2656
/* Invoke every original caller's completion callback with the batch result
 * and release the merge-time allocations. */
static void multiwrite_user_cb(MultiwriteCB *mcb)
{
    int i;

    for (i = 0; i < mcb->num_callbacks; i++) {
        mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
        if (mcb->callbacks[i].free_qiov) {
            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
        }
        g_free(mcb->callbacks[i].free_qiov);
        qemu_vfree(mcb->callbacks[i].free_buf);
    }
}
2670
/* Per-AIO completion for a multiwrite batch: record the first error and,
 * when the last outstanding request finishes, complete the whole batch. */
static void multiwrite_cb(void *opaque, int ret)
{
    MultiwriteCB *mcb = opaque;

    trace_multiwrite_cb(mcb, ret);

    /* keep only the first error */
    if (ret < 0 && !mcb->error) {
        mcb->error = ret;
    }

    mcb->num_requests--;
    if (mcb->num_requests == 0) {
        multiwrite_user_cb(mcb);
        g_free(mcb);
    }
}
2687
2688static int multiwrite_req_compare(const void *a, const void *b)
2689{
77be4366
CH
2690 const BlockRequest *req1 = a, *req2 = b;
2691
2692 /*
2693 * Note that we can't simply subtract req2->sector from req1->sector
2694 * here as that could overflow the return value.
2695 */
2696 if (req1->sector > req2->sector) {
2697 return 1;
2698 } else if (req1->sector < req2->sector) {
2699 return -1;
2700 } else {
2701 return 0;
2702 }
40b4f539
KW
2703}
2704
/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
    int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // This handles the cases that are valid for all block drivers, namely
        // exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        // The block driver may decide that it makes sense to combine requests
        // even if there is a gap of some sectors between them. In this case,
        // the gap is filled with zeros (therefore only applicable for yet
        // unused space in format like qcow2).
        if (!merge && bs->drv->bdrv_merge_requests) {
            merge = bs->drv->bdrv_merge_requests(bs, &reqs[outidx], &reqs[i]);
        }

        // Never exceed the per-request iovec limit of the host
        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                    reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, size);

            // We might need to add some zeros between the two requests
            if (reqs[i].sector > oldreq_last) {
                size_t zero_bytes = (reqs[i].sector - oldreq_last) << 9;
                uint8_t *buf = qemu_blockalign(bs, zero_bytes);
                memset(buf, 0, zero_bytes);
                qemu_iovec_add(qiov, buf, zero_bytes);
                // remember the buffer so multiwrite_user_cb() can free it
                mcb->callbacks[i].free_buf = buf;
            }

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            // not mergeable: the current request starts a new output slot
            outidx++;
            reqs[outidx].sector     = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov       = reqs[i].qiov;
        }
    }

    return outidx + 1;
}
2779
/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. In error case this function returns -1, and any of the
 * requests may or may not be submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    BlockDriverAIOCB *acb;
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /*
     * Run the aio requests. As soon as one request can't be submitted
     * successfully, fail all requests that are not yet submitted (we must
     * return failure for all requests anyway)
     *
     * num_requests cannot be set to the right value immediately: If
     * bdrv_aio_writev fails for some request, num_requests would be too high
     * and therefore multiwrite_cb() would never recognize the multiwrite
     * request as completed. We also cannot use the loop variable i to set it
     * when the first request fails because the callback may already have been
     * called for previously submitted requests. Thus, num_requests must be
     * incremented for each request that is submitted.
     *
     * The problem that callbacks may be called early also means that we need
     * to take care that num_requests doesn't become 0 before all requests are
     * submitted - multiwrite_cb() would consider the multiwrite request
     * completed. A dummy request that is "completed" by a manual call to
     * multiwrite_cb() takes care of this.
     */
    mcb->num_requests = 1;

    // Run the aio requests
    for (i = 0; i < num_reqs; i++) {
        mcb->num_requests++;
        acb = bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
            reqs[i].nb_sectors, multiwrite_cb, mcb);

        if (acb == NULL) {
            // We can only fail the whole thing if no request has been
            // submitted yet. Otherwise we'll wait for the submitted AIOs to
            // complete and report the error in the callback.
            if (i == 0) {
                trace_bdrv_aio_multiwrite_earlyfail(mcb);
                goto fail;
            } else {
                trace_bdrv_aio_multiwrite_latefail(mcb, i);
                multiwrite_cb(mcb, -EIO);
                break;
            }
        }
    }

    /* Complete the dummy request */
    multiwrite_cb(mcb, 0);

    return 0;

fail:
    for (i = 0; i < mcb->num_callbacks; i++) {
        reqs[i].error = -EIO;
    }
    g_free(mcb);
    return -1;
}
2881
/* Cancel an in-flight AIO request via its pool's cancel hook. */
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->pool->cancel(acb);
}
2886
/* block I/O throttling */

/* Decide whether a request of @nb_sectors would exceed the configured
 * bytes-per-second limit for the current slice.  Returns true (and stores
 * the estimated delay in *wait, when non-NULL) if the request must be
 * throttled; false means it may proceed now. */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
                 bool is_write, double elapsed_time, uint64_t *wait)
{
    uint64_t bps_limit = 0;
    double   bytes_limit, bytes_base, bytes_res;
    double   slice_time, wait_time;

    /* a total limit overrides the per-direction limit */
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.bps[is_write]) {
        bps_limit = bs->io_limits.bps[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    bytes_limit = bps_limit * slice_time;
    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        /* total limit: both directions count against the budget */
        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
    }

    /* bytes_base: the bytes of data which have been read/written; and
     * it is obtained from the history statistic info.
     * bytes_res: the remaining bytes of data which need to be read/written.
     * (bytes_base + bytes_res) / bps_limit: used to calcuate
     * the total time for completing reading/writting all data.
     */
    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

    if (bytes_base + bytes_res <= bytes_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;

    /* When the I/O rate at runtime exceeds the limits,
     * bs->slice_end need to be extended in order that the current statistic
     * info can be kept until the timer fire, so it is increased and tuned
     * based on the result of experiment.
     */
    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
2947
/* IOPS counterpart of bdrv_exceed_bps_limits(): returns true (and the
 * estimated delay in *wait) when one more operation would exceed the
 * configured operations-per-second limit for the current slice. */
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
                             double elapsed_time, uint64_t *wait)
{
    uint64_t iops_limit = 0;
    double   ios_limit, ios_base;
    double   slice_time, wait_time;

    /* a total limit overrides the per-direction limit */
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.iops[is_write]) {
        iops_limit = bs->io_limits.iops[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    ios_limit  = iops_limit * slice_time;
    ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        /* total limit: both directions count against the budget */
        ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
    }

    if (ios_base + 1 <= ios_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (ios_base + 1) / iops_limit;
    if (wait_time > elapsed_time) {
        wait_time = wait_time - elapsed_time;
    } else {
        wait_time = 0;
    }

    /* extend the slice so the statistics survive until the timer fires */
    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
2999
/* Combine the bps and iops checks: returns true when the request must be
 * delayed, storing the larger of the two estimated delays in *wait.
 * Also starts a fresh accounting slice when the current one has expired. */
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
                           bool is_write, int64_t *wait)
{
    int64_t  now, max_wait;
    uint64_t bps_wait = 0, iops_wait = 0;
    double   elapsed_time;
    int      bps_ret, iops_ret;

    now = qemu_get_clock_ns(vm_clock);
    if ((bs->slice_start < now)
        && (bs->slice_end > now)) {
        /* still inside the current slice: just push its end out */
        bs->slice_end = now + bs->slice_time;
    } else {
        /* slice expired: restart it and re-baseline the counters */
        bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
        bs->slice_start = now;
        bs->slice_end   = now + bs->slice_time;

        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];

        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
    }

    elapsed_time  = now - bs->slice_start;
    elapsed_time  /= (NANOSECONDS_PER_SECOND);

    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
                                      is_write, elapsed_time, &bps_wait);
    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
                                      elapsed_time, &iops_wait);
    if (bps_ret || iops_ret) {
        /* throttle by whichever limit demands the longer wait */
        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
        if (wait) {
            *wait = max_wait;
        }

        now = qemu_get_clock_ns(vm_clock);
        if (bs->slice_end < now + max_wait) {
            bs->slice_end = now + max_wait;
        }

        return true;
    }

    if (wait) {
        *wait = 0;
    }

    return false;
}
ce1a14dc 3051
/**************************************************************/
/* async block device emulation */

/* AIOCB for emulating AIO on top of the synchronous driver interface;
 * completion is deferred to a bottom half. */
typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;        /* bottom half that delivers the completion */
    int ret;           /* result of the synchronous read/write */
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;   /* linear bounce buffer covering qiov */
    int is_write;
} BlockDriverAIOCBSync;
3064
/* Cancel an emulated AIO request: drop its pending bottom half so the
 * completion callback never runs, then release the AIOCB. */
static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}
3073
/* Pool for the synchronous-emulation AIOCBs above. */
static AIOPool bdrv_em_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBSync),
    .cancel             = bdrv_aio_cancel_em,
};
3078
/* Bottom half that completes an emulated AIO request: copy the bounce
 * buffer back for reads, fire the user callback, then clean up. */
static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write)
        qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}
beac80cd 3091
/* Emulate an asynchronous vectored transfer on top of the driver's
 * synchronous bdrv_read/bdrv_write via a bounce buffer; completion is
 * delivered from a bottom half so the callback never runs re-entrantly. */
static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)

{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);

    if (!acb->bh)
        acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    /* the transfer itself is synchronous; only completion is deferred */
    if (is_write) {
        qemu_iovec_to_buffer(acb->qiov, acb->bounce);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}
3122
/* Emulated-AIO read entry point (see bdrv_aio_rw_vector). */
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}
83f64091 3129
/* Emulated-AIO write entry point (see bdrv_aio_rw_vector). */
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}
beac80cd 3136
/* AIOCB for requests executed inside a coroutine; completion is delivered
 * from a bottom half once the coroutine has filled in req.error. */
typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;   /* sector/nb_sectors/qiov in, error out */
    bool is_write;
    QEMUBH* bh;
} BlockDriverAIOCBCoroutine;
3144
/* Cancel for coroutine AIOCBs: simply drain all outstanding AIO, which
 * guarantees this request has completed by the time we return. */
static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    qemu_aio_flush();
}
3149
/* Pool for the coroutine-based AIOCBs above. */
static AIOPool bdrv_em_co_aio_pool = {
    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
    .cancel             = bdrv_aio_co_cancel_em,
};
3154
/* Bottom half that fires the user callback after a coroutine request
 * finished, then releases the AIOCB. */
static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);
    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}
3163
/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov);
    }

    /* defer the user callback to a bottom half */
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
3181
68485420
KW
/* Start an asynchronous read or write by spawning a coroutine that runs
 * bdrv_co_do_rw().  The request parameters are stashed in the AIOCB; the
 * caller's cb/opaque are invoked from a bottom half on completion.
 * Returns the AIOCB handle (never NULL). */
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
3204
07f07615 3205static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
b2e12bc6 3206{
07f07615
PB
3207 BlockDriverAIOCBCoroutine *acb = opaque;
3208 BlockDriverState *bs = acb->common.bs;
b2e12bc6 3209
07f07615
PB
3210 acb->req.error = bdrv_co_flush(bs);
3211 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
b2e12bc6 3212 qemu_bh_schedule(acb->bh);
b2e12bc6
CH
3213}
3214
07f07615 3215BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
016f5cf6
AG
3216 BlockDriverCompletionFunc *cb, void *opaque)
3217{
07f07615 3218 trace_bdrv_aio_flush(bs, opaque);
016f5cf6 3219
07f07615
PB
3220 Coroutine *co;
3221 BlockDriverAIOCBCoroutine *acb;
016f5cf6 3222
07f07615
PB
3223 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3224 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3225 qemu_coroutine_enter(co, acb);
016f5cf6 3226
016f5cf6
AG
3227 return &acb->common;
3228}
3229
4265d620
PB
/* Coroutine entry point for bdrv_aio_discard(): performs the discard, then
 * schedules the shared completion bottom half. */
static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}
3239
/* Asynchronous discard of nb_sectors starting at sector_num.  Runs
 * bdrv_co_discard() in a coroutine; completion is delivered via cb/opaque
 * from a bottom half.  Returns the AIOCB handle. */
BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}
3257
ea2384d3
FB
/* Register all built-in block drivers (module init hook). */
void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}
ce1a14dc 3262
eb852011
MA
/* Like bdrv_init(), but first enables the format whitelist so that only
 * whitelisted drivers may be used. */
void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}
3268
c16b5a2c
CH
/* Allocate an AIOCB from @pool, reusing a free-list entry when available.
 * acb->pool is only assigned on a fresh allocation because recycled entries
 * already carry it from their first qemu_aio_get().  The returned memory is
 * zeroed only on first allocation (g_malloc0); callers must initialize any
 * pool-specific fields themselves. */
void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    if (pool->free_aiocb) {
        acb = pool->free_aiocb;
        pool->free_aiocb = acb->next;
    } else {
        acb = g_malloc0(pool->aiocb_size);
        acb->pool = pool;
    }
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}
3286
/* Return an AIOCB to its owning pool's free list.  The memory is never
 * actually freed; it is recycled by the next qemu_aio_get() on that pool. */
void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
    AIOPool *pool = acb->pool;
    acb->next = pool->free_aiocb;
    pool->free_aiocb = acb;
}
19cb3738 3294
f9f05dc5
KW
3295/**************************************************************/
3296/* Coroutine block device emulation */
3297
/* Rendezvous between an AIO callback and the coroutine that issued the
 * request: the callback stores the result in ret and re-enters coroutine. */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;   /* coroutine to resume on completion */
    int ret;                /* AIO result code passed to the callback */
} CoroutineIOCompletion;
3302
/* AIO completion callback for coroutine-emulated I/O: record the result and
 * resume the waiting coroutine (which yielded in bdrv_co_io_em/bdrv_co_flush/
 * bdrv_co_discard). */
static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}
3310
/* Run the driver's bdrv_aio_readv/bdrv_aio_writev from coroutine context and
 * wait for it by yielding; bdrv_co_io_em_complete() resumes us with the
 * result.  Returns the AIO result, or -EIO if the driver failed to even
 * start the request (acb == NULL).
 * Note: the CoroutineIOCompletion lives on this coroutine's stack, which is
 * safe because we do not return before the completion callback has run. */
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}
3336
/* Coroutine read emulated on top of the driver's AIO interface. */
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}
3343
/* Coroutine write emulated on top of the driver's AIO interface. */
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}
3350
07f07615 3351static void coroutine_fn bdrv_flush_co_entry(void *opaque)
e7a8a783 3352{
07f07615
PB
3353 RwCo *rwco = opaque;
3354
3355 rwco->ret = bdrv_co_flush(rwco->bs);
3356}
3357
/* Flush @bs from coroutine context.
 *
 * Order matters: first flush the format driver's internal caches to the OS
 * (always, even with cache=unsafe), then — unless BDRV_O_NO_FLUSH — force the
 * data to stable storage via the first available mechanism:
 * bdrv_co_flush_to_disk, bdrv_aio_flush (bridged with yield/complete), or
 * nothing at all for drivers with hardcoded cache behaviour.
 * Returns 0 on success or if nothing needed doing, negative errno on error. */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs->drv) {
        /* No medium/driver: nothing to flush. */
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        return 0;
    }

    if (bs->drv->bdrv_co_flush_to_disk) {
        return bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        /* Bridge the AIO flush interface into this coroutine. */
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        return 0;
    }
}
3409
0f15423c
AL
3410void bdrv_invalidate_cache(BlockDriverState *bs)
3411{
3412 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3413 bs->drv->bdrv_invalidate_cache(bs);
3414 }
3415}
3416
/* Invalidate caches on every registered BlockDriverState. */
void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}
3425
07f07615
PB
/* Synchronous flush wrapper around bdrv_co_flush().
 * From coroutine context the flush runs inline; otherwise a coroutine is
 * spawned and we pump the AIO event loop until it signals completion via
 * rwco.ret (NOT_DONE acts as the "still running" sentinel). */
int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}
3447
4265d620
PB
/* Coroutine entry point used by the synchronous bdrv_discard() wrapper. */
static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}
3454
/* Discard (unmap) nb_sectors starting at sector_num from coroutine context.
 * Validates the request, then uses bdrv_co_discard if the driver has it,
 * falls back to bridging bdrv_aio_discard, and silently succeeds (returns 0)
 * for drivers with no discard support — discard is only advisory.
 * Errors: -ENOMEDIUM (no driver), -EIO (bad range), -EROFS (read-only). */
int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    } else if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    } else if (bs->drv->bdrv_aio_discard) {
        /* Bridge the AIO discard interface into this coroutine. */
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        return 0;
    }
}
3484
/* Synchronous discard wrapper around bdrv_co_discard(); same fast-path /
 * spawn-and-wait structure as bdrv_flush(). */
int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}
3508
19cb3738
FB
3509/**************************************************************/
3510/* removable device support */
3511
3512/**
3513 * Return TRUE if the media is present
3514 */
3515int bdrv_is_inserted(BlockDriverState *bs)
3516{
3517 BlockDriver *drv = bs->drv;
a1aff5bf 3518
19cb3738
FB
3519 if (!drv)
3520 return 0;
3521 if (!drv->bdrv_is_inserted)
a1aff5bf
MA
3522 return 1;
3523 return drv->bdrv_is_inserted(bs);
19cb3738
FB
3524}
3525
3526/**
8e49ca46
MA
3527 * Return whether the media changed since the last call to this
3528 * function, or -ENOTSUP if we don't know. Most drivers don't know.
19cb3738
FB
3529 */
3530int bdrv_media_changed(BlockDriverState *bs)
3531{
3532 BlockDriver *drv = bs->drv;
19cb3738 3533
8e49ca46
MA
3534 if (drv && drv->bdrv_media_changed) {
3535 return drv->bdrv_media_changed(bs);
3536 }
3537 return -ENOTSUP;
19cb3738
FB
3538}
3539
3540/**
3541 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3542 */
fdec4404 3543void bdrv_eject(BlockDriverState *bs, int eject_flag)
19cb3738
FB
3544{
3545 BlockDriver *drv = bs->drv;
19cb3738 3546
822e1cd1
MA
3547 if (drv && drv->bdrv_eject) {
3548 drv->bdrv_eject(bs, eject_flag);
19cb3738
FB
3549 }
3550}
3551
19cb3738
FB
3552/**
3553 * Lock or unlock the media (if it is locked, the user won't be able
3554 * to eject it manually).
3555 */
025e849a 3556void bdrv_lock_medium(BlockDriverState *bs, bool locked)
19cb3738
FB
3557{
3558 BlockDriver *drv = bs->drv;
3559
025e849a 3560 trace_bdrv_lock_medium(bs, locked);
b8c6d095 3561
025e849a
MA
3562 if (drv && drv->bdrv_lock_medium) {
3563 drv->bdrv_lock_medium(bs, locked);
19cb3738
FB
3564 }
3565}
985a03b0
TS
3566
3567/* needed for generic scsi interface */
3568
3569int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3570{
3571 BlockDriver *drv = bs->drv;
3572
3573 if (drv && drv->bdrv_ioctl)
3574 return drv->bdrv_ioctl(bs, req, buf);
3575 return -ENOTSUP;
3576}
7d780669 3577
221f715d
AL
3578BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3579 unsigned long int req, void *buf,
3580 BlockDriverCompletionFunc *cb, void *opaque)
7d780669 3581{
221f715d 3582 BlockDriver *drv = bs->drv;
7d780669 3583
221f715d
AL
3584 if (drv && drv->bdrv_aio_ioctl)
3585 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3586 return NULL;
7d780669 3587}
e268ca52 3588
7b6f9300
MA
/* Set the alignment (in bytes) that qemu_blockalign() will use for buffers
 * belonging to @bs. */
void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}
7cd1e32a 3593
e268ca52
AL
3594void *qemu_blockalign(BlockDriverState *bs, size_t size)
3595{
3596 return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3597}
7cd1e32a 3598
/* Enable or disable dirty-sector tracking on @bs.
 * Enabling allocates a zeroed bitmap with one bit per
 * BDRV_SECTORS_PER_DIRTY_CHUNK sectors (sized from the current device
 * length, rounded up); disabling frees it.  Either way the dirty counter
 * is reset to 0.  Re-enabling while already enabled keeps the existing
 * bitmap contents. */
void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
{
    int64_t bitmap_size;

    bs->dirty_count = 0;
    if (enable) {
        if (!bs->dirty_bitmap) {
            /* bits needed = chunks; bytes = bits / 8, rounded up */
            bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
                    BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
            bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

            bs->dirty_bitmap = g_malloc0(bitmap_size);
        }
    } else {
        if (bs->dirty_bitmap) {
            g_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
    }
}
3619
/* Return 1 if @sector falls in a chunk marked dirty, else 0.  Sectors past
 * the end of the device, or queries while tracking is disabled, report
 * clean. */
int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (bs->dirty_bitmap &&
        (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
        /* bitmap is an array of unsigned long words, one bit per chunk */
        return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}
3632
a55eb92c
JK
/* Clear the dirty bits covering [cur_sector, cur_sector + nr_sectors). */
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
}
aaa0eb75
LS
3638
/* Number of chunks currently marked dirty (maintained by the bitmap code). */
int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    return bs->dirty_count;
}
f88e1a42 3643
db593f25
MT
/* Mark @bs as in use (or not) by a long-running job; asserts against
 * redundant transitions to catch unbalanced acquire/release. */
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}
3649
/* Query the in-use flag set by bdrv_set_in_use(). */
int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}
3654
28a7282a
LC
/* Enable I/O status reporting for @bs and reset the status to OK. */
void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}
3660
/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors
 * (any of the stop-on-error policies on read or write). */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
           (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
            bs->on_write_error == BLOCK_ERR_STOP_ANY    ||
            bs->on_read_error == BLOCK_ERR_STOP_ANY));
}
3670
/* Disable I/O status reporting for @bs. */
void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}
3675
/* Reset the I/O status back to OK, but only when reporting is enabled. */
void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    }
}
3682
/* XXX: Today this is set by device models because it makes the implementation
   quite simple. However, the block layer knows about the error, so it's
   possible to implement this without device models being involved */
/* Record the first error seen: only transitions OK -> NOSPACE/FAILED,
 * so a later error never overwrites the original one.  @error is a
 * positive errno value (asserted). */
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    if (bdrv_iostatus_is_enabled(bs) &&
        bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        assert(error >= 0);
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}
3695
a597e79c
CH
/* Begin accounting for one I/O operation: snapshot the byte count, start
 * time, and operation type into @cookie for bdrv_acct_done() to consume. */
void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
        enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}
3706
/* Finish accounting for the operation started with bdrv_acct_start():
 * accumulate bytes, op count, and elapsed time into the per-type stats. */
void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}
3716
f88e1a42
JS
3717int bdrv_img_create(const char *filename, const char *fmt,
3718 const char *base_filename, const char *base_fmt,
3719 char *options, uint64_t img_size, int flags)
3720{
3721 QEMUOptionParameter *param = NULL, *create_options = NULL;
d220894e 3722 QEMUOptionParameter *backing_fmt, *backing_file, *size;
f88e1a42
JS
3723 BlockDriverState *bs = NULL;
3724 BlockDriver *drv, *proto_drv;
96df67d1 3725 BlockDriver *backing_drv = NULL;
f88e1a42
JS
3726 int ret = 0;
3727
3728 /* Find driver and parse its options */
3729 drv = bdrv_find_format(fmt);
3730 if (!drv) {
3731 error_report("Unknown file format '%s'", fmt);
4f70f249 3732 ret = -EINVAL;
f88e1a42
JS
3733 goto out;
3734 }
3735
3736 proto_drv = bdrv_find_protocol(filename);
3737 if (!proto_drv) {
3738 error_report("Unknown protocol '%s'", filename);
4f70f249 3739 ret = -EINVAL;
f88e1a42
JS
3740 goto out;
3741 }
3742
3743 create_options = append_option_parameters(create_options,
3744 drv->create_options);
3745 create_options = append_option_parameters(create_options,
3746 proto_drv->create_options);
3747
3748 /* Create parameter list with default values */
3749 param = parse_option_parameters("", create_options, param);
3750
3751 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
3752
3753 /* Parse -o options */
3754 if (options) {
3755 param = parse_option_parameters(options, create_options, param);
3756 if (param == NULL) {
3757 error_report("Invalid options for file format '%s'.", fmt);
4f70f249 3758 ret = -EINVAL;
f88e1a42
JS
3759 goto out;
3760 }
3761 }
3762
3763 if (base_filename) {
3764 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
3765 base_filename)) {
3766 error_report("Backing file not supported for file format '%s'",
3767 fmt);
4f70f249 3768 ret = -EINVAL;
f88e1a42
JS
3769 goto out;
3770 }
3771 }
3772
3773 if (base_fmt) {
3774 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
3775 error_report("Backing file format not supported for file "
3776 "format '%s'", fmt);
4f70f249 3777 ret = -EINVAL;
f88e1a42
JS
3778 goto out;
3779 }
3780 }
3781
792da93a
JS
3782 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
3783 if (backing_file && backing_file->value.s) {
3784 if (!strcmp(filename, backing_file->value.s)) {
3785 error_report("Error: Trying to create an image with the "
3786 "same filename as the backing file");
4f70f249 3787 ret = -EINVAL;
792da93a
JS
3788 goto out;
3789 }
3790 }
3791
f88e1a42
JS
3792 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
3793 if (backing_fmt && backing_fmt->value.s) {
96df67d1
SH
3794 backing_drv = bdrv_find_format(backing_fmt->value.s);
3795 if (!backing_drv) {
f88e1a42
JS
3796 error_report("Unknown backing file format '%s'",
3797 backing_fmt->value.s);
4f70f249 3798 ret = -EINVAL;
f88e1a42
JS
3799 goto out;
3800 }
3801 }
3802
3803 // The size for the image must always be specified, with one exception:
3804 // If we are using a backing file, we can obtain the size from there
d220894e
KW
3805 size = get_option_parameter(param, BLOCK_OPT_SIZE);
3806 if (size && size->value.n == -1) {
f88e1a42
JS
3807 if (backing_file && backing_file->value.s) {
3808 uint64_t size;
f88e1a42
JS
3809 char buf[32];
3810
f88e1a42
JS
3811 bs = bdrv_new("");
3812
96df67d1 3813 ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
f88e1a42 3814 if (ret < 0) {
96df67d1 3815 error_report("Could not open '%s'", backing_file->value.s);
f88e1a42
JS
3816 goto out;
3817 }
3818 bdrv_get_geometry(bs, &size);
3819 size *= 512;
3820
3821 snprintf(buf, sizeof(buf), "%" PRId64, size);
3822 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
3823 } else {
3824 error_report("Image creation needs a size parameter");
4f70f249 3825 ret = -EINVAL;
f88e1a42
JS
3826 goto out;
3827 }
3828 }
3829
3830 printf("Formatting '%s', fmt=%s ", filename, fmt);
3831 print_option_parameters(param);
3832 puts("");
3833
3834 ret = bdrv_create(drv, filename, param);
3835
3836 if (ret < 0) {
3837 if (ret == -ENOTSUP) {
3838 error_report("Formatting or formatting option not supported for "
3839 "file format '%s'", fmt);
3840 } else if (ret == -EFBIG) {
3841 error_report("The image size is too large for file format '%s'",
3842 fmt);
3843 } else {
3844 error_report("%s: error while creating %s: %s", filename, fmt,
3845 strerror(-ret));
3846 }
3847 }
3848
3849out:
3850 free_option_parameters(create_options);
3851 free_option_parameters(param);
3852
3853 if (bs) {
3854 bdrv_delete(bs);
3855 }
4f70f249
JS
3856
3857 return ret;
f88e1a42 3858}