1 /*
2 * QEMU System Emulator block driver
3 *
4 * Copyright (c) 2003 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor.h"
28 #include "block_int.h"
29 #include "module.h"
30 #include "qjson.h"
31 #include "qemu-coroutine.h"
32 #include "qmp-commands.h"
33 #include "qemu-timer.h"
34
35 #ifdef CONFIG_BSD
36 #include <sys/types.h>
37 #include <sys/stat.h>
38 #include <sys/ioctl.h>
39 #include <sys/queue.h>
40 #ifndef __DragonFly__
41 #include <sys/disk.h>
42 #endif
43 #endif
44
45 #ifdef _WIN32
46 #include <windows.h>
47 #endif
48
49 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
50
51 typedef enum {
52 BDRV_REQ_COPY_ON_READ = 0x1,
53 BDRV_REQ_ZERO_WRITE = 0x2,
54 } BdrvRequestFlags;
55
56 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
57 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
58 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
59 BlockDriverCompletionFunc *cb, void *opaque);
60 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
61 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
62 BlockDriverCompletionFunc *cb, void *opaque);
63 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
64 int64_t sector_num, int nb_sectors,
65 QEMUIOVector *iov);
66 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
67 int64_t sector_num, int nb_sectors,
68 QEMUIOVector *iov);
69 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
70 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
71 BdrvRequestFlags flags);
72 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
73 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74 BdrvRequestFlags flags);
75 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
76 int64_t sector_num,
77 QEMUIOVector *qiov,
78 int nb_sectors,
79 BlockDriverCompletionFunc *cb,
80 void *opaque,
81 bool is_write);
82 static void coroutine_fn bdrv_co_do_rw(void *opaque);
83
84 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
85 bool is_write, double elapsed_time, uint64_t *wait);
86 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
87 double elapsed_time, uint64_t *wait);
88 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
89 bool is_write, int64_t *wait);
90
91 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
92 QTAILQ_HEAD_INITIALIZER(bdrv_states);
93
94 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
95 QLIST_HEAD_INITIALIZER(bdrv_drivers);
96
97 /* The device to use for VM snapshots */
98 static BlockDriverState *bs_snapshots;
99
100 /* If non-zero, use only whitelisted block drivers */
101 static int use_bdrv_whitelist;
102
103 #ifdef _WIN32
104 static int is_windows_drive_prefix(const char *filename)
105 {
106 return (((filename[0] >= 'a' && filename[0] <= 'z') ||
107 (filename[0] >= 'A' && filename[0] <= 'Z')) &&
108 filename[1] == ':');
109 }
110
111 int is_windows_drive(const char *filename)
112 {
113 if (is_windows_drive_prefix(filename) &&
114 filename[2] == '\0')
115 return 1;
116 if (strstart(filename, "\\\\.\\", NULL) ||
117 strstart(filename, "//./", NULL))
118 return 1;
119 return 0;
120 }
121 #endif
122
123 /* throttling disk I/O limits */
124 void bdrv_io_limits_disable(BlockDriverState *bs)
125 {
126 bs->io_limits_enabled = false;
127
128 while (qemu_co_queue_next(&bs->throttled_reqs));
129
130 if (bs->block_timer) {
131 qemu_del_timer(bs->block_timer);
132 qemu_free_timer(bs->block_timer);
133 bs->block_timer = NULL;
134 }
135
136 bs->slice_start = 0;
137 bs->slice_end = 0;
138 bs->slice_time = 0;
139 memset(&bs->io_base, 0, sizeof(bs->io_base));
140 }
141
142 static void bdrv_block_timer(void *opaque)
143 {
144 BlockDriverState *bs = opaque;
145
146 qemu_co_queue_next(&bs->throttled_reqs);
147 }
148
149 void bdrv_io_limits_enable(BlockDriverState *bs)
150 {
151 qemu_co_queue_init(&bs->throttled_reqs);
152 bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
153 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
154 bs->slice_start = qemu_get_clock_ns(vm_clock);
155 bs->slice_end = bs->slice_start + bs->slice_time;
156 memset(&bs->io_base, 0, sizeof(bs->io_base));
157 bs->io_limits_enabled = true;
158 }
159
160 bool bdrv_io_limits_enabled(BlockDriverState *bs)
161 {
162 BlockIOLimit *io_limits = &bs->io_limits;
163 return io_limits->bps[BLOCK_IO_LIMIT_READ]
164 || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
165 || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
166 || io_limits->iops[BLOCK_IO_LIMIT_READ]
167 || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
168 || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
169 }
170
171 static void bdrv_io_limits_intercept(BlockDriverState *bs,
172 bool is_write, int nb_sectors)
173 {
174 int64_t wait_time = -1;
175
176 if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
177 qemu_co_queue_wait(&bs->throttled_reqs);
178 }
179
180 /* We aim to preserve each request's timing, FIFO style. The next
181 * throttled requests are not dequeued until the current request is
182 * allowed to be serviced. So if the current request still exceeds the
183 * limits, it is re-inserted at the head of the queue, and all requests
184 * behind it remain in the throttled_reqs queue.
185 */
186
187 while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
188 qemu_mod_timer(bs->block_timer,
189 wait_time + qemu_get_clock_ns(vm_clock));
190 qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
191 }
192
193 qemu_co_queue_next(&bs->throttled_reqs);
194 }
195
196 /* check if the path starts with "<protocol>:" */
197 static int path_has_protocol(const char *path)
198 {
199 #ifdef _WIN32
200 if (is_windows_drive(path) ||
201 is_windows_drive_prefix(path)) {
202 return 0;
203 }
204 #endif
205
206 return strchr(path, ':') != NULL;
207 }
208
209 int path_is_absolute(const char *path)
210 {
211 const char *p;
212 #ifdef _WIN32
213 /* specific case for names like: "\\.\d:" */
214 if (*path == '/' || *path == '\\')
215 return 1;
216 #endif
217 p = strchr(path, ':');
218 if (p)
219 p++;
220 else
221 p = path;
222 #ifdef _WIN32
223 return (*p == '/' || *p == '\\');
224 #else
225 return (*p == '/');
226 #endif
227 }
228
229 /* if filename is absolute, just copy it to dest. Otherwise, build a
230 path to it by treating it as relative to base_path. URLs are
231 supported. */
232 void path_combine(char *dest, int dest_size,
233 const char *base_path,
234 const char *filename)
235 {
236 const char *p, *p1;
237 int len;
238
239 if (dest_size <= 0)
240 return;
241 if (path_is_absolute(filename)) {
242 pstrcpy(dest, dest_size, filename);
243 } else {
244 p = strchr(base_path, ':');
245 if (p)
246 p++;
247 else
248 p = base_path;
249 p1 = strrchr(base_path, '/');
250 #ifdef _WIN32
251 {
252 const char *p2;
253 p2 = strrchr(base_path, '\\');
254 if (!p1 || p2 > p1)
255 p1 = p2;
256 }
257 #endif
258 if (p1)
259 p1++;
260 else
261 p1 = base_path;
262 if (p1 > p)
263 p = p1;
264 len = p - base_path;
265 if (len > dest_size - 1)
266 len = dest_size - 1;
267 memcpy(dest, base_path, len);
268 dest[len] = '\0';
269 pstrcat(dest, dest_size, filename);
270 }
271 }
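/* A minimal usage sketch (hypothetical paths, not from this file): how
 * path_combine() resolves a backing file name against its image's path.
 */
#if 0
char dest[PATH_MAX];

/* relative name: the directory part of base_path is prepended */
path_combine(dest, sizeof(dest), "/images/vm/disk.qcow2", "base.qcow2");
/* dest is now "/images/vm/base.qcow2" */

/* absolute name: copied through unchanged */
path_combine(dest, sizeof(dest), "/images/vm/disk.qcow2", "/backing/base.qcow2");
/* dest is now "/backing/base.qcow2" */
#endif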
272
273 void bdrv_register(BlockDriver *bdrv)
274 {
275 /* Block drivers without coroutine functions need emulation */
276 if (!bdrv->bdrv_co_readv) {
277 bdrv->bdrv_co_readv = bdrv_co_readv_em;
278 bdrv->bdrv_co_writev = bdrv_co_writev_em;
279
280 /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
281 * the block driver lacks aio we need to emulate that too.
282 */
283 if (!bdrv->bdrv_aio_readv) {
284 /* add AIO emulation layer */
285 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
286 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
287 }
288 }
289
290 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
291 }
292
293 /* create a new block device (by default it is empty) */
294 BlockDriverState *bdrv_new(const char *device_name)
295 {
296 BlockDriverState *bs;
297
298 bs = g_malloc0(sizeof(BlockDriverState));
299 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
300 if (device_name[0] != '\0') {
301 QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
302 }
303 bdrv_iostatus_disable(bs);
304 return bs;
305 }
306
307 BlockDriver *bdrv_find_format(const char *format_name)
308 {
309 BlockDriver *drv1;
310 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
311 if (!strcmp(drv1->format_name, format_name)) {
312 return drv1;
313 }
314 }
315 return NULL;
316 }
317
318 static int bdrv_is_whitelisted(BlockDriver *drv)
319 {
320 static const char *whitelist[] = {
321 CONFIG_BDRV_WHITELIST
322 };
323 const char **p;
324
325 if (!whitelist[0])
326 return 1; /* no whitelist, anything goes */
327
328 for (p = whitelist; *p; p++) {
329 if (!strcmp(drv->format_name, *p)) {
330 return 1;
331 }
332 }
333 return 0;
334 }
335
336 BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
337 {
338 BlockDriver *drv = bdrv_find_format(format_name);
339 return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
340 }
341
342 int bdrv_create(BlockDriver *drv, const char* filename,
343 QEMUOptionParameter *options)
344 {
345 if (!drv->bdrv_create)
346 return -ENOTSUP;
347
348 return drv->bdrv_create(filename, options);
349 }
350
351 int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
352 {
353 BlockDriver *drv;
354
355 drv = bdrv_find_protocol(filename);
356 if (drv == NULL) {
357 return -ENOENT;
358 }
359
360 return bdrv_create(drv, filename, options);
361 }
362
363 #ifdef _WIN32
364 void get_tmp_filename(char *filename, int size)
365 {
366 char temp_dir[MAX_PATH];
367
368 GetTempPath(MAX_PATH, temp_dir);
369 GetTempFileName(temp_dir, "qem", 0, filename);
370 }
371 #else
372 void get_tmp_filename(char *filename, int size)
373 {
374 int fd;
375 const char *tmpdir;
376 /* XXX: race condition possible */
377 tmpdir = getenv("TMPDIR");
378 if (!tmpdir)
379 tmpdir = "/tmp";
380 snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
381 fd = mkstemp(filename);
382 close(fd);
383 }
384 #endif
385
386 /*
387 * Detect host devices. By convention, /dev/cdrom[N] is always
388 * recognized as a host CDROM.
389 */
390 static BlockDriver *find_hdev_driver(const char *filename)
391 {
392 int score_max = 0, score;
393 BlockDriver *drv = NULL, *d;
394
395 QLIST_FOREACH(d, &bdrv_drivers, list) {
396 if (d->bdrv_probe_device) {
397 score = d->bdrv_probe_device(filename);
398 if (score > score_max) {
399 score_max = score;
400 drv = d;
401 }
402 }
403 }
404
405 return drv;
406 }
407
408 BlockDriver *bdrv_find_protocol(const char *filename)
409 {
410 BlockDriver *drv1;
411 char protocol[128];
412 int len;
413 const char *p;
414
415 /* TODO Drivers without bdrv_file_open must be specified explicitly */
416
417 /*
418 * XXX(hch): we really should not let host device detection
419 * override an explicit protocol specification, but moving this
420 * later breaks access to device names with colons in them.
421 * Thanks to the brain-dead persistent naming schemes on udev-
422 * based Linux systems those actually are quite common.
423 */
424 drv1 = find_hdev_driver(filename);
425 if (drv1) {
426 return drv1;
427 }
428
429 if (!path_has_protocol(filename)) {
430 return bdrv_find_format("file");
431 }
432 p = strchr(filename, ':');
433 assert(p != NULL);
434 len = p - filename;
435 if (len > sizeof(protocol) - 1)
436 len = sizeof(protocol) - 1;
437 memcpy(protocol, filename, len);
438 protocol[len] = '\0';
439 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
440 if (drv1->protocol_name &&
441 !strcmp(drv1->protocol_name, protocol)) {
442 return drv1;
443 }
444 }
445 return NULL;
446 }
447
448 static int find_image_format(const char *filename, BlockDriver **pdrv)
449 {
450 int ret, score, score_max;
451 BlockDriver *drv1, *drv;
452 uint8_t buf[2048];
453 BlockDriverState *bs;
454
455 ret = bdrv_file_open(&bs, filename, 0);
456 if (ret < 0) {
457 *pdrv = NULL;
458 return ret;
459 }
460
461 /* Return the raw BlockDriver * for scsi-generic devices or empty drives */
462 if (bs->sg || !bdrv_is_inserted(bs)) {
463 bdrv_delete(bs);
464 drv = bdrv_find_format("raw");
465 if (!drv) {
466 ret = -ENOENT;
467 }
468 *pdrv = drv;
469 return ret;
470 }
471
472 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
473 bdrv_delete(bs);
474 if (ret < 0) {
475 *pdrv = NULL;
476 return ret;
477 }
478
479 score_max = 0;
480 drv = NULL;
481 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
482 if (drv1->bdrv_probe) {
483 score = drv1->bdrv_probe(buf, ret, filename);
484 if (score > score_max) {
485 score_max = score;
486 drv = drv1;
487 }
488 }
489 }
490 if (!drv) {
491 ret = -ENOENT;
492 }
493 *pdrv = drv;
494 return ret;
495 }
496
497 /**
498 * Set the current 'total_sectors' value
499 */
500 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
501 {
502 BlockDriver *drv = bs->drv;
503
504 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
505 if (bs->sg)
506 return 0;
507
508 /* query actual device if possible, otherwise just trust the hint */
509 if (drv->bdrv_getlength) {
510 int64_t length = drv->bdrv_getlength(bs);
511 if (length < 0) {
512 return length;
513 }
514 hint = length >> BDRV_SECTOR_BITS;
515 }
516
517 bs->total_sectors = hint;
518 return 0;
519 }
520
521 /**
522 * Set open flags for a given cache mode
523 *
524 * Return 0 on success, -1 if the cache mode was invalid.
525 */
526 int bdrv_parse_cache_flags(const char *mode, int *flags)
527 {
528 *flags &= ~BDRV_O_CACHE_MASK;
529
530 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
531 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
532 } else if (!strcmp(mode, "directsync")) {
533 *flags |= BDRV_O_NOCACHE;
534 } else if (!strcmp(mode, "writeback")) {
535 *flags |= BDRV_O_CACHE_WB;
536 } else if (!strcmp(mode, "unsafe")) {
537 *flags |= BDRV_O_CACHE_WB;
538 *flags |= BDRV_O_NO_FLUSH;
539 } else if (!strcmp(mode, "writethrough")) {
540 /* this is the default */
541 } else {
542 return -1;
543 }
544
545 return 0;
546 }
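/* A minimal sketch of a caller (hypothetical, not from this file): the
 * "none" mode bypasses the host page cache (BDRV_O_NOCACHE) but keeps
 * writeback semantics (BDRV_O_CACHE_WB).
 */
#if 0
int flags = 0;
if (bdrv_parse_cache_flags("none", &flags) < 0) {
    /* invalid cache mode string */
}
/* flags == (BDRV_O_NOCACHE | BDRV_O_CACHE_WB) */
#endif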
547
548 /**
549 * The copy-on-read flag is actually a reference count so multiple users may
550 * use the feature without worrying about clobbering its previous state.
551 * Copy-on-read stays enabled until all users have disabled it.
552 */
553 void bdrv_enable_copy_on_read(BlockDriverState *bs)
554 {
555 bs->copy_on_read++;
556 }
557
558 void bdrv_disable_copy_on_read(BlockDriverState *bs)
559 {
560 assert(bs->copy_on_read > 0);
561 bs->copy_on_read--;
562 }
563
564 /*
565 * Common part for opening disk images and files
566 */
567 static int bdrv_open_common(BlockDriverState *bs, const char *filename,
568 int flags, BlockDriver *drv)
569 {
570 int ret, open_flags;
571
572 assert(drv != NULL);
573
574 trace_bdrv_open_common(bs, filename, flags, drv->format_name);
575
576 bs->file = NULL;
577 bs->total_sectors = 0;
578 bs->encrypted = 0;
579 bs->valid_key = 0;
580 bs->sg = 0;
581 bs->open_flags = flags;
582 bs->growable = 0;
583 bs->buffer_alignment = 512;
584
585 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
586 if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
587 bdrv_enable_copy_on_read(bs);
588 }
589
590 pstrcpy(bs->filename, sizeof(bs->filename), filename);
591 bs->backing_file[0] = '\0';
592
593 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
594 return -ENOTSUP;
595 }
596
597 bs->drv = drv;
598 bs->opaque = g_malloc0(drv->instance_size);
599
600 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
601
602 /*
603 * Clear flags that are internal to the block layer before opening the
604 * image.
605 */
606 open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
607
608 /*
609 * Snapshots should be writable.
610 */
611 if (bs->is_temporary) {
612 open_flags |= BDRV_O_RDWR;
613 }
614
615 bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
616
617 /* Open the image, either directly or using a protocol */
618 if (drv->bdrv_file_open) {
619 ret = drv->bdrv_file_open(bs, filename, open_flags);
620 } else {
621 ret = bdrv_file_open(&bs->file, filename, open_flags);
622 if (ret >= 0) {
623 ret = drv->bdrv_open(bs, open_flags);
624 }
625 }
626
627 if (ret < 0) {
628 goto free_and_fail;
629 }
630
631 ret = refresh_total_sectors(bs, bs->total_sectors);
632 if (ret < 0) {
633 goto free_and_fail;
634 }
635
636 #ifndef _WIN32
637 if (bs->is_temporary) {
638 unlink(filename);
639 }
640 #endif
641 return 0;
642
643 free_and_fail:
644 if (bs->file) {
645 bdrv_delete(bs->file);
646 bs->file = NULL;
647 }
648 g_free(bs->opaque);
649 bs->opaque = NULL;
650 bs->drv = NULL;
651 return ret;
652 }
653
654 /*
655 * Opens a file using a protocol (file, host_device, nbd, ...)
656 */
657 int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
658 {
659 BlockDriverState *bs;
660 BlockDriver *drv;
661 int ret;
662
663 drv = bdrv_find_protocol(filename);
664 if (!drv) {
665 return -ENOENT;
666 }
667
668 bs = bdrv_new("");
669 ret = bdrv_open_common(bs, filename, flags, drv);
670 if (ret < 0) {
671 bdrv_delete(bs);
672 return ret;
673 }
674 bs->growable = 1;
675 *pbs = bs;
676 return 0;
677 }
678
679 /*
680 * Opens a disk image (raw, qcow2, vmdk, ...)
681 */
682 int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
683 BlockDriver *drv)
684 {
685 int ret;
686 char tmp_filename[PATH_MAX];
687
688 if (flags & BDRV_O_SNAPSHOT) {
689 BlockDriverState *bs1;
690 int64_t total_size;
691 int is_protocol = 0;
692 BlockDriver *bdrv_qcow2;
693 QEMUOptionParameter *options;
694 char backing_filename[PATH_MAX];
695
696 /* if snapshot, we create a temporary backing file and open it
697 instead of opening 'filename' directly */
698
699 /* if there is a backing file, use it */
700 bs1 = bdrv_new("");
701 ret = bdrv_open(bs1, filename, 0, drv);
702 if (ret < 0) {
703 bdrv_delete(bs1);
704 return ret;
705 }
706 total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
707
708 if (bs1->drv && bs1->drv->protocol_name)
709 is_protocol = 1;
710
711 bdrv_delete(bs1);
712
713 get_tmp_filename(tmp_filename, sizeof(tmp_filename));
714
715 /* Real path is meaningless for protocols */
716 if (is_protocol)
717 snprintf(backing_filename, sizeof(backing_filename),
718 "%s", filename);
719 else if (!realpath(filename, backing_filename))
720 return -errno;
721
722 bdrv_qcow2 = bdrv_find_format("qcow2");
723 options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
724
725 set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
726 set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
727 if (drv) {
728 set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
729 drv->format_name);
730 }
731
732 ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
733 free_option_parameters(options);
734 if (ret < 0) {
735 return ret;
736 }
737
738 filename = tmp_filename;
739 drv = bdrv_qcow2;
740 bs->is_temporary = 1;
741 }
742
743 /* Find the right image format driver */
744 if (!drv) {
745 ret = find_image_format(filename, &drv);
746 }
747
748 if (!drv) {
749 goto unlink_and_fail;
750 }
751
752 /* Open the image */
753 ret = bdrv_open_common(bs, filename, flags, drv);
754 if (ret < 0) {
755 goto unlink_and_fail;
756 }
757
758 /* If there is a backing file, use it */
759 if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
760 char backing_filename[PATH_MAX];
761 int back_flags;
762 BlockDriver *back_drv = NULL;
763
764 bs->backing_hd = bdrv_new("");
765
766 if (path_has_protocol(bs->backing_file)) {
767 pstrcpy(backing_filename, sizeof(backing_filename),
768 bs->backing_file);
769 } else {
770 path_combine(backing_filename, sizeof(backing_filename),
771 filename, bs->backing_file);
772 }
773
774 if (bs->backing_format[0] != '\0') {
775 back_drv = bdrv_find_format(bs->backing_format);
776 }
777
778 /* backing files are always opened read-only */
779 back_flags =
780 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
781
782 ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
783 if (ret < 0) {
784 bdrv_close(bs);
785 return ret;
786 }
787 if (bs->is_temporary) {
788 bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
789 } else {
790 /* base image inherits from "parent" */
791 bs->backing_hd->keep_read_only = bs->keep_read_only;
792 }
793 }
794
795 if (!bdrv_key_required(bs)) {
796 bdrv_dev_change_media_cb(bs, true);
797 }
798
799 /* throttling disk I/O limits */
800 if (bs->io_limits_enabled) {
801 bdrv_io_limits_enable(bs);
802 }
803
804 return 0;
805
806 unlink_and_fail:
807 if (bs->is_temporary) {
808 unlink(filename);
809 }
810 return ret;
811 }
812
813 void bdrv_close(BlockDriverState *bs)
814 {
815 if (bs->drv) {
816 if (bs->job) {
817 block_job_cancel_sync(bs->job);
818 }
819 bdrv_drain_all();
820
821 if (bs == bs_snapshots) {
822 bs_snapshots = NULL;
823 }
824 if (bs->backing_hd) {
825 bdrv_delete(bs->backing_hd);
826 bs->backing_hd = NULL;
827 }
828 bs->drv->bdrv_close(bs);
829 g_free(bs->opaque);
830 #ifdef _WIN32
831 if (bs->is_temporary) {
832 unlink(bs->filename);
833 }
834 #endif
835 bs->opaque = NULL;
836 bs->drv = NULL;
837 bs->copy_on_read = 0;
838
839 if (bs->file != NULL) {
840 bdrv_close(bs->file);
841 }
842
843 bdrv_dev_change_media_cb(bs, false);
844 }
845
846 /* throttling disk I/O limits */
847 if (bs->io_limits_enabled) {
848 bdrv_io_limits_disable(bs);
849 }
850 }
851
852 void bdrv_close_all(void)
853 {
854 BlockDriverState *bs;
855
856 QTAILQ_FOREACH(bs, &bdrv_states, list) {
857 bdrv_close(bs);
858 }
859 }
860
861 /*
862 * Wait for pending requests to complete across all BlockDriverStates
863 *
864 * This function does not flush data to disk, use bdrv_flush_all() for that
865 * after calling this function.
866 */
867 void bdrv_drain_all(void)
868 {
869 BlockDriverState *bs;
870
871 qemu_aio_flush();
872
873 /* If requests are still pending there is a bug somewhere */
874 QTAILQ_FOREACH(bs, &bdrv_states, list) {
875 assert(QLIST_EMPTY(&bs->tracked_requests));
876 assert(qemu_co_queue_empty(&bs->throttled_reqs));
877 }
878 }
879
880 /* make a BlockDriverState anonymous by removing it from the bdrv_states
881 list. Also, empty device_name so a second removal becomes a no-op */
882 void bdrv_make_anon(BlockDriverState *bs)
883 {
884 if (bs->device_name[0] != '\0') {
885 QTAILQ_REMOVE(&bdrv_states, bs, list);
886 }
887 bs->device_name[0] = '\0';
888 }
889
890 /*
891 * Add new bs contents at the top of an image chain while the chain is
892 * live, keeping the required fields on the top layer.
893 *
894 * This will modify the BlockDriverState fields, and swap contents
895 * between bs_new and bs_top. Both bs_new and bs_top are modified.
896 *
897 * bs_new is required to be anonymous.
898 *
899 * This function does not create any image files.
900 */
901 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
902 {
903 BlockDriverState tmp;
904
905 /* bs_new must be anonymous */
906 assert(bs_new->device_name[0] == '\0');
907
908 tmp = *bs_new;
909
910 /* there are some fields that need to stay on the top layer: */
911
912 /* dev info */
913 tmp.dev_ops = bs_top->dev_ops;
914 tmp.dev_opaque = bs_top->dev_opaque;
915 tmp.dev = bs_top->dev;
916 tmp.buffer_alignment = bs_top->buffer_alignment;
917 tmp.copy_on_read = bs_top->copy_on_read;
918
919 /* i/o timing parameters */
920 tmp.slice_time = bs_top->slice_time;
921 tmp.slice_start = bs_top->slice_start;
922 tmp.slice_end = bs_top->slice_end;
923 tmp.io_limits = bs_top->io_limits;
924 tmp.io_base = bs_top->io_base;
925 tmp.throttled_reqs = bs_top->throttled_reqs;
926 tmp.block_timer = bs_top->block_timer;
927 tmp.io_limits_enabled = bs_top->io_limits_enabled;
928
929 /* geometry */
930 tmp.cyls = bs_top->cyls;
931 tmp.heads = bs_top->heads;
932 tmp.secs = bs_top->secs;
933 tmp.translation = bs_top->translation;
934
935 /* r/w error */
936 tmp.on_read_error = bs_top->on_read_error;
937 tmp.on_write_error = bs_top->on_write_error;
938
939 /* i/o status */
940 tmp.iostatus_enabled = bs_top->iostatus_enabled;
941 tmp.iostatus = bs_top->iostatus;
942
943 /* keep the same entry in bdrv_states */
944 pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
945 tmp.list = bs_top->list;
946
947 /* The contents of 'tmp' will become bs_top, as we are
948 * swapping bs_new and bs_top contents. */
949 tmp.backing_hd = bs_new;
950 pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);
951 bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format));
952
953 /* swap contents of the fixed new bs and the current top */
954 *bs_new = *bs_top;
955 *bs_top = tmp;
956
957 /* device_name[] was carried over from the old bs_top. bs_new
958 * shouldn't be in bdrv_states, so we need to make device_name[]
959 * reflect the anonymity of bs_new
960 */
961 bs_new->device_name[0] = '\0';
962
963 /* clear the copied fields in the new backing file */
964 bdrv_detach_dev(bs_new, bs_new->dev);
965
966 qemu_co_queue_init(&bs_new->throttled_reqs);
967 memset(&bs_new->io_base, 0, sizeof(bs_new->io_base));
968 memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
969 bdrv_iostatus_disable(bs_new);
970
971 /* we don't use bdrv_io_limits_disable() for this, because we don't want
972 * to affect or delete the block_timer, as it has been moved to bs_top */
973 bs_new->io_limits_enabled = false;
974 bs_new->block_timer = NULL;
975 bs_new->slice_time = 0;
976 bs_new->slice_start = 0;
977 bs_new->slice_end = 0;
978 }
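/* Sketch of the resulting chain (names hypothetical). Before the call the
 * caller holds bs_top = [T] and an anonymous bs_new = [N]:
 *
 *   device -> bs_top [T] -> backing ...
 *
 * After bdrv_append() the same bs_top pointer carries N's contents and the
 * old top has been demoted to its backing file:
 *
 *   device -> bs_top [N] -> bs_new [T] -> backing ...
 */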
979
980 void bdrv_delete(BlockDriverState *bs)
981 {
982 assert(!bs->dev);
983 assert(!bs->job);
984 assert(!bs->in_use);
985
986 /* remove from list, if necessary */
987 bdrv_make_anon(bs);
988
989 bdrv_close(bs);
990 if (bs->file != NULL) {
991 bdrv_delete(bs->file);
992 }
993
994 assert(bs != bs_snapshots);
995 g_free(bs);
996 }
997
998 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
999 /* TODO change to DeviceState *dev when all users are qdevified */
1000 {
1001 if (bs->dev) {
1002 return -EBUSY;
1003 }
1004 bs->dev = dev;
1005 bdrv_iostatus_reset(bs);
1006 return 0;
1007 }
1008
1009 /* TODO qdevified devices don't use this, remove when devices are qdevified */
1010 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1011 {
1012 if (bdrv_attach_dev(bs, dev) < 0) {
1013 abort();
1014 }
1015 }
1016
1017 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1018 /* TODO change to DeviceState *dev when all users are qdevified */
1019 {
1020 assert(bs->dev == dev);
1021 bs->dev = NULL;
1022 bs->dev_ops = NULL;
1023 bs->dev_opaque = NULL;
1024 bs->buffer_alignment = 512;
1025 }
1026
1027 /* TODO change to return DeviceState * when all users are qdevified */
1028 void *bdrv_get_attached_dev(BlockDriverState *bs)
1029 {
1030 return bs->dev;
1031 }
1032
1033 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1034 void *opaque)
1035 {
1036 bs->dev_ops = ops;
1037 bs->dev_opaque = opaque;
1038 if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
1039 bs_snapshots = NULL;
1040 }
1041 }
1042
1043 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1044 BlockQMPEventAction action, int is_read)
1045 {
1046 QObject *data;
1047 const char *action_str;
1048
1049 switch (action) {
1050 case BDRV_ACTION_REPORT:
1051 action_str = "report";
1052 break;
1053 case BDRV_ACTION_IGNORE:
1054 action_str = "ignore";
1055 break;
1056 case BDRV_ACTION_STOP:
1057 action_str = "stop";
1058 break;
1059 default:
1060 abort();
1061 }
1062
1063 data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1064 bdrv->device_name,
1065 action_str,
1066 is_read ? "read" : "write");
1067 monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1068
1069 qobject_decref(data);
1070 }
1071
1072 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
1073 {
1074 QObject *data;
1075
1076 data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
1077 bdrv_get_device_name(bs), ejected);
1078 monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
1079
1080 qobject_decref(data);
1081 }
1082
1083 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
1084 {
1085 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
1086 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
1087 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
1088 if (tray_was_closed) {
1089 /* tray open */
1090 bdrv_emit_qmp_eject_event(bs, true);
1091 }
1092 if (load) {
1093 /* tray close */
1094 bdrv_emit_qmp_eject_event(bs, false);
1095 }
1096 }
1097 }
1098
1099 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1100 {
1101 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1102 }
1103
1104 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1105 {
1106 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
1107 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
1108 }
1109 }
1110
1111 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
1112 {
1113 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
1114 return bs->dev_ops->is_tray_open(bs->dev_opaque);
1115 }
1116 return false;
1117 }
1118
1119 static void bdrv_dev_resize_cb(BlockDriverState *bs)
1120 {
1121 if (bs->dev_ops && bs->dev_ops->resize_cb) {
1122 bs->dev_ops->resize_cb(bs->dev_opaque);
1123 }
1124 }
1125
1126 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1127 {
1128 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1129 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1130 }
1131 return false;
1132 }
1133
1134 /*
1135 * Run consistency checks on an image
1136 *
1137 * Returns 0 if the check could be completed (it doesn't mean that the image is
1138 * free of errors) or -errno when an internal error occurred. The results of the
1139 * check are stored in res.
1140 */
1141 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
1142 {
1143 if (bs->drv->bdrv_check == NULL) {
1144 return -ENOTSUP;
1145 }
1146
1147 memset(res, 0, sizeof(*res));
1148 return bs->drv->bdrv_check(bs, res);
1149 }
1150
1151 #define COMMIT_BUF_SECTORS 2048
1152
1153 /* commit the COW file into its backing image */
1154 int bdrv_commit(BlockDriverState *bs)
1155 {
1156 BlockDriver *drv = bs->drv;
1157 BlockDriver *backing_drv;
1158 int64_t sector, total_sectors;
1159 int n, ro, open_flags;
1160 int ret = 0, rw_ret = 0;
1161 uint8_t *buf;
1162 char filename[1024];
1163 BlockDriverState *bs_rw, *bs_ro;
1164
1165 if (!drv)
1166 return -ENOMEDIUM;
1167
1168 if (!bs->backing_hd) {
1169 return -ENOTSUP;
1170 }
1171
1172 if (bs->backing_hd->keep_read_only) {
1173 return -EACCES;
1174 }
1175
1176 if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1177 return -EBUSY;
1178 }
1179
1180 backing_drv = bs->backing_hd->drv;
1181 ro = bs->backing_hd->read_only;
1182 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
1183 open_flags = bs->backing_hd->open_flags;
1184
1185 if (ro) {
1186 /* re-open as RW */
1187 bdrv_delete(bs->backing_hd);
1188 bs->backing_hd = NULL;
1189 bs_rw = bdrv_new("");
1190 rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
1191 backing_drv);
1192 if (rw_ret < 0) {
1193 bdrv_delete(bs_rw);
1194 /* try to re-open read-only */
1195 bs_ro = bdrv_new("");
1196 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1197 backing_drv);
1198 if (ret < 0) {
1199 bdrv_delete(bs_ro);
1200 /* drive not functional anymore */
1201 bs->drv = NULL;
1202 return ret;
1203 }
1204 bs->backing_hd = bs_ro;
1205 return rw_ret;
1206 }
1207 bs->backing_hd = bs_rw;
1208 }
1209
1210 total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1211 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1212
1213 for (sector = 0; sector < total_sectors; sector += n) {
1214 if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
1215
1216 if (bdrv_read(bs, sector, buf, n) != 0) {
1217 ret = -EIO;
1218 goto ro_cleanup;
1219 }
1220
1221 if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1222 ret = -EIO;
1223 goto ro_cleanup;
1224 }
1225 }
1226 }
1227
1228 if (drv->bdrv_make_empty) {
1229 ret = drv->bdrv_make_empty(bs);
1230 bdrv_flush(bs);
1231 }
1232
1233 /*
1234 * Make sure all data we wrote to the backing device is actually
1235 * stable on disk.
1236 */
1237 if (bs->backing_hd)
1238 bdrv_flush(bs->backing_hd);
1239
1240 ro_cleanup:
1241 g_free(buf);
1242
1243 if (ro) {
1244 /* re-open as RO */
1245 bdrv_delete(bs->backing_hd);
1246 bs->backing_hd = NULL;
1247 bs_ro = bdrv_new("");
1248 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1249 backing_drv);
1250 if (ret < 0) {
1251 bdrv_delete(bs_ro);
1252 /* drive not functional anymore */
1253 bs->drv = NULL;
1254 return ret;
1255 }
1256 bs->backing_hd = bs_ro;
1257 bs->backing_hd->keep_read_only = 0;
1258 }
1259
1260 return ret;
1261 }
1262
1263 int bdrv_commit_all(void)
1264 {
1265 BlockDriverState *bs;
1266
1267 QTAILQ_FOREACH(bs, &bdrv_states, list) {
1268 int ret = bdrv_commit(bs);
1269 if (ret < 0) {
1270 return ret;
1271 }
1272 }
1273 return 0;
1274 }
1275
1276 struct BdrvTrackedRequest {
1277 BlockDriverState *bs;
1278 int64_t sector_num;
1279 int nb_sectors;
1280 bool is_write;
1281 QLIST_ENTRY(BdrvTrackedRequest) list;
1282 Coroutine *co; /* owner, used for deadlock detection */
1283 CoQueue wait_queue; /* coroutines blocked on this request */
1284 };
1285
1286 /**
1287 * Remove an active request from the tracked requests list
1288 *
1289 * This function should be called when a tracked request is completing.
1290 */
1291 static void tracked_request_end(BdrvTrackedRequest *req)
1292 {
1293 QLIST_REMOVE(req, list);
1294 qemu_co_queue_restart_all(&req->wait_queue);
1295 }
1296
1297 /**
1298 * Add an active request to the tracked requests list
1299 */
1300 static void tracked_request_begin(BdrvTrackedRequest *req,
1301 BlockDriverState *bs,
1302 int64_t sector_num,
1303 int nb_sectors, bool is_write)
1304 {
1305 *req = (BdrvTrackedRequest){
1306 .bs = bs,
1307 .sector_num = sector_num,
1308 .nb_sectors = nb_sectors,
1309 .is_write = is_write,
1310 .co = qemu_coroutine_self(),
1311 };
1312
1313 qemu_co_queue_init(&req->wait_queue);
1314
1315 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1316 }
1317
1318 /**
1319 * Round a region to cluster boundaries
1320 */
1321 static void round_to_clusters(BlockDriverState *bs,
1322 int64_t sector_num, int nb_sectors,
1323 int64_t *cluster_sector_num,
1324 int *cluster_nb_sectors)
1325 {
1326 BlockDriverInfo bdi;
1327
1328 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1329 *cluster_sector_num = sector_num;
1330 *cluster_nb_sectors = nb_sectors;
1331 } else {
1332 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1333 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1334 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1335 nb_sectors, c);
1336 }
1337 }
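/* Worked example (hypothetical numbers): with a 64 KiB cluster size,
 * c = 65536 / BDRV_SECTOR_SIZE = 128. A request at sector_num = 130 with
 * nb_sectors = 10 is widened to cluster_sector_num = 128 and
 * cluster_nb_sectors = QEMU_ALIGN_UP(130 - 128 + 10, 128) = 128, i.e.
 * exactly the one cluster containing the request.
 */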
1338
1339 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
1340 int64_t sector_num, int nb_sectors) {
1341 /* aaaa bbbb */
1342 if (sector_num >= req->sector_num + req->nb_sectors) {
1343 return false;
1344 }
1345 /* bbbb aaaa */
1346 if (req->sector_num >= sector_num + nb_sectors) {
1347 return false;
1348 }
1349 return true;
1350 }
1351
1352 static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
1353 int64_t sector_num, int nb_sectors)
1354 {
1355 BdrvTrackedRequest *req;
1356 int64_t cluster_sector_num;
1357 int cluster_nb_sectors;
1358 bool retry;
1359
1360 /* If we touch the same cluster it counts as an overlap. This guarantees
1361 * that allocating writes will be serialized and not race with each other
1362 * for the same cluster. For example, in copy-on-read it ensures that the
1363 * CoR read and write operations are atomic and guest writes cannot
1364 * interleave between them.
1365 */
1366 round_to_clusters(bs, sector_num, nb_sectors,
1367 &cluster_sector_num, &cluster_nb_sectors);
1368
1369 do {
1370 retry = false;
1371 QLIST_FOREACH(req, &bs->tracked_requests, list) {
1372 if (tracked_request_overlaps(req, cluster_sector_num,
1373 cluster_nb_sectors)) {
1374 /* Hitting this means there was a reentrant request, for
1375 * example, a block driver issuing nested requests. This must
1376 * never happen since it means deadlock.
1377 */
1378 assert(qemu_coroutine_self() != req->co);
1379
1380 qemu_co_queue_wait(&req->wait_queue);
1381 retry = true;
1382 break;
1383 }
1384 }
1385 } while (retry);
1386 }
1387
1388 /*
1389 * Return values:
1390 * 0 - success
1391 * -EINVAL - backing format specified, but no file
1392 * -ENOSPC - can't update the backing file because no space is left in the
1393 * image file header
1394 * -ENOTSUP - format driver doesn't support changing the backing file
1395 */
1396 int bdrv_change_backing_file(BlockDriverState *bs,
1397 const char *backing_file, const char *backing_fmt)
1398 {
1399 BlockDriver *drv = bs->drv;
1400
1401 if (drv->bdrv_change_backing_file != NULL) {
1402 return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1403 } else {
1404 return -ENOTSUP;
1405 }
1406 }
1407
1408 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1409 size_t size)
1410 {
1411 int64_t len;
1412
1413 if (!bdrv_is_inserted(bs))
1414 return -ENOMEDIUM;
1415
1416 if (bs->growable)
1417 return 0;
1418
1419 len = bdrv_getlength(bs);
1420
1421 if (offset < 0)
1422 return -EIO;
1423
1424 if ((offset > len) || (len - offset < size))
1425 return -EIO;
1426
1427 return 0;
1428 }
1429
1430 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1431 int nb_sectors)
1432 {
1433 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1434 nb_sectors * BDRV_SECTOR_SIZE);
1435 }
1436
1437 typedef struct RwCo {
1438 BlockDriverState *bs;
1439 int64_t sector_num;
1440 int nb_sectors;
1441 QEMUIOVector *qiov;
1442 bool is_write;
1443 int ret;
1444 } RwCo;
1445
1446 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
1447 {
1448 RwCo *rwco = opaque;
1449
1450 if (!rwco->is_write) {
1451 rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
1452 rwco->nb_sectors, rwco->qiov, 0);
1453 } else {
1454 rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
1455 rwco->nb_sectors, rwco->qiov, 0);
1456 }
1457 }
1458
1459 /*
1460 * Process a synchronous request using coroutines
1461 */
1462 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1463 int nb_sectors, bool is_write)
1464 {
1465 QEMUIOVector qiov;
1466 struct iovec iov = {
1467 .iov_base = (void *)buf,
1468 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1469 };
1470 Coroutine *co;
1471 RwCo rwco = {
1472 .bs = bs,
1473 .sector_num = sector_num,
1474 .nb_sectors = nb_sectors,
1475 .qiov = &qiov,
1476 .is_write = is_write,
1477 .ret = NOT_DONE,
1478 };
1479
1480 qemu_iovec_init_external(&qiov, &iov, 1);
1481
1482 /**
1483 * In a sync call context the vcpu is blocked, so this throttling timer
1484 * will never fire; the I/O throttling function therefore has to be
1485 * disabled here if it has been enabled.
1486 */
1487 if (bs->io_limits_enabled) {
1488 fprintf(stderr, "Disabling I/O throttling on '%s' due "
1489 "to synchronous I/O.\n", bdrv_get_device_name(bs));
1490 bdrv_io_limits_disable(bs);
1491 }
1492
1493 if (qemu_in_coroutine()) {
1494 /* Fast-path if already in coroutine context */
1495 bdrv_rw_co_entry(&rwco);
1496 } else {
1497 co = qemu_coroutine_create(bdrv_rw_co_entry);
1498 qemu_coroutine_enter(co, &rwco);
1499 while (rwco.ret == NOT_DONE) {
1500 qemu_aio_wait();
1501 }
1502 }
1503 return rwco.ret;
1504 }
1505
1506 /* return < 0 if error. See bdrv_write() for the return codes */
1507 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1508 uint8_t *buf, int nb_sectors)
1509 {
1510 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
1511 }
1512
1513 static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
1514 int nb_sectors, int dirty)
1515 {
1516 int64_t start, end;
1517 unsigned long val, idx, bit;
1518
1519 start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
1520 end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
1521
1522 for (; start <= end; start++) {
1523 idx = start / (sizeof(unsigned long) * 8);
1524 bit = start % (sizeof(unsigned long) * 8);
1525 val = bs->dirty_bitmap[idx];
1526 if (dirty) {
1527 if (!(val & (1UL << bit))) {
1528 bs->dirty_count++;
1529 val |= 1UL << bit;
1530 }
1531 } else {
1532 if (val & (1UL << bit)) {
1533 bs->dirty_count--;
1534 val &= ~(1UL << bit);
1535 }
1536 }
1537 bs->dirty_bitmap[idx] = val;
1538 }
1539 }
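/* Worked example (assuming 64-bit longs and a 1 MiB dirty chunk, i.e.
 * BDRV_SECTORS_PER_DIRTY_CHUNK = 2048): sector_num = 5000 falls in dirty
 * chunk 5000 / 2048 = 2, tracked in word idx = 2 / 64 = 0 at
 * bit = 2 % 64 = 2 of bs->dirty_bitmap.
 */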
1540
1541 /* Return < 0 if error. Important errors are:
1542 -EIO generic I/O error (may happen for all errors)
1543 -ENOMEDIUM No media inserted.
1544 -EINVAL Invalid sector number or nb_sectors
1545 -EACCES Trying to write a read-only device
1546 */
1547 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
1548 const uint8_t *buf, int nb_sectors)
1549 {
1550 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
1551 }
1552
1553 int bdrv_pread(BlockDriverState *bs, int64_t offset,
1554 void *buf, int count1)
1555 {
1556 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1557 int len, nb_sectors, count;
1558 int64_t sector_num;
1559 int ret;
1560
1561 count = count1;
1562 /* first read to align to sector start */
1563 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1564 if (len > count)
1565 len = count;
1566 sector_num = offset >> BDRV_SECTOR_BITS;
1567 if (len > 0) {
1568 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1569 return ret;
1570 memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
1571 count -= len;
1572 if (count == 0)
1573 return count1;
1574 sector_num++;
1575 buf += len;
1576 }
1577
1578 /* read the sectors "in place" */
1579 nb_sectors = count >> BDRV_SECTOR_BITS;
1580 if (nb_sectors > 0) {
1581 if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1582 return ret;
1583 sector_num += nb_sectors;
1584 len = nb_sectors << BDRV_SECTOR_BITS;
1585 buf += len;
1586 count -= len;
1587 }
1588
1589 /* add data from the last sector */
1590 if (count > 0) {
1591 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1592 return ret;
1593 memcpy(buf, tmp_buf, count);
1594 }
1595 return count1;
1596 }
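/* Worked example: bdrv_pread(bs, 700, buf, 2000) with 512-byte sectors.
 * Head: len = (512 - 700) & 511 = 324 bytes are copied out of sector 1
 * (bytes 700..1023). Body: the remaining 1676 bytes yield nb_sectors =
 * 1676 >> 9 = 3 whole sectors (1536 bytes) read straight into buf.
 * Tail: the last 140 bytes come from one more single-sector read.
 */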
1597
1598 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1599 const void *buf, int count1)
1600 {
1601 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1602 int len, nb_sectors, count;
1603 int64_t sector_num;
1604 int ret;
1605
1606 count = count1;
1607 /* first write to align to sector start */
1608 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1609 if (len > count)
1610 len = count;
1611 sector_num = offset >> BDRV_SECTOR_BITS;
1612 if (len > 0) {
1613 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1614 return ret;
1615 memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
1616 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1617 return ret;
1618 count -= len;
1619 if (count == 0)
1620 return count1;
1621 sector_num++;
1622 buf += len;
1623 }
1624
1625 /* write the sectors "in place" */
1626 nb_sectors = count >> BDRV_SECTOR_BITS;
1627 if (nb_sectors > 0) {
1628 if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1629 return ret;
1630 sector_num += nb_sectors;
1631 len = nb_sectors << BDRV_SECTOR_BITS;
1632 buf += len;
1633 count -= len;
1634 }
1635
1636 /* add data from the last sector */
1637 if (count > 0) {
1638 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1639 return ret;
1640 memcpy(tmp_buf, buf, count);
1641 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1642 return ret;
1643 }
1644 return count1;
1645 }
1646
1647 /*
1648 * Writes to the file and ensures that no writes are reordered across this
1649 * request (acts as a barrier)
1650 *
1651 * Returns 0 on success, -errno in error cases.
1652 */
1653 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1654 const void *buf, int count)
1655 {
1656 int ret;
1657
1658 ret = bdrv_pwrite(bs, offset, buf, count);
1659 if (ret < 0) {
1660 return ret;
1661 }
1662
1663 /* No flush needed for cache modes that use O_DSYNC */
1664 if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
1665 bdrv_flush(bs);
1666 }
1667
1668 return 0;
1669 }
1670
1671 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
1672 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1673 {
1674 /* Perform I/O through a temporary buffer so that users who scribble over
1675 * their read buffer while the operation is in progress do not end up
1676 * modifying the image file. This is critical for zero-copy guest I/O
1677 * where anything might happen inside guest memory.
1678 */
1679 void *bounce_buffer;
1680
1681 BlockDriver *drv = bs->drv;
1682 struct iovec iov;
1683 QEMUIOVector bounce_qiov;
1684 int64_t cluster_sector_num;
1685 int cluster_nb_sectors;
1686 size_t skip_bytes;
1687 int ret;
1688
1689 /* Cover the entire cluster so no additional backing file I/O is required
1690 * when allocating a cluster in the image file.
1691 */
1692 round_to_clusters(bs, sector_num, nb_sectors,
1693 &cluster_sector_num, &cluster_nb_sectors);
1694
1695 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
1696 cluster_sector_num, cluster_nb_sectors);
1697
1698 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
1699 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
1700 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
1701
1702 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
1703 &bounce_qiov);
1704 if (ret < 0) {
1705 goto err;
1706 }
1707
1708 if (drv->bdrv_co_write_zeroes &&
1709 buffer_is_zero(bounce_buffer, iov.iov_len)) {
1710 ret = drv->bdrv_co_write_zeroes(bs, cluster_sector_num,
1711 cluster_nb_sectors);
1712 } else {
1713 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
1714 &bounce_qiov);
1715 }
1716
1717 if (ret < 0) {
1718 /* It might be okay to ignore write errors for guest requests. If this
1719 * is a deliberate copy-on-read then we don't want to ignore the error.
1720 * Simply report it in all cases.
1721 */
1722 goto err;
1723 }
1724
1725 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
1726 qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
1727 nb_sectors * BDRV_SECTOR_SIZE);
1728
1729 err:
1730 qemu_vfree(bounce_buffer);
1731 return ret;
1732 }
1733
1734 /*
1735 * Handle a read request in coroutine context
1736 */
1737 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1738 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1739 BdrvRequestFlags flags)
1740 {
1741 BlockDriver *drv = bs->drv;
1742 BdrvTrackedRequest req;
1743 int ret;
1744
1745 if (!drv) {
1746 return -ENOMEDIUM;
1747 }
1748 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1749 return -EIO;
1750 }
1751
1752 /* throttling disk read I/O */
1753 if (bs->io_limits_enabled) {
1754 bdrv_io_limits_intercept(bs, false, nb_sectors);
1755 }
1756
1757 if (bs->copy_on_read) {
1758 flags |= BDRV_REQ_COPY_ON_READ;
1759 }
1760 if (flags & BDRV_REQ_COPY_ON_READ) {
1761 bs->copy_on_read_in_flight++;
1762 }
1763
1764 if (bs->copy_on_read_in_flight) {
1765 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1766 }
1767
1768 tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
1769
1770 if (flags & BDRV_REQ_COPY_ON_READ) {
1771 int pnum;
1772
1773 ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
1774 if (ret < 0) {
1775 goto out;
1776 }
1777
1778 if (!ret || pnum != nb_sectors) {
1779 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
1780 goto out;
1781 }
1782 }
1783
1784 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1785
1786 out:
1787 tracked_request_end(&req);
1788
1789 if (flags & BDRV_REQ_COPY_ON_READ) {
1790 bs->copy_on_read_in_flight--;
1791 }
1792
1793 return ret;
1794 }
1795
1796 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1797 int nb_sectors, QEMUIOVector *qiov)
1798 {
1799 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1800
1801 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1802 }
1803
1804 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1805 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1806 {
1807 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1808
1809 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1810 BDRV_REQ_COPY_ON_READ);
1811 }
1812
1813 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1814 int64_t sector_num, int nb_sectors)
1815 {
1816 BlockDriver *drv = bs->drv;
1817 QEMUIOVector qiov;
1818 struct iovec iov;
1819 int ret;
1820
1821 /* First try the efficient write zeroes operation */
1822 if (drv->bdrv_co_write_zeroes) {
1823 return drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1824 }
1825
1826 /* Fall back to bounce buffer if write zeroes is unsupported */
1827 iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
1828 iov.iov_base = qemu_blockalign(bs, iov.iov_len);
1829 memset(iov.iov_base, 0, iov.iov_len);
1830 qemu_iovec_init_external(&qiov, &iov, 1);
1831
1832 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
1833
1834 qemu_vfree(iov.iov_base);
1835 return ret;
1836 }
1837
1838 /*
1839 * Handle a write request in coroutine context
1840 */
1841 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1842 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1843 BdrvRequestFlags flags)
1844 {
1845 BlockDriver *drv = bs->drv;
1846 BdrvTrackedRequest req;
1847 int ret;
1848
1849 if (!bs->drv) {
1850 return -ENOMEDIUM;
1851 }
1852 if (bs->read_only) {
1853 return -EACCES;
1854 }
1855 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1856 return -EIO;
1857 }
1858
1859 /* throttling disk write I/O */
1860 if (bs->io_limits_enabled) {
1861 bdrv_io_limits_intercept(bs, true, nb_sectors);
1862 }
1863
1864 if (bs->copy_on_read_in_flight) {
1865 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1866 }
1867
1868 tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
1869
1870 if (flags & BDRV_REQ_ZERO_WRITE) {
1871 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
1872 } else {
1873 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1874 }
1875
1876 if (bs->dirty_bitmap) {
1877 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1878 }
1879
1880 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1881 bs->wr_highest_sector = sector_num + nb_sectors - 1;
1882 }
1883
1884 tracked_request_end(&req);
1885
1886 return ret;
1887 }
1888
1889 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1890 int nb_sectors, QEMUIOVector *qiov)
1891 {
1892 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1893
1894 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
1895 }
1896
1897 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
1898 int64_t sector_num, int nb_sectors)
1899 {
1900 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1901
1902 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
1903 BDRV_REQ_ZERO_WRITE);
1904 }
1905
1906 /**
1907 * Truncate file to 'offset' bytes (needed only for file protocols)
1908 */
1909 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1910 {
1911 BlockDriver *drv = bs->drv;
1912 int ret;
1913 if (!drv)
1914 return -ENOMEDIUM;
1915 if (!drv->bdrv_truncate)
1916 return -ENOTSUP;
1917 if (bs->read_only)
1918 return -EACCES;
1919 if (bdrv_in_use(bs))
1920 return -EBUSY;
1921 ret = drv->bdrv_truncate(bs, offset);
1922 if (ret == 0) {
1923 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
1924 bdrv_dev_resize_cb(bs);
1925 }
1926 return ret;
1927 }
1928
1929 /**
1930 * Length of an allocated file in bytes. Sparse files are counted by actual
1931 * allocated space. Return < 0 if error or unknown.
1932 */
1933 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
1934 {
1935 BlockDriver *drv = bs->drv;
1936 if (!drv) {
1937 return -ENOMEDIUM;
1938 }
1939 if (drv->bdrv_get_allocated_file_size) {
1940 return drv->bdrv_get_allocated_file_size(bs);
1941 }
1942 if (bs->file) {
1943 return bdrv_get_allocated_file_size(bs->file);
1944 }
1945 return -ENOTSUP;
1946 }
1947
1948 /**
1949 * Length of a file in bytes. Return < 0 if error or unknown.
1950 */
1951 int64_t bdrv_getlength(BlockDriverState *bs)
1952 {
1953 BlockDriver *drv = bs->drv;
1954 if (!drv)
1955 return -ENOMEDIUM;
1956
1957 if (bs->growable || bdrv_dev_has_removable_media(bs)) {
1958 if (drv->bdrv_getlength) {
1959 return drv->bdrv_getlength(bs);
1960 }
1961 }
1962 return bs->total_sectors * BDRV_SECTOR_SIZE;
1963 }
1964
1965 /* return 0 as number of sectors if no device present or error */
1966 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
1967 {
1968 int64_t length;
1969 length = bdrv_getlength(bs);
1970 if (length < 0)
1971 length = 0;
1972 else
1973 length = length >> BDRV_SECTOR_BITS;
1974 *nb_sectors_ptr = length;
1975 }
1976
1977 struct partition {
1978 uint8_t boot_ind; /* 0x80 - active */
1979 uint8_t head; /* starting head */
1980 uint8_t sector; /* starting sector */
1981 uint8_t cyl; /* starting cylinder */
1982 uint8_t sys_ind; /* What partition type */
1983 uint8_t end_head; /* end head */
1984 uint8_t end_sector; /* end sector */
1985 uint8_t end_cyl; /* end cylinder */
1986 uint32_t start_sect; /* starting sector counting from 0 */
1987 uint32_t nr_sects; /* nr of sectors in partition */
1988 } QEMU_PACKED;
1989
1990 /* try to guess the disk logical geometry from the MSDOS partition table. Return 0 if OK, -1 if it could not be guessed */
1991 static int guess_disk_lchs(BlockDriverState *bs,
1992 int *pcylinders, int *pheads, int *psectors)
1993 {
1994 uint8_t buf[BDRV_SECTOR_SIZE];
1995 int ret, i, heads, sectors, cylinders;
1996 struct partition *p;
1997 uint32_t nr_sects;
1998 uint64_t nb_sectors;
1999 bool enabled;
2000
2001 bdrv_get_geometry(bs, &nb_sectors);
2002
2003 /**
2004 * This function is invoked during startup not only in sync I/O mode,
2005 * but also in async I/O mode, so the I/O throttling function has to
2006 * be disabled temporarily here, not permanently.
2007 */
2008 enabled = bs->io_limits_enabled;
2009 bs->io_limits_enabled = false;
2010 ret = bdrv_read(bs, 0, buf, 1);
2011 bs->io_limits_enabled = enabled;
2012 if (ret < 0)
2013 return -1;
2014 /* test msdos magic */
2015 if (buf[510] != 0x55 || buf[511] != 0xaa)
2016 return -1;
2017 for(i = 0; i < 4; i++) {
2018 p = ((struct partition *)(buf + 0x1be)) + i;
2019 nr_sects = le32_to_cpu(p->nr_sects);
2020 if (nr_sects && p->end_head) {
2021 /* We make the assumption that the partition terminates on
2022 a cylinder boundary */
2023 heads = p->end_head + 1;
2024 sectors = p->end_sector & 63;
2025 if (sectors == 0)
2026 continue;
2027 cylinders = nb_sectors / (heads * sectors);
2028 if (cylinders < 1 || cylinders > 16383)
2029 continue;
2030 *pheads = heads;
2031 *psectors = sectors;
2032 *pcylinders = cylinders;
2033 #if 0
2034 printf("guessed geometry: LCHS=%d %d %d\n",
2035 cylinders, heads, sectors);
2036 #endif
2037 return 0;
2038 }
2039 }
2040 return -1;
2041 }
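/* Worked example (hypothetical partition entry): end_head = 15 and
 * end_sector = 63 give heads = 16 and sectors = 63. On a 1032192-sector
 * image that yields cylinders = 1032192 / (16 * 63) = 1024, which lies in
 * the accepted [1, 16383] range, so the guess succeeds and 0 is returned.
 */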
2042
2043 void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
2044 {
2045 int translation, lba_detected = 0;
2046 int cylinders, heads, secs;
2047 uint64_t nb_sectors;
2048
2049 /* if a geometry hint is available, use it */
2050 bdrv_get_geometry(bs, &nb_sectors);
2051 bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
2052 translation = bdrv_get_translation_hint(bs);
2053 if (cylinders != 0) {
2054 *pcyls = cylinders;
2055 *pheads = heads;
2056 *psecs = secs;
2057 } else {
2058 if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
2059 if (heads > 16) {
2060 /* if heads > 16, it means that a BIOS LBA
2061 translation was active, so the default
2062 hardware geometry is OK */
2063 lba_detected = 1;
2064 goto default_geometry;
2065 } else {
2066 *pcyls = cylinders;
2067 *pheads = heads;
2068 *psecs = secs;
2069 /* disable any translation to be in sync with
2070 the logical geometry */
2071 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
2072 bdrv_set_translation_hint(bs,
2073 BIOS_ATA_TRANSLATION_NONE);
2074 }
2075 }
2076 } else {
2077 default_geometry:
2078 /* if no geometry, use a standard physical disk geometry */
2079 cylinders = nb_sectors / (16 * 63);
2080
2081 if (cylinders > 16383)
2082 cylinders = 16383;
2083 else if (cylinders < 2)
2084 cylinders = 2;
2085 *pcyls = cylinders;
2086 *pheads = 16;
2087 *psecs = 63;
2088 if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
2089 if ((*pcyls * *pheads) <= 131072) {
2090 bdrv_set_translation_hint(bs,
2091 BIOS_ATA_TRANSLATION_LARGE);
2092 } else {
2093 bdrv_set_translation_hint(bs,
2094 BIOS_ATA_TRANSLATION_LBA);
2095 }
2096 }
2097 }
2098 bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
2099 }
2100 }
2101
2102 void bdrv_set_geometry_hint(BlockDriverState *bs,
2103 int cyls, int heads, int secs)
2104 {
2105 bs->cyls = cyls;
2106 bs->heads = heads;
2107 bs->secs = secs;
2108 }
2109
2110 void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
2111 {
2112 bs->translation = translation;
2113 }
2114
2115 void bdrv_get_geometry_hint(BlockDriverState *bs,
2116 int *pcyls, int *pheads, int *psecs)
2117 {
2118 *pcyls = bs->cyls;
2119 *pheads = bs->heads;
2120 *psecs = bs->secs;
2121 }
2122
2123 /* throttling disk io limits */
2124 void bdrv_set_io_limits(BlockDriverState *bs,
2125 BlockIOLimit *io_limits)
2126 {
2127 bs->io_limits = *io_limits;
2128 bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2129 }
2130
2131 /* Recognize floppy formats */
2132 typedef struct FDFormat {
2133 FDriveType drive;
2134 uint8_t last_sect;
2135 uint8_t max_track;
2136 uint8_t max_head;
2137 FDriveRate rate;
2138 } FDFormat;
2139
2140 static const FDFormat fd_formats[] = {
2141 /* First entry is default format */
2142 /* 1.44 MB 3"1/2 floppy disks */
2143 { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
2144 { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
2145 { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
2146 { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
2147 { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
2148 { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
2149 { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
2150 { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
2151 /* 2.88 MB 3"1/2 floppy disks */
2152 { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
2153 { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
2154 { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
2155 { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
2156 { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
2157 /* 720 kB 3"1/2 floppy disks */
2158 { FDRIVE_DRV_144, 9, 80, 1, FDRIVE_RATE_250K, },
2159 { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
2160 { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
2161 { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
2162 { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
2163 { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
2164 /* 1.2 MB 5"1/4 floppy disks */
2165 { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
2166 { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
2167 { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
2168 { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
2169 { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
2170 /* 720 kB 5"1/4 floppy disks */
2171 { FDRIVE_DRV_120, 9, 80, 1, FDRIVE_RATE_250K, },
2172 { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
2173 /* 360 kB 5"1/4 floppy disks */
2174 { FDRIVE_DRV_120, 9, 40, 1, FDRIVE_RATE_300K, },
2175 { FDRIVE_DRV_120, 9, 40, 0, FDRIVE_RATE_300K, },
2176 { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
2177 { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
2178 /* 320 kB 5"1/4 floppy disks */
2179 { FDRIVE_DRV_120, 8, 40, 1, FDRIVE_RATE_250K, },
2180 { FDRIVE_DRV_120, 8, 40, 0, FDRIVE_RATE_250K, },
2181 /* 360 kB must match 5"1/4 better than 3"1/2... */
2182 { FDRIVE_DRV_144, 9, 80, 0, FDRIVE_RATE_250K, },
2183 /* end */
2184 { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
2185 };
2186
2187 void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
2188 int *max_track, int *last_sect,
2189 FDriveType drive_in, FDriveType *drive,
2190 FDriveRate *rate)
2191 {
2192 const FDFormat *parse;
2193 uint64_t nb_sectors, size;
2194 int i, first_match, match;
2195
2196 bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
2197 if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
2198 /* User defined disk */
2199 *rate = FDRIVE_RATE_500K;
2200 } else {
2201 bdrv_get_geometry(bs, &nb_sectors);
2202 match = -1;
2203 first_match = -1;
2204 for (i = 0; ; i++) {
2205 parse = &fd_formats[i];
2206 if (parse->drive == FDRIVE_DRV_NONE) {
2207 break;
2208 }
2209 if (drive_in == parse->drive ||
2210 drive_in == FDRIVE_DRV_NONE) {
2211 size = (parse->max_head + 1) * parse->max_track *
2212 parse->last_sect;
2213 if (nb_sectors == size) {
2214 match = i;
2215 break;
2216 }
2217 if (first_match == -1) {
2218 first_match = i;
2219 }
2220 }
2221 }
2222 if (match == -1) {
2223 if (first_match == -1) {
2224 match = 1;
2225 } else {
2226 match = first_match;
2227 }
2228 parse = &fd_formats[match];
2229 }
2230 *nb_heads = parse->max_head + 1;
2231 *max_track = parse->max_track;
2232 *last_sect = parse->last_sect;
2233 *drive = parse->drive;
2234 *rate = parse->rate;
2235 }
2236 }
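/*
 * Worked example: a standard 1.44 MB image is 2880 sectors long, which
 * matches the first fd_formats entry exactly:
 * (max_head + 1) * max_track * last_sect = (1 + 1) * 80 * 18 = 2880.
 * The drive is then reported as FDRIVE_DRV_144 at FDRIVE_RATE_500K.
 */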
2237
2238 int bdrv_get_translation_hint(BlockDriverState *bs)
2239 {
2240 return bs->translation;
2241 }
2242
2243 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
2244 BlockErrorAction on_write_error)
2245 {
2246 bs->on_read_error = on_read_error;
2247 bs->on_write_error = on_write_error;
2248 }
2249
2250 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
2251 {
2252 return is_read ? bs->on_read_error : bs->on_write_error;
2253 }
2254
2255 int bdrv_is_read_only(BlockDriverState *bs)
2256 {
2257 return bs->read_only;
2258 }
2259
2260 int bdrv_is_sg(BlockDriverState *bs)
2261 {
2262 return bs->sg;
2263 }
2264
2265 int bdrv_enable_write_cache(BlockDriverState *bs)
2266 {
2267 return bs->enable_write_cache;
2268 }
2269
2270 int bdrv_is_encrypted(BlockDriverState *bs)
2271 {
2272 if (bs->backing_hd && bs->backing_hd->encrypted)
2273 return 1;
2274 return bs->encrypted;
2275 }
2276
2277 int bdrv_key_required(BlockDriverState *bs)
2278 {
2279 BlockDriverState *backing_hd = bs->backing_hd;
2280
2281 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2282 return 1;
2283 return (bs->encrypted && !bs->valid_key);
2284 }
2285
2286 int bdrv_set_key(BlockDriverState *bs, const char *key)
2287 {
2288 int ret;
2289 if (bs->backing_hd && bs->backing_hd->encrypted) {
2290 ret = bdrv_set_key(bs->backing_hd, key);
2291 if (ret < 0)
2292 return ret;
2293 if (!bs->encrypted)
2294 return 0;
2295 }
2296 if (!bs->encrypted) {
2297 return -EINVAL;
2298 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2299 return -ENOMEDIUM;
2300 }
2301 ret = bs->drv->bdrv_set_key(bs, key);
2302 if (ret < 0) {
2303 bs->valid_key = 0;
2304 } else if (!bs->valid_key) {
2305 bs->valid_key = 1;
2306 /* call the change callback now, we skipped it on open */
2307 bdrv_dev_change_media_cb(bs, true);
2308 }
2309 return ret;
2310 }
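/*
 * Minimal usage sketch: callers typically test bdrv_key_required() first
 * and only then supply a key.  The password string below is a placeholder.
 */
#if 0
if (bdrv_key_required(bs)) {
    int ret = bdrv_set_key(bs, "password");  /* placeholder key */
    if (ret < 0) {
        error_report("Invalid password, ret = %d", ret);
    }
}
#endif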
2311
2312 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
2313 {
2314 if (!bs->drv) {
2315 buf[0] = '\0';
2316 } else {
2317 pstrcpy(buf, buf_size, bs->drv->format_name);
2318 }
2319 }
2320
2321 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2322 void *opaque)
2323 {
2324 BlockDriver *drv;
2325
2326 QLIST_FOREACH(drv, &bdrv_drivers, list) {
2327 it(opaque, drv->format_name);
2328 }
2329 }
2330
2331 BlockDriverState *bdrv_find(const char *name)
2332 {
2333 BlockDriverState *bs;
2334
2335 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2336 if (!strcmp(name, bs->device_name)) {
2337 return bs;
2338 }
2339 }
2340 return NULL;
2341 }
2342
2343 BlockDriverState *bdrv_next(BlockDriverState *bs)
2344 {
2345 if (!bs) {
2346 return QTAILQ_FIRST(&bdrv_states);
2347 }
2348 return QTAILQ_NEXT(bs, list);
2349 }
2350
2351 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
2352 {
2353 BlockDriverState *bs;
2354
2355 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2356 it(opaque, bs);
2357 }
2358 }
2359
2360 const char *bdrv_get_device_name(BlockDriverState *bs)
2361 {
2362 return bs->device_name;
2363 }
2364
2365 void bdrv_flush_all(void)
2366 {
2367 BlockDriverState *bs;
2368
2369 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2370 bdrv_flush(bs);
2371 }
2372 }
2373
2374 int bdrv_has_zero_init(BlockDriverState *bs)
2375 {
2376 assert(bs->drv);
2377
2378 if (bs->drv->bdrv_has_zero_init) {
2379 return bs->drv->bdrv_has_zero_init(bs);
2380 }
2381
2382 return 1;
2383 }
2384
2385 typedef struct BdrvCoIsAllocatedData {
2386 BlockDriverState *bs;
2387 int64_t sector_num;
2388 int nb_sectors;
2389 int *pnum;
2390 int ret;
2391 bool done;
2392 } BdrvCoIsAllocatedData;
2393
2394 /*
2395 * Returns true iff the specified sector is present in the disk image. Drivers
2396 * not implementing the functionality are assumed to not support backing files,
2397 * hence all their sectors are reported as allocated.
2398 *
2399 * If 'sector_num' is beyond the end of the disk image the return value is 0
2400 * and 'pnum' is set to 0.
2401 *
2402 * 'pnum' is set to the number of sectors (including and immediately following
2403 * the specified sector) that are known to be in the same
2404 * allocated/unallocated state.
2405 *
2406 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
2407 * beyond the end of the disk image it will be clamped.
2408 */
2409 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2410 int nb_sectors, int *pnum)
2411 {
2412 int64_t n;
2413
2414 if (sector_num >= bs->total_sectors) {
2415 *pnum = 0;
2416 return 0;
2417 }
2418
2419 n = bs->total_sectors - sector_num;
2420 if (n < nb_sectors) {
2421 nb_sectors = n;
2422 }
2423
2424 if (!bs->drv->bdrv_co_is_allocated) {
2425 *pnum = nb_sectors;
2426 return 1;
2427 }
2428
2429 return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2430 }
2431
2432 /* Coroutine wrapper for bdrv_is_allocated() */
2433 static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2434 {
2435 BdrvCoIsAllocatedData *data = opaque;
2436 BlockDriverState *bs = data->bs;
2437
2438 data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2439 data->pnum);
2440 data->done = true;
2441 }
2442
2443 /*
2444 * Synchronous wrapper around bdrv_co_is_allocated().
2445 *
2446 * See bdrv_co_is_allocated() for details.
2447 */
2448 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2449 int *pnum)
2450 {
2451 Coroutine *co;
2452 BdrvCoIsAllocatedData data = {
2453 .bs = bs,
2454 .sector_num = sector_num,
2455 .nb_sectors = nb_sectors,
2456 .pnum = pnum,
2457 .done = false,
2458 };
2459
2460 co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2461 qemu_coroutine_enter(co, &data);
2462 while (!data.done) {
2463 qemu_aio_wait();
2464 }
2465 return data.ret;
2466 }
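/*
 * Minimal usage sketch: walk an image and count the sectors reported as
 * allocated.  Assumes bs is an open BlockDriverState; errors simply end
 * the walk for brevity.
 */
#if 0
int64_t sector = 0, allocated = 0;
int64_t total = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;

while (sector < total) {
    int num;
    /* query at most 65536 sectors at a time to stay within int range */
    int ret = bdrv_is_allocated(bs, sector, MIN(total - sector, 65536), &num);
    if (ret < 0 || num == 0) {
        break;
    }
    if (ret) {
        allocated += num;
    }
    sector += num;
}
#endif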
2467
2468 BlockInfoList *qmp_query_block(Error **errp)
2469 {
2470 BlockInfoList *head = NULL, *cur_item = NULL;
2471 BlockDriverState *bs;
2472
2473 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2474 BlockInfoList *info = g_malloc0(sizeof(*info));
2475
2476 info->value = g_malloc0(sizeof(*info->value));
2477 info->value->device = g_strdup(bs->device_name);
2478 info->value->type = g_strdup("unknown");
2479 info->value->locked = bdrv_dev_is_medium_locked(bs);
2480 info->value->removable = bdrv_dev_has_removable_media(bs);
2481
2482 if (bdrv_dev_has_removable_media(bs)) {
2483 info->value->has_tray_open = true;
2484 info->value->tray_open = bdrv_dev_is_tray_open(bs);
2485 }
2486
2487 if (bdrv_iostatus_is_enabled(bs)) {
2488 info->value->has_io_status = true;
2489 info->value->io_status = bs->iostatus;
2490 }
2491
2492 if (bs->drv) {
2493 info->value->has_inserted = true;
2494 info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2495 info->value->inserted->file = g_strdup(bs->filename);
2496 info->value->inserted->ro = bs->read_only;
2497 info->value->inserted->drv = g_strdup(bs->drv->format_name);
2498 info->value->inserted->encrypted = bs->encrypted;
2499 if (bs->backing_file[0]) {
2500 info->value->inserted->has_backing_file = true;
2501 info->value->inserted->backing_file = g_strdup(bs->backing_file);
2502 }
2503
2504 if (bs->io_limits_enabled) {
2505 info->value->inserted->bps =
2506 bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2507 info->value->inserted->bps_rd =
2508 bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2509 info->value->inserted->bps_wr =
2510 bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2511 info->value->inserted->iops =
2512 bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2513 info->value->inserted->iops_rd =
2514 bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2515 info->value->inserted->iops_wr =
2516 bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2517 }
2518 }
2519
2520 /* XXX: waiting for the qapi to support GSList */
2521 if (!cur_item) {
2522 head = cur_item = info;
2523 } else {
2524 cur_item->next = info;
2525 cur_item = info;
2526 }
2527 }
2528
2529 return head;
2530 }
2531
2532 /* Consider exposing this as a full fledged QMP command */
2533 static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2534 {
2535 BlockStats *s;
2536
2537 s = g_malloc0(sizeof(*s));
2538
2539 if (bs->device_name[0]) {
2540 s->has_device = true;
2541 s->device = g_strdup(bs->device_name);
2542 }
2543
2544 s->stats = g_malloc0(sizeof(*s->stats));
2545 s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2546 s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2547 s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2548 s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2549 s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2550 s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2551 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2552 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2553 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2554
2555 if (bs->file) {
2556 s->has_parent = true;
2557 s->parent = qmp_query_blockstat(bs->file, NULL);
2558 }
2559
2560 return s;
2561 }
2562
2563 BlockStatsList *qmp_query_blockstats(Error **errp)
2564 {
2565 BlockStatsList *head = NULL, *cur_item = NULL;
2566 BlockDriverState *bs;
2567
2568 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2569 BlockStatsList *info = g_malloc0(sizeof(*info));
2570 info->value = qmp_query_blockstat(bs, NULL);
2571
2572 /* XXX: waiting for the qapi to support GSList */
2573 if (!cur_item) {
2574 head = cur_item = info;
2575 } else {
2576 cur_item->next = info;
2577 cur_item = info;
2578 }
2579 }
2580
2581 return head;
2582 }
2583
2584 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2585 {
2586 if (bs->backing_hd && bs->backing_hd->encrypted)
2587 return bs->backing_file;
2588 else if (bs->encrypted)
2589 return bs->filename;
2590 else
2591 return NULL;
2592 }
2593
2594 void bdrv_get_backing_filename(BlockDriverState *bs,
2595 char *filename, int filename_size)
2596 {
2597 pstrcpy(filename, filename_size, bs->backing_file);
2598 }
2599
2600 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2601 const uint8_t *buf, int nb_sectors)
2602 {
2603 BlockDriver *drv = bs->drv;
2604 if (!drv)
2605 return -ENOMEDIUM;
2606 if (!drv->bdrv_write_compressed)
2607 return -ENOTSUP;
2608 if (bdrv_check_request(bs, sector_num, nb_sectors))
2609 return -EIO;
2610
2611 if (bs->dirty_bitmap) {
2612 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2613 }
2614
2615 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2616 }
2617
2618 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2619 {
2620 BlockDriver *drv = bs->drv;
2621 if (!drv)
2622 return -ENOMEDIUM;
2623 if (!drv->bdrv_get_info)
2624 return -ENOTSUP;
2625 memset(bdi, 0, sizeof(*bdi));
2626 return drv->bdrv_get_info(bs, bdi);
2627 }
2628
2629 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2630 int64_t pos, int size)
2631 {
2632 BlockDriver *drv = bs->drv;
2633 if (!drv)
2634 return -ENOMEDIUM;
2635 if (drv->bdrv_save_vmstate)
2636 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2637 if (bs->file)
2638 return bdrv_save_vmstate(bs->file, buf, pos, size);
2639 return -ENOTSUP;
2640 }
2641
2642 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2643 int64_t pos, int size)
2644 {
2645 BlockDriver *drv = bs->drv;
2646 if (!drv)
2647 return -ENOMEDIUM;
2648 if (drv->bdrv_load_vmstate)
2649 return drv->bdrv_load_vmstate(bs, buf, pos, size);
2650 if (bs->file)
2651 return bdrv_load_vmstate(bs->file, buf, pos, size);
2652 return -ENOTSUP;
2653 }
2654
2655 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2656 {
2657 BlockDriver *drv = bs->drv;
2658
2659 if (!drv || !drv->bdrv_debug_event) {
2660 return;
2661 }
2662
2663 drv->bdrv_debug_event(bs, event);
2664
2665 }
2666
2667 /**************************************************************/
2668 /* handling of snapshots */
2669
2670 int bdrv_can_snapshot(BlockDriverState *bs)
2671 {
2672 BlockDriver *drv = bs->drv;
2673 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2674 return 0;
2675 }
2676
2677 if (!drv->bdrv_snapshot_create) {
2678 if (bs->file != NULL) {
2679 return bdrv_can_snapshot(bs->file);
2680 }
2681 return 0;
2682 }
2683
2684 return 1;
2685 }
2686
2687 int bdrv_is_snapshot(BlockDriverState *bs)
2688 {
2689 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2690 }
2691
2692 BlockDriverState *bdrv_snapshots(void)
2693 {
2694 BlockDriverState *bs;
2695
2696 if (bs_snapshots) {
2697 return bs_snapshots;
2698 }
2699
2700 bs = NULL;
2701 while ((bs = bdrv_next(bs))) {
2702 if (bdrv_can_snapshot(bs)) {
2703 bs_snapshots = bs;
2704 return bs;
2705 }
2706 }
2707 return NULL;
2708 }
2709
2710 int bdrv_snapshot_create(BlockDriverState *bs,
2711 QEMUSnapshotInfo *sn_info)
2712 {
2713 BlockDriver *drv = bs->drv;
2714 if (!drv)
2715 return -ENOMEDIUM;
2716 if (drv->bdrv_snapshot_create)
2717 return drv->bdrv_snapshot_create(bs, sn_info);
2718 if (bs->file)
2719 return bdrv_snapshot_create(bs->file, sn_info);
2720 return -ENOTSUP;
2721 }
2722
2723 int bdrv_snapshot_goto(BlockDriverState *bs,
2724 const char *snapshot_id)
2725 {
2726 BlockDriver *drv = bs->drv;
2727 int ret, open_ret;
2728
2729 if (!drv)
2730 return -ENOMEDIUM;
2731 if (drv->bdrv_snapshot_goto)
2732 return drv->bdrv_snapshot_goto(bs, snapshot_id);
2733
2734 if (bs->file) {
2735 drv->bdrv_close(bs);
2736 ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2737 open_ret = drv->bdrv_open(bs, bs->open_flags);
2738 if (open_ret < 0) {
2739 bdrv_delete(bs->file);
2740 bs->drv = NULL;
2741 return open_ret;
2742 }
2743 return ret;
2744 }
2745
2746 return -ENOTSUP;
2747 }
2748
2749 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2750 {
2751 BlockDriver *drv = bs->drv;
2752 if (!drv)
2753 return -ENOMEDIUM;
2754 if (drv->bdrv_snapshot_delete)
2755 return drv->bdrv_snapshot_delete(bs, snapshot_id);
2756 if (bs->file)
2757 return bdrv_snapshot_delete(bs->file, snapshot_id);
2758 return -ENOTSUP;
2759 }
2760
2761 int bdrv_snapshot_list(BlockDriverState *bs,
2762 QEMUSnapshotInfo **psn_info)
2763 {
2764 BlockDriver *drv = bs->drv;
2765 if (!drv)
2766 return -ENOMEDIUM;
2767 if (drv->bdrv_snapshot_list)
2768 return drv->bdrv_snapshot_list(bs, psn_info);
2769 if (bs->file)
2770 return bdrv_snapshot_list(bs->file, psn_info);
2771 return -ENOTSUP;
2772 }
2773
2774 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2775 const char *snapshot_name)
2776 {
2777 BlockDriver *drv = bs->drv;
2778 if (!drv) {
2779 return -ENOMEDIUM;
2780 }
2781 if (!bs->read_only) {
2782 return -EINVAL;
2783 }
2784 if (drv->bdrv_snapshot_load_tmp) {
2785 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2786 }
2787 return -ENOTSUP;
2788 }
2789
2790 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2791 const char *backing_file)
2792 {
2793 if (!bs->drv) {
2794 return NULL;
2795 }
2796
2797 if (bs->backing_hd) {
2798 if (strcmp(bs->backing_file, backing_file) == 0) {
2799 return bs->backing_hd;
2800 } else {
2801 return bdrv_find_backing_image(bs->backing_hd, backing_file);
2802 }
2803 }
2804
2805 return NULL;
2806 }
2807
2808 #define NB_SUFFIXES 4
2809
2810 char *get_human_readable_size(char *buf, int buf_size, int64_t size)
2811 {
2812 static const char suffixes[NB_SUFFIXES] = "KMGT";
2813 int64_t base;
2814 int i;
2815
2816 if (size <= 999) {
2817 snprintf(buf, buf_size, "%" PRId64, size);
2818 } else {
2819 base = 1024;
2820 for(i = 0; i < NB_SUFFIXES; i++) {
2821 if (size < (10 * base)) {
2822 snprintf(buf, buf_size, "%0.1f%c",
2823 (double)size / base,
2824 suffixes[i]);
2825 break;
2826 } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
2827 snprintf(buf, buf_size, "%" PRId64 "%c",
2828 ((size + (base >> 1)) / base),
2829 suffixes[i]);
2830 break;
2831 }
2832 base = base * 1024;
2833 }
2834 }
2835 return buf;
2836 }
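/*
 * For example, with a 16-byte buffer: a size of 999 prints as "999",
 * 1536 as "1.5K" (1536 / 1024 = 1.5), and 2097152 as "2.0M"
 * (2097152 / 1048576 = 2.0).  Sizes of at least 1000 * base skip
 * ahead to the next suffix.
 */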
2837
2838 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2839 {
2840 char buf1[128], date_buf[128], clock_buf[128];
2841 #ifdef _WIN32
2842 struct tm *ptm;
2843 #else
2844 struct tm tm;
2845 #endif
2846 time_t ti;
2847 int64_t secs;
2848
2849 if (!sn) {
2850 snprintf(buf, buf_size,
2851 "%-10s%-20s%7s%20s%15s",
2852 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2853 } else {
2854 ti = sn->date_sec;
2855 #ifdef _WIN32
2856 ptm = localtime(&ti);
2857 strftime(date_buf, sizeof(date_buf),
2858 "%Y-%m-%d %H:%M:%S", ptm);
2859 #else
2860 localtime_r(&ti, &tm);
2861 strftime(date_buf, sizeof(date_buf),
2862 "%Y-%m-%d %H:%M:%S", &tm);
2863 #endif
2864 secs = sn->vm_clock_nsec / 1000000000;
2865 snprintf(clock_buf, sizeof(clock_buf),
2866 "%02d:%02d:%02d.%03d",
2867 (int)(secs / 3600),
2868 (int)((secs / 60) % 60),
2869 (int)(secs % 60),
2870 (int)((sn->vm_clock_nsec / 1000000) % 1000));
2871 snprintf(buf, buf_size,
2872 "%-10s%-20s%7s%20s%15s",
2873 sn->id_str, sn->name,
2874 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2875 date_buf,
2876 clock_buf);
2877 }
2878 return buf;
2879 }
2880
2881 /**************************************************************/
2882 /* async I/Os */
2883
2884 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2885 QEMUIOVector *qiov, int nb_sectors,
2886 BlockDriverCompletionFunc *cb, void *opaque)
2887 {
2888 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2889
2890 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2891 cb, opaque, false);
2892 }
2893
2894 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2895 QEMUIOVector *qiov, int nb_sectors,
2896 BlockDriverCompletionFunc *cb, void *opaque)
2897 {
2898 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2899
2900 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2901 cb, opaque, true);
2902 }
2903
2904
2905 typedef struct MultiwriteCB {
2906 int error;
2907 int num_requests;
2908 int num_callbacks;
2909 struct {
2910 BlockDriverCompletionFunc *cb;
2911 void *opaque;
2912 QEMUIOVector *free_qiov;
2913 } callbacks[];
2914 } MultiwriteCB;
2915
2916 static void multiwrite_user_cb(MultiwriteCB *mcb)
2917 {
2918 int i;
2919
2920 for (i = 0; i < mcb->num_callbacks; i++) {
2921 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2922 if (mcb->callbacks[i].free_qiov) {
2923 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2924 }
2925 g_free(mcb->callbacks[i].free_qiov);
2926 }
2927 }
2928
2929 static void multiwrite_cb(void *opaque, int ret)
2930 {
2931 MultiwriteCB *mcb = opaque;
2932
2933 trace_multiwrite_cb(mcb, ret);
2934
2935 if (ret < 0 && !mcb->error) {
2936 mcb->error = ret;
2937 }
2938
2939 mcb->num_requests--;
2940 if (mcb->num_requests == 0) {
2941 multiwrite_user_cb(mcb);
2942 g_free(mcb);
2943 }
2944 }
2945
2946 static int multiwrite_req_compare(const void *a, const void *b)
2947 {
2948 const BlockRequest *req1 = a, *req2 = b;
2949
2950 /*
2951 * Note that we can't simply subtract req2->sector from req1->sector
2952 * here as that could overflow the return value.
2953 */
2954 if (req1->sector > req2->sector) {
2955 return 1;
2956 } else if (req1->sector < req2->sector) {
2957 return -1;
2958 } else {
2959 return 0;
2960 }
2961 }
2962
2963 /*
2964 * Takes a bunch of requests and tries to merge them. Returns the number of
2965 * requests that remain after merging.
2966 */
2967 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
2968 int num_reqs, MultiwriteCB *mcb)
2969 {
2970 int i, outidx;
2971
2972 // Sort requests by start sector
2973 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
2974
2975 // Check if adjacent requests are sequential or overlapping. If so,
2976 // combine them into a single request.
2977 outidx = 0;
2978 for (i = 1; i < num_reqs; i++) {
2979 int merge = 0;
2980 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
2981
2982 // Handle exactly sequential writes and overlapping writes.
2983 if (reqs[i].sector <= oldreq_last) {
2984 merge = 1;
2985 }
2986
2987 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
2988 merge = 0;
2989 }
2990
2991 if (merge) {
2992 size_t size;
2993 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
2994 qemu_iovec_init(qiov,
2995 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
2996
2997 // Add the first request to the merged one. If the requests are
2998 // overlapping, drop the last sectors of the first request.
2999 size = (reqs[i].sector - reqs[outidx].sector) << 9;
3000 qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
3001
3002 // We shouldn't need to add any zeros between the two requests
3003 assert(reqs[i].sector <= oldreq_last);
3004
3005 // Add the second request
3006 qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
3007
3008 reqs[outidx].nb_sectors = qiov->size >> 9;
3009 reqs[outidx].qiov = qiov;
3010
3011 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3012 } else {
3013 outidx++;
3014 reqs[outidx].sector = reqs[i].sector;
3015 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3016 reqs[outidx].qiov = reqs[i].qiov;
3017 }
3018 }
3019
3020 return outidx + 1;
3021 }
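/*
 * For example, two write requests covering sectors [0, 8) and [8, 16)
 * satisfy reqs[1].sector <= oldreq_last (8 <= 8), so they are merged
 * into a single 16-sector request whose qiov chains both original
 * vectors; the merged request count returned is 1.
 */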
3022
3023 /*
3024 * Submit multiple AIO write requests at once.
3025 *
3026 * On success, the function returns 0 and all requests in the reqs array have
3027 * been submitted. On error, it returns -1 and any of the requests may or may
3028 * not have been submitted yet; the callback will be invoked for some of the
3029 * requests but not for others. The caller must check the error field of each
3030 * BlockRequest to know which callbacks to wait for (if error != 0, no
3031 * callback will be called for that request).
3032 *
3033 * The implementation may modify the contents of the reqs array, e.g. to merge
3034 * requests. However, the fields opaque and error are left unmodified as they
3035 * are used to signal failure for a single request to the caller.
3036 */
3037 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3038 {
3039 MultiwriteCB *mcb;
3040 int i;
3041
3042 /* don't submit writes if we don't have a medium */
3043 if (bs->drv == NULL) {
3044 for (i = 0; i < num_reqs; i++) {
3045 reqs[i].error = -ENOMEDIUM;
3046 }
3047 return -1;
3048 }
3049
3050 if (num_reqs == 0) {
3051 return 0;
3052 }
3053
3054 // Create MultiwriteCB structure
3055 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3056 mcb->num_requests = 0;
3057 mcb->num_callbacks = num_reqs;
3058
3059 for (i = 0; i < num_reqs; i++) {
3060 mcb->callbacks[i].cb = reqs[i].cb;
3061 mcb->callbacks[i].opaque = reqs[i].opaque;
3062 }
3063
3064 // Check for mergeable requests
3065 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3066
3067 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3068
3069 /* Run the aio requests. */
3070 mcb->num_requests = num_reqs;
3071 for (i = 0; i < num_reqs; i++) {
3072 bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3073 reqs[i].nb_sectors, multiwrite_cb, mcb);
3074 }
3075
3076 return 0;
3077 }
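/*
 * Minimal usage sketch: submit two writes in one batch.  qiov0, qiov1
 * are assumed to be initialized QEMUIOVectors; my_write_cb and dev are
 * hypothetical callback and opaque values.
 */
#if 0
BlockRequest reqs[2] = {
    { .sector = 0,  .nb_sectors = 8, .qiov = qiov0,
      .cb = my_write_cb, .opaque = dev },
    { .sector = 16, .nb_sectors = 8, .qiov = qiov1,
      .cb = my_write_cb, .opaque = dev },
};

if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
    /* only requests with reqs[i].error == 0 will get a callback */
}
#endif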
3078
3079 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
3080 {
3081 acb->pool->cancel(acb);
3082 }
3083
3084 /* block I/O throttling */
3085 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3086 bool is_write, double elapsed_time, uint64_t *wait)
3087 {
3088 uint64_t bps_limit = 0;
3089 double bytes_limit, bytes_base, bytes_res;
3090 double slice_time, wait_time;
3091
3092 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3093 bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3094 } else if (bs->io_limits.bps[is_write]) {
3095 bps_limit = bs->io_limits.bps[is_write];
3096 } else {
3097 if (wait) {
3098 *wait = 0;
3099 }
3100
3101 return false;
3102 }
3103
3104 slice_time = bs->slice_end - bs->slice_start;
3105 slice_time /= (NANOSECONDS_PER_SECOND);
3106 bytes_limit = bps_limit * slice_time;
3107 bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3108 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3109 bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3110 }
3111
3112 /* bytes_base: the number of bytes already read/written in this slice,
3113 * obtained from the accumulated statistics.
3114 * bytes_res: the remaining bytes that still need to be read/written.
3115 * (bytes_base + bytes_res) / bps_limit: the total time needed to
3116 * complete reading/writing all of the data.
3117 */
3118 bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
3119
3120 if (bytes_base + bytes_res <= bytes_limit) {
3121 if (wait) {
3122 *wait = 0;
3123 }
3124
3125 return false;
3126 }
3127
3128 /* Calc approx time to dispatch */
3129 wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3130
3131 /* When the I/O rate at runtime exceeds the limits,
3132 * bs->slice_end needs to be extended so that the current statistics
3133 * can be kept until the timer fires; the scaling factor below was
3134 * tuned experimentally.
3135 */
3136 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3137 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3138 if (wait) {
3139 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3140 }
3141
3142 return true;
3143 }
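/*
 * Worked example: with bps_limit = 1048576 (1 MB/s), a slice of 0.1 s
 * allows bytes_limit = 104857.6 bytes.  If 65536 bytes were already
 * transferred (bytes_base) and a 64 KB request arrives (bytes_res =
 * 65536), then bytes_base + bytes_res = 131072 > bytes_limit, and with
 * elapsed_time = 0.05 s the request must wait roughly
 * 131072 / 1048576 - 0.05 = 0.075 s before it may be dispatched.
 */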
3144
3145 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3146 double elapsed_time, uint64_t *wait)
3147 {
3148 uint64_t iops_limit = 0;
3149 double ios_limit, ios_base;
3150 double slice_time, wait_time;
3151
3152 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3153 iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3154 } else if (bs->io_limits.iops[is_write]) {
3155 iops_limit = bs->io_limits.iops[is_write];
3156 } else {
3157 if (wait) {
3158 *wait = 0;
3159 }
3160
3161 return false;
3162 }
3163
3164 slice_time = bs->slice_end - bs->slice_start;
3165 slice_time /= (NANOSECONDS_PER_SECOND);
3166 ios_limit = iops_limit * slice_time;
3167 ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3168 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3169 ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
3170 }
3171
3172 if (ios_base + 1 <= ios_limit) {
3173 if (wait) {
3174 *wait = 0;
3175 }
3176
3177 return false;
3178 }
3179
3180 /* Calc approx time to dispatch */
3181 wait_time = (ios_base + 1) / iops_limit;
3182 if (wait_time > elapsed_time) {
3183 wait_time = wait_time - elapsed_time;
3184 } else {
3185 wait_time = 0;
3186 }
3187
3188 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3189 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3190 if (wait) {
3191 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3192 }
3193
3194 return true;
3195 }
3196
3197 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3198 bool is_write, int64_t *wait)
3199 {
3200 int64_t now, max_wait;
3201 uint64_t bps_wait = 0, iops_wait = 0;
3202 double elapsed_time;
3203 int bps_ret, iops_ret;
3204
3205 now = qemu_get_clock_ns(vm_clock);
3206 if ((bs->slice_start < now)
3207 && (bs->slice_end > now)) {
3208 bs->slice_end = now + bs->slice_time;
3209 } else {
3210 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
3211 bs->slice_start = now;
3212 bs->slice_end = now + bs->slice_time;
3213
3214 bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
3215 bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3216
3217 bs->io_base.ios[is_write] = bs->nr_ops[is_write];
3218 bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
3219 }
3220
3221 elapsed_time = now - bs->slice_start;
3222 elapsed_time /= (NANOSECONDS_PER_SECOND);
3223
3224 bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
3225 is_write, elapsed_time, &bps_wait);
3226 iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3227 elapsed_time, &iops_wait);
3228 if (bps_ret || iops_ret) {
3229 max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3230 if (wait) {
3231 *wait = max_wait;
3232 }
3233
3234 now = qemu_get_clock_ns(vm_clock);
3235 if (bs->slice_end < now + max_wait) {
3236 bs->slice_end = now + max_wait;
3237 }
3238
3239 return true;
3240 }
3241
3242 if (wait) {
3243 *wait = 0;
3244 }
3245
3246 return false;
3247 }
3248
3249 /**************************************************************/
3250 /* async block device emulation */
3251
3252 typedef struct BlockDriverAIOCBSync {
3253 BlockDriverAIOCB common;
3254 QEMUBH *bh;
3255 int ret;
3256 /* vector translation state */
3257 QEMUIOVector *qiov;
3258 uint8_t *bounce;
3259 int is_write;
3260 } BlockDriverAIOCBSync;
3261
3262 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3263 {
3264 BlockDriverAIOCBSync *acb =
3265 container_of(blockacb, BlockDriverAIOCBSync, common);
3266 qemu_bh_delete(acb->bh);
3267 acb->bh = NULL;
3268 qemu_aio_release(acb);
3269 }
3270
3271 static AIOPool bdrv_em_aio_pool = {
3272 .aiocb_size = sizeof(BlockDriverAIOCBSync),
3273 .cancel = bdrv_aio_cancel_em,
3274 };
3275
3276 static void bdrv_aio_bh_cb(void *opaque)
3277 {
3278 BlockDriverAIOCBSync *acb = opaque;
3279
3280 if (!acb->is_write)
3281 qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
3282 qemu_vfree(acb->bounce);
3283 acb->common.cb(acb->common.opaque, acb->ret);
3284 qemu_bh_delete(acb->bh);
3285 acb->bh = NULL;
3286 qemu_aio_release(acb);
3287 }
3288
3289 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3290 int64_t sector_num,
3291 QEMUIOVector *qiov,
3292 int nb_sectors,
3293 BlockDriverCompletionFunc *cb,
3294 void *opaque,
3295 int is_write)
3296
3297 {
3298 BlockDriverAIOCBSync *acb;
3299
3300 acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3301 acb->is_write = is_write;
3302 acb->qiov = qiov;
3303 acb->bounce = qemu_blockalign(bs, qiov->size);
3304 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3305
3306 if (is_write) {
3307 qemu_iovec_to_buffer(acb->qiov, acb->bounce);
3308 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3309 } else {
3310 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3311 }
3312
3313 qemu_bh_schedule(acb->bh);
3314
3315 return &acb->common;
3316 }
3317
3318 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3319 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3320 BlockDriverCompletionFunc *cb, void *opaque)
3321 {
3322 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3323 }
3324
3325 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3326 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3327 BlockDriverCompletionFunc *cb, void *opaque)
3328 {
3329 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3330 }
3331
3332
3333 typedef struct BlockDriverAIOCBCoroutine {
3334 BlockDriverAIOCB common;
3335 BlockRequest req;
3336 bool is_write;
3337 QEMUBH* bh;
3338 } BlockDriverAIOCBCoroutine;
3339
3340 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3341 {
3342 qemu_aio_flush();
3343 }
3344
3345 static AIOPool bdrv_em_co_aio_pool = {
3346 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
3347 .cancel = bdrv_aio_co_cancel_em,
3348 };
3349
3350 static void bdrv_co_em_bh(void *opaque)
3351 {
3352 BlockDriverAIOCBCoroutine *acb = opaque;
3353
3354 acb->common.cb(acb->common.opaque, acb->req.error);
3355 qemu_bh_delete(acb->bh);
3356 qemu_aio_release(acb);
3357 }
3358
3359 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3360 static void coroutine_fn bdrv_co_do_rw(void *opaque)
3361 {
3362 BlockDriverAIOCBCoroutine *acb = opaque;
3363 BlockDriverState *bs = acb->common.bs;
3364
3365 if (!acb->is_write) {
3366 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3367 acb->req.nb_sectors, acb->req.qiov, 0);
3368 } else {
3369 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3370 acb->req.nb_sectors, acb->req.qiov, 0);
3371 }
3372
3373 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3374 qemu_bh_schedule(acb->bh);
3375 }
3376
3377 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3378 int64_t sector_num,
3379 QEMUIOVector *qiov,
3380 int nb_sectors,
3381 BlockDriverCompletionFunc *cb,
3382 void *opaque,
3383 bool is_write)
3384 {
3385 Coroutine *co;
3386 BlockDriverAIOCBCoroutine *acb;
3387
3388 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3389 acb->req.sector = sector_num;
3390 acb->req.nb_sectors = nb_sectors;
3391 acb->req.qiov = qiov;
3392 acb->is_write = is_write;
3393
3394 co = qemu_coroutine_create(bdrv_co_do_rw);
3395 qemu_coroutine_enter(co, acb);
3396
3397 return &acb->common;
3398 }
3399
3400 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
3401 {
3402 BlockDriverAIOCBCoroutine *acb = opaque;
3403 BlockDriverState *bs = acb->common.bs;
3404
3405 acb->req.error = bdrv_co_flush(bs);
3406 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3407 qemu_bh_schedule(acb->bh);
3408 }
3409
3410 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3411 BlockDriverCompletionFunc *cb, void *opaque)
3412 {
3413 trace_bdrv_aio_flush(bs, opaque);
3414
3415 Coroutine *co;
3416 BlockDriverAIOCBCoroutine *acb;
3417
3418 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3419 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3420 qemu_coroutine_enter(co, acb);
3421
3422 return &acb->common;
3423 }
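/*
 * Minimal usage sketch: my_flush_cb is a hypothetical completion
 * callback; ret is 0 on success or a negative errno value.
 */
#if 0
static void my_flush_cb(void *opaque, int ret)
{
    if (ret < 0) {
        error_report("flush failed: %s", strerror(-ret));
    }
}

/* ... later, from request submission code ... */
acb = bdrv_aio_flush(bs, my_flush_cb, NULL);
#endif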
3424
3425 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3426 {
3427 BlockDriverAIOCBCoroutine *acb = opaque;
3428 BlockDriverState *bs = acb->common.bs;
3429
3430 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3431 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3432 qemu_bh_schedule(acb->bh);
3433 }
3434
3435 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3436 int64_t sector_num, int nb_sectors,
3437 BlockDriverCompletionFunc *cb, void *opaque)
3438 {
3439 Coroutine *co;
3440 BlockDriverAIOCBCoroutine *acb;
3441
3442 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3443
3444 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3445 acb->req.sector = sector_num;
3446 acb->req.nb_sectors = nb_sectors;
3447 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3448 qemu_coroutine_enter(co, acb);
3449
3450 return &acb->common;
3451 }
3452
3453 void bdrv_init(void)
3454 {
3455 module_call_init(MODULE_INIT_BLOCK);
3456 }
3457
3458 void bdrv_init_with_whitelist(void)
3459 {
3460 use_bdrv_whitelist = 1;
3461 bdrv_init();
3462 }
3463
3464 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3465 BlockDriverCompletionFunc *cb, void *opaque)
3466 {
3467 BlockDriverAIOCB *acb;
3468
3469 if (pool->free_aiocb) {
3470 acb = pool->free_aiocb;
3471 pool->free_aiocb = acb->next;
3472 } else {
3473 acb = g_malloc0(pool->aiocb_size);
3474 acb->pool = pool;
3475 }
3476 acb->bs = bs;
3477 acb->cb = cb;
3478 acb->opaque = opaque;
3479 return acb;
3480 }
3481
3482 void qemu_aio_release(void *p)
3483 {
3484 BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3485 AIOPool *pool = acb->pool;
3486 acb->next = pool->free_aiocb;
3487 pool->free_aiocb = acb;
3488 }
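/*
 * Minimal sketch of the AIOCB pool pattern used above: a driver embeds
 * BlockDriverAIOCB as the first member of its own AIOCB type and
 * registers a cancel hook, as bdrv_em_aio_pool does.  MyAIOCB and
 * my_aio_cancel are hypothetical names.
 */
#if 0
typedef struct MyAIOCB {
    BlockDriverAIOCB common;
    /* driver-specific state follows */
} MyAIOCB;

static void my_aio_cancel(BlockDriverAIOCB *blockacb)
{
    MyAIOCB *acb = container_of(blockacb, MyAIOCB, common);
    /* tear down the request, then release the AIOCB to the pool */
    qemu_aio_release(acb);
}

static AIOPool my_aio_pool = {
    .aiocb_size = sizeof(MyAIOCB),
    .cancel     = my_aio_cancel,
};
#endif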
3489
3490 /**************************************************************/
3491 /* Coroutine block device emulation */
3492
3493 typedef struct CoroutineIOCompletion {
3494 Coroutine *coroutine;
3495 int ret;
3496 } CoroutineIOCompletion;
3497
3498 static void bdrv_co_io_em_complete(void *opaque, int ret)
3499 {
3500 CoroutineIOCompletion *co = opaque;
3501
3502 co->ret = ret;
3503 qemu_coroutine_enter(co->coroutine, NULL);
3504 }
3505
3506 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3507 int nb_sectors, QEMUIOVector *iov,
3508 bool is_write)
3509 {
3510 CoroutineIOCompletion co = {
3511 .coroutine = qemu_coroutine_self(),
3512 };
3513 BlockDriverAIOCB *acb;
3514
3515 if (is_write) {
3516 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3517 bdrv_co_io_em_complete, &co);
3518 } else {
3519 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3520 bdrv_co_io_em_complete, &co);
3521 }
3522
3523 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3524 if (!acb) {
3525 return -EIO;
3526 }
3527 qemu_coroutine_yield();
3528
3529 return co.ret;
3530 }
3531
3532 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3533 int64_t sector_num, int nb_sectors,
3534 QEMUIOVector *iov)
3535 {
3536 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3537 }
3538
3539 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3540 int64_t sector_num, int nb_sectors,
3541 QEMUIOVector *iov)
3542 {
3543 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3544 }
3545
3546 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
3547 {
3548 RwCo *rwco = opaque;
3549
3550 rwco->ret = bdrv_co_flush(rwco->bs);
3551 }
3552
3553 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3554 {
3555 int ret;
3556
3557 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
3558 return 0;
3559 }
3560
3561 /* Write back cached data to the OS even with cache=unsafe */
3562 if (bs->drv->bdrv_co_flush_to_os) {
3563 ret = bs->drv->bdrv_co_flush_to_os(bs);
3564 if (ret < 0) {
3565 return ret;
3566 }
3567 }
3568
3569 /* But don't actually force it to the disk with cache=unsafe */
3570 if (bs->open_flags & BDRV_O_NO_FLUSH) {
3571 return 0;
3572 }
3573
3574 if (bs->drv->bdrv_co_flush_to_disk) {
3575 ret = bs->drv->bdrv_co_flush_to_disk(bs);
3576 } else if (bs->drv->bdrv_aio_flush) {
3577 BlockDriverAIOCB *acb;
3578 CoroutineIOCompletion co = {
3579 .coroutine = qemu_coroutine_self(),
3580 };
3581
3582 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3583 if (acb == NULL) {
3584 ret = -EIO;
3585 } else {
3586 qemu_coroutine_yield();
3587 ret = co.ret;
3588 }
3589 } else {
3590 /*
3591 * Some block drivers always operate in either writethrough or unsafe
3592 * mode and therefore don't support bdrv_flush. Usually qemu doesn't
3593 * know how the server works (because the behaviour is hardcoded or
3594 * depends on server-side configuration), so we can't ensure that
3595 * everything is safe on disk. Returning an error doesn't work because
3596 * that would break guests even if the server operates in writethrough
3597 * mode.
3598 *
3599 * Let's hope the user knows what he's doing.
3600 */
3601 ret = 0;
3602 }
3603 if (ret < 0) {
3604 return ret;
3605 }
3606
3607 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
3608 * set in the case of cache=unsafe, so there are no useless flushes.
3609 */
3610 return bdrv_co_flush(bs->file);
3611 }
3612
3613 void bdrv_invalidate_cache(BlockDriverState *bs)
3614 {
3615 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3616 bs->drv->bdrv_invalidate_cache(bs);
3617 }
3618 }
3619
3620 void bdrv_invalidate_cache_all(void)
3621 {
3622 BlockDriverState *bs;
3623
3624 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3625 bdrv_invalidate_cache(bs);
3626 }
3627 }
3628
3629 void bdrv_clear_incoming_migration_all(void)
3630 {
3631 BlockDriverState *bs;
3632
3633 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3634 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
3635 }
3636 }
3637
3638 int bdrv_flush(BlockDriverState *bs)
3639 {
3640 Coroutine *co;
3641 RwCo rwco = {
3642 .bs = bs,
3643 .ret = NOT_DONE,
3644 };
3645
3646 if (qemu_in_coroutine()) {
3647 /* Fast-path if already in coroutine context */
3648 bdrv_flush_co_entry(&rwco);
3649 } else {
3650 co = qemu_coroutine_create(bdrv_flush_co_entry);
3651 qemu_coroutine_enter(co, &rwco);
3652 while (rwco.ret == NOT_DONE) {
3653 qemu_aio_wait();
3654 }
3655 }
3656
3657 return rwco.ret;
3658 }
3659
3660 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3661 {
3662 RwCo *rwco = opaque;
3663
3664 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
3665 }
3666
3667 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3668 int nb_sectors)
3669 {
3670 if (!bs->drv) {
3671 return -ENOMEDIUM;
3672 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3673 return -EIO;
3674 } else if (bs->read_only) {
3675 return -EROFS;
3676 } else if (bs->drv->bdrv_co_discard) {
3677 return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3678 } else if (bs->drv->bdrv_aio_discard) {
3679 BlockDriverAIOCB *acb;
3680 CoroutineIOCompletion co = {
3681 .coroutine = qemu_coroutine_self(),
3682 };
3683
3684 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3685 bdrv_co_io_em_complete, &co);
3686 if (acb == NULL) {
3687 return -EIO;
3688 } else {
3689 qemu_coroutine_yield();
3690 return co.ret;
3691 }
3692 } else {
3693 return 0;
3694 }
3695 }
3696
3697 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3698 {
3699 Coroutine *co;
3700 RwCo rwco = {
3701 .bs = bs,
3702 .sector_num = sector_num,
3703 .nb_sectors = nb_sectors,
3704 .ret = NOT_DONE,
3705 };
3706
3707 if (qemu_in_coroutine()) {
3708 /* Fast-path if already in coroutine context */
3709 bdrv_discard_co_entry(&rwco);
3710 } else {
3711 co = qemu_coroutine_create(bdrv_discard_co_entry);
3712 qemu_coroutine_enter(co, &rwco);
3713 while (rwco.ret == NOT_DONE) {
3714 qemu_aio_wait();
3715 }
3716 }
3717
3718 return rwco.ret;
3719 }
3720
3721 /**************************************************************/
3722 /* removable device support */
3723
3724 /**
3725 * Return TRUE if the media is present
3726 */
3727 int bdrv_is_inserted(BlockDriverState *bs)
3728 {
3729 BlockDriver *drv = bs->drv;
3730
3731 if (!drv)
3732 return 0;
3733 if (!drv->bdrv_is_inserted)
3734 return 1;
3735 return drv->bdrv_is_inserted(bs);
3736 }
3737
3738 /**
3739 * Return whether the media changed since the last call to this
3740 * function, or -ENOTSUP if we don't know. Most drivers don't know.
3741 */
3742 int bdrv_media_changed(BlockDriverState *bs)
3743 {
3744 BlockDriver *drv = bs->drv;
3745
3746 if (drv && drv->bdrv_media_changed) {
3747 return drv->bdrv_media_changed(bs);
3748 }
3749 return -ENOTSUP;
3750 }
3751
3752 /**
3753 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3754 */
3755 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
3756 {
3757 BlockDriver *drv = bs->drv;
3758
3759 if (drv && drv->bdrv_eject) {
3760 drv->bdrv_eject(bs, eject_flag);
3761 }
3762
3763 if (bs->device_name[0] != '\0') {
3764 bdrv_emit_qmp_eject_event(bs, eject_flag);
3765 }
3766 }
3767
3768 /**
3769 * Lock or unlock the media (if it is locked, the user won't be able
3770 * to eject it manually).
3771 */
3772 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3773 {
3774 BlockDriver *drv = bs->drv;
3775
3776 trace_bdrv_lock_medium(bs, locked);
3777
3778 if (drv && drv->bdrv_lock_medium) {
3779 drv->bdrv_lock_medium(bs, locked);
3780 }
3781 }
3782
3783 /* needed for generic scsi interface */
3784
3785 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3786 {
3787 BlockDriver *drv = bs->drv;
3788
3789 if (drv && drv->bdrv_ioctl)
3790 return drv->bdrv_ioctl(bs, req, buf);
3791 return -ENOTSUP;
3792 }
3793
3794 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3795 unsigned long int req, void *buf,
3796 BlockDriverCompletionFunc *cb, void *opaque)
3797 {
3798 BlockDriver *drv = bs->drv;
3799
3800 if (drv && drv->bdrv_aio_ioctl)
3801 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3802 return NULL;
3803 }
3804
3805 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3806 {
3807 bs->buffer_alignment = align;
3808 }
3809
3810 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3811 {
3812 return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3813 }
3814
3815 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3816 {
3817 int64_t bitmap_size;
3818
3819 bs->dirty_count = 0;
3820 if (enable) {
3821 if (!bs->dirty_bitmap) {
3822 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3823 BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
3824 bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
3825
3826 bs->dirty_bitmap = g_malloc0(bitmap_size);
3827 }
3828 } else {
3829 if (bs->dirty_bitmap) {
3830 g_free(bs->dirty_bitmap);
3831 bs->dirty_bitmap = NULL;
3832 }
3833 }
3834 }
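/*
 * Sizing example (assuming BDRV_SECTORS_PER_DIRTY_CHUNK = 2048, i.e.
 * 1 MB chunks): a 1 GB image has 2097152 sectors, or 1024 chunks; one
 * bit per chunk rounds up to a 128-byte bitmap.
 */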
3835
3836 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3837 {
3838 int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
3839
3840 if (bs->dirty_bitmap &&
3841 (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
3842 return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3843 (1UL << (chunk % (sizeof(unsigned long) * 8))));
3844 } else {
3845 return 0;
3846 }
3847 }
3848
3849 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
3850 int nr_sectors)
3851 {
3852 set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
3853 }
3854
3855 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
3856 {
3857 return bs->dirty_count;
3858 }
3859
3860 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
3861 {
3862 assert(bs->in_use != in_use);
3863 bs->in_use = in_use;
3864 }
3865
3866 int bdrv_in_use(BlockDriverState *bs)
3867 {
3868 return bs->in_use;
3869 }
3870
3871 void bdrv_iostatus_enable(BlockDriverState *bs)
3872 {
3873 bs->iostatus_enabled = true;
3874 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3875 }
3876
3877 /* The I/O status is only enabled if the drive explicitly
3878 * enables it _and_ the VM is configured to stop on errors */
3879 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3880 {
3881 return (bs->iostatus_enabled &&
3882 (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3883 bs->on_write_error == BLOCK_ERR_STOP_ANY ||
3884 bs->on_read_error == BLOCK_ERR_STOP_ANY));
3885 }
3886
3887 void bdrv_iostatus_disable(BlockDriverState *bs)
3888 {
3889 bs->iostatus_enabled = false;
3890 }
3891
3892 void bdrv_iostatus_reset(BlockDriverState *bs)
3893 {
3894 if (bdrv_iostatus_is_enabled(bs)) {
3895 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3896 }
3897 }
3898
3899 /* XXX: Today this is set by device models because it makes the implementation
3900 quite simple. However, the block layer knows about the error, so it's
3901 possible to implement this without device models being involved */
3902 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3903 {
3904 if (bdrv_iostatus_is_enabled(bs) &&
3905 bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
3906 assert(error >= 0);
3907 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
3908 BLOCK_DEVICE_IO_STATUS_FAILED;
3909 }
3910 }
3911
3912 void
3913 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
3914 enum BlockAcctType type)
3915 {
3916 assert(type < BDRV_MAX_IOTYPE);
3917
3918 cookie->bytes = bytes;
3919 cookie->start_time_ns = get_clock();
3920 cookie->type = type;
3921 }
3922
3923 void
3924 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
3925 {
3926 assert(cookie->type < BDRV_MAX_IOTYPE);
3927
3928 bs->nr_bytes[cookie->type] += cookie->bytes;
3929 bs->nr_ops[cookie->type]++;
3930 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
3931 }
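/*
 * Minimal usage sketch: device models bracket each request with an
 * accounting cookie.  The read itself is elided.
 */
#if 0
BlockAcctCookie cookie;

bdrv_acct_start(bs, &cookie, nb_sectors * BDRV_SECTOR_SIZE,
                BDRV_ACCT_READ);
/* ... issue and complete the read ... */
bdrv_acct_done(bs, &cookie);
#endif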
3932
3933 int bdrv_img_create(const char *filename, const char *fmt,
3934 const char *base_filename, const char *base_fmt,
3935 char *options, uint64_t img_size, int flags)
3936 {
3937 QEMUOptionParameter *param = NULL, *create_options = NULL;
3938 QEMUOptionParameter *backing_fmt, *backing_file, *size;
3939 BlockDriverState *bs = NULL;
3940 BlockDriver *drv, *proto_drv;
3941 BlockDriver *backing_drv = NULL;
3942 int ret = 0;
3943
3944 /* Find driver and parse its options */
3945 drv = bdrv_find_format(fmt);
3946 if (!drv) {
3947 error_report("Unknown file format '%s'", fmt);
3948 ret = -EINVAL;
3949 goto out;
3950 }
3951
3952 proto_drv = bdrv_find_protocol(filename);
3953 if (!proto_drv) {
3954 error_report("Unknown protocol '%s'", filename);
3955 ret = -EINVAL;
3956 goto out;
3957 }
3958
3959 create_options = append_option_parameters(create_options,
3960 drv->create_options);
3961 create_options = append_option_parameters(create_options,
3962 proto_drv->create_options);
3963
3964 /* Create parameter list with default values */
3965 param = parse_option_parameters("", create_options, param);
3966
3967 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
3968
3969 /* Parse -o options */
3970 if (options) {
3971 param = parse_option_parameters(options, create_options, param);
3972 if (param == NULL) {
3973 error_report("Invalid options for file format '%s'.", fmt);
3974 ret = -EINVAL;
3975 goto out;
3976 }
3977 }
3978
3979 if (base_filename) {
3980 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
3981 base_filename)) {
3982 error_report("Backing file not supported for file format '%s'",
3983 fmt);
3984 ret = -EINVAL;
3985 goto out;
3986 }
3987 }
3988
3989 if (base_fmt) {
3990 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
3991 error_report("Backing file format not supported for file "
3992 "format '%s'", fmt);
3993 ret = -EINVAL;
3994 goto out;
3995 }
3996 }
3997
3998 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
3999 if (backing_file && backing_file->value.s) {
4000 if (!strcmp(filename, backing_file->value.s)) {
4001 error_report("Error: Trying to create an image with the "
4002 "same filename as the backing file");
4003 ret = -EINVAL;
4004 goto out;
4005 }
4006 }
4007
4008 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4009 if (backing_fmt && backing_fmt->value.s) {
4010 backing_drv = bdrv_find_format(backing_fmt->value.s);
4011 if (!backing_drv) {
4012 error_report("Unknown backing file format '%s'",
4013 backing_fmt->value.s);
4014 ret = -EINVAL;
4015 goto out;
4016 }
4017 }
4018
4019 // The size for the image must always be specified, with one exception:
4020 // If we are using a backing file, we can obtain the size from there
4021 size = get_option_parameter(param, BLOCK_OPT_SIZE);
4022 if (size && size->value.n == -1) {
4023 if (backing_file && backing_file->value.s) {
4024 uint64_t size;
4025 char buf[32];
4026
4027 bs = bdrv_new("");
4028
4029 ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
4030 if (ret < 0) {
4031 error_report("Could not open '%s'", backing_file->value.s);
4032 goto out;
4033 }
4034 bdrv_get_geometry(bs, &size);
4035 size *= 512;
4036
4037 snprintf(buf, sizeof(buf), "%" PRId64, size);
4038 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4039 } else {
4040 error_report("Image creation needs a size parameter");
4041 ret = -EINVAL;
4042 goto out;
4043 }
4044 }
4045
4046 printf("Formatting '%s', fmt=%s ", filename, fmt);
4047 print_option_parameters(param);
4048 puts("");
4049
4050 ret = bdrv_create(drv, filename, param);
4051
4052 if (ret < 0) {
4053 if (ret == -ENOTSUP) {
4054 error_report("Formatting or formatting option not supported for "
4055 "file format '%s'", fmt);
4056 } else if (ret == -EFBIG) {
4057 error_report("The image size is too large for file format '%s'",
4058 fmt);
4059 } else {
4060 error_report("%s: error while creating %s: %s", filename, fmt,
4061 strerror(-ret));
4062 }
4063 }
4064
4065 out:
4066 free_option_parameters(create_options);
4067 free_option_parameters(param);
4068
4069 if (bs) {
4070 bdrv_delete(bs);
4071 }
4072
4073 return ret;
4074 }
4075
4076 void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
4077 BlockDriverCompletionFunc *cb, void *opaque)
4078 {
4079 BlockJob *job;
4080
4081 if (bs->job || bdrv_in_use(bs)) {
4082 return NULL;
4083 }
4084 bdrv_set_in_use(bs, 1);
4085
4086 job = g_malloc0(job_type->instance_size);
4087 job->job_type = job_type;
4088 job->bs = bs;
4089 job->cb = cb;
4090 job->opaque = opaque;
4091 bs->job = job;
4092 return job;
4093 }
4094
4095 void block_job_complete(BlockJob *job, int ret)
4096 {
4097 BlockDriverState *bs = job->bs;
4098
4099 assert(bs->job == job);
4100 job->cb(job->opaque, ret);
4101 bs->job = NULL;
4102 g_free(job);
4103 bdrv_set_in_use(bs, 0);
4104 }
4105
4106 int block_job_set_speed(BlockJob *job, int64_t value)
4107 {
4108 int rc;
4109
4110 if (!job->job_type->set_speed) {
4111 return -ENOTSUP;
4112 }
4113 rc = job->job_type->set_speed(job, value);
4114 if (rc == 0) {
4115 job->speed = value;
4116 }
4117 return rc;
4118 }
4119
4120 void block_job_cancel(BlockJob *job)
4121 {
4122 job->cancelled = true;
4123 }
4124
4125 bool block_job_is_cancelled(BlockJob *job)
4126 {
4127 return job->cancelled;
4128 }
4129
4130 void block_job_cancel_sync(BlockJob *job)
4131 {
4132 BlockDriverState *bs = job->bs;
4133
4134 assert(bs->job == job);
4135 block_job_cancel(job);
4136 while (bs->job != NULL && bs->job->busy) {
4137 qemu_aio_wait();
4138 }
4139 }
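/*
 * Minimal sketch of the job life cycle: a hypothetical job coroutine
 * polls block_job_is_cancelled() and finishes via block_job_complete().
 * my_job_run and its work loop are illustrative only.
 */
#if 0
static void coroutine_fn my_job_run(void *opaque)
{
    BlockJob *job = opaque;
    int ret = 0;

    while (!block_job_is_cancelled(job)) {
        /* ... do one unit of work, yielding in between ... */
    }
    block_job_complete(job, ret);
}
#endif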