1 /*
2 * QEMU System Emulator block driver
3 *
4 * Copyright (c) 2003 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor.h"
28 #include "block_int.h"
29 #include "module.h"
30 #include "qjson.h"
31 #include "qemu-coroutine.h"
32 #include "qmp-commands.h"
33 #include "qemu-timer.h"
34
35 #ifdef CONFIG_BSD
36 #include <sys/types.h>
37 #include <sys/stat.h>
38 #include <sys/ioctl.h>
39 #include <sys/queue.h>
40 #ifndef __DragonFly__
41 #include <sys/disk.h>
42 #endif
43 #endif
44
45 #ifdef _WIN32
46 #include <windows.h>
47 #endif
48
49 #define NOT_DONE 0x7fffffff /* used while an emulated sync operation is in progress */
50
51 typedef enum {
52 BDRV_REQ_COPY_ON_READ = 0x1,
53 BDRV_REQ_ZERO_WRITE = 0x2,
54 } BdrvRequestFlags;
55
56 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
57 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
58 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
59 BlockDriverCompletionFunc *cb, void *opaque);
60 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
61 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
62 BlockDriverCompletionFunc *cb, void *opaque);
63 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
64 int64_t sector_num, int nb_sectors,
65 QEMUIOVector *iov);
66 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
67 int64_t sector_num, int nb_sectors,
68 QEMUIOVector *iov);
69 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
70 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
71 BdrvRequestFlags flags);
72 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
73 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74 BdrvRequestFlags flags);
75 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
76 int64_t sector_num,
77 QEMUIOVector *qiov,
78 int nb_sectors,
79 BlockDriverCompletionFunc *cb,
80 void *opaque,
81 bool is_write);
82 static void coroutine_fn bdrv_co_do_rw(void *opaque);
83 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
84 int64_t sector_num, int nb_sectors);
85
86 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
87 bool is_write, double elapsed_time, uint64_t *wait);
88 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
89 double elapsed_time, uint64_t *wait);
90 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
91 bool is_write, int64_t *wait);
92
93 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
94 QTAILQ_HEAD_INITIALIZER(bdrv_states);
95
96 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
97 QLIST_HEAD_INITIALIZER(bdrv_drivers);
98
99 /* The device to use for VM snapshots */
100 static BlockDriverState *bs_snapshots;
101
102 /* If non-zero, use only whitelisted block drivers */
103 static int use_bdrv_whitelist;
104
105 #ifdef _WIN32
106 static int is_windows_drive_prefix(const char *filename)
107 {
108 return (((filename[0] >= 'a' && filename[0] <= 'z') ||
109 (filename[0] >= 'A' && filename[0] <= 'Z')) &&
110 filename[1] == ':');
111 }
112
113 int is_windows_drive(const char *filename)
114 {
115 if (is_windows_drive_prefix(filename) &&
116 filename[2] == '\0')
117 return 1;
118 if (strstart(filename, "\\\\.\\", NULL) ||
119 strstart(filename, "//./", NULL))
120 return 1;
121 return 0;
122 }
123 #endif
124
125 /* throttling disk I/O limits */
126 void bdrv_io_limits_disable(BlockDriverState *bs)
127 {
128 bs->io_limits_enabled = false;
129
130 while (qemu_co_queue_next(&bs->throttled_reqs));
131
132 if (bs->block_timer) {
133 qemu_del_timer(bs->block_timer);
134 qemu_free_timer(bs->block_timer);
135 bs->block_timer = NULL;
136 }
137
138 bs->slice_start = 0;
139 bs->slice_end = 0;
140 bs->slice_time = 0;
141 memset(&bs->io_base, 0, sizeof(bs->io_base));
142 }
143
144 static void bdrv_block_timer(void *opaque)
145 {
146 BlockDriverState *bs = opaque;
147
148 qemu_co_queue_next(&bs->throttled_reqs);
149 }
150
151 void bdrv_io_limits_enable(BlockDriverState *bs)
152 {
153 qemu_co_queue_init(&bs->throttled_reqs);
154 bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
155 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
156 bs->slice_start = qemu_get_clock_ns(vm_clock);
157 bs->slice_end = bs->slice_start + bs->slice_time;
158 memset(&bs->io_base, 0, sizeof(bs->io_base));
159 bs->io_limits_enabled = true;
160 }
161
162 bool bdrv_io_limits_enabled(BlockDriverState *bs)
163 {
164 BlockIOLimit *io_limits = &bs->io_limits;
165 return io_limits->bps[BLOCK_IO_LIMIT_READ]
166 || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
167 || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
168 || io_limits->iops[BLOCK_IO_LIMIT_READ]
169 || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
170 || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
171 }
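
/*
 * Editor's illustration (not part of the original source): a minimal sketch
 * of how a caller might configure and enable throttling, assuming it holds a
 * valid BlockDriverState. The io_limits arrays are indexed by the same
 * BLOCK_IO_LIMIT_* constants used in bdrv_io_limits_enabled() above.
 *
 *     bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL] = 10 * 1024 * 1024; // 10 MB/s
 *     bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL] = 100;             // 100 req/s
 *     if (bdrv_io_limits_enabled(bs)) {
 *         bdrv_io_limits_enable(bs);
 *     }
 */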
172
173 static void bdrv_io_limits_intercept(BlockDriverState *bs,
174 bool is_write, int nb_sectors)
175 {
176 int64_t wait_time = -1;
177
178 if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
179 qemu_co_queue_wait(&bs->throttled_reqs);
180 }
181
182 /* We try to keep each request's timing in FIFO order: the next throttled
183 * request is not dequeued until the current request has been allowed to
184 * proceed. So if the current request still exceeds the limits, it is
185 * re-inserted at the head of the queue, and all requests behind it remain
186 * in the throttled_reqs queue.
187 */
188
189 while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
190 qemu_mod_timer(bs->block_timer,
191 wait_time + qemu_get_clock_ns(vm_clock));
192 qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
193 }
194
195 qemu_co_queue_next(&bs->throttled_reqs);
196 }
197
198 /* check if the path starts with "<protocol>:" */
199 static int path_has_protocol(const char *path)
200 {
201 #ifdef _WIN32
202 if (is_windows_drive(path) ||
203 is_windows_drive_prefix(path)) {
204 return 0;
205 }
206 #endif
207
208 return strchr(path, ':') != NULL;
209 }
210
211 int path_is_absolute(const char *path)
212 {
213 const char *p;
214 #ifdef _WIN32
215 /* specific case for names like: "\\.\d:" */
216 if (*path == '/' || *path == '\\')
217 return 1;
218 #endif
219 p = strchr(path, ':');
220 if (p)
221 p++;
222 else
223 p = path;
224 #ifdef _WIN32
225 return (*p == '/' || *p == '\\');
226 #else
227 return (*p == '/');
228 #endif
229 }
230
231 /* if filename is absolute, just copy it to dest. Otherwise, build a
232 path to it by treating it as relative to base_path. URLs are
233 supported. */
234 void path_combine(char *dest, int dest_size,
235 const char *base_path,
236 const char *filename)
237 {
238 const char *p, *p1;
239 int len;
240
241 if (dest_size <= 0)
242 return;
243 if (path_is_absolute(filename)) {
244 pstrcpy(dest, dest_size, filename);
245 } else {
246 p = strchr(base_path, ':');
247 if (p)
248 p++;
249 else
250 p = base_path;
251 p1 = strrchr(base_path, '/');
252 #ifdef _WIN32
253 {
254 const char *p2;
255 p2 = strrchr(base_path, '\\');
256 if (!p1 || p2 > p1)
257 p1 = p2;
258 }
259 #endif
260 if (p1)
261 p1++;
262 else
263 p1 = base_path;
264 if (p1 > p)
265 p = p1;
266 len = p - base_path;
267 if (len > dest_size - 1)
268 len = dest_size - 1;
269 memcpy(dest, base_path, len);
270 dest[len] = '\0';
271 pstrcat(dest, dest_size, filename);
272 }
273 }
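
/*
 * Editor's illustration (not part of the original source): how path_combine()
 * resolves a backing file name relative to an image path. The file names are
 * hypothetical.
 *
 *     char dest[1024];
 *
 *     path_combine(dest, sizeof(dest), "/images/base.qcow2", "backing.qcow2");
 *     // dest is now "/images/backing.qcow2"
 *
 *     path_combine(dest, sizeof(dest), "/images/base.qcow2", "/abs/other.img");
 *     // absolute names are copied verbatim: dest is "/abs/other.img"
 */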
274
275 void bdrv_register(BlockDriver *bdrv)
276 {
277 /* Block drivers without coroutine functions need emulation */
278 if (!bdrv->bdrv_co_readv) {
279 bdrv->bdrv_co_readv = bdrv_co_readv_em;
280 bdrv->bdrv_co_writev = bdrv_co_writev_em;
281
282 /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
283 * the block driver lacks aio we need to emulate that too.
284 */
285 if (!bdrv->bdrv_aio_readv) {
286 /* add AIO emulation layer */
287 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
288 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
289 }
290 }
291
292 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
293 }
294
295 /* create a new block device (by default it is empty) */
296 BlockDriverState *bdrv_new(const char *device_name)
297 {
298 BlockDriverState *bs;
299
300 bs = g_malloc0(sizeof(BlockDriverState));
301 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
302 if (device_name[0] != '\0') {
303 QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
304 }
305 bdrv_iostatus_disable(bs);
306 return bs;
307 }
308
309 BlockDriver *bdrv_find_format(const char *format_name)
310 {
311 BlockDriver *drv1;
312 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
313 if (!strcmp(drv1->format_name, format_name)) {
314 return drv1;
315 }
316 }
317 return NULL;
318 }
319
320 static int bdrv_is_whitelisted(BlockDriver *drv)
321 {
322 static const char *whitelist[] = {
323 CONFIG_BDRV_WHITELIST
324 };
325 const char **p;
326
327 if (!whitelist[0])
328 return 1; /* no whitelist, anything goes */
329
330 for (p = whitelist; *p; p++) {
331 if (!strcmp(drv->format_name, *p)) {
332 return 1;
333 }
334 }
335 return 0;
336 }
337
338 BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
339 {
340 BlockDriver *drv = bdrv_find_format(format_name);
341 return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
342 }
343
344 typedef struct CreateCo {
345 BlockDriver *drv;
346 char *filename;
347 QEMUOptionParameter *options;
348 int ret;
349 } CreateCo;
350
351 static void coroutine_fn bdrv_create_co_entry(void *opaque)
352 {
353 CreateCo *cco = opaque;
354 assert(cco->drv);
355
356 cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
357 }
358
359 int bdrv_create(BlockDriver *drv, const char* filename,
360 QEMUOptionParameter *options)
361 {
362 int ret;
363
364 Coroutine *co;
365 CreateCo cco = {
366 .drv = drv,
367 .filename = g_strdup(filename),
368 .options = options,
369 .ret = NOT_DONE,
370 };
371
372 if (!drv->bdrv_create) {
373 g_free(cco.filename);
374 return -ENOTSUP;
375 }
375
376 if (qemu_in_coroutine()) {
377 /* Fast-path if already in coroutine context */
378 bdrv_create_co_entry(&cco);
379 } else {
380 co = qemu_coroutine_create(bdrv_create_co_entry);
381 qemu_coroutine_enter(co, &cco);
382 while (cco.ret == NOT_DONE) {
383 qemu_aio_wait();
384 }
385 }
386
387 ret = cco.ret;
388 g_free(cco.filename);
389
390 return ret;
391 }
392
393 int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
394 {
395 BlockDriver *drv;
396
397 drv = bdrv_find_protocol(filename);
398 if (drv == NULL) {
399 return -ENOENT;
400 }
401
402 return bdrv_create(drv, filename, options);
403 }
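
/*
 * Editor's illustration (not part of the original source): creating an image
 * through bdrv_create(), mirroring the option handling used by the snapshot
 * path in bdrv_open() below. Assumes the qcow2 driver was compiled in; the
 * file name and size are hypothetical.
 *
 *     BlockDriver *drv = bdrv_find_format("qcow2");
 *     QEMUOptionParameter *options;
 *     int ret;
 *
 *     options = parse_option_parameters("", drv->create_options, NULL);
 *     set_option_parameter_int(options, BLOCK_OPT_SIZE, 1024 * 1024 * 1024);
 *     ret = bdrv_create(drv, "/tmp/test.qcow2", options);
 *     free_option_parameters(options);
 */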
404
405 #ifdef _WIN32
406 void get_tmp_filename(char *filename, int size)
407 {
408 char temp_dir[MAX_PATH];
409
410 GetTempPath(MAX_PATH, temp_dir);
411 GetTempFileName(temp_dir, "qem", 0, filename);
412 }
413 #else
414 void get_tmp_filename(char *filename, int size)
415 {
416 int fd;
417 const char *tmpdir;
418 /* XXX: race condition possible */
419 tmpdir = getenv("TMPDIR");
420 if (!tmpdir)
421 tmpdir = "/tmp";
422 snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
423 fd = mkstemp(filename);
424 if (fd >= 0) close(fd);
425 }
426 #endif
427
428 /*
429 * Detect host devices. By convention, /dev/cdrom[N] is always
430 * recognized as a host CDROM.
431 */
432 static BlockDriver *find_hdev_driver(const char *filename)
433 {
434 int score_max = 0, score;
435 BlockDriver *drv = NULL, *d;
436
437 QLIST_FOREACH(d, &bdrv_drivers, list) {
438 if (d->bdrv_probe_device) {
439 score = d->bdrv_probe_device(filename);
440 if (score > score_max) {
441 score_max = score;
442 drv = d;
443 }
444 }
445 }
446
447 return drv;
448 }
449
450 BlockDriver *bdrv_find_protocol(const char *filename)
451 {
452 BlockDriver *drv1;
453 char protocol[128];
454 int len;
455 const char *p;
456
457 /* TODO Drivers without bdrv_file_open must be specified explicitly */
458
459 /*
460 * XXX(hch): we really should not let host device detection
461 * override an explicit protocol specification, but moving this
462 * later breaks access to device names with colons in them.
463 * Thanks to the brain-dead persistent naming schemes on udev-
464 * based Linux systems those actually are quite common.
465 */
466 drv1 = find_hdev_driver(filename);
467 if (drv1) {
468 return drv1;
469 }
470
471 if (!path_has_protocol(filename)) {
472 return bdrv_find_format("file");
473 }
474 p = strchr(filename, ':');
475 assert(p != NULL);
476 len = p - filename;
477 if (len > sizeof(protocol) - 1)
478 len = sizeof(protocol) - 1;
479 memcpy(protocol, filename, len);
480 protocol[len] = '\0';
481 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
482 if (drv1->protocol_name &&
483 !strcmp(drv1->protocol_name, protocol)) {
484 return drv1;
485 }
486 }
487 return NULL;
488 }
489
490 static int find_image_format(const char *filename, BlockDriver **pdrv)
491 {
492 int ret, score, score_max;
493 BlockDriver *drv1, *drv;
494 uint8_t buf[2048];
495 BlockDriverState *bs;
496
497 ret = bdrv_file_open(&bs, filename, 0);
498 if (ret < 0) {
499 *pdrv = NULL;
500 return ret;
501 }
502
503 /* Return the raw BlockDriver * for scsi-generic devices or empty drives */
504 if (bs->sg || !bdrv_is_inserted(bs)) {
505 bdrv_delete(bs);
506 drv = bdrv_find_format("raw");
507 if (!drv) {
508 ret = -ENOENT;
509 }
510 *pdrv = drv;
511 return ret;
512 }
513
514 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
515 bdrv_delete(bs);
516 if (ret < 0) {
517 *pdrv = NULL;
518 return ret;
519 }
520
521 score_max = 0;
522 drv = NULL;
523 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
524 if (drv1->bdrv_probe) {
525 score = drv1->bdrv_probe(buf, ret, filename);
526 if (score > score_max) {
527 score_max = score;
528 drv = drv1;
529 }
530 }
531 }
532 if (!drv) {
533 ret = -ENOENT;
534 }
535 *pdrv = drv;
536 return ret;
537 }
538
539 /**
540 * Set the current 'total_sectors' value
541 */
542 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
543 {
544 BlockDriver *drv = bs->drv;
545
546 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
547 if (bs->sg)
548 return 0;
549
550 /* query actual device if possible, otherwise just trust the hint */
551 if (drv->bdrv_getlength) {
552 int64_t length = drv->bdrv_getlength(bs);
553 if (length < 0) {
554 return length;
555 }
556 hint = length >> BDRV_SECTOR_BITS;
557 }
558
559 bs->total_sectors = hint;
560 return 0;
561 }
562
563 /**
564 * Set open flags for a given cache mode
565 *
566 * Return 0 on success, -1 if the cache mode was invalid.
567 */
568 int bdrv_parse_cache_flags(const char *mode, int *flags)
569 {
570 *flags &= ~BDRV_O_CACHE_MASK;
571
572 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
573 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
574 } else if (!strcmp(mode, "directsync")) {
575 *flags |= BDRV_O_NOCACHE;
576 } else if (!strcmp(mode, "writeback")) {
577 *flags |= BDRV_O_CACHE_WB;
578 } else if (!strcmp(mode, "unsafe")) {
579 *flags |= BDRV_O_CACHE_WB;
580 *flags |= BDRV_O_NO_FLUSH;
581 } else if (!strcmp(mode, "writethrough")) {
582 /* this is the default */
583 } else {
584 return -1;
585 }
586
587 return 0;
588 }
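
/*
 * Editor's illustration (not part of the original source): the mode strings
 * accepted above map onto flag combinations as follows.
 *
 *     int flags = 0;
 *     if (bdrv_parse_cache_flags("none", &flags) == 0) {
 *         // flags == BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 *     }
 *     // "writeback"    -> BDRV_O_CACHE_WB
 *     // "directsync"   -> BDRV_O_NOCACHE
 *     // "unsafe"       -> BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH
 *     // "writethrough" -> no flags (the default)
 */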
589
590 /**
591 * The copy-on-read flag is actually a reference count, so multiple users
592 * may use the feature without worrying about clobbering each other's
593 * state. Copy-on-read stays enabled until every user has disabled it.
594 */
595 void bdrv_enable_copy_on_read(BlockDriverState *bs)
596 {
597 bs->copy_on_read++;
598 }
599
600 void bdrv_disable_copy_on_read(BlockDriverState *bs)
601 {
602 assert(bs->copy_on_read > 0);
603 bs->copy_on_read--;
604 }
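
/*
 * Editor's illustration (not part of the original source): because the flag
 * is a reference count, enable/disable calls must be balanced.
 *
 *     bdrv_enable_copy_on_read(bs);    // user A
 *     bdrv_enable_copy_on_read(bs);    // user B
 *     bdrv_disable_copy_on_read(bs);   // A is done; feature stays enabled
 *     bdrv_disable_copy_on_read(bs);   // B is done; bs->copy_on_read is 0
 */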
605
606 /*
607 * Common part for opening disk images and files
608 */
609 static int bdrv_open_common(BlockDriverState *bs, const char *filename,
610 int flags, BlockDriver *drv)
611 {
612 int ret, open_flags;
613
614 assert(drv != NULL);
615
616 trace_bdrv_open_common(bs, filename, flags, drv->format_name);
617
618 bs->file = NULL;
619 bs->total_sectors = 0;
620 bs->encrypted = 0;
621 bs->valid_key = 0;
622 bs->sg = 0;
623 bs->open_flags = flags;
624 bs->growable = 0;
625 bs->buffer_alignment = 512;
626
627 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
628 if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
629 bdrv_enable_copy_on_read(bs);
630 }
631
632 pstrcpy(bs->filename, sizeof(bs->filename), filename);
633 bs->backing_file[0] = '\0';
634
635 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
636 return -ENOTSUP;
637 }
638
639 bs->drv = drv;
640 bs->opaque = g_malloc0(drv->instance_size);
641
642 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
643
644 /*
645 * Clear flags that are internal to the block layer before opening the
646 * image.
647 */
648 open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
649
650 /*
651 * Snapshots should be writable.
652 */
653 if (bs->is_temporary) {
654 open_flags |= BDRV_O_RDWR;
655 }
656
657 bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
658
659 /* Open the image, either directly or using a protocol */
660 if (drv->bdrv_file_open) {
661 ret = drv->bdrv_file_open(bs, filename, open_flags);
662 } else {
663 ret = bdrv_file_open(&bs->file, filename, open_flags);
664 if (ret >= 0) {
665 ret = drv->bdrv_open(bs, open_flags);
666 }
667 }
668
669 if (ret < 0) {
670 goto free_and_fail;
671 }
672
673 ret = refresh_total_sectors(bs, bs->total_sectors);
674 if (ret < 0) {
675 goto free_and_fail;
676 }
677
678 #ifndef _WIN32
679 if (bs->is_temporary) {
680 unlink(filename);
681 }
682 #endif
683 return 0;
684
685 free_and_fail:
686 if (bs->file) {
687 bdrv_delete(bs->file);
688 bs->file = NULL;
689 }
690 g_free(bs->opaque);
691 bs->opaque = NULL;
692 bs->drv = NULL;
693 return ret;
694 }
695
696 /*
697 * Opens a file using a protocol (file, host_device, nbd, ...)
698 */
699 int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
700 {
701 BlockDriverState *bs;
702 BlockDriver *drv;
703 int ret;
704
705 drv = bdrv_find_protocol(filename);
706 if (!drv) {
707 return -ENOENT;
708 }
709
710 bs = bdrv_new("");
711 ret = bdrv_open_common(bs, filename, flags, drv);
712 if (ret < 0) {
713 bdrv_delete(bs);
714 return ret;
715 }
716 bs->growable = 1;
717 *pbs = bs;
718 return 0;
719 }
720
721 /*
722 * Opens a disk image (raw, qcow2, vmdk, ...)
723 */
724 int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
725 BlockDriver *drv)
726 {
727 int ret;
728 char tmp_filename[PATH_MAX];
729
730 if (flags & BDRV_O_SNAPSHOT) {
731 BlockDriverState *bs1;
732 int64_t total_size;
733 int is_protocol = 0;
734 BlockDriver *bdrv_qcow2;
735 QEMUOptionParameter *options;
736 char backing_filename[PATH_MAX];
737
738 /* if snapshot, we create a temporary image backed by 'filename' and
739 open that instead of opening 'filename' directly */
740
741 /* if there is a backing file, use it */
742 bs1 = bdrv_new("");
743 ret = bdrv_open(bs1, filename, 0, drv);
744 if (ret < 0) {
745 bdrv_delete(bs1);
746 return ret;
747 }
748 total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
749
750 if (bs1->drv && bs1->drv->protocol_name)
751 is_protocol = 1;
752
753 bdrv_delete(bs1);
754
755 get_tmp_filename(tmp_filename, sizeof(tmp_filename));
756
757 /* Real path is meaningless for protocols */
758 if (is_protocol)
759 snprintf(backing_filename, sizeof(backing_filename),
760 "%s", filename);
761 else if (!realpath(filename, backing_filename))
762 return -errno;
763
764 bdrv_qcow2 = bdrv_find_format("qcow2");
765 options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
766
767 set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
768 set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
769 if (drv) {
770 set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
771 drv->format_name);
772 }
773
774 ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
775 free_option_parameters(options);
776 if (ret < 0) {
777 return ret;
778 }
779
780 filename = tmp_filename;
781 drv = bdrv_qcow2;
782 bs->is_temporary = 1;
783 }
784
785 /* Find the right image format driver */
786 if (!drv) {
787 ret = find_image_format(filename, &drv);
788 }
789
790 if (!drv) {
791 goto unlink_and_fail;
792 }
793
794 /* Open the image */
795 ret = bdrv_open_common(bs, filename, flags, drv);
796 if (ret < 0) {
797 goto unlink_and_fail;
798 }
799
800 /* If there is a backing file, use it */
801 if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
802 char backing_filename[PATH_MAX];
803 int back_flags;
804 BlockDriver *back_drv = NULL;
805
806 bs->backing_hd = bdrv_new("");
807
808 if (path_has_protocol(bs->backing_file)) {
809 pstrcpy(backing_filename, sizeof(backing_filename),
810 bs->backing_file);
811 } else {
812 path_combine(backing_filename, sizeof(backing_filename),
813 filename, bs->backing_file);
814 }
815
816 if (bs->backing_format[0] != '\0') {
817 back_drv = bdrv_find_format(bs->backing_format);
818 }
819
820 /* backing files always opened read-only */
821 back_flags =
822 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
823
824 ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
825 if (ret < 0) {
826 bdrv_close(bs);
827 return ret;
828 }
829 if (bs->is_temporary) {
830 bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
831 } else {
832 /* base image inherits from "parent" */
833 bs->backing_hd->keep_read_only = bs->keep_read_only;
834 }
835 }
836
837 if (!bdrv_key_required(bs)) {
838 bdrv_dev_change_media_cb(bs, true);
839 }
840
841 /* throttling disk I/O limits */
842 if (bs->io_limits_enabled) {
843 bdrv_io_limits_enable(bs);
844 }
845
846 return 0;
847
848 unlink_and_fail:
849 if (bs->is_temporary) {
850 unlink(filename);
851 }
852 return ret;
853 }
854
855 void bdrv_close(BlockDriverState *bs)
856 {
857 bdrv_flush(bs);
858 if (bs->drv) {
859 if (bs->job) {
860 block_job_cancel_sync(bs->job);
861 }
862 bdrv_drain_all();
863
864 if (bs == bs_snapshots) {
865 bs_snapshots = NULL;
866 }
867 if (bs->backing_hd) {
868 bdrv_delete(bs->backing_hd);
869 bs->backing_hd = NULL;
870 }
871 bs->drv->bdrv_close(bs);
872 g_free(bs->opaque);
873 #ifdef _WIN32
874 if (bs->is_temporary) {
875 unlink(bs->filename);
876 }
877 #endif
878 bs->opaque = NULL;
879 bs->drv = NULL;
880 bs->copy_on_read = 0;
881
882 if (bs->file != NULL) {
883 bdrv_close(bs->file);
884 }
885
886 bdrv_dev_change_media_cb(bs, false);
887 }
888
889 /*throttling disk I/O limits*/
890 if (bs->io_limits_enabled) {
891 bdrv_io_limits_disable(bs);
892 }
893 }
894
895 void bdrv_close_all(void)
896 {
897 BlockDriverState *bs;
898
899 QTAILQ_FOREACH(bs, &bdrv_states, list) {
900 bdrv_close(bs);
901 }
902 }
903
904 /*
905 * Wait for pending requests to complete across all BlockDriverStates
906 *
907 * This function does not flush data to disk, use bdrv_flush_all() for that
908 * after calling this function.
909 *
910 * Note that completion of an asynchronous I/O operation can trigger any
911 * number of other I/O operations on other devices: for example, a coroutine
912 * can be arbitrarily complex, and a steady flow of I/O can continue until
913 * the coroutine completes. Because of this, it is not possible to provide
914 * a function that drains a single device's I/O queue.
915 */
916 void bdrv_drain_all(void)
917 {
918 BlockDriverState *bs;
919 bool busy;
920
921 do {
922 busy = qemu_aio_wait();
923
924 /* FIXME: We do not have timer support here, so this is effectively
925 * a busy wait.
926 */
927 QTAILQ_FOREACH(bs, &bdrv_states, list) {
928 if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
929 qemu_co_queue_restart_all(&bs->throttled_reqs);
930 busy = true;
931 }
932 }
933 } while (busy);
934
935 /* If requests are still pending there is a bug somewhere */
936 QTAILQ_FOREACH(bs, &bdrv_states, list) {
937 assert(QLIST_EMPTY(&bs->tracked_requests));
938 assert(qemu_co_queue_empty(&bs->throttled_reqs));
939 }
940 }
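
/*
 * Editor's illustration (not part of the original source): the intended
 * call sequence per the comment above: drain first, then flush.
 *
 *     bdrv_drain_all();   // wait for all in-flight requests to complete
 *     bdrv_flush_all();   // then flush the completed writes to disk
 */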
941
942 /* make a BlockDriverState anonymous by removing it from the bdrv_states
943 list. Also, clear device_name to prevent a double removal */
944 void bdrv_make_anon(BlockDriverState *bs)
945 {
946 if (bs->device_name[0] != '\0') {
947 QTAILQ_REMOVE(&bdrv_states, bs, list);
948 }
949 bs->device_name[0] = '\0';
950 }
951
952 /*
953 * Add the new bs's contents to the top of an image chain while the chain
954 * is live, keeping the required fields on the top layer.
955 *
956 * This will modify the BlockDriverState fields, and swap contents
957 * between bs_new and bs_top. Both bs_new and bs_top are modified.
958 *
959 * bs_new is required to be anonymous.
960 *
961 * This function does not create any image files.
962 */
963 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
964 {
965 BlockDriverState tmp;
966
967 /* bs_new must be anonymous */
968 assert(bs_new->device_name[0] == '\0');
969
970 tmp = *bs_new;
971
972 /* there are some fields that need to stay on the top layer: */
973
974 /* dev info */
975 tmp.dev_ops = bs_top->dev_ops;
976 tmp.dev_opaque = bs_top->dev_opaque;
977 tmp.dev = bs_top->dev;
978 tmp.buffer_alignment = bs_top->buffer_alignment;
979 tmp.copy_on_read = bs_top->copy_on_read;
980
981 /* i/o timing parameters */
982 tmp.slice_time = bs_top->slice_time;
983 tmp.slice_start = bs_top->slice_start;
984 tmp.slice_end = bs_top->slice_end;
985 tmp.io_limits = bs_top->io_limits;
986 tmp.io_base = bs_top->io_base;
987 tmp.throttled_reqs = bs_top->throttled_reqs;
988 tmp.block_timer = bs_top->block_timer;
989 tmp.io_limits_enabled = bs_top->io_limits_enabled;
990
991 /* geometry */
992 tmp.cyls = bs_top->cyls;
993 tmp.heads = bs_top->heads;
994 tmp.secs = bs_top->secs;
995 tmp.translation = bs_top->translation;
996
997 /* r/w error */
998 tmp.on_read_error = bs_top->on_read_error;
999 tmp.on_write_error = bs_top->on_write_error;
1000
1001 /* i/o status */
1002 tmp.iostatus_enabled = bs_top->iostatus_enabled;
1003 tmp.iostatus = bs_top->iostatus;
1004
1005 /* keep the same entry in bdrv_states */
1006 pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
1007 tmp.list = bs_top->list;
1008
1009 /* The contents of 'tmp' will become bs_top, as we are
1010 * swapping bs_new and bs_top contents. */
1011 tmp.backing_hd = bs_new;
1012 pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);
1013 bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format));
1014
1015 /* swap contents of the fixed new bs and the current top */
1016 *bs_new = *bs_top;
1017 *bs_top = tmp;
1018
1019 /* device_name[] was carried over from the old bs_top. bs_new
1020 * shouldn't be in bdrv_states, so we need to make device_name[]
1021 * reflect the anonymity of bs_new
1022 */
1023 bs_new->device_name[0] = '\0';
1024
1025 /* clear the copied fields in the new backing file */
1026 bdrv_detach_dev(bs_new, bs_new->dev);
1027
1028 qemu_co_queue_init(&bs_new->throttled_reqs);
1029 memset(&bs_new->io_base, 0, sizeof(bs_new->io_base));
1030 memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
1031 bdrv_iostatus_disable(bs_new);
1032
1033 /* we don't use bdrv_io_limits_disable() for this, because we don't want
1034 * to affect or delete the block_timer, as it has been moved to bs_top */
1035 bs_new->io_limits_enabled = false;
1036 bs_new->block_timer = NULL;
1037 bs_new->slice_time = 0;
1038 bs_new->slice_start = 0;
1039 bs_new->slice_end = 0;
1040 }
1041
1042 void bdrv_delete(BlockDriverState *bs)
1043 {
1044 assert(!bs->dev);
1045 assert(!bs->job);
1046 assert(!bs->in_use);
1047
1048 /* remove from list, if necessary */
1049 bdrv_make_anon(bs);
1050
1051 bdrv_close(bs);
1052 if (bs->file != NULL) {
1053 bdrv_delete(bs->file);
1054 }
1055
1056 assert(bs != bs_snapshots);
1057 g_free(bs);
1058 }
1059
1060 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1061 /* TODO change to DeviceState *dev when all users are qdevified */
1062 {
1063 if (bs->dev) {
1064 return -EBUSY;
1065 }
1066 bs->dev = dev;
1067 bdrv_iostatus_reset(bs);
1068 return 0;
1069 }
1070
1071 /* TODO qdevified devices don't use this, remove when devices are qdevified */
1072 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1073 {
1074 if (bdrv_attach_dev(bs, dev) < 0) {
1075 abort();
1076 }
1077 }
1078
1079 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1080 /* TODO change to DeviceState *dev when all users are qdevified */
1081 {
1082 assert(bs->dev == dev);
1083 bs->dev = NULL;
1084 bs->dev_ops = NULL;
1085 bs->dev_opaque = NULL;
1086 bs->buffer_alignment = 512;
1087 }
1088
1089 /* TODO change to return DeviceState * when all users are qdevified */
1090 void *bdrv_get_attached_dev(BlockDriverState *bs)
1091 {
1092 return bs->dev;
1093 }
1094
1095 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1096 void *opaque)
1097 {
1098 bs->dev_ops = ops;
1099 bs->dev_opaque = opaque;
1100 if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
1101 bs_snapshots = NULL;
1102 }
1103 }
1104
1105 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1106 BlockQMPEventAction action, int is_read)
1107 {
1108 QObject *data;
1109 const char *action_str;
1110
1111 switch (action) {
1112 case BDRV_ACTION_REPORT:
1113 action_str = "report";
1114 break;
1115 case BDRV_ACTION_IGNORE:
1116 action_str = "ignore";
1117 break;
1118 case BDRV_ACTION_STOP:
1119 action_str = "stop";
1120 break;
1121 default:
1122 abort();
1123 }
1124
1125 data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1126 bdrv->device_name,
1127 action_str,
1128 is_read ? "read" : "write");
1129 monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1130
1131 qobject_decref(data);
1132 }
1133
1134 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
1135 {
1136 QObject *data;
1137
1138 data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
1139 bdrv_get_device_name(bs), ejected);
1140 monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
1141
1142 qobject_decref(data);
1143 }
1144
1145 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
1146 {
1147 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
1148 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
1149 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
1150 if (tray_was_closed) {
1151 /* tray open */
1152 bdrv_emit_qmp_eject_event(bs, true);
1153 }
1154 if (load) {
1155 /* tray close */
1156 bdrv_emit_qmp_eject_event(bs, false);
1157 }
1158 }
1159 }
1160
1161 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1162 {
1163 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1164 }
1165
1166 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1167 {
1168 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
1169 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
1170 }
1171 }
1172
1173 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
1174 {
1175 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
1176 return bs->dev_ops->is_tray_open(bs->dev_opaque);
1177 }
1178 return false;
1179 }
1180
1181 static void bdrv_dev_resize_cb(BlockDriverState *bs)
1182 {
1183 if (bs->dev_ops && bs->dev_ops->resize_cb) {
1184 bs->dev_ops->resize_cb(bs->dev_opaque);
1185 }
1186 }
1187
1188 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1189 {
1190 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1191 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1192 }
1193 return false;
1194 }
1195
1196 /*
1197 * Run consistency checks on an image
1198 *
1199 * Returns 0 if the check could be completed (this does not imply that the
1200 * image is free of errors) or -errno when an internal error occurred. The
1201 * results of the check are stored in res.
1202 */
1203 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
1204 {
1205 if (bs->drv->bdrv_check == NULL) {
1206 return -ENOTSUP;
1207 }
1208
1209 memset(res, 0, sizeof(*res));
1210 return bs->drv->bdrv_check(bs, res);
1211 }
1212
1213 #define COMMIT_BUF_SECTORS 2048
1214
1215 /* commit COW file into the raw image */
1216 int bdrv_commit(BlockDriverState *bs)
1217 {
1218 BlockDriver *drv = bs->drv;
1219 BlockDriver *backing_drv;
1220 int64_t sector, total_sectors;
1221 int n, ro, open_flags;
1222 int ret = 0, rw_ret = 0;
1223 uint8_t *buf;
1224 char filename[1024];
1225 BlockDriverState *bs_rw, *bs_ro;
1226
1227 if (!drv)
1228 return -ENOMEDIUM;
1229
1230 if (!bs->backing_hd) {
1231 return -ENOTSUP;
1232 }
1233
1234 if (bs->backing_hd->keep_read_only) {
1235 return -EACCES;
1236 }
1237
1238 if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1239 return -EBUSY;
1240 }
1241
1242 backing_drv = bs->backing_hd->drv;
1243 ro = bs->backing_hd->read_only;
1244 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
1245 open_flags = bs->backing_hd->open_flags;
1246
1247 if (ro) {
1248 /* re-open as RW */
1249 bdrv_delete(bs->backing_hd);
1250 bs->backing_hd = NULL;
1251 bs_rw = bdrv_new("");
1252 rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
1253 backing_drv);
1254 if (rw_ret < 0) {
1255 bdrv_delete(bs_rw);
1256 /* try to re-open read-only */
1257 bs_ro = bdrv_new("");
1258 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1259 backing_drv);
1260 if (ret < 0) {
1261 bdrv_delete(bs_ro);
1262 /* drive not functional anymore */
1263 bs->drv = NULL;
1264 return ret;
1265 }
1266 bs->backing_hd = bs_ro;
1267 return rw_ret;
1268 }
1269 bs->backing_hd = bs_rw;
1270 }
1271
1272 total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1273 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1274
1275 for (sector = 0; sector < total_sectors; sector += n) {
1276 if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
1277
1278 if (bdrv_read(bs, sector, buf, n) != 0) {
1279 ret = -EIO;
1280 goto ro_cleanup;
1281 }
1282
1283 if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1284 ret = -EIO;
1285 goto ro_cleanup;
1286 }
1287 }
1288 }
1289
1290 if (drv->bdrv_make_empty) {
1291 ret = drv->bdrv_make_empty(bs);
1292 bdrv_flush(bs);
1293 }
1294
1295 /*
1296 * Make sure all data we wrote to the backing device is actually
1297 * stable on disk.
1298 */
1299 if (bs->backing_hd)
1300 bdrv_flush(bs->backing_hd);
1301
1302 ro_cleanup:
1303 g_free(buf);
1304
1305 if (ro) {
1306 /* re-open as RO */
1307 bdrv_delete(bs->backing_hd);
1308 bs->backing_hd = NULL;
1309 bs_ro = bdrv_new("");
1310 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1311 backing_drv);
1312 if (ret < 0) {
1313 bdrv_delete(bs_ro);
1314 /* drive not functional anymore */
1315 bs->drv = NULL;
1316 return ret;
1317 }
1318 bs->backing_hd = bs_ro;
1319 bs->backing_hd->keep_read_only = 0;
1320 }
1321
1322 return ret;
1323 }
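
/*
 * Editor's illustration (not part of the original source): a caller-side
 * sketch of committing an overlay into its backing file, with the error
 * codes returned by the checks at the top of bdrv_commit().
 *
 *     int ret = bdrv_commit(bs);
 *     if (ret == -ENOTSUP) {
 *         // bs has no backing file
 *     } else if (ret == -EACCES) {
 *         // the backing file is keep_read_only
 *     } else if (ret == -EBUSY) {
 *         // bs or its backing file is in use
 *     }
 */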
1324
1325 int bdrv_commit_all(void)
1326 {
1327 BlockDriverState *bs;
1328
1329 QTAILQ_FOREACH(bs, &bdrv_states, list) {
1330 int ret = bdrv_commit(bs);
1331 if (ret < 0) {
1332 return ret;
1333 }
1334 }
1335 return 0;
1336 }
1337
1338 struct BdrvTrackedRequest {
1339 BlockDriverState *bs;
1340 int64_t sector_num;
1341 int nb_sectors;
1342 bool is_write;
1343 QLIST_ENTRY(BdrvTrackedRequest) list;
1344 Coroutine *co; /* owner, used for deadlock detection */
1345 CoQueue wait_queue; /* coroutines blocked on this request */
1346 };
1347
1348 /**
1349 * Remove an active request from the tracked requests list
1350 *
1351 * This function should be called when a tracked request is completing.
1352 */
1353 static void tracked_request_end(BdrvTrackedRequest *req)
1354 {
1355 QLIST_REMOVE(req, list);
1356 qemu_co_queue_restart_all(&req->wait_queue);
1357 }
1358
1359 /**
1360 * Add an active request to the tracked requests list
1361 */
1362 static void tracked_request_begin(BdrvTrackedRequest *req,
1363 BlockDriverState *bs,
1364 int64_t sector_num,
1365 int nb_sectors, bool is_write)
1366 {
1367 *req = (BdrvTrackedRequest){
1368 .bs = bs,
1369 .sector_num = sector_num,
1370 .nb_sectors = nb_sectors,
1371 .is_write = is_write,
1372 .co = qemu_coroutine_self(),
1373 };
1374
1375 qemu_co_queue_init(&req->wait_queue);
1376
1377 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1378 }
1379
1380 /**
1381 * Round a region to cluster boundaries
1382 */
1383 static void round_to_clusters(BlockDriverState *bs,
1384 int64_t sector_num, int nb_sectors,
1385 int64_t *cluster_sector_num,
1386 int *cluster_nb_sectors)
1387 {
1388 BlockDriverInfo bdi;
1389
1390 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1391 *cluster_sector_num = sector_num;
1392 *cluster_nb_sectors = nb_sectors;
1393 } else {
1394 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1395 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1396 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1397 nb_sectors, c);
1398 }
1399 }
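
/*
 * Editor's illustration (not part of the original source): a worked example
 * assuming a 64 KB cluster size, i.e. c = 65536 / 512 = 128 sectors.
 *
 *     int64_t cluster_sector_num;
 *     int cluster_nb_sectors;
 *
 *     // request: sectors [130, 134)
 *     round_to_clusters(bs, 130, 4, &cluster_sector_num, &cluster_nb_sectors);
 *     // cluster_sector_num == 128 (aligned down)
 *     // cluster_nb_sectors == 128 (130 - 128 + 4 = 6, aligned up to 128)
 */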
1400
1401 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
1402 int64_t sector_num, int nb_sectors) {
1403 /* aaaa bbbb */
1404 if (sector_num >= req->sector_num + req->nb_sectors) {
1405 return false;
1406 }
1407 /* bbbb aaaa */
1408 if (req->sector_num >= sector_num + nb_sectors) {
1409 return false;
1410 }
1411 return true;
1412 }
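
/*
 * Editor's illustration (not part of the original source): with a tracked
 * request covering sectors [100, 108), the two half-open interval tests
 * above classify queries as follows.
 *
 *     // [108, 112): no overlap (108 >= 100 + 8)
 *     // [ 96, 100): no overlap (100 >= 96 + 4)
 *     // [104, 110): overlaps, both tests fail
 */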
1413
1414 static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
1415 int64_t sector_num, int nb_sectors)
1416 {
1417 BdrvTrackedRequest *req;
1418 int64_t cluster_sector_num;
1419 int cluster_nb_sectors;
1420 bool retry;
1421
1422 /* If we touch the same cluster it counts as an overlap. This guarantees
1423 * that allocating writes will be serialized and not race with each other
1424 * for the same cluster. For example, in copy-on-read it ensures that the
1425 * CoR read and write operations are atomic and guest writes cannot
1426 * interleave between them.
1427 */
1428 round_to_clusters(bs, sector_num, nb_sectors,
1429 &cluster_sector_num, &cluster_nb_sectors);
1430
1431 do {
1432 retry = false;
1433 QLIST_FOREACH(req, &bs->tracked_requests, list) {
1434 if (tracked_request_overlaps(req, cluster_sector_num,
1435 cluster_nb_sectors)) {
1436 /* Hitting this means there was a reentrant request, for
1437 * example, a block driver issuing nested requests. This must
1438 * never happen since it means deadlock.
1439 */
1440 assert(qemu_coroutine_self() != req->co);
1441
1442 qemu_co_queue_wait(&req->wait_queue);
1443 retry = true;
1444 break;
1445 }
1446 }
1447 } while (retry);
1448 }
1449
1450 /*
1451 * Return values:
1452 * 0 - success
1453 * -EINVAL - backing format specified, but no file
1454 * -ENOSPC - can't update the backing file because no space is left in the
1455 * image file header
1456 * -ENOTSUP - format driver doesn't support changing the backing file
1457 */
1458 int bdrv_change_backing_file(BlockDriverState *bs,
1459 const char *backing_file, const char *backing_fmt)
1460 {
1461 BlockDriver *drv = bs->drv;
1462
1463 /* Backing file format doesn't make sense without a backing file */
1464 if (backing_fmt && !backing_file) {
1465 return -EINVAL;
1466 }
1467
1468 if (drv->bdrv_change_backing_file != NULL) {
1469 return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1470 } else {
1471 return -ENOTSUP;
1472 }
1473 }
1474
1475 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1476 size_t size)
1477 {
1478 int64_t len;
1479
1480 if (!bdrv_is_inserted(bs))
1481 return -ENOMEDIUM;
1482
1483 if (bs->growable)
1484 return 0;
1485
1486 len = bdrv_getlength(bs);
1487
1488 if (offset < 0)
1489 return -EIO;
1490
1491 if ((offset > len) || (len - offset < size))
1492 return -EIO;
1493
1494 return 0;
1495 }
1496
1497 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1498 int nb_sectors)
1499 {
1500 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1501 nb_sectors * BDRV_SECTOR_SIZE);
1502 }
1503
1504 typedef struct RwCo {
1505 BlockDriverState *bs;
1506 int64_t sector_num;
1507 int nb_sectors;
1508 QEMUIOVector *qiov;
1509 bool is_write;
1510 int ret;
1511 } RwCo;
1512
1513 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
1514 {
1515 RwCo *rwco = opaque;
1516
1517 if (!rwco->is_write) {
1518 rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
1519 rwco->nb_sectors, rwco->qiov, 0);
1520 } else {
1521 rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
1522 rwco->nb_sectors, rwco->qiov, 0);
1523 }
1524 }
1525
1526 /*
1527 * Process a synchronous request using coroutines
1528 */
1529 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1530 int nb_sectors, bool is_write)
1531 {
1532 QEMUIOVector qiov;
1533 struct iovec iov = {
1534 .iov_base = (void *)buf,
1535 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1536 };
1537 Coroutine *co;
1538 RwCo rwco = {
1539 .bs = bs,
1540 .sector_num = sector_num,
1541 .nb_sectors = nb_sectors,
1542 .qiov = &qiov,
1543 .is_write = is_write,
1544 .ret = NOT_DONE,
1545 };
1546
1547 qemu_iovec_init_external(&qiov, &iov, 1);
1548
1549 /**
1550 * In a synchronous call context the vcpu is blocked, so this throttling
1551 * timer will never fire; therefore the I/O throttling function has to be
1552 * disabled here if it has been enabled.
1553 */
1554 if (bs->io_limits_enabled) {
1555 fprintf(stderr, "Disabling I/O throttling on '%s' due "
1556 "to synchronous I/O.\n", bdrv_get_device_name(bs));
1557 bdrv_io_limits_disable(bs);
1558 }
1559
1560 if (qemu_in_coroutine()) {
1561 /* Fast-path if already in coroutine context */
1562 bdrv_rw_co_entry(&rwco);
1563 } else {
1564 co = qemu_coroutine_create(bdrv_rw_co_entry);
1565 qemu_coroutine_enter(co, &rwco);
1566 while (rwco.ret == NOT_DONE) {
1567 qemu_aio_wait();
1568 }
1569 }
1570 return rwco.ret;
1571 }
1572
1573 /* return < 0 if error. See bdrv_write() for the return codes */
1574 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1575 uint8_t *buf, int nb_sectors)
1576 {
1577 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
1578 }
1579
1580 static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
1581 int nb_sectors, int dirty)
1582 {
1583 int64_t start, end;
1584 unsigned long val, idx, bit;
1585
1586 start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
1587 end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
1588
1589 for (; start <= end; start++) {
1590 idx = start / (sizeof(unsigned long) * 8);
1591 bit = start % (sizeof(unsigned long) * 8);
1592 val = bs->dirty_bitmap[idx];
1593 if (dirty) {
1594 if (!(val & (1UL << bit))) {
1595 bs->dirty_count++;
1596 val |= 1UL << bit;
1597 }
1598 } else {
1599 if (val & (1UL << bit)) {
1600 bs->dirty_count--;
1601 val &= ~(1UL << bit);
1602 }
1603 }
1604 bs->dirty_bitmap[idx] = val;
1605 }
1606 }
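
/*
 * Editor's illustration (not part of the original source): the index
 * arithmetic above, assuming BDRV_SECTORS_PER_DIRTY_CHUNK is 2048 (1 MB
 * chunks) and 64-bit unsigned longs.
 *
 *     // sector_num = 4096, nb_sectors = 1
 *     // start = end = 4096 / 2048 = 2 (dirty chunk number)
 *     // idx = 2 / 64 = 0 (word in the bitmap)
 *     // bit = 2 % 64 = 2 (bit within that word)
 */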
1607
1608 /* Return < 0 if error. Important errors are:
1609 -EIO generic I/O error (may happen for all errors)
1610 -ENOMEDIUM No media inserted.
1611 -EINVAL Invalid sector number or nb_sectors
1612 -EACCES Trying to write to a read-only device
1613 */
1614 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
1615 const uint8_t *buf, int nb_sectors)
1616 {
1617 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
1618 }
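
/*
 * Editor's illustration (not part of the original source): synchronous,
 * sector-granular access through the two wrappers above, here patching the
 * boot signature of sector 0 as a hypothetical example.
 *
 *     uint8_t buf[BDRV_SECTOR_SIZE];
 *
 *     if (bdrv_read(bs, 0, buf, 1) < 0) {
 *         // handle read error
 *     }
 *     buf[510] = 0x55;
 *     buf[511] = 0xaa;
 *     if (bdrv_write(bs, 0, buf, 1) < 0) {
 *         // handle write error
 *     }
 */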
1619
1620 int bdrv_pread(BlockDriverState *bs, int64_t offset,
1621 void *buf, int count1)
1622 {
1623 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1624 int len, nb_sectors, count;
1625 int64_t sector_num;
1626 int ret;
1627
1628 count = count1;
1629 /* first read to align to sector start */
1630 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1631 if (len > count)
1632 len = count;
1633 sector_num = offset >> BDRV_SECTOR_BITS;
1634 if (len > 0) {
1635 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1636 return ret;
1637 memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
1638 count -= len;
1639 if (count == 0)
1640 return count1;
1641 sector_num++;
1642 buf += len;
1643 }
1644
1645 /* read the sectors "in place" */
1646 nb_sectors = count >> BDRV_SECTOR_BITS;
1647 if (nb_sectors > 0) {
1648 if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1649 return ret;
1650 sector_num += nb_sectors;
1651 len = nb_sectors << BDRV_SECTOR_BITS;
1652 buf += len;
1653 count -= len;
1654 }
1655
1656 /* add data from the last sector */
1657 if (count > 0) {
1658 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1659 return ret;
1660 memcpy(buf, tmp_buf, count);
1661 }
1662 return count1;
1663 }
1664
1665 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1666 const void *buf, int count1)
1667 {
1668 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1669 int len, nb_sectors, count;
1670 int64_t sector_num;
1671 int ret;
1672
1673 count = count1;
1674 /* first write to align to sector start */
1675 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1676 if (len > count)
1677 len = count;
1678 sector_num = offset >> BDRV_SECTOR_BITS;
1679 if (len > 0) {
1680 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1681 return ret;
1682 memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
1683 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1684 return ret;
1685 count -= len;
1686 if (count == 0)
1687 return count1;
1688 sector_num++;
1689 buf += len;
1690 }
1691
1692 /* write the sectors "in place" */
1693 nb_sectors = count >> BDRV_SECTOR_BITS;
1694 if (nb_sectors > 0) {
1695 if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1696 return ret;
1697 sector_num += nb_sectors;
1698 len = nb_sectors << BDRV_SECTOR_BITS;
1699 buf += len;
1700 count -= len;
1701 }
1702
1703 /* add data from the last sector */
1704 if (count > 0) {
1705 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1706 return ret;
1707 memcpy(tmp_buf, buf, count);
1708 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1709 return ret;
1710 }
1711 return count1;
1712 }
1713
1714 /*
1715 * Writes to the file and ensures that no writes are reordered across this
1716 * request (acts as a barrier)
1717 *
1718 * Returns 0 on success, -errno in error cases.
1719 */
1720 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1721 const void *buf, int count)
1722 {
1723 int ret;
1724
1725 ret = bdrv_pwrite(bs, offset, buf, count);
1726 if (ret < 0) {
1727 return ret;
1728 }
1729
1730 /* No flush needed for cache modes that use O_DSYNC */
1731 if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
1732 bdrv_flush(bs);
1733 }
1734
1735 return 0;
1736 }
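
/*
 * Editor's illustration (not part of the original source): byte-granular
 * access; note that bdrv_pread()/bdrv_pwrite() return the requested byte
 * count on success, while bdrv_pwrite_sync() returns 0.
 *
 *     uint16_t sig;
 *
 *     if (bdrv_pread(bs, 510, &sig, sizeof(sig)) == sizeof(sig)) {
 *         // sig holds the two bytes at offset 510 (the MBR signature)
 *     }
 *     if (bdrv_pwrite_sync(bs, 510, &sig, sizeof(sig)) == 0) {
 *         // written and, unless the cache mode uses O_DSYNC, flushed
 *     }
 */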
1737
1738 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
1739 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1740 {
1741 /* Perform I/O through a temporary buffer so that users who scribble over
1742 * their read buffer while the operation is in progress do not end up
1743 * modifying the image file. This is critical for zero-copy guest I/O
1744 * where anything might happen inside guest memory.
1745 */
1746 void *bounce_buffer;
1747
1748 BlockDriver *drv = bs->drv;
1749 struct iovec iov;
1750 QEMUIOVector bounce_qiov;
1751 int64_t cluster_sector_num;
1752 int cluster_nb_sectors;
1753 size_t skip_bytes;
1754 int ret;
1755
1756 /* Cover entire cluster so no additional backing file I/O is required when
1757 * allocating cluster in the image file.
1758 */
1759 round_to_clusters(bs, sector_num, nb_sectors,
1760 &cluster_sector_num, &cluster_nb_sectors);
1761
1762 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
1763 cluster_sector_num, cluster_nb_sectors);
1764
1765 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
1766 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
1767 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
1768
1769 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
1770 &bounce_qiov);
1771 if (ret < 0) {
1772 goto err;
1773 }
1774
1775 if (drv->bdrv_co_write_zeroes &&
1776 buffer_is_zero(bounce_buffer, iov.iov_len)) {
1777 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
1778 cluster_nb_sectors);
1779 } else {
1780 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
1781 &bounce_qiov);
1782 }
1783
1784 if (ret < 0) {
1785 /* It might be okay to ignore write errors for guest requests. If this
1786 * is a deliberate copy-on-read then we don't want to ignore the error.
1787 * Simply report it in all cases.
1788 */
1789 goto err;
1790 }
1791
1792 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
1793 qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
1794 nb_sectors * BDRV_SECTOR_SIZE);
1795
1796 err:
1797 qemu_vfree(bounce_buffer);
1798 return ret;
1799 }
1800
1801 /*
1802 * Handle a read request in coroutine context
1803 */
1804 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1805 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1806 BdrvRequestFlags flags)
1807 {
1808 BlockDriver *drv = bs->drv;
1809 BdrvTrackedRequest req;
1810 int ret;
1811
1812 if (!drv) {
1813 return -ENOMEDIUM;
1814 }
1815 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1816 return -EIO;
1817 }
1818
1819 /* throttling disk read I/O */
1820 if (bs->io_limits_enabled) {
1821 bdrv_io_limits_intercept(bs, false, nb_sectors);
1822 }
1823
1824 if (bs->copy_on_read) {
1825 flags |= BDRV_REQ_COPY_ON_READ;
1826 }
1827 if (flags & BDRV_REQ_COPY_ON_READ) {
1828 bs->copy_on_read_in_flight++;
1829 }
1830
1831 if (bs->copy_on_read_in_flight) {
1832 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1833 }
1834
1835 tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
1836
1837 if (flags & BDRV_REQ_COPY_ON_READ) {
1838 int pnum;
1839
1840 ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
1841 if (ret < 0) {
1842 goto out;
1843 }
1844
1845 if (!ret || pnum != nb_sectors) {
1846 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
1847 goto out;
1848 }
1849 }
1850
1851 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1852
1853 out:
1854 tracked_request_end(&req);
1855
1856 if (flags & BDRV_REQ_COPY_ON_READ) {
1857 bs->copy_on_read_in_flight--;
1858 }
1859
1860 return ret;
1861 }
1862
1863 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1864 int nb_sectors, QEMUIOVector *qiov)
1865 {
1866 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1867
1868 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1869 }
1870
1871 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1872 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1873 {
1874 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1875
1876 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1877 BDRV_REQ_COPY_ON_READ);
1878 }
1879
1880 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1881 int64_t sector_num, int nb_sectors)
1882 {
1883 BlockDriver *drv = bs->drv;
1884 QEMUIOVector qiov;
1885 struct iovec iov;
1886 int ret;
1887
1888 /* TODO Emulate only part of misaligned requests instead of letting block
1889 * drivers return -ENOTSUP and emulate everything */
1890
1891 /* First try the efficient write zeroes operation */
1892 if (drv->bdrv_co_write_zeroes) {
1893 ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1894 if (ret != -ENOTSUP) {
1895 return ret;
1896 }
1897 }
1898
1899 /* Fall back to bounce buffer if write zeroes is unsupported */
1900 iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
1901 iov.iov_base = qemu_blockalign(bs, iov.iov_len);
1902 memset(iov.iov_base, 0, iov.iov_len);
1903 qemu_iovec_init_external(&qiov, &iov, 1);
1904
1905 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
1906
1907 qemu_vfree(iov.iov_base);
1908 return ret;
1909 }
1910
1911 /*
1912 * Handle a write request in coroutine context
1913 */
1914 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1915 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1916 BdrvRequestFlags flags)
1917 {
1918 BlockDriver *drv = bs->drv;
1919 BdrvTrackedRequest req;
1920 int ret;
1921
1922 if (!bs->drv) {
1923 return -ENOMEDIUM;
1924 }
1925 if (bs->read_only) {
1926 return -EACCES;
1927 }
1928 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1929 return -EIO;
1930 }
1931
1932 /* throttling disk write I/O */
1933 if (bs->io_limits_enabled) {
1934 bdrv_io_limits_intercept(bs, true, nb_sectors);
1935 }
1936
1937 if (bs->copy_on_read_in_flight) {
1938 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1939 }
1940
1941 tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
1942
1943 if (flags & BDRV_REQ_ZERO_WRITE) {
1944 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
1945 } else {
1946 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1947 }
1948
1949 if (bs->dirty_bitmap) {
1950 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1951 }
1952
1953 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1954 bs->wr_highest_sector = sector_num + nb_sectors - 1;
1955 }
1956
1957 tracked_request_end(&req);
1958
1959 return ret;
1960 }
1961
1962 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1963 int nb_sectors, QEMUIOVector *qiov)
1964 {
1965 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1966
1967 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
1968 }
1969
1970 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
1971 int64_t sector_num, int nb_sectors)
1972 {
1973 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1974
1975 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
1976 BDRV_REQ_ZERO_WRITE);
1977 }
1978
1979 /**
1980 * Truncate file to 'offset' bytes (needed only for file protocols)
1981 */
1982 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1983 {
1984 BlockDriver *drv = bs->drv;
1985 int ret;
1986 if (!drv)
1987 return -ENOMEDIUM;
1988 if (!drv->bdrv_truncate)
1989 return -ENOTSUP;
1990 if (bs->read_only)
1991 return -EACCES;
1992 if (bdrv_in_use(bs))
1993 return -EBUSY;
1994 ret = drv->bdrv_truncate(bs, offset);
1995 if (ret == 0) {
1996 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
1997 bdrv_dev_resize_cb(bs);
1998 }
1999 return ret;
2000 }
2001
2002 /**
2003 * Length of an allocated file in bytes. Sparse files are counted by their
2004 * actual allocated space. Return < 0 on error or if unknown.
2005 */
2006 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
2007 {
2008 BlockDriver *drv = bs->drv;
2009 if (!drv) {
2010 return -ENOMEDIUM;
2011 }
2012 if (drv->bdrv_get_allocated_file_size) {
2013 return drv->bdrv_get_allocated_file_size(bs);
2014 }
2015 if (bs->file) {
2016 return bdrv_get_allocated_file_size(bs->file);
2017 }
2018 return -ENOTSUP;
2019 }
2020
2021 /**
2022 * Length of a file in bytes. Return < 0 if error or unknown.
2023 */
2024 int64_t bdrv_getlength(BlockDriverState *bs)
2025 {
2026 BlockDriver *drv = bs->drv;
2027 if (!drv)
2028 return -ENOMEDIUM;
2029
2030 if (bs->growable || bdrv_dev_has_removable_media(bs)) {
2031 if (drv->bdrv_getlength) {
2032 return drv->bdrv_getlength(bs);
2033 }
2034 }
2035 return bs->total_sectors * BDRV_SECTOR_SIZE;
2036 }
2037
2038 /* return 0 as number of sectors if no device present or error */
2039 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
2040 {
2041 int64_t length;
2042 length = bdrv_getlength(bs);
2043 if (length < 0)
2044 length = 0;
2045 else
2046 length = length >> BDRV_SECTOR_BITS;
2047 *nb_sectors_ptr = length;
2048 }
2049
2050 struct partition {
2051 uint8_t boot_ind; /* 0x80 - active */
2052 uint8_t head; /* starting head */
2053 uint8_t sector; /* starting sector */
2054 uint8_t cyl; /* starting cylinder */
2055 uint8_t sys_ind; /* What partition type */
2056 uint8_t end_head; /* end head */
2057 uint8_t end_sector; /* end sector */
2058 uint8_t end_cyl; /* end cylinder */
2059 uint32_t start_sect; /* starting sector counting from 0 */
2060 uint32_t nr_sects; /* nr of sectors in partition */
2061 } QEMU_PACKED;
2062
2063 /* Try to guess the logical disk geometry from the MSDOS partition table. Return 0 if OK, -1 if it could not be guessed. */
2064 static int guess_disk_lchs(BlockDriverState *bs,
2065 int *pcylinders, int *pheads, int *psectors)
2066 {
2067 uint8_t buf[BDRV_SECTOR_SIZE];
2068 int ret, i, heads, sectors, cylinders;
2069 struct partition *p;
2070 uint32_t nr_sects;
2071 uint64_t nb_sectors;
2072 bool enabled;
2073
2074 bdrv_get_geometry(bs, &nb_sectors);
2075
2076 /**
2077 * This function is invoked during startup not only in sync I/O mode,
2078 * but also in async I/O mode, so I/O throttling has to be disabled
2079 * temporarily here, not permanently.
2080 */
2081 enabled = bs->io_limits_enabled;
2082 bs->io_limits_enabled = false;
2083 ret = bdrv_read(bs, 0, buf, 1);
2084 bs->io_limits_enabled = enabled;
2085 if (ret < 0)
2086 return -1;
2087 /* test msdos magic */
2088 if (buf[510] != 0x55 || buf[511] != 0xaa)
2089 return -1;
2090 for(i = 0; i < 4; i++) {
2091 p = ((struct partition *)(buf + 0x1be)) + i;
2092 nr_sects = le32_to_cpu(p->nr_sects);
2093 if (nr_sects && p->end_head) {
2094 /* We make the assumption that the partition terminates on
2095 a cylinder boundary */
2096 heads = p->end_head + 1;
2097 sectors = p->end_sector & 63;
2098 if (sectors == 0)
2099 continue;
2100 cylinders = nb_sectors / (heads * sectors);
2101 if (cylinders < 1 || cylinders > 16383)
2102 continue;
2103 *pheads = heads;
2104 *psectors = sectors;
2105 *pcylinders = cylinders;
2106 #if 0
2107 printf("guessed geometry: LCHS=%d %d %d\n",
2108 cylinders, heads, sectors);
2109 #endif
2110 return 0;
2111 }
2112 }
2113 return -1;
2114 }
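
/*
 * Worked example of the heuristic above (illustrative numbers): a partition
 * entry with end_head = 15 and end_sector = 63 yields heads = 16 and
 * sectors = 63. On a 2097152-sector disk this gives
 * cylinders = 2097152 / (16 * 63) = 2080, which lies within [1, 16383]
 * and is therefore accepted as the guessed LCHS geometry.
 */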
2115
2116 void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
2117 {
2118 int translation, lba_detected = 0;
2119 int cylinders, heads, secs;
2120 uint64_t nb_sectors;
2121
2122 /* if a geometry hint is available, use it */
2123 bdrv_get_geometry(bs, &nb_sectors);
2124 bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
2125 translation = bdrv_get_translation_hint(bs);
2126 if (cylinders != 0) {
2127 *pcyls = cylinders;
2128 *pheads = heads;
2129 *psecs = secs;
2130 } else {
2131 if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
2132 if (heads > 16) {
2133 /* if heads > 16, it means that a BIOS LBA
2134 translation was active, so the default
2135 hardware geometry is OK */
2136 lba_detected = 1;
2137 goto default_geometry;
2138 } else {
2139 *pcyls = cylinders;
2140 *pheads = heads;
2141 *psecs = secs;
2142 /* disable any translation to be in sync with
2143 the logical geometry */
2144 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
2145 bdrv_set_translation_hint(bs,
2146 BIOS_ATA_TRANSLATION_NONE);
2147 }
2148 }
2149 } else {
2150 default_geometry:
2151 /* if no geometry, use a standard physical disk geometry */
2152 cylinders = nb_sectors / (16 * 63);
2153
2154 if (cylinders > 16383)
2155 cylinders = 16383;
2156 else if (cylinders < 2)
2157 cylinders = 2;
2158 *pcyls = cylinders;
2159 *pheads = 16;
2160 *psecs = 63;
2161 if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
2162 if ((*pcyls * *pheads) <= 131072) {
2163 bdrv_set_translation_hint(bs,
2164 BIOS_ATA_TRANSLATION_LARGE);
2165 } else {
2166 bdrv_set_translation_hint(bs,
2167 BIOS_ATA_TRANSLATION_LBA);
2168 }
2169 }
2170 }
2171 bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
2172 }
2173 }
2174
2175 void bdrv_set_geometry_hint(BlockDriverState *bs,
2176 int cyls, int heads, int secs)
2177 {
2178 bs->cyls = cyls;
2179 bs->heads = heads;
2180 bs->secs = secs;
2181 }
2182
2183 void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
2184 {
2185 bs->translation = translation;
2186 }
2187
2188 void bdrv_get_geometry_hint(BlockDriverState *bs,
2189 int *pcyls, int *pheads, int *psecs)
2190 {
2191 *pcyls = bs->cyls;
2192 *pheads = bs->heads;
2193 *psecs = bs->secs;
2194 }
2195
2196 /* throttling disk io limits */
2197 void bdrv_set_io_limits(BlockDriverState *bs,
2198 BlockIOLimit *io_limits)
2199 {
2200 bs->io_limits = *io_limits;
2201 bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2202 }
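
/*
 * Minimal configuration sketch (illustrative): cap a drive at 1 MB/s of
 * total throughput and 100 write operations per second. The BLOCK_IO_LIMIT_*
 * indices are the same ones used by the throttling code below.
 */
#if 0
static void example_set_limits(BlockDriverState *bs)
{
    BlockIOLimit limits;

    memset(&limits, 0, sizeof(limits));
    limits.bps[BLOCK_IO_LIMIT_TOTAL]  = 1024 * 1024;  /* bytes per second */
    limits.iops[BLOCK_IO_LIMIT_WRITE] = 100;          /* writes per second */
    bdrv_set_io_limits(bs, &limits);  /* sets io_limits_enabled */
}
#endif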
2203
2204 /* Recognize floppy formats */
2205 typedef struct FDFormat {
2206 FDriveType drive;
2207 uint8_t last_sect;
2208 uint8_t max_track;
2209 uint8_t max_head;
2210 FDriveRate rate;
2211 } FDFormat;
2212
2213 static const FDFormat fd_formats[] = {
2214 /* First entry is default format */
2215 /* 1.44 MB 3"1/2 floppy disks */
2216 { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
2217 { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
2218 { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
2219 { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
2220 { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
2221 { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
2222 { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
2223 { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
2224 /* 2.88 MB 3"1/2 floppy disks */
2225 { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
2226 { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
2227 { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
2228 { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
2229 { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
2230 /* 720 kB 3"1/2 floppy disks */
2231 { FDRIVE_DRV_144, 9, 80, 1, FDRIVE_RATE_250K, },
2232 { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
2233 { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
2234 { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
2235 { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
2236 { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
2237 /* 1.2 MB 5"1/4 floppy disks */
2238 { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
2239 { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
2240 { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
2241 { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
2242 { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
2243 /* 720 kB 5"1/4 floppy disks */
2244 { FDRIVE_DRV_120, 9, 80, 1, FDRIVE_RATE_250K, },
2245 { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
2246 /* 360 kB 5"1/4 floppy disks */
2247 { FDRIVE_DRV_120, 9, 40, 1, FDRIVE_RATE_300K, },
2248 { FDRIVE_DRV_120, 9, 40, 0, FDRIVE_RATE_300K, },
2249 { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
2250 { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
2251 /* 320 kB 5"1/4 floppy disks */
2252 { FDRIVE_DRV_120, 8, 40, 1, FDRIVE_RATE_250K, },
2253 { FDRIVE_DRV_120, 8, 40, 0, FDRIVE_RATE_250K, },
2254 /* 360 kB must match 5"1/4 better than 3"1/2... */
2255 { FDRIVE_DRV_144, 9, 80, 0, FDRIVE_RATE_250K, },
2256 /* end */
2257 { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
2258 };
2259
2260 void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
2261 int *max_track, int *last_sect,
2262 FDriveType drive_in, FDriveType *drive,
2263 FDriveRate *rate)
2264 {
2265 const FDFormat *parse;
2266 uint64_t nb_sectors, size;
2267 int i, first_match, match;
2268
2269 bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
2270 if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
2271 /* User defined disk */
2272 *rate = FDRIVE_RATE_500K;
2273 } else {
2274 bdrv_get_geometry(bs, &nb_sectors);
2275 match = -1;
2276 first_match = -1;
2277 for (i = 0; ; i++) {
2278 parse = &fd_formats[i];
2279 if (parse->drive == FDRIVE_DRV_NONE) {
2280 break;
2281 }
2282 if (drive_in == parse->drive ||
2283 drive_in == FDRIVE_DRV_NONE) {
2284 size = (parse->max_head + 1) * parse->max_track *
2285 parse->last_sect;
2286 if (nb_sectors == size) {
2287 match = i;
2288 break;
2289 }
2290 if (first_match == -1) {
2291 first_match = i;
2292 }
2293 }
2294 }
2295 if (match == -1) {
2296 if (first_match == -1) {
2297 match = 1;
2298 } else {
2299 match = first_match;
2300 }
2301 parse = &fd_formats[match];
2302 }
2303 *nb_heads = parse->max_head + 1;
2304 *max_track = parse->max_track;
2305 *last_sect = parse->last_sect;
2306 *drive = parse->drive;
2307 *rate = parse->rate;
2308 }
2309 }
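
/*
 * Worked example (illustrative): a standard 1.44 MB image contains 2880
 * sectors, which matches the first table entry exactly:
 * (max_head + 1) * max_track * last_sect = 2 * 80 * 18 = 2880.
 */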
2310
2311 int bdrv_get_translation_hint(BlockDriverState *bs)
2312 {
2313 return bs->translation;
2314 }
2315
2316 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
2317 BlockErrorAction on_write_error)
2318 {
2319 bs->on_read_error = on_read_error;
2320 bs->on_write_error = on_write_error;
2321 }
2322
2323 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
2324 {
2325 return is_read ? bs->on_read_error : bs->on_write_error;
2326 }
2327
2328 int bdrv_is_read_only(BlockDriverState *bs)
2329 {
2330 return bs->read_only;
2331 }
2332
2333 int bdrv_is_sg(BlockDriverState *bs)
2334 {
2335 return bs->sg;
2336 }
2337
2338 int bdrv_enable_write_cache(BlockDriverState *bs)
2339 {
2340 return bs->enable_write_cache;
2341 }
2342
2343 int bdrv_is_encrypted(BlockDriverState *bs)
2344 {
2345 if (bs->backing_hd && bs->backing_hd->encrypted)
2346 return 1;
2347 return bs->encrypted;
2348 }
2349
2350 int bdrv_key_required(BlockDriverState *bs)
2351 {
2352 BlockDriverState *backing_hd = bs->backing_hd;
2353
2354 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2355 return 1;
2356 return (bs->encrypted && !bs->valid_key);
2357 }
2358
2359 int bdrv_set_key(BlockDriverState *bs, const char *key)
2360 {
2361 int ret;
2362 if (bs->backing_hd && bs->backing_hd->encrypted) {
2363 ret = bdrv_set_key(bs->backing_hd, key);
2364 if (ret < 0)
2365 return ret;
2366 if (!bs->encrypted)
2367 return 0;
2368 }
2369 if (!bs->encrypted) {
2370 return -EINVAL;
2371 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2372 return -ENOMEDIUM;
2373 }
2374 ret = bs->drv->bdrv_set_key(bs, key);
2375 if (ret < 0) {
2376 bs->valid_key = 0;
2377 } else if (!bs->valid_key) {
2378 bs->valid_key = 1;
2379 /* call the change callback now, we skipped it on open */
2380 bdrv_dev_change_media_cb(bs, true);
2381 }
2382 return ret;
2383 }
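
/*
 * Usage sketch (illustrative): unlocking an encrypted image. As implemented
 * above, the key is first applied to an encrypted backing file, and a
 * failed attempt clears valid_key so the caller may retry.
 */
#if 0
static int example_unlock(BlockDriverState *bs, const char *key)
{
    if (bdrv_key_required(bs)) {
        return bdrv_set_key(bs, key);  /* < 0 leaves the device locked */
    }
    return 0;
}
#endif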
2384
2385 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
2386 {
2387 if (!bs->drv) {
2388 buf[0] = '\0';
2389 } else {
2390 pstrcpy(buf, buf_size, bs->drv->format_name);
2391 }
2392 }
2393
2394 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2395 void *opaque)
2396 {
2397 BlockDriver *drv;
2398
2399 QLIST_FOREACH(drv, &bdrv_drivers, list) {
2400 it(opaque, drv->format_name);
2401 }
2402 }
2403
2404 BlockDriverState *bdrv_find(const char *name)
2405 {
2406 BlockDriverState *bs;
2407
2408 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2409 if (!strcmp(name, bs->device_name)) {
2410 return bs;
2411 }
2412 }
2413 return NULL;
2414 }
2415
2416 BlockDriverState *bdrv_next(BlockDriverState *bs)
2417 {
2418 if (!bs) {
2419 return QTAILQ_FIRST(&bdrv_states);
2420 }
2421 return QTAILQ_NEXT(bs, list);
2422 }
2423
2424 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
2425 {
2426 BlockDriverState *bs;
2427
2428 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2429 it(opaque, bs);
2430 }
2431 }
2432
2433 const char *bdrv_get_device_name(BlockDriverState *bs)
2434 {
2435 return bs->device_name;
2436 }
2437
2438 void bdrv_flush_all(void)
2439 {
2440 BlockDriverState *bs;
2441
2442 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2443 bdrv_flush(bs);
2444 }
2445 }
2446
2447 int bdrv_has_zero_init(BlockDriverState *bs)
2448 {
2449 assert(bs->drv);
2450
2451 if (bs->drv->bdrv_has_zero_init) {
2452 return bs->drv->bdrv_has_zero_init(bs);
2453 }
2454
2455 return 1;
2456 }
2457
2458 typedef struct BdrvCoIsAllocatedData {
2459 BlockDriverState *bs;
2460 int64_t sector_num;
2461 int nb_sectors;
2462 int *pnum;
2463 int ret;
2464 bool done;
2465 } BdrvCoIsAllocatedData;
2466
2467 /*
2468 * Returns true iff the specified sector is present in the disk image. Drivers
2469 * not implementing the functionality are assumed to not support backing files,
2470 * hence all their sectors are reported as allocated.
2471 *
2472 * If 'sector_num' is beyond the end of the disk image the return value is 0
2473 * and 'pnum' is set to 0.
2474 *
2475 * 'pnum' is set to the number of sectors (including and immediately following
2476 * the specified sector) that are known to be in the same
2477 * allocated/unallocated state.
2478 *
2479 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
2480 * beyond the end of the disk image it will be clamped.
2481 */
2482 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2483 int nb_sectors, int *pnum)
2484 {
2485 int64_t n;
2486
2487 if (sector_num >= bs->total_sectors) {
2488 *pnum = 0;
2489 return 0;
2490 }
2491
2492 n = bs->total_sectors - sector_num;
2493 if (n < nb_sectors) {
2494 nb_sectors = n;
2495 }
2496
2497 if (!bs->drv->bdrv_co_is_allocated) {
2498 *pnum = nb_sectors;
2499 return 1;
2500 }
2501
2502 return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2503 }
2504
2505 /* Coroutine wrapper for bdrv_is_allocated() */
2506 static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2507 {
2508 BdrvCoIsAllocatedData *data = opaque;
2509 BlockDriverState *bs = data->bs;
2510
2511 data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2512 data->pnum);
2513 data->done = true;
2514 }
2515
2516 /*
2517 * Synchronous wrapper around bdrv_co_is_allocated().
2518 *
2519 * See bdrv_co_is_allocated() for details.
2520 */
2521 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2522 int *pnum)
2523 {
2524 Coroutine *co;
2525 BdrvCoIsAllocatedData data = {
2526 .bs = bs,
2527 .sector_num = sector_num,
2528 .nb_sectors = nb_sectors,
2529 .pnum = pnum,
2530 .done = false,
2531 };
2532
2533 co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2534 qemu_coroutine_enter(co, &data);
2535 while (!data.done) {
2536 qemu_aio_wait();
2537 }
2538 return data.ret;
2539 }
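
/*
 * Usage sketch (illustrative): walking the allocation map of an image with
 * the synchronous wrapper above. 'num' reports how far the current
 * allocated/unallocated run extends, so the loop can skip ahead in runs.
 */
#if 0
static void example_dump_allocation(BlockDriverState *bs)
{
    int64_t sector = 0;
    int64_t total = bs->total_sectors;
    int num, allocated;

    while (sector < total) {
        int n = total - sector > 65536 ? 65536 : (int)(total - sector);

        allocated = bdrv_is_allocated(bs, sector, n, &num);
        if (num == 0) {
            break;
        }
        printf("%" PRId64 "..%" PRId64 ": %s\n", sector, sector + num,
               allocated ? "allocated" : "unallocated");
        sector += num;
    }
}
#endif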
2540
2541 BlockInfoList *qmp_query_block(Error **errp)
2542 {
2543 BlockInfoList *head = NULL, *cur_item = NULL;
2544 BlockDriverState *bs;
2545
2546 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2547 BlockInfoList *info = g_malloc0(sizeof(*info));
2548
2549 info->value = g_malloc0(sizeof(*info->value));
2550 info->value->device = g_strdup(bs->device_name);
2551 info->value->type = g_strdup("unknown");
2552 info->value->locked = bdrv_dev_is_medium_locked(bs);
2553 info->value->removable = bdrv_dev_has_removable_media(bs);
2554
2555 if (bdrv_dev_has_removable_media(bs)) {
2556 info->value->has_tray_open = true;
2557 info->value->tray_open = bdrv_dev_is_tray_open(bs);
2558 }
2559
2560 if (bdrv_iostatus_is_enabled(bs)) {
2561 info->value->has_io_status = true;
2562 info->value->io_status = bs->iostatus;
2563 }
2564
2565 if (bs->drv) {
2566 info->value->has_inserted = true;
2567 info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2568 info->value->inserted->file = g_strdup(bs->filename);
2569 info->value->inserted->ro = bs->read_only;
2570 info->value->inserted->drv = g_strdup(bs->drv->format_name);
2571 info->value->inserted->encrypted = bs->encrypted;
2572 if (bs->backing_file[0]) {
2573 info->value->inserted->has_backing_file = true;
2574 info->value->inserted->backing_file = g_strdup(bs->backing_file);
2575 }
2576
2577 if (bs->io_limits_enabled) {
2578 info->value->inserted->bps =
2579 bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2580 info->value->inserted->bps_rd =
2581 bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2582 info->value->inserted->bps_wr =
2583 bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2584 info->value->inserted->iops =
2585 bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2586 info->value->inserted->iops_rd =
2587 bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2588 info->value->inserted->iops_wr =
2589 bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2590 }
2591 }
2592
2593 /* XXX: waiting for the qapi to support GSList */
2594 if (!cur_item) {
2595 head = cur_item = info;
2596 } else {
2597 cur_item->next = info;
2598 cur_item = info;
2599 }
2600 }
2601
2602 return head;
2603 }
2604
2605 /* Consider exposing this as a full fledged QMP command */
2606 static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2607 {
2608 BlockStats *s;
2609
2610 s = g_malloc0(sizeof(*s));
2611
2612 if (bs->device_name[0]) {
2613 s->has_device = true;
2614 s->device = g_strdup(bs->device_name);
2615 }
2616
2617 s->stats = g_malloc0(sizeof(*s->stats));
2618 s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2619 s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2620 s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2621 s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2622 s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2623 s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2624 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2625 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2626 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2627
2628 if (bs->file) {
2629 s->has_parent = true;
2630 s->parent = qmp_query_blockstat(bs->file, NULL);
2631 }
2632
2633 return s;
2634 }
2635
2636 BlockStatsList *qmp_query_blockstats(Error **errp)
2637 {
2638 BlockStatsList *head = NULL, *cur_item = NULL;
2639 BlockDriverState *bs;
2640
2641 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2642 BlockStatsList *info = g_malloc0(sizeof(*info));
2643 info->value = qmp_query_blockstat(bs, NULL);
2644
2645 /* XXX: waiting for the qapi to support GSList */
2646 if (!cur_item) {
2647 head = cur_item = info;
2648 } else {
2649 cur_item->next = info;
2650 cur_item = info;
2651 }
2652 }
2653
2654 return head;
2655 }
2656
2657 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2658 {
2659 if (bs->backing_hd && bs->backing_hd->encrypted)
2660 return bs->backing_file;
2661 else if (bs->encrypted)
2662 return bs->filename;
2663 else
2664 return NULL;
2665 }
2666
2667 void bdrv_get_backing_filename(BlockDriverState *bs,
2668 char *filename, int filename_size)
2669 {
2670 pstrcpy(filename, filename_size, bs->backing_file);
2671 }
2672
2673 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2674 const uint8_t *buf, int nb_sectors)
2675 {
2676 BlockDriver *drv = bs->drv;
2677 if (!drv)
2678 return -ENOMEDIUM;
2679 if (!drv->bdrv_write_compressed)
2680 return -ENOTSUP;
2681 if (bdrv_check_request(bs, sector_num, nb_sectors))
2682 return -EIO;
2683
2684 if (bs->dirty_bitmap) {
2685 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2686 }
2687
2688 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2689 }
2690
2691 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2692 {
2693 BlockDriver *drv = bs->drv;
2694 if (!drv)
2695 return -ENOMEDIUM;
2696 if (!drv->bdrv_get_info)
2697 return -ENOTSUP;
2698 memset(bdi, 0, sizeof(*bdi));
2699 return drv->bdrv_get_info(bs, bdi);
2700 }
2701
2702 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2703 int64_t pos, int size)
2704 {
2705 BlockDriver *drv = bs->drv;
2706 if (!drv)
2707 return -ENOMEDIUM;
2708 if (drv->bdrv_save_vmstate)
2709 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2710 if (bs->file)
2711 return bdrv_save_vmstate(bs->file, buf, pos, size);
2712 return -ENOTSUP;
2713 }
2714
2715 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2716 int64_t pos, int size)
2717 {
2718 BlockDriver *drv = bs->drv;
2719 if (!drv)
2720 return -ENOMEDIUM;
2721 if (drv->bdrv_load_vmstate)
2722 return drv->bdrv_load_vmstate(bs, buf, pos, size);
2723 if (bs->file)
2724 return bdrv_load_vmstate(bs->file, buf, pos, size);
2725 return -ENOTSUP;
2726 }
2727
2728 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2729 {
2730 BlockDriver *drv = bs->drv;
2731
2732 if (!drv || !drv->bdrv_debug_event) {
2733 return;
2734 }
2735
2736 return drv->bdrv_debug_event(bs, event);
2737
2738 }
2739
2740 /**************************************************************/
2741 /* handling of snapshots */
2742
2743 int bdrv_can_snapshot(BlockDriverState *bs)
2744 {
2745 BlockDriver *drv = bs->drv;
2746 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2747 return 0;
2748 }
2749
2750 if (!drv->bdrv_snapshot_create) {
2751 if (bs->file != NULL) {
2752 return bdrv_can_snapshot(bs->file);
2753 }
2754 return 0;
2755 }
2756
2757 return 1;
2758 }
2759
2760 int bdrv_is_snapshot(BlockDriverState *bs)
2761 {
2762 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2763 }
2764
2765 BlockDriverState *bdrv_snapshots(void)
2766 {
2767 BlockDriverState *bs;
2768
2769 if (bs_snapshots) {
2770 return bs_snapshots;
2771 }
2772
2773 bs = NULL;
2774 while ((bs = bdrv_next(bs))) {
2775 if (bdrv_can_snapshot(bs)) {
2776 bs_snapshots = bs;
2777 return bs;
2778 }
2779 }
2780 return NULL;
2781 }
2782
2783 int bdrv_snapshot_create(BlockDriverState *bs,
2784 QEMUSnapshotInfo *sn_info)
2785 {
2786 BlockDriver *drv = bs->drv;
2787 if (!drv)
2788 return -ENOMEDIUM;
2789 if (drv->bdrv_snapshot_create)
2790 return drv->bdrv_snapshot_create(bs, sn_info);
2791 if (bs->file)
2792 return bdrv_snapshot_create(bs->file, sn_info);
2793 return -ENOTSUP;
2794 }
2795
2796 int bdrv_snapshot_goto(BlockDriverState *bs,
2797 const char *snapshot_id)
2798 {
2799 BlockDriver *drv = bs->drv;
2800 int ret, open_ret;
2801
2802 if (!drv)
2803 return -ENOMEDIUM;
2804 if (drv->bdrv_snapshot_goto)
2805 return drv->bdrv_snapshot_goto(bs, snapshot_id);
2806
2807 if (bs->file) {
2808 drv->bdrv_close(bs);
2809 ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2810 open_ret = drv->bdrv_open(bs, bs->open_flags);
2811 if (open_ret < 0) {
2812 bdrv_delete(bs->file);
2813 bs->drv = NULL;
2814 return open_ret;
2815 }
2816 return ret;
2817 }
2818
2819 return -ENOTSUP;
2820 }
2821
2822 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2823 {
2824 BlockDriver *drv = bs->drv;
2825 if (!drv)
2826 return -ENOMEDIUM;
2827 if (drv->bdrv_snapshot_delete)
2828 return drv->bdrv_snapshot_delete(bs, snapshot_id);
2829 if (bs->file)
2830 return bdrv_snapshot_delete(bs->file, snapshot_id);
2831 return -ENOTSUP;
2832 }
2833
2834 int bdrv_snapshot_list(BlockDriverState *bs,
2835 QEMUSnapshotInfo **psn_info)
2836 {
2837 BlockDriver *drv = bs->drv;
2838 if (!drv)
2839 return -ENOMEDIUM;
2840 if (drv->bdrv_snapshot_list)
2841 return drv->bdrv_snapshot_list(bs, psn_info);
2842 if (bs->file)
2843 return bdrv_snapshot_list(bs->file, psn_info);
2844 return -ENOTSUP;
2845 }
2846
2847 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2848 const char *snapshot_name)
2849 {
2850 BlockDriver *drv = bs->drv;
2851 if (!drv) {
2852 return -ENOMEDIUM;
2853 }
2854 if (!bs->read_only) {
2855 return -EINVAL;
2856 }
2857 if (drv->bdrv_snapshot_load_tmp) {
2858 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2859 }
2860 return -ENOTSUP;
2861 }
2862
2863 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2864 const char *backing_file)
2865 {
2866 if (!bs->drv) {
2867 return NULL;
2868 }
2869
2870 if (bs->backing_hd) {
2871 if (strcmp(bs->backing_file, backing_file) == 0) {
2872 return bs->backing_hd;
2873 } else {
2874 return bdrv_find_backing_image(bs->backing_hd, backing_file);
2875 }
2876 }
2877
2878 return NULL;
2879 }
2880
2881 #define NB_SUFFIXES 4
2882
2883 char *get_human_readable_size(char *buf, int buf_size, int64_t size)
2884 {
2885 static const char suffixes[NB_SUFFIXES] = "KMGT";
2886 int64_t base;
2887 int i;
2888
2889 if (size <= 999) {
2890 snprintf(buf, buf_size, "%" PRId64, size);
2891 } else {
2892 base = 1024;
2893 for(i = 0; i < NB_SUFFIXES; i++) {
2894 if (size < (10 * base)) {
2895 snprintf(buf, buf_size, "%0.1f%c",
2896 (double)size / base,
2897 suffixes[i]);
2898 break;
2899 } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
2900 snprintf(buf, buf_size, "%" PRId64 "%c",
2901 ((size + (base >> 1)) / base),
2902 suffixes[i]);
2903 break;
2904 }
2905 base = base * 1024;
2906 }
2907 }
2908 return buf;
2909 }
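
/*
 * Worked examples of the formatting above (illustrative): 999 prints as
 * "999"; 1536 prints as "1.5K" (1536 / 1024); 500000 prints as "488K"
 * ((500000 + 512) / 1024, rounded); 1048576 prints as "1.0M".
 */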
2910
2911 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2912 {
2913 char buf1[128], date_buf[128], clock_buf[128];
2914 #ifdef _WIN32
2915 struct tm *ptm;
2916 #else
2917 struct tm tm;
2918 #endif
2919 time_t ti;
2920 int64_t secs;
2921
2922 if (!sn) {
2923 snprintf(buf, buf_size,
2924 "%-10s%-20s%7s%20s%15s",
2925 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2926 } else {
2927 ti = sn->date_sec;
2928 #ifdef _WIN32
2929 ptm = localtime(&ti);
2930 strftime(date_buf, sizeof(date_buf),
2931 "%Y-%m-%d %H:%M:%S", ptm);
2932 #else
2933 localtime_r(&ti, &tm);
2934 strftime(date_buf, sizeof(date_buf),
2935 "%Y-%m-%d %H:%M:%S", &tm);
2936 #endif
2937 secs = sn->vm_clock_nsec / 1000000000;
2938 snprintf(clock_buf, sizeof(clock_buf),
2939 "%02d:%02d:%02d.%03d",
2940 (int)(secs / 3600),
2941 (int)((secs / 60) % 60),
2942 (int)(secs % 60),
2943 (int)((sn->vm_clock_nsec / 1000000) % 1000));
2944 snprintf(buf, buf_size,
2945 "%-10s%-20s%7s%20s%15s",
2946 sn->id_str, sn->name,
2947 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2948 date_buf,
2949 clock_buf);
2950 }
2951 return buf;
2952 }
2953
2954 /**************************************************************/
2955 /* async I/Os */
2956
2957 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2958 QEMUIOVector *qiov, int nb_sectors,
2959 BlockDriverCompletionFunc *cb, void *opaque)
2960 {
2961 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2962
2963 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2964 cb, opaque, false);
2965 }
2966
2967 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2968 QEMUIOVector *qiov, int nb_sectors,
2969 BlockDriverCompletionFunc *cb, void *opaque)
2970 {
2971 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2972
2973 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2974 cb, opaque, true);
2975 }
2976
2977
2978 typedef struct MultiwriteCB {
2979 int error;
2980 int num_requests;
2981 int num_callbacks;
2982 struct {
2983 BlockDriverCompletionFunc *cb;
2984 void *opaque;
2985 QEMUIOVector *free_qiov;
2986 } callbacks[];
2987 } MultiwriteCB;
2988
2989 static void multiwrite_user_cb(MultiwriteCB *mcb)
2990 {
2991 int i;
2992
2993 for (i = 0; i < mcb->num_callbacks; i++) {
2994 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2995 if (mcb->callbacks[i].free_qiov) {
2996 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2997 }
2998 g_free(mcb->callbacks[i].free_qiov);
2999 }
3000 }
3001
3002 static void multiwrite_cb(void *opaque, int ret)
3003 {
3004 MultiwriteCB *mcb = opaque;
3005
3006 trace_multiwrite_cb(mcb, ret);
3007
3008 if (ret < 0 && !mcb->error) {
3009 mcb->error = ret;
3010 }
3011
3012 mcb->num_requests--;
3013 if (mcb->num_requests == 0) {
3014 multiwrite_user_cb(mcb);
3015 g_free(mcb);
3016 }
3017 }
3018
3019 static int multiwrite_req_compare(const void *a, const void *b)
3020 {
3021 const BlockRequest *req1 = a, *req2 = b;
3022
3023 /*
3024 * Note that we can't simply subtract req2->sector from req1->sector
3025 * here as that could overflow the return value.
3026 */
3027 if (req1->sector > req2->sector) {
3028 return 1;
3029 } else if (req1->sector < req2->sector) {
3030 return -1;
3031 } else {
3032 return 0;
3033 }
3034 }
3035
3036 /*
3037 * Takes a bunch of requests and tries to merge them. Returns the number of
3038 * requests that remain after merging.
3039 */
3040 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3041 int num_reqs, MultiwriteCB *mcb)
3042 {
3043 int i, outidx;
3044
3045 // Sort requests by start sector
3046 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3047
3048 // Check if adjacent requests touch the same clusters. If so, combine them,
3049 // filling up gaps with zero sectors.
3050 outidx = 0;
3051 for (i = 1; i < num_reqs; i++) {
3052 int merge = 0;
3053 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3054
3055 // Handle exactly sequential writes and overlapping writes.
3056 if (reqs[i].sector <= oldreq_last) {
3057 merge = 1;
3058 }
3059
3060 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3061 merge = 0;
3062 }
3063
3064 if (merge) {
3065 size_t size;
3066 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3067 qemu_iovec_init(qiov,
3068 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3069
3070 // Add the first request to the merged one. If the requests are
3071 // overlapping, drop the last sectors of the first request.
3072 size = (reqs[i].sector - reqs[outidx].sector) << 9;
3073 qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
3074
3075 // We should never need to add any zeros between the two requests
3076 assert (reqs[i].sector <= oldreq_last);
3077
3078 // Add the second request
3079 qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
3080
3081 reqs[outidx].nb_sectors = qiov->size >> 9;
3082 reqs[outidx].qiov = qiov;
3083
3084 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3085 } else {
3086 outidx++;
3087 reqs[outidx].sector = reqs[i].sector;
3088 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3089 reqs[outidx].qiov = reqs[i].qiov;
3090 }
3091 }
3092
3093 return outidx + 1;
3094 }
3095
3096 /*
3097 * Submit multiple AIO write requests at once.
3098 *
3099 * On success, the function returns 0 and all requests in the reqs array have
3100 * been submitted. On error, this function returns -1, and some of the
3101 * requests may already have been submitted while others have not. In
3102 * particular, the callback will be invoked for some requests but not others.
3103 * The caller must check the error field of each BlockRequest to know which
3104 * callbacks to wait for (if error != 0, no callback will be called).
3105 *
3106 * The implementation may modify the contents of the reqs array, e.g. to merge
3107 * requests. However, the fields opaque and error are left unmodified as they
3108 * are used to signal failure for a single request to the caller.
3109 */
3110 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3111 {
3112 MultiwriteCB *mcb;
3113 int i;
3114
3115 /* don't submit writes if we don't have a medium */
3116 if (bs->drv == NULL) {
3117 for (i = 0; i < num_reqs; i++) {
3118 reqs[i].error = -ENOMEDIUM;
3119 }
3120 return -1;
3121 }
3122
3123 if (num_reqs == 0) {
3124 return 0;
3125 }
3126
3127 // Create MultiwriteCB structure
3128 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3129 mcb->num_requests = 0;
3130 mcb->num_callbacks = num_reqs;
3131
3132 for (i = 0; i < num_reqs; i++) {
3133 mcb->callbacks[i].cb = reqs[i].cb;
3134 mcb->callbacks[i].opaque = reqs[i].opaque;
3135 }
3136
3137 // Check for mergeable requests
3138 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3139
3140 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3141
3142 /* Run the aio requests. */
3143 mcb->num_requests = num_reqs;
3144 for (i = 0; i < num_reqs; i++) {
3145 bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3146 reqs[i].nb_sectors, multiwrite_cb, mcb);
3147 }
3148
3149 return 0;
3150 }
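
/*
 * Usage sketch (illustrative; assumes two already-initialized
 * QEMUIOVectors and a caller-provided completion callback): submit two
 * writes in one batch and, on failure, check the per-request error field
 * as described in the comment above bdrv_aio_multiwrite().
 */
#if 0
static void example_multiwrite(BlockDriverState *bs, QEMUIOVector *qiov0,
                               QEMUIOVector *qiov1,
                               BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockRequest reqs[2] = {
        { .sector = 0,  .nb_sectors = qiov0->size >> 9, .qiov = qiov0,
          .cb = cb, .opaque = opaque },
        { .sector = 64, .nb_sectors = qiov1->size >> 9, .qiov = qiov1,
          .cb = cb, .opaque = opaque },
    };

    if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
        int i;
        for (i = 0; i < 2; i++) {
            if (reqs[i].error) {
                /* this request failed; its callback will not run */
            }
        }
    }
}
#endif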
3151
3152 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
3153 {
3154 acb->pool->cancel(acb);
3155 }
3156
3157 /* block I/O throttling */
3158 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3159 bool is_write, double elapsed_time, uint64_t *wait)
3160 {
3161 uint64_t bps_limit = 0;
3162 double bytes_limit, bytes_base, bytes_res;
3163 double slice_time, wait_time;
3164
3165 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3166 bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3167 } else if (bs->io_limits.bps[is_write]) {
3168 bps_limit = bs->io_limits.bps[is_write];
3169 } else {
3170 if (wait) {
3171 *wait = 0;
3172 }
3173
3174 return false;
3175 }
3176
3177 slice_time = bs->slice_end - bs->slice_start;
3178 slice_time /= (NANOSECONDS_PER_SECOND);
3179 bytes_limit = bps_limit * slice_time;
3180 bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3181 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3182 bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3183 }
3184
3185 /* bytes_base: the number of bytes already read/written, obtained
3186 * from the accounting history.
3187 * bytes_res: the remaining bytes of data which need to be read/written.
3188 * (bytes_base + bytes_res) / bps_limit: used to calculate
3189 * the total time for completing the reading/writing of all data.
3190 */
3191 bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
3192
3193 if (bytes_base + bytes_res <= bytes_limit) {
3194 if (wait) {
3195 *wait = 0;
3196 }
3197
3198 return false;
3199 }
3200
3201 /* Calc approx time to dispatch */
3202 wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3203
3204 /* When the I/O rate at runtime exceeds the limits,
3205 * bs->slice_end needs to be extended so that the current statistics
3206 * are kept until the timer fires; the factor used here was tuned
3207 * experimentally.
3208 */
3209 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3210 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3211 if (wait) {
3212 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3213 }
3214
3215 return true;
3216 }
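
/*
 * Worked example of the wait computation above (illustrative, assuming the
 * slice budget is already exceeded): with bps_limit = 1048576 (1 MB/s),
 * bytes_base = 262144 already transferred, a 131072-byte request and
 * elapsed_time = 0.1 s, the dispatch delay is
 * wait_time = (262144 + 131072) / 1048576 - 0.1 = 0.275 s.
 */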
3217
3218 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3219 double elapsed_time, uint64_t *wait)
3220 {
3221 uint64_t iops_limit = 0;
3222 double ios_limit, ios_base;
3223 double slice_time, wait_time;
3224
3225 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3226 iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3227 } else if (bs->io_limits.iops[is_write]) {
3228 iops_limit = bs->io_limits.iops[is_write];
3229 } else {
3230 if (wait) {
3231 *wait = 0;
3232 }
3233
3234 return false;
3235 }
3236
3237 slice_time = bs->slice_end - bs->slice_start;
3238 slice_time /= (NANOSECONDS_PER_SECOND);
3239 ios_limit = iops_limit * slice_time;
3240 ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3241 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3242 ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
3243 }
3244
3245 if (ios_base + 1 <= ios_limit) {
3246 if (wait) {
3247 *wait = 0;
3248 }
3249
3250 return false;
3251 }
3252
3253 /* Calc approx time to dispatch */
3254 wait_time = (ios_base + 1) / iops_limit;
3255 if (wait_time > elapsed_time) {
3256 wait_time = wait_time - elapsed_time;
3257 } else {
3258 wait_time = 0;
3259 }
3260
3261 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3262 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3263 if (wait) {
3264 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3265 }
3266
3267 return true;
3268 }
3269
3270 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3271 bool is_write, int64_t *wait)
3272 {
3273 int64_t now, max_wait;
3274 uint64_t bps_wait = 0, iops_wait = 0;
3275 double elapsed_time;
3276 int bps_ret, iops_ret;
3277
3278 now = qemu_get_clock_ns(vm_clock);
3279 if ((bs->slice_start < now)
3280 && (bs->slice_end > now)) {
3281 bs->slice_end = now + bs->slice_time;
3282 } else {
3283 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
3284 bs->slice_start = now;
3285 bs->slice_end = now + bs->slice_time;
3286
3287 bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
3288 bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3289
3290 bs->io_base.ios[is_write] = bs->nr_ops[is_write];
3291 bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
3292 }
3293
3294 elapsed_time = now - bs->slice_start;
3295 elapsed_time /= (NANOSECONDS_PER_SECOND);
3296
3297 bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
3298 is_write, elapsed_time, &bps_wait);
3299 iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3300 elapsed_time, &iops_wait);
3301 if (bps_ret || iops_ret) {
3302 max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3303 if (wait) {
3304 *wait = max_wait;
3305 }
3306
3307 now = qemu_get_clock_ns(vm_clock);
3308 if (bs->slice_end < now + max_wait) {
3309 bs->slice_end = now + max_wait;
3310 }
3311
3312 return true;
3313 }
3314
3315 if (wait) {
3316 *wait = 0;
3317 }
3318
3319 return false;
3320 }
3321
3322 /**************************************************************/
3323 /* async block device emulation */
3324
3325 typedef struct BlockDriverAIOCBSync {
3326 BlockDriverAIOCB common;
3327 QEMUBH *bh;
3328 int ret;
3329 /* vector translation state */
3330 QEMUIOVector *qiov;
3331 uint8_t *bounce;
3332 int is_write;
3333 } BlockDriverAIOCBSync;
3334
3335 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3336 {
3337 BlockDriverAIOCBSync *acb =
3338 container_of(blockacb, BlockDriverAIOCBSync, common);
3339 qemu_bh_delete(acb->bh);
3340 acb->bh = NULL;
3341 qemu_aio_release(acb);
3342 }
3343
3344 static AIOPool bdrv_em_aio_pool = {
3345 .aiocb_size = sizeof(BlockDriverAIOCBSync),
3346 .cancel = bdrv_aio_cancel_em,
3347 };
3348
3349 static void bdrv_aio_bh_cb(void *opaque)
3350 {
3351 BlockDriverAIOCBSync *acb = opaque;
3352
3353 if (!acb->is_write)
3354 qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
3355 qemu_vfree(acb->bounce);
3356 acb->common.cb(acb->common.opaque, acb->ret);
3357 qemu_bh_delete(acb->bh);
3358 acb->bh = NULL;
3359 qemu_aio_release(acb);
3360 }
3361
3362 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3363 int64_t sector_num,
3364 QEMUIOVector *qiov,
3365 int nb_sectors,
3366 BlockDriverCompletionFunc *cb,
3367 void *opaque,
3368 int is_write)
3369
3370 {
3371 BlockDriverAIOCBSync *acb;
3372
3373 acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3374 acb->is_write = is_write;
3375 acb->qiov = qiov;
3376 acb->bounce = qemu_blockalign(bs, qiov->size);
3377 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3378
3379 if (is_write) {
3380 qemu_iovec_to_buffer(acb->qiov, acb->bounce);
3381 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3382 } else {
3383 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3384 }
3385
3386 qemu_bh_schedule(acb->bh);
3387
3388 return &acb->common;
3389 }
3390
3391 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3392 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3393 BlockDriverCompletionFunc *cb, void *opaque)
3394 {
3395 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3396 }
3397
3398 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3399 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3400 BlockDriverCompletionFunc *cb, void *opaque)
3401 {
3402 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3403 }
3404
3405
3406 typedef struct BlockDriverAIOCBCoroutine {
3407 BlockDriverAIOCB common;
3408 BlockRequest req;
3409 bool is_write;
3410 QEMUBH* bh;
3411 } BlockDriverAIOCBCoroutine;
3412
3413 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3414 {
3415 qemu_aio_flush();
3416 }
3417
3418 static AIOPool bdrv_em_co_aio_pool = {
3419 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
3420 .cancel = bdrv_aio_co_cancel_em,
3421 };
3422
3423 static void bdrv_co_em_bh(void *opaque)
3424 {
3425 BlockDriverAIOCBCoroutine *acb = opaque;
3426
3427 acb->common.cb(acb->common.opaque, acb->req.error);
3428 qemu_bh_delete(acb->bh);
3429 qemu_aio_release(acb);
3430 }
3431
3432 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3433 static void coroutine_fn bdrv_co_do_rw(void *opaque)
3434 {
3435 BlockDriverAIOCBCoroutine *acb = opaque;
3436 BlockDriverState *bs = acb->common.bs;
3437
3438 if (!acb->is_write) {
3439 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3440 acb->req.nb_sectors, acb->req.qiov, 0);
3441 } else {
3442 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3443 acb->req.nb_sectors, acb->req.qiov, 0);
3444 }
3445
3446 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3447 qemu_bh_schedule(acb->bh);
3448 }
3449
3450 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3451 int64_t sector_num,
3452 QEMUIOVector *qiov,
3453 int nb_sectors,
3454 BlockDriverCompletionFunc *cb,
3455 void *opaque,
3456 bool is_write)
3457 {
3458 Coroutine *co;
3459 BlockDriverAIOCBCoroutine *acb;
3460
3461 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3462 acb->req.sector = sector_num;
3463 acb->req.nb_sectors = nb_sectors;
3464 acb->req.qiov = qiov;
3465 acb->is_write = is_write;
3466
3467 co = qemu_coroutine_create(bdrv_co_do_rw);
3468 qemu_coroutine_enter(co, acb);
3469
3470 return &acb->common;
3471 }
3472
3473 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
3474 {
3475 BlockDriverAIOCBCoroutine *acb = opaque;
3476 BlockDriverState *bs = acb->common.bs;
3477
3478 acb->req.error = bdrv_co_flush(bs);
3479 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3480 qemu_bh_schedule(acb->bh);
3481 }
3482
3483 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3484 BlockDriverCompletionFunc *cb, void *opaque)
3485 {
3486 trace_bdrv_aio_flush(bs, opaque);
3487
3488 Coroutine *co;
3489 BlockDriverAIOCBCoroutine *acb;
3490
3491 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3492 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3493 qemu_coroutine_enter(co, acb);
3494
3495 return &acb->common;
3496 }
3497
3498 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3499 {
3500 BlockDriverAIOCBCoroutine *acb = opaque;
3501 BlockDriverState *bs = acb->common.bs;
3502
3503 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3504 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3505 qemu_bh_schedule(acb->bh);
3506 }
3507
3508 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3509 int64_t sector_num, int nb_sectors,
3510 BlockDriverCompletionFunc *cb, void *opaque)
3511 {
3512 Coroutine *co;
3513 BlockDriverAIOCBCoroutine *acb;
3514
3515 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3516
3517 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3518 acb->req.sector = sector_num;
3519 acb->req.nb_sectors = nb_sectors;
3520 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3521 qemu_coroutine_enter(co, acb);
3522
3523 return &acb->common;
3524 }
3525
3526 void bdrv_init(void)
3527 {
3528 module_call_init(MODULE_INIT_BLOCK);
3529 }
3530
3531 void bdrv_init_with_whitelist(void)
3532 {
3533 use_bdrv_whitelist = 1;
3534 bdrv_init();
3535 }
3536
3537 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3538 BlockDriverCompletionFunc *cb, void *opaque)
3539 {
3540 BlockDriverAIOCB *acb;
3541
3542 if (pool->free_aiocb) {
3543 acb = pool->free_aiocb;
3544 pool->free_aiocb = acb->next;
3545 } else {
3546 acb = g_malloc0(pool->aiocb_size);
3547 acb->pool = pool;
3548 }
3549 acb->bs = bs;
3550 acb->cb = cb;
3551 acb->opaque = opaque;
3552 return acb;
3553 }
3554
3555 void qemu_aio_release(void *p)
3556 {
3557 BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3558 AIOPool *pool = acb->pool;
3559 acb->next = pool->free_aiocb;
3560 pool->free_aiocb = acb;
3561 }
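
/*
 * Sketch of a driver-specific AIOCB pool (illustrative; mirrors
 * bdrv_em_aio_pool above). qemu_aio_get() reuses a free AIOCB when one is
 * available and otherwise allocates aiocb_size bytes; qemu_aio_release()
 * returns the AIOCB to the pool's free list.
 */
#if 0
typedef struct MyAIOCB {
    BlockDriverAIOCB common;  /* must be the first member */
    int my_state;
} MyAIOCB;

static void my_aio_cancel(BlockDriverAIOCB *blockacb)
{
    MyAIOCB *acb = container_of(blockacb, MyAIOCB, common);
    /* undo any in-flight work for acb, then release it */
    qemu_aio_release(acb);
}

static AIOPool my_aio_pool = {
    .aiocb_size = sizeof(MyAIOCB),
    .cancel     = my_aio_cancel,
};
#endif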
3562
3563 /**************************************************************/
3564 /* Coroutine block device emulation */
3565
3566 typedef struct CoroutineIOCompletion {
3567 Coroutine *coroutine;
3568 int ret;
3569 } CoroutineIOCompletion;
3570
3571 static void bdrv_co_io_em_complete(void *opaque, int ret)
3572 {
3573 CoroutineIOCompletion *co = opaque;
3574
3575 co->ret = ret;
3576 qemu_coroutine_enter(co->coroutine, NULL);
3577 }
3578
3579 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3580 int nb_sectors, QEMUIOVector *iov,
3581 bool is_write)
3582 {
3583 CoroutineIOCompletion co = {
3584 .coroutine = qemu_coroutine_self(),
3585 };
3586 BlockDriverAIOCB *acb;
3587
3588 if (is_write) {
3589 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3590 bdrv_co_io_em_complete, &co);
3591 } else {
3592 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3593 bdrv_co_io_em_complete, &co);
3594 }
3595
3596 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3597 if (!acb) {
3598 return -EIO;
3599 }
3600 qemu_coroutine_yield();
3601
3602 return co.ret;
3603 }
3604
3605 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3606 int64_t sector_num, int nb_sectors,
3607 QEMUIOVector *iov)
3608 {
3609 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3610 }
3611
3612 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3613 int64_t sector_num, int nb_sectors,
3614 QEMUIOVector *iov)
3615 {
3616 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3617 }
3618
3619 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
3620 {
3621 RwCo *rwco = opaque;
3622
3623 rwco->ret = bdrv_co_flush(rwco->bs);
3624 }
3625
3626 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3627 {
3628 int ret;
3629
3630 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
3631 return 0;
3632 }
3633
3634 /* Write back cached data to the OS even with cache=unsafe */
3635 if (bs->drv->bdrv_co_flush_to_os) {
3636 ret = bs->drv->bdrv_co_flush_to_os(bs);
3637 if (ret < 0) {
3638 return ret;
3639 }
3640 }
3641
3642 /* But don't actually force it to the disk with cache=unsafe */
3643 if (bs->open_flags & BDRV_O_NO_FLUSH) {
3644 return 0;
3645 }
3646
3647 if (bs->drv->bdrv_co_flush_to_disk) {
3648 ret = bs->drv->bdrv_co_flush_to_disk(bs);
3649 } else if (bs->drv->bdrv_aio_flush) {
3650 BlockDriverAIOCB *acb;
3651 CoroutineIOCompletion co = {
3652 .coroutine = qemu_coroutine_self(),
3653 };
3654
3655 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3656 if (acb == NULL) {
3657 ret = -EIO;
3658 } else {
3659 qemu_coroutine_yield();
3660 ret = co.ret;
3661 }
3662 } else {
3663 /*
3664 * Some block drivers always operate in either writethrough or unsafe
3665 * mode and don't support bdrv_flush therefore. Usually qemu doesn't
3666 * know how the server works (because the behaviour is hardcoded or
3667 * depends on server-side configuration), so we can't ensure that
3668 * everything is safe on disk. Returning an error doesn't work because
3669 * that would break guests even if the server operates in writethrough
3670 * mode.
3671 *
3672 * Let's hope the user knows what he's doing.
3673 */
3674 ret = 0;
3675 }
3676 if (ret < 0) {
3677 return ret;
3678 }
3679
3680 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
3681 * in the case of cache=unsafe, so there are no useless flushes.
3682 */
3683 return bdrv_co_flush(bs->file);
3684 }
3685
3686 void bdrv_invalidate_cache(BlockDriverState *bs)
3687 {
3688 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3689 bs->drv->bdrv_invalidate_cache(bs);
3690 }
3691 }
3692
3693 void bdrv_invalidate_cache_all(void)
3694 {
3695 BlockDriverState *bs;
3696
3697 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3698 bdrv_invalidate_cache(bs);
3699 }
3700 }
3701
3702 void bdrv_clear_incoming_migration_all(void)
3703 {
3704 BlockDriverState *bs;
3705
3706 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3707 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
3708 }
3709 }
3710
3711 int bdrv_flush(BlockDriverState *bs)
3712 {
3713 Coroutine *co;
3714 RwCo rwco = {
3715 .bs = bs,
3716 .ret = NOT_DONE,
3717 };
3718
3719 if (qemu_in_coroutine()) {
3720 /* Fast-path if already in coroutine context */
3721 bdrv_flush_co_entry(&rwco);
3722 } else {
3723 co = qemu_coroutine_create(bdrv_flush_co_entry);
3724 qemu_coroutine_enter(co, &rwco);
3725 while (rwco.ret == NOT_DONE) {
3726 qemu_aio_wait();
3727 }
3728 }
3729
3730 return rwco.ret;
3731 }
3732
3733 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3734 {
3735 RwCo *rwco = opaque;
3736
3737 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
3738 }
3739
3740 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3741 int nb_sectors)
3742 {
3743 if (!bs->drv) {
3744 return -ENOMEDIUM;
3745 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3746 return -EIO;
3747 } else if (bs->read_only) {
3748 return -EROFS;
3749 } else if (bs->drv->bdrv_co_discard) {
3750 return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3751 } else if (bs->drv->bdrv_aio_discard) {
3752 BlockDriverAIOCB *acb;
3753 CoroutineIOCompletion co = {
3754 .coroutine = qemu_coroutine_self(),
3755 };
3756
3757 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3758 bdrv_co_io_em_complete, &co);
3759 if (acb == NULL) {
3760 return -EIO;
3761 } else {
3762 qemu_coroutine_yield();
3763 return co.ret;
3764 }
3765 } else {
3766 return 0;
3767 }
3768 }
3769
3770 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3771 {
3772 Coroutine *co;
3773 RwCo rwco = {
3774 .bs = bs,
3775 .sector_num = sector_num,
3776 .nb_sectors = nb_sectors,
3777 .ret = NOT_DONE,
3778 };
3779
3780 if (qemu_in_coroutine()) {
3781 /* Fast-path if already in coroutine context */
3782 bdrv_discard_co_entry(&rwco);
3783 } else {
3784 co = qemu_coroutine_create(bdrv_discard_co_entry);
3785 qemu_coroutine_enter(co, &rwco);
3786 while (rwco.ret == NOT_DONE) {
3787 qemu_aio_wait();
3788 }
3789 }
3790
3791 return rwco.ret;
3792 }
3793
3794 /**************************************************************/
3795 /* removable device support */
3796
3797 /**
3798 * Return TRUE if the media is present
3799 */
3800 int bdrv_is_inserted(BlockDriverState *bs)
3801 {
3802 BlockDriver *drv = bs->drv;
3803
3804 if (!drv)
3805 return 0;
3806 if (!drv->bdrv_is_inserted)
3807 return 1;
3808 return drv->bdrv_is_inserted(bs);
3809 }
3810
3811 /**
3812 * Return whether the media changed since the last call to this
3813 * function, or -ENOTSUP if we don't know. Most drivers don't know.
3814 */
3815 int bdrv_media_changed(BlockDriverState *bs)
3816 {
3817 BlockDriver *drv = bs->drv;
3818
3819 if (drv && drv->bdrv_media_changed) {
3820 return drv->bdrv_media_changed(bs);
3821 }
3822 return -ENOTSUP;
3823 }
3824
3825 /**
3826 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3827 */
3828 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
3829 {
3830 BlockDriver *drv = bs->drv;
3831
3832 if (drv && drv->bdrv_eject) {
3833 drv->bdrv_eject(bs, eject_flag);
3834 }
3835
3836 if (bs->device_name[0] != '\0') {
3837 bdrv_emit_qmp_eject_event(bs, eject_flag);
3838 }
3839 }
3840
3841 /**
3842 * Lock or unlock the media (if it is locked, the user won't be able
3843 * to eject it manually).
3844 */
3845 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3846 {
3847 BlockDriver *drv = bs->drv;
3848
3849 trace_bdrv_lock_medium(bs, locked);
3850
3851 if (drv && drv->bdrv_lock_medium) {
3852 drv->bdrv_lock_medium(bs, locked);
3853 }
3854 }
3855
3856 /* needed for generic scsi interface */
3857
3858 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3859 {
3860 BlockDriver *drv = bs->drv;
3861
3862 if (drv && drv->bdrv_ioctl)
3863 return drv->bdrv_ioctl(bs, req, buf);
3864 return -ENOTSUP;
3865 }
3866
3867 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3868 unsigned long int req, void *buf,
3869 BlockDriverCompletionFunc *cb, void *opaque)
3870 {
3871 BlockDriver *drv = bs->drv;
3872
3873 if (drv && drv->bdrv_aio_ioctl)
3874 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3875 return NULL;
3876 }
3877
3878 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3879 {
3880 bs->buffer_alignment = align;
3881 }
3882
3883 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3884 {
3885 return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3886 }
3887
3888 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3889 {
3890 int64_t bitmap_size;
3891
3892 bs->dirty_count = 0;
3893 if (enable) {
3894 if (!bs->dirty_bitmap) {
3895 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3896 BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
3897 bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
3898
3899 bs->dirty_bitmap = g_malloc0(bitmap_size);
3900 }
3901 } else {
3902 if (bs->dirty_bitmap) {
3903 g_free(bs->dirty_bitmap);
3904 bs->dirty_bitmap = NULL;
3905 }
3906 }
3907 }
3908
3909 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3910 {
3911 int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
3912
3913 if (bs->dirty_bitmap &&
3914 (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
3915 return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3916 (1UL << (chunk % (sizeof(unsigned long) * 8))));
3917 } else {
3918 return 0;
3919 }
3920 }
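
/*
 * Worked example of the bitmap indexing above (illustrative, assuming
 * BDRV_SECTORS_PER_DIRTY_CHUNK is 2048 and 64-bit longs): sector 1000000
 * belongs to chunk 488 (1000000 / 2048), which is bit 40 of
 * dirty_bitmap[7] (488 / 64 = 7, 488 % 64 = 40).
 */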
3921
3922 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
3923 int nr_sectors)
3924 {
3925 set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
3926 }
3927
3928 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
3929 {
3930 return bs->dirty_count;
3931 }
3932
3933 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
3934 {
3935 assert(bs->in_use != in_use);
3936 bs->in_use = in_use;
3937 }
3938
3939 int bdrv_in_use(BlockDriverState *bs)
3940 {
3941 return bs->in_use;
3942 }
3943
3944 void bdrv_iostatus_enable(BlockDriverState *bs)
3945 {
3946 bs->iostatus_enabled = true;
3947 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3948 }
3949
3950 /* The I/O status is only enabled if the drive explicitly
3951 * enables it _and_ the VM is configured to stop on errors */
3952 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3953 {
3954 return (bs->iostatus_enabled &&
3955 (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3956 bs->on_write_error == BLOCK_ERR_STOP_ANY ||
3957 bs->on_read_error == BLOCK_ERR_STOP_ANY));
3958 }
3959
3960 void bdrv_iostatus_disable(BlockDriverState *bs)
3961 {
3962 bs->iostatus_enabled = false;
3963 }
3964
3965 void bdrv_iostatus_reset(BlockDriverState *bs)
3966 {
3967 if (bdrv_iostatus_is_enabled(bs)) {
3968 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3969 }
3970 }
3971
3972 /* XXX: Today this is set by device models because it makes the implementation
3973 quite simple. However, the block layer knows about the error, so it's
3974 possible to implement this without device models being involved */
3975 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3976 {
3977 if (bdrv_iostatus_is_enabled(bs) &&
3978 bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
3979 assert(error >= 0);
3980 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
3981 BLOCK_DEVICE_IO_STATUS_FAILED;
3982 }
3983 }
3984
3985 void
3986 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
3987 enum BlockAcctType type)
3988 {
3989 assert(type < BDRV_MAX_IOTYPE);
3990
3991 cookie->bytes = bytes;
3992 cookie->start_time_ns = get_clock();
3993 cookie->type = type;
3994 }
3995
3996 void
3997 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
3998 {
3999 assert(cookie->type < BDRV_MAX_IOTYPE);
4000
4001 bs->nr_bytes[cookie->type] += cookie->bytes;
4002 bs->nr_ops[cookie->type]++;
4003 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
4004 }
4005
4006 int bdrv_img_create(const char *filename, const char *fmt,
4007 const char *base_filename, const char *base_fmt,
4008 char *options, uint64_t img_size, int flags)
4009 {
4010 QEMUOptionParameter *param = NULL, *create_options = NULL;
4011 QEMUOptionParameter *backing_fmt, *backing_file, *size;
4012 BlockDriverState *bs = NULL;
4013 BlockDriver *drv, *proto_drv;
4014 BlockDriver *backing_drv = NULL;
4015 int ret = 0;
4016
4017 /* Find driver and parse its options */
4018 drv = bdrv_find_format(fmt);
4019 if (!drv) {
4020 error_report("Unknown file format '%s'", fmt);
4021 ret = -EINVAL;
4022 goto out;
4023 }
4024
4025 proto_drv = bdrv_find_protocol(filename);
4026 if (!proto_drv) {
4027 error_report("Unknown protocol '%s'", filename);
4028 ret = -EINVAL;
4029 goto out;
4030 }
4031
4032 create_options = append_option_parameters(create_options,
4033 drv->create_options);
4034 create_options = append_option_parameters(create_options,
4035 proto_drv->create_options);
4036
4037 /* Create parameter list with default values */
4038 param = parse_option_parameters("", create_options, param);
4039
4040 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4041
4042 /* Parse -o options */
4043 if (options) {
4044 param = parse_option_parameters(options, create_options, param);
4045 if (param == NULL) {
4046 error_report("Invalid options for file format '%s'.", fmt);
4047 ret = -EINVAL;
4048 goto out;
4049 }
4050 }
4051
4052 if (base_filename) {
4053 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4054 base_filename)) {
4055 error_report("Backing file not supported for file format '%s'",
4056 fmt);
4057 ret = -EINVAL;
4058 goto out;
4059 }
4060 }
4061
4062 if (base_fmt) {
4063 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4064 error_report("Backing file format not supported for file "
4065 "format '%s'", fmt);
4066 ret = -EINVAL;
4067 goto out;
4068 }
4069 }
4070
4071 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4072 if (backing_file && backing_file->value.s) {
4073 if (!strcmp(filename, backing_file->value.s)) {
4074 error_report("Error: Trying to create an image with the "
4075 "same filename as the backing file");
4076 ret = -EINVAL;
4077 goto out;
4078 }
4079 }
4080
4081 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4082 if (backing_fmt && backing_fmt->value.s) {
4083 backing_drv = bdrv_find_format(backing_fmt->value.s);
4084 if (!backing_drv) {
4085 error_report("Unknown backing file format '%s'",
4086 backing_fmt->value.s);
4087 ret = -EINVAL;
4088 goto out;
4089 }
4090 }
4091
4092 // The size for the image must always be specified, with one exception:
4093 // If we are using a backing file, we can obtain the size from there
4094 size = get_option_parameter(param, BLOCK_OPT_SIZE);
4095 if (size && size->value.n == -1) {
4096 if (backing_file && backing_file->value.s) {
4097 uint64_t size;
4098 char buf[32];
4099
4100 bs = bdrv_new("");
4101
4102 ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
4103 if (ret < 0) {
4104 error_report("Could not open '%s'", backing_file->value.s);
4105 goto out;
4106 }
4107 bdrv_get_geometry(bs, &size);
4108 size *= 512;
4109
4110 snprintf(buf, sizeof(buf), "%" PRId64, size);
4111 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4112 } else {
4113 error_report("Image creation needs a size parameter");
4114 ret = -EINVAL;
4115 goto out;
4116 }
4117 }
4118
4119 printf("Formatting '%s', fmt=%s ", filename, fmt);
4120 print_option_parameters(param);
4121 puts("");
4122
4123 ret = bdrv_create(drv, filename, param);
4124
4125 if (ret < 0) {
4126 if (ret == -ENOTSUP) {
4127 error_report("Formatting or formatting option not supported for "
4128 "file format '%s'", fmt);
4129 } else if (ret == -EFBIG) {
4130 error_report("The image size is too large for file format '%s'",
4131 fmt);
4132 } else {
4133 error_report("%s: error while creating %s: %s", filename, fmt,
4134 strerror(-ret));
4135 }
4136 }
4137
4138 out:
4139 free_option_parameters(create_options);
4140 free_option_parameters(param);
4141
4142 if (bs) {
4143 bdrv_delete(bs);
4144 }
4145
4146 return ret;
4147 }
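
/*
 * Usage sketch (illustrative): creating a 10 GiB qcow2 overlay on top of a
 * raw backing file, roughly what qemu-img create passes down to this
 * function. Flags 0 means the backing file is probed with default options.
 */
#if 0
static int example_create_image(void)
{
    return bdrv_img_create("overlay.qcow2", "qcow2",
                           "base.raw", "raw",
                           NULL,  /* no extra -o options */
                           (uint64_t)10 * 1024 * 1024 * 1024,
                           0);
}
#endif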
4148
4149 void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
4150 int64_t speed, BlockDriverCompletionFunc *cb,
4151 void *opaque, Error **errp)
4152 {
4153 BlockJob *job;
4154
4155 if (bs->job || bdrv_in_use(bs)) {
4156 error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
4157 return NULL;
4158 }
4159 bdrv_set_in_use(bs, 1);
4160
4161 job = g_malloc0(job_type->instance_size);
4162 job->job_type = job_type;
4163 job->bs = bs;
4164 job->cb = cb;
4165 job->opaque = opaque;
4166 bs->job = job;
4167
4168 /* Only set speed when necessary to avoid NotSupported error */
4169 if (speed != 0) {
4170 Error *local_err = NULL;
4171
4172 block_job_set_speed(job, speed, &local_err);
4173 if (error_is_set(&local_err)) {
4174 bs->job = NULL;
4175 g_free(job);
4176 bdrv_set_in_use(bs, 0);
4177 error_propagate(errp, local_err);
4178 return NULL;
4179 }
4180 }
4181 return job;
4182 }
4183
4184 void block_job_complete(BlockJob *job, int ret)
4185 {
4186 BlockDriverState *bs = job->bs;
4187
4188 assert(bs->job == job);
4189 job->cb(job->opaque, ret);
4190 bs->job = NULL;
4191 g_free(job);
4192 bdrv_set_in_use(bs, 0);
4193 }
4194
4195 void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
4196 {
4197 Error *local_err = NULL;
4198
4199 if (!job->job_type->set_speed) {
4200 error_set(errp, QERR_NOT_SUPPORTED);
4201 return;
4202 }
4203 job->job_type->set_speed(job, speed, &local_err);
4204 if (error_is_set(&local_err)) {
4205 error_propagate(errp, local_err);
4206 return;
4207 }
4208
4209 job->speed = speed;
4210 }
4211
4212 void block_job_cancel(BlockJob *job)
4213 {
4214 job->cancelled = true;
4215 }
4216
4217 bool block_job_is_cancelled(BlockJob *job)
4218 {
4219 return job->cancelled;
4220 }
4221
4222 void block_job_cancel_sync(BlockJob *job)
4223 {
4224 BlockDriverState *bs = job->bs;
4225
4226 assert(bs->job == job);
4227 block_job_cancel(job);
4228 while (bs->job != NULL && bs->job->busy) {
4229 qemu_aio_wait();
4230 }
4231 }
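
/*
 * Sketch of the block job lifecycle (illustrative; 'my_job_type' and
 * 'my_job_cb' are hypothetical placeholders): create a job on a drive,
 * optionally rate-limit it, and cancel it synchronously. A real job's
 * coroutine would finish by calling block_job_complete().
 */
#if 0
static void example_run_job(BlockDriverState *bs, Error **errp)
{
    BlockJob *job = block_job_create(&my_job_type, bs, 0, my_job_cb,
                                     NULL, errp);
    if (!job) {
        return;  /* device already in use; errp has been set */
    }
    block_job_set_speed(job, 1024 * 1024, errp);  /* optional rate limit */
    /* ... later, to abort and wait until the job has really stopped: */
    block_job_cancel_sync(job);
}
#endif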