1 /*
2 * QEMU System Emulator block driver
3 *
4 * Copyright (c) 2003 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24 #include "config-host.h"
25 #include "qemu-common.h"
26 #include "trace.h"
27 #include "monitor.h"
28 #include "block_int.h"
29 #include "module.h"
30 #include "qjson.h"
31 #include "qemu-coroutine.h"
32 #include "qmp-commands.h"
33 #include "qemu-timer.h"
34
35 #ifdef CONFIG_BSD
36 #include <sys/types.h>
37 #include <sys/stat.h>
38 #include <sys/ioctl.h>
39 #include <sys/queue.h>
40 #ifndef __DragonFly__
41 #include <sys/disk.h>
42 #endif
43 #endif
44
45 #ifdef _WIN32
46 #include <windows.h>
47 #endif
48
49 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
50
51 typedef enum {
52 BDRV_REQ_COPY_ON_READ = 0x1,
53 BDRV_REQ_ZERO_WRITE = 0x2,
54 } BdrvRequestFlags;
55
56 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
57 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
58 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
59 BlockDriverCompletionFunc *cb, void *opaque);
60 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
61 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
62 BlockDriverCompletionFunc *cb, void *opaque);
63 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
64 int64_t sector_num, int nb_sectors,
65 QEMUIOVector *iov);
66 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
67 int64_t sector_num, int nb_sectors,
68 QEMUIOVector *iov);
69 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
70 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
71 BdrvRequestFlags flags);
72 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
73 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
74 BdrvRequestFlags flags);
75 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
76 int64_t sector_num,
77 QEMUIOVector *qiov,
78 int nb_sectors,
79 BlockDriverCompletionFunc *cb,
80 void *opaque,
81 bool is_write);
82 static void coroutine_fn bdrv_co_do_rw(void *opaque);
83 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
84 int64_t sector_num, int nb_sectors);
85
86 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
87 bool is_write, double elapsed_time, uint64_t *wait);
88 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
89 double elapsed_time, uint64_t *wait);
90 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
91 bool is_write, int64_t *wait);
92
93 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
94 QTAILQ_HEAD_INITIALIZER(bdrv_states);
95
96 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
97 QLIST_HEAD_INITIALIZER(bdrv_drivers);
98
99 /* The device to use for VM snapshots */
100 static BlockDriverState *bs_snapshots;
101
102 /* If non-zero, use only whitelisted block drivers */
103 static int use_bdrv_whitelist;
104
105 #ifdef _WIN32
106 static int is_windows_drive_prefix(const char *filename)
107 {
108 return (((filename[0] >= 'a' && filename[0] <= 'z') ||
109 (filename[0] >= 'A' && filename[0] <= 'Z')) &&
110 filename[1] == ':');
111 }
112
113 int is_windows_drive(const char *filename)
114 {
115 if (is_windows_drive_prefix(filename) &&
116 filename[2] == '\0')
117 return 1;
118 if (strstart(filename, "\\\\.\\", NULL) ||
119 strstart(filename, "//./", NULL))
120 return 1;
121 return 0;
122 }
123 #endif
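/*
 * A few illustrative inputs for the helpers above (hypothetical example
 * values, not from the original source):
 *
 *   is_windows_drive_prefix("c:\\tmp\\img.qcow2")  -> 1  (drive prefix)
 *   is_windows_drive("c:")                         -> 1  (bare drive)
 *   is_windows_drive("\\\\.\\PhysicalDrive0")      -> 1  (device namespace)
 *   is_windows_drive("c:\\tmp\\img.qcow2")         -> 0  (prefix only)
 */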
124
125 /* throttling disk I/O limits */
126 void bdrv_io_limits_disable(BlockDriverState *bs)
127 {
128 bs->io_limits_enabled = false;
129
130 while (qemu_co_queue_next(&bs->throttled_reqs));
131
132 if (bs->block_timer) {
133 qemu_del_timer(bs->block_timer);
134 qemu_free_timer(bs->block_timer);
135 bs->block_timer = NULL;
136 }
137
138 bs->slice_start = 0;
139 bs->slice_end = 0;
140 bs->slice_time = 0;
141 memset(&bs->io_base, 0, sizeof(bs->io_base));
142 }
143
144 static void bdrv_block_timer(void *opaque)
145 {
146 BlockDriverState *bs = opaque;
147
148 qemu_co_queue_next(&bs->throttled_reqs);
149 }
150
151 void bdrv_io_limits_enable(BlockDriverState *bs)
152 {
153 qemu_co_queue_init(&bs->throttled_reqs);
154 bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
155 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
156 bs->slice_start = qemu_get_clock_ns(vm_clock);
157 bs->slice_end = bs->slice_start + bs->slice_time;
158 memset(&bs->io_base, 0, sizeof(bs->io_base));
159 bs->io_limits_enabled = true;
160 }
161
162 bool bdrv_io_limits_enabled(BlockDriverState *bs)
163 {
164 BlockIOLimit *io_limits = &bs->io_limits;
165 return io_limits->bps[BLOCK_IO_LIMIT_READ]
166 || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
167 || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
168 || io_limits->iops[BLOCK_IO_LIMIT_READ]
169 || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
170 || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
171 }
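/*
 * Minimal usage sketch (hypothetical limit values): a caller fills in a
 * BlockIOLimit and hands it to bdrv_set_io_limits() (defined later in this
 * file), which derives io_limits_enabled from the predicate above:
 *
 *   BlockIOLimit lim = {0};
 *   lim.bps[BLOCK_IO_LIMIT_TOTAL]  = 10 * 1024 * 1024;   10 MB/s total
 *   lim.iops[BLOCK_IO_LIMIT_WRITE] = 100;                100 write IOPS
 *   bdrv_set_io_limits(bs, &lim);
 *   assert(bdrv_io_limits_enabled(bs));
 */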
172
173 static void bdrv_io_limits_intercept(BlockDriverState *bs,
174 bool is_write, int nb_sectors)
175 {
176 int64_t wait_time = -1;
177
178 if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
179 qemu_co_queue_wait(&bs->throttled_reqs);
180 }
181
182         /* Throttled requests are kept in FIFO order: the next throttled request
183          * is not dequeued until the current request has been allowed to proceed.
184          * So if the current request still exceeds the limits, it is re-inserted
185          * at the head of the queue, and all requests behind it remain in the
186          * throttled_reqs queue.
187          */
188
189 while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
190 qemu_mod_timer(bs->block_timer,
191 wait_time + qemu_get_clock_ns(vm_clock));
192 qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
193 }
194
195 qemu_co_queue_next(&bs->throttled_reqs);
196 }
197
198 /* check if the path starts with "<protocol>:" */
199 static int path_has_protocol(const char *path)
200 {
201 #ifdef _WIN32
202 if (is_windows_drive(path) ||
203 is_windows_drive_prefix(path)) {
204 return 0;
205 }
206 #endif
207
208 return strchr(path, ':') != NULL;
209 }
210
211 int path_is_absolute(const char *path)
212 {
213 const char *p;
214 #ifdef _WIN32
215 /* specific case for names like: "\\.\d:" */
216 if (*path == '/' || *path == '\\')
217 return 1;
218 #endif
219 p = strchr(path, ':');
220 if (p)
221 p++;
222 else
223 p = path;
224 #ifdef _WIN32
225 return (*p == '/' || *p == '\\');
226 #else
227 return (*p == '/');
228 #endif
229 }
230
231 /* If filename is absolute, just copy it to dest. Otherwise, build a
232    path to it by treating it as relative to base_path. URLs are
233    supported. */
234 void path_combine(char *dest, int dest_size,
235 const char *base_path,
236 const char *filename)
237 {
238 const char *p, *p1;
239 int len;
240
241 if (dest_size <= 0)
242 return;
243 if (path_is_absolute(filename)) {
244 pstrcpy(dest, dest_size, filename);
245 } else {
246 p = strchr(base_path, ':');
247 if (p)
248 p++;
249 else
250 p = base_path;
251 p1 = strrchr(base_path, '/');
252 #ifdef _WIN32
253 {
254 const char *p2;
255 p2 = strrchr(base_path, '\\');
256 if (!p1 || p2 > p1)
257 p1 = p2;
258 }
259 #endif
260 if (p1)
261 p1++;
262 else
263 p1 = base_path;
264 if (p1 > p)
265 p = p1;
266 len = p - base_path;
267 if (len > dest_size - 1)
268 len = dest_size - 1;
269 memcpy(dest, base_path, len);
270 dest[len] = '\0';
271 pstrcat(dest, dest_size, filename);
272 }
273 }
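/*
 * Worked examples for path_combine() (hypothetical paths):
 *
 *   base_path              filename           -> dest
 *   "/images/base.qcow2"   "snap.qcow2"       -> "/images/snap.qcow2"
 *   "/images/base.qcow2"   "/abs/snap.qcow2"  -> "/abs/snap.qcow2"
 *   "nbd:host:1234"        "snap.qcow2"       -> "nbd:snap.qcow2"
 *
 * In the last case only the "<protocol>:" prefix of base_path is kept,
 * because there is no '/' after the colon.
 */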
274
275 void bdrv_register(BlockDriver *bdrv)
276 {
277 /* Block drivers without coroutine functions need emulation */
278 if (!bdrv->bdrv_co_readv) {
279 bdrv->bdrv_co_readv = bdrv_co_readv_em;
280 bdrv->bdrv_co_writev = bdrv_co_writev_em;
281
282         /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
283 * the block driver lacks aio we need to emulate that too.
284 */
285 if (!bdrv->bdrv_aio_readv) {
286 /* add AIO emulation layer */
287 bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
288 bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
289 }
290 }
291
292 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
293 }
294
295 /* create a new block device (by default it is empty) */
296 BlockDriverState *bdrv_new(const char *device_name)
297 {
298 BlockDriverState *bs;
299
300 bs = g_malloc0(sizeof(BlockDriverState));
301 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
302 if (device_name[0] != '\0') {
303 QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
304 }
305 bdrv_iostatus_disable(bs);
306 return bs;
307 }
308
309 BlockDriver *bdrv_find_format(const char *format_name)
310 {
311 BlockDriver *drv1;
312 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
313 if (!strcmp(drv1->format_name, format_name)) {
314 return drv1;
315 }
316 }
317 return NULL;
318 }
319
320 static int bdrv_is_whitelisted(BlockDriver *drv)
321 {
322 static const char *whitelist[] = {
323 CONFIG_BDRV_WHITELIST
324 };
325 const char **p;
326
327 if (!whitelist[0])
328 return 1; /* no whitelist, anything goes */
329
330 for (p = whitelist; *p; p++) {
331 if (!strcmp(drv->format_name, *p)) {
332 return 1;
333 }
334 }
335 return 0;
336 }
337
338 BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
339 {
340 BlockDriver *drv = bdrv_find_format(format_name);
341 return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
342 }
343
344 typedef struct CreateCo {
345 BlockDriver *drv;
346 char *filename;
347 QEMUOptionParameter *options;
348 int ret;
349 } CreateCo;
350
351 static void coroutine_fn bdrv_create_co_entry(void *opaque)
352 {
353 CreateCo *cco = opaque;
354 assert(cco->drv);
355
356 cco->ret = cco->drv->bdrv_create(cco->filename, cco->options);
357 }
358
359 int bdrv_create(BlockDriver *drv, const char* filename,
360 QEMUOptionParameter *options)
361 {
362 int ret;
363
364 Coroutine *co;
365 CreateCo cco = {
366 .drv = drv,
367 .filename = g_strdup(filename),
368 .options = options,
369 .ret = NOT_DONE,
370 };
371
372 if (!drv->bdrv_create) {
373 return -ENOTSUP;
374 }
375
376 if (qemu_in_coroutine()) {
377 /* Fast-path if already in coroutine context */
378 bdrv_create_co_entry(&cco);
379 } else {
380 co = qemu_coroutine_create(bdrv_create_co_entry);
381 qemu_coroutine_enter(co, &cco);
382 while (cco.ret == NOT_DONE) {
383 qemu_aio_wait();
384 }
385 }
386
387 ret = cco.ret;
388 g_free(cco.filename);
389
390 return ret;
391 }
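/*
 * Usage sketch (this mirrors the temporary-snapshot path in bdrv_open()
 * below; the filename and 1 GiB size are hypothetical example values):
 *
 *   BlockDriver *drv = bdrv_find_format("qcow2");
 *   QEMUOptionParameter *opts =
 *       parse_option_parameters("", drv->create_options, NULL);
 *   set_option_parameter_int(opts, BLOCK_OPT_SIZE, 1024 * 1024 * 1024);
 *   int ret = bdrv_create(drv, "/tmp/test.qcow2", opts);
 *   free_option_parameters(opts);
 */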
392
393 int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
394 {
395 BlockDriver *drv;
396
397 drv = bdrv_find_protocol(filename);
398 if (drv == NULL) {
399 return -ENOENT;
400 }
401
402 return bdrv_create(drv, filename, options);
403 }
404
405 #ifdef _WIN32
406 void get_tmp_filename(char *filename, int size)
407 {
408 char temp_dir[MAX_PATH];
409
410 GetTempPath(MAX_PATH, temp_dir);
411 GetTempFileName(temp_dir, "qem", 0, filename);
412 }
413 #else
414 void get_tmp_filename(char *filename, int size)
415 {
416 int fd;
417 const char *tmpdir;
418 /* XXX: race condition possible */
419 tmpdir = getenv("TMPDIR");
420 if (!tmpdir)
421 tmpdir = "/tmp";
422 snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
423 fd = mkstemp(filename);
424 close(fd);
425 }
426 #endif
427
428 /*
429 * Detect host devices. By convention, /dev/cdrom[N] is always
430 * recognized as a host CDROM.
431 */
432 static BlockDriver *find_hdev_driver(const char *filename)
433 {
434 int score_max = 0, score;
435 BlockDriver *drv = NULL, *d;
436
437 QLIST_FOREACH(d, &bdrv_drivers, list) {
438 if (d->bdrv_probe_device) {
439 score = d->bdrv_probe_device(filename);
440 if (score > score_max) {
441 score_max = score;
442 drv = d;
443 }
444 }
445 }
446
447 return drv;
448 }
449
450 BlockDriver *bdrv_find_protocol(const char *filename)
451 {
452 BlockDriver *drv1;
453 char protocol[128];
454 int len;
455 const char *p;
456
457 /* TODO Drivers without bdrv_file_open must be specified explicitly */
458
459 /*
460 * XXX(hch): we really should not let host device detection
461 * override an explicit protocol specification, but moving this
462 * later breaks access to device names with colons in them.
463 * Thanks to the brain-dead persistent naming schemes on udev-
464 * based Linux systems those actually are quite common.
465 */
466 drv1 = find_hdev_driver(filename);
467 if (drv1) {
468 return drv1;
469 }
470
471 if (!path_has_protocol(filename)) {
472 return bdrv_find_format("file");
473 }
474 p = strchr(filename, ':');
475 assert(p != NULL);
476 len = p - filename;
477 if (len > sizeof(protocol) - 1)
478 len = sizeof(protocol) - 1;
479 memcpy(protocol, filename, len);
480 protocol[len] = '\0';
481 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
482 if (drv1->protocol_name &&
483 !strcmp(drv1->protocol_name, protocol)) {
484 return drv1;
485 }
486 }
487 return NULL;
488 }
489
490 static int find_image_format(const char *filename, BlockDriver **pdrv)
491 {
492 int ret, score, score_max;
493 BlockDriver *drv1, *drv;
494 uint8_t buf[2048];
495 BlockDriverState *bs;
496
497 ret = bdrv_file_open(&bs, filename, 0);
498 if (ret < 0) {
499 *pdrv = NULL;
500 return ret;
501 }
502
503     /* Return the raw BlockDriver * for scsi-generic devices or empty drives */
504 if (bs->sg || !bdrv_is_inserted(bs)) {
505 bdrv_delete(bs);
506 drv = bdrv_find_format("raw");
507 if (!drv) {
508 ret = -ENOENT;
509 }
510 *pdrv = drv;
511 return ret;
512 }
513
514 ret = bdrv_pread(bs, 0, buf, sizeof(buf));
515 bdrv_delete(bs);
516 if (ret < 0) {
517 *pdrv = NULL;
518 return ret;
519 }
520
521 score_max = 0;
522 drv = NULL;
523 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
524 if (drv1->bdrv_probe) {
525 score = drv1->bdrv_probe(buf, ret, filename);
526 if (score > score_max) {
527 score_max = score;
528 drv = drv1;
529 }
530 }
531 }
532 if (!drv) {
533 ret = -ENOENT;
534 }
535 *pdrv = drv;
536 return ret;
537 }
538
539 /**
540 * Set the current 'total_sectors' value
541 */
542 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
543 {
544 BlockDriver *drv = bs->drv;
545
546 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
547 if (bs->sg)
548 return 0;
549
550 /* query actual device if possible, otherwise just trust the hint */
551 if (drv->bdrv_getlength) {
552 int64_t length = drv->bdrv_getlength(bs);
553 if (length < 0) {
554 return length;
555 }
556 hint = length >> BDRV_SECTOR_BITS;
557 }
558
559 bs->total_sectors = hint;
560 return 0;
561 }
562
563 /**
564 * Set open flags for a given cache mode
565 *
566 * Return 0 on success, -1 if the cache mode was invalid.
567 */
568 int bdrv_parse_cache_flags(const char *mode, int *flags)
569 {
570 *flags &= ~BDRV_O_CACHE_MASK;
571
572 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
573 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
574 } else if (!strcmp(mode, "directsync")) {
575 *flags |= BDRV_O_NOCACHE;
576 } else if (!strcmp(mode, "writeback")) {
577 *flags |= BDRV_O_CACHE_WB;
578 } else if (!strcmp(mode, "unsafe")) {
579 *flags |= BDRV_O_CACHE_WB;
580 *flags |= BDRV_O_NO_FLUSH;
581 } else if (!strcmp(mode, "writethrough")) {
582 /* this is the default */
583 } else {
584 return -1;
585 }
586
587 return 0;
588 }
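/*
 * Example (hypothetical caller): parsing a -drive cache= value into open
 * flags before bdrv_open():
 *
 *   int flags = BDRV_O_RDWR;
 *   if (bdrv_parse_cache_flags("none", &flags) < 0) {
 *       return -EINVAL;
 *   }
 *   flags is now BDRV_O_RDWR | BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 */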
589
590 /**
591 * The copy-on-read flag is actually a reference count so multiple users may
592 * use the feature without worrying about clobbering its previous state.
593 * Copy-on-read stays enabled until all users have called to disable it.
594 */
595 void bdrv_enable_copy_on_read(BlockDriverState *bs)
596 {
597 bs->copy_on_read++;
598 }
599
600 void bdrv_disable_copy_on_read(BlockDriverState *bs)
601 {
602 assert(bs->copy_on_read > 0);
603 bs->copy_on_read--;
604 }
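/*
 * Reference-count usage sketch: two independent users (for example a block
 * job and a -drive copy-on-read=on option) can overlap safely:
 *
 *   bdrv_enable_copy_on_read(bs);     count 0 -> 1, feature on
 *   bdrv_enable_copy_on_read(bs);     count 1 -> 2
 *   bdrv_disable_copy_on_read(bs);    count 2 -> 1, still on
 *   bdrv_disable_copy_on_read(bs);    count 1 -> 0, feature off
 */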
605
606 /*
607 * Common part for opening disk images and files
608 */
609 static int bdrv_open_common(BlockDriverState *bs, const char *filename,
610 int flags, BlockDriver *drv)
611 {
612 int ret, open_flags;
613
614 assert(drv != NULL);
615
616 trace_bdrv_open_common(bs, filename, flags, drv->format_name);
617
618 bs->file = NULL;
619 bs->total_sectors = 0;
620 bs->encrypted = 0;
621 bs->valid_key = 0;
622 bs->sg = 0;
623 bs->open_flags = flags;
624 bs->growable = 0;
625 bs->buffer_alignment = 512;
626
627 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
628 if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
629 bdrv_enable_copy_on_read(bs);
630 }
631
632 pstrcpy(bs->filename, sizeof(bs->filename), filename);
633 bs->backing_file[0] = '\0';
634
635 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
636 return -ENOTSUP;
637 }
638
639 bs->drv = drv;
640 bs->opaque = g_malloc0(drv->instance_size);
641
642 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);
643
644 /*
645 * Clear flags that are internal to the block layer before opening the
646 * image.
647 */
648 open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
649
650 /*
651 * Snapshots should be writable.
652 */
653 if (bs->is_temporary) {
654 open_flags |= BDRV_O_RDWR;
655 }
656
657 bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);
658
659 /* Open the image, either directly or using a protocol */
660 if (drv->bdrv_file_open) {
661 ret = drv->bdrv_file_open(bs, filename, open_flags);
662 } else {
663 ret = bdrv_file_open(&bs->file, filename, open_flags);
664 if (ret >= 0) {
665 ret = drv->bdrv_open(bs, open_flags);
666 }
667 }
668
669 if (ret < 0) {
670 goto free_and_fail;
671 }
672
673 ret = refresh_total_sectors(bs, bs->total_sectors);
674 if (ret < 0) {
675 goto free_and_fail;
676 }
677
678 #ifndef _WIN32
679 if (bs->is_temporary) {
680 unlink(filename);
681 }
682 #endif
683 return 0;
684
685 free_and_fail:
686 if (bs->file) {
687 bdrv_delete(bs->file);
688 bs->file = NULL;
689 }
690 g_free(bs->opaque);
691 bs->opaque = NULL;
692 bs->drv = NULL;
693 return ret;
694 }
695
696 /*
697 * Opens a file using a protocol (file, host_device, nbd, ...)
698 */
699 int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
700 {
701 BlockDriverState *bs;
702 BlockDriver *drv;
703 int ret;
704
705 drv = bdrv_find_protocol(filename);
706 if (!drv) {
707 return -ENOENT;
708 }
709
710 bs = bdrv_new("");
711 ret = bdrv_open_common(bs, filename, flags, drv);
712 if (ret < 0) {
713 bdrv_delete(bs);
714 return ret;
715 }
716 bs->growable = 1;
717 *pbs = bs;
718 return 0;
719 }
720
721 /*
722 * Opens a disk image (raw, qcow2, vmdk, ...)
723 */
724 int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
725 BlockDriver *drv)
726 {
727 int ret;
728 char tmp_filename[PATH_MAX];
729
730 if (flags & BDRV_O_SNAPSHOT) {
731 BlockDriverState *bs1;
732 int64_t total_size;
733 int is_protocol = 0;
734 BlockDriver *bdrv_qcow2;
735 QEMUOptionParameter *options;
736 char backing_filename[PATH_MAX];
737
738 /* if snapshot, we create a temporary backing file and open it
739 instead of opening 'filename' directly */
740
741 /* if there is a backing file, use it */
742 bs1 = bdrv_new("");
743 ret = bdrv_open(bs1, filename, 0, drv);
744 if (ret < 0) {
745 bdrv_delete(bs1);
746 return ret;
747 }
748 total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
749
750 if (bs1->drv && bs1->drv->protocol_name)
751 is_protocol = 1;
752
753 bdrv_delete(bs1);
754
755 get_tmp_filename(tmp_filename, sizeof(tmp_filename));
756
757 /* Real path is meaningless for protocols */
758 if (is_protocol)
759 snprintf(backing_filename, sizeof(backing_filename),
760 "%s", filename);
761 else if (!realpath(filename, backing_filename))
762 return -errno;
763
764 bdrv_qcow2 = bdrv_find_format("qcow2");
765 options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);
766
767 set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
768 set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
769 if (drv) {
770 set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
771 drv->format_name);
772 }
773
774 ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
775 free_option_parameters(options);
776 if (ret < 0) {
777 return ret;
778 }
779
780 filename = tmp_filename;
781 drv = bdrv_qcow2;
782 bs->is_temporary = 1;
783 }
784
785 /* Find the right image format driver */
786 if (!drv) {
787 ret = find_image_format(filename, &drv);
788 }
789
790 if (!drv) {
791 goto unlink_and_fail;
792 }
793
794 /* Open the image */
795 ret = bdrv_open_common(bs, filename, flags, drv);
796 if (ret < 0) {
797 goto unlink_and_fail;
798 }
799
800 /* If there is a backing file, use it */
801 if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
802 char backing_filename[PATH_MAX];
803 int back_flags;
804 BlockDriver *back_drv = NULL;
805
806 bs->backing_hd = bdrv_new("");
807
808 if (path_has_protocol(bs->backing_file)) {
809 pstrcpy(backing_filename, sizeof(backing_filename),
810 bs->backing_file);
811 } else {
812 path_combine(backing_filename, sizeof(backing_filename),
813 filename, bs->backing_file);
814 }
815
816 if (bs->backing_format[0] != '\0') {
817 back_drv = bdrv_find_format(bs->backing_format);
818 }
819
820 /* backing files always opened read-only */
821 back_flags =
822 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);
823
824 ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
825 if (ret < 0) {
826 bdrv_close(bs);
827 return ret;
828 }
829 if (bs->is_temporary) {
830 bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
831 } else {
832 /* base image inherits from "parent" */
833 bs->backing_hd->keep_read_only = bs->keep_read_only;
834 }
835 }
836
837 if (!bdrv_key_required(bs)) {
838 bdrv_dev_change_media_cb(bs, true);
839 }
840
841 /* throttling disk I/O limits */
842 if (bs->io_limits_enabled) {
843 bdrv_io_limits_enable(bs);
844 }
845
846 return 0;
847
848 unlink_and_fail:
849 if (bs->is_temporary) {
850 unlink(filename);
851 }
852 return ret;
853 }
854
855 void bdrv_close(BlockDriverState *bs)
856 {
857 bdrv_flush(bs);
858 if (bs->drv) {
859 if (bs->job) {
860 block_job_cancel_sync(bs->job);
861 }
862 bdrv_drain_all();
863
864 if (bs == bs_snapshots) {
865 bs_snapshots = NULL;
866 }
867 if (bs->backing_hd) {
868 bdrv_delete(bs->backing_hd);
869 bs->backing_hd = NULL;
870 }
871 bs->drv->bdrv_close(bs);
872 g_free(bs->opaque);
873 #ifdef _WIN32
874 if (bs->is_temporary) {
875 unlink(bs->filename);
876 }
877 #endif
878 bs->opaque = NULL;
879 bs->drv = NULL;
880 bs->copy_on_read = 0;
881
882 if (bs->file != NULL) {
883 bdrv_close(bs->file);
884 }
885
886 bdrv_dev_change_media_cb(bs, false);
887 }
888
889     /* throttling disk I/O limits */
890 if (bs->io_limits_enabled) {
891 bdrv_io_limits_disable(bs);
892 }
893 }
894
895 void bdrv_close_all(void)
896 {
897 BlockDriverState *bs;
898
899 QTAILQ_FOREACH(bs, &bdrv_states, list) {
900 bdrv_close(bs);
901 }
902 }
903
904 /*
905 * Wait for pending requests to complete across all BlockDriverStates
906 *
907 * This function does not flush data to disk, use bdrv_flush_all() for that
908 * after calling this function.
909 */
910 void bdrv_drain_all(void)
911 {
912 BlockDriverState *bs;
913
914 qemu_aio_flush();
915
916 /* If requests are still pending there is a bug somewhere */
917 QTAILQ_FOREACH(bs, &bdrv_states, list) {
918 assert(QLIST_EMPTY(&bs->tracked_requests));
919 assert(qemu_co_queue_empty(&bs->throttled_reqs));
920 }
921 }
922
923 /* Make a BlockDriverState anonymous by removing it from the bdrv_states
924    list. Also, NUL-terminate device_name to prevent a double remove */
925 void bdrv_make_anon(BlockDriverState *bs)
926 {
927 if (bs->device_name[0] != '\0') {
928 QTAILQ_REMOVE(&bdrv_states, bs, list);
929 }
930 bs->device_name[0] = '\0';
931 }
932
933 /*
934  * Add the contents of bs_new at the top of an image chain while the
935  * chain is live, keeping the required fields on the top layer.
936 *
937 * This will modify the BlockDriverState fields, and swap contents
938 * between bs_new and bs_top. Both bs_new and bs_top are modified.
939 *
940 * bs_new is required to be anonymous.
941 *
942 * This function does not create any image files.
943 */
944 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
945 {
946 BlockDriverState tmp;
947
948 /* bs_new must be anonymous */
949 assert(bs_new->device_name[0] == '\0');
950
951 tmp = *bs_new;
952
953 /* there are some fields that need to stay on the top layer: */
954
955 /* dev info */
956 tmp.dev_ops = bs_top->dev_ops;
957 tmp.dev_opaque = bs_top->dev_opaque;
958 tmp.dev = bs_top->dev;
959 tmp.buffer_alignment = bs_top->buffer_alignment;
960 tmp.copy_on_read = bs_top->copy_on_read;
961
962 /* i/o timing parameters */
963 tmp.slice_time = bs_top->slice_time;
964 tmp.slice_start = bs_top->slice_start;
965 tmp.slice_end = bs_top->slice_end;
966 tmp.io_limits = bs_top->io_limits;
967 tmp.io_base = bs_top->io_base;
968 tmp.throttled_reqs = bs_top->throttled_reqs;
969 tmp.block_timer = bs_top->block_timer;
970 tmp.io_limits_enabled = bs_top->io_limits_enabled;
971
972 /* geometry */
973 tmp.cyls = bs_top->cyls;
974 tmp.heads = bs_top->heads;
975 tmp.secs = bs_top->secs;
976 tmp.translation = bs_top->translation;
977
978 /* r/w error */
979 tmp.on_read_error = bs_top->on_read_error;
980 tmp.on_write_error = bs_top->on_write_error;
981
982 /* i/o status */
983 tmp.iostatus_enabled = bs_top->iostatus_enabled;
984 tmp.iostatus = bs_top->iostatus;
985
986 /* keep the same entry in bdrv_states */
987 pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name);
988 tmp.list = bs_top->list;
989
990 /* The contents of 'tmp' will become bs_top, as we are
991 * swapping bs_new and bs_top contents. */
992 tmp.backing_hd = bs_new;
993 pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);
994 bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format));
995
996 /* swap contents of the fixed new bs and the current top */
997 *bs_new = *bs_top;
998 *bs_top = tmp;
999
1000 /* device_name[] was carried over from the old bs_top. bs_new
1001 * shouldn't be in bdrv_states, so we need to make device_name[]
1002 * reflect the anonymity of bs_new
1003 */
1004 bs_new->device_name[0] = '\0';
1005
1006 /* clear the copied fields in the new backing file */
1007 bdrv_detach_dev(bs_new, bs_new->dev);
1008
1009 qemu_co_queue_init(&bs_new->throttled_reqs);
1010 memset(&bs_new->io_base, 0, sizeof(bs_new->io_base));
1011 memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits));
1012 bdrv_iostatus_disable(bs_new);
1013
1014 /* we don't use bdrv_io_limits_disable() for this, because we don't want
1015 * to affect or delete the block_timer, as it has been moved to bs_top */
1016 bs_new->io_limits_enabled = false;
1017 bs_new->block_timer = NULL;
1018 bs_new->slice_time = 0;
1019 bs_new->slice_start = 0;
1020 bs_new->slice_end = 0;
1021 }
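/*
 * Usage sketch (as in live snapshot commands; the overlay name, flags and
 * driver variable are illustrative): the caller creates and opens the new
 * top image, then grafts it onto the running chain without detaching the
 * guest device:
 *
 *   BlockDriverState *bs_new = bdrv_new("");   must stay anonymous
 *   bdrv_open(bs_new, "overlay.qcow2", flags, qcow2_drv);
 *   bdrv_append(bs_new, bs_top);
 *
 * Afterwards bs_top keeps its device and name, and its old contents are
 * reachable as bs_top->backing_hd.
 */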
1022
1023 void bdrv_delete(BlockDriverState *bs)
1024 {
1025 assert(!bs->dev);
1026 assert(!bs->job);
1027 assert(!bs->in_use);
1028
1029 /* remove from list, if necessary */
1030 bdrv_make_anon(bs);
1031
1032 bdrv_close(bs);
1033 if (bs->file != NULL) {
1034 bdrv_delete(bs->file);
1035 }
1036
1037 assert(bs != bs_snapshots);
1038 g_free(bs);
1039 }
1040
1041 int bdrv_attach_dev(BlockDriverState *bs, void *dev)
1042 /* TODO change to DeviceState *dev when all users are qdevified */
1043 {
1044 if (bs->dev) {
1045 return -EBUSY;
1046 }
1047 bs->dev = dev;
1048 bdrv_iostatus_reset(bs);
1049 return 0;
1050 }
1051
1052 /* TODO qdevified devices don't use this, remove when devices are qdevified */
1053 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
1054 {
1055 if (bdrv_attach_dev(bs, dev) < 0) {
1056 abort();
1057 }
1058 }
1059
1060 void bdrv_detach_dev(BlockDriverState *bs, void *dev)
1061 /* TODO change to DeviceState *dev when all users are qdevified */
1062 {
1063 assert(bs->dev == dev);
1064 bs->dev = NULL;
1065 bs->dev_ops = NULL;
1066 bs->dev_opaque = NULL;
1067 bs->buffer_alignment = 512;
1068 }
1069
1070 /* TODO change to return DeviceState * when all users are qdevified */
1071 void *bdrv_get_attached_dev(BlockDriverState *bs)
1072 {
1073 return bs->dev;
1074 }
1075
1076 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
1077 void *opaque)
1078 {
1079 bs->dev_ops = ops;
1080 bs->dev_opaque = opaque;
1081 if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
1082 bs_snapshots = NULL;
1083 }
1084 }
1085
1086 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
1087 BlockQMPEventAction action, int is_read)
1088 {
1089 QObject *data;
1090 const char *action_str;
1091
1092 switch (action) {
1093 case BDRV_ACTION_REPORT:
1094 action_str = "report";
1095 break;
1096 case BDRV_ACTION_IGNORE:
1097 action_str = "ignore";
1098 break;
1099 case BDRV_ACTION_STOP:
1100 action_str = "stop";
1101 break;
1102 default:
1103 abort();
1104 }
1105
1106 data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
1107 bdrv->device_name,
1108 action_str,
1109 is_read ? "read" : "write");
1110 monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
1111
1112 qobject_decref(data);
1113 }
1114
1115 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected)
1116 {
1117 QObject *data;
1118
1119 data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }",
1120 bdrv_get_device_name(bs), ejected);
1121 monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data);
1122
1123 qobject_decref(data);
1124 }
1125
1126 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
1127 {
1128 if (bs->dev_ops && bs->dev_ops->change_media_cb) {
1129 bool tray_was_closed = !bdrv_dev_is_tray_open(bs);
1130 bs->dev_ops->change_media_cb(bs->dev_opaque, load);
1131 if (tray_was_closed) {
1132 /* tray open */
1133 bdrv_emit_qmp_eject_event(bs, true);
1134 }
1135 if (load) {
1136 /* tray close */
1137 bdrv_emit_qmp_eject_event(bs, false);
1138 }
1139 }
1140 }
1141
1142 bool bdrv_dev_has_removable_media(BlockDriverState *bs)
1143 {
1144 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
1145 }
1146
1147 void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
1148 {
1149 if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
1150 bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
1151 }
1152 }
1153
1154 bool bdrv_dev_is_tray_open(BlockDriverState *bs)
1155 {
1156 if (bs->dev_ops && bs->dev_ops->is_tray_open) {
1157 return bs->dev_ops->is_tray_open(bs->dev_opaque);
1158 }
1159 return false;
1160 }
1161
1162 static void bdrv_dev_resize_cb(BlockDriverState *bs)
1163 {
1164 if (bs->dev_ops && bs->dev_ops->resize_cb) {
1165 bs->dev_ops->resize_cb(bs->dev_opaque);
1166 }
1167 }
1168
1169 bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
1170 {
1171 if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
1172 return bs->dev_ops->is_medium_locked(bs->dev_opaque);
1173 }
1174 return false;
1175 }
1176
1177 /*
1178 * Run consistency checks on an image
1179 *
1180 * Returns 0 if the check could be completed (it doesn't mean that the image is
1181 * free of errors) or -errno when an internal error occurred. The results of the
1182 * check are stored in res.
1183 */
1184 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
1185 {
1186 if (bs->drv->bdrv_check == NULL) {
1187 return -ENOTSUP;
1188 }
1189
1190 memset(res, 0, sizeof(*res));
1191 return bs->drv->bdrv_check(bs, res);
1192 }
1193
1194 #define COMMIT_BUF_SECTORS 2048
1195
1196 /* commit COW file into the raw image */
1197 int bdrv_commit(BlockDriverState *bs)
1198 {
1199 BlockDriver *drv = bs->drv;
1200 BlockDriver *backing_drv;
1201 int64_t sector, total_sectors;
1202 int n, ro, open_flags;
1203 int ret = 0, rw_ret = 0;
1204 uint8_t *buf;
1205 char filename[1024];
1206 BlockDriverState *bs_rw, *bs_ro;
1207
1208 if (!drv)
1209 return -ENOMEDIUM;
1210
1211 if (!bs->backing_hd) {
1212 return -ENOTSUP;
1213 }
1214
1215 if (bs->backing_hd->keep_read_only) {
1216 return -EACCES;
1217 }
1218
1219 if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
1220 return -EBUSY;
1221 }
1222
1223 backing_drv = bs->backing_hd->drv;
1224 ro = bs->backing_hd->read_only;
1225     pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
1226 open_flags = bs->backing_hd->open_flags;
1227
1228 if (ro) {
1229 /* re-open as RW */
1230 bdrv_delete(bs->backing_hd);
1231 bs->backing_hd = NULL;
1232 bs_rw = bdrv_new("");
1233 rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
1234 backing_drv);
1235 if (rw_ret < 0) {
1236 bdrv_delete(bs_rw);
1237 /* try to re-open read-only */
1238 bs_ro = bdrv_new("");
1239 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1240 backing_drv);
1241 if (ret < 0) {
1242 bdrv_delete(bs_ro);
1243 /* drive not functional anymore */
1244 bs->drv = NULL;
1245 return ret;
1246 }
1247 bs->backing_hd = bs_ro;
1248 return rw_ret;
1249 }
1250 bs->backing_hd = bs_rw;
1251 }
1252
1253 total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
1254 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
1255
1256 for (sector = 0; sector < total_sectors; sector += n) {
1257 if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
1258
1259 if (bdrv_read(bs, sector, buf, n) != 0) {
1260 ret = -EIO;
1261 goto ro_cleanup;
1262 }
1263
1264 if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
1265 ret = -EIO;
1266 goto ro_cleanup;
1267 }
1268 }
1269 }
1270
1271 if (drv->bdrv_make_empty) {
1272 ret = drv->bdrv_make_empty(bs);
1273 bdrv_flush(bs);
1274 }
1275
1276 /*
1277 * Make sure all data we wrote to the backing device is actually
1278 * stable on disk.
1279 */
1280 if (bs->backing_hd)
1281 bdrv_flush(bs->backing_hd);
1282
1283 ro_cleanup:
1284 g_free(buf);
1285
1286 if (ro) {
1287 /* re-open as RO */
1288 bdrv_delete(bs->backing_hd);
1289 bs->backing_hd = NULL;
1290 bs_ro = bdrv_new("");
1291 ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
1292 backing_drv);
1293 if (ret < 0) {
1294 bdrv_delete(bs_ro);
1295 /* drive not functional anymore */
1296 bs->drv = NULL;
1297 return ret;
1298 }
1299 bs->backing_hd = bs_ro;
1300 bs->backing_hd->keep_read_only = 0;
1301 }
1302
1303 return ret;
1304 }
1305
1306 int bdrv_commit_all(void)
1307 {
1308 BlockDriverState *bs;
1309
1310 QTAILQ_FOREACH(bs, &bdrv_states, list) {
1311 int ret = bdrv_commit(bs);
1312 if (ret < 0) {
1313 return ret;
1314 }
1315 }
1316 return 0;
1317 }
1318
1319 struct BdrvTrackedRequest {
1320 BlockDriverState *bs;
1321 int64_t sector_num;
1322 int nb_sectors;
1323 bool is_write;
1324 QLIST_ENTRY(BdrvTrackedRequest) list;
1325 Coroutine *co; /* owner, used for deadlock detection */
1326 CoQueue wait_queue; /* coroutines blocked on this request */
1327 };
1328
1329 /**
1330 * Remove an active request from the tracked requests list
1331 *
1332 * This function should be called when a tracked request is completing.
1333 */
1334 static void tracked_request_end(BdrvTrackedRequest *req)
1335 {
1336 QLIST_REMOVE(req, list);
1337 qemu_co_queue_restart_all(&req->wait_queue);
1338 }
1339
1340 /**
1341 * Add an active request to the tracked requests list
1342 */
1343 static void tracked_request_begin(BdrvTrackedRequest *req,
1344 BlockDriverState *bs,
1345 int64_t sector_num,
1346 int nb_sectors, bool is_write)
1347 {
1348 *req = (BdrvTrackedRequest){
1349 .bs = bs,
1350 .sector_num = sector_num,
1351 .nb_sectors = nb_sectors,
1352 .is_write = is_write,
1353 .co = qemu_coroutine_self(),
1354 };
1355
1356 qemu_co_queue_init(&req->wait_queue);
1357
1358 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
1359 }
1360
1361 /**
1362 * Round a region to cluster boundaries
1363 */
1364 static void round_to_clusters(BlockDriverState *bs,
1365 int64_t sector_num, int nb_sectors,
1366 int64_t *cluster_sector_num,
1367 int *cluster_nb_sectors)
1368 {
1369 BlockDriverInfo bdi;
1370
1371 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
1372 *cluster_sector_num = sector_num;
1373 *cluster_nb_sectors = nb_sectors;
1374 } else {
1375 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
1376 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
1377 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
1378 nb_sectors, c);
1379 }
1380 }
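/*
 * Worked example (hypothetical image): with a 64 KB cluster size,
 * c = 65536 / 512 = 128 sectors. A request for sectors [130, 140) is
 * widened to the containing cluster:
 *
 *   *cluster_sector_num = QEMU_ALIGN_DOWN(130, 128)           = 128
 *   *cluster_nb_sectors = QEMU_ALIGN_UP(130 - 128 + 10, 128)  = 128
 *
 * i.e. the region becomes sectors [128, 256), exactly one cluster.
 */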
1381
1382 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
1383 int64_t sector_num, int nb_sectors) {
1384 /* aaaa bbbb */
1385 if (sector_num >= req->sector_num + req->nb_sectors) {
1386 return false;
1387 }
1388 /* bbbb aaaa */
1389 if (req->sector_num >= sector_num + nb_sectors) {
1390 return false;
1391 }
1392 return true;
1393 }
1394
1395 static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
1396 int64_t sector_num, int nb_sectors)
1397 {
1398 BdrvTrackedRequest *req;
1399 int64_t cluster_sector_num;
1400 int cluster_nb_sectors;
1401 bool retry;
1402
1403 /* If we touch the same cluster it counts as an overlap. This guarantees
1404 * that allocating writes will be serialized and not race with each other
1405 * for the same cluster. For example, in copy-on-read it ensures that the
1406 * CoR read and write operations are atomic and guest writes cannot
1407 * interleave between them.
1408 */
1409 round_to_clusters(bs, sector_num, nb_sectors,
1410 &cluster_sector_num, &cluster_nb_sectors);
1411
1412 do {
1413 retry = false;
1414 QLIST_FOREACH(req, &bs->tracked_requests, list) {
1415 if (tracked_request_overlaps(req, cluster_sector_num,
1416 cluster_nb_sectors)) {
1417 /* Hitting this means there was a reentrant request, for
1418 * example, a block driver issuing nested requests. This must
1419 * never happen since it means deadlock.
1420 */
1421 assert(qemu_coroutine_self() != req->co);
1422
1423 qemu_co_queue_wait(&req->wait_queue);
1424 retry = true;
1425 break;
1426 }
1427 }
1428 } while (retry);
1429 }
1430
1431 /*
1432 * Return values:
1433 * 0 - success
1434 * -EINVAL - backing format specified, but no file
1435 * -ENOSPC - can't update the backing file because no space is left in the
1436 * image file header
1437 * -ENOTSUP - format driver doesn't support changing the backing file
1438 */
1439 int bdrv_change_backing_file(BlockDriverState *bs,
1440 const char *backing_file, const char *backing_fmt)
1441 {
1442 BlockDriver *drv = bs->drv;
1443
1444 if (drv->bdrv_change_backing_file != NULL) {
1445 return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
1446 } else {
1447 return -ENOTSUP;
1448 }
1449 }
1450
1451 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
1452 size_t size)
1453 {
1454 int64_t len;
1455
1456 if (!bdrv_is_inserted(bs))
1457 return -ENOMEDIUM;
1458
1459 if (bs->growable)
1460 return 0;
1461
1462 len = bdrv_getlength(bs);
1463
1464 if (offset < 0)
1465 return -EIO;
1466
1467 if ((offset > len) || (len - offset < size))
1468 return -EIO;
1469
1470 return 0;
1471 }
1472
1473 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
1474 int nb_sectors)
1475 {
1476 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
1477 nb_sectors * BDRV_SECTOR_SIZE);
1478 }
1479
1480 typedef struct RwCo {
1481 BlockDriverState *bs;
1482 int64_t sector_num;
1483 int nb_sectors;
1484 QEMUIOVector *qiov;
1485 bool is_write;
1486 int ret;
1487 } RwCo;
1488
1489 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
1490 {
1491 RwCo *rwco = opaque;
1492
1493 if (!rwco->is_write) {
1494 rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
1495 rwco->nb_sectors, rwco->qiov, 0);
1496 } else {
1497 rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
1498 rwco->nb_sectors, rwco->qiov, 0);
1499 }
1500 }
1501
1502 /*
1503 * Process a synchronous request using coroutines
1504 */
1505 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
1506 int nb_sectors, bool is_write)
1507 {
1508 QEMUIOVector qiov;
1509 struct iovec iov = {
1510 .iov_base = (void *)buf,
1511 .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
1512 };
1513 Coroutine *co;
1514 RwCo rwco = {
1515 .bs = bs,
1516 .sector_num = sector_num,
1517 .nb_sectors = nb_sectors,
1518 .qiov = &qiov,
1519 .is_write = is_write,
1520 .ret = NOT_DONE,
1521 };
1522
1523 qemu_iovec_init_external(&qiov, &iov, 1);
1524
1525 /**
1526      * In synchronous call context the vcpu is blocked, so this throttling
1527      * timer would never fire; therefore I/O throttling has to be disabled
1528      * here if it has been enabled.
1529 */
1530 if (bs->io_limits_enabled) {
1531 fprintf(stderr, "Disabling I/O throttling on '%s' due "
1532 "to synchronous I/O.\n", bdrv_get_device_name(bs));
1533 bdrv_io_limits_disable(bs);
1534 }
1535
1536 if (qemu_in_coroutine()) {
1537 /* Fast-path if already in coroutine context */
1538 bdrv_rw_co_entry(&rwco);
1539 } else {
1540 co = qemu_coroutine_create(bdrv_rw_co_entry);
1541 qemu_coroutine_enter(co, &rwco);
1542 while (rwco.ret == NOT_DONE) {
1543 qemu_aio_wait();
1544 }
1545 }
1546 return rwco.ret;
1547 }
1548
1549 /* return < 0 if error. See bdrv_write() for the return codes */
1550 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
1551 uint8_t *buf, int nb_sectors)
1552 {
1553 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
1554 }
1555
1556 static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
1557 int nb_sectors, int dirty)
1558 {
1559 int64_t start, end;
1560 unsigned long val, idx, bit;
1561
1562 start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
1563 end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
1564
1565 for (; start <= end; start++) {
1566 idx = start / (sizeof(unsigned long) * 8);
1567 bit = start % (sizeof(unsigned long) * 8);
1568 val = bs->dirty_bitmap[idx];
1569 if (dirty) {
1570 if (!(val & (1UL << bit))) {
1571 bs->dirty_count++;
1572 val |= 1UL << bit;
1573 }
1574 } else {
1575 if (val & (1UL << bit)) {
1576 bs->dirty_count--;
1577 val &= ~(1UL << bit);
1578 }
1579 }
1580 bs->dirty_bitmap[idx] = val;
1581 }
1582 }
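/*
 * Worked example, assuming BDRV_SECTORS_PER_DIRTY_CHUNK is 2048 and 64-bit
 * longs (both are assumptions, not guaranteed by this file): marking
 * sector_num = 4096, nb_sectors = 1 dirty touches chunk 4096 / 2048 = 2,
 * which lives at idx = 2 / 64 = 0, bit = 2. The update is then
 * bs->dirty_bitmap[0] |= 1UL << 2, and dirty_count is bumped only if the
 * bit was previously clear.
 */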
1583
1584 /* Return < 0 if error. Important errors are:
1585 -EIO generic I/O error (may happen for all errors)
1586 -ENOMEDIUM No media inserted.
1587 -EINVAL Invalid sector number or nb_sectors
1588 -EACCES Trying to write a read-only device
1589 */
1590 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
1591 const uint8_t *buf, int nb_sectors)
1592 {
1593 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
1594 }
1595
1596 int bdrv_pread(BlockDriverState *bs, int64_t offset,
1597 void *buf, int count1)
1598 {
1599 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1600 int len, nb_sectors, count;
1601 int64_t sector_num;
1602 int ret;
1603
1604 count = count1;
1605 /* first read to align to sector start */
1606 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1607 if (len > count)
1608 len = count;
1609 sector_num = offset >> BDRV_SECTOR_BITS;
1610 if (len > 0) {
1611 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1612 return ret;
1613 memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
1614 count -= len;
1615 if (count == 0)
1616 return count1;
1617 sector_num++;
1618 buf += len;
1619 }
1620
1621 /* read the sectors "in place" */
1622 nb_sectors = count >> BDRV_SECTOR_BITS;
1623 if (nb_sectors > 0) {
1624 if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
1625 return ret;
1626 sector_num += nb_sectors;
1627 len = nb_sectors << BDRV_SECTOR_BITS;
1628 buf += len;
1629 count -= len;
1630 }
1631
1632 /* add data from the last sector */
1633 if (count > 0) {
1634 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1635 return ret;
1636 memcpy(buf, tmp_buf, count);
1637 }
1638 return count1;
1639 }
1640
1641 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
1642 const void *buf, int count1)
1643 {
1644 uint8_t tmp_buf[BDRV_SECTOR_SIZE];
1645 int len, nb_sectors, count;
1646 int64_t sector_num;
1647 int ret;
1648
1649 count = count1;
1650 /* first write to align to sector start */
1651 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
1652 if (len > count)
1653 len = count;
1654 sector_num = offset >> BDRV_SECTOR_BITS;
1655 if (len > 0) {
1656 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1657 return ret;
1658 memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
1659 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1660 return ret;
1661 count -= len;
1662 if (count == 0)
1663 return count1;
1664 sector_num++;
1665 buf += len;
1666 }
1667
1668 /* write the sectors "in place" */
1669 nb_sectors = count >> BDRV_SECTOR_BITS;
1670 if (nb_sectors > 0) {
1671 if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
1672 return ret;
1673 sector_num += nb_sectors;
1674 len = nb_sectors << BDRV_SECTOR_BITS;
1675 buf += len;
1676 count -= len;
1677 }
1678
1679 /* add data from the last sector */
1680 if (count > 0) {
1681 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
1682 return ret;
1683 memcpy(tmp_buf, buf, count);
1684 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
1685 return ret;
1686 }
1687 return count1;
1688 }
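/*
 * Worked example for the unaligned read-modify-write above (hypothetical
 * request): bdrv_pwrite(bs, 1000, buf, 2000) with 512-byte sectors:
 *
 *   head:  len = (512 - 1000) & 511 = 24 bytes    RMW of sector 1
 *   body:  nb_sectors = (2000 - 24) >> 9 = 3      sectors 2..4 in place
 *   tail:  remaining 440 bytes                    RMW of sector 5
 */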
1689
1690 /*
1691 * Writes to the file and ensures that no writes are reordered across this
1692 * request (acts as a barrier)
1693 *
1694 * Returns 0 on success, -errno in error cases.
1695 */
1696 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
1697 const void *buf, int count)
1698 {
1699 int ret;
1700
1701 ret = bdrv_pwrite(bs, offset, buf, count);
1702 if (ret < 0) {
1703 return ret;
1704 }
1705
1706 /* No flush needed for cache modes that use O_DSYNC */
1707 if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
1708 bdrv_flush(bs);
1709 }
1710
1711 return 0;
1712 }
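/*
 * Typical use (sketch): image format drivers call this for metadata whose
 * ordering matters, e.g. flushing an updated table entry before anything
 * that depends on it:
 *
 *   ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_entry_buf,
 *                          sizeof(l1_entry_buf));
 *
 * where l1_table_offset and l1_entry_buf are illustrative names, not
 * fields defined in this file.
 */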
1713
1714 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
1715 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1716 {
1717 /* Perform I/O through a temporary buffer so that users who scribble over
1718 * their read buffer while the operation is in progress do not end up
1719 * modifying the image file. This is critical for zero-copy guest I/O
1720 * where anything might happen inside guest memory.
1721 */
1722 void *bounce_buffer;
1723
1724 BlockDriver *drv = bs->drv;
1725 struct iovec iov;
1726 QEMUIOVector bounce_qiov;
1727 int64_t cluster_sector_num;
1728 int cluster_nb_sectors;
1729 size_t skip_bytes;
1730 int ret;
1731
1732     /* Cover the entire cluster so no additional backing file I/O is required
1733      * when allocating the cluster in the image file.
1734 */
1735 round_to_clusters(bs, sector_num, nb_sectors,
1736 &cluster_sector_num, &cluster_nb_sectors);
1737
1738 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
1739 cluster_sector_num, cluster_nb_sectors);
1740
1741 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
1742 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
1743 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
1744
1745 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
1746 &bounce_qiov);
1747 if (ret < 0) {
1748 goto err;
1749 }
1750
1751 if (drv->bdrv_co_write_zeroes &&
1752 buffer_is_zero(bounce_buffer, iov.iov_len)) {
1753 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
1754 cluster_nb_sectors);
1755 } else {
1756 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
1757 &bounce_qiov);
1758 }
1759
1760 if (ret < 0) {
1761 /* It might be okay to ignore write errors for guest requests. If this
1762 * is a deliberate copy-on-read then we don't want to ignore the error.
1763 * Simply report it in all cases.
1764 */
1765 goto err;
1766 }
1767
1768 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
1769 qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
1770 nb_sectors * BDRV_SECTOR_SIZE);
1771
1772 err:
1773 qemu_vfree(bounce_buffer);
1774 return ret;
1775 }
1776
1777 /*
1778 * Handle a read request in coroutine context
1779 */
1780 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
1781 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1782 BdrvRequestFlags flags)
1783 {
1784 BlockDriver *drv = bs->drv;
1785 BdrvTrackedRequest req;
1786 int ret;
1787
1788 if (!drv) {
1789 return -ENOMEDIUM;
1790 }
1791 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1792 return -EIO;
1793 }
1794
1795 /* throttling disk read I/O */
1796 if (bs->io_limits_enabled) {
1797 bdrv_io_limits_intercept(bs, false, nb_sectors);
1798 }
1799
1800 if (bs->copy_on_read) {
1801 flags |= BDRV_REQ_COPY_ON_READ;
1802 }
1803 if (flags & BDRV_REQ_COPY_ON_READ) {
1804 bs->copy_on_read_in_flight++;
1805 }
1806
1807 if (bs->copy_on_read_in_flight) {
1808 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1809 }
1810
1811 tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
1812
1813 if (flags & BDRV_REQ_COPY_ON_READ) {
1814 int pnum;
1815
1816 ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
1817 if (ret < 0) {
1818 goto out;
1819 }
1820
1821 if (!ret || pnum != nb_sectors) {
1822 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
1823 goto out;
1824 }
1825 }
1826
1827 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1828
1829 out:
1830 tracked_request_end(&req);
1831
1832 if (flags & BDRV_REQ_COPY_ON_READ) {
1833 bs->copy_on_read_in_flight--;
1834 }
1835
1836 return ret;
1837 }
1838
1839 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
1840 int nb_sectors, QEMUIOVector *qiov)
1841 {
1842 trace_bdrv_co_readv(bs, sector_num, nb_sectors);
1843
1844 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
1845 }
1846
1847 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
1848 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
1849 {
1850 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
1851
1852 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
1853 BDRV_REQ_COPY_ON_READ);
1854 }
1855
1856 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
1857 int64_t sector_num, int nb_sectors)
1858 {
1859 BlockDriver *drv = bs->drv;
1860 QEMUIOVector qiov;
1861 struct iovec iov;
1862 int ret;
1863
1864 /* TODO Emulate only part of misaligned requests instead of letting block
1865 * drivers return -ENOTSUP and emulate everything */
1866
1867 /* First try the efficient write zeroes operation */
1868 if (drv->bdrv_co_write_zeroes) {
1869 ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1870 if (ret != -ENOTSUP) {
1871 return ret;
1872 }
1873 }
1874
1875 /* Fall back to bounce buffer if write zeroes is unsupported */
1876 iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
1877 iov.iov_base = qemu_blockalign(bs, iov.iov_len);
1878 memset(iov.iov_base, 0, iov.iov_len);
1879 qemu_iovec_init_external(&qiov, &iov, 1);
1880
1881 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
1882
1883 qemu_vfree(iov.iov_base);
1884 return ret;
1885 }
1886
1887 /*
1888 * Handle a write request in coroutine context
1889 */
1890 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
1891 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1892 BdrvRequestFlags flags)
1893 {
1894 BlockDriver *drv = bs->drv;
1895 BdrvTrackedRequest req;
1896 int ret;
1897
1898 if (!bs->drv) {
1899 return -ENOMEDIUM;
1900 }
1901 if (bs->read_only) {
1902 return -EACCES;
1903 }
1904 if (bdrv_check_request(bs, sector_num, nb_sectors)) {
1905 return -EIO;
1906 }
1907
1908 /* throttling disk write I/O */
1909 if (bs->io_limits_enabled) {
1910 bdrv_io_limits_intercept(bs, true, nb_sectors);
1911 }
1912
1913 if (bs->copy_on_read_in_flight) {
1914 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
1915 }
1916
1917 tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
1918
1919 if (flags & BDRV_REQ_ZERO_WRITE) {
1920 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
1921 } else {
1922 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
1923 }
1924
1925 if (bs->dirty_bitmap) {
1926 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
1927 }
1928
1929 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
1930 bs->wr_highest_sector = sector_num + nb_sectors - 1;
1931 }
1932
1933 tracked_request_end(&req);
1934
1935 return ret;
1936 }
1937
1938 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
1939 int nb_sectors, QEMUIOVector *qiov)
1940 {
1941 trace_bdrv_co_writev(bs, sector_num, nb_sectors);
1942
1943 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
1944 }
1945
1946 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
1947 int64_t sector_num, int nb_sectors)
1948 {
1949 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
1950
1951 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
1952 BDRV_REQ_ZERO_WRITE);
1953 }
1954
1955 /**
1956 * Truncate file to 'offset' bytes (needed only for file protocols)
1957 */
1958 int bdrv_truncate(BlockDriverState *bs, int64_t offset)
1959 {
1960 BlockDriver *drv = bs->drv;
1961 int ret;
1962 if (!drv)
1963 return -ENOMEDIUM;
1964 if (!drv->bdrv_truncate)
1965 return -ENOTSUP;
1966 if (bs->read_only)
1967 return -EACCES;
1968 if (bdrv_in_use(bs))
1969 return -EBUSY;
1970 ret = drv->bdrv_truncate(bs, offset);
1971 if (ret == 0) {
1972 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
1973 bdrv_dev_resize_cb(bs);
1974 }
1975 return ret;
1976 }
1977
1978 /**
1979  * Length of an allocated file in bytes. Sparse files are counted by their
1980  * actually allocated space. Return < 0 on error or if unknown.
1981 */
1982 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
1983 {
1984 BlockDriver *drv = bs->drv;
1985 if (!drv) {
1986 return -ENOMEDIUM;
1987 }
1988 if (drv->bdrv_get_allocated_file_size) {
1989 return drv->bdrv_get_allocated_file_size(bs);
1990 }
1991 if (bs->file) {
1992 return bdrv_get_allocated_file_size(bs->file);
1993 }
1994 return -ENOTSUP;
1995 }
1996
1997 /**
1998 * Length of a file in bytes. Return < 0 if error or unknown.
1999 */
2000 int64_t bdrv_getlength(BlockDriverState *bs)
2001 {
2002 BlockDriver *drv = bs->drv;
2003 if (!drv)
2004 return -ENOMEDIUM;
2005
2006 if (bs->growable || bdrv_dev_has_removable_media(bs)) {
2007 if (drv->bdrv_getlength) {
2008 return drv->bdrv_getlength(bs);
2009 }
2010 }
2011 return bs->total_sectors * BDRV_SECTOR_SIZE;
2012 }
2013
2014 /* return 0 as the number of sectors if no device is present or on error */
2015 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
2016 {
2017 int64_t length;
2018 length = bdrv_getlength(bs);
2019 if (length < 0)
2020 length = 0;
2021 else
2022 length = length >> BDRV_SECTOR_BITS;
2023 *nb_sectors_ptr = length;
2024 }
2025
2026 struct partition {
2027 uint8_t boot_ind; /* 0x80 - active */
2028 uint8_t head; /* starting head */
2029 uint8_t sector; /* starting sector */
2030 uint8_t cyl; /* starting cylinder */
2031 uint8_t sys_ind; /* What partition type */
2032 uint8_t end_head; /* end head */
2033 uint8_t end_sector; /* end sector */
2034 uint8_t end_cyl; /* end cylinder */
2035 uint32_t start_sect; /* starting sector counting from 0 */
2036 uint32_t nr_sects; /* nr of sectors in partition */
2037 } QEMU_PACKED;
2038
2039 /* try to guess the disk logical geometry from the MS-DOS partition table. Return 0 if OK, -1 if it could not be guessed */
2040 static int guess_disk_lchs(BlockDriverState *bs,
2041 int *pcylinders, int *pheads, int *psectors)
2042 {
2043 uint8_t buf[BDRV_SECTOR_SIZE];
2044 int ret, i, heads, sectors, cylinders;
2045 struct partition *p;
2046 uint32_t nr_sects;
2047 uint64_t nb_sectors;
2048 bool enabled;
2049
2050 bdrv_get_geometry(bs, &nb_sectors);
2051
2052 /**
2053      * This function is invoked during startup not only in sync I/O mode but
2054      * also in async I/O mode, so I/O throttling only has to be disabled
2055      * temporarily here, not permanently.
2056 */
2057 enabled = bs->io_limits_enabled;
2058 bs->io_limits_enabled = false;
2059 ret = bdrv_read(bs, 0, buf, 1);
2060 bs->io_limits_enabled = enabled;
2061 if (ret < 0)
2062 return -1;
2063 /* test msdos magic */
2064 if (buf[510] != 0x55 || buf[511] != 0xaa)
2065 return -1;
2066 for(i = 0; i < 4; i++) {
2067 p = ((struct partition *)(buf + 0x1be)) + i;
2068 nr_sects = le32_to_cpu(p->nr_sects);
2069 if (nr_sects && p->end_head) {
2070 /* We make the assumption that the partition terminates on
2071 a cylinder boundary */
2072 heads = p->end_head + 1;
2073 sectors = p->end_sector & 63;
2074 if (sectors == 0)
2075 continue;
2076 cylinders = nb_sectors / (heads * sectors);
2077 if (cylinders < 1 || cylinders > 16383)
2078 continue;
2079 *pheads = heads;
2080 *psectors = sectors;
2081 *pcylinders = cylinders;
2082 #if 0
2083 printf("guessed geometry: LCHS=%d %d %d\n",
2084 cylinders, heads, sectors);
2085 #endif
2086 return 0;
2087 }
2088 }
2089 return -1;
2090 }
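/*
 * Worked example (illustrative numbers, not taken from a real image): if
 * the first partition entry ends at end_head = 15 and end_sector = 63,
 * the guessed logical geometry is heads = 16, sectors = 63. For a disk
 * of nb_sectors = 1048576, cylinders = 1048576 / (16 * 63) = 1040, which
 * lies within 1..16383, so the guess is accepted.
 */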
2091
2092 void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
2093 {
2094 int translation, lba_detected = 0;
2095 int cylinders, heads, secs;
2096 uint64_t nb_sectors;
2097
2098 /* if a geometry hint is available, use it */
2099 bdrv_get_geometry(bs, &nb_sectors);
2100 bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
2101 translation = bdrv_get_translation_hint(bs);
2102 if (cylinders != 0) {
2103 *pcyls = cylinders;
2104 *pheads = heads;
2105 *psecs = secs;
2106 } else {
2107 if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
2108 if (heads > 16) {
2109 /* if heads > 16, it means that a BIOS LBA
2110 translation was active, so the default
2111 hardware geometry is OK */
2112 lba_detected = 1;
2113 goto default_geometry;
2114 } else {
2115 *pcyls = cylinders;
2116 *pheads = heads;
2117 *psecs = secs;
2118 /* disable any translation to be in sync with
2119 the logical geometry */
2120 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
2121 bdrv_set_translation_hint(bs,
2122 BIOS_ATA_TRANSLATION_NONE);
2123 }
2124 }
2125 } else {
2126 default_geometry:
2127 /* if no geometry, use a standard physical disk geometry */
2128 cylinders = nb_sectors / (16 * 63);
2129
2130 if (cylinders > 16383)
2131 cylinders = 16383;
2132 else if (cylinders < 2)
2133 cylinders = 2;
2134 *pcyls = cylinders;
2135 *pheads = 16;
2136 *psecs = 63;
2137 if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
2138 if ((*pcyls * *pheads) <= 131072) {
2139 bdrv_set_translation_hint(bs,
2140 BIOS_ATA_TRANSLATION_LARGE);
2141 } else {
2142 bdrv_set_translation_hint(bs,
2143 BIOS_ATA_TRANSLATION_LBA);
2144 }
2145 }
2146 }
2147 bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
2148 }
2149 }
2150
2151 void bdrv_set_geometry_hint(BlockDriverState *bs,
2152 int cyls, int heads, int secs)
2153 {
2154 bs->cyls = cyls;
2155 bs->heads = heads;
2156 bs->secs = secs;
2157 }
2158
2159 void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
2160 {
2161 bs->translation = translation;
2162 }
2163
2164 void bdrv_get_geometry_hint(BlockDriverState *bs,
2165 int *pcyls, int *pheads, int *psecs)
2166 {
2167 *pcyls = bs->cyls;
2168 *pheads = bs->heads;
2169 *psecs = bs->secs;
2170 }
2171
2172 /* throttling disk io limits */
2173 void bdrv_set_io_limits(BlockDriverState *bs,
2174 BlockIOLimit *io_limits)
2175 {
2176 bs->io_limits = *io_limits;
2177 bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
2178 }
2179
2180 /* Recognize floppy formats */
2181 typedef struct FDFormat {
2182 FDriveType drive;
2183 uint8_t last_sect;
2184 uint8_t max_track;
2185 uint8_t max_head;
2186 FDriveRate rate;
2187 } FDFormat;
2188
2189 static const FDFormat fd_formats[] = {
2190 /* First entry is default format */
2191 /* 1.44 MB 3"1/2 floppy disks */
2192 { FDRIVE_DRV_144, 18, 80, 1, FDRIVE_RATE_500K, },
2193 { FDRIVE_DRV_144, 20, 80, 1, FDRIVE_RATE_500K, },
2194 { FDRIVE_DRV_144, 21, 80, 1, FDRIVE_RATE_500K, },
2195 { FDRIVE_DRV_144, 21, 82, 1, FDRIVE_RATE_500K, },
2196 { FDRIVE_DRV_144, 21, 83, 1, FDRIVE_RATE_500K, },
2197 { FDRIVE_DRV_144, 22, 80, 1, FDRIVE_RATE_500K, },
2198 { FDRIVE_DRV_144, 23, 80, 1, FDRIVE_RATE_500K, },
2199 { FDRIVE_DRV_144, 24, 80, 1, FDRIVE_RATE_500K, },
2200 /* 2.88 MB 3"1/2 floppy disks */
2201 { FDRIVE_DRV_288, 36, 80, 1, FDRIVE_RATE_1M, },
2202 { FDRIVE_DRV_288, 39, 80, 1, FDRIVE_RATE_1M, },
2203 { FDRIVE_DRV_288, 40, 80, 1, FDRIVE_RATE_1M, },
2204 { FDRIVE_DRV_288, 44, 80, 1, FDRIVE_RATE_1M, },
2205 { FDRIVE_DRV_288, 48, 80, 1, FDRIVE_RATE_1M, },
2206 /* 720 kB 3"1/2 floppy disks */
2207 { FDRIVE_DRV_144, 9, 80, 1, FDRIVE_RATE_250K, },
2208 { FDRIVE_DRV_144, 10, 80, 1, FDRIVE_RATE_250K, },
2209 { FDRIVE_DRV_144, 10, 82, 1, FDRIVE_RATE_250K, },
2210 { FDRIVE_DRV_144, 10, 83, 1, FDRIVE_RATE_250K, },
2211 { FDRIVE_DRV_144, 13, 80, 1, FDRIVE_RATE_250K, },
2212 { FDRIVE_DRV_144, 14, 80, 1, FDRIVE_RATE_250K, },
2213 /* 1.2 MB 5"1/4 floppy disks */
2214 { FDRIVE_DRV_120, 15, 80, 1, FDRIVE_RATE_500K, },
2215 { FDRIVE_DRV_120, 18, 80, 1, FDRIVE_RATE_500K, },
2216 { FDRIVE_DRV_120, 18, 82, 1, FDRIVE_RATE_500K, },
2217 { FDRIVE_DRV_120, 18, 83, 1, FDRIVE_RATE_500K, },
2218 { FDRIVE_DRV_120, 20, 80, 1, FDRIVE_RATE_500K, },
2219 /* 720 kB 5"1/4 floppy disks */
2220 { FDRIVE_DRV_120, 9, 80, 1, FDRIVE_RATE_250K, },
2221 { FDRIVE_DRV_120, 11, 80, 1, FDRIVE_RATE_250K, },
2222 /* 360 kB 5"1/4 floppy disks */
2223 { FDRIVE_DRV_120, 9, 40, 1, FDRIVE_RATE_300K, },
2224 { FDRIVE_DRV_120, 9, 40, 0, FDRIVE_RATE_300K, },
2225 { FDRIVE_DRV_120, 10, 41, 1, FDRIVE_RATE_300K, },
2226 { FDRIVE_DRV_120, 10, 42, 1, FDRIVE_RATE_300K, },
2227 /* 320 kB 5"1/4 floppy disks */
2228 { FDRIVE_DRV_120, 8, 40, 1, FDRIVE_RATE_250K, },
2229 { FDRIVE_DRV_120, 8, 40, 0, FDRIVE_RATE_250K, },
2230 /* 360 kB must match 5"1/4 better than 3"1/2... */
2231 { FDRIVE_DRV_144, 9, 80, 0, FDRIVE_RATE_250K, },
2232 /* end */
2233 { FDRIVE_DRV_NONE, -1, -1, 0, 0, },
2234 };
2235
2236 void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
2237 int *max_track, int *last_sect,
2238 FDriveType drive_in, FDriveType *drive,
2239 FDriveRate *rate)
2240 {
2241 const FDFormat *parse;
2242 uint64_t nb_sectors, size;
2243 int i, first_match, match;
2244
2245 bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
2246 if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
2247 /* User defined disk */
2248 *rate = FDRIVE_RATE_500K;
2249 } else {
2250 bdrv_get_geometry(bs, &nb_sectors);
2251 match = -1;
2252 first_match = -1;
2253 for (i = 0; ; i++) {
2254 parse = &fd_formats[i];
2255 if (parse->drive == FDRIVE_DRV_NONE) {
2256 break;
2257 }
2258 if (drive_in == parse->drive ||
2259 drive_in == FDRIVE_DRV_NONE) {
2260 size = (parse->max_head + 1) * parse->max_track *
2261 parse->last_sect;
2262 if (nb_sectors == size) {
2263 match = i;
2264 break;
2265 }
2266 if (first_match == -1) {
2267 first_match = i;
2268 }
2269 }
2270 }
2271 if (match == -1) {
2272 if (first_match == -1) {
2273 match = 1;
2274 } else {
2275 match = first_match;
2276 }
2277 parse = &fd_formats[match];
2278 }
2279 *nb_heads = parse->max_head + 1;
2280 *max_track = parse->max_track;
2281 *last_sect = parse->last_sect;
2282 *drive = parse->drive;
2283 *rate = parse->rate;
2284 }
2285 }
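/*
 * Illustrative example: a 2880-sector image (a standard 1.44 MB floppy)
 * with no geometry hints matches the first fd_formats entry, because
 * (max_head + 1) * max_track * last_sect = 2 * 80 * 18 = 2880; the
 * caller then gets nb_heads = 2, max_track = 80, last_sect = 18 and
 * FDRIVE_RATE_500K.
 */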
2286
2287 int bdrv_get_translation_hint(BlockDriverState *bs)
2288 {
2289 return bs->translation;
2290 }
2291
2292 void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
2293 BlockErrorAction on_write_error)
2294 {
2295 bs->on_read_error = on_read_error;
2296 bs->on_write_error = on_write_error;
2297 }
2298
2299 BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
2300 {
2301 return is_read ? bs->on_read_error : bs->on_write_error;
2302 }
2303
2304 int bdrv_is_read_only(BlockDriverState *bs)
2305 {
2306 return bs->read_only;
2307 }
2308
2309 int bdrv_is_sg(BlockDriverState *bs)
2310 {
2311 return bs->sg;
2312 }
2313
2314 int bdrv_enable_write_cache(BlockDriverState *bs)
2315 {
2316 return bs->enable_write_cache;
2317 }
2318
2319 int bdrv_is_encrypted(BlockDriverState *bs)
2320 {
2321 if (bs->backing_hd && bs->backing_hd->encrypted)
2322 return 1;
2323 return bs->encrypted;
2324 }
2325
2326 int bdrv_key_required(BlockDriverState *bs)
2327 {
2328 BlockDriverState *backing_hd = bs->backing_hd;
2329
2330 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
2331 return 1;
2332 return (bs->encrypted && !bs->valid_key);
2333 }
2334
2335 int bdrv_set_key(BlockDriverState *bs, const char *key)
2336 {
2337 int ret;
2338 if (bs->backing_hd && bs->backing_hd->encrypted) {
2339 ret = bdrv_set_key(bs->backing_hd, key);
2340 if (ret < 0)
2341 return ret;
2342 if (!bs->encrypted)
2343 return 0;
2344 }
2345 if (!bs->encrypted) {
2346 return -EINVAL;
2347 } else if (!bs->drv || !bs->drv->bdrv_set_key) {
2348 return -ENOMEDIUM;
2349 }
2350 ret = bs->drv->bdrv_set_key(bs, key);
2351 if (ret < 0) {
2352 bs->valid_key = 0;
2353 } else if (!bs->valid_key) {
2354 bs->valid_key = 1;
2355 /* call the change callback now, we skipped it on open */
2356 bdrv_dev_change_media_cb(bs, true);
2357 }
2358 return ret;
2359 }
2360
2361 void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
2362 {
2363 if (!bs->drv) {
2364 buf[0] = '\0';
2365 } else {
2366 pstrcpy(buf, buf_size, bs->drv->format_name);
2367 }
2368 }
2369
2370 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
2371 void *opaque)
2372 {
2373 BlockDriver *drv;
2374
2375 QLIST_FOREACH(drv, &bdrv_drivers, list) {
2376 it(opaque, drv->format_name);
2377 }
2378 }
2379
2380 BlockDriverState *bdrv_find(const char *name)
2381 {
2382 BlockDriverState *bs;
2383
2384 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2385 if (!strcmp(name, bs->device_name)) {
2386 return bs;
2387 }
2388 }
2389 return NULL;
2390 }
2391
2392 BlockDriverState *bdrv_next(BlockDriverState *bs)
2393 {
2394 if (!bs) {
2395 return QTAILQ_FIRST(&bdrv_states);
2396 }
2397 return QTAILQ_NEXT(bs, list);
2398 }
2399
2400 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
2401 {
2402 BlockDriverState *bs;
2403
2404 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2405 it(opaque, bs);
2406 }
2407 }
2408
2409 const char *bdrv_get_device_name(BlockDriverState *bs)
2410 {
2411 return bs->device_name;
2412 }
2413
2414 void bdrv_flush_all(void)
2415 {
2416 BlockDriverState *bs;
2417
2418 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2419 bdrv_flush(bs);
2420 }
2421 }
2422
2423 int bdrv_has_zero_init(BlockDriverState *bs)
2424 {
2425 assert(bs->drv);
2426
2427 if (bs->drv->bdrv_has_zero_init) {
2428 return bs->drv->bdrv_has_zero_init(bs);
2429 }
2430
2431 return 1;
2432 }
2433
2434 typedef struct BdrvCoIsAllocatedData {
2435 BlockDriverState *bs;
2436 int64_t sector_num;
2437 int nb_sectors;
2438 int *pnum;
2439 int ret;
2440 bool done;
2441 } BdrvCoIsAllocatedData;
2442
2443 /*
2444 * Returns true iff the specified sector is present in the disk image. Drivers
2445 * not implementing the functionality are assumed to not support backing files,
2446 * hence all their sectors are reported as allocated.
2447 *
2448 * If 'sector_num' is beyond the end of the disk image the return value is 0
2449 * and 'pnum' is set to 0.
2450 *
2451 * 'pnum' is set to the number of sectors (including and immediately following
2452 * the specified sector) that are known to be in the same
2453 * allocated/unallocated state.
2454 *
2455 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
2456 * beyond the end of the disk image it will be clamped.
2457 */
2458 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
2459 int nb_sectors, int *pnum)
2460 {
2461 int64_t n;
2462
2463 if (sector_num >= bs->total_sectors) {
2464 *pnum = 0;
2465 return 0;
2466 }
2467
2468 n = bs->total_sectors - sector_num;
2469 if (n < nb_sectors) {
2470 nb_sectors = n;
2471 }
2472
2473 if (!bs->drv->bdrv_co_is_allocated) {
2474 *pnum = nb_sectors;
2475 return 1;
2476 }
2477
2478 return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
2479 }
2480
2481 /* Coroutine wrapper for bdrv_is_allocated() */
2482 static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2483 {
2484 BdrvCoIsAllocatedData *data = opaque;
2485 BlockDriverState *bs = data->bs;
2486
2487 data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2488 data->pnum);
2489 data->done = true;
2490 }
2491
2492 /*
2493 * Synchronous wrapper around bdrv_co_is_allocated().
2494 *
2495 * See bdrv_co_is_allocated() for details.
2496 */
2497 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2498 int *pnum)
2499 {
2500 Coroutine *co;
2501 BdrvCoIsAllocatedData data = {
2502 .bs = bs,
2503 .sector_num = sector_num,
2504 .nb_sectors = nb_sectors,
2505 .pnum = pnum,
2506 .done = false,
2507 };
2508
2509 co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2510 qemu_coroutine_enter(co, &data);
2511 while (!data.done) {
2512 qemu_aio_wait();
2513 }
2514 return data.ret;
2515 }
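/*
 * Sketch of a hypothetical caller (not part of this file) that walks the
 * allocation map of an image using the synchronous wrapper:
 *
 *   int64_t sector = 0;
 *   int64_t total = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
 *   int pnum;
 *
 *   while (sector < total) {
 *       int allocated = bdrv_is_allocated(bs, sector,
 *                                         MIN(total - sector, INT_MAX),
 *                                         &pnum);
 *       printf("%" PRId64 ": %s for %d sectors\n", sector,
 *              allocated ? "allocated" : "unallocated", pnum);
 *       sector += pnum;
 *   }
 */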
2516
2517 BlockInfoList *qmp_query_block(Error **errp)
2518 {
2519 BlockInfoList *head = NULL, *cur_item = NULL;
2520 BlockDriverState *bs;
2521
2522 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2523 BlockInfoList *info = g_malloc0(sizeof(*info));
2524
2525 info->value = g_malloc0(sizeof(*info->value));
2526 info->value->device = g_strdup(bs->device_name);
2527 info->value->type = g_strdup("unknown");
2528 info->value->locked = bdrv_dev_is_medium_locked(bs);
2529 info->value->removable = bdrv_dev_has_removable_media(bs);
2530
2531 if (bdrv_dev_has_removable_media(bs)) {
2532 info->value->has_tray_open = true;
2533 info->value->tray_open = bdrv_dev_is_tray_open(bs);
2534 }
2535
2536 if (bdrv_iostatus_is_enabled(bs)) {
2537 info->value->has_io_status = true;
2538 info->value->io_status = bs->iostatus;
2539 }
2540
2541 if (bs->drv) {
2542 info->value->has_inserted = true;
2543 info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2544 info->value->inserted->file = g_strdup(bs->filename);
2545 info->value->inserted->ro = bs->read_only;
2546 info->value->inserted->drv = g_strdup(bs->drv->format_name);
2547 info->value->inserted->encrypted = bs->encrypted;
2548 if (bs->backing_file[0]) {
2549 info->value->inserted->has_backing_file = true;
2550 info->value->inserted->backing_file = g_strdup(bs->backing_file);
2551 }
2552
2553 if (bs->io_limits_enabled) {
2554 info->value->inserted->bps =
2555 bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2556 info->value->inserted->bps_rd =
2557 bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2558 info->value->inserted->bps_wr =
2559 bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2560 info->value->inserted->iops =
2561 bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2562 info->value->inserted->iops_rd =
2563 bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2564 info->value->inserted->iops_wr =
2565 bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2566 }
2567 }
2568
2569 /* XXX: waiting for the qapi to support GSList */
2570 if (!cur_item) {
2571 head = cur_item = info;
2572 } else {
2573 cur_item->next = info;
2574 cur_item = info;
2575 }
2576 }
2577
2578 return head;
2579 }
2580
2581 /* Consider exposing this as a full-fledged QMP command */
2582 static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2583 {
2584 BlockStats *s;
2585
2586 s = g_malloc0(sizeof(*s));
2587
2588 if (bs->device_name[0]) {
2589 s->has_device = true;
2590 s->device = g_strdup(bs->device_name);
2591 }
2592
2593 s->stats = g_malloc0(sizeof(*s->stats));
2594 s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2595 s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2596 s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2597 s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2598 s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2599 s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2600 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2601 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2602 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2603
2604 if (bs->file) {
2605 s->has_parent = true;
2606 s->parent = qmp_query_blockstat(bs->file, NULL);
2607 }
2608
2609 return s;
2610 }
2611
2612 BlockStatsList *qmp_query_blockstats(Error **errp)
2613 {
2614 BlockStatsList *head = NULL, *cur_item = NULL;
2615 BlockDriverState *bs;
2616
2617 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2618 BlockStatsList *info = g_malloc0(sizeof(*info));
2619 info->value = qmp_query_blockstat(bs, NULL);
2620
2621 /* XXX: waiting for the qapi to support GSList */
2622 if (!cur_item) {
2623 head = cur_item = info;
2624 } else {
2625 cur_item->next = info;
2626 cur_item = info;
2627 }
2628 }
2629
2630 return head;
2631 }
2632
2633 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2634 {
2635 if (bs->backing_hd && bs->backing_hd->encrypted)
2636 return bs->backing_file;
2637 else if (bs->encrypted)
2638 return bs->filename;
2639 else
2640 return NULL;
2641 }
2642
2643 void bdrv_get_backing_filename(BlockDriverState *bs,
2644 char *filename, int filename_size)
2645 {
2646 pstrcpy(filename, filename_size, bs->backing_file);
2647 }
2648
2649 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2650 const uint8_t *buf, int nb_sectors)
2651 {
2652 BlockDriver *drv = bs->drv;
2653 if (!drv)
2654 return -ENOMEDIUM;
2655 if (!drv->bdrv_write_compressed)
2656 return -ENOTSUP;
2657 if (bdrv_check_request(bs, sector_num, nb_sectors))
2658 return -EIO;
2659
2660 if (bs->dirty_bitmap) {
2661 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2662 }
2663
2664 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2665 }
2666
2667 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2668 {
2669 BlockDriver *drv = bs->drv;
2670 if (!drv)
2671 return -ENOMEDIUM;
2672 if (!drv->bdrv_get_info)
2673 return -ENOTSUP;
2674 memset(bdi, 0, sizeof(*bdi));
2675 return drv->bdrv_get_info(bs, bdi);
2676 }
2677
2678 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2679 int64_t pos, int size)
2680 {
2681 BlockDriver *drv = bs->drv;
2682 if (!drv)
2683 return -ENOMEDIUM;
2684 if (drv->bdrv_save_vmstate)
2685 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2686 if (bs->file)
2687 return bdrv_save_vmstate(bs->file, buf, pos, size);
2688 return -ENOTSUP;
2689 }
2690
2691 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2692 int64_t pos, int size)
2693 {
2694 BlockDriver *drv = bs->drv;
2695 if (!drv)
2696 return -ENOMEDIUM;
2697 if (drv->bdrv_load_vmstate)
2698 return drv->bdrv_load_vmstate(bs, buf, pos, size);
2699 if (bs->file)
2700 return bdrv_load_vmstate(bs->file, buf, pos, size);
2701 return -ENOTSUP;
2702 }
2703
2704 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2705 {
2706 BlockDriver *drv = bs->drv;
2707
2708 if (!drv || !drv->bdrv_debug_event) {
2709 return;
2710 }
2711
2712 drv->bdrv_debug_event(bs, event);
2713 }
2715
2716 /**************************************************************/
2717 /* handling of snapshots */
2718
2719 int bdrv_can_snapshot(BlockDriverState *bs)
2720 {
2721 BlockDriver *drv = bs->drv;
2722 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2723 return 0;
2724 }
2725
2726 if (!drv->bdrv_snapshot_create) {
2727 if (bs->file != NULL) {
2728 return bdrv_can_snapshot(bs->file);
2729 }
2730 return 0;
2731 }
2732
2733 return 1;
2734 }
2735
2736 int bdrv_is_snapshot(BlockDriverState *bs)
2737 {
2738 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2739 }
2740
2741 BlockDriverState *bdrv_snapshots(void)
2742 {
2743 BlockDriverState *bs;
2744
2745 if (bs_snapshots) {
2746 return bs_snapshots;
2747 }
2748
2749 bs = NULL;
2750 while ((bs = bdrv_next(bs))) {
2751 if (bdrv_can_snapshot(bs)) {
2752 bs_snapshots = bs;
2753 return bs;
2754 }
2755 }
2756 return NULL;
2757 }
2758
2759 int bdrv_snapshot_create(BlockDriverState *bs,
2760 QEMUSnapshotInfo *sn_info)
2761 {
2762 BlockDriver *drv = bs->drv;
2763 if (!drv)
2764 return -ENOMEDIUM;
2765 if (drv->bdrv_snapshot_create)
2766 return drv->bdrv_snapshot_create(bs, sn_info);
2767 if (bs->file)
2768 return bdrv_snapshot_create(bs->file, sn_info);
2769 return -ENOTSUP;
2770 }
2771
2772 int bdrv_snapshot_goto(BlockDriverState *bs,
2773 const char *snapshot_id)
2774 {
2775 BlockDriver *drv = bs->drv;
2776 int ret, open_ret;
2777
2778 if (!drv)
2779 return -ENOMEDIUM;
2780 if (drv->bdrv_snapshot_goto)
2781 return drv->bdrv_snapshot_goto(bs, snapshot_id);
2782
2783 if (bs->file) {
2784 drv->bdrv_close(bs);
2785 ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2786 open_ret = drv->bdrv_open(bs, bs->open_flags);
2787 if (open_ret < 0) {
2788 bdrv_delete(bs->file);
2789 bs->drv = NULL;
2790 return open_ret;
2791 }
2792 return ret;
2793 }
2794
2795 return -ENOTSUP;
2796 }
2797
2798 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2799 {
2800 BlockDriver *drv = bs->drv;
2801 if (!drv)
2802 return -ENOMEDIUM;
2803 if (drv->bdrv_snapshot_delete)
2804 return drv->bdrv_snapshot_delete(bs, snapshot_id);
2805 if (bs->file)
2806 return bdrv_snapshot_delete(bs->file, snapshot_id);
2807 return -ENOTSUP;
2808 }
2809
2810 int bdrv_snapshot_list(BlockDriverState *bs,
2811 QEMUSnapshotInfo **psn_info)
2812 {
2813 BlockDriver *drv = bs->drv;
2814 if (!drv)
2815 return -ENOMEDIUM;
2816 if (drv->bdrv_snapshot_list)
2817 return drv->bdrv_snapshot_list(bs, psn_info);
2818 if (bs->file)
2819 return bdrv_snapshot_list(bs->file, psn_info);
2820 return -ENOTSUP;
2821 }
2822
2823 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2824 const char *snapshot_name)
2825 {
2826 BlockDriver *drv = bs->drv;
2827 if (!drv) {
2828 return -ENOMEDIUM;
2829 }
2830 if (!bs->read_only) {
2831 return -EINVAL;
2832 }
2833 if (drv->bdrv_snapshot_load_tmp) {
2834 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2835 }
2836 return -ENOTSUP;
2837 }
2838
2839 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2840 const char *backing_file)
2841 {
2842 if (!bs->drv) {
2843 return NULL;
2844 }
2845
2846 if (bs->backing_hd) {
2847 if (strcmp(bs->backing_file, backing_file) == 0) {
2848 return bs->backing_hd;
2849 } else {
2850 return bdrv_find_backing_image(bs->backing_hd, backing_file);
2851 }
2852 }
2853
2854 return NULL;
2855 }
2856
2857 #define NB_SUFFIXES 4
2858
2859 char *get_human_readable_size(char *buf, int buf_size, int64_t size)
2860 {
2861 static const char suffixes[NB_SUFFIXES] = "KMGT";
2862 int64_t base;
2863 int i;
2864
2865 if (size <= 999) {
2866 snprintf(buf, buf_size, "%" PRId64, size);
2867 } else {
2868 base = 1024;
2869 for(i = 0; i < NB_SUFFIXES; i++) {
2870 if (size < (10 * base)) {
2871 snprintf(buf, buf_size, "%0.1f%c",
2872 (double)size / base,
2873 suffixes[i]);
2874 break;
2875 } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
2876 snprintf(buf, buf_size, "%" PRId64 "%c",
2877 ((size + (base >> 1)) / base),
2878 suffixes[i]);
2879 break;
2880 }
2881 base = base * 1024;
2882 }
2883 }
2884 return buf;
2885 }
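/*
 * Example outputs, following the rules above: 999 -> "999",
 * 1024 -> "1.0K", 1536 -> "1.5K", 524288000 -> "500M",
 * 2199023255552 -> "2.0T".
 */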
2886
2887 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2888 {
2889 char buf1[128], date_buf[128], clock_buf[128];
2890 #ifdef _WIN32
2891 struct tm *ptm;
2892 #else
2893 struct tm tm;
2894 #endif
2895 time_t ti;
2896 int64_t secs;
2897
2898 if (!sn) {
2899 snprintf(buf, buf_size,
2900 "%-10s%-20s%7s%20s%15s",
2901 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2902 } else {
2903 ti = sn->date_sec;
2904 #ifdef _WIN32
2905 ptm = localtime(&ti);
2906 strftime(date_buf, sizeof(date_buf),
2907 "%Y-%m-%d %H:%M:%S", ptm);
2908 #else
2909 localtime_r(&ti, &tm);
2910 strftime(date_buf, sizeof(date_buf),
2911 "%Y-%m-%d %H:%M:%S", &tm);
2912 #endif
2913 secs = sn->vm_clock_nsec / 1000000000;
2914 snprintf(clock_buf, sizeof(clock_buf),
2915 "%02d:%02d:%02d.%03d",
2916 (int)(secs / 3600),
2917 (int)((secs / 60) % 60),
2918 (int)(secs % 60),
2919 (int)((sn->vm_clock_nsec / 1000000) % 1000));
2920 snprintf(buf, buf_size,
2921 "%-10s%-20s%7s%20s%15s",
2922 sn->id_str, sn->name,
2923 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2924 date_buf,
2925 clock_buf);
2926 }
2927 return buf;
2928 }
2929
2930 /**************************************************************/
2931 /* async I/Os */
2932
2933 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2934 QEMUIOVector *qiov, int nb_sectors,
2935 BlockDriverCompletionFunc *cb, void *opaque)
2936 {
2937 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2938
2939 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2940 cb, opaque, false);
2941 }
2942
2943 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2944 QEMUIOVector *qiov, int nb_sectors,
2945 BlockDriverCompletionFunc *cb, void *opaque)
2946 {
2947 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2948
2949 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2950 cb, opaque, true);
2951 }
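/*
 * Hypothetical usage sketch (callback and buffer names invented for
 * illustration): submit an asynchronous 8-sector read and handle
 * completion in a callback.
 *
 *   static void my_read_cb(void *opaque, int ret)
 *   {
 *       if (ret < 0) {
 *           fprintf(stderr, "read failed: %s\n", strerror(-ret));
 *       }
 *   }
 *
 *   struct iovec iov = {
 *       .iov_base = buf,
 *       .iov_len = 8 * BDRV_SECTOR_SIZE,
 *   };
 *   QEMUIOVector qiov;
 *
 *   qemu_iovec_init_external(&qiov, &iov, 1);
 *   bdrv_aio_readv(bs, 0, &qiov, 8, my_read_cb, NULL);
 */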
2952
2953
2954 typedef struct MultiwriteCB {
2955 int error;
2956 int num_requests;
2957 int num_callbacks;
2958 struct {
2959 BlockDriverCompletionFunc *cb;
2960 void *opaque;
2961 QEMUIOVector *free_qiov;
2962 } callbacks[];
2963 } MultiwriteCB;
2964
2965 static void multiwrite_user_cb(MultiwriteCB *mcb)
2966 {
2967 int i;
2968
2969 for (i = 0; i < mcb->num_callbacks; i++) {
2970 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2971 if (mcb->callbacks[i].free_qiov) {
2972 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2973 }
2974 g_free(mcb->callbacks[i].free_qiov);
2975 }
2976 }
2977
2978 static void multiwrite_cb(void *opaque, int ret)
2979 {
2980 MultiwriteCB *mcb = opaque;
2981
2982 trace_multiwrite_cb(mcb, ret);
2983
2984 if (ret < 0 && !mcb->error) {
2985 mcb->error = ret;
2986 }
2987
2988 mcb->num_requests--;
2989 if (mcb->num_requests == 0) {
2990 multiwrite_user_cb(mcb);
2991 g_free(mcb);
2992 }
2993 }
2994
2995 static int multiwrite_req_compare(const void *a, const void *b)
2996 {
2997 const BlockRequest *req1 = a, *req2 = b;
2998
2999 /*
3000 * Note that we can't simply subtract req2->sector from req1->sector
3001 * here as that could overflow the return value.
3002 */
3003 if (req1->sector > req2->sector) {
3004 return 1;
3005 } else if (req1->sector < req2->sector) {
3006 return -1;
3007 } else {
3008 return 0;
3009 }
3010 }
3011
3012 /*
3013 * Takes a bunch of requests and tries to merge them. Returns the number of
3014 * requests that remain after merging.
3015 */
3016 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
3017 int num_reqs, MultiwriteCB *mcb)
3018 {
3019 int i, outidx;
3020
3021 // Sort requests by start sector
3022 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
3023
3024 // Check if adjacent requests are exactly sequential or overlapping.
3025 // If so, combine them into a single larger request.
3026 outidx = 0;
3027 for (i = 1; i < num_reqs; i++) {
3028 int merge = 0;
3029 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
3030
3031 // Handle exactly sequential writes and overlapping writes.
3032 if (reqs[i].sector <= oldreq_last) {
3033 merge = 1;
3034 }
3035
3036 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
3037 merge = 0;
3038 }
3039
3040 if (merge) {
3041 size_t size;
3042 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
3043 qemu_iovec_init(qiov,
3044 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
3045
3046 // Add the first request to the merged one. If the requests are
3047 // overlapping, drop the last sectors of the first request.
3048 size = (reqs[i].sector - reqs[outidx].sector) << 9;
3049 qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
3050
3051 // We should not need to add any zeros between the two requests
3052 assert (reqs[i].sector <= oldreq_last);
3053
3054 // Add the second request
3055 qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
3056
3057 reqs[outidx].nb_sectors = qiov->size >> 9;
3058 reqs[outidx].qiov = qiov;
3059
3060 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
3061 } else {
3062 outidx++;
3063 reqs[outidx].sector = reqs[i].sector;
3064 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
3065 reqs[outidx].qiov = reqs[i].qiov;
3066 }
3067 }
3068
3069 return outidx + 1;
3070 }
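/*
 * Worked example (illustrative): given three 8-sector writes starting at
 * sectors 0, 8 and 32, sorting leaves the order unchanged; the first two
 * are merged because 8 <= 0 + 8 (exactly sequential), yielding one
 * 16-sector request, while the write at sector 32 starts a new entry.
 * The function returns 2.
 */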
3071
3072 /*
3073 * Submit multiple AIO write requests at once.
3074 *
3075 * On success, the function returns 0 and all requests in the reqs array have
3076  * been submitted. On error, this function returns -1 and any of the
3077  * requests may or may not have been submitted yet. In particular, the
3078  * callback will be called for some of the requests and not for others.
3079  * The caller must check the error field of each BlockRequest to know
3080  * which callbacks to wait for (if error != 0, no callback is called).
3081 *
3082 * The implementation may modify the contents of the reqs array, e.g. to merge
3083 * requests. However, the fields opaque and error are left unmodified as they
3084 * are used to signal failure for a single request to the caller.
3085 */
3086 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
3087 {
3088 MultiwriteCB *mcb;
3089 int i;
3090
3091 /* don't submit writes if we don't have a medium */
3092 if (bs->drv == NULL) {
3093 for (i = 0; i < num_reqs; i++) {
3094 reqs[i].error = -ENOMEDIUM;
3095 }
3096 return -1;
3097 }
3098
3099 if (num_reqs == 0) {
3100 return 0;
3101 }
3102
3103 // Create MultiwriteCB structure
3104 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
3105 mcb->num_requests = 0;
3106 mcb->num_callbacks = num_reqs;
3107
3108 for (i = 0; i < num_reqs; i++) {
3109 mcb->callbacks[i].cb = reqs[i].cb;
3110 mcb->callbacks[i].opaque = reqs[i].opaque;
3111 }
3112
3113 // Check for mergeable requests
3114 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
3115
3116 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
3117
3118 /* Run the aio requests. */
3119 mcb->num_requests = num_reqs;
3120 for (i = 0; i < num_reqs; i++) {
3121 bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
3122 reqs[i].nb_sectors, multiwrite_cb, mcb);
3123 }
3124
3125 return 0;
3126 }
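/*
 * Hypothetical usage sketch (the request setup is invented for
 * illustration):
 *
 *   BlockRequest reqs[2] = {
 *       { .sector = 0, .nb_sectors = 8, .qiov = &qiov1,
 *         .cb = my_write_cb, .opaque = NULL },
 *       { .sector = 8, .nb_sectors = 8, .qiov = &qiov2,
 *         .cb = my_write_cb, .opaque = NULL },
 *   };
 *
 *   if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
 *       // check reqs[i].error to see which callbacks will still fire
 *   }
 */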
3127
3128 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
3129 {
3130 acb->pool->cancel(acb);
3131 }
3132
3133 /* block I/O throttling */
3134 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
3135 bool is_write, double elapsed_time, uint64_t *wait)
3136 {
3137 uint64_t bps_limit = 0;
3138 double bytes_limit, bytes_base, bytes_res;
3139 double slice_time, wait_time;
3140
3141 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3142 bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
3143 } else if (bs->io_limits.bps[is_write]) {
3144 bps_limit = bs->io_limits.bps[is_write];
3145 } else {
3146 if (wait) {
3147 *wait = 0;
3148 }
3149
3150 return false;
3151 }
3152
3153 slice_time = bs->slice_end - bs->slice_start;
3154 slice_time /= (NANOSECONDS_PER_SECOND);
3155 bytes_limit = bps_limit * slice_time;
3156 bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
3157 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
3158 bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
3159 }
3160
3161 /* bytes_base: the number of bytes already read/written, obtained from
3162  * the accounting history.
3163  * bytes_res: the remaining bytes of data which need to be read/written.
3164  * (bytes_base + bytes_res) / bps_limit: used to calculate
3165  * the total time for completing reading/writing all the data.
3166  */
3167 bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
3168
3169 if (bytes_base + bytes_res <= bytes_limit) {
3170 if (wait) {
3171 *wait = 0;
3172 }
3173
3174 return false;
3175 }
3176
3177 /* Calc approx time to dispatch */
3178 wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
3179
3180 /* When the I/O rate at runtime exceeds the limits,
3181  * bs->slice_end needs to be extended so that the current statistics
3182  * can be kept until the timer fires; the extension factor below was
3183  * tuned experimentally.
3184  */
3185 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3186 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3187 if (wait) {
3188 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3189 }
3190
3191 return true;
3192 }
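/*
 * Worked example with made-up numbers: with bps_limit = 1000000 bytes/s
 * and a slice of 0.1 s, bytes_limit = 100000. If 90000 bytes were already
 * transferred in this slice (bytes_base) and a 32-sector request arrives
 * (bytes_res = 16384), then 90000 + 16384 > 100000, so the request has to
 * wait about (90000 + 16384) / 1000000 - elapsed_time seconds, i.e.
 * roughly 0.056 s when elapsed_time is 0.05 s.
 */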
3193
3194 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
3195 double elapsed_time, uint64_t *wait)
3196 {
3197 uint64_t iops_limit = 0;
3198 double ios_limit, ios_base;
3199 double slice_time, wait_time;
3200
3201 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3202 iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
3203 } else if (bs->io_limits.iops[is_write]) {
3204 iops_limit = bs->io_limits.iops[is_write];
3205 } else {
3206 if (wait) {
3207 *wait = 0;
3208 }
3209
3210 return false;
3211 }
3212
3213 slice_time = bs->slice_end - bs->slice_start;
3214 slice_time /= (NANOSECONDS_PER_SECOND);
3215 ios_limit = iops_limit * slice_time;
3216 ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
3217 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
3218 ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
3219 }
3220
3221 if (ios_base + 1 <= ios_limit) {
3222 if (wait) {
3223 *wait = 0;
3224 }
3225
3226 return false;
3227 }
3228
3229 /* Calc approx time to dispatch */
3230 wait_time = (ios_base + 1) / iops_limit;
3231 if (wait_time > elapsed_time) {
3232 wait_time = wait_time - elapsed_time;
3233 } else {
3234 wait_time = 0;
3235 }
3236
3237 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3238 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3239 if (wait) {
3240 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3241 }
3242
3243 return true;
3244 }
3245
3246 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3247 bool is_write, int64_t *wait)
3248 {
3249 int64_t now, max_wait;
3250 uint64_t bps_wait = 0, iops_wait = 0;
3251 double elapsed_time;
3252 int bps_ret, iops_ret;
3253
3254 now = qemu_get_clock_ns(vm_clock);
3255 if ((bs->slice_start < now)
3256 && (bs->slice_end > now)) {
3257 bs->slice_end = now + bs->slice_time;
3258 } else {
3259 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
3260 bs->slice_start = now;
3261 bs->slice_end = now + bs->slice_time;
3262
3263 bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
3264 bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3265
3266 bs->io_base.ios[is_write] = bs->nr_ops[is_write];
3267 bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
3268 }
3269
3270 elapsed_time = now - bs->slice_start;
3271 elapsed_time /= (NANOSECONDS_PER_SECOND);
3272
3273 bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
3274 is_write, elapsed_time, &bps_wait);
3275 iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3276 elapsed_time, &iops_wait);
3277 if (bps_ret || iops_ret) {
3278 max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3279 if (wait) {
3280 *wait = max_wait;
3281 }
3282
3283 now = qemu_get_clock_ns(vm_clock);
3284 if (bs->slice_end < now + max_wait) {
3285 bs->slice_end = now + max_wait;
3286 }
3287
3288 return true;
3289 }
3290
3291 if (wait) {
3292 *wait = 0;
3293 }
3294
3295 return false;
3296 }
3297
3298 /**************************************************************/
3299 /* async block device emulation */
3300
3301 typedef struct BlockDriverAIOCBSync {
3302 BlockDriverAIOCB common;
3303 QEMUBH *bh;
3304 int ret;
3305 /* vector translation state */
3306 QEMUIOVector *qiov;
3307 uint8_t *bounce;
3308 int is_write;
3309 } BlockDriverAIOCBSync;
3310
3311 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3312 {
3313 BlockDriverAIOCBSync *acb =
3314 container_of(blockacb, BlockDriverAIOCBSync, common);
3315 qemu_bh_delete(acb->bh);
3316 acb->bh = NULL;
3317 qemu_aio_release(acb);
3318 }
3319
3320 static AIOPool bdrv_em_aio_pool = {
3321 .aiocb_size = sizeof(BlockDriverAIOCBSync),
3322 .cancel = bdrv_aio_cancel_em,
3323 };
3324
3325 static void bdrv_aio_bh_cb(void *opaque)
3326 {
3327 BlockDriverAIOCBSync *acb = opaque;
3328
3329 if (!acb->is_write)
3330 qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
3331 qemu_vfree(acb->bounce);
3332 acb->common.cb(acb->common.opaque, acb->ret);
3333 qemu_bh_delete(acb->bh);
3334 acb->bh = NULL;
3335 qemu_aio_release(acb);
3336 }
3337
3338 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3339 int64_t sector_num,
3340 QEMUIOVector *qiov,
3341 int nb_sectors,
3342 BlockDriverCompletionFunc *cb,
3343 void *opaque,
3344 int is_write)
3345
3346 {
3347 BlockDriverAIOCBSync *acb;
3348
3349 acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3350 acb->is_write = is_write;
3351 acb->qiov = qiov;
3352 acb->bounce = qemu_blockalign(bs, qiov->size);
3353 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3354
3355 if (is_write) {
3356 qemu_iovec_to_buffer(acb->qiov, acb->bounce);
3357 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3358 } else {
3359 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3360 }
3361
3362 qemu_bh_schedule(acb->bh);
3363
3364 return &acb->common;
3365 }
3366
3367 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3368 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3369 BlockDriverCompletionFunc *cb, void *opaque)
3370 {
3371 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3372 }
3373
3374 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3375 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3376 BlockDriverCompletionFunc *cb, void *opaque)
3377 {
3378 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3379 }
3380
3381
3382 typedef struct BlockDriverAIOCBCoroutine {
3383 BlockDriverAIOCB common;
3384 BlockRequest req;
3385 bool is_write;
3386 QEMUBH* bh;
3387 } BlockDriverAIOCBCoroutine;
3388
3389 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3390 {
3391 qemu_aio_flush();
3392 }
3393
3394 static AIOPool bdrv_em_co_aio_pool = {
3395 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
3396 .cancel = bdrv_aio_co_cancel_em,
3397 };
3398
3399 static void bdrv_co_em_bh(void *opaque)
3400 {
3401 BlockDriverAIOCBCoroutine *acb = opaque;
3402
3403 acb->common.cb(acb->common.opaque, acb->req.error);
3404 qemu_bh_delete(acb->bh);
3405 qemu_aio_release(acb);
3406 }
3407
3408 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3409 static void coroutine_fn bdrv_co_do_rw(void *opaque)
3410 {
3411 BlockDriverAIOCBCoroutine *acb = opaque;
3412 BlockDriverState *bs = acb->common.bs;
3413
3414 if (!acb->is_write) {
3415 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3416 acb->req.nb_sectors, acb->req.qiov, 0);
3417 } else {
3418 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3419 acb->req.nb_sectors, acb->req.qiov, 0);
3420 }
3421
3422 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3423 qemu_bh_schedule(acb->bh);
3424 }
3425
3426 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3427 int64_t sector_num,
3428 QEMUIOVector *qiov,
3429 int nb_sectors,
3430 BlockDriverCompletionFunc *cb,
3431 void *opaque,
3432 bool is_write)
3433 {
3434 Coroutine *co;
3435 BlockDriverAIOCBCoroutine *acb;
3436
3437 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3438 acb->req.sector = sector_num;
3439 acb->req.nb_sectors = nb_sectors;
3440 acb->req.qiov = qiov;
3441 acb->is_write = is_write;
3442
3443 co = qemu_coroutine_create(bdrv_co_do_rw);
3444 qemu_coroutine_enter(co, acb);
3445
3446 return &acb->common;
3447 }
3448
3449 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
3450 {
3451 BlockDriverAIOCBCoroutine *acb = opaque;
3452 BlockDriverState *bs = acb->common.bs;
3453
3454 acb->req.error = bdrv_co_flush(bs);
3455 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3456 qemu_bh_schedule(acb->bh);
3457 }
3458
3459 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3460 BlockDriverCompletionFunc *cb, void *opaque)
3461 {
3462 trace_bdrv_aio_flush(bs, opaque);
3463
3464 Coroutine *co;
3465 BlockDriverAIOCBCoroutine *acb;
3466
3467 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3468 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3469 qemu_coroutine_enter(co, acb);
3470
3471 return &acb->common;
3472 }
3473
3474 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3475 {
3476 BlockDriverAIOCBCoroutine *acb = opaque;
3477 BlockDriverState *bs = acb->common.bs;
3478
3479 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3480 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3481 qemu_bh_schedule(acb->bh);
3482 }
3483
3484 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3485 int64_t sector_num, int nb_sectors,
3486 BlockDriverCompletionFunc *cb, void *opaque)
3487 {
3488 Coroutine *co;
3489 BlockDriverAIOCBCoroutine *acb;
3490
3491 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3492
3493 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3494 acb->req.sector = sector_num;
3495 acb->req.nb_sectors = nb_sectors;
3496 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3497 qemu_coroutine_enter(co, acb);
3498
3499 return &acb->common;
3500 }
3501
3502 void bdrv_init(void)
3503 {
3504 module_call_init(MODULE_INIT_BLOCK);
3505 }
3506
3507 void bdrv_init_with_whitelist(void)
3508 {
3509 use_bdrv_whitelist = 1;
3510 bdrv_init();
3511 }
3512
3513 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3514 BlockDriverCompletionFunc *cb, void *opaque)
3515 {
3516 BlockDriverAIOCB *acb;
3517
3518 if (pool->free_aiocb) {
3519 acb = pool->free_aiocb;
3520 pool->free_aiocb = acb->next;
3521 } else {
3522 acb = g_malloc0(pool->aiocb_size);
3523 acb->pool = pool;
3524 }
3525 acb->bs = bs;
3526 acb->cb = cb;
3527 acb->opaque = opaque;
3528 return acb;
3529 }
3530
3531 void qemu_aio_release(void *p)
3532 {
3533 BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3534 AIOPool *pool = acb->pool;
3535 acb->next = pool->free_aiocb;
3536 pool->free_aiocb = acb;
3537 }
3538
3539 /**************************************************************/
3540 /* Coroutine block device emulation */
3541
3542 typedef struct CoroutineIOCompletion {
3543 Coroutine *coroutine;
3544 int ret;
3545 } CoroutineIOCompletion;
3546
3547 static void bdrv_co_io_em_complete(void *opaque, int ret)
3548 {
3549 CoroutineIOCompletion *co = opaque;
3550
3551 co->ret = ret;
3552 qemu_coroutine_enter(co->coroutine, NULL);
3553 }
3554
3555 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3556 int nb_sectors, QEMUIOVector *iov,
3557 bool is_write)
3558 {
3559 CoroutineIOCompletion co = {
3560 .coroutine = qemu_coroutine_self(),
3561 };
3562 BlockDriverAIOCB *acb;
3563
3564 if (is_write) {
3565 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3566 bdrv_co_io_em_complete, &co);
3567 } else {
3568 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3569 bdrv_co_io_em_complete, &co);
3570 }
3571
3572 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3573 if (!acb) {
3574 return -EIO;
3575 }
3576 qemu_coroutine_yield();
3577
3578 return co.ret;
3579 }
3580
3581 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3582 int64_t sector_num, int nb_sectors,
3583 QEMUIOVector *iov)
3584 {
3585 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3586 }
3587
3588 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3589 int64_t sector_num, int nb_sectors,
3590 QEMUIOVector *iov)
3591 {
3592 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3593 }
3594
3595 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
3596 {
3597 RwCo *rwco = opaque;
3598
3599 rwco->ret = bdrv_co_flush(rwco->bs);
3600 }
3601
3602 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3603 {
3604 int ret;
3605
3606 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
3607 return 0;
3608 }
3609
3610 /* Write back cached data to the OS even with cache=unsafe */
3611 if (bs->drv->bdrv_co_flush_to_os) {
3612 ret = bs->drv->bdrv_co_flush_to_os(bs);
3613 if (ret < 0) {
3614 return ret;
3615 }
3616 }
3617
3618 /* But don't actually force it to the disk with cache=unsafe */
3619 if (bs->open_flags & BDRV_O_NO_FLUSH) {
3620 return 0;
3621 }
3622
3623 if (bs->drv->bdrv_co_flush_to_disk) {
3624 ret = bs->drv->bdrv_co_flush_to_disk(bs);
3625 } else if (bs->drv->bdrv_aio_flush) {
3626 BlockDriverAIOCB *acb;
3627 CoroutineIOCompletion co = {
3628 .coroutine = qemu_coroutine_self(),
3629 };
3630
3631 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3632 if (acb == NULL) {
3633 ret = -EIO;
3634 } else {
3635 qemu_coroutine_yield();
3636 ret = co.ret;
3637 }
3638 } else {
3639 /*
3640  * Some block drivers always operate in either writethrough or unsafe
3641  * mode and therefore don't support bdrv_flush. Usually qemu doesn't
3642 * know how the server works (because the behaviour is hardcoded or
3643 * depends on server-side configuration), so we can't ensure that
3644 * everything is safe on disk. Returning an error doesn't work because
3645 * that would break guests even if the server operates in writethrough
3646 * mode.
3647 *
3648 * Let's hope the user knows what he's doing.
3649 */
3650 ret = 0;
3651 }
3652 if (ret < 0) {
3653 return ret;
3654 }
3655
3656 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
3657 * in the case of cache=unsafe, so there are no useless flushes.
3658 */
3659 return bdrv_co_flush(bs->file);
3660 }
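/*
 * In short, a full flush proceeds in up to three stages: flush the format
 * driver's internal caches to the OS (bdrv_co_flush_to_os), flush the
 * host's caches to the disk (bdrv_co_flush_to_disk, or the AIO/no-op
 * fallbacks above), and finally recurse into bs->file so the protocol
 * layer is flushed as well.
 */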
3661
3662 void bdrv_invalidate_cache(BlockDriverState *bs)
3663 {
3664 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3665 bs->drv->bdrv_invalidate_cache(bs);
3666 }
3667 }
3668
3669 void bdrv_invalidate_cache_all(void)
3670 {
3671 BlockDriverState *bs;
3672
3673 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3674 bdrv_invalidate_cache(bs);
3675 }
3676 }
3677
3678 void bdrv_clear_incoming_migration_all(void)
3679 {
3680 BlockDriverState *bs;
3681
3682 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3683 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
3684 }
3685 }
3686
3687 int bdrv_flush(BlockDriverState *bs)
3688 {
3689 Coroutine *co;
3690 RwCo rwco = {
3691 .bs = bs,
3692 .ret = NOT_DONE,
3693 };
3694
3695 if (qemu_in_coroutine()) {
3696 /* Fast-path if already in coroutine context */
3697 bdrv_flush_co_entry(&rwco);
3698 } else {
3699 co = qemu_coroutine_create(bdrv_flush_co_entry);
3700 qemu_coroutine_enter(co, &rwco);
3701 while (rwco.ret == NOT_DONE) {
3702 qemu_aio_wait();
3703 }
3704 }
3705
3706 return rwco.ret;
3707 }
3708
3709 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3710 {
3711 RwCo *rwco = opaque;
3712
3713 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
3714 }
3715
3716 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3717 int nb_sectors)
3718 {
3719 if (!bs->drv) {
3720 return -ENOMEDIUM;
3721 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3722 return -EIO;
3723 } else if (bs->read_only) {
3724 return -EROFS;
3725 } else if (bs->drv->bdrv_co_discard) {
3726 return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3727 } else if (bs->drv->bdrv_aio_discard) {
3728 BlockDriverAIOCB *acb;
3729 CoroutineIOCompletion co = {
3730 .coroutine = qemu_coroutine_self(),
3731 };
3732
3733 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3734 bdrv_co_io_em_complete, &co);
3735 if (acb == NULL) {
3736 return -EIO;
3737 } else {
3738 qemu_coroutine_yield();
3739 return co.ret;
3740 }
3741 } else {
3742 return 0;
3743 }
3744 }
3745
3746 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
3747 {
3748 Coroutine *co;
3749 RwCo rwco = {
3750 .bs = bs,
3751 .sector_num = sector_num,
3752 .nb_sectors = nb_sectors,
3753 .ret = NOT_DONE,
3754 };
3755
3756 if (qemu_in_coroutine()) {
3757 /* Fast-path if already in coroutine context */
3758 bdrv_discard_co_entry(&rwco);
3759 } else {
3760 co = qemu_coroutine_create(bdrv_discard_co_entry);
3761 qemu_coroutine_enter(co, &rwco);
3762 while (rwco.ret == NOT_DONE) {
3763 qemu_aio_wait();
3764 }
3765 }
3766
3767 return rwco.ret;
3768 }
3769
3770 /**************************************************************/
3771 /* removable device support */
3772
3773 /**
3774 * Return TRUE if the media is present
3775 */
3776 int bdrv_is_inserted(BlockDriverState *bs)
3777 {
3778 BlockDriver *drv = bs->drv;
3779
3780 if (!drv)
3781 return 0;
3782 if (!drv->bdrv_is_inserted)
3783 return 1;
3784 return drv->bdrv_is_inserted(bs);
3785 }
3786
3787 /**
3788 * Return whether the media changed since the last call to this
3789 * function, or -ENOTSUP if we don't know. Most drivers don't know.
3790 */
3791 int bdrv_media_changed(BlockDriverState *bs)
3792 {
3793 BlockDriver *drv = bs->drv;
3794
3795 if (drv && drv->bdrv_media_changed) {
3796 return drv->bdrv_media_changed(bs);
3797 }
3798 return -ENOTSUP;
3799 }
3800
3801 /**
3802 * If eject_flag is TRUE, eject the media. Otherwise, close the tray
3803 */
3804 void bdrv_eject(BlockDriverState *bs, bool eject_flag)
3805 {
3806 BlockDriver *drv = bs->drv;
3807
3808 if (drv && drv->bdrv_eject) {
3809 drv->bdrv_eject(bs, eject_flag);
3810 }
3811
3812 if (bs->device_name[0] != '\0') {
3813 bdrv_emit_qmp_eject_event(bs, eject_flag);
3814 }
3815 }
3816
3817 /**
3818 * Lock or unlock the media (if it is locked, the user won't be able
3819 * to eject it manually).
3820 */
3821 void bdrv_lock_medium(BlockDriverState *bs, bool locked)
3822 {
3823 BlockDriver *drv = bs->drv;
3824
3825 trace_bdrv_lock_medium(bs, locked);
3826
3827 if (drv && drv->bdrv_lock_medium) {
3828 drv->bdrv_lock_medium(bs, locked);
3829 }
3830 }
3831
3832 /* needed for generic scsi interface */
3833
3834 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
3835 {
3836 BlockDriver *drv = bs->drv;
3837
3838 if (drv && drv->bdrv_ioctl)
3839 return drv->bdrv_ioctl(bs, req, buf);
3840 return -ENOTSUP;
3841 }
3842
3843 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
3844 unsigned long int req, void *buf,
3845 BlockDriverCompletionFunc *cb, void *opaque)
3846 {
3847 BlockDriver *drv = bs->drv;
3848
3849 if (drv && drv->bdrv_aio_ioctl)
3850 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
3851 return NULL;
3852 }
3853
3854 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
3855 {
3856 bs->buffer_alignment = align;
3857 }
3858
3859 void *qemu_blockalign(BlockDriverState *bs, size_t size)
3860 {
3861 return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
3862 }
3863
3864 void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
3865 {
3866 int64_t bitmap_size;
3867
3868 bs->dirty_count = 0;
3869 if (enable) {
3870 if (!bs->dirty_bitmap) {
3871 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
3872 BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
3873 bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
3874
3875 bs->dirty_bitmap = g_malloc0(bitmap_size);
3876 }
3877 } else {
3878 if (bs->dirty_bitmap) {
3879 g_free(bs->dirty_bitmap);
3880 bs->dirty_bitmap = NULL;
3881 }
3882 }
3883 }
3884
3885 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
3886 {
3887 int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
3888
3889 if (bs->dirty_bitmap &&
3890 (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
3891 return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
3892 (1UL << (chunk % (sizeof(unsigned long) * 8))));
3893 } else {
3894 return 0;
3895 }
3896 }
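/*
 * Illustrative arithmetic, assuming BDRV_SECTORS_PER_DIRTY_CHUNK is 2048
 * (the actual value is defined in block_int.h) and a 64-bit unsigned
 * long: sector 1000000 falls into chunk 1000000 / 2048 = 488, which is
 * tracked by bit 488 % 64 = 40 of word 488 / 64 = 7 of the dirty bitmap.
 */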
3897
3898 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
3899 int nr_sectors)
3900 {
3901 set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
3902 }
3903
3904 int64_t bdrv_get_dirty_count(BlockDriverState *bs)
3905 {
3906 return bs->dirty_count;
3907 }
3908
3909 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
3910 {
3911 assert(bs->in_use != in_use);
3912 bs->in_use = in_use;
3913 }
3914
3915 int bdrv_in_use(BlockDriverState *bs)
3916 {
3917 return bs->in_use;
3918 }
3919
3920 void bdrv_iostatus_enable(BlockDriverState *bs)
3921 {
3922 bs->iostatus_enabled = true;
3923 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3924 }
3925
3926 /* The I/O status is only enabled if the drive explicitly
3927 * enables it _and_ the VM is configured to stop on errors */
3928 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
3929 {
3930 return (bs->iostatus_enabled &&
3931 (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
3932 bs->on_write_error == BLOCK_ERR_STOP_ANY ||
3933 bs->on_read_error == BLOCK_ERR_STOP_ANY));
3934 }
3935
3936 void bdrv_iostatus_disable(BlockDriverState *bs)
3937 {
3938 bs->iostatus_enabled = false;
3939 }
3940
3941 void bdrv_iostatus_reset(BlockDriverState *bs)
3942 {
3943 if (bdrv_iostatus_is_enabled(bs)) {
3944 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
3945 }
3946 }
3947
3948 /* XXX: Today this is set by device models because it makes the implementation
3949 quite simple. However, the block layer knows about the error, so it's
3950 possible to implement this without device models being involved */
3951 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
3952 {
3953 if (bdrv_iostatus_is_enabled(bs) &&
3954 bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
3955 assert(error >= 0);
3956 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
3957 BLOCK_DEVICE_IO_STATUS_FAILED;
3958 }
3959 }
3960
3961 void
3962 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
3963 enum BlockAcctType type)
3964 {
3965 assert(type < BDRV_MAX_IOTYPE);
3966
3967 cookie->bytes = bytes;
3968 cookie->start_time_ns = get_clock();
3969 cookie->type = type;
3970 }
3971
3972 void
3973 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
3974 {
3975 assert(cookie->type < BDRV_MAX_IOTYPE);
3976
3977 bs->nr_bytes[cookie->type] += cookie->bytes;
3978 bs->nr_ops[cookie->type]++;
3979 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
3980 }
3981
3982 int bdrv_img_create(const char *filename, const char *fmt,
3983 const char *base_filename, const char *base_fmt,
3984 char *options, uint64_t img_size, int flags)
3985 {
3986 QEMUOptionParameter *param = NULL, *create_options = NULL;
3987 QEMUOptionParameter *backing_fmt, *backing_file, *size;
3988 BlockDriverState *bs = NULL;
3989 BlockDriver *drv, *proto_drv;
3990 BlockDriver *backing_drv = NULL;
3991 int ret = 0;
3992
3993 /* Find driver and parse its options */
3994 drv = bdrv_find_format(fmt);
3995 if (!drv) {
3996 error_report("Unknown file format '%s'", fmt);
3997 ret = -EINVAL;
3998 goto out;
3999 }
4000
4001 proto_drv = bdrv_find_protocol(filename);
4002 if (!proto_drv) {
4003 error_report("Unknown protocol '%s'", filename);
4004 ret = -EINVAL;
4005 goto out;
4006 }
4007
4008 create_options = append_option_parameters(create_options,
4009 drv->create_options);
4010 create_options = append_option_parameters(create_options,
4011 proto_drv->create_options);
4012
4013 /* Create parameter list with default values */
4014 param = parse_option_parameters("", create_options, param);
4015
4016 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);
4017
4018 /* Parse -o options */
4019 if (options) {
4020 param = parse_option_parameters(options, create_options, param);
4021 if (param == NULL) {
4022 error_report("Invalid options for file format '%s'.", fmt);
4023 ret = -EINVAL;
4024 goto out;
4025 }
4026 }
4027
4028 if (base_filename) {
4029 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
4030 base_filename)) {
4031 error_report("Backing file not supported for file format '%s'",
4032 fmt);
4033 ret = -EINVAL;
4034 goto out;
4035 }
4036 }
4037
4038 if (base_fmt) {
4039 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
4040 error_report("Backing file format not supported for file "
4041 "format '%s'", fmt);
4042 ret = -EINVAL;
4043 goto out;
4044 }
4045 }
4046
4047 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
4048 if (backing_file && backing_file->value.s) {
4049 if (!strcmp(filename, backing_file->value.s)) {
4050 error_report("Error: Trying to create an image with the "
4051 "same filename as the backing file");
4052 ret = -EINVAL;
4053 goto out;
4054 }
4055 }
4056
4057 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
4058 if (backing_fmt && backing_fmt->value.s) {
4059 backing_drv = bdrv_find_format(backing_fmt->value.s);
4060 if (!backing_drv) {
4061 error_report("Unknown backing file format '%s'",
4062 backing_fmt->value.s);
4063 ret = -EINVAL;
4064 goto out;
4065 }
4066 }
4067
4068 // The size for the image must always be specified, with one exception:
4069 // If we are using a backing file, we can obtain the size from there
4070 size = get_option_parameter(param, BLOCK_OPT_SIZE);
4071 if (size && size->value.n == -1) {
4072 if (backing_file && backing_file->value.s) {
4073 uint64_t size;
4074 char buf[32];
4075
4076 bs = bdrv_new("");
4077
4078 ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
4079 if (ret < 0) {
4080 error_report("Could not open '%s'", backing_file->value.s);
4081 goto out;
4082 }
4083 bdrv_get_geometry(bs, &size);
4084 size *= 512;
4085
4086 snprintf(buf, sizeof(buf), "%" PRId64, size);
4087 set_option_parameter(param, BLOCK_OPT_SIZE, buf);
4088 } else {
4089 error_report("Image creation needs a size parameter");
4090 ret = -EINVAL;
4091 goto out;
4092 }
4093 }
4094
4095 printf("Formatting '%s', fmt=%s ", filename, fmt);
4096 print_option_parameters(param);
4097 puts("");
4098
4099 ret = bdrv_create(drv, filename, param);
4100
4101 if (ret < 0) {
4102 if (ret == -ENOTSUP) {
4103 error_report("Formatting or formatting option not supported for "
4104 "file format '%s'", fmt);
4105 } else if (ret == -EFBIG) {
4106 error_report("The image size is too large for file format '%s'",
4107 fmt);
4108 } else {
4109 error_report("%s: error while creating %s: %s", filename, fmt,
4110 strerror(-ret));
4111 }
4112 }
4113
4114 out:
4115 free_option_parameters(create_options);
4116 free_option_parameters(param);
4117
4118 if (bs) {
4119 bdrv_delete(bs);
4120 }
4121
4122 return ret;
4123 }
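/*
 * Hypothetical invocation (file name and size invented for
 * illustration): create a 1 GB qcow2 image with default options.
 *
 *   ret = bdrv_img_create("test.qcow2", "qcow2", NULL, NULL, NULL,
 *                         1024 * 1024 * 1024, 0);
 *   if (ret < 0) {
 *       // errors have already been reported via error_report()
 *   }
 */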
4124
4125 void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
4126 int64_t speed, BlockDriverCompletionFunc *cb,
4127 void *opaque, Error **errp)
4128 {
4129 BlockJob *job;
4130
4131 if (bs->job || bdrv_in_use(bs)) {
4132 error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
4133 return NULL;
4134 }
4135 bdrv_set_in_use(bs, 1);
4136
4137 job = g_malloc0(job_type->instance_size);
4138 job->job_type = job_type;
4139 job->bs = bs;
4140 job->cb = cb;
4141 job->opaque = opaque;
4142 bs->job = job;
4143
4144 /* Only set speed when necessary to avoid NotSupported error */
4145 if (speed != 0) {
4146 Error *local_err = NULL;
4147
4148 block_job_set_speed(job, speed, &local_err);
4149 if (error_is_set(&local_err)) {
4150 bs->job = NULL;
4151 g_free(job);
4152 bdrv_set_in_use(bs, 0);
4153 error_propagate(errp, local_err);
4154 return NULL;
4155 }
4156 }
4157 return job;
4158 }
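/*
 * Sketch of a hypothetical job type (names invented for illustration;
 * real job types live elsewhere in the tree and may define more fields):
 *
 *   static const BlockJobType my_job_type = {
 *       .instance_size = sizeof(BlockJob),
 *       .job_type      = "my-job",
 *       .set_speed     = NULL,
 *   };
 *
 *   BlockJob *job = block_job_create(&my_job_type, bs, 0,
 *                                    my_job_cb, NULL, errp);
 */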
4159
4160 void block_job_complete(BlockJob *job, int ret)
4161 {
4162 BlockDriverState *bs = job->bs;
4163
4164 assert(bs->job == job);
4165 job->cb(job->opaque, ret);
4166 bs->job = NULL;
4167 g_free(job);
4168 bdrv_set_in_use(bs, 0);
4169 }
4170
4171 void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
4172 {
4173 Error *local_err = NULL;
4174
4175 if (!job->job_type->set_speed) {
4176 error_set(errp, QERR_NOT_SUPPORTED);
4177 return;
4178 }
4179 job->job_type->set_speed(job, speed, &local_err);
4180 if (error_is_set(&local_err)) {
4181 error_propagate(errp, local_err);
4182 return;
4183 }
4184
4185 job->speed = speed;
4186 }
4187
4188 void block_job_cancel(BlockJob *job)
4189 {
4190 job->cancelled = true;
4191 }
4192
4193 bool block_job_is_cancelled(BlockJob *job)
4194 {
4195 return job->cancelled;
4196 }
4197
4198 void block_job_cancel_sync(BlockJob *job)
4199 {
4200 BlockDriverState *bs = job->bs;
4201
4202 assert(bs->job == job);
4203 block_job_cancel(job);
4204 while (bs->job != NULL && bs->job->busy) {
4205 qemu_aio_wait();
4206 }
4207 }