]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /*- |
2 | * BSD LICENSE | |
3 | * | |
4 | * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>. | |
5 | * Copyright (c) Intel Corporation. | |
6 | * All rights reserved. | |
7 | * | |
8 | * Redistribution and use in source and binary forms, with or without | |
9 | * modification, are permitted provided that the following conditions | |
10 | * are met: | |
11 | * | |
12 | * * Redistributions of source code must retain the above copyright | |
13 | * notice, this list of conditions and the following disclaimer. | |
14 | * * Redistributions in binary form must reproduce the above copyright | |
15 | * notice, this list of conditions and the following disclaimer in | |
16 | * the documentation and/or other materials provided with the | |
17 | * distribution. | |
18 | * * Neither the name of Intel Corporation nor the names of its | |
19 | * contributors may be used to endorse or promote products derived | |
20 | * from this software without specific prior written permission. | |
21 | * | |
22 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
23 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
24 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
25 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
26 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
27 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
28 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
29 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
30 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
31 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
32 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
33 | */ | |
34 | ||
35 | #include <stdio.h> | |
36 | #include <errno.h> | |
37 | ||
38 | #include "blockdev_malloc.h" | |
39 | #include "spdk/bdev.h" | |
40 | #include "spdk/conf.h" | |
41 | #include "spdk/endian.h" | |
42 | #include "spdk/env.h" | |
43 | #include "spdk/copy_engine.h" | |
44 | #include "spdk/io_channel.h" | |
45 | ||
46 | #include "spdk_internal/bdev.h" | |
47 | #include "spdk_internal/log.h" | |
48 | ||
/* Maximum unmap descriptors accepted per I/O; the copy engine has no batch API. */
#define MALLOC_MAX_UNMAP_BDESC	1
50 | ||
/* Per-device state for one malloc-backed bdev. */
struct malloc_disk {
	struct spdk_bdev	disk;		/* generic bdev; disk.ctxt points back to this struct */
	void			*malloc_buf;	/* pinned backing buffer, blocklen * blockcnt bytes */
	struct malloc_disk	*next;		/* singly linked list headed by g_malloc_disk_head */
};
56 | ||
/*
 * Per-I/O context stored in the bdev_io driver context area; it is immediately
 * followed in memory by the spdk_copy_task (see the __*_task_from_* helpers
 * and blockdev_malloc_get_ctx_size).
 */
struct malloc_task {
	int	num_outstanding;		/* copy-engine operations still in flight for this I/O */
	enum spdk_bdev_io_status status;	/* aggregate result reported at completion */
};
61 | ||
62 | static struct malloc_task * | |
63 | __malloc_task_from_copy_task(struct spdk_copy_task *ct) | |
64 | { | |
65 | return (struct malloc_task *)((uintptr_t)ct - sizeof(struct malloc_task)); | |
66 | } | |
67 | ||
68 | static struct spdk_copy_task * | |
69 | __copy_task_from_malloc_task(struct malloc_task *mt) | |
70 | { | |
71 | return (struct spdk_copy_task *)((uintptr_t)mt + sizeof(struct malloc_task)); | |
72 | } | |
73 | ||
74 | static void | |
75 | malloc_done(void *ref, int status) | |
76 | { | |
77 | struct malloc_task *task = __malloc_task_from_copy_task(ref); | |
78 | ||
79 | if (status != 0) { | |
80 | task->status = SPDK_BDEV_IO_STATUS_FAILED; | |
81 | } | |
82 | ||
83 | if (--task->num_outstanding == 0) { | |
84 | spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status); | |
85 | } | |
86 | } | |
87 | ||
/* Head of the singly linked list of all malloc disks created by this module. */
static struct malloc_disk *g_malloc_disk_head = NULL;

/* Monotonic counter used only to generate unique "MallocN" device names. */
int malloc_disk_count = 0;

static int blockdev_malloc_initialize(void);
static void blockdev_malloc_finish(void);
static void blockdev_malloc_get_spdk_running_config(FILE *fp);
95 | ||
96 | static int | |
97 | blockdev_malloc_get_ctx_size(void) | |
98 | { | |
99 | return sizeof(struct malloc_task) + spdk_copy_task_size(); | |
100 | } | |
101 | ||
/* Register this module's init/fini/config-dump/ctx-size hooks with the bdev layer. */
SPDK_BDEV_MODULE_REGISTER(blockdev_malloc_initialize, blockdev_malloc_finish,
			  blockdev_malloc_get_spdk_running_config, blockdev_malloc_get_ctx_size)
104 | ||
105 | static void | |
106 | blockdev_malloc_delete_from_list(struct malloc_disk *malloc_disk) | |
107 | { | |
108 | struct malloc_disk *prev = NULL; | |
109 | struct malloc_disk *node = g_malloc_disk_head; | |
110 | ||
111 | if (malloc_disk == NULL) | |
112 | return; | |
113 | ||
114 | while (node != NULL) { | |
115 | if (node == malloc_disk) { | |
116 | if (prev != NULL) { | |
117 | prev->next = malloc_disk->next; | |
118 | } else { | |
119 | g_malloc_disk_head = malloc_disk->next; | |
120 | } | |
121 | break; | |
122 | } | |
123 | prev = node; | |
124 | node = node->next; | |
125 | } | |
126 | } | |
127 | ||
128 | static int | |
129 | blockdev_malloc_destruct(void *ctx) | |
130 | { | |
131 | struct malloc_disk *malloc_disk = ctx; | |
132 | blockdev_malloc_delete_from_list(malloc_disk); | |
133 | spdk_free(malloc_disk->malloc_buf); | |
134 | spdk_free(malloc_disk); | |
135 | return 0; | |
136 | } | |
137 | ||
/*
 * Check that the iovec list can hold at least nbytes of payload.
 * Returns 0 when the combined iovec length covers nbytes, nonzero when the
 * list is too short (callers treat nonzero as failure).
 */
static int
blockdev_malloc_check_iov_len(struct iovec *iovs, int iovcnt, size_t nbytes)
{
	size_t remaining = nbytes;
	int idx;

	for (idx = 0; idx < iovcnt; idx++) {
		if (remaining < iovs[idx].iov_len) {
			/* This element alone covers everything still required. */
			return 0;
		}
		remaining -= iovs[idx].iov_len;
	}

	return remaining != 0;
}
152 | ||
/*
 * Read 'len' bytes starting at 'offset' out of the malloc backing buffer
 * into the caller's iovec list, submitting one copy-engine operation per
 * iovec element. Completion is reported asynchronously through malloc_done
 * once all iovcnt operations have called back; any single failure marks
 * the whole bdev_io failed.
 */
static void
blockdev_malloc_readv(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
		      struct malloc_task *task,
		      struct iovec *iov, int iovcnt, size_t len, uint64_t offset)
{
	int64_t res = 0;
	void *src = mdisk->malloc_buf + offset;
	int i;

	/* Fail immediately if the iovecs cannot hold 'len' bytes. */
	if (blockdev_malloc_check_iov_len(iov, iovcnt, len)) {
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task),
				      SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	SPDK_TRACELOG(SPDK_TRACE_MALLOC, "read %lu bytes from offset %#lx\n",
		      len, offset);

	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	/* Exactly one malloc_done callback is expected per iovec element. */
	task->num_outstanding = iovcnt;

	for (i = 0; i < iovcnt; i++) {
		res = spdk_copy_submit(__copy_task_from_malloc_task(task),
				       ch, iov[i].iov_base,
				       src, iov[i].iov_len, malloc_done);

		if (res != (int64_t)iov[i].iov_len) {
			/* Submission failed: consume this element's completion as a failure
			 * so the outstanding count still reaches zero. */
			malloc_done(__copy_task_from_malloc_task(task), -1);
		}

		src += iov[i].iov_len;
		len -= iov[i].iov_len;
	}
}
187 | ||
/*
 * Write 'len' bytes from the caller's iovec list into the malloc backing
 * buffer at 'offset', one copy-engine operation per iovec element.
 * Mirror image of blockdev_malloc_readv: completion is reported through
 * malloc_done after all iovcnt callbacks; any failure fails the whole I/O.
 */
static void
blockdev_malloc_writev(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
		       struct malloc_task *task,
		       struct iovec *iov, int iovcnt, size_t len, uint64_t offset)
{
	int64_t res = 0;
	void *dst = mdisk->malloc_buf + offset;
	int i;

	/* Fail immediately if the iovecs do not supply 'len' bytes. */
	if (blockdev_malloc_check_iov_len(iov, iovcnt, len)) {
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task),
				      SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	SPDK_TRACELOG(SPDK_TRACE_MALLOC, "wrote %lu bytes to offset %#lx\n",
		      len, offset);

	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	/* Exactly one malloc_done callback is expected per iovec element. */
	task->num_outstanding = iovcnt;

	for (i = 0; i < iovcnt; i++) {
		res = spdk_copy_submit(__copy_task_from_malloc_task(task),
				       ch, dst, iov[i].iov_base,
				       iov[i].iov_len, malloc_done);

		if (res != (int64_t)iov[i].iov_len) {
			/* Submission failed: account for it as a failed completion. */
			malloc_done(__copy_task_from_malloc_task(task), -1);
		}

		dst += iov[i].iov_len;
		len -= iov[i].iov_len;
	}
}
222 | ||
/*
 * Handle an UNMAP by zero-filling the described LBA range in the backing
 * buffer with the copy engine's fill operation.
 *
 * Returns the fill submit result (0 on success, negative on submit error);
 * returns -1 without submitting when the descriptor range runs past the
 * end of the disk.
 */
static int
blockdev_malloc_unmap(struct malloc_disk *mdisk,
		      struct spdk_io_channel *ch,
		      struct malloc_task *task,
		      struct spdk_scsi_unmap_bdesc *unmap_d,
		      uint16_t bdesc_count)
{
	uint64_t lba, offset, byte_count;
	uint32_t block_count;

	assert(bdesc_count <= MALLOC_MAX_UNMAP_BDESC);

	/*
	 * For now, only support a single unmap descriptor per command. The copy engine API does not
	 * support batch submission of operations.
	 */
	assert(bdesc_count == 1);

	/* Unmap descriptor fields are big-endian (SCSI wire format). */
	lba = from_be64(&unmap_d[0].lba);
	offset = lba * mdisk->disk.blocklen;
	block_count = from_be32(&unmap_d[0].block_count);
	byte_count = (uint64_t)block_count * mdisk->disk.blocklen;

	/* Reject ranges that start or extend beyond the last block. */
	if (lba >= mdisk->disk.blockcnt || block_count > mdisk->disk.blockcnt - lba) {
		return -1;
	}

	task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
	/* A single fill operation -> a single malloc_done callback. */
	task->num_outstanding = 1;

	return spdk_copy_submit_fill(__copy_task_from_malloc_task(task), ch,
				     mdisk->malloc_buf + offset, 0, byte_count, malloc_done);
}
256 | ||
257 | static int64_t | |
258 | blockdev_malloc_flush(struct malloc_disk *mdisk, struct malloc_task *task, | |
259 | uint64_t offset, uint64_t nbytes) | |
260 | { | |
261 | spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), SPDK_BDEV_IO_STATUS_SUCCESS); | |
262 | ||
263 | return 0; | |
264 | } | |
265 | ||
266 | static int | |
267 | blockdev_malloc_reset(struct malloc_disk *mdisk, struct malloc_task *task) | |
268 | { | |
269 | spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), SPDK_BDEV_IO_STATUS_SUCCESS); | |
270 | ||
271 | return 0; | |
272 | } | |
273 | ||
/*
 * Dispatch one bdev_io to the matching handler.
 * Returns 0 when the I/O was handled or its completion is pending;
 * returns negative on an unsupported type or submit failure, in which
 * case the caller fails the I/O.
 */
static int _blockdev_malloc_submit_request(struct spdk_bdev_io *bdev_io)
{
	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		if (bdev_io->u.read.iovs[0].iov_base == NULL) {
			/*
			 * No read buffer was supplied: point the single iovec straight
			 * into the backing buffer (zero-copy) and complete immediately.
			 */
			assert(bdev_io->u.read.iovcnt == 1);
			bdev_io->u.read.iovs[0].iov_base =
				((struct malloc_disk *)bdev_io->ctx)->malloc_buf +
				bdev_io->u.read.offset;
			bdev_io->u.read.iovs[0].iov_len = bdev_io->u.read.len;
			/* NOTE(review): presumably prevents the bdev layer from recycling
			 * this buffer as a pooled rbuf, since it points into the device -
			 * confirm against the bdev layer's rbuf handling. */
			bdev_io->u.read.put_rbuf = false;
			spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bdev_io->driver_ctx),
					      SPDK_BDEV_IO_STATUS_SUCCESS);
			return 0;
		}

		blockdev_malloc_readv((struct malloc_disk *)bdev_io->ctx,
				      bdev_io->ch,
				      (struct malloc_task *)bdev_io->driver_ctx,
				      bdev_io->u.read.iovs,
				      bdev_io->u.read.iovcnt,
				      bdev_io->u.read.len,
				      bdev_io->u.read.offset);
		return 0;

	case SPDK_BDEV_IO_TYPE_WRITE:
		blockdev_malloc_writev((struct malloc_disk *)bdev_io->ctx,
				       bdev_io->ch,
				       (struct malloc_task *)bdev_io->driver_ctx,
				       bdev_io->u.write.iovs,
				       bdev_io->u.write.iovcnt,
				       bdev_io->u.write.len,
				       bdev_io->u.write.offset);
		return 0;

	case SPDK_BDEV_IO_TYPE_RESET:
		return blockdev_malloc_reset((struct malloc_disk *)bdev_io->ctx,
					     (struct malloc_task *)bdev_io->driver_ctx);

	case SPDK_BDEV_IO_TYPE_FLUSH:
		return blockdev_malloc_flush((struct malloc_disk *)bdev_io->ctx,
					     (struct malloc_task *)bdev_io->driver_ctx,
					     bdev_io->u.flush.offset,
					     bdev_io->u.flush.length);

	case SPDK_BDEV_IO_TYPE_UNMAP:
		return blockdev_malloc_unmap((struct malloc_disk *)bdev_io->ctx,
					     bdev_io->ch,
					     (struct malloc_task *)bdev_io->driver_ctx,
					     bdev_io->u.unmap.unmap_bdesc,
					     bdev_io->u.unmap.bdesc_count);
	default:
		return -1;
	}
	return 0;	/* NOTE(review): unreachable - every case above returns. */
}
330 | ||
331 | static void blockdev_malloc_submit_request(struct spdk_bdev_io *bdev_io) | |
332 | { | |
333 | if (_blockdev_malloc_submit_request(bdev_io) < 0) { | |
334 | spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); | |
335 | } | |
336 | } | |
337 | ||
338 | static bool | |
339 | blockdev_malloc_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) | |
340 | { | |
341 | switch (io_type) { | |
342 | case SPDK_BDEV_IO_TYPE_READ: | |
343 | case SPDK_BDEV_IO_TYPE_WRITE: | |
344 | case SPDK_BDEV_IO_TYPE_FLUSH: | |
345 | case SPDK_BDEV_IO_TYPE_RESET: | |
346 | case SPDK_BDEV_IO_TYPE_UNMAP: | |
347 | return true; | |
348 | ||
349 | default: | |
350 | return false; | |
351 | } | |
352 | } | |
353 | ||
354 | static struct spdk_io_channel * | |
355 | blockdev_malloc_get_io_channel(void *ctx, uint32_t priority) | |
356 | { | |
357 | return spdk_copy_engine_get_io_channel(priority); | |
358 | } | |
359 | ||
/* Function table handed to the bdev layer for every malloc disk. */
static const struct spdk_bdev_fn_table malloc_fn_table = {
	.destruct		= blockdev_malloc_destruct,
	.submit_request		= blockdev_malloc_submit_request,
	.io_type_supported	= blockdev_malloc_io_type_supported,
	.get_io_channel		= blockdev_malloc_get_io_channel,
};
366 | ||
367 | struct spdk_bdev *create_malloc_disk(uint64_t num_blocks, uint32_t block_size) | |
368 | { | |
369 | struct malloc_disk *mdisk; | |
370 | ||
371 | if (block_size % 512 != 0) { | |
372 | SPDK_ERRLOG("Block size %u is not a multiple of 512.\n", block_size); | |
373 | return NULL; | |
374 | } | |
375 | ||
376 | if (num_blocks == 0) { | |
377 | SPDK_ERRLOG("Disk must be more than 0 blocks\n"); | |
378 | return NULL; | |
379 | } | |
380 | ||
381 | mdisk = spdk_zmalloc(sizeof(*mdisk), 0, NULL); | |
382 | if (!mdisk) { | |
383 | perror("mdisk"); | |
384 | return NULL; | |
385 | } | |
386 | ||
387 | /* | |
388 | * Allocate the large backend memory buffer from pinned memory. | |
389 | * | |
390 | * TODO: need to pass a hint so we know which socket to allocate | |
391 | * from on multi-socket systems. | |
392 | */ | |
393 | mdisk->malloc_buf = spdk_zmalloc(num_blocks * block_size, 2 * 1024 * 1024, NULL); | |
394 | if (!mdisk->malloc_buf) { | |
395 | SPDK_ERRLOG("spdk_zmalloc failed\n"); | |
396 | spdk_free(mdisk); | |
397 | return NULL; | |
398 | } | |
399 | ||
400 | snprintf(mdisk->disk.name, SPDK_BDEV_MAX_NAME_LENGTH, "Malloc%d", malloc_disk_count); | |
401 | snprintf(mdisk->disk.product_name, SPDK_BDEV_MAX_PRODUCT_NAME_LENGTH, "Malloc disk"); | |
402 | malloc_disk_count++; | |
403 | ||
404 | mdisk->disk.write_cache = 1; | |
405 | mdisk->disk.blocklen = block_size; | |
406 | mdisk->disk.blockcnt = num_blocks; | |
407 | mdisk->disk.thin_provisioning = 1; | |
408 | mdisk->disk.max_unmap_bdesc_count = MALLOC_MAX_UNMAP_BDESC; | |
409 | ||
410 | mdisk->disk.ctxt = mdisk; | |
411 | mdisk->disk.fn_table = &malloc_fn_table; | |
412 | ||
413 | spdk_bdev_register(&mdisk->disk); | |
414 | ||
415 | mdisk->next = g_malloc_disk_head; | |
416 | g_malloc_disk_head = mdisk; | |
417 | ||
418 | return &mdisk->disk; | |
419 | } | |
420 | ||
421 | static void free_malloc_disk(struct malloc_disk *mdisk) | |
422 | { | |
423 | spdk_free(mdisk->malloc_buf); | |
424 | spdk_free(mdisk); | |
425 | } | |
426 | ||
427 | static int blockdev_malloc_initialize(void) | |
428 | { | |
429 | struct spdk_conf_section *sp = spdk_conf_find_section(NULL, "Malloc"); | |
430 | int NumberOfLuns, LunSizeInMB, BlockSize, i; | |
431 | uint64_t size; | |
432 | struct spdk_bdev *bdev; | |
433 | ||
434 | if (sp != NULL) { | |
435 | NumberOfLuns = spdk_conf_section_get_intval(sp, "NumberOfLuns"); | |
436 | LunSizeInMB = spdk_conf_section_get_intval(sp, "LunSizeInMB"); | |
437 | BlockSize = spdk_conf_section_get_intval(sp, "BlockSize"); | |
438 | if ((NumberOfLuns < 1) || (LunSizeInMB < 1)) { | |
439 | SPDK_ERRLOG("Malloc section present, but no devices specified\n"); | |
440 | return EINVAL; | |
441 | } | |
442 | if (BlockSize < 1) { | |
443 | /* Default is 512 bytes */ | |
444 | BlockSize = 512; | |
445 | } | |
446 | size = (uint64_t)LunSizeInMB * 1024 * 1024; | |
447 | for (i = 0; i < NumberOfLuns; i++) { | |
448 | bdev = create_malloc_disk(size / BlockSize, BlockSize); | |
449 | if (bdev == NULL) { | |
450 | SPDK_ERRLOG("Could not create malloc disk\n"); | |
451 | return EINVAL; | |
452 | } | |
453 | } | |
454 | } | |
455 | return 0; | |
456 | } | |
457 | ||
458 | static void blockdev_malloc_finish(void) | |
459 | { | |
460 | struct malloc_disk *mdisk; | |
461 | ||
462 | while (g_malloc_disk_head != NULL) { | |
463 | mdisk = g_malloc_disk_head; | |
464 | g_malloc_disk_head = mdisk->next; | |
465 | free_malloc_disk(mdisk); | |
466 | } | |
467 | } | |
468 | ||
469 | static void | |
470 | blockdev_malloc_get_spdk_running_config(FILE *fp) | |
471 | { | |
472 | int num_malloc_luns = 0; | |
473 | uint64_t malloc_lun_size = 0; | |
474 | ||
475 | /* count number of malloc LUNs, get LUN size */ | |
476 | struct malloc_disk *mdisk = g_malloc_disk_head; | |
477 | while (mdisk != NULL) { | |
478 | if (0 == malloc_lun_size) { | |
479 | /* assume all malloc luns the same size */ | |
480 | malloc_lun_size = mdisk->disk.blocklen * mdisk->disk.blockcnt; | |
481 | malloc_lun_size /= (1024 * 1024); | |
482 | } | |
483 | num_malloc_luns++; | |
484 | mdisk = mdisk->next; | |
485 | } | |
486 | ||
487 | if (num_malloc_luns > 0) { | |
488 | fprintf(fp, | |
489 | "\n" | |
490 | "# Users may change this section to create a different number or size of\n" | |
491 | "# malloc LUNs.\n" | |
492 | "# This will generate %d LUNs with a malloc-allocated backend. Each LUN \n" | |
493 | "# will be %" PRIu64 "MB in size and these will be named Malloc0 through Malloc%d.\n" | |
494 | "# Not all LUNs defined here are necessarily used below.\n" | |
495 | "[Malloc]\n" | |
496 | " NumberOfLuns %d\n" | |
497 | " LunSizeInMB %" PRIu64 "\n", | |
498 | num_malloc_luns, malloc_lun_size, | |
499 | num_malloc_luns - 1, num_malloc_luns, | |
500 | malloc_lun_size); | |
501 | } | |
502 | } | |
503 | ||
/* Register the "malloc" trace flag so SPDK_TRACELOG(SPDK_TRACE_MALLOC, ...) can be enabled at runtime. */
SPDK_LOG_REGISTER_TRACE_FLAG("malloc", SPDK_TRACE_MALLOC)