1 /*
2 * QEMU NVM Express Controller
3 *
4 * Copyright (c) 2012, Intel Corporation
5 *
6 * Written by Keith Busch <keith.busch@intel.com>
7 *
8 * This code is licensed under the GNU GPL v2 or later.
9 */
10
11 /**
12 * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
13 *
14 * https://nvmexpress.org/developers/nvme-specification/
15 *
16 *
17 * Notes on coding style
18 * ---------------------
19 * While QEMU coding style prefers lowercase hexadecimals in constants, the
 20  * NVMe subsystem uses the format from the NVMe specifications in the comments
21 * (i.e. 'h' suffix instead of '0x' prefix).
22 *
23 * Usage
24 * -----
25 * See docs/system/nvme.rst for extensive documentation.
26 *
27 * Add options:
28 * -drive file=<file>,if=none,id=<drive_id>
29 * -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
30 * -device nvme,serial=<serial>,id=<bus_name>, \
31 * cmb_size_mb=<cmb_size_mb[optional]>, \
32 * [pmrdev=<mem_backend_file_id>,] \
33 * max_ioqpairs=<N[optional]>, \
34 * aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
35 * mdts=<N[optional]>,vsl=<N[optional]>, \
36 * zoned.zasl=<N[optional]>, \
37 * zoned.auto_transition=<on|off[optional]>, \
38 * sriov_max_vfs=<N[optional]> \
39 * sriov_vq_flexible=<N[optional]> \
40 * sriov_vi_flexible=<N[optional]> \
41 * sriov_max_vi_per_vf=<N[optional]> \
42 * sriov_max_vq_per_vf=<N[optional]> \
43 * subsys=<subsys_id>
44 * -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
45 * zoned=<true|false[optional]>, \
46 * subsys=<subsys_id>,shared=<true|false[optional]>, \
47 * detached=<true|false[optional]>, \
48 * zoned.zone_size=<N[optional]>, \
49 * zoned.zone_capacity=<N[optional]>, \
50 * zoned.descr_ext_size=<N[optional]>, \
51 * zoned.max_active=<N[optional]>, \
52 * zoned.max_open=<N[optional]>, \
53 * zoned.cross_read=<true|false[optional]>
54 *
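 * As a purely illustrative example (the image file, serial and ids below are
 * hypothetical; see docs/system/nvme.rst for authoritative usage), a zoned
 * namespace shared through a subsystem could be configured with:
 *   -drive file=nvm.img,if=none,id=nvm0
 *   -device nvme-subsys,id=subsys0,nqn=subsys0
 *   -device nvme,serial=deadbeef,id=nvme0,subsys=subsys0
 *   -device nvme-ns,drive=nvm0,bus=nvme0,nsid=1,zoned=true,shared=true
 *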
 55  * Note cmb_size_mb denotes the size of the CMB in MB. The CMB is assumed to be at
56 * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the
57 * device will use the "v1.4 CMB scheme" - use the `legacy-cmb` parameter to
58 * always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
59 *
 60  * Enabling PMR emulation can be achieved by pointing `pmrdev` to a
 61  * memory-backend-file object. For example:
62 * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
63 * size=<size> .... -device nvme,...,pmrdev=<mem_id>
64 *
65 * The PMR will use BAR 4/5 exclusively.
66 *
 67  * To place controller(s) and namespace(s) in a subsystem, provide the
 68  * nvme-subsys device as shown above.
69 *
70 * nvme subsystem device parameters
71 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
72 * - `nqn`
73 * This parameter provides the `<nqn_id>` part of the string
74 * `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
75 * of subsystem controllers. Note that `<nqn_id>` should be unique per
76 * subsystem, but this is not enforced by QEMU. If not specified, it will
77 * default to the value of the `id` parameter (`<subsys_id>`).
78 *
79 * nvme device parameters
80 * ~~~~~~~~~~~~~~~~~~~~~~
81 * - `subsys`
82 * Specifying this parameter attaches the controller to the subsystem and
83 * the SUBNQN field in the controller will report the NQN of the subsystem
 84  *   device. This also enables the multi-controller capability, reported in
 85  *   the CMIC (Controller Multi-path I/O and Namespace Sharing Capabilities)
 86  *   field of the Identify Controller data structure.
87 *
88 * - `aerl`
89 * The Asynchronous Event Request Limit (AERL). Indicates the maximum number
 90  *   of concurrently outstanding Asynchronous Event Request commands supported
91 * by the controller. This is a 0's based value.
92 *
93 * - `aer_max_queued`
94 * This is the maximum number of events that the device will enqueue for
95 * completion when there are no outstanding AERs. When the maximum number of
 96  *   enqueued events is reached, subsequent events will be dropped.
97 *
98 * - `mdts`
99 * Indicates the maximum data transfer size for a command that transfers data
100 * between host-accessible memory and the controller. The value is specified
101 * as a power of two (2^n) and is in units of the minimum memory page size
102 * (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
103 *
104 * - `vsl`
105 * Indicates the maximum data size limit for the Verify command. Like `mdts`,
106 * this value is specified as a power of two (2^n) and is in units of the
107 * minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
108 * KiB).
109 *
110 * - `zoned.zasl`
111 * Indicates the maximum data transfer size for the Zone Append command. Like
112 * `mdts`, the value is specified as a power of two (2^n) and is in units of
113 * the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
114 * defaulting to the value of `mdts`).
115 *
116 * - `zoned.auto_transition`
117 * Indicates if zones in zone state implicitly opened can be automatically
118 * transitioned to zone state closed for resource management purposes.
119 * Defaults to 'on'.
120 *
121 * - `sriov_max_vfs`
122 * Indicates the maximum number of PCIe virtual functions supported
123 * by the controller. The default value is 0. Specifying a non-zero value
124 * enables reporting of both SR-IOV and ARI capabilities by the NVMe device.
125 * Virtual function controllers will not report SR-IOV capability.
126 *
127 * NOTE: Single Root I/O Virtualization support is experimental.
128 * All the related parameters may be subject to change.
129 *
130 * - `sriov_vq_flexible`
131 * Indicates the total number of flexible queue resources assignable to all
132 * the secondary controllers. Implicitly sets the number of primary
133 * controller's private resources to `(max_ioqpairs - sriov_vq_flexible)`.
134 *
135 * - `sriov_vi_flexible`
136 * Indicates the total number of flexible interrupt resources assignable to
137 * all the secondary controllers. Implicitly sets the number of primary
138 * controller's private resources to `(msix_qsize - sriov_vi_flexible)`.
139 *
140 * - `sriov_max_vi_per_vf`
141 * Indicates the maximum number of virtual interrupt resources assignable
142 * to a secondary controller. The default 0 resolves to
143 * `(sriov_vi_flexible / sriov_max_vfs)`.
144 *
145 * - `sriov_max_vq_per_vf`
146 * Indicates the maximum number of virtual queue resources assignable to
147 * a secondary controller. The default 0 resolves to
148 * `(sriov_vq_flexible / sriov_max_vfs)`.
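 *
 *   As a worked example (illustrative values only): with `max_ioqpairs=26`,
 *   `sriov_max_vfs=4` and `sriov_vq_flexible=8`, the primary controller keeps
 *   26 - 8 = 18 private queue resources and the default `sriov_max_vq_per_vf`
 *   of 0 resolves to 8 / 4 = 2 flexible queue resources per secondary
 *   controller.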
149 *
150 * nvme namespace device parameters
151 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
152 * - `shared`
153 * When the parent nvme device (as defined explicitly by the 'bus' parameter
154 * or implicitly by the most recently defined NvmeBus) is linked to an
155 * nvme-subsys device, the namespace will be attached to all controllers in
156 * the subsystem. If set to 'off' (the default), the namespace will remain a
157 * private namespace and may only be attached to a single controller at a
158 * time.
159 *
160 * - `detached`
161 * This parameter is only valid together with the `subsys` parameter. If left
162 * at the default value (`false/off`), the namespace will be attached to all
163 * controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
164 * namespace will be available in the subsystem but not attached to any
165 * controllers.
166 *
 167  * Setting `zoned` to true selects the Zoned Command Set for the namespace.
168 * In this case, the following namespace properties are available to configure
169 * zoned operation:
170 * zoned.zone_size=<zone size in bytes, default: 128MiB>
 171  *     The number may be followed by K, M or G, as in kilo-, mega- or gigabytes.
172 *
173 * zoned.zone_capacity=<zone capacity in bytes, default: zone size>
174 * The value 0 (default) forces zone capacity to be the same as zone
175 * size. The value of this property may not exceed zone size.
176 *
177 * zoned.descr_ext_size=<zone descriptor extension size, default 0>
178 * This value needs to be specified in 64B units. If it is zero,
179 * namespace(s) will not support zone descriptor extensions.
180 *
181 * zoned.max_active=<Maximum Active Resources (zones), default: 0>
182 * The default value means there is no limit to the number of
183 * concurrently active zones.
184 *
185 * zoned.max_open=<Maximum Open Resources (zones), default: 0>
186 * The default value means there is no limit to the number of
187 * concurrently open zones.
188 *
189 * zoned.cross_read=<enable RAZB, default: false>
190 * Setting this property to true enables Read Across Zone Boundaries.
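 *
 * A hypothetical zoned configuration combining the properties above
 * (illustrative values):
 *   -device nvme-ns,drive=nvm0,bus=nvme0,nsid=1,zoned=true, \
 *           zoned.zone_size=64M,zoned.max_open=16,zoned.max_active=32, \
 *           zoned.cross_read=true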
191 */
192
193 #include "qemu/osdep.h"
194 #include "qemu/cutils.h"
195 #include "qemu/error-report.h"
196 #include "qemu/log.h"
197 #include "qemu/units.h"
198 #include "qemu/range.h"
199 #include "qapi/error.h"
200 #include "qapi/visitor.h"
201 #include "sysemu/sysemu.h"
202 #include "sysemu/block-backend.h"
203 #include "sysemu/hostmem.h"
204 #include "hw/pci/msix.h"
205 #include "hw/pci/pcie_sriov.h"
206 #include "migration/vmstate.h"
207
208 #include "nvme.h"
209 #include "dif.h"
210 #include "trace.h"
211
212 #define NVME_MAX_IOQPAIRS 0xffff
213 #define NVME_DB_SIZE 4
214 #define NVME_SPEC_VER 0x00010400
215 #define NVME_CMB_BIR 2
216 #define NVME_PMR_BIR 4
217 #define NVME_TEMPERATURE 0x143
218 #define NVME_TEMPERATURE_WARNING 0x157
219 #define NVME_TEMPERATURE_CRITICAL 0x175
220 #define NVME_NUM_FW_SLOTS 1
221 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
222 #define NVME_MAX_VFS 127
223 #define NVME_VF_RES_GRANULARITY 1
224 #define NVME_VF_OFFSET 0x1
225 #define NVME_VF_STRIDE 1
226
227 #define NVME_GUEST_ERR(trace, fmt, ...) \
228 do { \
229 (trace_##trace)(__VA_ARGS__); \
230 qemu_log_mask(LOG_GUEST_ERROR, #trace \
231 " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
232 } while (0)
233
234 static const bool nvme_feature_support[NVME_FID_MAX] = {
235 [NVME_ARBITRATION] = true,
236 [NVME_POWER_MANAGEMENT] = true,
237 [NVME_TEMPERATURE_THRESHOLD] = true,
238 [NVME_ERROR_RECOVERY] = true,
239 [NVME_VOLATILE_WRITE_CACHE] = true,
240 [NVME_NUMBER_OF_QUEUES] = true,
241 [NVME_INTERRUPT_COALESCING] = true,
242 [NVME_INTERRUPT_VECTOR_CONF] = true,
243 [NVME_WRITE_ATOMICITY] = true,
244 [NVME_ASYNCHRONOUS_EVENT_CONF] = true,
245 [NVME_TIMESTAMP] = true,
246 [NVME_HOST_BEHAVIOR_SUPPORT] = true,
247 [NVME_COMMAND_SET_PROFILE] = true,
248 [NVME_FDP_MODE] = true,
249 [NVME_FDP_EVENTS] = true,
250 };
251
252 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
253 [NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE,
254 [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
255 [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE,
256 [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
257 [NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE,
258 [NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE,
259 [NVME_HOST_BEHAVIOR_SUPPORT] = NVME_FEAT_CAP_CHANGE,
260 [NVME_COMMAND_SET_PROFILE] = NVME_FEAT_CAP_CHANGE,
261 [NVME_FDP_MODE] = NVME_FEAT_CAP_CHANGE,
262 [NVME_FDP_EVENTS] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
263 };
264
265 static const uint32_t nvme_cse_acs[256] = {
266 [NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFF_CSUPP,
267 [NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFF_CSUPP,
268 [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
269 [NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFF_CSUPP,
270 [NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFF_CSUPP,
271 [NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
272 [NVME_ADM_CMD_ABORT] = NVME_CMD_EFF_CSUPP,
273 [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
274 [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
275 [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
276 [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
277 [NVME_ADM_CMD_VIRT_MNGMT] = NVME_CMD_EFF_CSUPP,
278 [NVME_ADM_CMD_DBBUF_CONFIG] = NVME_CMD_EFF_CSUPP,
279 [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
280 [NVME_ADM_CMD_DIRECTIVE_RECV] = NVME_CMD_EFF_CSUPP,
281 [NVME_ADM_CMD_DIRECTIVE_SEND] = NVME_CMD_EFF_CSUPP,
282 };
283
284 static const uint32_t nvme_cse_iocs_none[256];
285
286 static const uint32_t nvme_cse_iocs_nvm[256] = {
287 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
288 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
289 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
290 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
291 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
292 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
293 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
294 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
295 [NVME_CMD_IO_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
296 [NVME_CMD_IO_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
297 };
298
299 static const uint32_t nvme_cse_iocs_zoned[256] = {
300 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
301 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
302 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
303 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
304 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
305 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
306 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
307 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
308 [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
309 [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
310 [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
311 };
312
313 static void nvme_process_sq(void *opaque);
314 static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst);
315 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n);
316
317 static uint16_t nvme_sqid(NvmeRequest *req)
318 {
319 return le16_to_cpu(req->sq->sqid);
320 }
321
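/*
 * Compose an FDP placement identifier (PID) from a reclaim group index and a
 * placement handle. When a reclaim group identifier format (RGIF) is in use,
 * the reclaim group occupies the upper RGIF bits of the PID.
 */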
322 static inline uint16_t nvme_make_pid(NvmeNamespace *ns, uint16_t rg,
323 uint16_t ph)
324 {
325 uint16_t rgif = ns->endgrp->fdp.rgif;
326
327 if (!rgif) {
328 return ph;
329 }
330
331 return (rg << (16 - rgif)) | ph;
332 }
333
334 static inline bool nvme_ph_valid(NvmeNamespace *ns, uint16_t ph)
335 {
336 return ph < ns->fdp.nphs;
337 }
338
339 static inline bool nvme_rg_valid(NvmeEnduranceGroup *endgrp, uint16_t rg)
340 {
341 return rg < endgrp->fdp.nrg;
342 }
343
344 static inline uint16_t nvme_pid2ph(NvmeNamespace *ns, uint16_t pid)
345 {
346 uint16_t rgif = ns->endgrp->fdp.rgif;
347
348 if (!rgif) {
349 return pid;
350 }
351
352 return pid & ((1 << (15 - rgif)) - 1);
353 }
354
355 static inline uint16_t nvme_pid2rg(NvmeNamespace *ns, uint16_t pid)
356 {
357 uint16_t rgif = ns->endgrp->fdp.rgif;
358
359 if (!rgif) {
360 return 0;
361 }
362
363 return pid >> (16 - rgif);
364 }
365
366 static inline bool nvme_parse_pid(NvmeNamespace *ns, uint16_t pid,
367 uint16_t *ph, uint16_t *rg)
368 {
369 *rg = nvme_pid2rg(ns, pid);
370 *ph = nvme_pid2ph(ns, pid);
371
372 return nvme_ph_valid(ns, *ph) && nvme_rg_valid(ns->endgrp, *rg);
373 }
374
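/*
 * Move a zone to `state`: drop it from the per-state list it currently sits
 * on (if any) and enqueue it on the list matching the new state. Zones that
 * end up Empty or Offline also get their zone attributes cleared.
 */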
375 static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
376 NvmeZoneState state)
377 {
378 if (QTAILQ_IN_USE(zone, entry)) {
379 switch (nvme_get_zone_state(zone)) {
380 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
381 QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
382 break;
383 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
384 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
385 break;
386 case NVME_ZONE_STATE_CLOSED:
387 QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
388 break;
389 case NVME_ZONE_STATE_FULL:
390 QTAILQ_REMOVE(&ns->full_zones, zone, entry);
391 default:
392 ;
393 }
394 }
395
396 nvme_set_zone_state(zone, state);
397
398 switch (state) {
399 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
400 QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
401 break;
402 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
403 QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
404 break;
405 case NVME_ZONE_STATE_CLOSED:
406 QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
407 break;
408 case NVME_ZONE_STATE_FULL:
409 QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
410 case NVME_ZONE_STATE_READ_ONLY:
411 break;
412 default:
413 zone->d.za = 0;
414 }
415 }
416
417 static uint16_t nvme_zns_check_resources(NvmeNamespace *ns, uint32_t act,
418 uint32_t opn, uint32_t zrwa)
419 {
420 if (ns->params.max_active_zones != 0 &&
421 ns->nr_active_zones + act > ns->params.max_active_zones) {
422 trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
423 return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
424 }
425
426 if (ns->params.max_open_zones != 0 &&
427 ns->nr_open_zones + opn > ns->params.max_open_zones) {
428 trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
429 return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
430 }
431
432 if (zrwa > ns->zns.numzrwa) {
433 return NVME_NOZRWA | NVME_DNR;
434 }
435
436 return NVME_SUCCESS;
437 }
438
439 /*
440 * Check if we can open a zone without exceeding open/active limits.
441 * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
442 */
443 static uint16_t nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
444 {
445 return nvme_zns_check_resources(ns, act, opn, 0);
446 }
447
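/*
 * Claim the next slot in the circular FDP event buffer, evicting the oldest
 * event when the buffer is full, and return the zeroed, timestamped entry.
 */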
448 static NvmeFdpEvent *nvme_fdp_alloc_event(NvmeCtrl *n, NvmeFdpEventBuffer *ebuf)
449 {
450 NvmeFdpEvent *ret = NULL;
451 bool is_full = ebuf->next == ebuf->start && ebuf->nelems;
452
453 ret = &ebuf->events[ebuf->next++];
454 if (unlikely(ebuf->next == NVME_FDP_MAX_EVENTS)) {
455 ebuf->next = 0;
456 }
457 if (is_full) {
458 ebuf->start = ebuf->next;
459 } else {
460 ebuf->nelems++;
461 }
462
463 memset(ret, 0, sizeof(NvmeFdpEvent));
464 ret->timestamp = nvme_get_timestamp(n);
465
466 return ret;
467 }
468
469 static inline int log_event(NvmeRuHandle *ruh, uint8_t event_type)
470 {
471 return (ruh->event_filter >> nvme_fdp_evf_shifts[event_type]) & 0x1;
472 }
473
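/*
 * Reset the reclaim unit addressed by the placement identifier to its full
 * available media writes. If the unit still had unwritten capacity, an "RU
 * not fully written" FDP event is logged (if enabled by the event filter)
 * along with the implied media write overhead. Returns false if the
 * placement identifier is invalid.
 */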
474 static bool nvme_update_ruh(NvmeCtrl *n, NvmeNamespace *ns, uint16_t pid)
475 {
476 NvmeEnduranceGroup *endgrp = ns->endgrp;
477 NvmeRuHandle *ruh;
478 NvmeReclaimUnit *ru;
479 NvmeFdpEvent *e = NULL;
480 uint16_t ph, rg, ruhid;
481
482 if (!nvme_parse_pid(ns, pid, &ph, &rg)) {
483 return false;
484 }
485
486 ruhid = ns->fdp.phs[ph];
487
488 ruh = &endgrp->fdp.ruhs[ruhid];
489 ru = &ruh->rus[rg];
490
491 if (ru->ruamw) {
492 if (log_event(ruh, FDP_EVT_RU_NOT_FULLY_WRITTEN)) {
493 e = nvme_fdp_alloc_event(n, &endgrp->fdp.host_events);
494 e->type = FDP_EVT_RU_NOT_FULLY_WRITTEN;
495 e->flags = FDPEF_PIV | FDPEF_NSIDV | FDPEF_LV;
496 e->pid = cpu_to_le16(pid);
497 e->nsid = cpu_to_le32(ns->params.nsid);
498 e->rgid = cpu_to_le16(rg);
499 e->ruhid = cpu_to_le16(ruhid);
500 }
501
502 /* log (eventual) GC overhead of prematurely swapping the RU */
503 nvme_fdp_stat_inc(&endgrp->fdp.mbmw, nvme_l2b(ns, ru->ruamw));
504 }
505
506 ru->ruamw = ruh->ruamw;
507
508 return true;
509 }
510
511 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
512 {
513 hwaddr hi, lo;
514
515 if (!n->cmb.cmse) {
516 return false;
517 }
518
519 lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
520 hi = lo + int128_get64(n->cmb.mem.size);
521
522 return addr >= lo && addr < hi;
523 }
524
525 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
526 {
527 hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
528 return &n->cmb.buf[addr - base];
529 }
530
531 static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
532 {
533 hwaddr hi;
534
535 if (!n->pmr.cmse) {
536 return false;
537 }
538
539 hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
540
541 return addr >= n->pmr.cba && addr < hi;
542 }
543
544 static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
545 {
546 return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
547 }
548
549 static inline bool nvme_addr_is_iomem(NvmeCtrl *n, hwaddr addr)
550 {
551 hwaddr hi, lo;
552
553 /*
554 * The purpose of this check is to guard against invalid "local" access to
555 * the iomem (i.e. controller registers). Thus, we check against the range
556 * covered by the 'bar0' MemoryRegion since that is currently composed of
557 * two subregions (the NVMe "MBAR" and the MSI-X table/pba). Note, however,
558 * that if the device model is ever changed to allow the CMB to be located
559 * in BAR0 as well, then this must be changed.
560 */
561 lo = n->bar0.addr;
562 hi = lo + int128_get64(n->bar0.size);
563
564 return addr >= lo && addr < hi;
565 }
566
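/*
 * Read `size` bytes at guest-visible address `addr` into `buf`, accessing the
 * CMB or PMR directly when the range lies entirely within one of them and
 * falling back to PCI DMA otherwise. Returns non-zero on failure (including
 * address overflow); nvme_addr_write() below is the mirror operation.
 */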
567 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
568 {
569 hwaddr hi = addr + size - 1;
570 if (hi < addr) {
571 return 1;
572 }
573
574 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
575 memcpy(buf, nvme_addr_to_cmb(n, addr), size);
576 return 0;
577 }
578
579 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
580 memcpy(buf, nvme_addr_to_pmr(n, addr), size);
581 return 0;
582 }
583
584 return pci_dma_read(PCI_DEVICE(n), addr, buf, size);
585 }
586
587 static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, const void *buf, int size)
588 {
589 hwaddr hi = addr + size - 1;
590 if (hi < addr) {
591 return 1;
592 }
593
594 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
595 memcpy(nvme_addr_to_cmb(n, addr), buf, size);
596 return 0;
597 }
598
599 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
600 memcpy(nvme_addr_to_pmr(n, addr), buf, size);
601 return 0;
602 }
603
604 return pci_dma_write(PCI_DEVICE(n), addr, buf, size);
605 }
606
607 static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
608 {
609 return nsid &&
610 (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
611 }
612
613 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
614 {
615 return sqid < n->conf_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
616 }
617
618 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
619 {
620 return cqid < n->conf_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
621 }
622
623 static void nvme_inc_cq_tail(NvmeCQueue *cq)
624 {
625 cq->tail++;
626 if (cq->tail >= cq->size) {
627 cq->tail = 0;
628 cq->phase = !cq->phase;
629 }
630 }
631
632 static void nvme_inc_sq_head(NvmeSQueue *sq)
633 {
634 sq->head = (sq->head + 1) % sq->size;
635 }
636
637 static uint8_t nvme_cq_full(NvmeCQueue *cq)
638 {
639 return (cq->tail + 1) % cq->size == cq->head;
640 }
641
642 static uint8_t nvme_sq_empty(NvmeSQueue *sq)
643 {
644 return sq->head == sq->tail;
645 }
646
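/*
 * Re-evaluate the level of the legacy (pin-based) interrupt from the
 * accumulated irq_status bits and the INTMS mask. Nothing to do when MSI-X is
 * enabled, since those vectors are signalled directly.
 */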
647 static void nvme_irq_check(NvmeCtrl *n)
648 {
649 PCIDevice *pci = PCI_DEVICE(n);
650 uint32_t intms = ldl_le_p(&n->bar.intms);
651
652 if (msix_enabled(pci)) {
653 return;
654 }
655 if (~intms & n->irq_status) {
656 pci_irq_assert(pci);
657 } else {
658 pci_irq_deassert(pci);
659 }
660 }
661
662 static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
663 {
664 PCIDevice *pci = PCI_DEVICE(n);
665
666 if (cq->irq_enabled) {
667 if (msix_enabled(pci)) {
668 trace_pci_nvme_irq_msix(cq->vector);
669 msix_notify(pci, cq->vector);
670 } else {
671 trace_pci_nvme_irq_pin();
672 assert(cq->vector < 32);
673 n->irq_status |= 1 << cq->vector;
674 nvme_irq_check(n);
675 }
676 } else {
677 trace_pci_nvme_irq_masked();
678 }
679 }
680
681 static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
682 {
683 if (cq->irq_enabled) {
684 if (msix_enabled(PCI_DEVICE(n))) {
685 return;
686 } else {
687 assert(cq->vector < 32);
688 if (!n->cq_pending) {
689 n->irq_status &= ~(1 << cq->vector);
690 }
691 nvme_irq_check(n);
692 }
693 }
694 }
695
696 static void nvme_req_clear(NvmeRequest *req)
697 {
698 req->ns = NULL;
699 req->opaque = NULL;
700 req->aiocb = NULL;
701 memset(&req->cqe, 0x0, sizeof(req->cqe));
702 req->status = NVME_SUCCESS;
703 }
704
705 static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
706 {
707 if (dma) {
708 pci_dma_sglist_init(&sg->qsg, PCI_DEVICE(n), 0);
709 sg->flags = NVME_SG_DMA;
710 } else {
711 qemu_iovec_init(&sg->iov, 0);
712 }
713
714 sg->flags |= NVME_SG_ALLOC;
715 }
716
717 static inline void nvme_sg_unmap(NvmeSg *sg)
718 {
719 if (!(sg->flags & NVME_SG_ALLOC)) {
720 return;
721 }
722
723 if (sg->flags & NVME_SG_DMA) {
724 qemu_sglist_destroy(&sg->qsg);
725 } else {
726 qemu_iovec_destroy(&sg->iov);
727 }
728
729 memset(sg, 0x0, sizeof(*sg));
730 }
731
732 /*
 733  * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
734 * holds both data and metadata. This function splits the data and metadata
735 * into two separate QSG/IOVs.
736 */
737 static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
738 NvmeSg *mdata)
739 {
740 NvmeSg *dst = data;
741 uint32_t trans_len, count = ns->lbasz;
742 uint64_t offset = 0;
743 bool dma = sg->flags & NVME_SG_DMA;
744 size_t sge_len;
745 size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
746 int sg_idx = 0;
747
748 assert(sg->flags & NVME_SG_ALLOC);
749
750 while (sg_len) {
751 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
752
753 trans_len = MIN(sg_len, count);
754 trans_len = MIN(trans_len, sge_len - offset);
755
756 if (dst) {
757 if (dma) {
758 qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
759 trans_len);
760 } else {
761 qemu_iovec_add(&dst->iov,
762 sg->iov.iov[sg_idx].iov_base + offset,
763 trans_len);
764 }
765 }
766
767 sg_len -= trans_len;
768 count -= trans_len;
769 offset += trans_len;
770
771 if (count == 0) {
772 dst = (dst == data) ? mdata : data;
773 count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
774 }
775
776 if (sge_len == offset) {
777 offset = 0;
778 sg_idx++;
779 }
780 }
781 }
782
783 static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
784 size_t len)
785 {
786 if (!len) {
787 return NVME_SUCCESS;
788 }
789
790 trace_pci_nvme_map_addr_cmb(addr, len);
791
792 if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
793 return NVME_DATA_TRAS_ERROR;
794 }
795
796 qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
797
798 return NVME_SUCCESS;
799 }
800
801 static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
802 size_t len)
803 {
804 if (!len) {
805 return NVME_SUCCESS;
806 }
807
808 if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
809 return NVME_DATA_TRAS_ERROR;
810 }
811
812 qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
813
814 return NVME_SUCCESS;
815 }
816
817 static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
818 {
819 bool cmb = false, pmr = false;
820
821 if (!len) {
822 return NVME_SUCCESS;
823 }
824
825 trace_pci_nvme_map_addr(addr, len);
826
827 if (nvme_addr_is_iomem(n, addr)) {
828 return NVME_DATA_TRAS_ERROR;
829 }
830
831 if (nvme_addr_is_cmb(n, addr)) {
832 cmb = true;
833 } else if (nvme_addr_is_pmr(n, addr)) {
834 pmr = true;
835 }
836
837 if (cmb || pmr) {
838 if (sg->flags & NVME_SG_DMA) {
839 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
840 }
841
842 if (sg->iov.niov + 1 > IOV_MAX) {
843 goto max_mappings_exceeded;
844 }
845
846 if (cmb) {
847 return nvme_map_addr_cmb(n, &sg->iov, addr, len);
848 } else {
849 return nvme_map_addr_pmr(n, &sg->iov, addr, len);
850 }
851 }
852
853 if (!(sg->flags & NVME_SG_DMA)) {
854 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
855 }
856
857 if (sg->qsg.nsg + 1 > IOV_MAX) {
858 goto max_mappings_exceeded;
859 }
860
861 qemu_sglist_add(&sg->qsg, addr, len);
862
863 return NVME_SUCCESS;
864
865 max_mappings_exceeded:
866 NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
867 "number of mappings exceed 1024");
868 return NVME_INTERNAL_DEV_ERROR | NVME_DNR;
869 }
870
871 static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
872 {
873 return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
874 }
875
876 static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
877 uint64_t prp2, uint32_t len)
878 {
879 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
880 trans_len = MIN(len, trans_len);
881 int num_prps = (len >> n->page_bits) + 1;
882 uint16_t status;
883 int ret;
884
885 trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
886
887 nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));
888
889 status = nvme_map_addr(n, sg, prp1, trans_len);
890 if (status) {
891 goto unmap;
892 }
893
894 len -= trans_len;
895 if (len) {
896 if (len > n->page_size) {
897 uint64_t prp_list[n->max_prp_ents];
898 uint32_t nents, prp_trans;
899 int i = 0;
900
901 /*
 902              * The first PRP list entry, pointed to by PRP2, may contain an offset.
 903              * Hence, we need to calculate the number of entries based on that
 904              * offset.
905 */
906 nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
907 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
908 ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
909 if (ret) {
910 trace_pci_nvme_err_addr_read(prp2);
911 status = NVME_DATA_TRAS_ERROR;
912 goto unmap;
913 }
914 while (len != 0) {
915 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
916
917 if (i == nents - 1 && len > n->page_size) {
918 if (unlikely(prp_ent & (n->page_size - 1))) {
919 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
920 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
921 goto unmap;
922 }
923
924 i = 0;
925 nents = (len + n->page_size - 1) >> n->page_bits;
926 nents = MIN(nents, n->max_prp_ents);
927 prp_trans = nents * sizeof(uint64_t);
928 ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
929 prp_trans);
930 if (ret) {
931 trace_pci_nvme_err_addr_read(prp_ent);
932 status = NVME_DATA_TRAS_ERROR;
933 goto unmap;
934 }
935 prp_ent = le64_to_cpu(prp_list[i]);
936 }
937
938 if (unlikely(prp_ent & (n->page_size - 1))) {
939 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
940 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
941 goto unmap;
942 }
943
944 trans_len = MIN(len, n->page_size);
945 status = nvme_map_addr(n, sg, prp_ent, trans_len);
946 if (status) {
947 goto unmap;
948 }
949
950 len -= trans_len;
951 i++;
952 }
953 } else {
954 if (unlikely(prp2 & (n->page_size - 1))) {
955 trace_pci_nvme_err_invalid_prp2_align(prp2);
956 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
957 goto unmap;
958 }
959 status = nvme_map_addr(n, sg, prp2, len);
960 if (status) {
961 goto unmap;
962 }
963 }
964 }
965
966 return NVME_SUCCESS;
967
968 unmap:
969 nvme_sg_unmap(sg);
970 return status;
971 }
972
973 /*
 974  * Map 'nsgld' data descriptors from 'segment'. The function subtracts the
 975  * number of bytes mapped from *len.
976 */
977 static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
978 NvmeSglDescriptor *segment, uint64_t nsgld,
979 size_t *len, NvmeCmd *cmd)
980 {
981 dma_addr_t addr, trans_len;
982 uint32_t dlen;
983 uint16_t status;
984
985 for (int i = 0; i < nsgld; i++) {
986 uint8_t type = NVME_SGL_TYPE(segment[i].type);
987
988 switch (type) {
989 case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
990 break;
991 case NVME_SGL_DESCR_TYPE_SEGMENT:
992 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
993 return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
994 default:
995 return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
996 }
997
998 dlen = le32_to_cpu(segment[i].len);
999
1000 if (!dlen) {
1001 continue;
1002 }
1003
1004 if (*len == 0) {
1005 /*
1006 * All data has been mapped, but the SGL contains additional
1007 * segments and/or descriptors. The controller might accept
1008 * ignoring the rest of the SGL.
1009 */
1010 uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
1011 if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
1012 break;
1013 }
1014
1015 trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
1016 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1017 }
1018
1019 trans_len = MIN(*len, dlen);
1020
1021 addr = le64_to_cpu(segment[i].addr);
1022
1023 if (UINT64_MAX - addr < dlen) {
1024 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1025 }
1026
1027 status = nvme_map_addr(n, sg, addr, trans_len);
1028 if (status) {
1029 return status;
1030 }
1031
1032 *len -= trans_len;
1033 }
1034
1035 return NVME_SUCCESS;
1036 }
1037
1038 static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
1039 size_t len, NvmeCmd *cmd)
1040 {
1041 /*
1042 * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
1043 * dynamically allocating a potentially huge SGL. The spec allows the SGL
1044 * to be larger (as in number of bytes required to describe the SGL
1045 * descriptors and segment chain) than the command transfer size, so it is
1046 * not bounded by MDTS.
1047 */
1048 const int SEG_CHUNK_SIZE = 256;
1049
1050 NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
1051 uint64_t nsgld;
1052 uint32_t seg_len;
1053 uint16_t status;
1054 hwaddr addr;
1055 int ret;
1056
1057 sgld = &sgl;
1058 addr = le64_to_cpu(sgl.addr);
1059
1060 trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);
1061
1062 nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));
1063
1064 /*
1065 * If the entire transfer can be described with a single data block it can
1066 * be mapped directly.
1067 */
1068 if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
1069 status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
1070 if (status) {
1071 goto unmap;
1072 }
1073
1074 goto out;
1075 }
1076
1077 for (;;) {
1078 switch (NVME_SGL_TYPE(sgld->type)) {
1079 case NVME_SGL_DESCR_TYPE_SEGMENT:
1080 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
1081 break;
1082 default:
1083 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1084 }
1085
1086 seg_len = le32_to_cpu(sgld->len);
1087
1088 /* check the length of the (Last) Segment descriptor */
1089 if (!seg_len || seg_len & 0xf) {
1090 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1091 }
1092
1093 if (UINT64_MAX - addr < seg_len) {
1094 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1095 }
1096
1097 nsgld = seg_len / sizeof(NvmeSglDescriptor);
1098
1099 while (nsgld > SEG_CHUNK_SIZE) {
1100 if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
1101 trace_pci_nvme_err_addr_read(addr);
1102 status = NVME_DATA_TRAS_ERROR;
1103 goto unmap;
1104 }
1105
1106 status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
1107 &len, cmd);
1108 if (status) {
1109 goto unmap;
1110 }
1111
1112 nsgld -= SEG_CHUNK_SIZE;
1113 addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
1114 }
1115
1116 ret = nvme_addr_read(n, addr, segment, nsgld *
1117 sizeof(NvmeSglDescriptor));
1118 if (ret) {
1119 trace_pci_nvme_err_addr_read(addr);
1120 status = NVME_DATA_TRAS_ERROR;
1121 goto unmap;
1122 }
1123
1124 last_sgld = &segment[nsgld - 1];
1125
1126 /*
1127 * If the segment ends with a Data Block, then we are done.
1128 */
1129 if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
1130 status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
1131 if (status) {
1132 goto unmap;
1133 }
1134
1135 goto out;
1136 }
1137
1138 /*
1139 * If the last descriptor was not a Data Block, then the current
1140 * segment must not be a Last Segment.
1141 */
1142 if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
1143 status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1144 goto unmap;
1145 }
1146
1147 sgld = last_sgld;
1148 addr = le64_to_cpu(sgld->addr);
1149
1150 /*
1151 * Do not map the last descriptor; it will be a Segment or Last Segment
1152 * descriptor and is handled by the next iteration.
1153 */
1154 status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
1155 if (status) {
1156 goto unmap;
1157 }
1158 }
1159
1160 out:
1161 /* if there is any residual left in len, the SGL was too short */
1162 if (len) {
1163 status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1164 goto unmap;
1165 }
1166
1167 return NVME_SUCCESS;
1168
1169 unmap:
1170 nvme_sg_unmap(sg);
1171 return status;
1172 }
1173
1174 uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
1175 NvmeCmd *cmd)
1176 {
1177 uint64_t prp1, prp2;
1178
1179 switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
1180 case NVME_PSDT_PRP:
1181 prp1 = le64_to_cpu(cmd->dptr.prp1);
1182 prp2 = le64_to_cpu(cmd->dptr.prp2);
1183
1184 return nvme_map_prp(n, sg, prp1, prp2, len);
1185 case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
1186 case NVME_PSDT_SGL_MPTR_SGL:
1187 return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
1188 default:
1189 return NVME_INVALID_FIELD;
1190 }
1191 }
1192
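/*
 * Map the Metadata Pointer (MPTR). Depending on PSDT, MPTR either addresses a
 * single contiguous buffer or an SGL descriptor that is fetched from guest
 * memory and mapped like a data SGL.
 */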
1193 static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
1194 NvmeCmd *cmd)
1195 {
1196 int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
1197 hwaddr mptr = le64_to_cpu(cmd->mptr);
1198 uint16_t status;
1199
1200 if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
1201 NvmeSglDescriptor sgl;
1202
1203 if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
1204 return NVME_DATA_TRAS_ERROR;
1205 }
1206
1207 status = nvme_map_sgl(n, sg, sgl, len, cmd);
1208 if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
1209 status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
1210 }
1211
1212 return status;
1213 }
1214
1215 nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
1216 status = nvme_map_addr(n, sg, mptr, len);
1217 if (status) {
1218 nvme_sg_unmap(sg);
1219 }
1220
1221 return status;
1222 }
1223
1224 static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1225 {
1226 NvmeNamespace *ns = req->ns;
1227 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1228 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1229 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1230 size_t len = nvme_l2b(ns, nlb);
1231 uint16_t status;
1232
1233 if (nvme_ns_ext(ns) &&
1234 !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
1235 NvmeSg sg;
1236
1237 len += nvme_m2b(ns, nlb);
1238
1239 status = nvme_map_dptr(n, &sg, len, &req->cmd);
1240 if (status) {
1241 return status;
1242 }
1243
1244 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1245 nvme_sg_split(&sg, ns, &req->sg, NULL);
1246 nvme_sg_unmap(&sg);
1247
1248 return NVME_SUCCESS;
1249 }
1250
1251 return nvme_map_dptr(n, &req->sg, len, &req->cmd);
1252 }
1253
1254 static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1255 {
1256 NvmeNamespace *ns = req->ns;
1257 size_t len = nvme_m2b(ns, nlb);
1258 uint16_t status;
1259
1260 if (nvme_ns_ext(ns)) {
1261 NvmeSg sg;
1262
1263 len += nvme_l2b(ns, nlb);
1264
1265 status = nvme_map_dptr(n, &sg, len, &req->cmd);
1266 if (status) {
1267 return status;
1268 }
1269
1270 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1271 nvme_sg_split(&sg, ns, NULL, &req->sg);
1272 nvme_sg_unmap(&sg);
1273
1274 return NVME_SUCCESS;
1275 }
1276
1277 return nvme_map_mptr(n, &req->sg, len, &req->cmd);
1278 }
1279
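/*
 * Copy `len` bytes between `ptr` and the mapped scatter/gather list in chunks
 * of `bytes`, skipping `skip_bytes` after each chunk and starting `offset`
 * bytes into the mapping. This is used to pick either the data or the
 * metadata portion out of an extended-LBA (interleaved) transfer.
 */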
1280 static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
1281 uint32_t len, uint32_t bytes,
1282 int32_t skip_bytes, int64_t offset,
1283 NvmeTxDirection dir)
1284 {
1285 hwaddr addr;
1286 uint32_t trans_len, count = bytes;
1287 bool dma = sg->flags & NVME_SG_DMA;
1288 int64_t sge_len;
1289 int sg_idx = 0;
1290 int ret;
1291
1292 assert(sg->flags & NVME_SG_ALLOC);
1293
1294 while (len) {
1295 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
1296
1297 if (sge_len - offset < 0) {
1298 offset -= sge_len;
1299 sg_idx++;
1300 continue;
1301 }
1302
1303 if (sge_len == offset) {
1304 offset = 0;
1305 sg_idx++;
1306 continue;
1307 }
1308
1309 trans_len = MIN(len, count);
1310 trans_len = MIN(trans_len, sge_len - offset);
1311
1312 if (dma) {
1313 addr = sg->qsg.sg[sg_idx].base + offset;
1314 } else {
1315 addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
1316 }
1317
1318 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1319 ret = nvme_addr_read(n, addr, ptr, trans_len);
1320 } else {
1321 ret = nvme_addr_write(n, addr, ptr, trans_len);
1322 }
1323
1324 if (ret) {
1325 return NVME_DATA_TRAS_ERROR;
1326 }
1327
1328 ptr += trans_len;
1329 len -= trans_len;
1330 count -= trans_len;
1331 offset += trans_len;
1332
1333 if (count == 0) {
1334 count = bytes;
1335 offset += skip_bytes;
1336 }
1337 }
1338
1339 return NVME_SUCCESS;
1340 }
1341
1342 static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, void *ptr, uint32_t len,
1343 NvmeTxDirection dir)
1344 {
1345 assert(sg->flags & NVME_SG_ALLOC);
1346
1347 if (sg->flags & NVME_SG_DMA) {
1348 const MemTxAttrs attrs = MEMTXATTRS_UNSPECIFIED;
1349 dma_addr_t residual;
1350
1351 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1352 dma_buf_write(ptr, len, &residual, &sg->qsg, attrs);
1353 } else {
1354 dma_buf_read(ptr, len, &residual, &sg->qsg, attrs);
1355 }
1356
1357 if (unlikely(residual)) {
1358 trace_pci_nvme_err_invalid_dma();
1359 return NVME_INVALID_FIELD | NVME_DNR;
1360 }
1361 } else {
1362 size_t bytes;
1363
1364 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1365 bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
1366 } else {
1367 bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
1368 }
1369
1370 if (unlikely(bytes != len)) {
1371 trace_pci_nvme_err_invalid_dma();
1372 return NVME_INVALID_FIELD | NVME_DNR;
1373 }
1374 }
1375
1376 return NVME_SUCCESS;
1377 }
1378
1379 static inline uint16_t nvme_c2h(NvmeCtrl *n, void *ptr, uint32_t len,
1380 NvmeRequest *req)
1381 {
1382 uint16_t status;
1383
1384 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1385 if (status) {
1386 return status;
1387 }
1388
1389 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
1390 }
1391
1392 static inline uint16_t nvme_h2c(NvmeCtrl *n, void *ptr, uint32_t len,
1393 NvmeRequest *req)
1394 {
1395 uint16_t status;
1396
1397 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1398 if (status) {
1399 return status;
1400 }
1401
1402 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
1403 }
1404
1405 uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len,
1406 NvmeTxDirection dir, NvmeRequest *req)
1407 {
1408 NvmeNamespace *ns = req->ns;
1409 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1410 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1411 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1412
1413 if (nvme_ns_ext(ns) &&
1414 !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
1415 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
1416 ns->lbaf.ms, 0, dir);
1417 }
1418
1419 return nvme_tx(n, &req->sg, ptr, len, dir);
1420 }
1421
1422 uint16_t nvme_bounce_mdata(NvmeCtrl *n, void *ptr, uint32_t len,
1423 NvmeTxDirection dir, NvmeRequest *req)
1424 {
1425 NvmeNamespace *ns = req->ns;
1426 uint16_t status;
1427
1428 if (nvme_ns_ext(ns)) {
1429 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
1430 ns->lbasz, ns->lbasz, dir);
1431 }
1432
1433 nvme_sg_unmap(&req->sg);
1434
1435 status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
1436 if (status) {
1437 return status;
1438 }
1439
1440 return nvme_tx(n, &req->sg, ptr, len, dir);
1441 }
1442
1443 static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
1444 uint32_t align, BlockCompletionFunc *cb,
1445 NvmeRequest *req)
1446 {
1447 assert(req->sg.flags & NVME_SG_ALLOC);
1448
1449 if (req->sg.flags & NVME_SG_DMA) {
1450 req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, align, cb, req);
1451 } else {
1452 req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
1453 }
1454 }
1455
1456 static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
1457 uint32_t align, BlockCompletionFunc *cb,
1458 NvmeRequest *req)
1459 {
1460 assert(req->sg.flags & NVME_SG_ALLOC);
1461
1462 if (req->sg.flags & NVME_SG_DMA) {
1463 req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, align, cb, req);
1464 } else {
1465 req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
1466 }
1467 }
1468
1469 static void nvme_update_cq_eventidx(const NvmeCQueue *cq)
1470 {
1471 uint32_t v = cpu_to_le32(cq->head);
1472
1473 trace_pci_nvme_update_cq_eventidx(cq->cqid, cq->head);
1474
1475 pci_dma_write(PCI_DEVICE(cq->ctrl), cq->ei_addr, &v, sizeof(v));
1476 }
1477
1478 static void nvme_update_cq_head(NvmeCQueue *cq)
1479 {
1480 uint32_t v;
1481
1482 pci_dma_read(PCI_DEVICE(cq->ctrl), cq->db_addr, &v, sizeof(v));
1483
1484 cq->head = le32_to_cpu(v);
1485
1486 trace_pci_nvme_update_cq_head(cq->cqid, cq->head);
1487 }
1488
1489 static void nvme_post_cqes(void *opaque)
1490 {
1491 NvmeCQueue *cq = opaque;
1492 NvmeCtrl *n = cq->ctrl;
1493 NvmeRequest *req, *next;
1494 bool pending = cq->head != cq->tail;
1495 int ret;
1496
1497 QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
1498 NvmeSQueue *sq;
1499 hwaddr addr;
1500
1501 if (n->dbbuf_enabled) {
1502 nvme_update_cq_eventidx(cq);
1503 nvme_update_cq_head(cq);
1504 }
1505
1506 if (nvme_cq_full(cq)) {
1507 break;
1508 }
1509
1510 sq = req->sq;
1511 req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
1512 req->cqe.sq_id = cpu_to_le16(sq->sqid);
1513 req->cqe.sq_head = cpu_to_le16(sq->head);
1514 addr = cq->dma_addr + cq->tail * n->cqe_size;
1515 ret = pci_dma_write(PCI_DEVICE(n), addr, (void *)&req->cqe,
1516 sizeof(req->cqe));
1517 if (ret) {
1518 trace_pci_nvme_err_addr_write(addr);
1519 trace_pci_nvme_err_cfs();
1520 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
1521 break;
1522 }
1523 QTAILQ_REMOVE(&cq->req_list, req, entry);
1524 nvme_inc_cq_tail(cq);
1525 nvme_sg_unmap(&req->sg);
1526 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
1527 }
1528 if (cq->tail != cq->head) {
1529 if (cq->irq_enabled && !pending) {
1530 n->cq_pending++;
1531 }
1532
1533 nvme_irq_assert(n, cq);
1534 }
1535 }
1536
1537 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
1538 {
1539 assert(cq->cqid == req->sq->cqid);
1540 trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
1541 le32_to_cpu(req->cqe.result),
1542 le32_to_cpu(req->cqe.dw1),
1543 req->status);
1544
1545 if (req->status) {
1546 trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
1547 req->status, req->cmd.opcode);
1548 }
1549
1550 QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
1551 QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
1552
1553 qemu_bh_schedule(cq->bh);
1554 }
1555
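/*
 * Pair queued asynchronous events with outstanding AER commands: post one
 * completion per unmasked event while AERs are available, masking the event
 * type until the host clears it by reading the associated log page.
 */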
1556 static void nvme_process_aers(void *opaque)
1557 {
1558 NvmeCtrl *n = opaque;
1559 NvmeAsyncEvent *event, *next;
1560
1561 trace_pci_nvme_process_aers(n->aer_queued);
1562
1563 QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1564 NvmeRequest *req;
1565 NvmeAerResult *result;
1566
1567 /* can't post cqe if there is nothing to complete */
1568 if (!n->outstanding_aers) {
1569 trace_pci_nvme_no_outstanding_aers();
1570 break;
1571 }
1572
1573 /* ignore if masked (cqe posted, but event not cleared) */
1574 if (n->aer_mask & (1 << event->result.event_type)) {
1575 trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
1576 continue;
1577 }
1578
1579 QTAILQ_REMOVE(&n->aer_queue, event, entry);
1580 n->aer_queued--;
1581
1582 n->aer_mask |= 1 << event->result.event_type;
1583 n->outstanding_aers--;
1584
1585 req = n->aer_reqs[n->outstanding_aers];
1586
1587 result = (NvmeAerResult *) &req->cqe.result;
1588 result->event_type = event->result.event_type;
1589 result->event_info = event->result.event_info;
1590 result->log_page = event->result.log_page;
1591 g_free(event);
1592
1593 trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
1594 result->log_page);
1595
1596 nvme_enqueue_req_completion(&n->admin_cq, req);
1597 }
1598 }
1599
1600 static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
1601 uint8_t event_info, uint8_t log_page)
1602 {
1603 NvmeAsyncEvent *event;
1604
1605 trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
1606
1607 if (n->aer_queued == n->params.aer_max_queued) {
1608 trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
1609 return;
1610 }
1611
1612 event = g_new(NvmeAsyncEvent, 1);
1613 event->result = (NvmeAerResult) {
1614 .event_type = event_type,
1615 .event_info = event_info,
1616 .log_page = log_page,
1617 };
1618
1619 QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
1620 n->aer_queued++;
1621
1622 nvme_process_aers(n);
1623 }
1624
1625 static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
1626 {
1627 uint8_t aer_info;
1628
 1629     /* Ref SPEC <Asynchronous Event Information - SMART / Health Status> */
1630 if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
1631 return;
1632 }
1633
1634 switch (event) {
1635 case NVME_SMART_SPARE:
1636 aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
1637 break;
1638 case NVME_SMART_TEMPERATURE:
1639 aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
1640 break;
1641 case NVME_SMART_RELIABILITY:
1642 case NVME_SMART_MEDIA_READ_ONLY:
1643 case NVME_SMART_FAILED_VOLATILE_MEDIA:
1644 case NVME_SMART_PMR_UNRELIABLE:
1645 aer_info = NVME_AER_INFO_SMART_RELIABILITY;
1646 break;
1647 default:
1648 return;
1649 }
1650
1651 nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
1652 }
1653
1654 static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
1655 {
1656 n->aer_mask &= ~(1 << event_type);
1657 if (!QTAILQ_EMPTY(&n->aer_queue)) {
1658 nvme_process_aers(n);
1659 }
1660 }
1661
1662 static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
1663 {
1664 uint8_t mdts = n->params.mdts;
1665
1666 if (mdts && len > n->page_size << mdts) {
1667 trace_pci_nvme_err_mdts(len);
1668 return NVME_INVALID_FIELD | NVME_DNR;
1669 }
1670
1671 return NVME_SUCCESS;
1672 }
1673
1674 static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
1675 uint32_t nlb)
1676 {
1677 uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
1678
1679 if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
1680 trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
1681 return NVME_LBA_RANGE | NVME_DNR;
1682 }
1683
1684 return NVME_SUCCESS;
1685 }
1686
1687 static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
1688 uint32_t nlb, int flags)
1689 {
1690 BlockDriverState *bs = blk_bs(ns->blkconf.blk);
1691
1692 int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
1693 int64_t offset = nvme_l2b(ns, slba);
1694 int ret;
1695
1696 /*
 1697      * `pnum` holds the number of bytes after offset that share the same
1698 * allocation status as the byte at offset. If `pnum` is different from
1699 * `bytes`, we should check the allocation status of the next range and
1700 * continue this until all bytes have been checked.
1701 */
1702 do {
1703 bytes -= pnum;
1704
1705 ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
1706 if (ret < 0) {
1707 return ret;
1708 }
1709
1710
1711 trace_pci_nvme_block_status(offset, bytes, pnum, ret,
1712 !!(ret & BDRV_BLOCK_ZERO));
1713
1714 if (!(ret & flags)) {
1715 return 1;
1716 }
1717
1718 offset += pnum;
1719 } while (pnum != bytes);
1720
1721 return 0;
1722 }
1723
1724 static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
1725 uint32_t nlb)
1726 {
1727 int ret;
1728 Error *err = NULL;
1729
1730 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
1731 if (ret) {
1732 if (ret < 0) {
1733 error_setg_errno(&err, -ret, "unable to get block status");
1734 error_report_err(err);
1735
1736 return NVME_INTERNAL_DEV_ERROR;
1737 }
1738
1739 return NVME_DULB;
1740 }
1741
1742 return NVME_SUCCESS;
1743 }
1744
1745 static void nvme_aio_err(NvmeRequest *req, int ret)
1746 {
1747 uint16_t status = NVME_SUCCESS;
1748 Error *local_err = NULL;
1749
1750 switch (req->cmd.opcode) {
1751 case NVME_CMD_READ:
1752 status = NVME_UNRECOVERED_READ;
1753 break;
1754 case NVME_CMD_FLUSH:
1755 case NVME_CMD_WRITE:
1756 case NVME_CMD_WRITE_ZEROES:
1757 case NVME_CMD_ZONE_APPEND:
1758 status = NVME_WRITE_FAULT;
1759 break;
1760 default:
1761 status = NVME_INTERNAL_DEV_ERROR;
1762 break;
1763 }
1764
1765 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);
1766
1767 error_setg_errno(&local_err, -ret, "aio failed");
1768 error_report_err(local_err);
1769
1770 /*
1771 * Set the command status code to the first encountered error but allow a
1772 * subsequent Internal Device Error to trump it.
1773 */
1774 if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
1775 return;
1776 }
1777
1778 req->status = status;
1779 }
1780
1781 static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
1782 {
1783 return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
1784 slba / ns->zone_size;
1785 }
1786
1787 static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
1788 {
1789 uint32_t zone_idx = nvme_zone_idx(ns, slba);
1790
1791 if (zone_idx >= ns->num_zones) {
1792 return NULL;
1793 }
1794
1795 return &ns->zone_array[zone_idx];
1796 }
1797
1798 static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
1799 {
1800 uint64_t zslba = zone->d.zslba;
1801
1802 switch (nvme_get_zone_state(zone)) {
1803 case NVME_ZONE_STATE_EMPTY:
1804 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1805 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1806 case NVME_ZONE_STATE_CLOSED:
1807 return NVME_SUCCESS;
1808 case NVME_ZONE_STATE_FULL:
1809 trace_pci_nvme_err_zone_is_full(zslba);
1810 return NVME_ZONE_FULL;
1811 case NVME_ZONE_STATE_OFFLINE:
1812 trace_pci_nvme_err_zone_is_offline(zslba);
1813 return NVME_ZONE_OFFLINE;
1814 case NVME_ZONE_STATE_READ_ONLY:
1815 trace_pci_nvme_err_zone_is_read_only(zslba);
1816 return NVME_ZONE_READ_ONLY;
1817 default:
1818 assert(false);
1819 }
1820
1821 return NVME_INTERNAL_DEV_ERROR;
1822 }
1823
1824 static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
1825 uint64_t slba, uint32_t nlb)
1826 {
1827 uint64_t zcap = nvme_zone_wr_boundary(zone);
1828 uint16_t status;
1829
1830 status = nvme_check_zone_state_for_write(zone);
1831 if (status) {
1832 return status;
1833 }
1834
1835 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1836 uint64_t ezrwa = zone->w_ptr + 2 * ns->zns.zrwas;
1837
1838 if (slba < zone->w_ptr || slba + nlb > ezrwa) {
1839 trace_pci_nvme_err_zone_invalid_write(slba, zone->w_ptr);
1840 return NVME_ZONE_INVALID_WRITE;
1841 }
1842 } else {
1843 if (unlikely(slba != zone->w_ptr)) {
1844 trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
1845 zone->w_ptr);
1846 return NVME_ZONE_INVALID_WRITE;
1847 }
1848 }
1849
1850 if (unlikely((slba + nlb) > zcap)) {
1851 trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
1852 return NVME_ZONE_BOUNDARY_ERROR;
1853 }
1854
1855 return NVME_SUCCESS;
1856 }
1857
1858 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
1859 {
1860 switch (nvme_get_zone_state(zone)) {
1861 case NVME_ZONE_STATE_EMPTY:
1862 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1863 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1864 case NVME_ZONE_STATE_FULL:
1865 case NVME_ZONE_STATE_CLOSED:
1866 case NVME_ZONE_STATE_READ_ONLY:
1867 return NVME_SUCCESS;
1868 case NVME_ZONE_STATE_OFFLINE:
1869 trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
1870 return NVME_ZONE_OFFLINE;
1871 default:
1872 assert(false);
1873 }
1874
1875 return NVME_INTERNAL_DEV_ERROR;
1876 }
1877
1878 static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
1879 uint32_t nlb)
1880 {
1881 NvmeZone *zone;
1882 uint64_t bndry, end;
1883 uint16_t status;
1884
1885 zone = nvme_get_zone_by_slba(ns, slba);
1886 assert(zone);
1887
1888 bndry = nvme_zone_rd_boundary(ns, zone);
1889 end = slba + nlb;
1890
1891 status = nvme_check_zone_state_for_read(zone);
1892 if (status) {
1893 ;
1894 } else if (unlikely(end > bndry)) {
1895 if (!ns->params.cross_zone_read) {
1896 status = NVME_ZONE_BOUNDARY_ERROR;
1897 } else {
1898 /*
1899 * Read across zone boundary - check that all subsequent
1900 * zones that are being read have an appropriate state.
1901 */
1902 do {
1903 zone++;
1904 status = nvme_check_zone_state_for_read(zone);
1905 if (status) {
1906 break;
1907 }
1908 } while (end > nvme_zone_rd_boundary(ns, zone));
1909 }
1910 }
1911
1912 return status;
1913 }
1914
1915 static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
1916 {
1917 switch (nvme_get_zone_state(zone)) {
1918 case NVME_ZONE_STATE_FULL:
1919 return NVME_SUCCESS;
1920
1921 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1922 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1923 nvme_aor_dec_open(ns);
1924 /* fallthrough */
1925 case NVME_ZONE_STATE_CLOSED:
1926 nvme_aor_dec_active(ns);
1927
1928 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1929 zone->d.za &= ~NVME_ZA_ZRWA_VALID;
1930 if (ns->params.numzrwa) {
1931 ns->zns.numzrwa++;
1932 }
1933 }
1934
1935 /* fallthrough */
1936 case NVME_ZONE_STATE_EMPTY:
1937 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
1938 return NVME_SUCCESS;
1939
1940 default:
1941 return NVME_ZONE_INVAL_TRANSITION;
1942 }
1943 }
1944
1945 static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
1946 {
1947 switch (nvme_get_zone_state(zone)) {
1948 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1949 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1950 nvme_aor_dec_open(ns);
1951 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
1952 /* fall through */
1953 case NVME_ZONE_STATE_CLOSED:
1954 return NVME_SUCCESS;
1955
1956 default:
1957 return NVME_ZONE_INVAL_TRANSITION;
1958 }
1959 }
1960
1961 static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
1962 {
1963 switch (nvme_get_zone_state(zone)) {
1964 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1965 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1966 nvme_aor_dec_open(ns);
1967 /* fallthrough */
1968 case NVME_ZONE_STATE_CLOSED:
1969 nvme_aor_dec_active(ns);
1970
1971 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1972 if (ns->params.numzrwa) {
1973 ns->zns.numzrwa++;
1974 }
1975 }
1976
1977 /* fallthrough */
1978 case NVME_ZONE_STATE_FULL:
1979 zone->w_ptr = zone->d.zslba;
1980 zone->d.wp = zone->w_ptr;
1981 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
1982 /* fallthrough */
1983 case NVME_ZONE_STATE_EMPTY:
1984 return NVME_SUCCESS;
1985
1986 default:
1987 return NVME_ZONE_INVAL_TRANSITION;
1988 }
1989 }
1990
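/*
 * If the namespace is at its open zone limit, automatically close the
 * implicitly open zone at the head of the list to free an open resource.
 */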
1991 static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
1992 {
1993 NvmeZone *zone;
1994
1995 if (ns->params.max_open_zones &&
1996 ns->nr_open_zones == ns->params.max_open_zones) {
1997 zone = QTAILQ_FIRST(&ns->imp_open_zones);
1998 if (zone) {
1999 /*
2000 * Automatically close this implicitly open zone.
2001 */
2002 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
2003 nvme_zrm_close(ns, zone);
2004 }
2005 }
2006 }
2007
2008 enum {
2009 NVME_ZRM_AUTO = 1 << 0,
2010 NVME_ZRM_ZRWA = 1 << 1,
2011 };
2012
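/*
 * Open a zone, implicitly (NVME_ZRM_AUTO) or explicitly, after checking
 * open/active/ZRWA resource limits; NVME_ZRM_ZRWA additionally allocates a
 * ZRWA for the zone.
 */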
2013 static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
2014 NvmeZone *zone, int flags)
2015 {
2016 int act = 0;
2017 uint16_t status;
2018
2019 switch (nvme_get_zone_state(zone)) {
2020 case NVME_ZONE_STATE_EMPTY:
2021 act = 1;
2022
2023 /* fallthrough */
2024
2025 case NVME_ZONE_STATE_CLOSED:
2026 if (n->params.auto_transition_zones) {
2027 nvme_zrm_auto_transition_zone(ns);
2028 }
2029 status = nvme_zns_check_resources(ns, act, 1,
2030 (flags & NVME_ZRM_ZRWA) ? 1 : 0);
2031 if (status) {
2032 return status;
2033 }
2034
2035 if (act) {
2036 nvme_aor_inc_active(ns);
2037 }
2038
2039 nvme_aor_inc_open(ns);
2040
2041 if (flags & NVME_ZRM_AUTO) {
2042 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
2043 return NVME_SUCCESS;
2044 }
2045
2046 /* fallthrough */
2047
2048 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
2049 if (flags & NVME_ZRM_AUTO) {
2050 return NVME_SUCCESS;
2051 }
2052
2053 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
2054
2055 /* fallthrough */
2056
2057 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
2058 if (flags & NVME_ZRM_ZRWA) {
2059 ns->zns.numzrwa--;
2060
2061 zone->d.za |= NVME_ZA_ZRWA_VALID;
2062 }
2063
2064 return NVME_SUCCESS;
2065
2066 default:
2067 return NVME_ZONE_INVAL_TRANSITION;
2068 }
2069 }
2070
2071 static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns,
2072 NvmeZone *zone)
2073 {
2074 return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO);
2075 }
2076
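/* Advance the zone write pointer; finish the zone once it is full. */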
2077 static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
2078 uint32_t nlb)
2079 {
2080 zone->d.wp += nlb;
2081
2082 if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
2083 nvme_zrm_finish(ns, zone);
2084 }
2085 }
2086
2087 static void nvme_zoned_zrwa_implicit_flush(NvmeNamespace *ns, NvmeZone *zone,
2088 uint32_t nlbc)
2089 {
2090 uint16_t nzrwafgs = DIV_ROUND_UP(nlbc, ns->zns.zrwafg);
2091
2092 nlbc = nzrwafgs * ns->zns.zrwafg;
2093
2094 trace_pci_nvme_zoned_zrwa_implicit_flush(zone->d.zslba, nlbc);
2095
2096 zone->w_ptr += nlbc;
2097
2098 nvme_advance_zone_wp(ns, zone, nlbc);
2099 }
2100
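/*
 * Write completion handling for zoned namespaces: with a valid ZRWA, the
 * ZRWA is implicitly flushed if the write extended past its end; otherwise
 * the zone write pointer is advanced by the number of blocks written.
 */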
2101 static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
2102 {
2103 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2104 NvmeZone *zone;
2105 uint64_t slba;
2106 uint32_t nlb;
2107
2108 slba = le64_to_cpu(rw->slba);
2109 nlb = le16_to_cpu(rw->nlb) + 1;
2110 zone = nvme_get_zone_by_slba(ns, slba);
2111 assert(zone);
2112
2113 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
2114 uint64_t ezrwa = zone->w_ptr + ns->zns.zrwas - 1;
2115 uint64_t elba = slba + nlb - 1;
2116
2117 if (elba > ezrwa) {
2118 nvme_zoned_zrwa_implicit_flush(ns, zone, elba - ezrwa);
2119 }
2120
2121 return;
2122 }
2123
2124 nvme_advance_zone_wp(ns, zone, nlb);
2125 }
2126
2127 static inline bool nvme_is_write(NvmeRequest *req)
2128 {
2129 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2130
2131 return rw->opcode == NVME_CMD_WRITE ||
2132 rw->opcode == NVME_CMD_ZONE_APPEND ||
2133 rw->opcode == NVME_CMD_WRITE_ZEROES;
2134 }
2135
2136 static AioContext *nvme_get_aio_context(BlockAIOCB *acb)
2137 {
2138 return qemu_get_aio_context();
2139 }
2140
2141 static void nvme_misc_cb(void *opaque, int ret)
2142 {
2143 NvmeRequest *req = opaque;
2144
2145 trace_pci_nvme_misc_cb(nvme_cid(req));
2146
2147 if (ret) {
2148 nvme_aio_err(req, ret);
2149 }
2150
2151 nvme_enqueue_req_completion(nvme_cq(req), req);
2152 }
2153
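/*
 * Common completion for read/write style commands: update block accounting,
 * finalize the zone write pointer for zoned writes and post the completion.
 */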
2154 void nvme_rw_complete_cb(void *opaque, int ret)
2155 {
2156 NvmeRequest *req = opaque;
2157 NvmeNamespace *ns = req->ns;
2158 BlockBackend *blk = ns->blkconf.blk;
2159 BlockAcctCookie *acct = &req->acct;
2160 BlockAcctStats *stats = blk_get_stats(blk);
2161
2162 trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));
2163
2164 if (ret) {
2165 block_acct_failed(stats, acct);
2166 nvme_aio_err(req, ret);
2167 } else {
2168 block_acct_done(stats, acct);
2169 }
2170
2171 if (ns->params.zoned && nvme_is_write(req)) {
2172 nvme_finalize_zoned_write(ns, req);
2173 }
2174
2175 nvme_enqueue_req_completion(nvme_cq(req), req);
2176 }
2177
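/*
 * Completion of the data transfer. If the LBA format carries metadata, it
 * is zeroed for Write Zeroes, or transferred in a second I/O when mapped
 * through an extended LBA or a separate metadata pointer; the request then
 * completes through nvme_rw_complete_cb().
 */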
2178 static void nvme_rw_cb(void *opaque, int ret)
2179 {
2180 NvmeRequest *req = opaque;
2181 NvmeNamespace *ns = req->ns;
2182
2183 BlockBackend *blk = ns->blkconf.blk;
2184
2185 trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
2186
2187 if (ret) {
2188 goto out;
2189 }
2190
2191 if (ns->lbaf.ms) {
2192 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2193 uint64_t slba = le64_to_cpu(rw->slba);
2194 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
2195 uint64_t offset = nvme_moff(ns, slba);
2196
2197 if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
2198 size_t mlen = nvme_m2b(ns, nlb);
2199
2200 req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
2201 BDRV_REQ_MAY_UNMAP,
2202 nvme_rw_complete_cb, req);
2203 return;
2204 }
2205
2206 if (nvme_ns_ext(ns) || req->cmd.mptr) {
2207 uint16_t status;
2208
2209 nvme_sg_unmap(&req->sg);
2210 status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
2211 if (status) {
2212 ret = -EFAULT;
2213 goto out;
2214 }
2215
2216 if (req->cmd.opcode == NVME_CMD_READ) {
2217 return nvme_blk_read(blk, offset, 1, nvme_rw_complete_cb, req);
2218 }
2219
2220 return nvme_blk_write(blk, offset, 1, nvme_rw_complete_cb, req);
2221 }
2222 }
2223
2224 out:
2225 nvme_rw_complete_cb(req, ret);
2226 }
2227
2228 static void nvme_verify_cb(void *opaque, int ret)
2229 {
2230 NvmeBounceContext *ctx = opaque;
2231 NvmeRequest *req = ctx->req;
2232 NvmeNamespace *ns = req->ns;
2233 BlockBackend *blk = ns->blkconf.blk;
2234 BlockAcctCookie *acct = &req->acct;
2235 BlockAcctStats *stats = blk_get_stats(blk);
2236 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2237 uint64_t slba = le64_to_cpu(rw->slba);
2238 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2239 uint16_t apptag = le16_to_cpu(rw->apptag);
2240 uint16_t appmask = le16_to_cpu(rw->appmask);
2241 uint64_t reftag = le32_to_cpu(rw->reftag);
2242 uint64_t cdw3 = le32_to_cpu(rw->cdw3);
2243 uint16_t status;
2244
2245 reftag |= cdw3 << 32;
2246
2247 trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);
2248
2249 if (ret) {
2250 block_acct_failed(stats, acct);
2251 nvme_aio_err(req, ret);
2252 goto out;
2253 }
2254
2255 block_acct_done(stats, acct);
2256
2257 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2258 status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
2259 ctx->mdata.iov.size, slba);
2260 if (status) {
2261 req->status = status;
2262 goto out;
2263 }
2264
2265 req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2266 ctx->mdata.bounce, ctx->mdata.iov.size,
2267 prinfo, slba, apptag, appmask, &reftag);
2268 }
2269
2270 out:
2271 qemu_iovec_destroy(&ctx->data.iov);
2272 g_free(ctx->data.bounce);
2273
2274 qemu_iovec_destroy(&ctx->mdata.iov);
2275 g_free(ctx->mdata.bounce);
2276
2277 g_free(ctx);
2278
2279 nvme_enqueue_req_completion(nvme_cq(req), req);
2280 }
2281
2282
2283 static void nvme_verify_mdata_in_cb(void *opaque, int ret)
2284 {
2285 NvmeBounceContext *ctx = opaque;
2286 NvmeRequest *req = ctx->req;
2287 NvmeNamespace *ns = req->ns;
2288 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2289 uint64_t slba = le64_to_cpu(rw->slba);
2290 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2291 size_t mlen = nvme_m2b(ns, nlb);
2292 uint64_t offset = nvme_moff(ns, slba);
2293 BlockBackend *blk = ns->blkconf.blk;
2294
2295 trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));
2296
2297 if (ret) {
2298 goto out;
2299 }
2300
2301 ctx->mdata.bounce = g_malloc(mlen);
2302
2303 qemu_iovec_reset(&ctx->mdata.iov);
2304 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2305
2306 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2307 nvme_verify_cb, ctx);
2308 return;
2309
2310 out:
2311 nvme_verify_cb(ctx, ret);
2312 }
2313
2314 struct nvme_compare_ctx {
2315 struct {
2316 QEMUIOVector iov;
2317 uint8_t *bounce;
2318 } data;
2319
2320 struct {
2321 QEMUIOVector iov;
2322 uint8_t *bounce;
2323 } mdata;
2324 };
2325
2326 static void nvme_compare_mdata_cb(void *opaque, int ret)
2327 {
2328 NvmeRequest *req = opaque;
2329 NvmeNamespace *ns = req->ns;
2330 NvmeCtrl *n = nvme_ctrl(req);
2331 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2332 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2333 uint16_t apptag = le16_to_cpu(rw->apptag);
2334 uint16_t appmask = le16_to_cpu(rw->appmask);
2335 uint64_t reftag = le32_to_cpu(rw->reftag);
2336 uint64_t cdw3 = le32_to_cpu(rw->cdw3);
2337 struct nvme_compare_ctx *ctx = req->opaque;
2338 g_autofree uint8_t *buf = NULL;
2339 BlockBackend *blk = ns->blkconf.blk;
2340 BlockAcctCookie *acct = &req->acct;
2341 BlockAcctStats *stats = blk_get_stats(blk);
2342 uint16_t status = NVME_SUCCESS;
2343
2344 reftag |= cdw3 << 32;
2345
2346 trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
2347
2348 if (ret) {
2349 block_acct_failed(stats, acct);
2350 nvme_aio_err(req, ret);
2351 goto out;
2352 }
2353
2354 buf = g_malloc(ctx->mdata.iov.size);
2355
2356 status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
2357 NVME_TX_DIRECTION_TO_DEVICE, req);
2358 if (status) {
2359 req->status = status;
2360 goto out;
2361 }
2362
2363 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2364 uint64_t slba = le64_to_cpu(rw->slba);
2365 uint8_t *bufp;
2366 uint8_t *mbufp = ctx->mdata.bounce;
2367 uint8_t *end = mbufp + ctx->mdata.iov.size;
2368 int16_t pil = 0;
2369
2370 status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2371 ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
2372 slba, apptag, appmask, &reftag);
2373 if (status) {
2374 req->status = status;
2375 goto out;
2376 }
2377
2378 /*
2379 * When formatted with protection information, do not compare the DIF
2380 * tuple.
2381 */
2382 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
2383 pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
2384 }
2385
2386 for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) {
2387 if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
2388 req->status = NVME_CMP_FAILURE | NVME_DNR;
2389 goto out;
2390 }
2391 }
2392
2393 goto out;
2394 }
2395
2396 if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
2397 req->status = NVME_CMP_FAILURE | NVME_DNR;
2398 goto out;
2399 }
2400
2401 block_acct_done(stats, acct);
2402
2403 out:
2404 qemu_iovec_destroy(&ctx->data.iov);
2405 g_free(ctx->data.bounce);
2406
2407 qemu_iovec_destroy(&ctx->mdata.iov);
2408 g_free(ctx->mdata.bounce);
2409
2410 g_free(ctx);
2411
2412 nvme_enqueue_req_completion(nvme_cq(req), req);
2413 }
2414
2415 static void nvme_compare_data_cb(void *opaque, int ret)
2416 {
2417 NvmeRequest *req = opaque;
2418 NvmeCtrl *n = nvme_ctrl(req);
2419 NvmeNamespace *ns = req->ns;
2420 BlockBackend *blk = ns->blkconf.blk;
2421 BlockAcctCookie *acct = &req->acct;
2422 BlockAcctStats *stats = blk_get_stats(blk);
2423
2424 struct nvme_compare_ctx *ctx = req->opaque;
2425 g_autofree uint8_t *buf = NULL;
2426 uint16_t status;
2427
2428 trace_pci_nvme_compare_data_cb(nvme_cid(req));
2429
2430 if (ret) {
2431 block_acct_failed(stats, acct);
2432 nvme_aio_err(req, ret);
2433 goto out;
2434 }
2435
2436 buf = g_malloc(ctx->data.iov.size);
2437
2438 status = nvme_bounce_data(n, buf, ctx->data.iov.size,
2439 NVME_TX_DIRECTION_TO_DEVICE, req);
2440 if (status) {
2441 req->status = status;
2442 goto out;
2443 }
2444
2445 if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
2446 req->status = NVME_CMP_FAILURE | NVME_DNR;
2447 goto out;
2448 }
2449
2450 if (ns->lbaf.ms) {
2451 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2452 uint64_t slba = le64_to_cpu(rw->slba);
2453 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2454 size_t mlen = nvme_m2b(ns, nlb);
2455 uint64_t offset = nvme_moff(ns, slba);
2456
2457 ctx->mdata.bounce = g_malloc(mlen);
2458
2459 qemu_iovec_init(&ctx->mdata.iov, 1);
2460 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2461
2462 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2463 nvme_compare_mdata_cb, req);
2464 return;
2465 }
2466
2467 block_acct_done(stats, acct);
2468
2469 out:
2470 qemu_iovec_destroy(&ctx->data.iov);
2471 g_free(ctx->data.bounce);
2472 g_free(ctx);
2473
2474 nvme_enqueue_req_completion(nvme_cq(req), req);
2475 }
2476
2477 typedef struct NvmeDSMAIOCB {
2478 BlockAIOCB common;
2479 BlockAIOCB *aiocb;
2480 NvmeRequest *req;
2481 int ret;
2482
2483 NvmeDsmRange *range;
2484 unsigned int nr;
2485 unsigned int idx;
2486 } NvmeDSMAIOCB;
2487
2488 static void nvme_dsm_cancel(BlockAIOCB *aiocb)
2489 {
2490 NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
2491
2492 /* break nvme_dsm_cb loop */
2493 iocb->idx = iocb->nr;
2494 iocb->ret = -ECANCELED;
2495
2496 if (iocb->aiocb) {
2497 blk_aio_cancel_async(iocb->aiocb);
2498 iocb->aiocb = NULL;
2499 } else {
2500 /*
2501 * We only reach this if nvme_dsm_cancel() has already been called or
2502 * the command ran to completion.
2503 */
2504 assert(iocb->idx == iocb->nr);
2505 }
2506 }
2507
2508 static const AIOCBInfo nvme_dsm_aiocb_info = {
2509 .aiocb_size = sizeof(NvmeDSMAIOCB),
2510 .cancel_async = nvme_dsm_cancel,
2511 };
2512
2513 static void nvme_dsm_cb(void *opaque, int ret);
2514
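/*
 * DSM deallocation is driven as a per-range loop: nvme_dsm_cb() discards
 * the data blocks of the next range and nvme_dsm_md_cb() zeroes the
 * corresponding metadata when the discard actually zeroed the data.
 */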
2515 static void nvme_dsm_md_cb(void *opaque, int ret)
2516 {
2517 NvmeDSMAIOCB *iocb = opaque;
2518 NvmeRequest *req = iocb->req;
2519 NvmeNamespace *ns = req->ns;
2520 NvmeDsmRange *range;
2521 uint64_t slba;
2522 uint32_t nlb;
2523
2524 if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
2525 goto done;
2526 }
2527
2528 range = &iocb->range[iocb->idx - 1];
2529 slba = le64_to_cpu(range->slba);
2530 nlb = le32_to_cpu(range->nlb);
2531
2532 /*
2533 * Check that all blocks were discarded (zeroed); otherwise we do not zero
2534 * the metadata.
2535 */
2536
2537 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
2538 if (ret) {
2539 if (ret < 0) {
2540 goto done;
2541 }
2542
2543 nvme_dsm_cb(iocb, 0);
2544 return;
2545 }
2546
2547 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
2548 nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
2549 nvme_dsm_cb, iocb);
2550 return;
2551
2552 done:
2553 nvme_dsm_cb(iocb, ret);
2554 }
2555
2556 static void nvme_dsm_cb(void *opaque, int ret)
2557 {
2558 NvmeDSMAIOCB *iocb = opaque;
2559 NvmeRequest *req = iocb->req;
2560 NvmeCtrl *n = nvme_ctrl(req);
2561 NvmeNamespace *ns = req->ns;
2562 NvmeDsmRange *range;
2563 uint64_t slba;
2564 uint32_t nlb;
2565
2566 if (iocb->ret < 0) {
2567 goto done;
2568 } else if (ret < 0) {
2569 iocb->ret = ret;
2570 goto done;
2571 }
2572
2573 next:
2574 if (iocb->idx == iocb->nr) {
2575 goto done;
2576 }
2577
2578 range = &iocb->range[iocb->idx++];
2579 slba = le64_to_cpu(range->slba);
2580 nlb = le32_to_cpu(range->nlb);
2581
2582 trace_pci_nvme_dsm_deallocate(slba, nlb);
2583
2584 if (nlb > n->dmrsl) {
2585 trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2586 goto next;
2587 }
2588
2589 if (nvme_check_bounds(ns, slba, nlb)) {
2590 trace_pci_nvme_err_invalid_lba_range(slba, nlb,
2591 ns->id_ns.nsze);
2592 goto next;
2593 }
2594
2595 iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
2596 nvme_l2b(ns, nlb),
2597 nvme_dsm_md_cb, iocb);
2598 return;
2599
2600 done:
2601 iocb->aiocb = NULL;
2602 iocb->common.cb(iocb->common.opaque, iocb->ret);
2603 qemu_aio_unref(iocb);
2604 }
2605
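/*
 * Dataset Management: when the Deallocate attribute is set, transfer the
 * range list from the host and discard each range asynchronously; other
 * attributes are ignored.
 */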
2606 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2607 {
2608 NvmeNamespace *ns = req->ns;
2609 NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2610 uint32_t attr = le32_to_cpu(dsm->attributes);
2611 uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2612 uint16_t status = NVME_SUCCESS;
2613
2614 trace_pci_nvme_dsm(nr, attr);
2615
2616 if (attr & NVME_DSMGMT_AD) {
2617 NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
2618 nvme_misc_cb, req);
2619
2620 iocb->req = req;
2621 iocb->ret = 0;
2622 iocb->range = g_new(NvmeDsmRange, nr);
2623 iocb->nr = nr;
2624 iocb->idx = 0;
2625
2626 status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
2627 req);
2628 if (status) {
2629 g_free(iocb->range);
2630 qemu_aio_unref(iocb);
2631
2632 return status;
2633 }
2634
2635 req->aiocb = &iocb->common;
2636 nvme_dsm_cb(iocb, 0);
2637
2638 return NVME_NO_COMPLETE;
2639 }
2640
2641 return status;
2642 }
2643
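/*
 * Verify: read the LBA range (and its metadata, if any) into bounce buffers
 * and, for namespaces formatted with protection information, run the
 * end-to-end checks without transferring any data to the host.
 */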
2644 static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
2645 {
2646 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2647 NvmeNamespace *ns = req->ns;
2648 BlockBackend *blk = ns->blkconf.blk;
2649 uint64_t slba = le64_to_cpu(rw->slba);
2650 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2651 size_t len = nvme_l2b(ns, nlb);
2652 int64_t offset = nvme_l2b(ns, slba);
2653 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2654 uint32_t reftag = le32_to_cpu(rw->reftag);
2655 NvmeBounceContext *ctx = NULL;
2656 uint16_t status;
2657
2658 trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2659
2660 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2661 status = nvme_check_prinfo(ns, prinfo, slba, reftag);
2662 if (status) {
2663 return status;
2664 }
2665
2666 if (prinfo & NVME_PRINFO_PRACT) {
2667 return NVME_INVALID_PROT_INFO | NVME_DNR;
2668 }
2669 }
2670
2671 if (len > n->page_size << n->params.vsl) {
2672 return NVME_INVALID_FIELD | NVME_DNR;
2673 }
2674
2675 status = nvme_check_bounds(ns, slba, nlb);
2676 if (status) {
2677 return status;
2678 }
2679
2680 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2681 status = nvme_check_dulbe(ns, slba, nlb);
2682 if (status) {
2683 return status;
2684 }
2685 }
2686
2687 ctx = g_new0(NvmeBounceContext, 1);
2688 ctx->req = req;
2689
2690 ctx->data.bounce = g_malloc(len);
2691
2692 qemu_iovec_init(&ctx->data.iov, 1);
2693 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
2694
2695 block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
2696 BLOCK_ACCT_READ);
2697
2698 req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
2699 nvme_verify_mdata_in_cb, ctx);
2700 return NVME_NO_COMPLETE;
2701 }
2702
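/*
 * State for the Copy command. Source ranges are processed one at a time:
 * each range is read into the bounce buffer (data and metadata), checked
 * against protection information if applicable and written out at the
 * destination SLBA.
 */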
2703 typedef struct NvmeCopyAIOCB {
2704 BlockAIOCB common;
2705 BlockAIOCB *aiocb;
2706 NvmeRequest *req;
2707 int ret;
2708
2709 void *ranges;
2710 unsigned int format;
2711 int nr;
2712 int idx;
2713
2714 uint8_t *bounce;
2715 QEMUIOVector iov;
2716 struct {
2717 BlockAcctCookie read;
2718 BlockAcctCookie write;
2719 } acct;
2720
2721 uint64_t reftag;
2722 uint64_t slba;
2723
2724 NvmeZone *zone;
2725 } NvmeCopyAIOCB;
2726
2727 static void nvme_copy_cancel(BlockAIOCB *aiocb)
2728 {
2729 NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);
2730
2731 iocb->ret = -ECANCELED;
2732
2733 if (iocb->aiocb) {
2734 blk_aio_cancel_async(iocb->aiocb);
2735 iocb->aiocb = NULL;
2736 }
2737 }
2738
2739 static const AIOCBInfo nvme_copy_aiocb_info = {
2740 .aiocb_size = sizeof(NvmeCopyAIOCB),
2741 .cancel_async = nvme_copy_cancel,
2742 };
2743
2744 static void nvme_copy_done(NvmeCopyAIOCB *iocb)
2745 {
2746 NvmeRequest *req = iocb->req;
2747 NvmeNamespace *ns = req->ns;
2748 BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk);
2749
2750 if (iocb->idx != iocb->nr) {
2751 req->cqe.result = cpu_to_le32(iocb->idx);
2752 }
2753
2754 qemu_iovec_destroy(&iocb->iov);
2755 g_free(iocb->bounce);
2756
2757 if (iocb->ret < 0) {
2758 block_acct_failed(stats, &iocb->acct.read);
2759 block_acct_failed(stats, &iocb->acct.write);
2760 } else {
2761 block_acct_done(stats, &iocb->acct.read);
2762 block_acct_done(stats, &iocb->acct.write);
2763 }
2764
2765 iocb->common.cb(iocb->common.opaque, iocb->ret);
2766 qemu_aio_unref(iocb);
2767 }
2768
2769 static void nvme_do_copy(NvmeCopyAIOCB *iocb);
2770
2771 static void nvme_copy_source_range_parse_format0(void *ranges, int idx,
2772 uint64_t *slba, uint32_t *nlb,
2773 uint16_t *apptag,
2774 uint16_t *appmask,
2775 uint64_t *reftag)
2776 {
2777 NvmeCopySourceRangeFormat0 *_ranges = ranges;
2778
2779 if (slba) {
2780 *slba = le64_to_cpu(_ranges[idx].slba);
2781 }
2782
2783 if (nlb) {
2784 *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2785 }
2786
2787 if (apptag) {
2788 *apptag = le16_to_cpu(_ranges[idx].apptag);
2789 }
2790
2791 if (appmask) {
2792 *appmask = le16_to_cpu(_ranges[idx].appmask);
2793 }
2794
2795 if (reftag) {
2796 *reftag = le32_to_cpu(_ranges[idx].reftag);
2797 }
2798 }
2799
2800 static void nvme_copy_source_range_parse_format1(void *ranges, int idx,
2801 uint64_t *slba, uint32_t *nlb,
2802 uint16_t *apptag,
2803 uint16_t *appmask,
2804 uint64_t *reftag)
2805 {
2806 NvmeCopySourceRangeFormat1 *_ranges = ranges;
2807
2808 if (slba) {
2809 *slba = le64_to_cpu(_ranges[idx].slba);
2810 }
2811
2812 if (nlb) {
2813 *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2814 }
2815
2816 if (apptag) {
2817 *apptag = le16_to_cpu(_ranges[idx].apptag);
2818 }
2819
2820 if (appmask) {
2821 *appmask = le16_to_cpu(_ranges[idx].appmask);
2822 }
2823
2824 if (reftag) {
2825 *reftag = 0;
2826
2827 *reftag |= (uint64_t)_ranges[idx].sr[4] << 40;
2828 *reftag |= (uint64_t)_ranges[idx].sr[5] << 32;
2829 *reftag |= (uint64_t)_ranges[idx].sr[6] << 24;
2830 *reftag |= (uint64_t)_ranges[idx].sr[7] << 16;
2831 *reftag |= (uint64_t)_ranges[idx].sr[8] << 8;
2832 *reftag |= (uint64_t)_ranges[idx].sr[9];
2833 }
2834 }
2835
2836 static void nvme_copy_source_range_parse(void *ranges, int idx, uint8_t format,
2837 uint64_t *slba, uint32_t *nlb,
2838 uint16_t *apptag, uint16_t *appmask,
2839 uint64_t *reftag)
2840 {
2841 switch (format) {
2842 case NVME_COPY_FORMAT_0:
2843 nvme_copy_source_range_parse_format0(ranges, idx, slba, nlb, apptag,
2844 appmask, reftag);
2845 break;
2846
2847 case NVME_COPY_FORMAT_1:
2848 nvme_copy_source_range_parse_format1(ranges, idx, slba, nlb, apptag,
2849 appmask, reftag);
2850 break;
2851
2852 default:
2853 abort();
2854 }
2855 }
2856
2857 static void nvme_copy_out_completed_cb(void *opaque, int ret)
2858 {
2859 NvmeCopyAIOCB *iocb = opaque;
2860 NvmeRequest *req = iocb->req;
2861 NvmeNamespace *ns = req->ns;
2862 uint32_t nlb;
2863
2864 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2865 &nlb, NULL, NULL, NULL);
2866
2867 if (ret < 0) {
2868 iocb->ret = ret;
2869 goto out;
2870 } else if (iocb->ret < 0) {
2871 goto out;
2872 }
2873
2874 if (ns->params.zoned) {
2875 nvme_advance_zone_wp(ns, iocb->zone, nlb);
2876 }
2877
2878 iocb->idx++;
2879 iocb->slba += nlb;
2880 out:
2881 nvme_do_copy(iocb);
2882 }
2883
2884 static void nvme_copy_out_cb(void *opaque, int ret)
2885 {
2886 NvmeCopyAIOCB *iocb = opaque;
2887 NvmeRequest *req = iocb->req;
2888 NvmeNamespace *ns = req->ns;
2889 uint32_t nlb;
2890 size_t mlen;
2891 uint8_t *mbounce;
2892
2893 if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
2894 goto out;
2895 }
2896
2897 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2898 &nlb, NULL, NULL, NULL);
2899
2900 mlen = nvme_m2b(ns, nlb);
2901 mbounce = iocb->bounce + nvme_l2b(ns, nlb);
2902
2903 qemu_iovec_reset(&iocb->iov);
2904 qemu_iovec_add(&iocb->iov, mbounce, mlen);
2905
2906 iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_moff(ns, iocb->slba),
2907 &iocb->iov, 0, nvme_copy_out_completed_cb,
2908 iocb);
2909
2910 return;
2911
2912 out:
2913 nvme_copy_out_completed_cb(iocb, ret);
2914 }
2915
2916 static void nvme_copy_in_completed_cb(void *opaque, int ret)
2917 {
2918 NvmeCopyAIOCB *iocb = opaque;
2919 NvmeRequest *req = iocb->req;
2920 NvmeNamespace *ns = req->ns;
2921 uint32_t nlb;
2922 uint64_t slba;
2923 uint16_t apptag, appmask;
2924 uint64_t reftag;
2925 size_t len;
2926 uint16_t status;
2927
2928 if (ret < 0) {
2929 iocb->ret = ret;
2930 goto out;
2931 } else if (iocb->ret < 0) {
2932 goto out;
2933 }
2934
2935 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
2936 &nlb, &apptag, &appmask, &reftag);
2937 len = nvme_l2b(ns, nlb);
2938
2939 trace_pci_nvme_copy_out(iocb->slba, nlb);
2940
2941 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2942 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2943
2944 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
2945 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
2946
2947 size_t mlen = nvme_m2b(ns, nlb);
2948 uint8_t *mbounce = iocb->bounce + nvme_l2b(ns, nlb);
2949
2950 status = nvme_dif_mangle_mdata(ns, mbounce, mlen, slba);
2951 if (status) {
2952 goto invalid;
2953 }
2954 status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen, prinfor,
2955 slba, apptag, appmask, &reftag);
2956 if (status) {
2957 goto invalid;
2958 }
2959
2960 apptag = le16_to_cpu(copy->apptag);
2961 appmask = le16_to_cpu(copy->appmask);
2962
2963 if (prinfow & NVME_PRINFO_PRACT) {
2964 status = nvme_check_prinfo(ns, prinfow, iocb->slba, iocb->reftag);
2965 if (status) {
2966 goto invalid;
2967 }
2968
2969 nvme_dif_pract_generate_dif(ns, iocb->bounce, len, mbounce, mlen,
2970 apptag, &iocb->reftag);
2971 } else {
2972 status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen,
2973 prinfow, iocb->slba, apptag, appmask,
2974 &iocb->reftag);
2975 if (status) {
2976 goto invalid;
2977 }
2978 }
2979 }
2980
2981 status = nvme_check_bounds(ns, iocb->slba, nlb);
2982 if (status) {
2983 goto invalid;
2984 }
2985
2986 if (ns->params.zoned) {
2987 status = nvme_check_zone_write(ns, iocb->zone, iocb->slba, nlb);
2988 if (status) {
2989 goto invalid;
2990 }
2991
2992 if (!(iocb->zone->d.za & NVME_ZA_ZRWA_VALID)) {
2993 iocb->zone->w_ptr += nlb;
2994 }
2995 }
2996
2997 qemu_iovec_reset(&iocb->iov);
2998 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
2999
3000 iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, iocb->slba),
3001 &iocb->iov, 0, nvme_copy_out_cb, iocb);
3002
3003 return;
3004
3005 invalid:
3006 req->status = status;
3007 iocb->ret = -1;
3008 out:
3009 nvme_do_copy(iocb);
3010 }
3011
3012 static void nvme_copy_in_cb(void *opaque, int ret)
3013 {
3014 NvmeCopyAIOCB *iocb = opaque;
3015 NvmeRequest *req = iocb->req;
3016 NvmeNamespace *ns = req->ns;
3017 uint64_t slba;
3018 uint32_t nlb;
3019
3020 if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
3021 goto out;
3022 }
3023
3024 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
3025 &nlb, NULL, NULL, NULL);
3026
3027 qemu_iovec_reset(&iocb->iov);
3028 qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(ns, nlb),
3029 nvme_m2b(ns, nlb));
3030
3031 iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_moff(ns, slba),
3032 &iocb->iov, 0, nvme_copy_in_completed_cb,
3033 iocb);
3034 return;
3035
3036 out:
3037 nvme_copy_in_completed_cb(iocb, ret);
3038 }
3039
3040 static void nvme_do_copy(NvmeCopyAIOCB *iocb)
3041 {
3042 NvmeRequest *req = iocb->req;
3043 NvmeNamespace *ns = req->ns;
3044 uint64_t slba;
3045 uint32_t nlb;
3046 size_t len;
3047 uint16_t status;
3048
3049 if (iocb->ret < 0) {
3050 goto done;
3051 }
3052
3053 if (iocb->idx == iocb->nr) {
3054 goto done;
3055 }
3056
3057 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
3058 &nlb, NULL, NULL, NULL);
3059 len = nvme_l2b(ns, nlb);
3060
3061 trace_pci_nvme_copy_source_range(slba, nlb);
3062
3063 if (nlb > le16_to_cpu(ns->id_ns.mssrl)) {
3064 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
3065 goto invalid;
3066 }
3067
3068 status = nvme_check_bounds(ns, slba, nlb);
3069 if (status) {
3070 goto invalid;
3071 }
3072
3073 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3074 status = nvme_check_dulbe(ns, slba, nlb);
3075 if (status) {
3076 goto invalid;
3077 }
3078 }
3079
3080 if (ns->params.zoned) {
3081 status = nvme_check_zone_read(ns, slba, nlb);
3082 if (status) {
3083 goto invalid;
3084 }
3085 }
3086
3087 qemu_iovec_reset(&iocb->iov);
3088 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
3089
3090 iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_l2b(ns, slba),
3091 &iocb->iov, 0, nvme_copy_in_cb, iocb);
3092 return;
3093
3094 invalid:
3095 req->status = status;
3096 iocb->ret = -1;
3097 done:
3098 nvme_copy_done(iocb);
3099 }
3100
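/*
 * Copy: validate the descriptor format and range count, transfer the
 * source range descriptors from the host and start the per-range copy
 * loop in nvme_do_copy().
 */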
3101 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
3102 {
3103 NvmeNamespace *ns = req->ns;
3104 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
3105 NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
3106 nvme_misc_cb, req);
3107 uint16_t nr = copy->nr + 1;
3108 uint8_t format = copy->control[0] & 0xf;
3109 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
3110 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
3111 size_t len = sizeof(NvmeCopySourceRangeFormat0);
3112
3113 uint16_t status;
3114
3115 trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
3116
3117 iocb->ranges = NULL;
3118 iocb->zone = NULL;
3119
3120 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
3121 ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
3122 status = NVME_INVALID_FIELD | NVME_DNR;
3123 goto invalid;
3124 }
3125
3126 if (!(n->id_ctrl.ocfs & (1 << format))) {
3127 trace_pci_nvme_err_copy_invalid_format(format);
3128 status = NVME_INVALID_FIELD | NVME_DNR;
3129 goto invalid;
3130 }
3131
3132 if (nr > ns->id_ns.msrc + 1) {
3133 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
3134 goto invalid;
3135 }
3136
3137 if ((ns->pif == 0x0 && format != 0x0) ||
3138 (ns->pif != 0x0 && format != 0x1)) {
3139 status = NVME_INVALID_FORMAT | NVME_DNR;
3140 goto invalid;
3141 }
3142
3143 if (ns->pif) {
3144 len = sizeof(NvmeCopySourceRangeFormat1);
3145 }
3146
3147 iocb->format = format;
3148 iocb->ranges = g_malloc_n(nr, len);
3149 status = nvme_h2c(n, (uint8_t *)iocb->ranges, len * nr, req);
3150 if (status) {
3151 goto invalid;
3152 }
3153
3154 iocb->slba = le64_to_cpu(copy->sdlba);
3155
3156 if (ns->params.zoned) {
3157 iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba);
3158 if (!iocb->zone) {
3159 status = NVME_LBA_RANGE | NVME_DNR;
3160 goto invalid;
3161 }
3162
3163 status = nvme_zrm_auto(n, ns, iocb->zone);
3164 if (status) {
3165 goto invalid;
3166 }
3167 }
3168
3169 iocb->req = req;
3170 iocb->ret = 0;
3171 iocb->nr = nr;
3172 iocb->idx = 0;
3173 iocb->reftag = le32_to_cpu(copy->reftag);
3174 iocb->reftag |= (uint64_t)le32_to_cpu(copy->cdw3) << 32;
3175 iocb->bounce = g_malloc_n(le16_to_cpu(ns->id_ns.mssrl),
3176 ns->lbasz + ns->lbaf.ms);
3177
3178 qemu_iovec_init(&iocb->iov, 1);
3179
3180 block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.read, 0,
3181 BLOCK_ACCT_READ);
3182 block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.write, 0,
3183 BLOCK_ACCT_WRITE);
3184
3185 req->aiocb = &iocb->common;
3186 nvme_do_copy(iocb);
3187
3188 return NVME_NO_COMPLETE;
3189
3190 invalid:
3191 g_free(iocb->ranges);
3192 qemu_aio_unref(iocb);
3193 return status;
3194 }
3195
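/*
 * Compare: read the LBA range (and its metadata) into bounce buffers and
 * compare it against the host supplied data; any mismatch completes with
 * Compare Failure.
 */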
3196 static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
3197 {
3198 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3199 NvmeNamespace *ns = req->ns;
3200 BlockBackend *blk = ns->blkconf.blk;
3201 uint64_t slba = le64_to_cpu(rw->slba);
3202 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
3203 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3204 size_t data_len = nvme_l2b(ns, nlb);
3205 size_t len = data_len;
3206 int64_t offset = nvme_l2b(ns, slba);
3207 struct nvme_compare_ctx *ctx = NULL;
3208 uint16_t status;
3209
3210 trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
3211
3212 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
3213 return NVME_INVALID_PROT_INFO | NVME_DNR;
3214 }
3215
3216 if (nvme_ns_ext(ns)) {
3217 len += nvme_m2b(ns, nlb);
3218 }
3219
3220 status = nvme_check_mdts(n, len);
3221 if (status) {
3222 return status;
3223 }
3224
3225 status = nvme_check_bounds(ns, slba, nlb);
3226 if (status) {
3227 return status;
3228 }
3229
3230 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3231 status = nvme_check_dulbe(ns, slba, nlb);
3232 if (status) {
3233 return status;
3234 }
3235 }
3236
3237 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
3238 if (status) {
3239 return status;
3240 }
3241
3242 ctx = g_new(struct nvme_compare_ctx, 1);
3243 ctx->data.bounce = g_malloc(data_len);
3244
3245 req->opaque = ctx;
3246
3247 qemu_iovec_init(&ctx->data.iov, 1);
3248 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
3249
3250 block_acct_start(blk_get_stats(blk), &req->acct, data_len,
3251 BLOCK_ACCT_READ);
3252 req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
3253 nvme_compare_data_cb, req);
3254
3255 return NVME_NO_COMPLETE;
3256 }
3257
3258 typedef struct NvmeFlushAIOCB {
3259 BlockAIOCB common;
3260 BlockAIOCB *aiocb;
3261 NvmeRequest *req;
3262 int ret;
3263
3264 NvmeNamespace *ns;
3265 uint32_t nsid;
3266 bool broadcast;
3267 } NvmeFlushAIOCB;
3268
3269 static void nvme_flush_cancel(BlockAIOCB *acb)
3270 {
3271 NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);
3272
3273 iocb->ret = -ECANCELED;
3274
3275 if (iocb->aiocb) {
3276 blk_aio_cancel_async(iocb->aiocb);
3277 iocb->aiocb = NULL;
3278 }
3279 }
3280
3281 static const AIOCBInfo nvme_flush_aiocb_info = {
3282 .aiocb_size = sizeof(NvmeFlushAIOCB),
3283 .cancel_async = nvme_flush_cancel,
3284 .get_aio_context = nvme_get_aio_context,
3285 };
3286
3287 static void nvme_do_flush(NvmeFlushAIOCB *iocb);
3288
3289 static void nvme_flush_ns_cb(void *opaque, int ret)
3290 {
3291 NvmeFlushAIOCB *iocb = opaque;
3292 NvmeNamespace *ns = iocb->ns;
3293
3294 if (ret < 0) {
3295 iocb->ret = ret;
3296 goto out;
3297 } else if (iocb->ret < 0) {
3298 goto out;
3299 }
3300
3301 if (ns) {
3302 trace_pci_nvme_flush_ns(iocb->nsid);
3303
3304 iocb->ns = NULL;
3305 iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
3306 return;
3307 }
3308
3309 out:
3310 nvme_do_flush(iocb);
3311 }
3312
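/*
 * Drive the flush state machine: a broadcast flush (NSID FFFFFFFFh)
 * iterates over all attached namespaces and flushes each in turn; otherwise
 * only the selected namespace is flushed.
 */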
3313 static void nvme_do_flush(NvmeFlushAIOCB *iocb)
3314 {
3315 NvmeRequest *req = iocb->req;
3316 NvmeCtrl *n = nvme_ctrl(req);
3317 int i;
3318
3319 if (iocb->ret < 0) {
3320 goto done;
3321 }
3322
3323 if (iocb->broadcast) {
3324 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
3325 iocb->ns = nvme_ns(n, i);
3326 if (iocb->ns) {
3327 iocb->nsid = i;
3328 break;
3329 }
3330 }
3331 }
3332
3333 if (!iocb->ns) {
3334 goto done;
3335 }
3336
3337 nvme_flush_ns_cb(iocb, 0);
3338 return;
3339
3340 done:
3341 iocb->common.cb(iocb->common.opaque, iocb->ret);
3342 qemu_aio_unref(iocb);
3343 }
3344
3345 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
3346 {
3347 NvmeFlushAIOCB *iocb;
3348 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3349 uint16_t status;
3350
3351 iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);
3352
3353 iocb->req = req;
3354 iocb->ret = 0;
3355 iocb->ns = NULL;
3356 iocb->nsid = 0;
3357 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
3358
3359 if (!iocb->broadcast) {
3360 if (!nvme_nsid_valid(n, nsid)) {
3361 status = NVME_INVALID_NSID | NVME_DNR;
3362 goto out;
3363 }
3364
3365 iocb->ns = nvme_ns(n, nsid);
3366 if (!iocb->ns) {
3367 status = NVME_INVALID_FIELD | NVME_DNR;
3368 goto out;
3369 }
3370
3371 iocb->nsid = nsid;
3372 }
3373
3374 req->aiocb = &iocb->common;
3375 nvme_do_flush(iocb);
3376
3377 return NVME_NO_COMPLETE;
3378
3379 out:
3380 qemu_aio_unref(iocb);
3381
3382 return status;
3383 }
3384
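/*
 * Read: check transfer size, bounds, zone state and deallocated block
 * reporting, then either hand off to the protection information aware path
 * or issue the block backend read directly.
 */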
3385 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
3386 {
3387 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3388 NvmeNamespace *ns = req->ns;
3389 uint64_t slba = le64_to_cpu(rw->slba);
3390 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3391 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3392 uint64_t data_size = nvme_l2b(ns, nlb);
3393 uint64_t mapped_size = data_size;
3394 uint64_t data_offset;
3395 BlockBackend *blk = ns->blkconf.blk;
3396 uint16_t status;
3397
3398 if (nvme_ns_ext(ns)) {
3399 mapped_size += nvme_m2b(ns, nlb);
3400
3401 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3402 bool pract = prinfo & NVME_PRINFO_PRACT;
3403
3404 if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3405 mapped_size = data_size;
3406 }
3407 }
3408 }
3409
3410 trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
3411
3412 status = nvme_check_mdts(n, mapped_size);
3413 if (status) {
3414 goto invalid;
3415 }
3416
3417 status = nvme_check_bounds(ns, slba, nlb);
3418 if (status) {
3419 goto invalid;
3420 }
3421
3422 if (ns->params.zoned) {
3423 status = nvme_check_zone_read(ns, slba, nlb);
3424 if (status) {
3425 trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
3426 goto invalid;
3427 }
3428 }
3429
3430 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3431 status = nvme_check_dulbe(ns, slba, nlb);
3432 if (status) {
3433 goto invalid;
3434 }
3435 }
3436
3437 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3438 return nvme_dif_rw(n, req);
3439 }
3440
3441 status = nvme_map_data(n, nlb, req);
3442 if (status) {
3443 goto invalid;
3444 }
3445
3446 data_offset = nvme_l2b(ns, slba);
3447
3448 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3449 BLOCK_ACCT_READ);
3450 nvme_blk_read(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
3451 return NVME_NO_COMPLETE;
3452
3453 invalid:
3454 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
3455 return status | NVME_DNR;
3456 }
3457
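/*
 * Flexible Data Placement: account the write against the reclaim unit
 * addressed by the placement identifier and update the reclaim unit handle
 * whenever the write exhausts the unit's remaining media writes (ruamw).
 */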
3458 static void nvme_do_write_fdp(NvmeCtrl *n, NvmeRequest *req, uint64_t slba,
3459 uint32_t nlb)
3460 {
3461 NvmeNamespace *ns = req->ns;
3462 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3463 uint64_t data_size = nvme_l2b(ns, nlb);
3464 uint32_t dw12 = le32_to_cpu(req->cmd.cdw12);
3465 uint8_t dtype = (dw12 >> 20) & 0xf;
3466 uint16_t pid = le16_to_cpu(rw->dspec);
3467 uint16_t ph, rg, ruhid;
3468 NvmeReclaimUnit *ru;
3469
3470 if (dtype != NVME_DIRECTIVE_DATA_PLACEMENT ||
3471 !nvme_parse_pid(ns, pid, &ph, &rg)) {
3472 ph = 0;
3473 rg = 0;
3474 }
3475
3476 ruhid = ns->fdp.phs[ph];
3477 ru = &ns->endgrp->fdp.ruhs[ruhid].rus[rg];
3478
3479 nvme_fdp_stat_inc(&ns->endgrp->fdp.hbmw, data_size);
3480 nvme_fdp_stat_inc(&ns->endgrp->fdp.mbmw, data_size);
3481
3482 while (nlb) {
3483 if (nlb < ru->ruamw) {
3484 ru->ruamw -= nlb;
3485 break;
3486 }
3487
3488 nlb -= ru->ruamw;
3489 nvme_update_ruh(n, ns, pid);
3490 }
3491 }
3492
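/*
 * Common implementation of Write, Write Zeroes and Zone Append. For zoned
 * namespaces this validates the target zone, remaps the append SLBA onto
 * the zone write pointer and advances it; for FDP-enabled endurance groups
 * it updates the reclaim unit accounting.
 */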
3493 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
3494 bool wrz)
3495 {
3496 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3497 NvmeNamespace *ns = req->ns;
3498 uint64_t slba = le64_to_cpu(rw->slba);
3499 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3500 uint16_t ctrl = le16_to_cpu(rw->control);
3501 uint8_t prinfo = NVME_RW_PRINFO(ctrl);
3502 uint64_t data_size = nvme_l2b(ns, nlb);
3503 uint64_t mapped_size = data_size;
3504 uint64_t data_offset;
3505 NvmeZone *zone;
3506 NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
3507 BlockBackend *blk = ns->blkconf.blk;
3508 uint16_t status;
3509
3510 if (nvme_ns_ext(ns)) {
3511 mapped_size += nvme_m2b(ns, nlb);
3512
3513 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3514 bool pract = prinfo & NVME_PRINFO_PRACT;
3515
3516 if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3517 mapped_size -= nvme_m2b(ns, nlb);
3518 }
3519 }
3520 }
3521
3522 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
3523 nvme_nsid(ns), nlb, mapped_size, slba);
3524
3525 if (!wrz) {
3526 status = nvme_check_mdts(n, mapped_size);
3527 if (status) {
3528 goto invalid;
3529 }
3530 }
3531
3532 status = nvme_check_bounds(ns, slba, nlb);
3533 if (status) {
3534 goto invalid;
3535 }
3536
3537 if (ns->params.zoned) {
3538 zone = nvme_get_zone_by_slba(ns, slba);
3539 assert(zone);
3540
3541 if (append) {
3542 bool piremap = !!(ctrl & NVME_RW_PIREMAP);
3543
3544 if (unlikely(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3545 return NVME_INVALID_ZONE_OP | NVME_DNR;
3546 }
3547
3548 if (unlikely(slba != zone->d.zslba)) {
3549 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
3550 status = NVME_INVALID_FIELD;
3551 goto invalid;
3552 }
3553
3554 if (n->params.zasl &&
3555 data_size > (uint64_t)n->page_size << n->params.zasl) {
3556 trace_pci_nvme_err_zasl(data_size);
3557 return NVME_INVALID_FIELD | NVME_DNR;
3558 }
3559
3560 slba = zone->w_ptr;
3561 rw->slba = cpu_to_le64(slba);
3562 res->slba = cpu_to_le64(slba);
3563
3564 switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3565 case NVME_ID_NS_DPS_TYPE_1:
3566 if (!piremap) {
3567 return NVME_INVALID_PROT_INFO | NVME_DNR;
3568 }
3569
3570 /* fallthrough */
3571
3572 case NVME_ID_NS_DPS_TYPE_2:
3573 if (piremap) {
3574 uint32_t reftag = le32_to_cpu(rw->reftag);
3575 rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
3576 }
3577
3578 break;
3579
3580 case NVME_ID_NS_DPS_TYPE_3:
3581 if (piremap) {
3582 return NVME_INVALID_PROT_INFO | NVME_DNR;
3583 }
3584
3585 break;
3586 }
3587 }
3588
3589 status = nvme_check_zone_write(ns, zone, slba, nlb);
3590 if (status) {
3591 goto invalid;
3592 }
3593
3594 status = nvme_zrm_auto(n, ns, zone);
3595 if (status) {
3596 goto invalid;
3597 }
3598
3599 if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3600 zone->w_ptr += nlb;
3601 }
3602 } else if (ns->endgrp && ns->endgrp->fdp.enabled) {
3603 nvme_do_write_fdp(n, req, slba, nlb);
3604 }
3605
3606 data_offset = nvme_l2b(ns, slba);
3607
3608 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3609 return nvme_dif_rw(n, req);
3610 }
3611
3612 if (!wrz) {
3613 status = nvme_map_data(n, nlb, req);
3614 if (status) {
3615 goto invalid;
3616 }
3617
3618 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3619 BLOCK_ACCT_WRITE);
3620 nvme_blk_write(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
3621 } else {
3622 req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
3623 BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
3624 req);
3625 }
3626
3627 return NVME_NO_COMPLETE;
3628
3629 invalid:
3630 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
3631 return status | NVME_DNR;
3632 }
3633
3634 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
3635 {
3636 return nvme_do_write(n, req, false, false);
3637 }
3638
3639 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
3640 {
3641 return nvme_do_write(n, req, false, true);
3642 }
3643
3644 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
3645 {
3646 return nvme_do_write(n, req, true, false);
3647 }
3648
3649 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
3650 uint64_t *slba, uint32_t *zone_idx)
3651 {
3652 uint32_t dw10 = le32_to_cpu(c->cdw10);
3653 uint32_t dw11 = le32_to_cpu(c->cdw11);
3654
3655 if (!ns->params.zoned) {
3656 trace_pci_nvme_err_invalid_opc(c->opcode);
3657 return NVME_INVALID_OPCODE | NVME_DNR;
3658 }
3659
3660 *slba = ((uint64_t)dw11) << 32 | dw10;
3661 if (unlikely(*slba >= ns->id_ns.nsze)) {
3662 trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
3663 *slba = 0;
3664 return NVME_LBA_RANGE | NVME_DNR;
3665 }
3666
3667 *zone_idx = nvme_zone_idx(ns, *slba);
3668 assert(*zone_idx < ns->num_zones);
3669
3670 return NVME_SUCCESS;
3671 }
3672
3673 typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
3674 NvmeRequest *);
3675
3676 enum NvmeZoneProcessingMask {
3677 NVME_PROC_CURRENT_ZONE = 0,
3678 NVME_PROC_OPENED_ZONES = 1 << 0,
3679 NVME_PROC_CLOSED_ZONES = 1 << 1,
3680 NVME_PROC_READ_ONLY_ZONES = 1 << 2,
3681 NVME_PROC_FULL_ZONES = 1 << 3,
3682 };
3683
3684 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
3685 NvmeZoneState state, NvmeRequest *req)
3686 {
3687 NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
3688 int flags = 0;
3689
3690 if (cmd->zsflags & NVME_ZSFLAG_ZRWA_ALLOC) {
3691 uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
3692
3693 if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
3694 return NVME_INVALID_ZONE_OP | NVME_DNR;
3695 }
3696
3697 if (zone->w_ptr % ns->zns.zrwafg) {
3698 return NVME_NOZRWA | NVME_DNR;
3699 }
3700
3701 flags = NVME_ZRM_ZRWA;
3702 }
3703
3704 return nvme_zrm_open_flags(nvme_ctrl(req), ns, zone, flags);
3705 }
3706
3707 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
3708 NvmeZoneState state, NvmeRequest *req)
3709 {
3710 return nvme_zrm_close(ns, zone);
3711 }
3712
3713 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
3714 NvmeZoneState state, NvmeRequest *req)
3715 {
3716 return nvme_zrm_finish(ns, zone);
3717 }
3718
3719 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
3720 NvmeZoneState state, NvmeRequest *req)
3721 {
3722 switch (state) {
3723 case NVME_ZONE_STATE_READ_ONLY:
3724 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
3725 /* fall through */
3726 case NVME_ZONE_STATE_OFFLINE:
3727 return NVME_SUCCESS;
3728 default:
3729 return NVME_ZONE_INVAL_TRANSITION;
3730 }
3731 }
3732
3733 static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
3734 {
3735 uint16_t status;
3736 uint8_t state = nvme_get_zone_state(zone);
3737
3738 if (state == NVME_ZONE_STATE_EMPTY) {
3739 status = nvme_aor_check(ns, 1, 0);
3740 if (status) {
3741 return status;
3742 }
3743 nvme_aor_inc_active(ns);
3744 zone->d.za |= NVME_ZA_ZD_EXT_VALID;
3745 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
3746 return NVME_SUCCESS;
3747 }
3748
3749 return NVME_ZONE_INVAL_TRANSITION;
3750 }
3751
3752 static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
3753 enum NvmeZoneProcessingMask proc_mask,
3754 op_handler_t op_hndlr, NvmeRequest *req)
3755 {
3756 uint16_t status = NVME_SUCCESS;
3757 NvmeZoneState zs = nvme_get_zone_state(zone);
3758 bool proc_zone;
3759
3760 switch (zs) {
3761 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3762 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3763 proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
3764 break;
3765 case NVME_ZONE_STATE_CLOSED:
3766 proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
3767 break;
3768 case NVME_ZONE_STATE_READ_ONLY:
3769 proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
3770 break;
3771 case NVME_ZONE_STATE_FULL:
3772 proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
3773 break;
3774 default:
3775 proc_zone = false;
3776 }
3777
3778 if (proc_zone) {
3779 status = op_hndlr(ns, zone, zs, req);
3780 }
3781
3782 return status;
3783 }
3784
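/*
 * Apply a zone management operation either to a single zone or, when a
 * processing mask is given, to every zone whose state matches the mask.
 */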
3785 static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
3786 enum NvmeZoneProcessingMask proc_mask,
3787 op_handler_t op_hndlr, NvmeRequest *req)
3788 {
3789 NvmeZone *next;
3790 uint16_t status = NVME_SUCCESS;
3791 int i;
3792
3793 if (!proc_mask) {
3794 status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
3795 } else {
3796 if (proc_mask & NVME_PROC_CLOSED_ZONES) {
3797 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
3798 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3799 req);
3800 if (status && status != NVME_NO_COMPLETE) {
3801 goto out;
3802 }
3803 }
3804 }
3805 if (proc_mask & NVME_PROC_OPENED_ZONES) {
3806 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
3807 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3808 req);
3809 if (status && status != NVME_NO_COMPLETE) {
3810 goto out;
3811 }
3812 }
3813
3814 QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
3815 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3816 req);
3817 if (status && status != NVME_NO_COMPLETE) {
3818 goto out;
3819 }
3820 }
3821 }
3822 if (proc_mask & NVME_PROC_FULL_ZONES) {
3823 QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
3824 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3825 req);
3826 if (status && status != NVME_NO_COMPLETE) {
3827 goto out;
3828 }
3829 }
3830 }
3831
3832 if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
3833 for (i = 0; i < ns->num_zones; i++, zone++) {
3834 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3835 req);
3836 if (status && status != NVME_NO_COMPLETE) {
3837 goto out;
3838 }
3839 }
3840 }
3841 }
3842
3843 out:
3844 return status;
3845 }
3846
3847 typedef struct NvmeZoneResetAIOCB {
3848 BlockAIOCB common;
3849 BlockAIOCB *aiocb;
3850 NvmeRequest *req;
3851 int ret;
3852
3853 bool all;
3854 int idx;
3855 NvmeZone *zone;
3856 } NvmeZoneResetAIOCB;
3857
3858 static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
3859 {
3860 NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
3861 NvmeRequest *req = iocb->req;
3862 NvmeNamespace *ns = req->ns;
3863
3864 iocb->idx = ns->num_zones;
3865
3866 iocb->ret = -ECANCELED;
3867
3868 if (iocb->aiocb) {
3869 blk_aio_cancel_async(iocb->aiocb);
3870 iocb->aiocb = NULL;
3871 }
3872 }
3873
3874 static const AIOCBInfo nvme_zone_reset_aiocb_info = {
3875 .aiocb_size = sizeof(NvmeZoneResetAIOCB),
3876 .cancel_async = nvme_zone_reset_cancel,
3877 };
3878
3879 static void nvme_zone_reset_cb(void *opaque, int ret);
3880
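/*
 * Zone reset runs as an asynchronous loop: nvme_zone_reset_cb() picks the
 * next zone to reset (all resettable zones when Select All is set) and
 * zeroes its data, then nvme_zone_reset_epilogue_cb() zeroes the
 * corresponding metadata before moving on to the next zone.
 */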
3881 static void nvme_zone_reset_epilogue_cb(void *opaque, int ret)
3882 {
3883 NvmeZoneResetAIOCB *iocb = opaque;
3884 NvmeRequest *req = iocb->req;
3885 NvmeNamespace *ns = req->ns;
3886 int64_t moff;
3887 int count;
3888
3889 if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
3890 goto out;
3891 }
3892
3893 moff = nvme_moff(ns, iocb->zone->d.zslba);
3894 count = nvme_m2b(ns, ns->zone_size);
3895
3896 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count,
3897 BDRV_REQ_MAY_UNMAP,
3898 nvme_zone_reset_cb, iocb);
3899 return;
3900
3901 out:
3902 nvme_zone_reset_cb(iocb, ret);
3903 }
3904
3905 static void nvme_zone_reset_cb(void *opaque, int ret)
3906 {
3907 NvmeZoneResetAIOCB *iocb = opaque;
3908 NvmeRequest *req = iocb->req;
3909 NvmeNamespace *ns = req->ns;
3910
3911 if (iocb->ret < 0) {
3912 goto done;
3913 } else if (ret < 0) {
3914 iocb->ret = ret;
3915 goto done;
3916 }
3917
3918 if (iocb->zone) {
3919 nvme_zrm_reset(ns, iocb->zone);
3920
3921 if (!iocb->all) {
3922 goto done;
3923 }
3924 }
3925
3926 while (iocb->idx < ns->num_zones) {
3927 NvmeZone *zone = &ns->zone_array[iocb->idx++];
3928
3929 switch (nvme_get_zone_state(zone)) {
3930 case NVME_ZONE_STATE_EMPTY:
3931 if (!iocb->all) {
3932 goto done;
3933 }
3934
3935 continue;
3936
3937 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3938 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3939 case NVME_ZONE_STATE_CLOSED:
3940 case NVME_ZONE_STATE_FULL:
3941 iocb->zone = zone;
3942 break;
3943
3944 default:
3945 continue;
3946 }
3947
3948 trace_pci_nvme_zns_zone_reset(zone->d.zslba);
3949
3950 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
3951 nvme_l2b(ns, zone->d.zslba),
3952 nvme_l2b(ns, ns->zone_size),
3953 BDRV_REQ_MAY_UNMAP,
3954 nvme_zone_reset_epilogue_cb,
3955 iocb);
3956 return;
3957 }
3958
3959 done:
3960 iocb->aiocb = NULL;
3961
3962 iocb->common.cb(iocb->common.opaque, iocb->ret);
3963 qemu_aio_unref(iocb);
3964 }
3965
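/*
 * Zone Management Send, ZRWA Flush: explicitly commit the zone random write
 * area up to and including the given LBA, advancing the write pointer by
 * the flushed amount.
 */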
3966 static uint16_t nvme_zone_mgmt_send_zrwa_flush(NvmeCtrl *n, NvmeZone *zone,
3967 uint64_t elba, NvmeRequest *req)
3968 {
3969 NvmeNamespace *ns = req->ns;
3970 uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
3971 uint64_t wp = zone->d.wp;
3972 uint32_t nlb = elba - wp + 1;
3973 uint16_t status;
3974
3975
3976 if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
3977 return NVME_INVALID_ZONE_OP | NVME_DNR;
3978 }
3979
3980 if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3981 return NVME_INVALID_FIELD | NVME_DNR;
3982 }
3983
3984 if (elba < wp || elba > wp + ns->zns.zrwas) {
3985 return NVME_ZONE_BOUNDARY_ERROR | NVME_DNR;
3986 }
3987
3988 if (nlb % ns->zns.zrwafg) {
3989 return NVME_INVALID_FIELD | NVME_DNR;
3990 }
3991
3992 status = nvme_zrm_auto(n, ns, zone);
3993 if (status) {
3994 return status;
3995 }
3996
3997 zone->w_ptr += nlb;
3998
3999 nvme_advance_zone_wp(ns, zone, nlb);
4000
4001 return NVME_SUCCESS;
4002 }
4003
4004 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
4005 {
4006 NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
4007 NvmeNamespace *ns = req->ns;
4008 NvmeZone *zone;
4009 NvmeZoneResetAIOCB *iocb;
4010 uint8_t *zd_ext;
4011 uint64_t slba = 0;
4012 uint32_t zone_idx = 0;
4013 uint16_t status;
4014 uint8_t action = cmd->zsa;
4015 bool all;
4016 enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
4017
4018 all = cmd->zsflags & NVME_ZSFLAG_SELECT_ALL;
4019
4020 req->status = NVME_SUCCESS;
4021
4022 if (!all) {
4023 status = nvme_get_mgmt_zone_slba_idx(ns, &req->cmd, &slba, &zone_idx);
4024 if (status) {
4025 return status;
4026 }
4027 }
4028
4029 zone = &ns->zone_array[zone_idx];
4030 if (slba != zone->d.zslba && action != NVME_ZONE_ACTION_ZRWA_FLUSH) {
4031 trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
4032 return NVME_INVALID_FIELD | NVME_DNR;
4033 }
4034
4035 switch (action) {
4036
4037 case NVME_ZONE_ACTION_OPEN:
4038 if (all) {
4039 proc_mask = NVME_PROC_CLOSED_ZONES;
4040 }
4041 trace_pci_nvme_open_zone(slba, zone_idx, all);
4042 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
4043 break;
4044
4045 case NVME_ZONE_ACTION_CLOSE:
4046 if (all) {
4047 proc_mask = NVME_PROC_OPENED_ZONES;
4048 }
4049 trace_pci_nvme_close_zone(slba, zone_idx, all);
4050 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
4051 break;
4052
4053 case NVME_ZONE_ACTION_FINISH:
4054 if (all) {
4055 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
4056 }
4057 trace_pci_nvme_finish_zone(slba, zone_idx, all);
4058 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
4059 break;
4060
4061 case NVME_ZONE_ACTION_RESET:
4062 trace_pci_nvme_reset_zone(slba, zone_idx, all);
4063
4064 iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
4065 nvme_misc_cb, req);
4066
4067 iocb->req = req;
4068 iocb->ret = 0;
4069 iocb->all = all;
4070 iocb->idx = zone_idx;
4071 iocb->zone = NULL;
4072
4073 req->aiocb = &iocb->common;
4074 nvme_zone_reset_cb(iocb, 0);
4075
4076 return NVME_NO_COMPLETE;
4077
4078 case NVME_ZONE_ACTION_OFFLINE:
4079 if (all) {
4080 proc_mask = NVME_PROC_READ_ONLY_ZONES;
4081 }
4082 trace_pci_nvme_offline_zone(slba, zone_idx, all);
4083 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
4084 break;
4085
4086 case NVME_ZONE_ACTION_SET_ZD_EXT:
4087 trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
4088 if (all || !ns->params.zd_extension_size) {
4089 return NVME_INVALID_FIELD | NVME_DNR;
4090 }
4091 zd_ext = nvme_get_zd_extension(ns, zone_idx);
4092 status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
4093 if (status) {
4094 trace_pci_nvme_err_zd_extension_map_error(zone_idx);
4095 return status;
4096 }
4097
4098 status = nvme_set_zd_ext(ns, zone);
4099 if (status == NVME_SUCCESS) {
4100 trace_pci_nvme_zd_extension_set(zone_idx);
4101 return status;
4102 }
4103 break;
4104
4105 case NVME_ZONE_ACTION_ZRWA_FLUSH:
4106 if (all) {
4107 return NVME_INVALID_FIELD | NVME_DNR;
4108 }
4109
4110 return nvme_zone_mgmt_send_zrwa_flush(n, zone, slba, req);
4111
4112 default:
4113 trace_pci_nvme_err_invalid_mgmt_action(action);
4114 status = NVME_INVALID_FIELD;
4115 }
4116
4117 if (status == NVME_ZONE_INVAL_TRANSITION) {
4118 trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
4119 zone->d.za);
4120 }
4121 if (status) {
4122 status |= NVME_DNR;
4123 }
4124
4125 return status;
4126 }
4127
4128 static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
4129 {
4130 NvmeZoneState zs = nvme_get_zone_state(zl);
4131
4132 switch (zafs) {
4133 case NVME_ZONE_REPORT_ALL:
4134 return true;
4135 case NVME_ZONE_REPORT_EMPTY:
4136 return zs == NVME_ZONE_STATE_EMPTY;
4137 case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
4138 return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
4139 case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
4140 return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
4141 case NVME_ZONE_REPORT_CLOSED:
4142 return zs == NVME_ZONE_STATE_CLOSED;
4143 case NVME_ZONE_REPORT_FULL:
4144 return zs == NVME_ZONE_STATE_FULL;
4145 case NVME_ZONE_REPORT_READ_ONLY:
4146 return zs == NVME_ZONE_STATE_READ_ONLY;
4147 case NVME_ZONE_REPORT_OFFLINE:
4148 return zs == NVME_ZONE_STATE_OFFLINE;
4149 default:
4150 return false;
4151 }
4152 }
4153
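/*
 * Zone Management Receive: report zone descriptors, optionally with zone
 * descriptor extensions, filtered on the requested zone state.
 */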
4154 static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
4155 {
4156 NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
4157 NvmeNamespace *ns = req->ns;
4158 /* cdw12 is the zero-based number of dwords to return. Convert to bytes */
4159 uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
4160 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4161 uint32_t zone_idx, zra, zrasf, partial;
4162 uint64_t max_zones, nr_zones = 0;
4163 uint16_t status;
4164 uint64_t slba;
4165 NvmeZoneDescr *z;
4166 NvmeZone *zone;
4167 NvmeZoneReportHeader *header;
4168 void *buf, *buf_p;
4169 size_t zone_entry_sz;
4170 int i;
4171
4172 req->status = NVME_SUCCESS;
4173
4174 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
4175 if (status) {
4176 return status;
4177 }
4178
4179 zra = dw13 & 0xff;
4180 if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
4181 return NVME_INVALID_FIELD | NVME_DNR;
4182 }
4183 if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
4184 return NVME_INVALID_FIELD | NVME_DNR;
4185 }
4186
4187 zrasf = (dw13 >> 8) & 0xff;
4188 if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
4189 return NVME_INVALID_FIELD | NVME_DNR;
4190 }
4191
4192 if (data_size < sizeof(NvmeZoneReportHeader)) {
4193 return NVME_INVALID_FIELD | NVME_DNR;
4194 }
4195
4196 status = nvme_check_mdts(n, data_size);
4197 if (status) {
4198 return status;
4199 }
4200
4201 partial = (dw13 >> 16) & 0x01;
4202
4203 zone_entry_sz = sizeof(NvmeZoneDescr);
4204 if (zra == NVME_ZONE_REPORT_EXTENDED) {
4205 zone_entry_sz += ns->params.zd_extension_size;
4206 }
4207
4208 max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
4209 buf = g_malloc0(data_size);
4210
4211 zone = &ns->zone_array[zone_idx];
4212 for (i = zone_idx; i < ns->num_zones; i++) {
4213 if (partial && nr_zones >= max_zones) {
4214 break;
4215 }
4216 if (nvme_zone_matches_filter(zrasf, zone++)) {
4217 nr_zones++;
4218 }
4219 }
4220 header = buf;
4221 header->nr_zones = cpu_to_le64(nr_zones);
4222
4223 buf_p = buf + sizeof(NvmeZoneReportHeader);
4224 for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
4225 zone = &ns->zone_array[zone_idx];
4226 if (nvme_zone_matches_filter(zrasf, zone)) {
4227 z = buf_p;
4228 buf_p += sizeof(NvmeZoneDescr);
4229
4230 z->zt = zone->d.zt;
4231 z->zs = zone->d.zs;
4232 z->zcap = cpu_to_le64(zone->d.zcap);
4233 z->zslba = cpu_to_le64(zone->d.zslba);
4234 z->za = zone->d.za;
4235
4236 if (nvme_wp_is_valid(zone)) {
4237 z->wp = cpu_to_le64(zone->d.wp);
4238 } else {
4239 z->wp = cpu_to_le64(~0ULL);
4240 }
4241
4242 if (zra == NVME_ZONE_REPORT_EXTENDED) {
4243 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
4244 memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
4245 ns->params.zd_extension_size);
4246 }
4247 buf_p += ns->params.zd_extension_size;
4248 }
4249
4250 max_zones--;
4251 }
4252 }
4253
4254 status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
4255
4256 g_free(buf);
4257
4258 return status;
4259 }
4260
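/*
 * I/O Management Receive, Reclaim Unit Handle Status. One status descriptor
 * is returned for each (placement handle, reclaim group) pair of the
 * namespace, reporting the available media writes (RUAMW) of the referenced
 * reclaim unit.
 */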
4261 static uint16_t nvme_io_mgmt_recv_ruhs(NvmeCtrl *n, NvmeRequest *req,
4262 size_t len)
4263 {
4264 NvmeNamespace *ns = req->ns;
4265 NvmeEnduranceGroup *endgrp;
4266 NvmeRuhStatus *hdr;
4267 NvmeRuhStatusDescr *ruhsd;
4268 unsigned int nruhsd;
4269 uint16_t rg, ph, *ruhid;
4270 size_t trans_len;
4271 g_autofree uint8_t *buf = NULL;
4272
4273 if (!n->subsys) {
4274 return NVME_INVALID_FIELD | NVME_DNR;
4275 }
4276
4277 if (ns->params.nsid == 0 || ns->params.nsid == 0xffffffff) {
4278 return NVME_INVALID_NSID | NVME_DNR;
4279 }
4280
4281 if (!n->subsys->endgrp.fdp.enabled) {
4282 return NVME_FDP_DISABLED | NVME_DNR;
4283 }
4284
4285 endgrp = ns->endgrp;
4286
4287 nruhsd = ns->fdp.nphs * endgrp->fdp.nrg;
4288 trans_len = sizeof(NvmeRuhStatus) + nruhsd * sizeof(NvmeRuhStatusDescr);
4289 buf = g_malloc(trans_len);
4290
4291 trans_len = MIN(trans_len, len);
4292
4293 hdr = (NvmeRuhStatus *)buf;
4294 ruhsd = (NvmeRuhStatusDescr *)(buf + sizeof(NvmeRuhStatus));
4295
4296 hdr->nruhsd = cpu_to_le16(nruhsd);
4297
4298 ruhid = ns->fdp.phs;
4299
4300 for (ph = 0; ph < ns->fdp.nphs; ph++, ruhid++) {
4301 NvmeRuHandle *ruh = &endgrp->fdp.ruhs[*ruhid];
4302
4303 for (rg = 0; rg < endgrp->fdp.nrg; rg++, ruhsd++) {
4304 uint16_t pid = nvme_make_pid(ns, rg, ph);
4305
4306 ruhsd->pid = cpu_to_le16(pid);
4307 ruhsd->ruhid = *ruhid;
4308 ruhsd->earutr = 0;
4309 ruhsd->ruamw = cpu_to_le64(ruh->rus[rg].ruamw);
4310 }
4311 }
4312
4313 return nvme_c2h(n, buf, trans_len, req);
4314 }
4315
4316 static uint16_t nvme_io_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
4317 {
4318 NvmeCmd *cmd = &req->cmd;
4319 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4320 uint32_t numd = le32_to_cpu(cmd->cdw11);
4321 uint8_t mo = (cdw10 & 0xff);
4322 size_t len = (numd + 1) << 2;
4323
4324 switch (mo) {
4325 case NVME_IOMR_MO_NOP:
4326 return 0;
4327 case NVME_IOMR_MO_RUH_STATUS:
4328 return nvme_io_mgmt_recv_ruhs(n, req, len);
4329 default:
4330 return NVME_INVALID_FIELD | NVME_DNR;
4331 };
4332 }
4333
4334 static uint16_t nvme_io_mgmt_send_ruh_update(NvmeCtrl *n, NvmeRequest *req)
4335 {
4336 NvmeCmd *cmd = &req->cmd;
4337 NvmeNamespace *ns = req->ns;
4338 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4339 uint16_t ret = NVME_SUCCESS;
4340 uint32_t npid = (cdw10 >> 1) + 1;
4341 unsigned int i = 0;
4342 g_autofree uint16_t *pids = NULL;
4343 uint32_t maxnpid = n->subsys->endgrp.fdp.nrg * n->subsys->endgrp.fdp.nruh;
4344
4345 if (unlikely(npid >= MIN(NVME_FDP_MAXPIDS, maxnpid))) {
4346 return NVME_INVALID_FIELD | NVME_DNR;
4347 }
4348
4349 pids = g_new(uint16_t, npid);
4350
4351 ret = nvme_h2c(n, pids, npid * sizeof(uint16_t), req);
4352 if (ret) {
4353 return ret;
4354 }
4355
4356 for (; i < npid; i++) {
4357 if (!nvme_update_ruh(n, ns, pids[i])) {
4358 return NVME_INVALID_FIELD | NVME_DNR;
4359 }
4360 }
4361
4362 return ret;
4363 }
4364
4365 static uint16_t nvme_io_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
4366 {
4367 NvmeCmd *cmd = &req->cmd;
4368 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4369 uint8_t mo = (cdw10 & 0xff);
4370
4371 switch (mo) {
4372 case NVME_IOMS_MO_NOP:
4373 return 0;
4374 case NVME_IOMS_MO_RUH_UPDATE:
4375 return nvme_io_mgmt_send_ruh_update(n, req);
4376 default:
4377 return NVME_INVALID_FIELD | NVME_DNR;
4378 };
4379 }
4380
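/*
 * Top-level I/O command dispatch. Flush is special-cased before the
 * namespace lookup because it may carry the broadcast NSID (see the comment
 * below); all other opcodes are checked against the namespace's effective
 * I/O command set support table (ns->iocs), and fused commands are rejected.
 */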
4381 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
4382 {
4383 NvmeNamespace *ns;
4384 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4385
4386 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
4387 req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
4388
4389 if (!nvme_nsid_valid(n, nsid)) {
4390 return NVME_INVALID_NSID | NVME_DNR;
4391 }
4392
4393 /*
4394 * In the base NVM command set, Flush may apply to all namespaces
4395 * (indicated by NSID being set to FFFFFFFFh). But if that feature is used
4396 * along with TP 4056 (Namespace Types), the semantics become ambiguous.
4397 *
4398 * If NSID is indeed set to FFFFFFFFh, we simply cannot associate the
4399 * opcode with a specific command since we cannot determine a unique I/O
4400 * command set. Opcode 0h might well have completely different semantics
4401 * in some other command set than something equivalent to flushing - does
4402 * an NSID of FFFFFFFFh then
4403 * mean "for all namespaces, apply whatever command set specific command
4404 * that uses the 0h opcode?" Or does it mean "for all namespaces, apply
4405 * whatever command that uses the 0h opcode if, and only if, it allows NSID
4406 * to be FFFFFFFFh"?
4407 *
4408 * Anyway (and luckily), for now, we do not care about this since the
4409 * device only supports namespace types that include the NVM Flush command
4410 * (NVM and Zoned), so always do an NVM Flush.
4411 */
4412 if (req->cmd.opcode == NVME_CMD_FLUSH) {
4413 return nvme_flush(n, req);
4414 }
4415
4416 ns = nvme_ns(n, nsid);
4417 if (unlikely(!ns)) {
4418 return NVME_INVALID_FIELD | NVME_DNR;
4419 }
4420
4421 if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
4422 trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
4423 return NVME_INVALID_OPCODE | NVME_DNR;
4424 }
4425
4426 if (ns->status) {
4427 return ns->status;
4428 }
4429
4430 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
4431 return NVME_INVALID_FIELD;
4432 }
4433
4434 req->ns = ns;
4435
4436 switch (req->cmd.opcode) {
4437 case NVME_CMD_WRITE_ZEROES:
4438 return nvme_write_zeroes(n, req);
4439 case NVME_CMD_ZONE_APPEND:
4440 return nvme_zone_append(n, req);
4441 case NVME_CMD_WRITE:
4442 return nvme_write(n, req);
4443 case NVME_CMD_READ:
4444 return nvme_read(n, req);
4445 case NVME_CMD_COMPARE:
4446 return nvme_compare(n, req);
4447 case NVME_CMD_DSM:
4448 return nvme_dsm(n, req);
4449 case NVME_CMD_VERIFY:
4450 return nvme_verify(n, req);
4451 case NVME_CMD_COPY:
4452 return nvme_copy(n, req);
4453 case NVME_CMD_ZONE_MGMT_SEND:
4454 return nvme_zone_mgmt_send(n, req);
4455 case NVME_CMD_ZONE_MGMT_RECV:
4456 return nvme_zone_mgmt_recv(n, req);
4457 case NVME_CMD_IO_MGMT_RECV:
4458 return nvme_io_mgmt_recv(n, req);
4459 case NVME_CMD_IO_MGMT_SEND:
4460 return nvme_io_mgmt_send(n, req);
4461 default:
4462 assert(false);
4463 }
4464
4465 return NVME_INVALID_OPCODE | NVME_DNR;
4466 }
4467
4468 static void nvme_cq_notifier(EventNotifier *e)
4469 {
4470 NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
4471 NvmeCtrl *n = cq->ctrl;
4472
4473 if (!event_notifier_test_and_clear(e)) {
4474 return;
4475 }
4476
4477 nvme_update_cq_head(cq);
4478
4479 if (cq->tail == cq->head) {
4480 if (cq->irq_enabled) {
4481 n->cq_pending--;
4482 }
4483
4484 nvme_irq_deassert(n, cq);
4485 }
4486
4487 qemu_bh_schedule(cq->bh);
4488 }
4489
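/*
 * With CAP.DSTRD set to zero, each queue pair owns 8 bytes of doorbell space
 * starting at register offset 1000h: the submission queue tail doorbell at
 * sqid << 3 and the corresponding completion queue head doorbell 4 bytes
 * after it. The ioeventfds set up here and in nvme_init_sq_ioeventfd() are
 * registered on exactly these MMIO offsets.
 */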
4490 static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
4491 {
4492 NvmeCtrl *n = cq->ctrl;
4493 uint16_t offset = (cq->cqid << 3) + (1 << 2);
4494 int ret;
4495
4496 ret = event_notifier_init(&cq->notifier, 0);
4497 if (ret < 0) {
4498 return ret;
4499 }
4500
4501 event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
4502 memory_region_add_eventfd(&n->iomem,
4503 0x1000 + offset, 4, false, 0, &cq->notifier);
4504
4505 return 0;
4506 }
4507
4508 static void nvme_sq_notifier(EventNotifier *e)
4509 {
4510 NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
4511
4512 if (!event_notifier_test_and_clear(e)) {
4513 return;
4514 }
4515
4516 nvme_process_sq(sq);
4517 }
4518
4519 static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
4520 {
4521 NvmeCtrl *n = sq->ctrl;
4522 uint16_t offset = sq->sqid << 3;
4523 int ret;
4524
4525 ret = event_notifier_init(&sq->notifier, 0);
4526 if (ret < 0) {
4527 return ret;
4528 }
4529
4530 event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
4531 memory_region_add_eventfd(&n->iomem,
4532 0x1000 + offset, 4, false, 0, &sq->notifier);
4533
4534 return 0;
4535 }
4536
4537 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
4538 {
4539 uint16_t offset = sq->sqid << 3;
4540
4541 n->sq[sq->sqid] = NULL;
4542 qemu_bh_delete(sq->bh);
4543 if (sq->ioeventfd_enabled) {
4544 memory_region_del_eventfd(&n->iomem,
4545 0x1000 + offset, 4, false, 0, &sq->notifier);
4546 event_notifier_set_handler(&sq->notifier, NULL);
4547 event_notifier_cleanup(&sq->notifier);
4548 }
4549 g_free(sq->io_req);
4550 if (sq->sqid) {
4551 g_free(sq);
4552 }
4553 }
4554
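/*
 * Delete I/O Submission Queue. Outstanding AIOs are cancelled first; any
 * completions already queued on the associated completion queue are posted
 * and the queue's request slots reclaimed before the queue itself is freed.
 */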
4555 static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
4556 {
4557 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4558 NvmeRequest *r, *next;
4559 NvmeSQueue *sq;
4560 NvmeCQueue *cq;
4561 uint16_t qid = le16_to_cpu(c->qid);
4562
4563 if (unlikely(!qid || nvme_check_sqid(n, qid))) {
4564 trace_pci_nvme_err_invalid_del_sq(qid);
4565 return NVME_INVALID_QID | NVME_DNR;
4566 }
4567
4568 trace_pci_nvme_del_sq(qid);
4569
4570 sq = n->sq[qid];
4571 while (!QTAILQ_EMPTY(&sq->out_req_list)) {
4572 r = QTAILQ_FIRST(&sq->out_req_list);
4573 assert(r->aiocb);
4574 blk_aio_cancel(r->aiocb);
4575 }
4576
4577 assert(QTAILQ_EMPTY(&sq->out_req_list));
4578
4579 if (!nvme_check_cqid(n, sq->cqid)) {
4580 cq = n->cq[sq->cqid];
4581 QTAILQ_REMOVE(&cq->sq_list, sq, entry);
4582
4583 nvme_post_cqes(cq);
4584 QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
4585 if (r->sq == sq) {
4586 QTAILQ_REMOVE(&cq->req_list, r, entry);
4587 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
4588 }
4589 }
4590 }
4591
4592 nvme_free_sq(sq, n);
4593 return NVME_SUCCESS;
4594 }
4595
4596 static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
4597 uint16_t sqid, uint16_t cqid, uint16_t size)
4598 {
4599 int i;
4600 NvmeCQueue *cq;
4601
4602 sq->ctrl = n;
4603 sq->dma_addr = dma_addr;
4604 sq->sqid = sqid;
4605 sq->size = size;
4606 sq->cqid = cqid;
4607 sq->head = sq->tail = 0;
4608 sq->io_req = g_new0(NvmeRequest, sq->size);
4609
4610 QTAILQ_INIT(&sq->req_list);
4611 QTAILQ_INIT(&sq->out_req_list);
4612 for (i = 0; i < sq->size; i++) {
4613 sq->io_req[i].sq = sq;
4614 QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
4615 }
4616
4617 sq->bh = qemu_bh_new_guarded(nvme_process_sq, sq,
4618 &DEVICE(sq->ctrl)->mem_reentrancy_guard);
4619
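/*
 * With the Doorbell Buffer Config in effect, the shadow doorbell and
 * EventIdx entries for this submission queue live at offset sqid << 3 in the
 * host-supplied buffers, mirroring the MMIO doorbell layout.
 */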
4620 if (n->dbbuf_enabled) {
4621 sq->db_addr = n->dbbuf_dbs + (sqid << 3);
4622 sq->ei_addr = n->dbbuf_eis + (sqid << 3);
4623
4624 if (n->params.ioeventfd && sq->sqid != 0) {
4625 if (!nvme_init_sq_ioeventfd(sq)) {
4626 sq->ioeventfd_enabled = true;
4627 }
4628 }
4629 }
4630
4631 assert(n->cq[cqid]);
4632 cq = n->cq[cqid];
4633 QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
4634 n->sq[sqid] = sq;
4635 }
4636
4637 static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
4638 {
4639 NvmeSQueue *sq;
4640 NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;
4641
4642 uint16_t cqid = le16_to_cpu(c->cqid);
4643 uint16_t sqid = le16_to_cpu(c->sqid);
4644 uint16_t qsize = le16_to_cpu(c->qsize);
4645 uint16_t qflags = le16_to_cpu(c->sq_flags);
4646 uint64_t prp1 = le64_to_cpu(c->prp1);
4647
4648 trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
4649
4650 if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
4651 trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
4652 return NVME_INVALID_CQID | NVME_DNR;
4653 }
4654 if (unlikely(!sqid || sqid > n->conf_ioqpairs || n->sq[sqid] != NULL)) {
4655 trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
4656 return NVME_INVALID_QID | NVME_DNR;
4657 }
4658 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4659 trace_pci_nvme_err_invalid_create_sq_size(qsize);
4660 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4661 }
4662 if (unlikely(prp1 & (n->page_size - 1))) {
4663 trace_pci_nvme_err_invalid_create_sq_addr(prp1);
4664 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4665 }
4666 if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
4667 trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
4668 return NVME_INVALID_FIELD | NVME_DNR;
4669 }
4670 sq = g_malloc0(sizeof(*sq));
4671 nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
4672 return NVME_SUCCESS;
4673 }
4674
4675 struct nvme_stats {
4676 uint64_t units_read;
4677 uint64_t units_written;
4678 uint64_t read_commands;
4679 uint64_t write_commands;
4680 };
4681
4682 static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
4683 {
4684 BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
4685
4686 stats->units_read += s->nr_bytes[BLOCK_ACCT_READ];
4687 stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE];
4688 stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
4689 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
4690 }
4691
4692 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4693 uint64_t off, NvmeRequest *req)
4694 {
4695 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4696 struct nvme_stats stats = { 0 };
4697 NvmeSmartLog smart = { 0 };
4698 uint32_t trans_len;
4699 NvmeNamespace *ns;
4700 time_t current_ms;
4701 uint64_t u_read, u_written;
4702
4703 if (off >= sizeof(smart)) {
4704 return NVME_INVALID_FIELD | NVME_DNR;
4705 }
4706
4707 if (nsid != 0xffffffff) {
4708 ns = nvme_ns(n, nsid);
4709 if (!ns) {
4710 return NVME_INVALID_NSID | NVME_DNR;
4711 }
4712 nvme_set_blk_stats(ns, &stats);
4713 } else {
4714 int i;
4715
4716 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4717 ns = nvme_ns(n, i);
4718 if (!ns) {
4719 continue;
4720 }
4721 nvme_set_blk_stats(ns, &stats);
4722 }
4723 }
4724
4725 trans_len = MIN(sizeof(smart) - off, buf_len);
4726 smart.critical_warning = n->smart_critical_warning;
4727
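/*
 * The SMART log reports data units as thousands of 512-byte units, rounded
 * up, independent of the namespace LBA format; hence the conversion from the
 * byte counters kept by the block accounting layer.
 */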
4728 u_read = DIV_ROUND_UP(stats.units_read >> BDRV_SECTOR_BITS, 1000);
4729 u_written = DIV_ROUND_UP(stats.units_written >> BDRV_SECTOR_BITS, 1000);
4730
4731 smart.data_units_read[0] = cpu_to_le64(u_read);
4732 smart.data_units_written[0] = cpu_to_le64(u_written);
4733 smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
4734 smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
4735
4736 smart.temperature = cpu_to_le16(n->temperature);
4737
4738 if ((n->temperature >= n->features.temp_thresh_hi) ||
4739 (n->temperature <= n->features.temp_thresh_low)) {
4740 smart.critical_warning |= NVME_SMART_TEMPERATURE;
4741 }
4742
4743 current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4744 smart.power_on_hours[0] =
4745 cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
4746
4747 if (!rae) {
4748 nvme_clear_events(n, NVME_AER_TYPE_SMART);
4749 }
4750
4751 return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
4752 }
4753
4754 static uint16_t nvme_endgrp_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4755 uint64_t off, NvmeRequest *req)
4756 {
4757 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
4758 uint16_t endgrpid = (dw11 >> 16) & 0xffff;
4759 struct nvme_stats stats = {};
4760 NvmeEndGrpLog info = {};
4761 int i;
4762
4763 if (!n->subsys || endgrpid != 0x1) {
4764 return NVME_INVALID_FIELD | NVME_DNR;
4765 }
4766
4767 if (off >= sizeof(info)) {
4768 return NVME_INVALID_FIELD | NVME_DNR;
4769 }
4770
4771 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4772 NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i);
4773 if (!ns) {
4774 continue;
4775 }
4776
4777 nvme_set_blk_stats(ns, &stats);
4778 }
4779
4780 info.data_units_read[0] =
4781 cpu_to_le64(DIV_ROUND_UP(stats.units_read / 1000000000, 1000000000));
4782 info.data_units_written[0] =
4783 cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000));
4784 info.media_units_written[0] =
4785 cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000));
4786
4787 info.host_read_commands[0] = cpu_to_le64(stats.read_commands);
4788 info.host_write_commands[0] = cpu_to_le64(stats.write_commands);
4789
4790 buf_len = MIN(sizeof(info) - off, buf_len);
4791
4792 return nvme_c2h(n, (uint8_t *)&info + off, buf_len, req);
4793 }
4794
4795
4796 static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
4797 NvmeRequest *req)
4798 {
4799 uint32_t trans_len;
4800 NvmeFwSlotInfoLog fw_log = {
4801 .afi = 0x1,
4802 };
4803
4804 if (off >= sizeof(fw_log)) {
4805 return NVME_INVALID_FIELD | NVME_DNR;
4806 }
4807
4808 strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
4809 trans_len = MIN(sizeof(fw_log) - off, buf_len);
4810
4811 return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
4812 }
4813
4814 static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4815 uint64_t off, NvmeRequest *req)
4816 {
4817 uint32_t trans_len;
4818 NvmeErrorLog errlog;
4819
4820 if (off >= sizeof(errlog)) {
4821 return NVME_INVALID_FIELD | NVME_DNR;
4822 }
4823
4824 if (!rae) {
4825 nvme_clear_events(n, NVME_AER_TYPE_ERROR);
4826 }
4827
4828 memset(&errlog, 0x0, sizeof(errlog));
4829 trans_len = MIN(sizeof(errlog) - off, buf_len);
4830
4831 return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
4832 }
4833
4834 static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4835 uint64_t off, NvmeRequest *req)
4836 {
4837 uint32_t nslist[1024];
4838 uint32_t trans_len;
4839 int i = 0;
4840 uint32_t nsid;
4841
4842 if (off >= sizeof(nslist)) {
4843 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(nslist));
4844 return NVME_INVALID_FIELD | NVME_DNR;
4845 }
4846
4847 memset(nslist, 0x0, sizeof(nslist));
4848 trans_len = MIN(sizeof(nslist) - off, buf_len);
4849
4850 while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
4851 NVME_CHANGED_NSID_SIZE) {
4852 /*
4853 * If more than 1024 namespaces have changed, the first entry in the log
4854 * page is set to FFFFFFFFh and the rest are zeroed, as per the spec.
4855 */
4856 if (i == ARRAY_SIZE(nslist)) {
4857 memset(nslist, 0x0, sizeof(nslist));
4858 nslist[0] = 0xffffffff;
4859 break;
4860 }
4861
4862 nslist[i++] = nsid;
4863 clear_bit(nsid, n->changed_nsids);
4864 }
4865
4866 /*
4867 * Clear any remaining changed-namespace bits in case the loop above exited
4868 * early because more than 1024 namespaces changed.
4869 */
4870 if (nslist[0] == 0xffffffff) {
4871 bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
4872 }
4873
4874 if (!rae) {
4875 nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
4876 }
4877
4878 return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
4879 }
4880
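/*
 * Commands Supported and Effects log. Admin command effects are always
 * reported; which I/O command effects are included depends on the command
 * set selected in CC.CSS and, when all supported command sets are enabled,
 * on the CSI given in CDW14.
 */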
4881 static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
4882 uint64_t off, NvmeRequest *req)
4883 {
4884 NvmeEffectsLog log = {};
4885 const uint32_t *src_iocs = NULL;
4886 uint32_t trans_len;
4887
4888 if (off >= sizeof(log)) {
4889 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
4890 return NVME_INVALID_FIELD | NVME_DNR;
4891 }
4892
4893 switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) {
4894 case NVME_CC_CSS_NVM:
4895 src_iocs = nvme_cse_iocs_nvm;
4896 /* fall through */
4897 case NVME_CC_CSS_ADMIN_ONLY:
4898 break;
4899 case NVME_CC_CSS_CSI:
4900 switch (csi) {
4901 case NVME_CSI_NVM:
4902 src_iocs = nvme_cse_iocs_nvm;
4903 break;
4904 case NVME_CSI_ZONED:
4905 src_iocs = nvme_cse_iocs_zoned;
4906 break;
4907 }
4908 }
4909
4910 memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
4911
4912 if (src_iocs) {
4913 memcpy(log.iocs, src_iocs, sizeof(log.iocs));
4914 }
4915
4916 trans_len = MIN(sizeof(log) - off, buf_len);
4917
4918 return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
4919 }
4920
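/*
 * FDP Configurations log page: an NvmeFdpConfsHdr followed by a single
 * configuration descriptor, which is itself a header plus one reclaim unit
 * handle descriptor per RUH and a vendor specific area, padded to an 8-byte
 * multiple.
 */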
4921 static size_t sizeof_fdp_conf_descr(size_t nruh, size_t vss)
4922 {
4923 size_t entry_siz = sizeof(NvmeFdpDescrHdr) + nruh * sizeof(NvmeRuhDescr)
4924 + vss;
4925 return ROUND_UP(entry_siz, 8);
4926 }
4927
4928 static uint16_t nvme_fdp_confs(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
4929 uint64_t off, NvmeRequest *req)
4930 {
4931 uint32_t log_size, trans_len;
4932 g_autofree uint8_t *buf = NULL;
4933 NvmeFdpDescrHdr *hdr;
4934 NvmeRuhDescr *ruhd;
4935 NvmeEnduranceGroup *endgrp;
4936 NvmeFdpConfsHdr *log;
4937 size_t nruh, fdp_descr_size;
4938 int i;
4939
4940 if (endgrpid != 1 || !n->subsys) {
4941 return NVME_INVALID_FIELD | NVME_DNR;
4942 }
4943
4944 endgrp = &n->subsys->endgrp;
4945
4946 if (endgrp->fdp.enabled) {
4947 nruh = endgrp->fdp.nruh;
4948 } else {
4949 nruh = 1;
4950 }
4951
4952 fdp_descr_size = sizeof_fdp_conf_descr(nruh, FDPVSS);
4953 log_size = sizeof(NvmeFdpConfsHdr) + fdp_descr_size;
4954
4955 if (off >= log_size) {
4956 return NVME_INVALID_FIELD | NVME_DNR;
4957 }
4958
4959 trans_len = MIN(log_size - off, buf_len);
4960
4961 buf = g_malloc0(log_size);
4962 log = (NvmeFdpConfsHdr *)buf;
4963 hdr = (NvmeFdpDescrHdr *)(log + 1);
4964 ruhd = (NvmeRuhDescr *)(buf + sizeof(*log) + sizeof(*hdr));
4965
4966 log->num_confs = cpu_to_le16(0);
4967 log->size = cpu_to_le32(log_size);
4968
4969 hdr->descr_size = cpu_to_le16(fdp_descr_size);
4970 if (endgrp->fdp.enabled) {
4971 hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, VALID, 1);
4972 hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, RGIF, endgrp->fdp.rgif);
4973 hdr->nrg = cpu_to_le16(endgrp->fdp.nrg);
4974 hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
4975 hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
4976 hdr->nnss = cpu_to_le32(NVME_MAX_NAMESPACES);
4977 hdr->runs = cpu_to_le64(endgrp->fdp.runs);
4978
4979 for (i = 0; i < nruh; i++) {
4980 ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
4981 ruhd++;
4982 }
4983 } else {
4984 /* 1 bit for RUH in PIF -> 2 RUHs max. */
4985 hdr->nrg = cpu_to_le16(1);
4986 hdr->nruh = cpu_to_le16(1);
4987 hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
4988 hdr->nnss = cpu_to_le32(1);
4989 hdr->runs = cpu_to_le64(96 * MiB);
4990
4991 ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
4992 }
4993
4994 return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
4995 }
4996
4997 static uint16_t nvme_fdp_ruh_usage(NvmeCtrl *n, uint32_t endgrpid,
4998 uint32_t dw10, uint32_t dw12,
4999 uint32_t buf_len, uint64_t off,
5000 NvmeRequest *req)
5001 {
5002 NvmeRuHandle *ruh;
5003 NvmeRuhuLog *hdr;
5004 NvmeRuhuDescr *ruhud;
5005 NvmeEnduranceGroup *endgrp;
5006 g_autofree uint8_t *buf = NULL;
5007 uint32_t log_size, trans_len;
5008 uint16_t i;
5009
5010 if (endgrpid != 1 || !n->subsys) {
5011 return NVME_INVALID_FIELD | NVME_DNR;
5012 }
5013
5014 endgrp = &n->subsys->endgrp;
5015
5016 if (!endgrp->fdp.enabled) {
5017 return NVME_FDP_DISABLED | NVME_DNR;
5018 }
5019
5020 log_size = sizeof(NvmeRuhuLog) + endgrp->fdp.nruh * sizeof(NvmeRuhuDescr);
5021
5022 if (off >= log_size) {
5023 return NVME_INVALID_FIELD | NVME_DNR;
5024 }
5025
5026 trans_len = MIN(log_size - off, buf_len);
5027
5028 buf = g_malloc0(log_size);
5029 hdr = (NvmeRuhuLog *)buf;
5030 ruhud = (NvmeRuhuDescr *)(hdr + 1);
5031
5032 ruh = endgrp->fdp.ruhs;
5033 hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
5034
5035 for (i = 0; i < endgrp->fdp.nruh; i++, ruhud++, ruh++) {
5036 ruhud->ruha = ruh->ruha;
5037 }
5038
5039 return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
5040 }
5041
5042 static uint16_t nvme_fdp_stats(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
5043 uint64_t off, NvmeRequest *req)
5044 {
5045 NvmeEnduranceGroup *endgrp;
5046 NvmeFdpStatsLog log = {};
5047 uint32_t trans_len;
5048
5049 if (off >= sizeof(NvmeFdpStatsLog)) {
5050 return NVME_INVALID_FIELD | NVME_DNR;
5051 }
5052
5053 if (endgrpid != 1 || !n->subsys) {
5054 return NVME_INVALID_FIELD | NVME_DNR;
5055 }
5056
5057 if (!n->subsys->endgrp.fdp.enabled) {
5058 return NVME_FDP_DISABLED | NVME_DNR;
5059 }
5060
5061 endgrp = &n->subsys->endgrp;
5062
5063 trans_len = MIN(sizeof(log) - off, buf_len);
5064
5065 /* spec fields are 128 bits wide; only the lower 64 bits are used */
5066 log.hbmw[0] = cpu_to_le64(endgrp->fdp.hbmw);
5067 log.mbmw[0] = cpu_to_le64(endgrp->fdp.mbmw);
5068 log.mbe[0] = cpu_to_le64(endgrp->fdp.mbe);
5069
5070 return nvme_c2h(n, (uint8_t *)&log + off, trans_len, req);
5071 }
5072
5073 static uint16_t nvme_fdp_events(NvmeCtrl *n, uint32_t endgrpid,
5074 uint32_t buf_len, uint64_t off,
5075 NvmeRequest *req)
5076 {
5077 NvmeEnduranceGroup *endgrp;
5078 NvmeCmd *cmd = &req->cmd;
5079 bool host_events = (cmd->cdw10 >> 8) & 0x1;
5080 uint32_t log_size, trans_len;
5081 NvmeFdpEventBuffer *ebuf;
5082 g_autofree NvmeFdpEventsLog *elog = NULL;
5083 NvmeFdpEvent *event;
5084
5085 if (endgrpid != 1 || !n->subsys) {
5086 return NVME_INVALID_FIELD | NVME_DNR;
5087 }
5088
5089 endgrp = &n->subsys->endgrp;
5090
5091 if (!endgrp->fdp.enabled) {
5092 return NVME_FDP_DISABLED | NVME_DNR;
5093 }
5094
5095 if (host_events) {
5096 ebuf = &endgrp->fdp.host_events;
5097 } else {
5098 ebuf = &endgrp->fdp.ctrl_events;
5099 }
5100
5101 log_size = sizeof(NvmeFdpEventsLog) + ebuf->nelems * sizeof(NvmeFdpEvent);
5102 trans_len = MIN(log_size - off, buf_len);
5103 elog = g_malloc0(log_size);
5104 elog->num_events = cpu_to_le32(ebuf->nelems);
5105 event = (NvmeFdpEvent *)(elog + 1);
5106
5107 if (ebuf->nelems && ebuf->start == ebuf->next) {
5108 unsigned int nelems = (NVME_FDP_MAX_EVENTS - ebuf->start);
5109 /* wrapped around: copy [start, NVME_FDP_MAX_EVENTS) and then [0, next) */
5110 memcpy(event, &ebuf->events[ebuf->start],
5111 sizeof(NvmeFdpEvent) * nelems);
5112 memcpy(event + nelems, ebuf->events,
5113 sizeof(NvmeFdpEvent) * ebuf->next);
5114 } else if (ebuf->start < ebuf->next) {
5115 memcpy(event, &ebuf->events[ebuf->start],
5116 sizeof(NvmeFdpEvent) * (ebuf->next - ebuf->start));
5117 }
5118
5119 return nvme_c2h(n, (uint8_t *)elog + off, trans_len, req);
5120 }
5121
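/*
 * Get Log Page. The transfer length is assembled from the split NUMDL/NUMDU
 * fields as a zero-based dword count and the log page offset from LPOL/LPOU;
 * the offset must be dword aligned. Each handler below clamps the transfer
 * to the size of its log page.
 */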
5122 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
5123 {
5124 NvmeCmd *cmd = &req->cmd;
5125
5126 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5127 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
5128 uint32_t dw12 = le32_to_cpu(cmd->cdw12);
5129 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
5130 uint8_t lid = dw10 & 0xff;
5131 uint8_t lsp = (dw10 >> 8) & 0xf;
5132 uint8_t rae = (dw10 >> 15) & 0x1;
5133 uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
5134 uint32_t numdl, numdu, lspi;
5135 uint64_t off, lpol, lpou;
5136 size_t len;
5137 uint16_t status;
5138
5139 numdl = (dw10 >> 16);
5140 numdu = (dw11 & 0xffff);
5141 lspi = (dw11 >> 16);
5142 lpol = dw12;
5143 lpou = dw13;
5144
5145 len = (((numdu << 16) | numdl) + 1) << 2;
5146 off = (lpou << 32ULL) | lpol;
5147
5148 if (off & 0x3) {
5149 return NVME_INVALID_FIELD | NVME_DNR;
5150 }
5151
5152 trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);
5153
5154 status = nvme_check_mdts(n, len);
5155 if (status) {
5156 return status;
5157 }
5158
5159 switch (lid) {
5160 case NVME_LOG_ERROR_INFO:
5161 return nvme_error_info(n, rae, len, off, req);
5162 case NVME_LOG_SMART_INFO:
5163 return nvme_smart_info(n, rae, len, off, req);
5164 case NVME_LOG_FW_SLOT_INFO:
5165 return nvme_fw_log_info(n, len, off, req);
5166 case NVME_LOG_CHANGED_NSLIST:
5167 return nvme_changed_nslist(n, rae, len, off, req);
5168 case NVME_LOG_CMD_EFFECTS:
5169 return nvme_cmd_effects(n, csi, len, off, req);
5170 case NVME_LOG_ENDGRP:
5171 return nvme_endgrp_info(n, rae, len, off, req);
5172 case NVME_LOG_FDP_CONFS:
5173 return nvme_fdp_confs(n, lspi, len, off, req);
5174 case NVME_LOG_FDP_RUH_USAGE:
5175 return nvme_fdp_ruh_usage(n, lspi, dw10, dw12, len, off, req);
5176 case NVME_LOG_FDP_STATS:
5177 return nvme_fdp_stats(n, lspi, len, off, req);
5178 case NVME_LOG_FDP_EVENTS:
5179 return nvme_fdp_events(n, lspi, len, off, req);
5180 default:
5181 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
5182 return NVME_INVALID_FIELD | NVME_DNR;
5183 }
5184 }
5185
5186 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
5187 {
5188 PCIDevice *pci = PCI_DEVICE(n);
5189 uint16_t offset = (cq->cqid << 3) + (1 << 2);
5190
5191 n->cq[cq->cqid] = NULL;
5192 qemu_bh_delete(cq->bh);
5193 if (cq->ioeventfd_enabled) {
5194 memory_region_del_eventfd(&n->iomem,
5195 0x1000 + offset, 4, false, 0, &cq->notifier);
5196 event_notifier_set_handler(&cq->notifier, NULL);
5197 event_notifier_cleanup(&cq->notifier);
5198 }
5199 if (msix_enabled(pci)) {
5200 msix_vector_unuse(pci, cq->vector);
5201 }
5202 if (cq->cqid) {
5203 g_free(cq);
5204 }
5205 }
5206
5207 static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
5208 {
5209 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
5210 NvmeCQueue *cq;
5211 uint16_t qid = le16_to_cpu(c->qid);
5212
5213 if (unlikely(!qid || nvme_check_cqid(n, qid))) {
5214 trace_pci_nvme_err_invalid_del_cq_cqid(qid);
5215 return NVME_INVALID_CQID | NVME_DNR;
5216 }
5217
5218 cq = n->cq[qid];
5219 if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
5220 trace_pci_nvme_err_invalid_del_cq_notempty(qid);
5221 return NVME_INVALID_QUEUE_DEL;
5222 }
5223
5224 if (cq->irq_enabled && cq->tail != cq->head) {
5225 n->cq_pending--;
5226 }
5227
5228 nvme_irq_deassert(n, cq);
5229 trace_pci_nvme_del_cq(qid);
5230 nvme_free_cq(cq, n);
5231 return NVME_SUCCESS;
5232 }
5233
5234 static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
5235 uint16_t cqid, uint16_t vector, uint16_t size,
5236 uint16_t irq_enabled)
5237 {
5238 PCIDevice *pci = PCI_DEVICE(n);
5239
5240 if (msix_enabled(pci)) {
5241 msix_vector_use(pci, vector);
5242 }
5243 cq->ctrl = n;
5244 cq->cqid = cqid;
5245 cq->size = size;
5246 cq->dma_addr = dma_addr;
5247 cq->phase = 1;
5248 cq->irq_enabled = irq_enabled;
5249 cq->vector = vector;
5250 cq->head = cq->tail = 0;
5251 QTAILQ_INIT(&cq->req_list);
5252 QTAILQ_INIT(&cq->sq_list);
5253 if (n->dbbuf_enabled) {
5254 cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
5255 cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
5256
5257 if (n->params.ioeventfd && cqid != 0) {
5258 if (!nvme_init_cq_ioeventfd(cq)) {
5259 cq->ioeventfd_enabled = true;
5260 }
5261 }
5262 }
5263 n->cq[cqid] = cq;
5264 cq->bh = qemu_bh_new_guarded(nvme_post_cqes, cq,
5265 &DEVICE(cq->ctrl)->mem_reentrancy_guard);
5266 }
5267
5268 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
5269 {
5270 NvmeCQueue *cq;
5271 NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
5272 uint16_t cqid = le16_to_cpu(c->cqid);
5273 uint16_t vector = le16_to_cpu(c->irq_vector);
5274 uint16_t qsize = le16_to_cpu(c->qsize);
5275 uint16_t qflags = le16_to_cpu(c->cq_flags);
5276 uint64_t prp1 = le64_to_cpu(c->prp1);
5277
5278 trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
5279 NVME_CQ_FLAGS_IEN(qflags) != 0);
5280
5281 if (unlikely(!cqid || cqid > n->conf_ioqpairs || n->cq[cqid] != NULL)) {
5282 trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
5283 return NVME_INVALID_QID | NVME_DNR;
5284 }
5285 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
5286 trace_pci_nvme_err_invalid_create_cq_size(qsize);
5287 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
5288 }
5289 if (unlikely(prp1 & (n->page_size - 1))) {
5290 trace_pci_nvme_err_invalid_create_cq_addr(prp1);
5291 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
5292 }
5293 if (unlikely(!msix_enabled(PCI_DEVICE(n)) && vector)) {
5294 trace_pci_nvme_err_invalid_create_cq_vector(vector);
5295 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
5296 }
5297 if (unlikely(vector >= n->conf_msix_qsize)) {
5298 trace_pci_nvme_err_invalid_create_cq_vector(vector);
5299 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
5300 }
5301 if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
5302 trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
5303 return NVME_INVALID_FIELD | NVME_DNR;
5304 }
5305
5306 cq = g_malloc0(sizeof(*cq));
5307 nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
5308 NVME_CQ_FLAGS_IEN(qflags));
5309
5310 /*
5311 * It is only required to set qs_created when creating a completion queue;
5312 * creating a submission queue without a matching completion queue will
5313 * fail.
5314 */
5315 n->qs_created = true;
5316 return NVME_SUCCESS;
5317 }
5318
5319 static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
5320 {
5321 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
5322
5323 return nvme_c2h(n, id, sizeof(id), req);
5324 }
5325
5326 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
5327 {
5328 trace_pci_nvme_identify_ctrl();
5329
5330 return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
5331 }
5332
5333 static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
5334 {
5335 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5336 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
5337 NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;
5338
5339 trace_pci_nvme_identify_ctrl_csi(c->csi);
5340
5341 switch (c->csi) {
5342 case NVME_CSI_NVM:
5343 id_nvm->vsl = n->params.vsl;
5344 id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
5345 break;
5346
5347 case NVME_CSI_ZONED:
5348 ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
5349 break;
5350
5351 default:
5352 return NVME_INVALID_FIELD | NVME_DNR;
5353 }
5354
5355 return nvme_c2h(n, id, sizeof(id), req);
5356 }
5357
5358 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
5359 {
5360 NvmeNamespace *ns;
5361 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5362 uint32_t nsid = le32_to_cpu(c->nsid);
5363
5364 trace_pci_nvme_identify_ns(nsid);
5365
5366 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5367 return NVME_INVALID_NSID | NVME_DNR;
5368 }
5369
5370 ns = nvme_ns(n, nsid);
5371 if (unlikely(!ns)) {
5372 if (!active) {
5373 ns = nvme_subsys_ns(n->subsys, nsid);
5374 if (!ns) {
5375 return nvme_rpt_empty_id_struct(n, req);
5376 }
5377 } else {
5378 return nvme_rpt_empty_id_struct(n, req);
5379 }
5380 }
5381
5382 if (active || ns->csi == NVME_CSI_NVM) {
5383 return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
5384 }
5385
5386 return NVME_INVALID_CMD_SET | NVME_DNR;
5387 }
5388
5389 static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
5390 bool attached)
5391 {
5392 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5393 uint32_t nsid = le32_to_cpu(c->nsid);
5394 uint16_t min_id = le16_to_cpu(c->ctrlid);
5395 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5396 uint16_t *ids = &list[1];
5397 NvmeNamespace *ns;
5398 NvmeCtrl *ctrl;
5399 int cntlid, nr_ids = 0;
5400
5401 trace_pci_nvme_identify_ctrl_list(c->cns, min_id);
5402
5403 if (!n->subsys) {
5404 return NVME_INVALID_FIELD | NVME_DNR;
5405 }
5406
5407 if (attached) {
5408 if (nsid == NVME_NSID_BROADCAST) {
5409 return NVME_INVALID_FIELD | NVME_DNR;
5410 }
5411
5412 ns = nvme_subsys_ns(n->subsys, nsid);
5413 if (!ns) {
5414 return NVME_INVALID_FIELD | NVME_DNR;
5415 }
5416 }
5417
5418 for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
5419 ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
5420 if (!ctrl) {
5421 continue;
5422 }
5423
5424 if (attached && !nvme_ns(ctrl, nsid)) {
5425 continue;
5426 }
5427
5428 ids[nr_ids++] = cntlid;
5429 }
5430
5431 list[0] = nr_ids;
5432
5433 return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
5434 }
5435
5436 static uint16_t nvme_identify_pri_ctrl_cap(NvmeCtrl *n, NvmeRequest *req)
5437 {
5438 trace_pci_nvme_identify_pri_ctrl_cap(le16_to_cpu(n->pri_ctrl_cap.cntlid));
5439
5440 return nvme_c2h(n, (uint8_t *)&n->pri_ctrl_cap,
5441 sizeof(NvmePriCtrlCap), req);
5442 }
5443
5444 static uint16_t nvme_identify_sec_ctrl_list(NvmeCtrl *n, NvmeRequest *req)
5445 {
5446 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5447 uint16_t pri_ctrl_id = le16_to_cpu(n->pri_ctrl_cap.cntlid);
5448 uint16_t min_id = le16_to_cpu(c->ctrlid);
5449 uint8_t num_sec_ctrl = n->sec_ctrl_list.numcntl;
5450 NvmeSecCtrlList list = {0};
5451 uint8_t i;
5452
5453 for (i = 0; i < num_sec_ctrl; i++) {
5454 if (n->sec_ctrl_list.sec[i].scid >= min_id) {
5455 list.numcntl = num_sec_ctrl - i;
5456 memcpy(&list.sec, n->sec_ctrl_list.sec + i,
5457 list.numcntl * sizeof(NvmeSecCtrlEntry));
5458 break;
5459 }
5460 }
5461
5462 trace_pci_nvme_identify_sec_ctrl_list(pri_ctrl_id, list.numcntl);
5463
5464 return nvme_c2h(n, (uint8_t *)&list, sizeof(list), req);
5465 }
5466
5467 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
5468 bool active)
5469 {
5470 NvmeNamespace *ns;
5471 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5472 uint32_t nsid = le32_to_cpu(c->nsid);
5473
5474 trace_pci_nvme_identify_ns_csi(nsid, c->csi);
5475
5476 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5477 return NVME_INVALID_NSID | NVME_DNR;
5478 }
5479
5480 ns = nvme_ns(n, nsid);
5481 if (unlikely(!ns)) {
5482 if (!active) {
5483 ns = nvme_subsys_ns(n->subsys, nsid);
5484 if (!ns) {
5485 return nvme_rpt_empty_id_struct(n, req);
5486 }
5487 } else {
5488 return nvme_rpt_empty_id_struct(n, req);
5489 }
5490 }
5491
5492 if (c->csi == NVME_CSI_NVM) {
5493 return nvme_c2h(n, (uint8_t *)&ns->id_ns_nvm, sizeof(NvmeIdNsNvm),
5494 req);
5495 } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
5496 return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
5497 req);
5498 }
5499
5500 return NVME_INVALID_FIELD | NVME_DNR;
5501 }
5502
5503 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
5504 bool active)
5505 {
5506 NvmeNamespace *ns;
5507 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5508 uint32_t min_nsid = le32_to_cpu(c->nsid);
5509 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5510 static const int data_len = sizeof(list);
5511 uint32_t *list_ptr = (uint32_t *)list;
5512 int i, j = 0;
5513
5514 trace_pci_nvme_identify_nslist(min_nsid);
5515
5516 /*
5517 * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFEh are invalid values
5518 * since the Active Namespace ID List should return namespaces with ids
5519 * *higher* than the NSID specified in the command. This is also specified
5520 * in the spec (NVM Express v1.3d, Section 5.15.4).
5521 */
5522 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
5523 return NVME_INVALID_NSID | NVME_DNR;
5524 }
5525
5526 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5527 ns = nvme_ns(n, i);
5528 if (!ns) {
5529 if (!active) {
5530 ns = nvme_subsys_ns(n->subsys, i);
5531 if (!ns) {
5532 continue;
5533 }
5534 } else {
5535 continue;
5536 }
5537 }
5538 if (ns->params.nsid <= min_nsid) {
5539 continue;
5540 }
5541 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
5542 if (j == data_len / sizeof(uint32_t)) {
5543 break;
5544 }
5545 }
5546
5547 return nvme_c2h(n, list, data_len, req);
5548 }
5549
5550 static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
5551 bool active)
5552 {
5553 NvmeNamespace *ns;
5554 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5555 uint32_t min_nsid = le32_to_cpu(c->nsid);
5556 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5557 static const int data_len = sizeof(list);
5558 uint32_t *list_ptr = (uint32_t *)list;
5559 int i, j = 0;
5560
5561 trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
5562
5563 /*
5564 * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFEh are invalid.
5565 */
5566 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
5567 return NVME_INVALID_NSID | NVME_DNR;
5568 }
5569
5570 if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
5571 return NVME_INVALID_FIELD | NVME_DNR;
5572 }
5573
5574 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5575 ns = nvme_ns(n, i);
5576 if (!ns) {
5577 if (!active) {
5578 ns = nvme_subsys_ns(n->subsys, i);
5579 if (!ns) {
5580 continue;
5581 }
5582 } else {
5583 continue;
5584 }
5585 }
5586 if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
5587 continue;
5588 }
5589 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
5590 if (j == data_len / sizeof(uint32_t)) {
5591 break;
5592 }
5593 }
5594
5595 return nvme_c2h(n, list, data_len, req);
5596 }
5597
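/*
 * Namespace Identification Descriptor list: a packed sequence of
 * (NIDT, NIDL, value) entries. UUID and EUI-64 descriptors are emitted only
 * if the namespace has them configured; the CSI descriptor is always
 * present.
 */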
5598 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
5599 {
5600 NvmeNamespace *ns;
5601 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5602 uint32_t nsid = le32_to_cpu(c->nsid);
5603 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5604 uint8_t *pos = list;
5605 struct {
5606 NvmeIdNsDescr hdr;
5607 uint8_t v[NVME_NIDL_UUID];
5608 } QEMU_PACKED uuid = {};
5609 struct {
5610 NvmeIdNsDescr hdr;
5611 uint64_t v;
5612 } QEMU_PACKED eui64 = {};
5613 struct {
5614 NvmeIdNsDescr hdr;
5615 uint8_t v;
5616 } QEMU_PACKED csi = {};
5617
5618 trace_pci_nvme_identify_ns_descr_list(nsid);
5619
5620 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5621 return NVME_INVALID_NSID | NVME_DNR;
5622 }
5623
5624 ns = nvme_ns(n, nsid);
5625 if (unlikely(!ns)) {
5626 return NVME_INVALID_FIELD | NVME_DNR;
5627 }
5628
5629 if (!qemu_uuid_is_null(&ns->params.uuid)) {
5630 uuid.hdr.nidt = NVME_NIDT_UUID;
5631 uuid.hdr.nidl = NVME_NIDL_UUID;
5632 memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
5633 memcpy(pos, &uuid, sizeof(uuid));
5634 pos += sizeof(uuid);
5635 }
5636
5637 if (ns->params.eui64) {
5638 eui64.hdr.nidt = NVME_NIDT_EUI64;
5639 eui64.hdr.nidl = NVME_NIDL_EUI64;
5640 eui64.v = cpu_to_be64(ns->params.eui64);
5641 memcpy(pos, &eui64, sizeof(eui64));
5642 pos += sizeof(eui64);
5643 }
5644
5645 csi.hdr.nidt = NVME_NIDT_CSI;
5646 csi.hdr.nidl = NVME_NIDL_CSI;
5647 csi.v = ns->csi;
5648 memcpy(pos, &csi, sizeof(csi));
5649 pos += sizeof(csi);
5650
5651 return nvme_c2h(n, list, sizeof(list), req);
5652 }
5653
5654 static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
5655 {
5656 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5657 static const int data_len = sizeof(list);
5658
5659 trace_pci_nvme_identify_cmd_set();
5660
5661 NVME_SET_CSI(*list, NVME_CSI_NVM);
5662 NVME_SET_CSI(*list, NVME_CSI_ZONED);
5663
5664 return nvme_c2h(n, list, data_len, req);
5665 }
5666
5667 static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
5668 {
5669 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5670
5671 trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
5672 c->csi);
5673
5674 switch (c->cns) {
5675 case NVME_ID_CNS_NS:
5676 return nvme_identify_ns(n, req, true);
5677 case NVME_ID_CNS_NS_PRESENT:
5678 return nvme_identify_ns(n, req, false);
5679 case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
5680 return nvme_identify_ctrl_list(n, req, true);
5681 case NVME_ID_CNS_CTRL_LIST:
5682 return nvme_identify_ctrl_list(n, req, false);
5683 case NVME_ID_CNS_PRIMARY_CTRL_CAP:
5684 return nvme_identify_pri_ctrl_cap(n, req);
5685 case NVME_ID_CNS_SECONDARY_CTRL_LIST:
5686 return nvme_identify_sec_ctrl_list(n, req);
5687 case NVME_ID_CNS_CS_NS:
5688 return nvme_identify_ns_csi(n, req, true);
5689 case NVME_ID_CNS_CS_NS_PRESENT:
5690 return nvme_identify_ns_csi(n, req, false);
5691 case NVME_ID_CNS_CTRL:
5692 return nvme_identify_ctrl(n, req);
5693 case NVME_ID_CNS_CS_CTRL:
5694 return nvme_identify_ctrl_csi(n, req);
5695 case NVME_ID_CNS_NS_ACTIVE_LIST:
5696 return nvme_identify_nslist(n, req, true);
5697 case NVME_ID_CNS_NS_PRESENT_LIST:
5698 return nvme_identify_nslist(n, req, false);
5699 case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
5700 return nvme_identify_nslist_csi(n, req, true);
5701 case NVME_ID_CNS_CS_NS_PRESENT_LIST:
5702 return nvme_identify_nslist_csi(n, req, false);
5703 case NVME_ID_CNS_NS_DESCR_LIST:
5704 return nvme_identify_ns_descr_list(n, req);
5705 case NVME_ID_CNS_IO_COMMAND_SET:
5706 return nvme_identify_cmd_set(n, req);
5707 default:
5708 trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
5709 return NVME_INVALID_FIELD | NVME_DNR;
5710 }
5711 }
5712
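/*
 * Abort is advisory and this controller never aborts anything: bit 0 of
 * completion dword 0 is set to indicate that the specified command was not
 * aborted. Only the submission queue identifier is validated.
 */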
5713 static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
5714 {
5715 uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
5716
5717 req->cqe.result = 1;
5718 if (nvme_check_sqid(n, sqid)) {
5719 return NVME_INVALID_FIELD | NVME_DNR;
5720 }
5721
5722 return NVME_SUCCESS;
5723 }
5724
5725 static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
5726 {
5727 trace_pci_nvme_setfeat_timestamp(ts);
5728
5729 n->host_timestamp = le64_to_cpu(ts);
5730 n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
5731 }
5732
5733 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
5734 {
5735 uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
5736 uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
5737
5738 union nvme_timestamp {
5739 struct {
5740 uint64_t timestamp:48;
5741 uint64_t sync:1;
5742 uint64_t origin:3;
5743 uint64_t rsvd1:12;
5744 };
5745 uint64_t all;
5746 };
5747
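    /*
     * Report the timestamp last set by the host, advanced by the
     * milliseconds elapsed since it was set; the origin field records
     * whether it was host-set (01b) or started from zero (00b).
     */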
5748 union nvme_timestamp ts;
5749 ts.all = 0;
5750 ts.timestamp = n->host_timestamp + elapsed_time;
5751
5752 /* If the host timestamp is non-zero, set the timestamp origin */
5753 ts.origin = n->host_timestamp ? 0x01 : 0x00;
5754
5755 trace_pci_nvme_getfeat_timestamp(ts.all);
5756
5757 return cpu_to_le64(ts.all);
5758 }
5759
5760 static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
5761 {
5762 uint64_t timestamp = nvme_get_timestamp(n);
5763
5764 return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
5765 }
5766
5767 static int nvme_get_feature_fdp(NvmeCtrl *n, uint32_t endgrpid,
5768 uint32_t *result)
5769 {
5770 *result = 0;
5771
5772 if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
5773 return NVME_INVALID_FIELD | NVME_DNR;
5774 }
5775
5776 *result = FIELD_DP16(0, FEAT_FDP, FDPE, 1);
5777 *result = FIELD_DP16(*result, FEAT_FDP, CONF_NDX, 0);
5778
5779 return NVME_SUCCESS;
5780 }
5781
5782 static uint16_t nvme_get_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
5783 NvmeRequest *req, uint32_t *result)
5784 {
5785 NvmeCmd *cmd = &req->cmd;
5786 uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
5787 uint16_t ph = cdw11 & 0xffff;
5788 uint8_t noet = (cdw11 >> 16) & 0xff;
5789 uint16_t ruhid, ret;
5790 uint32_t nentries = 0;
5791 uint8_t s_events_ndx = 0;
5792 size_t s_events_siz = sizeof(NvmeFdpEventDescr) * noet;
5793 g_autofree NvmeFdpEventDescr *s_events = g_malloc0(s_events_siz);
5794 NvmeRuHandle *ruh;
5795 NvmeFdpEventDescr *s_event;
5796
5797 if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
5798 return NVME_FDP_DISABLED | NVME_DNR;
5799 }
5800
5801 if (!nvme_ph_valid(ns, ph)) {
5802 return NVME_INVALID_FIELD | NVME_DNR;
5803 }
5804
5805 ruhid = ns->fdp.phs[ph];
5806 ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
5807
5808 assert(ruh);
5809
5810 if (unlikely(noet == 0)) {
5811 return NVME_INVALID_FIELD | NVME_DNR;
5812 }
5813
5814 for (uint8_t event_type = 0; event_type < FDP_EVT_MAX; event_type++) {
5815 uint8_t shift = nvme_fdp_evf_shifts[event_type];
5816 if (!shift && event_type) {
5817 /*
5818 * Only the first entry (event_type == 0) legitimately has a shift value
5819 * of 0; other entries with a zero shift are simply unpopulated and skipped.
5820 */
5821 continue;
5822 }
5823
5824 nentries++;
5825
5826 s_event = &s_events[s_events_ndx];
5827 s_event->evt = event_type;
5828 s_event->evta = (ruh->event_filter >> shift) & 0x1;
5829
5830 /* break if all `noet` entries are filled */
5831 if ((++s_events_ndx) == noet) {
5832 break;
5833 }
5834 }
5835
5836 ret = nvme_c2h(n, s_events, s_events_siz, req);
5837 if (ret) {
5838 return ret;
5839 }
5840
5841 *result = nentries;
5842 return NVME_SUCCESS;
5843 }
5844
5845 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
5846 {
5847 NvmeCmd *cmd = &req->cmd;
5848 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5849 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
5850 uint32_t nsid = le32_to_cpu(cmd->nsid);
5851 uint32_t result;
5852 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
5853 NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
5854 uint16_t iv;
5855 NvmeNamespace *ns;
5856 int i;
5857 uint16_t endgrpid = 0, ret = NVME_SUCCESS;
5858
5859 static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
5860 [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
5861 };
5862
5863 trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
5864
5865 if (!nvme_feature_support[fid]) {
5866 return NVME_INVALID_FIELD | NVME_DNR;
5867 }
5868
5869 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
5870 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5871 /*
5872 * The Reservation Notification Mask and Reservation Persistence
5873 * features require a status code of Invalid Field in Command when
5874 * NSID is FFFFFFFFh. Since the device does not support those
5875 * features we can always return Invalid Namespace or Format as we
5876 * should do for all other features.
5877 */
5878 return NVME_INVALID_NSID | NVME_DNR;
5879 }
5880
5881 if (!nvme_ns(n, nsid)) {
5882 return NVME_INVALID_FIELD | NVME_DNR;
5883 }
5884 }
5885
5886 switch (sel) {
5887 case NVME_GETFEAT_SELECT_CURRENT:
5888 break;
5889 case NVME_GETFEAT_SELECT_SAVED:
5890 /* no features are saveable by the controller; fallthrough */
5891 case NVME_GETFEAT_SELECT_DEFAULT:
5892 goto defaults;
5893 case NVME_GETFEAT_SELECT_CAP:
5894 result = nvme_feature_cap[fid];
5895 goto out;
5896 }
5897
5898 switch (fid) {
5899 case NVME_TEMPERATURE_THRESHOLD:
5900 result = 0;
5901
5902 /*
5903 * The controller only implements the Composite Temperature sensor, so
5904 * return 0 for all other sensors.
5905 */
5906 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
5907 goto out;
5908 }
5909
5910 switch (NVME_TEMP_THSEL(dw11)) {
5911 case NVME_TEMP_THSEL_OVER:
5912 result = n->features.temp_thresh_hi;
5913 goto out;
5914 case NVME_TEMP_THSEL_UNDER:
5915 result = n->features.temp_thresh_low;
5916 goto out;
5917 }
5918
5919 return NVME_INVALID_FIELD | NVME_DNR;
5920 case NVME_ERROR_RECOVERY:
5921 if (!nvme_nsid_valid(n, nsid)) {
5922 return NVME_INVALID_NSID | NVME_DNR;
5923 }
5924
5925 ns = nvme_ns(n, nsid);
5926 if (unlikely(!ns)) {
5927 return NVME_INVALID_FIELD | NVME_DNR;
5928 }
5929
5930 result = ns->features.err_rec;
5931 goto out;
5932 case NVME_VOLATILE_WRITE_CACHE:
5933 result = 0;
5934 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5935 ns = nvme_ns(n, i);
5936 if (!ns) {
5937 continue;
5938 }
5939
5940 result = blk_enable_write_cache(ns->blkconf.blk);
5941 if (result) {
5942 break;
5943 }
5944 }
5945 trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
5946 goto out;
5947 case NVME_ASYNCHRONOUS_EVENT_CONF:
5948 result = n->features.async_config;
5949 goto out;
5950 case NVME_TIMESTAMP:
5951 return nvme_get_feature_timestamp(n, req);
5952 case NVME_HOST_BEHAVIOR_SUPPORT:
5953 return nvme_c2h(n, (uint8_t *)&n->features.hbs,
5954 sizeof(n->features.hbs), req);
5955 case NVME_FDP_MODE:
5956 endgrpid = dw11 & 0xff;
5957
5958 if (endgrpid != 0x1) {
5959 return NVME_INVALID_FIELD | NVME_DNR;
5960 }
5961
5962 ret = nvme_get_feature_fdp(n, endgrpid, &result);
5963 if (ret) {
5964 return ret;
5965 }
5966 goto out;
5967 case NVME_FDP_EVENTS:
5968 if (!nvme_nsid_valid(n, nsid)) {
5969 return NVME_INVALID_NSID | NVME_DNR;
5970 }
5971
5972 ns = nvme_ns(n, nsid);
5973 if (unlikely(!ns)) {
5974 return NVME_INVALID_FIELD | NVME_DNR;
5975 }
5976
5977 ret = nvme_get_feature_fdp_events(n, ns, req, &result);
5978 if (ret) {
5979 return ret;
5980 }
5981 goto out;
5982 default:
5983 break;
5984 }
5985
5986 defaults:
5987 switch (fid) {
5988 case NVME_TEMPERATURE_THRESHOLD:
5989 result = 0;
5990
5991 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
5992 break;
5993 }
5994
5995 if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
5996 result = NVME_TEMPERATURE_WARNING;
5997 }
5998
5999 break;
6000 case NVME_NUMBER_OF_QUEUES:
6001 result = (n->conf_ioqpairs - 1) | ((n->conf_ioqpairs - 1) << 16);
6002 trace_pci_nvme_getfeat_numq(result);
6003 break;
6004 case NVME_INTERRUPT_VECTOR_CONF:
6005 iv = dw11 & 0xffff;
6006 if (iv >= n->conf_ioqpairs + 1) {
6007 return NVME_INVALID_FIELD | NVME_DNR;
6008 }
6009
6010 result = iv;
6011 if (iv == n->admin_cq.vector) {
6012 result |= NVME_INTVC_NOCOALESCING;
6013 }
6014 break;
6015 case NVME_FDP_MODE:
6016 endgrpid = dw11 & 0xff;
6017
6018 if (endgrpid != 0x1) {
6019 return NVME_INVALID_FIELD | NVME_DNR;
6020 }
6021
6022 ret = nvme_get_feature_fdp(n, endgrpid, &result);
6023 if (ret) {
6024 return ret;
6025 }
6026 goto out;
6027
6029 default:
6030 result = nvme_feature_default[fid];
6031 break;
6032 }
6033
6034 out:
6035 req->cqe.result = cpu_to_le32(result);
6036 return ret;
6037 }
6038
6039 static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
6040 {
6041 uint16_t ret;
6042 uint64_t timestamp;
6043
6044 ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
6045 if (ret) {
6046 return ret;
6047 }
6048
6049 nvme_set_timestamp(n, timestamp);
6050
6051 return NVME_SUCCESS;
6052 }
6053
6054 static uint16_t nvme_set_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
6055 NvmeRequest *req)
6056 {
6057 NvmeCmd *cmd = &req->cmd;
6058 uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
6059 uint16_t ph = cdw11 & 0xffff;
6060 uint8_t noet = (cdw11 >> 16) & 0xff;
6061 uint16_t ret, ruhid;
6062 uint8_t enable = le32_to_cpu(cmd->cdw12) & 0x1;
6063 uint8_t event_mask = 0;
6064 unsigned int i;
6065 g_autofree uint8_t *events = g_malloc0(noet);
6066 NvmeRuHandle *ruh = NULL;
6067
6068 assert(ns);
6069
6070 if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6071 return NVME_FDP_DISABLED | NVME_DNR;
6072 }
6073
6074 if (!nvme_ph_valid(ns, ph)) {
6075 return NVME_INVALID_FIELD | NVME_DNR;
6076 }
6077
6078 ruhid = ns->fdp.phs[ph];
6079 ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
6080
6081 ret = nvme_h2c(n, events, noet, req);
6082 if (ret) {
6083 return ret;
6084 }
6085
6086 for (i = 0; i < noet; i++) {
6087 event_mask |= (1 << nvme_fdp_evf_shifts[events[i]]);
6088 }
6089
6090 if (enable) {
6091 ruh->event_filter |= event_mask;
6092 } else {
6093 ruh->event_filter = ruh->event_filter & ~event_mask;
6094 }
6095
6096 return NVME_SUCCESS;
6097 }
6098
6099 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
6100 {
6101 NvmeNamespace *ns = NULL;
6102
6103 NvmeCmd *cmd = &req->cmd;
6104 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
6105 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
6106 uint32_t nsid = le32_to_cpu(cmd->nsid);
6107 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
6108 uint8_t save = NVME_SETFEAT_SAVE(dw10);
6109 uint16_t status;
6110 int i;
6111
6112 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
6113
6114 if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
6115 return NVME_FID_NOT_SAVEABLE | NVME_DNR;
6116 }
6117
6118 if (!nvme_feature_support[fid]) {
6119 return NVME_INVALID_FIELD | NVME_DNR;
6120 }
6121
6122 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
6123 if (nsid != NVME_NSID_BROADCAST) {
6124 if (!nvme_nsid_valid(n, nsid)) {
6125 return NVME_INVALID_NSID | NVME_DNR;
6126 }
6127
6128 ns = nvme_ns(n, nsid);
6129 if (unlikely(!ns)) {
6130 return NVME_INVALID_FIELD | NVME_DNR;
6131 }
6132 }
6133 } else if (nsid && nsid != NVME_NSID_BROADCAST) {
6134 if (!nvme_nsid_valid(n, nsid)) {
6135 return NVME_INVALID_NSID | NVME_DNR;
6136 }
6137
6138 return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
6139 }
6140
6141 if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
6142 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
6143 }
6144
6145 switch (fid) {
6146 case NVME_TEMPERATURE_THRESHOLD:
6147 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6148 break;
6149 }
6150
6151 switch (NVME_TEMP_THSEL(dw11)) {
6152 case NVME_TEMP_THSEL_OVER:
6153 n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
6154 break;
6155 case NVME_TEMP_THSEL_UNDER:
6156 n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
6157 break;
6158 default:
6159 return NVME_INVALID_FIELD | NVME_DNR;
6160 }
6161
6162 if ((n->temperature >= n->features.temp_thresh_hi) ||
6163 (n->temperature <= n->features.temp_thresh_low)) {
6164 nvme_smart_event(n, NVME_SMART_TEMPERATURE);
6165 }
6166
6167 break;
6168 case NVME_ERROR_RECOVERY:
6169 if (nsid == NVME_NSID_BROADCAST) {
6170 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6171 ns = nvme_ns(n, i);
6172
6173 if (!ns) {
6174 continue;
6175 }
6176
6177 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
6178 ns->features.err_rec = dw11;
6179 }
6180 }
6181
6182 break;
6183 }
6184
6185 assert(ns);
6186 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
6187 ns->features.err_rec = dw11;
6188 }
6189 break;
6190 case NVME_VOLATILE_WRITE_CACHE:
6191 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6192 ns = nvme_ns(n, i);
6193 if (!ns) {
6194 continue;
6195 }
6196
6197 if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
6198 blk_flush(ns->blkconf.blk);
6199 }
6200
6201 blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
6202 }
6203
6204 break;
6205
6206 case NVME_NUMBER_OF_QUEUES:
6207 if (n->qs_created) {
6208 return NVME_CMD_SEQ_ERROR | NVME_DNR;
6209 }
6210
6211 /*
6212 * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR
6213 * and NSQR.
6214 */
6215 if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
6216 return NVME_INVALID_FIELD | NVME_DNR;
6217 }
6218
6219 trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
6220 ((dw11 >> 16) & 0xffff) + 1,
6221 n->conf_ioqpairs,
6222 n->conf_ioqpairs);
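/*
 * NCQR/NSQR in CDW11 and the NCQA/NSQA values reported in completion
 * dword 0 are zero-based, which is why the trace above adds 1 to the
 * requested values and the result below subtracts 1 from conf_ioqpairs.
 */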
6223 req->cqe.result = cpu_to_le32((n->conf_ioqpairs - 1) |
6224 ((n->conf_ioqpairs - 1) << 16));
6225 break;
6226 case NVME_ASYNCHRONOUS_EVENT_CONF:
6227 n->features.async_config = dw11;
6228 break;
6229 case NVME_TIMESTAMP:
6230 return nvme_set_feature_timestamp(n, req);
6231 case NVME_HOST_BEHAVIOR_SUPPORT:
6232 status = nvme_h2c(n, (uint8_t *)&n->features.hbs,
6233 sizeof(n->features.hbs), req);
6234 if (status) {
6235 return status;
6236 }
6237
6238 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6239 ns = nvme_ns(n, i);
6240
6241 if (!ns) {
6242 continue;
6243 }
6244
6245 ns->id_ns.nlbaf = ns->nlbaf - 1;
6246 if (!n->features.hbs.lbafee) {
6247 ns->id_ns.nlbaf = MIN(ns->id_ns.nlbaf, 15);
6248 }
6249 }
6250
6251 return status;
6252 case NVME_COMMAND_SET_PROFILE:
6253 if (dw11 & 0x1ff) {
6254 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
6255 return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
6256 }
6257 break;
6258 case NVME_FDP_MODE:
6259 /* spec: abort with cmd seq err if there are one or more namespaces in the endgrp */
6260 return NVME_CMD_SEQ_ERROR | NVME_DNR;
6261 case NVME_FDP_EVENTS:
6262 return nvme_set_feature_fdp_events(n, ns, req);
6263 default:
6264 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
6265 }
6266 return NVME_SUCCESS;
6267 }
6268
6269 static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
6270 {
6271 trace_pci_nvme_aer(nvme_cid(req));
6272
6273 if (n->outstanding_aers > n->params.aerl) {
6274 trace_pci_nvme_aer_aerl_exceeded();
6275 return NVME_AER_LIMIT_EXCEEDED;
6276 }
6277
6278 n->aer_reqs[n->outstanding_aers] = req;
6279 n->outstanding_aers++;
6280
6281 if (!QTAILQ_EMPTY(&n->aer_queue)) {
6282 nvme_process_aers(n);
6283 }
6284
6285 return NVME_NO_COMPLETE;
6286 }
6287
6288 static void nvme_update_dmrsl(NvmeCtrl *n)
6289 {
6290 int nsid;
6291
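/*
 * Recompute DMRSL (Dataset Management Range Size Limit) as the smallest
 * per-namespace limit, expressed in logical blocks of that namespace
 * (BDRV_REQUEST_MAX_BYTES divided by the block size).
 */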
6292 for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
6293 NvmeNamespace *ns = nvme_ns(n, nsid);
6294 if (!ns) {
6295 continue;
6296 }
6297
6298 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
6299 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6300 }
6301 }
6302
6303 static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns)
6304 {
6305 uint32_t cc = ldl_le_p(&n->bar.cc);
6306
6307 ns->iocs = nvme_cse_iocs_none;
6308 switch (ns->csi) {
6309 case NVME_CSI_NVM:
6310 if (NVME_CC_CSS(cc) != NVME_CC_CSS_ADMIN_ONLY) {
6311 ns->iocs = nvme_cse_iocs_nvm;
6312 }
6313 break;
6314 case NVME_CSI_ZONED:
6315 if (NVME_CC_CSS(cc) == NVME_CC_CSS_CSI) {
6316 ns->iocs = nvme_cse_iocs_zoned;
6317 } else if (NVME_CC_CSS(cc) == NVME_CC_CSS_NVM) {
6318 ns->iocs = nvme_cse_iocs_nvm;
6319 }
6320 break;
6321 }
6322 }
6323
6324 static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
6325 {
6326 NvmeNamespace *ns;
6327 NvmeCtrl *ctrl;
6328 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
6329 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
6330 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6331 uint8_t sel = dw10 & 0xf;
6332 uint16_t *nr_ids = &list[0];
6333 uint16_t *ids = &list[1];
6334 uint16_t ret;
6335 int i;
6336
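/*
 * The host-supplied buffer is a Controller List: entry 0 holds the number
 * of controller identifiers that follow, entries 1..n the CNTLIDs
 * themselves (hence the nr_ids/ids pointers above).
 */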
6337 trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
6338
6339 if (!nvme_nsid_valid(n, nsid)) {
6340 return NVME_INVALID_NSID | NVME_DNR;
6341 }
6342
6343 ns = nvme_subsys_ns(n->subsys, nsid);
6344 if (!ns) {
6345 return NVME_INVALID_FIELD | NVME_DNR;
6346 }
6347
6348 ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
6349 if (ret) {
6350 return ret;
6351 }
6352
6353 if (!*nr_ids) {
6354 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
6355 }
6356
6357 *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
6358 for (i = 0; i < *nr_ids; i++) {
6359 ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
6360 if (!ctrl) {
6361 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
6362 }
6363
6364 switch (sel) {
6365 case NVME_NS_ATTACHMENT_ATTACH:
6366 if (nvme_ns(ctrl, nsid)) {
6367 return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
6368 }
6369
6370 if (ns->attached && !ns->params.shared) {
6371 return NVME_NS_PRIVATE | NVME_DNR;
6372 }
6373
6374 nvme_attach_ns(ctrl, ns);
6375 nvme_select_iocs_ns(ctrl, ns);
6376
6377 break;
6378
6379 case NVME_NS_ATTACHMENT_DETACH:
6380 if (!nvme_ns(ctrl, nsid)) {
6381 return NVME_NS_NOT_ATTACHED | NVME_DNR;
6382 }
6383
6384 ctrl->namespaces[nsid] = NULL;
6385 ns->attached--;
6386
6387 nvme_update_dmrsl(ctrl);
6388
6389 break;
6390
6391 default:
6392 return NVME_INVALID_FIELD | NVME_DNR;
6393 }
6394
6395 /*
6396 * Add namespace id to the changed namespace id list for event clearing
6397 * via Get Log Page command.
6398 */
6399 if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
6400 nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
6401 NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
6402 NVME_LOG_CHANGED_NSLIST);
6403 }
6404 }
6405
6406 return NVME_SUCCESS;
6407 }
6408
6409 typedef struct NvmeFormatAIOCB {
6410 BlockAIOCB common;
6411 BlockAIOCB *aiocb;
6412 NvmeRequest *req;
6413 int ret;
6414
6415 NvmeNamespace *ns;
6416 uint32_t nsid;
6417 bool broadcast;
6418 int64_t offset;
6419
6420 uint8_t lbaf;
6421 uint8_t mset;
6422 uint8_t pi;
6423 uint8_t pil;
6424 } NvmeFormatAIOCB;
6425
6426 static void nvme_format_cancel(BlockAIOCB *aiocb)
6427 {
6428 NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);
6429
6430 iocb->ret = -ECANCELED;
6431
6432 if (iocb->aiocb) {
6433 blk_aio_cancel_async(iocb->aiocb);
6434 iocb->aiocb = NULL;
6435 }
6436 }
6437
6438 static const AIOCBInfo nvme_format_aiocb_info = {
6439 .aiocb_size = sizeof(NvmeFormatAIOCB),
6440 .cancel_async = nvme_format_cancel,
6441 .get_aio_context = nvme_get_aio_context,
6442 };
6443
6444 static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset,
6445 uint8_t pi, uint8_t pil)
6446 {
6447 uint8_t lbafl = lbaf & 0xf;
6448 uint8_t lbafu = lbaf >> 4;
6449
6450 trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
6451
6452 ns->id_ns.dps = (pil << 3) | pi;
6453 ns->id_ns.flbas = (lbafu << 5) | (mset << 4) | lbafl;
6454
6455 nvme_ns_init_format(ns);
6456 }
6457
6458 static void nvme_do_format(NvmeFormatAIOCB *iocb);
6459
6460 static void nvme_format_ns_cb(void *opaque, int ret)
6461 {
6462 NvmeFormatAIOCB *iocb = opaque;
6463 NvmeNamespace *ns = iocb->ns;
6464 int bytes;
6465
6466 if (iocb->ret < 0) {
6467 goto done;
6468 } else if (ret < 0) {
6469 iocb->ret = ret;
6470 goto done;
6471 }
6472
6473 assert(ns);
6474
6475 if (iocb->offset < ns->size) {
6476 bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);
6477
6478 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
6479 bytes, BDRV_REQ_MAY_UNMAP,
6480 nvme_format_ns_cb, iocb);
6481
6482 iocb->offset += bytes;
6483 return;
6484 }
6485
6486 nvme_format_set(ns, iocb->lbaf, iocb->mset, iocb->pi, iocb->pil);
6487 ns->status = 0x0;
6488 iocb->ns = NULL;
6489 iocb->offset = 0;
6490
6491 done:
6492 nvme_do_format(iocb);
6493 }
6494
6495 static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
6496 {
6497 if (ns->params.zoned) {
6498 return NVME_INVALID_FORMAT | NVME_DNR;
6499 }
6500
6501 if (lbaf > ns->id_ns.nlbaf) {
6502 return NVME_INVALID_FORMAT | NVME_DNR;
6503 }
6504
6505 if (pi && (ns->id_ns.lbaf[lbaf].ms < nvme_pi_tuple_size(ns))) {
6506 return NVME_INVALID_FORMAT | NVME_DNR;
6507 }
6508
6509 if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
6510 return NVME_INVALID_FIELD | NVME_DNR;
6511 }
6512
6513 return NVME_SUCCESS;
6514 }
6515
6516 static void nvme_do_format(NvmeFormatAIOCB *iocb)
6517 {
6518 NvmeRequest *req = iocb->req;
6519 NvmeCtrl *n = nvme_ctrl(req);
6520 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6521 uint8_t lbaf = dw10 & 0xf;
6522 uint8_t pi = (dw10 >> 5) & 0x7;
6523 uint16_t status;
6524 int i;
6525
6526 if (iocb->ret < 0) {
6527 goto done;
6528 }
6529
6530 if (iocb->broadcast) {
6531 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
6532 iocb->ns = nvme_ns(n, i);
6533 if (iocb->ns) {
6534 iocb->nsid = i;
6535 break;
6536 }
6537 }
6538 }
6539
6540 if (!iocb->ns) {
6541 goto done;
6542 }
6543
6544 status = nvme_format_check(iocb->ns, lbaf, pi);
6545 if (status) {
6546 req->status = status;
6547 goto done;
6548 }
6549
6550 iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
6551 nvme_format_ns_cb(iocb, 0);
6552 return;
6553
6554 done:
6555 iocb->common.cb(iocb->common.opaque, iocb->ret);
6556 qemu_aio_unref(iocb);
6557 }
6558
6559 static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
6560 {
6561 NvmeFormatAIOCB *iocb;
6562 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
6563 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6564 uint8_t lbaf = dw10 & 0xf;
6565 uint8_t mset = (dw10 >> 4) & 0x1;
6566 uint8_t pi = (dw10 >> 5) & 0x7;
6567 uint8_t pil = (dw10 >> 8) & 0x1;
6568 uint8_t lbafu = (dw10 >> 12) & 0x3;
6569 uint16_t status;
6570
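/*
 * CDW10 layout as decoded above: LBAF[3:0], MSET[4], PI[7:5], PIL[8] and,
 * when extended LBA formats are enabled (hbs.lbafee), LBAFU[13:12] as the
 * upper bits of the LBA format index.
 */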
6571 iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
6572
6573 iocb->req = req;
6574 iocb->ret = 0;
6575 iocb->ns = NULL;
6576 iocb->nsid = 0;
6577 iocb->lbaf = lbaf;
6578 iocb->mset = mset;
6579 iocb->pi = pi;
6580 iocb->pil = pil;
6581 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
6582 iocb->offset = 0;
6583
6584 if (n->features.hbs.lbafee) {
6585 iocb->lbaf |= lbafu << 4;
6586 }
6587
6588 if (!iocb->broadcast) {
6589 if (!nvme_nsid_valid(n, nsid)) {
6590 status = NVME_INVALID_NSID | NVME_DNR;
6591 goto out;
6592 }
6593
6594 iocb->ns = nvme_ns(n, nsid);
6595 if (!iocb->ns) {
6596 status = NVME_INVALID_FIELD | NVME_DNR;
6597 goto out;
6598 }
6599 }
6600
6601 req->aiocb = &iocb->common;
6602 nvme_do_format(iocb);
6603
6604 return NVME_NO_COMPLETE;
6605
6606 out:
6607 qemu_aio_unref(iocb);
6608
6609 return status;
6610 }
6611
6612 static void nvme_get_virt_res_num(NvmeCtrl *n, uint8_t rt, int *num_total,
6613 int *num_prim, int *num_sec)
6614 {
6615 *num_total = le32_to_cpu(rt ?
6616 n->pri_ctrl_cap.vifrt : n->pri_ctrl_cap.vqfrt);
6617 *num_prim = le16_to_cpu(rt ?
6618 n->pri_ctrl_cap.virfap : n->pri_ctrl_cap.vqrfap);
6619 *num_sec = le16_to_cpu(rt ? n->pri_ctrl_cap.virfa : n->pri_ctrl_cap.vqrfa);
6620 }
6621
6622 static uint16_t nvme_assign_virt_res_to_prim(NvmeCtrl *n, NvmeRequest *req,
6623 uint16_t cntlid, uint8_t rt,
6624 int nr)
6625 {
6626 int num_total, num_prim, num_sec;
6627
6628 if (cntlid != n->cntlid) {
6629 return NVME_INVALID_CTRL_ID | NVME_DNR;
6630 }
6631
6632 nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
6633
6634 if (nr > num_total) {
6635 return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
6636 }
6637
6638 if (nr > num_total - num_sec) {
6639 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
6640 }
6641
6642 if (rt) {
6643 n->next_pri_ctrl_cap.virfap = cpu_to_le16(nr);
6644 } else {
6645 n->next_pri_ctrl_cap.vqrfap = cpu_to_le16(nr);
6646 }
6647
6648 req->cqe.result = cpu_to_le32(nr);
6649 return req->status;
6650 }
6651
6652 static void nvme_update_virt_res(NvmeCtrl *n, NvmeSecCtrlEntry *sctrl,
6653 uint8_t rt, int nr)
6654 {
6655 int prev_nr, prev_total;
6656
6657 if (rt) {
6658 prev_nr = le16_to_cpu(sctrl->nvi);
6659 prev_total = le32_to_cpu(n->pri_ctrl_cap.virfa);
6660 sctrl->nvi = cpu_to_le16(nr);
6661 n->pri_ctrl_cap.virfa = cpu_to_le32(prev_total + nr - prev_nr);
6662 } else {
6663 prev_nr = le16_to_cpu(sctrl->nvq);
6664 prev_total = le32_to_cpu(n->pri_ctrl_cap.vqrfa);
6665 sctrl->nvq = cpu_to_le16(nr);
6666 n->pri_ctrl_cap.vqrfa = cpu_to_le32(prev_total + nr - prev_nr);
6667 }
6668 }
6669
6670 static uint16_t nvme_assign_virt_res_to_sec(NvmeCtrl *n, NvmeRequest *req,
6671 uint16_t cntlid, uint8_t rt, int nr)
6672 {
6673 int num_total, num_prim, num_sec, num_free, diff, limit;
6674 NvmeSecCtrlEntry *sctrl;
6675
6676 sctrl = nvme_sctrl_for_cntlid(n, cntlid);
6677 if (!sctrl) {
6678 return NVME_INVALID_CTRL_ID | NVME_DNR;
6679 }
6680
6681 if (sctrl->scs) {
6682 return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
6683 }
6684
6685 limit = le16_to_cpu(rt ? n->pri_ctrl_cap.vifrsm : n->pri_ctrl_cap.vqfrsm);
6686 if (nr > limit) {
6687 return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
6688 }
6689
6690 nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
6691 num_free = num_total - num_prim - num_sec;
6692 diff = nr - le16_to_cpu(rt ? sctrl->nvi : sctrl->nvq);
6693
6694 if (diff > num_free) {
6695 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
6696 }
6697
6698 nvme_update_virt_res(n, sctrl, rt, nr);
6699 req->cqe.result = cpu_to_le32(nr);
6700
6701 return req->status;
6702 }
6703
6704 static uint16_t nvme_virt_set_state(NvmeCtrl *n, uint16_t cntlid, bool online)
6705 {
6706 PCIDevice *pci = PCI_DEVICE(n);
6707 NvmeCtrl *sn = NULL;
6708 NvmeSecCtrlEntry *sctrl;
6709 int vf_index;
6710
6711 sctrl = nvme_sctrl_for_cntlid(n, cntlid);
6712 if (!sctrl) {
6713 return NVME_INVALID_CTRL_ID | NVME_DNR;
6714 }
6715
6716 if (!pci_is_vf(pci)) {
6717 vf_index = le16_to_cpu(sctrl->vfn) - 1;
6718 sn = NVME(pcie_sriov_get_vf_at_index(pci, vf_index));
6719 }
6720
6721 if (online) {
6722 if (!sctrl->nvi || (le16_to_cpu(sctrl->nvq) < 2) || !sn) {
6723 return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
6724 }
6725
6726 if (!sctrl->scs) {
6727 sctrl->scs = 0x1;
6728 nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
6729 }
6730 } else {
6731 nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_INTERRUPT, 0);
6732 nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_QUEUE, 0);
6733
6734 if (sctrl->scs) {
6735 sctrl->scs = 0x0;
6736 if (sn) {
6737 nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
6738 }
6739 }
6740 }
6741
6742 return NVME_SUCCESS;
6743 }
6744
6745 static uint16_t nvme_virt_mngmt(NvmeCtrl *n, NvmeRequest *req)
6746 {
6747 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6748 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
6749 uint8_t act = dw10 & 0xf;
6750 uint8_t rt = (dw10 >> 8) & 0x7;
6751 uint16_t cntlid = (dw10 >> 16) & 0xffff;
6752 int nr = dw11 & 0xffff;
6753
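/*
 * Fields decoded above: ACT in CDW10[3:0], RT in CDW10[10:8], CNTLID in
 * CDW10[31:16] and the number of resources (NR) in CDW11[15:0].
 */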
6754 trace_pci_nvme_virt_mngmt(nvme_cid(req), act, cntlid, rt ? "VI" : "VQ", nr);
6755
6756 if (rt != NVME_VIRT_RES_QUEUE && rt != NVME_VIRT_RES_INTERRUPT) {
6757 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
6758 }
6759
6760 switch (act) {
6761 case NVME_VIRT_MNGMT_ACTION_SEC_ASSIGN:
6762 return nvme_assign_virt_res_to_sec(n, req, cntlid, rt, nr);
6763 case NVME_VIRT_MNGMT_ACTION_PRM_ALLOC:
6764 return nvme_assign_virt_res_to_prim(n, req, cntlid, rt, nr);
6765 case NVME_VIRT_MNGMT_ACTION_SEC_ONLINE:
6766 return nvme_virt_set_state(n, cntlid, true);
6767 case NVME_VIRT_MNGMT_ACTION_SEC_OFFLINE:
6768 return nvme_virt_set_state(n, cntlid, false);
6769 default:
6770 return NVME_INVALID_FIELD | NVME_DNR;
6771 }
6772 }
6773
6774 static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
6775 {
6776 PCIDevice *pci = PCI_DEVICE(n);
6777 uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
6778 uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
6779 int i;
6780
6781 /* Address should be page aligned */
6782 if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
6783 return NVME_INVALID_FIELD | NVME_DNR;
6784 }
6785
6786 /* Save shadow buffer base addr for use during queue creation */
6787 n->dbbuf_dbs = dbs_addr;
6788 n->dbbuf_eis = eis_addr;
6789 n->dbbuf_enabled = true;
6790
6791 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
6792 NvmeSQueue *sq = n->sq[i];
6793 NvmeCQueue *cq = n->cq[i];
6794
6795 if (sq) {
6796 /*
6797 * CAP.DSTRD is 0, so the offset of the ith sq db_addr is (i << 3).
6798 * nvme_process_db() uses this hard-coded way to calculate doorbell
6799 * offsets; be consistent with that here.
6800 */
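/*
 * For example, with this stride the shadow tail for SQ i lives at
 * dbs_addr + i * 8 and its event index at eis_addr + i * 8; the
 * corresponding CQ head and event index sit 4 bytes further in (see the
 * cq block below).
 */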
6801 sq->db_addr = dbs_addr + (i << 3);
6802 sq->ei_addr = eis_addr + (i << 3);
6803 pci_dma_write(pci, sq->db_addr, &sq->tail, sizeof(sq->tail));
6804
6805 if (n->params.ioeventfd && sq->sqid != 0) {
6806 if (!nvme_init_sq_ioeventfd(sq)) {
6807 sq->ioeventfd_enabled = true;
6808 }
6809 }
6810 }
6811
6812 if (cq) {
6813 /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */
6814 cq->db_addr = dbs_addr + (i << 3) + (1 << 2);
6815 cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
6816 pci_dma_write(pci, cq->db_addr, &cq->head, sizeof(cq->head));
6817
6818 if (n->params.ioeventfd && cq->cqid != 0) {
6819 if (!nvme_init_cq_ioeventfd(cq)) {
6820 cq->ioeventfd_enabled = true;
6821 }
6822 }
6823 }
6824 }
6825
6826 trace_pci_nvme_dbbuf_config(dbs_addr, eis_addr);
6827
6828 return NVME_SUCCESS;
6829 }
6830
6831 static uint16_t nvme_directive_send(NvmeCtrl *n, NvmeRequest *req)
6832 {
6833 return NVME_INVALID_FIELD | NVME_DNR;
6834 }
6835
6836 static uint16_t nvme_directive_receive(NvmeCtrl *n, NvmeRequest *req)
6837 {
6838 NvmeNamespace *ns;
6839 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6840 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
6841 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
6842 uint8_t doper, dtype;
6843 uint32_t numd, trans_len;
6844 NvmeDirectiveIdentify id = {
6845 .supported = 1 << NVME_DIRECTIVE_IDENTIFY,
6846 .enabled = 1 << NVME_DIRECTIVE_IDENTIFY,
6847 };
6848
6849 numd = dw10 + 1;
6850 doper = dw11 & 0xff;
6851 dtype = (dw11 >> 8) & 0xff;
6852
6853 trans_len = MIN(sizeof(NvmeDirectiveIdentify), numd << 2);
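/*
 * NUMD (CDW10) is a zero-based dword count, so the transfer length is
 * (NUMD + 1) * 4 bytes, capped at the size of the Identify directive
 * parameters structure.
 */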
6854
6855 if (nsid == NVME_NSID_BROADCAST || dtype != NVME_DIRECTIVE_IDENTIFY ||
6856 doper != NVME_DIRECTIVE_RETURN_PARAMS) {
6857 return NVME_INVALID_FIELD | NVME_DNR;
6858 }
6859
6860 ns = nvme_ns(n, nsid);
6861 if (!ns) {
6862 return NVME_INVALID_FIELD | NVME_DNR;
6863 }
6864
6865 switch (dtype) {
6866 case NVME_DIRECTIVE_IDENTIFY:
6867 switch (doper) {
6868 case NVME_DIRECTIVE_RETURN_PARAMS:
6869 if (ns->endgrp->fdp.enabled) {
6870 id.supported |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
6871 id.enabled |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
6872 id.persistent |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
6873 }
6874
6875 return nvme_c2h(n, (uint8_t *)&id, trans_len, req);
6876
6877 default:
6878 return NVME_INVALID_FIELD | NVME_DNR;
6879 }
6880
6881 default:
6882 return NVME_INVALID_FIELD;
6883 }
6884 }
6885
6886 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
6887 {
6888 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
6889 nvme_adm_opc_str(req->cmd.opcode));
6890
6891 if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
6892 trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
6893 return NVME_INVALID_OPCODE | NVME_DNR;
6894 }
6895
6896 /* SGLs shall not be used for Admin commands in NVMe over PCIe */
6897 if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
6898 return NVME_INVALID_FIELD | NVME_DNR;
6899 }
6900
6901 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
6902 return NVME_INVALID_FIELD;
6903 }
6904
6905 switch (req->cmd.opcode) {
6906 case NVME_ADM_CMD_DELETE_SQ:
6907 return nvme_del_sq(n, req);
6908 case NVME_ADM_CMD_CREATE_SQ:
6909 return nvme_create_sq(n, req);
6910 case NVME_ADM_CMD_GET_LOG_PAGE:
6911 return nvme_get_log(n, req);
6912 case NVME_ADM_CMD_DELETE_CQ:
6913 return nvme_del_cq(n, req);
6914 case NVME_ADM_CMD_CREATE_CQ:
6915 return nvme_create_cq(n, req);
6916 case NVME_ADM_CMD_IDENTIFY:
6917 return nvme_identify(n, req);
6918 case NVME_ADM_CMD_ABORT:
6919 return nvme_abort(n, req);
6920 case NVME_ADM_CMD_SET_FEATURES:
6921 return nvme_set_feature(n, req);
6922 case NVME_ADM_CMD_GET_FEATURES:
6923 return nvme_get_feature(n, req);
6924 case NVME_ADM_CMD_ASYNC_EV_REQ:
6925 return nvme_aer(n, req);
6926 case NVME_ADM_CMD_NS_ATTACHMENT:
6927 return nvme_ns_attachment(n, req);
6928 case NVME_ADM_CMD_VIRT_MNGMT:
6929 return nvme_virt_mngmt(n, req);
6930 case NVME_ADM_CMD_DBBUF_CONFIG:
6931 return nvme_dbbuf_config(n, req);
6932 case NVME_ADM_CMD_FORMAT_NVM:
6933 return nvme_format(n, req);
6934 case NVME_ADM_CMD_DIRECTIVE_SEND:
6935 return nvme_directive_send(n, req);
6936 case NVME_ADM_CMD_DIRECTIVE_RECV:
6937 return nvme_directive_receive(n, req);
6938 default:
6939 assert(false);
6940 }
6941
6942 return NVME_INVALID_OPCODE | NVME_DNR;
6943 }
6944
6945 static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
6946 {
6947 uint32_t v = cpu_to_le32(sq->tail);
6948
6949 trace_pci_nvme_update_sq_eventidx(sq->sqid, sq->tail);
6950
6951 pci_dma_write(PCI_DEVICE(sq->ctrl), sq->ei_addr, &v, sizeof(v));
6952 }
6953
6954 static void nvme_update_sq_tail(NvmeSQueue *sq)
6955 {
6956 uint32_t v;
6957
6958 pci_dma_read(PCI_DEVICE(sq->ctrl), sq->db_addr, &v, sizeof(v));
6959
6960 sq->tail = le32_to_cpu(v);
6961
6962 trace_pci_nvme_update_sq_tail(sq->sqid, sq->tail);
6963 }
6964
6965 static void nvme_process_sq(void *opaque)
6966 {
6967 NvmeSQueue *sq = opaque;
6968 NvmeCtrl *n = sq->ctrl;
6969 NvmeCQueue *cq = n->cq[sq->cqid];
6970
6971 uint16_t status;
6972 hwaddr addr;
6973 NvmeCmd cmd;
6974 NvmeRequest *req;
6975
6976 if (n->dbbuf_enabled) {
6977 nvme_update_sq_tail(sq);
6978 }
6979
6980 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
6981 addr = sq->dma_addr + sq->head * n->sqe_size;
6982 if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
6983 trace_pci_nvme_err_addr_read(addr);
6984 trace_pci_nvme_err_cfs();
6985 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
6986 break;
6987 }
6988 nvme_inc_sq_head(sq);
6989
6990 req = QTAILQ_FIRST(&sq->req_list);
6991 QTAILQ_REMOVE(&sq->req_list, req, entry);
6992 QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
6993 nvme_req_clear(req);
6994 req->cqe.cid = cmd.cid;
6995 memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
6996
6997 status = sq->sqid ? nvme_io_cmd(n, req) :
6998 nvme_admin_cmd(n, req);
6999 if (status != NVME_NO_COMPLETE) {
7000 req->status = status;
7001 nvme_enqueue_req_completion(cq, req);
7002 }
7003
7004 if (n->dbbuf_enabled) {
7005 nvme_update_sq_eventidx(sq);
7006 nvme_update_sq_tail(sq);
7007 }
7008 }
7009 }
7010
7011 static void nvme_update_msixcap_ts(PCIDevice *pci_dev, uint32_t table_size)
7012 {
7013 uint8_t *config;
7014
7015 if (!msix_present(pci_dev)) {
7016 return;
7017 }
7018
7019 assert(table_size > 0 && table_size <= pci_dev->msix_entries_nr);
7020
7021 config = pci_dev->config + pci_dev->msix_cap;
7022 pci_set_word_by_mask(config + PCI_MSIX_FLAGS, PCI_MSIX_FLAGS_QSIZE,
7023 table_size - 1);
7024 }
7025
7026 static void nvme_activate_virt_res(NvmeCtrl *n)
7027 {
7028 PCIDevice *pci_dev = PCI_DEVICE(n);
7029 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
7030 NvmeSecCtrlEntry *sctrl;
7031
7032 /* -1 to account for the admin queue */
7033 if (pci_is_vf(pci_dev)) {
7034 sctrl = nvme_sctrl(n);
7035 cap->vqprt = sctrl->nvq;
7036 cap->viprt = sctrl->nvi;
7037 n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
7038 n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
7039 } else {
7040 cap->vqrfap = n->next_pri_ctrl_cap.vqrfap;
7041 cap->virfap = n->next_pri_ctrl_cap.virfap;
7042 n->conf_ioqpairs = le16_to_cpu(cap->vqprt) +
7043 le16_to_cpu(cap->vqrfap) - 1;
7044 n->conf_msix_qsize = le16_to_cpu(cap->viprt) +
7045 le16_to_cpu(cap->virfap);
7046 }
7047 }
7048
7049 static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst)
7050 {
7051 PCIDevice *pci_dev = PCI_DEVICE(n);
7052 NvmeSecCtrlEntry *sctrl;
7053 NvmeNamespace *ns;
7054 int i;
7055
7056 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7057 ns = nvme_ns(n, i);
7058 if (!ns) {
7059 continue;
7060 }
7061
7062 nvme_ns_drain(ns);
7063 }
7064
7065 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7066 if (n->sq[i] != NULL) {
7067 nvme_free_sq(n->sq[i], n);
7068 }
7069 }
7070 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7071 if (n->cq[i] != NULL) {
7072 nvme_free_cq(n->cq[i], n);
7073 }
7074 }
7075
7076 while (!QTAILQ_EMPTY(&n->aer_queue)) {
7077 NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
7078 QTAILQ_REMOVE(&n->aer_queue, event, entry);
7079 g_free(event);
7080 }
7081
7082 if (n->params.sriov_max_vfs) {
7083 if (!pci_is_vf(pci_dev)) {
7084 for (i = 0; i < n->sec_ctrl_list.numcntl; i++) {
7085 sctrl = &n->sec_ctrl_list.sec[i];
7086 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
7087 }
7088
7089 if (rst != NVME_RESET_CONTROLLER) {
7090 pcie_sriov_pf_disable_vfs(pci_dev);
7091 }
7092 }
7093
7094 if (rst != NVME_RESET_CONTROLLER) {
7095 nvme_activate_virt_res(n);
7096 }
7097 }
7098
7099 n->aer_queued = 0;
7100 n->aer_mask = 0;
7101 n->outstanding_aers = 0;
7102 n->qs_created = false;
7103
7104 nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
7105
7106 if (pci_is_vf(pci_dev)) {
7107 sctrl = nvme_sctrl(n);
7108
7109 stl_le_p(&n->bar.csts, sctrl->scs ? 0 : NVME_CSTS_FAILED);
7110 } else {
7111 stl_le_p(&n->bar.csts, 0);
7112 }
7113
7114 stl_le_p(&n->bar.intms, 0);
7115 stl_le_p(&n->bar.intmc, 0);
7116 stl_le_p(&n->bar.cc, 0);
7117
7118 n->dbbuf_dbs = 0;
7119 n->dbbuf_eis = 0;
7120 n->dbbuf_enabled = false;
7121 }
7122
7123 static void nvme_ctrl_shutdown(NvmeCtrl *n)
7124 {
7125 NvmeNamespace *ns;
7126 int i;
7127
7128 if (n->pmr.dev) {
7129 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
7130 }
7131
7132 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7133 ns = nvme_ns(n, i);
7134 if (!ns) {
7135 continue;
7136 }
7137
7138 nvme_ns_shutdown(ns);
7139 }
7140 }
7141
7142 static void nvme_select_iocs(NvmeCtrl *n)
7143 {
7144 NvmeNamespace *ns;
7145 int i;
7146
7147 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7148 ns = nvme_ns(n, i);
7149 if (!ns) {
7150 continue;
7151 }
7152
7153 nvme_select_iocs_ns(n, ns);
7154 }
7155 }
7156
7157 static int nvme_start_ctrl(NvmeCtrl *n)
7158 {
7159 uint64_t cap = ldq_le_p(&n->bar.cap);
7160 uint32_t cc = ldl_le_p(&n->bar.cc);
7161 uint32_t aqa = ldl_le_p(&n->bar.aqa);
7162 uint64_t asq = ldq_le_p(&n->bar.asq);
7163 uint64_t acq = ldq_le_p(&n->bar.acq);
7164 uint32_t page_bits = NVME_CC_MPS(cc) + 12;
7165 uint32_t page_size = 1 << page_bits;
7166 NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
7167
7168 if (pci_is_vf(PCI_DEVICE(n)) && !sctrl->scs) {
7169 trace_pci_nvme_err_startfail_virt_state(le16_to_cpu(sctrl->nvi),
7170 le16_to_cpu(sctrl->nvq));
7171 return -1;
7172 }
7173 if (unlikely(n->cq[0])) {
7174 trace_pci_nvme_err_startfail_cq();
7175 return -1;
7176 }
7177 if (unlikely(n->sq[0])) {
7178 trace_pci_nvme_err_startfail_sq();
7179 return -1;
7180 }
7181 if (unlikely(asq & (page_size - 1))) {
7182 trace_pci_nvme_err_startfail_asq_misaligned(asq);
7183 return -1;
7184 }
7185 if (unlikely(acq & (page_size - 1))) {
7186 trace_pci_nvme_err_startfail_acq_misaligned(acq);
7187 return -1;
7188 }
7189 if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) {
7190 trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc));
7191 return -1;
7192 }
7193 if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) {
7194 trace_pci_nvme_err_startfail_page_too_small(
7195 NVME_CC_MPS(cc),
7196 NVME_CAP_MPSMIN(cap));
7197 return -1;
7198 }
7199 if (unlikely(NVME_CC_MPS(cc) >
7200 NVME_CAP_MPSMAX(cap))) {
7201 trace_pci_nvme_err_startfail_page_too_large(
7202 NVME_CC_MPS(cc),
7203 NVME_CAP_MPSMAX(cap));
7204 return -1;
7205 }
7206 if (unlikely(NVME_CC_IOCQES(cc) <
7207 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
7208 trace_pci_nvme_err_startfail_cqent_too_small(
7209 NVME_CC_IOCQES(cc),
7210 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes));
7211 return -1;
7212 }
7213 if (unlikely(NVME_CC_IOCQES(cc) >
7214 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
7215 trace_pci_nvme_err_startfail_cqent_too_large(
7216 NVME_CC_IOCQES(cc),
7217 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes));
7218 return -1;
7219 }
7220 if (unlikely(NVME_CC_IOSQES(cc) <
7221 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
7222 trace_pci_nvme_err_startfail_sqent_too_small(
7223 NVME_CC_IOSQES(cc),
7224 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes));
7225 return -1;
7226 }
7227 if (unlikely(NVME_CC_IOSQES(cc) >
7228 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
7229 trace_pci_nvme_err_startfail_sqent_too_large(
7230 NVME_CC_IOSQES(cc),
7231 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes));
7232 return -1;
7233 }
7234 if (unlikely(!NVME_AQA_ASQS(aqa))) {
7235 trace_pci_nvme_err_startfail_asqent_sz_zero();
7236 return -1;
7237 }
7238 if (unlikely(!NVME_AQA_ACQS(aqa))) {
7239 trace_pci_nvme_err_startfail_acqent_sz_zero();
7240 return -1;
7241 }
7242
7243 n->page_bits = page_bits;
7244 n->page_size = page_size;
7245 n->max_prp_ents = n->page_size / sizeof(uint64_t);
7246 n->cqe_size = 1 << NVME_CC_IOCQES(cc);
7247 n->sqe_size = 1 << NVME_CC_IOSQES(cc);
7248 nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
7249 nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
7250
7251 nvme_set_timestamp(n, 0ULL);
7252
7253 nvme_select_iocs(n);
7254
7255 return 0;
7256 }
7257
7258 static void nvme_cmb_enable_regs(NvmeCtrl *n)
7259 {
7260 uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc);
7261 uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz);
7262
7263 NVME_CMBLOC_SET_CDPCILS(cmbloc, 1);
7264 NVME_CMBLOC_SET_CDPMLS(cmbloc, 1);
7265 NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR);
7266 stl_le_p(&n->bar.cmbloc, cmbloc);
7267
7268 NVME_CMBSZ_SET_SQS(cmbsz, 1);
7269 NVME_CMBSZ_SET_CQS(cmbsz, 0);
7270 NVME_CMBSZ_SET_LISTS(cmbsz, 1);
7271 NVME_CMBSZ_SET_RDS(cmbsz, 1);
7272 NVME_CMBSZ_SET_WDS(cmbsz, 1);
7273 NVME_CMBSZ_SET_SZU(cmbsz, 2); /* MBs */
7274 NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb);
7275 stl_le_p(&n->bar.cmbsz, cmbsz);
7276 }
7277
7278 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
7279 unsigned size)
7280 {
7281 PCIDevice *pci = PCI_DEVICE(n);
7282 uint64_t cap = ldq_le_p(&n->bar.cap);
7283 uint32_t cc = ldl_le_p(&n->bar.cc);
7284 uint32_t intms = ldl_le_p(&n->bar.intms);
7285 uint32_t csts = ldl_le_p(&n->bar.csts);
7286 uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
7287
7288 if (unlikely(offset & (sizeof(uint32_t) - 1))) {
7289 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
7290 "MMIO write not 32-bit aligned,"
7291 " offset=0x%"PRIx64"", offset);
7292 /* should be ignored, fall through for now */
7293 }
7294
7295 if (unlikely(size < sizeof(uint32_t))) {
7296 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
7297 "MMIO write smaller than 32-bits,"
7298 " offset=0x%"PRIx64", size=%u",
7299 offset, size);
7300 /* should be ignored, fall through for now */
7301 }
7302
7303 switch (offset) {
7304 case NVME_REG_INTMS:
7305 if (unlikely(msix_enabled(pci))) {
7306 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
7307 "undefined access to interrupt mask set"
7308 " when MSI-X is enabled");
7309 /* should be ignored, fall through for now */
7310 }
7311 intms |= data;
7312 stl_le_p(&n->bar.intms, intms);
7313 n->bar.intmc = n->bar.intms;
7314 trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms);
7315 nvme_irq_check(n);
7316 break;
7317 case NVME_REG_INTMC:
7318 if (unlikely(msix_enabled(pci))) {
7319 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
7320 "undefined access to interrupt mask clr"
7321 " when MSI-X is enabled");
7322 /* should be ignored, fall through for now */
7323 }
7324 intms &= ~data;
7325 stl_le_p(&n->bar.intms, intms);
7326 n->bar.intmc = n->bar.intms;
7327 trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms);
7328 nvme_irq_check(n);
7329 break;
7330 case NVME_REG_CC:
7331 stl_le_p(&n->bar.cc, data);
7332
7333 trace_pci_nvme_mmio_cfg(data & 0xffffffff);
7334
7335 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
7336 trace_pci_nvme_mmio_shutdown_set();
7337 nvme_ctrl_shutdown(n);
7338 csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
7339 csts |= NVME_CSTS_SHST_COMPLETE;
7340 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
7341 trace_pci_nvme_mmio_shutdown_cleared();
7342 csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
7343 }
7344
7345 if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {
7346 if (unlikely(nvme_start_ctrl(n))) {
7347 trace_pci_nvme_err_startfail();
7348 csts = NVME_CSTS_FAILED;
7349 } else {
7350 trace_pci_nvme_mmio_start_success();
7351 csts = NVME_CSTS_READY;
7352 }
7353 } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
7354 trace_pci_nvme_mmio_stopped();
7355 nvme_ctrl_reset(n, NVME_RESET_CONTROLLER);
7356
7357 break;
7358 }
7359
7360 stl_le_p(&n->bar.csts, csts);
7361
7362 break;
7363 case NVME_REG_CSTS:
7364 if (data & (1 << 4)) {
7365 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
7366 "attempted to W1C CSTS.NSSRO"
7367 " but CAP.NSSRS is zero (not supported)");
7368 } else if (data != 0) {
7369 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
7370 "attempted to set a read only bit"
7371 " of controller status");
7372 }
7373 break;
7374 case NVME_REG_NSSR:
7375 if (data == 0x4e564d65) {
7376 trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
7377 } else {
7378 /* The spec says that writes of other values have no effect */
7379 return;
7380 }
7381 break;
7382 case NVME_REG_AQA:
7383 stl_le_p(&n->bar.aqa, data);
7384 trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
7385 break;
7386 case NVME_REG_ASQ:
7387 stn_le_p(&n->bar.asq, size, data);
7388 trace_pci_nvme_mmio_asqaddr(data);
7389 break;
7390 case NVME_REG_ASQ + 4:
7391 stl_le_p((uint8_t *)&n->bar.asq + 4, data);
7392 trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq));
7393 break;
7394 case NVME_REG_ACQ:
7395 trace_pci_nvme_mmio_acqaddr(data);
7396 stn_le_p(&n->bar.acq, size, data);
7397 break;
7398 case NVME_REG_ACQ + 4:
7399 stl_le_p((uint8_t *)&n->bar.acq + 4, data);
7400 trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq));
7401 break;
7402 case NVME_REG_CMBLOC:
7403 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
7404 "invalid write to reserved CMBLOC"
7405 " when CMBSZ is zero, ignored");
7406 return;
7407 case NVME_REG_CMBSZ:
7408 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
7409 "invalid write to read only CMBSZ, ignored");
7410 return;
7411 case NVME_REG_CMBMSC:
7412 if (!NVME_CAP_CMBS(cap)) {
7413 return;
7414 }
7415
7416 stn_le_p(&n->bar.cmbmsc, size, data);
7417 n->cmb.cmse = false;
7418
7419 if (NVME_CMBMSC_CRE(data)) {
7420 nvme_cmb_enable_regs(n);
7421
7422 if (NVME_CMBMSC_CMSE(data)) {
7423 uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc);
7424 hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT;
7425 if (cba + int128_get64(n->cmb.mem.size) < cba) {
7426 uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts);
7427 NVME_CMBSTS_SET_CBAI(cmbsts, 1);
7428 stl_le_p(&n->bar.cmbsts, cmbsts);
7429 return;
7430 }
7431
7432 n->cmb.cba = cba;
7433 n->cmb.cmse = true;
7434 }
7435 } else {
7436 n->bar.cmbsz = 0;
7437 n->bar.cmbloc = 0;
7438 }
7439
7440 return;
7441 case NVME_REG_CMBMSC + 4:
7442 stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data);
7443 return;
7444
7445 case NVME_REG_PMRCAP:
7446 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
7447 "invalid write to PMRCAP register, ignored");
7448 return;
7449 case NVME_REG_PMRCTL:
7450 if (!NVME_CAP_PMRS(cap)) {
7451 return;
7452 }
7453
7454 stl_le_p(&n->bar.pmrctl, data);
7455 if (NVME_PMRCTL_EN(data)) {
7456 memory_region_set_enabled(&n->pmr.dev->mr, true);
7457 pmrsts = 0;
7458 } else {
7459 memory_region_set_enabled(&n->pmr.dev->mr, false);
7460 NVME_PMRSTS_SET_NRDY(pmrsts, 1);
7461 n->pmr.cmse = false;
7462 }
7463 stl_le_p(&n->bar.pmrsts, pmrsts);
7464 return;
7465 case NVME_REG_PMRSTS:
7466 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
7467 "invalid write to PMRSTS register, ignored");
7468 return;
7469 case NVME_REG_PMREBS:
7470 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
7471 "invalid write to PMREBS register, ignored");
7472 return;
7473 case NVME_REG_PMRSWTP:
7474 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
7475 "invalid write to PMRSWTP register, ignored");
7476 return;
7477 case NVME_REG_PMRMSCL:
7478 if (!NVME_CAP_PMRS(cap)) {
7479 return;
7480 }
7481
7482 stl_le_p(&n->bar.pmrmscl, data);
7483 n->pmr.cmse = false;
7484
7485 if (NVME_PMRMSCL_CMSE(data)) {
7486 uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu);
7487 hwaddr cba = pmrmscu << 32 |
7488 (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT);
7489 if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
7490 NVME_PMRSTS_SET_CBAI(pmrsts, 1);
7491 stl_le_p(&n->bar.pmrsts, pmrsts);
7492 return;
7493 }
7494
7495 n->pmr.cmse = true;
7496 n->pmr.cba = cba;
7497 }
7498
7499 return;
7500 case NVME_REG_PMRMSCU:
7501 if (!NVME_CAP_PMRS(cap)) {
7502 return;
7503 }
7504
7505 stl_le_p(&n->bar.pmrmscu, data);
7506 return;
7507 default:
7508 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
7509 "invalid MMIO write,"
7510 " offset=0x%"PRIx64", data=%"PRIx64"",
7511 offset, data);
7512 break;
7513 }
7514 }
7515
7516 static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
7517 {
7518 NvmeCtrl *n = (NvmeCtrl *)opaque;
7519 uint8_t *ptr = (uint8_t *)&n->bar;
7520
7521 trace_pci_nvme_mmio_read(addr, size);
7522
7523 if (unlikely(addr & (sizeof(uint32_t) - 1))) {
7524 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
7525 "MMIO read not 32-bit aligned,"
7526 " offset=0x%"PRIx64"", addr);
7527 /* should RAZ, fall through for now */
7528 } else if (unlikely(size < sizeof(uint32_t))) {
7529 NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
7530 "MMIO read smaller than 32-bits,"
7531 " offset=0x%"PRIx64"", addr);
7532 /* should RAZ, fall through for now */
7533 }
7534
7535 if (addr > sizeof(n->bar) - size) {
7536 NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
7537 "MMIO read beyond last register,"
7538 " offset=0x%"PRIx64", returning 0", addr);
7539
7540 return 0;
7541 }
7542
7543 if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
7544 addr != NVME_REG_CSTS) {
7545 trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
7546 return 0;
7547 }
7548
7549 /*
7550 * When PMRWBM bit 1 is set then read from
7551 * from PMRSTS should ensure prior writes
7552 * made it to persistent media
7553 */
7554 if (addr == NVME_REG_PMRSTS &&
7555 (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) {
7556 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
7557 }
7558
7559 return ldn_le_p(ptr + addr, size);
7560 }
7561
7562 static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
7563 {
7564 PCIDevice *pci = PCI_DEVICE(n);
7565 uint32_t qid;
7566
7567 if (unlikely(addr & ((1 << 2) - 1))) {
7568 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
7569 "doorbell write not 32-bit aligned,"
7570 " offset=0x%"PRIx64", ignoring", addr);
7571 return;
7572 }
7573
7574 if (((addr - 0x1000) >> 2) & 1) {
7575 /* Completion queue doorbell write */
7576
7577 uint16_t new_head = val & 0xffff;
7578 int start_sqs;
7579 NvmeCQueue *cq;
7580
7581 qid = (addr - (0x1000 + (1 << 2))) >> 3;
7582 if (unlikely(nvme_check_cqid(n, qid))) {
7583 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
7584 "completion queue doorbell write"
7585 " for nonexistent queue,"
7586 " sqid=%"PRIu32", ignoring", qid);
7587
7588 /*
6589 * NVM Express v1.3d, Section 4.1 states: "If host software writes
7590 * an invalid value to the Submission Queue Tail Doorbell or
6591 * Completion Queue Head Doorbell register and an Asynchronous Event
7592 * Request command is outstanding, then an asynchronous event is
7593 * posted to the Admin Completion Queue with a status code of
7594 * Invalid Doorbell Write Value."
7595 *
7596 * Also note that the spec includes the "Invalid Doorbell Register"
7597 * status code, but nowhere does it specify when to use it.
7598 * However, it seems reasonable to use it here in a similar
7599 * fashion.
7600 */
7601 if (n->outstanding_aers) {
7602 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
7603 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
7604 NVME_LOG_ERROR_INFO);
7605 }
7606
7607 return;
7608 }
7609
7610 cq = n->cq[qid];
7611 if (unlikely(new_head >= cq->size)) {
7612 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
7613 "completion queue doorbell write value"
7614 " beyond queue size, sqid=%"PRIu32","
7615 " new_head=%"PRIu16", ignoring",
7616 qid, new_head);
7617
7618 if (n->outstanding_aers) {
7619 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
7620 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
7621 NVME_LOG_ERROR_INFO);
7622 }
7623
7624 return;
7625 }
7626
7627 trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
7628
7629 start_sqs = nvme_cq_full(cq) ? 1 : 0;
7630 cq->head = new_head;
7631 if (!qid && n->dbbuf_enabled) {
7632 pci_dma_write(pci, cq->db_addr, &cq->head, sizeof(cq->head));
7633 }
7634 if (start_sqs) {
7635 NvmeSQueue *sq;
7636 QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
7637 qemu_bh_schedule(sq->bh);
7638 }
7639 qemu_bh_schedule(cq->bh);
7640 }
7641
7642 if (cq->tail == cq->head) {
7643 if (cq->irq_enabled) {
7644 n->cq_pending--;
7645 }
7646
7647 nvme_irq_deassert(n, cq);
7648 }
7649 } else {
7650 /* Submission queue doorbell write */
7651
7652 uint16_t new_tail = val & 0xffff;
7653 NvmeSQueue *sq;
7654
7655 qid = (addr - 0x1000) >> 3;
7656 if (unlikely(nvme_check_sqid(n, qid))) {
7657 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
7658 "submission queue doorbell write"
7659 " for nonexistent queue,"
7660 " sqid=%"PRIu32", ignoring", qid);
7661
7662 if (n->outstanding_aers) {
7663 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
7664 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
7665 NVME_LOG_ERROR_INFO);
7666 }
7667
7668 return;
7669 }
7670
7671 sq = n->sq[qid];
7672 if (unlikely(new_tail >= sq->size)) {
7673 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
7674 "submission queue doorbell write value"
7675 " beyond queue size, sqid=%"PRIu32","
7676 " new_tail=%"PRIu16", ignoring",
7677 qid, new_tail);
7678
7679 if (n->outstanding_aers) {
7680 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
7681 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
7682 NVME_LOG_ERROR_INFO);
7683 }
7684
7685 return;
7686 }
7687
7688 trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
7689
7690 sq->tail = new_tail;
7691 if (!qid && n->dbbuf_enabled) {
7692 /*
7693 * The spec states "the host shall also update the controller's
7694 * corresponding doorbell property to match the value of that entry
7695 * in the Shadow Doorbell buffer."
7696 *
7697 * Since this context is currently a VM trap, we can safely enforce
7698 * the requirement from the device side in case the host is
7699 * misbehaving.
7700 *
7701 * Note that we shouldn't have to do this, but various drivers,
7702 * including ones that run on Linux, do not update the Admin Queue
7703 * entries, so we can't trust reading them for an appropriate sq tail.
7704 */
7705 pci_dma_write(pci, sq->db_addr, &sq->tail, sizeof(sq->tail));
7706 }
7707
7708 qemu_bh_schedule(sq->bh);
7709 }
7710 }
7711
7712 static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
7713 unsigned size)
7714 {
7715 NvmeCtrl *n = (NvmeCtrl *)opaque;
7716
7717 trace_pci_nvme_mmio_write(addr, data, size);
7718
7719 if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
7720 addr != NVME_REG_CSTS) {
7721 trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
7722 return;
7723 }
7724
7725 if (addr < sizeof(n->bar)) {
7726 nvme_write_bar(n, addr, data, size);
7727 } else {
7728 nvme_process_db(n, addr, data);
7729 }
7730 }
7731
7732 static const MemoryRegionOps nvme_mmio_ops = {
7733 .read = nvme_mmio_read,
7734 .write = nvme_mmio_write,
7735 .endianness = DEVICE_LITTLE_ENDIAN,
7736 .impl = {
7737 .min_access_size = 2,
7738 .max_access_size = 8,
7739 },
7740 };
7741
7742 static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
7743 unsigned size)
7744 {
7745 NvmeCtrl *n = (NvmeCtrl *)opaque;
7746 stn_le_p(&n->cmb.buf[addr], size, data);
7747 }
7748
7749 static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
7750 {
7751 NvmeCtrl *n = (NvmeCtrl *)opaque;
7752 return ldn_le_p(&n->cmb.buf[addr], size);
7753 }
7754
7755 static const MemoryRegionOps nvme_cmb_ops = {
7756 .read = nvme_cmb_read,
7757 .write = nvme_cmb_write,
7758 .endianness = DEVICE_LITTLE_ENDIAN,
7759 .impl = {
7760 .min_access_size = 1,
7761 .max_access_size = 8,
7762 },
7763 };
7764
7765 static bool nvme_check_params(NvmeCtrl *n, Error **errp)
7766 {
7767 NvmeParams *params = &n->params;
7768
7769 if (params->num_queues) {
7770 warn_report("num_queues is deprecated; please use max_ioqpairs "
7771 "instead");
7772
7773 params->max_ioqpairs = params->num_queues - 1;
7774 }
7775
7776 if (n->namespace.blkconf.blk && n->subsys) {
7777 error_setg(errp, "subsystem support is unavailable with legacy "
7778 "namespace ('drive' property)");
7779 return false;
7780 }
7781
7782 if (params->max_ioqpairs < 1 ||
7783 params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
7784 error_setg(errp, "max_ioqpairs must be between 1 and %d",
7785 NVME_MAX_IOQPAIRS);
7786 return false;
7787 }
7788
7789 if (params->msix_qsize < 1 ||
7790 params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
7791 error_setg(errp, "msix_qsize must be between 1 and %d",
7792 PCI_MSIX_FLAGS_QSIZE + 1);
7793 return false;
7794 }
7795
7796 if (!params->serial) {
7797 error_setg(errp, "serial property not set");
7798 return false;
7799 }
7800
7801 if (n->pmr.dev) {
7802 if (host_memory_backend_is_mapped(n->pmr.dev)) {
7803 error_setg(errp, "can't use already busy memdev: %s",
7804 object_get_canonical_path_component(OBJECT(n->pmr.dev)));
7805 return false;
7806 }
7807
7808 if (!is_power_of_2(n->pmr.dev->size)) {
7809 error_setg(errp, "pmr backend size needs to be a power of 2");
7810 return false;
7811 }
7812
7813 host_memory_backend_set_mapped(n->pmr.dev, true);
7814 }
7815
7816 if (n->params.zasl > n->params.mdts) {
7817 error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
7818 "than or equal to mdts (Maximum Data Transfer Size)");
7819 return false;
7820 }
7821
7822 if (!n->params.vsl) {
7823 error_setg(errp, "vsl must be non-zero");
7824 return false;
7825 }
7826
7827 if (params->sriov_max_vfs) {
7828 if (!n->subsys) {
7829 error_setg(errp, "subsystem is required for the use of SR-IOV");
7830 return false;
7831 }
7832
7833 if (params->sriov_max_vfs > NVME_MAX_VFS) {
7834 error_setg(errp, "sriov_max_vfs must be between 0 and %d",
7835 NVME_MAX_VFS);
7836 return false;
7837 }
7838
7839 if (params->cmb_size_mb) {
7840 error_setg(errp, "CMB is not supported with SR-IOV");
7841 return false;
7842 }
7843
7844 if (n->pmr.dev) {
7845 error_setg(errp, "PMR is not supported with SR-IOV");
7846 return false;
7847 }
7848
7849 if (!params->sriov_vq_flexible || !params->sriov_vi_flexible) {
7850 error_setg(errp, "both sriov_vq_flexible and sriov_vi_flexible"
7851 " must be set for the use of SR-IOV");
7852 return false;
7853 }
7854
7855 if (params->sriov_vq_flexible < params->sriov_max_vfs * 2) {
7856 error_setg(errp, "sriov_vq_flexible must be greater than or equal"
7857 " to %d (sriov_max_vfs * 2)", params->sriov_max_vfs * 2);
7858 return false;
7859 }
7860
7861 if (params->max_ioqpairs < params->sriov_vq_flexible + 2) {
7862 error_setg(errp, "(max_ioqpairs - sriov_vq_flexible) must be"
7863 " greater than or equal to 2");
7864 return false;
7865 }
7866
7867 if (params->sriov_vi_flexible < params->sriov_max_vfs) {
7868 error_setg(errp, "sriov_vi_flexible must be greater than or equal"
7869 " to %d (sriov_max_vfs)", params->sriov_max_vfs);
7870 return false;
7871 }
7872
7873 if (params->msix_qsize < params->sriov_vi_flexible + 1) {
7874 error_setg(errp, "(msix_qsize - sriov_vi_flexible) must be"
7875 " greater than or equal to 1");
7876 return false;
7877 }
7878
7879 if (params->sriov_max_vi_per_vf &&
7880 (params->sriov_max_vi_per_vf - 1) % NVME_VF_RES_GRANULARITY) {
7881 error_setg(errp, "sriov_max_vi_per_vf must meet:"
7882 " (sriov_max_vi_per_vf - 1) %% %d == 0 and"
7883 " sriov_max_vi_per_vf >= 1", NVME_VF_RES_GRANULARITY);
7884 return false;
7885 }
7886
7887 if (params->sriov_max_vq_per_vf &&
7888 (params->sriov_max_vq_per_vf < 2 ||
7889 (params->sriov_max_vq_per_vf - 1) % NVME_VF_RES_GRANULARITY)) {
7890 error_setg(errp, "sriov_max_vq_per_vf must meet:"
7891 " (sriov_max_vq_per_vf - 1) %% %d == 0 and"
7892 " sriov_max_vq_per_vf >= 2", NVME_VF_RES_GRANULARITY);
7893 return false;
7894 }
7895 }
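/*
 * As one illustrative combination (not the only valid one), the SR-IOV
 * checks above pass with sriov_max_vfs=2, sriov_vq_flexible=4,
 * sriov_vi_flexible=2, max_ioqpairs=6 and msix_qsize=3, with CMB and PMR
 * left disabled.
 */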
7896
7897 return true;
7898 }
7899
7900 static void nvme_init_state(NvmeCtrl *n)
7901 {
7902 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
7903 NvmeSecCtrlList *list = &n->sec_ctrl_list;
7904 NvmeSecCtrlEntry *sctrl;
7905 PCIDevice *pci = PCI_DEVICE(n);
7906 uint8_t max_vfs;
7907 int i;
7908
7909 if (pci_is_vf(pci)) {
7910 sctrl = nvme_sctrl(n);
7911 max_vfs = 0;
7912 n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
7913 n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
7914 } else {
7915 max_vfs = n->params.sriov_max_vfs;
7916 n->conf_ioqpairs = n->params.max_ioqpairs;
7917 n->conf_msix_qsize = n->params.msix_qsize;
7918 }
7919
7920 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
7921 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
7922 n->temperature = NVME_TEMPERATURE;
7923 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
7924 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
7925 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
7926 QTAILQ_INIT(&n->aer_queue);
7927
7928 list->numcntl = cpu_to_le16(max_vfs);
7929 for (i = 0; i < max_vfs; i++) {
7930 sctrl = &list->sec[i];
7931 sctrl->pcid = cpu_to_le16(n->cntlid);
7932 sctrl->vfn = cpu_to_le16(i + 1);
7933 }
7934
7935 cap->cntlid = cpu_to_le16(n->cntlid);
7936 cap->crt = NVME_CRT_VQ | NVME_CRT_VI;
7937
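/*
 * For the PF, queue and interrupt resources are split into a private part
 * (VQPRT/VIPRT) kept by the primary controller and a flexible pool
 * (VQFRT/VIFRT) that Virtualization Management can assign to secondary
 * controllers; the +1 on VQPRT accounts for the admin queue pair.
 */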
7938 if (pci_is_vf(pci)) {
7939 cap->vqprt = cpu_to_le16(1 + n->conf_ioqpairs);
7940 } else {
7941 cap->vqprt = cpu_to_le16(1 + n->params.max_ioqpairs -
7942 n->params.sriov_vq_flexible);
7943 cap->vqfrt = cpu_to_le32(n->params.sriov_vq_flexible);
7944 cap->vqrfap = cap->vqfrt;
7945 cap->vqgran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
7946 cap->vqfrsm = n->params.sriov_max_vq_per_vf ?
7947 cpu_to_le16(n->params.sriov_max_vq_per_vf) :
7948 cap->vqfrt / MAX(max_vfs, 1);
7949 }
7950
7951 if (pci_is_vf(pci)) {
7952 cap->viprt = cpu_to_le16(n->conf_msix_qsize);
7953 } else {
7954 cap->viprt = cpu_to_le16(n->params.msix_qsize -
7955 n->params.sriov_vi_flexible);
7956 cap->vifrt = cpu_to_le32(n->params.sriov_vi_flexible);
7957 cap->virfap = cap->vifrt;
7958 cap->vigran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
7959 cap->vifrsm = n->params.sriov_max_vi_per_vf ?
7960 cpu_to_le16(n->params.sriov_max_vi_per_vf) :
7961 cap->vifrt / MAX(max_vfs, 1);
7962 }
7963 }
7964
7965 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
7966 {
7967 uint64_t cmb_size = n->params.cmb_size_mb * MiB;
7968 uint64_t cap = ldq_le_p(&n->bar.cap);
7969
7970 n->cmb.buf = g_malloc0(cmb_size);
7971 memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
7972 "nvme-cmb", cmb_size);
7973 pci_register_bar(pci_dev, NVME_CMB_BIR,
7974 PCI_BASE_ADDRESS_SPACE_MEMORY |
7975 PCI_BASE_ADDRESS_MEM_TYPE_64 |
7976 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
7977
7978 NVME_CAP_SET_CMBS(cap, 1);
7979 stq_le_p(&n->bar.cap, cap);
7980
7981 if (n->params.legacy_cmb) {
7982 nvme_cmb_enable_regs(n);
7983 n->cmb.cmse = true;
7984 }
7985 }
7986
7987 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
7988 {
7989 uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);
7990
7991 NVME_PMRCAP_SET_RDS(pmrcap, 1);
7992 NVME_PMRCAP_SET_WDS(pmrcap, 1);
7993 NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
7994 /* Turn on PMRWBM bit 1 support */
7995 NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
7996 NVME_PMRCAP_SET_CMSS(pmrcap, 1);
7997 stl_le_p(&n->bar.pmrcap, pmrcap);
7998
7999 pci_register_bar(pci_dev, NVME_PMR_BIR,
8000 PCI_BASE_ADDRESS_SPACE_MEMORY |
8001 PCI_BASE_ADDRESS_MEM_TYPE_64 |
8002 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
8003
8004 memory_region_set_enabled(&n->pmr.dev->mr, false);
8005 }
8006
8007 static uint64_t nvme_bar_size(unsigned total_queues, unsigned total_irqs,
8008 unsigned *msix_table_offset,
8009 unsigned *msix_pba_offset)
8010 {
8011 uint64_t bar_size, msix_table_size, msix_pba_size;
8012
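/*
 * BAR0 layout produced below: the NVMe registers plus two doorbells per
 * queue, rounded up to 4 KiB, then the MSI-X table, again rounded to
 * 4 KiB, then the MSI-X PBA; the total is rounded up to a power of two as
 * required for a BAR size.
 */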
8013 bar_size = sizeof(NvmeBar) + 2 * total_queues * NVME_DB_SIZE;
8014 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
8015
8016 if (msix_table_offset) {
8017 *msix_table_offset = bar_size;
8018 }
8019
8020 msix_table_size = PCI_MSIX_ENTRY_SIZE * total_irqs;
8021 bar_size += msix_table_size;
8022 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
8023
8024 if (msix_pba_offset) {
8025 *msix_pba_offset = bar_size;
8026 }
8027
8028 msix_pba_size = QEMU_ALIGN_UP(total_irqs, 64) / 8;
8029 bar_size += msix_pba_size;
8030
8031 bar_size = pow2ceil(bar_size);
8032 return bar_size;
8033 }
8034
8035 static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset)
8036 {
8037 uint16_t vf_dev_id = n->params.use_intel_id ?
8038 PCI_DEVICE_ID_INTEL_NVME : PCI_DEVICE_ID_REDHAT_NVME;
8039 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
8040 uint64_t bar_size = nvme_bar_size(le16_to_cpu(cap->vqfrsm),
8041 le16_to_cpu(cap->vifrsm),
8042 NULL, NULL);
8043
8044 pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id,
8045 n->params.sriov_max_vfs, n->params.sriov_max_vfs,
8046 NVME_VF_OFFSET, NVME_VF_STRIDE);
8047
8048 pcie_sriov_pf_init_vf_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8049 PCI_BASE_ADDRESS_MEM_TYPE_64, bar_size);
8050 }
8051
8052 static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset)
8053 {
8054 Error *err = NULL;
8055 int ret;
8056
8057 ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, offset,
8058 PCI_PM_SIZEOF, &err);
8059 if (err) {
8060 error_report_err(err);
8061 return ret;
8062 }
8063
8064 pci_set_word(pci_dev->config + offset + PCI_PM_PMC,
8065 PCI_PM_CAP_VER_1_2);
8066 pci_set_word(pci_dev->config + offset + PCI_PM_CTRL,
8067 PCI_PM_CTRL_NO_SOFT_RESET);
8068 pci_set_word(pci_dev->wmask + offset + PCI_PM_CTRL,
8069 PCI_PM_CTRL_STATE_MASK);
8070
8071 return 0;
8072 }
8073
8074 static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
8075 {
8076 ERRP_GUARD();
8077 uint8_t *pci_conf = pci_dev->config;
8078 uint64_t bar_size;
8079 unsigned msix_table_offset, msix_pba_offset;
8080 int ret;
8081
8082 pci_conf[PCI_INTERRUPT_PIN] = 1;
8083 pci_config_set_prog_interface(pci_conf, 0x2);
8084
8085 if (n->params.use_intel_id) {
8086 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
8087 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_INTEL_NVME);
8088 } else {
8089 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
8090 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
8091 }
8092
8093 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
8094 nvme_add_pm_capability(pci_dev, 0x60);
8095 pcie_endpoint_cap_init(pci_dev, 0x80);
8096 pcie_cap_flr_init(pci_dev);
8097 if (n->params.sriov_max_vfs) {
8098 pcie_ari_init(pci_dev, 0x100, 1);
8099 }
8100
8101 /* add one to max_ioqpairs to account for the admin queue pair */
8102 bar_size = nvme_bar_size(n->params.max_ioqpairs + 1, n->params.msix_qsize,
8103 &msix_table_offset, &msix_pba_offset);
8104
8105 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
8106 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
8107 msix_table_offset);
8108 memory_region_add_subregion(&n->bar0, 0, &n->iomem);
8109
8110 if (pci_is_vf(pci_dev)) {
8111 pcie_sriov_vf_register_bar(pci_dev, 0, &n->bar0);
8112 } else {
8113 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8114 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
8115 }
8116 ret = msix_init(pci_dev, n->params.msix_qsize,
8117 &n->bar0, 0, msix_table_offset,
8118 &n->bar0, 0, msix_pba_offset, 0, errp);
8119 if (ret == -ENOTSUP) {
8120 /* report that msix is not supported, but do not error out */
8121 warn_report_err(*errp);
8122 *errp = NULL;
8123 } else if (ret < 0) {
8124 /* propagate error to caller */
8125 return false;
8126 }
8127
8128 nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
8129
8130 if (n->params.cmb_size_mb) {
8131 nvme_init_cmb(n, pci_dev);
8132 }
8133
8134 if (n->pmr.dev) {
8135 nvme_init_pmr(n, pci_dev);
8136 }
8137
8138 if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
8139 nvme_init_sriov(n, pci_dev, 0x120);
8140 }
8141
8142 return true;
8143 }
8144
8145 static void nvme_init_subnqn(NvmeCtrl *n)
8146 {
8147 NvmeSubsystem *subsys = n->subsys;
8148 NvmeIdCtrl *id = &n->id_ctrl;
8149
8150 if (!subsys) {
8151 snprintf((char *)id->subnqn, sizeof(id->subnqn),
8152 "nqn.2019-08.org.qemu:%s", n->params.serial);
8153 } else {
8154 pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
8155 }
8156 }
8157
8158 static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
8159 {
8160 NvmeIdCtrl *id = &n->id_ctrl;
8161 uint8_t *pci_conf = pci_dev->config;
8162 uint64_t cap = ldq_le_p(&n->bar.cap);
8163 NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
8164 uint32_t ctratt;
8165
8166 id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
8167 id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
8168 strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
8169 strpadcpy((char *)id->fr, sizeof(id->fr), QEMU_VERSION, ' ');
8170 strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
8171
8172 id->cntlid = cpu_to_le16(n->cntlid);
8173
8174 id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
8175 ctratt = NVME_CTRATT_ELBAS;
8176
8177 id->rab = 6;
8178
8179 if (n->params.use_intel_id) {
8180 id->ieee[0] = 0xb3;
8181 id->ieee[1] = 0x02;
8182 id->ieee[2] = 0x00;
8183 } else {
8184 id->ieee[0] = 0x00;
8185 id->ieee[1] = 0x54;
8186 id->ieee[2] = 0x52;
8187 }
8188
8189 id->mdts = n->params.mdts;
8190 id->ver = cpu_to_le32(NVME_SPEC_VER);
8191 id->oacs =
8192 cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | NVME_OACS_DBBUF |
8193 NVME_OACS_DIRECTIVES);
8194 id->cntrltype = 0x1;
8195
8196 /*
8197 * Because the controller always completes the Abort command immediately,
8198 * there can never be more than one concurrently executing Abort command,
8199 * so this value is never used for anything. Note that there can easily be
8200 * many Abort commands in the queues, but they are not considered
8201 * "executing" until processed by nvme_abort.
8202 *
8203 * The specification recommends a value of 3 for Abort Command Limit (four
8204 * concurrently outstanding Abort commands), so lets use that though it is
8205 * inconsequential.
8206 */
8207 id->acl = 3;
8208 id->aerl = n->params.aerl;
8209 id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
8210 id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
8211
8212 /* recommended default value (~70 °C) */
8213 id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
8214 id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
8215
8216 id->sqes = (0x6 << 4) | 0x6;
8217 id->cqes = (0x4 << 4) | 0x4;
8218 id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
8219 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
8220 NVME_ONCS_FEATURES | NVME_ONCS_DSM |
8221 NVME_ONCS_COMPARE | NVME_ONCS_COPY);
8222
8223 /*
8224 * NOTE: If this device ever supports a command set that does NOT use 0x0
8225 * as a Flush-equivalent operation, support for the broadcast NSID in Flush
8226 * should probably be removed.
8227 *
8228 * See comment in nvme_io_cmd.
8229 */
8230 id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
8231
8232 id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0 | NVME_OCFS_COPY_FORMAT_1);
8233 id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN);
8234
8235 nvme_init_subnqn(n);
8236
8237 id->psd[0].mp = cpu_to_le16(0x9c4);
8238 id->psd[0].enlat = cpu_to_le32(0x10);
8239 id->psd[0].exlat = cpu_to_le32(0x4);
8240
8241 if (n->subsys) {
8242 id->cmic |= NVME_CMIC_MULTI_CTRL;
8243 ctratt |= NVME_CTRATT_ENDGRPS;
8244
8245 id->endgidmax = cpu_to_le16(0x1);
8246
8247 if (n->subsys->endgrp.fdp.enabled) {
8248 ctratt |= NVME_CTRATT_FDPS;
8249 }
8250 }
8251
8252 id->ctratt = cpu_to_le32(ctratt);
8253
8254 NVME_CAP_SET_MQES(cap, 0x7ff);
8255 NVME_CAP_SET_CQR(cap, 1);
8256 NVME_CAP_SET_TO(cap, 0xf);
8257 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NVM);
8258 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_CSI_SUPP);
8259 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_ADMIN_ONLY);
8260 NVME_CAP_SET_MPSMAX(cap, 4);
8261 NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
8262 NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
8263 stq_le_p(&n->bar.cap, cap);
8264
8265 stl_le_p(&n->bar.vs, NVME_SPEC_VER);
8266 n->bar.intmc = n->bar.intms = 0;
8267
8268 if (pci_is_vf(pci_dev) && !sctrl->scs) {
8269 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
8270 }
8271 }
8272
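/*
 * If the controller is part of a subsystem, register it there and record
 * the controller identifier (CNTLID) assigned by the subsystem.
 */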
8273 static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
8274 {
8275 int cntlid;
8276
8277 if (!n->subsys) {
8278 return 0;
8279 }
8280
8281 cntlid = nvme_subsys_register_ctrl(n, errp);
8282 if (cntlid < 0) {
8283 return -1;
8284 }
8285
8286 n->cntlid = cntlid;
8287
8288 return 0;
8289 }
8290
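/*
 * Attach a namespace to this controller and cap the Dataset Management
 * range size limit (dmrsl) at the largest request the block layer accepts
 * for this namespace's logical block size.
 */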
8291 void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
8292 {
8293 uint32_t nsid = ns->params.nsid;
8294 assert(nsid && nsid <= NVME_MAX_NAMESPACES);
8295
8296 n->namespaces[nsid] = ns;
8297 ns->attached++;
8298
8299 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
8300 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
8301 }
8302
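/*
 * Device realize handler. VFs inherit their parameters and subsystem from
 * the PF before the usual parameter checking, state, PCI and controller
 * initialization; a namespace is set up implicitly when a block drive is
 * attached directly to the controller instead of via an nvme-ns device.
 */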
8303 static void nvme_realize(PCIDevice *pci_dev, Error **errp)
8304 {
8305 NvmeCtrl *n = NVME(pci_dev);
8306 DeviceState *dev = DEVICE(pci_dev);
8307 NvmeNamespace *ns;
8308 NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
8309
8310 if (pci_is_vf(pci_dev)) {
8311 /*
8312 * VFs derive settings from the parent. The PF's lifespan exceeds
8313 * that of its VFs, so it is safe to share params.serial.
8314 */
8315 memcpy(&n->params, &pn->params, sizeof(NvmeParams));
8316 n->subsys = pn->subsys;
8317 }
8318
8319 if (!nvme_check_params(n, errp)) {
8320 return;
8321 }
8322
8323 qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS, dev, dev->id);
8324
8325 if (nvme_init_subsys(n, errp)) {
8326 return;
8327 }
8328 nvme_init_state(n);
8329 if (!nvme_init_pci(n, pci_dev, errp)) {
8330 return;
8331 }
8332 nvme_init_ctrl(n, pci_dev);
8333
8334 /* set up a namespace if the controller drive property was given */
8335 if (n->namespace.blkconf.blk) {
8336 ns = &n->namespace;
8337 ns->params.nsid = 1;
8338
8339 if (nvme_ns_setup(ns, errp)) {
8340 return;
8341 }
8342
8343 nvme_attach_ns(n, ns);
8344 }
8345 }
8346
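/*
 * Device teardown: reset the controller, detach it from any namespaces and
 * from its subsystem, and release the queue, AER, CMB, PMR, SR-IOV and
 * MSI-X resources acquired during realize.
 */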
8347 static void nvme_exit(PCIDevice *pci_dev)
8348 {
8349 NvmeCtrl *n = NVME(pci_dev);
8350 NvmeNamespace *ns;
8351 int i;
8352
8353 nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
8354
8355 if (n->subsys) {
8356 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
8357 ns = nvme_ns(n, i);
8358 if (ns) {
8359 ns->attached--;
8360 }
8361 }
8362
8363 nvme_subsys_unregister_ctrl(n->subsys, n);
8364 }
8365
8366 g_free(n->cq);
8367 g_free(n->sq);
8368 g_free(n->aer_reqs);
8369
8370 if (n->params.cmb_size_mb) {
8371 g_free(n->cmb.buf);
8372 }
8373
8374 if (n->pmr.dev) {
8375 host_memory_backend_set_mapped(n->pmr.dev, false);
8376 }
8377
8378 if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
8379 pcie_sriov_pf_exit(pci_dev);
8380 }
8381
8382 msix_uninit(pci_dev, &n->bar0, &n->bar0);
8383 memory_region_del_subregion(&n->bar0, &n->iomem);
8384 }
8385
8386 static Property nvme_props[] = {
8387 DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
8388 DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
8389 HostMemoryBackend *),
8390 DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
8391 NvmeSubsystem *),
8392 DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
8393 DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
8394 DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
8395 DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
8396 DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
8397 DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
8398 DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
8399 DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
8400 DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
8401 DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
8402 DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
8403 DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, false),
8404 DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
8405 DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
8406 params.auto_transition_zones, true),
8407 DEFINE_PROP_UINT8("sriov_max_vfs", NvmeCtrl, params.sriov_max_vfs, 0),
8408 DEFINE_PROP_UINT16("sriov_vq_flexible", NvmeCtrl,
8409 params.sriov_vq_flexible, 0),
8410 DEFINE_PROP_UINT16("sriov_vi_flexible", NvmeCtrl,
8411 params.sriov_vi_flexible, 0),
8412 DEFINE_PROP_UINT8("sriov_max_vi_per_vf", NvmeCtrl,
8413 params.sriov_max_vi_per_vf, 0),
8414 DEFINE_PROP_UINT8("sriov_max_vq_per_vf", NvmeCtrl,
8415 params.sriov_max_vq_per_vf, 0),
8416 DEFINE_PROP_END_OF_LIST(),
8417 };
8418
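/* QOM getter for the "smart_critical_warning" property */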
8419 static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
8420 void *opaque, Error **errp)
8421 {
8422 NvmeCtrl *n = NVME(obj);
8423 uint8_t value = n->smart_critical_warning;
8424
8425 visit_type_uint8(v, name, &value, errp);
8426 }
8427
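/*
 * QOM setter for the "smart_critical_warning" property: validate that only
 * supported critical warning bits are set and inject a SMART event for each
 * bit that transitions from clear to set.
 */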
8428 static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
8429 void *opaque, Error **errp)
8430 {
8431 NvmeCtrl *n = NVME(obj);
8432 uint8_t value, old_value, cap = 0, index, event;
8433
8434 if (!visit_type_uint8(v, name, &value, errp)) {
8435 return;
8436 }
8437
8438 cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
8439 | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
8440 if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
8441 cap |= NVME_SMART_PMR_UNRELIABLE;
8442 }
8443
8444 if ((value & cap) != value) {
8445 error_setg(errp, "unsupported smart critical warning bits: 0x%x",
8446 value & ~cap);
8447 return;
8448 }
8449
8450 old_value = n->smart_critical_warning;
8451 n->smart_critical_warning = value;
8452
8453 /* only inject events for newly set critical warning bits */
8454 for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
8455 event = 1 << index;
8456 if (value & ~old_value & event) {
8457 nvme_smart_event(n, event);
}
8458 }
8459 }
8460
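/* qdev reset handler: perform a controller-level (function) reset */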
8461 static void nvme_pci_reset(DeviceState *qdev)
8462 {
8463 PCIDevice *pci_dev = PCI_DEVICE(qdev);
8464 NvmeCtrl *n = NVME(pci_dev);
8465
8466 trace_pci_nvme_pci_reset();
8467 nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
8468 }
8469
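/*
 * Intercept writes to the SR-IOV control register before they take effect:
 * when VF Enable is being cleared, transition all secondary controllers to
 * the offline state.
 */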
8470 static void nvme_sriov_pre_write_ctrl(PCIDevice *dev, uint32_t address,
8471 uint32_t val, int len)
8472 {
8473 NvmeCtrl *n = NVME(dev);
8474 NvmeSecCtrlEntry *sctrl;
8475 uint16_t sriov_cap = dev->exp.sriov_cap;
8476 uint32_t off = address - sriov_cap;
8477 int i, num_vfs;
8478
8479 if (!sriov_cap) {
8480 return;
8481 }
8482
8483 if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) {
8484 if (!(val & PCI_SRIOV_CTRL_VFE)) {
8485 num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
8486 for (i = 0; i < num_vfs; i++) {
8487 sctrl = &n->sec_ctrl_list.sec[i];
8488 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
8489 }
8490 }
8491 }
8492 }
8493
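/* PCI config space write handler; hooks SR-IOV and FLR processing */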
8494 static void nvme_pci_write_config(PCIDevice *dev, uint32_t address,
8495 uint32_t val, int len)
8496 {
8497 nvme_sriov_pre_write_ctrl(dev, address, val, len);
8498 pci_default_write_config(dev, address, val, len);
8499 pcie_cap_flr_write_config(dev, address, val, len);
8500 }
8501
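/* the nvme device state is not migratable */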
8502 static const VMStateDescription nvme_vmstate = {
8503 .name = "nvme",
8504 .unmigratable = 1,
8505 };
8506
8507 static void nvme_class_init(ObjectClass *oc, void *data)
8508 {
8509 DeviceClass *dc = DEVICE_CLASS(oc);
8510 PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
8511
8512 pc->realize = nvme_realize;
8513 pc->config_write = nvme_pci_write_config;
8514 pc->exit = nvme_exit;
8515 pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
8516 pc->revision = 2;
8517
8518 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
8519 dc->desc = "Non-Volatile Memory Express";
8520 device_class_set_props(dc, nvme_props);
8521 dc->vmsd = &nvme_vmstate;
8522 dc->reset = nvme_pci_reset;
8523 }
8524
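/* expose the bootindex and smart_critical_warning QOM properties */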
8525 static void nvme_instance_init(Object *obj)
8526 {
8527 NvmeCtrl *n = NVME(obj);
8528
8529 device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
8530 "bootindex", "/namespace@1,0",
8531 DEVICE(obj));
8532
8533 object_property_add(obj, "smart_critical_warning", "uint8",
8534 nvme_get_smart_warning,
8535 nvme_set_smart_warning, NULL, NULL);
8536 }
8537
8538 static const TypeInfo nvme_info = {
8539 .name = TYPE_NVME,
8540 .parent = TYPE_PCI_DEVICE,
8541 .instance_size = sizeof(NvmeCtrl),
8542 .instance_init = nvme_instance_init,
8543 .class_init = nvme_class_init,
8544 .interfaces = (InterfaceInfo[]) {
8545 { INTERFACE_PCIE_DEVICE },
8546 { }
8547 },
8548 };
8549
8550 static const TypeInfo nvme_bus_info = {
8551 .name = TYPE_NVME_BUS,
8552 .parent = TYPE_BUS,
8553 .instance_size = sizeof(NvmeBus),
8554 };
8555
8556 static void nvme_register_types(void)
8557 {
8558 type_register_static(&nvme_info);
8559 type_register_static(&nvme_bus_info);
8560 }
8561
8562 type_init(nvme_register_types)