/* pe operations */
#define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT)
#define VTD_PE_GET_LEVEL(pe) (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW))
-#define VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write) {\
- if (ret_fr) { \
- ret_fr = -ret_fr; \
- if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { \
- trace_vtd_fault_disabled(); \
- } else { \
- vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write); \
- } \
- goto error; \
- } \
-}
+
+/*
+ * PCI bus number (or SID) is not reliable since the device is usually
+ * initialized before the guest can configure the PCI bridge
+ * (SECONDARY_BUS_NUMBER).
+ */
+struct vtd_as_key {
+ PCIBus *bus;
+ uint8_t devfn;
+ uint32_t pasid;
+};
+
+struct vtd_iotlb_key {
+ uint64_t gfn;
+ uint32_t pasid;
+ uint16_t sid;
+ uint8_t level;
+};
static void vtd_address_space_refresh_all(IntelIOMMUState *s);
static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);
}
}
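+/*
+ * Recompute s->iq_dw from IQA_REG: with scalable mode (SMTS) the guest
+ * may select 256-bit invalidation descriptors via the Descriptor Width
+ * (DW) bit.
+ */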
+static void vtd_update_iq_dw(IntelIOMMUState *s)
+{
+ uint64_t val = vtd_get_quad_raw(s, DMAR_IQA_REG);
+
+ if (s->ecap & VTD_ECAP_SMTS &&
+ val & VTD_IQA_DW_MASK) {
+ s->iq_dw = true;
+ } else {
+ s->iq_dw = false;
+ }
+}
+
/* Whether the address space needs to notify new mappings */
static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as)
{
}
/* GHashTable functions */
-static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
+static gboolean vtd_iotlb_equal(gconstpointer v1, gconstpointer v2)
+{
+ const struct vtd_iotlb_key *key1 = v1;
+ const struct vtd_iotlb_key *key2 = v2;
+
+ return key1->sid == key2->sid &&
+ key1->pasid == key2->pasid &&
+ key1->level == key2->level &&
+ key1->gfn == key2->gfn;
+}
+
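+/*
+ * Mix gfn, sid, level and pasid into a single 64-bit value and fold it
+ * down to 32 bits; collisions are resolved by vtd_iotlb_equal().
+ */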
+static guint vtd_iotlb_hash(gconstpointer v)
+{
+ const struct vtd_iotlb_key *key = v;
+ uint64_t hash64 = key->gfn | ((uint64_t)(key->sid) << VTD_IOTLB_SID_SHIFT) |
+ (uint64_t)(key->level - 1) << VTD_IOTLB_LVL_SHIFT |
+ (uint64_t)(key->pasid) << VTD_IOTLB_PASID_SHIFT;
+
+ return (guint)((hash64 >> 32) ^ (hash64 & 0xffffffffU));
+}
+
+static gboolean vtd_as_equal(gconstpointer v1, gconstpointer v2)
{
- return *((const uint64_t *)v1) == *((const uint64_t *)v2);
+ const struct vtd_as_key *key1 = v1;
+ const struct vtd_as_key *key2 = v2;
+
+ return (key1->bus == key2->bus) && (key1->devfn == key2->devfn) &&
+ (key1->pasid == key2->pasid);
}
-static guint vtd_uint64_hash(gconstpointer v)
+/*
+ * Note that we use pointer to PCIBus as the key, so hashing/shifting
+ * based on the pointer value is intended. Note that we deal with
+ * collisions through vtd_as_equal().
+ */
+static guint vtd_as_hash(gconstpointer v)
{
- return (guint)*(const uint64_t *)v;
+ const struct vtd_as_key *key = v;
+ guint value = (guint)(uintptr_t)key->bus;
+
+ return (guint)(value << 8 | key->devfn);
}
static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value,
static void vtd_reset_context_cache_locked(IntelIOMMUState *s)
{
VTDAddressSpace *vtd_as;
- VTDBus *vtd_bus;
- GHashTableIter bus_it;
- uint32_t devfn_it;
+ GHashTableIter as_it;
trace_vtd_context_cache_reset();
- g_hash_table_iter_init(&bus_it, s->vtd_as_by_busptr);
+ g_hash_table_iter_init(&as_it, s->vtd_address_spaces);
- while (g_hash_table_iter_next (&bus_it, NULL, (void**)&vtd_bus)) {
- for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) {
- vtd_as = vtd_bus->dev_as[devfn_it];
- if (!vtd_as) {
- continue;
- }
- vtd_as->context_cache_entry.context_cache_gen = 0;
- }
+ while (g_hash_table_iter_next(&as_it, NULL, (void **)&vtd_as)) {
+ vtd_as->context_cache_entry.context_cache_gen = 0;
}
s->context_cache_gen = 1;
}
vtd_iommu_unlock(s);
}
-static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id,
- uint32_t level)
-{
- return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) |
- ((uint64_t)(level) << VTD_IOTLB_LVL_SHIFT);
-}
-
static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level)
{
return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K;
/* Must be called with IOMMU lock held */
static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id,
- hwaddr addr)
+ uint32_t pasid, hwaddr addr)
{
+ struct vtd_iotlb_key key;
VTDIOTLBEntry *entry;
- uint64_t key;
int level;
for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) {
- key = vtd_get_iotlb_key(vtd_get_iotlb_gfn(addr, level),
- source_id, level);
+ key.gfn = vtd_get_iotlb_gfn(addr, level);
+ key.level = level;
+ key.sid = source_id;
+ key.pasid = pasid;
entry = g_hash_table_lookup(s->iotlb, &key);
if (entry) {
goto out;
/* Must be with IOMMU lock held */
static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id,
uint16_t domain_id, hwaddr addr, uint64_t slpte,
- uint8_t access_flags, uint32_t level)
+ uint8_t access_flags, uint32_t level,
+ uint32_t pasid)
{
VTDIOTLBEntry *entry = g_malloc(sizeof(*entry));
- uint64_t *key = g_malloc(sizeof(*key));
+ struct vtd_iotlb_key *key = g_malloc(sizeof(*key));
uint64_t gfn = vtd_get_iotlb_gfn(addr, level);
trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id);
entry->slpte = slpte;
entry->access_flags = access_flags;
entry->mask = vtd_slpt_level_page_mask(level);
- *key = vtd_get_iotlb_key(gfn, source_id, level);
+ entry->pasid = pasid;
+
+ key->gfn = gfn;
+ key->sid = source_id;
+ key->level = level;
+ key->pasid = pasid;
+
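+    /* The IOTLB hash table owns both key and entry (freed via g_free) */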
g_hash_table_replace(s->iotlb, key, entry);
}
trace_vtd_irq_generate(msi.address, msi.data);
- apic_get_class()->send_msi(&msi);
+ apic_get_class(NULL)->send_msi(&msi);
}
/* Generate a fault event to software via MSI if conditions are met.
/* Must not update F field now, should be done later */
static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index,
- uint16_t source_id, hwaddr addr,
- VTDFaultReason fault, bool is_write)
+ uint64_t hi, uint64_t lo)
{
- uint64_t hi = 0, lo;
hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
assert(index < DMAR_FRCD_REG_NR);
- lo = VTD_FRCD_FI(addr);
- hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault);
- if (!is_write) {
- hi |= VTD_FRCD_T;
- }
vtd_set_quad_raw(s, frcd_reg_addr, lo);
vtd_set_quad_raw(s, frcd_reg_addr + 8, hi);
}
/* Log and report a DMAR (address translation) fault to software */
-static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id,
- hwaddr addr, VTDFaultReason fault,
- bool is_write)
+static void vtd_report_frcd_fault(IntelIOMMUState *s, uint64_t source_id,
+ uint64_t hi, uint64_t lo)
{
uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
- assert(fault < VTD_FR_MAX);
-
- if (fault == VTD_FR_RESERVED_ERR) {
- /* This is not a normal fault reason case. Drop it. */
- return;
- }
-
- trace_vtd_dmar_fault(source_id, fault, addr, is_write);
-
if (fsts_reg & VTD_FSTS_PFO) {
error_report_once("New fault is not recorded due to "
"Primary Fault Overflow");
return;
}
- vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write);
+ vtd_record_frcd(s, s->next_frcd_reg, hi, lo);
if (fsts_reg & VTD_FSTS_PPF) {
error_report_once("There are pending faults already, "
}
}
+/* Log and report a DMAR (address translation) fault to software */
+static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id,
+ hwaddr addr, VTDFaultReason fault,
+ bool is_write, bool is_pasid,
+ uint32_t pasid)
+{
+ uint64_t hi, lo;
+
+ assert(fault < VTD_FR_MAX);
+
+ trace_vtd_dmar_fault(source_id, fault, addr, is_write);
+
+ lo = VTD_FRCD_FI(addr);
+ hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault) |
+ VTD_FRCD_PV(pasid) | VTD_FRCD_PP(is_pasid);
+ if (!is_write) {
+ hi |= VTD_FRCD_T;
+ }
+
+ vtd_report_frcd_fault(s, source_id, hi, lo);
+}
+
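+/* Log and report an interrupt-remapping fault to software */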
+static void vtd_report_ir_fault(IntelIOMMUState *s, uint64_t source_id,
+ VTDFaultReason fault, uint16_t index)
+{
+ uint64_t hi, lo;
+
+ lo = VTD_FRCD_IR_IDX(index);
+ hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault);
+
+ vtd_report_frcd_fault(s, source_id, hi, lo);
+}
+
/* Handle Invalidation Queue Errors of queued invalidation interface error
* conditions.
*/
return -VTD_FR_PASID_TABLE_INV;
}
+ pdire->val = le64_to_cpu(pdire->val);
+
return 0;
}
pe, entry_size, MEMTXATTRS_UNSPECIFIED)) {
return -VTD_FR_PASID_TABLE_INV;
}
+ for (size_t i = 0; i < ARRAY_SIZE(pe->val); i++) {
+ pe->val[i] = le64_to_cpu(pe->val[i]);
+ }
/* Do translation type check */
if (!vtd_pe_type_check(x86_iommu, pe)) {
static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s,
VTDContextEntry *ce,
- VTDPASIDEntry *pe)
+ VTDPASIDEntry *pe,
+ uint32_t pasid)
{
- uint32_t pasid;
dma_addr_t pasid_dir_base;
int ret = 0;
- pasid = VTD_CE_GET_RID2PASID(ce);
+ if (pasid == PCI_NO_PASID) {
+ pasid = VTD_CE_GET_RID2PASID(ce);
+ }
pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce);
ret = vtd_get_pe_from_pasid_table(s, pasid_dir_base, pasid, pe);
static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s,
VTDContextEntry *ce,
- bool *pe_fpd_set)
+ bool *pe_fpd_set,
+ uint32_t pasid)
{
int ret;
- uint32_t pasid;
dma_addr_t pasid_dir_base;
VTDPASIDDirEntry pdire;
VTDPASIDEntry pe;
- pasid = VTD_CE_GET_RID2PASID(ce);
+ if (pasid == PCI_NO_PASID) {
+ pasid = VTD_CE_GET_RID2PASID(ce);
+ }
pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce);
/*
}
static uint32_t vtd_get_iova_level(IntelIOMMUState *s,
- VTDContextEntry *ce)
+ VTDContextEntry *ce,
+ uint32_t pasid)
{
VTDPASIDEntry pe;
if (s->root_scalable) {
- vtd_ce_get_rid2pasid_entry(s, ce, &pe);
+ vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid);
return VTD_PE_GET_LEVEL(&pe);
}
}
static uint32_t vtd_get_iova_agaw(IntelIOMMUState *s,
- VTDContextEntry *ce)
+ VTDContextEntry *ce,
+ uint32_t pasid)
{
VTDPASIDEntry pe;
if (s->root_scalable) {
- vtd_ce_get_rid2pasid_entry(s, ce, &pe);
+ vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid);
return 30 + ((pe.val[0] >> 2) & VTD_SM_PASID_ENTRY_AW) * 9;
}
}
static inline uint64_t vtd_iova_limit(IntelIOMMUState *s,
- VTDContextEntry *ce, uint8_t aw)
+ VTDContextEntry *ce, uint8_t aw,
+ uint32_t pasid)
{
- uint32_t ce_agaw = vtd_get_iova_agaw(s, ce);
+ uint32_t ce_agaw = vtd_get_iova_agaw(s, ce, pasid);
return 1ULL << MIN(ce_agaw, aw);
}
/* Return true if IOVA passes range check, otherwise false. */
static inline bool vtd_iova_range_check(IntelIOMMUState *s,
uint64_t iova, VTDContextEntry *ce,
- uint8_t aw)
+ uint8_t aw, uint32_t pasid)
{
/*
* Check if @iova is above 2^X-1, where X is the minimum of MGAW
* in CAP_REG and AW in context-entry.
*/
- return !(iova & ~(vtd_iova_limit(s, ce, aw) - 1));
+ return !(iova & ~(vtd_iova_limit(s, ce, aw, pasid) - 1));
}
static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s,
- VTDContextEntry *ce)
+ VTDContextEntry *ce,
+ uint32_t pasid)
{
VTDPASIDEntry pe;
if (s->root_scalable) {
- vtd_ce_get_rid2pasid_entry(s, ce, &pe);
+ vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid);
return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
}
* Rsvd field masks for spte:
* vtd_spte_rsvd 4k pages
* vtd_spte_rsvd_large large pages
+ *
+ * We support only 3-level and 4-level page tables (see vtd_init() which
+ * sets only VTD_CAP_SAGAW_39bit and maybe VTD_CAP_SAGAW_48bit bits in s->cap).
*/
-static uint64_t vtd_spte_rsvd[5];
-static uint64_t vtd_spte_rsvd_large[5];
+#define VTD_SPTE_RSVD_LEN 5
+static uint64_t vtd_spte_rsvd[VTD_SPTE_RSVD_LEN];
+static uint64_t vtd_spte_rsvd_large[VTD_SPTE_RSVD_LEN];
static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
{
- uint64_t rsvd_mask = vtd_spte_rsvd[level];
+ uint64_t rsvd_mask;
+
+ /*
+ * We should have caught a guest-mis-programmed level earlier,
+ * via vtd_is_level_supported.
+ */
+ assert(level < VTD_SPTE_RSVD_LEN);
+ /*
+     * Level zero doesn't exist. The smallest level is VTD_SL_PT_LEVEL=1,
+     * as checked by vtd_is_last_slpte().
+ */
+ assert(level);
if ((level == VTD_SL_PD_LEVEL || level == VTD_SL_PDP_LEVEL) &&
(slpte & VTD_SL_PT_PAGE_SIZE_MASK)) {
/* large page */
rsvd_mask = vtd_spte_rsvd_large[level];
+ } else {
+ rsvd_mask = vtd_spte_rsvd[level];
}
return slpte & rsvd_mask;
}
-/* Find the VTD address space associated with a given bus number */
-static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
-{
- VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num];
- GHashTableIter iter;
-
- if (vtd_bus) {
- return vtd_bus;
- }
-
- /*
- * Iterate over the registered buses to find the one which
- * currently holds this bus number and update the bus_num
- * lookup table.
- */
- g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
- while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) {
- if (pci_bus_num(vtd_bus->bus) == bus_num) {
- s->vtd_as_by_bus_num[bus_num] = vtd_bus;
- return vtd_bus;
- }
- }
-
- return NULL;
-}
-
/* Given the @iova, get relevant @slptep. @slpte_level will be the last level
 * of the translation, which can be used to decide the size of a large page.
*/
static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce,
uint64_t iova, bool is_write,
uint64_t *slptep, uint32_t *slpte_level,
- bool *reads, bool *writes, uint8_t aw_bits)
+ bool *reads, bool *writes, uint8_t aw_bits,
+ uint32_t pasid)
{
- dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce);
- uint32_t level = vtd_get_iova_level(s, ce);
+ dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce, pasid);
+ uint32_t level = vtd_get_iova_level(s, ce, pasid);
uint32_t offset;
uint64_t slpte;
uint64_t access_right_check;
+ uint64_t xlat, size;
- if (!vtd_iova_range_check(s, iova, ce, aw_bits)) {
- error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 ")",
- __func__, iova);
+ if (!vtd_iova_range_check(s, iova, ce, aw_bits, pasid)) {
+        error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 ", "
+                          "pasid=0x%" PRIx32 ")", __func__, iova, pasid);
return -VTD_FR_ADDR_BEYOND_MGAW;
}
if (slpte == (uint64_t)-1) {
error_report_once("%s: detected read error on DMAR slpte "
- "(iova=0x%" PRIx64 ")", __func__, iova);
- if (level == vtd_get_iova_level(s, ce)) {
+ "(iova=0x%" PRIx64 ", pasid=0x%" PRIx32 ")",
+ __func__, iova, pasid);
+ if (level == vtd_get_iova_level(s, ce, pasid)) {
/* Invalid programming of context-entry */
return -VTD_FR_CONTEXT_ENTRY_INV;
} else {
if (!(slpte & access_right_check)) {
error_report_once("%s: detected slpte permission error "
"(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", "
- "slpte=0x%" PRIx64 ", write=%d)", __func__,
- iova, level, slpte, is_write);
+ "slpte=0x%" PRIx64 ", write=%d, pasid=0x%"
+ PRIx32 ")", __func__, iova, level,
+ slpte, is_write, pasid);
return is_write ? -VTD_FR_WRITE : -VTD_FR_READ;
}
if (vtd_slpte_nonzero_rsvd(slpte, level)) {
error_report_once("%s: detected splte reserve non-zero "
"iova=0x%" PRIx64 ", level=0x%" PRIx32
- "slpte=0x%" PRIx64 ")", __func__, iova,
- level, slpte);
+                              "slpte=0x%" PRIx64 ", pasid=0x%" PRIx32 ")",
+ __func__, iova, level, slpte, pasid);
return -VTD_FR_PAGING_ENTRY_RSVD;
}
if (vtd_is_last_slpte(slpte, level)) {
*slptep = slpte;
*slpte_level = level;
- return 0;
+ break;
}
addr = vtd_get_slpte_addr(slpte, aw_bits);
level--;
}
+
+ xlat = vtd_get_slpte_addr(*slptep, aw_bits);
+ size = ~vtd_slpt_level_page_mask(level) + 1;
+
+ /*
+ * From VT-d spec 3.14: Untranslated requests and translation
+ * requests that result in an address in the interrupt range will be
+ * blocked with condition code LGN.4 or SGN.8.
+ */
+ if ((xlat > VTD_INTERRUPT_ADDR_LAST ||
+ xlat + size - 1 < VTD_INTERRUPT_ADDR_FIRST)) {
+ return 0;
+ } else {
+ error_report_once("%s: xlat address is in interrupt range "
+ "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", "
+ "slpte=0x%" PRIx64 ", write=%d, "
+ "xlat=0x%" PRIx64 ", size=0x%" PRIx64 ", "
+ "pasid=0x%" PRIx32 ")",
+ __func__, iova, level, slpte, is_write,
+ xlat, size, pasid);
+ return s->scalable_mode ? -VTD_FR_SM_INTERRUPT_ADDR :
+ -VTD_FR_INTERRUPT_ADDR;
+ }
}
typedef int (*vtd_page_walk_hook)(IOMMUTLBEvent *event, void *private);
return ret;
}
/* Drop any existing mapping */
- iova_tree_remove(as->iova_tree, &target);
+ iova_tree_remove(as->iova_tree, target);
/* Recover the correct type */
event->type = IOMMU_NOTIFIER_MAP;
entry->perm = cache_perm;
trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
return 0;
}
- iova_tree_remove(as->iova_tree, &target);
+ iova_tree_remove(as->iova_tree, target);
}
trace_vtd_page_walk_one(info->domain_id, entry->iova,
*/
static int vtd_page_walk(IntelIOMMUState *s, VTDContextEntry *ce,
uint64_t start, uint64_t end,
- vtd_page_walk_info *info)
+ vtd_page_walk_info *info,
+ uint32_t pasid)
{
- dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce);
- uint32_t level = vtd_get_iova_level(s, ce);
+ dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce, pasid);
+ uint32_t level = vtd_get_iova_level(s, ce, pasid);
- if (!vtd_iova_range_check(s, start, ce, info->aw)) {
+ if (!vtd_iova_range_check(s, start, ce, info->aw, pasid)) {
return -VTD_FR_ADDR_BEYOND_MGAW;
}
- if (!vtd_iova_range_check(s, end, ce, info->aw)) {
+ if (!vtd_iova_range_check(s, end, ce, info->aw, pasid)) {
/* Fix end so that it reaches the maximum */
- end = vtd_iova_limit(s, ce, info->aw);
+ end = vtd_iova_limit(s, ce, info->aw, pasid);
}
return vtd_page_walk_level(addr, start, end, level, true, true, info);
* has valid rid2pasid setting, which includes valid
* rid2pasid field and corresponding pasid entry setting
*/
- return vtd_ce_get_rid2pasid_entry(s, ce, &pe);
+ return vtd_ce_get_rid2pasid_entry(s, ce, &pe, PCI_NO_PASID);
}
/* Map a device to its corresponding domain (context-entry) */
}
static uint16_t vtd_get_domain_id(IntelIOMMUState *s,
- VTDContextEntry *ce)
+ VTDContextEntry *ce,
+ uint32_t pasid)
{
VTDPASIDEntry pe;
if (s->root_scalable) {
- vtd_ce_get_rid2pasid_entry(s, ce, &pe);
+ vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid);
return VTD_SM_PASID_ENTRY_DID(pe.val[1]);
}
.notify_unmap = true,
.aw = s->aw_bits,
.as = vtd_as,
- .domain_id = vtd_get_domain_id(s, ce),
+ .domain_id = vtd_get_domain_id(s, ce, vtd_as->pasid),
};
- return vtd_page_walk(s, ce, addr, addr + size, &info);
+ return vtd_page_walk(s, ce, addr, addr + size, &info, vtd_as->pasid);
}
-static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as)
+static int vtd_address_space_sync(VTDAddressSpace *vtd_as)
{
int ret;
VTDContextEntry ce;
IOMMUNotifier *n;
- if (!(vtd_as->iommu.iommu_notify_flags & IOMMU_NOTIFIER_IOTLB_EVENTS)) {
+    /* If no MAP notifier is registered, we simply invalidate all the caches */
+ if (!vtd_as_has_map_notifier(vtd_as)) {
+ IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) {
+ memory_region_unmap_iommu_notifier_range(n);
+ }
return 0;
}
* 1st-level translation or 2nd-level translation, it depends
* on PGTT setting.
*/
-static bool vtd_dev_pt_enabled(IntelIOMMUState *s, VTDContextEntry *ce)
+static bool vtd_dev_pt_enabled(IntelIOMMUState *s, VTDContextEntry *ce,
+ uint32_t pasid)
{
VTDPASIDEntry pe;
int ret;
if (s->root_scalable) {
- ret = vtd_ce_get_rid2pasid_entry(s, ce, &pe);
+ ret = vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid);
if (ret) {
- error_report_once("%s: vtd_ce_get_rid2pasid_entry error: %"PRId32,
- __func__, ret);
+ /*
+             * This error is guest triggerable. We should assume PT is
+             * not enabled for safety.
+ */
return false;
}
return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT);
{
IntelIOMMUState *s;
VTDContextEntry ce;
- int ret;
assert(as);
s = as->iommu_state;
- ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus),
- as->devfn, &ce);
- if (ret) {
+ if (vtd_dev_to_context_entry(s, pci_bus_num(as->bus), as->devfn,
+ &ce)) {
/*
* Possibly failed to parse the context entry for some reason
* (e.g., during init, or any guest configuration errors on
return false;
}
- return vtd_dev_pt_enabled(s, &ce);
+ return vtd_dev_pt_enabled(s, &ce, as->pasid);
}
/* Return whether the device is using IOMMU translation. */
static bool vtd_switch_address_space(VTDAddressSpace *as)
{
- bool use_iommu;
+ bool use_iommu, pt;
/* Whether we need to take the BQL on our own */
- bool take_bql = !qemu_mutex_iothread_locked();
+ bool take_bql = !bql_locked();
assert(as);
use_iommu = as->iommu_state->dmar_enabled && !vtd_as_pt_enabled(as);
+ pt = as->iommu_state->dmar_enabled && vtd_as_pt_enabled(as);
trace_vtd_switch_address_space(pci_bus_num(as->bus),
VTD_PCI_SLOT(as->devfn),
* it. We'd better make sure we have had it already, or, take it.
*/
if (take_bql) {
- qemu_mutex_lock_iothread();
+ bql_lock();
}
/* Turn off first then on the other */
if (use_iommu) {
memory_region_set_enabled(&as->nodmar, false);
memory_region_set_enabled(MEMORY_REGION(&as->iommu), true);
+ /*
+ * vt-d spec v3.4 3.14:
+ *
+ * """
+ * Requests-with-PASID with input address in range 0xFEEx_xxxx
+ * are translated normally like any other request-with-PASID
+ * through DMA-remapping hardware.
+ * """
+ *
+         * Need to disable IR for the address space with PASID.
+ */
+ if (as->pasid != PCI_NO_PASID) {
+ memory_region_set_enabled(&as->iommu_ir, false);
+ } else {
+ memory_region_set_enabled(&as->iommu_ir, true);
+ }
} else {
memory_region_set_enabled(MEMORY_REGION(&as->iommu), false);
memory_region_set_enabled(&as->nodmar, true);
}
+ /*
+ * vtd-spec v3.4 3.14:
+ *
+ * """
+ * Requests-with-PASID with input address in range 0xFEEx_xxxx are
+ * translated normally like any other request-with-PASID through
+ * DMA-remapping hardware. However, if such a request is processed
+ * using pass-through translation, it will be blocked as described
+ * in the paragraph below.
+ *
+ * Software must not program paging-structure entries to remap any
+ * address to the interrupt address range. Untranslated requests
+ * and translation requests that result in an address in the
+ * interrupt range will be blocked with condition code LGN.4 or
+ * SGN.8.
+ * """
+ *
+     * We enable the per-as memory region (iommu_ir_fault) to catch
+     * translations to the interrupt range through PASID + PT.
+ */
+ if (pt && as->pasid != PCI_NO_PASID) {
+ memory_region_set_enabled(&as->iommu_ir_fault, true);
+ } else {
+ memory_region_set_enabled(&as->iommu_ir_fault, false);
+ }
+
if (take_bql) {
- qemu_mutex_unlock_iothread();
+ bql_unlock();
}
return use_iommu;
static void vtd_switch_address_space_all(IntelIOMMUState *s)
{
+ VTDAddressSpace *vtd_as;
GHashTableIter iter;
- VTDBus *vtd_bus;
- int i;
-
- g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
- while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) {
- for (i = 0; i < PCI_DEVFN_MAX; i++) {
- if (!vtd_bus->dev_as[i]) {
- continue;
- }
- vtd_switch_address_space(vtd_bus->dev_as[i]);
- }
- }
-}
-static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn)
-{
- return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL);
+ g_hash_table_iter_init(&iter, s->vtd_address_spaces);
+ while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_as)) {
+ vtd_switch_address_space(vtd_as);
+ }
}
static const bool vtd_qualified_faults[] = {
[VTD_FR_PAGING_ENTRY_INV] = true,
[VTD_FR_ROOT_TABLE_INV] = false,
[VTD_FR_CONTEXT_TABLE_INV] = false,
+ [VTD_FR_INTERRUPT_ADDR] = true,
[VTD_FR_ROOT_ENTRY_RSVD] = false,
[VTD_FR_PAGING_ENTRY_RSVD] = true,
[VTD_FR_CONTEXT_ENTRY_TT] = true,
[VTD_FR_PASID_TABLE_INV] = false,
- [VTD_FR_RESERVED_ERR] = false,
+ [VTD_FR_SM_INTERRUPT_ADDR] = true,
[VTD_FR_MAX] = false,
};
return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST;
}
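+/* GHashTable find callback: match an address space by its current SID */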
+static gboolean vtd_find_as_by_sid(gpointer key, gpointer value,
+ gpointer user_data)
+{
+ struct vtd_as_key *as_key = (struct vtd_as_key *)key;
+ uint16_t target_sid = *(uint16_t *)user_data;
+ uint16_t sid = PCI_BUILD_BDF(pci_bus_num(as_key->bus), as_key->devfn);
+ return sid == target_sid;
+}
+
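+/*
+ * Look up the address space for @sid, with a one-entry-per-bus-number
+ * cache in front of the full hash table scan.
+ */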
+static VTDAddressSpace *vtd_get_as_by_sid(IntelIOMMUState *s, uint16_t sid)
+{
+ uint8_t bus_num = PCI_BUS_NUM(sid);
+ VTDAddressSpace *vtd_as = s->vtd_as_cache[bus_num];
+
+ if (vtd_as &&
+ (sid == PCI_BUILD_BDF(pci_bus_num(vtd_as->bus), vtd_as->devfn))) {
+ return vtd_as;
+ }
+
+ vtd_as = g_hash_table_find(s->vtd_address_spaces, vtd_find_as_by_sid, &sid);
+ s->vtd_as_cache[bus_num] = vtd_as;
+
+ return vtd_as;
+}
+
static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id)
{
- VTDBus *vtd_bus;
VTDAddressSpace *vtd_as;
bool success = false;
- vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id));
- if (!vtd_bus) {
- goto out;
- }
-
- vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)];
+ vtd_as = vtd_get_as_by_sid(s, source_id);
if (!vtd_as) {
goto out;
}
trace_vtd_pt_enable_fast_path(source_id, success);
}
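+/*
+ * If the fault is qualified and the Fault Processing Disable bit is
+ * set, swallow the fault; otherwise record it in the fault registers.
+ */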
+static void vtd_report_fault(IntelIOMMUState *s,
+ int err, bool is_fpd_set,
+ uint16_t source_id,
+ hwaddr addr,
+ bool is_write,
+ bool is_pasid,
+ uint32_t pasid)
+{
+ if (is_fpd_set && vtd_is_qualified_fault(err)) {
+ trace_vtd_fault_disabled();
+ } else {
+ vtd_report_dmar_fault(s, source_id, addr, err, is_write,
+ is_pasid, pasid);
+ }
+}
+
/* Map dev to context-entry then do a paging-structures walk to do an IOMMU
* translation.
*
uint8_t bus_num = pci_bus_num(bus);
VTDContextCacheEntry *cc_entry;
uint64_t slpte, page_mask;
- uint32_t level;
- uint16_t source_id = vtd_make_source_id(bus_num, devfn);
+ uint32_t level, pasid = vtd_as->pasid;
+ uint16_t source_id = PCI_BUILD_BDF(bus_num, devfn);
int ret_fr;
bool is_fpd_set = false;
bool reads = true;
bool writes = true;
uint8_t access_flags;
+ bool rid2pasid = (pasid == PCI_NO_PASID) && s->root_scalable;
VTDIOTLBEntry *iotlb_entry;
/*
cc_entry = &vtd_as->context_cache_entry;
- /* Try to fetch slpte form IOTLB */
- iotlb_entry = vtd_lookup_iotlb(s, source_id, addr);
- if (iotlb_entry) {
- trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte,
- iotlb_entry->domain_id);
- slpte = iotlb_entry->slpte;
- access_flags = iotlb_entry->access_flags;
- page_mask = iotlb_entry->mask;
- goto out;
+    /* Try to fetch slpte from IOTLB; we don't need RID2PASID logic */
+ if (!rid2pasid) {
+ iotlb_entry = vtd_lookup_iotlb(s, source_id, pasid, addr);
+ if (iotlb_entry) {
+ trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte,
+ iotlb_entry->domain_id);
+ slpte = iotlb_entry->slpte;
+ access_flags = iotlb_entry->access_flags;
+ page_mask = iotlb_entry->mask;
+ goto out;
+ }
}
/* Try to fetch context-entry from cache first */
ce = cc_entry->context_entry;
is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
if (!is_fpd_set && s->root_scalable) {
- ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set);
- VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
+ ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set, pasid);
+ if (ret_fr) {
+ vtd_report_fault(s, -ret_fr, is_fpd_set,
+ source_id, addr, is_write,
+ false, 0);
+ goto error;
+ }
}
} else {
ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
if (!ret_fr && !is_fpd_set && s->root_scalable) {
- ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set);
+ ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set, pasid);
+ }
+ if (ret_fr) {
+ vtd_report_fault(s, -ret_fr, is_fpd_set,
+ source_id, addr, is_write,
+ false, 0);
+ goto error;
}
- VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
/* Update context-cache */
trace_vtd_iotlb_cc_update(bus_num, devfn, ce.hi, ce.lo,
cc_entry->context_cache_gen,
cc_entry->context_cache_gen = s->context_cache_gen;
}
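+    /*
+     * In scalable mode, a request without an explicit PASID goes
+     * through the RID2PASID entry; resolve the effective pasid from
+     * the context entry now.
+     */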
+ if (rid2pasid) {
+ pasid = VTD_CE_GET_RID2PASID(&ce);
+ }
+
/*
* We don't need to translate for pass-through context entries.
* Also, let's ignore IOTLB caching as well for PT devices.
*/
- if (vtd_dev_pt_enabled(s, &ce)) {
+ if (vtd_dev_pt_enabled(s, &ce, pasid)) {
entry->iova = addr & VTD_PAGE_MASK_4K;
entry->translated_addr = entry->iova;
entry->addr_mask = ~VTD_PAGE_MASK_4K;
return true;
}
+    /* Try to fetch slpte from IOTLB for the RID2PASID slow path */
+ if (rid2pasid) {
+ iotlb_entry = vtd_lookup_iotlb(s, source_id, pasid, addr);
+ if (iotlb_entry) {
+ trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte,
+ iotlb_entry->domain_id);
+ slpte = iotlb_entry->slpte;
+ access_flags = iotlb_entry->access_flags;
+ page_mask = iotlb_entry->mask;
+ goto out;
+ }
+ }
+
ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &slpte, &level,
- &reads, &writes, s->aw_bits);
- VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
+ &reads, &writes, s->aw_bits, pasid);
+ if (ret_fr) {
+ vtd_report_fault(s, -ret_fr, is_fpd_set, source_id,
+ addr, is_write, pasid != PCI_NO_PASID, pasid);
+ goto error;
+ }
page_mask = vtd_slpt_level_page_mask(level);
access_flags = IOMMU_ACCESS_FLAG(reads, writes);
- vtd_update_iotlb(s, source_id, vtd_get_domain_id(s, &ce), addr, slpte,
- access_flags, level);
+ vtd_update_iotlb(s, source_id, vtd_get_domain_id(s, &ce, pasid),
+ addr, slpte, access_flags, level, pasid);
out:
vtd_iommu_unlock(s);
entry->iova = addr & page_mask;
VTDAddressSpace *vtd_as;
QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
- vtd_sync_shadow_page_table(vtd_as);
+ vtd_address_space_sync(vtd_as);
}
}
uint16_t source_id,
uint16_t func_mask)
{
+ GHashTableIter as_it;
uint16_t mask;
- VTDBus *vtd_bus;
VTDAddressSpace *vtd_as;
uint8_t bus_n, devfn;
- uint16_t devfn_it;
trace_vtd_inv_desc_cc_devices(source_id, func_mask);
mask = ~mask;
bus_n = VTD_SID_TO_BUS(source_id);
- vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
- if (vtd_bus) {
- devfn = VTD_SID_TO_DEVFN(source_id);
- for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) {
- vtd_as = vtd_bus->dev_as[devfn_it];
- if (vtd_as && ((devfn_it & mask) == (devfn & mask))) {
- trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it),
- VTD_PCI_FUNC(devfn_it));
- vtd_iommu_lock(s);
- vtd_as->context_cache_entry.context_cache_gen = 0;
- vtd_iommu_unlock(s);
- /*
- * Do switch address space when needed, in case if the
- * device passthrough bit is switched.
- */
- vtd_switch_address_space(vtd_as);
- /*
- * So a device is moving out of (or moving into) a
- * domain, resync the shadow page table.
- * This won't bring bad even if we have no such
- * notifier registered - the IOMMU notification
- * framework will skip MAP notifications if that
- * happened.
- */
- vtd_sync_shadow_page_table(vtd_as);
- }
+ devfn = VTD_SID_TO_DEVFN(source_id);
+
+ g_hash_table_iter_init(&as_it, s->vtd_address_spaces);
+ while (g_hash_table_iter_next(&as_it, NULL, (void **)&vtd_as)) {
+ if ((pci_bus_num(vtd_as->bus) == bus_n) &&
+ (vtd_as->devfn & mask) == (devfn & mask)) {
+ trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(vtd_as->devfn),
+ VTD_PCI_FUNC(vtd_as->devfn));
+ vtd_iommu_lock(s);
+ vtd_as->context_cache_entry.context_cache_gen = 0;
+ vtd_iommu_unlock(s);
+ /*
+             * Do switch address space when needed, in case the
+ * device passthrough bit is switched.
+ */
+ vtd_switch_address_space(vtd_as);
+ /*
+ * So a device is moving out of (or moving into) a
+ * domain, resync the shadow page table.
+             * This won't hurt even if we have no such
+ * notifier registered - the IOMMU notification
+ * framework will skip MAP notifications if that
+ * happened.
+ */
+ vtd_address_space_sync(vtd_as);
}
}
}
QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
vtd_as->devfn, &ce) &&
- domain_id == vtd_get_domain_id(s, &ce)) {
- vtd_sync_shadow_page_table(vtd_as);
+ domain_id == vtd_get_domain_id(s, &ce, vtd_as->pasid)) {
+ vtd_address_space_sync(vtd_as);
}
}
}
static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
uint16_t domain_id, hwaddr addr,
- uint8_t am)
+ uint8_t am, uint32_t pasid)
{
VTDAddressSpace *vtd_as;
VTDContextEntry ce;
hwaddr size = (1 << am) * VTD_PAGE_SIZE;
QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) {
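+        /* A PASID-selective invalidation only touches the matching as */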
+ if (pasid != PCI_NO_PASID && pasid != vtd_as->pasid) {
+ continue;
+ }
ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
vtd_as->devfn, &ce);
- if (!ret && domain_id == vtd_get_domain_id(s, &ce)) {
+ if (!ret && domain_id == vtd_get_domain_id(s, &ce, vtd_as->pasid)) {
if (vtd_as_has_map_notifier(vtd_as)) {
/*
* As long as we have MAP notifications registered in
vtd_iommu_lock(s);
g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
vtd_iommu_unlock(s);
- vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
+ vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am, PCI_NO_PASID);
}
/* Flush IOTLB
{
VTDAddressSpace *vtd_dev_as;
IOMMUTLBEvent event;
- struct VTDBus *vtd_bus;
hwaddr addr;
uint64_t sz;
uint16_t sid;
- uint8_t devfn;
bool size;
- uint8_t bus_num;
addr = VTD_INV_DESC_DEVICE_IOTLB_ADDR(inv_desc->hi);
sid = VTD_INV_DESC_DEVICE_IOTLB_SID(inv_desc->lo);
- devfn = sid & 0xff;
- bus_num = sid >> 8;
size = VTD_INV_DESC_DEVICE_IOTLB_SIZE(inv_desc->hi);
if ((inv_desc->lo & VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO) ||
return false;
}
- vtd_bus = vtd_find_as_from_bus_num(s, bus_num);
- if (!vtd_bus) {
- goto done;
- }
-
- vtd_dev_as = vtd_bus->dev_as[devfn];
+ /*
+ * Using sid is OK since the guest should have finished the
+ * initialization of both the bus and device.
+ */
+ vtd_dev_as = vtd_get_as_by_sid(s, sid);
if (!vtd_dev_as) {
goto done;
}
} else {
vtd_set_quad(s, addr, val);
}
- if (s->ecap & VTD_ECAP_SMTS &&
- val & VTD_IQA_DW_MASK) {
- s->iq_dw = true;
- } else {
- s->iq_dw = false;
- }
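+        /* The IQA_REG write may have toggled DW; recompute s->iq_dw */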
+ vtd_update_iq_dw(s);
break;
case DMAR_IQA_REG_HI:
{
VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
IntelIOMMUState *s = vtd_as->iommu_state;
+ X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
/* TODO: add support for VFIO and vhost users */
if (s->snoop_control) {
"Snoop Control with vhost or VFIO is not supported");
return -ENOTSUP;
}
+ if (!s->caching_mode && (new & IOMMU_NOTIFIER_MAP)) {
+ error_setg_errno(errp, ENOTSUP,
+ "device %02x.%02x.%x requires caching mode",
+ pci_bus_num(vtd_as->bus), PCI_SLOT(vtd_as->devfn),
+ PCI_FUNC(vtd_as->devfn));
+ return -ENOTSUP;
+ }
+ if (!x86_iommu->dt_supported && (new & IOMMU_NOTIFIER_DEVIOTLB_UNMAP)) {
+ error_setg_errno(errp, ENOTSUP,
+ "device %02x.%02x.%x requires device IOTLB mode",
+ pci_bus_num(vtd_as->bus), PCI_SLOT(vtd_as->devfn),
+ PCI_FUNC(vtd_as->devfn));
+ return -ENOTSUP;
+ }
/* Update per-address-space notifier flags */
vtd_as->notifier_flags = new;
{
IntelIOMMUState *iommu = opaque;
- /*
- * Memory regions are dynamically turned on/off depending on
- * context entry configurations from the guest. After migration,
- * we need to make sure the memory regions are still correct.
- */
- vtd_switch_address_space_all(iommu);
-
/*
* We don't need to migrate the root_scalable because we can
* simply do the calculation after the loading is complete. We
*/
vtd_update_scalable_state(iommu);
+ vtd_update_iq_dw(iommu);
+
+ /*
+ * Memory regions are dynamically turned on/off depending on
+ * context entry configurations from the guest. After migration,
+ * we need to make sure the memory regions are still correct.
+ */
+ vtd_switch_address_space_all(iommu);
+
return 0;
}
.minimum_version_id = 1,
.priority = MIG_PRI_IOMMU,
.post_load = vtd_post_load,
- .fields = (VMStateField[]) {
+ .fields = (const VMStateField[]) {
VMSTATE_UINT64(root, IntelIOMMUState),
VMSTATE_UINT64(intr_root, IntelIOMMUState),
VMSTATE_UINT64(iq, IntelIOMMUState),
DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE),
DEFINE_PROP_BOOL("snoop-control", IntelIOMMUState, snoop_control, false),
+ DEFINE_PROP_BOOL("x-pasid-mode", IntelIOMMUState, pasid, false),
DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
DEFINE_PROP_BOOL("dma-translation", IntelIOMMUState, dma_translation, true),
DEFINE_PROP_END_OF_LIST(),
};
/* Read IRTE entry with specific index */
-static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index,
- VTD_IR_TableEntry *entry, uint16_t sid)
+static bool vtd_irte_get(IntelIOMMUState *iommu, uint16_t index,
+ VTD_IR_TableEntry *entry, uint16_t sid,
+ bool do_fault)
{
static const uint16_t vtd_svt_mask[VTD_SQ_MAX] = \
{0xffff, 0xfffb, 0xfff9, 0xfff8};
if (index >= iommu->intr_size) {
error_report_once("%s: index too large: ind=0x%x",
__func__, index);
- return -VTD_FR_IR_INDEX_OVER;
+ if (do_fault) {
+ vtd_report_ir_fault(iommu, sid, VTD_FR_IR_INDEX_OVER, index);
+ }
+ return false;
}
addr = iommu->intr_root + index * sizeof(*entry);
entry, sizeof(*entry), MEMTXATTRS_UNSPECIFIED)) {
error_report_once("%s: read failed: ind=0x%x addr=0x%" PRIx64,
__func__, index, addr);
- return -VTD_FR_IR_ROOT_INVAL;
+ if (do_fault) {
+ vtd_report_ir_fault(iommu, sid, VTD_FR_IR_ROOT_INVAL, index);
+ }
+ return false;
}
- trace_vtd_ir_irte_get(index, le64_to_cpu(entry->data[1]),
- le64_to_cpu(entry->data[0]));
+ entry->data[0] = le64_to_cpu(entry->data[0]);
+ entry->data[1] = le64_to_cpu(entry->data[1]);
+
+ trace_vtd_ir_irte_get(index, entry->data[1], entry->data[0]);
+
+ /*
+ * The remaining potential fault conditions are "qualified" by the
+ * Fault Processing Disable bit in the IRTE. Even "not present".
+ * So just clear the do_fault flag if PFD is set, which will
+ * prevent faults being raised.
+ */
+ if (entry->irte.fault_disable) {
+ do_fault = false;
+ }
if (!entry->irte.present) {
error_report_once("%s: detected non-present IRTE "
"(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")",
- __func__, index, le64_to_cpu(entry->data[1]),
- le64_to_cpu(entry->data[0]));
- return -VTD_FR_IR_ENTRY_P;
+ __func__, index, entry->data[1], entry->data[0]);
+ if (do_fault) {
+ vtd_report_ir_fault(iommu, sid, VTD_FR_IR_ENTRY_P, index);
+ }
+ return false;
}
if (entry->irte.__reserved_0 || entry->irte.__reserved_1 ||
entry->irte.__reserved_2) {
error_report_once("%s: detected non-zero reserved IRTE "
"(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")",
- __func__, index, le64_to_cpu(entry->data[1]),
- le64_to_cpu(entry->data[0]));
- return -VTD_FR_IR_IRTE_RSVD;
+ __func__, index, entry->data[1], entry->data[0]);
+ if (do_fault) {
+ vtd_report_ir_fault(iommu, sid, VTD_FR_IR_IRTE_RSVD, index);
+ }
+ return false;
}
if (sid != X86_IOMMU_SID_INVALID) {
/* Validate IRTE SID */
- source_id = le32_to_cpu(entry->irte.source_id);
+ source_id = entry->irte.source_id;
switch (entry->irte.sid_vtype) {
case VTD_SVT_NONE:
break;
error_report_once("%s: invalid IRTE SID "
"(index=%u, sid=%u, source_id=%u)",
__func__, index, sid, source_id);
- return -VTD_FR_IR_SID_ERR;
+ if (do_fault) {
+ vtd_report_ir_fault(iommu, sid, VTD_FR_IR_SID_ERR, index);
+ }
+ return false;
}
break;
error_report_once("%s: invalid SVT_BUS "
"(index=%u, bus=%u, min=%u, max=%u)",
__func__, index, bus, bus_min, bus_max);
- return -VTD_FR_IR_SID_ERR;
+ if (do_fault) {
+ vtd_report_ir_fault(iommu, sid, VTD_FR_IR_SID_ERR, index);
+ }
+ return false;
}
break;
"(index=%u, type=%d)", __func__,
index, entry->irte.sid_vtype);
/* Take this as verification failure. */
- return -VTD_FR_IR_SID_ERR;
+ if (do_fault) {
+ vtd_report_ir_fault(iommu, sid, VTD_FR_IR_SID_ERR, index);
+ }
+ return false;
}
}
- return 0;
+ return true;
}
/* Fetch IRQ information of specific IR index */
-static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index,
- X86IOMMUIrq *irq, uint16_t sid)
+static bool vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index,
+ X86IOMMUIrq *irq, uint16_t sid, bool do_fault)
{
VTD_IR_TableEntry irte = {};
- int ret = 0;
- ret = vtd_irte_get(iommu, index, &irte, sid);
- if (ret) {
- return ret;
+ if (!vtd_irte_get(iommu, index, &irte, sid, do_fault)) {
+ return false;
}
irq->trigger_mode = irte.irte.trigger_mode;
irq->vector = irte.irte.vector;
irq->delivery_mode = irte.irte.delivery_mode;
- irq->dest = le32_to_cpu(irte.irte.dest_id);
+ irq->dest = irte.irte.dest_id;
if (!iommu->intr_eime) {
#define VTD_IR_APIC_DEST_MASK (0xff00ULL)
#define VTD_IR_APIC_DEST_SHIFT (8)
trace_vtd_ir_remap(index, irq->trigger_mode, irq->vector,
irq->delivery_mode, irq->dest, irq->dest_mode);
- return 0;
+ return true;
}
/* Interrupt remapping for MSI/MSI-X entry */
static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu,
MSIMessage *origin,
MSIMessage *translated,
- uint16_t sid)
+ uint16_t sid, bool do_fault)
{
- int ret = 0;
VTD_IR_MSIAddress addr;
uint16_t index;
X86IOMMUIrq irq = {};
if (origin->address & VTD_MSI_ADDR_HI_MASK) {
error_report_once("%s: MSI address high 32 bits non-zero detected: "
"address=0x%" PRIx64, __func__, origin->address);
- return -VTD_FR_IR_REQ_RSVD;
+ if (do_fault) {
+ vtd_report_ir_fault(iommu, sid, VTD_FR_IR_REQ_RSVD, 0);
+ }
+ return -EINVAL;
}
addr.data = origin->address & VTD_MSI_ADDR_LO_MASK;
if (addr.addr.__head != 0xfee) {
error_report_once("%s: MSI address low 32 bit invalid: 0x%" PRIx32,
__func__, addr.data);
- return -VTD_FR_IR_REQ_RSVD;
+ if (do_fault) {
+ vtd_report_ir_fault(iommu, sid, VTD_FR_IR_REQ_RSVD, 0);
+ }
+ return -EINVAL;
}
/* This is compatible mode. */
goto out;
}
- index = addr.addr.index_h << 15 | le16_to_cpu(addr.addr.index_l);
+ index = addr.addr.index_h << 15 | addr.addr.index_l;
#define VTD_IR_MSI_DATA_SUBHANDLE (0x0000ffff)
#define VTD_IR_MSI_DATA_RESERVED (0xffff0000)
index += origin->data & VTD_IR_MSI_DATA_SUBHANDLE;
}
- ret = vtd_remap_irq_get(iommu, index, &irq, sid);
- if (ret) {
- return ret;
+ if (!vtd_remap_irq_get(iommu, index, &irq, sid, do_fault)) {
+ return -EINVAL;
}
if (addr.addr.sub_valid) {
"(sid=%u, address=0x%" PRIx64
", data=0x%" PRIx32 ")",
__func__, sid, origin->address, origin->data);
- return -VTD_FR_IR_REQ_RSVD;
+ if (do_fault) {
+ vtd_report_ir_fault(iommu, sid, VTD_FR_IR_REQ_RSVD, 0);
+ }
+ return -EINVAL;
}
} else {
uint8_t vector = origin->data & 0xff;
MSIMessage *dst, uint16_t sid)
{
return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu),
- src, dst, sid);
+ src, dst, sid, false);
}
static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr,
sid = attrs.requester_id;
}
- ret = vtd_interrupt_remap_msi(opaque, &from, &to, sid);
+ ret = vtd_interrupt_remap_msi(opaque, &from, &to, sid, true);
if (ret) {
- /* TODO: report error */
/* Drop this interrupt */
return MEMTX_ERROR;
}
- apic_get_class()->send_msi(&to);
+ apic_get_class(NULL)->send_msi(&to);
return MEMTX_OK;
}
},
};
-VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
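+/*
+ * Report a fault for a request-with-PASID that hit the interrupt
+ * address range (0xFEEx_xxxx) while in pass-through mode; see VT-d
+ * spec 3.14 (SGN.8).
+ */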
+static void vtd_report_ir_illegal_access(VTDAddressSpace *vtd_as,
+ hwaddr addr, bool is_write)
{
- uintptr_t key = (uintptr_t)bus;
- VTDBus *vtd_bus = g_hash_table_lookup(s->vtd_as_by_busptr, &key);
- VTDAddressSpace *vtd_dev_as;
- char name[128];
+ IntelIOMMUState *s = vtd_as->iommu_state;
+ uint8_t bus_n = pci_bus_num(vtd_as->bus);
+ uint16_t sid = PCI_BUILD_BDF(bus_n, vtd_as->devfn);
+ bool is_fpd_set = false;
+ VTDContextEntry ce;
+
+ assert(vtd_as->pasid != PCI_NO_PASID);
- if (!vtd_bus) {
- uintptr_t *new_key = g_malloc(sizeof(*new_key));
- *new_key = (uintptr_t)bus;
- /* No corresponding free() */
- vtd_bus = g_malloc0(sizeof(VTDBus) + sizeof(VTDAddressSpace *) * \
- PCI_DEVFN_MAX);
- vtd_bus->bus = bus;
- g_hash_table_insert(s->vtd_as_by_busptr, new_key, vtd_bus);
+    /* Try our best to fetch FPD; we can't do anything more */
+ if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
+ is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
+ if (!is_fpd_set && s->root_scalable) {
+ vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set, vtd_as->pasid);
+ }
}
- vtd_dev_as = vtd_bus->dev_as[devfn];
+ vtd_report_fault(s, VTD_FR_SM_INTERRUPT_ADDR,
+ is_fpd_set, sid, addr, is_write,
+ true, vtd_as->pasid);
+}
+
+static MemTxResult vtd_mem_ir_fault_read(void *opaque, hwaddr addr,
+ uint64_t *data, unsigned size,
+ MemTxAttrs attrs)
+{
+ vtd_report_ir_illegal_access(opaque, addr, false);
+
+ return MEMTX_ERROR;
+}
+
+static MemTxResult vtd_mem_ir_fault_write(void *opaque, hwaddr addr,
+ uint64_t value, unsigned size,
+ MemTxAttrs attrs)
+{
+ vtd_report_ir_illegal_access(opaque, addr, true);
+
+ return MEMTX_ERROR;
+}
+
+static const MemoryRegionOps vtd_mem_ir_fault_ops = {
+ .read_with_attrs = vtd_mem_ir_fault_read,
+ .write_with_attrs = vtd_mem_ir_fault_write,
+ .endianness = DEVICE_LITTLE_ENDIAN,
+ .impl = {
+ .min_access_size = 1,
+ .max_access_size = 8,
+ },
+ .valid = {
+ .min_access_size = 1,
+ .max_access_size = 8,
+ },
+};
+
+VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus,
+ int devfn, unsigned int pasid)
+{
+ /*
+ * We can't simply use sid here since the bus number might not be
+ * initialized by the guest.
+ */
+ struct vtd_as_key key = {
+ .bus = bus,
+ .devfn = devfn,
+ .pasid = pasid,
+ };
+ VTDAddressSpace *vtd_dev_as;
+ char name[128];
+ vtd_dev_as = g_hash_table_lookup(s->vtd_address_spaces, &key);
if (!vtd_dev_as) {
- snprintf(name, sizeof(name), "vtd-%02x.%x", PCI_SLOT(devfn),
- PCI_FUNC(devfn));
- vtd_bus->dev_as[devfn] = vtd_dev_as = g_new0(VTDAddressSpace, 1);
+ struct vtd_as_key *new_key = g_malloc(sizeof(*new_key));
+
+ new_key->bus = bus;
+ new_key->devfn = devfn;
+ new_key->pasid = pasid;
+
+ if (pasid == PCI_NO_PASID) {
+ snprintf(name, sizeof(name), "vtd-%02x.%x", PCI_SLOT(devfn),
+ PCI_FUNC(devfn));
+ } else {
+ snprintf(name, sizeof(name), "vtd-%02x.%x-pasid-%x", PCI_SLOT(devfn),
+ PCI_FUNC(devfn), pasid);
+ }
+
+ vtd_dev_as = g_new0(VTDAddressSpace, 1);
vtd_dev_as->bus = bus;
vtd_dev_as->devfn = (uint8_t)devfn;
+ vtd_dev_as->pasid = pasid;
vtd_dev_as->iommu_state = s;
vtd_dev_as->context_cache_entry.context_cache_gen = 0;
vtd_dev_as->iova_tree = iova_tree_new();
VTD_INTERRUPT_ADDR_FIRST,
&vtd_dev_as->iommu_ir, 1);
+ /*
+         * This region is used for catching faults on accesses to the
+         * interrupt range via passthrough + PASID. See also
+         * vtd_switch_address_space(). We can't use an alias since we
+         * need to know the sid, which is valid for MSI that uses
+         * bus_master_as (see msi_send_message()).
+ */
+ memory_region_init_io(&vtd_dev_as->iommu_ir_fault, OBJECT(s),
+ &vtd_mem_ir_fault_ops, vtd_dev_as, "vtd-no-ir",
+ VTD_INTERRUPT_ADDR_SIZE);
+ /*
+ * Hook to root since when PT is enabled vtd_dev_as->iommu
+ * will be disabled.
+ */
+ memory_region_add_subregion_overlap(MEMORY_REGION(&vtd_dev_as->root),
+ VTD_INTERRUPT_ADDR_FIRST,
+ &vtd_dev_as->iommu_ir_fault, 2);
+
/*
* Hook both the containers under the root container, we
* switch between DMAR & noDMAR by enable/disable
&vtd_dev_as->nodmar, 0);
vtd_switch_address_space(vtd_dev_as);
+
+ g_hash_table_insert(s->vtd_address_spaces, new_key, vtd_dev_as);
}
return vtd_dev_as;
}
/* Unmap the whole range in the notifier's scope. */
static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
{
- hwaddr size, remain;
+ hwaddr total, remain;
hwaddr start = n->start;
hwaddr end = n->end;
IntelIOMMUState *s = as->iommu_state;
}
assert(start <= end);
- size = remain = end - start + 1;
+ total = remain = end - start + 1;
while (remain >= VTD_PAGE_SIZE) {
IOMMUTLBEvent event;
trace_vtd_as_unmap_whole(pci_bus_num(as->bus),
VTD_PCI_SLOT(as->devfn),
VTD_PCI_FUNC(as->devfn),
- n->start, size);
+ n->start, total);
map.iova = n->start;
- map.size = size;
- iova_tree_remove(as->iova_tree, &map);
+ map.size = total - 1; /* Inclusive */
+ iova_tree_remove(as->iova_tree, map);
}
static void vtd_address_space_unmap_all(IntelIOMMUState *s)
IntelIOMMUState *s = vtd_as->iommu_state;
uint8_t bus_n = pci_bus_num(vtd_as->bus);
VTDContextEntry ce;
+ DMAMap map = { .iova = 0, .size = HWADDR_MAX };
- /*
- * The replay can be triggered by either a invalidation or a newly
- * created entry. No matter what, we release existing mappings
- * (it means flushing caches for UNMAP-only registers).
- */
- vtd_address_space_unmap(vtd_as, n);
+    /* Replay is protected by the BQL; the page walk will re-set it up safely */
+ iova_tree_remove(vtd_as->iova_tree, map);
if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
trace_vtd_replay_ce_valid(s->root_scalable ? "scalable mode" :
"legacy mode",
bus_n, PCI_SLOT(vtd_as->devfn),
PCI_FUNC(vtd_as->devfn),
- vtd_get_domain_id(s, &ce),
+ vtd_get_domain_id(s, &ce, vtd_as->pasid),
ce.hi, ce.lo);
- if (vtd_as_has_map_notifier(vtd_as)) {
+ if (n->notifier_flags & IOMMU_NOTIFIER_MAP) {
/* This is required only for MAP typed notifiers */
vtd_page_walk_info info = {
.hook_fn = vtd_replay_hook,
.notify_unmap = false,
.aw = s->aw_bits,
.as = vtd_as,
- .domain_id = vtd_get_domain_id(s, &ce),
+ .domain_id = vtd_get_domain_id(s, &ce, vtd_as->pasid),
};
- vtd_page_walk(s, &ce, 0, ~0ULL, &info);
+ vtd_page_walk(s, &ce, 0, ~0ULL, &info, vtd_as->pasid);
}
} else {
trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
s->ecap |= VTD_ECAP_SC;
}
+ if (s->pasid) {
+ s->ecap |= VTD_ECAP_PASID;
+ }
+
vtd_reset_caches(s);
/* Define registers with default values and bit semantics */
assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
- vtd_as = vtd_find_add_as(s, bus, devfn);
+ vtd_as = vtd_find_add_as(s, bus, devfn, PCI_NO_PASID);
return &vtd_as->as;
}
+static PCIIOMMUOps vtd_iommu_ops = {
+ .get_address_space = vtd_host_dma_iommu,
+};
+
static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
{
X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split");
return false;
}
+ if (kvm_enabled() && !kvm_enable_x2apic()) {
+        error_setg(errp, "eim=on requires support on the KVM side "
+                   "(X2APIC_API, first shipped in v4.7)");
+ return false;
+ }
}
/* Currently only address widths supported are 39 and 48 bits */
return false;
}
+ if (s->pasid && !s->scalable_mode) {
+ error_setg(errp, "Need to set scalable mode for PASID");
+ return false;
+ }
+
return true;
}
X86MachineState *x86ms = X86_MACHINE(ms);
PCIBus *bus = pcms->bus;
IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
+ X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
+
+ if (s->pasid && x86_iommu->dt_supported) {
+ /*
+ * PASID-based-Device-TLB Invalidate Descriptor is not
+         * implemented and it requires support from the vhost layer,
+         * which needs to be implemented in the future.
+ */
+ error_setg(errp, "PASID based device IOTLB is not supported");
+ return;
+ }
if (!vtd_decide_config(s, errp)) {
return;
QLIST_INIT(&s->vtd_as_with_notifiers);
qemu_mutex_init(&s->iommu_lock);
- memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num));
memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s,
"intel_iommu", DMAR_REG_SIZE);
+ memory_region_add_subregion(get_system_memory(),
+ Q35_HOST_BRIDGE_IOMMU_ADDR, &s->csrmem);
/* Create the shared memory regions by all devices */
memory_region_init(&s->mr_nodmar, OBJECT(s), "vtd-nodmar",
memory_region_add_subregion_overlap(&s->mr_nodmar,
VTD_INTERRUPT_ADDR_FIRST,
&s->mr_ir, 1);
-
- sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->csrmem);
/* No corresponding destroy */
- s->iotlb = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal,
+ s->iotlb = g_hash_table_new_full(vtd_iotlb_hash, vtd_iotlb_equal,
g_free, g_free);
- s->vtd_as_by_busptr = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal,
- g_free, g_free);
+ s->vtd_address_spaces = g_hash_table_new_full(vtd_as_hash, vtd_as_equal,
+ g_free, g_free);
vtd_init(s);
- sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR);
- pci_setup_iommu(bus, vtd_host_dma_iommu, dev);
+ pci_setup_iommu(bus, &vtd_iommu_ops, dev);
/* Pseudo address space under root PCI bus. */
x86ms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC);
qemu_add_machine_init_done_notifier(&vtd_machine_done_notify);