git.proxmox.com Git - qemu.git/blobdiff - target-i386/translate.c
kvm: i386: fix LAPIC TSC deadline timer save/restore
[qemu.git] / target-i386 / translate.c
index d742fe36936d5beb89d1464e1af29374f9d9b17a..065a9d320e0d6ce95f48b04700c38e7d519c65a0 100644 (file)
@@ -72,7 +72,6 @@ static TCGv cpu_tmp0, cpu_tmp4;
 static TCGv_ptr cpu_ptr0, cpu_ptr1;
 static TCGv_i32 cpu_tmp2_i32, cpu_tmp3_i32;
 static TCGv_i64 cpu_tmp1_i64;
-static TCGv cpu_tmp5;
 
 static uint8_t gen_opc_cc_op[OPC_BUF_SIZE];
 
@@ -209,6 +208,11 @@ static const uint8_t cc_op_live[CC_OP_NB] = {
     [CC_OP_DECB ... CC_OP_DECQ] = USES_CC_DST | USES_CC_SRC,
     [CC_OP_SHLB ... CC_OP_SHLQ] = USES_CC_DST | USES_CC_SRC,
     [CC_OP_SARB ... CC_OP_SARQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_BMILGB ... CC_OP_BMILGQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_ADCX] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_ADOX] = USES_CC_SRC | USES_CC_SRC2,
+    [CC_OP_ADCOX] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
+    [CC_OP_CLR] = 0,
 };
 
 static void set_cc_op(DisasContext *s, CCOp op)
@@ -234,10 +238,18 @@ static void set_cc_op(DisasContext *s, CCOp op)
         tcg_gen_discard_tl(cpu_cc_srcT);
     }
 
+    if (op == CC_OP_DYNAMIC) {
+        /* The DYNAMIC setting is translator only, and should never be
+           stored.  Thus we always consider it clean.  */
+        s->cc_op_dirty = false;
+    } else {
+        /* Discard any computed CC_OP value (see shifts).  */
+        if (s->cc_op == CC_OP_DYNAMIC) {
+            tcg_gen_discard_i32(cpu_cc_op);
+        }
+        s->cc_op_dirty = true;
+    }
     s->cc_op = op;
-    /* The DYNAMIC setting is translator only, and should never be
-       stored.  Thus we always consider it clean.  */
-    s->cc_op_dirty = (op != CC_OP_DYNAMIC);
 }
 
 static void gen_update_cc_op(DisasContext *s)
@@ -902,6 +914,11 @@ static void gen_compute_eflags(DisasContext *s)
     if (s->cc_op == CC_OP_EFLAGS) {
         return;
     }
+    if (s->cc_op == CC_OP_CLR) {
+        tcg_gen_movi_tl(cpu_cc_src, CC_Z | CC_P); /* result 0: ZF and PF set */
+        set_cc_op(s, CC_OP_EFLAGS);
+        return;
+    }
 
     TCGV_UNUSED(zero);
     dst = cpu_cc_dst;
@@ -970,6 +987,7 @@ static CCPrepare gen_prepare_eflags_c(DisasContext *s, TCGv reg)
                              .reg2 = t1, .mask = -1, .use_reg2 = true };
 
     case CC_OP_LOGICB ... CC_OP_LOGICQ:
+    case CC_OP_CLR:
         return (CCPrepare) { .cond = TCG_COND_NEVER, .mask = -1 };
 
     case CC_OP_INCB ... CC_OP_INCQ:
@@ -988,6 +1006,16 @@ static CCPrepare gen_prepare_eflags_c(DisasContext *s, TCGv reg)
         return (CCPrepare) { .cond = TCG_COND_NE,
                              .reg = cpu_cc_src, .mask = -1 };
 
+    case CC_OP_BMILGB ... CC_OP_BMILGQ:
+        size = s->cc_op - CC_OP_BMILGB;
+        t0 = gen_ext_tl(reg, cpu_cc_src, size, false);
+        return (CCPrepare) { .cond = TCG_COND_EQ, .reg = t0, .mask = -1 };
+
+    case CC_OP_ADCX:
+    case CC_OP_ADCOX:
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_dst,
+                             .mask = -1, .no_setcond = true };
+
     case CC_OP_EFLAGS:
     case CC_OP_SARB ... CC_OP_SARQ:
         /* CC_SRC & 1 */
@@ -1021,8 +1049,13 @@ static CCPrepare gen_prepare_eflags_s(DisasContext *s, TCGv reg)
         gen_compute_eflags(s);
         /* FALLTHRU */
     case CC_OP_EFLAGS:
+    case CC_OP_ADCX:
+    case CC_OP_ADOX:
+    case CC_OP_ADCOX:
         return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
                              .mask = CC_S };
+    case CC_OP_CLR:
+        return (CCPrepare) { .cond = TCG_COND_NEVER, .mask = -1 };
     default:
         {
             int size = (s->cc_op - CC_OP_ADDB) & 3;
@@ -1035,9 +1068,18 @@ static CCPrepare gen_prepare_eflags_s(DisasContext *s, TCGv reg)
 /* compute eflags.O to reg */
 static CCPrepare gen_prepare_eflags_o(DisasContext *s, TCGv reg)
 {
-    gen_compute_eflags(s);
-    return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
-                         .mask = CC_O };
+    switch (s->cc_op) {
+    case CC_OP_ADOX:
+    case CC_OP_ADCOX:
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src2,
+                             .mask = -1, .no_setcond = true };
+    case CC_OP_CLR:
+        return (CCPrepare) { .cond = TCG_COND_NEVER, .mask = -1 };
+    default:
+        gen_compute_eflags(s);
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+                             .mask = CC_O };
+    }
 }
 
 /* compute eflags.Z to reg */
@@ -1048,8 +1090,13 @@ static CCPrepare gen_prepare_eflags_z(DisasContext *s, TCGv reg)
         gen_compute_eflags(s);
         /* FALLTHRU */
     case CC_OP_EFLAGS:
+    case CC_OP_ADCX:
+    case CC_OP_ADOX:
+    case CC_OP_ADCOX:
         return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
                              .mask = CC_Z };
+    case CC_OP_CLR:
+        return (CCPrepare) { .cond = TCG_COND_ALWAYS, .mask = -1 };
     default:
         {
             int size = (s->cc_op - CC_OP_ADDB) & 3;
@@ -1529,19 +1576,56 @@ static void gen_inc(DisasContext *s1, int ot, int d, int c)
     tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
 }
 
-static void gen_shift_rm_T1(DisasContext *s, int ot, int op1, 
-                            int is_right, int is_arith)
+static void gen_shift_flags(DisasContext *s, int ot, TCGv result, TCGv shm1,
+                            TCGv count, bool is_right)
 {
-    target_ulong mask;
-    int shift_label;
-    TCGv t0, t1, t2;
+    TCGv_i32 z32, s32, oldop;
+    TCGv z_tl;
+
+    /* Store the results into the CC variables.  If we know that the
+       variable must be dead, store unconditionally.  Otherwise we'll
+       need to not disrupt the current contents.  */
+    z_tl = tcg_const_tl(0);
+    if (cc_op_live[s->cc_op] & USES_CC_DST) {
+        tcg_gen_movcond_tl(TCG_COND_NE, cpu_cc_dst, count, z_tl,
+                           result, cpu_cc_dst);
+    } else {
+        tcg_gen_mov_tl(cpu_cc_dst, result);
+    }
+    if (cc_op_live[s->cc_op] & USES_CC_SRC) {
+        tcg_gen_movcond_tl(TCG_COND_NE, cpu_cc_src, count, z_tl,
+                           shm1, cpu_cc_src);
+    } else {
+        tcg_gen_mov_tl(cpu_cc_src, shm1);
+    }
+    tcg_temp_free(z_tl);
 
-    if (ot == OT_QUAD) {
-        mask = 0x3f;
+    /* Get the two potential CC_OP values into temporaries.  */
+    tcg_gen_movi_i32(cpu_tmp2_i32, (is_right ? CC_OP_SARB : CC_OP_SHLB) + ot);
+    if (s->cc_op == CC_OP_DYNAMIC) {
+        oldop = cpu_cc_op;
     } else {
-        mask = 0x1f;
+        tcg_gen_movi_i32(cpu_tmp3_i32, s->cc_op);
+        oldop = cpu_tmp3_i32;
     }
 
+    /* Conditionally store the CC_OP value.  */
+    z32 = tcg_const_i32(0);
+    s32 = tcg_temp_new_i32();
+    tcg_gen_trunc_tl_i32(s32, count);
+    tcg_gen_movcond_i32(TCG_COND_NE, cpu_cc_op, s32, z32, cpu_tmp2_i32, oldop);
+    tcg_temp_free_i32(z32);
+    tcg_temp_free_i32(s32);
+
+    /* The CC_OP value is no longer predictable.  */
+    set_cc_op(s, CC_OP_DYNAMIC);
+}
+
+static void gen_shift_rm_T1(DisasContext *s, int ot, int op1, 
+                            int is_right, int is_arith)
+{
+    target_ulong mask = (ot == OT_QUAD ? 0x3f : 0x1f);
+
     /* load */
     if (op1 == OR_TMP0) {
         gen_op_ld_T0_A0(ot + s->mem_index);
@@ -1549,25 +1633,22 @@ static void gen_shift_rm_T1(DisasContext *s, int ot, int op1,
         gen_op_mov_TN_reg(ot, 0, op1);
     }
 
-    t0 = tcg_temp_local_new();
-    t1 = tcg_temp_local_new();
-    t2 = tcg_temp_local_new();
-
-    tcg_gen_andi_tl(t2, cpu_T[1], mask);
+    tcg_gen_andi_tl(cpu_T[1], cpu_T[1], mask);
+    tcg_gen_subi_tl(cpu_tmp0, cpu_T[1], 1);
 
     if (is_right) {
         if (is_arith) {
             gen_exts(ot, cpu_T[0]);
-            tcg_gen_mov_tl(t0, cpu_T[0]);
-            tcg_gen_sar_tl(cpu_T[0], cpu_T[0], t2);
+            tcg_gen_sar_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
+            tcg_gen_sar_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
         } else {
             gen_extu(ot, cpu_T[0]);
-            tcg_gen_mov_tl(t0, cpu_T[0]);
-            tcg_gen_shr_tl(cpu_T[0], cpu_T[0], t2);
+            tcg_gen_shr_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
+            tcg_gen_shr_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
         }
     } else {
-        tcg_gen_mov_tl(t0, cpu_T[0]);
-        tcg_gen_shl_tl(cpu_T[0], cpu_T[0], t2);
+        tcg_gen_shl_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
+        tcg_gen_shl_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
     }
 
     /* store */
@@ -1577,50 +1658,13 @@ static void gen_shift_rm_T1(DisasContext *s, int ot, int op1,
         gen_op_mov_reg_T0(ot, op1);
     }
 
-    /* Update eflags data because we cannot predict flags afterward.  */
-    gen_update_cc_op(s);
-    set_cc_op(s, CC_OP_DYNAMIC);
-
-    tcg_gen_mov_tl(t1, cpu_T[0]);
-
-    shift_label = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, shift_label);
-
-    tcg_gen_addi_tl(t2, t2, -1);
-    tcg_gen_mov_tl(cpu_cc_dst, t1);
-
-    if (is_right) {
-        if (is_arith) {
-            tcg_gen_sar_tl(cpu_cc_src, t0, t2);
-        } else {
-            tcg_gen_shr_tl(cpu_cc_src, t0, t2);
-        }
-    } else {
-        tcg_gen_shl_tl(cpu_cc_src, t0, t2);
-    }
-
-    if (is_right) {
-        tcg_gen_movi_i32(cpu_cc_op, CC_OP_SARB + ot);
-    } else {
-        tcg_gen_movi_i32(cpu_cc_op, CC_OP_SHLB + ot);
-    }
-
-    gen_set_label(shift_label);
-
-    tcg_temp_free(t0);
-    tcg_temp_free(t1);
-    tcg_temp_free(t2);
+    gen_shift_flags(s, ot, cpu_T[0], cpu_tmp0, cpu_T[1], is_right);
 }
 
 static void gen_shift_rm_im(DisasContext *s, int ot, int op1, int op2,
                             int is_right, int is_arith)
 {
-    int mask;
-    
-    if (ot == OT_QUAD)
-        mask = 0x3f;
-    else
-        mask = 0x1f;
+    int mask = (ot == OT_QUAD ? 0x3f : 0x1f);
 
     /* load */
     if (op1 == OR_TMP0)
@@ -1668,167 +1712,174 @@ static inline void tcg_gen_lshift(TCGv ret, TCGv arg1, target_long arg2)
         tcg_gen_shri_tl(ret, arg1, -arg2);
 }
 
-static void gen_rot_rm_T1(DisasContext *s, int ot, int op1, 
-                          int is_right)
+static void gen_rot_rm_T1(DisasContext *s, int ot, int op1, int is_right)
 {
-    target_ulong mask;
-    int label1, label2, data_bits;
-    TCGv t0, t1, t2, a0;
-
-    /* XXX: inefficient, but we must use local temps */
-    t0 = tcg_temp_local_new();
-    t1 = tcg_temp_local_new();
-    t2 = tcg_temp_local_new();
-    a0 = tcg_temp_local_new();
-
-    if (ot == OT_QUAD)
-        mask = 0x3f;
-    else
-        mask = 0x1f;
+    target_ulong mask = (ot == OT_QUAD ? 0x3f : 0x1f);
+    TCGv_i32 t0, t1;
 
     /* load */
     if (op1 == OR_TMP0) {
-        tcg_gen_mov_tl(a0, cpu_A0);
-        gen_op_ld_v(ot + s->mem_index, t0, a0);
+        gen_op_ld_T0_A0(ot + s->mem_index);
     } else {
-        gen_op_mov_v_reg(ot, t0, op1);
+        gen_op_mov_TN_reg(ot, 0, op1);
     }
 
-    tcg_gen_mov_tl(t1, cpu_T[1]);
-
-    tcg_gen_andi_tl(t1, t1, mask);
-
-    /* Must test zero case to avoid using undefined behaviour in TCG
-       shifts. */
-    label1 = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_EQ, t1, 0, label1);
-    
-    if (ot <= OT_WORD)
-        tcg_gen_andi_tl(cpu_tmp0, t1, (1 << (3 + ot)) - 1);
-    else
-        tcg_gen_mov_tl(cpu_tmp0, t1);
-    
-    gen_extu(ot, t0);
-    tcg_gen_mov_tl(t2, t0);
+    tcg_gen_andi_tl(cpu_T[1], cpu_T[1], mask);
 
-    data_bits = 8 << ot;
-    /* XXX: rely on behaviour of shifts when operand 2 overflows (XXX:
-       fix TCG definition) */
-    if (is_right) {
-        tcg_gen_shr_tl(cpu_tmp4, t0, cpu_tmp0);
-        tcg_gen_subfi_tl(cpu_tmp0, data_bits, cpu_tmp0);
-        tcg_gen_shl_tl(t0, t0, cpu_tmp0);
-    } else {
-        tcg_gen_shl_tl(cpu_tmp4, t0, cpu_tmp0);
-        tcg_gen_subfi_tl(cpu_tmp0, data_bits, cpu_tmp0);
-        tcg_gen_shr_tl(t0, t0, cpu_tmp0);
+    switch (ot) {
+    case OT_BYTE:
+        /* Replicate the 8-bit input so that a 32-bit rotate works.  */
+        tcg_gen_ext8u_tl(cpu_T[0], cpu_T[0]);
+        tcg_gen_muli_tl(cpu_T[0], cpu_T[0], 0x01010101);
+        goto do_long;
+    case OT_WORD:
+        /* Replicate the 16-bit input so that a 32-bit rotate works.  */
+        tcg_gen_deposit_tl(cpu_T[0], cpu_T[0], cpu_T[0], 16, 16);
+        goto do_long;
+    do_long:
+#ifdef TARGET_X86_64
+    case OT_LONG:
+        tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+        tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_T[1]);
+        if (is_right) {
+            tcg_gen_rotr_i32(cpu_tmp2_i32, cpu_tmp2_i32, cpu_tmp3_i32);
+        } else {
+            tcg_gen_rotl_i32(cpu_tmp2_i32, cpu_tmp2_i32, cpu_tmp3_i32);
+        }
+        tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
+        break;
+#endif
+    default:
+        if (is_right) {
+            tcg_gen_rotr_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+        } else {
+            tcg_gen_rotl_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+        }
+        break;
     }
-    tcg_gen_or_tl(t0, t0, cpu_tmp4);
 
-    gen_set_label(label1);
     /* store */
     if (op1 == OR_TMP0) {
-        gen_op_st_v(ot + s->mem_index, t0, a0);
+        gen_op_st_T0_A0(ot + s->mem_index);
     } else {
-        gen_op_mov_reg_v(ot, op1, t0);
+        gen_op_mov_reg_T0(ot, op1);
     }
-    
-    /* update eflags.  It is needed anyway most of the time, do it always.  */
-    gen_compute_eflags(s);
-    assert(s->cc_op == CC_OP_EFLAGS);
 
-    label2 = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_EQ, t1, 0, label2);
+    /* We'll need the flags computed into CC_SRC.  */
+    gen_compute_eflags(s);
 
-    tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~(CC_O | CC_C));
-    tcg_gen_xor_tl(cpu_tmp0, t2, t0);
-    tcg_gen_lshift(cpu_tmp0, cpu_tmp0, 11 - (data_bits - 1));
-    tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, CC_O);
-    tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, cpu_tmp0);
+    /* The value that was "rotated out" is now present at the other end
+       of the word.  Compute C into CC_DST and O into CC_SRC2.  Note that
+       since we've computed the flags into CC_SRC, these variables are
+       currently dead.  */
     if (is_right) {
-        tcg_gen_shri_tl(t0, t0, data_bits - 1);
+        tcg_gen_shri_tl(cpu_cc_src2, cpu_T[0], mask - 1);
+        tcg_gen_shri_tl(cpu_cc_dst, cpu_T[0], mask);
+        tcg_gen_andi_tl(cpu_cc_dst, cpu_cc_dst, 1);
+    } else {
+        tcg_gen_shri_tl(cpu_cc_src2, cpu_T[0], mask);
+        tcg_gen_andi_tl(cpu_cc_dst, cpu_T[0], 1);
     }
-    tcg_gen_andi_tl(t0, t0, CC_C);
-    tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, t0);
-
-    gen_set_label(label2);
-
-    tcg_temp_free(t0);
-    tcg_temp_free(t1);
-    tcg_temp_free(t2);
-    tcg_temp_free(a0);
+    tcg_gen_andi_tl(cpu_cc_src2, cpu_cc_src2, 1);
+    tcg_gen_xor_tl(cpu_cc_src2, cpu_cc_src2, cpu_cc_dst);
+
+    /* Now conditionally store the new CC_OP value.  If the shift count
+       is 0 we keep the CC_OP_EFLAGS setting so that only CC_SRC is live.
+       Otherwise reuse CC_OP_ADCOX which has the C and O flags split out
+       exactly as we computed above.  */
+    t0 = tcg_const_i32(0);
+    t1 = tcg_temp_new_i32();
+    tcg_gen_trunc_tl_i32(t1, cpu_T[1]);
+    tcg_gen_movi_i32(cpu_tmp2_i32, CC_OP_ADCOX); 
+    tcg_gen_movi_i32(cpu_tmp3_i32, CC_OP_EFLAGS);
+    tcg_gen_movcond_i32(TCG_COND_NE, cpu_cc_op, t1, t0,
+                        cpu_tmp2_i32, cpu_tmp3_i32);
+    tcg_temp_free_i32(t0);
+    tcg_temp_free_i32(t1);
+
+    /* The CC_OP value is no longer predictable.  */ 
+    set_cc_op(s, CC_OP_DYNAMIC);
 }
 
 static void gen_rot_rm_im(DisasContext *s, int ot, int op1, int op2,
                           int is_right)
 {
-    int mask;
-    int data_bits;
-    TCGv t0, t1, a0;
-
-    /* XXX: inefficient, but we must use local temps */
-    t0 = tcg_temp_local_new();
-    t1 = tcg_temp_local_new();
-    a0 = tcg_temp_local_new();
-
-    if (ot == OT_QUAD)
-        mask = 0x3f;
-    else
-        mask = 0x1f;
+    int mask = (ot == OT_QUAD ? 0x3f : 0x1f);
+    int shift;
 
     /* load */
     if (op1 == OR_TMP0) {
-        tcg_gen_mov_tl(a0, cpu_A0);
-        gen_op_ld_v(ot + s->mem_index, t0, a0);
+        gen_op_ld_T0_A0(ot + s->mem_index);
     } else {
-        gen_op_mov_v_reg(ot, t0, op1);
+        gen_op_mov_TN_reg(ot, 0, op1);
     }
 
-    gen_extu(ot, t0);
-    tcg_gen_mov_tl(t1, t0);
-
     op2 &= mask;
-    data_bits = 8 << ot;
     if (op2 != 0) {
-        int shift = op2 & ((1 << (3 + ot)) - 1);
-        if (is_right) {
-            tcg_gen_shri_tl(cpu_tmp4, t0, shift);
-            tcg_gen_shli_tl(t0, t0, data_bits - shift);
-        }
-        else {
-            tcg_gen_shli_tl(cpu_tmp4, t0, shift);
-            tcg_gen_shri_tl(t0, t0, data_bits - shift);
+        switch (ot) {
+#ifdef TARGET_X86_64
+        case OT_LONG:
+            tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+            if (is_right) {
+                tcg_gen_rotri_i32(cpu_tmp2_i32, cpu_tmp2_i32, op2);
+            } else {
+                tcg_gen_rotli_i32(cpu_tmp2_i32, cpu_tmp2_i32, op2);
+            }
+            tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
+            break;
+#endif
+        default:
+            if (is_right) {
+                tcg_gen_rotri_tl(cpu_T[0], cpu_T[0], op2);
+            } else {
+                tcg_gen_rotli_tl(cpu_T[0], cpu_T[0], op2);
+            }
+            break;
+        case OT_BYTE:
+            mask = 7;
+            goto do_shifts;
+        case OT_WORD:
+            mask = 15;
+        do_shifts:
+            shift = op2 & mask;
+            if (is_right) {
+                shift = mask + 1 - shift;
+            }
+            gen_extu(ot, cpu_T[0]);
+            tcg_gen_shli_tl(cpu_tmp0, cpu_T[0], shift);
+            tcg_gen_shri_tl(cpu_T[0], cpu_T[0], mask + 1 - shift);
+            tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
+            break;
         }
-        tcg_gen_or_tl(t0, t0, cpu_tmp4);
     }
 
     /* store */
     if (op1 == OR_TMP0) {
-        gen_op_st_v(ot + s->mem_index, t0, a0);
+        gen_op_st_T0_A0(ot + s->mem_index);
     } else {
-        gen_op_mov_reg_v(ot, op1, t0);
+        gen_op_mov_reg_T0(ot, op1);
     }
 
     if (op2 != 0) {
-        /* update eflags */
+        /* Compute the flags into CC_SRC.  */
         gen_compute_eflags(s);
-        assert(s->cc_op == CC_OP_EFLAGS);
 
-        tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~(CC_O | CC_C));
-        tcg_gen_xor_tl(cpu_tmp0, t1, t0);
-        tcg_gen_lshift(cpu_tmp0, cpu_tmp0, 11 - (data_bits - 1));
-        tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, CC_O);
-        tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, cpu_tmp0);
+        /* The value that was "rotated out" is now present at the other end
+           of the word.  Compute C into CC_DST and O into CC_SRC2.  Note that
+           since we've computed the flags into CC_SRC, these variables are
+           currently dead.  */
         if (is_right) {
-            tcg_gen_shri_tl(t0, t0, data_bits - 1);
+            tcg_gen_shri_tl(cpu_cc_src2, cpu_T[0], mask - 1);
+            tcg_gen_shri_tl(cpu_cc_dst, cpu_T[0], mask);
+            tcg_gen_andi_tl(cpu_cc_dst, cpu_cc_dst, 1);
+        } else {
+            tcg_gen_shri_tl(cpu_cc_src2, cpu_T[0], mask);
+            tcg_gen_andi_tl(cpu_cc_dst, cpu_T[0], 1);
         }
-        tcg_gen_andi_tl(t0, t0, CC_C);
-        tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, t0);
+        tcg_gen_andi_tl(cpu_cc_src2, cpu_cc_src2, 1);
+        tcg_gen_xor_tl(cpu_cc_src2, cpu_cc_src2, cpu_cc_dst);
+        set_cc_op(s, CC_OP_ADCOX);
     }
-
-    tcg_temp_free(t0);
-    tcg_temp_free(t1);
-    tcg_temp_free(a0);
 }
 
 /* XXX: add faster immediate = 1 case */
@@ -1888,128 +1939,88 @@ static void gen_rotc_rm_T1(DisasContext *s, int ot, int op1,
 
 /* XXX: add faster immediate case */
 static void gen_shiftd_rm_T1(DisasContext *s, int ot, int op1,
-                             int is_right, TCGv count)
+                             bool is_right, TCGv count_in)
 {
-    int label1, label2, data_bits;
-    target_ulong mask;
-    TCGv t0, t1, t2, a0;
-
-    t0 = tcg_temp_local_new();
-    t1 = tcg_temp_local_new();
-    t2 = tcg_temp_local_new();
-    a0 = tcg_temp_local_new();
-
-    if (ot == OT_QUAD)
-        mask = 0x3f;
-    else
-        mask = 0x1f;
+    target_ulong mask = (ot == OT_QUAD ? 63 : 31);
+    TCGv count;
 
     /* load */
     if (op1 == OR_TMP0) {
-        tcg_gen_mov_tl(a0, cpu_A0);
-        gen_op_ld_v(ot + s->mem_index, t0, a0);
+        gen_op_ld_T0_A0(ot + s->mem_index);
     } else {
-        gen_op_mov_v_reg(ot, t0, op1);
+        gen_op_mov_TN_reg(ot, 0, op1);
     }
 
-    tcg_gen_andi_tl(t2, count, mask);
-    tcg_gen_mov_tl(t1, cpu_T[1]);
+    count = tcg_temp_new();
+    tcg_gen_andi_tl(count, count_in, mask);
 
-    /* Must test zero case to avoid using undefined behaviour in TCG
-       shifts. */
-    label1 = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, label1);
-    
-    tcg_gen_addi_tl(cpu_tmp5, t2, -1);
-    if (ot == OT_WORD) {
-        /* Note: we implement the Intel behaviour for shift count > 16 */
+    switch (ot) {
+    case OT_WORD:
+        /* Note: we implement the Intel behaviour for shift count > 16.
+           This means "shrdw C, B, A" shifts A:B:A >> C.  Build the B:A
+           portion by constructing it as a 32-bit value.  */
         if (is_right) {
-            tcg_gen_andi_tl(t0, t0, 0xffff);
-            tcg_gen_shli_tl(cpu_tmp0, t1, 16);
-            tcg_gen_or_tl(t0, t0, cpu_tmp0);
-            tcg_gen_ext32u_tl(t0, t0);
-
-            tcg_gen_shr_tl(cpu_tmp4, t0, cpu_tmp5);
-            
-            /* only needed if count > 16, but a test would complicate */
-            tcg_gen_subfi_tl(cpu_tmp5, 32, t2);
-            tcg_gen_shl_tl(cpu_tmp0, t0, cpu_tmp5);
-
-            tcg_gen_shr_tl(t0, t0, t2);
-
-            tcg_gen_or_tl(t0, t0, cpu_tmp0);
+            tcg_gen_deposit_tl(cpu_tmp0, cpu_T[0], cpu_T[1], 16, 16);
+            tcg_gen_mov_tl(cpu_T[1], cpu_T[0]);
+            tcg_gen_mov_tl(cpu_T[0], cpu_tmp0);
         } else {
-            /* XXX: not optimal */
-            tcg_gen_andi_tl(t0, t0, 0xffff);
-            tcg_gen_shli_tl(t1, t1, 16);
-            tcg_gen_or_tl(t1, t1, t0);
-            tcg_gen_ext32u_tl(t1, t1);
-            
-            tcg_gen_shl_tl(cpu_tmp4, t0, cpu_tmp5);
-            tcg_gen_subfi_tl(cpu_tmp0, 32, cpu_tmp5);
-            tcg_gen_shr_tl(cpu_tmp5, t1, cpu_tmp0);
-            tcg_gen_or_tl(cpu_tmp4, cpu_tmp4, cpu_tmp5);
-
-            tcg_gen_shl_tl(t0, t0, t2);
-            tcg_gen_subfi_tl(cpu_tmp5, 32, t2);
-            tcg_gen_shr_tl(t1, t1, cpu_tmp5);
-            tcg_gen_or_tl(t0, t0, t1);
+            tcg_gen_deposit_tl(cpu_T[1], cpu_T[0], cpu_T[1], 16, 16);
         }
-    } else {
-        data_bits = 8 << ot;
+        /* FALLTHRU */
+#ifdef TARGET_X86_64
+    case OT_LONG:
+        /* Concatenate the two 32-bit values and use a 64-bit shift.  */
+        tcg_gen_subi_tl(cpu_tmp0, count, 1);
         if (is_right) {
-            if (ot == OT_LONG)
-                tcg_gen_ext32u_tl(t0, t0);
-
-            tcg_gen_shr_tl(cpu_tmp4, t0, cpu_tmp5);
+            tcg_gen_concat_tl_i64(cpu_T[0], cpu_T[0], cpu_T[1]);
+            tcg_gen_shr_i64(cpu_tmp0, cpu_T[0], cpu_tmp0);
+            tcg_gen_shr_i64(cpu_T[0], cpu_T[0], count);
+        } else {
+            tcg_gen_concat_tl_i64(cpu_T[0], cpu_T[1], cpu_T[0]);
+            tcg_gen_shl_i64(cpu_tmp0, cpu_T[0], cpu_tmp0);
+            tcg_gen_shl_i64(cpu_T[0], cpu_T[0], count);
+            tcg_gen_shri_i64(cpu_tmp0, cpu_tmp0, 32);
+            tcg_gen_shri_i64(cpu_T[0], cpu_T[0], 32);
+        }
+        break;
+#endif
+    default:
+        tcg_gen_subi_tl(cpu_tmp0, count, 1);
+        if (is_right) {
+            tcg_gen_shr_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
 
-            tcg_gen_shr_tl(t0, t0, t2);
-            tcg_gen_subfi_tl(cpu_tmp5, data_bits, t2);
-            tcg_gen_shl_tl(t1, t1, cpu_tmp5);
-            tcg_gen_or_tl(t0, t0, t1);
-            
+            tcg_gen_subfi_tl(cpu_tmp4, mask + 1, count);
+            tcg_gen_shr_tl(cpu_T[0], cpu_T[0], count);
+            tcg_gen_shl_tl(cpu_T[1], cpu_T[1], cpu_tmp4);
         } else {
-            if (ot == OT_LONG)
-                tcg_gen_ext32u_tl(t1, t1);
-
-            tcg_gen_shl_tl(cpu_tmp4, t0, cpu_tmp5);
-            
-            tcg_gen_shl_tl(t0, t0, t2);
-            tcg_gen_subfi_tl(cpu_tmp5, data_bits, t2);
-            tcg_gen_shr_tl(t1, t1, cpu_tmp5);
-            tcg_gen_or_tl(t0, t0, t1);
+            tcg_gen_shl_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
+            if (ot == OT_WORD) {
+                /* Only needed if count > 16, for Intel behaviour.  */
+                tcg_gen_subfi_tl(cpu_tmp4, 33, count);
+                tcg_gen_shr_tl(cpu_tmp4, cpu_T[1], cpu_tmp4);
+                tcg_gen_or_tl(cpu_tmp0, cpu_tmp0, cpu_tmp4);
+            }
+
+            tcg_gen_subfi_tl(cpu_tmp4, mask + 1, count);
+            tcg_gen_shl_tl(cpu_T[0], cpu_T[0], count);
+            tcg_gen_shr_tl(cpu_T[1], cpu_T[1], cpu_tmp4);
         }
+        tcg_gen_movi_tl(cpu_tmp4, 0);
+        tcg_gen_movcond_tl(TCG_COND_EQ, cpu_T[1], count, cpu_tmp4,
+                           cpu_tmp4, cpu_T[1]);
+        tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+        break;
     }
-    tcg_gen_mov_tl(t1, cpu_tmp4);
 
-    gen_set_label(label1);
     /* store */
     if (op1 == OR_TMP0) {
-        gen_op_st_v(ot + s->mem_index, t0, a0);
-    } else {
-        gen_op_mov_reg_v(ot, op1, t0);
-    }
-    
-    /* Update eflags data because we cannot predict flags afterward.  */
-    gen_update_cc_op(s);
-    set_cc_op(s, CC_OP_DYNAMIC);
-
-    label2 = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, label2);
-
-    tcg_gen_mov_tl(cpu_cc_src, t1);
-    tcg_gen_mov_tl(cpu_cc_dst, t0);
-    if (is_right) {
-        tcg_gen_movi_i32(cpu_cc_op, CC_OP_SARB + ot);
+        gen_op_st_T0_A0(ot + s->mem_index);
     } else {
-        tcg_gen_movi_i32(cpu_cc_op, CC_OP_SHLB + ot);
+        gen_op_mov_reg_T0(ot, op1);
     }
-    gen_set_label(label2);
 
-    tcg_temp_free(t0);
-    tcg_temp_free(t1);
-    tcg_temp_free(t2);
-    tcg_temp_free(a0);
+    gen_shift_flags(s, ot, cpu_T[0], cpu_tmp0, count, is_right);
+    tcg_temp_free(count);
 }
 
 static void gen_shift(DisasContext *s1, int op, int ot, int d, int s)
@@ -3138,6 +3149,9 @@ struct SSEOpHelper_eppi {
 #define SSE41_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_SSE41 }
 #define SSE42_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_SSE42 }
 #define SSE41_SPECIAL { { NULL, SSE_SPECIAL }, CPUID_EXT_SSE41 }
+#define PCLMULQDQ_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, \
+        CPUID_EXT_PCLMULQDQ }
+#define AESNI_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_AES }
 
 static const struct SSEOpHelper_epp sse_op_table6[256] = {
     [0x00] = SSSE3_OP(pshufb),
@@ -3186,6 +3200,11 @@ static const struct SSEOpHelper_epp sse_op_table6[256] = {
     [0x3f] = SSE41_OP(pmaxud),
     [0x40] = SSE41_OP(pmulld),
     [0x41] = SSE41_OP(phminposuw),
+    [0xdb] = AESNI_OP(aesimc),
+    [0xdc] = AESNI_OP(aesenc),
+    [0xdd] = AESNI_OP(aesenclast),
+    [0xde] = AESNI_OP(aesdec),
+    [0xdf] = AESNI_OP(aesdeclast),
 };
 
 static const struct SSEOpHelper_eppi sse_op_table7[256] = {
@@ -3207,10 +3226,12 @@ static const struct SSEOpHelper_eppi sse_op_table7[256] = {
     [0x40] = SSE41_OP(dpps),
     [0x41] = SSE41_OP(dppd),
     [0x42] = SSE41_OP(mpsadbw),
+    [0x44] = PCLMULQDQ_OP(pclmulqdq),
     [0x60] = SSE42_OP(pcmpestrm),
     [0x61] = SSE42_OP(pcmpestri),
     [0x62] = SSE42_OP(pcmpistrm),
     [0x63] = SSE42_OP(pcmpistri),
+    [0xdf] = AESNI_OP(aeskeygenassist),
 };
 
 static void gen_sse(CPUX86State *env, DisasContext *s, int b,
@@ -4066,6 +4087,247 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
                 }
                 break;
 
+            case 0x0f5: /* bzhi Gy, Ey, By */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                tcg_gen_ext8u_tl(cpu_T[1], cpu_regs[s->vex_v]);
+                {
+                    TCGv bound = tcg_const_tl(ot == OT_QUAD ? 63 : 31);
+                    /* Note that since we're using BMILG (in order to get O
+                       cleared) we need to store the inverse into C.  */
+                    tcg_gen_setcond_tl(TCG_COND_LT, cpu_cc_src,
+                                       cpu_T[1], bound);
+                    tcg_gen_movcond_tl(TCG_COND_GT, cpu_T[1], cpu_T[1],
+                                       bound, bound, cpu_T[1]);
+                    tcg_temp_free(bound);
+                }
+                tcg_gen_movi_tl(cpu_A0, -1);
+                tcg_gen_shl_tl(cpu_A0, cpu_A0, cpu_T[1]);
+                tcg_gen_andc_tl(cpu_T[0], cpu_T[0], cpu_A0);
+                gen_op_mov_reg_T0(ot, reg);
+                gen_op_update1_cc();
+                set_cc_op(s, CC_OP_BMILGB + ot);
+                break;
+
+            case 0x3f6: /* mulx By, Gy, rdx, Ey */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                switch (ot) {
+                default:
+                    tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                    tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_regs[R_EDX]);
+                    tcg_gen_mulu2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
+                                      cpu_tmp2_i32, cpu_tmp3_i32);
+                    tcg_gen_extu_i32_tl(cpu_regs[s->vex_v], cpu_tmp2_i32);
+                    tcg_gen_extu_i32_tl(cpu_regs[reg], cpu_tmp3_i32);
+                    break;
+#ifdef TARGET_X86_64
+                case OT_QUAD:
+                    tcg_gen_mulu2_i64(cpu_regs[s->vex_v], cpu_regs[reg],
+                                      cpu_T[0], cpu_regs[R_EDX]);
+                    break;
+#endif
+                }
+                break;
+
+            case 0x3f5: /* pdep Gy, By, Ey */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                /* Note that by zero-extending the mask operand, we
+                   automatically handle zero-extending the result.  */
+                if (s->dflag == 2) {
+                    tcg_gen_mov_tl(cpu_T[1], cpu_regs[s->vex_v]);
+                } else {
+                    tcg_gen_ext32u_tl(cpu_T[1], cpu_regs[s->vex_v]);
+                }
+                gen_helper_pdep(cpu_regs[reg], cpu_T[0], cpu_T[1]);
+                break;
+
+            case 0x2f5: /* pext Gy, By, Ey */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                /* Note that by zero-extending the mask operand, we
+                   automatically handle zero-extending the result.  */
+                if (s->dflag == 2) {
+                    tcg_gen_mov_tl(cpu_T[1], cpu_regs[s->vex_v]);
+                } else {
+                    tcg_gen_ext32u_tl(cpu_T[1], cpu_regs[s->vex_v]);
+                }
+                gen_helper_pext(cpu_regs[reg], cpu_T[0], cpu_T[1]);
+                break;
+
+            case 0x1f6: /* adcx Gy, Ey */
+            case 0x2f6: /* adox Gy, Ey */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_ADX)) {
+                    goto illegal_op;
+                } else {
+                    TCGv carry_in, carry_out, zero;
+                    int end_op;
+
+                    ot = (s->dflag == 2 ? OT_QUAD : OT_LONG);
+                    gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+
+                    /* Re-use the carry-out from a previous round.  */
+                    TCGV_UNUSED(carry_in);
+                    carry_out = (b == 0x1f6 ? cpu_cc_dst : cpu_cc_src2);
+                    switch (s->cc_op) {
+                    case CC_OP_ADCX:
+                        if (b == 0x1f6) {
+                            carry_in = cpu_cc_dst;
+                            end_op = CC_OP_ADCX;
+                        } else {
+                            end_op = CC_OP_ADCOX;
+                        }
+                        break;
+                    case CC_OP_ADOX:
+                        if (b == 0x1f6) {
+                            end_op = CC_OP_ADCOX;
+                        } else {
+                            carry_in = cpu_cc_src2;
+                            end_op = CC_OP_ADOX;
+                        }
+                        break;
+                    case CC_OP_ADCOX:
+                        end_op = CC_OP_ADCOX;
+                        carry_in = carry_out;
+                        break;
+                    default:
+                        end_op = (b == 0x1f6 ? CC_OP_ADCX : CC_OP_ADOX);
+                        break;
+                    }
+                    /* If we can't reuse carry-out, get it out of EFLAGS.  */
+                    if (TCGV_IS_UNUSED(carry_in)) {
+                        if (s->cc_op != CC_OP_ADCX && s->cc_op != CC_OP_ADOX) {
+                            gen_compute_eflags(s);
+                        }
+                        carry_in = cpu_tmp0;
+                        tcg_gen_shri_tl(carry_in, cpu_cc_src,
+                                        ctz32(b == 0x1f6 ? CC_C : CC_O));
+                        tcg_gen_andi_tl(carry_in, carry_in, 1);
+                    }
+
+                    switch (ot) {
+#ifdef TARGET_X86_64
+                    case OT_LONG:
+                        /* If we know TL is 64-bit, and we want a 32-bit
+                           result, just do everything in 64-bit arithmetic.  */
+                        tcg_gen_ext32u_i64(cpu_regs[reg], cpu_regs[reg]);
+                        tcg_gen_ext32u_i64(cpu_T[0], cpu_T[0]);
+                        tcg_gen_add_i64(cpu_T[0], cpu_T[0], cpu_regs[reg]);
+                        tcg_gen_add_i64(cpu_T[0], cpu_T[0], carry_in);
+                        tcg_gen_ext32u_i64(cpu_regs[reg], cpu_T[0]);
+                        tcg_gen_shri_i64(carry_out, cpu_T[0], 32);
+                        break;
+#endif
+                    default:
+                        /* Otherwise compute the carry-out in two steps.  */
+                        zero = tcg_const_tl(0);
+                        tcg_gen_add2_tl(cpu_T[0], carry_out,
+                                        cpu_T[0], zero,
+                                        carry_in, zero);
+                        tcg_gen_add2_tl(cpu_regs[reg], carry_out,
+                                        cpu_regs[reg], carry_out,
+                                        cpu_T[0], zero);
+                        tcg_temp_free(zero);
+                        break;
+                    }
+                    set_cc_op(s, end_op);
+                }
+                break;
+
+            case 0x1f7: /* shlx Gy, Ey, By */
+            case 0x2f7: /* sarx Gy, Ey, By */
+            case 0x3f7: /* shrx Gy, Ey, By */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = (s->dflag == 2 ? OT_QUAD : OT_LONG);
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                if (ot == OT_QUAD) {
+                    tcg_gen_andi_tl(cpu_T[1], cpu_regs[s->vex_v], 63);
+                } else {
+                    tcg_gen_andi_tl(cpu_T[1], cpu_regs[s->vex_v], 31);
+                }
+                if (b == 0x1f7) {
+                    tcg_gen_shl_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+                } else if (b == 0x2f7) {
+                    if (ot != OT_QUAD) {
+                        tcg_gen_ext32s_tl(cpu_T[0], cpu_T[0]);
+                    }
+                    tcg_gen_sar_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+                } else {
+                    if (ot != OT_QUAD) {
+                        tcg_gen_ext32u_tl(cpu_T[0], cpu_T[0]);
+                    }
+                    tcg_gen_shr_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+                }
+                gen_op_mov_reg_T0(ot, reg);
+                break;
+
+            case 0x0f3:
+            case 0x1f3:
+            case 0x2f3:
+            case 0x3f3: /* Group 17 */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+
+                switch (reg & 7) {
+                case 1: /* blsr By,Ey: clear lowest set bit, x & (x - 1) */
+                    tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
+                    tcg_gen_subi_tl(cpu_T[1], cpu_T[0], 1);
+                    tcg_gen_and_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+                    break;
+
+                case 2: /* blsmsk By,Ey: mask thru lowest set bit, x ^ (x - 1) */
+                    tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
+                    tcg_gen_subi_tl(cpu_T[1], cpu_T[0], 1);
+                    tcg_gen_xor_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+                    break;
+
+                case 3: /* blsi By,Ey: isolate lowest set bit, x & -x */
+                    /* NOTE(review): the SDM gives BLSI CF = (src != 0), the
+                       inverse of BLSR/BLSMSK; cc_src = src may not model it.  */
+                    tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
+                    tcg_gen_neg_tl(cpu_T[1], cpu_T[0]);
+                    tcg_gen_and_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+                    break;
+
+                default:
+                    goto illegal_op;
+                }
+                /* Shared tail: result feeds cc_dst and the VEX.vvvv register.  */
+                tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
+                gen_op_mov_reg_T0(ot, s->vex_v);
+                set_cc_op(s, CC_OP_BMILGB + ot);
+                break;
+
             default:
                 goto illegal_op;
             }
@@ -4154,9 +4416,9 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
                     if (mod == 3)
                         gen_op_mov_TN_reg(OT_LONG, 0, rm);
                     else
-                        tcg_gen_qemu_ld8u(cpu_tmp0, cpu_A0,
+                        tcg_gen_qemu_ld8u(cpu_T[0], cpu_A0,
                                         (s->mem_index >> 2) - 1);
-                    tcg_gen_st8_tl(cpu_tmp0, cpu_env, offsetof(CPUX86State,
+                    tcg_gen_st8_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,
                                             xmm_regs[reg].XMM_B(val & 15)));
                     break;
                 case 0x21: /* insertps */
@@ -4252,6 +4514,38 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
             tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
             sse_fn_eppi(cpu_env, cpu_ptr0, cpu_ptr1, tcg_const_i32(val));
             break;
+
+        case 0x33a:
+            /* Various integer extensions at 0f 3a f[0-f].  */
+            b = modrm | (b1 << 8);
+            modrm = cpu_ldub_code(env, s->pc++);
+            reg = ((modrm >> 3) & 7) | rex_r;
+
+            switch (b) {
+            case 0x3f0: /* rorx Gy,Ey, Ib */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                b = cpu_ldub_code(env, s->pc++);
+                if (ot == OT_QUAD) {
+                    tcg_gen_rotri_tl(cpu_T[0], cpu_T[0], b & 63);
+                } else {
+                    tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                    tcg_gen_rotri_i32(cpu_tmp2_i32, cpu_tmp2_i32, b & 31);
+                    tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
+                }
+                gen_op_mov_reg_T0(ot, reg);
+                break;
+
+            default:
+                goto illegal_op;
+            }
+            break;
+
         default:
             goto illegal_op;
         }
@@ -4383,8 +4677,6 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
     }
     s->pc = pc_start;
     prefixes = 0;
-    aflag = s->code32;
-    dflag = s->code32;
     s->override = -1;
     rex_w = -1;
     rex_r = 0;
@@ -4464,7 +4756,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             }
             s->pc++;
 
-            /* 4.1.1-4.1.3: No preceeding lock, 66, f2, f3, or rex prefixes. */
+            /* 4.1.1-4.1.3: No preceding lock, 66, f2, f3, or rex prefixes. */
             if (prefixes & (PREFIX_REPZ | PREFIX_REPNZ
                             | PREFIX_LOCK | PREFIX_DATA)) {
                 goto illegal_op;
@@ -4507,23 +4799,25 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
     }
 
     /* Post-process prefixes.  */
-    if (prefixes & PREFIX_DATA) {
-        dflag ^= 1;
-    }
-    if (prefixes & PREFIX_ADR) {
-        aflag ^= 1;
-    }
-#ifdef TARGET_X86_64
     if (CODE64(s)) {
-        if (rex_w == 1) {
-            /* 0x66 is ignored if rex.w is set */
-            dflag = 2;
+        /* In 64-bit mode, the default data size is 32-bit.  Select 64-bit
+           data with rex_w, and 16-bit data with 0x66; rex_w takes precedence
+           over 0x66 if both are present.  */
+        dflag = (rex_w > 0 ? 2 : prefixes & PREFIX_DATA ? 0 : 1);
+        /* In 64-bit mode, 0x67 selects 32-bit addressing.  */
+        aflag = (prefixes & PREFIX_ADR ? 1 : 2);
+    } else {
+        /* In 16/32-bit mode, 0x66 selects the opposite data size.  */
+        dflag = s->code32;
+        if (prefixes & PREFIX_DATA) {
+            dflag ^= 1;
         }
-        if (!(prefixes & PREFIX_ADR)) {
-            aflag = 2;
+        /* In 16/32-bit mode, 0x67 selects the opposite addressing.  */
+        aflag = s->code32;
+        if (prefixes & PREFIX_ADR) {
+            aflag ^= 1;
         }
     }
-#endif
 
     s->prefix = prefixes;
     s->aflag = aflag;
@@ -4574,10 +4868,9 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 } else if (op == OP_XORL && rm == reg) {
                 xor_zero:
                     /* xor reg, reg optimisation */
+                    set_cc_op(s, CC_OP_CLR);
                     gen_op_movl_T0_0();
-                    set_cc_op(s, CC_OP_LOGICB + ot);
                     gen_op_mov_reg_T0(ot, reg);
-                    gen_op_update1_cc();
                     break;
                 } else {
                     opreg = rm;
@@ -4738,39 +5031,22 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 break;
             default:
             case OT_LONG:
-#ifdef TARGET_X86_64
-                gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
-                tcg_gen_ext32u_tl(cpu_T[0], cpu_T[0]);
-                tcg_gen_ext32u_tl(cpu_T[1], cpu_T[1]);
-                tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
-                gen_op_mov_reg_T0(OT_LONG, R_EAX);
-                tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 32);
-                gen_op_mov_reg_T0(OT_LONG, R_EDX);
-                tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
-#else
-                {
-                    TCGv_i64 t0, t1;
-                    t0 = tcg_temp_new_i64();
-                    t1 = tcg_temp_new_i64();
-                    gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
-                    tcg_gen_extu_i32_i64(t0, cpu_T[0]);
-                    tcg_gen_extu_i32_i64(t1, cpu_T[1]);
-                    tcg_gen_mul_i64(t0, t0, t1);
-                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
-                    gen_op_mov_reg_T0(OT_LONG, R_EAX);
-                    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                    tcg_gen_shri_i64(t0, t0, 32);
-                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
-                    gen_op_mov_reg_T0(OT_LONG, R_EDX);
-                    tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
-                }
-#endif
+                tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_regs[R_EAX]);
+                tcg_gen_mulu2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
+                                  cpu_tmp2_i32, cpu_tmp3_i32);
+                tcg_gen_extu_i32_tl(cpu_regs[R_EAX], cpu_tmp2_i32);
+                tcg_gen_extu_i32_tl(cpu_regs[R_EDX], cpu_tmp3_i32);
+                tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
+                tcg_gen_mov_tl(cpu_cc_src, cpu_regs[R_EDX]);
                 set_cc_op(s, CC_OP_MULL);
                 break;
 #ifdef TARGET_X86_64
             case OT_QUAD:
-                gen_helper_mulq_EAX_T0(cpu_env, cpu_T[0]);
+                tcg_gen_mulu2_i64(cpu_regs[R_EAX], cpu_regs[R_EDX],
+                                  cpu_T[0], cpu_regs[R_EAX]);
+                tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
+                tcg_gen_mov_tl(cpu_cc_src, cpu_regs[R_EDX]);
                 set_cc_op(s, CC_OP_MULQ);
                 break;
 #endif
@@ -4806,41 +5082,25 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 break;
             default:
             case OT_LONG:
-#ifdef TARGET_X86_64
-                gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
-                tcg_gen_ext32s_tl(cpu_T[0], cpu_T[0]);
-                tcg_gen_ext32s_tl(cpu_T[1], cpu_T[1]);
-                tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
-                gen_op_mov_reg_T0(OT_LONG, R_EAX);
-                tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                tcg_gen_ext32s_tl(cpu_tmp0, cpu_T[0]);
-                tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
-                tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 32);
-                gen_op_mov_reg_T0(OT_LONG, R_EDX);
-#else
-                {
-                    TCGv_i64 t0, t1;
-                    t0 = tcg_temp_new_i64();
-                    t1 = tcg_temp_new_i64();
-                    gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
-                    tcg_gen_ext_i32_i64(t0, cpu_T[0]);
-                    tcg_gen_ext_i32_i64(t1, cpu_T[1]);
-                    tcg_gen_mul_i64(t0, t0, t1);
-                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
-                    gen_op_mov_reg_T0(OT_LONG, R_EAX);
-                    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                    tcg_gen_sari_tl(cpu_tmp0, cpu_T[0], 31);
-                    tcg_gen_shri_i64(t0, t0, 32);
-                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
-                    gen_op_mov_reg_T0(OT_LONG, R_EDX);
-                    tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
-                }
-#endif
+                tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_regs[R_EAX]);
+                tcg_gen_muls2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
+                                  cpu_tmp2_i32, cpu_tmp3_i32);
+                tcg_gen_extu_i32_tl(cpu_regs[R_EAX], cpu_tmp2_i32);
+                tcg_gen_extu_i32_tl(cpu_regs[R_EDX], cpu_tmp3_i32);
+                tcg_gen_sari_i32(cpu_tmp2_i32, cpu_tmp2_i32, 31);
+                tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
+                tcg_gen_sub_i32(cpu_tmp2_i32, cpu_tmp2_i32, cpu_tmp3_i32);
+                tcg_gen_extu_i32_tl(cpu_cc_src, cpu_tmp2_i32);
                 set_cc_op(s, CC_OP_MULL);
                 break;
 #ifdef TARGET_X86_64
             case OT_QUAD:
-                gen_helper_imulq_EAX_T0(cpu_env, cpu_T[0]);
+                tcg_gen_muls2_i64(cpu_regs[R_EAX], cpu_regs[R_EDX],
+                                  cpu_T[0], cpu_regs[R_EAX]);
+                tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
+                tcg_gen_sari_tl(cpu_cc_src, cpu_regs[R_EAX], 63);
+                tcg_gen_sub_tl(cpu_cc_src, cpu_cc_src, cpu_regs[R_EDX]);
                 set_cc_op(s, CC_OP_MULQ);
                 break;
 #endif
@@ -5095,37 +5355,27 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         } else {
             gen_op_mov_TN_reg(ot, 1, reg);
         }
-
-#ifdef TARGET_X86_64
-        if (ot == OT_QUAD) {
-            gen_helper_imulq_T0_T1(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]);
-        } else
-#endif
-        if (ot == OT_LONG) {
+        switch (ot) {
 #ifdef TARGET_X86_64
-                tcg_gen_ext32s_tl(cpu_T[0], cpu_T[0]);
-                tcg_gen_ext32s_tl(cpu_T[1], cpu_T[1]);
-                tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
-                tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                tcg_gen_ext32s_tl(cpu_tmp0, cpu_T[0]);
-                tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
-#else
-                {
-                    TCGv_i64 t0, t1;
-                    t0 = tcg_temp_new_i64();
-                    t1 = tcg_temp_new_i64();
-                    tcg_gen_ext_i32_i64(t0, cpu_T[0]);
-                    tcg_gen_ext_i32_i64(t1, cpu_T[1]);
-                    tcg_gen_mul_i64(t0, t0, t1);
-                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
-                    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                    tcg_gen_sari_tl(cpu_tmp0, cpu_T[0], 31);
-                    tcg_gen_shri_i64(t0, t0, 32);
-                    tcg_gen_trunc_i64_i32(cpu_T[1], t0);
-                    tcg_gen_sub_tl(cpu_cc_src, cpu_T[1], cpu_tmp0);
-                }
+        case OT_QUAD:
+            tcg_gen_muls2_i64(cpu_regs[reg], cpu_T[1], cpu_T[0], cpu_T[1]);
+            tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[reg]);
+            tcg_gen_sari_tl(cpu_cc_src, cpu_cc_dst, 63);
+            tcg_gen_sub_tl(cpu_cc_src, cpu_cc_src, cpu_T[1]);
+            break;
 #endif
-        } else {
+        case OT_LONG:
+            tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+            tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_T[1]);
+            tcg_gen_muls2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
+                              cpu_tmp2_i32, cpu_tmp3_i32);
+            tcg_gen_extu_i32_tl(cpu_regs[reg], cpu_tmp2_i32);
+            tcg_gen_sari_i32(cpu_tmp2_i32, cpu_tmp2_i32, 31);
+            tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[reg]);
+            tcg_gen_sub_i32(cpu_tmp2_i32, cpu_tmp2_i32, cpu_tmp3_i32);
+            tcg_gen_extu_i32_tl(cpu_cc_src, cpu_tmp2_i32);
+            break;
+        default:
             tcg_gen_ext16s_tl(cpu_T[0], cpu_T[0]);
             tcg_gen_ext16s_tl(cpu_T[1], cpu_T[1]);
             /* XXX: use 32 bit mul which could be faster */
@@ -5133,8 +5383,9 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
             tcg_gen_ext16s_tl(cpu_tmp0, cpu_T[0]);
             tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
+            gen_op_mov_reg_T0(ot, reg);
+            break;
         }
-        gen_op_mov_reg_T0(ot, reg);
         set_cc_op(s, CC_OP_MULB + ot);
         break;
     case 0x1c0:
@@ -6841,46 +7092,58 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             tcg_gen_movi_tl(cpu_cc_dst, 0);
         }
         break;
-    case 0x1bc: /* bsf */
-    case 0x1bd: /* bsr */
-        {
-            int label1;
-            TCGv t0;
-
-            ot = dflag + OT_WORD;
-            modrm = cpu_ldub_code(env, s->pc++);
-            reg = ((modrm >> 3) & 7) | rex_r;
-            gen_ldst_modrm(env, s,modrm, ot, OR_TMP0, 0);
-            gen_extu(ot, cpu_T[0]);
-            t0 = tcg_temp_local_new();
-            tcg_gen_mov_tl(t0, cpu_T[0]);
-            if ((b & 1) && (prefixes & PREFIX_REPZ) &&
-                (s->cpuid_ext3_features & CPUID_EXT3_ABM)) {
-                switch(ot) {
-                case OT_WORD: gen_helper_lzcnt(cpu_T[0], t0,
-                    tcg_const_i32(16)); break;
-                case OT_LONG: gen_helper_lzcnt(cpu_T[0], t0,
-                    tcg_const_i32(32)); break;
-                case OT_QUAD: gen_helper_lzcnt(cpu_T[0], t0,
-                    tcg_const_i32(64)); break;
-                }
-                gen_op_mov_reg_T0(ot, reg);
+    case 0x1bc: /* bsf / tzcnt */
+    case 0x1bd: /* bsr / lzcnt */
+        ot = dflag + OT_WORD;
+        modrm = cpu_ldub_code(env, s->pc++);
+        reg = ((modrm >> 3) & 7) | rex_r;
+        gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+        gen_extu(ot, cpu_T[0]);
+
+        /* Note that lzcnt and tzcnt are in different extensions.  */
+        if ((prefixes & PREFIX_REPZ)
+            && (b & 1
+                ? s->cpuid_ext3_features & CPUID_EXT3_ABM
+                : s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)) {
+            int size = 8 << ot;
+            tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
+            if (b & 1) {
+                /* For lzcnt, reduce the target_ulong result by the
+                   number of zeros that we expect to find at the top.  */
+                gen_helper_clz(cpu_T[0], cpu_T[0]);
+                tcg_gen_subi_tl(cpu_T[0], cpu_T[0], TARGET_LONG_BITS - size);
             } else {
-                label1 = gen_new_label();
-                tcg_gen_movi_tl(cpu_cc_dst, 0);
-                tcg_gen_brcondi_tl(TCG_COND_EQ, t0, 0, label1);
-                if (b & 1) {
-                    gen_helper_bsr(cpu_T[0], t0);
-                } else {
-                    gen_helper_bsf(cpu_T[0], t0);
-                }
-                gen_op_mov_reg_T0(ot, reg);
-                tcg_gen_movi_tl(cpu_cc_dst, 1);
-                gen_set_label(label1);
-                set_cc_op(s, CC_OP_LOGICB + ot);
+                /* For tzcnt, a zero input must return the operand size:
+                   force all bits outside the operand size to 1.  */
+                target_ulong mask = (target_ulong)-2 << (size - 1);
+                tcg_gen_ori_tl(cpu_T[0], cpu_T[0], mask);
+                gen_helper_ctz(cpu_T[0], cpu_T[0]);
             }
-            tcg_temp_free(t0);
+            /* For lzcnt/tzcnt, C and Z bits are defined and are
+               related to the result.  */
+            gen_op_update1_cc();
+            set_cc_op(s, CC_OP_BMILGB + ot);
+        } else {
+            /* For bsr/bsf, only the Z bit is defined and it is related
+               to the input and not the result.  */
+            tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
+            set_cc_op(s, CC_OP_LOGICB + ot);
+            if (b & 1) {
+                /* For bsr, return the bit index of the first 1 bit,
+                   not the count of leading zeros.  */
+                gen_helper_clz(cpu_T[0], cpu_T[0]);
+                tcg_gen_xori_tl(cpu_T[0], cpu_T[0], TARGET_LONG_BITS - 1);
+            } else {
+                gen_helper_ctz(cpu_T[0], cpu_T[0]);
+            }
+            /* ??? The manual says that the output is undefined when the
+               input is zero, but real hardware leaves it unchanged, and
+               real programs appear to depend on that.  */
+            tcg_gen_movi_tl(cpu_tmp0, 0);
+            tcg_gen_movcond_tl(TCG_COND_EQ, cpu_T[0], cpu_cc_dst, cpu_tmp0,
+                               cpu_regs[reg], cpu_T[0]);
         }
+        gen_op_mov_reg_T0(ot, reg);
         break;
         /************************/
         /* bcd */
@@ -7988,10 +8251,12 @@ void optimize_flags_init(void)
 /* generate intermediate code in gen_opc_buf and gen_opparam_buf for
    basic block 'tb'. If search_pc is TRUE, also generate PC
    information for each intermediate instruction. */
-static inline void gen_intermediate_code_internal(CPUX86State *env,
+static inline void gen_intermediate_code_internal(X86CPU *cpu,
                                                   TranslationBlock *tb,
-                                                  int search_pc)
+                                                  bool search_pc)
 {
+    CPUState *cs = CPU(cpu);
+    CPUX86State *env = &cpu->env;
     DisasContext dc1, *dc = &dc1;
     target_ulong pc_ptr;
     uint16_t *gen_opc_end;
@@ -8017,7 +8282,7 @@ static inline void gen_intermediate_code_internal(CPUX86State *env,
     dc->cpl = (flags >> HF_CPL_SHIFT) & 3;
     dc->iopl = (flags >> IOPL_SHIFT) & 3;
     dc->tf = (flags >> TF_SHIFT) & 1;
-    dc->singlestep_enabled = env->singlestep_enabled;
+    dc->singlestep_enabled = cs->singlestep_enabled;
     dc->cc_op = CC_OP_DYNAMIC;
     dc->cc_op_dirty = false;
     dc->cs_base = cs_base;
@@ -8028,17 +8293,17 @@ static inline void gen_intermediate_code_internal(CPUX86State *env,
     if (flags & HF_SOFTMMU_MASK) {
         dc->mem_index = (cpu_mmu_index(env) + 1) << 2;
     }
-    dc->cpuid_features = env->cpuid_features;
-    dc->cpuid_ext_features = env->cpuid_ext_features;
-    dc->cpuid_ext2_features = env->cpuid_ext2_features;
-    dc->cpuid_ext3_features = env->cpuid_ext3_features;
-    dc->cpuid_7_0_ebx_features = env->cpuid_7_0_ebx_features;
+    dc->cpuid_features = env->features[FEAT_1_EDX];
+    dc->cpuid_ext_features = env->features[FEAT_1_ECX];
+    dc->cpuid_ext2_features = env->features[FEAT_8000_0001_EDX];
+    dc->cpuid_ext3_features = env->features[FEAT_8000_0001_ECX];
+    dc->cpuid_7_0_ebx_features = env->features[FEAT_7_0_EBX];
 #ifdef TARGET_X86_64
     dc->lma = (flags >> HF_LMA_SHIFT) & 1;
     dc->code64 = (flags >> HF_CS64_SHIFT) & 1;
 #endif
     dc->flags = flags;
-    dc->jmp_opt = !(dc->tf || env->singlestep_enabled ||
+    dc->jmp_opt = !(dc->tf || cs->singlestep_enabled ||
                     (flags & HF_INHIBIT_IRQ_MASK)
 #ifndef CONFIG_SOFTMMU
                     || (flags & HF_SOFTMMU_MASK)
@@ -8059,7 +8324,6 @@ static inline void gen_intermediate_code_internal(CPUX86State *env,
     cpu_tmp2_i32 = tcg_temp_new_i32();
     cpu_tmp3_i32 = tcg_temp_new_i32();
     cpu_tmp4 = tcg_temp_new();
-    cpu_tmp5 = tcg_temp_new();
     cpu_ptr0 = tcg_temp_new_ptr();
     cpu_ptr1 = tcg_temp_new_ptr();
     cpu_cc_srcT = tcg_temp_local_new();
@@ -8074,7 +8338,7 @@ static inline void gen_intermediate_code_internal(CPUX86State *env,
     if (max_insns == 0)
         max_insns = CF_COUNT_MASK;
 
-    gen_icount_start();
+    gen_tb_start();
     for(;;) {
         if (unlikely(!QTAILQ_EMPTY(&env->breakpoints))) {
             QTAILQ_FOREACH(bp, &env->breakpoints, entry) {
@@ -8132,7 +8396,7 @@ static inline void gen_intermediate_code_internal(CPUX86State *env,
     }
     if (tb->cflags & CF_LAST_IO)
         gen_io_end();
-    gen_icount_end(tb, num_insns);
+    gen_tb_end(tb, num_insns);
     *tcg_ctx.gen_opc_ptr = INDEX_op_end;
     /* we don't forget to fill the last values */
     if (search_pc) {
@@ -8166,12 +8430,12 @@ static inline void gen_intermediate_code_internal(CPUX86State *env,
 
 void gen_intermediate_code(CPUX86State *env, TranslationBlock *tb)
 {
-    gen_intermediate_code_internal(env, tb, 0);
+    gen_intermediate_code_internal(x86_env_get_cpu(env), tb, false);
 }
 
 void gen_intermediate_code_pc(CPUX86State *env, TranslationBlock *tb)
 {
-    gen_intermediate_code_internal(env, tb, 1);
+    gen_intermediate_code_internal(x86_env_get_cpu(env), tb, true);
 }
 
 void restore_state_to_opc(CPUX86State *env, TranslationBlock *tb, int pc_pos)