git.proxmox.com Git - qemu.git/blobdiff - target-i386/translate.c
index ccb06e27f59fb7e90a44df118103272c4020a98d..065a9d320e0d6ce95f48b04700c38e7d519c65a0 100644 (file)
@@ -23,6 +23,7 @@
 #include <inttypes.h>
 #include <signal.h>
 
+#include "qemu/host-utils.h"
 #include "cpu.h"
 #include "disas/disas.h"
 #include "tcg-op.h"
@@ -36,6 +37,7 @@
 #define PREFIX_LOCK   0x04
 #define PREFIX_DATA   0x08
 #define PREFIX_ADR    0x10
+#define PREFIX_VEX    0x20
 
 #ifdef TARGET_X86_64
 #define CODE64(s) ((s)->code64)
 #define REX_B(s) 0
 #endif
 
+#ifdef TARGET_X86_64
+# define ctztl  ctz64
+# define clztl  clz64
+#else
+# define ctztl  ctz32
+# define clztl  clz32
+#endif
+
 //#define MACRO_TEST   1
 
 /* global register indexes */
 static TCGv_ptr cpu_env;
-static TCGv cpu_A0, cpu_cc_src, cpu_cc_dst, cpu_cc_tmp;
+static TCGv cpu_A0;
+static TCGv cpu_cc_dst, cpu_cc_src, cpu_cc_src2, cpu_cc_srcT;
 static TCGv_i32 cpu_cc_op;
 static TCGv cpu_regs[CPU_NB_REGS];
 /* local temps */
-static TCGv cpu_T[2], cpu_T3;
+static TCGv cpu_T[2];
 /* local register indexes (only used inside old micro ops) */
 static TCGv cpu_tmp0, cpu_tmp4;
 static TCGv_ptr cpu_ptr0, cpu_ptr1;
 static TCGv_i32 cpu_tmp2_i32, cpu_tmp3_i32;
 static TCGv_i64 cpu_tmp1_i64;
-static TCGv cpu_tmp5;
 
 static uint8_t gen_opc_cc_op[OPC_BUF_SIZE];
 
@@ -88,8 +98,11 @@ typedef struct DisasContext {
     int code64; /* 64 bit code segment */
     int rex_x, rex_b;
 #endif
+    int vex_l;  /* vex vector length */
+    int vex_v;  /* vex vvvv register, without 1's complement.  */
     int ss32;   /* 32 bit stack segment */
-    int cc_op;  /* current CC operation */
+    CCOp cc_op;  /* current CC operation */
+    bool cc_op_dirty;
     int addseg; /* non zero if either DS/ES/SS have a non zero base */
     int f_st;   /* currently unused */
     int vm86;   /* vm86 mode */
@@ -113,6 +126,7 @@ typedef struct DisasContext {
 static void gen_eob(DisasContext *s);
 static void gen_jmp(DisasContext *s, target_ulong eip);
 static void gen_jmp_tb(DisasContext *s, target_ulong eip, int tb_num);
+static void gen_op(DisasContext *s1, int op, int ot, int d);
 
 /* i386 arith/logic operations */
 enum {
@@ -173,6 +187,79 @@ enum {
     OR_A0, /* temporary register used when doing address evaluation */
 };
 
+enum {
+    USES_CC_DST  = 1,
+    USES_CC_SRC  = 2,
+    USES_CC_SRC2 = 4,
+    USES_CC_SRCT = 8,
+};
+
+/* Bit set if the global variable is live after setting CC_OP to X.  */
+static const uint8_t cc_op_live[CC_OP_NB] = {
+    [CC_OP_DYNAMIC] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
+    [CC_OP_EFLAGS] = USES_CC_SRC,
+    [CC_OP_MULB ... CC_OP_MULQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_ADDB ... CC_OP_ADDQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_ADCB ... CC_OP_ADCQ] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
+    [CC_OP_SUBB ... CC_OP_SUBQ] = USES_CC_DST | USES_CC_SRC | USES_CC_SRCT,
+    [CC_OP_SBBB ... CC_OP_SBBQ] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
+    [CC_OP_LOGICB ... CC_OP_LOGICQ] = USES_CC_DST,
+    [CC_OP_INCB ... CC_OP_INCQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_DECB ... CC_OP_DECQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_SHLB ... CC_OP_SHLQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_SARB ... CC_OP_SARQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_BMILGB ... CC_OP_BMILGQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_ADCX] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_ADOX] = USES_CC_SRC | USES_CC_SRC2,
+    [CC_OP_ADCOX] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
+    [CC_OP_CLR] = 0,
+};
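
/* Editor's note: a minimal sketch, not part of the patch, of the liveness
   arithmetic that set_cc_op() below performs with this table.  Switching
   from one CC_OP to another kills exactly the globals that the old state
   used but the new one does not; e.g. going from an ADC (DST|SRC|SRC2) to
   a LOGIC op (DST only) lets SRC and SRC2 be discarded.  Types come from
   <inttypes.h>, already included at the top of this file.  */
static inline uint8_t cc_dead_mask(uint8_t live_old, uint8_t live_new)
{
    /* e.g. cc_dead_mask(cc_op_live[CC_OP_ADCB], cc_op_live[CC_OP_LOGICB])
       == USES_CC_SRC | USES_CC_SRC2 */
    return live_old & ~live_new;
}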
+
+static void set_cc_op(DisasContext *s, CCOp op)
+{
+    int dead;
+
+    if (s->cc_op == op) {
+        return;
+    }
+
+    /* Discard CC computation that will no longer be used.  */
+    dead = cc_op_live[s->cc_op] & ~cc_op_live[op];
+    if (dead & USES_CC_DST) {
+        tcg_gen_discard_tl(cpu_cc_dst);
+    }
+    if (dead & USES_CC_SRC) {
+        tcg_gen_discard_tl(cpu_cc_src);
+    }
+    if (dead & USES_CC_SRC2) {
+        tcg_gen_discard_tl(cpu_cc_src2);
+    }
+    if (dead & USES_CC_SRCT) {
+        tcg_gen_discard_tl(cpu_cc_srcT);
+    }
+
+    if (op == CC_OP_DYNAMIC) {
+        /* The DYNAMIC setting is translator only, and should never be
+           stored.  Thus we always consider it clean.  */
+        s->cc_op_dirty = false;
+    } else {
+        /* Discard any computed CC_OP value (see shifts).  */
+        if (s->cc_op == CC_OP_DYNAMIC) {
+            tcg_gen_discard_i32(cpu_cc_op);
+        }
+        s->cc_op_dirty = true;
+    }
+    s->cc_op = op;
+}
+
+static void gen_update_cc_op(DisasContext *s)
+{
+    if (s->cc_op_dirty) {
+        tcg_gen_movi_i32(cpu_cc_op, s->cc_op);
+        s->cc_op_dirty = false;
+    }
+}
+
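/* Editor's note: a hedged usage sketch, not part of the patch; the function
   name is hypothetical.  It shows the intended protocol: flag producers
   record their CC_OP lazily via set_cc_op(), and cpu_cc_op is only written
   back by gen_update_cc_op() at points that may leave the translation
   block (helper calls, branches, end of TB).  */
static void example_flag_protocol(DisasContext *s, int ot)
{
    /* ... emit the arithmetic, fill cpu_cc_dst / cpu_cc_src ... */
    set_cc_op(s, CC_OP_ADDB + ot);   /* lazy: no TCG store emitted yet */
    /* ... later, before anything that can observe env->cc_op ... */
    gen_update_cc_op(s);             /* spill s->cc_op if still dirty */
}
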
 static inline void gen_op_movl_T0_0(void)
 {
     tcg_gen_movi_tl(cpu_T[0], 0);
@@ -439,11 +526,6 @@ static inline void gen_op_add_reg_T0(int size, int reg)
     }
 }
 
-static inline void gen_op_set_cc_op(int32_t val)
-{
-    tcg_gen_movi_i32(cpu_cc_op, val);
-}
-
 static inline void gen_op_addl_A0_reg_sN(int shift, int reg)
 {
     tcg_gen_mov_tl(cpu_tmp0, cpu_regs[reg]);
@@ -752,8 +834,7 @@ static void gen_check_io(DisasContext *s, int ot, target_ulong cur_eip,
 
     state_saved = 0;
     if (s->pe && (s->cpl > s->iopl || s->vm86)) {
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
+        gen_update_cc_op(s);
         gen_jmp_im(cur_eip);
         state_saved = 1;
         tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
@@ -771,8 +852,7 @@ static void gen_check_io(DisasContext *s, int ot, target_ulong cur_eip,
     }
     if(s->flags & HF_SVMI_MASK) {
         if (!state_saved) {
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             gen_jmp_im(cur_eip);
         }
         svm_flags |= (1 << (4 + ot));
@@ -795,17 +875,8 @@ static inline void gen_movs(DisasContext *s, int ot)
     gen_op_add_reg_T0(s->aflag, R_EDI);
 }
 
-static inline void gen_update_cc_op(DisasContext *s)
-{
-    if (s->cc_op != CC_OP_DYNAMIC) {
-        gen_op_set_cc_op(s->cc_op);
-        s->cc_op = CC_OP_DYNAMIC;
-    }
-}
-
 static void gen_op_update1_cc(void)
 {
-    tcg_gen_discard_tl(cpu_cc_src);
     tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
 }
 
@@ -815,259 +886,392 @@ static void gen_op_update2_cc(void)
     tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
 }
 
-static inline void gen_op_cmpl_T0_T1_cc(void)
+static void gen_op_update3_cc(TCGv reg)
 {
+    tcg_gen_mov_tl(cpu_cc_src2, reg);
     tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]);
-    tcg_gen_sub_tl(cpu_cc_dst, cpu_T[0], cpu_T[1]);
+    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
 }
 
 static inline void gen_op_testl_T0_T1_cc(void)
 {
-    tcg_gen_discard_tl(cpu_cc_src);
     tcg_gen_and_tl(cpu_cc_dst, cpu_T[0], cpu_T[1]);
 }
 
 static void gen_op_update_neg_cc(void)
 {
-    tcg_gen_neg_tl(cpu_cc_src, cpu_T[0]);
     tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
+    tcg_gen_neg_tl(cpu_cc_src, cpu_T[0]);
+    tcg_gen_movi_tl(cpu_cc_srcT, 0);
 }
 
-/* compute eflags.C to reg */
-static void gen_compute_eflags_c(TCGv reg)
+/* compute all eflags to cc_src */
+static void gen_compute_eflags(DisasContext *s)
 {
-    gen_helper_cc_compute_c(cpu_tmp2_i32, cpu_env, cpu_cc_op);
-    tcg_gen_extu_i32_tl(reg, cpu_tmp2_i32);
+    TCGv zero, dst, src1, src2;
+    int live, dead;
+
+    if (s->cc_op == CC_OP_EFLAGS) {
+        return;
+    }
+    if (s->cc_op == CC_OP_CLR) {
+        tcg_gen_movi_tl(cpu_cc_src, CC_Z);
+        set_cc_op(s, CC_OP_EFLAGS);
+        return;
+    }
+
+    TCGV_UNUSED(zero);
+    dst = cpu_cc_dst;
+    src1 = cpu_cc_src;
+    src2 = cpu_cc_src2;
+
+    /* Take care to not read values that are not live.  */
+    live = cc_op_live[s->cc_op] & ~USES_CC_SRCT;
+    dead = live ^ (USES_CC_DST | USES_CC_SRC | USES_CC_SRC2);
+    if (dead) {
+        zero = tcg_const_tl(0);
+        if (dead & USES_CC_DST) {
+            dst = zero;
+        }
+        if (dead & USES_CC_SRC) {
+            src1 = zero;
+        }
+        if (dead & USES_CC_SRC2) {
+            src2 = zero;
+        }
+    }
+
+    gen_update_cc_op(s);
+    gen_helper_cc_compute_all(cpu_cc_src, dst, src1, src2, cpu_cc_op);
+    set_cc_op(s, CC_OP_EFLAGS);
+
+    if (dead) {
+        tcg_temp_free(zero);
+    }
 }
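
/* Editor's note: an illustration, not part of the patch.  Only the live
   inputs are passed through; any dead global is replaced by the zero
   temporary so that cc_compute_all() never reads a discarded value.
   After a LOGIC op, for instance, only CC_DST is live and the call above
   effectively degenerates to:

       cc_src = cc_compute_all(dst, 0, 0, CC_OP_LOGICx);
*/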
 
-/* compute all eflags to cc_src */
-static void gen_compute_eflags(TCGv reg)
+typedef struct CCPrepare {
+    TCGCond cond;
+    TCGv reg;
+    TCGv reg2;
+    target_ulong imm;
+    target_ulong mask;
+    bool use_reg2;
+    bool no_setcond;
+} CCPrepare;
+
+/* compute eflags.C to reg */
+static CCPrepare gen_prepare_eflags_c(DisasContext *s, TCGv reg)
+{
+    TCGv t0, t1;
+    int size, shift;
+
+    switch (s->cc_op) {
+    case CC_OP_SUBB ... CC_OP_SUBQ:
+        /* (DATA_TYPE)CC_SRCT < (DATA_TYPE)CC_SRC */
+        size = s->cc_op - CC_OP_SUBB;
+        t1 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false);
+        /* If no temporary was used, be careful not to alias t1 and t0.  */
+        t0 = TCGV_EQUAL(t1, cpu_cc_src) ? cpu_tmp0 : reg;
+        tcg_gen_mov_tl(t0, cpu_cc_srcT);
+        gen_extu(size, t0);
+        goto add_sub;
+
+    case CC_OP_ADDB ... CC_OP_ADDQ:
+        /* (DATA_TYPE)CC_DST < (DATA_TYPE)CC_SRC */
+        size = s->cc_op - CC_OP_ADDB;
+        t1 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false);
+        t0 = gen_ext_tl(reg, cpu_cc_dst, size, false);
+    add_sub:
+        return (CCPrepare) { .cond = TCG_COND_LTU, .reg = t0,
+                             .reg2 = t1, .mask = -1, .use_reg2 = true };
+
+    case CC_OP_LOGICB ... CC_OP_LOGICQ:
+    case CC_OP_CLR:
+        return (CCPrepare) { .cond = TCG_COND_NEVER, .mask = -1 };
+
+    case CC_OP_INCB ... CC_OP_INCQ:
+    case CC_OP_DECB ... CC_OP_DECQ:
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+                             .mask = -1, .no_setcond = true };
+
+    case CC_OP_SHLB ... CC_OP_SHLQ:
+        /* (CC_SRC >> (DATA_BITS - 1)) & 1 */
+        size = s->cc_op - CC_OP_SHLB;
+        shift = (8 << size) - 1;
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+                             .mask = (target_ulong)1 << shift };
+
+    case CC_OP_MULB ... CC_OP_MULQ:
+        return (CCPrepare) { .cond = TCG_COND_NE,
+                             .reg = cpu_cc_src, .mask = -1 };
+
+    case CC_OP_BMILGB ... CC_OP_BMILGQ:
+        size = s->cc_op - CC_OP_BMILGB;
+        t0 = gen_ext_tl(reg, cpu_cc_src, size, false);
+        return (CCPrepare) { .cond = TCG_COND_EQ, .reg = t0, .mask = -1 };
+
+    case CC_OP_ADCX:
+    case CC_OP_ADCOX:
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_dst,
+                             .mask = -1, .no_setcond = true };
+
+    case CC_OP_EFLAGS:
+    case CC_OP_SARB ... CC_OP_SARQ:
+        /* CC_SRC & 1 */
+        return (CCPrepare) { .cond = TCG_COND_NE,
+                             .reg = cpu_cc_src, .mask = CC_C };
+
+    default:
+       /* The need to compute only C from CC_OP_DYNAMIC is important
+          in efficiently implementing e.g. INC at the start of a TB.  */
+       gen_update_cc_op(s);
+       gen_helper_cc_compute_c(reg, cpu_cc_dst, cpu_cc_src,
+                               cpu_cc_src2, cpu_cc_op);
+       return (CCPrepare) { .cond = TCG_COND_NE, .reg = reg,
+                            .mask = -1, .no_setcond = true };
+    }
+}
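
/* Editor's note: a minimal C model, not part of the patch, of the most
   common preparation above.  For SUB, CF is an unsigned compare of the
   original destination (saved in cc_srcT) against the subtrahend; no
   EFLAGS word is ever materialized.  uint32_t comes from <inttypes.h>,
   already included above.  */
static inline int model_sub_cf32(uint32_t srcT, uint32_t src)
{
    return srcT < src;          /* TCG_COND_LTU over (srcT, src) */
}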
+
+/* compute eflags.P to reg */
+static CCPrepare gen_prepare_eflags_p(DisasContext *s, TCGv reg)
 {
-    gen_helper_cc_compute_all(cpu_tmp2_i32, cpu_env, cpu_cc_op);
-    tcg_gen_extu_i32_tl(reg, cpu_tmp2_i32);
+    gen_compute_eflags(s);
+    return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+                         .mask = CC_P };
 }
 
-static inline void gen_setcc_slow_T0(DisasContext *s, int jcc_op)
+/* compute eflags.S to reg */
+static CCPrepare gen_prepare_eflags_s(DisasContext *s, TCGv reg)
 {
-    if (s->cc_op != CC_OP_DYNAMIC)
-        gen_op_set_cc_op(s->cc_op);
-    switch(jcc_op) {
-    case JCC_O:
-        gen_compute_eflags(cpu_T[0]);
-        tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 11);
-        tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1);
-        break;
-    case JCC_B:
-        gen_compute_eflags_c(cpu_T[0]);
-        break;
-    case JCC_Z:
-        gen_compute_eflags(cpu_T[0]);
-        tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 6);
-        tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1);
-        break;
-    case JCC_BE:
-        gen_compute_eflags(cpu_tmp0);
-        tcg_gen_shri_tl(cpu_T[0], cpu_tmp0, 6);
-        tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
-        tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1);
-        break;
-    case JCC_S:
-        gen_compute_eflags(cpu_T[0]);
-        tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 7);
-        tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1);
-        break;
-    case JCC_P:
-        gen_compute_eflags(cpu_T[0]);
-        tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 2);
-        tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1);
-        break;
-    case JCC_L:
-        gen_compute_eflags(cpu_tmp0);
-        tcg_gen_shri_tl(cpu_T[0], cpu_tmp0, 11); /* CC_O */
-        tcg_gen_shri_tl(cpu_tmp0, cpu_tmp0, 7); /* CC_S */
-        tcg_gen_xor_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
-        tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1);
-        break;
+    switch (s->cc_op) {
+    case CC_OP_DYNAMIC:
+        gen_compute_eflags(s);
+        /* FALLTHRU */
+    case CC_OP_EFLAGS:
+    case CC_OP_ADCX:
+    case CC_OP_ADOX:
+    case CC_OP_ADCOX:
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+                             .mask = CC_S };
+    case CC_OP_CLR:
+        return (CCPrepare) { .cond = TCG_COND_NEVER, .mask = -1 };
     default:
-    case JCC_LE:
-        gen_compute_eflags(cpu_tmp0);
-        tcg_gen_shri_tl(cpu_T[0], cpu_tmp0, 11); /* CC_O */
-        tcg_gen_shri_tl(cpu_tmp4, cpu_tmp0, 7); /* CC_S */
-        tcg_gen_shri_tl(cpu_tmp0, cpu_tmp0, 6); /* CC_Z */
-        tcg_gen_xor_tl(cpu_T[0], cpu_T[0], cpu_tmp4);
-        tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
-        tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1);
-        break;
+        {
+            int size = (s->cc_op - CC_OP_ADDB) & 3;
+            TCGv t0 = gen_ext_tl(reg, cpu_cc_dst, size, true);
+            return (CCPrepare) { .cond = TCG_COND_LT, .reg = t0, .mask = -1 };
+        }
     }
 }
 
-/* return true if setcc_slow is not needed (WARNING: must be kept in
-   sync with gen_jcc1) */
-static int is_fast_jcc_case(DisasContext *s, int b)
+/* compute eflags.O to reg */
+static CCPrepare gen_prepare_eflags_o(DisasContext *s, TCGv reg)
 {
-    int jcc_op;
-    jcc_op = (b >> 1) & 7;
-    switch(s->cc_op) {
-        /* we optimize the cmp/jcc case */
-    case CC_OP_SUBB:
-    case CC_OP_SUBW:
-    case CC_OP_SUBL:
-    case CC_OP_SUBQ:
-        if (jcc_op == JCC_O || jcc_op == JCC_P)
-            goto slow_jcc;
-        break;
+    switch (s->cc_op) {
+    case CC_OP_ADOX:
+    case CC_OP_ADCOX:
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src2,
+                             .mask = -1, .no_setcond = true };
+    case CC_OP_CLR:
+        return (CCPrepare) { .cond = TCG_COND_NEVER, .mask = -1 };
+    default:
+        gen_compute_eflags(s);
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+                             .mask = CC_O };
+    }
+}
 
-        /* some jumps are easy to compute */
-    case CC_OP_ADDB:
-    case CC_OP_ADDW:
-    case CC_OP_ADDL:
-    case CC_OP_ADDQ:
-
-    case CC_OP_LOGICB:
-    case CC_OP_LOGICW:
-    case CC_OP_LOGICL:
-    case CC_OP_LOGICQ:
-
-    case CC_OP_INCB:
-    case CC_OP_INCW:
-    case CC_OP_INCL:
-    case CC_OP_INCQ:
-
-    case CC_OP_DECB:
-    case CC_OP_DECW:
-    case CC_OP_DECL:
-    case CC_OP_DECQ:
-
-    case CC_OP_SHLB:
-    case CC_OP_SHLW:
-    case CC_OP_SHLL:
-    case CC_OP_SHLQ:
-        if (jcc_op != JCC_Z && jcc_op != JCC_S)
-            goto slow_jcc;
-        break;
+/* compute eflags.Z to reg */
+static CCPrepare gen_prepare_eflags_z(DisasContext *s, TCGv reg)
+{
+    switch (s->cc_op) {
+    case CC_OP_DYNAMIC:
+        gen_compute_eflags(s);
+        /* FALLTHRU */
+    case CC_OP_EFLAGS:
+    case CC_OP_ADCX:
+    case CC_OP_ADOX:
+    case CC_OP_ADCOX:
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+                             .mask = CC_Z };
+    case CC_OP_CLR:
+        return (CCPrepare) { .cond = TCG_COND_ALWAYS, .mask = -1 };
     default:
-    slow_jcc:
-        return 0;
+        {
+            int size = (s->cc_op - CC_OP_ADDB) & 3;
+            TCGv t0 = gen_ext_tl(reg, cpu_cc_dst, size, false);
+            return (CCPrepare) { .cond = TCG_COND_EQ, .reg = t0, .mask = -1 };
+        }
     }
-    return 1;
 }
 
-/* generate a conditional jump to label 'l1' according to jump opcode
+/* perform a conditional store into register 'reg' according to jump opcode
    value 'b'. In the fast case, T0 is guaranteed not to be used. */
-static inline void gen_jcc1(DisasContext *s, int cc_op, int b, int l1)
+static CCPrepare gen_prepare_cc(DisasContext *s, int b, TCGv reg)
 {
     int inv, jcc_op, size, cond;
+    CCPrepare cc;
     TCGv t0;
 
     inv = b & 1;
     jcc_op = (b >> 1) & 7;
 
-    switch(cc_op) {
-        /* we optimize the cmp/jcc case */
-    case CC_OP_SUBB:
-    case CC_OP_SUBW:
-    case CC_OP_SUBL:
-    case CC_OP_SUBQ:
-        
-        size = cc_op - CC_OP_SUBB;
-        switch(jcc_op) {
-        case JCC_Z:
-        fast_jcc_z:
-            t0 = gen_ext_tl(cpu_tmp0, cpu_cc_dst, size, false);
-            tcg_gen_brcondi_tl(inv ? TCG_COND_NE : TCG_COND_EQ, t0, 0, l1);
-            break;
-        case JCC_S:
-        fast_jcc_s:
-            t0 = gen_ext_tl(cpu_tmp0, cpu_cc_dst, size, true);
-            tcg_gen_brcondi_tl(inv ? TCG_COND_GE : TCG_COND_LT, t0, 0, l1);
-            break;
-
-        case JCC_B:
-            cond = inv ? TCG_COND_GEU : TCG_COND_LTU;
-            goto fast_jcc_b;
+    switch (s->cc_op) {
+    case CC_OP_SUBB ... CC_OP_SUBQ:
+        /* We optimize relational operators for the cmp/jcc case.  */
+        size = s->cc_op - CC_OP_SUBB;
+        switch (jcc_op) {
         case JCC_BE:
-            cond = inv ? TCG_COND_GTU : TCG_COND_LEU;
-        fast_jcc_b:
-            tcg_gen_add_tl(cpu_tmp4, cpu_cc_dst, cpu_cc_src);
+            tcg_gen_mov_tl(cpu_tmp4, cpu_cc_srcT);
             gen_extu(size, cpu_tmp4);
             t0 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false);
-            tcg_gen_brcond_tl(cond, cpu_tmp4, t0, l1);
+            cc = (CCPrepare) { .cond = TCG_COND_LEU, .reg = cpu_tmp4,
+                               .reg2 = t0, .mask = -1, .use_reg2 = true };
             break;
-            
+
         case JCC_L:
-            cond = inv ? TCG_COND_GE : TCG_COND_LT;
+            cond = TCG_COND_LT;
             goto fast_jcc_l;
         case JCC_LE:
-            cond = inv ? TCG_COND_GT : TCG_COND_LE;
+            cond = TCG_COND_LE;
         fast_jcc_l:
-            tcg_gen_add_tl(cpu_tmp4, cpu_cc_dst, cpu_cc_src);
+            tcg_gen_mov_tl(cpu_tmp4, cpu_cc_srcT);
             gen_exts(size, cpu_tmp4);
             t0 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, true);
-            tcg_gen_brcond_tl(cond, cpu_tmp4, t0, l1);
+            cc = (CCPrepare) { .cond = cond, .reg = cpu_tmp4,
+                               .reg2 = t0, .mask = -1, .use_reg2 = true };
             break;
-            
+
         default:
             goto slow_jcc;
         }
         break;
-        
-        /* some jumps are easy to compute */
-    case CC_OP_ADDB:
-    case CC_OP_ADDW:
-    case CC_OP_ADDL:
-    case CC_OP_ADDQ:
-        
-    case CC_OP_ADCB:
-    case CC_OP_ADCW:
-    case CC_OP_ADCL:
-    case CC_OP_ADCQ:
-        
-    case CC_OP_SBBB:
-    case CC_OP_SBBW:
-    case CC_OP_SBBL:
-    case CC_OP_SBBQ:
-        
-    case CC_OP_LOGICB:
-    case CC_OP_LOGICW:
-    case CC_OP_LOGICL:
-    case CC_OP_LOGICQ:
-        
-    case CC_OP_INCB:
-    case CC_OP_INCW:
-    case CC_OP_INCL:
-    case CC_OP_INCQ:
-        
-    case CC_OP_DECB:
-    case CC_OP_DECW:
-    case CC_OP_DECL:
-    case CC_OP_DECQ:
-        
-    case CC_OP_SHLB:
-    case CC_OP_SHLW:
-    case CC_OP_SHLL:
-    case CC_OP_SHLQ:
-        
-    case CC_OP_SARB:
-    case CC_OP_SARW:
-    case CC_OP_SARL:
-    case CC_OP_SARQ:
-        switch(jcc_op) {
+
+    default:
+    slow_jcc:
+        /* This actually generates good code for JC, JZ and JS.  */
+        switch (jcc_op) {
+        case JCC_O:
+            cc = gen_prepare_eflags_o(s, reg);
+            break;
+        case JCC_B:
+            cc = gen_prepare_eflags_c(s, reg);
+            break;
         case JCC_Z:
-            size = (cc_op - CC_OP_ADDB) & 3;
-            goto fast_jcc_z;
+            cc = gen_prepare_eflags_z(s, reg);
+            break;
+        case JCC_BE:
+            gen_compute_eflags(s);
+            cc = (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+                               .mask = CC_Z | CC_C };
+            break;
         case JCC_S:
-            size = (cc_op - CC_OP_ADDB) & 3;
-            goto fast_jcc_s;
+            cc = gen_prepare_eflags_s(s, reg);
+            break;
+        case JCC_P:
+            cc = gen_prepare_eflags_p(s, reg);
+            break;
+        case JCC_L:
+            gen_compute_eflags(s);
+            if (TCGV_EQUAL(reg, cpu_cc_src)) {
+                reg = cpu_tmp0;
+            }
+            tcg_gen_shri_tl(reg, cpu_cc_src, 4); /* CC_O -> CC_S */
+            tcg_gen_xor_tl(reg, reg, cpu_cc_src);
+            cc = (CCPrepare) { .cond = TCG_COND_NE, .reg = reg,
+                               .mask = CC_S };
+            break;
         default:
-            goto slow_jcc;
+        case JCC_LE:
+            gen_compute_eflags(s);
+            if (TCGV_EQUAL(reg, cpu_cc_src)) {
+                reg = cpu_tmp0;
+            }
+            tcg_gen_shri_tl(reg, cpu_cc_src, 4); /* CC_O -> CC_S */
+            tcg_gen_xor_tl(reg, reg, cpu_cc_src);
+            cc = (CCPrepare) { .cond = TCG_COND_NE, .reg = reg,
+                               .mask = CC_S | CC_Z };
+            break;
         }
         break;
-    default:
-    slow_jcc:
-        gen_setcc_slow_T0(s, jcc_op);
-        tcg_gen_brcondi_tl(inv ? TCG_COND_EQ : TCG_COND_NE, 
-                           cpu_T[0], 0, l1);
-        break;
+    }
+
+    if (inv) {
+        cc.cond = tcg_invert_cond(cc.cond);
+    }
+    return cc;
+}
+
+static void gen_setcc1(DisasContext *s, int b, TCGv reg)
+{
+    CCPrepare cc = gen_prepare_cc(s, b, reg);
+
+    if (cc.no_setcond) {
+        if (cc.cond == TCG_COND_EQ) {
+            tcg_gen_xori_tl(reg, cc.reg, 1);
+        } else {
+            tcg_gen_mov_tl(reg, cc.reg);
+        }
+        return;
+    }
+
+    if (cc.cond == TCG_COND_NE && !cc.use_reg2 && cc.imm == 0 &&
+        cc.mask != 0 && (cc.mask & (cc.mask - 1)) == 0) {
+        tcg_gen_shri_tl(reg, cc.reg, ctztl(cc.mask));
+        tcg_gen_andi_tl(reg, reg, 1);
+        return;
+    }
+    if (cc.mask != -1) {
+        tcg_gen_andi_tl(reg, cc.reg, cc.mask);
+        cc.reg = reg;
+    }
+    if (cc.use_reg2) {
+        tcg_gen_setcond_tl(cc.cond, reg, cc.reg, cc.reg2);
+    } else {
+        tcg_gen_setcondi_tl(cc.cond, reg, cc.reg, cc.imm);
+    }
+}
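
/* Editor's note: a worked instance, not part of the patch, of the
   power-of-two fast path above.  For SETZ the preparation is
   { cond = TCG_COND_NE, reg = cpu_cc_src, mask = CC_Z = 0x40 }; since
   the mask has a single bit set, the result is a shift by
   ctztl(0x40) == 6 plus an AND, with no setcond at all.  */
static inline uint32_t model_setz(uint32_t eflags)
{
    return (eflags >> 6) & 1;   /* 6 == ctz(CC_Z) */
}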
+
+static inline void gen_compute_eflags_c(DisasContext *s, TCGv reg)
+{
+    gen_setcc1(s, JCC_B << 1, reg);
+}
+
+/* generate a conditional jump to label 'l1' according to jump opcode
+   value 'b'. In the fast case, T0 is guaranteed not to be used. */
+static inline void gen_jcc1_noeob(DisasContext *s, int b, int l1)
+{
+    CCPrepare cc = gen_prepare_cc(s, b, cpu_T[0]);
+
+    if (cc.mask != -1) {
+        tcg_gen_andi_tl(cpu_T[0], cc.reg, cc.mask);
+        cc.reg = cpu_T[0];
+    }
+    if (cc.use_reg2) {
+        tcg_gen_brcond_tl(cc.cond, cc.reg, cc.reg2, l1);
+    } else {
+        tcg_gen_brcondi_tl(cc.cond, cc.reg, cc.imm, l1);
+    }
+}
+
+/* Generate a conditional jump to label 'l1' according to jump opcode
+   value 'b'. In the fast case, T0 is guaranteed not to be used.
+   A translation block must end soon.  */
+static inline void gen_jcc1(DisasContext *s, int b, int l1)
+{
+    CCPrepare cc = gen_prepare_cc(s, b, cpu_T[0]);
+
+    gen_update_cc_op(s);
+    if (cc.mask != -1) {
+        tcg_gen_andi_tl(cpu_T[0], cc.reg, cc.mask);
+        cc.reg = cpu_T[0];
+    }
+    set_cc_op(s, CC_OP_DYNAMIC);
+    if (cc.use_reg2) {
+        tcg_gen_brcond_tl(cc.cond, cc.reg, cc.reg2, l1);
+    } else {
+        tcg_gen_brcondi_tl(cc.cond, cc.reg, cc.imm, l1);
     }
 }
 
@@ -1106,21 +1310,19 @@ static inline void gen_lods(DisasContext *s, int ot)
 
 static inline void gen_scas(DisasContext *s, int ot)
 {
-    gen_op_mov_TN_reg(OT_LONG, 0, R_EAX);
     gen_string_movl_A0_EDI(s);
     gen_op_ld_T1_A0(ot + s->mem_index);
-    gen_op_cmpl_T0_T1_cc();
+    gen_op(s, OP_CMPL, ot, R_EAX);
     gen_op_movl_T0_Dshift(ot);
     gen_op_add_reg_T0(s->aflag, R_EDI);
 }
 
 static inline void gen_cmps(DisasContext *s, int ot)
 {
-    gen_string_movl_A0_ESI(s);
-    gen_op_ld_T0_A0(ot + s->mem_index);
     gen_string_movl_A0_EDI(s);
     gen_op_ld_T1_A0(ot + s->mem_index);
-    gen_op_cmpl_T0_T1_cc();
+    gen_string_movl_A0_ESI(s);
+    gen_op(s, OP_CMPL, ot, OR_TMP0);
     gen_op_movl_T0_Dshift(ot);
     gen_op_add_reg_T0(s->aflag, R_ESI);
     gen_op_add_reg_T0(s->aflag, R_EDI);
@@ -1194,8 +1396,8 @@ static inline void gen_repz_ ## op(DisasContext *s, int ot,                   \
     l2 = gen_jz_ecx_string(s, next_eip);                                      \
     gen_ ## op(s, ot);                                                        \
     gen_op_add_reg_im(s->aflag, R_ECX, -1);                                   \
-    gen_op_set_cc_op(CC_OP_SUBB + ot);                                        \
-    gen_jcc1(s, CC_OP_SUBB + ot, (JCC_Z << 1) | (nz ^ 1), l2);                \
+    gen_update_cc_op(s);                                                      \
+    gen_jcc1(s, (JCC_Z << 1) | (nz ^ 1), l2);                                 \
     if (!s->jmp_opt)                                                          \
         gen_op_jz_ecx(s->aflag, l2);                                          \
     gen_jmp(s, cur_eip);                                                      \
@@ -1275,38 +1477,26 @@ static void gen_op(DisasContext *s1, int op, int ot, int d)
     }
     switch(op) {
     case OP_ADCL:
-        if (s1->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s1->cc_op);
-        gen_compute_eflags_c(cpu_tmp4);
+        gen_compute_eflags_c(s1, cpu_tmp4);
         tcg_gen_add_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
         tcg_gen_add_tl(cpu_T[0], cpu_T[0], cpu_tmp4);
         if (d != OR_TMP0)
             gen_op_mov_reg_T0(ot, d);
         else
             gen_op_st_T0_A0(ot + s1->mem_index);
-        tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]);
-        tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-        tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_tmp4);
-        tcg_gen_shli_i32(cpu_tmp2_i32, cpu_tmp2_i32, 2);
-        tcg_gen_addi_i32(cpu_cc_op, cpu_tmp2_i32, CC_OP_ADDB + ot);
-        s1->cc_op = CC_OP_DYNAMIC;
+        gen_op_update3_cc(cpu_tmp4);
+        set_cc_op(s1, CC_OP_ADCB + ot);
         break;
     case OP_SBBL:
-        if (s1->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s1->cc_op);
-        gen_compute_eflags_c(cpu_tmp4);
+        gen_compute_eflags_c(s1, cpu_tmp4);
         tcg_gen_sub_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
         tcg_gen_sub_tl(cpu_T[0], cpu_T[0], cpu_tmp4);
         if (d != OR_TMP0)
             gen_op_mov_reg_T0(ot, d);
         else
             gen_op_st_T0_A0(ot + s1->mem_index);
-        tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]);
-        tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-        tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_tmp4);
-        tcg_gen_shli_i32(cpu_tmp2_i32, cpu_tmp2_i32, 2);
-        tcg_gen_addi_i32(cpu_cc_op, cpu_tmp2_i32, CC_OP_SUBB + ot);
-        s1->cc_op = CC_OP_DYNAMIC;
+        gen_op_update3_cc(cpu_tmp4);
+        set_cc_op(s1, CC_OP_SBBB + ot);
         break;
     case OP_ADDL:
         gen_op_addl_T0_T1();
@@ -1315,16 +1505,17 @@ static void gen_op(DisasContext *s1, int op, int ot, int d)
         else
             gen_op_st_T0_A0(ot + s1->mem_index);
         gen_op_update2_cc();
-        s1->cc_op = CC_OP_ADDB + ot;
+        set_cc_op(s1, CC_OP_ADDB + ot);
         break;
     case OP_SUBL:
+        tcg_gen_mov_tl(cpu_cc_srcT, cpu_T[0]);
         tcg_gen_sub_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
         if (d != OR_TMP0)
             gen_op_mov_reg_T0(ot, d);
         else
             gen_op_st_T0_A0(ot + s1->mem_index);
         gen_op_update2_cc();
-        s1->cc_op = CC_OP_SUBB + ot;
+        set_cc_op(s1, CC_OP_SUBB + ot);
         break;
     default:
     case OP_ANDL:
@@ -1334,7 +1525,7 @@ static void gen_op(DisasContext *s1, int op, int ot, int d)
         else
             gen_op_st_T0_A0(ot + s1->mem_index);
         gen_op_update1_cc();
-        s1->cc_op = CC_OP_LOGICB + ot;
+        set_cc_op(s1, CC_OP_LOGICB + ot);
         break;
     case OP_ORL:
         tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
@@ -1343,7 +1534,7 @@ static void gen_op(DisasContext *s1, int op, int ot, int d)
         else
             gen_op_st_T0_A0(ot + s1->mem_index);
         gen_op_update1_cc();
-        s1->cc_op = CC_OP_LOGICB + ot;
+        set_cc_op(s1, CC_OP_LOGICB + ot);
         break;
     case OP_XORL:
         tcg_gen_xor_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
@@ -1352,11 +1543,13 @@ static void gen_op(DisasContext *s1, int op, int ot, int d)
         else
             gen_op_st_T0_A0(ot + s1->mem_index);
         gen_op_update1_cc();
-        s1->cc_op = CC_OP_LOGICB + ot;
+        set_cc_op(s1, CC_OP_LOGICB + ot);
         break;
     case OP_CMPL:
-        gen_op_cmpl_T0_T1_cc();
-        s1->cc_op = CC_OP_SUBB + ot;
+        tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]);
+        tcg_gen_mov_tl(cpu_cc_srcT, cpu_T[0]);
+        tcg_gen_sub_tl(cpu_cc_dst, cpu_T[0], cpu_T[1]);
+        set_cc_op(s1, CC_OP_SUBB + ot);
         break;
     }
 }
@@ -1368,35 +1561,70 @@ static void gen_inc(DisasContext *s1, int ot, int d, int c)
         gen_op_mov_TN_reg(ot, 0, d);
     else
         gen_op_ld_T0_A0(ot + s1->mem_index);
-    if (s1->cc_op != CC_OP_DYNAMIC)
-        gen_op_set_cc_op(s1->cc_op);
+    gen_compute_eflags_c(s1, cpu_cc_src);
     if (c > 0) {
         tcg_gen_addi_tl(cpu_T[0], cpu_T[0], 1);
-        s1->cc_op = CC_OP_INCB + ot;
+        set_cc_op(s1, CC_OP_INCB + ot);
     } else {
         tcg_gen_addi_tl(cpu_T[0], cpu_T[0], -1);
-        s1->cc_op = CC_OP_DECB + ot;
+        set_cc_op(s1, CC_OP_DECB + ot);
     }
     if (d != OR_TMP0)
         gen_op_mov_reg_T0(ot, d);
     else
         gen_op_st_T0_A0(ot + s1->mem_index);
-    gen_compute_eflags_c(cpu_cc_src);
     tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
 }
 
-static void gen_shift_rm_T1(DisasContext *s, int ot, int op1, 
-                            int is_right, int is_arith)
+static void gen_shift_flags(DisasContext *s, int ot, TCGv result, TCGv shm1,
+                            TCGv count, bool is_right)
 {
-    target_ulong mask;
-    int shift_label;
-    TCGv t0, t1, t2;
+    TCGv_i32 z32, s32, oldop;
+    TCGv z_tl;
 
-    if (ot == OT_QUAD) {
-        mask = 0x3f;
+    /* Store the results into the CC variables.  If we know that the
+       variable must be dead, store unconditionally.  Otherwise we'll
+       need to not disrupt the current contents.  */
+    z_tl = tcg_const_tl(0);
+    if (cc_op_live[s->cc_op] & USES_CC_DST) {
+        tcg_gen_movcond_tl(TCG_COND_NE, cpu_cc_dst, count, z_tl,
+                           result, cpu_cc_dst);
+    } else {
+        tcg_gen_mov_tl(cpu_cc_dst, result);
+    }
+    if (cc_op_live[s->cc_op] & USES_CC_SRC) {
+        tcg_gen_movcond_tl(TCG_COND_NE, cpu_cc_src, count, z_tl,
+                           shm1, cpu_cc_src);
     } else {
-        mask = 0x1f;
+        tcg_gen_mov_tl(cpu_cc_src, shm1);
     }
+    tcg_temp_free(z_tl);
+
+    /* Get the two potential CC_OP values into temporaries.  */
+    tcg_gen_movi_i32(cpu_tmp2_i32, (is_right ? CC_OP_SARB : CC_OP_SHLB) + ot);
+    if (s->cc_op == CC_OP_DYNAMIC) {
+        oldop = cpu_cc_op;
+    } else {
+        tcg_gen_movi_i32(cpu_tmp3_i32, s->cc_op);
+        oldop = cpu_tmp3_i32;
+    }
+
+    /* Conditionally store the CC_OP value.  */
+    z32 = tcg_const_i32(0);
+    s32 = tcg_temp_new_i32();
+    tcg_gen_trunc_tl_i32(s32, count);
+    tcg_gen_movcond_i32(TCG_COND_NE, cpu_cc_op, s32, z32, cpu_tmp2_i32, oldop);
+    tcg_temp_free_i32(z32);
+    tcg_temp_free_i32(s32);
+
+    /* The CC_OP value is no longer predictable.  */
+    set_cc_op(s, CC_OP_DYNAMIC);
+}
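
/* Editor's note: a branchless C model, not part of the patch, of the
   count == 0 rule implemented with movcond above: x86 shifts by a zero
   count leave all flags (and CC_OP) unchanged, so each CC variable takes
   its new value only when the masked count is non-zero.  */
static inline uint32_t model_cc_select(uint32_t count, uint32_t newv,
                                       uint32_t oldv)
{
    return count != 0 ? newv : oldv;   /* tcg_gen_movcond_tl(NE, ...) */
}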
+
+static void gen_shift_rm_T1(DisasContext *s, int ot, int op1, 
+                            int is_right, int is_arith)
+{
+    target_ulong mask = (ot == OT_QUAD ? 0x3f : 0x1f);
 
     /* load */
     if (op1 == OR_TMP0) {
@@ -1405,25 +1633,22 @@ static void gen_shift_rm_T1(DisasContext *s, int ot, int op1,
         gen_op_mov_TN_reg(ot, 0, op1);
     }
 
-    t0 = tcg_temp_local_new();
-    t1 = tcg_temp_local_new();
-    t2 = tcg_temp_local_new();
-
-    tcg_gen_andi_tl(t2, cpu_T[1], mask);
+    tcg_gen_andi_tl(cpu_T[1], cpu_T[1], mask);
+    tcg_gen_subi_tl(cpu_tmp0, cpu_T[1], 1);
 
     if (is_right) {
         if (is_arith) {
             gen_exts(ot, cpu_T[0]);
-            tcg_gen_mov_tl(t0, cpu_T[0]);
-            tcg_gen_sar_tl(cpu_T[0], cpu_T[0], t2);
+            tcg_gen_sar_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
+            tcg_gen_sar_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
         } else {
             gen_extu(ot, cpu_T[0]);
-            tcg_gen_mov_tl(t0, cpu_T[0]);
-            tcg_gen_shr_tl(cpu_T[0], cpu_T[0], t2);
+            tcg_gen_shr_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
+            tcg_gen_shr_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
         }
     } else {
-        tcg_gen_mov_tl(t0, cpu_T[0]);
-        tcg_gen_shl_tl(cpu_T[0], cpu_T[0], t2);
+        tcg_gen_shl_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
+        tcg_gen_shl_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
     }
 
     /* store */
@@ -1433,52 +1658,13 @@ static void gen_shift_rm_T1(DisasContext *s, int ot, int op1,
         gen_op_mov_reg_T0(ot, op1);
     }
 
-    /* update eflags if non zero shift */
-    if (s->cc_op != CC_OP_DYNAMIC) {
-        gen_op_set_cc_op(s->cc_op);
-    }
-
-    tcg_gen_mov_tl(t1, cpu_T[0]);
-
-    shift_label = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, shift_label);
-
-    tcg_gen_addi_tl(t2, t2, -1);
-    tcg_gen_mov_tl(cpu_cc_dst, t1);
-
-    if (is_right) {
-        if (is_arith) {
-            tcg_gen_sar_tl(cpu_cc_src, t0, t2);
-        } else {
-            tcg_gen_shr_tl(cpu_cc_src, t0, t2);
-        }
-    } else {
-        tcg_gen_shl_tl(cpu_cc_src, t0, t2);
-    }
-
-    if (is_right) {
-        tcg_gen_movi_i32(cpu_cc_op, CC_OP_SARB + ot);
-    } else {
-        tcg_gen_movi_i32(cpu_cc_op, CC_OP_SHLB + ot);
-    }
-
-    gen_set_label(shift_label);
-    s->cc_op = CC_OP_DYNAMIC; /* cannot predict flags after */
-
-    tcg_temp_free(t0);
-    tcg_temp_free(t1);
-    tcg_temp_free(t2);
+    gen_shift_flags(s, ot, cpu_T[0], cpu_tmp0, cpu_T[1], is_right);
 }
 
 static void gen_shift_rm_im(DisasContext *s, int ot, int op1, int op2,
                             int is_right, int is_arith)
 {
-    int mask;
-    
-    if (ot == OT_QUAD)
-        mask = 0x3f;
-    else
-        mask = 0x1f;
+    int mask = (ot == OT_QUAD ? 0x3f : 0x1f);
 
     /* load */
     if (op1 == OR_TMP0)
@@ -1514,10 +1700,7 @@ static void gen_shift_rm_im(DisasContext *s, int ot, int op1, int op2,
     if (op2 != 0) {
         tcg_gen_mov_tl(cpu_cc_src, cpu_tmp4);
         tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-        if (is_right)
-            s->cc_op = CC_OP_SARB + ot;
-        else
-            s->cc_op = CC_OP_SHLB + ot;
+        set_cc_op(s, (is_right ? CC_OP_SARB : CC_OP_SHLB) + ot);
     }
 }
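
/* Editor's note: the arithmetic behind the cc_src value stored above, as
   a sketch that is not part of the patch.  cpu_tmp4 holds the operand
   shifted by op2 - 1, so for a non-zero immediate SHL the carry-out is
   simply the top bit of that intermediate value.  */
static inline int model_shl_cf32(uint32_t x, unsigned n)  /* 1 <= n <= 31 */
{
    return (x << (n - 1)) >> 31;
}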
 
@@ -1529,187 +1712,182 @@ static inline void tcg_gen_lshift(TCGv ret, TCGv arg1, target_long arg2)
         tcg_gen_shri_tl(ret, arg1, -arg2);
 }
 
-static void gen_rot_rm_T1(DisasContext *s, int ot, int op1, 
-                          int is_right)
+static void gen_rot_rm_T1(DisasContext *s, int ot, int op1, int is_right)
 {
-    target_ulong mask;
-    int label1, label2, data_bits;
-    TCGv t0, t1, t2, a0;
-
-    /* XXX: inefficient, but we must use local temps */
-    t0 = tcg_temp_local_new();
-    t1 = tcg_temp_local_new();
-    t2 = tcg_temp_local_new();
-    a0 = tcg_temp_local_new();
-
-    if (ot == OT_QUAD)
-        mask = 0x3f;
-    else
-        mask = 0x1f;
+    target_ulong mask = (ot == OT_QUAD ? 0x3f : 0x1f);
+    TCGv_i32 t0, t1;
 
     /* load */
     if (op1 == OR_TMP0) {
-        tcg_gen_mov_tl(a0, cpu_A0);
-        gen_op_ld_v(ot + s->mem_index, t0, a0);
+        gen_op_ld_T0_A0(ot + s->mem_index);
     } else {
-        gen_op_mov_v_reg(ot, t0, op1);
+        gen_op_mov_TN_reg(ot, 0, op1);
     }
 
-    tcg_gen_mov_tl(t1, cpu_T[1]);
-
-    tcg_gen_andi_tl(t1, t1, mask);
+    tcg_gen_andi_tl(cpu_T[1], cpu_T[1], mask);
 
-    /* Must test zero case to avoid using undefined behaviour in TCG
-       shifts. */
-    label1 = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_EQ, t1, 0, label1);
-    
-    if (ot <= OT_WORD)
-        tcg_gen_andi_tl(cpu_tmp0, t1, (1 << (3 + ot)) - 1);
-    else
-        tcg_gen_mov_tl(cpu_tmp0, t1);
-    
-    gen_extu(ot, t0);
-    tcg_gen_mov_tl(t2, t0);
-
-    data_bits = 8 << ot;
-    /* XXX: rely on behaviour of shifts when operand 2 overflows (XXX:
-       fix TCG definition) */
-    if (is_right) {
-        tcg_gen_shr_tl(cpu_tmp4, t0, cpu_tmp0);
-        tcg_gen_subfi_tl(cpu_tmp0, data_bits, cpu_tmp0);
-        tcg_gen_shl_tl(t0, t0, cpu_tmp0);
-    } else {
-        tcg_gen_shl_tl(cpu_tmp4, t0, cpu_tmp0);
-        tcg_gen_subfi_tl(cpu_tmp0, data_bits, cpu_tmp0);
-        tcg_gen_shr_tl(t0, t0, cpu_tmp0);
+    switch (ot) {
+    case OT_BYTE:
+        /* Replicate the 8-bit input so that a 32-bit rotate works.  */
+        tcg_gen_ext8u_tl(cpu_T[0], cpu_T[0]);
+        tcg_gen_muli_tl(cpu_T[0], cpu_T[0], 0x01010101);
+        goto do_long;
+    case OT_WORD:
+        /* Replicate the 16-bit input so that a 32-bit rotate works.  */
+        tcg_gen_deposit_tl(cpu_T[0], cpu_T[0], cpu_T[0], 16, 16);
+        goto do_long;
+    do_long:
+#ifdef TARGET_X86_64
+    case OT_LONG:
+        tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+        tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_T[1]);
+        if (is_right) {
+            tcg_gen_rotr_i32(cpu_tmp2_i32, cpu_tmp2_i32, cpu_tmp3_i32);
+        } else {
+            tcg_gen_rotl_i32(cpu_tmp2_i32, cpu_tmp2_i32, cpu_tmp3_i32);
+        }
+        tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
+        break;
+#endif
+    default:
+        if (is_right) {
+            tcg_gen_rotr_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+        } else {
+            tcg_gen_rotl_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+        }
+        break;
     }
-    tcg_gen_or_tl(t0, t0, cpu_tmp4);
 
-    gen_set_label(label1);
     /* store */
     if (op1 == OR_TMP0) {
-        gen_op_st_v(ot + s->mem_index, t0, a0);
+        gen_op_st_T0_A0(ot + s->mem_index);
     } else {
-        gen_op_mov_reg_v(ot, op1, t0);
+        gen_op_mov_reg_T0(ot, op1);
     }
-    
-    /* update eflags */
-    if (s->cc_op != CC_OP_DYNAMIC)
-        gen_op_set_cc_op(s->cc_op);
-
-    label2 = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_EQ, t1, 0, label2);
-
-    gen_compute_eflags(cpu_cc_src);
-    tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~(CC_O | CC_C));
-    tcg_gen_xor_tl(cpu_tmp0, t2, t0);
-    tcg_gen_lshift(cpu_tmp0, cpu_tmp0, 11 - (data_bits - 1));
-    tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, CC_O);
-    tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, cpu_tmp0);
+
+    /* We'll need the flags computed into CC_SRC.  */
+    gen_compute_eflags(s);
+
+    /* The value that was "rotated out" is now present at the other end
+       of the word.  Compute C into CC_DST and O into CC_SRC2.  Note that
+       since we've computed the flags into CC_SRC, these variables are
+       currently dead.  */
     if (is_right) {
-        tcg_gen_shri_tl(t0, t0, data_bits - 1);
+        tcg_gen_shri_tl(cpu_cc_src2, cpu_T[0], mask - 1);
+        tcg_gen_shri_tl(cpu_cc_dst, cpu_T[0], mask);
+        tcg_gen_andi_tl(cpu_cc_dst, cpu_cc_dst, 1);
+    } else {
+        tcg_gen_shri_tl(cpu_cc_src2, cpu_T[0], mask);
+        tcg_gen_andi_tl(cpu_cc_dst, cpu_T[0], 1);
     }
-    tcg_gen_andi_tl(t0, t0, CC_C);
-    tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, t0);
-    
-    tcg_gen_discard_tl(cpu_cc_dst);
-    tcg_gen_movi_i32(cpu_cc_op, CC_OP_EFLAGS);
-        
-    gen_set_label(label2);
-    s->cc_op = CC_OP_DYNAMIC; /* cannot predict flags after */
-
-    tcg_temp_free(t0);
-    tcg_temp_free(t1);
-    tcg_temp_free(t2);
-    tcg_temp_free(a0);
+    tcg_gen_andi_tl(cpu_cc_src2, cpu_cc_src2, 1);
+    tcg_gen_xor_tl(cpu_cc_src2, cpu_cc_src2, cpu_cc_dst);
+
+    /* Now conditionally store the new CC_OP value.  If the shift count
+       is 0 we keep the CC_OP_EFLAGS setting so that only CC_SRC is live.
+       Otherwise reuse CC_OP_ADCOX which has the C and O flags split out
+       exactly as we computed above.  */
+    t0 = tcg_const_i32(0);
+    t1 = tcg_temp_new_i32();
+    tcg_gen_trunc_tl_i32(t1, cpu_T[1]);
+    tcg_gen_movi_i32(cpu_tmp2_i32, CC_OP_ADCOX); 
+    tcg_gen_movi_i32(cpu_tmp3_i32, CC_OP_EFLAGS);
+    tcg_gen_movcond_i32(TCG_COND_NE, cpu_cc_op, t1, t0,
+                        cpu_tmp2_i32, cpu_tmp3_i32);
+    tcg_temp_free_i32(t0);
+    tcg_temp_free_i32(t1);
+
+    /* The CC_OP value is no longer predictable.  */ 
+    set_cc_op(s, CC_OP_DYNAMIC);
 }
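
/* Editor's note: a C model, not part of the patch, of the replication
   trick above: spreading an 8-bit operand across a 32-bit word makes one
   32-bit rotate produce the 8-bit rotate in the low byte, for any count.
   uint8_t/uint32_t come from <inttypes.h>, included above.  */
static inline uint8_t model_rol8_via_rol32(uint8_t x, unsigned count)
{
    uint32_t r = x * 0x01010101u;               /* x replicated 4 times */
    count &= 31;                                /* x86 masks the count */
    if (count) {
        r = (r << count) | (r >> (32 - count)); /* 32-bit rotate left */
    }
    return (uint8_t)r;          /* low byte == rol8(x, count % 8) */
}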
 
 static void gen_rot_rm_im(DisasContext *s, int ot, int op1, int op2,
                           int is_right)
 {
-    int mask;
-    int data_bits;
-    TCGv t0, t1, a0;
-
-    /* XXX: inefficient, but we must use local temps */
-    t0 = tcg_temp_local_new();
-    t1 = tcg_temp_local_new();
-    a0 = tcg_temp_local_new();
-
-    if (ot == OT_QUAD)
-        mask = 0x3f;
-    else
-        mask = 0x1f;
+    int mask = (ot == OT_QUAD ? 0x3f : 0x1f);
+    int shift;
 
     /* load */
     if (op1 == OR_TMP0) {
-        tcg_gen_mov_tl(a0, cpu_A0);
-        gen_op_ld_v(ot + s->mem_index, t0, a0);
+        gen_op_ld_T0_A0(ot + s->mem_index);
     } else {
-        gen_op_mov_v_reg(ot, t0, op1);
+        gen_op_mov_TN_reg(ot, 0, op1);
     }
 
-    gen_extu(ot, t0);
-    tcg_gen_mov_tl(t1, t0);
-
     op2 &= mask;
-    data_bits = 8 << ot;
     if (op2 != 0) {
-        int shift = op2 & ((1 << (3 + ot)) - 1);
-        if (is_right) {
-            tcg_gen_shri_tl(cpu_tmp4, t0, shift);
-            tcg_gen_shli_tl(t0, t0, data_bits - shift);
-        }
-        else {
-            tcg_gen_shli_tl(cpu_tmp4, t0, shift);
-            tcg_gen_shri_tl(t0, t0, data_bits - shift);
+        switch (ot) {
+#ifdef TARGET_X86_64
+        case OT_LONG:
+            tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+            if (is_right) {
+                tcg_gen_rotri_i32(cpu_tmp2_i32, cpu_tmp2_i32, op2);
+            } else {
+                tcg_gen_rotli_i32(cpu_tmp2_i32, cpu_tmp2_i32, op2);
+            }
+            tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
+            break;
+#endif
+        default:
+            if (is_right) {
+                tcg_gen_rotri_tl(cpu_T[0], cpu_T[0], op2);
+            } else {
+                tcg_gen_rotli_tl(cpu_T[0], cpu_T[0], op2);
+            }
+            break;
+        case OT_BYTE:
+            mask = 7;
+            goto do_shifts;
+        case OT_WORD:
+            mask = 15;
+        do_shifts:
+            shift = op2 & mask;
+            if (is_right) {
+                shift = mask + 1 - shift;
+            }
+            gen_extu(ot, cpu_T[0]);
+            tcg_gen_shli_tl(cpu_tmp0, cpu_T[0], shift);
+            tcg_gen_shri_tl(cpu_T[0], cpu_T[0], mask + 1 - shift);
+            tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
+            break;
         }
-        tcg_gen_or_tl(t0, t0, cpu_tmp4);
     }
 
     /* store */
     if (op1 == OR_TMP0) {
-        gen_op_st_v(ot + s->mem_index, t0, a0);
+        gen_op_st_T0_A0(ot + s->mem_index);
     } else {
-        gen_op_mov_reg_v(ot, op1, t0);
+        gen_op_mov_reg_T0(ot, op1);
     }
 
     if (op2 != 0) {
-        /* update eflags */
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
-
-        gen_compute_eflags(cpu_cc_src);
-        tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~(CC_O | CC_C));
-        tcg_gen_xor_tl(cpu_tmp0, t1, t0);
-        tcg_gen_lshift(cpu_tmp0, cpu_tmp0, 11 - (data_bits - 1));
-        tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, CC_O);
-        tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, cpu_tmp0);
+        /* Compute the flags into CC_SRC.  */
+        gen_compute_eflags(s);
+
+        /* The value that was "rotated out" is now present at the other end
+           of the word.  Compute C into CC_DST and O into CC_SRC2.  Note that
+           since we've computed the flags into CC_SRC, these variables are
+           currently dead.  */
         if (is_right) {
-            tcg_gen_shri_tl(t0, t0, data_bits - 1);
+            tcg_gen_shri_tl(cpu_cc_src2, cpu_T[0], mask - 1);
+            tcg_gen_shri_tl(cpu_cc_dst, cpu_T[0], mask);
+            tcg_gen_andi_tl(cpu_cc_dst, cpu_cc_dst, 1);
+        } else {
+            tcg_gen_shri_tl(cpu_cc_src2, cpu_T[0], mask);
+            tcg_gen_andi_tl(cpu_cc_dst, cpu_T[0], 1);
         }
-        tcg_gen_andi_tl(t0, t0, CC_C);
-        tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, t0);
-
-        tcg_gen_discard_tl(cpu_cc_dst);
-        tcg_gen_movi_i32(cpu_cc_op, CC_OP_EFLAGS);
-        s->cc_op = CC_OP_EFLAGS;
+        tcg_gen_andi_tl(cpu_cc_src2, cpu_cc_src2, 1);
+        tcg_gen_xor_tl(cpu_cc_src2, cpu_cc_src2, cpu_cc_dst);
+        set_cc_op(s, CC_OP_ADCOX);
     }
-
-    tcg_temp_free(t0);
-    tcg_temp_free(t1);
-    tcg_temp_free(a0);
 }
 
 /* XXX: add faster immediate = 1 case */
 static void gen_rotc_rm_T1(DisasContext *s, int ot, int op1, 
                            int is_right)
 {
-    int label1;
-
-    if (s->cc_op != CC_OP_DYNAMIC)
-        gen_op_set_cc_op(s->cc_op);
+    gen_compute_eflags(s);
+    assert(s->cc_op == CC_OP_EFLAGS);
 
     /* load */
     if (op1 == OR_TMP0)
@@ -1757,146 +1935,92 @@ static void gen_rotc_rm_T1(DisasContext *s, int ot, int op1,
         gen_op_st_T0_A0(ot + s->mem_index);
     else
         gen_op_mov_reg_T0(ot, op1);
-
-    /* update eflags */
-    label1 = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_cc_tmp, -1, label1);
-
-    tcg_gen_mov_tl(cpu_cc_src, cpu_cc_tmp);
-    tcg_gen_discard_tl(cpu_cc_dst);
-    tcg_gen_movi_i32(cpu_cc_op, CC_OP_EFLAGS);
-        
-    gen_set_label(label1);
-    s->cc_op = CC_OP_DYNAMIC; /* cannot predict flags after */
 }
 
 /* XXX: add faster immediate case */
-static void gen_shiftd_rm_T1_T3(DisasContext *s, int ot, int op1, 
-                                int is_right)
+static void gen_shiftd_rm_T1(DisasContext *s, int ot, int op1,
+                             bool is_right, TCGv count_in)
 {
-    int label1, label2, data_bits;
-    target_ulong mask;
-    TCGv t0, t1, t2, a0;
-
-    t0 = tcg_temp_local_new();
-    t1 = tcg_temp_local_new();
-    t2 = tcg_temp_local_new();
-    a0 = tcg_temp_local_new();
-
-    if (ot == OT_QUAD)
-        mask = 0x3f;
-    else
-        mask = 0x1f;
+    target_ulong mask = (ot == OT_QUAD ? 63 : 31);
+    TCGv count;
 
     /* load */
     if (op1 == OR_TMP0) {
-        tcg_gen_mov_tl(a0, cpu_A0);
-        gen_op_ld_v(ot + s->mem_index, t0, a0);
+        gen_op_ld_T0_A0(ot + s->mem_index);
     } else {
-        gen_op_mov_v_reg(ot, t0, op1);
+        gen_op_mov_TN_reg(ot, 0, op1);
     }
 
-    tcg_gen_andi_tl(cpu_T3, cpu_T3, mask);
+    count = tcg_temp_new();
+    tcg_gen_andi_tl(count, count_in, mask);
 
-    tcg_gen_mov_tl(t1, cpu_T[1]);
-    tcg_gen_mov_tl(t2, cpu_T3);
-
-    /* Must test zero case to avoid using undefined behaviour in TCG
-       shifts. */
-    label1 = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, label1);
-    
-    tcg_gen_addi_tl(cpu_tmp5, t2, -1);
-    if (ot == OT_WORD) {
-        /* Note: we implement the Intel behaviour for shift count > 16 */
+    switch (ot) {
+    case OT_WORD:
+        /* Note: we implement the Intel behaviour for shift count > 16.
+           This means "shrdw C, B, A" shifts A:B:A >> C.  Build the B:A
+           portion by constructing it as a 32-bit value.  */
         if (is_right) {
-            tcg_gen_andi_tl(t0, t0, 0xffff);
-            tcg_gen_shli_tl(cpu_tmp0, t1, 16);
-            tcg_gen_or_tl(t0, t0, cpu_tmp0);
-            tcg_gen_ext32u_tl(t0, t0);
-
-            tcg_gen_shr_tl(cpu_tmp4, t0, cpu_tmp5);
-            
-            /* only needed if count > 16, but a test would complicate */
-            tcg_gen_subfi_tl(cpu_tmp5, 32, t2);
-            tcg_gen_shl_tl(cpu_tmp0, t0, cpu_tmp5);
-
-            tcg_gen_shr_tl(t0, t0, t2);
-
-            tcg_gen_or_tl(t0, t0, cpu_tmp0);
+            tcg_gen_deposit_tl(cpu_tmp0, cpu_T[0], cpu_T[1], 16, 16);
+            tcg_gen_mov_tl(cpu_T[1], cpu_T[0]);
+            tcg_gen_mov_tl(cpu_T[0], cpu_tmp0);
         } else {
-            /* XXX: not optimal */
-            tcg_gen_andi_tl(t0, t0, 0xffff);
-            tcg_gen_shli_tl(t1, t1, 16);
-            tcg_gen_or_tl(t1, t1, t0);
-            tcg_gen_ext32u_tl(t1, t1);
-            
-            tcg_gen_shl_tl(cpu_tmp4, t0, cpu_tmp5);
-            tcg_gen_subfi_tl(cpu_tmp0, 32, cpu_tmp5);
-            tcg_gen_shr_tl(cpu_tmp5, t1, cpu_tmp0);
-            tcg_gen_or_tl(cpu_tmp4, cpu_tmp4, cpu_tmp5);
-
-            tcg_gen_shl_tl(t0, t0, t2);
-            tcg_gen_subfi_tl(cpu_tmp5, 32, t2);
-            tcg_gen_shr_tl(t1, t1, cpu_tmp5);
-            tcg_gen_or_tl(t0, t0, t1);
+            tcg_gen_deposit_tl(cpu_T[1], cpu_T[0], cpu_T[1], 16, 16);
         }
-    } else {
-        data_bits = 8 << ot;
+        /* FALLTHRU */
+#ifdef TARGET_X86_64
+    case OT_LONG:
+        /* Concatenate the two 32-bit values and use a 64-bit shift.  */
+        tcg_gen_subi_tl(cpu_tmp0, count, 1);
         if (is_right) {
-            if (ot == OT_LONG)
-                tcg_gen_ext32u_tl(t0, t0);
-
-            tcg_gen_shr_tl(cpu_tmp4, t0, cpu_tmp5);
+            tcg_gen_concat_tl_i64(cpu_T[0], cpu_T[0], cpu_T[1]);
+            tcg_gen_shr_i64(cpu_tmp0, cpu_T[0], cpu_tmp0);
+            tcg_gen_shr_i64(cpu_T[0], cpu_T[0], count);
+        } else {
+            tcg_gen_concat_tl_i64(cpu_T[0], cpu_T[1], cpu_T[0]);
+            tcg_gen_shl_i64(cpu_tmp0, cpu_T[0], cpu_tmp0);
+            tcg_gen_shl_i64(cpu_T[0], cpu_T[0], count);
+            tcg_gen_shri_i64(cpu_tmp0, cpu_tmp0, 32);
+            tcg_gen_shri_i64(cpu_T[0], cpu_T[0], 32);
+        }
+        break;
+#endif
+    default:
+        tcg_gen_subi_tl(cpu_tmp0, count, 1);
+        if (is_right) {
+            tcg_gen_shr_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
 
-            tcg_gen_shr_tl(t0, t0, t2);
-            tcg_gen_subfi_tl(cpu_tmp5, data_bits, t2);
-            tcg_gen_shl_tl(t1, t1, cpu_tmp5);
-            tcg_gen_or_tl(t0, t0, t1);
-            
+            tcg_gen_subfi_tl(cpu_tmp4, mask + 1, count);
+            tcg_gen_shr_tl(cpu_T[0], cpu_T[0], count);
+            tcg_gen_shl_tl(cpu_T[1], cpu_T[1], cpu_tmp4);
         } else {
-            if (ot == OT_LONG)
-                tcg_gen_ext32u_tl(t1, t1);
-
-            tcg_gen_shl_tl(cpu_tmp4, t0, cpu_tmp5);
-            
-            tcg_gen_shl_tl(t0, t0, t2);
-            tcg_gen_subfi_tl(cpu_tmp5, data_bits, t2);
-            tcg_gen_shr_tl(t1, t1, cpu_tmp5);
-            tcg_gen_or_tl(t0, t0, t1);
+            tcg_gen_shl_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
+            if (ot == OT_WORD) {
+                /* Only needed if count > 16, for Intel behaviour.  */
+                tcg_gen_subfi_tl(cpu_tmp4, 33, count);
+                tcg_gen_shr_tl(cpu_tmp4, cpu_T[1], cpu_tmp4);
+                tcg_gen_or_tl(cpu_tmp0, cpu_tmp0, cpu_tmp4);
+            }
+
+            tcg_gen_subfi_tl(cpu_tmp4, mask + 1, count);
+            tcg_gen_shl_tl(cpu_T[0], cpu_T[0], count);
+            tcg_gen_shr_tl(cpu_T[1], cpu_T[1], cpu_tmp4);
         }
+        tcg_gen_movi_tl(cpu_tmp4, 0);
+        tcg_gen_movcond_tl(TCG_COND_EQ, cpu_T[1], count, cpu_tmp4,
+                           cpu_tmp4, cpu_T[1]);
+        tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+        break;
     }
-    tcg_gen_mov_tl(t1, cpu_tmp4);
 
-    gen_set_label(label1);
     /* store */
     if (op1 == OR_TMP0) {
-        gen_op_st_v(ot + s->mem_index, t0, a0);
-    } else {
-        gen_op_mov_reg_v(ot, op1, t0);
-    }
-    
-    /* update eflags */
-    if (s->cc_op != CC_OP_DYNAMIC)
-        gen_op_set_cc_op(s->cc_op);
-
-    label2 = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, label2);
-
-    tcg_gen_mov_tl(cpu_cc_src, t1);
-    tcg_gen_mov_tl(cpu_cc_dst, t0);
-    if (is_right) {
-        tcg_gen_movi_i32(cpu_cc_op, CC_OP_SARB + ot);
+        gen_op_st_T0_A0(ot + s->mem_index);
     } else {
-        tcg_gen_movi_i32(cpu_cc_op, CC_OP_SHLB + ot);
+        gen_op_mov_reg_T0(ot, op1);
     }
-    gen_set_label(label2);
-    s->cc_op = CC_OP_DYNAMIC; /* cannot predict flags after */
 
-    tcg_temp_free(t0);
-    tcg_temp_free(t1);
-    tcg_temp_free(t2);
-    tcg_temp_free(a0);
+    gen_shift_flags(s, ot, cpu_T[0], cpu_tmp0, count, is_right);
+    tcg_temp_free(count);
 }
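
/* Editor's note: a C model, not part of the patch, of the concatenation
   strategy above for the 32-bit case: the two halves are glued into one
   64-bit value so a single shift computes SHRD, and the same value
   shifted by count - 1 yields the carry-out.  */
static inline uint32_t model_shrd32(uint32_t dst, uint32_t src,
                                    unsigned count)
{
    uint64_t v = ((uint64_t)src << 32) | dst;   /* tcg_gen_concat_tl_i64 */
    count &= 31;
    return (uint32_t)(count ? v >> count : v);
}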
 
 static void gen_shift(DisasContext *s1, int op, int ot, int d, int s)
@@ -2300,24 +2424,21 @@ static inline void gen_goto_tb(DisasContext *s, int tb_num, target_ulong eip)
 static inline void gen_jcc(DisasContext *s, int b,
                            target_ulong val, target_ulong next_eip)
 {
-    int l1, l2, cc_op;
+    int l1, l2;
 
-    cc_op = s->cc_op;
-    gen_update_cc_op(s);
     if (s->jmp_opt) {
         l1 = gen_new_label();
-        gen_jcc1(s, cc_op, b, l1);
-        
+        gen_jcc1(s, b, l1);
+
         gen_goto_tb(s, 0, next_eip);
 
         gen_set_label(l1);
         gen_goto_tb(s, 1, val);
         s->is_jmp = DISAS_TB_JUMP;
     } else {
-
         l1 = gen_new_label();
         l2 = gen_new_label();
-        gen_jcc1(s, cc_op, b, l1);
+        gen_jcc1(s, b, l1);
 
         gen_jmp_im(next_eip);
         tcg_gen_br(l2);
@@ -2329,32 +2450,32 @@ static inline void gen_jcc(DisasContext *s, int b,
     }
 }
 
-static void gen_setcc(DisasContext *s, int b)
+static void gen_cmovcc1(CPUX86State *env, DisasContext *s, int ot, int b,
+                        int modrm, int reg)
 {
-    int inv, jcc_op, l1;
-    TCGv t0;
+    CCPrepare cc;
 
-    if (is_fast_jcc_case(s, b)) {
-        /* nominal case: we use a jump */
-        /* XXX: make it faster by adding new instructions in TCG */
-        t0 = tcg_temp_local_new();
-        tcg_gen_movi_tl(t0, 0);
-        l1 = gen_new_label();
-        gen_jcc1(s, s->cc_op, b ^ 1, l1);
-        tcg_gen_movi_tl(t0, 1);
-        gen_set_label(l1);
-        tcg_gen_mov_tl(cpu_T[0], t0);
-        tcg_temp_free(t0);
-    } else {
-        /* slow case: it is more efficient not to generate a jump,
-           although it is questionnable whether this optimization is
-           worth to */
-        inv = b & 1;
-        jcc_op = (b >> 1) & 7;
-        gen_setcc_slow_T0(s, jcc_op);
-        if (inv) {
-            tcg_gen_xori_tl(cpu_T[0], cpu_T[0], 1);
-        }
+    gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+
+    cc = gen_prepare_cc(s, b, cpu_T[1]);
+    if (cc.mask != -1) {
+        TCGv t0 = tcg_temp_new();
+        tcg_gen_andi_tl(t0, cc.reg, cc.mask);
+        cc.reg = t0;
+    }
+    if (!cc.use_reg2) {
+        cc.reg2 = tcg_const_tl(cc.imm);
+    }
+
+    tcg_gen_movcond_tl(cc.cond, cpu_T[0], cc.reg, cc.reg2,
+                       cpu_T[0], cpu_regs[reg]);
+    gen_op_mov_reg_T0(ot, reg);
+
+    if (cc.mask != -1) {
+        tcg_temp_free(cc.reg);
+    }
+    if (!cc.use_reg2) {
+        tcg_temp_free(cc.reg2);
     }
 }
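
/* Editor's note: a branchless model, not part of the patch, of the
   movcond emitted above.  Note the x86 rule that CMOV performs its
   source read even when the condition is false, which is why
   gen_ldst_modrm() loads unconditionally before the select.  */
static inline uint32_t model_cmov32(int cond, uint32_t src, uint32_t olddst)
{
    return cond ? src : olddst;   /* tcg_gen_movcond_tl(cc.cond, ...) */
}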
 
@@ -2380,8 +2501,7 @@ static void gen_movl_seg_T0(DisasContext *s, int seg_reg, target_ulong cur_eip)
 {
     if (s->pe && !s->vm86) {
         /* XXX: optimize by finding processor state dynamically */
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
+        gen_update_cc_op(s);
         gen_jmp_im(cur_eip);
         tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
         gen_helper_load_seg(cpu_env, tcg_const_i32(seg_reg), cpu_tmp2_i32);
@@ -2410,8 +2530,7 @@ gen_svm_check_intercept_param(DisasContext *s, target_ulong pc_start,
     /* no SVM activated; fast case */
     if (likely(!(s->flags & HF_SVMI_MASK)))
         return;
-    if (s->cc_op != CC_OP_DYNAMIC)
-        gen_op_set_cc_op(s->cc_op);
+    gen_update_cc_op(s);
     gen_jmp_im(pc_start - s->cs_base);
     gen_helper_svm_check_intercept_param(cpu_env, tcg_const_i32(type),
                                          tcg_const_i64(param));
@@ -2658,8 +2777,7 @@ static void gen_enter(DisasContext *s, int esp_addend, int level)
 
 static void gen_exception(DisasContext *s, int trapno, target_ulong cur_eip)
 {
-    if (s->cc_op != CC_OP_DYNAMIC)
-        gen_op_set_cc_op(s->cc_op);
+    gen_update_cc_op(s);
     gen_jmp_im(cur_eip);
     gen_helper_raise_exception(cpu_env, tcg_const_i32(trapno));
     s->is_jmp = DISAS_TB_JUMP;
@@ -2670,8 +2788,7 @@ static void gen_exception(DisasContext *s, int trapno, target_ulong cur_eip)
 static void gen_interrupt(DisasContext *s, int intno,
                           target_ulong cur_eip, target_ulong next_eip)
 {
-    if (s->cc_op != CC_OP_DYNAMIC)
-        gen_op_set_cc_op(s->cc_op);
+    gen_update_cc_op(s);
     gen_jmp_im(cur_eip);
     gen_helper_raise_interrupt(cpu_env, tcg_const_i32(intno),
                                tcg_const_i32(next_eip - cur_eip));
@@ -2680,8 +2797,7 @@ static void gen_interrupt(DisasContext *s, int intno,
 
 static void gen_debug(DisasContext *s, target_ulong cur_eip)
 {
-    if (s->cc_op != CC_OP_DYNAMIC)
-        gen_op_set_cc_op(s->cc_op);
+    gen_update_cc_op(s);
     gen_jmp_im(cur_eip);
     gen_helper_debug(cpu_env);
     s->is_jmp = DISAS_TB_JUMP;
@@ -2691,8 +2807,7 @@ static void gen_debug(DisasContext *s, target_ulong cur_eip)
    if needed */
 static void gen_eob(DisasContext *s)
 {
-    if (s->cc_op != CC_OP_DYNAMIC)
-        gen_op_set_cc_op(s->cc_op);
+    gen_update_cc_op(s);
     if (s->tb->flags & HF_INHIBIT_IRQ_MASK) {
         gen_helper_reset_inhibit_irq(cpu_env);
     }
@@ -2713,8 +2828,9 @@ static void gen_eob(DisasContext *s)
    direct call to the next block may occur */
 static void gen_jmp_tb(DisasContext *s, target_ulong eip, int tb_num)
 {
+    gen_update_cc_op(s);
+    set_cc_op(s, CC_OP_DYNAMIC);
     if (s->jmp_opt) {
-        gen_update_cc_op(s);
         gen_goto_tb(s, tb_num, eip);
         s->is_jmp = DISAS_TB_JUMP;
     } else {
@@ -2850,8 +2966,9 @@ static const SSEFunc_0_epp sse_op_table1[256][4] = {
     [0xc6] = { (SSEFunc_0_epp)gen_helper_shufps,
                (SSEFunc_0_epp)gen_helper_shufpd }, /* XXX: casts */
 
-    [0x38] = { SSE_SPECIAL, SSE_SPECIAL, NULL, SSE_SPECIAL }, /* SSSE3/SSE4 */
-    [0x3a] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3/SSE4 */
+    /* SSSE3, SSE4, MOVBE, CRC32, BMI1, BMI2, ADX.  */
+    [0x38] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL },
+    [0x3a] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL },
 
     /* MMX ops and their SSE extensions */
     [0x60] = MMX_OP2(punpcklbw),
@@ -3032,6 +3149,9 @@ struct SSEOpHelper_eppi {
 #define SSE41_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_SSE41 }
 #define SSE42_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_SSE42 }
 #define SSE41_SPECIAL { { NULL, SSE_SPECIAL }, CPUID_EXT_SSE41 }
+#define PCLMULQDQ_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, \
+        CPUID_EXT_PCLMULQDQ }
+#define AESNI_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_AES }
 
 static const struct SSEOpHelper_epp sse_op_table6[256] = {
     [0x00] = SSSE3_OP(pshufb),
@@ -3080,6 +3200,11 @@ static const struct SSEOpHelper_epp sse_op_table6[256] = {
     [0x3f] = SSE41_OP(pmaxud),
     [0x40] = SSE41_OP(pmulld),
     [0x41] = SSE41_OP(phminposuw),
+    [0xdb] = AESNI_OP(aesimc),
+    [0xdc] = AESNI_OP(aesenc),
+    [0xdd] = AESNI_OP(aesenclast),
+    [0xde] = AESNI_OP(aesdec),
+    [0xdf] = AESNI_OP(aesdeclast),
 };
 
 static const struct SSEOpHelper_eppi sse_op_table7[256] = {
@@ -3101,10 +3226,12 @@ static const struct SSEOpHelper_eppi sse_op_table7[256] = {
     [0x40] = SSE41_OP(dpps),
     [0x41] = SSE41_OP(dppd),
     [0x42] = SSE41_OP(mpsadbw),
+    [0x44] = PCLMULQDQ_OP(pclmulqdq),
     [0x60] = SSE42_OP(pcmpestrm),
     [0x61] = SSE42_OP(pcmpestri),
     [0x62] = SSE42_OP(pcmpistrm),
     [0x63] = SSE42_OP(pcmpistri),
+    [0xdf] = AESNI_OP(aeskeygenassist),
 };
 
 static void gen_sse(CPUX86State *env, DisasContext *s, int b,
@@ -3732,11 +3859,13 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
             reg = ((modrm >> 3) & 7) | rex_r;
             gen_op_mov_reg_T0(OT_LONG, reg);
             break;
+
         case 0x138:
-            if (s->prefix & PREFIX_REPNZ)
-                goto crc32;
         case 0x038:
             b = modrm;
+            if ((b & 0xf0) == 0xf0) {
+                goto do_0f_38_fx;
+            }
             modrm = cpu_ldub_code(env, s->pc++);
             rm = modrm & 7;
             reg = ((modrm >> 3) & 7) | rex_r;
@@ -3787,57 +3916,423 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
                         gen_ldo_env_A0(s->mem_index, op2_offset);
                     }
                 }
-            } else {
-                op1_offset = offsetof(CPUX86State,fpregs[reg].mmx);
-                if (mod == 3) {
-                    op2_offset = offsetof(CPUX86State,fpregs[rm].mmx);
+            } else {
+                op1_offset = offsetof(CPUX86State,fpregs[reg].mmx);
+                if (mod == 3) {
+                    op2_offset = offsetof(CPUX86State,fpregs[rm].mmx);
+                } else {
+                    op2_offset = offsetof(CPUX86State,mmx_t0);
+                    gen_lea_modrm(env, s, modrm, &reg_addr, &offset_addr);
+                    gen_ldq_env_A0(s->mem_index, op2_offset);
+                }
+            }
+            if (sse_fn_epp == SSE_SPECIAL) {
+                goto illegal_op;
+            }
+
+            tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset);
+            tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
+            sse_fn_epp(cpu_env, cpu_ptr0, cpu_ptr1);
+
+            if (b == 0x17) {
+                set_cc_op(s, CC_OP_EFLAGS);
+            }
+            break;
+
+        case 0x238:
+        case 0x338:
+        do_0f_38_fx:
+            /* Various integer extensions at 0f 38 f[0-f].  */
+            b = modrm | (b1 << 8);
+            modrm = cpu_ldub_code(env, s->pc++);
+            reg = ((modrm >> 3) & 7) | rex_r;
+
+            switch (b) {
+            case 0x3f0: /* crc32 Gd,Eb */
+            case 0x3f1: /* crc32 Gd,Ey */
+            do_crc32:
+                if (!(s->cpuid_ext_features & CPUID_EXT_SSE42)) {
+                    goto illegal_op;
+                }
+                if ((b & 0xff) == 0xf0) {
+                    ot = OT_BYTE;
+                } else if (s->dflag != 2) {
+                    ot = (s->prefix & PREFIX_DATA ? OT_WORD : OT_LONG);
+                } else {
+                    ot = OT_QUAD;
+                }
+
+                gen_op_mov_TN_reg(OT_LONG, 0, reg);
+                tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                gen_helper_crc32(cpu_T[0], cpu_tmp2_i32,
+                                 cpu_T[0], tcg_const_i32(8 << ot));
+
+                ot = (s->dflag == 2) ? OT_QUAD : OT_LONG;
+                gen_op_mov_reg_T0(ot, reg);
+                break;
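
The crc32 helper invoked here implements the SSE4.2 CRC32 instruction, which is architecturally a CRC-32C (Castagnoli, polynomial 0x11EDC6F41) update over 8 << ot input bits, with no final inversion. A bitwise reference sketch (QEMU's actual helper lives in a separate file):

    #include <stdint.h>

    static uint32_t crc32c_update(uint32_t crc, uint64_t data, int bits)
    {
        int i;
        for (i = 0; i < bits; i++) {
            uint32_t bit = (crc ^ (uint32_t)(data >> i)) & 1;
            crc = (crc >> 1) ^ (bit ? 0x82F63B78u : 0); /* reflected poly */
        }
        return crc;
    }
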
+
+            case 0x1f0: /* crc32 or movbe */
+            case 0x1f1:
+                /* For these insns, the f3 prefix is supposed to take
+                   priority over the 66 prefix, but that is not what the
+                   computation of b1 above implements.  */
+                if (s->prefix & PREFIX_REPNZ) {
+                    goto do_crc32;
+                }
+                /* FALLTHRU */
+            case 0x0f0: /* movbe Gy,My */
+            case 0x0f1: /* movbe My,Gy */
+                if (!(s->cpuid_ext_features & CPUID_EXT_MOVBE)) {
+                    goto illegal_op;
+                }
+                if (s->dflag != 2) {
+                    ot = (s->prefix & PREFIX_DATA ? OT_WORD : OT_LONG);
+                } else {
+                    ot = OT_QUAD;
+                }
+
+                /* Load the data incoming to the bswap.  Note that the TCG
+                   implementation of bswap requires the input be zero
+                   extended.  In the case of the loads, we simply know that
+                   gen_op_ld_v via gen_ldst_modrm does that already.  */
+                if ((b & 1) == 0) {
+                    gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                } else {
+                    switch (ot) {
+                    case OT_WORD:
+                        tcg_gen_ext16u_tl(cpu_T[0], cpu_regs[reg]);
+                        break;
+                    default:
+                        tcg_gen_ext32u_tl(cpu_T[0], cpu_regs[reg]);
+                        break;
+#ifdef TARGET_X86_64
+                    case OT_QUAD:
+                        tcg_gen_mov_tl(cpu_T[0], cpu_regs[reg]);
+                        break;
+#endif
+                    }
+                }
+
+                switch (ot) {
+                case OT_WORD:
+                    tcg_gen_bswap16_tl(cpu_T[0], cpu_T[0]);
+                    break;
+                default:
+                    tcg_gen_bswap32_tl(cpu_T[0], cpu_T[0]);
+                    break;
+#ifdef TARGET_X86_64
+                case OT_QUAD:
+                    tcg_gen_bswap64_tl(cpu_T[0], cpu_T[0]);
+                    break;
+#endif
+                }
+
+                if ((b & 1) == 0) {
+                    gen_op_mov_reg_T0(ot, reg);
+                } else {
+                    gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 1);
+                }
+                break;
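
The zero-extension before the bswap matters because, on a 64-bit TCG target, a 32-bit byte swap of a register with stale high bits would fold garbage into the result. A hypothetical illustration (not QEMU code) of the store-direction path:

    #include <stdint.h>

    /* movbe My,Gy for 32-bit operands: truncate (zero-extend) first,
       then swap -- the ext32u + bswap32 pairing above.  */
    static uint32_t movbe_store32(uint64_t reg)
    {
        uint32_t lo = (uint32_t)reg;
        return ((lo & 0x000000ffu) << 24) |
               ((lo & 0x0000ff00u) <<  8) |
               ((lo & 0x00ff0000u) >>  8) |
               ((lo & 0xff000000u) >> 24);
    }
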
+
+            case 0x0f2: /* andn Gy, By, Ey */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                tcg_gen_andc_tl(cpu_T[0], cpu_regs[s->vex_v], cpu_T[0]);
+                gen_op_mov_reg_T0(ot, reg);
+                gen_op_update1_cc();
+                set_cc_op(s, CC_OP_LOGICB + ot);
+                break;
+
+            case 0x0f7: /* bextr Gy, Ey, By */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                {
+                    TCGv bound, zero;
+
+                    gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                    /* Extract START, and shift the operand.
+                       Shifts larger than operand size get zeros.  */
+                    tcg_gen_ext8u_tl(cpu_A0, cpu_regs[s->vex_v]);
+                    tcg_gen_shr_tl(cpu_T[0], cpu_T[0], cpu_A0);
+
+                    bound = tcg_const_tl(ot == OT_QUAD ? 63 : 31);
+                    zero = tcg_const_tl(0);
+                    tcg_gen_movcond_tl(TCG_COND_LEU, cpu_T[0], cpu_A0, bound,
+                                       cpu_T[0], zero);
+                    tcg_temp_free(zero);
+
+                    /* Extract the LEN into a mask.  Lengths larger than
+                       operand size get all ones.  */
+                    tcg_gen_shri_tl(cpu_A0, cpu_regs[s->vex_v], 8);
+                    tcg_gen_ext8u_tl(cpu_A0, cpu_A0);
+                    tcg_gen_movcond_tl(TCG_COND_LEU, cpu_A0, cpu_A0, bound,
+                                       cpu_A0, bound);
+                    tcg_temp_free(bound);
+                    tcg_gen_movi_tl(cpu_T[1], 1);
+                    tcg_gen_shl_tl(cpu_T[1], cpu_T[1], cpu_A0);
+                    tcg_gen_subi_tl(cpu_T[1], cpu_T[1], 1);
+                    tcg_gen_and_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+
+                    gen_op_mov_reg_T0(ot, reg);
+                    gen_op_update1_cc();
+                    set_cc_op(s, CC_OP_LOGICB + ot);
+                }
+                break;
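
Mirroring the TCG sequence above in plain C (a sketch of what gets generated, with bound = 63 for the 64-bit form and 31 otherwise):

    #include <stdint.h>

    static uint64_t bextr_as_above(uint64_t src, uint64_t ctrl)
    {
        uint64_t start = ctrl & 0xff;          /* ext8u of the vvvv reg */
        uint64_t len   = (ctrl >> 8) & 0xff;
        uint64_t bound = 63;
        uint64_t v = start <= bound ? src >> start : 0;
        if (len > bound) {
            len = bound;                       /* the movcond LEU clamp */
        }
        return v & ((1ull << len) - 1);        /* the (1 << len) - 1 mask */
    }
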
+
+            case 0x0f5: /* bzhi Gy, Ey, By */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                tcg_gen_ext8u_tl(cpu_T[1], cpu_regs[s->vex_v]);
+                {
+                    TCGv bound = tcg_const_tl(ot == OT_QUAD ? 63 : 31);
+                    /* Note that since we're using BMILG (in order to get O
+                       cleared) we need to store the inverse into C.  */
+                    tcg_gen_setcond_tl(TCG_COND_LT, cpu_cc_src,
+                                       cpu_T[1], bound);
+                    tcg_gen_movcond_tl(TCG_COND_GT, cpu_T[1], cpu_T[1],
+                                       bound, bound, cpu_T[1]);
+                    tcg_temp_free(bound);
+                }
+                tcg_gen_movi_tl(cpu_A0, -1);
+                tcg_gen_shl_tl(cpu_A0, cpu_A0, cpu_T[1]);
+                tcg_gen_andc_tl(cpu_T[0], cpu_T[0], cpu_A0);
+                gen_op_mov_reg_T0(ot, reg);
+                gen_op_update1_cc();
+                set_cc_op(s, CC_OP_BMILGB + ot);
+                break;
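
The result computation for BZHI, mirrored as a sketch (the clamp corresponds to the movcond on bound, the masking to the shl/andc pair):

    #include <stdint.h>

    static uint64_t bzhi_result(uint64_t src, uint64_t n)
    {
        uint64_t index = n & 0xff;        /* ext8u of the vvvv register */
        if (index > 63) {                 /* 31 for the 32-bit form */
            index = 63;
        }
        return src & ~(~0ull << index);   /* andc with (-1 << index) */
    }
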
+
+            case 0x3f6: /* mulx By, Gy, rdx, Ey */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                switch (ot) {
+                default:
+                    tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                    tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_regs[R_EDX]);
+                    tcg_gen_mulu2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
+                                      cpu_tmp2_i32, cpu_tmp3_i32);
+                    tcg_gen_extu_i32_tl(cpu_regs[s->vex_v], cpu_tmp2_i32);
+                    tcg_gen_extu_i32_tl(cpu_regs[reg], cpu_tmp3_i32);
+                    break;
+#ifdef TARGET_X86_64
+                case OT_QUAD:
+                    tcg_gen_mulu2_i64(cpu_regs[s->vex_v], cpu_regs[reg],
+                                      cpu_T[0], cpu_regs[R_EDX]);
+                    break;
+#endif
+                }
+                break;
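
MULX is an unsigned widening multiply of rDX by the r/m operand that leaves the flags alone; the low half goes to the vvvv register and the high half to the ModRM reg field. A 64-bit reference sketch using the compiler's 128-bit type:

    #include <stdint.h>

    static void mulx64_ref(uint64_t rdx, uint64_t src,
                           uint64_t *lo, uint64_t *hi)
    {
        unsigned __int128 p = (unsigned __int128)rdx * src;
        *lo = (uint64_t)p;          /* written to the vvvv register */
        *hi = (uint64_t)(p >> 64);  /* written to the ModRM reg */
    }
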
+
+            case 0x3f5: /* pdep Gy, By, Ey */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                /* Note that by zero-extending the mask operand, we
+                   automatically handle zero-extending the result.  */
+                if (s->dflag == 2) {
+                    tcg_gen_mov_tl(cpu_T[1], cpu_regs[s->vex_v]);
+                } else {
+                    tcg_gen_ext32u_tl(cpu_T[1], cpu_regs[s->vex_v]);
+                }
+                gen_helper_pdep(cpu_regs[reg], cpu_T[0], cpu_T[1]);
+                break;
+
+            case 0x2f5: /* pext Gy, By, Ey */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                /* Note that by zero-extending the mask operand, we
+                   automatically handle zero-extending the result.  */
+                if (s->dflag == 2) {
+                    tcg_gen_mov_tl(cpu_T[1], cpu_regs[s->vex_v]);
+                } else {
+                    tcg_gen_ext32u_tl(cpu_T[1], cpu_regs[s->vex_v]);
+                }
+                gen_helper_pext(cpu_regs[reg], cpu_T[0], cpu_T[1]);
+                break;
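
PDEP scatters the low-order bits of the source to the set positions of the mask; PEXT gathers the masked bits down to the low end. Reference loops (a sketch; the real work happens in the pdep/pext helpers):

    #include <stdint.h>

    static uint64_t pdep_ref(uint64_t src, uint64_t mask)
    {
        uint64_t dst = 0, bit;
        for (bit = 1; mask; bit <<= 1, mask &= mask - 1) {
            if (src & bit) {
                dst |= mask & -mask;    /* lowest remaining mask bit */
            }
        }
        return dst;
    }

    static uint64_t pext_ref(uint64_t src, uint64_t mask)
    {
        uint64_t dst = 0, bit;
        for (bit = 1; mask; bit <<= 1, mask &= mask - 1) {
            if (src & (mask & -mask)) {
                dst |= bit;
            }
        }
        return dst;
    }
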
+
+            case 0x1f6: /* adcx Gy, Ey */
+            case 0x2f6: /* adox Gy, Ey */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_ADX)) {
+                    goto illegal_op;
+                } else {
+                    TCGv carry_in, carry_out, zero;
+                    int end_op;
+
+                    ot = (s->dflag == 2 ? OT_QUAD : OT_LONG);
+                    gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+
+                    /* Re-use the carry-out from a previous round.  */
+                    TCGV_UNUSED(carry_in);
+                    carry_out = (b == 0x1f6 ? cpu_cc_dst : cpu_cc_src2);
+                    switch (s->cc_op) {
+                    case CC_OP_ADCX:
+                        if (b == 0x1f6) {
+                            carry_in = cpu_cc_dst;
+                            end_op = CC_OP_ADCX;
+                        } else {
+                            end_op = CC_OP_ADCOX;
+                        }
+                        break;
+                    case CC_OP_ADOX:
+                        if (b == 0x1f6) {
+                            end_op = CC_OP_ADCOX;
+                        } else {
+                            carry_in = cpu_cc_src2;
+                            end_op = CC_OP_ADOX;
+                        }
+                        break;
+                    case CC_OP_ADCOX:
+                        end_op = CC_OP_ADCOX;
+                        carry_in = carry_out;
+                        break;
+                    default:
+                        end_op = (b == 0x1f6 ? CC_OP_ADCX : CC_OP_ADOX);
+                        break;
+                    }
+                    /* If we can't reuse carry-out, get it out of EFLAGS.  */
+                    if (TCGV_IS_UNUSED(carry_in)) {
+                        if (s->cc_op != CC_OP_ADCX && s->cc_op != CC_OP_ADOX) {
+                            gen_compute_eflags(s);
+                        }
+                        carry_in = cpu_tmp0;
+                        tcg_gen_shri_tl(carry_in, cpu_cc_src,
+                                        ctz32(b == 0x1f6 ? CC_C : CC_O));
+                        tcg_gen_andi_tl(carry_in, carry_in, 1);
+                    }
+
+                    switch (ot) {
+#ifdef TARGET_X86_64
+                    case OT_LONG:
+                        /* If we know TL is 64-bit, and we want a 32-bit
+                           result, just do everything in 64-bit arithmetic.  */
+                        tcg_gen_ext32u_i64(cpu_regs[reg], cpu_regs[reg]);
+                        tcg_gen_ext32u_i64(cpu_T[0], cpu_T[0]);
+                        tcg_gen_add_i64(cpu_T[0], cpu_T[0], cpu_regs[reg]);
+                        tcg_gen_add_i64(cpu_T[0], cpu_T[0], carry_in);
+                        tcg_gen_ext32u_i64(cpu_regs[reg], cpu_T[0]);
+                        tcg_gen_shri_i64(carry_out, cpu_T[0], 32);
+                        break;
+#endif
+                    default:
+                        /* Otherwise compute the carry-out in two steps.  */
+                        zero = tcg_const_tl(0);
+                        tcg_gen_add2_tl(cpu_T[0], carry_out,
+                                        cpu_T[0], zero,
+                                        carry_in, zero);
+                        tcg_gen_add2_tl(cpu_regs[reg], carry_out,
+                                        cpu_regs[reg], carry_out,
+                                        cpu_T[0], zero);
+                        tcg_temp_free(zero);
+                        break;
+                    }
+                    set_cc_op(s, end_op);
+                }
+                break;
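
ADCX and ADOX are ordinary add-with-carry operations except that only CF (ADCX) or only OF (ADOX) is consumed and produced, so two independent carry chains can be interleaved; the CC_OP_ADCX/ADOX/ADCOX bookkeeping above tracks which chain is live so a carry can be reused without materializing EFLAGS. A reference step for one chain (sketch):

    #include <stdint.h>

    static uint64_t adcx_step(uint64_t dst, uint64_t src, int *carry)
    {
        uint64_t sum = dst + src + *carry;
        /* carry-out iff either partial addition wrapped */
        *carry = (sum < dst) || (*carry && sum == dst);
        return sum;
    }
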
+
+            case 0x1f7: /* shlx Gy, Ey, By */
+            case 0x2f7: /* sarx Gy, Ey, By */
+            case 0x3f7: /* shrx Gy, Ey, By */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = (s->dflag == 2 ? OT_QUAD : OT_LONG);
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                if (ot == OT_QUAD) {
+                    tcg_gen_andi_tl(cpu_T[1], cpu_regs[s->vex_v], 63);
                 } else {
-                    op2_offset = offsetof(CPUX86State,mmx_t0);
-                    gen_lea_modrm(env, s, modrm, &reg_addr, &offset_addr);
-                    gen_ldq_env_A0(s->mem_index, op2_offset);
+                    tcg_gen_andi_tl(cpu_T[1], cpu_regs[s->vex_v], 31);
                 }
-            }
-            if (sse_fn_epp == SSE_SPECIAL) {
-                goto illegal_op;
-            }
-
-            tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset);
-            tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
-            sse_fn_epp(cpu_env, cpu_ptr0, cpu_ptr1);
+                if (b == 0x1f7) {
+                    tcg_gen_shl_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+                } else if (b == 0x2f7) {
+                    if (ot != OT_QUAD) {
+                        tcg_gen_ext32s_tl(cpu_T[0], cpu_T[0]);
+                    }
+                    tcg_gen_sar_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+                } else {
+                    if (ot != OT_QUAD) {
+                        tcg_gen_ext32u_tl(cpu_T[0], cpu_T[0]);
+                    }
+                    tcg_gen_shr_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+                }
+                gen_op_mov_reg_T0(ot, reg);
+                break;
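
SHLX/SARX/SHRX take the shift count from a third register, mask it to the operand size, and leave the flags untouched -- hence the andi by 63 or 31 above and the absence of any cc update. Sketch of the 64-bit logical-right form:

    #include <stdint.h>

    static uint64_t shrx_ref(uint64_t src, uint64_t count)
    {
        return src >> (count & 63);    /* & 31 in the 32-bit form */
    }
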
 
-            if (b == 0x17)
-                s->cc_op = CC_OP_EFLAGS;
-            break;
-        case 0x338: /* crc32 */
-        crc32:
-            b = modrm;
-            modrm = cpu_ldub_code(env, s->pc++);
-            reg = ((modrm >> 3) & 7) | rex_r;
+            case 0x0f3:
+            case 0x1f3:
+            case 0x2f3:
+            case 0x3f3: /* Group 17 */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+
+                switch (reg & 7) {
+                case 1: /* blsr By,Ey */
+                    tcg_gen_neg_tl(cpu_T[1], cpu_T[0]);
+                    tcg_gen_and_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+                    gen_op_mov_reg_T0(ot, s->vex_v);
+                    gen_op_update2_cc();
+                    set_cc_op(s, CC_OP_BMILGB + ot);
+                    break;
 
-            if (b != 0xf0 && b != 0xf1)
-                goto illegal_op;
-            if (!(s->cpuid_ext_features & CPUID_EXT_SSE42))
-                goto illegal_op;
+                case 2: /* blsmsk By,Ey */
+                    tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
+                    tcg_gen_subi_tl(cpu_T[0], cpu_T[0], 1);
+                    tcg_gen_xor_tl(cpu_T[0], cpu_T[0], cpu_cc_src);
+                    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
+                    set_cc_op(s, CC_OP_BMILGB + ot);
+                    break;
 
-            if (b == 0xf0)
-                ot = OT_BYTE;
-            else if (b == 0xf1 && s->dflag != 2)
-                if (s->prefix & PREFIX_DATA)
-                    ot = OT_WORD;
-                else
-                    ot = OT_LONG;
-            else
-                ot = OT_QUAD;
+                case 3: /* blsi By, Ey */
+                    tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
+                    tcg_gen_subi_tl(cpu_T[0], cpu_T[0], 1);
+                    tcg_gen_and_tl(cpu_T[0], cpu_T[0], cpu_cc_src);
+                    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
+                    set_cc_op(s, CC_OP_BMILGB + ot);
+                    break;
 
-            gen_op_mov_TN_reg(OT_LONG, 0, reg);
-            tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
-            gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
-            gen_helper_crc32(cpu_T[0], cpu_tmp2_i32,
-                             cpu_T[0], tcg_const_i32(8 << ot));
+                default:
+                    goto illegal_op;
+                }
+                break;
 
-            ot = (s->dflag == 2) ? OT_QUAD : OT_LONG;
-            gen_op_mov_reg_T0(ot, reg);
+            default:
+                goto illegal_op;
+            }
             break;
+
         case 0x03a:
         case 0x13a:
             b = modrm;
@@ -3921,9 +4416,9 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
                     if (mod == 3)
                         gen_op_mov_TN_reg(OT_LONG, 0, rm);
                     else
-                        tcg_gen_qemu_ld8u(cpu_tmp0, cpu_A0,
+                        tcg_gen_qemu_ld8u(cpu_T[0], cpu_A0,
                                         (s->mem_index >> 2) - 1);
-                    tcg_gen_st8_tl(cpu_tmp0, cpu_env, offsetof(CPUX86State,
+                    tcg_gen_st8_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,
                                             xmm_regs[reg].XMM_B(val & 15)));
                     break;
                 case 0x21: /* insertps */
@@ -4008,7 +4503,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
             val = cpu_ldub_code(env, s->pc++);
 
             if ((b & 0xfc) == 0x60) { /* pcmpXstrX */
-                s->cc_op = CC_OP_EFLAGS;
+                set_cc_op(s, CC_OP_EFLAGS);
 
                 if (s->dflag == 2)
                     /* The helper must use entire 64-bit gp registers */
@@ -4019,6 +4514,38 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
             tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
             sse_fn_eppi(cpu_env, cpu_ptr0, cpu_ptr1, tcg_const_i32(val));
             break;
+
+        case 0x33a:
+            /* Various integer extensions at 0f 3a f[0-f].  */
+            b = modrm | (b1 << 8);
+            modrm = cpu_ldub_code(env, s->pc++);
+            reg = ((modrm >> 3) & 7) | rex_r;
+
+            switch (b) {
+            case 0x3f0: /* rorx Gy,Ey, Ib */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                b = cpu_ldub_code(env, s->pc++);
+                if (ot == OT_QUAD) {
+                    tcg_gen_rotri_tl(cpu_T[0], cpu_T[0], b & 63);
+                } else {
+                    tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                    tcg_gen_rotri_i32(cpu_tmp2_i32, cpu_tmp2_i32, b & 31);
+                    tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
+                }
+                gen_op_mov_reg_T0(ot, reg);
+                break;
+
+            default:
+                goto illegal_op;
+            }
+            break;
+
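
RORX rotates right by an immediate without touching the flags; the immediate is masked to the operand size just as in the b & 63 / b & 31 above. A 32-bit reference sketch:

    #include <stdint.h>

    static uint32_t rorx32_ref(uint32_t x, unsigned imm)
    {
        unsigned r = imm & 31;
        return r ? (x >> r) | (x << (32 - r)) : x;
    }
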
         default:
             goto illegal_op;
         }
@@ -4129,7 +4656,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
             break;
         }
         if (b == 0x2e || b == 0x2f) {
-            s->cc_op = CC_OP_EFLAGS;
+            set_cc_op(s, CC_OP_EFLAGS);
         }
     }
 }
@@ -4150,8 +4677,6 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
     }
     s->pc = pc_start;
     prefixes = 0;
-    aflag = s->code32;
-    dflag = s->code32;
     s->override = -1;
     rex_w = -1;
     rex_r = 0;
@@ -4161,47 +4686,49 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
     x86_64_hregs = 0;
 #endif
     s->rip_offset = 0; /* for relative ip address */
+    s->vex_l = 0;
+    s->vex_v = 0;
  next_byte:
     b = cpu_ldub_code(env, s->pc);
     s->pc++;
-    /* check prefixes */
+    /* Collect prefixes.  */
+    switch (b) {
+    case 0xf3:
+        prefixes |= PREFIX_REPZ;
+        goto next_byte;
+    case 0xf2:
+        prefixes |= PREFIX_REPNZ;
+        goto next_byte;
+    case 0xf0:
+        prefixes |= PREFIX_LOCK;
+        goto next_byte;
+    case 0x2e:
+        s->override = R_CS;
+        goto next_byte;
+    case 0x36:
+        s->override = R_SS;
+        goto next_byte;
+    case 0x3e:
+        s->override = R_DS;
+        goto next_byte;
+    case 0x26:
+        s->override = R_ES;
+        goto next_byte;
+    case 0x64:
+        s->override = R_FS;
+        goto next_byte;
+    case 0x65:
+        s->override = R_GS;
+        goto next_byte;
+    case 0x66:
+        prefixes |= PREFIX_DATA;
+        goto next_byte;
+    case 0x67:
+        prefixes |= PREFIX_ADR;
+        goto next_byte;
 #ifdef TARGET_X86_64
-    if (CODE64(s)) {
-        switch (b) {
-        case 0xf3:
-            prefixes |= PREFIX_REPZ;
-            goto next_byte;
-        case 0xf2:
-            prefixes |= PREFIX_REPNZ;
-            goto next_byte;
-        case 0xf0:
-            prefixes |= PREFIX_LOCK;
-            goto next_byte;
-        case 0x2e:
-            s->override = R_CS;
-            goto next_byte;
-        case 0x36:
-            s->override = R_SS;
-            goto next_byte;
-        case 0x3e:
-            s->override = R_DS;
-            goto next_byte;
-        case 0x26:
-            s->override = R_ES;
-            goto next_byte;
-        case 0x64:
-            s->override = R_FS;
-            goto next_byte;
-        case 0x65:
-            s->override = R_GS;
-            goto next_byte;
-        case 0x66:
-            prefixes |= PREFIX_DATA;
-            goto next_byte;
-        case 0x67:
-            prefixes |= PREFIX_ADR;
-            goto next_byte;
-        case 0x40 ... 0x4f:
+    case 0x40 ... 0x4f:
+        if (CODE64(s)) {
             /* REX prefix */
             rex_w = (b >> 3) & 1;
             rex_r = (b & 0x4) << 1;
@@ -4210,57 +4737,86 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             x86_64_hregs = 1; /* select uniform byte register addressing */
             goto next_byte;
         }
-        if (rex_w == 1) {
-            /* 0x66 is ignored if rex.w is set */
-            dflag = 2;
-        } else {
-            if (prefixes & PREFIX_DATA)
-                dflag ^= 1;
-        }
-        if (!(prefixes & PREFIX_ADR))
-            aflag = 2;
-    } else
+        break;
 #endif
-    {
-        switch (b) {
-        case 0xf3:
-            prefixes |= PREFIX_REPZ;
-            goto next_byte;
-        case 0xf2:
-            prefixes |= PREFIX_REPNZ;
-            goto next_byte;
-        case 0xf0:
-            prefixes |= PREFIX_LOCK;
-            goto next_byte;
-        case 0x2e:
-            s->override = R_CS;
-            goto next_byte;
-        case 0x36:
-            s->override = R_SS;
-            goto next_byte;
-        case 0x3e:
-            s->override = R_DS;
-            goto next_byte;
-        case 0x26:
-            s->override = R_ES;
-            goto next_byte;
-        case 0x64:
-            s->override = R_FS;
-            goto next_byte;
-        case 0x65:
-            s->override = R_GS;
-            goto next_byte;
-        case 0x66:
-            prefixes |= PREFIX_DATA;
-            goto next_byte;
-        case 0x67:
-            prefixes |= PREFIX_ADR;
-            goto next_byte;
+    case 0xc5: /* 2-byte VEX */
+    case 0xc4: /* 3-byte VEX */
+        /* VEX prefixes are valid only in 32-bit and 64-bit protected
+           mode; otherwise these bytes decode as LES or LDS.  */
+        if (s->code32 && !s->vm86) {
+            static const int pp_prefix[4] = {
+                0, PREFIX_DATA, PREFIX_REPZ, PREFIX_REPNZ
+            };
+            int vex3, vex2 = cpu_ldub_code(env, s->pc);
+
+            if (!CODE64(s) && (vex2 & 0xc0) != 0xc0) {
+                /* 4.1.4.6: In 32-bit mode, bits [7:6] must be 11b,
+                   otherwise the instruction is LES or LDS.  */
+                break;
+            }
+            s->pc++;
+
+            /* 4.1.1-4.1.3: No preceding lock, 66, f2, f3, or rex prefixes. */
+            if (prefixes & (PREFIX_REPZ | PREFIX_REPNZ
+                            | PREFIX_LOCK | PREFIX_DATA)) {
+                goto illegal_op;
+            }
+#ifdef TARGET_X86_64
+            if (x86_64_hregs) {
+                goto illegal_op;
+            }
+#endif
+            rex_r = (~vex2 >> 4) & 8;
+            if (b == 0xc5) {
+                vex3 = vex2;
+                b = cpu_ldub_code(env, s->pc++);
+            } else {
+#ifdef TARGET_X86_64
+                s->rex_x = (~vex2 >> 3) & 8;
+                s->rex_b = (~vex2 >> 2) & 8;
+#endif
+                vex3 = cpu_ldub_code(env, s->pc++);
+                rex_w = (vex3 >> 7) & 1;
+                switch (vex2 & 0x1f) {
+                case 0x01: /* Implied 0f leading opcode bytes.  */
+                    b = cpu_ldub_code(env, s->pc++) | 0x100;
+                    break;
+                case 0x02: /* Implied 0f 38 leading opcode bytes.  */
+                    b = 0x138;
+                    break;
+                case 0x03: /* Implied 0f 3a leading opcode bytes.  */
+                    b = 0x13a;
+                    break;
+                default:   /* Reserved for future use.  */
+                    goto illegal_op;
+                }
+            }
+            s->vex_v = (~vex3 >> 3) & 0xf;
+            s->vex_l = (vex3 >> 2) & 1;
+            prefixes |= pp_prefix[vex3 & 3] | PREFIX_VEX;
         }
-        if (prefixes & PREFIX_DATA)
+        break;
+    }
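
The VEX fields are stored bit-inverted in the prefix bytes; the decode above recovers them with complemented shifts. A standalone sketch of the 3-byte form (using plain 0/1 flags rather than the pre-shifted rex bits QEMU keeps):

    #include <stdint.h>

    struct vex3 {
        int r, x, b, w;            /* register-extension bits, un-inverted */
        int vvvv, l, pp, mmmmm;
    };

    static struct vex3 decode_vex3(uint8_t byte1, uint8_t byte2)
    {
        struct vex3 v;
        v.r     = (~byte1 >> 7) & 1;
        v.x     = (~byte1 >> 6) & 1;
        v.b     = (~byte1 >> 5) & 1;
        v.mmmmm = byte1 & 0x1f;        /* 1=0f, 2=0f38, 3=0f3a map */
        v.w     = (byte2 >> 7) & 1;
        v.vvvv  = (~byte2 >> 3) & 0xf; /* second source register */
        v.l     = (byte2 >> 2) & 1;    /* vector length */
        v.pp    = byte2 & 3;           /* implied 66/f3/f2 prefix */
        return v;
    }
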
+
+    /* Post-process prefixes.  */
+    if (CODE64(s)) {
+        /* In 64-bit mode, the default data size is 32-bit.  Select 64-bit
+           data with rex_w, and 16-bit data with 0x66; rex_w takes precedence
+           over 0x66 if both are present.  */
+        dflag = (rex_w > 0 ? 2 : prefixes & PREFIX_DATA ? 0 : 1);
+        /* In 64-bit mode, 0x67 selects 32-bit addressing.  */
+        aflag = (prefixes & PREFIX_ADR ? 1 : 2);
+    } else {
+        /* In 16/32-bit mode, 0x66 selects the opposite data size.  */
+        dflag = s->code32;
+        if (prefixes & PREFIX_DATA) {
             dflag ^= 1;
-        if (prefixes & PREFIX_ADR)
+        }
+        /* In 16/32-bit mode, 0x67 selects the opposite addressing.  */
+        aflag = s->code32;
+        if (prefixes & PREFIX_ADR) {
             aflag ^= 1;
+        }
     }
 
     s->prefix = prefixes;
@@ -4312,10 +4868,9 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 } else if (op == OP_XORL && rm == reg) {
                 xor_zero:
                     /* xor reg, reg optimisation */
+                    set_cc_op(s, CC_OP_CLR);
                     gen_op_movl_T0_0();
-                    s->cc_op = CC_OP_LOGICB + ot;
                     gen_op_mov_reg_T0(ot, reg);
-                    gen_op_update1_cc();
                     break;
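
CC_OP_CLR encodes the fact that after xor r,r every arithmetic flag is a compile-time constant, so nothing at all needs to be stored into cc_dst or cc_src. The constant it stands for (a sketch using the EFLAGS bit values from cpu.h):

    #include <stdint.h>

    #define CC_P 0x0004
    #define CC_Z 0x0040

    /* result is 0: ZF=PF=1 and CF=OF=SF=AF=0 */
    static uint32_t flags_after_xor_zero(void)
    {
        return CC_Z | CC_P;
    }
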
                 } else {
                     opreg = rm;
@@ -4428,7 +4983,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             val = insn_get(env, s, ot);
             gen_op_movl_T1_im(val);
             gen_op_testl_T0_T1_cc();
-            s->cc_op = CC_OP_LOGICB + ot;
+            set_cc_op(s, CC_OP_LOGICB + ot);
             break;
         case 2: /* not */
             tcg_gen_not_tl(cpu_T[0], cpu_T[0]);
@@ -4446,7 +5001,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 gen_op_mov_reg_T0(ot, rm);
             }
             gen_op_update_neg_cc();
-            s->cc_op = CC_OP_SUBB + ot;
+            set_cc_op(s, CC_OP_SUBB + ot);
             break;
         case 4: /* mul */
             switch(ot) {
@@ -4459,7 +5014,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 gen_op_mov_reg_T0(OT_WORD, R_EAX);
                 tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
                 tcg_gen_andi_tl(cpu_cc_src, cpu_T[0], 0xff00);
-                s->cc_op = CC_OP_MULB;
+                set_cc_op(s, CC_OP_MULB);
                 break;
             case OT_WORD:
                 gen_op_mov_TN_reg(OT_WORD, 1, R_EAX);
@@ -4472,44 +5027,27 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 16);
                 gen_op_mov_reg_T0(OT_WORD, R_EDX);
                 tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
-                s->cc_op = CC_OP_MULW;
+                set_cc_op(s, CC_OP_MULW);
                 break;
             default:
             case OT_LONG:
-#ifdef TARGET_X86_64
-                gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
-                tcg_gen_ext32u_tl(cpu_T[0], cpu_T[0]);
-                tcg_gen_ext32u_tl(cpu_T[1], cpu_T[1]);
-                tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
-                gen_op_mov_reg_T0(OT_LONG, R_EAX);
-                tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 32);
-                gen_op_mov_reg_T0(OT_LONG, R_EDX);
-                tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
-#else
-                {
-                    TCGv_i64 t0, t1;
-                    t0 = tcg_temp_new_i64();
-                    t1 = tcg_temp_new_i64();
-                    gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
-                    tcg_gen_extu_i32_i64(t0, cpu_T[0]);
-                    tcg_gen_extu_i32_i64(t1, cpu_T[1]);
-                    tcg_gen_mul_i64(t0, t0, t1);
-                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
-                    gen_op_mov_reg_T0(OT_LONG, R_EAX);
-                    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                    tcg_gen_shri_i64(t0, t0, 32);
-                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
-                    gen_op_mov_reg_T0(OT_LONG, R_EDX);
-                    tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
-                }
-#endif
-                s->cc_op = CC_OP_MULL;
+                tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_regs[R_EAX]);
+                tcg_gen_mulu2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
+                                  cpu_tmp2_i32, cpu_tmp3_i32);
+                tcg_gen_extu_i32_tl(cpu_regs[R_EAX], cpu_tmp2_i32);
+                tcg_gen_extu_i32_tl(cpu_regs[R_EDX], cpu_tmp3_i32);
+                tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
+                tcg_gen_mov_tl(cpu_cc_src, cpu_regs[R_EDX]);
+                set_cc_op(s, CC_OP_MULL);
                 break;
 #ifdef TARGET_X86_64
             case OT_QUAD:
-                gen_helper_mulq_EAX_T0(cpu_env, cpu_T[0]);
-                s->cc_op = CC_OP_MULQ;
+                tcg_gen_mulu2_i64(cpu_regs[R_EAX], cpu_regs[R_EDX],
+                                  cpu_T[0], cpu_regs[R_EAX]);
+                tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
+                tcg_gen_mov_tl(cpu_cc_src, cpu_regs[R_EDX]);
+                set_cc_op(s, CC_OP_MULQ);
                 break;
 #endif
             }
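
Replacing the mulq helper with tcg_gen_mulu2_i64 computes the full 64x64->128 unsigned product inline; cc_src then holds the high half, from which CF/OF = (high != 0) can be derived lazily. Host-level reference (sketch):

    #include <stdint.h>

    static void mulq_ref(uint64_t a, uint64_t b,
                         uint64_t *rax, uint64_t *rdx)
    {
        unsigned __int128 p = (unsigned __int128)a * b;
        *rax = (uint64_t)p;
        *rdx = (uint64_t)(p >> 64);
    }
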
@@ -4526,7 +5064,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
                 tcg_gen_ext8s_tl(cpu_tmp0, cpu_T[0]);
                 tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
-                s->cc_op = CC_OP_MULB;
+                set_cc_op(s, CC_OP_MULB);
                 break;
             case OT_WORD:
                 gen_op_mov_TN_reg(OT_WORD, 1, R_EAX);
@@ -4540,46 +5078,30 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
                 tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 16);
                 gen_op_mov_reg_T0(OT_WORD, R_EDX);
-                s->cc_op = CC_OP_MULW;
+                set_cc_op(s, CC_OP_MULW);
                 break;
             default:
             case OT_LONG:
-#ifdef TARGET_X86_64
-                gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
-                tcg_gen_ext32s_tl(cpu_T[0], cpu_T[0]);
-                tcg_gen_ext32s_tl(cpu_T[1], cpu_T[1]);
-                tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
-                gen_op_mov_reg_T0(OT_LONG, R_EAX);
-                tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                tcg_gen_ext32s_tl(cpu_tmp0, cpu_T[0]);
-                tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
-                tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 32);
-                gen_op_mov_reg_T0(OT_LONG, R_EDX);
-#else
-                {
-                    TCGv_i64 t0, t1;
-                    t0 = tcg_temp_new_i64();
-                    t1 = tcg_temp_new_i64();
-                    gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
-                    tcg_gen_ext_i32_i64(t0, cpu_T[0]);
-                    tcg_gen_ext_i32_i64(t1, cpu_T[1]);
-                    tcg_gen_mul_i64(t0, t0, t1);
-                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
-                    gen_op_mov_reg_T0(OT_LONG, R_EAX);
-                    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                    tcg_gen_sari_tl(cpu_tmp0, cpu_T[0], 31);
-                    tcg_gen_shri_i64(t0, t0, 32);
-                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
-                    gen_op_mov_reg_T0(OT_LONG, R_EDX);
-                    tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
-                }
-#endif
-                s->cc_op = CC_OP_MULL;
+                tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_regs[R_EAX]);
+                tcg_gen_muls2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
+                                  cpu_tmp2_i32, cpu_tmp3_i32);
+                tcg_gen_extu_i32_tl(cpu_regs[R_EAX], cpu_tmp2_i32);
+                tcg_gen_extu_i32_tl(cpu_regs[R_EDX], cpu_tmp3_i32);
+                tcg_gen_sari_i32(cpu_tmp2_i32, cpu_tmp2_i32, 31);
+                tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
+                tcg_gen_sub_i32(cpu_tmp2_i32, cpu_tmp2_i32, cpu_tmp3_i32);
+                tcg_gen_extu_i32_tl(cpu_cc_src, cpu_tmp2_i32);
+                set_cc_op(s, CC_OP_MULL);
                 break;
 #ifdef TARGET_X86_64
             case OT_QUAD:
-                gen_helper_imulq_EAX_T0(cpu_env, cpu_T[0]);
-                s->cc_op = CC_OP_MULQ;
+                tcg_gen_muls2_i64(cpu_regs[R_EAX], cpu_regs[R_EDX],
+                                  cpu_T[0], cpu_regs[R_EAX]);
+                tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
+                tcg_gen_sari_tl(cpu_cc_src, cpu_regs[R_EAX], 63);
+                tcg_gen_sub_tl(cpu_cc_src, cpu_cc_src, cpu_regs[R_EDX]);
+                set_cc_op(s, CC_OP_MULQ);
                 break;
 #endif
             }
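
For the signed forms, the overflow test generated above is sign_extend(low half) minus high half: it is zero exactly when the 128-bit product fits in 64 bits. Reference sketch (assuming arithmetic right shift of signed values, as GCC and Clang provide):

    #include <stdint.h>

    static int imulq_overflows(int64_t a, int64_t b)
    {
        __int128 p = (__int128)a * b;
        int64_t lo = (int64_t)p;
        int64_t hi = (int64_t)(p >> 64);
        return ((lo >> 63) - hi) != 0;   /* mirrors the sari + sub */
    }
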
@@ -4699,8 +5221,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             gen_op_ldu_T0_A0(OT_WORD + s->mem_index);
         do_lcall:
             if (s->pe && !s->vm86) {
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
                 gen_helper_lcall_protected(cpu_env, cpu_tmp2_i32, cpu_T[1],
@@ -4726,8 +5247,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             gen_op_ldu_T0_A0(OT_WORD + s->mem_index);
         do_ljmp:
             if (s->pe && !s->vm86) {
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
                 gen_helper_ljmp_protected(cpu_env, cpu_tmp2_i32, cpu_T[1],
@@ -4760,7 +5280,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
         gen_op_mov_TN_reg(ot, 1, reg);
         gen_op_testl_T0_T1_cc();
-        s->cc_op = CC_OP_LOGICB + ot;
+        set_cc_op(s, CC_OP_LOGICB + ot);
         break;
 
     case 0xa8: /* test eAX, Iv */
@@ -4774,7 +5294,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         gen_op_mov_TN_reg(ot, 0, OR_EAX);
         gen_op_movl_T1_im(val);
         gen_op_testl_T0_T1_cc();
-        s->cc_op = CC_OP_LOGICB + ot;
+        set_cc_op(s, CC_OP_LOGICB + ot);
         break;
 
     case 0x98: /* CWDE/CBW */
@@ -4835,37 +5355,27 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         } else {
             gen_op_mov_TN_reg(ot, 1, reg);
         }
-
-#ifdef TARGET_X86_64
-        if (ot == OT_QUAD) {
-            gen_helper_imulq_T0_T1(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]);
-        } else
-#endif
-        if (ot == OT_LONG) {
+        switch (ot) {
 #ifdef TARGET_X86_64
-                tcg_gen_ext32s_tl(cpu_T[0], cpu_T[0]);
-                tcg_gen_ext32s_tl(cpu_T[1], cpu_T[1]);
-                tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
-                tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                tcg_gen_ext32s_tl(cpu_tmp0, cpu_T[0]);
-                tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
-#else
-                {
-                    TCGv_i64 t0, t1;
-                    t0 = tcg_temp_new_i64();
-                    t1 = tcg_temp_new_i64();
-                    tcg_gen_ext_i32_i64(t0, cpu_T[0]);
-                    tcg_gen_ext_i32_i64(t1, cpu_T[1]);
-                    tcg_gen_mul_i64(t0, t0, t1);
-                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
-                    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                    tcg_gen_sari_tl(cpu_tmp0, cpu_T[0], 31);
-                    tcg_gen_shri_i64(t0, t0, 32);
-                    tcg_gen_trunc_i64_i32(cpu_T[1], t0);
-                    tcg_gen_sub_tl(cpu_cc_src, cpu_T[1], cpu_tmp0);
-                }
+        case OT_QUAD:
+            tcg_gen_muls2_i64(cpu_regs[reg], cpu_T[1], cpu_T[0], cpu_T[1]);
+            tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[reg]);
+            tcg_gen_sari_tl(cpu_cc_src, cpu_cc_dst, 63);
+            tcg_gen_sub_tl(cpu_cc_src, cpu_cc_src, cpu_T[1]);
+            break;
 #endif
-        } else {
+        case OT_LONG:
+            tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+            tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_T[1]);
+            tcg_gen_muls2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
+                              cpu_tmp2_i32, cpu_tmp3_i32);
+            tcg_gen_extu_i32_tl(cpu_regs[reg], cpu_tmp2_i32);
+            tcg_gen_sari_i32(cpu_tmp2_i32, cpu_tmp2_i32, 31);
+            tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[reg]);
+            tcg_gen_sub_i32(cpu_tmp2_i32, cpu_tmp2_i32, cpu_tmp3_i32);
+            tcg_gen_extu_i32_tl(cpu_cc_src, cpu_tmp2_i32);
+            break;
+        default:
             tcg_gen_ext16s_tl(cpu_T[0], cpu_T[0]);
             tcg_gen_ext16s_tl(cpu_T[1], cpu_T[1]);
             /* XXX: use 32 bit mul which could be faster */
@@ -4873,9 +5383,10 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
             tcg_gen_ext16s_tl(cpu_tmp0, cpu_T[0]);
             tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
+            gen_op_mov_reg_T0(ot, reg);
+            break;
         }
-        gen_op_mov_reg_T0(ot, reg);
-        s->cc_op = CC_OP_MULB + ot;
+        set_cc_op(s, CC_OP_MULB + ot);
         break;
     case 0x1c0:
     case 0x1c1: /* xadd Ev, Gv */
@@ -4902,7 +5413,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             gen_op_mov_reg_T1(ot, reg);
         }
         gen_op_update2_cc();
-        s->cc_op = CC_OP_ADDB + ot;
+        set_cc_op(s, CC_OP_ADDB + ot);
         break;
     case 0x1b0:
     case 0x1b1: /* cmpxchg Ev, Gv */
@@ -4932,9 +5443,10 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 rm = 0; /* avoid warning */
             }
             label1 = gen_new_label();
-            tcg_gen_sub_tl(t2, cpu_regs[R_EAX], t0);
+            tcg_gen_mov_tl(t2, cpu_regs[R_EAX]);
+            gen_extu(ot, t0);
             gen_extu(ot, t2);
-            tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, label1);
+            tcg_gen_brcond_tl(TCG_COND_EQ, t2, t0, label1);
             label2 = gen_new_label();
             if (mod == 3) {
                 gen_op_mov_reg_v(ot, R_EAX, t0);
@@ -4953,8 +5465,9 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             }
             gen_set_label(label2);
             tcg_gen_mov_tl(cpu_cc_src, t0);
-            tcg_gen_mov_tl(cpu_cc_dst, t2);
-            s->cc_op = CC_OP_SUBB + ot;
+            tcg_gen_mov_tl(cpu_cc_srcT, t2);
+            tcg_gen_sub_tl(cpu_cc_dst, t2, t0);
+            set_cc_op(s, CC_OP_SUBB + ot);
             tcg_temp_free(t0);
             tcg_temp_free(t1);
             tcg_temp_free(t2);
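
The rewritten epilogue stores the raw minuend into cc_srcT, the subtrahend into cc_src, and the difference into cc_dst, which is exactly what the lazy CC_OP_SUB* evaluators need to reconstruct EFLAGS later. A mirror of the flag triple (sketch):

    #include <stdint.h>

    struct sub_cc { uint64_t dst, src, srcT; };

    static struct sub_cc cmpxchg_cc(uint64_t accum, uint64_t oldv)
    {
        struct sub_cc cc;
        cc.srcT = accum;          /* minuend, kept for CF/AF */
        cc.src  = oldv;           /* subtrahend */
        cc.dst  = accum - oldv;   /* difference, gives ZF/SF */
        return cc;
    }
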
@@ -4971,8 +5484,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             if (!(s->cpuid_ext_features & CPUID_EXT_CX16))
                 goto illegal_op;
             gen_jmp_im(pc_start - s->cs_base);
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             gen_lea_modrm(env, s, modrm, &reg_addr, &offset_addr);
             gen_helper_cmpxchg16b(cpu_env, cpu_A0);
         } else
@@ -4981,12 +5493,11 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             if (!(s->cpuid_features & CPUID_CX8))
                 goto illegal_op;
             gen_jmp_im(pc_start - s->cs_base);
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             gen_lea_modrm(env, s, modrm, &reg_addr, &offset_addr);
             gen_helper_cmpxchg8b(cpu_env, cpu_A0);
         }
-        s->cc_op = CC_OP_EFLAGS;
+        set_cc_op(s, CC_OP_EFLAGS);
         break;
 
         /**************************/
@@ -5390,13 +5901,11 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         }
         break;
     case 0xc4: /* les Gv */
-        if (CODE64(s))
-            goto illegal_op;
+        /* In CODE64 this is VEX3; see above.  */
         op = R_ES;
         goto do_lxx;
     case 0xc5: /* lds Gv */
-        if (CODE64(s))
-            goto illegal_op;
+        /* In CODE64 this is VEX2; see above.  */
         op = R_DS;
         goto do_lxx;
     case 0x1b2: /* lss Gv */
@@ -5507,12 +6016,12 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         gen_op_mov_TN_reg(ot, 1, reg);
 
         if (shift) {
-            val = cpu_ldub_code(env, s->pc++);
-            tcg_gen_movi_tl(cpu_T3, val);
+            TCGv imm = tcg_const_tl(cpu_ldub_code(env, s->pc++));
+            gen_shiftd_rm_T1(s, ot, opreg, op, imm);
+            tcg_temp_free(imm);
         } else {
-            tcg_gen_mov_tl(cpu_T3, cpu_regs[R_ECX]);
+            gen_shiftd_rm_T1(s, ot, opreg, op, cpu_regs[R_ECX]);
         }
-        gen_shiftd_rm_T1_T3(s, ot, opreg, op);
         break;
 
         /************************/
@@ -5655,8 +6164,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 }
                 break;
             case 0x0c: /* fldenv mem */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 gen_helper_fldenv(cpu_env, cpu_A0, tcg_const_i32(s->dflag));
                 break;
@@ -5666,8 +6174,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 gen_helper_fldcw(cpu_env, cpu_tmp2_i32);
                 break;
             case 0x0e: /* fnstenv mem */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 gen_helper_fstenv(cpu_env, cpu_A0, tcg_const_i32(s->dflag));
                 break;
@@ -5677,27 +6184,23 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 gen_op_st_T0_A0(OT_WORD + s->mem_index);
                 break;
             case 0x1d: /* fldt mem */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 gen_helper_fldt_ST0(cpu_env, cpu_A0);
                 break;
             case 0x1f: /* fstpt mem */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 gen_helper_fstt_ST0(cpu_env, cpu_A0);
                 gen_helper_fpop(cpu_env);
                 break;
             case 0x2c: /* frstor mem */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 gen_helper_frstor(cpu_env, cpu_A0, tcg_const_i32(s->dflag));
                 break;
             case 0x2e: /* fnsave mem */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 gen_helper_fsave(cpu_env, cpu_A0, tcg_const_i32(s->dflag));
                 break;
@@ -5707,14 +6210,12 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 gen_op_st_T0_A0(OT_WORD + s->mem_index);
                 break;
             case 0x3c: /* fbld */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 gen_helper_fbld_ST0(cpu_env, cpu_A0);
                 break;
             case 0x3e: /* fbstp */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 gen_helper_fbst_ST0(cpu_env, cpu_A0);
                 gen_helper_fpop(cpu_env);
@@ -5752,8 +6253,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 switch(rm) {
                 case 0: /* fnop */
                     /* check exceptions (FreeBSD FPU probe) */
-                    if (s->cc_op != CC_OP_DYNAMIC)
-                        gen_op_set_cc_op(s->cc_op);
+                    gen_update_cc_op(s);
                     gen_jmp_im(pc_start - s->cs_base);
                     gen_helper_fwait(cpu_env);
                     break;
@@ -5934,18 +6434,16 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 }
                 break;
             case 0x1d: /* fucomi */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_helper_fmov_FT0_STN(cpu_env, tcg_const_i32(opreg));
                 gen_helper_fucomi_ST0_FT0(cpu_env);
-                s->cc_op = CC_OP_EFLAGS;
+                set_cc_op(s, CC_OP_EFLAGS);
                 break;
             case 0x1e: /* fcomi */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_helper_fmov_FT0_STN(cpu_env, tcg_const_i32(opreg));
                 gen_helper_fcomi_ST0_FT0(cpu_env);
-                s->cc_op = CC_OP_EFLAGS;
+                set_cc_op(s, CC_OP_EFLAGS);
                 break;
             case 0x28: /* ffree sti */
                 gen_helper_ffree_STN(cpu_env, tcg_const_i32(opreg));
@@ -5997,20 +6495,18 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 }
                 break;
             case 0x3d: /* fucomip */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_helper_fmov_FT0_STN(cpu_env, tcg_const_i32(opreg));
                 gen_helper_fucomi_ST0_FT0(cpu_env);
                 gen_helper_fpop(cpu_env);
-                s->cc_op = CC_OP_EFLAGS;
+                set_cc_op(s, CC_OP_EFLAGS);
                 break;
             case 0x3e: /* fcomip */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_helper_fmov_FT0_STN(cpu_env, tcg_const_i32(opreg));
                 gen_helper_fcomi_ST0_FT0(cpu_env);
                 gen_helper_fpop(cpu_env);
-                s->cc_op = CC_OP_EFLAGS;
+                set_cc_op(s, CC_OP_EFLAGS);
                 break;
             case 0x10 ... 0x13: /* fcmovxx */
             case 0x18 ... 0x1b:
@@ -6024,7 +6520,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                     };
                     op1 = fcmov_cc[op & 3] | (((op >> 3) & 1) ^ 1);
                     l1 = gen_new_label();
-                    gen_jcc1(s, s->cc_op, op1, l1);
+                    gen_jcc1_noeob(s, op1, l1);
                     gen_helper_fmov_ST0_STN(cpu_env, tcg_const_i32(opreg));
                     gen_set_label(l1);
                 }
@@ -6088,7 +6584,6 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             gen_repz_scas(s, ot, pc_start - s->cs_base, s->pc - s->cs_base, 0);
         } else {
             gen_scas(s, ot);
-            s->cc_op = CC_OP_SUBB + ot;
         }
         break;
 
@@ -6104,7 +6599,6 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             gen_repz_cmps(s, ot, pc_start - s->cs_base, s->pc - s->cs_base, 0);
         } else {
             gen_cmps(s, ot);
-            s->cc_op = CC_OP_SUBB + ot;
         }
         break;
     case 0x6c: /* insS */
@@ -6261,8 +6755,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         s->pc += 2;
     do_lret:
         if (s->pe && !s->vm86) {
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             gen_jmp_im(pc_start - s->cs_base);
             gen_helper_lret_protected(cpu_env, tcg_const_i32(s->dflag),
                                       tcg_const_i32(val));
@@ -6292,21 +6785,20 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         if (!s->pe) {
             /* real mode */
             gen_helper_iret_real(cpu_env, tcg_const_i32(s->dflag));
-            s->cc_op = CC_OP_EFLAGS;
+            set_cc_op(s, CC_OP_EFLAGS);
         } else if (s->vm86) {
             if (s->iopl != 3) {
                 gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base);
             } else {
                 gen_helper_iret_real(cpu_env, tcg_const_i32(s->dflag));
-                s->cc_op = CC_OP_EFLAGS;
+                set_cc_op(s, CC_OP_EFLAGS);
             }
         } else {
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             gen_jmp_im(pc_start - s->cs_base);
             gen_helper_iret_protected(cpu_env, tcg_const_i32(s->dflag),
                                       tcg_const_i32(s->pc - s->cs_base));
-            s->cc_op = CC_OP_EFLAGS;
+            set_cc_op(s, CC_OP_EFLAGS);
         }
         gen_eob(s);
         break;
@@ -6393,44 +6885,14 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
 
     case 0x190 ... 0x19f: /* setcc Gv */
         modrm = cpu_ldub_code(env, s->pc++);
-        gen_setcc(s, b);
+        gen_setcc1(s, b, cpu_T[0]);
         gen_ldst_modrm(env, s, modrm, OT_BYTE, OR_TMP0, 1);
         break;
     case 0x140 ... 0x14f: /* cmov Gv, Ev */
-        {
-            int l1;
-            TCGv t0;
-
-            ot = dflag + OT_WORD;
-            modrm = cpu_ldub_code(env, s->pc++);
-            reg = ((modrm >> 3) & 7) | rex_r;
-            mod = (modrm >> 6) & 3;
-            t0 = tcg_temp_local_new();
-            if (mod != 3) {
-                gen_lea_modrm(env, s, modrm, &reg_addr, &offset_addr);
-                gen_op_ld_v(ot + s->mem_index, t0, cpu_A0);
-            } else {
-                rm = (modrm & 7) | REX_B(s);
-                gen_op_mov_v_reg(ot, t0, rm);
-            }
-#ifdef TARGET_X86_64
-            if (ot == OT_LONG) {
-                /* XXX: specific Intel behaviour ? */
-                l1 = gen_new_label();
-                gen_jcc1(s, s->cc_op, b ^ 1, l1);
-                tcg_gen_mov_tl(cpu_regs[reg], t0);
-                gen_set_label(l1);
-                tcg_gen_ext32u_tl(cpu_regs[reg], cpu_regs[reg]);
-            } else
-#endif
-            {
-                l1 = gen_new_label();
-                gen_jcc1(s, s->cc_op, b ^ 1, l1);
-                gen_op_mov_reg_v(ot, reg, t0);
-                gen_set_label(l1);
-            }
-            tcg_temp_free(t0);
-        }
+        ot = dflag + OT_WORD;
+        modrm = cpu_ldub_code(env, s->pc++);
+        reg = ((modrm >> 3) & 7) | rex_r;
+        gen_cmovcc1(env, s, ot, b, modrm, reg);
         break;
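
The label-and-branch sequence removed above becomes a single call. The real gen_cmovcc1(), defined elsewhere in the patch, can reuse the live CC data without recomputing EFLAGS; the sketch below hard-codes the cmovz case only, and ignores the 16-bit merge and 32-bit zero-extension details, purely to show the branch-free movcond idea. Hypothetical code, not the patch's actual body:

    gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);  /* operand -> cpu_T[0] */
    gen_compute_eflags(s);                          /* EFLAGS -> cpu_cc_src */
    {
        TCGv z = tcg_const_tl(0);
        tcg_gen_andi_tl(cpu_tmp0, cpu_cc_src, CC_Z);
        /* reg = (ZF != 0) ? T0 : reg; no label, no local temp.  */
        tcg_gen_movcond_tl(TCG_COND_NE, cpu_regs[reg], cpu_tmp0, z,
                           cpu_T[0], cpu_regs[reg]);
        tcg_temp_free(z);
    }
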
 
         /************************/
@@ -6440,8 +6902,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         if (s->vm86 && s->iopl != 3) {
             gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base);
         } else {
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             gen_helper_read_eflags(cpu_T[0], cpu_env);
             gen_push_T0(s);
         }
@@ -6498,7 +6959,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 }
             }
             gen_pop_update(s);
-            s->cc_op = CC_OP_EFLAGS;
+            set_cc_op(s, CC_OP_EFLAGS);
             /* abort translation because TF/AC flag may change */
             gen_jmp_im(s->pc - s->cs_base);
             gen_eob(s);
@@ -6508,44 +6969,30 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         if (CODE64(s) && !(s->cpuid_ext3_features & CPUID_EXT3_LAHF_LM))
             goto illegal_op;
         gen_op_mov_TN_reg(OT_BYTE, 0, R_AH);
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
-        gen_compute_eflags(cpu_cc_src);
+        gen_compute_eflags(s);
         tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, CC_O);
         tcg_gen_andi_tl(cpu_T[0], cpu_T[0], CC_S | CC_Z | CC_A | CC_P | CC_C);
         tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, cpu_T[0]);
-        s->cc_op = CC_OP_EFLAGS;
         break;
     case 0x9f: /* lahf */
         if (CODE64(s) && !(s->cpuid_ext3_features & CPUID_EXT3_LAHF_LM))
             goto illegal_op;
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
-        gen_compute_eflags(cpu_T[0]);
+        gen_compute_eflags(s);
         /* Note: gen_compute_eflags() only gives the condition codes */
-        tcg_gen_ori_tl(cpu_T[0], cpu_T[0], 0x02);
+        tcg_gen_ori_tl(cpu_T[0], cpu_cc_src, 0x02);
         gen_op_mov_reg_T0(OT_BYTE, R_AH);
         break;
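
With gen_compute_eflags() now depositing the flags in cpu_cc_src (and setting CC_OP_EFLAGS itself), lahf reads cpu_cc_src directly instead of a scratch copy. The AH layout it builds, restated as a standalone C check using the standard x86 flag masks:

    #include <stdio.h>
    #define CC_C 0x0001
    #define CC_P 0x0004
    #define CC_A 0x0010
    #define CC_Z 0x0040
    #define CC_S 0x0080

    int main(void)
    {
        unsigned eflags = CC_Z | CC_C;      /* example input */
        /* lahf: AH = SF:ZF:0:AF:0:PF:1:CF, bit 1 always set.  */
        unsigned ah = (eflags & (CC_S | CC_Z | CC_A | CC_P | CC_C)) | 0x02;
        printf("AH = %#04x\n", ah);         /* prints 0x43 */
        return 0;
    }
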
     case 0xf5: /* cmc */
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
-        gen_compute_eflags(cpu_cc_src);
+        gen_compute_eflags(s);
         tcg_gen_xori_tl(cpu_cc_src, cpu_cc_src, CC_C);
-        s->cc_op = CC_OP_EFLAGS;
         break;
     case 0xf8: /* clc */
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
-        gen_compute_eflags(cpu_cc_src);
+        gen_compute_eflags(s);
         tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~CC_C);
-        s->cc_op = CC_OP_EFLAGS;
         break;
     case 0xf9: /* stc */
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
-        gen_compute_eflags(cpu_cc_src);
+        gen_compute_eflags(s);
         tcg_gen_ori_tl(cpu_cc_src, cpu_cc_src, CC_C);
-        s->cc_op = CC_OP_EFLAGS;
         break;
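
cmc/clc/stc follow the same pattern: gen_compute_eflags(s) canonicalizes the state to CC_OP_EFLAGS (which is why the trailing "s->cc_op = CC_OP_EFLAGS;" assignments are dropped), after which each instruction is a single bit operation on cpu_cc_src. The flag arithmetic in plain, runnable C:

    #include <stdio.h>
    #define CC_C 0x0001    /* x86 carry flag bit */

    int main(void)
    {
        unsigned eflags = 0x0246;               /* example cc_src value */
        printf("cmc: %#x\n", eflags ^ CC_C);    /* complement CF */
        printf("clc: %#x\n", eflags & ~CC_C);   /* clear CF */
        printf("stc: %#x\n", eflags | CC_C);    /* set CF */
        return 0;
    }
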
     case 0xfc: /* cld */
         tcg_gen_movi_i32(cpu_tmp2_i32, 1);
@@ -6635,7 +7082,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             tcg_gen_xor_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
             break;
         }
-        s->cc_op = CC_OP_SARB + ot;
+        set_cc_op(s, CC_OP_SARB + ot);
         if (op != 0) {
             if (mod != 3)
                 gen_op_st_T0_A0(ot + s->mem_index);
@@ -6645,81 +7092,88 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             tcg_gen_movi_tl(cpu_cc_dst, 0);
         }
         break;
-    case 0x1bc: /* bsf */
-    case 0x1bd: /* bsr */
-        {
-            int label1;
-            TCGv t0;
-
-            ot = dflag + OT_WORD;
-            modrm = cpu_ldub_code(env, s->pc++);
-            reg = ((modrm >> 3) & 7) | rex_r;
-            gen_ldst_modrm(env, s,modrm, ot, OR_TMP0, 0);
-            gen_extu(ot, cpu_T[0]);
-            t0 = tcg_temp_local_new();
-            tcg_gen_mov_tl(t0, cpu_T[0]);
-            if ((b & 1) && (prefixes & PREFIX_REPZ) &&
-                (s->cpuid_ext3_features & CPUID_EXT3_ABM)) {
-                switch(ot) {
-                case OT_WORD: gen_helper_lzcnt(cpu_T[0], t0,
-                    tcg_const_i32(16)); break;
-                case OT_LONG: gen_helper_lzcnt(cpu_T[0], t0,
-                    tcg_const_i32(32)); break;
-                case OT_QUAD: gen_helper_lzcnt(cpu_T[0], t0,
-                    tcg_const_i32(64)); break;
-                }
-                gen_op_mov_reg_T0(ot, reg);
+    case 0x1bc: /* bsf / tzcnt */
+    case 0x1bd: /* bsr / lzcnt */
+        ot = dflag + OT_WORD;
+        modrm = cpu_ldub_code(env, s->pc++);
+        reg = ((modrm >> 3) & 7) | rex_r;
+        gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+        gen_extu(ot, cpu_T[0]);
+
+        /* Note that lzcnt and tzcnt are in different extensions.  */
+        if ((prefixes & PREFIX_REPZ)
+            && (b & 1
+                ? s->cpuid_ext3_features & CPUID_EXT3_ABM
+                : s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)) {
+            int size = 8 << ot;
+            tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
+            if (b & 1) {
+                /* For lzcnt, reduce the target_ulong result by the
+                   number of zeros that we expect to find at the top.  */
+                gen_helper_clz(cpu_T[0], cpu_T[0]);
+                tcg_gen_subi_tl(cpu_T[0], cpu_T[0], TARGET_LONG_BITS - size);
             } else {
-                label1 = gen_new_label();
-                tcg_gen_movi_tl(cpu_cc_dst, 0);
-                tcg_gen_brcondi_tl(TCG_COND_EQ, t0, 0, label1);
-                if (b & 1) {
-                    gen_helper_bsr(cpu_T[0], t0);
-                } else {
-                    gen_helper_bsf(cpu_T[0], t0);
-                }
-                gen_op_mov_reg_T0(ot, reg);
-                tcg_gen_movi_tl(cpu_cc_dst, 1);
-                gen_set_label(label1);
-                tcg_gen_discard_tl(cpu_cc_src);
-                s->cc_op = CC_OP_LOGICB + ot;
+                /* For tzcnt, a zero input must return the operand size:
+                   force all bits outside the operand size to 1.  */
+                target_ulong mask = (target_ulong)-2 << (size - 1);
+                tcg_gen_ori_tl(cpu_T[0], cpu_T[0], mask);
+                gen_helper_ctz(cpu_T[0], cpu_T[0]);
             }
-            tcg_temp_free(t0);
+            /* For lzcnt/tzcnt, the C and Z bits are defined and are
+               related to the result.  */
+            gen_op_update1_cc();
+            set_cc_op(s, CC_OP_BMILGB + ot);
+        } else {
+            /* For bsr/bsf, only the Z bit is defined and it is related
+               to the input and not the result.  */
+            tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
+            set_cc_op(s, CC_OP_LOGICB + ot);
+            if (b & 1) {
+                /* For bsr, return the bit index of the first 1 bit,
+                   not the count of leading zeros.  */
+                gen_helper_clz(cpu_T[0], cpu_T[0]);
+                tcg_gen_xori_tl(cpu_T[0], cpu_T[0], TARGET_LONG_BITS - 1);
+            } else {
+                gen_helper_ctz(cpu_T[0], cpu_T[0]);
+            }
+            /* ??? The manual says that the output is undefined when the
+               input is zero, but real hardware leaves it unchanged, and
+               real programs appear to depend on that.  */
+            tcg_gen_movi_tl(cpu_tmp0, 0);
+            tcg_gen_movcond_tl(TCG_COND_EQ, cpu_T[0], cpu_cc_dst, cpu_tmp0,
+                               cpu_regs[reg], cpu_T[0]);
         }
+        gen_op_mov_reg_T0(ot, reg);
         break;
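
The width adjustments are the subtle part of this rewrite: the clz/ctz helpers operate on the full target_ulong, so lzcnt subtracts the zeros above the operand, tzcnt forces the bits above the operand so a zero input yields the operand size, and bsr converts a leading-zero count into a bit index by xor with TARGET_LONG_BITS - 1 (63 - n equals n ^ 63 for n in 0..63). A standalone check of that arithmetic, with clz64/ctz64 approximated by GCC builtins plus an explicit zero case, as qemu/host-utils.h does:

    #include <stdio.h>
    #include <stdint.h>

    /* Stand-ins for host-utils clz64/ctz64; the builtins are undefined
       for 0, so handle that case explicitly.  */
    static int clz64(uint64_t x) { return x ? __builtin_clzll(x) : 64; }
    static int ctz64(uint64_t x) { return x ? __builtin_ctzll(x) : 64; }

    int main(void)
    {
        int size = 16;                /* OT_WORD operand */
        uint64_t val = 0x0040;        /* bit 6 set */
        uint64_t mask = (uint64_t)-2 << (size - 1);   /* ...ffff0000 */

        /* lzcnt: subtract the 48 zeros above a 16-bit operand.  */
        printf("lzcnt16(0x40) = %d\n", clz64(val) - (64 - size));  /* 9 */
        /* tzcnt: the forced high bits make a zero input yield 16.  */
        printf("tzcnt16(0)    = %d\n", ctz64(0 | mask));           /* 16 */
        /* bsr: leading-zero count to bit index.  */
        printf("bsr(0x40)     = %d\n", clz64(val) ^ 63);           /* 6 */
        return 0;
    }
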
         /************************/
         /* bcd */
     case 0x27: /* daa */
         if (CODE64(s))
             goto illegal_op;
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
+        gen_update_cc_op(s);
         gen_helper_daa(cpu_env);
-        s->cc_op = CC_OP_EFLAGS;
+        set_cc_op(s, CC_OP_EFLAGS);
         break;
     case 0x2f: /* das */
         if (CODE64(s))
             goto illegal_op;
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
+        gen_update_cc_op(s);
         gen_helper_das(cpu_env);
-        s->cc_op = CC_OP_EFLAGS;
+        set_cc_op(s, CC_OP_EFLAGS);
         break;
     case 0x37: /* aaa */
         if (CODE64(s))
             goto illegal_op;
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
+        gen_update_cc_op(s);
         gen_helper_aaa(cpu_env);
-        s->cc_op = CC_OP_EFLAGS;
+        set_cc_op(s, CC_OP_EFLAGS);
         break;
     case 0x3f: /* aas */
         if (CODE64(s))
             goto illegal_op;
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
+        gen_update_cc_op(s);
         gen_helper_aas(cpu_env);
-        s->cc_op = CC_OP_EFLAGS;
+        set_cc_op(s, CC_OP_EFLAGS);
         break;
     case 0xd4: /* aam */
         if (CODE64(s))
@@ -6729,7 +7183,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             gen_exception(s, EXCP00_DIVZ, pc_start - s->cs_base);
         } else {
             gen_helper_aam(cpu_env, tcg_const_i32(val));
-            s->cc_op = CC_OP_LOGICB;
+            set_cc_op(s, CC_OP_LOGICB);
         }
         break;
     case 0xd5: /* aad */
@@ -6737,7 +7191,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             goto illegal_op;
         val = cpu_ldub_code(env, s->pc++);
         gen_helper_aad(cpu_env, tcg_const_i32(val));
-        s->cc_op = CC_OP_LOGICB;
+        set_cc_op(s, CC_OP_LOGICB);
         break;
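
aam and aad set SF/ZF/PF from the result the way a logical op does, which is why CC_OP_LOGICB suffices here; the division itself stays in the helper. The core semantics, restated in plain C:

    #include <stdio.h>

    int main(void)
    {
        unsigned al = 0x37, base = 10;   /* aam 10 with AL = 55 */
        unsigned ah = al / base;         /* AH = 5 */
        al = al % base;                  /* AL = 5 */
        printf("AH=%u AL=%u\n", ah, al);
        /* aad reverses it: AL = AL + AH * base; AH = 0.  */
        return 0;
    }
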
         /************************/
         /* misc */
@@ -6759,8 +7213,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             (HF_MP_MASK | HF_TS_MASK)) {
             gen_exception(s, EXCP07_PREX, pc_start - s->cs_base);
         } else {
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             gen_jmp_im(pc_start - s->cs_base);
             gen_helper_fwait(cpu_env);
         }
@@ -6779,8 +7232,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
     case 0xce: /* into */
         if (CODE64(s))
             goto illegal_op;
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
+        gen_update_cc_op(s);
         gen_jmp_im(pc_start - s->cs_base);
         gen_helper_into(cpu_env, tcg_const_i32(s->pc - pc_start));
         break;
@@ -6873,9 +7325,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
     case 0xd6: /* salc */
         if (CODE64(s))
             goto illegal_op;
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
-        gen_compute_eflags_c(cpu_T[0]);
+        gen_compute_eflags_c(s, cpu_T[0]);
         tcg_gen_neg_tl(cpu_T[0], cpu_T[0]);
         gen_op_mov_reg_T0(OT_BYTE, R_EAX);
         break;
@@ -6899,17 +7349,9 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             switch(b) {
             case 0: /* loopnz */
             case 1: /* loopz */
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
                 gen_op_add_reg_im(s->aflag, R_ECX, -1);
                 gen_op_jz_ecx(s->aflag, l3);
-                gen_compute_eflags(cpu_tmp0);
-                tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, CC_Z);
-                if (b == 0) {
-                    tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_tmp0, 0, l1);
-                } else {
-                    tcg_gen_brcondi_tl(TCG_COND_NE, cpu_tmp0, 0, l1);
-                }
+                gen_jcc1(s, (JCC_Z << 1) | (b ^ 1), l1);
                 break;
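
The removed compute/mask/branch sequence reduces to one gen_jcc1() call using the standard condition encoding, (cond << 1) | negate, which mirrors the low nibble of the Jcc opcodes (0x74 jz versus 0x75 jnz). Restated for the two loop cases, with JCC_Z's numeric value assumed from that opcode nibble:

    enum { JCC_Z = 2 };    /* value assumed; the encoding is the point */

    static int loop_cond(int b)    /* b: 0 = loopnz, 1 = loopz */
    {
        /* loopnz branches while ZF is clear (negate = 1),
           loopz while ZF is set (negate = 0).  */
        return (JCC_Z << 1) | (b ^ 1);
    }
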
             case 2: /* loop */
                 gen_op_add_reg_im(s->aflag, R_ECX, -1);
@@ -6936,8 +7378,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         if (s->cpl != 0) {
             gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base);
         } else {
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             gen_jmp_im(pc_start - s->cs_base);
             if (b & 2) {
                 gen_helper_rdmsr(cpu_env);
@@ -6947,8 +7388,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         }
         break;
     case 0x131: /* rdtsc */
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
+        gen_update_cc_op(s);
         gen_jmp_im(pc_start - s->cs_base);
         if (use_icount)
             gen_io_start();
@@ -6959,8 +7399,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         }
         break;
     case 0x133: /* rdpmc */
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
+        gen_update_cc_op(s);
         gen_jmp_im(pc_start - s->cs_base);
         gen_helper_rdpmc(cpu_env);
         break;
@@ -7006,15 +7445,15 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             gen_jmp_im(pc_start - s->cs_base);
             gen_helper_sysret(cpu_env, tcg_const_i32(s->dflag));
             /* condition codes are modified only in long mode */
-            if (s->lma)
-                s->cc_op = CC_OP_EFLAGS;
+            if (s->lma) {
+                set_cc_op(s, CC_OP_EFLAGS);
+            }
             gen_eob(s);
         }
         break;
 #endif
     case 0x1a2: /* cpuid */
-        if (s->cc_op != CC_OP_DYNAMIC)
-            gen_op_set_cc_op(s->cc_op);
+        gen_update_cc_op(s);
         gen_jmp_im(pc_start - s->cs_base);
         gen_helper_cpuid(cpu_env);
         break;
@@ -7022,8 +7461,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         if (s->cpl != 0) {
             gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base);
         } else {
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             gen_jmp_im(pc_start - s->cs_base);
             gen_helper_hlt(cpu_env, tcg_const_i32(s->pc - pc_start));
             s->is_jmp = DISAS_TB_JUMP;
@@ -7085,14 +7523,13 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             if (!s->pe || s->vm86)
                 goto illegal_op;
             gen_ldst_modrm(env, s, modrm, OT_WORD, OR_TMP0, 0);
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             if (op == 4) {
                 gen_helper_verr(cpu_env, cpu_T[0]);
             } else {
                 gen_helper_verw(cpu_env, cpu_T[0]);
             }
-            s->cc_op = CC_OP_EFLAGS;
+            set_cc_op(s, CC_OP_EFLAGS);
             break;
         default:
             goto illegal_op;
@@ -7124,8 +7561,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                     if (!(s->cpuid_ext_features & CPUID_EXT_MONITOR) ||
                         s->cpl != 0)
                         goto illegal_op;
-                    if (s->cc_op != CC_OP_DYNAMIC)
-                        gen_op_set_cc_op(s->cc_op);
+                    gen_update_cc_op(s);
                     gen_jmp_im(pc_start - s->cs_base);
 #ifdef TARGET_X86_64
                     if (s->aflag == 2) {
@@ -7185,8 +7621,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         case 2: /* lgdt */
         case 3: /* lidt */
             if (mod == 3) {
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 switch(rm) {
                 case 0: /* VMRUN */
@@ -7314,8 +7749,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 if (s->cpl != 0) {
                     gen_exception(s, EXCP0D_GPF, pc_start - s->cs_base);
                 } else {
-                    if (s->cc_op != CC_OP_DYNAMIC)
-                        gen_op_set_cc_op(s->cc_op);
+                    gen_update_cc_op(s);
                     gen_jmp_im(pc_start - s->cs_base);
                     gen_lea_modrm(env, s, modrm, &reg_addr, &offset_addr);
                     gen_helper_invlpg(cpu_env, cpu_A0);
@@ -7348,8 +7782,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 case 1: /* rdtscp */
                     if (!(s->cpuid_ext2_features & CPUID_EXT2_RDTSCP))
                         goto illegal_op;
-                    if (s->cc_op != CC_OP_DYNAMIC)
-                        gen_op_set_cc_op(s->cc_op);
+                    gen_update_cc_op(s);
                     gen_jmp_im(pc_start - s->cs_base);
                     if (use_icount)
                         gen_io_start();
@@ -7445,12 +7878,9 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             } else {
                 gen_op_mov_reg_v(ot, rm, t0);
             }
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
-            gen_compute_eflags(cpu_cc_src);
+            gen_compute_eflags(s);
             tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~CC_Z);
             tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, t2);
-            s->cc_op = CC_OP_EFLAGS;
             tcg_temp_free(t0);
             tcg_temp_free(t1);
             tcg_temp_free(t2);
@@ -7468,8 +7898,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             reg = ((modrm >> 3) & 7) | rex_r;
             gen_ldst_modrm(env, s, modrm, OT_WORD, OR_TMP0, 0);
             t0 = tcg_temp_local_new();
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             if (b == 0x102) {
                 gen_helper_lar(t0, cpu_env, cpu_T[0]);
             } else {
@@ -7480,7 +7909,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             tcg_gen_brcondi_tl(TCG_COND_EQ, cpu_tmp0, 0, label1);
             gen_op_mov_reg_v(ot, reg, t0);
             gen_set_label(label1);
-            s->cc_op = CC_OP_EFLAGS;
+            set_cc_op(s, CC_OP_EFLAGS);
             tcg_temp_free(t0);
         }
         break;
@@ -7534,8 +7963,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             case 3:
             case 4:
             case 8:
-                if (s->cc_op != CC_OP_DYNAMIC)
-                    gen_op_set_cc_op(s->cc_op);
+                gen_update_cc_op(s);
                 gen_jmp_im(pc_start - s->cs_base);
                 if (b & 2) {
                     gen_op_mov_TN_reg(ot, 0, rm);
@@ -7624,8 +8052,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 break;
             }
             gen_lea_modrm(env, s, modrm, &reg_addr, &offset_addr);
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             gen_jmp_im(pc_start - s->cs_base);
             gen_helper_fxsave(cpu_env, cpu_A0, tcg_const_i32((s->dflag == 2)));
             break;
@@ -7638,8 +8065,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 break;
             }
             gen_lea_modrm(env, s, modrm, &reg_addr, &offset_addr);
-            if (s->cc_op != CC_OP_DYNAMIC)
-                gen_op_set_cc_op(s->cc_op);
+            gen_update_cc_op(s);
             gen_jmp_im(pc_start - s->cs_base);
             gen_helper_fxrstor(cpu_env, cpu_A0,
                                tcg_const_i32((s->dflag == 2)));
@@ -7723,7 +8149,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         gen_helper_popcnt(cpu_T[0], cpu_env, cpu_T[0], tcg_const_i32(ot));
         gen_op_mov_reg_T0(ot, reg);
 
-        s->cc_op = CC_OP_EFLAGS;
+        set_cc_op(s, CC_OP_EFLAGS);
         break;
     case 0x10e ... 0x10f:
         /* 3DNow! instructions, ignore prefixes */
@@ -7758,12 +8184,12 @@ void optimize_flags_init(void)
     cpu_env = tcg_global_reg_new_ptr(TCG_AREG0, "env");
     cpu_cc_op = tcg_global_mem_new_i32(TCG_AREG0,
                                        offsetof(CPUX86State, cc_op), "cc_op");
-    cpu_cc_src = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_src),
-                                    "cc_src");
     cpu_cc_dst = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_dst),
                                     "cc_dst");
-    cpu_cc_tmp = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_tmp),
-                                    "cc_tmp");
+    cpu_cc_src = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_src),
+                                    "cc_src");
+    cpu_cc_src2 = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_src2),
+                                     "cc_src2");
 
 #ifdef TARGET_X86_64
     cpu_regs[R_EAX] = tcg_global_mem_new_i64(TCG_AREG0,
@@ -7825,10 +8251,12 @@ void optimize_flags_init(void)
 /* generate intermediate code in gen_opc_buf and gen_opparam_buf for
    basic block 'tb'. If search_pc is TRUE, also generate PC
    information for each intermediate instruction. */
-static inline void gen_intermediate_code_internal(CPUX86State *env,
+static inline void gen_intermediate_code_internal(X86CPU *cpu,
                                                   TranslationBlock *tb,
-                                                  int search_pc)
+                                                  bool search_pc)
 {
+    CPUState *cs = CPU(cpu);
+    CPUX86State *env = &cpu->env;
     DisasContext dc1, *dc = &dc1;
     target_ulong pc_ptr;
     uint16_t *gen_opc_end;
@@ -7854,8 +8282,9 @@ static inline void gen_intermediate_code_internal(CPUX86State *env,
     dc->cpl = (flags >> HF_CPL_SHIFT) & 3;
     dc->iopl = (flags >> IOPL_SHIFT) & 3;
     dc->tf = (flags >> TF_SHIFT) & 1;
-    dc->singlestep_enabled = env->singlestep_enabled;
+    dc->singlestep_enabled = cs->singlestep_enabled;
     dc->cc_op = CC_OP_DYNAMIC;
+    dc->cc_op_dirty = false;
     dc->cs_base = cs_base;
     dc->tb = tb;
     dc->popl_esp_hack = 0;
@@ -7864,17 +8293,17 @@ static inline void gen_intermediate_code_internal(CPUX86State *env,
     if (flags & HF_SOFTMMU_MASK) {
         dc->mem_index = (cpu_mmu_index(env) + 1) << 2;
     }
-    dc->cpuid_features = env->cpuid_features;
-    dc->cpuid_ext_features = env->cpuid_ext_features;
-    dc->cpuid_ext2_features = env->cpuid_ext2_features;
-    dc->cpuid_ext3_features = env->cpuid_ext3_features;
-    dc->cpuid_7_0_ebx_features = env->cpuid_7_0_ebx_features;
+    dc->cpuid_features = env->features[FEAT_1_EDX];
+    dc->cpuid_ext_features = env->features[FEAT_1_ECX];
+    dc->cpuid_ext2_features = env->features[FEAT_8000_0001_EDX];
+    dc->cpuid_ext3_features = env->features[FEAT_8000_0001_ECX];
+    dc->cpuid_7_0_ebx_features = env->features[FEAT_7_0_EBX];
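
The flat env->cpuid_* fields give way to an env->features[] array indexed by a FeatureWord enum defined in cpu.h, outside this patch. Roughly, with the membership and ordering illustrative rather than exact:

    typedef enum FeatureWord {
        FEAT_1_EDX,           /* CPUID[1].EDX */
        FEAT_1_ECX,           /* CPUID[1].ECX */
        FEAT_7_0_EBX,         /* CPUID[EAX=7,ECX=0].EBX */
        FEAT_8000_0001_EDX,   /* CPUID[8000_0001].EDX */
        FEAT_8000_0001_ECX,   /* CPUID[8000_0001].ECX */
        FEATURE_WORDS,
    } FeatureWord;

    /* In CPUX86State: */
    uint32_t features[FEATURE_WORDS];
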
 #ifdef TARGET_X86_64
     dc->lma = (flags >> HF_LMA_SHIFT) & 1;
     dc->code64 = (flags >> HF_CS64_SHIFT) & 1;
 #endif
     dc->flags = flags;
-    dc->jmp_opt = !(dc->tf || env->singlestep_enabled ||
+    dc->jmp_opt = !(dc->tf || cs->singlestep_enabled ||
                     (flags & HF_INHIBIT_IRQ_MASK)
 #ifndef CONFIG_SOFTMMU
                     || (flags & HF_SOFTMMU_MASK)
@@ -7889,16 +8318,15 @@ static inline void gen_intermediate_code_internal(CPUX86State *env,
     cpu_T[0] = tcg_temp_new();
     cpu_T[1] = tcg_temp_new();
     cpu_A0 = tcg_temp_new();
-    cpu_T3 = tcg_temp_new();
 
     cpu_tmp0 = tcg_temp_new();
     cpu_tmp1_i64 = tcg_temp_new_i64();
     cpu_tmp2_i32 = tcg_temp_new_i32();
     cpu_tmp3_i32 = tcg_temp_new_i32();
     cpu_tmp4 = tcg_temp_new();
-    cpu_tmp5 = tcg_temp_new();
     cpu_ptr0 = tcg_temp_new_ptr();
     cpu_ptr1 = tcg_temp_new_ptr();
+    cpu_cc_srcT = tcg_temp_local_new();
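
cpu_cc_srcT is allocated with tcg_temp_local_new() rather than as a TCG global: it must carry the subtraction's first operand across branches inside a TB until the flags are materialized, and the USES_CC_SRCT bit in cc_op_live[] tracks that liveness. A hypothetical emission for a register-register sub, showing what it records:

    tcg_gen_mov_tl(cpu_cc_srcT, cpu_T[0]);         /* minuend, pre-sub */
    tcg_gen_sub_tl(cpu_T[0], cpu_T[0], cpu_T[1]);  /* result */
    tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]);          /* subtrahend */
    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);          /* result, for ZF/SF */
    set_cc_op(s, CC_OP_SUBB + ot);                 /* flags recomputed lazily */
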
 
     gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
 
@@ -7910,7 +8338,7 @@ static inline void gen_intermediate_code_internal(CPUX86State *env,
     if (max_insns == 0)
         max_insns = CF_COUNT_MASK;
 
-    gen_icount_start();
+    gen_tb_start();
     for(;;) {
         if (unlikely(!QTAILQ_EMPTY(&env->breakpoints))) {
             QTAILQ_FOREACH(bp, &env->breakpoints, entry) {
@@ -7968,7 +8396,7 @@ static inline void gen_intermediate_code_internal(CPUX86State *env,
     }
     if (tb->cflags & CF_LAST_IO)
         gen_io_end();
-    gen_icount_end(tb, num_insns);
+    gen_tb_end(tb, num_insns);
     *tcg_ctx.gen_opc_ptr = INDEX_op_end;
 /* make sure the last values are filled in */
     if (search_pc) {
@@ -8002,12 +8430,12 @@ static inline void gen_intermediate_code_internal(CPUX86State *env,
 
 void gen_intermediate_code(CPUX86State *env, TranslationBlock *tb)
 {
-    gen_intermediate_code_internal(env, tb, 0);
+    gen_intermediate_code_internal(x86_env_get_cpu(env), tb, false);
 }
 
 void gen_intermediate_code_pc(CPUX86State *env, TranslationBlock *tb)
 {
-    gen_intermediate_code_internal(env, tb, 1);
+    gen_intermediate_code_internal(x86_env_get_cpu(env), tb, true);
 }
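
The wrappers recover the X86CPU object from the bare CPUX86State, presumably via the usual container_of pattern of this QOM era; the definition lives in cpu.h, not in this patch:

    static inline X86CPU *x86_env_get_cpu(CPUX86State *env)
    {
        return container_of(env, X86CPU, env);
    }
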
 
 void restore_state_to_opc(CPUX86State *env, TranslationBlock *tb, int pc_pos)