]> git.proxmox.com Git - qemu.git/blobdiff - target-i386/translate.c
target-i386: Fix aflag logic for CODE64 and the 0x67 prefix
[qemu.git] / target-i386 / translate.c
index 9bbe969cd646f4bb794bec9b53bb6ed69650dd2f..14b02987496054da0b87bc3c8bf22db98b2307f4 100644 (file)
@@ -23,6 +23,7 @@
 #include <inttypes.h>
 #include <signal.h>
 
+#include "qemu/host-utils.h"
 #include "cpu.h"
 #include "disas/disas.h"
 #include "tcg-op.h"
@@ -36,6 +37,7 @@
 #define PREFIX_LOCK   0x04
 #define PREFIX_DATA   0x08
 #define PREFIX_ADR    0x10
+#define PREFIX_VEX    0x20
 
 #ifdef TARGET_X86_64
 #define CODE64(s) ((s)->code64)
 #define REX_B(s) 0
 #endif
 
+#ifdef TARGET_X86_64
+# define ctztl  ctz64
+# define clztl  clz64
+#else
+# define ctztl  ctz32
+# define clztl  clz32
+#endif
+
 //#define MACRO_TEST   1
 
 /* global register indexes */
 static TCGv_ptr cpu_env;
-static TCGv cpu_A0, cpu_cc_src, cpu_cc_dst;
+static TCGv cpu_A0;
+static TCGv cpu_cc_dst, cpu_cc_src, cpu_cc_src2, cpu_cc_srcT;
 static TCGv_i32 cpu_cc_op;
 static TCGv cpu_regs[CPU_NB_REGS];
 /* local temps */
-static TCGv cpu_T[2], cpu_T3;
+static TCGv cpu_T[2];
 /* local register indexes (only used inside old micro ops) */
 static TCGv cpu_tmp0, cpu_tmp4;
 static TCGv_ptr cpu_ptr0, cpu_ptr1;
 static TCGv_i32 cpu_tmp2_i32, cpu_tmp3_i32;
 static TCGv_i64 cpu_tmp1_i64;
-static TCGv cpu_tmp5;
 
 static uint8_t gen_opc_cc_op[OPC_BUF_SIZE];
 
@@ -88,6 +98,8 @@ typedef struct DisasContext {
     int code64; /* 64 bit code segment */
     int rex_x, rex_b;
 #endif
+    int vex_l;  /* vex vector length */
+    int vex_v;  /* vex vvvv register, without 1's compliment.  */
     int ss32;   /* 32 bit stack segment */
     CCOp cc_op;  /* current CC operation */
     bool cc_op_dirty;
@@ -114,6 +126,7 @@ typedef struct DisasContext {
 static void gen_eob(DisasContext *s);
 static void gen_jmp(DisasContext *s, target_ulong eip);
 static void gen_jmp_tb(DisasContext *s, target_ulong eip, int tb_num);
+static void gen_op(DisasContext *s1, int op, int ot, int d);
 
 /* i386 arith/logic operations */
 enum {
@@ -174,14 +187,69 @@ enum {
     OR_A0, /* temporary register used when doing address evaluation */
 };
 
+enum {
+    USES_CC_DST  = 1,
+    USES_CC_SRC  = 2,
+    USES_CC_SRC2 = 4,
+    USES_CC_SRCT = 8,
+};
+
+/* Bit set if the global variable is live after setting CC_OP to X.  */
+static const uint8_t cc_op_live[CC_OP_NB] = {
+    [CC_OP_DYNAMIC] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
+    [CC_OP_EFLAGS] = USES_CC_SRC,
+    [CC_OP_MULB ... CC_OP_MULQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_ADDB ... CC_OP_ADDQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_ADCB ... CC_OP_ADCQ] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
+    [CC_OP_SUBB ... CC_OP_SUBQ] = USES_CC_DST | USES_CC_SRC | USES_CC_SRCT,
+    [CC_OP_SBBB ... CC_OP_SBBQ] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
+    [CC_OP_LOGICB ... CC_OP_LOGICQ] = USES_CC_DST,
+    [CC_OP_INCB ... CC_OP_INCQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_DECB ... CC_OP_DECQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_SHLB ... CC_OP_SHLQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_SARB ... CC_OP_SARQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_BMILGB ... CC_OP_BMILGQ] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_ADCX] = USES_CC_DST | USES_CC_SRC,
+    [CC_OP_ADOX] = USES_CC_SRC | USES_CC_SRC2,
+    [CC_OP_ADCOX] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
+    [CC_OP_CLR] = 0,
+};
+
 static void set_cc_op(DisasContext *s, CCOp op)
 {
-    if (s->cc_op != op) {
-        s->cc_op = op;
+    int dead;
+
+    if (s->cc_op == op) {
+        return;
+    }
+
+    /* Discard CC computation that will no longer be used.  */
+    dead = cc_op_live[s->cc_op] & ~cc_op_live[op];
+    if (dead & USES_CC_DST) {
+        tcg_gen_discard_tl(cpu_cc_dst);
+    }
+    if (dead & USES_CC_SRC) {
+        tcg_gen_discard_tl(cpu_cc_src);
+    }
+    if (dead & USES_CC_SRC2) {
+        tcg_gen_discard_tl(cpu_cc_src2);
+    }
+    if (dead & USES_CC_SRCT) {
+        tcg_gen_discard_tl(cpu_cc_srcT);
+    }
+
+    if (op == CC_OP_DYNAMIC) {
         /* The DYNAMIC setting is translator only, and should never be
            stored.  Thus we always consider it clean.  */
-        s->cc_op_dirty = (op != CC_OP_DYNAMIC);
+        s->cc_op_dirty = false;
+    } else {
+        /* Discard any computed CC_OP value (see shifts).  */
+        if (s->cc_op == CC_OP_DYNAMIC) {
+            tcg_gen_discard_i32(cpu_cc_op);
+        }
+        s->cc_op_dirty = true;
     }
+    s->cc_op = op;
 }
 
 static void gen_update_cc_op(DisasContext *s)
@@ -809,7 +877,6 @@ static inline void gen_movs(DisasContext *s, int ot)
 
 static void gen_op_update1_cc(void)
 {
-    tcg_gen_discard_tl(cpu_cc_src);
     tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
 }
 
@@ -819,287 +886,392 @@ static void gen_op_update2_cc(void)
     tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
 }
 
-static inline void gen_op_cmpl_T0_T1_cc(void)
+static void gen_op_update3_cc(TCGv reg)
 {
+    tcg_gen_mov_tl(cpu_cc_src2, reg);
     tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]);
-    tcg_gen_sub_tl(cpu_cc_dst, cpu_T[0], cpu_T[1]);
+    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
 }
 
 static inline void gen_op_testl_T0_T1_cc(void)
 {
-    tcg_gen_discard_tl(cpu_cc_src);
     tcg_gen_and_tl(cpu_cc_dst, cpu_T[0], cpu_T[1]);
 }
 
 static void gen_op_update_neg_cc(void)
 {
-    tcg_gen_neg_tl(cpu_cc_src, cpu_T[0]);
     tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
+    tcg_gen_neg_tl(cpu_cc_src, cpu_T[0]);
+    tcg_gen_movi_tl(cpu_cc_srcT, 0);
 }
 
-/* compute eflags.C to reg */
-static void gen_compute_eflags_c(DisasContext *s, TCGv reg)
+/* compute all eflags to cc_src */
+static void gen_compute_eflags(DisasContext *s)
 {
+    TCGv zero, dst, src1, src2;
+    int live, dead;
+
+    if (s->cc_op == CC_OP_EFLAGS) {
+        return;
+    }
+    if (s->cc_op == CC_OP_CLR) {
+        tcg_gen_movi_tl(cpu_cc_src, CC_Z);
+        set_cc_op(s, CC_OP_EFLAGS);
+        return;
+    }
+
+    TCGV_UNUSED(zero);
+    dst = cpu_cc_dst;
+    src1 = cpu_cc_src;
+    src2 = cpu_cc_src2;
+
+    /* Take care to not read values that are not live.  */
+    live = cc_op_live[s->cc_op] & ~USES_CC_SRCT;
+    dead = live ^ (USES_CC_DST | USES_CC_SRC | USES_CC_SRC2);
+    if (dead) {
+        zero = tcg_const_tl(0);
+        if (dead & USES_CC_DST) {
+            dst = zero;
+        }
+        if (dead & USES_CC_SRC) {
+            src1 = zero;
+        }
+        if (dead & USES_CC_SRC2) {
+            src2 = zero;
+        }
+    }
+
     gen_update_cc_op(s);
-    gen_helper_cc_compute_c(cpu_tmp2_i32, cpu_env, cpu_cc_op);
-    tcg_gen_extu_i32_tl(reg, cpu_tmp2_i32);
+    gen_helper_cc_compute_all(cpu_cc_src, dst, src1, src2, cpu_cc_op);
+    set_cc_op(s, CC_OP_EFLAGS);
+
+    if (dead) {
+        tcg_temp_free(zero);
+    }
 }
 
-/* compute all eflags to reg */
-static void gen_compute_eflags(DisasContext *s, TCGv reg)
+typedef struct CCPrepare {
+    TCGCond cond;
+    TCGv reg;
+    TCGv reg2;
+    target_ulong imm;
+    target_ulong mask;
+    bool use_reg2;
+    bool no_setcond;
+} CCPrepare;
+
+/* compute eflags.C to reg */
+static CCPrepare gen_prepare_eflags_c(DisasContext *s, TCGv reg)
 {
-    gen_update_cc_op(s);
-    gen_helper_cc_compute_all(cpu_tmp2_i32, cpu_env, cpu_cc_op);
-    if (TCGV_EQUAL(reg, cpu_cc_src)) {
-        tcg_gen_discard_tl(cpu_cc_dst);
-        set_cc_op(s, CC_OP_EFLAGS);
+    TCGv t0, t1;
+    int size, shift;
+
+    switch (s->cc_op) {
+    case CC_OP_SUBB ... CC_OP_SUBQ:
+        /* (DATA_TYPE)CC_SRCT < (DATA_TYPE)CC_SRC */
+        size = s->cc_op - CC_OP_SUBB;
+        t1 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false);
+        /* If no temporary was used, be careful not to alias t1 and t0.  */
+        t0 = TCGV_EQUAL(t1, cpu_cc_src) ? cpu_tmp0 : reg;
+        tcg_gen_mov_tl(t0, cpu_cc_srcT);
+        gen_extu(size, t0);
+        goto add_sub;
+
+    case CC_OP_ADDB ... CC_OP_ADDQ:
+        /* (DATA_TYPE)CC_DST < (DATA_TYPE)CC_SRC */
+        size = s->cc_op - CC_OP_ADDB;
+        t1 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false);
+        t0 = gen_ext_tl(reg, cpu_cc_dst, size, false);
+    add_sub:
+        return (CCPrepare) { .cond = TCG_COND_LTU, .reg = t0,
+                             .reg2 = t1, .mask = -1, .use_reg2 = true };
+
+    case CC_OP_LOGICB ... CC_OP_LOGICQ:
+    case CC_OP_CLR:
+        return (CCPrepare) { .cond = TCG_COND_NEVER, .mask = -1 };
+
+    case CC_OP_INCB ... CC_OP_INCQ:
+    case CC_OP_DECB ... CC_OP_DECQ:
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+                             .mask = -1, .no_setcond = true };
+
+    case CC_OP_SHLB ... CC_OP_SHLQ:
+        /* (CC_SRC >> (DATA_BITS - 1)) & 1 */
+        size = s->cc_op - CC_OP_SHLB;
+        shift = (8 << size) - 1;
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+                             .mask = (target_ulong)1 << shift };
+
+    case CC_OP_MULB ... CC_OP_MULQ:
+        return (CCPrepare) { .cond = TCG_COND_NE,
+                             .reg = cpu_cc_src, .mask = -1 };
+
+    case CC_OP_BMILGB ... CC_OP_BMILGQ:
+        size = s->cc_op - CC_OP_BMILGB;
+        t0 = gen_ext_tl(reg, cpu_cc_src, size, false);
+        return (CCPrepare) { .cond = TCG_COND_EQ, .reg = t0, .mask = -1 };
+
+    case CC_OP_ADCX:
+    case CC_OP_ADCOX:
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_dst,
+                             .mask = -1, .no_setcond = true };
+
+    case CC_OP_EFLAGS:
+    case CC_OP_SARB ... CC_OP_SARQ:
+        /* CC_SRC & 1 */
+        return (CCPrepare) { .cond = TCG_COND_NE,
+                             .reg = cpu_cc_src, .mask = CC_C };
+
+    default:
+       /* The need to compute only C from CC_OP_DYNAMIC is important
+          in efficiently implementing e.g. INC at the start of a TB.  */
+       gen_update_cc_op(s);
+       gen_helper_cc_compute_c(reg, cpu_cc_dst, cpu_cc_src,
+                               cpu_cc_src2, cpu_cc_op);
+       return (CCPrepare) { .cond = TCG_COND_NE, .reg = reg,
+                            .mask = -1, .no_setcond = true };
     }
-    tcg_gen_extu_i32_tl(reg, cpu_tmp2_i32);
 }
 
 /* compute eflags.P to reg */
-static void gen_compute_eflags_p(DisasContext *s, TCGv reg)
+static CCPrepare gen_prepare_eflags_p(DisasContext *s, TCGv reg)
 {
-    gen_compute_eflags(s, reg);
-    tcg_gen_shri_tl(reg, reg, 2);
-    tcg_gen_andi_tl(reg, reg, 1);
+    gen_compute_eflags(s);
+    return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+                         .mask = CC_P };
 }
 
 /* compute eflags.S to reg */
-static void gen_compute_eflags_s(DisasContext *s, TCGv reg)
+static CCPrepare gen_prepare_eflags_s(DisasContext *s, TCGv reg)
 {
-    gen_compute_eflags(s, reg);
-    tcg_gen_shri_tl(reg, reg, 7);
-    tcg_gen_andi_tl(reg, reg, 1);
+    switch (s->cc_op) {
+    case CC_OP_DYNAMIC:
+        gen_compute_eflags(s);
+        /* FALLTHRU */
+    case CC_OP_EFLAGS:
+    case CC_OP_ADCX:
+    case CC_OP_ADOX:
+    case CC_OP_ADCOX:
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+                             .mask = CC_S };
+    case CC_OP_CLR:
+        return (CCPrepare) { .cond = TCG_COND_NEVER, .mask = -1 };
+    default:
+        {
+            int size = (s->cc_op - CC_OP_ADDB) & 3;
+            TCGv t0 = gen_ext_tl(reg, cpu_cc_dst, size, true);
+            return (CCPrepare) { .cond = TCG_COND_LT, .reg = t0, .mask = -1 };
+        }
+    }
 }
 
 /* compute eflags.O to reg */
-static void gen_compute_eflags_o(DisasContext *s, TCGv reg)
+static CCPrepare gen_prepare_eflags_o(DisasContext *s, TCGv reg)
 {
-    gen_compute_eflags(s, reg);
-    tcg_gen_shri_tl(reg, reg, 11);
-    tcg_gen_andi_tl(reg, reg, 1);
-}
-
-/* compute eflags.Z to reg */
-static void gen_compute_eflags_z(DisasContext *s, TCGv reg)
-{
-    gen_compute_eflags(s, reg);
-    tcg_gen_shri_tl(reg, reg, 6);
-    tcg_gen_andi_tl(reg, reg, 1);
-}
-
-static inline void gen_setcc_slow_T0(DisasContext *s, int jcc_op)
-{
-    switch(jcc_op) {
-    case JCC_O:
-        gen_compute_eflags_o(s, cpu_T[0]);
-        break;
-    case JCC_B:
-        gen_compute_eflags_c(s, cpu_T[0]);
-        break;
-    case JCC_Z:
-        gen_compute_eflags_z(s, cpu_T[0]);
-        break;
-    case JCC_BE:
-        gen_compute_eflags(s, cpu_tmp0);
-        tcg_gen_shri_tl(cpu_T[0], cpu_tmp0, 6);
-        tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
-        tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1);
-        break;
-    case JCC_S:
-        gen_compute_eflags_s(s, cpu_T[0]);
-        break;
-    case JCC_P:
-        gen_compute_eflags_p(s, cpu_T[0]);
-        break;
-    case JCC_L:
-        gen_compute_eflags(s, cpu_tmp0);
-        tcg_gen_shri_tl(cpu_T[0], cpu_tmp0, 11); /* CC_O */
-        tcg_gen_shri_tl(cpu_tmp0, cpu_tmp0, 7); /* CC_S */
-        tcg_gen_xor_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
-        tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1);
-        break;
+    switch (s->cc_op) {
+    case CC_OP_ADOX:
+    case CC_OP_ADCOX:
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src2,
+                             .mask = -1, .no_setcond = true };
+    case CC_OP_CLR:
+        return (CCPrepare) { .cond = TCG_COND_NEVER, .mask = -1 };
     default:
-    case JCC_LE:
-        gen_compute_eflags(s, cpu_tmp0);
-        tcg_gen_shri_tl(cpu_T[0], cpu_tmp0, 11); /* CC_O */
-        tcg_gen_shri_tl(cpu_tmp4, cpu_tmp0, 7); /* CC_S */
-        tcg_gen_shri_tl(cpu_tmp0, cpu_tmp0, 6); /* CC_Z */
-        tcg_gen_xor_tl(cpu_T[0], cpu_T[0], cpu_tmp4);
-        tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
-        tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 1);
-        break;
+        gen_compute_eflags(s);
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+                             .mask = CC_O };
     }
 }
 
-/* return true if setcc_slow is not needed (WARNING: must be kept in
-   sync with gen_jcc1) */
-static int is_fast_jcc_case(DisasContext *s, int b)
+/* compute eflags.Z to reg */
+static CCPrepare gen_prepare_eflags_z(DisasContext *s, TCGv reg)
 {
-    int jcc_op;
-    jcc_op = (b >> 1) & 7;
-    switch(s->cc_op) {
-        /* we optimize the cmp/jcc case */
-    case CC_OP_SUBB:
-    case CC_OP_SUBW:
-    case CC_OP_SUBL:
-    case CC_OP_SUBQ:
-        if (jcc_op == JCC_O || jcc_op == JCC_P)
-            goto slow_jcc;
-        break;
-
-        /* some jumps are easy to compute */
-    case CC_OP_ADDB:
-    case CC_OP_ADDW:
-    case CC_OP_ADDL:
-    case CC_OP_ADDQ:
-
-    case CC_OP_LOGICB:
-    case CC_OP_LOGICW:
-    case CC_OP_LOGICL:
-    case CC_OP_LOGICQ:
-
-    case CC_OP_INCB:
-    case CC_OP_INCW:
-    case CC_OP_INCL:
-    case CC_OP_INCQ:
-
-    case CC_OP_DECB:
-    case CC_OP_DECW:
-    case CC_OP_DECL:
-    case CC_OP_DECQ:
-
-    case CC_OP_SHLB:
-    case CC_OP_SHLW:
-    case CC_OP_SHLL:
-    case CC_OP_SHLQ:
-        if (jcc_op != JCC_Z && jcc_op != JCC_S)
-            goto slow_jcc;
-        break;
+    switch (s->cc_op) {
+    case CC_OP_DYNAMIC:
+        gen_compute_eflags(s);
+        /* FALLTHRU */
+    case CC_OP_EFLAGS:
+    case CC_OP_ADCX:
+    case CC_OP_ADOX:
+    case CC_OP_ADCOX:
+        return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+                             .mask = CC_Z };
+    case CC_OP_CLR:
+        return (CCPrepare) { .cond = TCG_COND_ALWAYS, .mask = -1 };
     default:
-    slow_jcc:
-        return 0;
+        {
+            int size = (s->cc_op - CC_OP_ADDB) & 3;
+            TCGv t0 = gen_ext_tl(reg, cpu_cc_dst, size, false);
+            return (CCPrepare) { .cond = TCG_COND_EQ, .reg = t0, .mask = -1 };
+        }
     }
-    return 1;
 }
 
-/* generate a conditional jump to label 'l1' according to jump opcode
+/* perform a conditional store into register 'reg' according to jump opcode
    value 'b'. In the fast case, T0 is guaranted not to be used. */
-static inline void gen_jcc1(DisasContext *s, int b, int l1)
+static CCPrepare gen_prepare_cc(DisasContext *s, int b, TCGv reg)
 {
     int inv, jcc_op, size, cond;
+    CCPrepare cc;
     TCGv t0;
 
     inv = b & 1;
     jcc_op = (b >> 1) & 7;
 
     switch (s->cc_op) {
-        /* we optimize the cmp/jcc case */
-    case CC_OP_SUBB:
-    case CC_OP_SUBW:
-    case CC_OP_SUBL:
-    case CC_OP_SUBQ:
-        
+    case CC_OP_SUBB ... CC_OP_SUBQ:
+        /* We optimize relational operators for the cmp/jcc case.  */
         size = s->cc_op - CC_OP_SUBB;
-        switch(jcc_op) {
-        case JCC_Z:
-        fast_jcc_z:
-            t0 = gen_ext_tl(cpu_tmp0, cpu_cc_dst, size, false);
-            tcg_gen_brcondi_tl(inv ? TCG_COND_NE : TCG_COND_EQ, t0, 0, l1);
-            break;
-        case JCC_S:
-        fast_jcc_s:
-            t0 = gen_ext_tl(cpu_tmp0, cpu_cc_dst, size, true);
-            tcg_gen_brcondi_tl(inv ? TCG_COND_GE : TCG_COND_LT, t0, 0, l1);
-            break;
-
-        case JCC_B:
-            cond = inv ? TCG_COND_GEU : TCG_COND_LTU;
-            goto fast_jcc_b;
+        switch (jcc_op) {
         case JCC_BE:
-            cond = inv ? TCG_COND_GTU : TCG_COND_LEU;
-        fast_jcc_b:
-            tcg_gen_add_tl(cpu_tmp4, cpu_cc_dst, cpu_cc_src);
+            tcg_gen_mov_tl(cpu_tmp4, cpu_cc_srcT);
             gen_extu(size, cpu_tmp4);
             t0 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false);
-            tcg_gen_brcond_tl(cond, cpu_tmp4, t0, l1);
+            cc = (CCPrepare) { .cond = TCG_COND_LEU, .reg = cpu_tmp4,
+                               .reg2 = t0, .mask = -1, .use_reg2 = true };
             break;
-            
+
         case JCC_L:
-            cond = inv ? TCG_COND_GE : TCG_COND_LT;
+            cond = TCG_COND_LT;
             goto fast_jcc_l;
         case JCC_LE:
-            cond = inv ? TCG_COND_GT : TCG_COND_LE;
+            cond = TCG_COND_LE;
         fast_jcc_l:
-            tcg_gen_add_tl(cpu_tmp4, cpu_cc_dst, cpu_cc_src);
+            tcg_gen_mov_tl(cpu_tmp4, cpu_cc_srcT);
             gen_exts(size, cpu_tmp4);
             t0 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, true);
-            tcg_gen_brcond_tl(cond, cpu_tmp4, t0, l1);
+            cc = (CCPrepare) { .cond = cond, .reg = cpu_tmp4,
+                               .reg2 = t0, .mask = -1, .use_reg2 = true };
             break;
-            
+
         default:
             goto slow_jcc;
         }
         break;
-        
-        /* some jumps are easy to compute */
-    case CC_OP_ADDB:
-    case CC_OP_ADDW:
-    case CC_OP_ADDL:
-    case CC_OP_ADDQ:
-        
-    case CC_OP_ADCB:
-    case CC_OP_ADCW:
-    case CC_OP_ADCL:
-    case CC_OP_ADCQ:
-        
-    case CC_OP_SBBB:
-    case CC_OP_SBBW:
-    case CC_OP_SBBL:
-    case CC_OP_SBBQ:
-        
-    case CC_OP_LOGICB:
-    case CC_OP_LOGICW:
-    case CC_OP_LOGICL:
-    case CC_OP_LOGICQ:
-        
-    case CC_OP_INCB:
-    case CC_OP_INCW:
-    case CC_OP_INCL:
-    case CC_OP_INCQ:
-        
-    case CC_OP_DECB:
-    case CC_OP_DECW:
-    case CC_OP_DECL:
-    case CC_OP_DECQ:
-        
-    case CC_OP_SHLB:
-    case CC_OP_SHLW:
-    case CC_OP_SHLL:
-    case CC_OP_SHLQ:
-        
-    case CC_OP_SARB:
-    case CC_OP_SARW:
-    case CC_OP_SARL:
-    case CC_OP_SARQ:
-        switch(jcc_op) {
+
+    default:
+    slow_jcc:
+        /* This actually generates good code for JC, JZ and JS.  */
+        switch (jcc_op) {
+        case JCC_O:
+            cc = gen_prepare_eflags_o(s, reg);
+            break;
+        case JCC_B:
+            cc = gen_prepare_eflags_c(s, reg);
+            break;
         case JCC_Z:
-            size = (s->cc_op - CC_OP_ADDB) & 3;
-            goto fast_jcc_z;
+            cc = gen_prepare_eflags_z(s, reg);
+            break;
+        case JCC_BE:
+            gen_compute_eflags(s);
+            cc = (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src,
+                               .mask = CC_Z | CC_C };
+            break;
         case JCC_S:
-            size = (s->cc_op - CC_OP_ADDB) & 3;
-            goto fast_jcc_s;
+            cc = gen_prepare_eflags_s(s, reg);
+            break;
+        case JCC_P:
+            cc = gen_prepare_eflags_p(s, reg);
+            break;
+        case JCC_L:
+            gen_compute_eflags(s);
+            if (TCGV_EQUAL(reg, cpu_cc_src)) {
+                reg = cpu_tmp0;
+            }
+            tcg_gen_shri_tl(reg, cpu_cc_src, 4); /* CC_O -> CC_S */
+            tcg_gen_xor_tl(reg, reg, cpu_cc_src);
+            cc = (CCPrepare) { .cond = TCG_COND_NE, .reg = reg,
+                               .mask = CC_S };
+            break;
         default:
-            goto slow_jcc;
+        case JCC_LE:
+            gen_compute_eflags(s);
+            if (TCGV_EQUAL(reg, cpu_cc_src)) {
+                reg = cpu_tmp0;
+            }
+            tcg_gen_shri_tl(reg, cpu_cc_src, 4); /* CC_O -> CC_S */
+            tcg_gen_xor_tl(reg, reg, cpu_cc_src);
+            cc = (CCPrepare) { .cond = TCG_COND_NE, .reg = reg,
+                               .mask = CC_S | CC_Z };
+            break;
         }
         break;
-    default:
-    slow_jcc:
-        gen_setcc_slow_T0(s, jcc_op);
-        tcg_gen_brcondi_tl(inv ? TCG_COND_EQ : TCG_COND_NE, 
-                           cpu_T[0], 0, l1);
-        break;
+    }
+
+    if (inv) {
+        cc.cond = tcg_invert_cond(cc.cond);
+    }
+    return cc;
+}
+
+static void gen_setcc1(DisasContext *s, int b, TCGv reg)
+{
+    CCPrepare cc = gen_prepare_cc(s, b, reg);
+
+    if (cc.no_setcond) {
+        if (cc.cond == TCG_COND_EQ) {
+            tcg_gen_xori_tl(reg, cc.reg, 1);
+        } else {
+            tcg_gen_mov_tl(reg, cc.reg);
+        }
+        return;
+    }
+
+    if (cc.cond == TCG_COND_NE && !cc.use_reg2 && cc.imm == 0 &&
+        cc.mask != 0 && (cc.mask & (cc.mask - 1)) == 0) {
+        tcg_gen_shri_tl(reg, cc.reg, ctztl(cc.mask));
+        tcg_gen_andi_tl(reg, reg, 1);
+        return;
+    }
+    if (cc.mask != -1) {
+        tcg_gen_andi_tl(reg, cc.reg, cc.mask);
+        cc.reg = reg;
+    }
+    if (cc.use_reg2) {
+        tcg_gen_setcond_tl(cc.cond, reg, cc.reg, cc.reg2);
+    } else {
+        tcg_gen_setcondi_tl(cc.cond, reg, cc.reg, cc.imm);
+    }
+}
+
+static inline void gen_compute_eflags_c(DisasContext *s, TCGv reg)
+{
+    gen_setcc1(s, JCC_B << 1, reg);
+}
+
+/* generate a conditional jump to label 'l1' according to jump opcode
+   value 'b'. In the fast case, T0 is guaranted not to be used. */
+static inline void gen_jcc1_noeob(DisasContext *s, int b, int l1)
+{
+    CCPrepare cc = gen_prepare_cc(s, b, cpu_T[0]);
+
+    if (cc.mask != -1) {
+        tcg_gen_andi_tl(cpu_T[0], cc.reg, cc.mask);
+        cc.reg = cpu_T[0];
+    }
+    if (cc.use_reg2) {
+        tcg_gen_brcond_tl(cc.cond, cc.reg, cc.reg2, l1);
+    } else {
+        tcg_gen_brcondi_tl(cc.cond, cc.reg, cc.imm, l1);
+    }
+}
+
+/* Generate a conditional jump to label 'l1' according to jump opcode
+   value 'b'. In the fast case, T0 is guaranted not to be used.
+   A translation block must end soon.  */
+static inline void gen_jcc1(DisasContext *s, int b, int l1)
+{
+    CCPrepare cc = gen_prepare_cc(s, b, cpu_T[0]);
+
+    gen_update_cc_op(s);
+    if (cc.mask != -1) {
+        tcg_gen_andi_tl(cpu_T[0], cc.reg, cc.mask);
+        cc.reg = cpu_T[0];
+    }
+    set_cc_op(s, CC_OP_DYNAMIC);
+    if (cc.use_reg2) {
+        tcg_gen_brcond_tl(cc.cond, cc.reg, cc.reg2, l1);
+    } else {
+        tcg_gen_brcondi_tl(cc.cond, cc.reg, cc.imm, l1);
     }
 }
 
@@ -1138,26 +1310,22 @@ static inline void gen_lods(DisasContext *s, int ot)
 
 static inline void gen_scas(DisasContext *s, int ot)
 {
-    gen_op_mov_TN_reg(OT_LONG, 0, R_EAX);
     gen_string_movl_A0_EDI(s);
     gen_op_ld_T1_A0(ot + s->mem_index);
-    gen_op_cmpl_T0_T1_cc();
+    gen_op(s, OP_CMPL, ot, R_EAX);
     gen_op_movl_T0_Dshift(ot);
     gen_op_add_reg_T0(s->aflag, R_EDI);
-    set_cc_op(s, CC_OP_SUBB + ot);
 }
 
 static inline void gen_cmps(DisasContext *s, int ot)
 {
-    gen_string_movl_A0_ESI(s);
-    gen_op_ld_T0_A0(ot + s->mem_index);
     gen_string_movl_A0_EDI(s);
     gen_op_ld_T1_A0(ot + s->mem_index);
-    gen_op_cmpl_T0_T1_cc();
+    gen_string_movl_A0_ESI(s);
+    gen_op(s, OP_CMPL, ot, OR_TMP0);
     gen_op_movl_T0_Dshift(ot);
     gen_op_add_reg_T0(s->aflag, R_ESI);
     gen_op_add_reg_T0(s->aflag, R_EDI);
-    set_cc_op(s, CC_OP_SUBB + ot);
 }
 
 static inline void gen_ins(DisasContext *s, int ot)
@@ -1233,7 +1401,6 @@ static inline void gen_repz_ ## op(DisasContext *s, int ot,                   \
     if (!s->jmp_opt)                                                          \
         gen_op_jz_ecx(s->aflag, l2);                                          \
     gen_jmp(s, cur_eip);                                                      \
-    set_cc_op(s, CC_OP_DYNAMIC);                                              \
 }
 
 GEN_REPZ(movs)
@@ -1317,12 +1484,8 @@ static void gen_op(DisasContext *s1, int op, int ot, int d)
             gen_op_mov_reg_T0(ot, d);
         else
             gen_op_st_T0_A0(ot + s1->mem_index);
-        tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]);
-        tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-        tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_tmp4);
-        tcg_gen_shli_i32(cpu_tmp2_i32, cpu_tmp2_i32, 2);
-        tcg_gen_addi_i32(cpu_cc_op, cpu_tmp2_i32, CC_OP_ADDB + ot);
-        set_cc_op(s1, CC_OP_DYNAMIC);
+        gen_op_update3_cc(cpu_tmp4);
+        set_cc_op(s1, CC_OP_ADCB + ot);
         break;
     case OP_SBBL:
         gen_compute_eflags_c(s1, cpu_tmp4);
@@ -1332,12 +1495,8 @@ static void gen_op(DisasContext *s1, int op, int ot, int d)
             gen_op_mov_reg_T0(ot, d);
         else
             gen_op_st_T0_A0(ot + s1->mem_index);
-        tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]);
-        tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-        tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_tmp4);
-        tcg_gen_shli_i32(cpu_tmp2_i32, cpu_tmp2_i32, 2);
-        tcg_gen_addi_i32(cpu_cc_op, cpu_tmp2_i32, CC_OP_SUBB + ot);
-        set_cc_op(s1, CC_OP_DYNAMIC);
+        gen_op_update3_cc(cpu_tmp4);
+        set_cc_op(s1, CC_OP_SBBB + ot);
         break;
     case OP_ADDL:
         gen_op_addl_T0_T1();
@@ -1349,6 +1508,7 @@ static void gen_op(DisasContext *s1, int op, int ot, int d)
         set_cc_op(s1, CC_OP_ADDB + ot);
         break;
     case OP_SUBL:
+        tcg_gen_mov_tl(cpu_cc_srcT, cpu_T[0]);
         tcg_gen_sub_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
         if (d != OR_TMP0)
             gen_op_mov_reg_T0(ot, d);
@@ -1386,7 +1546,9 @@ static void gen_op(DisasContext *s1, int op, int ot, int d)
         set_cc_op(s1, CC_OP_LOGICB + ot);
         break;
     case OP_CMPL:
-        gen_op_cmpl_T0_T1_cc();
+        tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]);
+        tcg_gen_mov_tl(cpu_cc_srcT, cpu_T[0]);
+        tcg_gen_sub_tl(cpu_cc_dst, cpu_T[0], cpu_T[1]);
         set_cc_op(s1, CC_OP_SUBB + ot);
         break;
     }
@@ -1414,18 +1576,55 @@ static void gen_inc(DisasContext *s1, int ot, int d, int c)
     tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
 }
 
-static void gen_shift_rm_T1(DisasContext *s, int ot, int op1, 
-                            int is_right, int is_arith)
+static void gen_shift_flags(DisasContext *s, int ot, TCGv result, TCGv shm1,
+                            TCGv count, bool is_right)
 {
-    target_ulong mask;
-    int shift_label;
-    TCGv t0, t1, t2;
+    TCGv_i32 z32, s32, oldop;
+    TCGv z_tl;
 
-    if (ot == OT_QUAD) {
-        mask = 0x3f;
+    /* Store the results into the CC variables.  If we know that the
+       variable must be dead, store unconditionally.  Otherwise we'll
+       need to not disrupt the current contents.  */
+    z_tl = tcg_const_tl(0);
+    if (cc_op_live[s->cc_op] & USES_CC_DST) {
+        tcg_gen_movcond_tl(TCG_COND_NE, cpu_cc_dst, count, z_tl,
+                           result, cpu_cc_dst);
+    } else {
+        tcg_gen_mov_tl(cpu_cc_dst, result);
+    }
+    if (cc_op_live[s->cc_op] & USES_CC_SRC) {
+        tcg_gen_movcond_tl(TCG_COND_NE, cpu_cc_src, count, z_tl,
+                           shm1, cpu_cc_src);
     } else {
-        mask = 0x1f;
+        tcg_gen_mov_tl(cpu_cc_src, shm1);
     }
+    tcg_temp_free(z_tl);
+
+    /* Get the two potential CC_OP values into temporaries.  */
+    tcg_gen_movi_i32(cpu_tmp2_i32, (is_right ? CC_OP_SARB : CC_OP_SHLB) + ot);
+    if (s->cc_op == CC_OP_DYNAMIC) {
+        oldop = cpu_cc_op;
+    } else {
+        tcg_gen_movi_i32(cpu_tmp3_i32, s->cc_op);
+        oldop = cpu_tmp3_i32;
+    }
+
+    /* Conditionally store the CC_OP value.  */
+    z32 = tcg_const_i32(0);
+    s32 = tcg_temp_new_i32();
+    tcg_gen_trunc_tl_i32(s32, count);
+    tcg_gen_movcond_i32(TCG_COND_NE, cpu_cc_op, s32, z32, cpu_tmp2_i32, oldop);
+    tcg_temp_free_i32(z32);
+    tcg_temp_free_i32(s32);
+
+    /* The CC_OP value is no longer predictable.  */
+    set_cc_op(s, CC_OP_DYNAMIC);
+}
+
+static void gen_shift_rm_T1(DisasContext *s, int ot, int op1, 
+                            int is_right, int is_arith)
+{
+    target_ulong mask = (ot == OT_QUAD ? 0x3f : 0x1f);
 
     /* load */
     if (op1 == OR_TMP0) {
@@ -1434,25 +1633,22 @@ static void gen_shift_rm_T1(DisasContext *s, int ot, int op1,
         gen_op_mov_TN_reg(ot, 0, op1);
     }
 
-    t0 = tcg_temp_local_new();
-    t1 = tcg_temp_local_new();
-    t2 = tcg_temp_local_new();
-
-    tcg_gen_andi_tl(t2, cpu_T[1], mask);
+    tcg_gen_andi_tl(cpu_T[1], cpu_T[1], mask);
+    tcg_gen_subi_tl(cpu_tmp0, cpu_T[1], 1);
 
     if (is_right) {
         if (is_arith) {
             gen_exts(ot, cpu_T[0]);
-            tcg_gen_mov_tl(t0, cpu_T[0]);
-            tcg_gen_sar_tl(cpu_T[0], cpu_T[0], t2);
+            tcg_gen_sar_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
+            tcg_gen_sar_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
         } else {
             gen_extu(ot, cpu_T[0]);
-            tcg_gen_mov_tl(t0, cpu_T[0]);
-            tcg_gen_shr_tl(cpu_T[0], cpu_T[0], t2);
+            tcg_gen_shr_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
+            tcg_gen_shr_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
         }
     } else {
-        tcg_gen_mov_tl(t0, cpu_T[0]);
-        tcg_gen_shl_tl(cpu_T[0], cpu_T[0], t2);
+        tcg_gen_shl_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
+        tcg_gen_shl_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
     }
 
     /* store */
@@ -1462,50 +1658,13 @@ static void gen_shift_rm_T1(DisasContext *s, int ot, int op1,
         gen_op_mov_reg_T0(ot, op1);
     }
 
-    /* update eflags */
-    gen_update_cc_op(s);
-
-    tcg_gen_mov_tl(t1, cpu_T[0]);
-
-    shift_label = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, shift_label);
-
-    tcg_gen_addi_tl(t2, t2, -1);
-    tcg_gen_mov_tl(cpu_cc_dst, t1);
-
-    if (is_right) {
-        if (is_arith) {
-            tcg_gen_sar_tl(cpu_cc_src, t0, t2);
-        } else {
-            tcg_gen_shr_tl(cpu_cc_src, t0, t2);
-        }
-    } else {
-        tcg_gen_shl_tl(cpu_cc_src, t0, t2);
-    }
-
-    if (is_right) {
-        tcg_gen_movi_i32(cpu_cc_op, CC_OP_SARB + ot);
-    } else {
-        tcg_gen_movi_i32(cpu_cc_op, CC_OP_SHLB + ot);
-    }
-
-    gen_set_label(shift_label);
-    set_cc_op(s, CC_OP_DYNAMIC); /* cannot predict flags after */
-
-    tcg_temp_free(t0);
-    tcg_temp_free(t1);
-    tcg_temp_free(t2);
+    gen_shift_flags(s, ot, cpu_T[0], cpu_tmp0, cpu_T[1], is_right);
 }
 
 static void gen_shift_rm_im(DisasContext *s, int ot, int op1, int op2,
                             int is_right, int is_arith)
 {
-    int mask;
-    
-    if (ot == OT_QUAD)
-        mask = 0x3f;
-    else
-        mask = 0x1f;
+    int mask = (ot == OT_QUAD ? 0x3f : 0x1f);
 
     /* load */
     if (op1 == OR_TMP0)
@@ -1553,175 +1712,181 @@ static inline void tcg_gen_lshift(TCGv ret, TCGv arg1, target_long arg2)
         tcg_gen_shri_tl(ret, arg1, -arg2);
 }
 
-static void gen_rot_rm_T1(DisasContext *s, int ot, int op1, 
-                          int is_right)
+static void gen_rot_rm_T1(DisasContext *s, int ot, int op1, int is_right)
 {
-    target_ulong mask;
-    int label1, label2, data_bits;
-    TCGv t0, t1, t2, a0;
-
-    /* XXX: inefficient, but we must use local temps */
-    t0 = tcg_temp_local_new();
-    t1 = tcg_temp_local_new();
-    t2 = tcg_temp_local_new();
-    a0 = tcg_temp_local_new();
-
-    if (ot == OT_QUAD)
-        mask = 0x3f;
-    else
-        mask = 0x1f;
+    target_ulong mask = (ot == OT_QUAD ? 0x3f : 0x1f);
+    TCGv_i32 t0, t1;
 
     /* load */
     if (op1 == OR_TMP0) {
-        tcg_gen_mov_tl(a0, cpu_A0);
-        gen_op_ld_v(ot + s->mem_index, t0, a0);
+        gen_op_ld_T0_A0(ot + s->mem_index);
     } else {
-        gen_op_mov_v_reg(ot, t0, op1);
+        gen_op_mov_TN_reg(ot, 0, op1);
     }
 
-    tcg_gen_mov_tl(t1, cpu_T[1]);
+    tcg_gen_andi_tl(cpu_T[1], cpu_T[1], mask);
 
-    tcg_gen_andi_tl(t1, t1, mask);
-
-    /* Must test zero case to avoid using undefined behaviour in TCG
-       shifts. */
-    label1 = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_EQ, t1, 0, label1);
-    
-    if (ot <= OT_WORD)
-        tcg_gen_andi_tl(cpu_tmp0, t1, (1 << (3 + ot)) - 1);
-    else
-        tcg_gen_mov_tl(cpu_tmp0, t1);
-    
-    gen_extu(ot, t0);
-    tcg_gen_mov_tl(t2, t0);
-
-    data_bits = 8 << ot;
-    /* XXX: rely on behaviour of shifts when operand 2 overflows (XXX:
-       fix TCG definition) */
-    if (is_right) {
-        tcg_gen_shr_tl(cpu_tmp4, t0, cpu_tmp0);
-        tcg_gen_subfi_tl(cpu_tmp0, data_bits, cpu_tmp0);
-        tcg_gen_shl_tl(t0, t0, cpu_tmp0);
-    } else {
-        tcg_gen_shl_tl(cpu_tmp4, t0, cpu_tmp0);
-        tcg_gen_subfi_tl(cpu_tmp0, data_bits, cpu_tmp0);
-        tcg_gen_shr_tl(t0, t0, cpu_tmp0);
+    switch (ot) {
+    case OT_BYTE:
+        /* Replicate the 8-bit input so that a 32-bit rotate works.  */
+        tcg_gen_ext8u_tl(cpu_T[0], cpu_T[0]);
+        tcg_gen_muli_tl(cpu_T[0], cpu_T[0], 0x01010101);
+        goto do_long;
+    case OT_WORD:
+        /* Replicate the 16-bit input so that a 32-bit rotate works.  */
+        tcg_gen_deposit_tl(cpu_T[0], cpu_T[0], cpu_T[0], 16, 16);
+        goto do_long;
+    do_long:
+#ifdef TARGET_X86_64
+    case OT_LONG:
+        tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+        tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_T[1]);
+        if (is_right) {
+            tcg_gen_rotr_i32(cpu_tmp2_i32, cpu_tmp2_i32, cpu_tmp3_i32);
+        } else {
+            tcg_gen_rotl_i32(cpu_tmp2_i32, cpu_tmp2_i32, cpu_tmp3_i32);
+        }
+        tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
+        break;
+#endif
+    default:
+        if (is_right) {
+            tcg_gen_rotr_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+        } else {
+            tcg_gen_rotl_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+        }
+        break;
     }
-    tcg_gen_or_tl(t0, t0, cpu_tmp4);
 
-    gen_set_label(label1);
     /* store */
     if (op1 == OR_TMP0) {
-        gen_op_st_v(ot + s->mem_index, t0, a0);
+        gen_op_st_T0_A0(ot + s->mem_index);
     } else {
-        gen_op_mov_reg_v(ot, op1, t0);
+        gen_op_mov_reg_T0(ot, op1);
     }
-    
-    /* update eflags.  It is needed anyway most of the time, do it always.  */
-    gen_compute_eflags(s, cpu_cc_src);
-    assert(s->cc_op == CC_OP_EFLAGS);
 
-    label2 = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_EQ, t1, 0, label2);
+    /* We'll need the flags computed into CC_SRC.  */
+    gen_compute_eflags(s);
 
-    tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~(CC_O | CC_C));
-    tcg_gen_xor_tl(cpu_tmp0, t2, t0);
-    tcg_gen_lshift(cpu_tmp0, cpu_tmp0, 11 - (data_bits - 1));
-    tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, CC_O);
-    tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, cpu_tmp0);
+    /* The value that was "rotated out" is now present at the other end
+       of the word.  Compute C into CC_DST and O into CC_SRC2.  Note that
+       since we've computed the flags into CC_SRC, these variables are
+       currently dead.  */
     if (is_right) {
-        tcg_gen_shri_tl(t0, t0, data_bits - 1);
+        tcg_gen_shri_tl(cpu_cc_src2, cpu_T[0], mask - 1);
+        tcg_gen_shri_tl(cpu_cc_dst, cpu_T[0], mask);
+        tcg_gen_andi_tl(cpu_cc_dst, cpu_cc_dst, 1);
+    } else {
+        tcg_gen_shri_tl(cpu_cc_src2, cpu_T[0], mask);
+        tcg_gen_andi_tl(cpu_cc_dst, cpu_T[0], 1);
     }
-    tcg_gen_andi_tl(t0, t0, CC_C);
-    tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, t0);
-
-    gen_set_label(label2);
-
-    tcg_temp_free(t0);
-    tcg_temp_free(t1);
-    tcg_temp_free(t2);
-    tcg_temp_free(a0);
+    tcg_gen_andi_tl(cpu_cc_src2, cpu_cc_src2, 1);
+    tcg_gen_xor_tl(cpu_cc_src2, cpu_cc_src2, cpu_cc_dst);
+
+    /* Now conditionally store the new CC_OP value.  If the shift count
+       is 0 we keep the CC_OP_EFLAGS setting so that only CC_SRC is live.
+       Otherwise reuse CC_OP_ADCOX which have the C and O flags split out
+       exactly as we computed above.  */
+    t0 = tcg_const_i32(0);
+    t1 = tcg_temp_new_i32();
+    tcg_gen_trunc_tl_i32(t1, cpu_T[1]);
+    tcg_gen_movi_i32(cpu_tmp2_i32, CC_OP_ADCOX); 
+    tcg_gen_movi_i32(cpu_tmp3_i32, CC_OP_EFLAGS);
+    tcg_gen_movcond_i32(TCG_COND_NE, cpu_cc_op, t1, t0,
+                        cpu_tmp2_i32, cpu_tmp3_i32);
+    tcg_temp_free_i32(t0);
+    tcg_temp_free_i32(t1);
+
+    /* The CC_OP value is no longer predictable.  */ 
+    set_cc_op(s, CC_OP_DYNAMIC);
 }
 
 static void gen_rot_rm_im(DisasContext *s, int ot, int op1, int op2,
                           int is_right)
 {
-    int mask;
-    int data_bits;
-    TCGv t0, t1, a0;
-
-    /* XXX: inefficient, but we must use local temps */
-    t0 = tcg_temp_local_new();
-    t1 = tcg_temp_local_new();
-    a0 = tcg_temp_local_new();
-
-    if (ot == OT_QUAD)
-        mask = 0x3f;
-    else
-        mask = 0x1f;
+    int mask = (ot == OT_QUAD ? 0x3f : 0x1f);
+    int shift;
 
     /* load */
     if (op1 == OR_TMP0) {
-        tcg_gen_mov_tl(a0, cpu_A0);
-        gen_op_ld_v(ot + s->mem_index, t0, a0);
+        gen_op_ld_T0_A0(ot + s->mem_index);
     } else {
-        gen_op_mov_v_reg(ot, t0, op1);
+        gen_op_mov_TN_reg(ot, 0, op1);
     }
 
-    gen_extu(ot, t0);
-    tcg_gen_mov_tl(t1, t0);
-
     op2 &= mask;
-    data_bits = 8 << ot;
     if (op2 != 0) {
-        int shift = op2 & ((1 << (3 + ot)) - 1);
-        if (is_right) {
-            tcg_gen_shri_tl(cpu_tmp4, t0, shift);
-            tcg_gen_shli_tl(t0, t0, data_bits - shift);
-        }
-        else {
-            tcg_gen_shli_tl(cpu_tmp4, t0, shift);
-            tcg_gen_shri_tl(t0, t0, data_bits - shift);
+        switch (ot) {
+#ifdef TARGET_X86_64
+        case OT_LONG:
+            tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+            if (is_right) {
+                tcg_gen_rotri_i32(cpu_tmp2_i32, cpu_tmp2_i32, op2);
+            } else {
+                tcg_gen_rotli_i32(cpu_tmp2_i32, cpu_tmp2_i32, op2);
+            }
+            tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
+            break;
+#endif
+        default:
+            if (is_right) {
+                tcg_gen_rotri_tl(cpu_T[0], cpu_T[0], op2);
+            } else {
+                tcg_gen_rotli_tl(cpu_T[0], cpu_T[0], op2);
+            }
+            break;
+        case OT_BYTE:
+            mask = 7;
+            goto do_shifts;
+        case OT_WORD:
+            mask = 15;
+        do_shifts:
+            shift = op2 & mask;
+            if (is_right) {
+                shift = mask + 1 - shift;
+            }
+            gen_extu(ot, cpu_T[0]);
+            tcg_gen_shli_tl(cpu_tmp0, cpu_T[0], shift);
+            tcg_gen_shri_tl(cpu_T[0], cpu_T[0], mask + 1 - shift);
+            tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_tmp0);
+            break;
         }
-        tcg_gen_or_tl(t0, t0, cpu_tmp4);
     }
 
     /* store */
     if (op1 == OR_TMP0) {
-        gen_op_st_v(ot + s->mem_index, t0, a0);
+        gen_op_st_T0_A0(ot + s->mem_index);
     } else {
-        gen_op_mov_reg_v(ot, op1, t0);
+        gen_op_mov_reg_T0(ot, op1);
     }
 
     if (op2 != 0) {
-        /* update eflags */
-        gen_compute_eflags(s, cpu_cc_src);
-        assert(s->cc_op == CC_OP_EFLAGS);
-
-        tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~(CC_O | CC_C));
-        tcg_gen_xor_tl(cpu_tmp0, t1, t0);
-        tcg_gen_lshift(cpu_tmp0, cpu_tmp0, 11 - (data_bits - 1));
-        tcg_gen_andi_tl(cpu_tmp0, cpu_tmp0, CC_O);
-        tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, cpu_tmp0);
+        /* Compute the flags into CC_SRC.  */
+        gen_compute_eflags(s);
+
+        /* The value that was "rotated out" is now present at the other end
+           of the word.  Compute C into CC_DST and O into CC_SRC2.  Note that
+           since we've computed the flags into CC_SRC, these variables are
+           currently dead.  */
         if (is_right) {
-            tcg_gen_shri_tl(t0, t0, data_bits - 1);
+            tcg_gen_shri_tl(cpu_cc_src2, cpu_T[0], mask - 1);
+            tcg_gen_shri_tl(cpu_cc_dst, cpu_T[0], mask);
+            tcg_gen_andi_tl(cpu_cc_dst, cpu_cc_dst, 1);
+        } else {
+            tcg_gen_shri_tl(cpu_cc_src2, cpu_T[0], mask);
+            tcg_gen_andi_tl(cpu_cc_dst, cpu_T[0], 1);
         }
-        tcg_gen_andi_tl(t0, t0, CC_C);
-        tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, t0);
+        tcg_gen_andi_tl(cpu_cc_src2, cpu_cc_src2, 1);
+        tcg_gen_xor_tl(cpu_cc_src2, cpu_cc_src2, cpu_cc_dst);
+        set_cc_op(s, CC_OP_ADCOX);
     }
-
-    tcg_temp_free(t0);
-    tcg_temp_free(t1);
-    tcg_temp_free(a0);
 }
 
 /* XXX: add faster immediate = 1 case */
 static void gen_rotc_rm_T1(DisasContext *s, int ot, int op1, 
                            int is_right)
 {
-    gen_update_cc_op(s);
-    gen_compute_eflags(s, cpu_cc_src);
+    gen_compute_eflags(s);
     assert(s->cc_op == CC_OP_EFLAGS);
 
     /* load */
@@ -1773,131 +1938,89 @@ static void gen_rotc_rm_T1(DisasContext *s, int ot, int op1,
 }
 
 /* XXX: add faster immediate case */
-static void gen_shiftd_rm_T1_T3(DisasContext *s, int ot, int op1, 
-                                int is_right)
+static void gen_shiftd_rm_T1(DisasContext *s, int ot, int op1,
+                             bool is_right, TCGv count_in)
 {
-    int label1, label2, data_bits;
-    target_ulong mask;
-    TCGv t0, t1, t2, a0;
-
-    t0 = tcg_temp_local_new();
-    t1 = tcg_temp_local_new();
-    t2 = tcg_temp_local_new();
-    a0 = tcg_temp_local_new();
-
-    if (ot == OT_QUAD)
-        mask = 0x3f;
-    else
-        mask = 0x1f;
+    target_ulong mask = (ot == OT_QUAD ? 63 : 31);
+    TCGv count;
 
     /* load */
     if (op1 == OR_TMP0) {
-        tcg_gen_mov_tl(a0, cpu_A0);
-        gen_op_ld_v(ot + s->mem_index, t0, a0);
+        gen_op_ld_T0_A0(ot + s->mem_index);
     } else {
-        gen_op_mov_v_reg(ot, t0, op1);
+        gen_op_mov_TN_reg(ot, 0, op1);
     }
 
-    tcg_gen_andi_tl(cpu_T3, cpu_T3, mask);
+    count = tcg_temp_new();
+    tcg_gen_andi_tl(count, count_in, mask);
 
-    tcg_gen_mov_tl(t1, cpu_T[1]);
-    tcg_gen_mov_tl(t2, cpu_T3);
-
-    /* Must test zero case to avoid using undefined behaviour in TCG
-       shifts. */
-    label1 = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, label1);
-    
-    tcg_gen_addi_tl(cpu_tmp5, t2, -1);
-    if (ot == OT_WORD) {
-        /* Note: we implement the Intel behaviour for shift count > 16 */
+    switch (ot) {
+    case OT_WORD:
+        /* Note: we implement the Intel behaviour for shift count > 16.
+           This means "shrdw C, B, A" shifts A:B:A >> C.  Build the B:A
+           portion by constructing it as a 32-bit value.  */
         if (is_right) {
-            tcg_gen_andi_tl(t0, t0, 0xffff);
-            tcg_gen_shli_tl(cpu_tmp0, t1, 16);
-            tcg_gen_or_tl(t0, t0, cpu_tmp0);
-            tcg_gen_ext32u_tl(t0, t0);
-
-            tcg_gen_shr_tl(cpu_tmp4, t0, cpu_tmp5);
-            
-            /* only needed if count > 16, but a test would complicate */
-            tcg_gen_subfi_tl(cpu_tmp5, 32, t2);
-            tcg_gen_shl_tl(cpu_tmp0, t0, cpu_tmp5);
-
-            tcg_gen_shr_tl(t0, t0, t2);
-
-            tcg_gen_or_tl(t0, t0, cpu_tmp0);
+            tcg_gen_deposit_tl(cpu_tmp0, cpu_T[0], cpu_T[1], 16, 16);
+            tcg_gen_mov_tl(cpu_T[1], cpu_T[0]);
+            tcg_gen_mov_tl(cpu_T[0], cpu_tmp0);
         } else {
-            /* XXX: not optimal */
-            tcg_gen_andi_tl(t0, t0, 0xffff);
-            tcg_gen_shli_tl(t1, t1, 16);
-            tcg_gen_or_tl(t1, t1, t0);
-            tcg_gen_ext32u_tl(t1, t1);
-            
-            tcg_gen_shl_tl(cpu_tmp4, t0, cpu_tmp5);
-            tcg_gen_subfi_tl(cpu_tmp0, 32, cpu_tmp5);
-            tcg_gen_shr_tl(cpu_tmp5, t1, cpu_tmp0);
-            tcg_gen_or_tl(cpu_tmp4, cpu_tmp4, cpu_tmp5);
-
-            tcg_gen_shl_tl(t0, t0, t2);
-            tcg_gen_subfi_tl(cpu_tmp5, 32, t2);
-            tcg_gen_shr_tl(t1, t1, cpu_tmp5);
-            tcg_gen_or_tl(t0, t0, t1);
+            tcg_gen_deposit_tl(cpu_T[1], cpu_T[0], cpu_T[1], 16, 16);
         }
-    } else {
-        data_bits = 8 << ot;
+        /* FALLTHRU */
+#ifdef TARGET_X86_64
+    case OT_LONG:
+        /* Concatenate the two 32-bit values and use a 64-bit shift.  */
+        tcg_gen_subi_tl(cpu_tmp0, count, 1);
         if (is_right) {
-            if (ot == OT_LONG)
-                tcg_gen_ext32u_tl(t0, t0);
-
-            tcg_gen_shr_tl(cpu_tmp4, t0, cpu_tmp5);
+            tcg_gen_concat_tl_i64(cpu_T[0], cpu_T[0], cpu_T[1]);
+            tcg_gen_shr_i64(cpu_tmp0, cpu_T[0], cpu_tmp0);
+            tcg_gen_shr_i64(cpu_T[0], cpu_T[0], count);
+        } else {
+            tcg_gen_concat_tl_i64(cpu_T[0], cpu_T[1], cpu_T[0]);
+            tcg_gen_shl_i64(cpu_tmp0, cpu_T[0], cpu_tmp0);
+            tcg_gen_shl_i64(cpu_T[0], cpu_T[0], count);
+            tcg_gen_shri_i64(cpu_tmp0, cpu_tmp0, 32);
+            tcg_gen_shri_i64(cpu_T[0], cpu_T[0], 32);
+        }
+        break;
+#endif
+    default:
+        tcg_gen_subi_tl(cpu_tmp0, count, 1);
+        if (is_right) {
+            tcg_gen_shr_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
 
-            tcg_gen_shr_tl(t0, t0, t2);
-            tcg_gen_subfi_tl(cpu_tmp5, data_bits, t2);
-            tcg_gen_shl_tl(t1, t1, cpu_tmp5);
-            tcg_gen_or_tl(t0, t0, t1);
-            
+            tcg_gen_subfi_tl(cpu_tmp4, mask + 1, count);
+            tcg_gen_shr_tl(cpu_T[0], cpu_T[0], count);
+            tcg_gen_shl_tl(cpu_T[1], cpu_T[1], cpu_tmp4);
         } else {
-            if (ot == OT_LONG)
-                tcg_gen_ext32u_tl(t1, t1);
-
-            tcg_gen_shl_tl(cpu_tmp4, t0, cpu_tmp5);
-            
-            tcg_gen_shl_tl(t0, t0, t2);
-            tcg_gen_subfi_tl(cpu_tmp5, data_bits, t2);
-            tcg_gen_shr_tl(t1, t1, cpu_tmp5);
-            tcg_gen_or_tl(t0, t0, t1);
+            tcg_gen_shl_tl(cpu_tmp0, cpu_T[0], cpu_tmp0);
+            if (ot == OT_WORD) {
+                /* Only needed if count > 16, for Intel behaviour.  */
+                tcg_gen_subfi_tl(cpu_tmp4, 33, count);
+                tcg_gen_shr_tl(cpu_tmp4, cpu_T[1], cpu_tmp4);
+                tcg_gen_or_tl(cpu_tmp0, cpu_tmp0, cpu_tmp4);
+            }
+
+            tcg_gen_subfi_tl(cpu_tmp4, mask + 1, count);
+            tcg_gen_shl_tl(cpu_T[0], cpu_T[0], count);
+            tcg_gen_shr_tl(cpu_T[1], cpu_T[1], cpu_tmp4);
         }
+        tcg_gen_movi_tl(cpu_tmp4, 0);
+        tcg_gen_movcond_tl(TCG_COND_EQ, cpu_T[1], count, cpu_tmp4,
+                           cpu_tmp4, cpu_T[1]);
+        tcg_gen_or_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+        break;
     }
-    tcg_gen_mov_tl(t1, cpu_tmp4);
 
-    gen_set_label(label1);
     /* store */
     if (op1 == OR_TMP0) {
-        gen_op_st_v(ot + s->mem_index, t0, a0);
-    } else {
-        gen_op_mov_reg_v(ot, op1, t0);
-    }
-    
-    /* update eflags */
-    gen_update_cc_op(s);
-
-    label2 = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, label2);
-
-    tcg_gen_mov_tl(cpu_cc_src, t1);
-    tcg_gen_mov_tl(cpu_cc_dst, t0);
-    if (is_right) {
-        tcg_gen_movi_i32(cpu_cc_op, CC_OP_SARB + ot);
+        gen_op_st_T0_A0(ot + s->mem_index);
     } else {
-        tcg_gen_movi_i32(cpu_cc_op, CC_OP_SHLB + ot);
+        gen_op_mov_reg_T0(ot, op1);
     }
-    gen_set_label(label2);
-    set_cc_op(s, CC_OP_DYNAMIC); /* cannot predict flags after */
 
-    tcg_temp_free(t0);
-    tcg_temp_free(t1);
-    tcg_temp_free(t2);
-    tcg_temp_free(a0);
+    gen_shift_flags(s, ot, cpu_T[0], cpu_tmp0, count, is_right);
+    tcg_temp_free(count);
 }
 
 static void gen_shift(DisasContext *s1, int op, int ot, int d, int s)
@@ -2303,23 +2426,19 @@ static inline void gen_jcc(DisasContext *s, int b,
 {
     int l1, l2;
 
-    gen_update_cc_op(s);
     if (s->jmp_opt) {
         l1 = gen_new_label();
         gen_jcc1(s, b, l1);
-        set_cc_op(s, CC_OP_DYNAMIC);
-        
+
         gen_goto_tb(s, 0, next_eip);
 
         gen_set_label(l1);
         gen_goto_tb(s, 1, val);
         s->is_jmp = DISAS_TB_JUMP;
     } else {
-
         l1 = gen_new_label();
         l2 = gen_new_label();
         gen_jcc1(s, b, l1);
-        set_cc_op(s, CC_OP_DYNAMIC);
 
         gen_jmp_im(next_eip);
         tcg_gen_br(l2);
@@ -2331,32 +2450,32 @@ static inline void gen_jcc(DisasContext *s, int b,
     }
 }
 
-static void gen_setcc(DisasContext *s, int b)
+static void gen_cmovcc1(CPUX86State *env, DisasContext *s, int ot, int b,
+                        int modrm, int reg)
 {
-    int inv, jcc_op, l1;
-    TCGv t0;
+    CCPrepare cc;
 
-    if (is_fast_jcc_case(s, b)) {
-        /* nominal case: we use a jump */
-        /* XXX: make it faster by adding new instructions in TCG */
-        t0 = tcg_temp_local_new();
-        tcg_gen_movi_tl(t0, 0);
-        l1 = gen_new_label();
-        gen_jcc1(s, b ^ 1, l1);
-        tcg_gen_movi_tl(t0, 1);
-        gen_set_label(l1);
-        tcg_gen_mov_tl(cpu_T[0], t0);
-        tcg_temp_free(t0);
-    } else {
-        /* slow case: it is more efficient not to generate a jump,
-           although it is questionnable whether this optimization is
-           worth to */
-        inv = b & 1;
-        jcc_op = (b >> 1) & 7;
-        gen_setcc_slow_T0(s, jcc_op);
-        if (inv) {
-            tcg_gen_xori_tl(cpu_T[0], cpu_T[0], 1);
-        }
+    gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+
+    cc = gen_prepare_cc(s, b, cpu_T[1]);
+    if (cc.mask != -1) {
+        TCGv t0 = tcg_temp_new();
+        tcg_gen_andi_tl(t0, cc.reg, cc.mask);
+        cc.reg = t0;
+    }
+    if (!cc.use_reg2) {
+        cc.reg2 = tcg_const_tl(cc.imm);
+    }
+
+    tcg_gen_movcond_tl(cc.cond, cpu_T[0], cc.reg, cc.reg2,
+                       cpu_T[0], cpu_regs[reg]);
+    gen_op_mov_reg_T0(ot, reg);
+
+    if (cc.mask != -1) {
+        tcg_temp_free(cc.reg);
+    }
+    if (!cc.use_reg2) {
+        tcg_temp_free(cc.reg2);
     }
 }
 
@@ -2709,8 +2828,9 @@ static void gen_eob(DisasContext *s)
    direct call to the next block may occur */
 static void gen_jmp_tb(DisasContext *s, target_ulong eip, int tb_num)
 {
+    gen_update_cc_op(s);
+    set_cc_op(s, CC_OP_DYNAMIC);
     if (s->jmp_opt) {
-        gen_update_cc_op(s);
         gen_goto_tb(s, tb_num, eip);
         s->is_jmp = DISAS_TB_JUMP;
     } else {
@@ -2846,8 +2966,9 @@ static const SSEFunc_0_epp sse_op_table1[256][4] = {
     [0xc6] = { (SSEFunc_0_epp)gen_helper_shufps,
                (SSEFunc_0_epp)gen_helper_shufpd }, /* XXX: casts */
 
-    [0x38] = { SSE_SPECIAL, SSE_SPECIAL, NULL, SSE_SPECIAL }, /* SSSE3/SSE4 */
-    [0x3a] = { SSE_SPECIAL, SSE_SPECIAL }, /* SSSE3/SSE4 */
+    /* SSSE3, SSE4, MOVBE, CRC32, BMI1, BMI2, ADX.  */
+    [0x38] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL },
+    [0x3a] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL },
 
     /* MMX ops and their SSE extensions */
     [0x60] = MMX_OP2(punpcklbw),
@@ -3028,6 +3149,9 @@ struct SSEOpHelper_eppi {
 #define SSE41_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_SSE41 }
 #define SSE42_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_SSE42 }
 #define SSE41_SPECIAL { { NULL, SSE_SPECIAL }, CPUID_EXT_SSE41 }
+#define PCLMULQDQ_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, \
+        CPUID_EXT_PCLMULQDQ }
+#define AESNI_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_AES }
 
 static const struct SSEOpHelper_epp sse_op_table6[256] = {
     [0x00] = SSSE3_OP(pshufb),
@@ -3076,6 +3200,11 @@ static const struct SSEOpHelper_epp sse_op_table6[256] = {
     [0x3f] = SSE41_OP(pmaxud),
     [0x40] = SSE41_OP(pmulld),
     [0x41] = SSE41_OP(phminposuw),
+    [0xdb] = AESNI_OP(aesimc),
+    [0xdc] = AESNI_OP(aesenc),
+    [0xdd] = AESNI_OP(aesenclast),
+    [0xde] = AESNI_OP(aesdec),
+    [0xdf] = AESNI_OP(aesdeclast),
 };
 
 static const struct SSEOpHelper_eppi sse_op_table7[256] = {
@@ -3097,10 +3226,12 @@ static const struct SSEOpHelper_eppi sse_op_table7[256] = {
     [0x40] = SSE41_OP(dpps),
     [0x41] = SSE41_OP(dppd),
     [0x42] = SSE41_OP(mpsadbw),
+    [0x44] = PCLMULQDQ_OP(pclmulqdq),
     [0x60] = SSE42_OP(pcmpestrm),
     [0x61] = SSE42_OP(pcmpestri),
     [0x62] = SSE42_OP(pcmpistrm),
     [0x63] = SSE42_OP(pcmpistri),
+    [0xdf] = AESNI_OP(aeskeygenassist),
 };
 
 static void gen_sse(CPUX86State *env, DisasContext *s, int b,
@@ -3728,11 +3859,13 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
             reg = ((modrm >> 3) & 7) | rex_r;
             gen_op_mov_reg_T0(OT_LONG, reg);
             break;
+
         case 0x138:
-            if (s->prefix & PREFIX_REPNZ)
-                goto crc32;
         case 0x038:
             b = modrm;
+            if ((b & 0xf0) == 0xf0) {
+                goto do_0f_38_fx;
+            }
             modrm = cpu_ldub_code(env, s->pc++);
             rm = modrm & 7;
             reg = ((modrm >> 3) & 7) | rex_r;
@@ -3770,71 +3903,436 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
                         tcg_gen_st_i32(cpu_tmp2_i32, cpu_env, op2_offset +
                                         offsetof(XMMReg, XMM_L(0)));
                         break;
-                    case 0x22: case 0x32: /* pmovsxbq, pmovzxbq */
-                        tcg_gen_qemu_ld16u(cpu_tmp0, cpu_A0,
-                                          (s->mem_index >> 2) - 1);
-                        tcg_gen_st16_tl(cpu_tmp0, cpu_env, op2_offset +
-                                        offsetof(XMMReg, XMM_W(0)));
+                    case 0x22: case 0x32: /* pmovsxbq, pmovzxbq */
+                        tcg_gen_qemu_ld16u(cpu_tmp0, cpu_A0,
+                                          (s->mem_index >> 2) - 1);
+                        tcg_gen_st16_tl(cpu_tmp0, cpu_env, op2_offset +
+                                        offsetof(XMMReg, XMM_W(0)));
+                        break;
+                    case 0x2a:            /* movntqda */
+                        gen_ldo_env_A0(s->mem_index, op1_offset);
+                        return;
+                    default:
+                        gen_ldo_env_A0(s->mem_index, op2_offset);
+                    }
+                }
+            } else {
+                op1_offset = offsetof(CPUX86State,fpregs[reg].mmx);
+                if (mod == 3) {
+                    op2_offset = offsetof(CPUX86State,fpregs[rm].mmx);
+                } else {
+                    op2_offset = offsetof(CPUX86State,mmx_t0);
+                    gen_lea_modrm(env, s, modrm, &reg_addr, &offset_addr);
+                    gen_ldq_env_A0(s->mem_index, op2_offset);
+                }
+            }
+            if (sse_fn_epp == SSE_SPECIAL) {
+                goto illegal_op;
+            }
+
+            tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset);
+            tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
+            sse_fn_epp(cpu_env, cpu_ptr0, cpu_ptr1);
+
+            if (b == 0x17) {
+                set_cc_op(s, CC_OP_EFLAGS);
+            }
+            break;
+
+        case 0x238:
+        case 0x338:
+        do_0f_38_fx:
+            /* Various integer extensions at 0f 38 f[0-f].  */
+            b = modrm | (b1 << 8);
+            modrm = cpu_ldub_code(env, s->pc++);
+            reg = ((modrm >> 3) & 7) | rex_r;
+
+            switch (b) {
+            case 0x3f0: /* crc32 Gd,Eb */
+            case 0x3f1: /* crc32 Gd,Ey */
+            do_crc32:
+                if (!(s->cpuid_ext_features & CPUID_EXT_SSE42)) {
+                    goto illegal_op;
+                }
+                if ((b & 0xff) == 0xf0) {
+                    ot = OT_BYTE;
+                } else if (s->dflag != 2) {
+                    ot = (s->prefix & PREFIX_DATA ? OT_WORD : OT_LONG);
+                } else {
+                    ot = OT_QUAD;
+                }
+
+                gen_op_mov_TN_reg(OT_LONG, 0, reg);
+                tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                gen_helper_crc32(cpu_T[0], cpu_tmp2_i32,
+                                 cpu_T[0], tcg_const_i32(8 << ot));
+
+                ot = (s->dflag == 2) ? OT_QUAD : OT_LONG;
+                gen_op_mov_reg_T0(ot, reg);
+                break;
+
+            case 0x1f0: /* crc32 or movbe */
+            case 0x1f1:
+                /* For these insns, the f3 prefix is supposed to have priority
+                   over the 66 prefix, but that's not what we implement above
+                   setting b1.  */
+                if (s->prefix & PREFIX_REPNZ) {
+                    goto do_crc32;
+                }
+                /* FALLTHRU */
+            case 0x0f0: /* movbe Gy,My */
+            case 0x0f1: /* movbe My,Gy */
+                if (!(s->cpuid_ext_features & CPUID_EXT_MOVBE)) {
+                    goto illegal_op;
+                }
+                if (s->dflag != 2) {
+                    ot = (s->prefix & PREFIX_DATA ? OT_WORD : OT_LONG);
+                } else {
+                    ot = OT_QUAD;
+                }
+
+                /* Load the data incoming to the bswap.  Note that the TCG
+                   implementation of bswap requires the input be zero
+                   extended.  In the case of the loads, we simply know that
+                   gen_op_ld_v via gen_ldst_modrm does that already.  */
+                if ((b & 1) == 0) {
+                    gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                } else {
+                    switch (ot) {
+                    case OT_WORD:
+                        tcg_gen_ext16u_tl(cpu_T[0], cpu_regs[reg]);
+                        break;
+                    default:
+                        tcg_gen_ext32u_tl(cpu_T[0], cpu_regs[reg]);
+                        break;
+                    case OT_QUAD:
+                        tcg_gen_mov_tl(cpu_T[0], cpu_regs[reg]);
+                        break;
+                    }
+                }
+
+                switch (ot) {
+                case OT_WORD:
+                    tcg_gen_bswap16_tl(cpu_T[0], cpu_T[0]);
+                    break;
+                default:
+                    tcg_gen_bswap32_tl(cpu_T[0], cpu_T[0]);
+                    break;
+#ifdef TARGET_X86_64
+                case OT_QUAD:
+                    tcg_gen_bswap64_tl(cpu_T[0], cpu_T[0]);
+                    break;
+#endif
+                }
+
+                if ((b & 1) == 0) {
+                    gen_op_mov_reg_T0(ot, reg);
+                } else {
+                    gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 1);
+                }
+                break;
+
+            case 0x0f2: /* andn Gy, By, Ey */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                tcg_gen_andc_tl(cpu_T[0], cpu_regs[s->vex_v], cpu_T[0]);
+                gen_op_mov_reg_T0(ot, reg);
+                gen_op_update1_cc();
+                set_cc_op(s, CC_OP_LOGICB + ot);
+                break;
+
+            case 0x0f7: /* bextr Gy, Ey, By */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                {
+                    TCGv bound, zero;
+
+                    gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                    /* Extract START, and shift the operand.
+                       Shifts larger than operand size get zeros.  */
+                    tcg_gen_ext8u_tl(cpu_A0, cpu_regs[s->vex_v]);
+                    tcg_gen_shr_tl(cpu_T[0], cpu_T[0], cpu_A0);
+
+                    bound = tcg_const_tl(ot == OT_QUAD ? 63 : 31);
+                    zero = tcg_const_tl(0);
+                    tcg_gen_movcond_tl(TCG_COND_LEU, cpu_T[0], cpu_A0, bound,
+                                       cpu_T[0], zero);
+                    tcg_temp_free(zero);
+
+                    /* Extract the LEN into a mask.  Lengths larger than
+                       operand size get all ones.  */
+                    tcg_gen_shri_tl(cpu_A0, cpu_regs[s->vex_v], 8);
+                    tcg_gen_ext8u_tl(cpu_A0, cpu_A0);
+                    tcg_gen_movcond_tl(TCG_COND_LEU, cpu_A0, cpu_A0, bound,
+                                       cpu_A0, bound);
+                    tcg_temp_free(bound);
+                    tcg_gen_movi_tl(cpu_T[1], 1);
+                    tcg_gen_shl_tl(cpu_T[1], cpu_T[1], cpu_A0);
+                    tcg_gen_subi_tl(cpu_T[1], cpu_T[1], 1);
+                    tcg_gen_and_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+
+                    gen_op_mov_reg_T0(ot, reg);
+                    gen_op_update1_cc();
+                    set_cc_op(s, CC_OP_LOGICB + ot);
+                }
+                break;
+
+            case 0x0f5: /* bzhi Gy, Ey, By */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                tcg_gen_ext8u_tl(cpu_T[1], cpu_regs[s->vex_v]);
+                {
+                    TCGv bound = tcg_const_tl(ot == OT_QUAD ? 63 : 31);
+                    /* Note that since we're using BMILG (in order to get O
+                       cleared) we need to store the inverse into C.  */
+                    tcg_gen_setcond_tl(TCG_COND_LT, cpu_cc_src,
+                                       cpu_T[1], bound);
+                    tcg_gen_movcond_tl(TCG_COND_GT, cpu_T[1], cpu_T[1],
+                                       bound, bound, cpu_T[1]);
+                    tcg_temp_free(bound);
+                }
+                tcg_gen_movi_tl(cpu_A0, -1);
+                tcg_gen_shl_tl(cpu_A0, cpu_A0, cpu_T[1]);
+                tcg_gen_andc_tl(cpu_T[0], cpu_T[0], cpu_A0);
+                gen_op_mov_reg_T0(ot, reg);
+                gen_op_update1_cc();
+                set_cc_op(s, CC_OP_BMILGB + ot);
+                break;
+
+            case 0x3f6: /* mulx By, Gy, rdx, Ey */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                switch (ot) {
+                default:
+                    tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                    tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_regs[R_EDX]);
+                    tcg_gen_mulu2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
+                                      cpu_tmp2_i32, cpu_tmp3_i32);
+                    tcg_gen_extu_i32_tl(cpu_regs[s->vex_v], cpu_tmp2_i32);
+                    tcg_gen_extu_i32_tl(cpu_regs[reg], cpu_tmp3_i32);
+                    break;
+#ifdef TARGET_X86_64
+                case OT_QUAD:
+                    tcg_gen_mulu2_i64(cpu_regs[s->vex_v], cpu_regs[reg],
+                                      cpu_T[0], cpu_regs[R_EDX]);
+                    break;
+#endif
+                }
+                break;
+
+            case 0x3f5: /* pdep Gy, By, Ey */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                /* Note that by zero-extending the mask operand, we
+                   automatically handle zero-extending the result.  */
+                if (s->dflag == 2) {
+                    tcg_gen_mov_tl(cpu_T[1], cpu_regs[s->vex_v]);
+                } else {
+                    tcg_gen_ext32u_tl(cpu_T[1], cpu_regs[s->vex_v]);
+                }
+                gen_helper_pdep(cpu_regs[reg], cpu_T[0], cpu_T[1]);
+                break;
+
+            case 0x2f5: /* pext Gy, By, Ey */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                /* Note that by zero-extending the mask operand, we
+                   automatically handle zero-extending the result.  */
+                if (s->dflag == 2) {
+                    tcg_gen_mov_tl(cpu_T[1], cpu_regs[s->vex_v]);
+                } else {
+                    tcg_gen_ext32u_tl(cpu_T[1], cpu_regs[s->vex_v]);
+                }
+                gen_helper_pext(cpu_regs[reg], cpu_T[0], cpu_T[1]);
+                break;
+
+            case 0x1f6: /* adcx Gy, Ey */
+            case 0x2f6: /* adox Gy, Ey */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_ADX)) {
+                    goto illegal_op;
+                } else {
+                    TCGv carry_in, carry_out, zero;
+                    int end_op;
+
+                    ot = (s->dflag == 2 ? OT_QUAD : OT_LONG);
+                    gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+
+                    /* Re-use the carry-out from a previous round.  */
+                    TCGV_UNUSED(carry_in);
+                    carry_out = (b == 0x1f6 ? cpu_cc_dst : cpu_cc_src2);
+                    switch (s->cc_op) {
+                    case CC_OP_ADCX:
+                        if (b == 0x1f6) {
+                            carry_in = cpu_cc_dst;
+                            end_op = CC_OP_ADCX;
+                        } else {
+                            end_op = CC_OP_ADCOX;
+                        }
+                        break;
+                    case CC_OP_ADOX:
+                        if (b == 0x1f6) {
+                            end_op = CC_OP_ADCOX;
+                        } else {
+                            carry_in = cpu_cc_src2;
+                            end_op = CC_OP_ADOX;
+                        }
+                        break;
+                    case CC_OP_ADCOX:
+                        end_op = CC_OP_ADCOX;
+                        carry_in = carry_out;
+                        break;
+                    default:
+                        end_op = (b == 0x1f6 ? CC_OP_ADCX : CC_OP_ADOX);
+                        break;
+                    }
+                    /* If we can't reuse carry-out, get it out of EFLAGS.  */
+                    if (TCGV_IS_UNUSED(carry_in)) {
+                        if (s->cc_op != CC_OP_ADCX && s->cc_op != CC_OP_ADOX) {
+                            gen_compute_eflags(s);
+                        }
+                        carry_in = cpu_tmp0;
+                        tcg_gen_shri_tl(carry_in, cpu_cc_src,
+                                        ctz32(b == 0x1f6 ? CC_C : CC_O));
+                        tcg_gen_andi_tl(carry_in, carry_in, 1);
+                    }
+
+                    switch (ot) {
+#ifdef TARGET_X86_64
+                    case OT_LONG:
+                        /* If we know TL is 64-bit, and we want a 32-bit
+                           result, just do everything in 64-bit arithmetic.  */
+                        tcg_gen_ext32u_i64(cpu_regs[reg], cpu_regs[reg]);
+                        tcg_gen_ext32u_i64(cpu_T[0], cpu_T[0]);
+                        tcg_gen_add_i64(cpu_T[0], cpu_T[0], cpu_regs[reg]);
+                        tcg_gen_add_i64(cpu_T[0], cpu_T[0], carry_in);
+                        tcg_gen_ext32u_i64(cpu_regs[reg], cpu_T[0]);
+                        tcg_gen_shri_i64(carry_out, cpu_T[0], 32);
                         break;
-                    case 0x2a:            /* movntqda */
-                        gen_ldo_env_A0(s->mem_index, op1_offset);
-                        return;
+#endif
                     default:
-                        gen_ldo_env_A0(s->mem_index, op2_offset);
+                        /* Otherwise compute the carry-out in two steps.  */
+                        zero = tcg_const_tl(0);
+                        tcg_gen_add2_tl(cpu_T[0], carry_out,
+                                        cpu_T[0], zero,
+                                        carry_in, zero);
+                        tcg_gen_add2_tl(cpu_regs[reg], carry_out,
+                                        cpu_regs[reg], carry_out,
+                                        cpu_T[0], zero);
+                        tcg_temp_free(zero);
+                        break;
                     }
+                    set_cc_op(s, end_op);
                 }
-            } else {
-                op1_offset = offsetof(CPUX86State,fpregs[reg].mmx);
-                if (mod == 3) {
-                    op2_offset = offsetof(CPUX86State,fpregs[rm].mmx);
+                break;
+
+            case 0x1f7: /* shlx Gy, Ey, By */
+            case 0x2f7: /* sarx Gy, Ey, By */
+            case 0x3f7: /* shrx Gy, Ey, By */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = (s->dflag == 2 ? OT_QUAD : OT_LONG);
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                if (ot == OT_QUAD) {
+                    tcg_gen_andi_tl(cpu_T[1], cpu_regs[s->vex_v], 63);
                 } else {
-                    op2_offset = offsetof(CPUX86State,mmx_t0);
-                    gen_lea_modrm(env, s, modrm, &reg_addr, &offset_addr);
-                    gen_ldq_env_A0(s->mem_index, op2_offset);
+                    tcg_gen_andi_tl(cpu_T[1], cpu_regs[s->vex_v], 31);
                 }
-            }
-            if (sse_fn_epp == SSE_SPECIAL) {
-                goto illegal_op;
-            }
-
-            tcg_gen_addi_ptr(cpu_ptr0, cpu_env, op1_offset);
-            tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
-            sse_fn_epp(cpu_env, cpu_ptr0, cpu_ptr1);
+                if (b == 0x1f7) {
+                    tcg_gen_shl_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+                } else if (b == 0x2f7) {
+                    if (ot != OT_QUAD) {
+                        tcg_gen_ext32s_tl(cpu_T[0], cpu_T[0]);
+                    }
+                    tcg_gen_sar_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+                } else {
+                    if (ot != OT_QUAD) {
+                        tcg_gen_ext32u_tl(cpu_T[0], cpu_T[0]);
+                    }
+                    tcg_gen_shr_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+                }
+                gen_op_mov_reg_T0(ot, reg);
+                break;
 
-            if (b == 0x17) {
-                set_cc_op(s, CC_OP_EFLAGS);
-            }
-            break;
-        case 0x338: /* crc32 */
-        crc32:
-            b = modrm;
-            modrm = cpu_ldub_code(env, s->pc++);
-            reg = ((modrm >> 3) & 7) | rex_r;
+            case 0x0f3:
+            case 0x1f3:
+            case 0x2f3:
+            case 0x3f3: /* Group 17 */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+
+                switch (reg & 7) {
+                case 1: /* blsr By,Ey */
+                    tcg_gen_neg_tl(cpu_T[1], cpu_T[0]);
+                    tcg_gen_and_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
+                    gen_op_mov_reg_T0(ot, s->vex_v);
+                    gen_op_update2_cc();
+                    set_cc_op(s, CC_OP_BMILGB + ot);
+                    break;
 
-            if (b != 0xf0 && b != 0xf1)
-                goto illegal_op;
-            if (!(s->cpuid_ext_features & CPUID_EXT_SSE42))
-                goto illegal_op;
+                case 2: /* blsmsk By,Ey */
+                    tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
+                    tcg_gen_subi_tl(cpu_T[0], cpu_T[0], 1);
+                    tcg_gen_xor_tl(cpu_T[0], cpu_T[0], cpu_cc_src);
+                    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
+                    set_cc_op(s, CC_OP_BMILGB + ot);
+                    break;
 
-            if (b == 0xf0)
-                ot = OT_BYTE;
-            else if (b == 0xf1 && s->dflag != 2)
-                if (s->prefix & PREFIX_DATA)
-                    ot = OT_WORD;
-                else
-                    ot = OT_LONG;
-            else
-                ot = OT_QUAD;
+                case 3: /* blsi By, Ey */
+                    tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
+                    tcg_gen_subi_tl(cpu_T[0], cpu_T[0], 1);
+                    tcg_gen_and_tl(cpu_T[0], cpu_T[0], cpu_cc_src);
+                    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
+                    set_cc_op(s, CC_OP_BMILGB + ot);
+                    break;
 
-            gen_op_mov_TN_reg(OT_LONG, 0, reg);
-            tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
-            gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
-            gen_helper_crc32(cpu_T[0], cpu_tmp2_i32,
-                             cpu_T[0], tcg_const_i32(8 << ot));
+                default:
+                    goto illegal_op;
+                }
+                break;
 
-            ot = (s->dflag == 2) ? OT_QUAD : OT_LONG;
-            gen_op_mov_reg_T0(ot, reg);
+            default:
+                goto illegal_op;
+            }
             break;
+
         case 0x03a:
         case 0x13a:
             b = modrm;
@@ -3918,9 +4416,9 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
                     if (mod == 3)
                         gen_op_mov_TN_reg(OT_LONG, 0, rm);
                     else
-                        tcg_gen_qemu_ld8u(cpu_tmp0, cpu_A0,
+                        tcg_gen_qemu_ld8u(cpu_T[0], cpu_A0,
                                         (s->mem_index >> 2) - 1);
-                    tcg_gen_st8_tl(cpu_tmp0, cpu_env, offsetof(CPUX86State,
+                    tcg_gen_st8_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,
                                             xmm_regs[reg].XMM_B(val & 15)));
                     break;
                 case 0x21: /* insertps */
@@ -4016,6 +4514,38 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
             tcg_gen_addi_ptr(cpu_ptr1, cpu_env, op2_offset);
             sse_fn_eppi(cpu_env, cpu_ptr0, cpu_ptr1, tcg_const_i32(val));
             break;
+
+        case 0x33a:
+            /* Various integer extensions at 0f 3a f[0-f].  */
+            b = modrm | (b1 << 8);
+            modrm = cpu_ldub_code(env, s->pc++);
+            reg = ((modrm >> 3) & 7) | rex_r;
+
+            switch (b) {
+            case 0x3f0: /* rorx Gy,Ey, Ib */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI2)
+                    || !(s->prefix & PREFIX_VEX)
+                    || s->vex_l != 0) {
+                    goto illegal_op;
+                }
+                ot = s->dflag == 2 ? OT_QUAD : OT_LONG;
+                gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+                b = cpu_ldub_code(env, s->pc++);
+                if (ot == OT_QUAD) {
+                    tcg_gen_rotri_tl(cpu_T[0], cpu_T[0], b & 63);
+                } else {
+                    tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                    tcg_gen_rotri_i32(cpu_tmp2_i32, cpu_tmp2_i32, b & 31);
+                    tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32);
+                }
+                gen_op_mov_reg_T0(ot, reg);
+                break;
+
+            default:
+                goto illegal_op;
+            }
+            break;
+
         default:
             goto illegal_op;
         }
@@ -4147,8 +4677,6 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
     }
     s->pc = pc_start;
     prefixes = 0;
-    aflag = s->code32;
-    dflag = s->code32;
     s->override = -1;
     rex_w = -1;
     rex_r = 0;
@@ -4158,47 +4686,49 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
     x86_64_hregs = 0;
 #endif
     s->rip_offset = 0; /* for relative ip address */
+    s->vex_l = 0;
+    s->vex_v = 0;
  next_byte:
     b = cpu_ldub_code(env, s->pc);
     s->pc++;
-    /* check prefixes */
+    /* Collect prefixes.  */
+    switch (b) {
+    case 0xf3:
+        prefixes |= PREFIX_REPZ;
+        goto next_byte;
+    case 0xf2:
+        prefixes |= PREFIX_REPNZ;
+        goto next_byte;
+    case 0xf0:
+        prefixes |= PREFIX_LOCK;
+        goto next_byte;
+    case 0x2e:
+        s->override = R_CS;
+        goto next_byte;
+    case 0x36:
+        s->override = R_SS;
+        goto next_byte;
+    case 0x3e:
+        s->override = R_DS;
+        goto next_byte;
+    case 0x26:
+        s->override = R_ES;
+        goto next_byte;
+    case 0x64:
+        s->override = R_FS;
+        goto next_byte;
+    case 0x65:
+        s->override = R_GS;
+        goto next_byte;
+    case 0x66:
+        prefixes |= PREFIX_DATA;
+        goto next_byte;
+    case 0x67:
+        prefixes |= PREFIX_ADR;
+        goto next_byte;
 #ifdef TARGET_X86_64
-    if (CODE64(s)) {
-        switch (b) {
-        case 0xf3:
-            prefixes |= PREFIX_REPZ;
-            goto next_byte;
-        case 0xf2:
-            prefixes |= PREFIX_REPNZ;
-            goto next_byte;
-        case 0xf0:
-            prefixes |= PREFIX_LOCK;
-            goto next_byte;
-        case 0x2e:
-            s->override = R_CS;
-            goto next_byte;
-        case 0x36:
-            s->override = R_SS;
-            goto next_byte;
-        case 0x3e:
-            s->override = R_DS;
-            goto next_byte;
-        case 0x26:
-            s->override = R_ES;
-            goto next_byte;
-        case 0x64:
-            s->override = R_FS;
-            goto next_byte;
-        case 0x65:
-            s->override = R_GS;
-            goto next_byte;
-        case 0x66:
-            prefixes |= PREFIX_DATA;
-            goto next_byte;
-        case 0x67:
-            prefixes |= PREFIX_ADR;
-            goto next_byte;
-        case 0x40 ... 0x4f:
+    case 0x40 ... 0x4f:
+        if (CODE64(s)) {
             /* REX prefix */
             rex_w = (b >> 3) & 1;
             rex_r = (b & 0x4) << 1;
@@ -4207,57 +4737,86 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             x86_64_hregs = 1; /* select uniform byte register addressing */
             goto next_byte;
         }
-        if (rex_w == 1) {
-            /* 0x66 is ignored if rex.w is set */
-            dflag = 2;
-        } else {
-            if (prefixes & PREFIX_DATA)
-                dflag ^= 1;
-        }
-        if (!(prefixes & PREFIX_ADR))
-            aflag = 2;
-    } else
+        break;
 #endif
-    {
-        switch (b) {
-        case 0xf3:
-            prefixes |= PREFIX_REPZ;
-            goto next_byte;
-        case 0xf2:
-            prefixes |= PREFIX_REPNZ;
-            goto next_byte;
-        case 0xf0:
-            prefixes |= PREFIX_LOCK;
-            goto next_byte;
-        case 0x2e:
-            s->override = R_CS;
-            goto next_byte;
-        case 0x36:
-            s->override = R_SS;
-            goto next_byte;
-        case 0x3e:
-            s->override = R_DS;
-            goto next_byte;
-        case 0x26:
-            s->override = R_ES;
-            goto next_byte;
-        case 0x64:
-            s->override = R_FS;
-            goto next_byte;
-        case 0x65:
-            s->override = R_GS;
-            goto next_byte;
-        case 0x66:
-            prefixes |= PREFIX_DATA;
-            goto next_byte;
-        case 0x67:
-            prefixes |= PREFIX_ADR;
-            goto next_byte;
+    case 0xc5: /* 2-byte VEX */
+    case 0xc4: /* 3-byte VEX */
+        /* VEX prefixes cannot be used except in 32-bit mode.
+           Otherwise the instruction is LES or LDS.  */
+        if (s->code32 && !s->vm86) {
+            static const int pp_prefix[4] = {
+                0, PREFIX_DATA, PREFIX_REPZ, PREFIX_REPNZ
+            };
+            int vex3, vex2 = cpu_ldub_code(env, s->pc);
+
+            if (!CODE64(s) && (vex2 & 0xc0) != 0xc0) {
+                /* 4.1.4.6: In 32-bit mode, bits [7:6] must be 11b,
+                   otherwise the instruction is LES or LDS.  */
+                break;
+            }
+            s->pc++;
+
+            /* 4.1.1-4.1.3: No preceding lock, 66, f2, f3, or rex prefixes. */
+            if (prefixes & (PREFIX_REPZ | PREFIX_REPNZ
+                            | PREFIX_LOCK | PREFIX_DATA)) {
+                goto illegal_op;
+            }
+#ifdef TARGET_X86_64
+            if (x86_64_hregs) {
+                goto illegal_op;
+            }
+#endif
+            rex_r = (~vex2 >> 4) & 8;
+            if (b == 0xc5) {
+                vex3 = vex2;
+                b = cpu_ldub_code(env, s->pc++);
+            } else {
+#ifdef TARGET_X86_64
+                s->rex_x = (~vex2 >> 3) & 8;
+                s->rex_b = (~vex2 >> 2) & 8;
+#endif
+                vex3 = cpu_ldub_code(env, s->pc++);
+                rex_w = (vex3 >> 7) & 1;
+                switch (vex2 & 0x1f) {
+                case 0x01: /* Implied 0f leading opcode bytes.  */
+                    b = cpu_ldub_code(env, s->pc++) | 0x100;
+                    break;
+                case 0x02: /* Implied 0f 38 leading opcode bytes.  */
+                    b = 0x138;
+                    break;
+                case 0x03: /* Implied 0f 3a leading opcode bytes.  */
+                    b = 0x13a;
+                    break;
+                default:   /* Reserved for future use.  */
+                    goto illegal_op;
+                }
+            }
+            s->vex_v = (~vex3 >> 3) & 0xf;
+            s->vex_l = (vex3 >> 2) & 1;
+            prefixes |= pp_prefix[vex3 & 3] | PREFIX_VEX;
         }
-        if (prefixes & PREFIX_DATA)
+        break;
+    }
+
+    /* Post-process prefixes.  */
+    if (CODE64(s)) {
+        /* In 64-bit mode, the default data size is 32-bit.  Select 64-bit
+           data with rex_w, and 16-bit data with 0x66; rex_w takes precedence
+           over 0x66 if both are present.  */
+        dflag = (rex_w > 0 ? 2 : prefixes & PREFIX_DATA ? 0 : 1);
+        /* In 64-bit mode, 0x67 selects 32-bit addressing.  */
+        aflag = (prefixes & PREFIX_ADR ? 1 : 2);
+    } else {
+        /* In 16/32-bit mode, 0x66 selects the opposite data size.  */
+        dflag = s->code32;
+        if (prefixes & PREFIX_DATA) {
             dflag ^= 1;
-        if (prefixes & PREFIX_ADR)
+        }
+        /* In 16/32-bit mode, 0x67 selects the opposite addressing.  */
+        aflag = s->code32;
+        if (prefixes & PREFIX_ADR) {
             aflag ^= 1;
+        }
     }
 
     s->prefix = prefixes;
@@ -4309,10 +4868,9 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 } else if (op == OP_XORL && rm == reg) {
                 xor_zero:
                     /* xor reg, reg optimisation */
+                    set_cc_op(s, CC_OP_CLR);
                     gen_op_movl_T0_0();
-                    set_cc_op(s, CC_OP_LOGICB + ot);
                     gen_op_mov_reg_T0(ot, reg);
-                    gen_op_update1_cc();
                     break;
                 } else {
                     opreg = rm;
@@ -4473,39 +5031,22 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 break;
             default:
             case OT_LONG:
-#ifdef TARGET_X86_64
-                gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
-                tcg_gen_ext32u_tl(cpu_T[0], cpu_T[0]);
-                tcg_gen_ext32u_tl(cpu_T[1], cpu_T[1]);
-                tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
-                gen_op_mov_reg_T0(OT_LONG, R_EAX);
-                tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 32);
-                gen_op_mov_reg_T0(OT_LONG, R_EDX);
-                tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
-#else
-                {
-                    TCGv_i64 t0, t1;
-                    t0 = tcg_temp_new_i64();
-                    t1 = tcg_temp_new_i64();
-                    gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
-                    tcg_gen_extu_i32_i64(t0, cpu_T[0]);
-                    tcg_gen_extu_i32_i64(t1, cpu_T[1]);
-                    tcg_gen_mul_i64(t0, t0, t1);
-                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
-                    gen_op_mov_reg_T0(OT_LONG, R_EAX);
-                    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                    tcg_gen_shri_i64(t0, t0, 32);
-                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
-                    gen_op_mov_reg_T0(OT_LONG, R_EDX);
-                    tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
-                }
-#endif
+                tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_regs[R_EAX]);
+                tcg_gen_mulu2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
+                                  cpu_tmp2_i32, cpu_tmp3_i32);
+                tcg_gen_extu_i32_tl(cpu_regs[R_EAX], cpu_tmp2_i32);
+                tcg_gen_extu_i32_tl(cpu_regs[R_EDX], cpu_tmp3_i32);
+                tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
+                tcg_gen_mov_tl(cpu_cc_src, cpu_regs[R_EDX]);
                 set_cc_op(s, CC_OP_MULL);
                 break;
 #ifdef TARGET_X86_64
             case OT_QUAD:
-                gen_helper_mulq_EAX_T0(cpu_env, cpu_T[0]);
+                tcg_gen_mulu2_i64(cpu_regs[R_EAX], cpu_regs[R_EDX],
+                                  cpu_T[0], cpu_regs[R_EAX]);
+                tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
+                tcg_gen_mov_tl(cpu_cc_src, cpu_regs[R_EDX]);
                 set_cc_op(s, CC_OP_MULQ);
                 break;
 #endif
@@ -4541,41 +5082,25 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 break;
             default:
             case OT_LONG:
-#ifdef TARGET_X86_64
-                gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
-                tcg_gen_ext32s_tl(cpu_T[0], cpu_T[0]);
-                tcg_gen_ext32s_tl(cpu_T[1], cpu_T[1]);
-                tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
-                gen_op_mov_reg_T0(OT_LONG, R_EAX);
-                tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                tcg_gen_ext32s_tl(cpu_tmp0, cpu_T[0]);
-                tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
-                tcg_gen_shri_tl(cpu_T[0], cpu_T[0], 32);
-                gen_op_mov_reg_T0(OT_LONG, R_EDX);
-#else
-                {
-                    TCGv_i64 t0, t1;
-                    t0 = tcg_temp_new_i64();
-                    t1 = tcg_temp_new_i64();
-                    gen_op_mov_TN_reg(OT_LONG, 1, R_EAX);
-                    tcg_gen_ext_i32_i64(t0, cpu_T[0]);
-                    tcg_gen_ext_i32_i64(t1, cpu_T[1]);
-                    tcg_gen_mul_i64(t0, t0, t1);
-                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
-                    gen_op_mov_reg_T0(OT_LONG, R_EAX);
-                    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                    tcg_gen_sari_tl(cpu_tmp0, cpu_T[0], 31);
-                    tcg_gen_shri_i64(t0, t0, 32);
-                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
-                    gen_op_mov_reg_T0(OT_LONG, R_EDX);
-                    tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
-                }
-#endif
+                tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+                tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_regs[R_EAX]);
+                tcg_gen_muls2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
+                                  cpu_tmp2_i32, cpu_tmp3_i32);
+                tcg_gen_extu_i32_tl(cpu_regs[R_EAX], cpu_tmp2_i32);
+                tcg_gen_extu_i32_tl(cpu_regs[R_EDX], cpu_tmp3_i32);
+                tcg_gen_sari_i32(cpu_tmp2_i32, cpu_tmp2_i32, 31);
+                tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
+                tcg_gen_sub_i32(cpu_tmp2_i32, cpu_tmp2_i32, cpu_tmp3_i32);
+                tcg_gen_extu_i32_tl(cpu_cc_src, cpu_tmp2_i32);
                 set_cc_op(s, CC_OP_MULL);
                 break;
 #ifdef TARGET_X86_64
             case OT_QUAD:
-                gen_helper_imulq_EAX_T0(cpu_env, cpu_T[0]);
+                tcg_gen_muls2_i64(cpu_regs[R_EAX], cpu_regs[R_EDX],
+                                  cpu_T[0], cpu_regs[R_EAX]);
+                tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[R_EAX]);
+                tcg_gen_sari_tl(cpu_cc_src, cpu_regs[R_EAX], 63);
+                tcg_gen_sub_tl(cpu_cc_src, cpu_cc_src, cpu_regs[R_EDX]);
                 set_cc_op(s, CC_OP_MULQ);
                 break;
 #endif
@@ -4830,37 +5355,27 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         } else {
             gen_op_mov_TN_reg(ot, 1, reg);
         }
-
-#ifdef TARGET_X86_64
-        if (ot == OT_QUAD) {
-            gen_helper_imulq_T0_T1(cpu_T[0], cpu_env, cpu_T[0], cpu_T[1]);
-        } else
-#endif
-        if (ot == OT_LONG) {
+        switch (ot) {
 #ifdef TARGET_X86_64
-                tcg_gen_ext32s_tl(cpu_T[0], cpu_T[0]);
-                tcg_gen_ext32s_tl(cpu_T[1], cpu_T[1]);
-                tcg_gen_mul_tl(cpu_T[0], cpu_T[0], cpu_T[1]);
-                tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                tcg_gen_ext32s_tl(cpu_tmp0, cpu_T[0]);
-                tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
-#else
-                {
-                    TCGv_i64 t0, t1;
-                    t0 = tcg_temp_new_i64();
-                    t1 = tcg_temp_new_i64();
-                    tcg_gen_ext_i32_i64(t0, cpu_T[0]);
-                    tcg_gen_ext_i32_i64(t1, cpu_T[1]);
-                    tcg_gen_mul_i64(t0, t0, t1);
-                    tcg_gen_trunc_i64_i32(cpu_T[0], t0);
-                    tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
-                    tcg_gen_sari_tl(cpu_tmp0, cpu_T[0], 31);
-                    tcg_gen_shri_i64(t0, t0, 32);
-                    tcg_gen_trunc_i64_i32(cpu_T[1], t0);
-                    tcg_gen_sub_tl(cpu_cc_src, cpu_T[1], cpu_tmp0);
-                }
+        case OT_QUAD:
+            tcg_gen_muls2_i64(cpu_regs[reg], cpu_T[1], cpu_T[0], cpu_T[1]);
+            tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[reg]);
+            tcg_gen_sari_tl(cpu_cc_src, cpu_cc_dst, 63);
+            tcg_gen_sub_tl(cpu_cc_src, cpu_cc_src, cpu_T[1]);
+            break;
 #endif
-        } else {
+        case OT_LONG:
+            tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]);
+            tcg_gen_trunc_tl_i32(cpu_tmp3_i32, cpu_T[1]);
+            tcg_gen_muls2_i32(cpu_tmp2_i32, cpu_tmp3_i32,
+                              cpu_tmp2_i32, cpu_tmp3_i32);
+            tcg_gen_extu_i32_tl(cpu_regs[reg], cpu_tmp2_i32);
+            tcg_gen_sari_i32(cpu_tmp2_i32, cpu_tmp2_i32, 31);
+            tcg_gen_mov_tl(cpu_cc_dst, cpu_regs[reg]);
+            tcg_gen_sub_i32(cpu_tmp2_i32, cpu_tmp2_i32, cpu_tmp3_i32);
+            tcg_gen_extu_i32_tl(cpu_cc_src, cpu_tmp2_i32);
+            break;
+        default:
             tcg_gen_ext16s_tl(cpu_T[0], cpu_T[0]);
             tcg_gen_ext16s_tl(cpu_T[1], cpu_T[1]);
             /* XXX: use 32 bit mul which could be faster */
@@ -4868,8 +5383,9 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
             tcg_gen_ext16s_tl(cpu_tmp0, cpu_T[0]);
             tcg_gen_sub_tl(cpu_cc_src, cpu_T[0], cpu_tmp0);
+            gen_op_mov_reg_T0(ot, reg);
+            break;
         }
-        gen_op_mov_reg_T0(ot, reg);
         set_cc_op(s, CC_OP_MULB + ot);
         break;
     case 0x1c0:
@@ -4927,9 +5443,10 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 rm = 0; /* avoid warning */
             }
             label1 = gen_new_label();
-            tcg_gen_sub_tl(t2, cpu_regs[R_EAX], t0);
+            tcg_gen_mov_tl(t2, cpu_regs[R_EAX]);
+            gen_extu(ot, t0);
             gen_extu(ot, t2);
-            tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, label1);
+            tcg_gen_brcond_tl(TCG_COND_EQ, t2, t0, label1);
             label2 = gen_new_label();
             if (mod == 3) {
                 gen_op_mov_reg_v(ot, R_EAX, t0);
@@ -4948,7 +5465,8 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             }
             gen_set_label(label2);
             tcg_gen_mov_tl(cpu_cc_src, t0);
-            tcg_gen_mov_tl(cpu_cc_dst, t2);
+            tcg_gen_mov_tl(cpu_cc_srcT, t2);
+            tcg_gen_sub_tl(cpu_cc_dst, t2, t0);
             set_cc_op(s, CC_OP_SUBB + ot);
             tcg_temp_free(t0);
             tcg_temp_free(t1);
@@ -5383,13 +5901,11 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         }
         break;
     case 0xc4: /* les Gv */
-        if (CODE64(s))
-            goto illegal_op;
+        /* In CODE64 this is VEX3; see above.  */
         op = R_ES;
         goto do_lxx;
     case 0xc5: /* lds Gv */
-        if (CODE64(s))
-            goto illegal_op;
+        /* In CODE64 this is VEX2; see above.  */
         op = R_DS;
         goto do_lxx;
     case 0x1b2: /* lss Gv */
@@ -5500,12 +6016,12 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         gen_op_mov_TN_reg(ot, 1, reg);
 
         if (shift) {
-            val = cpu_ldub_code(env, s->pc++);
-            tcg_gen_movi_tl(cpu_T3, val);
+            TCGv imm = tcg_const_tl(cpu_ldub_code(env, s->pc++));
+            gen_shiftd_rm_T1(s, ot, opreg, op, imm);
+            tcg_temp_free(imm);
         } else {
-            tcg_gen_mov_tl(cpu_T3, cpu_regs[R_ECX]);
+            gen_shiftd_rm_T1(s, ot, opreg, op, cpu_regs[R_ECX]);
         }
-        gen_shiftd_rm_T1_T3(s, ot, opreg, op);
         break;
 
         /************************/
@@ -6004,7 +6520,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                     };
                     op1 = fcmov_cc[op & 3] | (((op >> 3) & 1) ^ 1);
                     l1 = gen_new_label();
-                    gen_jcc1(s, op1, l1);
+                    gen_jcc1_noeob(s, op1, l1);
                     gen_helper_fmov_ST0_STN(cpu_env, tcg_const_i32(opreg));
                     gen_set_label(l1);
                 }
@@ -6369,44 +6885,14 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
 
     case 0x190 ... 0x19f: /* setcc Gv */
         modrm = cpu_ldub_code(env, s->pc++);
-        gen_setcc(s, b);
+        gen_setcc1(s, b, cpu_T[0]);
         gen_ldst_modrm(env, s, modrm, OT_BYTE, OR_TMP0, 1);
         break;
     case 0x140 ... 0x14f: /* cmov Gv, Ev */
-        {
-            int l1;
-            TCGv t0;
-
-            ot = dflag + OT_WORD;
-            modrm = cpu_ldub_code(env, s->pc++);
-            reg = ((modrm >> 3) & 7) | rex_r;
-            mod = (modrm >> 6) & 3;
-            t0 = tcg_temp_local_new();
-            if (mod != 3) {
-                gen_lea_modrm(env, s, modrm, &reg_addr, &offset_addr);
-                gen_op_ld_v(ot + s->mem_index, t0, cpu_A0);
-            } else {
-                rm = (modrm & 7) | REX_B(s);
-                gen_op_mov_v_reg(ot, t0, rm);
-            }
-#ifdef TARGET_X86_64
-            if (ot == OT_LONG) {
-                /* XXX: specific Intel behaviour ? */
-                l1 = gen_new_label();
-                gen_jcc1(s, b ^ 1, l1);
-                tcg_gen_mov_tl(cpu_regs[reg], t0);
-                gen_set_label(l1);
-                tcg_gen_ext32u_tl(cpu_regs[reg], cpu_regs[reg]);
-            } else
-#endif
-            {
-                l1 = gen_new_label();
-                gen_jcc1(s, b ^ 1, l1);
-                gen_op_mov_reg_v(ot, reg, t0);
-                gen_set_label(l1);
-            }
-            tcg_temp_free(t0);
-        }
+        ot = dflag + OT_WORD;
+        modrm = cpu_ldub_code(env, s->pc++);
+        reg = ((modrm >> 3) & 7) | rex_r;
+        gen_cmovcc1(env, s, ot, b, modrm, reg);
         break;
 
         /************************/
@@ -6483,7 +6969,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
         if (CODE64(s) && !(s->cpuid_ext3_features & CPUID_EXT3_LAHF_LM))
             goto illegal_op;
         gen_op_mov_TN_reg(OT_BYTE, 0, R_AH);
-        gen_compute_eflags(s, cpu_cc_src);
+        gen_compute_eflags(s);
         tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, CC_O);
         tcg_gen_andi_tl(cpu_T[0], cpu_T[0], CC_S | CC_Z | CC_A | CC_P | CC_C);
         tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, cpu_T[0]);
@@ -6491,21 +6977,21 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
     case 0x9f: /* lahf */
         if (CODE64(s) && !(s->cpuid_ext3_features & CPUID_EXT3_LAHF_LM))
             goto illegal_op;
-        gen_compute_eflags(s, cpu_T[0]);
+        gen_compute_eflags(s);
         /* Note: gen_compute_eflags() only gives the condition codes */
-        tcg_gen_ori_tl(cpu_T[0], cpu_T[0], 0x02);
+        tcg_gen_ori_tl(cpu_T[0], cpu_cc_src, 0x02);
         gen_op_mov_reg_T0(OT_BYTE, R_AH);
         break;
     case 0xf5: /* cmc */
-        gen_compute_eflags(s, cpu_cc_src);
+        gen_compute_eflags(s);
         tcg_gen_xori_tl(cpu_cc_src, cpu_cc_src, CC_C);
         break;
     case 0xf8: /* clc */
-        gen_compute_eflags(s, cpu_cc_src);
+        gen_compute_eflags(s);
         tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~CC_C);
         break;
     case 0xf9: /* stc */
-        gen_compute_eflags(s, cpu_cc_src);
+        gen_compute_eflags(s);
         tcg_gen_ori_tl(cpu_cc_src, cpu_cc_src, CC_C);
         break;
     case 0xfc: /* cld */
@@ -6606,47 +7092,58 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             tcg_gen_movi_tl(cpu_cc_dst, 0);
         }
         break;
-    case 0x1bc: /* bsf */
-    case 0x1bd: /* bsr */
-        {
-            int label1;
-            TCGv t0;
-
-            ot = dflag + OT_WORD;
-            modrm = cpu_ldub_code(env, s->pc++);
-            reg = ((modrm >> 3) & 7) | rex_r;
-            gen_ldst_modrm(env, s,modrm, ot, OR_TMP0, 0);
-            gen_extu(ot, cpu_T[0]);
-            t0 = tcg_temp_local_new();
-            tcg_gen_mov_tl(t0, cpu_T[0]);
-            if ((b & 1) && (prefixes & PREFIX_REPZ) &&
-                (s->cpuid_ext3_features & CPUID_EXT3_ABM)) {
-                switch(ot) {
-                case OT_WORD: gen_helper_lzcnt(cpu_T[0], t0,
-                    tcg_const_i32(16)); break;
-                case OT_LONG: gen_helper_lzcnt(cpu_T[0], t0,
-                    tcg_const_i32(32)); break;
-                case OT_QUAD: gen_helper_lzcnt(cpu_T[0], t0,
-                    tcg_const_i32(64)); break;
-                }
-                gen_op_mov_reg_T0(ot, reg);
+    case 0x1bc: /* bsf / tzcnt */
+    case 0x1bd: /* bsr / lzcnt */
+        ot = dflag + OT_WORD;
+        modrm = cpu_ldub_code(env, s->pc++);
+        reg = ((modrm >> 3) & 7) | rex_r;
+        gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0);
+        gen_extu(ot, cpu_T[0]);
+
+        /* Note that lzcnt and tzcnt are in different extensions.  */
+        if ((prefixes & PREFIX_REPZ)
+            && (b & 1
+                ? s->cpuid_ext3_features & CPUID_EXT3_ABM
+                : s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)) {
+            int size = 8 << ot;
+            tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]);
+            if (b & 1) {
+                /* For lzcnt, reduce the target_ulong result by the
+                   number of zeros that we expect to find at the top.  */
+                gen_helper_clz(cpu_T[0], cpu_T[0]);
+                tcg_gen_subi_tl(cpu_T[0], cpu_T[0], TARGET_LONG_BITS - size);
             } else {
-                label1 = gen_new_label();
-                tcg_gen_movi_tl(cpu_cc_dst, 0);
-                tcg_gen_brcondi_tl(TCG_COND_EQ, t0, 0, label1);
-                if (b & 1) {
-                    gen_helper_bsr(cpu_T[0], t0);
-                } else {
-                    gen_helper_bsf(cpu_T[0], t0);
-                }
-                gen_op_mov_reg_T0(ot, reg);
-                tcg_gen_movi_tl(cpu_cc_dst, 1);
-                gen_set_label(label1);
-                tcg_gen_discard_tl(cpu_cc_src);
-                set_cc_op(s, CC_OP_LOGICB + ot);
+                /* For tzcnt, a zero input must return the operand size:
+                   force all bits outside the operand size to 1.  */
+                target_ulong mask = (target_ulong)-2 << (size - 1);
+                tcg_gen_ori_tl(cpu_T[0], cpu_T[0], mask);
+                gen_helper_ctz(cpu_T[0], cpu_T[0]);
             }
-            tcg_temp_free(t0);
+            /* For lzcnt/tzcnt, C and Z bits are defined and are
+               related to the result.  */
+            gen_op_update1_cc();
+            set_cc_op(s, CC_OP_BMILGB + ot);
+        } else {
+            /* For bsr/bsf, only the Z bit is defined and it is related
+               to the input and not the result.  */
+            tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]);
+            set_cc_op(s, CC_OP_LOGICB + ot);
+            if (b & 1) {
+                /* For bsr, return the bit index of the first 1 bit,
+                   not the count of leading zeros.  */
+                gen_helper_clz(cpu_T[0], cpu_T[0]);
+                tcg_gen_xori_tl(cpu_T[0], cpu_T[0], TARGET_LONG_BITS - 1);
+            } else {
+                gen_helper_ctz(cpu_T[0], cpu_T[0]);
+            }
+            /* ??? The manual says that the output is undefined when the
+               input is zero, but real hardware leaves it unchanged, and
+               real programs appear to depend on that.  */
+            tcg_gen_movi_tl(cpu_tmp0, 0);
+            tcg_gen_movcond_tl(TCG_COND_EQ, cpu_T[0], cpu_cc_dst, cpu_tmp0,
+                               cpu_regs[reg], cpu_T[0]);
         }
+        gen_op_mov_reg_T0(ot, reg);
         break;
         /************************/
         /* bcd */
@@ -7381,7 +7878,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
            } else {
                 gen_op_mov_reg_v(ot, rm, t0);
             }
-            gen_compute_eflags(s, cpu_cc_src);
+            gen_compute_eflags(s);
             tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, ~CC_Z);
             tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, t2);
             tcg_temp_free(t0);
@@ -7687,10 +8184,12 @@ void optimize_flags_init(void)
     cpu_env = tcg_global_reg_new_ptr(TCG_AREG0, "env");
     cpu_cc_op = tcg_global_mem_new_i32(TCG_AREG0,
                                        offsetof(CPUX86State, cc_op), "cc_op");
-    cpu_cc_src = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_src),
-                                    "cc_src");
     cpu_cc_dst = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_dst),
                                     "cc_dst");
+    cpu_cc_src = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_src),
+                                    "cc_src");
+    cpu_cc_src2 = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_src2),
+                                     "cc_src2");
 
 #ifdef TARGET_X86_64
     cpu_regs[R_EAX] = tcg_global_mem_new_i64(TCG_AREG0,
@@ -7792,11 +8291,11 @@ static inline void gen_intermediate_code_internal(CPUX86State *env,
     if (flags & HF_SOFTMMU_MASK) {
         dc->mem_index = (cpu_mmu_index(env) + 1) << 2;
     }
-    dc->cpuid_features = env->cpuid_features;
-    dc->cpuid_ext_features = env->cpuid_ext_features;
-    dc->cpuid_ext2_features = env->cpuid_ext2_features;
-    dc->cpuid_ext3_features = env->cpuid_ext3_features;
-    dc->cpuid_7_0_ebx_features = env->cpuid_7_0_ebx_features;
+    dc->cpuid_features = env->features[FEAT_1_EDX];
+    dc->cpuid_ext_features = env->features[FEAT_1_ECX];
+    dc->cpuid_ext2_features = env->features[FEAT_8000_0001_EDX];
+    dc->cpuid_ext3_features = env->features[FEAT_8000_0001_ECX];
+    dc->cpuid_7_0_ebx_features = env->features[FEAT_7_0_EBX];
 #ifdef TARGET_X86_64
     dc->lma = (flags >> HF_LMA_SHIFT) & 1;
     dc->code64 = (flags >> HF_CS64_SHIFT) & 1;
@@ -7817,16 +8316,15 @@ static inline void gen_intermediate_code_internal(CPUX86State *env,
     cpu_T[0] = tcg_temp_new();
     cpu_T[1] = tcg_temp_new();
     cpu_A0 = tcg_temp_new();
-    cpu_T3 = tcg_temp_new();
 
     cpu_tmp0 = tcg_temp_new();
     cpu_tmp1_i64 = tcg_temp_new_i64();
     cpu_tmp2_i32 = tcg_temp_new_i32();
     cpu_tmp3_i32 = tcg_temp_new_i32();
     cpu_tmp4 = tcg_temp_new();
-    cpu_tmp5 = tcg_temp_new();
     cpu_ptr0 = tcg_temp_new_ptr();
     cpu_ptr1 = tcg_temp_new_ptr();
+    cpu_cc_srcT = tcg_temp_local_new();
 
     gen_opc_end = tcg_ctx.gen_opc_buf + OPC_MAX_SIZE;
 
@@ -7838,7 +8336,7 @@ static inline void gen_intermediate_code_internal(CPUX86State *env,
     if (max_insns == 0)
         max_insns = CF_COUNT_MASK;
 
-    gen_icount_start();
+    gen_tb_start();
     for(;;) {
         if (unlikely(!QTAILQ_EMPTY(&env->breakpoints))) {
             QTAILQ_FOREACH(bp, &env->breakpoints, entry) {
@@ -7896,7 +8394,7 @@ static inline void gen_intermediate_code_internal(CPUX86State *env,
     }
     if (tb->cflags & CF_LAST_IO)
         gen_io_end();
-    gen_icount_end(tb, num_insns);
+    gen_tb_end(tb, num_insns);
     *tcg_ctx.gen_opc_ptr = INDEX_op_end;
     /* we don't forget to fill the last values */
     if (search_pc) {