include: Clean up includes

[mirror_qemu.git] / tcg / optimize.c
diff --git a/tcg/optimize.c b/tcg/optimize.c

index f723deaafede66baea9137728e74221ba1e812c0..f2d01654c59ed1866d912eeed98678af73beee7e 100644 (file)
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -25,7 +25,8 @@
  
  #include "qemu/osdep.h"
  #include "qemu/int128.h"
-#include "tcg/tcg-op.h"
+#include "qemu/interval-tree.h"
+#include "tcg/tcg-op-common.h"
  #include "tcg-internal.h"
  
  #define CASE_OP_32_64(x)                        \
@@ -37,12 +38,21 @@
          glue(glue(case INDEX_op_, x), _i64):    \
          glue(glue(case INDEX_op_, x), _vec)
  
+typedef struct MemCopyInfo {
+    IntervalTreeNode itree;
+    QSIMPLEQ_ENTRY (MemCopyInfo) next;
+    TCGTemp *ts;
+    TCGType type;
+} MemCopyInfo;
+
  typedef struct TempOptInfo {
      bool is_const;
      TCGTemp *prev_copy;
      TCGTemp *next_copy;
+    QSIMPLEQ_HEAD(, MemCopyInfo) mem_copy;
      uint64_t val;
      uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
+    uint64_t s_mask;  /* a left-aligned mask of clrsb(value) bits. */
  } TempOptInfo;
  
  typedef struct OptContext {
@@ -50,12 +60,55 @@ typedef struct OptContext {
      TCGOp *prev_mb;
      TCGTempSet temps_used;
  
+    IntervalTreeRoot mem_copy;
+    QSIMPLEQ_HEAD(, MemCopyInfo) mem_free;
+
      /* In flight values from optimization. */
      uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
      uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
+    uint64_t s_mask;  /* mask of clrsb(value) bits */
      TCGType type;
  } OptContext;
  
+/* Calculate the smask for a specific value. */
+static uint64_t smask_from_value(uint64_t value)
+{
+    int rep = clrsb64(value);
+    return ~(~0ull >> rep);
+}
+
+/*
+ * Calculate the smask for a given set of known-zeros.
+ * If there are lots of zeros on the left, we can consider the remainder
+ * an unsigned field, and thus the corresponding signed field is one bit
+ * larger.
+ */
+static uint64_t smask_from_zmask(uint64_t zmask)
+{
+    /*
+     * Only the 0 bits are significant for zmask, thus the msb itself
+     * must be zero, else we have no sign information.
+     */
+    int rep = clz64(zmask);
+    if (rep == 0) {
+        return 0;
+    }
+    rep -= 1;
+    return ~(~0ull >> rep);
+}
+
+/*
+ * Recreate a properly left-aligned smask after manipulation.
+ * Some bit-shuffling, particularly shifts and rotates, may
+ * retain sign bits on the left, but may scatter disconnected
+ * sign bits on the right.  Retain only what remains to the left.
+ */
+static uint64_t smask_from_smask(int64_t smask)
+{
+    /* Only the 1 bits are significant for smask */
+    return smask_from_zmask(~smask);
+}
+
  static inline TempOptInfo *ts_info(TCGTemp *ts)
  {
      return ts->state_ptr;
@@ -81,24 +134,9 @@ static inline bool ts_is_copy(TCGTemp *ts)
      return ts_info(ts)->next_copy != ts;
  }
  
-/* Reset TEMP's state, possibly removing the temp for the list of copies.  */
-static void reset_ts(TCGTemp *ts)
+static TCGTemp *cmp_better_copy(TCGTemp *a, TCGTemp *b)
  {
-    TempOptInfo *ti = ts_info(ts);
-    TempOptInfo *pi = ts_info(ti->prev_copy);
-    TempOptInfo *ni = ts_info(ti->next_copy);
-
-    ni->prev_copy = ti->prev_copy;
-    pi->next_copy = ti->next_copy;
-    ti->next_copy = ts;
-    ti->prev_copy = ts;
-    ti->is_const = false;
-    ti->z_mask = -1;
-}
-
-static void reset_temp(TCGArg arg)
-{
-    reset_ts(arg_temp(arg));
+    return a->kind < b->kind ? b : a;
  }
  
  /* Initialize and activate a temporary.  */
@@ -120,44 +158,146 @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
  
      ti->next_copy = ts;
      ti->prev_copy = ts;
+    QSIMPLEQ_INIT(&ti->mem_copy);
      if (ts->kind == TEMP_CONST) {
          ti->is_const = true;
          ti->val = ts->val;
          ti->z_mask = ts->val;
-        if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
-            /* High bits of a 32-bit quantity are garbage.  */
-            ti->z_mask |= ~0xffffffffull;
-        }
+        ti->s_mask = smask_from_value(ts->val);
      } else {
          ti->is_const = false;
          ti->z_mask = -1;
+        ti->s_mask = 0;
      }
  }
  
-static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
+static MemCopyInfo *mem_copy_first(OptContext *ctx, intptr_t s, intptr_t l)
+{
+    IntervalTreeNode *r = interval_tree_iter_first(&ctx->mem_copy, s, l);
+    return r ? container_of(r, MemCopyInfo, itree) : NULL;
+}
+
+static MemCopyInfo *mem_copy_next(MemCopyInfo *mem, intptr_t s, intptr_t l)
+{
+    IntervalTreeNode *r = interval_tree_iter_next(&mem->itree, s, l);
+    return r ? container_of(r, MemCopyInfo, itree) : NULL;
+}
+
+static void remove_mem_copy(OptContext *ctx, MemCopyInfo *mc)
  {
-    TCGTemp *i, *g, *l;
+    TCGTemp *ts = mc->ts;
+    TempOptInfo *ti = ts_info(ts);
+
+    interval_tree_remove(&mc->itree, &ctx->mem_copy);
+    QSIMPLEQ_REMOVE(&ti->mem_copy, mc, MemCopyInfo, next);
+    QSIMPLEQ_INSERT_TAIL(&ctx->mem_free, mc, next);
+}
+
+static void remove_mem_copy_in(OptContext *ctx, intptr_t s, intptr_t l)
+{
+    while (true) {
+        MemCopyInfo *mc = mem_copy_first(ctx, s, l);
+        if (!mc) {
+            break;
+        }
+        remove_mem_copy(ctx, mc);
+    }
+}
+
+static void remove_mem_copy_all(OptContext *ctx)
+{
+    remove_mem_copy_in(ctx, 0, -1);
+    tcg_debug_assert(interval_tree_is_empty(&ctx->mem_copy));
+}
+
+static TCGTemp *find_better_copy(TCGTemp *ts)
+{
+    TCGTemp *i, *ret;
  
      /* If this is already readonly, we can't do better. */
      if (temp_readonly(ts)) {
          return ts;
      }
  
-    g = l = NULL;
+    ret = ts;
      for (i = ts_info(ts)->next_copy; i != ts; i = ts_info(i)->next_copy) {
-        if (temp_readonly(i)) {
-            return i;
-        } else if (i->kind > ts->kind) {
-            if (i->kind == TEMP_GLOBAL) {
-                g = i;
-            } else if (i->kind == TEMP_LOCAL) {
-                l = i;
+        ret = cmp_better_copy(ret, i);
+    }
+    return ret;
+}
+
+static void move_mem_copies(TCGTemp *dst_ts, TCGTemp *src_ts)
+{
+    TempOptInfo *si = ts_info(src_ts);
+    TempOptInfo *di = ts_info(dst_ts);
+    MemCopyInfo *mc;
+
+    QSIMPLEQ_FOREACH(mc, &si->mem_copy, next) {
+        tcg_debug_assert(mc->ts == src_ts);
+        mc->ts = dst_ts;
+    }
+    QSIMPLEQ_CONCAT(&di->mem_copy, &si->mem_copy);
+}
+
+/* Reset TEMP's state, possibly removing the temp from the list of copies.  */
+static void reset_ts(OptContext *ctx, TCGTemp *ts)
+{
+    TempOptInfo *ti = ts_info(ts);
+    TCGTemp *pts = ti->prev_copy;
+    TCGTemp *nts = ti->next_copy;
+    TempOptInfo *pi = ts_info(pts);
+    TempOptInfo *ni = ts_info(nts);
+
+    ni->prev_copy = ti->prev_copy;
+    pi->next_copy = ti->next_copy;
+    ti->next_copy = ts;
+    ti->prev_copy = ts;
+    ti->is_const = false;
+    ti->z_mask = -1;
+    ti->s_mask = 0;
+
+    if (!QSIMPLEQ_EMPTY(&ti->mem_copy)) {
+        if (ts == nts) {
+            /* The last temp copy is being removed, so its mem copies die. */
+            MemCopyInfo *mc;
+            QSIMPLEQ_FOREACH(mc, &ti->mem_copy, next) {
+                interval_tree_remove(&mc->itree, &ctx->mem_copy);
              }
+            QSIMPLEQ_CONCAT(&ctx->mem_free, &ti->mem_copy);
+        } else {
+            move_mem_copies(find_better_copy(nts), ts);
          }
      }
+}
+
+static void reset_temp(OptContext *ctx, TCGArg arg)
+{
+    reset_ts(ctx, arg_temp(arg));
+}
+
+static void record_mem_copy(OptContext *ctx, TCGType type,
+                            TCGTemp *ts, intptr_t start, intptr_t last)
+{
+    MemCopyInfo *mc;
+    TempOptInfo *ti;
+
+    mc = QSIMPLEQ_FIRST(&ctx->mem_free);
+    if (mc) {
+        QSIMPLEQ_REMOVE_HEAD(&ctx->mem_free, next);
+    } else {
+        mc = tcg_malloc(sizeof(*mc));
+    }
+
+    memset(mc, 0, sizeof(*mc));
+    mc->itree.start = start;
+    mc->itree.last = last;
+    mc->type = type;
+    interval_tree_insert(&mc->itree, &ctx->mem_copy);
  
-    /* If we didn't find a better representation, return the same temp. */
-    return g ? g : l ? l : ts;
+    ts = find_better_copy(ts);
+    ti = ts_info(ts);
+    mc->ts = ts;
+    QSIMPLEQ_INSERT_TAIL(&ti->mem_copy, mc, next);
  }
  
  static bool ts_are_copies(TCGTemp *ts1, TCGTemp *ts2)
@@ -186,13 +326,39 @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
      return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
  }
  
+static TCGTemp *find_mem_copy_for(OptContext *ctx, TCGType type, intptr_t s)
+{
+    MemCopyInfo *mc;
+
+    for (mc = mem_copy_first(ctx, s, s); mc; mc = mem_copy_next(mc, s, s)) {
+        if (mc->itree.start == s && mc->type == type) {
+            return find_better_copy(mc->ts);
+        }
+    }
+    return NULL;
+}
+
+static TCGArg arg_new_constant(OptContext *ctx, uint64_t val)
+{
+    TCGType type = ctx->type;
+    TCGTemp *ts;
+
+    if (type == TCG_TYPE_I32) {
+        val = (int32_t)val;
+    }
+
+    ts = tcg_constant_internal(type, val);
+    init_ts_info(ctx, ts);
+
+    return temp_arg(ts);
+}
+
  static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
  {
      TCGTemp *dst_ts = arg_temp(dst);
      TCGTemp *src_ts = arg_temp(src);
      TempOptInfo *di;
      TempOptInfo *si;
-    uint64_t z_mask;
      TCGOpcode new_op;
  
      if (ts_are_copies(dst_ts, src_ts)) {
@@ -200,7 +366,7 @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
          return true;
      }
  
-    reset_ts(dst_ts);
+    reset_ts(ctx, dst_ts);
      di = ts_info(dst_ts);
      si = ts_info(src_ts);
  
@@ -224,12 +390,8 @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
      op->args[0] = dst;
      op->args[1] = src;
  
-    z_mask = si->z_mask;
-    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
-        /* High bits of the destination are now garbage.  */
-        z_mask |= ~0xffffffffull;
-    }
-    di->z_mask = z_mask;
+    di->z_mask = si->z_mask;
+    di->s_mask = si->s_mask;
  
      if (src_ts->type == dst_ts->type) {
          TempOptInfo *ni = ts_info(si->next_copy);
@@ -240,6 +402,11 @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
          si->next_copy = dst_ts;
          di->is_const = si->is_const;
          di->val = si->val;
+
+        if (!QSIMPLEQ_EMPTY(&si->mem_copy)
+            && cmp_better_copy(src_ts, dst_ts) == dst_ts) {
+            move_mem_copies(dst_ts, src_ts);
+        }
      }
      return true;
  }
@@ -248,10 +415,7 @@ static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                               TCGArg dst, uint64_t val)
  {
      /* Convert movi to mov with constant temp. */
-    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
-
-    init_ts_info(ctx, tv);
-    return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
+    return tcg_opt_gen_mov(ctx, op, dst, arg_new_constant(ctx, val));
  }
  
  static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
@@ -268,13 +432,13 @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
      CASE_OP_32_64(mul):
          return x * y;
  
-    CASE_OP_32_64(and):
+    CASE_OP_32_64_VEC(and):
          return x & y;
  
-    CASE_OP_32_64(or):
+    CASE_OP_32_64_VEC(or):
          return x | y;
  
-    CASE_OP_32_64(xor):
+    CASE_OP_32_64_VEC(xor):
          return x ^ y;
  
      case INDEX_op_shl_i32:
@@ -307,25 +471,25 @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
      case INDEX_op_rotl_i64:
          return rol64(x, y & 63);
  
-    CASE_OP_32_64(not):
+    CASE_OP_32_64_VEC(not):
          return ~x;
  
      CASE_OP_32_64(neg):
          return -x;
  
-    CASE_OP_32_64(andc):
+    CASE_OP_32_64_VEC(andc):
          return x & ~y;
  
-    CASE_OP_32_64(orc):
+    CASE_OP_32_64_VEC(orc):
          return x | ~y;
  
-    CASE_OP_32_64(eqv):
+    CASE_OP_32_64_VEC(eqv):
          return ~(x ^ y);
  
-    CASE_OP_32_64(nand):
+    CASE_OP_32_64_VEC(nand):
          return ~(x & y);
  
-    CASE_OP_32_64(nor):
+    CASE_OP_32_64_VEC(nor):
          return ~(x | y);
  
      case INDEX_op_clz_i32:
@@ -413,9 +577,7 @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
          return (uint64_t)x % ((uint64_t)y ? : 1);
  
      default:
-        fprintf(stderr,
-                "Unrecognized operation %d in do_constant_folding.\n", op);
-        tcg_abort();
+        g_assert_not_reached();
      }
  }
  
@@ -453,7 +615,7 @@ static bool do_constant_folding_cond_32(uint32_t x, uint32_t y, TCGCond c)
      case TCG_COND_GTU:
          return x > y;
      default:
-        tcg_abort();
+        g_assert_not_reached();
      }
  }
  
@@ -481,7 +643,7 @@ static bool do_constant_folding_cond_64(uint64_t x, uint64_t y, TCGCond c)
      case TCG_COND_GTU:
          return x > y;
      default:
-        tcg_abort();
+        g_assert_not_reached();
      }
  }
  
@@ -501,7 +663,7 @@ static bool do_constant_folding_cond_eq(TCGCond c)
      case TCG_COND_EQ:
          return 1;
      default:
-        tcg_abort();
+        g_assert_not_reached();
      }
  }
  
@@ -512,10 +674,10 @@ static bool do_constant_folding_cond_eq(TCGCond c)
  static int do_constant_folding_cond(TCGType type, TCGArg x,
                                      TCGArg y, TCGCond c)
  {
-    uint64_t xv = arg_info(x)->val;
-    uint64_t yv = arg_info(y)->val;
-
      if (arg_is_const(x) && arg_is_const(y)) {
+        uint64_t xv = arg_info(x)->val;
+        uint64_t yv = arg_info(y)->val;
+
          switch (type) {
          case TCG_TYPE_I32:
              return do_constant_folding_cond_32(xv, yv, c);
@@ -527,7 +689,7 @@ static int do_constant_folding_cond(TCGType type, TCGArg x,
          }
      } else if (args_are_copies(x, y)) {
          return do_constant_folding_cond_eq(c);
-    } else if (arg_is_const(y) && yv == 0) {
+    } else if (arg_is_const(y) && arg_info(y)->val == 0) {
          switch (c) {
          case TCG_COND_LTU:
              return 0;
@@ -577,6 +739,19 @@ static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
      return -1;
  }
  
+/**
+ * swap_commutative:
+ * @dest: TCGArg of the destination argument, or NO_DEST.
+ * @p1: first paired argument
+ * @p2: second paired argument
+ *
+ * If *@p1 is a constant and *@p2 is not, swap.
+ * If *@p2 matches @dest, swap.
+ * Return true if a swap was performed.
+ */
+
+#define NO_DEST  temp_arg(NULL)
+
  static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
  {
      TCGArg a1 = *p1, a2 = *p2;
@@ -614,21 +789,17 @@ static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
  {
      for (int i = 0; i < nb_args; i++) {
          TCGTemp *ts = arg_temp(op->args[i]);
-        if (ts) {
-            init_ts_info(ctx, ts);
-        }
+        init_ts_info(ctx, ts);
      }
  }
  
  static void copy_propagate(OptContext *ctx, TCGOp *op,
                             int nb_oargs, int nb_iargs)
  {
-    TCGContext *s = ctx->tcg;
-
      for (int i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
          TCGTemp *ts = arg_temp(op->args[i]);
-        if (ts && ts_is_copy(ts)) {
-            op->args[i] = temp_arg(find_better_copy(s, ts));
+        if (ts_is_copy(ts)) {
+            op->args[i] = temp_arg(find_better_copy(ts));
          }
      }
  }
@@ -639,24 +810,29 @@ static void finish_folding(OptContext *ctx, TCGOp *op)
      int i, nb_oargs;
  
      /*
-     * For an opcode that ends a BB, reset all temp data.
-     * We do no cross-BB optimization.
+     * We only optimize extended basic blocks.  If the opcode ends a BB
+     * and is not a conditional branch, reset all temp data.
       */
      if (def->flags & TCG_OPF_BB_END) {
-        memset(&ctx->temps_used, 0, sizeof(ctx->temps_used));
          ctx->prev_mb = NULL;
+        if (!(def->flags & TCG_OPF_COND_BRANCH)) {
+            memset(&ctx->temps_used, 0, sizeof(ctx->temps_used));
+            remove_mem_copy_all(ctx);
+        }
          return;
      }
  
      nb_oargs = def->nb_oargs;
      for (i = 0; i < nb_oargs; i++) {
-        reset_temp(op->args[i]);
+        TCGTemp *ts = arg_temp(op->args[i]);
+        reset_ts(ctx, ts);
          /*
-         * Save the corresponding known-zero bits mask for the
+         * Save the corresponding known-zero/sign bits mask for the
           * first output argument (only one supported so far).
           */
          if (i == 0) {
-            arg_info(op->args[i])->z_mask = ctx->z_mask;
+            ts_info(ts)->z_mask = ctx->z_mask;
+            ts_info(ts)->s_mask = ctx->s_mask;
          }
      }
  }
@@ -696,20 +872,37 @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
      return false;
  }
  
+static bool fold_commutative(OptContext *ctx, TCGOp *op)
+{
+    swap_commutative(op->args[0], &op->args[1], &op->args[2]);
+    return false;
+}
+
+static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
+{
+    swap_commutative(op->args[0], &op->args[1], &op->args[2]);
+    return fold_const2(ctx, op);
+}
+
  static bool fold_masks(OptContext *ctx, TCGOp *op)
  {
      uint64_t a_mask = ctx->a_mask;
      uint64_t z_mask = ctx->z_mask;
+    uint64_t s_mask = ctx->s_mask;
  
      /*
-     * 32-bit ops generate 32-bit results.  For the result is zero test
-     * below, we can ignore high bits, but for further optimizations we
-     * need to record that the high bits contain garbage.
+     * 32-bit ops generate 32-bit results, which for the purpose of
+     * simplifying tcg are sign-extended.  Certainly that's how we
+     * represent our constants elsewhere.  Note that the bits will
+     * be reset properly for a 64-bit value when encountering the
+     * type changing opcodes.
       */
      if (ctx->type == TCG_TYPE_I32) {
-        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
-        a_mask &= MAKE_64BIT_MASK(0, 32);
-        z_mask &= MAKE_64BIT_MASK(0, 32);
+        a_mask = (int32_t)a_mask;
+        z_mask = (int32_t)z_mask;
+        s_mask |= MAKE_64BIT_MASK(32, 32);
+        ctx->z_mask = z_mask;
+        ctx->s_mask = s_mask;
      }
  
      if (z_mask == 0) {
@@ -832,7 +1025,17 @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
  
  static bool fold_add(OptContext *ctx, TCGOp *op)
  {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
+        fold_xi_to_x(ctx, op, 0)) {
+        return true;
+    }
+    return false;
+}
+
+/* We cannot yet use do_constant_folding with vectors. */
+static bool fold_add_vec(OptContext *ctx, TCGOp *op)
+{
+    if (fold_commutative(ctx, op) ||
          fold_xi_to_x(ctx, op, 0)) {
          return true;
      }
@@ -841,8 +1044,10 @@ static bool fold_add(OptContext *ctx, TCGOp *op)
  
  static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
  {
-    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
-        arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
+    bool a_const = arg_is_const(op->args[2]) && arg_is_const(op->args[3]);
+    bool b_const = arg_is_const(op->args[4]) && arg_is_const(op->args[5]);
+
+    if (a_const && b_const) {
          uint64_t al = arg_info(op->args[2])->val;
          uint64_t ah = arg_info(op->args[3])->val;
          uint64_t bl = arg_info(op->args[4])->val;
@@ -880,17 +1085,36 @@ static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
          rh = op->args[1];
  
          /* The proper opcode is supplied by tcg_opt_gen_mov. */
-        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
+        op2 = tcg_op_insert_before(ctx->tcg, op, 0, 2);
  
          tcg_opt_gen_movi(ctx, op, rl, al);
          tcg_opt_gen_movi(ctx, op2, rh, ah);
          return true;
      }
+
+    /* Fold sub2 r,x,i to add2 r,x,-i */
+    if (!add && b_const) {
+        uint64_t bl = arg_info(op->args[4])->val;
+        uint64_t bh = arg_info(op->args[5])->val;
+
+        /* Negate the two parts without assembling and disassembling. */
+        bl = -bl;
+        bh = ~bh + !bl;
+
+        op->opc = (ctx->type == TCG_TYPE_I32
+                   ? INDEX_op_add2_i32 : INDEX_op_add2_i64);
+        op->args[4] = arg_new_constant(ctx, bl);
+        op->args[5] = arg_new_constant(ctx, bh);
+    }
      return false;
  }
  
  static bool fold_add2(OptContext *ctx, TCGOp *op)
  {
+    /* Note that the high and low parts may be independently swapped. */
+    swap_commutative(op->args[0], &op->args[2], &op->args[4]);
+    swap_commutative(op->args[1], &op->args[3], &op->args[5]);
+
      return fold_addsub2(ctx, op, true);
  }
  
@@ -898,7 +1122,7 @@ static bool fold_and(OptContext *ctx, TCGOp *op)
  {
      uint64_t z1, z2;
  
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, -1) ||
          fold_xx_to_x(ctx, op)) {
@@ -909,6 +1133,13 @@ static bool fold_and(OptContext *ctx, TCGOp *op)
      z2 = arg_info(op->args[2])->z_mask;
      ctx->z_mask = z1 & z2;
  
+    /*
+     * Sign repetitions are perforce all identical, whether they are 1 or 0.
+     * Bitwise operations preserve the relative quantity of the repetitions.
+     */
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
+
      /*
       * Known-zeros does not imply known-ones.  Therefore unless
       * arg2 is constant, we can't infer affected bits from it.
@@ -944,14 +1175,21 @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
      }
      ctx->z_mask = z1;
  
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
      return fold_masks(ctx, op);
  }
  
  static bool fold_brcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[2];
-    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
+    int i;
+
+    if (swap_commutative(NO_DEST, &op->args[0], &op->args[1])) {
+        op->args[2] = cond = tcg_swap_cond(cond);
+    }
  
+    i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
      if (i == 0) {
          tcg_op_remove(ctx->tcg, op);
          return true;
@@ -966,10 +1204,14 @@ static bool fold_brcond(OptContext *ctx, TCGOp *op)
  static bool fold_brcond2(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[4];
-    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
      TCGArg label = op->args[5];
-    int inv = 0;
+    int i, inv = 0;
  
+    if (swap_commutative2(&op->args[0], &op->args[2])) {
+        op->args[4] = cond = tcg_swap_cond(cond);
+    }
+
+    i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
      if (i >= 0) {
          goto do_brcond_const;
      }
@@ -1043,7 +1285,7 @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
  
  static bool fold_bswap(OptContext *ctx, TCGOp *op)
  {
-    uint64_t z_mask, sign;
+    uint64_t z_mask, s_mask, sign;
  
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
@@ -1053,6 +1295,7 @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
      }
  
      z_mask = arg_info(op->args[1])->z_mask;
+
      switch (op->opc) {
      case INDEX_op_bswap16_i32:
      case INDEX_op_bswap16_i64:
@@ -1071,6 +1314,7 @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
      default:
          g_assert_not_reached();
      }
+    s_mask = smask_from_zmask(z_mask);
  
      switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
      case TCG_BSWAP_OZ:
@@ -1079,14 +1323,17 @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
          /* If the sign bit may be 1, force all the bits above to 1. */
          if (z_mask & sign) {
              z_mask |= sign;
+            s_mask = sign << 1;
          }
          break;
      default:
          /* The high bits are undefined: force all bits above the sign to 1. */
          z_mask |= sign << 1;
+        s_mask = 0;
          break;
      }
      ctx->z_mask = z_mask;
+    ctx->s_mask = s_mask;
  
      return fold_masks(ctx, op);
  }
@@ -1108,14 +1355,19 @@ static bool fold_call(OptContext *ctx, TCGOp *op)
  
          for (i = 0; i < nb_globals; i++) {
              if (test_bit(i, ctx->temps_used.l)) {
-                reset_ts(&ctx->tcg->temps[i]);
+                reset_ts(ctx, &ctx->tcg->temps[i]);
              }
          }
      }
  
+    /* If the function has side effects, reset mem data. */
+    if (!(flags & TCG_CALL_NO_SIDE_EFFECTS)) {
+        remove_mem_copy_all(ctx);
+    }
+
      /* Reset temp data for outputs. */
      for (i = 0; i < nb_oargs; i++) {
-        reset_temp(op->args[i]);
+        reset_temp(ctx, op->args[i]);
      }
  
      /* Stop optimizing MB across calls. */
@@ -1148,7 +1400,7 @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
          g_assert_not_reached();
      }
      ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
-
+    ctx->s_mask = smask_from_zmask(ctx->z_mask);
      return false;
  }
  
@@ -1168,11 +1420,14 @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
      default:
          g_assert_not_reached();
      }
+    ctx->s_mask = smask_from_zmask(ctx->z_mask);
      return false;
  }
  
  static bool fold_deposit(OptContext *ctx, TCGOp *op)
  {
+    TCGOpcode and_opc;
+
      if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
          uint64_t t1 = arg_info(op->args[1])->val;
          uint64_t t2 = arg_info(op->args[2])->val;
@@ -1181,6 +1436,41 @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
          return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
      }
  
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
+        and_opc = INDEX_op_and_i32;
+        break;
+    case TCG_TYPE_I64:
+        and_opc = INDEX_op_and_i64;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    /* Inserting a value into zero at offset 0. */
+    if (arg_is_const(op->args[1])
+        && arg_info(op->args[1])->val == 0
+        && op->args[3] == 0) {
+        uint64_t mask = MAKE_64BIT_MASK(0, op->args[4]);
+
+        op->opc = and_opc;
+        op->args[1] = op->args[2];
+        op->args[2] = arg_new_constant(ctx, mask);
+        ctx->z_mask = mask & arg_info(op->args[1])->z_mask;
+        return false;
+    }
+
+    /* Inserting zero into a value. */
+    if (arg_is_const(op->args[2])
+        && arg_info(op->args[2])->val == 0) {
+        uint64_t mask = deposit64(-1, op->args[3], op->args[4], 0);
+
+        op->opc = and_opc;
+        op->args[2] = arg_new_constant(ctx, mask);
+        ctx->z_mask = mask & arg_info(op->args[1])->z_mask;
+        return false;
+    }
+
      ctx->z_mask = deposit64(arg_info(op->args[1])->z_mask,
                              op->args[3], op->args[4],
                              arg_info(op->args[2])->z_mask);
@@ -1189,7 +1479,11 @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
  
  static bool fold_divide(OptContext *ctx, TCGOp *op)
  {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, 1)) {
+        return true;
+    }
+    return false;
  }
  
  static bool fold_dup(OptContext *ctx, TCGOp *op)
@@ -1219,32 +1513,38 @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
  
  static bool fold_eqv(OptContext *ctx, TCGOp *op)
  {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_x(ctx, op, -1) ||
          fold_xi_to_not(ctx, op, 0)) {
          return true;
      }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
      return false;
  }
  
  static bool fold_extract(OptContext *ctx, TCGOp *op)
  {
      uint64_t z_mask_old, z_mask;
+    int pos = op->args[2];
+    int len = op->args[3];
  
      if (arg_is_const(op->args[1])) {
          uint64_t t;
  
          t = arg_info(op->args[1])->val;
-        t = extract64(t, op->args[2], op->args[3]);
+        t = extract64(t, pos, len);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
  
      z_mask_old = arg_info(op->args[1])->z_mask;
-    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
-    if (op->args[2] == 0) {
+    z_mask = extract64(z_mask_old, pos, len);
+    if (pos == 0) {
          ctx->a_mask = z_mask_old ^ z_mask;
      }
      ctx->z_mask = z_mask;
+    ctx->s_mask = smask_from_zmask(z_mask);
  
      return fold_masks(ctx, op);
  }
@@ -1261,7 +1561,7 @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
              v2 <<= 64 - shr;
          } else {
              v1 = (uint32_t)v1 >> shr;
-            v2 = (int32_t)v2 << (32 - shr);
+            v2 = (uint64_t)((int32_t)v2 << (32 - shr));
          }
          return tcg_opt_gen_movi(ctx, op, op->args[0], v1 | v2);
      }
@@ -1270,14 +1570,16 @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
  
  static bool fold_exts(OptContext *ctx, TCGOp *op)
  {
-    uint64_t z_mask_old, z_mask, sign;
+    uint64_t s_mask_old, s_mask, z_mask, sign;
      bool type_change = false;
  
      if (fold_const1(ctx, op)) {
          return true;
      }
  
-    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
+    z_mask = arg_info(op->args[1])->z_mask;
+    s_mask = arg_info(op->args[1])->s_mask;
+    s_mask_old = s_mask;
  
      switch (op->opc) {
      CASE_OP_32_64(ext8s):
@@ -1301,10 +1603,14 @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
  
      if (z_mask & sign) {
          z_mask |= sign;
-    } else if (!type_change) {
-        ctx->a_mask = z_mask_old ^ z_mask;
      }
+    s_mask |= sign << 1;
+
      ctx->z_mask = z_mask;
+    ctx->s_mask = s_mask;
+    if (!type_change) {
+        ctx->a_mask = s_mask & ~s_mask_old;
+    }
  
      return fold_masks(ctx, op);
  }
@@ -1343,6 +1649,7 @@ static bool fold_extu(OptContext *ctx, TCGOp *op)
      }
  
      ctx->z_mask = z_mask;
+    ctx->s_mask = smask_from_zmask(z_mask);
      if (!type_change) {
          ctx->a_mask = z_mask_old ^ z_mask;
      }
@@ -1381,26 +1688,48 @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
  static bool fold_movcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[5];
-    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
+    int i;
  
+    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
+        op->args[5] = cond = tcg_swap_cond(cond);
+    }
+    /*
+     * Canonicalize the "false" input reg to match the destination reg so
+     * that the tcg backend can implement a "move if true" operation.
+     */
+    if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
+        op->args[5] = cond = tcg_invert_cond(cond);
+    }
+
+    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
      }
  
      ctx->z_mask = arg_info(op->args[3])->z_mask
                  | arg_info(op->args[4])->z_mask;
+    ctx->s_mask = arg_info(op->args[3])->s_mask
+                & arg_info(op->args[4])->s_mask;
  
      if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
          uint64_t tv = arg_info(op->args[3])->val;
          uint64_t fv = arg_info(op->args[4])->val;
-        TCGOpcode opc;
+        TCGOpcode opc, negopc = 0;
  
          switch (ctx->type) {
          case TCG_TYPE_I32:
              opc = INDEX_op_setcond_i32;
+            if (TCG_TARGET_HAS_negsetcond_i32) {
+                negopc = INDEX_op_negsetcond_i32;
+            }
+            tv = (int32_t)tv;
+            fv = (int32_t)fv;
              break;
          case TCG_TYPE_I64:
              opc = INDEX_op_setcond_i64;
+            if (TCG_TARGET_HAS_negsetcond_i64) {
+                negopc = INDEX_op_negsetcond_i64;
+            }
              break;
          default:
              g_assert_not_reached();
@@ -1412,6 +1741,14 @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
          } else if (fv == 1 && tv == 0) {
              op->opc = opc;
              op->args[3] = tcg_invert_cond(cond);
+        } else if (negopc) {
+            if (tv == -1 && fv == 0) {
+                op->opc = negopc;
+                op->args[3] = cond;
+            } else if (fv == -1 && tv == 0) {
+                op->opc = negopc;
+                op->args[3] = tcg_invert_cond(cond);
+            }
          }
      }
      return false;
@@ -1420,7 +1757,8 @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
  static bool fold_mul(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
-        fold_xi_to_i(ctx, op, 0)) {
+        fold_xi_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, 1)) {
          return true;
      }
      return false;
@@ -1428,7 +1766,7 @@ static bool fold_mul(OptContext *ctx, TCGOp *op)
  
  static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
  {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_i(ctx, op, 0)) {
          return true;
      }
@@ -1437,6 +1775,8 @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
  
  static bool fold_multiply2(OptContext *ctx, TCGOp *op)
  {
+    swap_commutative(op->args[0], &op->args[2], &op->args[3]);
+
      if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
          uint64_t a = arg_info(op->args[2])->val;
          uint64_t b = arg_info(op->args[3])->val;
@@ -1469,7 +1809,7 @@ static bool fold_multiply2(OptContext *ctx, TCGOp *op)
          rh = op->args[1];
  
          /* The proper opcode is supplied by tcg_opt_gen_mov. */
-        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
+        op2 = tcg_op_insert_before(ctx->tcg, op, 0, 2);
  
          tcg_opt_gen_movi(ctx, op, rl, l);
          tcg_opt_gen_movi(ctx, op2, rh, h);
@@ -1480,10 +1820,13 @@ static bool fold_multiply2(OptContext *ctx, TCGOp *op)
  
  static bool fold_nand(OptContext *ctx, TCGOp *op)
  {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
      return false;
  }
  
@@ -1509,10 +1852,13 @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
  
  static bool fold_nor(OptContext *ctx, TCGOp *op)
  {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_not(ctx, op, 0)) {
          return true;
      }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
      return false;
  }
  
@@ -1522,6 +1868,8 @@ static bool fold_not(OptContext *ctx, TCGOp *op)
          return true;
      }
  
+    ctx->s_mask = arg_info(op->args[1])->s_mask;
+
      /* Because of fold_to_not, we want to always return true, via finish. */
      finish_folding(ctx, op);
      return true;
@@ -1529,7 +1877,7 @@ static bool fold_not(OptContext *ctx, TCGOp *op)
  
  static bool fold_or(OptContext *ctx, TCGOp *op)
  {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_x(ctx, op, 0) ||
          fold_xx_to_x(ctx, op)) {
          return true;
@@ -1537,16 +1885,22 @@ static bool fold_or(OptContext *ctx, TCGOp *op)
  
      ctx->z_mask = arg_info(op->args[1])->z_mask
                  | arg_info(op->args[2])->z_mask;
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
      return fold_masks(ctx, op);
  }
  
  static bool fold_orc(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
+        fold_xx_to_i(ctx, op, -1) ||
          fold_xi_to_x(ctx, op, -1) ||
          fold_ix_to_not(ctx, op, 0)) {
          return true;
      }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
      return false;
  }
  
@@ -1557,8 +1911,12 @@ static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
      MemOp mop = get_memop(oi);
      int width = 8 * memop_size(mop);
  
-    if (!(mop & MO_SIGN) && width < 64) {
-        ctx->z_mask = MAKE_64BIT_MASK(0, width);
+    if (width < 64) {
+        ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
+        if (!(mop & MO_SIGN)) {
+            ctx->z_mask = MAKE_64BIT_MASK(0, width);
+            ctx->s_mask <<= 1;
+        }
      }
  
      /* Opcodes that touch guest memory stop the mb optimization.  */
@@ -1575,28 +1933,62 @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
  
  static bool fold_remainder(OptContext *ctx, TCGOp *op)
  {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
  }
  
  static bool fold_setcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[3];
-    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
+    int i;
+
+    if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
+        op->args[3] = cond = tcg_swap_cond(cond);
+    }
  
+    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_movi(ctx, op, op->args[0], i);
      }
  
      ctx->z_mask = 1;
+    ctx->s_mask = smask_from_zmask(1);
      return false;
  }
  
+static bool fold_negsetcond(OptContext *ctx, TCGOp *op)
+{
+    TCGCond cond = op->args[3];
+    int i;
+
+    if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
+        op->args[3] = cond = tcg_swap_cond(cond);
+    }
+
+    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
+    if (i >= 0) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0], -i);
+    }
+
+    /* Value is {0,-1} so all bits are repetitions of the sign. */
+    ctx->s_mask = -1;
+    return false;
+}
+
+
  static bool fold_setcond2(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[5];
-    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
-    int inv = 0;
+    int i, inv = 0;
+
+    if (swap_commutative2(&op->args[1], &op->args[3])) {
+        op->args[5] = cond = tcg_swap_cond(cond);
+    }
  
+    i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
      if (i >= 0) {
          goto do_setcond_const;
      }
@@ -1656,6 +2048,7 @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
      }
  
      ctx->z_mask = 1;
+    ctx->s_mask = smask_from_zmask(1);
      return false;
  
   do_setcond_const:
@@ -1664,40 +2057,80 @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
  
  static bool fold_sextract(OptContext *ctx, TCGOp *op)
  {
-    int64_t z_mask_old, z_mask;
+    uint64_t z_mask, s_mask, s_mask_old;
+    int pos = op->args[2];
+    int len = op->args[3];
  
      if (arg_is_const(op->args[1])) {
          uint64_t t;
  
          t = arg_info(op->args[1])->val;
-        t = sextract64(t, op->args[2], op->args[3]);
+        t = sextract64(t, pos, len);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
  
-    z_mask_old = arg_info(op->args[1])->z_mask;
-    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
-    if (op->args[2] == 0 && z_mask >= 0) {
-        ctx->a_mask = z_mask_old ^ z_mask;
-    }
+    z_mask = arg_info(op->args[1])->z_mask;
+    z_mask = sextract64(z_mask, pos, len);
      ctx->z_mask = z_mask;
  
+    s_mask_old = arg_info(op->args[1])->s_mask;
+    s_mask = sextract64(s_mask_old, pos, len);
+    s_mask |= MAKE_64BIT_MASK(len, 64 - len);
+    ctx->s_mask = s_mask;
+
+    if (pos == 0) {
+        ctx->a_mask = s_mask & ~s_mask_old;
+    }
+
      return fold_masks(ctx, op);
  }
  
  static bool fold_shift(OptContext *ctx, TCGOp *op)
  {
+    uint64_t s_mask, z_mask, sign;
+
      if (fold_const2(ctx, op) ||
          fold_ix_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0)) {
          return true;
      }
  
+    s_mask = arg_info(op->args[1])->s_mask;
+    z_mask = arg_info(op->args[1])->z_mask;
+
      if (arg_is_const(op->args[2])) {
-        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
-                                          arg_info(op->args[1])->z_mask,
-                                          arg_info(op->args[2])->val);
+        int sh = arg_info(op->args[2])->val;
+
+        ctx->z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh);
+
+        s_mask = do_constant_folding(op->opc, ctx->type, s_mask, sh);
+        ctx->s_mask = smask_from_smask(s_mask);
+
          return fold_masks(ctx, op);
      }
+
+    switch (op->opc) {
+    CASE_OP_32_64(sar):
+        /*
+         * Arithmetic right shift will not reduce the number of
+         * input sign repetitions.
+         */
+        ctx->s_mask = s_mask;
+        break;
+    CASE_OP_32_64(shr):
+        /*
+         * If the sign bit is known zero, then logical right shift
+         * will not reduce the number of input sign repetitions.
+         */
+        sign = (s_mask & -s_mask) >> 1;
+        if (!(z_mask & sign)) {
+            ctx->s_mask = s_mask;
+        }
+        break;
+    default:
+        break;
+    }
+
      return false;
  }
  
@@ -1713,11 +2146,11 @@ static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
      switch (ctx->type) {
      case TCG_TYPE_I32:
          neg_op = INDEX_op_neg_i32;
-        have_neg = TCG_TARGET_HAS_neg_i32;
+        have_neg = true;
          break;
      case TCG_TYPE_I64:
          neg_op = INDEX_op_neg_i64;
-        have_neg = TCG_TARGET_HAS_neg_i64;
+        have_neg = true;
          break;
      case TCG_TYPE_V64:
      case TCG_TYPE_V128:
@@ -1737,10 +2170,10 @@ static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
      return false;
  }
  
-static bool fold_sub(OptContext *ctx, TCGOp *op)
+/* We cannot as yet do_constant_folding with vectors. */
+static bool fold_sub_vec(OptContext *ctx, TCGOp *op)
  {
-    if (fold_const2(ctx, op) ||
-        fold_xx_to_i(ctx, op, 0) ||
+    if (fold_xx_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0) ||
          fold_sub_to_neg(ctx, op)) {
          return true;
@@ -1748,6 +2181,23 @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
      return false;
  }
  
+static bool fold_sub(OptContext *ctx, TCGOp *op)
+{
+    if (fold_const2(ctx, op) || fold_sub_vec(ctx, op)) {
+        return true;
+    }
+
+    /* Fold sub r,x,i to add r,x,-i */
+    if (arg_is_const(op->args[2])) {
+        uint64_t val = arg_info(op->args[2])->val;
+
+        op->opc = (ctx->type == TCG_TYPE_I32
+                   ? INDEX_op_add_i32 : INDEX_op_add_i64);
+        op->args[2] = arg_new_constant(ctx, -val);
+    }
+    return false;
+}
+
  static bool fold_sub2(OptContext *ctx, TCGOp *op)
  {
      return fold_addsub2(ctx, op, false);
@@ -1757,24 +2207,126 @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
  {
      /* We can't do any folding with a load, but we can record bits. */
      switch (op->opc) {
+    CASE_OP_32_64(ld8s):
+        ctx->s_mask = MAKE_64BIT_MASK(8, 56);
+        break;
      CASE_OP_32_64(ld8u):
          ctx->z_mask = MAKE_64BIT_MASK(0, 8);
+        ctx->s_mask = MAKE_64BIT_MASK(9, 55);
+        break;
+    CASE_OP_32_64(ld16s):
+        ctx->s_mask = MAKE_64BIT_MASK(16, 48);
          break;
      CASE_OP_32_64(ld16u):
          ctx->z_mask = MAKE_64BIT_MASK(0, 16);
+        ctx->s_mask = MAKE_64BIT_MASK(17, 47);
+        break;
+    case INDEX_op_ld32s_i64:
+        ctx->s_mask = MAKE_64BIT_MASK(32, 32);
          break;
      case INDEX_op_ld32u_i64:
          ctx->z_mask = MAKE_64BIT_MASK(0, 32);
+        ctx->s_mask = MAKE_64BIT_MASK(33, 31);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    return false;
+}
+
+static bool fold_tcg_ld_memcopy(OptContext *ctx, TCGOp *op)
+{
+    TCGTemp *dst, *src;
+    intptr_t ofs;
+    TCGType type;
+
+    if (op->args[1] != tcgv_ptr_arg(tcg_env)) {
+        return false;
+    }
+
+    type = ctx->type;
+    ofs = op->args[2];
+    dst = arg_temp(op->args[0]);
+    src = find_mem_copy_for(ctx, type, ofs);
+    if (src && src->base_type == type) {
+        return tcg_opt_gen_mov(ctx, op, temp_arg(dst), temp_arg(src));
+    }
+
+    reset_ts(ctx, dst);
+    record_mem_copy(ctx, type, dst, ofs, ofs + tcg_type_size(type) - 1);
+    return true;
+}
+
+static bool fold_tcg_st(OptContext *ctx, TCGOp *op)
+{
+    intptr_t ofs = op->args[2];
+    intptr_t lm1;
+
+    if (op->args[1] != tcgv_ptr_arg(tcg_env)) {
+        remove_mem_copy_all(ctx);
+        return false;
+    }
+
+    switch (op->opc) {
+    CASE_OP_32_64(st8):
+        lm1 = 0;
+        break;
+    CASE_OP_32_64(st16):
+        lm1 = 1;
+        break;
+    case INDEX_op_st32_i64:
+    case INDEX_op_st_i32:
+        lm1 = 3;
+        break;
+    case INDEX_op_st_i64:
+        lm1 = 7;
+        break;
+    case INDEX_op_st_vec:
+        lm1 = tcg_type_size(ctx->type) - 1;
          break;
      default:
          g_assert_not_reached();
      }
+    remove_mem_copy_in(ctx, ofs, ofs + lm1);
+    return false;
+}
+
+static bool fold_tcg_st_memcopy(OptContext *ctx, TCGOp *op)
+{
+    TCGTemp *src;
+    intptr_t ofs, last;
+    TCGType type;
+
+    if (op->args[1] != tcgv_ptr_arg(tcg_env)) {
+        fold_tcg_st(ctx, op);
+        return false;
+    }
+
+    src = arg_temp(op->args[0]);
+    ofs = op->args[2];
+    type = ctx->type;
+
+    /*
+     * Eliminate duplicate stores of a constant.
+     * This happens frequently when the target ISA zero-extends.
+     */
+    if (ts_is_const(src)) {
+        TCGTemp *prev = find_mem_copy_for(ctx, type, ofs);
+        if (src == prev) {
+            tcg_op_remove(ctx->tcg, op);
+            return true;
+        }
+    }
+
+    last = ofs + tcg_type_size(type) - 1;
+    remove_mem_copy_in(ctx, ofs, last);
+    record_mem_copy(ctx, type, src, ofs, last);
      return false;
  }
  
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
          fold_xx_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0) ||
          fold_xi_to_not(ctx, op, -1)) {
@@ -1783,6 +2335,8 @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
  
      ctx->z_mask = arg_info(op->args[1])->z_mask
                  | arg_info(op->args[2])->z_mask;
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
      return fold_masks(ctx, op);
  }
  
@@ -1793,6 +2347,8 @@ void tcg_optimize(TCGContext *s)
      TCGOp *op, *op_next;
      OptContext ctx = { .tcg = s };
  
+    QSIMPLEQ_INIT(&ctx.mem_free);
+
      /* Array VALS has an element for each temp.
         If this temp holds a constant then its value is kept in VALS' element.
         If this temp is a copy of other ones then the other copies are
@@ -1827,75 +2383,22 @@ void tcg_optimize(TCGContext *s)
              ctx.type = TCG_TYPE_I32;
          }
  
-        /* For commutative operations make constant second argument */
-        switch (opc) {
-        CASE_OP_32_64_VEC(add):
-        CASE_OP_32_64_VEC(mul):
-        CASE_OP_32_64_VEC(and):
-        CASE_OP_32_64_VEC(or):
-        CASE_OP_32_64_VEC(xor):
-        CASE_OP_32_64(eqv):
-        CASE_OP_32_64(nand):
-        CASE_OP_32_64(nor):
-        CASE_OP_32_64(muluh):
-        CASE_OP_32_64(mulsh):
-            swap_commutative(op->args[0], &op->args[1], &op->args[2]);
-            break;
-        CASE_OP_32_64(brcond):
-            if (swap_commutative(-1, &op->args[0], &op->args[1])) {
-                op->args[2] = tcg_swap_cond(op->args[2]);
-            }
-            break;
-        CASE_OP_32_64(setcond):
-            if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
-                op->args[3] = tcg_swap_cond(op->args[3]);
-            }
-            break;
-        CASE_OP_32_64(movcond):
-            if (swap_commutative(-1, &op->args[1], &op->args[2])) {
-                op->args[5] = tcg_swap_cond(op->args[5]);
-            }
-            /* For movcond, we canonicalize the "false" input reg to match
-               the destination reg so that the tcg backend can implement
-               a "move if true" operation.  */
-            if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
-                op->args[5] = tcg_invert_cond(op->args[5]);
-            }
-            break;
-        CASE_OP_32_64(add2):
-            swap_commutative(op->args[0], &op->args[2], &op->args[4]);
-            swap_commutative(op->args[1], &op->args[3], &op->args[5]);
-            break;
-        CASE_OP_32_64(mulu2):
-        CASE_OP_32_64(muls2):
-            swap_commutative(op->args[0], &op->args[2], &op->args[3]);
-            break;
-        case INDEX_op_brcond2_i32:
-            if (swap_commutative2(&op->args[0], &op->args[2])) {
-                op->args[4] = tcg_swap_cond(op->args[4]);
-            }
-            break;
-        case INDEX_op_setcond2_i32:
-            if (swap_commutative2(&op->args[1], &op->args[3])) {
-                op->args[5] = tcg_swap_cond(op->args[5]);
-            }
-            break;
-        default:
-            break;
-        }
-
-        /* Assume all bits affected, and no bits known zero. */
+        /* Assume all bits affected, no bits known zero, no sign reps. */
          ctx.a_mask = -1;
          ctx.z_mask = -1;
+        ctx.s_mask = 0;
  
          /*
           * Process each opcode.
           * Sorted alphabetically by opcode as much as possible.
           */
          switch (opc) {
-        CASE_OP_32_64_VEC(add):
+        CASE_OP_32_64(add):
              done = fold_add(&ctx, op);
              break;
+        case INDEX_op_add_vec:
+            done = fold_add_vec(&ctx, op);
+            break;
          CASE_OP_32_64(add2):
              done = fold_add2(&ctx, op);
              break;
@@ -1936,7 +2439,7 @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_dup2_vec:
              done = fold_dup2(&ctx, op);
              break;
-        CASE_OP_32_64(eqv):
+        CASE_OP_32_64_VEC(eqv):
              done = fold_eqv(&ctx, op);
              break;
          CASE_OP_32_64(extract):
@@ -1959,11 +2462,29 @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_extrh_i64_i32:
              done = fold_extu(&ctx, op);
              break;
+        CASE_OP_32_64(ld8s):
          CASE_OP_32_64(ld8u):
+        CASE_OP_32_64(ld16s):
          CASE_OP_32_64(ld16u):
+        case INDEX_op_ld32s_i64:
          case INDEX_op_ld32u_i64:
              done = fold_tcg_ld(&ctx, op);
              break;
+        case INDEX_op_ld_i32:
+        case INDEX_op_ld_i64:
+        case INDEX_op_ld_vec:
+            done = fold_tcg_ld_memcopy(&ctx, op);
+            break;
+        CASE_OP_32_64(st8):
+        CASE_OP_32_64(st16):
+        case INDEX_op_st32_i64:
+            done = fold_tcg_st(&ctx, op);
+            break;
+        case INDEX_op_st_i32:
+        case INDEX_op_st_i64:
+        case INDEX_op_st_vec:
+            done = fold_tcg_st_memcopy(&ctx, op);
+            break;
          case INDEX_op_mb:
              done = fold_mb(&ctx, op);
              break;
@@ -1984,13 +2505,13 @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(mulu2):
              done = fold_multiply2(&ctx, op);
              break;
-        CASE_OP_32_64(nand):
+        CASE_OP_32_64_VEC(nand):
              done = fold_nand(&ctx, op);
              break;
          CASE_OP_32_64(neg):
              done = fold_neg(&ctx, op);
              break;
-        CASE_OP_32_64(nor):
+        CASE_OP_32_64_VEC(nor):
              done = fold_nor(&ctx, op);
              break;
          CASE_OP_32_64_VEC(not):
@@ -2002,13 +2523,22 @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64_VEC(orc):
              done = fold_orc(&ctx, op);
              break;
-        case INDEX_op_qemu_ld_i32:
-        case INDEX_op_qemu_ld_i64:
+        case INDEX_op_qemu_ld_a32_i32:
+        case INDEX_op_qemu_ld_a64_i32:
+        case INDEX_op_qemu_ld_a32_i64:
+        case INDEX_op_qemu_ld_a64_i64:
+        case INDEX_op_qemu_ld_a32_i128:
+        case INDEX_op_qemu_ld_a64_i128:
              done = fold_qemu_ld(&ctx, op);
              break;
-        case INDEX_op_qemu_st_i32:
-        case INDEX_op_qemu_st8_i32:
-        case INDEX_op_qemu_st_i64:
+        case INDEX_op_qemu_st8_a32_i32:
+        case INDEX_op_qemu_st8_a64_i32:
+        case INDEX_op_qemu_st_a32_i32:
+        case INDEX_op_qemu_st_a64_i32:
+        case INDEX_op_qemu_st_a32_i64:
+        case INDEX_op_qemu_st_a64_i64:
+        case INDEX_op_qemu_st_a32_i128:
+        case INDEX_op_qemu_st_a64_i128:
              done = fold_qemu_st(&ctx, op);
              break;
          CASE_OP_32_64(rem):
@@ -2025,15 +2555,21 @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(setcond):
              done = fold_setcond(&ctx, op);
              break;
+        CASE_OP_32_64(negsetcond):
+            done = fold_negsetcond(&ctx, op);
+            break;
          case INDEX_op_setcond2_i32:
              done = fold_setcond2(&ctx, op);
              break;
          CASE_OP_32_64(sextract):
              done = fold_sextract(&ctx, op);
              break;
-        CASE_OP_32_64_VEC(sub):
+        CASE_OP_32_64(sub):
              done = fold_sub(&ctx, op);
              break;
+        case INDEX_op_sub_vec:
+            done = fold_sub_vec(&ctx, op);
+            break;
          CASE_OP_32_64(sub2):
              done = fold_sub2(&ctx, op);
              break;