]> git.proxmox.com Git - mirror_qemu.git/blobdiff - tcg/mips/tcg-target.c
tcg/mips: implement rotl/rotr ops on MIPS32R2
[mirror_qemu.git] / tcg / mips / tcg-target.c
index f7ed0baeddc464bf23625a3a2c897376adcf93ef..592e42a2bc9f2e1e548e7f20e1f8a00a0016b542 100644 (file)
@@ -68,7 +68,7 @@ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
 #endif
 
 /* check if we really need so many registers :P */
-static const int tcg_target_reg_alloc_order[] = {
+static const TCGReg tcg_target_reg_alloc_order[] = {
     TCG_REG_S0,
     TCG_REG_S1,
     TCG_REG_S2,
@@ -94,14 +94,14 @@ static const int tcg_target_reg_alloc_order[] = {
     TCG_REG_V1
 };
 
-static const int tcg_target_call_iarg_regs[4] = {
+static const TCGReg tcg_target_call_iarg_regs[4] = {
     TCG_REG_A0,
     TCG_REG_A1,
     TCG_REG_A2,
     TCG_REG_A3
 };
 
-static const int tcg_target_call_oarg_regs[2] = {
+static const TCGReg tcg_target_call_oarg_regs[2] = {
     TCG_REG_V0,
     TCG_REG_V1
 };
@@ -217,17 +217,23 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
         tcg_regset_set(ct->u.regs, 0xffffffff);
 #if defined(CONFIG_SOFTMMU)
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_A0);
+# if (TARGET_LONG_BITS == 64)
+        tcg_regset_reset_reg(ct->u.regs, TCG_REG_A2);
+# endif
 #endif
         break;
     case 'S': /* qemu_st constraint */
         ct->ct |= TCG_CT_REG;
         tcg_regset_set(ct->u.regs, 0xffffffff);
-#if defined(CONFIG_SOFTMMU)
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_A0);
-# if TARGET_LONG_BITS == 64
+#if defined(CONFIG_SOFTMMU)
+# if (TARGET_LONG_BITS == 32)
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_A1);
 # endif
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_A2);
+# if TARGET_LONG_BITS == 64
+        tcg_regset_reset_reg(ct->u.regs, TCG_REG_A3);
+# endif
 #endif
         break;
     case 'I':
@@ -270,9 +276,10 @@ static inline int tcg_target_const_match(tcg_target_long val,
 
 /* instruction opcodes */
 enum {
-    OPC_SPECIAL  = 0x00 << 26,
     OPC_BEQ      = 0x04 << 26,
     OPC_BNE      = 0x05 << 26,
+    OPC_BLEZ     = 0x06 << 26,
+    OPC_BGTZ     = 0x07 << 26,
     OPC_ADDIU    = 0x09 << 26,
     OPC_SLTI     = 0x0A << 26,
     OPC_SLTIU    = 0x0B << 26,
@@ -289,11 +296,15 @@ enum {
     OPC_SB       = 0x28 << 26,
     OPC_SH       = 0x29 << 26,
     OPC_SW       = 0x2B << 26,
+
+    OPC_SPECIAL  = 0x00 << 26,
     OPC_SLL      = OPC_SPECIAL | 0x00,
     OPC_SRL      = OPC_SPECIAL | 0x02,
+    OPC_ROTR     = OPC_SPECIAL | (0x01 << 21) | 0x02,
     OPC_SRA      = OPC_SPECIAL | 0x03,
     OPC_SLLV     = OPC_SPECIAL | 0x04,
     OPC_SRLV     = OPC_SPECIAL | 0x06,
+    OPC_ROTRV    = OPC_SPECIAL | (0x01 <<  6) | 0x06,
     OPC_SRAV     = OPC_SPECIAL | 0x07,
     OPC_JR       = OPC_SPECIAL | 0x08,
     OPC_JALR     = OPC_SPECIAL | 0x09,
@@ -311,12 +322,22 @@ enum {
     OPC_NOR      = OPC_SPECIAL | 0x27,
     OPC_SLT      = OPC_SPECIAL | 0x2A,
     OPC_SLTU     = OPC_SPECIAL | 0x2B,
+
+    OPC_REGIMM   = 0x01 << 26,
+    OPC_BLTZ     = OPC_REGIMM | (0x00 << 16),
+    OPC_BGEZ     = OPC_REGIMM | (0x01 << 16),
+
+    OPC_SPECIAL3 = 0x1f << 26,
+    OPC_WSBH     = OPC_SPECIAL3 | 0x0a0,
+    OPC_SEB      = OPC_SPECIAL3 | 0x420,
+    OPC_SEH      = OPC_SPECIAL3 | 0x620,
 };
 
 /*
  * Type reg
  */
-static inline void tcg_out_opc_reg(TCGContext *s, int opc, int rd, int rs, int rt)
+static inline void tcg_out_opc_reg(TCGContext *s, int opc,
+                                   TCGReg rd, TCGReg rs, TCGReg rt)
 {
     int32_t inst;
 
@@ -330,7 +351,8 @@ static inline void tcg_out_opc_reg(TCGContext *s, int opc, int rd, int rs, int r
 /*
  * Type immediate
  */
-static inline void tcg_out_opc_imm(TCGContext *s, int opc, int rt, int rs, int imm)
+static inline void tcg_out_opc_imm(TCGContext *s, int opc,
+                                   TCGReg rt, TCGReg rs, TCGArg imm)
 {
     int32_t inst;
 
@@ -341,10 +363,25 @@ static inline void tcg_out_opc_imm(TCGContext *s, int opc, int rt, int rs, int i
     tcg_out32(s, inst);
 }
 
+/*
+ * Type branch
+ */
+static inline void tcg_out_opc_br(TCGContext *s, int opc,
+                                  TCGReg rt, TCGReg rs)
+{
+    /* We pay attention here to not modify the branch target by reading
+       the existing value and using it again. This ensure that caches and
+       memory are kept coherent during retranslation. */
+    uint16_t offset = (uint16_t)(*(uint32_t *) s->code_ptr);
+
+    tcg_out_opc_imm(s, opc, rt, rs, offset);
+}
+
 /*
  * Type sa
  */
-static inline void tcg_out_opc_sa(TCGContext *s, int opc, int rd, int rt, int sa)
+static inline void tcg_out_opc_sa(TCGContext *s, int opc,
+                                  TCGReg rd, TCGReg rt, TCGArg sa)
 {
     int32_t inst;
 
@@ -361,13 +398,17 @@ static inline void tcg_out_nop(TCGContext *s)
     tcg_out32(s, 0);
 }
 
-static inline void tcg_out_mov(TCGContext *s, int ret, int arg)
+static inline void tcg_out_mov(TCGContext *s, TCGType type,
+                               TCGReg ret, TCGReg arg)
 {
-    tcg_out_opc_reg(s, OPC_ADDU, ret, arg, TCG_REG_ZERO);
+    /* Simple reg-reg move, optimising out the 'do nothing' case */
+    if (ret != arg) {
+        tcg_out_opc_reg(s, OPC_ADDU, ret, arg, TCG_REG_ZERO);
+    }
 }
 
 static inline void tcg_out_movi(TCGContext *s, TCGType type,
-                                int reg, int32_t arg)
+                                TCGReg reg, tcg_target_long arg)
 {
     if (arg == (int16_t)arg) {
         tcg_out_opc_imm(s, OPC_ADDIU, reg, TCG_REG_ZERO, arg);
@@ -379,38 +420,47 @@ static inline void tcg_out_movi(TCGContext *s, TCGType type,
     }
 }
 
-static inline void tcg_out_bswap16(TCGContext *s, int ret, int arg)
+static inline void tcg_out_bswap16(TCGContext *s, TCGReg ret, TCGReg arg)
 {
+#ifdef _MIPS_ARCH_MIPS32R2
+    tcg_out_opc_reg(s, OPC_WSBH, ret, 0, arg);
+#else
     /* ret and arg can't be register at */
     if (ret == TCG_REG_AT || arg == TCG_REG_AT) {
         tcg_abort();
     }
 
     tcg_out_opc_sa(s, OPC_SRL, TCG_REG_AT, arg, 8);
-    tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_AT, TCG_REG_AT, 0x00ff);
-
     tcg_out_opc_sa(s, OPC_SLL, ret, arg, 8);
     tcg_out_opc_imm(s, OPC_ANDI, ret, ret, 0xff00);
     tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_REG_AT);
+#endif
 }
 
-static inline void tcg_out_bswap16s(TCGContext *s, int ret, int arg)
+static inline void tcg_out_bswap16s(TCGContext *s, TCGReg ret, TCGReg arg)
 {
+#ifdef _MIPS_ARCH_MIPS32R2
+    tcg_out_opc_reg(s, OPC_WSBH, ret, 0, arg);
+    tcg_out_opc_reg(s, OPC_SEH, ret, 0, ret);
+#else
     /* ret and arg can't be register at */
     if (ret == TCG_REG_AT || arg == TCG_REG_AT) {
         tcg_abort();
     }
 
     tcg_out_opc_sa(s, OPC_SRL, TCG_REG_AT, arg, 8);
-    tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_AT, TCG_REG_AT, 0xff);
-
     tcg_out_opc_sa(s, OPC_SLL, ret, arg, 24);
     tcg_out_opc_sa(s, OPC_SRA, ret, ret, 16);
     tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_REG_AT);
+#endif
 }
 
-static inline void tcg_out_bswap32(TCGContext *s, int ret, int arg)
+static inline void tcg_out_bswap32(TCGContext *s, TCGReg ret, TCGReg arg)
 {
+#ifdef _MIPS_ARCH_MIPS32R2
+    tcg_out_opc_reg(s, OPC_WSBH, ret, 0, arg);
+    tcg_out_opc_sa(s, OPC_ROTR, ret, ret, 16);
+#else
     /* ret and arg must be different and can't be register at */
     if (ret == arg || ret == TCG_REG_AT || arg == TCG_REG_AT) {
         tcg_abort();
@@ -428,10 +478,31 @@ static inline void tcg_out_bswap32(TCGContext *s, int ret, int arg)
     tcg_out_opc_sa(s, OPC_SRL, TCG_REG_AT, arg, 8);
     tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_AT, TCG_REG_AT, 0xff00);
     tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_REG_AT);
+#endif
+}
+
+static inline void tcg_out_ext8s(TCGContext *s, TCGReg ret, TCGReg arg)
+{
+#ifdef _MIPS_ARCH_MIPS32R2
+    tcg_out_opc_reg(s, OPC_SEB, ret, 0, arg);
+#else
+    tcg_out_opc_sa(s, OPC_SLL, ret, arg, 24);
+    tcg_out_opc_sa(s, OPC_SRA, ret, ret, 24);
+#endif
+}
+
+static inline void tcg_out_ext16s(TCGContext *s, TCGReg ret, TCGReg arg)
+{
+#ifdef _MIPS_ARCH_MIPS32R2
+    tcg_out_opc_reg(s, OPC_SEH, ret, 0, arg);
+#else
+    tcg_out_opc_sa(s, OPC_SLL, ret, arg, 16);
+    tcg_out_opc_sa(s, OPC_SRA, ret, ret, 16);
+#endif
 }
 
-static inline void tcg_out_ldst(TCGContext *s, int opc, int arg,
-                              int arg1, tcg_target_long arg2)
+static inline void tcg_out_ldst(TCGContext *s, int opc, TCGArg arg,
+                                TCGReg arg1, TCGArg arg2)
 {
     if (arg2 == (int16_t) arg2) {
         tcg_out_opc_imm(s, opc, arg, arg1, arg2);
@@ -442,19 +513,19 @@ static inline void tcg_out_ldst(TCGContext *s, int opc, int arg,
     }
 }
 
-static inline void tcg_out_ld(TCGContext *s, TCGType type, int arg,
-                              int arg1, tcg_target_long arg2)
+static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg,
+                              TCGReg arg1, tcg_target_long arg2)
 {
     tcg_out_ldst(s, OPC_LW, arg, arg1, arg2);
 }
 
-static inline void tcg_out_st(TCGContext *s, TCGType type, int arg,
-                              int arg1, tcg_target_long arg2)
+static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
+                              TCGReg arg1, tcg_target_long arg2)
 {
     tcg_out_ldst(s, OPC_SW, arg, arg1, arg2);
 }
 
-static inline void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
+static inline void tcg_out_addi(TCGContext *s, TCGReg reg, TCGArg val)
 {
     if (val == (int16_t)val) {
         tcg_out_opc_imm(s, OPC_ADDIU, reg, reg, val);
@@ -464,49 +535,126 @@ static inline void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
     }
 }
 
-static void tcg_out_brcond(TCGContext *s, TCGCond cond, int arg1,
-                           int arg2, int label_index)
+/* Helper routines for marshalling helper function arguments into
+ * the correct registers and stack.
+ * arg_num is where we want to put this argument, and is updated to be ready
+ * for the next call. arg is the argument itself. Note that arg_num 0..3 is
+ * real registers, 4+ on stack.
+ *
+ * We provide routines for arguments which are: immediate, 32 bit
+ * value in register, 16 and 8 bit values in register (which must be zero
+ * extended before use) and 64 bit value in a lo:hi register pair.
+ */
+#define DEFINE_TCG_OUT_CALL_IARG(NAME, ARGPARAM)                               \
+    static inline void NAME(TCGContext *s, int *arg_num, ARGPARAM)             \
+    {                                                                          \
+    if (*arg_num < 4) {                                                        \
+        DEFINE_TCG_OUT_CALL_IARG_GET_ARG(tcg_target_call_iarg_regs[*arg_num]); \
+    } else {                                                                   \
+        DEFINE_TCG_OUT_CALL_IARG_GET_ARG(TCG_REG_AT);                          \
+        tcg_out_st(s, TCG_TYPE_I32, TCG_REG_AT, TCG_REG_SP, 4 * (*arg_num));   \
+    }                                                                          \
+    (*arg_num)++;                                                              \
+}
+#define DEFINE_TCG_OUT_CALL_IARG_GET_ARG(A) \
+    tcg_out_opc_imm(s, OPC_ANDI, A, arg, 0xff);
+DEFINE_TCG_OUT_CALL_IARG(tcg_out_call_iarg_reg8, TCGReg arg)
+#undef DEFINE_TCG_OUT_CALL_IARG_GET_ARG
+#define DEFINE_TCG_OUT_CALL_IARG_GET_ARG(A) \
+    tcg_out_opc_imm(s, OPC_ANDI, A, arg, 0xffff);
+DEFINE_TCG_OUT_CALL_IARG(tcg_out_call_iarg_reg16, TCGReg arg)
+#undef DEFINE_TCG_OUT_CALL_IARG_GET_ARG
+#define DEFINE_TCG_OUT_CALL_IARG_GET_ARG(A) \
+    tcg_out_movi(s, TCG_TYPE_I32, A, arg);
+DEFINE_TCG_OUT_CALL_IARG(tcg_out_call_iarg_imm32, TCGArg arg)
+#undef DEFINE_TCG_OUT_CALL_IARG_GET_ARG
+
+/* We don't use the macro for this one to avoid an unnecessary reg-reg
+   move when storing to the stack. */
+static inline void tcg_out_call_iarg_reg32(TCGContext *s, int *arg_num,
+                                           TCGReg arg)
+{
+    if (*arg_num < 4) {
+        tcg_out_mov(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[*arg_num], arg);
+    } else {
+        tcg_out_st(s, TCG_TYPE_I32, arg, TCG_REG_SP, 4 * (*arg_num));
+    }
+    (*arg_num)++;
+}
+
+static inline void tcg_out_call_iarg_reg64(TCGContext *s, int *arg_num,
+                                           TCGReg arg_low, TCGReg arg_high)
+{
+    (*arg_num) = (*arg_num + 1) & ~1;
+
+#if defined(TCG_TARGET_WORDS_BIGENDIAN)
+    tcg_out_call_iarg_reg32(s, arg_num, arg_high);
+    tcg_out_call_iarg_reg32(s, arg_num, arg_low);
+#else
+    tcg_out_call_iarg_reg32(s, arg_num, arg_low);
+    tcg_out_call_iarg_reg32(s, arg_num, arg_high);
+#endif
+}
+
+static void tcg_out_brcond(TCGContext *s, TCGCond cond, TCGArg arg1,
+                           TCGArg arg2, int label_index)
 {
     TCGLabel *l = &s->labels[label_index];
 
     switch (cond) {
     case TCG_COND_EQ:
-        tcg_out_opc_imm(s, OPC_BEQ, arg1, arg2, 0);
+        tcg_out_opc_br(s, OPC_BEQ, arg1, arg2);
         break;
     case TCG_COND_NE:
-        tcg_out_opc_imm(s, OPC_BNE, arg1, arg2, 0);
+        tcg_out_opc_br(s, OPC_BNE, arg1, arg2);
         break;
     case TCG_COND_LT:
-        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg1, arg2);
-        tcg_out_opc_imm(s, OPC_BNE, TCG_REG_AT, TCG_REG_ZERO, 0);
+        if (arg2 == 0) {
+            tcg_out_opc_br(s, OPC_BLTZ, 0, arg1);
+        } else {
+            tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg1, arg2);
+            tcg_out_opc_br(s, OPC_BNE, TCG_REG_AT, TCG_REG_ZERO);
+        }
         break;
     case TCG_COND_LTU:
         tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, arg1, arg2);
-        tcg_out_opc_imm(s, OPC_BNE, TCG_REG_AT, TCG_REG_ZERO, 0);
+        tcg_out_opc_br(s, OPC_BNE, TCG_REG_AT, TCG_REG_ZERO);
         break;
     case TCG_COND_GE:
-        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg1, arg2);
-        tcg_out_opc_imm(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO, 0);
+        if (arg2 == 0) {
+            tcg_out_opc_br(s, OPC_BGEZ, 0, arg1);
+        } else {
+            tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg1, arg2);
+            tcg_out_opc_br(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO);
+        }
         break;
     case TCG_COND_GEU:
         tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, arg1, arg2);
-        tcg_out_opc_imm(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO, 0);
+        tcg_out_opc_br(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO);
         break;
     case TCG_COND_LE:
-        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg2, arg1);
-        tcg_out_opc_imm(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO, 0);
+        if (arg2 == 0) {
+            tcg_out_opc_br(s, OPC_BLEZ, 0, arg1);
+        } else {
+            tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg2, arg1);
+            tcg_out_opc_br(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO);
+        }
         break;
     case TCG_COND_LEU:
         tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, arg2, arg1);
-        tcg_out_opc_imm(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO, 0);
+        tcg_out_opc_br(s, OPC_BEQ, TCG_REG_AT, TCG_REG_ZERO);
         break;
     case TCG_COND_GT:
-        tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg2, arg1);
-        tcg_out_opc_imm(s, OPC_BNE, TCG_REG_AT, TCG_REG_ZERO, 0);
+        if (arg2 == 0) {
+            tcg_out_opc_br(s, OPC_BGTZ, 0, arg1);
+        } else {
+            tcg_out_opc_reg(s, OPC_SLT, TCG_REG_AT, arg2, arg1);
+            tcg_out_opc_br(s, OPC_BNE, TCG_REG_AT, TCG_REG_ZERO);
+        }
         break;
     case TCG_COND_GTU:
         tcg_out_opc_reg(s, OPC_SLTU, TCG_REG_AT, arg2, arg1);
-        tcg_out_opc_imm(s, OPC_BNE, TCG_REG_AT, TCG_REG_ZERO, 0);
+        tcg_out_opc_br(s, OPC_BNE, TCG_REG_AT, TCG_REG_ZERO);
         break;
     default:
         tcg_abort();
@@ -522,8 +670,9 @@ static void tcg_out_brcond(TCGContext *s, TCGCond cond, int arg1,
 
 /* XXX: we implement it at the target level to avoid having to
    handle cross basic blocks temporaries */
-static void tcg_out_brcond2(TCGContext *s, TCGCond cond, int arg1,
-                            int arg2, int arg3, int arg4, int label_index)
+static void tcg_out_brcond2(TCGContext *s, TCGCond cond, TCGArg arg1,
+                            TCGArg arg2, TCGArg arg3, TCGArg arg4,
+                            int label_index)
 {
     void *label_ptr;
 
@@ -555,7 +704,7 @@ static void tcg_out_brcond2(TCGContext *s, TCGCond cond, int arg1,
     }
 
     label_ptr = s->code_ptr;
-    tcg_out_opc_imm(s, OPC_BNE, arg2, arg4, 0);
+    tcg_out_opc_br(s, OPC_BNE, arg2, arg4);
     tcg_out_nop(s);
 
     switch(cond) {
@@ -585,8 +734,8 @@ static void tcg_out_brcond2(TCGContext *s, TCGCond cond, int arg1,
     reloc_pc16(label_ptr, (tcg_target_long) s->code_ptr);
 }
 
-static void tcg_out_setcond(TCGContext *s, TCGCond cond, int ret,
-                            int arg1, int arg2)
+static void tcg_out_setcond(TCGContext *s, TCGCond cond, TCGReg ret,
+                            TCGArg arg1, TCGArg arg2)
 {
     switch (cond) {
     case TCG_COND_EQ:
@@ -645,8 +794,8 @@ static void tcg_out_setcond(TCGContext *s, TCGCond cond, int ret,
 
 /* XXX: we implement it at the target level to avoid having to
    handle cross basic blocks temporaries */
-static void tcg_out_setcond2(TCGContext *s, TCGCond cond, int ret,
-                             int arg1, int arg2, int arg3, int arg4)
+static void tcg_out_setcond2(TCGContext *s, TCGCond cond, TCGReg ret,
+                             TCGArg arg1, TCGArg arg2, TCGArg arg3, TCGArg arg4)
 {
     switch (cond) {
     case TCG_COND_EQ:
@@ -711,36 +860,39 @@ static void tcg_out_setcond2(TCGContext *s, TCGCond cond, int ret,
 
 #include "../../softmmu_defs.h"
 
-static void *qemu_ld_helpers[4] = {
-    __ldb_mmu,
-    __ldw_mmu,
-    __ldl_mmu,
-    __ldq_mmu,
+/* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
+   int mmu_idx) */
+static const void * const qemu_ld_helpers[4] = {
+    helper_ldb_mmu,
+    helper_ldw_mmu,
+    helper_ldl_mmu,
+    helper_ldq_mmu,
 };
 
-static void *qemu_st_helpers[4] = {
-    __stb_mmu,
-    __stw_mmu,
-    __stl_mmu,
-    __stq_mmu,
+/* helper signature: helper_st_mmu(CPUState *env, target_ulong addr,
+   uintxx_t val, int mmu_idx) */
+static const void * const qemu_st_helpers[4] = {
+    helper_stb_mmu,
+    helper_stw_mmu,
+    helper_stl_mmu,
+    helper_stq_mmu,
 };
 #endif
 
 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
                             int opc)
 {
-    int addr_regl, addr_reg1, addr_meml;
-    int data_regl, data_regh, data_reg1, data_reg2;
-    int mem_index, s_bits;
+    TCGReg addr_regl, data_regl, data_regh, data_reg1, data_reg2;
 #if defined(CONFIG_SOFTMMU)
     void *label1_ptr, *label2_ptr;
-    int sp_args;
-#endif
-#if TARGET_LONG_BITS == 64
-# if defined(CONFIG_SOFTMMU)
+    int arg_num;
+    int mem_index, s_bits;
+    int addr_meml;
+# if TARGET_LONG_BITS == 64
     uint8_t *label3_ptr;
+    TCGReg addr_regh;
+    int addr_memh;
 # endif
-    int addr_regh, addr_reg2, addr_memh;
 #endif
     data_regl = *args++;
     if (opc == 3)
@@ -748,11 +900,22 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     else
         data_regh = 0;
     addr_regl = *args++;
-#if TARGET_LONG_BITS == 64
+#if defined(CONFIG_SOFTMMU)
+# if TARGET_LONG_BITS == 64
     addr_regh = *args++;
-#endif
+#  if defined(TCG_TARGET_WORDS_BIGENDIAN)
+    addr_memh = 0;
+    addr_meml = 4;
+#  else
+    addr_memh = 4;
+    addr_meml = 0;
+#  endif
+# else
+    addr_meml = 0;
+# endif
     mem_index = *args;
     s_bits = opc & 3;
+#endif
 
     if (opc == 3) {
 #if defined(TCG_TARGET_WORDS_BIGENDIAN)
@@ -766,58 +929,43 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
         data_reg1 = data_regl;
         data_reg2 = 0;
     }
-#if TARGET_LONG_BITS == 64
-# if defined(TCG_TARGET_WORDS_BIGENDIAN)
-    addr_reg1 = addr_regh;
-    addr_reg2 = addr_regl;
-    addr_memh = 0;
-    addr_meml = 4;
-# else
-    addr_reg1 = addr_regl;
-    addr_reg2 = addr_regh;
-    addr_memh = 4;
-    addr_meml = 0;
-# endif
-#else
-    addr_reg1 = addr_regl;
-    addr_meml = 0;
-#endif
-
 #if defined(CONFIG_SOFTMMU)
     tcg_out_opc_sa(s, OPC_SRL, TCG_REG_A0, addr_regl, TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
     tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_A0, TCG_REG_A0, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
     tcg_out_opc_reg(s, OPC_ADDU, TCG_REG_A0, TCG_REG_A0, TCG_AREG0);
     tcg_out_opc_imm(s, OPC_LW, TCG_REG_AT, TCG_REG_A0,
-                    offsetof(CPUState, tlb_table[mem_index][0].addr_read) + addr_meml);
+                    offsetof(CPUArchState, tlb_table[mem_index][0].addr_read) + addr_meml);
     tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_T0, TARGET_PAGE_MASK | ((1 << s_bits) - 1));
     tcg_out_opc_reg(s, OPC_AND, TCG_REG_T0, TCG_REG_T0, addr_regl);
 
 # if TARGET_LONG_BITS == 64
     label3_ptr = s->code_ptr;
-    tcg_out_opc_imm(s, OPC_BNE, TCG_REG_T0, TCG_REG_AT, 0);
+    tcg_out_opc_br(s, OPC_BNE, TCG_REG_T0, TCG_REG_AT);
     tcg_out_nop(s);
 
     tcg_out_opc_imm(s, OPC_LW, TCG_REG_AT, TCG_REG_A0,
-                    offsetof(CPUState, tlb_table[mem_index][0].addr_read) + addr_memh);
+                    offsetof(CPUArchState, tlb_table[mem_index][0].addr_read) + addr_memh);
 
     label1_ptr = s->code_ptr;
-    tcg_out_opc_imm(s, OPC_BEQ, addr_regh, TCG_REG_AT, 0);
+    tcg_out_opc_br(s, OPC_BEQ, addr_regh, TCG_REG_AT);
     tcg_out_nop(s);
 
     reloc_pc16(label3_ptr, (tcg_target_long) s->code_ptr);
 # else
     label1_ptr = s->code_ptr;
-    tcg_out_opc_imm(s, OPC_BEQ, TCG_REG_T0, TCG_REG_AT, 0);
+    tcg_out_opc_br(s, OPC_BEQ, TCG_REG_T0, TCG_REG_AT);
     tcg_out_nop(s);
 # endif
 
     /* slow path */
-    sp_args = TCG_REG_A0;
-    tcg_out_mov(s, sp_args++, addr_reg1);
+    arg_num = 0;
+    tcg_out_call_iarg_reg32(s, &arg_num, TCG_AREG0);
 # if TARGET_LONG_BITS == 64
-    tcg_out_mov(s, sp_args++, addr_reg2);
+    tcg_out_call_iarg_reg64(s, &arg_num, addr_regl, addr_regh);
+# else
+    tcg_out_call_iarg_reg32(s, &arg_num, addr_regl);
 # endif
-    tcg_out_movi(s, TCG_TYPE_I32, sp_args++, mem_index);
+    tcg_out_call_iarg_imm32(s, &arg_num, mem_index);
     tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_T9, (tcg_target_long)qemu_ld_helpers[s_bits]);
     tcg_out_opc_reg(s, OPC_JALR, TCG_REG_RA, TCG_REG_T9, 0);
     tcg_out_nop(s);
@@ -827,85 +975,84 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
         tcg_out_opc_imm(s, OPC_ANDI, data_reg1, TCG_REG_V0, 0xff);
         break;
     case 0 | 4:
-        tcg_out_opc_sa(s, OPC_SLL, TCG_REG_V0, TCG_REG_V0, 24);
-        tcg_out_opc_sa(s, OPC_SRA, data_reg1, TCG_REG_V0, 24);
+        tcg_out_ext8s(s, data_reg1, TCG_REG_V0);
         break;
     case 1:
         tcg_out_opc_imm(s, OPC_ANDI, data_reg1, TCG_REG_V0, 0xffff);
         break;
     case 1 | 4:
-        tcg_out_opc_sa(s, OPC_SLL, TCG_REG_V0, TCG_REG_V0, 16);
-        tcg_out_opc_sa(s, OPC_SRA, data_reg1, TCG_REG_V0, 16);
+        tcg_out_ext16s(s, data_reg1, TCG_REG_V0);
         break;
     case 2:
-        tcg_out_mov(s, data_reg1, TCG_REG_V0);
+        tcg_out_mov(s, TCG_TYPE_I32, data_reg1, TCG_REG_V0);
         break;
     case 3:
-        tcg_out_mov(s, data_reg2, TCG_REG_V1);
-        tcg_out_mov(s, data_reg1, TCG_REG_V0);
+        tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_V1);
+        tcg_out_mov(s, TCG_TYPE_I32, data_reg1, TCG_REG_V0);
         break;
     default:
         tcg_abort();
     }
 
     label2_ptr = s->code_ptr;
-    tcg_out_opc_imm(s, OPC_BEQ, TCG_REG_ZERO, TCG_REG_ZERO, 0);
+    tcg_out_opc_br(s, OPC_BEQ, TCG_REG_ZERO, TCG_REG_ZERO);
     tcg_out_nop(s);
 
     /* label1: fast path */
     reloc_pc16(label1_ptr, (tcg_target_long) s->code_ptr);
 
     tcg_out_opc_imm(s, OPC_LW, TCG_REG_A0, TCG_REG_A0,
-                    offsetof(CPUState, tlb_table[mem_index][0].addend) + addr_meml);
+                    offsetof(CPUArchState, tlb_table[mem_index][0].addend));
     tcg_out_opc_reg(s, OPC_ADDU, TCG_REG_V0, TCG_REG_A0, addr_regl);
-
-    addr_reg1 = TCG_REG_V0;
+#else
+    if (GUEST_BASE == (int16_t)GUEST_BASE) {
+        tcg_out_opc_imm(s, OPC_ADDIU, TCG_REG_V0, addr_regl, GUEST_BASE);
+    } else {
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_V0, GUEST_BASE);
+        tcg_out_opc_reg(s, OPC_ADDU, TCG_REG_V0, TCG_REG_V0, addr_regl);
+    }
 #endif
 
     switch(opc) {
     case 0:
-        tcg_out_opc_imm(s, OPC_LBU, data_reg1, addr_reg1, 0);
+        tcg_out_opc_imm(s, OPC_LBU, data_reg1, TCG_REG_V0, 0);
         break;
     case 0 | 4:
-        tcg_out_opc_imm(s, OPC_LB, data_reg1, addr_reg1, 0);
+        tcg_out_opc_imm(s, OPC_LB, data_reg1, TCG_REG_V0, 0);
         break;
     case 1:
         if (TCG_NEED_BSWAP) {
-            tcg_out_opc_imm(s, OPC_LHU, TCG_REG_T0, addr_reg1, 0);
+            tcg_out_opc_imm(s, OPC_LHU, TCG_REG_T0, TCG_REG_V0, 0);
             tcg_out_bswap16(s, data_reg1, TCG_REG_T0);
         } else {
-            tcg_out_opc_imm(s, OPC_LHU, data_reg1, addr_reg1, 0);
+            tcg_out_opc_imm(s, OPC_LHU, data_reg1, TCG_REG_V0, 0);
         }
         break;
     case 1 | 4:
         if (TCG_NEED_BSWAP) {
-            tcg_out_opc_imm(s, OPC_LHU, TCG_REG_T0, addr_reg1, 0);
+            tcg_out_opc_imm(s, OPC_LHU, TCG_REG_T0, TCG_REG_V0, 0);
             tcg_out_bswap16s(s, data_reg1, TCG_REG_T0);
         } else {
-            tcg_out_opc_imm(s, OPC_LH, data_reg1, addr_reg1, 0);
+            tcg_out_opc_imm(s, OPC_LH, data_reg1, TCG_REG_V0, 0);
         }
         break;
     case 2:
         if (TCG_NEED_BSWAP) {
-            tcg_out_opc_imm(s, OPC_LW, TCG_REG_T0, addr_reg1, 0);
+            tcg_out_opc_imm(s, OPC_LW, TCG_REG_T0, TCG_REG_V0, 0);
             tcg_out_bswap32(s, data_reg1, TCG_REG_T0);
         } else {
-            tcg_out_opc_imm(s, OPC_LW, data_reg1, addr_reg1, 0);
+            tcg_out_opc_imm(s, OPC_LW, data_reg1, TCG_REG_V0, 0);
         }
         break;
     case 3:
-#if !defined(CONFIG_SOFTMMU)
-        tcg_out_mov(s, TCG_REG_V0, addr_reg1);
-        addr_reg1 = TCG_REG_V0;
-#endif
         if (TCG_NEED_BSWAP) {
-            tcg_out_opc_imm(s, OPC_LW, TCG_REG_T0, addr_reg1, 4);
+            tcg_out_opc_imm(s, OPC_LW, TCG_REG_T0, TCG_REG_V0, 4);
             tcg_out_bswap32(s, data_reg1, TCG_REG_T0);
-            tcg_out_opc_imm(s, OPC_LW, TCG_REG_T0, addr_reg1, 0);
+            tcg_out_opc_imm(s, OPC_LW, TCG_REG_T0, TCG_REG_V0, 0);
             tcg_out_bswap32(s, data_reg2, TCG_REG_T0);
         } else {
-            tcg_out_opc_imm(s, OPC_LW, data_reg1, addr_reg1, 0);
-            tcg_out_opc_imm(s, OPC_LW, data_reg2, addr_reg1, 4);
+            tcg_out_opc_imm(s, OPC_LW, data_reg1, TCG_REG_V0, 0);
+            tcg_out_opc_imm(s, OPC_LW, data_reg2, TCG_REG_V0, 4);
         }
         break;
     default:
@@ -920,163 +1067,164 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
                             int opc)
 {
-    int addr_regl, addr_reg1, addr_meml;
-    int data_regl, data_regh, data_reg1, data_reg2;
-    int mem_index, s_bits;
+    TCGReg addr_regl, data_regl, data_regh, data_reg1, data_reg2;
 #if defined(CONFIG_SOFTMMU)
     uint8_t *label1_ptr, *label2_ptr;
-    int sp_args;
+    int arg_num;
+    int mem_index, s_bits;
+    int addr_meml;
 #endif
 #if TARGET_LONG_BITS == 64
 # if defined(CONFIG_SOFTMMU)
     uint8_t *label3_ptr;
+    TCGReg addr_regh;
+    int addr_memh;
 # endif
-    int addr_regh, addr_reg2, addr_memh;
 #endif
-
     data_regl = *args++;
     if (opc == 3) {
         data_regh = *args++;
-#if defined(TCG_TARGET_WORDS_BIGENDIAN)
-        data_reg1 = data_regh;
-        data_reg2 = data_regl;
-#else
-        data_reg1 = data_regl;
-        data_reg2 = data_regh;
-#endif
     } else {
-        data_reg1 = data_regl;
-        data_reg2 = 0;
         data_regh = 0;
     }
     addr_regl = *args++;
-#if TARGET_LONG_BITS == 64
+#if defined(CONFIG_SOFTMMU)
+# if TARGET_LONG_BITS == 64
     addr_regh = *args++;
-# if defined(TCG_TARGET_WORDS_BIGENDIAN)
-    addr_reg1 = addr_regh;
-    addr_reg2 = addr_regl;
+#  if defined(TCG_TARGET_WORDS_BIGENDIAN)
     addr_memh = 0;
     addr_meml = 4;
-# else
-    addr_reg1 = addr_regl;
-    addr_reg2 = addr_regh;
+#  else
     addr_memh = 4;
     addr_meml = 0;
-# endif
-#else
-    addr_reg1 = addr_regl;
+#  endif
+# else
     addr_meml = 0;
-#endif
+# endif
     mem_index = *args;
     s_bits = opc;
+#endif
+
+    if (opc == 3) {
+#if defined(TCG_TARGET_WORDS_BIGENDIAN)
+        data_reg1 = data_regh;
+        data_reg2 = data_regl;
+#else
+        data_reg1 = data_regl;
+        data_reg2 = data_regh;
+#endif
+    } else {
+        data_reg1 = data_regl;
+        data_reg2 = 0;
+    }
 
 #if defined(CONFIG_SOFTMMU)
     tcg_out_opc_sa(s, OPC_SRL, TCG_REG_A0, addr_regl, TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
     tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_A0, TCG_REG_A0, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
     tcg_out_opc_reg(s, OPC_ADDU, TCG_REG_A0, TCG_REG_A0, TCG_AREG0);
     tcg_out_opc_imm(s, OPC_LW, TCG_REG_AT, TCG_REG_A0,
-                    offsetof(CPUState, tlb_table[mem_index][0].addr_write) + addr_meml);
+                    offsetof(CPUArchState, tlb_table[mem_index][0].addr_write) + addr_meml);
     tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_T0, TARGET_PAGE_MASK | ((1 << s_bits) - 1));
     tcg_out_opc_reg(s, OPC_AND, TCG_REG_T0, TCG_REG_T0, addr_regl);
 
 # if TARGET_LONG_BITS == 64
     label3_ptr = s->code_ptr;
-    tcg_out_opc_imm(s, OPC_BNE, TCG_REG_T0, TCG_REG_AT, 0);
+    tcg_out_opc_br(s, OPC_BNE, TCG_REG_T0, TCG_REG_AT);
     tcg_out_nop(s);
 
     tcg_out_opc_imm(s, OPC_LW, TCG_REG_AT, TCG_REG_A0,
-                    offsetof(CPUState, tlb_table[mem_index][0].addr_write) + addr_memh);
+                    offsetof(CPUArchState, tlb_table[mem_index][0].addr_write) + addr_memh);
 
     label1_ptr = s->code_ptr;
-    tcg_out_opc_imm(s, OPC_BEQ, addr_regh, TCG_REG_AT, 0);
+    tcg_out_opc_br(s, OPC_BEQ, addr_regh, TCG_REG_AT);
     tcg_out_nop(s);
 
     reloc_pc16(label3_ptr, (tcg_target_long) s->code_ptr);
 # else
     label1_ptr = s->code_ptr;
-    tcg_out_opc_imm(s, OPC_BEQ, TCG_REG_T0, TCG_REG_AT, 0);
+    tcg_out_opc_br(s, OPC_BEQ, TCG_REG_T0, TCG_REG_AT);
     tcg_out_nop(s);
 # endif
 
     /* slow path */
-    sp_args = TCG_REG_A0;
-    tcg_out_mov(s, sp_args++, addr_reg1);
+    arg_num = 0;
+    tcg_out_call_iarg_reg32(s, &arg_num, TCG_AREG0);
 # if TARGET_LONG_BITS == 64
-    tcg_out_mov(s, sp_args++, addr_reg2);
+    tcg_out_call_iarg_reg64(s, &arg_num, addr_regl, addr_regh);
+# else
+    tcg_out_call_iarg_reg32(s, &arg_num, addr_regl);
 # endif
     switch(opc) {
     case 0:
-        tcg_out_opc_imm(s, OPC_ANDI, sp_args++, data_reg1, 0xff);
+        tcg_out_call_iarg_reg8(s, &arg_num, data_regl);
         break;
     case 1:
-        tcg_out_opc_imm(s, OPC_ANDI, sp_args++, data_reg1, 0xffff);
+        tcg_out_call_iarg_reg16(s, &arg_num, data_regl);
         break;
     case 2:
-        tcg_out_mov(s, sp_args++, data_reg1);
+        tcg_out_call_iarg_reg32(s, &arg_num, data_regl);
         break;
     case 3:
-        sp_args = (sp_args + 1) & ~1;
-        tcg_out_mov(s, sp_args++, data_reg1);
-        tcg_out_mov(s, sp_args++, data_reg2);
+        tcg_out_call_iarg_reg64(s, &arg_num, data_regl, data_regh);
         break;
     default:
         tcg_abort();
     }
-    if (sp_args > TCG_REG_A3) {
-        /* Push mem_index on the stack */
-        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_AT, mem_index);
-        tcg_out_st(s, TCG_TYPE_I32, TCG_REG_AT, TCG_REG_SP, 16);
-    } else {
-        tcg_out_movi(s, TCG_TYPE_I32, sp_args, mem_index);
-    }
-
+    tcg_out_call_iarg_imm32(s, &arg_num, mem_index);
     tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_T9, (tcg_target_long)qemu_st_helpers[s_bits]);
     tcg_out_opc_reg(s, OPC_JALR, TCG_REG_RA, TCG_REG_T9, 0);
     tcg_out_nop(s);
 
     label2_ptr = s->code_ptr;
-    tcg_out_opc_imm(s, OPC_BEQ, TCG_REG_ZERO, TCG_REG_ZERO, 0);
+    tcg_out_opc_br(s, OPC_BEQ, TCG_REG_ZERO, TCG_REG_ZERO);
     tcg_out_nop(s);
 
     /* label1: fast path */
     reloc_pc16(label1_ptr, (tcg_target_long) s->code_ptr);
 
     tcg_out_opc_imm(s, OPC_LW, TCG_REG_A0, TCG_REG_A0,
-                    offsetof(CPUState, tlb_table[mem_index][0].addend) + addr_meml);
+                    offsetof(CPUArchState, tlb_table[mem_index][0].addend));
     tcg_out_opc_reg(s, OPC_ADDU, TCG_REG_A0, TCG_REG_A0, addr_regl);
+#else
+    if (GUEST_BASE == (int16_t)GUEST_BASE) {
+        tcg_out_opc_imm(s, OPC_ADDIU, TCG_REG_A0, addr_regl, GUEST_BASE);
+    } else {
+        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_A0, GUEST_BASE);
+        tcg_out_opc_reg(s, OPC_ADDU, TCG_REG_A0, TCG_REG_A0, addr_regl);
+    }
 
-    addr_reg1 = TCG_REG_A0;
 #endif
 
     switch(opc) {
     case 0:
-        tcg_out_opc_imm(s, OPC_SB, data_reg1, addr_reg1, 0);
+        tcg_out_opc_imm(s, OPC_SB, data_reg1, TCG_REG_A0, 0);
         break;
     case 1:
         if (TCG_NEED_BSWAP) {
-            tcg_out_bswap16(s, TCG_REG_T0, data_reg1);
-            tcg_out_opc_imm(s, OPC_SH, TCG_REG_T0, addr_reg1, 0);
+            tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_T0, data_reg1, 0xffff);
+            tcg_out_bswap16(s, TCG_REG_T0, TCG_REG_T0);
+            tcg_out_opc_imm(s, OPC_SH, TCG_REG_T0, TCG_REG_A0, 0);
         } else {
-            tcg_out_opc_imm(s, OPC_SH, data_reg1, addr_reg1, 0);
+            tcg_out_opc_imm(s, OPC_SH, data_reg1, TCG_REG_A0, 0);
         }
         break;
     case 2:
         if (TCG_NEED_BSWAP) {
             tcg_out_bswap32(s, TCG_REG_T0, data_reg1);
-            tcg_out_opc_imm(s, OPC_SW, TCG_REG_T0, addr_reg1, 0);
+            tcg_out_opc_imm(s, OPC_SW, TCG_REG_T0, TCG_REG_A0, 0);
         } else {
-            tcg_out_opc_imm(s, OPC_SW, data_reg1, addr_reg1, 0);
+            tcg_out_opc_imm(s, OPC_SW, data_reg1, TCG_REG_A0, 0);
         }
         break;
     case 3:
         if (TCG_NEED_BSWAP) {
             tcg_out_bswap32(s, TCG_REG_T0, data_reg2);
-            tcg_out_opc_imm(s, OPC_SW, TCG_REG_T0, addr_reg1, 0);
+            tcg_out_opc_imm(s, OPC_SW, TCG_REG_T0, TCG_REG_A0, 0);
             tcg_out_bswap32(s, TCG_REG_T0, data_reg1);
-            tcg_out_opc_imm(s, OPC_SW, TCG_REG_T0, addr_reg1, 4);
+            tcg_out_opc_imm(s, OPC_SW, TCG_REG_T0, TCG_REG_A0, 4);
         } else {
-            tcg_out_opc_imm(s, OPC_SW, data_reg1, addr_reg1, 0);
-            tcg_out_opc_imm(s, OPC_SW, data_reg2, addr_reg1, 4);
+            tcg_out_opc_imm(s, OPC_SW, data_reg1, TCG_REG_A0, 0);
+            tcg_out_opc_imm(s, OPC_SW, data_reg2, TCG_REG_A0, 4);
         }
         break;
     default:
@@ -1124,14 +1272,14 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
 
     case INDEX_op_mov_i32:
-        tcg_out_mov(s, args[0], args[1]);
+        tcg_out_mov(s, TCG_TYPE_I32, args[0], args[1]);
         break;
     case INDEX_op_movi_i32:
         tcg_out_movi(s, TCG_TYPE_I32, args[0], args[1]);
         break;
 
     case INDEX_op_ld8u_i32:
-       tcg_out_ldst(s, OPC_LBU, args[0], args[1], args[2]);
+        tcg_out_ldst(s, OPC_LBU, args[0], args[1], args[2]);
         break;
     case INDEX_op_ld8s_i32:
         tcg_out_ldst(s, OPC_LB, args[0], args[1], args[2]);
@@ -1175,7 +1323,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
              tcg_out_opc_reg(s, OPC_ADDU, args[1], args[3], args[5]);
         }
         tcg_out_opc_reg(s, OPC_ADDU, args[1], args[1], TCG_REG_T0);
-        tcg_out_mov(s, args[0], TCG_REG_AT);
+        tcg_out_mov(s, TCG_TYPE_I32, args[0], TCG_REG_AT);
         break;
     case INDEX_op_sub_i32:
         if (const_args[2]) {
@@ -1197,7 +1345,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
              tcg_out_opc_reg(s, OPC_SUBU, args[1], args[3], args[5]);
         }
         tcg_out_opc_reg(s, OPC_SUBU, args[1], args[1], TCG_REG_T0);
-        tcg_out_mov(s, args[0], TCG_REG_AT);
+        tcg_out_mov(s, TCG_TYPE_I32, args[0], TCG_REG_AT);
         break;
     case INDEX_op_mul_i32:
         tcg_out_opc_reg(s, OPC_MULT, 0, args[1], args[2]);
@@ -1239,8 +1387,11 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
             tcg_out_opc_reg(s, OPC_OR, args[0], args[1], args[2]);
         }
         break;
+    case INDEX_op_nor_i32:
+        tcg_out_opc_reg(s, OPC_NOR, args[0], args[1], args[2]);
+        break;
     case INDEX_op_not_i32:
-        tcg_out_opc_reg(s, OPC_NOR, args[0], args[1], args[1]);
+        tcg_out_opc_reg(s, OPC_NOR, args[0], TCG_REG_ZERO, args[1]);
         break;
     case INDEX_op_xor_i32:
         if (const_args[2]) {
@@ -1271,6 +1422,38 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
             tcg_out_opc_reg(s, OPC_SRLV, args[0], args[2], args[1]);
         }
         break;
+    case INDEX_op_rotl_i32:
+        if (const_args[2]) {
+            tcg_out_opc_sa(s, OPC_ROTR, args[0], args[1], 0x20 - args[2]);
+        } else {
+            tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_AT, 32);
+            tcg_out_opc_reg(s, OPC_SUBU, TCG_REG_AT, TCG_REG_AT, args[2]);
+            tcg_out_opc_reg(s, OPC_ROTRV, args[0], TCG_REG_AT, args[1]);
+        }
+        break;
+    case INDEX_op_rotr_i32:
+        if (const_args[2]) {
+            tcg_out_opc_sa(s, OPC_ROTR, args[0], args[1], args[2]);
+        } else {
+            tcg_out_opc_reg(s, OPC_ROTRV, args[0], args[2], args[1]);
+        }
+        break;
+
+    /* The bswap routines do not work on non-R2 CPU. In that case
+       we let TCG generating the corresponding code. */
+    case INDEX_op_bswap16_i32:
+        tcg_out_bswap16(s, args[0], args[1]);
+        break;
+    case INDEX_op_bswap32_i32:
+        tcg_out_bswap32(s, args[0], args[1]);
+        break;
+
+    case INDEX_op_ext8s_i32:
+        tcg_out_ext8s(s, args[0], args[1]);
+        break;
+    case INDEX_op_ext16s_i32:
+        tcg_out_ext16s(s, args[0], args[1]);
+        break;
 
     case INDEX_op_brcond_i32:
         tcg_out_brcond(s, args[2], args[0], args[1], args[3]);
@@ -1340,30 +1523,39 @@ static const TCGTargetOpDef mips_op_defs[] = {
     { INDEX_op_st16_i32, { "rZ", "r" } },
     { INDEX_op_st_i32, { "rZ", "r" } },
 
-    { INDEX_op_add_i32, { "r", "rZ", "rJZ" } },
+    { INDEX_op_add_i32, { "r", "rZ", "rJ" } },
     { INDEX_op_mul_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_mulu2_i32, { "r", "r", "rZ", "rZ" } },
     { INDEX_op_div_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_divu_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_rem_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_remu_i32, { "r", "rZ", "rZ" } },
-    { INDEX_op_sub_i32, { "r", "rZ", "rJZ" } },
+    { INDEX_op_sub_i32, { "r", "rZ", "rJ" } },
 
-    { INDEX_op_and_i32, { "r", "rZ", "rIZ" } },
+    { INDEX_op_and_i32, { "r", "rZ", "rI" } },
+    { INDEX_op_nor_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_not_i32, { "r", "rZ" } },
     { INDEX_op_or_i32, { "r", "rZ", "rIZ" } },
     { INDEX_op_xor_i32, { "r", "rZ", "rIZ" } },
 
-    { INDEX_op_shl_i32, { "r", "rZ", "riZ" } },
-    { INDEX_op_shr_i32, { "r", "rZ", "riZ" } },
-    { INDEX_op_sar_i32, { "r", "rZ", "riZ" } },
+    { INDEX_op_shl_i32, { "r", "rZ", "ri" } },
+    { INDEX_op_shr_i32, { "r", "rZ", "ri" } },
+    { INDEX_op_sar_i32, { "r", "rZ", "ri" } },
+    { INDEX_op_rotr_i32, { "r", "rZ", "ri" } },
+    { INDEX_op_rotl_i32, { "r", "rZ", "ri" } },
+
+    { INDEX_op_bswap16_i32, { "r", "r" } },
+    { INDEX_op_bswap32_i32, { "r", "r" } },
+
+    { INDEX_op_ext8s_i32, { "r", "rZ" } },
+    { INDEX_op_ext16s_i32, { "r", "rZ" } },
 
     { INDEX_op_brcond_i32, { "rZ", "rZ" } },
     { INDEX_op_setcond_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_setcond2_i32, { "r", "rZ", "rZ", "rZ", "rZ" } },
 
-    { INDEX_op_add2_i32, { "r", "r", "rZ", "rZ", "rJZ", "rJZ" } },
-    { INDEX_op_sub2_i32, { "r", "r", "rZ", "rZ", "rJZ", "rJZ" } },
+    { INDEX_op_add2_i32, { "r", "r", "rZ", "rZ", "rJ", "rJ" } },
+    { INDEX_op_sub2_i32, { "r", "r", "rZ", "rZ", "rJ", "rJ" } },
     { INDEX_op_brcond2_i32, { "rZ", "rZ", "rZ", "rZ" } },
 
 #if TARGET_LONG_BITS == 32
@@ -1395,7 +1587,7 @@ static const TCGTargetOpDef mips_op_defs[] = {
 };
 
 static int tcg_target_callee_save_regs[] = {
-    TCG_REG_S0,
+    TCG_REG_S0,       /* used for the global env (TCG_AREG0) */
     TCG_REG_S1,
     TCG_REG_S2,
     TCG_REG_S3,
@@ -1403,22 +1595,24 @@ static int tcg_target_callee_save_regs[] = {
     TCG_REG_S5,
     TCG_REG_S6,
     TCG_REG_S7,
-    TCG_REG_GP,
-    /* TCG_REG_FP, */ /* currently used for the global env, so np
-                         need to save */
+    TCG_REG_FP,
     TCG_REG_RA,       /* should be last for ABI compliance */
 };
 
 /* Generate global QEMU prologue and epilogue code */
-void tcg_target_qemu_prologue(TCGContext *s)
+static void tcg_target_qemu_prologue(TCGContext *s)
 {
     int i, frame_size;
 
-    /* reserve some stack space */
+    /* reserve some stack space, also for TCG temps. */
     frame_size = ARRAY_SIZE(tcg_target_callee_save_regs) * 4
-                 + TCG_STATIC_CALL_ARGS_SIZE;
+                 + TCG_STATIC_CALL_ARGS_SIZE
+                 + CPU_TEMP_BUF_NLONGS * sizeof(long);
     frame_size = (frame_size + TCG_TARGET_STACK_ALIGN - 1) &
                  ~(TCG_TARGET_STACK_ALIGN - 1);
+    tcg_set_frame(s, TCG_REG_SP, ARRAY_SIZE(tcg_target_callee_save_regs) * 4
+                  + TCG_STATIC_CALL_ARGS_SIZE,
+                  CPU_TEMP_BUF_NLONGS * sizeof(long));
 
     /* TB prologue */
     tcg_out_addi(s, TCG_REG_SP, -frame_size);
@@ -1428,8 +1622,8 @@ void tcg_target_qemu_prologue(TCGContext *s)
     }
 
     /* Call generated code */
-    tcg_out_opc_reg(s, OPC_JR, 0, TCG_REG_A0, 0);
-    tcg_out_nop(s);
+    tcg_out_opc_reg(s, OPC_JR, 0, tcg_target_call_iarg_regs[1], 0);
+    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
     tb_ret_addr = s->code_ptr;
 
     /* TB epilogue */
@@ -1442,7 +1636,7 @@ void tcg_target_qemu_prologue(TCGContext *s)
     tcg_out_addi(s, TCG_REG_SP, frame_size);
 }
 
-void tcg_target_init(TCGContext *s)
+static void tcg_target_init(TCGContext *s)
 {
     tcg_regset_set(tcg_target_available_regs[TCG_TYPE_I32], 0xffffffff);
     tcg_regset_set(tcg_target_call_clobber_regs,
@@ -1470,6 +1664,7 @@ void tcg_target_init(TCGContext *s)
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_T0);   /* internal use */
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_RA);   /* return address */
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_SP);   /* stack pointer */
+    tcg_regset_set_reg(s->reserved_regs, TCG_REG_GP);   /* global pointer */
 
     tcg_add_target_add_op_defs(mips_op_defs);
 }