ARM FP16 support

author Paul Brook <paul@codesourcery.com>

Thu, 19 Nov 2009 16:45:20 +0000 (16:45 +0000)

committer Paul Brook <paul@codesourcery.com>

Thu, 19 Nov 2009 16:45:20 +0000 (16:45 +0000)
author Paul Brook <paul@codesourcery.com>
Thu, 19 Nov 2009 16:45:20 +0000 (16:45 +0000)
committer Paul Brook <paul@codesourcery.com>
Thu, 19 Nov 2009 16:45:20 +0000 (16:45 +0000)
diff --git a/fpu/softfloat.c b/fpu/softfloat.c

index 4d58744c87b6582363603c0162f19d18c60cd97e..395f9b139e70ebcaa3ae43082412fef899188bbb 100644 (file)
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -2457,6 +2457,144 @@ float32 float64_to_float32( float64 a STATUS_PARAM )
  
  }
  
+
+/*----------------------------------------------------------------------------
+| Builds a half-precision floating-point value from the sign `zSign', the
+| exponent `zExp', and the significand `zSig'.  The sign is moved to bit 15
+| and the exponent to bit 10; the significand is used as given.  The three
+| pieces are combined by addition rather than by OR, so any bit of `zSig' at
+| or above bit 10 (its integer portion) carries into the exponent field.
+| A complete, normalized significand has an integer portion of 1, so callers
+| passing one must supply a `zExp' that is 1 less than the exponent they want
+| in the result.
+*----------------------------------------------------------------------------*/
+static bits16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
+{
+    bits32 signField = ((bits32)zSign) << 15;
+    bits32 expField = ((bits32)zExp) << 10;
+
+    /* Addition (not OR) is deliberate: see the carry rule described above.  */
+    return signField + expField + zSig;
+}
+
+/* Half precision floats come in two formats: standard IEEE and "ARM" format.
+   The latter gains extra exponent range by omitting the NaN/Inf encodings.
+
+   float16_to_float32 widens the 16-bit half-precision pattern `a' to single
+   precision.  When `ieee' is nonzero an all-ones exponent (0x1f) is decoded
+   as infinity or NaN; when `ieee' is zero that exponent is not special and
+   the value falls through to the ordinary-number path below.  Zero and
+   subnormal inputs (exponent field 0) are handled separately; subnormals are
+   normalized before being repacked.  */
+  
+float32 float16_to_float32( bits16 a, flag ieee STATUS_PARAM )
+{
+    flag aSign;
+    int16 aExp;
+    bits32 aSig;
+
+    aSign = a >> 15;            /* bit 15: sign */
+    aExp = (a >> 10) & 0x1f;    /* bits 14..10: exponent field */
+    aSig = a & 0x3ff;           /* bits 9..0: fraction field */
+
+    if (aExp == 0x1f && ieee) { /* Inf/NaN encoding, IEEE format only */
+        if (aSig) {
+            /* Make sure correct exceptions are raised.
+               NOTE(review): `a' is the 16-bit half-precision pattern, while
+               float32ToCommonNaN() is a single-precision routine whose
+               definition is not visible here -- confirm that it can actually
+               recognise a signaling half-precision NaN from this argument.  */
+            float32ToCommonNaN(a STATUS_VAR);
+            aSig |= 0x200;      /* set the top fraction bit (presumably to quiet the NaN) */
+        }
+        return packFloat32(aSign, 0xff, aSig << 13); /* 10-bit fraction moved up 13 places */
+    }
+    if (aExp == 0) {            /* zero or subnormal input */
+        int8 shiftCount;
+
+        if (aSig == 0) {
+            return packFloat32(aSign, 0, 0); /* zero, sign preserved */
+        }
+
+        shiftCount = countLeadingZeros32( aSig ) - 21; /* distance that brings the leading 1 to bit 10 */
+        aSig = aSig << shiftCount; /* bit 10 is now set: an explicit integer bit */
+        aExp = -shiftCount;     /* compensate the exponent for the normalizing shift */
+    }
+    /* 0x70 is added to the half-precision exponent field to rebias it for
+       single precision.  On the subnormal path bit 10 of aSig lands on bit 23
+       after the shift; this relies on packFloat32 adding its fields (assumed
+       to behave like packFloat16 -- its definition is not visible here) so
+       that the integer bit carries into the exponent.  */
+    return packFloat32( aSign, aExp + 0x70, aSig << 13);
+}
+
+/*----------------------------------------------------------------------------
+| Returns the result of converting the single-precision floating-point value
+| `a' to half precision.  A nonzero `ieee' selects the standard IEEE half
+| format; zero selects the "ARM" format, which has no NaN/Inf encodings and
+| therefore one extra exponent value for ordinary numbers.  The result is
+| rounded according to the current rounding mode.
+|
+| A zero input (exponent and significand fields both zero, either sign) is
+| returned directly as a zero of the same sign without touching any exception
+| flag.  Single-precision subnormal inputs of either sign are not zero and go
+| through the normal rounding path.
+*----------------------------------------------------------------------------*/
+bits16 float32_to_float16( float32 a, flag ieee STATUS_PARAM)
+{
+    flag aSign;
+    int16 aExp;
+    bits32 aSig;
+    bits32 mask;
+    bits32 increment;
+    int8 roundingMode;
+
+    aSig = extractFloat32Frac( a );
+    aExp = extractFloat32Exp( a );
+    aSign = extractFloat32Sign( a );
+    if ( aExp == 0xFF ) {
+        /* Infinity or NaN input.
+           NOTE(review): this path is taken regardless of `ieee', although
+           the ARM format has no Inf/NaN encodings -- confirm the intended
+           result for the non-IEEE format.  */
+        if (aSig) {
+            /* Make sure correct exceptions are raised.  */
+            float32ToCommonNaN(a STATUS_VAR);
+            aSig |= 0x00400000; /* set the top fraction bit */
+        }
+        /* Keep the top 10 fraction bits under an all-ones exponent.  */
+        return packFloat16(aSign, 0x1f, aSig >> 13);
+    }
+    /* Zero is identified by a zero significand, not by its sign, so that
+       -0.0 takes this exit too and nonzero subnormals of either sign are
+       rounded below instead of being flushed to zero.  */
+    if (aExp == 0 && aSig == 0) {
+        return packFloat16(aSign, 0, 0);
+    }
+    /* Decimal point between bits 22 and 23.  */
+    aSig |= 0x00800000;
+    aExp -= 0x7f;               /* remove the single-precision bias */
+    /* `mask' selects the significand bits that will not fit in the result:
+       the low 13 bits for a normal result, more when the result will be a
+       half-precision subnormal (unbiased exponent below -14).  */
+    if (aExp < -14) {
+        mask = 0x007fffff;
+        if (aExp < -24) {
+            aExp = -25;         /* clamp: everything this small is handled alike */
+        } else {
+            mask >>= 24 + aExp;
+        }
+    } else {
+        mask = 0x00001fff;
+    }
+    if (aSig & mask) {
+        /* Nonzero bits will be discarded, so the result is inexact.
+           NOTE(review): the flag raised here is underflow even when the
+           result is in the normal range -- confirm whether inexact was
+           intended.  */
+        float_raise( float_flag_underflow STATUS_VAR );
+        roundingMode = STATUS(float_rounding_mode);
+        switch (roundingMode) {
+        case float_round_nearest_even:
+            increment = (mask + 1) >> 1;    /* half of the kept unit */
+            if ((aSig & mask) == increment) {
+                /* Exact tie: add a full unit only if the kept part is odd,
+                   otherwise add nothing, giving round-to-even.  */
+                increment = aSig & (increment << 1);
+            }
+            break;
+        case float_round_up:
+            increment = aSign ? 0 : mask;   /* bump positive values only */
+            break;
+        case float_round_down:
+            increment = aSign ? mask : 0;   /* bump negative magnitudes only */
+            break;
+        default: /* round_to_zero */
+            increment = 0;
+            break;
+        }
+        aSig += increment;
+        if (aSig >= 0x01000000) {
+            /* Rounding carried out of the significand: renormalize.  */
+            aSig >>= 1;
+            aExp++;
+        }
+    } else if (aExp < -14
+          && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
+        /* Exact but tiny result.  */
+        float_raise( float_flag_underflow STATUS_VAR);
+    }
+
+    if (ieee) {
+        if (aExp > 15) {
+            /* Too large for IEEE half precision: return infinity.  */
+            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
+            return packFloat16(aSign, 0x1f, 0);
+        }
+    } else {
+        if (aExp > 16) {
+            /* Too large for the ARM format: return the all-ones pattern.  */
+            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
+            return packFloat16(aSign, 0x1f, 0x3ff);
+        }
+    }
+    if (aExp < -24) {
+        /* Below the smallest subnormal even after rounding: signed zero.  */
+        return packFloat16(aSign, 0, 0);
+    }
+    if (aExp < -14) {
+        /* Subnormal result: shift the significand down to exponent -14.  */
+        aSig >>= -14 - aExp;
+        aExp = -14;
+    }
+    /* The integer bit of a normalized significand (bit 10 after the shift)
+       is added into the exponent field by packFloat16, hence 14 here.  */
+    return packFloat16(aSign, aExp + 14, aSig >> 13);
+}
+
  #ifdef FLOATX80
  
  /*----------------------------------------------------------------------------
diff --git a/fpu/softfloat.h b/fpu/softfloat.h

index 789179a6b2d15fbae08c7293c2efbd36795b56ba..636591b04cdb9043f94975c5f1f801aa66dcb558 100644 (file)
--- a/fpu/softfloat.h
+++ b/fpu/softfloat.h
@@ -242,6 +242,12 @@ floatx80 int64_to_floatx80( int64_t STATUS_PARAM );
  float128 int64_to_float128( int64_t STATUS_PARAM );
  #endif
  
+/*----------------------------------------------------------------------------
+| Software half-precision conversion routines.
+*----------------------------------------------------------------------------*/
+bits16 float32_to_float16( float32, flag STATUS_PARAM );
+float32 float16_to_float32( bits16, flag STATUS_PARAM );
+
  /*----------------------------------------------------------------------------
  | Software IEC/IEEE single-precision conversion routines.
  *----------------------------------------------------------------------------*/
diff --git a/target-arm/cpu.h b/target-arm/cpu.h

index 257e7aa7053066172c183da9d5790f8a13917829..6c0f9d61d15897e231bfb6c4d529c94a85bd07e0 100644 (file)
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -337,6 +337,7 @@ enum arm_features {
      ARM_FEATURE_THUMB2,
      ARM_FEATURE_MPU,    /* Only has Memory Protection Unit, not full MMU.  */
      ARM_FEATURE_VFP3,
+    ARM_FEATURE_VFP_FP16,
      ARM_FEATURE_NEON,
      ARM_FEATURE_DIV,
      ARM_FEATURE_M, /* Microcontroller profile.  */
diff --git a/target-arm/helper.c b/target-arm/helper.c

index 5e10533f1c82c909638f1cca4ecb74a4697390f9..cb95c6eaba689ade642ccc42d102c70efc93cb7c 100644 (file)
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -115,6 +115,7 @@ static void cpu_reset_model_id(CPUARMState *env, uint32_t id)
          set_feature(env, ARM_FEATURE_THUMB2);
          set_feature(env, ARM_FEATURE_VFP);
          set_feature(env, ARM_FEATURE_VFP3);
+        set_feature(env, ARM_FEATURE_VFP_FP16);
          set_feature(env, ARM_FEATURE_NEON);
          set_feature(env, ARM_FEATURE_THUMB2EE);
          set_feature(env, ARM_FEATURE_DIV);
@@ -2568,6 +2569,21 @@ VFP_CONV_FIX(uh, s, float32, uint16, u)
  VFP_CONV_FIX(ul, s, float32, uint32, u)
  #undef VFP_CONV_FIX
  
+/* Half precision conversions.  */
+float32 HELPER(vfp_fcvt_f16_to_f32)(uint32_t a, CPUState *env)
+{
+    /* IEEE half-precision format is used when FPSCR bit 26 is clear;
+       otherwise the alternative ("ARM") format is selected.  */
+    int ieee = !(env->vfp.xregs[ARM_VFP_FPSCR] & (1 << 26));
+
+    /* Convert using the VFP floating-point status held in the CPU state.  */
+    return float16_to_float32(a, ieee, &env->vfp.fp_status);
+}
+
+uint32_t HELPER(vfp_fcvt_f32_to_f16)(float32 a, CPUState *env)
+{
+    /* IEEE half-precision format is used when FPSCR bit 26 is clear;
+       otherwise the alternative ("ARM") format is selected.  */
+    int ieee = !(env->vfp.xregs[ARM_VFP_FPSCR] & (1 << 26));
+
+    /* Convert using the VFP floating-point status held in the CPU state.  */
+    return float32_to_float16(a, ieee, &env->vfp.fp_status);
+}
+
  float32 HELPER(recps_f32)(float32 a, float32 b, CPUState *env)
  {
      float_status *s = &env->vfp.fp_status;
diff --git a/target-arm/helpers.h b/target-arm/helpers.h

index 4d07e0cea3d3638bc92bc703827f6dfef81f9b57..dc25f185d503c35acd3cd4a32e60a66555616a6c 100644 (file)
--- a/target-arm/helpers.h
+++ b/target-arm/helpers.h
@@ -131,6 +131,9 @@ DEF_HELPER_3(vfp_sltod, f64, f64, i32, env)
  DEF_HELPER_3(vfp_uhtod, f64, f64, i32, env)
  DEF_HELPER_3(vfp_ultod, f64, f64, i32, env)
  
+DEF_HELPER_2(vfp_fcvt_f16_to_f32, f32, i32, env)
+DEF_HELPER_2(vfp_fcvt_f32_to_f16, i32, f32, env)
+
  DEF_HELPER_3(recps_f32, f32, f32, f32, env)
  DEF_HELPER_3(rsqrts_f32, f32, f32, f32, env)
  DEF_HELPER_2(recpe_f32, f32, f32, env)
diff --git a/target-arm/translate.c b/target-arm/translate.c

index 57845662ae25cbadd6d3ae241295d7eb067e3f22..a002f7e029beb3cfc5bc245d31fb23daa9282420 100644 (file)
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -2974,6 +2974,47 @@ static int disas_vfp_insn(CPUState * env, DisasContext *s, uint32_t insn)
                      case 3: /* sqrt */
                          gen_vfp_sqrt(dp);
                          break;
+                    case 4: /* vcvtb.f32.f16 */
+                        if (!arm_feature(env, ARM_FEATURE_VFP_FP16))
+                          return 1;
+                        tmp = gen_vfp_mrs();
+                        tcg_gen_ext16u_i32(tmp, tmp);
+                        gen_helper_vfp_fcvt_f16_to_f32(cpu_F0s, tmp, cpu_env);
+                        dead_tmp(tmp);
+                        break;
+                    case 5: /* vcvtt.f32.f16 */
+                        if (!arm_feature(env, ARM_FEATURE_VFP_FP16))
+                          return 1;
+                        tmp = gen_vfp_mrs();
+                        tcg_gen_shri_i32(tmp, tmp, 16);
+                        gen_helper_vfp_fcvt_f16_to_f32(cpu_F0s, tmp, cpu_env);
+                        dead_tmp(tmp);
+                        break;
+                    case 6: /* vcvtb.f16.f32 */
+                        if (!arm_feature(env, ARM_FEATURE_VFP_FP16))
+                          return 1;
+                        tmp = new_tmp();
+                        gen_helper_vfp_fcvt_f32_to_f16(tmp, cpu_F0s, cpu_env);
+                        gen_mov_F0_vreg(0, rd);
+                        tmp2 = gen_vfp_mrs();
+                        tcg_gen_andi_i32(tmp2, tmp2, 0xffff0000);
+                        tcg_gen_or_i32(tmp, tmp, tmp2);
+                        dead_tmp(tmp2);
+                        gen_vfp_msr(tmp);
+                        break;
+                    case 7: /* vcvtt.f16.f32 */
+                        if (!arm_feature(env, ARM_FEATURE_VFP_FP16))
+                          return 1;
+                        tmp = new_tmp();
+                        gen_helper_vfp_fcvt_f32_to_f16(tmp, cpu_F0s, cpu_env);
+                        tcg_gen_shli_i32(tmp, tmp, 16);
+                        gen_mov_F0_vreg(0, rd);
+                        tmp2 = gen_vfp_mrs();
+                        tcg_gen_ext16u_i32(tmp2, tmp2);
+                        tcg_gen_or_i32(tmp, tmp, tmp2);
+                        dead_tmp(tmp2);
+                        gen_vfp_msr(tmp);
+                        break;
                      case 8: /* cmp */
                          gen_vfp_cmp(dp);
                          break;
@@ -5328,6 +5369,50 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
                          neon_store_reg64(cpu_V0, rd + pass);
                      }
                      break;
+                case 44: /* VCVT.F16.F32 */
+                    if (!arm_feature(env, ARM_FEATURE_VFP_FP16))
+                      return 1;
+                    tmp = new_tmp();
+                    tmp2 = new_tmp();
+                    tcg_gen_ld_f32(cpu_F0s, cpu_env, neon_reg_offset(rm, 0));
+                    gen_helper_vfp_fcvt_f32_to_f16(tmp, cpu_F0s, cpu_env);
+                    tcg_gen_ld_f32(cpu_F0s, cpu_env, neon_reg_offset(rm, 1));
+                    gen_helper_vfp_fcvt_f32_to_f16(tmp2, cpu_F0s, cpu_env);
+                    tcg_gen_shli_i32(tmp2, tmp2, 16);
+                    tcg_gen_or_i32(tmp2, tmp2, tmp);
+                    tcg_gen_ld_f32(cpu_F0s, cpu_env, neon_reg_offset(rm, 2));
+                    gen_helper_vfp_fcvt_f32_to_f16(tmp, cpu_F0s, cpu_env);
+                    tcg_gen_ld_f32(cpu_F0s, cpu_env, neon_reg_offset(rm, 3));
+                    neon_store_reg(rd, 0, tmp2);
+                    tmp2 = new_tmp();
+                    gen_helper_vfp_fcvt_f32_to_f16(tmp2, cpu_F0s, cpu_env);
+                    tcg_gen_shli_i32(tmp2, tmp2, 16);
+                    tcg_gen_or_i32(tmp2, tmp2, tmp);
+                    neon_store_reg(rd, 1, tmp2);
+                    dead_tmp(tmp);
+                    break;
+                case 46: /* VCVT.F32.F16 */
+                    if (!arm_feature(env, ARM_FEATURE_VFP_FP16))
+                      return 1;
+                    tmp3 = new_tmp();
+                    tmp = neon_load_reg(rm, 0);
+                    tmp2 = neon_load_reg(rm, 1);
+                    tcg_gen_ext16u_i32(tmp3, tmp);
+                    gen_helper_vfp_fcvt_f16_to_f32(cpu_F0s, tmp3, cpu_env);
+                    tcg_gen_st_f32(cpu_F0s, cpu_env, neon_reg_offset(rd, 0));
+                    tcg_gen_shri_i32(tmp3, tmp, 16);
+                    gen_helper_vfp_fcvt_f16_to_f32(cpu_F0s, tmp3, cpu_env);
+                    tcg_gen_st_f32(cpu_F0s, cpu_env, neon_reg_offset(rd, 1));
+                    dead_tmp(tmp);
+                    tcg_gen_ext16u_i32(tmp3, tmp2);
+                    gen_helper_vfp_fcvt_f16_to_f32(cpu_F0s, tmp3, cpu_env);
+                    tcg_gen_st_f32(cpu_F0s, cpu_env, neon_reg_offset(rd, 2));
+                    tcg_gen_shri_i32(tmp3, tmp2, 16);
+                    gen_helper_vfp_fcvt_f16_to_f32(cpu_F0s, tmp3, cpu_env);
+                    tcg_gen_st_f32(cpu_F0s, cpu_env, neon_reg_offset(rd, 3));
+                    dead_tmp(tmp2);
+                    dead_tmp(tmp3);
+                    break;
                  default:
                  elementwise:
                      for (pass = 0; pass < (q ? 4 : 2); pass++) {
author	Paul Brook <paul@codesourcery.com>
	Thu, 19 Nov 2009 16:45:20 +0000 (16:45 +0000)
committer	Paul Brook <paul@codesourcery.com>
	Thu, 19 Nov 2009 16:45:20 +0000 (16:45 +0000)
fpu/softfloat.c		patch \| blob \| blame \| history
fpu/softfloat.h		patch \| blob \| blame \| history
target-arm/cpu.h		patch \| blob \| blame \| history
target-arm/helper.c		patch \| blob \| blame \| history
target-arm/helpers.h		patch \| blob \| blame \| history
target-arm/translate.c		patch \| blob \| blame \| history