From 600114988cb1beede13ce29dec65398f3e62e510 Mon Sep 17 00:00:00 2001 From: Paul Brook Date: Thu, 19 Nov 2009 16:45:20 +0000 Subject: [PATCH] ARM FP16 support Implement the ARM VFP half precision floating point extensions. Signed-off-by: Paul Brook --- fpu/softfloat.c | 138 +++++++++++++++++++++++++++++++++++++++++ fpu/softfloat.h | 6 ++ target-arm/cpu.h | 1 + target-arm/helper.c | 16 +++++ target-arm/helpers.h | 3 + target-arm/translate.c | 85 +++++++++++++++++++++++++ 6 files changed, 249 insertions(+) diff --git a/fpu/softfloat.c b/fpu/softfloat.c index 4d58744c8..395f9b139 100644 --- a/fpu/softfloat.c +++ b/fpu/softfloat.c @@ -2457,6 +2457,144 @@ float32 float64_to_float32( float64 a STATUS_PARAM ) } + +/*---------------------------------------------------------------------------- +| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a +| half-precision floating-point value, returning the result. After being +| shifted into the proper positions, the three fields are simply added +| together to form the result. This means that any integer portion of `zSig' +| will be added into the exponent. Since a properly normalized significand +| will have an integer portion equal to 1, the `zExp' input should be 1 less +| than the desired result exponent whenever `zSig' is a complete, normalized +| significand. +*----------------------------------------------------------------------------*/ +static bits16 packFloat16(flag zSign, int16 zExp, bits16 zSig) +{ + return (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig; +} + +/* Half precision floats come in two formats: standard IEEE and "ARM" format. + The latter gains extra exponent range by omitting the NaN/Inf encodings. */ + +float32 float16_to_float32( bits16 a, flag ieee STATUS_PARAM ) +{ + flag aSign; + int16 aExp; + bits32 aSig; + + aSign = a >> 15; + aExp = (a >> 10) & 0x1f; + aSig = a & 0x3ff; + + if (aExp == 0x1f && ieee) { + if (aSig) { + /* Make sure correct exceptions are raised. */ + float32ToCommonNaN(a STATUS_VAR); + aSig |= 0x200; + } + return packFloat32(aSign, 0xff, aSig << 13); + } + if (aExp == 0) { + int8 shiftCount; + + if (aSig == 0) { + return packFloat32(aSign, 0, 0); + } + + shiftCount = countLeadingZeros32( aSig ) - 21; + aSig = aSig << shiftCount; + aExp = -shiftCount; + } + return packFloat32( aSign, aExp + 0x70, aSig << 13); +} + +bits16 float32_to_float16( float32 a, flag ieee STATUS_PARAM) +{ + flag aSign; + int16 aExp; + bits32 aSig; + bits32 mask; + bits32 increment; + int8 roundingMode; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + if ( aExp == 0xFF ) { + if (aSig) { + /* Make sure correct exceptions are raised. */ + float32ToCommonNaN(a STATUS_VAR); + aSig |= 0x00400000; + } + return packFloat16(aSign, 0x1f, aSig >> 13); + } + if (aExp == 0 && aSign == 0) { + return packFloat16(aSign, 0, 0); + } + /* Decimal point between bits 22 and 23. */ + aSig |= 0x00800000; + aExp -= 0x7f; + if (aExp < -14) { + mask = 0x007fffff; + if (aExp < -24) { + aExp = -25; + } else { + mask >>= 24 + aExp; + } + } else { + mask = 0x00001fff; + } + if (aSig & mask) { + float_raise( float_flag_underflow STATUS_VAR ); + roundingMode = STATUS(float_rounding_mode); + switch (roundingMode) { + case float_round_nearest_even: + increment = (mask + 1) >> 1; + if ((aSig & mask) == increment) { + increment = aSig & (increment << 1); + } + break; + case float_round_up: + increment = aSign ? 0 : mask; + break; + case float_round_down: + increment = aSign ? mask : 0; + break; + default: /* round_to_zero */ + increment = 0; + break; + } + aSig += increment; + if (aSig >= 0x01000000) { + aSig >>= 1; + aExp++; + } + } else if (aExp < -14 + && STATUS(float_detect_tininess) == float_tininess_before_rounding) { + float_raise( float_flag_underflow STATUS_VAR); + } + + if (ieee) { + if (aExp > 15) { + float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); + return packFloat16(aSign, 0x1f, 0); + } + } else { + if (aExp > 16) { + float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); + return packFloat16(aSign, 0x1f, 0x3ff); + } + } + if (aExp < -24) { + return packFloat16(aSign, 0, 0); + } + if (aExp < -14) { + aSig >>= -14 - aExp; + aExp = -14; + } + return packFloat16(aSign, aExp + 14, aSig >> 13); +} + #ifdef FLOATX80 /*---------------------------------------------------------------------------- diff --git a/fpu/softfloat.h b/fpu/softfloat.h index 789179a6b..636591b04 100644 --- a/fpu/softfloat.h +++ b/fpu/softfloat.h @@ -242,6 +242,12 @@ floatx80 int64_to_floatx80( int64_t STATUS_PARAM ); float128 int64_to_float128( int64_t STATUS_PARAM ); #endif +/*---------------------------------------------------------------------------- +| Software half-precision conversion routines. +*----------------------------------------------------------------------------*/ +bits16 float32_to_float16( float32, flag STATUS_PARAM ); +float32 float16_to_float32( bits16, flag STATUS_PARAM ); + /*---------------------------------------------------------------------------- | Software IEC/IEEE single-precision conversion routines. *----------------------------------------------------------------------------*/ diff --git a/target-arm/cpu.h b/target-arm/cpu.h index 257e7aa70..6c0f9d61d 100644 --- a/target-arm/cpu.h +++ b/target-arm/cpu.h @@ -337,6 +337,7 @@ enum arm_features { ARM_FEATURE_THUMB2, ARM_FEATURE_MPU, /* Only has Memory Protection Unit, not full MMU. */ ARM_FEATURE_VFP3, + ARM_FEATURE_VFP_FP16, ARM_FEATURE_NEON, ARM_FEATURE_DIV, ARM_FEATURE_M, /* Microcontroller profile. */ diff --git a/target-arm/helper.c b/target-arm/helper.c index 5e10533f1..cb95c6eab 100644 --- a/target-arm/helper.c +++ b/target-arm/helper.c @@ -115,6 +115,7 @@ static void cpu_reset_model_id(CPUARMState *env, uint32_t id) set_feature(env, ARM_FEATURE_THUMB2); set_feature(env, ARM_FEATURE_VFP); set_feature(env, ARM_FEATURE_VFP3); + set_feature(env, ARM_FEATURE_VFP_FP16); set_feature(env, ARM_FEATURE_NEON); set_feature(env, ARM_FEATURE_THUMB2EE); set_feature(env, ARM_FEATURE_DIV); @@ -2568,6 +2569,21 @@ VFP_CONV_FIX(uh, s, float32, uint16, u) VFP_CONV_FIX(ul, s, float32, uint32, u) #undef VFP_CONV_FIX +/* Half precision conversions. */ +float32 HELPER(vfp_fcvt_f16_to_f32)(uint32_t a, CPUState *env) +{ + float_status *s = &env->vfp.fp_status; + int ieee = (env->vfp.xregs[ARM_VFP_FPSCR] & (1 << 26)) == 0; + return float16_to_float32(a, ieee, s); +} + +uint32_t HELPER(vfp_fcvt_f32_to_f16)(float32 a, CPUState *env) +{ + float_status *s = &env->vfp.fp_status; + int ieee = (env->vfp.xregs[ARM_VFP_FPSCR] & (1 << 26)) == 0; + return float32_to_float16(a, ieee, s); +} + float32 HELPER(recps_f32)(float32 a, float32 b, CPUState *env) { float_status *s = &env->vfp.fp_status; diff --git a/target-arm/helpers.h b/target-arm/helpers.h index 4d07e0cea..dc25f185d 100644 --- a/target-arm/helpers.h +++ b/target-arm/helpers.h @@ -131,6 +131,9 @@ DEF_HELPER_3(vfp_sltod, f64, f64, i32, env) DEF_HELPER_3(vfp_uhtod, f64, f64, i32, env) DEF_HELPER_3(vfp_ultod, f64, f64, i32, env) +DEF_HELPER_2(vfp_fcvt_f16_to_f32, f32, i32, env) +DEF_HELPER_2(vfp_fcvt_f32_to_f16, i32, f32, env) + DEF_HELPER_3(recps_f32, f32, f32, f32, env) DEF_HELPER_3(rsqrts_f32, f32, f32, f32, env) DEF_HELPER_2(recpe_f32, f32, f32, env) diff --git a/target-arm/translate.c b/target-arm/translate.c index 57845662a..a002f7e02 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -2974,6 +2974,47 @@ static int disas_vfp_insn(CPUState * env, DisasContext *s, uint32_t insn) case 3: /* sqrt */ gen_vfp_sqrt(dp); break; + case 4: /* vcvtb.f32.f16 */ + if (!arm_feature(env, ARM_FEATURE_VFP_FP16)) + return 1; + tmp = gen_vfp_mrs(); + tcg_gen_ext16u_i32(tmp, tmp); + gen_helper_vfp_fcvt_f16_to_f32(cpu_F0s, tmp, cpu_env); + dead_tmp(tmp); + break; + case 5: /* vcvtt.f32.f16 */ + if (!arm_feature(env, ARM_FEATURE_VFP_FP16)) + return 1; + tmp = gen_vfp_mrs(); + tcg_gen_shri_i32(tmp, tmp, 16); + gen_helper_vfp_fcvt_f16_to_f32(cpu_F0s, tmp, cpu_env); + dead_tmp(tmp); + break; + case 6: /* vcvtb.f16.f32 */ + if (!arm_feature(env, ARM_FEATURE_VFP_FP16)) + return 1; + tmp = new_tmp(); + gen_helper_vfp_fcvt_f32_to_f16(tmp, cpu_F0s, cpu_env); + gen_mov_F0_vreg(0, rd); + tmp2 = gen_vfp_mrs(); + tcg_gen_andi_i32(tmp2, tmp2, 0xffff0000); + tcg_gen_or_i32(tmp, tmp, tmp2); + dead_tmp(tmp2); + gen_vfp_msr(tmp); + break; + case 7: /* vcvtt.f16.f32 */ + if (!arm_feature(env, ARM_FEATURE_VFP_FP16)) + return 1; + tmp = new_tmp(); + gen_helper_vfp_fcvt_f32_to_f16(tmp, cpu_F0s, cpu_env); + tcg_gen_shli_i32(tmp, tmp, 16); + gen_mov_F0_vreg(0, rd); + tmp2 = gen_vfp_mrs(); + tcg_gen_ext16u_i32(tmp2, tmp2); + tcg_gen_or_i32(tmp, tmp, tmp2); + dead_tmp(tmp2); + gen_vfp_msr(tmp); + break; case 8: /* cmp */ gen_vfp_cmp(dp); break; @@ -5328,6 +5369,50 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) neon_store_reg64(cpu_V0, rd + pass); } break; + case 44: /* VCVT.F16.F32 */ + if (!arm_feature(env, ARM_FEATURE_VFP_FP16)) + return 1; + tmp = new_tmp(); + tmp2 = new_tmp(); + tcg_gen_ld_f32(cpu_F0s, cpu_env, neon_reg_offset(rm, 0)); + gen_helper_vfp_fcvt_f32_to_f16(tmp, cpu_F0s, cpu_env); + tcg_gen_ld_f32(cpu_F0s, cpu_env, neon_reg_offset(rm, 1)); + gen_helper_vfp_fcvt_f32_to_f16(tmp2, cpu_F0s, cpu_env); + tcg_gen_shli_i32(tmp2, tmp2, 16); + tcg_gen_or_i32(tmp2, tmp2, tmp); + tcg_gen_ld_f32(cpu_F0s, cpu_env, neon_reg_offset(rm, 2)); + gen_helper_vfp_fcvt_f32_to_f16(tmp, cpu_F0s, cpu_env); + tcg_gen_ld_f32(cpu_F0s, cpu_env, neon_reg_offset(rm, 3)); + neon_store_reg(rd, 0, tmp2); + tmp2 = new_tmp(); + gen_helper_vfp_fcvt_f32_to_f16(tmp2, cpu_F0s, cpu_env); + tcg_gen_shli_i32(tmp2, tmp2, 16); + tcg_gen_or_i32(tmp2, tmp2, tmp); + neon_store_reg(rd, 1, tmp2); + dead_tmp(tmp); + break; + case 46: /* VCVT.F32.F16 */ + if (!arm_feature(env, ARM_FEATURE_VFP_FP16)) + return 1; + tmp3 = new_tmp(); + tmp = neon_load_reg(rm, 0); + tmp2 = neon_load_reg(rm, 1); + tcg_gen_ext16u_i32(tmp3, tmp); + gen_helper_vfp_fcvt_f16_to_f32(cpu_F0s, tmp3, cpu_env); + tcg_gen_st_f32(cpu_F0s, cpu_env, neon_reg_offset(rd, 0)); + tcg_gen_shri_i32(tmp3, tmp, 16); + gen_helper_vfp_fcvt_f16_to_f32(cpu_F0s, tmp3, cpu_env); + tcg_gen_st_f32(cpu_F0s, cpu_env, neon_reg_offset(rd, 1)); + dead_tmp(tmp); + tcg_gen_ext16u_i32(tmp3, tmp2); + gen_helper_vfp_fcvt_f16_to_f32(cpu_F0s, tmp3, cpu_env); + tcg_gen_st_f32(cpu_F0s, cpu_env, neon_reg_offset(rd, 2)); + tcg_gen_shri_i32(tmp3, tmp2, 16); + gen_helper_vfp_fcvt_f16_to_f32(cpu_F0s, tmp3, cpu_env); + tcg_gen_st_f32(cpu_F0s, cpu_env, neon_reg_offset(rd, 3)); + dead_tmp(tmp2); + dead_tmp(tmp3); + break; default: elementwise: for (pass = 0; pass < (q ? 4 : 2); pass++) { -- 2.39.2