interpreter: add host-fp support

* It provides better performance than softfloat. However, exception and NaN handling are slow, so we disable them in host-fp. * FIXME: povray in SPEC 2006 gets a wrong answer
qshan · Mar 31, 2021 · 795dd3c · 795dd3c
1 parent dc35364
commit 795dd3c
Show file tree

Hide file tree

Showing 7 changed files with 244 additions and 49 deletions.
diff --git a/Kconfig b/Kconfig
@@ -181,6 +181,18 @@ endmenu
 source "src/memory/Kconfig"
 source "src/device/Kconfig"
 
+# Floating-point support.  FPU_SOFTFLOAT selects the softfloat-based
+# implementation (softfloat-fp.h); when it is off, the interpreter includes
+# host-fp.h instead and the Makefile adds -lm to LDFLAGS.
+menuconfig FPU
+  bool "Enable FPU"
+  default y
+  help
+    Enable floating point support.
+
+if FPU
+config FPU_SOFTFLOAT
+  bool "Use softfloat library"
+  default n
+endif # FPU
+
 menu "Miscellaneous"
 choice
   prompt "Host timer"

diff --git a/Makefile b/Makefile
@@ -51,7 +51,7 @@ else
 SHARE = 1
 endif
 
-ifeq ($(ENGINE),interpreter)
+ifdef CONFIG_FPU_SOFTFLOAT
 SOFTFLOAT = resource/softfloat/build/softfloat.a
 ifeq ($(ISA),riscv64)
 SPECIALIZE_TYPE = RISCV
@@ -66,6 +66,8 @@ $(SOFTFLOAT):
 	SPECIALIZE_TYPE=$(SPECIALIZE_TYPE) $(MAKE) -s -C resource/softfloat/
 
 .PHONY: $(SOFTFLOAT)
+else ifdef CONFIG_FPU
+LDFLAGS += -lm
 endif
 
 include $(NEMU_HOME)/scripts/git.mk

diff --git a/include/rtl/fp.h b/include/rtl/fp.h
@@ -8,6 +8,22 @@ enum {
   FPCALL_W80,
 };
 
+// ISA-independent rounding modes accepted by fp_set_rm().
+// The values are consecutive, starting at 0.
+enum {
+  FPCALL_RM_RNE = 0,  // round to nearest, ties to even
+  FPCALL_RM_RTZ = 1,  // round towards zero
+  FPCALL_RM_RDN = 2,  // round down (towards -INF)
+  FPCALL_RM_RUP = 3,  // round up (towards +INF)
+  FPCALL_RM_RMM = 4,  // round to nearest, ties to max magnitude
+};
+
+// ISA-independent floating-point exception flags, one bit each.
+// fp_get_exception() reports a bitwise OR of these values.
+enum {
+  FPCALL_EX_NX = 1 << 0,  // inexact
+  FPCALL_EX_UF = 1 << 1,  // underflow
+  FPCALL_EX_OF = 1 << 2,  // overflow
+  FPCALL_EX_DZ = 1 << 3,  // divide by zero
+  FPCALL_EX_NV = 1 << 4,  // invalid operation
+};
+
 enum {
   FPCALL_ADD,
   FPCALL_SUB,

diff --git a/src/engine/interpreter/fp.c b/src/engine/interpreter/fp.c
@@ -1,15 +1,11 @@
 #include <rtl/rtl.h>
-#include <softfloat.h>
-#include <specialize.h>
-#include <internals.h>
+#include MUXDEF(CONFIG_FPU_SOFTFLOAT, "softfloat-fp.h", "host-fp.h")
 
 #define BOX_MASK 0xFFFFFFFF00000000
-#define F32_SIGN ((uint64_t)1ul << 31)
-#define F64_SIGN ((uint64_t)1ul << 63)
 
+// Return the low 32 bits of r (the bits outside BOX_MASK).
+// With CONFIG_FPU_SOFTFLOAT the upper 32 bits must all be set, otherwise
+// defaultNaNF32UI is returned; without it the check is replaced by `true`,
+// so the low 32 bits are returned unconditionally.
+// NOTE(review): assumes MUXDEF(m, a, b) yields a when m is defined and b
+// otherwise -- confirm against the macro definition.
 static inline rtlreg_t unbox(rtlreg_t r) {
-  if ((r & BOX_MASK) == BOX_MASK) return r & ~BOX_MASK;
-  else return defaultNaNF32UI;
+  return MUXDEF(CONFIG_FPU_SOFTFLOAT, (r & BOX_MASK) == BOX_MASK, true)
+    ? (r & ~BOX_MASK) : defaultNaNF32UI;
 }
 
 static inline float32_t rtlToF32(rtlreg_t r) {
@@ -22,35 +18,11 @@ static inline float64_t rtlToF64(rtlreg_t r) {
   return f;
 }
 
-static inline float32_t f32_min(float32_t a, float32_t b){
-  bool less = f32_lt_quiet(a, b) || (f32_eq(a, b) && (a.v & F32_SIGN));
-  if(isNaNF32UI(a.v) && isNaNF32UI(b.v)) return rtlToF32(defaultNaNF32UI);
-  else return(less || isNaNF32UI(b.v) ? a : b);
-}
-
-static inline float32_t f32_max(float32_t a, float32_t b){
-  bool greater = f32_lt_quiet(b, a) || (f32_eq(b, a) && (b.v & F32_SIGN));
-  if(isNaNF32UI(a.v) && isNaNF32UI(b.v)) return rtlToF32(defaultNaNF32UI);
-  else return(greater || isNaNF32UI(b.v) ? a : b);
-}
-
-static inline float64_t f64_min(float64_t a, float64_t b){
-  bool less = f64_lt_quiet(a, b) || (f64_eq(a, b) && (a.v & F64_SIGN));
-  if(isNaNF64UI(a.v) && isNaNF64UI(b.v)) return rtlToF64(defaultNaNF64UI);
-  else return(less || isNaNF64UI(b.v) ? a : b);
-}
-
-static inline float64_t f64_max(float64_t a, float64_t b){
-  bool greater = f64_lt_quiet(b, a) || (f64_eq(b, a) && (b.v & F64_SIGN));
-  if(isNaNF64UI(a.v) && isNaNF64UI(b.v)) return rtlToF64(defaultNaNF64UI);
-  else return(greater || isNaNF64UI(b.v) ? a : b);
-}
-
 uint32_t isa_fp_get_rm(Decode *s);
-void isa_fp_update_ex_flags(Decode *s, uint32_t ex_flags);
+void isa_fp_set_ex(uint32_t ex);
 
 def_rtl(fpcall, rtlreg_t *dest, const rtlreg_t *src1, const rtlreg_t *src2, uint32_t cmd) {
-  softfloat_roundingMode = isa_fp_get_rm(s);
+  IFDEF(CONFIG_FPU_SOFTFLOAT, fp_set_rm(isa_fp_get_rm(s)));
   int w = FPCALL_W(cmd);
   int op = FPCALL_OP(cmd);
 
@@ -78,10 +50,10 @@ def_rtl(fpcall, rtlreg_t *dest, const rtlreg_t *src1, const rtlreg_t *src2, uint
       case FPCALL_I64ToF: *dest = i64_to_f32 (*src1).v; break;
       case FPCALL_U64ToF: *dest = ui64_to_f32(*src1).v; break;
 
-      case FPCALL_FToI32: *dest = f32_to_i32 (fsrc1, softfloat_roundingMode, true); break;
-      case FPCALL_FToU32: *dest = f32_to_ui32(fsrc1, softfloat_roundingMode, true); break;
-      case FPCALL_FToI64: *dest = f32_to_i64 (fsrc1, softfloat_roundingMode, true); break;
-      case FPCALL_FToU64: *dest = f32_to_ui64(fsrc1, softfloat_roundingMode, true); break;
+      case FPCALL_FToI32: *dest = my_f32_to_i32 (fsrc1); break;
+      case FPCALL_FToU32: *dest = my_f32_to_ui32(fsrc1); break;
+      case FPCALL_FToI64: *dest = my_f32_to_i64 (fsrc1); break;
+      case FPCALL_FToU64: *dest = my_f32_to_ui64(fsrc1); break;
       default: panic("op = %d not supported", op);
     }
   } else if (w == FPCALL_W64) {
@@ -108,19 +80,20 @@ def_rtl(fpcall, rtlreg_t *dest, const rtlreg_t *src1, const rtlreg_t *src2, uint
       case FPCALL_I64ToF: *dest = i64_to_f64 (*src1).v; break;
       case FPCALL_U64ToF: *dest = ui64_to_f64(*src1).v; break;
 
-      case FPCALL_FToI32: *dest = f64_to_i32 (fsrc1, softfloat_roundingMode, true); break;
-      case FPCALL_FToU32: *dest = f64_to_ui32(fsrc1, softfloat_roundingMode, true); break;
-      case FPCALL_FToI64: *dest = f64_to_i64 (fsrc1, softfloat_roundingMode, true); break;
-      case FPCALL_FToU64: *dest = f64_to_ui64(fsrc1, softfloat_roundingMode, true); break;
+      case FPCALL_FToI32: *dest = my_f64_to_i32 (fsrc1); break;
+      case FPCALL_FToU32: *dest = my_f64_to_ui32(fsrc1); break;
+      case FPCALL_FToI64: *dest = my_f64_to_i64 (fsrc1); break;
+      case FPCALL_FToU64: *dest = my_f64_to_ui64(fsrc1); break;
 
       case FPCALL_F32ToF64: *dest = f32_to_f64(rtlToF32(*src1)).v; break;
       case FPCALL_F64ToF32: *dest = f64_to_f32(fsrc1).v; break;
       default: panic("op = %d not supported", op);
     }
   }
 
-  if (softfloat_exceptionFlags) {
-    isa_fp_update_ex_flags(s, softfloat_exceptionFlags);
-    softfloat_exceptionFlags = 0;
+  uint32_t ex = MUXDEF(CONFIG_FPU_SOFTFLOAT, fp_get_exception(), 0);
+  if (ex) {
+    isa_fp_set_ex(ex);
+    fp_clear_exception();
   }
 }
diff --git a/src/engine/interpreter/host-fp.h b/src/engine/interpreter/host-fp.h
@@ -0,0 +1,88 @@
+// Include guard: the macro must be defined here, otherwise a second
+// inclusion would redefine every type and function in this header.
+#ifndef __HOSTFP_H__
+#define __HOSTFP_H__
+#include <rtl/fp.h>
+#include <math.h>
+#include <fenv.h>
+
+#define defaultNaNF32UI 0x7FC00000
+
+// Single-precision value: `v` is the raw 32-bit pattern, `f` the same
+// storage viewed as a host float.  float32() wraps a host float.
+typedef union { uint32_t v; float  f; } float32_t;
+static inline float32_t float32(float f) { float32_t r = { .f = f }; return r; }
+
+// Arithmetic is done directly on the host float: C operators for
+// add/sub/mul/div, and sqrtf/fmaf/fminf/fmaxf from <math.h> for the rest.
+static inline float32_t f32_add(float32_t a, float32_t b) { return float32(a.f + b.f); }
+static inline float32_t f32_sub(float32_t a, float32_t b) { return float32(a.f - b.f); }
+static inline float32_t f32_mul(float32_t a, float32_t b) { return float32(a.f * b.f); }
+static inline float32_t f32_div(float32_t a, float32_t b) { return float32(a.f / b.f); }
+static inline float32_t f32_sqrt(float32_t a) { return float32(sqrtf(a.f)); }
+static inline float32_t f32_mulAdd(float32_t a, float32_t b,
+    float32_t c) { return float32(fmaf(a.f, b.f, c.f)); }
+// NOTE(review): NaN and signed-zero results of fminf/fmaxf are whatever the
+// host libm produces -- confirm they match what the guest ISA expects.
+static inline float32_t f32_min(float32_t a, float32_t b) { return float32(fminf(a.f, b.f)); }
+static inline float32_t f32_max(float32_t a, float32_t b) { return float32(fmaxf(a.f, b.f)); }
+// Comparisons use the C relational operators on the host floats.
+static inline bool f32_le(float32_t a, float32_t b) { return a.f <= b.f; }
+static inline bool f32_lt(float32_t a, float32_t b) { return a.f <  b.f; }
+static inline bool f32_eq(float32_t a, float32_t b) { return a.f == b.f; }
+// Integer -> float: the register value is first cast to the named integer
+// type, then converted to float by float32()'s parameter.
+static inline float32_t i32_to_f32 (rtlreg_t a) { return float32((int32_t)a); }
+static inline float32_t ui32_to_f32(rtlreg_t a) { return float32((uint32_t)a); }
+static inline float32_t i64_to_f32 (rtlreg_t a) { return float32((int64_t)a); }
+static inline float32_t ui64_to_f32(rtlreg_t a) { return float32((uint64_t)a); }
+// Float -> integer via a plain C cast.
+// NOTE(review): a C cast truncates toward zero regardless of the current
+// rounding mode and is undefined for NaN or out-of-range values -- confirm
+// this is acceptable for the guest conversions.
+static inline int32_t  my_f32_to_i32 (float32_t a) { return (int32_t)a.f; }
+static inline uint32_t my_f32_to_ui32(float32_t a) { return (uint32_t)a.f; }
+static inline int64_t  my_f32_to_i64 (float32_t a) { return (int64_t)a.f; }
+static inline uint64_t my_f32_to_ui64(float32_t a) { return (uint64_t)a.f; }
+
+
+// Double-precision value: `v` is the raw 64-bit pattern, `f` the same
+// storage viewed as a host double.  float64() wraps a host double.
+typedef union { uint64_t v; double f; } float64_t;
+static inline float64_t float64(double f) { float64_t r = { .f = f }; return r; }
+
+// Arithmetic is done directly on the host double: C operators for
+// add/sub/mul/div, and sqrt/fma/fmin/fmax from <math.h> for the rest.
+static inline float64_t f64_add(float64_t a, float64_t b) { return float64(a.f + b.f); }
+static inline float64_t f64_sub(float64_t a, float64_t b) { return float64(a.f - b.f); }
+static inline float64_t f64_mul(float64_t a, float64_t b) { return float64(a.f * b.f); }
+static inline float64_t f64_div(float64_t a, float64_t b) { return float64(a.f / b.f); }
+static inline float64_t f64_sqrt(float64_t a) { return float64(sqrt(a.f)); }
+static inline float64_t f64_mulAdd(float64_t a, float64_t b,
+    float64_t c) { return float64(fma(a.f, b.f, c.f)); }
+// NOTE(review): NaN and signed-zero results of fmin/fmax are whatever the
+// host libm produces -- confirm they match what the guest ISA expects.
+static inline float64_t f64_min(float64_t a, float64_t b) { return float64(fmin(a.f, b.f)); }
+static inline float64_t f64_max(float64_t a, float64_t b) { return float64(fmax(a.f, b.f)); }
+// Comparisons use the C relational operators on the host doubles.
+static inline bool f64_le(float64_t a, float64_t b) { return a.f <= b.f; }
+static inline bool f64_lt(float64_t a, float64_t b) { return a.f <  b.f; }
+static inline bool f64_eq(float64_t a, float64_t b) { return a.f == b.f; }
+// Integer -> double: the register value is first cast to the named integer
+// type, then converted to double by float64()'s parameter.
+static inline float64_t i32_to_f64 (rtlreg_t a) { return float64((int32_t)a); }
+static inline float64_t ui32_to_f64(rtlreg_t a) { return float64((uint32_t)a); }
+static inline float64_t i64_to_f64 (rtlreg_t a) { return float64((int64_t)a); }
+static inline float64_t ui64_to_f64(rtlreg_t a) { return float64((uint64_t)a); }
+// Double -> integer via a plain C cast.
+// NOTE(review): a C cast truncates toward zero regardless of the current
+// rounding mode and is undefined for NaN or out-of-range values -- confirm
+// this is acceptable for the guest conversions.
+static inline int32_t  my_f64_to_i32 (float64_t a) { return (int32_t)a.f; }
+static inline uint32_t my_f64_to_ui32(float64_t a) { return (uint32_t)a.f; }
+static inline int64_t  my_f64_to_i64 (float64_t a) { return (int64_t)a.f; }
+static inline uint64_t my_f64_to_ui64(float64_t a) { return (uint64_t)a.f; }
+
+// Precision changes rely on the implicit float <-> double conversion that
+// happens when the value is passed to float64() / float32().
+static inline float64_t f32_to_f64(float32_t a) { return float64(a.f); }
+static inline float32_t f64_to_f32(float64_t a) { return float32(a.f); }
+
+
+// Program the host rounding mode from an FPCALL_RM_* value.
+// RMM is approximated by FE_TONEAREST.  An unknown value trips the
+// assertion; if assertions are compiled out, the host rounding mode is left
+// untouched instead of passing the unmapped value to fesetround().
+static inline void fp_set_rm(int rm) {
+  int host_rm;  // <fenv.h> encoding, kept apart from the FPCALL_RM_* input
+  switch (rm) {
+    case FPCALL_RM_RNE: host_rm = FE_TONEAREST; break;
+    case FPCALL_RM_RTZ: host_rm = FE_TOWARDZERO; break;
+    case FPCALL_RM_RDN: host_rm = FE_DOWNWARD; break;
+    case FPCALL_RM_RUP: host_rm = FE_UPWARD; break;
+    case FPCALL_RM_RMM: host_rm = FE_TONEAREST; break; // x86 does not support RMM
+    default: assert(0); return;
+  }
+  fesetround(host_rm);
+}
+
+// Read the host's raised floating-point exceptions and translate them to a
+// bitwise OR of FPCALL_EX_* flags.  The host flags are not cleared here.
+static inline uint32_t fp_get_exception(void) {
+  uint32_t ex = 0;
+  int host_ex = fetestexcept(FE_ALL_EXCEPT);  // fetestexcept() returns int
+  if (host_ex & FE_INEXACT  ) ex |= FPCALL_EX_NX;
+  if (host_ex & FE_UNDERFLOW) ex |= FPCALL_EX_UF;
+  if (host_ex & FE_OVERFLOW ) ex |= FPCALL_EX_OF;
+  if (host_ex & FE_DIVBYZERO) ex |= FPCALL_EX_DZ;
+  if (host_ex & FE_INVALID  ) ex |= FPCALL_EX_NV;
+  return ex;
+}
+
+// Clear every floating-point exception flag currently raised on the host.
+static inline void fp_clear_exception(void) {
+  feclearexcept(FE_ALL_EXCEPT);
+}
+
+#endif
diff --git a/src/engine/interpreter/softfloat-fp.h b/src/engine/interpreter/softfloat-fp.h
@@ -0,0 +1,88 @@
+#ifndef __SOFTFLOAT_FP_H__
+#define __SOFTFLOAT_FP_H__
+
+#include <softfloat.h>
+#include <specialize.h>
+#include <internals.h>
+
+#define F32_SIGN ((uint64_t)1ul << 31)
+#define F64_SIGN ((uint64_t)1ul << 63)
+
+static inline float32_t rtlToF32(rtlreg_t r);
+static inline float64_t rtlToF64(rtlreg_t r);
+
+// Minimum of two single-precision values.
+// `less` holds when a < b (quiet compare), or when a == b and bit 31 of a.v
+// (F32_SIGN) is set.  Two NaN operands yield defaultNaNF32UI; otherwise a is
+// returned when `less` holds or b is NaN, and b in every other case.
+static inline float32_t f32_min(float32_t a, float32_t b){
+  bool less = f32_lt_quiet(a, b) || (f32_eq(a, b) && (a.v & F32_SIGN));
+  if(isNaNF32UI(a.v) && isNaNF32UI(b.v)) return rtlToF32(defaultNaNF32UI);
+  else return(less || isNaNF32UI(b.v) ? a : b);
+}
+
+// Maximum of two single-precision values.
+// `greater` holds when b < a (quiet compare), or when a == b and bit 31 of
+// b.v (F32_SIGN) is set.  Two NaN operands yield defaultNaNF32UI; otherwise
+// a is returned when `greater` holds or b is NaN, and b in every other case.
+static inline float32_t f32_max(float32_t a, float32_t b){
+  bool greater = f32_lt_quiet(b, a) || (f32_eq(b, a) && (b.v & F32_SIGN));
+  if(isNaNF32UI(a.v) && isNaNF32UI(b.v)) return rtlToF32(defaultNaNF32UI);
+  else return(greater || isNaNF32UI(b.v) ? a : b);
+}
+
+// Minimum of two double-precision values.
+// `less` holds when a < b (quiet compare), or when a == b and bit 63 of a.v
+// (F64_SIGN) is set.  Two NaN operands yield defaultNaNF64UI; otherwise a is
+// returned when `less` holds or b is NaN, and b in every other case.
+static inline float64_t f64_min(float64_t a, float64_t b){
+  bool less = f64_lt_quiet(a, b) || (f64_eq(a, b) && (a.v & F64_SIGN));
+  if(isNaNF64UI(a.v) && isNaNF64UI(b.v)) return rtlToF64(defaultNaNF64UI);
+  else return(less || isNaNF64UI(b.v) ? a : b);
+}
+
+// Maximum of two double-precision values.
+// `greater` holds when b < a (quiet compare), or when a == b and bit 63 of
+// b.v (F64_SIGN) is set.  Two NaN operands yield defaultNaNF64UI; otherwise
+// a is returned when `greater` holds or b is NaN, and b in every other case.
+static inline float64_t f64_max(float64_t a, float64_t b){
+  bool greater = f64_lt_quiet(b, a) || (f64_eq(b, a) && (b.v & F64_SIGN));
+  if(isNaNF64UI(a.v) && isNaNF64UI(b.v)) return rtlToF64(defaultNaNF64UI);
+  else return(greater || isNaNF64UI(b.v) ? a : b);
+}
+
+// Float -> integer conversion wrappers with a one-argument signature.
+// Each forwards to the softfloat routine of the same name (without the my_
+// prefix), passing the current softfloat_roundingMode and `true`.
+// NOTE(review): the trailing `true` is presumably softfloat's "exact"
+// argument -- confirm against the softfloat API.
+static inline int32_t  my_f32_to_i32 (float32_t a) {
+  return f32_to_i32 (a, softfloat_roundingMode, true);
+}
+static inline uint32_t my_f32_to_ui32(float32_t a) {
+  return f32_to_ui32(a, softfloat_roundingMode, true);
+}
+static inline int64_t  my_f32_to_i64 (float32_t a) {
+  return f32_to_i64 (a, softfloat_roundingMode, true);
+}
+static inline uint64_t my_f32_to_ui64(float32_t a) {
+  return f32_to_ui64(a, softfloat_roundingMode, true);
+}
+static inline int32_t  my_f64_to_i32 (float64_t a) {
+  return f64_to_i32 (a, softfloat_roundingMode, true);
+}
+static inline uint32_t my_f64_to_ui32(float64_t a) {
+  return f64_to_ui32(a, softfloat_roundingMode, true);
+}
+static inline int64_t  my_f64_to_i64 (float64_t a) {
+  return f64_to_i64 (a, softfloat_roundingMode, true);
+}
+static inline uint64_t my_f64_to_ui64(float64_t a) {
+  return f64_to_ui64(a, softfloat_roundingMode, true);
+}
+
+// Select the softfloat rounding mode that corresponds to an FPCALL_RM_*
+// value.  Any other value trips assert(0) and leaves
+// softfloat_roundingMode unchanged.
+static inline void fp_set_rm(int rm) {
+  if (rm == FPCALL_RM_RNE) {
+    softfloat_roundingMode = softfloat_round_near_even;
+  } else if (rm == FPCALL_RM_RTZ) {
+    softfloat_roundingMode = softfloat_round_minMag;
+  } else if (rm == FPCALL_RM_RDN) {
+    softfloat_roundingMode = softfloat_round_min;
+  } else if (rm == FPCALL_RM_RUP) {
+    softfloat_roundingMode = softfloat_round_max;
+  } else if (rm == FPCALL_RM_RMM) {
+    softfloat_roundingMode = softfloat_round_near_maxMag;
+  } else {
+    assert(0);
+  }
+}
+
+// Translate softfloat_exceptionFlags into a bitwise OR of FPCALL_EX_*
+// flags.  The softfloat flags are not cleared here.
+static inline uint32_t fp_get_exception(void) {
+  uint32_t ex = 0;
+  uint32_t softfp_ex = softfloat_exceptionFlags;
+  if (softfp_ex & softfloat_flag_inexact  ) ex |= FPCALL_EX_NX;
+  if (softfp_ex & softfloat_flag_underflow) ex |= FPCALL_EX_UF;
+  if (softfp_ex & softfloat_flag_overflow ) ex |= FPCALL_EX_OF;
+  if (softfp_ex & softfloat_flag_infinite ) ex |= FPCALL_EX_DZ;  // "infinite" is reported as divide-by-zero
+  if (softfp_ex & softfloat_flag_invalid  ) ex |= FPCALL_EX_NV;
+  return ex;
+}
+
+// Reset softfloat's accumulated exception flags to zero.
+static inline void fp_clear_exception(void) {
+  softfloat_exceptionFlags = 0;
+}
+#endif
diff --git a/src/isa/riscv64/instr/fp.c b/src/isa/riscv64/instr/fp.c
@@ -1,5 +1,6 @@
 #include "../local-include/csr.h"
 #include "../local-include/intr.h"
+#include <rtl/fp.h>
 #include <cpu/cpu.h>
 
 bool fp_enable() {
@@ -12,16 +13,31 @@ void fp_set_dirty() {
 }
 
+// Resolve the rounding mode of the decoded instruction and translate it to
+// the FPCALL_RM_* encoding.  An rm field of 7 selects the value held in
+// fcsr->frm.  Values 0..4 map to RNE/RTZ/RDN/RUP/RMM; anything larger saves
+// the globals and calls longjmp_exception(EX_II).
 uint32_t isa_fp_get_rm(Decode *s) {
-  int rm = s->isa.instr.fp.rm;
+  uint32_t rm = s->isa.instr.fp.rm;
   if (rm == 7) { rm = fcsr->frm; }
-  if (rm <= 4) { return rm; }
+  if (rm <= 4) {
+    switch (rm) {
+      case 0: return FPCALL_RM_RNE;
+      case 1: return FPCALL_RM_RTZ;
+      case 2: return FPCALL_RM_RDN;
+      case 3: return FPCALL_RM_RUP;
+      case 4: return FPCALL_RM_RMM;
+      default: assert(0);  // not reachable: rm <= 4 was checked above
+    }
+  }
   else {
     save_globals(s);
     longjmp_exception(EX_II);
   }
 }
 
-void isa_fp_update_ex_flags(Decode *s, uint32_t ex_flags) {
-  fcsr->fflags.val |= ex_flags;
+// Accrue floating-point exceptions into fcsr->fflags.
+// `ex` is a bitwise OR of FPCALL_EX_* flags; each one is translated to its
+// fflags bit (NX=0x01, UF=0x02, OF=0x04, DZ=0x08, NV=0x10).  The bits are
+// OR-ed in, as the replaced isa_fp_update_ex_flags() did, so flags raised by
+// earlier instructions stay set.  The FP state is then marked dirty.
+void isa_fp_set_ex(uint32_t ex) {
+  uint32_t f = 0;
+  if (ex & FPCALL_EX_NX) f |= 0x01;
+  if (ex & FPCALL_EX_UF) f |= 0x02;
+  if (ex & FPCALL_EX_OF) f |= 0x04;
+  if (ex & FPCALL_EX_DZ) f |= 0x08;
+  if (ex & FPCALL_EX_NV) f |= 0x10;
+  fcsr->fflags.val |= f;
   fp_set_dirty();
 }