OpenXiangShan · Ziyue-Zhang · Mar 21, 2024 · Mar 18, 2024 · Mar 18, 2024 · Mar 20, 2024
diff --git a/include/rtl/fp.h b/include/rtl/fp.h
@@ -17,6 +17,7 @@
 #define __RTL_FP_H__
 
 enum {
+  FPCALL_W8,
   FPCALL_W16,
   FPCALL_W32,
   FPCALL_W64,

diff --git a/src/engine/interpreter/fp.c b/src/engine/interpreter/fp.c
@@ -145,8 +145,19 @@ def_rtl(vfpcall, rtlreg_t *dest, const rtlreg_t *src1, const rtlreg_t *src2, uin
   isa_fp_csr_check();
 
   softfloat_roundingMode = isa_fp_get_frm();
-
-  if (w == FPCALL_W16) {
+  if (w == FPCALL_W8) {
+    // An 8-bit element can only hold an int/uint value;
+    // a floating-point value needs at least 16 bits,
+    // so the source operand must be an int/uint.
+    int8_t isrc1 = *(int8_t*)(src1);
+    // The variables below are currently unused.
+    // int8_t isrc2 = *(int8_t*)(src2);
+    // uint8_t usrc1 = *(uint8_t*)(src1);
+    // uint8_t usrc2 = *(uint8_t*)(src2);
+    switch (op) {
+      case FPCALL_SToDF: *dest = i32_to_f16((int32_t)(int8_t)isrc1).v; break;
+    }
+  } else if (w == FPCALL_W16) {
     float16_t fsrc1 = rtlToF16(*src1);
     float16_t fsrc2 = rtlToF16(*src2);
     switch (op) {
@@ -200,11 +211,7 @@ def_rtl(vfpcall, rtlreg_t *dest, const rtlreg_t *src1, const rtlreg_t *src2, uin
       case FPCALL_FToDUT: *dest = f16_to_ui32(fsrc1, softfloat_round_minMag, true); break;
       case FPCALL_FToDST: *dest = f16_to_i32(fsrc1, softfloat_round_minMag, true); break;
       case FPCALL_UToDF: *dest = ui32_to_f32(fsrc1.v).v; break;
-      case FPCALL_SToDF: 
-          if ((fsrc1.v & ~0xffffULL) == 0) *dest = i32_to_f32((int32_t)(int16_t)fsrc1.v).v;
-          else if ((fsrc1.v & ~0xffULL) == 0) *dest = i32_to_f32((int32_t)(int8_t)fsrc1.v).v;
-          else *dest = i32_to_f32(fsrc1.v).v;
-          break;
+      case FPCALL_SToDF: *dest = i32_to_f32((int32_t)(int16_t)fsrc1.v).v; break;
       case FPCALL_FToDF: *dest = f16_to_f32(fsrc1).v; break;
 
       case FPCALL_DFToU: *dest = f16_to_ui8(fsrc1, softfloat_roundingMode, true); break;
@@ -282,11 +289,7 @@ def_rtl(vfpcall, rtlreg_t *dest, const rtlreg_t *src1, const rtlreg_t *src2, uin
       case FPCALL_FToDUT: *dest = f32_to_ui64(fsrc1, softfloat_round_minMag, true); break;
       case FPCALL_FToDST: *dest = f32_to_i64(fsrc1, softfloat_round_minMag, true); break;
       case FPCALL_UToDF: *dest = ui32_to_f64(fsrc1.v).v; break;
-      case FPCALL_SToDF:
-          if ((fsrc1.v & ~0xffffULL ) == 0) *dest = i32_to_f64((int32_t)(int16_t)fsrc1.v).v;
-          else if ((fsrc1.v & ~0xffULL ) == 0) *dest = i32_to_f64((int32_t)(int8_t)fsrc1.v).v;
-          else *dest = i32_to_f64(fsrc1.v).v;
-          break;
+      case FPCALL_SToDF: *dest = i32_to_f64(fsrc1.v).v; break;
       case FPCALL_FToDF: *dest = f32_to_f64(fsrc1).v; break;
 
       case FPCALL_DFToU: *dest = f32_to_ui16(fsrc1, softfloat_roundingMode, true); break;

diff --git a/src/isa/riscv64/instr/rvv/vcompute.h b/src/isa/riscv64/instr/rvv/vcompute.h
@@ -406,13 +406,16 @@ def_EHelper(vfirst) {
 }
 
 def_EHelper(vmsbf) {
+  // The vmsbf instruction will raise an illegal instruction exception if vstart is non-zero.
   if(vstart->val != 0)
-    check_vstart_ignore(s);
+    longjmp_exception(EX_II);
 
   bool first_one = false;
   for(int idx = vstart->val; idx < vl->val; idx ++) {
     rtlreg_t mask = get_mask(0, idx, vtype->vsew, vtype->vlmul);
     if(s->vm == 0 && mask == 0) {
+      // Masking by v0 is enabled, but this element is not selected by v0:
+      // if vma is set, write 1; either way, skip to the next element.
       if (RVV_AGNOSTIC) {
         if (vtype->vma) {
           set_mask(id_dest->reg, idx, 1, vtype->vsew, vtype->vlmul);
@@ -421,6 +424,10 @@ def_EHelper(vmsbf) {
       continue;
     }
 
+    // s->vm == 1: the v0 mask is not needed
+    // or
+    // s->vm == 0 && mask == 1: this element is selected by v0
+
     *s0 = get_mask(id_src2->reg, idx, vtype->vsew, vtype->vlmul);
     *s0 &= 1;
 
@@ -434,13 +441,7 @@ def_EHelper(vmsbf) {
       set_mask(id_dest->reg, idx, 1, vtype->vsew, vtype->vlmul);
     }
   }
-  // If there is no set bit in the active element of the source vector,
-  // all active elements in the target are written to 1.
-  if (!first_one) {
-    for(int idx = vstart->val; idx < vl->val; idx ++) {
-      set_mask(id_dest->reg, idx, 1, vtype->vsew, vtype->vlmul);
-    }
-  }
+  /* The tail elements in the destination mask register are updated under a tail-agnostic policy. */
   if (RVV_AGNOSTIC) {
     for (int idx = vl->val; idx < VLEN; idx++) {
       set_mask(id_dest->reg, idx, 1, vtype->vsew, vtype->vlmul);

diff --git a/src/isa/riscv64/instr/rvv/vcompute_impl.c b/src/isa/riscv64/instr/rvv/vcompute_impl.c
@@ -480,7 +480,12 @@ void floating_arthimetic_instr(int opcode, int is_signed, int widening, int dest
   word_t FPCALL_TYPE = FPCALL_W64;
   // fpcall type
   switch (vtype->vsew) {
-    case 0 : panic("f8 not supported"); break;
+    case 0 :
+      switch (widening) {
+        case vdNarrow : FPCALL_TYPE = FPCALL_W16; break;
+        case vdWidening : FPCALL_TYPE = FPCALL_W8; break;
+      }
+      break;
     case 1 : 
       switch (widening) {
         case vsdWidening : FPCALL_TYPE = FPCALL_W16_to_32; break;

diff --git a/src/isa/riscv64/instr/rvv/vldst_impl.c b/src/isa/riscv64/instr/rvv/vldst_impl.c
@@ -46,8 +46,10 @@ void vld(int mode, int is_signed, Decode *s, int mmu_mode) {
   if(check_vstart_ignore(s)) return;
   word_t idx;
   uint64_t nf, fn, vl_val, base_addr, vd, addr;
-  int eew, emul, stride, is_stride;
+  int eew, emul, emul_coding, stride, is_stride;
 
+  // s->v_width is the width of one element in bytes;
+  // eew is the same width expressed in the vsew-style encoding.
   eew = 0;
   switch(s->v_width) {
     case 1: eew = 0; break;
@@ -56,10 +58,10 @@ void vld(int mode, int is_signed, Decode *s, int mmu_mode) {
     case 8: eew = 3; break;
     default: break;
   }
-  emul = vtype->vlmul > 4 ? vtype->vlmul - 8 + eew - vtype->vsew : vtype->vlmul + eew - vtype->vsew;
-  isa_emul_check(mode == MODE_MASK ? 1 : emul, 1);
-  emul = emul < 0 ? 0 : emul;
-  emul = 1 << emul;
+  emul_coding = vtype->vlmul > 4 ? vtype->vlmul - 8 + eew - vtype->vsew : vtype->vlmul + eew - vtype->vsew;
+  isa_emul_check(mode == MODE_MASK ? 1 : emul_coding, 1);
+  emul_coding = emul_coding < 0 ? 0 : emul_coding;
+  emul = 1 << emul_coding;
 
   if (mode == MODE_STRIDED) {
     stride = id_src2->val;
@@ -82,24 +84,24 @@ void vld(int mode, int is_signed, Decode *s, int mmu_mode) {
       if (RVV_AGNOSTIC && vtype->vma) {
         tmp_reg[1] = (uint64_t) -1;
         for (fn = 0; fn < nf; fn++) {
-          set_vreg(vd + fn * emul, idx, tmp_reg[1], eew, vtype->vlmul, mode == MODE_MASK ? 0 : 1);
+          set_vreg(vd + fn * emul, idx, tmp_reg[1], eew, emul_coding, mode == MODE_MASK ? 0 : 1);
         }
       }
       continue;
     }
     for (fn = 0; fn < nf; fn++) {
       addr = base_addr + idx * stride + (idx * nf * is_stride + fn) * s->v_width;
       rtl_lm(s, &tmp_reg[1], &addr, 0, s->v_width, mmu_mode);
-      set_vreg(vd + fn * emul, idx, tmp_reg[1], eew, vtype->vlmul, mode == MODE_MASK ? 0 : 1);
+      set_vreg(vd + fn * emul, idx, tmp_reg[1], eew, emul_coding, mode == MODE_MASK ? 0 : 1);
     }
   }
 
   if (RVV_AGNOSTIC && (mode == MODE_MASK || vtype->vta)) {   // set tail of vector register to 1
-    int vlmax = mode == MODE_MASK ? VLEN / 8 : get_vlen_max(eew, vtype->vlmul, 0);
+    int vlmax =  mode == MODE_MASK ? VLEN / 8 : get_vlen_max(eew, emul_coding, 0);
     for(idx = vl_val; idx < vlmax; idx++) {
       tmp_reg[1] = (uint64_t) -1;
       for (fn = 0; fn < nf; fn++) {
-        set_vreg(vd + fn * emul, idx, tmp_reg[1], eew, vtype->vlmul, mode == MODE_MASK ? 0 : 1);
+        set_vreg(vd + fn * emul, idx, tmp_reg[1], eew, emul_coding, mode == MODE_MASK ? 0 : 1);
       }
     }
   }