From ce19922a5e283a005c6299410ebfca5898de3687 Mon Sep 17 00:00:00 2001
From: Reoma Matsuo <matsuo@rsg.ci.i.u-tokyo.ac.jp>
Date: Thu, 19 Dec 2024 14:22:50 +0900
Subject: [PATCH 01/11] feat: integrate DivSqrter with fflags into the FP
 pipeline

---
 .../Src/FloatingPointUnit/FP32DivSqrter.sv    |  28 +--
 .../FP32DivSqrterWithFFlags.sv                | 206 ++++++++++++++++++
 .../Src/FloatingPointUnit/FPDivSqrtUnit.sv    |  12 +-
 Processor/Src/Makefiles/CoreSources.inc.mk    |   1 +
 Processor/Src/Makefiles/TestCommands.inc.mk   |  32 +--
 .../Pipeline/FPBackEnd/FPExecutionStage.sv    |   3 +-
 6 files changed, 245 insertions(+), 37 deletions(-)
 create mode 100644 Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv

diff --git a/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv b/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv
index 35df9ab8..759d3a50 100644
--- a/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv
+++ b/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv
@@ -3,9 +3,9 @@ import FPUTypes::*;
 module FP32DivSqrter (
 input
     logic clk, rst,
-    logic [31:0] lhs,
-    logic [31:0] rhs,
-    logic is_divide,
+    logic [31:0] input_lhs,
+    logic [31:0] input_rhs,
+    logic input_is_divide,
     logic req,
 output
     logic finished,
@@ -43,11 +43,11 @@ output
     FDivSqrtRegPath regData, nextData;
     logic [31:0] regResult, nextResult;
 
-    wire       lhs_sign = lhs[31];
-    wire       rhs_sign = rhs[31];
-    wire [7:0] lhs_expo = lhs[30:23];
-    wire [7:0] rhs_expo = rhs[30:23];
-    wire[22:0] lhs_mant = lhs[22:0];
-    wire[22:0] rhs_mant = rhs[22:0];
+    wire       lhs_sign = input_lhs[31];
+    wire       rhs_sign = input_rhs[31];
+    wire [7:0] lhs_expo = input_lhs[30:23];
+    wire [7:0] rhs_expo = input_rhs[30:23];
+    wire[22:0] lhs_mant = input_lhs[22:0];
+    wire[22:0] rhs_mant = input_rhs[22:0];
 
     // NaN handling
@@ -57,15 +57,15 @@ output
     wire rhs_is_inf  = rhs_expo == 8'hff & rhs_mant == 0;
     wire lhs_is_nan  = lhs_expo == 8'hff & lhs_mant != 0;
     wire rhs_is_nan  = rhs_expo == 8'hff & rhs_mant != 0;
-    wire lhs_is_neg  = lhs_sign & lhs != 32'h80000000;
-    wire res_is_nan  = is_divide ? lhs_is_nan | rhs_is_nan | (lhs_is_zero & rhs_is_zero) | (lhs_is_inf & rhs_is_inf)
+    wire lhs_is_neg  = lhs_sign & input_lhs != 32'h80000000;
+    wire res_is_nan  = input_is_divide ? lhs_is_nan | rhs_is_nan | (lhs_is_zero & rhs_is_zero) | (lhs_is_inf & rhs_is_inf)
                                  : lhs_is_nan | lhs_is_neg;
-    //wire[31:0]  nan  = is_divide ? lhs_is_nan ? lhs | 32'h00400000 : rhs_is_nan ? rhs | 32'h00400000 : 32'hffc00000
-    //                             : lhs_is_nan ? lhs | 32'h00400000 : 32'hffc00000; // qNaN
+    //wire[31:0]  nan  = input_is_divide ? lhs_is_nan ? input_lhs | 32'h00400000 : rhs_is_nan ? input_rhs | 32'h00400000 : 32'hffc00000
+    //                                   : lhs_is_nan ? input_lhs | 32'h00400000 : 32'hffc00000; // qNaN
     wire[31:0]   nan = 32'h7fc00000;
 
     // Preparation
-    wire       result_sign  = is_divide & (lhs_sign ^ rhs_sign);
+    wire       result_sign  = input_is_divide & (lhs_sign ^ rhs_sign);
     wire [9:0] v_lhs_expo   = lhs_expo == 0 ? -leading_zeros_count(lhs_mant) : { 2'b0, lhs_expo }; // virtual exponent (ignores subnormals, but is biased)
     wire [9:0] v_rhs_expo   = rhs_expo == 0 ? -leading_zeros_count(rhs_mant) : { 2'b0, rhs_expo }; // virtual exponent (ignores subnormals, but is biased)
     wire[23:0] v_lhs_mant = lhs_expo == 0 ? { lhs_mant, 1'b0 } << leading_zeros_count(lhs_mant) : { 1'b1, lhs_mant };
@@ -146,10 +146,10 @@ output
             nextData.v_rhs_mant = v_rhs_mant;
             nextData.result_sign = result_sign;
             nextData.lhs_sign = lhs_sign;
-            nextData.is_divide = is_divide;
+            nextData.is_divide = input_is_divide;
             nextData.res_is_nan = res_is_nan;
-            nextData.res_is_inf = is_divide ? (lhs_is_inf | rhs_is_zero) : (!lhs_sign & lhs_is_inf);
-            nextData.res_is_zero = is_divide ? (lhs_is_zero | rhs_is_inf) : lhs_is_zero;
+            nextData.res_is_inf = input_is_divide ? (lhs_is_inf | rhs_is_zero) : (!lhs_sign & lhs_is_inf);
+            nextData.res_is_zero = input_is_divide ? (lhs_is_zero | rhs_is_inf) : lhs_is_zero;
             nextData.nan = nan;
             nextPhase = PHASE_PREPARATION;
         end
diff --git a/Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv b/Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv
new file mode 100644
index 00000000..87c32b77
--- /dev/null
+++ b/Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv
@@ -0,0 +1,206 @@
+
+module FP32DivSqrterWithFFlags(
+input
+    logic clk, rst,
+    logic [31:0] input_lhs, 
+    logic [31:0] input_rhs,
+    logic input_is_divide, 
+    logic [2:0] input_round_mode,
+    logic req, 
+output
+    logic [31:0] result,
+    logic [4:0] fflags,
+    logic finished
+);
+
+    function round_to_away; // returns 1 when the truncated magnitude must be incremented by one unit in the last place
+        input[2:0] round_mode;           // rounding-mode encoding, decoded by the case below
+        input      sign;                 // sign of the result; 1 makes "round downward" round away from zero
+        input      last_place;           // least significant kept bit (decides ties-to-even)
+        input      guard_bit;            // first discarded bit
+        input      sticky_bit;           // OR of the remaining discarded bits
+        input      reminder_is_positive; // final remainder > 0 ("reminder" is the remainder of the iteration)
+        input      reminder_is_zero;     // final remainder == 0
+
+        case(round_mode)
+            3'b000:  round_to_away = guard_bit & (sticky_bit | reminder_is_positive | (reminder_is_zero & last_place)); // round to nearest, ties to even
+            3'b100:  round_to_away = guard_bit & (sticky_bit | reminder_is_positive | reminder_is_zero);                // round to nearest, ties to away
+            3'b010:  round_to_away = sign & (guard_bit | sticky_bit | reminder_is_positive);  // round downward
+            3'b011:  round_to_away = !sign & (guard_bit | sticky_bit | reminder_is_positive); // round upward
+            default: round_to_away = 0; // round towards zero (also taken for every other encoding)
+        endcase
+    endfunction
+
+    function[2:0] srt_table; // quotient-digit selection: returns a signed digit in {-2,-1,0,1,2} as a 3-bit two's complement value
+        input[5:0] rem; // top 6 bits of the partial remainder, compared as a signed value
+        input[3:0] div; // truncated divisor (division) or truncated partial root (square root), see the call site
+
+        reg[5:0] th12; // threshold between digit magnitudes 1 and 2
+        reg[5:0] th01; // threshold between digit magnitudes 0 and 1
+        th12 = div < 1 ? 6 : div < 2 ? 7 : div < 4 ? 8 : div < 5 ? 9 : div < 6 ? 10 : 11;
+        th01 =               div < 2 ? 2 :                             div < 6 ?  3 :  4;
+
+            if($signed(rem) < $signed(-th12)) srt_table = -2; // the same thresholds are mirrored for negative remainders
+        else if($signed(rem) < $signed(-th01)) srt_table = -1;
+        else if($signed(rem) < $signed( th01)) srt_table =  0;
+        else if($signed(rem) < $signed( th12)) srt_table =  1;
+        else                                   srt_table =  2;
+    endfunction
+
+    reg [31:0] lhs;
+    reg [31:0] rhs;
+    reg        is_divide;
+    reg  [2:0] round_mode;
+
+    function [9:0] leading_zeros_count; // number of zero bits above the most significant set bit of x; 23 when x == 0
+        input[22:0] x; // 23-bit mantissa field
+        for(leading_zeros_count = 0; leading_zeros_count <= 22; leading_zeros_count = leading_zeros_count + 1) // the return variable doubles as the loop counter
+            if(x[22-leading_zeros_count]) break; // stop at the first set bit, scanning from bit 22 downwards
+    endfunction
+
+    wire       lhs_sign = lhs[31];
+    wire       rhs_sign = rhs[31];
+    wire [7:0] lhs_expo = lhs[30:23];
+    wire [7:0] rhs_expo = rhs[30:23];
+    wire[22:0] lhs_mant = lhs[22:0];
+    wire[22:0] rhs_mant = rhs[22:0];
+
+    // NaN handling
+    wire lhs_is_zero = lhs_expo == 8'h00 & lhs_mant == 0;
+    wire rhs_is_zero = rhs_expo == 8'h00 & rhs_mant == 0;
+    wire lhs_is_inf  = lhs_expo == 8'hff & lhs_mant == 0;
+    wire rhs_is_inf  = rhs_expo == 8'hff & rhs_mant == 0;
+    wire lhs_is_nan  = lhs_expo == 8'hff & lhs_mant != 0;
+    wire rhs_is_nan  = rhs_expo == 8'hff & rhs_mant != 0;
+    wire lhs_is_snan = lhs_is_nan & lhs_mant[22] == 0;
+    wire rhs_is_snan = rhs_is_nan & rhs_mant[22] == 0;
+    wire lhs_is_neg  = !lhs_is_nan & lhs_sign & lhs != 32'h80000000;
+    wire res_is_nan  = is_divide ? lhs_is_nan | rhs_is_nan | (lhs_is_zero & rhs_is_zero) | (lhs_is_inf & rhs_is_inf)
+                                 : lhs_is_nan | lhs_is_neg;
+    // === About handling NaN ===
+    // x86 returns the following qNaN (the rhs term applies to division only):
+    //  lhs_is_nan ? lhs | 32'h00400000 :
+    //  rhs_is_nan ? rhs | 32'h00400000 :
+    //  32'hffc00000
+    // RISC-V always returns canonical NaN (32'h7fc00000).
+    // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114
+    wire[31:0]  nan  = 32'h7fc00000;
+    wire invalid_operation = is_divide ? lhs_is_snan | rhs_is_snan | (lhs_is_zero & rhs_is_zero) | (lhs_is_inf & rhs_is_inf)
+                                       : lhs_is_snan | lhs_is_neg;
+
+    // Preparation
+    wire       result_sign  = is_divide ? lhs_sign ^ rhs_sign : lhs_sign;
+    wire [9:0] v_lhs_expo   = lhs_expo == 0 ? -leading_zeros_count(lhs_mant) : { 2'b0, lhs_expo }; // biased virtual exponent (ignores subnormals)
+    wire [9:0] v_rhs_expo   = rhs_expo == 0 ? -leading_zeros_count(rhs_mant) : { 2'b0, rhs_expo }; // biased virtual exponent (ignores subnormals)
+    wire[23:0] v_lhs_mant_w = lhs_expo == 0 ? { lhs_mant, 1'b0 } << leading_zeros_count(lhs_mant) : { 1'b1, lhs_mant };
+    wire[23:0] v_rhs_mant_w = rhs_expo == 0 ? { rhs_mant, 1'b0 } << leading_zeros_count(rhs_mant) : { 1'b1, rhs_mant };
+    reg [23:0] v_lhs_mant, v_rhs_mant;
+    wire dividend_normalize = v_lhs_mant < v_rhs_mant;
+    wire [9:0] virtual_expo = v_lhs_expo - v_rhs_expo + 127 - { 8'h0, dividend_normalize }; // new biased virtual exponent (ignores subnormals)
+    wire       subnormal    = is_divide & $signed(virtual_expo) <= 0;
+
+    // The SRT loop. rem needs 27 bits. 24(mantissa)+2(x8/3,SRT)+1(sign)
+    wire[26:0] rem_0 = is_divide ? dividend_normalize ? { 2'b00, v_lhs_mant, 1'b0 } : { 3'b000, v_lhs_mant }
+                                 : v_lhs_expo[0] ? { 2'b0, v_lhs_mant_w, 1'b0 } - 27'h1e40000 : { 1'b0, v_lhs_mant_w, 2'b0 } - 27'h2400000; // 2 * (x - 1.375^2 or 1.5^2)
+    wire[25:0] quo_0 = is_divide ? 26'h0
+                                 : v_lhs_expo[0] ? 26'h1600000 : 26'h1800000; // magical initial guess: 1.375 or 1.5; this avoids SRT-table defects at ([-4.5,-4-11/36], 1.5) and ([-4,-4+1/144], 1.25)
+
+    reg  [3:0] stage;
+    reg [26:0] rem;
+    reg [25:0] quo;
+    always@(posedge clk) begin
+        if (rst) begin // synchronous reset; is_divide and round_mode are not reset here
+            lhs <= '0;
+            rhs <= '0;
+            stage <= '0; // NOTE(review): reset enters iteration stage 0, so the idle stage 13 is reached only after stepping through stages 0..12 - confirm this is intended
+            rem <= '0;
+            quo <= '0;
+            v_lhs_mant <= '0;
+            v_rhs_mant <= '0;
+        end
+        else if (stage == 13) begin // idle / finished: operands are latched only while req is asserted
+            if (req) begin
+                lhs <= input_lhs;
+                rhs <= input_rhs;
+                is_divide <= input_is_divide;
+                round_mode <= input_round_mode;
+                stage <= input_is_divide ? 14 : 15; // division takes the extra stage 14; square root skips it
+            end
+        end else if (stage == 14) begin // division only: register the normalized mantissas
+            v_lhs_mant <= v_lhs_mant_w;
+            v_rhs_mant <= v_rhs_mant_w;
+            stage <= 15;
+        end else if (stage == 15) begin // load the initial remainder and quotient
+            rem <= rem_0;
+            quo <= quo_0;
+            stage <= is_divide ? 0 : 1; // square root starts iterating at stage 1
+        end else begin // stages 0..12: one digit-selection step per clock (rem is shifted left by 2 each step)
+            reg[3:0] div = is_divide ? { 1'b0, v_rhs_mant[22:20] } // NOTE(review): block-local declaration with an initializer and no explicit static/automatic lifetime - confirm every target tool re-evaluates it on each clock
+                                     : { quo[25], quo[23:21] };
+            reg[2:0] q = srt_table( rem[26:21], div ); // selected signed digit for this step
+            case(q)
+            3'b010: rem <= is_divide ? (rem << 2) - { v_rhs_mant, 3'b000 }
+                                     : (rem << 2) - { quo[24:0], 2'b00 } - (27'd4 << (24-stage*2));
+            3'b001: rem <= is_divide ? (rem << 2) - { 1'b0, v_rhs_mant, 2'b00 }
+                                     : (rem << 2) - { quo, 1'b0 } - (27'd1 << (24-stage*2));
+            3'b111: rem <= is_divide ? (rem << 2) + { 1'b0, v_rhs_mant, 2'b00 }
+                                     : (rem << 2) + { quo, 1'b0 } - (27'd1 << (24-stage*2));
+            3'b110: rem <= is_divide ? (rem << 2) + { v_rhs_mant, 3'b000 }
+                                     : (rem << 2) + { quo[24:0], 2'b00 } - (27'd4 << (24-stage*2));
+            default: rem <= rem << 2;
+            endcase
+            quo <= quo + ({ {23{q[2]}}, q } << (24-stage*2)); // add the sign-extended digit at this stage's weight
+            stage <= stage + 1;
+        end
+    end
+    assign finished = stage == 13; // Here, quo has a <1/3ULP error.
+
+    wire[47:0] before_round = subnormal ? { 1'b1, quo[23:0], 23'h0 } >> -virtual_expo : { quo[23:0], 24'h0 }; // [47:25] = mantissa field, [24] = guard bit, [23:0] = sticky bits
+    wire       round_away   = round_to_away(round_mode, result_sign, before_round[25], before_round[24], before_round[23:0] != 0, $signed(rem) > 0, rem == 0);
+    wire       round_fall   = round_mode == 2 ? !result_sign & before_round[24:0] == 0 & $signed(rem) < 0 : // round downward
+                              round_mode == 3 ? result_sign & before_round[24:0] == 0 & $signed(rem) < 0 : // round upward
+                              round_mode == 1 ? before_round[24:0] == 0 & $signed(rem) < 0 // round towards zero
+                                              : 0;
+    wire       exp_plus_one = before_round[47:25] == 23'h7fffff & round_away; // rounding up an all-ones mantissa carries into the exponent
+    // Since dividend is normalized, situations where before_round[24:0] == 0 & $signed(rem) < 0 do not happen; thus, `exp_minus_one' is always zero.
+    // wire   exp_minus_one = before_round[47:25] == 23'h000000 & round_fall;
+    wire[22:0] result_mant  = before_round[47:25] + { 22'h0, round_away } - { 22'h0, round_fall }; // No special treatment is required even if an overflow occurs since the answer will be correct.
+    wire [7:0] result_expo  = is_divide ? (subnormal ? 8'h00 : virtual_expo[7:0]) + { 7'h0, exp_plus_one }
+                                        : v_lhs_expo[8:1] + { 7'b0, v_lhs_expo[0] } + 63 + { 7'h0, exp_plus_one };
+
+    // Treat p-127 as a normal for the underflow flag (rounding with unbounded exponent)
+    wire       u_round_away = round_to_away(round_mode, result_sign, quo[1], quo[0], 1'b0, $signed(rem) > 0, rem == 0);
+    wire     u_exp_plus_one = before_round[47:24] == 24'hffffff & u_round_away;
+
+    // Special cases
+    wire       res_is_huge  = is_divide & $signed(virtual_expo) >= 255;
+    wire       res_is_tiny  = is_divide & !lhs_is_zero & !rhs_is_inf & $signed(virtual_expo) <= -24;
+    wire       res_is_inf   = is_divide ? lhs_is_inf | rhs_is_zero
+                                        : lhs_is_inf;
+    wire       res_is_zero  = is_divide ? lhs_is_zero | rhs_is_inf
+                                        : lhs_is_zero;
+    wire       dir_is_away  = (round_mode == 2 & result_sign) | (round_mode == 3 & !result_sign);
+    wire       huge_is_inf  = round_mode == 0 | round_mode == 4 | dir_is_away;
+
+    wire[31:0] huge         = huge_is_inf ? { result_sign, 8'hff, 23'h0 } : { result_sign, 8'hfe, 23'h7fffff };
+    wire[31:0] tiny         = dir_is_away ? { result_sign, 8'h00, 23'h1 } : { result_sign, 8'h00, 23'h0 };
+    wire[31:0] inf          = { result_sign, 8'hff, 23'h0 };
+    wire[31:0] zero         = { result_sign, 8'h00, 23'h0 };
+
+    // Final result
+    assign result = res_is_nan  ? nan  :
+                    res_is_inf  ? inf  :
+                    res_is_huge ? huge :
+                    res_is_tiny ? tiny :
+                    res_is_zero ? zero : { result_sign, result_expo, result_mant };
+    // Exception flags (operands that give an exact special result - zero or infinity - are masked out of NX; this includes sqrt(+inf))
+    wire divide_by_zero     = is_divide & !res_is_nan & !lhs_is_inf & rhs_is_zero;
+    wire overflow           = is_divide & !res_is_nan & !lhs_is_inf & !rhs_is_zero & (res_is_huge | (virtual_expo == 254 & exp_plus_one));
+    wire inexact            = !res_is_nan & !(is_divide ? lhs_is_zero | lhs_is_inf | rhs_is_zero | rhs_is_inf : lhs_is_zero | lhs_is_inf) & (overflow | res_is_tiny | before_round[24:0] != 0 | rem != 0);
+    // === About underflow (UF) flag
+    // RISC-V sets the UF flag when the absolute value of the result after rounding is less than FLT_MIN and the result is inexact. (same as x86)
+    // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114
+    wire underflow          = inexact & subnormal & !u_exp_plus_one;
+    //                NV                 DZ              OF        UF         NX
+    assign fflags = { invalid_operation, divide_by_zero, overflow, underflow, inexact };
+endmodule
diff --git a/Processor/Src/FloatingPointUnit/FPDivSqrtUnit.sv b/Processor/Src/FloatingPointUnit/FPDivSqrtUnit.sv
index 14c97341..18f73d5f 100644
--- a/Processor/Src/FloatingPointUnit/FPDivSqrtUnit.sv
+++ b/Processor/Src/FloatingPointUnit/FPDivSqrtUnit.sv
@@ -25,15 +25,17 @@ module FPDivSqrtUnit(FPDivSqrtUnitIF.FPDivSqrtUnit port, RecoveryManagerIF.FPDiv
     ActiveListIndexPath nextActiveListPtr[FP_DIVSQRT_ISSUE_WIDTH];
 
     for (genvar i = 0; i < FP_DIVSQRT_ISSUE_WIDTH; i++) begin : BlockDivUnit
-        FP32DivSqrter fpDivSqrter(
+        FP32DivSqrterWithFFlags fpDivSqrter(
             .clk(port.clk),
             .rst(rst_divider[i]),
-            .lhs(port.dataInA[i]),
-            .rhs(port.dataInB[i]),
-            .is_divide(port.is_divide[i]),
+            .input_lhs(port.dataInA[i]),
+            .input_rhs(port.dataInB[i]),
+            .input_is_divide(port.is_divide[i]),
+            .input_round_mode(port.rm[i]),
             .req(port.Req[i]),
             .finished(finished[i]),
-            .result(port.DataOut[i])
+            .result(port.DataOut[i]),
+            .fflags(port.FFlagsOut[i])
         );
     end
 
diff --git a/Processor/Src/Makefiles/CoreSources.inc.mk b/Processor/Src/Makefiles/CoreSources.inc.mk
index d33567dc..52d974fa 100644
--- a/Processor/Src/Makefiles/CoreSources.inc.mk
+++ b/Processor/Src/Makefiles/CoreSources.inc.mk
@@ -116,6 +116,7 @@ CORE_MODULES = \
 	FloatingPointUnit/FP32PipelinedFMA.sv \
 	FloatingPointUnit/FP32PipelinedOther.sv \
 	FloatingPointUnit/FP32DivSqrter.sv \
+	FloatingPointUnit/FP32DivSqrterWithFFlags.sv \
 	FloatingPointUnit/FPDivSqrtUnit.sv \
 	FloatingPointUnit/FPDivSqrtUnitIF.sv \
 	RenameLogic/RenameLogic.sv \
diff --git a/Processor/Src/Makefiles/TestCommands.inc.mk b/Processor/Src/Makefiles/TestCommands.inc.mk
index a8e0b2ae..8c205b9d 100644
--- a/Processor/Src/Makefiles/TestCommands.inc.mk
+++ b/Processor/Src/Makefiles/TestCommands.inc.mk
@@ -330,10 +330,6 @@ test-riscv-compliance: $(RISCV_RV32I_COMPLIANCE_TEST_TARGETS)
     fadd_b2-01 \
     fadd_b4-01 \
     fadd_b5-01 \
-    fdiv_b4-01 \
-    fdiv_b5-01 \
-    fdiv_b6-01 \
-    fdiv_b7-01 \
     fmadd_b4-01 \
     fmadd_b5-01 \
     fmadd_b6-01 \
@@ -354,11 +350,6 @@ test-riscv-compliance: $(RISCV_RV32I_COMPLIANCE_TEST_TARGETS)
     fnmsub_b5-01 \
     fnmsub_b6-01 \
     fnmsub_b7-01 \
-    fsqrt_b3-01 \
-    fsqrt_b4-01 \
-    fsqrt_b5-01 \
-    fsqrt_b7-01 \
-    fsqrt_b8-01 \
     fsub_b4-01 \
     fsub_b5-01 \
     fsub_b7-01 \
@@ -368,9 +359,6 @@ test-riscv-compliance: $(RISCV_RV32I_COMPLIANCE_TEST_TARGETS)
     fadd_b10-01 \
     fadd_b12-01 \
     fadd_b13-01 \
-    fdiv_b1-01 \
-    fdiv_b2-01 \
-    fdiv_b20-01 \
     fmadd_b14-01 \
     fmadd_b16-01 \
     fmadd_b17-01 \
@@ -393,10 +381,6 @@ test-riscv-compliance: $(RISCV_RV32I_COMPLIANCE_TEST_TARGETS)
     fnmsub_b17-01 \
     fnmsub_b18-01 \
     fnmsub_b2-01 \
-    fsqrt_b1-01 \
-    fsqrt_b2-01 \
-    fsqrt_b20-01 \
-    fsqrt_b9-01 \
     fsub_b1-01 \
     fsub_b10-01 \
     fsub_b12-01 \
@@ -442,6 +426,22 @@ RISCV_RV32F_COMPLIANCE_TESTS =    \
   fsgnjn_b1-01 \
   fsgnjx_b1-01 \
   fsw-align-01 \
+  fdiv_b1-01 \
+  fdiv_b2-01 \
+  fdiv_b4-01 \
+  fdiv_b5-01 \
+  fdiv_b6-01 \
+  fdiv_b7-01 \
+  fdiv_b20-01 \
+  fsqrt_b1-01 \
+  fsqrt_b2-01 \
+  fsqrt_b20-01 \
+  fsqrt_b9-01 \
+  fsqrt_b3-01 \
+  fsqrt_b4-01 \
+  fsqrt_b5-01 \
+  fsqrt_b7-01 \
+  fsqrt_b8-01 \
 
 RISCV_RV32F_COMPLIANCE_TEST_TARGETS = $(RISCV_RV32F_COMPLIANCE_TESTS:%=test-riscv-compliance-%)
 
diff --git a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv
index 6eedc552..a0ef0e80 100644
--- a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv
+++ b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv
@@ -299,8 +299,7 @@ module FPExecutionStage(
                 end
                 FP_MOP_TYPE_DIV, FP_MOP_TYPE_SQRT: begin
                     dataOut[i].data = fpDivSqrtUnit.DataOut[i];
-                    //fflagsData[i] = fpDivSqrtUnit.FFlagsOut[i];
-                    fflagsOut[i] = '0;
+                    fflagsOut[i] = fpDivSqrtUnit.FFlagsOut[i];
                 end 
                 default: begin /* FP_MOP_TYPE_OTHER */
                     dataOut[i].data = otherDataOut[i];

From f42531d9535bbf4afb953c6650b901f155348206 Mon Sep 17 00:00:00 2001
From: Reoma Matsuo <matsuo@rsg.ci.i.u-tokyo.ac.jp>
Date: Fri, 20 Dec 2024 13:42:27 +0900
Subject: [PATCH 02/11] feat: integrate FMA with fflags into the FP pipeline

---
 .../FP32PipelinedFMA_WithFFlags.sv            | 314 ++++++++++++++++++
 Processor/Src/FloatingPointUnit/FPUTypes.sv   |  28 +-
 Processor/Src/Makefiles/TestCommands.inc.mk   | 121 ++++---
 .../Pipeline/FPBackEnd/FPExecutionStage.sv    |  12 +-
 4 files changed, 406 insertions(+), 69 deletions(-)
 create mode 100644 Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv

diff --git a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv
new file mode 100644
index 00000000..b896f8e4
--- /dev/null
+++ b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv
@@ -0,0 +1,314 @@
+
+
+module FP32PipelinedFMA_WithFFlags(
+input
+    logic clk,
+    logic [31:0] mullhs, 
+    logic [31:0] mulrhs,
+    logic [31:0] addend, 
+    logic [2:0] round_mode,
+    logic is_fmul,
+output
+    logic [31:0] result,
+    logic [4:0] fflags
+);
+
+    FMAStage1RegPath stg0Out;
+    FMAStage2RegPath stg1Out;
+    FMAStage3RegPath stg2Out;
+    FMAStage4RegPath stg3Out;
+    
+    // Fused-multiply-adder (24bit*24bit<<3+76bit+sign)
+    // The multiplication result is shifted by 3 bits for the guard bit, the round bit, and the sticky bit.
+    // The adder is sufficient for 76 bits + 1 sign bit because |lhs*rhs<<3| < 2^51 is <0.5 ULP when subtracted from 2^76. Note: ULP(1-eps) = 2^-24 while ULP(1+eps) = 2^-23.
+    logic [76:0] multiplier_lhs, multiplier_rhs, multiplier_addend, fma_result;
+    logic [76:0] mlhs, mrhs, maddend;
+    logic is_subtract, is_sub;
+    always_ff @(posedge clk) begin
+        multiplier_lhs    <= mlhs; // first register level: operands prepared by stage 0
+        multiplier_rhs    <= mrhs;
+        multiplier_addend <= maddend;
+        is_subtract <= is_sub;
+        fma_result <= is_subtract ? multiplier_lhs * multiplier_rhs - multiplier_addend // second register level: uses the operands registered one clock earlier
+                                  : multiplier_lhs * multiplier_rhs + multiplier_addend;
+    end
+
+    FMA_WithFFlagsStage0 stg0(clk, stg0Out, mullhs, mulrhs, addend, round_mode, is_fmul, is_sub, mlhs, mrhs, maddend);
+    FMA_WithFFlagsStage1 stg1(clk, stg0Out, stg1Out);
+    FMA_WithFFlagsStage2 stg2(clk, stg1Out, stg2Out, fma_result); // fma_result is consumed together with the stage-2 pipeline register
+    FMA_WithFFlagsStage3 stg3(clk, stg2Out, stg3Out);
+    FMA_WithFFlagsStage4 stg4(clk, stg3Out, result, fflags);
+
+    logic [31:0] ref_result;
+    logic [4:0] ref_fflags;
+    float_fused_multiply_adder fma(mullhs, mulrhs, addend, round_mode, ref_result, ref_fflags); // NOTE(review): ref_result/ref_fflags are not read anywhere in this module - confirm this reference instance is meant to stay and that its module is part of the build
+endmodule
+
+module FMA_WithFFlagsStage0(
+    input logic clk, // not referenced inside this module; every output here is combinational
+    output FMAStage1RegPath stg0Out,
+    input logic [31:0] mullhs,
+    input logic [31:0] mulrhs,
+    input logic [31:0] addend,
+    input logic [2:0] round_mode,
+    input logic is_fmul,
+    output logic is_subtract,
+    output logic [76:0] mlhs,
+    output logic [76:0] mrhs,
+    output logic [76:0] maddend
+);
+
+    wire       mullhs_sign = mullhs[31];
+    wire       mulrhs_sign = mulrhs[31];
+    wire       addend_sign = addend[31];
+    wire [7:0] mullhs_expo = mullhs[30:23];
+    wire [7:0] mulrhs_expo = mulrhs[30:23];
+    wire [7:0] addend_expo = addend[30:23];
+    wire[22:0] mullhs_mant = mullhs[22:0];
+    wire[22:0] mulrhs_mant = mulrhs[22:0];
+    wire[22:0] addend_mant = addend[22:0];
+
+    assign is_subtract     = mullhs_sign ^ mulrhs_sign ^ addend_sign; // 1 when the product sign differs from the addend sign
+
+    // NaN handling
+    wire mullhs_is_zero    = mullhs_expo == 8'h00 & mullhs_mant == 0;
+    wire mulrhs_is_zero    = mulrhs_expo == 8'h00 & mulrhs_mant == 0;
+    wire addend_is_zero    = addend_expo == 8'h00 & addend_mant == 0;
+    wire mullhs_is_inf     = mullhs_expo == 8'hff & mullhs_mant == 0;
+    wire mulrhs_is_inf     = mulrhs_expo == 8'hff & mulrhs_mant == 0;
+    wire addend_is_inf     = addend_expo == 8'hff & addend_mant == 0;
+    wire mullhs_is_nan     = mullhs_expo == 8'hff & mullhs_mant != 0;
+    wire mulrhs_is_nan     = mulrhs_expo == 8'hff & mulrhs_mant != 0;
+    wire addend_is_nan     = addend_expo == 8'hff & addend_mant != 0;
+    wire mullhs_is_snan    = mullhs_is_nan & mullhs_mant[22] == 0;
+    wire mulrhs_is_snan    = mulrhs_is_nan & mulrhs_mant[22] == 0;
+    wire addend_is_snan    = addend_is_nan & addend_mant[22] == 0;
+    wire mulres_is_inf     = (mullhs_is_inf & !mulrhs_is_nan) | (!mullhs_is_nan & mulrhs_is_inf); 
+    wire mulres_is_zero    = mullhs_is_zero | mulrhs_is_zero;
+    wire res_is_addend    = mulres_is_zero & !addend_is_zero;
+    // === About setting invalid operation (NV) flag ===
+    // x86 does not set the NV flag on ±0×±∞±qNaN.
+    // RISC-V sets the NV flag on ±0×±∞±qNaN.
+    // --- The RISC-V Instruction Set Manual 20240411 Volume I p.116
+    wire invalid_operation = mullhs_is_snan | mulrhs_is_snan | addend_is_snan // One of the input values is sNaN
+                             | (mullhs_is_zero & mulrhs_is_inf) | (mullhs_is_inf & mulrhs_is_zero) // Inf * Zero
+                             | (is_subtract & mulres_is_inf & addend_is_inf); // Inf - Inf
+    wire result_is_nan     = mullhs_is_nan | mulrhs_is_nan | addend_is_nan | invalid_operation; 
+    // === About handling NaN ===
+    // x86 returns the following qNaN:
+    //  mullhs_is_nan ? mullhs | 32'h00400000 :
+    //  mulrhs_is_nan ? mulrhs | 32'h00400000 :
+    //  addend_is_nan ? addend | 32'h00400000 : 32'hffc00000
+    // RISC-V always returns canonical NaN (32'h7fc00000).
+    // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114
+    wire[31:0] nan         = 32'h7fc00000;
+
+    // Inf handling
+    wire res_is_inf = addend_is_inf | mullhs_is_inf | mulrhs_is_inf;
+    wire mul_sign       = mullhs_sign ^ mulrhs_sign;
+    wire inf_sign   = addend_is_inf ? addend_sign : mul_sign;
+    wire[31:0] inf  = { inf_sign, 8'hff, 23'h0 };
+
+    // Main path (including subnormal handling)
+    wire [9:0] v_mullhs_expo  = { 2'b0, mullhs_expo == 8'h00 ? 8'h01 : mullhs_expo }; // an exponent field of 0 is treated as exponent 1 ...
+    wire [9:0] v_mulrhs_expo  = { 2'b0, mulrhs_expo == 8'h00 ? 8'h01 : mulrhs_expo };
+    wire [9:0] v_addend_expo  = { 2'b0, addend_expo == 8'h00 ? 8'h01 : addend_expo };
+    wire[23:0] v_mullhs_mant  = { mullhs_expo != 8'h00, mullhs_mant }; // ... with the hidden bit cleared
+    wire[23:0] v_mulrhs_mant  = { mulrhs_expo != 8'h00, mulrhs_mant };
+    wire[23:0] v_addend_mant  = { addend_expo != 8'h00, addend_mant };
+    wire [9:0] v_fmares_expo  = v_mullhs_expo + v_mulrhs_expo - 127 + 26; // See below: There are 26 bits above lhs*rhs<<3, assuming no carryover occurs in lhs*rhs.
+    wire [9:0] addend_shift   = v_fmares_expo - v_addend_expo;
+    wire[74:0] shifted_addend = { v_addend_mant, 2'b00, 49'b0 } >> addend_shift; // The 2'b00 are the guard bit and the round bit.
+    wire       addend_sticky  = $signed(addend_shift) > 75 ? v_addend_mant != 0
+                                                           : v_addend_mant << (10'd75 - addend_shift) != 24'h000000; // the part shifted out above
+    // Special cases
+    wire       mulres_is_tiny   = $signed(addend_shift) < 0 & !mulres_is_zero & !addend_is_zero; // |mullhs*mulrhs| < 0.5ULP(|addend|-eps)
+    wire       res_is_tiny      = $signed(addend_shift) < 0 & !mulres_is_zero & addend_is_zero; // |mullhs*mulrhs+addend| < 0.5FLT_TRUE_MIN
+  
+    // Fused-multiply-adder (24bit*24bit<<3+76bit+sign)
+    // The multiplication result is shifted by 3 bits for the guard bit, the round bit, and the sticky bit.
+    // The adder is sufficient for 76 bits + 1 sign bit because |lhs*rhs<<3| < 2^51 is <0.5 ULP when subtracted from 2^76. Note: ULP(1-eps) = 2^-24 while ULP(1+eps) = 2^-23.
+    assign mlhs    = { 51'b0, v_mullhs_mant, 2'b0 }; // the 2-bit shift here plus the 1-bit shift on mrhs gives the product its 3-bit shift
+    assign mrhs    = { 52'b0, v_mulrhs_mant, 1'b0 };
+    assign maddend = { 1'b0, shifted_addend, addend_sticky };
+    // wire[76:0] multiplier_result = is_subtract ? multiplier_lhs * multiplier_rhs - multiplier_addend
+    //                                            : multiplier_lhs * multiplier_rhs + multiplier_addend;
+
+    assign stg0Out = {v_fmares_expo, res_is_inf, result_is_nan, // NOTE(review): presumably this concatenation order must match the field order of FMAStage1RegPath - confirm against FPUTypes
+                      res_is_addend, mul_sign, inf_sign, addend_sign, is_subtract, inf, nan, addend,
+                      mulres_is_tiny, res_is_tiny, invalid_operation, round_mode, is_fmul};
+endmodule
+
+module FMA_WithFFlagsStage1( // pure pipeline register: delays the stage-0 bundle by one clock
+    input logic clk,
+    input FMAStage1RegPath stg1In,
+    output FMAStage2RegPath stg1Out
+);
+    FMAStage1RegPath pipeReg;
+    always_ff @(posedge clk) begin
+        pipeReg <= stg1In; // no reset and no enable: a new bundle is captured every clock
+    end
+    assign stg1Out = pipeReg; // NOTE(review): assigns an FMAStage1RegPath to an FMAStage2RegPath output - assumes the two types are assignment-compatible; confirm in FPUTypes
+endmodule
+
+module FMA_WithFFlagsStage2( // registers the incoming bundle, then derives the sign and magnitude of the raw multiply-add result
+    input logic clk,
+    input FMAStage2RegPath stg2In,
+    output FMAStage3RegPath stg2Out,
+    input logic [76:0] fma_result // not registered here; combined directly with pipeReg below
+);
+    FMAStage2RegPath pipeReg;
+    always_ff @(posedge clk) begin
+        pipeReg <= stg2In; 
+    end
+
+    wire       mul_sign   = pipeReg.mul_sign;
+    wire       res_is_zero     = fma_result == 77'h0;
+    wire       res_is_negative = fma_result[76]; // bit 76 is used as the sign of the adder output
+    wire[75:0] abs_fma_result  = res_is_negative ? -fma_result[75:0] : fma_result[75:0]; // magnitude of the adder output
+    wire       result_sign     = mul_sign ^ res_is_negative; // product sign, inverted when the adder output is negative
+    
+    assign stg2Out = {abs_fma_result, pipeReg.mulres_expo, res_is_negative, pipeReg.result_is_inf, // NOTE(review): presumably this order must match the field order of FMAStage3RegPath - confirm against FPUTypes
+                      pipeReg.result_is_nan, res_is_zero, pipeReg.res_is_addend, result_sign,
+                      pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.inf, pipeReg.nan, pipeReg.addend,
+                      pipeReg.mul_sign, pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode, pipeReg.is_fmul};
+endmodule
+
+module FMA_WithFFlagsStage3( // registers the incoming bundle, then computes the normalization shift and the virtual exponent
+    input logic clk,
+    input FMAStage3RegPath stg3In,
+    output FMAStage4RegPath stg3Out
+);
+    function automatic [6:0] leading_zeros_count; // number of zero bits above the most significant set bit of x; 76 when x == 0
+        input[75:0] x;
+        for(leading_zeros_count = 0; leading_zeros_count <= 75; leading_zeros_count = leading_zeros_count + 1) // the return variable doubles as the loop counter
+            if(x[75-leading_zeros_count]) break;
+    endfunction
+    
+    FMAStage3RegPath pipeReg;
+    always_ff @(posedge clk) begin
+        pipeReg <= stg3In; 
+    end
+    wire[75:0] abs_fma_result  = pipeReg.abs_fma_result;
+    wire [9:0] mulres_expo     = pipeReg.mulres_expo;
+
+    wire [6:0] leading_zeros   = leading_zeros_count(abs_fma_result); // 0 <= leading_zeros <= 74 if !res_is_zero
+    wire [9:0] virtual_expo    = mulres_expo - { 3'b00, leading_zeros }; // There are 26 bits above lhs*rhs<<3, assuming no carryover occurs in lhs*rhs.
+    wire       subnormal       = $signed(virtual_expo) <= 0;
+    wire [6:0] fmares_shift    = subnormal ? mulres_expo[6:0] // There are 3 bits below lhs*rhs<<3, and 23 bits will be lost due to rounding, assuming no carryover occurs in lhs*rhs.
+                                           : leading_zeros + 1;   // (75 - addend_sticky(1bit)) - shifter_result(24bit)
+    
+    assign stg3Out = {abs_fma_result, fmares_shift, virtual_expo, subnormal, pipeReg.res_is_negative, pipeReg.result_is_inf, // NOTE(review): presumably this order must match the field order of FMAStage4RegPath - confirm against FPUTypes
+                      pipeReg.result_is_nan, pipeReg.res_is_zero, pipeReg.res_is_addend, pipeReg.result_sign,
+                      pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.inf, pipeReg.nan, pipeReg.addend,
+                      pipeReg.mul_sign, pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode, pipeReg.is_fmul};
+endmodule
+
+module FMA_WithFFlagsStage4(
+    input logic clk,
+    input FMAStage4RegPath stg4In,
+    output logic [31:0] result,
+    output logic [4:0] fflags
+);
+    function round_to_away;
+        input sign;
+        input last_place;
+        input guard_bit;
+        input sticky_bit;
+        input[2:0] round_mode;
+
+        case(round_mode)
+            3'b000:  round_to_away = guard_bit & (last_place | sticky_bit); // round to nearest, ties to even
+            3'b100:  round_to_away = guard_bit;                             // round to nearest, ties to away
+            3'b010:  round_to_away = sign & (guard_bit | sticky_bit);       // round downward
+            3'b011:  round_to_away = !sign & (guard_bit | sticky_bit);      // round upward
+            default: round_to_away = 0;                                     // round towards zero
+        endcase
+    endfunction
+
+    FMAStage4RegPath pipeReg;
+    always_ff @(posedge clk) begin
+        pipeReg <= stg4In; 
+    end
+
+    wire[75:0] abs_fma_result  = pipeReg.abs_fma_result;
+    wire [7:0] fmares_shift    = pipeReg.fmares_shift;
+    wire [9:0] virtual_expo    = pipeReg.virtual_expo;
+    wire[31:0] inf             = pipeReg.inf;
+    wire[31:0] nan             = pipeReg.nan;
+    wire[31:0] addend          = pipeReg.addend;
+    wire res_is_negative       = pipeReg.res_is_negative;
+    wire result_is_inf         = pipeReg.result_is_inf;
+    wire result_is_nan         = pipeReg.result_is_nan;
+    wire res_is_zero           = pipeReg.res_is_zero;
+    wire res_is_addend         = pipeReg.res_is_addend;
+    wire result_sign           = pipeReg.result_sign;
+    wire prop_inf_sign         = pipeReg.prop_inf_sign;
+    wire addend_sign           = pipeReg.addend_sign;
+    wire subnormal             = pipeReg.subnormal;
+    wire is_subtract           = pipeReg.is_subtract;
+    wire mulres_is_tiny        = pipeReg.mulres_is_tiny;
+    wire res_is_tiny           = pipeReg.res_is_tiny;
+    wire invalid_operation     = pipeReg.invalid_operation;
+    wire [2:0] round_mode      = pipeReg.round_mode;
+    wire is_fmul               = pipeReg.is_fmul;
+
+    // Normalize and rounding decision
+    /* verilator lint_off WIDTH */
+    wire[24:0] shifter_result  = { abs_fma_result, 24'b0 } >> (7'd75 - fmares_shift); // [75:0] -> [24:0] normalizing left shift emulation. The 24'b0 is needed for cases where large cancellations occur. [24:0] = { mantissa(23bit), guard(1bit), extra_guard_for_underflow_detection(1bit) }
+    /* verilator lint_on WIDTH */
+    wire       sticky          = abs_fma_result << (7'd25 + fmares_shift) != 0; // the part right-shifted out above
+
+    wire       round_away      = round_to_away(result_sign, shifter_result[2], shifter_result[1], shifter_result[0] | sticky, round_mode);
+    wire       exp_plus_one    = shifter_result >= 25'h1fffffc & round_away; // carry is generated with rounding taken into account
+    // Treat p-127 as a normal for the underflow flag (rounding with unbounded exponent)
+    wire       u_round_away    = round_to_away(result_sign, shifter_result[1], shifter_result[0], sticky, round_mode);
+    wire       u_exp_plus_one  = shifter_result >= 25'h1fffffe & u_round_away; // 0x1.fffffep-127 <= |mullhs*mulrhs+addend| < 0x1p-126 and the result after rounding becomes a normal number, so the underflow flag is not raised.
+
+    wire[22:0] result_mant = shifter_result[24:2] + { 22'h0, round_away }; // No special treatment is required even if an overflow occurs since the answer will be 0 and it will be correct.
+    wire [7:0] result_expo = (subnormal ? 8'h00 : virtual_expo[7:0]) + { 7'b0, exp_plus_one };
+
+    // Special cases
+    // wire       mulres_is_zero   = mullhs_is_zero | mulrhs_is_zero;
+    wire       res_is_huge      = $signed(virtual_expo) >= 255;
+    // wire       mulres_is_tiny   = $signed(addend_shift) < 0 & !mulres_is_zero & !addend_is_zero; // |mullhs*mulrhs| < 0.5ULP(|addend|-eps)
+    // wire       res_is_tiny      = $signed(addend_shift) < 0 & !mulres_is_zero & addend_is_zero; // |mullhs*mulrhs+addend| < 0.5FLT_TRUE_MIN
+    // wire       res_is_addend    = mulres_is_zero & !addend_is_zero;
+    // wire       res_is_zero      = multiplier_result == 77'h0; // including mulres_is_zero & addend_is_zero
+    wire       dir_is_away      = (round_mode == 2 & result_sign) | (round_mode == 3 & !result_sign);
+    wire       huge_is_inf      = round_mode == 0 | round_mode == 4 | dir_is_away;
+
+    wire[31:0] addend_plus_tiny = round_mode == 1                &  is_subtract ? addend - 1 :
+                                  round_mode == 2 & !addend_sign &  is_subtract ? addend - 1 :
+                                  round_mode == 3 &  addend_sign &  is_subtract ? addend - 1 :
+                                  round_mode == 2 &  addend_sign & !is_subtract ? addend + 1 :
+                                  round_mode == 3 & !addend_sign & !is_subtract ? addend + 1
+                                                                                : addend;
+    wire[31:0] huge             = huge_is_inf ? { result_sign, 8'hff, 23'h0 } : { result_sign, 8'hfe, 23'h7fffff };
+    wire[31:0] tiny             = dir_is_away ? { result_sign, 8'h00, 23'h1 } : { result_sign, 8'h00, 23'h0 };
+    
+    wire [7:0] addend_expo = addend[30:23];
+    wire[22:0] addend_mant = addend[22:0];
+    wire addend_is_zero         = addend_expo == 8'h00 & addend_mant == 0;
+    wire[31:0] zero             = { is_fmul ? result_sign : (is_subtract ? round_mode == 2 : addend_sign), 8'h00, 23'h0 };
+
+    // Final result
+    assign result = result_is_nan  ? nan              :
+                    result_is_inf  ? inf              :
+                    res_is_huge    ? huge             :
+                    res_is_tiny    ? tiny             :
+                    mulres_is_tiny ? addend_plus_tiny :
+                    res_is_addend  ? addend           :
+                    res_is_zero    ? zero             : { result_sign, result_expo, result_mant };
+
+    // Exception flags
+    wire divide_by_zero    = 1'b0;
+    wire overflow          = !result_is_nan & !result_is_inf & (mulres_is_tiny ? addend_plus_tiny[30:23] == 8'hff : res_is_huge | (virtual_expo == 254 & exp_plus_one));
+    wire inexact           = !result_is_nan & !result_is_inf & (overflow | res_is_tiny | mulres_is_tiny | shifter_result[1] | shifter_result[0] | sticky);
+    // === About underflow (UF) flag
+    // RISC-V sets the UF flag when the absolute value of the result after rounding is less than FLT_MIN and the result is inexact (same as x86).
+    // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114
+    wire underflow         = inexact & (mulres_is_tiny ? addend[30:23] == 8'h00 | addend_plus_tiny[30:23] == 8'h00 : res_is_tiny | (subnormal & !u_exp_plus_one));
+    //                NV                 DZ              OF        UF         NX
+    assign fflags = { invalid_operation, divide_by_zero, overflow, underflow, inexact };
+endmodule
diff --git a/Processor/Src/FloatingPointUnit/FPUTypes.sv b/Processor/Src/FloatingPointUnit/FPUTypes.sv
index 50286c7e..179a4f6d 100644
--- a/Processor/Src/FloatingPointUnit/FPUTypes.sv
+++ b/Processor/Src/FloatingPointUnit/FPUTypes.sv
@@ -120,8 +120,14 @@ typedef struct packed {
     logic prop_inf_sign;
     logic addend_sign;
     logic is_subtract;
+    logic [31:0] inf;
     logic [31:0] nan;
     logic [31:0] addend;
+    logic mulres_is_tiny;
+    logic res_is_tiny;
+    logic invalid_operation;
+    logic [2:0] round_mode;
+    logic is_fmul;
 } FMAStage1RegPath;
 
 typedef struct packed {
@@ -133,8 +139,14 @@ typedef struct packed {
     logic prop_inf_sign;
     logic addend_sign;
     logic is_subtract;
+    logic [31:0] inf;
     logic [31:0] nan;
     logic [31:0] addend;
+    logic mulres_is_tiny;
+    logic res_is_tiny;
+    logic invalid_operation;
+    logic [2:0] round_mode;
+    logic is_fmul;
 } FMAStage2RegPath;
 
 typedef struct packed {
@@ -149,13 +161,20 @@ typedef struct packed {
     logic prop_inf_sign;
     logic addend_sign;
     logic is_subtract;
+    logic [31:0] inf;
     logic [31:0] nan;
     logic [31:0] addend;
+    logic mul_sign;
+    logic mulres_is_tiny;
+    logic res_is_tiny;
+    logic invalid_operation;
+    logic [2:0] round_mode;
+    logic is_fmul;
 } FMAStage3RegPath;
 
 typedef struct packed {
     logic [75:0] abs_fma_result;
-    logic [7:0] fmares_shift;
+    logic [6:0] fmares_shift;
     logic [9:0] virtual_expo;
     logic subnormal;
     logic res_is_negative;
@@ -167,8 +186,15 @@ typedef struct packed {
     logic prop_inf_sign;
     logic addend_sign;
     logic is_subtract;
+    logic [31:0] inf;
     logic [31:0] nan;
     logic [31:0] addend;
+    logic mul_sign;
+    logic mulres_is_tiny;
+    logic res_is_tiny;
+    logic invalid_operation;
+    logic [2:0] round_mode;
+    logic is_fmul;
 } FMAStage4RegPath;
 
 endpackage
\ No newline at end of file
diff --git a/Processor/Src/Makefiles/TestCommands.inc.mk b/Processor/Src/Makefiles/TestCommands.inc.mk
index 8c205b9d..9f03c726 100644
--- a/Processor/Src/Makefiles/TestCommands.inc.mk
+++ b/Processor/Src/Makefiles/TestCommands.inc.mk
@@ -325,68 +325,6 @@ test-riscv-compliance: $(RISCV_RV32I_COMPLIANCE_TEST_TARGETS)
     fsub_b3-01 \
     fsub_b8-01 \
 
-# unsupported rounding mode
-    #fadd_b7-01 \
-    fadd_b2-01 \
-    fadd_b4-01 \
-    fadd_b5-01 \
-    fmadd_b4-01 \
-    fmadd_b5-01 \
-    fmadd_b6-01 \
-    fmadd_b7-01 \
-    fmsub_b4-01 \
-    fmsub_b5-01 \
-    fmsub_b6-01 \
-    fmsub_b7-01 \
-    fmul_b4-01 \
-    fmul_b5-01 \
-    fmul_b6-01 \
-    fmul_b7-01 \
-    fnmadd_b4-01 \
-    fnmadd_b5-01 \
-    fnmadd_b6-01 \
-    fnmadd_b7-01 \
-    fnmsub_b4-01 \
-    fnmsub_b5-01 \
-    fnmsub_b6-01 \
-    fnmsub_b7-01 \
-    fsub_b4-01 \
-    fsub_b5-01 \
-    fsub_b7-01 \
-
-# unsupported fflags
-    #fadd_b1-01 \
-    fadd_b10-01 \
-    fadd_b12-01 \
-    fadd_b13-01 \
-    fmadd_b14-01 \
-    fmadd_b16-01 \
-    fmadd_b17-01 \
-    fmadd_b18-01 \
-    fmadd_b2-01 \
-    fmsub_b14-01 \
-    fmsub_b16-01 \
-    fmsub_b17-01 \
-    fmsub_b18-01 \
-    fmsub_b2-01 \
-    fmul_b1-01 \
-    fmul_b2-01 \
-    fnmadd_b14-01 \
-    fnmadd_b16-01 \
-    fnmadd_b17-01 \
-    fnmadd_b18-01 \
-    fnmadd_b2-01 \
-    fnmsub_b14-01 \
-    fnmsub_b16-01 \
-    fnmsub_b17-01 \
-    fnmsub_b18-01 \
-    fnmsub_b2-01 \
-    fsub_b1-01 \
-    fsub_b10-01 \
-    fsub_b12-01 \
-    fsub_b13-01 \
-    fsub_b2-01 \
-
 RISCV_RV32F_COMPLIANCE_TESTS =    \
   fcvt.s.w_b25-01 \
   fcvt.s.w_b26-01 \
@@ -442,6 +380,65 @@ RISCV_RV32F_COMPLIANCE_TESTS =    \
   fsqrt_b5-01 \
   fsqrt_b7-01 \
   fsqrt_b8-01 \
+  fadd_b7-01 \
+  fadd_b2-01 \
+  fadd_b4-01 \
+  fadd_b5-01 \
+  fmadd_b4-01 \
+  fmadd_b5-01 \
+  fmadd_b6-01 \
+  fmadd_b7-01 \
+  fmsub_b4-01 \
+  fmsub_b5-01 \
+  fmsub_b6-01 \
+  fmsub_b7-01 \
+  fmul_b4-01 \
+  fmul_b6-01 \
+  fmul_b7-01 \
+  fnmadd_b4-01 \
+  fnmadd_b5-01 \
+  fnmadd_b6-01 \
+  fnmadd_b7-01 \
+  fnmsub_b4-01 \
+  fnmsub_b5-01 \
+  fnmsub_b6-01 \
+  fnmsub_b7-01 \
+  fsub_b4-01 \
+  fsub_b5-01 \
+  fsub_b7-01 \
+  fadd_b1-01 \
+  fadd_b10-01 \
+  fadd_b12-01 \
+  fadd_b13-01 \
+  fmadd_b14-01 \
+  fmadd_b16-01 \
+  fmadd_b17-01 \
+  fmadd_b18-01 \
+  fmadd_b2-01 \
+  fmsub_b14-01 \
+  fmsub_b16-01 \
+  fmsub_b17-01 \
+  fmsub_b18-01 \
+  fmsub_b2-01 \
+  fmul_b2-01 \
+  fnmadd_b14-01 \
+  fnmadd_b16-01 \
+  fnmadd_b17-01 \
+  fnmadd_b18-01 \
+  fnmadd_b2-01 \
+  fnmsub_b14-01 \
+  fnmsub_b16-01 \
+  fnmsub_b17-01 \
+  fnmsub_b18-01 \
+  fnmsub_b2-01 \
+  fsub_b1-01 \
+  fsub_b10-01 \
+  fsub_b12-01 \
+  fsub_b13-01 \
+  fsub_b2-01 \
+  fmul_b1-01 \
+  fmul_b5-01 \
+
 
 RISCV_RV32F_COMPLIANCE_TEST_TARGETS = $(RISCV_RV32F_COMPLIANCE_TESTS:%=test-riscv-compliance-%)
 
diff --git a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv
index a0ef0e80..0fd3123d 100644
--- a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv
+++ b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv
@@ -148,14 +148,15 @@ module FPExecutionStage(
     logic isDivSqrt         [ FP_ISSUE_WIDTH ]; 
 
     for ( genvar i = 0; i < FP_ISSUE_WIDTH; i++ ) begin
-        FP32PipelinedFMA fpFMA (
+        FP32PipelinedFMA_WithFFlags fpFMA (
             .clk (port.clk),
             .mullhs (fmaMulLHS[i]),
             .mulrhs (fmaMulRHS[i]),
             .addend (fmaAddend[i]),
-            //.rm (rm[i]),
-            .result ( fmaDataOut[i] )
-            //.fflags ( fmaFFlagsOut[i])
+            .round_mode (rm[i]),
+            .is_fmul (fpuCode[i] == FC_MUL),
+            .result ( fmaDataOut[i] ),
+            .fflags ( fmaFFlagsOut[i])
         );
         
         FP32PipelinedOther #(
@@ -294,8 +295,7 @@ module FPExecutionStage(
             unique case ( localPipeReg[i][FP_EXEC_STAGE_DEPTH-2].fpQueueData.fpOpInfo.opType )
                 FP_MOP_TYPE_ADD, FP_MOP_TYPE_MUL, FP_MOP_TYPE_FMA: begin
                     dataOut[i].data = fmaDataOut[i];
-                    //fflagsData[i] = fmaFFlagsOut[i];
-                    fflagsOut[i] = '0;
+                    fflagsOut[i] = fmaFFlagsOut[i];
                 end
                 FP_MOP_TYPE_DIV, FP_MOP_TYPE_SQRT: begin
                     dataOut[i].data = fpDivSqrtUnit.DataOut[i];

From 850916c2799bf1654ee64f6bead164a211b50e20 Mon Sep 17 00:00:00 2001
From: Reoma Matsuo <matsuo@rsg.ci.i.u-tokyo.ac.jp>
Date: Fri, 20 Dec 2024 14:06:52 +0900
Subject: [PATCH 03/11] fix: fix bugs related to FMA

---
 .../FP32PipelinedFMA_WithFFlags.sv            | 57 ++++++--------
 Processor/Src/FloatingPointUnit/FPUTypes.sv   | 74 +++++++++++++++++--
 Processor/Src/Makefiles/CoreSources.inc.mk    |  1 +
 3 files changed, 88 insertions(+), 44 deletions(-)

diff --git a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv
index b896f8e4..9e27d590 100644
--- a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv
+++ b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv
@@ -13,10 +13,10 @@ output
     logic [4:0] fflags
 );
 
-    FMAStage1RegPath stg0Out;
-    FMAStage2RegPath stg1Out;
-    FMAStage3RegPath stg2Out;
-    FMAStage4RegPath stg3Out;
+    FMA_WithFFlagsStage1RegPath stg0Out;
+    FMA_WithFFlagsStage2RegPath stg1Out;
+    FMA_WithFFlagsStage3RegPath stg2Out;
+    FMA_WithFFlagsStage4RegPath stg3Out;
     
     // Fused-multiply-adder (24bit*24bit<<3+76bit+sign)
     // The multiplication result is shifted by 2 bits for the guard bit and the sticky bit.
@@ -38,15 +38,11 @@ output
     FMA_WithFFlagsStage2 stg2(clk, stg1Out, stg2Out, fma_result);
     FMA_WithFFlagsStage3 stg3(clk, stg2Out, stg3Out);
     FMA_WithFFlagsStage4 stg4(clk, stg3Out, result, fflags);
-
-    logic [31:0] ref_result;
-    logic [4:0] ref_fflags;
-    float_fused_multiply_adder fma(mullhs, mulrhs, addend, round_mode, ref_result, ref_fflags);
 endmodule
 
 module FMA_WithFFlagsStage0(
     input logic clk,
-    output FMAStage1RegPath stg0Out,
+    output FMA_WithFFlagsStage1RegPath stg0Out,
     input logic [31:0] mullhs,
     input logic [31:0] mulrhs,
     input logic [31:0] addend,
@@ -124,16 +120,13 @@ module FMA_WithFFlagsStage0(
     // Special cases
     wire       mulres_is_tiny   = $signed(addend_shift) < 0 & !mulres_is_zero & !addend_is_zero; // |mullhs*mulrhs| < 0.5ULP(|addend|-eps)
     wire       res_is_tiny      = $signed(addend_shift) < 0 & !mulres_is_zero & addend_is_zero; // |mullhs*mulrhs+addend| < 0.5FLT_TRUE_MIN
-  
+
     // Fused-multiply-adder (24bit*24bit<<3+76bit+sign)
     // The multiplication result is shifted by 3 bits for the guard bit, the round bit, and the sticky bit.
     // The adder is sufficient for 76 bits + 1 sign bit because |lhs*rhs<<3| < 2^51 is <0.5 ULP when subtracted from 2^76. Note: ULP(1-eps) = 2^-24 while ULP(1+eps) = 2^-23.
     assign mlhs    = { 51'b0, v_mullhs_mant, 2'b0 };
     assign mrhs    = { 52'b0, v_mulrhs_mant, 1'b0 };
     assign maddend = { 1'b0, shifted_addend, addend_sticky };
-    // wire[76:0] multiplier_result = is_subtract ? multiplier_lhs * multiplier_rhs - multiplier_addend
-    //                                            : multiplier_lhs * multiplier_rhs + multiplier_addend;
-
     assign stg0Out = {v_fmares_expo, res_is_inf, result_is_nan,
                       res_is_addend, mul_sign, inf_sign, addend_sign, is_subtract, inf, nan, addend,
                       mulres_is_tiny, res_is_tiny, invalid_operation, round_mode, is_fmul};
@@ -141,10 +134,10 @@ endmodule
 
 module FMA_WithFFlagsStage1(
     input logic clk,
-    input FMAStage1RegPath stg1In,
-    output FMAStage2RegPath stg1Out
+    input FMA_WithFFlagsStage1RegPath stg1In,
+    output FMA_WithFFlagsStage2RegPath stg1Out
 );
-    FMAStage1RegPath pipeReg;
+    FMA_WithFFlagsStage1RegPath pipeReg;
     always_ff @(posedge clk) begin
         pipeReg <= stg1In; 
     end
@@ -153,11 +146,11 @@ endmodule
 
 module FMA_WithFFlagsStage2(
     input logic clk,
-    input FMAStage2RegPath stg2In,
-    output FMAStage3RegPath stg2Out,
+    input FMA_WithFFlagsStage2RegPath stg2In,
+    output FMA_WithFFlagsStage3RegPath stg2Out,
     input logic [76:0] fma_result
 );
-    FMAStage2RegPath pipeReg;
+    FMA_WithFFlagsStage2RegPath pipeReg;
     always_ff @(posedge clk) begin
         pipeReg <= stg2In; 
     end
@@ -168,16 +161,16 @@ module FMA_WithFFlagsStage2(
     wire[75:0] abs_fma_result  = res_is_negative ? -fma_result[75:0] : fma_result[75:0];
     wire       result_sign     = mul_sign ^ res_is_negative;
     
-    assign stg2Out = {abs_fma_result, pipeReg.mulres_expo, res_is_negative, pipeReg.result_is_inf,
+    assign stg2Out = {abs_fma_result, pipeReg.mulres_expo, pipeReg.result_is_inf,
                       pipeReg.result_is_nan, res_is_zero, pipeReg.res_is_addend, result_sign,
                       pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.inf, pipeReg.nan, pipeReg.addend,
-                      pipeReg.mul_sign, pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode, pipeReg.is_fmul};
+                      pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode, pipeReg.is_fmul};
 endmodule
 
 module FMA_WithFFlagsStage3(
     input logic clk,
-    input FMAStage3RegPath stg3In,
-    output FMAStage4RegPath stg3Out
+    input FMA_WithFFlagsStage3RegPath stg3In,
+    output FMA_WithFFlagsStage4RegPath stg3Out
 );
     function automatic [6:0] leading_zeros_count;
         input[75:0] x;
@@ -185,7 +178,7 @@ module FMA_WithFFlagsStage3(
             if(x[75-leading_zeros_count]) break;
     endfunction
     
-    FMAStage3RegPath pipeReg;
+    FMA_WithFFlagsStage3RegPath pipeReg;
     always_ff @(posedge clk) begin
         pipeReg <= stg3In; 
     end
@@ -198,15 +191,15 @@ module FMA_WithFFlagsStage3(
     wire [6:0] fmares_shift    = subnormal ? mulres_expo[6:0] // There are 3 bits below lhs*rhs<<3, and 23 bits will be lost due to rounding, assuming no carryover occurs in lhs*rhs.
                                            : leading_zeros + 1;   // (75 - addend_sticky(1bit)) - shifter_result(24bit)
     
-    assign stg3Out = {abs_fma_result, fmares_shift, virtual_expo, subnormal, pipeReg.res_is_negative, pipeReg.result_is_inf,
+    assign stg3Out = {abs_fma_result, fmares_shift, virtual_expo, subnormal, pipeReg.result_is_inf,
                       pipeReg.result_is_nan, pipeReg.res_is_zero, pipeReg.res_is_addend, pipeReg.result_sign,
                       pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.inf, pipeReg.nan, pipeReg.addend,
-                      pipeReg.mul_sign, pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode, pipeReg.is_fmul};
+                      pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode, pipeReg.is_fmul};
 endmodule
 
 module FMA_WithFFlagsStage4(
     input logic clk,
-    input FMAStage4RegPath stg4In,
+    input FMA_WithFFlagsStage4RegPath stg4In,
     output logic [31:0] result,
     output logic [4:0] fflags
 );
@@ -226,7 +219,7 @@ module FMA_WithFFlagsStage4(
         endcase
     endfunction
 
-    FMAStage4RegPath pipeReg;
+    FMA_WithFFlagsStage4RegPath pipeReg;
     always_ff @(posedge clk) begin
         pipeReg <= stg4In; 
     end
@@ -237,7 +230,6 @@ module FMA_WithFFlagsStage4(
     wire[31:0] inf             = pipeReg.inf;
     wire[31:0] nan             = pipeReg.nan;
     wire[31:0] addend          = pipeReg.addend;
-    wire res_is_negative       = pipeReg.res_is_negative;
     wire result_is_inf         = pipeReg.result_is_inf;
     wire result_is_nan         = pipeReg.result_is_nan;
     wire res_is_zero           = pipeReg.res_is_zero;
@@ -254,9 +246,7 @@ module FMA_WithFFlagsStage4(
     wire is_fmul               = pipeReg.is_fmul;
 
     // Normalize and rounding decision
-    /* verilator lint_off WIDTH */
     wire[24:0] shifter_result  = { abs_fma_result, 24'b0 } >> (7'd75 - fmares_shift); // [75:0] -> [24:0] normalizing left shift emulation. The 24'b0 is needed for cases where large cancellations occur. [24:0] = { mantissa(23bit), guard(1bit), extra_guard_for_underflow_detection(1bit) }
-    /* verilator lint_on WIDTH */
     wire       sticky          = abs_fma_result << (7'd25 + fmares_shift) != 0; // the part right-shifted out above
 
     wire       round_away      = round_to_away(result_sign, shifter_result[2], shifter_result[1], shifter_result[0] | sticky, round_mode);
@@ -269,12 +259,7 @@ module FMA_WithFFlagsStage4(
     wire [7:0] result_expo = (subnormal ? 8'h00 : virtual_expo[7:0]) + { 7'b0, exp_plus_one };
 
     // Special cases
-    // wire       mulres_is_zero   = mullhs_is_zero | mulrhs_is_zero;
     wire       res_is_huge      = $signed(virtual_expo) >= 255;
-    // wire       mulres_is_tiny   = $signed(addend_shift) < 0 & !mulres_is_zero & !addend_is_zero; // |mullhs*mulrhs| < 0.5ULP(|addend|-eps)
-    // wire       res_is_tiny      = $signed(addend_shift) < 0 & !mulres_is_zero & addend_is_zero; // |mullhs*mulrhs+addend| < 0.5FLT_TRUE_MIN
-    // wire       res_is_addend    = mulres_is_zero & !addend_is_zero;
-    // wire       res_is_zero      = multiplier_result == 77'h0; // including mulres_is_zero & addend_is_zero
     wire       dir_is_away      = (round_mode == 2 & result_sign) | (round_mode == 3 & !result_sign);
     wire       huge_is_inf      = round_mode == 0 | round_mode == 4 | dir_is_away;
 
diff --git a/Processor/Src/FloatingPointUnit/FPUTypes.sv b/Processor/Src/FloatingPointUnit/FPUTypes.sv
index 179a4f6d..b34a4d01 100644
--- a/Processor/Src/FloatingPointUnit/FPUTypes.sv
+++ b/Processor/Src/FloatingPointUnit/FPUTypes.sv
@@ -111,6 +111,68 @@ typedef struct packed {
     logic [25:0] quo;
 } FDivSqrtRegPath;
 
+// Pipeline registers for old FMA
+typedef struct packed {
+    logic [9:0] mulres_expo;
+    logic result_is_inf;
+    logic result_is_nan;
+    logic res_is_addend;
+    logic mul_sign;
+    logic prop_inf_sign;
+    logic addend_sign;
+    logic is_subtract;
+    logic [31:0] nan;
+    logic [31:0] addend;
+} FMAStage1RegPath;
+
+typedef struct packed {
+    logic [9:0] mulres_expo;
+    logic result_is_inf;
+    logic result_is_nan;
+    logic res_is_addend;
+    logic mul_sign;
+    logic prop_inf_sign;
+    logic addend_sign;
+    logic is_subtract;
+    logic [31:0] nan;
+    logic [31:0] addend;
+} FMAStage2RegPath;
+
+typedef struct packed {
+    logic [75:0] abs_fma_result;
+    logic [9:0] mulres_expo;
+    logic res_is_negative;
+    logic result_is_inf;
+    logic result_is_nan;
+    logic res_is_zero;
+    logic res_is_addend;
+    logic result_sign;
+    logic prop_inf_sign;
+    logic addend_sign;
+    logic is_subtract;
+    logic [31:0] nan;
+    logic [31:0] addend;
+} FMAStage3RegPath;
+
+typedef struct packed {
+    logic [75:0] abs_fma_result;
+    logic [7:0] fmares_shift;
+    logic [9:0] virtual_expo;
+    logic subnormal;
+    logic res_is_negative;
+    logic result_is_inf;
+    logic result_is_nan;
+    logic res_is_zero;
+    logic res_is_addend;
+    logic result_sign;
+    logic prop_inf_sign;
+    logic addend_sign;
+    logic is_subtract;
+    logic [31:0] nan;
+    logic [31:0] addend;
+} FMAStage4RegPath;
+
+// Pipeline registers for FMA with fflags
 typedef struct packed {
     logic [9:0] mulres_expo;
     logic result_is_inf;
@@ -128,7 +190,7 @@ typedef struct packed {
     logic invalid_operation;
     logic [2:0] round_mode;
     logic is_fmul;
-} FMAStage1RegPath;
+} FMA_WithFFlagsStage1RegPath;
 
 typedef struct packed {
     logic [9:0] mulres_expo;
@@ -147,12 +209,11 @@ typedef struct packed {
     logic invalid_operation;
     logic [2:0] round_mode;
     logic is_fmul;
-} FMAStage2RegPath;
+} FMA_WithFFlagsStage2RegPath;
 
 typedef struct packed {
     logic [75:0] abs_fma_result;
     logic [9:0] mulres_expo;
-    logic res_is_negative;
     logic result_is_inf;
     logic result_is_nan;
     logic res_is_zero;
@@ -164,20 +225,18 @@ typedef struct packed {
     logic [31:0] inf;
     logic [31:0] nan;
     logic [31:0] addend;
-    logic mul_sign;
     logic mulres_is_tiny;
     logic res_is_tiny;
     logic invalid_operation;
     logic [2:0] round_mode;
     logic is_fmul;
-} FMAStage3RegPath;
+} FMA_WithFFlagsStage3RegPath;
 
 typedef struct packed {
     logic [75:0] abs_fma_result;
     logic [6:0] fmares_shift;
     logic [9:0] virtual_expo;
     logic subnormal;
-    logic res_is_negative;
     logic result_is_inf;
     logic result_is_nan;
     logic res_is_zero;
@@ -189,12 +248,11 @@ typedef struct packed {
     logic [31:0] inf;
     logic [31:0] nan;
     logic [31:0] addend;
-    logic mul_sign;
     logic mulres_is_tiny;
     logic res_is_tiny;
     logic invalid_operation;
     logic [2:0] round_mode;
     logic is_fmul;
-} FMAStage4RegPath;
+} FMA_WithFFlagsStage4RegPath;
 
 endpackage
\ No newline at end of file
diff --git a/Processor/Src/Makefiles/CoreSources.inc.mk b/Processor/Src/Makefiles/CoreSources.inc.mk
index 52d974fa..6abd1698 100644
--- a/Processor/Src/Makefiles/CoreSources.inc.mk
+++ b/Processor/Src/Makefiles/CoreSources.inc.mk
@@ -114,6 +114,7 @@ CORE_MODULES = \
 	FloatingPointUnit/FP32PipelinedAdder.sv \
 	FloatingPointUnit/FP32PipelinedMultiplier.sv \
 	FloatingPointUnit/FP32PipelinedFMA.sv \
+	FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv \
 	FloatingPointUnit/FP32PipelinedOther.sv \
 	FloatingPointUnit/FP32DivSqrter.sv \
 	FloatingPointUnit/FP32DivSqrterWithFFlags.sv \

From 85aa273d3048b3f6004a92d7832de775f5cc351b Mon Sep 17 00:00:00 2001
From: Reoma Matsuo <matsuo@rsg.ci.i.u-tokyo.ac.jp>
Date: Fri, 20 Dec 2024 14:41:05 +0900
Subject: [PATCH 04/11] refactor: remove unnecessary variables

---
 .../Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv      | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv
index 9e27d590..c01d4375 100644
--- a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv
+++ b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv
@@ -271,10 +271,6 @@ module FMA_WithFFlagsStage4(
                                                                                 : addend;
     wire[31:0] huge             = huge_is_inf ? { result_sign, 8'hff, 23'h0 } : { result_sign, 8'hfe, 23'h7fffff };
     wire[31:0] tiny             = dir_is_away ? { result_sign, 8'h00, 23'h1 } : { result_sign, 8'h00, 23'h0 };
-    
-    wire [7:0] addend_expo = addend[30:23];
-    wire[22:0] addend_mant = addend[22:0];
-    wire addend_is_zero         = addend_expo == 8'h00 & addend_mant == 0;
     wire[31:0] zero             = { is_fmul ? result_sign : (is_subtract ? round_mode == 2 : addend_sign), 8'h00, 23'h0 };
 
     // Final result

From 1f76587727f3055327d1515547e564fee7cf7b23 Mon Sep 17 00:00:00 2001
From: Reoma Matsuo <matsuo@rsg.ci.i.u-tokyo.ac.jp>
Date: Fri, 20 Dec 2024 17:19:05 +0900
Subject: [PATCH 05/11] fix: fix compile errors when synthesize on Vivado

---
 Processor/Src/FloatingPointUnit/FP32DivSqrter.sv | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv b/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv
index 759d3a50..6ffc1d2f 100644
--- a/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv
+++ b/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv
@@ -44,11 +44,11 @@ output
     logic [31:0] regResult, nextResult;
 
     wire       lhs_sign = input_lhs[31];
-    wire       rhs_sign = rhs[31];
+    wire       rhs_sign = input_rhs[31];
     wire [7:0] lhs_expo = input_lhs[30:23];
-    wire [7:0] rhs_expo = rhs[30:23];
+    wire [7:0] rhs_expo = input_rhs[30:23];
     wire[22:0] lhs_mant = input_lhs[22:0];
-    wire[22:0] rhs_mant = rhs[22:0];
+    wire[22:0] rhs_mant = input_rhs[22:0];
 
     // NaN handling
     wire lhs_is_zero = lhs_expo == 8'h00 & lhs_mant == 0;

From 0dec02eec3a558c5761bc73f6909f5850c74327c Mon Sep 17 00:00:00 2001
From: Reoma Matsuo <matsuo@rsg.ci.i.u-tokyo.ac.jp>
Date: Fri, 20 Dec 2024 18:30:21 +0900
Subject: [PATCH 06/11] fix: fix DivSqrter not to be a critical path

---
 .../Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv      | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv b/Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv
index 87c32b77..bf39c535 100644
--- a/Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv
+++ b/Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv
@@ -96,8 +96,8 @@ output
     wire[23:0] v_rhs_mant_w = rhs_expo == 0 ? { rhs_mant, 1'b0 } << leading_zeros_count(rhs_mant) : { 1'b1, rhs_mant };
     reg [23:0] v_lhs_mant, v_rhs_mant;
     wire dividend_normalize = v_lhs_mant < v_rhs_mant;
-    wire [9:0] virtual_expo = v_lhs_expo - v_rhs_expo + 127 - { 8'h0, dividend_normalize }; // new biased virtual exponent (ignores subnormals)
-    wire       subnormal    = is_divide & $signed(virtual_expo) <= 0;
+    wire [9:0] virtual_expo_w = v_lhs_expo - v_rhs_expo + 127 - { 8'h0, dividend_normalize }; // new biased virtual exponent (ignores subnormals)
+    wire       subnormal_w  = is_divide & $signed(virtual_expo_w) <= 0;
 
     // The SRT loop. rem needs 27 bits. 24(mantissa)+2(x8/3,SRT)+1(sign)
     wire[26:0] rem_0 = is_divide ? dividend_normalize ? { 2'b00, v_lhs_mant, 1'b0 } : { 3'b000, v_lhs_mant }
@@ -108,6 +108,8 @@ output
     reg  [3:0] stage;
     reg [26:0] rem;
     reg [25:0] quo;
+    reg  [9:0] virtual_expo;
+    reg        subnormal;
     always@(posedge clk) begin
         if (rst) begin
             lhs <= '0;
@@ -134,6 +136,8 @@ output
             rem <= rem_0;
             quo <= quo_0;
             stage <= is_divide ? 0 : 1;
+            virtual_expo <= virtual_expo_w;
+            subnormal <= subnormal_w;
         end else begin
             reg[3:0] div = is_divide ? { 1'b0, v_rhs_mant[22:20] }
                                      : { quo[25], quo[23:21] };

From 609740e2ef3fc623773d81a53bc130ac0898546d Mon Sep 17 00:00:00 2001
From: Reoma Matsuo <matsuo@rsg.ci.i.u-tokyo.ac.jp>
Date: Fri, 20 Dec 2024 20:48:17 +0900
Subject: [PATCH 07/11] fix: fix sign bit of 0 in FMA

---
 .../FP32PipelinedFMA_WithFFlags.sv                  | 13 +++++--------
 Processor/Src/FloatingPointUnit/FPUTypes.sv         |  4 ----
 .../Src/Pipeline/FPBackEnd/FPExecutionStage.sv      |  5 +----
 3 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv
index c01d4375..d4608940 100644
--- a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv
+++ b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv
@@ -7,7 +7,6 @@ input
     logic [31:0] mulrhs,
     logic [31:0] addend, 
     logic [2:0] round_mode,
-    logic is_fmul,
 output
     logic [31:0] result,
     logic [4:0] fflags
@@ -33,7 +32,7 @@ output
                                   : multiplier_lhs * multiplier_rhs + multiplier_addend;
     end
 
-    FMA_WithFFlagsStage0 stg0(clk, stg0Out, mullhs, mulrhs, addend, round_mode, is_fmul, is_sub, mlhs, mrhs, maddend);
+    FMA_WithFFlagsStage0 stg0(clk, stg0Out, mullhs, mulrhs, addend, round_mode, is_sub, mlhs, mrhs, maddend);
     FMA_WithFFlagsStage1 stg1(clk, stg0Out, stg1Out);
     FMA_WithFFlagsStage2 stg2(clk, stg1Out, stg2Out, fma_result);
     FMA_WithFFlagsStage3 stg3(clk, stg2Out, stg3Out);
@@ -47,7 +46,6 @@ module FMA_WithFFlagsStage0(
     input logic [31:0] mulrhs,
     input logic [31:0] addend,
     input logic [2:0] round_mode,
-    input logic is_fmul,
     output logic is_subtract,
     output logic [76:0] mlhs,
     output logic [76:0] mrhs,
@@ -129,7 +127,7 @@ module FMA_WithFFlagsStage0(
     assign maddend = { 1'b0, shifted_addend, addend_sticky };
     assign stg0Out = {v_fmares_expo, res_is_inf, result_is_nan,
                       res_is_addend, mul_sign, inf_sign, addend_sign, is_subtract, inf, nan, addend,
-                      mulres_is_tiny, res_is_tiny, invalid_operation, round_mode, is_fmul};
+                      mulres_is_tiny, res_is_tiny, invalid_operation, round_mode};
 endmodule
 
 module FMA_WithFFlagsStage1(
@@ -164,7 +162,7 @@ module FMA_WithFFlagsStage2(
     assign stg2Out = {abs_fma_result, pipeReg.mulres_expo, pipeReg.result_is_inf,
                       pipeReg.result_is_nan, res_is_zero, pipeReg.res_is_addend, result_sign,
                       pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.inf, pipeReg.nan, pipeReg.addend,
-                      pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode, pipeReg.is_fmul};
+                      pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode};
 endmodule
 
 module FMA_WithFFlagsStage3(
@@ -194,7 +192,7 @@ module FMA_WithFFlagsStage3(
     assign stg3Out = {abs_fma_result, fmares_shift, virtual_expo, subnormal, pipeReg.result_is_inf,
                       pipeReg.result_is_nan, pipeReg.res_is_zero, pipeReg.res_is_addend, pipeReg.result_sign,
                       pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.inf, pipeReg.nan, pipeReg.addend,
-                      pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode, pipeReg.is_fmul};
+                      pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode};
 endmodule
 
 module FMA_WithFFlagsStage4(
@@ -243,7 +241,6 @@ module FMA_WithFFlagsStage4(
     wire res_is_tiny           = pipeReg.res_is_tiny;
     wire invalid_operation     = pipeReg.invalid_operation;
     wire [2:0] round_mode      = pipeReg.round_mode;
-    wire is_fmul               = pipeReg.is_fmul;
 
     // Normalize and rounding decision
     wire[24:0] shifter_result  = { abs_fma_result, 24'b0 } >> (7'd75 - fmares_shift); // [75:0] -> [24:0] normalizing left shift emulation. The 24'b0 is needed for cases where large cancellations occur. [24:0] = { mantissa(23bit), guard(1bit), extra_guard_for_underflow_detection(1bit) }
@@ -271,7 +268,7 @@ module FMA_WithFFlagsStage4(
                                                                                 : addend;
     wire[31:0] huge             = huge_is_inf ? { result_sign, 8'hff, 23'h0 } : { result_sign, 8'hfe, 23'h7fffff };
     wire[31:0] tiny             = dir_is_away ? { result_sign, 8'h00, 23'h1 } : { result_sign, 8'h00, 23'h0 };
-    wire[31:0] zero             = { is_fmul ? result_sign : (is_subtract ? round_mode == 2 : addend_sign), 8'h00, 23'h0 };
+    wire[31:0] zero             = { is_subtract ? round_mode == 2 : addend_sign, 8'h00, 23'h0 };
 
     // Final result
     assign result = result_is_nan  ? nan              :
diff --git a/Processor/Src/FloatingPointUnit/FPUTypes.sv b/Processor/Src/FloatingPointUnit/FPUTypes.sv
index b34a4d01..5ac4f562 100644
--- a/Processor/Src/FloatingPointUnit/FPUTypes.sv
+++ b/Processor/Src/FloatingPointUnit/FPUTypes.sv
@@ -189,7 +189,6 @@ typedef struct packed {
     logic res_is_tiny;
     logic invalid_operation;
     logic [2:0] round_mode;
-    logic is_fmul;
 } FMA_WithFFlagsStage1RegPath;
 
 typedef struct packed {
@@ -208,7 +207,6 @@ typedef struct packed {
     logic res_is_tiny;
     logic invalid_operation;
     logic [2:0] round_mode;
-    logic is_fmul;
 } FMA_WithFFlagsStage2RegPath;
 
 typedef struct packed {
@@ -229,7 +227,6 @@ typedef struct packed {
     logic res_is_tiny;
     logic invalid_operation;
     logic [2:0] round_mode;
-    logic is_fmul;
 } FMA_WithFFlagsStage3RegPath;
 
 typedef struct packed {
@@ -252,7 +249,6 @@ typedef struct packed {
     logic res_is_tiny;
     logic invalid_operation;
     logic [2:0] round_mode;
-    logic is_fmul;
 } FMA_WithFFlagsStage4RegPath;
 
 endpackage
\ No newline at end of file
diff --git a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv
index 0fd3123d..c6ac9edb 100644
--- a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv
+++ b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv
@@ -154,7 +154,6 @@ module FPExecutionStage(
             .mulrhs (fmaMulRHS[i]),
             .addend (fmaAddend[i]),
             .round_mode (rm[i]),
-            .is_fmul (fpuCode[i] == FC_MUL),
             .result ( fmaDataOut[i] ),
             .fflags ( fmaFFlagsOut[i])
         );
@@ -178,7 +177,7 @@ module FPExecutionStage(
             fmaMulLHS[i] = fpuCode[i] inside {FC_FNMSUB, FC_FNMADD} ? {~fuOpA[i].data[31], fuOpA[i].data[30:0]} : fuOpA[i].data;
             fmaMulRHS[i] = fpuCode[i] inside {FC_ADD, FC_SUB} ? 32'h3f800000 : fuOpB[i].data;
             if(fpuCode[i] == FC_MUL) begin
-                fmaAddend[i] = 32'h00000000;
+                fmaAddend[i] = { fmaMulLHS[i][31] ^ fmaMulRHS[i][31] , 31'h0 };
             end
             else if (fpuCode[i] == FC_ADD) begin
                 fmaAddend[i] = fuOpB[i].data;
@@ -291,7 +290,6 @@ module FPExecutionStage(
             //
             dataOut[i].valid
                 = localPipeReg[i][FP_EXEC_STAGE_DEPTH-2].regValid;
-            // TODO fflagsをちゃんと実装
             unique case ( localPipeReg[i][FP_EXEC_STAGE_DEPTH-2].fpQueueData.fpOpInfo.opType )
                 FP_MOP_TYPE_ADD, FP_MOP_TYPE_MUL, FP_MOP_TYPE_FMA: begin
                     dataOut[i].data = fmaDataOut[i];
@@ -378,7 +376,6 @@ module FPExecutionStage(
 
             nextStage[i].fpQueueData
                 = localPipeReg[i][FP_EXEC_STAGE_DEPTH-2].fpQueueData;
-            // TODO implment fflags
             nextStage[i].fflagsOut = fflagsOut[i];
 
             // リセットorフラッシュ時はNOP

From ed21fdeb5f712d934f74048a14ffc9bc5ae08958 Mon Sep 17 00:00:00 2001
From: Reoma Matsuo <matsuo@rsg.ci.i.u-tokyo.ac.jp>
Date: Fri, 20 Dec 2024 20:55:26 +0900
Subject: [PATCH 08/11] fix: remove redundant variables in FMA

---
 .../Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv | 9 ++++-----
 Processor/Src/FloatingPointUnit/FPUTypes.sv              | 4 ----
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv
index d4608940..3fdf8a7f 100644
--- a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv
+++ b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv
@@ -101,7 +101,6 @@ module FMA_WithFFlagsStage0(
     wire res_is_inf = addend_is_inf | mullhs_is_inf | mulrhs_is_inf;
     wire mul_sign       = mullhs_sign ^ mulrhs_sign;
     wire inf_sign   = addend_is_inf ? addend_sign : mul_sign;
-    wire[31:0] inf  = { inf_sign, 8'hff, 23'h0 };
 
     // Main path (including subnormal handling)
     wire [9:0] v_mullhs_expo  = { 2'b0, mullhs_expo == 8'h00 ? 8'h01 : mullhs_expo };
@@ -126,7 +125,7 @@ module FMA_WithFFlagsStage0(
     assign mrhs    = { 52'b0, v_mulrhs_mant, 1'b0 };
     assign maddend = { 1'b0, shifted_addend, addend_sticky };
     assign stg0Out = {v_fmares_expo, res_is_inf, result_is_nan,
-                      res_is_addend, mul_sign, inf_sign, addend_sign, is_subtract, inf, nan, addend,
+                      res_is_addend, mul_sign, inf_sign, addend_sign, is_subtract, nan, addend,
                       mulres_is_tiny, res_is_tiny, invalid_operation, round_mode};
 endmodule
 
@@ -161,7 +160,7 @@ module FMA_WithFFlagsStage2(
     
     assign stg2Out = {abs_fma_result, pipeReg.mulres_expo, pipeReg.result_is_inf,
                       pipeReg.result_is_nan, res_is_zero, pipeReg.res_is_addend, result_sign,
-                      pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.inf, pipeReg.nan, pipeReg.addend,
+                      pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.nan, pipeReg.addend,
                       pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode};
 endmodule
 
@@ -191,7 +190,7 @@ module FMA_WithFFlagsStage3(
     
     assign stg3Out = {abs_fma_result, fmares_shift, virtual_expo, subnormal, pipeReg.result_is_inf,
                       pipeReg.result_is_nan, pipeReg.res_is_zero, pipeReg.res_is_addend, pipeReg.result_sign,
-                      pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.inf, pipeReg.nan, pipeReg.addend,
+                      pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.nan, pipeReg.addend,
                       pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode};
 endmodule
 
@@ -225,7 +224,6 @@ module FMA_WithFFlagsStage4(
     wire[75:0] abs_fma_result  = pipeReg.abs_fma_result;
     wire [7:0] fmares_shift    = pipeReg.fmares_shift;
     wire [9:0] virtual_expo    = pipeReg.virtual_expo;
-    wire[31:0] inf             = pipeReg.inf;
     wire[31:0] nan             = pipeReg.nan;
     wire[31:0] addend          = pipeReg.addend;
     wire result_is_inf         = pipeReg.result_is_inf;
@@ -269,6 +267,7 @@ module FMA_WithFFlagsStage4(
     wire[31:0] huge             = huge_is_inf ? { result_sign, 8'hff, 23'h0 } : { result_sign, 8'hfe, 23'h7fffff };
     wire[31:0] tiny             = dir_is_away ? { result_sign, 8'h00, 23'h1 } : { result_sign, 8'h00, 23'h0 };
     wire[31:0] zero             = { is_subtract ? round_mode == 2 : addend_sign, 8'h00, 23'h0 };
+    wire[31:0] inf  = { prop_inf_sign, 8'hff, 23'h0 };
 
     // Final result
     assign result = result_is_nan  ? nan              :
diff --git a/Processor/Src/FloatingPointUnit/FPUTypes.sv b/Processor/Src/FloatingPointUnit/FPUTypes.sv
index 5ac4f562..2f313d85 100644
--- a/Processor/Src/FloatingPointUnit/FPUTypes.sv
+++ b/Processor/Src/FloatingPointUnit/FPUTypes.sv
@@ -182,7 +182,6 @@ typedef struct packed {
     logic prop_inf_sign;
     logic addend_sign;
     logic is_subtract;
-    logic [31:0] inf;
     logic [31:0] nan;
     logic [31:0] addend;
     logic mulres_is_tiny;
@@ -200,7 +199,6 @@ typedef struct packed {
     logic prop_inf_sign;
     logic addend_sign;
     logic is_subtract;
-    logic [31:0] inf;
     logic [31:0] nan;
     logic [31:0] addend;
     logic mulres_is_tiny;
@@ -220,7 +218,6 @@ typedef struct packed {
     logic prop_inf_sign;
     logic addend_sign;
     logic is_subtract;
-    logic [31:0] inf;
     logic [31:0] nan;
     logic [31:0] addend;
     logic mulres_is_tiny;
@@ -242,7 +239,6 @@ typedef struct packed {
     logic prop_inf_sign;
     logic addend_sign;
     logic is_subtract;
-    logic [31:0] inf;
     logic [31:0] nan;
     logic [31:0] addend;
     logic mulres_is_tiny;

From bcf9e7f0949a17a83eba359854a8f38db31f4457 Mon Sep 17 00:00:00 2001
From: Reoma Matsuo <matsuo@rsg.ci.i.u-tokyo.ac.jp>
Date: Sat, 21 Dec 2024 00:18:17 +0900
Subject: [PATCH 09/11] refactor: add a comment

---
 Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv
index c6ac9edb..72703102 100644
--- a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv
+++ b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv
@@ -177,6 +177,10 @@ module FPExecutionStage(
             fmaMulLHS[i] = fpuCode[i] inside {FC_FNMSUB, FC_FNMADD} ? {~fuOpA[i].data[31], fuOpA[i].data[30:0]} : fuOpA[i].data;
             fmaMulRHS[i] = fpuCode[i] inside {FC_ADD, FC_SUB} ? 32'h3f800000 : fuOpB[i].data;
             if(fpuCode[i] == FC_MUL) begin
+                // Hack: set sign bit considering rounding mode
+                // +a * +0.0 should return +0.0 regardless of rounding mode,
+                // However, when implemented with fma(+a, +0.0, -0.0),
+                // it returns -0.0 when round_mode = 2
                 fmaAddend[i] = { fmaMulLHS[i][31] ^ fmaMulRHS[i][31] , 31'h0 };
             end
             else if (fpuCode[i] == FC_ADD) begin

From c7232ff469fad48ac1a558732f73889d15e5f644 Mon Sep 17 00:00:00 2001
From: Reoma Matsuo <matsuo@rsg.ci.i.u-tokyo.ac.jp>
Date: Sat, 21 Dec 2024 12:40:26 +0900
Subject: [PATCH 10/11] refactor: remove FMA and divider without fflags

---
 .../Src/FloatingPointUnit/FP32DivSqrter.sv    | 328 +++++++++---------
 .../FP32DivSqrterWithFFlags.sv                | 210 -----------
 .../Src/FloatingPointUnit/FP32PipelinedFMA.sv | 226 ++++++++----
 .../FP32PipelinedFMA_WithFFlags.sv            | 291 ----------------
 .../Src/FloatingPointUnit/FPDivSqrtUnit.sv    |   2 +-
 Processor/Src/FloatingPointUnit/FPUTypes.sv   |  70 +---
 Processor/Src/Makefiles/CoreSources.inc.mk    |   2 -
 .../Pipeline/FPBackEnd/FPExecutionStage.sv    |   2 +-
 8 files changed, 336 insertions(+), 795 deletions(-)
 delete mode 100644 Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv
 delete mode 100644 Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv

diff --git a/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv b/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv
index 6ffc1d2f..83356385 100644
--- a/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv
+++ b/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv
@@ -1,54 +1,69 @@
-import FPUTypes::*;
 
-module FP32DivSqrter (
+module FP32DivSqrter(
 input
     logic clk, rst,
-    logic [31:0] input_lhs,
+    logic [31:0] input_lhs, 
     logic [31:0] input_rhs,
-    logic input_is_divide,
-    logic req,
+    logic input_is_divide, 
+    logic [2:0] input_round_mode,
+    logic req, 
 output
-    logic finished,
-    logic [31:0] result
+    logic [31:0] result,
+    logic [4:0] fflags,
+    logic finished
 );
 
-    function automatic [2:0] srt_table;
+    function round_to_away;
+        input[2:0] round_mode;
+        input      sign;
+        input      last_place;
+        input      guard_bit;
+        input      sticky_bit;
+        input      reminder_is_positive;
+        input      reminder_is_zero;
+
+        case(round_mode)
+            3'b000:  round_to_away = guard_bit & (sticky_bit | reminder_is_positive | (reminder_is_zero & last_place)); // round to nearest, ties to even
+            3'b100:  round_to_away = guard_bit & (sticky_bit | reminder_is_positive | reminder_is_zero);                // round to nearest, ties to away
+            3'b010:  round_to_away = sign & (guard_bit | sticky_bit | reminder_is_positive);  // round downward
+            3'b011:  round_to_away = !sign & (guard_bit | sticky_bit | reminder_is_positive); // round upward
+            default: round_to_away = 0; // round towards zero
+        endcase
+    endfunction
+
+    function[2:0] srt_table;
         input[5:0] rem;
         input[3:0] div;
 
-        reg[5:0] th12 = div < 1 ? 6 : div < 2 ? 7 : div < 4 ? 8 : div < 5 ? 9 : div < 6 ? 10 : 11;
-        reg[5:0] th01 =               div < 2 ? 2 :                             div < 6 ?  3 :  4;
+        reg[5:0] th12;
+        reg[5:0] th01;
+        th12 = div < 1 ? 6 : div < 2 ? 7 : div < 4 ? 8 : div < 5 ? 9 : div < 6 ? 10 : 11;
+        th01 =               div < 2 ? 2 :                             div < 6 ?  3 :  4;
 
-             if($signed(rem) < $signed(-th12)) srt_table = -2;
+            if($signed(rem) < $signed(-th12)) srt_table = -2;
         else if($signed(rem) < $signed(-th01)) srt_table = -1;
         else if($signed(rem) < $signed( th01)) srt_table =  0;
         else if($signed(rem) < $signed( th12)) srt_table =  1;
         else                                   srt_table =  2;
     endfunction
-    function automatic [9:0] leading_zeros_count;
+
+    reg [31:0] lhs;
+    reg [31:0] rhs;
+    reg        is_divide;
+    reg  [2:0] round_mode;
+
+    function [9:0] leading_zeros_count;
         input[22:0] x;
         for(leading_zeros_count = 0; leading_zeros_count <= 22; leading_zeros_count = leading_zeros_count + 1)
-            if(x >> (22-leading_zeros_count) != 0) break;
+            if(x[22-leading_zeros_count]) break;
     endfunction
-    typedef enum logic[1:0]
-    {
-        PHASE_FINISHED = 0,      // Division is finished. It outputs results.
-        PHASE_PREPARATION = 1,   // In preparation
-        PHASE_PROCESSING = 2,    // In processing (SRT loop)
-        PHASE_ROUNDING = 3       // In rounding & arrangement
-    } Phase;
-
-    Phase regPhase, nextPhase; 
-    logic [4:0] regCounter, nextCounter;
-    FDivSqrtRegPath regData, nextData;
-    logic [31:0] regResult, nextResult;
-
-    wire       lhs_sign = input_lhs[31];
-    wire       rhs_sign = input_rhs[31];
-    wire [7:0] lhs_expo = input_lhs[30:23];
-    wire [7:0] rhs_expo = input_rhs[30:23];
-    wire[22:0] lhs_mant = input_lhs[22:0];
-    wire[22:0] rhs_mant = input_rhs[22:0];
+
+    wire       lhs_sign = lhs[31];
+    wire       rhs_sign = rhs[31];
+    wire [7:0] lhs_expo = lhs[30:23];
+    wire [7:0] rhs_expo = rhs[30:23];
+    wire[22:0] lhs_mant = lhs[22:0];
+    wire[22:0] rhs_mant = rhs[22:0];
 
     // NaN handling
     wire lhs_is_zero = lhs_expo == 8'h00 & lhs_mant == 0;
@@ -57,136 +72,139 @@ output
     wire rhs_is_inf  = rhs_expo == 8'hff & rhs_mant == 0;
     wire lhs_is_nan  = lhs_expo == 8'hff & lhs_mant != 0;
     wire rhs_is_nan  = rhs_expo == 8'hff & rhs_mant != 0;
-    wire lhs_is_neg  = lhs_sign & input_lhs != 32'h80000000;
-    wire res_is_nan  = input_is_divide ? lhs_is_nan | rhs_is_nan | (lhs_is_zero & rhs_is_zero) | (lhs_is_inf & rhs_is_inf)
+    wire lhs_is_snan = lhs_is_nan & lhs_mant[22] == 0;
+    wire rhs_is_snan = rhs_is_nan & rhs_mant[22] == 0;
+    wire lhs_is_neg  = !lhs_is_nan & lhs_sign & lhs != 32'h80000000;
+    wire res_is_nan  = is_divide ? lhs_is_nan | rhs_is_nan | (lhs_is_zero & rhs_is_zero) | (lhs_is_inf & rhs_is_inf)
                                  : lhs_is_nan | lhs_is_neg;
-    //wire[31:0]  nan  = is_divide ? lhs_is_nan ? input_lhs | 32'h00400000 : rhs_is_nan ? rhs | 32'h00400000 : 32'hffc00000
-    //                             : lhs_is_nan ? input_lhs | 32'h00400000 : 32'hffc00000; // qNaN
-    wire[31:0]   nan = 32'h7fc00000;
+    // === About handling NaN ===
+    // x86 returns the following qNaN:
+    //  mullhs_is_nan ? mullhs | 32'h00400000 :
+    //  mulrhs_is_nan ? mulrhs | 32'h00400000 :
+    //  addend_is_nan ? addend | 32'h00400000 : 32'hffc00000
+    // RISC-V always returns canonical NaN (32'h7fc00000).
+    // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114
+    wire[31:0]  nan  = 32'h7fc00000;
+    wire invalid_operation = is_divide ? lhs_is_snan | rhs_is_snan | (lhs_is_zero & rhs_is_zero) | (lhs_is_inf & rhs_is_inf)
+                                       : lhs_is_snan | lhs_is_neg;
 
     // Preparation
-    wire       result_sign  = input_is_divide & (lhs_sign ^ rhs_sign);
-    wire [9:0] v_lhs_expo   = lhs_expo == 0 ? -leading_zeros_count(lhs_mant) : { 2'b0, lhs_expo }; // virtual exponent (ignores subnormals, but is biased)
-    wire [9:0] v_rhs_expo   = rhs_expo == 0 ? -leading_zeros_count(rhs_mant) : { 2'b0, rhs_expo }; // virtual exponent (ignores subnormals, but is biased)
-    wire[23:0] v_lhs_mant = lhs_expo == 0 ? { lhs_mant, 1'b0 } << leading_zeros_count(lhs_mant) : { 1'b1, lhs_mant };
-    wire[23:0] v_rhs_mant = rhs_expo == 0 ? { rhs_mant, 1'b0 } << leading_zeros_count(rhs_mant) : { 1'b1, rhs_mant };
-
-    wire dividend_normalize = regData.v_lhs_mant < regData.v_rhs_mant;
-    wire [9:0] virtual_expo = regData.v_lhs_expo - regData.v_rhs_expo + 127 - { 8'h0, dividend_normalize }; // new biased virtual exponent (ignores subnormals)
-    wire       subnormal    = regData.is_divide & $signed(virtual_expo) <= 0;
-    wire       res_is_zero  = regData.is_divide ? $signed(virtual_expo) <= -24 | regData.res_is_zero
-                                        : regData.res_is_zero;
+    wire       result_sign  = is_divide ? lhs_sign ^ rhs_sign : lhs_sign;
+    wire [9:0] v_lhs_expo   = lhs_expo == 0 ? -leading_zeros_count(lhs_mant) : { 2'b0, lhs_expo }; // biased virtual exponent (ignores subnormals)
+    wire [9:0] v_rhs_expo   = rhs_expo == 0 ? -leading_zeros_count(rhs_mant) : { 2'b0, rhs_expo }; // biased virtual exponent (ignores subnormals)
+    wire[23:0] v_lhs_mant_w = lhs_expo == 0 ? { lhs_mant, 1'b0 } << leading_zeros_count(lhs_mant) : { 1'b1, lhs_mant };
+    wire[23:0] v_rhs_mant_w = rhs_expo == 0 ? { rhs_mant, 1'b0 } << leading_zeros_count(rhs_mant) : { 1'b1, rhs_mant };
+    reg [23:0] v_lhs_mant, v_rhs_mant;
+    wire dividend_normalize = v_lhs_mant < v_rhs_mant;
+    wire [9:0] virtual_expo_w = v_lhs_expo - v_rhs_expo + 127 - { 8'h0, dividend_normalize }; // new biased virtual exponent (ignores subnormals)
+    wire       subnormal_w  = is_divide & $signed(virtual_expo_w) <= 0;
 
     // The SRT loop. rem needs 27 bits. 24(mantissa)+2(x8/3,SRT)+1(sign)
-    wire[26:0] rem_0 = regData.is_divide ? dividend_normalize ? { 2'b00, regData.v_lhs_mant, 1'b0 } : { 3'b000, regData.v_lhs_mant }
-                                 : regData.v_lhs_expo[0] ? { 2'b0, regData.v_lhs_mant, 1'b0 } - 27'h1e40000 : { 1'b0, regData.v_lhs_mant, 2'b0 } - 27'h2400000; // 2 * (x - 1.375^2 or 1.5^2)
-    wire[25:0] quo_0 = regData.is_divide ? 26'h0
-                                 : regData.v_lhs_expo[0] ? 26'h1600000 : 26'h1800000; // magical initial guess: 1.375 or 1.5; this avoids SRT-table defects at ([-4.5,-4-11/36], 1.5) and ([-4,-4+1/144], 1.25)
-
-    logic [2:0] q;
-    logic [3:0] div;
-    logic [26:0] rem;
-    logic [25:0] quo;
-    always_comb begin
-        rem = regData.rem;
-        quo = regData.quo;
-        div = regData.is_divide ? { 1'b0, regData.v_rhs_mant[22:20] } : { quo[25], quo[23:21] };
-        q = srt_table( rem[26:21], div );
-        case(q)
-            3'b010: rem = regData.is_divide ? (rem << 2) - { regData.v_rhs_mant, 3'b000 }
-                                             : (rem << 2) - { quo[24:0], 2'b00 } - (27'd4 << (regCounter));
-            3'b001: rem = regData.is_divide ? (rem << 2) - { 1'b0, regData.v_rhs_mant, 2'b00 }
-                                             : (rem << 2) - { quo, 1'b0 } - (27'd1 << (regCounter));
-            3'b111: rem = regData.is_divide ? (rem << 2) + { 1'b0, regData.v_rhs_mant, 2'b00 }
-                                             : (rem << 2) + { quo, 1'b0 } - (27'd1 << (regCounter));
-            3'b110: rem = regData.is_divide ? (rem << 2) + { regData.v_rhs_mant, 3'b000 }
-                                             : (rem << 2) + { quo[24:0], 2'b00 } - (27'd4 << (regCounter));
-            default: rem = rem << 2;
-        endcase
-        quo = quo + ({ {23{q[2]}}, q } << (regCounter));
-    end
-    
-    wire[47:0] before_round = regData.subnormal ? { 1'b1, regData.quo[23:0], 23'h0 } >> -regData.virtual_expo : { regData.quo[23:0], 24'h0 };
-    wire       round_away   = before_round[24] & ( (before_round[23:0] == 0 & regData.rem == 0 & before_round[25]) | before_round[23:0] != 0 | $signed(regData.rem) > 0 ); // round nearest, ties to even
-    wire       exp_plus_one = before_round[47:25] == 23'h7fffff & round_away;
-    wire[22:0] result_mant  = before_round[47:25] + { 22'h0, round_away }; // No special treatment is required even if a overflow occurs since the answer will be 0 and it will be correct.
-    wire [7:0] result_expo  = regData.is_divide ? (subnormal ? 8'h00 : regData.virtual_expo[7:0]) + { 7'h0, exp_plus_one }
-                                                : regData.v_lhs_expo[8:1] + { 7'b0, regData.v_lhs_expo[0] } + 63;
-    wire       res_is_inf   = regData.is_divide ? $signed(regData.virtual_expo) >= 255 | regData.res_is_inf | result_expo == 8'hff
-                                                : regData.res_is_inf;
-    wire[31:0] inf          = { regData.result_sign, 8'hff, 23'h0 };
-    wire[31:0] zero         = {{ regData.is_divide ? regData.result_sign : regData.lhs_sign }, 8'h00, 23'h0 };
-
-    wire[31:0] final_result = regData.res_is_nan  ? regData.nan :
-                              regData.res_is_zero ? zero :
-                              res_is_inf  ? inf  : { regData.result_sign, result_expo, result_mant };
-    
-    always_ff @(posedge clk) begin
+    wire[26:0] rem_0 = is_divide ? dividend_normalize ? { 2'b00, v_lhs_mant, 1'b0 } : { 3'b000, v_lhs_mant }
+                                 : v_lhs_expo[0] ? { 2'b0, v_lhs_mant_w, 1'b0 } - 27'h1e40000 : { 1'b0, v_lhs_mant_w, 2'b0 } - 27'h2400000; // 2 * (x - 1.375^2 or 1.5^2)
+    wire[25:0] quo_0 = is_divide ? 26'h0
+                                 : v_lhs_expo[0] ? 26'h1600000 : 26'h1800000; // magical initial guess: 1.375 or 1.5; this avoids SRT-table defects at ([-4.5,-4-11/36], 1.5) and ([-4,-4+1/144], 1.25)
+
+    reg  [3:0] stage;
+    reg [26:0] rem;
+    reg [25:0] quo;
+    reg  [9:0] virtual_expo;
+    reg        subnormal;
+    always@(posedge clk) begin
         if (rst) begin
-            regPhase <= PHASE_FINISHED;
-            regCounter <= '0;
-            regData <= '0;
-            regResult <= '0; 
-        end
-        else begin
-            regPhase <= nextPhase;
-            regCounter <= nextCounter;
-            regData <= nextData;
-            regResult <= nextResult; 
-        end
-    end
-    always_comb begin
-        nextCounter = regCounter;
-        nextData = regData;
-        nextResult = regResult;
-        if (req && regPhase == PHASE_FINISHED) begin
-            nextData.v_lhs_expo = v_lhs_expo;
-            nextData.v_lhs_mant = v_lhs_mant;
-            nextData.v_rhs_expo = v_rhs_expo;
-            nextData.v_rhs_mant = v_rhs_mant;
-            nextData.result_sign = result_sign;
-            nextData.lhs_sign = lhs_sign;
-            nextData.is_divide = input_is_divide;
-            nextData.res_is_nan = res_is_nan;
-            nextData.res_is_inf = input_is_divide ? (lhs_is_inf | rhs_is_zero) : (!lhs_sign & lhs_is_inf);
-            nextData.res_is_zero = input_is_divide ? (lhs_is_zero | rhs_is_inf) : lhs_is_zero;
-            nextData.nan = nan;
-            nextPhase = PHASE_PREPARATION;
+            lhs <= '0;
+            rhs <= '0;
+            stage <= '0;
+            rem <= '0;
+            quo <= '0;
+            v_lhs_mant <= '0;
+            v_rhs_mant <= '0;
         end
-        else if (regPhase == PHASE_PREPARATION) begin
-            nextData.virtual_expo = virtual_expo; 
-            nextData.subnormal = subnormal;
-            nextData.res_is_zero = res_is_zero;
-            nextData.rem = rem_0;
-            nextData.quo = quo_0;
-            nextPhase = PHASE_PROCESSING;
-            nextCounter = regData.is_divide ? 24 : 22;
+        else if (stage == 13) begin
+            if (req) begin
+                lhs <= input_lhs;
+                rhs <= input_rhs;
+                is_divide <= input_is_divide;
+                round_mode <= input_round_mode;
+                stage <= input_is_divide ? 14 : 15;
+            end
+        end else if (stage == 14) begin
+            v_lhs_mant <= v_lhs_mant_w;
+            v_rhs_mant <= v_rhs_mant_w;
+            stage <= 15;
+        end else if (stage == 15) begin
+            rem <= rem_0;
+            quo <= quo_0;
+            stage <= is_divide ? 0 : 1;
+            virtual_expo <= virtual_expo_w;
+            subnormal <= subnormal_w;
+        end else begin
+            reg[3:0] div = is_divide ? { 1'b0, v_rhs_mant[22:20] }
+                                     : { quo[25], quo[23:21] };
+            reg[2:0] q = srt_table( rem[26:21], div );
+            case(q)
+            3'b010: rem <= is_divide ? (rem << 2) - { v_rhs_mant, 3'b000 }
+                                     : (rem << 2) - { quo[24:0], 2'b00 } - (27'd4 << (24-stage*2));
+            3'b001: rem <= is_divide ? (rem << 2) - { 1'b0, v_rhs_mant, 2'b00 }
+                                     : (rem << 2) - { quo, 1'b0 } - (27'd1 << (24-stage*2));
+            3'b111: rem <= is_divide ? (rem << 2) + { 1'b0, v_rhs_mant, 2'b00 }
+                                     : (rem << 2) + { quo, 1'b0 } - (27'd1 << (24-stage*2));
+            3'b110: rem <= is_divide ? (rem << 2) + { v_rhs_mant, 3'b000 }
+                                     : (rem << 2) + { quo[24:0], 2'b00 } - (27'd4 << (24-stage*2));
+            default: rem <= rem << 2;
+            endcase
+            quo <= quo + ({ {23{q[2]}}, q } << (24-stage*2));
+            stage <= stage + 1;
         end
-        else if (regPhase == PHASE_PROCESSING) begin
-            nextData.rem = rem;
-            nextData.quo = quo;
-            nextCounter = regCounter - 2;
-            nextPhase = (regCounter == 0) ? PHASE_ROUNDING : PHASE_PROCESSING;
-        end
-        // Here, quo has a <1/3ULP error.
-        else if (regPhase == PHASE_ROUNDING) begin
-            nextResult = final_result;
-            nextPhase = PHASE_FINISHED;
-            nextCounter = '0;
-            nextData = '0;
-        end
-        else begin
-            nextPhase = regPhase;
-        end
-        finished = regPhase == PHASE_FINISHED;
-        result = regResult;
     end
-
+    assign finished = stage == 13; // Here, quo has a <1/3ULP error.
+
+    wire[47:0] before_round = subnormal ? { 1'b1, quo[23:0], 23'h0 } >> -virtual_expo : { quo[23:0], 24'h0 };
+    wire       round_away   = round_to_away(round_mode, result_sign, before_round[25], before_round[24], before_round[23:0] != 0, $signed(rem) > 0, rem == 0);
+    wire       round_fall   = round_mode == 2 ? !result_sign & before_round[24:0] == 0 & $signed(rem) < 0 : // ronud downward
+                              round_mode == 3 ? result_sign & before_round[24:0] == 0 & $signed(rem) < 0 : // ronud upward
+                              round_mode == 1 ? before_round[24:0] == 0 & $signed(rem) < 0 // round towards zero
+                                              : 0;
+    wire       exp_plus_one = before_round[47:25] == 23'h7fffff & round_away;
+    // Since dividend is normalized, situations where before_round[24:0] == 0 & $signed(rem) < 0 do not happen; thus, `exp_minus_one' is always zero.
+    // wire   exp_minus_one = before_round[47:25] == 23'h000000 & round_fall;
+    wire[22:0] result_mant  = before_round[47:25] + { 22'h0, round_away } - { 22'h0, round_fall }; // No special treatment is required even if a overflow occurs since the answer will be correct.
+    wire [7:0] result_expo  = is_divide ? (subnormal ? 8'h00 : virtual_expo[7:0]) + { 7'h0, exp_plus_one }
+                                        : v_lhs_expo[8:1] + { 7'b0, v_lhs_expo[0] } + 63 + { 7'h0, exp_plus_one };
+
+    // Treat p-127 as a normal for the underflow flag (rounding with unbounded exponent)
+    wire       u_round_away = round_to_away(round_mode, result_sign, quo[1], quo[0], 1'b0, $signed(rem) > 0, rem == 0);
+    wire     u_exp_plus_one = before_round[47:24] == 24'hffffff & u_round_away;
+
+    // Special cases
+    wire       res_is_huge  = is_divide & $signed(virtual_expo) >= 255;
+    wire       res_is_tiny  = is_divide & !lhs_is_zero & !rhs_is_inf & $signed(virtual_expo) <= -24;
+    wire       res_is_inf   = is_divide ? lhs_is_inf | rhs_is_zero
+                                        : lhs_is_inf;
+    wire       res_is_zero  = is_divide ? lhs_is_zero | rhs_is_inf
+                                        : lhs_is_zero;
+    wire       dir_is_away  = (round_mode == 2 & result_sign) | (round_mode == 3 & !result_sign);
+    wire       huge_is_inf  = round_mode == 0 | round_mode == 4 | dir_is_away;
+
+    wire[31:0] huge         = huge_is_inf ? { result_sign, 8'hff, 23'h0 } : { result_sign, 8'hfe, 23'h7fffff };
+    wire[31:0] tiny         = dir_is_away ? { result_sign, 8'h00, 23'h1 } : { result_sign, 8'h00, 23'h0 };
+    wire[31:0] inf          = { result_sign, 8'hff, 23'h0 };
+    wire[31:0] zero         = { result_sign, 8'h00, 23'h0 };
+
+    // Final result
+    assign result = res_is_nan  ? nan  :
+                    res_is_inf  ? inf  :
+                    res_is_huge ? huge :
+                    res_is_tiny ? tiny :
+                    res_is_zero ? zero : { result_sign, result_expo, result_mant };
+    // Exception flags
+    wire divide_by_zero     = is_divide & !res_is_nan & !lhs_is_inf & rhs_is_zero;
+    wire overflow           = is_divide & !res_is_nan & !lhs_is_inf & !rhs_is_zero & (res_is_huge | (virtual_expo == 254 & exp_plus_one));
+    wire inexact            = !res_is_nan & !(is_divide ? lhs_is_zero | lhs_is_inf | rhs_is_zero | rhs_is_inf : lhs_is_zero) & (overflow | res_is_tiny | before_round[24:0] != 0 | rem != 0);
+    // === About underflow (UF) flag
+    // RISC-V sets the UF flag when the absolute value of the result after rounding is less than FLT_MIN and the result is inexact. (same as x86)
+    // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114
+    wire underflow          = inexact & subnormal & !u_exp_plus_one;
+    //                NV                 DZ              OF        UF         NX
+    assign fflags = { invalid_operation, divide_by_zero, overflow, underflow, inexact };
 endmodule
-
-
-
-
-
-
-
diff --git a/Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv b/Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv
deleted file mode 100644
index bf39c535..00000000
--- a/Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv
+++ /dev/null
@@ -1,210 +0,0 @@
-
-module FP32DivSqrterWithFFlags(
-input
-    logic clk, rst,
-    logic [31:0] input_lhs, 
-    logic [31:0] input_rhs,
-    logic input_is_divide, 
-    logic [2:0] input_round_mode,
-    logic req, 
-output
-    logic [31:0] result,
-    logic [4:0] fflags,
-    logic finished
-);
-
-    function round_to_away;
-        input[2:0] round_mode;
-        input      sign;
-        input      last_place;
-        input      guard_bit;
-        input      sticky_bit;
-        input      reminder_is_positive;
-        input      reminder_is_zero;
-
-        case(round_mode)
-            3'b000:  round_to_away = guard_bit & (sticky_bit | reminder_is_positive | (reminder_is_zero & last_place)); // round to nearest, ties to even
-            3'b100:  round_to_away = guard_bit & (sticky_bit | reminder_is_positive | reminder_is_zero);                // round to nearest, ties to away
-            3'b010:  round_to_away = sign & (guard_bit | sticky_bit | reminder_is_positive);  // round downward
-            3'b011:  round_to_away = !sign & (guard_bit | sticky_bit | reminder_is_positive); // round upward
-            default: round_to_away = 0; // round towards zero
-        endcase
-    endfunction
-
-    function[2:0] srt_table;
-        input[5:0] rem;
-        input[3:0] div;
-
-        reg[5:0] th12;
-        reg[5:0] th01;
-        th12 = div < 1 ? 6 : div < 2 ? 7 : div < 4 ? 8 : div < 5 ? 9 : div < 6 ? 10 : 11;
-        th01 =               div < 2 ? 2 :                             div < 6 ?  3 :  4;
-
-            if($signed(rem) < $signed(-th12)) srt_table = -2;
-        else if($signed(rem) < $signed(-th01)) srt_table = -1;
-        else if($signed(rem) < $signed( th01)) srt_table =  0;
-        else if($signed(rem) < $signed( th12)) srt_table =  1;
-        else                                   srt_table =  2;
-    endfunction
-
-    reg [31:0] lhs;
-    reg [31:0] rhs;
-    reg        is_divide;
-    reg  [2:0] round_mode;
-
-    function [9:0] leading_zeros_count;
-        input[22:0] x;
-        for(leading_zeros_count = 0; leading_zeros_count <= 22; leading_zeros_count = leading_zeros_count + 1)
-            if(x[22-leading_zeros_count]) break;
-    endfunction
-
-    wire       lhs_sign = lhs[31];
-    wire       rhs_sign = rhs[31];
-    wire [7:0] lhs_expo = lhs[30:23];
-    wire [7:0] rhs_expo = rhs[30:23];
-    wire[22:0] lhs_mant = lhs[22:0];
-    wire[22:0] rhs_mant = rhs[22:0];
-
-    // NaN handling
-    wire lhs_is_zero = lhs_expo == 8'h00 & lhs_mant == 0;
-    wire rhs_is_zero = rhs_expo == 8'h00 & rhs_mant == 0;
-    wire lhs_is_inf  = lhs_expo == 8'hff & lhs_mant == 0;
-    wire rhs_is_inf  = rhs_expo == 8'hff & rhs_mant == 0;
-    wire lhs_is_nan  = lhs_expo == 8'hff & lhs_mant != 0;
-    wire rhs_is_nan  = rhs_expo == 8'hff & rhs_mant != 0;
-    wire lhs_is_snan = lhs_is_nan & lhs_mant[22] == 0;
-    wire rhs_is_snan = rhs_is_nan & rhs_mant[22] == 0;
-    wire lhs_is_neg  = !lhs_is_nan & lhs_sign & lhs != 32'h80000000;
-    wire res_is_nan  = is_divide ? lhs_is_nan | rhs_is_nan | (lhs_is_zero & rhs_is_zero) | (lhs_is_inf & rhs_is_inf)
-                                 : lhs_is_nan | lhs_is_neg;
-    // === About handling NaN ===
-    // x86 returns the following qNaN:
-    //  mullhs_is_nan ? mullhs | 32'h00400000 :
-    //  mulrhs_is_nan ? mulrhs | 32'h00400000 :
-    //  addend_is_nan ? addend | 32'h00400000 : 32'hffc00000
-    // RISC-V always returns canonical NaN (32'h7fc00000).
-    // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114
-    wire[31:0]  nan  = 32'h7fc00000;
-    wire invalid_operation = is_divide ? lhs_is_snan | rhs_is_snan | (lhs_is_zero & rhs_is_zero) | (lhs_is_inf & rhs_is_inf)
-                                       : lhs_is_snan | lhs_is_neg;
-
-    // Preparation
-    wire       result_sign  = is_divide ? lhs_sign ^ rhs_sign : lhs_sign;
-    wire [9:0] v_lhs_expo   = lhs_expo == 0 ? -leading_zeros_count(lhs_mant) : { 2'b0, lhs_expo }; // biased virtual exponent (ignores subnormals)
-    wire [9:0] v_rhs_expo   = rhs_expo == 0 ? -leading_zeros_count(rhs_mant) : { 2'b0, rhs_expo }; // biased virtual exponent (ignores subnormals)
-    wire[23:0] v_lhs_mant_w = lhs_expo == 0 ? { lhs_mant, 1'b0 } << leading_zeros_count(lhs_mant) : { 1'b1, lhs_mant };
-    wire[23:0] v_rhs_mant_w = rhs_expo == 0 ? { rhs_mant, 1'b0 } << leading_zeros_count(rhs_mant) : { 1'b1, rhs_mant };
-    reg [23:0] v_lhs_mant, v_rhs_mant;
-    wire dividend_normalize = v_lhs_mant < v_rhs_mant;
-    wire [9:0] virtual_expo_w = v_lhs_expo - v_rhs_expo + 127 - { 8'h0, dividend_normalize }; // new biased virtual exponent (ignores subnormals)
-    wire       subnormal_w  = is_divide & $signed(virtual_expo_w) <= 0;
-
-    // The SRT loop. rem needs 27 bits. 24(mantissa)+2(x8/3,SRT)+1(sign)
-    wire[26:0] rem_0 = is_divide ? dividend_normalize ? { 2'b00, v_lhs_mant, 1'b0 } : { 3'b000, v_lhs_mant }
-                                 : v_lhs_expo[0] ? { 2'b0, v_lhs_mant_w, 1'b0 } - 27'h1e40000 : { 1'b0, v_lhs_mant_w, 2'b0 } - 27'h2400000; // 2 * (x - 1.375^2 or 1.5^2)
-    wire[25:0] quo_0 = is_divide ? 26'h0
-                                 : v_lhs_expo[0] ? 26'h1600000 : 26'h1800000; // magical initial guess: 1.375 or 1.5; this avoids SRT-table defects at ([-4.5,-4-11/36], 1.5) and ([-4,-4+1/144], 1.25)
-
-    reg  [3:0] stage;
-    reg [26:0] rem;
-    reg [25:0] quo;
-    reg  [9:0] virtual_expo;
-    reg        subnormal;
-    always@(posedge clk) begin
-        if (rst) begin
-            lhs <= '0;
-            rhs <= '0;
-            stage <= '0;
-            rem <= '0;
-            quo <= '0;
-            v_lhs_mant <= '0;
-            v_rhs_mant <= '0;
-        end
-        else if (stage == 13) begin
-            if (req) begin
-                lhs <= input_lhs;
-                rhs <= input_rhs;
-                is_divide <= input_is_divide;
-                round_mode <= input_round_mode;
-                stage <= input_is_divide ? 14 : 15;
-            end
-        end else if (stage == 14) begin
-            v_lhs_mant <= v_lhs_mant_w;
-            v_rhs_mant <= v_rhs_mant_w;
-            stage <= 15;
-        end else if (stage == 15) begin
-            rem <= rem_0;
-            quo <= quo_0;
-            stage <= is_divide ? 0 : 1;
-            virtual_expo <= virtual_expo_w;
-            subnormal <= subnormal_w;
-        end else begin
-            reg[3:0] div = is_divide ? { 1'b0, v_rhs_mant[22:20] }
-                                     : { quo[25], quo[23:21] };
-            reg[2:0] q = srt_table( rem[26:21], div );
-            case(q)
-            3'b010: rem <= is_divide ? (rem << 2) - { v_rhs_mant, 3'b000 }
-                                     : (rem << 2) - { quo[24:0], 2'b00 } - (27'd4 << (24-stage*2));
-            3'b001: rem <= is_divide ? (rem << 2) - { 1'b0, v_rhs_mant, 2'b00 }
-                                     : (rem << 2) - { quo, 1'b0 } - (27'd1 << (24-stage*2));
-            3'b111: rem <= is_divide ? (rem << 2) + { 1'b0, v_rhs_mant, 2'b00 }
-                                     : (rem << 2) + { quo, 1'b0 } - (27'd1 << (24-stage*2));
-            3'b110: rem <= is_divide ? (rem << 2) + { v_rhs_mant, 3'b000 }
-                                     : (rem << 2) + { quo[24:0], 2'b00 } - (27'd4 << (24-stage*2));
-            default: rem <= rem << 2;
-            endcase
-            quo <= quo + ({ {23{q[2]}}, q } << (24-stage*2));
-            stage <= stage + 1;
-        end
-    end
-    assign finished = stage == 13; // Here, quo has a <1/3ULP error.
-
-    wire[47:0] before_round = subnormal ? { 1'b1, quo[23:0], 23'h0 } >> -virtual_expo : { quo[23:0], 24'h0 };
-    wire       round_away   = round_to_away(round_mode, result_sign, before_round[25], before_round[24], before_round[23:0] != 0, $signed(rem) > 0, rem == 0);
-    wire       round_fall   = round_mode == 2 ? !result_sign & before_round[24:0] == 0 & $signed(rem) < 0 : // ronud downward
-                              round_mode == 3 ? result_sign & before_round[24:0] == 0 & $signed(rem) < 0 : // ronud upward
-                              round_mode == 1 ? before_round[24:0] == 0 & $signed(rem) < 0 // round towards zero
-                                              : 0;
-    wire       exp_plus_one = before_round[47:25] == 23'h7fffff & round_away;
-    // Since dividend is normalized, situations where before_round[24:0] == 0 & $signed(rem) < 0 do not happen; thus, `exp_minus_one' is always zero.
-    // wire   exp_minus_one = before_round[47:25] == 23'h000000 & round_fall;
-    wire[22:0] result_mant  = before_round[47:25] + { 22'h0, round_away } - { 22'h0, round_fall }; // No special treatment is required even if a overflow occurs since the answer will be correct.
-    wire [7:0] result_expo  = is_divide ? (subnormal ? 8'h00 : virtual_expo[7:0]) + { 7'h0, exp_plus_one }
-                                        : v_lhs_expo[8:1] + { 7'b0, v_lhs_expo[0] } + 63 + { 7'h0, exp_plus_one };
-
-    // Treat p-127 as a normal for the underflow flag (rounding with unbounded exponent)
-    wire       u_round_away = round_to_away(round_mode, result_sign, quo[1], quo[0], 1'b0, $signed(rem) > 0, rem == 0);
-    wire     u_exp_plus_one = before_round[47:24] == 24'hffffff & u_round_away;
-
-    // Special cases
-    wire       res_is_huge  = is_divide & $signed(virtual_expo) >= 255;
-    wire       res_is_tiny  = is_divide & !lhs_is_zero & !rhs_is_inf & $signed(virtual_expo) <= -24;
-    wire       res_is_inf   = is_divide ? lhs_is_inf | rhs_is_zero
-                                        : lhs_is_inf;
-    wire       res_is_zero  = is_divide ? lhs_is_zero | rhs_is_inf
-                                        : lhs_is_zero;
-    wire       dir_is_away  = (round_mode == 2 & result_sign) | (round_mode == 3 & !result_sign);
-    wire       huge_is_inf  = round_mode == 0 | round_mode == 4 | dir_is_away;
-
-    wire[31:0] huge         = huge_is_inf ? { result_sign, 8'hff, 23'h0 } : { result_sign, 8'hfe, 23'h7fffff };
-    wire[31:0] tiny         = dir_is_away ? { result_sign, 8'h00, 23'h1 } : { result_sign, 8'h00, 23'h0 };
-    wire[31:0] inf          = { result_sign, 8'hff, 23'h0 };
-    wire[31:0] zero         = { result_sign, 8'h00, 23'h0 };
-
-    // Final result
-    assign result = res_is_nan  ? nan  :
-                    res_is_inf  ? inf  :
-                    res_is_huge ? huge :
-                    res_is_tiny ? tiny :
-                    res_is_zero ? zero : { result_sign, result_expo, result_mant };
-    // Exception flags
-    wire divide_by_zero     = is_divide & !res_is_nan & !lhs_is_inf & rhs_is_zero;
-    wire overflow           = is_divide & !res_is_nan & !lhs_is_inf & !rhs_is_zero & (res_is_huge | (virtual_expo == 254 & exp_plus_one));
-    wire inexact            = !res_is_nan & !(is_divide ? lhs_is_zero | lhs_is_inf | rhs_is_zero | rhs_is_inf : lhs_is_zero) & (overflow | res_is_tiny | before_round[24:0] != 0 | rem != 0);
-    // === About underflow (UF) flag
-    // RISC-V sets the UF flag when the absolute value of the result after rounding is less than FLT_MIN and the result is inexact. (same as x86)
-    // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114
-    wire underflow          = inexact & subnormal & !u_exp_plus_one;
-    //                NV                 DZ              OF        UF         NX
-    assign fflags = { invalid_operation, divide_by_zero, overflow, underflow, inexact };
-endmodule
diff --git a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA.sv b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA.sv
index c6ce7e59..a0ece5d2 100644
--- a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA.sv
+++ b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA.sv
@@ -1,10 +1,15 @@
-import FPUTypes::*;
+
+
 module FP32PipelinedFMA(
-    input  logic clk,
-    input  logic [31:0] mullhs,
-    input  logic [31:0] mulrhs,
-    input  logic [31:0] addend,
-    output logic [31:0] result
+input
+    logic clk,
+    logic [31:0] mullhs, 
+    logic [31:0] mulrhs,
+    logic [31:0] addend, 
+    logic [2:0] round_mode,
+output
+    logic [31:0] result,
+    logic [4:0] fflags
 );
 
     FMAStage1RegPath stg0Out;
@@ -27,11 +32,11 @@ module FP32PipelinedFMA(
                                   : multiplier_lhs * multiplier_rhs + multiplier_addend;
     end
 
-    FMAStage0 stg0(clk, stg0Out, mullhs, mulrhs, addend, is_sub, mlhs, mrhs, maddend);
+    FMAStage0 stg0(clk, stg0Out, mullhs, mulrhs, addend, round_mode, is_sub, mlhs, mrhs, maddend);
     FMAStage1 stg1(clk, stg0Out, stg1Out);
     FMAStage2 stg2(clk, stg1Out, stg2Out, fma_result);
     FMAStage3 stg3(clk, stg2Out, stg3Out);
-    FMAStage4 stg4(clk, stg3Out, result);
+    FMAStage4 stg4(clk, stg3Out, result, fflags);
 endmodule
 
 module FMAStage0(
@@ -40,11 +45,13 @@ module FMAStage0(
     input logic [31:0] mullhs,
     input logic [31:0] mulrhs,
     input logic [31:0] addend,
+    input logic [2:0] round_mode,
     output logic is_subtract,
     output logic [76:0] mlhs,
     output logic [76:0] mrhs,
     output logic [76:0] maddend
 );
+
     wire       mullhs_sign = mullhs[31];
     wire       mulrhs_sign = mulrhs[31];
     wire       addend_sign = addend[31];
@@ -54,44 +61,72 @@ module FMAStage0(
     wire[22:0] mullhs_mant = mullhs[22:0];
     wire[22:0] mulrhs_mant = mulrhs[22:0];
     wire[22:0] addend_mant = addend[22:0];
-    assign is_subtract = mullhs_sign ^ mulrhs_sign ^ addend_sign;
+
+    assign is_subtract     = mullhs_sign ^ mulrhs_sign ^ addend_sign;
 
     // NaN handling
-    wire mullhs_is_zero = mullhs_expo == 8'h00 & mullhs_mant == 0;
-    wire mulrhs_is_zero = mulrhs_expo == 8'h00 & mulrhs_mant == 0;
-    wire addend_is_zero = addend_expo == 8'h00 & addend_mant == 0;
-    wire mullhs_is_inf  = mullhs_expo == 8'hff & mullhs_mant == 0;
-    wire mulrhs_is_inf  = mulrhs_expo == 8'hff & mulrhs_mant == 0;
-    wire addend_is_inf  = addend_expo == 8'hff & addend_mant == 0;
-    wire mullhs_is_nan  = mullhs_expo == 8'hff & mullhs_mant != 0;
-    wire mulrhs_is_nan  = mulrhs_expo == 8'hff & mulrhs_mant != 0;
-    wire addend_is_nan  = addend_expo == 8'hff & addend_mant != 0;
-    wire result_is_nan  = mullhs_is_nan | mulrhs_is_nan | addend_is_nan // One of the input is NaN
-                          | (mullhs_is_zero & mulrhs_is_inf) | (mullhs_is_inf & mulrhs_is_zero) // Inf * Zero
-                          | (is_subtract & (mullhs_is_inf | mulrhs_is_inf) & addend_is_inf); // Inf - Inf
-    //wire[31:0]      nan = mullhs_is_nan ? mullhs | 32'h00400000 : mulrhs_is_nan ? mulrhs | 32'h00400000 : addend_is_nan ? addend | 32'h00400000 : 32'hffc00000; // qNan
-    wire[31:0]      nan = 32'h7fc00000;
+    wire mullhs_is_zero    = mullhs_expo == 8'h00 & mullhs_mant == 0;
+    wire mulrhs_is_zero    = mulrhs_expo == 8'h00 & mulrhs_mant == 0;
+    wire addend_is_zero    = addend_expo == 8'h00 & addend_mant == 0;
+    wire mullhs_is_inf     = mullhs_expo == 8'hff & mullhs_mant == 0;
+    wire mulrhs_is_inf     = mulrhs_expo == 8'hff & mulrhs_mant == 0;
+    wire addend_is_inf     = addend_expo == 8'hff & addend_mant == 0;
+    wire mullhs_is_nan     = mullhs_expo == 8'hff & mullhs_mant != 0;
+    wire mulrhs_is_nan     = mulrhs_expo == 8'hff & mulrhs_mant != 0;
+    wire addend_is_nan     = addend_expo == 8'hff & addend_mant != 0;
+    wire mullhs_is_snan    = mullhs_is_nan & mullhs_mant[22] == 0;
+    wire mulrhs_is_snan    = mulrhs_is_nan & mulrhs_mant[22] == 0;
+    wire addend_is_snan    = addend_is_nan & addend_mant[22] == 0;
+    wire mulres_is_inf     = (mullhs_is_inf & !mulrhs_is_nan) | (!mullhs_is_nan & mulrhs_is_inf); 
+    wire mulres_is_zero    = mullhs_is_zero | mulrhs_is_zero;
+    wire res_is_addend    = mulres_is_zero & !addend_is_zero;
+    // === About setting invalid operation (NV) flag ===
+    // x86 does not set the NV flag on ±0×±∞±qNaN.
+    // RISC-V sets the NV flag on ±0×±∞±qNaN.
+    // --- The RISC-V Instruction Set Manual 20240411 Volume I p.116
+    wire invalid_operation = mullhs_is_snan | mulrhs_is_snan | addend_is_snan // One of the input values is sNaN
+                             | (mullhs_is_zero & mulrhs_is_inf) | (mullhs_is_inf & mulrhs_is_zero) // Inf * Zero
+                             | (is_subtract & mulres_is_inf & addend_is_inf); // Inf - Inf
+    wire result_is_nan     = mullhs_is_nan | mulrhs_is_nan | addend_is_nan | invalid_operation; 
+    // === About handling NaN ===
+    // x86 returns the following qNaN:
+    //  mullhs_is_nan ? mullhs | 32'h00400000 :
+    //  mulrhs_is_nan ? mulrhs | 32'h00400000 :
+    //  addend_is_nan ? addend | 32'h00400000 : 32'hffc00000
+    // RISC-V always returns canonical NaN (32'h7fc00000).
+    // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114
+    wire[31:0] nan         = 32'h7fc00000;
 
     // Inf handling
-    wire result_is_inf  = addend_is_inf | mullhs_is_inf | mulrhs_is_inf;
-    wire prop_inf_sign  = addend_is_inf ? addend_sign : mullhs_sign ^ mulrhs_sign;
+    wire res_is_inf = addend_is_inf | mullhs_is_inf | mulrhs_is_inf;
     wire mul_sign       = mullhs_sign ^ mulrhs_sign;
+    wire inf_sign   = addend_is_inf ? addend_sign : mul_sign;
+
+    // Main path (including subnormal handling)
+    wire [9:0] v_mullhs_expo  = { 2'b0, mullhs_expo == 8'h00 ? 8'h01 : mullhs_expo };
+    wire [9:0] v_mulrhs_expo  = { 2'b0, mulrhs_expo == 8'h00 ? 8'h01 : mulrhs_expo };
+    wire [9:0] v_addend_expo  = { 2'b0, addend_expo == 8'h00 ? 8'h01 : addend_expo };
+    wire[23:0] v_mullhs_mant  = { mullhs_expo != 8'h00, mullhs_mant };
+    wire[23:0] v_mulrhs_mant  = { mulrhs_expo != 8'h00, mulrhs_mant };
+    wire[23:0] v_addend_mant  = { addend_expo != 8'h00, addend_mant };
+    wire [9:0] v_fmares_expo  = v_mullhs_expo + v_mulrhs_expo - 127 + 26; // See below: There are 26 bits above lhs*rhs<<3, assuming no carryover occurs in lhs*rhs.
+    wire [9:0] addend_shift   = v_fmares_expo - v_addend_expo;
+    wire[74:0] shifted_addend = { v_addend_mant, 2'b00, 49'b0 } >> addend_shift; // The 2'b00 are the guard bit and the round bit.
+    wire       addend_sticky  = $signed(addend_shift) > 75 ? v_addend_mant != 0
+                                                           : v_addend_mant << (10'd75 - addend_shift) != 24'h000000; // the part shifted out above
+    // Special cases
+    wire       mulres_is_tiny   = $signed(addend_shift) < 0 & !mulres_is_zero & !addend_is_zero; // |mullhs*mulrhs| < 0.5ULP(|addend|-eps)
+    wire       res_is_tiny      = $signed(addend_shift) < 0 & !mulres_is_zero & addend_is_zero; // |mullhs*mulrhs+addend| < 0.5FLT_TRUE_MIN
 
-    wire [9:0] v_mullhs_expo = { 2'b0, mullhs_expo == 8'h00 ? 8'h01 : mullhs_expo };
-    wire [9:0] v_mulrhs_expo = { 2'b0, mulrhs_expo == 8'h00 ? 8'h01 : mulrhs_expo };
-    wire [9:0] v_addend_expo = { 2'b0, addend_expo == 8'h00 ? 8'h01 : addend_expo };
-    wire [9:0] mulres_expo   = v_mullhs_expo + v_mulrhs_expo - 127;
-    wire [9:0] addend_shift  = v_addend_expo - mulres_expo + 23;
-    wire       res_is_addend = ($signed(addend_shift) > 49 | mullhs_is_zero | mulrhs_is_zero) & !addend_is_zero; // |lhs*rhs| < 0.5ULP(|addend|-eps); assuming round to nearest, result is equal to the addend.
-    wire       addend_sticky = $signed(addend_shift) >=  0 ? 1'b0 :
-                               $signed(addend_shift) < -26 ? { addend_expo != 8'h00, addend_mant } != 0
-                                                           : { addend_expo != 8'h00, addend_mant } << (10'd26 + addend_shift) != 24'h000000; // shifted out part of { mantissa(24bit), guard(1bit), round(1bit) } >> -addend_shift
-    assign maddend = { 1'b0, { addend_expo != 8'h00, addend_mant, 2'b00, 49'b0 } >> (10'd49 - addend_shift), addend_sticky }; // The 1'b0 is the sign bit. The 2'b0 are the gaurd bit and the round bit.
-    assign mlhs    = { 51'b0, mullhs_expo != 8'h00, mullhs_mant, 2'b0 }; // lhs_expo != 8'h00 is the hidden bit of a normalized number
-    assign mrhs    = { 52'b0, mulrhs_expo != 8'h00, mulrhs_mant, 1'b0 }; // rhs_expo != 8'h00 is the hidden bit of a normalized number
-
-    assign stg0Out = {mulres_expo, result_is_inf, result_is_nan,
-                      res_is_addend, mul_sign, prop_inf_sign, addend_sign, is_subtract, nan, addend};
+    // Fused-multiply-adder (24bit*24bit<<3+76bit+sign)
+    // The multiplication result is shifted by 3 bits for the guard bit, the round bit, and the sticky bit.
+    // The adder is sufficient for 76 bits + 1 sign bit because |lhs*rhs<<3| < 2^51 is <0.5 ULP when subtracted from 2^76. Note: ULP(1-eps) = 2^-24 while ULP(1+eps) = 2^-23.
+    assign mlhs    = { 51'b0, v_mullhs_mant, 2'b0 };
+    assign mrhs    = { 52'b0, v_mulrhs_mant, 1'b0 };
+    assign maddend = { 1'b0, shifted_addend, addend_sticky };
+    assign stg0Out = {v_fmares_expo, res_is_inf, result_is_nan,
+                      res_is_addend, mul_sign, inf_sign, addend_sign, is_subtract, nan, addend,
+                      mulres_is_tiny, res_is_tiny, invalid_operation, round_mode};
 endmodule
 
 module FMAStage1(
@@ -123,9 +158,10 @@ module FMAStage2(
     wire[75:0] abs_fma_result  = res_is_negative ? -fma_result[75:0] : fma_result[75:0];
     wire       result_sign     = mul_sign ^ res_is_negative;
     
-    assign stg2Out = {abs_fma_result, pipeReg.mulres_expo, res_is_negative, pipeReg.result_is_inf,
+    assign stg2Out = {abs_fma_result, pipeReg.mulres_expo, pipeReg.result_is_inf,
                       pipeReg.result_is_nan, res_is_zero, pipeReg.res_is_addend, result_sign,
-                      pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.nan, pipeReg.addend};
+                      pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.nan, pipeReg.addend,
+                      pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode};
 endmodule
 
 module FMAStage3(
@@ -146,32 +182,50 @@ module FMAStage3(
     wire[75:0] abs_fma_result  = pipeReg.abs_fma_result;
     wire [9:0] mulres_expo     = pipeReg.mulres_expo;
 
-    wire [7:0] leading_zeros   = { 1'b0, leading_zeros_count(abs_fma_result) }; // 0 <= leading_sign_bits <= 74 if !res_is_zero
-    wire [9:0] virtual_expo    = mulres_expo - { 2'b00, leading_zeros } + 26; // There are 26 bits above lhs*rhs<<3, assuming no carryover occurs in lhs*rhs.
+    wire [6:0] leading_zeros   = leading_zeros_count(abs_fma_result); // 0 <= leading_zeros <= 74 if !res_is_zero
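+    wire [9:0] virtual_expo    = mulres_expo - { 3'b00, leading_zeros }; // The +26 offset (26 bits above lhs*rhs<<3, assuming no carryover occurs in lhs*rhs) is not added here; presumably mulres_expo already carries it from v_fmares_expo in FMAStage0.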
     wire       subnormal       = $signed(virtual_expo) <= 0;
-    wire [7:0] fmares_shift    = subnormal ? 26 - mulres_expo[7:0] // There are 3 bits below lhs*rhs<<3, and 23 bits will be lost due to rounding, assuming no carryover occurs in lhs*rhs.
-                                           : 51 - leading_zeros;   // (75 - addend_sticky(1bit)) - shifter_result(24bit)
+    wire [6:0] fmares_shift    = subnormal ? mulres_expo[6:0] // Subnormal: the shift amount is taken from the exponent rather than from leading_zeros (FMAStage4 right-shifts by 75 - fmares_shift).
+                                           : leading_zeros + 1;   // Normal: FMAStage4 right-shifts by 75 - fmares_shift = 74 - leading_zeros, which places the leading 1 (hidden bit) just above shifter_result[24].
     
-    assign stg3Out = {abs_fma_result, fmares_shift, virtual_expo, subnormal, pipeReg.res_is_negative, pipeReg.result_is_inf,
+    assign stg3Out = {abs_fma_result, fmares_shift, virtual_expo, subnormal, pipeReg.result_is_inf,
                       pipeReg.result_is_nan, pipeReg.res_is_zero, pipeReg.res_is_addend, pipeReg.result_sign,
-                      pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.nan, pipeReg.addend};
+                      pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.nan, pipeReg.addend,
+                      pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode};
 endmodule
 
 module FMAStage4(
     input logic clk,
     input FMAStage4RegPath stg4In,
-    output logic [31:0] result
+    output logic [31:0] result,
+    output logic [4:0] fflags
 );
+    function round_to_away;
+        input sign;
+        input last_place;
+        input guard_bit;
+        input sticky_bit;
+        input[2:0] round_mode;
+
+        case(round_mode)
+            3'b000:  round_to_away = guard_bit & (last_place | sticky_bit); // round to nearest, ties to even
+            3'b100:  round_to_away = guard_bit;                             // round to nearest, ties to away
+            3'b010:  round_to_away = sign & (guard_bit | sticky_bit);       // round downward
+            3'b011:  round_to_away = !sign & (guard_bit | sticky_bit);      // round upward
+            default: round_to_away = 0;                                     // round towards zero
+        endcase
+    endfunction
+
     FMAStage4RegPath pipeReg;
     always_ff @(posedge clk) begin
         pipeReg <= stg4In; 
     end
+
     wire[75:0] abs_fma_result  = pipeReg.abs_fma_result;
     wire [7:0] fmares_shift    = pipeReg.fmares_shift;
     wire [9:0] virtual_expo    = pipeReg.virtual_expo;
     wire[31:0] nan             = pipeReg.nan;
     wire[31:0] addend          = pipeReg.addend;
-    wire res_is_negative       = pipeReg.res_is_negative;
     wire result_is_inf         = pipeReg.result_is_inf;
     wire result_is_nan         = pipeReg.result_is_nan;
     wire res_is_zero           = pipeReg.res_is_zero;
@@ -181,23 +235,57 @@ module FMAStage4(
     wire addend_sign           = pipeReg.addend_sign;
     wire subnormal             = pipeReg.subnormal;
     wire is_subtract           = pipeReg.is_subtract;
-    
-    /* verilator lint_off WIDTH */
-    wire[23:0] shifter_result = { abs_fma_result, 23'b0 } >> (7'd23 + fmares_shift);
-    /* verilator lint_on WIDTH */
-    wire       sticky         = abs_fma_result << (76 - fmares_shift) != 0; // the part shifted out above
-
-    wire       round_to_away  = shifter_result[0] & (shifter_result[1] | sticky); // round to nearest, ties to even
-    wire       exp_plus_one   = shifter_result >= 24'hffffff; // carry is generated with rounding taken into account
-
-    wire[22:0] result_mant  = shifter_result[23:1] + { 22'h0, round_to_away }; // No special treatment is required even if a overflow occurs since the answer will be 0 and it will be correct.
-    wire [7:0] result_expo  = (subnormal ? 8'h00 : virtual_expo[7:0]) + { 7'b0, exp_plus_one };
-    wire       res_is_inf   = result_is_inf | $signed(virtual_expo) >= 255;
-    wire[31:0] inf          = { result_is_inf ? prop_inf_sign : result_sign, 8'hff, 23'h0 };
-    wire[31:0] zero         = { is_subtract ? 1'b0 : addend_sign, 8'h00, 23'h0 };
-
-    wire[31:0] final_result = res_is_inf    ? inf    :
-                              res_is_addend ? addend :
-                              res_is_zero   ? zero   : { result_sign, result_expo, result_mant };
-    assign result = result_is_nan ? nan : final_result;
+    wire mulres_is_tiny        = pipeReg.mulres_is_tiny;
+    wire res_is_tiny           = pipeReg.res_is_tiny;
+    wire invalid_operation     = pipeReg.invalid_operation;
+    wire [2:0] round_mode      = pipeReg.round_mode;
+
+    // Normalize and rounding decision
+    wire[24:0] shifter_result  = { abs_fma_result, 24'b0 } >> (7'd75 - fmares_shift); // [75:0] -> [24:0] normalizing left shift emulation. The 24'b0 is needed for cases where large cancellations occur. [24:0] = { mantissa(23bit), guard(1bit), extra_guard_for_underflow_detection(1bit) }
+    wire       sticky          = abs_fma_result << (7'd25 + fmares_shift) != 0; // the part right-shifted out above
+
+    wire       round_away      = round_to_away(result_sign, shifter_result[2], shifter_result[1], shifter_result[0] | sticky, round_mode);
+    wire       exp_plus_one    = shifter_result >= 25'h1fffffc & round_away; // carry is generated with rounding taken into account
+    // For the underflow flag, round as if the exponent range were unbounded, i.e., treat a result with exponent -127 (1.xxx * 2^-127) as a normal number.
+    wire       u_round_away    = round_to_away(result_sign, shifter_result[1], shifter_result[0], sticky, round_mode);
+    wire       u_exp_plus_one  = shifter_result >= 25'h1fffffe & u_round_away; // If 0x1.fffffep-127 <= |mullhs*mulrhs+addend| < 0x1p-126 and rounding makes the result a normal number, the underflow flag is not raised.
+
+    wire[22:0] result_mant = shifter_result[24:2] + { 22'h0, round_away }; // No special treatment is needed if this addition overflows: the mantissa wraps around to 0, which is correct because exp_plus_one increments the exponent.
+    wire [7:0] result_expo = (subnormal ? 8'h00 : virtual_expo[7:0]) + { 7'b0, exp_plus_one };
+
+    // Special cases
+    wire       res_is_huge      = $signed(virtual_expo) >= 255;
+    wire       dir_is_away      = (round_mode == 2 & result_sign) | (round_mode == 3 & !result_sign);
+    wire       huge_is_inf      = round_mode == 0 | round_mode == 4 | dir_is_away;
+
+    wire[31:0] addend_plus_tiny = round_mode == 1                &  is_subtract ? addend - 1 :
+                                  round_mode == 2 & !addend_sign &  is_subtract ? addend - 1 :
+                                  round_mode == 3 &  addend_sign &  is_subtract ? addend - 1 :
+                                  round_mode == 2 &  addend_sign & !is_subtract ? addend + 1 :
+                                  round_mode == 3 & !addend_sign & !is_subtract ? addend + 1
+                                                                                : addend;
+    wire[31:0] huge             = huge_is_inf ? { result_sign, 8'hff, 23'h0 } : { result_sign, 8'hfe, 23'h7fffff };
+    wire[31:0] tiny             = dir_is_away ? { result_sign, 8'h00, 23'h1 } : { result_sign, 8'h00, 23'h0 };
+    wire[31:0] zero             = { is_subtract ? round_mode == 2 : addend_sign, 8'h00, 23'h0 };
+    wire[31:0] inf  = { prop_inf_sign, 8'hff, 23'h0 };
+
+    // Final result
+    assign result = result_is_nan  ? nan              :
+                    result_is_inf  ? inf              :
+                    res_is_huge    ? huge             :
+                    res_is_tiny    ? tiny             :
+                    mulres_is_tiny ? addend_plus_tiny :
+                    res_is_addend  ? addend           :
+                    res_is_zero    ? zero             : { result_sign, result_expo, result_mant };
+
+    // Exception flags
+    wire divide_by_zero    = 1'b0;
+    wire overflow          = !result_is_nan & !result_is_inf & (mulres_is_tiny ? addend_plus_tiny[30:23] == 8'hff : res_is_huge | (virtual_expo == 254 & exp_plus_one));
+    wire inexact           = !result_is_nan & !result_is_inf & (overflow | res_is_tiny | mulres_is_tiny | shifter_result[1] | shifter_result[0] | sticky);
+    // === About underflow (UF) flag ===
+    // RISC-V sets the UF flag when the absolute value of the result after rounding is less than FLT_MIN and the result is inexact. (same as x86)
+    // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114
+    wire underflow         = inexact & (mulres_is_tiny ? addend[30:23] == 8'h00 | addend_plus_tiny[30:23] == 8'h00 : res_is_tiny | (subnormal & !u_exp_plus_one));
+    //                NV                 DZ              OF        UF         NX
+    assign fflags = { invalid_operation, divide_by_zero, overflow, underflow, inexact };
 endmodule
diff --git a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv
deleted file mode 100644
index 3fdf8a7f..00000000
--- a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv
+++ /dev/null
@@ -1,291 +0,0 @@
-
-
-module FP32PipelinedFMA_WithFFlags(
-input
-    logic clk,
-    logic [31:0] mullhs, 
-    logic [31:0] mulrhs,
-    logic [31:0] addend, 
-    logic [2:0] round_mode,
-output
-    logic [31:0] result,
-    logic [4:0] fflags
-);
-
-    FMA_WithFFlagsStage1RegPath stg0Out;
-    FMA_WithFFlagsStage2RegPath stg1Out;
-    FMA_WithFFlagsStage3RegPath stg2Out;
-    FMA_WithFFlagsStage4RegPath stg3Out;
-    
-    // Fused-multiply-adder (24bit*24bit<<3+76bit+sign)
-    // The multiplication result is shifted by 2 bits for the guard bit and the sticky bit.
-    // The adder is sufficient for 76 bits + 1 sign bit because |lhs*rhs<<3| ~ 2^51 is <0.5 ULP when subtracted from 2^76. Note: ULP(1-eps) = 2^-24 while ULP(1+eps) = 2^-23.
-    logic [76:0] multiplier_lhs, multiplier_rhs, multiplier_addend, fma_result;
-    logic [76:0] mlhs, mrhs, maddend;
-    logic is_subtract, is_sub;
-    always_ff @(posedge clk) begin
-        multiplier_lhs    <= mlhs;
-        multiplier_rhs    <= mrhs;
-        multiplier_addend <= maddend;
-        is_subtract <= is_sub;
-        fma_result <= is_subtract ? multiplier_lhs * multiplier_rhs - multiplier_addend
-                                  : multiplier_lhs * multiplier_rhs + multiplier_addend;
-    end
-
-    FMA_WithFFlagsStage0 stg0(clk, stg0Out, mullhs, mulrhs, addend, round_mode, is_sub, mlhs, mrhs, maddend);
-    FMA_WithFFlagsStage1 stg1(clk, stg0Out, stg1Out);
-    FMA_WithFFlagsStage2 stg2(clk, stg1Out, stg2Out, fma_result);
-    FMA_WithFFlagsStage3 stg3(clk, stg2Out, stg3Out);
-    FMA_WithFFlagsStage4 stg4(clk, stg3Out, result, fflags);
-endmodule
-
-module FMA_WithFFlagsStage0(
-    input logic clk,
-    output FMA_WithFFlagsStage1RegPath stg0Out,
-    input logic [31:0] mullhs,
-    input logic [31:0] mulrhs,
-    input logic [31:0] addend,
-    input logic [2:0] round_mode,
-    output logic is_subtract,
-    output logic [76:0] mlhs,
-    output logic [76:0] mrhs,
-    output logic [76:0] maddend
-);
-
-    wire       mullhs_sign = mullhs[31];
-    wire       mulrhs_sign = mulrhs[31];
-    wire       addend_sign = addend[31];
-    wire [7:0] mullhs_expo = mullhs[30:23];
-    wire [7:0] mulrhs_expo = mulrhs[30:23];
-    wire [7:0] addend_expo = addend[30:23];
-    wire[22:0] mullhs_mant = mullhs[22:0];
-    wire[22:0] mulrhs_mant = mulrhs[22:0];
-    wire[22:0] addend_mant = addend[22:0];
-
-    assign is_subtract     = mullhs_sign ^ mulrhs_sign ^ addend_sign;
-
-    // NaN handling
-    wire mullhs_is_zero    = mullhs_expo == 8'h00 & mullhs_mant == 0;
-    wire mulrhs_is_zero    = mulrhs_expo == 8'h00 & mulrhs_mant == 0;
-    wire addend_is_zero    = addend_expo == 8'h00 & addend_mant == 0;
-    wire mullhs_is_inf     = mullhs_expo == 8'hff & mullhs_mant == 0;
-    wire mulrhs_is_inf     = mulrhs_expo == 8'hff & mulrhs_mant == 0;
-    wire addend_is_inf     = addend_expo == 8'hff & addend_mant == 0;
-    wire mullhs_is_nan     = mullhs_expo == 8'hff & mullhs_mant != 0;
-    wire mulrhs_is_nan     = mulrhs_expo == 8'hff & mulrhs_mant != 0;
-    wire addend_is_nan     = addend_expo == 8'hff & addend_mant != 0;
-    wire mullhs_is_snan    = mullhs_is_nan & mullhs_mant[22] == 0;
-    wire mulrhs_is_snan    = mulrhs_is_nan & mulrhs_mant[22] == 0;
-    wire addend_is_snan    = addend_is_nan & addend_mant[22] == 0;
-    wire mulres_is_inf     = (mullhs_is_inf & !mulrhs_is_nan) | (!mullhs_is_nan & mulrhs_is_inf); 
-    wire mulres_is_zero    = mullhs_is_zero | mulrhs_is_zero;
-    wire res_is_addend    = mulres_is_zero & !addend_is_zero;
-    // === About setting invalid operation (NV) flag ===
-    // x86 does not set the NV flag on ±0×±∞±qNaN.
-    // RISC-V sets the NV flag on ±0×±∞±qNaN.
-    // --- The RISC-V Instruction Set Manual 20240411 Volume I p.116
-    wire invalid_operation = mullhs_is_snan | mulrhs_is_snan | addend_is_snan // One of the input values is sNaN
-                             | (mullhs_is_zero & mulrhs_is_inf) | (mullhs_is_inf & mulrhs_is_zero) // Inf * Zero
-                             | (is_subtract & mulres_is_inf & addend_is_inf); // Inf - Inf
-    wire result_is_nan     = mullhs_is_nan | mulrhs_is_nan | addend_is_nan | invalid_operation; 
-    // === About handling NaN ===
-    // x86 returns the following qNaN:
-    //  mullhs_is_nan ? mullhs | 32'h00400000 :
-    //  mulrhs_is_nan ? mulrhs | 32'h00400000 :
-    //  addend_is_nan ? addend | 32'h00400000 : 32'hffc00000
-    // RISC-V always returns canonical NaN (32'h7fc00000).
-    // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114
-    wire[31:0] nan         = 32'h7fc00000;
-
-    // Inf handling
-    wire res_is_inf = addend_is_inf | mullhs_is_inf | mulrhs_is_inf;
-    wire mul_sign       = mullhs_sign ^ mulrhs_sign;
-    wire inf_sign   = addend_is_inf ? addend_sign : mul_sign;
-
-    // Main path (including subnormal handling)
-    wire [9:0] v_mullhs_expo  = { 2'b0, mullhs_expo == 8'h00 ? 8'h01 : mullhs_expo };
-    wire [9:0] v_mulrhs_expo  = { 2'b0, mulrhs_expo == 8'h00 ? 8'h01 : mulrhs_expo };
-    wire [9:0] v_addend_expo  = { 2'b0, addend_expo == 8'h00 ? 8'h01 : addend_expo };
-    wire[23:0] v_mullhs_mant  = { mullhs_expo != 8'h00, mullhs_mant };
-    wire[23:0] v_mulrhs_mant  = { mulrhs_expo != 8'h00, mulrhs_mant };
-    wire[23:0] v_addend_mant  = { addend_expo != 8'h00, addend_mant };
-    wire [9:0] v_fmares_expo  = v_mullhs_expo + v_mulrhs_expo - 127 + 26; // See below: There are 26 bits above lhs*rhs<<3, assuming no carryover occurs in lhs*rhs.
-    wire [9:0] addend_shift   = v_fmares_expo - v_addend_expo;
-    wire[74:0] shifted_addend = { v_addend_mant, 2'b00, 49'b0 } >> addend_shift; // The 2'b00 are the guard bit and the round bit.
-    wire       addend_sticky  = $signed(addend_shift) > 75 ? v_addend_mant != 0
-                                                           : v_addend_mant << (10'd75 - addend_shift) != 24'h000000; // the part shifted out above
-    // Special cases
-    wire       mulres_is_tiny   = $signed(addend_shift) < 0 & !mulres_is_zero & !addend_is_zero; // |mullhs*mulrhs| < 0.5ULP(|addend|-eps)
-    wire       res_is_tiny      = $signed(addend_shift) < 0 & !mulres_is_zero & addend_is_zero; // |mullhs*mulrhs+addend| < 0.5FLT_TRUE_MIN
-
-    // Fused-multiply-adder (24bit*24bit<<3+76bit+sign)
-    // The multiplication result is shifted by 3 bits for the guard bit, the round bit, and the sticky bit.
-    // The adder is sufficient for 76 bits + 1 sign bit because |lhs*rhs<<3| < 2^51 is <0.5 ULP when subtracted from 2^76. Note: ULP(1-eps) = 2^-24 while ULP(1+eps) = 2^-23.
-    assign mlhs    = { 51'b0, v_mullhs_mant, 2'b0 };
-    assign mrhs    = { 52'b0, v_mulrhs_mant, 1'b0 };
-    assign maddend = { 1'b0, shifted_addend, addend_sticky };
-    assign stg0Out = {v_fmares_expo, res_is_inf, result_is_nan,
-                      res_is_addend, mul_sign, inf_sign, addend_sign, is_subtract, nan, addend,
-                      mulres_is_tiny, res_is_tiny, invalid_operation, round_mode};
-endmodule
-
-module FMA_WithFFlagsStage1(
-    input logic clk,
-    input FMA_WithFFlagsStage1RegPath stg1In,
-    output FMA_WithFFlagsStage2RegPath stg1Out
-);
-    FMA_WithFFlagsStage1RegPath pipeReg;
-    always_ff @(posedge clk) begin
-        pipeReg <= stg1In; 
-    end
-    assign stg1Out = pipeReg;
-endmodule
-
-module FMA_WithFFlagsStage2(
-    input logic clk,
-    input FMA_WithFFlagsStage2RegPath stg2In,
-    output FMA_WithFFlagsStage3RegPath stg2Out,
-    input logic [76:0] fma_result
-);
-    FMA_WithFFlagsStage2RegPath pipeReg;
-    always_ff @(posedge clk) begin
-        pipeReg <= stg2In; 
-    end
-
-    wire       mul_sign   = pipeReg.mul_sign;
-    wire       res_is_zero     = fma_result == 77'h0;
-    wire       res_is_negative = fma_result[76];
-    wire[75:0] abs_fma_result  = res_is_negative ? -fma_result[75:0] : fma_result[75:0];
-    wire       result_sign     = mul_sign ^ res_is_negative;
-    
-    assign stg2Out = {abs_fma_result, pipeReg.mulres_expo, pipeReg.result_is_inf,
-                      pipeReg.result_is_nan, res_is_zero, pipeReg.res_is_addend, result_sign,
-                      pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.nan, pipeReg.addend,
-                      pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode};
-endmodule
-
-module FMA_WithFFlagsStage3(
-    input logic clk,
-    input FMA_WithFFlagsStage3RegPath stg3In,
-    output FMA_WithFFlagsStage4RegPath stg3Out
-);
-    function automatic [6:0] leading_zeros_count;
-        input[75:0] x;
-        for(leading_zeros_count = 0; leading_zeros_count <= 75; leading_zeros_count = leading_zeros_count + 1)
-            if(x[75-leading_zeros_count]) break;
-    endfunction
-    
-    FMA_WithFFlagsStage3RegPath pipeReg;
-    always_ff @(posedge clk) begin
-        pipeReg <= stg3In; 
-    end
-    wire[75:0] abs_fma_result  = pipeReg.abs_fma_result;
-    wire [9:0] mulres_expo     = pipeReg.mulres_expo;
-
-    wire [6:0] leading_zeros   = leading_zeros_count(abs_fma_result); // 0 <= leading_sign_bits <= 74 if !res_is_zero
-    wire [9:0] virtual_expo    = mulres_expo - { 3'b00, leading_zeros }; // There are 26 bits above lhs*rhs<<3, assuming no carryover occurs in lhs*rhs.
-    wire       subnormal       = $signed(virtual_expo) <= 0;
-    wire [6:0] fmares_shift    = subnormal ? mulres_expo[6:0] // There are 3 bits below lhs*rhs<<3, and 23 bits will be lost due to rounding, assuming no carryover occurs in lhs*rhs.
-                                           : leading_zeros + 1;   // (75 - addend_sticky(1bit)) - shifter_result(24bit)
-    
-    assign stg3Out = {abs_fma_result, fmares_shift, virtual_expo, subnormal, pipeReg.result_is_inf,
-                      pipeReg.result_is_nan, pipeReg.res_is_zero, pipeReg.res_is_addend, pipeReg.result_sign,
-                      pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.nan, pipeReg.addend,
-                      pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode};
-endmodule
-
-module FMA_WithFFlagsStage4(
-    input logic clk,
-    input FMA_WithFFlagsStage4RegPath stg4In,
-    output logic [31:0] result,
-    output logic [4:0] fflags
-);
-    function round_to_away;
-        input sign;
-        input last_place;
-        input guard_bit;
-        input sticky_bit;
-        input[2:0] round_mode;
-
-        case(round_mode)
-            3'b000:  round_to_away = guard_bit & (last_place | sticky_bit); // round to nearest, ties to even
-            3'b100:  round_to_away = guard_bit;                             // round to nearest, ties to away
-            3'b010:  round_to_away = sign & (guard_bit | sticky_bit);       // round downward
-            3'b011:  round_to_away = !sign & (guard_bit | sticky_bit);      // round upward
-            default: round_to_away = 0;                                     // round towards zero
-        endcase
-    endfunction
-
-    FMA_WithFFlagsStage4RegPath pipeReg;
-    always_ff @(posedge clk) begin
-        pipeReg <= stg4In; 
-    end
-
-    wire[75:0] abs_fma_result  = pipeReg.abs_fma_result;
-    wire [7:0] fmares_shift    = pipeReg.fmares_shift;
-    wire [9:0] virtual_expo    = pipeReg.virtual_expo;
-    wire[31:0] nan             = pipeReg.nan;
-    wire[31:0] addend          = pipeReg.addend;
-    wire result_is_inf         = pipeReg.result_is_inf;
-    wire result_is_nan         = pipeReg.result_is_nan;
-    wire res_is_zero           = pipeReg.res_is_zero;
-    wire res_is_addend         = pipeReg.res_is_addend;
-    wire result_sign           = pipeReg.result_sign;
-    wire prop_inf_sign         = pipeReg.prop_inf_sign;
-    wire addend_sign           = pipeReg.addend_sign;
-    wire subnormal             = pipeReg.subnormal;
-    wire is_subtract           = pipeReg.is_subtract;
-    wire mulres_is_tiny        = pipeReg.mulres_is_tiny;
-    wire res_is_tiny           = pipeReg.res_is_tiny;
-    wire invalid_operation     = pipeReg.invalid_operation;
-    wire [2:0] round_mode      = pipeReg.round_mode;
-
-    // Normalize and rounding decision
-    wire[24:0] shifter_result  = { abs_fma_result, 24'b0 } >> (7'd75 - fmares_shift); // [75:0] -> [24:0] normalizing left shift emulation. The 24'b0 is needed for cases where large cancellations occur. [24:0] = { mantissa(23bit), guard(1bit), extra_guard_for_underflow_detection(1bit) }
-    wire       sticky          = abs_fma_result << (7'd25 + fmares_shift) != 0; // the part right-shifted out above
-
-    wire       round_away      = round_to_away(result_sign, shifter_result[2], shifter_result[1], shifter_result[0] | sticky, round_mode);
-    wire       exp_plus_one    = shifter_result >= 25'h1fffffc & round_away; // carry is generated with rounding taken into account
-    // Treat p-127 as a normal for the underflow flag (rounding with unbounded exponent)
-    wire       u_round_away    = round_to_away(result_sign, shifter_result[1], shifter_result[0], sticky, round_mode);
-    wire       u_exp_plus_one  = shifter_result >= 25'h1fffffe & u_round_away; // 0x1.fffffep-127 <= |mullhs*mulrhs+addend| < 0x1p-126 and the after rounding result become a normal number, not raising the underflow flag.
-
-    wire[22:0] result_mant = shifter_result[24:2] + { 22'h0, round_away }; // No special treatment is required even if an overflow occurs since the answer will be 0 and it will be correct.
-    wire [7:0] result_expo = (subnormal ? 8'h00 : virtual_expo[7:0]) + { 7'b0, exp_plus_one };
-
-    // Special cases
-    wire       res_is_huge      = $signed(virtual_expo) >= 255;
-    wire       dir_is_away      = (round_mode == 2 & result_sign) | (round_mode == 3 & !result_sign);
-    wire       huge_is_inf      = round_mode == 0 | round_mode == 4 | dir_is_away;
-
-    wire[31:0] addend_plus_tiny = round_mode == 1                &  is_subtract ? addend - 1 :
-                                  round_mode == 2 & !addend_sign &  is_subtract ? addend - 1 :
-                                  round_mode == 3 &  addend_sign &  is_subtract ? addend - 1 :
-                                  round_mode == 2 &  addend_sign & !is_subtract ? addend + 1 :
-                                  round_mode == 3 & !addend_sign & !is_subtract ? addend + 1
-                                                                                : addend;
-    wire[31:0] huge             = huge_is_inf ? { result_sign, 8'hff, 23'h0 } : { result_sign, 8'hfe, 23'h7fffff };
-    wire[31:0] tiny             = dir_is_away ? { result_sign, 8'h00, 23'h1 } : { result_sign, 8'h00, 23'h0 };
-    wire[31:0] zero             = { is_subtract ? round_mode == 2 : addend_sign, 8'h00, 23'h0 };
-    wire[31:0] inf  = { prop_inf_sign, 8'hff, 23'h0 };
-
-    // Final result
-    assign result = result_is_nan  ? nan              :
-                    result_is_inf  ? inf              :
-                    res_is_huge    ? huge             :
-                    res_is_tiny    ? tiny             :
-                    mulres_is_tiny ? addend_plus_tiny :
-                    res_is_addend  ? addend           :
-                    res_is_zero    ? zero             : { result_sign, result_expo, result_mant };
-
-    // Exception flags
-    wire divide_by_zero    = 1'b0;
-    wire overflow          = !result_is_nan & !result_is_inf & (mulres_is_tiny ? addend_plus_tiny[30:23] == 8'hff : res_is_huge | (virtual_expo == 254 & exp_plus_one));
-    wire inexact           = !result_is_nan & !result_is_inf & (overflow | res_is_tiny | mulres_is_tiny | shifter_result[1] | shifter_result[0] | sticky);
-    // === About underflow (UF) flag
-    // RISC-V sets the UF flag when the absolute value of the result after rounding is less than FLT_MIN and the result is inexact. (same as x86) 
-    // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114
-    wire underflow         = inexact & (mulres_is_tiny ? addend[30:23] == 8'h00 | addend_plus_tiny[30:23] == 8'h00 : res_is_tiny | (subnormal & !u_exp_plus_one));
-    //                NV                 DZ              OF        UF         NX
-    assign fflags = { invalid_operation, divide_by_zero, overflow, underflow, inexact };
-endmodule
diff --git a/Processor/Src/FloatingPointUnit/FPDivSqrtUnit.sv b/Processor/Src/FloatingPointUnit/FPDivSqrtUnit.sv
index 18f73d5f..b8e78866 100644
--- a/Processor/Src/FloatingPointUnit/FPDivSqrtUnit.sv
+++ b/Processor/Src/FloatingPointUnit/FPDivSqrtUnit.sv
@@ -25,7 +25,7 @@ module FPDivSqrtUnit(FPDivSqrtUnitIF.FPDivSqrtUnit port, RecoveryManagerIF.FPDiv
     ActiveListIndexPath nextActiveListPtr[FP_DIVSQRT_ISSUE_WIDTH];
 
     for (genvar i = 0; i < FP_DIVSQRT_ISSUE_WIDTH; i++) begin : BlockDivUnit
-        FP32DivSqrterWithFFlags fpDivSqrter(
+        FP32DivSqrter fpDivSqrter(
             .clk(port.clk),
             .rst(rst_divider[i]),
             .input_lhs(port.dataInA[i]),
diff --git a/Processor/Src/FloatingPointUnit/FPUTypes.sv b/Processor/Src/FloatingPointUnit/FPUTypes.sv
index 2f313d85..d4c989f7 100644
--- a/Processor/Src/FloatingPointUnit/FPUTypes.sv
+++ b/Processor/Src/FloatingPointUnit/FPUTypes.sv
@@ -111,68 +111,6 @@ typedef struct packed {
     logic [25:0] quo;
 } FDivSqrtRegPath;
 
-// Pipeline registers for old FMA
-typedef struct packed {
-    logic [9:0] mulres_expo;
-    logic result_is_inf;
-    logic result_is_nan;
-    logic res_is_addend;
-    logic mul_sign;
-    logic prop_inf_sign;
-    logic addend_sign;
-    logic is_subtract;
-    logic [31:0] nan;
-    logic [31:0] addend;
-} FMAStage1RegPath;
-
-typedef struct packed {
-    logic [9:0] mulres_expo;
-    logic result_is_inf;
-    logic result_is_nan;
-    logic res_is_addend;
-    logic mul_sign;
-    logic prop_inf_sign;
-    logic addend_sign;
-    logic is_subtract;
-    logic [31:0] nan;
-    logic [31:0] addend;
-} FMAStage2RegPath;
-
-typedef struct packed {
-    logic [75:0] abs_fma_result;
-    logic [9:0] mulres_expo;
-    logic res_is_negative;
-    logic result_is_inf;
-    logic result_is_nan;
-    logic res_is_zero;
-    logic res_is_addend;
-    logic result_sign;
-    logic prop_inf_sign;
-    logic addend_sign;
-    logic is_subtract;
-    logic [31:0] nan;
-    logic [31:0] addend;
-} FMAStage3RegPath;
-
-typedef struct packed {
-    logic [75:0] abs_fma_result;
-    logic [7:0] fmares_shift;
-    logic [9:0] virtual_expo;
-    logic subnormal;
-    logic res_is_negative;
-    logic result_is_inf;
-    logic result_is_nan;
-    logic res_is_zero;
-    logic res_is_addend;
-    logic result_sign;
-    logic prop_inf_sign;
-    logic addend_sign;
-    logic is_subtract;
-    logic [31:0] nan;
-    logic [31:0] addend;
-} FMAStage4RegPath;
-
-// Pipeline registers for FMA with fflags
 typedef struct packed {
     logic [9:0] mulres_expo;
     logic result_is_inf;
@@ -188,7 +126,7 @@ typedef struct packed {
     logic res_is_tiny;
     logic invalid_operation;
     logic [2:0] round_mode;
-} FMA_WithFFlagsStage1RegPath;
+} FMAStage1RegPath;
 
 typedef struct packed {
     logic [9:0] mulres_expo;
@@ -205,7 +143,7 @@ typedef struct packed {
     logic res_is_tiny;
     logic invalid_operation;
     logic [2:0] round_mode;
-} FMA_WithFFlagsStage2RegPath;
+} FMAStage2RegPath;
 
 typedef struct packed {
     logic [75:0] abs_fma_result;
@@ -224,7 +162,7 @@ typedef struct packed {
     logic res_is_tiny;
     logic invalid_operation;
     logic [2:0] round_mode;
-} FMA_WithFFlagsStage3RegPath;
+} FMAStage3RegPath;
 
 typedef struct packed {
     logic [75:0] abs_fma_result;
@@ -245,6 +183,6 @@ typedef struct packed {
     logic res_is_tiny;
     logic invalid_operation;
     logic [2:0] round_mode;
-} FMA_WithFFlagsStage4RegPath;
+} FMAStage4RegPath;
 
 endpackage
\ No newline at end of file
diff --git a/Processor/Src/Makefiles/CoreSources.inc.mk b/Processor/Src/Makefiles/CoreSources.inc.mk
index 6abd1698..d33567dc 100644
--- a/Processor/Src/Makefiles/CoreSources.inc.mk
+++ b/Processor/Src/Makefiles/CoreSources.inc.mk
@@ -114,10 +114,8 @@ CORE_MODULES = \
 	FloatingPointUnit/FP32PipelinedAdder.sv \
 	FloatingPointUnit/FP32PipelinedMultiplier.sv \
 	FloatingPointUnit/FP32PipelinedFMA.sv \
-	FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv \
 	FloatingPointUnit/FP32PipelinedOther.sv \
 	FloatingPointUnit/FP32DivSqrter.sv \
-	FloatingPointUnit/FP32DivSqrterWithFFlags.sv \
 	FloatingPointUnit/FPDivSqrtUnit.sv \
 	FloatingPointUnit/FPDivSqrtUnitIF.sv \
 	RenameLogic/RenameLogic.sv \
diff --git a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv
index 72703102..cd950f53 100644
--- a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv
+++ b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv
@@ -148,7 +148,7 @@ module FPExecutionStage(
     logic isDivSqrt         [ FP_ISSUE_WIDTH ]; 
 
     for ( genvar i = 0; i < FP_ISSUE_WIDTH; i++ ) begin
-        FP32PipelinedFMA_WithFFlags fpFMA (
+        FP32PipelinedFMA fpFMA (
             .clk (port.clk),
             .mullhs (fmaMulLHS[i]),
             .mulrhs (fmaMulRHS[i]),

From 36f86021decf763d2daef7c8b9fa900268ca2eac Mon Sep 17 00:00:00 2001
From: Reoma Matsuo <matsuo@rsg.ci.i.u-tokyo.ac.jp>
Date: Sun, 22 Dec 2024 00:47:58 +0900
Subject: [PATCH 11/11] refactor: modify the comment related to the addend of
 FMA

---
 .../Src/Pipeline/FPBackEnd/FPExecutionStage.sv   | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv
index cd950f53..b23bff92 100644
--- a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv
+++ b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv
@@ -177,10 +177,18 @@ module FPExecutionStage(
             fmaMulLHS[i] = fpuCode[i] inside {FC_FNMSUB, FC_FNMADD} ? {~fuOpA[i].data[31], fuOpA[i].data[30:0]} : fuOpA[i].data;
             fmaMulRHS[i] = fpuCode[i] inside {FC_ADD, FC_SUB} ? 32'h3f800000 : fuOpB[i].data;
             if(fpuCode[i] == FC_MUL) begin
-                // Hack: set sign bit considering rounding mode
-                // +a * +0.0 should return +0.0 regardless of rounding mode,
-                // However, when implemented with fma(+a, +0.0, -0.0),
-                // it returns -0.0 when round_mode = 2
+                // If the exact (arithmetic) product is not zero,
+                // adding either -0.0 or +0.0 does not change it, so the FMA result equals the multiplication result.
+                // If the exact product is zero,
+                // adding a zero with the same sign as the product ensures that the result matches the multiplication result.
+                // Therefore, using a zero addend whose sign is lhs_sign ^ rhs_sign (as done below) is valid.
+                //
+                // Always adding +0.0 is incorrect:
+                //   when round_mode != 2 (i.e. not rounding downward) and the multiplication result is -0.0,
+                //   the output would incorrectly become +0.0.
+                // Similarly, always adding -0.0 is also incorrect:
+                //   when round_mode == 2 (rounding downward) and the multiplication result is +0.0,
+                //   the output would incorrectly become -0.0.
                 fmaAddend[i] = { fmaMulLHS[i][31] ^ fmaMulRHS[i][31] , 31'h0 };
             end
             else if (fpuCode[i] == FC_ADD) begin