From ce19922a5e283a005c6299410ebfca5898de3687 Mon Sep 17 00:00:00 2001 From: Reoma Matsuo Date: Thu, 19 Dec 2024 14:22:50 +0900 Subject: [PATCH 01/11] feat: integrate DivSqrter with fflags into the FP pipeline --- .../Src/FloatingPointUnit/FP32DivSqrter.sv | 28 +-- .../FP32DivSqrterWithFFlags.sv | 206 ++++++++++++++++++ .../Src/FloatingPointUnit/FPDivSqrtUnit.sv | 12 +- Processor/Src/Makefiles/CoreSources.inc.mk | 1 + Processor/Src/Makefiles/TestCommands.inc.mk | 32 +-- .../Pipeline/FPBackEnd/FPExecutionStage.sv | 3 +- 6 files changed, 245 insertions(+), 37 deletions(-) create mode 100644 Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv diff --git a/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv b/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv index 35df9ab8..759d3a50 100644 --- a/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv +++ b/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv @@ -3,9 +3,9 @@ import FPUTypes::*; module FP32DivSqrter ( input logic clk, rst, - logic [31:0] lhs, - logic [31:0] rhs, - logic is_divide, + logic [31:0] input_lhs, + logic [31:0] input_rhs, + logic input_is_divide, logic req, output logic finished, @@ -43,11 +43,11 @@ output FDivSqrtRegPath regData, nextData; logic [31:0] regResult, nextResult; - wire lhs_sign = lhs[31]; + wire lhs_sign = input_lhs[31]; wire rhs_sign = rhs[31]; - wire [7:0] lhs_expo = lhs[30:23]; + wire [7:0] lhs_expo = input_lhs[30:23]; wire [7:0] rhs_expo = rhs[30:23]; - wire[22:0] lhs_mant = lhs[22:0]; + wire[22:0] lhs_mant = input_lhs[22:0]; wire[22:0] rhs_mant = rhs[22:0]; // NaN handling @@ -57,15 +57,15 @@ output wire rhs_is_inf = rhs_expo == 8'hff & rhs_mant == 0; wire lhs_is_nan = lhs_expo == 8'hff & lhs_mant != 0; wire rhs_is_nan = rhs_expo == 8'hff & rhs_mant != 0; - wire lhs_is_neg = lhs_sign & lhs != 32'h80000000; - wire res_is_nan = is_divide ? 
lhs_is_nan | rhs_is_nan | (lhs_is_zero & rhs_is_zero) | (lhs_is_inf & rhs_is_inf) + wire lhs_is_neg = lhs_sign & input_lhs != 32'h80000000; + wire res_is_nan = input_is_divide ? lhs_is_nan | rhs_is_nan | (lhs_is_zero & rhs_is_zero) | (lhs_is_inf & rhs_is_inf) : lhs_is_nan | lhs_is_neg; - //wire[31:0] nan = is_divide ? lhs_is_nan ? lhs | 32'h00400000 : rhs_is_nan ? rhs | 32'h00400000 : 32'hffc00000 - // : lhs_is_nan ? lhs | 32'h00400000 : 32'hffc00000; // qNaN + //wire[31:0] nan = is_divide ? lhs_is_nan ? input_lhs | 32'h00400000 : rhs_is_nan ? rhs | 32'h00400000 : 32'hffc00000 + // : lhs_is_nan ? input_lhs | 32'h00400000 : 32'hffc00000; // qNaN wire[31:0] nan = 32'h7fc00000; // Preparation - wire result_sign = is_divide & (lhs_sign ^ rhs_sign); + wire result_sign = input_is_divide & (lhs_sign ^ rhs_sign); wire [9:0] v_lhs_expo = lhs_expo == 0 ? -leading_zeros_count(lhs_mant) : { 2'b0, lhs_expo }; // virtual exponent (ignores subnormals, but is biased) wire [9:0] v_rhs_expo = rhs_expo == 0 ? -leading_zeros_count(rhs_mant) : { 2'b0, rhs_expo }; // virtual exponent (ignores subnormals, but is biased) wire[23:0] v_lhs_mant = lhs_expo == 0 ? { lhs_mant, 1'b0 } << leading_zeros_count(lhs_mant) : { 1'b1, lhs_mant }; @@ -146,10 +146,10 @@ output nextData.v_rhs_mant = v_rhs_mant; nextData.result_sign = result_sign; nextData.lhs_sign = lhs_sign; - nextData.is_divide = is_divide; + nextData.is_divide = input_is_divide; nextData.res_is_nan = res_is_nan; - nextData.res_is_inf = is_divide ? (lhs_is_inf | rhs_is_zero) : (!lhs_sign & lhs_is_inf); - nextData.res_is_zero = is_divide ? (lhs_is_zero | rhs_is_inf) : lhs_is_zero; + nextData.res_is_inf = input_is_divide ? (lhs_is_inf | rhs_is_zero) : (!lhs_sign & lhs_is_inf); + nextData.res_is_zero = input_is_divide ? 
(lhs_is_zero | rhs_is_inf) : lhs_is_zero; nextData.nan = nan; nextPhase = PHASE_PREPARATION; end diff --git a/Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv b/Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv new file mode 100644 index 00000000..87c32b77 --- /dev/null +++ b/Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv @@ -0,0 +1,206 @@ + +module FP32DivSqrterWithFFlags( +input + logic clk, rst, + logic [31:0] input_lhs, + logic [31:0] input_rhs, + logic input_is_divide, + logic [2:0] input_round_mode, + logic req, +output + logic [31:0] result, + logic [4:0] fflags, + logic finished +); + + function round_to_away; + input[2:0] round_mode; + input sign; + input last_place; + input guard_bit; + input sticky_bit; + input reminder_is_positive; + input reminder_is_zero; + + case(round_mode) + 3'b000: round_to_away = guard_bit & (sticky_bit | reminder_is_positive | (reminder_is_zero & last_place)); // round to nearest, ties to even + 3'b100: round_to_away = guard_bit & (sticky_bit | reminder_is_positive | reminder_is_zero); // round to nearest, ties to away + 3'b010: round_to_away = sign & (guard_bit | sticky_bit | reminder_is_positive); // round downward + 3'b011: round_to_away = !sign & (guard_bit | sticky_bit | reminder_is_positive); // round upward + default: round_to_away = 0; // round towards zero + endcase + endfunction + + function[2:0] srt_table; + input[5:0] rem; + input[3:0] div; + + reg[5:0] th12; + reg[5:0] th01; + th12 = div < 1 ? 6 : div < 2 ? 7 : div < 4 ? 8 : div < 5 ? 9 : div < 6 ? 10 : 11; + th01 = div < 2 ? 2 : div < 6 ? 
3 : 4; + + if($signed(rem) < $signed(-th12)) srt_table = -2; + else if($signed(rem) < $signed(-th01)) srt_table = -1; + else if($signed(rem) < $signed( th01)) srt_table = 0; + else if($signed(rem) < $signed( th12)) srt_table = 1; + else srt_table = 2; + endfunction + + reg [31:0] lhs; + reg [31:0] rhs; + reg is_divide; + reg [2:0] round_mode; + + function [9:0] leading_zeros_count; + input[22:0] x; + for(leading_zeros_count = 0; leading_zeros_count <= 22; leading_zeros_count = leading_zeros_count + 1) + if(x[22-leading_zeros_count]) break; + endfunction + + wire lhs_sign = lhs[31]; + wire rhs_sign = rhs[31]; + wire [7:0] lhs_expo = lhs[30:23]; + wire [7:0] rhs_expo = rhs[30:23]; + wire[22:0] lhs_mant = lhs[22:0]; + wire[22:0] rhs_mant = rhs[22:0]; + + // NaN handling + wire lhs_is_zero = lhs_expo == 8'h00 & lhs_mant == 0; + wire rhs_is_zero = rhs_expo == 8'h00 & rhs_mant == 0; + wire lhs_is_inf = lhs_expo == 8'hff & lhs_mant == 0; + wire rhs_is_inf = rhs_expo == 8'hff & rhs_mant == 0; + wire lhs_is_nan = lhs_expo == 8'hff & lhs_mant != 0; + wire rhs_is_nan = rhs_expo == 8'hff & rhs_mant != 0; + wire lhs_is_snan = lhs_is_nan & lhs_mant[22] == 0; + wire rhs_is_snan = rhs_is_nan & rhs_mant[22] == 0; + wire lhs_is_neg = !lhs_is_nan & lhs_sign & lhs != 32'h80000000; + wire res_is_nan = is_divide ? lhs_is_nan | rhs_is_nan | (lhs_is_zero & rhs_is_zero) | (lhs_is_inf & rhs_is_inf) + : lhs_is_nan | lhs_is_neg; + // === About handling NaN === + // x86 returns the following qNaN: + // mullhs_is_nan ? mullhs | 32'h00400000 : + // mulrhs_is_nan ? mulrhs | 32'h00400000 : + // addend_is_nan ? addend | 32'h00400000 : 32'hffc00000 + // RISC-V always returns canonical NaN (32'h7fc00000). + // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114 + wire[31:0] nan = 32'h7fc00000; + wire invalid_operation = is_divide ? 
lhs_is_snan | rhs_is_snan | (lhs_is_zero & rhs_is_zero) | (lhs_is_inf & rhs_is_inf) + : lhs_is_snan | lhs_is_neg; + + // Preparation + wire result_sign = is_divide ? lhs_sign ^ rhs_sign : lhs_sign; + wire [9:0] v_lhs_expo = lhs_expo == 0 ? -leading_zeros_count(lhs_mant) : { 2'b0, lhs_expo }; // biased virtual exponent (ignores subnormals) + wire [9:0] v_rhs_expo = rhs_expo == 0 ? -leading_zeros_count(rhs_mant) : { 2'b0, rhs_expo }; // biased virtual exponent (ignores subnormals) + wire[23:0] v_lhs_mant_w = lhs_expo == 0 ? { lhs_mant, 1'b0 } << leading_zeros_count(lhs_mant) : { 1'b1, lhs_mant }; + wire[23:0] v_rhs_mant_w = rhs_expo == 0 ? { rhs_mant, 1'b0 } << leading_zeros_count(rhs_mant) : { 1'b1, rhs_mant }; + reg [23:0] v_lhs_mant, v_rhs_mant; + wire dividend_normalize = v_lhs_mant < v_rhs_mant; + wire [9:0] virtual_expo = v_lhs_expo - v_rhs_expo + 127 - { 8'h0, dividend_normalize }; // new biased virtual exponent (ignores subnormals) + wire subnormal = is_divide & $signed(virtual_expo) <= 0; + + // The SRT loop. rem needs 27 bits. 24(mantissa)+2(x8/3,SRT)+1(sign) + wire[26:0] rem_0 = is_divide ? dividend_normalize ? { 2'b00, v_lhs_mant, 1'b0 } : { 3'b000, v_lhs_mant } + : v_lhs_expo[0] ? { 2'b0, v_lhs_mant_w, 1'b0 } - 27'h1e40000 : { 1'b0, v_lhs_mant_w, 2'b0 } - 27'h2400000; // 2 * (x - 1.375^2 or 1.5^2) + wire[25:0] quo_0 = is_divide ? 26'h0 + : v_lhs_expo[0] ? 26'h1600000 : 26'h1800000; // magical initial guess: 1.375 or 1.5; this avoids SRT-table defects at ([-4.5,-4-11/36], 1.5) and ([-4,-4+1/144], 1.25) + + reg [3:0] stage; + reg [26:0] rem; + reg [25:0] quo; + always@(posedge clk) begin + if (rst) begin + lhs <= '0; + rhs <= '0; + stage <= '0; + rem <= '0; + quo <= '0; + v_lhs_mant <= '0; + v_rhs_mant <= '0; + end + else if (stage == 13) begin + if (req) begin + lhs <= input_lhs; + rhs <= input_rhs; + is_divide <= input_is_divide; + round_mode <= input_round_mode; + stage <= input_is_divide ? 
14 : 15; + end + end else if (stage == 14) begin + v_lhs_mant <= v_lhs_mant_w; + v_rhs_mant <= v_rhs_mant_w; + stage <= 15; + end else if (stage == 15) begin + rem <= rem_0; + quo <= quo_0; + stage <= is_divide ? 0 : 1; + end else begin + reg[3:0] div = is_divide ? { 1'b0, v_rhs_mant[22:20] } + : { quo[25], quo[23:21] }; + reg[2:0] q = srt_table( rem[26:21], div ); + case(q) + 3'b010: rem <= is_divide ? (rem << 2) - { v_rhs_mant, 3'b000 } + : (rem << 2) - { quo[24:0], 2'b00 } - (27'd4 << (24-stage*2)); + 3'b001: rem <= is_divide ? (rem << 2) - { 1'b0, v_rhs_mant, 2'b00 } + : (rem << 2) - { quo, 1'b0 } - (27'd1 << (24-stage*2)); + 3'b111: rem <= is_divide ? (rem << 2) + { 1'b0, v_rhs_mant, 2'b00 } + : (rem << 2) + { quo, 1'b0 } - (27'd1 << (24-stage*2)); + 3'b110: rem <= is_divide ? (rem << 2) + { v_rhs_mant, 3'b000 } + : (rem << 2) + { quo[24:0], 2'b00 } - (27'd4 << (24-stage*2)); + default: rem <= rem << 2; + endcase + quo <= quo + ({ {23{q[2]}}, q } << (24-stage*2)); + stage <= stage + 1; + end + end + assign finished = stage == 13; // Here, quo has a <1/3ULP error. + + wire[47:0] before_round = subnormal ? { 1'b1, quo[23:0], 23'h0 } >> -virtual_expo : { quo[23:0], 24'h0 }; + wire round_away = round_to_away(round_mode, result_sign, before_round[25], before_round[24], before_round[23:0] != 0, $signed(rem) > 0, rem == 0); + wire round_fall = round_mode == 2 ? !result_sign & before_round[24:0] == 0 & $signed(rem) < 0 : // round downward + round_mode == 3 ? result_sign & before_round[24:0] == 0 & $signed(rem) < 0 : // round upward + round_mode == 1 ? before_round[24:0] == 0 & $signed(rem) < 0 // round towards zero + : 0; + wire exp_plus_one = before_round[47:25] == 23'h7fffff & round_away; + // Since dividend is normalized, situations where before_round[24:0] == 0 & $signed(rem) < 0 do not happen; thus, `exp_minus_one' is always zero. 
+ // wire exp_minus_one = before_round[47:25] == 23'h000000 & round_fall; + wire[22:0] result_mant = before_round[47:25] + { 22'h0, round_away } - { 22'h0, round_fall }; // No special treatment is required even if an overflow occurs since the answer will be correct. + wire [7:0] result_expo = is_divide ? (subnormal ? 8'h00 : virtual_expo[7:0]) + { 7'h0, exp_plus_one } + : v_lhs_expo[8:1] + { 7'b0, v_lhs_expo[0] } + 63 + { 7'h0, exp_plus_one }; + + // Treat p-127 as a normal for the underflow flag (rounding with unbounded exponent) + wire u_round_away = round_to_away(round_mode, result_sign, quo[1], quo[0], 1'b0, $signed(rem) > 0, rem == 0); + wire u_exp_plus_one = before_round[47:24] == 24'hffffff & u_round_away; + + // Special cases + wire res_is_huge = is_divide & $signed(virtual_expo) >= 255; + wire res_is_tiny = is_divide & !lhs_is_zero & !rhs_is_inf & $signed(virtual_expo) <= -24; + wire res_is_inf = is_divide ? lhs_is_inf | rhs_is_zero + : lhs_is_inf; + wire res_is_zero = is_divide ? lhs_is_zero | rhs_is_inf + : lhs_is_zero; + wire dir_is_away = (round_mode == 2 & result_sign) | (round_mode == 3 & !result_sign); + wire huge_is_inf = round_mode == 0 | round_mode == 4 | dir_is_away; + + wire[31:0] huge = huge_is_inf ? { result_sign, 8'hff, 23'h0 } : { result_sign, 8'hfe, 23'h7fffff }; + wire[31:0] tiny = dir_is_away ? { result_sign, 8'h00, 23'h1 } : { result_sign, 8'h00, 23'h0 }; + wire[31:0] inf = { result_sign, 8'hff, 23'h0 }; + wire[31:0] zero = { result_sign, 8'h00, 23'h0 }; + + // Final result + assign result = res_is_nan ? nan : + res_is_inf ? inf : + res_is_huge ? huge : + res_is_tiny ? tiny : + res_is_zero ? zero : { result_sign, result_expo, result_mant }; + // Exception flags + wire divide_by_zero = is_divide & !res_is_nan & !lhs_is_inf & rhs_is_zero; + wire overflow = is_divide & !res_is_nan & !lhs_is_inf & !rhs_is_zero & (res_is_huge | (virtual_expo == 254 & exp_plus_one)); + wire inexact = !res_is_nan & !(is_divide ? 
lhs_is_zero | lhs_is_inf | rhs_is_zero | rhs_is_inf : lhs_is_zero) & (overflow | res_is_tiny | before_round[24:0] != 0 | rem != 0); + // === About underflow (UF) flag + // RISC-V sets the UF flag when the absolute value of the result after rounding is less than FLT_MIN and the result is inexact. (same as x86) + // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114 + wire underflow = inexact & subnormal & !u_exp_plus_one; + // NV DZ OF UF NX + assign fflags = { invalid_operation, divide_by_zero, overflow, underflow, inexact }; +endmodule diff --git a/Processor/Src/FloatingPointUnit/FPDivSqrtUnit.sv b/Processor/Src/FloatingPointUnit/FPDivSqrtUnit.sv index 14c97341..18f73d5f 100644 --- a/Processor/Src/FloatingPointUnit/FPDivSqrtUnit.sv +++ b/Processor/Src/FloatingPointUnit/FPDivSqrtUnit.sv @@ -25,15 +25,17 @@ module FPDivSqrtUnit(FPDivSqrtUnitIF.FPDivSqrtUnit port, RecoveryManagerIF.FPDiv ActiveListIndexPath nextActiveListPtr[FP_DIVSQRT_ISSUE_WIDTH]; for (genvar i = 0; i < FP_DIVSQRT_ISSUE_WIDTH; i++) begin : BlockDivUnit - FP32DivSqrter fpDivSqrter( + FP32DivSqrterWithFFlags fpDivSqrter( .clk(port.clk), .rst(rst_divider[i]), - .lhs(port.dataInA[i]), - .rhs(port.dataInB[i]), - .is_divide(port.is_divide[i]), + .input_lhs(port.dataInA[i]), + .input_rhs(port.dataInB[i]), + .input_is_divide(port.is_divide[i]), + .input_round_mode(port.rm[i]), .req(port.Req[i]), .finished(finished[i]), - .result(port.DataOut[i]) + .result(port.DataOut[i]), + .fflags(port.FFlagsOut[i]) ); end diff --git a/Processor/Src/Makefiles/CoreSources.inc.mk b/Processor/Src/Makefiles/CoreSources.inc.mk index d33567dc..52d974fa 100644 --- a/Processor/Src/Makefiles/CoreSources.inc.mk +++ b/Processor/Src/Makefiles/CoreSources.inc.mk @@ -116,6 +116,7 @@ CORE_MODULES = \ FloatingPointUnit/FP32PipelinedFMA.sv \ FloatingPointUnit/FP32PipelinedOther.sv \ FloatingPointUnit/FP32DivSqrter.sv \ + FloatingPointUnit/FP32DivSqrterWithFFlags.sv \ FloatingPointUnit/FPDivSqrtUnit.sv \ 
FloatingPointUnit/FPDivSqrtUnitIF.sv \ RenameLogic/RenameLogic.sv \ diff --git a/Processor/Src/Makefiles/TestCommands.inc.mk b/Processor/Src/Makefiles/TestCommands.inc.mk index a8e0b2ae..8c205b9d 100644 --- a/Processor/Src/Makefiles/TestCommands.inc.mk +++ b/Processor/Src/Makefiles/TestCommands.inc.mk @@ -330,10 +330,6 @@ test-riscv-compliance: $(RISCV_RV32I_COMPLIANCE_TEST_TARGETS) fadd_b2-01 \ fadd_b4-01 \ fadd_b5-01 \ - fdiv_b4-01 \ - fdiv_b5-01 \ - fdiv_b6-01 \ - fdiv_b7-01 \ fmadd_b4-01 \ fmadd_b5-01 \ fmadd_b6-01 \ @@ -354,11 +350,6 @@ test-riscv-compliance: $(RISCV_RV32I_COMPLIANCE_TEST_TARGETS) fnmsub_b5-01 \ fnmsub_b6-01 \ fnmsub_b7-01 \ - fsqrt_b3-01 \ - fsqrt_b4-01 \ - fsqrt_b5-01 \ - fsqrt_b7-01 \ - fsqrt_b8-01 \ fsub_b4-01 \ fsub_b5-01 \ fsub_b7-01 \ @@ -368,9 +359,6 @@ test-riscv-compliance: $(RISCV_RV32I_COMPLIANCE_TEST_TARGETS) fadd_b10-01 \ fadd_b12-01 \ fadd_b13-01 \ - fdiv_b1-01 \ - fdiv_b2-01 \ - fdiv_b20-01 \ fmadd_b14-01 \ fmadd_b16-01 \ fmadd_b17-01 \ @@ -393,10 +381,6 @@ test-riscv-compliance: $(RISCV_RV32I_COMPLIANCE_TEST_TARGETS) fnmsub_b17-01 \ fnmsub_b18-01 \ fnmsub_b2-01 \ - fsqrt_b1-01 \ - fsqrt_b2-01 \ - fsqrt_b20-01 \ - fsqrt_b9-01 \ fsub_b1-01 \ fsub_b10-01 \ fsub_b12-01 \ @@ -442,6 +426,22 @@ RISCV_RV32F_COMPLIANCE_TESTS = \ fsgnjn_b1-01 \ fsgnjx_b1-01 \ fsw-align-01 \ + fdiv_b1-01 \ + fdiv_b2-01 \ + fdiv_b4-01 \ + fdiv_b5-01 \ + fdiv_b6-01 \ + fdiv_b7-01 \ + fdiv_b20-01 \ + fsqrt_b1-01 \ + fsqrt_b2-01 \ + fsqrt_b20-01 \ + fsqrt_b9-01 \ + fsqrt_b3-01 \ + fsqrt_b4-01 \ + fsqrt_b5-01 \ + fsqrt_b7-01 \ + fsqrt_b8-01 \ RISCV_RV32F_COMPLIANCE_TEST_TARGETS = $(RISCV_RV32F_COMPLIANCE_TESTS:%=test-riscv-compliance-%) diff --git a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv index 6eedc552..a0ef0e80 100644 --- a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv +++ b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv @@ -299,8 +299,7 @@ module FPExecutionStage( end 
FP_MOP_TYPE_DIV, FP_MOP_TYPE_SQRT: begin dataOut[i].data = fpDivSqrtUnit.DataOut[i]; - //fflagsData[i] = fpDivSqrtUnit.FFlagsOut[i]; - fflagsOut[i] = '0; + fflagsOut[i] = fpDivSqrtUnit.FFlagsOut[i]; end default: begin /* FP_MOP_TYPE_OTHER */ dataOut[i].data = otherDataOut[i]; From f42531d9535bbf4afb953c6650b901f155348206 Mon Sep 17 00:00:00 2001 From: Reoma Matsuo Date: Fri, 20 Dec 2024 13:42:27 +0900 Subject: [PATCH 02/11] feat: integrate FMA with fflags into the FP pipeline --- .../FP32PipelinedFMA_WithFFlags.sv | 314 ++++++++++++++++++ Processor/Src/FloatingPointUnit/FPUTypes.sv | 28 +- Processor/Src/Makefiles/TestCommands.inc.mk | 121 ++++--- .../Pipeline/FPBackEnd/FPExecutionStage.sv | 12 +- 4 files changed, 406 insertions(+), 69 deletions(-) create mode 100644 Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv diff --git a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv new file mode 100644 index 00000000..b896f8e4 --- /dev/null +++ b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv @@ -0,0 +1,314 @@ + + +module FP32PipelinedFMA_WithFFlags( +input + logic clk, + logic [31:0] mullhs, + logic [31:0] mulrhs, + logic [31:0] addend, + logic [2:0] round_mode, + logic is_fmul, +output + logic [31:0] result, + logic [4:0] fflags +); + + FMAStage1RegPath stg0Out; + FMAStage2RegPath stg1Out; + FMAStage3RegPath stg2Out; + FMAStage4RegPath stg3Out; + + // Fused-multiply-adder (24bit*24bit<<3+76bit+sign) + // The multiplication result is shifted by 3 bits for the guard bit, the round bit, and the sticky bit. + // The adder is sufficient for 76 bits + 1 sign bit because |lhs*rhs<<3| ~ 2^51 is <0.5 ULP when subtracted from 2^76. Note: ULP(1-eps) = 2^-24 while ULP(1+eps) = 2^-23. 
+ logic [76:0] multiplier_lhs, multiplier_rhs, multiplier_addend, fma_result; + logic [76:0] mlhs, mrhs, maddend; + logic is_subtract, is_sub; + always_ff @(posedge clk) begin + multiplier_lhs <= mlhs; + multiplier_rhs <= mrhs; + multiplier_addend <= maddend; + is_subtract <= is_sub; + fma_result <= is_subtract ? multiplier_lhs * multiplier_rhs - multiplier_addend + : multiplier_lhs * multiplier_rhs + multiplier_addend; + end + + FMA_WithFFlagsStage0 stg0(clk, stg0Out, mullhs, mulrhs, addend, round_mode, is_fmul, is_sub, mlhs, mrhs, maddend); + FMA_WithFFlagsStage1 stg1(clk, stg0Out, stg1Out); + FMA_WithFFlagsStage2 stg2(clk, stg1Out, stg2Out, fma_result); + FMA_WithFFlagsStage3 stg3(clk, stg2Out, stg3Out); + FMA_WithFFlagsStage4 stg4(clk, stg3Out, result, fflags); + + logic [31:0] ref_result; + logic [4:0] ref_fflags; + float_fused_multiply_adder fma(mullhs, mulrhs, addend, round_mode, ref_result, ref_fflags); +endmodule + +module FMA_WithFFlagsStage0( + input logic clk, + output FMAStage1RegPath stg0Out, + input logic [31:0] mullhs, + input logic [31:0] mulrhs, + input logic [31:0] addend, + input logic [2:0] round_mode, + input logic is_fmul, + output logic is_subtract, + output logic [76:0] mlhs, + output logic [76:0] mrhs, + output logic [76:0] maddend +); + + wire mullhs_sign = mullhs[31]; + wire mulrhs_sign = mulrhs[31]; + wire addend_sign = addend[31]; + wire [7:0] mullhs_expo = mullhs[30:23]; + wire [7:0] mulrhs_expo = mulrhs[30:23]; + wire [7:0] addend_expo = addend[30:23]; + wire[22:0] mullhs_mant = mullhs[22:0]; + wire[22:0] mulrhs_mant = mulrhs[22:0]; + wire[22:0] addend_mant = addend[22:0]; + + assign is_subtract = mullhs_sign ^ mulrhs_sign ^ addend_sign; + + // NaN handling + wire mullhs_is_zero = mullhs_expo == 8'h00 & mullhs_mant == 0; + wire mulrhs_is_zero = mulrhs_expo == 8'h00 & mulrhs_mant == 0; + wire addend_is_zero = addend_expo == 8'h00 & addend_mant == 0; + wire mullhs_is_inf = mullhs_expo == 8'hff & mullhs_mant == 0; + wire mulrhs_is_inf = 
mulrhs_expo == 8'hff & mulrhs_mant == 0; + wire addend_is_inf = addend_expo == 8'hff & addend_mant == 0; + wire mullhs_is_nan = mullhs_expo == 8'hff & mullhs_mant != 0; + wire mulrhs_is_nan = mulrhs_expo == 8'hff & mulrhs_mant != 0; + wire addend_is_nan = addend_expo == 8'hff & addend_mant != 0; + wire mullhs_is_snan = mullhs_is_nan & mullhs_mant[22] == 0; + wire mulrhs_is_snan = mulrhs_is_nan & mulrhs_mant[22] == 0; + wire addend_is_snan = addend_is_nan & addend_mant[22] == 0; + wire mulres_is_inf = (mullhs_is_inf & !mulrhs_is_nan) | (!mullhs_is_nan & mulrhs_is_inf); + wire mulres_is_zero = mullhs_is_zero | mulrhs_is_zero; + wire res_is_addend = mulres_is_zero & !addend_is_zero; + // === About setting invalid operation (NV) flag === + // x86 does not set the NV flag on ±0×±∞±qNaN. + // RISC-V sets the NV flag on ±0×±∞±qNaN. + // --- The RISC-V Instruction Set Manual 20240411 Volume I p.116 + wire invalid_operation = mullhs_is_snan | mulrhs_is_snan | addend_is_snan // One of the input values is sNaN + | (mullhs_is_zero & mulrhs_is_inf) | (mullhs_is_inf & mulrhs_is_zero) // Inf * Zero + | (is_subtract & mulres_is_inf & addend_is_inf); // Inf - Inf + wire result_is_nan = mullhs_is_nan | mulrhs_is_nan | addend_is_nan | invalid_operation; + // === About handling NaN === + // x86 returns the following qNaN: + // mullhs_is_nan ? mullhs | 32'h00400000 : + // mulrhs_is_nan ? mulrhs | 32'h00400000 : + // addend_is_nan ? addend | 32'h00400000 : 32'hffc00000 + // RISC-V always returns canonical NaN (32'h7fc00000). + // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114 + wire[31:0] nan = 32'h7fc00000; + + // Inf handling + wire res_is_inf = addend_is_inf | mullhs_is_inf | mulrhs_is_inf; + wire mul_sign = mullhs_sign ^ mulrhs_sign; + wire inf_sign = addend_is_inf ? addend_sign : mul_sign; + wire[31:0] inf = { inf_sign, 8'hff, 23'h0 }; + + // Main path (including subnormal handling) + wire [9:0] v_mullhs_expo = { 2'b0, mullhs_expo == 8'h00 ? 
8'h01 : mullhs_expo }; + wire [9:0] v_mulrhs_expo = { 2'b0, mulrhs_expo == 8'h00 ? 8'h01 : mulrhs_expo }; + wire [9:0] v_addend_expo = { 2'b0, addend_expo == 8'h00 ? 8'h01 : addend_expo }; + wire[23:0] v_mullhs_mant = { mullhs_expo != 8'h00, mullhs_mant }; + wire[23:0] v_mulrhs_mant = { mulrhs_expo != 8'h00, mulrhs_mant }; + wire[23:0] v_addend_mant = { addend_expo != 8'h00, addend_mant }; + wire [9:0] v_fmares_expo = v_mullhs_expo + v_mulrhs_expo - 127 + 26; // See below: There are 26 bits above lhs*rhs<<3, assuming no carryover occurs in lhs*rhs. + wire [9:0] addend_shift = v_fmares_expo - v_addend_expo; + wire[74:0] shifted_addend = { v_addend_mant, 2'b00, 49'b0 } >> addend_shift; // The 2'b00 are the guard bit and the round bit. + wire addend_sticky = $signed(addend_shift) > 75 ? v_addend_mant != 0 + : v_addend_mant << (10'd75 - addend_shift) != 24'h000000; // the part shifted out above + // Special cases + wire mulres_is_tiny = $signed(addend_shift) < 0 & !mulres_is_zero & !addend_is_zero; // |mullhs*mulrhs| < 0.5ULP(|addend|-eps) + wire res_is_tiny = $signed(addend_shift) < 0 & !mulres_is_zero & addend_is_zero; // |mullhs*mulrhs+addend| < 0.5FLT_TRUE_MIN + + // Fused-multiply-adder (24bit*24bit<<3+76bit+sign) + // The multiplication result is shifted by 3 bits for the guard bit, the round bit, and the sticky bit. + // The adder is sufficient for 76 bits + 1 sign bit because |lhs*rhs<<3| < 2^51 is <0.5 ULP when subtracted from 2^76. Note: ULP(1-eps) = 2^-24 while ULP(1+eps) = 2^-23. + assign mlhs = { 51'b0, v_mullhs_mant, 2'b0 }; + assign mrhs = { 52'b0, v_mulrhs_mant, 1'b0 }; + assign maddend = { 1'b0, shifted_addend, addend_sticky }; + // wire[76:0] multiplier_result = is_subtract ? 
multiplier_lhs * multiplier_rhs - multiplier_addend + // : multiplier_lhs * multiplier_rhs + multiplier_addend; + + assign stg0Out = {v_fmares_expo, res_is_inf, result_is_nan, + res_is_addend, mul_sign, inf_sign, addend_sign, is_subtract, inf, nan, addend, + mulres_is_tiny, res_is_tiny, invalid_operation, round_mode, is_fmul}; +endmodule + +module FMA_WithFFlagsStage1( + input logic clk, + input FMAStage1RegPath stg1In, + output FMAStage2RegPath stg1Out +); + FMAStage1RegPath pipeReg; + always_ff @(posedge clk) begin + pipeReg <= stg1In; + end + assign stg1Out = pipeReg; +endmodule + +module FMA_WithFFlagsStage2( + input logic clk, + input FMAStage2RegPath stg2In, + output FMAStage3RegPath stg2Out, + input logic [76:0] fma_result +); + FMAStage2RegPath pipeReg; + always_ff @(posedge clk) begin + pipeReg <= stg2In; + end + + wire mul_sign = pipeReg.mul_sign; + wire res_is_zero = fma_result == 77'h0; + wire res_is_negative = fma_result[76]; + wire[75:0] abs_fma_result = res_is_negative ? -fma_result[75:0] : fma_result[75:0]; + wire result_sign = mul_sign ^ res_is_negative; + + assign stg2Out = {abs_fma_result, pipeReg.mulres_expo, res_is_negative, pipeReg.result_is_inf, + pipeReg.result_is_nan, res_is_zero, pipeReg.res_is_addend, result_sign, + pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.inf, pipeReg.nan, pipeReg.addend, + pipeReg.mul_sign, pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode, pipeReg.is_fmul}; +endmodule + +module FMA_WithFFlagsStage3( + input logic clk, + input FMAStage3RegPath stg3In, + output FMAStage4RegPath stg3Out +); + function automatic [6:0] leading_zeros_count; + input[75:0] x; + for(leading_zeros_count = 0; leading_zeros_count <= 75; leading_zeros_count = leading_zeros_count + 1) + if(x[75-leading_zeros_count]) break; + endfunction + + FMAStage3RegPath pipeReg; + always_ff @(posedge clk) begin + pipeReg <= stg3In; + end + wire[75:0] abs_fma_result = pipeReg.abs_fma_result; 
+ wire [9:0] mulres_expo = pipeReg.mulres_expo; + + wire [6:0] leading_zeros = leading_zeros_count(abs_fma_result); // 0 <= leading_sign_bits <= 74 if !res_is_zero + wire [9:0] virtual_expo = mulres_expo - { 3'b00, leading_zeros }; // There are 26 bits above lhs*rhs<<3, assuming no carryover occurs in lhs*rhs. + wire subnormal = $signed(virtual_expo) <= 0; + wire [6:0] fmares_shift = subnormal ? mulres_expo[6:0] // There are 3 bits below lhs*rhs<<3, and 23 bits will be lost due to rounding, assuming no carryover occurs in lhs*rhs. + : leading_zeros + 1; // (75 - addend_sticky(1bit)) - shifter_result(24bit) + + assign stg3Out = {abs_fma_result, fmares_shift, virtual_expo, subnormal, pipeReg.res_is_negative, pipeReg.result_is_inf, + pipeReg.result_is_nan, pipeReg.res_is_zero, pipeReg.res_is_addend, pipeReg.result_sign, + pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.inf, pipeReg.nan, pipeReg.addend, + pipeReg.mul_sign, pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode, pipeReg.is_fmul}; +endmodule + +module FMA_WithFFlagsStage4( + input logic clk, + input FMAStage4RegPath stg4In, + output logic [31:0] result, + output logic [4:0] fflags +); + function round_to_away; + input sign; + input last_place; + input guard_bit; + input sticky_bit; + input[2:0] round_mode; + + case(round_mode) + 3'b000: round_to_away = guard_bit & (last_place | sticky_bit); // round to nearest, ties to even + 3'b100: round_to_away = guard_bit; // round to nearest, ties to away + 3'b010: round_to_away = sign & (guard_bit | sticky_bit); // round downward + 3'b011: round_to_away = !sign & (guard_bit | sticky_bit); // round upward + default: round_to_away = 0; // round towards zero + endcase + endfunction + + FMAStage4RegPath pipeReg; + always_ff @(posedge clk) begin + pipeReg <= stg4In; + end + + wire[75:0] abs_fma_result = pipeReg.abs_fma_result; + wire [7:0] fmares_shift = pipeReg.fmares_shift; + wire [9:0] virtual_expo = 
pipeReg.virtual_expo; + wire[31:0] inf = pipeReg.inf; + wire[31:0] nan = pipeReg.nan; + wire[31:0] addend = pipeReg.addend; + wire res_is_negative = pipeReg.res_is_negative; + wire result_is_inf = pipeReg.result_is_inf; + wire result_is_nan = pipeReg.result_is_nan; + wire res_is_zero = pipeReg.res_is_zero; + wire res_is_addend = pipeReg.res_is_addend; + wire result_sign = pipeReg.result_sign; + wire prop_inf_sign = pipeReg.prop_inf_sign; + wire addend_sign = pipeReg.addend_sign; + wire subnormal = pipeReg.subnormal; + wire is_subtract = pipeReg.is_subtract; + wire mulres_is_tiny = pipeReg.mulres_is_tiny; + wire res_is_tiny = pipeReg.res_is_tiny; + wire invalid_operation = pipeReg.invalid_operation; + wire [2:0] round_mode = pipeReg.round_mode; + wire is_fmul = pipeReg.is_fmul; + + // Normalize and rounding decision + /* verilator lint_off WIDTH */ + wire[24:0] shifter_result = { abs_fma_result, 24'b0 } >> (7'd75 - fmares_shift); // [75:0] -> [24:0] normalizing left shift emulation. The 24'b0 is needed for cases where large cancellations occur. [24:0] = { mantissa(23bit), guard(1bit), extra_guard_for_underflow_detection(1bit) } + /* verilator lint_on WIDTH */ + wire sticky = abs_fma_result << (7'd25 + fmares_shift) != 0; // the part right-shifted out above + + wire round_away = round_to_away(result_sign, shifter_result[2], shifter_result[1], shifter_result[0] | sticky, round_mode); + wire exp_plus_one = shifter_result >= 25'h1fffffc & round_away; // carry is generated with rounding taken into account + // Treat p-127 as a normal for the underflow flag (rounding with unbounded exponent) + wire u_round_away = round_to_away(result_sign, shifter_result[1], shifter_result[0], sticky, round_mode); + wire u_exp_plus_one = shifter_result >= 25'h1fffffe & u_round_away; // 0x1.fffffep-127 <= |mullhs*mulrhs+addend| < 0x1p-126 and the result after rounding becomes a normal number, so the underflow flag is not raised. 
+ + wire[22:0] result_mant = shifter_result[24:2] + { 22'h0, round_away }; // No special treatment is required even if an overflow occurs since the answer will be 0 and it will be correct. + wire [7:0] result_expo = (subnormal ? 8'h00 : virtual_expo[7:0]) + { 7'b0, exp_plus_one }; + + // Special cases + // wire mulres_is_zero = mullhs_is_zero | mulrhs_is_zero; + wire res_is_huge = $signed(virtual_expo) >= 255; + // wire mulres_is_tiny = $signed(addend_shift) < 0 & !mulres_is_zero & !addend_is_zero; // |mullhs*mulrhs| < 0.5ULP(|addend|-eps) + // wire res_is_tiny = $signed(addend_shift) < 0 & !mulres_is_zero & addend_is_zero; // |mullhs*mulrhs+addend| < 0.5FLT_TRUE_MIN + // wire res_is_addend = mulres_is_zero & !addend_is_zero; + // wire res_is_zero = multiplier_result == 77'h0; // including mulres_is_zero & addend_is_zero + wire dir_is_away = (round_mode == 2 & result_sign) | (round_mode == 3 & !result_sign); + wire huge_is_inf = round_mode == 0 | round_mode == 4 | dir_is_away; + + wire[31:0] addend_plus_tiny = round_mode == 1 & is_subtract ? addend - 1 : + round_mode == 2 & !addend_sign & is_subtract ? addend - 1 : + round_mode == 3 & addend_sign & is_subtract ? addend - 1 : + round_mode == 2 & addend_sign & !is_subtract ? addend + 1 : + round_mode == 3 & !addend_sign & !is_subtract ? addend + 1 + : addend; + wire[31:0] huge = huge_is_inf ? { result_sign, 8'hff, 23'h0 } : { result_sign, 8'hfe, 23'h7fffff }; + wire[31:0] tiny = dir_is_away ? { result_sign, 8'h00, 23'h1 } : { result_sign, 8'h00, 23'h0 }; + + wire [7:0] addend_expo = addend[30:23]; + wire[22:0] addend_mant = addend[22:0]; + wire addend_is_zero = addend_expo == 8'h00 & addend_mant == 0; + wire[31:0] zero = { is_fmul ? result_sign : (is_subtract ? round_mode == 2 : addend_sign), 8'h00, 23'h0 }; + + // Final result + assign result = result_is_nan ? nan : + result_is_inf ? inf : + res_is_huge ? huge : + res_is_tiny ? tiny : + mulres_is_tiny ? addend_plus_tiny : + res_is_addend ? addend : + res_is_zero ? 
zero : { result_sign, result_expo, result_mant }; + + // Exception flags + wire divide_by_zero = 1'b0; + wire overflow = !result_is_nan & !result_is_inf & (mulres_is_tiny ? addend_plus_tiny[30:23] == 8'hff : res_is_huge | (virtual_expo == 254 & exp_plus_one)); + wire inexact = !result_is_nan & !result_is_inf & (overflow | res_is_tiny | mulres_is_tiny | shifter_result[1] | shifter_result[0] | sticky); + // === About underflow (UF) flag + // RISC-V sets the UF flag when the absolute value of the result after rounding is less than FLT_MIN and the result is inexact. (same as x86) + // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114 + wire underflow = inexact & (mulres_is_tiny ? addend[30:23] == 8'h00 | addend_plus_tiny[30:23] == 8'h00 : res_is_tiny | (subnormal & !u_exp_plus_one)); + // NV DZ OF UF NX + assign fflags = { invalid_operation, divide_by_zero, overflow, underflow, inexact }; +endmodule diff --git a/Processor/Src/FloatingPointUnit/FPUTypes.sv b/Processor/Src/FloatingPointUnit/FPUTypes.sv index 50286c7e..179a4f6d 100644 --- a/Processor/Src/FloatingPointUnit/FPUTypes.sv +++ b/Processor/Src/FloatingPointUnit/FPUTypes.sv @@ -120,8 +120,14 @@ typedef struct packed { logic prop_inf_sign; logic addend_sign; logic is_subtract; + logic [31:0] inf; logic [31:0] nan; logic [31:0] addend; + logic mulres_is_tiny; + logic res_is_tiny; + logic invalid_operation; + logic [2:0] round_mode; + logic is_fmul; } FMAStage1RegPath; typedef struct packed { @@ -133,8 +139,14 @@ typedef struct packed { logic prop_inf_sign; logic addend_sign; logic is_subtract; + logic [31:0] inf; logic [31:0] nan; logic [31:0] addend; + logic mulres_is_tiny; + logic res_is_tiny; + logic invalid_operation; + logic [2:0] round_mode; + logic is_fmul; } FMAStage2RegPath; typedef struct packed { @@ -149,13 +161,20 @@ typedef struct packed { logic prop_inf_sign; logic addend_sign; logic is_subtract; + logic [31:0] inf; logic [31:0] nan; logic [31:0] addend; + logic mul_sign; + logic 
mulres_is_tiny; + logic res_is_tiny; + logic invalid_operation; + logic [2:0] round_mode; + logic is_fmul; } FMAStage3RegPath; typedef struct packed { logic [75:0] abs_fma_result; - logic [7:0] fmares_shift; + logic [6:0] fmares_shift; logic [9:0] virtual_expo; logic subnormal; logic res_is_negative; @@ -167,8 +186,15 @@ typedef struct packed { logic prop_inf_sign; logic addend_sign; logic is_subtract; + logic [31:0] inf; logic [31:0] nan; logic [31:0] addend; + logic mul_sign; + logic mulres_is_tiny; + logic res_is_tiny; + logic invalid_operation; + logic [2:0] round_mode; + logic is_fmul; } FMAStage4RegPath; endpackage \ No newline at end of file diff --git a/Processor/Src/Makefiles/TestCommands.inc.mk b/Processor/Src/Makefiles/TestCommands.inc.mk index 8c205b9d..9f03c726 100644 --- a/Processor/Src/Makefiles/TestCommands.inc.mk +++ b/Processor/Src/Makefiles/TestCommands.inc.mk @@ -325,68 +325,6 @@ test-riscv-compliance: $(RISCV_RV32I_COMPLIANCE_TEST_TARGETS) fsub_b3-01 \ fsub_b8-01 \ -# unsupported rounding mode - #fadd_b7-01 \ - fadd_b2-01 \ - fadd_b4-01 \ - fadd_b5-01 \ - fmadd_b4-01 \ - fmadd_b5-01 \ - fmadd_b6-01 \ - fmadd_b7-01 \ - fmsub_b4-01 \ - fmsub_b5-01 \ - fmsub_b6-01 \ - fmsub_b7-01 \ - fmul_b4-01 \ - fmul_b5-01 \ - fmul_b6-01 \ - fmul_b7-01 \ - fnmadd_b4-01 \ - fnmadd_b5-01 \ - fnmadd_b6-01 \ - fnmadd_b7-01 \ - fnmsub_b4-01 \ - fnmsub_b5-01 \ - fnmsub_b6-01 \ - fnmsub_b7-01 \ - fsub_b4-01 \ - fsub_b5-01 \ - fsub_b7-01 \ - -# unsupported fflags - #fadd_b1-01 \ - fadd_b10-01 \ - fadd_b12-01 \ - fadd_b13-01 \ - fmadd_b14-01 \ - fmadd_b16-01 \ - fmadd_b17-01 \ - fmadd_b18-01 \ - fmadd_b2-01 \ - fmsub_b14-01 \ - fmsub_b16-01 \ - fmsub_b17-01 \ - fmsub_b18-01 \ - fmsub_b2-01 \ - fmul_b1-01 \ - fmul_b2-01 \ - fnmadd_b14-01 \ - fnmadd_b16-01 \ - fnmadd_b17-01 \ - fnmadd_b18-01 \ - fnmadd_b2-01 \ - fnmsub_b14-01 \ - fnmsub_b16-01 \ - fnmsub_b17-01 \ - fnmsub_b18-01 \ - fnmsub_b2-01 \ - fsub_b1-01 \ - fsub_b10-01 \ - fsub_b12-01 \ - fsub_b13-01 \ - fsub_b2-01 
\ - RISCV_RV32F_COMPLIANCE_TESTS = \ fcvt.s.w_b25-01 \ fcvt.s.w_b26-01 \ @@ -442,6 +380,65 @@ RISCV_RV32F_COMPLIANCE_TESTS = \ fsqrt_b5-01 \ fsqrt_b7-01 \ fsqrt_b8-01 \ + fadd_b7-01 \ + fadd_b2-01 \ + fadd_b4-01 \ + fadd_b5-01 \ + fmadd_b4-01 \ + fmadd_b5-01 \ + fmadd_b6-01 \ + fmadd_b7-01 \ + fmsub_b4-01 \ + fmsub_b5-01 \ + fmsub_b6-01 \ + fmsub_b7-01 \ + fmul_b4-01 \ + fmul_b6-01 \ + fmul_b7-01 \ + fnmadd_b4-01 \ + fnmadd_b5-01 \ + fnmadd_b6-01 \ + fnmadd_b7-01 \ + fnmsub_b4-01 \ + fnmsub_b5-01 \ + fnmsub_b6-01 \ + fnmsub_b7-01 \ + fsub_b4-01 \ + fsub_b5-01 \ + fsub_b7-01 \ + fadd_b1-01 \ + fadd_b10-01 \ + fadd_b12-01 \ + fadd_b13-01 \ + fmadd_b14-01 \ + fmadd_b16-01 \ + fmadd_b17-01 \ + fmadd_b18-01 \ + fmadd_b2-01 \ + fmsub_b14-01 \ + fmsub_b16-01 \ + fmsub_b17-01 \ + fmsub_b18-01 \ + fmsub_b2-01 \ + fmul_b2-01 \ + fnmadd_b14-01 \ + fnmadd_b16-01 \ + fnmadd_b17-01 \ + fnmadd_b18-01 \ + fnmadd_b2-01 \ + fnmsub_b14-01 \ + fnmsub_b16-01 \ + fnmsub_b17-01 \ + fnmsub_b18-01 \ + fnmsub_b2-01 \ + fsub_b1-01 \ + fsub_b10-01 \ + fsub_b12-01 \ + fsub_b13-01 \ + fsub_b2-01 \ + fmul_b1-01 \ + fmul_b5-01 \ + RISCV_RV32F_COMPLIANCE_TEST_TARGETS = $(RISCV_RV32F_COMPLIANCE_TESTS:%=test-riscv-compliance-%) diff --git a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv index a0ef0e80..0fd3123d 100644 --- a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv +++ b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv @@ -148,14 +148,15 @@ module FPExecutionStage( logic isDivSqrt [ FP_ISSUE_WIDTH ]; for ( genvar i = 0; i < FP_ISSUE_WIDTH; i++ ) begin - FP32PipelinedFMA fpFMA ( + FP32PipelinedFMA_WithFFlags fpFMA ( .clk (port.clk), .mullhs (fmaMulLHS[i]), .mulrhs (fmaMulRHS[i]), .addend (fmaAddend[i]), - //.rm (rm[i]), - .result ( fmaDataOut[i] ) - //.fflags ( fmaFFlagsOut[i]) + .round_mode (rm[i]), + .is_fmul (fpuCode[i] == FC_MUL), + .result ( fmaDataOut[i] ), + .fflags ( fmaFFlagsOut[i]) ); FP32PipelinedOther #( @@ -294,8 
+295,7 @@ module FPExecutionStage( unique case ( localPipeReg[i][FP_EXEC_STAGE_DEPTH-2].fpQueueData.fpOpInfo.opType ) FP_MOP_TYPE_ADD, FP_MOP_TYPE_MUL, FP_MOP_TYPE_FMA: begin dataOut[i].data = fmaDataOut[i]; - //fflagsData[i] = fmaFFlagsOut[i]; - fflagsOut[i] = '0; + fflagsOut[i] = fmaFFlagsOut[i]; end FP_MOP_TYPE_DIV, FP_MOP_TYPE_SQRT: begin dataOut[i].data = fpDivSqrtUnit.DataOut[i]; From 850916c2799bf1654ee64f6bead164a211b50e20 Mon Sep 17 00:00:00 2001 From: Reoma Matsuo Date: Fri, 20 Dec 2024 14:06:52 +0900 Subject: [PATCH 03/11] fix: fix bugs related to FMA --- .../FP32PipelinedFMA_WithFFlags.sv | 57 ++++++-------- Processor/Src/FloatingPointUnit/FPUTypes.sv | 74 +++++++++++++++++-- Processor/Src/Makefiles/CoreSources.inc.mk | 1 + 3 files changed, 88 insertions(+), 44 deletions(-) diff --git a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv index b896f8e4..9e27d590 100644 --- a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv +++ b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv @@ -13,10 +13,10 @@ output logic [4:0] fflags ); - FMAStage1RegPath stg0Out; - FMAStage2RegPath stg1Out; - FMAStage3RegPath stg2Out; - FMAStage4RegPath stg3Out; + FMA_WithFFlagsStage1RegPath stg0Out; + FMA_WithFFlagsStage2RegPath stg1Out; + FMA_WithFFlagsStage3RegPath stg2Out; + FMA_WithFFlagsStage4RegPath stg3Out; // Fused-multiply-adder (24bit*24bit<<3+76bit+sign) // The multiplication result is shifted by 2 bits for the guard bit and the sticky bit. 
@@ -38,15 +38,11 @@ output FMA_WithFFlagsStage2 stg2(clk, stg1Out, stg2Out, fma_result); FMA_WithFFlagsStage3 stg3(clk, stg2Out, stg3Out); FMA_WithFFlagsStage4 stg4(clk, stg3Out, result, fflags); - - logic [31:0] ref_result; - logic [4:0] ref_fflags; - float_fused_multiply_adder fma(mullhs, mulrhs, addend, round_mode, ref_result, ref_fflags); endmodule module FMA_WithFFlagsStage0( input logic clk, - output FMAStage1RegPath stg0Out, + output FMA_WithFFlagsStage1RegPath stg0Out, input logic [31:0] mullhs, input logic [31:0] mulrhs, input logic [31:0] addend, @@ -124,16 +120,13 @@ module FMA_WithFFlagsStage0( // Special cases wire mulres_is_tiny = $signed(addend_shift) < 0 & !mulres_is_zero & !addend_is_zero; // |mullhs*mulrhs| < 0.5ULP(|addend|-eps) wire res_is_tiny = $signed(addend_shift) < 0 & !mulres_is_zero & addend_is_zero; // |mullhs*mulrhs+addend| < 0.5FLT_TRUE_MIN - + // Fused-multiply-adder (24bit*24bit<<3+76bit+sign) // The multiplication result is shifted by 3 bits for the guard bit, the round bit, and the sticky bit. // The adder is sufficient for 76 bits + 1 sign bit because |lhs*rhs<<3| < 2^51 is <0.5 ULP when subtracted from 2^76. Note: ULP(1-eps) = 2^-24 while ULP(1+eps) = 2^-23. assign mlhs = { 51'b0, v_mullhs_mant, 2'b0 }; assign mrhs = { 52'b0, v_mulrhs_mant, 1'b0 }; assign maddend = { 1'b0, shifted_addend, addend_sticky }; - // wire[76:0] multiplier_result = is_subtract ? 
multiplier_lhs * multiplier_rhs - multiplier_addend - // : multiplier_lhs * multiplier_rhs + multiplier_addend; - assign stg0Out = {v_fmares_expo, res_is_inf, result_is_nan, res_is_addend, mul_sign, inf_sign, addend_sign, is_subtract, inf, nan, addend, mulres_is_tiny, res_is_tiny, invalid_operation, round_mode, is_fmul}; @@ -141,10 +134,10 @@ endmodule module FMA_WithFFlagsStage1( input logic clk, - input FMAStage1RegPath stg1In, - output FMAStage2RegPath stg1Out + input FMA_WithFFlagsStage1RegPath stg1In, + output FMA_WithFFlagsStage2RegPath stg1Out ); - FMAStage1RegPath pipeReg; + FMA_WithFFlagsStage1RegPath pipeReg; always_ff @(posedge clk) begin pipeReg <= stg1In; end @@ -153,11 +146,11 @@ endmodule module FMA_WithFFlagsStage2( input logic clk, - input FMAStage2RegPath stg2In, - output FMAStage3RegPath stg2Out, + input FMA_WithFFlagsStage2RegPath stg2In, + output FMA_WithFFlagsStage3RegPath stg2Out, input logic [76:0] fma_result ); - FMAStage2RegPath pipeReg; + FMA_WithFFlagsStage2RegPath pipeReg; always_ff @(posedge clk) begin pipeReg <= stg2In; end @@ -168,16 +161,16 @@ module FMA_WithFFlagsStage2( wire[75:0] abs_fma_result = res_is_negative ? 
-fma_result[75:0] : fma_result[75:0]; wire result_sign = mul_sign ^ res_is_negative; - assign stg2Out = {abs_fma_result, pipeReg.mulres_expo, res_is_negative, pipeReg.result_is_inf, + assign stg2Out = {abs_fma_result, pipeReg.mulres_expo, pipeReg.result_is_inf, pipeReg.result_is_nan, res_is_zero, pipeReg.res_is_addend, result_sign, pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.inf, pipeReg.nan, pipeReg.addend, - pipeReg.mul_sign, pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode, pipeReg.is_fmul}; + pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode, pipeReg.is_fmul}; endmodule module FMA_WithFFlagsStage3( input logic clk, - input FMAStage3RegPath stg3In, - output FMAStage4RegPath stg3Out + input FMA_WithFFlagsStage3RegPath stg3In, + output FMA_WithFFlagsStage4RegPath stg3Out ); function automatic [6:0] leading_zeros_count; input[75:0] x; @@ -185,7 +178,7 @@ module FMA_WithFFlagsStage3( if(x[75-leading_zeros_count]) break; endfunction - FMAStage3RegPath pipeReg; + FMA_WithFFlagsStage3RegPath pipeReg; always_ff @(posedge clk) begin pipeReg <= stg3In; end @@ -198,15 +191,15 @@ module FMA_WithFFlagsStage3( wire [6:0] fmares_shift = subnormal ? mulres_expo[6:0] // There are 3 bits below lhs*rhs<<3, and 23 bits will be lost due to rounding, assuming no carryover occurs in lhs*rhs. 
: leading_zeros + 1; // (75 - addend_sticky(1bit)) - shifter_result(24bit) - assign stg3Out = {abs_fma_result, fmares_shift, virtual_expo, subnormal, pipeReg.res_is_negative, pipeReg.result_is_inf, + assign stg3Out = {abs_fma_result, fmares_shift, virtual_expo, subnormal, pipeReg.result_is_inf, pipeReg.result_is_nan, pipeReg.res_is_zero, pipeReg.res_is_addend, pipeReg.result_sign, pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.inf, pipeReg.nan, pipeReg.addend, - pipeReg.mul_sign, pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode, pipeReg.is_fmul}; + pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode, pipeReg.is_fmul}; endmodule module FMA_WithFFlagsStage4( input logic clk, - input FMAStage4RegPath stg4In, + input FMA_WithFFlagsStage4RegPath stg4In, output logic [31:0] result, output logic [4:0] fflags ); @@ -226,7 +219,7 @@ module FMA_WithFFlagsStage4( endcase endfunction - FMAStage4RegPath pipeReg; + FMA_WithFFlagsStage4RegPath pipeReg; always_ff @(posedge clk) begin pipeReg <= stg4In; end @@ -237,7 +230,6 @@ module FMA_WithFFlagsStage4( wire[31:0] inf = pipeReg.inf; wire[31:0] nan = pipeReg.nan; wire[31:0] addend = pipeReg.addend; - wire res_is_negative = pipeReg.res_is_negative; wire result_is_inf = pipeReg.result_is_inf; wire result_is_nan = pipeReg.result_is_nan; wire res_is_zero = pipeReg.res_is_zero; @@ -254,9 +246,7 @@ module FMA_WithFFlagsStage4( wire is_fmul = pipeReg.is_fmul; // Normalize and rounding decision - /* verilator lint_off WIDTH */ wire[24:0] shifter_result = { abs_fma_result, 24'b0 } >> (7'd75 - fmares_shift); // [75:0] -> [24:0] normalizing left shift emulation. The 24'b0 is needed for cases where large cancellations occur. 
[24:0] = { mantissa(23bit), guard(1bit), extra_guard_for_underflow_detection(1bit) } - /* verilator lint_on WIDTH */ wire sticky = abs_fma_result << (7'd25 + fmares_shift) != 0; // the part right-shifted out above wire round_away = round_to_away(result_sign, shifter_result[2], shifter_result[1], shifter_result[0] | sticky, round_mode); @@ -269,12 +259,7 @@ module FMA_WithFFlagsStage4( wire [7:0] result_expo = (subnormal ? 8'h00 : virtual_expo[7:0]) + { 7'b0, exp_plus_one }; // Special cases - // wire mulres_is_zero = mullhs_is_zero | mulrhs_is_zero; wire res_is_huge = $signed(virtual_expo) >= 255; - // wire mulres_is_tiny = $signed(addend_shift) < 0 & !mulres_is_zero & !addend_is_zero; // |mullhs*mulrhs| < 0.5ULP(|addend|-eps) - // wire res_is_tiny = $signed(addend_shift) < 0 & !mulres_is_zero & addend_is_zero; // |mullhs*mulrhs+addend| < 0.5FLT_TRUE_MIN - // wire res_is_addend = mulres_is_zero & !addend_is_zero; - // wire res_is_zero = multiplier_result == 77'h0; // including mulres_is_zero & addend_is_zero wire dir_is_away = (round_mode == 2 & result_sign) | (round_mode == 3 & !result_sign); wire huge_is_inf = round_mode == 0 | round_mode == 4 | dir_is_away; diff --git a/Processor/Src/FloatingPointUnit/FPUTypes.sv b/Processor/Src/FloatingPointUnit/FPUTypes.sv index 179a4f6d..b34a4d01 100644 --- a/Processor/Src/FloatingPointUnit/FPUTypes.sv +++ b/Processor/Src/FloatingPointUnit/FPUTypes.sv @@ -111,6 +111,68 @@ typedef struct packed { logic [25:0] quo; } FDivSqrtRegPath; +// Pipeline registers for old FMA +typedef struct packed { + logic [9:0] mulres_expo; + logic result_is_inf; + logic result_is_nan; + logic res_is_addend; + logic mul_sign; + logic prop_inf_sign; + logic addend_sign; + logic is_subtract; + logic [31:0] nan; + logic [31:0] addend; +} FMAStage1RegPath; + +typedef struct packed { + logic [9:0] mulres_expo; + logic result_is_inf; + logic result_is_nan; + logic res_is_addend; + logic mul_sign; + logic prop_inf_sign; + logic addend_sign; + logic 
is_subtract; + logic [31:0] nan; + logic [31:0] addend; +} FMAStage2RegPath; + +typedef struct packed { + logic [75:0] abs_fma_result; + logic [9:0] mulres_expo; + logic res_is_negative; + logic result_is_inf; + logic result_is_nan; + logic res_is_zero; + logic res_is_addend; + logic result_sign; + logic prop_inf_sign; + logic addend_sign; + logic is_subtract; + logic [31:0] nan; + logic [31:0] addend; +} FMAStage3RegPath; + +typedef struct packed { + logic [75:0] abs_fma_result; + logic [7:0] fmares_shift; + logic [9:0] virtual_expo; + logic subnormal; + logic res_is_negative; + logic result_is_inf; + logic result_is_nan; + logic res_is_zero; + logic res_is_addend; + logic result_sign; + logic prop_inf_sign; + logic addend_sign; + logic is_subtract; + logic [31:0] nan; + logic [31:0] addend; +} FMAStage4RegPath; + +// Pipeline registers for FMA with fflags typedef struct packed { logic [9:0] mulres_expo; logic result_is_inf; @@ -128,7 +190,7 @@ typedef struct packed { logic invalid_operation; logic [2:0] round_mode; logic is_fmul; -} FMAStage1RegPath; +} FMA_WithFFlagsStage1RegPath; typedef struct packed { logic [9:0] mulres_expo; @@ -147,12 +209,11 @@ typedef struct packed { logic invalid_operation; logic [2:0] round_mode; logic is_fmul; -} FMAStage2RegPath; +} FMA_WithFFlagsStage2RegPath; typedef struct packed { logic [75:0] abs_fma_result; logic [9:0] mulres_expo; - logic res_is_negative; logic result_is_inf; logic result_is_nan; logic res_is_zero; @@ -164,20 +225,18 @@ typedef struct packed { logic [31:0] inf; logic [31:0] nan; logic [31:0] addend; - logic mul_sign; logic mulres_is_tiny; logic res_is_tiny; logic invalid_operation; logic [2:0] round_mode; logic is_fmul; -} FMAStage3RegPath; +} FMA_WithFFlagsStage3RegPath; typedef struct packed { logic [75:0] abs_fma_result; logic [6:0] fmares_shift; logic [9:0] virtual_expo; logic subnormal; - logic res_is_negative; logic result_is_inf; logic result_is_nan; logic res_is_zero; @@ -189,12 +248,11 @@ typedef 
struct packed { logic [31:0] inf; logic [31:0] nan; logic [31:0] addend; - logic mul_sign; logic mulres_is_tiny; logic res_is_tiny; logic invalid_operation; logic [2:0] round_mode; logic is_fmul; -} FMAStage4RegPath; +} FMA_WithFFlagsStage4RegPath; endpackage \ No newline at end of file diff --git a/Processor/Src/Makefiles/CoreSources.inc.mk b/Processor/Src/Makefiles/CoreSources.inc.mk index 52d974fa..6abd1698 100644 --- a/Processor/Src/Makefiles/CoreSources.inc.mk +++ b/Processor/Src/Makefiles/CoreSources.inc.mk @@ -114,6 +114,7 @@ CORE_MODULES = \ FloatingPointUnit/FP32PipelinedAdder.sv \ FloatingPointUnit/FP32PipelinedMultiplier.sv \ FloatingPointUnit/FP32PipelinedFMA.sv \ + FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv \ FloatingPointUnit/FP32PipelinedOther.sv \ FloatingPointUnit/FP32DivSqrter.sv \ FloatingPointUnit/FP32DivSqrterWithFFlags.sv \ From 85aa273d3048b3f6004a92d7832de775f5cc351b Mon Sep 17 00:00:00 2001 From: Reoma Matsuo Date: Fri, 20 Dec 2024 14:41:05 +0900 Subject: [PATCH 04/11] refactor: remove unnecessary variables --- .../Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv index 9e27d590..c01d4375 100644 --- a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv +++ b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv @@ -271,10 +271,6 @@ module FMA_WithFFlagsStage4( : addend; wire[31:0] huge = huge_is_inf ? { result_sign, 8'hff, 23'h0 } : { result_sign, 8'hfe, 23'h7fffff }; wire[31:0] tiny = dir_is_away ? { result_sign, 8'h00, 23'h1 } : { result_sign, 8'h00, 23'h0 }; - - wire [7:0] addend_expo = addend[30:23]; - wire[22:0] addend_mant = addend[22:0]; - wire addend_is_zero = addend_expo == 8'h00 & addend_mant == 0; wire[31:0] zero = { is_fmul ? result_sign : (is_subtract ? 
round_mode == 2 : addend_sign), 8'h00, 23'h0 }; // Final result From 1f76587727f3055327d1515547e564fee7cf7b23 Mon Sep 17 00:00:00 2001 From: Reoma Matsuo Date: Fri, 20 Dec 2024 17:19:05 +0900 Subject: [PATCH 05/11] fix: fix compile errors when synthesize on Vivado --- Processor/Src/FloatingPointUnit/FP32DivSqrter.sv | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv b/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv index 759d3a50..6ffc1d2f 100644 --- a/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv +++ b/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv @@ -44,11 +44,11 @@ output logic [31:0] regResult, nextResult; wire lhs_sign = input_lhs[31]; - wire rhs_sign = rhs[31]; + wire rhs_sign = input_rhs[31]; wire [7:0] lhs_expo = input_lhs[30:23]; - wire [7:0] rhs_expo = rhs[30:23]; + wire [7:0] rhs_expo = input_rhs[30:23]; wire[22:0] lhs_mant = input_lhs[22:0]; - wire[22:0] rhs_mant = rhs[22:0]; + wire[22:0] rhs_mant = input_rhs[22:0]; // NaN handling wire lhs_is_zero = lhs_expo == 8'h00 & lhs_mant == 0; From 0dec02eec3a558c5761bc73f6909f5850c74327c Mon Sep 17 00:00:00 2001 From: Reoma Matsuo Date: Fri, 20 Dec 2024 18:30:21 +0900 Subject: [PATCH 06/11] fix: fix DivSqrter not to be a critical path --- .../Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv b/Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv index 87c32b77..bf39c535 100644 --- a/Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv +++ b/Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv @@ -96,8 +96,8 @@ output wire[23:0] v_rhs_mant_w = rhs_expo == 0 ? 
{ rhs_mant, 1'b0 } << leading_zeros_count(rhs_mant) : { 1'b1, rhs_mant }; reg [23:0] v_lhs_mant, v_rhs_mant; wire dividend_normalize = v_lhs_mant < v_rhs_mant; - wire [9:0] virtual_expo = v_lhs_expo - v_rhs_expo + 127 - { 8'h0, dividend_normalize }; // new biased virtual exponent (ignores subnormals) - wire subnormal = is_divide & $signed(virtual_expo) <= 0; + wire [9:0] virtual_expo_w = v_lhs_expo - v_rhs_expo + 127 - { 8'h0, dividend_normalize }; // new biased virtual exponent (ignores subnormals) + wire subnormal_w = is_divide & $signed(virtual_expo_w) <= 0; // The SRT loop. rem needs 27 bits. 24(mantissa)+2(x8/3,SRT)+1(sign) wire[26:0] rem_0 = is_divide ? dividend_normalize ? { 2'b00, v_lhs_mant, 1'b0 } : { 3'b000, v_lhs_mant } @@ -108,6 +108,8 @@ output reg [3:0] stage; reg [26:0] rem; reg [25:0] quo; + reg [9:0] virtual_expo; + reg subnormal; always@(posedge clk) begin if (rst) begin lhs <= '0; @@ -134,6 +136,8 @@ output rem <= rem_0; quo <= quo_0; stage <= is_divide ? 0 : 1; + virtual_expo <= virtual_expo_w; + subnormal <= subnormal_w; end else begin reg[3:0] div = is_divide ? 
{ 1'b0, v_rhs_mant[22:20] } : { quo[25], quo[23:21] }; From 609740e2ef3fc623773d81a53bc130ac0898546d Mon Sep 17 00:00:00 2001 From: Reoma Matsuo Date: Fri, 20 Dec 2024 20:48:17 +0900 Subject: [PATCH 07/11] fix: fix sign bit of 0 in FMA --- .../FP32PipelinedFMA_WithFFlags.sv | 13 +++++-------- Processor/Src/FloatingPointUnit/FPUTypes.sv | 4 ---- .../Src/Pipeline/FPBackEnd/FPExecutionStage.sv | 5 +---- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv index c01d4375..d4608940 100644 --- a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv +++ b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv @@ -7,7 +7,6 @@ input logic [31:0] mulrhs, logic [31:0] addend, logic [2:0] round_mode, - logic is_fmul, output logic [31:0] result, logic [4:0] fflags @@ -33,7 +32,7 @@ output : multiplier_lhs * multiplier_rhs + multiplier_addend; end - FMA_WithFFlagsStage0 stg0(clk, stg0Out, mullhs, mulrhs, addend, round_mode, is_fmul, is_sub, mlhs, mrhs, maddend); + FMA_WithFFlagsStage0 stg0(clk, stg0Out, mullhs, mulrhs, addend, round_mode, is_sub, mlhs, mrhs, maddend); FMA_WithFFlagsStage1 stg1(clk, stg0Out, stg1Out); FMA_WithFFlagsStage2 stg2(clk, stg1Out, stg2Out, fma_result); FMA_WithFFlagsStage3 stg3(clk, stg2Out, stg3Out); @@ -47,7 +46,6 @@ module FMA_WithFFlagsStage0( input logic [31:0] mulrhs, input logic [31:0] addend, input logic [2:0] round_mode, - input logic is_fmul, output logic is_subtract, output logic [76:0] mlhs, output logic [76:0] mrhs, @@ -129,7 +127,7 @@ module FMA_WithFFlagsStage0( assign maddend = { 1'b0, shifted_addend, addend_sticky }; assign stg0Out = {v_fmares_expo, res_is_inf, result_is_nan, res_is_addend, mul_sign, inf_sign, addend_sign, is_subtract, inf, nan, addend, - mulres_is_tiny, res_is_tiny, invalid_operation, round_mode, is_fmul}; + mulres_is_tiny, res_is_tiny, invalid_operation, 
round_mode}; endmodule module FMA_WithFFlagsStage1( @@ -164,7 +162,7 @@ module FMA_WithFFlagsStage2( assign stg2Out = {abs_fma_result, pipeReg.mulres_expo, pipeReg.result_is_inf, pipeReg.result_is_nan, res_is_zero, pipeReg.res_is_addend, result_sign, pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.inf, pipeReg.nan, pipeReg.addend, - pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode, pipeReg.is_fmul}; + pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode}; endmodule module FMA_WithFFlagsStage3( @@ -194,7 +192,7 @@ module FMA_WithFFlagsStage3( assign stg3Out = {abs_fma_result, fmares_shift, virtual_expo, subnormal, pipeReg.result_is_inf, pipeReg.result_is_nan, pipeReg.res_is_zero, pipeReg.res_is_addend, pipeReg.result_sign, pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.inf, pipeReg.nan, pipeReg.addend, - pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode, pipeReg.is_fmul}; + pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode}; endmodule module FMA_WithFFlagsStage4( @@ -243,7 +241,6 @@ module FMA_WithFFlagsStage4( wire res_is_tiny = pipeReg.res_is_tiny; wire invalid_operation = pipeReg.invalid_operation; wire [2:0] round_mode = pipeReg.round_mode; - wire is_fmul = pipeReg.is_fmul; // Normalize and rounding decision wire[24:0] shifter_result = { abs_fma_result, 24'b0 } >> (7'd75 - fmares_shift); // [75:0] -> [24:0] normalizing left shift emulation. The 24'b0 is needed for cases where large cancellations occur. [24:0] = { mantissa(23bit), guard(1bit), extra_guard_for_underflow_detection(1bit) } @@ -271,7 +268,7 @@ module FMA_WithFFlagsStage4( : addend; wire[31:0] huge = huge_is_inf ? { result_sign, 8'hff, 23'h0 } : { result_sign, 8'hfe, 23'h7fffff }; wire[31:0] tiny = dir_is_away ? 
{ result_sign, 8'h00, 23'h1 } : { result_sign, 8'h00, 23'h0 }; - wire[31:0] zero = { is_fmul ? result_sign : (is_subtract ? round_mode == 2 : addend_sign), 8'h00, 23'h0 }; + wire[31:0] zero = { is_subtract ? round_mode == 2 : addend_sign, 8'h00, 23'h0 }; // Final result assign result = result_is_nan ? nan : diff --git a/Processor/Src/FloatingPointUnit/FPUTypes.sv b/Processor/Src/FloatingPointUnit/FPUTypes.sv index b34a4d01..5ac4f562 100644 --- a/Processor/Src/FloatingPointUnit/FPUTypes.sv +++ b/Processor/Src/FloatingPointUnit/FPUTypes.sv @@ -189,7 +189,6 @@ typedef struct packed { logic res_is_tiny; logic invalid_operation; logic [2:0] round_mode; - logic is_fmul; } FMA_WithFFlagsStage1RegPath; typedef struct packed { @@ -208,7 +207,6 @@ typedef struct packed { logic res_is_tiny; logic invalid_operation; logic [2:0] round_mode; - logic is_fmul; } FMA_WithFFlagsStage2RegPath; typedef struct packed { @@ -229,7 +227,6 @@ typedef struct packed { logic res_is_tiny; logic invalid_operation; logic [2:0] round_mode; - logic is_fmul; } FMA_WithFFlagsStage3RegPath; typedef struct packed { @@ -252,7 +249,6 @@ typedef struct packed { logic res_is_tiny; logic invalid_operation; logic [2:0] round_mode; - logic is_fmul; } FMA_WithFFlagsStage4RegPath; endpackage \ No newline at end of file diff --git a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv index 0fd3123d..c6ac9edb 100644 --- a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv +++ b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv @@ -154,7 +154,6 @@ module FPExecutionStage( .mulrhs (fmaMulRHS[i]), .addend (fmaAddend[i]), .round_mode (rm[i]), - .is_fmul (fpuCode[i] == FC_MUL), .result ( fmaDataOut[i] ), .fflags ( fmaFFlagsOut[i]) ); @@ -178,7 +177,7 @@ module FPExecutionStage( fmaMulLHS[i] = fpuCode[i] inside {FC_FNMSUB, FC_FNMADD} ? {~fuOpA[i].data[31], fuOpA[i].data[30:0]} : fuOpA[i].data; fmaMulRHS[i] = fpuCode[i] inside {FC_ADD, FC_SUB} ? 
32'h3f800000 : fuOpB[i].data; if(fpuCode[i] == FC_MUL) begin - fmaAddend[i] = 32'h00000000; + fmaAddend[i] = { fmaMulLHS[i][31] ^ fmaMulRHS[i][31] , 31'h0 }; end else if (fpuCode[i] == FC_ADD) begin fmaAddend[i] = fuOpB[i].data; @@ -291,7 +290,6 @@ module FPExecutionStage( // dataOut[i].valid = localPipeReg[i][FP_EXEC_STAGE_DEPTH-2].regValid; - // TODO fflagsをちゃんと実装 unique case ( localPipeReg[i][FP_EXEC_STAGE_DEPTH-2].fpQueueData.fpOpInfo.opType ) FP_MOP_TYPE_ADD, FP_MOP_TYPE_MUL, FP_MOP_TYPE_FMA: begin dataOut[i].data = fmaDataOut[i]; @@ -378,7 +376,6 @@ module FPExecutionStage( nextStage[i].fpQueueData = localPipeReg[i][FP_EXEC_STAGE_DEPTH-2].fpQueueData; - // TODO implment fflags nextStage[i].fflagsOut = fflagsOut[i]; // リセットorフラッシュ時はNOP From ed21fdeb5f712d934f74048a14ffc9bc5ae08958 Mon Sep 17 00:00:00 2001 From: Reoma Matsuo Date: Fri, 20 Dec 2024 20:55:26 +0900 Subject: [PATCH 08/11] fix: remove redundancy variables in FMA --- .../Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv | 9 ++++----- Processor/Src/FloatingPointUnit/FPUTypes.sv | 4 ---- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv index d4608940..3fdf8a7f 100644 --- a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv +++ b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv @@ -101,7 +101,6 @@ module FMA_WithFFlagsStage0( wire res_is_inf = addend_is_inf | mullhs_is_inf | mulrhs_is_inf; wire mul_sign = mullhs_sign ^ mulrhs_sign; wire inf_sign = addend_is_inf ? addend_sign : mul_sign; - wire[31:0] inf = { inf_sign, 8'hff, 23'h0 }; // Main path (including subnormal handling) wire [9:0] v_mullhs_expo = { 2'b0, mullhs_expo == 8'h00 ? 
8'h01 : mullhs_expo }; @@ -126,7 +125,7 @@ module FMA_WithFFlagsStage0( assign mrhs = { 52'b0, v_mulrhs_mant, 1'b0 }; assign maddend = { 1'b0, shifted_addend, addend_sticky }; assign stg0Out = {v_fmares_expo, res_is_inf, result_is_nan, - res_is_addend, mul_sign, inf_sign, addend_sign, is_subtract, inf, nan, addend, + res_is_addend, mul_sign, inf_sign, addend_sign, is_subtract, nan, addend, mulres_is_tiny, res_is_tiny, invalid_operation, round_mode}; endmodule @@ -161,7 +160,7 @@ module FMA_WithFFlagsStage2( assign stg2Out = {abs_fma_result, pipeReg.mulres_expo, pipeReg.result_is_inf, pipeReg.result_is_nan, res_is_zero, pipeReg.res_is_addend, result_sign, - pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.inf, pipeReg.nan, pipeReg.addend, + pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.nan, pipeReg.addend, pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode}; endmodule @@ -191,7 +190,7 @@ module FMA_WithFFlagsStage3( assign stg3Out = {abs_fma_result, fmares_shift, virtual_expo, subnormal, pipeReg.result_is_inf, pipeReg.result_is_nan, pipeReg.res_is_zero, pipeReg.res_is_addend, pipeReg.result_sign, - pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.inf, pipeReg.nan, pipeReg.addend, + pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.nan, pipeReg.addend, pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode}; endmodule @@ -225,7 +224,6 @@ module FMA_WithFFlagsStage4( wire[75:0] abs_fma_result = pipeReg.abs_fma_result; wire [7:0] fmares_shift = pipeReg.fmares_shift; wire [9:0] virtual_expo = pipeReg.virtual_expo; - wire[31:0] inf = pipeReg.inf; wire[31:0] nan = pipeReg.nan; wire[31:0] addend = pipeReg.addend; wire result_is_inf = pipeReg.result_is_inf; @@ -269,6 +267,7 @@ module FMA_WithFFlagsStage4( wire[31:0] huge = huge_is_inf ? 
{ result_sign, 8'hff, 23'h0 } : { result_sign, 8'hfe, 23'h7fffff }; wire[31:0] tiny = dir_is_away ? { result_sign, 8'h00, 23'h1 } : { result_sign, 8'h00, 23'h0 }; wire[31:0] zero = { is_subtract ? round_mode == 2 : addend_sign, 8'h00, 23'h0 }; + wire[31:0] inf = { prop_inf_sign, 8'hff, 23'h0 }; // Final result assign result = result_is_nan ? nan : diff --git a/Processor/Src/FloatingPointUnit/FPUTypes.sv b/Processor/Src/FloatingPointUnit/FPUTypes.sv index 5ac4f562..2f313d85 100644 --- a/Processor/Src/FloatingPointUnit/FPUTypes.sv +++ b/Processor/Src/FloatingPointUnit/FPUTypes.sv @@ -182,7 +182,6 @@ typedef struct packed { logic prop_inf_sign; logic addend_sign; logic is_subtract; - logic [31:0] inf; logic [31:0] nan; logic [31:0] addend; logic mulres_is_tiny; @@ -200,7 +199,6 @@ typedef struct packed { logic prop_inf_sign; logic addend_sign; logic is_subtract; - logic [31:0] inf; logic [31:0] nan; logic [31:0] addend; logic mulres_is_tiny; @@ -220,7 +218,6 @@ typedef struct packed { logic prop_inf_sign; logic addend_sign; logic is_subtract; - logic [31:0] inf; logic [31:0] nan; logic [31:0] addend; logic mulres_is_tiny; @@ -242,7 +239,6 @@ typedef struct packed { logic prop_inf_sign; logic addend_sign; logic is_subtract; - logic [31:0] inf; logic [31:0] nan; logic [31:0] addend; logic mulres_is_tiny; From bcf9e7f0949a17a83eba359854a8f38db31f4457 Mon Sep 17 00:00:00 2001 From: Reoma Matsuo Date: Sat, 21 Dec 2024 00:18:17 +0900 Subject: [PATCH 09/11] refactor: add a comment --- Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv index c6ac9edb..72703102 100644 --- a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv +++ b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv @@ -177,6 +177,10 @@ module FPExecutionStage( fmaMulLHS[i] = fpuCode[i] inside {FC_FNMSUB, FC_FNMADD} ? 
{~fuOpA[i].data[31], fuOpA[i].data[30:0]} : fuOpA[i].data; fmaMulRHS[i] = fpuCode[i] inside {FC_ADD, FC_SUB} ? 32'h3f800000 : fuOpB[i].data; if(fpuCode[i] == FC_MUL) begin + // Hack: set sign bit considering rounding mode + // +a * +0.0 should return +0.0 regardless of rounding mode, + // However, when implemented with fma(+a, +0.0, -0.0), + // it returns -0.0 when round_mode = 2 fmaAddend[i] = { fmaMulLHS[i][31] ^ fmaMulRHS[i][31] , 31'h0 }; end else if (fpuCode[i] == FC_ADD) begin From c7232ff469fad48ac1a558732f73889d15e5f644 Mon Sep 17 00:00:00 2001 From: Reoma Matsuo Date: Sat, 21 Dec 2024 12:40:26 +0900 Subject: [PATCH 10/11] refactor: remove FMA and divider without fflags --- .../Src/FloatingPointUnit/FP32DivSqrter.sv | 328 +++++++++--------- .../FP32DivSqrterWithFFlags.sv | 210 ----------- .../Src/FloatingPointUnit/FP32PipelinedFMA.sv | 226 ++++++++---- .../FP32PipelinedFMA_WithFFlags.sv | 291 ---------------- .../Src/FloatingPointUnit/FPDivSqrtUnit.sv | 2 +- Processor/Src/FloatingPointUnit/FPUTypes.sv | 70 +--- Processor/Src/Makefiles/CoreSources.inc.mk | 2 - .../Pipeline/FPBackEnd/FPExecutionStage.sv | 2 +- 8 files changed, 336 insertions(+), 795 deletions(-) delete mode 100644 Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv delete mode 100644 Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv diff --git a/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv b/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv index 6ffc1d2f..83356385 100644 --- a/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv +++ b/Processor/Src/FloatingPointUnit/FP32DivSqrter.sv @@ -1,54 +1,69 @@ -import FPUTypes::*; -module FP32DivSqrter ( +module FP32DivSqrter( input logic clk, rst, - logic [31:0] input_lhs, + logic [31:0] input_lhs, logic [31:0] input_rhs, - logic input_is_divide, - logic req, + logic input_is_divide, + logic [2:0] input_round_mode, + logic req, output - logic finished, - logic [31:0] result + logic [31:0] result, + logic [4:0] fflags, + logic 
finished ); - function automatic [2:0] srt_table; + function round_to_away; + input[2:0] round_mode; + input sign; + input last_place; + input guard_bit; + input sticky_bit; + input reminder_is_positive; + input reminder_is_zero; + + case(round_mode) + 3'b000: round_to_away = guard_bit & (sticky_bit | reminder_is_positive | (reminder_is_zero & last_place)); // round to nearest, ties to even + 3'b100: round_to_away = guard_bit & (sticky_bit | reminder_is_positive | reminder_is_zero); // round to nearest, ties to away + 3'b010: round_to_away = sign & (guard_bit | sticky_bit | reminder_is_positive); // round downward + 3'b011: round_to_away = !sign & (guard_bit | sticky_bit | reminder_is_positive); // round upward + default: round_to_away = 0; // round towards zero + endcase + endfunction + + function[2:0] srt_table; input[5:0] rem; input[3:0] div; - reg[5:0] th12 = div < 1 ? 6 : div < 2 ? 7 : div < 4 ? 8 : div < 5 ? 9 : div < 6 ? 10 : 11; - reg[5:0] th01 = div < 2 ? 2 : div < 6 ? 3 : 4; + reg[5:0] th12; + reg[5:0] th01; + th12 = div < 1 ? 6 : div < 2 ? 7 : div < 4 ? 8 : div < 5 ? 9 : div < 6 ? 10 : 11; + th01 = div < 2 ? 2 : div < 6 ? 3 : 4; - if($signed(rem) < $signed(-th12)) srt_table = -2; + if($signed(rem) < $signed(-th12)) srt_table = -2; else if($signed(rem) < $signed(-th01)) srt_table = -1; else if($signed(rem) < $signed( th01)) srt_table = 0; else if($signed(rem) < $signed( th12)) srt_table = 1; else srt_table = 2; endfunction - function automatic [9:0] leading_zeros_count; + + reg [31:0] lhs; + reg [31:0] rhs; + reg is_divide; + reg [2:0] round_mode; + + function [9:0] leading_zeros_count; input[22:0] x; for(leading_zeros_count = 0; leading_zeros_count <= 22; leading_zeros_count = leading_zeros_count + 1) - if(x >> (22-leading_zeros_count) != 0) break; + if(x[22-leading_zeros_count]) break; endfunction - typedef enum logic[1:0] - { - PHASE_FINISHED = 0, // Division is finished. It outputs results. 
- PHASE_PREPARATION = 1, // In preparation - PHASE_PROCESSING = 2, // In processing (SRT loop) - PHASE_ROUNDING = 3 // In rounding & arrangement - } Phase; - - Phase regPhase, nextPhase; - logic [4:0] regCounter, nextCounter; - FDivSqrtRegPath regData, nextData; - logic [31:0] regResult, nextResult; - - wire lhs_sign = input_lhs[31]; - wire rhs_sign = input_rhs[31]; - wire [7:0] lhs_expo = input_lhs[30:23]; - wire [7:0] rhs_expo = input_rhs[30:23]; - wire[22:0] lhs_mant = input_lhs[22:0]; - wire[22:0] rhs_mant = input_rhs[22:0]; + + wire lhs_sign = lhs[31]; + wire rhs_sign = rhs[31]; + wire [7:0] lhs_expo = lhs[30:23]; + wire [7:0] rhs_expo = rhs[30:23]; + wire[22:0] lhs_mant = lhs[22:0]; + wire[22:0] rhs_mant = rhs[22:0]; // NaN handling wire lhs_is_zero = lhs_expo == 8'h00 & lhs_mant == 0; @@ -57,136 +72,139 @@ output wire rhs_is_inf = rhs_expo == 8'hff & rhs_mant == 0; wire lhs_is_nan = lhs_expo == 8'hff & lhs_mant != 0; wire rhs_is_nan = rhs_expo == 8'hff & rhs_mant != 0; - wire lhs_is_neg = lhs_sign & input_lhs != 32'h80000000; - wire res_is_nan = input_is_divide ? lhs_is_nan | rhs_is_nan | (lhs_is_zero & rhs_is_zero) | (lhs_is_inf & rhs_is_inf) + wire lhs_is_snan = lhs_is_nan & lhs_mant[22] == 0; + wire rhs_is_snan = rhs_is_nan & rhs_mant[22] == 0; + wire lhs_is_neg = !lhs_is_nan & lhs_sign & lhs != 32'h80000000; + wire res_is_nan = is_divide ? lhs_is_nan | rhs_is_nan | (lhs_is_zero & rhs_is_zero) | (lhs_is_inf & rhs_is_inf) : lhs_is_nan | lhs_is_neg; - //wire[31:0] nan = is_divide ? lhs_is_nan ? input_lhs | 32'h00400000 : rhs_is_nan ? rhs | 32'h00400000 : 32'hffc00000 - // : lhs_is_nan ? input_lhs | 32'h00400000 : 32'hffc00000; // qNaN - wire[31:0] nan = 32'h7fc00000; + // === About handling NaN === + // x86 returns the following qNaN: + // mullhs_is_nan ? mullhs | 32'h00400000 : + // mulrhs_is_nan ? mulrhs | 32'h00400000 : + // addend_is_nan ? addend | 32'h00400000 : 32'hffc00000 + // RISC-V always returns canonical NaN (32'h7fc00000). 
+ // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114 + wire[31:0] nan = 32'h7fc00000; + wire invalid_operation = is_divide ? lhs_is_snan | rhs_is_snan | (lhs_is_zero & rhs_is_zero) | (lhs_is_inf & rhs_is_inf) + : lhs_is_snan | lhs_is_neg; // Preparation - wire result_sign = input_is_divide & (lhs_sign ^ rhs_sign); - wire [9:0] v_lhs_expo = lhs_expo == 0 ? -leading_zeros_count(lhs_mant) : { 2'b0, lhs_expo }; // virtual exponent (ignores subnormals, but is biased) - wire [9:0] v_rhs_expo = rhs_expo == 0 ? -leading_zeros_count(rhs_mant) : { 2'b0, rhs_expo }; // virtual exponent (ignores subnormals, but is biased) - wire[23:0] v_lhs_mant = lhs_expo == 0 ? { lhs_mant, 1'b0 } << leading_zeros_count(lhs_mant) : { 1'b1, lhs_mant }; - wire[23:0] v_rhs_mant = rhs_expo == 0 ? { rhs_mant, 1'b0 } << leading_zeros_count(rhs_mant) : { 1'b1, rhs_mant }; - - wire dividend_normalize = regData.v_lhs_mant < regData.v_rhs_mant; - wire [9:0] virtual_expo = regData.v_lhs_expo - regData.v_rhs_expo + 127 - { 8'h0, dividend_normalize }; // new biased virtual exponent (ignores subnormals) - wire subnormal = regData.is_divide & $signed(virtual_expo) <= 0; - wire res_is_zero = regData.is_divide ? $signed(virtual_expo) <= -24 | regData.res_is_zero - : regData.res_is_zero; + wire result_sign = is_divide ? lhs_sign ^ rhs_sign : lhs_sign; + wire [9:0] v_lhs_expo = lhs_expo == 0 ? -leading_zeros_count(lhs_mant) : { 2'b0, lhs_expo }; // biased virtual exponent (ignores subnormals) + wire [9:0] v_rhs_expo = rhs_expo == 0 ? -leading_zeros_count(rhs_mant) : { 2'b0, rhs_expo }; // biased virtual exponent (ignores subnormals) + wire[23:0] v_lhs_mant_w = lhs_expo == 0 ? { lhs_mant, 1'b0 } << leading_zeros_count(lhs_mant) : { 1'b1, lhs_mant }; + wire[23:0] v_rhs_mant_w = rhs_expo == 0 ? 
{ rhs_mant, 1'b0 } << leading_zeros_count(rhs_mant) : { 1'b1, rhs_mant }; + reg [23:0] v_lhs_mant, v_rhs_mant; + wire dividend_normalize = v_lhs_mant < v_rhs_mant; + wire [9:0] virtual_expo_w = v_lhs_expo - v_rhs_expo + 127 - { 8'h0, dividend_normalize }; // new biased virtual exponent (ignores subnormals) + wire subnormal_w = is_divide & $signed(virtual_expo_w) <= 0; // The SRT loop. rem needs 27 bits. 24(mantissa)+2(x8/3,SRT)+1(sign) - wire[26:0] rem_0 = regData.is_divide ? dividend_normalize ? { 2'b00, regData.v_lhs_mant, 1'b0 } : { 3'b000, regData.v_lhs_mant } - : regData.v_lhs_expo[0] ? { 2'b0, regData.v_lhs_mant, 1'b0 } - 27'h1e40000 : { 1'b0, regData.v_lhs_mant, 2'b0 } - 27'h2400000; // 2 * (x - 1.375^2 or 1.5^2) - wire[25:0] quo_0 = regData.is_divide ? 26'h0 - : regData.v_lhs_expo[0] ? 26'h1600000 : 26'h1800000; // magical initial guess: 1.375 or 1.5; this avoids SRT-table defects at ([-4.5,-4-11/36], 1.5) and ([-4,-4+1/144], 1.25) - - logic [2:0] q; - logic [3:0] div; - logic [26:0] rem; - logic [25:0] quo; - always_comb begin - rem = regData.rem; - quo = regData.quo; - div = regData.is_divide ? { 1'b0, regData.v_rhs_mant[22:20] } : { quo[25], quo[23:21] }; - q = srt_table( rem[26:21], div ); - case(q) - 3'b010: rem = regData.is_divide ? (rem << 2) - { regData.v_rhs_mant, 3'b000 } - : (rem << 2) - { quo[24:0], 2'b00 } - (27'd4 << (regCounter)); - 3'b001: rem = regData.is_divide ? (rem << 2) - { 1'b0, regData.v_rhs_mant, 2'b00 } - : (rem << 2) - { quo, 1'b0 } - (27'd1 << (regCounter)); - 3'b111: rem = regData.is_divide ? (rem << 2) + { 1'b0, regData.v_rhs_mant, 2'b00 } - : (rem << 2) + { quo, 1'b0 } - (27'd1 << (regCounter)); - 3'b110: rem = regData.is_divide ? (rem << 2) + { regData.v_rhs_mant, 3'b000 } - : (rem << 2) + { quo[24:0], 2'b00 } - (27'd4 << (regCounter)); - default: rem = rem << 2; - endcase - quo = quo + ({ {23{q[2]}}, q } << (regCounter)); - end - - wire[47:0] before_round = regData.subnormal ? 
{ 1'b1, regData.quo[23:0], 23'h0 } >> -regData.virtual_expo : { regData.quo[23:0], 24'h0 }; - wire round_away = before_round[24] & ( (before_round[23:0] == 0 & regData.rem == 0 & before_round[25]) | before_round[23:0] != 0 | $signed(regData.rem) > 0 ); // round nearest, ties to even - wire exp_plus_one = before_round[47:25] == 23'h7fffff & round_away; - wire[22:0] result_mant = before_round[47:25] + { 22'h0, round_away }; // No special treatment is required even if a overflow occurs since the answer will be 0 and it will be correct. - wire [7:0] result_expo = regData.is_divide ? (subnormal ? 8'h00 : regData.virtual_expo[7:0]) + { 7'h0, exp_plus_one } - : regData.v_lhs_expo[8:1] + { 7'b0, regData.v_lhs_expo[0] } + 63; - wire res_is_inf = regData.is_divide ? $signed(regData.virtual_expo) >= 255 | regData.res_is_inf | result_expo == 8'hff - : regData.res_is_inf; - wire[31:0] inf = { regData.result_sign, 8'hff, 23'h0 }; - wire[31:0] zero = {{ regData.is_divide ? regData.result_sign : regData.lhs_sign }, 8'h00, 23'h0 }; - - wire[31:0] final_result = regData.res_is_nan ? regData.nan : - regData.res_is_zero ? zero : - res_is_inf ? inf : { regData.result_sign, result_expo, result_mant }; - - always_ff @(posedge clk) begin + wire[26:0] rem_0 = is_divide ? dividend_normalize ? { 2'b00, v_lhs_mant, 1'b0 } : { 3'b000, v_lhs_mant } + : v_lhs_expo[0] ? { 2'b0, v_lhs_mant_w, 1'b0 } - 27'h1e40000 : { 1'b0, v_lhs_mant_w, 2'b0 } - 27'h2400000; // 2 * (x - 1.375^2 or 1.5^2) + wire[25:0] quo_0 = is_divide ? 26'h0 + : v_lhs_expo[0] ? 
26'h1600000 : 26'h1800000; // magical initial guess: 1.375 or 1.5; this avoids SRT-table defects at ([-4.5,-4-11/36], 1.5) and ([-4,-4+1/144], 1.25) + + reg [3:0] stage; + reg [26:0] rem; + reg [25:0] quo; + reg [9:0] virtual_expo; + reg subnormal; + always@(posedge clk) begin if (rst) begin - regPhase <= PHASE_FINISHED; - regCounter <= '0; - regData <= '0; - regResult <= '0; - end - else begin - regPhase <= nextPhase; - regCounter <= nextCounter; - regData <= nextData; - regResult <= nextResult; - end - end - always_comb begin - nextCounter = regCounter; - nextData = regData; - nextResult = regResult; - if (req && regPhase == PHASE_FINISHED) begin - nextData.v_lhs_expo = v_lhs_expo; - nextData.v_lhs_mant = v_lhs_mant; - nextData.v_rhs_expo = v_rhs_expo; - nextData.v_rhs_mant = v_rhs_mant; - nextData.result_sign = result_sign; - nextData.lhs_sign = lhs_sign; - nextData.is_divide = input_is_divide; - nextData.res_is_nan = res_is_nan; - nextData.res_is_inf = input_is_divide ? (lhs_is_inf | rhs_is_zero) : (!lhs_sign & lhs_is_inf); - nextData.res_is_zero = input_is_divide ? (lhs_is_zero | rhs_is_inf) : lhs_is_zero; - nextData.nan = nan; - nextPhase = PHASE_PREPARATION; + lhs <= '0; + rhs <= '0; + stage <= '0; + rem <= '0; + quo <= '0; + v_lhs_mant <= '0; + v_rhs_mant <= '0; end - else if (regPhase == PHASE_PREPARATION) begin - nextData.virtual_expo = virtual_expo; - nextData.subnormal = subnormal; - nextData.res_is_zero = res_is_zero; - nextData.rem = rem_0; - nextData.quo = quo_0; - nextPhase = PHASE_PROCESSING; - nextCounter = regData.is_divide ? 24 : 22; + else if (stage == 13) begin + if (req) begin + lhs <= input_lhs; + rhs <= input_rhs; + is_divide <= input_is_divide; + round_mode <= input_round_mode; + stage <= input_is_divide ? 14 : 15; + end + end else if (stage == 14) begin + v_lhs_mant <= v_lhs_mant_w; + v_rhs_mant <= v_rhs_mant_w; + stage <= 15; + end else if (stage == 15) begin + rem <= rem_0; + quo <= quo_0; + stage <= is_divide ? 
0 : 1; + virtual_expo <= virtual_expo_w; + subnormal <= subnormal_w; + end else begin + reg[3:0] div = is_divide ? { 1'b0, v_rhs_mant[22:20] } + : { quo[25], quo[23:21] }; + reg[2:0] q = srt_table( rem[26:21], div ); + case(q) + 3'b010: rem <= is_divide ? (rem << 2) - { v_rhs_mant, 3'b000 } + : (rem << 2) - { quo[24:0], 2'b00 } - (27'd4 << (24-stage*2)); + 3'b001: rem <= is_divide ? (rem << 2) - { 1'b0, v_rhs_mant, 2'b00 } + : (rem << 2) - { quo, 1'b0 } - (27'd1 << (24-stage*2)); + 3'b111: rem <= is_divide ? (rem << 2) + { 1'b0, v_rhs_mant, 2'b00 } + : (rem << 2) + { quo, 1'b0 } - (27'd1 << (24-stage*2)); + 3'b110: rem <= is_divide ? (rem << 2) + { v_rhs_mant, 3'b000 } + : (rem << 2) + { quo[24:0], 2'b00 } - (27'd4 << (24-stage*2)); + default: rem <= rem << 2; + endcase + quo <= quo + ({ {23{q[2]}}, q } << (24-stage*2)); + stage <= stage + 1; end - else if (regPhase == PHASE_PROCESSING) begin - nextData.rem = rem; - nextData.quo = quo; - nextCounter = regCounter - 2; - nextPhase = (regCounter == 0) ? PHASE_ROUNDING : PHASE_PROCESSING; - end - // Here, quo has a <1/3ULP error. - else if (regPhase == PHASE_ROUNDING) begin - nextResult = final_result; - nextPhase = PHASE_FINISHED; - nextCounter = '0; - nextData = '0; - end - else begin - nextPhase = regPhase; - end - finished = regPhase == PHASE_FINISHED; - result = regResult; end - + assign finished = stage == 13; // Here, quo has a <1/3ULP error. + + wire[47:0] before_round = subnormal ? { 1'b1, quo[23:0], 23'h0 } >> -virtual_expo : { quo[23:0], 24'h0 }; + wire round_away = round_to_away(round_mode, result_sign, before_round[25], before_round[24], before_round[23:0] != 0, $signed(rem) > 0, rem == 0); + wire round_fall = round_mode == 2 ? !result_sign & before_round[24:0] == 0 & $signed(rem) < 0 : // ronud downward + round_mode == 3 ? result_sign & before_round[24:0] == 0 & $signed(rem) < 0 : // ronud upward + round_mode == 1 ? 
before_round[24:0] == 0 & $signed(rem) < 0 // round towards zero + : 0; + wire exp_plus_one = before_round[47:25] == 23'h7fffff & round_away; + // Since dividend is normalized, situations where before_round[24:0] == 0 & $signed(rem) < 0 do not happen; thus, `exp_minus_one' is always zero. + // wire exp_minus_one = before_round[47:25] == 23'h000000 & round_fall; + wire[22:0] result_mant = before_round[47:25] + { 22'h0, round_away } - { 22'h0, round_fall }; // No special treatment is required even if a overflow occurs since the answer will be correct. + wire [7:0] result_expo = is_divide ? (subnormal ? 8'h00 : virtual_expo[7:0]) + { 7'h0, exp_plus_one } + : v_lhs_expo[8:1] + { 7'b0, v_lhs_expo[0] } + 63 + { 7'h0, exp_plus_one }; + + // Treat p-127 as a normal for the underflow flag (rounding with unbounded exponent) + wire u_round_away = round_to_away(round_mode, result_sign, quo[1], quo[0], 1'b0, $signed(rem) > 0, rem == 0); + wire u_exp_plus_one = before_round[47:24] == 24'hffffff & u_round_away; + + // Special cases + wire res_is_huge = is_divide & $signed(virtual_expo) >= 255; + wire res_is_tiny = is_divide & !lhs_is_zero & !rhs_is_inf & $signed(virtual_expo) <= -24; + wire res_is_inf = is_divide ? lhs_is_inf | rhs_is_zero + : lhs_is_inf; + wire res_is_zero = is_divide ? lhs_is_zero | rhs_is_inf + : lhs_is_zero; + wire dir_is_away = (round_mode == 2 & result_sign) | (round_mode == 3 & !result_sign); + wire huge_is_inf = round_mode == 0 | round_mode == 4 | dir_is_away; + + wire[31:0] huge = huge_is_inf ? { result_sign, 8'hff, 23'h0 } : { result_sign, 8'hfe, 23'h7fffff }; + wire[31:0] tiny = dir_is_away ? { result_sign, 8'h00, 23'h1 } : { result_sign, 8'h00, 23'h0 }; + wire[31:0] inf = { result_sign, 8'hff, 23'h0 }; + wire[31:0] zero = { result_sign, 8'h00, 23'h0 }; + + // Final result + assign result = res_is_nan ? nan : + res_is_inf ? inf : + res_is_huge ? huge : + res_is_tiny ? tiny : + res_is_zero ? 
zero : { result_sign, result_expo, result_mant }; + // Exception flags + wire divide_by_zero = is_divide & !res_is_nan & !lhs_is_inf & rhs_is_zero; + wire overflow = is_divide & !res_is_nan & !lhs_is_inf & !rhs_is_zero & (res_is_huge | (virtual_expo == 254 & exp_plus_one)); + wire inexact = !res_is_nan & !(is_divide ? lhs_is_zero | lhs_is_inf | rhs_is_zero | rhs_is_inf : lhs_is_zero) & (overflow | res_is_tiny | before_round[24:0] != 0 | rem != 0); + // === About underflow (UF) flag + // RISC-V sets the UF flag when the absolute value of the result after rounding is less than FLT_MIN and the result is inexact. (same as x86) + // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114 + wire underflow = inexact & subnormal & !u_exp_plus_one; + // NV DZ OF UF NX + assign fflags = { invalid_operation, divide_by_zero, overflow, underflow, inexact }; endmodule - - - - - - - diff --git a/Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv b/Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv deleted file mode 100644 index bf39c535..00000000 --- a/Processor/Src/FloatingPointUnit/FP32DivSqrterWithFFlags.sv +++ /dev/null @@ -1,210 +0,0 @@ - -module FP32DivSqrterWithFFlags( -input - logic clk, rst, - logic [31:0] input_lhs, - logic [31:0] input_rhs, - logic input_is_divide, - logic [2:0] input_round_mode, - logic req, -output - logic [31:0] result, - logic [4:0] fflags, - logic finished -); - - function round_to_away; - input[2:0] round_mode; - input sign; - input last_place; - input guard_bit; - input sticky_bit; - input reminder_is_positive; - input reminder_is_zero; - - case(round_mode) - 3'b000: round_to_away = guard_bit & (sticky_bit | reminder_is_positive | (reminder_is_zero & last_place)); // round to nearest, ties to even - 3'b100: round_to_away = guard_bit & (sticky_bit | reminder_is_positive | reminder_is_zero); // round to nearest, ties to away - 3'b010: round_to_away = sign & (guard_bit | sticky_bit | reminder_is_positive); // round downward - 
3'b011: round_to_away = !sign & (guard_bit | sticky_bit | reminder_is_positive); // round upward - default: round_to_away = 0; // round towards zero - endcase - endfunction - - function[2:0] srt_table; - input[5:0] rem; - input[3:0] div; - - reg[5:0] th12; - reg[5:0] th01; - th12 = div < 1 ? 6 : div < 2 ? 7 : div < 4 ? 8 : div < 5 ? 9 : div < 6 ? 10 : 11; - th01 = div < 2 ? 2 : div < 6 ? 3 : 4; - - if($signed(rem) < $signed(-th12)) srt_table = -2; - else if($signed(rem) < $signed(-th01)) srt_table = -1; - else if($signed(rem) < $signed( th01)) srt_table = 0; - else if($signed(rem) < $signed( th12)) srt_table = 1; - else srt_table = 2; - endfunction - - reg [31:0] lhs; - reg [31:0] rhs; - reg is_divide; - reg [2:0] round_mode; - - function [9:0] leading_zeros_count; - input[22:0] x; - for(leading_zeros_count = 0; leading_zeros_count <= 22; leading_zeros_count = leading_zeros_count + 1) - if(x[22-leading_zeros_count]) break; - endfunction - - wire lhs_sign = lhs[31]; - wire rhs_sign = rhs[31]; - wire [7:0] lhs_expo = lhs[30:23]; - wire [7:0] rhs_expo = rhs[30:23]; - wire[22:0] lhs_mant = lhs[22:0]; - wire[22:0] rhs_mant = rhs[22:0]; - - // NaN handling - wire lhs_is_zero = lhs_expo == 8'h00 & lhs_mant == 0; - wire rhs_is_zero = rhs_expo == 8'h00 & rhs_mant == 0; - wire lhs_is_inf = lhs_expo == 8'hff & lhs_mant == 0; - wire rhs_is_inf = rhs_expo == 8'hff & rhs_mant == 0; - wire lhs_is_nan = lhs_expo == 8'hff & lhs_mant != 0; - wire rhs_is_nan = rhs_expo == 8'hff & rhs_mant != 0; - wire lhs_is_snan = lhs_is_nan & lhs_mant[22] == 0; - wire rhs_is_snan = rhs_is_nan & rhs_mant[22] == 0; - wire lhs_is_neg = !lhs_is_nan & lhs_sign & lhs != 32'h80000000; - wire res_is_nan = is_divide ? lhs_is_nan | rhs_is_nan | (lhs_is_zero & rhs_is_zero) | (lhs_is_inf & rhs_is_inf) - : lhs_is_nan | lhs_is_neg; - // === About handling NaN === - // x86 returns the following qNaN: - // mullhs_is_nan ? mullhs | 32'h00400000 : - // mulrhs_is_nan ? mulrhs | 32'h00400000 : - // addend_is_nan ? 
addend | 32'h00400000 : 32'hffc00000 - // RISC-V always returns canonical NaN (32'h7fc00000). - // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114 - wire[31:0] nan = 32'h7fc00000; - wire invalid_operation = is_divide ? lhs_is_snan | rhs_is_snan | (lhs_is_zero & rhs_is_zero) | (lhs_is_inf & rhs_is_inf) - : lhs_is_snan | lhs_is_neg; - - // Preparation - wire result_sign = is_divide ? lhs_sign ^ rhs_sign : lhs_sign; - wire [9:0] v_lhs_expo = lhs_expo == 0 ? -leading_zeros_count(lhs_mant) : { 2'b0, lhs_expo }; // biased virtual exponent (ignores subnormals) - wire [9:0] v_rhs_expo = rhs_expo == 0 ? -leading_zeros_count(rhs_mant) : { 2'b0, rhs_expo }; // biased virtual exponent (ignores subnormals) - wire[23:0] v_lhs_mant_w = lhs_expo == 0 ? { lhs_mant, 1'b0 } << leading_zeros_count(lhs_mant) : { 1'b1, lhs_mant }; - wire[23:0] v_rhs_mant_w = rhs_expo == 0 ? { rhs_mant, 1'b0 } << leading_zeros_count(rhs_mant) : { 1'b1, rhs_mant }; - reg [23:0] v_lhs_mant, v_rhs_mant; - wire dividend_normalize = v_lhs_mant < v_rhs_mant; - wire [9:0] virtual_expo_w = v_lhs_expo - v_rhs_expo + 127 - { 8'h0, dividend_normalize }; // new biased virtual exponent (ignores subnormals) - wire subnormal_w = is_divide & $signed(virtual_expo_w) <= 0; - - // The SRT loop. rem needs 27 bits. 24(mantissa)+2(x8/3,SRT)+1(sign) - wire[26:0] rem_0 = is_divide ? dividend_normalize ? { 2'b00, v_lhs_mant, 1'b0 } : { 3'b000, v_lhs_mant } - : v_lhs_expo[0] ? { 2'b0, v_lhs_mant_w, 1'b0 } - 27'h1e40000 : { 1'b0, v_lhs_mant_w, 2'b0 } - 27'h2400000; // 2 * (x - 1.375^2 or 1.5^2) - wire[25:0] quo_0 = is_divide ? 26'h0 - : v_lhs_expo[0] ? 
26'h1600000 : 26'h1800000; // magical initial guess: 1.375 or 1.5; this avoids SRT-table defects at ([-4.5,-4-11/36], 1.5) and ([-4,-4+1/144], 1.25) - - reg [3:0] stage; - reg [26:0] rem; - reg [25:0] quo; - reg [9:0] virtual_expo; - reg subnormal; - always@(posedge clk) begin - if (rst) begin - lhs <= '0; - rhs <= '0; - stage <= '0; - rem <= '0; - quo <= '0; - v_lhs_mant <= '0; - v_rhs_mant <= '0; - end - else if (stage == 13) begin - if (req) begin - lhs <= input_lhs; - rhs <= input_rhs; - is_divide <= input_is_divide; - round_mode <= input_round_mode; - stage <= input_is_divide ? 14 : 15; - end - end else if (stage == 14) begin - v_lhs_mant <= v_lhs_mant_w; - v_rhs_mant <= v_rhs_mant_w; - stage <= 15; - end else if (stage == 15) begin - rem <= rem_0; - quo <= quo_0; - stage <= is_divide ? 0 : 1; - virtual_expo <= virtual_expo_w; - subnormal <= subnormal_w; - end else begin - reg[3:0] div = is_divide ? { 1'b0, v_rhs_mant[22:20] } - : { quo[25], quo[23:21] }; - reg[2:0] q = srt_table( rem[26:21], div ); - case(q) - 3'b010: rem <= is_divide ? (rem << 2) - { v_rhs_mant, 3'b000 } - : (rem << 2) - { quo[24:0], 2'b00 } - (27'd4 << (24-stage*2)); - 3'b001: rem <= is_divide ? (rem << 2) - { 1'b0, v_rhs_mant, 2'b00 } - : (rem << 2) - { quo, 1'b0 } - (27'd1 << (24-stage*2)); - 3'b111: rem <= is_divide ? (rem << 2) + { 1'b0, v_rhs_mant, 2'b00 } - : (rem << 2) + { quo, 1'b0 } - (27'd1 << (24-stage*2)); - 3'b110: rem <= is_divide ? (rem << 2) + { v_rhs_mant, 3'b000 } - : (rem << 2) + { quo[24:0], 2'b00 } - (27'd4 << (24-stage*2)); - default: rem <= rem << 2; - endcase - quo <= quo + ({ {23{q[2]}}, q } << (24-stage*2)); - stage <= stage + 1; - end - end - assign finished = stage == 13; // Here, quo has a <1/3ULP error. - - wire[47:0] before_round = subnormal ? 
{ 1'b1, quo[23:0], 23'h0 } >> -virtual_expo : { quo[23:0], 24'h0 }; - wire round_away = round_to_away(round_mode, result_sign, before_round[25], before_round[24], before_round[23:0] != 0, $signed(rem) > 0, rem == 0); - wire round_fall = round_mode == 2 ? !result_sign & before_round[24:0] == 0 & $signed(rem) < 0 : // ronud downward - round_mode == 3 ? result_sign & before_round[24:0] == 0 & $signed(rem) < 0 : // ronud upward - round_mode == 1 ? before_round[24:0] == 0 & $signed(rem) < 0 // round towards zero - : 0; - wire exp_plus_one = before_round[47:25] == 23'h7fffff & round_away; - // Since dividend is normalized, situations where before_round[24:0] == 0 & $signed(rem) < 0 do not happen; thus, `exp_minus_one' is always zero. - // wire exp_minus_one = before_round[47:25] == 23'h000000 & round_fall; - wire[22:0] result_mant = before_round[47:25] + { 22'h0, round_away } - { 22'h0, round_fall }; // No special treatment is required even if a overflow occurs since the answer will be correct. - wire [7:0] result_expo = is_divide ? (subnormal ? 8'h00 : virtual_expo[7:0]) + { 7'h0, exp_plus_one } - : v_lhs_expo[8:1] + { 7'b0, v_lhs_expo[0] } + 63 + { 7'h0, exp_plus_one }; - - // Treat p-127 as a normal for the underflow flag (rounding with unbounded exponent) - wire u_round_away = round_to_away(round_mode, result_sign, quo[1], quo[0], 1'b0, $signed(rem) > 0, rem == 0); - wire u_exp_plus_one = before_round[47:24] == 24'hffffff & u_round_away; - - // Special cases - wire res_is_huge = is_divide & $signed(virtual_expo) >= 255; - wire res_is_tiny = is_divide & !lhs_is_zero & !rhs_is_inf & $signed(virtual_expo) <= -24; - wire res_is_inf = is_divide ? lhs_is_inf | rhs_is_zero - : lhs_is_inf; - wire res_is_zero = is_divide ? lhs_is_zero | rhs_is_inf - : lhs_is_zero; - wire dir_is_away = (round_mode == 2 & result_sign) | (round_mode == 3 & !result_sign); - wire huge_is_inf = round_mode == 0 | round_mode == 4 | dir_is_away; - - wire[31:0] huge = huge_is_inf ? 
{ result_sign, 8'hff, 23'h0 } : { result_sign, 8'hfe, 23'h7fffff }; - wire[31:0] tiny = dir_is_away ? { result_sign, 8'h00, 23'h1 } : { result_sign, 8'h00, 23'h0 }; - wire[31:0] inf = { result_sign, 8'hff, 23'h0 }; - wire[31:0] zero = { result_sign, 8'h00, 23'h0 }; - - // Final result - assign result = res_is_nan ? nan : - res_is_inf ? inf : - res_is_huge ? huge : - res_is_tiny ? tiny : - res_is_zero ? zero : { result_sign, result_expo, result_mant }; - // Exception flags - wire divide_by_zero = is_divide & !res_is_nan & !lhs_is_inf & rhs_is_zero; - wire overflow = is_divide & !res_is_nan & !lhs_is_inf & !rhs_is_zero & (res_is_huge | (virtual_expo == 254 & exp_plus_one)); - wire inexact = !res_is_nan & !(is_divide ? lhs_is_zero | lhs_is_inf | rhs_is_zero | rhs_is_inf : lhs_is_zero) & (overflow | res_is_tiny | before_round[24:0] != 0 | rem != 0); - // === About underflow (UF) flag - // RISC-V sets the UF flag when the absolute value of the result after rounding is less than FLT_MIN and the result is inexact. 
(same as x86) - // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114 - wire underflow = inexact & subnormal & !u_exp_plus_one; - // NV DZ OF UF NX - assign fflags = { invalid_operation, divide_by_zero, overflow, underflow, inexact }; -endmodule diff --git a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA.sv b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA.sv index c6ce7e59..a0ece5d2 100644 --- a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA.sv +++ b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA.sv @@ -1,10 +1,15 @@ -import FPUTypes::*; + + module FP32PipelinedFMA( - input logic clk, - input logic [31:0] mullhs, - input logic [31:0] mulrhs, - input logic [31:0] addend, - output logic [31:0] result +input + logic clk, + logic [31:0] mullhs, + logic [31:0] mulrhs, + logic [31:0] addend, + logic [2:0] round_mode, +output + logic [31:0] result, + logic [4:0] fflags ); FMAStage1RegPath stg0Out; @@ -27,11 +32,11 @@ module FP32PipelinedFMA( : multiplier_lhs * multiplier_rhs + multiplier_addend; end - FMAStage0 stg0(clk, stg0Out, mullhs, mulrhs, addend, is_sub, mlhs, mrhs, maddend); + FMAStage0 stg0(clk, stg0Out, mullhs, mulrhs, addend, round_mode, is_sub, mlhs, mrhs, maddend); FMAStage1 stg1(clk, stg0Out, stg1Out); FMAStage2 stg2(clk, stg1Out, stg2Out, fma_result); FMAStage3 stg3(clk, stg2Out, stg3Out); - FMAStage4 stg4(clk, stg3Out, result); + FMAStage4 stg4(clk, stg3Out, result, fflags); endmodule module FMAStage0( @@ -40,11 +45,13 @@ module FMAStage0( input logic [31:0] mullhs, input logic [31:0] mulrhs, input logic [31:0] addend, + input logic [2:0] round_mode, output logic is_subtract, output logic [76:0] mlhs, output logic [76:0] mrhs, output logic [76:0] maddend ); + wire mullhs_sign = mullhs[31]; wire mulrhs_sign = mulrhs[31]; wire addend_sign = addend[31]; @@ -54,44 +61,72 @@ module FMAStage0( wire[22:0] mullhs_mant = mullhs[22:0]; wire[22:0] mulrhs_mant = mulrhs[22:0]; wire[22:0] addend_mant = addend[22:0]; - assign is_subtract = mullhs_sign 
^ mulrhs_sign ^ addend_sign; + + assign is_subtract = mullhs_sign ^ mulrhs_sign ^ addend_sign; // NaN handling - wire mullhs_is_zero = mullhs_expo == 8'h00 & mullhs_mant == 0; - wire mulrhs_is_zero = mulrhs_expo == 8'h00 & mulrhs_mant == 0; - wire addend_is_zero = addend_expo == 8'h00 & addend_mant == 0; - wire mullhs_is_inf = mullhs_expo == 8'hff & mullhs_mant == 0; - wire mulrhs_is_inf = mulrhs_expo == 8'hff & mulrhs_mant == 0; - wire addend_is_inf = addend_expo == 8'hff & addend_mant == 0; - wire mullhs_is_nan = mullhs_expo == 8'hff & mullhs_mant != 0; - wire mulrhs_is_nan = mulrhs_expo == 8'hff & mulrhs_mant != 0; - wire addend_is_nan = addend_expo == 8'hff & addend_mant != 0; - wire result_is_nan = mullhs_is_nan | mulrhs_is_nan | addend_is_nan // One of the input is NaN - | (mullhs_is_zero & mulrhs_is_inf) | (mullhs_is_inf & mulrhs_is_zero) // Inf * Zero - | (is_subtract & (mullhs_is_inf | mulrhs_is_inf) & addend_is_inf); // Inf - Inf - //wire[31:0] nan = mullhs_is_nan ? mullhs | 32'h00400000 : mulrhs_is_nan ? mulrhs | 32'h00400000 : addend_is_nan ? 
addend | 32'h00400000 : 32'hffc00000; // qNan - wire[31:0] nan = 32'h7fc00000; + wire mullhs_is_zero = mullhs_expo == 8'h00 & mullhs_mant == 0; + wire mulrhs_is_zero = mulrhs_expo == 8'h00 & mulrhs_mant == 0; + wire addend_is_zero = addend_expo == 8'h00 & addend_mant == 0; + wire mullhs_is_inf = mullhs_expo == 8'hff & mullhs_mant == 0; + wire mulrhs_is_inf = mulrhs_expo == 8'hff & mulrhs_mant == 0; + wire addend_is_inf = addend_expo == 8'hff & addend_mant == 0; + wire mullhs_is_nan = mullhs_expo == 8'hff & mullhs_mant != 0; + wire mulrhs_is_nan = mulrhs_expo == 8'hff & mulrhs_mant != 0; + wire addend_is_nan = addend_expo == 8'hff & addend_mant != 0; + wire mullhs_is_snan = mullhs_is_nan & mullhs_mant[22] == 0; + wire mulrhs_is_snan = mulrhs_is_nan & mulrhs_mant[22] == 0; + wire addend_is_snan = addend_is_nan & addend_mant[22] == 0; + wire mulres_is_inf = (mullhs_is_inf & !mulrhs_is_nan) | (!mullhs_is_nan & mulrhs_is_inf); + wire mulres_is_zero = mullhs_is_zero | mulrhs_is_zero; + wire res_is_addend = mulres_is_zero & !addend_is_zero; + // === About setting invalid operation (NV) flag === + // x86 does not set the NV flag on ±0×±∞±qNaN. + // RISC-V sets the NV flag on ±0×±∞±qNaN. + // --- The RISC-V Instruction Set Manual 20240411 Volume I p.116 + wire invalid_operation = mullhs_is_snan | mulrhs_is_snan | addend_is_snan // One of the input values is sNaN + | (mullhs_is_zero & mulrhs_is_inf) | (mullhs_is_inf & mulrhs_is_zero) // Inf * Zero + | (is_subtract & mulres_is_inf & addend_is_inf); // Inf - Inf + wire result_is_nan = mullhs_is_nan | mulrhs_is_nan | addend_is_nan | invalid_operation; + // === About handling NaN === + // x86 returns the following qNaN: + // mullhs_is_nan ? mullhs | 32'h00400000 : + // mulrhs_is_nan ? mulrhs | 32'h00400000 : + // addend_is_nan ? addend | 32'h00400000 : 32'hffc00000 + // RISC-V always returns canonical NaN (32'h7fc00000). 
+ // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114 + wire[31:0] nan = 32'h7fc00000; // Inf handling - wire result_is_inf = addend_is_inf | mullhs_is_inf | mulrhs_is_inf; - wire prop_inf_sign = addend_is_inf ? addend_sign : mullhs_sign ^ mulrhs_sign; + wire res_is_inf = addend_is_inf | mullhs_is_inf | mulrhs_is_inf; wire mul_sign = mullhs_sign ^ mulrhs_sign; + wire inf_sign = addend_is_inf ? addend_sign : mul_sign; + + // Main path (including subnormal handling) + wire [9:0] v_mullhs_expo = { 2'b0, mullhs_expo == 8'h00 ? 8'h01 : mullhs_expo }; + wire [9:0] v_mulrhs_expo = { 2'b0, mulrhs_expo == 8'h00 ? 8'h01 : mulrhs_expo }; + wire [9:0] v_addend_expo = { 2'b0, addend_expo == 8'h00 ? 8'h01 : addend_expo }; + wire[23:0] v_mullhs_mant = { mullhs_expo != 8'h00, mullhs_mant }; + wire[23:0] v_mulrhs_mant = { mulrhs_expo != 8'h00, mulrhs_mant }; + wire[23:0] v_addend_mant = { addend_expo != 8'h00, addend_mant }; + wire [9:0] v_fmares_expo = v_mullhs_expo + v_mulrhs_expo - 127 + 26; // See below: There are 26 bits above lhs*rhs<<3, assuming no carryover occurs in lhs*rhs. + wire [9:0] addend_shift = v_fmares_expo - v_addend_expo; + wire[74:0] shifted_addend = { v_addend_mant, 2'b00, 49'b0 } >> addend_shift; // The 2'b00 are the guard bit and the round bit. + wire addend_sticky = $signed(addend_shift) > 75 ? v_addend_mant != 0 + : v_addend_mant << (10'd75 - addend_shift) != 24'h000000; // the part shifted out above + // Special cases + wire mulres_is_tiny = $signed(addend_shift) < 0 & !mulres_is_zero & !addend_is_zero; // |mullhs*mulrhs| < 0.5ULP(|addend|-eps) + wire res_is_tiny = $signed(addend_shift) < 0 & !mulres_is_zero & addend_is_zero; // |mullhs*mulrhs+addend| < 0.5FLT_TRUE_MIN - wire [9:0] v_mullhs_expo = { 2'b0, mullhs_expo == 8'h00 ? 8'h01 : mullhs_expo }; - wire [9:0] v_mulrhs_expo = { 2'b0, mulrhs_expo == 8'h00 ? 8'h01 : mulrhs_expo }; - wire [9:0] v_addend_expo = { 2'b0, addend_expo == 8'h00 ? 
8'h01 : addend_expo }; - wire [9:0] mulres_expo = v_mullhs_expo + v_mulrhs_expo - 127; - wire [9:0] addend_shift = v_addend_expo - mulres_expo + 23; - wire res_is_addend = ($signed(addend_shift) > 49 | mullhs_is_zero | mulrhs_is_zero) & !addend_is_zero; // |lhs*rhs| < 0.5ULP(|addend|-eps); assuming round to nearest, result is equal to the addend. - wire addend_sticky = $signed(addend_shift) >= 0 ? 1'b0 : - $signed(addend_shift) < -26 ? { addend_expo != 8'h00, addend_mant } != 0 - : { addend_expo != 8'h00, addend_mant } << (10'd26 + addend_shift) != 24'h000000; // shifted out part of { mantissa(24bit), guard(1bit), round(1bit) } >> -addend_shift - assign maddend = { 1'b0, { addend_expo != 8'h00, addend_mant, 2'b00, 49'b0 } >> (10'd49 - addend_shift), addend_sticky }; // The 1'b0 is the sign bit. The 2'b0 are the gaurd bit and the round bit. - assign mlhs = { 51'b0, mullhs_expo != 8'h00, mullhs_mant, 2'b0 }; // lhs_expo != 8'h00 is the hidden bit of a normalized number - assign mrhs = { 52'b0, mulrhs_expo != 8'h00, mulrhs_mant, 1'b0 }; // rhs_expo != 8'h00 is the hidden bit of a normalized number - - assign stg0Out = {mulres_expo, result_is_inf, result_is_nan, - res_is_addend, mul_sign, prop_inf_sign, addend_sign, is_subtract, nan, addend}; + // Fused-multiply-adder (24bit*24bit<<3+76bit+sign) + // The multiplication result is shifted by 3 bits for the guard bit, the round bit, and the sticky bit. + // The adder is sufficient for 76 bits + 1 sign bit because |lhs*rhs<<3| < 2^51 is <0.5 ULP when subtracted from 2^76. Note: ULP(1-eps) = 2^-24 while ULP(1+eps) = 2^-23. 
+ assign mlhs = { 51'b0, v_mullhs_mant, 2'b0 }; + assign mrhs = { 52'b0, v_mulrhs_mant, 1'b0 }; + assign maddend = { 1'b0, shifted_addend, addend_sticky }; + assign stg0Out = {v_fmares_expo, res_is_inf, result_is_nan, + res_is_addend, mul_sign, inf_sign, addend_sign, is_subtract, nan, addend, + mulres_is_tiny, res_is_tiny, invalid_operation, round_mode}; endmodule module FMAStage1( @@ -123,9 +158,10 @@ module FMAStage2( wire[75:0] abs_fma_result = res_is_negative ? -fma_result[75:0] : fma_result[75:0]; wire result_sign = mul_sign ^ res_is_negative; - assign stg2Out = {abs_fma_result, pipeReg.mulres_expo, res_is_negative, pipeReg.result_is_inf, + assign stg2Out = {abs_fma_result, pipeReg.mulres_expo, pipeReg.result_is_inf, pipeReg.result_is_nan, res_is_zero, pipeReg.res_is_addend, result_sign, - pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.nan, pipeReg.addend}; + pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.nan, pipeReg.addend, + pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode}; endmodule module FMAStage3( @@ -146,32 +182,50 @@ module FMAStage3( wire[75:0] abs_fma_result = pipeReg.abs_fma_result; wire [9:0] mulres_expo = pipeReg.mulres_expo; - wire [7:0] leading_zeros = { 1'b0, leading_zeros_count(abs_fma_result) }; // 0 <= leading_sign_bits <= 74 if !res_is_zero - wire [9:0] virtual_expo = mulres_expo - { 2'b00, leading_zeros } + 26; // There are 26 bits above lhs*rhs<<3, assuming no carryover occurs in lhs*rhs. + wire [6:0] leading_zeros = leading_zeros_count(abs_fma_result); // 0 <= leading_sign_bits <= 74 if !res_is_zero + wire [9:0] virtual_expo = mulres_expo - { 3'b00, leading_zeros }; // There are 26 bits above lhs*rhs<<3, assuming no carryover occurs in lhs*rhs. wire subnormal = $signed(virtual_expo) <= 0; - wire [7:0] fmares_shift = subnormal ? 
26 - mulres_expo[7:0] // There are 3 bits below lhs*rhs<<3, and 23 bits will be lost due to rounding, assuming no carryover occurs in lhs*rhs. - : 51 - leading_zeros; // (75 - addend_sticky(1bit)) - shifter_result(24bit) + wire [6:0] fmares_shift = subnormal ? mulres_expo[6:0] // There are 3 bits below lhs*rhs<<3, and 23 bits will be lost due to rounding, assuming no carryover occurs in lhs*rhs. + : leading_zeros + 1; // (75 - addend_sticky(1bit)) - shifter_result(24bit) - assign stg3Out = {abs_fma_result, fmares_shift, virtual_expo, subnormal, pipeReg.res_is_negative, pipeReg.result_is_inf, + assign stg3Out = {abs_fma_result, fmares_shift, virtual_expo, subnormal, pipeReg.result_is_inf, pipeReg.result_is_nan, pipeReg.res_is_zero, pipeReg.res_is_addend, pipeReg.result_sign, - pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.nan, pipeReg.addend}; + pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.nan, pipeReg.addend, + pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode}; endmodule module FMAStage4( input logic clk, input FMAStage4RegPath stg4In, - output logic [31:0] result + output logic [31:0] result, + output logic [4:0] fflags ); + function round_to_away; + input sign; + input last_place; + input guard_bit; + input sticky_bit; + input[2:0] round_mode; + + case(round_mode) + 3'b000: round_to_away = guard_bit & (last_place | sticky_bit); // round to nearest, ties to even + 3'b100: round_to_away = guard_bit; // round to nearest, ties to away + 3'b010: round_to_away = sign & (guard_bit | sticky_bit); // round downward + 3'b011: round_to_away = !sign & (guard_bit | sticky_bit); // round upward + default: round_to_away = 0; // round towards zero + endcase + endfunction + FMAStage4RegPath pipeReg; always_ff @(posedge clk) begin pipeReg <= stg4In; end + wire[75:0] abs_fma_result = pipeReg.abs_fma_result; wire [7:0] fmares_shift = pipeReg.fmares_shift; wire [9:0] virtual_expo = 
pipeReg.virtual_expo; wire[31:0] nan = pipeReg.nan; wire[31:0] addend = pipeReg.addend; - wire res_is_negative = pipeReg.res_is_negative; wire result_is_inf = pipeReg.result_is_inf; wire result_is_nan = pipeReg.result_is_nan; wire res_is_zero = pipeReg.res_is_zero; @@ -181,23 +235,57 @@ module FMAStage4( wire addend_sign = pipeReg.addend_sign; wire subnormal = pipeReg.subnormal; wire is_subtract = pipeReg.is_subtract; - - /* verilator lint_off WIDTH */ - wire[23:0] shifter_result = { abs_fma_result, 23'b0 } >> (7'd23 + fmares_shift); - /* verilator lint_on WIDTH */ - wire sticky = abs_fma_result << (76 - fmares_shift) != 0; // the part shifted out above - - wire round_to_away = shifter_result[0] & (shifter_result[1] | sticky); // round to nearest, ties to even - wire exp_plus_one = shifter_result >= 24'hffffff; // carry is generated with rounding taken into account - - wire[22:0] result_mant = shifter_result[23:1] + { 22'h0, round_to_away }; // No special treatment is required even if a overflow occurs since the answer will be 0 and it will be correct. - wire [7:0] result_expo = (subnormal ? 8'h00 : virtual_expo[7:0]) + { 7'b0, exp_plus_one }; - wire res_is_inf = result_is_inf | $signed(virtual_expo) >= 255; - wire[31:0] inf = { result_is_inf ? prop_inf_sign : result_sign, 8'hff, 23'h0 }; - wire[31:0] zero = { is_subtract ? 1'b0 : addend_sign, 8'h00, 23'h0 }; - - wire[31:0] final_result = res_is_inf ? inf : - res_is_addend ? addend : - res_is_zero ? zero : { result_sign, result_expo, result_mant }; - assign result = result_is_nan ? nan : final_result; + wire mulres_is_tiny = pipeReg.mulres_is_tiny; + wire res_is_tiny = pipeReg.res_is_tiny; + wire invalid_operation = pipeReg.invalid_operation; + wire [2:0] round_mode = pipeReg.round_mode; + + // Normalize and rounding decision + wire[24:0] shifter_result = { abs_fma_result, 24'b0 } >> (7'd75 - fmares_shift); // [75:0] -> [24:0] normalizing left shift emulation. 
The 24'b0 is needed for cases where large cancellations occur. [24:0] = { mantissa(23bit), guard(1bit), extra_guard_for_underflow_detection(1bit) } + wire sticky = abs_fma_result << (7'd25 + fmares_shift) != 0; // the part right-shifted out above + + wire round_away = round_to_away(result_sign, shifter_result[2], shifter_result[1], shifter_result[0] | sticky, round_mode); + wire exp_plus_one = shifter_result >= 25'h1fffffc & round_away; // carry is generated with rounding taken into account + // Treat p-127 as a normal for the underflow flag (rounding with unbounded exponent) + wire u_round_away = round_to_away(result_sign, shifter_result[1], shifter_result[0], sticky, round_mode); + wire u_exp_plus_one = shifter_result >= 25'h1fffffe & u_round_away; // 0x1.fffffep-127 <= |mullhs*mulrhs+addend| < 0x1p-126 and the result after rounding becomes a normal number, so the underflow flag is not raised. + + wire[22:0] result_mant = shifter_result[24:2] + { 22'h0, round_away }; // No special treatment is required even if an overflow occurs since the answer will be 0 and it will be correct. + wire [7:0] result_expo = (subnormal ? 8'h00 : virtual_expo[7:0]) + { 7'b0, exp_plus_one }; + + // Special cases + wire res_is_huge = $signed(virtual_expo) >= 255; + wire dir_is_away = (round_mode == 2 & result_sign) | (round_mode == 3 & !result_sign); + wire huge_is_inf = round_mode == 0 | round_mode == 4 | dir_is_away; + + wire[31:0] addend_plus_tiny = round_mode == 1 & is_subtract ? addend - 1 : + round_mode == 2 & !addend_sign & is_subtract ? addend - 1 : + round_mode == 3 & addend_sign & is_subtract ? addend - 1 : + round_mode == 2 & addend_sign & !is_subtract ? addend + 1 : + round_mode == 3 & !addend_sign & !is_subtract ? addend + 1 + : addend; + wire[31:0] huge = huge_is_inf ? { result_sign, 8'hff, 23'h0 } : { result_sign, 8'hfe, 23'h7fffff }; + wire[31:0] tiny = dir_is_away ? { result_sign, 8'h00, 23'h1 } : { result_sign, 8'h00, 23'h0 }; + wire[31:0] zero = { is_subtract ?
round_mode == 2 : addend_sign, 8'h00, 23'h0 }; + wire[31:0] inf = { prop_inf_sign, 8'hff, 23'h0 }; + + // Final result + assign result = result_is_nan ? nan : + result_is_inf ? inf : + res_is_huge ? huge : + res_is_tiny ? tiny : + mulres_is_tiny ? addend_plus_tiny : + res_is_addend ? addend : + res_is_zero ? zero : { result_sign, result_expo, result_mant }; + + // Exception flags + wire divide_by_zero = 1'b0; + wire overflow = !result_is_nan & !result_is_inf & (mulres_is_tiny ? addend_plus_tiny[30:23] == 8'hff : res_is_huge | (virtual_expo == 254 & exp_plus_one)); + wire inexact = !result_is_nan & !result_is_inf & (overflow | res_is_tiny | mulres_is_tiny | shifter_result[1] | shifter_result[0] | sticky); + // === About underflow (UF) flag + // RISC-V sets the UF flag when the absolute value of the result after rounding is less than FLT_MIN and the result is inexact. (same as x86) + // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114 + wire underflow = inexact & (mulres_is_tiny ? 
addend[30:23] == 8'h00 | addend_plus_tiny[30:23] == 8'h00 : res_is_tiny | (subnormal & !u_exp_plus_one)); + // NV DZ OF UF NX + assign fflags = { invalid_operation, divide_by_zero, overflow, underflow, inexact }; endmodule diff --git a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv b/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv deleted file mode 100644 index 3fdf8a7f..00000000 --- a/Processor/Src/FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv +++ /dev/null @@ -1,291 +0,0 @@ - - -module FP32PipelinedFMA_WithFFlags( -input - logic clk, - logic [31:0] mullhs, - logic [31:0] mulrhs, - logic [31:0] addend, - logic [2:0] round_mode, -output - logic [31:0] result, - logic [4:0] fflags -); - - FMA_WithFFlagsStage1RegPath stg0Out; - FMA_WithFFlagsStage2RegPath stg1Out; - FMA_WithFFlagsStage3RegPath stg2Out; - FMA_WithFFlagsStage4RegPath stg3Out; - - // Fused-multiply-adder (24bit*24bit<<3+76bit+sign) - // The multiplication result is shifted by 2 bits for the guard bit and the sticky bit. - // The adder is sufficient for 76 bits + 1 sign bit because |lhs*rhs<<3| ~ 2^51 is <0.5 ULP when subtracted from 2^76. Note: ULP(1-eps) = 2^-24 while ULP(1+eps) = 2^-23. - logic [76:0] multiplier_lhs, multiplier_rhs, multiplier_addend, fma_result; - logic [76:0] mlhs, mrhs, maddend; - logic is_subtract, is_sub; - always_ff @(posedge clk) begin - multiplier_lhs <= mlhs; - multiplier_rhs <= mrhs; - multiplier_addend <= maddend; - is_subtract <= is_sub; - fma_result <= is_subtract ? 
multiplier_lhs * multiplier_rhs - multiplier_addend - : multiplier_lhs * multiplier_rhs + multiplier_addend; - end - - FMA_WithFFlagsStage0 stg0(clk, stg0Out, mullhs, mulrhs, addend, round_mode, is_sub, mlhs, mrhs, maddend); - FMA_WithFFlagsStage1 stg1(clk, stg0Out, stg1Out); - FMA_WithFFlagsStage2 stg2(clk, stg1Out, stg2Out, fma_result); - FMA_WithFFlagsStage3 stg3(clk, stg2Out, stg3Out); - FMA_WithFFlagsStage4 stg4(clk, stg3Out, result, fflags); -endmodule - -module FMA_WithFFlagsStage0( - input logic clk, - output FMA_WithFFlagsStage1RegPath stg0Out, - input logic [31:0] mullhs, - input logic [31:0] mulrhs, - input logic [31:0] addend, - input logic [2:0] round_mode, - output logic is_subtract, - output logic [76:0] mlhs, - output logic [76:0] mrhs, - output logic [76:0] maddend -); - - wire mullhs_sign = mullhs[31]; - wire mulrhs_sign = mulrhs[31]; - wire addend_sign = addend[31]; - wire [7:0] mullhs_expo = mullhs[30:23]; - wire [7:0] mulrhs_expo = mulrhs[30:23]; - wire [7:0] addend_expo = addend[30:23]; - wire[22:0] mullhs_mant = mullhs[22:0]; - wire[22:0] mulrhs_mant = mulrhs[22:0]; - wire[22:0] addend_mant = addend[22:0]; - - assign is_subtract = mullhs_sign ^ mulrhs_sign ^ addend_sign; - - // NaN handling - wire mullhs_is_zero = mullhs_expo == 8'h00 & mullhs_mant == 0; - wire mulrhs_is_zero = mulrhs_expo == 8'h00 & mulrhs_mant == 0; - wire addend_is_zero = addend_expo == 8'h00 & addend_mant == 0; - wire mullhs_is_inf = mullhs_expo == 8'hff & mullhs_mant == 0; - wire mulrhs_is_inf = mulrhs_expo == 8'hff & mulrhs_mant == 0; - wire addend_is_inf = addend_expo == 8'hff & addend_mant == 0; - wire mullhs_is_nan = mullhs_expo == 8'hff & mullhs_mant != 0; - wire mulrhs_is_nan = mulrhs_expo == 8'hff & mulrhs_mant != 0; - wire addend_is_nan = addend_expo == 8'hff & addend_mant != 0; - wire mullhs_is_snan = mullhs_is_nan & mullhs_mant[22] == 0; - wire mulrhs_is_snan = mulrhs_is_nan & mulrhs_mant[22] == 0; - wire addend_is_snan = addend_is_nan & addend_mant[22] == 0; - 
wire mulres_is_inf = (mullhs_is_inf & !mulrhs_is_nan) | (!mullhs_is_nan & mulrhs_is_inf); - wire mulres_is_zero = mullhs_is_zero | mulrhs_is_zero; - wire res_is_addend = mulres_is_zero & !addend_is_zero; - // === About setting invalid operation (NV) flag === - // x86 does not set the NV flag on ±0×±∞±qNaN. - // RISC-V sets the NV flag on ±0×±∞±qNaN. - // --- The RISC-V Instruction Set Manual 20240411 Volume I p.116 - wire invalid_operation = mullhs_is_snan | mulrhs_is_snan | addend_is_snan // One of the input values is sNaN - | (mullhs_is_zero & mulrhs_is_inf) | (mullhs_is_inf & mulrhs_is_zero) // Inf * Zero - | (is_subtract & mulres_is_inf & addend_is_inf); // Inf - Inf - wire result_is_nan = mullhs_is_nan | mulrhs_is_nan | addend_is_nan | invalid_operation; - // === About handling NaN === - // x86 returns the following qNaN: - // mullhs_is_nan ? mullhs | 32'h00400000 : - // mulrhs_is_nan ? mulrhs | 32'h00400000 : - // addend_is_nan ? addend | 32'h00400000 : 32'hffc00000 - // RISC-V always returns canonical NaN (32'h7fc00000). - // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114 - wire[31:0] nan = 32'h7fc00000; - - // Inf handling - wire res_is_inf = addend_is_inf | mullhs_is_inf | mulrhs_is_inf; - wire mul_sign = mullhs_sign ^ mulrhs_sign; - wire inf_sign = addend_is_inf ? addend_sign : mul_sign; - - // Main path (including subnormal handling) - wire [9:0] v_mullhs_expo = { 2'b0, mullhs_expo == 8'h00 ? 8'h01 : mullhs_expo }; - wire [9:0] v_mulrhs_expo = { 2'b0, mulrhs_expo == 8'h00 ? 8'h01 : mulrhs_expo }; - wire [9:0] v_addend_expo = { 2'b0, addend_expo == 8'h00 ? 8'h01 : addend_expo }; - wire[23:0] v_mullhs_mant = { mullhs_expo != 8'h00, mullhs_mant }; - wire[23:0] v_mulrhs_mant = { mulrhs_expo != 8'h00, mulrhs_mant }; - wire[23:0] v_addend_mant = { addend_expo != 8'h00, addend_mant }; - wire [9:0] v_fmares_expo = v_mullhs_expo + v_mulrhs_expo - 127 + 26; // See below: There are 26 bits above lhs*rhs<<3, assuming no carryover occurs in lhs*rhs. 
- wire [9:0] addend_shift = v_fmares_expo - v_addend_expo; - wire[74:0] shifted_addend = { v_addend_mant, 2'b00, 49'b0 } >> addend_shift; // The 2'b00 are the guard bit and the round bit. - wire addend_sticky = $signed(addend_shift) > 75 ? v_addend_mant != 0 - : v_addend_mant << (10'd75 - addend_shift) != 24'h000000; // the part shifted out above - // Special cases - wire mulres_is_tiny = $signed(addend_shift) < 0 & !mulres_is_zero & !addend_is_zero; // |mullhs*mulrhs| < 0.5ULP(|addend|-eps) - wire res_is_tiny = $signed(addend_shift) < 0 & !mulres_is_zero & addend_is_zero; // |mullhs*mulrhs+addend| < 0.5FLT_TRUE_MIN - - // Fused-multiply-adder (24bit*24bit<<3+76bit+sign) - // The multiplication result is shifted by 3 bits for the guard bit, the round bit, and the sticky bit. - // The adder is sufficient for 76 bits + 1 sign bit because |lhs*rhs<<3| < 2^51 is <0.5 ULP when subtracted from 2^76. Note: ULP(1-eps) = 2^-24 while ULP(1+eps) = 2^-23. - assign mlhs = { 51'b0, v_mullhs_mant, 2'b0 }; - assign mrhs = { 52'b0, v_mulrhs_mant, 1'b0 }; - assign maddend = { 1'b0, shifted_addend, addend_sticky }; - assign stg0Out = {v_fmares_expo, res_is_inf, result_is_nan, - res_is_addend, mul_sign, inf_sign, addend_sign, is_subtract, nan, addend, - mulres_is_tiny, res_is_tiny, invalid_operation, round_mode}; -endmodule - -module FMA_WithFFlagsStage1( - input logic clk, - input FMA_WithFFlagsStage1RegPath stg1In, - output FMA_WithFFlagsStage2RegPath stg1Out -); - FMA_WithFFlagsStage1RegPath pipeReg; - always_ff @(posedge clk) begin - pipeReg <= stg1In; - end - assign stg1Out = pipeReg; -endmodule - -module FMA_WithFFlagsStage2( - input logic clk, - input FMA_WithFFlagsStage2RegPath stg2In, - output FMA_WithFFlagsStage3RegPath stg2Out, - input logic [76:0] fma_result -); - FMA_WithFFlagsStage2RegPath pipeReg; - always_ff @(posedge clk) begin - pipeReg <= stg2In; - end - - wire mul_sign = pipeReg.mul_sign; - wire res_is_zero = fma_result == 77'h0; - wire res_is_negative = 
fma_result[76]; - wire[75:0] abs_fma_result = res_is_negative ? -fma_result[75:0] : fma_result[75:0]; - wire result_sign = mul_sign ^ res_is_negative; - - assign stg2Out = {abs_fma_result, pipeReg.mulres_expo, pipeReg.result_is_inf, - pipeReg.result_is_nan, res_is_zero, pipeReg.res_is_addend, result_sign, - pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.nan, pipeReg.addend, - pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode}; -endmodule - -module FMA_WithFFlagsStage3( - input logic clk, - input FMA_WithFFlagsStage3RegPath stg3In, - output FMA_WithFFlagsStage4RegPath stg3Out -); - function automatic [6:0] leading_zeros_count; - input[75:0] x; - for(leading_zeros_count = 0; leading_zeros_count <= 75; leading_zeros_count = leading_zeros_count + 1) - if(x[75-leading_zeros_count]) break; - endfunction - - FMA_WithFFlagsStage3RegPath pipeReg; - always_ff @(posedge clk) begin - pipeReg <= stg3In; - end - wire[75:0] abs_fma_result = pipeReg.abs_fma_result; - wire [9:0] mulres_expo = pipeReg.mulres_expo; - - wire [6:0] leading_zeros = leading_zeros_count(abs_fma_result); // 0 <= leading_sign_bits <= 74 if !res_is_zero - wire [9:0] virtual_expo = mulres_expo - { 3'b00, leading_zeros }; // There are 26 bits above lhs*rhs<<3, assuming no carryover occurs in lhs*rhs. - wire subnormal = $signed(virtual_expo) <= 0; - wire [6:0] fmares_shift = subnormal ? mulres_expo[6:0] // There are 3 bits below lhs*rhs<<3, and 23 bits will be lost due to rounding, assuming no carryover occurs in lhs*rhs. 
- : leading_zeros + 1; // (75 - addend_sticky(1bit)) - shifter_result(24bit) - - assign stg3Out = {abs_fma_result, fmares_shift, virtual_expo, subnormal, pipeReg.result_is_inf, - pipeReg.result_is_nan, pipeReg.res_is_zero, pipeReg.res_is_addend, pipeReg.result_sign, - pipeReg.prop_inf_sign, pipeReg.addend_sign, pipeReg.is_subtract, pipeReg.nan, pipeReg.addend, - pipeReg.mulres_is_tiny, pipeReg.res_is_tiny, pipeReg.invalid_operation, pipeReg.round_mode}; -endmodule - -module FMA_WithFFlagsStage4( - input logic clk, - input FMA_WithFFlagsStage4RegPath stg4In, - output logic [31:0] result, - output logic [4:0] fflags -); - function round_to_away; - input sign; - input last_place; - input guard_bit; - input sticky_bit; - input[2:0] round_mode; - - case(round_mode) - 3'b000: round_to_away = guard_bit & (last_place | sticky_bit); // round to nearest, ties to even - 3'b100: round_to_away = guard_bit; // round to nearest, ties to away - 3'b010: round_to_away = sign & (guard_bit | sticky_bit); // round downward - 3'b011: round_to_away = !sign & (guard_bit | sticky_bit); // round upward - default: round_to_away = 0; // round towards zero - endcase - endfunction - - FMA_WithFFlagsStage4RegPath pipeReg; - always_ff @(posedge clk) begin - pipeReg <= stg4In; - end - - wire[75:0] abs_fma_result = pipeReg.abs_fma_result; - wire [7:0] fmares_shift = pipeReg.fmares_shift; - wire [9:0] virtual_expo = pipeReg.virtual_expo; - wire[31:0] nan = pipeReg.nan; - wire[31:0] addend = pipeReg.addend; - wire result_is_inf = pipeReg.result_is_inf; - wire result_is_nan = pipeReg.result_is_nan; - wire res_is_zero = pipeReg.res_is_zero; - wire res_is_addend = pipeReg.res_is_addend; - wire result_sign = pipeReg.result_sign; - wire prop_inf_sign = pipeReg.prop_inf_sign; - wire addend_sign = pipeReg.addend_sign; - wire subnormal = pipeReg.subnormal; - wire is_subtract = pipeReg.is_subtract; - wire mulres_is_tiny = pipeReg.mulres_is_tiny; - wire res_is_tiny = pipeReg.res_is_tiny; - wire 
invalid_operation = pipeReg.invalid_operation; - wire [2:0] round_mode = pipeReg.round_mode; - - // Normalize and rounding decision - wire[24:0] shifter_result = { abs_fma_result, 24'b0 } >> (7'd75 - fmares_shift); // [75:0] -> [24:0] normalizing left shift emulation. The 24'b0 is needed for cases where large cancellations occur. [24:0] = { mantissa(23bit), guard(1bit), extra_guard_for_underflow_detection(1bit) } - wire sticky = abs_fma_result << (7'd25 + fmares_shift) != 0; // the part right-shifted out above - - wire round_away = round_to_away(result_sign, shifter_result[2], shifter_result[1], shifter_result[0] | sticky, round_mode); - wire exp_plus_one = shifter_result >= 25'h1fffffc & round_away; // carry is generated with rounding taken into account - // Treat p-127 as a normal for the underflow flag (rounding with unbounded exponent) - wire u_round_away = round_to_away(result_sign, shifter_result[1], shifter_result[0], sticky, round_mode); - wire u_exp_plus_one = shifter_result >= 25'h1fffffe & u_round_away; // 0x1.fffffep-127 <= |mullhs*mulrhs+addend| < 0x1p-126 and the after rounding result become a normal number, not raising the underflow flag. - - wire[22:0] result_mant = shifter_result[24:2] + { 22'h0, round_away }; // No special treatment is required even if an overflow occurs since the answer will be 0 and it will be correct. - wire [7:0] result_expo = (subnormal ? 8'h00 : virtual_expo[7:0]) + { 7'b0, exp_plus_one }; - - // Special cases - wire res_is_huge = $signed(virtual_expo) >= 255; - wire dir_is_away = (round_mode == 2 & result_sign) | (round_mode == 3 & !result_sign); - wire huge_is_inf = round_mode == 0 | round_mode == 4 | dir_is_away; - - wire[31:0] addend_plus_tiny = round_mode == 1 & is_subtract ? addend - 1 : - round_mode == 2 & !addend_sign & is_subtract ? addend - 1 : - round_mode == 3 & addend_sign & is_subtract ? addend - 1 : - round_mode == 2 & addend_sign & !is_subtract ? addend + 1 : - round_mode == 3 & !addend_sign & !is_subtract ? 
addend + 1 - : addend; - wire[31:0] huge = huge_is_inf ? { result_sign, 8'hff, 23'h0 } : { result_sign, 8'hfe, 23'h7fffff }; - wire[31:0] tiny = dir_is_away ? { result_sign, 8'h00, 23'h1 } : { result_sign, 8'h00, 23'h0 }; - wire[31:0] zero = { is_subtract ? round_mode == 2 : addend_sign, 8'h00, 23'h0 }; - wire[31:0] inf = { prop_inf_sign, 8'hff, 23'h0 }; - - // Final result - assign result = result_is_nan ? nan : - result_is_inf ? inf : - res_is_huge ? huge : - res_is_tiny ? tiny : - mulres_is_tiny ? addend_plus_tiny : - res_is_addend ? addend : - res_is_zero ? zero : { result_sign, result_expo, result_mant }; - - // Exception flags - wire divide_by_zero = 1'b0; - wire overflow = !result_is_nan & !result_is_inf & (mulres_is_tiny ? addend_plus_tiny[30:23] == 8'hff : res_is_huge | (virtual_expo == 254 & exp_plus_one)); - wire inexact = !result_is_nan & !result_is_inf & (overflow | res_is_tiny | mulres_is_tiny | shifter_result[1] | shifter_result[0] | sticky); - // === About underflow (UF) flag - // RISC-V sets the UF flag when the absolute value of the result after rounding is less than FLT_MIN and the result is inexact. (same as x86) - // --- The RISC-V Instruction Set Manual 20240411 Volume I p.114 - wire underflow = inexact & (mulres_is_tiny ? 
addend[30:23] == 8'h00 | addend_plus_tiny[30:23] == 8'h00 : res_is_tiny | (subnormal & !u_exp_plus_one)); - // NV DZ OF UF NX - assign fflags = { invalid_operation, divide_by_zero, overflow, underflow, inexact }; -endmodule diff --git a/Processor/Src/FloatingPointUnit/FPDivSqrtUnit.sv b/Processor/Src/FloatingPointUnit/FPDivSqrtUnit.sv index 18f73d5f..b8e78866 100644 --- a/Processor/Src/FloatingPointUnit/FPDivSqrtUnit.sv +++ b/Processor/Src/FloatingPointUnit/FPDivSqrtUnit.sv @@ -25,7 +25,7 @@ module FPDivSqrtUnit(FPDivSqrtUnitIF.FPDivSqrtUnit port, RecoveryManagerIF.FPDiv ActiveListIndexPath nextActiveListPtr[FP_DIVSQRT_ISSUE_WIDTH]; for (genvar i = 0; i < FP_DIVSQRT_ISSUE_WIDTH; i++) begin : BlockDivUnit - FP32DivSqrterWithFFlags fpDivSqrter( + FP32DivSqrter fpDivSqrter( .clk(port.clk), .rst(rst_divider[i]), .input_lhs(port.dataInA[i]), diff --git a/Processor/Src/FloatingPointUnit/FPUTypes.sv b/Processor/Src/FloatingPointUnit/FPUTypes.sv index 2f313d85..d4c989f7 100644 --- a/Processor/Src/FloatingPointUnit/FPUTypes.sv +++ b/Processor/Src/FloatingPointUnit/FPUTypes.sv @@ -111,68 +111,6 @@ typedef struct packed { logic [25:0] quo; } FDivSqrtRegPath; -// Pipeline registers for old FMA -typedef struct packed { - logic [9:0] mulres_expo; - logic result_is_inf; - logic result_is_nan; - logic res_is_addend; - logic mul_sign; - logic prop_inf_sign; - logic addend_sign; - logic is_subtract; - logic [31:0] nan; - logic [31:0] addend; -} FMAStage1RegPath; - -typedef struct packed { - logic [9:0] mulres_expo; - logic result_is_inf; - logic result_is_nan; - logic res_is_addend; - logic mul_sign; - logic prop_inf_sign; - logic addend_sign; - logic is_subtract; - logic [31:0] nan; - logic [31:0] addend; -} FMAStage2RegPath; - -typedef struct packed { - logic [75:0] abs_fma_result; - logic [9:0] mulres_expo; - logic res_is_negative; - logic result_is_inf; - logic result_is_nan; - logic res_is_zero; - logic res_is_addend; - logic result_sign; - logic prop_inf_sign; - logic 
addend_sign; - logic is_subtract; - logic [31:0] nan; - logic [31:0] addend; -} FMAStage3RegPath; - -typedef struct packed { - logic [75:0] abs_fma_result; - logic [7:0] fmares_shift; - logic [9:0] virtual_expo; - logic subnormal; - logic res_is_negative; - logic result_is_inf; - logic result_is_nan; - logic res_is_zero; - logic res_is_addend; - logic result_sign; - logic prop_inf_sign; - logic addend_sign; - logic is_subtract; - logic [31:0] nan; - logic [31:0] addend; -} FMAStage4RegPath; - -// Pipeline registers for FMA with fflags typedef struct packed { logic [9:0] mulres_expo; logic result_is_inf; @@ -188,7 +126,7 @@ typedef struct packed { logic res_is_tiny; logic invalid_operation; logic [2:0] round_mode; -} FMA_WithFFlagsStage1RegPath; +} FMAStage1RegPath; typedef struct packed { logic [9:0] mulres_expo; @@ -205,7 +143,7 @@ typedef struct packed { logic res_is_tiny; logic invalid_operation; logic [2:0] round_mode; -} FMA_WithFFlagsStage2RegPath; +} FMAStage2RegPath; typedef struct packed { logic [75:0] abs_fma_result; @@ -224,7 +162,7 @@ typedef struct packed { logic res_is_tiny; logic invalid_operation; logic [2:0] round_mode; -} FMA_WithFFlagsStage3RegPath; +} FMAStage3RegPath; typedef struct packed { logic [75:0] abs_fma_result; @@ -245,6 +183,6 @@ typedef struct packed { logic res_is_tiny; logic invalid_operation; logic [2:0] round_mode; -} FMA_WithFFlagsStage4RegPath; +} FMAStage4RegPath; endpackage \ No newline at end of file diff --git a/Processor/Src/Makefiles/CoreSources.inc.mk b/Processor/Src/Makefiles/CoreSources.inc.mk index 6abd1698..d33567dc 100644 --- a/Processor/Src/Makefiles/CoreSources.inc.mk +++ b/Processor/Src/Makefiles/CoreSources.inc.mk @@ -114,10 +114,8 @@ CORE_MODULES = \ FloatingPointUnit/FP32PipelinedAdder.sv \ FloatingPointUnit/FP32PipelinedMultiplier.sv \ FloatingPointUnit/FP32PipelinedFMA.sv \ - FloatingPointUnit/FP32PipelinedFMA_WithFFlags.sv \ FloatingPointUnit/FP32PipelinedOther.sv \ FloatingPointUnit/FP32DivSqrter.sv \ - 
FloatingPointUnit/FP32DivSqrterWithFFlags.sv \ FloatingPointUnit/FPDivSqrtUnit.sv \ FloatingPointUnit/FPDivSqrtUnitIF.sv \ RenameLogic/RenameLogic.sv \ diff --git a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv index 72703102..cd950f53 100644 --- a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv +++ b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv @@ -148,7 +148,7 @@ module FPExecutionStage( logic isDivSqrt [ FP_ISSUE_WIDTH ]; for ( genvar i = 0; i < FP_ISSUE_WIDTH; i++ ) begin - FP32PipelinedFMA_WithFFlags fpFMA ( + FP32PipelinedFMA fpFMA ( .clk (port.clk), .mullhs (fmaMulLHS[i]), .mulrhs (fmaMulRHS[i]), From 36f86021decf763d2daef7c8b9fa900268ca2eac Mon Sep 17 00:00:00 2001 From: Reoma Matsuo Date: Sun, 22 Dec 2024 00:47:58 +0900 Subject: [PATCH 11/11] refactor: modify the comment related to the addend of FMA --- .../Src/Pipeline/FPBackEnd/FPExecutionStage.sv | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv index cd950f53..b23bff92 100644 --- a/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv +++ b/Processor/Src/Pipeline/FPBackEnd/FPExecutionStage.sv @@ -177,10 +177,18 @@ module FPExecutionStage( fmaMulLHS[i] = fpuCode[i] inside {FC_FNMSUB, FC_FNMADD} ? {~fuOpA[i].data[31], fuOpA[i].data[30:0]} : fuOpA[i].data; fmaMulRHS[i] = fpuCode[i] inside {FC_ADD, FC_SUB} ? 32'h3f800000 : fuOpB[i].data; if(fpuCode[i] == FC_MUL) begin - // Hack: set sign bit considering rounding mode - // +a * +0.0 should return +0.0 regardless of rounding mode, - // However, when implemented with fma(+a, +0.0, -0.0), - // it returns -0.0 when round_mode = 2 + // If the arithmetical result is not zero, + // adding either -0.0 or +0.0 will produce the same result as the multiplication result. 
+ // If the arithmetical result is zero, + // adding a zero with the same sign ensures that the result matches the multiplication result. + // Therefore, this approach is valid. + // + // Always adding +0.0 is incorrect: + // when the round_mode != 2 (downward) and the multiplication result is -0.0, + // the output will incorrectly become +0.0. + // Similarly, always adding -0.0 is also incorrect: + // when the round_mode == 2 (downward) and the multiplication result is +0.0, + // the output will incorrectly become -0.0. fmaAddend[i] = { fmaMulLHS[i][31] ^ fmaMulRHS[i][31] , 31'h0 }; end else if (fpuCode[i] == FC_ADD) begin