UberDDR3/delete_later/rtl/cpu/mpyop.v

429 lines
11 KiB
Verilog

////////////////////////////////////////////////////////////////////////////////
//
// Filename: mpyop.v
// {{{
// Project: 10Gb Ethernet switch
//
// Purpose: This code has been pulled from the cpuops.v file so as to
// encapsulate the multiply component--the one component that
// (can't be) formally verified well, and so must be abstracted away.
// This separation was done to support potential future abstraction.
//
//
// Creator: Dan Gisselquist, Ph.D.
// Gisselquist Technology, LLC
//
////////////////////////////////////////////////////////////////////////////////
// }}}
// Copyright (C) 2023, Gisselquist Technology, LLC
// {{{
// This file is part of the ETH10G project.
//
// The ETH10G project contains free software and gateware, licensed under the
// Apache License, Version 2.0 (the "License"). You may not use this project,
// or this file, except in compliance with the License. You may obtain a copy
// of the License at
// }}}
// http://www.apache.org/licenses/LICENSE-2.0
// {{{
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
//
////////////////////////////////////////////////////////////////////////////////
//
`default_nettype none
// }}}
// mpyop: 32x32 multiply wrapper. OPT_MPY picks one of several
// implementations at elaboration time; all of them present the full 64-bit
// product on o_result together with o_hi, which tells the consumer whether
// the upper half of the product was requested.
module mpyop #(
// {{{
// The following parameter selects which multiply algorithm we
// use. Timing performance is strictly dependent upon it.
// OPT_MPY
// -------
// 0 No multiply (o_result is tied to zero)
// 1 Single op multiply, same timing as an ADD
// (combinational: o_valid == i_stb)
// 2 Two clock multiply
// (o_valid one clock after i_stb)
// 3 Three clock multiply, standard Xlnx DSP timing
// (o_valid two clocks after i_stb)
// 4 Four clock multiply, Xilinx Spartan DSP timing
// (16x16 partial products, o_valid three clocks
// after i_stb)
// (Anything else) -- low logic slow multiply
// 36 Required setting for the TB to work on the low
// logic slow multiply
parameter OPT_MPY = 1,
// OPT_LOWPOWER: when set, operand registers (or, for OPT_MPY == 1,
// the combinational result) are forced to zero whenever i_stb is low,
// and the intermediate pipeline registers of options 3 and 4 only
// update while a multiply is in flight. It is not forwarded to the
// slowmpy instance.
parameter [0:0] OPT_LOWPOWER = 1'b0
// }}}
) (
// {{{
// i_stb: request strobe, i_op/i_a/i_b are sampled when it is high
input wire i_clk, i_reset, i_stb,
//
// Three types of multiply operations.
// 2'b00: 32x32 multiply, returning the low order 32 bits
// 2'b10: 32x32 unsigned multiply, returning upper 32 bits
// 2'b11: 32x32 signed multiply, returning upper 32 bits
// Within this module, i_op[0] selects signed operands and i_op[1]
// is passed through to o_hi; the choice of which half of o_result
// to use is left to whoever reads o_hi.
input wire [1:0] i_op,
input wire [31:0] i_a, i_b,
output wire o_valid, // True if the result is valid
output wire o_busy, // True while a multi-clock multiply is still in progress
output wire [63:0] o_result, // multiply result
output wire o_hi // Return the high half of mpy
// }}}
);
// NOTE(review): the multiplexer sizing notes below describe no logic found
// in this module (there is no 16-way multiplexer here); they look like
// they were carried over from cpuops.v -- confirm before relying on them.
// A 4-way multiplexer can be done in one 6-LUT.
// A 16-way multiplexer can therefore be done in 4x 6-LUT's with
// the Xilinx multiplexer fabric that follows.
// Given that we wish to apply this multiplexer approach to 33-bits,
// this will cost a minimum of 132 6-LUTs.
// Signal naming notes: this module uses
// i_stb instead of this_is_a_multiply_op
// o_result
// o_busy
// o_done (NOTE(review): the port in this module is named o_valid)
generate
if (OPT_MPY == 0)
begin : MPYNONE // No multiply support.
// {{{
// The result is a constant zero, and the request is acknowledged
// in the same cycle it is made.
assign o_result = 64'h00;
assign o_busy = 1'b0;
assign o_valid = i_stb;
assign o_hi = 1'b0; // Not needed
`ifdef VERILATOR
// NOTE(review): i_stb is listed as unused below even though it
// drives o_valid above.
// verilator coverage_off
// verilator lint_off UNUSED
wire mpy_unused;
assign mpy_unused = &{ 1'b0, i_clk, i_reset, i_stb, i_op, i_a, i_b };
// verilator lint_on UNUSED
// verilator coverage_on
`endif
// }}}
end else begin : IMPY
if (OPT_MPY == 1)
begin : MPY1CK // Our single clock option (no extra clocks)
// {{{
// Both operands are widened to 64 bits, sign extended for a signed
// multiply (i_op[0] set) and zero extended otherwise, so that one
// signed 64-bit multiply covers both cases.
wire signed [63:0] w_mpy_a_input, w_mpy_b_input;
assign w_mpy_a_input = {{(32){(i_a[31])&(i_op[0])}},i_a[31:0]};
assign w_mpy_b_input = {{(32){(i_b[31])&(i_op[0])}},i_b[31:0]};
// Purely combinational. With OPT_LOWPOWER the result is held at zero
// unless a multiply is being requested.
assign o_result = (OPT_LOWPOWER && !i_stb) ? 0 : (w_mpy_a_input * w_mpy_b_input);
assign o_busy = 1'b0;
assign o_valid = i_stb;
assign o_hi = i_op[1];
`ifdef VERILATOR
// NOTE(review): i_stb and i_op[1] are listed as unused below even
// though they drive o_valid and o_hi above; only i_clk and i_reset
// are genuinely unused in this branch.
// verilator coverage_off
// verilator lint_off UNUSED
wire mpy_unused;
assign mpy_unused = &{ 1'b0, i_clk, i_reset, i_stb, i_op[1] };
// verilator lint_on UNUSED
// verilator coverage_on
`endif
// }}}
end else begin: MPN1
if (OPT_MPY == 2)
begin : MPY2CK // Our two clock option (ALU must pause for 1 clock)
// {{{
// Declarations
// {{{
reg signed [63:0] r_mpy_a_input, r_mpy_b_input;
reg mpypipe, r_hi;
// }}}
// r_mpy_?_input: Register the inputs
// {{{
// Operands are extended to 64 bits exactly as in the single clock
// option. Without OPT_LOWPOWER they are captured on every clock,
// whether or not i_stb is set; with OPT_LOWPOWER they are zeroed
// whenever i_stb is low.
always @(posedge i_clk)
if (!OPT_LOWPOWER || i_stb)
begin
r_mpy_a_input <={{(32){(i_a[31])&(i_op[0])}},i_a[31:0]};
r_mpy_b_input <={{(32){(i_b[31])&(i_op[0])}},i_b[31:0]};
end else begin
r_mpy_a_input <= 0;
r_mpy_b_input <= 0;
end
// }}}
// The multiply itself is combinational, from the registered operands
assign o_result = r_mpy_a_input * r_mpy_b_input;
assign o_busy = 1'b0;
// mpypipe
// {{{
// A one clock delayed copy of i_stb, marking the cycle in which
// o_result reflects the registered operands.
initial mpypipe = 1'b0;
always @(posedge i_clk)
if (i_reset)
mpypipe <= 1'b0;
else
mpypipe <= (i_stb);
// }}}
assign o_valid = mpypipe; // this_is_a_multiply_op;
// o_hi
// {{{
// Remember which half was requested; holds its value between requests
always @(posedge i_clk)
if (i_stb)
r_hi <= i_op[1];
assign o_hi = r_hi;
// }}}
// }}}
end else begin : MPN2
if (OPT_MPY == 3)
begin : MPY3CK // Our three clock option (ALU pauses for 2 clocks)
// {{{
// Declarations
// {{{
reg signed [63:0] r_smpy_result;
reg [63:0] r_umpy_result;
reg signed [31:0] r_mpy_a_input, r_mpy_b_input;
reg [1:0] mpypipe;
reg [1:0] r_sgn;
reg r_hi;
// }}}
// mpypipe (FSM state)
// {{{
// Two bit shift register of i_stb: bit 0 is set the clock after the
// request, bit 1 the clock after that.
initial mpypipe = 2'b0;
always @(posedge i_clk)
if (i_reset)
mpypipe <= 2'b0;
else
mpypipe <= { mpypipe[0], i_stb };
// }}}
// First clock : register r_mpy_?_input, r_sgn
// {{{
// r_sgn shifts the signed flag (i_op[0]) along with the data:
// r_sgn[0] is loaded together with the operand registers and
// r_sgn[1] together with the product registers, so that r_sgn[1]
// can select between the signed and unsigned product below.
always @(posedge i_clk)
r_sgn <= { r_sgn[0],
(i_op[0] && (!OPT_LOWPOWER || i_stb)) };
always @(posedge i_clk)
if (!OPT_LOWPOWER || i_stb)
begin
r_mpy_a_input <= i_a[31:0];
r_mpy_b_input <= i_b[31:0];
end else begin
r_mpy_a_input <= 0;
r_mpy_b_input <= 0;
end
// }}}
// Second clock : perform the multiply
// {{{
// Both a signed and an unsigned product are computed and registered
// on every multiply; the choice between them is made at the output.
`ifdef VERILATOR
// Veri1ator only implementation
// {{{
// The operands are explicitly sign (s_*) or zero (u_*) extended to
// 64 bits before being multiplied.
wire signed [63:0] s_mpy_a_input, s_mpy_b_input;
wire [63:0] u_mpy_a_input, u_mpy_b_input;
assign s_mpy_a_input = {{(32){r_mpy_a_input[31]}},r_mpy_a_input};
assign s_mpy_b_input = {{(32){r_mpy_b_input[31]}},r_mpy_b_input};
assign u_mpy_a_input = {32'h00,r_mpy_a_input};
assign u_mpy_b_input = {32'h00,r_mpy_b_input};
always @(posedge i_clk)
if (!OPT_LOWPOWER || mpypipe[0])
r_smpy_result <= s_mpy_a_input * s_mpy_b_input;
always @(posedge i_clk)
if (!OPT_LOWPOWER || mpypipe[0])
r_umpy_result <= u_mpy_a_input * u_mpy_b_input;
// }}}
`else
// Synthesis implementation
// {{{
// Here the operands stay 32 bits wide. The signed product uses the
// signed operand registers directly, while the unsigned product
// goes through unsigned wire copies of the same registers. The
// 64-bit assignment targets set the width of each product.
wire [31:0] u_mpy_a_input, u_mpy_b_input;
assign u_mpy_a_input = r_mpy_a_input;
assign u_mpy_b_input = r_mpy_b_input;
always @(posedge i_clk)
if (!OPT_LOWPOWER || mpypipe[0])
r_smpy_result <= r_mpy_a_input * r_mpy_b_input;
always @(posedge i_clk)
if (!OPT_LOWPOWER || mpypipe[0])
r_umpy_result <= u_mpy_a_input * u_mpy_b_input;
// }}}
`endif
// Remember which half was requested; holds its value between requests
always @(posedge i_clk)
if (i_stb)
r_hi <= i_op[1];
assign o_hi = r_hi;
// Busy for the one clock between the request and the valid result
assign o_busy = mpypipe[0];
assign o_result = (r_sgn[1])?r_smpy_result:r_umpy_result;
assign o_valid = mpypipe[1];
// }}}
// Results are then available and registered on the third clock
// }}}
end else begin : MPN3
if (OPT_MPY == 4)
begin : MPY4CK // The four clock option, polynomial multiplication
// {{{
// Declarations
// {{{
reg [63:0] r_mpy_result;
reg [31:0] r_mpy_a_input, r_mpy_b_input;
reg r_mpy_signed, r_hi;
reg [2:0] mpypipe;
reg [31:0] pp_f, pp_l; // F and L from FOIL
reg [32:0] pp_oi; // The O and I from FOIL
reg [32:0] pp_s; // Sign correction term, see below
// }}}
// First clock, latch in the inputs : mpypipe, r_mpy_?_input
// {{{
initial mpypipe = 3'b0;
always @(posedge i_clk)
begin
// mpypipe indicates we have a multiply in the
// pipeline. In this case, the multiply
// pipeline is a three stage pipeline (inputs,
// partial products, sum), so we need three
// bits in the pipe.
if (i_reset)
mpypipe <= 3'h0;
else begin
mpypipe[0] <= i_stb;
mpypipe[1] <= mpypipe[0];
mpypipe[2] <= mpypipe[1];
end
// For a signed multiply the sign bit of each operand
// is flipped, turning it into an unsigned value that
// is offset by 2^31. The offset is removed again by
// pp_s in the next stage.
if (i_op[0]) // i.e. if signed multiply
begin
r_mpy_a_input <= {(~i_a[31]),i_a[30:0]};
r_mpy_b_input <= {(~i_b[31]),i_b[30:0]};
end else begin
r_mpy_a_input <= i_a[31:0];
r_mpy_b_input <= i_b[31:0];
end
// The signed bit really only matters in the
// case of 64 bit multiply. We'll keep track
// of it, though, and pretend in all other
// cases.
r_mpy_signed <= i_op[0];
// Statement order matters here: when OPT_LOWPOWER is
// set and i_stb is low, the assignments below come
// last in the block and so override the operand and
// sign assignments made above.
if (i_stb)
r_hi <= i_op[1];
else if (OPT_LOWPOWER)
begin
r_mpy_a_input <= 0;
r_mpy_b_input <= 0;
r_mpy_signed <= 0;
end
end
// }}}
assign o_hi = r_hi;
// Busy while the request is in either of the first two stages
assign o_busy = |mpypipe[1:0];
assign o_valid = mpypipe[2];
// Second clock, do the multiplies, get the "partial products".
// {{{
// Here, we break our input up into two halves,
//
// A = (2^16 ah + al)
// B = (2^16 bh + bl)
//
// and use these to compute partial products.
//
// AB = (2^32 ah*bh + 2^16 (ah*bl + al*bh) + (al*bl)
//
// Since we're following the FOIL algorithm to get here,
// we'll name these partial products according to FOIL.
//
// The trick is what happens if A or B is signed. In
// those cases, the real value of A will not be given by
// A = (2^16 ah + al)
// but rather
// A = (2^16 ah[31^] + al) - 2^31
// (where we have flipped the sign bit of A)
// and so ...
//
// AB= (2^16 ah + al - 2^31) * (2^16 bh + bl - 2^31)
// = 2^32(ah*bh)
// +2^16 (ah*bl+al*bh)
// +(al*bl)
// - 2^31 (2^16 bh+bl + 2^16 ah+al)
// + 2^62
// = 2^32(ah*bh)
// +2^16 (ah*bl+al*bh)
// +(al*bl)
// + 2^31 (2^31 - (2^16 bh+bl + 2^16 ah+al))
//
// The last parenthesized term is what pp_s holds below: 2^31 minus
// the sum of the two (sign flipped) operands, kept in 33 bits.
//
always @(posedge i_clk)
if (!OPT_LOWPOWER || mpypipe[0])
begin
pp_f<=r_mpy_a_input[31:16]*r_mpy_b_input[31:16];
pp_oi<=r_mpy_a_input[31:16]*r_mpy_b_input[15: 0]
+ r_mpy_a_input[15: 0]*r_mpy_b_input[31:16];
pp_l<=r_mpy_a_input[15: 0]*r_mpy_b_input[15: 0];
// And a special one for the sign
if (r_mpy_signed)
pp_s <= 32'h8000_0000-(
r_mpy_a_input[31:0]
+ r_mpy_b_input[31:0]);
else
pp_s <= 33'h0;
end
// }}}
// Third clock, add the results and get a product: r_mpy_result
// {{{
// The low 16 bits of the product come straight from pp_l. The upper
// 48 bits are a sum taken relative to bit 16, so within the full
// product pp_l[31:16] and pp_oi carry a weight of 2^16, pp_s a
// weight of 2^31 (15 zero bits appended), and pp_f a weight of 2^32
// (16 zero bits appended).
always @(posedge i_clk)
if (!OPT_LOWPOWER || mpypipe[1])
begin
r_mpy_result[15:0] <= pp_l[15:0];
r_mpy_result[63:16] <=
{ 32'h00, pp_l[31:16] }
+ { 15'h00, pp_oi }
+ { pp_s, 15'h00 }
+ { pp_f, 16'h00 };
end
// }}}
assign o_result = r_mpy_result;
// Fourth clock -- results are clocked into writeback
// }}}
end else begin : MPYSLOW
// {{{
// Use an external multiply implementation, for when DSPs aren't
// available.
//
// Declarations
// {{{
reg r_hi;
// verilator coverage_off
// verilator lint_off UNUSED
wire unused_aux;
wire [65:0] full_result;
// verilator lint_on UNUSED
// verilator coverage_on
// }}}
// Each operand is widened to 33 bits: the extra top bit is a copy
// of the sign bit for a signed multiply (i_op[0] set) and zero
// otherwise. o_busy and o_valid are driven directly by the
// instance, and only the low 64 bits of its 66-bit product are used.
// NOTE(review): the ports are connected by position, so this
// depends on the port order declared in slowmpy (not visible
// here) -- the 1'b0 and unused_aux connections are presumably
// an unused auxiliary input/output pair; confirm against slowmpy.
slowmpy #(.LGNA(6), .NA(33)
) slowmpyi(
i_clk, i_reset, i_stb,
{ (i_op[0])&(i_a[31]), i_a },
{ (i_op[0])&(i_b[31]), i_b }, 1'b0, o_busy,
o_valid, full_result, unused_aux
);
assign o_result = full_result[63:0];
// Remember which half was requested; holds its value between requests
always @(posedge i_clk)
if (i_stb)
r_hi <= i_op[1];
assign o_hi = r_hi;
// }}}
end end end end end
endgenerate // All possible multiply results have been determined
endmodule