UberDDR3/delete_later/rtl/cpu/mpyop.v

////////////////////////////////////////////////////////////////////////////////
//
// Filename:	mpyop.v
// {{{
// Project:	10Gb Ethernet switch
//
// Purpose:	This code has been pulled from the cpuops.v file so as to
//		encapsulate the multiply component--the one component that
//	(can't be) formally verified well, and so must be abstracted away.
//	This separation was done to support potential future abstraction.
//
//
// Creator:	Dan Gisselquist, Ph.D.
//		Gisselquist Technology, LLC
//
////////////////////////////////////////////////////////////////////////////////
// }}}
// Copyright (C) 2023, Gisselquist Technology, LLC
// {{{
// This file is part of the ETH10G project.
//
// The ETH10G project contains free software and gateware, licensed under the
// Apache License, Version 2.0 (the "License").  You may not use this project,
// or this file, except in compliance with the License.  You may obtain a copy
// of the License at
// }}}
//	http://www.apache.org/licenses/LICENSE-2.0
// {{{
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
// License for the specific language governing permissions and limitations
// under the License.
//
////////////////////////////////////////////////////////////////////////////////
//
`default_nettype	none
// }}}
// mpyop
//
// Wraps a 32x32 multiply behind a common strobe/valid/busy interface.  One of
// six implementations is chosen at elaboration time by OPT_MPY.  In every
// implementation the operands are sign extended when i_op[0] is set and zero
// extended otherwise, the full 64-bit product is presented on o_result, and
// o_hi reports i_op[1] (captured when i_stb is high in the clocked variants).
//
// Latency from i_stb to o_valid, as implemented below:
//	OPT_MPY == 0:	o_valid = i_stb, o_result is a constant zero
//	OPT_MPY == 1:	o_valid = i_stb, product is combinatorial
//	OPT_MPY == 2:	o_valid one clock after i_stb
//	OPT_MPY == 3:	o_valid two clocks after i_stb
//	OPT_MPY == 4:	o_valid three clocks after i_stb
//	otherwise:	o_valid and o_busy are driven by the slowmpy instance
module	mpyop #(
		// {{{
		// The following parameter selects which multiply algorithm we
		// use.  Timing performance is strictly dependent upon it.
		//	OPT_MPY
		//	------
		//	   0	No multiply
		//	   1	Single op multiply, same timing as an ADD
		//	   2	Two clock multiply
		//	   3	Three clock multiply, standard Xlnx DSP timing
		//	   4	Four clock multiply, Xilinx Spartan DSP timing
		//	(Anything else)	-- low logic slow multiply
		//	  36	Required setting for the TB to work on the low
		//		logic slow multiply
		parameter	OPT_MPY = 1,
		// OPT_LOWPOWER: when set, the operand registers (or, for
		// OPT_MPY == 1, the combinatorial result) are forced to zero
		// whenever i_stb is low, and the downstream pipeline registers
		// of options 3 and 4 only update while a multiply is in
		// flight.  Presumably this is meant to limit needless
		// toggling.  It is not referenced by the OPT_MPY == 0 branch
		// nor by the slow multiply branch.
		parameter [0:0]	OPT_LOWPOWER  = 1'b0
		// }}}
	) (
		// {{{
		// i_reset is sampled on the rising edge of i_clk and clears
		// the mpypipe valid tracking.  i_stb requests a multiply of
		// i_a by i_b using the operation given in i_op.
		input	wire		i_clk, i_reset, i_stb,
		//
		// Three types of multiply operations.
		// 2'b00: 32x32 multiply, returning the low order 32 bits
		// 2'b10: 32x32 unsigned multiply, returning upper 32 bits
		// 2'b11: 32x32   signed multiply, returning upper 32 bits
		//
		// i_op[0] selects signed operand handling, and i_op[1] is
		// passed through (possibly delayed) to o_hi.
		input	wire	[1:0]	i_op,
		input	wire	[31:0]	i_a, i_b,
		output	wire		o_valid, // True if the result is valid
		output	wire		o_busy,	// A multiply is still in progress
		output	wire	[63:0]	o_result, // multiply result
		output	wire		o_hi	// Return the high half of mpy
		// }}}
	);


	// NOTE(review): the multiplexer sizing remarks below do not describe
	// any logic in this module.  This code was pulled from cpuops.v, so
	// they presumably refer to logic that stayed behind there -- confirm
	// before relying on them.
	//
	// A 4-way multiplexer can be done in one 6-LUT.
	// A 16-way multiplexer can therefore be done in 4x 6-LUT's with
	//	the Xilinx multiplexer fabric that follows.
	// Given that we wish to apply this multiplexer approach to 33-bits,
	// this will cost a minimum of 132 6-LUTs.

// Porting notes on the interface of this module:
// i_stb instead of this_is_a_multiply_op
// o_result
// o_busy
// o_done	(NOTE(review): there is no o_done port; presumably now o_valid)
	generate
	if (OPT_MPY == 0)
	begin : MPYNONE // No multiply support.
		// {{{
		// Every request is acknowledged immediately with a zero result
		assign	o_result   = 64'h00;
		assign	o_busy     = 1'b0;
		assign	o_valid    = i_stb;
		assign	o_hi = 1'b0; // Not needed

`ifdef	VERILATOR
		// verilator coverage_off
		// verilator lint_off UNUSED
		wire	mpy_unused;
		assign	mpy_unused = &{ 1'b0, i_clk, i_reset, i_stb, i_op, i_a, i_b };
		// verilator lint_on  UNUSED
		// verilator coverage_on
`endif
		// }}}
	end else begin : IMPY
	if (OPT_MPY == 1)
	begin : MPY1CK // Our single clock option (no extra clocks)
		// {{{
		// Purely combinatorial: there are no registers in this branch
		wire	signed	[63:0]	w_mpy_a_input, w_mpy_b_input;

		// Extend each operand to 64 bits.  The upper half copies the
		// operand's sign bit for a signed multiply (i_op[0] set), and
		// is zero otherwise.
		assign	w_mpy_a_input = {{(32){(i_a[31])&(i_op[0])}},i_a[31:0]};
		assign	w_mpy_b_input = {{(32){(i_b[31])&(i_op[0])}},i_b[31:0]};

		// With OPT_LOWPOWER, the result is held at zero unless a
		// multiply has been requested
		assign	o_result = (OPT_LOWPOWER && !i_stb) ? 0 : (w_mpy_a_input * w_mpy_b_input);

		assign	o_busy  = 1'b0;
		assign	o_valid = i_stb;
		assign	o_hi = i_op[1];

`ifdef	VERILATOR
		// i_clk and i_reset are unused in this branch.  i_stb and
		// i_op[1] are referenced above; listing them here is harmless.
		// verilator coverage_off
		// verilator lint_off UNUSED
		wire	mpy_unused;
		assign	mpy_unused = &{ 1'b0, i_clk, i_reset, i_stb, i_op[1] };
		// verilator lint_on  UNUSED
		// verilator coverage_on
`endif
		// }}}
	end else begin: MPN1
	if (OPT_MPY == 2)
	begin : MPY2CK // Our two clock option (ALU must pause for 1 clock)
		// {{{
		// The operands are registered on the first clock.  The product
		// is then formed combinatorially from those registers, and is
		// flagged valid by mpypipe one clock after i_stb.  This branch
		// never asserts o_busy.

		// Declarations
		// {{{
		reg	signed	[63:0]	r_mpy_a_input, r_mpy_b_input;
		reg			mpypipe, r_hi;
		// }}}

		// r_mpy_?_input: Register the inputs
		// {{{
		// The operands are sign or zero extended to 64 bits, according
		// to i_op[0], as they are registered.  With OPT_LOWPOWER they
		// are zeroed on any clock without a request.
		always @(posedge i_clk)
		if (!OPT_LOWPOWER || i_stb)
		begin
			r_mpy_a_input <={{(32){(i_a[31])&(i_op[0])}},i_a[31:0]};
			r_mpy_b_input <={{(32){(i_b[31])&(i_op[0])}},i_b[31:0]};
		end else begin
			r_mpy_a_input <= 0;
			r_mpy_b_input <= 0;
		end
		// }}}

		assign	o_result = r_mpy_a_input * r_mpy_b_input;
		assign	o_busy  = 1'b0;

		// mpypipe
		// {{{
		// A one clock delayed copy of i_stb, cleared on reset
		initial	mpypipe = 1'b0;
		always @(posedge i_clk)
		if (i_reset)
			mpypipe <= 1'b0;
		else
			mpypipe <= (i_stb);
		// }}}

		assign	o_valid = mpypipe; // this_is_a_multiply_op;

		// o_hi
		// {{{
		// Remember which half was requested until the next request
		always @(posedge i_clk)
		if (i_stb)
			r_hi  <= i_op[1];

		assign	o_hi = r_hi;
		// }}}
		// }}}
	end else begin : MPN2
	if (OPT_MPY == 3)
	begin : MPY3CK // Our three clock option (ALU pauses for 2 clocks)
		// {{{
		// Both a signed and an unsigned product are computed from the
		// same registered operands.  The delayed sign request, r_sgn,
		// then selects between them at the output.

		// Declarations
		// {{{
		reg	signed	[63:0]	r_smpy_result;
		reg		[63:0]	r_umpy_result;
		reg	signed	[31:0]	r_mpy_a_input, r_mpy_b_input;
		reg		[1:0]	mpypipe;
		reg		[1:0]	r_sgn;
		reg			r_hi;
		// }}}

		// mpypipe (FSM state)
		// {{{
		// A shift register of i_stb.  mpypipe[0] is set while the
		// operand registers hold a requested multiply, mpypipe[1]
		// while the result registers hold its product.
		initial	mpypipe = 2'b0;
		always @(posedge i_clk)
		if (i_reset)
			mpypipe <= 2'b0;
		else
			mpypipe <= { mpypipe[0], i_stb };
		// }}}

		// First clock : register r_mpy_?_input, r_sgn
		// {{{
		// r_sgn[0] travels with the operand registers, and r_sgn[1]
		// with the result registers
		always @(posedge i_clk)
			r_sgn <= { r_sgn[0],
				(i_op[0] && (!OPT_LOWPOWER || i_stb)) };

		always @(posedge i_clk)
		if (!OPT_LOWPOWER || i_stb)
		begin
			r_mpy_a_input <= i_a[31:0];
			r_mpy_b_input <= i_b[31:0];
		end else begin
			r_mpy_a_input <= 0;
			r_mpy_b_input <= 0;
		end
		// }}}

		// Second clock : perform the multiply
		// {{{
`ifdef	VERILATOR
		// Veri1ator only implementation
		// {{{
		// Here the operands are explicitly widened to 64 bits, sign
		// extended for the signed product and zero extended for the
		// unsigned product, before being multiplied.
		wire	signed	[63:0]	s_mpy_a_input, s_mpy_b_input;
		wire		[63:0]	u_mpy_a_input, u_mpy_b_input;

		assign	s_mpy_a_input = {{(32){r_mpy_a_input[31]}},r_mpy_a_input};
		assign	s_mpy_b_input = {{(32){r_mpy_b_input[31]}},r_mpy_b_input};
		assign	u_mpy_a_input = {32'h00,r_mpy_a_input};
		assign	u_mpy_b_input = {32'h00,r_mpy_b_input};
		always @(posedge i_clk)
		if (!OPT_LOWPOWER || mpypipe[0])
			r_smpy_result <= s_mpy_a_input * s_mpy_b_input;
		always @(posedge i_clk)
		if (!OPT_LOWPOWER || mpypipe[0])
			r_umpy_result <= u_mpy_a_input * u_mpy_b_input;
		// }}}
`else
		// Synthesis implementation
		// {{{
		// The 32-bit operands are multiplied directly into the 64-bit
		// result registers.  The signed product uses the signed
		// operand registers, and the unsigned product uses unsigned
		// copies of them.
		wire		[31:0]	u_mpy_a_input, u_mpy_b_input;

		assign	u_mpy_a_input = r_mpy_a_input;
		assign	u_mpy_b_input = r_mpy_b_input;

		always @(posedge i_clk)
		if (!OPT_LOWPOWER || mpypipe[0])
			r_smpy_result <= r_mpy_a_input * r_mpy_b_input;
		always @(posedge i_clk)
		if (!OPT_LOWPOWER || mpypipe[0])
			r_umpy_result <= u_mpy_a_input * u_mpy_b_input;
		// }}}
`endif

		// Remember which half was requested until the next request
		always @(posedge i_clk)
		if (i_stb)
			r_hi  <= i_op[1];

		assign	o_hi    = r_hi;
		assign	o_busy  = mpypipe[0];
		assign	o_result = (r_sgn[1])?r_smpy_result:r_umpy_result;
		assign	o_valid = mpypipe[1];
		// }}}

		// Results are then available and registered on the third clock
		// }}}
	end else begin : MPN3
	if (OPT_MPY == 4)
	begin : MPY4CK // The four clock option, polynomial multiplication
		// {{{
		// The product is built from four 16x16 partial products plus a
		// correction term for signed operands, over three register
		// stages: operands, partial products, and the summed result.

		// Declarations
		// {{{
		reg	[63:0]	r_mpy_result;
		reg	[31:0]	r_mpy_a_input, r_mpy_b_input;
		reg		r_mpy_signed, r_hi;
		reg	[2:0]	mpypipe;
		reg	[31:0]	pp_f, pp_l; // F and L from FOIL
		reg	[32:0]	pp_oi; // The O and I from FOIL
		reg	[32:0]	pp_s;	// Signed correction term, weight 2^31
		// }}}

		// First clock, latch in the inputs : mpypipe, r_mpy_?_input
		// {{{
		initial	mpypipe = 3'b0;
		always @(posedge i_clk)
		begin
			// mpypipe indicates we have a multiply in the
			// pipeline.  In this case, the multiply
			// pipeline has three register stages, so we need
			// three bits in the pipe: bit 0 tracks the operand
			// registers, bit 1 the partial products, and bit 2
			// the final result.
			if (i_reset)
				mpypipe <= 3'h0;
			else begin
				mpypipe[0] <= i_stb;
				mpypipe[1] <= mpypipe[0];
				mpypipe[2] <= mpypipe[1];
			end

			// For a signed multiply, the operands are stored with
			// their sign bits flipped.  (See the derivation ahead
			// of the partial products below.)
			if (i_op[0]) // i.e. if signed multiply
			begin
				r_mpy_a_input <= {(~i_a[31]),i_a[30:0]};
				r_mpy_b_input <= {(~i_b[31]),i_b[30:0]};
			end else begin
				r_mpy_a_input <= i_a[31:0];
				r_mpy_b_input <= i_b[31:0];
			end
			// The signed bit really only matters in the
			// case of 64 bit multiply.  We'll keep track
			// of it, though, and pretend in all other
			// cases.
			r_mpy_signed  <= i_op[0];

			// These later assignments override the ones above
			// whenever OPT_LOWPOWER is set and no multiply has
			// been requested
			if (i_stb)
				r_hi  <= i_op[1];
			else if (OPT_LOWPOWER)
			begin
				r_mpy_a_input <= 0;
				r_mpy_b_input <= 0;
				r_mpy_signed  <= 0;
			end
		end
		// }}}

		assign	o_hi    = r_hi;
		assign	o_busy  = |mpypipe[1:0];
		assign	o_valid = mpypipe[2];

		// Second clock, do the multiplies, get the "partial products".
		// {{{
		// Here, we break our input up into two halves,
		//
		//   A  = (2^16 ah + al)
		//   B  = (2^16 bh + bl)
		//
		// and use these to compute partial products.
		//
		//   AB = 2^32 ah*bh + 2^16 (ah*bl + al*bh) + (al*bl)
		//
		// Since we're following the FOIL algorithm to get here,
		// we'll name these partial products according to FOIL.
		//
		// The trick is what happens if A or B is signed.  In
		// those cases, the real value of A will not be given by
		//	A = (2^16 ah + al)
		// but rather
		//	A = (2^16 ah[31^] + al) - 2^31
		//  (where we have flipped the sign bit of A)
		// and so ...
		//
		// AB= (2^16 ah + al - 2^31) * (2^16 bh + bl - 2^31)
		//	= 2^32(ah*bh)
		//		+2^16 (ah*bl+al*bh)
		//		+(al*bl)
		//		- 2^31 (2^16 bh+bl + 2^16 ah+al)
		//		+ 2^62
		//	= 2^32(ah*bh)
		//		+2^16 (ah*bl+al*bh)
		//		+(al*bl)
		//		+ 2^31 (2^31 - (2^16 bh+bl + 2^16 ah+al))
		//
		// The last parenthesized term is what gets stored in pp_s.
		// It is kept to 33 bits, which is sufficient since it is
		// later added in at a weight of 2^31 within a 64-bit sum.
		//
		always @(posedge i_clk)
		if (!OPT_LOWPOWER || mpypipe[0])
		begin
			pp_f<=r_mpy_a_input[31:16]*r_mpy_b_input[31:16];
			pp_oi<=r_mpy_a_input[31:16]*r_mpy_b_input[15: 0]
				+ r_mpy_a_input[15: 0]*r_mpy_b_input[31:16];
			pp_l<=r_mpy_a_input[15: 0]*r_mpy_b_input[15: 0];
			// And a special one for the sign
			if (r_mpy_signed)
				pp_s <= 32'h8000_0000-(
				  	r_mpy_a_input[31:0]
					+ r_mpy_b_input[31:0]);
			else
				pp_s <= 33'h0;
		end
		// }}}

		// Third clock, add the results and get a product: r_mpy_result
		// {{{
		// The low 16 bits of the product come from pp_l alone.  The
		// remaining terms are summed within the [63:16] field, so
		// their weights here are relative to 2^16: pp_oi is unshifted
		// (2^16), pp_s is shifted by 15 (2^31), and pp_f by 16 (2^32).
		always @(posedge i_clk)
		if (!OPT_LOWPOWER || mpypipe[1])
		begin
			r_mpy_result[15:0] <= pp_l[15:0];
			r_mpy_result[63:16] <=
			  	{ 32'h00, pp_l[31:16] }
				+ { 15'h00, pp_oi }
				+ { pp_s, 15'h00 }
				+ { pp_f, 16'h00 };
		end
		// }}}

		assign	o_result = r_mpy_result;
		// Fourth clock -- results are clocked into writeback
		// }}}
	end else begin : MPYSLOW
		// {{{
		// Use an external multiply implementation, for when DSPs aren't
		// available.
		//

		// Declarations
		// {{{
		reg		r_hi;
		// verilator coverage_off
		// verilator lint_off UNUSED
		wire		unused_aux;
		wire	[65:0]	full_result;
		// verilator lint_on  UNUSED
		// verilator coverage_on
		// }}}

		// Each operand is widened to 33 bits.  The extra top bit is a
		// copy of the operand's sign bit for a signed multiply, and
		// zero otherwise.  The ports are connected by position; the
		// constant 1'b0 and unused_aux are presumably slowmpy's
		// auxiliary input and output -- confirm against slowmpy.v.
		// slowmpy also drives o_busy and o_valid directly.
		slowmpy #(.LGNA(6), .NA(33)
		) slowmpyi(
			i_clk, i_reset, i_stb,
			{ (i_op[0])&(i_a[31]), i_a },
			{ (i_op[0])&(i_b[31]), i_b }, 1'b0, o_busy,
				o_valid, full_result, unused_aux
		);

		// Only the low 64 bits of the 66-bit result are returned
		assign	o_result = full_result[63:0];

		// Remember which half was requested until the next request
		always @(posedge i_clk)
		if (i_stb)
			r_hi  <= i_op[1];

		assign	o_hi = r_hi;
		// }}}
	end end end end end
	endgenerate // All possible multiply results have been determined

endmodule