Merge pull request #1 from AngeloJacobo/dev

Dev branch pull request
This commit is contained in:
Angelo Jacobo 2024-04-20 15:18:30 +08:00 committed by GitHub
commit a354a8d4ef
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 302 additions and 113 deletions

View File

@ -17,10 +17,10 @@
// Engineer: Angelo C. Jacobo
//
////////////////////////////////////////////////////////////////////////////////
// NOTE TO SELF are questions which I still need to answer
// Comments are continuously added on this RTL for better readability
//`define FORMAL_COVER //skip reset sequence to fit in cover depth
//`define FORMAL_COVER //skip reset sequence during formal verification to fit in cover depth
`default_nettype none
`timescale 1ps / 1ps
//
@ -52,7 +52,7 @@ module ddr3_controller #(
ODELAY_SUPPORTED = 1, //set to 1 when ODELAYE2 is supported
SECOND_WISHBONE = 0, //set to 1 if 2nd wishbone is needed
parameter // The next parameters act more like a localparam (since user does not have to set this manually) but was added here to simplify port declaration
serdes_ratio = $rtoi(CONTROLLER_CLK_PERIOD/DDR3_CLK_PERIOD),
serdes_ratio = 4, // this controller is fixed as a 4:1 memory controller (CONTROLLER_CLK_PERIOD/DDR3_CLK_PERIOD = 4)
wb_data_bits = DQ_BITS*LANES*serdes_ratio*2,
wb_addr_bits = ROW_BITS + COL_BITS + BA_BITS - $clog2(serdes_ratio*2),
wb_sel_bits = wb_data_bits / 8,
@ -157,7 +157,7 @@ module ddr3_controller #(
// delayed and center-aligned to the center eye of data)
localparam DATA_INITIAL_ODELAY_TAP = 0;
//DQS needs to be edge-aligned to the center eye of the data.
//Posedge of DQS needs to be aligned to the center eye of the data.
//This means DQS needs to be delayed by a quarter of the ddr3
//clk period relative to the data. Subtract by 600ps to include
//the IODELAY insertion delay. Divide by a delay resolution of
@ -179,17 +179,30 @@ module ddr3_controller #(
localparam tRCD = 13_750; // ps Active to Read/Write command time
localparam tRP = 13_750; // ps Precharge command period
localparam tRAS = 35_000; // ps ACT to PRE command period
`elsif DDR3_1333_9_9_9 //DDR3-1333 (9-9-9) speed bin
localparam tRCD = 13_500; // ps Active to Read/Write command time
localparam tRP = 13_500; // ps Precharge command period
localparam tRAS = 36_000; // ps ACT to PRE command period
`elsif DDR3_1066_7_7_7 //DDR3-1066 (7-7-7) speed bin
localparam tRCD = 13_125; // ps Active to Read/Write command time
localparam tRP = 13_125; // ps Precharge command period
localparam tRAS = 37_500; // ps ACT to PRE command period
`else
"Throw an error here if speed bin is not recognized (or not defined)"
`endif
`ifdef RAM_1Gb
localparam tRFC = 110_000; // ps Refresh command to ACT or REF
`elsif RAM_2Gb
localparam tRFC = 160_000; // ps Refresh command to ACT or REF
`elsif RAM_4Gb
localparam tRFC = 300_000; // ps Refresh command to ACT or REF
`else
`elsif RAM_8Gb
localparam tRFC = 350_000; // ps Refresh command to ACT or REF
`else
"Throw an error here if capacity is not recognized (or not defined)"
`endif
localparam tREFI = 7_800_000; //ps Average periodic refresh interval
localparam tXPR = max(5*DDR3_CLK_PERIOD, tRFC+10_000); // ps Exit Reset from CKE HIGH to a valid command
localparam tWR = 15_000; // ps Write Recovery Time
@ -247,6 +260,8 @@ module ddr3_controller #(
//Also, worscase is when the anticipated bank still has the leftover of the
//WRITE_TO_PRECHARGE_DELAY thus consider also this.
localparam MARGIN_BEFORE_ANTICIPATE = PRECHARGE_TO_ACTIVATE_DELAY + ACTIVATE_TO_WRITE_DELAY + WRITE_TO_PRECHARGE_DELAY;
// STAGE2_DATA_DEPTH is the number of controller clk cycles of delay before issuing the data after the write command
// depends on the CWL_nCK
localparam STAGE2_DATA_DEPTH = (CWL_nCK - (3 - WRITE_SLOT + 1))/4 + 1; //this is always >= 1 (5 - (3 - 3 + 1))/4.0 -> floor(1) + 1 = floor(4
`ifdef FORMAL
wire stage2_data_depth;
@ -255,8 +270,11 @@ module ddr3_controller #(
assert(STAGE2_DATA_DEPTH-2 >= 0);
end
`endif
localparam READ_DELAY = $rtoi($floor((CL_nCK - (3 - READ_SLOT + 1))/4.0 ));
localparam READ_ACK_PIPE_WIDTH = READ_DELAY + 1 + 2 + 1 + 1;
localparam READ_DELAY = $rtoi($floor((CL_nCK - (3 - READ_SLOT + 1))/4.0 )); // how many controller clk cycles to satisfy CL_nCK of ddr3_clk cycles
// READ_ACK_PIPE_WIDTH is the delay between read command issued (starting from the controller) until the data is received by the controller
//the delays included the ODELAY and OSERDES when issuing the read command
//and the IDELAY and ISERDES when receiving the data (NOTE TO SELF: ELABORATE ON WHY THOSE MAGIC NUMBERS)
localparam READ_ACK_PIPE_WIDTH = READ_DELAY + 1 + 2 + 1 + 1;
localparam MAX_ADDED_READ_ACK_DELAY = 16;
localparam DELAY_BEFORE_WRITE_LEVEL_FEEDBACK = STAGE2_DATA_DEPTH + ps_to_cycles(tWLO+tWLOE) + 10; //plus 10 controller clocks for possible bus latency and
//the delay for receiving feedback DQ from IOBUF -> IDELAY -> ISERDES
@ -350,7 +368,6 @@ module ddr3_controller #(
reg reset_done = 0; //high if reset has already finished
reg pause_counter = 0;
wire issue_read_command;
wire issue_write_command;
reg stage2_update = 1;
reg stage2_stall = 0;
reg stage1_stall = 0;
@ -428,14 +445,17 @@ module ddr3_controller #(
reg[15:0] dqs_bitslip_arrangement = 0;
/* verilator lint_off UNUSEDSIGNAL */
(* mark_debug = "true" *) reg[3:0] added_read_pipe_max = 0;
(* mark_debug = "true" *) reg[3:0] added_read_pipe[LANES - 1:0];
(* mark_debug = "true" *) reg[3:0] added_read_pipe[LANES - 1:0];
//each lane will have added delay relative to when ISERDES should actually return the data
//this make sure that we will wait until the lane with longest delay (added_read_pipe_max) is received before
//all lanes are sent to wishbone data
//contains the ack shift reg for both read and write
reg[AUX_WIDTH:0] shift_reg_read_pipe_q[READ_ACK_PIPE_WIDTH-1:0];
reg[AUX_WIDTH:0] shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1:0]; //1=issue command delay (OSERDES delay), 2 = ISERDES delay
reg index_read_pipe; //tells which delay_read_pipe will be updated
reg[AUX_WIDTH:0] shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1:0]; //issue ack and AUX value , 1=issue command delay (OSERDES delay), 2 = ISERDES delay
reg index_read_pipe; //tells which delay_read_pipe will be updated (there are two delay_read_pipe)
reg index_wb_data; //tells which o_wb_data_q will be sent to o_wb_data
reg[15:0] delay_read_pipe[1:0]; //delay when each lane will retrieve i_phy_iserdes_data
reg[15:0] delay_read_pipe[1:0]; //delay when each lane will retrieve i_phy_iserdes_data (since different lanes might not be aligned with each other and needs to be retrieved at a different time)
reg[wb_data_bits - 1:0] o_wb_data_q[1:0]; //store data retrieved from i_phy_iserdes_data to be sent to o_wb_data
reg[AUX_WIDTH:0] o_wb_ack_read_q[MAX_ADDED_READ_ACK_DELAY-1:0];
@ -807,8 +827,8 @@ module ddr3_controller #(
//stage2_data -> shiftreg(CWL) -> OSERDES(DDR) -> ODELAY -> RAM
end
if(!ODELAY_SUPPORTED) begin
stage2_data_unaligned <= stage2_data_unaligned_temp; //delayed by 1 clock cycle
stage2_dm_unaligned <= stage2_dm_unaligned_temp; //delayed by 1 clock cycle
stage2_data_unaligned <= stage2_data_unaligned_temp; //_temp is for added delay of 1 clock cycle (no ODELAY so no added delay)
stage2_dm_unaligned <= stage2_dm_unaligned_temp; //_temp is for added delay of 1 clock cycle (no ODELAY so no added delay)
end
// when not in refresh, transaction can only be processed when i_wb_cyc is high and not stall
@ -834,6 +854,7 @@ module ddr3_controller #(
/* verilator lint_on WIDTH */
stage1_data <= i_wb_data;
end
// request from calibrate FSM will be accepted here
else if(state_calibrate != DONE_CALIBRATE && !o_wb_stall_calib) begin
stage1_pending <= calib_stb;//actual request flag
stage1_we <= calib_we; //write-enable
@ -850,6 +871,8 @@ module ddr3_controller #(
for(index = 0; index < LANES; index = index + 1) begin
/* verilator lint_off WIDTH */
// stage2_data_unaligned is the DQ_BITS*LANES*8 raw data from stage 1 so not yet aligned
// unaligned_data is 64 bits
{unaligned_data[index], {
stage2_data[0][((DQ_BITS*LANES)*7 + 8*index) +: 8], stage2_data[0][((DQ_BITS*LANES)*6 + 8*index) +: 8],
stage2_data[0][((DQ_BITS*LANES)*5 + 8*index) +: 8], stage2_data[0][((DQ_BITS*LANES)*4 + 8*index) +: 8],
@ -860,7 +883,37 @@ module ddr3_controller #(
stage2_data_unaligned[((DQ_BITS*LANES)*3 + 8*index) +: 8], stage2_data_unaligned[((DQ_BITS*LANES)*2 + 8*index) +: 8],
stage2_data_unaligned[((DQ_BITS*LANES)*1 + 8*index) +: 8], stage2_data_unaligned[((DQ_BITS*LANES)*0 + 8*index) +: 8] }
<< data_start_index[index]) | unaligned_data[index];
/*
// Example with LANE 0:
// Burst_0 to burst_7 of unaligned LANE 0 will be extracted which will be shifted by data_start_index.
// Each 8 bits of shift means a burst will be moved to next ddr3_clk cycle, this is needed if for example
// the DQ trace is longer than the command trace where the DQ bits must be delayed by 1 ddr3_clk cycle
// to align the DQ data to the write command.
//
// Since 1 controller clk cycle will have 4 ddr3_clk cycle, and each ddr3_clk cycle is DDR:
// CONTROLLER CLK CYCLE 0: [burst0,burst1] [burst2,burst3] [burst4,burst5] [burst6,burst7]
// CONTROLLER CLK CYCLE 1: [burst0,burst1] [burst2,burst3] [burst4,burst5] [burst6,burst7]
// CONTROLLER CLK CYCLE 2: [burst0,burst1] [burst2,burst3] [burst4,burst5] [burst6,burst7]
//
// shifting by 1 burst means burst 7 will be sent on next controller clk cycle and EVERY BURST WILL SHIFT:
// CONTROLLER CLK CYCLE 0: [xxxxxx,xxxxxx] [burst0,burst1] [burst2,burst3] [burst4,burst5]
// CONTROLLER CLK CYCLE 1: [burst6,burst7] [burst0,burst1] [burst2,burst3] [burst4,burst5]
// CONTROLLER CLK CYCLE 2: [burst6,burst7] [burst0,burst1] [burst2,burst3] [burst4,burst5]
//
// the [burst6,burst7] which has to be stored and delayed until next clk cycle will be handled by unaligned_data
{unaligned_data[0], {
stage2_data[0][((64)*7 + 8*0) +: 8], stage2_data[0][((64)*6 + 8*0) +: 8],
stage2_data[0][((64)*5 + 8*0) +: 8], stage2_data[0][((64)*4 + 8*0) +: 8],
stage2_data[0][((64)*3 + 8*0) +: 8], stage2_data[0][((64)*2 + 8*0) +: 8],
stage2_data[0][((64)*1 + 8*0) +: 8], stage2_data[0][((64)*0 + 8*0) +: 8] }}
<= ( { stage2_data_unaligned[((64)*7 + 8*0) +: 8], stage2_data_unaligned[((64)*6 + 8*0) +: 8],
stage2_data_unaligned[((64)*5 + 8*0) +: 8], stage2_data_unaligned[((64)*4 + 8*0) +: 8],
stage2_data_unaligned[((64)*3 + 8*0) +: 8], stage2_data_unaligned[((64)*2 + 8*0) +: 8],
stage2_data_unaligned[((64)*1 + 8*0) +: 8], stage2_data_unaligned[((64)*0 + 8*0) +: 8] }
<< data_start_index[0]) | unaligned_data[0];
*/
// The same alignment logic is done with data mask
{unaligned_dm[index], {
stage2_dm[0][LANES*7 + index], stage2_dm[0][LANES*6 + index],
stage2_dm[0][LANES*5 + index], stage2_dm[0][LANES*4 + index],
@ -873,6 +926,7 @@ module ddr3_controller #(
<< (data_start_index[index]>>3)) | unaligned_dm[index];
/* verilator lint_on WIDTH */
end
// stage2 can have multiple pipelined stages inside it which acts as delay before issuing the write data (after issuing write command)
for(index = 0; index < STAGE2_DATA_DEPTH-1; index = index+1) begin
stage2_data[index+1] <= stage2_data[index];
stage2_dm[index+1] <= stage2_dm[index];
@ -885,11 +939,11 @@ module ddr3_controller #(
end
end
end
assign o_phy_data = stage2_data[STAGE2_DATA_DEPTH-1];
assign o_phy_data = stage2_data[STAGE2_DATA_DEPTH-1]; // the data sent to PHY is the last stage of of stage 2 (since stage 2 can have multiple pipelined stages inside it_
assign o_phy_dm = stage2_dm[STAGE2_DATA_DEPTH-1];
/* verilator lint_off WIDTH */
assign wb_addr_plus_anticipate = i_wb_addr + MARGIN_BEFORE_ANTICIPATE;
assign calib_addr_plus_anticipate = calib_addr + MARGIN_BEFORE_ANTICIPATE;
assign wb_addr_plus_anticipate = i_wb_addr + MARGIN_BEFORE_ANTICIPATE; // wb_addr_plus_anticipate determines if it is near the end of column by checking if it jumps to next row
assign calib_addr_plus_anticipate = calib_addr + MARGIN_BEFORE_ANTICIPATE; // just same as wb_addr_plus_anticipate but while doing calibration
/* verilator lint_on WIDTH */
// DIAGRAM FOR ALL RELEVANT TIMING PARAMETERS:
//
@ -924,16 +978,17 @@ module ddr3_controller #(
bank_status_d[index] = bank_status_q[index];
bank_active_row_d[index] = bank_active_row_q[index];
end
//set cmd_0 to reset instruction, the remainings are NOP
//delay_counter_is_zero signifies start of new reset instruction (the time when the command must be issued)
cmd_d[PRECHARGE_SLOT] = {(!delay_counter_is_zero), instruction[DDR3_CMD_START-1:DDR3_CMD_END], cmd_odt, instruction[CLOCK_EN], instruction[RESET_N],
//set PRECHARGE_SLOT as reset instruction, the remainings are NOP (MSB is high)
//delay_counter_is_zero high signifies start of new reset instruction (the time when the command must be issued)
cmd_d[PRECHARGE_SLOT] = {(!delay_counter_is_zero), instruction[DDR3_CMD_START-1:DDR3_CMD_END] | {3{(!delay_counter_is_zero)}} , cmd_odt, instruction[CLOCK_EN], instruction[RESET_N],
instruction[MRS_BANK_START:(MRS_BANK_START-BA_BITS+1)], instruction[ROW_BITS-1:0]};
cmd_d[PRECHARGE_SLOT][10] = instruction[A10_CONTROL];
cmd_d[READ_SLOT] = {(!issue_read_command), CMD_RD[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, {(ROW_BITS+BA_BITS){1'b0}}};
cmd_d[WRITE_SLOT] = {(!issue_write_command),CMD_WR[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, {(ROW_BITS+BA_BITS){1'b0}}};
cmd_d[ACTIVATE_SLOT] = {1'b1,CMD_ACT[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, {(ROW_BITS+BA_BITS){1'b0}}};
cmd_d[READ_SLOT] = {(!issue_read_command), CMD_RD[2:0] | {3{(!issue_read_command)}}, cmd_odt, cmd_ck_en, cmd_reset_n, {(ROW_BITS+BA_BITS){1'b0}}}; // issued during MPR reads (address does not matter)
cmd_d[WRITE_SLOT] = {1'b0, 3'b111, cmd_odt, cmd_ck_en, cmd_reset_n, {(ROW_BITS+BA_BITS){1'b0}}}; // always NOP by default
cmd_d[ACTIVATE_SLOT] = {1'b0, 3'b111 , cmd_odt, cmd_ck_en, cmd_reset_n, {(ROW_BITS+BA_BITS){1'b0}}}; // always NOP by default
// decrement delay counters for every bank
// decrement delay counters for every bank , stay to 0 once 0 is reached
// every bank will have its own delay counters for precharge, activate, write, and read
for(index=0; index< (1<<BA_BITS); index=index+1) begin
delay_before_precharge_counter_d[index] = (delay_before_precharge_counter_q[index] == 0)? 0: delay_before_precharge_counter_q[index] - 1;
delay_before_activate_counter_d[index] = (delay_before_activate_counter_q[index] == 0)? 0: delay_before_activate_counter_q[index] - 1;
@ -941,9 +996,10 @@ module ddr3_controller #(
delay_before_read_counter_d[index] = (delay_before_read_counter_q[index] == 0)? 0:delay_before_read_counter_q[index] - 1;
end
for(index = 1; index < READ_ACK_PIPE_WIDTH; index = index + 1) begin
// shift is rightward where LSB gets MSB ([MSB] -> [] -> [] -> .... -> [] -[LSB])
shift_reg_read_pipe_d[index-1] = shift_reg_read_pipe_q[index];
end
shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1] = 0;
shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1] = 0; //MSB just receives zero when shifted rightward
//USE _d in ALL
@ -959,7 +1015,7 @@ module ddr3_controller #(
stage2_stall = 0;
stage2_update = 1;
cmd_odt = 1'b1;
shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1] = {stage2_aux, 1'b1};
shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1] = {stage2_aux, 1'b1}; // ack is sent to shift_reg which will be shifted until the wb ack output
//write acknowledge will use the same logic pipeline as the read acknowledge.
//This would mean write ack latency will be the same for
@ -968,9 +1024,11 @@ module ddr3_controller #(
//for write ack as there will be no need to analyze the
//contents of the shift_reg_read_pipe just to determine
//where best to place the write ack on the pipeline (since
//the order of ack must be maintaned). But this would mean
//the order of ack must be maintained). But this would mean
//the latency for write is fixed regardless if there is an
//outstanding read ack or none on the pipeline
//outstanding read ack or none on the pipeline. But this is
// acceptable in my opinion since this is a pipelined wishbone
// where the transaction can continue regardless when ack returns
//set-up delay before precharge, read, and write
if(delay_before_precharge_counter_q[stage2_bank] <= WRITE_TO_PRECHARGE_DELAY) begin
@ -984,7 +1042,7 @@ module ddr3_controller #(
delay_before_precharge_counter_d[stage2_bank] = WRITE_TO_PRECHARGE_DELAY;
end
for(index=0; index < (1<<BA_BITS); index=index+1) begin //the write to read delay applies to all banks (odt must be turned off properly before reading)
delay_before_read_counter_d[index] = WRITE_TO_READ_DELAY + 1;
delay_before_read_counter_d[index] = WRITE_TO_READ_DELAY + 1; //NOTE TO SELF: why plus 1?
end
delay_before_read_counter_d[stage2_bank] = WRITE_TO_READ_DELAY + 1;
delay_before_write_counter_d[stage2_bank] = WRITE_TO_WRITE_DELAY;
@ -992,9 +1050,8 @@ module ddr3_controller #(
if(COL_BITS <= 10) begin
cmd_d[WRITE_SLOT] = {1'b0, CMD_WR[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, stage2_bank,{{ROW_BITS-32'd11}{1'b0}} , 1'b0 , stage2_col[9:0]};
end
else begin
cmd_d[WRITE_SLOT] = {1'b0, CMD_WR[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, stage2_bank,{{ROW_BITS-32'd12}{1'b0}} ,
stage2_col[(COL_BITS <= 10) ? 0 : 10] , 1'b0 , stage2_col[9:0]};
else begin // COL_BITS > 10 has different format from <= 10
cmd_d[WRITE_SLOT] = {1'b0, CMD_WR[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, stage2_bank,{{ROW_BITS-32'd12}{1'b0}} , stage2_col[(COL_BITS <= 10) ? 0 : 10] , 1'b0 , stage2_col[9:0]};
end
//turn on odt at same time as write cmd
cmd_d[0][CMD_ODT] = cmd_odt;
@ -1003,7 +1060,6 @@ module ddr3_controller #(
cmd_d[3][CMD_ODT] = cmd_odt;
write_dqs_d=1;
write_dq_d=1;
// write_data = 1;
end
//read request
@ -1016,19 +1072,18 @@ module ddr3_controller #(
delay_before_precharge_counter_d[stage2_bank] = READ_TO_PRECHARGE_DELAY;
end
delay_before_read_counter_d[stage2_bank] = READ_TO_READ_DELAY;
delay_before_write_counter_d[stage2_bank] = READ_TO_WRITE_DELAY + 1; //temporary solution since its possible odt to go hig already while reading previously
for(index=0; index < (1<<BA_BITS); index=index+1) begin //the read to write delay applies to all banks (odt must be turned on properly before writing)
delay_before_write_counter_d[index] = READ_TO_WRITE_DELAY + 1;
delay_before_write_counter_d[stage2_bank] = READ_TO_WRITE_DELAY + 1; //temporary solution since its possible odt to go high already while reading previously
for(index=0; index < (1<<BA_BITS); index=index+1) begin //the read to write delay applies to all banks (odt must be turned on properly before writing and this delay is for ODT to settle)
delay_before_write_counter_d[index] = READ_TO_WRITE_DELAY + 1; // NOTE TO SELF: why plus 1?
end
shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1] = {stage2_aux, 1'b1};
shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1] = {stage2_aux, 1'b1}; // ack is sent to shift_reg which will be shifted until the wb ack output
//issue read command
if(COL_BITS <= 10) begin
cmd_d[READ_SLOT] = {1'b0, CMD_RD[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, stage2_bank, {{ROW_BITS-32'd11}{1'b0}} , 1'b0 , stage2_col[9:0]};
end
else begin
cmd_d[READ_SLOT] = {1'b0, CMD_RD[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, stage2_bank, {{ROW_BITS-32'd12}{1'b0}} ,
stage2_col[(COL_BITS <= 10) ? 0 : 10] , 1'b0 , stage2_col[9:0]};
else begin // COL_BITS > 10 has different format from <= 10
cmd_d[READ_SLOT] = {1'b0, CMD_RD[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, stage2_bank, {{ROW_BITS-32'd12}{1'b0}} , stage2_col[(COL_BITS <= 10) ? 0 : 10] , 1'b0 , stage2_col[9:0]};
end
//turn off odt at same time as read cmd
cmd_d[0][CMD_ODT] = cmd_odt;
@ -1043,10 +1098,10 @@ module ddr3_controller #(
activate_slot_busy = 1'b1;
delay_before_precharge_counter_d[stage2_bank] = ACTIVATE_TO_PRECHARGE_DELAY;
//set-up delay before read and write
if(delay_before_read_counter_q[stage2_bank] <= ACTIVATE_TO_READ_DELAY) begin
if(delay_before_read_counter_q[stage2_bank] <= ACTIVATE_TO_READ_DELAY) begin // if current delay is > ACTIVATE_TO_READ_DELAY, then updating it to the lower delay will cause the previous delay to be violated
delay_before_read_counter_d[stage2_bank] = ACTIVATE_TO_READ_DELAY;
end
if(delay_before_write_counter_q[stage2_bank] <= ACTIVATE_TO_WRITE_DELAY) begin
if(delay_before_write_counter_q[stage2_bank] <= ACTIVATE_TO_WRITE_DELAY) begin // if current delay is > ACTIVATE_TO_WRITE_DELAY, then updating it to the lower delay will cause the previous delay to be violated
delay_before_write_counter_d[stage2_bank] = ACTIVATE_TO_WRITE_DELAY;
end
//issue activate command
@ -1069,16 +1124,18 @@ module ddr3_controller #(
//pending request on stage 1
if(stage1_pending && !((stage1_next_bank == stage2_bank) && stage2_pending)) begin
//stage 1 will mainly be for anticipation, but it can also handle
//precharge and activate request. This will depend if the request
//is on the end of the row and must start the anticipation. For
//example, we have 10 rows in a bank:
//[R][R][R][R][R][R][R][A][A][A]
//stage 1 will mainly be for anticipation (if next requests need to jump to new bank then
//anticipate the precharging and activate of that next bank, BUT it can also handle
//precharge and activate of CURRENT wishbone request.
//Anticipate will depend if the request is on the end of the row
// and must start the anticipation. For example if we have 10 rows in a bank:
//[R][R][R][R][R][R][R][A][A][A] -> [next bank]
//
//R = Request, A = Anticipate
//Unless we are near the third to the last column, stage 1 will
//issue Activate and Precharge on the CURRENT bank. Else, stage
//1 will issue Activate and Precharge for the NEXT bank
// Thus stage 1 anticipate makes sure smooth burst operation that jumps banks
if(bank_status_q[stage1_next_bank] && bank_active_row_q[stage1_next_bank] != stage1_next_row && delay_before_precharge_counter_q[stage1_next_bank] ==0 && !precharge_slot_busy) begin
//set-up delay before read and write
delay_before_activate_counter_d[stage1_next_bank] = PRECHARGE_TO_ACTIVATE_DELAY;
@ -1090,10 +1147,10 @@ module ddr3_controller #(
else if(!bank_status_q[stage1_next_bank] && delay_before_activate_counter_q[stage1_next_bank] == 0 && !activate_slot_busy) begin
delay_before_precharge_counter_d[stage1_next_bank] = ACTIVATE_TO_PRECHARGE_DELAY;
//set-up delay before read and write
if(delay_before_read_counter_d[stage1_next_bank] <= ACTIVATE_TO_READ_DELAY) begin
if(delay_before_read_counter_d[stage1_next_bank] <= ACTIVATE_TO_READ_DELAY) begin // if current delay is > ACTIVATE_TO_READ_DELAY, then updating it to the lower delay will cause the previous delay to be violated
delay_before_read_counter_d[stage1_next_bank] = ACTIVATE_TO_READ_DELAY;
end
if(delay_before_write_counter_d[stage1_next_bank] <= ACTIVATE_TO_WRITE_DELAY) begin
if(delay_before_write_counter_d[stage1_next_bank] <= ACTIVATE_TO_WRITE_DELAY) begin // if current delay is > ACTIVATE_TO_WRITE_DELAY, then updating it to the lower delay will cause the previous delay to be violated
delay_before_write_counter_d[stage1_next_bank] = ACTIVATE_TO_WRITE_DELAY;
end
cmd_d[ACTIVATE_SLOT] = {1'b0, CMD_ACT[2:0] , cmd_odt, cmd_ck_en, cmd_reset_n, stage1_next_bank , stage1_next_row};
@ -1110,10 +1167,10 @@ module ddr3_controller #(
if(!bank_status_d[stage1_bank] || (bank_status_d[stage1_bank] && bank_active_row_d[stage1_bank] != stage1_row)) begin
stage1_stall = 1;
end
else if(!stage1_we && delay_before_read_counter_d[stage1_bank] != 0) begin
else if(!stage1_we && delay_before_read_counter_d[stage1_bank] != 0) begin // if read request but delay before read is not yet met then stall
stage1_stall = 1;
end
else if(stage1_we && delay_before_write_counter_d[stage1_bank] != 0) begin
else if(stage1_we && delay_before_write_counter_d[stage1_bank] != 0) begin // if write request but delay before write is not yet met then stall
stage1_stall = 1;
end
//different request type will need a delay of more than 1 clk cycle so stall the pipeline
@ -1127,17 +1184,23 @@ module ddr3_controller #(
//control stage2 stall in advance
if(bank_status_d[stage2_bank] && bank_active_row_d[stage2_bank] == stage2_row) begin //read/write operation
//write request
if(stage2_we && delay_before_write_counter_d[stage2_bank] == 0) begin //if counter is 1 now, then next clock it will be zero thus lower stall at next cycle too
if(stage2_we && delay_before_write_counter_d[stage2_bank] == 0) begin // if write request and delay before write is already met then deassert stall
stage2_stall = 0; //to low stall next stage, but not yet at this stage
end
//read request
else if(!stage2_we && delay_before_read_counter_d[stage2_bank]==0) begin //if counter is 1 now, then next clock it will be zero thus lower stall at next cycle too
else if(!stage2_we && delay_before_read_counter_d[stage2_bank]==0) begin // if read request and delay before read is already met then deassert stall
stage2_stall = 0;
end
end
end
// control logic for stall
// this small logic is already optimized via listing all possible combinations on excel sheet, investigating the patterns
// and passing the formal verification. I recommend to not touch this and just left this as is.
// This logic makes sure stall will never go high unless the pipeline is full
// What made this complicated is that fact you have to predict the stall for next clock cycle in such
// a way that it will only stall next clock cycle if the pipeline will be full on the next clock cycle.
// Excel sheet: https://1drv.ms/x/s!AhWdq9CipeVagSqQXPwRmXhDgttL?e=vVYIxE&nav=MTVfezAwMDAwMDAwLTAwMDEtMDAwMC0wMDAwLTAwMDAwMDAwMDAwMH0
if(o_wb_stall_q) o_wb_stall_d = stage2_stall;
else if( (!i_wb_stb && state_calibrate == DONE_CALIBRATE) || (!calib_stb && state_calibrate != DONE_CALIBRATE) ) o_wb_stall_d = 0;
else if(!stage1_pending) o_wb_stall_d = stage2_stall;
@ -1149,7 +1212,6 @@ module ddr3_controller #(
assign o_phy_cmd = {cmd_d[3], cmd_d[2], cmd_d[1], cmd_d[0]};
/*********************************************************************************************************************************************/
/******************************************************* Align Read Data from ISERDES *******************************************************/
always @(posedge i_controller_clk) begin
if(sync_rst_controller) begin
@ -1195,30 +1257,62 @@ module ddr3_controller #(
write_dq[index+1] <= write_dq[index];
end
for(index = 0; index < READ_ACK_PIPE_WIDTH; index = index + 1) begin
// shifted rightward where LSB gets MSB ([MSB] -> [] -> [] -> .... -> [] -[LSB])
shift_reg_read_pipe_q[index] <= shift_reg_read_pipe_d[index];
end
for(index = 0; index < 2; index = index + 1) begin
for(index = 0; index < 2; index = index + 1) begin
// there are 2 read_pipes (each with 16 space for shifting), and each read pipes shift rightward
// so the bit 1 will be shifted to the right until it reach LSB which means data is already on ISERDES output of PHY
delay_read_pipe[index] <= (delay_read_pipe[index] >> 1);
end
if(shift_reg_read_pipe_q[1][0]) begin //delay is over and data is now starting to release from iserdes BUT NOT YET ALIGNED
index_read_pipe <= !index_read_pipe; //control which delay_read_pipe would get updated (we have 3 pipe to store read data)ss
if(shift_reg_read_pipe_q[1][0]) begin
//delay from shift_reg_read_pipe_q is about to be over (ack, which is the last bit, will be at LSB on next clk cycle)
//and data is now starting to be released from ISERDES from phy BUT NOT YET ALIGNED
index_read_pipe <= !index_read_pipe; //control which delay_read_pipe would get updated (we have 2 read_pipes to store read data,use the read_pipe alternatingly)
delay_read_pipe[index_read_pipe][added_read_pipe_max] <= 1'b1; //update delay_read_pipe
// NOTE: added_read_pipe_max can either be 0 or 1 (NOTE TO SELF: optimize by lowering the bit size of delay_read_pipe)
// delay_read_pipe will get the ack bit from shift_reg_read_pipe_q[1] at the bit equal to
// added_read_pipe_max (0th or 1st bit). added_read_pipe_max is the max number of added controller clk cycles among all lanes
// So basically, the delay_read_pipe is the delay to make sure the "added_read_pipe_max" controller clk cycles
// will be met.
// Example:
// So for request #1 (e.g. write request, added_read_pipe_max=1), wait until the shift_reg_read_pipe_q[1] goes
// high (READ_ACK_PIPE_WIDTH of delay is met which means the data from ISERDES PHY is now available). The
// delay_read_pipe[0][1] will then be high. This high bit on read_pipe #0 will get shifted to LSB. [1] -> [] -> [LSB]
// Meanwhile when request #2 comes (e.g. read request, added_read_pipe_max=1), again wait until the shift_reg_read_pipe_q[1] goes
// high. The delay_read_pipe[1][1] will then be high. This high bit on read_pipe #1 will get shifted to LSB. [1] -> [] -> [LSB]
end
for(index = 0; index < LANES; index = index + 1) begin
/* verilator lint_off WIDTH */
if(delay_read_pipe[0][added_read_pipe_max != added_read_pipe[index]]) begin //same lane
// read_pipe #0
// NOTE: added_read_pipe_max and added_read_pipe can either be just 0 or 1 (NOTE TO SELF: optimize by lowering the bit size of this)
// If the added_read_pipe (added number of controller clk cycles of delay to a lane due to pcb trace) is equal to the
// max delay (added_read_pipe_max, e.g. 0-0 or 1-1) for this lane, THEN we need to wait until the bit 1 reaches the LSB[0] of delay_read_pipe
// before retrieving the value from PHY. But if not the same (added_read_pipe is 0 while added_read_pipe_max is 1), then wait until
// the bit 1 reaches the one before LSB [1] before retrieving the value from PHY, so this means this lane with 0 delay will FIRST BE RETRIEVED
// while the lane with added_read_pipe_max of delay (delay of 1) will be retrieved SECOND
if(delay_read_pipe[0][added_read_pipe_max != added_read_pipe[index]]) begin
/* verilator lint_on WIDTH */
o_wb_data_q[0][((DQ_BITS*LANES)*0 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*0 + 8*index) +: 8]; //update each lane of the burst
o_wb_data_q[0][((DQ_BITS*LANES)*1 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*1 + 8*index) +: 8]; //update each lane of the burst
o_wb_data_q[0][((DQ_BITS*LANES)*2 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*2 + 8*index) +: 8]; //update each lane of the burst
o_wb_data_q[0][((DQ_BITS*LANES)*3 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*3 + 8*index) +: 8]; //update each lane of the burst
o_wb_data_q[0][((DQ_BITS*LANES)*4 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*4 + 8*index) +: 8]; //update each lane of the burst
o_wb_data_q[0][((DQ_BITS*LANES)*5 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*5 + 8*index) +: 8]; //update each lane of the burst
o_wb_data_q[0][((DQ_BITS*LANES)*6 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*6 + 8*index) +: 8]; //update each lane of the burst
o_wb_data_q[0][((DQ_BITS*LANES)*7 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*7 + 8*index) +: 8]; //update each lane of the burst
o_wb_data_q[0][((DQ_BITS*LANES)*0 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*0 + 8*index) +: 8]; //update lane for burst 0
o_wb_data_q[0][((DQ_BITS*LANES)*1 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*1 + 8*index) +: 8]; //update lane for burst 1
o_wb_data_q[0][((DQ_BITS*LANES)*2 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*2 + 8*index) +: 8]; //update lane for burst 2
o_wb_data_q[0][((DQ_BITS*LANES)*3 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*3 + 8*index) +: 8]; //update lane for burst 3
o_wb_data_q[0][((DQ_BITS*LANES)*4 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*4 + 8*index) +: 8]; //update lane for burst 4
o_wb_data_q[0][((DQ_BITS*LANES)*5 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*5 + 8*index) +: 8]; //update lane for burst 5
o_wb_data_q[0][((DQ_BITS*LANES)*6 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*6 + 8*index) +: 8]; //update lane for burst 6
o_wb_data_q[0][((DQ_BITS*LANES)*7 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*7 + 8*index) +: 8]; //update lane for burst 7
end
/* verilator lint_off WIDTH */
// read_pipe #1
// NOTE: added_read_pipe_max and added_read_pipe can either be just 0 or 1 (NOTE TO SELF: optimize by lowering the bit size of this)
// If the added_read_pipe (added number of controller clk cycles of delay to a lane due to pcb trace) is equal to the
// max delay (added_read_pipe_max, e.g. 0-0 or 1-1) for this lane, THEN we need to wait until the bit 1 reaches the LSB[0] of delay_read_pipe
// before retrieving the value from PHY. But if not the same (added_read_pipe is 0 while added_read_pipe_max is 1), then wait until
// the bit 1 reaches the one before LSB [1] (which goes high already since bit 1 of delay_read_pipe is the first to go high once shift_reg_read_pipe_q
// bit 1 goes high) before retrieving the value from PHY. So this means this lane with 0 delay will FIRST BE RETRIEVED
// while the lane with added_read_pipe_max of delay (delay of 1) will be retrieved SECOND
if(delay_read_pipe[1][added_read_pipe_max != added_read_pipe[index]]) begin
/* verilator lint_on WIDTH */
o_wb_data_q[1][((DQ_BITS*LANES)*0 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*0 + 8*index) +: 8]; //update each lane of the burst
@ -1230,17 +1324,49 @@ module ddr3_controller #(
o_wb_data_q[1][((DQ_BITS*LANES)*6 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*6 + 8*index) +: 8]; //update each lane of the burst
o_wb_data_q[1][((DQ_BITS*LANES)*7 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*7 + 8*index) +: 8]; //update each lane of the burst
end
// why are we alternatingly use the read_pipes?
// time | 0 | 1 | 2 | 3 |
// index_read_pipe | 1 | 0 | 1 | 0 |
// delay_read_pipe[1] | [0][0] | <[1][0] | [0][1]> | [1][0] |
// delay_read_pipe[0] | [0][0] | [0][0] | <[1][0] | [0][1]> |
//
// At time 1, request #1 is accepted by pipe[1], then wait until time 2 to retrieve the lane with longest added_read_pipe
// (lane with added_read_pipe 0 is retrieved from ISERDES at time 1, lane with added_read_pipe 1 is retrieved from ISERDES at time 2)
// At time 2, request #2 is accepted by pipe[0] (since pipe[1] is still busy on request #1), then wait until time 3 to retrieve
// the lane with longest added_read_pipe
// Thus pipe 0 and 1 is alternating to make sure that even if 0 is busy, 1 will retrieve the data. And then when 1 is busy, 0 will retrieve the data
// NOTE TO SELF: longest delay for a lane relative to others is 1 controller clk cycle, if more than 1 then it will not be retrievd
// and aligned properly. Thus try to optimize this logic since delay is at max 1 controller clk only.
end
// o_wb_ack_read_q[0][0] is also the wishbone ack (aligned with ack is the wishbone data) thus
// after sending the wishbone data on a particular index, invert it for the next ack
if(o_wb_ack_read_q[0][0]) begin
index_wb_data <= !index_wb_data;
index_wb_data <= !index_wb_data; //alternatingly uses the o_wb_data_q (either 0 or 1)
end
for(index = 1; index < MAX_ADDED_READ_ACK_DELAY; index = index + 1) begin
o_wb_ack_read_q[index-1] <= o_wb_ack_read_q[index];
o_wb_ack_read_q[index-1] <= o_wb_ack_read_q[index]; // shift rightward [ack] -> [] -> [LSB]
end
o_wb_ack_read_q[MAX_ADDED_READ_ACK_DELAY-1] <= 0;
o_wb_ack_read_q[MAX_ADDED_READ_ACK_DELAY-1] <= 0; // MSB always gets zero and is shifted rightwards
o_wb_ack_read_q[added_read_pipe_max] <= shift_reg_read_pipe_q[0];
// o_wb_ack_read_q[0] is the wishbone ack
// so once data is available from ISERDES (shift_reg_read_pipe_q[0] high) then need to wait added_read_pipe_max
// before the data is properly stored to o_wb_data_q and can be sent outside as wishbone data
//abort any outgoing ack when cyc is low
// BASICALLY:
// shift_reg_read_pipe_q is the delay from when the read command is issued from controller until the
// data is received by the PHY ISERDES (total delay of READ_ACK_PIPE_WIDTH). The shift_reg_read_pipe_q[1]
// is then connected to delay_read_pipe[added_read_pipe_max (0 or 1)] which is the delay to align the lanes.
// The shift_reg_read_pipe_q[0] is then connected to o_wb_ack_read_q[added_read_pipe_max (0 or 1)] which
// is the delay until the wishbone data and ack will be sent outside
// NOTE TO SELF: Optimize by removing o_wb_ack_read_q and just connect the woshbone ack and data to delay_read_pipe[0].
// Visualization:
// shift_reg_read_pipe_q [ ] -> [1] -> [ ] [ ] -> [ ] -> [1] [ ] -> [ ] -> [ ] [ ] -> [ ] -> [ ]
// delay_read_pipe [ ] -> [ ] -> [ ] ---> [ ] -> [1] -> [ ] ---> [ ] -> [ ] -> [1] ---> [ ] -> [ ] -> [ ]
// o_wb_ack_read_q [ ] -> [ ] -> [ ] [ ] -> [ ] -> [ ] [ ] -> [1] -> [ ] [ ] -> [ ] -> [1]
// request is now on [1] request passed request passed o_wb_ack_read_q[0]
// of shift_reg_read_pipe_q to delay_read_pipe to o_wb_ack_read_q high thus ready for wishbone ack
// abort any outgoing ack when cyc is low
if(!i_wb_cyc && state_calibrate == DONE_CALIBRATE) begin
for(index = 0; index < MAX_ADDED_READ_ACK_DELAY; index = index + 1) begin
o_wb_ack_read_q[index] <= 0;
@ -1252,7 +1378,7 @@ module ddr3_controller #(
end
end
assign o_wb_ack = o_wb_ack_read_q[0][0] && state_calibrate == DONE_CALIBRATE;
//o_wb_ack_read_q[0][0] is needed internally for write calibration but it must not go outside (since it is not an actual user wb request unless we are in DONE_CALIBRATE)
//o_wb_ack_read_q[0][0] is needed internally to ack during write calibration but it must not go outside (since it is not an actual user wb request unless we are in DONE_CALIBRATE)
assign o_aux = o_wb_ack_read_q[0][AUX_WIDTH:1];
assign o_wb_data = o_wb_data_q[index_wb_data];
assign o_phy_dqs_tri_control = !write_dqs[STAGE2_DATA_DEPTH];
@ -1360,15 +1486,22 @@ module ddr3_controller #(
idelay_data_cntvaluein[lane] <= o_phy_idelay_data_ld[lane]? idelay_data_cntvaluein[lane] + 1: idelay_data_cntvaluein[lane];
idelay_dqs_cntvaluein[lane] <= o_phy_idelay_dqs_ld[lane]? idelay_dqs_cntvaluein[lane] + 1: idelay_dqs_cntvaluein[lane];
end
// high initial_dqs is the time when the IDELAY of dqs and dq is not yet calibrated
// dqs_target_index_value = dqs_start_index_stored[0]? dqs_start_index_stored + 2: dqs_start_index_stored + 1; // move to next odd (if 3 then 5, if 4 then 5)
// so dqs_target_index_value is basically the next odd number of dqs_start_index_stored (original starting bit when dqs starts).
// The next odd number ensure that the DQS edge is aligned to edge of ddr3_clk (and thus DQ data eye is aligned to edges of of ddr3_clk
// since dq is 90 degree relative to dqs
// Some images to show why next odd number is used: https://github.com/AngeloJacobo/UberDDR3/tree/b762c464f6526159c1d8c2e4ee039b4ae4e78dbd#per-lane-read-calibration
if(initial_dqs) begin
dqs_target_index <= dqs_target_index_value;
dq_target_index[lane] <= {1'b0, dqs_target_index_value};
dqs_target_index_orig <= dqs_target_index_value;
dqs_target_index <= dqs_target_index_value; // target index for DQS to make sure the DQS is edge-aligned with ddr3_clk
dq_target_index[lane] <= {1'b0, dqs_target_index_value}; // target index for DQ (just same as DQS)
dqs_target_index_orig <= dqs_target_index_value; // this will remain the same until we finish calibrating this whole lane
end
if(idelay_dqs_cntvaluein[lane] == 0) begin //go back to previous odd
if(idelay_dqs_cntvaluein[lane] == 0) begin //the DQS got past cntvalue of 31 (and goes back to zero) THUS the target index should also go back (to previous odd)
dqs_target_index <= dqs_target_index_orig - 2;
end
if(idelay_data_cntvaluein[lane] == 0 && idelay_data_cntvaluein_prev == 31) begin
if(idelay_data_cntvaluein[lane] == 0 && idelay_data_cntvaluein_prev == 31) begin //the DQ got past cntvalue of 31 (and goes back to zero) thus the target index should also go back (to previous odd)
dq_target_index[lane] <= dqs_target_index_orig - 2;
end
@ -1397,6 +1530,7 @@ module ddr3_controller #(
another Bitslip command. If the ISERDESE2 is reset, the Bitslip logic is also reset
and returns back to its initial state.
*/
// bitslip basically is changing the arrangement of bytes on IOSERDES
if(i_phy_iserdes_bitslip_reference[lane*serdes_ratio*2 +: 8] == 8'b0111_1000) begin //initial arrangement
state_calibrate <= MPR_READ;
initial_dqs <= 1;
@ -1412,28 +1546,42 @@ module ddr3_controller #(
MPR_READ: if(delay_before_read_data == 0) begin //align the incoming DQS during reads to the controller clock
//issue_read_command = 1;
/* verilator lint_off WIDTH */
delay_before_read_data <= READ_DELAY + 1 + 2 + 1 - 1; ///1=issue command delay (OSERDES delay), 2 = ISERDES delay
delay_before_read_data <= READ_DELAY + 1 + 2 + 1 - 1; ///NOTE TO SELF: why these numbers? 1=issue command delay (OSERDES delay), 2 = ISERDES delay
/* verilator lint_on WIDTH */
state_calibrate <= COLLECT_DQS;
dqs_count_repeat <= 0;
end
COLLECT_DQS: if(delay_before_read_data == 0) begin
COLLECT_DQS: if(delay_before_read_data == 0) begin // data from MPR read is now received by controller
// dqs from ISERDES is received and stored
// DQS received from ISERDES: { {LANE_1_burst_7, LANE_1_burst_6, ... , LANE_1_burst_0} , {LANE_0_burst_7, LANE_0_burst_6, ... , LANE_0_burst_0}}
// NOTE TO SELF: WHY DQS IS DIVIDED PER LANE BUT DQ IS PER BURST ????
// dqs_store stores the 8 DQS (8 bursts) for a given lane but since the DQS might be shifted at next ddr3 clk cycles (due to trace delays), we must store the
// 8 DQS multiple times (dictated by STORED_DQS_SIZE)
dqs_store <= {i_phy_iserdes_dqs[serdes_ratio*2*lane +: 8], dqs_store[(STORED_DQS_SIZE*8-1):8]};
dqs_count_repeat <= dqs_count_repeat + 1;
if(dqs_count_repeat == STORED_DQS_SIZE - 1) begin
state_calibrate <= ANALYZE_DQS;
dqs_start_index_stored <= dqs_start_index;
dqs_start_index <= 0;
// store the previous value of dqs_start_index, if the ANALYZE_DQS is repeated then the dqs_start_index
// should be the same as the previous value, else the previous (or current one) has a glitch causing
// a different dqs_start_index
dqs_start_index_stored <= dqs_start_index;
// start the index from zero since this will be incremented until we pinpoint the real
// starting bit of dqs_store (dictated by the pattern 10'b01_01_01_01_00)
dqs_start_index <= 0;
end
end
// find the bit where the DQS starts to be issued (by finding when the pattern 10'b01_01_01_01_00 starts)
ANALYZE_DQS: if(dqs_store[dqs_start_index +: 10] == 10'b01_01_01_01_00) begin
dqs_start_index_repeat <= (dqs_start_index == dqs_start_index_stored)? dqs_start_index_repeat + 1: 0; //increase dqs_start_index_repeat when index is the same as before
if(dqs_start_index_repeat == REPEAT_DQS_ANALYZE) begin //the same index appeared REPEAT_DQS_ANALYZE times in a row, thus can proceed to CALIBRATE_DQS
initial_dqs <= 0;
//increase dqs_start_index_repeat when index is the same as before
dqs_start_index_repeat <= (dqs_start_index == dqs_start_index_stored)? dqs_start_index_repeat + 1: 0;
//the same dqs_start_index_repeat appeared REPEAT_DQS_ANALYZE times in a row, thus we can trust the value we got is accurate and not affected by glitch
if(dqs_start_index_repeat == REPEAT_DQS_ANALYZE) begin
// since we already know the starting bit when the dqs (and dq since they are aligned) will come,
// we will now start calibrating the IDELAY for dqs and dq (via CALIBRATE_DQS).
// high initial_dqs is the time when the IDELAY of dqs and dq is not yet calibrated so we zero this starting now
initial_dqs <= 0;
dqs_start_index_repeat <= 0;
state_calibrate <= CALIBRATE_DQS;
end
else begin
@ -1451,23 +1599,40 @@ module ddr3_controller #(
dqs_start_index <= dqs_start_index + 1;
end
end
// check if the index when the dqs starts is the same as the target index which is aligned to the ddr3_clk
// dqs_target_index is the next odd number to dqs_start_index BEFORE IDELAY CALIBRATION. We will increase the IDELAY
// until the dqs_start_index_stored (current value of dqs_start_index) matches the target index which is aligned to ddr3_clk
CALIBRATE_DQS: if(dqs_start_index_stored == dqs_target_index) begin
// dq_target_index still stores the original dqs_target_index_value. The bit size of dq_target_index is just enough
// to count the bits in dqs_store (the received 8 DQS stored STORED_DQS_SIZE times)
added_read_pipe[lane] <= { {( 4 - ($clog2(STORED_DQS_SIZE*8) - (3+1)) ){1'b0}} , dq_target_index[lane][$clog2(STORED_DQS_SIZE*8)-1:(3+1)] }
+ { 3'b0 , (dq_target_index[lane][3:0] >= (5+8)) };
// if target_index is > 13, then a 1 CONTROLLLER_CLK cycle delay (4 ddr3_clk cycles) is added on that particular lane (due to trace delay)
// added_read_pipe[lane] <= dq_target_index[lane][$clog2(STORED_DQS_SIZE*8)-1 : (4)] + ( dq_target_index[lane][3:0] >= 13 ) ;
dqs_bitslip_arrangement <= 16'b0011_1100_0011_1100 >> dq_target_index[lane][2:0];
// the dqs is delayed (to move starting bit to next odd number) so this means the original
// expected bitslip arrangement of 8'b0111_1000 will not be followed anymore, so here we form the bitslip
// arrangement pattern so incoming dqs (and thus DQ) is arranged in the proper way (first bute firs, last byte last)
state_calibrate <= BITSLIP_DQS_TRAIN_2;
end
else begin
// if we have not yet reached the target index then increment IDELAY
// we will keep incrementing the IDELAY until the next odd index is reached (which is
// the time we are sure the DQS is edge aligned with ddr3_clk and thus ddr3_clk posedge
// is hitting the center of DQ eye
// To show why next odd number is needed: https://github.com/AngeloJacobo/UberDDR3/tree/b762c464f6526159c1d8c2e4ee039b4ae4e78dbd#per-lane-read-calibration
o_phy_idelay_data_ld[lane] <= 1;
o_phy_idelay_dqs_ld[lane] <= 1;
state_calibrate <= MPR_READ;
delay_before_read_data <= 10; //wait for sometime to make sure idelay load settles
end
//the dqs is delayed (to move starting bit to next odd number) so this means the original
// expected bitslip arrangement of 8'b0111_1000 will not be followed anymore, so here the bitslip
// is re-arranged
BITSLIP_DQS_TRAIN_2: if(train_delay == 0) begin //train again the ISERDES to capture the DQ correctly
if(i_phy_iserdes_bitslip_reference[lane*serdes_ratio*2 +: 8] == dqs_bitslip_arrangement[7:0]) begin
/* verilator lint_off WIDTH */
// this is the end of training and calibration for a single lane, so proceed to next lane
if(lane == LANES - 1) begin
/* verilator lint_on WIDTH */
pause_counter <= 0; //read calibration now complete so continue the reset instruction sequence
@ -1481,8 +1646,11 @@ module ddr3_controller #(
end
else begin
lane <= lane + 1;
state_calibrate <= BITSLIP_DQS_TRAIN_1;
state_calibrate <= BITSLIP_DQS_TRAIN_1;// current lane is done so go back to BITSLIP_DQS_TRAIN_1 to train next lane
end
// stores the highest value of added_read_pipe among the lanes since all lanes (except the lane with highest
// added_read_pipe) will be delayed to align with the lane with highest added_read_pipe. This alignment
// is required to make sure the received DQ will be aligned and can form the 512 bit data (for 8 lanes) arranged properly.
added_read_pipe_max <= added_read_pipe_max > added_read_pipe[lane]? added_read_pipe_max:added_read_pipe[lane];
end
else begin
@ -1490,7 +1658,7 @@ module ddr3_controller #(
train_delay <= 3;
end
end
// CONTINUE COMMENT HERE (once blog is done)
START_WRITE_LEVEL: if(!ODELAY_SUPPORTED) begin //skip write levelling if ODELAY is not supported
pause_counter <= 0;
lane <= 0;
@ -1548,7 +1716,14 @@ module ddr3_controller #(
calib_sel <= {wb_sel_bits{1'b1}};
calib_we <= 1; //write-enable
calib_addr <= 0;
// burst_7 burst_6 burst_5 burst_4 burst_3 burst_2 burst_1 burst_0
calib_data <= { {LANES{8'h91}}, {LANES{8'h77}}, {LANES{8'h29}}, {LANES{8'h8c}}, {LANES{8'hd0}}, {LANES{8'had}}, {LANES{8'h51}}, {LANES{8'hc1}} };
// write to address 0 is a burst of 8 writes, where all lanes has same data written: 64'h9177298cd0ad51c1
// Example for LANES of 2, DQ of 8: the 128 bits (8 bursts * 2 lanes/burst * 8bits/lane) are 8 bursts with 8 bytes each:
// { burst_7, burst_6, burst_5, burst_4, burst_3, burst_2, burst_1, burst_0 } OR:
// { { {burst7_lane1} , {burst7_lane0} } , { {burst6_lane1} , {burst6_lane0} } , { {burst5_lane1} , {burst5_lane0} }
// , { {burst4_lane1} , {burst4_lane0} } , { {burst3_lane1} , {burst3_lane0} } , { {burst2_lane1} , {burst2_lane0} }
// , { {burst1_lane1} , {burst1_lane0} } , { {burst0_lane1} , {burst0_lane0} } }
state_calibrate <= ISSUE_WRITE_2;
end
ISSUE_WRITE_2: begin
@ -1557,9 +1732,18 @@ module ddr3_controller #(
calib_sel <= {wb_sel_bits{1'b1}};
calib_we <= 1; //write-enable
calib_addr <= 1;
calib_data <= { {LANES{8'h80}}, {LANES{8'hdb}}, {LANES{8'hcf}}, {LANES{8'hd2}}, {LANES{8'h75}}, {LANES{8'hf1}}, {LANES{8'h2c}}, {LANES{8'h3d}} };
// burst_7 burst_6 burst_5 burst_4 burst_3 burst_2 burst_1 burst_0
calib_data <= { {LANES{8'h80}}, {LANES{8'hdb}}, {LANES{8'hcf}}, {LANES{8'hd2}}, {LANES{8'h75}}, {LANES{8'hf1}}, {LANES{8'h2c}}, {LANES{8'h3d}} };
// write to address 1 is also a burst of 8 writes, where all lanes has same data written: 128'h80dbcfd275f12c3d
state_calibrate <= ISSUE_READ;
end
end
// NOTE: WHY THERE ARE TWO ISSUE_WRITE
// address 0 and 1 is written with a deterministic data, if the DQ trace has long delay (relative to command line) then the data will be delayed
// compared to the write command. Thus the data aligned to the write command for address 0 MIGHT START AT MIDDLE OF EXPECTED OUTPUT
// DATA (64'h9177298cd0ad51c1) e.g. the data written might be 64'h[2c3d][9177298cd0ad] where the data written starts
// at burst 2 (burst 0 and burst 1 are cut-off since each burst uses 1 ddr3_clk cycle)
// Note here that if DQ and DQS sa same delay, then we know the DQS will always be aligned with DQ data
ISSUE_READ: begin
calib_stb <= 1;//actual request flag
calib_aux <= 1; //AUX ID to determine later if ACK is for read or write
@ -1569,7 +1753,7 @@ module ddr3_controller #(
end
READ_DATA: if(o_wb_ack_read_q[0] == {{(AUX_WIDTH-2){1'b0}}, 2'd1, 1'b1}) begin //wait for the read ack (which has AUX ID of 1}
read_data_store <= o_wb_data;
read_data_store <= o_wb_data; // read data on address 0
calib_stb <= 0;
state_calibrate <= ANALYZE_DATA;
data_start_index[lane] <= 0;
@ -1583,7 +1767,12 @@ module ddr3_controller #(
else if(!o_wb_stall_calib) begin
calib_stb <= 0;
end
// extract burst_0-to-burst_7 data for a specified lane then determine which byte in write_pattern does it starts
// NOTE TO SELF: all "8" here assume DQ_BITS are 8? parameterize this properly
// data_start_index for a specified lane determine how many bits are off the data from the write command
// so for every 1 ddr3 clk cycle delay of DQ from write command, each lane will be 1 burst off:
// e.g. LANE={burst7, burst6, burst5, burst4, burst3, burst2, burst1, burst0} then with 1 ddr3 cycle delay between DQ and command
// burst0 will not be written but only starting on burst1
ANALYZE_DATA: if(write_pattern[data_start_index[lane] +: 64] == {read_data_store[((DQ_BITS*LANES)*7 + 8*lane) +: 8], read_data_store[((DQ_BITS*LANES)*6 + 8*lane) +: 8],
read_data_store[((DQ_BITS*LANES)*5 + 8*lane) +: 8], read_data_store[((DQ_BITS*LANES)*4 + 8*lane) +: 8], read_data_store[((DQ_BITS*LANES)*3 + 8*lane) +: 8],
read_data_store[((DQ_BITS*LANES)*2 + 8*lane) +: 8],read_data_store[((DQ_BITS*LANES)*1 + 8*lane) +: 8],read_data_store[((DQ_BITS*LANES)*0 + 8*lane) +: 8] }) begin
@ -1598,7 +1787,7 @@ module ddr3_controller #(
end
end
else begin
data_start_index[lane] <= data_start_index[lane] + 8;
data_start_index[lane] <= data_start_index[lane] + 8; //skip by 8
if(data_start_index[lane] == 56) begin //reached the end but no byte in write-pattern matches the data read, issue might be reading at wrong DQS toggle
data_start_index[lane] <= 0; //so we need to recalibrate the bitslip
start_index_check <= 0;
@ -1710,7 +1899,7 @@ BITSLIP_DQS_TRAIN_3: if(train_delay == 0) begin //train again the ISERDES to cap
//if(write_test_address_counter[wb_addr_bits-1:0] == 500) begin //inject error at middle
// calib_data <= 1;
//end
if(write_test_address_counter[wb_addr_bits-1:0] == 999 ) begin //MUST END AT ODD NUMBER
if(write_test_address_counter[wb_addr_bits-1:0] == 99 ) begin //MUST END AT ODD NUMBER
state_calibrate <= BURST_READ;
end
end
@ -1733,7 +1922,7 @@ BITSLIP_DQS_TRAIN_3: if(train_delay == 0) begin //train again the ISERDES to cap
calib_addr <= read_test_address_counter;
read_test_address_counter <= read_test_address_counter + 1;
if(MICRON_SIM) begin
if(read_test_address_counter == 999) begin //MUST END AT ODD NUMBER
if(read_test_address_counter == 99) begin //MUST END AT ODD NUMBER
state_calibrate <= RANDOM_WRITE;
end
end
@ -1759,7 +1948,7 @@ BITSLIP_DQS_TRAIN_3: if(train_delay == 0) begin //train again the ISERDES to cap
//if(write_test_address_counter[wb_addr_bits-1:0] == 1500) begin //inject error
// calib_data <= 1;
//end
if(write_test_address_counter[wb_addr_bits-1:0] == 1999) begin //MUST END AT ODD NUMBER since ALTERNATE_WRITE_READ must start at even
if(write_test_address_counter[wb_addr_bits-1:0] == 199) begin //MUST END AT ODD NUMBER since ALTERNATE_WRITE_READ must start at even
state_calibrate <= RANDOM_READ;
end
end
@ -1784,7 +1973,7 @@ BITSLIP_DQS_TRAIN_3: if(train_delay == 0) begin //train again the ISERDES to cap
<= read_test_address_counter[wb_addr_bits-1:ROW_BITS];
read_test_address_counter <= read_test_address_counter + 1;
if(MICRON_SIM) begin
if(read_test_address_counter == 1999) begin //MUST END AT ODD NUMBER since ALTERNATE_WRITE_READ must start at even
if(read_test_address_counter == 199) begin //MUST END AT ODD NUMBER since ALTERNATE_WRITE_READ must start at even
state_calibrate <= ALTERNATE_WRITE_READ;
end
end
@ -1803,7 +1992,7 @@ ALTERNATE_WRITE_READ: if(!o_wb_stall_calib) begin
calib_addr <= {1'b0, write_test_address_counter[wb_addr_bits-1:1]}; //same address to be used for write and read ( so basically write then read instantly)
calib_data <= {wb_sel_bits{write_test_address_counter[7:0]}};
if(MICRON_SIM) begin
if(write_test_address_counter == 4999) begin
if(write_test_address_counter == 499) begin
train_delay <= 15;
state_calibrate <= FINISH_READ;
end
@ -1858,12 +2047,12 @@ ALTERNATE_WRITE_READ: if(!o_wb_stall_calib) begin
end
end
assign issue_read_command = (state_calibrate == MPR_READ && delay_before_read_data == 0);
assign issue_write_command = 0;
assign o_phy_odelay_data_cntvaluein = odelay_data_cntvaluein[lane];
assign o_phy_odelay_dqs_cntvaluein = odelay_dqs_cntvaluein[lane];
assign o_phy_idelay_data_cntvaluein = idelay_data_cntvaluein[lane];
assign o_phy_idelay_dqs_cntvaluein = idelay_dqs_cntvaluein[lane];
assign dqs_target_index_value = dqs_start_index_stored[0]? dqs_start_index_stored + 2: dqs_start_index_stored + 1;
assign dqs_target_index_value = dqs_start_index_stored[0]? dqs_start_index_stored + 2: dqs_start_index_stored + 1; // move to next odd (if 3 then 5, if 4 then 5)
// To show why next odd number is needed: https://github.com/AngeloJacobo/UberDDR3/tree/b762c464f6526159c1d8c2e4ee039b4ae4e78dbd#per-lane-read-calibration
reg[31:0] wb_data_to_wb2 = 0;
always @(posedge i_controller_clk) begin
if(o_wb_ack_read_q[0][0]) wb_data_to_wb2 <= o_wb_data[31:0]; //save data read
@ -2193,7 +2382,7 @@ ALTERNATE_WRITE_READ: if(!o_wb_stall_calib) begin
// find anticipate activate command slot number
if(CL_nCK > CWL_nCK) slot_number = read_slot;
else slot_number = write_slot;
delay = ps_to_nCK($rtoi(tRCD));
delay = ps_to_nCK(tRCD);
for(slot_number = slot_number; delay != 0; delay = delay - 1) begin
slot_number = slot_number - 1'b1;
end

View File

@ -1,6 +1,6 @@
`default_nettype none
`timescale 1ps / 1ps
//`define DEBUG_DQS
//`define DEBUG_DQS // uncomment to route the raw DQS to output port for debugging
module ddr3_phy #(
parameter CONTROLLER_CLK_PERIOD = 10_000, //ps, clock period of the controller interface
@ -12,7 +12,7 @@ module ddr3_phy #(
parameter[0:0] ODELAY_SUPPORTED = 1, //set to 1 when ODELAYE2 is supported
USE_IO_TERMINATION = 0, //use IOBUF_DCIEN and IOBUFDS_DCIEN when 1
// The next parameters act more like a localparam (since user does not have to set this manually) but was added here to simplify port declaration
parameter serdes_ratio = $rtoi(CONTROLLER_CLK_PERIOD/DDR3_CLK_PERIOD),
parameter serdes_ratio = 4, // this controller is fixed as a 4:1 memory controller (CONTROLLER_CLK_PERIOD/DDR3_CLK_PERIOD = 4)
wb_data_bits = DQ_BITS*LANES*serdes_ratio*2,
wb_sel_bits = wb_data_bits / 8,
//4 is the width of a single ddr3 command {cs_n, ras_n, cas_n, we_n} plus 3 (ck_en, odt, reset_n) plus bank bits plus row bits

View File

@ -20,7 +20,7 @@ module ddr3_top #(
ODELAY_SUPPORTED = 1, //set to 1 when ODELAYE2 is supported
SECOND_WISHBONE = 0, //set to 1 if 2nd wishbone is needed
parameter // The next parameters act more like a localparam (since user does not have to set this manually) but was added here to simplify port declaration
serdes_ratio = $rtoi(CONTROLLER_CLK_PERIOD/DDR3_CLK_PERIOD),
serdes_ratio = 4, // this controller is fixed as a 4:1 memory controller (CONTROLLER_CLK_PERIOD/DDR3_CLK_PERIOD = 4)
wb_addr_bits = ROW_BITS + COL_BITS + BA_BITS - $clog2(serdes_ratio*2),
wb_data_bits = DQ_BITS*LANES*serdes_ratio*2,
wb_sel_bits = wb_data_bits / 8,

View File

@ -53,8 +53,8 @@ module ddr3_dimm_micron_sim;
`endif
localparam CONTROLLER_CLK_PERIOD = 10, //ns, period of clock input to this DDR3 controller module
DDR3_CLK_PERIOD = 2.5, //ns, period of clock input to DDR3 RAM device
localparam CONTROLLER_CLK_PERIOD = 10_000, //ps, period of clock input to this DDR3 controller module
DDR3_CLK_PERIOD = 2500, //ps, period of clock input to DDR3 RAM device
AUX_WIDTH = 16, // AUX lines
OPT_LOWPOWER = 1, //1 = low power, 0 = low logic
OPT_BUS_ABORT = 1;
@ -126,13 +126,13 @@ module ddr3_dimm_micron_sim;
`else
assign clk_locked = 1;
always #(CONTROLLER_CLK_PERIOD*1000/2) i_controller_clk = !i_controller_clk;
always #(DDR3_CLK_PERIOD*1000/2) i_ddr3_clk = !i_ddr3_clk;
always #(CONTROLLER_CLK_PERIOD/2) i_controller_clk = !i_controller_clk;
always #(DDR3_CLK_PERIOD/2) i_ddr3_clk = !i_ddr3_clk;
always #2500 i_ref_clk = !i_ref_clk;
initial begin //90 degree phase shifted ddr3_clk
#(DDR3_CLK_PERIOD*1000/4);
#(DDR3_CLK_PERIOD/4);
while(1) begin
#(DDR3_CLK_PERIOD*1000/2) i_ddr3_clk_90 = !i_ddr3_clk_90;
#(DDR3_CLK_PERIOD/2) i_ddr3_clk_90 = !i_ddr3_clk_90;
end
end
initial begin