Merge pull request #1 from AngeloJacobo/dev

Dev branch pull request
This commit is contained in:
Angelo Jacobo 2024-04-20 15:18:30 +08:00 committed by GitHub
commit a354a8d4ef
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 302 additions and 113 deletions

View File

@ -17,10 +17,10 @@
// Engineer: Angelo C. Jacobo // Engineer: Angelo C. Jacobo
// //
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// NOTE TO SELF are questions which I still need to answer
// Comments are continuously added on this RTL for better readability
//`define FORMAL_COVER //skip reset sequence during formal verification to fit in cover depth
//`define FORMAL_COVER //skip reset sequence to fit in cover depth
`default_nettype none `default_nettype none
`timescale 1ps / 1ps `timescale 1ps / 1ps
// //
@ -52,7 +52,7 @@ module ddr3_controller #(
ODELAY_SUPPORTED = 1, //set to 1 when ODELAYE2 is supported ODELAY_SUPPORTED = 1, //set to 1 when ODELAYE2 is supported
SECOND_WISHBONE = 0, //set to 1 if 2nd wishbone is needed SECOND_WISHBONE = 0, //set to 1 if 2nd wishbone is needed
parameter // The next parameters act more like a localparam (since user does not have to set this manually) but was added here to simplify port declaration parameter // The next parameters act more like a localparam (since user does not have to set this manually) but was added here to simplify port declaration
serdes_ratio = $rtoi(CONTROLLER_CLK_PERIOD/DDR3_CLK_PERIOD), serdes_ratio = 4, // this controller is fixed as a 4:1 memory controller (CONTROLLER_CLK_PERIOD/DDR3_CLK_PERIOD = 4)
wb_data_bits = DQ_BITS*LANES*serdes_ratio*2, wb_data_bits = DQ_BITS*LANES*serdes_ratio*2,
wb_addr_bits = ROW_BITS + COL_BITS + BA_BITS - $clog2(serdes_ratio*2), wb_addr_bits = ROW_BITS + COL_BITS + BA_BITS - $clog2(serdes_ratio*2),
wb_sel_bits = wb_data_bits / 8, wb_sel_bits = wb_data_bits / 8,
@ -157,7 +157,7 @@ module ddr3_controller #(
// delayed and center-aligned to the center eye of data) // delayed and center-aligned to the center eye of data)
localparam DATA_INITIAL_ODELAY_TAP = 0; localparam DATA_INITIAL_ODELAY_TAP = 0;
//DQS needs to be edge-aligned to the center eye of the data. //Posedge of DQS needs to be aligned to the center eye of the data.
//This means DQS needs to be delayed by a quarter of the ddr3 //This means DQS needs to be delayed by a quarter of the ddr3
//clk period relative to the data. Subtract by 600ps to include //clk period relative to the data. Subtract by 600ps to include
//the IODELAY insertion delay. Divide by a delay resolution of //the IODELAY insertion delay. Divide by a delay resolution of
@ -179,6 +179,16 @@ module ddr3_controller #(
localparam tRCD = 13_750; // ps Active to Read/Write command time localparam tRCD = 13_750; // ps Active to Read/Write command time
localparam tRP = 13_750; // ps Precharge command period localparam tRP = 13_750; // ps Precharge command period
localparam tRAS = 35_000; // ps ACT to PRE command period localparam tRAS = 35_000; // ps ACT to PRE command period
`elsif DDR3_1333_9_9_9 //DDR3-1333 (9-9-9) speed bin
localparam tRCD = 13_500; // ps Active to Read/Write command time
localparam tRP = 13_500; // ps Precharge command period
localparam tRAS = 36_000; // ps ACT to PRE command period
`elsif DDR3_1066_7_7_7 //DDR3-1066 (7-7-7) speed bin
localparam tRCD = 13_125; // ps Active to Read/Write command time
localparam tRP = 13_125; // ps Precharge command period
localparam tRAS = 37_500; // ps ACT to PRE command period
`else
"Throw an error here if speed bin is not recognized (or not defined)"
`endif `endif
`ifdef RAM_1Gb `ifdef RAM_1Gb
@ -187,9 +197,12 @@ module ddr3_controller #(
localparam tRFC = 160_000; // ps Refresh command to ACT or REF localparam tRFC = 160_000; // ps Refresh command to ACT or REF
`elsif RAM_4Gb `elsif RAM_4Gb
localparam tRFC = 300_000; // ps Refresh command to ACT or REF localparam tRFC = 300_000; // ps Refresh command to ACT or REF
`else `elsif RAM_8Gb
localparam tRFC = 350_000; // ps Refresh command to ACT or REF localparam tRFC = 350_000; // ps Refresh command to ACT or REF
`else
"Throw an error here if capacity is not recognized (or not defined)"
`endif `endif
localparam tREFI = 7_800_000; //ps Average periodic refresh interval localparam tREFI = 7_800_000; //ps Average periodic refresh interval
localparam tXPR = max(5*DDR3_CLK_PERIOD, tRFC+10_000); // ps Exit Reset from CKE HIGH to a valid command localparam tXPR = max(5*DDR3_CLK_PERIOD, tRFC+10_000); // ps Exit Reset from CKE HIGH to a valid command
localparam tWR = 15_000; // ps Write Recovery Time localparam tWR = 15_000; // ps Write Recovery Time
@ -247,6 +260,8 @@ module ddr3_controller #(
//Also, worscase is when the anticipated bank still has the leftover of the //Also, worscase is when the anticipated bank still has the leftover of the
//WRITE_TO_PRECHARGE_DELAY thus consider also this. //WRITE_TO_PRECHARGE_DELAY thus consider also this.
localparam MARGIN_BEFORE_ANTICIPATE = PRECHARGE_TO_ACTIVATE_DELAY + ACTIVATE_TO_WRITE_DELAY + WRITE_TO_PRECHARGE_DELAY; localparam MARGIN_BEFORE_ANTICIPATE = PRECHARGE_TO_ACTIVATE_DELAY + ACTIVATE_TO_WRITE_DELAY + WRITE_TO_PRECHARGE_DELAY;
// STAGE2_DATA_DEPTH is the number of controller clk cycles of delay before issuing the data after the write command
// depends on the CWL_nCK
localparam STAGE2_DATA_DEPTH = (CWL_nCK - (3 - WRITE_SLOT + 1))/4 + 1; //this is always >= 1 (5 - (3 - 3 + 1))/4.0 -> floor(1) + 1 = floor(4 localparam STAGE2_DATA_DEPTH = (CWL_nCK - (3 - WRITE_SLOT + 1))/4 + 1; //this is always >= 1 (5 - (3 - 3 + 1))/4.0 -> floor(1) + 1 = floor(4
`ifdef FORMAL `ifdef FORMAL
wire stage2_data_depth; wire stage2_data_depth;
@ -255,7 +270,10 @@ module ddr3_controller #(
assert(STAGE2_DATA_DEPTH-2 >= 0); assert(STAGE2_DATA_DEPTH-2 >= 0);
end end
`endif `endif
localparam READ_DELAY = $rtoi($floor((CL_nCK - (3 - READ_SLOT + 1))/4.0 )); localparam READ_DELAY = $rtoi($floor((CL_nCK - (3 - READ_SLOT + 1))/4.0 )); // how many controller clk cycles to satisfy CL_nCK of ddr3_clk cycles
// READ_ACK_PIPE_WIDTH is the delay between read command issued (starting from the controller) until the data is received by the controller
//the delays included the ODELAY and OSERDES when issuing the read command
//and the IDELAY and ISERDES when receiving the data (NOTE TO SELF: ELABORATE ON WHY THOSE MAGIC NUMBERS)
localparam READ_ACK_PIPE_WIDTH = READ_DELAY + 1 + 2 + 1 + 1; localparam READ_ACK_PIPE_WIDTH = READ_DELAY + 1 + 2 + 1 + 1;
localparam MAX_ADDED_READ_ACK_DELAY = 16; localparam MAX_ADDED_READ_ACK_DELAY = 16;
localparam DELAY_BEFORE_WRITE_LEVEL_FEEDBACK = STAGE2_DATA_DEPTH + ps_to_cycles(tWLO+tWLOE) + 10; //plus 10 controller clocks for possible bus latency and localparam DELAY_BEFORE_WRITE_LEVEL_FEEDBACK = STAGE2_DATA_DEPTH + ps_to_cycles(tWLO+tWLOE) + 10; //plus 10 controller clocks for possible bus latency and
@ -350,7 +368,6 @@ module ddr3_controller #(
reg reset_done = 0; //high if reset has already finished reg reset_done = 0; //high if reset has already finished
reg pause_counter = 0; reg pause_counter = 0;
wire issue_read_command; wire issue_read_command;
wire issue_write_command;
reg stage2_update = 1; reg stage2_update = 1;
reg stage2_stall = 0; reg stage2_stall = 0;
reg stage1_stall = 0; reg stage1_stall = 0;
@ -429,13 +446,16 @@ module ddr3_controller #(
/* verilator lint_off UNUSEDSIGNAL */ /* verilator lint_off UNUSEDSIGNAL */
(* mark_debug = "true" *) reg[3:0] added_read_pipe_max = 0; (* mark_debug = "true" *) reg[3:0] added_read_pipe_max = 0;
(* mark_debug = "true" *) reg[3:0] added_read_pipe[LANES - 1:0]; (* mark_debug = "true" *) reg[3:0] added_read_pipe[LANES - 1:0];
//each lane will have added delay relative to when ISERDES should actually return the data
//this make sure that we will wait until the lane with longest delay (added_read_pipe_max) is received before
//all lanes are sent to wishbone data
//contains the ack shift reg for both read and write //contains the ack shift reg for both read and write
reg[AUX_WIDTH:0] shift_reg_read_pipe_q[READ_ACK_PIPE_WIDTH-1:0]; reg[AUX_WIDTH:0] shift_reg_read_pipe_q[READ_ACK_PIPE_WIDTH-1:0];
reg[AUX_WIDTH:0] shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1:0]; //1=issue command delay (OSERDES delay), 2 = ISERDES delay reg[AUX_WIDTH:0] shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1:0]; //issue ack and AUX value , 1=issue command delay (OSERDES delay), 2 = ISERDES delay
reg index_read_pipe; //tells which delay_read_pipe will be updated reg index_read_pipe; //tells which delay_read_pipe will be updated (there are two delay_read_pipe)
reg index_wb_data; //tells which o_wb_data_q will be sent to o_wb_data reg index_wb_data; //tells which o_wb_data_q will be sent to o_wb_data
reg[15:0] delay_read_pipe[1:0]; //delay when each lane will retrieve i_phy_iserdes_data reg[15:0] delay_read_pipe[1:0]; //delay when each lane will retrieve i_phy_iserdes_data (since different lanes might not be aligned with each other and needs to be retrieved at a different time)
reg[wb_data_bits - 1:0] o_wb_data_q[1:0]; //store data retrieved from i_phy_iserdes_data to be sent to o_wb_data reg[wb_data_bits - 1:0] o_wb_data_q[1:0]; //store data retrieved from i_phy_iserdes_data to be sent to o_wb_data
reg[AUX_WIDTH:0] o_wb_ack_read_q[MAX_ADDED_READ_ACK_DELAY-1:0]; reg[AUX_WIDTH:0] o_wb_ack_read_q[MAX_ADDED_READ_ACK_DELAY-1:0];
@ -807,8 +827,8 @@ module ddr3_controller #(
//stage2_data -> shiftreg(CWL) -> OSERDES(DDR) -> ODELAY -> RAM //stage2_data -> shiftreg(CWL) -> OSERDES(DDR) -> ODELAY -> RAM
end end
if(!ODELAY_SUPPORTED) begin if(!ODELAY_SUPPORTED) begin
stage2_data_unaligned <= stage2_data_unaligned_temp; //delayed by 1 clock cycle stage2_data_unaligned <= stage2_data_unaligned_temp; //_temp is for added delay of 1 clock cycle (no ODELAY so no added delay)
stage2_dm_unaligned <= stage2_dm_unaligned_temp; //delayed by 1 clock cycle stage2_dm_unaligned <= stage2_dm_unaligned_temp; //_temp is for added delay of 1 clock cycle (no ODELAY so no added delay)
end end
// when not in refresh, transaction can only be processed when i_wb_cyc is high and not stall // when not in refresh, transaction can only be processed when i_wb_cyc is high and not stall
@ -834,6 +854,7 @@ module ddr3_controller #(
/* verilator lint_on WIDTH */ /* verilator lint_on WIDTH */
stage1_data <= i_wb_data; stage1_data <= i_wb_data;
end end
// request from calibrate FSM will be accepted here
else if(state_calibrate != DONE_CALIBRATE && !o_wb_stall_calib) begin else if(state_calibrate != DONE_CALIBRATE && !o_wb_stall_calib) begin
stage1_pending <= calib_stb;//actual request flag stage1_pending <= calib_stb;//actual request flag
stage1_we <= calib_we; //write-enable stage1_we <= calib_we; //write-enable
@ -850,6 +871,8 @@ module ddr3_controller #(
for(index = 0; index < LANES; index = index + 1) begin for(index = 0; index < LANES; index = index + 1) begin
/* verilator lint_off WIDTH */ /* verilator lint_off WIDTH */
// stage2_data_unaligned is the DQ_BITS*LANES*8 raw data from stage 1 so not yet aligned
// unaligned_data is 64 bits
{unaligned_data[index], { {unaligned_data[index], {
stage2_data[0][((DQ_BITS*LANES)*7 + 8*index) +: 8], stage2_data[0][((DQ_BITS*LANES)*6 + 8*index) +: 8], stage2_data[0][((DQ_BITS*LANES)*7 + 8*index) +: 8], stage2_data[0][((DQ_BITS*LANES)*6 + 8*index) +: 8],
stage2_data[0][((DQ_BITS*LANES)*5 + 8*index) +: 8], stage2_data[0][((DQ_BITS*LANES)*4 + 8*index) +: 8], stage2_data[0][((DQ_BITS*LANES)*5 + 8*index) +: 8], stage2_data[0][((DQ_BITS*LANES)*4 + 8*index) +: 8],
@ -860,7 +883,37 @@ module ddr3_controller #(
stage2_data_unaligned[((DQ_BITS*LANES)*3 + 8*index) +: 8], stage2_data_unaligned[((DQ_BITS*LANES)*2 + 8*index) +: 8], stage2_data_unaligned[((DQ_BITS*LANES)*3 + 8*index) +: 8], stage2_data_unaligned[((DQ_BITS*LANES)*2 + 8*index) +: 8],
stage2_data_unaligned[((DQ_BITS*LANES)*1 + 8*index) +: 8], stage2_data_unaligned[((DQ_BITS*LANES)*0 + 8*index) +: 8] } stage2_data_unaligned[((DQ_BITS*LANES)*1 + 8*index) +: 8], stage2_data_unaligned[((DQ_BITS*LANES)*0 + 8*index) +: 8] }
<< data_start_index[index]) | unaligned_data[index]; << data_start_index[index]) | unaligned_data[index];
/*
// Example with LANE 0:
// Burst_0 to burst_7 of unaligned LANE 0 will be extracted which will be shifted by data_start_index.
// Each 8 bits of shift means a burst will be moved to next ddr3_clk cycle, this is needed if for example
// the DQ trace is longer than the command trace where the DQ bits must be delayed by 1 ddr3_clk cycle
// to align the DQ data to the write command.
//
// Since 1 controller clk cycle will have 4 ddr3_clk cycle, and each ddr3_clk cycle is DDR:
// CONTROLLER CLK CYCLE 0: [burst0,burst1] [burst2,burst3] [burst4,burst5] [burst6,burst7]
// CONTROLLER CLK CYCLE 1: [burst0,burst1] [burst2,burst3] [burst4,burst5] [burst6,burst7]
// CONTROLLER CLK CYCLE 2: [burst0,burst1] [burst2,burst3] [burst4,burst5] [burst6,burst7]
//
// shifting by 1 burst means burst 7 will be sent on next controller clk cycle and EVERY BURST WILL SHIFT:
// CONTROLLER CLK CYCLE 0: [xxxxxx,xxxxxx] [burst0,burst1] [burst2,burst3] [burst4,burst5]
// CONTROLLER CLK CYCLE 1: [burst6,burst7] [burst0,burst1] [burst2,burst3] [burst4,burst5]
// CONTROLLER CLK CYCLE 2: [burst6,burst7] [burst0,burst1] [burst2,burst3] [burst4,burst5]
//
// the [burst6,burst7] which has to be stored and delayed until next clk cycle will be handled by unaligned_data
{unaligned_data[0], {
stage2_data[0][((64)*7 + 8*0) +: 8], stage2_data[0][((64)*6 + 8*0) +: 8],
stage2_data[0][((64)*5 + 8*0) +: 8], stage2_data[0][((64)*4 + 8*0) +: 8],
stage2_data[0][((64)*3 + 8*0) +: 8], stage2_data[0][((64)*2 + 8*0) +: 8],
stage2_data[0][((64)*1 + 8*0) +: 8], stage2_data[0][((64)*0 + 8*0) +: 8] }}
<= ( { stage2_data_unaligned[((64)*7 + 8*0) +: 8], stage2_data_unaligned[((64)*6 + 8*0) +: 8],
stage2_data_unaligned[((64)*5 + 8*0) +: 8], stage2_data_unaligned[((64)*4 + 8*0) +: 8],
stage2_data_unaligned[((64)*3 + 8*0) +: 8], stage2_data_unaligned[((64)*2 + 8*0) +: 8],
stage2_data_unaligned[((64)*1 + 8*0) +: 8], stage2_data_unaligned[((64)*0 + 8*0) +: 8] }
<< data_start_index[0]) | unaligned_data[0];
*/
// The same alignment logic is done with data mask
{unaligned_dm[index], { {unaligned_dm[index], {
stage2_dm[0][LANES*7 + index], stage2_dm[0][LANES*6 + index], stage2_dm[0][LANES*7 + index], stage2_dm[0][LANES*6 + index],
stage2_dm[0][LANES*5 + index], stage2_dm[0][LANES*4 + index], stage2_dm[0][LANES*5 + index], stage2_dm[0][LANES*4 + index],
@ -873,6 +926,7 @@ module ddr3_controller #(
<< (data_start_index[index]>>3)) | unaligned_dm[index]; << (data_start_index[index]>>3)) | unaligned_dm[index];
/* verilator lint_on WIDTH */ /* verilator lint_on WIDTH */
end end
// stage2 can have multiple pipelined stages inside it which acts as delay before issuing the write data (after issuing write command)
for(index = 0; index < STAGE2_DATA_DEPTH-1; index = index+1) begin for(index = 0; index < STAGE2_DATA_DEPTH-1; index = index+1) begin
stage2_data[index+1] <= stage2_data[index]; stage2_data[index+1] <= stage2_data[index];
stage2_dm[index+1] <= stage2_dm[index]; stage2_dm[index+1] <= stage2_dm[index];
@ -885,11 +939,11 @@ module ddr3_controller #(
end end
end end
end end
assign o_phy_data = stage2_data[STAGE2_DATA_DEPTH-1]; assign o_phy_data = stage2_data[STAGE2_DATA_DEPTH-1]; // the data sent to PHY is the last stage of of stage 2 (since stage 2 can have multiple pipelined stages inside it_
assign o_phy_dm = stage2_dm[STAGE2_DATA_DEPTH-1]; assign o_phy_dm = stage2_dm[STAGE2_DATA_DEPTH-1];
/* verilator lint_off WIDTH */ /* verilator lint_off WIDTH */
assign wb_addr_plus_anticipate = i_wb_addr + MARGIN_BEFORE_ANTICIPATE; assign wb_addr_plus_anticipate = i_wb_addr + MARGIN_BEFORE_ANTICIPATE; // wb_addr_plus_anticipate determines if it is near the end of column by checking if it jumps to next row
assign calib_addr_plus_anticipate = calib_addr + MARGIN_BEFORE_ANTICIPATE; assign calib_addr_plus_anticipate = calib_addr + MARGIN_BEFORE_ANTICIPATE; // just same as wb_addr_plus_anticipate but while doing calibration
/* verilator lint_on WIDTH */ /* verilator lint_on WIDTH */
// DIAGRAM FOR ALL RELEVANT TIMING PARAMETERS: // DIAGRAM FOR ALL RELEVANT TIMING PARAMETERS:
// //
@ -924,16 +978,17 @@ module ddr3_controller #(
bank_status_d[index] = bank_status_q[index]; bank_status_d[index] = bank_status_q[index];
bank_active_row_d[index] = bank_active_row_q[index]; bank_active_row_d[index] = bank_active_row_q[index];
end end
//set cmd_0 to reset instruction, the remainings are NOP //set PRECHARGE_SLOT as reset instruction, the remainings are NOP (MSB is high)
//delay_counter_is_zero signifies start of new reset instruction (the time when the command must be issued) //delay_counter_is_zero high signifies start of new reset instruction (the time when the command must be issued)
cmd_d[PRECHARGE_SLOT] = {(!delay_counter_is_zero), instruction[DDR3_CMD_START-1:DDR3_CMD_END], cmd_odt, instruction[CLOCK_EN], instruction[RESET_N], cmd_d[PRECHARGE_SLOT] = {(!delay_counter_is_zero), instruction[DDR3_CMD_START-1:DDR3_CMD_END] | {3{(!delay_counter_is_zero)}} , cmd_odt, instruction[CLOCK_EN], instruction[RESET_N],
instruction[MRS_BANK_START:(MRS_BANK_START-BA_BITS+1)], instruction[ROW_BITS-1:0]}; instruction[MRS_BANK_START:(MRS_BANK_START-BA_BITS+1)], instruction[ROW_BITS-1:0]};
cmd_d[PRECHARGE_SLOT][10] = instruction[A10_CONTROL]; cmd_d[PRECHARGE_SLOT][10] = instruction[A10_CONTROL];
cmd_d[READ_SLOT] = {(!issue_read_command), CMD_RD[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, {(ROW_BITS+BA_BITS){1'b0}}}; cmd_d[READ_SLOT] = {(!issue_read_command), CMD_RD[2:0] | {3{(!issue_read_command)}}, cmd_odt, cmd_ck_en, cmd_reset_n, {(ROW_BITS+BA_BITS){1'b0}}}; // issued during MPR reads (address does not matter)
cmd_d[WRITE_SLOT] = {(!issue_write_command),CMD_WR[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, {(ROW_BITS+BA_BITS){1'b0}}}; cmd_d[WRITE_SLOT] = {1'b0, 3'b111, cmd_odt, cmd_ck_en, cmd_reset_n, {(ROW_BITS+BA_BITS){1'b0}}}; // always NOP by default
cmd_d[ACTIVATE_SLOT] = {1'b1,CMD_ACT[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, {(ROW_BITS+BA_BITS){1'b0}}}; cmd_d[ACTIVATE_SLOT] = {1'b0, 3'b111 , cmd_odt, cmd_ck_en, cmd_reset_n, {(ROW_BITS+BA_BITS){1'b0}}}; // always NOP by default
// decrement delay counters for every bank // decrement delay counters for every bank , stay to 0 once 0 is reached
// every bank will have its own delay counters for precharge, activate, write, and read
for(index=0; index< (1<<BA_BITS); index=index+1) begin for(index=0; index< (1<<BA_BITS); index=index+1) begin
delay_before_precharge_counter_d[index] = (delay_before_precharge_counter_q[index] == 0)? 0: delay_before_precharge_counter_q[index] - 1; delay_before_precharge_counter_d[index] = (delay_before_precharge_counter_q[index] == 0)? 0: delay_before_precharge_counter_q[index] - 1;
delay_before_activate_counter_d[index] = (delay_before_activate_counter_q[index] == 0)? 0: delay_before_activate_counter_q[index] - 1; delay_before_activate_counter_d[index] = (delay_before_activate_counter_q[index] == 0)? 0: delay_before_activate_counter_q[index] - 1;
@ -941,9 +996,10 @@ module ddr3_controller #(
delay_before_read_counter_d[index] = (delay_before_read_counter_q[index] == 0)? 0:delay_before_read_counter_q[index] - 1; delay_before_read_counter_d[index] = (delay_before_read_counter_q[index] == 0)? 0:delay_before_read_counter_q[index] - 1;
end end
for(index = 1; index < READ_ACK_PIPE_WIDTH; index = index + 1) begin for(index = 1; index < READ_ACK_PIPE_WIDTH; index = index + 1) begin
// shift is rightward where LSB gets MSB ([MSB] -> [] -> [] -> .... -> [] -[LSB])
shift_reg_read_pipe_d[index-1] = shift_reg_read_pipe_q[index]; shift_reg_read_pipe_d[index-1] = shift_reg_read_pipe_q[index];
end end
shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1] = 0; shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1] = 0; //MSB just receives zero when shifted rightward
//USE _d in ALL //USE _d in ALL
@ -959,7 +1015,7 @@ module ddr3_controller #(
stage2_stall = 0; stage2_stall = 0;
stage2_update = 1; stage2_update = 1;
cmd_odt = 1'b1; cmd_odt = 1'b1;
shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1] = {stage2_aux, 1'b1}; shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1] = {stage2_aux, 1'b1}; // ack is sent to shift_reg which will be shifted until the wb ack output
//write acknowledge will use the same logic pipeline as the read acknowledge. //write acknowledge will use the same logic pipeline as the read acknowledge.
//This would mean write ack latency will be the same for //This would mean write ack latency will be the same for
@ -968,9 +1024,11 @@ module ddr3_controller #(
//for write ack as there will be no need to analyze the //for write ack as there will be no need to analyze the
//contents of the shift_reg_read_pipe just to determine //contents of the shift_reg_read_pipe just to determine
//where best to place the write ack on the pipeline (since //where best to place the write ack on the pipeline (since
//the order of ack must be maintaned). But this would mean //the order of ack must be maintained). But this would mean
//the latency for write is fixed regardless if there is an //the latency for write is fixed regardless if there is an
//outstanding read ack or none on the pipeline //outstanding read ack or none on the pipeline. But this is
// acceptable in my opinion since this is a pipelined wishbone
// where the transaction can continue regardless when ack returns
//set-up delay before precharge, read, and write //set-up delay before precharge, read, and write
if(delay_before_precharge_counter_q[stage2_bank] <= WRITE_TO_PRECHARGE_DELAY) begin if(delay_before_precharge_counter_q[stage2_bank] <= WRITE_TO_PRECHARGE_DELAY) begin
@ -984,7 +1042,7 @@ module ddr3_controller #(
delay_before_precharge_counter_d[stage2_bank] = WRITE_TO_PRECHARGE_DELAY; delay_before_precharge_counter_d[stage2_bank] = WRITE_TO_PRECHARGE_DELAY;
end end
for(index=0; index < (1<<BA_BITS); index=index+1) begin //the write to read delay applies to all banks (odt must be turned off properly before reading) for(index=0; index < (1<<BA_BITS); index=index+1) begin //the write to read delay applies to all banks (odt must be turned off properly before reading)
delay_before_read_counter_d[index] = WRITE_TO_READ_DELAY + 1; delay_before_read_counter_d[index] = WRITE_TO_READ_DELAY + 1; //NOTE TO SELF: why plus 1?
end end
delay_before_read_counter_d[stage2_bank] = WRITE_TO_READ_DELAY + 1; delay_before_read_counter_d[stage2_bank] = WRITE_TO_READ_DELAY + 1;
delay_before_write_counter_d[stage2_bank] = WRITE_TO_WRITE_DELAY; delay_before_write_counter_d[stage2_bank] = WRITE_TO_WRITE_DELAY;
@ -992,9 +1050,8 @@ module ddr3_controller #(
if(COL_BITS <= 10) begin if(COL_BITS <= 10) begin
cmd_d[WRITE_SLOT] = {1'b0, CMD_WR[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, stage2_bank,{{ROW_BITS-32'd11}{1'b0}} , 1'b0 , stage2_col[9:0]}; cmd_d[WRITE_SLOT] = {1'b0, CMD_WR[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, stage2_bank,{{ROW_BITS-32'd11}{1'b0}} , 1'b0 , stage2_col[9:0]};
end end
else begin else begin // COL_BITS > 10 has different format from <= 10
cmd_d[WRITE_SLOT] = {1'b0, CMD_WR[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, stage2_bank,{{ROW_BITS-32'd12}{1'b0}} , cmd_d[WRITE_SLOT] = {1'b0, CMD_WR[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, stage2_bank,{{ROW_BITS-32'd12}{1'b0}} , stage2_col[(COL_BITS <= 10) ? 0 : 10] , 1'b0 , stage2_col[9:0]};
stage2_col[(COL_BITS <= 10) ? 0 : 10] , 1'b0 , stage2_col[9:0]};
end end
//turn on odt at same time as write cmd //turn on odt at same time as write cmd
cmd_d[0][CMD_ODT] = cmd_odt; cmd_d[0][CMD_ODT] = cmd_odt;
@ -1003,7 +1060,6 @@ module ddr3_controller #(
cmd_d[3][CMD_ODT] = cmd_odt; cmd_d[3][CMD_ODT] = cmd_odt;
write_dqs_d=1; write_dqs_d=1;
write_dq_d=1; write_dq_d=1;
// write_data = 1;
end end
//read request //read request
@ -1016,19 +1072,18 @@ module ddr3_controller #(
delay_before_precharge_counter_d[stage2_bank] = READ_TO_PRECHARGE_DELAY; delay_before_precharge_counter_d[stage2_bank] = READ_TO_PRECHARGE_DELAY;
end end
delay_before_read_counter_d[stage2_bank] = READ_TO_READ_DELAY; delay_before_read_counter_d[stage2_bank] = READ_TO_READ_DELAY;
delay_before_write_counter_d[stage2_bank] = READ_TO_WRITE_DELAY + 1; //temporary solution since its possible odt to go hig already while reading previously delay_before_write_counter_d[stage2_bank] = READ_TO_WRITE_DELAY + 1; //temporary solution since its possible odt to go high already while reading previously
for(index=0; index < (1<<BA_BITS); index=index+1) begin //the read to write delay applies to all banks (odt must be turned on properly before writing) for(index=0; index < (1<<BA_BITS); index=index+1) begin //the read to write delay applies to all banks (odt must be turned on properly before writing and this delay is for ODT to settle)
delay_before_write_counter_d[index] = READ_TO_WRITE_DELAY + 1; delay_before_write_counter_d[index] = READ_TO_WRITE_DELAY + 1; // NOTE TO SELF: why plus 1?
end end
shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1] = {stage2_aux, 1'b1}; shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1] = {stage2_aux, 1'b1}; // ack is sent to shift_reg which will be shifted until the wb ack output
//issue read command //issue read command
if(COL_BITS <= 10) begin if(COL_BITS <= 10) begin
cmd_d[READ_SLOT] = {1'b0, CMD_RD[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, stage2_bank, {{ROW_BITS-32'd11}{1'b0}} , 1'b0 , stage2_col[9:0]}; cmd_d[READ_SLOT] = {1'b0, CMD_RD[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, stage2_bank, {{ROW_BITS-32'd11}{1'b0}} , 1'b0 , stage2_col[9:0]};
end end
else begin else begin // COL_BITS > 10 has different format from <= 10
cmd_d[READ_SLOT] = {1'b0, CMD_RD[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, stage2_bank, {{ROW_BITS-32'd12}{1'b0}} , cmd_d[READ_SLOT] = {1'b0, CMD_RD[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, stage2_bank, {{ROW_BITS-32'd12}{1'b0}} , stage2_col[(COL_BITS <= 10) ? 0 : 10] , 1'b0 , stage2_col[9:0]};
stage2_col[(COL_BITS <= 10) ? 0 : 10] , 1'b0 , stage2_col[9:0]};
end end
//turn off odt at same time as read cmd //turn off odt at same time as read cmd
cmd_d[0][CMD_ODT] = cmd_odt; cmd_d[0][CMD_ODT] = cmd_odt;
@ -1043,10 +1098,10 @@ module ddr3_controller #(
activate_slot_busy = 1'b1; activate_slot_busy = 1'b1;
delay_before_precharge_counter_d[stage2_bank] = ACTIVATE_TO_PRECHARGE_DELAY; delay_before_precharge_counter_d[stage2_bank] = ACTIVATE_TO_PRECHARGE_DELAY;
//set-up delay before read and write //set-up delay before read and write
if(delay_before_read_counter_q[stage2_bank] <= ACTIVATE_TO_READ_DELAY) begin if(delay_before_read_counter_q[stage2_bank] <= ACTIVATE_TO_READ_DELAY) begin // if current delay is > ACTIVATE_TO_READ_DELAY, then updating it to the lower delay will cause the previous delay to be violated
delay_before_read_counter_d[stage2_bank] = ACTIVATE_TO_READ_DELAY; delay_before_read_counter_d[stage2_bank] = ACTIVATE_TO_READ_DELAY;
end end
if(delay_before_write_counter_q[stage2_bank] <= ACTIVATE_TO_WRITE_DELAY) begin if(delay_before_write_counter_q[stage2_bank] <= ACTIVATE_TO_WRITE_DELAY) begin // if current delay is > ACTIVATE_TO_WRITE_DELAY, then updating it to the lower delay will cause the previous delay to be violated
delay_before_write_counter_d[stage2_bank] = ACTIVATE_TO_WRITE_DELAY; delay_before_write_counter_d[stage2_bank] = ACTIVATE_TO_WRITE_DELAY;
end end
//issue activate command //issue activate command
@ -1069,16 +1124,18 @@ module ddr3_controller #(
//pending request on stage 1 //pending request on stage 1
if(stage1_pending && !((stage1_next_bank == stage2_bank) && stage2_pending)) begin if(stage1_pending && !((stage1_next_bank == stage2_bank) && stage2_pending)) begin
//stage 1 will mainly be for anticipation, but it can also handle //stage 1 will mainly be for anticipation (if next requests need to jump to new bank then
//precharge and activate request. This will depend if the request //anticipate the precharging and activate of that next bank, BUT it can also handle
//is on the end of the row and must start the anticipation. For //precharge and activate of CURRENT wishbone request.
//example, we have 10 rows in a bank: //Anticipate will depend if the request is on the end of the row
//[R][R][R][R][R][R][R][A][A][A] // and must start the anticipation. For example if we have 10 rows in a bank:
//[R][R][R][R][R][R][R][A][A][A] -> [next bank]
// //
//R = Request, A = Anticipate //R = Request, A = Anticipate
//Unless we are near the third to the last column, stage 1 will //Unless we are near the third to the last column, stage 1 will
//issue Activate and Precharge on the CURRENT bank. Else, stage //issue Activate and Precharge on the CURRENT bank. Else, stage
//1 will issue Activate and Precharge for the NEXT bank //1 will issue Activate and Precharge for the NEXT bank
// Thus stage 1 anticipate makes sure smooth burst operation that jumps banks
if(bank_status_q[stage1_next_bank] && bank_active_row_q[stage1_next_bank] != stage1_next_row && delay_before_precharge_counter_q[stage1_next_bank] ==0 && !precharge_slot_busy) begin if(bank_status_q[stage1_next_bank] && bank_active_row_q[stage1_next_bank] != stage1_next_row && delay_before_precharge_counter_q[stage1_next_bank] ==0 && !precharge_slot_busy) begin
//set-up delay before read and write //set-up delay before read and write
delay_before_activate_counter_d[stage1_next_bank] = PRECHARGE_TO_ACTIVATE_DELAY; delay_before_activate_counter_d[stage1_next_bank] = PRECHARGE_TO_ACTIVATE_DELAY;
@ -1090,10 +1147,10 @@ module ddr3_controller #(
else if(!bank_status_q[stage1_next_bank] && delay_before_activate_counter_q[stage1_next_bank] == 0 && !activate_slot_busy) begin else if(!bank_status_q[stage1_next_bank] && delay_before_activate_counter_q[stage1_next_bank] == 0 && !activate_slot_busy) begin
delay_before_precharge_counter_d[stage1_next_bank] = ACTIVATE_TO_PRECHARGE_DELAY; delay_before_precharge_counter_d[stage1_next_bank] = ACTIVATE_TO_PRECHARGE_DELAY;
//set-up delay before read and write //set-up delay before read and write
if(delay_before_read_counter_d[stage1_next_bank] <= ACTIVATE_TO_READ_DELAY) begin if(delay_before_read_counter_d[stage1_next_bank] <= ACTIVATE_TO_READ_DELAY) begin // if current delay is > ACTIVATE_TO_READ_DELAY, then updating it to the lower delay will cause the previous delay to be violated
delay_before_read_counter_d[stage1_next_bank] = ACTIVATE_TO_READ_DELAY; delay_before_read_counter_d[stage1_next_bank] = ACTIVATE_TO_READ_DELAY;
end end
if(delay_before_write_counter_d[stage1_next_bank] <= ACTIVATE_TO_WRITE_DELAY) begin if(delay_before_write_counter_d[stage1_next_bank] <= ACTIVATE_TO_WRITE_DELAY) begin // if current delay is > ACTIVATE_TO_WRITE_DELAY, then updating it to the lower delay will cause the previous delay to be violated
delay_before_write_counter_d[stage1_next_bank] = ACTIVATE_TO_WRITE_DELAY; delay_before_write_counter_d[stage1_next_bank] = ACTIVATE_TO_WRITE_DELAY;
end end
cmd_d[ACTIVATE_SLOT] = {1'b0, CMD_ACT[2:0] , cmd_odt, cmd_ck_en, cmd_reset_n, stage1_next_bank , stage1_next_row}; cmd_d[ACTIVATE_SLOT] = {1'b0, CMD_ACT[2:0] , cmd_odt, cmd_ck_en, cmd_reset_n, stage1_next_bank , stage1_next_row};
@ -1110,10 +1167,10 @@ module ddr3_controller #(
if(!bank_status_d[stage1_bank] || (bank_status_d[stage1_bank] && bank_active_row_d[stage1_bank] != stage1_row)) begin if(!bank_status_d[stage1_bank] || (bank_status_d[stage1_bank] && bank_active_row_d[stage1_bank] != stage1_row)) begin
stage1_stall = 1; stage1_stall = 1;
end end
else if(!stage1_we && delay_before_read_counter_d[stage1_bank] != 0) begin else if(!stage1_we && delay_before_read_counter_d[stage1_bank] != 0) begin // if read request but delay before read is not yet met then stall
stage1_stall = 1; stage1_stall = 1;
end end
else if(stage1_we && delay_before_write_counter_d[stage1_bank] != 0) begin else if(stage1_we && delay_before_write_counter_d[stage1_bank] != 0) begin // if write request but delay before write is not yet met then stall
stage1_stall = 1; stage1_stall = 1;
end end
//different request type will need a delay of more than 1 clk cycle so stall the pipeline //different request type will need a delay of more than 1 clk cycle so stall the pipeline
@ -1127,17 +1184,23 @@ module ddr3_controller #(
//control stage2 stall in advance //control stage2 stall in advance
if(bank_status_d[stage2_bank] && bank_active_row_d[stage2_bank] == stage2_row) begin //read/write operation if(bank_status_d[stage2_bank] && bank_active_row_d[stage2_bank] == stage2_row) begin //read/write operation
//write request //write request
if(stage2_we && delay_before_write_counter_d[stage2_bank] == 0) begin //if counter is 1 now, then next clock it will be zero thus lower stall at next cycle too if(stage2_we && delay_before_write_counter_d[stage2_bank] == 0) begin // if write request and delay before write is already met then deassert stall
stage2_stall = 0; //to low stall next stage, but not yet at this stage stage2_stall = 0; //to low stall next stage, but not yet at this stage
end end
//read request //read request
else if(!stage2_we && delay_before_read_counter_d[stage2_bank]==0) begin //if counter is 1 now, then next clock it will be zero thus lower stall at next cycle too else if(!stage2_we && delay_before_read_counter_d[stage2_bank]==0) begin // if read request and delay before read is already met then deassert stall
stage2_stall = 0; stage2_stall = 0;
end end
end end
end end
// control logic for stall // control logic for stall
// this small logic is already optimized via listing all possible combinations on excel sheet, investigating the patterns
// and passing the formal verification. I recommend to not touch this and just left this as is.
// This logic makes sure stall will never go high unless the pipeline is full
// What made this complicated is that fact you have to predict the stall for next clock cycle in such
// a way that it will only stall next clock cycle if the pipeline will be full on the next clock cycle.
// Excel sheet: https://1drv.ms/x/s!AhWdq9CipeVagSqQXPwRmXhDgttL?e=vVYIxE&nav=MTVfezAwMDAwMDAwLTAwMDEtMDAwMC0wMDAwLTAwMDAwMDAwMDAwMH0
if(o_wb_stall_q) o_wb_stall_d = stage2_stall; if(o_wb_stall_q) o_wb_stall_d = stage2_stall;
else if( (!i_wb_stb && state_calibrate == DONE_CALIBRATE) || (!calib_stb && state_calibrate != DONE_CALIBRATE) ) o_wb_stall_d = 0; else if( (!i_wb_stb && state_calibrate == DONE_CALIBRATE) || (!calib_stb && state_calibrate != DONE_CALIBRATE) ) o_wb_stall_d = 0;
else if(!stage1_pending) o_wb_stall_d = stage2_stall; else if(!stage1_pending) o_wb_stall_d = stage2_stall;
@ -1149,7 +1212,6 @@ module ddr3_controller #(
assign o_phy_cmd = {cmd_d[3], cmd_d[2], cmd_d[1], cmd_d[0]}; assign o_phy_cmd = {cmd_d[3], cmd_d[2], cmd_d[1], cmd_d[0]};
/*********************************************************************************************************************************************/ /*********************************************************************************************************************************************/
/******************************************************* Align Read Data from ISERDES *******************************************************/ /******************************************************* Align Read Data from ISERDES *******************************************************/
always @(posedge i_controller_clk) begin always @(posedge i_controller_clk) begin
if(sync_rst_controller) begin if(sync_rst_controller) begin
@ -1195,30 +1257,62 @@ module ddr3_controller #(
write_dq[index+1] <= write_dq[index]; write_dq[index+1] <= write_dq[index];
end end
for(index = 0; index < READ_ACK_PIPE_WIDTH; index = index + 1) begin for(index = 0; index < READ_ACK_PIPE_WIDTH; index = index + 1) begin
// shifted rightward where LSB gets MSB ([MSB] -> [] -> [] -> .... -> [] -[LSB])
shift_reg_read_pipe_q[index] <= shift_reg_read_pipe_d[index]; shift_reg_read_pipe_q[index] <= shift_reg_read_pipe_d[index];
end end
for(index = 0; index < 2; index = index + 1) begin for(index = 0; index < 2; index = index + 1) begin
// there are 2 read_pipes (each with 16 space for shifting), and each read pipes shift rightward
// so the bit 1 will be shifted to the right until it reach LSB which means data is already on ISERDES output of PHY
delay_read_pipe[index] <= (delay_read_pipe[index] >> 1); delay_read_pipe[index] <= (delay_read_pipe[index] >> 1);
end end
if(shift_reg_read_pipe_q[1][0]) begin //delay is over and data is now starting to release from iserdes BUT NOT YET ALIGNED if(shift_reg_read_pipe_q[1][0]) begin
index_read_pipe <= !index_read_pipe; //control which delay_read_pipe would get updated (we have 3 pipe to store read data)ss //delay from shift_reg_read_pipe_q is about to be over (ack, which is the last bit, will be at LSB on next clk cycle)
//and data is now starting to be released from ISERDES from phy BUT NOT YET ALIGNED
index_read_pipe <= !index_read_pipe; //control which delay_read_pipe would get updated (we have 2 read_pipes to store read data,use the read_pipe alternatingly)
delay_read_pipe[index_read_pipe][added_read_pipe_max] <= 1'b1; //update delay_read_pipe delay_read_pipe[index_read_pipe][added_read_pipe_max] <= 1'b1; //update delay_read_pipe
// NOTE: added_read_pipe_max can either be 0 or 1 (NOTE TO SELF: optimize by lowering the bit size of delay_read_pipe)
// delay_read_pipe will get the ack bit from shift_reg_read_pipe_q[1] at the bit equal to
// added_read_pipe_max (0th or 1st bit). added_read_pipe_max is the max number of added controller clk cycles among all lanes
// So basically, the delay_read_pipe is the delay to make sure the "added_read_pipe_max" controller clk cycles
// will be met.
// Example:
// So for request #1 (e.g. write request, added_read_pipe_max=1), wait until the shift_reg_read_pipe_q[1] goes
// high (READ_ACK_PIPE_WIDTH of delay is met which means the data from ISERDES PHY is now available). The
// delay_read_pipe[0][1] will then be high. This high bit on read_pipe #0 will get shifted to LSB. [1] -> [] -> [LSB]
// Meanwhile when request #2 comes (e.g. read request, added_read_pipe_max=1), again wait until the shift_reg_read_pipe_q[1] goes
// high. The delay_read_pipe[1][1] will then be high. This high bit on read_pipe #1 will get shifted to LSB. [1] -> [] -> [LSB]
end end
for(index = 0; index < LANES; index = index + 1) begin for(index = 0; index < LANES; index = index + 1) begin
/* verilator lint_off WIDTH */ /* verilator lint_off WIDTH */
if(delay_read_pipe[0][added_read_pipe_max != added_read_pipe[index]]) begin //same lane // read_pipe #0
// NOTE: added_read_pipe_max and added_read_pipe can either be just 0 or 1 (NOTE TO SELF: optimize by lowering the bit size of this)
// If the added_read_pipe (added number of controller clk cycles of delay to a lane due to pcb trace) is equal to the
// max delay (added_read_pipe_max, e.g. 0-0 or 1-1) for this lane, THEN we need to wait until the bit 1 reaches the LSB[0] of delay_read_pipe
// before retrieving the value from PHY. But if not the same (added_read_pipe is 0 while added_read_pipe_max is 1), then wait until
// the bit 1 reaches the one before LSB [1] before retrieving the value from PHY, so this means this lane with 0 delay will FIRST BE RETRIEVED
// while the lane with added_read_pipe_max of delay (delay of 1) will be retrieved SECOND
if(delay_read_pipe[0][added_read_pipe_max != added_read_pipe[index]]) begin
/* verilator lint_on WIDTH */ /* verilator lint_on WIDTH */
o_wb_data_q[0][((DQ_BITS*LANES)*0 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*0 + 8*index) +: 8]; //update each lane of the burst o_wb_data_q[0][((DQ_BITS*LANES)*0 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*0 + 8*index) +: 8]; //update lane for burst 0
o_wb_data_q[0][((DQ_BITS*LANES)*1 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*1 + 8*index) +: 8]; //update each lane of the burst o_wb_data_q[0][((DQ_BITS*LANES)*1 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*1 + 8*index) +: 8]; //update lane for burst 1
o_wb_data_q[0][((DQ_BITS*LANES)*2 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*2 + 8*index) +: 8]; //update each lane of the burst o_wb_data_q[0][((DQ_BITS*LANES)*2 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*2 + 8*index) +: 8]; //update lane for burst 2
o_wb_data_q[0][((DQ_BITS*LANES)*3 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*3 + 8*index) +: 8]; //update each lane of the burst o_wb_data_q[0][((DQ_BITS*LANES)*3 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*3 + 8*index) +: 8]; //update lane for burst 3
o_wb_data_q[0][((DQ_BITS*LANES)*4 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*4 + 8*index) +: 8]; //update each lane of the burst o_wb_data_q[0][((DQ_BITS*LANES)*4 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*4 + 8*index) +: 8]; //update lane for burst 4
o_wb_data_q[0][((DQ_BITS*LANES)*5 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*5 + 8*index) +: 8]; //update each lane of the burst o_wb_data_q[0][((DQ_BITS*LANES)*5 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*5 + 8*index) +: 8]; //update lane for burst 5
o_wb_data_q[0][((DQ_BITS*LANES)*6 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*6 + 8*index) +: 8]; //update each lane of the burst o_wb_data_q[0][((DQ_BITS*LANES)*6 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*6 + 8*index) +: 8]; //update lane for burst 6
o_wb_data_q[0][((DQ_BITS*LANES)*7 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*7 + 8*index) +: 8]; //update each lane of the burst o_wb_data_q[0][((DQ_BITS*LANES)*7 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*7 + 8*index) +: 8]; //update lane for burst 7
end end
/* verilator lint_off WIDTH */ /* verilator lint_off WIDTH */
// read_pipe #1
// NOTE: added_read_pipe_max and added_read_pipe can either be just 0 or 1 (NOTE TO SELF: optimize by lowering the bit size of this)
// If the added_read_pipe (added number of controller clk cycles of delay to a lane due to pcb trace) is equal to the
// max delay (added_read_pipe_max, e.g. 0-0 or 1-1) for this lane, THEN we need to wait until the bit 1 reaches the LSB[0] of delay_read_pipe
// before retrieving the value from PHY. But if not the same (added_read_pipe is 0 while added_read_pipe_max is 1), then wait until
// the bit 1 reaches the one before LSB [1] (which goes high already since bit 1 of delay_read_pipe is the first to go high once shift_reg_read_pipe_q
// bit 1 goes high) before retrieving the value from PHY. So this means this lane with 0 delay will FIRST BE RETRIEVED
// while the lane with added_read_pipe_max of delay (delay of 1) will be retrieved SECOND
if(delay_read_pipe[1][added_read_pipe_max != added_read_pipe[index]]) begin if(delay_read_pipe[1][added_read_pipe_max != added_read_pipe[index]]) begin
/* verilator lint_on WIDTH */ /* verilator lint_on WIDTH */
o_wb_data_q[1][((DQ_BITS*LANES)*0 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*0 + 8*index) +: 8]; //update each lane of the burst o_wb_data_q[1][((DQ_BITS*LANES)*0 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*0 + 8*index) +: 8]; //update each lane of the burst
@ -1230,17 +1324,49 @@ module ddr3_controller #(
o_wb_data_q[1][((DQ_BITS*LANES)*6 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*6 + 8*index) +: 8]; //update each lane of the burst o_wb_data_q[1][((DQ_BITS*LANES)*6 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*6 + 8*index) +: 8]; //update each lane of the burst
o_wb_data_q[1][((DQ_BITS*LANES)*7 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*7 + 8*index) +: 8]; //update each lane of the burst o_wb_data_q[1][((DQ_BITS*LANES)*7 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*7 + 8*index) +: 8]; //update each lane of the burst
end end
// why are we alternatingly use the read_pipes?
// time | 0 | 1 | 2 | 3 |
// index_read_pipe | 1 | 0 | 1 | 0 |
// delay_read_pipe[1] | [0][0] | <[1][0] | [0][1]> | [1][0] |
// delay_read_pipe[0] | [0][0] | [0][0] | <[1][0] | [0][1]> |
//
// At time 1, request #1 is accepted by pipe[1], then wait until time 2 to retrieve the lane with longest added_read_pipe
// (lane with added_read_pipe 0 is retrieved from ISERDES at time 1, lane with added_read_pipe 1 is retrieved from ISERDES at time 2)
// At time 2, request #2 is accepted by pipe[0] (since pipe[1] is still busy on request #1), then wait until time 3 to retrieve
// the lane with longest added_read_pipe
// Thus pipe 0 and 1 is alternating to make sure that even if 0 is busy, 1 will retrieve the data. And then when 1 is busy, 0 will retrieve the data
// NOTE TO SELF: longest delay for a lane relative to others is 1 controller clk cycle, if more than 1 then it will not be retrievd
// and aligned properly. Thus try to optimize this logic since delay is at max 1 controller clk only.
end end
// o_wb_ack_read_q[0][0] is also the wishbone ack (aligned with ack is the wishbone data) thus
// after sending the wishbone data on a particular index, invert it for the next ack
if(o_wb_ack_read_q[0][0]) begin if(o_wb_ack_read_q[0][0]) begin
index_wb_data <= !index_wb_data; index_wb_data <= !index_wb_data; //alternatingly uses the o_wb_data_q (either 0 or 1)
end end
for(index = 1; index < MAX_ADDED_READ_ACK_DELAY; index = index + 1) begin for(index = 1; index < MAX_ADDED_READ_ACK_DELAY; index = index + 1) begin
o_wb_ack_read_q[index-1] <= o_wb_ack_read_q[index]; o_wb_ack_read_q[index-1] <= o_wb_ack_read_q[index]; // shift rightward [ack] -> [] -> [LSB]
end end
o_wb_ack_read_q[MAX_ADDED_READ_ACK_DELAY-1] <= 0; o_wb_ack_read_q[MAX_ADDED_READ_ACK_DELAY-1] <= 0; // MSB always gets zero and is shifted rightwards
o_wb_ack_read_q[added_read_pipe_max] <= shift_reg_read_pipe_q[0]; o_wb_ack_read_q[added_read_pipe_max] <= shift_reg_read_pipe_q[0];
// o_wb_ack_read_q[0] is the wishbone ack
// so once data is available from ISERDES (shift_reg_read_pipe_q[0] high) then need to wait added_read_pipe_max
// before the data is properly stored to o_wb_data_q and can be sent outside as wishbone data
//abort any outgoing ack when cyc is low // BASICALLY:
// shift_reg_read_pipe_q is the delay from when the read command is issued from controller until the
// data is received by the PHY ISERDES (total delay of READ_ACK_PIPE_WIDTH). The shift_reg_read_pipe_q[1]
// is then connected to delay_read_pipe[added_read_pipe_max (0 or 1)] which is the delay to align the lanes.
// The shift_reg_read_pipe_q[0] is then connected to o_wb_ack_read_q[added_read_pipe_max (0 or 1)] which
// is the delay until the wishbone data and ack will be sent outside
// NOTE TO SELF: Optimize by removing o_wb_ack_read_q and just connect the woshbone ack and data to delay_read_pipe[0].
// Visualization:
// shift_reg_read_pipe_q [ ] -> [1] -> [ ] [ ] -> [ ] -> [1] [ ] -> [ ] -> [ ] [ ] -> [ ] -> [ ]
// delay_read_pipe [ ] -> [ ] -> [ ] ---> [ ] -> [1] -> [ ] ---> [ ] -> [ ] -> [1] ---> [ ] -> [ ] -> [ ]
// o_wb_ack_read_q [ ] -> [ ] -> [ ] [ ] -> [ ] -> [ ] [ ] -> [1] -> [ ] [ ] -> [ ] -> [1]
// request is now on [1] request passed request passed o_wb_ack_read_q[0]
// of shift_reg_read_pipe_q to delay_read_pipe to o_wb_ack_read_q high thus ready for wishbone ack
// abort any outgoing ack when cyc is low
if(!i_wb_cyc && state_calibrate == DONE_CALIBRATE) begin if(!i_wb_cyc && state_calibrate == DONE_CALIBRATE) begin
for(index = 0; index < MAX_ADDED_READ_ACK_DELAY; index = index + 1) begin for(index = 0; index < MAX_ADDED_READ_ACK_DELAY; index = index + 1) begin
o_wb_ack_read_q[index] <= 0; o_wb_ack_read_q[index] <= 0;
@ -1252,7 +1378,7 @@ module ddr3_controller #(
end end
end end
assign o_wb_ack = o_wb_ack_read_q[0][0] && state_calibrate == DONE_CALIBRATE; assign o_wb_ack = o_wb_ack_read_q[0][0] && state_calibrate == DONE_CALIBRATE;
//o_wb_ack_read_q[0][0] is needed internally for write calibration but it must not go outside (since it is not an actual user wb request unless we are in DONE_CALIBRATE) //o_wb_ack_read_q[0][0] is needed internally to ack during write calibration but it must not go outside (since it is not an actual user wb request unless we are in DONE_CALIBRATE)
assign o_aux = o_wb_ack_read_q[0][AUX_WIDTH:1]; assign o_aux = o_wb_ack_read_q[0][AUX_WIDTH:1];
assign o_wb_data = o_wb_data_q[index_wb_data]; assign o_wb_data = o_wb_data_q[index_wb_data];
assign o_phy_dqs_tri_control = !write_dqs[STAGE2_DATA_DEPTH]; assign o_phy_dqs_tri_control = !write_dqs[STAGE2_DATA_DEPTH];
@ -1360,15 +1486,22 @@ module ddr3_controller #(
idelay_data_cntvaluein[lane] <= o_phy_idelay_data_ld[lane]? idelay_data_cntvaluein[lane] + 1: idelay_data_cntvaluein[lane]; idelay_data_cntvaluein[lane] <= o_phy_idelay_data_ld[lane]? idelay_data_cntvaluein[lane] + 1: idelay_data_cntvaluein[lane];
idelay_dqs_cntvaluein[lane] <= o_phy_idelay_dqs_ld[lane]? idelay_dqs_cntvaluein[lane] + 1: idelay_dqs_cntvaluein[lane]; idelay_dqs_cntvaluein[lane] <= o_phy_idelay_dqs_ld[lane]? idelay_dqs_cntvaluein[lane] + 1: idelay_dqs_cntvaluein[lane];
end end
// high initial_dqs is the time when the IDELAY of dqs and dq is not yet calibrated
// dqs_target_index_value = dqs_start_index_stored[0]? dqs_start_index_stored + 2: dqs_start_index_stored + 1; // move to next odd (if 3 then 5, if 4 then 5)
// so dqs_target_index_value is basically the next odd number of dqs_start_index_stored (original starting bit when dqs starts).
// The next odd number ensure that the DQS edge is aligned to edge of ddr3_clk (and thus DQ data eye is aligned to edges of of ddr3_clk
// since dq is 90 degree relative to dqs
// Some images to show why next odd number is used: https://github.com/AngeloJacobo/UberDDR3/tree/b762c464f6526159c1d8c2e4ee039b4ae4e78dbd#per-lane-read-calibration
if(initial_dqs) begin if(initial_dqs) begin
dqs_target_index <= dqs_target_index_value; dqs_target_index <= dqs_target_index_value; // target index for DQS to make sure the DQS is edge-aligned with ddr3_clk
dq_target_index[lane] <= {1'b0, dqs_target_index_value}; dq_target_index[lane] <= {1'b0, dqs_target_index_value}; // target index for DQ (just same as DQS)
dqs_target_index_orig <= dqs_target_index_value; dqs_target_index_orig <= dqs_target_index_value; // this will remain the same until we finish calibrating this whole lane
end end
if(idelay_dqs_cntvaluein[lane] == 0) begin //go back to previous odd if(idelay_dqs_cntvaluein[lane] == 0) begin //the DQS got past cntvalue of 31 (and goes back to zero) THUS the target index should also go back (to previous odd)
dqs_target_index <= dqs_target_index_orig - 2; dqs_target_index <= dqs_target_index_orig - 2;
end end
if(idelay_data_cntvaluein[lane] == 0 && idelay_data_cntvaluein_prev == 31) begin if(idelay_data_cntvaluein[lane] == 0 && idelay_data_cntvaluein_prev == 31) begin //the DQ got past cntvalue of 31 (and goes back to zero) thus the target index should also go back (to previous odd)
dq_target_index[lane] <= dqs_target_index_orig - 2; dq_target_index[lane] <= dqs_target_index_orig - 2;
end end
@ -1397,6 +1530,7 @@ module ddr3_controller #(
another Bitslip command. If the ISERDESE2 is reset, the Bitslip logic is also reset another Bitslip command. If the ISERDESE2 is reset, the Bitslip logic is also reset
and returns back to its initial state. and returns back to its initial state.
*/ */
// bitslip basically is changing the arrangement of bytes on IOSERDES
if(i_phy_iserdes_bitslip_reference[lane*serdes_ratio*2 +: 8] == 8'b0111_1000) begin //initial arrangement if(i_phy_iserdes_bitslip_reference[lane*serdes_ratio*2 +: 8] == 8'b0111_1000) begin //initial arrangement
state_calibrate <= MPR_READ; state_calibrate <= MPR_READ;
initial_dqs <= 1; initial_dqs <= 1;
@ -1412,28 +1546,42 @@ module ddr3_controller #(
MPR_READ: if(delay_before_read_data == 0) begin //align the incoming DQS during reads to the controller clock MPR_READ: if(delay_before_read_data == 0) begin //align the incoming DQS during reads to the controller clock
//issue_read_command = 1; //issue_read_command = 1;
/* verilator lint_off WIDTH */ /* verilator lint_off WIDTH */
delay_before_read_data <= READ_DELAY + 1 + 2 + 1 - 1; ///1=issue command delay (OSERDES delay), 2 = ISERDES delay delay_before_read_data <= READ_DELAY + 1 + 2 + 1 - 1; ///NOTE TO SELF: why these numbers? 1=issue command delay (OSERDES delay), 2 = ISERDES delay
/* verilator lint_on WIDTH */ /* verilator lint_on WIDTH */
state_calibrate <= COLLECT_DQS; state_calibrate <= COLLECT_DQS;
dqs_count_repeat <= 0; dqs_count_repeat <= 0;
end end
COLLECT_DQS: if(delay_before_read_data == 0) begin COLLECT_DQS: if(delay_before_read_data == 0) begin // data from MPR read is now received by controller
// dqs from ISERDES is received and stored
// DQS received from ISERDES: { {LANE_1_burst_7, LANE_1_burst_6, ... , LANE_1_burst_0} , {LANE_0_burst_7, LANE_0_burst_6, ... , LANE_0_burst_0}}
// NOTE TO SELF: WHY DQS IS DIVIDED PER LANE BUT DQ IS PER BURST ????
// dqs_store stores the 8 DQS (8 bursts) for a given lane but since the DQS might be shifted at next ddr3 clk cycles (due to trace delays), we must store the
// 8 DQS multiple times (dictated by STORED_DQS_SIZE)
dqs_store <= {i_phy_iserdes_dqs[serdes_ratio*2*lane +: 8], dqs_store[(STORED_DQS_SIZE*8-1):8]}; dqs_store <= {i_phy_iserdes_dqs[serdes_ratio*2*lane +: 8], dqs_store[(STORED_DQS_SIZE*8-1):8]};
dqs_count_repeat <= dqs_count_repeat + 1; dqs_count_repeat <= dqs_count_repeat + 1;
if(dqs_count_repeat == STORED_DQS_SIZE - 1) begin if(dqs_count_repeat == STORED_DQS_SIZE - 1) begin
state_calibrate <= ANALYZE_DQS; state_calibrate <= ANALYZE_DQS;
// store the previous value of dqs_start_index, if the ANALYZE_DQS is repeated then the dqs_start_index
// should be the same as the previous value, else the previous (or current one) has a glitch causing
// a different dqs_start_index
dqs_start_index_stored <= dqs_start_index; dqs_start_index_stored <= dqs_start_index;
// start the index from zero since this will be incremented until we pinpoint the real
// starting bit of dqs_store (dictated by the pattern 10'b01_01_01_01_00)
dqs_start_index <= 0; dqs_start_index <= 0;
end end
end end
// find the bit where the DQS starts to be issued (by finding when the pattern 10'b01_01_01_01_00 starts)
ANALYZE_DQS: if(dqs_store[dqs_start_index +: 10] == 10'b01_01_01_01_00) begin ANALYZE_DQS: if(dqs_store[dqs_start_index +: 10] == 10'b01_01_01_01_00) begin
dqs_start_index_repeat <= (dqs_start_index == dqs_start_index_stored)? dqs_start_index_repeat + 1: 0; //increase dqs_start_index_repeat when index is the same as before //increase dqs_start_index_repeat when index is the same as before
if(dqs_start_index_repeat == REPEAT_DQS_ANALYZE) begin //the same index appeared REPEAT_DQS_ANALYZE times in a row, thus can proceed to CALIBRATE_DQS dqs_start_index_repeat <= (dqs_start_index == dqs_start_index_stored)? dqs_start_index_repeat + 1: 0;
//the same dqs_start_index_repeat appeared REPEAT_DQS_ANALYZE times in a row, thus we can trust the value we got is accurate and not affected by glitch
if(dqs_start_index_repeat == REPEAT_DQS_ANALYZE) begin
// since we already know the starting bit when the dqs (and dq since they are aligned) will come,
// we will now start calibrating the IDELAY for dqs and dq (via CALIBRATE_DQS).
// high initial_dqs is the time when the IDELAY of dqs and dq is not yet calibrated so we zero this starting now
initial_dqs <= 0; initial_dqs <= 0;
dqs_start_index_repeat <= 0; dqs_start_index_repeat <= 0;
state_calibrate <= CALIBRATE_DQS; state_calibrate <= CALIBRATE_DQS;
end end
else begin else begin
@ -1451,23 +1599,40 @@ module ddr3_controller #(
dqs_start_index <= dqs_start_index + 1; dqs_start_index <= dqs_start_index + 1;
end end
end end
// check if the index when the dqs starts is the same as the target index which is aligned to the ddr3_clk
// dqs_target_index is the next odd number to dqs_start_index BEFORE IDELAY CALIBRATION. We will increase the IDELAY
// until the dqs_start_index_stored (current value of dqs_start_index) matches the target index which is aligned to ddr3_clk
CALIBRATE_DQS: if(dqs_start_index_stored == dqs_target_index) begin CALIBRATE_DQS: if(dqs_start_index_stored == dqs_target_index) begin
// dq_target_index still stores the original dqs_target_index_value. The bit size of dq_target_index is just enough
// to count the bits in dqs_store (the received 8 DQS stored STORED_DQS_SIZE times)
added_read_pipe[lane] <= { {( 4 - ($clog2(STORED_DQS_SIZE*8) - (3+1)) ){1'b0}} , dq_target_index[lane][$clog2(STORED_DQS_SIZE*8)-1:(3+1)] } added_read_pipe[lane] <= { {( 4 - ($clog2(STORED_DQS_SIZE*8) - (3+1)) ){1'b0}} , dq_target_index[lane][$clog2(STORED_DQS_SIZE*8)-1:(3+1)] }
+ { 3'b0 , (dq_target_index[lane][3:0] >= (5+8)) }; + { 3'b0 , (dq_target_index[lane][3:0] >= (5+8)) };
// if target_index is > 13, then a 1 CONTROLLLER_CLK cycle delay (4 ddr3_clk cycles) is added on that particular lane (due to trace delay)
// added_read_pipe[lane] <= dq_target_index[lane][$clog2(STORED_DQS_SIZE*8)-1 : (4)] + ( dq_target_index[lane][3:0] >= 13 ) ;
dqs_bitslip_arrangement <= 16'b0011_1100_0011_1100 >> dq_target_index[lane][2:0]; dqs_bitslip_arrangement <= 16'b0011_1100_0011_1100 >> dq_target_index[lane][2:0];
// the dqs is delayed (to move starting bit to next odd number) so this means the original
// expected bitslip arrangement of 8'b0111_1000 will not be followed anymore, so here we form the bitslip
// arrangement pattern so incoming dqs (and thus DQ) is arranged in the proper way (first bute firs, last byte last)
state_calibrate <= BITSLIP_DQS_TRAIN_2; state_calibrate <= BITSLIP_DQS_TRAIN_2;
end end
else begin else begin
// if we have not yet reached the target index then increment IDELAY
// we will keep incrementing the IDELAY until the next odd index is reached (which is
// the time we are sure the DQS is edge aligned with ddr3_clk and thus ddr3_clk posedge
// is hitting the center of DQ eye
// To show why next odd number is needed: https://github.com/AngeloJacobo/UberDDR3/tree/b762c464f6526159c1d8c2e4ee039b4ae4e78dbd#per-lane-read-calibration
o_phy_idelay_data_ld[lane] <= 1; o_phy_idelay_data_ld[lane] <= 1;
o_phy_idelay_dqs_ld[lane] <= 1; o_phy_idelay_dqs_ld[lane] <= 1;
state_calibrate <= MPR_READ; state_calibrate <= MPR_READ;
delay_before_read_data <= 10; //wait for sometime to make sure idelay load settles delay_before_read_data <= 10; //wait for sometime to make sure idelay load settles
end end
//the dqs is delayed (to move starting bit to next odd number) so this means the original
// expected bitslip arrangement of 8'b0111_1000 will not be followed anymore, so here the bitslip
// is re-arranged
BITSLIP_DQS_TRAIN_2: if(train_delay == 0) begin //train again the ISERDES to capture the DQ correctly BITSLIP_DQS_TRAIN_2: if(train_delay == 0) begin //train again the ISERDES to capture the DQ correctly
if(i_phy_iserdes_bitslip_reference[lane*serdes_ratio*2 +: 8] == dqs_bitslip_arrangement[7:0]) begin if(i_phy_iserdes_bitslip_reference[lane*serdes_ratio*2 +: 8] == dqs_bitslip_arrangement[7:0]) begin
/* verilator lint_off WIDTH */ /* verilator lint_off WIDTH */
// this is the end of training and calibration for a single lane, so proceed to next lane
if(lane == LANES - 1) begin if(lane == LANES - 1) begin
/* verilator lint_on WIDTH */ /* verilator lint_on WIDTH */
pause_counter <= 0; //read calibration now complete so continue the reset instruction sequence pause_counter <= 0; //read calibration now complete so continue the reset instruction sequence
@ -1481,8 +1646,11 @@ module ddr3_controller #(
end end
else begin else begin
lane <= lane + 1; lane <= lane + 1;
state_calibrate <= BITSLIP_DQS_TRAIN_1; state_calibrate <= BITSLIP_DQS_TRAIN_1;// current lane is done so go back to BITSLIP_DQS_TRAIN_1 to train next lane
end end
// stores the highest value of added_read_pipe among the lanes since all lanes (except the lane with highest
// added_read_pipe) will be delayed to align with the lane with highest added_read_pipe. This alignment
// is required to make sure the received DQ will be aligned and can form the 512 bit data (for 8 lanes) arranged properly.
added_read_pipe_max <= added_read_pipe_max > added_read_pipe[lane]? added_read_pipe_max:added_read_pipe[lane]; added_read_pipe_max <= added_read_pipe_max > added_read_pipe[lane]? added_read_pipe_max:added_read_pipe[lane];
end end
else begin else begin
@ -1490,7 +1658,7 @@ module ddr3_controller #(
train_delay <= 3; train_delay <= 3;
end end
end end
// CONTINUE COMMENT HERE (once blog is done)
START_WRITE_LEVEL: if(!ODELAY_SUPPORTED) begin //skip write levelling if ODELAY is not supported START_WRITE_LEVEL: if(!ODELAY_SUPPORTED) begin //skip write levelling if ODELAY is not supported
pause_counter <= 0; pause_counter <= 0;
lane <= 0; lane <= 0;
@ -1548,7 +1716,14 @@ module ddr3_controller #(
calib_sel <= {wb_sel_bits{1'b1}}; calib_sel <= {wb_sel_bits{1'b1}};
calib_we <= 1; //write-enable calib_we <= 1; //write-enable
calib_addr <= 0; calib_addr <= 0;
// burst_7 burst_6 burst_5 burst_4 burst_3 burst_2 burst_1 burst_0
calib_data <= { {LANES{8'h91}}, {LANES{8'h77}}, {LANES{8'h29}}, {LANES{8'h8c}}, {LANES{8'hd0}}, {LANES{8'had}}, {LANES{8'h51}}, {LANES{8'hc1}} }; calib_data <= { {LANES{8'h91}}, {LANES{8'h77}}, {LANES{8'h29}}, {LANES{8'h8c}}, {LANES{8'hd0}}, {LANES{8'had}}, {LANES{8'h51}}, {LANES{8'hc1}} };
// write to address 0 is a burst of 8 writes, where all lanes has same data written: 64'h9177298cd0ad51c1
// Example for LANES of 2, DQ of 8: the 128 bits (8 bursts * 2 lanes/burst * 8bits/lane) are 8 bursts with 8 bytes each:
// { burst_7, burst_6, burst_5, burst_4, burst_3, burst_2, burst_1, burst_0 } OR:
// { { {burst7_lane1} , {burst7_lane0} } , { {burst6_lane1} , {burst6_lane0} } , { {burst5_lane1} , {burst5_lane0} }
// , { {burst4_lane1} , {burst4_lane0} } , { {burst3_lane1} , {burst3_lane0} } , { {burst2_lane1} , {burst2_lane0} }
// , { {burst1_lane1} , {burst1_lane0} } , { {burst0_lane1} , {burst0_lane0} } }
state_calibrate <= ISSUE_WRITE_2; state_calibrate <= ISSUE_WRITE_2;
end end
ISSUE_WRITE_2: begin ISSUE_WRITE_2: begin
@ -1557,9 +1732,18 @@ module ddr3_controller #(
calib_sel <= {wb_sel_bits{1'b1}}; calib_sel <= {wb_sel_bits{1'b1}};
calib_we <= 1; //write-enable calib_we <= 1; //write-enable
calib_addr <= 1; calib_addr <= 1;
// burst_7 burst_6 burst_5 burst_4 burst_3 burst_2 burst_1 burst_0
calib_data <= { {LANES{8'h80}}, {LANES{8'hdb}}, {LANES{8'hcf}}, {LANES{8'hd2}}, {LANES{8'h75}}, {LANES{8'hf1}}, {LANES{8'h2c}}, {LANES{8'h3d}} }; calib_data <= { {LANES{8'h80}}, {LANES{8'hdb}}, {LANES{8'hcf}}, {LANES{8'hd2}}, {LANES{8'h75}}, {LANES{8'hf1}}, {LANES{8'h2c}}, {LANES{8'h3d}} };
// write to address 1 is also a burst of 8 writes, where all lanes has same data written: 128'h80dbcfd275f12c3d
state_calibrate <= ISSUE_READ; state_calibrate <= ISSUE_READ;
end end
// NOTE: WHY THERE ARE TWO ISSUE_WRITE
// address 0 and 1 is written with a deterministic data, if the DQ trace has long delay (relative to command line) then the data will be delayed
// compared to the write command. Thus the data aligned to the write command for address 0 MIGHT START AT MIDDLE OF EXPECTED OUTPUT
// DATA (64'h9177298cd0ad51c1) e.g. the data written might be 64'h[2c3d][9177298cd0ad] where the data written starts
// at burst 2 (burst 0 and burst 1 are cut-off since each burst uses 1 ddr3_clk cycle)
// Note here that if DQ and DQS sa same delay, then we know the DQS will always be aligned with DQ data
ISSUE_READ: begin ISSUE_READ: begin
calib_stb <= 1;//actual request flag calib_stb <= 1;//actual request flag
calib_aux <= 1; //AUX ID to determine later if ACK is for read or write calib_aux <= 1; //AUX ID to determine later if ACK is for read or write
@ -1569,7 +1753,7 @@ module ddr3_controller #(
end end
READ_DATA: if(o_wb_ack_read_q[0] == {{(AUX_WIDTH-2){1'b0}}, 2'd1, 1'b1}) begin //wait for the read ack (which has AUX ID of 1} READ_DATA: if(o_wb_ack_read_q[0] == {{(AUX_WIDTH-2){1'b0}}, 2'd1, 1'b1}) begin //wait for the read ack (which has AUX ID of 1}
read_data_store <= o_wb_data; read_data_store <= o_wb_data; // read data on address 0
calib_stb <= 0; calib_stb <= 0;
state_calibrate <= ANALYZE_DATA; state_calibrate <= ANALYZE_DATA;
data_start_index[lane] <= 0; data_start_index[lane] <= 0;
@ -1583,7 +1767,12 @@ module ddr3_controller #(
else if(!o_wb_stall_calib) begin else if(!o_wb_stall_calib) begin
calib_stb <= 0; calib_stb <= 0;
end end
// extract burst_0-to-burst_7 data for a specified lane then determine which byte in write_pattern does it starts
// NOTE TO SELF: all "8" here assume DQ_BITS are 8? parameterize this properly
// data_start_index for a specified lane determine how many bits are off the data from the write command
// so for every 1 ddr3 clk cycle delay of DQ from write command, each lane will be 1 burst off:
// e.g. LANE={burst7, burst6, burst5, burst4, burst3, burst2, burst1, burst0} then with 1 ddr3 cycle delay between DQ and command
// burst0 will not be written but only starting on burst1
ANALYZE_DATA: if(write_pattern[data_start_index[lane] +: 64] == {read_data_store[((DQ_BITS*LANES)*7 + 8*lane) +: 8], read_data_store[((DQ_BITS*LANES)*6 + 8*lane) +: 8], ANALYZE_DATA: if(write_pattern[data_start_index[lane] +: 64] == {read_data_store[((DQ_BITS*LANES)*7 + 8*lane) +: 8], read_data_store[((DQ_BITS*LANES)*6 + 8*lane) +: 8],
read_data_store[((DQ_BITS*LANES)*5 + 8*lane) +: 8], read_data_store[((DQ_BITS*LANES)*4 + 8*lane) +: 8], read_data_store[((DQ_BITS*LANES)*3 + 8*lane) +: 8], read_data_store[((DQ_BITS*LANES)*5 + 8*lane) +: 8], read_data_store[((DQ_BITS*LANES)*4 + 8*lane) +: 8], read_data_store[((DQ_BITS*LANES)*3 + 8*lane) +: 8],
read_data_store[((DQ_BITS*LANES)*2 + 8*lane) +: 8],read_data_store[((DQ_BITS*LANES)*1 + 8*lane) +: 8],read_data_store[((DQ_BITS*LANES)*0 + 8*lane) +: 8] }) begin read_data_store[((DQ_BITS*LANES)*2 + 8*lane) +: 8],read_data_store[((DQ_BITS*LANES)*1 + 8*lane) +: 8],read_data_store[((DQ_BITS*LANES)*0 + 8*lane) +: 8] }) begin
@ -1598,7 +1787,7 @@ module ddr3_controller #(
end end
end end
else begin else begin
data_start_index[lane] <= data_start_index[lane] + 8; data_start_index[lane] <= data_start_index[lane] + 8; //skip by 8
if(data_start_index[lane] == 56) begin //reached the end but no byte in write-pattern matches the data read, issue might be reading at wrong DQS toggle if(data_start_index[lane] == 56) begin //reached the end but no byte in write-pattern matches the data read, issue might be reading at wrong DQS toggle
data_start_index[lane] <= 0; //so we need to recalibrate the bitslip data_start_index[lane] <= 0; //so we need to recalibrate the bitslip
start_index_check <= 0; start_index_check <= 0;
@ -1710,7 +1899,7 @@ BITSLIP_DQS_TRAIN_3: if(train_delay == 0) begin //train again the ISERDES to cap
//if(write_test_address_counter[wb_addr_bits-1:0] == 500) begin //inject error at middle //if(write_test_address_counter[wb_addr_bits-1:0] == 500) begin //inject error at middle
// calib_data <= 1; // calib_data <= 1;
//end //end
if(write_test_address_counter[wb_addr_bits-1:0] == 999 ) begin //MUST END AT ODD NUMBER if(write_test_address_counter[wb_addr_bits-1:0] == 99 ) begin //MUST END AT ODD NUMBER
state_calibrate <= BURST_READ; state_calibrate <= BURST_READ;
end end
end end
@ -1733,7 +1922,7 @@ BITSLIP_DQS_TRAIN_3: if(train_delay == 0) begin //train again the ISERDES to cap
calib_addr <= read_test_address_counter; calib_addr <= read_test_address_counter;
read_test_address_counter <= read_test_address_counter + 1; read_test_address_counter <= read_test_address_counter + 1;
if(MICRON_SIM) begin if(MICRON_SIM) begin
if(read_test_address_counter == 999) begin //MUST END AT ODD NUMBER if(read_test_address_counter == 99) begin //MUST END AT ODD NUMBER
state_calibrate <= RANDOM_WRITE; state_calibrate <= RANDOM_WRITE;
end end
end end
@ -1759,7 +1948,7 @@ BITSLIP_DQS_TRAIN_3: if(train_delay == 0) begin //train again the ISERDES to cap
//if(write_test_address_counter[wb_addr_bits-1:0] == 1500) begin //inject error //if(write_test_address_counter[wb_addr_bits-1:0] == 1500) begin //inject error
// calib_data <= 1; // calib_data <= 1;
//end //end
if(write_test_address_counter[wb_addr_bits-1:0] == 1999) begin //MUST END AT ODD NUMBER since ALTERNATE_WRITE_READ must start at even if(write_test_address_counter[wb_addr_bits-1:0] == 199) begin //MUST END AT ODD NUMBER since ALTERNATE_WRITE_READ must start at even
state_calibrate <= RANDOM_READ; state_calibrate <= RANDOM_READ;
end end
end end
@ -1784,7 +1973,7 @@ BITSLIP_DQS_TRAIN_3: if(train_delay == 0) begin //train again the ISERDES to cap
<= read_test_address_counter[wb_addr_bits-1:ROW_BITS]; <= read_test_address_counter[wb_addr_bits-1:ROW_BITS];
read_test_address_counter <= read_test_address_counter + 1; read_test_address_counter <= read_test_address_counter + 1;
if(MICRON_SIM) begin if(MICRON_SIM) begin
if(read_test_address_counter == 1999) begin //MUST END AT ODD NUMBER since ALTERNATE_WRITE_READ must start at even if(read_test_address_counter == 199) begin //MUST END AT ODD NUMBER since ALTERNATE_WRITE_READ must start at even
state_calibrate <= ALTERNATE_WRITE_READ; state_calibrate <= ALTERNATE_WRITE_READ;
end end
end end
@ -1803,7 +1992,7 @@ ALTERNATE_WRITE_READ: if(!o_wb_stall_calib) begin
calib_addr <= {1'b0, write_test_address_counter[wb_addr_bits-1:1]}; //same address to be used for write and read ( so basically write then read instantly) calib_addr <= {1'b0, write_test_address_counter[wb_addr_bits-1:1]}; //same address to be used for write and read ( so basically write then read instantly)
calib_data <= {wb_sel_bits{write_test_address_counter[7:0]}}; calib_data <= {wb_sel_bits{write_test_address_counter[7:0]}};
if(MICRON_SIM) begin if(MICRON_SIM) begin
if(write_test_address_counter == 4999) begin if(write_test_address_counter == 499) begin
train_delay <= 15; train_delay <= 15;
state_calibrate <= FINISH_READ; state_calibrate <= FINISH_READ;
end end
@ -1858,12 +2047,12 @@ ALTERNATE_WRITE_READ: if(!o_wb_stall_calib) begin
end end
end end
assign issue_read_command = (state_calibrate == MPR_READ && delay_before_read_data == 0); assign issue_read_command = (state_calibrate == MPR_READ && delay_before_read_data == 0);
assign issue_write_command = 0;
assign o_phy_odelay_data_cntvaluein = odelay_data_cntvaluein[lane]; assign o_phy_odelay_data_cntvaluein = odelay_data_cntvaluein[lane];
assign o_phy_odelay_dqs_cntvaluein = odelay_dqs_cntvaluein[lane]; assign o_phy_odelay_dqs_cntvaluein = odelay_dqs_cntvaluein[lane];
assign o_phy_idelay_data_cntvaluein = idelay_data_cntvaluein[lane]; assign o_phy_idelay_data_cntvaluein = idelay_data_cntvaluein[lane];
assign o_phy_idelay_dqs_cntvaluein = idelay_dqs_cntvaluein[lane]; assign o_phy_idelay_dqs_cntvaluein = idelay_dqs_cntvaluein[lane];
assign dqs_target_index_value = dqs_start_index_stored[0]? dqs_start_index_stored + 2: dqs_start_index_stored + 1; assign dqs_target_index_value = dqs_start_index_stored[0]? dqs_start_index_stored + 2: dqs_start_index_stored + 1; // move to next odd (if 3 then 5, if 4 then 5)
// To show why next odd number is needed: https://github.com/AngeloJacobo/UberDDR3/tree/b762c464f6526159c1d8c2e4ee039b4ae4e78dbd#per-lane-read-calibration
reg[31:0] wb_data_to_wb2 = 0; reg[31:0] wb_data_to_wb2 = 0;
always @(posedge i_controller_clk) begin always @(posedge i_controller_clk) begin
if(o_wb_ack_read_q[0][0]) wb_data_to_wb2 <= o_wb_data[31:0]; //save data read if(o_wb_ack_read_q[0][0]) wb_data_to_wb2 <= o_wb_data[31:0]; //save data read
@ -2193,7 +2382,7 @@ ALTERNATE_WRITE_READ: if(!o_wb_stall_calib) begin
// find anticipate activate command slot number // find anticipate activate command slot number
if(CL_nCK > CWL_nCK) slot_number = read_slot; if(CL_nCK > CWL_nCK) slot_number = read_slot;
else slot_number = write_slot; else slot_number = write_slot;
delay = ps_to_nCK($rtoi(tRCD)); delay = ps_to_nCK(tRCD);
for(slot_number = slot_number; delay != 0; delay = delay - 1) begin for(slot_number = slot_number; delay != 0; delay = delay - 1) begin
slot_number = slot_number - 1'b1; slot_number = slot_number - 1'b1;
end end

View File

@ -1,6 +1,6 @@
`default_nettype none `default_nettype none
`timescale 1ps / 1ps `timescale 1ps / 1ps
//`define DEBUG_DQS //`define DEBUG_DQS // uncomment to route the raw DQS to output port for debugging
module ddr3_phy #( module ddr3_phy #(
parameter CONTROLLER_CLK_PERIOD = 10_000, //ps, clock period of the controller interface parameter CONTROLLER_CLK_PERIOD = 10_000, //ps, clock period of the controller interface
@ -12,7 +12,7 @@ module ddr3_phy #(
parameter[0:0] ODELAY_SUPPORTED = 1, //set to 1 when ODELAYE2 is supported parameter[0:0] ODELAY_SUPPORTED = 1, //set to 1 when ODELAYE2 is supported
USE_IO_TERMINATION = 0, //use IOBUF_DCIEN and IOBUFDS_DCIEN when 1 USE_IO_TERMINATION = 0, //use IOBUF_DCIEN and IOBUFDS_DCIEN when 1
// The next parameters act more like a localparam (since user does not have to set this manually) but was added here to simplify port declaration // The next parameters act more like a localparam (since user does not have to set this manually) but was added here to simplify port declaration
parameter serdes_ratio = $rtoi(CONTROLLER_CLK_PERIOD/DDR3_CLK_PERIOD), parameter serdes_ratio = 4, // this controller is fixed as a 4:1 memory controller (CONTROLLER_CLK_PERIOD/DDR3_CLK_PERIOD = 4)
wb_data_bits = DQ_BITS*LANES*serdes_ratio*2, wb_data_bits = DQ_BITS*LANES*serdes_ratio*2,
wb_sel_bits = wb_data_bits / 8, wb_sel_bits = wb_data_bits / 8,
//4 is the width of a single ddr3 command {cs_n, ras_n, cas_n, we_n} plus 3 (ck_en, odt, reset_n) plus bank bits plus row bits //4 is the width of a single ddr3 command {cs_n, ras_n, cas_n, we_n} plus 3 (ck_en, odt, reset_n) plus bank bits plus row bits

View File

@ -20,7 +20,7 @@ module ddr3_top #(
ODELAY_SUPPORTED = 1, //set to 1 when ODELAYE2 is supported ODELAY_SUPPORTED = 1, //set to 1 when ODELAYE2 is supported
SECOND_WISHBONE = 0, //set to 1 if 2nd wishbone is needed SECOND_WISHBONE = 0, //set to 1 if 2nd wishbone is needed
parameter // The next parameters act more like a localparam (since user does not have to set this manually) but was added here to simplify port declaration parameter // The next parameters act more like a localparam (since user does not have to set this manually) but was added here to simplify port declaration
serdes_ratio = $rtoi(CONTROLLER_CLK_PERIOD/DDR3_CLK_PERIOD), serdes_ratio = 4, // this controller is fixed as a 4:1 memory controller (CONTROLLER_CLK_PERIOD/DDR3_CLK_PERIOD = 4)
wb_addr_bits = ROW_BITS + COL_BITS + BA_BITS - $clog2(serdes_ratio*2), wb_addr_bits = ROW_BITS + COL_BITS + BA_BITS - $clog2(serdes_ratio*2),
wb_data_bits = DQ_BITS*LANES*serdes_ratio*2, wb_data_bits = DQ_BITS*LANES*serdes_ratio*2,
wb_sel_bits = wb_data_bits / 8, wb_sel_bits = wb_data_bits / 8,

View File

@ -53,8 +53,8 @@ module ddr3_dimm_micron_sim;
`endif `endif
localparam CONTROLLER_CLK_PERIOD = 10, //ns, period of clock input to this DDR3 controller module localparam CONTROLLER_CLK_PERIOD = 10_000, //ps, period of clock input to this DDR3 controller module
DDR3_CLK_PERIOD = 2.5, //ns, period of clock input to DDR3 RAM device DDR3_CLK_PERIOD = 2500, //ps, period of clock input to DDR3 RAM device
AUX_WIDTH = 16, // AUX lines AUX_WIDTH = 16, // AUX lines
OPT_LOWPOWER = 1, //1 = low power, 0 = low logic OPT_LOWPOWER = 1, //1 = low power, 0 = low logic
OPT_BUS_ABORT = 1; OPT_BUS_ABORT = 1;
@ -126,13 +126,13 @@ module ddr3_dimm_micron_sim;
`else `else
assign clk_locked = 1; assign clk_locked = 1;
always #(CONTROLLER_CLK_PERIOD*1000/2) i_controller_clk = !i_controller_clk; always #(CONTROLLER_CLK_PERIOD/2) i_controller_clk = !i_controller_clk;
always #(DDR3_CLK_PERIOD*1000/2) i_ddr3_clk = !i_ddr3_clk; always #(DDR3_CLK_PERIOD/2) i_ddr3_clk = !i_ddr3_clk;
always #2500 i_ref_clk = !i_ref_clk; always #2500 i_ref_clk = !i_ref_clk;
initial begin //90 degree phase shifted ddr3_clk initial begin //90 degree phase shifted ddr3_clk
#(DDR3_CLK_PERIOD*1000/4); #(DDR3_CLK_PERIOD/4);
while(1) begin while(1) begin
#(DDR3_CLK_PERIOD*1000/2) i_ddr3_clk_90 = !i_ddr3_clk_90; #(DDR3_CLK_PERIOD/2) i_ddr3_clk_90 = !i_ddr3_clk_90;
end end
end end
initial begin initial begin