Merge pull request #1 from AngeloJacobo/dev

Dev branch pull request
2024-04-20 15:18:30 +08:00 · 2024-04-20 15:18:30 +08:00 · a354a8d4ef
parent 666f98d6c1 81865ea2f8
commit a354a8d4ef
4 changed files with 302 additions and 113 deletions
--- a/rtl/ddr3_controller.v
+++ b/rtl/ddr3_controller.v
@ -17,10 +17,10 @@
 // Engineer: Angelo C. Jacobo
 //
 ////////////////////////////////////////////////////////////////////////////////
+// NOTE TO SELF are questions which I still need to answer
+// Comments are continuously added on this RTL for better readability

-
-
-//`define FORMAL_COVER //skip reset sequence to fit in cover depth
+//`define FORMAL_COVER //skip reset sequence during formal verification to fit in cover depth
 `default_nettype none
 `timescale 1ps / 1ps
 //
@ -52,7 +52,7 @@ module ddr3_controller #(
                   ODELAY_SUPPORTED = 1, //set to 1 when ODELAYE2 is supported
                   SECOND_WISHBONE = 0, //set to 1 if 2nd wishbone is needed 
    parameter // The next parameters act more like a localparam (since user does not have to set this manually) but was added here to simplify port declaration
-                serdes_ratio = $rtoi(CONTROLLER_CLK_PERIOD/DDR3_CLK_PERIOD),
+                serdes_ratio = 4, // this controller is fixed as a 4:1 memory controller (CONTROLLER_CLK_PERIOD/DDR3_CLK_PERIOD = 4)
                wb_data_bits = DQ_BITS*LANES*serdes_ratio*2,
                wb_addr_bits = ROW_BITS + COL_BITS + BA_BITS - $clog2(serdes_ratio*2),
                wb_sel_bits = wb_data_bits / 8,
@ -157,7 +157,7 @@ module ddr3_controller #(
    // delayed and center-aligned to the center eye of data)
    localparam DATA_INITIAL_ODELAY_TAP = 0; 

-    //DQS needs to be edge-aligned to the center eye of the data. 
+    //Posedge of DQS needs to be aligned to the center eye of the data. 
    //This means DQS needs to be delayed by a quarter of the ddr3
    //clk period relative to the data. Subtract by 600ps to include
    //the IODELAY insertion delay. Divide by a delay resolution of 
@ -179,17 +179,30 @@ module ddr3_controller #(
        localparam tRCD     =       13_750; // ps Active to Read/Write command time
        localparam tRP      =      13_750; // ps Precharge command period
        localparam tRAS     =      35_000; // ps ACT to PRE command period
+    `elsif DDR3_1333_9_9_9 //DDR3-1333 (9-9-9) speed bin
+        localparam tRCD     =       13_500; // ps Active to Read/Write command time
+        localparam tRP      =      13_500; // ps Precharge command period
+        localparam tRAS     =      36_000; // ps ACT to PRE command period
+    `elsif DDR3_1066_7_7_7 //DDR3-1066 (7-7-7) speed bin
+        localparam tRCD     =       13_125; // ps Active to Read/Write command time
+        localparam tRP      =      13_125; // ps Precharge command period
+        localparam tRAS     =      37_500; // ps ACT to PRE command period
+    `else
+        "Throw an error here if speed bin is not recognized (or not defined)"
    `endif
-
+    
    `ifdef RAM_1Gb
        localparam tRFC         =           110_000;      // ps Refresh command  to ACT or REF 
    `elsif RAM_2Gb
        localparam tRFC         =           160_000;      // ps Refresh command  to ACT or REF 
    `elsif RAM_4Gb
        localparam tRFC         =           300_000;      // ps Refresh command  to ACT or REF 
-    `else
+    `elsif RAM_8Gb
        localparam tRFC             =       350_000;      // ps Refresh command  to ACT or REF 
+    `else
+        "Throw an error here if capacity is not recognized (or not defined)"
    `endif
+    
    localparam tREFI = 7_800_000; //ps Average periodic refresh interval
    localparam tXPR = max(5*DDR3_CLK_PERIOD, tRFC+10_000); // ps Exit Reset from CKE HIGH to a valid command
    localparam tWR = 15_000; // ps Write Recovery Time
@ -247,6 +260,8 @@ module ddr3_controller #(
    //Also, worscase is when the anticipated bank still has the leftover of the 
    //WRITE_TO_PRECHARGE_DELAY thus consider also this.
    localparam MARGIN_BEFORE_ANTICIPATE = PRECHARGE_TO_ACTIVATE_DELAY + ACTIVATE_TO_WRITE_DELAY + WRITE_TO_PRECHARGE_DELAY;
+    // STAGE2_DATA_DEPTH is the number of controller clk cycles of delay before issuing the data after the write command
+    // depends on the CWL_nCK
    localparam STAGE2_DATA_DEPTH = (CWL_nCK - (3 - WRITE_SLOT + 1))/4 + 1; //this is always >= 1 (5 - (3 - 3 + 1))/4.0 -> floor(1) + 1 = floor(4
    `ifdef FORMAL
        wire stage2_data_depth;
@ -255,8 +270,11 @@ module ddr3_controller #(
            assert(STAGE2_DATA_DEPTH-2 >= 0);
        end
    `endif
-    localparam READ_DELAY = $rtoi($floor((CL_nCK - (3 - READ_SLOT + 1))/4.0 ));
-    localparam READ_ACK_PIPE_WIDTH = READ_DELAY + 1 + 2 + 1 + 1;
+    localparam READ_DELAY = $rtoi($floor((CL_nCK - (3 - READ_SLOT + 1))/4.0 )); // how many controller clk cycles to satisfy CL_nCK of ddr3_clk cycles
+     // READ_ACK_PIPE_WIDTH is the delay between read command issued (starting from the controller) until the data is received by the controller
+     //the delays included the ODELAY and OSERDES when issuing the read command
+     //and the IDELAY and ISERDES when receiving the data  (NOTE TO SELF: ELABORATE ON WHY THOSE MAGIC NUMBERS)
+    localparam READ_ACK_PIPE_WIDTH = READ_DELAY + 1 + 2 + 1 + 1;                           
    localparam MAX_ADDED_READ_ACK_DELAY = 16;
    localparam DELAY_BEFORE_WRITE_LEVEL_FEEDBACK = STAGE2_DATA_DEPTH + ps_to_cycles(tWLO+tWLOE) + 10;  //plus 10 controller clocks for possible bus latency and 
                                                                                                      //the delay for receiving feedback DQ from IOBUF -> IDELAY -> ISERDES
@ -350,7 +368,6 @@ module ddr3_controller #(
    reg reset_done = 0; //high if reset has already finished
    reg pause_counter = 0;
    wire issue_read_command;
-    wire issue_write_command;
    reg stage2_update = 1;
    reg stage2_stall = 0;
    reg stage1_stall = 0;
@ -428,14 +445,17 @@ module ddr3_controller #(
    reg[15:0] dqs_bitslip_arrangement = 0;
    /* verilator lint_off UNUSEDSIGNAL */
    (* mark_debug = "true" *) reg[3:0] added_read_pipe_max = 0;
-    (* mark_debug = "true" *) reg[3:0] added_read_pipe[LANES - 1:0];
+    (* mark_debug = "true" *) reg[3:0] added_read_pipe[LANES - 1:0]; 
+    //each lane will have added delay relative to when ISERDES should actually return the data
+    //this make sure that we will wait until the lane with longest delay (added_read_pipe_max) is received before
+    //all lanes are sent to wishbone data
    
    //contains the ack shift reg for both read and write
    reg[AUX_WIDTH:0] shift_reg_read_pipe_q[READ_ACK_PIPE_WIDTH-1:0]; 
-    reg[AUX_WIDTH:0] shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1:0]; //1=issue command delay (OSERDES delay), 2 =  ISERDES delay 
-    reg index_read_pipe; //tells which delay_read_pipe will be updated 
+    reg[AUX_WIDTH:0] shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1:0]; //issue ack and AUX value , 1=issue command delay (OSERDES delay), 2 =  ISERDES delay 
+    reg index_read_pipe; //tells which delay_read_pipe will be updated (there are two delay_read_pipe)
    reg index_wb_data; //tells which o_wb_data_q will be sent to o_wb_data
-    reg[15:0] delay_read_pipe[1:0]; //delay when each lane will retrieve i_phy_iserdes_data
+    reg[15:0] delay_read_pipe[1:0]; //delay when each lane will retrieve i_phy_iserdes_data (since different lanes might not be aligned with each other and needs to be retrieved at a different time)
    reg[wb_data_bits - 1:0] o_wb_data_q[1:0]; //store data retrieved from i_phy_iserdes_data to be sent to o_wb_data
    reg[AUX_WIDTH:0] o_wb_ack_read_q[MAX_ADDED_READ_ACK_DELAY-1:0];
    
@ -807,8 +827,8 @@ module ddr3_controller #(
                //stage2_data -> shiftreg(CWL) -> OSERDES(DDR) -> ODELAY -> RAM
            end
            if(!ODELAY_SUPPORTED) begin
-                stage2_data_unaligned <= stage2_data_unaligned_temp; //delayed by 1 clock cycle 
-                stage2_dm_unaligned <= stage2_dm_unaligned_temp;  //delayed by 1 clock cycle 
+                stage2_data_unaligned <= stage2_data_unaligned_temp; //_temp is for added delay of 1 clock cycle (no ODELAY so no added delay)
+                stage2_dm_unaligned <= stage2_dm_unaligned_temp;  //_temp is for added delay of 1 clock cycle (no ODELAY so no added delay)
            end

            // when not in refresh, transaction can only be processed when i_wb_cyc is high and not stall
@ -834,6 +854,7 @@ module ddr3_controller #(
                /* verilator lint_on WIDTH */
                stage1_data <= i_wb_data;
            end
+            // request from calibrate FSM will be accepted here
            else if(state_calibrate != DONE_CALIBRATE && !o_wb_stall_calib) begin
                stage1_pending <= calib_stb;//actual request flag
                stage1_we <= calib_we; //write-enable
@ -850,6 +871,8 @@ module ddr3_controller #(
            
            for(index = 0; index < LANES; index = index + 1) begin
                /* verilator lint_off WIDTH */
+                // stage2_data_unaligned is the DQ_BITS*LANES*8 raw data from stage 1 so not yet aligned
+                // unaligned_data is 64 bits
                {unaligned_data[index], { 
                stage2_data[0][((DQ_BITS*LANES)*7 + 8*index) +: 8], stage2_data[0][((DQ_BITS*LANES)*6 + 8*index) +: 8], 
                stage2_data[0][((DQ_BITS*LANES)*5 + 8*index) +: 8], stage2_data[0][((DQ_BITS*LANES)*4 + 8*index) +: 8], 
@ -860,7 +883,37 @@ module ddr3_controller #(
                        stage2_data_unaligned[((DQ_BITS*LANES)*3 + 8*index) +: 8], stage2_data_unaligned[((DQ_BITS*LANES)*2 + 8*index) +: 8],
                        stage2_data_unaligned[((DQ_BITS*LANES)*1 + 8*index) +: 8], stage2_data_unaligned[((DQ_BITS*LANES)*0 + 8*index) +: 8] }
                        << data_start_index[index]) | unaligned_data[index];
+                /*
+                // Example with LANE 0:
+                // Burst_0 to burst_7 of unaligned LANE 0 will be extracted which will be shifted by data_start_index.
+                // Each 8 bits of shift means a burst will be moved to next ddr3_clk cycle, this is needed if for example
+                // the DQ trace is longer than the command trace where the DQ bits must be delayed by 1 ddr3_clk cycle
+                // to align the DQ data to the write command.
+                //
+                // Since 1 controller clk cycle will have 4 ddr3_clk cycle, and each ddr3_clk cycle is DDR:
+                // CONTROLLER CLK CYCLE 0: [burst0,burst1] [burst2,burst3] [burst4,burst5] [burst6,burst7]
+                // CONTROLLER CLK CYCLE 1: [burst0,burst1] [burst2,burst3] [burst4,burst5] [burst6,burst7]
+                // CONTROLLER CLK CYCLE 2: [burst0,burst1] [burst2,burst3] [burst4,burst5] [burst6,burst7]
+                //
+                // shifting by 1 burst means burst 7 will be sent on next controller clk cycle and EVERY BURST WILL SHIFT:
+                // CONTROLLER CLK CYCLE 0: [xxxxxx,xxxxxx] [burst0,burst1] [burst2,burst3] [burst4,burst5]
+                // CONTROLLER CLK CYCLE 1: [burst6,burst7] [burst0,burst1] [burst2,burst3] [burst4,burst5] 
+                // CONTROLLER CLK CYCLE 2: [burst6,burst7] [burst0,burst1] [burst2,burst3] [burst4,burst5]
+                //
+                // the [burst6,burst7] which has to be stored and delayed until next clk cycle will be handled by unaligned_data
+                {unaligned_data[0], { 
+                stage2_data[0][((64)*7 + 8*0) +: 8], stage2_data[0][((64)*6 + 8*0) +: 8], 
+                stage2_data[0][((64)*5 + 8*0) +: 8], stage2_data[0][((64)*4 + 8*0) +: 8], 
+                stage2_data[0][((64)*3 + 8*0) +: 8], stage2_data[0][((64)*2 + 8*0) +: 8], 
+                stage2_data[0][((64)*1 + 8*0) +: 8], stage2_data[0][((64)*0 + 8*0) +: 8] }} 
+                <= ( {  stage2_data_unaligned[((64)*7 + 8*0) +: 8], stage2_data_unaligned[((64)*6 + 8*0) +: 8],
+                        stage2_data_unaligned[((64)*5 + 8*0) +: 8], stage2_data_unaligned[((64)*4 + 8*0) +: 8], 
+                        stage2_data_unaligned[((64)*3 + 8*0) +: 8], stage2_data_unaligned[((64)*2 + 8*0) +: 8],
+                        stage2_data_unaligned[((64)*1 + 8*0) +: 8], stage2_data_unaligned[((64)*0 + 8*0) +: 8] }
+                        << data_start_index[0]) | unaligned_data[0];
+                */

+                // The same alignment logic is done with data mask
                {unaligned_dm[index], {
                stage2_dm[0][LANES*7 + index], stage2_dm[0][LANES*6 + index], 
                stage2_dm[0][LANES*5 + index], stage2_dm[0][LANES*4 + index], 
@ -873,6 +926,7 @@ module ddr3_controller #(
                        << (data_start_index[index]>>3)) | unaligned_dm[index];
                /* verilator lint_on WIDTH */
            end
+            // stage2 can have multiple pipelined stages inside it which acts as delay before issuing the write data (after issuing write command)
            for(index = 0; index < STAGE2_DATA_DEPTH-1; index = index+1) begin
                stage2_data[index+1] <=  stage2_data[index];              
                stage2_dm[index+1] <= stage2_dm[index];
@ -885,11 +939,11 @@ module ddr3_controller #(
            end
        end
    end
-    assign o_phy_data = stage2_data[STAGE2_DATA_DEPTH-1];             
+    assign o_phy_data = stage2_data[STAGE2_DATA_DEPTH-1];  // the data sent to PHY is the last stage of of stage 2 (since stage 2 can have multiple pipelined stages inside it_           
    assign o_phy_dm = stage2_dm[STAGE2_DATA_DEPTH-1];
    /* verilator lint_off WIDTH */
-    assign wb_addr_plus_anticipate = i_wb_addr + MARGIN_BEFORE_ANTICIPATE;
-    assign calib_addr_plus_anticipate = calib_addr + MARGIN_BEFORE_ANTICIPATE;
+    assign wb_addr_plus_anticipate = i_wb_addr + MARGIN_BEFORE_ANTICIPATE; // wb_addr_plus_anticipate determines if it is near the end of column by checking if it jumps to next row
+    assign calib_addr_plus_anticipate = calib_addr + MARGIN_BEFORE_ANTICIPATE; // just same as wb_addr_plus_anticipate but while doing calibration
    /* verilator lint_on WIDTH */
    // DIAGRAM FOR ALL RELEVANT TIMING PARAMETERS:
    //
@ -924,16 +978,17 @@ module ddr3_controller #(
            bank_status_d[index] = bank_status_q[index];
            bank_active_row_d[index] = bank_active_row_q[index];
        end
-        //set cmd_0 to reset instruction, the remainings are NOP
-        //delay_counter_is_zero signifies start of new reset instruction (the time when the command must be issued)
-        cmd_d[PRECHARGE_SLOT] = {(!delay_counter_is_zero), instruction[DDR3_CMD_START-1:DDR3_CMD_END], cmd_odt, instruction[CLOCK_EN], instruction[RESET_N], 
+        //set PRECHARGE_SLOT as reset instruction, the remainings are NOP (MSB is high)
+        //delay_counter_is_zero high signifies start of new reset instruction (the time when the command must be issued)
+        cmd_d[PRECHARGE_SLOT] = {(!delay_counter_is_zero), instruction[DDR3_CMD_START-1:DDR3_CMD_END] | {3{(!delay_counter_is_zero)}} , cmd_odt, instruction[CLOCK_EN], instruction[RESET_N], 
                        instruction[MRS_BANK_START:(MRS_BANK_START-BA_BITS+1)], instruction[ROW_BITS-1:0]};
        cmd_d[PRECHARGE_SLOT][10] = instruction[A10_CONTROL];
-        cmd_d[READ_SLOT] = {(!issue_read_command), CMD_RD[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, {(ROW_BITS+BA_BITS){1'b0}}};  
-        cmd_d[WRITE_SLOT] = {(!issue_write_command),CMD_WR[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, {(ROW_BITS+BA_BITS){1'b0}}}; 
-        cmd_d[ACTIVATE_SLOT] = {1'b1,CMD_ACT[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, {(ROW_BITS+BA_BITS){1'b0}}}; 
+        cmd_d[READ_SLOT] = {(!issue_read_command), CMD_RD[2:0] | {3{(!issue_read_command)}}, cmd_odt, cmd_ck_en, cmd_reset_n, {(ROW_BITS+BA_BITS){1'b0}}}; // issued during MPR reads (address does not matter)
+        cmd_d[WRITE_SLOT] = {1'b0, 3'b111, cmd_odt, cmd_ck_en, cmd_reset_n, {(ROW_BITS+BA_BITS){1'b0}}}; // always NOP by default
+        cmd_d[ACTIVATE_SLOT] = {1'b0, 3'b111 , cmd_odt, cmd_ck_en, cmd_reset_n, {(ROW_BITS+BA_BITS){1'b0}}};  // always NOP by default
            
-        // decrement delay counters for every bank
+        // decrement delay counters for every bank , stay to 0 once 0 is reached
+        // every bank will have its own delay counters for precharge, activate, write, and read 
        for(index=0; index< (1<<BA_BITS); index=index+1) begin
            delay_before_precharge_counter_d[index] = (delay_before_precharge_counter_q[index] == 0)? 0: delay_before_precharge_counter_q[index] - 1;
            delay_before_activate_counter_d[index] = (delay_before_activate_counter_q[index] == 0)? 0: delay_before_activate_counter_q[index] - 1;
@ -941,9 +996,10 @@ module ddr3_controller #(
            delay_before_read_counter_d[index] = (delay_before_read_counter_q[index] == 0)? 0:delay_before_read_counter_q[index] - 1;
        end
        for(index = 1; index < READ_ACK_PIPE_WIDTH; index = index + 1) begin
+            // shift is rightward where LSB gets MSB ([MSB] -> [] -> [] -> .... -> [] -[LSB])
            shift_reg_read_pipe_d[index-1] = shift_reg_read_pipe_q[index];
        end
-        shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1] = 0;
+        shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1] = 0; //MSB just receives zero when shifted rightward


        //USE _d in ALL
@ -959,7 +1015,7 @@ module ddr3_controller #(
                    stage2_stall = 0;
                    stage2_update = 1;
                    cmd_odt = 1'b1;
-                    shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1] = {stage2_aux, 1'b1}; 
+                    shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1] = {stage2_aux, 1'b1}; // ack is sent to shift_reg which will be shifted until the wb ack output

                    //write acknowledge will use the same logic pipeline as the read acknowledge. 
                    //This would mean write ack latency will be the same for
@ -968,9 +1024,11 @@ module ddr3_controller #(
                    //for write ack as there will be no need to analyze the
                    //contents of the shift_reg_read_pipe just to determine
                    //where best to place the write ack on the pipeline (since
-                    //the order of ack must be maintaned). But this would mean
+                    //the order of ack must be maintained). But this would mean
                    //the latency for write is fixed regardless if there is an 
-                    //outstanding read ack or none on the pipeline 
+                    //outstanding read ack or none on the pipeline. But this is
+                    // acceptable in my opinion since this is a pipelined wishbone
+                    // where the transaction can continue regardless when ack returns
                    
                    //set-up delay before precharge, read, and write
                    if(delay_before_precharge_counter_q[stage2_bank] <= WRITE_TO_PRECHARGE_DELAY) begin
@ -984,7 +1042,7 @@ module ddr3_controller #(
                        delay_before_precharge_counter_d[stage2_bank] = WRITE_TO_PRECHARGE_DELAY;
                    end
                    for(index=0; index < (1<<BA_BITS); index=index+1) begin //the write to read delay applies to all banks (odt must be turned off properly before reading)
-                        delay_before_read_counter_d[index] = WRITE_TO_READ_DELAY + 1;
+                        delay_before_read_counter_d[index] = WRITE_TO_READ_DELAY + 1; //NOTE TO SELF: why plus 1?
                    end
                    delay_before_read_counter_d[stage2_bank] = WRITE_TO_READ_DELAY + 1;     
                    delay_before_write_counter_d[stage2_bank] = WRITE_TO_WRITE_DELAY;
@ -992,9 +1050,8 @@ module ddr3_controller #(
                    if(COL_BITS <= 10) begin
                        cmd_d[WRITE_SLOT] = {1'b0, CMD_WR[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, stage2_bank,{{ROW_BITS-32'd11}{1'b0}} , 1'b0 , stage2_col[9:0]};  
                    end
-                    else begin
-                        cmd_d[WRITE_SLOT] = {1'b0, CMD_WR[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, stage2_bank,{{ROW_BITS-32'd12}{1'b0}} , 
-                            stage2_col[(COL_BITS <= 10) ? 0 : 10] , 1'b0 , stage2_col[9:0]};  
+                    else begin // COL_BITS > 10 has different format from <= 10
+                        cmd_d[WRITE_SLOT] = {1'b0, CMD_WR[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, stage2_bank,{{ROW_BITS-32'd12}{1'b0}} , stage2_col[(COL_BITS <= 10) ? 0 : 10] , 1'b0 , stage2_col[9:0]};  
                    end
                    //turn on odt at same time as write cmd
                    cmd_d[0][CMD_ODT] = cmd_odt;
@ -1003,7 +1060,6 @@ module ddr3_controller #(
                    cmd_d[3][CMD_ODT] = cmd_odt;
                    write_dqs_d=1;
                    write_dq_d=1;
-                   // write_data = 1;
                end
                
                //read request
@ -1016,19 +1072,18 @@ module ddr3_controller #(
                        delay_before_precharge_counter_d[stage2_bank] = READ_TO_PRECHARGE_DELAY;
                    end
                    delay_before_read_counter_d[stage2_bank] = READ_TO_READ_DELAY;     
-                    delay_before_write_counter_d[stage2_bank] = READ_TO_WRITE_DELAY + 1; //temporary solution since its possible odt to go hig already while reading previously
-                    for(index=0; index < (1<<BA_BITS); index=index+1) begin //the read to write delay applies to all banks (odt must be turned on properly before writing)
-                        delay_before_write_counter_d[index] = READ_TO_WRITE_DELAY + 1;
+                    delay_before_write_counter_d[stage2_bank] = READ_TO_WRITE_DELAY + 1; //temporary solution since its possible odt to go high already while reading previously
+                    for(index=0; index < (1<<BA_BITS); index=index+1) begin //the read to write delay applies to all banks (odt must be turned on properly before writing and this delay is for ODT to settle)
+                        delay_before_write_counter_d[index] = READ_TO_WRITE_DELAY + 1; // NOTE TO SELF: why plus 1?
                    end
-                    shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1] = {stage2_aux, 1'b1}; 
+                    shift_reg_read_pipe_d[READ_ACK_PIPE_WIDTH-1] = {stage2_aux, 1'b1}; // ack is sent to shift_reg which will be shifted until the wb ack output

                    //issue read command
                    if(COL_BITS <= 10) begin
                        cmd_d[READ_SLOT] = {1'b0, CMD_RD[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, stage2_bank, {{ROW_BITS-32'd11}{1'b0}} , 1'b0 , stage2_col[9:0]};  
                    end
-                    else begin
-                        cmd_d[READ_SLOT] =  {1'b0, CMD_RD[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, stage2_bank, {{ROW_BITS-32'd12}{1'b0}} , 
-                            stage2_col[(COL_BITS <= 10) ? 0 : 10] , 1'b0 , stage2_col[9:0]};  
+                    else begin // COL_BITS > 10 has different format from <= 10
+                        cmd_d[READ_SLOT] =  {1'b0, CMD_RD[2:0], cmd_odt, cmd_ck_en, cmd_reset_n, stage2_bank, {{ROW_BITS-32'd12}{1'b0}} , stage2_col[(COL_BITS <= 10) ? 0 : 10] , 1'b0 , stage2_col[9:0]};  
                    end
                    //turn off odt at same time as read cmd
                    cmd_d[0][CMD_ODT] = cmd_odt;
@ -1043,10 +1098,10 @@ module ddr3_controller #(
                activate_slot_busy = 1'b1;
                delay_before_precharge_counter_d[stage2_bank] = ACTIVATE_TO_PRECHARGE_DELAY;
                //set-up delay before read and write
-                if(delay_before_read_counter_q[stage2_bank] <= ACTIVATE_TO_READ_DELAY) begin
+                if(delay_before_read_counter_q[stage2_bank] <= ACTIVATE_TO_READ_DELAY) begin // if current delay is > ACTIVATE_TO_READ_DELAY, then updating it to the lower delay will cause the previous delay to be violated
                    delay_before_read_counter_d[stage2_bank] = ACTIVATE_TO_READ_DELAY;
                end
-                if(delay_before_write_counter_q[stage2_bank] <= ACTIVATE_TO_WRITE_DELAY) begin
+                if(delay_before_write_counter_q[stage2_bank] <= ACTIVATE_TO_WRITE_DELAY) begin // if current delay is > ACTIVATE_TO_WRITE_DELAY, then updating it to the lower delay will cause the previous delay to be violated
                    delay_before_write_counter_d[stage2_bank] = ACTIVATE_TO_WRITE_DELAY;
                end
                //issue activate command
@ -1069,16 +1124,18 @@ module ddr3_controller #(

        //pending request on stage 1
        if(stage1_pending && !((stage1_next_bank == stage2_bank) && stage2_pending)) begin
-            //stage 1 will mainly be for anticipation, but it can also handle
-            //precharge and activate request. This will depend if the request
-            //is on the end of the row and must start the anticipation. For
-            //example, we have 10 rows in a bank:
-            //[R][R][R][R][R][R][R][A][A][A]
+            //stage 1 will mainly be for anticipation (if next requests need to jump to new bank then 
+            //anticipate the precharging and activate of that next bank, BUT it can also handle
+            //precharge and activate of CURRENT wishbone request.
+            //Anticipate will depend if the request is on the end of the row 
+            // and must start the anticipation. For example if we have 10 rows in a bank:
+            //[R][R][R][R][R][R][R][A][A][A] -> [next bank]
            //
            //R = Request, A = Anticipate
            //Unless we are near the third to the last column, stage 1 will
            //issue Activate and Precharge on the CURRENT bank. Else, stage
            //1 will issue Activate and Precharge for the NEXT bank
+            // Thus stage 1 anticipate makes sure smooth burst operation that jumps banks
            if(bank_status_q[stage1_next_bank] &&  bank_active_row_q[stage1_next_bank] != stage1_next_row && delay_before_precharge_counter_q[stage1_next_bank] ==0 && !precharge_slot_busy) begin    
                //set-up delay before read and write
                 delay_before_activate_counter_d[stage1_next_bank] = PRECHARGE_TO_ACTIVATE_DELAY;
@ -1090,10 +1147,10 @@ module ddr3_controller #(
            else if(!bank_status_q[stage1_next_bank] && delay_before_activate_counter_q[stage1_next_bank] == 0 && !activate_slot_busy) begin 
                delay_before_precharge_counter_d[stage1_next_bank] = ACTIVATE_TO_PRECHARGE_DELAY;
                //set-up delay before read and write
-                if(delay_before_read_counter_d[stage1_next_bank] <= ACTIVATE_TO_READ_DELAY) begin
+                if(delay_before_read_counter_d[stage1_next_bank] <= ACTIVATE_TO_READ_DELAY) begin  // if current delay is > ACTIVATE_TO_READ_DELAY, then updating it to the lower delay will cause the previous delay to be violated
                    delay_before_read_counter_d[stage1_next_bank] = ACTIVATE_TO_READ_DELAY;
                end
-                if(delay_before_write_counter_d[stage1_next_bank] <= ACTIVATE_TO_WRITE_DELAY) begin
+                if(delay_before_write_counter_d[stage1_next_bank] <= ACTIVATE_TO_WRITE_DELAY) begin  // if current delay is > ACTIVATE_TO_WRITE_DELAY, then updating it to the lower delay will cause the previous delay to be violated
                    delay_before_write_counter_d[stage1_next_bank] = ACTIVATE_TO_WRITE_DELAY;
                end
                cmd_d[ACTIVATE_SLOT] = {1'b0, CMD_ACT[2:0] , cmd_odt, cmd_ck_en, cmd_reset_n, stage1_next_bank , stage1_next_row};
@ -1110,10 +1167,10 @@ module ddr3_controller #(
            if(!bank_status_d[stage1_bank] || (bank_status_d[stage1_bank] && bank_active_row_d[stage1_bank] != stage1_row)) begin 
                stage1_stall = 1;
            end
-            else if(!stage1_we && delay_before_read_counter_d[stage1_bank] != 0) begin
+            else if(!stage1_we && delay_before_read_counter_d[stage1_bank] != 0) begin // if read request but delay before read is not yet met then stall
                stage1_stall = 1;
            end
-            else if(stage1_we && delay_before_write_counter_d[stage1_bank] != 0) begin
+            else if(stage1_we && delay_before_write_counter_d[stage1_bank] != 0) begin // if write request but delay before write is not yet met then stall
                stage1_stall = 1;
            end
            //different request type will need a delay of more than 1 clk cycle so stall the pipeline 
@ -1127,17 +1184,23 @@ module ddr3_controller #(
            //control stage2 stall in advance
            if(bank_status_d[stage2_bank] &&  bank_active_row_d[stage2_bank] == stage2_row) begin //read/write operation
                //write request
-                if(stage2_we && delay_before_write_counter_d[stage2_bank] == 0) begin //if counter is 1 now, then next clock it will be zero thus lower stall at next cycle too      
+                if(stage2_we && delay_before_write_counter_d[stage2_bank] == 0) begin // if write request and delay before write is already met then deassert stall
                    stage2_stall = 0; //to low stall next stage, but not yet at this stage
                end
                //read request
-                else if(!stage2_we && delay_before_read_counter_d[stage2_bank]==0) begin //if counter is 1 now, then next clock it will be zero thus lower stall at next cycle too      
+                else if(!stage2_we && delay_before_read_counter_d[stage2_bank]==0) begin // if read request and delay before read is already met then deassert stall 
                    stage2_stall = 0;
                end
            end
        end

        // control logic for stall
+        // this small logic is already optimized via listing all possible combinations on excel sheet, investigating the patterns
+        // and passing the formal verification. I recommend to not touch this and just left this as is.
+        // This logic makes sure stall will never go high unless the pipeline is full
+        // What made this complicated is that fact you have to predict the stall for next clock cycle in such 
+        // a way that it will only stall next clock cycle if the pipeline will be full on the next clock cycle.
+        // Excel sheet: https://1drv.ms/x/s!AhWdq9CipeVagSqQXPwRmXhDgttL?e=vVYIxE&nav=MTVfezAwMDAwMDAwLTAwMDEtMDAwMC0wMDAwLTAwMDAwMDAwMDAwMH0
        if(o_wb_stall_q) o_wb_stall_d = stage2_stall;
        else if( (!i_wb_stb && state_calibrate == DONE_CALIBRATE) || (!calib_stb && state_calibrate != DONE_CALIBRATE) ) o_wb_stall_d = 0; 
        else if(!stage1_pending) o_wb_stall_d = stage2_stall;
@ -1149,7 +1212,6 @@ module ddr3_controller #(
    assign o_phy_cmd = {cmd_d[3], cmd_d[2], cmd_d[1], cmd_d[0]};
    /*********************************************************************************************************************************************/

-
    /******************************************************* Align Read Data from ISERDES *******************************************************/
    always @(posedge i_controller_clk) begin
        if(sync_rst_controller) begin
@ -1195,30 +1257,62 @@ module ddr3_controller #(
                write_dq[index+1] <= write_dq[index]; 
            end 
            for(index = 0; index < READ_ACK_PIPE_WIDTH; index = index + 1) begin
+                // shifted rightward where LSB gets MSB ([MSB] -> [] -> [] -> .... -> [] -[LSB])
                shift_reg_read_pipe_q[index] <= shift_reg_read_pipe_d[index];
            end

-            for(index = 0; index < 2; index = index + 1) begin
+            for(index = 0; index < 2; index = index + 1) begin 
+                // there are 2 read_pipes (each with 16 space for shifting), and each read pipes shift rightward
+                // so the bit 1 will be shifted to the right until it reach LSB which means data is already on ISERDES output of PHY
                delay_read_pipe[index] <= (delay_read_pipe[index] >> 1);
            end
-            if(shift_reg_read_pipe_q[1][0]) begin //delay is over and data is now starting to release from iserdes BUT NOT YET ALIGNED
-                index_read_pipe <= !index_read_pipe; //control which delay_read_pipe would get updated (we have 3 pipe to store read data)ss
+            if(shift_reg_read_pipe_q[1][0]) begin 
+                //delay from shift_reg_read_pipe_q is about to be over (ack, which is the last bit, will be at LSB on next clk cycle)
+                //and data is now starting to be released from ISERDES from phy BUT NOT YET ALIGNED
+                index_read_pipe <= !index_read_pipe; //control which delay_read_pipe would get updated (we have 2 read_pipes to store read data,use the read_pipe alternatingly)
                delay_read_pipe[index_read_pipe][added_read_pipe_max] <= 1'b1; //update delay_read_pipe
+                // NOTE: added_read_pipe_max can either be 0 or 1 (NOTE TO SELF: optimize by lowering the bit size of delay_read_pipe)
+                // delay_read_pipe will get the ack bit from shift_reg_read_pipe_q[1] at the bit equal to 
+                // added_read_pipe_max (0th or 1st bit). added_read_pipe_max is the max number of added controller clk cycles among all lanes
+                // So basically, the delay_read_pipe is the delay to make sure the "added_read_pipe_max" controller clk cycles
+                // will be met.
+                // Example:
+                // So for request #1 (e.g. write request, added_read_pipe_max=1), wait until the shift_reg_read_pipe_q[1] goes 
+                // high (READ_ACK_PIPE_WIDTH of delay is met which means the data from ISERDES PHY is now available). The 
+                // delay_read_pipe[0][1] will then be high. This high bit on read_pipe #0 will get shifted to LSB. [1] -> [] -> [LSB]
+                // Meanwhile when request #2 comes (e.g. read request, added_read_pipe_max=1), again wait until the shift_reg_read_pipe_q[1] goes 
+                // high. The delay_read_pipe[1][1] will then be high. This high bit on read_pipe #1 will get shifted to LSB. [1] -> [] -> [LSB]
            end
+            
            for(index = 0; index < LANES; index = index + 1) begin
                /* verilator lint_off WIDTH */
-                if(delay_read_pipe[0][added_read_pipe_max != added_read_pipe[index]]) begin //same lane
+                // read_pipe #0
+                // NOTE: added_read_pipe_max and added_read_pipe can either be just 0 or 1 (NOTE TO SELF: optimize by lowering the bit size of this)
+                // If the added_read_pipe (added number of controller clk cycles of delay to a lane due to pcb trace) is equal to the 
+                // max delay (added_read_pipe_max, e.g. 0-0 or 1-1) for this lane, THEN we need to wait until the bit 1 reaches the LSB[0] of delay_read_pipe
+                // before retrieving the value from PHY. But if not the same (added_read_pipe is 0 while added_read_pipe_max is 1), then wait until
+                // the bit 1 reaches the one before LSB [1] before retrieving the value from PHY, so this means this lane with 0 delay will FIRST BE RETRIEVED
+                // while the lane with added_read_pipe_max of delay (delay of 1) will be retrieved SECOND
+                if(delay_read_pipe[0][added_read_pipe_max != added_read_pipe[index]]) begin 
                /* verilator lint_on WIDTH */
-                    o_wb_data_q[0][((DQ_BITS*LANES)*0 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*0 + 8*index) +: 8]; //update each lane of the burst
-                    o_wb_data_q[0][((DQ_BITS*LANES)*1 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*1 + 8*index) +: 8]; //update each lane of the burst
-                    o_wb_data_q[0][((DQ_BITS*LANES)*2 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*2 + 8*index) +: 8]; //update each lane of the burst
-                    o_wb_data_q[0][((DQ_BITS*LANES)*3 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*3 + 8*index) +: 8]; //update each lane of the burst
-                    o_wb_data_q[0][((DQ_BITS*LANES)*4 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*4 + 8*index) +: 8]; //update each lane of the burst
-                    o_wb_data_q[0][((DQ_BITS*LANES)*5 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*5 + 8*index) +: 8]; //update each lane of the burst
-                    o_wb_data_q[0][((DQ_BITS*LANES)*6 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*6 + 8*index) +: 8]; //update each lane of the burst
-                    o_wb_data_q[0][((DQ_BITS*LANES)*7 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*7 + 8*index) +: 8]; //update each lane of the burst
+                    o_wb_data_q[0][((DQ_BITS*LANES)*0 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*0 + 8*index) +: 8]; //update lane for burst 0
+                    o_wb_data_q[0][((DQ_BITS*LANES)*1 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*1 + 8*index) +: 8]; //update lane for burst 1
+                    o_wb_data_q[0][((DQ_BITS*LANES)*2 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*2 + 8*index) +: 8]; //update lane for burst 2
+                    o_wb_data_q[0][((DQ_BITS*LANES)*3 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*3 + 8*index) +: 8]; //update lane for burst 3
+                    o_wb_data_q[0][((DQ_BITS*LANES)*4 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*4 + 8*index) +: 8]; //update lane for burst 4
+                    o_wb_data_q[0][((DQ_BITS*LANES)*5 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*5 + 8*index) +: 8]; //update lane for burst 5
+                    o_wb_data_q[0][((DQ_BITS*LANES)*6 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*6 + 8*index) +: 8]; //update lane for burst 6
+                    o_wb_data_q[0][((DQ_BITS*LANES)*7 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*7 + 8*index) +: 8]; //update lane for burst 7
                end
                /* verilator lint_off WIDTH */
+                // read_pipe #1
+                // NOTE: added_read_pipe_max and added_read_pipe can either be just 0 or 1 (NOTE TO SELF: optimize by lowering the bit size of this)
+                // If the added_read_pipe (added number of controller clk cycles of delay to a lane due to pcb trace) is equal to the 
+                // max delay (added_read_pipe_max, e.g. 0-0 or 1-1) for this lane, THEN we need to wait until the bit 1 reaches the LSB[0] of delay_read_pipe
+                // before retrieving the value from PHY. But if not the same (added_read_pipe is 0 while added_read_pipe_max is 1), then wait until
+                // the bit 1 reaches the one before LSB [1] (which goes high already since bit 1 of delay_read_pipe is the first to go high once shift_reg_read_pipe_q
+                // bit 1 goes high) before retrieving the value from PHY. So this means this lane with 0 delay will FIRST BE RETRIEVED
+                // while the lane with added_read_pipe_max of delay (delay of 1) will be retrieved SECOND
                if(delay_read_pipe[1][added_read_pipe_max != added_read_pipe[index]]) begin
                /* verilator lint_on WIDTH */
                    o_wb_data_q[1][((DQ_BITS*LANES)*0 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*0 + 8*index) +: 8]; //update each lane of the burst
@ -1230,17 +1324,49 @@ module ddr3_controller #(
                    o_wb_data_q[1][((DQ_BITS*LANES)*6 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*6 + 8*index) +: 8]; //update each lane of the burst
                    o_wb_data_q[1][((DQ_BITS*LANES)*7 + 8*index) +: 8] <= i_phy_iserdes_data[((DQ_BITS*LANES)*7 + 8*index) +: 8]; //update each lane of the burst
                end
+                // why are we alternatingly use the read_pipes?
+                //               time |   0    |   1     |   2     |   3     |
+                //    index_read_pipe |   1    |   0     |   1     |   0     |
+                // delay_read_pipe[1] | [0][0] | <[1][0] | [0][1]> | [1][0]  |   
+                // delay_read_pipe[0] | [0][0] | [0][0]  | <[1][0] | [0][1]> |  
+                //
+                // At time 1, request #1 is accepted by pipe[1], then wait until time 2 to retrieve the lane with longest added_read_pipe 
+                // (lane with added_read_pipe 0 is retrieved from ISERDES at time 1, lane with added_read_pipe 1 is retrieved from ISERDES at time 2)
+                // At time 2, request #2 is accepted by pipe[0] (since pipe[1] is still busy on request #1), then wait until time 3 to retrieve 
+                // the lane with longest added_read_pipe
+                // Thus pipe 0 and 1 is alternating to make sure that even if 0 is busy, 1 will retrieve the data. And then when 1 is busy, 0 will retrieve the data
+                // NOTE TO SELF: longest delay for a lane relative to others is 1 controller clk cycle, if more than 1 then it will not be retrievd 
+                // and aligned properly. Thus try to optimize this logic since delay is at max 1 controller clk only.
            end
+            // o_wb_ack_read_q[0][0] is also the wishbone ack (aligned with ack is the wishbone data) thus
+            // after sending the wishbone data on a particular index, invert it for the next ack
            if(o_wb_ack_read_q[0][0]) begin 
-                index_wb_data <= !index_wb_data;
+                index_wb_data <= !index_wb_data; //alternatingly uses the o_wb_data_q (either 0 or 1)
            end
            for(index = 1; index < MAX_ADDED_READ_ACK_DELAY; index = index + 1) begin
-                o_wb_ack_read_q[index-1] <= o_wb_ack_read_q[index];
+                o_wb_ack_read_q[index-1] <= o_wb_ack_read_q[index]; // shift rightward [ack] -> [] -> [LSB]
            end
-            o_wb_ack_read_q[MAX_ADDED_READ_ACK_DELAY-1] <= 0;
+            o_wb_ack_read_q[MAX_ADDED_READ_ACK_DELAY-1] <= 0; // MSB always gets zero and is shifted rightwards
            o_wb_ack_read_q[added_read_pipe_max] <= shift_reg_read_pipe_q[0];
+            // o_wb_ack_read_q[0] is the wishbone ack
+            // so once data is available from ISERDES (shift_reg_read_pipe_q[0] high) then need to wait added_read_pipe_max
+            // before the data is properly stored to o_wb_data_q and can be sent outside as wishbone data

-            //abort any outgoing ack when cyc is low
+            // BASICALLY:
+            // shift_reg_read_pipe_q is the delay from when the read command is issued from controller until the 
+            // data is received by the PHY ISERDES (total delay of READ_ACK_PIPE_WIDTH). The shift_reg_read_pipe_q[1]
+            // is then connected to delay_read_pipe[added_read_pipe_max (0 or 1)] which is the delay to align the lanes.
+            // The shift_reg_read_pipe_q[0] is then connected to o_wb_ack_read_q[added_read_pipe_max (0 or 1)] which
+            // is the delay until the wishbone data and ack will be sent outside
+            // NOTE TO SELF: Optimize by removing o_wb_ack_read_q and just connect the woshbone ack and data to delay_read_pipe[0].
+            // Visualization:
+            // shift_reg_read_pipe_q  [ ] -> [1] -> [ ]         [ ] -> [ ] -> [1]         [ ] -> [ ] -> [ ]          [ ] -> [ ] -> [ ]
+            //       delay_read_pipe  [ ] -> [ ] -> [ ]   --->  [ ] -> [1] -> [ ]   --->  [ ] -> [ ] -> [1]   --->   [ ] -> [ ] -> [ ]
+            //       o_wb_ack_read_q  [ ] -> [ ] -> [ ]         [ ] -> [ ] -> [ ]         [ ] -> [1] -> [ ]          [ ] -> [ ] -> [1]
+            //                   request is now on [1]            request passed              request passed        o_wb_ack_read_q[0]
+            //                of shift_reg_read_pipe_q        to delay_read_pipe          to o_wb_ack_read_q        high thus ready for wishbone ack
+            
+            // abort any outgoing ack when cyc is low
            if(!i_wb_cyc && state_calibrate == DONE_CALIBRATE) begin
                for(index = 0; index < MAX_ADDED_READ_ACK_DELAY; index = index + 1) begin
                    o_wb_ack_read_q[index] <= 0;
@ -1252,7 +1378,7 @@ module ddr3_controller #(
       end
    end
    assign o_wb_ack = o_wb_ack_read_q[0][0] && state_calibrate == DONE_CALIBRATE;
-    //o_wb_ack_read_q[0][0] is needed internally for write calibration but it must not go outside (since it is not an actual user wb request unless we are in DONE_CALIBRATE) 
+    //o_wb_ack_read_q[0][0] is needed internally to ack during write calibration but it must not go outside (since it is not an actual user wb request unless we are in DONE_CALIBRATE) 
    assign o_aux = o_wb_ack_read_q[0][AUX_WIDTH:1]; 
    assign o_wb_data = o_wb_data_q[index_wb_data];
    assign o_phy_dqs_tri_control = !write_dqs[STAGE2_DATA_DEPTH];
@ -1360,15 +1486,22 @@ module ddr3_controller #(
                idelay_data_cntvaluein[lane] <= o_phy_idelay_data_ld[lane]? idelay_data_cntvaluein[lane] + 1: idelay_data_cntvaluein[lane];
                idelay_dqs_cntvaluein[lane] <= o_phy_idelay_dqs_ld[lane]? idelay_dqs_cntvaluein[lane] + 1: idelay_dqs_cntvaluein[lane];
            end
+            // high initial_dqs is the time when the IDELAY of dqs and dq is not yet calibrated 
+            // dqs_target_index_value = dqs_start_index_stored[0]? dqs_start_index_stored + 2: dqs_start_index_stored + 1; // move to next odd (if 3 then 5, if 4 then 5)
+            // so dqs_target_index_value is basically the next odd number of dqs_start_index_stored (original starting bit when dqs starts).
+            // The next odd number ensure that the DQS edge is aligned to edge of ddr3_clk (and thus DQ data eye is aligned to edges of of ddr3_clk
+            // since dq is 90 degree relative to dqs
+            // Some images to show why next odd number is used: https://github.com/AngeloJacobo/UberDDR3/tree/b762c464f6526159c1d8c2e4ee039b4ae4e78dbd#per-lane-read-calibration
+            
            if(initial_dqs) begin
-                dqs_target_index <= dqs_target_index_value;
-                dq_target_index[lane] <= {1'b0, dqs_target_index_value};
-                dqs_target_index_orig <= dqs_target_index_value;
+                dqs_target_index <= dqs_target_index_value; // target index for DQS to make sure the DQS is edge-aligned with ddr3_clk
+                dq_target_index[lane] <= {1'b0, dqs_target_index_value}; // target index for DQ (just same as DQS)
+                dqs_target_index_orig <= dqs_target_index_value; // this will remain the same until we finish calibrating this whole lane
            end
-            if(idelay_dqs_cntvaluein[lane] == 0) begin //go back to previous odd
+            if(idelay_dqs_cntvaluein[lane] == 0) begin //the DQS got past cntvalue of 31 (and goes back to zero) THUS the target index should also go back (to previous odd)
                dqs_target_index <= dqs_target_index_orig - 2;
            end
-            if(idelay_data_cntvaluein[lane] == 0 && idelay_data_cntvaluein_prev == 31) begin
+            if(idelay_data_cntvaluein[lane] == 0 && idelay_data_cntvaluein_prev == 31) begin //the DQ got past cntvalue of 31 (and goes back to zero) thus the target index should also go back (to previous odd)
                dq_target_index[lane] <= dqs_target_index_orig - 2;
            end
            
@ -1397,6 +1530,7 @@ module ddr3_controller #(
                        another Bitslip command. If the ISERDESE2 is reset, the Bitslip logic is also reset
                        and returns back to its initial state.
                        */
+                        // bitslip basically is changing the arrangement of bytes on IOSERDES
                        if(i_phy_iserdes_bitslip_reference[lane*serdes_ratio*2 +: 8] == 8'b0111_1000) begin //initial arrangement
                            state_calibrate <= MPR_READ;
                            initial_dqs <= 1;
@ -1412,28 +1546,42 @@ module ddr3_controller #(
            MPR_READ: if(delay_before_read_data == 0) begin //align the incoming DQS during reads to the controller clock 
                             //issue_read_command = 1;
                             /* verilator lint_off WIDTH */
-                             delay_before_read_data <= READ_DELAY + 1 + 2 + 1 - 1; ///1=issue command delay (OSERDES delay), 2 =  ISERDES delay 
+                             delay_before_read_data <= READ_DELAY + 1 + 2 + 1 - 1; ///NOTE TO SELF: why these numbers? 1=issue command delay (OSERDES delay), 2 =  ISERDES delay 
                             /* verilator lint_on WIDTH */
                             state_calibrate <= COLLECT_DQS;
                             dqs_count_repeat <= 0;
                      end    
                      
-        COLLECT_DQS: if(delay_before_read_data == 0) begin
+                COLLECT_DQS: if(delay_before_read_data == 0) begin // data from MPR read is now received by controller
+                        // dqs from ISERDES is received and stored
+                        // DQS received from ISERDES: { {LANE_1_burst_7, LANE_1_burst_6, ... , LANE_1_burst_0} , {LANE_0_burst_7, LANE_0_burst_6, ... , LANE_0_burst_0}}
+                        // NOTE TO SELF: WHY DQS IS DIVIDED PER LANE BUT DQ IS PER BURST ???? 
+                        // dqs_store stores the 8 DQS (8 bursts) for a given lane but since the DQS might be shifted at next ddr3 clk cycles (due to trace delays), we must store the
+                        // 8 DQS multiple times (dictated by STORED_DQS_SIZE)
                        dqs_store <= {i_phy_iserdes_dqs[serdes_ratio*2*lane +: 8], dqs_store[(STORED_DQS_SIZE*8-1):8]}; 
                        dqs_count_repeat <= dqs_count_repeat + 1;
                        if(dqs_count_repeat == STORED_DQS_SIZE - 1) begin
                            state_calibrate <= ANALYZE_DQS;
-                            dqs_start_index_stored <= dqs_start_index;
-                            dqs_start_index <= 0;
+                            // store the previous value of dqs_start_index, if the ANALYZE_DQS is repeated then the dqs_start_index
+                            // should be the same as the previous value, else the previous (or current one) has a glitch causing 
+                            // a different dqs_start_index 
+                            dqs_start_index_stored <= dqs_start_index; 
+                            // start the index from zero since this will be incremented until we pinpoint the real 
+                            // starting bit of dqs_store (dictated by the pattern 10'b01_01_01_01_00)
+                            dqs_start_index <= 0;                             
                        end
                      end
-                      
+                        // find the bit where the DQS starts to be issued (by finding when the pattern 10'b01_01_01_01_00 starts)
         ANALYZE_DQS: if(dqs_store[dqs_start_index +: 10] == 10'b01_01_01_01_00) begin
-                        dqs_start_index_repeat <= (dqs_start_index == dqs_start_index_stored)? dqs_start_index_repeat + 1: 0; //increase dqs_start_index_repeat when index is the same as before      
-                         if(dqs_start_index_repeat == REPEAT_DQS_ANALYZE) begin //the same index appeared  REPEAT_DQS_ANALYZE times in a row, thus can proceed to CALIBRATE_DQS 
-                            initial_dqs <= 0;
+                         //increase dqs_start_index_repeat when index is the same as before   
+                        dqs_start_index_repeat <= (dqs_start_index == dqs_start_index_stored)? dqs_start_index_repeat + 1: 0;   
+                         //the same dqs_start_index_repeat appeared REPEAT_DQS_ANALYZE times in a row, thus we can trust the value we got is accurate and not affected by glitch
+                         if(dqs_start_index_repeat == REPEAT_DQS_ANALYZE) begin
+                             // since we already know the starting bit when the dqs (and dq since they are aligned) will come,
+                             // we will now start calibrating the IDELAY for dqs and dq (via CALIBRATE_DQS). 
+                             // high initial_dqs is the time when the IDELAY of dqs and dq is not yet calibrated so we zero this starting now
+                            initial_dqs <= 0; 
                            dqs_start_index_repeat <= 0;
-
                            state_calibrate <= CALIBRATE_DQS;
                         end
                        else begin
@ -1451,23 +1599,40 @@ module ddr3_controller #(
                            dqs_start_index <= dqs_start_index + 1;
                        end
                      end
-
+                        // check if the index when the dqs starts is the same as the target index which is aligned to the ddr3_clk
+                        // dqs_target_index is the next odd number to dqs_start_index BEFORE IDELAY CALIBRATION. We will increase the IDELAY
+                        // until the dqs_start_index_stored (current value of dqs_start_index) matches the target index which is aligned to ddr3_clk
        CALIBRATE_DQS: if(dqs_start_index_stored == dqs_target_index) begin
+                            // dq_target_index still stores the original dqs_target_index_value. The bit size of dq_target_index is just enough
+                            // to count the bits in dqs_store (the received 8 DQS stored STORED_DQS_SIZE times)
                            added_read_pipe[lane] <= { {( 4 - ($clog2(STORED_DQS_SIZE*8) - (3+1)) ){1'b0}} , dq_target_index[lane][$clog2(STORED_DQS_SIZE*8)-1:(3+1)] } 
                                                        + { 3'b0 , (dq_target_index[lane][3:0] >= (5+8)) };
+                            // if target_index is > 13, then a 1 CONTROLLLER_CLK cycle delay (4 ddr3_clk cycles) is added on that particular lane (due to trace delay)
+                            // added_read_pipe[lane] <= dq_target_index[lane][$clog2(STORED_DQS_SIZE*8)-1 : (4)]  +  ( dq_target_index[lane][3:0] >= 13 ) ;
                            dqs_bitslip_arrangement <= 16'b0011_1100_0011_1100 >> dq_target_index[lane][2:0];
+                            // the dqs is delayed (to move starting bit to next odd number) so this means the original
+                            // expected bitslip arrangement of  8'b0111_1000 will not be followed anymore, so here we form the bitslip
+                            // arrangement pattern so incoming dqs (and thus DQ) is arranged in the proper way (first bute firs, last byte last)
                            state_calibrate <= BITSLIP_DQS_TRAIN_2;
                       end
                       else begin
+                            // if we have not yet reached the target index then increment IDELAY
+                            // we will keep incrementing the IDELAY until the next odd index is reached (which is
+                            // the time we are sure the DQS is edge aligned with ddr3_clk and thus ddr3_clk posedge
+                            // is hitting the center of DQ eye
+                            // To show why next odd number is needed: https://github.com/AngeloJacobo/UberDDR3/tree/b762c464f6526159c1d8c2e4ee039b4ae4e78dbd#per-lane-read-calibration
                            o_phy_idelay_data_ld[lane] <= 1;
                            o_phy_idelay_dqs_ld[lane] <= 1;
                            state_calibrate <= MPR_READ;
                            delay_before_read_data <= 10; //wait for sometime to make sure idelay load settles
                       end
-                       
+                        //the dqs is delayed (to move starting bit to next odd number) so this means the original
+                        // expected bitslip arrangement of  8'b0111_1000 will not be followed anymore, so here the bitslip
+                        // is re-arranged
  BITSLIP_DQS_TRAIN_2: if(train_delay == 0) begin //train again the ISERDES to capture the DQ correctly
                            if(i_phy_iserdes_bitslip_reference[lane*serdes_ratio*2 +: 8] == dqs_bitslip_arrangement[7:0]) begin
                                /* verilator lint_off WIDTH */
+                                // this is the end of training and calibration for a single lane, so proceed to next lane
                                if(lane == LANES - 1) begin
                                /* verilator lint_on WIDTH */
                                    pause_counter <= 0; //read calibration now complete so continue the reset instruction sequence
@ -1481,8 +1646,11 @@ module ddr3_controller #(
                                 end
                                 else begin
                                     lane <= lane + 1;
-                                     state_calibrate <= BITSLIP_DQS_TRAIN_1;
+                                     state_calibrate <= BITSLIP_DQS_TRAIN_1;// current lane is done so go back to BITSLIP_DQS_TRAIN_1 to train next lane
                                 end
+                                // stores the highest value of added_read_pipe among the lanes since all lanes (except the lane with highest 
+                                // added_read_pipe) will be delayed to align with the lane with highest added_read_pipe. This alignment 
+                                // is required to make sure the received DQ will be aligned and can form the 512 bit data (for 8 lanes) arranged properly.
                                 added_read_pipe_max <= added_read_pipe_max > added_read_pipe[lane]? added_read_pipe_max:added_read_pipe[lane];
                            end
                            else begin
@ -1490,7 +1658,7 @@ module ddr3_controller #(
                                train_delay <= 3;
                            end
                       end
-                      
+                // CONTINUE COMMENT HERE  (once blog is done)                
    START_WRITE_LEVEL: if(!ODELAY_SUPPORTED) begin //skip write levelling if ODELAY is not supported
                            pause_counter <= 0;
                            lane <= 0;
@ -1548,7 +1716,14 @@ module ddr3_controller #(
                        calib_sel <= {wb_sel_bits{1'b1}};
                        calib_we <= 1; //write-enable
                        calib_addr <= 0;
+                        //                burst_7          burst_6         burst_5         burst_4         burst_3         burst_2         burst_1         burst_0
                        calib_data <= { {LANES{8'h91}}, {LANES{8'h77}}, {LANES{8'h29}}, {LANES{8'h8c}}, {LANES{8'hd0}}, {LANES{8'had}}, {LANES{8'h51}}, {LANES{8'hc1}} }; 
+                        // write to address 0 is a burst of 8 writes, where all lanes has same data written:  64'h9177298cd0ad51c1
+                        // Example for LANES of 2, DQ of 8: the 128 bits (8 bursts * 2 lanes/burst * 8bits/lane) are 8 bursts with 8 bytes each:
+                        // { burst_7, burst_6, burst_5, burst_4, burst_3, burst_2, burst_1, burst_0 } OR:
+                        // { { {burst7_lane1} , {burst7_lane0} } , { {burst6_lane1} , {burst6_lane0} } , { {burst5_lane1} , {burst5_lane0} } 
+                        // , { {burst4_lane1} , {burst4_lane0} } , { {burst3_lane1} , {burst3_lane0} } , { {burst2_lane1} , {burst2_lane0} } 
+                        // , { {burst1_lane1} , {burst1_lane0} } , { {burst0_lane1} , {burst0_lane0} } }
                        state_calibrate <= ISSUE_WRITE_2;
                       end    
        ISSUE_WRITE_2: begin
@ -1557,9 +1732,18 @@ module ddr3_controller #(
                        calib_sel <= {wb_sel_bits{1'b1}};
                        calib_we <= 1; //write-enable
                        calib_addr <= 1;
-                        calib_data <= { {LANES{8'h80}}, {LANES{8'hdb}}, {LANES{8'hcf}}, {LANES{8'hd2}}, {LANES{8'h75}}, {LANES{8'hf1}}, {LANES{8'h2c}}, {LANES{8'h3d}} }; 
+                        //                burst_7          burst_6         burst_5         burst_4         burst_3         burst_2         burst_1         burst_0
+                        calib_data <= { {LANES{8'h80}}, {LANES{8'hdb}}, {LANES{8'hcf}}, {LANES{8'hd2}}, {LANES{8'h75}}, {LANES{8'hf1}}, {LANES{8'h2c}}, {LANES{8'h3d}} };
+                        // write to address 1 is also a burst of 8 writes, where all lanes has same data written:  128'h80dbcfd275f12c3d
                        state_calibrate <= ISSUE_READ;
-                       end    
+                       end   
+                // NOTE: WHY THERE ARE TWO ISSUE_WRITE
+                // address 0 and 1 is written with a deterministic data, if the DQ trace has long delay (relative to command line) then the data will be delayed 
+                // compared to the write command. Thus the data aligned to the write command for address 0 MIGHT START AT MIDDLE OF EXPECTED OUTPUT
+                // DATA (64'h9177298cd0ad51c1) e.g. the data written might be 64'h[2c3d][9177298cd0ad] where the data written starts
+                // at burst 2 (burst 0 and burst 1 are cut-off since each burst uses 1 ddr3_clk cycle)
+                // Note here that if DQ and DQS sa same delay, then we know the DQS will always be aligned with DQ data 
+                
          ISSUE_READ: begin
                        calib_stb <= 1;//actual request flag
                        calib_aux <= 1; //AUX ID to determine later if ACK is for read or write
@ -1569,7 +1753,7 @@ module ddr3_controller #(
                      end   
                      
           READ_DATA: if(o_wb_ack_read_q[0] == {{(AUX_WIDTH-2){1'b0}}, 2'd1, 1'b1}) begin //wait for the read ack (which has AUX ID of 1}
-                         read_data_store <= o_wb_data;
+                         read_data_store <= o_wb_data; // read data on address 0 
                         calib_stb <= 0;
                         state_calibrate <= ANALYZE_DATA;
                         data_start_index[lane] <= 0;
@ -1583,7 +1767,12 @@ module ddr3_controller #(
                      else if(!o_wb_stall_calib) begin
                            calib_stb <= 0;
                      end
-                 
+                        // extract burst_0-to-burst_7 data for a specified lane then determine which byte in write_pattern does it starts
+                        // NOTE TO SELF: all "8" here assume DQ_BITS are 8? parameterize this properly
+                        // data_start_index for a specified lane determine how many bits are off the data from the write command
+                        // so for every 1 ddr3 clk cycle delay of DQ from write command, each lane will be 1 burst off:
+                        // e.g. LANE={burst7, burst6, burst5, burst4, burst3, burst2, burst1, burst0} then with 1 ddr3 cycle delay between DQ and command 
+                        // burst0 will not be written but only starting on burst1
        ANALYZE_DATA: if(write_pattern[data_start_index[lane] +: 64] == {read_data_store[((DQ_BITS*LANES)*7 + 8*lane) +: 8], read_data_store[((DQ_BITS*LANES)*6 + 8*lane) +: 8],
                        read_data_store[((DQ_BITS*LANES)*5 + 8*lane) +: 8], read_data_store[((DQ_BITS*LANES)*4 + 8*lane) +: 8], read_data_store[((DQ_BITS*LANES)*3 + 8*lane) +: 8],
                        read_data_store[((DQ_BITS*LANES)*2 + 8*lane) +: 8],read_data_store[((DQ_BITS*LANES)*1 + 8*lane) +: 8],read_data_store[((DQ_BITS*LANES)*0 + 8*lane) +: 8] }) begin   
@ -1598,7 +1787,7 @@ module ddr3_controller #(
                            end
                      end 
                      else begin
-                            data_start_index[lane] <= data_start_index[lane] + 8;
+                          data_start_index[lane] <= data_start_index[lane] + 8; //skip by 8
                            if(data_start_index[lane] == 56) begin //reached the end but no byte in write-pattern matches the data read, issue might be reading at wrong DQS toggle 
                                data_start_index[lane] <= 0;        //so we need to recalibrate the bitslip
                                start_index_check <= 0;
@ -1710,7 +1899,7 @@ BITSLIP_DQS_TRAIN_3: if(train_delay == 0) begin //train again the ISERDES to cap
                                    //if(write_test_address_counter[wb_addr_bits-1:0] == 500) begin //inject error at middle
                                    //    calib_data <= 1;
                                    //end 
-                                    if(write_test_address_counter[wb_addr_bits-1:0] == 999 ) begin //MUST END AT ODD NUMBER
+                                    if(write_test_address_counter[wb_addr_bits-1:0] == 99 ) begin //MUST END AT ODD NUMBER
                                        state_calibrate <= BURST_READ;
                                    end 
                                end
@ -1733,7 +1922,7 @@ BITSLIP_DQS_TRAIN_3: if(train_delay == 0) begin //train again the ISERDES to cap
                            calib_addr <= read_test_address_counter;
                            read_test_address_counter <= read_test_address_counter + 1;
                            if(MICRON_SIM) begin
-                                if(read_test_address_counter == 999) begin //MUST END AT ODD NUMBER
+                                if(read_test_address_counter == 99) begin //MUST END AT ODD NUMBER
                                        state_calibrate <= RANDOM_WRITE;  
                                end
                            end
@ -1759,7 +1948,7 @@ BITSLIP_DQS_TRAIN_3: if(train_delay == 0) begin //train again the ISERDES to cap
                                //if(write_test_address_counter[wb_addr_bits-1:0] == 1500) begin //inject error
                                //    calib_data <= 1;
                                //end
-                                if(write_test_address_counter[wb_addr_bits-1:0] == 1999) begin //MUST END AT ODD NUMBER since ALTERNATE_WRITE_READ must start at even
+                                if(write_test_address_counter[wb_addr_bits-1:0] == 199) begin //MUST END AT ODD NUMBER since ALTERNATE_WRITE_READ must start at even
                                    state_calibrate <= RANDOM_READ;
                                end
                            end
@ -1784,7 +1973,7 @@ BITSLIP_DQS_TRAIN_3: if(train_delay == 0) begin //train again the ISERDES to cap
                                       <= read_test_address_counter[wb_addr_bits-1:ROW_BITS];
                        read_test_address_counter <= read_test_address_counter + 1;
                        if(MICRON_SIM) begin
-                            if(read_test_address_counter == 1999) begin //MUST END AT ODD NUMBER since ALTERNATE_WRITE_READ must start at even
+                            if(read_test_address_counter == 199) begin //MUST END AT ODD NUMBER since ALTERNATE_WRITE_READ must start at even
                                state_calibrate <= ALTERNATE_WRITE_READ;
                            end
                        end
@ -1803,7 +1992,7 @@ ALTERNATE_WRITE_READ: if(!o_wb_stall_calib) begin
                        calib_addr <= {1'b0, write_test_address_counter[wb_addr_bits-1:1]}; //same address to be used for write and read ( so basically write then read instantly)
                        calib_data <= {wb_sel_bits{write_test_address_counter[7:0]}}; 
                        if(MICRON_SIM) begin
-                            if(write_test_address_counter == 4999) begin
+                            if(write_test_address_counter == 499) begin
                                train_delay <= 15;
                                state_calibrate <= FINISH_READ;
                            end
@ -1858,12 +2047,12 @@ ALTERNATE_WRITE_READ: if(!o_wb_stall_calib) begin
        end
    end      
    assign issue_read_command = (state_calibrate == MPR_READ && delay_before_read_data == 0);
-    assign issue_write_command = 0;
    assign o_phy_odelay_data_cntvaluein = odelay_data_cntvaluein[lane]; 
    assign o_phy_odelay_dqs_cntvaluein = odelay_dqs_cntvaluein[lane];
    assign o_phy_idelay_data_cntvaluein = idelay_data_cntvaluein[lane];
    assign o_phy_idelay_dqs_cntvaluein = idelay_dqs_cntvaluein[lane];
-    assign dqs_target_index_value = dqs_start_index_stored[0]? dqs_start_index_stored + 2: dqs_start_index_stored + 1;
+    assign dqs_target_index_value = dqs_start_index_stored[0]? dqs_start_index_stored + 2: dqs_start_index_stored + 1; // move to next odd (if 3 then 5, if 4 then 5)
+     // To show why next odd number is needed: https://github.com/AngeloJacobo/UberDDR3/tree/b762c464f6526159c1d8c2e4ee039b4ae4e78dbd#per-lane-read-calibration
    reg[31:0] wb_data_to_wb2 = 0;
    always @(posedge i_controller_clk) begin
        if(o_wb_ack_read_q[0][0]) wb_data_to_wb2 <= o_wb_data[31:0]; //save data read
@ -2193,7 +2382,7 @@ ALTERNATE_WRITE_READ: if(!o_wb_stall_calib) begin
            // find anticipate activate command slot number
            if(CL_nCK > CWL_nCK) slot_number = read_slot;
            else slot_number = write_slot;
-                delay = ps_to_nCK($rtoi(tRCD)); 
+                delay = ps_to_nCK(tRCD); 
            for(slot_number = slot_number;  delay != 0; delay = delay - 1) begin
                    slot_number = slot_number - 1'b1;
            end 
--- a/rtl/ddr3_phy.v
+++ b/rtl/ddr3_phy.v
@ -1,6 +1,6 @@
 `default_nettype none
 `timescale 1ps / 1ps
-//`define DEBUG_DQS
+//`define DEBUG_DQS // uncomment to route the raw DQS to output port for debugging

 module ddr3_phy #(
    parameter     CONTROLLER_CLK_PERIOD = 10_000, //ps, clock period of the controller interface
@ -12,7 +12,7 @@ module ddr3_phy #(
    parameter[0:0] ODELAY_SUPPORTED = 1, //set to 1 when ODELAYE2 is supported
                   USE_IO_TERMINATION = 0, //use IOBUF_DCIEN and IOBUFDS_DCIEN when 1
              // The next parameters act more like a localparam (since user does not have to set this manually) but was added here to simplify port declaration
-    parameter serdes_ratio = $rtoi(CONTROLLER_CLK_PERIOD/DDR3_CLK_PERIOD),
+    parameter serdes_ratio = 4, // this controller is fixed as a 4:1 memory controller (CONTROLLER_CLK_PERIOD/DDR3_CLK_PERIOD = 4)
              wb_data_bits = DQ_BITS*LANES*serdes_ratio*2,
              wb_sel_bits = wb_data_bits / 8,
              //4 is the width of a single ddr3 command {cs_n, ras_n, cas_n, we_n} plus 3 (ck_en, odt, reset_n) plus bank bits plus row bits
--- a/rtl/ddr3_top.v
+++ b/rtl/ddr3_top.v
@ -20,7 +20,7 @@ module ddr3_top #(
                   ODELAY_SUPPORTED = 1, //set to 1 when ODELAYE2 is supported
                   SECOND_WISHBONE = 0, //set to 1 if 2nd wishbone is needed 
    parameter // The next parameters act more like a localparam (since user does not have to set this manually) but was added here to simplify port declaration
-                serdes_ratio = $rtoi(CONTROLLER_CLK_PERIOD/DDR3_CLK_PERIOD),
+                serdes_ratio = 4, // this controller is fixed as a 4:1 memory controller (CONTROLLER_CLK_PERIOD/DDR3_CLK_PERIOD = 4)
                wb_addr_bits = ROW_BITS + COL_BITS + BA_BITS - $clog2(serdes_ratio*2),
                wb_data_bits = DQ_BITS*LANES*serdes_ratio*2,
                wb_sel_bits = wb_data_bits / 8,
--- a/testbench/ddr3_dimm_micron_sim.sv
+++ b/testbench/ddr3_dimm_micron_sim.sv
@ -53,8 +53,8 @@ module ddr3_dimm_micron_sim;
 `endif


- localparam CONTROLLER_CLK_PERIOD = 10, //ns, period of clock input to this DDR3 controller module
-            DDR3_CLK_PERIOD = 2.5, //ns, period of clock input to DDR3 RAM device 
+ localparam CONTROLLER_CLK_PERIOD = 10_000, //ps, period of clock input to this DDR3 controller module
+            DDR3_CLK_PERIOD = 2500, //ps, period of clock input to DDR3 RAM device 
            AUX_WIDTH = 16, // AUX lines
            OPT_LOWPOWER = 1, //1 = low power, 0 = low logic
            OPT_BUS_ABORT = 1;
@ -126,13 +126,13 @@ module ddr3_dimm_micron_sim;
       
   `else
        assign clk_locked = 1;
-        always #(CONTROLLER_CLK_PERIOD*1000/2) i_controller_clk = !i_controller_clk;
-        always #(DDR3_CLK_PERIOD*1000/2) i_ddr3_clk = !i_ddr3_clk;
+        always #(CONTROLLER_CLK_PERIOD/2) i_controller_clk = !i_controller_clk;
+        always #(DDR3_CLK_PERIOD/2) i_ddr3_clk = !i_ddr3_clk;
        always #2500 i_ref_clk = !i_ref_clk;
        initial begin //90 degree phase shifted ddr3_clk
-            #(DDR3_CLK_PERIOD*1000/4);
+            #(DDR3_CLK_PERIOD/4);
            while(1) begin
-                #(DDR3_CLK_PERIOD*1000/2) i_ddr3_clk_90 = !i_ddr3_clk_90;
+                #(DDR3_CLK_PERIOD/2) i_ddr3_clk_90 = !i_ddr3_clk_90;
            end
        end
        initial begin