接上面一篇,继续学习Cache和TLB的设计,再接下来是总线的搭载。这里再讲一下延迟槽,上一篇没说清楚。
CPU执行分支或者跳转指令时,会根据条件计算结果修改程序计数器(PC),这本身没什么问题,但是分支指令要到EX阶段才能确定是否跳转。
所以紧随其后的那条指令(延迟槽指令)会受到两种影响:一是它在分支结果确定之前就已经进入流水线,无论是否跳转都会被执行;二是如果它触发了异常,EPC必须回退到前面的分支指令(即PC-4),并且要标记该异常发生在延迟槽中。
prev_branch 是分支延迟槽的标志:
// MEM模块中的分支延迟槽跟踪
// prev_branch remembers whether the most recently accepted instruction was a
// branch, so the following instruction can be recognised as a delay slot.
reg prev_branch;

always @(posedge clk) begin
    if (!resetn) begin
        prev_branch <= 1'b0;
    end
    else if (valid_i && done_o && ready_i) begin
        // Updated only when valid_i, done_o and ready_i are all high.
        // valid_i is already true on this path, so the flag is simply
        // "branch instruction that is not raising an exception".
        prev_branch <= br_inst && !exc_i;
    end
end
// Reports that the committing instruction follows a branch (delay slot).
assign commit_bd  = prev_branch;
// Outside a delay slot the EPC is pc_i itself; inside one it is moved back by
// 4 bytes (presumably onto the preceding branch instruction).
assign commit_epc = !prev_branch ? pc_i : pc_i - 32'd4;
本文的ICache使用的是四路组相联,通过索引(Index)寻址,通过标签(Tag)匹配判断命中。指令地址inst_addr_reg中的[31:12]位是标签位,用于标识缓存块的来源地址。
// Per-way tag match: bits [19:0] of idle_rdata[n] (presumably way n's tag
// entry) are compared with the tag of the current request.
assign hit_0 = (tag == idle_rdata[0][19:0]);
assign hit_1 = (tag == idle_rdata[1][19:0]);
assign hit_2 = (tag == idle_rdata[2][19:0]);
assign hit_3 = (tag == idle_rdata[3][19:0]);
// Tag match in any of the four ways; the valid bit is not considered here.
assign hit   = |{hit_3, hit_2, hit_1, hit_0};
因为是四路组相联,所以这里如果在这四路中任意一个命中的话都是命中。
// Bit 20 of each way's idle_rdata entry is used as that way's valid flag.
assign valid_0 = idle_rdata[0][20];
assign valid_1 = idle_rdata[1][20];
assign valid_2 = idle_rdata[2][20];
assign valid_3 = idle_rdata[3][20];
// A way "succeeds" only when its tag matches AND its entry is valid.
assign succeed_0 = valid_0 & hit_0;
assign succeed_1 = valid_1 & hit_1;
assign succeed_2 = valid_2 & hit_2;
assign succeed_3 = valid_3 & hit_3;
// Overall success: at least one way holds a valid, matching entry.
assign succeed   = |{succeed_3, succeed_2, succeed_1, succeed_0};
缓存控制器的状态机控制缓存操作流程,有四个状态:空闲、处理缺失、AXI读取、预取。后面关于预取器
也会涉及几个状态,也设计成状态机了
// succeed_ack: registered copy of `succeed`. It is sampled only while
// work_state is 4'b0000 or 4'b0010 and keeps its value in every other state.
always @(posedge clk) begin
    if (rst) begin
        succeed_ack <= 1'b0;
    end
    else begin
        case (work_state)
            4'b0000, 4'b0010: succeed_ack <= succeed;
            default: ; // hold
        endcase
    end
end
// inst_req_reg: latched copy of the incoming request flag.
always @(posedge clk) begin
    if (rst)
        inst_req_reg <= 1'b0;
    else if (inst_addr_ok && (work_state == 4'b0000))
        // Capture inst_req while in state 4'b0000 with inst_addr_ok high.
        inst_req_reg <= inst_req;
    else if (inst_data_ok)
        // Drop the latched request once inst_data_ok is seen.
        // (The author marked this condition as TBD.)
        inst_req_reg <= 1'b0;
end
// inst_wr_reg: follows inst_wr while work_state is 4'b0000, and is frozen in
// every other state.
always @(posedge clk) begin
    if (rst)
        inst_wr_reg <= 1'b0;
    else if (work_state == 4'b0000)
        inst_wr_reg <= inst_wr;
end
// inst_size_reg: follows inst_size while work_state is 4'b0000, and is frozen
// in every other state.
always @(posedge clk) begin
    if (rst)
        inst_size_reg <= 2'b0;
    else if (work_state == 4'b0000)
        inst_size_reg <= inst_size;
end
// inst_addr_reg: latched request address. inst_addr is captured in two
// situations, both of which load the same value.
always @(posedge clk) begin
    if (rst) begin
        inst_addr_reg <= 32'b0;
    end
    else if ((work_state == 4'b0000) & inst_addr_ok) begin
        // State 4'b0000 with inst_addr_ok high.
        inst_addr_reg <= inst_addr;
    end
    else if ((work_state == 4'b1111) && (op_workstate == 4'd0) && cache_req) begin
        // State 4'b1111 with op_workstate at 0 and cache_req asserted.
        inst_addr_reg <= inst_addr;
    end
end
// target_bank: advances by one for every read beat (rvalid) tagged rid == 3
// that arrives while work_state is 4'b0011 or 4'b0111.
always @(posedge clk) begin
    if (rst)
        target_bank <= 3'd0;
    else if (rvalid && (rid == 4'd3) &&
             ((work_state == 4'b0011) || (work_state == 4'b0111)))
        target_bank <= target_bank + 3'd1;
end
// prefetch flag: set whenever work_state is 4'b0110; cleared in state 4'b0011
// on the rid == 3 read beat that arrives while target_bank is 7.
always @(posedge clk) begin
    if (rst)
        prefetch <= 1'b0;
    else if (work_state == 4'b0110)
        prefetch <= 1'd1;
    else if (rvalid && (rid == 4'd3) && (target_bank == 3'd7) &&
             (work_state == 4'b0011))
        prefetch <= 1'd0;
end
// ICache controller state register (work_state).
// The branches below form one priority chain: the first matching condition
// wins. Transitions visible in this block:
//   reset            -> 4'b0100
//   4'b0100          -> 4'b0000  when cache_work[0]
//   4'b0000          -> 4'b1111  when cache_req with cache_op[2:0] != 0
//   4'b1111          -> 4'b0000  when op_workstate == 1
//   4'b0101, 4'b1010 -> 4'b0000  unconditionally
//   4'b0110          -> 4'b1010  unconditionally
//   4'b0010          -> 4'b0110  when prefetch_state == 3
//   4'b0000          -> 4'b0001  when req_but_miss
//   4'b0001          -> 4'b0110 / 4'b0010 / 4'b0011 (see below)
//   4'b0011          -> 4'b0000  on the last read beat with rid == 3
//   4'b0111          -> 4'b0101  on the last read beat with rid == 3
// NOTE(review): no branch in this block enters 4'b0111; confirm whether that
// state is reachable.
always @(posedge clk)
begin
if(rst)
begin
work_state <= 4'b0100;
end
// Leave the post-reset state once cache_work[0] is set.
else if((work_state == 4'b0100) && cache_work[0])
begin
work_state <= 4'b0000;
end
// A cache operation request is checked before the miss branch further down,
// so it takes priority in state 4'b0000.
else if((work_state == 4'b0000) && (cache_req && (cache_op[2:0] != 3'd0)))
begin
work_state <= 4'b1111;
end
// Cache operation finished (op_workstate == 1): back to 4'b0000.
else if((work_state == 4'b1111) && (op_workstate == 4'd1))
begin
work_state <= 4'b0000;
end
// Single-cycle states that always return to 4'b0000.
else if((work_state == 4'b0101) ||(work_state == 4'b1010) )
begin
work_state <= 4'b0000;
end
// 4'b0110 lasts one cycle and then moves to 4'b1010.
else if((work_state == 4'b0110))
begin
work_state <= 4'b1010;
end
// Stay in 4'b0010 until the prefetcher reaches state 3.
else if(work_state == 4'b0010)
begin
if(prefetch_state == 4'd3)
begin
work_state <=4'b0110;
end
end
else if((work_state == 4'b0000) && req_but_miss) // request missed: enter state 4'b0001
begin
work_state <= 4'b0001;
end
else if(work_state == 4'b0001)
begin
// wait_prefetch with the prefetcher already in state 3: go to 4'b0110.
if(wait_prefetch && (prefetch_state == 4'd3))
begin
work_state <= 4'b0110;
end
// wait_prefetch but the prefetcher is not yet in state 3: wait in 4'b0010.
else if(wait_prefetch)
begin
work_state <= 4'b0010;
end
else if(arready && (arid == 4'd3)) // arready seen with arid == 3: enter state 4'b0011
begin
work_state <= 4'b0011;
end
end
else if((work_state == 4'b0011) && rlast && rvalid && (rid == 4'd3)) // last read beat (rid 3): return to state 4'b0000
begin
work_state <= 4'b0000;
end
else if((work_state == 4'b0111) && rlast && rvalid && (rid == 4'd3)) // last read beat (rid 3): enter state 4'b0101
begin
work_state <= 4'b0101;
end
// No condition matched: hold the current state.
else
begin
work_state <= work_state;
end
end
预取地址的生成:
预取器通过预测程序的执行流程,提前计算出可能需要访问的地址,以便提前加载数据到缓存中,减少访问延迟
// Prefetch target: the address 32 bytes (0x20) past the requested one.
assign prefetch_addr_input = inst_addr_input + 32'd32;
预取器状态机:
// Prefetcher state register.
//   15 : value after reset, left after one cycle
//    0 : waiting for prefetch_work
//    1 : waiting for arready with arid == 2
//    2 : waiting for the last read beat with rid == 2
//    3 : waiting for the main controller (work_state) before returning to 0
always @(posedge clk) begin
    if (rst) begin
        prefetch_state <= 4'd15;
    end
    else begin
        case (prefetch_state)
            4'd15: prefetch_state <= 4'd0;

            4'd0: if (prefetch_work)
                      prefetch_state <= 4'd1;

            4'd1: if (arready && (arid == 4'd2))
                      prefetch_state <= 4'd2;

            4'd2: if (rlast && rvalid && (rid == 4'd2))
                      prefetch_state <= 4'd3;

            // Return to 0 when the main controller is in 4'b0110, or when it
            // is in 4'b0001 without wait_prefetch set.
            4'd3: if ((work_state == 4'b0110) ||
                      ((work_state == 4'b0001) && !wait_prefetch))
                      prefetch_state <= 4'd0;

            default: ; // any other encoding holds its value
        endcase
    end
end
管理预取操作的状态转换,包括发送预取请求、等待响应、接收数据等。
后面关于预取的请求和发送以及预取数据的接收和存储就不详述了。
通过lru_*表存储每个Cache行中最近使用的块信息:
// LRU bookkeeping registers (lru_<a>_<b>, 128 bits each).
// Reset loads a fixed all-zeros / all-ones pattern; afterwards the table is
// updated on every request that hits while the controller is in 4'b0000.
always @(posedge clk) begin
    if (rst) begin
        lru_0_0 <= 128'h0;
        lru_0_1 <= 128'h0;
        lru_1_0 <= 128'hffffffffffffffffffffffffffffffff;
        lru_1_1 <= 128'h0;
        lru_2_0 <= 128'h0;
        lru_2_1 <= 128'hffffffffffffffffffffffffffffffff;
        lru_3_0 <= 128'hffffffffffffffffffffffffffffffff;
        lru_3_1 <= 128'hffffffffffffffffffffffffffffffff;
    end
    // State literal written as 4 bits to match the 4-bit comparisons used
    // against work_state elsewhere (was 3'b000, a width mismatch).
    else if ((work_state == 4'b0000) && inst_req_reg && succeed) begin // request and hit: update LRU
        // LRU table update logic goes here.
    end
end
模块通过AXI总线与外部存储器通信。AXI接口支持读请求和读响应,用于加载未命中的缓存块。
// Read address: the low 5 bits are forced to zero (32-byte aligned). The
// prefetcher's address is driven while it is in state 1; otherwise the
// latched request address is used.
assign araddr  = {((prefetch_state == 4'd1) ? prefetch_tag[31:5] : inst_addr_reg[31:5]), 5'b0};
// A read address is offered while the prefetcher is in state 1, or while the
// main controller is in 4'b0001 and not waiting on the prefetcher.
assign arvalid = ((prefetch_state == 4'd1) || ((work_state == 4'b0001) && !wait_prefetch));
// Read data is accepted while the main controller is in 4'b0011 or the
// prefetcher is in state 2.
assign rready  = ((work_state == 4'b0011) || (prefetch_state == 4'd2));
这里不太懂
本项目的DCache使用的是二路组相联。
tag是地址的高位部分,即data_addr_reg[31:12],将它和cache中保存的标签进行比较来判断是否命中:如果命中则从cache中取数据;未命中的话,则从外部存储器(主存)中加载数据。
两个实例tag_0和tag_1分别对应两路缓存。
dcache_data_ram
每路包含8个32位RAM,组成32字节缓存
// Per-way tag match for the two-way DCache: bits [19:0] of each way's tag
// read data are compared with the request tag.
assign hit_0 = (tag == tag_rdata_0[19:0]);
assign hit_1 = (tag == tag_rdata_1[19:0]);
// Tag match in either way.
assign hit   = |{hit_1, hit_0};
dirty_way_0/1记录每路缓存行是否被修改过:写操作成功后将对应脏位置1;替换前检查脏位,若脏则触发写回,通过victim_buffer暂存数据。后面也多了一个AXI写回流程,通过victim_workstate
状态机控制写回地址和数据传输。
// Write-back (victim) state register and the victim address register.
//   15 : value after reset, left after one cycle
//    0 : idle, waiting for the main controller to request a write-back
//    1 : one-cycle step before the address phase
//    2 : waiting for awready
//    3 : waiting for wready while target_bank_write is 7
//    4 : waiting for bvalid with bid == 0, then back to 0
reg [3:0]  victim_workstate;
reg [26:0] victim_addr;

always @(posedge clk) begin
    if (rst) begin
        victim_workstate <= 4'd15;
    end
    else begin
        case (victim_workstate)
            4'd15: victim_workstate <= 4'd0;

            4'd0: begin
                // Started by a miss that needs write-back in state 4'b0000,
                // or by main-controller state 4'b1110 ...
                if (((work_state == 4'b0000) && req_but_miss && write_back) ||
                    (work_state == 4'b1110))
                    victim_workstate <= 4'd1;
                // ... or by main-controller state 4'b0010.
                else if (work_state == 4'b0010)
                    victim_workstate <= 4'd1;
            end

            4'd1: victim_workstate <= 4'd2;

            4'd2: if (awready)
                      victim_workstate <= 4'd3;

            4'd3: if ((target_bank_write == 3'd7) && wready)
                      victim_workstate <= 4'd4;

            4'd4: if (bvalid && (bid == 4'd0))
                      victim_workstate <= 4'd0;

            default: ; // any other encoding holds its value
        endcase
    end
end
// Dirty-bit update on a write hit: the way-0 bit selected by `index` becomes
// set when way 0 is the matching way, and otherwise keeps its value.
// NOTE(review): only way 0 is shown here; the way-1 update is presumably an
// analogous block elsewhere - confirm.
always @(posedge clk) begin
    if (write_hit) begin
        dirty_way_0[index] <= dirty_way_0[index] | succeed_0;
    end
end
// DCache controller state register (work_state).
// The branches below form one priority chain: the first matching condition
// wins; when none matches the state is held (there is no final else).
// Transitions visible in this block:
//   reset            -> 4'b0100
//   4'b0100          -> 4'b0000  when cache_work_1
//   4'b0000          -> 4'b1111  when cache_req with cache_op[6:3] != 0
//   4'b1111          -> 4'b0000  when op_workstate == 1
//   4'b0101, 4'b0110 -> 4'b0000  unconditionally
//   4'b0010          -> 4'b0001  when victim_workstate == 1
//   4'b0000          -> 4'b0111  when data_addr_ok && data_wr
//   4'b0111          -> 4'b1110 / 4'b0001 / 4'b0000 (see below)
//   4'b1110          -> 4'b0010  unconditionally
//   4'b0000          -> 4'b0010 / 4'b0001 on a miss, with / without write_back
//   4'b0001          -> 4'b0011  when arready && !wait_victim_buffer
//   4'b0011          -> 4'b1000  on a read beat with rid == 1
//   4'b1000          -> 4'b1001  unconditionally
//   4'b1001          -> 4'b0110 / 4'b0000 on the last read beat with rid == 1
// NOTE(review): no branch in this block enters 4'b0101; confirm whether that
// state is reachable.
reg [3:0] work_state; // author's original legend (2-bit, predates the 4-bit encoding above): 00: hit /01: seek to replace and require /11: wait for axi
always @(posedge clk)
begin
if(rst)
begin
work_state <= 4'b0100;
end
// Leave the post-reset state once cache_work_1 is set.
else if((work_state == 4'b0100) && cache_work_1)
begin
work_state <= 4'b0000;
end
// A cache operation request is checked before every other 4'b0000 branch.
else if((work_state == 4'b0000) && (cache_req && (cache_op[6:3] != 4'd0)))
begin
work_state <= 4'b1111;
end
// Cache operation finished (op_workstate == 1): back to 4'b0000.
else if((work_state == 4'b1111) && (op_workstate == 4'd1))
begin
work_state <= 4'b0000;
end
// Single-cycle states that always return to 4'b0000.
else if((work_state == 4'b0101) || (work_state == 4'b0110))
begin
work_state <= 4'b0000;
end
else if((work_state == 4'b0010) && (victim_workstate == 4'd1)) // victim FSM reached state 1: enter state 4'b0001
begin
work_state <= 4'b0001;
end
else if((work_state == 4'b0000) && data_addr_ok && data_wr) // accepted write request: enter state 4'b0111
begin
work_state <= 4'b0111;
end
else if((work_state == 4'b0111) && req_but_miss && write_back) // write missed and write-back is needed: enter state 4'b1110
begin
work_state <= 4'b1110;
end
else if(work_state == 4'b1110) // one-cycle step, then state 4'b0010
begin
work_state <= 4'b0010;
end
else if((work_state == 4'b0111) && req_but_miss && !write_back) // write missed, no write-back: enter state 4'b0001
begin
work_state <= 4'b0001;
end
else if((work_state == 4'b0000) && req_but_miss && write_back) // miss with write-back needed: enter state 4'b0010
begin
work_state <= 4'b0010;
end
else if((work_state == 4'b0111) && succeed && !continue_sw) // write hit and continue_sw is low: return to state 4'b0000
begin
work_state <= 4'b0000;
end
else if((work_state == 4'b0000) && req_but_miss && !write_back) // miss without write-back: enter state 4'b0001
begin
work_state <= 4'b0001;
end
else if((work_state == 4'b0001) && arready && !wait_victim_buffer) // arready seen and not waiting on the victim buffer: enter state 4'b0011
begin
work_state <= 4'b0011;
end
else if((work_state == 4'b0011) && rvalid && (rid == 4'd1)) // read beat with rid 1 received: enter state 4'b1000
begin
work_state <= 4'b1000;
end
else if(work_state == 4'b1000) // one-cycle step, then state 4'b1001
begin
work_state <= 4'b1001;
end
else if((work_state == 4'b1001) && rvalid && rlast && (rid == 4'd1) && data_wr_reg) // last read beat (rid 1) with data_wr_reg set: enter state 4'b0110
begin
work_state <= 4'b0110;
end
else if((work_state == 4'b1001) && rvalid && rlast && (rid == 4'd1)) // last read beat (rid 1) with data_wr_reg clear: return to state 4'b0000
begin
work_state <= 4'b0000;
end
end
还差AXI总线的搭载和TLB,因为现在已经是过了0点,1点多了,今天白天会补上,拖延症太严重了。。。