// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_dispatch_unit
//
// Forwards entries from `ISSUE_WIDTH dispatch interfaces to BLOCK_SIZE execute
// interfaces.
//
// * Batching: the `ISSUE_WIDTH input slots are served in BATCH_COUNT groups of
//   BLOCK_SIZE slots each; a counter (batch_idx) selects the current group and
//   advances when every block of the group reports done.
// * Packetization: when NUM_LANES differs from `NUM_THREADS, each input entry
//   is split into NUM_PACKETS slices of NUM_LANES threads that are sent one at
//   a time, tagged with a packet id (pid) and start/end-of-packet flags
//   (sop/eop). Slices whose thread mask is all zero are not selected.
//
// Parameters:
//   BLOCK_SIZE - number of execute_if outputs (and input slots per batch).
//   NUM_LANES  - number of thread lanes carried by each output packet.
//   OUT_BUF    - forwarded through `TO_OUT_BUF_SIZE / `TO_OUT_BUF_REG to
//                configure the output VX_elastic_buffer.
//   MAX_FANOUT - threshold used to compute FANOUT_ENABLE below.
//
// NOTE(review): ISSUE_WIS_W, ISSUE_ISW_W and wis_to_wid() are not declared in
// this file; presumably they come from VX_gpu_pkg - confirm.
module VX_dispatch_unit import VX_gpu_pkg::*; #(
parameter BLOCK_SIZE = 1,
parameter NUM_LANES = 1,
parameter OUT_BUF = 0,
parameter MAX_FANOUT = `MAX_FANOUT
) (
input wire clk,
input wire reset,
// inputs
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
// outputs
VX_execute_if.master execute_if [BLOCK_SIZE]
);
// Both divisions below (BATCH_COUNT, NUM_PACKETS) must be exact.
`STATIC_ASSERT (`IS_DIVISBLE(`ISSUE_WIDTH, BLOCK_SIZE), ("invalid parameter"))
`STATIC_ASSERT (`IS_DIVISBLE(`NUM_THREADS, NUM_LANES), ("invalid parameter"))
// Index width for a block within a batch.
localparam BLOCK_SIZE_W = `LOG2UP(BLOCK_SIZE);
// Number of NUM_LANES-wide slices needed to cover all threads of one entry.
localparam NUM_PACKETS = `NUM_THREADS / NUM_LANES;
localparam PID_BITS = `CLOG2(NUM_PACKETS);
// Packet id width as used on the wires below.
localparam PID_WIDTH = `UP(PID_BITS);
// Number of groups of BLOCK_SIZE input slots.
localparam BATCH_COUNT = `ISSUE_WIDTH / BLOCK_SIZE;
localparam BATCH_COUNT_W= `LOG2UP(BATCH_COUNT);
// Index width for an input slot.
localparam ISSUE_W = `LOG2UP(`ISSUE_WIDTH);
// Width of one dispatch_if data word. The slicing further down relies on this
// layout (MSB first): UUID, wis, tmask, remaining instruction fields, then the
// three register operand arrays (rs1, rs2, rs3) with rs3 at bit 0.
localparam IN_DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `PC_BITS + `NR_BITS + `NT_WIDTH + (3 * `NUM_THREADS * `XLEN);
// Width of one execute_if data word: wis is replaced by a `NW_WIDTH warp id,
// thread-wide fields shrink to NUM_LANES, and pid + sop + eop are appended.
localparam OUT_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `PC_BITS + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1;
// When set, the packet-select pipe register below has depth 1 instead of 0.
localparam FANOUT_ENABLE= (`NUM_THREADS > (MAX_FANOUT + MAX_FANOUT /2));
// Bit offset of the thread mask inside a dispatch data word.
localparam DATA_TMASK_OFF = IN_DATAW - (`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS);
// Bit offset of the register operand arrays inside a dispatch data word.
localparam DATA_REGS_OFF = 0;
// Flatten the interface array into packed vectors so that it can be indexed
// with a run-time slot index (issue_idx).
wire [`ISSUE_WIDTH-1:0] dispatch_valid;
wire [`ISSUE_WIDTH-1:0][IN_DATAW-1:0] dispatch_data;
wire [`ISSUE_WIDTH-1:0] dispatch_ready;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign dispatch_valid[i] = dispatch_if[i].valid;
assign dispatch_data[i] = dispatch_if[i].data;
assign dispatch_if[i].ready = dispatch_ready[i];
end
// Per-block signals, driven inside the generate loop below.
wire [BLOCK_SIZE-1:0][ISSUE_W-1:0] issue_indices;
wire [BLOCK_SIZE-1:0] block_ready;
wire [BLOCK_SIZE-1:0][NUM_LANES-1:0] block_tmask;
wire [BLOCK_SIZE-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] block_regs;
wire [BLOCK_SIZE-1:0][PID_WIDTH-1:0] block_pid;
wire [BLOCK_SIZE-1:0] block_sop;
wire [BLOCK_SIZE-1:0] block_eop;
wire [BLOCK_SIZE-1:0] block_done;
// The current batch is finished once every block reports done.
wire batch_done = (& block_done);
// Index of the batch (group of BLOCK_SIZE input slots) currently served.
logic [BATCH_COUNT_W-1:0] batch_idx;
if (BATCH_COUNT != 1) begin
// Advance by one on every cycle in which batch_done is high.
always @(posedge clk) begin
if (reset) begin
batch_idx <= '0;
end else begin
batch_idx <= batch_idx + BATCH_COUNT_W'(batch_done);
end
end
end else begin
// Single batch: the index is constant and batch_done has no consumer here.
assign batch_idx = 0;
`UNUSED_VAR (batch_done)
end
// One dispatch lane per execute_if output.
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
// Input slot served by this block in the current batch.
wire [ISSUE_W-1:0] issue_idx = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx);
assign issue_indices[block_idx] = issue_idx;
// Declares block_reset derived from reset.
// NOTE(review): the third argument presumably enables a registered reset
// relay only when BLOCK_SIZE > 1 - confirm against the macro definition.
`RESET_RELAY_EN (block_reset, reset, (BLOCK_SIZE > 1));
// Handshake into this block's output buffer.
wire valid_p, ready_p;
if (`NUM_THREADS != NUM_LANES) begin
// ---- Packetized path: entry is sent as up to NUM_PACKETS slices ----
// One bit per packet, set once that packet has been sent.
reg [NUM_PACKETS-1:0] sent_mask_p;
// start_p_n: next packet to send (pre-register); start_p: packet being
// sent; end_p: last packet to send for this entry.
wire [PID_WIDTH-1:0] start_p_n, start_p, end_p;
wire dispatch_valid_r;
// High until the first packet of the current entry has fired (drives sop).
reg is_first_p;
wire fire_p = valid_p && ready_p;
wire is_last_p = (start_p == end_p);
wire fire_eop = fire_p && is_last_p;
// Packet progress tracking. The state is cleared on batch_done when there
// are several batches, otherwise when the last packet fires.
always @(posedge clk) begin
if (block_reset) begin
sent_mask_p <= '0;
is_first_p <= 1;
end else begin
if ((BATCH_COUNT != 1) ? batch_done : fire_eop) begin
sent_mask_p <= '0;
is_first_p <= 1;
end else if (fire_p) begin
sent_mask_p[start_p] <= 1;
is_first_p <= 0;
end
end
end
// Views of the selected input entry, regrouped per packet.
wire [NUM_PACKETS-1:0][NUM_LANES-1:0] per_packet_tmask;
wire [NUM_PACKETS-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] per_packet_regs;
wire [`NUM_THREADS-1:0] dispatch_tmask = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs1_data = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs2_data = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs3_data = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
// Thread k = i * NUM_LANES + j maps to lane j of packet i.
for (genvar i = 0; i < NUM_PACKETS; ++i) begin
for (genvar j = 0; j < NUM_LANES; ++j) begin
localparam k = i * NUM_LANES + j;
assign per_packet_tmask[i][j] = dispatch_tmask[k];
assign per_packet_regs[i][0][j] = dispatch_rs1_data[k];
assign per_packet_regs[i][1][j] = dispatch_rs2_data[k];
assign per_packet_regs[i][2][j] = dispatch_rs3_data[k];
end
end
// A packet is worth sending only if at least one of its lanes is active.
wire [NUM_PACKETS-1:0] packet_valids;
wire [NUM_PACKETS-1:0][PID_WIDTH-1:0] packet_ids;
for (genvar i = 0; i < NUM_PACKETS; ++i) begin
assign packet_valids[i] = (| per_packet_tmask[i]);
assign packet_ids[i] = PID_WIDTH'(i);
end
// Next packet to send: chosen among non-empty packets not yet sent.
// NOTE(review): presumably REVERSE=0 returns the lowest-index valid entry
// and REVERSE=1 the highest - confirm against VX_find_first.
VX_find_first #(
.N (NUM_PACKETS),
.DATAW (PID_WIDTH),
.REVERSE (0)
) find_first (
.valid_in (packet_valids & ~sent_mask_p),
.data_in (packet_ids),
.data_out (start_p_n),
`UNUSED_PIN (valid_out)
);
// Final packet of the entry: chosen among all non-empty packets,
// regardless of what has been sent; compared with start_p for eop.
VX_find_first #(
.N (NUM_PACKETS),
.DATAW (PID_WIDTH),
.REVERSE (1)
) find_last (
.valid_in (packet_valids),
.data_in (packet_ids),
.data_out (end_p),
`UNUSED_PIN (valid_out)
);
// Optional register stage (depth 1 only when FANOUT_ENABLE) between packet
// selection and packet use. Its reset input is also asserted on fire_p.
// Note that it uses reset, not block_reset.
VX_pipe_register #(
.DATAW (1 + PID_WIDTH),
.RESETW (1),
.DEPTH (FANOUT_ENABLE ? 1 : 0)
) pipe_reg (
.clk (clk),
.reset (reset || fire_p), // should flush on fire
.enable (1'b1),
.data_in ({dispatch_valid[issue_idx], start_p_n}),
.data_out ({dispatch_valid_r, start_p})
);
// Slice of the entry that corresponds to the packet being sent.
wire [NUM_LANES-1:0] tmask_p = per_packet_tmask[start_p];
wire [2:0][NUM_LANES-1:0][`XLEN-1:0] regs_p = per_packet_regs[start_p];
// With several batches, stop offering packets once every bit of
// sent_mask_p is set; with a single batch the block is always enabled.
wire block_enable = (BATCH_COUNT == 1 || ~(& sent_mask_p));
assign valid_p = dispatch_valid_r && block_enable;
assign block_tmask[block_idx] = tmask_p;
assign block_regs[block_idx] = regs_p;
assign block_pid[block_idx] = start_p;
assign block_sop[block_idx] = is_first_p;
assign block_eop[block_idx] = is_last_p;
// With the register stage present, ready is additionally qualified by the
// registered valid.
if (FANOUT_ENABLE) begin
assign block_ready[block_idx] = dispatch_valid_r && ready_p && block_enable;
end else begin
assign block_ready[block_idx] = ready_p && block_enable;
end
// Done when the slot holds nothing or its last packet is being accepted.
assign block_done[block_idx] = ~dispatch_valid[issue_idx] || fire_eop;
end else begin
// ---- Pass-through path: one packet carries all threads ----
assign valid_p = dispatch_valid[issue_idx];
assign block_tmask[block_idx] = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
assign block_regs[block_idx][0] = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
assign block_regs[block_idx][1] = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
assign block_regs[block_idx][2] = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
// Single packet: id 0, both first and last.
assign block_pid[block_idx] = '0;
assign block_sop[block_idx] = 1'b1;
assign block_eop[block_idx] = 1'b1;
assign block_ready[block_idx] = ready_p;
assign block_done[block_idx] = ~valid_p || ready_p;
end
// Issue-slot identifier built from the batch index and/or block index,
// depending on which of the two can actually vary.
wire [ISSUE_ISW_W-1:0] isw;
if (BATCH_COUNT != 1) begin
if (BLOCK_SIZE != 1) begin
assign isw = {batch_idx, BLOCK_SIZE_W'(block_idx)};
end else begin
assign isw = batch_idx;
end
end else begin
assign isw = block_idx;
end
// Warp id for the output, computed by wis_to_wid() from the wis field of the
// entry (located just above the thread mask) and isw.
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw);
// Output buffer. data_in is assembled MSB first as: the bits above the wis
// field, warp id, lane mask, the fields between the thread mask and the
// register operands, the three operand slices, pid, sop, eop.
VX_elastic_buffer #(
.DATAW (OUT_DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) buf_out (
.clk (clk),
.reset (block_reset),
.valid_in (valid_p),
.ready_in (ready_p),
.data_in ({
dispatch_data[issue_idx][IN_DATAW-1 : DATA_TMASK_OFF+`NUM_THREADS+ISSUE_WIS_W],
block_wid,
block_tmask[block_idx],
dispatch_data[issue_idx][DATA_TMASK_OFF-1 : DATA_REGS_OFF + 3 * `NUM_THREADS * `XLEN],
block_regs[block_idx][0],
block_regs[block_idx][1],
block_regs[block_idx][2],
block_pid[block_idx],
block_sop[block_idx],
block_eop[block_idx]}),
.data_out (execute_if[block_idx].data),
.valid_out (execute_if[block_idx].valid),
.ready_out (execute_if[block_idx].ready)
);
end
// Ready back to the dispatch interfaces: only the slots of the current batch
// can be ready, and only while their block is ready on its end-of-packet
// slice. All other slots read 0.
reg [`ISSUE_WIDTH-1:0] ready_in;
always @(*) begin
ready_in = 0;
for (integer i = 0; i < BLOCK_SIZE; ++i) begin
ready_in[issue_indices[i]] = block_ready[i] && block_eop[i];
end
end
assign dispatch_ready = ready_in;
endmodule
VX_dispatch_unit 模块是一个调度分发单元,负责把操作数分发到执行单元。它从输入的调度接口接收数据,将操作数按块进行分发,并生成相应的控制信号传递给执行单元。
输入接口:
- clk:时钟信号。
- reset:复位信号。
- VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH]:调度接口数组,输入的操作数和控制信号。
输出接口:
- VX_execute_if.master execute_if [BLOCK_SIZE]:执行接口数组,输出的操作数和控制信号。
参数:
- BLOCK_SIZE:块大小,默认值为 1。
- NUM_LANES:通道数量,默认值为 1。
- OUT_BUF:输出缓冲区参数,默认值为 0。
- MAX_FANOUT:最大扇出数量,默认值为宏 `MAX_FANOUT。
VX_dispatch_unit 模块的主要作用是:
- 从调度接口(dispatch_if)接收操作数和控制信号。
- 根据 BLOCK_SIZE 和 NUM_LANES 参数,将操作数按块分发到执行接口(execute_if)。
- 将分发后的数据和控制信号通过 VX_elastic_buffer 模块传递给执行单元。
参数计算:
- 计算局部参数 BLOCK_SIZE_W、NUM_PACKETS、PID_BITS、IN_DATAW、OUT_DATAW 等,用于后续的数据和信号处理。
调度接口信号分配:
- 将各调度接口的信号汇集到 dispatch_valid、dispatch_data 和 dispatch_ready。
块分发逻辑:
- 使用 for 循环生成多个块的分发逻辑。
- 根据 NUM_LANES 参数,处理线程掩码和寄存器数据,并生成相应的包标识符和控制信号。
- 通过 VX_elastic_buffer 模块,将分发后的数据和控制信号传递给执行单元。
批处理逻辑:
- 维护批次索引 batch_idx,根据批次数量和块大小调整批次索引。
准备信号生成:
- 生成回送给调度接口的 dispatch_ready。
与 VX_dispatch.sv 的关系
功能关系:
- VX_dispatch.sv 模块负责从操作数接口接收操作数,并将其分发到多个执行单元。
- VX_dispatch_unit.sv 模块是 VX_dispatch.sv 模块的一个子模块,具体实现了调度单元的分发逻辑。
接口关系:
- VX_dispatch.sv 中的 VX_dispatch_if.master dispatch_if 是 VX_dispatch_unit.sv 的输入接口,表示从调度接口接收的数据。
- VX_dispatch_unit.sv 中的 VX_execute_if.master execute_if 是 VX_dispatch.sv 的输出接口,表示分发到执行单元的数据。
数据关系:
- VX_dispatch.sv 中的调度接口 VX_dispatch_if.master dispatch_if 包含的操作数和控制信号,通过 VX_dispatch_unit.sv 的逻辑处理后,按块分发到执行接口 VX_execute_if.master execute_if。
总结:
VX_dispatch_unit.sv 模块是调度单元的核心逻辑实现,负责将操作数和控制信号按块分发到执行单元。它与 VX_dispatch.sv 模块紧密相关,共同完成操作数从接收到分发的全过程。
`STATIC_ASSERT (`IS_DIVISBLE(`ISSUE_WIDTH, BLOCK_SIZE), ("invalid parameter"))
`STATIC_ASSERT (`IS_DIVISBLE(`NUM_THREADS, NUM_LANES), ("invalid parameter"))
这两行静态断言用于确保 ISSUE_WIDTH 可以被 BLOCK_SIZE 整除,以及 NUM_THREADS 可以被 NUM_LANES 整除。如果这些条件不满足,将会报错,提示参数无效。
备注:
在硬件设计中,某些参数之间的整除关系可以确保系统的正确行为和高效操作。以下是这两行静态断言要求参数整除的原因:
ISSUE_WIDTH 和 BLOCK_SIZE:
- ISSUE_WIDTH 表示调度的宽度,即同时可以调度的指令数量。
- BLOCK_SIZE 表示每个块的大小,即每次分发的指令数量。
- 要求 ISSUE_WIDTH 可以被 BLOCK_SIZE 整除,确保每个块可以整齐地分发指令,而不会出现多余的指令无法分配的问题。
- 例如,ISSUE_WIDTH 是 8,而 BLOCK_SIZE 是 4,意味着可以将 8 个指令分成两个 4 指令的批次(BATCH_COUNT = 2)。如果不能整除,可能会导致最后一个批次无法完整分发,影响系统的调度逻辑。
NUM_THREADS 和 NUM_LANES:
- NUM_THREADS 表示线程的总数。
- NUM_LANES 表示每次操作的通道数量。
- 要求 NUM_THREADS 可以被 NUM_LANES 整除,确保全部线程可以被均匀地划分为若干个包,每个包恰好占满所有通道,而不会出现最后一个包只有部分通道有对应线程的问题。
- 例如,NUM_THREADS 是 16,而 NUM_LANES 是 4,意味着可以将 16 个线程均匀分成 4 个包(NUM_PACKETS = 4),每个包 4 个通道。如果不能整除,最后一个包将缺少线程,影响并行处理的效率。
通过确保这些参数之间的整除关系,可以保证硬件设计中资源的合理分配和高效利用,避免由于不均匀分配导致的潜在问题。静态断言在编译时检测这些条件,可以在设计的早期阶段发现并修正参数配置错误。
localparam BLOCK_SIZE_W = `LOG2UP(BLOCK_SIZE);
localparam NUM_PACKETS = `NUM_THREADS / NUM_LANES;
localparam PID_BITS = `CLOG2(NUM_PACKETS);
localparam PID_WIDTH = `UP(PID_BITS);
localparam BATCH_COUNT = `ISSUE_WIDTH / BLOCK_SIZE;
localparam BATCH_COUNT_W= `LOG2UP(BATCH_COUNT);
localparam ISSUE_W = `LOG2UP(`ISSUE_WIDTH);
localparam IN_DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `PC_BITS + `NR_BITS + `NT_WIDTH + (3 * `NUM_THREADS * `XLEN);
localparam OUT_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `PC_BITS + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1;
localparam FANOUT_ENABLE= (`NUM_THREADS > (MAX_FANOUT + MAX_FANOUT /2));
- BLOCK_SIZE_W:块大小的对数值(向上取整)。
- NUM_PACKETS:每个调度条目被拆分成的包数量(NUM_THREADS / NUM_LANES)。
- PID_BITS 和 PID_WIDTH:包标识符的位宽。
- BATCH_COUNT 和 BATCH_COUNT_W:批次数量(ISSUE_WIDTH / BLOCK_SIZE)及其对数值。
- ISSUE_W:调度宽度的对数值。
- IN_DATAW 和 OUT_DATAW:输入和输出数据宽度。
- FANOUT_ENABLE:根据 NUM_THREADS 和 MAX_FANOUT 判断是否启用扇出。
localparam DATA_TMASK_OFF = IN_DATAW - (`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS);
localparam DATA_REGS_OFF = 0;
wire [`ISSUE_WIDTH-1:0] dispatch_valid;
wire [`ISSUE_WIDTH-1:0][IN_DATAW-1:0] dispatch_data;
wire [`ISSUE_WIDTH-1:0] dispatch_ready;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign dispatch_valid[i] = dispatch_if[i].valid;
assign dispatch_data[i] = dispatch_if[i].data;
assign dispatch_if[i].ready = dispatch_ready[i];
end
以上代码通过 for 循环把各调度接口的信号分配到 dispatch_valid、dispatch_data 和 dispatch_ready。
wire [BLOCK_SIZE-1:0][ISSUE_W-1:0] issue_indices;
wire [BLOCK_SIZE-1:0] block_ready;
wire [BLOCK_SIZE-1:0][NUM_LANES-1:0] block_tmask;
wire [BLOCK_SIZE-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] block_regs;
wire [BLOCK_SIZE-1:0][PID_WIDTH-1:0] block_pid;
wire [BLOCK_SIZE-1:0] block_sop;
wire [BLOCK_SIZE-1:0] block_eop;
wire [BLOCK_SIZE-1:0] block_done;
这段代码主要用于处理批次索引(batch_idx)的更新逻辑。根据批次数量(BATCH_COUNT)来决定如何更新或设置批次索引。以下是详细分析:
变量定义:
logic [BATCH_COUNT_W-1:0] batch_idx;
batch_idx
,其位宽为 BATCH_COUNT_W
。该变量用于存储批次索引。条件判断:
if (BATCH_COUNT != 1) begin
BATCH_COUNT
是否不等于 1。如果 BATCH_COUNT
不等于 1,则进入 always
块,否则进入 else
块。always
块:
always @(posedge clk) begin
if (reset) begin
batch_idx <= '0;
end else begin
batch_idx <= batch_idx + BATCH_COUNT_W'(batch_done);
end
end
always @(posedge clk)
意味着该块中的逻辑在时钟上升沿触发。reset
信号有效,则将 batch_idx
置零。reset
信号无效,则将 batch_idx
增加 batch_done
。其中 batch_done
是一个标志,表示当前批次是否完成。else
块:
else begin
assign batch_idx = 0;
`UNUSED_VAR (batch_done)
end
BATCH_COUNT
等于 1,则直接将 batch_idx
赋值为 0。UNUSED_VAR
宏标记 batch_done
,表示该变量在这种情况下未被使用。批次索引更新:
BATCH_COUNT
不等于 1 时,batch_idx
在每个时钟周期(在时钟上升沿)根据 batch_done
的状态进行更新。如果 batch_done
为真,则 batch_idx
增加 1。BATCH_COUNT
等于 1 时,直接将 batch_idx
设置为 0,因为没有多个批次需要处理。复位信号处理:
batch_idx
置零,确保系统在复位后处于初始状态。未使用变量处理:
- 在 BATCH_COUNT 等于 1 的情况下,使用 UNUSED_VAR 宏标记 batch_done,以避免未使用变量的警告。
这段代码的主要作用是根据批次数量(BATCH_COUNT)来管理和更新批次索引(batch_idx),从而控制数据处理的批次进度。在多批次的情况下,batch_idx 随着每个批次的完成而增加;在单批次的情况下,batch_idx 始终为 0。
这段代码是 VX_dispatch_unit.sv
模块的一部分,用于将调度接口中的操作数和控制信号分发到执行单元。以下是详细的分析:
生成块索引:
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
wire [ISSUE_W-1:0] issue_idx = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx);
assign issue_indices[block_idx] = issue_idx;
genvar
生成多个块,每个块有一个索引 block_idx
,范围是 0 到 BLOCK_SIZE-1
。issue_idx
,并将其分配给 issue_indices
。复位信号处理:
`RESET_RELAY_EN (block_reset, reset, (BLOCK_SIZE > 1));
RESET_RELAY_EN
宏定义处理块复位信号,根据 BLOCK_SIZE
是否大于 1 来决定复位行为。有效和准备信号:
wire valid_p, ready_p;
多线程处理:
NUM_THREADS
不等于 NUM_LANES
,则启用多线程处理逻辑:if (`NUM_THREADS != NUM_LANES) begin
reg [NUM_PACKETS-1:0] sent_mask_p;
wire [PID_WIDTH-1:0] start_p_n, start_p, end_p;
wire dispatch_valid_r;
reg is_first_p;
wire fire_p = valid_p && ready_p;
wire is_last_p = (start_p == end_p);
wire fire_eop = fire_p && is_last_p;
时钟上升沿触发:
always @(posedge clk) begin
if (block_reset) begin
sent_mask_p <= '0;
is_first_p <= 1;
end else begin
if ((BATCH_COUNT != 1) ? batch_done : fire_eop) begin
sent_mask_p <= '0;
is_first_p <= 1;
end else if (fire_p) begin
sent_mask_p[start_p] <= 1;
is_first_p <= 0;
end
end
end
always
块中,处理块复位和发送掩码 sent_mask_p
的更新。数据处理和分发:
wire [NUM_PACKETS-1:0][NUM_LANES-1:0] per_packet_tmask;
wire [NUM_PACKETS-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] per_packet_regs;
wire [`NUM_THREADS-1:0] dispatch_tmask = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs1_data = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs2_data = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs3_data = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
包有效性和标识符:
wire [NUM_PACKETS-1:0] packet_valids;
wire [NUM_PACKETS-1:0][PID_WIDTH-1:0] packet_ids;
for (genvar i = 0; i < NUM_PACKETS; ++i) begin
assign packet_valids[i] = (| per_packet_tmask[i]);
assign packet_ids[i] = PID_WIDTH'(i);
end
查找第一个和最后一个有效包:
VX_find_first #(
.N (NUM_PACKETS),
.DATAW (PID_WIDTH),
.REVERSE (0)
) find_first (
.valid_in (packet_valids & ~sent_mask_p),
.data_in (packet_ids),
.data_out (start_p_n),
`UNUSED_PIN (valid_out)
);
VX_find_first #(
.N (NUM_PACKETS),
.DATAW (PID_WIDTH),
.REVERSE (1)
) find_last (
.valid_in (packet_valids),
.data_in (packet_ids),
.data_out (end_p),
`UNUSED_PIN (valid_out)
);
VX_find_first
模块查找第一个和最后一个有效包的标识符。管道寄存器:
VX_pipe_register #(
.DATAW (1 + PID_WIDTH),
.RESETW (1),
.DEPTH (FANOUT_ENABLE ? 1 : 0)
) pipe_reg (
.clk (clk),
.reset (reset || fire_p), // should flush on fire
.enable (1'b1),
.data_in ({dispatch_valid[issue_idx], start_p_n}),
.data_out ({dispatch_valid_r, start_p})
);
VX_pipe_register
模块处理管道寄存器,传递有效信号和包标识符。块使能和准备信号:
wire block_enable = (BATCH_COUNT == 1 || ~(& sent_mask_p));
assign valid_p = dispatch_valid_r && block_enable;
assign block_tmask[block_idx] = tmask_p;
assign block_regs[block_idx] = regs_p;
assign block_pid[block_idx] = start_p;
assign block_sop[block_idx] = is_first_p;
assign block_eop[block_idx] = is_last_p;
if (FANOUT_ENABLE) begin
assign block_ready[block_idx] = dispatch_valid_r && ready_p && block_enable;
end else begin
assign block_ready[block_idx] = ready_p && block_enable;
end
assign block_done[block_idx] = ~dispatch_valid[issue_idx] || fire_eop;
单线程处理:
NUM_THREADS
等于 NUM_LANES
,则简化处理逻辑:else begin
assign valid_p = dispatch_valid[issue_idx];
assign block_tmask[block_idx] = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
assign block_regs[block_idx][0] = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
assign block_regs[block_idx][1] = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
assign block_regs[block_idx][2] = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
assign block_pid[block_idx] = '0;
assign block_sop[block_idx] = 1'b1;
assign block_eop[block_idx] = 1'b1;
assign block_ready[block_idx] = ready_p;
assign block_done[block_idx] = ~valid_p || ready_p;
end
块索引和宽度:
wire [ISSUE_ISW_W-1:0] isw;
if (BATCH_COUNT != 1) begin
if (BLOCK_SIZE != 1) begin
assign isw = {batch_idx, BLOCK_SIZE_W'(block_idx)};
end else begin
assign isw = batch_idx;
end
end else begin
assign isw = block_idx;
end
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw);
弹性缓冲区:
VX_elastic_buffer #(
.DATAW (OUT_DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) buf_out (
.clk (clk),
.reset (block_reset),
.valid_in (valid_p),
.ready_in (ready_p),
.data_in ({
dispatch_data[issue_idx][IN_DATAW-1 : DATA_TMASK_OFF+`NUM_THREADS+ISSUE_WIS_W],
block_wid,
block_tmask[block_idx],
dispatch_data[issue_idx][DATA_TMASK_OFF-1 : DATA_REGS_OFF + 3 * `NUM_THREADS * `XLEN],
block_regs[block_idx][0],
block_regs[block_idx][1],
block_regs[block_idx][2],
block_pid[block_idx],
block_sop[block_idx],
block_eop[block_idx]}),
.data_out (execute_if[block_idx].data),
.valid_out (execute_if[block_idx].valid),
.ready_out (execute_if[block_idx].ready)
);
主要内容:
VX_elastic_buffer
模块传递数据和控制信号。接口:
dispatch_if
,用于接收调度数据和控制信号。execute_if
,用于输出分发后的数据和控制信号。作用:
wire [NUM_PACKETS-1:0][NUM_LANES-1:0] per_packet_tmask;
wire [NUM_PACKETS-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] per_packet_regs;
wire [`NUM_THREADS-1:0] dispatch_tmask = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs1_data = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs2_data = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs3_data = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
这段代码主要处理从调度数据(dispatch_data
)中提取线程掩码和寄存器的数据,并将其分配到每个包和每个通道。以下是详细分析:
per_packet_tmask
:
wire [NUM_PACKETS-1:0][NUM_LANES-1:0] per_packet_tmask;
NUM_PACKETS
表示包的数量。NUM_LANES
表示每个包中的通道数量。per_packet_regs
:
wire [NUM_PACKETS-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] per_packet_regs;
2:0
表示寄存器的数据分为三个部分(假设是 rs1、rs2 和 rs3)。XLEN
表示寄存器的数据宽度(例如 32 位或 64 位)。线程掩码:
wire [`NUM_THREADS-1:0] dispatch_tmask = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
dispatch_tmask
。DATA_TMASK_OFF
表示线程掩码在调度数据中的偏移量。NUM_THREADS
表示线程的数量。寄存器数据:
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs1_data = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs2_data = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs3_data = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
dispatch_rs1_data
、dispatch_rs2_data
和 dispatch_rs3_data
。DATA_REGS_OFF
表示寄存器数据在调度数据中的起始偏移量。NUM_THREADS * XLEN
。for
循环将提取的线程掩码和寄存器数据分配到每个包和每个通道:for (genvar i = 0; i < NUM_PACKETS; ++i) begin
for (genvar j = 0; j < NUM_LANES; ++j) begin
localparam k = i * NUM_LANES + j;
assign per_packet_tmask[i][j] = dispatch_tmask[k];
assign per_packet_regs[i][0][j] = dispatch_rs1_data[k];
assign per_packet_regs[i][1][j] = dispatch_rs2_data[k];
assign per_packet_regs[i][2][j] = dispatch_rs3_data[k];
end
end
- i 是包的索引,j 是通道的索引。
- k 是线程的全局索引,计算方式为 i * NUM_LANES + j。
- 将 k 对应的线程掩码和寄存器数据分配到每个包和每个通道。
总结:
- 通过嵌套的 for 循环,将全局线程索引的数据映射到包和通道的局部索引。
- 映射的数据来源是线程掩码 dispatch_tmask 和寄存器数据 dispatch_rs1_data、dispatch_rs2_data 和 dispatch_rs3_data。