Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CUDA Warp Level Collectives #229

Open
wants to merge 57 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
57 commits
Select commit Hold shift + click to select a range
3729e3c
Added code for vote and shfl for decode stage of pipeline
borlaugg Jun 28, 2024
a18de95
Added vote and shfl instructions (opcode)
borlaugg Jun 28, 2024
cb57ecc
Added SFUtypes Vote and SHFL
borlaugg Jun 28, 2024
10e2d35
Added code for execution of vote and shfl
borlaugg Jun 28, 2024
0a5279e
resolved merge---Merge branch 'master' of https://github.com/vortexgp…
borlaugg Jul 1, 2024
b34b49d
Added intrinsic for vote and shfl
borlaugg Jul 15, 2024
f5961d9
Fixed decode steps of shfl and vote
borlaugg Jul 15, 2024
0af1a7c
Added and removed debug prints to identify cause of error
borlaugg Jul 15, 2024
36b6f53
Fixed vote and shfl execute steps
borlaugg Jul 15, 2024
effcfa5
Added lines to make shfl and vote
borlaugg Jul 15, 2024
fbfa5d4
Created tests for shfl
borlaugg Jul 15, 2024
446db75
Added test for vote
borlaugg Jul 15, 2024
b7be811
Updated execute
borlaugg Jul 31, 2024
b249a1c
Added vote and shfl for rtlsim
borlaugg Jul 31, 2024
54d7669
Added shfl and vote definitions
borlaugg Jul 31, 2024
35aab8f
Added decode stage for vote and shfl
borlaugg Jul 31, 2024
46bda29
Added shfl and vote decode
borlaugg Jul 31, 2024
6f5094f
Added vote and shfl
borlaugg Jul 31, 2024
61791a0
Reverted to old version
borlaugg Jul 31, 2024
58be8fa
Reverted to old working commit
borlaugg Jul 31, 2024
4e87ca1
Added Vote and shfl definitions (working)
borlaugg Jul 31, 2024
94e74a2
Added decode stage for vote and shfl (working)
borlaugg Jul 31, 2024
afec564
Added vote and shfl execution blocks (working)
borlaugg Jul 31, 2024
df049ca
Execute stage working patch for vote and shfl
borlaugg Aug 2, 2024
0ab27e2
Working shfl and vote decode phase
borlaugg Aug 2, 2024
2fd64d5
Minor fix
borlaugg Aug 2, 2024
56b5cac
No changes
borlaugg Aug 2, 2024
84d86f2
Working vote and shfl execute
borlaugg Aug 2, 2024
2bbec22
Added vote and shfl func units
borlaugg Aug 2, 2024
422464b
No changes
borlaugg Aug 2, 2024
04f7892
Latest execute fix for vote and shfl
borlaugg Aug 4, 2024
c451689
Fixed rtlsim of vote and shfl
borlaugg Aug 21, 2024
1ba9287
Fixed simx for vote and shfl
borlaugg Aug 21, 2024
18d53d9
Checked test files
borlaugg Aug 21, 2024
444bb1e
Fixed intrinsics
borlaugg Aug 21, 2024
323cbf1
Removed unecessary vote and shfl
borlaugg Aug 21, 2024
16c69ae
Added srcReg and deallocated it for maskreg and c_add (vote and shfl)
borlaugg Sep 19, 2024
d12f412
Small typo
borlaugg Sep 19, 2024
9feba99
Fixed typo
borlaugg Sep 19, 2024
0588b58
Fixed function unit for vote and shfl
borlaugg Sep 19, 2024
6dfacec
Changed vote shfl to alu
borlaugg Sep 19, 2024
5c56f53
Fixed typo
borlaugg Sep 20, 2024
af593d6
Fixed typo
borlaugg Sep 20, 2024
d13cdc3
Removed tile prints
borlaugg Sep 26, 2024
7c37dc0
Chnaged opcodes of vote and SHFL
borlaugg Nov 1, 2024
c44e6cb
Updated Vote and SHFL opcodes
borlaugg Nov 1, 2024
4046d5f
Fixed error with lane and p variables
borlaugg Jan 9, 2025
be02702
Debugged issue with active_l
borlaugg Jan 10, 2025
33e90af
Debugged Vote ballor
borlaugg Jan 15, 2025
e2aed49
Fixed Comments
borlaugg Jan 15, 2025
be38cf6
Modified Vote Test
borlaugg Jan 15, 2025
bfa5bbc
Fixed Vote shfl
borlaugg Jan 15, 2025
809905d
Merge branch 'master' into cuda_VoteShfl3
borlaugg Feb 25, 2025
31a9fac
Merged with master (Rishabh)
borlaugg Feb 25, 2025
0c8a7a1
Patched pull request issue in ALU_int
borlaugg Feb 26, 2025
eb1015b
Patched issue for XLEN64 with vote shfl
borlaugg Feb 26, 2025
bc6d707
Fix for PR
borlaugg Mar 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions hw/rtl/VX_define.vh
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,26 @@

// Custom extension opcodes
`define INST_EXT1 7'b0001011 // 0x0B
`define INST_EXT2 7'b0101011 // 0x2B
`define INST_EXT3 7'b1011011 // 0x5B
`define INST_EXT2 7'b0101011 // 0x2B Vote
`define INST_EXT3 7'b1011011 // 0x5B Shfl
`define INST_EXT4 7'b1111011 // 0x7B

// CUDA Vote Extension

`define VOTE_ALL 4'b0000
`define VOTE_ANY 4'b0001
`define VOTE_UNI 4'b0010
`define VOTE_BALLOT 4'b0011

`define SHFL_BFLY 4'b0100
`define SHFL_UP 4'b0101
`define SHFL_DOWN 4'b0110
`define SHFL_IDX 4'b0111

`define VOTE_NONE 4'b1000
`define VOTE_NOT_ALL 4'b1001


// Opcode extensions
`define INST_R_F7_MUL 7'b0000001
`define INST_R_F7_ZICOND 7'b0000111
Expand Down
105 changes: 93 additions & 12 deletions hw/rtl/core/VX_alu_int.sv
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ module VX_alu_int #(
localparam PID_WIDTH = `UP(PID_BITS);
localparam SHIFT_IMM_BITS = `CLOG2(`XLEN);

`UNUSED_VAR (execute_if.data.rs3_data)
//`UNUSED_VAR (execute_if.data.rs3_data)

wire [NUM_LANES-1:0][`XLEN-1:0] add_result;
wire [NUM_LANES-1:0][`XLEN:0] sub_result; // +1 bit for branch compare
Expand All @@ -47,6 +47,8 @@ module VX_alu_int #(
wire [NUM_LANES-1:0][`XLEN-1:0] sub_result_w;
wire [NUM_LANES-1:0][`XLEN-1:0] shr_result_w;
reg [NUM_LANES-1:0][`XLEN-1:0] msc_result_w;
reg [NUM_LANES-1:0][`XLEN-1:0] vote_result;
reg [NUM_LANES-1:0][`XLEN-1:0] shfl_result;

reg [NUM_LANES-1:0][`XLEN-1:0] alu_result;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_result_r;
Expand All @@ -66,6 +68,7 @@ module VX_alu_int #(

wire [NUM_LANES-1:0][`XLEN-1:0] alu_in1 = execute_if.data.rs1_data;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2 = execute_if.data.rs2_data;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in3 = execute_if.data.rs3_data;

wire [NUM_LANES-1:0][`XLEN-1:0] alu_in1_PC = execute_if.data.op_args.alu.use_PC ? {NUM_LANES{execute_if.data.PC, 1'd0}} : alu_in1;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_imm = execute_if.data.op_args.alu.use_imm ? {NUM_LANES{`SEXT(`XLEN, execute_if.data.op_args.alu.imm)}} : alu_in2;
Expand Down Expand Up @@ -114,20 +117,98 @@ module VX_alu_int #(
assign msc_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] << alu_in2_imm[i][4:0])); // SLLW
end

for (genvar i = 0; i < NUM_LANES; ++i) begin : g_alu_result
// VOTE
wire [NUM_LANES-1:0] active_t = (alu_in2[0][NUM_LANES-1:0] & (execute_if.data.tmask));
wire [NUM_LANES-1:0] is_pred;
wire [NUM_LANES-1:0] vote_in = (is_pred & active_t);
wire is_neg = alu_op[2];

wire vote_all = (is_neg) ? (vote_in == NUM_LANES'(1'b0)) : (vote_in == active_t);
wire vote_any = (is_neg) ? (vote_in != active_t) : (vote_in > NUM_LANES'(1'b0));
wire vote_uni = ((vote_in == active_t) || (vote_in == NUM_LANES'(1'b0)));
wire [NUM_LANES-1:0] vote_ballot;
for (genvar i = 0; i < NUM_LANES; ++i) begin : vote_result_update
assign is_pred[i] = alu_in1[i][0] & alu_in2[0][i];
assign vote_ballot[i] = vote_in[NUM_LANES - 1 -i];
always @(*) begin
case (alu_op[1:0])
2'b00: vote_result[i] = `XLEN'(vote_all); // ALL, NONE
2'b01: vote_result[i] = `XLEN'(vote_any); // ANY, NOT_ALL
2'b10: vote_result[i] = `XLEN'(vote_uni); // UNI
2'b11: vote_result[i] = `XLEN'(vote_ballot); // BALLOT
endcase
end
end

// SHFL
wire [NUM_LANES-1:0][`XLEN-1:0] b;
wire [NUM_LANES-1:0][`XLEN-1:0] segmask;
wire [NUM_LANES-1:0][`XLEN-1:0] c;
wire [NUM_LANES-1:0][`XLEN-1:0] maxLane, minLane;
reg [NUM_LANES-1:0][`XLEN-1:0] lane;
reg [NUM_LANES-1:0] p;
reg [NUM_LANES-1:0] active_l;

for (genvar i = 0; i < NUM_LANES; ++i) begin : shfl_result_update
assign b[i] = (alu_in2_imm[i]>>5)&(`XLEN'(5'b11111));
assign segmask[i] = ((alu_in3[i]>>5)&(`XLEN'(5'b11111)));
assign c[i] = (alu_in3[i] & `XLEN'(5'b11111));
assign maxLane[i] = ((`XLEN'(i) & segmask[i]) | (c[i] & ~(segmask[i])));
assign minLane[i] = (`XLEN'(i) & segmask[i]);
always @(*) begin
case (alu_op)
`SHFL_BFLY: begin
lane[i] = `XLEN'(i) - b[i];
p[i] = (lane[i] >= maxLane[i]);
end
`SHFL_UP: begin
lane[i] = `XLEN'(i) + b[i];
p[i] = (lane[i] <= maxLane[i]);
end
`SHFL_DOWN: begin
lane[i] = `XLEN'(i) ^ b[i];
p[i] = (lane[i] <= maxLane[i]);
end
`SHFL_IDX: begin
lane[i] = minLane[i] | (b[i] & ~(segmask[i]));
p[i] = (lane[i] <= maxLane[i]);
end
default: begin
lane[i] = ~(`XLEN'(1'b0));
p[i] = ~(1'b0);
end
endcase
if(p[i] == 1'b0) begin
lane[i] = `XLEN'(i);
end
active_l[i] = (lane[i] < NUM_LANES) ? alu_in2[0][$signed(lane[i][SHIFT_IMM_BITS-1:0])] : alu_in2[0][i];
shfl_result[i] = (active_t[i] && active_l[i]) ? ( (lane[i] < NUM_LANES) ? alu_in1[$signed(lane[i][SHIFT_IMM_BITS-1:0])] : alu_in1[i]) : `XLEN'(1'b0);
end
end

for (genvar i = 0; i < NUM_LANES; ++i) begin : Final_results
wire [`XLEN-1:0] slt_br_result = `XLEN'({is_br_op && ~(| sub_result[i][`XLEN-1:0]), sub_result[i][`XLEN]});
wire [`XLEN-1:0] sub_slt_br_result = (is_sub_op && ~is_br_op) ? sub_result[i][`XLEN-1:0] : slt_br_result;
always @(*) begin
case ({is_alu_w, op_class})
3'b000: alu_result[i] = add_result[i]; // ADD, LUI, AUIPC
3'b001: alu_result[i] = sub_slt_br_result; // SUB, SLTU, SLTI, BR*
3'b010: alu_result[i] = shr_zic_result[i]; // SRL, SRA, SRLI, SRAI, CZERO*
3'b011: alu_result[i] = msc_result[i]; // AND, OR, XOR, SLL, SLLI
3'b100: alu_result[i] = add_result_w[i]; // ADDIW, ADDW
3'b101: alu_result[i] = sub_result_w[i]; // SUBW
3'b110: alu_result[i] = shr_result_w[i]; // SRLW, SRAW, SRLIW, SRAIW
3'b111: alu_result[i] = msc_result_w[i]; // SLLW
endcase
if (execute_if.data.op_args.alu.xtype == `ALU_TYPE_OTHER) begin
case (alu_op[2])
1'b0: alu_result[i] = vote_result[i];
1'b1: alu_result[i] = shfl_result[i];
default:;
endcase
end
else begin
case ({is_alu_w, op_class})
3'b000: alu_result[i] = add_result[i]; // ADD, LUI, AUIPC
3'b001: alu_result[i] = sub_slt_br_result; // SUB, SLTU, SLTI, BR*
3'b010: alu_result[i] = shr_zic_result[i]; // SRL, SRA, SRLI, SRAI, CZERO*
3'b011: alu_result[i] = msc_result[i]; // AND, OR, XOR, SLL, SLLI
3'b100: alu_result[i] = add_result_w[i]; // ADDIW, ADDW
3'b101: alu_result[i] = sub_result_w[i]; // SUBW
3'b110: alu_result[i] = shr_result_w[i]; // SRLW, SRAW, SRLIW, SRAIW
3'b111: alu_result[i] = msc_result_w[i]; // SLLW
endcase
end
end
end

Expand Down
61 changes: 60 additions & 1 deletion hw/rtl/core/VX_decode.sv
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ module VX_decode import VX_gpu_pkg::*; #(
wire [4:0] rd = instr[11:7];
wire [4:0] rs1 = instr[19:15];
wire [4:0] rs2 = instr[24:20];
wire [4:0] rs3 = instr[31:27];
wire [4:0] rs3 = (opcode == `INST_EXT4)? {3'b000,instr[31:30]} + instr[24:20] : instr[31:27];

`UNUSED_VAR (func2)
`UNUSED_VAR (func5)
Expand Down Expand Up @@ -532,6 +532,65 @@ module VX_decode import VX_gpu_pkg::*; #(
default:;
endcase
end
`INST_EXT2: begin
ex_type = `EX_ALU;
op_args.alu.use_imm = 1;
op_args.alu.imm = `SEXT(`IMM_BITS, u_12);
op_args.alu.xtype = `ALU_TYPE_OTHER;
op_args.alu.is_w = 1;
use_rd = 1;
`USED_IREG (rd);
`USED_IREG (rs1);
`USED_IREG (rs2); //membermask imm[24:20]
case (func3)
3'b000: begin
op_type = `INST_OP_BITS'(`VOTE_ALL);
end
3'b001: begin
op_type = `INST_OP_BITS'(`VOTE_ANY);
end
3'b010: begin
op_type = `INST_OP_BITS'(`VOTE_UNI);
end
3'b011: begin
op_type = `INST_OP_BITS'(`VOTE_BALLOT);
end
3'b100: begin
op_type = `INST_OP_BITS'(`VOTE_NONE);
end
3'b101: begin
op_type = `INST_OP_BITS'(`VOTE_NOT_ALL);
end
default:;
endcase
end
`INST_EXT3: begin
ex_type = `EX_ALU;
op_args.alu.use_imm = 1;
op_args.alu.imm = `SEXT(`IMM_BITS, u_12);
op_args.alu.xtype = `ALU_TYPE_OTHER;
op_args.alu.is_w = 1;
use_rd = 1;
`USED_IREG (rd);
`USED_IREG (rs1);
`USED_IREG (rs2); //membermask imm[24:20]
`USED_IREG (rs3); //c offset imm[31:28]
case (func3)
3'b000: begin
op_type = `INST_OP_BITS'(`SHFL_BFLY);
end
3'b001: begin
op_type = `INST_OP_BITS'(`SHFL_UP);
end
3'b010: begin
op_type = `INST_OP_BITS'(`SHFL_DOWN);
end
3'b011: begin
op_type = `INST_OP_BITS'(`SHFL_IDX);
end
default:;
endcase
end
default:;
endcase
end
Expand Down
2 changes: 1 addition & 1 deletion hw/rtl/core/VX_lsu_slice.sv
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
.NUM_LANES (NUM_LANES)
) commit_no_rsp_if();

`UNUSED_VAR (execute_if.data.rs3_data)
//`UNUSED_VAR (execute_if.data.rs3_data)
`UNUSED_VAR (execute_if.data.tid)

// full address calculation
Expand Down
Loading
Loading