loop_cuda_chill.hh
#ifndef LOOP_CUDA_CHILL_HH
#define LOOP_CUDA_CHILL_HH
// this should be used if Loop has chillAST internally
// and not an AST from the front end compiler
#include "chill_ast.hh"
#include "chill_io.hh"
#include "loop.hh"
//#include "mem_mapping_utils.hh" // rose dependent
#include <string.h>
#include <code_gen/CG_chillRepr.h>
#include <code_gen/CG_chillBuilder.h>
#ifndef ENUMMEMORYMODE
#define ENUMMEMORYMODE
enum MemoryMode { GlobalMem, SharedMem, TexMem };
#endif
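// Illustrative sketch (not part of the original header): MemoryMode selects
// where cudaize places an array on the device. The helper below is
// hypothetical, shown only to make the intent of the three modes concrete.
#if 0
static const char *memoryModeName(MemoryMode m) {
  switch (m) {
  case GlobalMem: return "global";   // plain device pointer, passed as a kernel argument
  case SharedMem: return "shared";   // staged through __shared__ memory inside the kernel
  case TexMem:    return "texture";  // read through the texture path (see copy_to_texture below)
  }
  return "unknown";
}
#endif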
#ifndef CUDAVARDEFS
#define CUDAVARDEFS
struct VarDefs {
  std::string name;
  std::string secondName;
  char *type;
  chillAST_node *size_expr;  // array size as an expression (can be a product of other variables, etc.)
  //omega::chillRepr *size_expr; // ??
  chillAST_VarDecl *vardecl;
  chillAST_node *in_data;   // variable of array to copy data in from (before kernel call)
  chillAST_node *out_data;  // variable of array to copy data out to (after kernel call)
  chillAST_VarDecl *CPUside_param; // CPU-side parameter (see: in_data, out_data, when not NULL)
  std::vector<int> size_multi_dim; // -1 if linearized; otherwise the constant size N of an NxN 2D array
  bool tex_mapped;  // protonu-- true if this variable will be texture mapped, so no need to pass it as an argument
  bool cons_mapped;
  std::string original_name; // this is such a hack: stores the original name, to keep a table of the textures used

  VarDefs() { // constructor
    //debug_fprintf(stderr, "constructing VarDef\n");
    vardecl = NULL;
    in_data = out_data = NULL;
    CPUside_param = NULL;
    tex_mapped = cons_mapped = false;
  }

  void print() {
    debug_fprintf(stderr, "Vardefs:\n"); // 0x%x\n", this);
    debug_fprintf(stderr, "name %s\n", name.c_str());
    debug_fprintf(stderr, "second name %s\n", secondName.c_str());
    debug_fprintf(stderr, "original name %s\n", original_name.c_str());
    debug_fprintf(stderr, "type ");
    if (!type) debug_fprintf(stderr, "NULL\n");
    else debug_fprintf(stderr, "%s\n", type);
    debug_fprintf(stderr, "size ");
    if (size_expr) size_expr->print(0, stderr); // guard: size_expr may not be set yet
    else debug_fprintf(stderr, "NULL");
    if (vardecl) debug_fprintf(stderr, "\nvardecl %p\n", vardecl);
    else debug_fprintf(stderr, "\nvardecl NULL\n");
    //for (int i=0; i<size_multi_dim.size(); i++) {
    //  if (i) debug_fprintf(stderr, "x");
    //  debug_fprintf(stderr, "%d", size_multi_dim[i]);
    //}
    debug_fprintf(stderr, "\n");
    if (tex_mapped)  debug_fprintf(stderr, "tex mapped\n");
    if (cons_mapped) debug_fprintf(stderr, "cons mapped\n");
  }
};
#endif
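// Illustrative sketch (hypothetical values): how cudaize-style codegen might
// fill in a VarDefs record for a device copy of a host array "c", then dump
// it. size_expr would normally be a chillAST expression built from the loop
// bounds; it is left NULL here, which print() tolerates.
#if 0
VarDefs v;
v.name          = "devI1Ptr";      // device-side name
v.secondName    = "c";             // host array it mirrors
v.original_name = "c";
v.type          = strdup("float");
v.size_expr     = NULL;
v.tex_mapped    = false;           // pass as an ordinary kernel argument
v.print();
#endif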
chillAST_VarDecl *addBuiltin( char *nameofbuiltin, char *typeOfBuiltin, chillAST_node *somecode); // fwd decl
class LoopCuda : public Loop { // chill version
public:
  //std::vector<proc_sym*> new_procs; //Need adding to a fse
  std::vector< std::vector<std::string> > idxNames;
  std::vector< std::pair<int, std::string> > syncs;
  bool useIdxNames;
  std::vector<std::string> index;
  std::vector< std::set<int> > cudaized;

  // Anand: placeholder for variables that will be passed as parameters
  // to the CUDA kernel function; possibly modified by cudaize
  std::set<std::string> kernel_parameters;

  // typedef std::vector<chillAST_VarDecl *> symbolTable;
  chillAST_SymbolTable *CPUparamSymtab;
  chillAST_SymbolTable *CPUbodySymtab;

  void printsyms() {
    fflush(stdout);
    printf("\nparameter_symtab has %d entries\n", (int) CPUparamSymtab->size());
    printSymbolTable( CPUparamSymtab );
    printf("\n"); fflush(stdout);
    printf("body_symtab has %d entries\n", (int) CPUbodySymtab->size());
    printSymbolTable( CPUbodySymtab );
    printf("\n"); fflush(stdout);
  }
  // protonu--inserting this here; Gabe's implementation had it
  // in the statement struct as nonSplitLevels
  std::vector< std::vector<int> > stmt_nonSplitLevels;
#ifdef INTERNALS_ROSE
  texture_memory_mapping *texture;       // protonu-- depends on rose internals
  constant_memory_mapping *constant_mem; // depends on rose
#endif
  std::map<std::string, int> array_dims;
  std::vector< std::map<std::string, int> > Varray_dims;
  omega::CG_outputRepr *setup_code;
  omega::CG_outputRepr *teardown_code;

  unsigned int code_gen_flags;
  enum CodeGenFlags {
    GenInit      = 0x00,
    GenCudaizeV2 = 0x02,
  };
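// Illustrative sketch (hypothetical): code_gen_flags is a bit mask over
// CodeGenFlags, so the usual bitwise idioms apply. Note that GenInit is 0x00
// and therefore cannot be tested with '&'; it only names the initial state.
#if 0
loop.code_gen_flags |= LoopCuda::GenCudaizeV2;                     // request cudaize-v2 codegen
bool wantV2 = (loop.code_gen_flags & LoopCuda::GenCudaizeV2) != 0; // query it back
#endif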
  // variables used by cudaize_codegen
  // block x, y sizes, N and num_red
  int cu_bx, cu_by;
  int cu_n, cu_num_reduce;
  // thread x, y, z
  int cu_tx, cu_ty, cu_tz;

  // Anand: CG_outputRepr* representations of cu_bx, cu_by, cu_tx, cu_ty
  // and cu_tz, for non-constant loop bounds
  omega::CG_chillRepr *cu_bx_repr, *cu_by_repr, *cu_tx_repr, *cu_ty_repr, *cu_tz_repr;

  // currently, using int cu_[bt][xy] for constant, or CG_chillRepr *cu_[bt][xy]_repr for non-constant.
  // this leads to HORRIBLE code.
  // try using chillAST_node *[bt][xy]_ast instead
  //chillAST_node *bxAst, *byAst, *txAst, *tyAst;

  // Anand's cudaize needs vectors for cu_bx, cu_by, cu_tx, cu_ty, cu_tz
  std::vector<int> Vcu_bx, Vcu_by;
  std::vector<int> Vcu_tx, Vcu_ty, Vcu_tz;
  std::vector<omega::CG_outputRepr *> Vcu_bx_repr, Vcu_by_repr, Vcu_tx_repr, Vcu_ty_repr, Vcu_tz_repr;
  std::vector<chillAST_node *> VbxAst, VbyAst, VtxAst, VtyAst, VtzAst;

  // block statement and level
  int cu_block_stmt, cu_block_level;

  // Anand: map of block ids and thread ids per statement that is cudaized
  std::map<int, std::vector<int> > block_and_thread_levels;

  // tile statements, and loop-levels (cudaize v1)
  std::vector< std::vector<int> > cu_thread_loop;
  std::vector<int> cu_thread_sync;
  MemoryMode cu_mode;

  std::string cu_nx_name, cu_ny_name, cu_kernel_name; // TODO remove
  std::vector<std::string> Vcu_kernel_name;
  int nonDummyLevel(int stmt, int level);
  bool symbolExists(std::string s);
  void addSync(int stmt, std::string idx);
  void printSyncs();
  void renameIndex(int stmt, std::string idx, std::string newName);
  bool validIndexes(int stmt, const std::vector<std::string>& idxs);
  omega::CG_outputRepr* extractCudaUB(int stmt_num, int level, int &outUpperBound, int &outLowerBound);

  void printCode(int stmt_num, int effort = 3, bool actuallyPrint = true) const;
  void printRuntimeInfo() const;
  void printIndexes() const;
  chillAST_node* getCode(int effort = 3) const;
  chillAST_node* getCode(int effort, std::set<int> stmts) const;
  void printIS();

  void permute_cuda(int stmt, const std::vector<std::string>& curOrder);
  // protonu--writing a wrapper for Chun's new permute function
  bool permute(int stmt_num, const std::vector<int> &pi);
  // end--protonu.

  void tile_cuda(int stmt, int level, int outer_level, TilingMethodType method = CountedTile);
  //void tile_cuda(int stmt, int level, int outer_level);
  void tile_cuda(int level, int tile_size, int outer_level, std::string idxName, std::string ctrlName, TilingMethodType method = StridedTile);
  void tile_cuda(int stmt, int level, int tile_size, int outer_level, std::string idxName, std::string ctrlName, TilingMethodType method = StridedTile);
  bool datacopy_privatized_cuda(int stmt_num, int level, const std::string &array_name, const std::vector<int> &privatized_levels, bool allow_extra_read = false, int fastest_changing_dimension = -1, int padding_stride = 1, int padding_alignment = 1, bool cuda_shared = false);
  bool datacopy_cuda(int stmt_num, int level, const std::string &array_name, std::vector<std::string> new_idxs, bool allow_extra_read = false, int fastest_changing_dimension = -1, int padding_stride = 1, int padding_alignment = 4, bool cuda_shared = false);
  bool unroll_cuda(int stmt_num, int level, int unroll_amount);
  void flatten_cuda(int stmt_num, std::string idxs, std::vector<int> &loop_levels, std::string inspector_name);
  void ELLify_cuda(int stmt_num, std::vector<std::string> arrays_to_pad, int pad_to, bool dense_pad, std::string pos_array_name);
  void distribute_cuda(std::vector<int> &stmt_nums, int loop_level);
  void fuse_cuda(std::vector<int> &stmt_nums, int loop_level);
  void peel_cuda(int stmt_num, int level, int amount);
  void shift_to_cuda(int stmt_num, int level, int absolute_position);
  void scalar_expand_cuda(int stmt_num, std::vector<int> level, std::string arrName, int memory_type = 0, int padding = 0, int assign_then_accumulate = 1);
  void split_with_alignment_cuda(int stmt_num, int level, int alignment, int direction = 0);
  void compact_cuda(int stmt_num, int level, std::string new_array, int zero, std::string data_array);
  void make_dense_cuda(int stmt_num, int loop_level, std::string new_loop_index);
  void addKnown_cuda(std::string var, int value);
  void skew_cuda(std::vector<int> stmt_num, int level, std::vector<int> coefs);
  void reduce_cuda(int stmt_num, std::vector<int> level, int param, std::string func_name, std::vector<int> seq_level, int bound_level = -1);
  // protonu--using texture memory
  void copy_to_texture(const char *array_name);
  void copy_to_constant(const char *array_name);
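  // Illustrative sketch (hypothetical array names): these are typically
  // invoked from a transformation script before cudaize, e.g.
#if 0
  loop.copy_to_texture("x");      // read array x through texture memory
  loop.copy_to_constant("coeff"); // place array coeff in constant memory
#endif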
  int findCurLevel(int stmt, std::string idx);
  /**
   * @param kernel_name Name of the generated GPU kernel
   * @param nx Iteration space over the x dimension
   * @param ny Iteration space over the y dimension
   * @param tx Tile dimension over the x dimension
   * @param ty Tile dimension over the y dimension
   * @param num_reduce The number of dimensions to reduce by mapping to the GPU's implicit blocks/threads
   */
  // stmt_num is referenced from the perspective of being inside the cudaize block loops
  bool cudaize_v2(std::string kernel_name, std::map<std::string, int> array_dims,
                  std::vector<std::string> blockIdxs, std::vector<std::string> threadIdxs);
  bool cudaize_v3(int stmt_num, // 6 args, starting with stmt number
                  std::string kernel_name,
                  std::map<std::string, int> array_dims,
                  std::vector<std::string> blockIdxs,
                  std::vector<std::string> threadIdxs,
                  std::vector<std::string> kernel_params);
  chillAST_FunctionDecl *function_that_contains_this_loop;

  chillAST_node* cudaize_codegen_v2();
  chillAST_node* codegen();

  // protonu--have to add the constructors for the new class,
  // and maybe destructors (?)
  LoopCuda();
  //LoopCuda(IR_Code *ir, tree_for *tf, global_symtab* gsym);
  LoopCuda(IR_Control *ir_c, int loop_num);
  ~LoopCuda();
};
#endif