-
Notifications
You must be signed in to change notification settings - Fork 1
/
2DTLM.cu
284 lines (240 loc) · 9.18 KB
/
2DTLM.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
/***********************************************************************************************************************
* 2D TLM for CUDA - Seámus Doran, Universiy of Nottingham 2020
*
* Simulates a network divided into N*N segments (nodes) of length dl
*
* Origin of line is matched to the source impedance i.e. no reflection from the left side of the source
*
* Line is excited at node Ein{ x, y } with a gaussian voltage
*
* Line is terminated with a short circuit to ground
*
* (results in an equal and opposite reflection at the end of the line)
***********************************************************************************************************************/
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <chrono>
#include <fstream>
#include <ctime> // for clock
#include <cmath>
#define c 299792458 // speed of light in a vacuum
#define mu0 M_PI*4e-7 // magnetic permeability in a vacuum H/m
#define eta0 c*mu0 // wave impedance in free space
using namespace std;
double** declare_array2D(int, int); // Population function
ofstream output("output.out"); // log probe voltage at a pint on the line versus time
/**
* A Struct containing pointers to Data stored on Device
*/
struct dev_data{
double* d_V1;
double* d_V2;
double* d_V3;
double* d_V4;
const double* coeff; /// A list of Coefficients, containing Z and boundary conditions
double* out; /// A list of voltages at the output node
const int* d_Ein; /// The input node
const int* d_Eout; /// The output node
};
/**
* A kernel to apply a Source Voltage to a supplied input node (Ein)
* @param dev pointers to device data
* @param source The source to apply
* @param N The size of Computational Domain
*/
__global__ void tlmApplySource( dev_data dev,double source, int N){
//Apply Source
auto tmp_idx = dev.d_Ein[0] + dev.d_Ein[1] * N;
dev.d_V1[tmp_idx] = dev.d_V1[tmp_idx] + source;
dev.d_V2[tmp_idx] = dev.d_V2[tmp_idx] - source;
dev.d_V3[tmp_idx] = dev.d_V3[tmp_idx] - source;
dev.d_V4[tmp_idx] = dev.d_V4[tmp_idx] + source;
}
/**
* A kernel to 'scatter' impulses based on a previously applied source
* @param dev pointers to device data
* @param N The size of Computational Domain
*/
__global__ void tlmScatter(dev_data dev, int N){
auto idx = blockIdx.x * blockDim.x + threadIdx.x;
auto idy = blockIdx.y * blockDim.y + threadIdx.y;
auto index = idx + idy * N;
//scatter
double Z = dev.coeff[0];
if ( index < N*N){
double I = (2 * dev.d_V1[index] + 2 * dev.d_V4[index] - 2 * dev.d_V2[index] - 2 * dev.d_V3[index]) / (4 * Z);
double V = I*Z;
dev.d_V1[index] = dev.d_V1[index] - V; //port1
dev.d_V2[index] = dev.d_V2[index] + V; //port2
dev.d_V3[index] = dev.d_V3[index] + V; //port3
dev.d_V4[index] = dev.d_V4[index] - V; //port4
}
}
/**
* A kernel to propagate scattered impulses and apply boundary conditions
* @param dev pointers to device data
* @param N The size of Computational Domain
*/
__global__ void tlmConnect(dev_data dev, int N){
auto idx = blockIdx.x*blockDim.x + threadIdx.x;
auto idy = blockIdx.y*blockDim.y + threadIdx.y;
auto index = idx + idy * N;
//Connect
if ( idx > 0 && index < N*N){
auto V = dev.d_V2[index];
dev.d_V2[index] = dev.d_V4[(idx - 1)+ idy * N];
dev.d_V4[(idx - 1) + idy * N] = V;
}
if ( idy > 0 && index < N*N){
auto V = dev.d_V1[index];
dev.d_V1[index] = dev.d_V3[idx + (idy - 1)*N];
dev.d_V3[idx + (idy - 1)*N] = V;
}
//Apply Boundaries
// rXmin = dev.coeff[1], rXmax = dev.coeff[2]
// rYmin = dev.coeff[3], rYmax = dev.coeff[4]
if (idy == N-1*N && idx < N){
dev.d_V3[idx + (N - 1)*N] = dev.coeff[4] * dev.d_V3[idx + (N - 1)*N];
dev.d_V1[idx] = dev.coeff[3] * dev.d_V1[idx];
}
if (idx == N-1 && idy < N) {
dev.d_V4[(N - 1) + idy*N] = dev.coeff[2] * dev.d_V4[(N - 1) + idy*N];
dev.d_V2[idy*N] = dev.coeff[1] * dev.d_V2[idy*N];
}
}
/**
*
* A kernel to evaluate the voltage at a supplied output node (Eout)
* @param dev pointers to device data
* @param n The current time-step index
* @param N The size of Computational Domain
*/
__global__ void tlmApplyProbe(dev_data dev, int n, int N){
auto tmp_idx = dev.d_Eout[0] + dev.d_Eout[1] * N;
dev.out[n] = dev.d_V2[tmp_idx] + dev.d_V4[tmp_idx];
}
int main(){
//Specify Simulation Meta Parameters
int NX = 100; // dim one of nodes
int NY = 100; // dim 2 of nodes
int NT = 8192; // number of time steps
double dl = 1; // set node line segment length in metres
double dt = dl / (sqrt(2.) * c); // set time step duration
//2D mesh variables
double** V1 = declare_array2D(NX, NY);
double** V2 = declare_array2D(NX, NY);
double** V3 = declare_array2D(NX, NY);
double** V4 = declare_array2D(NX, NY);
double v_output[NT];
for (int n = 0; n < NT; n++){
v_output[n] = 0;
}
//boundary coefficients
double rXmin = -1;
double rXmax = -1;
double rYmin = -1;
double rYmax = -1;
// specify mesh simulation parameters
double Z = eta0 / sqrt(2.);
double width = 20 * dt * sqrt(2.);
double delay = 100 * dt * sqrt(2.);
int Ein[] = { 10,10 };
int Eout[] = { 15,15 };
//Group Coefficients
double coeff[] = {Z, rXmin, rXmax, rYmin, rYmax};
//device arrays
double* dev_V1;
double* dev_V2;
double* dev_V3;
double* dev_V4;
double* dev_coeff;
double* dev_output;
int* dev_Ein;
int* dev_Eout;
//allocate memory on device
auto sz = NX * NY * sizeof(double);
cudaMalloc((void**)&dev_V1, sz);
cudaMalloc((void**)&dev_V2, sz);
cudaMalloc((void**)&dev_V3, sz);
cudaMalloc((void**)&dev_V4, sz);
cudaMalloc((void**)&dev_coeff, sizeof(double)*6);
cudaMalloc((void**)&dev_output, sizeof(double)*NT);
cudaMalloc((void**)&dev_Ein, sizeof(int)*2);
cudaMalloc((void**)&dev_Eout, sizeof(int)*2);
//copy memory areas from host to device
cudaMemcpy(dev_V1, V1, NX * NY * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dev_V2, V2, NX * NY * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dev_V3, V3, NX * NY * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dev_V4, V4, NX * NY * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(dev_coeff, coeff, sizeof(double)*6, cudaMemcpyHostToDevice);
cudaMemcpy(dev_Ein, Ein, sizeof(int)*2, cudaMemcpyHostToDevice);
cudaMemcpy(dev_Eout, Eout, sizeof(int)*2, cudaMemcpyHostToDevice);
//Group Device Variables to simplify Kernel Calls
dev_data dev_Data{dev_V1, dev_V2, dev_V3, dev_V4, dev_coeff, dev_output, dev_Ein, dev_Eout};
//Determine Kernel Size
dim3 dimBlock(10,10);
dim3 dimGrid(ceil(NX/dimBlock.x),ceil(NY/dimBlock.y));
//Start Timer
auto t1 = std::chrono::high_resolution_clock::now();
// Start of TLM algorithm
//
// loop over total time NT in steps of dt
for (int n = 0; n < NT; n++)
{
//Calculate V Source for this delta
double source = (1 / sqrt(2.)) * exp(-(n * dt - delay) * (n * dt - delay) / (width * width));
//Apply the newly calculated Source
tlmApplySource <<<1, 1>>> (dev_Data, source, NX);
//Apply Scatter Algorithm
tlmScatter <<<dimGrid, dimBlock>>> (dev_Data, NX);
//Apply Connect Algorithm (Including Boundaries)
tlmConnect <<<dimGrid, dimBlock>>> (dev_Data, NX);
//Get the Output from the mesh
tlmApplyProbe <<<1, 1>>> (dev_Data, n, NX);
}
//Get Result from Device
cudaMemcpy(v_output, dev_output, sizeof(double)*NT, cudaMemcpyDeviceToHost);
//Save output to file
for (int n = 0; n < NT; n++){
output << n * dt << " " << v_output[n] << endl;
}
//End Timer
auto t2 = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = t2-t1;
// End of TLM algorithm
//Close output file
output.close();
//Signify finished
cout << "Done";
//Calculate time / Clocks
std::cout << "\nExecuted in: " << (diff.count()) << "s \n";
cin.get();
// free memory allocated on the GPU
cudaFree(dev_V1);
cudaFree(dev_V2);
cudaFree(dev_V3);
cudaFree(dev_V4);
cudaFree(dev_output);
cudaFree(dev_coeff);
cudaFree(dev_Ein);
cudaFree(dev_Eout);
}
/**
* A function to fill 2D arrays with 0s
* @param NX The Array's X Dimension
* @param NY The Array's Y Dimension
* @return A 2D array of 0s
*/
double** declare_array2D(int NX, int NY) {
auto** V = new double* [NX];
for (int x = 0; x < NX; x++) {
V[x] = new double[NY];
}
for (int x = 0; x < NX; x++) {
for (int y = 0; y < NY; y++) {
V[x][y] = 0;
}
}
return V;
}