89 #include "cuKernels.cuh" 95 const real
IDPI = (real)0.15915494309189534;
112 maxpt = (realPoint*)cudaNew(blocks * FACTOR1,
sizeof(realPoint));
113 minpt = (realPoint*)cudaNew(blocks * FACTOR1,
sizeof(realPoint));
116 float calc(
int npoints,
const realVortex* pointsl)
119 time = McuBoundingBoxKernelFree(
nullptr, maxpt, minpt, npoints, pointsl);
143 : npoints_(npoints), pointsl_(pointsl)
145 pointsMortonCodesKeyl = (
int*)cudaNew(npoints,
sizeof(
int));
146 pointsMortonCodesKeyUnsortl = (
int*)cudaNew(npoints,
sizeof(
int));
148 pointsMortonCodesIdxl = (
int*)cudaNew(npoints,
sizeof(
int));
149 pointsMortonCodesIdxUnsortl = (
int*)cudaNew(npoints,
sizeof(
int));
154 float timeGab, timeCodes;
155 timeGab = gab.
calc(npoints_, pointsl_);
157 timeCodes = McuMortonCodesKernelFree(gab.
maxpt, gab.
minpt,
158 pointsMortonCodesKeyUnsortl, pointsMortonCodesIdxUnsortl,
159 pointsMortonCodesKeyl, pointsMortonCodesIdxl,
nullptr,
161 return timeGab + timeCodes;
166 cudaDelete(pointsMortonCodesKeyl);
167 cudaDelete(pointsMortonCodesKeyUnsortl);
168 cudaDelete(pointsMortonCodesIdxl);
169 cudaDelete(pointsMortonCodesIdxUnsortl);
174 void rebuildBaseTree(CUDApointers& ptrs,
const int nbodies,
const realVortex* vtxl,
int nnodes,
int order,
double* timing)
176 timing[0] += cuInitializationKernel();
177 timing[1] += McuBoundingBoxKernel(ptrs, nbodies, vtxl);
179 timing[2] += McuMortonCodesKernel(ptrs, nbodies, vtxl);
180 timing[2] += McuMortonInternalNodesKernel(ptrs, nbodies);
181 timing[2] += McuMortonInternalCellsGeometryKernel(ptrs, nbodies, nnodes);
183 timing[3] += cuClearKernel2(ptrs, order, nnodes, nbodies);
185 timing[4] += cuAABBKernel2(ptrs, nnodes, nbodies, vtxl);
187 timing[4] += cuClearKernel2(ptrs, order, nnodes, nbodies);
189 timing[4] += cuSummarizationKernel2(ptrs, order, nnodes, nbodies, vtxl);
195 double starttime, endtime;
196 starttime = omp_get_wtime();
201 cudaDelete(ptrs.massl);
203 cudaDelete(ptrs.momsl);
206 cudaDelete(ptrs.maxrl);
207 cudaDelete(ptrs.minrl);
210 cudaDelete(ptrs.MmortonCodesKeyUnsortl);
211 cudaDelete(ptrs.MmortonCodesIdxUnsortl);
212 cudaDelete(ptrs.MmortonCodesKeyl);
213 cudaDelete(ptrs.MmortonCodesIdxl);
215 cudaDelete(ptrs.Mposl);
216 cudaDelete(ptrs.Mlowerl);
217 cudaDelete(ptrs.Mupperl);
218 cudaDelete(ptrs.Mparentl);
219 cudaDelete(ptrs.Mchildl);
220 cudaDelete(ptrs.Mrangel);
222 cudaDelete(ptrs.MlevelUnsortl);
223 cudaDelete(ptrs.MlevelSortl);
224 cudaDelete(ptrs.MindexUnsortl);
225 cudaDelete(ptrs.MindexSortl);
226 cudaDelete(ptrs.MindexSortTl);
232 ptrs.massl = (
int*)cudaNew(nbodies - 1,
sizeof(
int));
235 ptrs.momsl = (realPoint*)cudaNew((nbodies - 1) * order,
sizeof(realPoint));
242 ptrs.maxrl = (realPoint*)cudaNew(blocks * FACTOR1,
sizeof(realPoint));
243 ptrs.minrl = (realPoint*)cudaNew(blocks * FACTOR1,
sizeof(realPoint));
247 ptrs.MmortonCodesKeyUnsortl = (
int*)cudaNew(nbodies,
sizeof(
int));
248 ptrs.MmortonCodesKeyl = (
int*)cudaNew(nbodies,
sizeof(
int));
249 ptrs.MmortonCodesIdxUnsortl = (
int*)cudaNew(nbodies,
sizeof(
int));
250 ptrs.MmortonCodesIdxl = (
int*)cudaNew(nbodies,
sizeof(
int));
253 ptrs.Mposl = (realPoint*)cudaNew(nbodies - 1,
sizeof(realPoint));
254 ptrs.Mlowerl = (realPoint*)cudaNew(nbodies - 1,
sizeof(realPoint));
255 ptrs.Mupperl = (realPoint*)cudaNew(nbodies - 1,
sizeof(realPoint));
258 ptrs.Mparentl = (
int*)cudaNew(nnodes,
sizeof(
int));
261 ptrs.Mchildl = (intPair*)cudaNew(nbodies - 1,
sizeof(intPair));
264 ptrs.Mrangel = (intPair*)cudaNew(nnodes,
sizeof(intPair));
267 ptrs.MlevelUnsortl = (
int*)cudaNew(nbodies - 1,
sizeof(
int));
268 ptrs.MlevelSortl = (
int*)cudaNew(nbodies - 1,
sizeof(
int));
269 ptrs.MindexUnsortl = (
int*)cudaNew(nbodies - 1,
sizeof(
int));
270 ptrs.MindexSortl = (
int*)cudaNew(nbodies - 1,
sizeof(
int));
271 ptrs.MindexSortTl = (
int*)cudaNew(nbodies - 1,
sizeof(
int));
274 endtime = omp_get_wtime();
275 return endtime - starttime;
281 real* epsastl, CUDApointers& ptrs,
282 int nbodies,
double* timing, real eps, real theta,
283 size_t& nbodiesOld,
int nbodiesUp,
285 size_t nAfls,
size_t* nVtxs,
double** ptrVtxs)
287 double starttime, endtime;
288 starttime = omp_get_wtime();
294 int nnodes, nnodesUp;
297 real epssq = (real)(eps * eps);
298 real itolsq = (real)(1 / (theta * theta));
303 nnodes = nbodies * 2;
304 if (nnodes < 1024 * blocks)
306 while ((nnodes & (32 - 1)) != 0)
311 nnodesUp = nbodiesUp * 2;
312 if (nnodesUp < 1024 * blocks)
314 while ((nnodesUp & (32 - 1)) != 0)
318 KernelsOptimization();
320 for (
int i = 0; i < 6; i++)
323 if (nbodiesUp > nbodiesOld)
324 timing[1] +=
memoryAllocate(ptrs, nnodesUp, nbodiesUp, (
int)nbodiesOld, blocks, order);
326 nbodiesOld = nbodiesUp;
329 timing[5] += cuForceCalculationKernel2points(ptrs, order, nnodes, nbodies, itolsq, epssq, vtxl,
330 ptrs.MmortonCodesIdxl, nbodies, vtxl, vell,
true, epsastl, nAfls, nVtxs, ptrVtxs);
331 timing[6] = timing[1] + timing[2] + timing[3] + timing[4] + timing[5];
333 endtime = omp_get_wtime();
334 return endtime - starttime;
341 const realVortex* vtxl,
const realVortex* pointsl, realPoint* vell, real* epsastl,
342 CUDApointers& ptrs,
bool rebuild,
int nbodies,
int npoints,
double* timing, real eps, real theta,
343 size_t& nbodiesOld,
int nbodiesUp,
int order,
344 size_t nAfls,
size_t* nVtxs,
double** ptrVtxs)
346 double starttime, endtime;
347 starttime = omp_get_wtime();
353 int nnodes, nnodesUp;
356 real epssq = (real)(eps * eps);
357 real itolsq = (real)(1 / (theta * theta));
362 nnodes = nbodies * 2;
363 if (nnodes < 1024 * blocks)
365 while ((nnodes & (32 - 1)) != 0)
371 nnodesUp = nbodiesUp * 2;
372 if (nnodesUp < 1024 * blocks)
374 while ((nnodesUp & (32 - 1)) != 0)
379 KernelsOptimization();
382 for (
int i = 0; i < 6; i++)
387 if (nbodiesUp > nbodiesOld)
388 timing[1] +=
memoryAllocate(ptrs, nnodesUp, nbodiesUp, (
int)nbodiesOld, blocks, order);
390 nbodiesOld = nbodiesUp;
395 timing[5] += srt.
calc();
397 timing[5] += cuForceCalculationKernel2points(ptrs, order, nnodes, nbodies, itolsq, epssq, vtxl, srt.
pointsMortonCodesIdxl, npoints, pointsl, vell,
true, epsastl,
398 nAfls, nVtxs, ptrVtxs);
401 timing[6] = timing[1] + timing[2] + timing[3] + timing[4] + timing[5];
403 endtime = omp_get_wtime();
404 return endtime - starttime;
411 const realVortex* dev_ptr_vt,
412 const double* dev_ptr_pt,
414 double* dev_ptr_rhslin,
421 double* timingsToRHS,
423 size_t& nbodiesOld,
int nbodiesUp,
int order,
int scheme)
425 double starttime, endtime;
426 starttime = omp_get_wtime();
432 int nnodes, nnodesUp;
435 real itolsq = (real)(1 / (theta * theta));
441 if (nnodes < 1024 * blocks)
443 while ((nnodes & (32 - 1)) != 0)
449 nnodesUp = nbodiesUp * 2;
450 if (nnodesUp < 1024 * blocks)
452 while ((nnodesUp & (32 - 1)) != 0)
457 KernelsOptimization();
459 for (
int i = 0; i < 6; i++)
464 if (nbodiesUp > nbodiesOld)
465 timingsToRHS[1] +=
memoryAllocate(ptrs, nnodesUp, nbodiesUp, (
int)nbodiesOld, blocks, order);
467 nbodiesOld = nbodiesUp;
471 Vortex2D* pointsl = (Vortex2D*)cudaNew(nTotPan,
sizeof(Vortex2D));
472 realPoint* El = (realPoint*)cudaNew(nTotPan * order,
sizeof(realPoint));
474 McuVerticesToControlPoints(nTotPan, (
double*)dev_ptr_pt, (
double*)pointsl);
477 timingsToRHS[5] += srt.
calc();
479 double* ptrToLin =
nullptr;
481 ptrToLin = dev_ptr_rhslin;
483 timingsToRHS[5] += cuRhsCalculationKernel(ptrs, order, nnodes, nvt, itolsq, dev_ptr_vt,
485 nTotPan, dev_ptr_pt, (
const real*)pointsl, dev_ptr_rhs, ptrToLin);
490 timingsToRHS[6] = timingsToRHS[1] + timingsToRHS[2] + timingsToRHS[3] + timingsToRHS[4] + timingsToRHS[5];
492 endtime = omp_get_wtime();
493 return endtime - starttime;
497 double wrapperDiffusiveVelo(
const realVortex* vtxl, real* i1l, realPoint* i2l, real* epsastl, CUDApointers& ptrs,
bool rebuild,
int nbodies,
double* timing, real eps, real theta,
size_t& nbodiesOld,
int nbodiesUp,
int order,
498 size_t nAfls,
size_t* nVtxs,
double** ptrVtxs)
500 double starttime, endtime;
501 starttime = omp_get_wtime();
507 int nnodes, nnodesUp;
510 real epssq = (real)(eps * eps);
511 real itolsq = (real)(1 / (theta * theta));
516 nnodes = nbodies * 2;
517 if (nnodes < 1024 * blocks)
519 while ((nnodes & (32 - 1)) != 0)
525 nnodesUp = nbodiesUp * 2;
526 if (nnodesUp < 1024 * blocks)
528 while ((nnodesUp & (32 - 1)) != 0)
533 KernelsOptimization();
535 for (
int i = 0; i < 6; i++)
540 if (nbodiesUp > nbodiesOld)
541 timing[1] +=
memoryAllocate(ptrs, nnodesUp, nbodiesUp, (
int)nbodiesOld, blocks, order);
543 nbodiesOld = nbodiesUp;
547 timing[5] += cuDiffVelCalculationKernel2(ptrs, order, nnodes, nbodies, itolsq, epssq, vtxl, i1l, i2l,
true, epsastl, nAfls, nVtxs, ptrVtxs);
548 timing[6] = timing[1] + timing[2] + timing[3] + timing[4] + timing[5];
550 endtime = omp_get_wtime();
551 return endtime - starttime;
int * pointsMortonCodesIdxUnsortl
double wrapperInfluence(const realVortex *vtxl, realPoint *vell, real *epsastl, CUDApointers &ptrs, int nbodies, double *timing, real eps, real theta, size_t &nbodiesOld, int nbodiesUp, int order, size_t nAfls, size_t *nVtxs, double **ptrVtxs)
int * pointsMortonCodesIdxl
double wrapperDiffusiveVelo(const realVortex *vtxl, real *i1l, realPoint *i2l, real *epsastl, CUDApointers &ptrs, bool rebuild, int nbodies, double *timing, real eps, real theta, size_t &nbodiesOld, int nbodiesUp, int order, size_t nAfls, size_t *nVtxs, double **ptrVtxs)
int * pointsMortonCodesKeyUnsortl
double wrapperInfluenceToRHS(const realVortex *dev_ptr_vt, const double *dev_ptr_pt, double *dev_ptr_rhs, double *dev_ptr_rhslin, CUDApointers &ptrs, bool rebuild, int nvt, int nTotPan, double *timingsToRHS, double theta, size_t &nbodiesOld, int nbodiesUp, int order, int scheme)
void rebuildBaseTree(CUDApointers &ptrs, const int nbodies, const realVortex *vtxl, int nnodes, int order, double *timing)
float calc(int npoints, const realVortex *pointsl)
const realVortex * pointsl_
CudaSorter(int npoints, const realVortex *pointsl)
int * pointsMortonCodesKeyl
double wrapperInfluenceToPoints(const realVortex *vtxl, const realVortex *pointsl, realPoint *vell, real *epsastl, CUDApointers &ptrs, bool rebuild, int nbodies, int npoints, double *timing, real eps, real theta, size_t &nbodiesOld, int nbodiesUp, int order, size_t nAfls, size_t *nVtxs, double **ptrVtxs)
npoints -
double memoryAllocate(CUDApointers &ptrs, int nnodes, int nbodies, int nbodiesOld, int blocks, int order)