VM2D  1.12
Vortex methods for 2D flows simulation
BHcu Namespace Reference

Classes

struct  CudaCalcGab
 
struct  CudaSorter
 

Functions

void rebuildBaseTree (CUDApointers &ptrs, const int nbodies, const realVortex *vtxl, int nnodes, int order, double *timing)
 
double memoryAllocate (CUDApointers &ptrs, int nnodes, int nbodies, int nbodiesOld, int blocks, int order)
 
double wrapperInfluence (const realVortex *vtxl, realPoint *vell, real *epsastl, CUDApointers &ptrs, int nbodies, double *timing, real eps, real theta, size_t &nbodiesOld, int nbodiesUp, int order, size_t nAfls, size_t *nVtxs, double **ptrVtxs)
 
double wrapperInfluenceToPoints (const realVortex *vtxl, const realVortex *pointsl, realPoint *vell, real *epsastl, CUDApointers &ptrs, bool rebuild, int nbodies, int npoints, double *timing, real eps, real theta, size_t &nbodiesOld, int nbodiesUp, int order, size_t nAfls, size_t *nVtxs, double **ptrVtxs)
 npoints - number of points in pointsl. More...
 
double wrapperInfluenceToRHS (const realVortex *dev_ptr_vt, const double *dev_ptr_pt, double *dev_ptr_rhs, double *dev_ptr_rhslin, CUDApointers &ptrs, bool rebuild, int nvt, int nTotPan, double *timingsToRHS, double theta, size_t &nbodiesOld, int nbodiesUp, int order, int scheme)
 
double wrapperDiffusiveVelo (const realVortex *vtxl, real *i1l, realPoint *i2l, real *epsastl, CUDApointers &ptrs, bool rebuild, int nbodies, double *timing, real eps, real theta, size_t &nbodiesOld, int nbodiesUp, int order, size_t nAfls, size_t *nVtxs, double **ptrVtxs)
 

Variables

const real IDPI = (real)0.15915494309189534
 

Function Documentation

double BHcu::memoryAllocate ( CUDApointers &  ptrs,
int  nnodes,
int  nbodies,
int  nbodiesOld,
int  blocks,
int  order 
)

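Frees the GPU buffers referenced by ptrs (only when nbodiesOld > 0) and then allocates them again for the given nbodies / nnodes / order, including the buffers for the Morton tree. Returns the elapsed wall-clock time.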

Definition at line 193 of file wrapper.cpp.

194  { // Frees the buffers in ptrs (when nbodiesOld > 0), allocates new ones, returns the elapsed wall-clock time
195  double starttime, endtime;
196  starttime = omp_get_wtime();
197 
198  if (nbodiesOld > 0) // buffers from an earlier call exist: release every one of them first
199  {
200  //std::cout << "BHgpu: free CUDA-memory" << std::endl;
201  cudaDelete(ptrs.massl);
202 
203  cudaDelete(ptrs.momsl);
204  cudaDelete(ptrs.El); // El is set to nullptr (not allocated) further down in this function
205 
206  cudaDelete(ptrs.maxrl);
207  cudaDelete(ptrs.minrl);
208 
210  cudaDelete(ptrs.MmortonCodesKeyUnsortl); // Morton-tree buffers start here
211  cudaDelete(ptrs.MmortonCodesIdxUnsortl);
212  cudaDelete(ptrs.MmortonCodesKeyl);
213  cudaDelete(ptrs.MmortonCodesIdxl);
214 
215  cudaDelete(ptrs.Mposl);
216  cudaDelete(ptrs.Mlowerl);
217  cudaDelete(ptrs.Mupperl);
218  cudaDelete(ptrs.Mparentl);
219  cudaDelete(ptrs.Mchildl);
220  cudaDelete(ptrs.Mrangel);
221 
222  cudaDelete(ptrs.MlevelUnsortl);
223  cudaDelete(ptrs.MlevelSortl);
224  cudaDelete(ptrs.MindexUnsortl);
225  cudaDelete(ptrs.MindexSortl);
226  cudaDelete(ptrs.MindexSortTl);
227  }
228 
229  //std::cout << "BHgpu: allocation GPU-memory: nbodies = " << nbodies << ", nnodes = " << nnodes << ", order = " << order << std::endl;
230 
231  //unsigned long long int mem = 0;
232  ptrs.massl = (int*)cudaNew(nbodies - 1, sizeof(int)); // NOTE(review): many buffers are sized nbodies - 1; presumably nbodies >= 2 is expected - confirm
233  //mem += (nbodies - 1) * sizeof(int);
234 
235  ptrs.momsl = (realPoint*)cudaNew((nbodies - 1) * order, sizeof(realPoint)); // order entries for each of the nbodies - 1 elements
236 
237  //printf("ALLOCATED for MOMS = %d bytes for %d bodies, order = %d, sizeof = %d\n", int((nbodies - 1) * order * sizeof(realPoint)), nbodies - 1, order, sizeof(realPoint));
238 
239  ptrs.El = nullptr; // no buffer is allocated for El here
240  //mem += (nbodies - 1) * order * sizeof(realPoint);
241 
242  ptrs.maxrl = (realPoint*)cudaNew(blocks * FACTOR1, sizeof(realPoint)); // sized by blocks * FACTOR1, not by nbodies
243  ptrs.minrl = (realPoint*)cudaNew(blocks * FACTOR1, sizeof(realPoint));
244  //mem += 2 * blocks * FACTOR1 * sizeof(realPoint);
245 
247  ptrs.MmortonCodesKeyUnsortl = (int*)cudaNew(nbodies, sizeof(int)); // Morton-tree buffers start here; these four hold nbodies ints each
248  ptrs.MmortonCodesKeyl = (int*)cudaNew(nbodies, sizeof(int));
249  ptrs.MmortonCodesIdxUnsortl = (int*)cudaNew(nbodies, sizeof(int));
250  ptrs.MmortonCodesIdxl = (int*)cudaNew(nbodies, sizeof(int));
251  //mem += 4 * nbodies * sizeof(int);
252 
253  ptrs.Mposl = (realPoint*)cudaNew(nbodies - 1, sizeof(realPoint));
254  ptrs.Mlowerl = (realPoint*)cudaNew(nbodies - 1, sizeof(realPoint));
255  ptrs.Mupperl = (realPoint*)cudaNew(nbodies - 1, sizeof(realPoint));
256  //mem += 3 * (nbodies - 1) * sizeof(realPoint);
257 
258  ptrs.Mparentl = (int*)cudaNew(nnodes, sizeof(int)); // sized by nnodes
259  //mem += nnodes * sizeof(int);
260 
261  ptrs.Mchildl = (intPair*)cudaNew(nbodies - 1, sizeof(intPair));
262  //mem += (nbodies - 1) * sizeof(intPair);
263 
264  ptrs.Mrangel = (intPair*)cudaNew(nnodes, sizeof(intPair)); //Is this needed for all of them?
265  //mem += nnodes * sizeof(intPair);
266 
267  ptrs.MlevelUnsortl = (int*)cudaNew(nbodies - 1, sizeof(int));
268  ptrs.MlevelSortl = (int*)cudaNew(nbodies - 1, sizeof(int));
269  ptrs.MindexUnsortl = (int*)cudaNew(nbodies - 1, sizeof(int));
270  ptrs.MindexSortl = (int*)cudaNew(nbodies - 1, sizeof(int));
271  ptrs.MindexSortTl = (int*)cudaNew(nbodies - 1, sizeof(int));
272  //mem += 5 * (nbodies - 1) * sizeof(int);
273 
274  endtime = omp_get_wtime();
275  return endtime - starttime; // elapsed time between the two omp_get_wtime() readings
276  }

Here is the caller graph for this function:

void BHcu::rebuildBaseTree ( CUDApointers &  ptrs,
const int  nbodies,
const realVortex *  vtxl,
int  nnodes,
int  order,
double *  timing 
)

Definition at line 174 of file wrapper.cpp.

175  { // Runs the tree-building steps in order; the value returned by each step is accumulated into a slot of timing
176  timing[0] += cuInitializationKernel();
177  timing[1] += McuBoundingBoxKernel(ptrs, nbodies, vtxl);
178 
179  timing[2] += McuMortonCodesKernel(ptrs, nbodies, vtxl); // the three Morton steps all accumulate into timing[2]
180  timing[2] += McuMortonInternalNodesKernel(ptrs, nbodies);
181  timing[2] += McuMortonInternalCellsGeometryKernel(ptrs, nbodies, nnodes);
182 
183  timing[3] += cuClearKernel2(ptrs, order, nnodes, nbodies);
184 
185  timing[4] += cuAABBKernel2(ptrs, nnodes, nbodies, vtxl);
186 
187  timing[4] += cuClearKernel2(ptrs, order, nnodes, nbodies); // cuClearKernel2 is called a second time; this call is accounted in timing[4]
188 
189  timing[4] += cuSummarizationKernel2(ptrs, order, nnodes, nbodies, vtxl);
190  }

Here is the caller graph for this function:

double BHcu::wrapperDiffusiveVelo ( const realVortex *  vtxl,
real *  i1l,
realPoint *  i2l,
real *  epsastl,
CUDApointers &  ptrs,
bool  rebuild,
int  nbodies,
double *  timing,
real  eps,
real  theta,
size_t &  nbodiesOld,
int  nbodiesUp,
int  order,
size_t  nAfls,
size_t *  nVtxs,
double **  ptrVtxs 
)

Definition at line 497 of file wrapper.cpp.

499  { // Optionally rebuilds the tree, runs cuDiffVelCalculationKernel2, fills timing[0..6], returns the elapsed wall-clock time
500  double starttime, endtime;
501  starttime = omp_get_wtime();
502 
503  //Number of multiprocessors, filled in by setBlocks(blocks)
504  int blocks;
505 
506  //Number of tree cells and bodies
507  int nnodes, nnodesUp;
508 
509  //Vortex radius and proximity parameter, and their squares
510  real epssq = (real)(eps * eps);
511  real itolsq = (real)(1 / (theta * theta)); // inverse of theta squared
512 
513  CudaSelect(0);
514  setBlocks(blocks); //fetches the number of blocks, equal to the number of multiprocessors (blocks is passed by reference)
515 
516  nnodes = nbodies * 2;
517  if (nnodes < 1024 * blocks) // never fewer than 1024 * blocks
518  nnodes = 1024 * blocks;
519  while ((nnodes & (32 - 1)) != 0) // 32 is the warp size; round up to a multiple of it
520  nnodes++;
521  nnodes--; // one less than a multiple of 32
522 
523  if (rebuild) // nnodesUp is computed, and later used, only when rebuild is set
524  {
525  nnodesUp = nbodiesUp * 2; // same sizing rule as nnodes, applied to nbodiesUp
526  if (nnodesUp < 1024 * blocks)
527  nnodesUp = 1024 * blocks;
528  while ((nnodesUp & (32 - 1)) != 0) // 32 is the warp size
529  nnodesUp++;
530  nnodesUp--;
531  }
532 
533  KernelsOptimization();
534 
535  for (int i = 0; i < 6; i++) // clears timing[0..5]; timing[6] is assigned below
536  timing[i] = 0;
537 
538  if (rebuild)
539  {
540  if (nbodiesUp > nbodiesOld) // NOTE(review): int compared with size_t; presumably nbodiesUp is never negative - confirm
541  timing[1] += memoryAllocate(ptrs, nnodesUp, nbodiesUp, (int)nbodiesOld, blocks, order);
542 
543  nbodiesOld = nbodiesUp; // written back to the caller even when no reallocation happened
544  rebuildBaseTree(ptrs, nbodies, vtxl, nnodes, order, timing);
545  }
546 
547  timing[5] += cuDiffVelCalculationKernel2(ptrs, order, nnodes, nbodies, itolsq, epssq, vtxl, i1l, i2l, true, epsastl, nAfls, nVtxs, ptrVtxs);
548  timing[6] = timing[1] + timing[2] + timing[3] + timing[4] + timing[5]; // sum of slots 1..5; timing[0] is not included
549 
550  endtime = omp_get_wtime();
551  return endtime - starttime;
552  }
void rebuildBaseTree(CUDApointers &ptrs, const int nbodies, const realVortex *vtxl, int nnodes, int order, double *timing)
Definition: wrapper.cpp:174
double memoryAllocate(CUDApointers &ptrs, int nnodes, int nbodies, int nbodiesOld, int blocks, int order)
Definition: wrapper.cpp:193

Here is the call graph for this function:

Here is the caller graph for this function:

double BHcu::wrapperInfluence ( const realVortex *  vtxl,
realPoint *  vell,
real *  epsastl,
CUDApointers &  ptrs,
int  nbodies,
double *  timing,
real  eps,
real  theta,
size_t &  nbodiesOld,
int  nbodiesUp,
int  order,
size_t  nAfls,
size_t *  nVtxs,
double **  ptrVtxs 
)

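Parameters
vtxl - array of vortices, on the GPU
vell - result array, on the GPU
epsastl - eps* array, on the GPU
ptrs - structure holding the pointers to the tree buffers; the buffers are reallocated by memoryAllocate when nbodiesUp > nbodiesOld
nbodies - number of bodies passed to the tree-building and calculation kernels together with vtxl
timing - 7-element array of timings; entries 0..5 are reset at the start of the call and timing[6] is set to timing[1] + ... + timing[5]
eps - vortex radius eps (its square is passed to the calculation kernel)
theta - proximity parameter theta (1 / theta^2 is passed to the calculation kernel)
nbodiesOld - number of bodies the buffers were allocated for before the call; set to nbodiesUp by the call
nbodiesUp - number of bodies used for sizing the buffers when they are (re)allocated
order - order; the moments buffer holds order entries per element
nAfls, nVtxs, ptrVtxs - forwarded unchanged to the calculation kernel

Returns
elapsed wall-clock time of the call, measured with omp_get_wtime()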

Definition at line 280 of file wrapper.cpp.

286  { // Rebuilds the tree on every call, runs cuForceCalculationKernel2points on the vortices themselves, fills timing[0..6], returns the elapsed wall-clock time
287  double starttime, endtime;
288  starttime = omp_get_wtime();
289 
290  //Number of multiprocessors, filled in by setBlocks(blocks)
291  int blocks;
292 
293  //Number of tree cells and bodies
294  int nnodes, nnodesUp;
295 
296  //Vortex radius and proximity parameter, and their squares
297  real epssq = (real)(eps * eps);
298  real itolsq = (real)(1 / (theta * theta)); // inverse of theta squared
299 
300  CudaSelect(0);
301  setBlocks(blocks); //fetches the number of blocks, equal to the number of multiprocessors (blocks is passed by reference)
302 
303  nnodes = nbodies * 2;
304  if (nnodes < 1024 * blocks) // never fewer than 1024 * blocks
305  nnodes = 1024 * blocks;
306  while ((nnodes & (32 - 1)) != 0) // 32 is the warp size; round up to a multiple of it
307  nnodes++;
308  nnodes--; // one less than a multiple of 32
309 
310 
311  nnodesUp = nbodiesUp * 2; // same sizing rule as nnodes, applied to nbodiesUp
312  if (nnodesUp < 1024 * blocks)
313  nnodesUp = 1024 * blocks;
314  while ((nnodesUp & (32 - 1)) != 0) // 32 is the warp size
315  nnodesUp++;
316  nnodesUp--;
317 
318  KernelsOptimization();
319 
320  for (int i = 0; i < 6; i++) // clears timing[0..5]; timing[6] is assigned below
321  timing[i] = 0;
322 
323  if (nbodiesUp > nbodiesOld) // NOTE(review): int compared with size_t; presumably nbodiesUp is never negative - confirm
324  timing[1] += memoryAllocate(ptrs, nnodesUp, nbodiesUp, (int)nbodiesOld, blocks, order);
325 
326  nbodiesOld = nbodiesUp; // written back to the caller even when no reallocation happened
327  rebuildBaseTree(ptrs, nbodies, vtxl, nnodes, order, timing);
328 
329  timing[5] += cuForceCalculationKernel2points(ptrs, order, nnodes, nbodies, itolsq, epssq, vtxl,
330  ptrs.MmortonCodesIdxl, nbodies, vtxl, vell, true, epsastl, nAfls, nVtxs, ptrVtxs); // vtxl/nbodies are passed a second time as the points argument, with ptrs.MmortonCodesIdxl as their index array
331  timing[6] = timing[1] + timing[2] + timing[3] + timing[4] + timing[5]; // sum of slots 1..5; timing[0] is not included
332 
333  endtime = omp_get_wtime();
334  return endtime - starttime;
335 
336  }
void rebuildBaseTree(CUDApointers &ptrs, const int nbodies, const realVortex *vtxl, int nnodes, int order, double *timing)
Definition: wrapper.cpp:174
double memoryAllocate(CUDApointers &ptrs, int nnodes, int nbodies, int nbodiesOld, int blocks, int order)
Definition: wrapper.cpp:193

Here is the call graph for this function:

Here is the caller graph for this function:

double BHcu::wrapperInfluenceToPoints ( const realVortex *  vtxl,
const realVortex *  pointsl,
realPoint *  vell,
real *  epsastl,
CUDApointers &  ptrs,
bool  rebuild,
int  nbodies,
int  npoints,
double *  timing,
real  eps,
real  theta,
size_t &  nbodiesOld,
int  nbodiesUp,
int  order,
size_t  nAfls,
size_t *  nVtxs,
double **  ptrVtxs 
)

npoints - number of points in pointsl

Definition at line 340 of file wrapper.cpp.

345  { // Optionally rebuilds the tree, sorts the points with CudaSorter, runs cuForceCalculationKernel2points for them, fills timing[0..6], returns the elapsed wall-clock time
346  double starttime, endtime;
347  starttime = omp_get_wtime();
348 
349  //Number of multiprocessors, filled in by setBlocks(blocks)
350  int blocks;
351 
352  //Number of tree cells and bodies
353  int nnodes, nnodesUp;
354 
355  //Vortex radius and proximity parameter, and their squares
356  real epssq = (real)(eps * eps);
357  real itolsq = (real)(1 / (theta * theta)); // inverse of theta squared
358 
359  CudaSelect(0);
360  setBlocks(blocks); //fetches the number of blocks, equal to the number of multiprocessors (blocks is passed by reference)
361 
362  nnodes = nbodies * 2;
363  if (nnodes < 1024 * blocks) // never fewer than 1024 * blocks
364  nnodes = 1024 * blocks;
365  while ((nnodes & (32 - 1)) != 0) // 32 is the warp size; round up to a multiple of it
366  nnodes++;
367  nnodes--; // one less than a multiple of 32
368 
369  if (rebuild) // nnodesUp is computed, and later used, only when rebuild is set
370  {
371  nnodesUp = nbodiesUp * 2; // same sizing rule as nnodes, applied to nbodiesUp
372  if (nnodesUp < 1024 * blocks)
373  nnodesUp = 1024 * blocks;
374  while ((nnodesUp & (32 - 1)) != 0) // 32 is the warp size
375  nnodesUp++;
376  nnodesUp--;
377  }
378 
379  KernelsOptimization();
380 
381 
382  for (int i = 0; i < 6; i++) // clears timing[0..5]; timing[6] is assigned below
383  timing[i] = 0;
384 
385  if (rebuild)
386  {
387  if (nbodiesUp > nbodiesOld) // NOTE(review): int compared with size_t; presumably nbodiesUp is never negative - confirm
388  timing[1] += memoryAllocate(ptrs, nnodesUp, nbodiesUp, (int)nbodiesOld, blocks, order);
389 
390  nbodiesOld = nbodiesUp; // written back to the caller even when no reallocation happened
391  rebuildBaseTree(ptrs, nbodies, vtxl, nnodes, order, timing);
392  }
393 
394  CudaSorter srt(npoints, pointsl);
395  timing[5] += srt.calc(); // the sorter's time is counted in the same slot as the calculation kernel
396 
397  timing[5] += cuForceCalculationKernel2points(ptrs, order, nnodes, nbodies, itolsq, epssq, vtxl, srt.pointsMortonCodesIdxl, npoints, pointsl, vell, true, epsastl,
398  nAfls, nVtxs, ptrVtxs);
399 
400 
401  timing[6] = timing[1] + timing[2] + timing[3] + timing[4] + timing[5]; // sum of slots 1..5; timing[0] is not included
402 
403  endtime = omp_get_wtime();
404  return endtime - starttime;
405 
406  }
void rebuildBaseTree(CUDApointers &ptrs, const int nbodies, const realVortex *vtxl, int nnodes, int order, double *timing)
Definition: wrapper.cpp:174
double memoryAllocate(CUDApointers &ptrs, int nnodes, int nbodies, int nbodiesOld, int blocks, int order)
Definition: wrapper.cpp:193

Here is the call graph for this function:

double BHcu::wrapperInfluenceToRHS ( const realVortex *  dev_ptr_vt,
const double *  dev_ptr_pt,
double *  dev_ptr_rhs,
double *  dev_ptr_rhslin,
CUDApointers &  ptrs,
bool  rebuild,
int  nvt,
int  nTotPan,
double *  timingsToRHS,
double  theta,
size_t &  nbodiesOld,
int  nbodiesUp,
int  order,
int  scheme 
)

Definition at line 410 of file wrapper.cpp.

424  { // Optionally rebuilds the tree, builds and sorts control points, runs cuRhsCalculationKernel, fills timingsToRHS[0..6], returns the elapsed wall-clock time
425  double starttime, endtime;
426  starttime = omp_get_wtime();
427 
428  //Number of multiprocessors, filled in by setBlocks(blocks)
429  int blocks;
430 
431  //Number of tree cells and bodies
432  int nnodes, nnodesUp;
433 
434  //Proximity parameter: only 1 / theta^2 is needed here (no vortex radius in this function)
435  real itolsq = (real)(1 / (theta * theta));
436 
437  CudaSelect(0);
438  setBlocks(blocks); //fetches the number of blocks, equal to the number of multiprocessors (blocks is passed by reference)
439 
440  nnodes = nvt * 2;
441  if (nnodes < 1024 * blocks) // never fewer than 1024 * blocks
442  nnodes = 1024 * blocks;
443  while ((nnodes & (32 - 1)) != 0) // 32 is the warp size; round up to a multiple of it
444  nnodes++;
445  nnodes--; // one less than a multiple of 32
446 
447  if (rebuild) // nnodesUp is computed, and later used, only when rebuild is set
448  {
449  nnodesUp = nbodiesUp * 2; // same sizing rule as nnodes, applied to nbodiesUp
450  if (nnodesUp < 1024 * blocks)
451  nnodesUp = 1024 * blocks;
452  while ((nnodesUp & (32 - 1)) != 0) // 32 is the warp size
453  nnodesUp++;
454  nnodesUp--;
455  }
456 
457  KernelsOptimization();
458 
459  for (int i = 0; i < 6; i++) // clears timingsToRHS[0..5]; timingsToRHS[6] is assigned below
460  timingsToRHS[i] = 0;
461 
462  if (rebuild)
463  {
464  if (nbodiesUp > nbodiesOld) // NOTE(review): int compared with size_t; presumably nbodiesUp is never negative - confirm
465  timingsToRHS[1] += memoryAllocate(ptrs, nnodesUp, nbodiesUp, (int)nbodiesOld, blocks, order);
466 
467  nbodiesOld = nbodiesUp; // written back to the caller even when no reallocation happened
468  rebuildBaseTree(ptrs, nvt, dev_ptr_vt, nnodes, order, timingsToRHS);
469  }
470 
471  Vortex2D* pointsl = (Vortex2D*)cudaNew(nTotPan, sizeof(Vortex2D)); // scratch buffers for this call only; both are released with cudaDelete below
472  realPoint* El = (realPoint*)cudaNew(nTotPan * order, sizeof(realPoint)); // NOTE(review): allocated and freed on every call - consider reusing if this shows up in profiles
473 
474  McuVerticesToControlPoints(nTotPan, (double*)dev_ptr_pt, (double*)pointsl); // fills pointsl from dev_ptr_pt; any return value is not added to the timings
475 
476  CudaSorter srt(nTotPan, pointsl);
477  timingsToRHS[5] += srt.calc(); // the sorter's time is counted in the same slot as the calculation kernel
478 
479  double* ptrToLin = nullptr;
480  if (scheme == 1) // dev_ptr_rhslin is forwarded only when scheme == 1; otherwise nullptr is passed
481  ptrToLin = dev_ptr_rhslin;
482 
483  timingsToRHS[5] += cuRhsCalculationKernel(ptrs, order, nnodes, nvt, itolsq, dev_ptr_vt,
484  srt.pointsMortonCodesIdxl, El,
485  nTotPan, dev_ptr_pt, (const real*)pointsl, dev_ptr_rhs, ptrToLin);
486 
487  cudaDelete(El);
488  cudaDelete(pointsl);
489 
490  timingsToRHS[6] = timingsToRHS[1] + timingsToRHS[2] + timingsToRHS[3] + timingsToRHS[4] + timingsToRHS[5]; // sum of slots 1..5; timingsToRHS[0] is not included
491 
492  endtime = omp_get_wtime();
493  return endtime - starttime;
494 
495  }
void rebuildBaseTree(CUDApointers &ptrs, const int nbodies, const realVortex *vtxl, int nnodes, int order, double *timing)
Definition: wrapper.cpp:174
double memoryAllocate(CUDApointers &ptrs, int nnodes, int nbodies, int nbodiesOld, int blocks, int order)
Definition: wrapper.cpp:193

Here is the call graph for this function:

Here is the caller graph for this function:

Variable Documentation

const real BHcu::IDPI = (real)0.15915494309189534

Definition at line 95 of file wrapper.cpp.