Classes
struct	CudaCalcGab

struct	CudaSorter

Functions
void	rebuildBaseTree (CUDApointers &ptrs, const int nbodies, const realVortex *vtxl, int nnodes, int order, double *timing)

double	memoryAllocate (CUDApointers &ptrs, int nnodes, int nbodies, int nbodiesOld, int blocks, int order)

double	wrapperInfluence (const realVortex *vtxl, realPoint *vell, real *epsastl, CUDApointers &ptrs, int nbodies, double *timing, real eps, real theta, size_t &nbodiesOld, int nbodiesUp, int order, size_t nAfls, size_t *nVtxs, double **ptrVtxs)

double	wrapperInfluenceToPoints (const realVortex *vtxl, const realVortex *pointsl, realPoint *vell, real *epsastl, CUDApointers &ptrs, bool rebuild, int nbodies, int npoints, double *timing, real eps, real theta, size_t &nbodiesOld, int nbodiesUp, int order, size_t nAfls, size_t *nVtxs, double **ptrVtxs)
	npoints - number of points in the pointsl array More...

double	wrapperInfluenceToRHS (const realVortex *dev_ptr_vt, const double *dev_ptr_pt, double *dev_ptr_rhs, double *dev_ptr_rhslin, CUDApointers &ptrs, bool rebuild, int nvt, int nTotPan, double *timingsToRHS, double theta, size_t &nbodiesOld, int nbodiesUp, int order, int scheme)

double	wrapperDiffusiveVelo (const realVortex *vtxl, real *i1l, realPoint *i2l, real *epsastl, CUDApointers &ptrs, bool rebuild, int nbodies, double *timing, real eps, real theta, size_t &nbodiesOld, int nbodiesUp, int order, size_t nAfls, size_t *nVtxs, double **ptrVtxs)

Variables
const real	IDPI = (real)0.15915494309189534

Function Documentation

double BHcu::memoryAllocate	(	CUDApointers &	ptrs,
		int	nnodes,
		int	nbodies,
		int	nbodiesOld,
		int	blocks,
		int	order
	)

For Morton tree

For MortonTree

Definition at line 193 of file wrapper.cpp.

     {
         double starttime, endtime;
         starttime = omp_get_wtime();
 
         if (nbodiesOld > 0)
         {
             //std::cout << "BHgpu: free CUDA-memory" << std::endl;
             cudaDelete(ptrs.massl);
 
             cudaDelete(ptrs.momsl);
             cudaDelete(ptrs.El);
 
             cudaDelete(ptrs.maxrl);
             cudaDelete(ptrs.minrl);
 
             cudaDelete(ptrs.MmortonCodesKeyUnsortl);
             cudaDelete(ptrs.MmortonCodesIdxUnsortl);
             cudaDelete(ptrs.MmortonCodesKeyl);
             cudaDelete(ptrs.MmortonCodesIdxl);
 
             cudaDelete(ptrs.Mposl);
             cudaDelete(ptrs.Mlowerl);
             cudaDelete(ptrs.Mupperl);
             cudaDelete(ptrs.Mparentl);
             cudaDelete(ptrs.Mchildl);
             cudaDelete(ptrs.Mrangel);
 
             cudaDelete(ptrs.MlevelUnsortl);
             cudaDelete(ptrs.MlevelSortl);
             cudaDelete(ptrs.MindexUnsortl);
             cudaDelete(ptrs.MindexSortl);
             cudaDelete(ptrs.MindexSortTl);
         }
 
         //std::cout << "BHgpu: allocation GPU-memory: nbodies = " << nbodies << ", nnodes = " << nnodes << ", order = " << order << std::endl;
 
         //unsigned long long int mem = 0;
         ptrs.massl = (int*)cudaNew(nbodies - 1, sizeof(int));
         //mem += (nbodies - 1) * sizeof(int);
 
         ptrs.momsl = (realPoint*)cudaNew((nbodies - 1) * order, sizeof(realPoint));
 
         //printf("ALLOCATED for MOMS = %d bytes for %d bodies, order = %d, sizeof = %d\n", int((nbodies - 1) * order * sizeof(realPoint)), nbodies - 1, order, sizeof(realPoint));
 
         ptrs.El = nullptr;
         //mem += (nbodies - 1) * order * sizeof(realPoint);
 
         ptrs.maxrl = (realPoint*)cudaNew(blocks * FACTOR1, sizeof(realPoint));
         ptrs.minrl = (realPoint*)cudaNew(blocks * FACTOR1, sizeof(realPoint));
         //mem += 2 * blocks * FACTOR1 * sizeof(realPoint);
 
         ptrs.MmortonCodesKeyUnsortl = (int*)cudaNew(nbodies, sizeof(int));
         ptrs.MmortonCodesKeyl = (int*)cudaNew(nbodies, sizeof(int));
         ptrs.MmortonCodesIdxUnsortl = (int*)cudaNew(nbodies, sizeof(int));
         ptrs.MmortonCodesIdxl = (int*)cudaNew(nbodies, sizeof(int));
         //mem += 4 * nbodies * sizeof(int);
 
         ptrs.Mposl = (realPoint*)cudaNew(nbodies - 1, sizeof(realPoint));
         ptrs.Mlowerl = (realPoint*)cudaNew(nbodies - 1, sizeof(realPoint));
         ptrs.Mupperl = (realPoint*)cudaNew(nbodies - 1, sizeof(realPoint));
         //mem += 3 * (nbodies - 1) * sizeof(realPoint);
 
         ptrs.Mparentl = (int*)cudaNew(nnodes, sizeof(int));
         //mem += nnodes * sizeof(int);
 
         ptrs.Mchildl = (intPair*)cudaNew(nbodies - 1, sizeof(intPair));
         //mem += (nbodies - 1) * sizeof(intPair);
 
         ptrs.Mrangel = (intPair*)cudaNew(nnodes, sizeof(intPair)); //Нужно ли для всех?
         //mem += nnodes * sizeof(intPair);
 
         ptrs.MlevelUnsortl = (int*)cudaNew(nbodies - 1, sizeof(int));
         ptrs.MlevelSortl = (int*)cudaNew(nbodies - 1, sizeof(int));
         ptrs.MindexUnsortl = (int*)cudaNew(nbodies - 1, sizeof(int));
         ptrs.MindexSortl = (int*)cudaNew(nbodies - 1, sizeof(int));
         ptrs.MindexSortTl = (int*)cudaNew(nbodies - 1, sizeof(int));
         //mem += 5 * (nbodies - 1) * sizeof(int);
 
         endtime = omp_get_wtime();
         return endtime - starttime;
     }

Here is the caller graph for this function:

void BHcu::rebuildBaseTree	(	CUDApointers &	ptrs,
		const int	nbodies,
		const realVortex *	vtxl,
		int	nnodes,
		int	order,
		double *	timing
	)

Definition at line 174 of file wrapper.cpp.

     {
         // Runs the tree-construction stages one after another and adds the value
         // returned by each stage (presumably its elapsed time - confirm against
         // the kernel wrappers) to the matching slot of `timing`.
         // NOTE(review): the statement order is significant; do not reorder.

         // timing[0]: initialization; timing[1]: bounding box of the nbodies entries of vtxl.
         timing[0] += cuInitializationKernel();
         timing[1] += McuBoundingBoxKernel(ptrs, nbodies, vtxl);
         
         // timing[2]: Morton codes, internal nodes and internal-cell geometry.
         timing[2] += McuMortonCodesKernel(ptrs, nbodies, vtxl);
         timing[2] += McuMortonInternalNodesKernel(ptrs, nbodies);
         timing[2] += McuMortonInternalCellsGeometryKernel(ptrs, nbodies, nnodes);
 
         // timing[3]: first clear pass.
         timing[3] += cuClearKernel2(ptrs, order, nnodes, nbodies);
 
         // timing[4]: AABB pass, a second clear pass and the summarization pass.
         timing[4] += cuAABBKernel2(ptrs, nnodes, nbodies, vtxl);
 
         // NOTE(review): cuClearKernel2 is invoked a second time with identical
         // arguments, this time charged to timing[4] - confirm this is intended.
         timing[4] += cuClearKernel2(ptrs, order, nnodes, nbodies);
 
         timing[4] += cuSummarizationKernel2(ptrs, order, nnodes, nbodies, vtxl);
     }

Here is the caller graph for this function:

double BHcu::wrapperDiffusiveVelo	(	const realVortex *	vtxl,
		real *	i1l,
		realPoint *	i2l,
		real *	epsastl,
		CUDApointers &	ptrs,
		bool	rebuild,
		int	nbodies,
		double *	timing,
		real	eps,
		real	theta,
		size_t &	nbodiesOld,
		int	nbodiesUp,
		int	order,
		size_t	nAfls,
		size_t *	nVtxs,
		double **	ptrVtxs
	)

Definition at line 497 of file wrapper.cpp.

     {
         double starttime, endtime;
         starttime = omp_get_wtime();
 
         //Число мультипроцессоров, заполняется функцией  setBlocks(blocks)
         int blocks;
 
         //Число ячеек дерева и тел
         int nnodes, nnodesUp;
 
         //Радиус вихря и параметр близости и их квадраты
         real epssq = (real)(eps * eps);
         real itolsq = (real)(1 / (theta * theta));
 
         CudaSelect(0);
         setBlocks(blocks); //"достает" число блоков, равное числу мультипроцессоров (blocks - по ссылке)        
 
         nnodes = nbodies * 2;
         if (nnodes < 1024 * blocks)
             nnodes = 1024 * blocks;
         while ((nnodes & (32 - 1)) != 0)  // 32 - это размер варпа
             nnodes++;
         nnodes--;
 
         if (rebuild)
         {
             nnodesUp = nbodiesUp * 2;
             if (nnodesUp < 1024 * blocks)
                 nnodesUp = 1024 * blocks;
             while ((nnodesUp & (32 - 1)) != 0)  // 32 - это размер варпа
                 nnodesUp++;
             nnodesUp--;
         }
 
         KernelsOptimization();
 
         for (int i = 0; i < 6; i++)
             timing[i] = 0;
 
         if (rebuild)
         {
             if (nbodiesUp > nbodiesOld)
                 timing[1] += memoryAllocate(ptrs, nnodesUp, nbodiesUp, (int)nbodiesOld, blocks, order);
 
             nbodiesOld = nbodiesUp;
             rebuildBaseTree(ptrs, nbodies, vtxl, nnodes, order, timing);
         }
 
         timing[5] += cuDiffVelCalculationKernel2(ptrs, order, nnodes, nbodies, itolsq, epssq, vtxl, i1l, i2l, true, epsastl, nAfls, nVtxs, ptrVtxs);
         timing[6] = timing[1] + timing[2] + timing[3] + timing[4] + timing[5];
 
         endtime = omp_get_wtime();
         return endtime - starttime;
     }

Here is the call graph for this function:

Here is the caller graph for this function:

double BHcu::wrapperInfluence	(	const realVortex *	vtxl,
		realPoint *	vell,
		real *	epsastl,
		CUDApointers &	ptrs,
		int	nbodies,
		double *	timing,
		real	eps,
		real	theta,
		size_t &	nbodiesOld,
		int	nbodiesUp,
		int	order,
		size_t	nAfls,
		size_t *	nVtxs,
		double **	ptrVtxs
	)

Parameters

vtxl

Pointer to the array of vortices on the GPU

Parameters

vell

Pointer to the array of computed velocities on the GPU

Parameters

epsastl

Pointer to the array of eps* values on the GPU

Parameters

ptrs

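Reference to the structure of GPU buffer pointers; its buffers are reallocated by this call when nbodiesUp exceeds nbodiesOld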

Parameters

nbodies

Number of vortices in vtxl

Parameters

timing

Pointer to a 7-element array that receives the timings

Parameters

eps

Parameters

theta

Parameters

nbodiesOld

Body count the buffers were last allocated for; set to nbodiesUp by this call

Parameters

nbodiesUp

Upper estimate of the body count (used to size the buffers); memory is reallocated when it exceeds nbodiesOld

Parameters

order

Parameters

nAfls

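Passed through unchanged to cuForceCalculationKernel2points (presumably the number of airfoils; the original description was lost to a text-encoding error)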

Parameters

nVtxs

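Passed through unchanged to cuForceCalculationKernel2points (presumably per-airfoil vertex counts; the original description was lost to a text-encoding error)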

Parameters

ptrVtxs

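Passed through unchanged to cuForceCalculationKernel2points (presumably per-airfoil vertex arrays; the original description was lost to a text-encoding error)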

Returns

Definition at line 280 of file wrapper.cpp.

     {
         double starttime, endtime;
         starttime = omp_get_wtime();
 
         //Число мультипроцессоров, заполняется функцией  setBlocks(blocks)
         int blocks;
 
         //Число ячеек дерева и тел
         int nnodes, nnodesUp;
 
         //Радиус вихря и параметр близости и их квадраты
         real epssq = (real)(eps * eps);
         real itolsq = (real)(1 / (theta * theta));
 
         CudaSelect(0);
         setBlocks(blocks); //"достает" число блоков, равное числу мультипроцессоров (blocks - по ссылке)        
 
         nnodes = nbodies * 2;
         if (nnodes < 1024 * blocks)
             nnodes = 1024 * blocks;
         while ((nnodes & (32 - 1)) != 0)  // 32 - это размер варпа
             nnodes++;
         nnodes--;
 
 
         nnodesUp = nbodiesUp * 2;
         if (nnodesUp < 1024 * blocks)
             nnodesUp = 1024 * blocks;
         while ((nnodesUp & (32 - 1)) != 0)  // 32 - это размер варпа
             nnodesUp++;
         nnodesUp--;
 
         KernelsOptimization();
 
         for (int i = 0; i < 6; i++)
             timing[i] = 0;
 
         if (nbodiesUp > nbodiesOld)
             timing[1] += memoryAllocate(ptrs, nnodesUp, nbodiesUp, (int)nbodiesOld, blocks, order);
 
         nbodiesOld = nbodiesUp;
         rebuildBaseTree(ptrs, nbodies, vtxl, nnodes, order, timing);
 
         timing[5] += cuForceCalculationKernel2points(ptrs, order, nnodes, nbodies, itolsq, epssq, vtxl, 
             ptrs.MmortonCodesIdxl, nbodies, vtxl,  vell, true, epsastl, nAfls, nVtxs, ptrVtxs);
         timing[6] = timing[1] + timing[2] + timing[3] + timing[4] + timing[5];
 
         endtime = omp_get_wtime();
         return endtime - starttime;
 
     }

Here is the call graph for this function:

Here is the caller graph for this function:

double BHcu::wrapperInfluenceToPoints	(	const realVortex *	vtxl,
		const realVortex *	pointsl,
		realPoint *	vell,
		real *	epsastl,
		CUDApointers &	ptrs,
		bool	rebuild,
		int	nbodies,
		int	npoints,
		double *	timing,
		real	eps,
		real	theta,
		size_t &	nbodiesOld,
		int	nbodiesUp,
		int	order,
		size_t	nAfls,
		size_t *	nVtxs,
		double **	ptrVtxs
	)

npoints - number of points in the pointsl array

Definition at line 340 of file wrapper.cpp.

     {
         double starttime, endtime;
         starttime = omp_get_wtime();
 
         //Число мультипроцессоров, заполняется функцией  setBlocks(blocks)
         int blocks;
 
         //Число ячеек дерева и тел
         int nnodes, nnodesUp;
 
         //Радиус вихря и параметр близости и их квадраты
         real epssq = (real)(eps * eps);
         real itolsq = (real)(1 / (theta * theta));
 
         CudaSelect(0);
         setBlocks(blocks); //"достает" число блоков, равное числу мультипроцессоров (blocks - по ссылке)        
 
         nnodes = nbodies * 2;
         if (nnodes < 1024 * blocks)
             nnodes = 1024 * blocks;
         while ((nnodes & (32 - 1)) != 0)  // 32 - это размер варпа
             nnodes++;
         nnodes--;
 
         if (rebuild)
         {
             nnodesUp = nbodiesUp * 2;
             if (nnodesUp < 1024 * blocks)
                 nnodesUp = 1024 * blocks;
             while ((nnodesUp & (32 - 1)) != 0)  // 32 - это размер варпа
                 nnodesUp++;
             nnodesUp--;
         }
 
         KernelsOptimization();
 
 
         for (int i = 0; i < 6; i++)
             timing[i] = 0;
 
         if (rebuild)
         {
             if (nbodiesUp > nbodiesOld)
                 timing[1] += memoryAllocate(ptrs, nnodesUp, nbodiesUp, (int)nbodiesOld, blocks, order);
 
             nbodiesOld = nbodiesUp;
             rebuildBaseTree(ptrs, nbodies, vtxl, nnodes, order, timing);
         }
 
         CudaSorter srt(npoints, pointsl);
         timing[5] += srt.calc();
 
         timing[5] += cuForceCalculationKernel2points(ptrs, order, nnodes, nbodies, itolsq, epssq, vtxl, srt.pointsMortonCodesIdxl, npoints, pointsl, vell, true, epsastl,
             nAfls, nVtxs, ptrVtxs);
 
 
         timing[6] = timing[1] + timing[2] + timing[3] + timing[4] + timing[5];
 
         endtime = omp_get_wtime();
         return endtime - starttime;
 
     }

Here is the call graph for this function:

double BHcu::wrapperInfluenceToRHS	(	const realVortex *	dev_ptr_vt,
		const double *	dev_ptr_pt,
		double *	dev_ptr_rhs,
		double *	dev_ptr_rhslin,
		CUDApointers &	ptrs,
		bool	rebuild,
		int	nvt,
		int	nTotPan,
		double *	timingsToRHS,
		double	theta,
		size_t &	nbodiesOld,
		int	nbodiesUp,
		int	order,
		int	scheme
	)

Definition at line 410 of file wrapper.cpp.

     {
         double starttime, endtime;
         starttime = omp_get_wtime();
         
         //Число мультипроцессоров, заполняется функцией  setBlocks(blocks)
         int blocks;
 
         //Число ячеек дерева и тел
         int nnodes, nnodesUp;
 
         //Радиус вихря и параметр близости и их квадраты
         real itolsq = (real)(1 / (theta * theta));
 
         CudaSelect(0);
         setBlocks(blocks); //"достает" число блоков, равное числу мультипроцессоров (blocks - по ссылке)        
 
         nnodes = nvt * 2;
         if (nnodes < 1024 * blocks)
             nnodes = 1024 * blocks;
         while ((nnodes & (32 - 1)) != 0)  // 32 - это размер варпа
             nnodes++;
         nnodes--;
 
         if (rebuild)
         {
             nnodesUp = nbodiesUp * 2;
             if (nnodesUp < 1024 * blocks)
                 nnodesUp = 1024 * blocks;
             while ((nnodesUp & (32 - 1)) != 0)  // 32 - это размер варпа
                 nnodesUp++;
             nnodesUp--;
         }
 
         KernelsOptimization();
 
         for (int i = 0; i < 6; i++)
             timingsToRHS[i] = 0;
 
         if (rebuild)
         {
             if (nbodiesUp > nbodiesOld)
                 timingsToRHS[1] += memoryAllocate(ptrs, nnodesUp, nbodiesUp, (int)nbodiesOld, blocks, order);
 
             nbodiesOld = nbodiesUp;
             rebuildBaseTree(ptrs, nvt, dev_ptr_vt, nnodes, order, timingsToRHS);
         }
 
         Vortex2D* pointsl = (Vortex2D*)cudaNew(nTotPan, sizeof(Vortex2D));
         realPoint* El = (realPoint*)cudaNew(nTotPan * order, sizeof(realPoint));
 
         McuVerticesToControlPoints(nTotPan, (double*)dev_ptr_pt, (double*)pointsl);
 
         CudaSorter srt(nTotPan, pointsl);
         timingsToRHS[5] += srt.calc();
 
         double* ptrToLin = nullptr;
         if (scheme == 1)
             ptrToLin = dev_ptr_rhslin;
 
         timingsToRHS[5] += cuRhsCalculationKernel(ptrs, order, nnodes, nvt, itolsq, dev_ptr_vt, 
             srt.pointsMortonCodesIdxl, El,
             nTotPan, dev_ptr_pt, (const real*)pointsl, dev_ptr_rhs, ptrToLin);
 
         cudaDelete(El);
         cudaDelete(pointsl);
 
         timingsToRHS[6] = timingsToRHS[1] + timingsToRHS[2] + timingsToRHS[3] + timingsToRHS[4] + timingsToRHS[5];
 
         endtime = omp_get_wtime();
         return endtime - starttime;
         
     }

Here is the call graph for this function:

Here is the caller graph for this function:

Variable Documentation

const real BHcu::IDPI = (real)0.15915494309189534

Definition at line 95 of file wrapper.cpp.

Classes

Functions

Variables

Function Documentation

Variable Documentation